From 03ba4131783cc9e872f8bb26a03f15bc11f27564 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Sat, 19 Sep 1998 19:15:08 +0000 Subject: - Merge with Linux 2.1.121. - Bugfixes. --- net/ipv4/af_inet.c | 31 +++++- net/ipv4/arp.c | 25 +++-- net/ipv4/devinet.c | 21 +++- net/ipv4/fib_frontend.c | 37 ++++--- net/ipv4/fib_hash.c | 4 +- net/ipv4/fib_rules.c | 20 ++-- net/ipv4/fib_semantics.c | 98 +------------------ net/ipv4/icmp.c | 24 ++++- net/ipv4/igmp.c | 10 +- net/ipv4/ip_forward.c | 48 ++++----- net/ipv4/ip_fragment.c | 2 +- net/ipv4/ip_fw.c | 26 ++--- net/ipv4/ip_gre.c | 6 +- net/ipv4/ip_input.c | 89 ++++++++++------- net/ipv4/ip_nat_dumb.c | 40 ++++++-- net/ipv4/ip_options.c | 8 +- net/ipv4/ip_output.c | 63 +++++------- net/ipv4/ip_sockglue.c | 46 ++++++--- net/ipv4/ipip.c | 4 +- net/ipv4/ipmr.c | 51 +++++----- net/ipv4/proc.c | 11 ++- net/ipv4/raw.c | 39 +++++--- net/ipv4/route.c | 105 ++++++++------------ net/ipv4/tcp.c | 103 ++++++++++++++------ net/ipv4/tcp_input.c | 144 +++++++++++++++++---------- net/ipv4/tcp_ipv4.c | 246 ++++++++++++++++++++++++++++++----------- net/ipv4/tcp_output.c | 115 ++++++++++++++++------ net/ipv4/tcp_timer.c | 6 +- net/ipv4/timer.c | 2 +- net/ipv4/udp.c | 207 +++++++++++++++++++++------------ 30 files changed, 927 insertions(+), 704 deletions(-) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 18c31f5c3..8282333dc 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -5,7 +5,7 @@ * * PF_INET protocol family socket handler. * - * Version: $Id: af_inet.c,v 1.74 1998/05/08 21:06:24 davem Exp $ + * Version: $Id: af_inet.c,v 1.75 1998/08/26 12:03:15 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -177,6 +177,8 @@ static __inline__ void kill_sk_now(struct sock *sk) if(sk->opt) kfree(sk->opt); dst_release(sk->dst_cache); + if (atomic_read(&sk->omem_alloc)) + printk(KERN_DEBUG "kill_sk_now: optmem leakage (%d bytes) detected.\n", atomic_read(&sk->omem_alloc)); sk_free(sk); } @@ -576,6 +578,24 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, return(0); } +static void inet_wait_for_connect(struct sock *sk) +{ + struct wait_queue wait = { current, NULL }; + + add_wait_queue(sk->sleep, &wait); + current->state = TASK_INTERRUPTIBLE; + while (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) { + if (signal_pending(current)) + break; + if (sk->err) + break; + schedule(); + current->state = TASK_INTERRUPTIBLE; + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); +} + /* * Connect to a remote host. There is regrettably still a little * TCP 'magic' in here.
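The inet_wait_for_connect() added above is the 2.1-era open-coded wait-queue idiom: register on the wait queue and mark the task TASK_INTERRUPTIBLE *before* testing the condition, so a wake_up() that arrives between the test and schedule() is not lost. A minimal sketch of the same pattern follows, using only primitives that appear in this patch (add_wait_queue, schedule, signal_pending); the function name and the cond() callback are illustrative, not kernel API:

	/* Sketch, not kernel code: sleep until cond(sk) holds, a signal
	 * is pending, or sk->err is set.  Setting TASK_INTERRUPTIBLE
	 * before each test closes the lost-wakeup race: a concurrent
	 * wake_up() flips the task back to TASK_RUNNING, so the
	 * following schedule() returns at once instead of sleeping.
	 */
	static void wait_on_sock(struct sock *sk, int (*cond)(struct sock *sk))
	{
		struct wait_queue wait = { current, NULL };

		add_wait_queue(sk->sleep, &wait);	/* register before testing */
		current->state = TASK_INTERRUPTIBLE;
		while (!cond(sk)) {
			if (signal_pending(current))	/* caller maps this to -ERESTARTSYS */
				break;
			if (sk->err)			/* asynchronous socket error */
				break;
			schedule();
			current->state = TASK_INTERRUPTIBLE;	/* re-arm before re-testing */
		}
		current->state = TASK_RUNNING;
		remove_wait_queue(sk->sleep, &wait);
	}

This ordering is why the #if 1 block in the next hunk can drop the cli()/sti() bracket around the old interruptible_sleep_on() loop: state-before-test gives the same atomicity without disabling interrupts.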
@@ -623,6 +643,13 @@ int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr, if (sk->state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) return (-EINPROGRESS); +#if 1 + if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) { + inet_wait_for_connect(sk); + if (signal_pending(current)) + return -ERESTARTSYS; + } +#else cli(); while(sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) { interruptible_sleep_on(sk->sleep); @@ -639,6 +666,7 @@ int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr, } } sti(); +#endif sock->state = SS_CONNECTED; if ((sk->state != TCP_ESTABLISHED) && sk->err) { @@ -876,7 +904,6 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case FIOGETOWN: case SIOCGPGRP: return put_user(sk->proc, (int *)arg); - return(0); case SIOCGSTAMP: if(sk->stamp.tv_sec==0) return -ENOENT; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index e6e272b0e..1ce69028f 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1,6 +1,6 @@ /* linux/net/inet/arp.c * - * Version: $Id: arp.c,v 1.67 1998/06/19 13:22:31 davem Exp $ + * Version: $Id: arp.c,v 1.70 1998/08/26 12:03:18 davem Exp $ * * Copyright (C) 1994 by Florian La Roche * @@ -760,7 +760,7 @@ int arp_req_set(struct arpreq *r, struct device * dev) r->arp_flags |= ATF_COM; if (dev == NULL) { struct rtable * rt; - if ((err = ip_route_output(&rt, ip, 0, 1, 0)) != 0) + if ((err = ip_route_output(&rt, ip, 0, RTO_ONLINK, 0)) != 0) return err; dev = rt->u.dst.dev; ip_rt_put(rt); @@ -843,11 +843,21 @@ int arp_req_delete(struct arpreq *r, struct device * dev) return -EINVAL; } + if (dev == NULL) { + struct rtable * rt; + if ((err = ip_route_output(&rt, ip, 0, RTO_ONLINK, 0)) != 0) + return err; + dev = rt->u.dst.dev; + ip_rt_put(rt); + if (!dev) + return -EINVAL; + } err = -ENXIO; start_bh_atomic(); neigh = __neigh_lookup(&arp_tbl, &ip, dev, 0); if (neigh) { - err = neigh_update(neigh, NULL, NUD_FAILED, 1, 0); + if (neigh->nud_state&~NUD_NOARP) + err = neigh_update(neigh, NULL, NUD_FAILED, 1, 0); neigh_release(neigh); } end_bh_atomic(); @@ -867,7 +877,7 @@ int arp_ioctl(unsigned int cmd, void *arg) switch(cmd) { case SIOCDARP: case SIOCSARP: - if (!suser()) + if (!capable(CAP_NET_ADMIN)) return -EPERM; case SIOCGARP: err = copy_from_user(&r, arg, sizeof(struct arpreq)); @@ -899,10 +909,8 @@ int arp_ioctl(unsigned int cmd, void *arg) err = -EINVAL; if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type) goto out; - } else if (cmd != SIOCSARP) { - /* dev has not been set ... */ - printk(KERN_ERR "arp_ioctl: invalid, null device\n"); - err = -EINVAL; + } else if (cmd == SIOCGARP) { + err = -ENODEV; goto out; } @@ -911,7 +919,6 @@ int arp_ioctl(unsigned int cmd, void *arg) err = arp_req_delete(&r, dev); break; case SIOCSARP: - /* This checks for dev == NULL */ err = arp_req_set(&r, dev); break; case SIOCGARP: diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 18293338e..ac7c04432 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1,7 +1,7 @@ /* * NET3 IP device support routines. 
* - * Version: $Id: devinet.c,v 1.22 1998/05/08 21:06:26 davem Exp $ + * Version: $Id: devinet.c,v 1.23 1998/08/26 12:03:21 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -533,8 +533,6 @@ int devinet_ioctl(unsigned int cmd, void *arg) inet_del_ifa(in_dev, ifap, 0); ifa->ifa_broadcast = 0; ifa->ifa_anycast = 0; - ifa->ifa_prefixlen = 32; - ifa->ifa_mask = inet_make_mask(32); } ifa->ifa_address = @@ -545,6 +543,9 @@ int devinet_ioctl(unsigned int cmd, void *arg) ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen); if ((dev->flags&IFF_BROADCAST) && ifa->ifa_prefixlen < 31) ifa->ifa_broadcast = ifa->ifa_address|~ifa->ifa_mask; + } else { + ifa->ifa_prefixlen = 32; + ifa->ifa_mask = inet_make_mask(32); } ret = inet_set_ifa(dev, ifa); break; @@ -702,6 +703,16 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, void case NETDEV_UNREGISTER: inetdev_destroy(in_dev); break; + case NETDEV_CHANGENAME: + if (in_dev->ifa_list) { + struct in_ifaddr *ifa; + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + /* Do not notify about label change, this event is + not interesting to applications using netlink. + */ + } + break; } return NOTIFY_DONE; @@ -716,7 +727,7 @@ struct notifier_block ip_netdev_notifier={ #ifdef CONFIG_RTNETLINK static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, - pid_t pid, u32 seq, int event) + u32 pid, u32 seq, int event) { struct ifaddrmsg *ifm; struct nlmsghdr *nlh; @@ -729,7 +740,7 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, ifm->ifa_flags = ifa->ifa_flags|IFA_F_PERMANENT; ifm->ifa_scope = ifa->ifa_scope; ifm->ifa_index = ifa->ifa_dev->dev->ifindex; - if (ifa->ifa_prefixlen) + if (ifa->ifa_address) RTA_PUT(skb, IFA_ADDRESS, 4, &ifa->ifa_address); if (ifa->ifa_local) RTA_PUT(skb, IFA_LOCAL, 4, &ifa->ifa_local); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index d9a150218..013a4ba9a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: FIB frontend. * - * Version: $Id: fib_frontend.c,v 1.11 1998/06/11 03:15:40 davem Exp $ + * Version: $Id: fib_frontend.c,v 1.12 1998/08/26 12:03:24 davem Exp $ * * Authors: Alexey Kuznetsov, * @@ -300,10 +300,8 @@ static int inet_check_attr(struct rtmsg *r, struct rtattr **rta) if (attr) { if (RTA_PAYLOAD(attr) < 4) return -EINVAL; -#ifndef CONFIG_RTNL_OLD_IFINFO if (i != RTA_MULTIPATH && i != RTA_METRICS) -#endif - rta[i-1] = (struct rtattr*)RTA_DATA(attr); + rta[i-1] = (struct rtattr*)RTA_DATA(attr); } } return 0; @@ -527,6 +525,14 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) #undef BRD1_OK } +static void fib_disable_ip(struct device *dev, int force) +{ + if (fib_sync_down(0, dev, force)) + fib_flush(); + rt_cache_flush(0); + arp_ifdown(dev); +} + static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_ifaddr *ifa = (struct in_ifaddr*)ptr; @@ -537,8 +543,15 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, rt_cache_flush(-1); break; case NETDEV_DOWN: - fib_del_ifaddr(ifa); - rt_cache_flush(-1); + if (ifa->ifa_dev && ifa->ifa_dev->ifa_list == NULL) { + /* Last address was deleted from this interface. + Disable IP. 
+ */ + fib_disable_ip(ifa->ifa_dev->dev, 1); + } else { + fib_del_ifaddr(ifa); + rt_cache_flush(-1); + } break; } return NOTIFY_DONE; @@ -563,18 +576,10 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo rt_cache_flush(-1); break; case NETDEV_DOWN: - if (fib_sync_down(0, dev, 0)) - fib_flush(); - rt_cache_flush(0); - arp_ifdown(dev); + fib_disable_ip(dev, 0); break; case NETDEV_UNREGISTER: - if (in_dev->ifa_list) - printk("About to crash!\n"); - if (fib_sync_down(0, dev, 1)) - fib_flush(); - rt_cache_flush(0); - arp_ifdown(dev); + fib_disable_ip(dev, 1); break; case NETDEV_CHANGEMTU: case NETDEV_CHANGE: diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 3e13671a2..618d247bd 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -5,7 +5,7 @@ * * IPv4 FIB: lookup engine and maintenance routines. * - * Version: $Id: fib_hash.c,v 1.4 1998/07/15 05:05:08 davem Exp $ + * Version: $Id: fib_hash.c,v 1.5 1998/08/26 12:03:27 davem Exp $ * * Authors: Alexey Kuznetsov, * @@ -713,7 +713,7 @@ static void rtmsg_fib(int event, struct fib_node* f, int z, int tb_id, struct nlmsghdr *n, struct netlink_skb_parms *req) { struct sk_buff *skb; - pid_t pid = req ? req->pid : 0; + u32 pid = req ? req->pid : 0; int size = NLMSG_SPACE(sizeof(struct rtmsg)+256); skb = alloc_skb(size, GFP_KERNEL); diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 592ff5ffb..2302f5322 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: policy rules. * - * Version: $Id: fib_rules.c,v 1.5 1998/04/28 06:21:57 davem Exp $ + * Version: $Id: fib_rules.c,v 1.6 1998/08/26 12:03:30 davem Exp $ * * Authors: Alexey Kuznetsov, * @@ -45,10 +45,6 @@ #define FRprintk(a...) -#ifndef CONFIG_RTNL_OLD_IFINFO -#define RTA_IFNAME RTA_IIF -#endif - struct fib_rule { struct fib_rule *r_next; @@ -91,7 +87,7 @@ int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) rtm->rtm_tos == r->r_tos && (!rtm->rtm_type || rtm->rtm_type == r->r_action) && (!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) && - (!rta[RTA_IFNAME-1] || strcmp(RTA_DATA(rta[RTA_IFNAME-1]), r->r_ifname) == 0) && + (!rta[RTA_IIF-1] || strcmp(RTA_DATA(rta[RTA_IIF-1]), r->r_ifname) == 0) && (!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) { *rp = r->r_next; if (r != &default_rule && r != &main_rule && r != &local_rule) @@ -126,7 +122,7 @@ int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) (rtm->rtm_tos & ~IPTOS_TOS_MASK)) return -EINVAL; - if (rta[RTA_IFNAME-1] && RTA_PAYLOAD(rta[RTA_IFNAME-1]) > IFNAMSIZ) + if (rta[RTA_IIF-1] && RTA_PAYLOAD(rta[RTA_IIF-1]) > IFNAMSIZ) return -EINVAL; table_id = rtm->rtm_table; @@ -159,9 +155,9 @@ int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (rta[RTA_PRIORITY-1]) memcpy(&new_r->r_preference, RTA_DATA(rta[RTA_PRIORITY-1]), 4); new_r->r_table = table_id; - if (rta[RTA_IFNAME-1]) { + if (rta[RTA_IIF-1]) { struct device *dev; - memcpy(new_r->r_ifname, RTA_DATA(rta[RTA_IFNAME-1]), IFNAMSIZ); + memcpy(new_r->r_ifname, RTA_DATA(rta[RTA_IIF-1]), IFNAMSIZ); new_r->r_ifname[IFNAMSIZ-1] = 0; new_r->r_ifindex = -1; dev = dev_get(new_r->r_ifname); @@ -339,10 +335,6 @@ extern __inline__ int inet_fill_rule(struct sk_buff *skb, rtm->rtm_table = r->r_table; rtm->rtm_protocol = 0; rtm->rtm_scope = 0; -#ifdef CONFIG_RTNL_OLD_IFINFO - rtm->rtm_nhs = 0; - rtm->rtm_optlen = 0; -#endif rtm->rtm_type = r->r_action; rtm->rtm_flags = 
r->r_flags; @@ -351,7 +343,7 @@ extern __inline__ int inet_fill_rule(struct sk_buff *skb, if (r->r_src_len) RTA_PUT(skb, RTA_SRC, 4, &r->r_src); if (r->r_ifname[0]) - RTA_PUT(skb, RTA_IFNAME, IFNAMSIZ, &r->r_ifname); + RTA_PUT(skb, RTA_IIF, IFNAMSIZ, &r->r_ifname); if (r->r_preference) RTA_PUT(skb, RTA_PRIORITY, 4, &r->r_preference); if (r->r_srcmap) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 5537016d2..36c801e8c 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: semantics. * - * Version: $Id: fib_semantics.c,v 1.9 1998/06/11 03:15:41 davem Exp $ + * Version: $Id: fib_semantics.c,v 1.10 1998/08/26 12:03:32 davem Exp $ * * Authors: Alexey Kuznetsov, * @@ -181,7 +181,6 @@ static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type) return 0; } -#ifndef CONFIG_RTNL_OLD_IFINFO static int fib_count_nexthops(struct rtattr *rta) { @@ -189,7 +188,7 @@ fib_count_nexthops(struct rtattr *rta) struct rtnexthop *nhp = RTA_DATA(rta); int nhlen = RTA_PAYLOAD(rta); - while (nhlen >= sizeof(struct rtnexthop)) { + while (nhlen >= (int)sizeof(struct rtnexthop)) { if ((nhlen -= nhp->rtnh_len) < 0) return 0; nhs++; @@ -197,21 +196,12 @@ fib_count_nexthops(struct rtattr *rta) }; return nhs; } -#endif -#ifdef CONFIG_RTNL_OLD_IFINFO -static int -fib_get_nhs(struct fib_info *fi, const struct nlmsghdr *nlh, const struct rtmsg *r) -{ - struct rtnexthop *nhp = RTM_RTNH(r); - int nhlen = RTM_NHLEN(nlh, r); -#else static int fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r) { struct rtnexthop *nhp = RTA_DATA(rta); int nhlen = RTA_PAYLOAD(rta); -#endif change_nexthops(fi) { int attrlen = nhlen - sizeof(struct rtnexthop); @@ -249,18 +239,10 @@ int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta, } #ifdef CONFIG_IP_ROUTE_MULTIPATH -#ifdef CONFIG_RTNL_OLD_IFINFO - if (r->rtm_nhs == 0) - return 0; - - nhp = RTM_RTNH(r); - nhlen = RTM_NHLEN(nlh, r); -#else if (rta->rta_mp == NULL) return 0; nhp = RTA_DATA(rta->rta_mp); nhlen = RTA_PAYLOAD(rta->rta_mp); -#endif for_nexthops(fi) { int attrlen = nhlen - sizeof(struct rtnexthop); @@ -397,11 +379,7 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta, struct fib_info *fi = NULL; struct fib_info *ofi; #ifdef CONFIG_IP_ROUTE_MULTIPATH -#ifdef CONFIG_RTNL_OLD_IFINFO - int nhs = r->rtm_nhs ? 
: 1; -#else int nhs = 1; -#endif #else const int nhs = 1; #endif @@ -411,13 +389,11 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta, goto err_inval; #ifdef CONFIG_IP_ROUTE_MULTIPATH -#ifndef CONFIG_RTNL_OLD_IFINFO if (rta->rta_mp) { nhs = fib_count_nexthops(rta->rta_mp); if (nhs == 0) goto err_inval; } -#endif #endif fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); @@ -429,14 +405,6 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta, fi->fib_protocol = r->rtm_protocol; fi->fib_nhs = nhs; fi->fib_flags = r->rtm_flags; -#ifdef CONFIG_RTNL_OLD_IFINFO - if (rta->rta_mtu) - fi->fib_mtu = *rta->rta_mtu; - if (rta->rta_rtt) - fi->fib_rtt = *rta->rta_rtt; - if (rta->rta_window) - fi->fib_window = *rta->rta_window; -#else if (rta->rta_mx) { int attrlen = RTA_PAYLOAD(rta->rta_mx); struct rtattr *attr = RTA_DATA(rta->rta_mx); @@ -451,21 +419,12 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta, attr = RTA_NEXT(attr, attrlen); } } -#endif if (rta->rta_prefsrc) memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4); -#ifndef CONFIG_RTNL_OLD_IFINFO if (rta->rta_mp) { -#else - if (r->rtm_nhs) { -#endif #ifdef CONFIG_IP_ROUTE_MULTIPATH -#ifdef CONFIG_RTNL_OLD_IFINFO - if ((err = fib_get_nhs(fi, nlh, r)) != 0) -#else if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0) -#endif goto failure; if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif) goto err_inval; @@ -504,11 +463,7 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta, #endif if (fib_props[r->rtm_type].error) { -#ifndef CONFIG_RTNL_OLD_IFINFO if (rta->rta_gw || rta->rta_oif || rta->rta_mp) -#else - if (rta->rta_gw || rta->rta_oif || r->rtm_nhs) -#endif goto err_inval; goto link_it; } @@ -637,16 +592,13 @@ u32 __fib_res_prefsrc(struct fib_result *res) #ifdef CONFIG_RTNETLINK int -fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, +fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos, struct fib_info *fi) { struct rtmsg *rtm; struct nlmsghdr *nlh; unsigned char *b = skb->tail; -#ifdef CONFIG_RTNL_OLD_IFINFO - unsigned char *o; -#endif nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm)); rtm = NLMSG_DATA(nlh); @@ -658,22 +610,9 @@ fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, rtm->rtm_type = type; rtm->rtm_flags = fi->fib_flags; rtm->rtm_scope = scope; -#ifdef CONFIG_RTNL_OLD_IFINFO - rtm->rtm_nhs = 0; - - o = skb->tail; -#endif if (rtm->rtm_dst_len) RTA_PUT(skb, RTA_DST, 4, dst); rtm->rtm_protocol = fi->fib_protocol; -#ifdef CONFIG_RTNL_OLD_IFINFO - if (fi->fib_mtu) - RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &fi->fib_mtu); - if (fi->fib_window) - RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &fi->fib_window); - if (fi->fib_rtt) - RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &fi->fib_rtt); -#else #ifdef CONFIG_NET_CLS_ROUTE if (fi->fib_nh[0].nh_tclassid) RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid); @@ -688,7 +627,6 @@ fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, } mx->rta_len = skb->tail - (u8*)mx; } -#endif if (fi->fib_prefsrc) RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc); if (fi->fib_nhs == 1) { @@ -697,18 +635,14 @@ fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, if (fi->fib_nh->nh_oif) RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif); } -#ifdef CONFIG_RTNL_OLD_IFINFO - rtm->rtm_optlen = skb->tail - o; -#endif #ifdef CONFIG_IP_ROUTE_MULTIPATH if (fi->fib_nhs > 1) { struct rtnexthop *nhp; -#ifndef CONFIG_RTNL_OLD_IFINFO struct rtattr 
*mp_head; if (skb_tailroom(skb) <= RTA_SPACE(0)) goto rtattr_failure; mp_head = (struct rtattr*)skb_put(skb, RTA_SPACE(0)); -#endif + for_nexthops(fi) { if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) goto rtattr_failure; @@ -719,14 +653,9 @@ fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, if (nh->nh_gw) RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw); nhp->rtnh_len = skb->tail - (unsigned char*)nhp; -#ifdef CONFIG_RTNL_OLD_IFINFO - rtm->rtm_nhs++; -#endif } endfor_nexthops(fi); -#ifndef CONFIG_RTNL_OLD_IFINFO mp_head->rta_type = RTA_MULTIPATH; mp_head->rta_len = skb->tail - (u8*)mp_head; -#endif } #endif nlh->nlmsg_len = skb->tail - b; @@ -848,24 +777,6 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm, if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL) return -EINVAL; -#ifdef CONFIG_RTNL_OLD_IFINFO - /* Ugly conversion from rtentry types to unsigned */ - - if (r->rt_flags&RTF_IRTT) { - rta->rta_rtt = (unsigned*)&r->rt_pad3; - *rta->rta_rtt = r->rt_irtt; - } - if (r->rt_flags&RTF_WINDOW) { - rta->rta_window = (unsigned*)&r->rt_window; - if (sizeof(*rta->rta_window) != sizeof(r->rt_window)) - *rta->rta_window = r->rt_window; - } - if (r->rt_flags&RTF_MTU) { - rta->rta_mtu = (unsigned*)&r->rt_mtu; - if (sizeof(*rta->rta_mtu) != sizeof(r->rt_mtu)) - *rta->rta_mtu = r->rt_mtu; - } -#else if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) { struct rtattr *rec; struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL); @@ -896,7 +807,6 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm, *(u32*)RTA_DATA(rec) = r->rt_irtt; } } -#endif return 0; } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 4e947337a..9cc7c733b 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -3,7 +3,7 @@ * * Alan Cox, * - * Version: $Id: icmp.c,v 1.44 1998/06/16 04:38:27 davem Exp $ + * Version: $Id: icmp.c,v 1.45 1998/08/26 12:03:35 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -47,6 +47,9 @@ * into the dest entry and use a token * bucket filter (thanks to ANK). Make * the rates sysctl configurable. + * Yu Tianli : Fixed two ugly bugs in icmp_send + * - IP option length was accounted wrongly + * - ICMP header length was not accounted at all. * * RFC1122 (Host Requirements -- Comm. Layer) Status: * (boy, are there a lot of rules for ICMP) @@ -363,7 +366,7 @@ int xrlim_allow(struct dst_entry *dst, int timeout) now = jiffies; dst->rate_tokens += now - dst->rate_last; - if (dst->rate_tokens > 6*timeout) + if (dst->rate_tokens > XRLIM_BURST_FACTOR*timeout) dst->rate_tokens = XRLIM_BURST_FACTOR*timeout; if (dst->rate_tokens >= timeout) { dst->rate_tokens -= timeout; @@ -537,7 +540,17 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info) /* * Construct source address and options. */ - + +#ifdef CONFIG_IP_ROUTE_NAT + /* + * Restore original addresses if packet has been translated. 
+ */ + if (rt->rt_flags&RTCF_NAT && IPCB(skb_in)->flags&IPSKB_TRANSLATED) { + iph->daddr = rt->key.dst; + iph->saddr = rt->key.src; + } +#endif + saddr = iph->daddr; if (!(rt->rt_flags & RTCF_LOCAL)) saddr = 0; @@ -587,8 +600,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info) room = rt->u.dst.pmtu; if (room > 576) room = 576; - room -= sizeof(struct iphdr) - icmp_param.replyopts.optlen; - + room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen; + room -= sizeof(struct icmphdr); + icmp_param.data_len=(iph->ihl<<2)+skb_in->len; if (icmp_param.data_len > room) icmp_param.data_len = room; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 74757adf8..af49104b3 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -8,7 +8,7 @@ * the older version didn't come out right using gcc 2.5.8, the newer one * seems to fall out with gcc 2.6.2. * - * Version: $Id: igmp.c,v 1.26 1998/03/08 05:56:19 davem Exp $ + * Version: $Id: igmp.c,v 1.27 1998/08/26 12:03:39 davem Exp $ * * Authors: * Alan Cox @@ -563,7 +563,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) goto done; } - iml = (struct ip_mc_socklist *)kmalloc(sizeof(*iml), GFP_KERNEL); + iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); err = -EADDRINUSE; for (i=sk->ip_mc_list; i; i=i->next) { @@ -590,7 +590,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) done: rtnl_shunlock(); if (iml) - kfree(iml); + sock_kfree_s(sk, iml, sizeof(*iml)); return err; } @@ -613,7 +613,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) in_dev = inetdev_by_index(iml->multi.imr_ifindex); if (in_dev) ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr); - kfree_s(iml, sizeof(*iml)); + sock_kfree_s(sk, iml, sizeof(*iml)); return 0; } } @@ -633,7 +633,7 @@ void ip_mc_drop_socket(struct sock *sk) sk->ip_mc_list = iml->next; if ((in_dev = inetdev_by_index(iml->multi.imr_ifindex)) != NULL) ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); - kfree_s(iml, sizeof(*iml)); + sock_kfree_s(sk, iml, sizeof(*iml)); } } diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index e136a16ca..8cd0d5962 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -5,7 +5,7 @@ * * The IP forwarding functionality. * - * Version: $Id: ip_forward.c,v 1.40 1998/03/08 05:56:20 davem Exp $ + * Version: $Id: ip_forward.c,v 1.41 1998/08/26 12:03:42 davem Exp $ * * Authors: see ip.c * @@ -79,10 +79,8 @@ int ip_forward(struct sk_buff *skb) int fw_res = 0; #endif - if (IPCB(skb)->opt.router_alert) { - if (ip_call_ra_chain(skb)) - return 0; - } + if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb)) + return 0; if (skb->pkt_type != PACKET_HOST) goto drop; @@ -110,7 +108,7 @@ int ip_forward(struct sk_buff *skb) goto local_pkt; #endif - if (ip_decrease_ttl(iph) <= 0) + if (iph->ttl <= 1) goto too_many_hops; if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) @@ -121,22 +119,30 @@ int ip_forward(struct sk_buff *skb) * after asking the firewall permission to do so. */ - skb->priority = rt->u.dst.priority; + skb->priority = rt_tos2priority(iph->tos); dev2 = rt->u.dst.dev; - mtu = dev2->mtu; + mtu = rt->u.dst.pmtu; #ifdef CONFIG_NET_SECURITY call_fw_firewall(PF_SECURITY, dev2, NULL, &mtu, NULL); #endif /* - * In IP you never have to forward a frame on the interface that it - * arrived upon. We now generate an ICMP HOST REDIRECT giving the route + * We now generate an ICMP HOST REDIRECT giving the route * we calculated. 
*/ if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr) ip_rt_send_redirect(skb); - + + /* We are about to mangle packet. Copy it! */ + if ((skb = skb_cow(skb, dev2->hard_header_len)) == NULL) + return -1; + iph = skb->nh.iph; + opt = &(IPCB(skb)->opt); + + /* Decrease ttl after skb cow done */ + ip_decrease_ttl(iph); + /* * We now may allocate a new buffer, and copy the datagram into it. * If the indicated interface is up and running, kick it. @@ -147,14 +153,6 @@ int ip_forward(struct sk_buff *skb) #ifdef CONFIG_IP_ROUTE_NAT if (rt->rt_flags & RTCF_NAT) { - if (skb_headroom(skb) < dev2->hard_header_len || skb_cloned(skb)) { - struct sk_buff *skb2; - skb2 = skb_realloc_headroom(skb, (dev2->hard_header_len + 15)&~15); - kfree_skb(skb); - if (skb2 == NULL) - return -1; - skb = skb2; - } if (ip_do_nat(skb)) { kfree_skb(skb); return -1; @@ -243,18 +241,6 @@ skip_call_fw_firewall: } #endif - if (skb_headroom(skb) < dev2->hard_header_len || skb_cloned(skb)) { - struct sk_buff *skb2; - skb2 = skb_realloc_headroom(skb, (dev2->hard_header_len + 15)&~15); - kfree_skb(skb); - - if (skb2 == NULL) { - NETDEBUG(printk(KERN_ERR "\nIP: No memory available for IP forward\n")); - return -1; - } - skb = skb2; - iph = skb2->nh.iph; - } #ifdef CONFIG_FIREWALL if ((fw_res = call_out_firewall(PF_INET, dev2, iph, NULL,&skb)) < FW_ACCEPT) { diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 9641aaae3..8a0e40f0f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -5,7 +5,7 @@ * * The IP fragmentation functionality. * - * Version: $Id: ip_fragment.c,v 1.38 1998/06/16 04:38:29 davem Exp $ + * Version: $Id: ip_fragment.c,v 1.39 1998/08/26 10:35:26 davem Exp $ * * Authors: Fred N. van Kempen * Alan Cox diff --git a/net/ipv4/ip_fw.c b/net/ipv4/ip_fw.c index 57e7761e3..b45457c72 100644 --- a/net/ipv4/ip_fw.c +++ b/net/ipv4/ip_fw.c @@ -427,18 +427,15 @@ static void dump_packet(const struct iphdr *ip, printk("\n"); } -/* function for checking chain labels for user space. Makes sure that - * there are no special characters in the string */ +/* function for checking chain labels for user space. */ static int check_label(ip_chainlabel label) { unsigned int i; - - for (i = 0; i < IP_FW_MAX_LABEL_LENGTH + 1 && label[i]; i++) - if (label[i] <= ' ') - return 0; - if (i == IP_FW_MAX_LABEL_LENGTH+1) - return 0; - return 1; + /* strlen must be < IP_FW_MAX_LABEL_LENGTH. 
*/ + for (i = 0; i < IP_FW_MAX_LABEL_LENGTH + 1; i++) + if (label[i] == '\0') return 1; + + return 0; } /* This function returns a pointer to the first chain with a label @@ -1098,6 +1095,9 @@ static int create_chain(ip_chainlabel label) { struct ip_chain *tmp; + if (!check_label(label)) + return EINVAL; + FWC_HAVE_LOCK(fwc_wlocks); for (tmp = ip_fw_chains; tmp->next; tmp = tmp->next) if (strcmp(tmp->label,label) == 0) @@ -1512,14 +1512,14 @@ static int dump_rule(char *buffer, "%9s " /* Chain name */ "%08lX/%08lX->%08lX/%08lX " /* Source & Destination IPs */ "%.16s " /* Interface */ - "%hX %hX " /* fw_flg and fw_invflg fields */ - "%hu " /* Protocol */ + "%X %X " /* fw_flg and fw_invflg fields */ + "%u " /* Protocol */ "%-9u %-9u %-9u %-9u " /* Packet & byte counters */ - "%hu-%hu %hu-%hu " /* Source & Dest port ranges */ + "%u-%u %u-%u " /* Source & Dest port ranges */ "A%02X X%02X " /* TOS and and xor masks */ "%08X " /* Redirection port */ "%u " /* fw_mark field */ - "%hu " /* output size */ + "%u " /* output size */ "%9s\n", /* Target */ chainlabel, ntohl(rule->ipfw.fw_src.s_addr), diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 04fde6120..6a2e4eca5 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -684,7 +684,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct device *dev) else if (skb->protocol == __constant_htons(ETH_P_IPV6)) { struct rt6_info *rt6 = (struct rt6_info*)skb->dst; - if (rt6 && mtu < rt6->u.dst.pmtu && mtu >= 576) { + if (rt6 && mtu < rt6->u.dst.pmtu && mtu >= IPV6_MIN_MTU) { if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) || rt6->rt6i_dst.plen == 128) { rt6->rt6i_flags |= RTF_MODIFIED; @@ -692,7 +692,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct device *dev) } } - if (mtu >= 576 && mtu < skb->len - tunnel->hlen + gre_hlen) { + if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) { icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); ip_rt_put(rt); goto tx_error; @@ -722,6 +722,8 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct device *dev) tunnel->recursion--; return 0; } + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); dev_kfree_skb(skb); skb = new_skb; } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index f56a90332..e06ad8206 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) module. * - * Version: $Id: ip_input.c,v 1.31 1998/05/17 02:19:15 freitag Exp $ + * Version: $Id: ip_input.c,v 1.33 1998/08/26 12:03:47 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -115,38 +115,31 @@ * 2 of the License, or (at your option) any later version. */ -#include #include #include #include -#include -#include #include #include #include +#include #include #include #include #include #include #include -#include -#include #include #include #include #include -#include -#include #include #include #include #include #include #include -#include #include #ifdef CONFIG_IP_MASQUERADE #include @@ -154,7 +147,6 @@ #include #include #include -#include /* * SNMP management statistics @@ -199,6 +191,9 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) return 0; } +/* + * Process Router Attention IP option + */ int ip_call_ra_chain(struct sk_buff *skb) { struct ip_ra_chain *ra; @@ -229,6 +224,9 @@ int ip_call_ra_chain(struct sk_buff *skb) return 0; } +/* + * Deliver IP Packets to the higher protocol layers. 
+ */ int ip_local_deliver(struct sk_buff *skb) { struct iphdr *iph = skb->nh.iph; @@ -282,9 +280,11 @@ skb->h.raw = skb->nh.raw + iph->ihl*4; /* - * Deliver to raw sockets. This is fun as to avoid copies we want to make no surplus copies. + * Deliver to raw sockets. This is fun as to avoid copies we want to make no + * surplus copies. * * RFC 1122: SHOULD pass TOS value up to the transport layer. + * -> It does. And not only TOS, but all IP header. */ /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */ @@ -309,10 +309,7 @@ skb1 = skb_clone(skb, GFP_ATOMIC); if(skb1) { - if(ipsec_sk_policy(raw_sk,skb1)) - raw_rcv(raw_sk, skb1); - else - kfree_skb(skb1); + raw_rcv(raw_sk, skb1); } } raw_sk = sknext; @@ -372,10 +369,8 @@ if(raw_sk!=NULL) /* Shift to last raw user */ { - if(ipsec_sk_policy(raw_sk, skb)) - raw_rcv(raw_sk, skb); - else - kfree_skb(skb); + raw_rcv(raw_sk, skb); + } else if (!flag) /* Free and report errors */ { @@ -386,15 +381,16 @@ return(0); } +/* + * Main IP Receive routine. + */ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) { struct iphdr *iph = skb->nh.iph; - struct ip_options * opt = NULL; - int err; /* - * When interface is in promisc. mode, drop all the crap - * that it receives, do not truing to analyse it. + * When the interface is in promisc. mode, drop all the crap + * that it receives, do not try to analyse it. */ if (skb->pkt_type == PACKET_OTHERHOST) goto drop; @@ -412,24 +408,32 @@ * 4. Doesn't have a bogus length */ - if (skb->len<sizeof(struct iphdr) || iph->ihl<5 || iph->version != 4 -#ifndef CONFIG_IP_ROUTER - || ip_fast_csum((unsigned char *)iph, iph->ihl) !=0 -#endif - || skb->len < ntohs(iph->tot_len)) - goto inhdr_error; + if (skb->len < sizeof(struct iphdr)) + goto inhdr_error; + if (iph->ihl < 5 || iph->version != 4 || ip_fast_csum((u8 *)iph, iph->ihl) != 0) + goto inhdr_error; + + { + __u32 len = ntohs(iph->tot_len); + if (skb->len < len) + goto inhdr_error; /* * Our transport medium may have padded the buffer out. Now we know it * is IP we can trim to the true length of the frame. * Note this now means skb->len holds ntohs(iph->tot_len). */ - __skb_trim(skb, ntohs(iph->tot_len)); + __skb_trim(skb, len); + } + + /* + * Initialise the virtual path cache for the packet. It describes + * how the packet travels inside Linux networking. + */ if (skb->dst == NULL) { - err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev); - if (err) - goto drop; + if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev)) + goto drop; #ifdef CONFIG_CPU_IS_SLOW if (net_cpu_congestion > 10 && !(iph->tos&IPTOS_RELIABILITY) && IPTOS_PREC(iph->tos) < IPTOS_PREC_INTERNETCONTROL) { @@ -449,6 +453,21 @@ #endif if (iph->ihl > 5) { + struct ip_options *opt; + + /* It looks as overkill, because not all IP options require packet mangling. But it is the easiest for now, especially taking into account that combination of IP options and running sniffer is extremely rare condition.
+ --ANK (980813) + */ + + skb = skb_cow(skb, skb_headroom(skb)); + if (skb == NULL) + return 0; + iph = skb->nh.iph; + skb->ip_summed = 0; if (ip_options_compile(NULL, skb)) goto inhdr_error; @@ -458,8 +477,8 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) struct in_device *in_dev = dev->ip_ptr; if (in_dev && !IN_DEV_SOURCE_ROUTE(in_dev)) { if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) - printk(KERN_INFO "source route option %08lx -> %08lx\n", - ntohl(iph->saddr), ntohl(iph->daddr)); + printk(KERN_INFO "source route option %d.%d.%d.%d -> %d.%d.%d.%d\n", + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); goto drop; } if (ip_options_rcv_srr(skb)) diff --git a/net/ipv4/ip_nat_dumb.c b/net/ipv4/ip_nat_dumb.c index 07a7afc23..c48ea9263 100644 --- a/net/ipv4/ip_nat_dumb.c +++ b/net/ipv4/ip_nat_dumb.c @@ -5,7 +5,7 @@ * * Dumb Network Address Translation. * - * Version: $Id: ip_nat_dumb.c,v 1.3 1998/03/15 03:31:44 davem Exp $ + * Version: $Id: ip_nat_dumb.c,v 1.4 1998/08/26 12:03:49 davem Exp $ * * Authors: Alexey Kuznetsov, * @@ -17,11 +17,12 @@ * Fixes: * Rani Assaf : A zero checksum is a special case * only in UDP + * Rani Assaf : Added ICMP messages rewriting + * * * NOTE: It is just working model of real NAT. */ -#include #include #include #include @@ -36,9 +37,6 @@ #include #include #include -#ifdef CONFIG_IP_MASQUERADE -#include -#endif #include #include #include @@ -68,20 +66,48 @@ ip_do_nat(struct sk_buff *skb) switch(iph->protocol) { case IPPROTO_TCP: - cksum = (u16*)&((struct tcphdr*)(((char*)iph) + iph->ihl*4))->check; + cksum = (u16*)&((struct tcphdr*)(((char*)iph) + (iph->ihl<<2)))->check; + if ((u8*)(cksum+1) > skb->tail) + goto truncated; check = csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 0, ~(*cksum)); *cksum = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0, ~check); break; case IPPROTO_UDP: - cksum = (u16*)&((struct udphdr*)(((char*)iph) + iph->ihl*4))->check; + cksum = (u16*)&((struct udphdr*)(((char*)iph) + (iph->ihl<<2)))->check; + if ((u8*)(cksum+1) > skb->tail) + goto truncated; if ((check = *cksum) != 0) { check = csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 0, ~check); check = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0, ~check); *cksum = check ? 
: 0xFFFF; } + break; + case IPPROTO_ICMP: + { + struct icmphdr *icmph = (struct icmphdr*)((char*)iph + (iph->ihl<<2)); + struct iphdr *ciph; + + if ((icmph->type != ICMP_DEST_UNREACH) && + (icmph->type != ICMP_TIME_EXCEEDED) && + (icmph->type != ICMP_PARAMETERPROB)) break; + + ciph = (struct iphdr *) (icmph + 1); + + if ((u8*)(ciph+1) > skb->tail) + goto truncated; + + if (rt->rt_flags&RTCF_DNAT && ciph->saddr == odaddr) + ciph->saddr = iph->daddr; + if (rt->rt_flags&RTCF_SNAT && ciph->daddr == osaddr) + ciph->daddr = iph->saddr; + break; + } default: break; } } return 0; + +truncated: + return -EINVAL; } diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 3e3674ef7..153c7a391 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -5,7 +5,7 @@ * * The options processing module for ip.c * - * Version: $Id: ip_options.c,v 1.13 1998/02/12 07:43:12 davem Exp $ + * Version: $Id: ip_options.c,v 1.14 1998/08/26 12:03:51 davem Exp $ * * Authors: A.N.Kuznetsov * @@ -451,7 +451,7 @@ eol: error: if (skb) { - icmp_send(skb, ICMP_PARAMETERPROB, 0, pp_ptr-iph); + icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24)); kfree_skb(skb); } return -EINVAL; @@ -579,7 +579,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) if (rt->rt_type == RTN_UNICAST) { if (!opt->is_strictroute) return 0; - icmp_send(skb, ICMP_PARAMETERPROB, 0, 16); + icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24)); return -EINVAL; } if (rt->rt_type != RTN_LOCAL) @@ -587,7 +587,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) { if (srrptr + 3 > srrspace) { - icmp_send(skb, ICMP_PARAMETERPROB, 0, opt->srr+2); + icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24)); return -EINVAL; } memcpy(&nexthop, &optptr[srrptr-1], 4); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 0527c1b0b..9250051ab 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) output module. * - * Version: $Id: ip_output.c,v 1.59 1998/07/15 05:05:15 davem Exp $ + * Version: $Id: ip_output.c,v 1.61 1998/08/26 12:03:54 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -171,14 +171,7 @@ int ip_mc_output(struct sk_buff *skb) */ if (rt->rt_flags&RTCF_MULTICAST && (!sk || sk->ip_mc_loop)) { -#ifndef CONFIG_IP_MROUTE -#if 1 - /* It should never occur. Delete it eventually. --ANK */ - if (!(rt->rt_flags&RTCF_LOCAL) || (dev->flags&IFF_LOOPBACK)) - printk(KERN_DEBUG "ip_mc_output (mc): it should never occur\n"); - else -#endif -#else +#ifdef CONFIG_IP_MROUTE /* Small optimization: do not loopback not local frames, which returned after forwarding; they will be dropped by ip_mr_input in any case. @@ -199,15 +192,8 @@ int ip_mc_output(struct sk_buff *skb) } } - if (rt->rt_flags&RTCF_BROADCAST) { -#if 1 - /* It should never occur. Delete it eventually. 
--ANK */ - if (!(rt->rt_flags&RTCF_LOCAL) || (dev->flags&IFF_LOOPBACK)) - printk(KERN_DEBUG "ip_mc_output (brd): it should never occur!\n"); - else -#endif + if (rt->rt_flags&RTCF_BROADCAST) dev_loopback_xmit(skb); - } return ip_finish_output(skb); } @@ -281,8 +267,6 @@ void ip_queue_xmit(struct sk_buff *skb) iph->ihl = 5; iph->tos = sk->ip_tos; iph->frag_off = 0; - if(sk->ip_pmtudisc == IP_PMTUDISC_WANT && !(rt->u.dst.mxlock & (1 << RTAX_MTU))) - iph->frag_off |= __constant_htons(IP_DF); iph->ttl = sk->ip_ttl; iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; @@ -316,6 +300,8 @@ void ip_queue_xmit(struct sk_buff *skb) kfree_skb(skb); if (skb2 == NULL) return; + if (sk) + skb_set_owner_w(skb, sk); skb = skb2; iph = skb->nh.iph; } @@ -326,6 +312,9 @@ void ip_queue_xmit(struct sk_buff *skb) if (tot_len > rt->u.dst.pmtu) goto fragment; + if (sk->ip_pmtudisc == IP_PMTUDISC_WANT && !(rt->u.dst.mxlock & (1 << RTAX_MTU))) + iph->frag_off |= __constant_htons(IP_DF); + /* Add an IP checksum. */ ip_send_check(iph); @@ -334,7 +323,15 @@ void ip_queue_xmit(struct sk_buff *skb) return; fragment: - if ((iph->frag_off & htons(IP_DF)) != 0) { + if (sk->ip_pmtudisc == IP_PMTUDISC_WANT && + !(rt->u.dst.mxlock & (1 << RTAX_MTU)) && + tot_len > (iph->ihl<<2) + sizeof(struct tcphdr)+16) { + /* Reject packet ONLY if TCP might fragment + it itself, if were careful enough. + Test is not precise (f.e. it does not take sacks + into account). Actually, tcp should make it. --ANK (980801) + */ + iph->frag_off |= __constant_htons(IP_DF); printk(KERN_DEBUG "sending pkt_too_big to self\n"); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(rt->u.dst.pmtu)); @@ -701,7 +698,6 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) unsigned int mtu, hlen, left, len; int offset; int not_last_frag; - u16 dont_fragment; struct rtable *rt = (struct rtable*)skb->dst; dev = rt->u.dst.dev; @@ -726,10 +722,14 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) * The protocol doesn't seem to say what to do in the case that the * frame + options doesn't fit the mtu. As it used to fall down dead * in this case we were fortunate it didn't happen + * + * It is impossible, because mtu>=68. --ANK (980801) */ +#ifdef CONFIG_NET_PARANOIA if (mtu<8) goto fail; +#endif /* * Fragment the datagram. @@ -738,14 +738,6 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; not_last_frag = iph->frag_off & htons(IP_MF); - /* - * Nice moment: if DF is set and we are here, - * it means that packet should be fragmented and - * DF is set on fragments. If it works, - * path MTU discovery can be done by ONE segment(!). --ANK - */ - dont_fragment = iph->frag_off & htons(IP_DF); - /* * Keep copying data until we run out. */ @@ -805,7 +797,7 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) * Fill in the new header fields. */ iph = skb2->nh.iph; - iph->frag_off = htons((offset >> 3))|dont_fragment; + iph->frag_off = htons((offset >> 3)); /* ANK: dirty, but effective trick. 
Upgrade options only if * the segment to be fragmented was THE FIRST (otherwise, @@ -858,11 +850,6 @@ static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset, int len; int hdrflag = 1; -#if 0 - printk("ip_reply_glue_bits: offset=%u,flen=%u iov[0].l=%u,iov[1].len=%u\n", - offset,fraglen,dp->iov[0].iov_len,dp->iov[1].iov_len); -#endif - iov = &dp->iov[0]; if (offset >= iov->iov_len) { offset -= iov->iov_len; @@ -871,12 +858,6 @@ static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset, } len = iov->iov_len - offset; if (fraglen > len) { /* overlapping. */ -#if 1 - if (iov > &dp->iov[0]) { - printk("frag too long! (o=%u,fl=%u)\n",offset,fraglen); - return -1; - } -#endif dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len, dp->csum); offset = 0; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 8f712c801..3d8f4fab6 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -5,7 +5,7 @@ * * The IP to API glue. * - * Version: $Id: ip_sockglue.c,v 1.36 1998/07/15 05:05:06 davem Exp $ + * Version: $Id: ip_sockglue.c,v 1.37 1998/08/26 12:03:57 davem Exp $ * * Authors: see ip.c * @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -36,6 +37,9 @@ #include #include #include +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#include +#endif #include @@ -140,6 +144,10 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc) struct cmsghdr *cmsg; for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + if ((unsigned long)(((char*)cmsg - (char*)msg->msg_control) + + cmsg->cmsg_len) > msg->msg_controllen) { + return -EINVAL; + } if (cmsg->cmsg_level != SOL_IP) continue; switch (cmsg->cmsg_type) { @@ -255,22 +263,30 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt case IP_OPTIONS: { struct ip_options * opt = NULL; - struct ip_options * old_opt; if (optlen > 40 || optlen < 0) return -EINVAL; err = ip_options_get(&opt, optval, optlen, 1); if (err) return err; - /* - * ANK: I'm afraid that receive handler may change - * options from under us. 
- */ - cli(); - old_opt = sk->opt; - sk->opt = opt; - sti(); - if (old_opt) - kfree_s(old_opt, sizeof(struct ip_options) + old_opt->optlen); + start_bh_atomic(); + if (sk->type == SOCK_STREAM) { + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (sk->family == PF_INET || + ((tcp_connected(sk->state) || sk->state == TCP_SYN_SENT) + && sk->daddr != LOOPBACK4_IPV6)) { +#endif + if (opt) + tp->ext_header_len = opt->optlen; + tcp_sync_mss(sk, tp->pmtu_cookie); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + } +#endif + } + opt = xchg(&sk->opt, opt); + end_bh_atomic(); + if (opt) + kfree_s(opt, sizeof(struct ip_options) + opt->optlen); return 0; } case IP_PKTINFO: @@ -497,11 +513,11 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *op { unsigned char optbuf[sizeof(struct ip_options)+40]; struct ip_options * opt = (struct ip_options*)optbuf; - cli(); + start_bh_atomic(); opt->optlen = 0; if (sk->opt) memcpy(optbuf, sk->opt, sizeof(struct ip_options)+sk->opt->optlen); - sti(); + end_bh_atomic(); if (opt->optlen == 0) return put_user(0, optlen); @@ -511,7 +527,7 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *op if(put_user(len, optlen)) return -EFAULT; if(copy_to_user(optval, opt->__data, len)) - return -EFAULT; + return -EFAULT; return 0; } case IP_PKTINFO: diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index d0b3b5ff2..778ac15c1 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -1,7 +1,7 @@ /* * Linux NET3: IP/IP protocol decoder. * - * Version: $Id: ipip.c,v 1.22 1998/03/08 05:56:27 davem Exp $ + * Version: $Id: ipip.c,v 1.23 1998/08/26 12:04:00 davem Exp $ * * Authors: * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 @@ -551,6 +551,8 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct device *dev) tunnel->recursion--; return 0; } + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); dev_kfree_skb(skb); skb = new_skb; } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 29fd4b3ad..49cd6daf5 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -9,7 +9,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Version: $Id: ipmr.c,v 1.35 1998/05/13 06:23:24 davem Exp $ + * Version: $Id: ipmr.c,v 1.36 1998/08/26 12:04:03 davem Exp $ * * Fixes: * Michael Chastain : Incorrect size of copying. @@ -55,6 +55,8 @@ #include #include #include +#include +#include #include #include @@ -1044,7 +1046,12 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, dev = rt->u.dst.dev; - if (skb->len+encap > dev->mtu && (ntohs(iph->frag_off) & IP_DF)) { + if (skb->len+encap > rt->u.dst.pmtu /* && (ntohs(iph->frag_off) & IP_DF) */) { + /* Do not fragment multicasts. Alas, IPv4 does not + allow to send ICMP, so that packets will disappear + to blackhole. 
+ */ + ip_statistics.IpFragFails++; ip_rt_put(rt); return; @@ -1052,11 +1059,6 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, encap += dev->hard_header_len; - if (skb->len+encap > 65534) { - ip_rt_put(rt); - return; - } - if (skb_headroom(skb) < encap || skb_cloned(skb) || !last) skb2 = skb_realloc_headroom(skb, (encap + 15)&~15); else if (atomic_read(&skb->users) != 1) @@ -1076,18 +1078,37 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, dst_release(skb2->dst); skb2->dst = &rt->u.dst; - iph = skb2->nh.iph; ip_decrease_ttl(iph); +#ifdef CONFIG_FIREWALL + if (call_fw_firewall(PF_INET, vif->dev, skb2->nh.iph, NULL, &skb2) < FW_ACCEPT) { + kfree_skb(skb2); + return; + } + if (call_out_firewall(PF_INET, vif->dev, skb2->nh.iph, NULL, &skb2) < FW_ACCEPT) { + kfree_skb(skb2); + return; + } +#endif if (vif->flags & VIFF_TUNNEL) { ip_encap(skb2, vif->local, vif->remote); +#ifdef CONFIG_FIREWALL + /* Double output firewalling on tunnels: one is on tunnel + another one is on real device. + */ + if (call_out_firewall(PF_INET, dev, skb2->nh.iph, NULL, &skb2) < FW_ACCEPT) { + kfree_skb(skb2); + return; + } +#endif ((struct ip_tunnel *)vif->dev->priv)->stat.tx_packets++; ((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb2->len; } IPCB(skb2)->flags |= IPSKB_FORWARDED; + /* * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally * not only before forwarding, but after forwarding on all output @@ -1351,21 +1372,12 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) struct rtnexthop *nhp; struct device *dev = vif_table[c->mfc_parent].dev; u8 *b = skb->tail; - -#ifdef CONFIG_RTNL_OLD_IFINFO - if (dev) { - u8 *o = skb->tail; - RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex); - rtm->rtm_optlen += skb->tail - o; - } -#else struct rtattr *mp_head; if (dev) RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex); mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0)); -#endif for (ct = c->mfc_minvif; ct < c->mfc_maxvif; ct++) { if (c->mfc_ttls[ct] < 255) { @@ -1376,15 +1388,10 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) nhp->rtnh_hops = c->mfc_ttls[ct]; nhp->rtnh_ifindex = vif_table[ct].dev->ifindex; nhp->rtnh_len = sizeof(*nhp); -#ifdef CONFIG_RTNL_OLD_IFINFO - rtm->rtm_nhs++; -#endif } } -#ifndef CONFIG_RTNL_OLD_IFINFO mp_head->rta_type = RTA_MULTIPATH; mp_head->rta_len = skb->tail - (u8*)mp_head; -#endif rtm->rtm_type = RTN_MULTICAST; return 1; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index b6e06242f..6f06f4345 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -7,7 +7,7 @@ * PROC file system. It is mainly used for debugging and * statistics. * - * Version: $Id: proc.c,v 1.30 1998/04/16 16:29:05 freitag Exp $ + * Version: $Id: proc.c,v 1.31 1998/07/29 20:09:25 freitag Exp $ * * Authors: Fred N. van Kempen, * Gerald J. 
Heim, @@ -357,12 +357,15 @@ int netstat_get_info(char *buffer, char **start, off_t offset, int length, int d len = sprintf(buffer, "TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed" - "EmbryonicRsts\n" - "TcpExt: %lu %lu %lu %lu\n", + " EmbryonicRsts PruneCalled RcvPruned OfoPruned\n" + "TcpExt: %lu %lu %lu %lu %lu %lu %lu\n", net_statistics.SyncookiesSent, net_statistics.SyncookiesRecv, net_statistics.SyncookiesFailed, - net_statistics.EmbryonicRsts); + net_statistics.EmbryonicRsts, + net_statistics.PruneCalled, + net_statistics.RcvPruned, + net_statistics.OfoPruned); if (offset >= len) { diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 8d8bdab97..e10ddc0dd 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -5,7 +5,7 @@ * * RAW - implementation of IP "raw" sockets. * - * Version: $Id: raw.c,v 1.36 1998/05/08 21:06:29 davem Exp $ + * Version: $Id: raw.c,v 1.37 1998/08/26 12:04:07 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -152,7 +152,7 @@ void raw_err (struct sock *sk, struct sk_buff *skb) int type = skb->h.icmph->type; int code = skb->h.icmph->code; - if (sk->ip_recverr && !sk->sock_readers) { + if (sk->ip_recverr) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2 && sock_queue_err_skb(sk, skb2)) kfree_skb(skb); @@ -194,10 +194,6 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) skb->h.raw = skb->nh.raw; - if (sk->sock_readers) { - __skb_queue_tail(&sk->back_log, skb); - return 0; - } raw_rcv_skb(sk, skb); return 0; } @@ -379,10 +375,33 @@ done: static void raw_close(struct sock *sk, unsigned long timeout) { + /* Observation: when raw_close is called, processes have + no access to socket anymore. But net still has. + Step one, detach it from networking: + + A. Remove from hash tables. + */ sk->state = TCP_CLOSE; + raw_v4_unhash(sk); + /* + B. Raw sockets may have direct kernel refereneces. Kill them. + */ ip_ra_control(sk, 0, NULL); + + /* In this point socket cannot receive new packets anymore */ + + + /* But we still have packets pending on receive + queue and probably, our own packets waiting in device queues. + sock_destroy will drain receive queue, but transmitted + packets will delay socket destruction. + Set sk->dead=1 in order to prevent wakeups, when these + packet will be freed. + */ sk->dead=1; destroy_sock(sk); + + /* That's all. No races here. */ } /* This gets rid of all the nasties in af_inet. -DaveM */ @@ -474,14 +493,8 @@ done: static int raw_init(struct sock *sk) { struct raw_opt *tp = &(sk->tp_pinfo.tp_raw4); - if (sk->num == IPPROTO_ICMP) { + if (sk->num == IPPROTO_ICMP) memset(&tp->filter, 0, sizeof(tp->filter)); - - /* By default block ECHO and TIMESTAMP requests */ - - set_bit(ICMP_ECHO, &tp->filter); - set_bit(ICMP_TIMESTAMP, &tp->filter); - } return 0; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e10f65c68..5788342c9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -5,7 +5,7 @@ * * ROUTE - implementation of the IP router. * - * Version: $Id: route.c,v 1.54 1998/07/15 05:05:22 davem Exp $ + * Version: $Id: route.c,v 1.57 1998/08/26 12:04:09 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -48,6 +48,7 @@ * route.c and rewritten from scratch. * Andi Kleen : Load-limit warning messages. * Vitaly E. Lavrov : Transparent proxy revived after year coma. + * Vitaly E. Lavrov : Race condition in ip_route_input_slow. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -90,6 +91,8 @@ #include #endif +#define IP_MAX_MTU 0xFFF0 + #define RT_GC_TIMEOUT (300*HZ) int ip_rt_min_delay = 2*HZ; @@ -166,7 +169,7 @@ __u8 ip_tos2prio[16] = { * Route cache. */ -static struct rtable *rt_hash_table[RT_HASH_DIVISOR]; +struct rtable *rt_hash_table[RT_HASH_DIVISOR]; static struct rtable * rt_intern_hash(unsigned hash, struct rtable * rth); @@ -246,6 +249,13 @@ static __inline__ void rt_free(struct rtable *rt) dst_free(&rt->u.dst); } +static __inline__ int rt_fast_clean(struct rtable *rth) +{ + /* Kill broadcast/multicast entries very aggresively, if they + collide in hash table with more useful entries */ + return ((rth->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST)) + && rth->key.iif && rth->u.rt_next); +} static void rt_check_expire(unsigned long dummy) { @@ -255,43 +265,30 @@ static void rt_check_expire(unsigned long dummy) unsigned long now = jiffies; for (i=0; iu.rt_next; - /* * Cleanup aged off entries. */ if (!atomic_read(&rth->u.dst.use) && - (now - rth->u.dst.lastuse > ip_rt_gc_timeout)) { - *rthp = rth_next; -#if RT_CACHE_DEBUG >= 2 - printk("rt_check_expire clean %02x@%08x\n", rover, rth->rt_dst); -#endif + (now - rth->u.dst.lastuse > tmo + || rt_fast_clean(rth))) { + *rthp = rth->u.rt_next; rt_free(rth); continue; } - if (!rth_next) - break; - - if ( (long)(rth_next->u.dst.lastuse - rth->u.dst.lastuse) > RT_CACHE_BUBBLE_THRESHOLD || - ((long)(rth->u.dst.lastuse - rth_next->u.dst.lastuse) < 0 && - atomic_read(&rth->u.dst.refcnt) < atomic_read(&rth_next->u.dst.refcnt))) { -#if RT_CACHE_DEBUG >= 2 - printk("rt_check_expire bubbled %02x@%08x<->%08x\n", rover, rth->rt_dst, rth_next->rt_dst); -#endif - *rthp = rth_next; - rth->u.rt_next = rth_next->u.rt_next; - rth_next->u.rt_next = rth; - rthp = &rth_next->u.rt_next; - continue; - } + tmo >>= 1; rthp = &rth->u.rt_next; } + + if ((jiffies - now) > 0) + break; } rt_periodic_timer.expires = now + ip_rt_gc_interval; add_timer(&rt_periodic_timer); @@ -305,21 +302,14 @@ static void rt_run_flush(unsigned long dummy) rt_deadline = 0; for (i=0; iu.rt_next; - nr++; rth->u.rt_next = NULL; rt_free(rth); } -#if RT_CACHE_DEBUG >= 2 - if (nr > 0) - printk("rt_cache_flush: %d@%02x\n", nr, i); -#endif } } @@ -384,17 +374,23 @@ static int rt_garbage_collect(void) expire++; for (i=0; iu.rt_next) { if (atomic_read(&rth->u.dst.use) || - now - rth->u.dst.lastuse < expire) + (now - rth->u.dst.lastuse < tmo && !rt_fast_clean(rth))) { + tmo >>= 1; continue; + } *rthp = rth->u.rt_next; rth->u.rt_next = NULL; rt_free(rth); break; } + if ((jiffies-now)>0) + break; } last_gc = now; @@ -412,8 +408,6 @@ static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt) struct rtable *rth, **rthp; unsigned long now = jiffies; - rt->u.dst.priority = rt_tos2priority(rt->key.tos); - start_bh_atomic(); rthp = &rt_hash_table[hash]; @@ -793,19 +787,17 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res) if (fi) { if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) rt->rt_gateway = FIB_RES_GW(*res); -#ifndef CONFIG_RTNL_OLD_IFINFO rt->u.dst.mxlock = fi->fib_metrics[RTAX_LOCK-1]; rt->u.dst.pmtu = fi->fib_mtu; if (fi->fib_mtu == 0) { rt->u.dst.pmtu = rt->u.dst.dev->mtu; + if (rt->u.dst.pmtu > IP_MAX_MTU) + rt->u.dst.pmtu = IP_MAX_MTU; if (rt->u.dst.mxlock&(1<rt_gateway != rt->rt_dst && rt->u.dst.pmtu > 576) rt->u.dst.pmtu = 576; } -#else - rt->u.dst.pmtu = fi->fib_mtu ? 
: rt->u.dst.dev->mtu; -#endif rt->u.dst.window= fi->fib_window ? : 0; rt->u.dst.rtt = fi->fib_rtt ? : TCP_TIMEOUT_INIT; #ifdef CONFIG_NET_CLS_ROUTE @@ -813,6 +805,8 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res) #endif } else { rt->u.dst.pmtu = rt->u.dst.dev->mtu; + if (rt->u.dst.pmtu > IP_MAX_MTU) + rt->u.dst.pmtu = IP_MAX_MTU; rt->u.dst.window= 0; rt->u.dst.rtt = TCP_TIMEOUT_INIT; } @@ -930,7 +924,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr)) goto martian_source; - if (daddr == 0xFFFFFFFF) + if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0)) goto brd_input; /* Accept zero addresses only to limited broadcast; @@ -991,6 +985,11 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, fib_select_multipath(&key, &res); #endif out_dev = FIB_RES_DEV(res)->ip_ptr; + if (out_dev == NULL) { + if (net_ratelimit()) + printk(KERN_CRIT "Bug in ip_route_input_slow(). Please, report\n"); + return -EINVAL; + } err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst); if (err < 0) @@ -1312,15 +1311,14 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int tables are looked up with only one purpose: to catch if destination is gatewayed, rather than direct. Moreover, if MSG_DONTROUTE is set, - we send packet, no matter of routing tables - of ifaddr state. --ANK + we send packet, ignoring both routing tables + and ifaddr state. --ANK We could make it even if oif is unknown, likely IPv6, but we do not. */ - printk(KERN_DEBUG "Dest not on link. Forcing...\n"); if (key.src == 0) key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); goto make_route; @@ -1475,7 +1473,7 @@ int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif) #ifdef CONFIG_RTNETLINK -static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int nowait) +static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait) { struct rtable *rt = (struct rtable*)skb->dst; struct rtmsg *r; @@ -1485,11 +1483,7 @@ static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int #ifdef CONFIG_IP_MROUTE struct rtattr *eptr; #endif -#ifdef CONFIG_RTNL_OLD_IFINFO - unsigned char *o; -#else struct rtattr *mx; -#endif nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r)); r = NLMSG_DATA(nlh); @@ -1503,11 +1497,6 @@ static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int r->rtm_scope = RT_SCOPE_UNIVERSE; r->rtm_protocol = RTPROT_UNSPEC; r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED; -#ifdef CONFIG_RTNL_OLD_IFINFO - r->rtm_nhs = 0; - - o = skb->tail; -#endif RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst); if (rt->key.src) { r->rtm_src_len = 32; @@ -1521,11 +1510,6 @@ static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src); if (rt->rt_dst != rt->rt_gateway) RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway); -#ifdef CONFIG_RTNL_OLD_IFINFO - RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &rt->u.dst.pmtu); - RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &rt->u.dst.window); - RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &rt->u.dst.rtt); -#else mx = (struct rtattr*)skb->tail; RTA_PUT(skb, RTA_METRICS, 0, NULL); if (rt->u.dst.mxlock) @@ -1539,7 +1523,6 @@ static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int mx->rta_len = skb->tail - (u8*)mx; if (mx->rta_len == RTA_LENGTH(0)) skb_trim(skb, (u8*)mx - skb->data); -#endif 
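The rt_fill_info() hunk above replaces the old flat RTA_MTU/RTA_WINDOW/RTA_RTT attributes with a nested RTA_METRICS container, and the construction pattern is worth noting: emit a zero-length container, append the children, back-patch the container's rta_len, and trim the container away if nothing was nested. A condensed sketch (the RTAX_* child types shown are illustrative):

	struct rtattr *mx = (struct rtattr *)skb->tail;

	RTA_PUT(skb, RTA_METRICS, 0, NULL);	/* placeholder header      */
	if (rt->u.dst.mxlock)
		RTA_PUT(skb, RTAX_LOCK, sizeof(u32), &rt->u.dst.mxlock);
	RTA_PUT(skb, RTAX_MTU, sizeof(u32), &rt->u.dst.pmtu);

	mx->rta_len = skb->tail - (u8 *)mx;	/* back-patch total length */
	if (mx->rta_len == RTA_LENGTH(0))	/* nothing was nested      */
		skb_trim(skb, (u8 *)mx - skb->data);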
ci.rta_lastuse = jiffies - rt->u.dst.lastuse; ci.rta_used = atomic_read(&rt->u.dst.refcnt); ci.rta_clntref = atomic_read(&rt->u.dst.use); @@ -1549,9 +1532,6 @@ static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int eptr = (struct rtattr*)skb->tail; #endif RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci); -#ifdef CONFIG_RTNL_OLD_IFINFO - r->rtm_optlen = skb->tail - o; -#endif if (rt->key.iif) { #ifdef CONFIG_IP_MROUTE u32 dst = rt->rt_dst; @@ -1573,9 +1553,6 @@ static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int #endif { RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif); -#ifdef CONFIG_RTNL_OLD_IFINFO - r->rtm_optlen = skb->tail - o; -#endif } } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3d6f188e7..30a0b0dd6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.116 1998/07/26 03:06:54 davem Exp $ + * Version: $Id: tcp.c,v 1.119 1998/08/26 12:04:14 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -201,6 +201,7 @@ * tcp_do_sendmsg to avoid burstiness. * Eric Schenk : Fix fast close down bug with * shutdown() followed by close(). + * Andi Kleen : Make poll agree with SIGIO * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -383,13 +384,14 @@ * * ICMP messages (4.2.3.9) * MUST act on ICMP errors. (does) - * MUST slow transmission upon receipt of a Source Quench. (does) + * MUST slow transmission upon receipt of a Source Quench. (doesn't anymore + * because that is deprecated now by the IETF, can be turned on) * MUST NOT abort connection upon receipt of soft Destination * Unreachables (0, 1, 5), Time Exceededs and Parameter * Problems. (doesn't) * SHOULD report soft Destination Unreachables etc. to the - * application. (does, but may drop them in the ICMP error handler - * during an accept()) + * application. (does, except during SYN_RECV and may drop messages + * in some rare cases before accept() - ICMP is unreliable) * SHOULD abort connection upon receipt of hard Destination Unreachable * messages (2, 3, 4). (does, but see above) * @@ -397,7 +399,7 @@ * MUST reject as an error OPEN for invalid remote IP address. (does) * MUST ignore SYN with invalid source address. (does) * MUST silently discard incoming SYN for broadcast/multicast - * address. (I'm not sure if it does. Someone should check this.) + * address. (does) * * Asynchronous Reports (4.2.4.1) * MUST provide mechanism for reporting soft errors to application @@ -536,6 +538,21 @@ static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait) return 0; } +/* + * Compute minimal free write space needed to queue new packets. + */ +static inline int tcp_min_write_space(struct sock *sk, struct tcp_opt *tp) +{ + int space; +#if 1 /* This needs benchmarking and real world tests */ + space = max(tp->mss_cache + 128, MIN_WRITE_SPACE); +#else /* 2.0 way */ + /* More than half of the socket queue free? */ + space = atomic_read(&sk->wmem_alloc) / 2; +#endif + return space; +} + /* * Wait for a TCP event. * @@ -556,36 +573,56 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) mask = 0; if (sk->err) mask = POLLERR; + + /* + * POLLHUP is certainly not done right. But poll() doesn't + * have a notion of HUP in just one direction, and for a + * socket the read side is more interesting. 
+ * + * Some poll() documentation says that POLLHUP is incompatible + * with the POLLOUT/POLLWR flags, so somebody should check this + * all. But careful, it tends to be safer to return too many + * bits than too few, and you can easily break real applications + * if you don't tell them that something has hung up! + * + * Check-me. + */ + if (sk->shutdown & RCV_SHUTDOWN) + mask |= POLLHUP; + /* Connected? */ if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) { - int space; - - if (sk->shutdown & RCV_SHUTDOWN) - mask |= POLLHUP; - if ((tp->rcv_nxt != tp->copied_seq) && (tp->urg_seq != tp->copied_seq || tp->rcv_nxt != tp->copied_seq+1 || sk->urginline || !tp->urg_data)) mask |= POLLIN | POLLRDNORM; -#if 1 /* This needs benchmarking and real world tests */ - space = (sk->dst_cache ? sk->dst_cache->pmtu : sk->mss) + 128; - if (space < 2048) /* XXX */ - space = 2048; -#else /* 2.0 way */ - /* More than half of the socket queue free? */ - space = atomic_read(&sk->wmem_alloc) / 2; -#endif /* Always wake the user up when an error occurred */ - if (sock_wspace(sk) >= space || sk->err) + if (sock_wspace(sk) >= tcp_min_write_space(sk, tp) || sk->err) mask |= POLLOUT | POLLWRNORM; if (tp->urg_data & URG_VALID) - mask |= POLLPRI; + mask |= POLLPRI; } return mask; } +/* + * Socket write_space callback. + * This (or rather the sock_wake_async) should agree with poll. + */ +void tcp_write_space(struct sock *sk) +{ + if (sk->dead) + return; + + wake_up_interruptible(sk->sleep); + if (sock_wspace(sk) >= + tcp_min_write_space(sk, &(sk->tp_pinfo.af_tcp))) + sock_wake_async(sk->socket, 2); +} + + int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) { int answ; @@ -707,7 +744,7 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) int copied = 0; /* Verify that the socket is locked */ - if (!sk->sock_readers) + if (!atomic_read(&sk->sock_readers)) printk("tcp_do_sendmsg: socket not locked!\n"); /* Wait for a connection to finish. */ @@ -1025,7 +1062,7 @@ static void cleanup_rbuf(struct sock *sk, int copied) * which don't advertize a larger window. */ if((copied >= rcv_window_now) && - ((rcv_window_now + sk->mss) <= tp->window_clamp)) + ((rcv_window_now + tp->mss_cache) <= tp->window_clamp)) tcp_read_wakeup(sk); } } @@ -1389,7 +1426,7 @@ void tcp_close(struct sock *sk, unsigned long timeout) * Check whether the socket is locked ... supposedly * it's impossible to tcp_close() a locked socket. */ - if (sk->sock_readers) + if (atomic_read(&sk->sock_readers)) printk("tcp_close: socket already locked!\n"); /* We need to grab some memory, and put together a FIN, @@ -1543,16 +1580,18 @@ struct sock *tcp_accept(struct sock *sk, int flags) tcp_synq_unlink(tp, req, prev); newsk = req->sk; + req->class->destructor(req); tcp_openreq_free(req); sk->ack_backlog--; - /* FIXME: need to check here if newsk has already - * an soft_err or err set. - * We have two options here then: reply (this behaviour matches - * Solaris) or return the error to the application (old Linux) - */ + /* + * This does not pass any already set errors on the new socket + * to the user, but they will be returned on the first socket operation + * after the accept. 
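The new tcp_write_space() callback exists so that asynchronous SIGIO delivery and poll() apply the same writability test; if the two disagreed, an application could be told it may write and then block, or never be woken at all. A sketch of the shared predicate (tcp_writable() is a hypothetical name; both real call sites are in the hunks above):

	static inline int tcp_writable(struct sock *sk, struct tcp_opt *tp)
	{
		return sock_wspace(sk) >= tcp_min_write_space(sk, tp);
	}

	/* tcp_poll():        if (tcp_writable(sk, tp)) mask |= POLLOUT | POLLWRNORM;   */
	/* tcp_write_space(): if (tcp_writable(sk, tp)) sock_wake_async(sk->socket, 2); */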
+ */ + error = 0; - out: +out: release_sock(sk); sk->err = error; return newsk; @@ -1586,7 +1625,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, */ if(val<1||val>MAX_WINDOW) return -EINVAL; - sk->user_mss=val; + tp->user_mss=val; return 0; case TCP_NODELAY: sk->nonagle=(val==0)?0:1; @@ -1614,7 +1653,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, switch(optname) { case TCP_MAXSEG: - val=sk->user_mss; + val=tp->user_mss; break; case TCP_NODELAY: val=sk->nonagle; @@ -1640,7 +1679,7 @@ void tcp_set_keepalive(struct sock *sk, int val) extern void __skb_cb_too_small_for_tcp(int, int); -__initfunc(void tcp_init(void)) +void __init tcp_init(void) { struct sk_buff *skb = NULL; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a4ad2dc3c..6a3ae17bf 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.121 1998/07/15 04:39:12 davem Exp $ + * Version: $Id: tcp_input.c,v 1.127 1998/08/26 12:04:20 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -50,6 +50,9 @@ * Andi Kleen: Make sure we never ack data there is not * enough room for. Also make this condition * a fatal error if it might still happen. + * Andi Kleen: Add tcp_measure_rcv_mss to make + * connections with MSS @@ -214,7 +217,7 @@ extern __inline__ void tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp #define PAWS_24DAYS (HZ * 60 * 60 * 24 * 24) -extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, __u16 len) +extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, unsigned len) { /* ts_recent must be younger than 24 days */ return (((jiffies - tp->ts_recent_stamp) >= PAWS_24DAYS) || @@ -289,7 +292,7 @@ static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, /* The retransmission queue is always in order, so * we can short-circuit the walk early. */ - if(!before(start_seq, TCP_SKB_CB(skb)->end_seq)) + if(after(TCP_SKB_CB(skb)->end_seq, end_seq)) break; /* We play conservative, we don't allow SACKS to partially @@ -346,9 +349,11 @@ void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, i switch(opcode) { case TCPOPT_MSS: if(opsize==TCPOLEN_MSS && th->syn) { - tp->in_mss = ntohs(*(__u16 *)ptr); - if (tp->in_mss == 0) - tp->in_mss = 536; + u16 in_mss = ntohs(*(__u16 *)ptr); + if (in_mss == 0) + in_mss = 536; + if (tp->mss_clamp > in_mss) + tp->mss_clamp = in_mss; } break; case TCPOPT_WINDOW: @@ -466,10 +471,9 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) * to one half the current congestion window, but no less * than two segments. Retransmit the missing segment. */ + tp->dup_acks++; if (tp->high_seq == 0 || after(ack, tp->high_seq)) { - tp->dup_acks++; if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) { - tp->dup_acks++; tp->snd_ssthresh = max(tp->snd_cwnd >> (TCP_CWND_SHIFT + 1), 2); tp->snd_cwnd = (tp->snd_ssthresh + 3) << TCP_CWND_SHIFT; tp->high_seq = tp->snd_nxt; @@ -863,7 +867,7 @@ void tcp_timewait_kill(struct tcp_tw_bucket *tw) * reconnects and SYN/RST bits being set in the TCP header. */ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, - struct tcphdr *th, void *opt, __u16 len) + struct tcphdr *th, unsigned len) { /* RFC 1122: * "When a connection is [...] on TIME-WAIT state [...] 
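The TCPOPT_MSS hunk above turns tp->in_mss into tp->mss_clamp: the peer's advertised MSS may only lower the clamp, never raise it, and an advertised MSS of zero falls back to the RFC 1122 default of 536. Together with the TCP_MAXSEG change, the clamp after the SYN exchange is effectively min(user_mss, peer_mss). A standalone restatement (function name hypothetical):

	/* user_mss == 0 means TCP_MAXSEG was never set. */
	static u16 effective_mss_clamp(u16 user_mss, u16 peer_mss)
	{
		u32 clamp = user_mss ? user_mss : 0xFFFF;	/* start wide open  */

		if (peer_mss == 0)
			peer_mss = 536;				/* RFC 1122 default */
		return clamp < peer_mss ? clamp : peer_mss;	/* only ever shrink */
	}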
@@ -893,7 +897,7 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, return 0; skb_set_owner_r(skb, sk); af_specific = sk->tp_pinfo.af_tcp.af_specific; - if(af_specific->conn_request(sk, skb, opt, isn) < 0) + if(af_specific->conn_request(sk, skb, isn) < 0) return 1; /* Toss a reset back. */ return 0; /* Discard the frame. */ } @@ -1309,7 +1313,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) tp->delayed_acks++; /* Tiny-grams with PSH set make us ACK quickly. */ - if(skb->h.th->psh && (skb->len < (sk->mss >> 1))) + if(skb->h.th->psh && (skb->len < (tp->mss_cache >> 1))) tp->ato = HZ/50; } /* This may have eaten into a SACK block. */ @@ -1429,7 +1433,6 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) } } - /* We no longer have anyone receiving data on this connection. */ tcp_data_queue(sk, skb); if (before(tp->rcv_nxt, tp->copied_seq)) { @@ -1464,6 +1467,26 @@ static void tcp_data_snd_check(struct sock *sk) } } +/* + * Adapt the MSS value used to make delayed ack decision to the + * real world. + */ +static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + unsigned int len = skb->len, lss; + + if (len > tp->rcv_mss) + tp->rcv_mss = len; + lss = tp->last_seg_size; + tp->last_seg_size = 0; + if (len >= 536) { + if (len == lss) + tp->rcv_mss = len; + tp->last_seg_size = len; + } +} + /* * Check if sending an ack is needed. */ @@ -1486,7 +1509,7 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk) */ /* Two full frames received or... */ - if (((tp->rcv_nxt - tp->rcv_wup) >= sk->mss * MAX_DELAY_ACK) || + if (((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) || /* We will update the window "significantly" or... */ tcp_raise_window(sk) || /* We entered "quick ACK" mode or... */ @@ -1595,11 +1618,14 @@ static int prune_queue(struct sock *sk) SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); + net_statistics.PruneCalled++; + /* First Clean the out_of_order queue. */ /* Start with the end because there are probably the least * useful packets (crossing fingers). */ while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue))) { + net_statistics.OfoPruned += skb->len; kfree_skb(skb); if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) return 0; @@ -1620,6 +1646,9 @@ static int prune_queue(struct sock *sk) tp->last_ack_sent); return -1; } + + net_statistics.RcvPruned += skb->len; + __skb_unlink(skb, skb->list); tp->rcv_nxt = TCP_SKB_CB(skb)->seq; SOCK_DEBUG(sk, "prune_queue: removing %x-%x (c=%x)\n", @@ -1633,7 +1662,7 @@ static int prune_queue(struct sock *sk) } int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - struct tcphdr *th, __u16 len) + struct tcphdr *th, unsigned len) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int queued = 0; @@ -1682,6 +1711,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ if (flg == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { + tcp_send_ack(sk); + goto discard; + } if (len <= th->doff*4) { /* Bulk data transfer: sender */ if (len == th->doff*4) { @@ -1696,15 +1729,12 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, } } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una) { /* Bulk data transfer: receiver */ - if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) { - /* We must send an ACK for zero window probes. 
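tcp_measure_rcv_mss() above feeds the delayed-ACK test in __tcp_ack_snd_check(), which now forces an ACK after roughly MAX_DELAY_ACK estimated sender segments (tp->rcv_mss) instead of multiples of our own send MSS; per the changelog entry, this keeps connections with a small peer MSS from stalling in delayed ACK. Restated outside the kernel types, the estimator raises the estimate immediately but lowers it only after two consecutive equal segments of at least 536 bytes:

	struct rcv_mss_state { unsigned int rcv_mss, last_seg_size; };

	static void measure_rcv_mss(struct rcv_mss_state *st, unsigned int len)
	{
		unsigned int lss = st->last_seg_size;

		if (len > st->rcv_mss)
			st->rcv_mss = len;		/* grow immediately       */
		st->last_seg_size = 0;
		if (len >= 536) {
			if (len == lss)
				st->rcv_mss = len;	/* two equal big segments */
			st->last_seg_size = len;
		}
	}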
*/ - if (!before(TCP_SKB_CB(skb)->seq, - tp->rcv_wup + tp->rcv_wnd)) - tcp_send_ack(sk); + if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) goto discard; - } - skb_pull(skb,th->doff*4); + __skb_pull(skb,th->doff*4); + + tcp_measure_rcv_mss(sk, skb); /* DO NOT notify forward progress here. * It saves dozen of CPU instructions in fast path. --ANK @@ -1719,7 +1749,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_delack_estimator(tp); /* Tiny-grams with PSH set make us ACK quickly. */ - if(th->psh && (skb->len < (sk->mss >> 1))) + if(th->psh && (skb->len < (tp->mss_cache >> 1))) tp->ato = HZ/50; tp->delayed_acks++; @@ -1767,6 +1797,25 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, /* step 7: process the segment text */ queued = tcp_data(skb, sk, len); + /* This must be after tcp_data() does the skb_pull() to + * remove the header size from skb->len. + * + * Dave!!! Phrase above (and all about rcv_mss) has + * nothing to do with reality. rcv_mss must measure TOTAL + * size, including sacks, IP options etc. Hence, measure_rcv_mss + * must occur before pulling etc, otherwise it will flap + * like hell. Even putting it before tcp_data is wrong, + * it should use skb->tail - skb->nh.raw instead. + * --ANK (980805) + * + * BTW I broke it. Now all TCP options are handled equally + * in mss_clamp calculations (i.e. ignored, rfc1122), + * and mss_cache does include all of them (i.e. tstamps) + * except for sacks, to calculate effective mss faster. + * --ANK (980805) + */ + tcp_measure_rcv_mss(sk, skb); + /* Be careful, tcp_data() may have put this into TIME_WAIT. */ if(sk->state != TCP_CLOSE) { tcp_data_snd_check(sk); @@ -1853,7 +1902,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, */ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - struct tcphdr *th, void *opt, __u16 len) + struct tcphdr *th, unsigned len) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int queued = 0; @@ -1868,7 +1917,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 1; if(th->syn) { - if(tp->af_specific->conn_request(sk, skb, opt, 0) < 0) + if(tp->af_specific->conn_request(sk, skb, 0) < 0) return 1; /* Now we have several options: In theory there is @@ -1961,28 +2010,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Can't be earlier, doff would be wrong. */ tcp_send_ack(sk); - /* Check for the case where we tried to advertise - * a window including timestamp options, but did not - * end up using them for this connection. - */ - if((tp->tstamp_ok == 0) && sysctl_tcp_timestamps) - sk->mss += TCPOLEN_TSTAMP_ALIGNED; - - /* Now limit it if the other end negotiated a smaller - * value. - */ - if (tp->in_mss) { - int real_mss = tp->in_mss; - - /* We store MSS locally with the timestamp bytes - * subtracted, TCP's advertise it with them - * included. Account for this fact.
- */ - if(tp->tstamp_ok) - real_mss -= TCPOLEN_TSTAMP_ALIGNED; - sk->mss = min(sk->mss, real_mss); - } - sk->dport = th->source; tp->copied_seq = tp->rcv_nxt; @@ -1990,9 +2017,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, sk->state_change(sk); sock_wake_async(sk->socket, 0); } - - /* Drop through step 6 */ - goto step6; } else { if(th->syn && !th->rst) { /* The previous version of the code @@ -2017,11 +2041,20 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tcp_send_synack(sk); - goto discard; - } - + } else + break; } - break; + + /* tp->tcp_header_len and tp->mss_clamp + probably changed, synchronize mss. + */ + tcp_sync_mss(sk, tp->pmtu_cookie); + tp->rcv_mss = tp->mss_cache; + + if (sk->state == TCP_SYN_RECV) + goto discard; + + goto step6; } /* Parse the tcp_options present on this header. @@ -2167,6 +2200,11 @@ step6: case TCP_ESTABLISHED: queued = tcp_data(skb, sk, len); + + /* This must be after tcp_data() does the skb_pull() to + * remove the header size from skb->len. + */ + tcp_measure_rcv_mss(sk, skb); break; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e0ecdbfa5..bf3fb243b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.150 1998/07/28 17:45:07 freitag Exp $ + * Version: $Id: tcp_ipv4.c,v 1.157 1998/08/28 00:27:47 davem Exp $ * * IPv4 specific functions * @@ -44,6 +44,7 @@ * Andi Kleen: various fixes. * Vitaly E. Lavrov : Transparent proxy revived after year coma. * Andi Kleen : Fix new listen. + * Andi Kleen : Fix accept error reporting. */ #include @@ -140,7 +141,8 @@ void tcp_bucket_unlock(struct sock *sk) if(tb->port == snum) { if(tb->owners == NULL && (tb->flags & TCPB_FLAG_LOCKED)) { - tb->flags &= ~TCPB_FLAG_LOCKED; + tb->flags &= ~(TCPB_FLAG_LOCKED | + TCPB_FLAG_FASTREUSE); tcp_inc_slow_timer(TCP_SLT_BUCKETGC); } break; @@ -208,7 +210,7 @@ static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum) /* We must walk the whole port owner list in this case. -DaveM */ for(sk2 = tb->owners; sk2; sk2 = sk2->bind_next) { - if(sk->bound_dev_if == sk2->bound_dev_if) { + if (sk->bound_dev_if == sk2->bound_dev_if) { if(!sk_reuse || !sk2->reuse || sk2->state == TCP_LISTEN) { if(!sk2->rcv_saddr || !sk->rcv_saddr || @@ -223,16 +225,33 @@ static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum) } if(result == 0) { if(tb == NULL) { - if(tcp_bucket_create(snum) == NULL) + if((tb = tcp_bucket_create(snum)) == NULL) result = 1; + else if (sk->reuse && sk->state != TCP_LISTEN) + tb->flags |= TCPB_FLAG_FASTREUSE; } else { /* It could be pending garbage collection, this * kills the race and prevents it from disappearing * out from under us by the time we use it. -DaveM */ - if(tb->owners == NULL && !(tb->flags & TCPB_FLAG_LOCKED)) { - tb->flags = TCPB_FLAG_LOCKED; - tcp_dec_slow_timer(TCP_SLT_BUCKETGC); + if(tb->owners == NULL) { + if (!(tb->flags & TCPB_FLAG_LOCKED)) { + tb->flags = (TCPB_FLAG_LOCKED | + ((sk->reuse && + sk->state != TCP_LISTEN) ? + TCPB_FLAG_FASTREUSE : 0)); + tcp_dec_slow_timer(TCP_SLT_BUCKETGC); + } else if (!(tb->flags & TCPB_FLAG_GOODSOCKNUM)) { + /* Someone is in between the bind + * and the actual connect or listen. + * See if it was a legitimate reuse + * and we are as well, else punt. 
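The TCPB_FLAG_FASTREUSE bookkeeping above caches one fact per bound port: every current owner set SO_REUSEADDR and none of them listens. That lets the short window between a reused bind() and the subsequent connect()/listen() (the TCPB_FLAG_GOODSOCKNUM case) be validated without re-walking the owner list. A hypothetical condensed form of the decision:

	/* Sketch only; the real code also handles bucket GC and locking. */
	static int bucket_fast_bindable(struct tcp_bind_bucket *tb, struct sock *sk)
	{
		if (tb->owners == NULL)
			return 1;				/* bucket is free    */
		return sk->reuse && sk->state != TCP_LISTEN &&
		       (tb->flags & TCPB_FLAG_FASTREUSE);	/* all owners agreed */
	}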
+ */ + if (sk->reuse == 0 || + !(tb->flags & TCPB_FLAG_FASTREUSE)) + result = 1; + } else + tb->flags &= ~TCPB_FLAG_GOODSOCKNUM; } } } @@ -264,8 +283,11 @@ unsigned short tcp_good_socknum(void) next: } while(--remaining > 0); tcp_port_rover = rover; - if((remaining <= 0) || (tcp_bucket_create(rover) == NULL)) + tb = NULL; + if((remaining <= 0) || ((tb = tcp_bucket_create(rover)) == NULL)) rover = 0; + if (tb != NULL) + tb->flags |= TCPB_FLAG_GOODSOCKNUM; SOCKHASH_UNLOCK(); return rover; @@ -543,8 +565,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; struct sk_buff *buff; struct rtable *rt; + u32 daddr, nexthop; int tmp; - int mss; if (sk->state != TCP_CLOSE) return(-EISCONN); @@ -564,7 +586,14 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) printk(KERN_DEBUG "%s forgot to set AF_INET in " __FUNCTION__ "\n", current->comm); } - tmp = ip_route_connect(&rt, usin->sin_addr.s_addr, sk->saddr, + nexthop = daddr = usin->sin_addr.s_addr; + if (sk->opt && sk->opt->srr) { + if (daddr == 0) + return -EINVAL; + nexthop = sk->opt->faddr; + } + + tmp = ip_route_connect(&rt, nexthop, sk->saddr, RT_TOS(sk->ip_tos)|sk->localroute, sk->bound_dev_if); if (tmp < 0) return tmp; @@ -592,6 +621,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) */ sk->dport = usin->sin_port; sk->daddr = rt->rt_dst; + if (sk->opt && sk->opt->srr) + sk->daddr = daddr; if (!sk->saddr) sk->saddr = rt->rt_src; sk->rcv_saddr = sk->saddr; @@ -601,22 +632,28 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) return -EADDRNOTAVAIL; } - sk->mtu = rt->u.dst.pmtu; - if ((sk->ip_pmtudisc == IP_PMTUDISC_DONT || - (sk->ip_pmtudisc == IP_PMTUDISC_WANT && - (rt->u.dst.mxlock&(1<u.dst.pmtu > 576 && rt->rt_dst != rt->rt_gateway) - sk->mtu = 576; + tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr, + sk->sport, usin->sin_port); - if (sk->mtu < 64) - sk->mtu = 64; /* Sanity limit */ + tp->ext_header_len = 0; + if (sk->opt) + tp->ext_header_len = sk->opt->optlen; - mss = sk->mtu - sizeof(struct iphdr); + /* Reset mss clamp */ + tp->mss_clamp = ~0; - tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr, - sk->sport, usin->sin_port); + if ((sk->ip_pmtudisc == IP_PMTUDISC_DONT || + (sk->ip_pmtudisc == IP_PMTUDISC_WANT && + (rt->u.dst.mxlock&(1<u.dst.pmtu > 576 && rt->rt_dst != rt->rt_gateway) { + /* Clamp mss at maximum of 536 and user_mss. + Probably, user ordered to override tiny segment size + in gatewayed case. + */ + tp->mss_clamp = max(tp->user_mss, 536); + } - tcp_connect(sk, buff, mss); + tcp_connect(sk, buff, rt->u.dst.pmtu); return 0; } @@ -694,7 +731,6 @@ static struct open_request *tcp_v4_search_req(struct tcp_opt *tp, */ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip) { - int new_mtu; struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; /* Don't interested in TCP_LISTEN and open_requests (SYN-ACKs @@ -711,21 +747,19 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip) * route, but I think that's acceptable. 
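The tcp_v4_connect() rework above makes source routing work again: the route lookup must target the first hop from the IP option (sk->opt->faddr), while the socket's daddr must stay the final destination the caller named. In outline (a sketch of the two assignments, not the full function; tos stands for the patch's RT_TOS(sk->ip_tos)|sk->localroute value):

	u32 daddr = usin->sin_addr.s_addr;
	u32 nexthop = daddr;

	if (sk->opt && sk->opt->srr) {
		if (daddr == 0)
			return -EINVAL;		/* SRR needs a final destination */
		nexthop = sk->opt->faddr;	/* route via the first hop       */
	}
	tmp = ip_route_connect(&rt, nexthop, sk->saddr, tos, sk->bound_dev_if);
	if (tmp < 0)
		return tmp;

	sk->daddr = rt->rt_dst;			/* normally: routed destination  */
	if (sk->opt && sk->opt->srr)
		sk->daddr = daddr;		/* SRR: keep final destination   */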
*/ if (sk->ip_pmtudisc != IP_PMTUDISC_DONT && sk->dst_cache) { - new_mtu = sk->dst_cache->pmtu - - (ip->ihl<<2) - tp->tcp_header_len; - if (new_mtu < sk->mss && new_mtu > 0) { - sk->mss = new_mtu; + if (tp->pmtu_cookie > sk->dst_cache->pmtu && + !atomic_read(&sk->sock_readers)) { + lock_sock(sk); + tcp_sync_mss(sk, sk->dst_cache->pmtu); + /* Resend the TCP packet because it's * clear that the old packet has been * dropped. This is the new "fast" path mtu * discovery. */ - if (!sk->sock_readers) { - lock_sock(sk); - tcp_simple_retransmit(sk); - release_sock(sk); - } /* else let the usual retransmit timer handle it */ - } + tcp_simple_retransmit(sk); + release_sock(sk); + } /* else let the usual retransmit timer handle it */ } } @@ -813,7 +847,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) /* Prevent race conditions with accept() - * ICMP is unreliable. */ - if (sk->sock_readers) { + if (atomic_read(&sk->sock_readers)) { /* XXX: add a counter here to profile this. * If too many ICMPs get dropped on busy * servers this needs to be solved differently. @@ -821,8 +855,15 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) return; } + /* The final ACK of the handshake should be already + * handled in the new socket context, not here. + * Strictly speaking - an ICMP error for the final + * ACK should set the opening flag, but that is too + * complicated right now. + */ if (!th->syn && !th->ack) return; + req = tcp_v4_search_req(tp, iph, th, &prev); if (!req) return; @@ -833,17 +874,33 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) seq, req->snt_isn); return; } - if (req->sk) { /* not yet accept()ed */ - sk = req->sk; /* report error in accept */ + if (req->sk) { + /* + * Already in ESTABLISHED and a big socket is created, + * set error code there. + * The error will _not_ be reported in the accept(), + * but only with the next operation on the socket after + * accept. + */ + sk = req->sk; } else { + /* + * Still in SYN_RECV, just remove it silently. + * There is no good way to pass the error to the newly + * created socket, and POSIX does not want network + * errors returned from accept(). + */ tp->syn_backlog--; tcp_synq_unlink(tp, req, prev); req->class->destructor(req); tcp_openreq_free(req); + return; } - /* FALL THOUGH */ + break; case TCP_SYN_SENT: case TCP_SYN_RECV: + if (!th->syn) + return; opening = 1; break; } @@ -855,10 +912,13 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) tcp_statistics.TcpAttemptFails++; if (sk->state != TCP_LISTEN) tcp_set_state(sk,TCP_CLOSE); + mb(); sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ } - } else /* Only an error on timeout */ + } else { /* Only an error on timeout */ sk->err_soft = icmp_err_convert[code].errno; + mb(); + } } /* This routine computes an IPv4 TCP checksum. */ @@ -916,7 +976,7 @@ static void tcp_v4_send_reset(struct sk_buff *skb) IPPROTO_TCP, 0); arg.n_iov = 1; - arg.csumoffset = offsetof(struct tcphdr, check) / sizeof(u16); + arg.csumoffset = offsetof(struct tcphdr, check) / 2; ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth); @@ -950,6 +1010,11 @@ int tcp_chkaddr(struct sk_buff *skb) } #endif +/* + * Send a SYN-ACK after having received an ACK. + * This still operates on a open_request only, not on a big + * socket. 
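do_pmtu_discovery() above is the "fast path" MTU reaction: when an ICMP fragmentation-needed message shows that the cached route's pmtu has dropped below the value tcp_sync_mss() last saw (tp->pmtu_cookie), the mss is resynchronized and the oversized segment is retransmitted at once instead of waiting out the retransmit timer. Condensed, under the same locking rule as the patch (skip if the socket is busy and let the timer catch it):

	if (tp->pmtu_cookie > sk->dst_cache->pmtu &&
	    !atomic_read(&sk->sock_readers)) {
		lock_sock(sk);
		tcp_sync_mss(sk, sk->dst_cache->pmtu);	/* shrink mss_cache    */
		tcp_simple_retransmit(sk);		/* big packet was lost */
		release_sock(sk);
	}
	/* else: the usual retransmit timer handles it */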
+ */ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) { struct rtable *rt; @@ -974,7 +1039,7 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) return; } - mss = (rt->u.dst.pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr)); + mss = rt->u.dst.pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr); skb = tcp_make_synack(sk, &rt->u.dst, req, mss); if (skb) { @@ -994,6 +1059,9 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) ip_rt_put(rt); } +/* + * IPv4 open_request destructor. + */ static void tcp_v4_or_free(struct open_request *req) { if(!req->sk && req->af.v4_req.opt) @@ -1016,9 +1084,9 @@ static inline void syn_flood_warning(struct sk_buff *skb) * Save and compile IPv4 options into the open_request if needed. */ static inline struct ip_options * -tcp_v4_save_options(struct sock *sk, struct sk_buff *skb, - struct ip_options *opt) +tcp_v4_save_options(struct sock *sk, struct sk_buff *skb) { + struct ip_options *opt = &(IPCB(skb)->opt); struct ip_options *dopt = NULL; if (opt && opt->optlen) { @@ -1052,8 +1120,7 @@ struct or_calltable or_ipv4 = { #define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */ #define BACKLOGMAX(sk) sysctl_max_syn_backlog -int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, - __u32 isn) +int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, __u32 isn) { struct tcp_opt tp; struct open_request *req; @@ -1070,6 +1137,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, if (sk->dead) goto dead; + /* Never answer to SYNs sent to broadcast or multicast */ + if (((struct rtable *)skb->dst)->rt_flags & + (RTCF_BROADCAST|RTCF_MULTICAST)) + goto drop; + /* XXX: Check against a global syn pool counter. */ if (BACKLOG(sk) > BACKLOGMAX(sk)) { #ifdef CONFIG_SYN_COOKIES @@ -1094,13 +1166,18 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, req->rcv_isn = TCP_SKB_CB(skb)->seq; tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0; - tp.in_mss = 536; + + tp.mss_clamp = 65535; tcp_parse_options(NULL, th, &tp, want_cookie); - req->mss = tp.in_mss; - if (tp.saw_tstamp) { - req->mss -= TCPOLEN_TSTAMP_ALIGNED; + if (tp.mss_clamp == 65535) + tp.mss_clamp = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr); + + if (sk->tp_pinfo.af_tcp.user_mss && sk->tp_pinfo.af_tcp.user_mss < tp.mss_clamp) + tp.mss_clamp = sk->tp_pinfo.af_tcp.user_mss; + req->mss = tp.mss_clamp; + + if (tp.saw_tstamp) req->ts_recent = tp.rcv_tsval; - } req->tstamp_ok = tp.tstamp_ok; req->sack_ok = tp.sack_ok; req->snd_wscale = tp.snd_wscale; @@ -1120,7 +1197,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, req->snt_isn = isn; - req->af.v4_req.opt = tcp_v4_save_options(sk, skb, ptr); + req->af.v4_req.opt = tcp_v4_save_options(sk, skb); req->class = &or_ipv4; req->retrans = 0; @@ -1139,7 +1216,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, tcp_synq_queue(&sk->tp_pinfo.af_tcp, req); } - sk->data_ready(sk, 0); return 0; dead: @@ -1160,8 +1236,7 @@ drop: * * This function wants to be moved to a common for IPv[46] file.
--ANK */ -struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb, - int snd_mss) +struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb) { struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0); @@ -1175,11 +1250,12 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, /* Clone the TCP header template */ newsk->dport = req->rmt_port; - newsk->sock_readers = 0; + atomic_set(&newsk->sock_readers, 0); atomic_set(&newsk->rmem_alloc, 0); skb_queue_head_init(&newsk->receive_queue); atomic_set(&newsk->wmem_alloc, 0); skb_queue_head_init(&newsk->write_queue); + atomic_set(&newsk->omem_alloc, 0); newsk->done = 0; newsk->proc = 0; @@ -1231,7 +1307,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->copied_seq = req->rcv_isn + 1; newtp->saw_tstamp = 0; - newtp->in_mss = 536; + newtp->mss_clamp = req->mss; init_timer(&newtp->probe_timer); newtp->probe_timer.function = &tcp_probe_timer; @@ -1242,12 +1318,14 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->urg_data = 0; tcp_synq_init(newtp); newtp->syn_backlog = 0; + if (skb->len >= 536) + newtp->last_seg_size = skb->len; /* Back to base struct sock members. */ newsk->err = 0; newsk->ack_backlog = 0; newsk->max_ack_backlog = SOMAXCONN; - newsk->priority = 1; + newsk->priority = 0; /* IP layer stuff */ newsk->timeout = 0; @@ -1276,14 +1354,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, } else { newtp->tcp_header_len = sizeof(struct tcphdr); } - - snd_mss -= newtp->tcp_header_len; - - if (sk->user_mss) - snd_mss = min(snd_mss, sk->user_mss); - - newsk->mss = min(req->mss, snd_mss); - } return newsk; } @@ -1299,8 +1369,6 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, struct ip_options *opt = req->af.v4_req.opt; struct tcp_opt *newtp; struct sock *newsk; - int snd_mss; - int mtu; if (sk->ack_backlog > sk->max_ack_backlog) goto exit; /* head drop */ @@ -1324,12 +1392,7 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, goto exit; #endif - mtu = dst->pmtu; - if (mtu < 68) /* XXX: we should turn pmtu disc off when this happens. 
*/ - mtu = 68; - snd_mss = mtu - sizeof(struct iphdr); - - newsk = tcp_create_openreq_child(sk, req, skb, snd_mss); + newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) goto exit; @@ -1347,15 +1410,22 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->sport = req->lcl_port; #endif newsk->opt = req->af.v4_req.opt; - newsk->mtu = mtu; - - if (newsk->rcvbuf < (3 * newsk->mtu)) - newsk->rcvbuf = min ((3 * newsk->mtu), sysctl_rmem_max); - if (newsk->sndbuf < (3 * newsk->mtu)) - newsk->sndbuf = min ((3 * newsk->mtu), sysctl_wmem_max); + newtp->ext_header_len = 0; + if (newsk->opt) + newtp->ext_header_len = newsk->opt->optlen; + + tcp_sync_mss(newsk, dst->pmtu); + newtp->rcv_mss = newtp->mss_clamp; + + /* It would be better to use newtp->mss_clamp here */ + if (newsk->rcvbuf < (3 * newtp->pmtu_cookie)) + newsk->rcvbuf = min ((3 * newtp->pmtu_cookie), sysctl_rmem_max); + if (newsk->sndbuf < (3 * newtp->pmtu_cookie)) + newsk->sndbuf = min ((3 * newtp->pmtu_cookie), sysctl_wmem_max); tcp_v4_hash(newsk); add_to_prot_sklist(newsk); + sk->data_ready(sk, 0); /* Deliver SIGIO */ return newsk; @@ -1373,8 +1443,8 @@ static void tcp_v4_rst_req(struct sock *sk, struct sk_buff *skb) if (!req) return; /* Sequence number check required by RFC793 */ - if (before(TCP_SKB_CB(skb)->seq, req->snt_isn) || - after(TCP_SKB_CB(skb)->seq, req->snt_isn+1)) + if (before(TCP_SKB_CB(skb)->seq, req->rcv_isn) || + after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1)) return; tcp_synq_unlink(tp, req, prev); (req->sk ? sk->ack_backlog : tp->syn_backlog)--; @@ -1461,7 +1531,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) sk = nsk; } - if (tcp_rcv_state_process(sk, skb, skb->h.th, &(IPCB(skb)->opt), skb->len)) + if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) goto reset; release_sock(sk); return 0; @@ -1543,7 +1613,7 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) if (sk->state == TCP_TIME_WAIT) goto do_time_wait; - if (!sk->sock_readers) + if (!atomic_read(&sk->sock_readers)) return tcp_v4_do_rcv(sk, skb); __skb_queue_tail(&sk->back_log, skb); @@ -1559,7 +1629,7 @@ discard_it: do_time_wait: if(tcp_timewait_state_process((struct tcp_tw_bucket *)sk, - skb, th, &(IPCB(skb)->opt), skb->len)) + skb, th, skb->len)) goto no_tcp_socket; goto discard_it; } @@ -1665,6 +1735,8 @@ struct tcp_func ipv4_specific = { tcp_v4_conn_request, tcp_v4_syn_recv_sock, tcp_v4_get_sock, + sizeof(struct iphdr), + ip_setsockopt, ip_getsockopt, v4_addr2sockaddr, @@ -1683,7 +1755,7 @@ static int tcp_v4_init_sock(struct sock *sk) tp->rto = TCP_TIMEOUT_INIT; /*TCP_WRITE_TIME*/ tp->mdev = TCP_TIMEOUT_INIT; - tp->in_mss = 536; + tp->mss_clamp = ~0; /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. @@ -1691,11 +1763,11 @@ static int tcp_v4_init_sock(struct sock *sk) tp->snd_cwnd = (1 << TCP_CWND_SHIFT); tp->snd_ssthresh = 0x7fffffff; /* Infinity */ - sk->priority = 1; sk->state = TCP_CLOSE; sk->max_ack_backlog = SOMAXCONN; - sk->mtu = 576; - sk->mss = 536; + tp->rcv_mss = 536; + + sk->write_space = tcp_write_space; /* Init SYN queue. */ tcp_synq_init(tp); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 84535341f..03696cbe0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_output.c,v 1.92 1998/06/19 13:22:44 davem Exp $ + * Version: $Id: tcp_output.c,v 1.93 1998/08/26 12:04:32 davem Exp $ * * Authors: Ross Biro, * Fred N. 
van Kempen, @@ -117,7 +117,7 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) * is never scaled. */ th->window = htons(tp->rcv_wnd); - tcp_syn_build_options((__u32 *)(th + 1), sk->mss, + tcp_syn_build_options((__u32 *)(th + 1), tp->mss_clamp, sysctl_tcp_timestamps, sysctl_tcp_sack, sysctl_tcp_window_scaling, @@ -227,6 +227,65 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) return 0; } +/* This function synchronizes snd mss to the current pmtu/exthdr set. + + tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT count + TCP options, but includes only the bare TCP header. + + tp->mss_clamp is mss negotiated at connection setup. + It is the minimum of user_mss and mss received with SYN. + It also does not include TCP options. + + tp->pmtu_cookie is last pmtu, seen by this function. + + tp->mss_cache is current effective sending mss, including + all tcp options except for SACKs. It is evaluated, + taking into account current pmtu, but never exceeds + tp->mss_clamp. + + NOTE1. rfc1122 clearly states that advertised MSS + DOES NOT include either tcp or ip options. + + NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside + this function. --ANK (980731) + */ + +int tcp_sync_mss(struct sock *sk, u32 pmtu) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + int mss_now; + + /* Calculate base mss without TCP options: + It is MMS_S - sizeof(tcphdr) of rfc1122 + */ + mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr); + + /* Clamp it (mss_clamp does not include tcp options) */ + if (mss_now > tp->mss_clamp) + mss_now = tp->mss_clamp; + + /* Now subtract TCP options size, not including SACKs */ + mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); + + /* Now subtract optional transport overhead */ + mss_now -= tp->ext_header_len; + + /* If we got a too small (or even negative) value, + clamp it at 8 from below. Why 8? + Well, it could be 1 with the same success, + but if IP accepted segment of length 1, + it would love 8 even more 8) --ANK (980731) + */ + if (mss_now < 8) + mss_now = 8; + + /* And store cached results */ + tp->pmtu_cookie = pmtu; + tp->mss_cache = mss_now; + return mss_now; +} + + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. @@ -334,7 +393,7 @@ void tcp_write_xmit(struct sock *sk) u32 __tcp_select_window(struct sock *sk, u32 cur_win) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - unsigned int mss = sk->mss; + unsigned int mss = tp->mss_cache; int free_space; u32 window; @@ -624,7 +683,7 @@ void tcp_send_fin(struct sock *sk) */ if(tp->send_head == skb && !sk->nonagle && - skb->len < (sk->mss >> 1) && + skb->len < (tp->mss_cache >> 1) && tp->packets_out && !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) { update_send_head(sk); @@ -738,20 +797,15 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, skb->dst = dst_clone(dst); - if (sk->user_mss) - mss = min(mss, sk->user_mss); - if (req->tstamp_ok) - mss -= TCPOLEN_TSTAMP_ALIGNED; - /* Don't offer more than they did. * This way we don't have to memorize who said what. * FIXME: maybe this should be changed for better performance * with syncookies. */
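A worked example makes the tcp_sync_mss() arithmetic concrete (all numbers are assumptions for illustration: plain IPv4, timestamps negotiated, no IP options, Ethernet path):

	int pmtu           = 1500;
	int net_header_len = 20;	/* sizeof(struct iphdr)              */
	int tcp_header_len = 20 + 12;	/* tcphdr + aligned timestamp option */
	int ext_header_len = 0;		/* no SRR etc.                       */
	int mss_clamp      = 1460;	/* peer advertised 1460 in its SYN   */

	int mss = pmtu - net_header_len - 20;	/* 1460 = MMS_S - tcphdr     */
	if (mss > mss_clamp)
		mss = mss_clamp;		/* still 1460                */
	mss -= tcp_header_len - 20;		/* 1448 after timestamps     */
	mss -= ext_header_len;			/* 1448                      */
	if (mss < 8)
		mss = 8;			/* floor from the function   */
	/* mss_cache ends up as 1448 */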
req->mss = min(mss, req->mss); - if (req->mss < 1) { - printk(KERN_DEBUG "initial req->mss below 1\n"); - req->mss = 1; + if (req->mss < 8) { + printk(KERN_DEBUG "initial req->mss below 8\n"); + req->mss = 8; } tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS + @@ -796,7 +850,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, return skb; } -void tcp_connect(struct sock *sk, struct sk_buff *buff, int mss) +void tcp_connect(struct sock *sk, struct sk_buff *buff, int mtu) { struct dst_entry *dst = sk->dst_cache; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); @@ -804,9 +858,6 @@ void tcp_connect(struct sock *sk, struct sk_buff *buff, int mss) /* Reserve space for headers. */ skb_reserve(buff, MAX_HEADER + sk->prot->max_header); - if (sk->priority == 0) - sk->priority = dst->priority; - tp->snd_wnd = 0; tp->snd_wl1 = 0; tp->snd_wl2 = tp->write_seq; @@ -821,17 +872,25 @@ void tcp_connect(struct sock *sk, struct sk_buff *buff, int mss) tp->tcp_header_len = sizeof(struct tcphdr) + (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0); - mss -= tp->tcp_header_len; - - if (sk->user_mss) - mss = min(mss, sk->user_mss); - - if (mss < 1) { - printk(KERN_DEBUG "initial sk->mss below 1\n"); - mss = 1; /* Sanity limit */ - } - - sk->mss = mss; + /* If the user gave a TCP_MAXSEG, record it as the clamp */ + if (tp->user_mss) + tp->mss_clamp = tp->user_mss; + tcp_sync_mss(sk, mtu); + + /* Now the unpleasant action: if the initial pmtu is too low, + set a lower clamp. I am not sure that it is good. + To be more exact, I do not think that clamping at a value which + is apparently transient and may improve in the future is a good idea. + It would be better to wait until the peer returns its MSS + (probably 65535 too) and for now advertise something on the order + of 65535, or at least the first hop device mtu. Is it clear what I mean? + We should tell the peer what maximal mss we expect to RECEIVE; + it has nothing to do with pmtu. + I am afraid someone will be confused by such a huge value. + --ANK (980731) + */ + if (tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr) < tp->mss_clamp) + tp->mss_clamp = tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr); TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; TCP_SKB_CB(buff)->sacked = 0; @@ -842,7 +901,7 @@ void tcp_connect(struct sock *sk, struct sk_buff *buff, int mss) tp->snd_nxt = TCP_SKB_CB(buff)->end_seq; tp->window_clamp = dst->window; - tcp_select_initial_window(sock_rspace(sk)/2,sk->mss, + tcp_select_initial_window(sock_rspace(sk)/2,tp->mss_clamp, &tp->rcv_wnd, &tp->window_clamp, sysctl_tcp_window_scaling, diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 665a448bb..94275718b 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -182,7 +182,7 @@ void tcp_probe_timer(unsigned long data) if(sk->zapped) return; - if (sk->sock_readers) { + if (atomic_read(&sk->sock_readers)) { /* Try again in second. */ tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ); return; @@ -432,7 +432,7 @@ void tcp_retransmit_timer(unsigned long data) return; } - if (sk->sock_readers) { + if (atomic_read(&sk->sock_readers)) { /* Try again in a second. */ tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ); return; @@ -518,7 +518,7 @@ static void tcp_syn_recv_timer(unsigned long data) struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; /* TCP_LISTEN is implied.
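The clamp-lowering at the end of tcp_connect() caps what we advertise (mss_clamp) at what we can currently send (mss_cache plus the fixed option bytes), which is exactly what the ANK comment above is uneasy about. With assumed numbers for a 296-byte SLIP mtu and timestamps on:

	int mss_cache = 296 - 20 - 20 - 12;	/* 244, from tcp_sync_mss()  */
	int opt_len   = 12;			/* tcp_header_len - tcphdr   */
	int mss_clamp = 0xFFFF;			/* nothing negotiated yet    */

	if (mss_cache + opt_len < mss_clamp)
		mss_clamp = mss_cache + opt_len;	/* advertise MSS 256 */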
*/ - if (!sk->sock_readers && tp->syn_wait_queue) { + if (!atomic_read(&sk->sock_readers) && tp->syn_wait_queue) { struct open_request *prev = (struct open_request *)(&tp->syn_wait_queue); struct open_request *req = tp->syn_wait_queue; do { diff --git a/net/ipv4/timer.c b/net/ipv4/timer.c index a0501bd19..d5f6d3eb5 100644 --- a/net/ipv4/timer.c +++ b/net/ipv4/timer.c @@ -73,7 +73,7 @@ void net_timer (unsigned long data) int why = sk->timeout; /* Only process if socket is not in use. */ - if (sk->sock_readers) { + if (atomic_read(&sk->sock_readers)) { sk->timer.expires = jiffies+HZ; add_timer(&sk->timer); return; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7e2c7bfa6..eab552c36 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -5,7 +5,7 @@ * * The User Datagram Protocol (UDP). * - * Version: $Id: udp.c,v 1.57 1998/05/14 06:32:44 davem Exp $ + * Version: $Id: udp.c,v 1.61 1998/08/29 17:11:10 freitag Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -59,6 +59,8 @@ * Vitaly E. Lavrov : Transparent proxy revived after year coma. * Melvin Smith : Check msg_name not msg_namelen in sendto(), * return ENOTCONN for unconnected sockets (POSIX) + * Janos Farkas : don't deliver multi/broadcasts to a different + * bound-to-device socket * * * This program is free software; you can redistribute it and/or @@ -80,7 +82,7 @@ MUST provide facility for checksumming (OK) MAY allow application to control checksumming (OK) MUST default to checksumming on (OK) - MUST discard silently datagrams with bad csums (OK) + MUST discard silently datagrams with bad csums (OK, except during debugging) 4.1.3.5 (UDP Multihoming) MUST allow application to specify source address (OK) SHOULD be able to communicate the chosen src addr up to application @@ -93,14 +95,12 @@ #include #include #include -#include #include #include #include #include #include #include -#include #include #include #include @@ -108,14 +108,12 @@ #include #include #include -#include #include #include #include #include #include #include -#include /* * Snmp MIB for the UDP layer @@ -447,7 +445,8 @@ static inline struct sock *udp_v4_mcast_next(struct sock *sk, unsigned short num, unsigned long raddr, unsigned short rnum, - unsigned long laddr) + unsigned long laddr, + int dif) { struct sock *s = sk; unsigned short hnum = ntohs(num); @@ -455,8 +454,9 @@ static inline struct sock *udp_v4_mcast_next(struct sock *sk, if ((s->num != hnum) || (s->dead && (s->state == TCP_CLOSE)) || (s->daddr && s->daddr!=raddr) || - (s->dport != rnum && s->dport != 0) || - (s->rcv_saddr && s->rcv_saddr != laddr)) + (s->dport != rnum && s->dport != 0) || + (s->rcv_saddr && s->rcv_saddr != laddr) || + (s->bound_dev_if && s->bound_dev_if != dif)) continue; break; } @@ -493,7 +493,7 @@ void udp_err(struct sk_buff *skb, unsigned char *dp, int len) return; /* No socket for error */ } - if (sk->ip_recverr && !sk->sock_readers) { + if (sk->ip_recverr && !atomic_read(&sk->sock_readers)) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2 && sock_queue_err_skb(sk, skb2)) kfree_skb(skb2); @@ -619,7 +619,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) struct ipcm_cookie ipc; struct udpfakehdr ufh; struct rtable *rt = NULL; - int free = 0, localroute = 0; + int free = 0; + int connected = 0; u32 daddr; u8 tos; int err; @@ -674,27 +675,15 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) ufh.uh.dest = usin->sin_port; if (ufh.uh.dest == 0) return -EINVAL; - /* XXX: is a one-behind cache for the dst_entry worth it? - - Nope. 
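udp_v4_mcast_next() above gains a dif (arriving device index) argument so that, per the new changelog entry, broadcast and multicast datagrams are no longer delivered to sockets bound to a different device. Every check is "wildcard or exact match"; restated as a standalone predicate (name hypothetical, fields as in the patch):

	static int udp_mc_match(struct sock *s, unsigned short hnum,
				u32 raddr, u16 rnum, u32 laddr, int dif)
	{
		return s->num == hnum &&
		       !(s->dead && s->state == TCP_CLOSE) &&
		       (!s->daddr || s->daddr == raddr) &&		/* peer address */
		       (!s->dport || s->dport == rnum) &&		/* peer port    */
		       (!s->rcv_saddr || s->rcv_saddr == laddr) &&	/* local addr   */
		       (!s->bound_dev_if || s->bound_dev_if == dif);	/* device       */
	}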
ip_route_output is slower than nothing, but it - is enough fast to forget about caching its results. - Really, checking route validity in general case - is not much faster complete lookup. - It was main reason why I removed it from 2.1. - The second reason was that idle sockets held - a lot of stray destinations. --ANK - */ } else { if (sk->state != TCP_ESTABLISHED) return -ENOTCONN; ufh.daddr = sk->daddr; ufh.uh.dest = sk->dport; - - /* - BUGGG Khm... And who will validate it? Fixing it fastly... - --ANK + /* Open fast path for connected socket. + Route will not be used, if at least one option is set. */ - rt = (struct rtable *)dst_check(&sk->dst_cache, 0); + connected = 1; } #ifdef CONFIG_IP_TRANSPARENT_PROXY if (msg->msg_flags&MSG_PROXY) { @@ -710,6 +699,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) ufh.uh.source = from->sin_port; if (ipc.addr == 0) ipc.addr = sk->saddr; + connected = 0; } else #endif { @@ -725,6 +715,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) return err; if (ipc.opt) free = 1; + connected = 0; } if (!ipc.opt) ipc.opt = sk->opt; @@ -736,12 +727,13 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) if (!daddr) return -EINVAL; daddr = ipc.opt->faddr; + connected = 0; } tos = RT_TOS(sk->ip_tos); if (sk->localroute || (msg->msg_flags&MSG_DONTROUTE) || (ipc.opt && ipc.opt->is_strictroute)) { tos |= RTO_ONLINK; - rt = NULL; /* sorry */ + connected = 0; } if (MULTICAST(daddr)) { @@ -749,8 +741,12 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) ipc.oif = sk->ip_mc_index; if (!ufh.saddr) ufh.saddr = sk->ip_mc_addr; + connected = 0; } + if (connected) + rt = (struct rtable*)dst_clone(sk->dst_cache); + if (rt == NULL) { err = ip_route_output(&rt, daddr, ufh.saddr, #ifdef CONFIG_IP_TRANSPARENT_PROXY @@ -759,7 +755,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) tos, ipc.oif); if (err) goto out; - localroute = 1; err = -EACCES; if (rt->rt_flags&RTCF_BROADCAST && !sk->broadcast) @@ -777,17 +772,13 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) /* RFC1122: OK. Provides the checksumming facility (MUST) as per */ /* 4.1.3.4. It's configurable by the application via setsockopt() */ - /* (MAY) and it defaults to on (MUST). Almost makes up for the */ - /* violation above. -- MS */ + /* (MAY) and it defaults to on (MUST). */ - lock_sock(sk); err = ip_build_xmit(sk,sk->no_check ? udp_getfrag_nosum : udp_getfrag, &ufh, ulen, &ipc, rt, msg->msg_flags); - release_sock(sk); out: - if (localroute) - ip_rt_put(rt); + ip_rt_put(rt); if (free) kfree(ipc.opt); if (!err) { @@ -822,7 +813,9 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) if (sk->state == TCP_LISTEN) return(-EINVAL); amount = 0; - /* N.B. Is this interrupt safe?? */ + /* N.B. Is this interrupt safe?? + -> Yes. Interrupts do not remove skbs. 
--ANK (980725) + */ skb = skb_peek(&sk->receive_queue); if (skb != NULL) { /* @@ -841,6 +834,9 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) return(0); } +#if defined(CONFIG_FILTER) || !defined(HAVE_CSUM_COPY_USER) +#undef CONFIG_UDP_DELAY_CSUM +#endif /* * This should be easy, if there is something there we @@ -848,7 +844,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) */ int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len, - int noblock, int flags, int *addr_len) + int noblock, int flags, int *addr_len) { struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; struct sk_buff *skb; @@ -880,18 +876,36 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len, goto out; copied = skb->len - sizeof(struct udphdr); - if (copied > len) - { + if (copied > len) { copied = len; msg->msg_flags |= MSG_TRUNC; } - /* - * FIXME : should use udp header size info value - */ - +#ifndef CONFIG_UDP_DELAY_CSUM err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied); +#else + if (sk->no_check || skb->ip_summed==CHECKSUM_UNNECESSARY) { + err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, + copied); + } else if (copied > msg->msg_iov[0].iov_len || (msg->msg_flags&MSG_TRUNC)) { + if (csum_fold(csum_partial(skb->h.raw, ntohs(skb->h.uh->len), skb->csum))) + goto csum_copy_err; + err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, + copied); + } else { + unsigned int csum; + + err = 0; + csum = csum_partial(skb->h.raw, sizeof(struct udphdr), skb->csum); + csum = csum_and_copy_to_user((char*)&skb->h.uh[1], msg->msg_iov[0].iov_base, + copied, csum, &err); + if (err) + goto out_free; + if (csum_fold(csum)) + goto csum_copy_err; + } +#endif if (err) goto out_free; sk->stamp=skb->stamp; @@ -928,6 +942,18 @@ out_free: skb_free_datagram(sk, skb); out: return err; + +#ifdef CONFIG_UDP_DELAY_CSUM +csum_copy_err: + udp_statistics.UdpInErrors++; + skb_free_datagram(sk, skb); + + /* + * Error for blocking case is chosen to masquerade + * as some normal condition. + */ + return (msg->msg_flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH; +#endif } int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) @@ -986,28 +1012,15 @@ int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) static void udp_close(struct sock *sk, unsigned long timeout) { - lock_sock(sk); + /* See for explanation: raw_close in ipv4/raw.c */ sk->state = TCP_CLOSE; - if(uh_cache_sk == sk) - uh_cache_sk = NULL; - sk->dead = 1; - release_sock(sk); udp_v4_unhash(sk); + sk->dead = 1; destroy_sock(sk); } static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) { - /* - * Check the security clearance - */ - - if(!ipsec_sk_policy(sk,skb)) - { - kfree_skb(skb); - return(0); - } - /* * Charge it to the socket, dropping if the queue is full. 
*/ @@ -1026,10 +1039,6 @@ static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) static inline void udp_deliver(struct sock *sk, struct sk_buff *skb) { - if (sk->sock_readers) { - __skb_queue_tail(&sk->back_log, skb); - return; - } udp_queue_rcv_skb(sk, skb); } @@ -1043,9 +1052,11 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh, u32 saddr, u32 daddr) { struct sock *sk; + int dif; sk = udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]; - sk = udp_v4_mcast_next(sk, uh->dest, saddr, uh->source, daddr); + dif = skb->dev->ifindex; + sk = udp_v4_mcast_next(sk, uh->dest, saddr, uh->source, daddr, dif); if (sk) { struct sock *sknext = NULL; @@ -1053,7 +1064,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh, struct sk_buff *skb1 = skb; sknext = udp_v4_mcast_next(sk->next, uh->dest, saddr, - uh->source, daddr); + uh->source, daddr, dif); if(sknext) skb1 = skb_clone(skb, GFP_ATOMIC); @@ -1113,7 +1124,8 @@ int udp_rcv(struct sk_buff *skb, unsigned short len) */ uh = skb->h.uh; - + __skb_pull(skb, skb->h.raw - skb->data); + ip_statistics.IpInDelivers++; /* @@ -1121,44 +1133,31 @@ int udp_rcv(struct sk_buff *skb, unsigned short len) */ ulen = ntohs(uh->len); - - if (ulen > len || len < sizeof(*uh) || ulen < sizeof(*uh)) { + + if (ulen > len || ulen < sizeof(*uh)) { NETDEBUG(printk(KERN_DEBUG "UDP: short packet: %d/%d\n", ulen, len)); udp_statistics.UdpInErrors++; kfree_skb(skb); return(0); } + skb_trim(skb, ulen); +#ifndef CONFIG_UDP_DELAY_CSUM if (uh->check && - (((skb->ip_summed==CHECKSUM_HW)&&udp_check(uh,len,saddr,daddr,skb->csum)) || + (((skb->ip_summed==CHECKSUM_HW)&&udp_check(uh,ulen,saddr,daddr,skb->csum)) || ((skb->ip_summed==CHECKSUM_NONE) && - (udp_check(uh,len,saddr,daddr, csum_partial((char*)uh, len, 0)))))) { - /* wants to know, who sent it, to - go and stomp on the garbage sender... */ - - /* RFC1122: OK. Discards the bad packet silently (as far as */ - /* the network is concerned, anyway) as per 4.1.3.4 (MUST). */ - - NETDEBUG(printk(KERN_DEBUG "UDP: bad checksum. From %08lX:%d to %08lX:%d ulen %d\n", - ntohl(saddr),ntohs(uh->source), - ntohl(daddr),ntohs(uh->dest), - ulen)); - udp_statistics.UdpInErrors++; - kfree_skb(skb); - return(0); - } - - - len = ulen; - - /* - * FIXME: - * Trimming things wrongly. We must adjust the base/end to allow - * for the headers we keep! - * --ANK - */ - skb_trim(skb,len); - + (udp_check(uh,ulen,saddr,daddr, csum_partial((char*)uh, ulen, 0)))))) + goto csum_error; +#else + if (uh->check==0) + skb->ip_summed = CHECKSUM_UNNECESSARY; + else if (skb->ip_summed==CHECKSUM_HW) { + if (udp_check(uh,ulen,saddr,daddr,skb->csum)) + goto csum_error; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); +#endif if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return udp_v4_mcast_deliver(skb, uh, saddr, daddr); @@ -1173,6 +1172,11 @@ int udp_rcv(struct sk_buff *skb, unsigned short len) sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex); if (sk == NULL) { +#ifdef CONFIG_UDP_DELAY_CSUM + if (skb->ip_summed != CHECKSUM_UNNECESSARY && + csum_fold(csum_partial((char*)uh, ulen, skb->csum))) + goto csum_error; +#endif udp_statistics.UdpNoPorts++; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); @@ -1185,6 +1189,19 @@ int udp_rcv(struct sk_buff *skb, unsigned short len) } udp_deliver(sk, skb); return 0; + +csum_error: + /* + * RFC1122: OK. 
Discards the bad packet silently (as far as + * the network is concerned, anyway) as per 4.1.3.4 (MUST). + */ + NETDEBUG(printk(KERN_DEBUG "UDP: bad checksum. From %08lX:%d to %08lX:%d ulen %d\n", + ntohl(saddr),ntohs(uh->source), + ntohl(daddr),ntohs(uh->dest), + ulen)); + udp_statistics.UdpInErrors++; + kfree_skb(skb); + return(0); } struct proto udp_prot = { @@ -1214,7 +1231,7 @@ struct proto udp_prot = { udp_v4_verify_bind, /* verify_bind */ 128, /* max_header */ 0, /* retransmits */ - "UDP", /* name */ + "UDP", /* name */ 0, /* inuse */ 0 /* highestinuse */ }; -- cgit v1.2.3