Diffstat (limited to 'net/ipv4')
31 files changed, 4047 insertions, 2828 deletions
diff --git a/net/ipv4/Config.in b/net/ipv4/Config.in index 7a44fa565..ccc8c6d1d 100644 --- a/net/ipv4/Config.in +++ b/net/ipv4/Config.in @@ -25,7 +25,6 @@ if [ "$CONFIG_IP_PNP" = "y" ]; then # not yet ready.. # bool ' IP: ARP support' CONFIG_IP_PNP_ARP fi -bool ' IP: optimize as router not host' CONFIG_IP_ROUTER tristate ' IP: tunneling' CONFIG_NET_IPIP tristate ' IP: GRE tunnels over IP' CONFIG_NET_IPGRE if [ "$CONFIG_IP_MULTICAST" = "y" ]; then @@ -38,7 +37,6 @@ if [ "$CONFIG_IP_MULTICAST" = "y" ]; then bool ' IP: PIM-SM version 2 support' CONFIG_IP_PIMSM_V2 fi fi -bool ' IP: aliasing support' CONFIG_IP_ALIAS if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then if [ "$CONFIG_RTNETLINK" = "y" ]; then bool ' IP: ARP daemon support (EXPERIMENTAL)' CONFIG_ARPD diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index b162de66c..1a6a53bc8 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -11,7 +11,7 @@ O_TARGET := ipv4.o IPV4_OBJS := utils.o route.o inetpeer.o proc.o protocol.o \ ip_input.o ip_fragment.o ip_forward.o ip_options.o \ ip_output.o ip_sockglue.o \ - tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o\ + tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \ raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o IPV4X_OBJS := diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 40aa7cd3a..6e80ed912 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -5,7 +5,7 @@ * * PF_INET protocol family socket handler. * - * Version: $Id: af_inet.c,v 1.110 2000/04/25 04:13:34 davem Exp $ + * Version: $Id: af_inet.c,v 1.112 2000/08/16 16:20:56 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -159,6 +159,8 @@ void inet_sock_destruct(struct sock *sk) BUG_TRAP(atomic_read(&sk->rmem_alloc) == 0); BUG_TRAP(atomic_read(&sk->wmem_alloc) == 0); + BUG_TRAP(sk->wmem_queued == 0); + BUG_TRAP(sk->forward_alloc == 0); if (sk->protinfo.af_inet.opt) kfree(sk->protinfo.af_inet.opt); @@ -300,9 +302,6 @@ out: /* * Create an inet socket. - * - * FIXME: Gcc would generate much better code if we set the parameters - * up in in-memory structure order. Gcc68K even more so */ static int inet_create(struct socket *sock, int protocol) @@ -494,6 +493,8 @@ static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; } + if (sk->rcv_saddr) + sk->userlocks |= SOCK_BINDADDR_LOCK; sk->sport = htons(sk->num); sk->daddr = 0; sk->dport = 0; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 050f5283d..7af589b75 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1,7 +1,7 @@ /* * NET3 IP device support routines. 
* - * Version: $Id: devinet.c,v 1.37 2000/07/26 01:04:15 davem Exp $ + * Version: $Id: devinet.c,v 1.38 2000/08/19 23:22:56 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -470,9 +470,7 @@ int devinet_ioctl(unsigned int cmd, void *arg) struct in_ifaddr **ifap = NULL; struct in_ifaddr *ifa = NULL; struct net_device *dev; -#ifdef CONFIG_IP_ALIAS char *colon; -#endif int ret = 0; /* @@ -483,11 +481,9 @@ int devinet_ioctl(unsigned int cmd, void *arg) return -EFAULT; ifr.ifr_name[IFNAMSIZ-1] = 0; -#ifdef CONFIG_IP_ALIAS colon = strchr(ifr.ifr_name, ':'); if (colon) *colon = 0; -#endif #ifdef CONFIG_KMOD dev_load(ifr.ifr_name); @@ -530,10 +526,8 @@ int devinet_ioctl(unsigned int cmd, void *arg) goto done; } -#ifdef CONFIG_IP_ALIAS if (colon) *colon = ':'; -#endif if ((in_dev=__in_dev_get(dev)) != NULL) { for (ifap=&in_dev->ifa_list; (ifa=*ifap) != NULL; ifap=&ifa->ifa_next) @@ -564,7 +558,6 @@ int devinet_ioctl(unsigned int cmd, void *arg) goto rarok; case SIOCSIFFLAGS: -#ifdef CONFIG_IP_ALIAS if (colon) { if (ifa == NULL) { ret = -EADDRNOTAVAIL; @@ -574,7 +567,6 @@ int devinet_ioctl(unsigned int cmd, void *arg) inet_del_ifa(in_dev, ifap, 1); break; } -#endif ret = dev_change_flags(dev, ifr.ifr_flags); break; @@ -589,12 +581,10 @@ int devinet_ioctl(unsigned int cmd, void *arg) ret = -ENOBUFS; break; } -#ifdef CONFIG_IP_ALIAS if (colon) memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ); else -#endif - memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); } else { ret = 0; if (ifa->ifa_local == sin->sin_addr.s_addr) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 2601d2412..bc8de3496 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: semantics. * - * Version: $Id: fib_semantics.c,v 1.16 2000/06/21 17:14:50 davem Exp $ + * Version: $Id: fib_semantics.c,v 1.17 2000/08/19 23:22:56 davem Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * @@ -764,25 +764,20 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm, rtm->rtm_type = RTN_UNICAST; if (r->rt_dev) { -#ifdef CONFIG_IP_ALIAS char *colon; -#endif struct net_device *dev; char devname[IFNAMSIZ]; if (copy_from_user(devname, r->rt_dev, IFNAMSIZ-1)) return -EFAULT; devname[IFNAMSIZ-1] = 0; -#ifdef CONFIG_IP_ALIAS colon = strchr(devname, ':'); if (colon) *colon = 0; -#endif dev = __dev_get_by_name(devname); if (!dev) return -ENODEV; rta->rta_oif = &dev->ifindex; -#ifdef CONFIG_IP_ALIAS if (colon) { struct in_ifaddr *ifa; struct in_device *in_dev = __in_dev_get(dev); @@ -796,7 +791,6 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm, return -ENODEV; rta->rta_prefsrc = &ifa->ifa_local; } -#endif } ptr = &((struct sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 700bb24f7..bb76f7ce5 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) module. * - * Version: $Id: ip_input.c,v 1.48 2000/04/15 01:48:10 davem Exp $ + * Version: $Id: ip_input.c,v 1.49 2000/08/21 20:41:55 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -297,7 +297,6 @@ int ip_local_deliver(struct sk_buff *skb) skb = ip_defrag(skb); if (!skb) return 0; - iph = skb->nh.iph; } return NF_HOOK(PF_INET, NF_IP_LOCAL_IN, skb, skb->dev, NULL, diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 22429bb5e..4287c7525 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -5,7 +5,7 @@ * * The IP to API glue. * - * Version: $Id: ip_sockglue.c,v 1.50 2000/07/26 01:04:17 davem Exp $ + * Version: $Id: ip_sockglue.c,v 1.51 2000/08/09 11:59:04 davem Exp $ * * Authors: see ip.c * @@ -724,16 +724,14 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *op break; case IP_MULTICAST_IF: { - struct ip_mreqn mreq; - len = min(len,sizeof(struct ip_mreqn)); - mreq.imr_ifindex = sk->protinfo.af_inet.mc_index; - mreq.imr_address.s_addr = sk->protinfo.af_inet.mc_addr; - mreq.imr_multiaddr.s_addr = 0; + struct in_addr addr; + len = min(len,sizeof(struct in_addr)); + addr.s_addr = sk->protinfo.af_inet.mc_addr; release_sock(sk); if(put_user(len, optlen)) return -EFAULT; - if(copy_to_user((void *)optval, &mreq, len)) + if(copy_to_user((void *)optval, &addr, len)) return -EFAULT; return 0; } diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index cbf6e19eb..cb430624f 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -19,7 +19,8 @@ IP_NF_COMPAT_LAYER:=ip_fw_compat.o ip_fw_compat_redir.o ip_fw_compat_masq.o $(IP # Link order matters here. ifeq ($(CONFIG_IP_NF_CONNTRACK),y) -O_OBJS += ip_conntrack_standalone.o $(IP_NF_CONNTRACK_OBJ) +OX_OBJS += ip_conntrack_standalone.o +O_OBJS += $(IP_NF_CONNTRACK_OBJ) else ifeq ($(CONFIG_IP_NF_CONNTRACK),m) MI_OBJS += $(IP_NF_CONNTRACK_OBJ) diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index da3f97821..2e4dd82ee 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -660,8 +660,8 @@ unsigned int ip_conntrack_in(unsigned int hooknum, } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) == htonl(0x000000FF)) { printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n", - IP_PARTS((*pskb)->nh.iph->saddr), - IP_PARTS((*pskb)->nh.iph->daddr), + NIPQUAD((*pskb)->nh.iph->saddr), + NIPQUAD((*pskb)->nh.iph->daddr), (*pskb)->sk, (*pskb)->pkt_type); } #endif @@ -998,7 +998,7 @@ getorigdst(struct sock *sk, int optval, void *user, int *len) .tuple.dst.ip; DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n", - IP_PARTS(sin.sin_addr.s_addr), ntohs(sin.sin_port)); + NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); ip_conntrack_put(h->ctrack); if (copy_to_user(user, &sin, sizeof(sin)) != 0) return -EFAULT; @@ -1006,8 +1006,8 @@ getorigdst(struct sock *sk, int optval, void *user, int *len) return 0; } DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n", - IP_PARTS(tuple.src.ip), ntohs(tuple.src.u.tcp.port), - IP_PARTS(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port)); + NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port), + NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port)); return -ENOENT; } diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index ce0023ec3..cfdb28f12 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -21,14 +21,6 @@ struct module *ip_conntrack_ftp = THIS_MODULE; #define DEBUGP(format, args...) 
#endif -#define IP_PARTS_NATIVE(n) \ -(unsigned int)((n)>>24)&0xFF, \ -(unsigned int)((n)>>16)&0xFF, \ -(unsigned int)((n)>>8)&0xFF, \ -(unsigned int)((n)&0xFF) - -#define IP_PARTS(n) IP_PARTS_NATIVE(ntohl(n)) - static struct { const char *pattern; size_t plen; @@ -111,7 +103,7 @@ static int help(const struct iphdr *iph, size_t len, struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) { - /* tcplen not negative guarenteed by ip_conntrack_tcp.c */ + /* tcplen not negative guaranteed by ip_conntrack_tcp.c */ struct tcphdr *tcph = (void *)iph + iph->ihl * 4; const char *data = (const char *)tcph + tcph->doff * 4; unsigned int tcplen = len - iph->ihl * 4; @@ -142,8 +134,8 @@ static int help(const struct iphdr *iph, size_t len, if (tcp_v4_check(tcph, tcplen, iph->saddr, iph->daddr, csum_partial((char *)tcph, tcplen, 0))) { DEBUGP("ftp_help: bad csum: %p %u %u.%u.%u.%u %u.%u.%u.%u\n", - tcph, tcplen, IP_PARTS(iph->saddr), - IP_PARTS(iph->daddr)); + tcph, tcplen, NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr)); return NF_ACCEPT; } diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index 20e4aa426..f1faab1be 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -332,7 +332,6 @@ static void __exit fini(void) module_init(init); module_exit(fini); -#ifdef MODULE EXPORT_SYMBOL(ip_conntrack_protocol_register); EXPORT_SYMBOL(invert_tuplepr); EXPORT_SYMBOL(ip_conntrack_alter_reply); @@ -346,4 +345,3 @@ EXPORT_SYMBOL(ip_ct_refresh); EXPORT_SYMBOL(ip_conntrack_expect_related); EXPORT_SYMBOL(ip_conntrack_tuple_taken); EXPORT_SYMBOL(ip_ct_gather_frags); -#endif diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index 0f7b4f8ca..9ba62dc84 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -206,7 +206,7 @@ do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp) /* FIXME: IPTOS_TOS(iph->tos) --RR */ if (ip_route_output(&rt, var_ip, 0, 0, 0) != 0) { DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n", - IP_PARTS(var_ip)); + NIPQUAD(var_ip)); return 0; } @@ -312,7 +312,7 @@ find_best_ips_proto(struct ip_conntrack_tuple *tuple, && *var_ipp != orig_dstip && !do_extra_mangle(*var_ipp, other_ipp)) { DEBUGP("Range %u %u.%u.%u.%u rt failed!\n", - i, IP_PARTS(*var_ipp)); + i, NIPQUAD(*var_ipp)); /* Can't route? This whole range part is * probably screwed, but keep trying * anyway. */ @@ -513,8 +513,8 @@ ip_nat_setup_info(struct ip_conntrack *conntrack, ? " PROTO_SPECIFIED" : "", (mr->range[i].flags & IP_NAT_RANGE_FULL) ? " FULL" : "", - IP_PARTS(mr->range[i].min_ip), - IP_PARTS(mr->range[i].max_ip), + NIPQUAD(mr->range[i].min_ip), + NIPQUAD(mr->range[i].max_ip), mr->range[i].min.all, mr->range[i].max.all); } @@ -715,7 +715,7 @@ do_bindings(struct ip_conntrack *ct, *pskb, info->manips[i].maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST", - IP_PARTS(info->manips[i].manip.ip), + NIPQUAD(info->manips[i].manip.ip), htons(info->manips[i].manip.u.all)); manip_pkt((*pskb)->nh.iph->protocol, (*pskb)->nh.iph, @@ -797,7 +797,7 @@ icmp_reply_translation(struct sk_buff *skb, DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n", info->manips[i].maniptype == IP_NAT_MANIP_SRC ? 
"DST" : "SRC", - IP_PARTS(info->manips[i].manip.ip), + NIPQUAD(info->manips[i].manip.ip), ntohs(info->manips[i].manip.u.udp.port)); manip_pkt(inner->protocol, inner, skb->len - ((void *)inner - (void *)iph), @@ -812,7 +812,7 @@ icmp_reply_translation(struct sk_buff *skb, DEBUGP("icmp_reply: outer %s -> %u.%u.%u.%u\n", info->manips[i].maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST", - IP_PARTS(info->manips[i].manip.ip)); + NIPQUAD(info->manips[i].manip.ip)); manip_pkt(0, iph, skb->len, &info->manips[i].manip, info->manips[i].maniptype, diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c index d4eb36405..c3d8ccab0 100644 --- a/net/ipv4/netfilter/ip_nat_ftp.c +++ b/net/ipv4/netfilter/ip_nat_ftp.c @@ -54,13 +54,13 @@ ftp_nat_expected(struct sk_buff **pskb, newdstip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip; newsrcip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip; DEBUGP("nat_expected: PORT cmd. %u.%u.%u.%u->%u.%u.%u.%u\n", - IP_PARTS(newsrcip), IP_PARTS(newdstip)); + NIPQUAD(newsrcip), NIPQUAD(newdstip)); } else { /* PASV command: make the connection go to the server */ newdstip = master->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip; newsrcip = master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip; DEBUGP("nat_expected: PASV cmd. %u.%u.%u.%u->%u.%u.%u.%u\n", - IP_PARTS(newsrcip), IP_PARTS(newdstip)); + NIPQUAD(newsrcip), NIPQUAD(newdstip)); } UNLOCK_BH(&ip_ftp_lock); @@ -69,7 +69,7 @@ ftp_nat_expected(struct sk_buff **pskb, else newip = newdstip; - DEBUGP("nat_expected: IP to %u.%u.%u.%u\n", IP_PARTS(newip)); + DEBUGP("nat_expected: IP to %u.%u.%u.%u\n", NIPQUAD(newip)); mr.rangesize = 1; /* We don't want to manip the per-protocol, just the IPs... */ @@ -110,7 +110,7 @@ mangle_packet(struct sk_buff **pskb, MUST_BE_LOCKED(&ip_ftp_lock); sprintf(buffer, "%u,%u,%u,%u,%u,%u", - IP_PARTS(newip), port>>8, port&0xFF); + NIPQUAD(newip), port>>8, port&0xFF); tcplen = (*pskb)->len - iph->ihl * 4; newtcplen = tcplen - matchlen + strlen(buffer); diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c index 1ebea495c..a22858cb3 100644 --- a/net/ipv4/netfilter/ip_nat_rule.c +++ b/net/ipv4/netfilter/ip_nat_rule.c @@ -226,7 +226,7 @@ alloc_null_binding(struct ip_conntrack *conntrack, = { 1, { { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } } } }; DEBUGP("Allocating NULL binding for %p (%u.%u.%u.%u)\n", conntrack, - IP_PARTS(ip)); + NIPQUAD(ip)); return ip_nat_setup_info(conntrack, &mr, hooknum); } diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 4f8a8de07..85787ed88 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -4,10 +4,11 @@ * * (C) 2000 James Morris, this code is GPL. * - * 2000-03-27: Simplified code (thanks to Andi Kleen for clues). (JM) - * 2000-05-20: Fixed notifier problems (following Miguel Freitas' report). (JM) + * 2000-03-27: Simplified code (thanks to Andi Kleen for clues). + * 2000-05-20: Fixed notifier problems (following Miguel Freitas' report). * 2000-06-19: Fixed so nfmark is copied to metadata (reported by Sebastian - * Zander). (JM) + * Zander). + * 2000-08-01: Added Nick Williams' MAC support. 
* */ #include <linux/module.h> @@ -398,6 +399,14 @@ static struct sk_buff *netlink_build_message(ipq_queue_element_t *e, int *errp) else pm->indev_name[0] = '\0'; if (e->info->outdev) strcpy(pm->outdev_name, e->info->outdev->name); else pm->outdev_name[0] = '\0'; + pm->hw_protocol = e->skb->protocol; + if (e->skb->rx_dev) { + pm->hw_type = e->skb->rx_dev->type; + if (e->skb->rx_dev->hard_header_parse) + pm->hw_addrlen = + e->skb->rx_dev->hard_header_parse(e->skb, + pm->hw_addr); + } if (data_len) memcpy(pm->payload, e->skb->data, data_len); nlh->nlmsg_len = skb->tail - old_tail; diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index a04a5a801..bdb4fd99c 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -288,14 +288,15 @@ ipt_log_target(struct sk_buff **pskb, if (in && !out) { /* MAC logging for input chain only. */ printk("MAC="); - if ((*pskb)->dev && (*pskb)->dev->hard_header_len) { + if ((*pskb)->dev && (*pskb)->dev->hard_header_len && (*pskb)->mac.raw != iph) { int i; unsigned char *p = (*pskb)->mac.raw; for (i = 0; i < (*pskb)->dev->hard_header_len; i++,p++) printk("%02x%c", *p, i==(*pskb)->dev->hard_header_len - 1 ? ' ':':'); - } + } else + printk(" "); } dump_packet(loginfo, iph, (*pskb)->len, 1); diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 2f9c11915..99164a7a0 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -92,7 +92,7 @@ masquerade_target(struct sk_buff **pskb, } newsrc = rt->rt_src; - DEBUGP("newsrc = %u.%u.%u.%u\n", IP_PARTS(newsrc)); + DEBUGP("newsrc = %u.%u.%u.%u\n", NIPQUAD(newsrc)); ip_rt_put(rt); WRITE_LOCK(&masq_lock); diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 7c8bf2f1e..2d8ad255f 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -247,11 +247,6 @@ static int check(const char *tablename, DEBUGP("REJECT: TCP_RESET illegal for non-tcp\n"); return 0; } - /* Only for local input. Rest is too dangerous. 
*/ - if ((hook_mask & ~(1 << NF_IP_LOCAL_IN)) != 0) { - DEBUGP("REJECT: TCP_RESET only from INPUT\n"); - return 0; - } } return 1; diff --git a/net/ipv4/netfilter/ipt_limit.c b/net/ipv4/netfilter/ipt_limit.c index ae0904a4d..6665f1ce4 100644 --- a/net/ipv4/netfilter/ipt_limit.c +++ b/net/ipv4/netfilter/ipt_limit.c @@ -15,14 +15,6 @@ #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_limit.h> -#define IP_PARTS_NATIVE(n) \ -(unsigned int)((n)>>24)&0xFF, \ -(unsigned int)((n)>>16)&0xFF, \ -(unsigned int)((n)>>8)&0xFF, \ -(unsigned int)((n)&0xFF) - -#define IP_PARTS(n) IP_PARTS_NATIVE(ntohl(n)) - /* The algorithm used is the Simple Token Bucket Filter (TBF) * see net/sched/sch_tbf.c in the linux source tree */ diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c index 82bafe19f..7467dfaf0 100644 --- a/net/ipv4/netfilter/ipt_owner.c +++ b/net/ipv4/netfilter/ipt_owner.c @@ -15,18 +15,29 @@ static int match_pid(const struct sk_buff *skb, pid_t pid) { struct task_struct *p; + struct files_struct *files; int i; read_lock(&tasklist_lock); p = find_task_by_pid(pid); - if(p && p->files) { - for (i=0; i < p->files->max_fds; i++) { - if (fcheck_files(p->files, i) == skb->sk->socket->file) { + if (!p) + goto out; + task_lock(p); + files = p->files; + if(files) { + read_lock(&files->file_lock); + for (i=0; i < files->max_fds; i++) { + if (fcheck_files(files, i) == skb->sk->socket->file) { + read_unlock(&files->file_lock); + task_unlock(p); read_unlock(&tasklist_lock); return 1; } - } + } + read_unlock(&files->file_lock); } + task_unlock(p); +out: read_unlock(&tasklist_lock); return 0; } @@ -35,19 +46,28 @@ static int match_sid(const struct sk_buff *skb, pid_t sid) { struct task_struct *p; + struct file *file = skb->sk->socket->file; int i, found=0; read_lock(&tasklist_lock); for_each_task(p) { - if ((p->session != sid) || !p->files) + struct files_struct *files; + if (p->session != sid) continue; - for (i=0; i < p->files->max_fds; i++) { - if (fcheck_files(p->files, i) == skb->sk->socket->file) { - found = 1; - break; + task_lock(p); + files = p->files; + if (files) { + read_lock(&files->file_lock); + for (i=0; i < files->max_fds; i++) { + if (fcheck_files(files, i) == file) { + found = 1; + break; + } } + read_unlock(&files->file_lock); } + task_unlock(p); if(found) break; } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index f1ff8f1ee..559d75aac 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -7,7 +7,7 @@ * PROC file system. It is mainly used for debugging and * statistics. * - * Version: $Id: proc.c,v 1.43 2000/07/07 22:29:42 davem Exp $ + * Version: $Id: proc.c,v 1.44 2000/08/09 11:59:04 davem Exp $ * * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Gerald J. 
Heim, <heim@peanuts.informatik.uni-tuebingen.de> @@ -71,9 +71,11 @@ int afinet_get_info(char *buffer, char **start, off_t offset, int length) int len = socket_get_info(buffer,start,offset,length); - len += sprintf(buffer+len,"TCP: inuse %d orphan %d tw %d\n", + len += sprintf(buffer+len,"TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", fold_prot_inuse(&tcp_prot), - atomic_read(&tcp_orphan_count), tcp_tw_count); + atomic_read(&tcp_orphan_count), tcp_tw_count, + atomic_read(&tcp_sockets_allocated), + atomic_read(&tcp_memory_allocated)); len += sprintf(buffer+len,"UDP: inuse %d\n", fold_prot_inuse(&udp_prot)); len += sprintf(buffer+len,"RAW: inuse %d\n", @@ -175,7 +177,22 @@ int netstat_get_info(char *buffer, char **start, off_t offset, int length) " ListenOverflows ListenDrops" " TCPPrequeued TCPDirectCopyFromBacklog" " TCPDirectCopyFromPrequeue TCPPrequeueDropped" - " TCPHPHits TCPHPHitsToUser\n" + " TCPHPHits TCPHPHitsToUser" + " TCPPureAcks TCPHPAcks" + " TCPRenoRecovery TCPSackRecovery" + " TCPSACKReneging" + " TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder" + " TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo" + " TCPLoss TCPLostRetransmit" + " TCPRenoFailures TCPSackFailures TCPLossFailures" + " TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans" + " TCPTimeouts" + " TCPRenoRecoveryFail TCPSackRecoveryFail" + " TCPSchedulerFailed TCPRcvCollapsed" + " TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv" + " TCPAbortOnSyn TCPAbortOnData TCPAbortOnClose" + " TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger" + " TCPAbortFailed TCPMemoryPressures\n" "TcpExt:"); for (i=0; i<offsetof(struct linux_mib, __pad)/sizeof(unsigned long); i++) len += sprintf(buffer+len, " %lu", fold_field((unsigned long*)net_statistics, sizeof(struct linux_mib), i)); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 5ac30dc40..81f20361a 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -5,7 +5,7 @@ * * RAW - implementation of IP "raw" sockets. * - * Version: $Id: raw.c,v 1.52 2000/07/08 00:20:43 davem Exp $ + * Version: $Id: raw.c,v 1.53 2000/08/09 11:59:04 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -628,8 +628,8 @@ static void get_raw_sock(struct sock *sp, char *tmpbuf, int i) i, src, srcp, dest, destp, sp->state, atomic_read(&sp->wmem_alloc), atomic_read(&sp->rmem_alloc), timer_active, timer_expires-jiffies, 0, - sp->socket->inode->i_uid, 0, - sp->socket ? sp->socket->inode->i_ino : 0, + sock_i_uid(sp), 0, + sock_i_ino(sp), atomic_read(&sp->refcnt), sp); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index eb00518bd..d4e9806a0 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -5,7 +5,7 @@ * * ROUTE - implementation of the IP router. * - * Version: $Id: route.c,v 1.88 2000/07/07 23:47:45 davem Exp $ + * Version: $Id: route.c,v 1.89 2000/08/09 11:59:04 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -1127,8 +1127,6 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) memcpy(&rt->u.dst.mxlock, fi->fib_metrics, sizeof(fi->fib_metrics)); if (fi->fib_mtu == 0) { rt->u.dst.pmtu = rt->u.dst.dev->mtu; - if (rt->u.dst.pmtu > IP_MAX_MTU) - rt->u.dst.pmtu = IP_MAX_MTU; if (rt->u.dst.mxlock&(1<<RTAX_MTU) && rt->rt_gateway != rt->rt_dst && rt->u.dst.pmtu > 576) @@ -1139,9 +1137,9 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) #endif } else { rt->u.dst.pmtu = rt->u.dst.dev->mtu; - if (rt->u.dst.pmtu > IP_MAX_MTU) - rt->u.dst.pmtu = IP_MAX_MTU; } + if (rt->u.dst.pmtu > IP_MAX_MTU) + rt->u.dst.pmtu = IP_MAX_MTU; if (rt->u.dst.advmss == 0) rt->u.dst.advmss = max(rt->u.dst.dev->mtu-40, ip_rt_min_advmss); if (rt->u.dst.advmss > 65535-40) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d9416525b..4274045e8 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1,7 +1,7 @@ /* * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem. * - * $Id: sysctl_net_ipv4.c,v 1.43 2000/01/16 05:11:27 davem Exp $ + * $Id: sysctl_net_ipv4.c,v 1.44 2000/08/09 11:59:04 davem Exp $ * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS] @@ -209,6 +209,24 @@ ctl_table ipv4_table[] = { &proc_dointvec_jiffies, &sysctl_jiffies}, {NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries", &sysctl_tcp_orphan_retries, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_FACK, "tcp_fack", + &sysctl_tcp_fack, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_REORDERING, "tcp_reordering", + &sysctl_tcp_reordering, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_ECN, "tcp_ecn", + &sysctl_tcp_ecn, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_DSACK, "tcp_dsack", + &sysctl_tcp_dsack, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_MEM, "tcp_mem", + &sysctl_tcp_mem, sizeof(sysctl_tcp_mem), 0644, NULL, &proc_dointvec}, + {NET_TCP_WMEM, "tcp_wmem", + &sysctl_tcp_wmem, sizeof(sysctl_tcp_wmem), 0644, NULL, &proc_dointvec}, + {NET_TCP_RMEM, "tcp_rmem", + &sysctl_tcp_rmem, sizeof(sysctl_tcp_rmem), 0644, NULL, &proc_dointvec}, + {NET_TCP_APP_WIN, "tcp_app_win", + &sysctl_tcp_app_win, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale", + &sysctl_tcp_adv_win_scale, sizeof(int), 0644, NULL, &proc_dointvec}, {0} }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index dbf680233..d828a7f3f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.170 2000/07/08 00:20:43 davem Exp $ + * Version: $Id: tcp.c,v 1.173 2000/08/15 20:15:23 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -201,7 +201,7 @@ * tcp_do_sendmsg to avoid burstiness. * Eric Schenk : Fix fast close down bug with * shutdown() followed by close(). - * Andi Kleen : Make poll agree with SIGIO + * Andi Kleen : Make poll agree with SIGIO * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and * lingertime == 0 (RFC 793 ABORT Call) * @@ -436,6 +436,96 @@ kmem_cache_t *tcp_timewait_cachep; atomic_t tcp_orphan_count = ATOMIC_INIT(0); +int sysctl_tcp_mem[3] = { 0, }; +int sysctl_tcp_wmem[3] = { 4*1024, 16*1024, 128*1024 }; +int sysctl_tcp_rmem[3] = { 4*1024, 87380, 87380*2 }; + +atomic_t tcp_memory_allocated; /* Current allocated memory. 
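The sysctl entries added above export the new TCP memory limits as three-element vectors; tcp.c initialises sysctl_tcp_wmem and sysctl_tcp_rmem directly and leaves sysctl_tcp_mem to be sized at boot in tcp_init() further down. A small user-space sketch that reads one of the triples through procfs (the /proc/sys/net/ipv4 path follows the usual sysctl naming, and the min/default/max labels are the conventional reading of these vectors rather than something stated in this patch):

#include <stdio.h>

/* Read a three-element net.ipv4 sysctl vector such as tcp_rmem, tcp_wmem
 * or tcp_mem and print its entries.  For tcp_mem the three values are the
 * "under limit" / "pressure" / "hard limit" thresholds that
 * tcp_mem_schedule() tests. */
int main(void)
{
    long v[3];
    FILE *f = fopen("/proc/sys/net/ipv4/tcp_rmem", "r");

    if (!f || fscanf(f, "%ld %ld %ld", &v[0], &v[1], &v[2]) != 3) {
        perror("tcp_rmem");
        return 1;
    }
    fclose(f);
    printf("tcp_rmem: min=%ld default=%ld max=%ld\n", v[0], v[1], v[2]);
    return 0;
}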
*/ +atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */ + +/* Pressure flag: try to collapse. + * Technical note: it is used by multiple contexts non atomically. + * All the tcp_mem_schedule() is of this nature: accounting + * is strict, actions are advisory and have some latency. */ +int tcp_memory_pressure; + +#define TCP_PAGES(amt) (((amt)+TCP_MEM_QUANTUM-1)/TCP_MEM_QUANTUM) + +int tcp_mem_schedule(struct sock *sk, int size, int kind) +{ + int amt = TCP_PAGES(size); + + sk->forward_alloc += amt*TCP_MEM_QUANTUM; + atomic_add(amt, &tcp_memory_allocated); + + /* Under limit. */ + if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + if (tcp_memory_pressure) + tcp_memory_pressure = 0; + return 1; + } + + /* Over hard limit. */ + if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) { + tcp_enter_memory_pressure(); + goto suppress_allocation; + } + + /* Under pressure. */ + if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1]) + tcp_enter_memory_pressure(); + + if (kind) { + if (atomic_read(&sk->rmem_alloc) < sysctl_tcp_rmem[0]) + return 1; + } else { + if (sk->wmem_queued < sysctl_tcp_wmem[0]) + return 1; + } + + if (!tcp_memory_pressure || + sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated) + * TCP_PAGES(sk->wmem_queued+atomic_read(&sk->rmem_alloc)+ + sk->forward_alloc)) + return 1; + +suppress_allocation: + + if (kind == 0) { + tcp_moderate_sndbuf(sk); + + /* Fail only if socket is _under_ its sndbuf. + * In this case we cannot block, so that we have to fail. + */ + if (sk->wmem_queued+size >= sk->sndbuf) + return 1; + } + + /* Alas. Undo changes. */ + sk->forward_alloc -= amt*TCP_MEM_QUANTUM; + atomic_sub(amt, &tcp_memory_allocated); + return 0; +} + +void __tcp_mem_reclaim(struct sock *sk) +{ + if (sk->forward_alloc >= TCP_MEM_QUANTUM) { + atomic_sub(sk->forward_alloc/TCP_MEM_QUANTUM, &tcp_memory_allocated); + sk->forward_alloc &= (TCP_MEM_QUANTUM-1); + if (tcp_memory_pressure && + atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) + tcp_memory_pressure = 0; + } +} + +void tcp_rfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + atomic_sub(skb->truesize, &sk->rmem_alloc); + sk->forward_alloc += skb->truesize; +} + /* * LISTEN is a special case for poll.. */ @@ -504,6 +594,9 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) /* Connected? */ if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) { + /* Potential race condition. If read of tp below will + * escape above sk->state, we can be illegally awaken + * in SYN_* states. */ if ((tp->rcv_nxt != tp->copied_seq) && (tp->urg_seq != tp->copied_seq || tp->rcv_nxt != tp->copied_seq+1 || @@ -511,7 +604,7 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) mask |= POLLIN | POLLRDNORM; if (!(sk->shutdown & SEND_SHUTDOWN)) { - if (sock_wspace(sk) >= tcp_min_write_space(sk)) { + if (tcp_wspace(sk) >= tcp_min_write_space(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); @@ -521,7 +614,7 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) * wspace test but before the flags are set, * IO signal will be lost. */ - if (sock_wspace(sk) >= tcp_min_write_space(sk)) + if (tcp_wspace(sk) >= tcp_min_write_space(sk)) mask |= POLLOUT | POLLWRNORM; } } @@ -533,39 +626,10 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) } /* - * Socket write_space callback. 
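tcp_mem_schedule() above charges memory in whole quanta: TCP_PAGES() rounds the request up, the socket's forward_alloc credit grows by that many bytes, the global tcp_memory_allocated counter by that many quanta, and the total is then tested against the three sysctl_tcp_mem thresholds. The charging arithmetic alone, as a standalone sketch (TCP_MEM_QUANTUM is assumed here to be the 4096-byte page size; its definition is not shown in this patch):

#include <stdio.h>

#define TCP_MEM_QUANTUM 4096   /* assumed quantum size for illustration */
#define TCP_PAGES(amt) (((amt) + TCP_MEM_QUANTUM - 1) / TCP_MEM_QUANTUM)

int main(void)
{
    int forward_alloc = 0;      /* per-socket byte credit */
    int memory_allocated = 0;   /* global count, in quanta */
    int sizes[] = { 1500, 536, 9000 };

    for (unsigned i = 0; i < 3; i++) {
        int quanta = TCP_PAGES(sizes[i]);

        forward_alloc    += quanta * TCP_MEM_QUANTUM;
        memory_allocated += quanta;
        printf("charge %5d bytes -> %d quanta, forward_alloc=%d, total quanta=%d\n",
               sizes[i], quanta, forward_alloc, memory_allocated);
    }
    /* __tcp_mem_reclaim() later hands whole unused quanta back to the pool,
     * keeping only the sub-quantum remainder in forward_alloc. */
    return 0;
}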
- * This (or rather the sock_wake_async) should agree with poll. - * - * WARNING. This callback is called, when socket is not locked. - * - * This wakeup is used by TCP only as dead-lock breaker, real - * wakeup occurs when incoming ack frees some space in buffer. + * TCP socket write_space callback. Not used. */ void tcp_write_space(struct sock *sk) { - struct socket *sock; - - read_lock(&sk->callback_lock); - if ((sock = sk->socket) != NULL && atomic_read(&sk->wmem_alloc) == 0) { - if (test_bit(SOCK_NOSPACE, &sock->flags)) { - if (sk->sleep && waitqueue_active(sk->sleep)) { - clear_bit(SOCK_NOSPACE, &sock->flags); - wake_up_interruptible(sk->sleep); - } - } - - if (sock->fasync_list) - sock_wake_async(sock, 2, POLL_OUT); - } - read_unlock(&sk->callback_lock); -} - -/* Listening TCP sockets never sleep to wait for memory, so - * it is completely silly to wake them up on queue space - * available events. So we hook them up to this dummy callback. - */ -static void tcp_listen_write_space(struct sock *sk) -{ } int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) @@ -647,7 +711,6 @@ int tcp_listen_start(struct sock *sk) if (sk->prot->get_port(sk, sk->num) == 0) { sk->sport = htons(sk->num); - sk->write_space = tcp_listen_write_space; sk_dst_reset(sk); sk->prot->hash(sk); @@ -774,7 +837,7 @@ static int wait_for_tcp_connect(struct sock * sk, int flags, long *timeo_p) static inline int tcp_memory_free(struct sock *sk) { - return atomic_read(&sk->wmem_alloc) < sk->sndbuf; + return sk->wmem_queued < sk->sndbuf; } /* @@ -782,33 +845,44 @@ static inline int tcp_memory_free(struct sock *sk) */ static long wait_for_tcp_memory(struct sock * sk, long timeo) { - if (!tcp_memory_free(sk)) { - DECLARE_WAITQUEUE(wait, current); + long vm_wait = 0; + long current_timeo = timeo; + DECLARE_WAITQUEUE(wait, current); - clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); + if (tcp_memory_free(sk)) + current_timeo = vm_wait = (net_random()%(HZ/5))+2; - add_wait_queue(sk->sleep, &wait); - for (;;) { - set_bit(SOCK_NOSPACE, &sk->socket->flags); + clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); - set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(sk->sleep, &wait); + for (;;) { + set_bit(SOCK_NOSPACE, &sk->socket->flags); - if (signal_pending(current)) - break; - if (tcp_memory_free(sk)) - break; - if (sk->shutdown & SEND_SHUTDOWN) - break; - if (sk->err) - break; - release_sock(sk); - if (!tcp_memory_free(sk)) - timeo = schedule_timeout(timeo); - lock_sock(sk); + set_current_state(TASK_INTERRUPTIBLE); + + if (signal_pending(current)) + break; + if (tcp_memory_free(sk) && !vm_wait) + break; + if (sk->shutdown & SEND_SHUTDOWN) + break; + if (sk->err) + break; + release_sock(sk); + if (!tcp_memory_free(sk) || vm_wait) + current_timeo = schedule_timeout(current_timeo); + lock_sock(sk); + if (vm_wait) { + if (timeo != MAX_SCHEDULE_TIMEOUT && + (timeo -= vm_wait-current_timeo) < 0) + timeo = 0; + break; + } else { + timeo = current_timeo; } - current->state = TASK_RUNNING; - remove_wait_queue(sk->sleep, &wait); } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); return timeo; } @@ -925,43 +999,35 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size) from += copy; copied += copy; seglen -= copy; - if (PSH_NEEDED) + if (PSH_NEEDED || + after(tp->write_seq, tp->pushed_seq+(tp->max_window>>1))) { TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + tp->pushed_seq = tp->write_seq; + } continue; } } - /* A chunk was here doing something strange - * with psh etc. 
It is deleted, because it was - * evident non-sense. --ANK - */ - copy = min(seglen, mss_now); /* Determine how large of a buffer to allocate. */ - tmp = MAX_TCP_HEADER + 15; + tmp = MAX_TCP_HEADER + 15 + tp->mss_cache; if (copy < mss_now && !(flags & MSG_OOB)) { - tmp += mss_now; - /* What is happening here is that we want to * tack on later members of the users iovec * if possible into a single frame. When we - * leave this loop our caller checks to see if + * leave this loop our we check to see if * we can send queued frames onto the wire. - * See tcp_v[46]_sendmsg() for this. */ queue_it = 1; } else { - tmp += copy; queue_it = 0; } - if (tcp_memory_free(sk)) { - skb = alloc_skb(tmp, GFP_KERNEL); - if (skb == NULL) - goto do_oom; - skb_set_owner_w(skb, sk); - } else { + skb = NULL; + if (tcp_memory_free(sk)) + skb = tcp_alloc_skb(sk, tmp, GFP_KERNEL); + if (skb == NULL) { /* If we didn't get any memory, we need to sleep. */ set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); set_bit(SOCK_NOSPACE, &sk->socket->flags); @@ -987,11 +1053,18 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size) seglen -= copy; /* Prepare control bits for TCP header creation engine. */ - TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | - ((PSH_NEEDED) ? - TCPCB_FLAG_PSH : 0)); + TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; + if (PSH_NEEDED || + after(tp->write_seq+copy, tp->pushed_seq+(tp->max_window>>1))) { + TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK|TCPCB_FLAG_PSH; + tp->pushed_seq = tp->write_seq + copy; + } else { + TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; + } TCP_SKB_CB(skb)->sacked = 0; if (flags & MSG_OOB) { + /* Funny. 8) This makes URG fully meaningless. + * Well, OK. It does not contradict to anything yet. */ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_URG; TCP_SKB_CB(skb)->urg_ptr = copy; } else @@ -1041,15 +1114,12 @@ do_shutdown: err = -EPIPE; } goto out; -do_oom: - err = copied ? : -ENOBUFS; - goto out; do_interrupted: if(copied) err = copied; goto out; do_fault: - kfree_skb(skb); + __kfree_skb(skb); do_fault2: err = -EFAULT; goto out; @@ -1072,7 +1142,7 @@ static int tcp_recv_urg(struct sock * sk, long timeo, if (sk->urginline || !tp->urg_data || tp->urg_data == TCP_URG_READ) return -EINVAL; /* Yes this is right ! */ - if (sk->done) + if (sk->state==TCP_CLOSE && !sk->done) return -ENOTCONN; if (tp->urg_data & TCP_URG_VALID) { @@ -1095,7 +1165,6 @@ static int tcp_recv_urg(struct sock * sk, long timeo, return err ? -EFAULT : len; } - /* Do not set sk->done, it is set only by normal data receive */ if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN)) return 0; @@ -1117,8 +1186,6 @@ static int tcp_recv_urg(struct sock * sk, long timeo, static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb) { __skb_unlink(skb, &sk->receive_queue); - BUG_TRAP(atomic_read(&skb->users) == 1); - /* Well, if I missed something then punishment will be terrible oops. */ __kfree_skb(skb); } @@ -1143,34 +1210,19 @@ static void cleanup_rbuf(struct sock *sk, int copied) tcp_eat_skb(sk, skb); } - if (tp->ack.pending) { + if (tcp_ack_scheduled(tp)) { /* Delayed ACKs frequently hit locked sockets during bulk receive. */ if (tp->ack.blocked -#ifdef TCP_MORE_COARSE_ACKS /* Once-per-two-segments ACK was not sent by tcp_input.c */ || tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss -#endif /* - * If this read emptied read buffer, we send ACK when: - * - * -- ATO estimator diverged. In this case it is useless - * to delay ACK, it will miss in any case. 
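The tcp_sendmsg() hunks above add a second push condition: besides PSH_NEEDED, a segment is flagged PSH (and pushed_seq advanced) once write_seq runs more than half of the peer's max_window past the last pushed byte. A standalone sketch of that test (after() here is a simplified user-space stand-in for the kernel's wrap-safe sequence macro):

#include <stdio.h>

static int after(unsigned int seq1, unsigned int seq2)
{
    return (int)(seq1 - seq2) > 0;   /* wrap-safe "seq1 is later" */
}

/* Push once more than half the peer's maximum window has been queued
 * since the last segment carrying PSH. */
static int push_needed(unsigned int write_seq, unsigned int pushed_seq,
                       unsigned int max_window)
{
    return after(write_seq, pushed_seq + (max_window >> 1));
}

int main(void)
{
    /* 32 KB window: push once more than 16 KB is queued past pushed_seq. */
    printf("%d\n", push_needed(1000 + 8192,  1000, 32768));  /* 0 */
    printf("%d\n", push_needed(1000 + 16385, 1000, 32768));  /* 1 */
    return 0;
}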
- * - * -- The second condition is triggered when we did not - * ACK 8 segments not depending of their size. - * Linux senders allocate full-sized frame even for one byte - * packets, so that default queue for MTU=8K can hold - * only 8 packets. Note, that no other workarounds - * but counting packets are possible. If sender selected - * a small sndbuf or have larger mtu lockup will still - * occur. Well, not lockup, but 10-20msec gap. - * It is essentially dead lockup for 1Gib ethernet - * and loopback :-). The value 8 covers all reasonable - * cases and we may receive packet of any size - * with maximal possible rate now. + * If this read emptied read buffer, we send ACK, if + * connection is not bidirectional, user drained + * receive buffer and there was a small segment + * in queue. */ || (copied > 0 && - (tp->ack.ato >= TCP_DELACK_MAX || tp->ack.rcv_segs > 7) && + (tp->ack.pending&TCP_ACK_PUSHED) && !tp->ack.pingpong && atomic_read(&sk->rmem_alloc) == 0)) { time_to_ack = 1; @@ -1185,15 +1237,19 @@ static void cleanup_rbuf(struct sock *sk, int copied) */ if(copied > 0 && !time_to_ack && !(sk->shutdown&RCV_SHUTDOWN)) { __u32 rcv_window_now = tcp_receive_window(tp); - __u32 new_window = __tcp_select_window(sk); - /* Send ACK now, if this read freed lots of space - * in our buffer. Certainly, new_window is new window. - * We can advertise it now, if it is not less than current one. - * "Lots" means "at least twice" here. - */ - if(new_window && new_window >= 2*rcv_window_now) - time_to_ack = 1; + /* Optimize, __tcp_select_window() is not cheap. */ + if (2*rcv_window_now <= tp->window_clamp) { + __u32 new_window = __tcp_select_window(sk); + + /* Send ACK now, if this read freed lots of space + * in our buffer. Certainly, new_window is new window. + * We can advertise it now, if it is not less than current one. + * "Lots" means "at least twice" here. + */ + if(new_window && new_window >= 2*rcv_window_now) + time_to_ack = 1; + } } if (time_to_ack) tcp_send_ack(sk); @@ -1345,23 +1401,25 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, !timeo) break; } else { + if (sk->done) + break; + if (sk->err) { copied = sock_error(sk); break; } - if (sk->shutdown & RCV_SHUTDOWN) { - if (!(flags&MSG_PEEK)) - sk->done = 1; + if (sk->shutdown & RCV_SHUTDOWN) break; - } if (sk->state == TCP_CLOSE) { - if (sk->done) { + if (!sk->done) { + /* This occurs when user tries to read + * from never connected socket. + */ copied = -ENOTCONN; break; - } else if (!(flags&MSG_PEEK)) - sk->done = 1; + } break; } @@ -1629,14 +1687,20 @@ static inline int closing(struct sock * sk) static __inline__ void tcp_kill_sk_queues(struct sock *sk) { /* First the read buffer. */ - skb_queue_purge(&sk->receive_queue); + __skb_queue_purge(&sk->receive_queue); /* Next, the error queue. */ - skb_queue_purge(&sk->error_queue); + __skb_queue_purge(&sk->error_queue); /* Next, the write queue. */ BUG_TRAP(skb_queue_empty(&sk->write_queue)); + /* Account for returned memory. */ + tcp_mem_reclaim(sk); + + BUG_TRAP(sk->wmem_queued == 0); + BUG_TRAP(sk->forward_alloc == 0); + /* It is _impossible_ for the backlog to contain anything * when we get here. All user references to this socket * have gone away, only the net layer knows can touch it. 
@@ -1706,9 +1770,11 @@ void tcp_close(struct sock *sk, long timeout) while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) { u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin; data_was_unread += len; - kfree_skb(skb); + __kfree_skb(skb); } + tcp_mem_reclaim(sk); + /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section * 3.10, we send a RST here because data was lost. To * witness the awful effects of the old behavior of always @@ -1720,11 +1786,13 @@ void tcp_close(struct sock *sk, long timeout) */ if(data_was_unread != 0) { /* Unread data was tossed, zap the connection. */ + NET_INC_STATS_USER(TCPAbortOnClose); tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_KERNEL); } else if (sk->linger && sk->lingertime==0) { /* Check zero linger _after_ checking for unread data. */ sk->prot->disconnect(sk, 0); + NET_INC_STATS_USER(TCPAbortOnData); } else if (tcp_close_state(sk)) { /* We FIN if the application ate all the data before * zapping the connection. @@ -1807,6 +1875,7 @@ adjudge_to_death: if (tp->linger2 < 0) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); + NET_INC_STATS_BH(TCPAbortOnLinger); } else { int tmo = tcp_fin_time(tp); @@ -1819,12 +1888,17 @@ adjudge_to_death: } } } - if (sk->state != TCP_CLOSE && - atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans) { - if (net_ratelimit()) - printk(KERN_INFO "TCP: too many of orphaned sockets\n"); - tcp_set_state(sk, TCP_CLOSE); - tcp_send_active_reset(sk, GFP_ATOMIC); + if (sk->state != TCP_CLOSE) { + tcp_mem_reclaim(sk); + if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans || + (sk->wmem_queued > SOCK_MIN_SNDBUF && + atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { + if (net_ratelimit()) + printk(KERN_INFO "TCP: too many of orphaned sockets\n"); + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk, GFP_ATOMIC); + NET_INC_STATS_BH(TCPAbortOnMemory); + } } atomic_inc(&tcp_orphan_count); @@ -1873,7 +1947,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->receive_queue); - __skb_queue_purge(&sk->write_queue); + tcp_writequeue_purge(sk); __skb_queue_purge(&tp->out_of_order_queue); sk->dport = 0; @@ -1887,25 +1961,21 @@ int tcp_disconnect(struct sock *sk, int flags) sk->shutdown = 0; sk->done = 0; - sk->write_space = tcp_write_space; tp->srtt = 0; - if (sysctl_tcp_tw_recycle) { - if ((tp->write_seq += 2) == 0) - tp->write_seq = 1; - } else { - tp->write_seq = 0; - } + if ((tp->write_seq += tp->max_window+2) == 0) + tp->write_seq = 1; tp->backoff = 0; tp->snd_cwnd = 2; tp->probes_out = 0; tp->packets_out = 0; - tp->high_seq = 0; tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_cnt = 0; - tp->dup_acks = 0; + tp->ca_state = TCP_CA_Open; + tcp_clear_retrans(tp); tcp_delack_init(tp); - tp->send_head = tp->retrans_head = NULL; + tp->send_head = NULL; tp->saw_tstamp = 0; + tcp_sack_reset(tp); __sk_dst_reset(sk); BUG_TRAP(!sk->num || sk->prev); @@ -1916,8 +1986,7 @@ int tcp_disconnect(struct sock *sk, int flags) /* * Wait for an incoming connection, avoid race - * conditions. This must be called with the socket locked, - * and without the kernel lock held. + * conditions. This must be called with the socket locked. */ static int wait_for_connect(struct sock * sk, long timeo) { @@ -1965,8 +2034,6 @@ static int wait_for_connect(struct sock * sk, long timeo) /* * This will accept the next outstanding connection. - * - * Be careful about race conditions here - this is subtle. 
*/ struct sock *tcp_accept(struct sock *sk, int flags, int *err) @@ -2095,7 +2162,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, err = -EINVAL; else { tp->keepalive_time = val * HZ; - if (sk->keepopen) { + if (sk->keepopen && !((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))) { __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp; if (tp->keepalive_time > elapsed) elapsed = tp->keepalive_time - elapsed; @@ -2152,7 +2219,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, tp->window_clamp = 0; } else { tp->window_clamp = val<SOCK_MIN_RCVBUF/2 ? - SOCK_MIN_SNDBUF : val; + SOCK_MIN_RCVBUF/2 : val; } break; @@ -2318,6 +2385,21 @@ void __init tcp_init(void) } tcp_port_rover = sysctl_local_port_range[0] - 1; + sysctl_tcp_mem[0] = 64<<order; + sysctl_tcp_mem[1] = 200<<order; + sysctl_tcp_mem[2] = 256<<order; + if (sysctl_tcp_mem[2] - sysctl_tcp_mem[1] > 512) + sysctl_tcp_mem[1] = sysctl_tcp_mem[2] - 512; + if (sysctl_tcp_mem[1] - sysctl_tcp_mem[0] > 512) + sysctl_tcp_mem[0] = sysctl_tcp_mem[1] - 512; + + if (order < 3) { + sysctl_tcp_wmem[2] = 64*1024; + sysctl_tcp_rmem[0] = PAGE_SIZE; + sysctl_tcp_rmem[1] = 43689; + sysctl_tcp_rmem[2] = 2*43689; + } + printk("TCP: Hash tables configured (established %d bind %d)\n", tcp_ehash_size<<1, tcp_bhash_size); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f062cb2fb..76791d724 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.193 2000/04/20 14:41:16 davem Exp $ + * Version: $Id: tcp_input.c,v 1.198 2000/08/15 20:15:23 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -55,20 +55,15 @@ * work without delayed acks. * Andi Kleen: Process packets with PSH set in the * fast path. + * J Hadi Salim: ECN support */ -#include <linux/config.h> #include <linux/mm.h> #include <linux/sysctl.h> #include <net/tcp.h> #include <net/inet_common.h> #include <linux/ipsec.h> -#ifdef CONFIG_SYSCTL -#define SYNC_INIT 0 /* let the user enable it */ -#else -#define SYNC_INIT 1 -#endif /* These are on by default so the code paths get tested. * For the final 2.2 this may be undone at our discretion. -DaveM @@ -76,33 +71,39 @@ int sysctl_tcp_timestamps = 1; int sysctl_tcp_window_scaling = 1; int sysctl_tcp_sack = 1; - -int sysctl_tcp_syncookies = SYNC_INIT; -int sysctl_tcp_stdurg; -int sysctl_tcp_rfc1337; -int sysctl_tcp_tw_recycle = 1; -int sysctl_tcp_abort_on_overflow = 0; +int sysctl_tcp_fack = 1; +int sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; +int sysctl_tcp_ecn = 1; +int sysctl_tcp_dsack = 1; +int sysctl_tcp_app_win = 31; +int sysctl_tcp_adv_win_scale = 2; + +int sysctl_tcp_stdurg = 0; +int sysctl_tcp_rfc1337 = 0; int sysctl_tcp_max_orphans = NR_FILE; -int sysctl_tcp_max_tw_buckets = NR_FILE*2; -static int prune_queue(struct sock *sk); +#define FLAG_DATA 0x01 /* Incoming frame contained data. */ +#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ +#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ +#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ +#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ +#define FLAG_DATA_SACKED 0x20 /* New SACK. */ +#define FLAG_ECE 0x40 /* ECE in this ACK */ +#define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. 
*/ +#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ + +#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) +#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) +#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) +#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) + +#define IsReno(tp) ((tp)->sack_ok == 0) +#define IsFack(tp) ((tp)->sack_ok & 2) -/* - * Adapt the MSS value used to make delayed ack decision to the +#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) + +/* Adapt the MSS value used to make delayed ack decision to the * real world. - * - * The constant 536 hasn't any good meaning. In IPv4 world - * MTU may be smaller, though it contradicts to RFC1122, which - * states that MSS must be at least 536. - * We use the constant to do not ACK each second - * packet in a stream of tiny size packets. - * It means that super-low mtu links will be aggressively delacked. - * Seems, it is even good. If they have so low mtu, they are weirdly - * slow. - * - * AK: BTW it may be useful to add an option to lock the rcv_mss. - * this way the beowulf people wouldn't need ugly patches to get the - * ack frequencies they want and it would be an elegant way to tune delack. */ static __inline__ void tcp_measure_rcv_mss(struct tcp_opt *tp, struct sk_buff *skb) { @@ -117,6 +118,9 @@ static __inline__ void tcp_measure_rcv_mss(struct tcp_opt *tp, struct sk_buff *s len = skb->len; if (len >= tp->ack.rcv_mss) { tp->ack.rcv_mss = len; + /* Dubious? Rather, it is final cut. 8) */ + if (tcp_flag_word(skb->h.th)&TCP_REMNANT) + tp->ack.pending |= TCP_ACK_PUSHED; } else { /* Otherwise, we make more careful check taking into account, * that SACKs block is variable. @@ -124,37 +128,47 @@ static __inline__ void tcp_measure_rcv_mss(struct tcp_opt *tp, struct sk_buff *s * "len" is invariant segment length, including TCP header. */ len = skb->tail - skb->h.raw; - if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr)) { + if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) || + /* If PSH is not set, packet should be + * full sized, provided peer TCP is not badly broken. + * This observation (if it is correct 8)) allows + * to handle super-low mtu links fairly. + */ + (len >= TCP_MIN_MSS + sizeof(struct tcphdr) && + !(tcp_flag_word(skb->h.th)&TCP_REMNANT))) { /* Subtract also invariant (if peer is RFC compliant), * tcp header plus fixed timestamp option length. * Resulting "len" is MSS free of SACK jitter. */ len -= tp->tcp_header_len; - if (len == lss) - tp->ack.rcv_mss = len; tp->ack.last_seg_size = len; + if (len == lss) { + tp->ack.rcv_mss = len; + return; + } } + tp->ack.pending |= TCP_ACK_PUSHED; } } - -static __inline__ void tcp_enter_quickack_mode(struct tcp_opt *tp) +static void tcp_incr_quickack(struct tcp_opt *tp) { - unsigned quickacks = tcp_receive_window(tp)/(2*tp->ack.rcv_mss); + unsigned quickacks = tp->rcv_wnd/(2*tp->ack.rcv_mss); - tp->ack.quick = max(min(quickacks, 127), 1); + if (quickacks==0) + quickacks=2; + if (quickacks > tp->ack.quick) + tp->ack.quick = min(quickacks, TCP_MAX_QUICKACKS); +} - if (!tp->tstamp_ok && tp->ack.quick>2) { - /* Quick ACKs are _dangerous_, if RTTM is not used. - * See comment in tcp_init_metrics(). We still help - * them to overcome the most difficult, initial - * phase of slow start. 
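tcp_incr_quickack() above sizes the quick-ACK budget from the advertised receive window: roughly one forced ACK per two receive-MSS-sized segments that fit into rcv_wnd, with a floor of 2, and tp->ack.quick is only ever raised towards that value, capped at TCP_MAX_QUICKACKS. A worked sketch of the sizing (the cap of 16 is an assumed value for illustration; the real constant is defined in the TCP headers, not in this patch):

#include <stdio.h>

#define TCP_MAX_QUICKACKS 16   /* assumed cap for illustration */

/* Candidate quick-ACK budget, mirroring tcp_incr_quickack(); the kernel
 * only raises tp->ack.quick up to this value, never lowers it here. */
static unsigned int quickacks(unsigned int rcv_wnd, unsigned int rcv_mss)
{
    unsigned int q = rcv_wnd / (2 * rcv_mss);

    if (q == 0)
        q = 2;
    return q < TCP_MAX_QUICKACKS ? q : TCP_MAX_QUICKACKS;
}

int main(void)
{
    printf("%u\n", quickacks(65535, 1460)); /* 22, capped to 16 */
    printf("%u\n", quickacks(8760, 1460));  /* 3 */
    printf("%u\n", quickacks(1460, 1460));  /* 0, raised to the floor of 2 */
    return 0;
}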
- */ - tp->ack.quick = 2; - } +void tcp_enter_quickack_mode(struct tcp_opt *tp) +{ + tcp_incr_quickack(tp); + tp->ack.pingpong = 0; + tp->ack.ato = TCP_ATO_MIN; } -/* Send ACKs quickly, if "quick" count is not ehausted +/* Send ACKs quickly, if "quick" count is not exhausted * and the session is not interactive. */ @@ -163,6 +177,173 @@ static __inline__ int tcp_in_quickack_mode(struct tcp_opt *tp) return (tp->ack.quick && !tp->ack.pingpong); } +/* Buffer size and advertised window tuning. + * + * 1. Tuning sk->sndbuf, when connection enters established state. + */ + +static void tcp_fixup_sndbuf(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int sndmem = tp->mss_clamp+MAX_TCP_HEADER+16+sizeof(struct sk_buff); + + if (sk->sndbuf < 3*sndmem) + sk->sndbuf = min(3*sndmem, sysctl_tcp_wmem[2]); +} + +/* 2. Tuning advertised window (window_clamp, rcv_ssthresh) + * + * All tcp_full_space() is split to two parts: "network" buffer, allocated + * forward and advertised in receiver window (tp->rcv_wnd) and + * "application buffer", required to isolate scheduling/application + * latencies from network. + * window_clamp is maximal advertised window. It can be less than + * tcp_full_space(), in this case tcp_full_space() - window_clamp + * is reserved for "application" buffer. The less window_clamp is + * the smoother our behaviour from viewpoint of network, but the lower + * throughput and the higher sensitivity of the connection to losses. 8) + * + * rcv_ssthresh is more strict window_clamp used at "slow start" + * phase to predict further behaviour of this connection. + * It is used for two goals: + * - to enforce header prediction at sender, even when application + * requires some significant "application buffer". It is check #1. + * - to prevent pruning of receive queue because of misprediction + * of receiver window. Check #2. + * + * The scheme does not work when sender sends good segments opening + * window and then starts to feed us spagetti. But it should work + * in common situations. Otherwise, we have to rely on queue collapsing. + */ + +/* Slow part of check#2. */ +static int +__tcp_grow_window(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb) +{ + /* Optimize this! */ + int truesize = tcp_win_from_space(skb->truesize)/2; + int window = tcp_full_space(sk)/2; + + while (tp->rcv_ssthresh <= window) { + if (truesize <= skb->len) + return 2*tp->ack.rcv_mss; + + truesize >>= 1; + window >>= 1; + } + return 0; +} + +static __inline__ void +tcp_grow_window(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb) +{ + /* Check #1 */ + if (tp->rcv_ssthresh < tp->window_clamp && + (int)tp->rcv_ssthresh < tcp_space(sk) && + !tcp_memory_pressure) { + int incr; + + /* Check #2. Increase window, if skb with such overhead + * will fit to rcvbuf in future. + */ + if (tcp_win_from_space(skb->truesize) <= skb->len) + incr = 2*tp->advmss; + else + incr = __tcp_grow_window(sk, tp, skb); + + if (incr) { + tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); + tp->ack.quick |= 1; + } + } +} + +/* 3. Tuning rcvbuf, when connection enters established state. */ + +static void tcp_fixup_rcvbuf(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int rcvmem = tp->advmss+MAX_TCP_HEADER+16+sizeof(struct sk_buff); + + /* Try to select rcvbuf so that 4 mss-sized segments + * will fit to window and correspoding skbs will fit to our rcvbuf. + * (was 3; 4 is minimum to allow fast retransmit to work.) 
+ */ + while (tcp_win_from_space(rcvmem) < tp->advmss) + rcvmem += 128; + if (sk->rcvbuf < 4*rcvmem) + sk->rcvbuf = min(4*rcvmem, sysctl_tcp_rmem[2]); +} + +/* 4. Try to fixup all. It is made iimediately after connection enters + * established state. + */ +static void tcp_init_buffer_space(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int maxwin; + + if (!(sk->userlocks&SOCK_RCVBUF_LOCK)) + tcp_fixup_rcvbuf(sk); + if (!(sk->userlocks&SOCK_SNDBUF_LOCK)) + tcp_fixup_sndbuf(sk); + + maxwin = tcp_full_space(sk); + + if (tp->window_clamp >= maxwin) { + tp->window_clamp = maxwin; + + if (sysctl_tcp_app_win && maxwin>4*tp->advmss) + tp->window_clamp = max(maxwin-(maxwin>>sysctl_tcp_app_win), 4*tp->advmss); + } + + /* Force reservation of one segment. */ + if (sysctl_tcp_app_win && + tp->window_clamp > 2*tp->advmss && + tp->window_clamp + tp->advmss > maxwin) + tp->window_clamp = max(2*tp->advmss, maxwin-tp->advmss); + + tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +/* 5. Recalculate window clamp after socket hit its memory bounds. */ +static void tcp_clamp_window(struct sock *sk, struct tcp_opt *tp) +{ + struct sk_buff *skb; + int app_win = tp->rcv_nxt - tp->copied_seq; + int ofo_win = 0; + + tp->ack.quick = 0; + + skb_queue_walk(&tp->out_of_order_queue, skb) { + ofo_win += skb->len; + } + + /* If overcommit is due to out of order segments, + * do not clamp window. Try to expand rcvbuf instead. + */ + if (ofo_win) { + if (sk->rcvbuf < sysctl_tcp_rmem[2] && + !(sk->userlocks&SOCK_RCVBUF_LOCK) && + !tcp_memory_pressure && + atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) + sk->rcvbuf = min(atomic_read(&sk->rmem_alloc), sysctl_tcp_rmem[2]); + } + if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) { + app_win += ofo_win; + if (atomic_read(&sk->rmem_alloc) >= 2*sk->rcvbuf) + app_win >>= 1; + if (app_win > tp->ack.rcv_mss) + app_win -= tp->ack.rcv_mss; + app_win = max(app_win, 2*tp->advmss); + + if (!ofo_win) + tp->window_clamp = min(tp->window_clamp, app_win); + tp->rcv_ssthresh = min(tp->window_clamp, 2*tp->advmss); + } +} + /* There is something which you must keep in mind when you analyze the * behavior of the tp->ato delayed ack timeout interval. When a * connection starts up, we want to ack as quickly as possible. The @@ -173,14 +354,13 @@ static __inline__ int tcp_in_quickack_mode(struct tcp_opt *tp) * each ACK we send, he increments snd_cwnd and transmits more of his * queue. -DaveM */ -static void tcp_event_data_recv(struct tcp_opt *tp, struct sk_buff *skb) +static void tcp_event_data_recv(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb) { u32 now; - tcp_measure_rcv_mss(tp, skb); + tcp_schedule_ack(tp); - tp->ack.pending = 1; - tp->ack.rcv_segs++; + tcp_measure_rcv_mss(tp, skb); now = tcp_time_stamp; @@ -188,37 +368,31 @@ static void tcp_event_data_recv(struct tcp_opt *tp, struct sk_buff *skb) /* The _first_ data packet received, initialize * delayed ACK engine. */ - - /* Help sender leave slow start quickly. */ tcp_enter_quickack_mode(tp); - - /* Pingpong is off, session is not interactive by default */ - tp->ack.pingpong = 0; - - /* ATO is minimal */ - tp->ack.ato = TCP_ATO_MIN; } else { int m = now - tp->ack.lrcvtime; - if (m > TCP_ATO_MAX/2) { - /* Do not touch ATO, if interval is out of bounds. - * It will be deflated by delack timer, if our peer - * really sends too rarely. + if (m <= TCP_ATO_MIN/2) { + /* The fastest case is the first. 
*/ + tp->ack.ato = (tp->ack.ato>>1) + TCP_ATO_MIN/2; + } else if (m < tp->ack.ato) { + tp->ack.ato = (tp->ack.ato>>1) + m; + if (tp->ack.ato > tp->rto) + tp->ack.ato = tp->rto; + } else if (m > tp->rto) { + /* Too long gap. Apparently sender falled to + * restart window, so that we send ACKs quickly. */ - if (m > tp->rto) { - /* Too long gap. Apparently sender falled to - * restart window, so that we send ACKs quickly. - */ - tcp_enter_quickack_mode(tp); - } - } else { - if (m <= 0) - m = TCP_ATO_MIN/2; - if (m <= tp->ack.ato) - tp->ack.ato = (tp->ack.ato >> 1) + m; + tcp_incr_quickack(tp); + tcp_mem_reclaim(sk); } } tp->ack.lrcvtime = now; + + TCP_ECN_check_ce(tp, skb); + + if (skb->len >= 128) + tcp_grow_window(sk, tp, skb); } /* Called to compute a smoothed rtt estimate. The data fed to this @@ -230,7 +404,6 @@ static void tcp_event_data_recv(struct tcp_opt *tp, struct sk_buff *skb) * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ - static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) { long m = mrtt; /* RTT */ @@ -243,6 +416,13 @@ static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) * * On a 1990 paper the rto value is changed to: * RTO = rtt + 4 * mdev + * + * Funny. This algorithm seems to be very broken. + * These formulae increase RTO, when it should be decreased, increase + * too slowly, when it should be incresed fastly, decrease too fastly + * etc. I guess in BSD RTO takes ONE value, so that it is absolutely + * does not matter how to _calculate_ it. Seems, it was trap + * that VJ failed to avoid. 8) */ if(m == 0) m = 1; @@ -263,16 +443,27 @@ static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ - static __inline__ void tcp_set_rto(struct tcp_opt *tp) { tp->rto = (tp->srtt >> 3) + tp->mdev; /* I am not enough educated to understand this magic. * However, it smells bad. snd_cwnd>31 is common case. */ + /* OK, I found comment in 2.0 source tree, it deserves + * to be reproduced: + * ==== + * Note: Jacobson's algorithm is fine on BSD which has a 1/2 second + * granularity clock, but with our 1/100 second granularity clock we + * become too sensitive to minor changes in the round trip time. + * We add in two compensating factors. First we multiply by 5/4. + * For large congestion windows this allows us to tolerate burst + * traffic delaying up to 1/4 of our packets. We also add in + * a rtt / cong_window term. For small congestion windows this allows + * a single packet delay, but has negligible effect + * on the compensation for large windows. + */ tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1)); } - /* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound * on packet lifetime in the internet. We need the HZ/5 lower @@ -292,11 +483,12 @@ static __inline__ void tcp_bound_rto(struct tcp_opt *tp) tp->rto = TCP_RTO_MAX; } + /* Save metrics learned by this TCP session. This function is called only, when TCP finishes sucessfully i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE. 
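The RTT estimator referred to above keeps srtt scaled by 8 and mdev scaled by 4, so Jacobson's 1/8 and 1/4 gains reduce to shifts and the textbook "RTO = rtt + 4*mdev" falls out of (srtt >> 3) + mdev. A self-contained sketch over plain variables; tick units, the bounds and the first-sample initialization are illustrative rather than the kernel's exact values.

#include <stdio.h>

#define RTO_MIN   20          /* assumed HZ/5-style floor */
#define RTO_MAX   12000       /* assumed 120*HZ-style cap */

struct rtt_est {
        long srtt;             /* smoothed RTT, scaled <<3   */
        long mdev;             /* mean deviation, scaled <<2 */
};

static void rtt_sample(struct rtt_est *e, long m)
{
        if (m == 0)
                m = 1;
        if (e->srtt) {
                m -= (e->srtt >> 3);    /* m becomes the error term        */
                e->srtt += m;           /* srtt += 1/8 * error             */
                if (m < 0)
                        m = -m;
                m -= (e->mdev >> 2);
                e->mdev += m;           /* mdev += 1/4 * deviation of |err| */
        } else {
                e->srtt = m << 3;       /* first measurement */
                e->mdev = m << 2;
        }
}

static long rto(const struct rtt_est *e)
{
        long r = (e->srtt >> 3) + e->mdev;      /* rtt + 4*mdev */

        if (r < RTO_MIN)
                r = RTO_MIN;
        if (r > RTO_MAX)
                r = RTO_MAX;
        return r;
}

int main(void)
{
        struct rtt_est e = { 0, 0 };
        long samples[] = { 100, 110, 90, 300, 100 };

        for (int i = 0; i < 5; i++)
                rtt_sample(&e, samples[i]);
        printf("srtt/8 = %ld, rto = %ld ticks\n", e.srtt >> 3, rto(&e));
        return 0;
}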
*/ -static void tcp_update_metrics(struct sock *sk) +void tcp_update_metrics(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct dst_entry *dst = __sk_dst_get(sk); @@ -344,19 +536,20 @@ static void tcp_update_metrics(struct sock *sk) dst->rttvar -= (dst->rttvar - m)>>2; } - if (tp->snd_ssthresh == 0x7FFFFFFF) { + if (tp->snd_ssthresh >= 0xFFFF) { /* Slow start still did not finish. */ if (dst->ssthresh && !(dst->mxlock&(1<<RTAX_SSTHRESH)) && - tp->snd_cwnd > dst->ssthresh) - dst->ssthresh = tp->snd_cwnd; + (tp->snd_cwnd>>1) > dst->ssthresh) + dst->ssthresh = (tp->snd_cwnd>>1); if (!(dst->mxlock&(1<<RTAX_CWND)) && tp->snd_cwnd > dst->cwnd) dst->cwnd = tp->snd_cwnd; - } else if (tp->snd_cwnd >= tp->snd_ssthresh && !tp->high_seq) { + } else if (tp->snd_cwnd > tp->snd_ssthresh && + tp->ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. */ if (!(dst->mxlock&(1<<RTAX_SSTHRESH))) - dst->ssthresh = tp->snd_cwnd; + dst->ssthresh = max(tp->snd_cwnd>>1, tp->snd_ssthresh); if (!(dst->mxlock&(1<<RTAX_CWND))) dst->cwnd = (dst->cwnd + tp->snd_cwnd)>>1; } else { @@ -370,9 +563,37 @@ static void tcp_update_metrics(struct sock *sk) tp->snd_ssthresh > dst->ssthresh) dst->ssthresh = tp->snd_ssthresh; } + + if (!(dst->mxlock&(1<<RTAX_REORDERING))) { + if (dst->reordering < tp->reordering && + tp->reordering != sysctl_tcp_reordering) + dst->reordering = tp->reordering; + } } } +/* Increase initial CWND conservatively: if estimated + * RTT is low enough (<20msec) or if we have some preset ssthresh. + * + * Numbers are taken from RFC1414. + */ +__u32 tcp_init_cwnd(struct tcp_opt *tp) +{ + __u32 cwnd; + + if (tp->mss_cache > 1460) + return 2; + + cwnd = (tp->mss_cache > 1095) ? 3 : 4; + + if (!tp->srtt || (tp->snd_ssthresh >= 0xFFFF && tp->srtt > ((HZ/50)<<3))) + cwnd = 2; + else if (cwnd > tp->snd_ssthresh) + cwnd = tp->snd_ssthresh; + + return min(cwnd, tp->snd_cwnd_clamp); +} + /* Initialize metrics on socket. */ static void tcp_init_metrics(struct sock *sk) @@ -392,6 +613,10 @@ static void tcp_init_metrics(struct sock *sk) if (tp->snd_ssthresh > tp->snd_cwnd_clamp) tp->snd_ssthresh = tp->snd_cwnd_clamp; } + if (dst->reordering && tp->reordering != dst->reordering) { + tp->sack_ok &= ~2; + tp->reordering = dst->reordering; + } if (dst->rtt == 0) goto reset; @@ -422,9 +647,9 @@ static void tcp_init_metrics(struct sock *sk) if (tp->rto < TCP_TIMEOUT_INIT && !tp->saw_tstamp) goto reset; tp->snd_cwnd = tcp_init_cwnd(tp); + tp->snd_cwnd_stamp = tcp_time_stamp; return; - reset: /* Play conservative. If timestamps are not * supported, TCP will fail to recalculate correct @@ -437,402 +662,967 @@ reset: } } -/* WARNING: this must not be called if tp->saw_tstamp was false. */ -extern __inline__ void -tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq) +static void tcp_update_reordering(struct tcp_opt *tp, int metric, int ts) { - if (!after(seq, tp->rcv_wup)) { - /* PAWS bug workaround wrt. ACK frames, the PAWS discard - * extra check below makes sure this can only happen - * for pure ACK frames. -DaveM - * - * Not only, also it occurs for expired timestamps - * and RSTs with bad timestamp option. --ANK - */ + if (metric > tp->reordering) { + tp->reordering = min(TCP_MAX_REORDERING, metric); + + /* This exciting event is worth to be remembered. 
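The MSS thresholds in tcp_init_cwnd follow the initial-window table of what is presumably RFC 2414 (the comment's "RFC1414" looks like a typo): the larger the segment, the smaller the initial window, so the byte budget stays roughly constant. A standalone sketch of that selection, leaving out the RTT and ssthresh refinements of the kernel version:

#include <stdio.h>

/* Pick an initial congestion window, in segments, from the MSS. */
static unsigned int init_cwnd(unsigned int mss, unsigned int cwnd_clamp)
{
        unsigned int cwnd;

        if (mss > 1460)
                cwnd = 2;
        else if (mss > 1095)
                cwnd = 3;
        else
                cwnd = 4;

        return cwnd < cwnd_clamp ? cwnd : cwnd_clamp;
}

int main(void)
{
        unsigned int mss[] = { 536, 1095, 1460, 4312 };

        for (int i = 0; i < 4; i++)
                printf("mss %4u -> initial cwnd %u\n",
                       mss[i], init_cwnd(mss[i], 65535));
        return 0;
}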
8) */ + if (ts) + NET_INC_STATS_BH(TCPTSReorder); + else if (IsReno(tp)) + NET_INC_STATS_BH(TCPRenoReorder); + else if (IsFack(tp)) + NET_INC_STATS_BH(TCPFACKReorder); + else + NET_INC_STATS_BH(TCPSACKReorder); +#if FASTRETRANS_DEBUG > 1 + printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", + tp->sack_ok, tp->ca_state, + tp->reordering, tp->fackets_out, tp->sacked_out, + tp->undo_marker ? tp->undo_retrans : 0); +#endif + /* Disable FACK yet. */ + tp->sack_ok &= ~2; + } +} - if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0 || - xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS) { - tp->ts_recent = tp->rcv_tsval; - tp->ts_recent_stamp = xtime.tv_sec; +/* This procedure tags the retransmission queue when SACKs arrive. + * + * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). + * Packets in queue with these bits set are counted in variables + * sacked_out, retrans_out and lost_out, correspondingly. + * + * Valid combinations are: + * Tag InFlight Description + * 0 1 - orig segment is in flight. + * S 0 - nothing flies, orig reached receiver. + * L 0 - nothing flies, orig lost by net. + * R 2 - both orig and retransmit are in flight. + * L|R 1 - orig is lost, retransmit is in flight. + * S|R 1 - orig reached receiver, retrans is still in flight. + * (L|S|R is logically valid, it could occur when L|R is sacked, + * but it is equivalent to plain S and code short-curcuits it to S. + * L|S is logically invalid, it would mean -1 packet in flight 8)) + * + * These 6 states form finite state machine, controlled by the following events: + * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue()) + * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue()) + * 3. Loss detection event of one of three flavors: + * A. Scoreboard estimator decided the packet is lost. + * A'. Reno "three dupacks" marks head of queue lost. + * A''. Its FACK modfication, head until snd.fack is lost. + * B. SACK arrives sacking data transmitted after never retransmitted + * hole was sent out. + * C. SACK arrives sacking SND.NXT at the moment, when the + * segment was retransmitted. + * 4. D-SACK added new rule: D-SACK changes any tag to S. + * + * It is pleasant to note, that state diagram turns out to be commutative, + * so that we are allowed not to be bothered by order of our actions, + * when multiple events arrive simultaneously. (see the function below). + * + * Reordering detection. + * -------------------- + * Reordering metric is maximal distance, which a packet can be displaced + * in packet stream. With SACKs we can estimate it: + * + * 1. SACK fills old hole and the corresponding segment was not + * ever retransmitted -> reordering. Alas, we cannot use it + * when segment was retransmitted. + * 2. The last flaw is solved with D-SACK. D-SACK arrives + * for retransmitted and already SACKed segment -> reordering.. + * Both of these heuristics are not used in Loss state, when we cannot + * account for retransmits accurately. 
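Because each outstanding segment carries at most the three tag bits tabulated above, the flight-size estimate can be recovered from the tags alone. A toy, self-contained version of that bookkeeping (the kernel keeps sacked_out, lost_out and retrans_out incrementally instead of rescanning the queue):

#include <stdio.h>

#define TAG_SACKED  0x01
#define TAG_RETRANS 0x02
#define TAG_LOST    0x04

static int packets_in_flight(const unsigned char *tag, int packets_out)
{
        int sacked = 0, lost = 0, retrans = 0;

        for (int i = 0; i < packets_out; i++) {
                if (tag[i] & TAG_SACKED)
                        sacked++;
                if (tag[i] & TAG_LOST)
                        lost++;
                if (tag[i] & TAG_RETRANS)
                        retrans++;
        }
        /* left_out  = sacked_out + lost_out
         * in_flight = packets_out - left_out + retrans_out
         */
        return packets_out - (sacked + lost) + retrans;
}

int main(void)
{
        /* plain segment, a SACKed one, and a lost original whose
         * retransmission is still in flight (L|R in the table above)
         */
        unsigned char tags[] = { 0, TAG_SACKED, TAG_LOST | TAG_RETRANS };

        printf("in flight: %d\n", packets_in_flight(tags, 3));
        return 0;
}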
+ */ +static int +tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked; + struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2); + int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3; + int reord = tp->packets_out; + int prior_fackets; + u32 lost_retrans = 0; + int flag = 0; + int i; + + if (!tp->sacked_out) + tp->fackets_out = 0; + prior_fackets = tp->fackets_out; + + for (i=0; i<num_sacks; i++, sp++) { + struct sk_buff *skb; + __u32 start_seq = ntohl(sp->start_seq); + __u32 end_seq = ntohl(sp->end_seq); + int fack_count = 0; + int dup_sack = 0; + + /* Check for D-SACK. */ + if (i == 0) { + u32 ack = TCP_SKB_CB(ack_skb)->ack_seq; + + if (before(start_seq, ack)) { + dup_sack = 1; + NET_INC_STATS_BH(TCPDSACKRecv); + } else if (num_sacks > 1 && + !after(end_seq, ntohl(sp[1].end_seq)) && + !before(start_seq, ntohl(sp[1].start_seq))) { + dup_sack = 1; + NET_INC_STATS_BH(TCPDSACKOfoRecv); + } + + /* D-SACK for already forgotten data... + * Do dumb counting. */ + if (dup_sack && + !after(end_seq, prior_snd_una) && + after(end_seq, tp->undo_marker)) + tp->undo_retrans--; + + /* Eliminate too old ACKs, but take into + * account more or less fresh ones, they can + * contain valid SACK info. + */ + if (before(ack, prior_snd_una-tp->max_window)) + return 0; + } + + /* Event "B" in the comment above. */ + if (after(end_seq, tp->high_seq)) + flag |= FLAG_DATA_LOST; + + for_retrans_queue(skb, sk, tp) { + u8 sacked = TCP_SKB_CB(skb)->sacked; + int in_sack; + + /* The retransmission queue is always in order, so + * we can short-circuit the walk early. + */ + if(!before(TCP_SKB_CB(skb)->seq, end_seq)) + break; + + fack_count++; + + in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && + !before(end_seq, TCP_SKB_CB(skb)->end_seq); + + /* Account D-SACK for retransmitted packet. */ + if ((dup_sack && in_sack) && + (sacked & TCPCB_RETRANS) && + after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) + tp->undo_retrans--; + + /* The frame is ACKed. */ + if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) { + if (sacked&TCPCB_RETRANS) { + if ((dup_sack && in_sack) && + (sacked&TCPCB_SACKED_ACKED)) + reord = min(fack_count, reord); + } else { + /* If it was in a hole, we detected reordering. */ + if (fack_count < prior_fackets && + !(sacked&TCPCB_SACKED_ACKED)) + reord = min(fack_count, reord); + } + + /* Nothing to do; acked frame is about to be dropped. */ + continue; + } + + if ((sacked&TCPCB_SACKED_RETRANS) && + after(end_seq, TCP_SKB_CB(skb)->ack_seq) && + (!lost_retrans || after(end_seq, lost_retrans))) + lost_retrans = end_seq; + + if (!in_sack) + continue; + + if (!(sacked&TCPCB_SACKED_ACKED)) { + if (sacked & TCPCB_SACKED_RETRANS) { + /* If the segment is not tagged as lost, + * we do not clear RETRANS, believing + * that retransmission is still in flight. + */ + if (sacked & TCPCB_LOST) { + TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); + tp->lost_out--; + tp->retrans_out--; + } + } else { + /* New sack for not retransmitted frame, + * which was in hole. It is reordering. 
+ */ + if (!(sacked & TCPCB_RETRANS) && + fack_count < prior_fackets) + reord = min(fack_count, reord); + + if (sacked & TCPCB_LOST) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + tp->lost_out--; + } + } + + TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; + flag |= FLAG_DATA_SACKED; + tp->sacked_out++; + + if (fack_count > tp->fackets_out) + tp->fackets_out = fack_count; + } else { + if (dup_sack && (sacked&TCPCB_RETRANS)) + reord = min(fack_count, reord); + } + + /* D-SACK. We can detect redundant retransmission + * in S|R and plain R frames and clear it. + * undo_retrans is decreased above, L|R frames + * are accounted above as well. + */ + if (dup_sack && + (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out--; + } } } + + /* Check for lost retransmit. This superb idea is + * borrowed from "ratehalving". Event "C". + * Later note: FACK people cheated me again 8), + * we have to account for reordering! Ugly, + * but should help. + */ + if (lost_retrans && tp->ca_state == TCP_CA_Recovery) { + struct sk_buff *skb; + + for_retrans_queue(skb, sk, tp) { + if (after(TCP_SKB_CB(skb)->seq, lost_retrans)) + break; + if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) + continue; + if ((TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) && + after(lost_retrans, TCP_SKB_CB(skb)->ack_seq) && + (IsFack(tp) || + !before(lost_retrans, TCP_SKB_CB(skb)->ack_seq+tp->reordering*tp->mss_cache))) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out--; + + if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) { + tp->lost_out++; + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + flag |= FLAG_DATA_SACKED; + NET_INC_STATS_BH(TCPLostRetransmit); + } + } + } + } + + tp->left_out = tp->sacked_out + tp->lost_out; + + if (reord < tp->fackets_out && tp->ca_state != TCP_CA_Loss) + tcp_update_reordering(tp, (tp->fackets_out+1)-reord, 0); + +#if FASTRETRANS_DEBUG > 0 + BUG_TRAP((int)tp->sacked_out >= 0); + BUG_TRAP((int)tp->lost_out >= 0); + BUG_TRAP((int)tp->retrans_out >= 0); + BUG_TRAP((int)tcp_packets_in_flight(tp) >= 0); +#endif + return flag; } -extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct sk_buff *skb) +void tcp_clear_retrans(struct tcp_opt *tp) { - return ((s32)(tp->rcv_tsval - tp->ts_recent) < 0 && - xtime.tv_sec < tp->ts_recent_stamp + TCP_PAWS_24DAYS + tp->left_out = 0; + tp->retrans_out = 0; + + tp->fackets_out = 0; + tp->sacked_out = 0; + tp->lost_out = 0; - /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM + tp->undo_marker = 0; + tp->undo_retrans = 0; +} + +/* Enter Loss state. If "how" is not zero, forget all SACK information + * and reset tags completely, otherwise preserve SACKs. If receiver + * dropped its ofo queue, we will know this due to reneging detection. + */ +void tcp_enter_loss(struct sock *sk, int how) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct sk_buff *skb; + int cnt = 0; + + /* Reduce ssthresh if it has not yet been made inside this window. */ + if (tp->ca_state <= TCP_CA_Disorder || + tp->snd_una == tp->high_seq || + (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { + tp->prior_ssthresh = tcp_current_ssthresh(tp); + tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + } + tp->snd_cwnd = 1; + tp->snd_cwnd_cnt = 0; + tp->snd_cwnd_stamp = tcp_time_stamp; + + tcp_clear_retrans(tp); + + /* Push undo marker, if it was plain RTO and nothing + * was retransmitted. 
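Entering the Loss state re-tags the whole retransmission queue: retransmit marks are dropped and every segment not known to have reached the receiver is declared lost, optionally discarding SACK information as well when reneging is suspected. A rough sketch of that re-tagging over an array of tag bytes, using the same illustrative flag values as in the earlier sketch:

#include <stdio.h>

#define TAG_SACKED  0x01
#define TAG_RETRANS 0x02
#define TAG_LOST    0x04

/* Re-tag every outstanding segment on entry to the Loss state.
 * If forget_sacks is set (receiver suspected of reneging), SACK
 * marks are discarded too.  Returns the resulting lost_out count.
 */
static int enter_loss(unsigned char *tag, int packets_out, int forget_sacks)
{
        int lost_out = 0;

        for (int i = 0; i < packets_out; i++) {
                tag[i] &= TAG_SACKED;           /* keep only the SACK mark */
                if (!(tag[i] & TAG_SACKED) || forget_sacks) {
                        tag[i] = TAG_LOST;
                        lost_out++;
                }
        }
        return lost_out;
}

int main(void)
{
        unsigned char tags[] = { 0, TAG_SACKED, TAG_LOST | TAG_RETRANS, TAG_RETRANS };

        printf("lost_out after RTO: %d\n", enter_loss(tags, 4, 0));
        return 0;
}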
*/ + if (!how) + tp->undo_marker = tp->snd_una; + + for_retrans_queue(skb, sk, tp) { + cnt++; + if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) + tp->undo_marker = 0; + TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out++; + } else { + tp->sacked_out++; + tp->fackets_out = cnt; + } + } + tp->left_out = tp->sacked_out + tp->lost_out; - I cannot see quitely as all the idea behind PAWS - is destroyed 8) + tp->reordering = min(tp->reordering, sysctl_tcp_reordering); + tp->ca_state = TCP_CA_Loss; + tp->high_seq = tp->snd_nxt; + TCP_ECN_queue_cwr(tp); +} - The problem is only in reordering duplicate ACKs. - Hence, we can check this rare case more carefully. +static int tcp_check_sack_reneging(struct sock *sk, struct tcp_opt *tp) +{ + struct sk_buff *skb; - 1. Check that it is really duplicate ACK (ack==snd_una) - 2. Give it some small "replay" window (~RTO) + /* If ACK arrived pointing to a remembered SACK, + * it means that our remembered SACKs do not reflect + * real state of receiver i.e. + * receiver _host_ is heavily congested (or buggy). + * Do processing similar to RTO timeout. + */ + if ((skb = skb_peek(&sk->write_queue)) != NULL && + (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { + NET_INC_STATS_BH(TCPSACKReneging); - We do not know units of foreign ts values, but make conservative - assumption that they are >=1ms. It solves problem - noted in Dave's mail to tcpimpl and does not harm PAWS. --ANK - */ - && (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq || - TCP_SKB_CB(skb)->ack_seq != tp->snd_una || - !skb->h.th->ack || - (s32)(tp->ts_recent - tp->rcv_tsval) > (tp->rto*1024)/HZ)); + tcp_enter_loss(sk, 1); + tp->retransmits++; + tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + return 1; + } + return 0; +} + +static inline int tcp_fackets_out(struct tcp_opt *tp) +{ + return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out; } -static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) +/* Linux NewReno/SACK/FACK/ECN state machine. + * -------------------------------------- + * + * "Open" Normal state, no dubious events, fast path. + * "Disorder" In all the respects it is "Open", + * but requires a bit more attention. It is entered when + * we see some SACKs or dupacks. It is split of "Open" + * mainly to move some processing from fast path to slow one. + * "CWR" CWND was reduced due to some Congestion Notification event. + * It can be ECN, ICMP source quench, local device congestion. + * "Recovery" CWND was reduced, we are fast-retransmitting. + * "Loss" CWND was reduced due to RTO timeout or SACK reneging. + * + * tcp_fastretrans_alert() is entered: + * - each incoming ACK, if state is not "Open" + * - when arrived ACK is unusual, namely: + * * SACK + * * Duplicate ACK. + * * ECN ECE. + * + * Counting packets in flight is pretty simple. + * + * in_flight = packets_out - left_out + retrans_out + * + * packets_out is SND.NXT-SND.UNA counted in packets. + * + * retrans_out is number of retransmitted segments. + * + * left_out is number of segments left network, but not ACKed yet. + * + * left_out = sacked_out + lost_out + * + * sacked_out: Packets, which arrived to receiver out of order + * and hence not ACKed. With SACKs this number is simply + * amount of SACKed data. 
Even without SACKs + * it is easy to give pretty reliable estimate of this number, + * counting duplicate ACKs. + * + * lost_out: Packets lost by network. TCP has no explicit + * "loss notification" feedback from network (for now). + * It means that this number can be only _guessed_. + * Actually, it is the heuristics to predict lossage that + * distinguishes different algorithms. + * + * F.e. after RTO, when all the queue is considered as lost, + * lost_out = packets_out and in_flight = retrans_out. + * + * Essentially, we have now two algorithms counting + * lost packets. + * + * FACK: It is the simplest heuristics. As soon as we decided + * that something is lost, we decide that _all_ not SACKed + * packets until the most forward SACK are lost. I.e. + * lost_out = fackets_out - sacked_out and left_out = fackets_out. + * It is absolutely correct estimate, if network does not reorder + * packets. And it loses any connection to reality when reordering + * takes place. We use FACK by default until reordering + * is suspected on the path to this destination. + * + * NewReno: when Recovery is entered, we assume that one segment + * is lost (classic Reno). While we are in Recovery and + * a partial ACK arrives, we assume that one more packet + * is lost (NewReno). This heuristics are the same in NewReno + * and SACK. + * + * Imagine, that's all! Forget about all this shamanism about CWND inflation + * deflation etc. CWND is real congestion window, never inflated, changes + * only according to classic VJ rules. + * + * Really tricky (and requiring careful tuning) part of algorithm + * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue(). + * The first determines the moment _when_ we should reduce CWND and, + * hence, slow down forward transmission. In fact, it determines the moment + * when we decide that hole is caused by loss, rather than by a reorder. + * + * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill + * holes, caused by lost packets. + * + * And the most logically complicated part of algorithm is undo + * heuristics. We detect false retransmits due to both too early + * fast retransmit (reordering) and underestimated RTO, analyzing + * timestamps and D-SACKs. When we detect that some segments were + * retransmitted by mistake and CWND reduction was wrong, we undo + * window reduction and abort recovery phase. This logic is hidden + * inside several functions named tcp_try_undo_<something>. + */ + +/* This function decides, when we should leave Disordered state + * and enter Recovery phase, reducing congestion window. + * + * Main question: may we further continue forward transmission + * with the same cwnd? + */ +static int +tcp_time_to_recover(struct sock *sk, struct tcp_opt *tp) { - u32 end_window = tp->rcv_wup + tp->rcv_wnd; -#ifdef TCP_FORMAL_WINDOW - u32 rcv_wnd = tcp_receive_window(tp); -#else - u32 rcv_wnd = tp->rcv_wnd; -#endif + /* Trick#1: The loss is proven. */ + if (tp->lost_out) + return 1; - if (rcv_wnd && - after(end_seq, tp->rcv_nxt) && - before(seq, end_window)) + /* Not-A-Trick#2 : Classic rule... */ + if (tcp_fackets_out(tp) > tp->reordering) return 1; - if (seq != end_window) - return 0; - return (seq == end_seq); + + /* Trick#3: It is still not OK... But will it be useful to delay + * recovery more? + */ + if (tp->packets_out <= tp->reordering && + tp->sacked_out >= max(tp->packets_out/2, sysctl_tcp_reordering) && + !tcp_may_send_now(sk, tp)) { + /* We have nothing to send. 
This connection is limited + * either by receiver window or by application. + */ + return 1; + } + + return 0; } -/* This functions checks to see if the tcp header is actually acceptable. */ -extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) +/* If we receive more dupacks than we expected counting segments + * in assumption of absent reordering, interpret this as reordering. + * The only another reason could be bug in receiver TCP. + */ +static void tcp_check_reno_reordering(struct tcp_opt *tp, int addend) { -#ifdef TCP_FORMAL_WINDOW - u32 rcv_wnd = tcp_receive_window(tp); -#else - u32 rcv_wnd = tp->rcv_wnd; -#endif - if (seq == tp->rcv_nxt) - return (rcv_wnd || (end_seq == seq)); + if (tp->sacked_out + 1 > tp->packets_out) { + tp->sacked_out = tp->packets_out ? tp->packets_out - 1 : 0; + tcp_update_reordering(tp, tp->packets_out+addend, 0); + } +} - return __tcp_sequence(tp, seq, end_seq); +/* Emulate SACKs for SACKless connection: account for a new dupack. */ + +static void tcp_add_reno_sack(struct tcp_opt *tp) +{ + ++tp->sacked_out; + tcp_check_reno_reordering(tp, 0); + tp->left_out = tp->sacked_out + tp->lost_out; } -/* When we get a reset we do this. */ -static void tcp_reset(struct sock *sk) +/* Account for ACK, ACKing some data in Reno Recovery phase. */ + +static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_opt *tp, int acked) { - /* We want the right error as BSD sees it (and indeed as we do). */ - switch (sk->state) { - case TCP_SYN_SENT: - sk->err = ECONNREFUSED; - break; - case TCP_CLOSE_WAIT: - sk->err = EPIPE; + if (acked > 0) { + /* One ACK eated lost packet. Must eat! */ + BUG_TRAP(tp->lost_out == 0); + + /* The rest eat duplicate ACKs. */ + if (acked-1 >= tp->sacked_out) + tp->sacked_out = 0; + else + tp->sacked_out -= acked-1; + } + tcp_check_reno_reordering(tp, acked); + tp->left_out = tp->sacked_out + tp->lost_out; +} + +static inline void tcp_reset_reno_sack(struct tcp_opt *tp) +{ + tp->sacked_out = 0; + tp->left_out = tp->lost_out; +} + +/* Mark head of queue up as lost. */ +static void +tcp_mark_head_lost(struct sock *sk, struct tcp_opt *tp, int packets, u32 high_seq) +{ + struct sk_buff *skb; + int cnt = packets; + + BUG_TRAP(cnt <= tp->packets_out); + + for_retrans_queue(skb, sk, tp) { + if (--cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq)) break; - case TCP_CLOSE: - return; - default: - sk->err = ECONNRESET; + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out++; + } } + tp->left_out = tp->sacked_out + tp->lost_out; +} - if (!sk->dead) - sk->error_report(sk); +/* Account newly detected lost packet(s) */ - tcp_done(sk); +static void tcp_update_scoreboard(struct sock *sk, struct tcp_opt *tp) +{ + if (IsFack(tp)) { + int lost = tp->fackets_out - tp->reordering; + if (lost <= 0) + lost = 1; + tcp_mark_head_lost(sk, tp, lost, tp->high_seq); + } else { + tcp_mark_head_lost(sk, tp, 1, tp->high_seq); + } } -/* This tags the retransmission queue when SACKs arrive. */ -static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, int nsacks) +/* CWND moderation, preventing bursts due to too big ACKs + * in dubious situations. 
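The moderation being introduced here is a one-line clamp: after a dubious ACK that suddenly empties much of the pipe, cwnd is pulled down to the current flight size plus a small burst allowance, so the next transmission cannot dump a whole window at once. A minimal sketch, with a fixed 3-segment allowance standing in for tcp_max_burst(), whose value is assumed:

#include <stdio.h>

static unsigned int moderate_cwnd(unsigned int cwnd, unsigned int in_flight)
{
        unsigned int limit = in_flight + 3;     /* assumed burst allowance */

        return cwnd < limit ? cwnd : limit;
}

int main(void)
{
        /* A big cumulative ACK just drained most of the pipe. */
        printf("cwnd clamped to %u\n", moderate_cwnd(20, 4));
        return 0;
}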
+ */ +static __inline__ void tcp_moderate_cwnd(struct tcp_opt *tp) { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int i = nsacks; + tp->snd_cwnd = min(tp->snd_cwnd, + tcp_packets_in_flight(tp)+tcp_max_burst(tp)); + tp->snd_cwnd_stamp = tcp_time_stamp; +} - while(i--) { - struct sk_buff *skb = skb_peek(&sk->write_queue); - __u32 start_seq = ntohl(sp->start_seq); - __u32 end_seq = ntohl(sp->end_seq); - int fack_count = 0; +/* Decrease cwnd each second ack. */ - while((skb != NULL) && - (skb != tp->send_head) && - (skb != (struct sk_buff *)&sk->write_queue)) { - /* The retransmission queue is always in order, so - * we can short-circuit the walk early. - */ - if(after(TCP_SKB_CB(skb)->seq, end_seq)) - break; +static void tcp_cwnd_down(struct tcp_opt *tp) +{ + int decr = tp->snd_cwnd_cnt + 1; - /* We play conservative, we don't allow SACKS to partially - * tag a sequence space. - */ - fack_count++; - if(!after(start_seq, TCP_SKB_CB(skb)->seq) && - !before(end_seq, TCP_SKB_CB(skb)->end_seq)) { - /* If this was a retransmitted frame, account for it. */ - if((TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) && - tp->retrans_out) - tp->retrans_out--; - TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; + tp->snd_cwnd_cnt = decr&1; + decr >>= 1; - /* RULE: All new SACKs will either decrease retrans_out - * or advance fackets_out. - */ - if(fack_count > tp->fackets_out) - tp->fackets_out = fack_count; - } - skb = skb->next; - } - sp++; /* Move on to the next SACK block. */ - } + if (decr && tp->snd_cwnd > tp->snd_ssthresh/2) + tp->snd_cwnd -= decr; + + tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); + tp->snd_cwnd_stamp = tcp_time_stamp; } -/* Look for tcp options. Normally only called on SYN and SYNACK packets. - * But, this can also be called on packets in the established flow when - * the fast version below fails. +/* Nothing was retransmitted or returned timestamp is less + * than timestamp of the first retransmission. */ -void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, int no_fancy) +static __inline__ int tcp_packet_delayed(struct tcp_opt *tp) { - unsigned char *ptr; - int length=(th->doff*4)-sizeof(struct tcphdr); + return !tp->retrans_stamp || + (tp->saw_tstamp && + (__s32)(tp->rcv_tsecr - tp->retrans_stamp) < 0); +} - ptr = (unsigned char *)(th + 1); - tp->saw_tstamp = 0; +/* Undo procedures. */ - while(length>0) { - int opcode=*ptr++; - int opsize; +#if FASTRETRANS_DEBUG > 1 +static void DBGUNDO(struct sock *sk, struct tcp_opt *tp, const char *msg) +{ + printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n", + msg, + NIPQUAD(sk->daddr), ntohs(sk->dport), + tp->snd_cwnd, tp->left_out, + tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); +} +#else +#define DBGUNDO(x...) 
do { } while (0) +#endif - switch (opcode) { - case TCPOPT_EOL: - return; - case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ - length--; - continue; - default: - opsize=*ptr++; - if (opsize < 2) /* "silly options" */ - return; - if (opsize > length) - break; /* don't parse partial options */ - switch(opcode) { - case TCPOPT_MSS: - if(opsize==TCPOLEN_MSS && th->syn) { - u16 in_mss = ntohs(*(__u16 *)ptr); - if (in_mss) { - if (tp->user_mss && tp->user_mss < in_mss) - in_mss = tp->user_mss; - tp->mss_clamp = in_mss; - } - } - break; - case TCPOPT_WINDOW: - if(opsize==TCPOLEN_WINDOW && th->syn) - if (!no_fancy && sysctl_tcp_window_scaling) { - tp->wscale_ok = 1; - tp->snd_wscale = *(__u8 *)ptr; - if(tp->snd_wscale > 14) { - if(net_ratelimit()) - printk("tcp_parse_options: Illegal window " - "scaling value %d >14 received.", - tp->snd_wscale); - tp->snd_wscale = 14; - } - } - break; - case TCPOPT_TIMESTAMP: - if(opsize==TCPOLEN_TIMESTAMP) { - if (sysctl_tcp_timestamps && !no_fancy) { - tp->tstamp_ok = 1; - tp->saw_tstamp = 1; - tp->rcv_tsval = ntohl(*(__u32 *)ptr); - tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4)); - } - } - break; - case TCPOPT_SACK_PERM: - if(opsize==TCPOLEN_SACK_PERM && th->syn) { - if (sysctl_tcp_sack && !no_fancy) { - tp->sack_ok = 1; - tp->num_sacks = 0; - } - } - break; +static void tcp_undo_cwr(struct tcp_opt *tp, int undo) +{ + if (tp->prior_ssthresh) { + tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); + if (undo && tp->prior_ssthresh > tp->snd_ssthresh) + tp->snd_ssthresh = tp->prior_ssthresh; + } else { + tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); + } + tcp_moderate_cwnd(tp); + tp->snd_cwnd_stamp = tcp_time_stamp; +} - case TCPOPT_SACK: - if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && - sysctl_tcp_sack && (sk != NULL) && !th->syn) { - int sack_bytes = opsize - TCPOLEN_SACK_BASE; +static inline int tcp_may_undo(struct tcp_opt *tp) +{ + return tp->undo_marker && + (!tp->undo_retrans || tcp_packet_delayed(tp)); +} - if(!(sack_bytes % TCPOLEN_SACK_PERBLOCK)) { - int num_sacks = sack_bytes >> 3; - struct tcp_sack_block *sackp; +/* People celebrate: "We love our President!" */ +static int tcp_try_undo_recovery(struct sock *sk, struct tcp_opt *tp) +{ + if (tcp_may_undo(tp)) { + /* Happy end! We did not retransmit anything + * or our original transmission succeeded. + */ + DBGUNDO(sk, tp, tp->ca_state == TCP_CA_Loss ? "loss" : "retrans"); + tcp_undo_cwr(tp, 1); + if (tp->ca_state == TCP_CA_Loss) + NET_INC_STATS_BH(TCPLossUndo); + else + NET_INC_STATS_BH(TCPFullUndo); + tp->undo_marker = 0; + } + if (tp->snd_una == tp->high_seq && IsReno(tp)) { + /* Hold old state until something *above* high_seq + * is ACKed. For Reno it is MUST to prevent false + * fast retransmits (RFC2582). SACK TCP is safe. */ + tcp_moderate_cwnd(tp); + return 1; + } + tp->ca_state = TCP_CA_Open; + return 0; +} - sackp = (struct tcp_sack_block *)ptr; - tcp_sacktag_write_queue(sk, sackp, num_sacks); - } - } - }; - ptr+=opsize-2; - length-=opsize; - }; +/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ +static void tcp_try_undo_dsack(struct sock *sk, struct tcp_opt *tp) +{ + if (tp->undo_marker && !tp->undo_retrans) { + DBGUNDO(sk, tp, "D-SACK"); + tcp_undo_cwr(tp, 1); + tp->undo_marker = 0; + NET_INC_STATS_BH(TCPDSACKUndo); } } -/* Fast parse options. This hopes to only see timestamps. - * If it is wrong it falls back on tcp_parse_options(). 
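All of the tcp_try_undo_* helpers share one test: an undo marker must still be set, and either every retransmission has since been reported back as a duplicate (undo_retrans reached zero) or timestamps prove the originals were merely delayed. A compact sketch of that decision plus the window restoration, expressed over plain parameters instead of socket state:

#include <stdio.h>

/* May the congestion-window reduction be undone? */
static int may_undo(unsigned int undo_marker, unsigned int undo_retrans,
                    int packet_delayed)
{
        return undo_marker && (undo_retrans == 0 || packet_delayed);
}

/* Restore cwnd after a reduction judged spurious. */
static unsigned int undo_cwr(unsigned int cwnd, unsigned int ssthresh,
                             unsigned int prior_ssthresh)
{
        if (prior_ssthresh) {
                /* The pre-reduction threshold was remembered: jump back
                 * to roughly the old operating point.
                 */
                if (cwnd < 2 * ssthresh)
                        cwnd = 2 * ssthresh;
        } else if (cwnd < ssthresh) {
                cwnd = ssthresh;
        }
        return cwnd;
}

int main(void)
{
        if (may_undo(1, 0, 0))
                printf("undo: cwnd back to %u\n", undo_cwr(5, 10, 20));
        return 0;
}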
- */ -static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp) +/* Undo during fast recovery after partial ACK. */ + +static int tcp_try_undo_partial(struct sock *sk, struct tcp_opt *tp, int acked) { - /* If we didn't send out any options ignore them all. */ - if (tp->tcp_header_len == sizeof(struct tcphdr)) - return 0; - if (th->doff == sizeof(struct tcphdr)>>2) { - tp->saw_tstamp = 0; - return 0; - } else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { - __u32 *ptr = (__u32 *)(th + 1); - if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) - | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { - tp->saw_tstamp = 1; - ++ptr; - tp->rcv_tsval = ntohl(*ptr); - ++ptr; - tp->rcv_tsecr = ntohl(*ptr); - return 1; + /* Partial ACK arrived. Force Hoe's retransmit. */ + int failed = IsReno(tp) || tp->fackets_out>tp->reordering; + + if (tcp_may_undo(tp)) { + /* Plain luck! Hole if filled with delayed + * packet, rather than with a retransmit. + */ + if (tp->retrans_out == 0) + tp->retrans_stamp = 0; + + tcp_update_reordering(tp, tcp_fackets_out(tp)+acked, 1); + + DBGUNDO(sk, tp, "Hoe"); + tcp_undo_cwr(tp, 0); + NET_INC_STATS_BH(TCPPartialUndo); + + /* So... Do not make Hoe's retransmit yet. + * If the first packet was delayed, the rest + * ones are most probably delayed as well. + */ + failed = 0; + } + return failed; +} + +/* Undo during loss recovery after partial ACK. */ +static int tcp_try_undo_loss(struct sock *sk, struct tcp_opt *tp) +{ + if (tcp_may_undo(tp)) { + struct sk_buff *skb; + for_retrans_queue(skb, sk, tp) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + } + DBGUNDO(sk, tp, "partial loss"); + tp->lost_out = 0; + tp->left_out = tp->sacked_out; + tcp_undo_cwr(tp, 1); + NET_INC_STATS_BH(TCPLossUndo); + tp->retransmits = 0; + tp->undo_marker = 0; + if (!IsReno(tp)) { + tp->ca_state = TCP_CA_Open; + tp->backoff = 0; } + return 1; } - tcp_parse_options(sk, th, tp, 0); - return 1; + return 0; } -#define FLAG_DATA 0x01 /* Incoming frame contained data. */ -#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ -#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ -#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ -#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged new data. */ +static __inline__ void tcp_complete_cwr(struct tcp_opt *tp) +{ + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + tp->snd_cwnd_stamp = tcp_time_stamp; +} -static __inline__ void clear_fast_retransmit(struct tcp_opt *tp) +static void tcp_try_to_open(struct sock *sk, struct tcp_opt *tp, int flag) { - if (tp->dup_acks > 3) - tp->snd_cwnd = (tp->snd_ssthresh); + tp->left_out = tp->sacked_out; - tp->dup_acks = 0; + if (tp->retrans_out == 0) + tp->retrans_stamp = 0; + + if (flag&FLAG_ECE) + tcp_enter_cwr(tp); + + if (tp->ca_state != TCP_CA_CWR) { + int state = TCP_CA_Open; + + if (tp->left_out || + tp->retrans_out || + tp->undo_marker) + state = TCP_CA_Disorder; + + if (tp->ca_state != state) { + tp->ca_state = state; + tp->high_seq = tp->snd_nxt; + } + tcp_moderate_cwnd(tp); + } else { + tcp_cwnd_down(tp); + } } -/* NOTE: This code assumes that tp->dup_acks gets cleared when a - * retransmit timer fires. +/* Process an event, which can update packets-in-flight not trivially. + * Main goal of this function is to calculate new estimate for left_out, + * taking into account both packets sitting in receiver's buffer and + * packets lost by network. 
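An incoming ACK only reaches this slow path when it is "dubious": it made no forward progress (a pure duplicate), it carried SACK or ECE information, or the connection is not sitting in the Open state. A sketch of that gating with the states and flag bits modelled as plain enums and masks; the names here are illustrative, not the kernel's:

#include <stdio.h>

enum ca_state { CA_OPEN, CA_DISORDER, CA_CWR, CA_RECOVERY, CA_LOSS };

/* Illustrative flag bits carried along with each incoming ACK. */
#define F_DATA        0x01   /* segment carried data          */
#define F_WIN_UPDATE  0x02   /* it moved the send window      */
#define F_ACKED       0x04   /* it acknowledged new data      */
#define F_SACK_OR_ECE 0x08   /* it carried SACK blocks or ECE */

/* "Not a duplicate" if it did any forward work at all. */
#define F_NOT_DUP     (F_DATA | F_WIN_UPDATE | F_ACKED)

static int ack_is_dubious(enum ca_state state, unsigned int flag)
{
        return !(flag & F_NOT_DUP) ||
               (flag & F_SACK_OR_ECE) ||
               state != CA_OPEN;
}

int main(void)
{
        printf("pure dupack in Open:   %d\n", ack_is_dubious(CA_OPEN, 0));
        printf("fresh ACK in Open:     %d\n", ack_is_dubious(CA_OPEN, F_ACKED));
        printf("fresh ACK in Recovery: %d\n", ack_is_dubious(CA_RECOVERY, F_ACKED));
        return 0;
}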
+ * + * Besides that it does CWND reduction, when packet loss is detected + * and changes state of machine. + * + * It does _not_ decide what to send, it is made in function + * tcp_xmit_retransmit_queue(). */ -static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) +static void +tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, + int prior_packets, int flag) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP)); + + /* Some technical things: + * 1. Reno does not count dupacks (sacked_out) automatically. */ + if (!tp->packets_out) + tp->sacked_out = 0; + /* 2. SACK counts snd_fack in packets inaccurately. */ + if (tp->sacked_out == 0) + tp->fackets_out = 0; + + /* Now state machine starts. + * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ + if (flag&FLAG_ECE) + tp->prior_ssthresh = 0; + + /* B. In all the states check for reneging SACKs. */ + if (tp->sacked_out && tcp_check_sack_reneging(sk, tp)) + return; - /* Note: If not_dup is set this implies we got a - * data carrying packet or a window update. - * This carries no new information about possible - * lost packets, so we have to ignore it for the purposes - * of counting duplicate acks. Ideally this does not imply we - * should stop our fast retransmit phase, more acks may come - * later without data to help us. Unfortunately this would make - * the code below much more complex. For now if I see such - * a packet I clear the fast retransmit phase. - */ - if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) { - /* This is the standard reno style fast retransmit branch. */ - - /* 1. When the third duplicate ack is received, set ssthresh - * to one half the current congestion window, but no less - * than two segments. Retransmit the missing segment. - */ - if (tp->high_seq == 0 || after(ack, tp->high_seq)) { - tp->dup_acks++; - if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) { - __tcp_enter_cong_avoid(tp); - /* ... and account for 3 ACKs, which are - * already received to this time. - */ - tp->snd_cwnd += 3; - - if(!tp->fackets_out) - tcp_retransmit_skb(sk, - skb_peek(&sk->write_queue)); - else - tcp_fack_retransmit(sk); - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + /* C. Process data loss notification, provided it is valid. */ + if ((flag&FLAG_DATA_LOST) && + before(tp->snd_una, tp->high_seq) && + tp->ca_state != TCP_CA_Open && + tp->fackets_out > tp->reordering) { + tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq); + NET_INC_STATS_BH(TCPLoss); + } + + /* D. Synchronize left_out to current state. */ + tp->left_out = tp->sacked_out + tp->lost_out; + + /* E. Check state exit conditions. State can be terminated + * when high_seq is ACKed. */ + if (tp->ca_state == TCP_CA_Open) { + BUG_TRAP(tp->retrans_out == 0); + tp->retrans_stamp = 0; + } else if (!before(tp->snd_una, tp->high_seq)) { + switch (tp->ca_state) { + case TCP_CA_Loss: + tp->retransmits = 0; + if (tcp_try_undo_recovery(sk, tp)) + return; + tp->backoff = 0; + break; + + case TCP_CA_CWR: + /* CWR is to be held something *above* high_seq + * is ACKed for CWR bit to reach receiver. */ + if (tp->snd_una != tp->high_seq) { + tcp_complete_cwr(tp); + tp->ca_state = TCP_CA_Open; } - } else if (++tp->dup_acks > 3) { - /* 2. Each time another duplicate ACK arrives, increment - * cwnd by the segment size. [...] Transmit a packet... - * - * Packet transmission will be done on normal flow processing - * since we're not in "retransmit mode". 
We do not use - * duplicate ACKs to artificially inflate the congestion - * window when doing FACK. - */ - if(!tp->fackets_out) { - tp->snd_cwnd++; - } else { - /* Fill any further holes which may have - * appeared. - * - * We may want to change this to run every - * further multiple-of-3 dup ack increments, - * to be more robust against out-of-order - * packet delivery. -DaveM - */ - tcp_fack_retransmit(sk); + break; + + case TCP_CA_Disorder: + tcp_try_undo_dsack(sk, tp); + if (IsReno(tp) || !tp->undo_marker) { + tp->undo_marker = 0; + tp->ca_state = TCP_CA_Open; } + break; + + case TCP_CA_Recovery: + if (IsReno(tp)) + tcp_reset_reno_sack(tp); + if (tcp_try_undo_recovery(sk, tp)) + return; + tcp_complete_cwr(tp); + break; } - } else if (tp->high_seq != 0) { - /* In this branch we deal with clearing the Floyd style - * block on duplicate fast retransmits, and if requested - * we do Hoe style secondary fast retransmits. - */ - if (!before(ack, tp->high_seq) || (not_dup & FLAG_DATA) != 0) { - /* Once we have acked all the packets up to high_seq - * we are done this fast retransmit phase. - * Alternatively data arrived. In this case we - * Have to abort the fast retransmit attempt. - * Note that we do want to accept a window - * update since this is expected with Hoe's algorithm. - */ - clear_fast_retransmit(tp); + } - /* After we have cleared up to high_seq we can - * clear the Floyd style block. - */ - if (!before(ack, tp->high_seq)) { - tp->high_seq = 0; - tp->fackets_out = 0; - } - } else if (tp->dup_acks >= 3) { - if (!tp->fackets_out) { - /* Hoe Style. We didn't ack the whole - * window. Take this as a cue that - * another packet was lost and retransmit it. - * Don't muck with the congestion window here. - * Note that we have to be careful not to - * act if this was a window update and it - * didn't ack new data, since this does - * not indicate a packet left the system. - * We can test this by just checking - * if ack changed from snd_una, since - * the only way to get here without advancing - * from snd_una is if this was a window update. - */ - if (ack != tp->snd_una && before(ack, tp->high_seq)) { - tcp_retransmit_skb(sk, - skb_peek(&sk->write_queue)); - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); - } - } else { - /* FACK style, fill any remaining holes in - * receiver's queue. - */ - tcp_fack_retransmit(sk); - } + /* F. Process state. */ + switch (tp->ca_state) { + case TCP_CA_Recovery: + if (prior_snd_una == tp->snd_una) { + if (IsReno(tp) && is_dupack) + tcp_add_reno_sack(tp); + } else { + int acked = prior_packets - tp->packets_out; + if (IsReno(tp)) + tcp_remove_reno_sacks(sk, tp, acked); + is_dupack = tcp_try_undo_partial(sk, tp, acked); } + break; + case TCP_CA_Loss: + if (flag & FLAG_ACKED) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + if (!tcp_try_undo_loss(sk, tp)) { + tcp_moderate_cwnd(tp); + tcp_xmit_retransmit_queue(sk); + return; + } + if (tp->ca_state != TCP_CA_Open) + return; + /* Loss is undone; fall through to processing in Open state. 
*/ + default: + if (IsReno(tp)) { + if (tp->snd_una != prior_snd_una) + tcp_reset_reno_sack(tp); + if (is_dupack) + tcp_add_reno_sack(tp); + } + + if (tp->ca_state == TCP_CA_Disorder) + tcp_try_undo_dsack(sk, tp); + + if (!tcp_time_to_recover(sk, tp)) { + tcp_try_to_open(sk, tp, flag); + return; + } + + /* Otherwise enter Recovery state */ + + if (IsReno(tp)) + NET_INC_STATS_BH(TCPRenoRecovery); + else + NET_INC_STATS_BH(TCPSackRecovery); + + tp->high_seq = tp->snd_nxt; + tp->prior_ssthresh = 0; + tp->undo_marker = tp->snd_una; + tp->undo_retrans = tp->retrans_out; + + if (tp->ca_state < TCP_CA_CWR) { + if (!(flag&FLAG_ECE)) + tp->prior_ssthresh = tcp_current_ssthresh(tp); + tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + TCP_ECN_queue_cwr(tp); + } + + tp->snd_cwnd_cnt = 0; + tp->ca_state = TCP_CA_Recovery; + } + + if (is_dupack) + tcp_update_scoreboard(sk, tp); + tcp_cwnd_down(tp); + tcp_xmit_retransmit_queue(sk); +} + +/* Read draft-ietf-tcplw-high-performance before mucking + * with this code. (Superceeds RFC1323) + */ +static void tcp_ack_saw_tstamp(struct tcp_opt *tp) +{ + __u32 seq_rtt; + + /* RTTM Rule: A TSecr value received in a segment is used to + * update the averaged RTT measurement only if the segment + * acknowledges some new data, i.e., only if it advances the + * left edge of the send window. + * + * See draft-ietf-tcplw-high-performance-00, section 3.3. + * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> + */ + seq_rtt = tcp_time_stamp - tp->rcv_tsecr; + tcp_rtt_estimator(tp, seq_rtt); + tcp_set_rto(tp); + tp->rto <<= tp->backoff; + tcp_bound_rto(tp); +} + +static void tcp_ack_no_tstamp(struct tcp_opt *tp, u32 seq_rtt, int flag) +{ + /* We don't have a timestamp. Can only use + * packets that are not retransmitted to determine + * rtt estimates. Also, we must not reset the + * backoff for rto until we get a non-retransmitted + * packet. This allows us to deal with a situation + * where the network delay has increased suddenly. + * I.e. Karn's algorithm. (SIGCOMM '87, p5.) + */ + + if (!tp->retransmits && !(flag & FLAG_RETRANS_DATA_ACKED)) { + tp->backoff = 0; + tcp_rtt_estimator(tp, seq_rtt); + tcp_set_rto(tp); + tcp_bound_rto(tp); } } +static __inline__ void +tcp_ack_update_rtt(struct tcp_opt *tp, int flag, u32 seq_rtt) +{ + if (tp->saw_tstamp) + tcp_ack_saw_tstamp(tp); + else + tcp_ack_no_tstamp(tp, seq_rtt, flag); +} + /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ @@ -855,31 +1645,38 @@ static __inline__ void tcp_cong_avoid(struct tcp_opt *tp) } } +static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) +{ + if (tp->packets_out==0) { + tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS); + } else { + struct sk_buff *skb = skb_peek(&sk->write_queue); + __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when); + + if ((__s32)when <= 0) + when = TCP_RTO_MIN; + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, when); + } +} + /* Remove acknowledged frames from the retransmission queue. */ -static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, - __u32 *seq, __u32 *seq_rtt) +static int tcp_clean_rtx_queue(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; __u32 now = tcp_time_stamp; int acked = 0; - - /* If we are retransmitting, and this ACK clears up to - * the retransmit head, or further, then clear our state. - */ - if (tp->retrans_head != NULL && - !before(ack, TCP_SKB_CB(tp->retrans_head)->end_seq)) - tp->retrans_head = NULL; + __u32 seq_rtt = 0; /* F..g gcc... 
*/ while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); __u8 sacked = scb->sacked; - + /* If our packet is before the ack sequence we can * discard it as it's confirmed to have arrived at * the other end. */ - if (after(scb->end_seq, ack)) + if (after(scb->end_seq, tp->snd_una)) break; /* Initial outgoing SYN's get put onto the write_queue @@ -889,711 +1686,482 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, * connection startup slow start one packet too * quickly. This is severely frowned upon behavior. */ - if((sacked & TCPCB_SACKED_RETRANS) && tp->retrans_out) - tp->retrans_out--; if(!(scb->flags & TCPCB_FLAG_SYN)) { acked |= FLAG_DATA_ACKED; - if(sacked & TCPCB_SACKED_RETRANS) - acked |= FLAG_RETRANS_DATA_ACKED; - if(tp->fackets_out) - tp->fackets_out--; } else { acked |= FLAG_SYN_ACKED; - /* This is pure paranoia. */ - tp->retrans_head = NULL; } + + if (sacked) { + if(sacked & TCPCB_RETRANS) { + if(sacked & TCPCB_SACKED_RETRANS) + tp->retrans_out--; + acked |= FLAG_RETRANS_DATA_ACKED; + } + if(sacked & TCPCB_SACKED_ACKED) + tp->sacked_out--; + if(sacked & TCPCB_LOST) + tp->lost_out--; + } + if(tp->fackets_out) + tp->fackets_out--; tp->packets_out--; - *seq = scb->seq; - *seq_rtt = now - scb->when; + seq_rtt = now - scb->when; __skb_unlink(skb, skb->list); - kfree_skb(skb); + tcp_free_skb(sk, skb); + } + + if (acked&FLAG_ACKED) { + tcp_ack_update_rtt(tp, acked, seq_rtt); + tcp_ack_packets_out(sk, tp); + } + +#if FASTRETRANS_DEBUG > 0 + BUG_TRAP((int)tp->sacked_out >= 0); + BUG_TRAP((int)tp->lost_out >= 0); + BUG_TRAP((int)tp->retrans_out >= 0); + if (tp->packets_out==0 && tp->sack_ok) { + if (tp->lost_out) { + printk(KERN_DEBUG "Leak l=%u %d\n", tp->lost_out, tp->ca_state); + tp->lost_out = 0; + } + if (tp->sacked_out) { + printk(KERN_DEBUG "Leak s=%u %d\n", tp->sacked_out, tp->ca_state); + tp->sacked_out = 0; + } + if (tp->retrans_out) { + printk(KERN_DEBUG "Leak r=%u %d\n", tp->retrans_out, tp->ca_state); + tp->retrans_out = 0; + } } +#endif return acked; } -static void tcp_ack_probe(struct sock *sk, __u32 ack) +static void tcp_ack_probe(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - + /* Was it a usable window open? */ - if (tp->send_head != NULL) { - if (!after(TCP_SKB_CB(tp->send_head)->end_seq, ack + tp->snd_wnd)) { - tp->backoff = 0; - tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0); - /* If packets_out==0, socket must be waked up by - * subsequent tcp_data_snd_check(). This function is - * not for random using! - */ - } else if (!tp->packets_out) { - tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, - min(tp->rto << tp->backoff, TCP_RTO_MAX)); - } + if (!after(TCP_SKB_CB(tp->send_head)->end_seq, tp->snd_una + tp->snd_wnd)) { + tp->backoff = 0; + tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0); + /* Socket must be waked up by subsequent tcp_data_snd_check(). + * This function is not for random using! + */ + } else { + tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, + min(tp->rto << tp->backoff, TCP_RTO_MAX)); } } -/* Should we open up the congestion window? */ -static __inline__ int should_advance_cwnd(struct tcp_opt *tp, int flag) +static __inline__ int tcp_ack_is_dubious(struct tcp_opt *tp, int flag) { - /* Data must have been acked. */ - if ((flag & FLAG_DATA_ACKED) == 0) - return 0; - - /* Some of the data acked was retransmitted somehow? */ - if ((flag & FLAG_RETRANS_DATA_ACKED) != 0) { - /* We advance in all cases except during - * non-FACK fast retransmit/recovery. 
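When an ACK removes some but not all data from the retransmission queue, the retransmit timer is not simply pushed a full RTO into the future; it is re-armed relative to the transmit time of the segment that is now oldest, as done in tcp_ack_packets_out earlier in this hunk. A small sketch of that arithmetic with illustrative tick values:

#include <stdio.h>

/* Deadline measured from when the now-oldest outstanding segment
 * was sent; rto_min is an illustrative floor.
 */
static long rearm_rto(long rto, long now, long oldest_sent, long rto_min)
{
        long when = rto - (now - oldest_sent);

        return when > 0 ? when : rto_min;
}

int main(void)
{
        /* The oldest packet has already waited 250 of a 300-tick RTO. */
        printf("fire in %ld ticks\n", rearm_rto(300, 1000, 750, 20));
        return 0;
}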
- */ - if (tp->fackets_out != 0 || - tp->retransmits != 0) - return 1; + return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || + tp->ca_state != TCP_CA_Open); +} - /* Non-FACK fast retransmit does it's own - * congestion window management, don't get - * in the way. - */ - return 0; - } +static __inline__ int tcp_may_raise_cwnd(struct tcp_opt *tp, int flag) +{ + return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && + !((1<<tp->ca_state)&(TCPF_CA_Recovery|TCPF_CA_CWR)); +} - /* New non-retransmitted data acked, always advance. */ - return 1; +/* Check that window update is acceptable. + * The function assumes that snd_una<=ack<=snd_next. + */ +static __inline__ int +tcp_may_update_window(struct tcp_opt *tp, u32 ack, u32 ack_seq, u32 nwin) +{ + return (after(ack, tp->snd_una) || + after(ack_seq, tp->snd_wl1) || + (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd)); } -/* Read draft-ietf-tcplw-high-performance before mucking - * with this code. (Superceeds RFC1323) +/* Update our send window. + * + * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 + * and in FreeBSD. NetBSD's one is even worse.) is wrong. */ -static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp, - u32 seq, u32 ack, int flag) +static int tcp_ack_update_window(struct sock *sk, struct tcp_opt *tp, + struct sk_buff *skb, u32 ack, u32 ack_seq) { - __u32 seq_rtt; + int flag = 0; + u32 nwin = ntohs(skb->h.th->window) << tp->snd_wscale; - /* RTTM Rule: A TSecr value received in a segment is used to - * update the averaged RTT measurement only if the segment - * acknowledges some new data, i.e., only if it advances the - * left edge of the send window. - * - * See draft-ietf-tcplw-high-performance-00, section 3.3. - * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> - */ - if (!(flag & (FLAG_DATA_ACKED|FLAG_SYN_ACKED))) - return; + if (tcp_may_update_window(tp, ack, ack_seq, nwin)) { + flag |= FLAG_WIN_UPDATE; + tcp_update_wl(tp, ack, ack_seq); - seq_rtt = tcp_time_stamp - tp->rcv_tsecr; - tcp_rtt_estimator(tp, seq_rtt); - if (tp->retransmits) { - if (tp->packets_out == 0) { - tp->retransmits = 0; - tp->fackets_out = 0; - tp->retrans_out = 0; - tp->backoff = 0; - tcp_set_rto(tp); - } else { - /* Still retransmitting, use backoff */ - tcp_set_rto(tp); - tp->rto = tp->rto << tp->backoff; + if (tp->snd_wnd != nwin) { + tp->snd_wnd = nwin; + + /* Note, it is the only place, where + * fast path is recovered for sending TCP. + */ + if (skb_queue_len(&tp->out_of_order_queue) == 0 && +#ifdef TCP_FORMAL_WINDOW + tcp_receive_window(tp) && +#endif + !tp->urg_data) + tcp_fast_path_on(tp); + + if (nwin > tp->max_window) { + tp->max_window = nwin; + tcp_sync_mss(sk, tp->pmtu_cookie); + } } - } else { - tcp_set_rto(tp); } - tcp_bound_rto(tp); -} - -static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) -{ - struct sk_buff *skb = skb_peek(&sk->write_queue); + tp->snd_una = ack; #ifdef TCP_DEBUG - /* It occured in 2.3, because of racy timers. Namely, - * retransmit timer did not check packets_out and retransmitted - * send_head sometimes and, hence, messed all the write_queue. - * Now it is impossible, I bet. --ANK - */ - if (skb == NULL) { - printk("Sucks! packets_out=%d, sk=%p, %d\n", tp->packets_out, sk, sk->state); - return; + if (before(tp->snd_una + tp->snd_wnd, tp->snd_nxt)) { + if (net_ratelimit()) + printk(KERN_DEBUG "TCP: peer shrinks window. 
Bad, what else can I say?\n"); } #endif - /* Some data was ACK'd, if still retransmitting (due to a - * timeout), resend more of the retransmit queue. The - * congestion window is handled properly by that code. - */ - if (tp->retransmits) { - tcp_xmit_retransmit_queue(sk); - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); - } else { - __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when); - if ((__s32)when < 0) - when = 1; - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, when); - } + return flag; } /* This routine deals with incoming acks, but not outgoing ones. */ -static int tcp_ack(struct sock *sk, struct tcphdr *th, - u32 ack_seq, u32 ack, int len) +static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int flag = 0; - u32 seq = 0; - u32 seq_rtt = 0; - - if(sk->state == TCP_CLOSE) - return 1; /* Dead, can't ack any more so why bother */ + u32 prior_snd_una = tp->snd_una; + u32 ack_seq = TCP_SKB_CB(skb)->seq; + u32 ack = TCP_SKB_CB(skb)->ack_seq; + u32 prior_in_flight; + int prior_packets; /* If the ack is newer than sent or older than previous acks * then we can probably ignore it. */ - if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una)) + if (after(ack, tp->snd_nxt)) goto uninteresting_ack; - /* If there is data set flag 1 */ - if (len != th->doff*4) - flag |= FLAG_DATA; + if (before(ack, prior_snd_una)) + goto old_ack; - /* Update our send window. */ - - /* This is the window update code as per RFC 793 - * snd_wl{1,2} are used to prevent unordered - * segments from shrinking the window - */ - if (before(tp->snd_wl1, ack_seq) || - (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) { - u32 nwin = ntohs(th->window) << tp->snd_wscale; + if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) { + /* Window is constant, pure forward advance. + * No more checks are required. + * Note, we use the fact that SND.UNA>=SND.WL2. + */ + tcp_update_wl(tp, ack, ack_seq); + tp->snd_una = ack; + flag |= FLAG_WIN_UPDATE; - if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) { - flag |= FLAG_WIN_UPDATE; - if (tp->snd_wnd != nwin) { - tp->snd_wnd = nwin; + NET_INC_STATS_BH(TCPHPAcks); + } else { + if (ack_seq != TCP_SKB_CB(skb)->end_seq) + flag |= FLAG_DATA; + else + NET_INC_STATS_BH(TCPPureAcks); - /* Note, it is the only place, where - * fast path is recovered for sending TCP. - */ - if (skb_queue_len(&tp->out_of_order_queue) == 0 && -#ifdef TCP_FORMAL_WINDOW - tcp_receive_window(tp) && -#endif - !tp->urg_data) - tcp_fast_path_on(tp); + flag |= tcp_ack_update_window(sk, tp, skb, ack, ack_seq); - if (nwin > tp->max_window) { - tp->max_window = nwin; - tcp_sync_mss(sk, tp->pmtu_cookie); - } - } + if (TCP_SKB_CB(skb)->sacked) + flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); - tp->snd_wl1 = ack_seq; - tp->snd_wl2 = ack; - } + if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) + flag |= FLAG_ECE; } - /* BEWARE! From this place and until return from this function - * snd_nxt and snd_wnd are out of sync. All the routines, called - * from here must get "ack" as argument or they should not depend - * on right edge of window. It is _UGLY_. It cries to be fixed. --ANK - */ - /* We passed data and got it acked, remove any soft error * log. Something worked... */ sk->err_soft = 0; - tp->probes_out = 0; tp->rcv_tstamp = tcp_time_stamp; + if ((prior_packets = tp->packets_out) == 0) + goto no_queue; - /* See if we can take anything off of the retransmit queue. 
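The acceptability test above is the RFC 793 guard against stale window advertisements, restated under the helper's assumption that snd_una <= ack <= snd_nxt: take the new window only if the ACK moves forward, or comes from a newer segment, or from the same segment with a larger window. A standalone sketch over wrap-safe 32-bit sequence numbers:

#include <stdint.h>
#include <stdio.h>

/* Wrap-safe comparison: is a strictly after b? */
static int seq_after(uint32_t a, uint32_t b)
{
        return (int32_t)(b - a) < 0;
}

static int may_update_window(uint32_t snd_una, uint32_t snd_wl1,
                             uint32_t snd_wnd, uint32_t ack,
                             uint32_t ack_seq, uint32_t nwin)
{
        return seq_after(ack, snd_una) ||
               seq_after(ack_seq, snd_wl1) ||
               (ack_seq == snd_wl1 && nwin > snd_wnd);
}

int main(void)
{
        /* Old duplicate: same ack, older sequence, smaller window. */
        printf("%d\n", may_update_window(1000, 500, 8192, 1000, 400, 4096));
        /* Fresh ACK advancing snd_una. */
        printf("%d\n", may_update_window(1000, 500, 8192, 1500, 600, 8192));
        return 0;
}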
*/ - flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt); - - /* If this ack opens up a zero window, clear backoff. It was - * being used to time the probes, and is probably far higher than - * it needs to be for normal retransmission. - */ - if (tcp_timer_is_set(sk, TCP_TIME_PROBE0)) - tcp_ack_probe(sk, ack); - - /* We must do this here, before code below clears out important - * state contained in tp->fackets_out and tp->retransmits. -DaveM - */ - if (should_advance_cwnd(tp, flag)) - tcp_cong_avoid(tp); - - /* If we have a timestamp, we always do rtt estimates. */ - if (tp->saw_tstamp) { - tcp_ack_saw_tstamp(sk, tp, seq, ack, flag); - } else { - /* If we were retransmiting don't count rtt estimate. */ - if (tp->retransmits) { - if (tp->packets_out == 0) { - tp->retransmits = 0; - tp->fackets_out = 0; - tp->retrans_out = 0; - } - } else { - /* We don't have a timestamp. Can only use - * packets that are not retransmitted to determine - * rtt estimates. Also, we must not reset the - * backoff for rto until we get a non-retransmitted - * packet. This allows us to deal with a situation - * where the network delay has increased suddenly. - * I.e. Karn's algorithm. (SIGCOMM '87, p5.) - */ - if (flag & (FLAG_DATA_ACKED|FLAG_SYN_ACKED)) { - if(!(flag & FLAG_RETRANS_DATA_ACKED)) { - tp->backoff = 0; - tcp_rtt_estimator(tp, seq_rtt); - tcp_set_rto(tp); - tcp_bound_rto(tp); - } - } - } - } + prior_in_flight = tcp_packets_in_flight(tp); - if (tp->packets_out) { - if (flag & FLAG_DATA_ACKED) - tcp_ack_packets_out(sk, tp); + /* See if we can take anything off of the retransmit queue. */ + flag |= tcp_clean_rtx_queue(sk); + + if (tcp_ack_is_dubious(tp, flag)) { + /* Advanve CWND, if state allows this. */ + if ((flag&FLAG_DATA_ACKED) && prior_in_flight >= tp->snd_cwnd && + tcp_may_raise_cwnd(tp, flag)) + tcp_cong_avoid(tp); + tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); } else { - tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS); + if ((flag&FLAG_DATA_ACKED) && prior_in_flight >= tp->snd_cwnd) + tcp_cong_avoid(tp); } - flag &= (FLAG_DATA | FLAG_WIN_UPDATE); - if ((ack == tp->snd_una && tp->packets_out && flag == 0) || - (tp->high_seq != 0)) { - tcp_fast_retrans(sk, ack, flag); - } else { - /* Clear any aborted fast retransmit starts. */ - tp->dup_acks = 0; - } - /* It is not a brain fart, I thought a bit now. 8) - * - * Forward progress is indicated, if: - * 1. the ack acknowledges new data. - * 2. or the ack is duplicate, but it is caused by new segment - * arrival. This case is filtered by: - * - it contains no data, syn or fin. - * - it does not update window. - * 3. or new SACK. It is difficult to check, so that we ignore it. - * - * Forward progress is also indicated by arrival new data, - * which was caused by window open from our side. This case is more - * difficult and it is made (alas, incorrectly) in tcp_data_queue(). - * --ANK (990513) - */ - if (ack != tp->snd_una || (flag == 0 && !th->fin)) + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) dst_confirm(sk->dst_cache); - if (ack != tp->snd_una) - tp->sorry = 1; - - /* Remember the highest ack received. 
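The rewritten tcp_ack also gates window growth: cwnd is advanced only when the ACK acknowledged new data and the flight size before the ACK had actually reached cwnd, so receiver- or application-limited connections do not grow their window for free. A hedged sketch of that gating wrapped around the classic increase rule; the in-kernel tcp_cong_avoid body is not shown in this hunk, so the increment below is the textbook version:

#include <stdio.h>

struct cc {
        unsigned int cwnd, cwnd_cnt, ssthresh;
};

/* Exponential growth below ssthresh, then roughly one extra segment
 * per round trip via a packet counter.
 */
static void cong_avoid(struct cc *c)
{
        if (c->cwnd <= c->ssthresh) {
                c->cwnd++;                      /* slow start */
        } else if (++c->cwnd_cnt >= c->cwnd) {  /* congestion avoidance */
                c->cwnd++;
                c->cwnd_cnt = 0;
        }
}

/* Grow only when new data was acked and we were really cwnd-limited. */
static void maybe_grow(struct cc *c, int data_acked, unsigned int prior_in_flight)
{
        if (data_acked && prior_in_flight >= c->cwnd)
                cong_avoid(c);
}

int main(void)
{
        struct cc c = { .cwnd = 2, .cwnd_cnt = 0, .ssthresh = 8 };

        for (int i = 0; i < 10; i++)
                maybe_grow(&c, 1, c.cwnd);      /* always cwnd-limited here */
        printf("cwnd after 10 acks: %u\n", c.cwnd);
        return 0;
}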
*/ - tp->snd_una = ack; return 1; -uninteresting_ack: - SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt); - return 0; -} - -int tcp_paws_check(struct tcp_opt *tp, int rst) -{ - if ((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) - return 0; - if (xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS) - return 0; +no_queue: + tp->probes_out = 0; - /* RST segments are not recommended to carry timestamp, - and, if they do, it is recommended to ignore PAWS because - "their cleanup function should take precedence over timestamps." - Certainly, it is mistake. It is necessary to understand the reasons - of this constraint to relax it: if peer reboots, clock may go - out-of-sync and half-open connections will not be reset. - Actually, the problem would be not existing if all - the implementations followed draft about maintaining clock - via reboots. Linux-2.2 DOES NOT! - - However, we can relax time bounds for RST segments to MSL. + /* If this ack opens up a zero window, clear backoff. It was + * being used to time the probes, and is probably far higher than + * it needs to be for normal retransmission. */ - if (rst && xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_MSL) - return 0; + if (tp->send_head) + tcp_ack_probe(sk); return 1; -} - -static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) -{ - if (seq == s_win) - return 1; - if (after(end_seq, s_win) && before(seq, e_win)) - return 1; - return (seq == e_win && seq == end_seq); -} -/* New-style handling of TIME_WAIT sockets. */ +old_ack: + if (TCP_SKB_CB(skb)->sacked) + tcp_sacktag_write_queue(sk, skb, prior_snd_una); -/* Must be called with locally disabled BHs. */ -void tcp_timewait_kill(struct tcp_tw_bucket *tw) -{ - struct tcp_ehash_bucket *ehead; - struct tcp_bind_hashbucket *bhead; - struct tcp_bind_bucket *tb; +uninteresting_ack: + SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt); + return 0; +} - /* Unlink from established hashes. */ - ehead = &tcp_ehash[tw->hashent]; - write_lock(&ehead->lock); - if (!tw->pprev) { - write_unlock(&ehead->lock); - return; - } - if(tw->next) - tw->next->pprev = tw->pprev; - *(tw->pprev) = tw->next; - tw->pprev = NULL; - write_unlock(&ehead->lock); - - /* Disassociate with bind bucket. */ - bhead = &tcp_bhash[tcp_bhashfn(tw->num)]; - spin_lock(&bhead->lock); - if ((tb = tw->tb) != NULL) { - if(tw->bind_next) - tw->bind_next->bind_pprev = tw->bind_pprev; - *(tw->bind_pprev) = tw->bind_next; - tw->tb = NULL; - if (tb->owners == NULL) { - if (tb->next) - tb->next->pprev = tb->pprev; - *(tb->pprev) = tb->next; - kmem_cache_free(tcp_bucket_cachep, tb); - } - } - spin_unlock(&bhead->lock); -#ifdef INET_REFCNT_DEBUG - if (atomic_read(&tw->refcnt) != 1) { - printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw, atomic_read(&tw->refcnt)); - } -#endif - tcp_tw_put(tw); -} - -/* - * * Main purpose of TIME-WAIT state is to close connection gracefully, - * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN - * (and, probably, tail of data) and one or more our ACKs are lost. - * * What is TIME-WAIT timeout? It is associated with maximal packet - * lifetime in the internet, which results in wrong conclusion, that - * it is set to catch "old duplicate segments" wandering out of their path. - * It is not quite correct. This timeout is calculated so that it exceeds - * maximal retransmision timeout enough to allow to lose one (or more) - * segments sent by peer and our ACKs. This time may be calculated from RTO. 
- * * When TIME-WAIT socket receives RST, it means that another end - * finally closed and we are allowed to kill TIME-WAIT too. - * * Second purpose of TIME-WAIT is catching old duplicate segments. - * Well, certainly it is pure paranoia, but if we load TIME-WAIT - * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs. - * * If we invented some more clever way to catch duplicates - * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs. - * - * The algorithm below is based on FORMAL INTERPRETATION of RFCs. - * When you compare it to RFCs, please, read section SEGMENT ARRIVES - * from the very beginning. - * - * NOTE. With recycling (and later with fin-wait-2) TW bucket - * is _not_ stateless. It means, that strictly speaking we must - * spinlock it. I do not want! Well, probability of misbehaviour - * is ridiculously low and, seems, we could use some mb() tricks - * to avoid misread sequence numbers, states etc. --ANK +/* Look for tcp options. Normally only called on SYN and SYNACK packets. + * But, this can also be called on packets in the established flow when + * the fast version below fails. */ -enum tcp_tw_status -tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, - struct tcphdr *th, unsigned len) +void tcp_parse_options(struct sk_buff *skb, struct tcp_opt *tp) { - struct tcp_opt tp; - int paws_reject = 0; - - tp.saw_tstamp = 0; - if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) { - tcp_parse_options(NULL, th, &tp, 0); - - if (tp.saw_tstamp) { - tp.ts_recent = tw->ts_recent; - tp.ts_recent_stamp = tw->ts_recent_stamp; - paws_reject = tcp_paws_check(&tp, th->rst); - } - } - - if (tw->substate == TCP_FIN_WAIT2) { - /* Just repeat all the checks of tcp_rcv_state_process() */ - - /* Out of window, send ACK */ - if (paws_reject || - !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tw->rcv_nxt, tw->rcv_nxt + tw->rcv_wnd)) - return TCP_TW_ACK; - - if (th->rst) - goto kill; - - if (th->syn && TCP_SKB_CB(skb)->seq != tw->syn_seq) - goto kill_with_rst; + unsigned char *ptr; + struct tcphdr *th = skb->h.th; + int length=(th->doff*4)-sizeof(struct tcphdr); - /* Dup ACK? */ - if (!after(TCP_SKB_CB(skb)->end_seq, tw->rcv_nxt) || - TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { - tcp_tw_put(tw); - return TCP_TW_SUCCESS; - } + ptr = (unsigned char *)(th + 1); + tp->saw_tstamp = 0; - /* New data or FIN. If new data arrive after half-duplex close, - * reset. - */ - if (!th->fin || TCP_SKB_CB(skb)->end_seq != tw->rcv_nxt+1) { -kill_with_rst: - tcp_tw_deschedule(tw); - tcp_timewait_kill(tw); - tcp_tw_put(tw); - return TCP_TW_RST; - } + while(length>0) { + int opcode=*ptr++; + int opsize; - /* FIN arrived, enter true time-wait state. 
*/ - tw->substate = TCP_TIME_WAIT; - tw->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if (tp.saw_tstamp) { - tw->ts_recent_stamp = xtime.tv_sec; - tw->ts_recent = tp.rcv_tsval; - } + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize=*ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + break; /* don't parse partial options */ + switch(opcode) { + case TCPOPT_MSS: + if(opsize==TCPOLEN_MSS && th->syn) { + u16 in_mss = ntohs(*(__u16 *)ptr); + if (in_mss) { + if (tp->user_mss && tp->user_mss < in_mss) + in_mss = tp->user_mss; + tp->mss_clamp = in_mss; + } + } + break; + case TCPOPT_WINDOW: + if(opsize==TCPOLEN_WINDOW && th->syn) + if (sysctl_tcp_window_scaling) { + tp->wscale_ok = 1; + tp->snd_wscale = *(__u8 *)ptr; + if(tp->snd_wscale > 14) { + if(net_ratelimit()) + printk("tcp_parse_options: Illegal window " + "scaling value %d >14 received.", + tp->snd_wscale); + tp->snd_wscale = 14; + } + } + break; + case TCPOPT_TIMESTAMP: + if(opsize==TCPOLEN_TIMESTAMP) { + if (sysctl_tcp_timestamps) { + tp->tstamp_ok = 1; + tp->saw_tstamp = 1; + tp->rcv_tsval = ntohl(*(__u32 *)ptr); + tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4)); + } + } + break; + case TCPOPT_SACK_PERM: + if(opsize==TCPOLEN_SACK_PERM && th->syn) { + if (sysctl_tcp_sack) { + tp->sack_ok = 1; + tcp_sack_reset(tp); + } + } + break; - /* I am shamed, but failed to make it more elegant. - * Yes, it is direct reference to IP, which is impossible - * to generalize to IPv6. Taking into account that IPv6 - * do not undertsnad recycling in any case, it not - * a big problem in practice. --ANK */ - if (tw->family == AF_INET && - sysctl_tcp_tw_recycle && tw->ts_recent_stamp && - tcp_v4_tw_remember_stamp(tw)) - tcp_tw_schedule(tw, tw->timeout); - else - tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); - return TCP_TW_ACK; + case TCPOPT_SACK: + if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && + !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) && + tp->sack_ok) { + TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th; + } + }; + ptr+=opsize-2; + length-=opsize; + }; } +} - /* - * Now real TIME-WAIT state. - * - * RFC 1122: - * "When a connection is [...] on TIME-WAIT state [...] - * [a TCP] MAY accept a new SYN from the remote TCP to - * reopen the connection directly, if it: - * - * (1) assigns its initial sequence number for the new - * connection to be larger than the largest sequence - * number it used on the previous connection incarnation, - * and - * - * (2) returns to TIME-WAIT state if the SYN turns out - * to be an old duplicate". - */ - - if (!paws_reject && - (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq && - TCP_SKB_CB(skb)->seq == tw->rcv_nxt)) { - /* In window segment, it may be only reset or bare ack. */ - - if (th->rst) { - /* This is TIME_WAIT assasination, in two flavors. - * Oh well... nobody has a sufficient solution to this - * protocol bug yet. - */ - if (sysctl_tcp_rfc1337 == 0) { -kill: - tcp_tw_deschedule(tw); - tcp_timewait_kill(tw); - tcp_tw_put(tw); - return TCP_TW_SUCCESS; - } - } - tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); - - if (tp.saw_tstamp) { - tw->ts_recent = tp.rcv_tsval; - tw->ts_recent_stamp = xtime.tv_sec; +/* Fast parse options. This hopes to only see timestamps. + * If it is wrong it falls back on tcp_parse_options(). 
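The rewritten tcp_parse_options() above is a classic kind/length option walk: EOL ends parsing, NOP is a single byte, and every other option carries a length that must be at least 2 and must fit in the remaining space. A compact standalone sketch of the same walk over a raw option buffer, illustrative only, with option kinds hard-coded and no kernel types:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

#define OPT_EOL  0
#define OPT_NOP  1
#define OPT_MSS  2

/* Walk a TCP option block of 'len' bytes, printing MSS if present. EOL
 * stops the scan, NOP consumes one byte, anything else is skipped by its
 * length field after validating it against the remaining space.
 */
static void walk_options(const uint8_t *p, size_t len)
{
        while (len > 0) {
                uint8_t kind = *p++;
                uint8_t size;

                if (kind == OPT_EOL)
                        return;
                if (kind == OPT_NOP) {
                        len--;
                        continue;
                }
                if (len < 2)
                        return;                 /* truncated option header */
                size = p[0];
                if (size < 2 || size > len)
                        return;                 /* silly or partial option */
                if (kind == OPT_MSS && size == 4) {
                        uint16_t mss = (uint16_t)(p[1] << 8 | p[2]);
                        printf("MSS option: %u\n", mss);
                }
                p   += size - 1;                /* kind byte already consumed */
                len -= size;
        }
}

int main(void)
{
        /* NOP, NOP, MSS(kind=2,len=4,value=1460), EOL */
        const uint8_t opts[] = { 1, 1, 2, 4, 0x05, 0xb4, 0 };
        walk_options(opts, sizeof(opts));
        return 0;
}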
+ */ +static __inline__ int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, struct tcp_opt *tp) +{ + if (th->doff == sizeof(struct tcphdr)>>2) { + tp->saw_tstamp = 0; + return 0; + } else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { + __u32 *ptr = (__u32 *)(th + 1); + if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { + tp->saw_tstamp = 1; + ++ptr; + tp->rcv_tsval = ntohl(*ptr); + ++ptr; + tp->rcv_tsecr = ntohl(*ptr); + return 1; } - - tcp_tw_put(tw); - return TCP_TW_SUCCESS; - } - - /* Out of window segment. - - All the segments are ACKed immediately. - - The only exception is new SYN. We accept it, if it is - not old duplicate and we are not in danger to be killed - by delayed old duplicates. RFC check is that it has - newer sequence number works at rates <40Mbit/sec. - However, if paws works, it is reliable AND even more, - we even may relax silly seq space cutoff. - - RED-PEN: we violate main RFC requirement, if this SYN will appear - old duplicate (i.e. we receive RST in reply to SYN-ACK), - we must return socket to time-wait state. It is not good, - but not fatal yet. - */ - - if (th->syn && !th->rst && !th->ack && !paws_reject && - (after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt) || - (tp.saw_tstamp && (s32)(tw->ts_recent - tp.rcv_tsval) < 0))) { - u32 isn = tw->snd_nxt + 2; - if (isn == 0) - isn++; - TCP_SKB_CB(skb)->when = isn; - return TCP_TW_SYN; } + tcp_parse_options(skb, tp); + return 1; +} - if (paws_reject) - NET_INC_STATS_BH(PAWSEstabRejected); +extern __inline__ void +tcp_store_ts_recent(struct tcp_opt *tp) +{ + tp->ts_recent = tp->rcv_tsval; + tp->ts_recent_stamp = xtime.tv_sec; +} - if(!th->rst) { - /* In this case we must reset the TIMEWAIT timer. +extern __inline__ void +tcp_replace_ts_recent(struct tcp_opt *tp, u32 seq) +{ + if (tp->saw_tstamp && !after(seq, tp->rcv_wup)) { + /* PAWS bug workaround wrt. ACK frames, the PAWS discard + * extra check below makes sure this can only happen + * for pure ACK frames. -DaveM * - * If it is ACKless SYN it may be both old duplicate - * and new good SYN with random sequence number <rcv_nxt. - * Do not reschedule in the last case. + * Not only, also it occurs for expired timestamps. */ - if (paws_reject || th->ack) - tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); - /* Send ACK. Note, we do not put the bucket, - * it will be released by caller. - */ - return TCP_TW_ACK; + if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0 || + xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS) + tcp_store_ts_recent(tp); } - tcp_tw_put(tw); - return TCP_TW_SUCCESS; } -/* Enter the time wait state. This is called with locally disabled BH. - * Essentially we whip up a timewait bucket, copy the - * relevant info into it from the SK, and mess with hash chains - * and list linkage. +/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM + * + * It is not fatal. If this ACK does _not_ change critical state (seqs, window) + * it can pass through stack. So, the following predicate verifies that + * this segment is not used for anything but congestion avoidance or + * fast retransmit. Moreover, we even are able to eliminate most of such + * second order effects, if we apply some small "replay" window (~RTO) + * to timestamp space. 
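tcp_fast_parse_options() above recognizes the common case by comparing the first option word against the aligned timestamp layout NOP, NOP, TIMESTAMP, len=10. A small sketch, not from the patch, that builds the same pattern and shows the header length the doff check corresponds to; the constants are defined locally for the demo:

#include <stdint.h>
#include <stdio.h>

#define TCPOPT_NOP        1
#define TCPOPT_TIMESTAMP  8
#define TCPOLEN_TIMESTAMP 10

int main(void)
{
        /* Host-order value of the option word the fast path expects: two
         * NOP pad bytes, the timestamp kind, and its length (10). On the
         * wire this is the byte sequence 01 01 08 0a.
         */
        uint32_t pattern = (TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                           (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP;

        printf("expected option word: 0x%08x\n", (unsigned)pattern);

        /* 12 aligned option bytes on top of the 20-byte base header give a
         * data offset of 8 words, which is what the doff comparison in
         * tcp_fast_parse_options() checks for.
         */
        printf("doff for tstamp-only header: %d words\n", 5 + 12 / 4);
        return 0;
}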
+ * + * All these measures still do not guarantee that we reject wrapped ACKs + * on networks with high bandwidth, when sequence space is recycled fastly, + * but it guarantees that such events will be very rare and do not affect + * connection seriously. This doesn't look nice, but alas, PAWS is really + * buggy extension. + * + * [ Later note. Even worse! It is buggy for segments _with_ data. RFC + * states that events when retransmit arrives after original data are rare. + * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is + * the biggest problem on large power networks even with minor reordering. + * OK, let's give it small replay window. If peer clock is even 1hz, it is safe + * up to bandwidth of 18Gigabit/sec. 8) ] */ -static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) + +static int tcp_disordered_ack(struct tcp_opt *tp, struct sk_buff *skb) { - struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->hashent]; - struct tcp_bind_hashbucket *bhead; - struct sock **head, *sktw; + struct tcphdr *th = skb->h.th; + u32 seq = TCP_SKB_CB(skb)->seq; + u32 ack = TCP_SKB_CB(skb)->ack_seq; - write_lock(&ehead->lock); + return (/* 1. Pure ACK with correct sequence number. */ + (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) && - /* Step 1: Remove SK from established hash. */ - if (sk->pprev) { - if(sk->next) - sk->next->pprev = sk->pprev; - *sk->pprev = sk->next; - sk->pprev = NULL; - sock_prot_dec_use(sk->prot); - } + /* 2. ... and duplicate ACK. */ + ack == tp->snd_una && - /* Step 2: Hash TW into TIMEWAIT half of established hash table. */ - head = &(ehead + tcp_ehash_size)->chain; - sktw = (struct sock *)tw; - if((sktw->next = *head) != NULL) - (*head)->pprev = &sktw->next; - *head = sktw; - sktw->pprev = head; - atomic_inc(&tw->refcnt); + /* 3. ... and does not update window. */ + !tcp_may_update_window(tp, ack, seq, ntohs(th->window)<<tp->snd_wscale) && - write_unlock(&ehead->lock); + /* 4. ... and sits in replay window. */ + (s32)(tp->ts_recent - tp->rcv_tsval) <= (tp->rto*1024)/HZ); +} - /* Step 3: Put TW into bind hash. Original socket stays there too. - Note, that any socket with sk->num!=0 MUST be bound in binding - cache, even if it is closed. - */ - bhead = &tcp_bhash[tcp_bhashfn(sk->num)]; - spin_lock(&bhead->lock); - tw->tb = (struct tcp_bind_bucket *)sk->prev; - BUG_TRAP(sk->prev!=NULL); - if ((tw->bind_next = tw->tb->owners) != NULL) - tw->tb->owners->bind_pprev = &tw->bind_next; - tw->tb->owners = (struct sock*)tw; - tw->bind_pprev = &tw->tb->owners; - spin_unlock(&bhead->lock); -} - -/* - * Move a socket to time-wait or dead fin-wait-2 state. - */ -void tcp_time_wait(struct sock *sk, int state, int timeo) +extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct sk_buff *skb) { - struct tcp_tw_bucket *tw = NULL; - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int recycle_ok = 0; - - if (sysctl_tcp_tw_recycle && tp->ts_recent_stamp) - recycle_ok = tp->af_specific->remember_stamp(sk); - - if (tcp_tw_count < sysctl_tcp_max_tw_buckets) - tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); - - if(tw != NULL) { - int rto = (tp->rto<<2) - (tp->rto>>1); - - /* Give us an identity. 
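tcp_paws_discard() above rejects a segment only when its timestamp is older than ts_recent by more than a small replay window, the stored timestamp is still recent enough to be trusted (the 24-day rule), and the segment is not one of the harmless disordered ACKs described in the comment. A standalone sketch of that decision with the window constants assumed locally; the disordered-ACK exemption is omitted for brevity:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* Local stand-ins for the kernel constants (assumed values for the demo). */
#define PAWS_WINDOW   1                    /* replay slack, timestamp ticks */
#define PAWS_24DAYS   (60 * 60 * 24 * 24)  /* ts_recent validity, seconds   */

struct ts_state {
        uint32_t ts_recent;        /* last timestamp accepted on this flow */
        time_t   ts_recent_stamp;  /* wall-clock second it was recorded    */
};

/* Return 1 if the segment's timestamp should cause a PAWS discard. */
static int paws_discard(const struct ts_state *st, uint32_t rcv_tsval, time_t now)
{
        return (int32_t)(st->ts_recent - rcv_tsval) > PAWS_WINDOW &&
               now < st->ts_recent_stamp + PAWS_24DAYS;
}

int main(void)
{
        struct ts_state st = { .ts_recent = 1000, .ts_recent_stamp = time(NULL) };

        printf("tsval 990  -> discard=%d\n", paws_discard(&st, 990, time(NULL)));
        printf("tsval 1005 -> discard=%d\n", paws_discard(&st, 1005, time(NULL)));
        return 0;
}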
*/ - tw->daddr = sk->daddr; - tw->rcv_saddr = sk->rcv_saddr; - tw->bound_dev_if= sk->bound_dev_if; - tw->num = sk->num; - tw->state = TCP_TIME_WAIT; - tw->substate = state; - tw->sport = sk->sport; - tw->dport = sk->dport; - tw->family = sk->family; - tw->reuse = sk->reuse; - tw->rcv_wscale = tp->rcv_wscale; - atomic_set(&tw->refcnt, 0); - - tw->hashent = sk->hashent; - tw->rcv_nxt = tp->rcv_nxt; - tw->snd_nxt = tp->snd_nxt; - tw->rcv_wnd = tcp_receive_window(tp); - tw->syn_seq = tp->syn_seq; - tw->ts_recent = tp->ts_recent; - tw->ts_recent_stamp= tp->ts_recent_stamp; - tw->pprev_death = NULL; - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - if(tw->family == PF_INET6) { - memcpy(&tw->v6_daddr, - &sk->net_pinfo.af_inet6.daddr, - sizeof(struct in6_addr)); - memcpy(&tw->v6_rcv_saddr, - &sk->net_pinfo.af_inet6.rcv_saddr, - sizeof(struct in6_addr)); - } + return ((s32)(tp->ts_recent - tp->rcv_tsval) > TCP_PAWS_WINDOW && + xtime.tv_sec < tp->ts_recent_stamp + TCP_PAWS_24DAYS && + !tcp_disordered_ack(tp, skb)); +} + +static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) +{ + u32 end_window = tp->rcv_wup + tp->rcv_wnd; +#ifdef TCP_FORMAL_WINDOW + u32 rcv_wnd = tcp_receive_window(tp); +#else + u32 rcv_wnd = tp->rcv_wnd; #endif - /* Linkage updates. */ - __tcp_tw_hashdance(sk, tw); - /* Get the TIME_WAIT timeout firing. */ - if (timeo < rto) - timeo = rto; + if (rcv_wnd && + after(end_seq, tp->rcv_nxt) && + before(seq, end_window)) + return 1; + if (seq != end_window) + return 0; + return (seq == end_seq); +} - if (recycle_ok) { - tw->timeout = rto; - } else { - tw->timeout = TCP_TIMEWAIT_LEN; - if (state == TCP_TIME_WAIT) - timeo = TCP_TIMEWAIT_LEN; - } +/* This functions checks to see if the tcp header is actually acceptable. + * + * Actually, our check is seriously broken, we must accept RST,ACK,URG + * even on zero window effectively trimming data. It is RFC, guys. + * But our check is so beautiful, that I do not want to repair it + * now. However, taking into account those stupid plans to start to + * send some texts with RST, we have to handle at least this case. --ANK + */ +extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq, int rst) +{ +#ifdef TCP_FORMAL_WINDOW + u32 rcv_wnd = tcp_receive_window(tp); +#else + u32 rcv_wnd = tp->rcv_wnd; +#endif + if (seq == tp->rcv_nxt) + return (rcv_wnd || (end_seq == seq) || rst); - tcp_tw_schedule(tw, timeo); - } else { - /* Sorry, if we're out of memory, just CLOSE this - * socket up. We've got bigger problems than - * non-graceful socket closings. - */ - if (net_ratelimit()) - printk(KERN_INFO "TCP: time wait bucket table overflow\n"); + return __tcp_sequence(tp, seq, end_seq); +} + +/* When we get a reset we do this. */ +static void tcp_reset(struct sock *sk) +{ + /* We want the right error as BSD sees it (and indeed as we do). */ + switch (sk->state) { + case TCP_SYN_SENT: + sk->err = ECONNREFUSED; + break; + case TCP_CLOSE_WAIT: + sk->err = EPIPE; + break; + case TCP_CLOSE: + return; + default: + sk->err = ECONNRESET; } - tcp_update_metrics(sk); + if (!sk->dead) + sk->error_report(sk); + tcp_done(sk); } @@ -1611,22 +2179,22 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) * * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. 
*/ - static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); tp->fin_seq = TCP_SKB_CB(skb)->end_seq; - tp->ack.pending = 1; - tp->ack.quick = 0; + tcp_schedule_ack(tp); sk->shutdown |= RCV_SHUTDOWN; + sk->done = 1; switch(sk->state) { case TCP_SYN_RECV: case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); + tp->ack.pingpong = 1; break; case TCP_CLOSE_WAIT: @@ -1644,6 +2212,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) * happens, we must ack the received FIN and * enter the CLOSING state. */ + tcp_send_ack(sk); tcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: @@ -1664,7 +2233,8 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) */ __skb_queue_purge(&tp->out_of_order_queue); if (tp->sack_ok) - tp->num_sacks = 0; + tcp_sack_reset(tp); + tcp_mem_reclaim(sk); if (!sk->dead) { sk->state_change(sk); @@ -1677,51 +2247,90 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) } } +static __inline__ int +tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) +{ + if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { + if (before(seq, sp->start_seq)) + sp->start_seq = seq; + if (after(end_seq, sp->end_seq)) + sp->end_seq = end_seq; + return 1; + } + return 0; +} + +static __inline__ void tcp_dsack_set(struct tcp_opt *tp, u32 seq, u32 end_seq) +{ + if (tp->sack_ok && sysctl_tcp_dsack) { + if (before(seq, tp->rcv_nxt)) + NET_INC_STATS_BH(TCPDSACKOldSent); + else + NET_INC_STATS_BH(TCPDSACKOfoSent); + + tp->dsack = 1; + tp->duplicate_sack[0].start_seq = seq; + tp->duplicate_sack[0].end_seq = end_seq; + tp->eff_sacks = min(tp->num_sacks+1, 4-tp->tstamp_ok); + } +} + +static __inline__ void tcp_dsack_extend(struct tcp_opt *tp, u32 seq, u32 end_seq) +{ + if (!tp->dsack) + tcp_dsack_set(tp, seq, end_seq); + else + tcp_sack_extend(tp->duplicate_sack, seq, end_seq); +} + +static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + NET_INC_STATS_BH(DelayedACKLost); + tcp_enter_quickack_mode(tp); + + if (tp->sack_ok && sysctl_tcp_dsack) { + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) + end_seq = tp->rcv_nxt; + tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, end_seq); + } + } + + tcp_send_ack(sk); +} + /* These routines update the SACK block as out-of-order packets arrive or * in-order packets close up the sequence space. */ -static void tcp_sack_maybe_coalesce(struct tcp_opt *tp, struct tcp_sack_block *sp) +static void tcp_sack_maybe_coalesce(struct tcp_opt *tp) { - int this_sack, num_sacks = tp->num_sacks; - struct tcp_sack_block *swalk = &tp->selective_acks[0]; + int this_sack; + struct tcp_sack_block *sp = &tp->selective_acks[0]; + struct tcp_sack_block *swalk = sp+1; - /* If more than one SACK block, see if the recent change to SP eats into + /* See if the recent change to the first SACK eats into * or hits the sequence space of other SACK blocks, if so coalesce. */ - if(num_sacks != 1) { - for(this_sack = 0; this_sack < num_sacks; this_sack++, swalk++) { - if(swalk == sp) - continue; + for (this_sack = 1; this_sack < tp->num_sacks; ) { + if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) { + int i; - /* First case, bottom of SP moves into top of the - * sequence space of SWALK. 
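The eff_sacks arithmetic above (min(num_sacks + dsack, 4 - tstamp_ok)) falls out of the 40-byte TCP option space: each SACK block costs 8 bytes on top of a padded 4-byte SACK header, and the timestamp option consumes another 12 aligned bytes. A small sketch of that budget, using the usual option sizes as assumptions:

#include <stdio.h>

int main(void)
{
        const int option_space   = 40;  /* max TCP option bytes        */
        const int tstamp_aligned = 12;  /* NOP,NOP + 10-byte timestamp */
        const int sack_header    = 4;   /* NOP,NOP + kind + length     */
        const int sack_block     = 8;   /* one start_seq/end_seq pair  */

        for (int tstamp_ok = 0; tstamp_ok <= 1; tstamp_ok++) {
                int left = option_space - tstamp_ok * tstamp_aligned - sack_header;
                printf("tstamp_ok=%d -> max SACK blocks = %d\n",
                       tstamp_ok, left / sack_block);   /* 4 and 3 */
        }
        return 0;
}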
+ /* Zap SWALK, by moving every further SACK up by one slot. + * Decrease num_sacks. */ - if(between(sp->start_seq, swalk->start_seq, swalk->end_seq)) { - sp->start_seq = swalk->start_seq; - goto coalesce; - } - /* Second case, top of SP moves into bottom of the - * sequence space of SWALK. - */ - if(between(sp->end_seq, swalk->start_seq, swalk->end_seq)) { - sp->end_seq = swalk->end_seq; - goto coalesce; - } + tp->num_sacks--; + tp->eff_sacks = min(tp->num_sacks+tp->dsack, 4-tp->tstamp_ok); + for(i=this_sack; i < tp->num_sacks; i++) + sp[i] = sp[i+1]; + continue; } + this_sack++, swalk++; } - /* SP is the only SACK, or no coalescing cases found. */ - return; - -coalesce: - /* Zap SWALK, by moving every further SACK up by one slot. - * Decrease num_sacks. - */ - for(; this_sack < num_sacks-1; this_sack++, swalk++) { - struct tcp_sack_block *next = (swalk + 1); - swalk->start_seq = next->start_seq; - swalk->end_seq = next->end_seq; - } - tp->num_sacks--; } static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) @@ -1737,151 +2346,117 @@ static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sa sack2->end_seq = tmp; } -static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb) +static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct tcp_sack_block *sp = &tp->selective_acks[0]; int cur_sacks = tp->num_sacks; + int this_sack; if (!cur_sacks) goto new_sack; - /* Optimize for the common case, new ofo frames arrive - * "in order". ;-) This also satisfies the requirements - * of RFC2018 about ordering of SACKs. - */ - if(sp->end_seq == TCP_SKB_CB(skb)->seq) { - sp->end_seq = TCP_SKB_CB(skb)->end_seq; - tcp_sack_maybe_coalesce(tp, sp); - } else if(sp->start_seq == TCP_SKB_CB(skb)->end_seq) { - /* Re-ordered arrival, in this case, can be optimized - * as well. - */ - sp->start_seq = TCP_SKB_CB(skb)->seq; - tcp_sack_maybe_coalesce(tp, sp); - } else { - struct tcp_sack_block *swap = sp + 1; - int this_sack, max_sacks = (tp->tstamp_ok ? 3 : 4); - - /* Oh well, we have to move things around. - * Try to find a SACK we can tack this onto. - */ - - for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) { - if((swap->end_seq == TCP_SKB_CB(skb)->seq) || - (swap->start_seq == TCP_SKB_CB(skb)->end_seq)) { - if(swap->end_seq == TCP_SKB_CB(skb)->seq) - swap->end_seq = TCP_SKB_CB(skb)->end_seq; - else - swap->start_seq = TCP_SKB_CB(skb)->seq; - tcp_sack_swap(sp, swap); - tcp_sack_maybe_coalesce(tp, sp); - return; - } - } - - /* Could not find an adjacent existing SACK, build a new one, - * put it at the front, and shift everyone else down. We - * always know there is at least one SACK present already here. - * - * If the sack array is full, forget about the last one. - */ - if (cur_sacks >= max_sacks) { - cur_sacks--; - tp->num_sacks--; - } - while(cur_sacks >= 1) { - struct tcp_sack_block *this = &tp->selective_acks[cur_sacks]; - struct tcp_sack_block *prev = (this - 1); - this->start_seq = prev->start_seq; - this->end_seq = prev->end_seq; - cur_sacks--; + for (this_sack=0; this_sack<cur_sacks; this_sack++, sp++) { + if (tcp_sack_extend(sp, seq, end_seq)) { + /* Rotate this_sack to the first one. */ + for (; this_sack>0; this_sack--, sp--) + tcp_sack_swap(sp, sp-1); + if (cur_sacks > 1) + tcp_sack_maybe_coalesce(tp); + return; } + } - new_sack: - /* Build the new head SACK, and we're done. 
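tcp_sack_new_ofo_skb() above keeps the SACK blocks as a tiny array ordered most-recent-first: an arriving range is merged into an existing block and rotated to the front, otherwise it is pushed at the head and the oldest block may be dropped. A compact standalone sketch of that policy over plain ranges, with hypothetical names, no wraparound handling, and the follow-up coalescing pass (tcp_sack_maybe_coalesce) omitted:

#include <stdint.h>
#include <stdio.h>

#define MAX_BLOCKS 4

struct block { uint32_t start, end; };

struct sack_state {
        struct block b[MAX_BLOCKS];
        int n;
};

/* Grow 'blk' to cover [start,end) if the two ranges touch or overlap. */
static int extend(struct block *blk, uint32_t start, uint32_t end)
{
        if (start > blk->end || blk->start > end)
                return 0;
        if (start < blk->start) blk->start = start;
        if (end > blk->end)     blk->end = end;
        return 1;
}

/* Record a newly received out-of-order range, most recent block first. */
static void new_ofo_range(struct sack_state *s, uint32_t start, uint32_t end)
{
        for (int i = 0; i < s->n; i++) {
                if (extend(&s->b[i], start, end)) {
                        /* Rotate the updated block to the front (RFC 2018). */
                        struct block hit = s->b[i];
                        for (; i > 0; i--)
                                s->b[i] = s->b[i - 1];
                        s->b[0] = hit;
                        return;
                }
        }
        /* No overlap: push a new head block, dropping the oldest if full. */
        if (s->n == MAX_BLOCKS)
                s->n--;
        for (int i = s->n; i > 0; i--)
                s->b[i] = s->b[i - 1];
        s->b[0] = (struct block){ start, end };
        s->n++;
}

int main(void)
{
        struct sack_state s = { .n = 0 };

        new_ofo_range(&s, 100, 200);
        new_ofo_range(&s, 300, 400);
        new_ofo_range(&s, 200, 300);   /* touches the first block: extended */

        for (int i = 0; i < s.n; i++)
                printf("block %d: [%u,%u)\n", i,
                       (unsigned)s.b[i].start, (unsigned)s.b[i].end);
        return 0;
}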
*/ - sp->start_seq = TCP_SKB_CB(skb)->seq; - sp->end_seq = TCP_SKB_CB(skb)->end_seq; - tp->num_sacks++; + /* Could not find an adjacent existing SACK, build a new one, + * put it at the front, and shift everyone else down. We + * always know there is at least one SACK present already here. + * + * If the sack array is full, forget about the last one. + */ + if (this_sack >= 4) { + this_sack--; + tp->num_sacks--; + sp--; } + for(; this_sack > 0; this_sack--, sp--) + *sp = *(sp-1); + +new_sack: + /* Build the new head SACK, and we're done. */ + sp->start_seq = seq; + sp->end_seq = end_seq; + tp->num_sacks++; + tp->eff_sacks = min(tp->num_sacks+tp->dsack, 4-tp->tstamp_ok); } -static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb) +/* RCV.NXT advances, some SACKs should be eaten. */ + +static void tcp_sack_remove(struct tcp_opt *tp) { struct tcp_sack_block *sp = &tp->selective_acks[0]; int num_sacks = tp->num_sacks; int this_sack; - /* This is an in order data segment _or_ an out-of-order SKB being - * moved to the receive queue, so we know this removed SKB will eat - * from the front of a SACK. - */ - for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) { - /* Check if the start of the sack is covered by skb. */ - if(!before(sp->start_seq, TCP_SKB_CB(skb)->seq) && - before(sp->start_seq, TCP_SKB_CB(skb)->end_seq)) - break; + /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ + if (skb_queue_len(&tp->out_of_order_queue) == 0) { + tp->num_sacks = 0; + tp->eff_sacks = tp->dsack; + return; } - /* This should only happen if so many SACKs get built that some get - * pushed out before we get here, or we eat some in sequence packets - * which are before the first SACK block. - */ - if(this_sack >= num_sacks) - return; + for(this_sack = 0; this_sack < num_sacks; ) { + /* Check if the start of the sack is covered by RCV.NXT. */ + if (!before(tp->rcv_nxt, sp->start_seq)) { + int i; + + /* RCV.NXT must cover all the block! */ + BUG_TRAP(!before(tp->rcv_nxt, sp->end_seq)); - sp->start_seq = TCP_SKB_CB(skb)->end_seq; - if(!before(sp->start_seq, sp->end_seq)) { - /* Zap this SACK, by moving forward any other SACKS. */ - for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) { - struct tcp_sack_block *next = (sp + 1); - sp->start_seq = next->start_seq; - sp->end_seq = next->end_seq; + /* Zap this SACK, by moving forward any other SACKS. */ + for (i=this_sack+1; i < num_sacks; i++) + sp[i-1] = sp[i]; + num_sacks--; + continue; } - tp->num_sacks--; + this_sack++; + sp++; } -} - -static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sk_buff *new_skb) -{ - struct tcp_sack_block *sp = &tp->selective_acks[0]; - int num_sacks = tp->num_sacks; - int this_sack; - - for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) { - if(sp->end_seq == TCP_SKB_CB(old_skb)->end_seq) - break; + if (num_sacks != tp->num_sacks) { + tp->num_sacks = num_sacks; + tp->eff_sacks = min(tp->num_sacks+tp->dsack, 4-tp->tstamp_ok); } - if(this_sack >= num_sacks) - return; - sp->end_seq = TCP_SKB_CB(new_skb)->end_seq; } - /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. 
*/ static void tcp_ofo_queue(struct sock *sk) { - struct sk_buff *skb; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + __u32 dsack_high = tp->rcv_nxt; + struct sk_buff *skb; - while ((skb = skb_peek(&tp->out_of_order_queue))) { + while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; + if (before(TCP_SKB_CB(skb)->seq, dsack_high)) { + __u32 dsack = dsack_high; + if (before(TCP_SKB_CB(skb)->end_seq, dsack_high)) + dsack_high = TCP_SKB_CB(skb)->end_seq; + tcp_dsack_extend(tp, TCP_SKB_CB(skb)->seq, dsack); + } + if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { SOCK_DEBUG(sk, "ofo packet was already received \n"); __skb_unlink(skb, skb->list); - kfree_skb(skb); + __kfree_skb(skb); continue; } SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); - if(tp->sack_ok) - tcp_sack_remove_skb(tp, skb); __skb_unlink(skb, skb->list); __skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; @@ -1892,10 +2467,14 @@ static void tcp_ofo_queue(struct sock *sk) static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { - struct sk_buff *skb1; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int eaten = 0; + if (tp->dsack) { + tp->dsack = 0; + tp->eff_sacks = min(tp->num_sacks, 4-tp->tstamp_ok); + } + /* Queue data for delivery to the user. * Packets in sequence go to the receive queue. * Out of sequence packets to the out_of_order_queue. @@ -1924,20 +2503,27 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (!eaten) { queue_and_out: - skb_set_owner_r(skb, sk); + tcp_set_owner_r(skb, sk); __skb_queue_tail(&sk->receive_queue, skb); } - dst_confirm(sk->dst_cache); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; if(skb->len) - tcp_event_data_recv(tp, skb); + tcp_event_data_recv(sk, tp, skb); if(skb->h.th->fin) tcp_fin(skb, sk, skb->h.th); - /* This may have eaten into a SACK block. */ - if(tp->sack_ok && tp->num_sacks) - tcp_sack_remove_skb(tp, skb); - tcp_ofo_queue(sk); + if (skb_queue_len(&tp->out_of_order_queue)) { + tcp_ofo_queue(sk); + + /* RFC2581. 4.2. SHOULD send immediate ACK, when + * gap in queue is filled. + */ + if (skb_queue_len(&tp->out_of_order_queue) == 0) + tp->ack.pingpong = 0; + } + + if(tp->num_sacks) + tcp_sack_remove(tp); /* Turn on fast path. */ if (skb_queue_len(&tp->out_of_order_queue) == 0 && @@ -1948,24 +2534,28 @@ queue_and_out: tcp_fast_path_on(tp); if (eaten) { - kfree_skb(skb); + __kfree_skb(skb); } else if (!sk->dead) sk->data_ready(sk, 0); return; } +#ifdef TCP_DEBUG /* An old packet, either a retransmit or some packet got lost. */ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { /* A retransmit, 2nd most common case. Force an imediate ack. * * It is impossible, seq is checked by top level. */ - NETDEBUG(printk("retransmit in tcp_data_queue: seq %X\n", TCP_SKB_CB(skb)->seq)); + printk("BUG: retransmit in tcp_data_queue: seq %X\n", TCP_SKB_CB(skb)->seq); tcp_enter_quickack_mode(tp); - tp->ack.pending = 1; - kfree_skb(skb); + tcp_schedule_ack(tp); + __kfree_skb(skb); return; } +#endif + + tcp_enter_quickack_mode(tp); if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { /* Partial packet, seq < rcv_next < end_seq */ @@ -1973,67 +2563,198 @@ queue_and_out: tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, tp->rcv_nxt); goto queue_and_out; } - /* Ok. This is an out_of_order segment, force an ack. 
*/ - tp->ack.pending = 1; + TCP_ECN_check_ce(tp, skb); /* Disable header prediction. */ tp->pred_flags = 0; - + tcp_schedule_ack(tp); SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); - skb_set_owner_r(skb, sk); + tcp_set_owner_r(skb, sk); if (skb_peek(&tp->out_of_order_queue) == NULL) { /* Initial out of order segment, build 1 SACK. */ if(tp->sack_ok) { tp->num_sacks = 1; + tp->dsack = 0; + tp->eff_sacks = 1; tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq; } __skb_queue_head(&tp->out_of_order_queue,skb); } else { - for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) { - /* Already there. */ - if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb1)->seq) { - if (skb->len >= skb1->len) { - if(tp->sack_ok) - tcp_sack_extend(tp, skb1, skb); - __skb_append(skb1, skb); - __skb_unlink(skb1, skb1->list); - kfree_skb(skb1); - } else { - /* A duplicate, smaller than what is in the - * out-of-order queue right now, toss it. - */ - kfree_skb(skb); - } + struct sk_buff *skb1=tp->out_of_order_queue.prev; + u32 seq = TCP_SKB_CB(skb)->seq; + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + if (seq == TCP_SKB_CB(skb1)->end_seq) { + __skb_append(skb1, skb); + + if (tp->num_sacks == 0 || + tp->selective_acks[0].end_seq != seq) + goto add_sack; + + /* Common case: data arrive in order after hole. */ + tp->selective_acks[0].end_seq = end_seq; + return; + } + + /* Find place to insert this segment. */ + do { + if (!after(TCP_SKB_CB(skb1)->seq, seq)) break; + } while ((skb1=skb1->prev) != (struct sk_buff*)&tp->out_of_order_queue); + + /* Do skb overlap to previous one? */ + if (skb1 != (struct sk_buff*)&tp->out_of_order_queue && + before(seq, TCP_SKB_CB(skb1)->end_seq)) { + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + /* All the bits are present. Drop. */ + __kfree_skb(skb); + tcp_dsack_set(tp, seq, end_seq); + goto add_sack; } - - if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) { - __skb_append(skb1, skb); - if(tp->sack_ok) - tcp_sack_new_ofo_skb(sk, skb); - break; + if (after(seq, TCP_SKB_CB(skb1)->seq)) { + /* Partial overlap. */ + tcp_dsack_set(tp, seq, TCP_SKB_CB(skb1)->end_seq); + } else { + skb1 = skb1->prev; } + } + __skb_insert(skb, skb1, skb1->next, &tp->out_of_order_queue); + + /* And clean segments covered by new one as whole. */ + while ((skb1 = skb->next) != (struct sk_buff*)&tp->out_of_order_queue && + after(end_seq, TCP_SKB_CB(skb1)->seq)) { + if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq); + break; + } + __skb_unlink(skb1, skb1->list); + tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); + __kfree_skb(skb1); + } - /* See if we've hit the start. If so insert. 
*/ - if (skb1 == skb_peek(&tp->out_of_order_queue)) { - __skb_queue_head(&tp->out_of_order_queue,skb); - if(tp->sack_ok) - tcp_sack_new_ofo_skb(sk, skb); - break; +add_sack: + if (tp->sack_ok) + tcp_sack_new_ofo_skb(sk, seq, end_seq); + } +} + + +static void tcp_collapse_queue(struct sock *sk, struct sk_buff_head *q) +{ + struct sk_buff *skb = skb_peek(q); + struct sk_buff *skb_next; + + while (skb && + skb != (struct sk_buff *)q && + (skb_next = skb->next) != (struct sk_buff *)q) { + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + struct tcp_skb_cb *scb_next = TCP_SKB_CB(skb_next); + + if (scb->end_seq == scb_next->seq && + skb_tailroom(skb) >= skb_next->len && +#define TCP_DONT_COLLAPSE (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN) + !(tcp_flag_word(skb->h.th)&TCP_DONT_COLLAPSE) && + !(tcp_flag_word(skb_next->h.th)&TCP_DONT_COLLAPSE)) { + /* OK to collapse two skbs to one */ + memcpy(skb_put(skb, skb_next->len), skb_next->data, skb_next->len); + __skb_unlink(skb_next, skb_next->list); + scb->end_seq = scb_next->end_seq; + __kfree_skb(skb_next); + NET_INC_STATS_BH(TCPRcvCollapsed); + } else { + /* Lots of spare tailroom, reallocate this skb to trim it. */ + if (tcp_win_from_space(skb->truesize) > skb->len && + skb_tailroom(skb) > sizeof(struct sk_buff) + 16) { + struct sk_buff *nskb; + + nskb = skb_copy_expand(skb, skb_headroom(skb), 0, GFP_ATOMIC); + if (nskb) { + tcp_set_owner_r(nskb, sk); + memcpy(nskb->data-skb_headroom(skb), + skb->data-skb_headroom(skb), + skb_headroom(skb)); + __skb_append(skb, nskb); + __skb_unlink(skb, skb->list); + __kfree_skb(skb); + } } + skb = skb_next; } } - return; } +/* Clean the out_of_order queue if we can, trying to get + * the socket within its memory limits again. + * + * Return less than zero if we should start dropping frames + * until the socket owning process reads some of the data + * to stabilize the situation. + */ +static int tcp_prune_queue(struct sock *sk) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); + + NET_INC_STATS_BH(PruneCalled); + + if (atomic_read(&sk->rmem_alloc) >= sk->rcvbuf) + tcp_clamp_window(sk, tp); + else if (tcp_memory_pressure) + tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4*tp->advmss); + + tcp_collapse_queue(sk, &sk->receive_queue); + tcp_collapse_queue(sk, &tp->out_of_order_queue); + tcp_mem_reclaim(sk); + + if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) + return 0; + + /* Collapsing did not help, destructive actions follow. + * This must not ever occur. */ + + /* First, purge the out_of_order queue. */ + if (skb_queue_len(&tp->out_of_order_queue)) { + net_statistics[smp_processor_id()*2].OfoPruned += skb_queue_len(&tp->out_of_order_queue); + __skb_queue_purge(&tp->out_of_order_queue); + + /* Reset SACK state. A conforming SACK implementation will + * do the same at a timeout based retransmit. When a connection + * is in a sad state like this, we care only about integrity + * of the connection not performance. + */ + if(tp->sack_ok) + tcp_sack_reset(tp); + tcp_mem_reclaim(sk); + } + + if(atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) + return 0; + + /* If we are really being abused, tell the caller to silently + * drop receive data on the floor. It will get retransmitted + * and hopefully then we'll have sufficient space. + */ + NET_INC_STATS_BH(RcvPruned); + + /* Massive buffer overcommit. 
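tcp_prune_queue() above escalates in stages: first clamp the window (or rcv_ssthresh under memory pressure), then collapse the receive and out-of-order queues and reclaim, then purge the out-of-order queue entirely while resetting SACK state, and only if all of that fails report failure so the caller drops the segment. A standalone sketch of that escalation shape, with the individual stages stubbed out as assumptions; in the kernel they correspond to tcp_clamp_window(), tcp_collapse_queue(), tcp_mem_reclaim() and __skb_queue_purge():

#include <stdio.h>

struct rcvq { int used, limit, ofo_len; };

/* Stubbed stages: the reclaim fraction here is arbitrary for the demo. */
static void collapse_and_reclaim(struct rcvq *q) { q->used -= q->used / 4; }
static void purge_ofo(struct rcvq *q)            { q->used -= q->ofo_len; q->ofo_len = 0; }

/* Return 0 if the queue fits again, -1 if the caller must drop data. */
static int prune_queue(struct rcvq *q)
{
        collapse_and_reclaim(q);                 /* cheap, non-destructive    */
        if (q->used <= q->limit)
                return 0;

        purge_ofo(q);                            /* destructive: SACK resets  */
        if (q->used <= q->limit)
                return 0;

        return -1;                               /* massive buffer overcommit */
}

int main(void)
{
        struct rcvq q = { .used = 120, .limit = 100, .ofo_len = 30 };
        printf("prune -> %d, used=%d\n", prune_queue(&q), q.used);
        return 0;
}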
*/ + return -1; +} + +static inline int tcp_rmem_schedule(struct sock *sk, struct sk_buff *skb) +{ + return (int)skb->truesize <= sk->forward_alloc || + tcp_mem_schedule(sk, skb->truesize, 1); +} /* * This routine handles the data. If there is room in the buffer, @@ -2053,53 +2774,103 @@ static void tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) if (skb->len == 0 && !th->fin) goto drop; + TCP_ECN_accept_cwr(tp, skb); + /* * If our receive queue has grown past its limits shrink it. * Make sure to do this before moving rcv_nxt, otherwise * data might be acked for that we don't have enough room. */ - if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) { - if (prune_queue(sk) < 0) { - /* Still not enough room. That can happen when - * skb->true_size differs significantly from skb->len. - */ + if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf || + !tcp_rmem_schedule(sk, skb)) { + if (tcp_prune_queue(sk) < 0 || !tcp_rmem_schedule(sk, skb)) goto drop; - } } tcp_data_queue(sk, skb); +#ifdef TCP_DEBUG if (before(tp->rcv_nxt, tp->copied_seq)) { printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n"); tp->rcv_nxt = tp->copied_seq; } +#endif return; drop: - kfree_skb(skb); + __kfree_skb(skb); +} + +/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. + * As additional protections, we do not touch cwnd in retransmission phases, + * and if application hit its sndbuf limit recently. + */ +void tcp_cwnd_application_limited(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + if (tp->ca_state == TCP_CA_Open && + sk->socket && !test_bit(SOCK_NOSPACE, &sk->socket->flags)) { + /* Limited by application or receiver window. */ + u32 win_used = max(tp->snd_cwnd_used, 2); + if (win_used < tp->snd_cwnd) { + tp->snd_ssthresh = tcp_current_ssthresh(tp); + tp->snd_cwnd = (tp->snd_cwnd+win_used)>>1; + } + tp->snd_cwnd_used = 0; + } + tp->snd_cwnd_stamp = tcp_time_stamp; } + /* When incoming ACK allowed to free some skb from write_queue, - * we remember this in flag tp->sorry and wake up socket on the exit - * from tcp input handler. Probably, handler has already eat this space - * sending ACK and cloned frames from tcp_write_xmit(). + * we remember this event in flag tp->queue_shrunk and wake up socket + * on the exit from tcp input handler. */ -static __inline__ void tcp_new_space(struct sock *sk) +static void tcp_new_space(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct socket *sock; - tp->sorry = 0; + if (tp->packets_out < tp->snd_cwnd && + !(sk->userlocks&SOCK_SNDBUF_LOCK) && + !tcp_memory_pressure && + atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + int sndmem, demanded; + + sndmem = tp->mss_clamp+MAX_TCP_HEADER+16+sizeof(struct sk_buff); + demanded = max(tp->snd_cwnd, tp->reordering+1); + sndmem *= 2*demanded; + if (sndmem > sk->sndbuf) + sk->sndbuf = min(sndmem, sysctl_tcp_wmem[2]); + tp->snd_cwnd_stamp = tcp_time_stamp; + } + + /* Wakeup users. */ + if (tcp_wspace(sk) >= tcp_min_write_space(sk)) { + struct socket *sock = sk->socket; - if (sock_wspace(sk) >= tcp_min_write_space(sk) && - (sock = sk->socket) != NULL) { clear_bit(SOCK_NOSPACE, &sock->flags); if (sk->sleep && waitqueue_active(sk->sleep)) wake_up_interruptible(sk->sleep); - if (sock->fasync_list) + if (sock->fasync_list && !(sk->shutdown&SEND_SHUTDOWN)) sock_wake_async(sock, 2, POLL_OUT); + + /* Satisfy those who hook write_space() callback. 
*/ + if (sk->write_space != tcp_write_space) + sk->write_space(sk); + } +} + +static inline void tcp_check_space(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + if (tp->queue_shrunk) { + tp->queue_shrunk = 0; + if (sk->socket && test_bit(SOCK_NOSPACE, &sk->socket->flags)) + tcp_new_space(sk); } } @@ -2118,7 +2889,8 @@ static __inline__ void tcp_data_snd_check(struct sock *sk) struct sk_buff *skb = sk->tp_pinfo.af_tcp.send_head; if (skb != NULL) - __tcp_data_snd_check(sk, skb); + __tcp_data_snd_check(sk, skb); + tcp_check_space(sk); } /* @@ -2128,32 +2900,15 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - /* This also takes care of updating the window. - * This if statement needs to be simplified. - * - * Rules for delaying an ack: - * - delay time <= 0.5 HZ - * - we don't have a window update to send - * - must send at least every 2 full sized packets - * - must send an ACK if we have any out of order data - * - * With an extra heuristic to handle loss of packet - * situations and also helping the sender leave slow - * start in an expediant manner. - */ - - /* More than one full frame received or... */ + /* More than one full frame received... */ if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss -#ifdef TCP_MORE_COARSE_ACKS - /* Avoid to send immediate ACK from input path, if it - * does not advance window far enough. tcp_recvmsg() will do this. + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). Or... */ - && (!sysctl_tcp_retrans_collapse || __tcp_select_window(sk) >= tp->rcv_wnd) -#endif - ) || + && __tcp_select_window(sk) >= tp->rcv_wnd) || /* We ACK each frame or... */ tcp_in_quickack_mode(tp) || - /* We have out of order data or */ + /* We have out of order data. */ (ofo_possible && skb_peek(&tp->out_of_order_queue) != NULL)) { /* Then ack it now */ @@ -2167,14 +2922,13 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) static __inline__ void tcp_ack_snd_check(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - if (tp->ack.pending == 0) { + if (!tcp_ack_scheduled(tp)) { /* We sent a data segment already. */ return; } __tcp_ack_snd_check(sk, 1); } - /* * This routine is only called when we have urgent data * signalled. Its the 'slow' part of tcp_urg. It could be @@ -2248,92 +3002,6 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len } } -/* Clean the out_of_order queue if we can, trying to get - * the socket within its memory limits again. - * - * Return less than zero if we should start dropping frames - * until the socket owning process reads some of the data - * to stabilize the situation. - */ -static int prune_queue(struct sock *sk) -{ - struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - struct sk_buff *skb; - int pruned = 0; - - SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); - - NET_INC_STATS_BH(PruneCalled); - - /* First, purge the out_of_order queue. */ - skb = __skb_dequeue_tail(&tp->out_of_order_queue); - if(skb != NULL) { - /* Free it all. */ - do { - pruned += skb->len; - net_statistics[smp_processor_id()*2].OfoPruned += skb->len; - kfree_skb(skb); - skb = __skb_dequeue_tail(&tp->out_of_order_queue); - } while(skb != NULL); - - /* Reset SACK state. A conforming SACK implementation will - * do the same at a timeout based retransmit. 
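__tcp_ack_snd_check() above boils down to a three-way test: ACK immediately if more than a full segment has been received since the last ACK and the advertised window can advance far enough, or if we are in quick-ACK mode, or if out-of-order data is queued; otherwise leave it to the delayed-ACK timer. A standalone sketch of that decision, with boolean inputs standing in for the socket state:

#include <stdio.h>

enum ack_action { ACK_DELAYED, ACK_NOW };

/* rcv_delta:  bytes received but not yet ACKed (rcv_nxt - rcv_wup)
 * rcv_mss:    peer's estimated segment size
 * window_ok:  the window we would advertise advances far enough
 * quickack:   we are in quick-ACK mode (e.g. after dup/old data)
 * have_ofo:   the out-of-order queue is non-empty
 */
static enum ack_action ack_snd_check(unsigned rcv_delta, unsigned rcv_mss,
                                     int window_ok, int quickack, int have_ofo)
{
        if ((rcv_delta > rcv_mss && window_ok) || quickack || have_ofo)
                return ACK_NOW;
        return ACK_DELAYED;
}

int main(void)
{
        printf("%d\n", ack_snd_check(2800, 1460, 1, 0, 0));  /* two frames: ACK now */
        printf("%d\n", ack_snd_check(1000, 1460, 1, 0, 0));  /* partial: delay      */
        return 0;
}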
When a connection - * is in a sad state like this, we care only about integrity - * of the connection not performance. - */ - if(tp->sack_ok) - tp->num_sacks = 0; - } - - /* If we are really being abused, tell the caller to silently - * drop receive data on the floor. It will get retransmitted - * and hopefully then we'll have sufficient space. - * - * We used to try to purge the in-order packets too, but that - * turns out to be deadly and fraught with races. Consider: - * - * 1) If we acked the data, we absolutely cannot drop the - * packet. This data would then never be retransmitted. - * 2) It is possible, with a proper sequence of events involving - * delayed acks and backlog queue handling, to have the user - * read the data before it gets acked. The previous code - * here got this wrong, and it lead to data corruption. - * 3) Too much state changes happen when the FIN arrives, so once - * we've seen that we can't remove any in-order data safely. - * - * The net result is that removing in-order receive data is too - * complex for anyones sanity. So we don't do it anymore. But - * if we are really having our buffer space abused we stop accepting - * new receive data. - * - * 8) The arguments are interesting, but I even cannot imagine - * what kind of arguments could force us to drop NICE, ALREADY - * RECEIVED DATA only to get one more packet? --ANK - * - * FIXME: it should recompute SACK state and only remove enough - * buffers to get into bounds again. The current scheme loses - * badly sometimes on links with large RTT, especially when - * the driver has high overhead per skb. - * (increasing the rcvbuf is not enough because it inflates the - * the window too, disabling flow control effectively) -AK - * - * Mmm... Why not to scale it seprately then? Just replace - * / WINDOW_ADVERTISE_DIVISOR with >> sk->window_advertise_scale - * and adjust it dynamically, when TCP window flow control - * fails? -ANK - */ - - tp->ack.quick = 0; - - if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1)) - return 0; - - NET_INC_STATS_BH(RcvPruned); - - /* Massive buffer overcommit. */ - return -1; -} - static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); @@ -2454,9 +3122,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, * We do checksum and copy also but from device to kernel. */ - /* RED-PEN. Using static variables to pass function arguments - * cannot be good idea... - */ tp->saw_tstamp = 0; /* pred_flags is 0xS?10 << 16 + snd_wnd @@ -2468,7 +3133,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, * PSH flag is ignored. */ - if ((tcp_flag_word(th) & ~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) == tp->pred_flags && + if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { int tcp_header_len = tp->tcp_header_len; @@ -2500,10 +3165,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, * seq == rcv_nxt and rcv_wup <= rcv_nxt. * Hence, check seq<=rcv_wup reduces to: */ - if (tp->rcv_nxt == tp->rcv_wup) { - tp->ts_recent = tp->rcv_tsval; - tp->ts_recent_stamp = xtime.tv_sec; - } + if (tp->rcv_nxt == tp->rcv_wup) + tcp_store_ts_recent(tp); } if (len <= tcp_header_len) { @@ -2512,18 +3175,15 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, /* We know that such packets are checksummed * on entry. 
*/ - tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->ack_seq, len); - kfree_skb(skb); + tcp_ack(sk, skb, 0); + __kfree_skb(skb); tcp_data_snd_check(sk); - if (tp->sorry) - tcp_new_space(sk); return 0; } else { /* Header too small */ TCP_INC_STATS_BH(TcpInErrs); goto discard; } - } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una) { + } else { int eaten = 0; if (tp->ucopy.task == current && @@ -2546,67 +3206,59 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (tcp_checksum_complete_user(sk, skb)) goto csum_error; - if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) + if ((int)skb->truesize > sk->forward_alloc) goto step5; NET_INC_STATS_BH(TCPHPHits); /* Bulk data transfer: receiver */ __skb_pull(skb,tcp_header_len); - - /* DO NOT notify forward progress here. - * It saves dozen of CPU instructions in fast path. --ANK - * And where is it signaled then ? -AK - * Nowhere. 8) --ANK - */ __skb_queue_tail(&sk->receive_queue, skb); - skb_set_owner_r(skb, sk); - + tcp_set_owner_r(skb, sk); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - - /* FIN bit check is not done since if FIN is set in - * this frame, the pred_flags won't match up. -DaveM - */ - sk->data_ready(sk, 0); } - tcp_event_data_recv(tp, skb); + tcp_event_data_recv(sk, tp, skb); + + if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { + /* Well, only one small jumplet in fast path... */ + tcp_ack(sk, skb, FLAG_DATA); + tcp_data_snd_check(sk); + if (!tcp_ack_scheduled(tp)) + goto no_ack; + } -#ifdef TCP_MORE_COARSE_ACKS if (eaten) { if (tcp_in_quickack_mode(tp)) { tcp_send_ack(sk); } else { tcp_send_delayed_ack(sk); } - } else -#endif - __tcp_ack_snd_check(sk, 0); + } else { + __tcp_ack_snd_check(sk, 0); + } +no_ack: if (eaten) - kfree_skb(skb); + __kfree_skb(skb); + else + sk->data_ready(sk, 0); return 0; } - /* Packet is in sequence, flags are trivial; - * only ACK is strange. Jump to step 5. - */ - if (tcp_checksum_complete_user(sk, skb)) - goto csum_error; - goto step5; } slow_path: - if (tcp_checksum_complete_user(sk, skb)) + if (len < (th->doff<<2) || tcp_checksum_complete_user(sk, skb)) goto csum_error; /* * RFC1323: H1. Apply PAWS check first. */ - if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp && + if (tcp_fast_parse_options(skb, th, tp) && tp->saw_tstamp && tcp_paws_discard(tp, skb)) { if (!th->rst) { NET_INC_STATS_BH(PAWSEstabRejected); - tcp_send_ack(sk); + tcp_send_dupack(sk, skb); goto discard; } /* Resets are accepted even if PAWS failed. @@ -2620,23 +3272,15 @@ slow_path: * Standard slow path. */ - if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { + if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, th->rst)) { /* RFC793, page 37: "In all states except SYN-SENT, all reset * (RST) segments are validated by checking their SEQ-fields." * And page 69: "If an incoming segment is not acceptable, * an acknowledgment should be sent in reply (unless the RST bit * is set, if so drop the segment and return)". 
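The header-prediction test above compares one 32-bit header word (data offset plus flags, masked by TCP_HP_BITS) and the window field against a precomputed pred_flags, so the established-state fast path costs a single compare. A sketch of how such a prediction word can be assembled, following the "0xS?10 << 16 + snd_wnd" layout described in the code; the constants and bit positions are local assumptions for the demo, not the kernel's definitions:

#include <stdint.h>
#include <stdio.h>

#define FLAG_ACK 0x10u   /* ACK bit inside the TCP flags byte (assumed) */

/* Build a host-order prediction word for a header of 'doff' 32-bit words,
 * ACK-only flags and an advertised window of 'snd_wnd': the 0xS?10 << 16
 * + snd_wnd layout, with the '?' (reserved/ECN) nibble left zero here.
 */
static uint32_t make_pred_flags(unsigned doff, uint16_t snd_wnd)
{
        return (doff << 28) | (FLAG_ACK << 16) | snd_wnd;
}

int main(void)
{
        /* 8-word header (20 bytes + aligned timestamps), 32 KB window. */
        uint32_t pred = make_pred_flags(8, 32768);

        printf("pred_flags = 0x%08x\n", (unsigned)pred);   /* 0x80108000 */

        /* An incoming segment stays on the fast path when its masked
         * header word equals pred and its sequence number is rcv_nxt.
         */
        return 0;
}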
*/ - if (th->rst) - goto discard; - if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { - SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n", - TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tp->rcv_wup, tp->rcv_wnd); - } - tcp_enter_quickack_mode(tp); - tcp_send_ack(sk); - NET_INC_STATS_BH(DelayedACKLost); + if (!th->rst) + tcp_send_dupack(sk, skb); goto discard; } @@ -2645,378 +3289,43 @@ slow_path: goto discard; } - if (tp->saw_tstamp) { - tcp_replace_ts_recent(sk, tp, - TCP_SKB_CB(skb)->seq); - } + tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) { - SOCK_DEBUG(sk, "syn in established state\n"); TCP_INC_STATS_BH(TcpInErrs); + NET_INC_STATS_BH(TCPAbortOnSyn); tcp_reset(sk); return 1; } step5: if(th->ack) - tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len); - + tcp_ack(sk, skb, FLAG_SLOWPATH); + /* Process urgent data. */ tcp_urg(sk, th, len); /* step 7: process the segment text */ tcp_data(skb, sk, len); - /* Be careful, tcp_data() may have put this into TIME_WAIT. */ - if(sk->state != TCP_CLOSE) { - tcp_data_snd_check(sk); - tcp_ack_snd_check(sk); - if (tp->sorry) - tcp_new_space(sk); - } - + tcp_data_snd_check(sk); + tcp_ack_snd_check(sk); return 0; csum_error: TCP_INC_STATS_BH(TcpInErrs); discard: - kfree_skb(skb); + __kfree_skb(skb); return 0; } - -/* This is not only more efficient than what we used to do, it eliminates - * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM - * - * Actually, we could lots of memory writes here. tp of listening - * socket contains all necessary default parameters. - */ -struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb) -{ - struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0); - - if(newsk != NULL) { - struct tcp_opt *newtp; -#ifdef CONFIG_FILTER - struct sk_filter *filter; -#endif - - memcpy(newsk, sk, sizeof(*newsk)); - newsk->state = TCP_SYN_RECV; - - /* SANITY */ - newsk->pprev = NULL; - newsk->prev = NULL; - - /* Clone the TCP header template */ - newsk->dport = req->rmt_port; - - sock_lock_init(newsk); - bh_lock_sock(newsk); - - atomic_set(&newsk->rmem_alloc, 0); - skb_queue_head_init(&newsk->receive_queue); - atomic_set(&newsk->wmem_alloc, 0); - skb_queue_head_init(&newsk->write_queue); - atomic_set(&newsk->omem_alloc, 0); - - newsk->done = 0; - newsk->proc = 0; - newsk->backlog.head = newsk->backlog.tail = NULL; - skb_queue_head_init(&newsk->error_queue); - newsk->write_space = tcp_write_space; -#ifdef CONFIG_FILTER - if ((filter = newsk->filter) != NULL) - sk_filter_charge(newsk, filter); -#endif - - /* Now setup tcp_opt */ - newtp = &(newsk->tp_pinfo.af_tcp); - newtp->pred_flags = 0; - newtp->rcv_nxt = req->rcv_isn + 1; - newtp->snd_nxt = req->snt_isn + 1; - newtp->snd_una = req->snt_isn + 1; - newtp->snd_sml = req->snt_isn + 1; - - tcp_delack_init(newtp); - if (skb->len >= 536) - newtp->ack.last_seg_size = skb->len; - - tcp_prequeue_init(newtp); - - newtp->snd_wl1 = req->rcv_isn; - newtp->snd_wl2 = req->snt_isn; - - newtp->retransmits = 0; - newtp->backoff = 0; - newtp->srtt = 0; - newtp->mdev = TCP_TIMEOUT_INIT; - newtp->rto = TCP_TIMEOUT_INIT; - - newtp->packets_out = 0; - newtp->fackets_out = 0; - newtp->retrans_out = 0; - newtp->snd_ssthresh = 0x7fffffff; - - /* So many TCP implementations out there (incorrectly) count the - * initial SYN frame in their delayed-ACK and congestion control - * algorithms that we must have the following bandaid to talk - * efficiently to them. 
-DaveM - */ - newtp->snd_cwnd = 2; - newtp->snd_cwnd_cnt = 0; - newtp->high_seq = 0; - - newtp->dup_acks = 0; - tcp_init_xmit_timers(newsk); - skb_queue_head_init(&newtp->out_of_order_queue); - newtp->send_head = newtp->retrans_head = NULL; - newtp->rcv_wup = req->rcv_isn + 1; - newtp->write_seq = req->snt_isn + 1; - newtp->copied_seq = req->rcv_isn + 1; - - newtp->saw_tstamp = 0; - - newtp->probes_out = 0; - newtp->num_sacks = 0; - newtp->syn_seq = req->rcv_isn; - newtp->fin_seq = req->rcv_isn; - newtp->urg_data = 0; - newtp->listen_opt = NULL; - newtp->accept_queue = newtp->accept_queue_tail = NULL; - /* Deinitialize syn_wait_lock to trap illegal accesses. */ - memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock)); - - /* Back to base struct sock members. */ - newsk->err = 0; - newsk->priority = 0; - atomic_set(&newsk->refcnt, 1); -#ifdef INET_REFCNT_DEBUG - atomic_inc(&inet_sock_nr); -#endif - - if (newsk->keepopen) - tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); - newsk->socket = NULL; - newsk->sleep = NULL; - - newtp->tstamp_ok = req->tstamp_ok; - if((newtp->sack_ok = req->sack_ok) != 0) - newtp->num_sacks = 0; - newtp->window_clamp = req->window_clamp; - newtp->rcv_wnd = req->rcv_wnd; - newtp->wscale_ok = req->wscale_ok; - if (newtp->wscale_ok) { - newtp->snd_wscale = req->snd_wscale; - newtp->rcv_wscale = req->rcv_wscale; - } else { - newtp->snd_wscale = newtp->rcv_wscale = 0; - newtp->window_clamp = min(newtp->window_clamp,65535); - } - newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->snd_wscale; - newtp->max_window = newtp->snd_wnd; - - if (newtp->tstamp_ok) { - newtp->ts_recent = req->ts_recent; - newtp->ts_recent_stamp = xtime.tv_sec; - newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; - } else { - newtp->ts_recent_stamp = 0; - newtp->tcp_header_len = sizeof(struct tcphdr); - } - newtp->mss_clamp = req->mss; - } - return newsk; -} - -/* - * Process an incoming packet for SYN_RECV sockets represented - * as an open_request. - */ - -struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, - struct open_request *req, - struct open_request **prev) -{ - struct tcphdr *th = skb->h.th; - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); - int paws_reject = 0; - struct tcp_opt ttp; - struct sock *child; - - ttp.saw_tstamp = 0; - if (th->doff > (sizeof(struct tcphdr)>>2)) { - tcp_parse_options(NULL, th, &ttp, 0); - - if (ttp.saw_tstamp) { - ttp.ts_recent = req->ts_recent; - /* We do not store true stamp, but it is not required, - * it can be estimated (approximately) - * from another data. - */ - ttp.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); - paws_reject = tcp_paws_check(&ttp, th->rst); - } - } - - /* Check for pure retransmited SYN. */ - if (TCP_SKB_CB(skb)->seq == req->rcv_isn && - flg == TCP_FLAG_SYN && - !paws_reject) { - /* - * RFC793 draws (Incorrectly! It was fixed in RFC1122) - * this case on figure 6 and figure 8, but formal - * protocol description says NOTHING. - * To be more exact, it says that we should send ACK, - * because this segment (at least, if it has no data) - * is out of window. - * - * CONCLUSION: RFC793 (even with RFC1122) DOES NOT - * describe SYN-RECV state. All the description - * is wrong, we cannot believe to it and should - * rely only on common sense and implementation - * experience. - * - * Enforce "SYN-ACK" according to figure 8, figure 6 - * of RFC793, fixed by RFC1122. 
- */ - req->class->rtx_syn_ack(sk, req, NULL); - return NULL; - } - - /* Further reproduces section "SEGMENT ARRIVES" - for state SYN-RECEIVED of RFC793. - It is broken, however, it does not work only - when SYNs are crossed, which is impossible in our - case. - - But generally, we should (RFC lies!) to accept ACK - from SYNACK both here and in tcp_rcv_state_process(). - tcp_rcv_state_process() does not, hence, we do not too. - - Note that the case is absolutely generic: - we cannot optimize anything here without - violating protocol. All the checks must be made - before attempt to create socket. - */ - - /* RFC793: "first check sequence number". */ - - if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) { - /* Out of window: send ACK and drop. */ - if (!(flg & TCP_FLAG_RST)) - req->class->send_ack(skb, req); - if (paws_reject) - NET_INC_STATS_BH(PAWSEstabRejected); - return NULL; - } - - /* In sequence, PAWS is OK. */ - - if (ttp.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1)) - req->ts_recent = ttp.rcv_tsval; - - if (TCP_SKB_CB(skb)->seq == req->rcv_isn) { - /* Truncate SYN, it is out of window starting - at req->rcv_isn+1. */ - flg &= ~TCP_FLAG_SYN; - } - - /* RFC793: "second check the RST bit" and - * "fourth, check the SYN bit" - */ - if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) - goto embryonic_reset; - - /* RFC793: "fifth check the ACK field" */ - - if (!(flg & TCP_FLAG_ACK)) - return NULL; - - /* Invalid ACK: reset will be sent by listening socket */ - if (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1) - return sk; - /* Also, it would be not so bad idea to check rcv_tsecr, which - * is essentially ACK extension and too early or too late values - * should cause reset in unsynchronized states. - */ - - /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ - if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) { - req->acked = 1; - return NULL; - } - - /* OK, ACK is valid, create big socket and - * feed this segment to it. It will repeat all - * the tests. THIS SEGMENT MUST MOVE SOCKET TO - * ESTABLISHED STATE. If it will be dropped after - * socket is created, wait for troubles. - */ - child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); - if (child == NULL) - goto listen_overflow; - - tcp_synq_unlink(tp, req, prev); - tcp_synq_removed(sk, req); - - tcp_acceptq_queue(sk, req, child); - return child; - -listen_overflow: - if (!sysctl_tcp_abort_on_overflow) { - req->acked = 1; - return NULL; - } - -embryonic_reset: - NET_INC_STATS_BH(EmbryonicRsts); - if (!(flg & TCP_FLAG_RST)) - req->class->send_reset(skb); - - tcp_synq_drop(sk, req, prev); - return NULL; -} - -/* - * Queue segment on the new socket if the new socket is active, - * otherwise we just shortcircuit this and continue with - * the new socket. - */ - -int tcp_child_process(struct sock *parent, struct sock *child, - struct sk_buff *skb) -{ - int ret = 0; - int state = child->state; - - if (child->lock.users == 0) { - ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len); - - /* Wakeup parent, send SIGIO */ - if (state == TCP_SYN_RECV && child->state != state) - parent->data_ready(parent, 0); - } else { - /* Alas, it is possible again, because we do lookup - * in main socket hash table and lock on listening - * socket does not protect us more. 
- */ - sk_add_backlog(child, skb); - } - - bh_unlock_sock(child); - return ret; -} - static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - tcp_parse_options(sk, th, tp, 0); + tcp_parse_options(skb, tp); if (th->ack) { /* rfc793: @@ -3027,24 +3336,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * a reset (unless the RST bit is set, if so drop * the segment and return)" * - * I cite this place to emphasize one essential - * detail, this check is different of one - * in established state: SND.UNA <= SEG.ACK <= SND.NXT. - * SEG_ACK == SND.UNA == ISS is invalid in SYN-SENT, - * because we have no previous data sent before SYN. - * --ANK(990513) - * * We do not send data with SYN, so that RFC-correct * test reduces to: */ if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) return 1; - /* Check not from any RFC, but it is evident consequence - * of combining PAWS and usual SYN-SENT logic: ACK _is_ - * checked in SYN-SENT unlike another states, hence - * echoed tstamp must be checked too. - */ if (tp->saw_tstamp) { if (tp->rcv_tsecr == 0) { /* Workaround for bug in linux-2.1 and early @@ -3055,13 +3352,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tp->saw_tstamp = 0; /* But do not forget to store peer's timestamp! */ - if (th->syn) { - tp->ts_recent = tp->rcv_tsval; - tp->ts_recent_stamp = xtime.tv_sec; - } - } else if ((__s32)(tp->rcv_tsecr - tcp_time_stamp) > 0 || - (__s32)(tp->rcv_tsecr - tp->syn_stamp) < 0) { - NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "TCP: synsent reject.\n")); + if (th->syn) + tcp_store_ts_recent(tp); + } else if (!between(tp->rcv_tsecr, tp->retrans_stamp, tcp_time_stamp)) { NET_INC_STATS_BH(PAWSActiveRejected); return 1; } @@ -3095,30 +3388,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * are acceptable then ... * (our SYN has been ACKed), change the connection * state to ESTABLISHED..." - * - * Do you see? SYN-less ACKs in SYN-SENT state are - * completely ignored. - * - * The bug causing stalled SYN-SENT sockets - * was here: tcp_ack advanced snd_una and canceled - * retransmit timer, so that bare ACK received - * in SYN-SENT state (even with invalid ack==ISS, - * because tcp_ack check is too weak for SYN-SENT) - * causes moving socket to invalid semi-SYN-SENT, - * semi-ESTABLISHED state and connection hangs. - * --ANK (990514) - * - * Bare ACK is valid, however. - * Actually, RFC793 requires to send such ACK - * in reply to any out of window packet. - * It is wrong, but Linux also send such - * useless ACKs sometimes. - * --ANK (990724) */ + TCP_ECN_rcv_synack(tp, th); + tp->snd_wl1 = TCP_SKB_CB(skb)->seq; - tcp_ack(sk,th, TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->ack_seq, len); + tcp_ack(sk, skb, FLAG_SLOWPATH); /* Ok.. it's good. Set up sequence numbers and * move to established. @@ -3130,12 +3405,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * never scaled. 
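Aside on the rule stated in the comment above: the 16-bit window carried in SYN and SYN-ACK segments is taken verbatim, and the negotiated scale factor is applied only once the connection is established. A minimal user-space sketch of that decoding rule (uint32_t stands in for the kernel's u32; the helper name and flag are invented for illustration, this is not the kernel routine):

    #include <stdint.h>
    #include <arpa/inet.h>   /* ntohs() */

    /* Decode the raw window field of an incoming segment.
     * Windows in SYN/SYN-ACK are never scaled; after the handshake the
     * peer's advertised shift applies. */
    static uint32_t rcv_window(uint16_t raw_win_net, int handshake_done,
                               int snd_wscale)
    {
            uint32_t win = ntohs(raw_win_net);

            if (handshake_done)
                    win <<= snd_wscale;
            return win;
    }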
*/ tp->snd_wnd = ntohs(th->window); - tp->snd_wl1 = TCP_SKB_CB(skb)->seq; - tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq; + tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(skb)->seq); + tp->syn_seq = TCP_SKB_CB(skb)->seq; tp->fin_seq = TCP_SKB_CB(skb)->seq; - tcp_set_state(sk, TCP_ESTABLISHED); - if (tp->wscale_ok == 0) { tp->snd_wscale = tp->rcv_wscale = 0; tp->window_clamp = min(tp->window_clamp,65535); @@ -3144,12 +3417,14 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tp->tstamp_ok) { tp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; } else tp->tcp_header_len = sizeof(struct tcphdr); - if (tp->saw_tstamp) { - tp->ts_recent = tp->rcv_tsval; - tp->ts_recent_stamp = xtime.tv_sec; - } + if (tp->saw_tstamp) + tcp_store_ts_recent(tp); + if (tp->sack_ok && sysctl_tcp_fack) + tp->sack_ok |= 2; + tcp_sync_mss(sk, tp->pmtu_cookie); tcp_initialize_rcv_mss(sk); tcp_init_metrics(sk); @@ -3158,15 +3433,24 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (sk->keepopen) tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); + if (tp->snd_wscale == 0) + __tcp_fast_path_on(tp, tp->snd_wnd); + else + tp->pred_flags = 0; + + /* Remember, tcp_poll() does not lock socket! + * Change state from SYN-SENT only after copied_seq + * is initilized. */ tp->copied_seq = tp->rcv_nxt; - __tcp_fast_path_on(tp, tp->snd_wnd); + mb(); + tcp_set_state(sk, TCP_ESTABLISHED); if(!sk->dead) { sk->state_change(sk); sk_wake_async(sk, 0, POLL_OUT); } - if (tp->write_pending) { + if (tp->write_pending || tp->defer_accept) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. * @@ -3174,11 +3458,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * look so _wonderfully_ clever, that I was not able * to stand against the temptation 8) --ANK */ - tp->ack.pending = 1; + tcp_schedule_ack(tp); tp->ack.lrcvtime = tcp_time_stamp; tcp_enter_quickack_mode(tp); - tp->ack.ato = TCP_ATO_MIN; - tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MIN); + tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); goto discard; } else { tcp_send_ack(sk); @@ -3204,20 +3487,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (th->syn) { /* We see SYN without ACK. It is attempt of - * simultaneous connect with crossed SYNs. - * - * The previous version of the code - * checked for "connecting to self" - * here. that check is done now in - * tcp_connect. - * - * RED-PEN: BTW, it does not. 8) + * simultaneous connect with crossed SYNs. + * Particularly, it can be connect to self. */ tcp_set_state(sk, TCP_SYN_RECV); - if (tp->saw_tstamp) { - tp->ts_recent = tp->rcv_tsval; - tp->ts_recent_stamp = xtime.tv_sec; - } + if (tp->saw_tstamp) + tcp_store_ts_recent(tp); tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; @@ -3232,6 +3507,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_sync_mss(sk, tp->pmtu_cookie); tcp_initialize_rcv_mss(sk); + TCP_ECN_rcv_syn(tp, th); + tcp_send_synack(sk); #if 0 /* Note, we could accept data and URG from this segment. 
@@ -3251,7 +3528,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, */ discard: - kfree_skb(skb); + __kfree_skb(skb); return 0; } @@ -3273,35 +3550,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, switch (sk->state) { case TCP_CLOSE: - /* When state == CLOSED, hash lookup always fails. - * - * But, there is a back door, the backlog queue. - * If we have a sequence of packets in the backlog - * during __release_sock() which have a sequence such - * that: - * packet X causes entry to TCP_CLOSE state - * ... - * packet X + N has FIN bit set - * - * We report a (luckily) harmless error in this case. - * The issue is that backlog queue processing bypasses - * any hash lookups (we know which socket packets are for). - * The correct behavior here is what 2.0.x did, since - * a TCP_CLOSE socket does not exist. Drop the frame - * and send a RST back to the other end. - */ - - /* 1. The socket may be moved to TIME-WAIT state. - 2. While this socket was locked, another socket - with the same identity could be created. - 3. To continue? - - CONCLUSION: discard and only discard! - - Alternative would be relookup and recurse into tcp_v?_rcv - (not *_do_rcv) to work with timewait and listen states - correctly. - */ goto discard; case TCP_LISTEN: @@ -3340,56 +3588,20 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto step6; } - /* Parse the tcp_options present on this header. - * By this point we really only expect timestamps. - * Note that this really has to be here and not later for PAWS - * (RFC1323) to work. - */ - if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp && + if (tcp_fast_parse_options(skb, th, tp) && tp->saw_tstamp && tcp_paws_discard(tp, skb)) { if (!th->rst) { - tcp_send_ack(sk); + NET_INC_STATS_BH(PAWSEstabRejected); + tcp_send_dupack(sk, skb); goto discard; } /* Reset is accepted even if it did not pass PAWS. */ } - /* The silly FIN test here is necessary to see an advancing ACK in - * retransmitted FIN frames properly. Consider the following sequence: - * - * host1 --> host2 FIN XSEQ:XSEQ(0) ack YSEQ - * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ - * host1 --> host2 XSEQ:XSEQ(0) ack YSEQ+1 - * host2 --> host1 FIN YSEQ:YSEQ(0) ack XSEQ+1 (fails tcp_sequence test) - * - * At this point the connection will deadlock with host1 believing - * that his FIN is never ACK'd, and thus it will retransmit it's FIN - * forever. The following fix is from Taral (taral@taral.net). - * - * RED-PEN. Seems, the above is not true. - * If at least one end is RFC compliant, it will send ACK to - * out of window FIN and, hence, move peer to TIME-WAIT. - * I comment out this line. --ANK - * - * RED-PEN. DANGER! tcp_sequence check rejects also SYN-ACKs - * received in SYN-RECV. The problem is that description of - * segment processing in SYN-RECV state in RFC792 is WRONG. - * Correct check would accept ACK from this SYN-ACK, see - * figures 6 and 8 (fixed by RFC1122). Compare this - * to problem with FIN, they smell similarly. 
--ANK - */ - /* step 1: check sequence number */ - if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq) -#if 0 - && !(th->fin && TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt) -#endif - ) { - if (!th->rst) { - NET_INC_STATS_BH(DelayedACKLost); - tcp_enter_quickack_mode(tp); - tcp_send_ack(sk); - } + if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, th->rst)) { + if (!th->rst) + tcp_send_dupack(sk, skb); goto discard; } @@ -3399,10 +3611,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; } - if (tp->saw_tstamp) { - tcp_replace_ts_recent(sk, tp, - TCP_SKB_CB(skb)->seq); - } + tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); /* step 3: check security and precedence [ignored] */ @@ -3423,47 +3632,51 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) { + NET_INC_STATS_BH(TCPAbortOnSyn); tcp_reset(sk); return 1; } /* step 5: check the ACK field */ if (th->ack) { - int acceptable = tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->ack_seq, len); + int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH); switch(sk->state) { case TCP_SYN_RECV: if (acceptable) { - tcp_set_state(sk, TCP_ESTABLISHED); tp->copied_seq = tp->rcv_nxt; + mb(); + tcp_set_state(sk, TCP_ESTABLISHED); /* Note, that this wakeup is only for marginal * crossed SYN case. Passively open sockets * are not waked up, because sk->sleep == NULL * and sk->socket == NULL. */ - if (!sk->dead) { + if (sk->socket) { sk->state_change(sk); sk_wake_async(sk,0,POLL_OUT); } tp->snd_una = TCP_SKB_CB(skb)->ack_seq; tp->snd_wnd = ntohs(th->window) << tp->snd_wscale; - tp->snd_wl1 = TCP_SKB_CB(skb)->seq; - tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq; + tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(skb)->seq); /* tcp_ack considers this ACK as duplicate * and does not calculate rtt. * Fix it at least with timestamps. */ if (tp->saw_tstamp && !tp->srtt) - tcp_ack_saw_tstamp(sk, tp, 0, 0, FLAG_SYN_ACKED); + tcp_ack_saw_tstamp(tp); + + if (tp->tstamp_ok) + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; tcp_init_metrics(sk); + tcp_initialize_rcv_mss(sk); + tcp_init_buffer_space(sk); tcp_fast_path_on(tp); } else { - SOCK_DEBUG(sk, "bad ack\n"); return 1; } break; @@ -3484,6 +3697,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { tcp_done(sk); + NET_INC_STATS_BH(TCPAbortOnData); return 1; } @@ -3543,6 +3757,7 @@ step6: if (sk->shutdown & RCV_SHUTDOWN) { if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { + NET_INC_STATS_BH(TCPAbortOnData); tcp_reset(sk); return 1; } @@ -3558,13 +3773,11 @@ step6: if (sk->state != TCP_CLOSE) { tcp_data_snd_check(sk); tcp_ack_snd_check(sk); - if (tp->sorry) - tcp_new_space(sk); } if (!queued) { discard: - kfree_skb(skb); + __kfree_skb(skb); } return 0; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d9f7dd80d..0c1e678ef 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.210 2000/07/26 01:04:19 davem Exp $ + * Version: $Id: tcp_ipv4.c,v 1.212 2000/08/18 17:10:04 davem Exp $ * * IPv4 specific functions * @@ -574,9 +574,8 @@ static int tcp_v4_check_established(struct sock *sk) fall back to VJ's scheme and use initial timestamp retrieved from peer table. 
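The hunk that follows changes how a new connection that collides with an old TIME-WAIT bucket picks its starting sequence number: write_seq now begins 65535+2 bytes past the old snd_nxt (rather than 2), keeping new data clear of any sequence space the peer could still associate with the previous incarnation, and a result of exactly 0 is bumped to 1, apparently because a zero write_seq reads as "not yet chosen" elsewhere. A stand-alone sketch of that arithmetic, with an invented helper name:

    #include <stdint.h>
    #include <inttypes.h>
    #include <stdio.h>

    /* Pick an initial send sequence for a connection reusing the 4-tuple
     * of an old TIME-WAIT socket: jump one maximal window past the old
     * snd_nxt (modulo 2^32), never returning 0. */
    static uint32_t reuse_isn(uint32_t old_snd_nxt)
    {
            uint32_t isn = old_snd_nxt + 65535 + 2;
            if (isn == 0)
                    isn = 1;
            return isn;
    }

    int main(void)
    {
            /* 0xFFFEFFFF + 65537 wraps to 0 and is bumped to 1. */
            printf("%" PRIu32 "\n", reuse_isn(0xFFFEFFFFu));
            return 0;
    }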
*/ - if (tw->substate == TCP_TIME_WAIT && - sysctl_tcp_tw_recycle && tw->ts_recent_stamp) { - if ((tp->write_seq = tw->snd_nxt + 2) == 0) + if (tw->ts_recent_stamp) { + if ((tp->write_seq = tw->snd_nxt+65535+2) == 0) tp->write_seq = 1; tp->ts_recent = tw->ts_recent; tp->ts_recent_stamp = tw->ts_recent_stamp; @@ -691,7 +690,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) daddr = rt->rt_dst; err = -ENOBUFS; - buff = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 0, GFP_KERNEL); + buff = alloc_skb(MAX_TCP_HEADER + 15, GFP_KERNEL); if (buff == NULL) goto failure; @@ -926,7 +925,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) * we have no reasons to ignore it. */ if (sk->lock.users == 0) - tcp_enter_cong_avoid(tp); + tcp_enter_cwr(tp); goto out; case ICMP_PARAMETERPROB: err = EPROTO; @@ -1296,7 +1295,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct tcp_opt tp; struct open_request *req; - struct tcphdr *th = skb->h.th; __u32 saddr = skb->nh.iph->saddr; __u32 daddr = skb->nh.iph->daddr; __u32 isn = TCP_SKB_CB(skb)->when; @@ -1341,7 +1339,15 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tp.mss_clamp = 536; tp.user_mss = sk->tp_pinfo.af_tcp.user_mss; - tcp_parse_options(NULL, th, &tp, want_cookie); + tcp_parse_options(skb, &tp); + + if (want_cookie) { + tp.sack_ok = 0; + tp.wscale_ok = 0; + tp.snd_wscale = 0; + tp.tstamp_ok = 0; + tp.saw_tstamp = 0; + } if (tp.saw_tstamp && tp.rcv_tsval == 0) { /* Some OSes (unknown ones, but I see them on web server, which @@ -1359,6 +1365,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) req->af.v4_req.rmt_addr = saddr; req->af.v4_req.opt = tcp_v4_save_options(sk, skb); req->class = &or_ipv4; + if (!want_cookie) + TCP_ECN_create_request(req, skb->h.th); if (want_cookie) { #ifdef CONFIG_SYN_COOKIES @@ -1384,8 +1392,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) peer->v4daddr == saddr) { if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL && (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) { - NETDEBUG(printk(KERN_DEBUG "TW_REC: reject openreq %u/%u %u.%u.%u.%u/%u\n", \ - peer->tcp_ts, req->ts_recent, NIPQUAD(saddr), ntohs(skb->h.th->source))); NET_INC_STATS_BH(PAWSPassiveRejected); dst_release(dst); goto drop_and_free; @@ -1470,10 +1476,8 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp->ext_header_len = newsk->protinfo.af_inet.opt->optlen; tcp_sync_mss(newsk, dst->pmtu); - tcp_initialize_rcv_mss(newsk); newtp->advmss = dst->advmss; - - tcp_init_buffer_space(newsk); + tcp_initialize_rcv_mss(newsk); __tcp_v4_hash(newsk); __tcp_inherit_port(sk, newsk); @@ -1493,33 +1497,30 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb) struct open_request *req, **prev; struct tcphdr *th = skb->h.th; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sock *nsk; /* Find possible connection requests. 
*/ req = tcp_v4_search_req(tp, skb->nh.iph, th, &prev); if (req) return tcp_check_req(sk, skb, req, prev); - if (tp->accept_queue) { - struct sock *nsk; - - nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr, - th->source, - skb->nh.iph->daddr, - ntohs(th->dest), - tcp_v4_iif(skb)); + nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr, + th->source, + skb->nh.iph->daddr, + ntohs(th->dest), + tcp_v4_iif(skb)); - if (nsk) { - if (nsk->state != TCP_TIME_WAIT) { - bh_lock_sock(nsk); - return nsk; - } - tcp_tw_put((struct tcp_tw_bucket*)sk); - return NULL; + if (nsk) { + if (nsk->state != TCP_TIME_WAIT) { + bh_lock_sock(nsk); + return nsk; } + tcp_tw_put((struct tcp_tw_bucket*)sk); + return NULL; } #ifdef CONFIG_SYN_COOKIES - if (!th->rst && (th->syn || th->ack)) + if (!th->rst && !th->syn && th->ack) sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); #endif return sk; @@ -1534,8 +1535,8 @@ static int tcp_v4_checksum_init(struct sk_buff *skb) return -1; } skb->ip_summed = CHECKSUM_UNNECESSARY; - } else if (skb->ip_summed != CHECKSUM_UNNECESSARY) { - if (skb->len <= 68) { + } else { + if (skb->len <= 76) { if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr, skb->nh.iph->daddr, csum_partial((char *)skb->h.th, skb->len, 0))) @@ -1576,7 +1577,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; } - if (tcp_checksum_complete(skb)) + if (skb->len < (skb->h.th->doff<<2) || tcp_checksum_complete(skb)) goto csum_err; if (sk->state == TCP_LISTEN) { @@ -1634,10 +1635,13 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) /* Count it even if it's bad */ TCP_INC_STATS_BH(TcpInSegs); - if (len < sizeof(struct tcphdr)) - goto bad_packet; - - if (tcp_v4_checksum_init(skb) < 0) + /* An explanation is required here, I think. + * Packet length and doff are validated by header prediction, + * provided case of th->doff==0 is elimineted. + * So, we defer the checks. */ + if (th->doff < sizeof(struct tcphdr)/4 || + (skb->ip_summed != CHECKSUM_UNNECESSARY && + tcp_v4_checksum_init(skb) < 0)) goto bad_packet; TCP_SKB_CB(skb)->seq = ntohl(th->seq); @@ -1645,6 +1649,8 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->when = 0; + TCP_SKB_CB(skb)->flags = skb->nh.iph->tos; + TCP_SKB_CB(skb)->sacked = 0; skb->used = 0; sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source, @@ -1674,7 +1680,7 @@ process: return ret; no_tcp_socket: - if (tcp_checksum_complete(skb)) { + if (len < (th->doff<<2) || tcp_checksum_complete(skb)) { bad_packet: TCP_INC_STATS_BH(TcpInErrs); } else { @@ -1691,7 +1697,7 @@ discard_and_relse: goto discard_it; do_time_wait: - if (tcp_checksum_complete(skb)) { + if (len < (th->doff<<2) || tcp_checksum_complete(skb)) { TCP_INC_STATS_BH(TcpInErrs); goto discard_and_relse; } @@ -1734,7 +1740,8 @@ int tcp_v4_rebuild_header(struct sock *sk) { struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); __u32 new_saddr; - int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT; + int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT && + !(sk->userlocks & SOCK_BINDADDR_LOCK); if (rt == NULL) { int err; @@ -1755,11 +1762,7 @@ int tcp_v4_rebuild_header(struct sock *sk) __sk_dst_set(sk, &rt->u.dst); } - /* Force route checking if want_rewrite. - * The idea is good, the implementation is disguisting. - * Well, if I made bind on this socket, you cannot randomly ovewrite - * its source address. --ANK - */ + /* Force route checking if want_rewrite. 
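The rebuild-header hunk above tightens the dynamic-address rewrite test: besides sysctl_ip_dynaddr being enabled and the socket still sitting in SYN-SENT, the local address must not have been locked by an explicit bind (the SOCK_BINDADDR_LOCK bit in sk->userlocks), which addresses the deleted remark about overwriting a bound source address. A hedged sketch of that predicate in isolation; the struct, state value, and flag value here are placeholders, not the kernel definitions:

    #include <stdbool.h>

    #define SOCK_BINDADDR_LOCK_BIT  0x1   /* placeholder bit value */
    #define STATE_SYN_SENT          2     /* placeholder state value */

    struct sk_stub {
            int          state;
            unsigned int userlocks;
    };

    /* Rewrite the source address on a dynamic-IP box only while the
     * connection is still being set up and only if the user never bound
     * a specific local address. */
    static bool want_dynaddr_rewrite(const struct sk_stub *sk,
                                     int sysctl_ip_dynaddr)
    {
            return sysctl_ip_dynaddr &&
                   sk->state == STATE_SYN_SENT &&
                   !(sk->userlocks & SOCK_BINDADDR_LOCK_BIT);
    }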
*/ if (want_rewrite) { int tmp; struct rtable *new_rt; @@ -1932,12 +1935,19 @@ static int tcp_v4_init_sock(struct sock *sk) tp->snd_cwnd_clamp = ~0; tp->mss_cache = 536; + tp->reordering = sysctl_tcp_reordering; + sk->state = TCP_CLOSE; sk->write_space = tcp_write_space; sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific; + sk->sndbuf = sysctl_tcp_wmem[1]; + sk->rcvbuf = sysctl_tcp_rmem[1]; + + atomic_inc(&tcp_sockets_allocated); + return 0; } @@ -1948,7 +1958,7 @@ static int tcp_v4_destroy_sock(struct sock *sk) tcp_clear_xmit_timers(sk); /* Cleanup up the write buffer. */ - __skb_queue_purge(&sk->write_queue); + tcp_writequeue_purge(sk); /* Cleans up our, hopefuly empty, out_of_order_queue. */ __skb_queue_purge(&tp->out_of_order_queue); @@ -1960,11 +1970,13 @@ static int tcp_v4_destroy_sock(struct sock *sk) if(sk->prev != NULL) tcp_put_port(sk); + atomic_dec(&tcp_sockets_allocated); + return 0; } /* Proc filesystem TCP sock list dumping. */ -static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i) +static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i, int uid) { int ttd = req->expires - jiffies; @@ -1980,7 +1992,7 @@ static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, 1, /* timers active (only the expire timer) */ ttd, req->retrans, - sk->socket ? sk->socket->inode->i_uid : 0, + uid, 0, /* non standard timer */ 0, /* open_requests have no inode */ atomic_read(&sk->refcnt), @@ -2000,33 +2012,31 @@ static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i) src = sp->rcv_saddr; destp = ntohs(sp->dport); srcp = ntohs(sp->sport); - timer_active = 0; - timer_expires = (unsigned) -1; - if (timer_pending(&tp->retransmit_timer) && tp->retransmit_timer.expires < timer_expires) { + if (tp->pending == TCP_TIME_RETRANS) { timer_active = 1; - timer_expires = tp->retransmit_timer.expires; - } else if (timer_pending(&tp->probe_timer) && tp->probe_timer.expires < timer_expires) { + timer_expires = tp->timeout; + } else if (tp->pending == TCP_TIME_PROBE0) { timer_active = 4; - timer_expires = tp->probe_timer.expires; - } - if (timer_pending(&sp->timer) && sp->timer.expires < timer_expires) { + timer_expires = tp->timeout; + } else if (timer_pending(&sp->timer)) { timer_active = 2; timer_expires = sp->timer.expires; - } - if(timer_active == 0) + } else { + timer_active = 0; timer_expires = jiffies; + } sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u", + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u %d", i, src, srcp, dest, destp, sp->state, tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq, timer_active, timer_expires-jiffies, tp->retransmits, - sp->socket ? sp->socket->inode->i_uid : 0, + sock_i_uid(sp), tp->probes_out, - sp->socket ? 
sp->socket->inode->i_ino : 0, + sock_i_ino(sp), atomic_read(&sp->refcnt), sp, - tp->rto, tp->ack.ato, tp->ack.quick, tp->ack.pingpong + tp->rto, tp->ack.ato, tp->ack.quick, tp->ack.pingpong, sp->sndbuf ); } @@ -2051,18 +2061,20 @@ static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i) atomic_read(&tw->refcnt), tw); } +#define TMPSZ 150 + int tcp_get_info(char *buffer, char **start, off_t offset, int length) { int len = 0, num = 0, i; off_t begin, pos = 0; - char tmpbuf[129]; + char tmpbuf[TMPSZ+1]; - if (offset < 128) - len += sprintf(buffer, "%-127s\n", + if (offset < TMPSZ) + len += sprintf(buffer, "%-*s\n", TMPSZ-1, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout inode"); - pos = 128; + pos = TMPSZ; /* First, walk listening socket table. */ tcp_listen_lock(); @@ -2073,15 +2085,16 @@ int tcp_get_info(char *buffer, char **start, off_t offset, int length) for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) { struct open_request *req; + int uid; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); if (!TCP_INET_FAMILY(sk->family)) goto skip_listen; - pos += 128; + pos += TMPSZ; if (pos >= offset) { get_tcp_sock(sk, tmpbuf, num); - len += sprintf(buffer+len, "%-127s\n", tmpbuf); + len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf); if (len >= length) { tcp_listen_unlock(); goto out_no_bh; @@ -2089,6 +2102,7 @@ int tcp_get_info(char *buffer, char **start, off_t offset, int length) } skip_listen: + uid = sock_i_uid(sk); read_lock_bh(&tp->syn_wait_lock); lopt = tp->listen_opt; if (lopt && lopt->qlen != 0) { @@ -2097,11 +2111,11 @@ skip_listen: if (!TCP_INET_FAMILY(req->class->family)) continue; - pos += 128; + pos += TMPSZ; if (pos < offset) continue; - get_openreq(sk, req, tmpbuf, num); - len += sprintf(buffer+len, "%-127s\n", tmpbuf); + get_openreq(sk, req, tmpbuf, num, uid); + len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf); if(len >= length) { read_unlock_bh(&tp->syn_wait_lock); tcp_listen_unlock(); @@ -2129,11 +2143,11 @@ skip_listen: for(sk = head->chain; sk; sk = sk->next, num++) { if (!TCP_INET_FAMILY(sk->family)) continue; - pos += 128; + pos += TMPSZ; if (pos < offset) continue; get_tcp_sock(sk, tmpbuf, num); - len += sprintf(buffer+len, "%-127s\n", tmpbuf); + len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf); if(len >= length) { read_unlock(&head->lock); goto out; @@ -2144,11 +2158,11 @@ skip_listen: tw = (struct tcp_tw_bucket *)tw->next, num++) { if (!TCP_INET_FAMILY(tw->family)) continue; - pos += 128; + pos += TMPSZ; if (pos < offset) continue; get_timewait_sock(tw, tmpbuf, num); - len += sprintf(buffer+len, "%-127s\n", tmpbuf); + len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf); if(len >= length) { read_unlock(&head->lock); goto out; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c new file mode 100644 index 000000000..ef7fc36cb --- /dev/null +++ b/net/ipv4/tcp_minisocks.c @@ -0,0 +1,970 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: $Id: tcp_minisocks.c,v 1.1 2000/08/09 11:59:04 davem Exp $ + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Mark Evans, <evansmp@uhura.aston.ac.uk> + * Corey Minyard <wf-rch!minyard@relay.EU.net> + * Florian La Roche, <flla@stud.uni-sb.de> + * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> + * Linus Torvalds, <torvalds@cs.helsinki.fi> + * Alan Cox, <gw4pts@gw4pts.ampr.org> + * Matthew Dillon, <dillon@apollo.west.oic.com> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Jorge Cwik, <jorge@laser.satlink.net> + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/sysctl.h> +#include <net/tcp.h> +#include <net/inet_common.h> + +#ifdef CONFIG_SYSCTL +#define SYNC_INIT 0 /* let the user enable it */ +#else +#define SYNC_INIT 1 +#endif + +int sysctl_tcp_tw_recycle = 0; +int sysctl_tcp_max_tw_buckets = NR_FILE*2; + +int sysctl_tcp_syncookies = SYNC_INIT; +int sysctl_tcp_abort_on_overflow = 0; + +static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) +{ + if (seq == s_win) + return 1; + if (after(end_seq, s_win) && before(seq, e_win)) + return 1; + return (seq == e_win && seq == end_seq); +} + +/* New-style handling of TIME_WAIT sockets. */ + +int tcp_tw_count = 0; + + +/* Must be called with locally disabled BHs. */ +void tcp_timewait_kill(struct tcp_tw_bucket *tw) +{ + struct tcp_ehash_bucket *ehead; + struct tcp_bind_hashbucket *bhead; + struct tcp_bind_bucket *tb; + + /* Unlink from established hashes. */ + ehead = &tcp_ehash[tw->hashent]; + write_lock(&ehead->lock); + if (!tw->pprev) { + write_unlock(&ehead->lock); + return; + } + if(tw->next) + tw->next->pprev = tw->pprev; + *(tw->pprev) = tw->next; + tw->pprev = NULL; + write_unlock(&ehead->lock); + + /* Disassociate with bind bucket. */ + bhead = &tcp_bhash[tcp_bhashfn(tw->num)]; + spin_lock(&bhead->lock); + if ((tb = tw->tb) != NULL) { + if(tw->bind_next) + tw->bind_next->bind_pprev = tw->bind_pprev; + *(tw->bind_pprev) = tw->bind_next; + tw->tb = NULL; + if (tb->owners == NULL) { + if (tb->next) + tb->next->pprev = tb->pprev; + *(tb->pprev) = tb->next; + kmem_cache_free(tcp_bucket_cachep, tb); + } + } + spin_unlock(&bhead->lock); + +#ifdef INET_REFCNT_DEBUG + if (atomic_read(&tw->refcnt) != 1) { + printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw, atomic_read(&tw->refcnt)); + } +#endif + tcp_tw_put(tw); +} + +/* + * * Main purpose of TIME-WAIT state is to close connection gracefully, + * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN + * (and, probably, tail of data) and one or more our ACKs are lost. + * * What is TIME-WAIT timeout? It is associated with maximal packet + * lifetime in the internet, which results in wrong conclusion, that + * it is set to catch "old duplicate segments" wandering out of their path. + * It is not quite correct. This timeout is calculated so that it exceeds + * maximal retransmision timeout enough to allow to lose one (or more) + * segments sent by peer and our ACKs. This time may be calculated from RTO. + * * When TIME-WAIT socket receives RST, it means that another end + * finally closed and we are allowed to kill TIME-WAIT too. + * * Second purpose of TIME-WAIT is catching old duplicate segments. + * Well, certainly it is pure paranoia, but if we load TIME-WAIT + * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs. + * * If we invented some more clever way to catch duplicates + * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs. + * + * The algorithm below is based on FORMAL INTERPRETATION of RFCs. 
+ * When you compare it to RFCs, please, read section SEGMENT ARRIVES + * from the very beginning. + * + * NOTE. With recycling (and later with fin-wait-2) TW bucket + * is _not_ stateless. It means, that strictly speaking we must + * spinlock it. I do not want! Well, probability of misbehaviour + * is ridiculously low and, seems, we could use some mb() tricks + * to avoid misread sequence numbers, states etc. --ANK + */ +enum tcp_tw_status +tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, + struct tcphdr *th, unsigned len) +{ + struct tcp_opt tp; + int paws_reject = 0; + + tp.saw_tstamp = 0; + if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) { + tcp_parse_options(skb, &tp); + + if (tp.saw_tstamp) { + tp.ts_recent = tw->ts_recent; + tp.ts_recent_stamp = tw->ts_recent_stamp; + paws_reject = tcp_paws_check(&tp, th->rst); + } + } + + if (tw->substate == TCP_FIN_WAIT2) { + /* Just repeat all the checks of tcp_rcv_state_process() */ + + /* Out of window, send ACK */ + if (paws_reject || + !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, + tw->rcv_nxt, tw->rcv_nxt + tw->rcv_wnd)) + return TCP_TW_ACK; + + if (th->rst) + goto kill; + + if (th->syn && TCP_SKB_CB(skb)->seq != tw->syn_seq) + goto kill_with_rst; + + /* Dup ACK? */ + if (!after(TCP_SKB_CB(skb)->end_seq, tw->rcv_nxt) || + TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { + tcp_tw_put(tw); + return TCP_TW_SUCCESS; + } + + /* New data or FIN. If new data arrive after half-duplex close, + * reset. + */ + if (!th->fin || TCP_SKB_CB(skb)->end_seq != tw->rcv_nxt+1) { +kill_with_rst: + tcp_tw_deschedule(tw); + tcp_timewait_kill(tw); + tcp_tw_put(tw); + return TCP_TW_RST; + } + + /* FIN arrived, enter true time-wait state. */ + tw->substate = TCP_TIME_WAIT; + tw->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if (tp.saw_tstamp) { + tw->ts_recent_stamp = xtime.tv_sec; + tw->ts_recent = tp.rcv_tsval; + } + + /* I am shamed, but failed to make it more elegant. + * Yes, it is direct reference to IP, which is impossible + * to generalize to IPv6. Taking into account that IPv6 + * do not undertsnad recycling in any case, it not + * a big problem in practice. --ANK */ + if (tw->family == AF_INET && + sysctl_tcp_tw_recycle && tw->ts_recent_stamp && + tcp_v4_tw_remember_stamp(tw)) + tcp_tw_schedule(tw, tw->timeout); + else + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); + return TCP_TW_ACK; + } + + /* + * Now real TIME-WAIT state. + * + * RFC 1122: + * "When a connection is [...] on TIME-WAIT state [...] + * [a TCP] MAY accept a new SYN from the remote TCP to + * reopen the connection directly, if it: + * + * (1) assigns its initial sequence number for the new + * connection to be larger than the largest sequence + * number it used on the previous connection incarnation, + * and + * + * (2) returns to TIME-WAIT state if the SYN turns out + * to be an old duplicate". + */ + + if (!paws_reject && + (TCP_SKB_CB(skb)->seq == tw->rcv_nxt && + (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { + /* In window segment, it may be only reset or bare ack. */ + + if (th->rst) { + /* This is TIME_WAIT assasination, in two flavors. + * Oh well... nobody has a sufficient solution to this + * protocol bug yet. 
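The "two flavors" referred to above are the RFC 1337 TIME-WAIT assassination hazards, and the code that follows keys its reaction to an in-window RST off sysctl_tcp_rfc1337. An illustrative decision helper, not the kernel routine itself:

    enum tw_rst_action { TW_KILL_NOW, TW_IGNORE_RST };

    /* RFC 1337: with the sysctl left at 0, an in-window RST releases the
     * TIME-WAIT bucket immediately; with it set, the RST is ignored and
     * the bucket lives out its timer. */
    static enum tw_rst_action tw_on_in_window_rst(int sysctl_tcp_rfc1337)
    {
            return sysctl_tcp_rfc1337 ? TW_IGNORE_RST : TW_KILL_NOW;
    }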
+ */ + if (sysctl_tcp_rfc1337 == 0) { +kill: + tcp_tw_deschedule(tw); + tcp_timewait_kill(tw); + tcp_tw_put(tw); + return TCP_TW_SUCCESS; + } + } + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); + + if (tp.saw_tstamp) { + tw->ts_recent = tp.rcv_tsval; + tw->ts_recent_stamp = xtime.tv_sec; + } + + tcp_tw_put(tw); + return TCP_TW_SUCCESS; + } + + /* Out of window segment. + + All the segments are ACKed immediately. + + The only exception is new SYN. We accept it, if it is + not old duplicate and we are not in danger to be killed + by delayed old duplicates. RFC check is that it has + newer sequence number works at rates <40Mbit/sec. + However, if paws works, it is reliable AND even more, + we even may relax silly seq space cutoff. + + RED-PEN: we violate main RFC requirement, if this SYN will appear + old duplicate (i.e. we receive RST in reply to SYN-ACK), + we must return socket to time-wait state. It is not good, + but not fatal yet. + */ + + if (th->syn && !th->rst && !th->ack && !paws_reject && + (after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt) || + (tp.saw_tstamp && (s32)(tw->ts_recent - tp.rcv_tsval) < 0))) { + u32 isn = tw->snd_nxt+65535+2; + if (isn == 0) + isn++; + TCP_SKB_CB(skb)->when = isn; + return TCP_TW_SYN; + } + + if (paws_reject) + NET_INC_STATS_BH(PAWSEstabRejected); + + if(!th->rst) { + /* In this case we must reset the TIMEWAIT timer. + * + * If it is ACKless SYN it may be both old duplicate + * and new good SYN with random sequence number <rcv_nxt. + * Do not reschedule in the last case. + */ + if (paws_reject || th->ack) + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); + + /* Send ACK. Note, we do not put the bucket, + * it will be released by caller. + */ + return TCP_TW_ACK; + } + tcp_tw_put(tw); + return TCP_TW_SUCCESS; +} + +/* Enter the time wait state. This is called with locally disabled BH. + * Essentially we whip up a timewait bucket, copy the + * relevant info into it from the SK, and mess with hash chains + * and list linkage. + */ +static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) +{ + struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->hashent]; + struct tcp_bind_hashbucket *bhead; + struct sock **head, *sktw; + + write_lock(&ehead->lock); + + /* Step 1: Remove SK from established hash. */ + if (sk->pprev) { + if(sk->next) + sk->next->pprev = sk->pprev; + *sk->pprev = sk->next; + sk->pprev = NULL; + sock_prot_dec_use(sk->prot); + } + + /* Step 2: Hash TW into TIMEWAIT half of established hash table. */ + head = &(ehead + tcp_ehash_size)->chain; + sktw = (struct sock *)tw; + if((sktw->next = *head) != NULL) + (*head)->pprev = &sktw->next; + *head = sktw; + sktw->pprev = head; + atomic_inc(&tw->refcnt); + + write_unlock(&ehead->lock); + + /* Step 3: Put TW into bind hash. Original socket stays there too. + Note, that any socket with sk->num!=0 MUST be bound in binding + cache, even if it is closed. + */ + bhead = &tcp_bhash[tcp_bhashfn(sk->num)]; + spin_lock(&bhead->lock); + tw->tb = (struct tcp_bind_bucket *)sk->prev; + BUG_TRAP(sk->prev!=NULL); + if ((tw->bind_next = tw->tb->owners) != NULL) + tw->tb->owners->bind_pprev = &tw->bind_next; + tw->tb->owners = (struct sock*)tw; + tw->bind_pprev = &tw->tb->owners; + spin_unlock(&bhead->lock); +} + +/* + * Move a socket to time-wait or dead fin-wait-2 state. 
+ */ +void tcp_time_wait(struct sock *sk, int state, int timeo) +{ + struct tcp_tw_bucket *tw = NULL; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int recycle_ok = 0; + + if (sysctl_tcp_tw_recycle && tp->ts_recent_stamp) + recycle_ok = tp->af_specific->remember_stamp(sk); + + if (tcp_tw_count < sysctl_tcp_max_tw_buckets) + tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); + + if(tw != NULL) { + int rto = (tp->rto<<2) - (tp->rto>>1); + + /* Give us an identity. */ + tw->daddr = sk->daddr; + tw->rcv_saddr = sk->rcv_saddr; + tw->bound_dev_if= sk->bound_dev_if; + tw->num = sk->num; + tw->state = TCP_TIME_WAIT; + tw->substate = state; + tw->sport = sk->sport; + tw->dport = sk->dport; + tw->family = sk->family; + tw->reuse = sk->reuse; + tw->rcv_wscale = tp->rcv_wscale; + atomic_set(&tw->refcnt, 0); + + tw->hashent = sk->hashent; + tw->rcv_nxt = tp->rcv_nxt; + tw->snd_nxt = tp->snd_nxt; + tw->rcv_wnd = tcp_receive_window(tp); + tw->syn_seq = tp->syn_seq; + tw->ts_recent = tp->ts_recent; + tw->ts_recent_stamp= tp->ts_recent_stamp; + tw->pprev_death = NULL; + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if(tw->family == PF_INET6) { + memcpy(&tw->v6_daddr, + &sk->net_pinfo.af_inet6.daddr, + sizeof(struct in6_addr)); + memcpy(&tw->v6_rcv_saddr, + &sk->net_pinfo.af_inet6.rcv_saddr, + sizeof(struct in6_addr)); + } +#endif + /* Linkage updates. */ + __tcp_tw_hashdance(sk, tw); + + /* Get the TIME_WAIT timeout firing. */ + if (timeo < rto) + timeo = rto; + + if (recycle_ok) { + tw->timeout = rto; + } else { + tw->timeout = TCP_TIMEWAIT_LEN; + if (state == TCP_TIME_WAIT) + timeo = TCP_TIMEWAIT_LEN; + } + + tcp_tw_schedule(tw, timeo); + } else { + /* Sorry, if we're out of memory, just CLOSE this + * socket up. We've got bigger problems than + * non-graceful socket closings. + */ + if (net_ratelimit()) + printk(KERN_INFO "TCP: time wait bucket table overflow\n"); + } + + tcp_update_metrics(sk); + tcp_done(sk); +} + +/* Kill off TIME_WAIT sockets once their lifetime has expired. */ +static int tcp_tw_death_row_slot = 0; + +static void tcp_twkill(unsigned long); + +static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS]; +static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED; +static struct timer_list tcp_tw_timer = { function: tcp_twkill }; + +static void SMP_TIMER_NAME(tcp_twkill)(unsigned long dummy) +{ + struct tcp_tw_bucket *tw; + int killed = 0; + + /* NOTE: compare this to previous version where lock + * was released after detaching chain. It was racy, + * because tw buckets are scheduled in not serialized context + * in 2.3 (with netfilter), and with softnet it is common, because + * soft irqs are not sequenced. + */ + spin_lock(&tw_death_lock); + + if (tcp_tw_count == 0) + goto out; + + while((tw = tcp_tw_death_row[tcp_tw_death_row_slot]) != NULL) { + tcp_tw_death_row[tcp_tw_death_row_slot] = tw->next_death; + tw->pprev_death = NULL; + spin_unlock(&tw_death_lock); + + tcp_timewait_kill(tw); + tcp_tw_put(tw); + + killed++; + + spin_lock(&tw_death_lock); + } + tcp_tw_death_row_slot = + ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); + + if ((tcp_tw_count -= killed) != 0) + mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); + net_statistics[smp_processor_id()*2].TimeWaited += killed; +out: + spin_unlock(&tw_death_lock); +} + +SMP_TIMER_DEFINE(tcp_twkill, tcp_twkill_task); + +/* These are always called from BH context. See callers in + * tcp_input.c to verify this. + */ + +/* This is for handling early-kills of TIME_WAIT sockets. 
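tcp_tw_deschedule, shown next, unlinks a bucket with the same "pprev" idiom used by the death row and hash chains above: each node stores the address of whatever pointer points at it, so removal is O(1) and needs no special case for the list head. A generic, self-contained sketch of that idiom (toy node type, not the tw bucket):

    #include <stddef.h>

    struct node {
            struct node  *next;
            struct node **pprev;   /* address of the pointer pointing at us */
    };

    /* Insert at the head of the list rooted at *head. */
    static void node_link(struct node *n, struct node **head)
    {
            if ((n->next = *head) != NULL)
                    (*head)->pprev = &n->next;
            *head = n;
            n->pprev = head;
    }

    /* Remove from whatever list the node is on. */
    static void node_unlink(struct node *n)
    {
            if (n->pprev) {
                    if (n->next)
                            n->next->pprev = n->pprev;
                    *n->pprev = n->next;
                    n->pprev = NULL;
            }
    }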
*/ +void tcp_tw_deschedule(struct tcp_tw_bucket *tw) +{ + spin_lock(&tw_death_lock); + if (tw->pprev_death) { + if(tw->next_death) + tw->next_death->pprev_death = tw->pprev_death; + *tw->pprev_death = tw->next_death; + tw->pprev_death = NULL; + tcp_tw_put(tw); + if (--tcp_tw_count == 0) + del_timer(&tcp_tw_timer); + } + spin_unlock(&tw_death_lock); +} + +/* Short-time timewait calendar */ + +static int tcp_twcal_hand = -1; +static int tcp_twcal_jiffie; +static void tcp_twcal_tick(unsigned long); +static struct timer_list tcp_twcal_timer = {function: tcp_twcal_tick}; +static struct tcp_tw_bucket *tcp_twcal_row[TCP_TW_RECYCLE_SLOTS]; + +void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo) +{ + struct tcp_tw_bucket **tpp; + int slot; + + /* timeout := RTO * 3.5 + * + * 3.5 = 1+2+0.5 to wait for two retransmits. + * + * RATIONALE: if FIN arrived and we entered TIME-WAIT state, + * our ACK acking that FIN can be lost. If N subsequent retransmitted + * FINs (or previous seqments) are lost (probability of such event + * is p^(N+1), where p is probability to lose single packet and + * time to detect the loss is about RTO*(2^N - 1) with exponential + * backoff). Normal timewait length is calculated so, that we + * waited at least for one retransmitted FIN (maximal RTO is 120sec). + * [ BTW Linux. following BSD, violates this requirement waiting + * only for 60sec, we should wait at least for 240 secs. + * Well, 240 consumes too much of resources 8) + * ] + * This interval is not reduced to catch old duplicate and + * responces to our wandering segments living for two MSLs. + * However, if we use PAWS to detect + * old duplicates, we can reduce the interval to bounds required + * by RTO, rather than MSL. So, if peer understands PAWS, we + * kill tw bucket after 3.5*RTO (it is important that this number + * is greater than TS tick!) and detect old duplicates with help + * of PAWS. 
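The 1 + 2 + 0.5 RTO budget derived above is exactly what the earlier `(tp->rto<<2) - (tp->rto>>1)` expression computes: 4*RTO minus RTO/2, i.e. 3.5*RTO in integer arithmetic; the statement that follows then rounds the timeout up to whole recycle-timer slots. A small stand-alone check of both calculations (the tick shift value here is illustrative only, not the kernel's TCP_TW_RECYCLE_TICK):

    #include <stdio.h>

    #define TW_RECYCLE_TICK 7   /* illustrative: one slot = 128 jiffies */

    int main(void)
    {
            int rto   = 200;                         /* jiffies */
            int timeo = (rto << 2) - (rto >> 1);     /* 4*rto - rto/2 = 3.5*rto */
            int slot  = (timeo + (1 << TW_RECYCLE_TICK) - 1) >> TW_RECYCLE_TICK;

            printf("timeout = %d jiffies (3.5 * %d = %d)\n",
                   timeo, rto, 7 * rto / 2);
            printf("rounded up to %d slots of %d jiffies\n",
                   slot, 1 << TW_RECYCLE_TICK);
            return 0;
    }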
+ */ + slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK; + + spin_lock(&tw_death_lock); + + /* Unlink it, if it was scheduled */ + if (tw->pprev_death) { + if(tw->next_death) + tw->next_death->pprev_death = tw->pprev_death; + *tw->pprev_death = tw->next_death; + tw->pprev_death = NULL; + tcp_tw_count--; + } else + atomic_inc(&tw->refcnt); + + if (slot >= TCP_TW_RECYCLE_SLOTS) { + /* Schedule to slow timer */ + if (timeo >= TCP_TIMEWAIT_LEN) { + slot = TCP_TWKILL_SLOTS-1; + } else { + slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD; + if (slot >= TCP_TWKILL_SLOTS) + slot = TCP_TWKILL_SLOTS-1; + } + tw->ttd = jiffies + timeo; + slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1); + tpp = &tcp_tw_death_row[slot]; + } else { + tw->ttd = jiffies + (slot<<TCP_TW_RECYCLE_TICK); + + if (tcp_twcal_hand < 0) { + tcp_twcal_hand = 0; + tcp_twcal_jiffie = jiffies; + tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK); + add_timer(&tcp_twcal_timer); + } else { + if ((long)(tcp_twcal_timer.expires - jiffies) > (slot<<TCP_TW_RECYCLE_TICK)) + mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK)); + slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1); + } + tpp = &tcp_twcal_row[slot]; + } + + if((tw->next_death = *tpp) != NULL) + (*tpp)->pprev_death = &tw->next_death; + *tpp = tw; + tw->pprev_death = tpp; + + if (tcp_tw_count++ == 0) + mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); + spin_unlock(&tw_death_lock); +} + +void SMP_TIMER_NAME(tcp_twcal_tick)(unsigned long dummy) +{ + int n, slot; + unsigned long j; + unsigned long now = jiffies; + int killed = 0; + int adv = 0; + + spin_lock(&tw_death_lock); + if (tcp_twcal_hand < 0) + goto out; + + slot = tcp_twcal_hand; + j = tcp_twcal_jiffie; + + for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) { + if ((long)(j - now) <= 0) { + struct tcp_tw_bucket *tw; + + while((tw = tcp_twcal_row[slot]) != NULL) { + tcp_twcal_row[slot] = tw->next_death; + tw->pprev_death = NULL; + + tcp_timewait_kill(tw); + tcp_tw_put(tw); + killed++; + } + } else { + if (!adv) { + adv = 1; + tcp_twcal_jiffie = j; + tcp_twcal_hand = slot; + } + + if (tcp_twcal_row[slot] != NULL) { + mod_timer(&tcp_twcal_timer, j); + goto out; + } + } + j += (1<<TCP_TW_RECYCLE_TICK); + slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1); + } + tcp_twcal_hand = -1; + +out: + if ((tcp_tw_count -= killed) == 0) + del_timer(&tcp_tw_timer); + net_statistics[smp_processor_id()*2].TimeWaitKilled += killed; + spin_unlock(&tw_death_lock); +} + +SMP_TIMER_DEFINE(tcp_twcal_tick, tcp_twcal_tasklet); + + +/* This is not only more efficient than what we used to do, it eliminates + * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM + * + * Actually, we could lots of memory writes here. tp of listening + * socket contains all necessary default parameters. 
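tcp_create_openreq_child, which follows, builds the new socket by memcpy()-ing the listener so that every default is inherited in one write, then patches only the per-connection fields. A stripped-down sketch of that clone-then-override pattern with a toy structure (field names and values are invented for the example):

    #include <stdlib.h>
    #include <string.h>
    #include <stdint.h>

    struct mini_sock {
            int      state;
            uint16_t dport;
            uint32_t rcv_nxt;
            uint32_t snd_nxt;
            int      reordering;   /* example of an inherited default */
    };

    /* Clone the listening socket as a template, then set only what is
     * specific to this connection request. */
    static struct mini_sock *openreq_child(const struct mini_sock *listener,
                                           uint16_t rmt_port,
                                           uint32_t rcv_isn, uint32_t snt_isn)
    {
            struct mini_sock *child = malloc(sizeof(*child));

            if (!child)
                    return NULL;
            memcpy(child, listener, sizeof(*child)); /* inherit defaults */
            child->state   = 2;                      /* "SYN_RECV" placeholder */
            child->dport   = rmt_port;
            child->rcv_nxt = rcv_isn + 1;
            child->snd_nxt = snt_isn + 1;
            return child;
    }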
+ */ +struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb) +{ + struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0); + + if(newsk != NULL) { + struct tcp_opt *newtp; +#ifdef CONFIG_FILTER + struct sk_filter *filter; +#endif + + memcpy(newsk, sk, sizeof(*newsk)); + newsk->state = TCP_SYN_RECV; + + /* SANITY */ + newsk->pprev = NULL; + newsk->prev = NULL; + + /* Clone the TCP header template */ + newsk->dport = req->rmt_port; + + sock_lock_init(newsk); + bh_lock_sock(newsk); + + atomic_set(&newsk->rmem_alloc, 0); + skb_queue_head_init(&newsk->receive_queue); + atomic_set(&newsk->wmem_alloc, 0); + skb_queue_head_init(&newsk->write_queue); + atomic_set(&newsk->omem_alloc, 0); + newsk->wmem_queued = 0; + newsk->forward_alloc = 0; + + newsk->done = 0; + newsk->proc = 0; + newsk->backlog.head = newsk->backlog.tail = NULL; + skb_queue_head_init(&newsk->error_queue); + newsk->write_space = tcp_write_space; +#ifdef CONFIG_FILTER + if ((filter = newsk->filter) != NULL) + sk_filter_charge(newsk, filter); +#endif + + /* Now setup tcp_opt */ + newtp = &(newsk->tp_pinfo.af_tcp); + newtp->pred_flags = 0; + newtp->rcv_nxt = req->rcv_isn + 1; + newtp->snd_nxt = req->snt_isn + 1; + newtp->snd_una = req->snt_isn + 1; + newtp->snd_sml = req->snt_isn + 1; + + tcp_delack_init(newtp); + + tcp_prequeue_init(newtp); + + tcp_init_wl(newtp, req->snt_isn, req->rcv_isn); + + newtp->retransmits = 0; + newtp->backoff = 0; + newtp->srtt = 0; + newtp->mdev = TCP_TIMEOUT_INIT; + newtp->rto = TCP_TIMEOUT_INIT; + + newtp->packets_out = 0; + newtp->left_out = 0; + newtp->retrans_out = 0; + newtp->sacked_out = 0; + newtp->fackets_out = 0; + newtp->snd_ssthresh = 0x7fffffff; + + /* So many TCP implementations out there (incorrectly) count the + * initial SYN frame in their delayed-ACK and congestion control + * algorithms that we must have the following bandaid to talk + * efficiently to them. -DaveM + */ + newtp->snd_cwnd = 2; + newtp->snd_cwnd_cnt = 0; + + newtp->ca_state = TCP_CA_Open; + tcp_init_xmit_timers(newsk); + skb_queue_head_init(&newtp->out_of_order_queue); + newtp->send_head = NULL; + newtp->rcv_wup = req->rcv_isn + 1; + newtp->write_seq = req->snt_isn + 1; + newtp->pushed_seq = newtp->write_seq; + newtp->copied_seq = req->rcv_isn + 1; + + newtp->saw_tstamp = 0; + + newtp->dsack = 0; + newtp->eff_sacks = 0; + + newtp->probes_out = 0; + newtp->num_sacks = 0; + newtp->syn_seq = req->rcv_isn; + newtp->fin_seq = req->rcv_isn; + newtp->urg_data = 0; + newtp->listen_opt = NULL; + newtp->accept_queue = newtp->accept_queue_tail = NULL; + /* Deinitialize syn_wait_lock to trap illegal accesses. */ + memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock)); + + /* Back to base struct sock members. 
*/ + newsk->err = 0; + newsk->priority = 0; + atomic_set(&newsk->refcnt, 1); +#ifdef INET_REFCNT_DEBUG + atomic_inc(&inet_sock_nr); +#endif + atomic_inc(&tcp_sockets_allocated); + + if (newsk->keepopen) + tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); + newsk->socket = NULL; + newsk->sleep = NULL; + + newtp->tstamp_ok = req->tstamp_ok; + if((newtp->sack_ok = req->sack_ok) != 0) { + if (sysctl_tcp_fack) + newtp->sack_ok |= 2; + } + newtp->window_clamp = req->window_clamp; + newtp->rcv_ssthresh = req->rcv_wnd; + newtp->rcv_wnd = req->rcv_wnd; + newtp->wscale_ok = req->wscale_ok; + if (newtp->wscale_ok) { + newtp->snd_wscale = req->snd_wscale; + newtp->rcv_wscale = req->rcv_wscale; + } else { + newtp->snd_wscale = newtp->rcv_wscale = 0; + newtp->window_clamp = min(newtp->window_clamp,65535); + } + newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->snd_wscale; + newtp->max_window = newtp->snd_wnd; + + if (newtp->tstamp_ok) { + newtp->ts_recent = req->ts_recent; + newtp->ts_recent_stamp = xtime.tv_sec; + newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + } else { + newtp->ts_recent_stamp = 0; + newtp->tcp_header_len = sizeof(struct tcphdr); + } + if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) + newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len; + newtp->mss_clamp = req->mss; + TCP_ECN_openreq_child(newtp, req); + } + return newsk; +} + +/* + * Process an incoming packet for SYN_RECV sockets represented + * as an open_request. + */ + +struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, + struct open_request *req, + struct open_request **prev) +{ + struct tcphdr *th = skb->h.th; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); + int paws_reject = 0; + struct tcp_opt ttp; + struct sock *child; + + ttp.saw_tstamp = 0; + if (th->doff > (sizeof(struct tcphdr)>>2)) { + tcp_parse_options(skb, &ttp); + + if (ttp.saw_tstamp) { + ttp.ts_recent = req->ts_recent; + /* We do not store true stamp, but it is not required, + * it can be estimated (approximately) + * from another data. + */ + ttp.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); + paws_reject = tcp_paws_check(&ttp, th->rst); + } + } + + /* Check for pure retransmited SYN. */ + if (TCP_SKB_CB(skb)->seq == req->rcv_isn && + flg == TCP_FLAG_SYN && + !paws_reject) { + /* + * RFC793 draws (Incorrectly! It was fixed in RFC1122) + * this case on figure 6 and figure 8, but formal + * protocol description says NOTHING. + * To be more exact, it says that we should send ACK, + * because this segment (at least, if it has no data) + * is out of window. + * + * CONCLUSION: RFC793 (even with RFC1122) DOES NOT + * describe SYN-RECV state. All the description + * is wrong, we cannot believe to it and should + * rely only on common sense and implementation + * experience. + * + * Enforce "SYN-ACK" according to figure 8, figure 6 + * of RFC793, fixed by RFC1122. + */ + req->class->rtx_syn_ack(sk, req, NULL); + return NULL; + } + + /* Further reproduces section "SEGMENT ARRIVES" + for state SYN-RECEIVED of RFC793. + It is broken, however, it does not work only + when SYNs are crossed, which is impossible in our + case. + + But generally, we should (RFC lies!) to accept ACK + from SYNACK both here and in tcp_rcv_state_process(). + tcp_rcv_state_process() does not, hence, we do not too. + + Note that the case is absolutely generic: + we cannot optimize anything here without + violating protocol. 
All the checks must be made + before attempt to create socket. + */ + + /* RFC793: "first check sequence number". */ + + if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, + req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) { + /* Out of window: send ACK and drop. */ + if (!(flg & TCP_FLAG_RST)) + req->class->send_ack(skb, req); + if (paws_reject) + NET_INC_STATS_BH(PAWSEstabRejected); + return NULL; + } + + /* In sequence, PAWS is OK. */ + + if (ttp.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1)) + req->ts_recent = ttp.rcv_tsval; + + if (TCP_SKB_CB(skb)->seq == req->rcv_isn) { + /* Truncate SYN, it is out of window starting + at req->rcv_isn+1. */ + flg &= ~TCP_FLAG_SYN; + } + + /* RFC793: "second check the RST bit" and + * "fourth, check the SYN bit" + */ + if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) + goto embryonic_reset; + + /* RFC793: "fifth check the ACK field" */ + + if (!(flg & TCP_FLAG_ACK)) + return NULL; + + /* Invalid ACK: reset will be sent by listening socket */ + if (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1) + return sk; + /* Also, it would be not so bad idea to check rcv_tsecr, which + * is essentially ACK extension and too early or too late values + * should cause reset in unsynchronized states. + */ + + /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ + if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) { + req->acked = 1; + return NULL; + } + + /* OK, ACK is valid, create big socket and + * feed this segment to it. It will repeat all + * the tests. THIS SEGMENT MUST MOVE SOCKET TO + * ESTABLISHED STATE. If it will be dropped after + * socket is created, wait for troubles. + */ + child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); + if (child == NULL) + goto listen_overflow; + + tcp_synq_unlink(tp, req, prev); + tcp_synq_removed(sk, req); + + tcp_acceptq_queue(sk, req, child); + return child; + +listen_overflow: + if (!sysctl_tcp_abort_on_overflow) { + req->acked = 1; + return NULL; + } + +embryonic_reset: + NET_INC_STATS_BH(EmbryonicRsts); + if (!(flg & TCP_FLAG_RST)) + req->class->send_reset(skb); + + tcp_synq_drop(sk, req, prev); + return NULL; +} + +/* + * Queue segment on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket. + */ + +int tcp_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb) +{ + int ret = 0; + int state = child->state; + + if (child->lock.users == 0) { + ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len); + + /* Wakeup parent, send SIGIO */ + if (state == TCP_SYN_RECV && child->state != state) + parent->data_ready(parent, 0); + } else { + /* Alas, it is possible again, because we do lookup + * in main socket hash table and lock on listening + * socket does not protect us more. + */ + sk_add_backlog(child, skb); + } + + bh_unlock_sock(child); + return ret; +} diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0fdb6b3f8..8dca4474b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_output.c,v 1.124 2000/04/08 07:21:24 davem Exp $ + * Version: $Id: tcp_output.c,v 1.127 2000/08/15 20:15:23 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -32,6 +32,7 @@ * David S. Miller : Output engine completely rewritten. * Andrea Arcangeli: SYNACK carry ts_recent in tsecr. 
* Cacophonix Gaul : draft-minshall-nagle-01 + * J Hadi Salim : ECN support * */ @@ -42,13 +43,29 @@ /* People can turn this off for buggy TCP's found in printers etc. */ int sysctl_tcp_retrans_collapse = 1; -static __inline__ void update_send_head(struct sock *sk) +static __inline__ +void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb) { - struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - - tp->send_head = tp->send_head->next; + tp->send_head = skb->next; if (tp->send_head == (struct sk_buff *) &sk->write_queue) tp->send_head = NULL; + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + if (tp->packets_out++ == 0) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); +} + +/* SND.NXT, if window was not shrunk. + * If window has been shrunk, what should we make? It is not clear at all. + * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-( + * Anything in between SND.UNA...SND.UNA+SND.WND also can be already + * invalid. OK, let's make this for now: + */ +static __inline__ __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_opt *tp) +{ + if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt)) + return tp->snd_nxt; + else + return tp->snd_una+tp->snd_wnd; } /* Calculate mss to advertise in SYN segment. @@ -79,15 +96,38 @@ static __u16 tcp_advertise_mss(struct sock *sk) return (__u16)mss; } +/* RFC2861. Reset CWND after idle period longer RTO to "restart window". + * This is the first part of cwnd validation mechanism. */ +static void tcp_cwnd_restart(struct tcp_opt *tp) +{ + s32 delta = tcp_time_stamp - tp->lsndtime; + u32 restart_cwnd = tcp_init_cwnd(tp); + u32 cwnd = tp->snd_cwnd; + + tp->snd_ssthresh = tcp_current_ssthresh(tp); + restart_cwnd = min(restart_cwnd, cwnd); + + while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd) + cwnd >>= 1; + tp->snd_cwnd = max(cwnd, restart_cwnd); + tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_used = 0; +} + static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb) { - /* If we had a reply for ato after last received + u32 now = tcp_time_stamp; + + if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto) + tcp_cwnd_restart(tp); + + tp->lsndtime = now; + + /* If it is a reply for ato after last received * packet, enter pingpong mode. */ - if ((u32)(tp->lsndtime - tp->ack.lrcvtime) < tp->ack.ato) + if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato) tp->ack.pingpong = 1; - - tp->lsndtime = tcp_time_stamp; } static __inline__ void tcp_event_ack_sent(struct sock *sk) @@ -95,11 +135,56 @@ static __inline__ void tcp_event_ack_sent(struct sock *sk) struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); tcp_dec_quickack_mode(tp); - tp->ack.pending = 0; - tp->ack.rcv_segs = 0; tcp_clear_xmit_timer(sk, TCP_TIME_DACK); } +/* Chose a new window to advertise, update state in tcp_opt for the + * socket, and return result with RFC1323 scaling applied. The return + * value can be stuffed directly into th->window for an outgoing + * frame. + */ +static __inline__ u16 tcp_select_window(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + u32 cur_win = tcp_receive_window(tp); + u32 new_win = __tcp_select_window(sk); + + /* Never shrink the offered window */ + if(new_win < cur_win) { + /* Danger Will Robinson! + * Don't update rcv_wup/rcv_wnd here or else + * we will not be able to advertise a zero + * window in time. --DaveM + * + * Relax Will Robinson. 
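tcp_cwnd_restart() implements the RFC 2861 rule that a sender coming back from idle must not burst at its old congestion window: the window is halved once per RTO of idle time, but never below the restart window. A minimal sketch of that arithmetic, assuming millisecond units and illustrative values:

#include <stdint.h>
#include <stdio.h>

/* Halve cwnd once for every full RTO of idle time, never going below the
 * restart (initial) window.  Mirrors the shape of the RFC 2861 rule only,
 * not the kernel's ssthresh bookkeeping around it. */
static uint32_t cwnd_after_idle(uint32_t cwnd, uint32_t restart_cwnd,
                                int32_t idle, int32_t rto)
{
    if (restart_cwnd > cwnd)
        restart_cwnd = cwnd;
    while ((idle -= rto) > 0 && cwnd > restart_cwnd)
        cwnd >>= 1;
    return cwnd > restart_cwnd ? cwnd : restart_cwnd;
}

int main(void)
{
    /* cwnd of 32 segments, restart window 2, idle for 5 RTOs -> 32/2^4 = 2 */
    printf("%u\n", (unsigned)cwnd_after_idle(32, 2, 5 * 200, 200));
    return 0;
}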
+ */ + new_win = cur_win; + } + tp->rcv_wnd = new_win; + tp->rcv_wup = tp->rcv_nxt; + + /* RFC1323 scaling applied */ + new_win >>= tp->rcv_wscale; + +#ifdef TCP_FORMAL_WINDOW + if (new_win == 0) { + /* If we advertise zero window, disable fast path. */ + tp->pred_flags = 0; + } else if (cur_win == 0 && tp->pred_flags == 0 && + skb_queue_len(&tp->out_of_order_queue) == 0 && + !tp->urg_data) { + /* If we open zero window, enable fast path. + Without this it will be open by the first data packet, + it is too late to merge checksumming to copy. + */ + tcp_fast_path_on(tp); + } +#endif + + return new_win; +} + + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. @@ -141,12 +226,12 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; } - } else if (tp->num_sacks) { + } else if (tp->eff_sacks) { /* A SACK is 2 pad bytes, a 2 byte header, plus * 2 32-bit sequence numbers for each SACK block. */ tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + - (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); + (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK)); } th = (struct tcphdr *) skb_push(skb, tcp_header_size); skb->h.th = th; @@ -155,7 +240,7 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) /* Build TCP header and checksum it. */ th->source = sk->sport; th->dest = sk->dport; - th->seq = htonl(TCP_SKB_CB(skb)->seq); + th->seq = htonl(tcb->seq); th->ack_seq = htonl(tp->rcv_nxt); *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags); if (tcb->flags & TCPCB_FLAG_SYN) { @@ -176,11 +261,13 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) (sysctl_flags & SYSCTL_FLAG_SACK), (sysctl_flags & SYSCTL_FLAG_WSCALE), tp->rcv_wscale, - TCP_SKB_CB(skb)->when, + tcb->when, tp->ts_recent); } else { tcp_build_and_update_options((__u32 *)(th + 1), - tp, TCP_SKB_CB(skb)->when); + tp, tcb->when); + + TCP_ECN_send(sk, tp, skb, tcp_header_size); } tp->af_specific->send_check(sk, th, skb->len, skb); @@ -196,7 +283,7 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) if (err <= 0) return err; - tcp_enter_cong_avoid(tp); + tcp_enter_cwr(tp); /* NET_XMIT_CN is special. It does not guarantee, * that this packet is lost. It tells that device @@ -212,6 +299,7 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) #undef SYSCTL_FLAG_SACK } + /* This is the main buffer sending routine. We queue the buffer * and decide whether to queue or transmit now. * @@ -225,15 +313,15 @@ void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigne /* Advance write_seq and place onto the write_queue. */ tp->write_seq = TCP_SKB_CB(skb)->end_seq; __skb_queue_tail(&sk->write_queue, skb); + tcp_charge_skb(sk, skb); if (!force_queue && tp->send_head == NULL && tcp_snd_test(tp, skb, cur_mss, 1)) { /* Send it out now. 
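The new tcp_select_window() never withdraws window space it has already offered, records the choice, and only then applies the RFC 1323 shift so the value fits the 16-bit header field. A compact model of that selection, with invented names such as pick_window and struct rcv_state:

#include <stdint.h>
#include <stdio.h>

struct rcv_state {
    uint32_t rcv_wnd;
    uint32_t rcv_wup, rcv_nxt;
    int rcv_wscale;
};

/* Pick the window to advertise: never shrink what was already offered,
 * remember the choice, then scale it down for the 16-bit field. */
static uint16_t pick_window(struct rcv_state *s, uint32_t cur_win, uint32_t new_win)
{
    if (new_win < cur_win)
        new_win = cur_win;       /* shrinking an offered window confuses peers */
    s->rcv_wnd = new_win;
    s->rcv_wup = s->rcv_nxt;     /* the offer is made relative to rcv_nxt */
    return (uint16_t)(new_win >> s->rcv_wscale);
}

int main(void)
{
    struct rcv_state s = { 0, 0, 5000, 2 };
    /* buffer pressure suggests 8K, but 16K is already on offer: keep 16K */
    printf("%u\n", (unsigned)pick_window(&s, 16384, 8192));  /* 16384 >> 2 = 4096 */
    return 0;
}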
*/ TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)) == 0) { tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tcp_minshall_update(tp, cur_mss, skb->len); - tp->packets_out++; - if(!tcp_timer_is_set(sk, TCP_TIME_RETRANS)) + tcp_minshall_update(tp, cur_mss, skb); + if (tp->packets_out++ == 0) tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); return; } @@ -250,16 +338,16 @@ void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigne */ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) { + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; struct sk_buff *buff; int nsize = skb->len - len; u16 flags; /* Get a new skb... force flag on. */ - buff = sock_wmalloc(sk, - (nsize + MAX_TCP_HEADER + 15), - 1, GFP_ATOMIC); + buff = tcp_alloc_skb(sk, nsize + MAX_TCP_HEADER + 15, GFP_ATOMIC); if (buff == NULL) return -ENOMEM; /* We'll just try again later. */ + tcp_charge_skb(sk, buff); /* Reserve space for headers. */ skb_reserve(buff, MAX_TCP_HEADER); @@ -286,7 +374,11 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) if(!(flags & TCPCB_FLAG_URG)) TCP_SKB_CB(buff)->urg_ptr = 0; TCP_SKB_CB(buff)->flags = flags; - TCP_SKB_CB(buff)->sacked = 0; + TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_EVER_RETRANS); + if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) { + tp->lost_out++; + tp->left_out++; + } /* Copy and checksum data tail into the new buffer. */ buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize), @@ -301,11 +393,6 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) /* Looks stupid, but our code really uses when of * skbs, which it never sent before. --ANK - * - * NOTE: several days after I added this, Dave repaired - * tcp_simple_retransmit() and it should not use ->when - * of never sent skbs more. I am not sure, so that - * this line remains until more careful investigation. --ANK */ TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; @@ -401,13 +488,6 @@ int tcp_write_xmit(struct sock *sk) */ mss_now = tcp_current_mss(sk); - /* Anything on the transmit queue that fits the window can - * be added providing we are: - * - * a) following SWS avoidance [and Nagle algorithm] - * b) not exceeding our congestion window. - * c) not retransmitting [Nagle] - */ while((skb = tp->send_head) && tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb))) { if (skb->len > mss_now) { @@ -419,19 +499,13 @@ int tcp_write_xmit(struct sock *sk) if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) break; /* Advance the send_head. This one is sent out. */ - update_send_head(sk); - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tcp_minshall_update(tp, mss_now, skb->len); - tp->packets_out++; + update_send_head(sk, tp, skb); + tcp_minshall_update(tp, mss_now, skb); sent_pkts = 1; } - /* If we sent anything, make sure the retransmit - * timer is active. - */ if (sent_pkts) { - if (!tcp_timer_is_set(sk, TCP_TIME_RETRANS)) - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + tcp_cwnd_validate(sk, tp); return 0; } @@ -506,28 +580,22 @@ u32 __tcp_select_window(struct sock *sk) /* Sometimes free_space can be < 0. */ free_space = tcp_space(sk); - if (free_space > ((int) tp->window_clamp)) - free_space = tp->window_clamp; if (tp->window_clamp < mss) mss = tp->window_clamp; - if (free_space < min((int)tp->window_clamp, tcp_full_space(sk)) / 2) { - /* THIS IS _VERY_ GOOD PLACE to play window clamp. 
- * if free_space becomes suspiciously low - * verify ratio rmem_alloc/(rcv_nxt - copied_seq), - * and if we predict that when free_space will be lower mss, - * rmem_alloc will run out of rcvbuf*2, shrink window_clamp. - * It will eliminate most of prune events! Very simple, - * it is the next thing to do. --ANK - * - * Provided we found a way to raise it back... --ANK - */ + if (free_space < (int)min(tp->window_clamp, tcp_full_space(sk)) / 2) { tp->ack.quick = 0; + if (tcp_memory_pressure) + tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4*tp->advmss); + if (free_space < ((int) (mss/2))) return 0; } + if (free_space > tp->rcv_ssthresh) + free_space = tp->rcv_ssthresh; + /* Get the largest window that is a nice multiple of mss. * Window clamp already applied above. * If our current window offering is within 1 mss of the @@ -547,6 +615,7 @@ u32 __tcp_select_window(struct sock *sk) /* Attempt to collapse two adjacent SKB's during retransmission. */ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now) { + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; struct sk_buff *next_skb = skb->next; /* The first test we must make is that neither of these two @@ -564,6 +633,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED) return; + /* Next skb is out of window. */ + if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd)) + return; + /* Punt if not enough space exists in the first SKB for * the data in the second, or the total combined payload * would exceed the MSS. @@ -602,8 +675,20 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m /* All done, get rid of second SKB and account for it so * packet counting does not break. */ - kfree_skb(next_skb); - sk->tp_pinfo.af_tcp.packets_out--; + TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&TCPCB_EVER_RETRANS; + if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS) + tp->retrans_out--; + if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) { + tp->lost_out--; + tp->left_out--; + } + if (!tp->sack_ok && tp->sacked_out) { + /* Reno case is special. Sigh... */ + tp->sacked_out--; + tp->left_out--; + } + tcp_free_skb(sk, next_skb); + tp->packets_out--; } } @@ -614,53 +699,43 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m void tcp_simple_retransmit(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct sk_buff *skb, *old_next_skb; + struct sk_buff *skb; unsigned int mss = tcp_current_mss(sk); - - /* Don't muck with the congestion window here. */ - tp->dup_acks = 0; - tp->high_seq = tp->snd_nxt; - tp->retrans_head = NULL; - - /* Input control flow will see that this was retransmitted - * and not use it for RTT calculation in the absence of - * the timestamp option. - */ - for (old_next_skb = skb = skb_peek(&sk->write_queue); - ((skb != tp->send_head) && - (skb != (struct sk_buff *)&sk->write_queue)); - skb = skb->next) { - int resend_skb = 0; - - /* Our goal is to push out the packets which we - * sent already, but are being chopped up now to - * account for the PMTU information we have. - * - * As we resend the queue, packets are fragmented - * into two pieces, and when we try to send the - * second piece it may be collapsed together with - * a subsequent packet, and so on. 
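The reworked __tcp_select_window() ties the offered window to rcv_ssthresh and to global memory pressure instead of raw buffer space alone. A simplified model of the sizing policy (it omits the window_clamp interaction and the one-MSS hysteresis of the real routine):

#include <stdio.h>

/* Under memory pressure the offer is capped by rcv_ssthresh, a nearly
 * full buffer offers nothing, and the result is a whole number of MSS. */
static int offered_window(int free_space, int full_space, int mss,
                          int rcv_ssthresh, int memory_pressure)
{
    if (free_space < full_space / 2) {
        if (memory_pressure && rcv_ssthresh > 4 * mss)
            rcv_ssthresh = 4 * mss;     /* shrink slowly while memory is tight */
        if (free_space < mss / 2)
            return 0;                   /* advertise zero, let the peer probe */
    }
    if (free_space > rcv_ssthresh)
        free_space = rcv_ssthresh;
    return (free_space / mss) * mss;    /* keep the offer MSS-aligned */
}

int main(void)
{
    printf("%d\n", offered_window(30000, 65536, 1460, 17520, 0));  /* -> 17520 */
    printf("%d\n", offered_window(600, 65536, 1460, 17520, 1));    /* -> 0 */
    return 0;
}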
-DaveM - */ - if (old_next_skb != skb || skb->len > mss) - resend_skb = 1; - old_next_skb = skb->next; - if (resend_skb != 0) { - if (tcp_retransmit_skb(sk, skb)) - break; + int lost = 0; + + for_retrans_queue(skb, sk, tp) { + if (skb->len > mss && + !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { + if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out--; + } + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) { + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out++; + lost = 1; + } } } -} -static __inline__ void update_retrans_head(struct sock *sk) -{ - struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - - tp->retrans_head = tp->retrans_head->next; - if((tp->retrans_head == tp->send_head) || - (tp->retrans_head == (struct sk_buff *) &sk->write_queue)) { - tp->retrans_head = NULL; - tp->rexmt_done = 1; + if (!lost) + return; + + tp->left_out = tp->sacked_out + tp->lost_out; + + /* Don't muck with the congestion window here. + * Reason is that we do not increase amount of _data_ + * in network, but units changed and effective + * cwnd/ssthresh really reduced now. + */ + if (tp->ca_state != TCP_CA_Loss) { + tp->high_seq = tp->snd_nxt; + tp->snd_ssthresh = tcp_current_ssthresh(tp); + tp->prior_ssthresh = 0; + tp->undo_marker = 0; + tp->ca_state = TCP_CA_Loss; } + tcp_xmit_retransmit_queue(sk); } /* This retransmits one SKB. Policy decisions and retransmit queue @@ -671,18 +746,13 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); unsigned int cur_mss = tcp_current_mss(sk); + int err; -#ifdef TCP_DEBUG - /* It was possible this summer, that retransmit timer - * raced with its deletion and hit socket with packets_out==0. - * I fixed it, but preserved the check in the place, - * where the fault occured. --ANK + /* Do not sent more than we queued. 1/4 is reserved for possible + * copying overhead: frgagmentation, tunneling, mangling etc. */ - if (skb == NULL) { - printk("tcp_retransmit_skb: bug, skb==NULL, caller=%p\n", NET_CALLER(sk)); - return -EFAULT; - } -#endif + if (atomic_read(&sk->wmem_alloc) > min(sk->wmem_queued+(sk->wmem_queued>>2),sk->sndbuf)) + return -EAGAIN; if(skb->len > cur_mss) { if(tcp_fragment(sk, skb, cur_mss)) @@ -715,23 +785,40 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) skb->csum = 0; } - /* Ok, we're gonna send it out, update state. */ - TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_RETRANS; - tp->retrans_out++; - /* Make a copy, if the first transmission SKB clone we made * is still in somebody's hands, else make a clone. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - if(skb_cloned(skb)) - skb = skb_copy(skb, GFP_ATOMIC); - else - skb = skb_clone(skb, GFP_ATOMIC); - /* Update global TCP statistics and return success. */ - TCP_INC_STATS(TcpRetransSegs); + err = tcp_transmit_skb(sk, (skb_cloned(skb) ? + skb_copy(skb, GFP_ATOMIC): + skb_clone(skb, GFP_ATOMIC))); - return tcp_transmit_skb(sk, skb); + if (err == 0) { + /* Update global TCP statistics. */ + TCP_INC_STATS(TcpRetransSegs); + +#if FASTRETRANS_DEBUG > 0 + if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) { + if (net_ratelimit()) + printk(KERN_DEBUG "retrans_out leaked.\n"); + } +#endif + TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; + tp->retrans_out++; + + /* Save stamp of the first retransmit. 
*/ + if (!tp->retrans_stamp) + tp->retrans_stamp = TCP_SKB_CB(skb)->when; + + tp->undo_retrans++; + + /* snd_nxt is stored to detect loss of retransmitted segment, + * see tcp_input.c tcp_sacktag_write_queue(). + */ + TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; + } + return err; } /* This gets called after a retransmit timeout, and the initially @@ -746,71 +833,79 @@ void tcp_xmit_retransmit_queue(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; + int packet_cnt = tp->lost_out; + + /* First pass: retransmit lost packets. */ + if (packet_cnt) { + for_retrans_queue(skb, sk, tp) { + __u8 sacked = TCP_SKB_CB(skb)->sacked; + + if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) + return; + + if (sacked&TCPCB_LOST) { + if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { + if (tcp_retransmit_skb(sk, skb)) + return; + if (tp->ca_state != TCP_CA_Loss) + NET_INC_STATS_BH(TCPFastRetrans); + else + NET_INC_STATS_BH(TCPSlowStartRetrans); + + if (skb == skb_peek(&sk->write_queue)) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + } + + if (--packet_cnt <= 0) + break; + } + } + } - if (tp->retrans_head == NULL && - tp->rexmt_done == 0) - tp->retrans_head = skb_peek(&sk->write_queue); - if (tp->retrans_head == tp->send_head) - tp->retrans_head = NULL; + /* OK, demanded retransmission is finished. */ - /* Each time, advance the retrans_head if we got - * a packet out or we skipped one because it was - * SACK'd. -DaveM + /* Forward retransmissions are possible only during Recovery. */ + if (tp->ca_state != TCP_CA_Recovery) + return; + + /* No forward retransmissions in Reno are possible. */ + if (!tp->sack_ok) + return; + + /* Yeah, we have to make difficult choice between forward transmission + * and retransmission... Both ways have their merits... + * + * For now we do not retrnamsit anything, while we have some new + * segments to send. */ - while ((skb = tp->retrans_head) != NULL) { - /* If it has been ack'd by a SACK block, we don't - * retransmit it. - */ - if(!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { - /* Send it out, punt if error occurred. */ - if(tcp_retransmit_skb(sk, skb)) - break; - update_retrans_head(sk); - - /* Stop retransmitting if we've hit the congestion - * window limit. - */ - if (tp->retrans_out >= tp->snd_cwnd) - break; - } else { - update_retrans_head(sk); - } - } -} + if (tcp_may_send_now(sk, tp)) + return; -/* Using FACK information, retransmit all missing frames at the receiver - * up to the forward most SACK'd packet (tp->fackets_out) if the packet - * has not been retransmitted already. - */ -void tcp_fack_retransmit(struct sock *sk) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct sk_buff *skb = skb_peek(&sk->write_queue); - int packet_cnt = 0; + packet_cnt = 0; + + for_retrans_queue(skb, sk, tp) { + if(++packet_cnt > tp->fackets_out) + break; - while((skb != NULL) && - (skb != tp->send_head) && - (skb != (struct sk_buff *)&sk->write_queue)) { - __u8 sacked = TCP_SKB_CB(skb)->sacked; + if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) + break; - if(sacked & (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS)) - goto next_packet; + if(TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) + continue; /* Ok, retransmit it. 
*/ if(tcp_retransmit_skb(sk, skb)) break; - if(tcp_packets_in_flight(tp) >= tp->snd_cwnd) - break; -next_packet: - packet_cnt++; - if(packet_cnt >= tp->fackets_out) - break; - skb = skb->next; + if (skb == skb_peek(&sk->write_queue)) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + + NET_INC_STATS_BH(TCPForwardRetrans); } } + /* Send a fin. The caller locks the socket for us. This cannot be * allowed to fail queueing a FIN frame under any circumstances. */ @@ -839,30 +934,19 @@ void tcp_send_fin(struct sock *sk) /* Special case to avoid Nagle bogosity. If this * segment is the last segment, and it was queued * due to Nagle/SWS-avoidance, send it out now. - * - * Hmm... actually it overrides also congestion - * avoidance (OK for FIN) and retransmit phase - * (not OK? Added.). */ if(tp->send_head == skb && - !after(tp->write_seq, tp->snd_una + tp->snd_wnd) && - !tp->retransmits) { + !after(tp->write_seq, tp->snd_una + tp->snd_wnd)) { TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (!tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) { - update_send_head(sk); - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tp->packets_out++; - if(!tcp_timer_is_set(sk, TCP_TIME_RETRANS)) - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); - } else + if (!tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL))) + update_send_head(sk, tp, skb); + else tcp_check_probe_timer(sk, tp); } } else { /* Socket is locked, keep trying until memory is available. */ for (;;) { - skb = sock_wmalloc(sk, - MAX_TCP_HEADER + 15, - 1, GFP_KERNEL); + skb = alloc_skb(MAX_TCP_HEADER + 15, GFP_KERNEL); if (skb) break; current->policy |= SCHED_YIELD; @@ -896,8 +980,10 @@ void tcp_send_active_reset(struct sock *sk, int priority) /* NOTE: No TCP options attached and we never retransmit this. */ skb = alloc_skb(MAX_TCP_HEADER + 15, priority); - if (!skb) + if (!skb) { + NET_INC_STATS(TCPAbortFailed); return; + } /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); @@ -907,10 +993,11 @@ void tcp_send_active_reset(struct sock *sk, int priority) TCP_SKB_CB(skb)->urg_ptr = 0; /* Send it off. */ - TCP_SKB_CB(skb)->seq = tp->snd_nxt; + TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_transmit_skb(sk, skb); + if (tcp_transmit_skb(sk, skb)) + NET_INC_STATS(TCPAbortFailed); } /* WARNING: This routine must only be called when we have already sent @@ -920,27 +1007,29 @@ void tcp_send_active_reset(struct sock *sk, int priority) */ int tcp_send_synack(struct sock *sk) { - struct tcp_opt* tp = &(sk->tp_pinfo.af_tcp); - struct sk_buff* skb; + struct sk_buff* skb; - skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, - 1, GFP_ATOMIC); - if (skb == NULL) - return -ENOMEM; - - /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, MAX_TCP_HEADER); - skb->csum = 0; - TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_SYN); - TCP_SKB_CB(skb)->sacked = 0; - TCP_SKB_CB(skb)->urg_ptr = 0; + skb = skb_peek(&sk->write_queue); + if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) { + printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n"); + return -EFAULT; + } + if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) { + if (skb_cloned(skb)) { + struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); + if (nskb == NULL) + return -ENOMEM; + __skb_unlink(skb, &sk->write_queue); + __skb_queue_head(&sk->write_queue, nskb); + tcp_free_skb(sk, skb); + tcp_charge_skb(sk, nskb); + skb = nskb; + } - /* SYN eats a sequence byte. 
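The rewritten tcp_xmit_retransmit_queue() walks the queue in two passes: first the segments explicitly marked lost, then, only during SACK recovery, forward retransmissions of untagged segments, always bounded by the congestion window. A shape-only model over an array of tag words; it counts decisions and does not touch timers or real skbs:

#include <stdio.h>

enum { TAG_LOST = 1, TAG_SACKED = 2, TAG_RETRANS = 4 };

struct seg { unsigned tags; };

static int retransmit_pass(struct seg *q, int n, int in_flight, int cwnd,
                           int forward_ok, int fackets_out)
{
    int i, sent = 0;

    /* First pass: everything marked lost and not yet SACKed or resent. */
    for (i = 0; i < n && in_flight + sent < cwnd; i++)
        if ((q[i].tags & TAG_LOST) && !(q[i].tags & (TAG_SACKED | TAG_RETRANS))) {
            q[i].tags |= TAG_RETRANS;
            sent++;
        }

    if (!forward_ok)        /* forward retransmissions only in SACK recovery */
        return sent;

    /* Second pass: untagged segments up to the forward-most SACK. */
    for (i = 0; i < n && i < fackets_out && in_flight + sent < cwnd; i++)
        if (!(q[i].tags & (TAG_LOST | TAG_SACKED | TAG_RETRANS))) {
            q[i].tags |= TAG_RETRANS;
            sent++;
        }
    return sent;
}

int main(void)
{
    struct seg q[5] = { {TAG_LOST}, {0}, {TAG_SACKED}, {TAG_LOST}, {0} };
    printf("sent %d\n", retransmit_pass(q, 5, 1, 6, 1, 5));   /* sent 4 */
    return 0;
}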
*/ - TCP_SKB_CB(skb)->seq = tp->snd_una; - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; - __skb_queue_tail(&sk->write_queue, skb); + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK; + TCP_ECN_send_synack(&sk->tp_pinfo.af_tcp, skb); + } TCP_SKB_CB(skb)->when = tcp_time_stamp; - tp->packets_out++; return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); } @@ -974,6 +1063,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; + TCP_ECN_make_synack(req, th); th->source = sk->sport; th->dest = req->rmt_port; TCP_SKB_CB(skb)->seq = req->snt_isn; @@ -983,7 +1073,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ __u8 rcv_wscale; /* Set this up on the first call only */ - req->window_clamp = tp->window_clamp ? : skb->dst->window; + req->window_clamp = tp->window_clamp ? : dst->window; /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(tcp_full_space(sk), dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), @@ -1028,18 +1118,20 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff) tp->mss_clamp = tp->user_mss; tp->max_window = 0; tcp_sync_mss(sk, dst->pmtu); - tcp_initialize_rcv_mss(sk); if (!tp->window_clamp) tp->window_clamp = dst->window; tp->advmss = dst->advmss; + tcp_initialize_rcv_mss(sk); tcp_select_initial_window(tcp_full_space(sk), - tp->advmss - (tp->tcp_header_len - sizeof(struct tcphdr)), - &tp->rcv_wnd, - &tp->window_clamp, - sysctl_tcp_window_scaling, - &tp->rcv_wscale); + tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), + &tp->rcv_wnd, + &tp->window_clamp, + sysctl_tcp_window_scaling, + &tp->rcv_wscale); + + tp->rcv_ssthresh = tp->rcv_wnd; /* Socket identity change complete, no longer * in TCP_CLOSE, so enter ourselves into the @@ -1052,8 +1144,7 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff) sk->err = 0; sk->done = 0; tp->snd_wnd = 0; - tp->snd_wl1 = 0; - tp->snd_wl2 = tp->write_seq; + tcp_init_wl(tp, tp->write_seq, 0); tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; tp->rcv_nxt = 0; @@ -1061,23 +1152,24 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff) tp->copied_seq = 0; tp->rto = TCP_TIMEOUT_INIT; - tcp_init_xmit_timers(sk); tp->retransmits = 0; - tp->fackets_out = 0; - tp->retrans_out = 0; + tcp_clear_retrans(tp); TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; + TCP_ECN_send_syn(tp, buff); TCP_SKB_CB(buff)->sacked = 0; TCP_SKB_CB(buff)->urg_ptr = 0; buff->csum = 0; TCP_SKB_CB(buff)->seq = tp->write_seq++; TCP_SKB_CB(buff)->end_seq = tp->write_seq; tp->snd_nxt = tp->write_seq; + tp->pushed_seq = tp->write_seq; /* Send it off. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; - tp->syn_stamp = TCP_SKB_CB(buff)->when; + tp->retrans_stamp = TCP_SKB_CB(buff)->when; __skb_queue_tail(&sk->write_queue, buff); + tcp_charge_skb(sk, buff); tp->packets_out++; tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); TCP_INC_STATS(TcpActiveOpens); @@ -1099,20 +1191,27 @@ err_out: void tcp_send_delayed_ack(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - long ato = tp->ack.ato; + int ato = tp->ack.ato; unsigned long timeout; if (ato > TCP_DELACK_MIN) { - int max_ato; + int max_ato = HZ/2; + + if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED)) + max_ato = TCP_DELACK_MAX; + + /* Slow path, intersegment interval is "high". */ /* If some rtt estimate is known, use it to bound delayed ack. 
* Do not use tp->rto here, use results of rtt measurements * directly. */ - if (tp->srtt) - max_ato = (tp->srtt >> 3) + tp->mdev; - else - max_ato = TCP_DELACK_MAX; + if (tp->srtt) { + int rtt = max(tp->srtt>>3, TCP_DELACK_MIN); + + if (rtt < max_ato) + max_ato = rtt; + } ato = min(ato, max_ato); } @@ -1121,20 +1220,20 @@ void tcp_send_delayed_ack(struct sock *sk) timeout = jiffies + ato; /* Use new timeout only if there wasn't a older one earlier. */ - if (timer_pending(&tp->delack_timer)) { - unsigned long old_timeout = tp->delack_timer.expires; - + if (tp->ack.pending&TCP_ACK_TIMER) { /* If delack timer was blocked or is about to expire, * send ACK now. */ - if (tp->ack.blocked || time_before_eq(old_timeout, jiffies+(ato>>2))) { + if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) { tcp_send_ack(sk); return; } - if (!time_before(timeout, old_timeout)) - timeout = old_timeout; + if (!time_before(timeout, tp->ack.timeout)) + timeout = tp->ack.timeout; } + tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER; + tp->ack.timeout = timeout; if (!mod_timer(&tp->delack_timer, timeout)) sock_hold(sk); @@ -1170,8 +1269,8 @@ void tcp_send_ack(struct sock *sk) */ buff = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC); if (buff == NULL) { - tp->ack.pending = 1; - tp->ack.ato = TCP_ATO_MAX; + tcp_schedule_ack(tp); + tp->ack.ato = TCP_ATO_MIN; tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); return; } @@ -1184,7 +1283,7 @@ void tcp_send_ack(struct sock *sk) TCP_SKB_CB(buff)->urg_ptr = 0; /* Send it off, this clears delayed acks for us. */ - TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tp->snd_nxt; + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); TCP_SKB_CB(buff)->when = tcp_time_stamp; tcp_transmit_skb(sk, buff); } @@ -1193,66 +1292,68 @@ void tcp_send_ack(struct sock *sk) /* This routine sends a packet with an out of date sequence * number. It assumes the other end will try to ack it. */ +static int tcp_xmit_probe_skb(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb; + + /* We don't queue it, tcp_transmit_skb() sets ownership. */ + skb = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC); + if (skb == NULL) + return -1; + + /* Reserve space for headers and set control bits. */ + skb_reserve(skb, MAX_TCP_HEADER); + skb->csum = 0; + TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; + TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->urg_ptr = 0; + + /* Use a previous sequence. This should cause the other + * end to send an ack. Don't queue or clone SKB, just + * send it. + */ + TCP_SKB_CB(skb)->seq = tp->snd_una - 1; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; + TCP_SKB_CB(skb)->when = tcp_time_stamp; + return tcp_transmit_skb(sk, skb); +} + int tcp_write_wakeup(struct sock *sk) { if (sk->state != TCP_CLOSE) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; - /* Now this function is never called, while - * we have something not ACKed in queue. 
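tcp_send_delayed_ack() bounds the delayed-ACK timeout by the measured RTT when one is available, and by a hard ceiling otherwise, so a fast peer never waits the full delayed-ACK maximum. A sketch of that bounding in milliseconds; the constants are stand-ins, not the kernel's:

#include <stdio.h>

#define DELACK_MIN  40      /* illustrative stand-ins for the kernel constants */
#define DELACK_MAX  200

/* Bound the delayed-ACK timeout by measured RTT and by a ceiling,
 * as tcp_send_delayed_ack() does; all times in milliseconds. */
static int delack_timeout(int ato, int srtt_ms, int pingpong)
{
    int max_ato = pingpong ? DELACK_MAX : 500;

    if (srtt_ms > 0) {
        int rtt = srtt_ms > DELACK_MIN ? srtt_ms : DELACK_MIN;
        if (rtt < max_ato)
            max_ato = rtt;
    }
    return ato < max_ato ? ato : max_ato;
}

int main(void)
{
    printf("%d\n", delack_timeout(120, 60, 0));   /* RTT-bound: 60 */
    printf("%d\n", delack_timeout(30, 60, 1));    /* ato already small: 30 */
    return 0;
}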
- */ - BUG_TRAP(tp->snd_una == tp->snd_nxt); - - if (tp->snd_wnd > (tp->snd_nxt-tp->snd_una) - && ((skb = tp->send_head) != NULL)) { + if ((skb = tp->send_head) != NULL && + before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) { int err; - unsigned long win_size; + int mss = tcp_current_mss(sk); + int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq; + + if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) + tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; /* We are probing the opening of a window * but the window size is != 0 * must have been a result SWS avoidance ( sender ) */ - win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una); - if (win_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) { - if (tcp_fragment(sk, skb, win_size)) + if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || + skb->len > mss) { + seg_size = min(seg_size, mss); + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + if (tcp_fragment(sk, skb, seg_size)) return -1; } + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; TCP_SKB_CB(skb)->when = tcp_time_stamp; err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); if (!err) { - update_send_head(sk); - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tp->packets_out++; - if (!tcp_timer_is_set(sk, TCP_TIME_RETRANS)) - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + update_send_head(sk, tp, skb); } return err; } else { - /* We don't queue it, tcp_transmit_skb() sets ownership. */ - skb = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC); - if (skb == NULL) - return -1; - - /* Reserve space for headers and set control bits. */ - skb_reserve(skb, MAX_TCP_HEADER); - skb->csum = 0; - TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; - TCP_SKB_CB(skb)->sacked = 0; - TCP_SKB_CB(skb)->urg_ptr = 0; - - /* Use a previous sequence. This should cause the other - * end to send an ack. Don't queue or clone SKB, just - * send it. - * - * RED-PEN: logically it should be snd_una-1. - * snd_nxt-1 will not be acked. snd_una==snd_nxt - * in this place however. Right? - */ - TCP_SKB_CB(skb)->seq = tp->snd_una - 1; - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; - TCP_SKB_CB(skb)->when = tcp_time_stamp; - return tcp_transmit_skb(sk, skb); + return tcp_xmit_probe_skb(sk); } } return -1; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 4ed38175b..d98376840 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_timer.c,v 1.77 2000/06/30 10:18:38 davem Exp $ + * Version: $Id: tcp_timer.c,v 1.79 2000/08/11 00:13:36 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -29,13 +29,11 @@ int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; int sysctl_tcp_retries1 = TCP_RETR1; int sysctl_tcp_retries2 = TCP_RETR2; -int sysctl_tcp_orphan_retries = TCP_ORPHAN_RETRIES; +int sysctl_tcp_orphan_retries = 0; -static void tcp_retransmit_timer(unsigned long); +static void tcp_write_timer(unsigned long); static void tcp_delack_timer(unsigned long); -static void tcp_probe_timer(unsigned long); static void tcp_keepalive_timer (unsigned long data); -static void tcp_twkill(unsigned long); const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n"; @@ -50,73 +48,35 @@ void tcp_init_xmit_timers(struct sock *sk) struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; init_timer(&tp->retransmit_timer); - tp->retransmit_timer.function=&tcp_retransmit_timer; + tp->retransmit_timer.function=&tcp_write_timer; tp->retransmit_timer.data = (unsigned long) sk; + tp->pending = 0; init_timer(&tp->delack_timer); tp->delack_timer.function=&tcp_delack_timer; tp->delack_timer.data = (unsigned long) sk; - - init_timer(&tp->probe_timer); - tp->probe_timer.function=&tcp_probe_timer; - tp->probe_timer.data = (unsigned long) sk; + tp->ack.pending = 0; init_timer(&sk->timer); sk->timer.function=&tcp_keepalive_timer; sk->timer.data = (unsigned long) sk; } -/* - * Reset the retransmission timer - */ - -void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when) -{ - struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - - switch (what) { - case TCP_TIME_RETRANS: - /* When seting the transmit timer the probe timer - * should not be set. - * The delayed ack timer can be set if we are changing the - * retransmit timer when removing acked frames. - */ - if (timer_pending(&tp->probe_timer) && del_timer(&tp->probe_timer)) - __sock_put(sk); - if (when > TCP_RTO_MAX) { - printk(KERN_DEBUG "reset_xmit_timer sk=%p when=0x%lx, caller=%p\n", sk, when, NET_CALLER(sk)); - when = TCP_RTO_MAX; - } - if (!mod_timer(&tp->retransmit_timer, jiffies+when)) - sock_hold(sk); - break; - - case TCP_TIME_DACK: - if (!mod_timer(&tp->delack_timer, jiffies+when)) - sock_hold(sk); - break; - - case TCP_TIME_PROBE0: - if (!mod_timer(&tp->probe_timer, jiffies+when)) - sock_hold(sk); - break; - - default: - printk(KERN_DEBUG "bug: unknown timer value\n"); - }; -} - void tcp_clear_xmit_timers(struct sock *sk) -{ +{ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - if(timer_pending(&tp->retransmit_timer) && del_timer(&tp->retransmit_timer)) - __sock_put(sk); - if(timer_pending(&tp->delack_timer) && del_timer(&tp->delack_timer)) + tp->pending = 0; + if (timer_pending(&tp->retransmit_timer) && + del_timer(&tp->retransmit_timer)) __sock_put(sk); + + tp->ack.pending = 0; tp->ack.blocked = 0; - if(timer_pending(&tp->probe_timer) && del_timer(&tp->probe_timer)) + if (timer_pending(&tp->delack_timer) && + del_timer(&tp->delack_timer)) __sock_put(sk); + if(timer_pending(&sk->timer) && del_timer(&sk->timer)) __sock_put(sk); } @@ -127,6 +87,7 @@ static void tcp_write_err(struct sock *sk) sk->error_report(sk); tcp_done(sk); + NET_INC_STATS_BH(TCPAbortOnTimeout); } /* Do not allow orphaned sockets to eat all our resources. @@ -138,26 +99,60 @@ static void tcp_write_err(struct sock *sk) * We kill the socket, if: * 1. If number of orphaned sockets exceeds an administratively configured * limit. - * 2. Under pessimistic assumption that all the orphans eat memory not - * less than this one, total consumed memory exceeds all - * the available memory. 
+ * 2. If we have strong memory pressure. */ static int tcp_out_of_resources(struct sock *sk, int do_reset) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int orphans = atomic_read(&tcp_orphan_count); + /* If peer does not open window for long time, or did not transmit + * anything for long time, penalize it. */ + if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) + orphans <<= 1; + + /* If some dubious ICMP arrived, penalize even more. */ + if (sk->err_soft) + orphans <<= 1; + if (orphans >= sysctl_tcp_max_orphans || - ((orphans*atomic_read(&sk->wmem_alloc))>>PAGE_SHIFT) >= num_physpages) { + (sk->wmem_queued > SOCK_MIN_SNDBUF && + atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { if (net_ratelimit()) printk(KERN_INFO "Out of socket memory\n"); + + /* Catch exceptional cases, when connection requires reset. + * 1. Last segment was sent recently. */ + if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || + /* 2. Window is closed. */ + (!tp->snd_wnd && !tp->packets_out)) + do_reset = 1; if (do_reset) tcp_send_active_reset(sk, GFP_ATOMIC); tcp_done(sk); + NET_INC_STATS_BH(TCPAbortOnMemory); return 1; } return 0; } +/* Calculate maximal number or retries on an orphaned socket. */ +static int tcp_orphan_retries(struct sock *sk, int alive) +{ + int retries = sysctl_tcp_orphan_retries; /* May be zero. */ + + /* We know from an ICMP that something is wrong. */ + if (sk->err_soft && !alive) + retries = 0; + + /* However, if socket sent something recently, select some safe + * number of retries. 8 corresponds to >100 seconds with minimal + * RTO of 200msec. */ + if (retries == 0 && alive) + retries = 8; + return retries; +} + /* A write timeout has occurred. Process the after effects. */ static int tcp_write_timeout(struct sock *sk) { @@ -195,10 +190,12 @@ static int tcp_write_timeout(struct sock *sk) retry_until = sysctl_tcp_retries2; if (sk->dead) { - if (tcp_out_of_resources(sk, tp->retransmits < retry_until)) - return 1; + int alive = (tp->rto < TCP_RTO_MAX); + + retry_until = tcp_orphan_retries(sk, alive); - retry_until = sysctl_tcp_orphan_retries; + if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until)) + return 1; } } @@ -220,14 +217,38 @@ static void tcp_delack_timer(unsigned long data) /* Try again later. */ tp->ack.blocked = 1; NET_INC_STATS_BH(DelayedACKLocked); - tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MIN); + if (!mod_timer(&tp->delack_timer, jiffies + TCP_DELACK_MIN)) + sock_hold(sk); goto out_unlock; } - if (tp->ack.pending) { + tcp_mem_reclaim(sk); + + if (sk->state == TCP_CLOSE || !(tp->ack.pending&TCP_ACK_TIMER)) + goto out; + + if ((long)(tp->ack.timeout - jiffies) > 0) { + if (!mod_timer(&tp->delack_timer, tp->ack.timeout)) + sock_hold(sk); + goto out; + } + tp->ack.pending &= ~TCP_ACK_TIMER; + + if (skb_queue_len(&tp->ucopy.prequeue)) { + struct sk_buff *skb; + + net_statistics[smp_processor_id()*2].TCPSchedulerFailed += skb_queue_len(&tp->ucopy.prequeue); + + while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) + sk->backlog_rcv(sk, skb); + + tp->ucopy.memory = 0; + } + + if (tcp_ack_scheduled(tp)) { if (!tp->ack.pingpong) { /* Delayed ACK missed: inflate ATO. */ - tp->ack.ato = min(tp->ack.ato<<1, TCP_ATO_MAX); + tp->ack.ato = min(tp->ack.ato<<1, tp->rto); } else { /* Delayed ACK missed: leave pingpong mode and * deflate ATO. 
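tcp_orphan_retries() above picks how long an orphaned socket may keep probing: zero immediately if ICMP already reported a dead path, otherwise the sysctl value, with a safe default of 8 when the sysctl is zero but the path looks alive. The same decision as a standalone function:

#include <stdio.h>

/* Retry budget for an orphaned (closed but unfinished) socket. */
static int orphan_retries(int sysctl_retries, int alive, int soft_error)
{
    int retries = sysctl_retries;      /* may legitimately be zero */

    if (soft_error && !alive)
        return 0;                      /* ICMP already told us it is hopeless */
    if (retries == 0 && alive)
        retries = 8;                   /* >100 s at the minimum 200 ms RTO */
    return retries;
}

int main(void)
{
    printf("%d %d %d\n",
           orphan_retries(0, 1, 0),    /* default sysctl, path alive -> 8 */
           orphan_retries(0, 0, 1),    /* dead path + soft error     -> 0 */
           orphan_retries(3, 1, 0));   /* admin override             -> 3 */
    return 0;
}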
@@ -240,30 +261,22 @@ static void tcp_delack_timer(unsigned long data) } TCP_CHECK_TIMER(sk); +out: + if (tcp_memory_pressure) + tcp_mem_reclaim(sk); out_unlock: bh_unlock_sock(sk); sock_put(sk); } -static void tcp_probe_timer(unsigned long data) +static void tcp_probe_timer(struct sock *sk) { - struct sock *sk = (struct sock*)data; struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; int max_probes; - bh_lock_sock(sk); - if (sk->lock.users) { - /* Try again later. */ - tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, HZ/5); - goto out_unlock; - } - - if (sk->state == TCP_CLOSE) - goto out_unlock; - if (tp->packets_out || !tp->send_head) { tp->probes_out = 0; - goto out_unlock; + return; } /* *WARNING* RFC 1122 forbids this @@ -284,10 +297,12 @@ static void tcp_probe_timer(unsigned long data) max_probes = sysctl_tcp_retries2; if (sk->dead) { - if (tcp_out_of_resources(sk, tp->probes_out <= max_probes)) - goto out_unlock; + int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX); + + max_probes = tcp_orphan_retries(sk, alive); - max_probes = sysctl_tcp_orphan_retries; + if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes)) + return; } if (tp->probes_out > max_probes) { @@ -295,284 +310,47 @@ static void tcp_probe_timer(unsigned long data) } else { /* Only send another probe if we didn't close things up. */ tcp_send_probe0(sk); - TCP_CHECK_TIMER(sk); } -out_unlock: - bh_unlock_sock(sk); - sock_put(sk); } - -/* Kill off TIME_WAIT sockets once their lifetime has expired. */ -static int tcp_tw_death_row_slot = 0; -int tcp_tw_count = 0; - -static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS]; -static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED; -static struct timer_list tcp_tw_timer = { function: tcp_twkill }; - -static void SMP_TIMER_NAME(tcp_twkill)(unsigned long dummy) -{ - struct tcp_tw_bucket *tw; - int killed = 0; - - /* NOTE: compare this to previous version where lock - * was released after detaching chain. It was racy, - * because tw buckets are scheduled in not serialized context - * in 2.3 (with netfilter), and with softnet it is common, because - * soft irqs are not sequenced. - */ - spin_lock(&tw_death_lock); - - if (tcp_tw_count == 0) - goto out; - - while((tw = tcp_tw_death_row[tcp_tw_death_row_slot]) != NULL) { - tcp_tw_death_row[tcp_tw_death_row_slot] = tw->next_death; - tw->pprev_death = NULL; - spin_unlock(&tw_death_lock); - - tcp_timewait_kill(tw); - tcp_tw_put(tw); - - killed++; - - spin_lock(&tw_death_lock); - } - tcp_tw_death_row_slot = - ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); - - if ((tcp_tw_count -= killed) != 0) - mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); - net_statistics[smp_processor_id()*2].TimeWaited += killed; -out: - spin_unlock(&tw_death_lock); -} - -SMP_TIMER_DEFINE(tcp_twkill, tcp_twkill_task); - -/* These are always called from BH context. See callers in - * tcp_input.c to verify this. - */ - -/* This is for handling early-kills of TIME_WAIT sockets. 
*/ -void tcp_tw_deschedule(struct tcp_tw_bucket *tw) -{ - spin_lock(&tw_death_lock); - if (tw->pprev_death) { - if(tw->next_death) - tw->next_death->pprev_death = tw->pprev_death; - *tw->pprev_death = tw->next_death; - tw->pprev_death = NULL; - tcp_tw_put(tw); - if (--tcp_tw_count == 0) - del_timer(&tcp_tw_timer); - } - spin_unlock(&tw_death_lock); -} - -/* Short-time timewait calendar */ - -static int tcp_twcal_hand = -1; -static int tcp_twcal_jiffie; -static void tcp_twcal_tick(unsigned long); -static struct timer_list tcp_twcal_timer = {function: tcp_twcal_tick}; -static struct tcp_tw_bucket *tcp_twcal_row[TCP_TW_RECYCLE_SLOTS]; - -void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo) -{ - struct tcp_tw_bucket **tpp; - int slot; - - /* timeout := RTO * 3.5 - * - * 3.5 = 1+2+0.5 to wait for two retransmits. - * - * RATIONALE: if FIN arrived and we entered TIME-WAIT state, - * our ACK acking that FIN can be lost. If N subsequent retransmitted - * FINs (or previous seqments) are lost (probability of such event - * is p^(N+1), where p is probability to lose single packet and - * time to detect the loss is about RTO*(2^N - 1) with exponential - * backoff). Normal timewait length is calculated so, that we - * waited at least for one retransmitted FIN (maximal RTO is 120sec). - * [ BTW Linux. following BSD, violates this requirement waiting - * only for 60sec, we should wait at least for 240 secs. - * Well, 240 consumes too much of resources 8) - * ] - * This interval is not reduced to catch old duplicate and - * responces to our wandering segments living for two MSLs. - * However, if we use PAWS to detect - * old duplicates, we can reduce the interval to bounds required - * by RTO, rather than MSL. So, if peer understands PAWS, we - * kill tw bucket after 3.5*RTO (it is important that this number - * is greater than TS tick!) and detect old duplicates with help - * of PAWS. 
- */ - slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK; - - spin_lock(&tw_death_lock); - - /* Unlink it, if it was scheduled */ - if (tw->pprev_death) { - if(tw->next_death) - tw->next_death->pprev_death = tw->pprev_death; - *tw->pprev_death = tw->next_death; - tw->pprev_death = NULL; - tcp_tw_count--; - } else - atomic_inc(&tw->refcnt); - - if (slot >= TCP_TW_RECYCLE_SLOTS) { - /* Schedule to slow timer */ - if (timeo >= TCP_TIMEWAIT_LEN) { - slot = TCP_TWKILL_SLOTS-1; - } else { - slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD; - if (slot >= TCP_TWKILL_SLOTS) - slot = TCP_TWKILL_SLOTS-1; - } - tw->ttd = jiffies + timeo; - slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1); - tpp = &tcp_tw_death_row[slot]; - } else { - tw->ttd = jiffies + (slot<<TCP_TW_RECYCLE_TICK); - - if (tcp_twcal_hand < 0) { - tcp_twcal_hand = 0; - tcp_twcal_jiffie = jiffies; - tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK); - add_timer(&tcp_twcal_timer); - } else { - if ((long)(tcp_twcal_timer.expires - jiffies) > (slot<<TCP_TW_RECYCLE_TICK)) - mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK)); - slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1); - } - tpp = &tcp_twcal_row[slot]; - } - - if((tw->next_death = *tpp) != NULL) - (*tpp)->pprev_death = &tw->next_death; - *tpp = tw; - tw->pprev_death = tpp; - - if (tcp_tw_count++ == 0) - mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); - spin_unlock(&tw_death_lock); -} - -void SMP_TIMER_NAME(tcp_twcal_tick)(unsigned long dummy) -{ - int n, slot; - unsigned long j; - unsigned long now = jiffies; - int killed = 0; - int adv = 0; - - spin_lock(&tw_death_lock); - if (tcp_twcal_hand < 0) - goto out; - - slot = tcp_twcal_hand; - j = tcp_twcal_jiffie; - - for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) { - if ((long)(j - now) <= 0) { - struct tcp_tw_bucket *tw; - - while((tw = tcp_twcal_row[slot]) != NULL) { - tcp_twcal_row[slot] = tw->next_death; - tw->pprev_death = NULL; - - tcp_timewait_kill(tw); - tcp_tw_put(tw); - killed++; - } - } else { - if (!adv) { - adv = 1; - tcp_twcal_jiffie = j; - tcp_twcal_hand = slot; - } - - if (tcp_twcal_row[slot] != NULL) { - mod_timer(&tcp_twcal_timer, j); - goto out; - } - } - j += (1<<TCP_TW_RECYCLE_TICK); - slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1); - } - tcp_twcal_hand = -1; - -out: - if ((tcp_tw_count -= killed) == 0) - del_timer(&tcp_tw_timer); - net_statistics[smp_processor_id()*2].TimeWaitKilled += killed; - spin_unlock(&tw_death_lock); -} - -SMP_TIMER_DEFINE(tcp_twcal_tick, tcp_twcal_tasklet); - /* * The TCP retransmit timer. */ -static void tcp_retransmit_timer(unsigned long data) +static void tcp_retransmit_timer(struct sock *sk) { - struct sock *sk = (struct sock*)data; struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - bh_lock_sock(sk); - if (sk->lock.users) { - /* Try again later */ - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, HZ/20); - goto out_unlock; - } - - if (sk->state == TCP_CLOSE || tp->packets_out == 0) - goto out_unlock; + if (tp->packets_out == 0) + goto out; BUG_TRAP(!skb_queue_empty(&sk->write_queue)); if (tcp_write_timeout(sk)) - goto out_unlock; + goto out; - /* RFC 2018, clear all 'sacked' flags in retransmission queue, - * the sender may have dropped out of order frames and we must - * send them out should this timer fire on us. 
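The TIME-WAIT scheduling code being moved out of tcp_timer.c spreads buckets over two timer wheels: short, PAWS-recycled lifetimes of about 3.5*RTO land on a fine-grained wheel, full-length 2*MSL lifetimes on a coarse one. A sketch of the slot computation with illustrative wheel parameters:

#include <stdio.h>

#define RECYCLE_TICK   7                /* fine wheel granularity of 2^7 jiffies */
#define RECYCLE_SLOTS  32
#define TWKILL_PERIOD  (60 * 100 / 8)   /* coarse wheel period in jiffies */
#define TWKILL_SLOTS   8

/* Map a TIME-WAIT lifetime (in jiffies) onto one of the two wheels. */
static void tw_slot(long timeo, int *fine, int *slot)
{
    int s = (timeo + (1 << RECYCLE_TICK) - 1) >> RECYCLE_TICK;  /* round up */

    if (s < RECYCLE_SLOTS) {
        *fine = 1;
        *slot = s;
    } else {
        *fine = 0;
        s = (timeo + TWKILL_PERIOD - 1) / TWKILL_PERIOD;
        *slot = s < TWKILL_SLOTS ? s : TWKILL_SLOTS - 1;
    }
}

int main(void)
{
    int fine, slot;
    tw_slot((7 * 150) / 2, &fine, &slot);   /* ~3.5 * an RTO of 150 jiffies */
    printf("fine=%d slot=%d\n", fine, slot);
    tw_slot(60 * 100, &fine, &slot);        /* full 60 s TIME-WAIT at HZ=100 */
    printf("fine=%d slot=%d\n", fine, slot);
    return 0;
}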
- */ - if(tp->sack_ok) { - struct sk_buff *skb = skb_peek(&sk->write_queue); - - while((skb != NULL) && - (skb != tp->send_head) && - (skb != (struct sk_buff *)&sk->write_queue)) { - TCP_SKB_CB(skb)->sacked &= - ~(TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS); - skb = skb->next; + if (tp->retransmits == 0) { + if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) { + if (tp->sack_ok) { + if (tp->ca_state == TCP_CA_Recovery) + NET_INC_STATS_BH(TCPSackRecoveryFail); + else + NET_INC_STATS_BH(TCPSackFailures); + } else { + if (tp->ca_state == TCP_CA_Recovery) + NET_INC_STATS_BH(TCPRenoRecoveryFail); + else + NET_INC_STATS_BH(TCPRenoFailures); + } + } else if (tp->ca_state == TCP_CA_Loss) { + NET_INC_STATS_BH(TCPLossFailures); + } else { + NET_INC_STATS_BH(TCPTimeouts); } } - /* Retransmission. */ - tp->retrans_head = NULL; - tp->rexmt_done = 0; - tp->fackets_out = 0; - tp->retrans_out = 0; - if (tp->retransmits == 0) { - /* Remember window where we lost: - * "one half of the current window but at least 2 segments" - * - * Here "current window" means the effective one, which - * means it must be an accurate representation of our current - * sending rate _and_ the snd_wnd. - */ - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); - tp->snd_cwnd_cnt = 0; - tp->snd_cwnd = 1; - } + tcp_enter_loss(sk, 0); - tp->dup_acks = 0; - tp->high_seq = tp->snd_nxt; if (tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)) > 0) { /* Retransmission failed because of local congestion, * do not backoff. @@ -581,8 +359,7 @@ static void tcp_retransmit_timer(unsigned long data) tp->retransmits=1; tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL)); - TCP_CHECK_TIMER(sk); - goto out_unlock; + goto out; } /* Increase the timeout each time we retransmit. Note that @@ -606,8 +383,48 @@ static void tcp_retransmit_timer(unsigned long data) tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); if (tp->retransmits > sysctl_tcp_retries1) __sk_dst_reset(sk); + +out: +} + +static void tcp_write_timer(unsigned long data) +{ + struct sock *sk = (struct sock*)data; + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + int event; + + bh_lock_sock(sk); + if (sk->lock.users) { + /* Try again later */ + if (!mod_timer(&tp->retransmit_timer, jiffies + (HZ/20))) + sock_hold(sk); + goto out_unlock; + } + + if (sk->state == TCP_CLOSE || !tp->pending) + goto out; + + if ((long)(tp->timeout - jiffies) > 0) { + if (!mod_timer(&tp->retransmit_timer, tp->timeout)) + sock_hold(sk); + goto out; + } + + event = tp->pending; + tp->pending = 0; + + switch (event) { + case TCP_TIME_RETRANS: + tcp_retransmit_timer(sk); + break; + case TCP_TIME_PROBE0: + tcp_probe_timer(sk); + break; + } TCP_CHECK_TIMER(sk); +out: + tcp_mem_reclaim(sk); out_unlock: bh_unlock_sock(sk); sock_put(sk); @@ -794,6 +611,7 @@ static void tcp_keepalive_timer (unsigned long data) } TCP_CHECK_TIMER(sk); + tcp_mem_reclaim(sk); resched: tcp_reset_keepalive_timer (sk, elapsed); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index dec2a6126..59afc3cee 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -5,7 +5,7 @@ * * The User Datagram Protocol (UDP). * - * Version: $Id: udp.c,v 1.84 2000/07/08 00:20:43 davem Exp $ + * Version: $Id: udp.c,v 1.85 2000/08/09 11:59:04 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -997,8 +997,8 @@ static void get_udp_sock(struct sock *sp, char *tmpbuf, int i) i, src, srcp, dest, destp, sp->state, atomic_read(&sp->wmem_alloc), atomic_read(&sp->rmem_alloc), timer_active, timer_expires-jiffies, 0, - sp->socket->inode->i_uid, 0, - sp->socket ? sp->socket->inode->i_ino : 0, + sock_i_uid(sp), 0, + sock_i_ino(sp), atomic_read(&sp->refcnt), sp); }
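Looking back at the tcp_timer.c changes, the retransmit and zero-window-probe timers now share one tcp_write_timer() dispatcher, and the backoff applied on each retransmit expiry is plain doubling of the RTO up to a ceiling. A standalone sketch of that backoff; millisecond units and the 120 s cap are illustrative:

#include <stdio.h>

#define RTO_MAX_MS 120000   /* illustrative ceiling, 120 s */

/* Each retransmit timer expiry doubles the timeout up to the ceiling;
 * a successful ACK elsewhere resets it. */
static int next_rto(int rto_ms)
{
    rto_ms <<= 1;
    return rto_ms > RTO_MAX_MS ? RTO_MAX_MS : rto_ms;
}

int main(void)
{
    int rto = 3000, i;                      /* a typical initial RTO of 3 s */
    for (i = 0; i < 7; i++) {
        printf("retry %d: rto=%d ms\n", i + 1, rto);
        rto = next_rto(rto);
    }
    return 0;
}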