| author | Ralf Baechle <ralf@linux-mips.org> | 2000-02-18 00:24:27 +0000 |
|---|---|---|
| committer | Ralf Baechle <ralf@linux-mips.org> | 2000-02-18 00:24:27 +0000 |
| commit | b9558d5f86c471a125abf1fb3a3882fb053b1f8c (patch) | |
| tree | 707b53ec64e740a7da87d5f36485e3cd9b1c794e /net/ipv4 | |
| parent | b3ac367c7a3e6047abe74817db27e34e759f279f (diff) | |
Merge with Linux 2.3.41.
Diffstat (limited to 'net/ipv4')

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | net/ipv4/af_inet.c | 261 |
| -rw-r--r-- | net/ipv4/arp.c | 10 |
| -rw-r--r-- | net/ipv4/ip_input.c | 13 |
| -rw-r--r-- | net/ipv4/ip_output.c | 12 |
| -rw-r--r-- | net/ipv4/ip_sockglue.c | 4 |
| -rw-r--r-- | net/ipv4/ipconfig.c | 9 |
| -rw-r--r-- | net/ipv4/proc.c | 16 |
| -rw-r--r-- | net/ipv4/raw.c | 8 |
| -rw-r--r-- | net/ipv4/route.c | 49 |
| -rw-r--r-- | net/ipv4/syncookies.c | 34 |
| -rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 34 |
| -rw-r--r-- | net/ipv4/tcp.c | 1048 |
| -rw-r--r-- | net/ipv4/tcp_input.c | 1370 |
| -rw-r--r-- | net/ipv4/tcp_ipv4.c | 951 |
| -rw-r--r-- | net/ipv4/tcp_output.c | 495 |
| -rw-r--r-- | net/ipv4/tcp_timer.c | 648 |
| -rw-r--r-- | net/ipv4/udp.c | 182 |

17 files changed, 2977 insertions, 2167 deletions
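A recurring theme in the af_inet.c and tcp.c hunks below is that the blocking paths (inet_wait_for_connect(), wait_for_tcp_connect(), wait_for_tcp_memory(), tcp_data_wait()) now take a timeout obtained from sock_sndtimeo()/sock_rcvtimeo() and call schedule_timeout() instead of sleeping unconditionally. From user space those kernel timeouts correspond to the standard SO_SNDTIMEO/SO_RCVTIMEO socket options. The following is only a minimal user-space sketch of exercising that behaviour; the peer address 192.0.2.1:7 is a placeholder and error handling is trimmed, none of it is taken from the patch itself:

```c
/* Sketch: bound blocking connect()/recv() on a TCP socket with the
 * SO_SNDTIMEO/SO_RCVTIMEO options that the reworked blocking paths honour.
 * Placeholder peer address; adjust before use.
 */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
	struct sockaddr_in peer;
	char buf[512];
	ssize_t n;

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* Cap blocking send and receive operations at 5 seconds. */
	setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv));
	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));

	memset(&peer, 0, sizeof(peer));
	peer.sin_family = AF_INET;
	peer.sin_port = htons(7);                         /* placeholder port */
	inet_pton(AF_INET, "192.0.2.1", &peer.sin_addr);  /* placeholder host */

	if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0) {
		perror("connect");  /* may also return early under SO_SNDTIMEO */
		close(fd);
		return 1;
	}

	n = recv(fd, buf, sizeof(buf), 0);
	if (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
		fprintf(stderr, "recv timed out after 5s\n");

	close(fd);
	return 0;
}
```

The kernel-side counterpart is visible in the hunks where schedule() becomes schedule_timeout(timeo) and the remaining timeout is passed back up the call chain.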
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 94fb19f92..bc2c97779 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -5,7 +5,7 @@ * * PF_INET protocol family socket handler. * - * Version: $Id: af_inet.c,v 1.101 2000/01/09 02:19:38 davem Exp $ + * Version: $Id: af_inet.c,v 1.104 2000/01/18 08:24:14 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -117,7 +117,9 @@ struct linux_mib net_statistics[NR_CPUS*2]; +#ifdef INET_REFCNT_DEBUG atomic_t inet_sock_nr; +#endif extern int raw_get_info(char *, char **, off_t, int); extern int snmp_get_info(char *, char **, off_t, int); @@ -159,8 +161,8 @@ void inet_sock_destruct(struct sock *sk) if (sk->protinfo.af_inet.opt) kfree(sk->protinfo.af_inet.opt); dst_release(sk->dst_cache); - atomic_dec(&inet_sock_nr); #ifdef INET_REFCNT_DEBUG + atomic_dec(&inet_sock_nr); printk(KERN_DEBUG "INET socket %p released, %d are still alive\n", sk, atomic_read(&inet_sock_nr)); #endif } @@ -171,32 +173,28 @@ void inet_sock_release(struct sock *sk) sk->prot->destroy(sk); /* Observation: when inet_sock_release is called, processes have - no access to socket. But net still has. - Step one, detach it from networking: - - A. Remove from hash tables. + * no access to socket. But net still has. + * Step one, detach it from networking: + * + * A. Remove from hash tables. */ sk->prot->unhash(sk); /* In this point socket cannot receive new packets, - but it is possible that some packets are in flight - because some CPU runs receiver and did hash table lookup - before we unhashed socket. They will achieve receive queue - and will be purged by socket destructor. - - Also we still have packets pending on receive - queue and probably, our own packets waiting in device queues. - sock_destroy will drain receive queue, but transmitted - packets will delay socket destruction until the last reference - will be released. + * but it is possible that some packets are in flight + * because some CPU runs receiver and did hash table lookup + * before we unhashed socket. They will achieve receive queue + * and will be purged by socket destructor. + * + * Also we still have packets pending on receive + * queue and probably, our own packets waiting in device queues. + * sock_destroy will drain receive queue, but transmitted + * packets will delay socket destruction until the last reference + * will be released. */ - write_lock_irq(&sk->callback_lock); - sk->dead=1; - sk->socket = NULL; - sk->sleep = NULL; - write_unlock_irq(&sk->callback_lock); + sock_orphan(sk); #ifdef INET_REFCNT_DEBUG if (atomic_read(&sk->refcnt) != 1) { @@ -222,8 +220,7 @@ int inet_setsockopt(struct socket *sock, int level, int optname, char *optval, int optlen) { struct sock *sk=sock->sk; - if (sk->prot->setsockopt==NULL) - return -EOPNOTSUPP; + return sk->prot->setsockopt(sk,level,optname,optval,optlen); } @@ -239,8 +236,7 @@ int inet_getsockopt(struct socket *sock, int level, int optname, char *optval, int *optlen) { struct sock *sk=sock->sk; - if (sk->prot->getsockopt==NULL) - return -EOPNOTSUPP; + return sk->prot->getsockopt(sk,level,optname,optval,optlen); } @@ -264,14 +260,6 @@ static int inet_autobind(struct sock *sk) return 0; } -/* Listening INET sockets never sleep to wait for memory, so - * it is completely silly to wake them up on queue space - * available events. So we hook them up to this dummy callback. - */ -static void inet_listen_write_space(struct sock *sk) -{ -} - /* * Move a socket into listening state. 
*/ @@ -282,12 +270,13 @@ int inet_listen(struct socket *sock, int backlog) unsigned char old_state; int err; + lock_sock(sk); + + err = -EINVAL; if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) - return -EINVAL; + goto out; - lock_sock(sk); old_state = sk->state; - err = -EINVAL; if (!((1<<old_state)&(TCPF_CLOSE|TCPF_LISTEN))) goto out; @@ -295,25 +284,9 @@ int inet_listen(struct socket *sock, int backlog) * we can only allow the backlog to be adjusted. */ if (old_state != TCP_LISTEN) { - sk->state = TCP_LISTEN; - sk->ack_backlog = 0; - if (sk->num == 0) { - if (sk->prot->get_port(sk, 0) != 0) { - sk->state = old_state; - err = -EAGAIN; - goto out; - } - sk->sport = htons(sk->num); - } else { - /* Not nice, but the simplest solution however */ - if (sk->prev) - ((struct tcp_bind_bucket*)sk->prev)->fastreuse = 0; - } - - sk_dst_reset(sk); - sk->prot->hash(sk); - sk->socket->flags |= SO_ACCEPTCON; - sk->write_space = inet_listen_write_space; + err = tcp_listen_start(sk); + if (err) + goto out; } sk->max_ack_backlog = backlog; err = 0; @@ -345,10 +318,6 @@ static int inet_create(struct socket *sock, int protocol) if (protocol && protocol != IPPROTO_TCP) goto free_and_noproto; protocol = IPPROTO_TCP; - if (ipv4_config.no_pmtu_disc) - sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT; - else - sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_WANT; prot = &tcp_prot; sock->ops = &inet_stream_ops; break; @@ -359,7 +328,6 @@ static int inet_create(struct socket *sock, int protocol) goto free_and_noproto; protocol = IPPROTO_UDP; sk->no_check = UDP_CSUM_DEFAULT; - sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT; prot=&udp_prot; sock->ops = &inet_dgram_ops; break; @@ -370,7 +338,6 @@ static int inet_create(struct socket *sock, int protocol) goto free_and_noproto; prot = &raw_prot; sk->reuse = 1; - sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT; sk->num = protocol; sock->ops = &inet_dgram_ops; if (protocol == IPPROTO_RAW) @@ -380,23 +347,22 @@ static int inet_create(struct socket *sock, int protocol) goto free_and_badtype; } + if (ipv4_config.no_pmtu_disc) + sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT; + else + sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_WANT; + sock_init_data(sock,sk); sk->destruct = inet_sock_destruct; - sk->zapped=0; -#ifdef CONFIG_TCP_NAGLE_OFF - sk->nonagle = 1; -#endif + sk->zapped = 0; sk->family = PF_INET; sk->protocol = protocol; sk->prot = prot; sk->backlog_rcv = prot->backlog_rcv; - sk->timer.data = (unsigned long)sk; - sk->timer.function = &tcp_keepalive_timer; - sk->protinfo.af_inet.ttl=sysctl_ip_default_ttl; sk->protinfo.af_inet.mc_loop=1; @@ -404,7 +370,9 @@ static int inet_create(struct socket *sock, int protocol) sk->protinfo.af_inet.mc_index=0; sk->protinfo.af_inet.mc_list=NULL; +#ifdef INET_REFCNT_DEBUG atomic_inc(&inet_sock_nr); +#endif if (sk->num) { /* It assumes that any protocol which allows @@ -469,11 +437,8 @@ int inet_release(struct socket *sock) * linger.. 
*/ timeout = 0; - if (sk->linger && !(current->flags & PF_EXITING)) { - timeout = HZ * sk->lingertime; - if (!timeout) - timeout = MAX_SCHEDULE_TIMEOUT; - } + if (sk->linger && !(current->flags & PF_EXITING)) + timeout = sk->lingertime; sock->sk = NULL; sk->prot->close(sk, timeout); } @@ -496,10 +461,6 @@ static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) return -EINVAL; chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr); - if (addr->sin_addr.s_addr != 0 && chk_addr_ret != RTN_LOCAL && - chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) { - return -EADDRNOTAVAIL; /* Source address MUST be ours! */ - } snum = ntohs(addr->sin_port); if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) @@ -555,25 +516,29 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, return sk->prot->connect(sk, (struct sockaddr *)uaddr, addr_len); } -static void inet_wait_for_connect(struct sock *sk) +static long inet_wait_for_connect(struct sock *sk, long timeo) { DECLARE_WAITQUEUE(wait, current); __set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(sk->sleep, &wait); + /* Basic assumption: if someone sets sk->err, he _must_ + * change state of the socket from TCP_SYN_*. + * Connect() does not allow to get error notifications + * without closing the socket. + */ while ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) { - if (signal_pending(current)) - break; - if (sk->err) - break; release_sock(sk); - schedule(); + timeo = schedule_timeout(timeo); lock_sock(sk); + if (signal_pending(current) || !timeo) + break; set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); remove_wait_queue(sk->sleep, &wait); + return timeo; } /* @@ -586,16 +551,16 @@ int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr, { struct sock *sk=sock->sk; int err; + long timeo; + + lock_sock(sk); if (uaddr->sa_family == AF_UNSPEC) { - lock_sock(sk); err = sk->prot->disconnect(sk, flags); sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; - release_sock(sk); - return err; + goto out; } - lock_sock(sk); switch (sock->state) { default: err = -EINVAL; @@ -604,40 +569,58 @@ int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr, err = -EISCONN; goto out; case SS_CONNECTING: - if (tcp_established(sk->state)) { - sock->state = SS_CONNECTED; - err = 0; - goto out; - } - if (sk->err) - goto sock_error; err = -EALREADY; - if (flags & O_NONBLOCK) - goto out; + /* Fall out of switch with err, set for this state */ break; case SS_UNCONNECTED: + err = -EISCONN; + if (sk->state != TCP_CLOSE) + goto out; + + err = -EAGAIN; + if (sk->num == 0) { + if (sk->prot->get_port(sk, 0) != 0) + goto out; + sk->sport = htons(sk->num); + } + err = sk->prot->connect(sk, uaddr, addr_len); if (err < 0) goto out; + sock->state = SS_CONNECTING; - } - if (sk->state > TCP_FIN_WAIT2) - goto sock_error; + /* Just entered SS_CONNECTING state; the only + * difference is that return value in non-blocking + * case is EINPROGRESS, rather than EALREADY. 
+ */ + err = -EINPROGRESS; + break; + } - err = -EINPROGRESS; - if (!tcp_established(sk->state) && (flags & O_NONBLOCK)) - goto out; + timeo = sock_sndtimeo(sk, flags&O_NONBLOCK); if ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) { - inet_wait_for_connect(sk); + /* Error code is set above */ + if (!timeo || !inet_wait_for_connect(sk, timeo)) + goto out; + err = -ERESTARTSYS; if (signal_pending(current)) goto out; } - if (sk->err && !tcp_established(sk->state)) - goto sock_error; + /* Connection was closed by RST, timeout, ICMP error + * or another process disconnected us. + */ + if (sk->state == TCP_CLOSE) + goto sock_error; + + /* sk->err may be not zero now, if RECVERR was ordered by user + * and error was received after socket entered established state. + * Hence, it is handled normally after connect() return successfully. + */ + sock->state = SS_CONNECTED; err = 0; out: @@ -647,11 +630,9 @@ out: sock_error: err = sock_error(sk) ? : -ECONNABORTED; sock->state = SS_UNCONNECTED; - if (sk->prot->disconnect(sk, O_NONBLOCK)) + if (sk->prot->disconnect(sk, flags)) sock->state = SS_DISCONNECTING; - release_sock(sk); - - return err; + goto out; } /* @@ -671,11 +652,7 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags) BUG_TRAP((1<<sk2->state)&(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_CLOSE)); - write_lock_irq(&sk2->callback_lock); - sk2->sleep = &newsock->wait; - newsock->sk = sk2; - sk2->socket = newsock; - write_unlock_irq(&sk2->callback_lock); + sock_graft(sk2, newsock); newsock->state = SS_CONNECTED; release_sock(sk2); @@ -749,7 +726,7 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, int size, int inet_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; - int err; + int err = 0; /* This should really check to make sure * the socket is a TCP socket. (WHY AC...) @@ -759,35 +736,45 @@ int inet_shutdown(struct socket *sock, int how) 2->3 */ if ((how & ~SHUTDOWN_MASK) || how==0) /* MAXINT->0 */ return -EINVAL; - if (!sk) - return -ENOTCONN; lock_sock(sk); - if (sock->state == SS_CONNECTING && tcp_established(sk->state)) - sock->state = SS_CONNECTED; - err = -ENOTCONN; - if (!tcp_connected(sk->state)) - goto out; - sk->shutdown |= how; - if (sk->prot->shutdown) - sk->prot->shutdown(sk, how); + if (sock->state == SS_CONNECTING) { + if ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE)) + sock->state = SS_DISCONNECTING; + else + sock->state = SS_CONNECTED; + } + + switch (sk->state) { + default: + sk->shutdown |= how; + if (sk->prot->shutdown) + sk->prot->shutdown(sk, how); + break; + case TCP_CLOSE: + err = -ENOTCONN; + break; + + /* Remaining two branches are temporary solution for missing + * close() in multithreaded environment. It is _not_ a good idea, + * but we have no choice until close() is repaired at VFS level. + */ + case TCP_LISTEN: + if (!(how & RCV_SHUTDOWN)) + break; + /* Fall through */ + case TCP_SYN_SENT: + err = sk->prot->disconnect(sk, O_NONBLOCK); + sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; + break; + } + /* Wake up anyone sleeping in poll. */ sk->state_change(sk); - err = 0; -out: release_sock(sk); return err; } -unsigned int inet_poll(struct file * file, struct socket *sock, poll_table *wait) -{ - struct sock *sk = sock->sk; - - if (sk->prot->poll == NULL) - return(0); - return sk->prot->poll(file, sock, wait); -} - /* * ioctl() calls you can issue on an INET socket. Most of these are * device configuration and stuff and very rarely used. 
Some ioctls @@ -909,7 +896,7 @@ struct proto_ops inet_stream_ops = { sock_no_socketpair, inet_accept, inet_getname, - inet_poll, + tcp_poll, inet_ioctl, inet_listen, inet_shutdown, diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 591f3cceb..588cdf030 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1,6 +1,6 @@ /* linux/net/inet/arp.c * - * Version: $Id: arp.c,v 1.83 1999/12/15 22:39:03 davem Exp $ + * Version: $Id: arp.c,v 1.84 2000/01/18 08:24:14 davem Exp $ * * Copyright (C) 1994 by Florian La Roche * @@ -487,7 +487,9 @@ void arp_send(int type, int ptype, u32 dest_ip, /* * Fill the device header for the ARP frame */ - dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len); + if (dev->hard_header && + dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len) < 0) + goto out; /* * Fill out the arp protocol part. @@ -552,6 +554,10 @@ void arp_send(int type, int ptype, u32 dest_ip, skb->dev = dev; dev_queue_xmit(skb); + return; + +out: + kfree_skb(skb); } static void parp_redo(struct sk_buff *skb) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 11a8c319b..23389d249 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) module. * - * Version: $Id: ip_input.c,v 1.44 2000/01/09 02:19:30 davem Exp $ + * Version: $Id: ip_input.c,v 1.45 2000/01/16 05:11:22 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -317,13 +317,12 @@ static inline int ip_rcv_finish(struct sk_buff *skb) #ifdef CONFIG_NET_CLS_ROUTE if (skb->dst->tclassid) { + struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id(); u32 idx = skb->dst->tclassid; - write_lock(&ip_rt_acct_lock); - ip_rt_acct[idx&0xFF].o_packets++; - ip_rt_acct[idx&0xFF].o_bytes+=skb->len; - ip_rt_acct[(idx>>16)&0xFF].i_packets++; - ip_rt_acct[(idx>>16)&0xFF].i_bytes+=skb->len; - write_unlock(&ip_rt_acct_lock); + st[idx&0xFF].o_packets++; + st[idx&0xFF].o_bytes+=skb->len; + st[(idx>>16)&0xFF].i_packets++; + st[(idx>>16)&0xFF].i_bytes+=skb->len; } #endif diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 59e6ff865..2a4e3cf41 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) output module. * - * Version: $Id: ip_output.c,v 1.77 2000/01/09 02:19:31 davem Exp $ + * Version: $Id: ip_output.c,v 1.78 2000/01/16 05:11:22 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -149,8 +149,8 @@ output_maybe_reroute(struct sk_buff *skb) /* * Add an ip header to a skbuff and send it out. */ -void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, - u32 saddr, u32 daddr, struct ip_options *opt) +int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, + u32 saddr, u32 daddr, struct ip_options *opt) { struct rtable *rt = (struct rtable *)skb->dst; struct iphdr *iph; @@ -182,8 +182,8 @@ void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, ip_send_check(iph); /* Send it out. 
*/ - NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - output_maybe_reroute); + return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, + output_maybe_reroute); } static inline int ip_finish_output2(struct sk_buff *skb) @@ -257,7 +257,7 @@ int ip_mc_output(struct sk_buff *skb) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) - NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, newskb, NULL, + NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL, newskb->dev, ip_dev_loopback_xmit); } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index c618689b2..90b74447f 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -5,7 +5,7 @@ * * The IP to API glue. * - * Version: $Id: ip_sockglue.c,v 1.46 2000/01/09 02:19:32 davem Exp $ + * Version: $Id: ip_sockglue.c,v 1.47 2000/01/16 05:11:23 davem Exp $ * * Authors: see ip.c * @@ -415,7 +415,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (sk->family == PF_INET || - ((tcp_connected(sk->state) || sk->state == TCP_SYN_SENT) + (!((1<<sk->state)&(TCPF_LISTEN|TCPF_CLOSE)) && sk->daddr != LOOPBACK4_IPV6)) { #endif if (opt) diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 4d2195312..d4d556cb5 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -534,7 +534,14 @@ static void __init ic_bootp_send_if(struct ic_device *d, u32 jiffies) /* Construct BOOTP header */ b->op = BOOTP_REQUEST; - b->htype = dev->type; + if (dev->type < 256) /* check for false types */ + b->htype = dev->type; + else if (dev->type == ARPHRD_IEEE802_TR) /* fix for token ring */ + b->htype = ARPHRD_IEEE802; + else { + printk("Unknown ARP type 0x%04x for device %s\n", dev->type, dev->name); + b->htype = dev->type; /* can cause undefined behavior */ + } b->hlen = dev->addr_len; memcpy(b->hw_addr, dev->dev_addr, dev->addr_len); b->secs = htons(jiffies / HZ); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index b3e86f58c..d6a7c57f5 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -7,7 +7,7 @@ * PROC file system. It is mainly used for debugging and * statistics. * - * Version: $Id: proc.c,v 1.38 2000/01/09 02:19:30 davem Exp $ + * Version: $Id: proc.c,v 1.41 2000/01/21 23:45:57 davem Exp $ * * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Gerald J. 
Heim, <heim@peanuts.informatik.uni-tuebingen.de> @@ -71,8 +71,9 @@ int afinet_get_info(char *buffer, char **start, off_t offset, int length) int len = socket_get_info(buffer,start,offset,length); - len += sprintf(buffer+len,"TCP: inuse %d\n", - fold_prot_inuse(&tcp_prot)); + len += sprintf(buffer+len,"TCP: inuse %d orphan %d tw %d\n", + fold_prot_inuse(&tcp_prot), + atomic_read(&tcp_orphan_count), tcp_tw_count); len += sprintf(buffer+len,"UDP: inuse %d\n", fold_prot_inuse(&udp_prot)); len += sprintf(buffer+len,"RAW: inuse %d\n", @@ -163,7 +164,14 @@ int netstat_get_info(char *buffer, char **start, off_t offset, int length) len = sprintf(buffer, "TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed" " EmbryonicRsts PruneCalled RcvPruned OfoPruned" - " OutOfWindowIcmps LockDroppedIcmps\n" + " OutOfWindowIcmps LockDroppedIcmps" + " TW TWRecycled TWKilled" + " PAWSPassive PAWSActive PAWSEstab" + " DelayedACKs DelayedACKLocked DelayedACKLost" + " ListenOverflows ListenDrops" + " TCPPrequeued TCPDirectCopyFromBacklog" + " TCPDirectCopyFromPrequeue TCPPrequeueDropped" + " TCPHPHits TCPHPHitsToUser\n" "TcpExt:"); for (i=0; i<offsetof(struct linux_mib, __pad)/sizeof(unsigned long); i++) len += sprintf(buffer+len, " %lu", fold_field((unsigned long*)net_statistics, sizeof(struct linux_mib), i)); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 6fc5e59c5..e9aa1952a 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -5,7 +5,7 @@ * * RAW - implementation of IP "raw" sockets. * - * Version: $Id: raw.c,v 1.46 2000/01/09 02:19:30 davem Exp $ + * Version: $Id: raw.c,v 1.48 2000/01/18 08:24:15 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -648,10 +648,6 @@ struct proto raw_prot = { udp_connect, /* connect */ udp_disconnect, /* disconnect */ NULL, /* accept */ - NULL, /* retransmit */ - NULL, /* write_wakeup */ - NULL, /* read_wakeup */ - datagram_poll, /* poll */ #ifdef CONFIG_IP_MROUTE ipmr_ioctl, /* ioctl */ #else @@ -669,7 +665,5 @@ struct proto raw_prot = { raw_v4_hash, /* hash */ raw_v4_unhash, /* unhash */ NULL, /* get_port */ - 128, /* max_header */ - 0, /* retransmits */ "RAW", /* name */ }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index add42730d..bbc6ec111 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -5,7 +5,7 @@ * * ROUTE - implementation of the IP router. * - * Version: $Id: route.c,v 1.78 2000/01/13 00:06:58 davem Exp $ + * Version: $Id: route.c,v 1.80 2000/01/21 06:37:27 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -1178,6 +1178,7 @@ ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, rth->u.dst.output= ip_rt_bug; atomic_set(&rth->u.dst.__refcnt, 1); + rth->u.dst.flags= DST_HOST; rth->key.dst = daddr; rth->rt_dst = daddr; rth->key.tos = tos; @@ -1385,6 +1386,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, goto e_nobufs; atomic_set(&rth->u.dst.__refcnt, 1); + rth->u.dst.flags= DST_HOST; rth->key.dst = daddr; rth->rt_dst = daddr; rth->key.tos = tos; @@ -1462,6 +1464,7 @@ local_input: rth->u.dst.output= ip_rt_bug; atomic_set(&rth->u.dst.__refcnt, 1); + rth->u.dst.flags= DST_HOST; rth->key.dst = daddr; rth->rt_dst = daddr; rth->key.tos = tos; @@ -1815,6 +1818,7 @@ make_route: goto e_nobufs; atomic_set(&rth->u.dst.__refcnt, 1); + rth->u.dst.flags= DST_HOST; rth->key.dst = daddr; rth->key.tos = tos; rth->key.src = saddr; @@ -2208,8 +2212,7 @@ ctl_table ipv4_route_table[] = { #endif #ifdef CONFIG_NET_CLS_ROUTE -struct ip_rt_acct ip_rt_acct[256]; -rwlock_t ip_rt_acct_lock = RW_LOCK_UNLOCKED; +struct ip_rt_acct *ip_rt_acct; #ifdef CONFIG_PROC_FS static int ip_rt_acct_read(char *buffer, char **start, off_t offset, @@ -2217,14 +2220,34 @@ static int ip_rt_acct_read(char *buffer, char **start, off_t offset, { *start=buffer; - if (offset + length > sizeof(ip_rt_acct)) { - length = sizeof(ip_rt_acct) - offset; + if ((offset&3) || (length&3)) + return -EIO; + + if (offset + length >= sizeof(struct ip_rt_acct)*256) { + length = sizeof(struct ip_rt_acct)*256 - offset; *eof = 1; } if (length > 0) { - read_lock_bh(&ip_rt_acct_lock); - memcpy(buffer, ((u8*)&ip_rt_acct)+offset, length); - read_unlock_bh(&ip_rt_acct_lock); + u32 *dst = (u32*)buffer; + u32 *src = (u32*)(((u8*)ip_rt_acct) + offset); + + memcpy(dst, src, length); + +#ifdef __SMP__ + if (smp_num_cpus > 1) { + int i; + int cnt = length/4; + + for (i=1; i<smp_num_cpus; i++) { + int k; + + src += (256/4)*sizeof(struct ip_rt_acct); + + for (k=0; k<cnt; k++) + dst[k] += src[k]; + } + } +#endif return length; } return 0; @@ -2236,6 +2259,16 @@ void __init ip_rt_init(void) { int i, order, goal; +#ifdef CONFIG_NET_CLS_ROUTE + for (order=0; + (PAGE_SIZE<<order) < 256*sizeof(ip_rt_acct)*smp_num_cpus; order++) + /* NOTHING */; + ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order); + if (!ip_rt_acct) + panic("IP: failed to allocate ip_rt_acct\n"); + memset(ip_rt_acct, 0, PAGE_SIZE<<order); +#endif + ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0, SLAB_HWCACHE_ALIGN, diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index e82233cfd..d218c3bdb 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -9,7 +9,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * $Id: syncookies.c,v 1.10 2000/01/09 02:19:35 davem Exp $ + * $Id: syncookies.c,v 1.11 2000/01/16 05:11:27 davem Exp $ * * Missing: IPv6 support. */ @@ -102,23 +102,16 @@ static inline struct sock * get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct open_request *req, struct dst_entry *dst) { - struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sock *child; - /* Oops! It was missing, syn_recv_sock decreases it. 
*/ - tp->syn_backlog++; + child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); + if (child) + tcp_acceptq_queue(sk, req, child); + else + tcp_openreq_free(req); - sk = tp->af_specific->syn_recv_sock(sk, skb, req, dst); - if (sk) { - req->sk = sk; - - /* Queue up for accept() */ - tcp_synq_queue(tp, req); - } else { - tp->syn_backlog--; - req->class->destructor(req); - tcp_openreq_free(req); - } - return sk; + return child; } struct sock * @@ -171,9 +164,9 @@ cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt) } } } - + req->snd_wscale = req->rcv_wscale = req->tstamp_ok = 0; - req->wscale_ok = 0; + req->wscale_ok = req->sack_ok = 0; req->expires = 0UL; req->retrans = 0; @@ -189,8 +182,8 @@ cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt) req->af.v4_req.loc_addr, sk->protinfo.af_inet.tos | RTO_CONN, 0)) { - tcp_openreq_free(req); - return NULL; + tcp_openreq_free(req); + return NULL; } /* Try to redo what tcp_v4_send_synack did. */ @@ -198,6 +191,7 @@ cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt) tcp_select_initial_window(tcp_full_space(sk),req->mss, &req->rcv_wnd, &req->window_clamp, 0, &rcv_wscale); + /* BTW win scale with syncookies is 0 by definition */ req->rcv_wscale = rcv_wscale; return get_cookie_sock(sk, skb, req, &rt->u.dst); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 9465e4021..d9416525b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1,7 +1,7 @@ /* * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem. * - * $Id: sysctl_net_ipv4.c,v 1.42 2000/01/09 02:19:37 davem Exp $ + * $Id: sysctl_net_ipv4.c,v 1.43 2000/01/16 05:11:27 davem Exp $ * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net/ipv4 directory entry (empty =) ). 
[MS] @@ -41,26 +41,6 @@ extern int sysctl_ipfrag_time; /* From ip_output.c */ extern int sysctl_ip_dynaddr; -/* From ip_masq.c */ -extern int sysctl_ip_masq_debug; - -extern int sysctl_tcp_timestamps; -extern int sysctl_tcp_window_scaling; -extern int sysctl_tcp_sack; -extern int sysctl_tcp_retrans_collapse; -extern int sysctl_tcp_keepalive_time; -extern int sysctl_tcp_keepalive_probes; -extern int sysctl_tcp_retries1; -extern int sysctl_tcp_retries2; -extern int sysctl_tcp_fin_timeout; -extern int sysctl_tcp_syncookies; -extern int sysctl_tcp_syn_retries; -extern int sysctl_tcp_stdurg; -extern int sysctl_tcp_rfc1337; -extern int sysctl_tcp_syn_taildrop; -extern int sysctl_max_syn_backlog; -extern int sysctl_tcp_tw_recycle; - /* From icmp.c */ extern int sysctl_icmp_destunreach_time; extern int sysctl_icmp_timeexceed_time; @@ -142,6 +122,12 @@ ctl_table ipv4_table[] = { &proc_dointvec}, {NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries", &sysctl_tcp_syn_retries, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_SYNACK_RETRIES, "tcp_synack_retries", + &sysctl_tcp_synack_retries, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_MAX_ORPHANS, "tcp_max_orphans", + &sysctl_tcp_max_orphans, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets", + &sysctl_tcp_max_tw_buckets, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh", &sysctl_ipfrag_high_thresh, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh", @@ -172,10 +158,10 @@ ctl_table ipv4_table[] = { {NET_TCP_SYNCOOKIES, "tcp_syncookies", &sysctl_tcp_syncookies, sizeof(int), 0644, NULL, &proc_dointvec}, #endif -#ifdef CONFIG_TCP_TW_RECYCLE {NET_TCP_TW_RECYCLE, "tcp_tw_recycle", &sysctl_tcp_tw_recycle, sizeof(int), 0644, NULL, &proc_dointvec}, -#endif + {NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow", + &sysctl_tcp_abort_on_overflow, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_TCP_STDURG, "tcp_stdurg", &sysctl_tcp_stdurg, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_TCP_RFC1337, "tcp_rfc1337", &sysctl_tcp_rfc1337, @@ -221,6 +207,8 @@ ctl_table ipv4_table[] = { {NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime", &inet_peer_gc_maxtime, sizeof(int), 0644, NULL, &proc_dointvec_jiffies, &sysctl_jiffies}, + {NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries", + &sysctl_tcp_orphan_retries, sizeof(int), 0644, NULL, &proc_dointvec}, {0} }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8e24e19a4..e01892326 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.153 2000/01/09 02:19:33 davem Exp $ + * Version: $Id: tcp.c,v 1.160 2000/01/24 18:40:32 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -202,6 +202,8 @@ * Eric Schenk : Fix fast close down bug with * shutdown() followed by close(). * Andi Kleen : Make poll agree with SIGIO + * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and + * lingertime == 0 (RFC 793 ABORT Call) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -432,113 +434,14 @@ kmem_cache_t *tcp_openreq_cachep; kmem_cache_t *tcp_bucket_cachep; kmem_cache_t *tcp_timewait_cachep; -/* - * Find someone to 'accept'. Must be called with - * the listening socket locked. 
- */ - -static struct open_request *tcp_find_established(struct tcp_opt *tp, - struct open_request **prevp) -{ - struct open_request *req = tp->syn_wait_queue; - struct open_request *prev = (struct open_request *)&tp->syn_wait_queue; - while(req) { - if (req->sk) { - if((1 << req->sk->state) & - ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) - break; - } - prev = req; - req = req->dl_next; - } - *prevp = prev; - return req; -} - -/* - * Walk down the receive queue counting readable data. - * - * Must be called with the socket lock held. - */ - -static int tcp_readable(struct sock *sk) -{ - unsigned long counted; - unsigned long amount; - struct sk_buff *skb; - int sum; - - SOCK_DEBUG(sk, "tcp_readable: %p - ",sk); - - skb = skb_peek(&sk->receive_queue); - if (skb == NULL) { - SOCK_DEBUG(sk, "empty\n"); - return(0); - } - - counted = sk->tp_pinfo.af_tcp.copied_seq; /* Where we are at the moment */ - amount = 0; - - /* Do until a push or until we are out of data. */ - do { - /* Found a hole so stops here. */ - if (before(counted, TCP_SKB_CB(skb)->seq)) /* should not happen */ - break; - - /* Length - header but start from where we are up to - * avoid overlaps. - */ - sum = skb->len - (counted - TCP_SKB_CB(skb)->seq); - if (sum >= 0) { - /* Add it up, move on. */ - amount += sum; - counted += sum; - if (skb->h.th->syn) - counted++; - } - - /* Don't count urg data ... but do it in the right place! - * Consider: "old_data (ptr is here) URG PUSH data" - * The old code would stop at the first push because - * it counted the urg (amount==1) and then does amount-- - * *after* the loop. This means tcp_readable() always - * returned zero if any URG PUSH was in the queue, even - * though there was normal data available. If we subtract - * the urg data right here, we even get it to work for more - * than one URG PUSH skb without normal data. - * This means that poll() finally works now with urg data - * in the queue. Note that rlogin was never affected - * because it doesn't use poll(); it uses two processes - * and a blocking read(). And the queue scan in tcp_read() - * was correct. Mike <pall@rz.uni-karlsruhe.de> - */ - - /* Don't count urg data. */ - if (skb->h.th->urg) - amount--; -#if 0 - if (amount && skb->h.th->psh) break; -#endif - skb = skb->next; - } while(skb != (struct sk_buff *)&sk->receive_queue); - - SOCK_DEBUG(sk, "got %lu bytes.\n",amount); - return(amount); -} +atomic_t tcp_orphan_count = ATOMIC_INIT(0); /* * LISTEN is a special case for poll.. */ -static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait) +static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait) { - struct open_request *req, *dummy; - - lock_sock(sk); - req = tcp_find_established(&sk->tp_pinfo.af_tcp, &dummy); - release_sock(sk); - if (req) - return POLLIN | POLLRDNORM; - return 0; + return sk->tp_pinfo.af_tcp.accept_queue ? (POLLIN | POLLRDNORM) : 0; } /* @@ -585,9 +488,25 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) * if you don't tell them that something has hung up! * * Check-me. + * + * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and + * our fs/select.c). It means that after we received EOF, + * poll always returns immediately, making impossible poll() on write() + * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP + * if and only if shutdown has been made in both directions. + * Actually, it is interesting to look how Solaris and DUX + * solve this dilemma. 
I would prefer, if PULLHUP were maskable, + * then we could set it on SND_SHUTDOWN. BTW examples given + * in Stevens' books assume exactly this behaviour, it explains + * why PULLHUP is incompatible with POLLOUT. --ANK + * + * NOTE. Check for TCP_CLOSE is added. The goal is to prevent + * blocking on fresh not-connected or disconnected socket. --ANK */ - if (sk->shutdown & RCV_SHUTDOWN) + if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE) mask |= POLLHUP; + if (sk->shutdown & RCV_SHUTDOWN) + mask |= POLLIN | POLLRDNORM; /* Connected? */ if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) { @@ -605,7 +524,7 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) } } - if (tp->urg_data & URG_VALID) + if (tp->urg_data & TCP_URG_VALID) mask |= POLLPRI; } return mask; @@ -631,32 +550,48 @@ void tcp_write_space(struct sock *sk) read_unlock(&sk->callback_lock); } +/* Listening TCP sockets never sleep to wait for memory, so + * it is completely silly to wake them up on queue space + * available events. So we hook them up to this dummy callback. + */ +static void tcp_listen_write_space(struct sock *sk) +{ +} int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int answ; switch(cmd) { - case TIOCINQ: -#ifdef FIXME /* FIXME: */ - case FIONREAD: -#endif + case SIOCINQ: if (sk->state == TCP_LISTEN) return(-EINVAL); + lock_sock(sk); - answ = tcp_readable(sk); + if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV)) + answ = 0; + else if (sk->urginline || !tp->urg_data || + before(tp->urg_seq,tp->copied_seq) || + !before(tp->urg_seq,tp->rcv_nxt)) + answ = tp->rcv_nxt - tp->copied_seq; + else + answ = tp->urg_seq - tp->copied_seq; release_sock(sk); break; case SIOCATMARK: { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); answ = tp->urg_data && tp->urg_seq == tp->copied_seq; break; } - case TIOCOUTQ: + case SIOCOUTQ: if (sk->state == TCP_LISTEN) return(-EINVAL); - answ = sock_wspace(sk); + + if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV)) + answ = 0; + else + answ = tp->write_seq - tp->snd_una; break; default: return(-ENOIOCTLCMD); @@ -665,12 +600,131 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) return put_user(answ, (int *)arg); } + +int tcp_listen_start(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct tcp_listen_opt *lopt; + + sk->max_ack_backlog = 0; + sk->ack_backlog = 0; + tp->accept_queue = NULL; + tp->syn_wait_lock = RW_LOCK_UNLOCKED; + + lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL); + if (!lopt) + return -ENOMEM; + + memset(lopt, 0, sizeof(struct tcp_listen_opt)); + for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++) + if ((1<<lopt->max_qlen_log) >= sysctl_max_syn_backlog) + break; + + write_lock_bh(&tp->syn_wait_lock); + tp->listen_opt = lopt; + write_unlock_bh(&tp->syn_wait_lock); + + sk->state = TCP_LISTEN; + if (sk->num == 0) { + if (sk->prot->get_port(sk, 0) != 0) { + sk->state = TCP_CLOSE; + write_lock_bh(&tp->syn_wait_lock); + tp->listen_opt = NULL; + write_unlock_bh(&tp->syn_wait_lock); + kfree(lopt); + return -EAGAIN; + } + sk->sport = htons(sk->num); + } else { + if (sk->prev) + ((struct tcp_bind_bucket*)sk->prev)->fastreuse = 0; + } + + sk_dst_reset(sk); + sk->prot->hash(sk); + sk->socket->flags |= SO_ACCEPTCON; + sk->write_space = tcp_listen_write_space; + + return 0; +} + +/* + * This routine closes sockets which have been at least partially + * opened, but not yet accepted. 
+ */ + +static void tcp_listen_stop (struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct tcp_listen_opt *lopt = tp->listen_opt; + struct open_request *acc_req = tp->accept_queue; + struct open_request *req; + int i; + + tcp_delete_keepalive_timer(sk); + + /* make all the listen_opt local to us */ + write_lock_bh(&tp->syn_wait_lock); + tp->listen_opt =NULL; + write_unlock_bh(&tp->syn_wait_lock); + tp->accept_queue = NULL; + + if (lopt->qlen) { + for (i=0; i<TCP_SYNQ_HSIZE; i++) { + while ((req = lopt->syn_table[i]) != NULL) { + lopt->syn_table[i] = req->dl_next; + lopt->qlen--; + tcp_openreq_free(req); + + /* Following specs, it would be better either to send FIN + * (and enter FIN-WAIT-1, it is normal close) + * or to send active reset (abort). + * Certainly, it is pretty dangerous while synflood, but it is + * bad justification for our negligence 8) + * To be honest, we are not able to make either + * of the variants now. --ANK + */ + } + } + } + BUG_TRAP(lopt->qlen == 0); + + kfree(lopt); + + while ((req=acc_req) != NULL) { + struct sock *child = req->sk; + + acc_req = req->dl_next; + + local_bh_disable(); + bh_lock_sock(child); + BUG_TRAP(child->lock.users==0); + sock_hold(child); + + tcp_disconnect(child, O_NONBLOCK); + + sock_orphan(child); + + atomic_inc(&tcp_orphan_count); + + tcp_destroy_sock(child); + + bh_unlock_sock(child); + local_bh_enable(); + sock_put(child); + + tcp_acceptq_removed(sk); + tcp_openreq_fastfree(req); + } + BUG_TRAP(sk->ack_backlog == 0); +} + /* * Wait for a socket to get into the connected state * * Note: Must be called with the socket locked. */ -static int wait_for_tcp_connect(struct sock * sk, int flags) +static int wait_for_tcp_connect(struct sock * sk, int flags, long *timeo_p) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -684,7 +738,7 @@ static int wait_for_tcp_connect(struct sock * sk, int flags) send_sig(SIGPIPE, tsk, 0); return -EPIPE; } - if(flags & MSG_DONTWAIT) + if(!*timeo_p) return -EAGAIN; if(signal_pending(tsk)) return -ERESTARTSYS; @@ -694,7 +748,7 @@ static int wait_for_tcp_connect(struct sock * sk, int flags) sk->tp_pinfo.af_tcp.write_pending++; release_sock(sk); - schedule(); + *timeo_p = schedule_timeout(*timeo_p); lock_sock(sk); __set_task_state(tsk, TASK_RUNNING); @@ -712,7 +766,7 @@ static inline int tcp_memory_free(struct sock *sk) /* * Wait for more memory for a socket */ -static void wait_for_tcp_memory(struct sock * sk) +static long wait_for_tcp_memory(struct sock * sk, long timeo) { if (!tcp_memory_free(sk)) { DECLARE_WAITQUEUE(wait, current); @@ -732,12 +786,13 @@ static void wait_for_tcp_memory(struct sock * sk) break; release_sock(sk); if (!tcp_memory_free(sk)) - schedule(); + timeo = schedule_timeout(timeo); lock_sock(sk); } current->state = TASK_RUNNING; remove_wait_queue(sk->sleep, &wait); } + return timeo; } /* When all user supplied data has been queued set the PSH bit */ @@ -746,11 +801,9 @@ static void wait_for_tcp_memory(struct sock * sk) /* * This routine copies from a user buffer into a socket, * and starts the transmit system. - * - * Note: must be called with the socket locked. */ -int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) +int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size) { struct iovec *iov; struct tcp_opt *tp; @@ -758,15 +811,22 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) int iovlen, flags; int mss_now; int err, copied; + long timeo; err = 0; tp = &(sk->tp_pinfo.af_tcp); - /* Wait for a connection to finish. 
*/ + lock_sock(sk); + TCP_CHECK_TIMER(sk); + flags = msg->msg_flags; + + timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT); + + /* Wait for a connection to finish. */ if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) - if((err = wait_for_tcp_connect(sk, flags)) != 0) - goto out; + if((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0) + goto out_unlock; /* This should be in poll */ sk->socket->flags &= ~SO_NOSPACE; /* clear SIGIO XXX */ @@ -777,7 +837,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) iovlen = msg->msg_iovlen; iov = msg->msg_iov; copied = 0; - + while(--iovlen >= 0) { int seglen=iov->iov_len; unsigned char * from=iov->iov_base; @@ -785,7 +845,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) iov++; while(seglen > 0) { - int copy, tmp, queue_it, psh; + int copy, tmp, queue_it; if (err) goto do_fault2; @@ -811,8 +871,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) * welcome. */ if (skb_tailroom(skb) > 0 && - (mss_now - copy) > 0 && - tp->snd_nxt < TCP_SKB_CB(skb)->end_seq) { + (mss_now - copy) > 0) { int last_byte_was_odd = (copy % 4); copy = mss_now - copy; @@ -855,34 +914,17 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) } } - /* We also need to worry about the window. If - * window < 1/2 the maximum window we've seen - * from this host, don't use it. This is - * sender side silly window prevention, as - * specified in RFC1122. (Note that this is - * different than earlier versions of SWS - * prevention, e.g. RFC813.). What we - * actually do is use the whole MSS. Since - * the results in the right edge of the packet - * being outside the window, it will be queued - * for later rather than sent. + /* A chunk was here doing something strange + * with psh etc. It is deleted, because it was + * evident non-sense. --ANK */ - psh = 0; - copy = tp->snd_wnd - (tp->snd_nxt - tp->snd_una); - if(copy > (tp->max_window >> 1)) { - copy = min(copy, mss_now); - psh = 1; - } else { - copy = mss_now; - } - if(copy > seglen) - copy = seglen; + + copy = min(seglen, mss_now); /* Determine how large of a buffer to allocate. */ - tmp = MAX_HEADER + sk->prot->max_header; - if (copy < min(mss_now, tp->max_window >> 1) && - !(flags & MSG_OOB)) { - tmp += min(mss_now, tp->max_window); + tmp = MAX_TCP_HEADER + 15; + if (copy < mss_now && !(flags & MSG_OOB)) { + tmp += mss_now; /* What is happening here is that we want to * tack on later members of the users iovec @@ -901,7 +943,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) /* If we didn't get any memory, we need to sleep. */ if (skb == NULL) { sk->socket->flags |= SO_NOSPACE; - if (flags&MSG_DONTWAIT) { + if (!timeo) { err = -EAGAIN; goto do_interrupted; } @@ -909,8 +951,8 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) err = -ERESTARTSYS; goto do_interrupted; } - tcp_push_pending_frames(sk, tp); - wait_for_tcp_memory(sk); + __tcp_push_pending_frames(sk, tp, mss_now); + timeo = wait_for_tcp_memory(sk, timeo); /* If SACK's were formed or PMTU events happened, * we must find out about it. @@ -923,7 +965,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) /* Prepare control bits for TCP header creation engine. */ TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | - ((PSH_NEEDED || psh) ? + ((PSH_NEEDED) ? TCPCB_FLAG_PSH : 0)); TCP_SKB_CB(skb)->sacked = 0; if (flags & MSG_OOB) { @@ -936,7 +978,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) * TCP+IP+DEV headers are SKB_PUSH()'d beneath. * Reserve header space and checksum the data. 
*/ - skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb_reserve(skb, MAX_TCP_HEADER); skb->csum = csum_and_copy_from_user(from, skb_put(skb, copy), copy, 0, &err); @@ -950,7 +992,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy; /* This advances tp->write_seq for us. */ - tcp_send_skb(sk, skb, queue_it); + tcp_send_skb(sk, skb, queue_it, mss_now); } } sk->err = 0; @@ -981,63 +1023,39 @@ do_fault: do_fault2: err = -EFAULT; out: - tcp_push_pending_frames(sk, tp); + __tcp_push_pending_frames(sk, tp, mss_now); + TCP_CHECK_TIMER(sk); +out_unlock: + release_sock(sk); return err; } #undef PSH_NEEDED /* - * Send an ack if one is backlogged at this point. Ought to merge - * this with tcp_send_ack(). - * This is called for delayed acks also. - */ - -void tcp_read_wakeup(struct sock *sk) -{ - /* If we're closed, don't send an ack, or we'll get a RST - * from the closed destination. - */ - if (sk->state != TCP_CLOSE) - tcp_send_ack(sk); -} - -/* * Handle reading urgent data. BSD has very simple semantics for * this, no blocking and very strange errors 8) */ -static int tcp_recv_urg(struct sock * sk, int nonblock, +static int tcp_recv_urg(struct sock * sk, long timeo, struct msghdr *msg, int len, int flags, int *addr_len) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* No URG data to read. */ - if (sk->urginline || !tp->urg_data || tp->urg_data == URG_READ) + if (sk->urginline || !tp->urg_data || tp->urg_data == TCP_URG_READ) return -EINVAL; /* Yes this is right ! */ if (sk->done) return -ENOTCONN; - if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN)) { - sk->done = 1; - return 0; - } - - if (tp->urg_data & URG_VALID) { + if (tp->urg_data & TCP_URG_VALID) { int err = 0; char c = tp->urg_data; if (!(flags & MSG_PEEK)) - tp->urg_data = URG_READ; - - if(msg->msg_name) - tp->af_specific->addr2sockaddr(sk, (struct sockaddr *) - msg->msg_name); - - if(addr_len) - *addr_len = tp->af_specific->sockaddr_len; + tp->urg_data = TCP_URG_READ; /* Read urgent data. */ msg->msg_flags|=MSG_OOB; @@ -1051,6 +1069,10 @@ static int tcp_recv_urg(struct sock * sk, int nonblock, return err ? -EFAULT : len; } + /* Do not set sk->done, it is set only by normal data receive */ + if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN)) + return 0; + /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and * the available implementations agree in this case: * this call should never block, independent of the @@ -1069,6 +1091,8 @@ static int tcp_recv_urg(struct sock * sk, int nonblock, static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb) { __skb_unlink(skb, &sk->receive_queue); + BUG_TRAP(atomic_read(&skb->users) == 1); + /* Well, if I missed something then punishment will be terrible oops. */ __kfree_skb(skb); } @@ -1080,22 +1104,34 @@ static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb) */ static void cleanup_rbuf(struct sock *sk, int copied) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; + int time_to_ack; /* NOTE! The socket must be locked, so that we don't get * a messed-up receive queue. */ while ((skb=skb_peek(&sk->receive_queue)) != NULL) { - if (!skb->used || atomic_read(&skb->users) > 1) + if (!skb->used) break; tcp_eat_skb(sk, skb); } + /* Delayed ACKs frequently hit locked sockets during bulk receive. 
*/ + time_to_ack = tp->ack.blocked && tp->ack.pending; +#if 1/*def CONFIG_TCP_MORE_COARSE_ACKS*/ + if (tp->ack.pending && + (tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss) + time_to_ack = 1; +#endif + /* We send an ACK if we can now advertise a non-zero window * which has been raised "significantly". + * + * Even if window raised up to infinity, do not send window open ACK + * in states, where we will not receive more. It is useless. */ - if(copied > 0) { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + if(copied > 0 && !time_to_ack && !(sk->shutdown&RCV_SHUTDOWN)) { __u32 rcv_window_now = tcp_receive_window(tp); __u32 new_window = __tcp_select_window(sk); @@ -1106,16 +1142,20 @@ static void cleanup_rbuf(struct sock *sk, int copied) * which don't advertize a larger window. */ if((new_window && (new_window >= rcv_window_now * 2)) && - ((rcv_window_now + tp->mss_cache) <= tp->window_clamp)) - tcp_read_wakeup(sk); + ((rcv_window_now + tp->ack.rcv_mss) <= tp->window_clamp)) + time_to_ack = 1; } + if (time_to_ack) + tcp_send_ack(sk); } /* Now socket state including sk->err is changed only under lock, - hence we should check only pending signals. + * hence we may omit checks after joining wait queue. + * We check receive queue before schedule() only as optimization; + * it is very likely that release_sock() added new data. */ -static void tcp_data_wait(struct sock *sk) +static long tcp_data_wait(struct sock *sk, long timeo) { DECLARE_WAITQUEUE(wait, current); @@ -1127,17 +1167,39 @@ static void tcp_data_wait(struct sock *sk) release_sock(sk); if (skb_queue_empty(&sk->receive_queue)) - schedule(); + timeo = schedule_timeout(timeo); lock_sock(sk); sk->socket->flags &= ~SO_WAITDATA; remove_wait_queue(sk->sleep, &wait); __set_current_state(TASK_RUNNING); + return timeo; +} + +static void tcp_prequeue_process(struct sock *sk) +{ + struct sk_buff *skb; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + net_statistics[smp_processor_id()*2+1].TCPPrequeued += skb_queue_len(&tp->ucopy.prequeue); + + /* RX process wants to run with disabled BHs, though it is not necessary */ + local_bh_disable(); + while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) + sk->backlog_rcv(sk, skb); + local_bh_enable(); + + /* Clear memory counter. */ + tp->ucopy.memory = 0; } /* * This routine copies from a sock struct into the user buffer. + * + * Technical note: in 2.3 we work on _locked_ socket, so that + * tricks with *seq access order and skb->users are not required. + * Probably, code can be easily improved even more. */ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, @@ -1146,13 +1208,18 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int copied = 0; u32 peek_seq; - volatile u32 *seq; /* So gcc doesn't overoptimise */ + u32 *seq; unsigned long used; int err; - int target = 1; /* Read at least this many bytes */ + int target; /* Read at least this many bytes */ + long timeo; + struct task_struct *user_recv = NULL; lock_sock(sk); + TCP_CHECK_TIMER(sk); + + if (sk->err) goto out_err; @@ -1160,24 +1227,20 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, if (sk->state == TCP_LISTEN) goto out; + timeo = sock_rcvtimeo(sk, nonblock); + /* Urgent data needs to be handled specially. */ if (flags & MSG_OOB) goto recv_urg; - /* Copying sequence to update. This is volatile to handle - * the multi-reader case neatly (memcpy_to/fromfs might be - * inline and thus not flush cached variables otherwise). 
- */ - peek_seq = tp->copied_seq; seq = &tp->copied_seq; - if (flags & MSG_PEEK) + if (flags & MSG_PEEK) { + peek_seq = tp->copied_seq; seq = &peek_seq; + } - /* Handle the POSIX bogosity MSG_WAITALL. */ - if (flags & MSG_WAITALL) - target=len; + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); - /* * BUG BUG BUG * This violates 1003.1g compliance. We must wait for @@ -1200,7 +1263,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, if (copied) break; copied = -ERESTARTSYS; - if (nonblock) + if (!timeo) copied = -EAGAIN; break; } @@ -1232,47 +1295,128 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, skb = skb->next; } while (skb != (struct sk_buff *)&sk->receive_queue); - if (copied >= target) + /* Well, if we have backlog, try to process it now yet. */ + + if (copied >= target && sk->backlog.tail == NULL) break; - if (sk->err && !(flags&MSG_PEEK)) { - if (!copied) + if (copied) { + if (sk->err || + sk->state == TCP_CLOSE || + (sk->shutdown & RCV_SHUTDOWN) || + !timeo) + break; + } else { + if (sk->err) { copied = sock_error(sk); - break; - } + break; + } - if (sk->shutdown & RCV_SHUTDOWN) { - sk->done = 1; - break; - } + if (sk->done) { + copied = -ENOTCONN; + break; + } - if (sk->state == TCP_CLOSE) { - if (!sk->done) { - sk->done = 1; + if (sk->state == TCP_CLOSE) { + if (!(flags&MSG_PEEK)) + sk->done = 1; break; } - if (!copied) - copied = -ENOTCONN; - break; - } - if (nonblock) { - copied = -EAGAIN; - break; + if (sk->shutdown & RCV_SHUTDOWN) + break; + + if (!timeo) { + copied = -EAGAIN; + break; + } } cleanup_rbuf(sk, copied); - tcp_data_wait(sk); + + if (tp->ucopy.task == user_recv) { + /* Install new reader */ + if (user_recv == NULL && !(flags&MSG_PEEK)) { + user_recv = current; + tp->ucopy.task = user_recv; + tp->ucopy.iov = msg->msg_iov; + } + + tp->ucopy.len = len; + + BUG_TRAP(tp->copied_seq == tp->rcv_nxt); + + /* Ugly... If prequeue is not empty, we have to + * process it before releasing socket, otherwise + * order will be broken at second iteration. + * More elegant solution is required!!! + * + * Look: we have the following (pseudo)queues: + * + * 1. packets in flight + * 2. backlog + * 3. prequeue + * 4. receive_queue + * + * Each queue can be processed only if the next ones + * are empty. At this point we have empty receive_queue. + * But prequeue _can_ be not empty after second iteration, + * when we jumped to start of loop because backlog + * processing added something to receive_queue. + * We cannot release_sock(), because backlog contains + * packets arrived _after_ prequeued ones. + * + * Shortly, algorithm is clear --- to process all + * the queues in order. We could make it more directly, + * requeueing packets from backlog to prequeue, if + * is not empty. It is more elegant, but eats cycles, + * unfortunately. + */ + if (skb_queue_len(&tp->ucopy.prequeue)) + goto do_prequeue; + + /* __ Set realtime policy in scheduler __ */ + } + + if (copied >= target) { + /* Do not sleep, just process backlog. 
*/ + release_sock(sk); + lock_sock(sk); + } else { + timeo = tcp_data_wait(sk, timeo); + } + + if (user_recv) { + int chunk; + + /* __ Restore normal policy in scheduler __ */ + + if ((chunk = len - tp->ucopy.len) != 0) { + net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromBacklog += chunk; + len -= chunk; + copied += chunk; + } + + if (tp->rcv_nxt == tp->copied_seq && + skb_queue_len(&tp->ucopy.prequeue)) { +do_prequeue: + tcp_prequeue_process(sk); + + if ((chunk = len - tp->ucopy.len) != 0) { + net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk; + len -= chunk; + copied += chunk; + } + } +#if 1/*def CONFIG_TCP_MORE_COARSE_ACKS*/ + if (tp->ack.pending && + (tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss) + tcp_send_ack(sk); +#endif + } continue; found_ok_skb: - /* Lock the buffer. We can be fairly relaxed as - * an interrupt will never steal a buffer we are - * using unless I've missed something serious in - * tcp_data. - */ - atomic_inc(&skb->users); - /* Ok so how much can we use? */ used = skb->len - offset; if (len < used) @@ -1293,36 +1437,28 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, } } - /* Copy it - We _MUST_ update *seq first so that we - * don't ever double read when we have dual readers - */ - *seq += used; - - /* This memcpy_toiovec can sleep. If it sleeps and we - * do a second read it relies on the skb->users to avoid - * a crash when cleanup_rbuf() gets called. - */ err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used); if (err) { /* Exception. Bailout! */ - atomic_dec(&skb->users); - copied = -EFAULT; + if (!copied) + copied = -EFAULT; break; } + *seq += used; copied += used; len -= used; - /* We now will not sleep again until we are finished - * with skb. Sorry if you are doing the SMP port - * but you'll just have to fix it neatly ;) - * - * Very funny Alan... -DaveM - */ - atomic_dec(&skb->users); - - if (after(tp->copied_seq,tp->urg_seq)) + if (after(tp->copied_seq,tp->urg_seq)) { tp->urg_data = 0; + if (skb_queue_len(&tp->out_of_order_queue) == 0 +#ifdef TCP_FORMAL_WINDOW + && tcp_receive_window(tp) +#endif + ) { + tcp_fast_path_on(tp); + } + } if (used + offset < skb->len) continue; @@ -1334,8 +1470,30 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, if (flags & MSG_PEEK) continue; skb->used = 1; - if (atomic_read(&skb->users) == 1) - tcp_eat_skb(sk, skb); + tcp_eat_skb(sk, skb); + +#ifdef CONFIG_TCP_LESS_COARSE_ACKS + /* Possible improvement. When sender is faster than receiver, + * traffic looks like: fill window ... wait for window open ... + * fill window. We lose at least one rtt, because call + * cleanup_rbuf only once. Probably, if "len" was large + * we should insert several intermediate cleanup_rbuf(s). + * + * F.e.: + */ + do { + u32 full_space = min(tp->window_clamp, tcp_full_space(sk)); + + /* Try to ACK, if total buffer length is larger + than maximal window and if rcv_window has + chances to increase twice. It will result + to exponentially decreased ACKing during + read to huge (usually, mmapped) buffer. + */ + if (len >= full_space && tp->rcv_wnd <= full_space/2) + cleanup_rbuf(sk, copied); + } while (0); +#endif continue; found_fin_ok: @@ -1345,19 +1503,36 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, /* All is done. 
*/ skb->used = 1; - sk->shutdown |= RCV_SHUTDOWN; break; } - if (copied >= 0 && msg->msg_name) - tp->af_specific->addr2sockaddr(sk, (struct sockaddr *) - msg->msg_name); + if (user_recv) { + if (skb_queue_len(&tp->ucopy.prequeue)) { + int chunk; + + tp->ucopy.len = copied > 0 ? len : 0; + + tcp_prequeue_process(sk); + + if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { + net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk; + len -= chunk; + copied += chunk; + } + } - if(addr_len) - *addr_len = tp->af_specific->sockaddr_len; + tp->ucopy.task = NULL; + tp->ucopy.len = 0; + } + + /* According to UNIX98, msg_name/msg_namelen are ignored + * on connected socket. I was just happy when found this 8) --ANK + */ /* Clean up data we have read: This will do ACK frames. */ cleanup_rbuf(sk, copied); + + TCP_CHECK_TIMER(sk); release_sock(sk); return copied; @@ -1365,24 +1540,16 @@ out_err: err = sock_error(sk); out: + TCP_CHECK_TIMER(sk); release_sock(sk); return err; recv_urg: - err = tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len); + err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len); goto out; } /* - * Check whether to renew the timer. - */ -static inline void tcp_check_fin_timer(struct sock *sk) -{ - if (sk->state == TCP_FIN_WAIT2) - tcp_reset_keepalive_timer(sk, sysctl_tcp_fin_timeout); -} - -/* * State processing on a close. This implements the state shift for * sending our FIN frame. Note that we only send a FIN for some * states. A shutdown() may have already sent the FIN, or we may be @@ -1405,24 +1572,13 @@ static unsigned char new_state[16] = { /* TCP_CLOSING */ TCP_CLOSING, }; -static int tcp_close_state(struct sock *sk, int dead) +static int tcp_close_state(struct sock *sk) { int next = (int) new_state[sk->state]; int ns = (next & TCP_STATE_MASK); tcp_set_state(sk, ns); - /* This is a (useful) BSD violating of the RFC. There is a - * problem with TCP as specified in that the other end could - * keep a socket open forever with no application left this end. - * We use a 3 minute timeout (about the same as BSD) then kill - * our end. If they send after that then tough - BUT: long enough - * that we won't make the old 4*rto = almost no time - whoops - * reset mistake. - */ - if (dead) - tcp_check_fin_timer(sk); - return (next & TCP_ACTION_FIN); } @@ -1443,9 +1599,8 @@ void tcp_shutdown(struct sock *sk, int how) /* If we've already sent a FIN, or it's a closed state, skip this. */ if ((1 << sk->state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) { - /* Clear out any half completed packets. FIN if needed. */ - if (tcp_close_state(sk,0)) + if (tcp_close_state(sk)) tcp_send_fin(sk); } } @@ -1460,40 +1615,6 @@ static inline int closing(struct sock * sk) return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK)); } -/* - * This routine closes sockets which have been at least partially - * opened, but not yet accepted. Currently it is only called by - * tcp_close. 
- */ - -static void tcp_close_pending (struct sock *sk) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct open_request *req = tp->syn_wait_queue; - - while(req) { - struct open_request *iter; - - if (req->sk) - tcp_close(req->sk, 0); - - iter = req; - req = req->dl_next; - - if (iter->sk) { - sk->ack_backlog--; - } else { - tcp_dec_slow_timer(TCP_SLT_SYNACK); - tp->syn_backlog--; - } - (*iter->class->destructor)(iter); - tcp_openreq_free(iter); - } - BUG_TRAP(tp->syn_backlog == 0); - BUG_TRAP(sk->ack_backlog == 0); - tcp_synq_init(tp); -} - static __inline__ void tcp_kill_sk_queues(struct sock *sk) { /* First the read buffer. */ @@ -1528,6 +1649,14 @@ void tcp_destroy_sock(struct sock *sk) /* It it has not 0 sk->num, it must be bound */ BUG_TRAP(!sk->num || sk->prev!=NULL); +#ifdef TCP_DEBUG + if (sk->zapped) { + printk("TCP: double destroy sk=%p\n", sk); + sock_hold(sk); + } + sk->zapped = 1; +#endif + sk->prot->destroy(sk); tcp_kill_sk_queues(sk); @@ -1538,6 +1667,7 @@ void tcp_destroy_sock(struct sock *sk) } #endif + atomic_dec(&tcp_orphan_count); sock_put(sk); } @@ -1547,17 +1677,17 @@ void tcp_close(struct sock *sk, long timeout) int data_was_unread = 0; lock_sock(sk); + sk->shutdown = SHUTDOWN_MASK; + if(sk->state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); /* Special case. */ - tcp_close_pending(sk); + tcp_listen_stop(sk); goto adjudge_to_death; } - sk->shutdown = SHUTDOWN_MASK; - /* We need to flush the recv. buffs. We do this only on the * descriptor close, not protocol-sourced closes, because the * reader process may not have drained the data yet! @@ -1581,10 +1711,35 @@ void tcp_close(struct sock *sk, long timeout) /* Unread data was tossed, zap the connection. */ tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_KERNEL); - } else if (tcp_close_state(sk,1)) { + } else if (sk->linger && sk->lingertime==0) { + /* Check zero linger _after_ checking for unread data. */ + sk->prot->disconnect(sk, 0); + } else if (tcp_close_state(sk)) { /* We FIN if the application ate all the data before * zapping the connection. */ + + /* RED-PEN. Formally speaking, we have broken TCP state + * machine. State transitions: + * + * TCP_ESTABLISHED -> TCP_FIN_WAIT1 + * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible) + * TCP_CLOSE_WAIT -> TCP_LAST_ACK + * + * are legal only when FIN has been sent (i.e. in window), + * rather than queued out of window. Purists blame. + * + * F.e. "RFC state" is ESTABLISHED, + * if Linux state is FIN-WAIT-1, but FIN is still not sent. + * + * The visible declinations are that sometimes + * we enter time-wait state, when it is not required really + * (harmless), do not send active resets, when they are + * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when + * they look as CLOSING or LAST_ACK for Linux) + * Probably, I missed some more holelets. + * --ANK + */ tcp_send_fin(sk); } @@ -1594,26 +1749,19 @@ void tcp_close(struct sock *sk, long timeout) add_wait_queue(sk->sleep, &wait); - while (1) { + do { set_current_state(TASK_INTERRUPTIBLE); if (!closing(sk)) break; release_sock(sk); timeout = schedule_timeout(timeout); lock_sock(sk); - if (!signal_pending(tsk) || timeout) - break; - } + } while (!signal_pending(tsk) && timeout); tsk->state = TASK_RUNNING; remove_wait_queue(sk->sleep, &wait); } - /* Now that the socket is dead, if we are in the FIN_WAIT2 state - * we may need to set up a timer. - */ - tcp_check_fin_timer(sk); - adjudge_to_death: /* It is the last release_sock in its life. It will remove backlog. 
*/ release_sock(sk); @@ -1627,23 +1775,67 @@ adjudge_to_death: BUG_TRAP(sk->lock.users==0); sock_hold(sk); + sock_orphan(sk); + + /* This is a (useful) BSD violating of the RFC. There is a + * problem with TCP as specified in that the other end could + * keep a socket open forever with no application left this end. + * We use a 3 minute timeout (about the same as BSD) then kill + * our end. If they send after that then tough - BUT: long enough + * that we won't make the old 4*rto = almost no time - whoops + * reset mistake. + * + * Nope, it was not mistake. It is really desired behaviour + * f.e. on http servers, when such sockets are useless, but + * consume significant resources. Let's do it with special + * linger2 option. --ANK + */ + + if (sk->state == TCP_FIN_WAIT2) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + if (tp->linger2 < 0) { + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk, GFP_ATOMIC); + } else { + int tmo = tcp_fin_time(tp); - /* Announce socket dead, detach it from wait queue and inode. */ - write_lock_irq(&sk->callback_lock); - sk->dead = 1; - sk->socket = NULL; - sk->sleep = NULL; - write_unlock_irq(&sk->callback_lock); + if (tmo > TCP_TIMEWAIT_LEN) { + tcp_reset_keepalive_timer(sk, tcp_fin_time(tp)); + } else { + atomic_inc(&tcp_orphan_count); + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto out; + } + } + } + if (sk->state != TCP_CLOSE && + atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans) { + if (net_ratelimit()) + printk(KERN_INFO "TCP: too many of orphaned sockets\n"); + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk, GFP_ATOMIC); + } + atomic_inc(&tcp_orphan_count); if (sk->state == TCP_CLOSE) tcp_destroy_sock(sk); /* Otherwise, socket is reprieved until protocol close. */ +out: bh_unlock_sock(sk); local_bh_enable(); sock_put(sk); } +/* These states need RST on ABORT according to RFC793 */ + +extern __inline__ int tcp_need_reset(int state) +{ + return ((1 << state) & + (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1| + TCPF_FIN_WAIT2|TCPF_SYN_RECV)); +} + int tcp_disconnect(struct sock *sk, int flags) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; @@ -1656,9 +1848,14 @@ int tcp_disconnect(struct sock *sk, int flags) /* ABORT function of RFC793 */ if (old_state == TCP_LISTEN) { - tcp_close_pending(sk); - } else if (tcp_connected(old_state)) { - tcp_send_active_reset(sk, GFP_KERNEL); + tcp_listen_stop(sk); + } else if (tcp_need_reset(old_state) || + (tp->snd_nxt != tp->write_seq && + (1<<old_state)&(TCPF_CLOSING|TCPF_LAST_ACK))) { + /* The last check adjusts for discrepance of Linux wrt. 
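The policy this hunk applies to orphaned FIN_WAIT2 sockets can be restated as a small decision helper. This is a condensed sketch under assumed constants (TCP_TIMEWAIT_LEN taken as 60 seconds); the helper and enum names are stand-ins, not kernel symbols.

/* Condensed model of the orphaned FIN_WAIT2 policy added above. */
#include <stdio.h>

#define HZ               100
#define TCP_TIMEWAIT_LEN (60 * HZ)      /* assumed value, for illustration only */

enum fw2_action { FW2_SEND_RST, FW2_KEEPALIVE_TIMER, FW2_TIME_WAIT };

/* linger2 < 0  : kill the connection at once with a reset
 * fin_timeout  : how long we are willing to keep a dead FIN_WAIT_2 around */
static enum fw2_action orphaned_fin_wait2(int linger2, int fin_timeout)
{
    if (linger2 < 0)
        return FW2_SEND_RST;
    if (fin_timeout > TCP_TIMEWAIT_LEN)
        return FW2_KEEPALIVE_TIMER;     /* too long for a tw bucket, use a real timer */
    return FW2_TIME_WAIT;               /* park it in a lightweight time-wait bucket */
}

int main(void)
{
    printf("%d %d %d\n",
           orphaned_fin_wait2(-1, 0),
           orphaned_fin_wait2(60, 120 * HZ),
           orphaned_fin_wait2(60, 30 * HZ));
    return 0;
}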
RFC + * states + */ + tcp_send_active_reset(sk, gfp_any()); sk->err = ECONNRESET; } else if (old_state == TCP_SYN_SENT) sk->err = ECONNRESET; @@ -1677,26 +1874,25 @@ int tcp_disconnect(struct sock *sk, int flags) memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16); #endif - sk->zapped = 0; sk->shutdown = 0; sk->done = 0; sk->write_space = tcp_write_space; tp->srtt = 0; -#ifdef CONFIG_TCP_TW_RECYCLE - if ((tp->write_seq += 2) == 0) - tp->write_seq = 1; -#else - tp->write_seq = 0; -#endif - tp->ato = 0; + if (sysctl_tcp_tw_recycle) { + if ((tp->write_seq += 2) == 0) + tp->write_seq = 1; + } else { + tp->write_seq = 0; + } tp->backoff = 0; tp->snd_cwnd = 2; tp->probes_out = 0; + tp->packets_out = 0; tp->high_seq = 0; tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_cnt = 0; tp->dup_acks = 0; - tp->delayed_acks = 0; + tcp_delack_init(tp); tp->send_head = tp->retrans_head = NULL; tp->saw_tstamp = 0; __sk_dst_reset(sk); @@ -1712,11 +1908,10 @@ int tcp_disconnect(struct sock *sk, int flags) * conditions. This must be called with the socket locked, * and without the kernel lock held. */ -static struct open_request * wait_for_connect(struct sock * sk, - struct open_request **pprev) +static int wait_for_connect(struct sock * sk, long timeo) { DECLARE_WAITQUEUE(wait, current); - struct open_request *req; + int err; /* * True wake-one mechanism for incoming connections: only @@ -1736,17 +1931,25 @@ static struct open_request * wait_for_connect(struct sock * sk, for (;;) { current->state = TASK_EXCLUSIVE | TASK_INTERRUPTIBLE; release_sock(sk); - schedule(); + if (sk->tp_pinfo.af_tcp.accept_queue == NULL) + timeo = schedule_timeout(timeo); lock_sock(sk); - req = tcp_find_established(&(sk->tp_pinfo.af_tcp), pprev); - if (req) + err = 0; + if (sk->tp_pinfo.af_tcp.accept_queue) + break; + err = -EINVAL; + if (sk->state != TCP_LISTEN) break; + err = -ERESTARTSYS; if (signal_pending(current)) break; + err = -EAGAIN; + if (!timeo) + break; } current->state = TASK_RUNNING; remove_wait_queue(sk->sleep, &wait); - return req; + return err; } /* @@ -1758,9 +1961,10 @@ static struct open_request * wait_for_connect(struct sock * sk, struct sock *tcp_accept(struct sock *sk, int flags, int *err) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - struct open_request *req, *prev; + struct open_request *req; struct sock *newsk; int error; + long timeo; lock_sock(sk); @@ -1771,25 +1975,27 @@ struct sock *tcp_accept(struct sock *sk, int flags, int *err) if (sk->state != TCP_LISTEN) goto out; + timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + /* Find already established connection */ - req = tcp_find_established(tp, &prev); - if (!req) { + if (!tp->accept_queue) { /* If this is a non blocking socket don't sleep */ error = -EAGAIN; - if (flags & O_NONBLOCK) + if (!timeo) goto out; - error = -ERESTARTSYS; - req = wait_for_connect(sk, &prev); - if (!req) + error = wait_for_connect(sk, timeo); + if (error) goto out; } - tcp_synq_unlink(tp, req, prev); - newsk = req->sk; - req->class->destructor(req); - tcp_openreq_free(req); - sk->ack_backlog--; + req = tp->accept_queue; + tp->accept_queue = req->dl_next; + + newsk = req->sk; + tcp_acceptq_removed(sk); + tcp_openreq_fastfree(req); + BUG_TRAP(newsk->state != TCP_SYN_RECV); release_sock(sk); return newsk; @@ -1828,7 +2034,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, * the point when this call is done we typically don't yet know * which interface is going to be used */ - if(val < 1 || val > MAX_WINDOW) { + if(val < 8 || val > MAX_TCP_WINDOW) { err = -EINVAL; 
break; } @@ -1839,11 +2045,11 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, /* You cannot try to use this and TCP_CORK in * tandem, so let the user know. */ - if (sk->nonagle == 2) { + if (tp->nonagle == 2) { err = -EINVAL; break; } - sk->nonagle = (val == 0) ? 0 : 1; + tp->nonagle = (val == 0) ? 0 : 1; break; case TCP_CORK: @@ -1858,14 +2064,14 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, * You cannot try to use TCP_NODELAY and this mechanism * at the same time, so let the user know. */ - if (sk->nonagle == 1) { + if (tp->nonagle == 1) { err = -EINVAL; break; } if (val != 0) { - sk->nonagle = 2; + tp->nonagle = 2; } else { - sk->nonagle = 0; + tp->nonagle = 0; tcp_push_pending_frames(sk, tp); } @@ -1905,6 +2111,38 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, tp->syn_retries = val; break; + case TCP_LINGER2: + if (val < 0) + tp->linger2 = -1; + else if (val > sysctl_tcp_fin_timeout/HZ) + tp->linger2 = 0; + else + tp->linger2 = val*HZ; + break; + + case TCP_DEFER_ACCEPT: + tp->defer_accept = 0; + if (val > 0) { + /* Translate value in seconds to number of retransmits */ + while (val > ((TCP_TIMEOUT_INIT/HZ)<<tp->defer_accept)) + tp->defer_accept++; + tp->defer_accept++; + } + break; + + case TCP_WINDOW_CLAMP: + if (val==0) { + if (sk->state != TCP_CLOSE) { + err = -EINVAL; + break; + } + tp->window_clamp = 0; + } else { + tp->window_clamp = val<SOCK_MIN_RCVBUF/2 ? + SOCK_MIN_SNDBUF : val; + } + break; + default: err = -ENOPROTOOPT; break; @@ -1930,37 +2168,38 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, switch(optname) { case TCP_MAXSEG: - val = tp->user_mss; + val = tp->mss_cache; + if (val == 0 && ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))) + val = tp->user_mss; break; case TCP_NODELAY: - val = (sk->nonagle == 1); + val = (tp->nonagle == 1); break; case TCP_CORK: - val = (sk->nonagle == 2); + val = (tp->nonagle == 2); break; case TCP_KEEPIDLE: - if (tp->keepalive_time) - val = tp->keepalive_time / HZ; - else - val = sysctl_tcp_keepalive_time / HZ; + val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time)/HZ; break; case TCP_KEEPINTVL: - if (tp->keepalive_intvl) - val = tp->keepalive_intvl / HZ; - else - val = sysctl_tcp_keepalive_intvl / HZ; + val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl)/HZ; break; case TCP_KEEPCNT: - if (tp->keepalive_probes) - val = tp->keepalive_probes; - else - val = sysctl_tcp_keepalive_probes; + val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; break; case TCP_SYNCNT: - if (tp->syn_retries) - val = tp->syn_retries; - else - val = sysctl_tcp_syn_retries; + val = tp->syn_retries ? : sysctl_tcp_syn_retries; + break; + case TCP_LINGER2: + val = tp->linger2; + if (val > 0) + val = (val ? : sysctl_tcp_fin_timeout)/HZ; + break; + case TCP_DEFER_ACCEPT: + val = tp->defer_accept == 0 ? 0 : (TCP_TIMEOUT_INIT<<(tp->defer_accept-1)); + break; + case TCP_WINDOW_CLAMP: + val = tp->window_clamp; break; default: return -ENOPROTOOPT; @@ -2049,11 +2288,20 @@ void __init tcp_init(void) tcp_bhash[i].chain = NULL; } + /* Try to be a bit smarter and adjust defaults depending + * on available memory. 
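A hypothetical userspace caller of the three options handled above might look as follows, assuming TCP_LINGER2, TCP_DEFER_ACCEPT and TCP_WINDOW_CLAMP are exported through <netinet/tcp.h>; values are seconds for the first two and bytes for the clamp. Error handling is kept minimal.

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    int linger2 = 30;       /* keep a dead FIN_WAIT_2 at most 30 seconds          */
    int defer   = 5;        /* wake accept() only once data has arrived, retried
                               for roughly 5 seconds                              */
    int clamp   = 65535;    /* upper bound on the advertised receive window       */

    if (fd < 0) {
        perror("socket");
        return 1;
    }
    if (setsockopt(fd, IPPROTO_TCP, TCP_LINGER2, &linger2, sizeof(linger2)) < 0)
        perror("TCP_LINGER2");
    if (setsockopt(fd, IPPROTO_TCP, TCP_DEFER_ACCEPT, &defer, sizeof(defer)) < 0)
        perror("TCP_DEFER_ACCEPT");
    if (setsockopt(fd, IPPROTO_TCP, TCP_WINDOW_CLAMP, &clamp, sizeof(clamp)) < 0)
        perror("TCP_WINDOW_CLAMP");
    return 0;
}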
+ */ if (order > 4) { sysctl_local_port_range[0] = 32768; sysctl_local_port_range[1] = 61000; + sysctl_tcp_max_tw_buckets = 180000; + sysctl_tcp_max_orphans = 4096<<(order-4); + sysctl_max_syn_backlog = 1024; } else if (order < 3) { sysctl_local_port_range[0] = 1024*(3-order); + sysctl_tcp_max_tw_buckets >>= (3-order); + sysctl_tcp_max_orphans >>= (3-order); + sysctl_max_syn_backlog = 128; } tcp_port_rover = sysctl_local_port_range[0] - 1; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3b4ae64a2..d61a5df02 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.177 2000/01/09 02:19:39 davem Exp $ + * Version: $Id: tcp_input.c,v 1.183 2000/01/24 18:40:33 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -70,9 +70,6 @@ #define SYNC_INIT 1 #endif -extern int sysctl_tcp_fin_timeout; -extern int sysctl_tcp_keepalive_time; - /* These are on by default so the code paths get tested. * For the final 2.2 this may be undone at our discretion. -DaveM */ @@ -83,10 +80,108 @@ int sysctl_tcp_sack = 1; int sysctl_tcp_syncookies = SYNC_INIT; int sysctl_tcp_stdurg; int sysctl_tcp_rfc1337; -int sysctl_tcp_tw_recycle; +int sysctl_tcp_tw_recycle = 1; +int sysctl_tcp_abort_on_overflow = 0; +int sysctl_tcp_max_orphans = NR_FILE; +int sysctl_tcp_max_tw_buckets = NR_FILE*2; static int prune_queue(struct sock *sk); +/* + * Adapt the MSS value used to make delayed ack decision to the + * real world. + * + * The constant 536 hasn't any good meaning. In IPv4 world + * MTU may be smaller, though it contradicts to RFC1122, which + * states that MSS must be at least 536. + * We use the constant to do not ACK each second + * packet in a stream of tiny size packets. + * It means that super-low mtu links will be aggressively delacked. + * Seems, it is even good. If they have so low mtu, they are weirdly + * slow. + * + * AK: BTW it may be useful to add an option to lock the rcv_mss. + * this way the beowulf people wouldn't need ugly patches to get the + * ack frequencies they want and it would be an elegant way to tune delack. + */ +static __inline__ void tcp_measure_rcv_mss(struct tcp_opt *tp, struct sk_buff *skb) +{ + unsigned int len, lss; + + lss = tp->ack.last_seg_size; + tp->ack.last_seg_size = 0; + + /* skb->len may jitter because of SACKs, even if peer + * sends good full-sized frames. + */ + len = skb->len; + if (len >= tp->ack.rcv_mss) { + tp->ack.rcv_mss = len; + } else { + /* Otherwise, we make more careful check taking into account, + * that SACKs block is variable. + * + * "len" is invariant segment length, including TCP header. + */ + len = skb->tail - skb->h.raw; + if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr)) { + /* Subtract also invariant (if peer is RFC compliant), + * tcp header plus fixed timestamp option length. + * Resulting "len" is MSS free of SACK jitter. + */ + len -= tp->tcp_header_len; + if (len == lss) + tp->ack.rcv_mss = len; + tp->ack.last_seg_size = len; + } + +#if 0 + /* Tiny-grams with PSH set artifically deflate our + * ato measurement. + * + * Mmm... I copied this test from tcp_remember_ack(), but + * I did not understand this. Is it to speedup nagling sender? + * It does not because classic (non-Minshall) sender nagles + * guided by not-acked frames not depending on size. + * And it does not help NODELAY sender, because latency + * is too high in any case. 
The only result is timer trashing + * and redundant ACKs. Grr... Seems, I missed something. --ANK + * + * Let me to comment out this yet... TCP should work + * perfectly without this. --ANK + */ + if (len < (tp->ack.rcv_mss >> 1) && skb->h.th->psh) + tp->ack.ato = TCP_ATO_MIN; +#endif + } +} + + +static __inline__ void tcp_enter_quickack_mode(struct tcp_opt *tp) +{ + unsigned quickacks = tcp_receive_window(tp)/(2*tp->ack.rcv_mss); + + tp->ack.quick = max(min(quickacks, 127), 1); + + if (!tp->tstamp_ok && tp->ack.quick>2) { + /* Quick ACKs are _dangerous_, if RTTM is not used. + * See comment in tcp_init_metrics(). We still help + * them to overcome the most difficult, initial + * phase of slow start. + */ + tp->ack.quick = 2; + } +} + +/* Send ACKs quickly, if "quick" count is not ehausted + * and the session is not interactive. + */ + +static __inline__ int tcp_in_quickack_mode(struct tcp_opt *tp) +{ + return (tp->ack.quick && !tp->ack.pingpong); +} + /* There is something which you must keep in mind when you analyze the * behavior of the tp->ato delayed ack timeout interval. When a * connection starts up, we want to ack as quickly as possible. The @@ -97,53 +192,52 @@ static int prune_queue(struct sock *sk); * each ACK we send, he increments snd_cwnd and transmits more of his * queue. -DaveM */ -static void tcp_delack_estimator(struct tcp_opt *tp) +static void tcp_event_data_recv(struct tcp_opt *tp, struct sk_buff *skb) { - if(tp->ato == 0) { - tp->lrcvtime = tcp_time_stamp; + u32 now; - /* Help sender leave slow start quickly, - * and also makes sure we do not take this - * branch ever again for this connection. + tcp_measure_rcv_mss(tp, skb); + + tp->ack.pending = 1; + + now = tcp_time_stamp; + + if (!tp->ack.ato) { + /* The _first_ data packet received, initialize + * delayed ACK engine. */ - tp->ato = 1; + + /* Help sender leave slow start quickly. */ tcp_enter_quickack_mode(tp); + + /* Pingpong is off, session is not interactive by default */ + tp->ack.pingpong = 0; + + /* ATO is minimal */ + tp->ack.ato = TCP_ATO_MIN; } else { - int m = tcp_time_stamp - tp->lrcvtime; - - tp->lrcvtime = tcp_time_stamp; - if(m <= 0) - m = 1; - if(m > tp->rto) - tp->ato = tp->rto; - else { - /* This funny shift makes sure we - * clear the "quick ack mode" bit. + int m = now - tp->ack.lrcvtime; + + if (m > TCP_ATO_MAX/2) { + /* Do not touch ATO, if interval is out of bounds. + * It will be deflated by delack timer, if our peer + * really sends too rarely. */ - tp->ato = ((tp->ato << 1) >> 2) + m; + if (m > tp->rto) { + /* Too long gap. Apparently sender falled to + * restart window, so that we send ACKs quickly. + */ + tcp_enter_quickack_mode(tp); + } + } else { + if (m <= 0) + m = TCP_ATO_MIN/2; + tp->ack.ato = (tp->ack.ato >> 1) + m; } } + tp->ack.lrcvtime = now; } -/* - * Remember to send an ACK later. - */ -static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th, - struct sk_buff *skb) -{ - tp->delayed_acks++; - - /* Tiny-grams with PSH set artifically deflate our - * ato measurement, but with a lower bound. - */ - if(th->psh && (skb->len < (tp->rcv_mss >> 1))) { - /* Preserve the quickack state. */ - if((tp->ato & 0x7fffffff) > HZ/50) - tp->ato = ((tp->ato & 0x80000000) | - (HZ/50)); - } -} - /* Called to compute a smoothed rtt estimate. 
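The delayed-ACK estimator added in tcp_event_data_recv() above can be exercised in isolation. The sketch below keeps only the shape of the update (start at a small ato, blend in the observed inter-arrival gap, fall back to quick ACKs after an idle period); the TCP_ATO_* values are placeholders, not the kernel's.

#include <stdio.h>

#define HZ           100
#define TCP_ATO_MIN  (HZ / 25)      /* placeholder */
#define TCP_ATO_MAX  (HZ / 2)       /* placeholder */

struct delack {
    unsigned int ato;               /* smoothed inter-arrival estimate, in ticks */
    unsigned int lrcvtime;          /* time of the previous data segment         */
    int quick;                      /* remaining "quick ACK" credits             */
};

static void data_event(struct delack *d, unsigned int now, unsigned int rto)
{
    if (!d->ato) {
        d->ato = TCP_ATO_MIN;       /* first data segment: start small, ACK fast */
        d->quick = 2;
    } else {
        unsigned int m = now - d->lrcvtime;

        if (m > TCP_ATO_MAX / 2) {
            /* Gap is out of bounds: leave ato alone; if the sender apparently
             * restarted after an idle period, ACK quickly for a while. */
            if (m > rto)
                d->quick = 2;
        } else {
            if (m == 0)
                m = TCP_ATO_MIN / 2;
            d->ato = (d->ato >> 1) + m;   /* EWMA-style blend */
        }
    }
    d->lrcvtime = now;
}

int main(void)
{
    struct delack d = { 0, 0, 0 };
    unsigned int t = 0;

    for (int i = 0; i < 5; i++) {
        t += 3;                     /* segments arriving every 3 ticks */
        data_event(&d, t, HZ);
        printf("ato=%u quick=%d\n", d.ato, d.quick);
    }
    return 0;
}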
The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge @@ -209,10 +303,10 @@ static __inline__ void tcp_set_rto(struct tcp_opt *tp) */ static __inline__ void tcp_bound_rto(struct tcp_opt *tp) { - if (tp->rto > 120*HZ) - tp->rto = 120*HZ; - if (tp->rto < HZ/5) - tp->rto = HZ/5; + if (tp->rto < TCP_RTO_MIN) + tp->rto = TCP_RTO_MIN; + else if (tp->rto > TCP_RTO_MAX) + tp->rto = TCP_RTO_MAX; } /* Save metrics learned by this TCP session. @@ -224,7 +318,9 @@ static void tcp_update_metrics(struct sock *sk) struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct dst_entry *dst = __sk_dst_get(sk); - if (dst) { + dst_confirm(dst); + + if (dst && (dst->flags&DST_HOST)) { int m; if (tp->backoff || !tp->srtt) { @@ -237,8 +333,6 @@ static void tcp_update_metrics(struct sock *sk) return; } - dst_confirm(dst); - m = dst->rtt - tp->srtt; /* If newly calculated rtt larger than stored one, @@ -308,10 +402,18 @@ static void tcp_init_metrics(struct sock *sk) dst_confirm(dst); + if (dst->mxlock&(1<<RTAX_CWND)) + tp->snd_cwnd_clamp = dst->cwnd; + if (dst->ssthresh) { + tp->snd_ssthresh = dst->ssthresh; + if (tp->snd_ssthresh > tp->snd_cwnd_clamp) + tp->snd_ssthresh = tp->snd_cwnd_clamp; + } + if (dst->rtt == 0) goto reset; - if (!tp->srtt || !tp->saw_tstamp) + if (!tp->srtt && dst->rtt < (TCP_TIMEOUT_INIT<<3)) goto reset; /* Initial rtt is determined from SYN,SYN-ACK. @@ -334,14 +436,9 @@ static void tcp_init_metrics(struct sock *sk) tp->mdev = dst->rttvar; tcp_set_rto(tp); tcp_bound_rto(tp); - - if (dst->mxlock&(1<<RTAX_CWND)) - tp->snd_cwnd_clamp = dst->cwnd; - if (dst->ssthresh) { - tp->snd_ssthresh = dst->ssthresh; - if (tp->snd_ssthresh > tp->snd_cwnd_clamp) - tp->snd_ssthresh = tp->snd_cwnd_clamp; - } + if (tp->rto < TCP_TIMEOUT_INIT && !tp->saw_tstamp) + goto reset; + tp->snd_cwnd = tcp_init_cwnd(tp); return; @@ -357,9 +454,6 @@ reset: } } -#define PAWS_24DAYS (60 * 60 * 24 * 24) - - /* WARNING: this must not be called if tp->saw_tstamp was false. */ extern __inline__ void tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq) @@ -374,7 +468,7 @@ tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq) */ if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0 || - xtime.tv_sec >= tp->ts_recent_stamp + PAWS_24DAYS) { + xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS) { tp->ts_recent = tp->rcv_tsval; tp->ts_recent_stamp = xtime.tv_sec; } @@ -384,7 +478,7 @@ tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq) extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct sk_buff *skb) { return ((s32)(tp->rcv_tsval - tp->ts_recent) < 0 && - xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS + xtime.tv_sec < tp->ts_recent_stamp + TCP_PAWS_24DAYS /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM @@ -411,8 +505,13 @@ extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct sk_buff *skb) static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) { u32 end_window = tp->rcv_wup + tp->rcv_wnd; +#ifdef TCP_FORMAL_WINDOW + u32 rcv_wnd = tcp_receive_window(tp); +#else + u32 rcv_wnd = tp->rcv_wnd; +#endif - if (tp->rcv_wnd && + if (rcv_wnd && after(end_seq, tp->rcv_nxt) && before(seq, end_window)) return 1; @@ -424,8 +523,13 @@ static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) /* This functions checks to see if the tcp header is actually acceptable. 
*/ extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) { +#ifdef TCP_FORMAL_WINDOW + u32 rcv_wnd = tcp_receive_window(tp); +#else + u32 rcv_wnd = tp->rcv_wnd; +#endif if (seq == tp->rcv_nxt) - return (tp->rcv_wnd || (end_seq == seq)); + return (rcv_wnd || (end_seq == seq)); return __tcp_sequence(tp, seq, end_seq); } @@ -433,8 +537,6 @@ extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) /* When we get a reset we do this. */ static void tcp_reset(struct sock *sk) { - sk->zapped = 1; - /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->state) { case TCP_SYN_SENT: @@ -447,9 +549,8 @@ static void tcp_reset(struct sock *sk) return; default: sk->err = ECONNRESET; - }; - tcp_set_state(sk, TCP_CLOSE); - tcp_clear_xmit_timers(sk); + } + tcp_done(sk); } @@ -658,17 +759,18 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) if (tp->high_seq == 0 || after(ack, tp->high_seq)) { tp->dup_acks++; if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) { - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); - if (tp->snd_ssthresh > tp->snd_cwnd_clamp) - tp->snd_ssthresh = tp->snd_cwnd_clamp; - tp->snd_cwnd = (tp->snd_ssthresh + 3); - tp->high_seq = tp->snd_nxt; + __tcp_enter_cong_avoid(tp); + /* ... and account for 3 ACKs, which are + * already received to this time. + */ + tp->snd_cwnd += 3; + if(!tp->fackets_out) tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); else tcp_fack_retransmit(sk); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); } } else if (++tp->dup_acks > 3) { /* 2. Each time another duplicate ACK arrives, increment @@ -733,7 +835,7 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) if (ack != tp->snd_una && before(ack, tp->high_seq)) { tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); } } else { /* FACK style, fill any remaining holes in @@ -752,7 +854,8 @@ static __inline__ void tcp_cong_avoid(struct tcp_opt *tp) { if (tp->snd_cwnd <= tp->snd_ssthresh) { /* In "safe" area, increase. */ - tp->snd_cwnd++; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; } else { /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd @@ -826,23 +929,23 @@ static void tcp_ack_probe(struct sock *sk, __u32 ack) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - /* Our probe was answered. */ - tp->probes_out = 0; - /* Was it a usable window open? */ - /* should always be non-null */ - if (tp->send_head != NULL && - !before (ack + tp->snd_wnd, TCP_SKB_CB(tp->send_head)->end_seq)) { - tp->backoff = 0; - tp->pending = 0; - tcp_clear_xmit_timer(sk, TIME_PROBE0); - } else { - tcp_reset_xmit_timer(sk, TIME_PROBE0, - min(tp->rto << tp->backoff, 120*HZ)); + if (tp->send_head != NULL) { + if (!after(TCP_SKB_CB(tp->send_head)->end_seq, ack + tp->snd_wnd)) { + tp->backoff = 0; + tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0); + /* If packets_out==0, socket must be waked up by + * subsequent tcp_data_snd_check(). This function is + * not for random using! + */ + } else if (!tp->packets_out) { + tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, + min(tp->rto << tp->backoff, TCP_RTO_MAX)); + } } } - + /* Should we open up the congestion window? 
*/ static __inline__ int should_advance_cwnd(struct tcp_opt *tp, int flag) { @@ -914,18 +1017,30 @@ static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) { struct sk_buff *skb = skb_peek(&sk->write_queue); +#ifdef TCP_DEBUG + /* It occured in 2.3, because of racy timers. Namely, + * retransmit timer did not check packets_out and retransmitted + * send_head sometimes and, hence, messed all the write_queue. + * Now it is impossible, I bet. --ANK + */ + if (skb == NULL) { + printk("Sucks! packets_out=%d, sk=%p, %d\n", tp->packets_out, sk, sk->state); + return; + } +#endif + /* Some data was ACK'd, if still retransmitting (due to a * timeout), resend more of the retransmit queue. The * congestion window is handled properly by that code. */ if (tp->retransmits) { tcp_xmit_retransmit_queue(sk); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); } else { __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when); if ((__s32)when < 0) when = 1; - tcp_reset_xmit_timer(sk, TIME_RETRANS, when); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, when); } } @@ -938,13 +1053,8 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 seq = 0; u32 seq_rtt = 0; - if(sk->zapped) - return(1); /* Dead, can't ack any more so why bother */ - - if (tp->pending == TIME_KEEPOPEN) - tp->probes_out = 0; - - tp->rcv_tstamp = tcp_time_stamp; + if(sk->state == TCP_CLOSE) + return 1; /* Dead, can't ack any more so why bother */ /* If the ack is newer than sent or older than previous acks * then we can probably ignore it. @@ -953,10 +1063,8 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, goto uninteresting_ack; /* If there is data set flag 1 */ - if (len != th->doff*4) { + if (len != th->doff*4) flag |= FLAG_DATA; - tcp_delack_estimator(tp); - } /* Update our send window. */ @@ -970,31 +1078,53 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) { flag |= FLAG_WIN_UPDATE; - tp->snd_wnd = nwin; + if (tp->snd_wnd != nwin) { + tp->snd_wnd = nwin; + + /* Note, it is the only place, where + * fast path is recovered for sending TCP. + */ + if (skb_queue_len(&tp->out_of_order_queue) == 0 && +#ifdef TCP_FORMAL_WINDOW + tcp_receive_window(tp) && +#endif + !tp->urg_data) + tcp_fast_path_on(tp); + + if (nwin > tp->max_window) { + tp->max_window = nwin; + tcp_sync_mss(sk, tp->pmtu_cookie); + } + } tp->snd_wl1 = ack_seq; tp->snd_wl2 = ack; - - if (nwin > tp->max_window) - tp->max_window = nwin; } } + /* BEWARE! From this place and until return from this function + * snd_nxt and snd_wnd are out of sync. All the routines, called + * from here must get "ack" as argument or they should not depend + * on right edge of window. It is _UGLY_. It cries to be fixed. --ANK + */ + /* We passed data and got it acked, remove any soft error * log. Something worked... */ sk->err_soft = 0; + tp->probes_out = 0; + tp->rcv_tstamp = tcp_time_stamp; + + /* See if we can take anything off of the retransmit queue. */ + flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. */ - if (tp->pending == TIME_PROBE0) + if (tcp_timer_is_set(sk, TCP_TIME_PROBE0)) tcp_ack_probe(sk, ack); - /* See if we can take anything off of the retransmit queue. 
*/ - flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt); - /* We must do this here, before code below clears out important * state contained in tp->fackets_out and tp->retransmits. -DaveM */ @@ -1036,7 +1166,7 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, if (flag & FLAG_DATA_ACKED) tcp_ack_packets_out(sk, tp); } else { - tcp_clear_xmit_timer(sk, TIME_RETRANS); + tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS); } flag &= (FLAG_DATA | FLAG_WIN_UPDATE); @@ -1074,9 +1204,42 @@ uninteresting_ack: return 0; } +int tcp_paws_check(struct tcp_opt *tp, int rst) +{ + if ((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) + return 0; + if (xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS) + return 0; + + /* RST segments are not recommended to carry timestamp, + and, if they do, it is recommended to ignore PAWS because + "their cleanup function should take precedence over timestamps." + Certainly, it is mistake. It is necessary to understand the reasons + of this constraint to relax it: if peer reboots, clock may go + out-of-sync and half-open connections will not be reset. + Actually, the problem would be not existing if all + the implementations followed draft about maintaining clock + via reboots. Linux-2.2 DOES NOT! + + However, we can relax time bounds for RST segments to MSL. + */ + if (rst && xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_MSL) + return 0; + return 1; +} + +static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) +{ + if (seq == s_win) + return 1; + if (after(end_seq, s_win) && before(seq, e_win)) + return 1; + return (seq == e_win && seq == end_seq); +} + /* New-style handling of TIME_WAIT sockets. */ -/* Must be called only from BH context. */ +/* Must be called with locally disabled BHs. */ void tcp_timewait_kill(struct tcp_tw_bucket *tw) { struct tcp_ehash_bucket *ehead; @@ -1121,13 +1284,6 @@ void tcp_timewait_kill(struct tcp_tw_bucket *tw) tcp_tw_put(tw); } -/* We come here as a special case from the AF specific TCP input processing, - * and the SKB has no owner. Essentially handling this is very simple, - * we just keep silently eating rx'd packets until none show up for the - * entire timeout period. The only special cases are for BSD TIME_WAIT - * reconnects and SYN/RST bits being set in the TCP header. - */ - /* * * Main purpose of TIME-WAIT state is to close connection gracefully, * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN @@ -1149,6 +1305,12 @@ void tcp_timewait_kill(struct tcp_tw_bucket *tw) * The algorithm below is based on FORMAL INTERPRETATION of RFCs. * When you compare it to RFCs, please, read section SEGMENT ARRIVES * from the very beginning. + * + * NOTE. With recycling (and later with fin-wait-2) TW bucket + * is _not_ stateless. It means, that strictly speaking we must + * spinlock it. I do not want! Well, probability of misbehaviour + * is ridiculously low and, seems, we could use some mb() tricks + * to avoid misread sequence numbers, states etc. 
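tcp_paws_check() above reduces to a wraparound-safe timestamp comparison plus two staleness escapes; the standalone version below reproduces that shape with illustrative constants.

#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define PAWS_24DAYS (60L * 60 * 24 * 24)    /* ts_recent considered stale after this */
#define PAWS_MSL    60L                      /* relaxed bound used for RST segments   */

/* Returns 1 when the segment must be rejected by PAWS. */
static int paws_reject(uint32_t rcv_tsval, uint32_t ts_recent,
                       time_t ts_recent_stamp, int rst, time_t now)
{
    if ((int32_t)(rcv_tsval - ts_recent) >= 0)
        return 0;                            /* timestamp did not go backwards     */
    if (now >= ts_recent_stamp + PAWS_24DAYS)
        return 0;                            /* remembered stamp too old to trust  */
    if (rst && now >= ts_recent_stamp + PAWS_MSL)
        return 0;                            /* let old RSTs through after one MSL */
    return 1;
}

int main(void)
{
    time_t now = time(NULL);

    /* Timestamp moved backwards and ts_recent is fresh: reject. */
    printf("%d\n", paws_reject(100, 200, now, 0, now));
    /* Wrap-around: 0x00000001 counts as "after" 0xffffffff thanks to the s32 cast. */
    printf("%d\n", paws_reject(0x00000001u, 0xffffffffu, now, 0, now));
    return 0;
}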
--ANK */ enum tcp_tw_status tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, @@ -1157,7 +1319,75 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, struct tcp_opt tp; int paws_reject = 0; - /* RFC 1122: + tp.saw_tstamp = 0; + if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) { + tcp_parse_options(NULL, th, &tp, 0); + + if (tp.saw_tstamp) { + tp.ts_recent = tw->ts_recent; + tp.ts_recent_stamp = tw->ts_recent_stamp; + paws_reject = tcp_paws_check(&tp, th->rst); + } + } + + if (tw->substate == TCP_FIN_WAIT2) { + /* Just repeat all the checks of tcp_rcv_state_process() */ + + /* Out of window, send ACK */ + if (paws_reject || + !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, + tw->rcv_nxt, tw->rcv_nxt + tw->rcv_wnd)) + return TCP_TW_ACK; + + if (th->rst) + goto kill; + + if (th->syn && TCP_SKB_CB(skb)->seq != tw->syn_seq) + goto kill_with_rst; + + /* Dup ACK? */ + if (!after(TCP_SKB_CB(skb)->end_seq, tw->rcv_nxt)) { + tcp_tw_put(tw); + return TCP_TW_SUCCESS; + } + + /* New data or FIN. If new data arrive after half-duplex close, + * reset. + */ + if (!th->fin || TCP_SKB_CB(skb)->end_seq != tw->rcv_nxt+1) { +kill_with_rst: + tcp_tw_deschedule(tw); + tcp_timewait_kill(tw); + tcp_tw_put(tw); + return TCP_TW_RST; + } + + /* FIN arrived, enter true time-wait state. */ + tw->substate = TCP_TIME_WAIT; + tw->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if (tp.saw_tstamp) { + tw->ts_recent_stamp = xtime.tv_sec; + tw->ts_recent = tp.rcv_tsval; + } + + /* I am shamed, but failed to make it more elegant. + * Yes, it is direct reference to IP, which is impossible + * to generalize to IPv6. Taking into account that IPv6 + * do not undertsnad recycling in any case, it not + * a big problem in practice. --ANK */ + if (tw->family == AF_INET && + sysctl_tcp_tw_recycle && tw->ts_recent_stamp && + tcp_v4_tw_remember_stamp(tw)) + tcp_tw_schedule(tw, tw->timeout); + else + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); + return TCP_TW_ACK; + } + + /* + * Now real TIME-WAIT state. + * + * RFC 1122: * "When a connection is [...] on TIME-WAIT state [...] * [a TCP] MAY accept a new SYN from the remote TCP to * reopen the connection directly, if it: @@ -1171,47 +1401,31 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, * to be an old duplicate". */ - tp.saw_tstamp = 0; - if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) { - tcp_parse_options(NULL, th, &tp, 0); - - paws_reject = tp.saw_tstamp && - ((s32)(tp.rcv_tsval - tw->ts_recent) < 0 && - xtime.tv_sec < tw->ts_recent_stamp + PAWS_24DAYS); - } - if (!paws_reject && (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq && TCP_SKB_CB(skb)->seq == tw->rcv_nxt)) { /* In window segment, it may be only reset or bare ack. */ if (th->rst) { -#ifdef CONFIG_TCP_TW_RECYCLE - /* When recycling, always follow rfc1337, - * but mark bucket as ready to recycling immediately. - */ - if (sysctl_tcp_tw_recycle) { - /* May kill it now. */ - tw->rto = 0; - tw->ttd = jiffies; - } else -#endif /* This is TIME_WAIT assasination, in two flavors. * Oh well... nobody has a sufficient solution to this * protocol bug yet. 
*/ - if(sysctl_tcp_rfc1337 == 0) { + if (sysctl_tcp_rfc1337 == 0) { +kill: tcp_tw_deschedule(tw); tcp_timewait_kill(tw); + tcp_tw_put(tw); + return TCP_TW_SUCCESS; } - } else { - tcp_tw_reschedule(tw); } + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); if (tp.saw_tstamp) { tw->ts_recent = tp.rcv_tsval; tw->ts_recent_stamp = xtime.tv_sec; } + tcp_tw_put(tw); return TCP_TW_SUCCESS; } @@ -1235,7 +1449,7 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, if (th->syn && !th->rst && !th->ack && !paws_reject && (after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt) || - (tp.saw_tstamp && tw->ts_recent != tp.rcv_tsval))) { + (tp.saw_tstamp && (s32)(tw->ts_recent - tp.rcv_tsval) < 0))) { u32 isn = tw->snd_nxt + 2; if (isn == 0) isn++; @@ -1243,20 +1457,18 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, return TCP_TW_SYN; } + if (paws_reject) + NET_INC_STATS_BH(PAWSEstabRejected); + if(!th->rst) { /* In this case we must reset the TIMEWAIT timer. - - If it is ACKless SYN it may be both old duplicate - and new good SYN with random sequence number <rcv_nxt. - Do not reschedule in the last case. + * + * If it is ACKless SYN it may be both old duplicate + * and new good SYN with random sequence number <rcv_nxt. + * Do not reschedule in the last case. */ - if (paws_reject || th->ack) { - tcp_tw_reschedule(tw); -#ifdef CONFIG_TCP_TW_RECYCLE - tw->rto = min(120*HZ, tw->rto<<1); - tw->ttd = jiffies + tw->rto; -#endif - } + if (paws_reject || th->ack) + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); /* Send ACK. Note, we do not put the bucket, * it will be released by caller. @@ -1267,8 +1479,8 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, return TCP_TW_SUCCESS; } -/* Enter the time wait state. This is always called from BH - * context. Essentially we whip up a timewait bucket, copy the +/* Enter the time wait state. This is called with locally disabled BH. + * Essentially we whip up a timewait bucket, copy the * relevant info into it from the SK, and mess with hash chains * and list linkage. */ @@ -1286,6 +1498,7 @@ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) sk->next->pprev = sk->pprev; *sk->pprev = sk->next; sk->pprev = NULL; + sock_prot_dec_use(sk->prot); } /* Step 2: Hash TW into TIMEWAIT half of established hash table. */ @@ -1312,41 +1525,49 @@ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) tw->tb->owners = (struct sock*)tw; tw->bind_pprev = &tw->tb->owners; spin_unlock(&bhead->lock); - - /* Step 4: Un-charge protocol socket in-use count. */ - sock_prot_dec_use(sk->prot); } /* - * Move a socket to time-wait. + * Move a socket to time-wait or dead fin-wait-2 state. */ -void tcp_time_wait(struct sock *sk) +void tcp_time_wait(struct sock *sk, int state, int timeo) { - struct tcp_tw_bucket *tw; + struct tcp_tw_bucket *tw = NULL; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int recycle_ok = 0; + + if (sysctl_tcp_tw_recycle && tp->ts_recent_stamp) + recycle_ok = tp->af_specific->remember_stamp(sk); + + if (tcp_tw_count < sysctl_tcp_max_tw_buckets) + tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); - tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); if(tw != NULL) { + int rto = (tp->rto<<2) - (tp->rto>>1); + /* Give us an identity. 
*/ tw->daddr = sk->daddr; tw->rcv_saddr = sk->rcv_saddr; tw->bound_dev_if= sk->bound_dev_if; tw->num = sk->num; tw->state = TCP_TIME_WAIT; + tw->substate = state; tw->sport = sk->sport; tw->dport = sk->dport; tw->family = sk->family; tw->reuse = sk->reuse; - tw->hashent = sk->hashent; - tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt; - tw->snd_nxt = sk->tp_pinfo.af_tcp.snd_nxt; - tw->ts_recent = sk->tp_pinfo.af_tcp.ts_recent; - tw->ts_recent_stamp= sk->tp_pinfo.af_tcp.ts_recent_stamp; -#ifdef CONFIG_TCP_TW_RECYCLE - tw->rto = sk->tp_pinfo.af_tcp.rto; - tw->ttd = jiffies + 2*tw->rto; -#endif + tw->rcv_wscale = tp->rcv_wscale; atomic_set(&tw->refcnt, 0); + tw->hashent = sk->hashent; + tw->rcv_nxt = tp->rcv_nxt; + tw->snd_nxt = tp->snd_nxt; + tw->rcv_wnd = tcp_receive_window(tp); + tw->syn_seq = tp->syn_seq; + tw->ts_recent = tp->ts_recent; + tw->ts_recent_stamp= tp->ts_recent_stamp; + tw->pprev_death = NULL; + #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if(tw->family == PF_INET6) { memcpy(&tw->v6_daddr, @@ -1361,22 +1582,28 @@ void tcp_time_wait(struct sock *sk) __tcp_tw_hashdance(sk, tw); /* Get the TIME_WAIT timeout firing. */ - tcp_tw_schedule(tw); + if (timeo < rto) + timeo = rto; - /* CLOSE the SK. */ - if(sk->state == TCP_ESTABLISHED) - tcp_statistics[smp_processor_id()*2].TcpCurrEstab--; - sk->state = TCP_CLOSE; + if (recycle_ok) { + tw->timeout = rto; + } else { + tw->timeout = TCP_TIMEWAIT_LEN; + if (state == TCP_TIME_WAIT) + timeo = TCP_TIMEWAIT_LEN; + } + + tcp_tw_schedule(tw, timeo); } else { - /* Sorry, we're out of memory, just CLOSE this + /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than * non-graceful socket closings. */ - tcp_set_state(sk, TCP_CLOSE); + if (net_ratelimit()) + printk(KERN_INFO "TCP: time wait bucket table overflow\n"); } tcp_update_metrics(sk); - tcp_clear_xmit_timers(sk); tcp_done(sk); } @@ -1397,10 +1624,13 @@ void tcp_time_wait(struct sock *sk) static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) { - sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + tp->fin_seq = TCP_SKB_CB(skb)->end_seq; tcp_send_ack(sk); + sk->shutdown |= RCV_SHUTDOWN; + switch(sk->state) { case TCP_SYN_RECV: case TCP_ESTABLISHED: @@ -1427,7 +1657,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) break; case TCP_FIN_WAIT2: /* Received a FIN -- send ACK and enter TIME_WAIT. */ - tcp_time_wait(sk); + tcp_time_wait(sk, TCP_TIME_WAIT, 0); break; default: /* Only TCP_LISTEN and TCP_CLOSE are left, in these @@ -1435,9 +1665,17 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) */ printk("tcp_fin: Impossible, sk->state=%d\n", sk->state); break; - } + }; + + /* It _is_ possible, that we have something out-of-order _after_ FIN. + * Probably, we should reset in this case. For now drop them. + */ + __skb_queue_purge(&tp->out_of_order_queue); + if (tp->sack_ok) + tp->num_sacks = 0; + if (!sk->dead) { - wake_up_interruptible(sk->sleep); + sk->state_change(sk); sock_wake_async(sk->socket, 1, POLL_HUP); } } @@ -1622,6 +1860,7 @@ static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sp->end_seq = TCP_SKB_CB(new_skb)->end_seq; } + /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. 
*/ @@ -1658,6 +1897,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct sk_buff *skb1; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int eaten = 0; /* Queue data for delivery to the user. * Packets in sequence go to the receive queue. @@ -1665,33 +1905,68 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { /* Ok. In sequence. */ - queue_and_out: + if (tp->ucopy.task == current && + tp->copied_seq == tp->rcv_nxt && + tp->ucopy.len && + sk->lock.users && + !tp->urg_data) { + int chunk = min(skb->len, tp->ucopy.len); + + local_bh_enable(); + if (memcpy_toiovec(tp->ucopy.iov, skb->data, chunk)) { + sk->err = EFAULT; + sk->error_report(sk); + } + local_bh_disable(); + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; + eaten = (chunk == skb->len && !skb->h.th->fin); + } + + if (!eaten) { +queue_and_out: + skb_set_owner_r(skb, sk); + __skb_queue_tail(&sk->receive_queue, skb); + } dst_confirm(sk->dst_cache); - __skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if(skb->h.th->fin) { + if(skb->len) + tcp_event_data_recv(tp, skb); + if(skb->h.th->fin) tcp_fin(skb, sk, skb->h.th); - } else { - tcp_remember_ack(tp, skb->h.th, skb); - } + /* This may have eaten into a SACK block. */ if(tp->sack_ok && tp->num_sacks) tcp_sack_remove_skb(tp, skb); tcp_ofo_queue(sk); /* Turn on fast path. */ - if (skb_queue_len(&tp->out_of_order_queue) == 0) - tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) | - ntohl(TCP_FLAG_ACK) | - tp->snd_wnd); + if (skb_queue_len(&tp->out_of_order_queue) == 0 && +#ifdef TCP_FORMAL_WINDOW + tcp_receive_window(tp) && +#endif + !tp->urg_data) + tcp_fast_path_on(tp); + + if (eaten) + kfree_skb(skb); + + if (!sk->dead) { + wake_up_interruptible(sk->sleep); + sock_wake_async(sk->socket,1, POLL_IN); + } return; } - + /* An old packet, either a retransmit or some packet got lost. */ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { - /* A retransmit, 2nd most common case. Force an imediate ack. */ - SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq); + /* A retransmit, 2nd most common case. Force an imediate ack. + * + * It is impossible, seq is checked by top level. + */ + NETDEBUG(printk("retransmit in tcp_data_queue: seq %X\n", TCP_SKB_CB(skb)->seq)); tcp_enter_quickack_mode(tp); + tp->ack.pending = 1; kfree_skb(skb); return; } @@ -1706,15 +1981,17 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) } /* Ok. This is an out_of_order segment, force an ack. */ - tp->delayed_acks++; - tcp_enter_quickack_mode(tp); + tp->ack.pending = 1; /* Disable header prediction. */ tp->pred_flags = 0; + SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + skb_set_owner_r(skb, sk); + if (skb_peek(&tp->out_of_order_queue) == NULL) { /* Initial out of order segment, build 1 SACK. */ if(tp->sack_ok) { @@ -1758,6 +2035,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) } } } + return; } @@ -1767,7 +2045,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) * room, then we will just have to discard the packet. 
*/ -static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) +static void tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) { struct tcphdr *th; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); @@ -1777,11 +2055,11 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) skb_trim(skb, len - (th->doff*4)); if (skb->len == 0 && !th->fin) - return(0); + goto drop; /* * If our receive queue has grown past its limits shrink it. - * Make sure to do this before moving snd_nxt, otherwise + * Make sure to do this before moving rcv_nxt, otherwise * data might be acked for that we don't have enough room. */ if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) { @@ -1789,7 +2067,7 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) /* Still not enough room. That can happen when * skb->true_size differs significantly from skb->len. */ - return 0; + goto drop; } } @@ -1799,29 +2077,20 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n"); tp->rcv_nxt = tp->copied_seq; } + return; - /* Above, tcp_data_queue() increments delayed_acks appropriately. - * Now tell the user we may have some data. - */ - if (!sk->dead) { - wake_up_interruptible(sk->sleep); - sock_wake_async(sk->socket,1, POLL_IN); - } - return(1); +drop: + kfree_skb(skb); } static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) && - tcp_packets_in_flight(tp) < tp->snd_cwnd) { - /* Put more data onto the wire. */ - tcp_write_xmit(sk); - } else if (tp->packets_out == 0 && !tp->pending) { - /* Start probing the receivers window. */ - tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); - } + if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || + tcp_packets_in_flight(tp) >= tp->snd_cwnd || + tcp_write_xmit(sk)) + tcp_check_probe_timer(sk, tp); } static __inline__ void tcp_data_snd_check(struct sock *sk) @@ -1832,57 +2101,6 @@ static __inline__ void tcp_data_snd_check(struct sock *sk) __tcp_data_snd_check(sk, skb); } -/* - * Adapt the MSS value used to make delayed ack decision to the - * real world. - * - * The constant 536 hasn't any good meaning. In IPv4 world - * MTU may be smaller, though it contradicts to RFC1122, which - * states that MSS must be at least 536. - * We use the constant to do not ACK each second - * packet in a stream of tiny size packets. - * It means that super-low mtu links will be aggressively delacked. - * Seems, it is even good. If they have so low mtu, they are weirdly - * slow. - * - * AK: BTW it may be useful to add an option to lock the rcv_mss. - * this way the beowulf people wouldn't need ugly patches to get the - * ack frequencies they want and it would be an elegant way to tune delack. - */ -static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - unsigned int len, lss; - - lss = tp->last_seg_size; - tp->last_seg_size = 0; - - /* skb->len may jitter because of SACKs, even if peer - * sends good full-sized frames. - */ - len = skb->len; - if (len >= tp->rcv_mss) { - tp->rcv_mss = len; - } else { - /* Otherwise, we make more careful check taking into account, - * that SACKs block is variable. - * - * "len" is invariant segment length, including TCP header. 
- */ - len = skb->tail - skb->h.raw; - if (len >= 536 + sizeof(struct tcphdr)) { - /* Subtract also invariant (if peer is RFC compliant), - * tcp header plus fixed timestamp option length. - * Resulting "len" is MSS free of SACK jitter. - */ - len -= tp->tcp_header_len; - if (len == lss) - tp->rcv_mss = len; - tp->last_seg_size = len; - } - } -} - /* * Check if sending an ack is needed. */ @@ -1904,26 +2122,25 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) * start in an expediant manner. */ - /* Two full frames received or... */ - if (((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) || - /* We will update the window "significantly" or... */ - tcp_raise_window(sk) || - /* We entered "quick ACK" mode or... */ + /* More than one full frame received or... */ + if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss) || + /* We ACK each frame or... */ tcp_in_quickack_mode(tp) || - /* We have out of order data */ - (ofo_possible && (skb_peek(&tp->out_of_order_queue) != NULL))) { + /* We have out of order data or */ + (ofo_possible && + skb_peek(&tp->out_of_order_queue) != NULL)) { /* Then ack it now */ tcp_send_ack(sk); } else { /* Else, send delayed ack. */ - tcp_send_delayed_ack(sk, HZ/2); + tcp_send_delayed_ack(sk); } } static __inline__ void tcp_ack_snd_check(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - if (tp->delayed_acks == 0) { + if (tp->ack.pending == 0) { /* We sent a data segment already. */ return; } @@ -1975,7 +2192,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) */ if (tp->urg_seq == tp->copied_seq) tp->copied_seq++; /* Move the copied sequence on correctly */ - tp->urg_data = URG_NOTYET; + tp->urg_data = TCP_URG_NOTYET; tp->urg_seq = ptr; /* Disable header prediction. */ @@ -1992,12 +2209,12 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len tcp_check_urg(sk,th); /* Do we wait for any urgent data? - normally not... */ - if (tp->urg_data == URG_NOTYET) { + if (tp->urg_data == TCP_URG_NOTYET) { u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4); /* Is the urgent pointer pointing into this packet? */ if (ptr < len) { - tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th); + tp->urg_data = TCP_URG_VALID | *(ptr + (unsigned char *) th); if (!sk->dead) sk->data_ready(sk,0); } @@ -2014,7 +2231,8 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len static int prune_queue(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - struct sk_buff * skb; + struct sk_buff *skb; + int pruned = 0; SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); @@ -2024,7 +2242,9 @@ static int prune_queue(struct sock *sk) skb = __skb_dequeue_tail(&tp->out_of_order_queue); if(skb != NULL) { /* Free it all. */ - do { net_statistics[smp_processor_id()*2].OfoPruned += skb->len; + do { + pruned += skb->len; + net_statistics[smp_processor_id()*2].OfoPruned += skb->len; kfree_skb(skb); skb = __skb_dequeue_tail(&tp->out_of_order_queue); } while(skb != NULL); @@ -2059,13 +2279,47 @@ static int prune_queue(struct sock *sk) * if we are really having our buffer space abused we stop accepting * new receive data. * + * 8) The arguments are interesting, but I even cannot imagine + * what kind of arguments could force us to drop NICE, ALREADY + * RECEIVED DATA only to get one more packet? --ANK + * * FIXME: it should recompute SACK state and only remove enough * buffers to get into bounds again. 
The current scheme loses - * badly sometimes on links with large RTT, especially when - * the driver has high overhead per skb. - * (increasing the rcvbuf is not enough because it inflates the - * the window too, disabling flow control effectively) -AK + * badly sometimes on links with large RTT, especially when + * the driver has high overhead per skb. + * (increasing the rcvbuf is not enough because it inflates the + * the window too, disabling flow control effectively) -AK + * + * Mmm... Why not to scale it seprately then? Just replace + * / WINDOW_ADVERTISE_DIVISOR with >> sk->window_advertise_scale + * and adjust it dynamically, when TCP window flow control + * fails? -ANK + */ + + /* F.e. one possible tactics is: */ + do { + u32 new_clamp = (tp->rcv_nxt-tp->copied_seq) + pruned; + + /* This guy is not a good guy. I bet, he martirized cats, + * when was child and grew up to finished sadist. Clamp him! + */ + if (new_clamp > 3*tp->ack.rcv_mss) + new_clamp -= tp->ack.rcv_mss; + else + new_clamp = 2*tp->ack.rcv_mss; + tp->window_clamp = min(tp->window_clamp, new_clamp); + } while (0); + /* Though it should be made earlier, when we are still not + * congested. This header prediction logic sucks + * without true implementation of VJ algorithm. + * I am really anxious. How was it possible to combine + * header prediction and sending ACKs outside of recvmsg() context? + * They _are_ incompatible. We should not advance window so + * brainlessly and we should not advertise so huge window from the very + * beginning. BTW window "prediction" does not speedup anything! + * SIlly, silly, silly. */ + if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1)) return 0; @@ -2073,6 +2327,57 @@ static int prune_queue(struct sock *sk) return -1; } +static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int chunk = skb->len - hlen; + int err; + + local_bh_enable(); + if (skb->ip_summed==CHECKSUM_UNNECESSARY) + err = memcpy_toiovec(tp->ucopy.iov, skb->h.raw + hlen, chunk); + else + err = copy_and_csum_toiovec(tp->ucopy.iov, skb, hlen); + + if (!err) { +update: + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; + local_bh_disable(); + return 0; + } + + if (err == -EFAULT) { + sk->err = EFAULT; + sk->error_report(sk); + goto update; + } + + local_bh_disable(); + return err; +} + +static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +{ + int result; + + if (sk->lock.users) { + local_bh_enable(); + result = __tcp_checksum_complete(skb); + local_bh_disable(); + } else { + result = __tcp_checksum_complete(skb); + } + return result; +} + +static __inline__ int +tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +{ + return skb->ip_summed != CHECKSUM_UNNECESSARY && + __tcp_checksum_complete_user(sk, skb); +} + /* * TCP receive function for the ESTABLISHED state. * @@ -2080,7 +2385,33 @@ static int prune_queue(struct sock *sk) * disabled when: * - A zero window was announced from us - zero window probing * is only handled properly in the slow path. - * - Out of order segments arrived. + * [ NOTE: actually, it was made incorrectly and nobody ever noticed + * this! Reason is clear: 1. Correct senders do not send + * to zero window. 2. Even if a sender sends to zero window, + * nothing terrible occurs. + * + * For now I cleaned this and fast path is really always disabled, + * when window is zero, but I would be more happy to remove these + * checks. Code will be only cleaner and _faster_. 
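The clamping tactic sketched in prune_queue() above (shrink window_clamp to roughly what the peer forced us to buffer, never below two segments) can be written as a pure function for clarity. The names and sample numbers are illustrative.

#include <stdio.h>
#include <stdint.h>

static uint32_t clamp_after_prune(uint32_t window_clamp, uint32_t unread,
                                  uint32_t pruned, uint32_t rcv_mss)
{
    uint32_t new_clamp = unread + pruned;   /* data the peer forced us to hold */

    if (new_clamp > 3 * rcv_mss)
        new_clamp -= rcv_mss;               /* shave one MSS off the budget    */
    else
        new_clamp = 2 * rcv_mss;            /* never drop below two segments   */

    return new_clamp < window_clamp ? new_clamp : window_clamp;
}

int main(void)
{
    /* 64 KB clamp, 20 KB unread plus 10 KB pruned, 1460-byte MSS. */
    printf("%u\n", clamp_after_prune(65535, 20480, 10240, 1460));
    return 0;
}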
--ANK + * + * Later note. I've just found that slow path also accepts + * out of window segments, look at tcp_sequence(). So... + * it is the last argument: I repair all and comment out + * repaired code by TCP_FORMAL_WINDOW. + * [ I remember one rhyme from a chidren's book. (I apologize, + * the trasnlation is not rhymed 8)): people in one (jewish) village + * decided to build sauna, but divided to two parties. + * The first one insisted that battens should not be dubbed, + * another objected that foots will suffer of splinters, + * the first fended that dubbed wet battens are too slippy + * and people will fall and it is much more serious! + * Certaiinly, all they went to rabbi. + * After some thinking, he judged: "Do not be lazy! + * Certainly, dub the battens! But put them by dubbed surface down." + * ] + * ] + * + * - Out of order segments arrived. * - Urgent data is expected. * - There is no buffer space left * - Unexpected TCP flags/window values/header lengths are received @@ -2088,7 +2419,7 @@ static int prune_queue(struct sock *sk) * - Data is sent in both directions. Fast path only supports pure senders * or pure receivers (this means either the sequence number or the ack * value must stay constant) - * - Unexpected TCP option. + * - Unexpected TCP option. * * When these conditions are not satisfied it drops into a standard * receive procedure patterned after RFC793 to handle all cases. @@ -2116,7 +2447,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, * We do checksum and copy also but from device to kernel. */ - /* RED-PEN. Using static variables to pass function arguments * cannot be good idea... */ @@ -2133,13 +2463,12 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if ((tcp_flag_word(th) & ~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { - int tcp_header_len = th->doff*4; - - /* Timestamp header prediction */ + int tcp_header_len = tp->tcp_header_len; - /* Non-standard header f.e. SACKs -> slow path */ - if (tcp_header_len != tp->tcp_header_len) - goto slow_path; + /* Timestamp header prediction: tcp_header_len + * is automatically equal to th->doff*4 due to pred_flags + * match. + */ /* Check timestamp */ if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { @@ -2161,8 +2490,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, goto slow_path; /* Predicted packet is in window by definition. - seq == rcv_nxt and last_ack_sent <= rcv_nxt. - Hence, check seq<=last_ack_sent reduces to: + * seq == rcv_nxt and last_ack_sent <= rcv_nxt. + * Hence, check seq<=last_ack_sent reduces to: */ if (tp->rcv_nxt == tp->last_ack_sent) { tp->ts_recent = tp->rcv_tsval; @@ -2173,6 +2502,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (len <= tcp_header_len) { /* Bulk data transfer: sender */ if (len == tcp_header_len) { + /* We know that such packets are checksummed + * on entry. + */ tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len); kfree_skb(skb); @@ -2182,19 +2514,42 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, TCP_INC_STATS_BH(TcpInErrs); goto discard; } - } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una && - atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) { - /* Bulk data transfer: receiver */ - __skb_pull(skb,tcp_header_len); + } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una) { + int eaten = 0; - /* Is it possible to simplify this? 
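/*
 * Illustrative sketch, not part of the patch: the header-prediction test
 * behind the fast path above.  pred_flags is assumed to be a host-order
 * copy of the expected 32-bit TCP header word at offset 12 (data offset,
 * flags, window); the mask constants and the raw-buffer access are
 * simplifications of the kernel's tcp_flag_word() usage.
 */
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

#define TCP_RESERVED_BITS 0x0F000000U   /* 4 reserved bits after the data offset */
#define TCP_FLAG_PSH      0x00080000U   /* PSH may legitimately vary per segment */

static int fast_path_candidate(const uint8_t *tcp_hdr, uint32_t pred_flags,
                               uint32_t seq, uint32_t rcv_nxt)
{
    uint32_t flag_word;

    memcpy(&flag_word, tcp_hdr + 12, sizeof(flag_word));
    flag_word = ntohl(flag_word);

    return (flag_word & ~(TCP_RESERVED_BITS | TCP_FLAG_PSH)) == pred_flags &&
           seq == rcv_nxt;              /* in order and nothing unusual in the header */
}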
*/ - tcp_measure_rcv_mss(sk, skb); + if (tp->ucopy.task == current && + tp->copied_seq == tp->rcv_nxt && + len - tcp_header_len <= tp->ucopy.len && + sk->lock.users) { + eaten = 1; + + NET_INC_STATS_BH(TCPHPHitsToUser); + + if (tcp_copy_to_iovec(sk, skb, tcp_header_len)) + goto csum_error; + + __skb_pull(skb,tcp_header_len); + } else { + if (tcp_checksum_complete_user(sk, skb)) + goto csum_error; + + if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) + goto step5; + + NET_INC_STATS_BH(TCPHPHits); + + /* Bulk data transfer: receiver */ + __skb_pull(skb,tcp_header_len); + + /* DO NOT notify forward progress here. + * It saves dozen of CPU instructions in fast path. --ANK + * And where is it signaled then ? -AK + * Nowhere. 8) --ANK + */ + __skb_queue_tail(&sk->receive_queue, skb); + skb_set_owner_r(skb, sk); + } - /* DO NOT notify forward progress here. - * It saves dozen of CPU instructions in fast path. --ANK - * And where is it signaled then ? -AK - */ - __skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; /* FIN bit check is not done since if FIN is set in @@ -2202,27 +2557,43 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ wake_up_interruptible(sk->sleep); sock_wake_async(sk->socket,1, POLL_IN); - tcp_delack_estimator(tp); - tcp_remember_ack(tp, th, skb); + tcp_event_data_recv(tp, skb); +#if 1/*def CONFIG_TCP_MORE_COARSE_ACKS*/ + if (eaten) { + if (tcp_in_quickack_mode(tp)) { + tcp_send_ack(sk); + } else { + tcp_send_delayed_ack(sk); + } + } else +#endif __tcp_ack_snd_check(sk, 0); + + if (eaten) + kfree_skb(skb); return 0; } /* Packet is in sequence, flags are trivial; - * only ACK is strange or we are tough on memory. - * Jump to step 5. + * only ACK is strange. Jump to step 5. */ + if (tcp_checksum_complete_user(sk, skb)) + goto csum_error; goto step5; } slow_path: + if (tcp_checksum_complete_user(sk, skb)) + goto csum_error; + /* * RFC1323: H1. Apply PAWS check first. */ if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp && tcp_paws_discard(tp, skb)) { if (!th->rst) { + NET_INC_STATS_BH(PAWSEstabRejected); tcp_send_ack(sk); goto discard; } @@ -2251,7 +2622,9 @@ slow_path: TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_wup, tp->rcv_wnd); } + tcp_enter_quickack_mode(tp); tcp_send_ack(sk); + NET_INC_STATS_BH(DelayedACKLost); goto discard; } @@ -2279,11 +2652,8 @@ step5: /* Process urgent data. */ tcp_urg(sk, th, len); - { /* step 7: process the segment text */ - int queued = tcp_data(skb, sk, len); - - tcp_measure_rcv_mss(sk, skb); + tcp_data(skb, sk, len); /* Be careful, tcp_data() may have put this into TIME_WAIT. 
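/*
 * Illustrative sketch, not part of the patch: the core of the PAWS test
 * applied on the slow path above.  Constants and names are simplified;
 * the real tcp_paws_discard() also special-cases RST segments.
 */
#include <stdint.h>
#include <time.h>

#define PAWS_WINDOW  1                       /* tolerated timestamp "jitter" in ticks */
#define PAWS_24DAYS  (60 * 60 * 24 * 24)     /* an older ts_recent is considered stale */

static int paws_reject(uint32_t rcv_tsval, uint32_t ts_recent,
                       time_t ts_recent_stamp, time_t now)
{
    /* Signed 32-bit difference handles timestamp wrap-around. */
    return (int32_t)(ts_recent - rcv_tsval) > PAWS_WINDOW &&
           now < ts_recent_stamp + PAWS_24DAYS;
}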
*/ if(sk->state != TCP_CLOSE) { @@ -2291,12 +2661,13 @@ step5: tcp_ack_snd_check(sk); } - if (!queued) { - discard: - kfree_skb(skb); - } - } + return 0; + +csum_error: + TCP_INC_STATS_BH(TcpInErrs); +discard: + kfree_skb(skb); return 0; } @@ -2328,6 +2699,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newsk->dport = req->rmt_port; sock_lock_init(newsk); + bh_lock_sock(newsk); atomic_set(&newsk->rmem_alloc, 0); skb_queue_head_init(&newsk->receive_queue); @@ -2351,22 +2723,27 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->rcv_nxt = req->rcv_isn + 1; newtp->snd_nxt = req->snt_isn + 1; newtp->snd_una = req->snt_isn + 1; - newtp->srtt = 0; - newtp->ato = 0; + newtp->snd_sml = req->snt_isn + 1; + + tcp_delack_init(newtp); + if (skb->len >= 536) + newtp->ack.last_seg_size = skb->len; + + tcp_prequeue_init(newtp); + newtp->snd_wl1 = req->rcv_isn; newtp->snd_wl2 = req->snt_isn; - /* RFC1323: The window in SYN & SYN/ACK segments - * is never scaled. - */ - newtp->snd_wnd = ntohs(skb->h.th->window); - - newtp->max_window = newtp->snd_wnd; - newtp->pending = 0; newtp->retransmits = 0; - newtp->last_ack_sent = req->rcv_isn + 1; newtp->backoff = 0; + newtp->srtt = 0; newtp->mdev = TCP_TIMEOUT_INIT; + newtp->rto = TCP_TIMEOUT_INIT; + + newtp->packets_out = 0; + newtp->fackets_out = 0; + newtp->retrans_out = 0; + newtp->snd_ssthresh = 0x7fffffff; /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control @@ -2374,22 +2751,11 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, * efficiently to them. -DaveM */ newtp->snd_cwnd = 2; - - newtp->rto = TCP_TIMEOUT_INIT; - newtp->packets_out = 0; - newtp->fackets_out = 0; - newtp->retrans_out = 0; - newtp->high_seq = 0; - newtp->snd_ssthresh = 0x7fffffff; newtp->snd_cwnd_cnt = 0; + newtp->high_seq = 0; + newtp->dup_acks = 0; - newtp->delayed_acks = 0; - init_timer(&newtp->retransmit_timer); - newtp->retransmit_timer.function = &tcp_retransmit_timer; - newtp->retransmit_timer.data = (unsigned long) newsk; - init_timer(&newtp->delack_timer); - newtp->delack_timer.function = &tcp_delack_timer; - newtp->delack_timer.data = (unsigned long) newsk; + tcp_init_xmit_timers(newsk); skb_queue_head_init(&newtp->out_of_order_queue); newtp->send_head = newtp->retrans_head = NULL; newtp->rcv_wup = req->rcv_isn + 1; @@ -2397,31 +2763,25 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->copied_seq = req->rcv_isn + 1; newtp->saw_tstamp = 0; + newtp->last_ack_sent = req->rcv_isn + 1; - init_timer(&newtp->probe_timer); - newtp->probe_timer.function = &tcp_probe_timer; - newtp->probe_timer.data = (unsigned long) newsk; newtp->probes_out = 0; newtp->syn_seq = req->rcv_isn; newtp->fin_seq = req->rcv_isn; newtp->urg_data = 0; - tcp_synq_init(newtp); - newtp->syn_backlog = 0; - if (skb->len >= 536) - newtp->last_seg_size = skb->len; + newtp->listen_opt = NULL; + newtp->accept_queue = NULL; + /* Deinitialize syn_wait_lock to trap illegal accesses. */ + memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock)); /* Back to base struct sock members. 
*/ newsk->err = 0; - newsk->ack_backlog = 0; - newsk->max_ack_backlog = SOMAXCONN; newsk->priority = 0; atomic_set(&newsk->refcnt, 1); +#ifdef INET_REFCNT_DEBUG atomic_inc(&inet_sock_nr); +#endif - spin_lock_init(&sk->timer_lock); - init_timer(&newsk->timer); - newsk->timer.function = &tcp_keepalive_timer; - newsk->timer.data = (unsigned long) newsk; if (newsk->keepopen) tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); newsk->socket = NULL; @@ -2440,6 +2800,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->snd_wscale = newtp->rcv_wscale = 0; newtp->window_clamp = min(newtp->window_clamp,65535); } + newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->snd_wscale; + newtp->max_window = newtp->snd_wnd; + if (newtp->tstamp_ok) { newtp->ts_recent = req->ts_recent; newtp->ts_recent_stamp = xtime.tv_sec; @@ -2453,16 +2816,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, return newsk; } -static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) -{ - if (seq == s_win) - return 1; - if (after(end_seq, s_win) && before(seq, e_win)) - return 1; - return (seq == e_win && seq == end_seq); -} - - /* * Process an incoming packet for SYN_RECV sockets represented * as an open_request. @@ -2470,30 +2823,28 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, struct open_request *req, - struct open_request *prev) + struct open_request **prev) { struct tcphdr *th = skb->h.th; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); int paws_reject = 0; struct tcp_opt ttp; - - /* If socket has already been created, process - packet in its context. - - We fall here only due to race, when packets were enqueued - to backlog of listening socket. - */ - if (req->sk) - return req->sk; + struct sock *child; ttp.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { - tcp_parse_options(NULL, th, &ttp, 0); - paws_reject = ttp.saw_tstamp && - (s32)(ttp.rcv_tsval - req->ts_recent) < 0; + if (ttp.saw_tstamp) { + ttp.ts_recent = req->ts_recent; + /* We do not store true stamp, but it is not required, + * it can be estimated (approximately) + * from another data. + */ + ttp.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); + paws_reject = tcp_paws_check(&ttp, th->rst); + } } /* Check for pure retransmited SYN. */ @@ -2517,7 +2868,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, * Enforce "SYN-ACK" according to figure 8, figure 6 * of RFC793, fixed by RFC1122. */ - req->class->rtx_syn_ack(sk, req); + req->class->rtx_syn_ack(sk, req, NULL); return NULL; } @@ -2544,6 +2895,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST)) req->class->send_ack(skb, req); + if (paws_reject) + NET_INC_STATS_BH(PAWSEstabRejected); return NULL; } @@ -2572,35 +2925,78 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, /* Invalid ACK: reset will be sent by listening socket */ if (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1) return sk; - - /* OK, ACK is valid, create big socket and - feed this segment to it. It will repeat all - the tests. THIS SEGMENT MUST MOVE SOCKET TO - ESTABLISHED STATE. If it will be dropped after - socket is created, wait for troubles. 
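/*
 * Illustrative sketch, not part of the patch: how tcp_check_req() above can
 * run PAWS against a half-open request even though no true timestamp age is
 * stored -- the age is estimated from the SYN-ACK retransmit count.
 * TCP_TIMEOUT_INIT_SEC is an assumed constant (initial RTO in seconds).
 */
#include <stdint.h>
#include <time.h>

#define TCP_TIMEOUT_INIT_SEC 3

struct mini_req {
    uint32_t ts_recent;   /* timestamp carried by the original SYN  */
    unsigned retrans;     /* number of SYN-ACK retransmissions      */
};

static time_t estimated_ts_stamp(const struct mini_req *req, time_t now)
{
    /* The SYN arrived roughly one (backed-off) RTO per retransmit ago. */
    return now - ((time_t)TCP_TIMEOUT_INIT_SEC << req->retrans);
}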
+ /* Also, it would be not so bad idea to check rcv_tsecr, which + * is essentially ACK extension and too early or too late values + * should cause reset in unsynchronized states. */ - sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); - if (sk == NULL) + + /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ + if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) { + req->acked = 1; return NULL; + } - tcp_dec_slow_timer(TCP_SLT_SYNACK); - req->sk = sk; - return sk; + /* OK, ACK is valid, create big socket and + * feed this segment to it. It will repeat all + * the tests. THIS SEGMENT MUST MOVE SOCKET TO + * ESTABLISHED STATE. If it will be dropped after + * socket is created, wait for troubles. + */ + child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); + if (child == NULL) + goto listen_overflow; -embryonic_reset: tcp_synq_unlink(tp, req, prev); - tp->syn_backlog--; - tcp_dec_slow_timer(TCP_SLT_SYNACK); + tcp_synq_removed(sk, req); + + tcp_acceptq_queue(sk, req, child); + return child; +listen_overflow: + if (!sysctl_tcp_abort_on_overflow) { + req->acked = 1; + return NULL; + } + +embryonic_reset: NET_INC_STATS_BH(EmbryonicRsts); if (!(flg & TCP_FLAG_RST)) req->class->send_reset(skb); - req->class->destructor(req); - tcp_openreq_free(req); + tcp_synq_drop(sk, req, prev); return NULL; } +/* + * Queue segment on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket. + */ + +int tcp_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb) +{ + int ret = 0; + int state = child->state; + + if (child->lock.users == 0) { + ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len); + + /* Wakeup parent, send SIGIO */ + if (state == TCP_SYN_RECV && child->state != state) + parent->data_ready(parent, 0); + } else { + /* Alas, it is possible again, because we do lookup + * in main socket hash table and lock on listening + * socket does not protect us more. + */ + sk_add_backlog(child, skb); + } + + bh_unlock_sock(child); + return ret; +} + static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { @@ -2608,25 +3004,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_parse_options(sk, th, tp, 0); -#ifdef CONFIG_TCP_TW_RECYCLE - if (tp->ts_recent_stamp && tp->saw_tstamp && !th->rst && - (s32)(tp->rcv_tsval - tp->ts_recent) < 0 && - xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS) { - /* Old duplicate segment. We remember last - ts_recent from this host in timewait bucket. - - Actually, we could implement per host cache - to truncate timewait state after RTO. Paranoidal arguments - of rfc1337 are not enough to close this nice possibility. - */ - if (net_ratelimit()) - printk(KERN_DEBUG "TCP: tw recycle, PAWS worked. Good.\n"); - if (th->ack) - return 1; - goto discard; - } -#endif - if (th->ack) { /* rfc793: * "If the state is SYN-SENT then @@ -2646,10 +3023,36 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * We do not send data with SYN, so that RFC-correct * test reduces to: */ - if (sk->zapped || - TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) + if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) return 1; + /* Check not from any RFC, but it is evident consequence + * of combining PAWS and usual SYN-SENT logic: ACK _is_ + * checked in SYN-SENT unlike another states, hence + * echoed tstamp must be checked too. 
+ */ + if (tp->saw_tstamp) { + if (tp->rcv_tsecr == 0) { + /* Workaround for bug in linux-2.1 and early + * 2.2 kernels. Let's pretend that we did not + * see such timestamp to avoid bogus rtt value, + * calculated by tcp_ack(). + */ + tp->saw_tstamp = 0; + + /* But do not forget to store peer's timestamp! */ + if (th->syn) { + tp->ts_recent = tp->rcv_tsval; + tp->ts_recent_stamp = xtime.tv_sec; + } + } else if ((__s32)(tp->rcv_tsecr - tcp_time_stamp) > 0 || + (__s32)(tp->rcv_tsecr - tp->syn_stamp) < 0) { + NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "TCP: synsent reject.\n")); + NET_INC_STATS_BH(PAWSActiveRejected); + return 1; + } + } + /* Now ACK is acceptable. * * "If the RST bit is set @@ -2689,18 +3092,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * because tcp_ack check is too weak for SYN-SENT) * causes moving socket to invalid semi-SYN-SENT, * semi-ESTABLISHED state and connection hangs. - * - * There exist buggy stacks, which really send - * such ACKs: f.e. 202.226.91.94 (okigate.oki.co.jp) - * Actually, if this host did not try to get something - * from ftp.inr.ac.ru I'd never find this bug 8) - * * --ANK (990514) * - * I was wrong, I apologize. Bare ACK is valid. + * Bare ACK is valid, however. * Actually, RFC793 requires to send such ACK * in reply to any out of window packet. - * It is wrong, but Linux also does it sometimes. + * It is wrong, but Linux also send such + * useless ACKs sometimes. * --ANK (990724) */ @@ -2717,7 +3115,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. */ - tp->snd_wnd = htons(th->window); + tp->snd_wnd = ntohs(th->window); tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq; tp->fin_seq = TCP_SKB_CB(skb)->seq; @@ -2742,26 +3140,35 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_initialize_rcv_mss(sk); tcp_init_metrics(sk); + if (sk->keepopen) + tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); + + tp->copied_seq = tp->rcv_nxt; + __tcp_fast_path_on(tp, tp->snd_wnd); + + if(!sk->dead) { + sk->state_change(sk); + sock_wake_async(sk->socket, 0, POLL_OUT); + } + if (tp->write_pending) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. * - * How to make this correctly? + * It may be deleted, but with this feature tcpdumps + * look so _wonderfully_ clever, that I was not able + * to stand against the temptation 8) --ANK */ - tp->delayed_acks++; - if (tp->ato == 0) - tp->ato = tp->rto; - tcp_send_delayed_ack(sk, tp->rto); + tp->ack.pending = 1; + tp->ack.lrcvtime = tcp_time_stamp; + tcp_enter_quickack_mode(tp); + tp->ack.pingpong = 1; + tp->ack.ato = TCP_ATO_MIN; + tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MIN); + goto discard; } else { tcp_send_ack(sk); } - - tp->copied_seq = tp->rcv_nxt; - - if(!sk->dead) { - wake_up_interruptible(sk->sleep); - sock_wake_async(sk->socket, 0, POLL_OUT); - } return -1; } @@ -2777,6 +3184,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, goto discard; } + /* PAWS check. */ + if (tp->ts_recent_stamp && tp->saw_tstamp && tcp_paws_check(tp, 0)) + goto discard; + if (th->syn) { /* We see SYN without ACK. It is attempt of * simultaneous connect with crossed SYNs. @@ -2800,8 +3211,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. 
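/*
 * Illustrative sketch, not part of the patch: the echoed-timestamp sanity
 * check described above for SYN-SENT.  A SYN-ACK must echo a timestamp we
 * actually sent, i.e. one that is neither in the future nor older than the
 * stamp placed on our SYN.  All names are simplified stand-ins.
 */
#include <stdint.h>

/* Returns 1 if the segment should be rejected, 0 otherwise. */
static int synsent_tsecr_invalid(uint32_t rcv_tsecr, uint32_t now_ts,
                                 uint32_t syn_stamp)
{
    if (rcv_tsecr == 0)
        return 0;   /* buggy peers echo 0; treated as if no timestamp was echoed */
    return (int32_t)(rcv_tsecr - now_ts) > 0 ||     /* from the future    */
           (int32_t)(rcv_tsecr - syn_stamp) < 0;    /* older than our SYN */
}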
*/ - tp->snd_wnd = htons(th->window); + tp->snd_wnd = ntohs(th->window); tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tp->max_window = tp->snd_wnd; tcp_sync_mss(sk, tp->pmtu_cookie); tcp_initialize_rcv_mss(sk); @@ -2960,6 +3372,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, #endif ) { if (!th->rst) { + NET_INC_STATS_BH(DelayedACKLost); + tcp_enter_quickack_mode(tp); tcp_send_ack(sk); } goto discard; @@ -3011,28 +3425,29 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->copied_seq = tp->rcv_nxt; /* Note, that this wakeup is only for marginal - crossed SYN case. Passively open sockets - are not waked up, because sk->sleep == NULL - and sk->socket == NULL. + * crossed SYN case. Passively open sockets + * are not waked up, because sk->sleep == NULL + * and sk->socket == NULL. */ - if (!sk->dead && sk->sleep) { - wake_up_interruptible(sk->sleep); + if (!sk->dead) { + sk->state_change(sk); sock_wake_async(sk->socket,0,POLL_OUT); } tp->snd_una = TCP_SKB_CB(skb)->ack_seq; - tp->snd_wnd = htons(th->window) << tp->snd_wscale; + tp->snd_wnd = ntohs(th->window) << tp->snd_wscale; tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq; /* tcp_ack considers this ACK as duplicate - * and does not calculate rtt. It is wrong. + * and does not calculate rtt. * Fix it at least with timestamps. */ if (tp->saw_tstamp && !tp->srtt) tcp_ack_saw_tstamp(sk, tp, 0, 0, FLAG_SYN_ACKED); tcp_init_metrics(sk); + tcp_fast_path_on(tp); } else { SOCK_DEBUG(sk, "bad ack\n"); return 1; @@ -3041,26 +3456,50 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, case TCP_FIN_WAIT1: if (tp->snd_una == tp->write_seq) { - sk->shutdown |= SEND_SHUTDOWN; tcp_set_state(sk, TCP_FIN_WAIT2); - if (!sk->dead) - sk->state_change(sk); - else - tcp_reset_keepalive_timer(sk, sysctl_tcp_fin_timeout); + sk->shutdown |= SEND_SHUTDOWN; dst_confirm(sk->dst_cache); + + if (!sk->dead) { + /* Wake up lingering close() */ + sk->state_change(sk); + } else { + int tmo; + + if (tp->linger2 < 0 || + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { + tcp_done(sk); + return 1; + } + + tmo = tcp_fin_time(tp); + if (tmo > TCP_TIMEWAIT_LEN) { + tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + } else if (th->fin || sk->lock.users) { + /* Bad case. We could lose such FIN otherwise. + * It is not a big problem, but it looks confusing + * and not so rare event. We still can lose it now, + * if it spins in bh_lock_sock(), but it is really + * marginal case. + */ + tcp_reset_keepalive_timer(sk, tmo); + } else { + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto discard; + } + } } break; - case TCP_CLOSING: + case TCP_CLOSING: if (tp->snd_una == tp->write_seq) { - tcp_time_wait(sk); + tcp_time_wait(sk, TCP_TIME_WAIT, 0); goto discard; } break; case TCP_LAST_ACK: if (tp->snd_una == tp->write_seq) { - tcp_set_state(sk,TCP_CLOSE); tcp_update_metrics(sk); tcp_done(sk); goto discard; @@ -3080,27 +3519,22 @@ step6: case TCP_CLOSING: if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq)) break; - case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: /* RFC 793 says to queue data in these states, * RFC 1122 says we MUST send a reset. * BSD 4.4 also does reset. 
*/ - if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) { + if (sk->shutdown & RCV_SHUTDOWN) { if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { tcp_reset(sk); return 1; } } - + /* Fall through */ case TCP_ESTABLISHED: - queued = tcp_data(skb, sk, len); - - /* This must be after tcp_data() does the skb_pull() to - * remove the header size from skb->len. - */ - tcp_measure_rcv_mss(sk, skb); + tcp_data(skb, sk, len); + queued = 1; break; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 22c35a191..7420e268f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.194 2000/01/09 02:19:41 davem Exp $ + * Version: $Id: tcp_ipv4.c,v 1.197 2000/01/21 06:37:28 davem Exp $ * * IPv4 specific functions * @@ -52,7 +52,6 @@ #include <linux/fcntl.h> #include <linux/random.h> #include <linux/init.h> -#include <linux/ipsec.h> #include <net/icmp.h> #include <net/tcp.h> @@ -61,15 +60,9 @@ #include <linux/inet.h> #include <linux/stddef.h> +#include <linux/ipsec.h> -extern int sysctl_tcp_timestamps; -extern int sysctl_tcp_window_scaling; -extern int sysctl_tcp_sack; -extern int sysctl_tcp_syncookies; -extern int sysctl_tcp_tw_recycle; extern int sysctl_ip_dynaddr; -extern __u32 sysctl_wmem_max; -extern __u32 sysctl_rmem_max; /* Check TCP sequence numbers in ICMP packets. */ #define ICMP_MIN_LENGTH 8 @@ -319,89 +312,13 @@ void tcp_put_port(struct sock *sk) local_bh_enable(); } -#ifdef CONFIG_TCP_TW_RECYCLE -/* - Very stupid pseudo-"algoritm". If the approach will be successful - (and it will!), we have to make it more reasonable. - Now it eats lots of CPU, when we are tough on ports. - - Apparently, it should be hash table indexed by daddr/dport. - - How does it work? We allow to truncate time-wait state, if: - 1. PAWS works on it. - 2. timewait bucket did not receive data for timeout: - - initially timeout := 2*RTO, so that if our ACK to first - transmitted peer's FIN is lost, we will see first retransmit. - - if we receive anything, the timout is increased exponentially - to follow normal TCP backoff pattern. - It is important that minimal RTO (HZ/5) > minimal timestamp - step (1ms). - 3. When creating new socket, we inherit sequence number - and ts_recent of time-wait bucket, increasinf them a bit. - - These two conditions guarantee, that data will not be corrupted - both by retransmitted and by delayed segments. They do not guarantee - that peer will leave LAST-ACK/CLOSING state gracefully, it will be - reset sometimes, namely, when more than two our ACKs to its FINs are lost. - This reset is harmless and even good. +/* This lock without TASK_EXCLUSIVE is good on UP and it can be very bad on SMP. + * Look, when several writers sleep and reader wakes them up, all but one + * immediately hit write lock and grab all the cpus. Exclusive sleep solves + * this, _but_ remember, it adds useless work on UP machines (wake up each + * exclusive lock release). It should be ifdefed really. 
*/ -int tcp_v4_tw_recycle(struct sock *sk, u32 daddr, u16 dport) -{ - static int tw_rover; - - struct tcp_tw_bucket *tw; - struct tcp_bind_hashbucket *head; - struct tcp_bind_bucket *tb; - - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - unsigned long now = jiffies; - int i, rover; - - rover = tw_rover; - - local_bh_disable(); - for (i=0; i<tcp_bhash_size; i++, rover++) { - rover &= (tcp_bhash_size-1); - head = &tcp_bhash[rover]; - - spin_lock(&head->lock); - for (tb = head->chain; tb; tb = tb->next) { - tw = (struct tcp_tw_bucket*)tb->owners; - - if (tw->state != TCP_TIME_WAIT || - tw->dport != dport || - tw->daddr != daddr || - tw->rcv_saddr != sk->rcv_saddr || - tb->port < low || - tb->port >= high || - !TCP_INET_FAMILY(tw->family) || - tw->ts_recent_stamp == 0 || - (long)(now - tw->ttd) <= 0) - continue; - tw_rover = rover; - goto hit; - } - spin_unlock(&head->lock); - } - local_bh_enable(); - tw_rover = rover; - return -EAGAIN; - -hit: - sk->num = tw->num; - if ((sk->bind_next = tb->owners) != NULL) - tb->owners->bind_pprev = &sk->bind_next; - tb->owners = sk; - sk->bind_pprev = &tb->owners; - sk->prev = (struct sock *) tb; - spin_unlock_bh(&head->lock); - return 0; -} -#endif - - void tcp_listen_wlock(void) { write_lock(&tcp_lhash_lock); @@ -409,9 +326,9 @@ void tcp_listen_wlock(void) if (atomic_read(&tcp_lhash_users)) { DECLARE_WAITQUEUE(wait, current); - add_wait_queue(&tcp_lhash_wait, &wait); + add_wait_queue_exclusive(&tcp_lhash_wait, &wait); for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE|TASK_EXCLUSIVE); if (atomic_read(&tcp_lhash_users) == 0) break; write_unlock_bh(&tcp_lhash_lock); @@ -445,6 +362,8 @@ static __inline__ void __tcp_v4_hash(struct sock *sk) sk->pprev = skp; sock_prot_inc_use(sk->prot); write_unlock(lock); + if (sk->state == TCP_LISTEN) + wake_up(&tcp_lhash_wait); } static void tcp_v4_hash(struct sock *sk) @@ -478,6 +397,8 @@ void tcp_unhash(struct sock *sk) sock_prot_dec_use(sk->prot); } write_unlock_bh(lock); + if (sk->state == TCP_LISTEN) + wake_up(&tcp_lhash_wait); } /* Don't inline this cruft. Here are some nice properties to @@ -546,8 +467,9 @@ sherry_cache: * * Local BH must be disabled here. */ -static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, - u32 daddr, u16 hnum, int dif) + +static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport, + u32 daddr, u16 hnum, int dif) { struct tcp_ehash_bucket *head; TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) @@ -572,7 +494,7 @@ static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, goto hit; read_unlock(&head->lock); - return tcp_v4_lookup_listener(daddr, hnum, dif); + return NULL; hit: sock_hold(sk); @@ -580,6 +502,19 @@ hit: return sk; } +static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, + u32 daddr, u16 hnum, int dif) +{ + struct sock *sk; + + sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif); + + if (sk) + return sk; + + return tcp_v4_lookup_listener(daddr, hnum, dif); +} + __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) { struct sock *sk; @@ -609,21 +544,16 @@ static int tcp_v4_check_established(struct sock *sk) int hash = tcp_hashfn(daddr, sk->num, saddr, sk->dport); struct tcp_ehash_bucket *head = &tcp_ehash[hash]; struct sock *sk2, **skp; -#ifdef CONFIG_TCP_TW_RECYCLE struct tcp_tw_bucket *tw; -#endif write_lock_bh(&head->lock); /* Check TIME-WAIT sockets first. 
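/*
 * Illustrative sketch, not part of the patch: the lookup split introduced
 * above -- search the established/TIME-WAIT table first and fall back to
 * the listener table only on a miss.  The toy chained lists stand in for
 * the real ehash/lhash buckets; all names here are invented.
 */
#include <stddef.h>
#include <stdint.h>

struct toy_sock {
    uint32_t local_addr, remote_addr;
    uint16_t local_port, remote_port;
    struct toy_sock *next;
};

/* saddr/sport are the remote end of the incoming packet, daddr/dport ours. */
static struct toy_sock *lookup_established(struct toy_sock *ehash,
                                           uint32_t saddr, uint16_t sport,
                                           uint32_t daddr, uint16_t dport)
{
    for (; ehash; ehash = ehash->next)
        if (ehash->remote_addr == saddr && ehash->remote_port == sport &&
            ehash->local_addr == daddr && ehash->local_port == dport)
            return ehash;               /* exact 4-tuple match */
    return NULL;
}

static struct toy_sock *lookup_listener(struct toy_sock *lhash,
                                        uint32_t daddr, uint16_t dport)
{
    for (; lhash; lhash = lhash->next)
        if (lhash->local_port == dport &&
            (lhash->local_addr == 0 || lhash->local_addr == daddr))
            return lhash;               /* wildcard or bound local address */
    return NULL;
}

static struct toy_sock *toy_lookup(struct toy_sock *ehash, struct toy_sock *lhash,
                                   uint32_t saddr, uint16_t sport,
                                   uint32_t daddr, uint16_t dport)
{
    struct toy_sock *sk = lookup_established(ehash, saddr, sport, daddr, dport);
    return sk ? sk : lookup_listener(lhash, daddr, dport);
}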
*/ for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL; skp = &sk2->next) { -#ifdef CONFIG_TCP_TW_RECYCLE tw = (struct tcp_tw_bucket*)sk2; -#endif if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) { -#ifdef CONFIG_TCP_TW_RECYCLE struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* With PAWS, it is safe from the viewpoint @@ -631,12 +561,17 @@ static int tcp_v4_check_established(struct sock *sk) is safe provided sequence spaces do not overlap i.e. at data rates <= 80Mbit/sec. - Actually, the idea is close to VJ's (rfc1332) - one, only timestamp cache is held not per host, + Actually, the idea is close to VJ's one, + only timestamp cache is held not per host, but per port pair and TW bucket is used as state holder. + + If TW bucket has been already destroyed we + fall back to VJ's scheme and use initial + timestamp retrieved from peer table. */ - if (sysctl_tcp_tw_recycle && tw->ts_recent_stamp) { + if (tw->substate == TCP_TIME_WAIT && + sysctl_tcp_tw_recycle && tw->ts_recent_stamp) { if ((tp->write_seq = tw->snd_nxt + 2) == 0) tp->write_seq = 1; tp->ts_recent = tw->ts_recent; @@ -645,13 +580,10 @@ static int tcp_v4_check_established(struct sock *sk) skp = &head->chain; goto unique; } else -#endif - goto not_unique; + goto not_unique; } } -#ifdef CONFIG_TCP_TW_RECYCLE tw = NULL; -#endif /* And established part... */ for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) { @@ -659,9 +591,7 @@ static int tcp_v4_check_established(struct sock *sk) goto not_unique; } -#ifdef CONFIG_TCP_TW_RECYCLE unique: -#endif BUG_TRAP(sk->pprev==NULL); if ((sk->next = *skp) != NULL) (*skp)->pprev = &sk->next; @@ -671,17 +601,17 @@ unique: sock_prot_inc_use(sk->prot); write_unlock_bh(&head->lock); -#ifdef CONFIG_TCP_TW_RECYCLE if (tw) { /* Silly. Should hash-dance instead... */ local_bh_disable(); tcp_tw_deschedule(tw); tcp_timewait_kill(tw); + NET_INC_STATS_BH(TimeWaitRecycled); local_bh_enable(); tcp_tw_put(tw); } -#endif + return 0; not_unique: @@ -727,9 +657,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) int tmp; int err; - if (sk->state != TCP_CLOSE) - return(-EISCONN); - if (addr_len < sizeof(struct sockaddr_in)) return(-EINVAL); @@ -759,8 +686,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) daddr = rt->rt_dst; err = -ENOBUFS; - buff = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header), - 0, GFP_KERNEL); + buff = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 0, GFP_KERNEL); if (buff == NULL) goto failure; @@ -769,27 +695,28 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk->saddr = rt->rt_src; sk->rcv_saddr = sk->saddr; - if (!sk->num) { - if (sk->prot->get_port(sk, 0) -#ifdef CONFIG_TCP_TW_RECYCLE - && (!sysctl_tcp_tw_recycle || - tcp_v4_tw_recycle(sk, daddr, usin->sin_port)) -#endif - ) { - kfree_skb(buff); - err = -EAGAIN; - goto failure; - } - sk->sport = htons(sk->num); - } -#ifdef CONFIG_TCP_TW_RECYCLE - else if (tp->ts_recent_stamp && sk->daddr != daddr) { + if (tp->ts_recent_stamp && sk->daddr != daddr) { /* Reset inherited state */ tp->ts_recent = 0; tp->ts_recent_stamp = 0; tp->write_seq = 0; } -#endif + + if (sysctl_tcp_tw_recycle && + !tp->ts_recent_stamp && + rt->rt_dst == daddr) { + struct inet_peer *peer = rt_get_peer(rt); + + /* VJ's idea. We save last timestamp seen from + * the destination in peer table, when entering state TIME-WAIT + * and initialize ts_recent from it, when trying new connection. 
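/*
 * Illustrative sketch, not part of the patch: the connect()-side half of
 * the per-destination timestamp cache described above.  "struct mini_peer"
 * is a stand-in for the real inet_peer entry; PAWS_MSL_SEC approximates
 * TCP_PAWS_MSL and is an assumed value.
 */
#include <stdint.h>
#include <time.h>

#define PAWS_MSL_SEC 60     /* assumed: how long a cached stamp stays trustworthy */

struct mini_peer { uint32_t tcp_ts; time_t tcp_ts_stamp; };
struct mini_conn { uint32_t ts_recent; time_t ts_recent_stamp; };

static void init_ts_from_peer(struct mini_conn *tp,
                              const struct mini_peer *peer, time_t now)
{
    if (tp->ts_recent_stamp)
        return;             /* state already inherited, keep it */
    if (peer && peer->tcp_ts_stamp &&
        peer->tcp_ts_stamp + PAWS_MSL_SEC >= now) {
        tp->ts_recent_stamp = peer->tcp_ts_stamp;
        tp->ts_recent       = peer->tcp_ts;
    }
}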
+ */ + + if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) { + tp->ts_recent_stamp = peer->tcp_ts_stamp; + tp->ts_recent = peer->tcp_ts; + } + } sk->dport = usin->sin_port; sk->daddr = daddr; @@ -814,85 +741,62 @@ failure: return err; } -static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len) +static __inline__ int tcp_v4_iif(struct sk_buff *skb) { - int retval = -EINVAL; - - lock_sock(sk); - - /* Do sanity checking for sendmsg/sendto/send. */ - if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL)) - goto out; - if (msg->msg_name) { - struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name; - - if (msg->msg_namelen < sizeof(*addr)) - goto out; - if (addr->sin_family && addr->sin_family != AF_INET) - goto out; - retval = -ENOTCONN; - if(sk->state == TCP_CLOSE) - goto out; - retval = -EISCONN; - if (addr->sin_port != sk->dport) - goto out; - if (addr->sin_addr.s_addr != sk->daddr) - goto out; - } - retval = tcp_do_sendmsg(sk, msg); - -out: - release_sock(sk); - return retval; + return ((struct rtable*)skb->dst)->rt_iif; } +static __inline__ unsigned tcp_v4_synq_hash(u32 raddr, u16 rport) +{ + unsigned h = raddr ^ rport; + h ^= h>>16; + h ^= h>>8; + return h&(TCP_SYNQ_HSIZE-1); +} -/* - * Do a linear search in the socket open_request list. - * This should be replaced with a global hash table. - */ static struct open_request *tcp_v4_search_req(struct tcp_opt *tp, struct iphdr *iph, struct tcphdr *th, - struct open_request **prevp) + struct open_request ***prevp) { - struct open_request *req, *prev; - __u16 rport = th->source; - - /* assumption: the socket is not in use. - * as we checked the user count on tcp_rcv and we're - * running from a soft interrupt. - */ - prev = (struct open_request *) (&tp->syn_wait_queue); - for (req = prev->dl_next; req; req = req->dl_next) { - if (req->af.v4_req.rmt_addr == iph->saddr && + struct tcp_listen_opt *lopt = tp->listen_opt; + struct open_request *req, **prev; + __u16 rport = th->source; + __u32 raddr = iph->saddr; + + for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport)]; + (req = *prev) != NULL; + prev = &req->dl_next) { + if (req->rmt_port == rport && + req->af.v4_req.rmt_addr == raddr && req->af.v4_req.loc_addr == iph->daddr && - req->rmt_port == rport && TCP_INET_FAMILY(req->class->family)) { - if (req->sk) { - /* Weird case: connection was established - and then killed by RST before user accepted - it. This connection is dead, but we cannot - kill openreq to avoid blocking in accept(). - - accept() will collect this garbage, - but such reqs must be ignored, when talking - to network. 
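/*
 * Illustrative sketch, not part of the patch: the open-request hash shown
 * above, as a standalone function.  TCP_SYNQ_HSIZE must be a power of two
 * for the final mask to work; 512 is an assumed value here.
 */
#include <stdint.h>

#define TCP_SYNQ_HSIZE 512

static unsigned synq_hash(uint32_t raddr, uint16_t rport)
{
    unsigned h = raddr ^ rport;

    h ^= h >> 16;           /* fold the high address bits down */
    h ^= h >> 8;
    return h & (TCP_SYNQ_HSIZE - 1);
}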
- */ - bh_lock_sock(req->sk); - BUG_TRAP(req->sk->lock.users==0); - if (req->sk->state == TCP_CLOSE) { - bh_unlock_sock(req->sk); - prev = req; - continue; - } - } + BUG_TRAP(req->sk == NULL); *prevp = prev; return req; } - prev = req; } - return NULL; + + return NULL; +} + +static void tcp_v4_synq_add(struct sock *sk, struct open_request *req) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct tcp_listen_opt *lopt = tp->listen_opt; + unsigned h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port); + + req->expires = jiffies + TCP_TIMEOUT_INIT; + req->retrans = 0; + req->sk = NULL; + req->index = h; + req->dl_next = lopt->syn_table[h]; + + write_lock(&tp->syn_wait_lock); + lopt->syn_table[h] = req; + write_unlock(&tp->syn_wait_lock); + + tcp_synq_added(sk); } @@ -984,7 +888,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) th = (struct tcphdr*)(dp+(iph->ihl<<2)); - sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex); + sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb)); if (sk == NULL) { ICMP_INC_STATS_BH(IcmpInErrors); return; @@ -1001,6 +905,9 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) if (sk->lock.users != 0) NET_INC_STATS_BH(LockDroppedIcmps); + if (sk->state == TCP_CLOSE) + goto out; + tp = &sk->tp_pinfo.af_tcp; seq = ntohl(th->seq); if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) { @@ -1010,14 +917,11 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) switch (type) { case ICMP_SOURCE_QUENCH: -#ifndef OLD_SOURCE_QUENCH /* This is deprecated */ - if (sk->lock.users == 0) { - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); - tp->snd_cwnd = tp->snd_ssthresh; - tp->snd_cwnd_cnt = 0; - tp->high_seq = tp->snd_nxt; - } -#endif + /* This is deprecated, but if someone generated it, + * we have no reasons to ignore it. + */ + if (sk->lock.users == 0) + tcp_enter_cong_avoid(tp); goto out; case ICMP_PARAMETERPROB: err = EPROTO; @@ -1042,7 +946,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) } switch (sk->state) { - struct open_request *req, *prev; + struct open_request *req, **prev; case TCP_LISTEN: if (sk->lock.users != 0) goto out; @@ -1060,47 +964,25 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) if (!req) goto out; - if (req->sk) { - struct sock *nsk = req->sk; - - /* - * Already in ESTABLISHED and a big socket is created, - * set error code there. - * The error will _not_ be reported in the accept(), - * but only with the next operation on the socket after - * accept. - */ - sock_hold(nsk); - bh_unlock_sock(sk); - sock_put(sk); - sk = nsk; - - BUG_TRAP(sk->lock.users == 0); - tp = &sk->tp_pinfo.af_tcp; - if (!between(seq, tp->snd_una, tp->snd_nxt)) { - NET_INC_STATS(OutOfWindowIcmps); - goto out; - } - } else { - if (seq != req->snt_isn) { - NET_INC_STATS(OutOfWindowIcmps); - goto out; - } + /* ICMPs are not backlogged, hence we cannot get + an established socket here. + */ + BUG_TRAP(req->sk == NULL); - /* - * Still in SYN_RECV, just remove it silently. - * There is no good way to pass the error to the newly - * created socket, and POSIX does not want network - * errors returned from accept(). - */ - tp->syn_backlog--; - tcp_synq_unlink(tp, req, prev); - tcp_dec_slow_timer(TCP_SLT_SYNACK); - req->class->destructor(req); - tcp_openreq_free(req); + if (seq != req->snt_isn) { + NET_INC_STATS_BH(OutOfWindowIcmps); goto out; } - break; + + /* + * Still in SYN_RECV, just remove it silently. 
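/*
 * Illustrative sketch, not part of the patch: the modular sequence-number
 * arithmetic behind the between(seq, snd_una, snd_nxt) test used above when
 * validating the sequence number quoted inside an ICMP error.
 */
#include <stdint.h>

/* 1 if low <= seq <= high in 32-bit wrap-around arithmetic. */
static int seq_between(uint32_t seq, uint32_t low, uint32_t high)
{
    /* Same trick as the kernel's between(): compare unsigned distances from "low". */
    return (uint32_t)(high - low) >= (uint32_t)(seq - low);
}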
+ * There is no good way to pass the error to the newly + * created socket, and POSIX does not want network + * errors returned from accept(). + */ + tcp_synq_drop(sk, req, prev); + goto out; + case TCP_SYN_SENT: case TCP_SYN_RECV: /* Cannot happen. It can f.e. if SYNs crossed. @@ -1110,10 +992,9 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) if (sk->lock.users == 0) { TCP_INC_STATS_BH(TcpAttemptFails); sk->err = err; - /* Wake people up to see the error (see connect in sock.c) */ + sk->error_report(sk); - tcp_set_state(sk, TCP_CLOSE); tcp_done(sk); } else { sk->err_soft = err; @@ -1270,28 +1151,23 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) { struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; - tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt, 0, tw->ts_recent); + tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt, + tw->rcv_wnd>>tw->rcv_wscale, tw->ts_recent); tcp_tw_put(tw); } static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req) { - tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd, req->ts_recent); + tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd, + req->ts_recent); } -/* - * Send a SYN-ACK after having received an ACK. - * This still operates on a open_request only, not on a big - * socket. - */ -static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) +static struct dst_entry* tcp_v4_route_req(struct sock *sk, struct open_request *req) { struct rtable *rt; struct ip_options *opt; - struct sk_buff * skb; - /* First, grab a route. */ opt = req->af.v4_req.opt; if(ip_route_output(&rt, ((opt && opt->srr) ? opt->faddr : @@ -1300,15 +1176,33 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute, sk->bound_dev_if)) { IP_INC_STATS_BH(IpOutNoRoutes); - return; + return NULL; } - if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { + if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { ip_rt_put(rt); IP_INC_STATS_BH(IpOutNoRoutes); - return; + return NULL; } + return &rt->u.dst; +} + +/* + * Send a SYN-ACK after having received an ACK. + * This still operates on a open_request only, not on a big + * socket. + */ +static int tcp_v4_send_synack(struct sock *sk, struct open_request *req, + struct dst_entry *dst) +{ + int err = -1; + struct sk_buff * skb; - skb = tcp_make_synack(sk, &rt->u.dst, req); + /* First, grab a route. 
*/ + if (dst == NULL && + (dst = tcp_v4_route_req(sk, req)) == NULL) + goto out; + + skb = tcp_make_synack(sk, dst, req); if (skb) { struct tcphdr *th = skb->h.th; @@ -1317,10 +1211,15 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr, csum_partial((char *)th, skb->len, skb->csum)); - ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr, - req->af.v4_req.rmt_addr, req->af.v4_req.opt); + err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr, + req->af.v4_req.rmt_addr, req->af.v4_req.opt); + if (err == NET_XMIT_CN) + err = 0; } - ip_rt_put(rt); + +out: + dst_release(dst); + return err; } /* @@ -1328,7 +1227,7 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) */ static void tcp_v4_or_free(struct open_request *req) { - if(!req->sk && req->af.v4_req.opt) + if (req->af.v4_req.opt) kfree_s(req->af.v4_req.opt, optlength(req->af.v4_req.opt)); } @@ -1372,8 +1271,14 @@ tcp_v4_save_options(struct sock *sk, struct sk_buff *skb) * It would be better to replace it with a global counter for all sockets * but then some measure against one socket starving all other sockets * would be needed. + * + * It was 128 by default. Experiments with real servers show, that + * it is absolutely not enough even at 100conn/sec. 256 cures most + * of problems. This value is adjusted to 128 for very small machines + * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb). + * Further increasing requires to change hash table size. */ -int sysctl_max_syn_backlog = 128; +int sysctl_max_syn_backlog = 256; struct or_calltable or_ipv4 = { PF_INET, @@ -1383,9 +1288,6 @@ struct or_calltable or_ipv4 = { tcp_v4_send_reset }; -#define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */ -#define BACKLOGMAX(sk) sysctl_max_syn_backlog - int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct tcp_opt tp; @@ -1394,6 +1296,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) __u32 saddr = skb->nh.iph->saddr; __u32 daddr = skb->nh.iph->daddr; __u32 isn = TCP_SKB_CB(skb)->when; + struct dst_entry *dst = NULL; #ifdef CONFIG_SYN_COOKIES int want_cookie = 0; #else @@ -1405,84 +1308,108 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) (RTCF_BROADCAST|RTCF_MULTICAST)) goto drop; - /* XXX: Check against a global syn pool counter. */ - if (BACKLOG(sk) > BACKLOGMAX(sk)) { + /* TW buckets are converted to open requests without + * limitations, they conserve resources and peer is + * evidently real one. + */ + if (tcp_synq_is_full(sk) && !isn) { #ifdef CONFIG_SYN_COOKIES - if (sysctl_tcp_syncookies && !isn) { - syn_flood_warning(skb); + if (sysctl_tcp_syncookies) { want_cookie = 1; } else #endif goto drop; - } else { - if (isn == 0) - isn = tcp_v4_init_sequence(sk, skb); - BACKLOG(sk)++; } - req = tcp_openreq_alloc(); - if (req == NULL) { - goto dropbacklog; - } + /* Accept backlog is full. If we have already queued enough + * of warm entries in syn queue, drop request. It is better than + * clogging syn queue with openreqs with exponentially increasing + * timeout. + */ + if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) + goto drop; - req->rcv_wnd = 0; /* So that tcp_send_synack() knows! 
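/*
 * Illustrative sketch, not part of the patch: the admission policy at the
 * top of tcp_v4_conn_request() above, expressed as a pure decision
 * function.  The boolean inputs are hypothetical; the real code reads them
 * from the listening socket's SYN and accept queues.
 */
enum syn_verdict { SYN_ACCEPT, SYN_COOKIE, SYN_DROP };

static enum syn_verdict admit_syn(int synq_full, int acceptq_full,
                                  int young_reqs, int from_timewait,
                                  int syncookies_enabled)
{
    /* Requests recycled from a live TIME-WAIT bucket are always let through. */
    if (synq_full && !from_timewait)
        return syncookies_enabled ? SYN_COOKIE : SYN_DROP;

    /* Accept queue full and plenty of fresh (young) requests already queued:
     * better to drop now than to pile up open requests that are doomed anyway. */
    if (acceptq_full && young_reqs > 1)
        return SYN_DROP;

    return SYN_ACCEPT;
}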
*/ + req = tcp_openreq_alloc(); + if (req == NULL) + goto drop; - req->rcv_isn = TCP_SKB_CB(skb)->seq; tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0; - tp.mss_clamp = 536; tp.user_mss = sk->tp_pinfo.af_tcp.user_mss; tcp_parse_options(NULL, th, &tp, want_cookie); - req->mss = tp.mss_clamp; - req->ts_recent = tp.saw_tstamp ? tp.rcv_tsval : 0; - req->tstamp_ok = tp.tstamp_ok; - req->sack_ok = tp.sack_ok; - req->snd_wscale = tp.snd_wscale; - req->wscale_ok = tp.wscale_ok; - req->rmt_port = th->source; + tcp_openreq_init(req, &tp, skb); + req->af.v4_req.loc_addr = daddr; req->af.v4_req.rmt_addr = saddr; + req->af.v4_req.opt = tcp_v4_save_options(sk, skb); + req->class = &or_ipv4; - /* Note that we ignore the isn passed from the TIME_WAIT - * state here. That's the price we pay for cookies. - * - * RED-PEN. The price is high... Then we cannot kill TIME-WAIT - * and should reject connection attempt, duplicates with random - * sequence number can corrupt data. Right? - * I disabled sending cookie to request matching to a timewait - * bucket. - */ - if (want_cookie) + if (want_cookie) { +#ifdef CONFIG_SYN_COOKIES + syn_flood_warning(skb); +#endif isn = cookie_v4_init_sequence(sk, skb, &req->mss); + } else if (isn == 0) { + struct inet_peer *peer = NULL; - req->snt_isn = isn; - - req->af.v4_req.opt = tcp_v4_save_options(sk, skb); + /* VJ's idea. We save last timestamp seen + * from the destination in peer table, when entering + * state TIME-WAIT, and check against it before + * accepting new connection request. + * + * If "isn" is not zero, this request hit alive + * timewait bucket, so that all the necessary checks + * are made in the function processing timewait state. + */ + if (tp.saw_tstamp && + sysctl_tcp_tw_recycle && + (dst = tcp_v4_route_req(sk, req)) != NULL && + (peer = rt_get_peer((struct rtable*)dst)) != NULL && + peer->v4daddr == saddr) { + if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL && + (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) { + NETDEBUG(printk(KERN_DEBUG "TW_REC: reject openreq %u/%u %08x/%u\n", peer->tcp_ts, req->ts_recent, saddr, ntohs(skb->h.th->source))); + NET_INC_STATS_BH(PAWSPassiveRejected); + dst_release(dst); + goto drop_and_free; + } + } + /* Kill the following clause, if you dislike this way. */ + else if (!sysctl_tcp_syncookies && + (sysctl_max_syn_backlog - tcp_synq_len(sk) + < (sysctl_max_syn_backlog>>2)) && + (!peer || !peer->tcp_ts_stamp) && + (!dst || !dst->rtt)) { + /* Without syncookies last quarter of + * backlog is filled with destinations, proven to be alive. + * It means that we continue to communicate + * to destinations, already remembered + * to the moment of synflood. 
+ */ + NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "TCP: drop open request from %08x/%u\n", saddr, ntohs(skb->h.th->source))); + TCP_INC_STATS_BH(TcpAttemptFails); + dst_release(dst); + goto drop_and_free; + } - req->class = &or_ipv4; - req->retrans = 0; - req->sk = NULL; + isn = tcp_v4_init_sequence(sk, skb); + } + req->snt_isn = isn; - tcp_v4_send_synack(sk, req); + if (tcp_v4_send_synack(sk, req, dst)) + goto drop_and_free; if (want_cookie) { - if (req->af.v4_req.opt) - kfree(req->af.v4_req.opt); - tcp_v4_or_free(req); tcp_openreq_free(req); } else { - req->expires = jiffies + TCP_TIMEOUT_INIT; - tcp_inc_slow_timer(TCP_SLT_SYNACK); - tcp_synq_queue(&sk->tp_pinfo.af_tcp, req); + tcp_v4_synq_add(sk, req); } - return 0; -dropbacklog: - if (!want_cookie) - BACKLOG(sk)--; +drop_and_free: + tcp_openreq_free(req); drop: TCP_INC_STATS_BH(TcpAttemptFails); return 0; @@ -1497,29 +1424,20 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, struct open_request *req, struct dst_entry *dst) { - struct ip_options *opt = req->af.v4_req.opt; struct tcp_opt *newtp; struct sock *newsk; - if (sk->ack_backlog > sk->max_ack_backlog) - goto exit; /* head drop */ - if (dst == NULL) { - struct rtable *rt; - - if (ip_route_output(&rt, - opt && opt->srr ? opt->faddr : req->af.v4_req.rmt_addr, - req->af.v4_req.loc_addr, sk->protinfo.af_inet.tos|RTO_CONN, 0)) - return NULL; - dst = &rt->u.dst; - } + if (tcp_acceptq_is_full(sk)) + goto exit_overflow; + + if (dst == NULL && + (dst = tcp_v4_route_req(sk, req)) == NULL) + goto exit; newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) goto exit; - sk->tp_pinfo.af_tcp.syn_backlog--; - sk->ack_backlog++; - newsk->dst_cache = dst; newtp = &(newsk->tp_pinfo.af_tcp); @@ -1527,7 +1445,8 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->saddr = req->af.v4_req.loc_addr; newsk->rcv_saddr = req->af.v4_req.loc_addr; newsk->protinfo.af_inet.opt = req->af.v4_req.opt; - newsk->protinfo.af_inet.mc_index = ((struct rtable*)skb->dst)->rt_iif; + req->af.v4_req.opt = NULL; + newsk->protinfo.af_inet.mc_index = tcp_v4_iif(skb); newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl; newtp->ext_header_len = 0; if (newsk->protinfo.af_inet.opt) @@ -1535,28 +1454,26 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, tcp_sync_mss(newsk, dst->pmtu); tcp_initialize_rcv_mss(newsk); + newtp->advmss = dst->advmss; - if (newsk->rcvbuf < (3 * (dst->advmss+40+MAX_HEADER+15))) - newsk->rcvbuf = min ((3 * (dst->advmss+40+MAX_HEADER+15)), sysctl_rmem_max); - if (newsk->sndbuf < (3 * (newtp->mss_clamp+40+MAX_HEADER+15))) - newsk->sndbuf = min ((3 * (newtp->mss_clamp+40+MAX_HEADER+15)), sysctl_wmem_max); + tcp_init_buffer_space(newsk); - bh_lock_sock(newsk); - __tcp_v4_hash(newsk); __tcp_inherit_port(sk, newsk); return newsk; +exit_overflow: + NET_INC_STATS_BH(ListenOverflows); exit: + NET_INC_STATS_BH(ListenDrops); dst_release(dst); return NULL; } - static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb) { - struct open_request *req, *prev; + struct open_request *req, **prev; struct tcphdr *th = skb->h.th; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); @@ -1565,6 +1482,25 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb) if (req) return tcp_check_req(sk, skb, req, prev); + if (tp->accept_queue) { + struct sock *nsk; + + nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr, + th->source, + skb->nh.iph->daddr, + ntohs(th->dest), + tcp_v4_iif(skb)); + + if (nsk) { + if (nsk->state != 
TCP_TIME_WAIT) { + bh_lock_sock(nsk); + return nsk; + } + tcp_tw_put((struct tcp_tw_bucket*)sk); + return NULL; + } + } + #ifdef CONFIG_SYN_COOKIES if (!th->rst && (th->syn || th->ack)) sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); @@ -1572,27 +1508,26 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb) return sk; } -static int tcp_csum_verify(struct sk_buff *skb) +static int tcp_v4_checksum_init(struct sk_buff *skb) { - switch (skb->ip_summed) { - case CHECKSUM_NONE: - skb->csum = csum_partial((char *)skb->h.th, skb->len, 0); - case CHECKSUM_HW: - if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,skb->nh.iph->daddr,skb->csum)) { - NETDEBUG(printk(KERN_DEBUG "TCPv4 bad checksum " - "from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, " - "len=%d/%d\n", - NIPQUAD(skb->nh.iph->saddr), - ntohs(skb->h.th->source), - NIPQUAD(skb->nh.iph->daddr), - ntohs(skb->h.th->dest), - skb->len, - ntohs(skb->nh.iph->tot_len))); - return 1; + if (skb->ip_summed == CHECKSUM_HW) { + if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr, + skb->nh.iph->daddr,skb->csum)) { + NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n")); + return -1; } skb->ip_summed = CHECKSUM_UNNECESSARY; - default: - /* CHECKSUM_UNNECESSARY */ + } else if (skb->ip_summed != CHECKSUM_UNNECESSARY) { + if (skb->len <= 68) { + if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr, + skb->nh.iph->daddr, + csum_partial((char *)skb->h.th, skb->len, 0))) + return -1; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else { + skb->csum = ~tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr, + skb->nh.iph->daddr,0); + } } return 0; } @@ -1614,66 +1549,35 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) goto discard; #endif /* CONFIG_FILTER */ - /* - * This doesn't check if the socket has enough room for the packet. - * Either process the packet _without_ queueing it and then free it, - * or do the check later. - */ - skb_set_owner_r(skb, sk); + IP_INC_STATS_BH(IpInDelivers); if (sk->state == TCP_ESTABLISHED) { /* Fast path */ - /* Ready to move deeper ... */ - if (tcp_csum_verify(skb)) - goto csum_err; + TCP_CHECK_TIMER(sk); if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) goto reset; + TCP_CHECK_TIMER(sk); return 0; - } + } - if (tcp_csum_verify(skb)) + if (tcp_checksum_complete(skb)) goto csum_err; if (sk->state == TCP_LISTEN) { - struct sock *nsk; - - nsk = tcp_v4_hnd_req(sk, skb); + struct sock *nsk = tcp_v4_hnd_req(sk, skb); if (!nsk) goto discard; - /* - * Queue it on the new socket if the new socket is active, - * otherwise we just shortcircuit this and continue with - * the new socket.. - */ if (nsk != sk) { - int ret; - int state = nsk->state; - - skb_orphan(skb); - - BUG_TRAP(nsk->lock.users == 0); - skb_set_owner_r(skb, nsk); - ret = tcp_rcv_state_process(nsk, skb, skb->h.th, skb->len); - - /* Wakeup parent, send SIGIO, if this packet changed - socket state from SYN-RECV. - - It still looks ugly, however it is much better - than miracleous double wakeup in syn_recv_sock() - and tcp_rcv_state_process(). 
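/*
 * Illustrative sketch, not part of the patch: the checksum strategy used by
 * tcp_v4_checksum_init() above -- hardware-checksummed or very small
 * segments are verified immediately, everything else is left to be
 * verified while copying to user space.  The threshold and enum names are
 * simplified assumptions.
 */
#include <stddef.h>

enum csum_state { CSUM_NONE, CSUM_HW, CSUM_UNNECESSARY };
enum csum_plan  { VERIFY_NOW, VERIFY_ON_COPY, ALREADY_OK };

static enum csum_plan checksum_plan(enum csum_state state, size_t len)
{
    if (state == CSUM_UNNECESSARY)
        return ALREADY_OK;
    if (state == CSUM_HW)
        return VERIFY_NOW;              /* only the pseudo-header fold is left */
    return len <= 68 ? VERIFY_NOW       /* too small to be worth deferring     */
                     : VERIFY_ON_COPY;
}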
- */ - if (state == TCP_SYN_RECV && nsk->state != state) - sk->data_ready(sk, 0); - - bh_unlock_sock(nsk); - if (ret) + if (tcp_child_process(sk, nsk, skb)) goto reset; return 0; } } - + + TCP_CHECK_TIMER(sk); if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) goto reset; + TCP_CHECK_TIMER(sk); return 0; reset: @@ -1716,6 +1620,9 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) if (len < sizeof(struct tcphdr)) goto bad_packet; + if (tcp_v4_checksum_init(skb) < 0) + goto bad_packet; + TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + len - th->doff*4); @@ -1724,7 +1631,7 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) skb->used = 0; sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source, - skb->nh.iph->daddr, ntohs(th->dest), skb->dev->ifindex); + skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb)); if (!sk) goto no_tcp_socket; @@ -1738,9 +1645,10 @@ process: bh_lock_sock(sk); ret = 0; - if (!sk->lock.users) - ret = tcp_v4_do_rcv(sk, skb); - else + if (!sk->lock.users) { + if (!tcp_prequeue(sk, skb)) + ret = tcp_v4_do_rcv(sk, skb); + } else sk_add_backlog(sk, skb); bh_unlock_sock(sk); @@ -1749,7 +1657,7 @@ process: return ret; no_tcp_socket: - if (tcp_csum_verify(skb)) { + if (tcp_checksum_complete(skb)) { bad_packet: TCP_INC_STATS_BH(TcpInErrs); } else { @@ -1766,7 +1674,7 @@ discard_and_relse: goto discard_it; do_time_wait: - if (tcp_csum_verify(skb)) { + if (tcp_checksum_complete(skb)) { TCP_INC_STATS_BH(TcpInErrs); goto discard_and_relse; } @@ -1776,7 +1684,7 @@ do_time_wait: { struct sock *sk2; - sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), skb->dev->ifindex); + sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb)); if (sk2 != NULL) { tcp_tw_deschedule((struct tcp_tw_bucket *)sk); tcp_timewait_kill((struct tcp_tw_bucket *)sk); @@ -1796,36 +1704,39 @@ do_time_wait: goto discard_it; } +/* With per-bucket locks this operation is not-atomic, so that + * this version is not worse. + */ static void __tcp_v4_rehash(struct sock *sk) { - struct tcp_ehash_bucket *oldhead = &tcp_ehash[sk->hashent]; - struct tcp_ehash_bucket *head = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))]; - struct sock **skp = &head->chain; - - write_lock_bh(&oldhead->lock); - if(sk->pprev) { - if(sk->next) - sk->next->pprev = sk->pprev; - *sk->pprev = sk->next; - sk->pprev = NULL; - } - write_unlock(&oldhead->lock); - write_lock(&head->lock); - if((sk->next = *skp) != NULL) - (*skp)->pprev = &sk->next; - *skp = sk; - sk->pprev = skp; - write_unlock_bh(&head->lock); + sk->prot->unhash(sk); + sk->prot->hash(sk); } int tcp_v4_rebuild_header(struct sock *sk) { - struct rtable *rt = (struct rtable *)__sk_dst_get(sk); + struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); __u32 new_saddr; int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT; - if(rt == NULL) - return 0; + if (rt == NULL) { + int err; + + u32 daddr = sk->daddr; + + if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) + daddr = sk->protinfo.af_inet.opt->faddr; + + err = ip_route_output(&rt, daddr, sk->saddr, + RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute, + sk->bound_dev_if); + if (err) { + sk->err_soft=-err; + sk->error_report(sk); + return -1; + } + __sk_dst_set(sk, &rt->u.dst); + } /* Force route checking if want_rewrite. * The idea is good, the implementation is disguisting. 
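/*
 * Illustrative sketch, not part of the patch: the three-way dispatch in
 * tcp_v4_rcv() above.  "user holds the socket lock" means a process is in
 * the middle of a socket call; "reader waiting" means a process sleeps in
 * recvmsg() and can drain the prequeue itself.  Names are illustrative.
 */
enum rcv_path { RCV_PROCESS_NOW, RCV_PREQUEUE, RCV_BACKLOG };

static enum rcv_path classify_segment(int user_holds_lock, int reader_waiting)
{
    if (user_holds_lock)
        return RCV_BACKLOG;      /* replayed when the lock is released   */
    if (reader_waiting)
        return RCV_PREQUEUE;     /* let the sleeping reader do the work  */
    return RCV_PROCESS_NOW;      /* full protocol processing in BH       */
}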
@@ -1855,16 +1766,6 @@ int tcp_v4_rebuild_header(struct sock *sk) dst_release(&new_rt->u.dst); } } - if (rt->u.dst.obsolete) { - int err; - err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos|RTO_CONN, rt->key.oif); - if (err) { - sk->err_soft=-err; - sk->error_report(sk); - return -1; - } - __sk_dst_set(sk, &rt->u.dst); - } return 0; @@ -1877,7 +1778,7 @@ do_rewrite: "saddr=%08X rcv_saddr=%08X\n", ntohl(sk->saddr), ntohl(sk->rcv_saddr)); - return 0; + return -1; } if (new_saddr != sk->saddr) { @@ -1895,7 +1796,7 @@ do_rewrite: * XXX really change the sockets identity after * XXX it has entered the hashes. -DaveM * - * Besides that, it does not check for connetion + * Besides that, it does not check for connection * uniqueness. Wait for troubles. */ __tcp_v4_rehash(sk); @@ -1913,6 +1814,63 @@ static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) sin->sin_port = sk->dport; } +/* VJ's idea. Save last timestamp seen from this destination + * and hold it at least for normal timewait interval to use for duplicate + * segment detection in subsequent connections, before they enter synchronized + * state. + */ + +int tcp_v4_remember_stamp(struct sock *sk) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct rtable *rt = (struct rtable*)__sk_dst_get(sk); + struct inet_peer *peer = NULL; + int release_it = 0; + + if (rt == NULL || rt->rt_dst != sk->daddr) { + peer = inet_getpeer(sk->daddr, 1); + release_it = 1; + } else { + if (rt->peer == NULL) + rt_bind_peer(rt, 1); + peer = rt->peer; + } + + if (peer) { + if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 || + (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && + peer->tcp_ts_stamp <= tp->ts_recent_stamp)) { + peer->tcp_ts_stamp = tp->ts_recent_stamp; + peer->tcp_ts = tp->ts_recent; + } + if (release_it) + inet_putpeer(peer); + return 1; + } + + return 0; +} + +int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw) +{ + struct inet_peer *peer = NULL; + + peer = inet_getpeer(tw->daddr, 1); + + if (peer) { + if ((s32)(peer->tcp_ts - tw->ts_recent) <= 0 || + (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && + peer->tcp_ts_stamp <= tw->ts_recent_stamp)) { + peer->tcp_ts_stamp = tw->ts_recent_stamp; + peer->tcp_ts = tw->ts_recent; + } + inet_putpeer(peer); + return 1; + } + + return 0; +} + struct tcp_func ipv4_specific = { ip_queue_xmit, tcp_v4_send_check, @@ -1920,6 +1878,7 @@ struct tcp_func ipv4_specific = { tcp_v4_conn_request, tcp_v4_syn_recv_sock, tcp_v4_hash_connecting, + tcp_v4_remember_stamp, sizeof(struct iphdr), ip_setsockopt, @@ -1937,6 +1896,7 @@ static int tcp_v4_init_sock(struct sock *sk) skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); + tcp_prequeue_init(tp); tp->rto = TCP_TIMEOUT_INIT; tp->mdev = TCP_TIMEOUT_INIT; @@ -1951,19 +1911,14 @@ static int tcp_v4_init_sock(struct sock *sk) /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. */ - tp->snd_cwnd_cnt = 0; tp->snd_ssthresh = 0x7fffffff; /* Infinity */ tp->snd_cwnd_clamp = ~0; tp->mss_cache = 536; sk->state = TCP_CLOSE; - sk->max_ack_backlog = SOMAXCONN; sk->write_space = tcp_write_space; - /* Init SYN queue. */ - tcp_synq_init(tp); - sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific; return 0; @@ -1981,9 +1936,10 @@ static int tcp_v4_destroy_sock(struct sock *sk) /* Cleans up our, hopefuly empty, out_of_order_queue. 
*/ __skb_queue_purge(&tp->out_of_order_queue); - /* Clean up a referenced TCP bind bucket, this only happens if a - * port is allocated for a socket, but it never fully connects. - */ + /* Clean prequeue, it must be empty really */ + __skb_queue_purge(&tp->ucopy.prequeue); + + /* Clean up a referenced TCP bind bucket. */ if(sk->prev != NULL) tcp_put_port(sk); @@ -1993,17 +1949,19 @@ static int tcp_v4_destroy_sock(struct sock *sk) /* Proc filesystem TCP sock list dumping. */ static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i) { - sprintf(tmpbuf, "%4d: %08lX:%04X %08lX:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p", + int ttd = req->expires - jiffies; + + sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p", i, - (long unsigned int)req->af.v4_req.loc_addr, + req->af.v4_req.loc_addr, ntohs(sk->sport), - (long unsigned int)req->af.v4_req.rmt_addr, + req->af.v4_req.rmt_addr, ntohs(req->rmt_port), TCP_SYN_RECV, 0,0, /* could print option size, but that is af dependent. */ 1, /* timers active (only the expire timer) */ - (unsigned long)(req->expires - jiffies), + ttd, req->retrans, sk->socket ? sk->socket->inode->i_uid : 0, 0, /* non standard timer */ @@ -2017,7 +1975,7 @@ static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i) { unsigned int dest, src; __u16 destp, srcp; - int timer_active, timer_active1, timer_active2; + int timer_active; unsigned long timer_expires; struct tcp_opt *tp = &sp->tp_pinfo.af_tcp; @@ -2025,15 +1983,16 @@ static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i) src = sp->rcv_saddr; destp = ntohs(sp->dport); srcp = ntohs(sp->sport); - timer_active1 = tp->retransmit_timer.prev != NULL; - timer_active2 = sp->timer.prev != NULL; timer_active = 0; timer_expires = (unsigned) -1; - if (timer_active1 && tp->retransmit_timer.expires < timer_expires) { + if (tp->retransmit_timer.prev != NULL && tp->retransmit_timer.expires < timer_expires) { timer_active = 1; timer_expires = tp->retransmit_timer.expires; + } else if (tp->probe_timer.prev != NULL && tp->probe_timer.expires < timer_expires) { + timer_active = 4; + timer_expires = tp->probe_timer.expires; } - if (timer_active2 && sp->timer.expires < timer_expires) { + if (sp->timer.prev != NULL && sp->timer.expires < timer_expires) { timer_active = 2; timer_expires = sp->timer.expires; } @@ -2041,38 +2000,37 @@ static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i) timer_expires = jiffies; sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld %d %p", + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld %d %p %u %u %u %u", i, src, srcp, dest, destp, sp->state, tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq, timer_active, timer_expires-jiffies, tp->retransmits, sp->socket ? sp->socket->inode->i_uid : 0, - 0, + tp->probes_out, sp->socket ? 
sp->socket->inode->i_ino : 0, - atomic_read(&sp->refcnt), sp); + atomic_read(&sp->refcnt), sp, + tp->rto, tp->ack.ato, tp->ack.quick, tp->ack.pingpong + ); } static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i) { unsigned int dest, src; __u16 destp, srcp; - int slot_dist; + int ttd = tw->ttd - jiffies; + + if (ttd < 0) + ttd = 0; dest = tw->daddr; src = tw->rcv_saddr; destp = ntohs(tw->dport); srcp = ntohs(tw->sport); - slot_dist = tw->death_slot; - if(slot_dist > tcp_tw_death_row_slot) - slot_dist = (TCP_TWKILL_SLOTS - slot_dist) + tcp_tw_death_row_slot; - else - slot_dist = tcp_tw_death_row_slot - slot_dist; - sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p", - i, src, srcp, dest, destp, TCP_TIME_WAIT, 0, 0, - 3, slot_dist * TCP_TWKILL_PERIOD, 0, 0, 0, 0, + i, src, srcp, dest, destp, tw->substate, 0, 0, + 3, ttd, 0, 0, 0, 0, atomic_read(&tw->refcnt), tw); } @@ -2093,6 +2051,8 @@ int tcp_get_info(char *buffer, char **start, off_t offset, int length) tcp_listen_lock(); for(i = 0; i < TCP_LHTABLE_SIZE; i++) { struct sock *sk = tcp_listening_hash[i]; + struct tcp_listen_opt *lopt; + int k; for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) { struct open_request *req; @@ -2112,25 +2072,30 @@ int tcp_get_info(char *buffer, char **start, off_t offset, int length) } skip_listen: - lock_sock(sk); - for (req = tp->syn_wait_queue; req; req = req->dl_next, num++) { - if (req->sk) - continue; - if (!TCP_INET_FAMILY(req->class->family)) - continue; - - pos += 128; - if (pos < offset) - continue; - get_openreq(sk, req, tmpbuf, num); - len += sprintf(buffer+len, "%-127s\n", tmpbuf); - if(len >= length) { - tcp_listen_unlock(); - release_sock(sk); - goto out_no_bh; + read_lock_bh(&tp->syn_wait_lock); + lopt = tp->listen_opt; + if (lopt && lopt->qlen != 0) { + for (k=0; k<TCP_SYNQ_HSIZE; k++) { + for (req = lopt->syn_table[k]; req; req = req->dl_next, num++) { + if (!TCP_INET_FAMILY(req->class->family)) + continue; + + pos += 128; + if (pos < offset) + continue; + get_openreq(sk, req, tmpbuf, num); + len += sprintf(buffer+len, "%-127s\n", tmpbuf); + if(len >= length) { + read_unlock_bh(&tp->syn_wait_lock); + tcp_listen_unlock(); + goto out_no_bh; + } + } } } - release_sock(sk); + read_unlock_bh(&tp->syn_wait_lock); + + /* Completed requests are in normal socket hash table */ } } tcp_listen_unlock(); @@ -2194,28 +2159,24 @@ struct proto tcp_prot = { tcp_v4_connect, /* connect */ tcp_disconnect, /* disconnect */ tcp_accept, /* accept */ - NULL, /* retransmit */ - tcp_write_wakeup, /* write_wakeup */ - tcp_read_wakeup, /* read_wakeup */ - tcp_poll, /* poll */ tcp_ioctl, /* ioctl */ tcp_v4_init_sock, /* init */ tcp_v4_destroy_sock, /* destroy */ tcp_shutdown, /* shutdown */ tcp_setsockopt, /* setsockopt */ tcp_getsockopt, /* getsockopt */ - tcp_v4_sendmsg, /* sendmsg */ + tcp_sendmsg, /* sendmsg */ tcp_recvmsg, /* recvmsg */ NULL, /* bind */ tcp_v4_do_rcv, /* backlog_rcv */ tcp_v4_hash, /* hash */ tcp_unhash, /* unhash */ tcp_v4_get_port, /* get_port */ - 128, /* max_header */ - 0, /* retransmits */ "TCP", /* name */ }; + + void __init tcp_v4_init(struct net_proto_family *ops) { int err; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e3d884dda..d6bc8a205 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). 
* - * Version: $Id: tcp_output.c,v 1.116 2000/01/13 00:19:49 davem Exp $ + * Version: $Id: tcp_output.c,v 1.119 2000/01/19 04:06:15 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -31,6 +31,7 @@ * during syn/ack processing. * David S. Miller : Output engine completely rewritten. * Andrea Arcangeli: SYNACK carry ts_recent in tsecr. + * Cacophonix Gaul : draft-minshall-nagle-01 * */ @@ -38,75 +39,65 @@ #include <linux/smp_lock.h> -extern int sysctl_tcp_timestamps; -extern int sysctl_tcp_window_scaling; -extern int sysctl_tcp_sack; - /* People can turn this off for buggy TCP's found in printers etc. */ int sysctl_tcp_retrans_collapse = 1; -/* Get rid of any delayed acks, we sent one already.. */ -static __inline__ void clear_delayed_acks(struct sock * sk) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - tp->delayed_acks = 0; - if(tcp_in_quickack_mode(tp)) - tcp_exit_quickack_mode(tp); - tcp_clear_xmit_timer(sk, TIME_DACK); -} - static __inline__ void update_send_head(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - + tp->send_head = tp->send_head->next; if (tp->send_head == (struct sk_buff *) &sk->write_queue) tp->send_head = NULL; } /* Calculate mss to advertise in SYN segment. - RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that: - - 1. It is independent of path mtu. - 2. Ideally, it is maximal possible segment size i.e. 65535-40. - 3. For IPv4 it is reasonable to calculate it from maximal MTU of - attached devices, because some buggy hosts are confused by - large MSS. - 4. We do not make 3, we advertise MSS, calculated from first - hop device mtu, but allow to raise it to ip_rt_min_advmss. - This may be overriden via information stored in routing table. - 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible, - probably even Jumbo". + * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that: + * + * 1. It is independent of path mtu. + * 2. Ideally, it is maximal possible segment size i.e. 65535-40. + * 3. For IPv4 it is reasonable to calculate it from maximal MTU of + * attached devices, because some buggy hosts are confused by + * large MSS. + * 4. We do not make 3, we advertise MSS, calculated from first + * hop device mtu, but allow to raise it to ip_rt_min_advmss. + * This may be overriden via information stored in routing table. + * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible, + * probably even Jumbo". */ static __u16 tcp_advertise_mss(struct sock *sk) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct dst_entry *dst = __sk_dst_get(sk); - int mss; + int mss = tp->advmss; - if (dst) { + if (dst && dst->advmss < mss) { mss = dst->advmss; - } else { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + tp->advmss = mss; + } - /* No dst. It is bad. Guess some reasonable value. - * Actually, this case should not be possible. - * SANITY. - */ - BUG_TRAP(dst!=NULL); + return (__u16)mss; +} - mss = tp->mss_cache; - mss += (tp->tcp_header_len - sizeof(struct tcphdr)) + - tp->ext_header_len; +static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb) +{ + /* If we had a reply for ato after last received + * packet, enter pingpong mode. + */ + if ((u32)(tp->lsndtime - tp->ack.lrcvtime) < tp->ack.ato) + tp->ack.pingpong = 1; - /* Minimal MSS to include full set of of TCP/IP options - plus 8 bytes of data. It corresponds to mtu 128. 
- */ - if (mss < 88) - mss = 88; - } + tp->lsndtime = tcp_time_stamp; +} - return (__u16)mss; +static __inline__ void tcp_event_ack_sent(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + tp->last_ack_sent = tp->rcv_nxt; + tcp_dec_quickack_mode(tp); + tp->ack.pending = 0; + tcp_clear_xmit_timer(sk, TCP_TIME_DACK); } /* This routine actually transmits TCP packets queued in by @@ -120,7 +111,7 @@ static __u16 tcp_advertise_mss(struct sock *sk) * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. */ -void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) +int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) { if(skb != NULL) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); @@ -128,6 +119,7 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) int tcp_header_size = tp->tcp_header_len; struct tcphdr *th; int sysctl_flags; + int err; #define SYSCTL_FLAG_TSTAMPS 0x1 #define SYSCTL_FLAG_WSCALE 0x2 @@ -190,11 +182,29 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) } tp->af_specific->send_check(sk, th, skb->len, skb); - clear_delayed_acks(sk); - tp->last_ack_sent = tp->rcv_nxt; + if (th->ack) + tcp_event_ack_sent(sk); + + if (skb->len != tcp_header_size) + tcp_event_data_sent(tp, skb); + TCP_INC_STATS(TcpOutSegs); - tp->af_specific->queue_xmit(skb); + + err = tp->af_specific->queue_xmit(skb); + if (err <= 0) + return err; + + tcp_enter_cong_avoid(tp); + + /* NET_XMIT_CN is special. It does not guarantee, + * that this packet is lost. It tells that device + * is about to start to drop packets or already + * drops some packets of the same priority and + * invokes us to send less aggressively. + */ + return err == NET_XMIT_CN ? 0 : err; } + return -ENOBUFS; #undef SYSCTL_FLAG_TSTAMPS #undef SYSCTL_FLAG_WSCALE #undef SYSCTL_FLAG_SACK @@ -202,32 +212,33 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) /* This is the main buffer sending routine. We queue the buffer * and decide whether to queue or transmit now. + * + * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, + * otherwise socket can stall. */ -void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue) +void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigned cur_mss) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* Advance write_seq and place onto the write_queue. */ - tp->write_seq += (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq); + tp->write_seq = TCP_SKB_CB(skb)->end_seq; __skb_queue_tail(&sk->write_queue, skb); - if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) { + if (!force_queue && tp->send_head == NULL && tcp_snd_test(tp, skb, cur_mss, 1)) { /* Send it out now. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tp->packets_out++; - tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)); - if(!tcp_timer_is_set(sk, TIME_RETRANS)) - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - } else { - /* Queue it, remembering where we must start sending. 
*/ - if (tp->send_head == NULL) - tp->send_head = skb; - if (!force_queue && tp->packets_out == 0 && !tp->pending) { - tp->pending = TIME_PROBE0; - tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); + if (tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)) == 0) { + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_minshall_update(tp, cur_mss, skb->len); + tp->packets_out++; + if(!tcp_timer_is_set(sk, TCP_TIME_RETRANS)) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + return; } } + /* Queue it, remembering where we must start sending. */ + if (tp->send_head == NULL) + tp->send_head = skb; } /* Function to create two new TCP segments. Shrinks the given segment @@ -243,13 +254,13 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) /* Get a new skb... force flag on. */ buff = sock_wmalloc(sk, - (nsize + MAX_HEADER + sk->prot->max_header), + (nsize + MAX_TCP_HEADER + 15), 1, GFP_ATOMIC); if (buff == NULL) - return -1; /* We'll just try again later. */ + return -ENOMEM; /* We'll just try again later. */ /* Reserve space for headers. */ - skb_reserve(buff, MAX_HEADER + sk->prot->max_header); + skb_reserve(buff, MAX_TCP_HEADER); /* Correct the sequence numbers. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; @@ -276,8 +287,8 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) TCP_SKB_CB(buff)->sacked = 0; /* Copy and checksum data tail into the new buffer. */ - buff->csum = csum_partial_copy(skb->data + len, skb_put(buff, nsize), - nsize, 0); + buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize), + nsize, 0); /* This takes care of the FIN sequence number too. */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; @@ -288,6 +299,11 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) /* Looks stupid, but our code really uses when of * skbs, which it never sent before. --ANK + * + * NOTE: several days after I added this, Dave repaired + * tcp_simple_retransmit() and it should not use ->when + * of never sent skbs more. I am not sure, so that + * this line remains until more careful investigation. --ANK */ TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; @@ -335,20 +351,19 @@ int tcp_sync_mss(struct sock *sk, u32 pmtu) if (mss_now > tp->mss_clamp) mss_now = tp->mss_clamp; - /* Now subtract TCP options size, not including SACKs */ - mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); - /* Now subtract optional transport overhead */ mss_now -= tp->ext_header_len; - /* It we got too small (or even negative) value, - clamp it by 8 from below. Why 8 ? - Well, it could be 1 with the same success, - but if IP accepted segment of length 1, - it would love 8 even more 8) --ANK (980731) - */ - if (mss_now < 8) - mss_now = 8; + /* Then reserve room for full set of TCP options and 8 bytes of data */ + if (mss_now < 48) + mss_now = 48; + + /* Now subtract TCP options size, not including SACKs */ + mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); + + /* Bound mss with half of window */ + if (tp->max_window && mss_now > (tp->max_window>>1)) + mss_now = max((tp->max_window>>1), 1); /* And store cached results */ tp->pmtu_cookie = pmtu; @@ -360,27 +375,30 @@ int tcp_sync_mss(struct sock *sk, u32 pmtu) /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. + * + * Returns 1, if no segments are in flight and we have queued segments, but + * cannot send anything now because of SWS or another problem. 
*/ -void tcp_write_xmit(struct sock *sk) +int tcp_write_xmit(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); unsigned int mss_now; - /* Account for SACKS, we may need to fragment due to this. - * It is just like the real MSS changing on us midstream. - * We also handle things correctly when the user adds some - * IP options mid-stream. Silly to do, but cover it. - */ - mss_now = tcp_current_mss(sk); - - /* If we are zapped, the bytes will have to remain here. - * In time closedown will empty the write queue and all + /* If we are closed, the bytes will have to remain here. + * In time closedown will finish, we empty the write queue and all * will be happy. */ - if(!sk->zapped) { + if(sk->state != TCP_CLOSE) { struct sk_buff *skb; int sent_pkts = 0; + /* Account for SACKS, we may need to fragment due to this. + * It is just like the real MSS changing on us midstream. + * We also handle things correctly when the user adds some + * IP options mid-stream. Silly to do, but cover it. + */ + mss_now = tcp_current_mss(sk); + /* Anything on the transmit queue that fits the window can * be added providing we are: * @@ -388,27 +406,36 @@ void tcp_write_xmit(struct sock *sk) * b) not exceeding our congestion window. * c) not retransmitting [Nagle] */ - while((skb = tp->send_head) && tcp_snd_test(sk, skb)) { + while((skb = tp->send_head) && + tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb))) { if (skb->len > mss_now) { if (tcp_fragment(sk, skb, mss_now)) break; } - /* Advance the send_head. This one is going out. */ - update_send_head(sk); TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) + break; + /* Advance the send_head. This one is sent out. */ + update_send_head(sk); tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_minshall_update(tp, mss_now, skb->len); tp->packets_out++; - tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); sent_pkts = 1; } /* If we sent anything, make sure the retransmit * timer is active. */ - if (sent_pkts && !tcp_timer_is_set(sk, TIME_RETRANS)) - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + if (sent_pkts) { + if (!tcp_timer_is_set(sk, TCP_TIME_RETRANS)) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + return 0; + } + + return !tp->packets_out && tp->send_head; } + return 0; } /* This function returns the amount that we can raise the @@ -471,7 +498,7 @@ u32 __tcp_select_window(struct sock *sk) * but may be worse for the performance because of rcv_mss * fluctuations. --SAW 1998/11/1 */ - unsigned int mss = tp->rcv_mss; + unsigned int mss = tp->ack.rcv_mss; int free_space; u32 window; @@ -481,11 +508,19 @@ u32 __tcp_select_window(struct sock *sk) free_space = tp->window_clamp; if (tp->window_clamp < mss) mss = tp->window_clamp; - - if ((free_space < (tcp_full_space(sk) / 2)) && + + if ((free_space < (min((int)tp->window_clamp, tcp_full_space(sk)) / 2)) && (free_space < ((int) (mss/2)))) { window = 0; - tp->pred_flags = 0; + + /* THIS IS _VERY_ GOOD PLACE to play window clamp. + * if free_space becomes suspiciously low + * verify ratio rmem_alloc/(rcv_nxt - copied_seq), + * and if we predict that when free_space will be lower mss, + * rmem_alloc will run out of rcvbuf*2, shrink window_clamp. + * It will eliminate most of prune events! Very simple, + * it is the next thing to do. --ANK + */ } else { /* Get the largest window that is a nice multiple of mss. * Window clamp already applied above. 
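Editor's aside (illustrative sketch, not part of the merged patch): the __tcp_select_window() hunk above implements receiver-side silly-window-syndrome avoidance: advertise a zero window once free receive space drops below both half of the usable buffer and half an MSS, otherwise advertise the largest whole multiple of the MSS that the clamp allows. Below is a minimal model of that policy with invented names (full_space standing in for tcp_full_space(), window_clamp for tp->window_clamp) and none of the real routine's interaction with window scaling or the previously advertised window.

```c
/* Returns the receive window, in bytes, that this simplified receiver
 * would advertise.  Pure arithmetic; no kernel types or locking. */
int demo_select_window(int free_space, int full_space, int mss, int window_clamp)
{
    /* Usable space is bounded by the smaller of buffer size and clamp. */
    int usable = full_space < window_clamp ? full_space : window_clamp;

    if (free_space > window_clamp)
        free_space = window_clamp;
    if (mss > window_clamp)
        mss = window_clamp;
    if (mss <= 0)
        return 0;                   /* degenerate clamp: nothing to offer */

    /* SWS avoidance: room shrank too far, advertise nothing and let the
     * reader drain the receive queue before reopening the window. */
    if (free_space < usable / 2 && free_space < mss / 2)
        return 0;

    /* Otherwise the largest window that is a whole number of segments,
     * so the peer keeps sending full-sized packets. */
    return (free_space / mss) * mss;
}
```

In the kernel the value chosen here is still shifted by the negotiated window scale before it goes on the wire, and the comment in the hunk hints at the next refinement (shrinking window_clamp when free space gets suspiciously low, to avoid prune events); the sketch ignores both.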
@@ -542,9 +577,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m /* Optimize, actually we could also combine next_skb->csum * to skb->csum using a single add w/carry operation too. */ - skb->csum = csum_partial_copy(next_skb->data, - skb_put(skb, next_skb_size), - next_skb_size, skb->csum); + skb->csum = csum_partial_copy_nocheck(next_skb->data, + skb_put(skb, next_skb_size), + next_skb_size, skb->csum); } /* Update sequence range on original skb. */ @@ -603,8 +638,10 @@ void tcp_simple_retransmit(struct sock *sk) if (old_next_skb != skb || skb->len > mss) resend_skb = 1; old_next_skb = skb->next; - if (resend_skb != 0) - tcp_retransmit_skb(sk, skb); + if (resend_skb != 0) { + if (tcp_retransmit_skb(sk, skb)) + break; + } } } @@ -629,9 +666,21 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); unsigned int cur_mss = tcp_current_mss(sk); +#ifdef TCP_DEBUG + /* It was possible this summer, that retransmit timer + * raced with its deletion and hit socket with packets_out==0. + * I fixed it, but preserved the check in the place, + * where the fault occured. --ANK + */ + if (skb == NULL) { + printk("tcp_retransmit_skb: bug, skb==NULL, caller=%p\n", NET_CALLER(sk)); + return -EFAULT; + } +#endif + if(skb->len > cur_mss) { if(tcp_fragment(sk, skb, cur_mss)) - return 1; /* We'll try again later. */ + return -ENOMEM; /* We'll try again later. */ /* New SKB created, account for it. */ tp->packets_out++; @@ -646,7 +695,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) tcp_retrans_try_collapse(sk, skb, cur_mss); if(tp->af_specific->rebuild_header(sk)) - return 1; /* Routing failure or similar. */ + return -EHOSTUNREACH; /* Routing failure or similar. */ /* Some Solaris stacks overoptimize and ignore the FIN on a * retransmit when old data is attached. So strip it off @@ -673,13 +722,10 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) else skb = skb_clone(skb, GFP_ATOMIC); - tcp_transmit_skb(sk, skb); - /* Update global TCP statistics and return success. */ - sk->prot->retransmits++; TCP_INC_STATS(TcpRetransSegs); - return 0; + return tcp_transmit_skb(sk, skb); } /* This gets called after a retransmit timeout, and the initially @@ -774,7 +820,11 @@ void tcp_send_fin(struct sock *sk) */ mss_now = tcp_current_mss(sk); - if((tp->send_head != NULL) && (skb->len < mss_now)) { + /* Please, find seven differences of 2.3.33 and loook + * what I broke here. 8) --ANK + */ + + if(tp->send_head != NULL) { /* tcp_write_xmit() takes care of the rest. */ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; TCP_SKB_CB(skb)->end_seq++; @@ -783,31 +833,34 @@ void tcp_send_fin(struct sock *sk) /* Special case to avoid Nagle bogosity. If this * segment is the last segment, and it was queued * due to Nagle/SWS-avoidance, send it out now. + * + * Hmm... actually it overrides also congestion + * avoidance (OK for FIN) and retransmit phase + * (not OK? Added.). 
*/ if(tp->send_head == skb && - !sk->nonagle && - skb->len < (tp->rcv_mss >> 1) && - tp->packets_out && - !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) { - update_send_head(sk); + !after(tp->write_seq, tp->snd_una + tp->snd_wnd) && + !tp->retransmits) { TCP_SKB_CB(skb)->when = tcp_time_stamp; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tp->packets_out++; - tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); - if(!tcp_timer_is_set(sk, TIME_RETRANS)) - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + if (!tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) { + update_send_head(sk); + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tp->packets_out++; + if(!tcp_timer_is_set(sk, TCP_TIME_RETRANS)) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + } else + tcp_check_probe_timer(sk, tp); } } else { /* Socket is locked, keep trying until memory is available. */ do { skb = sock_wmalloc(sk, - (MAX_HEADER + - sk->prot->max_header), + MAX_TCP_HEADER + 15, 1, GFP_KERNEL); } while (skb == NULL); /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb_reserve(skb, MAX_TCP_HEADER); skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); TCP_SKB_CB(skb)->sacked = 0; @@ -816,7 +869,8 @@ void tcp_send_fin(struct sock *sk) /* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */ TCP_SKB_CB(skb)->seq = tp->write_seq; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; - tcp_send_skb(sk, skb, 0); + tcp_send_skb(sk, skb, 0, mss_now); + __tcp_push_pending_frames(sk, tp, mss_now); } } @@ -831,19 +885,19 @@ void tcp_send_active_reset(struct sock *sk, int priority) struct sk_buff *skb; /* NOTE: No TCP options attached and we never retransmit this. */ - skb = alloc_skb(MAX_HEADER + sk->prot->max_header, priority); + skb = alloc_skb(MAX_TCP_HEADER + 15, priority); if (!skb) return; /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb_reserve(skb, MAX_TCP_HEADER); skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->urg_ptr = 0; /* Send it off. */ - TCP_SKB_CB(skb)->seq = tp->write_seq; + TCP_SKB_CB(skb)->seq = tp->snd_nxt; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; tcp_transmit_skb(sk, skb); @@ -859,13 +913,13 @@ int tcp_send_synack(struct sock *sk) struct tcp_opt* tp = &(sk->tp_pinfo.af_tcp); struct sk_buff* skb; - skb = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header), + skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; /* Reserve space for headers and prepare control bits. 
*/ - skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb_reserve(skb, MAX_TCP_HEADER); skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_SYN); TCP_SKB_CB(skb)->sacked = 0; @@ -877,8 +931,7 @@ int tcp_send_synack(struct sock *sk) __skb_queue_tail(&sk->write_queue, skb); TCP_SKB_CB(skb)->when = tcp_time_stamp; tp->packets_out++; - tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); - return 0; + return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); } /* @@ -887,16 +940,17 @@ int tcp_send_synack(struct sock *sk) struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct open_request *req) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct tcphdr *th; int tcp_header_size; struct sk_buff *skb; - skb = sock_wmalloc(sk, MAX_HEADER + sk->prot->max_header, 1, GFP_ATOMIC); + skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); if (skb == NULL) return NULL; /* Reserve space for headers. */ - skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb_reserve(skb, MAX_TCP_HEADER); skb->dst = dst_clone(dst); @@ -919,7 +973,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ __u8 rcv_wscale; /* Set this up on the first call only */ - req->window_clamp = skb->dst->window; + req->window_clamp = tp->window_clamp ? : skb->dst->window; /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(tcp_full_space(sk), dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), @@ -951,7 +1005,7 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff) struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* Reserve space for headers. */ - skb_reserve(buff, MAX_HEADER + sk->prot->max_header); + skb_reserve(buff, MAX_TCP_HEADER + 15); /* We'll fix this up when we get a response from the other end. * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. @@ -962,12 +1016,16 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff) /* If user gave his TCP_MAXSEG, record it to clamp */ if (tp->user_mss) tp->mss_clamp = tp->user_mss; + tp->max_window = 0; tcp_sync_mss(sk, dst->pmtu); + tcp_initialize_rcv_mss(sk); - tp->window_clamp = dst->window; + if (!tp->window_clamp) + tp->window_clamp = dst->window; + tp->advmss = dst->advmss; tcp_select_initial_window(tcp_full_space(sk), - dst->advmss - (tp->tcp_header_len - sizeof(struct tcphdr)), + tp->advmss - (tp->tcp_header_len - sizeof(struct tcphdr)), &tp->rcv_wnd, &tp->window_clamp, sysctl_tcp_window_scaling, @@ -982,10 +1040,12 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff) goto err_out; sk->err = 0; + sk->done = 0; tp->snd_wnd = 0; tp->snd_wl1 = 0; tp->snd_wl2 = tp->write_seq; tp->snd_una = tp->write_seq; + tp->snd_sml = tp->write_seq; tp->rcv_nxt = 0; tp->rcv_wup = 0; tp->copied_seq = 0; @@ -1006,13 +1066,14 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff) /* Send it off. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; + tp->syn_stamp = TCP_SKB_CB(buff)->when; __skb_queue_tail(&sk->write_queue, buff); tp->packets_out++; tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); TCP_INC_STATS(TcpActiveOpens); /* Timer for repeating the SYN until an answer. */ - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); return 0; err_out: @@ -1025,16 +1086,14 @@ err_out: * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check() * for details. 
*/ -void tcp_send_delayed_ack(struct sock *sk, int max_timeout) +void tcp_send_delayed_ack(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; unsigned long timeout; /* Stay within the limit we were given */ - timeout = (tp->ato << 1) >> 1; - if (timeout > max_timeout) - timeout = max_timeout; - timeout += jiffies; + timeout = tp->ack.ato; + timeout += jiffies + (timeout>>2); /* Use new timeout only if there wasn't a older one earlier. */ spin_lock_bh(&sk->timer_lock); @@ -1042,18 +1101,46 @@ void tcp_send_delayed_ack(struct sock *sk, int max_timeout) sock_hold(sk); tp->delack_timer.expires = timeout; } else { + /* If delack timer was blocked or is about to expire, + * send ACK now. + */ + if (tp->ack.blocked || time_before_eq(tp->delack_timer.expires, jiffies+(tp->ack.ato>>2))) { + spin_unlock_bh(&sk->timer_lock); + + tcp_send_ack(sk); + __sock_put(sk); + return; + } + if (time_before(timeout, tp->delack_timer.expires)) tp->delack_timer.expires = timeout; } add_timer(&tp->delack_timer); spin_unlock_bh(&sk->timer_lock); + +#ifdef TCP_FORMAL_WINDOW + /* Explanation. Header prediction path does not handle + * case of zero window. If we send ACK immediately, pred_flags + * are reset when sending ACK. If rcv_nxt is advanced and + * ack is not sent, than delayed ack is scheduled. + * Hence, it is the best place to check for zero window. + */ + if (tp->pred_flags) { + if (tcp_receive_window(tp) == 0) + tp->pred_flags = 0; + } else { + if (skb_queue_len(&tp->out_of_order_queue) == 0 && + !tp->urg_data) + tcp_fast_path_on(tp); + } +#endif } /* This routine sends an ack and also updates the window. */ void tcp_send_ack(struct sock *sk) { /* If we have been reset, we may not send again. */ - if(!sk->zapped) { + if(sk->state != TCP_CLOSE) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *buff; @@ -1061,29 +1148,15 @@ void tcp_send_ack(struct sock *sk) * tcp_transmit_skb() will set the ownership to this * sock. */ - buff = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_ATOMIC); + buff = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC); if (buff == NULL) { - /* Force it to send an ack. We don't have to do this - * (ACK is unreliable) but it's much better use of - * bandwidth on slow links to send a spare ack than - * resend packets. - * - * This is the one possible way that we can delay an - * ACK and have tp->ato indicate that we are in - * quick ack mode, so clear it. It is also the only - * possible way for ato to be zero, when ACK'ing a - * SYNACK because we've taken no ATO measurement yet. - */ - if (tcp_in_quickack_mode(tp)) - tcp_exit_quickack_mode(tp); - if (!tp->ato) - tp->ato = tp->rto; - tcp_send_delayed_ack(sk, HZ/2); + tp->ack.pending = 1; + tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); return; } /* Reserve space for headers and prepare control bits. */ - skb_reserve(buff, MAX_HEADER + sk->prot->max_header); + skb_reserve(buff, MAX_TCP_HEADER); buff->csum = 0; TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(buff)->sacked = 0; @@ -1099,24 +1172,20 @@ void tcp_send_ack(struct sock *sk) /* This routine sends a packet with an out of date sequence * number. It assumes the other end will try to ack it. */ -void tcp_write_wakeup(struct sock *sk) +int tcp_write_wakeup(struct sock *sk) { - /* After a valid reset we can send no more. */ - if (!sk->zapped) { + if (sk->state != TCP_CLOSE) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; - /* Write data can still be transmitted/retransmitted in the - * following states. 
If any other state is encountered, return. - * [listen/close will never occur here anyway] + /* Now this function is never called, while + * we have something not ACKed in queue. */ - if ((1 << sk->state) & - ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1| - TCPF_FIN_WAIT2|TCPF_LAST_ACK|TCPF_CLOSING)) - return; + BUG_TRAP(tp->snd_una == tp->snd_nxt); - if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) && - ((skb = tp->send_head) != NULL)) { + if (tp->snd_wnd > (tp->snd_nxt-tp->snd_una) + && ((skb = tp->send_head) != NULL)) { + int err; unsigned long win_size; /* We are probing the opening of a window @@ -1126,24 +1195,26 @@ void tcp_write_wakeup(struct sock *sk) win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una); if (win_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) { if (tcp_fragment(sk, skb, win_size)) - return; /* Let a retransmit get it. */ + return -1; } - update_send_head(sk); TCP_SKB_CB(skb)->when = tcp_time_stamp; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tp->packets_out++; - tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); - if (!tcp_timer_is_set(sk, TIME_RETRANS)) - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + if (!err) { + update_send_head(sk); + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tp->packets_out++; + if (!tcp_timer_is_set(sk, TCP_TIME_RETRANS)) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + } + return err; } else { /* We don't queue it, tcp_transmit_skb() sets ownership. */ - skb = alloc_skb(MAX_HEADER + sk->prot->max_header, - GFP_ATOMIC); + skb = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC); if (skb == NULL) - return; + return -1; /* Reserve space for headers and set control bits. */ - skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb_reserve(skb, MAX_TCP_HEADER); skb->csum = 0; TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(skb)->sacked = 0; @@ -1152,13 +1223,18 @@ void tcp_write_wakeup(struct sock *sk) /* Use a previous sequence. This should cause the other * end to send an ack. Don't queue or clone SKB, just * send it. + * + * RED-PEN: logically it should be snd_una-1. + * snd_nxt-1 will not be acked. snd_una==snd_nxt + * in this place however. Right? */ - TCP_SKB_CB(skb)->seq = tp->snd_nxt - 1; + TCP_SKB_CB(skb)->seq = tp->snd_una - 1; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_transmit_skb(sk, skb); + return tcp_transmit_skb(sk, skb); } } + return -1; } /* A window probe timeout has occurred. If window is not closed send @@ -1167,11 +1243,32 @@ void tcp_write_wakeup(struct sock *sk) void tcp_send_probe0(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int err; + + err = tcp_write_wakeup(sk); + + if (tp->packets_out || !tp->send_head) { + /* Cancel probe timer, if it is not required. */ + tp->probes_out = 0; + tp->backoff = 0; + return; + } - tcp_write_wakeup(sk); - tp->pending = TIME_PROBE0; - tp->backoff++; - tp->probes_out++; - tcp_reset_xmit_timer (sk, TIME_PROBE0, - min(tp->rto << tp->backoff, 120*HZ)); + if (err <= 0) { + tp->backoff++; + tp->probes_out++; + tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, + min(tp->rto << tp->backoff, TCP_RTO_MAX)); + } else { + /* If packet was not sent due to local congestion, + * do not backoff and do not remember probes_out. + * Let local senders to fight for local resources. + * + * Use accumulated backoff yet. 
+ */ + if (!tp->probes_out) + tp->probes_out=1; + tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, + min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL)); + } } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a38724e42..bff4e872f 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_timer.c,v 1.68 1999/09/07 02:31:43 davem Exp $ + * Version: $Id: tcp_timer.c,v 1.71 2000/01/18 08:24:19 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -23,29 +23,20 @@ #include <net/tcp.h> int sysctl_tcp_syn_retries = TCP_SYN_RETRIES; +int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; int sysctl_tcp_retries1 = TCP_RETR1; int sysctl_tcp_retries2 = TCP_RETR2; +int sysctl_tcp_orphan_retries = TCP_ORPHAN_RETRIES; - -static void tcp_sltimer_handler(unsigned long); -static void tcp_syn_recv_timer(unsigned long); +static void tcp_retransmit_timer(unsigned long); +static void tcp_delack_timer(unsigned long); +static void tcp_probe_timer(unsigned long); +static void tcp_keepalive_timer (unsigned long data); static void tcp_twkill(unsigned long); -struct timer_list tcp_slow_timer = { - NULL, NULL, - 0, 0, - tcp_sltimer_handler, -}; - - -struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = { - {ATOMIC_INIT(0), TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer},/* SYNACK */ - {ATOMIC_INIT(0), TCP_TWKILL_PERIOD, 0, tcp_twkill} /* TWKILL */ -}; - const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n"; /* @@ -56,17 +47,25 @@ const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n"; void tcp_init_xmit_timers(struct sock *sk) { - init_timer(&sk->tp_pinfo.af_tcp.retransmit_timer); - sk->tp_pinfo.af_tcp.retransmit_timer.function=&tcp_retransmit_timer; - sk->tp_pinfo.af_tcp.retransmit_timer.data = (unsigned long) sk; - - init_timer(&sk->tp_pinfo.af_tcp.delack_timer); - sk->tp_pinfo.af_tcp.delack_timer.function=&tcp_delack_timer; - sk->tp_pinfo.af_tcp.delack_timer.data = (unsigned long) sk; - - init_timer(&sk->tp_pinfo.af_tcp.probe_timer); - sk->tp_pinfo.af_tcp.probe_timer.function=&tcp_probe_timer; - sk->tp_pinfo.af_tcp.probe_timer.data = (unsigned long) sk; + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + spin_lock_init(&sk->timer_lock); + + init_timer(&tp->retransmit_timer); + tp->retransmit_timer.function=&tcp_retransmit_timer; + tp->retransmit_timer.data = (unsigned long) sk; + + init_timer(&tp->delack_timer); + tp->delack_timer.function=&tcp_delack_timer; + tp->delack_timer.data = (unsigned long) sk; + + init_timer(&tp->probe_timer); + tp->probe_timer.function=&tcp_probe_timer; + tp->probe_timer.data = (unsigned long) sk; + + init_timer(&sk->timer); + sk->timer.function=&tcp_keepalive_timer; + sk->timer.data = (unsigned long) sk; } /* @@ -79,7 +78,7 @@ void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when) spin_lock_bh(&sk->timer_lock); switch (what) { - case TIME_RETRANS: + case TCP_TIME_RETRANS: /* When seting the transmit timer the probe timer * should not be set. 
* The delayed ack timer can be set if we are changing the @@ -89,29 +88,25 @@ void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when) __sock_put(sk); if (!tp->retransmit_timer.prev || !del_timer(&tp->retransmit_timer)) sock_hold(sk); - if (when > 120*HZ) { + if (when > TCP_RTO_MAX) { printk(KERN_DEBUG "reset_xmit_timer sk=%p when=0x%lx, caller=%p\n", sk, when, NET_CALLER(sk)); - when = 120*HZ; + when = TCP_RTO_MAX; } mod_timer(&tp->retransmit_timer, jiffies+when); break; - case TIME_DACK: + case TCP_TIME_DACK: if (!tp->delack_timer.prev || !del_timer(&tp->delack_timer)) sock_hold(sk); mod_timer(&tp->delack_timer, jiffies+when); break; - case TIME_PROBE0: + case TCP_TIME_PROBE0: if (!tp->probe_timer.prev || !del_timer(&tp->probe_timer)) sock_hold(sk); mod_timer(&tp->probe_timer, jiffies+when); break; - case TIME_WRITE: - printk(KERN_DEBUG "bug: tcp_reset_xmit_timer TIME_WRITE\n"); - break; - default: printk(KERN_DEBUG "bug: unknown timer value\n"); }; @@ -127,6 +122,7 @@ void tcp_clear_xmit_timers(struct sock *sk) __sock_put(sk); if(tp->delack_timer.prev && del_timer(&tp->delack_timer)) __sock_put(sk); + tp->ack.blocked = 0; if(tp->probe_timer.prev && del_timer(&tp->probe_timer)) __sock_put(sk); if(sk->timer.prev && del_timer(&sk->timer)) @@ -134,39 +130,33 @@ void tcp_clear_xmit_timers(struct sock *sk) spin_unlock_bh(&sk->timer_lock); } -static void tcp_write_err(struct sock *sk, int force) +static void tcp_write_err(struct sock *sk) { - sk->err = sk->err_soft ? sk->err_soft : ETIMEDOUT; + sk->err = sk->err_soft ? : ETIMEDOUT; sk->error_report(sk); - tcp_clear_xmit_timers(sk); - - /* Do not time wait the socket. It is timed out and, hence, - * idle for 120*HZ. "force" argument is ignored, delete - * it eventually. - */ - - /* Clean up time. */ - tcp_set_state(sk, TCP_CLOSE); tcp_done(sk); } /* A write timeout has occurred. Process the after effects. */ -static void tcp_write_timeout(struct sock *sk) +static int tcp_write_timeout(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int retry_until; - /* Look for a 'soft' timeout. */ - if ((sk->state == TCP_ESTABLISHED && - tp->retransmits && (tp->retransmits % TCP_QUICK_TRIES) == 0) || - (sk->state != TCP_ESTABLISHED && tp->retransmits > sysctl_tcp_retries1)) { - /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black - hole detection. :-( + if ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) { + if (tp->retransmits) + dst_negative_advice(&sk->dst_cache); + retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries; + } else { + if (tp->retransmits >= sysctl_tcp_retries1) { + /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black + hole detection. :-( - It is place to make it. It is not made. I do not want - to make it. It is disguisting. It does not work in any - case. Let me to cite the same draft, which requires for - us to implement this: + It is place to make it. It is not made. I do not want + to make it. It is disguisting. It does not work in any + case. Let me to cite the same draft, which requires for + us to implement this: "The one security concern raised by this memo is that ICMP black holes are often caused by over-zealous security administrators who block @@ -177,57 +167,70 @@ static void tcp_write_timeout(struct sock *sk) be far nicer to have all of the black holes fixed rather than fixing all of the TCP implementations." - Golden words :-). - */ + Golden words :-). 
+ */ - dst_negative_advice(&sk->dst_cache); + dst_negative_advice(&sk->dst_cache); + } + retry_until = sysctl_tcp_retries2; + if (sk->dead) + retry_until = sysctl_tcp_orphan_retries; } - - /* Have we tried to SYN too many times (repent repent 8)) */ - if (sk->state == TCP_SYN_SENT && - ((!tp->syn_retries && tp->retransmits > sysctl_tcp_syn_retries) || - (tp->syn_retries && tp->retransmits > tp->syn_retries))) { - tcp_write_err(sk, 1); - /* Don't FIN, we got nothing back */ - } else if (tp->retransmits > sysctl_tcp_retries2) { + + if (tp->retransmits >= retry_until) { /* Has it gone just too far? */ - tcp_write_err(sk, 0); + tcp_write_err(sk); + return 1; } + return 0; } -void tcp_delack_timer(unsigned long data) +static void tcp_delack_timer(unsigned long data) { struct sock *sk = (struct sock*)data; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); bh_lock_sock(sk); if (sk->lock.users) { /* Try again later. */ - tcp_reset_xmit_timer(sk, TIME_DACK, HZ/5); + tp->ack.blocked = 1; + NET_INC_STATS_BH(DelayedACKLocked); + tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MIN); goto out_unlock; } - if(!sk->zapped && - sk->tp_pinfo.af_tcp.delayed_acks && - sk->state != TCP_CLOSE) + if (tp->ack.pending) { + /* Delayed ACK missed: inflate ATO, leave pingpong mode */ + tp->ack.ato = min(tp->ack.ato<<1, TCP_ATO_MAX); + tp->ack.pingpong = 0; tcp_send_ack(sk); + NET_INC_STATS_BH(DelayedACKs); + } + TCP_CHECK_TIMER(sk); out_unlock: bh_unlock_sock(sk); sock_put(sk); } -void tcp_probe_timer(unsigned long data) +static void tcp_probe_timer(unsigned long data) { struct sock *sk = (struct sock*)data; struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - - if(sk->zapped) - goto out; + int max_probes; bh_lock_sock(sk); if (sk->lock.users) { /* Try again later. */ - tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ/5); + tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, HZ/5); + goto out_unlock; + } + + if (sk->state == TCP_CLOSE) + goto out_unlock; + + if (tp->packets_out || !tp->send_head) { + tp->probes_out = 0; goto out_unlock; } @@ -246,151 +249,251 @@ void tcp_probe_timer(unsigned long data) * with RFCs, only probe timer combines both retransmission timeout * and probe timeout in one bottle. --ANK */ - if (tp->probes_out > sysctl_tcp_retries2) { - tcp_write_err(sk, 0); + max_probes = sk->dead ? sysctl_tcp_orphan_retries : sysctl_tcp_retries2; + + if (tp->probes_out > max_probes) { + tcp_write_err(sk); } else { /* Only send another probe if we didn't close things up. */ tcp_send_probe0(sk); + TCP_CHECK_TIMER(sk); } out_unlock: bh_unlock_sock(sk); -out: sock_put(sk); } /* Kill off TIME_WAIT sockets once their lifetime has expired. */ -int tcp_tw_death_row_slot = 0; -static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS] = - { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }; -static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED; +static int tcp_tw_death_row_slot = 0; +int tcp_tw_count = 0; +static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS]; +static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED; +static struct timer_list tcp_tw_timer = { function: tcp_twkill }; static void tcp_twkill(unsigned long data) { struct tcp_tw_bucket *tw; int killed = 0; - /* The death-row tw chains are only ever touched - * in BH context so no BH disabling (for now) is needed. + /* NOTE: compare this to previous version where lock + * was released after detaching chain. 
It was racy, + * because tw buckets are scheduled in not serialized context + * in 2.3 (with netfilter), and with softnet it is common, because + * soft irqs are not sequenced. */ spin_lock(&tw_death_lock); - tw = tcp_tw_death_row[tcp_tw_death_row_slot]; - tcp_tw_death_row[tcp_tw_death_row_slot] = NULL; - tcp_tw_death_row_slot = - ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); - spin_unlock(&tw_death_lock); - while(tw != NULL) { - struct tcp_tw_bucket *next = tw->next_death; + if (tcp_tw_count == 0) + goto out; + + while((tw = tcp_tw_death_row[tcp_tw_death_row_slot]) != NULL) { + tcp_tw_death_row[tcp_tw_death_row_slot] = tw->next_death; + tw->pprev_death = NULL; + spin_unlock(&tw_death_lock); tcp_timewait_kill(tw); tcp_tw_put(tw); + killed++; - tw = next; - } - if(killed != 0) { - struct tcp_sl_timer *slt = (struct tcp_sl_timer *)data; - atomic_sub(killed, &slt->count); + + spin_lock(&tw_death_lock); } + tcp_tw_death_row_slot = + ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); + + if ((tcp_tw_count -= killed) != 0) + mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); + net_statistics[smp_processor_id()*2].TimeWaited += killed; +out: + spin_unlock(&tw_death_lock); } /* These are always called from BH context. See callers in * tcp_input.c to verify this. */ -void tcp_tw_schedule(struct tcp_tw_bucket *tw) -{ - struct tcp_tw_bucket **tpp; - int slot; +/* This is for handling early-kills of TIME_WAIT sockets. */ +void tcp_tw_deschedule(struct tcp_tw_bucket *tw) +{ spin_lock(&tw_death_lock); - slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1); - tpp = &tcp_tw_death_row[slot]; - if((tw->next_death = *tpp) != NULL) - (*tpp)->pprev_death = &tw->next_death; - *tpp = tw; - tw->pprev_death = tpp; - - tw->death_slot = slot; - atomic_inc(&tw->refcnt); + if (tw->pprev_death) { + if(tw->next_death) + tw->next_death->pprev_death = tw->pprev_death; + *tw->pprev_death = tw->next_death; + tw->pprev_death = NULL; + tcp_tw_put(tw); + if (--tcp_tw_count == 0) + del_timer(&tcp_tw_timer); + } spin_unlock(&tw_death_lock); - - tcp_inc_slow_timer(TCP_SLT_TWKILL); } -/* Happens rarely if at all, no care about scalability here. */ -void tcp_tw_reschedule(struct tcp_tw_bucket *tw) +/* Short-time timewait calendar */ + +static int tcp_twcal_hand = -1; +static int tcp_twcal_jiffie; +static void tcp_twcal_tick(unsigned long); +static struct timer_list tcp_twcal_timer = {NULL, NULL, 0, 0, tcp_twcal_tick,}; +static struct tcp_tw_bucket *tcp_twcal_row[TCP_TW_RECYCLE_SLOTS]; + +void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo) { struct tcp_tw_bucket **tpp; int slot; + /* timeout := RTO * 3.5 + * + * 3.5 = 1+2+0.5 to wait for two retransmits. + * + * RATIONALE: if FIN arrived and we entered TIME-WAIT state, + * our ACK acking that FIN can be lost. If N subsequent retransmitted + * FINs (or previous seqments) are lost (probability of such event + * is p^(N+1), where p is probability to lose single packet and + * time to detect the loss is about RTO*(2^N - 1) with exponential + * backoff). Normal timewait length is calculated so, that we + * waited at least for one retransmitted FIN (maximal RTO is 120sec). + * [ BTW Linux. following BSD, violates this requirement waiting + * only for 60sec, we should wait at least for 240 secs. + * Well, 240 consumes too much of resources 8) + * ] + * This interval is not reduced to catch old duplicate and + * responces to our wandering segments living for two MSLs. 
+ * However, if we use PAWS to detect + * old duplicates, we can reduce the interval to bounds required + * by RTO, rather than MSL. So, if peer understands PAWS, we + * kill tw bucket after 3.5*RTO (it is important that this number + * is greater than TS tick!) and detect old duplicates with help + * of PAWS. + */ + slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK; + spin_lock(&tw_death_lock); + + /* Unlink it, if it was scheduled */ if (tw->pprev_death) { if(tw->next_death) tw->next_death->pprev_death = tw->pprev_death; *tw->pprev_death = tw->next_death; tw->pprev_death = NULL; + tcp_tw_count--; } else atomic_inc(&tw->refcnt); - slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1); - tpp = &tcp_tw_death_row[slot]; + if (slot >= TCP_TW_RECYCLE_SLOTS) { + /* Schedule to slow timer */ + if (timeo >= TCP_TIMEWAIT_LEN) { + slot = TCP_TWKILL_SLOTS-1; + } else { + slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD; + if (slot >= TCP_TWKILL_SLOTS) + slot = TCP_TWKILL_SLOTS-1; + } + tw->ttd = jiffies + timeo; + slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1); + tpp = &tcp_tw_death_row[slot]; + } else { + tw->ttd = jiffies + (slot<<TCP_TW_RECYCLE_TICK); + + if (tcp_twcal_hand < 0) { + tcp_twcal_hand = 0; + tcp_twcal_jiffie = jiffies; + tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK); + add_timer(&tcp_twcal_timer); + } else { + if ((long)(tcp_twcal_timer.expires - jiffies) > (slot<<TCP_TW_RECYCLE_TICK)) + mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK)); + slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1); + } + tpp = &tcp_twcal_row[slot]; + } + if((tw->next_death = *tpp) != NULL) (*tpp)->pprev_death = &tw->next_death; *tpp = tw; tw->pprev_death = tpp; - tw->death_slot = slot; + if (tcp_tw_count++ == 0) + mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); spin_unlock(&tw_death_lock); - - /* Timer was incremented when we first entered the table. */ } -/* This is for handling early-kills of TIME_WAIT sockets. */ -void tcp_tw_deschedule(struct tcp_tw_bucket *tw) +void tcp_twcal_tick(unsigned long dummy) { + int n, slot; + unsigned long j; + unsigned long now = jiffies; + int killed = 0; + int adv = 0; + spin_lock(&tw_death_lock); - if (tw->pprev_death) { - if(tw->next_death) - tw->next_death->pprev_death = tw->pprev_death; - *tw->pprev_death = tw->next_death; - tw->pprev_death = NULL; - tcp_tw_put(tw); + if (tcp_twcal_hand < 0) + goto out; + + slot = tcp_twcal_hand; + j = tcp_twcal_jiffie; + + for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) { + if ((long)(j - now) <= 0) { + struct tcp_tw_bucket *tw; + + while((tw = tcp_twcal_row[slot]) != NULL) { + tcp_twcal_row[slot] = tw->next_death; + tw->pprev_death = NULL; + + tcp_timewait_kill(tw); + tcp_tw_put(tw); + killed++; + } + } else { + if (!adv) { + adv = 1; + tcp_twcal_jiffie = j; + tcp_twcal_hand = slot; + } + + if (tcp_twcal_row[slot] != NULL) { + mod_timer(&tcp_twcal_timer, j); + goto out; + } + } + j += (1<<TCP_TW_RECYCLE_TICK); + slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1); } - spin_unlock(&tw_death_lock); + tcp_twcal_hand = -1; - tcp_dec_slow_timer(TCP_SLT_TWKILL); +out: + if ((tcp_tw_count -= killed) == 0) + del_timer(&tcp_tw_timer); + net_statistics[smp_processor_id()*2].TimeWaitKilled += killed; + spin_unlock(&tw_death_lock); } /* * The TCP retransmit timer. - * - * 1. An initial rtt timeout on the probe0 should cause what we can - * of the first write queue buffer to be split and sent. - * 2. 
On a 'major timeout' as defined by RFC1122 we do not report - * ETIMEDOUT if we know an additional 'soft' error caused this. - * tcp_err saves a 'soft error' for us. */ -void tcp_retransmit_timer(unsigned long data) +static void tcp_retransmit_timer(unsigned long data) { struct sock *sk = (struct sock*)data; struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - /* We are reset. We will send no more retransmits. */ - if(sk->zapped) - goto out; - bh_lock_sock(sk); if (sk->lock.users) { /* Try again later */ - tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ/20); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, HZ/20); goto out_unlock; } - /* Clear delay ack timer. */ - tcp_clear_xmit_timer(sk, TIME_DACK); + if (sk->state == TCP_CLOSE || tp->packets_out == 0) + goto out_unlock; + + BUG_TRAP(!skb_queue_empty(&sk->write_queue)); + + if (tcp_write_timeout(sk)) + goto out_unlock; /* RFC 2018, clear all 'sacked' flags in retransmission queue, * the sender may have dropped out of order frames and we must @@ -426,11 +529,19 @@ void tcp_retransmit_timer(unsigned long data) tp->snd_cwnd = 1; } - tp->retransmits++; - tp->dup_acks = 0; tp->high_seq = tp->snd_nxt; - tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); + if (tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)) > 0) { + /* Retransmission failed because of local congestion, + * do not backoff. + */ + if (!tp->retransmits) + tp->retransmits=1; + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, + min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL)); + TCP_CHECK_TIMER(sk); + goto out_unlock; + } /* Increase the timeout each time we retransmit. Note that * we do not increase the rtt estimate. rto is initialized @@ -448,132 +559,105 @@ void tcp_retransmit_timer(unsigned long data) * the 120 second clamps though! */ tp->backoff++; - tp->rto = min(tp->rto << 1, 120*HZ); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - - tcp_write_timeout(sk); + tp->retransmits++; + tp->rto = min(tp->rto << 1, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + TCP_CHECK_TIMER(sk); out_unlock: bh_unlock_sock(sk); -out: sock_put(sk); } /* - * Slow timer for SYN-RECV sockets + * Timer for listening sockets */ -static void tcp_do_syn_queue(struct sock *sk, struct tcp_opt *tp, unsigned long now) -{ - struct open_request *prev, *req; - - prev = (struct open_request *) &tp->syn_wait_queue; - for(req = tp->syn_wait_queue; req; ) { - struct open_request *next = req->dl_next; - - if (!req->sk && (long)(now - req->expires) >= 0) { - tcp_synq_unlink(tp, req, prev); - if(req->retrans >= sysctl_tcp_retries1) { - (*req->class->destructor)(req); - tcp_dec_slow_timer(TCP_SLT_SYNACK); - tp->syn_backlog--; - tcp_openreq_free(req); - if (! tp->syn_wait_queue) - break; - } else { - unsigned long timeo; - struct open_request *rp; - - (*req->class->rtx_syn_ack)(sk, req); - req->retrans++; - timeo = min((TCP_TIMEOUT_INIT << req->retrans), - (120 * HZ)); - req->expires = now + timeo; - rp = prev->dl_next; - tcp_synq_queue(tp, req); - if(rp != prev->dl_next) - prev = prev->dl_next; - } - } else - prev = req; - req = next; - } -} - -/* This now scales very nicely. -DaveM */ -static void tcp_syn_recv_timer(unsigned long data) +static void tcp_synack_timer(struct sock *sk) { - struct sock *sk; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct tcp_listen_opt *lopt = tp->listen_opt; + int max_retries = tp->syn_retries ? 
: sysctl_tcp_synack_retries; + int thresh = max_retries; unsigned long now = jiffies; - int i; - - read_lock(&tcp_lhash_lock); - for(i = 0; i < TCP_LHTABLE_SIZE; i++) { - sk = tcp_listening_hash[i]; - while(sk) { - struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - - /* TCP_LISTEN is implied. */ - bh_lock_sock(sk); - if (!sk->lock.users && tp->syn_wait_queue) - tcp_do_syn_queue(sk, tp, now); - bh_unlock_sock(sk); - sk = sk->next; + struct open_request **reqp, *req; + int i, budget; + + if (lopt == NULL || lopt->qlen == 0) + return; + + /* Normally all the openreqs are young and become mature + * (i.e. converted to established socket) for first timeout. + * If synack was not acknowledged for 3 seconds, it means + * one of the following things: synack was lost, ack was lost, + * rtt is high or nobody planned to ack (i.e. synflood). + * When server is a bit loaded, queue is populated with old + * open requests, reducing effective size of queue. + * When server is well loaded, queue size reduces to zero + * after several minutes of work. It is not synflood, + * it is normal operation. The solution is pruning + * too old entries overriding normal timeout, when + * situation becomes dangerous. + * + * Essentially, we reserve half of room for young + * embrions; and abort old ones without pity, if old + * ones are about to clog our table. + */ + if (lopt->qlen>>(lopt->max_qlen_log-1)) { + int young = (lopt->qlen_young<<1); + + while (thresh > 2) { + if (lopt->qlen < young) + break; + thresh--; + young <<= 1; } } - read_unlock(&tcp_lhash_lock); -} - -void tcp_sltimer_handler(unsigned long data) -{ - struct tcp_sl_timer *slt = tcp_slt_array; - unsigned long next = ~0UL; - unsigned long now = jiffies; - int i; - for (i=0; i < TCP_SLT_MAX; i++, slt++) { - if (atomic_read(&slt->count)) { - long trigger; - - trigger = slt->period - ((long)(now - slt->last)); - - if (trigger <= 0) { - (*slt->handler)((unsigned long) slt); - slt->last = now; - trigger = slt->period; + if (tp->defer_accept) + max_retries = tp->defer_accept; + + budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL)); + i = lopt->clock_hand; + + do { + reqp=&lopt->syn_table[i]; + while ((req = *reqp) != NULL) { + if ((long)(now - req->expires) >= 0) { + if ((req->retrans < thresh || + (req->acked && req->retrans < max_retries)) + && !req->class->rtx_syn_ack(sk, req, NULL)) { + unsigned long timeo; + + if (req->retrans++ == 0) + lopt->qlen_young--; + timeo = min((TCP_TIMEOUT_INIT << req->retrans), + TCP_RTO_MAX); + req->expires = now + timeo; + reqp = &req->dl_next; + continue; + } + + /* Drop this request */ + write_lock(&tp->syn_wait_lock); + *reqp = req->dl_next; + write_unlock(&tp->syn_wait_lock); + lopt->qlen--; + if (req->retrans == 0) + lopt->qlen_young--; + tcp_openreq_free(req); } - - /* Only reschedule if some events remain. */ - if (atomic_read(&slt->count)) - next = min(next, trigger); + reqp = &req->dl_next; } - } - if (next != ~0UL) - mod_timer(&tcp_slow_timer, (now + next)); -} -/* __tcp_inc_slow_timer is called when an slow timer is started - * first time (slt->count was 0). There is race condition between - * timer creation and deletion and if we do not force adding timer here, - * we might lose timer. We could avoid it with global spinlock, but - * it is apparently overkill, so that we restart timer ALWAYS when - * this function is entered, it guarantees that timer will not lost. 
- */ + i = (i+1)&(TCP_SYNQ_HSIZE-1); -void __tcp_inc_slow_timer(struct tcp_sl_timer *slt) -{ - unsigned long now = jiffies; - unsigned long when; + } while (--budget > 0); - slt->last = now; + lopt->clock_hand = i; - when = now + slt->period; - - if (tcp_slow_timer.prev && - (long)(tcp_slow_timer.expires - when) < 0) - when = tcp_slow_timer.expires; - - mod_timer(&tcp_slow_timer, when); + if (lopt->qlen) + tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL); } void tcp_delete_keepalive_timer (struct sock *sk) @@ -595,6 +679,9 @@ void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len) void tcp_set_keepalive(struct sock *sk, int val) { + if ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN)) + return; + if (val && !sk->keepopen) tcp_reset_keepalive_timer(sk, keepalive_time_when(&sk->tp_pinfo.af_tcp)); else if (!val) @@ -602,7 +689,7 @@ void tcp_set_keepalive(struct sock *sk, int val) } -void tcp_keepalive_timer (unsigned long data) +static void tcp_keepalive_timer (unsigned long data) { struct sock *sk = (struct sock *) data; struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; @@ -616,14 +703,31 @@ void tcp_keepalive_timer (unsigned long data) goto out; } - if (sk->state == TCP_FIN_WAIT2 && sk->dead) + if (sk->state == TCP_LISTEN) { + tcp_synack_timer(sk); + goto out; + } + + if (sk->state == TCP_FIN_WAIT2 && sk->dead) { + if (tp->linger2 >= 0) { + int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN; + + if (tmo > 0) { + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto out; + } + } + tcp_send_active_reset(sk, GFP_ATOMIC); goto death; + } - if (!sk->keepopen) + if (!sk->keepopen || sk->state == TCP_CLOSE) goto out; elapsed = keepalive_time_when(tp); - if (!((1<<sk->state) & (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT2))) + + /* It is alive without keepalive 8) */ + if (tp->packets_out || tp->send_head) goto resched; elapsed = tcp_time_stamp - tp->rcv_tstamp; @@ -632,28 +736,30 @@ void tcp_keepalive_timer (unsigned long data) if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) || (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) { tcp_send_active_reset(sk, GFP_ATOMIC); - tcp_write_err(sk, 1); + tcp_write_err(sk); goto out; } - tp->probes_out++; - tp->pending = TIME_KEEPOPEN; - tcp_write_wakeup(sk); - elapsed = keepalive_intvl_when(tp); + if (tcp_write_wakeup(sk) <= 0) { + tp->probes_out++; + elapsed = keepalive_intvl_when(tp); + } else { + /* If keepalive was lost due to local congestion, + * try harder. + */ + elapsed = TCP_RESOURCE_PROBE_INTERVAL; + } } else { /* It is tp->rcv_tstamp + keepalive_time_when(tp) */ - if (keepalive_time_when(tp) > elapsed) - elapsed = keepalive_time_when(tp) - elapsed; - else - elapsed = 0; + elapsed = keepalive_time_when(tp) - elapsed; } + TCP_CHECK_TIMER(sk); + resched: tcp_reset_keepalive_timer (sk, elapsed); goto out; death: - tcp_set_state(sk, TCP_CLOSE); - tcp_clear_xmit_timers(sk); tcp_done(sk); out: diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9ace56abd..c052d2eb8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -5,7 +5,7 @@ * * The User Datagram Protocol (UDP). * - * Version: $Id: udp.c,v 1.77 2000/01/09 02:19:44 davem Exp $ + * Version: $Id: udp.c,v 1.79 2000/01/18 08:24:20 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -369,30 +369,15 @@ void udp_err(struct sk_buff *skb, unsigned char *dp, int len) } /* - * Various people wanted BSD UDP semantics. 
Well they've come - * back out because they slow down response to stuff like dead - * or unreachable name servers and they screw term users something - * chronic. Oh and it violates RFC1122. So basically fix your - * client code people. - */ - - /* * RFC1122: OK. Passes ICMP errors back to application, as per - * 4.1.3.3. After the comment above, that should be no surprise. - */ - - if (!harderr && !sk->protinfo.af_inet.recverr) - goto out; - - /* - * 4.x BSD compatibility item. Break RFC1122 to - * get BSD socket semantics. + * 4.1.3.3. */ - if(sk->bsdism && sk->state!=TCP_ESTABLISHED && !sk->protinfo.af_inet.recverr) - goto out; - - if (sk->protinfo.af_inet.recverr) + if (!sk->protinfo.af_inet.recverr) { + if (!harderr || sk->state != TCP_ESTABLISHED) + goto out; + } else { ip_icmp_error(sk, skb, err, uh->dest, info, (u8*)(uh+1)); + } sk->err = err; sk->error_report(sk); out: @@ -629,15 +614,13 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) { switch(cmd) { - case TIOCOUTQ: + case SIOCOUTQ: { - unsigned long amount; - - amount = sock_wspace(sk); + int amount = atomic_read(&sk->wmem_alloc); return put_user(amount, (int *)arg); } - case TIOCINQ: + case SIOCINQ: { struct sk_buff *skb; unsigned long amount; @@ -663,6 +646,17 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) return(0); } +static __inline__ int __udp_checksum_complete(struct sk_buff *skb) +{ + return (unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum)); +} + +static __inline__ int udp_checksum_complete(struct sk_buff *skb) +{ + return skb->ip_summed != CHECKSUM_UNNECESSARY && + __udp_checksum_complete(skb); +} + /* * This should be easy, if there is something there we * return it, otherwise we block. @@ -699,31 +693,21 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len, msg->msg_flags |= MSG_TRUNC; } -#ifndef CONFIG_UDP_DELAY_CSUM - err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, - copied); -#else if (skb->ip_summed==CHECKSUM_UNNECESSARY) { err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied); - } else if (copied > msg->msg_iov[0].iov_len || (msg->msg_flags&MSG_TRUNC)) { - if ((unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum))) + } else if (msg->msg_flags&MSG_TRUNC) { + if (__udp_checksum_complete(skb)) goto csum_copy_err; err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied); } else { - unsigned int csum; + err = copy_and_csum_toiovec(msg->msg_iov, skb, sizeof(struct udphdr)); - err = 0; - csum = csum_partial(skb->h.raw, sizeof(struct udphdr), skb->csum); - csum = csum_and_copy_to_user((char*)&skb->h.uh[1], msg->msg_iov[0].iov_base, - copied, csum, &err); if (err) - goto out_free; - if ((unsigned short)csum_fold(csum)) goto csum_copy_err; } -#endif + if (err) goto out_free; sk->stamp=skb->stamp; @@ -744,7 +728,6 @@ out_free: out: return err; -#ifdef CONFIG_UDP_DELAY_CSUM csum_copy_err: UDP_INC_STATS_BH(UdpInErrors); @@ -768,7 +751,6 @@ csum_copy_err: * as some normal condition. */ return (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH; -#endif } int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) @@ -831,9 +813,9 @@ static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) * Charge it to the socket, dropping if the queue is full. 
*/ -#if defined(CONFIG_FILTER) && defined(CONFIG_UDP_DELAY_CSUM) +#if defined(CONFIG_FILTER) if (sk->filter && skb->ip_summed != CHECKSUM_UNNECESSARY) { - if ((unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum))) { + if (__udp_checksum_complete(skb)) { UDP_INC_STATS_BH(UdpInErrors); IP_INC_STATS_BH(IpInDiscards); ip_statistics[smp_processor_id()*2].IpInDelivers--; @@ -855,12 +837,6 @@ static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) return 0; } - -static inline void udp_deliver(struct sock *sk, struct sk_buff *skb) -{ - udp_queue_rcv_skb(sk, skb); -} - /* * Multicasts and broadcasts go to each listener. * @@ -889,7 +865,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh, skb1 = skb_clone(skb, GFP_ATOMIC); if(skb1) - udp_deliver(sk, skb1); + udp_queue_rcv_skb(sk, skb1); sk = sknext; } while(sknext); } else @@ -898,30 +874,25 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh, return 0; } -static int udp_checksum_verify(struct sk_buff *skb, struct udphdr *uh, - unsigned short ulen, u32 saddr, u32 daddr, - int full_csum_deferred) +/* Initialize UDP checksum. If exited with zero value (success), + * CHECKSUM_UNNECESSARY means, that no more checks are required. + * Otherwise, csum completion requires chacksumming packet body, + * including udp header and folding it to skb->csum. + */ +static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, + unsigned short ulen, u32 saddr, u32 daddr) { - if (!full_csum_deferred) { - if (uh->check) { - if (skb->ip_summed == CHECKSUM_HW && - udp_check(uh, ulen, saddr, daddr, skb->csum)) - return -1; - if (skb->ip_summed == CHECKSUM_NONE && - udp_check(uh, ulen, saddr, daddr, - csum_partial((char *)uh, ulen, 0))) - return -1; - } - } else { - if (uh->check == 0) - skb->ip_summed = CHECKSUM_UNNECESSARY; - else if (skb->ip_summed == CHECKSUM_HW) { - if (udp_check(uh, ulen, saddr, daddr, skb->csum)) - return -1; - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else if (skb->ip_summed != CHECKSUM_UNNECESSARY) - skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); - } + if (uh->check == 0) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else if (skb->ip_summed == CHECKSUM_HW) { + if (udp_check(uh, ulen, saddr, daddr, skb->csum)) + return -1; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); + /* Probably, we should checksum udp header (it should be in cache + * in any case) and data in tiny packets (< rx copybreak). + */ return 0; } @@ -961,50 +932,33 @@ int udp_rcv(struct sk_buff *skb, unsigned short len) } skb_trim(skb, ulen); - if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) { - int defer; + if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0) + goto csum_error; -#ifdef CONFIG_UDP_DELAY_CSUM - defer = 1; -#else - defer = 0; -#endif - if (udp_checksum_verify(skb, uh, ulen, saddr, daddr, defer)) - goto csum_error; + if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return udp_v4_mcast_deliver(skb, uh, saddr, daddr); - } sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex); - - if (sk == NULL) { - /* No socket. Drop packet silently, if checksum is wrong */ - if (udp_checksum_verify(skb, uh, ulen, saddr, daddr, 0)) - goto csum_error; - - UDP_INC_STATS_BH(UdpNoPorts); - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); - /* - * Hmm. We got an UDP packet to a port to which we - * don't wanna listen. Ignore it. 
- */ - kfree_skb(skb); - return(0); - } - if (udp_checksum_verify(skb, uh, ulen, saddr, daddr, -#ifdef CONFIG_UDP_DELAY_CSUM - 1 -#else - (sk->no_check & UDP_CSUM_NORCV) != 0 -#endif - )) { + if (sk != NULL) { + udp_queue_rcv_skb(sk, skb); sock_put(sk); - goto csum_error; + return 0; } - udp_deliver(sk, skb); - __sock_put(sk); - return 0; + /* No socket. Drop packet silently, if checksum is wrong */ + if (udp_checksum_complete(skb)) + goto csum_error; + + UDP_INC_STATS_BH(UdpNoPorts); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + /* + * Hmm. We got an UDP packet to a port to which we + * don't wanna listen. Ignore it. + */ + kfree_skb(skb); + return(0); csum_error: /* @@ -1090,10 +1044,6 @@ struct proto udp_prot = { udp_connect, /* connect */ udp_disconnect, /* disconnect */ NULL, /* accept */ - NULL, /* retransmit */ - NULL, /* write_wakeup */ - NULL, /* read_wakeup */ - datagram_poll, /* poll */ udp_ioctl, /* ioctl */ NULL, /* init */ NULL, /* destroy */ @@ -1107,7 +1057,5 @@ struct proto udp_prot = { udp_v4_hash, /* hash */ udp_v4_unhash, /* unhash */ udp_v4_get_port, /* good_socknum */ - 128, /* max_header */ - 0, /* retransmits */ "UDP", /* name */ }; |
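
Two short user-space sketches follow for readers who want to poke at the logic in this diff without building a kernel. Both model the patch's arithmetic only; every type, constant, and helper name in them (struct pkt, sum16(), fold(), checksum_init(), checksum_complete(), tw_pick_wheel(), the assumed HZ=100 tick values) is an illustrative stand-in, not a kernel interface.

First, the udp.c side. With the CONFIG_UDP_DELAY_CSUM conditionals removed, checksum work is split into udp_checksum_init(), which runs in udp_rcv() and handles only what is already cheap (a zero checksum field, or a hardware-computed sum) while seeding skb->csum with the pseudo-header otherwise, and udp_checksum_complete()/__udp_checksum_complete(), which walk the payload only if nobody has verified it yet. A minimal model of that split, assuming a toy big-endian 16-bit ones'-complement sum:

/*
 * Minimal user-space model of the deferred UDP checksum scheme in the
 * hunks above (udp_checksum_init / __udp_checksum_complete).  All names
 * here are illustrative stand-ins, not kernel interfaces.
 */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

enum { CSUM_NONE, CSUM_HW, CSUM_UNNECESSARY };

struct pkt {
	int summed;		/* one of the CSUM_* states above	*/
	uint32_t csum;		/* running 32-bit partial sum		*/
	const uint8_t *data;	/* UDP header + payload			*/
	size_t len;
};

/* Accumulate 16-bit big-endian words into a 32-bit sum (csum_partial-ish). */
static uint32_t sum16(const uint8_t *p, size_t len, uint32_t sum)
{
	for (; len > 1; p += 2, len -= 2)
		sum += (uint32_t)p[0] << 8 | p[1];
	if (len)
		sum += (uint32_t)p[0] << 8;
	return sum;
}

/* End-around-carry fold to 16 bits, then complement (csum_fold-ish). */
static uint16_t fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

/* Decide at receive time how much checksum work is still owed. */
static int checksum_init(struct pkt *p, uint16_t check, uint32_t pseudo_sum)
{
	if (check == 0) {			/* sender used no checksum */
		p->summed = CSUM_UNNECESSARY;
	} else if (p->summed == CSUM_HW) {	/* NIC summed the payload  */
		if (fold(pseudo_sum + p->csum) != 0)
			return -1;		/* corrupt: drop now	   */
		p->summed = CSUM_UNNECESSARY;
	} else if (p->summed != CSUM_UNNECESSARY) {
		/* Software path: stash only the pseudo-header sum and
		 * defer walking the payload until it is copied anyway.
		 */
		p->csum = pseudo_sum;
	}
	return 0;
}

/* Pay the deferred cost only if nobody has verified the packet yet. */
static int checksum_complete(const struct pkt *p)
{
	if (p->summed == CSUM_UNNECESSARY)
		return 0;
	return fold(sum16(p->data, p->len, p->csum)) != 0;
}

int main(void)
{
	/* 8-byte UDP header (checksum zeroed for now) + 4-byte payload. */
	uint8_t dgram[12] = {
		0x04, 0x00, 0x10, 0x00,	/* src port 1024, dst port 4096	*/
		0x00, 0x0c, 0x00, 0x00,	/* length 12, checksum 0	*/
		'p', 'i', 'n', 'g'
	};
	const uint8_t addrs[8] = { 192, 168, 0, 1, 192, 168, 0, 2 };
	uint32_t pseudo = sum16(addrs, sizeof(addrs),
				17 /* IPPROTO_UDP */ + sizeof(dgram));
	uint16_t check = fold(sum16(dgram, sizeof(dgram), pseudo));
	struct pkt p = { CSUM_NONE, 0, dgram, sizeof(dgram) };

	dgram[6] = check >> 8;		/* fill in the header checksum	*/
	dgram[7] = check & 0xff;

	if (checksum_init(&p, check, pseudo) == 0 && checksum_complete(&p) == 0)
		puts("deferred verification passed");
	return 0;
}

On the common path the deferred fold never runs as a separate pass: udp_recvmsg() uses copy_and_csum_toiovec(), so the payload is summed in the same sweep that copies it to user space, and a datagram aimed at a closed port is only summed right before the port-unreachable ICMP is sent.

Second, the tcp_timer.c side. tcp_tw_schedule() now rounds the requested timeout up to recycle ticks and, if the result fits in the fine-grained wheel, hangs the bucket there; otherwise it falls back to the coarse TWKILL wheel with its TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS period, clamped to the last slot. A sketch of just that slot selection, with assumed HZ=100 constants:

/*
 * Companion sketch for the tcp_tw_schedule() hunk earlier in this diff:
 * how a timewait timeout maps onto either the fine-grained recycle wheel
 * or the coarse TWKILL wheel.  The constants below are assumptions for
 * HZ=100; only the arithmetic mirrors the patch.
 */
#include <stdio.h>

#define TW_RECYCLE_TICK		7		/* assumed: 2^7 jiffies/slot */
#define TW_RECYCLE_SLOTS	32
#define TWKILL_SLOTS		8
#define TIMEWAIT_LEN		(60 * 100)	/* assumed: 60 s in jiffies  */
#define TWKILL_PERIOD		(TIMEWAIT_LEN / TWKILL_SLOTS)

/* Returns 1 if the coarse wheel is used, 0 for the recycle wheel;
 * *slot gets the relative slot within the chosen wheel.
 */
static int tw_pick_wheel(unsigned long timeo, unsigned int *slot)
{
	unsigned int s = (timeo + (1 << TW_RECYCLE_TICK) - 1) >> TW_RECYCLE_TICK;

	if (s >= TW_RECYCLE_SLOTS) {		/* too far out: coarse wheel */
		if (timeo >= TIMEWAIT_LEN) {
			s = TWKILL_SLOTS - 1;
		} else {
			s = (timeo + TWKILL_PERIOD - 1) / TWKILL_PERIOD;
			if (s >= TWKILL_SLOTS)
				s = TWKILL_SLOTS - 1;
		}
		*slot = s;
		return 1;
	}
	*slot = s;				/* fine-grained recycle wheel */
	return 0;
}

int main(void)
{
	unsigned int slot;
	unsigned long timeo;

	for (timeo = 100; timeo <= 6400; timeo *= 4) {
		int coarse = tw_pick_wheel(timeo, &slot);
		printf("timeo=%lu jiffies -> %s wheel, slot %u\n",
		       timeo, coarse ? "coarse" : "recycle", slot);
	}
	return 0;
}

The point of the two wheels, per the comment in the patch, is that peers which understand PAWS let the bucket die after roughly 3.5*RTO instead of a full MSL, so short timeouts need finer scheduling granularity than the old single TWKILL slot ring provided.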