| author | Ralf Baechle <ralf@linux-mips.org> | 2000-02-18 00:24:27 +0000 |
|---|---|---|
| committer | Ralf Baechle <ralf@linux-mips.org> | 2000-02-18 00:24:27 +0000 |
| commit | b9558d5f86c471a125abf1fb3a3882fb053b1f8c (patch) | |
| tree | 707b53ec64e740a7da87d5f36485e3cd9b1c794e /net/ipv4 | |
| parent | b3ac367c7a3e6047abe74817db27e34e759f279f (diff) | |
Merge with Linux 2.3.41.
Diffstat (limited to 'net/ipv4')

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | net/ipv4/af_inet.c | 261 |
| -rw-r--r-- | net/ipv4/arp.c | 10 |
| -rw-r--r-- | net/ipv4/ip_input.c | 13 |
| -rw-r--r-- | net/ipv4/ip_output.c | 12 |
| -rw-r--r-- | net/ipv4/ip_sockglue.c | 4 |
| -rw-r--r-- | net/ipv4/ipconfig.c | 9 |
| -rw-r--r-- | net/ipv4/proc.c | 16 |
| -rw-r--r-- | net/ipv4/raw.c | 8 |
| -rw-r--r-- | net/ipv4/route.c | 49 |
| -rw-r--r-- | net/ipv4/syncookies.c | 34 |
| -rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 34 |
| -rw-r--r-- | net/ipv4/tcp.c | 1048 |
| -rw-r--r-- | net/ipv4/tcp_input.c | 1370 |
| -rw-r--r-- | net/ipv4/tcp_ipv4.c | 951 |
| -rw-r--r-- | net/ipv4/tcp_output.c | 495 |
| -rw-r--r-- | net/ipv4/tcp_timer.c | 648 |
| -rw-r--r-- | net/ipv4/udp.c | 182 |

17 files changed, 2977 insertions, 2167 deletions
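A recurring theme in the af_inet.c and tcp.c hunks below is that the blocking paths (inet_wait_for_connect(), wait_for_tcp_connect(), wait_for_tcp_memory(), tcp_data_wait()) now take a timeout obtained from sock_sndtimeo()/sock_rcvtimeo() and call schedule_timeout() instead of sleeping unconditionally. From user space those kernel timeouts correspond to the standard SO_SNDTIMEO/SO_RCVTIMEO socket options. The following is only a minimal user-space sketch of exercising that behaviour; the peer address 192.0.2.1:7 is a placeholder and error handling is trimmed, none of it is taken from the patch itself:

```c
/* Sketch: bound blocking connect()/recv() on a TCP socket with the
 * SO_SNDTIMEO/SO_RCVTIMEO options that the reworked blocking paths honour.
 * Placeholder peer address; adjust before use.
 */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
	struct sockaddr_in peer;
	char buf[512];
	ssize_t n;

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* Cap blocking send and receive operations at 5 seconds. */
	setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv));
	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));

	memset(&peer, 0, sizeof(peer));
	peer.sin_family = AF_INET;
	peer.sin_port = htons(7);                         /* placeholder port */
	inet_pton(AF_INET, "192.0.2.1", &peer.sin_addr);  /* placeholder host */

	if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0) {
		perror("connect");  /* may also return early under SO_SNDTIMEO */
		close(fd);
		return 1;
	}

	n = recv(fd, buf, sizeof(buf), 0);
	if (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
		fprintf(stderr, "recv timed out after 5s\n");

	close(fd);
	return 0;
}
```

The kernel-side counterpart is visible in the hunks where schedule() becomes schedule_timeout(timeo) and the remaining timeout is passed back up the call chain.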
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 94fb19f92..bc2c97779 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -5,7 +5,7 @@ * * PF_INET protocol family socket handler. * - * Version: $Id: af_inet.c,v 1.101 2000/01/09 02:19:38 davem Exp $ + * Version: $Id: af_inet.c,v 1.104 2000/01/18 08:24:14 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -117,7 +117,9 @@ struct linux_mib net_statistics[NR_CPUS*2]; +#ifdef INET_REFCNT_DEBUG atomic_t inet_sock_nr; +#endif extern int raw_get_info(char *, char **, off_t, int); extern int snmp_get_info(char *, char **, off_t, int); @@ -159,8 +161,8 @@ void inet_sock_destruct(struct sock *sk) if (sk->protinfo.af_inet.opt) kfree(sk->protinfo.af_inet.opt); dst_release(sk->dst_cache); - atomic_dec(&inet_sock_nr); #ifdef INET_REFCNT_DEBUG + atomic_dec(&inet_sock_nr); printk(KERN_DEBUG "INET socket %p released, %d are still alive\n", sk, atomic_read(&inet_sock_nr)); #endif } @@ -171,32 +173,28 @@ void inet_sock_release(struct sock *sk) sk->prot->destroy(sk); /* Observation: when inet_sock_release is called, processes have - no access to socket. But net still has. - Step one, detach it from networking: - - A. Remove from hash tables. + * no access to socket. But net still has. + * Step one, detach it from networking: + * + * A. Remove from hash tables. */ sk->prot->unhash(sk); /* In this point socket cannot receive new packets, - but it is possible that some packets are in flight - because some CPU runs receiver and did hash table lookup - before we unhashed socket. They will achieve receive queue - and will be purged by socket destructor. - - Also we still have packets pending on receive - queue and probably, our own packets waiting in device queues. - sock_destroy will drain receive queue, but transmitted - packets will delay socket destruction until the last reference - will be released. + * but it is possible that some packets are in flight + * because some CPU runs receiver and did hash table lookup + * before we unhashed socket. They will achieve receive queue + * and will be purged by socket destructor. + * + * Also we still have packets pending on receive + * queue and probably, our own packets waiting in device queues. + * sock_destroy will drain receive queue, but transmitted + * packets will delay socket destruction until the last reference + * will be released. */ - write_lock_irq(&sk->callback_lock); - sk->dead=1; - sk->socket = NULL; - sk->sleep = NULL; - write_unlock_irq(&sk->callback_lock); + sock_orphan(sk); #ifdef INET_REFCNT_DEBUG if (atomic_read(&sk->refcnt) != 1) { @@ -222,8 +220,7 @@ int inet_setsockopt(struct socket *sock, int level, int optname, char *optval, int optlen) { struct sock *sk=sock->sk; - if (sk->prot->setsockopt==NULL) - return -EOPNOTSUPP; + return sk->prot->setsockopt(sk,level,optname,optval,optlen); } @@ -239,8 +236,7 @@ int inet_getsockopt(struct socket *sock, int level, int optname, char *optval, int *optlen) { struct sock *sk=sock->sk; - if (sk->prot->getsockopt==NULL) - return -EOPNOTSUPP; + return sk->prot->getsockopt(sk,level,optname,optval,optlen); } @@ -264,14 +260,6 @@ static int inet_autobind(struct sock *sk) return 0; } -/* Listening INET sockets never sleep to wait for memory, so - * it is completely silly to wake them up on queue space - * available events. So we hook them up to this dummy callback. - */ -static void inet_listen_write_space(struct sock *sk) -{ -} - /* * Move a socket into listening state. 
*/ @@ -282,12 +270,13 @@ int inet_listen(struct socket *sock, int backlog) unsigned char old_state; int err; + lock_sock(sk); + + err = -EINVAL; if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) - return -EINVAL; + goto out; - lock_sock(sk); old_state = sk->state; - err = -EINVAL; if (!((1<<old_state)&(TCPF_CLOSE|TCPF_LISTEN))) goto out; @@ -295,25 +284,9 @@ int inet_listen(struct socket *sock, int backlog) * we can only allow the backlog to be adjusted. */ if (old_state != TCP_LISTEN) { - sk->state = TCP_LISTEN; - sk->ack_backlog = 0; - if (sk->num == 0) { - if (sk->prot->get_port(sk, 0) != 0) { - sk->state = old_state; - err = -EAGAIN; - goto out; - } - sk->sport = htons(sk->num); - } else { - /* Not nice, but the simplest solution however */ - if (sk->prev) - ((struct tcp_bind_bucket*)sk->prev)->fastreuse = 0; - } - - sk_dst_reset(sk); - sk->prot->hash(sk); - sk->socket->flags |= SO_ACCEPTCON; - sk->write_space = inet_listen_write_space; + err = tcp_listen_start(sk); + if (err) + goto out; } sk->max_ack_backlog = backlog; err = 0; @@ -345,10 +318,6 @@ static int inet_create(struct socket *sock, int protocol) if (protocol && protocol != IPPROTO_TCP) goto free_and_noproto; protocol = IPPROTO_TCP; - if (ipv4_config.no_pmtu_disc) - sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT; - else - sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_WANT; prot = &tcp_prot; sock->ops = &inet_stream_ops; break; @@ -359,7 +328,6 @@ static int inet_create(struct socket *sock, int protocol) goto free_and_noproto; protocol = IPPROTO_UDP; sk->no_check = UDP_CSUM_DEFAULT; - sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT; prot=&udp_prot; sock->ops = &inet_dgram_ops; break; @@ -370,7 +338,6 @@ static int inet_create(struct socket *sock, int protocol) goto free_and_noproto; prot = &raw_prot; sk->reuse = 1; - sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT; sk->num = protocol; sock->ops = &inet_dgram_ops; if (protocol == IPPROTO_RAW) @@ -380,23 +347,22 @@ static int inet_create(struct socket *sock, int protocol) goto free_and_badtype; } + if (ipv4_config.no_pmtu_disc) + sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT; + else + sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_WANT; + sock_init_data(sock,sk); sk->destruct = inet_sock_destruct; - sk->zapped=0; -#ifdef CONFIG_TCP_NAGLE_OFF - sk->nonagle = 1; -#endif + sk->zapped = 0; sk->family = PF_INET; sk->protocol = protocol; sk->prot = prot; sk->backlog_rcv = prot->backlog_rcv; - sk->timer.data = (unsigned long)sk; - sk->timer.function = &tcp_keepalive_timer; - sk->protinfo.af_inet.ttl=sysctl_ip_default_ttl; sk->protinfo.af_inet.mc_loop=1; @@ -404,7 +370,9 @@ static int inet_create(struct socket *sock, int protocol) sk->protinfo.af_inet.mc_index=0; sk->protinfo.af_inet.mc_list=NULL; +#ifdef INET_REFCNT_DEBUG atomic_inc(&inet_sock_nr); +#endif if (sk->num) { /* It assumes that any protocol which allows @@ -469,11 +437,8 @@ int inet_release(struct socket *sock) * linger.. 
*/ timeout = 0; - if (sk->linger && !(current->flags & PF_EXITING)) { - timeout = HZ * sk->lingertime; - if (!timeout) - timeout = MAX_SCHEDULE_TIMEOUT; - } + if (sk->linger && !(current->flags & PF_EXITING)) + timeout = sk->lingertime; sock->sk = NULL; sk->prot->close(sk, timeout); } @@ -496,10 +461,6 @@ static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) return -EINVAL; chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr); - if (addr->sin_addr.s_addr != 0 && chk_addr_ret != RTN_LOCAL && - chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) { - return -EADDRNOTAVAIL; /* Source address MUST be ours! */ - } snum = ntohs(addr->sin_port); if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) @@ -555,25 +516,29 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, return sk->prot->connect(sk, (struct sockaddr *)uaddr, addr_len); } -static void inet_wait_for_connect(struct sock *sk) +static long inet_wait_for_connect(struct sock *sk, long timeo) { DECLARE_WAITQUEUE(wait, current); __set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(sk->sleep, &wait); + /* Basic assumption: if someone sets sk->err, he _must_ + * change state of the socket from TCP_SYN_*. + * Connect() does not allow to get error notifications + * without closing the socket. + */ while ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) { - if (signal_pending(current)) - break; - if (sk->err) - break; release_sock(sk); - schedule(); + timeo = schedule_timeout(timeo); lock_sock(sk); + if (signal_pending(current) || !timeo) + break; set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); remove_wait_queue(sk->sleep, &wait); + return timeo; } /* @@ -586,16 +551,16 @@ int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr, { struct sock *sk=sock->sk; int err; + long timeo; + + lock_sock(sk); if (uaddr->sa_family == AF_UNSPEC) { - lock_sock(sk); err = sk->prot->disconnect(sk, flags); sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; - release_sock(sk); - return err; + goto out; } - lock_sock(sk); switch (sock->state) { default: err = -EINVAL; @@ -604,40 +569,58 @@ int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr, err = -EISCONN; goto out; case SS_CONNECTING: - if (tcp_established(sk->state)) { - sock->state = SS_CONNECTED; - err = 0; - goto out; - } - if (sk->err) - goto sock_error; err = -EALREADY; - if (flags & O_NONBLOCK) - goto out; + /* Fall out of switch with err, set for this state */ break; case SS_UNCONNECTED: + err = -EISCONN; + if (sk->state != TCP_CLOSE) + goto out; + + err = -EAGAIN; + if (sk->num == 0) { + if (sk->prot->get_port(sk, 0) != 0) + goto out; + sk->sport = htons(sk->num); + } + err = sk->prot->connect(sk, uaddr, addr_len); if (err < 0) goto out; + sock->state = SS_CONNECTING; - } - if (sk->state > TCP_FIN_WAIT2) - goto sock_error; + /* Just entered SS_CONNECTING state; the only + * difference is that return value in non-blocking + * case is EINPROGRESS, rather than EALREADY. 
+ */ + err = -EINPROGRESS; + break; + } - err = -EINPROGRESS; - if (!tcp_established(sk->state) && (flags & O_NONBLOCK)) - goto out; + timeo = sock_sndtimeo(sk, flags&O_NONBLOCK); if ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) { - inet_wait_for_connect(sk); + /* Error code is set above */ + if (!timeo || !inet_wait_for_connect(sk, timeo)) + goto out; + err = -ERESTARTSYS; if (signal_pending(current)) goto out; } - if (sk->err && !tcp_established(sk->state)) - goto sock_error; + /* Connection was closed by RST, timeout, ICMP error + * or another process disconnected us. + */ + if (sk->state == TCP_CLOSE) + goto sock_error; + + /* sk->err may be not zero now, if RECVERR was ordered by user + * and error was received after socket entered established state. + * Hence, it is handled normally after connect() return successfully. + */ + sock->state = SS_CONNECTED; err = 0; out: @@ -647,11 +630,9 @@ out: sock_error: err = sock_error(sk) ? : -ECONNABORTED; sock->state = SS_UNCONNECTED; - if (sk->prot->disconnect(sk, O_NONBLOCK)) + if (sk->prot->disconnect(sk, flags)) sock->state = SS_DISCONNECTING; - release_sock(sk); - - return err; + goto out; } /* @@ -671,11 +652,7 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags) BUG_TRAP((1<<sk2->state)&(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_CLOSE)); - write_lock_irq(&sk2->callback_lock); - sk2->sleep = &newsock->wait; - newsock->sk = sk2; - sk2->socket = newsock; - write_unlock_irq(&sk2->callback_lock); + sock_graft(sk2, newsock); newsock->state = SS_CONNECTED; release_sock(sk2); @@ -749,7 +726,7 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, int size, int inet_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; - int err; + int err = 0; /* This should really check to make sure * the socket is a TCP socket. (WHY AC...) @@ -759,35 +736,45 @@ int inet_shutdown(struct socket *sock, int how) 2->3 */ if ((how & ~SHUTDOWN_MASK) || how==0) /* MAXINT->0 */ return -EINVAL; - if (!sk) - return -ENOTCONN; lock_sock(sk); - if (sock->state == SS_CONNECTING && tcp_established(sk->state)) - sock->state = SS_CONNECTED; - err = -ENOTCONN; - if (!tcp_connected(sk->state)) - goto out; - sk->shutdown |= how; - if (sk->prot->shutdown) - sk->prot->shutdown(sk, how); + if (sock->state == SS_CONNECTING) { + if ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE)) + sock->state = SS_DISCONNECTING; + else + sock->state = SS_CONNECTED; + } + + switch (sk->state) { + default: + sk->shutdown |= how; + if (sk->prot->shutdown) + sk->prot->shutdown(sk, how); + break; + case TCP_CLOSE: + err = -ENOTCONN; + break; + + /* Remaining two branches are temporary solution for missing + * close() in multithreaded environment. It is _not_ a good idea, + * but we have no choice until close() is repaired at VFS level. + */ + case TCP_LISTEN: + if (!(how & RCV_SHUTDOWN)) + break; + /* Fall through */ + case TCP_SYN_SENT: + err = sk->prot->disconnect(sk, O_NONBLOCK); + sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; + break; + } + /* Wake up anyone sleeping in poll. */ sk->state_change(sk); - err = 0; -out: release_sock(sk); return err; } -unsigned int inet_poll(struct file * file, struct socket *sock, poll_table *wait) -{ - struct sock *sk = sock->sk; - - if (sk->prot->poll == NULL) - return(0); - return sk->prot->poll(file, sock, wait); -} - /* * ioctl() calls you can issue on an INET socket. Most of these are * device configuration and stuff and very rarely used. 
Some ioctls @@ -909,7 +896,7 @@ struct proto_ops inet_stream_ops = { sock_no_socketpair, inet_accept, inet_getname, - inet_poll, + tcp_poll, inet_ioctl, inet_listen, inet_shutdown, diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 591f3cceb..588cdf030 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1,6 +1,6 @@ /* linux/net/inet/arp.c * - * Version: $Id: arp.c,v 1.83 1999/12/15 22:39:03 davem Exp $ + * Version: $Id: arp.c,v 1.84 2000/01/18 08:24:14 davem Exp $ * * Copyright (C) 1994 by Florian La Roche * @@ -487,7 +487,9 @@ void arp_send(int type, int ptype, u32 dest_ip, /* * Fill the device header for the ARP frame */ - dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len); + if (dev->hard_header && + dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len) < 0) + goto out; /* * Fill out the arp protocol part. @@ -552,6 +554,10 @@ void arp_send(int type, int ptype, u32 dest_ip, skb->dev = dev; dev_queue_xmit(skb); + return; + +out: + kfree_skb(skb); } static void parp_redo(struct sk_buff *skb) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 11a8c319b..23389d249 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) module. * - * Version: $Id: ip_input.c,v 1.44 2000/01/09 02:19:30 davem Exp $ + * Version: $Id: ip_input.c,v 1.45 2000/01/16 05:11:22 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -317,13 +317,12 @@ static inline int ip_rcv_finish(struct sk_buff *skb) #ifdef CONFIG_NET_CLS_ROUTE if (skb->dst->tclassid) { + struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id(); u32 idx = skb->dst->tclassid; - write_lock(&ip_rt_acct_lock); - ip_rt_acct[idx&0xFF].o_packets++; - ip_rt_acct[idx&0xFF].o_bytes+=skb->len; - ip_rt_acct[(idx>>16)&0xFF].i_packets++; - ip_rt_acct[(idx>>16)&0xFF].i_bytes+=skb->len; - write_unlock(&ip_rt_acct_lock); + st[idx&0xFF].o_packets++; + st[idx&0xFF].o_bytes+=skb->len; + st[(idx>>16)&0xFF].i_packets++; + st[(idx>>16)&0xFF].i_bytes+=skb->len; } #endif diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 59e6ff865..2a4e3cf41 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) output module. * - * Version: $Id: ip_output.c,v 1.77 2000/01/09 02:19:31 davem Exp $ + * Version: $Id: ip_output.c,v 1.78 2000/01/16 05:11:22 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -149,8 +149,8 @@ output_maybe_reroute(struct sk_buff *skb) /* * Add an ip header to a skbuff and send it out. */ -void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, - u32 saddr, u32 daddr, struct ip_options *opt) +int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, + u32 saddr, u32 daddr, struct ip_options *opt) { struct rtable *rt = (struct rtable *)skb->dst; struct iphdr *iph; @@ -182,8 +182,8 @@ void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, ip_send_check(iph); /* Send it out. 
*/ - NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - output_maybe_reroute); + return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, + output_maybe_reroute); } static inline int ip_finish_output2(struct sk_buff *skb) @@ -257,7 +257,7 @@ int ip_mc_output(struct sk_buff *skb) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) - NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, newskb, NULL, + NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL, newskb->dev, ip_dev_loopback_xmit); } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index c618689b2..90b74447f 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -5,7 +5,7 @@ * * The IP to API glue. * - * Version: $Id: ip_sockglue.c,v 1.46 2000/01/09 02:19:32 davem Exp $ + * Version: $Id: ip_sockglue.c,v 1.47 2000/01/16 05:11:23 davem Exp $ * * Authors: see ip.c * @@ -415,7 +415,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (sk->family == PF_INET || - ((tcp_connected(sk->state) || sk->state == TCP_SYN_SENT) + (!((1<<sk->state)&(TCPF_LISTEN|TCPF_CLOSE)) && sk->daddr != LOOPBACK4_IPV6)) { #endif if (opt) diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 4d2195312..d4d556cb5 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -534,7 +534,14 @@ static void __init ic_bootp_send_if(struct ic_device *d, u32 jiffies) /* Construct BOOTP header */ b->op = BOOTP_REQUEST; - b->htype = dev->type; + if (dev->type < 256) /* check for false types */ + b->htype = dev->type; + else if (dev->type == ARPHRD_IEEE802_TR) /* fix for token ring */ + b->htype = ARPHRD_IEEE802; + else { + printk("Unknown ARP type 0x%04x for device %s\n", dev->type, dev->name); + b->htype = dev->type; /* can cause undefined behavior */ + } b->hlen = dev->addr_len; memcpy(b->hw_addr, dev->dev_addr, dev->addr_len); b->secs = htons(jiffies / HZ); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index b3e86f58c..d6a7c57f5 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -7,7 +7,7 @@ * PROC file system. It is mainly used for debugging and * statistics. * - * Version: $Id: proc.c,v 1.38 2000/01/09 02:19:30 davem Exp $ + * Version: $Id: proc.c,v 1.41 2000/01/21 23:45:57 davem Exp $ * * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Gerald J. 
Heim, <heim@peanuts.informatik.uni-tuebingen.de> @@ -71,8 +71,9 @@ int afinet_get_info(char *buffer, char **start, off_t offset, int length) int len = socket_get_info(buffer,start,offset,length); - len += sprintf(buffer+len,"TCP: inuse %d\n", - fold_prot_inuse(&tcp_prot)); + len += sprintf(buffer+len,"TCP: inuse %d orphan %d tw %d\n", + fold_prot_inuse(&tcp_prot), + atomic_read(&tcp_orphan_count), tcp_tw_count); len += sprintf(buffer+len,"UDP: inuse %d\n", fold_prot_inuse(&udp_prot)); len += sprintf(buffer+len,"RAW: inuse %d\n", @@ -163,7 +164,14 @@ int netstat_get_info(char *buffer, char **start, off_t offset, int length) len = sprintf(buffer, "TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed" " EmbryonicRsts PruneCalled RcvPruned OfoPruned" - " OutOfWindowIcmps LockDroppedIcmps\n" + " OutOfWindowIcmps LockDroppedIcmps" + " TW TWRecycled TWKilled" + " PAWSPassive PAWSActive PAWSEstab" + " DelayedACKs DelayedACKLocked DelayedACKLost" + " ListenOverflows ListenDrops" + " TCPPrequeued TCPDirectCopyFromBacklog" + " TCPDirectCopyFromPrequeue TCPPrequeueDropped" + " TCPHPHits TCPHPHitsToUser\n" "TcpExt:"); for (i=0; i<offsetof(struct linux_mib, __pad)/sizeof(unsigned long); i++) len += sprintf(buffer+len, " %lu", fold_field((unsigned long*)net_statistics, sizeof(struct linux_mib), i)); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 6fc5e59c5..e9aa1952a 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -5,7 +5,7 @@ * * RAW - implementation of IP "raw" sockets. * - * Version: $Id: raw.c,v 1.46 2000/01/09 02:19:30 davem Exp $ + * Version: $Id: raw.c,v 1.48 2000/01/18 08:24:15 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -648,10 +648,6 @@ struct proto raw_prot = { udp_connect, /* connect */ udp_disconnect, /* disconnect */ NULL, /* accept */ - NULL, /* retransmit */ - NULL, /* write_wakeup */ - NULL, /* read_wakeup */ - datagram_poll, /* poll */ #ifdef CONFIG_IP_MROUTE ipmr_ioctl, /* ioctl */ #else @@ -669,7 +665,5 @@ struct proto raw_prot = { raw_v4_hash, /* hash */ raw_v4_unhash, /* unhash */ NULL, /* get_port */ - 128, /* max_header */ - 0, /* retransmits */ "RAW", /* name */ }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index add42730d..bbc6ec111 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -5,7 +5,7 @@ * * ROUTE - implementation of the IP router. * - * Version: $Id: route.c,v 1.78 2000/01/13 00:06:58 davem Exp $ + * Version: $Id: route.c,v 1.80 2000/01/21 06:37:27 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -1178,6 +1178,7 @@ ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, rth->u.dst.output= ip_rt_bug; atomic_set(&rth->u.dst.__refcnt, 1); + rth->u.dst.flags= DST_HOST; rth->key.dst = daddr; rth->rt_dst = daddr; rth->key.tos = tos; @@ -1385,6 +1386,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, goto e_nobufs; atomic_set(&rth->u.dst.__refcnt, 1); + rth->u.dst.flags= DST_HOST; rth->key.dst = daddr; rth->rt_dst = daddr; rth->key.tos = tos; @@ -1462,6 +1464,7 @@ local_input: rth->u.dst.output= ip_rt_bug; atomic_set(&rth->u.dst.__refcnt, 1); + rth->u.dst.flags= DST_HOST; rth->key.dst = daddr; rth->rt_dst = daddr; rth->key.tos = tos; @@ -1815,6 +1818,7 @@ make_route: goto e_nobufs; atomic_set(&rth->u.dst.__refcnt, 1); + rth->u.dst.flags= DST_HOST; rth->key.dst = daddr; rth->key.tos = tos; rth->key.src = saddr; @@ -2208,8 +2212,7 @@ ctl_table ipv4_route_table[] = { #endif #ifdef CONFIG_NET_CLS_ROUTE -struct ip_rt_acct ip_rt_acct[256]; -rwlock_t ip_rt_acct_lock = RW_LOCK_UNLOCKED; +struct ip_rt_acct *ip_rt_acct; #ifdef CONFIG_PROC_FS static int ip_rt_acct_read(char *buffer, char **start, off_t offset, @@ -2217,14 +2220,34 @@ static int ip_rt_acct_read(char *buffer, char **start, off_t offset, { *start=buffer; - if (offset + length > sizeof(ip_rt_acct)) { - length = sizeof(ip_rt_acct) - offset; + if ((offset&3) || (length&3)) + return -EIO; + + if (offset + length >= sizeof(struct ip_rt_acct)*256) { + length = sizeof(struct ip_rt_acct)*256 - offset; *eof = 1; } if (length > 0) { - read_lock_bh(&ip_rt_acct_lock); - memcpy(buffer, ((u8*)&ip_rt_acct)+offset, length); - read_unlock_bh(&ip_rt_acct_lock); + u32 *dst = (u32*)buffer; + u32 *src = (u32*)(((u8*)ip_rt_acct) + offset); + + memcpy(dst, src, length); + +#ifdef __SMP__ + if (smp_num_cpus > 1) { + int i; + int cnt = length/4; + + for (i=1; i<smp_num_cpus; i++) { + int k; + + src += (256/4)*sizeof(struct ip_rt_acct); + + for (k=0; k<cnt; k++) + dst[k] += src[k]; + } + } +#endif return length; } return 0; @@ -2236,6 +2259,16 @@ void __init ip_rt_init(void) { int i, order, goal; +#ifdef CONFIG_NET_CLS_ROUTE + for (order=0; + (PAGE_SIZE<<order) < 256*sizeof(ip_rt_acct)*smp_num_cpus; order++) + /* NOTHING */; + ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order); + if (!ip_rt_acct) + panic("IP: failed to allocate ip_rt_acct\n"); + memset(ip_rt_acct, 0, PAGE_SIZE<<order); +#endif + ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0, SLAB_HWCACHE_ALIGN, diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index e82233cfd..d218c3bdb 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -9,7 +9,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * $Id: syncookies.c,v 1.10 2000/01/09 02:19:35 davem Exp $ + * $Id: syncookies.c,v 1.11 2000/01/16 05:11:27 davem Exp $ * * Missing: IPv6 support. */ @@ -102,23 +102,16 @@ static inline struct sock * get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct open_request *req, struct dst_entry *dst) { - struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sock *child; - /* Oops! It was missing, syn_recv_sock decreases it. 
*/ - tp->syn_backlog++; + child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); + if (child) + tcp_acceptq_queue(sk, req, child); + else + tcp_openreq_free(req); - sk = tp->af_specific->syn_recv_sock(sk, skb, req, dst); - if (sk) { - req->sk = sk; - - /* Queue up for accept() */ - tcp_synq_queue(tp, req); - } else { - tp->syn_backlog--; - req->class->destructor(req); - tcp_openreq_free(req); - } - return sk; + return child; } struct sock * @@ -171,9 +164,9 @@ cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt) } } } - + req->snd_wscale = req->rcv_wscale = req->tstamp_ok = 0; - req->wscale_ok = 0; + req->wscale_ok = req->sack_ok = 0; req->expires = 0UL; req->retrans = 0; @@ -189,8 +182,8 @@ cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt) req->af.v4_req.loc_addr, sk->protinfo.af_inet.tos | RTO_CONN, 0)) { - tcp_openreq_free(req); - return NULL; + tcp_openreq_free(req); + return NULL; } /* Try to redo what tcp_v4_send_synack did. */ @@ -198,6 +191,7 @@ cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt) tcp_select_initial_window(tcp_full_space(sk),req->mss, &req->rcv_wnd, &req->window_clamp, 0, &rcv_wscale); + /* BTW win scale with syncookies is 0 by definition */ req->rcv_wscale = rcv_wscale; return get_cookie_sock(sk, skb, req, &rt->u.dst); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 9465e4021..d9416525b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1,7 +1,7 @@ /* * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem. * - * $Id: sysctl_net_ipv4.c,v 1.42 2000/01/09 02:19:37 davem Exp $ + * $Id: sysctl_net_ipv4.c,v 1.43 2000/01/16 05:11:27 davem Exp $ * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net/ipv4 directory entry (empty =) ). 
[MS] @@ -41,26 +41,6 @@ extern int sysctl_ipfrag_time; /* From ip_output.c */ extern int sysctl_ip_dynaddr; -/* From ip_masq.c */ -extern int sysctl_ip_masq_debug; - -extern int sysctl_tcp_timestamps; -extern int sysctl_tcp_window_scaling; -extern int sysctl_tcp_sack; -extern int sysctl_tcp_retrans_collapse; -extern int sysctl_tcp_keepalive_time; -extern int sysctl_tcp_keepalive_probes; -extern int sysctl_tcp_retries1; -extern int sysctl_tcp_retries2; -extern int sysctl_tcp_fin_timeout; -extern int sysctl_tcp_syncookies; -extern int sysctl_tcp_syn_retries; -extern int sysctl_tcp_stdurg; -extern int sysctl_tcp_rfc1337; -extern int sysctl_tcp_syn_taildrop; -extern int sysctl_max_syn_backlog; -extern int sysctl_tcp_tw_recycle; - /* From icmp.c */ extern int sysctl_icmp_destunreach_time; extern int sysctl_icmp_timeexceed_time; @@ -142,6 +122,12 @@ ctl_table ipv4_table[] = { &proc_dointvec}, {NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries", &sysctl_tcp_syn_retries, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_SYNACK_RETRIES, "tcp_synack_retries", + &sysctl_tcp_synack_retries, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_MAX_ORPHANS, "tcp_max_orphans", + &sysctl_tcp_max_orphans, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets", + &sysctl_tcp_max_tw_buckets, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh", &sysctl_ipfrag_high_thresh, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh", @@ -172,10 +158,10 @@ ctl_table ipv4_table[] = { {NET_TCP_SYNCOOKIES, "tcp_syncookies", &sysctl_tcp_syncookies, sizeof(int), 0644, NULL, &proc_dointvec}, #endif -#ifdef CONFIG_TCP_TW_RECYCLE {NET_TCP_TW_RECYCLE, "tcp_tw_recycle", &sysctl_tcp_tw_recycle, sizeof(int), 0644, NULL, &proc_dointvec}, -#endif + {NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow", + &sysctl_tcp_abort_on_overflow, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_TCP_STDURG, "tcp_stdurg", &sysctl_tcp_stdurg, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_TCP_RFC1337, "tcp_rfc1337", &sysctl_tcp_rfc1337, @@ -221,6 +207,8 @@ ctl_table ipv4_table[] = { {NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime", &inet_peer_gc_maxtime, sizeof(int), 0644, NULL, &proc_dointvec_jiffies, &sysctl_jiffies}, + {NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries", + &sysctl_tcp_orphan_retries, sizeof(int), 0644, NULL, &proc_dointvec}, {0} }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8e24e19a4..e01892326 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.153 2000/01/09 02:19:33 davem Exp $ + * Version: $Id: tcp.c,v 1.160 2000/01/24 18:40:32 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -202,6 +202,8 @@ * Eric Schenk : Fix fast close down bug with * shutdown() followed by close(). * Andi Kleen : Make poll agree with SIGIO + * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and + * lingertime == 0 (RFC 793 ABORT Call) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -432,113 +434,14 @@ kmem_cache_t *tcp_openreq_cachep; kmem_cache_t *tcp_bucket_cachep; kmem_cache_t *tcp_timewait_cachep; -/* - * Find someone to 'accept'. Must be called with - * the listening socket locked. 
- */ - -static struct open_request *tcp_find_established(struct tcp_opt *tp, - struct open_request **prevp) -{ - struct open_request *req = tp->syn_wait_queue; - struct open_request *prev = (struct open_request *)&tp->syn_wait_queue; - while(req) { - if (req->sk) { - if((1 << req->sk->state) & - ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) - break; - } - prev = req; - req = req->dl_next; - } - *prevp = prev; - return req; -} - -/* - * Walk down the receive queue counting readable data. - * - * Must be called with the socket lock held. - */ - -static int tcp_readable(struct sock *sk) -{ - unsigned long counted; - unsigned long amount; - struct sk_buff *skb; - int sum; - - SOCK_DEBUG(sk, "tcp_readable: %p - ",sk); - - skb = skb_peek(&sk->receive_queue); - if (skb == NULL) { - SOCK_DEBUG(sk, "empty\n"); - return(0); - } - - counted = sk->tp_pinfo.af_tcp.copied_seq; /* Where we are at the moment */ - amount = 0; - - /* Do until a push or until we are out of data. */ - do { - /* Found a hole so stops here. */ - if (before(counted, TCP_SKB_CB(skb)->seq)) /* should not happen */ - break; - - /* Length - header but start from where we are up to - * avoid overlaps. - */ - sum = skb->len - (counted - TCP_SKB_CB(skb)->seq); - if (sum >= 0) { - /* Add it up, move on. */ - amount += sum; - counted += sum; - if (skb->h.th->syn) - counted++; - } - - /* Don't count urg data ... but do it in the right place! - * Consider: "old_data (ptr is here) URG PUSH data" - * The old code would stop at the first push because - * it counted the urg (amount==1) and then does amount-- - * *after* the loop. This means tcp_readable() always - * returned zero if any URG PUSH was in the queue, even - * though there was normal data available. If we subtract - * the urg data right here, we even get it to work for more - * than one URG PUSH skb without normal data. - * This means that poll() finally works now with urg data - * in the queue. Note that rlogin was never affected - * because it doesn't use poll(); it uses two processes - * and a blocking read(). And the queue scan in tcp_read() - * was correct. Mike <pall@rz.uni-karlsruhe.de> - */ - - /* Don't count urg data. */ - if (skb->h.th->urg) - amount--; -#if 0 - if (amount && skb->h.th->psh) break; -#endif - skb = skb->next; - } while(skb != (struct sk_buff *)&sk->receive_queue); - - SOCK_DEBUG(sk, "got %lu bytes.\n",amount); - return(amount); -} +atomic_t tcp_orphan_count = ATOMIC_INIT(0); /* * LISTEN is a special case for poll.. */ -static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait) +static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait) { - struct open_request *req, *dummy; - - lock_sock(sk); - req = tcp_find_established(&sk->tp_pinfo.af_tcp, &dummy); - release_sock(sk); - if (req) - return POLLIN | POLLRDNORM; - return 0; + return sk->tp_pinfo.af_tcp.accept_queue ? (POLLIN | POLLRDNORM) : 0; } /* @@ -585,9 +488,25 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) * if you don't tell them that something has hung up! * * Check-me. + * + * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and + * our fs/select.c). It means that after we received EOF, + * poll always returns immediately, making impossible poll() on write() + * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP + * if and only if shutdown has been made in both directions. + * Actually, it is interesting to look how Solaris and DUX + * solve this dilemma. 
I would prefer, if PULLHUP were maskable, + * then we could set it on SND_SHUTDOWN. BTW examples given + * in Stevens' books assume exactly this behaviour, it explains + * why PULLHUP is incompatible with POLLOUT. --ANK + * + * NOTE. Check for TCP_CLOSE is added. The goal is to prevent + * blocking on fresh not-connected or disconnected socket. --ANK */ - if (sk->shutdown & RCV_SHUTDOWN) + if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE) mask |= POLLHUP; + if (sk->shutdown & RCV_SHUTDOWN) + mask |= POLLIN | POLLRDNORM; /* Connected? */ if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) { @@ -605,7 +524,7 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) } } - if (tp->urg_data & URG_VALID) + if (tp->urg_data & TCP_URG_VALID) mask |= POLLPRI; } return mask; @@ -631,32 +550,48 @@ void tcp_write_space(struct sock *sk) read_unlock(&sk->callback_lock); } +/* Listening TCP sockets never sleep to wait for memory, so + * it is completely silly to wake them up on queue space + * available events. So we hook them up to this dummy callback. + */ +static void tcp_listen_write_space(struct sock *sk) +{ +} int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int answ; switch(cmd) { - case TIOCINQ: -#ifdef FIXME /* FIXME: */ - case FIONREAD: -#endif + case SIOCINQ: if (sk->state == TCP_LISTEN) return(-EINVAL); + lock_sock(sk); - answ = tcp_readable(sk); + if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV)) + answ = 0; + else if (sk->urginline || !tp->urg_data || + before(tp->urg_seq,tp->copied_seq) || + !before(tp->urg_seq,tp->rcv_nxt)) + answ = tp->rcv_nxt - tp->copied_seq; + else + answ = tp->urg_seq - tp->copied_seq; release_sock(sk); break; case SIOCATMARK: { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); answ = tp->urg_data && tp->urg_seq == tp->copied_seq; break; } - case TIOCOUTQ: + case SIOCOUTQ: if (sk->state == TCP_LISTEN) return(-EINVAL); - answ = sock_wspace(sk); + + if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV)) + answ = 0; + else + answ = tp->write_seq - tp->snd_una; break; default: return(-ENOIOCTLCMD); @@ -665,12 +600,131 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) return put_user(answ, (int *)arg); } + +int tcp_listen_start(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct tcp_listen_opt *lopt; + + sk->max_ack_backlog = 0; + sk->ack_backlog = 0; + tp->accept_queue = NULL; + tp->syn_wait_lock = RW_LOCK_UNLOCKED; + + lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL); + if (!lopt) + return -ENOMEM; + + memset(lopt, 0, sizeof(struct tcp_listen_opt)); + for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++) + if ((1<<lopt->max_qlen_log) >= sysctl_max_syn_backlog) + break; + + write_lock_bh(&tp->syn_wait_lock); + tp->listen_opt = lopt; + write_unlock_bh(&tp->syn_wait_lock); + + sk->state = TCP_LISTEN; + if (sk->num == 0) { + if (sk->prot->get_port(sk, 0) != 0) { + sk->state = TCP_CLOSE; + write_lock_bh(&tp->syn_wait_lock); + tp->listen_opt = NULL; + write_unlock_bh(&tp->syn_wait_lock); + kfree(lopt); + return -EAGAIN; + } + sk->sport = htons(sk->num); + } else { + if (sk->prev) + ((struct tcp_bind_bucket*)sk->prev)->fastreuse = 0; + } + + sk_dst_reset(sk); + sk->prot->hash(sk); + sk->socket->flags |= SO_ACCEPTCON; + sk->write_space = tcp_listen_write_space; + + return 0; +} + +/* + * This routine closes sockets which have been at least partially + * opened, but not yet accepted. 
+ */ + +static void tcp_listen_stop (struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct tcp_listen_opt *lopt = tp->listen_opt; + struct open_request *acc_req = tp->accept_queue; + struct open_request *req; + int i; + + tcp_delete_keepalive_timer(sk); + + /* make all the listen_opt local to us */ + write_lock_bh(&tp->syn_wait_lock); + tp->listen_opt =NULL; + write_unlock_bh(&tp->syn_wait_lock); + tp->accept_queue = NULL; + + if (lopt->qlen) { + for (i=0; i<TCP_SYNQ_HSIZE; i++) { + while ((req = lopt->syn_table[i]) != NULL) { + lopt->syn_table[i] = req->dl_next; + lopt->qlen--; + tcp_openreq_free(req); + + /* Following specs, it would be better either to send FIN + * (and enter FIN-WAIT-1, it is normal close) + * or to send active reset (abort). + * Certainly, it is pretty dangerous while synflood, but it is + * bad justification for our negligence 8) + * To be honest, we are not able to make either + * of the variants now. --ANK + */ + } + } + } + BUG_TRAP(lopt->qlen == 0); + + kfree(lopt); + + while ((req=acc_req) != NULL) { + struct sock *child = req->sk; + + acc_req = req->dl_next; + + local_bh_disable(); + bh_lock_sock(child); + BUG_TRAP(child->lock.users==0); + sock_hold(child); + + tcp_disconnect(child, O_NONBLOCK); + + sock_orphan(child); + + atomic_inc(&tcp_orphan_count); + + tcp_destroy_sock(child); + + bh_unlock_sock(child); + local_bh_enable(); + sock_put(child); + + tcp_acceptq_removed(sk); + tcp_openreq_fastfree(req); + } + BUG_TRAP(sk->ack_backlog == 0); +} + /* * Wait for a socket to get into the connected state * * Note: Must be called with the socket locked. */ -static int wait_for_tcp_connect(struct sock * sk, int flags) +static int wait_for_tcp_connect(struct sock * sk, int flags, long *timeo_p) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -684,7 +738,7 @@ static int wait_for_tcp_connect(struct sock * sk, int flags) send_sig(SIGPIPE, tsk, 0); return -EPIPE; } - if(flags & MSG_DONTWAIT) + if(!*timeo_p) return -EAGAIN; if(signal_pending(tsk)) return -ERESTARTSYS; @@ -694,7 +748,7 @@ static int wait_for_tcp_connect(struct sock * sk, int flags) sk->tp_pinfo.af_tcp.write_pending++; release_sock(sk); - schedule(); + *timeo_p = schedule_timeout(*timeo_p); lock_sock(sk); __set_task_state(tsk, TASK_RUNNING); @@ -712,7 +766,7 @@ static inline int tcp_memory_free(struct sock *sk) /* * Wait for more memory for a socket */ -static void wait_for_tcp_memory(struct sock * sk) +static long wait_for_tcp_memory(struct sock * sk, long timeo) { if (!tcp_memory_free(sk)) { DECLARE_WAITQUEUE(wait, current); @@ -732,12 +786,13 @@ static void wait_for_tcp_memory(struct sock * sk) break; release_sock(sk); if (!tcp_memory_free(sk)) - schedule(); + timeo = schedule_timeout(timeo); lock_sock(sk); } current->state = TASK_RUNNING; remove_wait_queue(sk->sleep, &wait); } + return timeo; } /* When all user supplied data has been queued set the PSH bit */ @@ -746,11 +801,9 @@ static void wait_for_tcp_memory(struct sock * sk) /* * This routine copies from a user buffer into a socket, * and starts the transmit system. - * - * Note: must be called with the socket locked. */ -int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) +int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size) { struct iovec *iov; struct tcp_opt *tp; @@ -758,15 +811,22 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) int iovlen, flags; int mss_now; int err, copied; + long timeo; err = 0; tp = &(sk->tp_pinfo.af_tcp); - /* Wait for a connection to finish. 
*/ + lock_sock(sk); + TCP_CHECK_TIMER(sk); + flags = msg->msg_flags; + + timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT); + + /* Wait for a connection to finish. */ if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) - if((err = wait_for_tcp_connect(sk, flags)) != 0) - goto out; + if((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0) + goto out_unlock; /* This should be in poll */ sk->socket->flags &= ~SO_NOSPACE; /* clear SIGIO XXX */ @@ -777,7 +837,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) iovlen = msg->msg_iovlen; iov = msg->msg_iov; copied = 0; - + while(--iovlen >= 0) { int seglen=iov->iov_len; unsigned char * from=iov->iov_base; @@ -785,7 +845,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) iov++; while(seglen > 0) { - int copy, tmp, queue_it, psh; + int copy, tmp, queue_it; if (err) goto do_fault2; @@ -811,8 +871,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) * welcome. */ if (skb_tailroom(skb) > 0 && - (mss_now - copy) > 0 && - tp->snd_nxt < TCP_SKB_CB(skb)->end_seq) { + (mss_now - copy) > 0) { int last_byte_was_odd = (copy % 4); copy = mss_now - copy; @@ -855,34 +914,17 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) } } - /* We also need to worry about the window. If - * window < 1/2 the maximum window we've seen - * from this host, don't use it. This is - * sender side silly window prevention, as - * specified in RFC1122. (Note that this is - * different than earlier versions of SWS - * prevention, e.g. RFC813.). What we - * actually do is use the whole MSS. Since - * the results in the right edge of the packet - * being outside the window, it will be queued - * for later rather than sent. + /* A chunk was here doing something strange + * with psh etc. It is deleted, because it was + * evident non-sense. --ANK */ - psh = 0; - copy = tp->snd_wnd - (tp->snd_nxt - tp->snd_una); - if(copy > (tp->max_window >> 1)) { - copy = min(copy, mss_now); - psh = 1; - } else { - copy = mss_now; - } - if(copy > seglen) - copy = seglen; + + copy = min(seglen, mss_now); /* Determine how large of a buffer to allocate. */ - tmp = MAX_HEADER + sk->prot->max_header; - if (copy < min(mss_now, tp->max_window >> 1) && - !(flags & MSG_OOB)) { - tmp += min(mss_now, tp->max_window); + tmp = MAX_TCP_HEADER + 15; + if (copy < mss_now && !(flags & MSG_OOB)) { + tmp += mss_now; /* What is happening here is that we want to * tack on later members of the users iovec @@ -901,7 +943,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) /* If we didn't get any memory, we need to sleep. */ if (skb == NULL) { sk->socket->flags |= SO_NOSPACE; - if (flags&MSG_DONTWAIT) { + if (!timeo) { err = -EAGAIN; goto do_interrupted; } @@ -909,8 +951,8 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) err = -ERESTARTSYS; goto do_interrupted; } - tcp_push_pending_frames(sk, tp); - wait_for_tcp_memory(sk); + __tcp_push_pending_frames(sk, tp, mss_now); + timeo = wait_for_tcp_memory(sk, timeo); /* If SACK's were formed or PMTU events happened, * we must find out about it. @@ -923,7 +965,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) /* Prepare control bits for TCP header creation engine. */ TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | - ((PSH_NEEDED || psh) ? + ((PSH_NEEDED) ? TCPCB_FLAG_PSH : 0)); TCP_SKB_CB(skb)->sacked = 0; if (flags & MSG_OOB) { @@ -936,7 +978,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) * TCP+IP+DEV headers are SKB_PUSH()'d beneath. * Reserve header space and checksum the data. 
*/ - skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb_reserve(skb, MAX_TCP_HEADER); skb->csum = csum_and_copy_from_user(from, skb_put(skb, copy), copy, 0, &err); @@ -950,7 +992,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy; /* This advances tp->write_seq for us. */ - tcp_send_skb(sk, skb, queue_it); + tcp_send_skb(sk, skb, queue_it, mss_now); } } sk->err = 0; @@ -981,63 +1023,39 @@ do_fault: do_fault2: err = -EFAULT; out: - tcp_push_pending_frames(sk, tp); + __tcp_push_pending_frames(sk, tp, mss_now); + TCP_CHECK_TIMER(sk); +out_unlock: + release_sock(sk); return err; } #undef PSH_NEEDED /* - * Send an ack if one is backlogged at this point. Ought to merge - * this with tcp_send_ack(). - * This is called for delayed acks also. - */ - -void tcp_read_wakeup(struct sock *sk) -{ - /* If we're closed, don't send an ack, or we'll get a RST - * from the closed destination. - */ - if (sk->state != TCP_CLOSE) - tcp_send_ack(sk); -} - -/* * Handle reading urgent data. BSD has very simple semantics for * this, no blocking and very strange errors 8) */ -static int tcp_recv_urg(struct sock * sk, int nonblock, +static int tcp_recv_urg(struct sock * sk, long timeo, struct msghdr *msg, int len, int flags, int *addr_len) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* No URG data to read. */ - if (sk->urginline || !tp->urg_data || tp->urg_data == URG_READ) + if (sk->urginline || !tp->urg_data || tp->urg_data == TCP_URG_READ) return -EINVAL; /* Yes this is right ! */ if (sk->done) return -ENOTCONN; - if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN)) { - sk->done = 1; - return 0; - } - - if (tp->urg_data & URG_VALID) { + if (tp->urg_data & TCP_URG_VALID) { int err = 0; char c = tp->urg_data; if (!(flags & MSG_PEEK)) - tp->urg_data = URG_READ; - - if(msg->msg_name) - tp->af_specific->addr2sockaddr(sk, (struct sockaddr *) - msg->msg_name); - - if(addr_len) - *addr_len = tp->af_specific->sockaddr_len; + tp->urg_data = TCP_URG_READ; /* Read urgent data. */ msg->msg_flags|=MSG_OOB; @@ -1051,6 +1069,10 @@ static int tcp_recv_urg(struct sock * sk, int nonblock, return err ? -EFAULT : len; } + /* Do not set sk->done, it is set only by normal data receive */ + if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN)) + return 0; + /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and * the available implementations agree in this case: * this call should never block, independent of the @@ -1069,6 +1091,8 @@ static int tcp_recv_urg(struct sock * sk, int nonblock, static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb) { __skb_unlink(skb, &sk->receive_queue); + BUG_TRAP(atomic_read(&skb->users) == 1); + /* Well, if I missed something then punishment will be terrible oops. */ __kfree_skb(skb); } @@ -1080,22 +1104,34 @@ static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb) */ static void cleanup_rbuf(struct sock *sk, int copied) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; + int time_to_ack; /* NOTE! The socket must be locked, so that we don't get * a messed-up receive queue. */ while ((skb=skb_peek(&sk->receive_queue)) != NULL) { - if (!skb->used || atomic_read(&skb->users) > 1) + if (!skb->used) break; tcp_eat_skb(sk, skb); } + /* Delayed ACKs frequently hit locked sockets during bulk receive. 
*/ + time_to_ack = tp->ack.blocked && tp->ack.pending; +#if 1/*def CONFIG_TCP_MORE_COARSE_ACKS*/ + if (tp->ack.pending && + (tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss) + time_to_ack = 1; +#endif + /* We send an ACK if we can now advertise a non-zero window * which has been raised "significantly". + * + * Even if window raised up to infinity, do not send window open ACK + * in states, where we will not receive more. It is useless. */ - if(copied > 0) { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + if(copied > 0 && !time_to_ack && !(sk->shutdown&RCV_SHUTDOWN)) { __u32 rcv_window_now = tcp_receive_window(tp); __u32 new_window = __tcp_select_window(sk); @@ -1106,16 +1142,20 @@ static void cleanup_rbuf(struct sock *sk, int copied) * which don't advertize a larger window. */ if((new_window && (new_window >= rcv_window_now * 2)) && - ((rcv_window_now + tp->mss_cache) <= tp->window_clamp)) - tcp_read_wakeup(sk); + ((rcv_window_now + tp->ack.rcv_mss) <= tp->window_clamp)) + time_to_ack = 1; } + if (time_to_ack) + tcp_send_ack(sk); } /* Now socket state including sk->err is changed only under lock, - hence we should check only pending signals. + * hence we may omit checks after joining wait queue. + * We check receive queue before schedule() only as optimization; + * it is very likely that release_sock() added new data. */ -static void tcp_data_wait(struct sock *sk) +static long tcp_data_wait(struct sock *sk, long timeo) { DECLARE_WAITQUEUE(wait, current); @@ -1127,17 +1167,39 @@ static void tcp_data_wait(struct sock *sk) release_sock(sk); if (skb_queue_empty(&sk->receive_queue)) - schedule(); + timeo = schedule_timeout(timeo); lock_sock(sk); sk->socket->flags &= ~SO_WAITDATA; remove_wait_queue(sk->sleep, &wait); __set_current_state(TASK_RUNNING); + return timeo; +} + +static void tcp_prequeue_process(struct sock *sk) +{ + struct sk_buff *skb; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + net_statistics[smp_processor_id()*2+1].TCPPrequeued += skb_queue_len(&tp->ucopy.prequeue); + + /* RX process wants to run with disabled BHs, though it is not necessary */ + local_bh_disable(); + while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) + sk->backlog_rcv(sk, skb); + local_bh_enable(); + + /* Clear memory counter. */ + tp->ucopy.memory = 0; } /* * This routine copies from a sock struct into the user buffer. + * + * Technical note: in 2.3 we work on _locked_ socket, so that + * tricks with *seq access order and skb->users are not required. + * Probably, code can be easily improved even more. */ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, @@ -1146,13 +1208,18 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int copied = 0; u32 peek_seq; - volatile u32 *seq; /* So gcc doesn't overoptimise */ + u32 *seq; unsigned long used; int err; - int target = 1; /* Read at least this many bytes */ + int target; /* Read at least this many bytes */ + long timeo; + struct task_struct *user_recv = NULL; lock_sock(sk); + TCP_CHECK_TIMER(sk); + + if (sk->err) goto out_err; @@ -1160,24 +1227,20 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, if (sk->state == TCP_LISTEN) goto out; + timeo = sock_rcvtimeo(sk, nonblock); + /* Urgent data needs to be handled specially. */ if (flags & MSG_OOB) goto recv_urg; - /* Copying sequence to update. This is volatile to handle - * the multi-reader case neatly (memcpy_to/fromfs might be - * inline and thus not flush cached variables otherwise). 
- */ - peek_seq = tp->copied_seq; seq = &tp->copied_seq; - if (flags & MSG_PEEK) + if (flags & MSG_PEEK) { + peek_seq = tp->copied_seq; seq = &peek_seq; + } - /* Handle the POSIX bogosity MSG_WAITALL. */ - if (flags & MSG_WAITALL) - target=len; + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); - /* * BUG BUG BUG * This violates 1003.1g compliance. We must wait for @@ -1200,7 +1263,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, if (copied) break; copied = -ERESTARTSYS; - if (nonblock) + if (!timeo) copied = -EAGAIN; break; } @@ -1232,47 +1295,128 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, skb = skb->next; } while (skb != (struct sk_buff *)&sk->receive_queue); - if (copied >= target) + /* Well, if we have backlog, try to process it now yet. */ + + if (copied >= target && sk->backlog.tail == NULL) break; - if (sk->err && !(flags&MSG_PEEK)) { - if (!copied) + if (copied) { + if (sk->err || + sk->state == TCP_CLOSE || + (sk->shutdown & RCV_SHUTDOWN) || + !timeo) + break; + } else { + if (sk->err) { copied = sock_error(sk); - break; - } + break; + } - if (sk->shutdown & RCV_SHUTDOWN) { - sk->done = 1; - break; - } + if (sk->done) { + copied = -ENOTCONN; + break; + } - if (sk->state == TCP_CLOSE) { - if (!sk->done) { - sk->done = 1; + if (sk->state == TCP_CLOSE) { + if (!(flags&MSG_PEEK)) + sk->done = 1; break; } - if (!copied) - copied = -ENOTCONN; - break; - } - if (nonblock) { - copied = -EAGAIN; - break; + if (sk->shutdown & RCV_SHUTDOWN) + break; + + if (!timeo) { + copied = -EAGAIN; + break; + } } cleanup_rbuf(sk, copied); - tcp_data_wait(sk); + + if (tp->ucopy.task == user_recv) { + /* Install new reader */ + if (user_recv == NULL && !(flags&MSG_PEEK)) { + user_recv = current; + tp->ucopy.task = user_recv; + tp->ucopy.iov = msg->msg_iov; + } + + tp->ucopy.len = len; + + BUG_TRAP(tp->copied_seq == tp->rcv_nxt); + + /* Ugly... If prequeue is not empty, we have to + * process it before releasing socket, otherwise + * order will be broken at second iteration. + * More elegant solution is required!!! + * + * Look: we have the following (pseudo)queues: + * + * 1. packets in flight + * 2. backlog + * 3. prequeue + * 4. receive_queue + * + * Each queue can be processed only if the next ones + * are empty. At this point we have empty receive_queue. + * But prequeue _can_ be not empty after second iteration, + * when we jumped to start of loop because backlog + * processing added something to receive_queue. + * We cannot release_sock(), because backlog contains + * packets arrived _after_ prequeued ones. + * + * Shortly, algorithm is clear --- to process all + * the queues in order. We could make it more directly, + * requeueing packets from backlog to prequeue, if + * is not empty. It is more elegant, but eats cycles, + * unfortunately. + */ + if (skb_queue_len(&tp->ucopy.prequeue)) + goto do_prequeue; + + /* __ Set realtime policy in scheduler __ */ + } + + if (copied >= target) { + /* Do not sleep, just process backlog. 
*/ + release_sock(sk); + lock_sock(sk); + } else { + timeo = tcp_data_wait(sk, timeo); + } + + if (user_recv) { + int chunk; + + /* __ Restore normal policy in scheduler __ */ + + if ((chunk = len - tp->ucopy.len) != 0) { + net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromBacklog += chunk; + len -= chunk; + copied += chunk; + } + + if (tp->rcv_nxt == tp->copied_seq && + skb_queue_len(&tp->ucopy.prequeue)) { +do_prequeue: + tcp_prequeue_process(sk); + + if ((chunk = len - tp->ucopy.len) != 0) { + net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk; + len -= chunk; + copied += chunk; + } + } +#if 1/*def CONFIG_TCP_MORE_COARSE_ACKS*/ + if (tp->ack.pending && + (tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss) + tcp_send_ack(sk); +#endif + } continue; found_ok_skb: - /* Lock the buffer. We can be fairly relaxed as - * an interrupt will never steal a buffer we are - * using unless I've missed something serious in - * tcp_data. - */ - atomic_inc(&skb->users); - /* Ok so how much can we use? */ used = skb->len - offset; if (len < used) @@ -1293,36 +1437,28 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, } } - /* Copy it - We _MUST_ update *seq first so that we - * don't ever double read when we have dual readers - */ - *seq += used; - - /* This memcpy_toiovec can sleep. If it sleeps and we - * do a second read it relies on the skb->users to avoid - * a crash when cleanup_rbuf() gets called. - */ err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used); if (err) { /* Exception. Bailout! */ - atomic_dec(&skb->users); - copied = -EFAULT; + if (!copied) + copied = -EFAULT; break; } + *seq += used; copied += used; len -= used; - /* We now will not sleep again until we are finished - * with skb. Sorry if you are doing the SMP port - * but you'll just have to fix it neatly ;) - * - * Very funny Alan... -DaveM - */ - atomic_dec(&skb->users); - - if (after(tp->copied_seq,tp->urg_seq)) + if (after(tp->copied_seq,tp->urg_seq)) { tp->urg_data = 0; + if (skb_queue_len(&tp->out_of_order_queue) == 0 +#ifdef TCP_FORMAL_WINDOW + && tcp_receive_window(tp) +#endif + ) { + tcp_fast_path_on(tp); + } + } if (used + offset < skb->len) continue; @@ -1334,8 +1470,30 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, if (flags & MSG_PEEK) continue; skb->used = 1; - if (atomic_read(&skb->users) == 1) - tcp_eat_skb(sk, skb); + tcp_eat_skb(sk, skb); + +#ifdef CONFIG_TCP_LESS_COARSE_ACKS + /* Possible improvement. When sender is faster than receiver, + * traffic looks like: fill window ... wait for window open ... + * fill window. We lose at least one rtt, because call + * cleanup_rbuf only once. Probably, if "len" was large + * we should insert several intermediate cleanup_rbuf(s). + * + * F.e.: + */ + do { + u32 full_space = min(tp->window_clamp, tcp_full_space(sk)); + + /* Try to ACK, if total buffer length is larger + than maximal window and if rcv_window has + chances to increase twice. It will result + to exponentially decreased ACKing during + read to huge (usually, mmapped) buffer. + */ + if (len >= full_space && tp->rcv_wnd <= full_space/2) + cleanup_rbuf(sk, copied); + } while (0); +#endif continue; found_fin_ok: @@ -1345,19 +1503,36 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, /* All is done. 
*/ skb->used = 1; - sk->shutdown |= RCV_SHUTDOWN; break; } - if (copied >= 0 && msg->msg_name) - tp->af_specific->addr2sockaddr(sk, (struct sockaddr *) - msg->msg_name); + if (user_recv) { + if (skb_queue_len(&tp->ucopy.prequeue)) { + int chunk; + + tp->ucopy.len = copied > 0 ? len : 0; + + tcp_prequeue_process(sk); + + if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { + net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk; + len -= chunk; + copied += chunk; + } + } - if(addr_len) - *addr_len = tp->af_specific->sockaddr_len; + tp->ucopy.task = NULL; + tp->ucopy.len = 0; + } + + /* According to UNIX98, msg_name/msg_namelen are ignored + * on connected socket. I was just happy when found this 8) --ANK + */ /* Clean up data we have read: This will do ACK frames. */ cleanup_rbuf(sk, copied); + + TCP_CHECK_TIMER(sk); release_sock(sk); return copied; @@ -1365,24 +1540,16 @@ out_err: err = sock_error(sk); out: + TCP_CHECK_TIMER(sk); release_sock(sk); return err; recv_urg: - err = tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len); + err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len); goto out; } /* - * Check whether to renew the timer. - */ -static inline void tcp_check_fin_timer(struct sock *sk) -{ - if (sk->state == TCP_FIN_WAIT2) - tcp_reset_keepalive_timer(sk, sysctl_tcp_fin_timeout); -} - -/* * State processing on a close. This implements the state shift for * sending our FIN frame. Note that we only send a FIN for some * states. A shutdown() may have already sent the FIN, or we may be @@ -1405,24 +1572,13 @@ static unsigned char new_state[16] = { /* TCP_CLOSING */ TCP_CLOSING, }; -static int tcp_close_state(struct sock *sk, int dead) +static int tcp_close_state(struct sock *sk) { int next = (int) new_state[sk->state]; int ns = (next & TCP_STATE_MASK); tcp_set_state(sk, ns); - /* This is a (useful) BSD violating of the RFC. There is a - * problem with TCP as specified in that the other end could - * keep a socket open forever with no application left this end. - * We use a 3 minute timeout (about the same as BSD) then kill - * our end. If they send after that then tough - BUT: long enough - * that we won't make the old 4*rto = almost no time - whoops - * reset mistake. - */ - if (dead) - tcp_check_fin_timer(sk); - return (next & TCP_ACTION_FIN); } @@ -1443,9 +1599,8 @@ void tcp_shutdown(struct sock *sk, int how) /* If we've already sent a FIN, or it's a closed state, skip this. */ if ((1 << sk->state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) { - /* Clear out any half completed packets. FIN if needed. */ - if (tcp_close_state(sk,0)) + if (tcp_close_state(sk)) tcp_send_fin(sk); } } @@ -1460,40 +1615,6 @@ static inline int closing(struct sock * sk) return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK)); } -/* - * This routine closes sockets which have been at least partially - * opened, but not yet accepted. Currently it is only called by - * tcp_close. 
- */ - -static void tcp_close_pending (struct sock *sk) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct open_request *req = tp->syn_wait_queue; - - while(req) { - struct open_request *iter; - - if (req->sk) - tcp_close(req->sk, 0); - - iter = req; - req = req->dl_next; - - if (iter->sk) { - sk->ack_backlog--; - } else { - tcp_dec_slow_timer(TCP_SLT_SYNACK); - tp->syn_backlog--; - } - (*iter->class->destructor)(iter); - tcp_openreq_free(iter); - } - BUG_TRAP(tp->syn_backlog == 0); - BUG_TRAP(sk->ack_backlog == 0); - tcp_synq_init(tp); -} - static __inline__ void tcp_kill_sk_queues(struct sock *sk) { /* First the read buffer. */ @@ -1528,6 +1649,14 @@ void tcp_destroy_sock(struct sock *sk) /* It it has not 0 sk->num, it must be bound */ BUG_TRAP(!sk->num || sk->prev!=NULL); +#ifdef TCP_DEBUG + if (sk->zapped) { + printk("TCP: double destroy sk=%p\n", sk); + sock_hold(sk); + } + sk->zapped = 1; +#endif + sk->prot->destroy(sk); tcp_kill_sk_queues(sk); @@ -1538,6 +1667,7 @@ void tcp_destroy_sock(struct sock *sk) } #endif + atomic_dec(&tcp_orphan_count); sock_put(sk); } @@ -1547,17 +1677,17 @@ void tcp_close(struct sock *sk, long timeout) int data_was_unread = 0; lock_sock(sk); + sk->shutdown = SHUTDOWN_MASK; + if(sk->state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); /* Special case. */ - tcp_close_pending(sk); + tcp_listen_stop(sk); goto adjudge_to_death; } - sk->shutdown = SHUTDOWN_MASK; - /* We need to flush the recv. buffs. We do this only on the * descriptor close, not protocol-sourced closes, because the * reader process may not have drained the data yet! @@ -1581,10 +1711,35 @@ void tcp_close(struct sock *sk, long timeout) /* Unread data was tossed, zap the connection. */ tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_KERNEL); - } else if (tcp_close_state(sk,1)) { + } else if (sk->linger && sk->lingertime==0) { + /* Check zero linger _after_ checking for unread data. */ + sk->prot->disconnect(sk, 0); + } else if (tcp_close_state(sk)) { /* We FIN if the application ate all the data before * zapping the connection. */ + + /* RED-PEN. Formally speaking, we have broken TCP state + * machine. State transitions: + * + * TCP_ESTABLISHED -> TCP_FIN_WAIT1 + * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible) + * TCP_CLOSE_WAIT -> TCP_LAST_ACK + * + * are legal only when FIN has been sent (i.e. in window), + * rather than queued out of window. Purists blame. + * + * F.e. "RFC state" is ESTABLISHED, + * if Linux state is FIN-WAIT-1, but FIN is still not sent. + * + * The visible declinations are that sometimes + * we enter time-wait state, when it is not required really + * (harmless), do not send active resets, when they are + * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when + * they look as CLOSING or LAST_ACK for Linux) + * Probably, I missed some more holelets. + * --ANK + */ tcp_send_fin(sk); } @@ -1594,26 +1749,19 @@ void tcp_close(struct sock *sk, long timeout) add_wait_queue(sk->sleep, &wait); - while (1) { + do { set_current_state(TASK_INTERRUPTIBLE); if (!closing(sk)) break; release_sock(sk); timeout = schedule_timeout(timeout); lock_sock(sk); - if (!signal_pending(tsk) || timeout) - break; - } + } while (!signal_pending(tsk) && timeout); tsk->state = TASK_RUNNING; remove_wait_queue(sk->sleep, &wait); } - /* Now that the socket is dead, if we are in the FIN_WAIT2 state - * we may need to set up a timer. - */ - tcp_check_fin_timer(sk); - adjudge_to_death: /* It is the last release_sock in its life. It will remove backlog. 
*/ release_sock(sk); @@ -1627,23 +1775,67 @@ adjudge_to_death: BUG_TRAP(sk->lock.users==0); sock_hold(sk); + sock_orphan(sk); + + /* This is a (useful) BSD violating of the RFC. There is a + * problem with TCP as specified in that the other end could + * keep a socket open forever with no application left this end. + * We use a 3 minute timeout (about the same as BSD) then kill + * our end. If they send after that then tough - BUT: long enough + * that we won't make the old 4*rto = almost no time - whoops + * reset mistake. + * + * Nope, it was not mistake. It is really desired behaviour + * f.e. on http servers, when such sockets are useless, but + * consume significant resources. Let's do it with special + * linger2 option. --ANK + */ + + if (sk->state == TCP_FIN_WAIT2) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + if (tp->linger2 < 0) { + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk, GFP_ATOMIC); + } else { + int tmo = tcp_fin_time(tp); - /* Announce socket dead, detach it from wait queue and inode. */ - write_lock_irq(&sk->callback_lock); - sk->dead = 1; - sk->socket = NULL; - sk->sleep = NULL; - write_unlock_irq(&sk->callback_lock); + if (tmo > TCP_TIMEWAIT_LEN) { + tcp_reset_keepalive_timer(sk, tcp_fin_time(tp)); + } else { + atomic_inc(&tcp_orphan_count); + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto out; + } + } + } + if (sk->state != TCP_CLOSE && + atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans) { + if (net_ratelimit()) + printk(KERN_INFO "TCP: too many of orphaned sockets\n"); + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk, GFP_ATOMIC); + } + atomic_inc(&tcp_orphan_count); if (sk->state == TCP_CLOSE) tcp_destroy_sock(sk); /* Otherwise, socket is reprieved until protocol close. */ +out: bh_unlock_sock(sk); local_bh_enable(); sock_put(sk); } +/* These states need RST on ABORT according to RFC793 */ + +extern __inline__ int tcp_need_reset(int state) +{ + return ((1 << state) & + (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1| + TCPF_FIN_WAIT2|TCPF_SYN_RECV)); +} + int tcp_disconnect(struct sock *sk, int flags) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; @@ -1656,9 +1848,14 @@ int tcp_disconnect(struct sock *sk, int flags) /* ABORT function of RFC793 */ if (old_state == TCP_LISTEN) { - tcp_close_pending(sk); - } else if (tcp_connected(old_state)) { - tcp_send_active_reset(sk, GFP_KERNEL); + tcp_listen_stop(sk); + } else if (tcp_need_reset(old_state) || + (tp->snd_nxt != tp->write_seq && + (1<<old_state)&(TCPF_CLOSING|TCPF_LAST_ACK))) { + /* The last check adjusts for discrepance of Linux wrt. 
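The policy this hunk applies to orphaned FIN_WAIT2 sockets can be restated as a small decision helper. This is a condensed sketch under assumed constants (TCP_TIMEWAIT_LEN taken as 60 seconds); the helper and enum names are stand-ins, not kernel symbols.

/* Condensed model of the orphaned FIN_WAIT2 policy added above. */
#include <stdio.h>

#define HZ               100
#define TCP_TIMEWAIT_LEN (60 * HZ)      /* assumed value, for illustration only */

enum fw2_action { FW2_SEND_RST, FW2_KEEPALIVE_TIMER, FW2_TIME_WAIT };

/* linger2 < 0  : kill the connection at once with a reset
 * fin_timeout  : how long we are willing to keep a dead FIN_WAIT_2 around */
static enum fw2_action orphaned_fin_wait2(int linger2, int fin_timeout)
{
    if (linger2 < 0)
        return FW2_SEND_RST;
    if (fin_timeout > TCP_TIMEWAIT_LEN)
        return FW2_KEEPALIVE_TIMER;     /* too long for a tw bucket, use a real timer */
    return FW2_TIME_WAIT;               /* park it in a lightweight time-wait bucket */
}

int main(void)
{
    printf("%d %d %d\n",
           orphaned_fin_wait2(-1, 0),
           orphaned_fin_wait2(60, 120 * HZ),
           orphaned_fin_wait2(60, 30 * HZ));
    return 0;
}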
RFC + * states + */ + tcp_send_active_reset(sk, gfp_any()); sk->err = ECONNRESET; } else if (old_state == TCP_SYN_SENT) sk->err = ECONNRESET; @@ -1677,26 +1874,25 @@ int tcp_disconnect(struct sock *sk, int flags) memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16); #endif - sk->zapped = 0; sk->shutdown = 0; sk->done = 0; sk->write_space = tcp_write_space; tp->srtt = 0; -#ifdef CONFIG_TCP_TW_RECYCLE - if ((tp->write_seq += 2) == 0) - tp->write_seq = 1; -#else - tp->write_seq = 0; -#endif - tp->ato = 0; + if (sysctl_tcp_tw_recycle) { + if ((tp->write_seq += 2) == 0) + tp->write_seq = 1; + } else { + tp->write_seq = 0; + } tp->backoff = 0; tp->snd_cwnd = 2; tp->probes_out = 0; + tp->packets_out = 0; tp->high_seq = 0; tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_cnt = 0; tp->dup_acks = 0; - tp->delayed_acks = 0; + tcp_delack_init(tp); tp->send_head = tp->retrans_head = NULL; tp->saw_tstamp = 0; __sk_dst_reset(sk); @@ -1712,11 +1908,10 @@ int tcp_disconnect(struct sock *sk, int flags) * conditions. This must be called with the socket locked, * and without the kernel lock held. */ -static struct open_request * wait_for_connect(struct sock * sk, - struct open_request **pprev) +static int wait_for_connect(struct sock * sk, long timeo) { DECLARE_WAITQUEUE(wait, current); - struct open_request *req; + int err; /* * True wake-one mechanism for incoming connections: only @@ -1736,17 +1931,25 @@ static struct open_request * wait_for_connect(struct sock * sk, for (;;) { current->state = TASK_EXCLUSIVE | TASK_INTERRUPTIBLE; release_sock(sk); - schedule(); + if (sk->tp_pinfo.af_tcp.accept_queue == NULL) + timeo = schedule_timeout(timeo); lock_sock(sk); - req = tcp_find_established(&(sk->tp_pinfo.af_tcp), pprev); - if (req) + err = 0; + if (sk->tp_pinfo.af_tcp.accept_queue) + break; + err = -EINVAL; + if (sk->state != TCP_LISTEN) break; + err = -ERESTARTSYS; if (signal_pending(current)) break; + err = -EAGAIN; + if (!timeo) + break; } current->state = TASK_RUNNING; remove_wait_queue(sk->sleep, &wait); - return req; + return err; } /* @@ -1758,9 +1961,10 @@ static struct open_request * wait_for_connect(struct sock * sk, struct sock *tcp_accept(struct sock *sk, int flags, int *err) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - struct open_request *req, *prev; + struct open_request *req; struct sock *newsk; int error; + long timeo; lock_sock(sk); @@ -1771,25 +1975,27 @@ struct sock *tcp_accept(struct sock *sk, int flags, int *err) if (sk->state != TCP_LISTEN) goto out; + timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + /* Find already established connection */ - req = tcp_find_established(tp, &prev); - if (!req) { + if (!tp->accept_queue) { /* If this is a non blocking socket don't sleep */ error = -EAGAIN; - if (flags & O_NONBLOCK) + if (!timeo) goto out; - error = -ERESTARTSYS; - req = wait_for_connect(sk, &prev); - if (!req) + error = wait_for_connect(sk, timeo); + if (error) goto out; } - tcp_synq_unlink(tp, req, prev); - newsk = req->sk; - req->class->destructor(req); - tcp_openreq_free(req); - sk->ack_backlog--; + req = tp->accept_queue; + tp->accept_queue = req->dl_next; + + newsk = req->sk; + tcp_acceptq_removed(sk); + tcp_openreq_fastfree(req); + BUG_TRAP(newsk->state != TCP_SYN_RECV); release_sock(sk); return newsk; @@ -1828,7 +2034,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, * the point when this call is done we typically don't yet know * which interface is going to be used */ - if(val < 1 || val > MAX_WINDOW) { + if(val < 8 || val > MAX_TCP_WINDOW) { err = -EINVAL; 
break; } @@ -1839,11 +2045,11 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, /* You cannot try to use this and TCP_CORK in * tandem, so let the user know. */ - if (sk->nonagle == 2) { + if (tp->nonagle == 2) { err = -EINVAL; break; } - sk->nonagle = (val == 0) ? 0 : 1; + tp->nonagle = (val == 0) ? 0 : 1; break; case TCP_CORK: @@ -1858,14 +2064,14 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, * You cannot try to use TCP_NODELAY and this mechanism * at the same time, so let the user know. */ - if (sk->nonagle == 1) { + if (tp->nonagle == 1) { err = -EINVAL; break; } if (val != 0) { - sk->nonagle = 2; + tp->nonagle = 2; } else { - sk->nonagle = 0; + tp->nonagle = 0; tcp_push_pending_frames(sk, tp); } @@ -1905,6 +2111,38 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, tp->syn_retries = val; break; + case TCP_LINGER2: + if (val < 0) + tp->linger2 = -1; + else if (val > sysctl_tcp_fin_timeout/HZ) + tp->linger2 = 0; + else + tp->linger2 = val*HZ; + break; + + case TCP_DEFER_ACCEPT: + tp->defer_accept = 0; + if (val > 0) { + /* Translate value in seconds to number of retransmits */ + while (val > ((TCP_TIMEOUT_INIT/HZ)<<tp->defer_accept)) + tp->defer_accept++; + tp->defer_accept++; + } + break; + + case TCP_WINDOW_CLAMP: + if (val==0) { + if (sk->state != TCP_CLOSE) { + err = -EINVAL; + break; + } + tp->window_clamp = 0; + } else { + tp->window_clamp = val<SOCK_MIN_RCVBUF/2 ? + SOCK_MIN_SNDBUF : val; + } + break; + default: err = -ENOPROTOOPT; break; @@ -1930,37 +2168,38 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, switch(optname) { case TCP_MAXSEG: - val = tp->user_mss; + val = tp->mss_cache; + if (val == 0 && ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))) + val = tp->user_mss; break; case TCP_NODELAY: - val = (sk->nonagle == 1); + val = (tp->nonagle == 1); break; case TCP_CORK: - val = (sk->nonagle == 2); + val = (tp->nonagle == 2); break; case TCP_KEEPIDLE: - if (tp->keepalive_time) - val = tp->keepalive_time / HZ; - else - val = sysctl_tcp_keepalive_time / HZ; + val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time)/HZ; break; case TCP_KEEPINTVL: - if (tp->keepalive_intvl) - val = tp->keepalive_intvl / HZ; - else - val = sysctl_tcp_keepalive_intvl / HZ; + val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl)/HZ; break; case TCP_KEEPCNT: - if (tp->keepalive_probes) - val = tp->keepalive_probes; - else - val = sysctl_tcp_keepalive_probes; + val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; break; case TCP_SYNCNT: - if (tp->syn_retries) - val = tp->syn_retries; - else - val = sysctl_tcp_syn_retries; + val = tp->syn_retries ? : sysctl_tcp_syn_retries; + break; + case TCP_LINGER2: + val = tp->linger2; + if (val > 0) + val = (val ? : sysctl_tcp_fin_timeout)/HZ; + break; + case TCP_DEFER_ACCEPT: + val = tp->defer_accept == 0 ? 0 : (TCP_TIMEOUT_INIT<<(tp->defer_accept-1)); + break; + case TCP_WINDOW_CLAMP: + val = tp->window_clamp; break; default: return -ENOPROTOOPT; @@ -2049,11 +2288,20 @@ void __init tcp_init(void) tcp_bhash[i].chain = NULL; } + /* Try to be a bit smarter and adjust defaults depending + * on available memory. 
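A hypothetical userspace caller of the three options handled above might look as follows, assuming TCP_LINGER2, TCP_DEFER_ACCEPT and TCP_WINDOW_CLAMP are exported through <netinet/tcp.h>; values are seconds for the first two and bytes for the clamp. Error handling is kept minimal.

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    int linger2 = 30;       /* keep a dead FIN_WAIT_2 at most 30 seconds          */
    int defer   = 5;        /* wake accept() only once data has arrived, retried
                               for roughly 5 seconds                              */
    int clamp   = 65535;    /* upper bound on the advertised receive window       */

    if (fd < 0) {
        perror("socket");
        return 1;
    }
    if (setsockopt(fd, IPPROTO_TCP, TCP_LINGER2, &linger2, sizeof(linger2)) < 0)
        perror("TCP_LINGER2");
    if (setsockopt(fd, IPPROTO_TCP, TCP_DEFER_ACCEPT, &defer, sizeof(defer)) < 0)
        perror("TCP_DEFER_ACCEPT");
    if (setsockopt(fd, IPPROTO_TCP, TCP_WINDOW_CLAMP, &clamp, sizeof(clamp)) < 0)
        perror("TCP_WINDOW_CLAMP");
    return 0;
}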
+ */ if (order > 4) { sysctl_local_port_range[0] = 32768; sysctl_local_port_range[1] = 61000; + sysctl_tcp_max_tw_buckets = 180000; + sysctl_tcp_max_orphans = 4096<<(order-4); + sysctl_max_syn_backlog = 1024; } else if (order < 3) { sysctl_local_port_range[0] = 1024*(3-order); + sysctl_tcp_max_tw_buckets >>= (3-order); + sysctl_tcp_max_orphans >>= (3-order); + sysctl_max_syn_backlog = 128; } tcp_port_rover = sysctl_local_port_range[0] - 1; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3b4ae64a2..d61a5df02 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.177 2000/01/09 02:19:39 davem Exp $ + * Version: $Id: tcp_input.c,v 1.183 2000/01/24 18:40:33 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -70,9 +70,6 @@ #define SYNC_INIT 1 #endif -extern int sysctl_tcp_fin_timeout; -extern int sysctl_tcp_keepalive_time; - /* These are on by default so the code paths get tested. * For the final 2.2 this may be undone at our discretion. -DaveM */ @@ -83,10 +80,108 @@ int sysctl_tcp_sack = 1; int sysctl_tcp_syncookies = SYNC_INIT; int sysctl_tcp_stdurg; int sysctl_tcp_rfc1337; -int sysctl_tcp_tw_recycle; +int sysctl_tcp_tw_recycle = 1; +int sysctl_tcp_abort_on_overflow = 0; +int sysctl_tcp_max_orphans = NR_FILE; +int sysctl_tcp_max_tw_buckets = NR_FILE*2; static int prune_queue(struct sock *sk); +/* + * Adapt the MSS value used to make delayed ack decision to the + * real world. + * + * The constant 536 hasn't any good meaning. In IPv4 world + * MTU may be smaller, though it contradicts to RFC1122, which + * states that MSS must be at least 536. + * We use the constant to do not ACK each second + * packet in a stream of tiny size packets. + * It means that super-low mtu links will be aggressively delacked. + * Seems, it is even good. If they have so low mtu, they are weirdly + * slow. + * + * AK: BTW it may be useful to add an option to lock the rcv_mss. + * this way the beowulf people wouldn't need ugly patches to get the + * ack frequencies they want and it would be an elegant way to tune delack. + */ +static __inline__ void tcp_measure_rcv_mss(struct tcp_opt *tp, struct sk_buff *skb) +{ + unsigned int len, lss; + + lss = tp->ack.last_seg_size; + tp->ack.last_seg_size = 0; + + /* skb->len may jitter because of SACKs, even if peer + * sends good full-sized frames. + */ + len = skb->len; + if (len >= tp->ack.rcv_mss) { + tp->ack.rcv_mss = len; + } else { + /* Otherwise, we make more careful check taking into account, + * that SACKs block is variable. + * + * "len" is invariant segment length, including TCP header. + */ + len = skb->tail - skb->h.raw; + if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr)) { + /* Subtract also invariant (if peer is RFC compliant), + * tcp header plus fixed timestamp option length. + * Resulting "len" is MSS free of SACK jitter. + */ + len -= tp->tcp_header_len; + if (len == lss) + tp->ack.rcv_mss = len; + tp->ack.last_seg_size = len; + } + +#if 0 + /* Tiny-grams with PSH set artifically deflate our + * ato measurement. + * + * Mmm... I copied this test from tcp_remember_ack(), but + * I did not understand this. Is it to speedup nagling sender? + * It does not because classic (non-Minshall) sender nagles + * guided by not-acked frames not depending on size. + * And it does not help NODELAY sender, because latency + * is too high in any case. 
The only result is timer trashing + * and redundant ACKs. Grr... Seems, I missed something. --ANK + * + * Let me to comment out this yet... TCP should work + * perfectly without this. --ANK + */ + if (len < (tp->ack.rcv_mss >> 1) && skb->h.th->psh) + tp->ack.ato = TCP_ATO_MIN; +#endif + } +} + + +static __inline__ void tcp_enter_quickack_mode(struct tcp_opt *tp) +{ + unsigned quickacks = tcp_receive_window(tp)/(2*tp->ack.rcv_mss); + + tp->ack.quick = max(min(quickacks, 127), 1); + + if (!tp->tstamp_ok && tp->ack.quick>2) { + /* Quick ACKs are _dangerous_, if RTTM is not used. + * See comment in tcp_init_metrics(). We still help + * them to overcome the most difficult, initial + * phase of slow start. + */ + tp->ack.quick = 2; + } +} + +/* Send ACKs quickly, if "quick" count is not ehausted + * and the session is not interactive. + */ + +static __inline__ int tcp_in_quickack_mode(struct tcp_opt *tp) +{ + return (tp->ack.quick && !tp->ack.pingpong); +} + /* There is something which you must keep in mind when you analyze the * behavior of the tp->ato delayed ack timeout interval. When a * connection starts up, we want to ack as quickly as possible. The @@ -97,53 +192,52 @@ static int prune_queue(struct sock *sk); * each ACK we send, he increments snd_cwnd and transmits more of his * queue. -DaveM */ -static void tcp_delack_estimator(struct tcp_opt *tp) +static void tcp_event_data_recv(struct tcp_opt *tp, struct sk_buff *skb) { - if(tp->ato == 0) { - tp->lrcvtime = tcp_time_stamp; + u32 now; - /* Help sender leave slow start quickly, - * and also makes sure we do not take this - * branch ever again for this connection. + tcp_measure_rcv_mss(tp, skb); + + tp->ack.pending = 1; + + now = tcp_time_stamp; + + if (!tp->ack.ato) { + /* The _first_ data packet received, initialize + * delayed ACK engine. */ - tp->ato = 1; + + /* Help sender leave slow start quickly. */ tcp_enter_quickack_mode(tp); + + /* Pingpong is off, session is not interactive by default */ + tp->ack.pingpong = 0; + + /* ATO is minimal */ + tp->ack.ato = TCP_ATO_MIN; } else { - int m = tcp_time_stamp - tp->lrcvtime; - - tp->lrcvtime = tcp_time_stamp; - if(m <= 0) - m = 1; - if(m > tp->rto) - tp->ato = tp->rto; - else { - /* This funny shift makes sure we - * clear the "quick ack mode" bit. + int m = now - tp->ack.lrcvtime; + + if (m > TCP_ATO_MAX/2) { + /* Do not touch ATO, if interval is out of bounds. + * It will be deflated by delack timer, if our peer + * really sends too rarely. */ - tp->ato = ((tp->ato << 1) >> 2) + m; + if (m > tp->rto) { + /* Too long gap. Apparently sender falled to + * restart window, so that we send ACKs quickly. + */ + tcp_enter_quickack_mode(tp); + } + } else { + if (m <= 0) + m = TCP_ATO_MIN/2; + tp->ack.ato = (tp->ack.ato >> 1) + m; } } + tp->ack.lrcvtime = now; } -/* - * Remember to send an ACK later. - */ -static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th, - struct sk_buff *skb) -{ - tp->delayed_acks++; - - /* Tiny-grams with PSH set artifically deflate our - * ato measurement, but with a lower bound. - */ - if(th->psh && (skb->len < (tp->rcv_mss >> 1))) { - /* Preserve the quickack state. */ - if((tp->ato & 0x7fffffff) > HZ/50) - tp->ato = ((tp->ato & 0x80000000) | - (HZ/50)); - } -} - /* Called to compute a smoothed rtt estimate. 
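The delayed-ACK estimator added in tcp_event_data_recv() above can be exercised in isolation. The sketch below keeps only the shape of the update (start at a small ato, blend in the observed inter-arrival gap, fall back to quick ACKs after an idle period); the TCP_ATO_* values are placeholders, not the kernel's.

#include <stdio.h>

#define HZ           100
#define TCP_ATO_MIN  (HZ / 25)      /* placeholder */
#define TCP_ATO_MAX  (HZ / 2)       /* placeholder */

struct delack {
    unsigned int ato;               /* smoothed inter-arrival estimate, in ticks */
    unsigned int lrcvtime;          /* time of the previous data segment         */
    int quick;                      /* remaining "quick ACK" credits             */
};

static void data_event(struct delack *d, unsigned int now, unsigned int rto)
{
    if (!d->ato) {
        d->ato = TCP_ATO_MIN;       /* first data segment: start small, ACK fast */
        d->quick = 2;
    } else {
        unsigned int m = now - d->lrcvtime;

        if (m > TCP_ATO_MAX / 2) {
            /* Gap is out of bounds: leave ato alone; if the sender apparently
             * restarted after an idle period, ACK quickly for a while. */
            if (m > rto)
                d->quick = 2;
        } else {
            if (m == 0)
                m = TCP_ATO_MIN / 2;
            d->ato = (d->ato >> 1) + m;   /* EWMA-style blend */
        }
    }
    d->lrcvtime = now;
}

int main(void)
{
    struct delack d = { 0, 0, 0 };
    unsigned int t = 0;

    for (int i = 0; i < 5; i++) {
        t += 3;                     /* segments arriving every 3 ticks */
        data_event(&d, t, HZ);
        printf("ato=%u quick=%d\n", d.ato, d.quick);
    }
    return 0;
}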
The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge @@ -209,10 +303,10 @@ static __inline__ void tcp_set_rto(struct tcp_opt *tp) */ static __inline__ void tcp_bound_rto(struct tcp_opt *tp) { - if (tp->rto > 120*HZ) - tp->rto = 120*HZ; - if (tp->rto < HZ/5) - tp->rto = HZ/5; + if (tp->rto < TCP_RTO_MIN) + tp->rto = TCP_RTO_MIN; + else if (tp->rto > TCP_RTO_MAX) + tp->rto = TCP_RTO_MAX; } /* Save metrics learned by this TCP session. @@ -224,7 +318,9 @@ static void tcp_update_metrics(struct sock *sk) struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct dst_entry *dst = __sk_dst_get(sk); - if (dst) { + dst_confirm(dst); + + if (dst && (dst->flags&DST_HOST)) { int m; if (tp->backoff || !tp->srtt) { @@ -237,8 +333,6 @@ static void tcp_update_metrics(struct sock *sk) return; } - dst_confirm(dst); - m = dst->rtt - tp->srtt; /* If newly calculated rtt larger than stored one, @@ -308,10 +402,18 @@ static void tcp_init_metrics(struct sock *sk) dst_confirm(dst); + if (dst->mxlock&(1<<RTAX_CWND)) + tp->snd_cwnd_clamp = dst->cwnd; + if (dst->ssthresh) { + tp->snd_ssthresh = dst->ssthresh; + if (tp->snd_ssthresh > tp->snd_cwnd_clamp) + tp->snd_ssthresh = tp->snd_cwnd_clamp; + } + if (dst->rtt == 0) goto reset; - if (!tp->srtt || !tp->saw_tstamp) + if (!tp->srtt && dst->rtt < (TCP_TIMEOUT_INIT<<3)) goto reset; /* Initial rtt is determined from SYN,SYN-ACK. @@ -334,14 +436,9 @@ static void tcp_init_metrics(struct sock *sk) tp->mdev = dst->rttvar; tcp_set_rto(tp); tcp_bound_rto(tp); - - if (dst->mxlock&(1<<RTAX_CWND)) - tp->snd_cwnd_clamp = dst->cwnd; - if (dst->ssthresh) { - tp->snd_ssthresh = dst->ssthresh; - if (tp->snd_ssthresh > tp->snd_cwnd_clamp) - tp->snd_ssthresh = tp->snd_cwnd_clamp; - } + if (tp->rto < TCP_TIMEOUT_INIT && !tp->saw_tstamp) + goto reset; + tp->snd_cwnd = tcp_init_cwnd(tp); return; @@ -357,9 +454,6 @@ reset: } } -#define PAWS_24DAYS (60 * 60 * 24 * 24) - - /* WARNING: this must not be called if tp->saw_tstamp was false. */ extern __inline__ void tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq) @@ -374,7 +468,7 @@ tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq) */ if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0 || - xtime.tv_sec >= tp->ts_recent_stamp + PAWS_24DAYS) { + xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS) { tp->ts_recent = tp->rcv_tsval; tp->ts_recent_stamp = xtime.tv_sec; } @@ -384,7 +478,7 @@ tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq) extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct sk_buff *skb) { return ((s32)(tp->rcv_tsval - tp->ts_recent) < 0 && - xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS + xtime.tv_sec < tp->ts_recent_stamp + TCP_PAWS_24DAYS /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM @@ -411,8 +505,13 @@ extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct sk_buff *skb) static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) { u32 end_window = tp->rcv_wup + tp->rcv_wnd; +#ifdef TCP_FORMAL_WINDOW + u32 rcv_wnd = tcp_receive_window(tp); +#else + u32 rcv_wnd = tp->rcv_wnd; +#endif - if (tp->rcv_wnd && + if (rcv_wnd && after(end_seq, tp->rcv_nxt) && before(seq, end_window)) return 1; @@ -424,8 +523,13 @@ static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) /* This functions checks to see if the tcp header is actually acceptable. 
*/ extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) { +#ifdef TCP_FORMAL_WINDOW + u32 rcv_wnd = tcp_receive_window(tp); +#else + u32 rcv_wnd = tp->rcv_wnd; +#endif if (seq == tp->rcv_nxt) - return (tp->rcv_wnd || (end_seq == seq)); + return (rcv_wnd || (end_seq == seq)); return __tcp_sequence(tp, seq, end_seq); } @@ -433,8 +537,6 @@ extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) /* When we get a reset we do this. */ static void tcp_reset(struct sock *sk) { - sk->zapped = 1; - /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->state) { case TCP_SYN_SENT: @@ -447,9 +549,8 @@ static void tcp_reset(struct sock *sk) return; default: sk->err = ECONNRESET; - }; - tcp_set_state(sk, TCP_CLOSE); - tcp_clear_xmit_timers(sk); + } + tcp_done(sk); } @@ -658,17 +759,18 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) if (tp->high_seq == 0 || after(ack, tp->high_seq)) { tp->dup_acks++; if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) { - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); - if (tp->snd_ssthresh > tp->snd_cwnd_clamp) - tp->snd_ssthresh = tp->snd_cwnd_clamp; - tp->snd_cwnd = (tp->snd_ssthresh + 3); - tp->high_seq = tp->snd_nxt; + __tcp_enter_cong_avoid(tp); + /* ... and account for 3 ACKs, which are + * already received to this time. + */ + tp->snd_cwnd += 3; + if(!tp->fackets_out) tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); else tcp_fack_retransmit(sk); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); } } else if (++tp->dup_acks > 3) { /* 2. Each time another duplicate ACK arrives, increment @@ -733,7 +835,7 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) if (ack != tp->snd_una && before(ack, tp->high_seq)) { tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); } } else { /* FACK style, fill any remaining holes in @@ -752,7 +854,8 @@ static __inline__ void tcp_cong_avoid(struct tcp_opt *tp) { if (tp->snd_cwnd <= tp->snd_ssthresh) { /* In "safe" area, increase. */ - tp->snd_cwnd++; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; } else { /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd @@ -826,23 +929,23 @@ static void tcp_ack_probe(struct sock *sk, __u32 ack) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - /* Our probe was answered. */ - tp->probes_out = 0; - /* Was it a usable window open? */ - /* should always be non-null */ - if (tp->send_head != NULL && - !before (ack + tp->snd_wnd, TCP_SKB_CB(tp->send_head)->end_seq)) { - tp->backoff = 0; - tp->pending = 0; - tcp_clear_xmit_timer(sk, TIME_PROBE0); - } else { - tcp_reset_xmit_timer(sk, TIME_PROBE0, - min(tp->rto << tp->backoff, 120*HZ)); + if (tp->send_head != NULL) { + if (!after(TCP_SKB_CB(tp->send_head)->end_seq, ack + tp->snd_wnd)) { + tp->backoff = 0; + tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0); + /* If packets_out==0, socket must be waked up by + * subsequent tcp_data_snd_check(). This function is + * not for random using! + */ + } else if (!tp->packets_out) { + tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, + min(tp->rto << tp->backoff, TCP_RTO_MAX)); + } } } - + /* Should we open up the congestion window? 
*/ static __inline__ int should_advance_cwnd(struct tcp_opt *tp, int flag) { @@ -914,18 +1017,30 @@ static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) { struct sk_buff *skb = skb_peek(&sk->write_queue); +#ifdef TCP_DEBUG + /* It occured in 2.3, because of racy timers. Namely, + * retransmit timer did not check packets_out and retransmitted + * send_head sometimes and, hence, messed all the write_queue. + * Now it is impossible, I bet. --ANK + */ + if (skb == NULL) { + printk("Sucks! packets_out=%d, sk=%p, %d\n", tp->packets_out, sk, sk->state); + return; + } +#endif + /* Some data was ACK'd, if still retransmitting (due to a * timeout), resend more of the retransmit queue. The * congestion window is handled properly by that code. */ if (tp->retransmits) { tcp_xmit_retransmit_queue(sk); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); } else { __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when); if ((__s32)when < 0) when = 1; - tcp_reset_xmit_timer(sk, TIME_RETRANS, when); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, when); } } @@ -938,13 +1053,8 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 seq = 0; u32 seq_rtt = 0; - if(sk->zapped) - return(1); /* Dead, can't ack any more so why bother */ - - if (tp->pending == TIME_KEEPOPEN) - tp->probes_out = 0; - - tp->rcv_tstamp = tcp_time_stamp; + if(sk->state == TCP_CLOSE) + return 1; /* Dead, can't ack any more so why bother */ /* If the ack is newer than sent or older than previous acks * then we can probably ignore it. @@ -953,10 +1063,8 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, goto uninteresting_ack; /* If there is data set flag 1 */ - if (len != th->doff*4) { + if (len != th->doff*4) flag |= FLAG_DATA; - tcp_delack_estimator(tp); - } /* Update our send window. */ @@ -970,31 +1078,53 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) { flag |= FLAG_WIN_UPDATE; - tp->snd_wnd = nwin; + if (tp->snd_wnd != nwin) { + tp->snd_wnd = nwin; + + /* Note, it is the only place, where + * fast path is recovered for sending TCP. + */ + if (skb_queue_len(&tp->out_of_order_queue) == 0 && +#ifdef TCP_FORMAL_WINDOW + tcp_receive_window(tp) && +#endif + !tp->urg_data) + tcp_fast_path_on(tp); + + if (nwin > tp->max_window) { + tp->max_window = nwin; + tcp_sync_mss(sk, tp->pmtu_cookie); + } + } tp->snd_wl1 = ack_seq; tp->snd_wl2 = ack; - - if (nwin > tp->max_window) - tp->max_window = nwin; } } + /* BEWARE! From this place and until return from this function + * snd_nxt and snd_wnd are out of sync. All the routines, called + * from here must get "ack" as argument or they should not depend + * on right edge of window. It is _UGLY_. It cries to be fixed. --ANK + */ + /* We passed data and got it acked, remove any soft error * log. Something worked... */ sk->err_soft = 0; + tp->probes_out = 0; + tp->rcv_tstamp = tcp_time_stamp; + + /* See if we can take anything off of the retransmit queue. */ + flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. */ - if (tp->pending == TIME_PROBE0) + if (tcp_timer_is_set(sk, TCP_TIME_PROBE0)) tcp_ack_probe(sk, ack); - /* See if we can take anything off of the retransmit queue. 
*/ - flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt); - /* We must do this here, before code below clears out important * state contained in tp->fackets_out and tp->retransmits. -DaveM */ @@ -1036,7 +1166,7 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, if (flag & FLAG_DATA_ACKED) tcp_ack_packets_out(sk, tp); } else { - tcp_clear_xmit_timer(sk, TIME_RETRANS); + tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS); } flag &= (FLAG_DATA | FLAG_WIN_UPDATE); @@ -1074,9 +1204,42 @@ uninteresting_ack: return 0; } +int tcp_paws_check(struct tcp_opt *tp, int rst) +{ + if ((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) + return 0; + if (xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS) + return 0; + + /* RST segments are not recommended to carry timestamp, + and, if they do, it is recommended to ignore PAWS because + "their cleanup function should take precedence over timestamps." + Certainly, it is mistake. It is necessary to understand the reasons + of this constraint to relax it: if peer reboots, clock may go + out-of-sync and half-open connections will not be reset. + Actually, the problem would be not existing if all + the implementations followed draft about maintaining clock + via reboots. Linux-2.2 DOES NOT! + + However, we can relax time bounds for RST segments to MSL. + */ + if (rst && xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_MSL) + return 0; + return 1; +} + +static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) +{ + if (seq == s_win) + return 1; + if (after(end_seq, s_win) && before(seq, e_win)) + return 1; + return (seq == e_win && seq == end_seq); +} + /* New-style handling of TIME_WAIT sockets. */ -/* Must be called only from BH context. */ +/* Must be called with locally disabled BHs. */ void tcp_timewait_kill(struct tcp_tw_bucket *tw) { struct tcp_ehash_bucket *ehead; @@ -1121,13 +1284,6 @@ void tcp_timewait_kill(struct tcp_tw_bucket *tw) tcp_tw_put(tw); } -/* We come here as a special case from the AF specific TCP input processing, - * and the SKB has no owner. Essentially handling this is very simple, - * we just keep silently eating rx'd packets until none show up for the - * entire timeout period. The only special cases are for BSD TIME_WAIT - * reconnects and SYN/RST bits being set in the TCP header. - */ - /* * * Main purpose of TIME-WAIT state is to close connection gracefully, * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN @@ -1149,6 +1305,12 @@ void tcp_timewait_kill(struct tcp_tw_bucket *tw) * The algorithm below is based on FORMAL INTERPRETATION of RFCs. * When you compare it to RFCs, please, read section SEGMENT ARRIVES * from the very beginning. + * + * NOTE. With recycling (and later with fin-wait-2) TW bucket + * is _not_ stateless. It means, that strictly speaking we must + * spinlock it. I do not want! Well, probability of misbehaviour + * is ridiculously low and, seems, we could use some mb() tricks + * to avoid misread sequence numbers, states etc. 
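tcp_paws_check() above reduces to a wraparound-safe timestamp comparison plus two staleness escapes; the standalone version below reproduces that shape with illustrative constants.

#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define PAWS_24DAYS (60L * 60 * 24 * 24)    /* ts_recent considered stale after this */
#define PAWS_MSL    60L                      /* relaxed bound used for RST segments   */

/* Returns 1 when the segment must be rejected by PAWS. */
static int paws_reject(uint32_t rcv_tsval, uint32_t ts_recent,
                       time_t ts_recent_stamp, int rst, time_t now)
{
    if ((int32_t)(rcv_tsval - ts_recent) >= 0)
        return 0;                            /* timestamp did not go backwards     */
    if (now >= ts_recent_stamp + PAWS_24DAYS)
        return 0;                            /* remembered stamp too old to trust  */
    if (rst && now >= ts_recent_stamp + PAWS_MSL)
        return 0;                            /* let old RSTs through after one MSL */
    return 1;
}

int main(void)
{
    time_t now = time(NULL);

    /* Timestamp moved backwards and ts_recent is fresh: reject. */
    printf("%d\n", paws_reject(100, 200, now, 0, now));
    /* Wrap-around: 0x00000001 counts as "after" 0xffffffff thanks to the s32 cast. */
    printf("%d\n", paws_reject(0x00000001u, 0xffffffffu, now, 0, now));
    return 0;
}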
--ANK */ enum tcp_tw_status tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, @@ -1157,7 +1319,75 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, struct tcp_opt tp; int paws_reject = 0; - /* RFC 1122: + tp.saw_tstamp = 0; + if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) { + tcp_parse_options(NULL, th, &tp, 0); + + if (tp.saw_tstamp) { + tp.ts_recent = tw->ts_recent; + tp.ts_recent_stamp = tw->ts_recent_stamp; + paws_reject = tcp_paws_check(&tp, th->rst); + } + } + + if (tw->substate == TCP_FIN_WAIT2) { + /* Just repeat all the checks of tcp_rcv_state_process() */ + + /* Out of window, send ACK */ + if (paws_reject || + !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, + tw->rcv_nxt, tw->rcv_nxt + tw->rcv_wnd)) + return TCP_TW_ACK; + + if (th->rst) + goto kill; + + if (th->syn && TCP_SKB_CB(skb)->seq != tw->syn_seq) + goto kill_with_rst; + + /* Dup ACK? */ + if (!after(TCP_SKB_CB(skb)->end_seq, tw->rcv_nxt)) { + tcp_tw_put(tw); + return TCP_TW_SUCCESS; + } + + /* New data or FIN. If new data arrive after half-duplex close, + * reset. + */ + if (!th->fin || TCP_SKB_CB(skb)->end_seq != tw->rcv_nxt+1) { +kill_with_rst: + tcp_tw_deschedule(tw); + tcp_timewait_kill(tw); + tcp_tw_put(tw); + return TCP_TW_RST; + } + + /* FIN arrived, enter true time-wait state. */ + tw->substate = TCP_TIME_WAIT; + tw->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if (tp.saw_tstamp) { + tw->ts_recent_stamp = xtime.tv_sec; + tw->ts_recent = tp.rcv_tsval; + } + + /* I am shamed, but failed to make it more elegant. + * Yes, it is direct reference to IP, which is impossible + * to generalize to IPv6. Taking into account that IPv6 + * do not undertsnad recycling in any case, it not + * a big problem in practice. --ANK */ + if (tw->family == AF_INET && + sysctl_tcp_tw_recycle && tw->ts_recent_stamp && + tcp_v4_tw_remember_stamp(tw)) + tcp_tw_schedule(tw, tw->timeout); + else + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); + return TCP_TW_ACK; + } + + /* + * Now real TIME-WAIT state. + * + * RFC 1122: * "When a connection is [...] on TIME-WAIT state [...] * [a TCP] MAY accept a new SYN from the remote TCP to * reopen the connection directly, if it: @@ -1171,47 +1401,31 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, * to be an old duplicate". */ - tp.saw_tstamp = 0; - if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) { - tcp_parse_options(NULL, th, &tp, 0); - - paws_reject = tp.saw_tstamp && - ((s32)(tp.rcv_tsval - tw->ts_recent) < 0 && - xtime.tv_sec < tw->ts_recent_stamp + PAWS_24DAYS); - } - if (!paws_reject && (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq && TCP_SKB_CB(skb)->seq == tw->rcv_nxt)) { /* In window segment, it may be only reset or bare ack. */ if (th->rst) { -#ifdef CONFIG_TCP_TW_RECYCLE - /* When recycling, always follow rfc1337, - * but mark bucket as ready to recycling immediately. - */ - if (sysctl_tcp_tw_recycle) { - /* May kill it now. */ - tw->rto = 0; - tw->ttd = jiffies; - } else -#endif /* This is TIME_WAIT assasination, in two flavors. * Oh well... nobody has a sufficient solution to this * protocol bug yet. 
*/ - if(sysctl_tcp_rfc1337 == 0) { + if (sysctl_tcp_rfc1337 == 0) { +kill: tcp_tw_deschedule(tw); tcp_timewait_kill(tw); + tcp_tw_put(tw); + return TCP_TW_SUCCESS; } - } else { - tcp_tw_reschedule(tw); } + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); if (tp.saw_tstamp) { tw->ts_recent = tp.rcv_tsval; tw->ts_recent_stamp = xtime.tv_sec; } + tcp_tw_put(tw); return TCP_TW_SUCCESS; } @@ -1235,7 +1449,7 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, if (th->syn && !th->rst && !th->ack && !paws_reject && (after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt) || - (tp.saw_tstamp && tw->ts_recent != tp.rcv_tsval))) { + (tp.saw_tstamp && (s32)(tw->ts_recent - tp.rcv_tsval) < 0))) { u32 isn = tw->snd_nxt + 2; if (isn == 0) isn++; @@ -1243,20 +1457,18 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, return TCP_TW_SYN; } + if (paws_reject) + NET_INC_STATS_BH(PAWSEstabRejected); + if(!th->rst) { /* In this case we must reset the TIMEWAIT timer. - - If it is ACKless SYN it may be both old duplicate - and new good SYN with random sequence number <rcv_nxt. - Do not reschedule in the last case. + * + * If it is ACKless SYN it may be both old duplicate + * and new good SYN with random sequence number <rcv_nxt. + * Do not reschedule in the last case. */ - if (paws_reject || th->ack) { - tcp_tw_reschedule(tw); -#ifdef CONFIG_TCP_TW_RECYCLE - tw->rto = min(120*HZ, tw->rto<<1); - tw->ttd = jiffies + tw->rto; -#endif - } + if (paws_reject || th->ack) + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); /* Send ACK. Note, we do not put the bucket, * it will be released by caller. @@ -1267,8 +1479,8 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, return TCP_TW_SUCCESS; } -/* Enter the time wait state. This is always called from BH - * context. Essentially we whip up a timewait bucket, copy the +/* Enter the time wait state. This is called with locally disabled BH. + * Essentially we whip up a timewait bucket, copy the * relevant info into it from the SK, and mess with hash chains * and list linkage. */ @@ -1286,6 +1498,7 @@ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) sk->next->pprev = sk->pprev; *sk->pprev = sk->next; sk->pprev = NULL; + sock_prot_dec_use(sk->prot); } /* Step 2: Hash TW into TIMEWAIT half of established hash table. */ @@ -1312,41 +1525,49 @@ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) tw->tb->owners = (struct sock*)tw; tw->bind_pprev = &tw->tb->owners; spin_unlock(&bhead->lock); - - /* Step 4: Un-charge protocol socket in-use count. */ - sock_prot_dec_use(sk->prot); } /* - * Move a socket to time-wait. + * Move a socket to time-wait or dead fin-wait-2 state. */ -void tcp_time_wait(struct sock *sk) +void tcp_time_wait(struct sock *sk, int state, int timeo) { - struct tcp_tw_bucket *tw; + struct tcp_tw_bucket *tw = NULL; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int recycle_ok = 0; + + if (sysctl_tcp_tw_recycle && tp->ts_recent_stamp) + recycle_ok = tp->af_specific->remember_stamp(sk); + + if (tcp_tw_count < sysctl_tcp_max_tw_buckets) + tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); - tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); if(tw != NULL) { + int rto = (tp->rto<<2) - (tp->rto>>1); + /* Give us an identity. 
*/ tw->daddr = sk->daddr; tw->rcv_saddr = sk->rcv_saddr; tw->bound_dev_if= sk->bound_dev_if; tw->num = sk->num; tw->state = TCP_TIME_WAIT; + tw->substate = state; tw->sport = sk->sport; tw->dport = sk->dport; tw->family = sk->family; tw->reuse = sk->reuse; - tw->hashent = sk->hashent; - tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt; - tw->snd_nxt = sk->tp_pinfo.af_tcp.snd_nxt; - tw->ts_recent = sk->tp_pinfo.af_tcp.ts_recent; - tw->ts_recent_stamp= sk->tp_pinfo.af_tcp.ts_recent_stamp; -#ifdef CONFIG_TCP_TW_RECYCLE - tw->rto = sk->tp_pinfo.af_tcp.rto; - tw->ttd = jiffies + 2*tw->rto; -#endif + tw->rcv_wscale = tp->rcv_wscale; atomic_set(&tw->refcnt, 0); + tw->hashent = sk->hashent; + tw->rcv_nxt = tp->rcv_nxt; + tw->snd_nxt = tp->snd_nxt; + tw->rcv_wnd = tcp_receive_window(tp); + tw->syn_seq = tp->syn_seq; + tw->ts_recent = tp->ts_recent; + tw->ts_recent_stamp= tp->ts_recent_stamp; + tw->pprev_death = NULL; + #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if(tw->family == PF_INET6) { memcpy(&tw->v6_daddr, @@ -1361,22 +1582,28 @@ void tcp_time_wait(struct sock *sk) __tcp_tw_hashdance(sk, tw); /* Get the TIME_WAIT timeout firing. */ - tcp_tw_schedule(tw); + if (timeo < rto) + timeo = rto; - /* CLOSE the SK. */ - if(sk->state == TCP_ESTABLISHED) - tcp_statistics[smp_processor_id()*2].TcpCurrEstab--; - sk->state = TCP_CLOSE; + if (recycle_ok) { + tw->timeout = rto; + } else { + tw->timeout = TCP_TIMEWAIT_LEN; + if (state == TCP_TIME_WAIT) + timeo = TCP_TIMEWAIT_LEN; + } + + tcp_tw_schedule(tw, timeo); } else { - /* Sorry, we're out of memory, just CLOSE this + /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than * non-graceful socket closings. */ - tcp_set_state(sk, TCP_CLOSE); + if (net_ratelimit()) + printk(KERN_INFO "TCP: time wait bucket table overflow\n"); } tcp_update_metrics(sk); - tcp_clear_xmit_timers(sk); tcp_done(sk); } @@ -1397,10 +1624,13 @@ void tcp_time_wait(struct sock *sk) static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) { - sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + tp->fin_seq = TCP_SKB_CB(skb)->end_seq; tcp_send_ack(sk); + sk->shutdown |= RCV_SHUTDOWN; + switch(sk->state) { case TCP_SYN_RECV: case TCP_ESTABLISHED: @@ -1427,7 +1657,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) break; case TCP_FIN_WAIT2: /* Received a FIN -- send ACK and enter TIME_WAIT. */ - tcp_time_wait(sk); + tcp_time_wait(sk, TCP_TIME_WAIT, 0); break; default: /* Only TCP_LISTEN and TCP_CLOSE are left, in these @@ -1435,9 +1665,17 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) */ printk("tcp_fin: Impossible, sk->state=%d\n", sk->state); break; - } + }; + + /* It _is_ possible, that we have something out-of-order _after_ FIN. + * Probably, we should reset in this case. For now drop them. + */ + __skb_queue_purge(&tp->out_of_order_queue); + if (tp->sack_ok) + tp->num_sacks = 0; + if (!sk->dead) { - wake_up_interruptible(sk->sleep); + sk->state_change(sk); sock_wake_async(sk->socket, 1, POLL_HUP); } } @@ -1622,6 +1860,7 @@ static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sp->end_seq = TCP_SKB_CB(new_skb)->end_seq; } + /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. 
*/ @@ -1658,6 +1897,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct sk_buff *skb1; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int eaten = 0; /* Queue data for delivery to the user. * Packets in sequence go to the receive queue. @@ -1665,33 +1905,68 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { /* Ok. In sequence. */ - queue_and_out: + if (tp->ucopy.task == current && + tp->copied_seq == tp->rcv_nxt && + tp->ucopy.len && + sk->lock.users && + !tp->urg_data) { + int chunk = min(skb->len, tp->ucopy.len); + + local_bh_enable(); + if (memcpy_toiovec(tp->ucopy.iov, skb->data, chunk)) { + sk->err = EFAULT; + sk->error_report(sk); + } + local_bh_disable(); + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; + eaten = (chunk == skb->len && !skb->h.th->fin); + } + + if (!eaten) { +queue_and_out: + skb_set_owner_r(skb, sk); + __skb_queue_tail(&sk->receive_queue, skb); + } dst_confirm(sk->dst_cache); - __skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if(skb->h.th->fin) { + if(skb->len) + tcp_event_data_recv(tp, skb); + if(skb->h.th->fin) tcp_fin(skb, sk, skb->h.th); - } else { - tcp_remember_ack(tp, skb->h.th, skb); - } + /* This may have eaten into a SACK block. */ if(tp->sack_ok && tp->num_sacks) tcp_sack_remove_skb(tp, skb); tcp_ofo_queue(sk); /* Turn on fast path. */ - if (skb_queue_len(&tp->out_of_order_queue) == 0) - tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) | - ntohl(TCP_FLAG_ACK) | - tp->snd_wnd); + if (skb_queue_len(&tp->out_of_order_queue) == 0 && +#ifdef TCP_FORMAL_WINDOW + tcp_receive_window(tp) && +#endif + !tp->urg_data) + tcp_fast_path_on(tp); + + if (eaten) + kfree_skb(skb); + + if (!sk->dead) { + wake_up_interruptible(sk->sleep); + sock_wake_async(sk->socket,1, POLL_IN); + } return; } - + /* An old packet, either a retransmit or some packet got lost. */ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { - /* A retransmit, 2nd most common case. Force an imediate ack. */ - SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq); + /* A retransmit, 2nd most common case. Force an imediate ack. + * + * It is impossible, seq is checked by top level. + */ + NETDEBUG(printk("retransmit in tcp_data_queue: seq %X\n", TCP_SKB_CB(skb)->seq)); tcp_enter_quickack_mode(tp); + tp->ack.pending = 1; kfree_skb(skb); return; } @@ -1706,15 +1981,17 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) } /* Ok. This is an out_of_order segment, force an ack. */ - tp->delayed_acks++; - tcp_enter_quickack_mode(tp); + tp->ack.pending = 1; /* Disable header prediction. */ tp->pred_flags = 0; + SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + skb_set_owner_r(skb, sk); + if (skb_peek(&tp->out_of_order_queue) == NULL) { /* Initial out of order segment, build 1 SACK. */ if(tp->sack_ok) { @@ -1758,6 +2035,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) } } } + return; } @@ -1767,7 +2045,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) * room, then we will just have to discard the packet. 
*/ -static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) +static void tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) { struct tcphdr *th; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); @@ -1777,11 +2055,11 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) skb_trim(skb, len - (th->doff*4)); if (skb->len == 0 && !th->fin) - return(0); + goto drop; /* * If our receive queue has grown past its limits shrink it. - * Make sure to do this before moving snd_nxt, otherwise + * Make sure to do this before moving rcv_nxt, otherwise * data might be acked for that we don't have enough room. */ if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) { @@ -1789,7 +2067,7 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) /* Still not enough room. That can happen when * skb->true_size differs significantly from skb->len. */ - return 0; + goto drop; } } @@ -1799,29 +2077,20 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n"); tp->rcv_nxt = tp->copied_seq; } + return; - /* Above, tcp_data_queue() increments delayed_acks appropriately. - * Now tell the user we may have some data. - */ - if (!sk->dead) { - wake_up_interruptible(sk->sleep); - sock_wake_async(sk->socket,1, POLL_IN); - } - return(1); +drop: + kfree_skb(skb); } static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) && - tcp_packets_in_flight(tp) < tp->snd_cwnd) { - /* Put more data onto the wire. */ - tcp_write_xmit(sk); - } else if (tp->packets_out == 0 && !tp->pending) { - /* Start probing the receivers window. */ - tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); - } + if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || + tcp_packets_in_flight(tp) >= tp->snd_cwnd || + tcp_write_xmit(sk)) + tcp_check_probe_timer(sk, tp); } static __inline__ void tcp_data_snd_check(struct sock *sk) @@ -1832,57 +2101,6 @@ static __inline__ void tcp_data_snd_check(struct sock *sk) __tcp_data_snd_check(sk, skb); } -/* - * Adapt the MSS value used to make delayed ack decision to the - * real world. - * - * The constant 536 hasn't any good meaning. In IPv4 world - * MTU may be smaller, though it contradicts to RFC1122, which - * states that MSS must be at least 536. - * We use the constant to do not ACK each second - * packet in a stream of tiny size packets. - * It means that super-low mtu links will be aggressively delacked. - * Seems, it is even good. If they have so low mtu, they are weirdly - * slow. - * - * AK: BTW it may be useful to add an option to lock the rcv_mss. - * this way the beowulf people wouldn't need ugly patches to get the - * ack frequencies they want and it would be an elegant way to tune delack. - */ -static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - unsigned int len, lss; - - lss = tp->last_seg_size; - tp->last_seg_size = 0; - - /* skb->len may jitter because of SACKs, even if peer - * sends good full-sized frames. - */ - len = skb->len; - if (len >= tp->rcv_mss) { - tp->rcv_mss = len; - } else { - /* Otherwise, we make more careful check taking into account, - * that SACKs block is variable. - * - * "len" is invariant segment length, including TCP header. 
- */ - len = skb->tail - skb->h.raw; - if (len >= 536 + sizeof(struct tcphdr)) { - /* Subtract also invariant (if peer is RFC compliant), - * tcp header plus fixed timestamp option length. - * Resulting "len" is MSS free of SACK jitter. - */ - len -= tp->tcp_header_len; - if (len == lss) - tp->rcv_mss = len; - tp->last_seg_size = len; - } - } -} - /* * Check if sending an ack is needed. */ @@ -1904,26 +2122,25 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) * start in an expediant manner. */ - /* Two full frames received or... */ - if (((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) || - /* We will update the window "significantly" or... */ - tcp_raise_window(sk) || - /* We entered "quick ACK" mode or... */ + /* More than one full frame received or... */ + if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss) || + /* We ACK each frame or... */ tcp_in_quickack_mode(tp) || - /* We have out of order data */ - (ofo_possible && (skb_peek(&tp->out_of_order_queue) != NULL))) { + /* We have out of order data or */ + (ofo_possible && + skb_peek(&tp->out_of_order_queue) != NULL)) { /* Then ack it now */ tcp_send_ack(sk); } else { /* Else, send delayed ack. */ - tcp_send_delayed_ack(sk, HZ/2); + tcp_send_delayed_ack(sk); } } static __inline__ void tcp_ack_snd_check(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - if (tp->delayed_acks == 0) { + if (tp->ack.pending == 0) { /* We sent a data segment already. */ return; } @@ -1975,7 +2192,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) */ if (tp->urg_seq == tp->copied_seq) tp->copied_seq++; /* Move the copied sequence on correctly */ - tp->urg_data = URG_NOTYET; + tp->urg_data = TCP_URG_NOTYET; tp->urg_seq = ptr; /* Disable header prediction. */ @@ -1992,12 +2209,12 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len tcp_check_urg(sk,th); /* Do we wait for any urgent data? - normally not... */ - if (tp->urg_data == URG_NOTYET) { + if (tp->urg_data == TCP_URG_NOTYET) { u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4); /* Is the urgent pointer pointing into this packet? */ if (ptr < len) { - tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th); + tp->urg_data = TCP_URG_VALID | *(ptr + (unsigned char *) th); if (!sk->dead) sk->data_ready(sk,0); } @@ -2014,7 +2231,8 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len static int prune_queue(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - struct sk_buff * skb; + struct sk_buff *skb; + int pruned = 0; SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); @@ -2024,7 +2242,9 @@ static int prune_queue(struct sock *sk) skb = __skb_dequeue_tail(&tp->out_of_order_queue); if(skb != NULL) { /* Free it all. */ - do { net_statistics[smp_processor_id()*2].OfoPruned += skb->len; + do { + pruned += skb->len; + net_statistics[smp_processor_id()*2].OfoPruned += skb->len; kfree_skb(skb); skb = __skb_dequeue_tail(&tp->out_of_order_queue); } while(skb != NULL); @@ -2059,13 +2279,47 @@ static int prune_queue(struct sock *sk) * if we are really having our buffer space abused we stop accepting * new receive data. * + * 8) The arguments are interesting, but I even cannot imagine + * what kind of arguments could force us to drop NICE, ALREADY + * RECEIVED DATA only to get one more packet? --ANK + * * FIXME: it should recompute SACK state and only remove enough * buffers to get into bounds again. 
The current scheme loses - * badly sometimes on links with large RTT, especially when - * the driver has high overhead per skb. - * (increasing the rcvbuf is not enough because it inflates the - * the window too, disabling flow control effectively) -AK + * badly sometimes on links with large RTT, especially when + * the driver has high overhead per skb. + * (increasing the rcvbuf is not enough because it inflates the + * the window too, disabling flow control effectively) -AK + * + * Mmm... Why not to scale it seprately then? Just replace + * / WINDOW_ADVERTISE_DIVISOR with >> sk->window_advertise_scale + * and adjust it dynamically, when TCP window flow control + * fails? -ANK + */ + + /* F.e. one possible tactics is: */ + do { + u32 new_clamp = (tp->rcv_nxt-tp->copied_seq) + pruned; + + /* This guy is not a good guy. I bet, he martirized cats, + * when was child and grew up to finished sadist. Clamp him! + */ + if (new_clamp > 3*tp->ack.rcv_mss) + new_clamp -= tp->ack.rcv_mss; + else + new_clamp = 2*tp->ack.rcv_mss; + tp->window_clamp = min(tp->window_clamp, new_clamp); + } while (0); + /* Though it should be made earlier, when we are still not + * congested. This header prediction logic sucks + * without true implementation of VJ algorithm. + * I am really anxious. How was it possible to combine + * header prediction and sending ACKs outside of recvmsg() context? + * They _are_ incompatible. We should not advance window so + * brainlessly and we should not advertise so huge window from the very + * beginning. BTW window "prediction" does not speedup anything! + * SIlly, silly, silly. */ + if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1)) return 0; @@ -2073,6 +2327,57 @@ static int prune_queue(struct sock *sk) return -1; } +static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int chunk = skb->len - hlen; + int err; + + local_bh_enable(); + if (skb->ip_summed==CHECKSUM_UNNECESSARY) + err = memcpy_toiovec(tp->ucopy.iov, skb->h.raw + hlen, chunk); + else + err = copy_and_csum_toiovec(tp->ucopy.iov, skb, hlen); + + if (!err) { +update: + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; + local_bh_disable(); + return 0; + } + + if (err == -EFAULT) { + sk->err = EFAULT; + sk->error_report(sk); + goto update; + } + + local_bh_disable(); + return err; +} + +static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +{ + int result; + + if (sk->lock.users) { + local_bh_enable(); + result = __tcp_checksum_complete(skb); + local_bh_disable(); + } else { + result = __tcp_checksum_complete(skb); + } + return result; +} + +static __inline__ int +tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +{ + return skb->ip_summed != CHECKSUM_UNNECESSARY && + __tcp_checksum_complete_user(sk, skb); +} + /* * TCP receive function for the ESTABLISHED state. * @@ -2080,7 +2385,33 @@ static int prune_queue(struct sock *sk) * disabled when: * - A zero window was announced from us - zero window probing * is only handled properly in the slow path. - * - Out of order segments arrived. + * [ NOTE: actually, it was made incorrectly and nobody ever noticed + * this! Reason is clear: 1. Correct senders do not send + * to zero window. 2. Even if a sender sends to zero window, + * nothing terrible occurs. + * + * For now I cleaned this and fast path is really always disabled, + * when window is zero, but I would be more happy to remove these + * checks. Code will be only cleaner and _faster_. 
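The clamping tactic sketched in prune_queue() above (shrink window_clamp to roughly what the peer forced us to buffer, never below two segments) can be written as a pure function for clarity. The names and sample numbers are illustrative.

#include <stdio.h>
#include <stdint.h>

static uint32_t clamp_after_prune(uint32_t window_clamp, uint32_t unread,
                                  uint32_t pruned, uint32_t rcv_mss)
{
    uint32_t new_clamp = unread + pruned;   /* data the peer forced us to hold */

    if (new_clamp > 3 * rcv_mss)
        new_clamp -= rcv_mss;               /* shave one MSS off the budget    */
    else
        new_clamp = 2 * rcv_mss;            /* never drop below two segments   */

    return new_clamp < window_clamp ? new_clamp : window_clamp;
}

int main(void)
{
    /* 64 KB clamp, 20 KB unread plus 10 KB pruned, 1460-byte MSS. */
    printf("%u\n", clamp_after_prune(65535, 20480, 10240, 1460));
    return 0;
}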
--ANK + * + * Later note. I've just found that slow path also accepts + * out of window segments, look at tcp_sequence(). So... + * it is the last argument: I repair all and comment out + * repaired code by TCP_FORMAL_WINDOW. + * [ I remember one rhyme from a chidren's book. (I apologize, + * the trasnlation is not rhymed 8)): people in one (jewish) village + * decided to build sauna, but divided to two parties. + * The first one insisted that battens should not be dubbed, + * another objected that foots will suffer of splinters, + * the first fended that dubbed wet battens are too slippy + * and people will fall and it is much more serious! + * Certaiinly, all they went to rabbi. + * After some thinking, he judged: "Do not be lazy! + * Certainly, dub the battens! But put them by dubbed surface down." + * ] + * ] + * + * - Out of order segments arrived. * - Urgent data is expected. * - There is no buffer space left * - Unexpected TCP flags/window values/header lengths are received @@ -2088,7 +2419,7 @@ static int prune_queue(struct sock *sk) * - Data is sent in both directions. Fast path only supports pure senders * or pure receivers (this means either the sequence number or the ack * value must stay constant) - * - Unexpected TCP option. + * - Unexpected TCP option. * * When these conditions are not satisfied it drops into a standard * receive procedure patterned after RFC793 to handle all cases. @@ -2116,7 +2447,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, * We do checksum and copy also but from device to kernel. */ - /* RED-PEN. Using static variables to pass function arguments * cannot be good idea... */ @@ -2133,13 +2463,12 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if ((tcp_flag_word(th) & ~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { - int tcp_header_len = th->doff*4; - - /* Timestamp header prediction */ + int tcp_header_len = tp->tcp_header_len; - /* Non-standard header f.e. SACKs -> slow path */ - if (tcp_header_len != tp->tcp_header_len) - goto slow_path; + /* Timestamp header prediction: tcp_header_len + * is automatically equal to th->doff*4 due to pred_flags + * match. + */ /* Check timestamp */ if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { @@ -2161,8 +2490,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, goto slow_path; /* Predicted packet is in window by definition. - seq == rcv_nxt and last_ack_sent <= rcv_nxt. - Hence, check seq<=last_ack_sent reduces to: + * seq == rcv_nxt and last_ack_sent <= rcv_nxt. + * Hence, check seq<=last_ack_sent reduces to: */ if (tp->rcv_nxt == tp->last_ack_sent) { tp->ts_recent = tp->rcv_tsval; @@ -2173,6 +2502,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (len <= tcp_header_len) { /* Bulk data transfer: sender */ if (len == tcp_header_len) { + /* We know that such packets are checksummed + * on entry. + */ tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len); kfree_skb(skb); @@ -2182,19 +2514,42 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, TCP_INC_STATS_BH(TcpInErrs); goto discard; } - } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una && - atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) { - /* Bulk data transfer: receiver */ - __skb_pull(skb,tcp_header_len); + } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una) { + int eaten = 0; - /* Is it possible to simplify this? 
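/*
 * Illustrative sketch, not part of the patch: the header-prediction test
 * behind the fast path above.  pred_flags is assumed to be a host-order
 * copy of the expected 32-bit TCP header word at offset 12 (data offset,
 * flags, window); the mask constants and the raw-buffer access are
 * simplifications of the kernel's tcp_flag_word() usage.
 */
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

#define TCP_RESERVED_BITS 0x0F000000U   /* 4 reserved bits after the data offset */
#define TCP_FLAG_PSH      0x00080000U   /* PSH may legitimately vary per segment */

static int fast_path_candidate(const uint8_t *tcp_hdr, uint32_t pred_flags,
                               uint32_t seq, uint32_t rcv_nxt)
{
    uint32_t flag_word;

    memcpy(&flag_word, tcp_hdr + 12, sizeof(flag_word));
    flag_word = ntohl(flag_word);

    return (flag_word & ~(TCP_RESERVED_BITS | TCP_FLAG_PSH)) == pred_flags &&
           seq == rcv_nxt;              /* in order and nothing unusual in the header */
}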
*/ - tcp_measure_rcv_mss(sk, skb); + if (tp->ucopy.task == current && + tp->copied_seq == tp->rcv_nxt && + len - tcp_header_len <= tp->ucopy.len && + sk->lock.users) { + eaten = 1; + + NET_INC_STATS_BH(TCPHPHitsToUser); + + if (tcp_copy_to_iovec(sk, skb, tcp_header_len)) + goto csum_error; + + __skb_pull(skb,tcp_header_len); + } else { + if (tcp_checksum_complete_user(sk, skb)) + goto csum_error; + + if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) + goto step5; + + NET_INC_STATS_BH(TCPHPHits); + + /* Bulk data transfer: receiver */ + __skb_pull(skb,tcp_header_len); + + /* DO NOT notify forward progress here. + * It saves dozen of CPU instructions in fast path. --ANK + * And where is it signaled then ? -AK + * Nowhere. 8) --ANK + */ + __skb_queue_tail(&sk->receive_queue, skb); + skb_set_owner_r(skb, sk); + } - /* DO NOT notify forward progress here. - * It saves dozen of CPU instructions in fast path. --ANK - * And where is it signaled then ? -AK - */ - __skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; /* FIN bit check is not done since if FIN is set in @@ -2202,27 +2557,43 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ wake_up_interruptible(sk->sleep); sock_wake_async(sk->socket,1, POLL_IN); - tcp_delack_estimator(tp); - tcp_remember_ack(tp, th, skb); + tcp_event_data_recv(tp, skb); +#if 1/*def CONFIG_TCP_MORE_COARSE_ACKS*/ + if (eaten) { + if (tcp_in_quickack_mode(tp)) { + tcp_send_ack(sk); + } else { + tcp_send_delayed_ack(sk); + } + } else +#endif __tcp_ack_snd_check(sk, 0); + + if (eaten) + kfree_skb(skb); return 0; } /* Packet is in sequence, flags are trivial; - * only ACK is strange or we are tough on memory. - * Jump to step 5. + * only ACK is strange. Jump to step 5. */ + if (tcp_checksum_complete_user(sk, skb)) + goto csum_error; goto step5; } slow_path: + if (tcp_checksum_complete_user(sk, skb)) + goto csum_error; + /* * RFC1323: H1. Apply PAWS check first. */ if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp && tcp_paws_discard(tp, skb)) { if (!th->rst) { + NET_INC_STATS_BH(PAWSEstabRejected); tcp_send_ack(sk); goto discard; } @@ -2251,7 +2622,9 @@ slow_path: TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_wup, tp->rcv_wnd); } + tcp_enter_quickack_mode(tp); tcp_send_ack(sk); + NET_INC_STATS_BH(DelayedACKLost); goto discard; } @@ -2279,11 +2652,8 @@ step5: /* Process urgent data. */ tcp_urg(sk, th, len); - { /* step 7: process the segment text */ - int queued = tcp_data(skb, sk, len); - - tcp_measure_rcv_mss(sk, skb); + tcp_data(skb, sk, len); /* Be careful, tcp_data() may have put this into TIME_WAIT. 
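/*
 * Illustrative sketch, not part of the patch: the core of the PAWS test
 * applied on the slow path above.  Constants and names are simplified;
 * the real tcp_paws_discard() also special-cases RST segments.
 */
#include <stdint.h>
#include <time.h>

#define PAWS_WINDOW  1                       /* tolerated timestamp "jitter" in ticks */
#define PAWS_24DAYS  (60 * 60 * 24 * 24)     /* an older ts_recent is considered stale */

static int paws_reject(uint32_t rcv_tsval, uint32_t ts_recent,
                       time_t ts_recent_stamp, time_t now)
{
    /* Signed 32-bit difference handles timestamp wrap-around. */
    return (int32_t)(ts_recent - rcv_tsval) > PAWS_WINDOW &&
           now < ts_recent_stamp + PAWS_24DAYS;
}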
*/ if(sk->state != TCP_CLOSE) { @@ -2291,12 +2661,13 @@ step5: tcp_ack_snd_check(sk); } - if (!queued) { - discard: - kfree_skb(skb); - } - } + return 0; + +csum_error: + TCP_INC_STATS_BH(TcpInErrs); +discard: + kfree_skb(skb); return 0; } @@ -2328,6 +2699,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newsk->dport = req->rmt_port; sock_lock_init(newsk); + bh_lock_sock(newsk); atomic_set(&newsk->rmem_alloc, 0); skb_queue_head_init(&newsk->receive_queue); @@ -2351,22 +2723,27 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->rcv_nxt = req->rcv_isn + 1; newtp->snd_nxt = req->snt_isn + 1; newtp->snd_una = req->snt_isn + 1; - newtp->srtt = 0; - newtp->ato = 0; + newtp->snd_sml = req->snt_isn + 1; + + tcp_delack_init(newtp); + if (skb->len >= 536) + newtp->ack.last_seg_size = skb->len; + + tcp_prequeue_init(newtp); + newtp->snd_wl1 = req->rcv_isn; newtp->snd_wl2 = req->snt_isn; - /* RFC1323: The window in SYN & SYN/ACK segments - * is never scaled. - */ - newtp->snd_wnd = ntohs(skb->h.th->window); - - newtp->max_window = newtp->snd_wnd; - newtp->pending = 0; newtp->retransmits = 0; - newtp->last_ack_sent = req->rcv_isn + 1; newtp->backoff = 0; + newtp->srtt = 0; newtp->mdev = TCP_TIMEOUT_INIT; + newtp->rto = TCP_TIMEOUT_INIT; + + newtp->packets_out = 0; + newtp->fackets_out = 0; + newtp->retrans_out = 0; + newtp->snd_ssthresh = 0x7fffffff; /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control @@ -2374,22 +2751,11 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, * efficiently to them. -DaveM */ newtp->snd_cwnd = 2; - - newtp->rto = TCP_TIMEOUT_INIT; - newtp->packets_out = 0; - newtp->fackets_out = 0; - newtp->retrans_out = 0; - newtp->high_seq = 0; - newtp->snd_ssthresh = 0x7fffffff; newtp->snd_cwnd_cnt = 0; + newtp->high_seq = 0; + newtp->dup_acks = 0; - newtp->delayed_acks = 0; - init_timer(&newtp->retransmit_timer); - newtp->retransmit_timer.function = &tcp_retransmit_timer; - newtp->retransmit_timer.data = (unsigned long) newsk; - init_timer(&newtp->delack_timer); - newtp->delack_timer.function = &tcp_delack_timer; - newtp->delack_timer.data = (unsigned long) newsk; + tcp_init_xmit_timers(newsk); skb_queue_head_init(&newtp->out_of_order_queue); newtp->send_head = newtp->retrans_head = NULL; newtp->rcv_wup = req->rcv_isn + 1; @@ -2397,31 +2763,25 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->copied_seq = req->rcv_isn + 1; newtp->saw_tstamp = 0; + newtp->last_ack_sent = req->rcv_isn + 1; - init_timer(&newtp->probe_timer); - newtp->probe_timer.function = &tcp_probe_timer; - newtp->probe_timer.data = (unsigned long) newsk; newtp->probes_out = 0; newtp->syn_seq = req->rcv_isn; newtp->fin_seq = req->rcv_isn; newtp->urg_data = 0; - tcp_synq_init(newtp); - newtp->syn_backlog = 0; - if (skb->len >= 536) - newtp->last_seg_size = skb->len; + newtp->listen_opt = NULL; + newtp->accept_queue = NULL; + /* Deinitialize syn_wait_lock to trap illegal accesses. */ + memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock)); /* Back to base struct sock members. 
*/ newsk->err = 0; - newsk->ack_backlog = 0; - newsk->max_ack_backlog = SOMAXCONN; newsk->priority = 0; atomic_set(&newsk->refcnt, 1); +#ifdef INET_REFCNT_DEBUG atomic_inc(&inet_sock_nr); +#endif - spin_lock_init(&sk->timer_lock); - init_timer(&newsk->timer); - newsk->timer.function = &tcp_keepalive_timer; - newsk->timer.data = (unsigned long) newsk; if (newsk->keepopen) tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); newsk->socket = NULL; @@ -2440,6 +2800,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->snd_wscale = newtp->rcv_wscale = 0; newtp->window_clamp = min(newtp->window_clamp,65535); } + newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->snd_wscale; + newtp->max_window = newtp->snd_wnd; + if (newtp->tstamp_ok) { newtp->ts_recent = req->ts_recent; newtp->ts_recent_stamp = xtime.tv_sec; @@ -2453,16 +2816,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, return newsk; } -static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) -{ - if (seq == s_win) - return 1; - if (after(end_seq, s_win) && before(seq, e_win)) - return 1; - return (seq == e_win && seq == end_seq); -} - - /* * Process an incoming packet for SYN_RECV sockets represented * as an open_request. @@ -2470,30 +2823,28 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, struct open_request *req, - struct open_request *prev) + struct open_request **prev) { struct tcphdr *th = skb->h.th; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); int paws_reject = 0; struct tcp_opt ttp; - - /* If socket has already been created, process - packet in its context. - - We fall here only due to race, when packets were enqueued - to backlog of listening socket. - */ - if (req->sk) - return req->sk; + struct sock *child; ttp.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { - tcp_parse_options(NULL, th, &ttp, 0); - paws_reject = ttp.saw_tstamp && - (s32)(ttp.rcv_tsval - req->ts_recent) < 0; + if (ttp.saw_tstamp) { + ttp.ts_recent = req->ts_recent; + /* We do not store true stamp, but it is not required, + * it can be estimated (approximately) + * from another data. + */ + ttp.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); + paws_reject = tcp_paws_check(&ttp, th->rst); + } } /* Check for pure retransmited SYN. */ @@ -2517,7 +2868,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, * Enforce "SYN-ACK" according to figure 8, figure 6 * of RFC793, fixed by RFC1122. */ - req->class->rtx_syn_ack(sk, req); + req->class->rtx_syn_ack(sk, req, NULL); return NULL; } @@ -2544,6 +2895,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST)) req->class->send_ack(skb, req); + if (paws_reject) + NET_INC_STATS_BH(PAWSEstabRejected); return NULL; } @@ -2572,35 +2925,78 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, /* Invalid ACK: reset will be sent by listening socket */ if (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1) return sk; - - /* OK, ACK is valid, create big socket and - feed this segment to it. It will repeat all - the tests. THIS SEGMENT MUST MOVE SOCKET TO - ESTABLISHED STATE. If it will be dropped after - socket is created, wait for troubles. 
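/*
 * Illustrative sketch, not part of the patch: how tcp_check_req() above can
 * run PAWS against a half-open request even though no true timestamp age is
 * stored -- the age is estimated from the SYN-ACK retransmit count.
 * TCP_TIMEOUT_INIT_SEC is an assumed constant (initial RTO in seconds).
 */
#include <stdint.h>
#include <time.h>

#define TCP_TIMEOUT_INIT_SEC 3

struct mini_req {
    uint32_t ts_recent;   /* timestamp carried by the original SYN  */
    unsigned retrans;     /* number of SYN-ACK retransmissions      */
};

static time_t estimated_ts_stamp(const struct mini_req *req, time_t now)
{
    /* The SYN arrived roughly one (backed-off) RTO per retransmit ago. */
    return now - ((time_t)TCP_TIMEOUT_INIT_SEC << req->retrans);
}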
+ /* Also, it would be not so bad idea to check rcv_tsecr, which + * is essentially ACK extension and too early or too late values + * should cause reset in unsynchronized states. */ - sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); - if (sk == NULL) + + /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ + if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) { + req->acked = 1; return NULL; + } - tcp_dec_slow_timer(TCP_SLT_SYNACK); - req->sk = sk; - return sk; + /* OK, ACK is valid, create big socket and + * feed this segment to it. It will repeat all + * the tests. THIS SEGMENT MUST MOVE SOCKET TO + * ESTABLISHED STATE. If it will be dropped after + * socket is created, wait for troubles. + */ + child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); + if (child == NULL) + goto listen_overflow; -embryonic_reset: tcp_synq_unlink(tp, req, prev); - tp->syn_backlog--; - tcp_dec_slow_timer(TCP_SLT_SYNACK); + tcp_synq_removed(sk, req); + + tcp_acceptq_queue(sk, req, child); + return child; +listen_overflow: + if (!sysctl_tcp_abort_on_overflow) { + req->acked = 1; + return NULL; + } + +embryonic_reset: NET_INC_STATS_BH(EmbryonicRsts); if (!(flg & TCP_FLAG_RST)) req->class->send_reset(skb); - req->class->destructor(req); - tcp_openreq_free(req); + tcp_synq_drop(sk, req, prev); return NULL; } +/* + * Queue segment on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket. + */ + +int tcp_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb) +{ + int ret = 0; + int state = child->state; + + if (child->lock.users == 0) { + ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len); + + /* Wakeup parent, send SIGIO */ + if (state == TCP_SYN_RECV && child->state != state) + parent->data_ready(parent, 0); + } else { + /* Alas, it is possible again, because we do lookup + * in main socket hash table and lock on listening + * socket does not protect us more. + */ + sk_add_backlog(child, skb); + } + + bh_unlock_sock(child); + return ret; +} + static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { @@ -2608,25 +3004,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_parse_options(sk, th, tp, 0); -#ifdef CONFIG_TCP_TW_RECYCLE - if (tp->ts_recent_stamp && tp->saw_tstamp && !th->rst && - (s32)(tp->rcv_tsval - tp->ts_recent) < 0 && - xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS) { - /* Old duplicate segment. We remember last - ts_recent from this host in timewait bucket. - - Actually, we could implement per host cache - to truncate timewait state after RTO. Paranoidal arguments - of rfc1337 are not enough to close this nice possibility. - */ - if (net_ratelimit()) - printk(KERN_DEBUG "TCP: tw recycle, PAWS worked. Good.\n"); - if (th->ack) - return 1; - goto discard; - } -#endif - if (th->ack) { /* rfc793: * "If the state is SYN-SENT then @@ -2646,10 +3023,36 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * We do not send data with SYN, so that RFC-correct * test reduces to: */ - if (sk->zapped || - TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) + if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) return 1; + /* Check not from any RFC, but it is evident consequence + * of combining PAWS and usual SYN-SENT logic: ACK _is_ + * checked in SYN-SENT unlike another states, hence + * echoed tstamp must be checked too. 
+ */ + if (tp->saw_tstamp) { + if (tp->rcv_tsecr == 0) { + /* Workaround for bug in linux-2.1 and early + * 2.2 kernels. Let's pretend that we did not + * see such timestamp to avoid bogus rtt value, + * calculated by tcp_ack(). + */ + tp->saw_tstamp = 0; + + /* But do not forget to store peer's timestamp! */ + if (th->syn) { + tp->ts_recent = tp->rcv_tsval; + tp->ts_recent_stamp = xtime.tv_sec; + } + } else if ((__s32)(tp->rcv_tsecr - tcp_time_stamp) > 0 || + (__s32)(tp->rcv_tsecr - tp->syn_stamp) < 0) { + NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "TCP: synsent reject.\n")); + NET_INC_STATS_BH(PAWSActiveRejected); + return 1; + } + } + /* Now ACK is acceptable. * * "If the RST bit is set @@ -2689,18 +3092,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * because tcp_ack check is too weak for SYN-SENT) * causes moving socket to invalid semi-SYN-SENT, * semi-ESTABLISHED state and connection hangs. - * - * There exist buggy stacks, which really send - * such ACKs: f.e. 202.226.91.94 (okigate.oki.co.jp) - * Actually, if this host did not try to get something - * from ftp.inr.ac.ru I'd never find this bug 8) - * * --ANK (990514) * - * I was wrong, I apologize. Bare ACK is valid. + * Bare ACK is valid, however. * Actually, RFC793 requires to send such ACK * in reply to any out of window packet. - * It is wrong, but Linux also does it sometimes. + * It is wrong, but Linux also send such + * useless ACKs sometimes. * --ANK (990724) */ @@ -2717,7 +3115,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. */ - tp->snd_wnd = htons(th->window); + tp->snd_wnd = ntohs(th->window); tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq; tp->fin_seq = TCP_SKB_CB(skb)->seq; @@ -2742,26 +3140,35 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_initialize_rcv_mss(sk); tcp_init_metrics(sk); + if (sk->keepopen) + tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); + + tp->copied_seq = tp->rcv_nxt; + __tcp_fast_path_on(tp, tp->snd_wnd); + + if(!sk->dead) { + sk->state_change(sk); + sock_wake_async(sk->socket, 0, POLL_OUT); + } + if (tp->write_pending) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. * - * How to make this correctly? + * It may be deleted, but with this feature tcpdumps + * look so _wonderfully_ clever, that I was not able + * to stand against the temptation 8) --ANK */ - tp->delayed_acks++; - if (tp->ato == 0) - tp->ato = tp->rto; - tcp_send_delayed_ack(sk, tp->rto); + tp->ack.pending = 1; + tp->ack.lrcvtime = tcp_time_stamp; + tcp_enter_quickack_mode(tp); + tp->ack.pingpong = 1; + tp->ack.ato = TCP_ATO_MIN; + tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MIN); + goto discard; } else { tcp_send_ack(sk); } - - tp->copied_seq = tp->rcv_nxt; - - if(!sk->dead) { - wake_up_interruptible(sk->sleep); - sock_wake_async(sk->socket, 0, POLL_OUT); - } return -1; } @@ -2777,6 +3184,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, goto discard; } + /* PAWS check. */ + if (tp->ts_recent_stamp && tp->saw_tstamp && tcp_paws_check(tp, 0)) + goto discard; + if (th->syn) { /* We see SYN without ACK. It is attempt of * simultaneous connect with crossed SYNs. @@ -2800,8 +3211,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. 
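/*
 * Illustrative sketch, not part of the patch: the echoed-timestamp sanity
 * check described above for SYN-SENT.  A SYN-ACK must echo a timestamp we
 * actually sent, i.e. one that is neither in the future nor older than the
 * stamp placed on our SYN.  All names are simplified stand-ins.
 */
#include <stdint.h>

/* Returns 1 if the segment should be rejected, 0 otherwise. */
static int synsent_tsecr_invalid(uint32_t rcv_tsecr, uint32_t now_ts,
                                 uint32_t syn_stamp)
{
    if (rcv_tsecr == 0)
        return 0;   /* buggy peers echo 0; treated as if no timestamp was echoed */
    return (int32_t)(rcv_tsecr - now_ts) > 0 ||     /* from the future    */
           (int32_t)(rcv_tsecr - syn_stamp) < 0;    /* older than our SYN */
}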
*/ - tp->snd_wnd = htons(th->window); + tp->snd_wnd = ntohs(th->window); tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tp->max_window = tp->snd_wnd; tcp_sync_mss(sk, tp->pmtu_cookie); tcp_initialize_rcv_mss(sk); @@ -2960,6 +3372,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, #endif ) { if (!th->rst) { + NET_INC_STATS_BH(DelayedACKLost); + tcp_enter_quickack_mode(tp); tcp_send_ack(sk); } goto discard; @@ -3011,28 +3425,29 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->copied_seq = tp->rcv_nxt; /* Note, that this wakeup is only for marginal - crossed SYN case. Passively open sockets - are not waked up, because sk->sleep == NULL - and sk->socket == NULL. + * crossed SYN case. Passively open sockets + * are not waked up, because sk->sleep == NULL + * and sk->socket == NULL. */ - if (!sk->dead && sk->sleep) { - wake_up_interruptible(sk->sleep); + if (!sk->dead) { + sk->state_change(sk); sock_wake_async(sk->socket,0,POLL_OUT); } tp->snd_una = TCP_SKB_CB(skb)->ack_seq; - tp->snd_wnd = htons(th->window) << tp->snd_wscale; + tp->snd_wnd = ntohs(th->window) << tp->snd_wscale; tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq; /* tcp_ack considers this ACK as duplicate - * and does not calculate rtt. It is wrong. + * and does not calculate rtt. * Fix it at least with timestamps. */ if (tp->saw_tstamp && !tp->srtt) tcp_ack_saw_tstamp(sk, tp, 0, 0, FLAG_SYN_ACKED); tcp_init_metrics(sk); + tcp_fast_path_on(tp); } else { SOCK_DEBUG(sk, "bad ack\n"); return 1; @@ -3041,26 +3456,50 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, case TCP_FIN_WAIT1: if (tp->snd_una == tp->write_seq) { - sk->shutdown |= SEND_SHUTDOWN; tcp_set_state(sk, TCP_FIN_WAIT2); - if (!sk->dead) - sk->state_change(sk); - else - tcp_reset_keepalive_timer(sk, sysctl_tcp_fin_timeout); + sk->shutdown |= SEND_SHUTDOWN; dst_confirm(sk->dst_cache); + + if (!sk->dead) { + /* Wake up lingering close() */ + sk->state_change(sk); + } else { + int tmo; + + if (tp->linger2 < 0 || + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { + tcp_done(sk); + return 1; + } + + tmo = tcp_fin_time(tp); + if (tmo > TCP_TIMEWAIT_LEN) { + tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + } else if (th->fin || sk->lock.users) { + /* Bad case. We could lose such FIN otherwise. + * It is not a big problem, but it looks confusing + * and not so rare event. We still can lose it now, + * if it spins in bh_lock_sock(), but it is really + * marginal case. + */ + tcp_reset_keepalive_timer(sk, tmo); + } else { + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto discard; + } + } } break; - case TCP_CLOSING: + case TCP_CLOSING: if (tp->snd_una == tp->write_seq) { - tcp_time_wait(sk); + tcp_time_wait(sk, TCP_TIME_WAIT, 0); goto discard; } break; case TCP_LAST_ACK: if (tp->snd_una == tp->write_seq) { - tcp_set_state(sk,TCP_CLOSE); tcp_update_metrics(sk); tcp_done(sk); goto discard; @@ -3080,27 +3519,22 @@ step6: case TCP_CLOSING: if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq)) break; - case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: /* RFC 793 says to queue data in these states, * RFC 1122 says we MUST send a reset. * BSD 4.4 also does reset. 
*/ - if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) { + if (sk->shutdown & RCV_SHUTDOWN) { if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { tcp_reset(sk); return 1; } } - + /* Fall through */ case TCP_ESTABLISHED: - queued = tcp_data(skb, sk, len); - - /* This must be after tcp_data() does the skb_pull() to - * remove the header size from skb->len. - */ - tcp_measure_rcv_mss(sk, skb); + tcp_data(skb, sk, len); + queued = 1; break; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 22c35a191..7420e268f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.194 2000/01/09 02:19:41 davem Exp $ + * Version: $Id: tcp_ipv4.c,v 1.197 2000/01/21 06:37:28 davem Exp $ * * IPv4 specific functions * @@ -52,7 +52,6 @@ #include <linux/fcntl.h> #include <linux/random.h> #include <linux/init.h> -#include <linux/ipsec.h> #include <net/icmp.h> #include <net/tcp.h> @@ -61,15 +60,9 @@ #include <linux/inet.h> #include <linux/stddef.h> +#include <linux/ipsec.h> -extern int sysctl_tcp_timestamps; -extern int sysctl_tcp_window_scaling; -extern int sysctl_tcp_sack; -extern int sysctl_tcp_syncookies; -extern int sysctl_tcp_tw_recycle; extern int sysctl_ip_dynaddr; -extern __u32 sysctl_wmem_max; -extern __u32 sysctl_rmem_max; /* Check TCP sequence numbers in ICMP packets. */ #define ICMP_MIN_LENGTH 8 @@ -319,89 +312,13 @@ void tcp_put_port(struct sock *sk) local_bh_enable(); } -#ifdef CONFIG_TCP_TW_RECYCLE -/* - Very stupid pseudo-"algoritm". If the approach will be successful - (and it will!), we have to make it more reasonable. - Now it eats lots of CPU, when we are tough on ports. - - Apparently, it should be hash table indexed by daddr/dport. - - How does it work? We allow to truncate time-wait state, if: - 1. PAWS works on it. - 2. timewait bucket did not receive data for timeout: - - initially timeout := 2*RTO, so that if our ACK to first - transmitted peer's FIN is lost, we will see first retransmit. - - if we receive anything, the timout is increased exponentially - to follow normal TCP backoff pattern. - It is important that minimal RTO (HZ/5) > minimal timestamp - step (1ms). - 3. When creating new socket, we inherit sequence number - and ts_recent of time-wait bucket, increasinf them a bit. - - These two conditions guarantee, that data will not be corrupted - both by retransmitted and by delayed segments. They do not guarantee - that peer will leave LAST-ACK/CLOSING state gracefully, it will be - reset sometimes, namely, when more than two our ACKs to its FINs are lost. - This reset is harmless and even good. +/* This lock without TASK_EXCLUSIVE is good on UP and it can be very bad on SMP. + * Look, when several writers sleep and reader wakes them up, all but one + * immediately hit write lock and grab all the cpus. Exclusive sleep solves + * this, _but_ remember, it adds useless work on UP machines (wake up each + * exclusive lock release). It should be ifdefed really. 
*/ -int tcp_v4_tw_recycle(struct sock *sk, u32 daddr, u16 dport) -{ - static int tw_rover; - - struct tcp_tw_bucket *tw; - struct tcp_bind_hashbucket *head; - struct tcp_bind_bucket *tb; - - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - unsigned long now = jiffies; - int i, rover; - - rover = tw_rover; - - local_bh_disable(); - for (i=0; i<tcp_bhash_size; i++, rover++) { - rover &= (tcp_bhash_size-1); - head = &tcp_bhash[rover]; - - spin_lock(&head->lock); - for (tb = head->chain; tb; tb = tb->next) { - tw = (struct tcp_tw_bucket*)tb->owners; - - if (tw->state != TCP_TIME_WAIT || - tw->dport != dport || - tw->daddr != daddr || - tw->rcv_saddr != sk->rcv_saddr || - tb->port < low || - tb->port >= high || - !TCP_INET_FAMILY(tw->family) || - tw->ts_recent_stamp == 0 || - (long)(now - tw->ttd) <= 0) - continue; - tw_rover = rover; - goto hit; - } - spin_unlock(&head->lock); - } - local_bh_enable(); - tw_rover = rover; - return -EAGAIN; - -hit: - sk->num = tw->num; - if ((sk->bind_next = tb->owners) != NULL) - tb->owners->bind_pprev = &sk->bind_next; - tb->owners = sk; - sk->bind_pprev = &tb->owners; - sk->prev = (struct sock *) tb; - spin_unlock_bh(&head->lock); - return 0; -} -#endif - - void tcp_listen_wlock(void) { write_lock(&tcp_lhash_lock); @@ -409,9 +326,9 @@ void tcp_listen_wlock(void) if (atomic_read(&tcp_lhash_users)) { DECLARE_WAITQUEUE(wait, current); - add_wait_queue(&tcp_lhash_wait, &wait); + add_wait_queue_exclusive(&tcp_lhash_wait, &wait); for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE|TASK_EXCLUSIVE); if (atomic_read(&tcp_lhash_users) == 0) break; write_unlock_bh(&tcp_lhash_lock); @@ -445,6 +362,8 @@ static __inline__ void __tcp_v4_hash(struct sock *sk) sk->pprev = skp; sock_prot_inc_use(sk->prot); write_unlock(lock); + if (sk->state == TCP_LISTEN) + wake_up(&tcp_lhash_wait); } static void tcp_v4_hash(struct sock *sk) @@ -478,6 +397,8 @@ void tcp_unhash(struct sock *sk) sock_prot_dec_use(sk->prot); } write_unlock_bh(lock); + if (sk->state == TCP_LISTEN) + wake_up(&tcp_lhash_wait); } /* Don't inline this cruft. Here are some nice properties to @@ -546,8 +467,9 @@ sherry_cache: * * Local BH must be disabled here. */ -static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, - u32 daddr, u16 hnum, int dif) + +static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport, + u32 daddr, u16 hnum, int dif) { struct tcp_ehash_bucket *head; TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) @@ -572,7 +494,7 @@ static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, goto hit; read_unlock(&head->lock); - return tcp_v4_lookup_listener(daddr, hnum, dif); + return NULL; hit: sock_hold(sk); @@ -580,6 +502,19 @@ hit: return sk; } +static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, + u32 daddr, u16 hnum, int dif) +{ + struct sock *sk; + + sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif); + + if (sk) + return sk; + + return tcp_v4_lookup_listener(daddr, hnum, dif); +} + __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) { struct sock *sk; @@ -609,21 +544,16 @@ static int tcp_v4_check_established(struct sock *sk) int hash = tcp_hashfn(daddr, sk->num, saddr, sk->dport); struct tcp_ehash_bucket *head = &tcp_ehash[hash]; struct sock *sk2, **skp; -#ifdef CONFIG_TCP_TW_RECYCLE struct tcp_tw_bucket *tw; -#endif write_lock_bh(&head->lock); /* Check TIME-WAIT sockets first. 
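/*
 * Illustrative sketch, not part of the patch: the lookup split introduced
 * above -- search the established/TIME-WAIT table first and fall back to
 * the listener table only on a miss.  The toy chained lists stand in for
 * the real ehash/lhash buckets; all names here are invented.
 */
#include <stddef.h>
#include <stdint.h>

struct toy_sock {
    uint32_t local_addr, remote_addr;
    uint16_t local_port, remote_port;
    struct toy_sock *next;
};

/* saddr/sport are the remote end of the incoming packet, daddr/dport ours. */
static struct toy_sock *lookup_established(struct toy_sock *ehash,
                                           uint32_t saddr, uint16_t sport,
                                           uint32_t daddr, uint16_t dport)
{
    for (; ehash; ehash = ehash->next)
        if (ehash->remote_addr == saddr && ehash->remote_port == sport &&
            ehash->local_addr == daddr && ehash->local_port == dport)
            return ehash;               /* exact 4-tuple match */
    return NULL;
}

static struct toy_sock *lookup_listener(struct toy_sock *lhash,
                                        uint32_t daddr, uint16_t dport)
{
    for (; lhash; lhash = lhash->next)
        if (lhash->local_port == dport &&
            (lhash->local_addr == 0 || lhash->local_addr == daddr))
            return lhash;               /* wildcard or bound local address */
    return NULL;
}

static struct toy_sock *toy_lookup(struct toy_sock *ehash, struct toy_sock *lhash,
                                   uint32_t saddr, uint16_t sport,
                                   uint32_t daddr, uint16_t dport)
{
    struct toy_sock *sk = lookup_established(ehash, saddr, sport, daddr, dport);
    return sk ? sk : lookup_listener(lhash, daddr, dport);
}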
*/ for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL; skp = &sk2->next) { -#ifdef CONFIG_TCP_TW_RECYCLE tw = (struct tcp_tw_bucket*)sk2; -#endif if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) { -#ifdef CONFIG_TCP_TW_RECYCLE struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* With PAWS, it is safe from the viewpoint @@ -631,12 +561,17 @@ static int tcp_v4_check_established(struct sock *sk) is safe provided sequence spaces do not overlap i.e. at data rates <= 80Mbit/sec. - Actually, the idea is close to VJ's (rfc1332) - one, only timestamp cache is held not per host, + Actually, the idea is close to VJ's one, + only timestamp cache is held not per host, but per port pair and TW bucket is used as state holder. + + If TW bucket has been already destroyed we + fall back to VJ's scheme and use initial + timestamp retrieved from peer table. */ - if (sysctl_tcp_tw_recycle && tw->ts_recent_stamp) { + if (tw->substate == TCP_TIME_WAIT && + sysctl_tcp_tw_recycle && tw->ts_recent_stamp) { if ((tp->write_seq = tw->snd_nxt + 2) == 0) tp->write_seq = 1; tp->ts_recent = tw->ts_recent; @@ -645,13 +580,10 @@ static int tcp_v4_check_established(struct sock *sk) skp = &head->chain; goto unique; } else -#endif - goto not_unique; + goto not_unique; } } -#ifdef CONFIG_TCP_TW_RECYCLE tw = NULL; -#endif /* And established part... */ for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) { @@ -659,9 +591,7 @@ static int tcp_v4_check_established(struct sock *sk) goto not_unique; } -#ifdef CONFIG_TCP_TW_RECYCLE unique: -#endif BUG_TRAP(sk->pprev==NULL); if ((sk->next = *skp) != NULL) (*skp)->pprev = &sk->next; @@ -671,17 +601,17 @@ unique: sock_prot_inc_use(sk->prot); write_unlock_bh(&head->lock); -#ifdef CONFIG_TCP_TW_RECYCLE if (tw) { /* Silly. Should hash-dance instead... */ local_bh_disable(); tcp_tw_deschedule(tw); tcp_timewait_kill(tw); + NET_INC_STATS_BH(TimeWaitRecycled); local_bh_enable(); tcp_tw_put(tw); } -#endif + return 0; not_unique: @@ -727,9 +657,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) int tmp; int err; - if (sk->state != TCP_CLOSE) - return(-EISCONN); - if (addr_len < sizeof(struct sockaddr_in)) return(-EINVAL); @@ -759,8 +686,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) daddr = rt->rt_dst; err = -ENOBUFS; - buff = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header), - 0, GFP_KERNEL); + buff = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 0, GFP_KERNEL); if (buff == NULL) goto failure; @@ -769,27 +695,28 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk->saddr = rt->rt_src; sk->rcv_saddr = sk->saddr; - if (!sk->num) { - if (sk->prot->get_port(sk, 0) -#ifdef CONFIG_TCP_TW_RECYCLE - && (!sysctl_tcp_tw_recycle || - tcp_v4_tw_recycle(sk, daddr, usin->sin_port)) -#endif - ) { - kfree_skb(buff); - err = -EAGAIN; - goto failure; - } - sk->sport = htons(sk->num); - } -#ifdef CONFIG_TCP_TW_RECYCLE - else if (tp->ts_recent_stamp && sk->daddr != daddr) { + if (tp->ts_recent_stamp && sk->daddr != daddr) { /* Reset inherited state */ tp->ts_recent = 0; tp->ts_recent_stamp = 0; tp->write_seq = 0; } -#endif + + if (sysctl_tcp_tw_recycle && + !tp->ts_recent_stamp && + rt->rt_dst == daddr) { + struct inet_peer *peer = rt_get_peer(rt); + + /* VJ's idea. We save last timestamp seen from + * the destination in peer table, when entering state TIME-WAIT + * and initialize ts_recent from it, when trying new connection. 
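/*
 * Illustrative sketch, not part of the patch: the connect()-side half of
 * the per-destination timestamp cache described above.  "struct mini_peer"
 * is a stand-in for the real inet_peer entry; PAWS_MSL_SEC approximates
 * TCP_PAWS_MSL and is an assumed value.
 */
#include <stdint.h>
#include <time.h>

#define PAWS_MSL_SEC 60     /* assumed: how long a cached stamp stays trustworthy */

struct mini_peer { uint32_t tcp_ts; time_t tcp_ts_stamp; };
struct mini_conn { uint32_t ts_recent; time_t ts_recent_stamp; };

static void init_ts_from_peer(struct mini_conn *tp,
                              const struct mini_peer *peer, time_t now)
{
    if (tp->ts_recent_stamp)
        return;             /* state already inherited, keep it */
    if (peer && peer->tcp_ts_stamp &&
        peer->tcp_ts_stamp + PAWS_MSL_SEC >= now) {
        tp->ts_recent_stamp = peer->tcp_ts_stamp;
        tp->ts_recent       = peer->tcp_ts;
    }
}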
+ */ + + if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) { + tp->ts_recent_stamp = peer->tcp_ts_stamp; + tp->ts_recent = peer->tcp_ts; + } + } sk->dport = usin->sin_port; sk->daddr = daddr; @@ -814,85 +741,62 @@ failure: return err; } -static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len) +static __inline__ int tcp_v4_iif(struct sk_buff *skb) { - int retval = -EINVAL; - - lock_sock(sk); - - /* Do sanity checking for sendmsg/sendto/send. */ - if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL)) - goto out; - if (msg->msg_name) { - struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name; - - if (msg->msg_namelen < sizeof(*addr)) - goto out; - if (addr->sin_family && addr->sin_family != AF_INET) - goto out; - retval = -ENOTCONN; - if(sk->state == TCP_CLOSE) - goto out; - retval = -EISCONN; - if (addr->sin_port != sk->dport) - goto out; - if (addr->sin_addr.s_addr != sk->daddr) - goto out; - } - retval = tcp_do_sendmsg(sk, msg); - -out: - release_sock(sk); - return retval; + return ((struct rtable*)skb->dst)->rt_iif; } +static __inline__ unsigned tcp_v4_synq_hash(u32 raddr, u16 rport) +{ + unsigned h = raddr ^ rport; + h ^= h>>16; + h ^= h>>8; + return h&(TCP_SYNQ_HSIZE-1); +} -/* - * Do a linear search in the socket open_request list. - * This should be replaced with a global hash table. - */ static struct open_request *tcp_v4_search_req(struct tcp_opt *tp, struct iphdr *iph, struct tcphdr *th, - struct open_request **prevp) + struct open_request ***prevp) { - struct open_request *req, *prev; - __u16 rport = th->source; - - /* assumption: the socket is not in use. - * as we checked the user count on tcp_rcv and we're - * running from a soft interrupt. - */ - prev = (struct open_request *) (&tp->syn_wait_queue); - for (req = prev->dl_next; req; req = req->dl_next) { - if (req->af.v4_req.rmt_addr == iph->saddr && + struct tcp_listen_opt *lopt = tp->listen_opt; + struct open_request *req, **prev; + __u16 rport = th->source; + __u32 raddr = iph->saddr; + + for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport)]; + (req = *prev) != NULL; + prev = &req->dl_next) { + if (req->rmt_port == rport && + req->af.v4_req.rmt_addr == raddr && req->af.v4_req.loc_addr == iph->daddr && - req->rmt_port == rport && TCP_INET_FAMILY(req->class->family)) { - if (req->sk) { - /* Weird case: connection was established - and then killed by RST before user accepted - it. This connection is dead, but we cannot - kill openreq to avoid blocking in accept(). - - accept() will collect this garbage, - but such reqs must be ignored, when talking - to network. 
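/*
 * Illustrative sketch, not part of the patch: the open-request hash shown
 * above, as a standalone function.  TCP_SYNQ_HSIZE must be a power of two
 * for the final mask to work; 512 is an assumed value here.
 */
#include <stdint.h>

#define TCP_SYNQ_HSIZE 512

static unsigned synq_hash(uint32_t raddr, uint16_t rport)
{
    unsigned h = raddr ^ rport;

    h ^= h >> 16;           /* fold the high address bits down */
    h ^= h >> 8;
    return h & (TCP_SYNQ_HSIZE - 1);
}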
- */ - bh_lock_sock(req->sk); - BUG_TRAP(req->sk->lock.users==0); - if (req->sk->state == TCP_CLOSE) { - bh_unlock_sock(req->sk); - prev = req; - continue; - } - } + BUG_TRAP(req->sk == NULL); *prevp = prev; return req; } - prev = req; } - return NULL; + + return NULL; +} + +static void tcp_v4_synq_add(struct sock *sk, struct open_request *req) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct tcp_listen_opt *lopt = tp->listen_opt; + unsigned h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port); + + req->expires = jiffies + TCP_TIMEOUT_INIT; + req->retrans = 0; + req->sk = NULL; + req->index = h; + req->dl_next = lopt->syn_table[h]; + + write_lock(&tp->syn_wait_lock); + lopt->syn_table[h] = req; + write_unlock(&tp->syn_wait_lock); + + tcp_synq_added(sk); } @@ -984,7 +888,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) th = (struct tcphdr*)(dp+(iph->ihl<<2)); - sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex); + sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb)); if (sk == NULL) { ICMP_INC_STATS_BH(IcmpInErrors); return; @@ -1001,6 +905,9 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) if (sk->lock.users != 0) NET_INC_STATS_BH(LockDroppedIcmps); + if (sk->state == TCP_CLOSE) + goto out; + tp = &sk->tp_pinfo.af_tcp; seq = ntohl(th->seq); if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) { @@ -1010,14 +917,11 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) switch (type) { case ICMP_SOURCE_QUENCH: -#ifndef OLD_SOURCE_QUENCH /* This is deprecated */ - if (sk->lock.users == 0) { - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); - tp->snd_cwnd = tp->snd_ssthresh; - tp->snd_cwnd_cnt = 0; - tp->high_seq = tp->snd_nxt; - } -#endif + /* This is deprecated, but if someone generated it, + * we have no reasons to ignore it. + */ + if (sk->lock.users == 0) + tcp_enter_cong_avoid(tp); goto out; case ICMP_PARAMETERPROB: err = EPROTO; @@ -1042,7 +946,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) } switch (sk->state) { - struct open_request *req, *prev; + struct open_request *req, **prev; case TCP_LISTEN: if (sk->lock.users != 0) goto out; @@ -1060,47 +964,25 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) if (!req) goto out; - if (req->sk) { - struct sock *nsk = req->sk; - - /* - * Already in ESTABLISHED and a big socket is created, - * set error code there. - * The error will _not_ be reported in the accept(), - * but only with the next operation on the socket after - * accept. - */ - sock_hold(nsk); - bh_unlock_sock(sk); - sock_put(sk); - sk = nsk; - - BUG_TRAP(sk->lock.users == 0); - tp = &sk->tp_pinfo.af_tcp; - if (!between(seq, tp->snd_una, tp->snd_nxt)) { - NET_INC_STATS(OutOfWindowIcmps); - goto out; - } - } else { - if (seq != req->snt_isn) { - NET_INC_STATS(OutOfWindowIcmps); - goto out; - } + /* ICMPs are not backlogged, hence we cannot get + an established socket here. + */ + BUG_TRAP(req->sk == NULL); - /* - * Still in SYN_RECV, just remove it silently. - * There is no good way to pass the error to the newly - * created socket, and POSIX does not want network - * errors returned from accept(). - */ - tp->syn_backlog--; - tcp_synq_unlink(tp, req, prev); - tcp_dec_slow_timer(TCP_SLT_SYNACK); - req->class->destructor(req); - tcp_openreq_free(req); + if (seq != req->snt_isn) { + NET_INC_STATS_BH(OutOfWindowIcmps); goto out; } - break; + + /* + * Still in SYN_RECV, just remove it silently. 
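/*
 * Illustrative sketch, not part of the patch: the modular sequence-number
 * arithmetic behind the between(seq, snd_una, snd_nxt) test used above when
 * validating the sequence number quoted inside an ICMP error.
 */
#include <stdint.h>

/* 1 if low <= seq <= high in 32-bit wrap-around arithmetic. */
static int seq_between(uint32_t seq, uint32_t low, uint32_t high)
{
    /* Same trick as the kernel's between(): compare unsigned distances from "low". */
    return (uint32_t)(high - low) >= (uint32_t)(seq - low);
}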
+ * There is no good way to pass the error to the newly + * created socket, and POSIX does not want network + * errors returned from accept(). + */ + tcp_synq_drop(sk, req, prev); + goto out; + case TCP_SYN_SENT: case TCP_SYN_RECV: /* Cannot happen. It can f.e. if SYNs crossed. @@ -1110,10 +992,9 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) if (sk->lock.users == 0) { TCP_INC_STATS_BH(TcpAttemptFails); sk->err = err; - /* Wake people up to see the error (see connect in sock.c) */ + sk->error_report(sk); - tcp_set_state(sk, TCP_CLOSE); tcp_done(sk); } else { sk->err_soft = err; @@ -1270,28 +1151,23 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) { struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; - tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt, 0, tw->ts_recent); + tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt, + tw->rcv_wnd>>tw->rcv_wscale, tw->ts_recent); tcp_tw_put(tw); } static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req) { - tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd, req->ts_recent); + tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd, + req->ts_recent); } -/* - * Send a SYN-ACK after having received an ACK. - * This still operates on a open_request only, not on a big - * socket. - */ -static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) +static struct dst_entry* tcp_v4_route_req(struct sock *sk, struct open_request *req) { struct rtable *rt; struct ip_options *opt; - struct sk_buff * skb; - /* First, grab a route. */ opt = req->af.v4_req.opt; if(ip_route_output(&rt, ((opt && opt->srr) ? opt->faddr : @@ -1300,15 +1176,33 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute, sk->bound_dev_if)) { IP_INC_STATS_BH(IpOutNoRoutes); - return; + return NULL; } - if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { + if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { ip_rt_put(rt); IP_INC_STATS_BH(IpOutNoRoutes); - return; + return NULL; } + return &rt->u.dst; +} + +/* + * Send a SYN-ACK after having received an ACK. + * This still operates on a open_request only, not on a big + * socket. + */ +static int tcp_v4_send_synack(struct sock *sk, struct open_request *req, + struct dst_entry *dst) +{ + int err = -1; + struct sk_buff * skb; - skb = tcp_make_synack(sk, &rt->u.dst, req); + /* First, grab a route. 
*/ + if (dst == NULL && + (dst = tcp_v4_route_req(sk, req)) == NULL) + goto out; + + skb = tcp_make_synack(sk, dst, req); if (skb) { struct tcphdr *th = skb->h.th; @@ -1317,10 +1211,15 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr, csum_partial((char *)th, skb->len, skb->csum)); - ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr, - req->af.v4_req.rmt_addr, req->af.v4_req.opt); + err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr, + req->af.v4_req.rmt_addr, req->af.v4_req.opt); + if (err == NET_XMIT_CN) + err = 0; } - ip_rt_put(rt); + +out: + dst_release(dst); + return err; } /* @@ -1328,7 +1227,7 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) */ static void tcp_v4_or_free(struct open_request *req) { - if(!req->sk && req->af.v4_req.opt) + if (req->af.v4_req.opt) kfree_s(req->af.v4_req.opt, optlength(req->af.v4_req.opt)); } @@ -1372,8 +1271,14 @@ tcp_v4_save_options(struct sock *sk, struct sk_buff *skb) * It would be better to replace it with a global counter for all sockets * but then some measure against one socket starving all other sockets * would be needed. + * + * It was 128 by default. Experiments with real servers show, that + * it is absolutely not enough even at 100conn/sec. 256 cures most + * of problems. This value is adjusted to 128 for very small machines + * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb). + * Further increasing requires to change hash table size. */ -int sysctl_max_syn_backlog = 128; +int sysctl_max_syn_backlog = 256; struct or_calltable or_ipv4 = { PF_INET, @@ -1383,9 +1288,6 @@ struct or_calltable or_ipv4 = { tcp_v4_send_reset }; -#define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */ -#define BACKLOGMAX(sk) sysctl_max_syn_backlog - int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct tcp_opt tp; @@ -1394,6 +1296,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) __u32 saddr = skb->nh.iph->saddr; __u32 daddr = skb->nh.iph->daddr; __u32 isn = TCP_SKB_CB(skb)->when; + struct dst_entry *dst = NULL; #ifdef CONFIG_SYN_COOKIES int want_cookie = 0; #else @@ -1405,84 +1308,108 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) (RTCF_BROADCAST|RTCF_MULTICAST)) goto drop; - /* XXX: Check against a global syn pool counter. */ - if (BACKLOG(sk) > BACKLOGMAX(sk)) { + /* TW buckets are converted to open requests without + * limitations, they conserve resources and peer is + * evidently real one. + */ + if (tcp_synq_is_full(sk) && !isn) { #ifdef CONFIG_SYN_COOKIES - if (sysctl_tcp_syncookies && !isn) { - syn_flood_warning(skb); + if (sysctl_tcp_syncookies) { want_cookie = 1; } else #endif goto drop; - } else { - if (isn == 0) - isn = tcp_v4_init_sequence(sk, skb); - BACKLOG(sk)++; } - req = tcp_openreq_alloc(); - if (req == NULL) { - goto dropbacklog; - } + /* Accept backlog is full. If we have already queued enough + * of warm entries in syn queue, drop request. It is better than + * clogging syn queue with openreqs with exponentially increasing + * timeout. + */ + if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) + goto drop; - req->rcv_wnd = 0; /* So that tcp_send_synack() knows! 
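/*
 * Illustrative sketch, not part of the patch: the admission policy at the
 * top of tcp_v4_conn_request() above, expressed as a pure decision
 * function.  The boolean inputs are hypothetical; the real code reads them
 * from the listening socket's SYN and accept queues.
 */
enum syn_verdict { SYN_ACCEPT, SYN_COOKIE, SYN_DROP };

static enum syn_verdict admit_syn(int synq_full, int acceptq_full,
                                  int young_reqs, int from_timewait,
                                  int syncookies_enabled)
{
    /* Requests recycled from a live TIME-WAIT bucket are always let through. */
    if (synq_full && !from_timewait)
        return syncookies_enabled ? SYN_COOKIE : SYN_DROP;

    /* Accept queue full and plenty of fresh (young) requests already queued:
     * better to drop now than to pile up open requests that are doomed anyway. */
    if (acceptq_full && young_reqs > 1)
        return SYN_DROP;

    return SYN_ACCEPT;
}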
*/ + req = tcp_openreq_alloc(); + if (req == NULL) + goto drop; - req->rcv_isn = TCP_SKB_CB(skb)->seq; tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0; - tp.mss_clamp = 536; tp.user_mss = sk->tp_pinfo.af_tcp.user_mss; tcp_parse_options(NULL, th, &tp, want_cookie); - req->mss = tp.mss_clamp; - req->ts_recent = tp.saw_tstamp ? tp.rcv_tsval : 0; - req->tstamp_ok = tp.tstamp_ok; - req->sack_ok = tp.sack_ok; - req->snd_wscale = tp.snd_wscale; - req->wscale_ok = tp.wscale_ok; - req->rmt_port = th->source; + tcp_openreq_init(req, &tp, skb); + req->af.v4_req.loc_addr = daddr; req->af.v4_req.rmt_addr = saddr; + req->af.v4_req.opt = tcp_v4_save_options(sk, skb); + req->class = &or_ipv4; - /* Note that we ignore the isn passed from the TIME_WAIT - * state here. That's the price we pay for cookies. - * - * RED-PEN. The price is high... Then we cannot kill TIME-WAIT - * and should reject connection attempt, duplicates with random - * sequence number can corrupt data. Right? - * I disabled sending cookie to request matching to a timewait - * bucket. - */ - if (want_cookie) + if (want_cookie) { +#ifdef CONFIG_SYN_COOKIES + syn_flood_warning(skb); +#endif isn = cookie_v4_init_sequence(sk, skb, &req->mss); + } else if (isn == 0) { + struct inet_peer *peer = NULL; - req->snt_isn = isn; - - req->af.v4_req.opt = tcp_v4_save_options(sk, skb); + /* VJ's idea. We save last timestamp seen + * from the destination in peer table, when entering + * state TIME-WAIT, and check against it before + * accepting new connection request. + * + * If "isn" is not zero, this request hit alive + * timewait bucket, so that all the necessary checks + * are made in the function processing timewait state. + */ + if (tp.saw_tstamp && + sysctl_tcp_tw_recycle && + (dst = tcp_v4_route_req(sk, req)) != NULL && + (peer = rt_get_peer((struct rtable*)dst)) != NULL && + peer->v4daddr == saddr) { + if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL && + (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) { + NETDEBUG(printk(KERN_DEBUG "TW_REC: reject openreq %u/%u %08x/%u\n", peer->tcp_ts, req->ts_recent, saddr, ntohs(skb->h.th->source))); + NET_INC_STATS_BH(PAWSPassiveRejected); + dst_release(dst); + goto drop_and_free; + } + } + /* Kill the following clause, if you dislike this way. */ + else if (!sysctl_tcp_syncookies && + (sysctl_max_syn_backlog - tcp_synq_len(sk) + < (sysctl_max_syn_backlog>>2)) && + (!peer || !peer->tcp_ts_stamp) && + (!dst || !dst->rtt)) { + /* Without syncookies last quarter of + * backlog is filled with destinations, proven to be alive. + * It means that we continue to communicate + * to destinations, already remembered + * to the moment of synflood. 
+ */ + NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "TCP: drop open request from %08x/%u\n", saddr, ntohs(skb->h.th->source))); + TCP_INC_STATS_BH(TcpAttemptFails); + dst_release(dst); + goto drop_and_free; + } - req->class = &or_ipv4; - req->retrans = 0; - req->sk = NULL; + isn = tcp_v4_init_sequence(sk, skb); + } + req->snt_isn = isn; - tcp_v4_send_synack(sk, req); + if (tcp_v4_send_synack(sk, req, dst)) + goto drop_and_free; if (want_cookie) { - if (req->af.v4_req.opt) - kfree(req->af.v4_req.opt); - tcp_v4_or_free(req); tcp_openreq_free(req); } else { - req->expires = jiffies + TCP_TIMEOUT_INIT; - tcp_inc_slow_timer(TCP_SLT_SYNACK); - tcp_synq_queue(&sk->tp_pinfo.af_tcp, req); + tcp_v4_synq_add(sk, req); } - return 0; -dropbacklog: - if (!want_cookie) - BACKLOG(sk)--; +drop_and_free: + tcp_openreq_free(req); drop: TCP_INC_STATS_BH(TcpAttemptFails); return 0; @@ -1497,29 +1424,20 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, struct open_request *req, struct dst_entry *dst) { - struct ip_options *opt = req->af.v4_req.opt; struct tcp_opt *newtp; struct sock *newsk; - if (sk->ack_backlog > sk->max_ack_backlog) - goto exit; /* head drop */ - if (dst == NULL) { - struct rtable *rt; - - if (ip_route_output(&rt, - opt && opt->srr ? opt->faddr : req->af.v4_req.rmt_addr, - req->af.v4_req.loc_addr, sk->protinfo.af_inet.tos|RTO_CONN, 0)) - return NULL; - dst = &rt->u.dst; - } + if (tcp_acceptq_is_full(sk)) + goto exit_overflow; + + if (dst == NULL && + (dst = tcp_v4_route_req(sk, req)) == NULL) + goto exit; newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) goto exit; - sk->tp_pinfo.af_tcp.syn_backlog--; - sk->ack_backlog++; - newsk->dst_cache = dst; newtp = &(newsk->tp_pinfo.af_tcp); @@ -1527,7 +1445,8 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->saddr = req->af.v4_req.loc_addr; newsk->rcv_saddr = req->af.v4_req.loc_addr; newsk->protinfo.af_inet.opt = req->af.v4_req.opt; - newsk->protinfo.af_inet.mc_index = ((struct rtable*)skb->dst)->rt_iif; + req->af.v4_req.opt = NULL; + newsk->protinfo.af_inet.mc_index = tcp_v4_iif(skb); newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl; newtp->ext_header_len = 0; if (newsk->protinfo.af_inet.opt) @@ -1535,28 +1454,26 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, tcp_sync_mss(newsk, dst->pmtu); tcp_initialize_rcv_mss(newsk); + newtp->advmss = dst->advmss; - if (newsk->rcvbuf < (3 * (dst->advmss+40+MAX_HEADER+15))) - newsk->rcvbuf = min ((3 * (dst->advmss+40+MAX_HEADER+15)), sysctl_rmem_max); - if (newsk->sndbuf < (3 * (newtp->mss_clamp+40+MAX_HEADER+15))) - newsk->sndbuf = min ((3 * (newtp->mss_clamp+40+MAX_HEADER+15)), sysctl_wmem_max); + tcp_init_buffer_space(newsk); - bh_lock_sock(newsk); - __tcp_v4_hash(newsk); __tcp_inherit_port(sk, newsk); return newsk; +exit_overflow: + NET_INC_STATS_BH(ListenOverflows); exit: + NET_INC_STATS_BH(ListenDrops); dst_release(dst); return NULL; } - static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb) { - struct open_request *req, *prev; + struct open_request *req, **prev; struct tcphdr *th = skb->h.th; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); @@ -1565,6 +1482,25 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb) if (req) return tcp_check_req(sk, skb, req, prev); + if (tp->accept_queue) { + struct sock *nsk; + + nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr, + th->source, + skb->nh.iph->daddr, + ntohs(th->dest), + tcp_v4_iif(skb)); + + if (nsk) { + if (nsk->state != 
TCP_TIME_WAIT) { + bh_lock_sock(nsk); + return nsk; + } + tcp_tw_put((struct tcp_tw_bucket*)sk); + return NULL; + } + } + #ifdef CONFIG_SYN_COOKIES if (!th->rst && (th->syn || th->ack)) sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); @@ -1572,27 +1508,26 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb) return sk; } -static int tcp_csum_verify(struct sk_buff *skb) +static int tcp_v4_checksum_init(struct sk_buff *skb) { - switch (skb->ip_summed) { - case CHECKSUM_NONE: - skb->csum = csum_partial((char *)skb->h.th, skb->len, 0); - case CHECKSUM_HW: - if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,skb->nh.iph->daddr,skb->csum)) { - NETDEBUG(printk(KERN_DEBUG "TCPv4 bad checksum " - "from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, " - "len=%d/%d\n", - NIPQUAD(skb->nh.iph->saddr), - ntohs(skb->h.th->source), - NIPQUAD(skb->nh.iph->daddr), - ntohs(skb->h.th->dest), - skb->len, - ntohs(skb->nh.iph->tot_len))); - return 1; + if (skb->ip_summed == CHECKSUM_HW) { + if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr, + skb->nh.iph->daddr,skb->csum)) { + NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n")); + return -1; } skb->ip_summed = CHECKSUM_UNNECESSARY; - default: - /* CHECKSUM_UNNECESSARY */ + } else if (skb->ip_summed != CHECKSUM_UNNECESSARY) { + if (skb->len <= 68) { + if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr, + skb->nh.iph->daddr, + csum_partial((char *)skb->h.th, skb->len, 0))) + return -1; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else { + skb->csum = ~tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr, + skb->nh.iph->daddr,0); + } } return 0; } @@ -1614,66 +1549,35 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) goto discard; #endif /* CONFIG_FILTER */ - /* - * This doesn't check if the socket has enough room for the packet. - * Either process the packet _without_ queueing it and then free it, - * or do the check later. - */ - skb_set_owner_r(skb, sk); + IP_INC_STATS_BH(IpInDelivers); if (sk->state == TCP_ESTABLISHED) { /* Fast path */ - /* Ready to move deeper ... */ - if (tcp_csum_verify(skb)) - goto csum_err; + TCP_CHECK_TIMER(sk); if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) goto reset; + TCP_CHECK_TIMER(sk); return 0; - } + } - if (tcp_csum_verify(skb)) + if (tcp_checksum_complete(skb)) goto csum_err; if (sk->state == TCP_LISTEN) { - struct sock *nsk; - - nsk = tcp_v4_hnd_req(sk, skb); + struct sock *nsk = tcp_v4_hnd_req(sk, skb); if (!nsk) goto discard; - /* - * Queue it on the new socket if the new socket is active, - * otherwise we just shortcircuit this and continue with - * the new socket.. - */ if (nsk != sk) { - int ret; - int state = nsk->state; - - skb_orphan(skb); - - BUG_TRAP(nsk->lock.users == 0); - skb_set_owner_r(skb, nsk); - ret = tcp_rcv_state_process(nsk, skb, skb->h.th, skb->len); - - /* Wakeup parent, send SIGIO, if this packet changed - socket state from SYN-RECV. - - It still looks ugly, however it is much better - than miracleous double wakeup in syn_recv_sock() - and tcp_rcv_state_process(). 
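/*
 * Illustrative sketch, not part of the patch: the checksum strategy used by
 * tcp_v4_checksum_init() above -- hardware-checksummed or very small
 * segments are verified immediately, everything else is left to be
 * verified while copying to user space.  The threshold and enum names are
 * simplified assumptions.
 */
#include <stddef.h>

enum csum_state { CSUM_NONE, CSUM_HW, CSUM_UNNECESSARY };
enum csum_plan  { VERIFY_NOW, VERIFY_ON_COPY, ALREADY_OK };

static enum csum_plan checksum_plan(enum csum_state state, size_t len)
{
    if (state == CSUM_UNNECESSARY)
        return ALREADY_OK;
    if (state == CSUM_HW)
        return VERIFY_NOW;              /* only the pseudo-header fold is left */
    return len <= 68 ? VERIFY_NOW       /* too small to be worth deferring     */
                     : VERIFY_ON_COPY;
}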
- */ - if (state == TCP_SYN_RECV && nsk->state != state) - sk->data_ready(sk, 0); - - bh_unlock_sock(nsk); - if (ret) + if (tcp_child_process(sk, nsk, skb)) goto reset; return 0; } } - + + TCP_CHECK_TIMER(sk); if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) goto reset; + TCP_CHECK_TIMER(sk); return 0; reset: @@ -1716,6 +1620,9 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) if (len < sizeof(struct tcphdr)) goto bad_packet; + if (tcp_v4_checksum_init(skb) < 0) + goto bad_packet; + TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + len - th->doff*4); @@ -1724,7 +1631,7 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) skb->used = 0; sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source, - skb->nh.iph->daddr, ntohs(th->dest), skb->dev->ifindex); + skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb)); if (!sk) goto no_tcp_socket; @@ -1738,9 +1645,10 @@ process: bh_lock_sock(sk); ret = 0; - if (!sk->lock.users) - ret = tcp_v4_do_rcv(sk, skb); - else + if (!sk->lock.users) { + if (!tcp_prequeue(sk, skb)) + ret = tcp_v4_do_rcv(sk, skb); + } else sk_add_backlog(sk, skb); bh_unlock_sock(sk); @@ -1749,7 +1657,7 @@ process: return ret; no_tcp_socket: - if (tcp_csum_verify(skb)) { + if (tcp_checksum_complete(skb)) { bad_packet: TCP_INC_STATS_BH(TcpInErrs); } else { @@ -1766,7 +1674,7 @@ discard_and_relse: goto discard_it; do_time_wait: - if (tcp_csum_verify(skb)) { + if (tcp_checksum_complete(skb)) { TCP_INC_STATS_BH(TcpInErrs); goto discard_and_relse; } @@ -1776,7 +1684,7 @@ do_time_wait: { struct sock *sk2; - sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), skb->dev->ifindex); + sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb)); if (sk2 != NULL) { tcp_tw_deschedule((struct tcp_tw_bucket *)sk); tcp_timewait_kill((struct tcp_tw_bucket *)sk); @@ -1796,36 +1704,39 @@ do_time_wait: goto discard_it; } +/* With per-bucket locks this operation is not-atomic, so that + * this version is not worse. + */ static void __tcp_v4_rehash(struct sock *sk) { - struct tcp_ehash_bucket *oldhead = &tcp_ehash[sk->hashent]; - struct tcp_ehash_bucket *head = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))]; - struct sock **skp = &head->chain; - - write_lock_bh(&oldhead->lock); - if(sk->pprev) { - if(sk->next) - sk->next->pprev = sk->pprev; - *sk->pprev = sk->next; - sk->pprev = NULL; - } - write_unlock(&oldhead->lock); - write_lock(&head->lock); - if((sk->next = *skp) != NULL) - (*skp)->pprev = &sk->next; - *skp = sk; - sk->pprev = skp; - write_unlock_bh(&head->lock); + sk->prot->unhash(sk); + sk->prot->hash(sk); } int tcp_v4_rebuild_header(struct sock *sk) { - struct rtable *rt = (struct rtable *)__sk_dst_get(sk); + struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); __u32 new_saddr; int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT; - if(rt == NULL) - return 0; + if (rt == NULL) { + int err; + + u32 daddr = sk->daddr; + + if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) + daddr = sk->protinfo.af_inet.opt->faddr; + + err = ip_route_output(&rt, daddr, sk->saddr, + RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute, + sk->bound_dev_if); + if (err) { + sk->err_soft=-err; + sk->error_report(sk); + return -1; + } + __sk_dst_set(sk, &rt->u.dst); + } /* Force route checking if want_rewrite. * The idea is good, the implementation is disguisting. 
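/*
 * Illustrative sketch, not part of the patch: the three-way dispatch in
 * tcp_v4_rcv() above.  "user holds the socket lock" means a process is in
 * the middle of a socket call; "reader waiting" means a process sleeps in
 * recvmsg() and can drain the prequeue itself.  Names are illustrative.
 */
enum rcv_path { RCV_PROCESS_NOW, RCV_PREQUEUE, RCV_BACKLOG };

static enum rcv_path classify_segment(int user_holds_lock, int reader_waiting)
{
    if (user_holds_lock)
        return RCV_BACKLOG;      /* replayed when the lock is released   */
    if (reader_waiting)
        return RCV_PREQUEUE;     /* let the sleeping reader do the work  */
    return RCV_PROCESS_NOW;      /* full protocol processing in BH       */
}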
@@ -1855,16 +1766,6 @@ int tcp_v4_rebuild_header(struct sock *sk) dst_release(&new_rt->u.dst); } } - if (rt->u.dst.obsolete) { - int err; - err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos|RTO_CONN, rt->key.oif); - if (err) { - sk->err_soft=-err; - sk->error_report(sk); - return -1; - } - __sk_dst_set(sk, &rt->u.dst); - } return 0; @@ -1877,7 +1778,7 @@ do_rewrite: "saddr=%08X rcv_saddr=%08X\n", ntohl(sk->saddr), ntohl(sk->rcv_saddr)); - return 0; + return -1; } if (new_saddr != sk->saddr) { @@ -1895,7 +1796,7 @@ do_rewrite: * XXX really change the sockets identity after * XXX it has entered the hashes. -DaveM * - * Besides that, it does not check for connetion + * Besides that, it does not check for connection * uniqueness. Wait for troubles. */ __tcp_v4_rehash(sk); @@ -1913,6 +1814,63 @@ static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) sin->sin_port = sk->dport; } +/* VJ's idea. Save last timestamp seen from this destination + * and hold it at least for normal timewait interval to use for duplicate + * segment detection in subsequent connections, before they enter synchronized + * state. + */ + +int tcp_v4_remember_stamp(struct sock *sk) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct rtable *rt = (struct rtable*)__sk_dst_get(sk); + struct inet_peer *peer = NULL; + int release_it = 0; + + if (rt == NULL || rt->rt_dst != sk->daddr) { + peer = inet_getpeer(sk->daddr, 1); + release_it = 1; + } else { + if (rt->peer == NULL) + rt_bind_peer(rt, 1); + peer = rt->peer; + } + + if (peer) { + if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 || + (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && + peer->tcp_ts_stamp <= tp->ts_recent_stamp)) { + peer->tcp_ts_stamp = tp->ts_recent_stamp; + peer->tcp_ts = tp->ts_recent; + } + if (release_it) + inet_putpeer(peer); + return 1; + } + + return 0; +} + +int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw) +{ + struct inet_peer *peer = NULL; + + peer = inet_getpeer(tw->daddr, 1); + + if (peer) { + if ((s32)(peer->tcp_ts - tw->ts_recent) <= 0 || + (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && + peer->tcp_ts_stamp <= tw->ts_recent_stamp)) { + peer->tcp_ts_stamp = tw->ts_recent_stamp; + peer->tcp_ts = tw->ts_recent; + } + inet_putpeer(peer); + return 1; + } + + return 0; +} + struct tcp_func ipv4_specific = { ip_queue_xmit, tcp_v4_send_check, @@ -1920,6 +1878,7 @@ struct tcp_func ipv4_specific = { tcp_v4_conn_request, tcp_v4_syn_recv_sock, tcp_v4_hash_connecting, + tcp_v4_remember_stamp, sizeof(struct iphdr), ip_setsockopt, @@ -1937,6 +1896,7 @@ static int tcp_v4_init_sock(struct sock *sk) skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); + tcp_prequeue_init(tp); tp->rto = TCP_TIMEOUT_INIT; tp->mdev = TCP_TIMEOUT_INIT; @@ -1951,19 +1911,14 @@ static int tcp_v4_init_sock(struct sock *sk) /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. */ - tp->snd_cwnd_cnt = 0; tp->snd_ssthresh = 0x7fffffff; /* Infinity */ tp->snd_cwnd_clamp = ~0; tp->mss_cache = 536; sk->state = TCP_CLOSE; - sk->max_ack_backlog = SOMAXCONN; sk->write_space = tcp_write_space; - /* Init SYN queue. */ - tcp_synq_init(tp); - sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific; return 0; @@ -1981,9 +1936,10 @@ static int tcp_v4_destroy_sock(struct sock *sk) /* Cleans up our, hopefuly empty, out_of_order_queue. 
*/ __skb_queue_purge(&tp->out_of_order_queue); - /* Clean up a referenced TCP bind bucket, this only happens if a - * port is allocated for a socket, but it never fully connects. - */ + /* Clean prequeue, it must be empty really */ + __skb_queue_purge(&tp->ucopy.prequeue); + + /* Clean up a referenced TCP bind bucket. */ if(sk->prev != NULL) tcp_put_port(sk); @@ -1993,17 +1949,19 @@ static int tcp_v4_destroy_sock(struct sock *sk) /* Proc filesystem TCP sock list dumping. */ static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i) { - sprintf(tmpbuf, "%4d: %08lX:%04X %08lX:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p", + int ttd = req->expires - jiffies; + + sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p", i, - (long unsigned int)req->af.v4_req.loc_addr, + req->af.v4_req.loc_addr, ntohs(sk->sport), - (long unsigned int)req->af.v4_req.rmt_addr, + req->af.v4_req.rmt_addr, ntohs(req->rmt_port), TCP_SYN_RECV, 0,0, /* could print option size, but that is af dependent. */ 1, /* timers active (only the expire timer) */ - (unsigned long)(req->expires - jiffies), + ttd, req->retrans, sk->socket ? sk->socket->inode->i_uid : 0, 0, /* non standard timer */ @@ -2017,7 +1975,7 @@ static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i) { unsigned int dest, src; __u16 destp, srcp; - int timer_active, timer_active1, timer_active2; + int timer_active; unsigned long timer_expires; struct tcp_opt *tp = &sp->tp_pinfo.af_tcp; @@ -2025,15 +1983,16 @@ static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i) src = sp->rcv_saddr; destp = ntohs(sp->dport); srcp = ntohs(sp->sport); - timer_active1 = tp->retransmit_timer.prev != NULL; - timer_active2 = sp->timer.prev != NULL; timer_active = 0; timer_expires = (unsigned) -1; - if (timer_active1 && tp->retransmit_timer.expires < timer_expires) { + if (tp->retransmit_timer.prev != NULL && tp->retransmit_timer.expires < timer_expires) { timer_active = 1; timer_expires = tp->retransmit_timer.expires; + } else if (tp->probe_timer.prev != NULL && tp->probe_timer.expires < timer_expires) { + timer_active = 4; + timer_expires = tp->probe_timer.expires; } - if (timer_active2 && sp->timer.expires < timer_expires) { + if (sp->timer.prev != NULL && sp->timer.expires < timer_expires) { timer_active = 2; timer_expires = sp->timer.expires; } @@ -2041,38 +2000,37 @@ static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i) timer_expires = jiffies; sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld %d %p", + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld %d %p %u %u %u %u", i, src, srcp, dest, destp, sp->state, tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq, timer_active, timer_expires-jiffies, tp->retransmits, sp->socket ? sp->socket->inode->i_uid : 0, - 0, + tp->probes_out, sp->socket ? 
sp->socket->inode->i_ino : 0, - atomic_read(&sp->refcnt), sp); + atomic_read(&sp->refcnt), sp, + tp->rto, tp->ack.ato, tp->ack.quick, tp->ack.pingpong + ); } static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i) { unsigned int dest, src; __u16 destp, srcp; - int slot_dist; + int ttd = tw->ttd - jiffies; + + if (ttd < 0) + ttd = 0; dest = tw->daddr; src = tw->rcv_saddr; destp = ntohs(tw->dport); srcp = ntohs(tw->sport); - slot_dist = tw->death_slot; - if(slot_dist > tcp_tw_death_row_slot) - slot_dist = (TCP_TWKILL_SLOTS - slot_dist) + tcp_tw_death_row_slot; - else - slot_dist = tcp_tw_death_row_slot - slot_dist; - sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p", - i, src, srcp, dest, destp, TCP_TIME_WAIT, 0, 0, - 3, slot_dist * TCP_TWKILL_PERIOD, 0, 0, 0, 0, + i, src, srcp, dest, destp, tw->substate, 0, 0, + 3, ttd, 0, 0, 0, 0, atomic_read(&tw->refcnt), tw); } @@ -2093,6 +2051,8 @@ int tcp_get_info(char *buffer, char **start, off_t offset, int length) tcp_listen_lock(); for(i = 0; i < TCP_LHTABLE_SIZE; i++) { struct sock *sk = tcp_listening_hash[i]; + struct tcp_listen_opt *lopt; + int k; for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) { struct open_request *req; @@ -2112,25 +2072,30 @@ int tcp_get_info(char *buffer, char **start, off_t offset, int length) } skip_listen: - lock_sock(sk); - for (req = tp->syn_wait_queue; req; req = req->dl_next, num++) { - if (req->sk) - continue; - if (!TCP_INET_FAMILY(req->class->family)) - continue; - - pos += 128; - if (pos < offset) - continue; - get_openreq(sk, req, tmpbuf, num); - len += sprintf(buffer+len, "%-127s\n", tmpbuf); - if(len >= length) { - tcp_listen_unlock(); - release_sock(sk); - goto out_no_bh; + read_lock_bh(&tp->syn_wait_lock); + lopt = tp->listen_opt; + if (lopt && lopt->qlen != 0) { + for (k=0; k<TCP_SYNQ_HSIZE; k++) { + for (req = lopt->syn_table[k]; req; req = req->dl_next, num++) { + if (!TCP_INET_FAMILY(req->class->family)) + continue; + + pos += 128; + if (pos < offset) + continue; + get_openreq(sk, req, tmpbuf, num); + len += sprintf(buffer+len, "%-127s\n", tmpbuf); + if(len >= length) { + read_unlock_bh(&tp->syn_wait_lock); + tcp_listen_unlock(); + goto out_no_bh; + } + } } } - release_sock(sk); + read_unlock_bh(&tp->syn_wait_lock); + + /* Completed requests are in normal socket hash table */ } } tcp_listen_unlock(); @@ -2194,28 +2159,24 @@ struct proto tcp_prot = { tcp_v4_connect, /* connect */ tcp_disconnect, /* disconnect */ tcp_accept, /* accept */ - NULL, /* retransmit */ - tcp_write_wakeup, /* write_wakeup */ - tcp_read_wakeup, /* read_wakeup */ - tcp_poll, /* poll */ tcp_ioctl, /* ioctl */ tcp_v4_init_sock, /* init */ tcp_v4_destroy_sock, /* destroy */ tcp_shutdown, /* shutdown */ tcp_setsockopt, /* setsockopt */ tcp_getsockopt, /* getsockopt */ - tcp_v4_sendmsg, /* sendmsg */ + tcp_sendmsg, /* sendmsg */ tcp_recvmsg, /* recvmsg */ NULL, /* bind */ tcp_v4_do_rcv, /* backlog_rcv */ tcp_v4_hash, /* hash */ tcp_unhash, /* unhash */ tcp_v4_get_port, /* get_port */ - 128, /* max_header */ - 0, /* retransmits */ "TCP", /* name */ }; + + void __init tcp_v4_init(struct net_proto_family *ops) { int err; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e3d884dda..d6bc8a205 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). 
* - * Version: $Id: tcp_output.c,v 1.116 2000/01/13 00:19:49 davem Exp $ + * Version: $Id: tcp_output.c,v 1.119 2000/01/19 04:06:15 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -31,6 +31,7 @@ * during syn/ack processing. * David S. Miller : Output engine completely rewritten. * Andrea Arcangeli: SYNACK carry ts_recent in tsecr. + * Cacophonix Gaul : draft-minshall-nagle-01 * */ @@ -38,75 +39,65 @@ #include <linux/smp_lock.h> -extern int sysctl_tcp_timestamps; -extern int sysctl_tcp_window_scaling; -extern int sysctl_tcp_sack; - /* People can turn this off for buggy TCP's found in printers etc. */ int sysctl_tcp_retrans_collapse = 1; -/* Get rid of any delayed acks, we sent one already.. */ -static __inline__ void clear_delayed_acks(struct sock * sk) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - tp->delayed_acks = 0; - if(tcp_in_quickack_mode(tp)) - tcp_exit_quickack_mode(tp); - tcp_clear_xmit_timer(sk, TIME_DACK); -} - static __inline__ void update_send_head(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - + tp->send_head = tp->send_head->next; if (tp->send_head == (struct sk_buff *) &sk->write_queue) tp->send_head = NULL; } /* Calculate mss to advertise in SYN segment. - RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that: - - 1. It is independent of path mtu. - 2. Ideally, it is maximal possible segment size i.e. 65535-40. - 3. For IPv4 it is reasonable to calculate it from maximal MTU of - attached devices, because some buggy hosts are confused by - large MSS. - 4. We do not make 3, we advertise MSS, calculated from first - hop device mtu, but allow to raise it to ip_rt_min_advmss. - This may be overriden via information stored in routing table. - 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible, - probably even Jumbo". + * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that: + * + * 1. It is independent of path mtu. + * 2. Ideally, it is maximal possible segment size i.e. 65535-40. + * 3. For IPv4 it is reasonable to calculate it from maximal MTU of + * attached devices, because some buggy hosts are confused by + * large MSS. + * 4. We do not make 3, we advertise MSS, calculated from first + * hop device mtu, but allow to raise it to ip_rt_min_advmss. + * This may be overriden via information stored in routing table. + * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible, + * probably even Jumbo". */ static __u16 tcp_advertise_mss(struct sock *sk) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct dst_entry *dst = __sk_dst_get(sk); - int mss; + int mss = tp->advmss; - if (dst) { + if (dst && dst->advmss < mss) { mss = dst->advmss; - } else { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + tp->advmss = mss; + } - /* No dst. It is bad. Guess some reasonable value. - * Actually, this case should not be possible. - * SANITY. - */ - BUG_TRAP(dst!=NULL); + return (__u16)mss; +} - mss = tp->mss_cache; - mss += (tp->tcp_header_len - sizeof(struct tcphdr)) + - tp->ext_header_len; +static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb) +{ + /* If we had a reply for ato after last received + * packet, enter pingpong mode. + */ + if ((u32)(tp->lsndtime - tp->ack.lrcvtime) < tp->ack.ato) + tp->ack.pingpong = 1; - /* Minimal MSS to include full set of of TCP/IP options - plus 8 bytes of data. It corresponds to mtu 128. 
- */ - if (mss < 88) - mss = 88; - } + tp->lsndtime = tcp_time_stamp; +} - return (__u16)mss; +static __inline__ void tcp_event_ack_sent(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + tp->last_ack_sent = tp->rcv_nxt; + tcp_dec_quickack_mode(tp); + tp->ack.pending = 0; + tcp_clear_xmit_timer(sk, TCP_TIME_DACK); } /* This routine actually transmits TCP packets queued in by @@ -120,7 +111,7 @@ static __u16 tcp_advertise_mss(struct sock *sk) * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. */ -void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) +int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) { if(skb != NULL) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); @@ -128,6 +119,7 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) int tcp_header_size = tp->tcp_header_len; struct tcphdr *th; int sysctl_flags; + int err; #define SYSCTL_FLAG_TSTAMPS 0x1 #define SYSCTL_FLAG_WSCALE 0x2 @@ -190,11 +182,29 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) } tp->af_specific->send_check(sk, th, skb->len, skb); - clear_delayed_acks(sk); - tp->last_ack_sent = tp->rcv_nxt; + if (th->ack) + tcp_event_ack_sent(sk); + + if (skb->len != tcp_header_size) + tcp_event_data_sent(tp, skb); + TCP_INC_STATS(TcpOutSegs); - tp->af_specific->queue_xmit(skb); + + err = tp->af_specific->queue_xmit(skb); + if (err <= 0) + return err; + + tcp_enter_cong_avoid(tp); + + /* NET_XMIT_CN is special. It does not guarantee, + * that this packet is lost. It tells that device + * is about to start to drop packets or already + * drops some packets of the same priority and + * invokes us to send less aggressively. + */ + return err == NET_XMIT_CN ? 0 : err; } + return -ENOBUFS; #undef SYSCTL_FLAG_TSTAMPS #undef SYSCTL_FLAG_WSCALE #undef SYSCTL_FLAG_SACK @@ -202,32 +212,33 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) /* This is the main buffer sending routine. We queue the buffer * and decide whether to queue or transmit now. + * + * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, + * otherwise socket can stall. */ -void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue) +void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigned cur_mss) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* Advance write_seq and place onto the write_queue. */ - tp->write_seq += (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq); + tp->write_seq = TCP_SKB_CB(skb)->end_seq; __skb_queue_tail(&sk->write_queue, skb); - if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) { + if (!force_queue && tp->send_head == NULL && tcp_snd_test(tp, skb, cur_mss, 1)) { /* Send it out now. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tp->packets_out++; - tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)); - if(!tcp_timer_is_set(sk, TIME_RETRANS)) - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - } else { - /* Queue it, remembering where we must start sending. 
*/ - if (tp->send_head == NULL) - tp->send_head = skb; - if (!force_queue && tp->packets_out == 0 && !tp->pending) { - tp->pending = TIME_PROBE0; - tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); + if (tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)) == 0) { + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_minshall_update(tp, cur_mss, skb->len); + tp->packets_out++; + if(!tcp_timer_is_set(sk, TCP_TIME_RETRANS)) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + return; } } + /* Queue it, remembering where we must start sending. */ + if (tp->send_head == NULL) + tp->send_head = skb; } /* Function to create two new TCP segments. Shrinks the given segment @@ -243,13 +254,13 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) /* Get a new skb... force flag on. */ buff = sock_wmalloc(sk, - (nsize + MAX_HEADER + sk->prot->max_header), + (nsize + MAX_TCP_HEADER + 15), 1, GFP_ATOMIC); if (buff == NULL) - return -1; /* We'll just try again later. */ + return -ENOMEM; /* We'll just try again later. */ /* Reserve space for headers. */ - skb_reserve(buff, MAX_HEADER + sk->prot->max_header); + skb_reserve(buff, MAX_TCP_HEADER); /* Correct the sequence numbers. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; @@ -276,8 +287,8 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) TCP_SKB_CB(buff)->sacked = 0; /* Copy and checksum data tail into the new buffer. */ - buff->csum = csum_partial_copy(skb->data + len, skb_put(buff, nsize), - nsize, 0); + buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize), + nsize, 0); /* This takes care of the FIN sequence number too. */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; @@ -288,6 +299,11 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) /* Looks stupid, but our code really uses when of * skbs, which it never sent before. --ANK + * + * NOTE: several days after I added this, Dave repaired + * tcp_simple_retransmit() and it should not use ->when + * of never sent skbs more. I am not sure, so that + * this line remains until more careful investigation. --ANK */ TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; @@ -335,20 +351,19 @@ int tcp_sync_mss(struct sock *sk, u32 pmtu) if (mss_now > tp->mss_clamp) mss_now = tp->mss_clamp; - /* Now subtract TCP options size, not including SACKs */ - mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); - /* Now subtract optional transport overhead */ mss_now -= tp->ext_header_len; - /* It we got too small (or even negative) value, - clamp it by 8 from below. Why 8 ? - Well, it could be 1 with the same success, - but if IP accepted segment of length 1, - it would love 8 even more 8) --ANK (980731) - */ - if (mss_now < 8) - mss_now = 8; + /* Then reserve room for full set of TCP options and 8 bytes of data */ + if (mss_now < 48) + mss_now = 48; + + /* Now subtract TCP options size, not including SACKs */ + mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); + + /* Bound mss with half of window */ + if (tp->max_window && mss_now > (tp->max_window>>1)) + mss_now = max((tp->max_window>>1), 1); /* And store cached results */ tp->pmtu_cookie = pmtu; @@ -360,27 +375,30 @@ int tcp_sync_mss(struct sock *sk, u32 pmtu) /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. + * + * Returns 1, if no segments are in flight and we have queued segments, but + * cannot send anything now because of SWS or another problem. 
*/ -void tcp_write_xmit(struct sock *sk) +int tcp_write_xmit(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); unsigned int mss_now; - /* Account for SACKS, we may need to fragment due to this. - * It is just like the real MSS changing on us midstream. - * We also handle things correctly when the user adds some - * IP options mid-stream. Silly to do, but cover it. - */ - mss_now = tcp_current_mss(sk); - - /* If we are zapped, the bytes will have to remain here. - * In time closedown will empty the write queue and all + /* If we are closed, the bytes will have to remain here. + * In time closedown will finish, we empty the write queue and all * will be happy. */ - if(!sk->zapped) { + if(sk->state != TCP_CLOSE) { struct sk_buff *skb; int sent_pkts = 0; + /* Account for SACKS, we may need to fragment due to this. + * It is just like the real MSS changing on us midstream. + * We also handle things correctly when the user adds some + * IP options mid-stream. Silly to do, but cover it. + */ + mss_now = tcp_current_mss(sk); + /* Anything on the transmit queue that fits the window can * be added providing we are: * @@ -388,27 +406,36 @@ void tcp_write_xmit(struct sock *sk) * b) not exceeding our congestion window. * c) not retransmitting [Nagle] */ - while((skb = tp->send_head) && tcp_snd_test(sk, skb)) { + while((skb = tp->send_head) && + tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb))) { if (skb->len > mss_now) { if (tcp_fragment(sk, skb, mss_now)) break; } - /* Advance the send_head. This one is going out. */ - update_send_head(sk); TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) + break; + /* Advance the send_head. This one is sent out. */ + update_send_head(sk); tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_minshall_update(tp, mss_now, skb->len); tp->packets_out++; - tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); sent_pkts = 1; } /* If we sent anything, make sure the retransmit * timer is active. */ - if (sent_pkts && !tcp_timer_is_set(sk, TIME_RETRANS)) - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + if (sent_pkts) { + if (!tcp_timer_is_set(sk, TCP_TIME_RETRANS)) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + return 0; + } + + return !tp->packets_out && tp->send_head; } + return 0; } /* This function returns the amount that we can raise the @@ -471,7 +498,7 @@ u32 __tcp_select_window(struct sock *sk) * but may be worse for the performance because of rcv_mss * fluctuations. --SAW 1998/11/1 */ - unsigned int mss = tp->rcv_mss; + unsigned int mss = tp->ack.rcv_mss; int free_space; u32 window; @@ -481,11 +508,19 @@ u32 __tcp_select_window(struct sock *sk) free_space = tp->window_clamp; if (tp->window_clamp < mss) mss = tp->window_clamp; - - if ((free_space < (tcp_full_space(sk) / 2)) && + + if ((free_space < (min((int)tp->window_clamp, tcp_full_space(sk)) / 2)) && (free_space < ((int) (mss/2)))) { window = 0; - tp->pred_flags = 0; + + /* THIS IS _VERY_ GOOD PLACE to play window clamp. + * if free_space becomes suspiciously low + * verify ratio rmem_alloc/(rcv_nxt - copied_seq), + * and if we predict that when free_space will be lower mss, + * rmem_alloc will run out of rcvbuf*2, shrink window_clamp. + * It will eliminate most of prune events! Very simple, + * it is the next thing to do. --ANK + */ } else { /* Get the largest window that is a nice multiple of mss. * Window clamp already applied above. 
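Editor's aside (illustrative sketch, not part of the merged patch): the __tcp_select_window() hunk above implements receiver-side silly-window-syndrome avoidance: advertise a zero window once free receive space drops below both half of the usable buffer and half an MSS, otherwise advertise the largest whole multiple of the MSS that the clamp allows. Below is a minimal model of that policy with invented names (full_space standing in for tcp_full_space(), window_clamp for tp->window_clamp) and none of the real routine's interaction with window scaling or the previously advertised window.

```c
/* Returns the receive window, in bytes, that this simplified receiver
 * would advertise.  Pure arithmetic; no kernel types or locking. */
int demo_select_window(int free_space, int full_space, int mss, int window_clamp)
{
    /* Usable space is bounded by the smaller of buffer size and clamp. */
    int usable = full_space < window_clamp ? full_space : window_clamp;

    if (free_space > window_clamp)
        free_space = window_clamp;
    if (mss > window_clamp)
        mss = window_clamp;
    if (mss <= 0)
        return 0;                   /* degenerate clamp: nothing to offer */

    /* SWS avoidance: room shrank too far, advertise nothing and let the
     * reader drain the receive queue before reopening the window. */
    if (free_space < usable / 2 && free_space < mss / 2)
        return 0;

    /* Otherwise the largest window that is a whole number of segments,
     * so the peer keeps sending full-sized packets. */
    return (free_space / mss) * mss;
}
```

In the kernel the value chosen here is still shifted by the negotiated window scale before it goes on the wire, and the comment in the hunk hints at the next refinement (shrinking window_clamp when free space gets suspiciously low, to avoid prune events); the sketch ignores both.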
@@ -542,9 +577,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m /* Optimize, actually we could also combine next_skb->csum * to skb->csum using a single add w/carry operation too. */ - skb->csum = csum_partial_copy(next_skb->data, - skb_put(skb, next_skb_size), - next_skb_size, skb->csum); + skb->csum = csum_partial_copy_nocheck(next_skb->data, + skb_put(skb, next_skb_size), + next_skb_size, skb->csum); } /* Update sequence range on original skb. */ @@ -603,8 +638,10 @@ void tcp_simple_retransmit(struct sock *sk) if (old_next_skb != skb || skb->len > mss) resend_skb = 1; old_next_skb = skb->next; - if (resend_skb != 0) - tcp_retransmit_skb(sk, skb); + if (resend_skb != 0) { + if (tcp_retransmit_skb(sk, skb)) + break; + } } } @@ -629,9 +666,21 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); unsigned int cur_mss = tcp_current_mss(sk); +#ifdef TCP_DEBUG + /* It was possible this summer, that retransmit timer + * raced with its deletion and hit socket with packets_out==0. + * I fixed it, but preserved the check in the place, + * where the fault occured. --ANK + */ + if (skb == NULL) { + printk("tcp_retransmit_skb: bug, skb==NULL, caller=%p\n", NET_CALLER(sk)); + return -EFAULT; + } +#endif + if(skb->len > cur_mss) { if(tcp_fragment(sk, skb, cur_mss)) - return 1; /* We'll try again later. */ + return -ENOMEM; /* We'll try again later. */ /* New SKB created, account for it. */ tp->packets_out++; @@ -646,7 +695,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) tcp_retrans_try_collapse(sk, skb, cur_mss); if(tp->af_specific->rebuild_header(sk)) - return 1; /* Routing failure or similar. */ + return -EHOSTUNREACH; /* Routing failure or similar. */ /* Some Solaris stacks overoptimize and ignore the FIN on a * retransmit when old data is attached. So strip it off @@ -673,13 +722,10 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) else skb = skb_clone(skb, GFP_ATOMIC); - tcp_transmit_skb(sk, skb); - /* Update global TCP statistics and return success. */ - sk->prot->retransmits++; TCP_INC_STATS(TcpRetransSegs); - return 0; + return tcp_transmit_skb(sk, skb); } /* This gets called after a retransmit timeout, and the initially @@ -774,7 +820,11 @@ void tcp_send_fin(struct sock *sk) */ mss_now = tcp_current_mss(sk); - if((tp->send_head != NULL) && (skb->len < mss_now)) { + /* Please, find seven differences of 2.3.33 and loook + * what I broke here. 8) --ANK + */ + + if(tp->send_head != NULL) { /* tcp_write_xmit() takes care of the rest. */ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; TCP_SKB_CB(skb)->end_seq++; @@ -783,31 +833,34 @@ void tcp_send_fin(struct sock *sk) /* Special case to avoid Nagle bogosity. If this * segment is the last segment, and it was queued * due to Nagle/SWS-avoidance, send it out now. + * + * Hmm... actually it overrides also congestion + * avoidance (OK for FIN) and retransmit phase + * (not OK? Added.). 
*/ if(tp->send_head == skb && - !sk->nonagle && - skb->len < (tp->rcv_mss >> 1) && - tp->packets_out && - !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) { - update_send_head(sk); + !after(tp->write_seq, tp->snd_una + tp->snd_wnd) && + !tp->retransmits) { TCP_SKB_CB(skb)->when = tcp_time_stamp; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tp->packets_out++; - tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); - if(!tcp_timer_is_set(sk, TIME_RETRANS)) - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + if (!tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) { + update_send_head(sk); + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tp->packets_out++; + if(!tcp_timer_is_set(sk, TCP_TIME_RETRANS)) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + } else + tcp_check_probe_timer(sk, tp); } } else { /* Socket is locked, keep trying until memory is available. */ do { skb = sock_wmalloc(sk, - (MAX_HEADER + - sk->prot->max_header), + MAX_TCP_HEADER + 15, 1, GFP_KERNEL); } while (skb == NULL); /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb_reserve(skb, MAX_TCP_HEADER); skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); TCP_SKB_CB(skb)->sacked = 0; @@ -816,7 +869,8 @@ void tcp_send_fin(struct sock *sk) /* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */ TCP_SKB_CB(skb)->seq = tp->write_seq; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; - tcp_send_skb(sk, skb, 0); + tcp_send_skb(sk, skb, 0, mss_now); + __tcp_push_pending_frames(sk, tp, mss_now); } } @@ -831,19 +885,19 @@ void tcp_send_active_reset(struct sock *sk, int priority) struct sk_buff *skb; /* NOTE: No TCP options attached and we never retransmit this. */ - skb = alloc_skb(MAX_HEADER + sk->prot->max_header, priority); + skb = alloc_skb(MAX_TCP_HEADER + 15, priority); if (!skb) return; /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb_reserve(skb, MAX_TCP_HEADER); skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->urg_ptr = 0; /* Send it off. */ - TCP_SKB_CB(skb)->seq = tp->write_seq; + TCP_SKB_CB(skb)->seq = tp->snd_nxt; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; tcp_transmit_skb(sk, skb); @@ -859,13 +913,13 @@ int tcp_send_synack(struct sock *sk) struct tcp_opt* tp = &(sk->tp_pinfo.af_tcp); struct sk_buff* skb; - skb = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header), + skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; /* Reserve space for headers and prepare control bits. 
*/ - skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb_reserve(skb, MAX_TCP_HEADER); skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_SYN); TCP_SKB_CB(skb)->sacked = 0; @@ -877,8 +931,7 @@ int tcp_send_synack(struct sock *sk) __skb_queue_tail(&sk->write_queue, skb); TCP_SKB_CB(skb)->when = tcp_time_stamp; tp->packets_out++; - tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); - return 0; + return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); } /* @@ -887,16 +940,17 @@ int tcp_send_synack(struct sock *sk) struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct open_request *req) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct tcphdr *th; int tcp_header_size; struct sk_buff *skb; - skb = sock_wmalloc(sk, MAX_HEADER + sk->prot->max_header, 1, GFP_ATOMIC); + skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); if (skb == NULL) return NULL; /* Reserve space for headers. */ - skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb_reserve(skb, MAX_TCP_HEADER); skb->dst = dst_clone(dst); @@ -919,7 +973,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ __u8 rcv_wscale; /* Set this up on the first call only */ - req->window_clamp = skb->dst->window; + req->window_clamp = tp->window_clamp ? : skb->dst->window; /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(tcp_full_space(sk), dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), @@ -951,7 +1005,7 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff) struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* Reserve space for headers. */ - skb_reserve(buff, MAX_HEADER + sk->prot->max_header); + skb_reserve(buff, MAX_TCP_HEADER + 15); /* We'll fix this up when we get a response from the other end. * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. @@ -962,12 +1016,16 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff) /* If user gave his TCP_MAXSEG, record it to clamp */ if (tp->user_mss) tp->mss_clamp = tp->user_mss; + tp->max_window = 0; tcp_sync_mss(sk, dst->pmtu); + tcp_initialize_rcv_mss(sk); - tp->window_clamp = dst->window; + if (!tp->window_clamp) + tp->window_clamp = dst->window; + tp->advmss = dst->advmss; tcp_select_initial_window(tcp_full_space(sk), - dst->advmss - (tp->tcp_header_len - sizeof(struct tcphdr)), + tp->advmss - (tp->tcp_header_len - sizeof(struct tcphdr)), &tp->rcv_wnd, &tp->window_clamp, sysctl_tcp_window_scaling, @@ -982,10 +1040,12 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff) goto err_out; sk->err = 0; + sk->done = 0; tp->snd_wnd = 0; tp->snd_wl1 = 0; tp->snd_wl2 = tp->write_seq; tp->snd_una = tp->write_seq; + tp->snd_sml = tp->write_seq; tp->rcv_nxt = 0; tp->rcv_wup = 0; tp->copied_seq = 0; @@ -1006,13 +1066,14 @@ int tcp_connect(struct sock *sk, struct sk_buff *buff) /* Send it off. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; + tp->syn_stamp = TCP_SKB_CB(buff)->when; __skb_queue_tail(&sk->write_queue, buff); tp->packets_out++; tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); TCP_INC_STATS(TcpActiveOpens); /* Timer for repeating the SYN until an answer. */ - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); return 0; err_out: @@ -1025,16 +1086,14 @@ err_out: * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check() * for details. 
*/ -void tcp_send_delayed_ack(struct sock *sk, int max_timeout) +void tcp_send_delayed_ack(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; unsigned long timeout; /* Stay within the limit we were given */ - timeout = (tp->ato << 1) >> 1; - if (timeout > max_timeout) - timeout = max_timeout; - timeout += jiffies; + timeout = tp->ack.ato; + timeout += jiffies + (timeout>>2); /* Use new timeout only if there wasn't a older one earlier. */ spin_lock_bh(&sk->timer_lock); @@ -1042,18 +1101,46 @@ void tcp_send_delayed_ack(struct sock *sk, int max_timeout) sock_hold(sk); tp->delack_timer.expires = timeout; } else { + /* If delack timer was blocked or is about to expire, + * send ACK now. + */ + if (tp->ack.blocked || time_before_eq(tp->delack_timer.expires, jiffies+(tp->ack.ato>>2))) { + spin_unlock_bh(&sk->timer_lock); + + tcp_send_ack(sk); + __sock_put(sk); + return; + } + if (time_before(timeout, tp->delack_timer.expires)) tp->delack_timer.expires = timeout; } add_timer(&tp->delack_timer); spin_unlock_bh(&sk->timer_lock); + +#ifdef TCP_FORMAL_WINDOW + /* Explanation. Header prediction path does not handle + * case of zero window. If we send ACK immediately, pred_flags + * are reset when sending ACK. If rcv_nxt is advanced and + * ack is not sent, than delayed ack is scheduled. + * Hence, it is the best place to check for zero window. + */ + if (tp->pred_flags) { + if (tcp_receive_window(tp) == 0) + tp->pred_flags = 0; + } else { + if (skb_queue_len(&tp->out_of_order_queue) == 0 && + !tp->urg_data) + tcp_fast_path_on(tp); + } +#endif } /* This routine sends an ack and also updates the window. */ void tcp_send_ack(struct sock *sk) { /* If we have been reset, we may not send again. */ - if(!sk->zapped) { + if(sk->state != TCP_CLOSE) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *buff; @@ -1061,29 +1148,15 @@ void tcp_send_ack(struct sock *sk) * tcp_transmit_skb() will set the ownership to this * sock. */ - buff = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_ATOMIC); + buff = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC); if (buff == NULL) { - /* Force it to send an ack. We don't have to do this - * (ACK is unreliable) but it's much better use of - * bandwidth on slow links to send a spare ack than - * resend packets. - * - * This is the one possible way that we can delay an - * ACK and have tp->ato indicate that we are in - * quick ack mode, so clear it. It is also the only - * possible way for ato to be zero, when ACK'ing a - * SYNACK because we've taken no ATO measurement yet. - */ - if (tcp_in_quickack_mode(tp)) - tcp_exit_quickack_mode(tp); - if (!tp->ato) - tp->ato = tp->rto; - tcp_send_delayed_ack(sk, HZ/2); + tp->ack.pending = 1; + tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); return; } /* Reserve space for headers and prepare control bits. */ - skb_reserve(buff, MAX_HEADER + sk->prot->max_header); + skb_reserve(buff, MAX_TCP_HEADER); buff->csum = 0; TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(buff)->sacked = 0; @@ -1099,24 +1172,20 @@ void tcp_send_ack(struct sock *sk) /* This routine sends a packet with an out of date sequence * number. It assumes the other end will try to ack it. */ -void tcp_write_wakeup(struct sock *sk) +int tcp_write_wakeup(struct sock *sk) { - /* After a valid reset we can send no more. */ - if (!sk->zapped) { + if (sk->state != TCP_CLOSE) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; - /* Write data can still be transmitted/retransmitted in the - * following states. 
If any other state is encountered, return. - * [listen/close will never occur here anyway] + /* Now this function is never called, while + * we have something not ACKed in queue. */ - if ((1 << sk->state) & - ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1| - TCPF_FIN_WAIT2|TCPF_LAST_ACK|TCPF_CLOSING)) - return; + BUG_TRAP(tp->snd_una == tp->snd_nxt); - if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) && - ((skb = tp->send_head) != NULL)) { + if (tp->snd_wnd > (tp->snd_nxt-tp->snd_una) + && ((skb = tp->send_head) != NULL)) { + int err; unsigned long win_size; /* We are probing the opening of a window @@ -1126,24 +1195,26 @@ void tcp_write_wakeup(struct sock *sk) win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una); if (win_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) { if (tcp_fragment(sk, skb, win_size)) - return; /* Let a retransmit get it. */ + return -1; } - update_send_head(sk); TCP_SKB_CB(skb)->when = tcp_time_stamp; - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tp->packets_out++; - tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); - if (!tcp_timer_is_set(sk, TIME_RETRANS)) - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + if (!err) { + update_send_head(sk); + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tp->packets_out++; + if (!tcp_timer_is_set(sk, TCP_TIME_RETRANS)) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + } + return err; } else { /* We don't queue it, tcp_transmit_skb() sets ownership. */ - skb = alloc_skb(MAX_HEADER + sk->prot->max_header, - GFP_ATOMIC); + skb = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC); if (skb == NULL) - return; + return -1; /* Reserve space for headers and set control bits. */ - skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb_reserve(skb, MAX_TCP_HEADER); skb->csum = 0; TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(skb)->sacked = 0; @@ -1152,13 +1223,18 @@ void tcp_write_wakeup(struct sock *sk) /* Use a previous sequence. This should cause the other * end to send an ack. Don't queue or clone SKB, just * send it. + * + * RED-PEN: logically it should be snd_una-1. + * snd_nxt-1 will not be acked. snd_una==snd_nxt + * in this place however. Right? */ - TCP_SKB_CB(skb)->seq = tp->snd_nxt - 1; + TCP_SKB_CB(skb)->seq = tp->snd_una - 1; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_transmit_skb(sk, skb); + return tcp_transmit_skb(sk, skb); } } + return -1; } /* A window probe timeout has occurred. If window is not closed send @@ -1167,11 +1243,32 @@ void tcp_write_wakeup(struct sock *sk) void tcp_send_probe0(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int err; + + err = tcp_write_wakeup(sk); + + if (tp->packets_out || !tp->send_head) { + /* Cancel probe timer, if it is not required. */ + tp->probes_out = 0; + tp->backoff = 0; + return; + } - tcp_write_wakeup(sk); - tp->pending = TIME_PROBE0; - tp->backoff++; - tp->probes_out++; - tcp_reset_xmit_timer (sk, TIME_PROBE0, - min(tp->rto << tp->backoff, 120*HZ)); + if (err <= 0) { + tp->backoff++; + tp->probes_out++; + tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, + min(tp->rto << tp->backoff, TCP_RTO_MAX)); + } else { + /* If packet was not sent due to local congestion, + * do not backoff and do not remember probes_out. + * Let local senders to fight for local resources. + * + * Use accumulated backoff yet. 
+ */ + if (!tp->probes_out) + tp->probes_out=1; + tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, + min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL)); + } } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a38724e42..bff4e872f 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_timer.c,v 1.68 1999/09/07 02:31:43 davem Exp $ + * Version: $Id: tcp_timer.c,v 1.71 2000/01/18 08:24:19 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -23,29 +23,20 @@ #include <net/tcp.h> int sysctl_tcp_syn_retries = TCP_SYN_RETRIES; +int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; int sysctl_tcp_retries1 = TCP_RETR1; int sysctl_tcp_retries2 = TCP_RETR2; +int sysctl_tcp_orphan_retries = TCP_ORPHAN_RETRIES; - -static void tcp_sltimer_handler(unsigned long); -static void tcp_syn_recv_timer(unsigned long); +static void tcp_retransmit_timer(unsigned long); +static void tcp_delack_timer(unsigned long); +static void tcp_probe_timer(unsigned long); +static void tcp_keepalive_timer (unsigned long data); static void tcp_twkill(unsigned long); -struct timer_list tcp_slow_timer = { - NULL, NULL, - 0, 0, - tcp_sltimer_handler, -}; - - -struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = { - {ATOMIC_INIT(0), TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer},/* SYNACK */ - {ATOMIC_INIT(0), TCP_TWKILL_PERIOD, 0, tcp_twkill} /* TWKILL */ -}; - const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n"; /* @@ -56,17 +47,25 @@ const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n"; void tcp_init_xmit_timers(struct sock *sk) { - init_timer(&sk->tp_pinfo.af_tcp.retransmit_timer); - sk->tp_pinfo.af_tcp.retransmit_timer.function=&tcp_retransmit_timer; - sk->tp_pinfo.af_tcp.retransmit_timer.data = (unsigned long) sk; - - init_timer(&sk->tp_pinfo.af_tcp.delack_timer); - sk->tp_pinfo.af_tcp.delack_timer.function=&tcp_delack_timer; - sk->tp_pinfo.af_tcp.delack_timer.data = (unsigned long) sk; - - init_timer(&sk->tp_pinfo.af_tcp.probe_timer); - sk->tp_pinfo.af_tcp.probe_timer.function=&tcp_probe_timer; - sk->tp_pinfo.af_tcp.probe_timer.data = (unsigned long) sk; + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + spin_lock_init(&sk->timer_lock); + + init_timer(&tp->retransmit_timer); + tp->retransmit_timer.function=&tcp_retransmit_timer; + tp->retransmit_timer.data = (unsigned long) sk; + + init_timer(&tp->delack_timer); + tp->delack_timer.function=&tcp_delack_timer; + tp->delack_timer.data = (unsigned long) sk; + + init_timer(&tp->probe_timer); + tp->probe_timer.function=&tcp_probe_timer; + tp->probe_timer.data = (unsigned long) sk; + + init_timer(&sk->timer); + sk->timer.function=&tcp_keepalive_timer; + sk->timer.data = (unsigned long) sk; } /* @@ -79,7 +78,7 @@ void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when) spin_lock_bh(&sk->timer_lock); switch (what) { - case TIME_RETRANS: + case TCP_TIME_RETRANS: /* When seting the transmit timer the probe timer * should not be set. 
* The delayed ack timer can be set if we are changing the @@ -89,29 +88,25 @@ void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when) __sock_put(sk); if (!tp->retransmit_timer.prev || !del_timer(&tp->retransmit_timer)) sock_hold(sk); - if (when > 120*HZ) { + if (when > TCP_RTO_MAX) { printk(KERN_DEBUG "reset_xmit_timer sk=%p when=0x%lx, caller=%p\n", sk, when, NET_CALLER(sk)); - when = 120*HZ; + when = TCP_RTO_MAX; } mod_timer(&tp->retransmit_timer, jiffies+when); break; - case TIME_DACK: + case TCP_TIME_DACK: if (!tp->delack_timer.prev || !del_timer(&tp->delack_timer)) sock_hold(sk); mod_timer(&tp->delack_timer, jiffies+when); break; - case TIME_PROBE0: + case TCP_TIME_PROBE0: if (!tp->probe_timer.prev || !del_timer(&tp->probe_timer)) sock_hold(sk); mod_timer(&tp->probe_timer, jiffies+when); break; - case TIME_WRITE: - printk(KERN_DEBUG "bug: tcp_reset_xmit_timer TIME_WRITE\n"); - break; - default: printk(KERN_DEBUG "bug: unknown timer value\n"); }; @@ -127,6 +122,7 @@ void tcp_clear_xmit_timers(struct sock *sk) __sock_put(sk); if(tp->delack_timer.prev && del_timer(&tp->delack_timer)) __sock_put(sk); + tp->ack.blocked = 0; if(tp->probe_timer.prev && del_timer(&tp->probe_timer)) __sock_put(sk); if(sk->timer.prev && del_timer(&sk->timer)) @@ -134,39 +130,33 @@ void tcp_clear_xmit_timers(struct sock *sk) spin_unlock_bh(&sk->timer_lock); } -static void tcp_write_err(struct sock *sk, int force) +static void tcp_write_err(struct sock *sk) { - sk->err = sk->err_soft ? sk->err_soft : ETIMEDOUT; + sk->err = sk->err_soft ? : ETIMEDOUT; sk->error_report(sk); - tcp_clear_xmit_timers(sk); - - /* Do not time wait the socket. It is timed out and, hence, - * idle for 120*HZ. "force" argument is ignored, delete - * it eventually. - */ - - /* Clean up time. */ - tcp_set_state(sk, TCP_CLOSE); tcp_done(sk); } /* A write timeout has occurred. Process the after effects. */ -static void tcp_write_timeout(struct sock *sk) +static int tcp_write_timeout(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int retry_until; - /* Look for a 'soft' timeout. */ - if ((sk->state == TCP_ESTABLISHED && - tp->retransmits && (tp->retransmits % TCP_QUICK_TRIES) == 0) || - (sk->state != TCP_ESTABLISHED && tp->retransmits > sysctl_tcp_retries1)) { - /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black - hole detection. :-( + if ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) { + if (tp->retransmits) + dst_negative_advice(&sk->dst_cache); + retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries; + } else { + if (tp->retransmits >= sysctl_tcp_retries1) { + /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black + hole detection. :-( - It is place to make it. It is not made. I do not want - to make it. It is disguisting. It does not work in any - case. Let me to cite the same draft, which requires for - us to implement this: + It is place to make it. It is not made. I do not want + to make it. It is disguisting. It does not work in any + case. Let me to cite the same draft, which requires for + us to implement this: "The one security concern raised by this memo is that ICMP black holes are often caused by over-zealous security administrators who block @@ -177,57 +167,70 @@ static void tcp_write_timeout(struct sock *sk) be far nicer to have all of the black holes fixed rather than fixing all of the TCP implementations." - Golden words :-). - */ + Golden words :-). 
+ */ - dst_negative_advice(&sk->dst_cache); + dst_negative_advice(&sk->dst_cache); + } + retry_until = sysctl_tcp_retries2; + if (sk->dead) + retry_until = sysctl_tcp_orphan_retries; } - - /* Have we tried to SYN too many times (repent repent 8)) */ - if (sk->state == TCP_SYN_SENT && - ((!tp->syn_retries && tp->retransmits > sysctl_tcp_syn_retries) || - (tp->syn_retries && tp->retransmits > tp->syn_retries))) { - tcp_write_err(sk, 1); - /* Don't FIN, we got nothing back */ - } else if (tp->retransmits > sysctl_tcp_retries2) { + + if (tp->retransmits >= retry_until) { /* Has it gone just too far? */ - tcp_write_err(sk, 0); + tcp_write_err(sk); + return 1; } + return 0; } -void tcp_delack_timer(unsigned long data) +static void tcp_delack_timer(unsigned long data) { struct sock *sk = (struct sock*)data; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); bh_lock_sock(sk); if (sk->lock.users) { /* Try again later. */ - tcp_reset_xmit_timer(sk, TIME_DACK, HZ/5); + tp->ack.blocked = 1; + NET_INC_STATS_BH(DelayedACKLocked); + tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MIN); goto out_unlock; } - if(!sk->zapped && - sk->tp_pinfo.af_tcp.delayed_acks && - sk->state != TCP_CLOSE) + if (tp->ack.pending) { + /* Delayed ACK missed: inflate ATO, leave pingpong mode */ + tp->ack.ato = min(tp->ack.ato<<1, TCP_ATO_MAX); + tp->ack.pingpong = 0; tcp_send_ack(sk); + NET_INC_STATS_BH(DelayedACKs); + } + TCP_CHECK_TIMER(sk); out_unlock: bh_unlock_sock(sk); sock_put(sk); } -void tcp_probe_timer(unsigned long data) +static void tcp_probe_timer(unsigned long data) { struct sock *sk = (struct sock*)data; struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - - if(sk->zapped) - goto out; + int max_probes; bh_lock_sock(sk); if (sk->lock.users) { /* Try again later. */ - tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ/5); + tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, HZ/5); + goto out_unlock; + } + + if (sk->state == TCP_CLOSE) + goto out_unlock; + + if (tp->packets_out || !tp->send_head) { + tp->probes_out = 0; goto out_unlock; } @@ -246,151 +249,251 @@ void tcp_probe_timer(unsigned long data) * with RFCs, only probe timer combines both retransmission timeout * and probe timeout in one bottle. --ANK */ - if (tp->probes_out > sysctl_tcp_retries2) { - tcp_write_err(sk, 0); + max_probes = sk->dead ? sysctl_tcp_orphan_retries : sysctl_tcp_retries2; + + if (tp->probes_out > max_probes) { + tcp_write_err(sk); } else { /* Only send another probe if we didn't close things up. */ tcp_send_probe0(sk); + TCP_CHECK_TIMER(sk); } out_unlock: bh_unlock_sock(sk); -out: sock_put(sk); } /* Kill off TIME_WAIT sockets once their lifetime has expired. */ -int tcp_tw_death_row_slot = 0; -static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS] = - { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }; -static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED; +static int tcp_tw_death_row_slot = 0; +int tcp_tw_count = 0; +static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS]; +static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED; +static struct timer_list tcp_tw_timer = { function: tcp_twkill }; static void tcp_twkill(unsigned long data) { struct tcp_tw_bucket *tw; int killed = 0; - /* The death-row tw chains are only ever touched - * in BH context so no BH disabling (for now) is needed. + /* NOTE: compare this to previous version where lock + * was released after detaching chain. 
It was racy, + * because tw buckets are scheduled in not serialized context + * in 2.3 (with netfilter), and with softnet it is common, because + * soft irqs are not sequenced. */ spin_lock(&tw_death_lock); - tw = tcp_tw_death_row[tcp_tw_death_row_slot]; - tcp_tw_death_row[tcp_tw_death_row_slot] = NULL; - tcp_tw_death_row_slot = - ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); - spin_unlock(&tw_death_lock); - while(tw != NULL) { - struct tcp_tw_bucket *next = tw->next_death; + if (tcp_tw_count == 0) + goto out; + + while((tw = tcp_tw_death_row[tcp_tw_death_row_slot]) != NULL) { + tcp_tw_death_row[tcp_tw_death_row_slot] = tw->next_death; + tw->pprev_death = NULL; + spin_unlock(&tw_death_lock); tcp_timewait_kill(tw); tcp_tw_put(tw); + killed++; - tw = next; - } - if(killed != 0) { - struct tcp_sl_timer *slt = (struct tcp_sl_timer *)data; - atomic_sub(killed, &slt->count); + + spin_lock(&tw_death_lock); } + tcp_tw_death_row_slot = + ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); + + if ((tcp_tw_count -= killed) != 0) + mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); + net_statistics[smp_processor_id()*2].TimeWaited += killed; +out: + spin_unlock(&tw_death_lock); } /* These are always called from BH context. See callers in * tcp_input.c to verify this. */ -void tcp_tw_schedule(struct tcp_tw_bucket *tw) -{ - struct tcp_tw_bucket **tpp; - int slot; +/* This is for handling early-kills of TIME_WAIT sockets. */ +void tcp_tw_deschedule(struct tcp_tw_bucket *tw) +{ spin_lock(&tw_death_lock); - slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1); - tpp = &tcp_tw_death_row[slot]; - if((tw->next_death = *tpp) != NULL) - (*tpp)->pprev_death = &tw->next_death; - *tpp = tw; - tw->pprev_death = tpp; - - tw->death_slot = slot; - atomic_inc(&tw->refcnt); + if (tw->pprev_death) { + if(tw->next_death) + tw->next_death->pprev_death = tw->pprev_death; + *tw->pprev_death = tw->next_death; + tw->pprev_death = NULL; + tcp_tw_put(tw); + if (--tcp_tw_count == 0) + del_timer(&tcp_tw_timer); + } spin_unlock(&tw_death_lock); - - tcp_inc_slow_timer(TCP_SLT_TWKILL); } -/* Happens rarely if at all, no care about scalability here. */ -void tcp_tw_reschedule(struct tcp_tw_bucket *tw) +/* Short-time timewait calendar */ + +static int tcp_twcal_hand = -1; +static int tcp_twcal_jiffie; +static void tcp_twcal_tick(unsigned long); +static struct timer_list tcp_twcal_timer = {NULL, NULL, 0, 0, tcp_twcal_tick,}; +static struct tcp_tw_bucket *tcp_twcal_row[TCP_TW_RECYCLE_SLOTS]; + +void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo) { struct tcp_tw_bucket **tpp; int slot; + /* timeout := RTO * 3.5 + * + * 3.5 = 1+2+0.5 to wait for two retransmits. + * + * RATIONALE: if FIN arrived and we entered TIME-WAIT state, + * our ACK acking that FIN can be lost. If N subsequent retransmitted + * FINs (or previous seqments) are lost (probability of such event + * is p^(N+1), where p is probability to lose single packet and + * time to detect the loss is about RTO*(2^N - 1) with exponential + * backoff). Normal timewait length is calculated so, that we + * waited at least for one retransmitted FIN (maximal RTO is 120sec). + * [ BTW Linux. following BSD, violates this requirement waiting + * only for 60sec, we should wait at least for 240 secs. + * Well, 240 consumes too much of resources 8) + * ] + * This interval is not reduced to catch old duplicate and + * responces to our wandering segments living for two MSLs. 
+ * However, if we use PAWS to detect + * old duplicates, we can reduce the interval to bounds required + * by RTO, rather than MSL. So, if peer understands PAWS, we + * kill tw bucket after 3.5*RTO (it is important that this number + * is greater than TS tick!) and detect old duplicates with help + * of PAWS. + */ + slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK; + spin_lock(&tw_death_lock); + + /* Unlink it, if it was scheduled */ if (tw->pprev_death) { if(tw->next_death) tw->next_death->pprev_death = tw->pprev_death; *tw->pprev_death = tw->next_death; tw->pprev_death = NULL; + tcp_tw_count--; } else atomic_inc(&tw->refcnt); - slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1); - tpp = &tcp_tw_death_row[slot]; + if (slot >= TCP_TW_RECYCLE_SLOTS) { + /* Schedule to slow timer */ + if (timeo >= TCP_TIMEWAIT_LEN) { + slot = TCP_TWKILL_SLOTS-1; + } else { + slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD; + if (slot >= TCP_TWKILL_SLOTS) + slot = TCP_TWKILL_SLOTS-1; + } + tw->ttd = jiffies + timeo; + slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1); + tpp = &tcp_tw_death_row[slot]; + } else { + tw->ttd = jiffies + (slot<<TCP_TW_RECYCLE_TICK); + + if (tcp_twcal_hand < 0) { + tcp_twcal_hand = 0; + tcp_twcal_jiffie = jiffies; + tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK); + add_timer(&tcp_twcal_timer); + } else { + if ((long)(tcp_twcal_timer.expires - jiffies) > (slot<<TCP_TW_RECYCLE_TICK)) + mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK)); + slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1); + } + tpp = &tcp_twcal_row[slot]; + } + if((tw->next_death = *tpp) != NULL) (*tpp)->pprev_death = &tw->next_death; *tpp = tw; tw->pprev_death = tpp; - tw->death_slot = slot; + if (tcp_tw_count++ == 0) + mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); spin_unlock(&tw_death_lock); - - /* Timer was incremented when we first entered the table. */ } -/* This is for handling early-kills of TIME_WAIT sockets. */ -void tcp_tw_deschedule(struct tcp_tw_bucket *tw) +void tcp_twcal_tick(unsigned long dummy) { + int n, slot; + unsigned long j; + unsigned long now = jiffies; + int killed = 0; + int adv = 0; + spin_lock(&tw_death_lock); - if (tw->pprev_death) { - if(tw->next_death) - tw->next_death->pprev_death = tw->pprev_death; - *tw->pprev_death = tw->next_death; - tw->pprev_death = NULL; - tcp_tw_put(tw); + if (tcp_twcal_hand < 0) + goto out; + + slot = tcp_twcal_hand; + j = tcp_twcal_jiffie; + + for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) { + if ((long)(j - now) <= 0) { + struct tcp_tw_bucket *tw; + + while((tw = tcp_twcal_row[slot]) != NULL) { + tcp_twcal_row[slot] = tw->next_death; + tw->pprev_death = NULL; + + tcp_timewait_kill(tw); + tcp_tw_put(tw); + killed++; + } + } else { + if (!adv) { + adv = 1; + tcp_twcal_jiffie = j; + tcp_twcal_hand = slot; + } + + if (tcp_twcal_row[slot] != NULL) { + mod_timer(&tcp_twcal_timer, j); + goto out; + } + } + j += (1<<TCP_TW_RECYCLE_TICK); + slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1); } - spin_unlock(&tw_death_lock); + tcp_twcal_hand = -1; - tcp_dec_slow_timer(TCP_SLT_TWKILL); +out: + if ((tcp_tw_count -= killed) == 0) + del_timer(&tcp_tw_timer); + net_statistics[smp_processor_id()*2].TimeWaitKilled += killed; + spin_unlock(&tw_death_lock); } /* * The TCP retransmit timer. - * - * 1. An initial rtt timeout on the probe0 should cause what we can - * of the first write queue buffer to be split and sent. - * 2. 
On a 'major timeout' as defined by RFC1122 we do not report - * ETIMEDOUT if we know an additional 'soft' error caused this. - * tcp_err saves a 'soft error' for us. */ -void tcp_retransmit_timer(unsigned long data) +static void tcp_retransmit_timer(unsigned long data) { struct sock *sk = (struct sock*)data; struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - /* We are reset. We will send no more retransmits. */ - if(sk->zapped) - goto out; - bh_lock_sock(sk); if (sk->lock.users) { /* Try again later */ - tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ/20); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, HZ/20); goto out_unlock; } - /* Clear delay ack timer. */ - tcp_clear_xmit_timer(sk, TIME_DACK); + if (sk->state == TCP_CLOSE || tp->packets_out == 0) + goto out_unlock; + + BUG_TRAP(!skb_queue_empty(&sk->write_queue)); + + if (tcp_write_timeout(sk)) + goto out_unlock; /* RFC 2018, clear all 'sacked' flags in retransmission queue, * the sender may have dropped out of order frames and we must @@ -426,11 +529,19 @@ void tcp_retransmit_timer(unsigned long data) tp->snd_cwnd = 1; } - tp->retransmits++; - tp->dup_acks = 0; tp->high_seq = tp->snd_nxt; - tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); + if (tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)) > 0) { + /* Retransmission failed because of local congestion, + * do not backoff. + */ + if (!tp->retransmits) + tp->retransmits=1; + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, + min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL)); + TCP_CHECK_TIMER(sk); + goto out_unlock; + } /* Increase the timeout each time we retransmit. Note that * we do not increase the rtt estimate. rto is initialized @@ -448,132 +559,105 @@ void tcp_retransmit_timer(unsigned long data) * the 120 second clamps though! */ tp->backoff++; - tp->rto = min(tp->rto << 1, 120*HZ); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - - tcp_write_timeout(sk); + tp->retransmits++; + tp->rto = min(tp->rto << 1, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + TCP_CHECK_TIMER(sk); out_unlock: bh_unlock_sock(sk); -out: sock_put(sk); } /* - * Slow timer for SYN-RECV sockets + * Timer for listening sockets */ -static void tcp_do_syn_queue(struct sock *sk, struct tcp_opt *tp, unsigned long now) -{ - struct open_request *prev, *req; - - prev = (struct open_request *) &tp->syn_wait_queue; - for(req = tp->syn_wait_queue; req; ) { - struct open_request *next = req->dl_next; - - if (!req->sk && (long)(now - req->expires) >= 0) { - tcp_synq_unlink(tp, req, prev); - if(req->retrans >= sysctl_tcp_retries1) { - (*req->class->destructor)(req); - tcp_dec_slow_timer(TCP_SLT_SYNACK); - tp->syn_backlog--; - tcp_openreq_free(req); - if (! tp->syn_wait_queue) - break; - } else { - unsigned long timeo; - struct open_request *rp; - - (*req->class->rtx_syn_ack)(sk, req); - req->retrans++; - timeo = min((TCP_TIMEOUT_INIT << req->retrans), - (120 * HZ)); - req->expires = now + timeo; - rp = prev->dl_next; - tcp_synq_queue(tp, req); - if(rp != prev->dl_next) - prev = prev->dl_next; - } - } else - prev = req; - req = next; - } -} - -/* This now scales very nicely. -DaveM */ -static void tcp_syn_recv_timer(unsigned long data) +static void tcp_synack_timer(struct sock *sk) { - struct sock *sk; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct tcp_listen_opt *lopt = tp->listen_opt; + int max_retries = tp->syn_retries ? 
: sysctl_tcp_synack_retries; + int thresh = max_retries; unsigned long now = jiffies; - int i; - - read_lock(&tcp_lhash_lock); - for(i = 0; i < TCP_LHTABLE_SIZE; i++) { - sk = tcp_listening_hash[i]; - while(sk) { - struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - - /* TCP_LISTEN is implied. */ - bh_lock_sock(sk); - if (!sk->lock.users && tp->syn_wait_queue) - tcp_do_syn_queue(sk, tp, now); - bh_unlock_sock(sk); - sk = sk->next; + struct open_request **reqp, *req; + int i, budget; + + if (lopt == NULL || lopt->qlen == 0) + return; + + /* Normally all the openreqs are young and become mature + * (i.e. converted to established socket) for first timeout. + * If synack was not acknowledged for 3 seconds, it means + * one of the following things: synack was lost, ack was lost, + * rtt is high or nobody planned to ack (i.e. synflood). + * When server is a bit loaded, queue is populated with old + * open requests, reducing effective size of queue. + * When server is well loaded, queue size reduces to zero + * after several minutes of work. It is not synflood, + * it is normal operation. The solution is pruning + * too old entries overriding normal timeout, when + * situation becomes dangerous. + * + * Essentially, we reserve half of room for young + * embrions; and abort old ones without pity, if old + * ones are about to clog our table. + */ + if (lopt->qlen>>(lopt->max_qlen_log-1)) { + int young = (lopt->qlen_young<<1); + + while (thresh > 2) { + if (lopt->qlen < young) + break; + thresh--; + young <<= 1; } } - read_unlock(&tcp_lhash_lock); -} - -void tcp_sltimer_handler(unsigned long data) -{ - struct tcp_sl_timer *slt = tcp_slt_array; - unsigned long next = ~0UL; - unsigned long now = jiffies; - int i; - for (i=0; i < TCP_SLT_MAX; i++, slt++) { - if (atomic_read(&slt->count)) { - long trigger; - - trigger = slt->period - ((long)(now - slt->last)); - - if (trigger <= 0) { - (*slt->handler)((unsigned long) slt); - slt->last = now; - trigger = slt->period; + if (tp->defer_accept) + max_retries = tp->defer_accept; + + budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL)); + i = lopt->clock_hand; + + do { + reqp=&lopt->syn_table[i]; + while ((req = *reqp) != NULL) { + if ((long)(now - req->expires) >= 0) { + if ((req->retrans < thresh || + (req->acked && req->retrans < max_retries)) + && !req->class->rtx_syn_ack(sk, req, NULL)) { + unsigned long timeo; + + if (req->retrans++ == 0) + lopt->qlen_young--; + timeo = min((TCP_TIMEOUT_INIT << req->retrans), + TCP_RTO_MAX); + req->expires = now + timeo; + reqp = &req->dl_next; + continue; + } + + /* Drop this request */ + write_lock(&tp->syn_wait_lock); + *reqp = req->dl_next; + write_unlock(&tp->syn_wait_lock); + lopt->qlen--; + if (req->retrans == 0) + lopt->qlen_young--; + tcp_openreq_free(req); } - - /* Only reschedule if some events remain. */ - if (atomic_read(&slt->count)) - next = min(next, trigger); + reqp = &req->dl_next; } - } - if (next != ~0UL) - mod_timer(&tcp_slow_timer, (now + next)); -} -/* __tcp_inc_slow_timer is called when an slow timer is started - * first time (slt->count was 0). There is race condition between - * timer creation and deletion and if we do not force adding timer here, - * we might lose timer. We could avoid it with global spinlock, but - * it is apparently overkill, so that we restart timer ALWAYS when - * this function is entered, it guarantees that timer will not lost. 
- */ + i = (i+1)&(TCP_SYNQ_HSIZE-1); -void __tcp_inc_slow_timer(struct tcp_sl_timer *slt) -{ - unsigned long now = jiffies; - unsigned long when; + } while (--budget > 0); - slt->last = now; + lopt->clock_hand = i; - when = now + slt->period; - - if (tcp_slow_timer.prev && - (long)(tcp_slow_timer.expires - when) < 0) - when = tcp_slow_timer.expires; - - mod_timer(&tcp_slow_timer, when); + if (lopt->qlen) + tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL); } void tcp_delete_keepalive_timer (struct sock *sk) @@ -595,6 +679,9 @@ void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len) void tcp_set_keepalive(struct sock *sk, int val) { + if ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN)) + return; + if (val && !sk->keepopen) tcp_reset_keepalive_timer(sk, keepalive_time_when(&sk->tp_pinfo.af_tcp)); else if (!val) @@ -602,7 +689,7 @@ void tcp_set_keepalive(struct sock *sk, int val) } -void tcp_keepalive_timer (unsigned long data) +static void tcp_keepalive_timer (unsigned long data) { struct sock *sk = (struct sock *) data; struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; @@ -616,14 +703,31 @@ void tcp_keepalive_timer (unsigned long data) goto out; } - if (sk->state == TCP_FIN_WAIT2 && sk->dead) + if (sk->state == TCP_LISTEN) { + tcp_synack_timer(sk); + goto out; + } + + if (sk->state == TCP_FIN_WAIT2 && sk->dead) { + if (tp->linger2 >= 0) { + int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN; + + if (tmo > 0) { + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto out; + } + } + tcp_send_active_reset(sk, GFP_ATOMIC); goto death; + } - if (!sk->keepopen) + if (!sk->keepopen || sk->state == TCP_CLOSE) goto out; elapsed = keepalive_time_when(tp); - if (!((1<<sk->state) & (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT2))) + + /* It is alive without keepalive 8) */ + if (tp->packets_out || tp->send_head) goto resched; elapsed = tcp_time_stamp - tp->rcv_tstamp; @@ -632,28 +736,30 @@ void tcp_keepalive_timer (unsigned long data) if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) || (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) { tcp_send_active_reset(sk, GFP_ATOMIC); - tcp_write_err(sk, 1); + tcp_write_err(sk); goto out; } - tp->probes_out++; - tp->pending = TIME_KEEPOPEN; - tcp_write_wakeup(sk); - elapsed = keepalive_intvl_when(tp); + if (tcp_write_wakeup(sk) <= 0) { + tp->probes_out++; + elapsed = keepalive_intvl_when(tp); + } else { + /* If keepalive was lost due to local congestion, + * try harder. + */ + elapsed = TCP_RESOURCE_PROBE_INTERVAL; + } } else { /* It is tp->rcv_tstamp + keepalive_time_when(tp) */ - if (keepalive_time_when(tp) > elapsed) - elapsed = keepalive_time_when(tp) - elapsed; - else - elapsed = 0; + elapsed = keepalive_time_when(tp) - elapsed; } + TCP_CHECK_TIMER(sk); + resched: tcp_reset_keepalive_timer (sk, elapsed); goto out; death: - tcp_set_state(sk, TCP_CLOSE); - tcp_clear_xmit_timers(sk); tcp_done(sk); out: diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9ace56abd..c052d2eb8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -5,7 +5,7 @@ * * The User Datagram Protocol (UDP). * - * Version: $Id: udp.c,v 1.77 2000/01/09 02:19:44 davem Exp $ + * Version: $Id: udp.c,v 1.79 2000/01/18 08:24:20 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -369,30 +369,15 @@ void udp_err(struct sk_buff *skb, unsigned char *dp, int len) } /* - * Various people wanted BSD UDP semantics. 
Well they've come - * back out because they slow down response to stuff like dead - * or unreachable name servers and they screw term users something - * chronic. Oh and it violates RFC1122. So basically fix your - * client code people. - */ - - /* * RFC1122: OK. Passes ICMP errors back to application, as per - * 4.1.3.3. After the comment above, that should be no surprise. - */ - - if (!harderr && !sk->protinfo.af_inet.recverr) - goto out; - - /* - * 4.x BSD compatibility item. Break RFC1122 to - * get BSD socket semantics. + * 4.1.3.3. */ - if(sk->bsdism && sk->state!=TCP_ESTABLISHED && !sk->protinfo.af_inet.recverr) - goto out; - - if (sk->protinfo.af_inet.recverr) + if (!sk->protinfo.af_inet.recverr) { + if (!harderr || sk->state != TCP_ESTABLISHED) + goto out; + } else { ip_icmp_error(sk, skb, err, uh->dest, info, (u8*)(uh+1)); + } sk->err = err; sk->error_report(sk); out: @@ -629,15 +614,13 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) { switch(cmd) { - case TIOCOUTQ: + case SIOCOUTQ: { - unsigned long amount; - - amount = sock_wspace(sk); + int amount = atomic_read(&sk->wmem_alloc); return put_user(amount, (int *)arg); } - case TIOCINQ: + case SIOCINQ: { struct sk_buff *skb; unsigned long amount; @@ -663,6 +646,17 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) return(0); } +static __inline__ int __udp_checksum_complete(struct sk_buff *skb) +{ + return (unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum)); +} + +static __inline__ int udp_checksum_complete(struct sk_buff *skb) +{ + return skb->ip_summed != CHECKSUM_UNNECESSARY && + __udp_checksum_complete(skb); +} + /* * This should be easy, if there is something there we * return it, otherwise we block. @@ -699,31 +693,21 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len, msg->msg_flags |= MSG_TRUNC; } -#ifndef CONFIG_UDP_DELAY_CSUM - err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, - copied); -#else if (skb->ip_summed==CHECKSUM_UNNECESSARY) { err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied); - } else if (copied > msg->msg_iov[0].iov_len || (msg->msg_flags&MSG_TRUNC)) { - if ((unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum))) + } else if (msg->msg_flags&MSG_TRUNC) { + if (__udp_checksum_complete(skb)) goto csum_copy_err; err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied); } else { - unsigned int csum; + err = copy_and_csum_toiovec(msg->msg_iov, skb, sizeof(struct udphdr)); - err = 0; - csum = csum_partial(skb->h.raw, sizeof(struct udphdr), skb->csum); - csum = csum_and_copy_to_user((char*)&skb->h.uh[1], msg->msg_iov[0].iov_base, - copied, csum, &err); if (err) - goto out_free; - if ((unsigned short)csum_fold(csum)) goto csum_copy_err; } -#endif + if (err) goto out_free; sk->stamp=skb->stamp; @@ -744,7 +728,6 @@ out_free: out: return err; -#ifdef CONFIG_UDP_DELAY_CSUM csum_copy_err: UDP_INC_STATS_BH(UdpInErrors); @@ -768,7 +751,6 @@ csum_copy_err: * as some normal condition. */ return (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH; -#endif } int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) @@ -831,9 +813,9 @@ static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) * Charge it to the socket, dropping if the queue is full. 
*/ -#if defined(CONFIG_FILTER) && defined(CONFIG_UDP_DELAY_CSUM) +#if defined(CONFIG_FILTER) if (sk->filter && skb->ip_summed != CHECKSUM_UNNECESSARY) { - if ((unsigned short)csum_fold(csum_partial(skb->h.raw, skb->len, skb->csum))) { + if (__udp_checksum_complete(skb)) { UDP_INC_STATS_BH(UdpInErrors); IP_INC_STATS_BH(IpInDiscards); ip_statistics[smp_processor_id()*2].IpInDelivers--; @@ -855,12 +837,6 @@ static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) return 0; } - -static inline void udp_deliver(struct sock *sk, struct sk_buff *skb) -{ - udp_queue_rcv_skb(sk, skb); -} - /* * Multicasts and broadcasts go to each listener. * @@ -889,7 +865,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh, skb1 = skb_clone(skb, GFP_ATOMIC); if(skb1) - udp_deliver(sk, skb1); + udp_queue_rcv_skb(sk, skb1); sk = sknext; } while(sknext); } else @@ -898,30 +874,25 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh, return 0; } -static int udp_checksum_verify(struct sk_buff *skb, struct udphdr *uh, - unsigned short ulen, u32 saddr, u32 daddr, - int full_csum_deferred) +/* Initialize UDP checksum. If exited with zero value (success), + * CHECKSUM_UNNECESSARY means, that no more checks are required. + * Otherwise, csum completion requires chacksumming packet body, + * including udp header and folding it to skb->csum. + */ +static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, + unsigned short ulen, u32 saddr, u32 daddr) { - if (!full_csum_deferred) { - if (uh->check) { - if (skb->ip_summed == CHECKSUM_HW && - udp_check(uh, ulen, saddr, daddr, skb->csum)) - return -1; - if (skb->ip_summed == CHECKSUM_NONE && - udp_check(uh, ulen, saddr, daddr, - csum_partial((char *)uh, ulen, 0))) - return -1; - } - } else { - if (uh->check == 0) - skb->ip_summed = CHECKSUM_UNNECESSARY; - else if (skb->ip_summed == CHECKSUM_HW) { - if (udp_check(uh, ulen, saddr, daddr, skb->csum)) - return -1; - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else if (skb->ip_summed != CHECKSUM_UNNECESSARY) - skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); - } + if (uh->check == 0) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else if (skb->ip_summed == CHECKSUM_HW) { + if (udp_check(uh, ulen, saddr, daddr, skb->csum)) + return -1; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); + /* Probably, we should checksum udp header (it should be in cache + * in any case) and data in tiny packets (< rx copybreak). + */ return 0; } @@ -961,50 +932,33 @@ int udp_rcv(struct sk_buff *skb, unsigned short len) } skb_trim(skb, ulen); - if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) { - int defer; + if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0) + goto csum_error; -#ifdef CONFIG_UDP_DELAY_CSUM - defer = 1; -#else - defer = 0; -#endif - if (udp_checksum_verify(skb, uh, ulen, saddr, daddr, defer)) - goto csum_error; + if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return udp_v4_mcast_deliver(skb, uh, saddr, daddr); - } sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex); - - if (sk == NULL) { - /* No socket. Drop packet silently, if checksum is wrong */ - if (udp_checksum_verify(skb, uh, ulen, saddr, daddr, 0)) - goto csum_error; - - UDP_INC_STATS_BH(UdpNoPorts); - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); - /* - * Hmm. We got an UDP packet to a port to which we - * don't wanna listen. Ignore it. 
- */ - kfree_skb(skb); - return(0); - } - if (udp_checksum_verify(skb, uh, ulen, saddr, daddr, -#ifdef CONFIG_UDP_DELAY_CSUM - 1 -#else - (sk->no_check & UDP_CSUM_NORCV) != 0 -#endif - )) { + if (sk != NULL) { + udp_queue_rcv_skb(sk, skb); sock_put(sk); - goto csum_error; + return 0; } - udp_deliver(sk, skb); - __sock_put(sk); - return 0; + /* No socket. Drop packet silently, if checksum is wrong */ + if (udp_checksum_complete(skb)) + goto csum_error; + + UDP_INC_STATS_BH(UdpNoPorts); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + /* + * Hmm. We got an UDP packet to a port to which we + * don't wanna listen. Ignore it. + */ + kfree_skb(skb); + return(0); csum_error: /* @@ -1090,10 +1044,6 @@ struct proto udp_prot = { udp_connect, /* connect */ udp_disconnect, /* disconnect */ NULL, /* accept */ - NULL, /* retransmit */ - NULL, /* write_wakeup */ - NULL, /* read_wakeup */ - datagram_poll, /* poll */ udp_ioctl, /* ioctl */ NULL, /* init */ NULL, /* destroy */ @@ -1107,7 +1057,5 @@ struct proto udp_prot = { udp_v4_hash, /* hash */ udp_v4_unhash, /* unhash */ udp_v4_get_port, /* good_socknum */ - 128, /* max_header */ - 0, /* retransmits */ "UDP", /* name */ }; |
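
Two short user-space sketches follow for readers who want to poke at the logic in this diff without building a kernel. Both model the patch's arithmetic only; every type, constant, and helper name in them (struct pkt, sum16(), fold(), checksum_init(), checksum_complete(), tw_pick_wheel(), the assumed HZ=100 tick values) is an illustrative stand-in, not a kernel interface.

First, the udp.c side. With the CONFIG_UDP_DELAY_CSUM conditionals removed, checksum work is split into udp_checksum_init(), which runs in udp_rcv() and handles only what is already cheap (a zero checksum field, or a hardware-computed sum) while seeding skb->csum with the pseudo-header otherwise, and udp_checksum_complete()/__udp_checksum_complete(), which walk the payload only if nobody has verified it yet. A minimal model of that split, assuming a toy big-endian 16-bit ones'-complement sum:

/*
 * Minimal user-space model of the deferred UDP checksum scheme in the
 * hunks above (udp_checksum_init / __udp_checksum_complete).  All names
 * here are illustrative stand-ins, not kernel interfaces.
 */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

enum { CSUM_NONE, CSUM_HW, CSUM_UNNECESSARY };

struct pkt {
	int summed;		/* one of the CSUM_* states above	*/
	uint32_t csum;		/* running 32-bit partial sum		*/
	const uint8_t *data;	/* UDP header + payload			*/
	size_t len;
};

/* Accumulate 16-bit big-endian words into a 32-bit sum (csum_partial-ish). */
static uint32_t sum16(const uint8_t *p, size_t len, uint32_t sum)
{
	for (; len > 1; p += 2, len -= 2)
		sum += (uint32_t)p[0] << 8 | p[1];
	if (len)
		sum += (uint32_t)p[0] << 8;
	return sum;
}

/* End-around-carry fold to 16 bits, then complement (csum_fold-ish). */
static uint16_t fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

/* Decide at receive time how much checksum work is still owed. */
static int checksum_init(struct pkt *p, uint16_t check, uint32_t pseudo_sum)
{
	if (check == 0) {			/* sender used no checksum */
		p->summed = CSUM_UNNECESSARY;
	} else if (p->summed == CSUM_HW) {	/* NIC summed the payload  */
		if (fold(pseudo_sum + p->csum) != 0)
			return -1;		/* corrupt: drop now	   */
		p->summed = CSUM_UNNECESSARY;
	} else if (p->summed != CSUM_UNNECESSARY) {
		/* Software path: stash only the pseudo-header sum and
		 * defer walking the payload until it is copied anyway.
		 */
		p->csum = pseudo_sum;
	}
	return 0;
}

/* Pay the deferred cost only if nobody has verified the packet yet. */
static int checksum_complete(const struct pkt *p)
{
	if (p->summed == CSUM_UNNECESSARY)
		return 0;
	return fold(sum16(p->data, p->len, p->csum)) != 0;
}

int main(void)
{
	/* 8-byte UDP header (checksum zeroed for now) + 4-byte payload. */
	uint8_t dgram[12] = {
		0x04, 0x00, 0x10, 0x00,	/* src port 1024, dst port 4096	*/
		0x00, 0x0c, 0x00, 0x00,	/* length 12, checksum 0	*/
		'p', 'i', 'n', 'g'
	};
	const uint8_t addrs[8] = { 192, 168, 0, 1, 192, 168, 0, 2 };
	uint32_t pseudo = sum16(addrs, sizeof(addrs),
				17 /* IPPROTO_UDP */ + sizeof(dgram));
	uint16_t check = fold(sum16(dgram, sizeof(dgram), pseudo));
	struct pkt p = { CSUM_NONE, 0, dgram, sizeof(dgram) };

	dgram[6] = check >> 8;		/* fill in the header checksum	*/
	dgram[7] = check & 0xff;

	if (checksum_init(&p, check, pseudo) == 0 && checksum_complete(&p) == 0)
		puts("deferred verification passed");
	return 0;
}

On the common path the deferred fold never runs as a separate pass: udp_recvmsg() uses copy_and_csum_toiovec(), so the payload is summed in the same sweep that copies it to user space, and a datagram aimed at a closed port is only summed right before the port-unreachable ICMP is sent.

Second, the tcp_timer.c side. tcp_tw_schedule() now rounds the requested timeout up to recycle ticks and, if the result fits in the fine-grained wheel, hangs the bucket there; otherwise it falls back to the coarse TWKILL wheel with its TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS period, clamped to the last slot. A sketch of just that slot selection, with assumed HZ=100 constants:

/*
 * Companion sketch for the tcp_tw_schedule() hunk earlier in this diff:
 * how a timewait timeout maps onto either the fine-grained recycle wheel
 * or the coarse TWKILL wheel.  The constants below are assumptions for
 * HZ=100; only the arithmetic mirrors the patch.
 */
#include <stdio.h>

#define TW_RECYCLE_TICK		7		/* assumed: 2^7 jiffies/slot */
#define TW_RECYCLE_SLOTS	32
#define TWKILL_SLOTS		8
#define TIMEWAIT_LEN		(60 * 100)	/* assumed: 60 s in jiffies  */
#define TWKILL_PERIOD		(TIMEWAIT_LEN / TWKILL_SLOTS)

/* Returns 1 if the coarse wheel is used, 0 for the recycle wheel;
 * *slot gets the relative slot within the chosen wheel.
 */
static int tw_pick_wheel(unsigned long timeo, unsigned int *slot)
{
	unsigned int s = (timeo + (1 << TW_RECYCLE_TICK) - 1) >> TW_RECYCLE_TICK;

	if (s >= TW_RECYCLE_SLOTS) {		/* too far out: coarse wheel */
		if (timeo >= TIMEWAIT_LEN) {
			s = TWKILL_SLOTS - 1;
		} else {
			s = (timeo + TWKILL_PERIOD - 1) / TWKILL_PERIOD;
			if (s >= TWKILL_SLOTS)
				s = TWKILL_SLOTS - 1;
		}
		*slot = s;
		return 1;
	}
	*slot = s;				/* fine-grained recycle wheel */
	return 0;
}

int main(void)
{
	unsigned int slot;
	unsigned long timeo;

	for (timeo = 100; timeo <= 6400; timeo *= 4) {
		int coarse = tw_pick_wheel(timeo, &slot);
		printf("timeo=%lu jiffies -> %s wheel, slot %u\n",
		       timeo, coarse ? "coarse" : "recycle", slot);
	}
	return 0;
}

The point of the two wheels, per the comment in the patch, is that peers which understand PAWS let the bucket die after roughly 3.5*RTO instead of a full MSL, so short timeouts need finer scheduling granularity than the old single TWKILL slot ring provided.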