path: root/net/ipv4
author     Ralf Baechle <ralf@linux-mips.org>   1998-03-18 17:17:51 +0000
committer  Ralf Baechle <ralf@linux-mips.org>   1998-03-18 17:17:51 +0000
commit     f1382dc4850bb459d24a81c6cb0ef93ea7bd4a79 (patch)
tree       225271a3d5dcd4e9dea5ee393556abd754c964b1 /net/ipv4
parent     135b00fc2e90e605ac2a96b20b0ebd93851a3f89 (diff)
o Merge with Linux 2.1.90.
o Divide L1 cache sizes by 1024 before printing, makes the numbers a bit more credible ...
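A hypothetical illustration of the second commit-message item (the actual change is in arch code, outside the net/ipv4 diff below): the raw byte count is divided by 1024 before printing so the reported figure reads as kilobytes. The variable name is invented for this sketch.

#include <stdio.h>

int main(void)
{
	unsigned long icache_bytes = 32768;	/* example: a 32 KB cache */

	/* print "32kB" rather than "32768" */
	printf("Primary instruction cache %lukB\n", icache_bytes / 1024);
	return 0;
}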
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/af_inet.c           15
-rw-r--r--  net/ipv4/arp.c                8
-rw-r--r--  net/ipv4/devinet.c           38
-rw-r--r--  net/ipv4/fib_frontend.c      24
-rw-r--r--  net/ipv4/fib_hash.c           2
-rw-r--r--  net/ipv4/fib_rules.c          2
-rw-r--r--  net/ipv4/fib_semantics.c      2
-rw-r--r--  net/ipv4/icmp.c               8
-rw-r--r--  net/ipv4/igmp.c               2
-rw-r--r--  net/ipv4/ip_forward.c         2
-rw-r--r--  net/ipv4/ip_fragment.c        2
-rw-r--r--  net/ipv4/ip_fw.c             35
-rw-r--r--  net/ipv4/ip_input.c           2
-rw-r--r--  net/ipv4/ip_masq_mod.c       10
-rw-r--r--  net/ipv4/ip_masq_raudio.c     2
-rw-r--r--  net/ipv4/ip_nat_dumb.c       34
-rw-r--r--  net/ipv4/ip_options.c         2
-rw-r--r--  net/ipv4/ip_output.c          2
-rw-r--r--  net/ipv4/ip_sockglue.c        2
-rw-r--r--  net/ipv4/ipconfig.c           2
-rw-r--r--  net/ipv4/ipip.c               2
-rw-r--r--  net/ipv4/ipmr.c              16
-rw-r--r--  net/ipv4/proc.c              58
-rw-r--r--  net/ipv4/rarp.c               2
-rw-r--r--  net/ipv4/raw.c                2
-rw-r--r--  net/ipv4/route.c            348
-rw-r--r--  net/ipv4/syncookies.c         2
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c   15
-rw-r--r--  net/ipv4/tcp.c              440
-rw-r--r--  net/ipv4/tcp_input.c        852
-rw-r--r--  net/ipv4/tcp_ipv4.c         784
-rw-r--r--  net/ipv4/tcp_output.c       449
-rw-r--r--  net/ipv4/tcp_timer.c         75
-rw-r--r--  net/ipv4/timer.c             63
-rw-r--r--  net/ipv4/udp.c               16
35 files changed, 1707 insertions, 1613 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 584ad8c7a..ef1c44620 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -5,7 +5,7 @@
*
* AF_INET protocol family socket handler.
*
- * Version: $Id: af_inet.c,v 1.5 1997/12/16 05:37:33 ralf Exp $
+ * Version: $Id: af_inet.c,v 1.6 1998/03/17 22:18:20 ralf Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -52,6 +52,7 @@
* Willy Konynenberg : Transparent proxying support.
* David S. Miller : New socket lookup architecture.
* Some other random speedups.
+ * Cyrus Durgin : Cleaned up file for kmod hacks.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -107,8 +108,8 @@
#ifdef CONFIG_BRIDGE
#include <net/br.h>
#endif
-#ifdef CONFIG_KERNELD
-#include <linux/kerneld.h>
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
#endif
#ifdef CONFIG_NET_RADIO
#include <linux/wireless.h>
@@ -327,7 +328,7 @@ static int inet_create(struct socket *sock, int protocol)
static int warned;
if (net_families[AF_PACKET]==NULL)
{
-#if defined(CONFIG_KERNELD) && defined(CONFIG_PACKET_MODULE)
+#if defined(CONFIG_KMOD) && defined(CONFIG_PACKET_MODULE)
char module_name[30];
sprintf(module_name,"net-pf-%d", AF_PACKET);
request_module(module_name);
@@ -341,7 +342,7 @@ static int inet_create(struct socket *sock, int protocol)
}
sock->state = SS_UNCONNECTED;
- sk = sk_alloc(AF_INET, GFP_KERNEL);
+ sk = sk_alloc(AF_INET, GFP_KERNEL, 1);
if (sk == NULL)
goto do_oom;
@@ -894,7 +895,7 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case SIOCDRARP:
case SIOCGRARP:
case SIOCSRARP:
-#ifdef CONFIG_KERNELD
+#ifdef CONFIG_KMOD
if (rarp_ioctl_hook == NULL)
request_module("rarp");
#endif
@@ -928,7 +929,7 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
#ifdef CONFIG_DLCI_MODULE
-#ifdef CONFIG_KERNELD
+#ifdef CONFIG_KMOD
if (dlci_ioctl_hook == NULL)
request_module("dlci");
#endif
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 94ae4263e..dd7ce9e0f 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1,6 +1,6 @@
/* linux/net/inet/arp.c
*
- * Version: $Id: arp.c,v 1.4 1998/03/03 01:23:36 ralf Exp $
+ * Version: $Id: arp.c,v 1.5 1998/03/17 22:18:21 ralf Exp $
*
* Copyright (C) 1994 by Florian La Roche
*
@@ -189,7 +189,7 @@ struct neigh_table arp_tbl =
NULL,
parp_redo,
{ NULL, NULL, &arp_tbl, 0, NULL, NULL,
- 30*HZ, 1*HZ, 60*HZ, 30*HZ, 5*HZ, 3, 3, 0, 3, 1*HZ, (8*HZ)/10, 1*HZ, 64 },
+ 30*HZ, 1*HZ, 60*HZ, 30*HZ, 5*HZ, 3, 3, 0, 3, 1*HZ, (8*HZ)/10, 64, 1*HZ },
30*HZ, 128, 512, 1024,
};
@@ -954,6 +954,10 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy
struct device *dev = n->dev;
int hatype = dev->type;
+ /* Do not confuse users "arp -a" with magic entries */
+ if (!(n->nud_state&~NUD_NOARP))
+ continue;
+
/* I'd get great pleasure deleting
this ugly code. Let's output it in hexadecimal format.
"arp" utility will eventually repaired --ANK
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 7d5f0021f..87394f906 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1,7 +1,7 @@
/*
* NET3 IP device support routines.
*
- * Version: $Id: devinet.c,v 1.3 1997/12/16 05:37:35 ralf Exp $
+ * Version: $Id: devinet.c,v 1.4 1998/03/17 22:18:21 ralf Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -19,6 +19,7 @@
*
* Changes:
* Alexey Kuznetsov: pa_* fields are replaced with ifaddr lists.
+ Cyrus Durgin: updated for kmod
*/
#include <linux/config.h>
@@ -49,8 +50,8 @@
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
-#ifdef CONFIG_KERNELD
-#include <linux/kerneld.h>
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
#endif
#include <net/ip.h>
@@ -157,28 +158,32 @@ static void
inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy)
{
struct in_ifaddr *ifa1 = *ifap;
- struct in_ifaddr *ifa;
-
- /* 1. Unlink it */
- *ifap = ifa1->ifa_next;
-
- /* 2. Deleting primary ifaddr forces deletion all secondaries */
+ /* 1. Deleting primary ifaddr forces deletion all secondaries */
if (!(ifa1->ifa_flags&IFA_F_SECONDARY)) {
- while ((ifa=*ifap) != NULL) {
- if (ifa1->ifa_mask != ifa->ifa_mask ||
+ struct in_ifaddr *ifa;
+ struct in_ifaddr **ifap1 = &ifa1->ifa_next;
+
+ while ((ifa=*ifap1) != NULL) {
+ if (!(ifa->ifa_flags&IFA_F_SECONDARY) ||
+ ifa1->ifa_mask != ifa->ifa_mask ||
!inet_ifa_match(ifa1->ifa_address, ifa)) {
- ifap = &ifa->ifa_next;
+ ifap1 = &ifa->ifa_next;
continue;
}
- *ifap = ifa->ifa_next;
+ *ifap1 = ifa->ifa_next;
rtmsg_ifa(RTM_DELADDR, ifa);
notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa);
inet_free_ifa(ifa);
}
}
+ /* 2. Unlink it */
+
+ *ifap = ifa1->ifa_next;
+
+
/* 3. Announce address deletion */
/* Send message first, then call notifier.
@@ -232,10 +237,9 @@ inet_insert_ifa(struct in_device *in_dev, struct in_ifaddr *ifa)
ifap = last_primary;
}
- cli();
ifa->ifa_next = *ifap;
+ /* ATOMIC_SET */
*ifap = ifa;
- sti();
/* Send message first, then call notifier.
Notifier will trigger FIB update, so that
@@ -413,7 +417,7 @@ int devinet_ioctl(unsigned int cmd, void *arg)
*colon = 0;
#endif
-#ifdef CONFIG_KERNELD
+#ifdef CONFIG_KMOD
dev_load(ifr.ifr_name);
#endif
@@ -960,6 +964,8 @@ static void devinet_sysctl_register(struct in_device *in_dev, struct ipv4_devcon
t->sysctl_header = register_sysctl_table(t->devinet_root_dir, 0);
if (t->sysctl_header == NULL)
kfree(t);
+ else
+ p->sysctl = t;
}
static void devinet_sysctl_unregister(struct ipv4_devconf *p)
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 409db8209..6350a6366 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -5,7 +5,7 @@
*
* IPv4 Forwarding Information Base: FIB frontend.
*
- * Version: $Id: fib_frontend.c,v 1.6 1997/12/13 21:52:48 kuznet Exp $
+ * Version: $Id: fib_frontend.c,v 1.9 1998/03/08 20:52:36 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
@@ -151,7 +151,6 @@ struct device * ip_dev_find(u32 addr)
memset(&key, 0, sizeof(key));
key.dst = addr;
- key.scope = RT_SCOPE_UNIVERSE;
if (!local_table || local_table->tb_lookup(local_table, &key, &res)
|| res.type != RTN_LOCAL)
@@ -344,6 +343,10 @@ int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
int s_t;
struct fib_table *tb;
+ if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) &&
+ ((struct rtmsg*)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED)
+ return ip_rt_dump(skb, cb);
+
s_t = cb->args[0];
if (s_t == 0)
s_t = cb->args[0] = RT_TABLE_MIN;
@@ -423,8 +426,13 @@ static void fib_add_ifaddr(struct in_ifaddr *ifa)
u32 addr = ifa->ifa_local;
u32 prefix = ifa->ifa_address&mask;
- if (ifa->ifa_flags&IFA_F_SECONDARY)
+ if (ifa->ifa_flags&IFA_F_SECONDARY) {
prim = inet_ifa_byprefix(in_dev, prefix, mask);
+ if (prim == NULL) {
+ printk(KERN_DEBUG "fib_add_ifaddr: bug: prim == NULL\n");
+ return;
+ }
+ }
fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
@@ -435,7 +443,8 @@ static void fib_add_ifaddr(struct in_ifaddr *ifa)
if (ifa->ifa_broadcast && ifa->ifa_broadcast != 0xFFFFFFFF)
fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
- if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY)) {
+ if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) &&
+ (prefix != addr || ifa->ifa_prefixlen < 32)) {
fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim);
@@ -464,8 +473,13 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
if (!(ifa->ifa_flags&IFA_F_SECONDARY))
fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
RTN_UNICAST, any, ifa->ifa_prefixlen, prim);
- else
+ else {
prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
+ if (prim == NULL) {
+ printk(KERN_DEBUG "fib_del_ifaddr: bug: prim == NULL\n");
+ return;
+ }
+ }
/* Deletion is more complicated than add.
We should take care of not to delete too much :-)
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 33bcf0321..4b89ab676 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -5,7 +5,7 @@
*
* IPv4 FIB: lookup engine and maintenance routines.
*
- * Version: $Id: fib_hash.c,v 1.1 1997/11/09 19:53:13 kuznet Exp $
+ * Version: $Id: fib_hash.c,v 1.3 1998/03/08 05:56:16 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 3ffb404b5..7ec60a5be 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -5,7 +5,7 @@
*
* IPv4 Forwarding Information Base: policy rules.
*
- * Version: $Id: fib_rules.c,v 1.2 1997/10/10 22:40:49 davem Exp $
+ * Version: $Id: fib_rules.c,v 1.3 1998/03/08 05:56:17 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 3883fcba0..d2d37e11e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -5,7 +5,7 @@
*
* IPv4 Forwarding Information Base: semantics.
*
- * Version: $Id: fib_semantics.c,v 1.6 1997/12/13 21:52:49 kuznet Exp $
+ * Version: $Id: fib_semantics.c,v 1.7 1998/03/08 05:56:18 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index b2c7151d1..e8f636e21 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -3,7 +3,7 @@
*
* Alan Cox, <alan@cymru.net>
*
- * Version: $Id: icmp.c,v 1.4 1998/03/03 01:23:37 ralf Exp $
+ * Version: $Id: icmp.c,v 1.5 1998/03/17 22:18:23 ralf Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -928,10 +928,8 @@ int icmp_chkaddr(struct sk_buff *skb)
struct tcphdr *th = (struct tcphdr *)(((unsigned char *)iph)+(iph->ihl<<2));
sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex);
- if (!sk) return 0;
- if (sk->saddr != iph->saddr) return 0;
- if (sk->daddr != iph->daddr) return 0;
- if (sk->dummy_th.dest != th->dest) return 0;
+ if (!sk || (sk->state == TCP_LISTEN))
+ return 0;
/*
* This packet came from us.
*/
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 166b68b42..d3414a0fe 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -8,7 +8,7 @@
* the older version didn't come out right using gcc 2.5.8, the newer one
* seems to fall out with gcc 2.6.2.
*
- * Version: $Id: igmp.c,v 1.3 1997/12/16 05:37:36 ralf Exp $
+ * Version: $Id: igmp.c,v 1.4 1998/03/17 22:18:24 ralf Exp $
*
* Authors:
* Alan Cox <Alan.Cox@linux.org>
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 45a2ed588..8df8414cd 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -5,7 +5,7 @@
*
* The IP forwarding functionality.
*
- * Version: $Id: ip_forward.c,v 1.3 1998/03/03 01:23:37 ralf Exp $
+ * Version: $Id: ip_forward.c,v 1.4 1998/03/17 22:18:25 ralf Exp $
*
* Authors: see ip.c
*
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 9dccb5324..e6831adb8 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -5,7 +5,7 @@
*
* The IP fragmentation functionality.
*
- * Version: $Id: ip_fragment.c,v 1.30 1997/12/29 19:52:32 kuznet Exp $
+ * Version: $Id: ip_fragment.c,v 1.32 1998/03/08 05:56:21 davem Exp $
*
* Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
* Alan Cox <Alan.Cox@linux.org>
diff --git a/net/ipv4/ip_fw.c b/net/ipv4/ip_fw.c
index d78aa0f66..4eb41c325 100644
--- a/net/ipv4/ip_fw.c
+++ b/net/ipv4/ip_fw.c
@@ -6,7 +6,7 @@
* license in recognition of the original copyright.
* -- Alan Cox.
*
- * $Id: ip_fw.c,v 1.3 1997/12/16 05:37:37 ralf Exp $
+ * $Id: ip_fw.c,v 1.4 1998/03/17 22:18:25 ralf Exp $
*
* Ported from BSD to Linux,
* Alan Cox 22/Nov/1994.
@@ -392,6 +392,39 @@ int ip_fw_chk(struct iphdr *ip, struct device *rif, __u16 *redirport, struct ip_
continue; /* Mismatch */
}
+ /* This looks stupid, because we scan almost static
+ list, searching for static key. However, this way seems
+ to be only reasonable way of handling fw_via rules
+ (btw bsd makes the same thing).
+
+ It will not affect performance if you will follow
+ the following simple rules:
+
+ - if inteface is aliased, ALWAYS specify fw_viadev,
+ so that previous check will guarantee, that we will
+ not waste time when packet arrive on another interface.
+
+ - avoid using fw_via.s_addr if fw_via.s_addr is owned
+ by an aliased interface.
+
+ --ANK
+ */
+ if (f->fw_via.s_addr && rif) {
+ struct in_ifaddr *ifa;
+
+ if (rif->ip_ptr == NULL)
+ continue; /* Mismatch */
+
+ for (ifa = ((struct in_device*)(rif->ip_ptr))->ifa_list;
+ ifa; ifa = ifa->ifa_next) {
+ if (ifa->ifa_local == f->fw_via.s_addr)
+ goto ifa_ok;
+ }
+ continue; /* Mismatch */
+
+ ifa_ok:
+ }
+
/*
* Ok the chain addresses match.
*/
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 61c364542..fa8208959 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -5,7 +5,7 @@
*
* The Internet Protocol (IP) module.
*
- * Version: $Id: ip_input.c,v 1.2 1997/12/16 05:37:38 ralf Exp $
+ * Version: $Id: ip_input.c,v 1.3 1998/03/17 22:18:26 ralf Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
diff --git a/net/ipv4/ip_masq_mod.c b/net/ipv4/ip_masq_mod.c
index 797f9112f..2265161f3 100644
--- a/net/ipv4/ip_masq_mod.c
+++ b/net/ipv4/ip_masq_mod.c
@@ -12,6 +12,8 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
+ * Changes:
+ * Cyrus Durgin: fixed kerneld stuff for kmod.
*/
#include <linux/config.h>
@@ -21,8 +23,8 @@
#include <linux/errno.h>
#include <net/ip_masq.h>
#include <net/ip_masq_mod.h>
-#ifdef CONFIG_KERNELD
-#include <linux/kerneld.h>
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
#endif
EXPORT_SYMBOL(register_ip_masq_mod);
@@ -290,7 +292,7 @@ struct ip_masq_mod * ip_masq_mod_getbyname(const char *mmod_name)
int ip_masq_mod_ctl(int optname, struct ip_fw_masqctl *mctl, int optlen)
{
struct ip_masq_mod * mmod;
-#ifdef CONFIG_KERNELD
+#ifdef CONFIG_KMOD
char kmod_name[IP_MASQ_MOD_NMAX+8];
#endif
/* tappo */
@@ -299,7 +301,7 @@ int ip_masq_mod_ctl(int optname, struct ip_fw_masqctl *mctl, int optlen)
mmod = ip_masq_mod_getbyname(mctl->u.mod.name);
if (mmod)
return mmod->mmod_ctl(optname, mctl, optlen);
-#ifdef CONFIG_KERNELD
+#ifdef CONFIG_KMOD
sprintf(kmod_name,"ip_masq_%s", mctl->u.mod.name);
IP_MASQ_DEBUG(1, "About to request \"%s\" module\n", kmod_name);
diff --git a/net/ipv4/ip_masq_raudio.c b/net/ipv4/ip_masq_raudio.c
index f7e28f21a..377b8223e 100644
--- a/net/ipv4/ip_masq_raudio.c
+++ b/net/ipv4/ip_masq_raudio.c
@@ -2,7 +2,7 @@
* IP_MASQ_RAUDIO - Real Audio masquerading module
*
*
- * Version: @(#)$Id: ip_masq_raudio.c,v 1.8 1997/11/28 15:32:32 alan Exp $
+ * Version: @(#)$Id: ip_masq_raudio.c,v 1.9 1998/02/23 02:50:19 davem Exp $
*
* Author: Nigel Metheringham
* Real Time Streaming code by Progressive Networks
diff --git a/net/ipv4/ip_nat_dumb.c b/net/ipv4/ip_nat_dumb.c
index 06e9be8fb..def66858c 100644
--- a/net/ipv4/ip_nat_dumb.c
+++ b/net/ipv4/ip_nat_dumb.c
@@ -5,7 +5,7 @@
*
* Dumb Network Address Translation.
*
- * Version: $Id: ip_nat_dumb.c,v 1.2 1997/10/10 22:41:05 davem Exp $
+ * Version: $Id: ip_nat_dumb.c,v 1.2 1997/12/16 05:37:40 ralf Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
@@ -14,6 +14,9 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
+ * Fixes:
+ * Rani Assaf : A zero checksum is a special case
+ * only in UDP
*
* NOTE: It is just working model of real NAT.
*/
@@ -49,7 +52,6 @@ ip_do_nat(struct sk_buff *skb)
u32 odaddr = iph->daddr;
u32 osaddr = iph->saddr;
u16 check;
- u16 *cksum = NULL;
IPCB(skb)->flags |= IPSKB_TRANSLATED;
@@ -62,17 +64,23 @@ ip_do_nat(struct sk_buff *skb)
/* If it is the first fragment, rewrite protocol headers */
if (!(iph->frag_off & htons(IP_OFFSET))) {
- /* Only plain TCP/UDP headers rewriting is implemented :-( */
- if (iph->protocol == IPPROTO_TCP)
- cksum = (u16*)&((struct tcphdr*)(((char*)iph) + iph->ihl*4))->check;
- else if (iph->protocol == IPPROTO_UDP)
- cksum = (u16*)&((struct udphdr*)(((char*)iph) + iph->ihl*4))->check;
- if (cksum && (check = *cksum) != 0) {
- check = csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 0, ~check);
- check = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0, ~check);
- if (!check)
- check = 0xFFFF;
- *cksum = check;
+ u16 *cksum;
+
+ switch(iph->protocol) {
+ case IPPROTO_TCP:
+ cksum = (u16*)&((struct tcphdr*)(((char*)iph) + iph->ihl*4))->check;
+ check = csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 0, ~(*cksum));
+ *cksum = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0, ~check);
+ break;
+ case IPPROTO_UDP:
+ cksum = (u16*)&((struct udphdr*)(((char*)iph) + iph->ihl*4))->check;
+ if ((check = *cksum) != 0) {
+ check = csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 0, ~check);
+ check = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0, ~check);
+ *cksum = check ? : 0xFFFF;
+ }
+ default:
+ break;
}
}
return 0;
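The Rani Assaf fix credited in the header above is the subtle point of this hunk: only UDP reserves an on-the-wire checksum of zero to mean "no checksum computed", so a recomputed sum that folds to zero must be transmitted as 0xFFFF, while TCP transmits zero as-is. A minimal user-space sketch of that folding rule, using an invented helper name rather than the kernel's csum_* routines:

#include <stdint.h>

/* Fold a 32-bit one's-complement accumulator into the 16-bit checksum
 * field. For UDP, 0 on the wire means "not computed", so a folded result
 * of 0 is sent as 0xFFFF instead; TCP has no such substitution. */
static uint16_t finish_udp_checksum(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xFFFF) + (sum >> 16);

	uint16_t csum = (uint16_t)~sum;
	return csum ? csum : 0xFFFF;
}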
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 53c680eed..d78cc1ff0 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -5,7 +5,7 @@
*
* The options processing module for ip.c
*
- * Version: $Id: ip_options.c,v 1.2 1997/12/16 05:37:40 ralf Exp $
+ * Version: $Id: ip_options.c,v 1.3 1998/03/17 22:18:28 ralf Exp $
*
* Authors: A.N.Kuznetsov
*
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index ac4ac22ae..63fbbfe1e 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -5,7 +5,7 @@
*
* The Internet Protocol (IP) output module.
*
- * Version: $Id: ip_output.c,v 1.4 1998/03/03 01:23:41 ralf Exp $
+ * Version: $Id: ip_output.c,v 1.5 1998/03/17 22:18:29 ralf Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index a500a72e5..1b7f44e8f 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -5,7 +5,7 @@
*
* The IP to API glue.
*
- * Version: $Id: ip_sockglue.c,v 1.4 1998/03/03 01:23:41 ralf Exp $
+ * Version: $Id: ip_sockglue.c,v 1.5 1998/03/17 22:18:29 ralf Exp $
*
* Authors: see ip.c
*
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 20521e643..1e44ae8aa 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1,5 +1,5 @@
/*
- * $Id: ipconfig.c,v 1.6 1998/01/09 17:19:46 mj Exp $
+ * $Id: ipconfig.c,v 1.11 1998/02/12 07:43:16 davem Exp $
*
* Automatic Configuration of IP -- use BOOTP or RARP or user-supplied
* information to configure own IP address and routes.
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 949661f41..ce071d406 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -1,7 +1,7 @@
/*
* Linux NET3: IP/IP protocol decoder.
*
- * Version: $Id: ipip.c,v 1.4 1997/12/16 05:37:42 ralf Exp $
+ * Version: $Id: ipip.c,v 1.5 1998/03/17 22:18:30 ralf Exp $
*
* Authors:
* Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index d3c07dca3..1177f33ac 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -9,7 +9,7 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Version: $Id: ipmr.c,v 1.29 1997/12/13 21:52:55 kuznet Exp $
+ * Version: $Id: ipmr.c,v 1.4 1998/03/17 22:18:31 ralf Exp $
*
* Fixes:
* Michael Chastain : Incorrect size of copying.
@@ -1351,6 +1351,7 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
int ct;
struct rtnexthop *nhp;
struct device *dev = vif_table[c->mfc_parent].dev;
+ u8 *b = skb->tail;
#ifdef CONFIG_RTNL_OLD_IFINFO
if (dev) {
@@ -1389,10 +1390,11 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
return 1;
rtattr_failure:
+ skb_trim(skb, b - skb->data);
return -EMSGSIZE;
}
-int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm)
+int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
{
struct mfc_cache *cache;
struct rtable *rt = (struct rtable*)skb->dst;
@@ -1400,10 +1402,16 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm)
start_bh_atomic();
cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
if (cache==NULL || (cache->mfc_flags&MFC_QUEUED)) {
- struct device *dev = skb->dev;
+ struct device *dev;
int vif;
int err;
+ if (nowait) {
+ end_bh_atomic();
+ return -EAGAIN;
+ }
+
+ dev = skb->dev;
if (dev == NULL || (vif = ipmr_find_vif(dev)) == ALL_VIFS) {
end_bh_atomic();
return -ENODEV;
@@ -1422,7 +1430,7 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm)
*/
end_bh_atomic();
- if (rtm->rtm_flags & RTM_F_NOTIFY)
+ if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
cache->mfc_flags |= MFC_NOTIFY;
return ipmr_fill_mroute(skb, cache, rtm);
}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 7f3b5f9bb..221207205 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -7,7 +7,7 @@
* PROC file system. It is mainly used for debugging and
* statistics.
*
- * Version: $Id: proc.c,v 1.23 1997/10/30 23:52:20 davem Exp $
+ * Version: $Id: proc.c,v 1.4 1997/12/16 05:37:43 ralf Exp $
*
* Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de>
@@ -77,11 +77,12 @@ static inline void get__sock(struct sock *sp, char *tmpbuf, int i, int format)
unsigned long dest, src;
unsigned short destp, srcp;
int timer_active, timer_active1, timer_active2;
+ int tw_bucket = 0;
unsigned long timer_expires;
struct tcp_opt *tp = &sp->tp_pinfo.af_tcp;
dest = sp->daddr;
- src = sp->saddr;
+ src = sp->rcv_saddr;
destp = sp->dummy_th.dest;
srcp = sp->dummy_th.source;
@@ -96,30 +97,47 @@ static inline void get__sock(struct sock *sp, char *tmpbuf, int i, int format)
destp = ntohs(destp);
srcp = ntohs(srcp);
- timer_active1 = del_timer(&tp->retransmit_timer);
- timer_active2 = del_timer(&sp->timer);
- if (!timer_active1) tp->retransmit_timer.expires=0;
- if (!timer_active2) sp->timer.expires=0;
- timer_active=0;
- timer_expires=(unsigned)-1;
+ if((format == 0) && (sp->state == TCP_TIME_WAIT)) {
+ struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sp;
+
+ tw_bucket = 1;
+ timer_active1 = timer_active2 = 0;
+ timer_active = 3;
+ timer_expires = tw->timer.expires;
+ } else {
+ timer_active1 = del_timer(&tp->retransmit_timer);
+ timer_active2 = del_timer(&sp->timer);
+ if (!timer_active1) tp->retransmit_timer.expires=0;
+ if (!timer_active2) sp->timer.expires=0;
+ timer_active = 0;
+ timer_expires = (unsigned) -1;
+ }
if (timer_active1 && tp->retransmit_timer.expires < timer_expires) {
- timer_active=timer_active1;
- timer_expires=tp->retransmit_timer.expires;
+ timer_active = 1;
+ timer_expires = tp->retransmit_timer.expires;
}
if (timer_active2 && sp->timer.expires < timer_expires) {
- timer_active=timer_active2;
- timer_expires=sp->timer.expires;
- }
+ timer_active = 2;
+ timer_expires = sp->timer.expires;
+ }
+ if(timer_active == 0)
+ timer_expires = jiffies;
sprintf(tmpbuf, "%4d: %08lX:%04X %08lX:%04X"
" %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld",
i, src, srcp, dest, destp, sp->state,
- format==0?sp->write_seq-tp->snd_una:atomic_read(&sp->wmem_alloc),
- format==0?tp->rcv_nxt-sp->copied_seq:atomic_read(&sp->rmem_alloc),
- timer_active, timer_expires-jiffies,
- tp->retransmits,
- sp->socket ? sp->socket->inode->i_uid:0,
- timer_active?sp->timeout:0,
- sp->socket ? sp->socket->inode->i_ino:0);
+ (tw_bucket ?
+ 0 :
+ (format == 0) ?
+ tp->write_seq-tp->snd_una : atomic_read(&sp->wmem_alloc)),
+ (tw_bucket ?
+ 0 :
+ (format == 0) ?
+ tp->rcv_nxt-tp->copied_seq: atomic_read(&sp->rmem_alloc)),
+ timer_active, timer_expires-jiffies,
+ (tw_bucket ? 0 : tp->retransmits),
+ (!tw_bucket && sp->socket) ? sp->socket->inode->i_uid : 0,
+ (!tw_bucket && timer_active) ? sp->timeout : 0,
+ (!tw_bucket && sp->socket) ? sp->socket->inode->i_ino : 0);
if (timer_active1) add_timer(&tp->retransmit_timer);
if (timer_active2) add_timer(&sp->timer);
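For reference, the "%08lX:%04X" endpoint fields written by the sprintf above are the text that readers of /proc/net/tcp parse back: eight hex digits of the raw 32-bit address word as stored in the socket (so the text is byte-order dependent) and four hex digits of the port in host order. A small user-space sketch of decoding one such field on a machine of the same endianness as the writer; the example value assumes little-endian:

#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
	const char *field = "0100007F:0016";	/* 127.0.0.1:22 on little-endian */
	unsigned int addr, port;

	if (sscanf(field, "%X:%X", &addr, &port) == 2) {
		/* for a same-endian reader the hex word already holds the
		 * address bytes in wire order, so it can go into s_addr */
		struct in_addr in = { .s_addr = addr };

		printf("%s:%u\n", inet_ntoa(in), port);
	}
	return 0;
}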
diff --git a/net/ipv4/rarp.c b/net/ipv4/rarp.c
index 9e944495f..e1eba43c5 100644
--- a/net/ipv4/rarp.c
+++ b/net/ipv4/rarp.c
@@ -3,7 +3,7 @@
* Copyright (C) 1994 by Ross Martin
* Based on linux/net/inet/arp.c, Copyright (C) 1994 by Florian La Roche
*
- * $Id: rarp.c,v 1.3 1997/12/16 05:37:44 ralf Exp $
+ * $Id: rarp.c,v 1.4 1998/03/17 22:18:31 ralf Exp $
*
* This module implements the Reverse Address Resolution Protocol
* (RARP, RFC 903), which is used to convert low level addresses such
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index b3644f10d..baebab777 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -5,7 +5,7 @@
*
* RAW - implementation of IP "raw" sockets.
*
- * Version: $Id: raw.c,v 1.3 1997/12/16 05:37:44 ralf Exp $
+ * Version: $Id: raw.c,v 1.4 1998/03/17 22:18:32 ralf Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b73c3ed11..8ce4a95f4 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -5,7 +5,7 @@
*
* ROUTE - implementation of the IP router.
*
- * Version: $Id: route.c,v 1.4 1998/03/03 01:23:42 ralf Exp $
+ * Version: $Id: route.c,v 1.5 1998/03/17 22:18:32 ralf Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -167,7 +167,7 @@ __u8 ip_tos2prio[16] = {
static struct rtable *rt_hash_table[RT_HASH_DIVISOR];
-static struct rtable * rt_intern_hash(unsigned hash, struct rtable * rth, u16 protocol);
+static struct rtable * rt_intern_hash(unsigned hash, struct rtable * rth);
static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
@@ -301,6 +301,8 @@ static void rt_run_flush(unsigned long dummy)
int i;
struct rtable * rth, * next;
+ rt_deadline = 0;
+
for (i=0; i<RT_HASH_DIVISOR; i++) {
int nr=0;
@@ -322,37 +324,41 @@ static void rt_run_flush(unsigned long dummy)
void rt_cache_flush(int delay)
{
+ unsigned long now = jiffies;
+ int user_mode = !in_interrupt();
+
if (delay < 0)
delay = ip_rt_min_delay;
start_bh_atomic();
if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
- long tmo = (long)(rt_deadline - rt_flush_timer.expires);
+ long tmo = (long)(rt_deadline - now);
/* If flush timer is already running
and flush request is not immediate (delay > 0):
- if deadline is not achieved, prolongate timer to "dealy",
+ if deadline is not achieved, prolongate timer to "delay",
otherwise fire it at deadline time.
*/
+ if (user_mode && (long)(rt_deadline-now) < ip_rt_max_delay-ip_rt_min_delay)
+ tmo = 0;
+
if (delay > tmo)
delay = tmo;
}
if (delay <= 0) {
- rt_deadline = 0;
end_bh_atomic();
-
rt_run_flush(0);
return;
}
if (rt_deadline == 0)
- rt_deadline = jiffies + ip_rt_max_delay;
+ rt_deadline = now + ip_rt_max_delay;
- rt_flush_timer.expires = jiffies + delay;
+ rt_flush_timer.expires = now + delay;
add_timer(&rt_flush_timer);
end_bh_atomic();
}
@@ -400,7 +406,7 @@ out:
return (atomic_read(&ipv4_dst_ops.entries) > ip_rt_max_size);
}
-static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt, u16 protocol)
+static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt)
{
struct rtable *rth, **rthp;
unsigned long now = jiffies;
@@ -472,7 +478,9 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
goto reject_redirect;
if (!IN_DEV_SHARED_MEDIA(in_dev)) {
- if (ip_fib_check_default(new_gw, dev))
+ if (!inet_addr_onlink(in_dev, new_gw, old_gw))
+ goto reject_redirect;
+ if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
goto reject_redirect;
} else {
if (inet_addr_type(new_gw) != RTN_UNICAST)
@@ -504,9 +512,13 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
rth->u.dst.dev != dev)
break;
+ dst_clone(&rth->u.dst);
+
rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
- if (rt == NULL)
+ if (rt == NULL) {
+ ip_rt_put(rth);
return;
+ }
/*
* Copy all the information.
@@ -531,14 +543,16 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
if (rt->u.dst.neighbour)
neigh_event_send(rt->u.dst.neighbour, NULL);
ip_rt_put(rt);
+ ip_rt_put(rth);
rt_free(rt);
break;
}
*rthp = rth->u.rt_next;
- rt_free(rth);
- rt = rt_intern_hash(hash, rt, ETH_P_IP);
+ rt = rt_intern_hash(hash, rt);
ip_rt_put(rt);
+ ip_rt_put(rth);
+ rt_free(rth);
break;
}
}
@@ -762,19 +776,45 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
u32 src;
struct fib_result res;
- if (rt->key.iif == 0) {
- memcpy(addr, &rt->rt_src, 4);
- return;
- }
- if (fib_lookup(&rt->key, &res) == 0) {
+ if (rt->key.iif == 0)
+ src = rt->rt_src;
+ else if (fib_lookup(&rt->key, &res) == 0)
src = FIB_RES_PREFSRC(res);
- memcpy(addr, &src, 4);
- return;
- }
- src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
+ else
+ src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
memcpy(addr, &src, 4);
}
+static void rt_set_nexthop(struct rtable *rt, struct fib_result *res)
+{
+ struct fib_info *fi = res->fi;
+
+ if (fi) {
+ if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
+ rt->rt_gateway = FIB_RES_GW(*res);
+#ifndef CONFIG_RTNL_OLD_IFINFO
+ rt->u.dst.mxlock = fi->fib_metrics[RTAX_LOCK-1];
+ rt->u.dst.pmtu = fi->fib_mtu;
+ if (fi->fib_mtu == 0) {
+ rt->u.dst.pmtu = rt->u.dst.dev->mtu;
+ if (rt->u.dst.mxlock&(1<<RTAX_MTU) &&
+ rt->rt_gateway != rt->rt_dst &&
+ rt->u.dst.pmtu > 576)
+ rt->u.dst.pmtu = 576;
+ }
+#else
+ rt->u.dst.pmtu = fi->fib_mtu ? : rt->u.dst.dev->mtu;
+#endif
+ rt->u.dst.window= fi->fib_window ? : 0;
+ rt->u.dst.rtt = fi->fib_rtt ? : TCP_TIMEOUT_INIT;
+ } else {
+ rt->u.dst.pmtu = rt->u.dst.dev->mtu;
+ rt->u.dst.window= 0;
+ rt->u.dst.rtt = TCP_TIMEOUT_INIT;
+ }
+ rt->rt_type = res->type;
+}
+
static int
ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
u8 tos, struct device *dev, int our)
@@ -832,7 +872,7 @@ ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
#endif
hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos);
- skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, 0);
+ skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth);
return 0;
}
@@ -990,18 +1030,9 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
rth->u.dst.input = ip_forward;
rth->u.dst.output = ip_output;
- rth->u.dst.pmtu = res.fi->fib_mtu ? : out_dev->dev->mtu;
- rth->u.dst.window=res.fi->fib_window ? : 0;
- rth->u.dst.rtt = res.fi->fib_rtt ? : TCP_TIMEOUT_INIT;
-#ifndef CONFIG_RTNL_OLD_IFINFO
- rth->u.dst.mxlock = res.fi->fib_metrics[RTAX_LOCK-1];
-#endif
-
- if (FIB_RES_GW(res) && FIB_RES_NH(res).nh_scope == RT_SCOPE_LINK)
- rth->rt_gateway = FIB_RES_GW(res);
+ rt_set_nexthop(rth, &res);
rth->rt_flags = flags;
- rth->rt_type = res.type;
#ifdef CONFIG_NET_FASTROUTE
if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
@@ -1014,7 +1045,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
}
#endif
- skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, ntohs(skb->protocol));
+ skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth);
return 0;
brd_input:
@@ -1062,7 +1093,7 @@ local_input:
}
rth->rt_flags = flags|RTCF_LOCAL;
rth->rt_type = res.type;
- skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, 0);
+ skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth);
return 0;
no_route:
@@ -1362,7 +1393,7 @@ make_route:
rth->rt_dst_map = key.dst;
rth->rt_src_map = key.src;
#endif
- rth->rt_iif = dev_out->ifindex;
+ rth->rt_iif = oif ? : dev_out->ifindex;
rth->u.dst.dev = dev_out;
rth->rt_gateway = key.dst;
rth->rt_spec_dst= key.src;
@@ -1388,24 +1419,12 @@ make_route:
#endif
}
- if (res.fi) {
- if (FIB_RES_GW(res) && FIB_RES_NH(res).nh_scope == RT_SCOPE_LINK)
- rth->rt_gateway = FIB_RES_GW(res);
- rth->u.dst.pmtu = res.fi->fib_mtu ? : dev_out->mtu;
- rth->u.dst.window=res.fi->fib_window ? : 0;
- rth->u.dst.rtt = res.fi->fib_rtt ? : TCP_TIMEOUT_INIT;
-#ifndef CONFIG_RTNL_OLD_IFINFO
- rth->u.dst.mxlock = res.fi->fib_metrics[RTAX_LOCK-1];
-#endif
- } else {
- rth->u.dst.pmtu = dev_out->mtu;
- rth->u.dst.window=0;
- rth->u.dst.rtt = TCP_TIMEOUT_INIT;
- }
+ rt_set_nexthop(rth, &res);
+
rth->rt_flags = flags;
- rth->rt_type = res.type;
+
hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
- *rp = rt_intern_hash(hash, rth, ETH_P_IP);
+ *rp = rt_intern_hash(hash, rth);
return 0;
}
@@ -1444,6 +1463,113 @@ int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
#ifdef CONFIG_RTNETLINK
+static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int nowait)
+{
+ struct rtable *rt = (struct rtable*)skb->dst;
+ struct rtmsg *r;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb->tail;
+ struct rta_cacheinfo ci;
+#ifdef CONFIG_IP_MROUTE
+ struct rtattr *eptr;
+#endif
+#ifdef CONFIG_RTNL_OLD_IFINFO
+ unsigned char *o;
+#else
+ struct rtattr *mx;
+#endif
+
+ nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
+ r = NLMSG_DATA(nlh);
+ nlh->nlmsg_flags = nowait ? NLM_F_MULTI : 0;
+ r->rtm_family = AF_INET;
+ r->rtm_dst_len = 32;
+ r->rtm_src_len = 32;
+ r->rtm_tos = rt->key.tos;
+ r->rtm_table = RT_TABLE_MAIN;
+ r->rtm_type = rt->rt_type;
+ r->rtm_scope = RT_SCOPE_UNIVERSE;
+ r->rtm_protocol = RTPROT_UNSPEC;
+ r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED;
+#ifdef CONFIG_RTNL_OLD_IFINFO
+ r->rtm_nhs = 0;
+
+ o = skb->tail;
+#endif
+ RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
+ RTA_PUT(skb, RTA_SRC, 4, &rt->rt_src);
+ if (rt->u.dst.dev)
+ RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
+ if (rt->rt_dst != rt->rt_gateway)
+ RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
+#ifdef CONFIG_RTNL_OLD_IFINFO
+ RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &rt->u.dst.pmtu);
+ RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &rt->u.dst.window);
+ RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &rt->u.dst.rtt);
+#else
+ mx = (struct rtattr*)skb->tail;
+ RTA_PUT(skb, RTA_METRICS, 0, NULL);
+ if (rt->u.dst.mxlock)
+ RTA_PUT(skb, RTAX_LOCK, sizeof(unsigned), &rt->u.dst.mxlock);
+ if (rt->u.dst.pmtu)
+ RTA_PUT(skb, RTAX_MTU, sizeof(unsigned), &rt->u.dst.pmtu);
+ if (rt->u.dst.window)
+ RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), &rt->u.dst.window);
+ if (rt->u.dst.rtt)
+ RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt);
+ mx->rta_len = skb->tail - (u8*)mx;
+ if (mx->rta_len == RTA_LENGTH(0))
+ skb_trim(skb, (u8*)mx - skb->data);
+#endif
+ RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
+ ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
+ ci.rta_used = atomic_read(&rt->u.dst.refcnt);
+ ci.rta_clntref = atomic_read(&rt->u.dst.use);
+ ci.rta_expires = 0;
+ ci.rta_error = rt->u.dst.error;
+#ifdef CONFIG_IP_MROUTE
+ eptr = (struct rtattr*)skb->tail;
+#endif
+ RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
+#ifdef CONFIG_RTNL_OLD_IFINFO
+ r->rtm_optlen = skb->tail - o;
+#endif
+ if (rt->key.iif) {
+#ifdef CONFIG_IP_MROUTE
+ u32 dst = rt->rt_dst;
+
+ if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) {
+ int err = ipmr_get_route(skb, r, nowait);
+ if (err <= 0) {
+ if (!nowait) {
+ if (err == 0)
+ return 0;
+ goto nlmsg_failure;
+ } else {
+ if (err == -EMSGSIZE)
+ goto nlmsg_failure;
+ ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
+ }
+ }
+ } else
+#endif
+ {
+ RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
+#ifdef CONFIG_RTNL_OLD_IFINFO
+ r->rtm_optlen = skb->tail - o;
+#endif
+ }
+ }
+
+ nlh->nlmsg_len = skb->tail - b;
+ return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
struct rtattr **rta = arg;
@@ -1454,12 +1580,6 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
int iif = 0;
int err;
struct sk_buff *skb;
- struct rta_cacheinfo ci;
-#ifdef CONFIG_RTNL_OLD_IFINFO
- unsigned char *o;
-#else
- struct rtattr *mx;
-#endif
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (skb == NULL)
@@ -1506,83 +1626,53 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
if (rtm->rtm_flags & RTM_F_NOTIFY)
rt->rt_flags |= RTCF_NOTIFY;
- nlh = NLMSG_PUT(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
- RTM_NEWROUTE, sizeof(*rtm));
- rtm = NLMSG_DATA(nlh);
- nlh->nlmsg_flags = 0;
- rtm->rtm_family = AF_INET;
- rtm->rtm_dst_len = 32;
- rtm->rtm_src_len = 32;
- rtm->rtm_tos = rt->key.tos;
- rtm->rtm_table = RT_TABLE_MAIN;
- rtm->rtm_type = rt->rt_type;
- rtm->rtm_scope = RT_SCOPE_UNIVERSE;
- rtm->rtm_protocol = RTPROT_UNSPEC;
- rtm->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED;
-#ifdef CONFIG_RTNL_OLD_IFINFO
- rtm->rtm_nhs = 0;
+ NETLINK_CB(skb).pid = NETLINK_CB(in_skb).pid;
+
+ err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0);
+ if (err == 0)
+ return 0;
+ if (err < 0)
+ return -EMSGSIZE;
- o = skb->tail;
-#endif
- RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
- RTA_PUT(skb, RTA_SRC, 4, &rt->rt_src);
- if (rt->u.dst.dev)
- RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
- if (rt->rt_dst != rt->rt_gateway)
- RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
-#ifdef CONFIG_RTNL_OLD_IFINFO
- RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &rt->u.dst.pmtu);
- RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &rt->u.dst.window);
- RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &rt->u.dst.rtt);
-#else
- mx = (struct rtattr*)skb->tail;
- RTA_PUT(skb, RTA_METRICS, 0, NULL);
- if (rt->u.dst.mxlock)
- RTA_PUT(skb, RTAX_LOCK, sizeof(unsigned), &rt->u.dst.mxlock);
- if (rt->u.dst.pmtu)
- RTA_PUT(skb, RTAX_MTU, sizeof(unsigned), &rt->u.dst.pmtu);
- if (rt->u.dst.window)
- RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), &rt->u.dst.window);
- if (rt->u.dst.rtt)
- RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt);
- mx->rta_len = skb->tail - (u8*)mx;
-#endif
- RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
- ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
- ci.rta_used = atomic_read(&rt->u.dst.refcnt);
- ci.rta_clntref = atomic_read(&rt->u.dst.use);
- ci.rta_expires = 0;
- ci.rta_error = rt->u.dst.error;
- RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
-#ifdef CONFIG_RTNL_OLD_IFINFO
- rtm->rtm_optlen = skb->tail - o;
-#endif
- if (iif) {
-#ifdef CONFIG_IP_MROUTE
- if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) {
- NETLINK_CB(skb).pid = NETLINK_CB(in_skb).pid;
- err = ipmr_get_route(skb, rtm);
- if (err <= 0)
- return err;
- } else
-#endif
- {
- RTA_PUT(skb, RTA_IIF, sizeof(int), &iif);
-#ifdef CONFIG_RTNL_OLD_IFINFO
- rtm->rtm_optlen = skb->tail - o;
-#endif
- }
- }
- nlh->nlmsg_len = skb->tail - (u8*)nlh;
err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
if (err < 0)
return err;
return 0;
+}
-nlmsg_failure:
-rtattr_failure:
- kfree_skb(skb);
- return -EMSGSIZE;
+
+int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct rtable *rt;
+ int h, s_h;
+ int idx, s_idx;
+
+ s_h = cb->args[0];
+ s_idx = idx = cb->args[1];
+ for (h=0; h < RT_HASH_DIVISOR; h++) {
+ if (h < s_h) continue;
+ if (h > s_h)
+ memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(int));
+ start_bh_atomic();
+ for (rt = rt_hash_table[h], idx = 0; rt; rt = rt->u.rt_next, idx++) {
+ if (idx < s_idx)
+ continue;
+ skb->dst = dst_clone(&rt->u.dst);
+ if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) {
+ dst_release(xchg(&skb->dst, NULL));
+ end_bh_atomic();
+ goto done;
+ }
+ dst_release(xchg(&skb->dst, NULL));
+ }
+ end_bh_atomic();
+ }
+
+done:
+ cb->args[0] = h;
+ cb->args[1] = idx;
+ return skb->len;
}
#endif /* CONFIG_RTNETLINK */
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 7d119716e..00dd0a8ef 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -9,7 +9,7 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * $Id: syncookies.c,v 1.3 1997/09/16 17:16:21 freitag Exp $
+ * $Id: syncookies.c,v 1.4 1998/03/08 05:56:34 davem Exp $
*
* Missing: IPv6 support.
* Some counter so that the Administrator can see when the machine
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3a8a7efb4..767c5d00b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1,7 +1,7 @@
/*
* sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem.
*
- * $Id: sysctl_net_ipv4.c,v 1.6 1998/03/03 01:23:42 ralf Exp $
+ * $Id: sysctl_net_ipv4.c,v 1.7 1998/03/17 22:18:33 ralf Exp $
*
* Begun April 1, 1996, Mike Shaver.
* Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS]
@@ -45,8 +45,6 @@ extern int sysctl_ip_masq_debug;
extern int sysctl_tcp_cong_avoidance;
extern int sysctl_tcp_hoe_retransmits;
-extern int sysctl_tcp_sack;
-extern int sysctl_tcp_tsack;
extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
extern int sysctl_tcp_keepalive_time;
@@ -57,7 +55,8 @@ extern int sysctl_tcp_retries2;
extern int sysctl_tcp_fin_timeout;
extern int sysctl_tcp_syncookies;
extern int sysctl_tcp_syn_retries;
-extern int sysctl_tcp_stdurg;
+extern int sysctl_tcp_stdurg;
+extern int sysctl_tcp_rfc1337;
extern int sysctl_tcp_syn_taildrop;
extern int sysctl_max_syn_backlog;
@@ -99,12 +98,6 @@ ctl_table ipv4_table[] = {
{NET_IPV4_TCP_HOE_RETRANSMITS, "tcp_hoe_retransmits",
&sysctl_tcp_hoe_retransmits, sizeof(int), 0644, NULL,
&proc_dointvec},
- {NET_IPV4_TCP_SACK, "tcp_sack",
- &sysctl_tcp_sack, sizeof(int), 0644, NULL,
- &proc_dointvec},
- {NET_IPV4_TCP_TSACK, "tcp_tsack",
- &sysctl_tcp_tsack, sizeof(int), 0644, NULL,
- &proc_dointvec},
{NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps",
&sysctl_tcp_timestamps, sizeof(int), 0644, NULL,
&proc_dointvec},
@@ -162,6 +155,8 @@ ctl_table ipv4_table[] = {
#endif
{NET_TCP_STDURG, "tcp_stdurg", &sysctl_tcp_stdurg,
sizeof(int), 0644, NULL, &proc_dointvec},
+ {NET_TCP_RFC1337, "tcp_rfc1337", &sysctl_tcp_rfc1337,
+ sizeof(int), 0644, NULL, &proc_dointvec},
{NET_TCP_SYN_TAILDROP, "tcp_syn_taildrop", &sysctl_tcp_syn_taildrop,
sizeof(int), 0644, NULL, &proc_dointvec},
{NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog", &sysctl_max_syn_backlog,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 17ec6def9..b20df83d2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp.c,v 1.77 1998/01/15 22:40:18 freitag Exp $
+ * Version: $Id: tcp.c,v 1.96 1998/03/16 02:25:55 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -169,7 +169,7 @@
* Fixed tcp_write_timeout: stuck close,
* and TCP syn retries gets used now.
* Mark Yarvis : In tcp_read_wakeup(), don't send an
- * ack if stat is TCP_CLOSED.
+ * ack if state is TCP_CLOSED.
* Alan Cox : Look up device on a retransmit - routes may
* change. Doesn't yet cope with MSS shrink right
* but its a start!
@@ -425,6 +425,8 @@ int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
struct tcp_mib tcp_statistics;
kmem_cache_t *tcp_openreq_cachep;
+kmem_cache_t *tcp_bucket_cachep;
+kmem_cache_t *tcp_timewait_cachep;
/*
* Find someone to 'accept'. Must be called with
@@ -478,20 +480,6 @@ static void tcp_close_pending (struct sock *sk)
}
/*
- * Enter the time wait state.
- */
-
-void tcp_time_wait(struct sock *sk)
-{
- tcp_set_state(sk,TCP_TIME_WAIT);
- sk->shutdown = SHUTDOWN_MASK;
- if (!sk->dead)
- sk->state_change(sk);
- tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
-}
-
-
-/*
* Walk down the receive queue counting readable data.
*
* Must be called with the socket lock held.
@@ -512,7 +500,7 @@ static int tcp_readable(struct sock *sk)
return(0);
}
- counted = sk->copied_seq; /* Where we are at the moment */
+ counted = sk->tp_pinfo.af_tcp.copied_seq; /* Where we are at the moment */
amount = 0;
/* Do until a push or until we are out of data. */
@@ -606,10 +594,10 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
if (sk->shutdown & RCV_SHUTDOWN)
mask |= POLLHUP;
- if ((tp->rcv_nxt != sk->copied_seq) &&
- (sk->urg_seq != sk->copied_seq ||
- tp->rcv_nxt != sk->copied_seq+1 ||
- sk->urginline || !sk->urg_data))
+ if ((tp->rcv_nxt != tp->copied_seq) &&
+ (tp->urg_seq != tp->copied_seq ||
+ tp->rcv_nxt != tp->copied_seq+1 ||
+ sk->urginline || !tp->urg_data))
mask |= POLLIN | POLLRDNORM;
#if 1 /* This needs benchmarking and real world tests */
@@ -621,9 +609,9 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
space = atomic_read(&sk->wmem_alloc) / 2;
#endif
/* Always wake the user up when an error occured */
- if (sock_wspace(sk) >= space)
+ if (sock_wspace(sk) >= space || sk->err)
mask |= POLLOUT | POLLWRNORM;
- if (sk->urg_data)
+ if (tp->urg_data)
mask |= POLLPRI;
}
return mask;
@@ -649,7 +637,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
}
case SIOCATMARK:
{
- int answ = sk->urg_data && sk->urg_seq == sk->copied_seq;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ int answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
return put_user(answ,(int *) arg);
}
case TIOCOUTQ:
@@ -669,21 +658,38 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
/*
* Wait for a socket to get into the connected state
*/
-static void wait_for_tcp_connect(struct sock * sk)
+static int wait_for_tcp_connect(struct sock * sk, int flags)
{
struct task_struct *tsk = current;
struct wait_queue wait = { tsk, NULL };
- tsk->state = TASK_INTERRUPTIBLE;
- add_wait_queue(sk->sleep, &wait);
- release_sock(sk);
+ while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
+ if(sk->err)
+ return sock_error(sk);
+ if((1 << sk->state) &
+ ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ if(sk->keepopen)
+ send_sig(SIGPIPE, tsk, 0);
+ return -EPIPE;
+ }
+ if(flags & MSG_DONTWAIT)
+ return -EAGAIN;
+ if(signal_pending(tsk))
+ return -ERESTARTSYS;
- if (((1 << sk->state) & ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT)) && sk->err == 0)
- schedule();
+ tsk->state = TASK_INTERRUPTIBLE;
+ add_wait_queue(sk->sleep, &wait);
+ release_sock(sk);
- tsk->state = TASK_RUNNING;
- remove_wait_queue(sk->sleep, &wait);
- lock_sock(sk);
+ if (((1 << sk->state) & ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT)) &&
+ sk->err == 0)
+ schedule();
+
+ tsk->state = TASK_RUNNING;
+ remove_wait_queue(sk->sleep, &wait);
+ lock_sock(sk);
+ }
+ return 0;
}
static inline int tcp_memory_free(struct sock *sk)
@@ -720,32 +726,6 @@ static void wait_for_tcp_memory(struct sock * sk)
lock_sock(sk);
}
-
-static int tcp_append_tail(struct sock *sk, struct sk_buff *skb, u8 *from,
- int tcp_size, int seglen)
-{
- int fault;
- int copy;
-
- /* Add more stuff to the end of the skb. */
- copy = min(sk->mss - tcp_size, skb_tailroom(skb));
- copy = min(copy, seglen);
-
- tcp_size += copy;
-
- fault = copy_from_user(skb->tail, from, copy);
- if (fault)
- return -1;
-
- skb_put(skb, copy);
- skb->csum = csum_partial(skb->tail - tcp_size, tcp_size, 0);
-
- sk->write_seq += copy;
- skb->end_seq += copy;
-
- return copy;
-}
-
/*
* This routine copies from a user buffer into a socket,
* and starts the transmit system.
@@ -758,24 +738,9 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags)
struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
/* Wait for a connection to finish. */
- while ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
- if (sk->err)
- return sock_error(sk);
-
- if ((1 << sk->state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
- if (sk->keepopen)
- send_sig(SIGPIPE, current, 0);
- return -EPIPE;
- }
-
- if (flags&MSG_DONTWAIT)
- return -EAGAIN;
-
- if (signal_pending(current))
- return -ERESTARTSYS;
-
- wait_for_tcp_connect(sk);
- }
+ if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+ if((err = wait_for_tcp_connect(sk, flags)) != 0)
+ return err;
/* Ok commence sending. */
while(--iovlen >= 0) {
@@ -785,41 +750,28 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags)
iov++;
while(seglen > 0) {
- unsigned int actual_win;
- int copy;
- int tmp;
+ int copy, tmp, queue_it;
struct sk_buff *skb;
if (err)
return -EFAULT;
/* Stop on errors. */
- if (sk->err) {
- if (copied)
- return copied;
- return sock_error(sk);
- }
+ if (sk->err)
+ goto do_sock_err;
/* Make sure that we are established. */
- if (sk->shutdown & SEND_SHUTDOWN) {
- if (copied)
- return copied;
- send_sig(SIGPIPE,current,0);
- return -EPIPE;
- }
+ if (sk->shutdown & SEND_SHUTDOWN)
+ goto do_shutdown;
- /* Now we need to check if we have a half built packet. */
-
- /* If we have queued packets.. */
+ /* Now we need to check if we have a half
+ * built packet we can tack some data onto.
+ */
if (tp->send_head && !(flags & MSG_OOB)) {
- int tcp_size;
-
- /* Tail */
-
skb = sk->write_queue.prev;
- tcp_size = skb->tail -
- ((unsigned char *)(skb->h.th) + tp->tcp_header_len);
-
+ copy = skb->tail -
+ ((unsigned char *)(skb->h.th) +
+ tp->tcp_header_len);
/* This window_seq test is somewhat dangerous
* If the remote does SWS avoidance we should
* queue the best we can if not we should in
@@ -827,79 +779,92 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags)
* a method for detecting this would be most
* welcome
*/
- if (skb->end > skb->tail &&
- sk->mss - tcp_size > 0 &&
+ if (skb_tailroom(skb) > 0 &&
+ (sk->mss - copy) > 0 &&
tp->snd_nxt < skb->end_seq) {
- int tcopy;
-
- tcopy = tcp_append_tail(sk, skb, from,
- tcp_size,
- seglen);
- if (tcopy == -1)
- return -EFAULT;
-
- from += tcopy;
- copied += tcopy;
- seglen -= tcopy;
-
- /* FIXME: if we're nagling we
- * should send here.
- */
+ int last_byte_was_odd = (copy & 1);
+
+ copy = sk->mss - copy;
+ if(copy > skb_tailroom(skb))
+ copy = skb_tailroom(skb);
+ if(copy > seglen)
+ copy = seglen;
+ if(last_byte_was_odd) {
+ if(copy_from_user(skb_put(skb, copy),
+ from, copy))
+ err = -EFAULT;
+ skb->csum = csum_partial(
+ (((unsigned char *)skb->h.th) +
+ tp->tcp_header_len),
+ (skb->tail -
+ (((unsigned char *)skb->h.th) +
+ tp->tcp_header_len)), 0);
+ } else {
+ skb->csum =
+ csum_and_copy_from_user(
+ from, skb_put(skb, copy),
+ copy, skb->csum, &err);
+ }
+ tp->write_seq += copy;
+ skb->end_seq += copy;
+ from += copy;
+ copied += copy;
+ seglen -= copy;
continue;
}
}
- /* We also need to worry about the window.
- * If window < 1/2 the maximum window we've seen from this
- * host, don't use it. This is sender side
- * silly window prevention, as specified in RFC1122.
- * (Note that this is different than earlier versions of
- * SWS prevention, e.g. RFC813.). What we actually do is
- * use the whole MSS. Since the results in the right
- * edge of the packet being outside the window, it will
- * be queued for later rather than sent.
+ /* We also need to worry about the window. If
+ * window < 1/2 the maximum window we've seen
+ * from this host, don't use it. This is
+ * sender side silly window prevention, as
+ * specified in RFC1122. (Note that this is
+ * different than earlier versions of SWS
+ * prevention, e.g. RFC813.). What we
+ * actually do is use the whole MSS. Since
+ * the results in the right edge of the packet
+ * being outside the window, it will be queued
+ * for later rather than sent.
*/
- copy = min(seglen, sk->mss);
- actual_win = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
-
- if (copy > actual_win &&
- (((int) actual_win) >= (tp->max_window >> 1)) &&
- actual_win)
- copy = actual_win;
-
- if (copy <= 0) {
- printk(KERN_DEBUG "sendmsg: copy < 0\n");
- return -EIO;
- }
+ copy = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
+ if(copy >= (tp->max_window >> 1))
+ copy = min(copy, sk->mss);
+ else
+ copy = sk->mss;
+ if(copy > seglen)
+ copy = seglen;
- /* If tp->packets_out > 0 segment will be nagled
- * else we kick it right away.
- */
- tmp = MAX_HEADER + sk->prot->max_header +
+ tmp = MAX_HEADER + sk->prot->max_header +
sizeof(struct sk_buff) + 15;
- if (copy < min(sk->mss, tp->max_window >> 1) &&
- !(flags & MSG_OOB) && tp->packets_out)
+ queue_it = 0;
+ if (copy < min(sk->mss, tp->max_window >> 1) &&
+ !(flags & MSG_OOB)) {
tmp += min(sk->mss, tp->max_window);
- else
- tmp += copy;
+ /* What is happening here is that we want to
+ * tack on later members of the users iovec
+ * if possible into a single frame. When we
+ * leave this loop our caller checks to see if
+ * we can send queued frames onto the wire.
+ * See tcp_v[46]_sendmsg() for this.
+ */
+ queue_it = 1;
+ } else {
+ tmp += copy;
+ }
skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL);
/* If we didn't get any memory, we need to sleep. */
if (skb == NULL) {
sk->socket->flags |= SO_NOSPACE;
if (flags&MSG_DONTWAIT) {
- if (copied)
- return copied;
- return -EAGAIN;
+ err = -EAGAIN;
+ goto do_interrupted;
}
-
if (signal_pending(current)) {
- if (copied)
- return copied;
- return -ERESTARTSYS;
+ err = -ERESTARTSYS;
+ goto do_interrupted;
}
-
wait_for_tcp_memory(sk);
continue;
}
@@ -910,9 +875,8 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags)
tmp = tp->af_specific->build_net_header(sk, skb);
if (tmp < 0) {
kfree_skb(skb);
- if (copied)
- return(copied);
- return(tmp);
+ err = tmp;
+ goto do_interrupted;
}
skb->h.th =(struct tcphdr *)
@@ -920,7 +884,6 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags)
seglen -= copy;
tcp_build_header_data(skb->h.th, sk, seglen || iovlen);
- /* FIXME: still need to think about SACK options here. */
if (flags & MSG_OOB) {
skb->h.th->urg = 1;
@@ -933,21 +896,29 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags)
from += copy;
copied += copy;
- sk->write_seq += copy;
+ tp->write_seq += copy;
- tcp_send_skb(sk, skb);
-
- release_sock(sk);
- lock_sock(sk);
+ tcp_send_skb(sk, skb, queue_it);
}
}
-
sk->err = 0;
-
if (err)
return -EFAULT;
-
return copied;
+
+do_sock_err:
+ if(copied)
+ return copied;
+ return sock_error(sk);
+do_shutdown:
+ if(copied)
+ return copied;
+ send_sig(SIGPIPE, current, 0);
+ return -EPIPE;
+do_interrupted:
+ if(copied)
+ return copied;
+ return err;
}
/*
@@ -980,7 +951,7 @@ static int tcp_recv_urg(struct sock * sk, int nonblock,
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
/* No URG data to read. */
- if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
+ if (sk->urginline || !tp->urg_data || tp->urg_data == URG_READ)
return -EINVAL; /* Yes this is right ! */
if (sk->err)
@@ -1000,18 +971,10 @@ static int tcp_recv_urg(struct sock * sk, int nonblock,
}
lock_sock(sk);
- if (sk->urg_data & URG_VALID) {
- char c = sk->urg_data;
+ if (tp->urg_data & URG_VALID) {
+ char c = tp->urg_data;
if (!(flags & MSG_PEEK))
- sk->urg_data = URG_READ;
-
- if(len>0)
- {
- err = memcpy_toiovec(msg->msg_iov, &c, 1);
- msg->msg_flags|=MSG_OOB;
- }
- else
- msg->msg_flags|=MSG_TRUNC;
+ tp->urg_data = URG_READ;
if(msg->msg_name)
tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
@@ -1023,6 +986,15 @@ static int tcp_recv_urg(struct sock * sk, int nonblock,
/* Read urgent data. */
msg->msg_flags|=MSG_OOB;
release_sock(sk);
+
+ if(len>0)
+ {
+ err = memcpy_toiovec(msg->msg_iov, &c, 1);
+ msg->msg_flags|=MSG_OOB;
+ }
+ else
+ msg->msg_flags|=MSG_TRUNC;
+
return err ? -EFAULT : 1;
}
release_sock(sk);
@@ -1044,45 +1016,37 @@ static int tcp_recv_urg(struct sock * sk, int nonblock,
static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
{
- sk->tp_pinfo.af_tcp.delayed_acks++;
-
__skb_unlink(skb, &sk->receive_queue);
kfree_skb(skb);
}
-
-static void cleanup_rbuf(struct sock *sk)
+/* Clean up the receive buffer for full frames taken by the user,
+ * then send an ACK if necessary. COPIED is the number of bytes
+ * tcp_recvmsg has given to the user so far; it speeds up the
+ * calculation of whether or not we must ACK for the sake of
+ * a window update.
+ */
+static void cleanup_rbuf(struct sock *sk, int copied)
{
struct sk_buff *skb;
- struct tcp_opt *tp;
/* NOTE! The socket must be locked, so that we don't get
* a messed-up receive queue.
*/
while ((skb=skb_peek(&sk->receive_queue)) != NULL) {
- if (!skb->used || atomic_read(&skb->users)>1)
+ if (!skb->used || atomic_read(&skb->users) > 1)
break;
tcp_eat_skb(sk, skb);
}
SOCK_DEBUG(sk, "sk->rspace = %lu\n", sock_rspace(sk));
- tp = &(sk->tp_pinfo.af_tcp);
-
- /* We send a ACK if the sender is blocked
- * else let tcp_data deal with the acking policy.
+ /* We send an ACK if we can now advertise a non-zero window
+ * which has been raised "significantly".
*/
- if (tp->delayed_acks) {
- __u32 rcv_wnd;
-
- /* FIXME: double check this rule, then check against
- * other use of similar rules. Abtract if possible.
- */
- rcv_wnd = tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup);
-
- if ((rcv_wnd < sk->mss) && (sock_rspace(sk) > rcv_wnd))
- tcp_read_wakeup(sk);
- }
+ if((copied > 0) &&
+ (copied >= tcp_receive_window(&sk->tp_pinfo.af_tcp)))
+ tcp_read_wakeup(sk);
}
@@ -1100,7 +1064,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
volatile u32 *seq; /* So gcc doesn't overoptimise */
unsigned long used;
int err = 0;
- int target = 1; /* Read at least this may bytes */
+ int target = 1; /* Read at least this many bytes */
if (sk->state == TCP_LISTEN)
return -ENOTCONN;
@@ -1113,8 +1077,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
* the multi-reader case neatly (memcpy_to/fromfs might be
* inline and thus not flush cached variables otherwise).
*/
- peek_seq = sk->copied_seq;
- seq = &sk->copied_seq;
+ peek_seq = tp->copied_seq;
+ seq = &tp->copied_seq;
if (flags & MSG_PEEK)
seq = &peek_seq;
@@ -1129,7 +1093,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
u32 offset;
/* Are we at urgent data? Stop if we have read anything. */
- if (copied && sk->urg_data && sk->urg_seq == *seq)
+ if (copied && tp->urg_data && tp->urg_seq == *seq)
break;
/* We need to check signals first, to get correct SIGURG
@@ -1200,7 +1164,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
break;
}
- cleanup_rbuf(sk);
+ cleanup_rbuf(sk, copied);
release_sock(sk);
sk->socket->flags |= SO_WAITDATA;
schedule();
@@ -1222,8 +1186,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
used = len;
/* Do we have urgent data here? */
- if (sk->urg_data) {
- u32 urg_offset = sk->urg_seq - *seq;
+ if (tp->urg_data) {
+ u32 urg_offset = tp->urg_seq - *seq;
if (urg_offset < used) {
if (!urg_offset) {
if (!sk->urginline) {
@@ -1264,8 +1228,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
*/
atomic_dec(&skb->users);
- if (after(sk->copied_seq,sk->urg_seq))
- sk->urg_data = 0;
+ if (after(tp->copied_seq,tp->urg_seq))
+ tp->urg_data = 0;
if (used + offset < skb->len)
continue;
@@ -1303,7 +1267,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
current->state = TASK_RUNNING;
/* Clean up data we have read: This will do ACK frames. */
- cleanup_rbuf(sk);
+ cleanup_rbuf(sk, copied);
release_sock(sk);
return copied;
}
@@ -1356,8 +1320,7 @@ static int tcp_close_state(struct sock *sk, int dead)
* reset mistake.
*/
if(dead && ns==TCP_FIN_WAIT2) {
- int timer_active=del_timer(&sk->timer);
- if(timer_active)
+ if(sk->timer.prev && del_timer(&sk->timer))
add_timer(&sk->timer);
else
tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout);
@@ -1410,6 +1373,7 @@ static inline int closing(struct sock * sk)
void tcp_close(struct sock *sk, unsigned long timeout)
{
struct sk_buff *skb;
+ int data_was_unread = 0;
/* We need to grab some memory, and put together a FIN,
* and then put it into the queue to be sent.
@@ -1421,7 +1385,6 @@ void tcp_close(struct sock *sk, unsigned long timeout)
tcp_close_pending(sk);
release_sock(sk);
sk->dead = 1;
- sk->prot->unhash(sk);
return;
}
@@ -1435,14 +1398,30 @@ void tcp_close(struct sock *sk, unsigned long timeout)
* descriptor close, not protocol-sourced closes, because the
* reader process may not have drained the data yet!
*/
- while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
+ while((skb=skb_dequeue(&sk->receive_queue))!=NULL) {
+ data_was_unread++;
kfree_skb(skb);
+ }
- /* Timeout is not the same thing - however the code likes
- * to send both the same way (sigh).
+ /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
+ * 3.10, we send a RST here because data was lost. To
+ * witness the awful effects of the old behavior of always
+ * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
+ * a bulk GET in an FTP client, suspend the process, wait
+ * for the client to advertise a zero window, then kill -9
+ * the FTP client, wheee... Note: timeout is always zero
+ * in such a case.
*/
- if (tcp_close_state(sk,1)==1)
+ if(data_was_unread != 0) {
+ /* Unread data was tossed, zap the connection. */
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_send_active_reset(sk);
+ } else if (tcp_close_state(sk,1)) {
+ /* We FIN if the application ate all the data before
+ * zapping the connection.
+ */
tcp_send_fin(sk);
+ }
if (timeout) {
struct task_struct *tsk = current;
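The tcp_close() change above alters what a close() with unread receive data puts on the wire. A minimal user-space sketch of the scenario (hypothetical example, not part of this patch; the echo port and address are assumptions and error checking is omitted):

#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in addr;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(7);		/* any local echo server will do */
	addr.sin_addr.s_addr = inet_addr("127.0.0.1");
	connect(fd, (struct sockaddr *) &addr, sizeof(addr));

	write(fd, "x", 1);
	sleep(1);	/* let the echoed byte land in our receive queue */

	/* We close without ever reading the reply, so data_was_unread is
	 * non-zero in tcp_close() and, with this patch, the kernel answers
	 * with a RST rather than starting the FIN handshake.
	 */
	close(fd);
	return 0;
}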
@@ -1470,8 +1449,7 @@ void tcp_close(struct sock *sk, unsigned long timeout)
* we may need to set up a timer.
*/
if (sk->state==TCP_FIN_WAIT2) {
- int timer_active=del_timer(&sk->timer);
- if(timer_active)
+ if(sk->timer.prev && del_timer(&sk->timer))
add_timer(&sk->timer);
else
tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout);
@@ -1479,9 +1457,6 @@ void tcp_close(struct sock *sk, unsigned long timeout)
sk->dead = 1;
release_sock(sk);
-
- if(sk->state == TCP_CLOSE)
- sk->prot->unhash(sk);
}
/*
@@ -1538,13 +1513,12 @@ struct sock *tcp_accept(struct sock *sk, int flags)
/* If this is a non blocking socket don't sleep */
error = EAGAIN;
if (flags & O_NONBLOCK)
- goto out;
+ goto out;
error = ERESTARTSYS;
req = wait_for_connect(sk, &prev);
if (!req)
- goto out;
- error = 0;
+ goto out;
}
tcp_synq_unlink(tp, req, prev);
@@ -1647,9 +1621,23 @@ void tcp_set_keepalive(struct sock *sk, int val)
__initfunc(void tcp_init(void))
{
tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
- sizeof(struct open_request),
+ sizeof(struct open_request),
0, SLAB_HWCACHE_ALIGN,
NULL, NULL);
if(!tcp_openreq_cachep)
panic("tcp_init: Cannot alloc open_request cache.");
+
+ tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
+ sizeof(struct tcp_bind_bucket),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if(!tcp_bucket_cachep)
+ panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
+
+ tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
+ sizeof(struct tcp_tw_bucket),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if(!tcp_timewait_cachep)
+ panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 841359739..4b7dcc9e9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_input.c,v 1.66 1998/01/15 22:40:29 freitag Exp $
+ * Version: $Id: tcp_input.c,v 1.84 1998/03/15 03:23:20 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -67,57 +67,54 @@ static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack,
extern int sysctl_tcp_fin_timeout;
+/* These are on by default so the code paths get tested.
+ * For the final 2.2 this may be undone at our discretion. -DaveM
+ */
+int sysctl_tcp_timestamps = 1;
+int sysctl_tcp_window_scaling = 1;
+
int sysctl_tcp_cong_avoidance;
int sysctl_tcp_hoe_retransmits;
-int sysctl_tcp_sack;
-int sysctl_tcp_tsack;
-int sysctl_tcp_timestamps;
-int sysctl_tcp_window_scaling;
int sysctl_tcp_syncookies = SYNC_INIT;
int sysctl_tcp_stdurg;
+int sysctl_tcp_rfc1337;
static tcp_sys_cong_ctl_t tcp_sys_cong_ctl_f = &tcp_cong_avoid_vanj;
-/*
- * Called each time to estimate the delayed ack timeout. This is
- * how it should be done so a fast link isnt impacted by ack delay.
- *
- * I think we need a medium deviation here also...
- * The estimated value is changing to fast
+/* There is something which you must keep in mind when you analyze the
+ * behavior of the tp->ato delayed ack timeout interval. When a
+ * connection starts up, we want to ack as quickly as possible. The
+ * problem is that "good" TCP's do slow start at the beginning of data
+ * transmission. This means that until we send the first few ACK's the
+ * sender will sit on his end and only queue most of his data, because
+ * he can only send snd_cwnd unacked packets at any given time. For
+ * each ACK we send, he increments snd_cwnd and transmits more of his
+ * queue. -DaveM
*/
-
static void tcp_delack_estimator(struct tcp_opt *tp)
{
- int m;
-
- /* Delayed ACK time estimator. */
-
- m = jiffies - tp->lrcvtime;
-
- tp->lrcvtime = jiffies;
+ if(tp->ato == 0) {
+ tp->lrcvtime = jiffies;
- if (m < 0)
- return;
-
- /* if the mesured value is bigger than
- * twice the round trip time ignore it.
- */
- if ((m << 2) <= tp->srtt) {
- m -= (tp->iat >> 3);
- tp->iat += m;
-
- if (m <0)
- m = -m;
-
- m -= (tp->iat_mdev >> 2);
- tp->iat_mdev += m;
+ /* Help sender leave slow start quickly;
+ * this sets our initial ato value.
+ */
+ tcp_enter_quickack_mode(tp);
+ } else {
+ int m = jiffies - tp->lrcvtime;
- tp->ato = (tp->iat >> 3) + (tp->iat_mdev >> 2);
+ tp->lrcvtime = jiffies;
+ if(m <= 0)
+ m = 1;
+ if(m > tp->rto)
+ tp->ato = tp->rto;
+ else
+ tp->ato = (tp->ato >> 1) + m;
- if (tp->ato < HZ/50)
- tp->ato = HZ/50;
- } else
- tp->ato = 0;
+ /* We are not in "quick ack" mode. */
+ if(tp->ato <= (HZ/100))
+ tp->ato = ((HZ/100)*2);
+ }
}
/* Called to compute a smoothed rtt estimate. The data fed to this
@@ -132,9 +129,9 @@ static void tcp_delack_estimator(struct tcp_opt *tp)
static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
{
- long m;
- /*
- * The following amusing code comes from Jacobson's
+ long m = mrtt; /* RTT */
+
+ /* The following amusing code comes from Jacobson's
* article in SIGCOMM '88. Note that rtt and mdev
* are scaled versions of rtt and mean deviation.
* This is designed to be as fast as possible
@@ -143,12 +140,9 @@ static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
* On a 1990 paper the rto value is changed to:
* RTO = rtt + 4 * mdev
*/
-
- m = mrtt; /* RTT */
-
+ if(m == 0)
+ m = 1;
if (tp->srtt != 0) {
- if(m<=0)
- m=1; /* IS THIS RIGHT FOR <0 ??? */
m -= (tp->srtt >> 3); /* m is now error in rtt est */
tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
if (m < 0)
@@ -202,19 +196,17 @@ extern __inline__ void tcp_replace_ts_recent(struct tcp_opt *tp, __u32 end_seq)
*/
if (!before(end_seq,tp->last_ack_sent)) {
tp->ts_recent = tp->rcv_tsval;
- /* FIXME: need a corse timestamp. Days uptime
- * would be good.
- */
tp->ts_recent_stamp = jiffies;
}
}
+#define PAWS_24DAYS (HZ * 60 * 60 * 24 * 24)
+
extern __inline__ int tcp_paws_discard(struct tcp_opt *tp)
{
- /* FIXME: must check that ts_recent is not
- * more than 24 days old here. Yuck.
- */
- return ((s32)(tp->rcv_tsval-tp->ts_recent) < 0);
+ /* ts_recent must be younger than 24 days */
+ return (((jiffies - tp->ts_recent_stamp) >= PAWS_24DAYS) ||
+ ((s32)(tp->rcv_tsval-tp->ts_recent) < 0));
}
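For reference, the new PAWS_24DAYS bound is plain jiffies arithmetic. A small stand-alone check of the constant, assuming HZ is 100 as on i386 (illustrative only, not part of the patch):

#include <assert.h>

#define HZ		100
#define PAWS_24DAYS	(HZ * 60 * 60 * 24 * 24)

int main(void)
{
	/* 100 * 86400 * 24 = 207,360,000 jiffies, comfortably inside a
	 * signed 32-bit range, so (jiffies - ts_recent_stamp) >= PAWS_24DAYS
	 * is a safe comparison with this HZ value.
	 */
	assert(PAWS_24DAYS == 207360000);
	return 0;
}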
@@ -257,8 +249,6 @@ static void tcp_reset(struct sock *sk, struct sk_buff *skb)
/* We want the right error as BSD sees it (and indeed as we do). */
switch (sk->state) {
- case TCP_TIME_WAIT:
- break;
case TCP_SYN_SENT:
sk->err = ECONNREFUSED;
break;
@@ -268,23 +258,8 @@ static void tcp_reset(struct sock *sk, struct sk_buff *skb)
default:
sk->err = ECONNRESET;
};
-#ifdef CONFIG_TCP_RFC1337
- /*
- * Time wait assassination protection [RFC1337]
- *
- * This is a good idea, but causes more sockets to take time to close.
- *
- * Ian Heavens has since shown this is an inadequate fix for the protocol
- * bug in question.
- */
- if(sk->state!=TCP_TIME_WAIT) {
- tcp_set_state(sk,TCP_CLOSE);
- sk->shutdown = SHUTDOWN_MASK;
- }
-#else
tcp_set_state(sk,TCP_CLOSE);
sk->shutdown = SHUTDOWN_MASK;
-#endif
if (!sk->dead)
sk->state_change(sk);
}
@@ -302,7 +277,6 @@ void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp, int no_fancy)
int length=(th->doff*4)-sizeof(struct tcphdr);
ptr = (unsigned char *)(th + 1);
- tp->sacks = 0;
tp->saw_tstamp = 0;
while(length>0) {
@@ -336,10 +310,6 @@ void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp, int no_fancy)
tp->snd_wscale = *(__u8 *)ptr;
}
break;
- case TCPOPT_SACK_PERM:
- if(opsize==TCPOLEN_SACK_PERM && th->syn)
- if (sysctl_tcp_sack && !no_fancy)
- tp->sack_ok = 1;
case TCPOPT_TIMESTAMP:
if(opsize==TCPOLEN_TIMESTAMP) {
/* Cheaper to set again then to
@@ -353,18 +323,6 @@ void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp, int no_fancy)
}
}
break;
- case TCPOPT_SACK:
- if (no_fancy || !sysctl_tcp_sack)
- break;
- tp->sacks = (opsize-2)>>3;
- if (tp->sacks<<3 == opsize-2) {
- int i;
- for (i = 0; i < tp->sacks; i++) {
- tp->left_sack[i] = ntohl(((__u32 *)ptr)[2*i]);
- tp->right_sack[i] = ntohl(((__u32 *)ptr)[2*i+1]);
- }
- } else
- tp->sacks = 0;
}
ptr+=opsize-2;
length-=opsize;
@@ -374,7 +332,7 @@ void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp, int no_fancy)
/* Fast parse options. This hopes to only see timestamps.
* If it is wrong it falls back on tcp_parse_option().
- * This should probably get extended for timestamps + SACK as well.
+ * This should probably get extended for timestamps as well.
* Assembly code anyone? -- erics
*/
static __inline__ int tcp_fast_parse_options(struct tcphdr *th, struct tcp_opt *tp)
@@ -384,14 +342,12 @@ static __inline__ int tcp_fast_parse_options(struct tcphdr *th, struct tcp_opt *
return 0;
if (th->doff == sizeof(struct tcphdr)>>2) {
tp->saw_tstamp = 0;
- tp->sacks = 0;
return 0;
- } else if (th->doff == (sizeof(struct tcphdr)>>2)+3) {
+ } else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
__u32 *ptr = (__u32 *)(th + 1);
- if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
- | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
+ if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+ | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
tp->saw_tstamp = 1;
- tp->sacks = 0;
tp->rcv_tsval = ntohl(*++ptr);
tp->rcv_tsecr = ntohl(*++ptr);
return 1;
@@ -401,89 +357,6 @@ static __inline__ int tcp_fast_parse_options(struct tcphdr *th, struct tcp_opt *
return 1;
}
-#if 0
-
-/*
- * This is the old fast retransmit code. It will go away eventually. -- erics
- */
-
-/*
- * See draft-stevens-tcpca-spec-01 for documentation.
- */
-
-static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
-{
- struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
-
- /* FIXME: if we are already retransmitting should this code
- * be skipped? [Floyd high_seq check sort of does this]
- * The case I'm worried about is falling into a fast
- * retransmit on a link with a congestion window of 1 or 2.
- * There was some evidence in 2.0.x that this was problem
- * on really slow links (1200 or 2400 baud). I need to
- * try this situation again and see what happens.
- */
-
- /*
- * An ACK is a duplicate if:
- * (1) it has the same sequence number as the largest number we've
- * seen,
- * (2) it has the same window as the last ACK,
- * (3) we have outstanding data that has not been ACKed
- * (4) The packet was not carrying any data.
- * (5) [From Floyds paper on fast retransmit wars]
- * The packet acked data after high_seq;
- */
-
- if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) {
- /* 1. When the third duplicate ack is received, set ssthresh
- * to one half the current congestion window, but no less
- * than two segments. Retransmit the missing segment.
- */
- if (tp->high_seq == 0 || after(ack, tp->high_seq)) {
- tp->dup_acks++;
-
- if (tp->dup_acks == 3) {
- tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2);
- tp->snd_cwnd = tp->snd_ssthresh + 3;
- tcp_do_retransmit(sk, 0);
-
- /* Careful not to timeout just after fast
- * retransmit!
- */
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
- }
- }
-
- /* 2. Each time another duplicate ACK arrives, increment
- * cwnd by the segment size. [...] Transmit a packet...
- *
- * Packet transmission will be done on normal flow processing
- * since we're not in "retransmit mode".
- */
- if (tp->dup_acks >= 3) {
- tp->dup_acks++;
- tp->snd_cwnd++;
- }
- } else {
- /* 3. When the next ACK arrives that acknowledges new data,
- * set cwnd to ssthresh.
- */
- if (tp->dup_acks >= 3) {
- tp->retrans_head = NULL;
- tp->snd_cwnd = max(tp->snd_ssthresh, 1);
- tp->retransmits = 0;
- }
- tp->dup_acks = 0;
-
- /* FIXME: This is wrong if the new ack that arrives
- * is below the value for high_seq.
- */
- tp->high_seq = 0;
- }
-}
-#endif
-
#define FLAG_DATA 0x01
#define FLAG_WIN_UPDATE 0x02
#define FLAG_DATA_ACKED 0x04
@@ -579,9 +452,8 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
* not indicate a packet left the system.
* We can test this by just checking
* if ack changed from snd_una, since
- * the only way to get here without changing
- * advancing from snd_una is if this was a
- * window update.
+ * the only way to get here without advancing
+ * from snd_una is if this was a window update.
*/
if (ack != tp->snd_una && before(ack,tp->high_seq)) {
tcp_do_retransmit(sk, 0);
@@ -596,9 +468,6 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
clear_fast_retransmit(sk);
}
}
- } else {
- /* Clear any aborted fast retransmit starts. */
- tp->dup_acks = 0;
}
}
@@ -649,7 +518,6 @@ static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack,
expected = (tp->snd_nxt - tp->snd_una) * inv_basertt;
- /* XXX sk->mss should move into tcp_opt as well -DaveM */
inv_basebd = sk->mss * inv_basertt;
/* Slow Start */
@@ -731,13 +599,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq,
int acked = 0;
while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) {
-#ifdef TCP_DEBUG
- /* Check for a bug. */
- if (skb->next != (struct sk_buff*) &sk->write_queue &&
- after(skb->end_seq, skb->next->seq))
- printk(KERN_DEBUG "INET: tcp_input.c: *** "
- "bug send_list out of order.\n");
-#endif
/* If our packet is before the ack sequence we can
* discard it as it's confirmed to have arrived the
* other end.
@@ -745,12 +606,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq,
if (after(skb->end_seq, ack))
break;
-#if 0
- SOCK_DEBUG(sk, "removing seg %x-%x from retransmit queue\n",
- skb->seq, skb->end_seq);
-#endif
-
- acked = FLAG_DATA_ACKED;
+ /* Initial outgoing SYN's get put onto the write_queue
+ * just like anything else we transmit. It is not
+ * true data, and if we misinform our callers that
+ * this ACK acks real data, we will erroneously exit
+ * connection startup slow start one packet too
+ * quickly. This is severely frowned upon behavior.
+ */
+ if(!skb->h.th->syn)
+ acked = FLAG_DATA_ACKED;
/* FIXME: packet counting may break if we have to
* do packet "repackaging" for stacks that don't
@@ -766,11 +630,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq,
kfree_skb(skb);
}
- if (acked) {
+ if (acked)
tp->retrans_head = NULL;
- if (!sk->dead)
- sk->write_space(sk);
- }
+
return acked;
}
@@ -795,6 +657,66 @@ static void tcp_ack_probe(struct sock *sk, __u32 ack)
}
}
+/* Read draft-ietf-tcplw-high-performance before mucking
+ * with this code. (Superceeds RFC1323)
+ */
+static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp,
+ u32 seq, u32 ack, int flag)
+{
+ __u32 seq_rtt = (jiffies-tp->rcv_tsecr);
+ tcp_rtt_estimator(tp, seq_rtt);
+ if (tp->retransmits) {
+ if (tp->packets_out == 0) {
+ tp->retransmits = 0;
+ tp->backoff = 0;
+ tcp_set_rto(tp);
+ } else {
+ /* Still retransmitting, use backoff */
+ tcp_set_rto(tp);
+ tp->rto = tp->rto << tp->backoff;
+ }
+ } else {
+ tcp_set_rto(tp);
+ if (flag & FLAG_DATA_ACKED)
+ (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt);
+ }
+ /* NOTE: safe here so long as cong_ctl doesn't use rto */
+ tcp_bound_rto(tp);
+}
+
+static void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
+{
+ struct sk_buff *skb;
+ long when;
+
+ skb = skb_peek(&sk->write_queue);
+ when = tp->rto - (jiffies - skb->when);
+
+ /* FIXME: This assumes that when we are retransmitting
+ * we should only ever respond with one packet.
+ * This means congestion windows should not grow
+ * during recovery. In 2.0.X we allow the congestion
+ * window to grow. It is not clear to me which
+ * decision is correct. The RFCs should be double
+ * checked as should the behavior of other stacks.
+ * Also note that if we do want to allow the
+ * congestion window to grow during retransmits
+ * we have to fix the call to congestion window
+ * updates so that it works during retransmission.
+ */
+ if (tp->retransmits) {
+ tp->retrans_head = NULL;
+
+ /* This is tricky. We are retransmitting a
+ * segment of a window when congestion occurred.
+ */
+ tcp_do_retransmit(sk, 0);
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ } else {
+ tcp_reset_xmit_timer(sk, TIME_RETRANS, when);
+ }
+}
+
/*
* This routine deals with incoming acks, but not outgoing ones.
*/
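tcp_ack_saw_tstamp() above feeds (jiffies - rcv_tsecr) straight into the Jacobson estimator whose fragments appear in the earlier hunks of this file. A stand-alone sketch of that scaled update, under the same srtt*8 / mdev*4 fixed-point convention the comments describe (illustrative only, not the kernel code itself):

#include <stdio.h>

/* Sketch of a Jacobson RTT estimator with the scaling used here:
 * srtt is kept scaled by 8, mdev by 4, everything in jiffies.
 */
struct rtt_state {
	long srtt;	/* smoothed RTT, scaled by 8 */
	long mdev;	/* mean deviation, scaled by 4 */
	long rto;	/* retransmit timeout, jiffies */
};

void rtt_sample(struct rtt_state *s, long m)	/* m = measured RTT */
{
	if (m == 0)
		m = 1;
	if (s->srtt != 0) {
		m -= (s->srtt >> 3);	/* error in the estimate */
		s->srtt += m;		/* srtt = 7/8 srtt + 1/8 new */
		if (m < 0)
			m = -m;
		m -= (s->mdev >> 2);
		s->mdev += m;		/* mdev = 3/4 mdev + 1/4 |err| */
	} else {
		s->srtt = m << 3;	/* first sample seeds the estimate */
		s->mdev = m << 2;
	}
	s->rto = (s->srtt >> 3) + s->mdev;	/* "RTO = rtt + 4 * mdev" */
}

int main(void)
{
	struct rtt_state s = { 0, 0, 0 };
	long samples[] = { 30, 32, 28, 120, 31 };	/* jiffies, made up */
	int i;

	for (i = 0; i < 5; i++) {
		rtt_sample(&s, samples[i]);
		printf("sample %ld -> rto %ld\n", samples[i], s.rto);
	}
	return 0;
}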
@@ -806,7 +728,6 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th,
int flag = 0;
u32 seq = 0;
u32 seq_rtt = 0;
- struct sk_buff *skb;
if(sk->zapped)
return(1); /* Dead, can't ack any more so why bother */
@@ -838,7 +759,7 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th,
*/
if (before(tp->snd_wl1, ack_seq) ||
(tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) {
- unsigned long nwin = ntohs(th->window) << tp->snd_wscale;
+ u32 nwin = ntohs(th->window) << tp->snd_wscale;
if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) {
flag |= FLAG_WIN_UPDATE;
@@ -869,28 +790,7 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th,
/* If we have a timestamp, we always do rtt estimates. */
if (tp->saw_tstamp) {
- /* Read draft-ietf-tcplw-high-performance before mucking
- * with this code. (Superceeds RFC1323)
- */
- seq_rtt = (jiffies-tp->rcv_tsecr);
- tcp_rtt_estimator(tp, seq_rtt);
- if (tp->retransmits) {
- if (tp->packets_out == 0) {
- tp->retransmits = 0;
- tp->backoff = 0;
- tcp_set_rto(tp);
- } else {
- /* Still retransmitting, use backoff */
- tcp_set_rto(tp);
- tp->rto = tp->rto << tp->backoff;
- }
- } else {
- tcp_set_rto(tp);
- if (flag & FLAG_DATA_ACKED)
- (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt);
- }
- /* NOTE: safe here so long as cong_ctl doesn't use rto */
- tcp_bound_rto(tp);
+ tcp_ack_saw_tstamp(sk, tp, seq, ack, flag);
} else {
/* If we were retransmiting don't count rtt estimate. */
if (tp->retransmits) {
@@ -916,51 +816,217 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th,
}
if (tp->packets_out) {
- if (flag & FLAG_DATA_ACKED) {
- long when;
-
- skb = skb_peek(&sk->write_queue);
- when = tp->rto - (jiffies - skb->when);
-
- /* FIXME: This assumes that when we are retransmitting
- * we should only ever respond with one packet.
- * This means congestion windows should not grow
- * during recovery. In 2.0.X we allow the congestion
- * window to grow. It is not clear to me which
- * decision is correct. The RFCs should be double
- * checked as should the behavior of other stacks.
- * Also note that if we do want to allow the
- * congestion window to grow during retransmits
- * we have to fix the call to congestion window
- * updates so that it works during retransmission.
- */
- if (tp->retransmits) {
- tp->retrans_head = NULL;
-
- /* This is tricky. We are retransmiting a
- * segment of a window when congestion occured.
- */
- tcp_do_retransmit(sk, 0);
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
- } else
- tcp_reset_xmit_timer(sk, TIME_RETRANS, when);
- }
- } else
+ if (flag & FLAG_DATA_ACKED)
+ tcp_ack_packets_out(sk, tp);
+ } else {
tcp_clear_xmit_timer(sk, TIME_RETRANS);
+ }
- tcp_fast_retrans(sk, ack, (flag & (FLAG_DATA|FLAG_WIN_UPDATE)));
-
+ flag &= (FLAG_DATA | FLAG_WIN_UPDATE);
+ if ((ack == tp->snd_una && tp->packets_out && flag == 0) ||
+ (tp->high_seq != 0)) {
+ tcp_fast_retrans(sk, ack, flag);
+ } else {
+ /* Clear any aborted fast retransmit starts. */
+ tp->dup_acks = 0;
+ }
/* Remember the highest ack received. */
tp->snd_una = ack;
-
return 1;
uninteresting_ack:
-
SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt);
return 0;
}
+/* New-style handling of TIME_WAIT sockets. */
+static void tcp_timewait_kill(unsigned long __arg)
+{
+ struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)__arg;
+
+ /* Zap the timer. */
+ del_timer(&tw->timer);
+
+ /* Unlink from various places. */
+ if(tw->bind_next)
+ tw->bind_next->bind_pprev = tw->bind_pprev;
+ *(tw->bind_pprev) = tw->bind_next;
+ if(tw->tb->owners == NULL)
+ tcp_inc_slow_timer(TCP_SLT_BUCKETGC);
+
+ if(tw->next)
+ tw->next->pprev = tw->pprev;
+ *tw->pprev = tw->next;
+
+ /* We decremented the prot->inuse count when we entered TIME_WAIT
+ * and the sock from which this came was destroyed.
+ */
+ tw->sklist_next->sklist_prev = tw->sklist_prev;
+ tw->sklist_prev->sklist_next = tw->sklist_next;
+
+ /* Ok, now free it up. */
+ kmem_cache_free(tcp_timewait_cachep, tw);
+}
+
+/* We come here as a special case from the AF specific TCP input processing,
+ * and the SKB has no owner. Essentially handling this is very simple,
+ * we just keep silently eating rx'd packets until none show up for the
+ * entire timeout period. The only special cases are for BSD TIME_WAIT
+ * reconnects and SYN/RST bits being set in the TCP header.
+ */
+int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
+ struct tcphdr *th, void *opt, __u16 len)
+{
+ /* RFC 1122:
+ * "When a connection is [...] on TIME-WAIT state [...]
+ * [a TCP] MAY accept a new SYN from the remote TCP to
+ * reopen the connection directly, if it:
+ *
+ * (1) assigns its initial sequence number for the new
+ * connection to be larger than the largest sequence
+ * number it used on the previous connection incarnation,
+ * and
+ *
+ * (2) returns to TIME-WAIT state if the SYN turns out
+ * to be an old duplicate".
+ */
+ if(th->syn && !th->rst && after(skb->seq, tw->rcv_nxt)) {
+ struct sock *sk;
+ struct tcp_func *af_specific = tw->af_specific;
+ __u32 isn;
+
+ isn = tw->rcv_nxt + 128000;
+ if(isn == 0)
+ isn++;
+ tcp_timewait_kill((unsigned long)tw);
+ sk = af_specific->get_sock(skb, th);
+ if(sk == NULL || !ipsec_sk_policy(sk,skb))
+ return 0;
+ skb_set_owner_r(skb, sk);
+ af_specific = sk->tp_pinfo.af_tcp.af_specific;
+ if(af_specific->conn_request(sk, skb, opt, isn) < 0)
+ return 1; /* Toss a reset back. */
+ return 0; /* Discard the frame. */
+ }
+
+ /* Check RST or SYN */
+ if(th->rst || th->syn) {
+ /* This is TIME_WAIT assassination, in two flavors.
+ * Oh well... nobody has a sufficient solution to this
+ * protocol bug yet.
+ */
+ if(sysctl_tcp_rfc1337 == 0)
+ tcp_timewait_kill((unsigned long)tw);
+
+ if(!th->rst)
+ return 1; /* toss a reset back */
+ } else {
+ if(th->ack) {
+ /* In this case we must reset the TIMEWAIT timer. */
+ del_timer(&tw->timer);
+ tw->timer.expires = jiffies + TCP_TIMEWAIT_LEN;
+ add_timer(&tw->timer);
+ }
+ }
+ return 0; /* Discard the frame. */
+}
+
+/* Enter the time wait state. This is always called from BH
+ * context. Essentially we whip up a timewait bucket, copy the
+ * relevant info into it from the SK, and mess with hash chains
+ * and list linkage.
+ */
+static __inline__ void tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
+{
+ struct sock **head, *sktw;
+
+ /* Step 1: Remove SK from established hash. */
+ if(sk->next)
+ sk->next->pprev = sk->pprev;
+ *sk->pprev = sk->next;
+ sk->pprev = NULL;
+ tcp_reg_zap(sk);
+
+ /* Step 2: Put TW into bind hash where SK was. */
+ tw->tb = (struct tcp_bind_bucket *)sk->prev;
+ if((tw->bind_next = sk->bind_next) != NULL)
+ sk->bind_next->bind_pprev = &tw->bind_next;
+ tw->bind_pprev = sk->bind_pprev;
+ *sk->bind_pprev = (struct sock *)tw;
+
+ /* Step 3: Same for the protocol sklist. */
+ (tw->sklist_next = sk->sklist_next)->sklist_prev = (struct sock *)tw;
+ (tw->sklist_prev = sk->sklist_prev)->sklist_next = (struct sock *)tw;
+ sk->sklist_next = NULL;
+ sk->prot->inuse--;
+
+ /* Step 4: Hash TW into TIMEWAIT half of established hash table. */
+ head = &tcp_established_hash[sk->hashent + (TCP_HTABLE_SIZE/2)];
+ sktw = (struct sock *)tw;
+ if((sktw->next = *head) != NULL)
+ (*head)->pprev = &sktw->next;
+ *head = sktw;
+ sktw->pprev = head;
+}
+
+void tcp_time_wait(struct sock *sk)
+{
+ struct tcp_tw_bucket *tw;
+
+ tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
+ if(tw != NULL) {
+ /* Give us an identity. */
+ tw->daddr = sk->daddr;
+ tw->rcv_saddr = sk->rcv_saddr;
+ tw->bound_dev_if= sk->bound_dev_if;
+ tw->num = sk->num;
+ tw->state = TCP_TIME_WAIT;
+ tw->family = sk->family;
+ tw->source = sk->dummy_th.source;
+ tw->dest = sk->dummy_th.dest;
+ tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt;
+ tw->af_specific = sk->tp_pinfo.af_tcp.af_specific;
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ if(tw->family == AF_INET6) {
+ memcpy(&tw->v6_daddr,
+ &sk->net_pinfo.af_inet6.daddr,
+ sizeof(struct in6_addr));
+ memcpy(&tw->v6_rcv_saddr,
+ &sk->net_pinfo.af_inet6.rcv_saddr,
+ sizeof(struct in6_addr));
+ }
+#endif
+ /* Linkage updates. */
+ tcp_tw_hashdance(sk, tw);
+
+ /* Get the TIME_WAIT timeout firing. */
+ init_timer(&tw->timer);
+ tw->timer.function = tcp_timewait_kill;
+ tw->timer.data = (unsigned long) tw;
+ tw->timer.expires = jiffies + TCP_TIMEWAIT_LEN;
+ add_timer(&tw->timer);
+
+ /* CLOSE the SK. */
+ if(sk->state == TCP_ESTABLISHED)
+ tcp_statistics.TcpCurrEstab--;
+ sk->state = TCP_CLOSE;
+ net_reset_timer(sk, TIME_DONE,
+ min(sk->tp_pinfo.af_tcp.srtt * 2, TCP_DONE_TIME));
+ } else {
+ /* Sorry, we're out of memory, just CLOSE this
+ * socket up. We've got bigger problems than
+ * non-graceful socket closings.
+ */
+ tcp_set_state(sk, TCP_CLOSE);
+ }
+
+ /* Prevent rcvmsg/sndmsg calls, and wake people up. */
+ sk->shutdown = SHUTDOWN_MASK;
+ if(!sk->dead)
+ sk->state_change(sk);
+}
+
/*
* Process the FIN bit. This now behaves as it is supposed to work
* and the FIN takes effect when it is validly part of sequence
@@ -976,17 +1042,9 @@ uninteresting_ack:
* If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
*/
-static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
+static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
{
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
-
- if(sk->state == TCP_SYN_SENT) {
- /* RFC793 says to drop the segment and return. */
- return 1;
- }
-
- /* XXX This fin_seq thing should disappear... -DaveM */
- tp->fin_seq = skb->end_seq;
+ sk->tp_pinfo.af_tcp.fin_seq = skb->end_seq;
tcp_send_ack(sk);
@@ -1013,12 +1071,6 @@ static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
case TCP_LAST_ACK:
/* RFC793: Remain in the LAST-ACK state. */
break;
- case TCP_TIME_WAIT:
- /* Received a retransmission of the FIN,
- * restart the TIME_WAIT timer.
- */
- tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
- break;
case TCP_FIN_WAIT1:
/* This case occurs when a simultaneous close
@@ -1035,21 +1087,15 @@ static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
break;
case TCP_FIN_WAIT2:
/* Received a FIN -- send ACK and enter TIME_WAIT. */
- tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
- sk->shutdown |= SHUTDOWN_MASK;
- tcp_set_state(sk,TCP_TIME_WAIT);
- break;
- case TCP_CLOSE:
- /* Already in CLOSE. */
+ tcp_time_wait(sk);
break;
default:
- /* Only TCP_LISTEN is left, in that case we should never
- * reach this piece of code.
+ /* Only TCP_LISTEN and TCP_CLOSE are left, in these
+ * cases we should never reach this piece of code.
*/
printk("tcp_fin: Impossible, sk->state=%d\n", sk->state);
break;
};
- return 0;
}
/* This one checks to see if we can put data from the
@@ -1060,7 +1106,7 @@ static void tcp_ofo_queue(struct sock *sk)
struct sk_buff *skb;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- while ((skb = skb_peek(&sk->out_of_order_queue))) {
+ while ((skb = skb_peek(&tp->out_of_order_queue))) {
if (after(skb->seq, tp->rcv_nxt))
break;
@@ -1076,6 +1122,8 @@ static void tcp_ofo_queue(struct sock *sk)
skb_unlink(skb);
skb_queue_tail(&sk->receive_queue, skb);
tp->rcv_nxt = skb->end_seq;
+ if(skb->h.th->fin)
+ tcp_fin(skb, sk, skb->h.th);
}
}
@@ -1094,8 +1142,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
dst_confirm(sk->dst_cache);
skb_queue_tail(&sk->receive_queue, skb);
tp->rcv_nxt = skb->end_seq;
+ if(skb->h.th->fin)
+ tcp_fin(skb, sk, skb->h.th);
+ else
+ tp->delayed_acks++;
tcp_ofo_queue(sk);
- if (skb_queue_len(&sk->out_of_order_queue) == 0)
+ if (skb_queue_len(&tp->out_of_order_queue) == 0)
tp->pred_flags = htonl((0x5010 << 16) | tp->snd_wnd);
return;
}
@@ -1104,8 +1156,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (!after(skb->end_seq, tp->rcv_nxt)) {
/* A retransmit, 2nd most common case. Force an imediate ack. */
SOCK_DEBUG(sk, "retransmit received: seq %X\n", skb->seq);
-
- tp->delayed_acks = MAX_DELAY_ACK;
+ tcp_enter_quickack_mode(tp);
kfree_skb(skb);
return;
}
@@ -1119,7 +1170,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
}
/* Ok. This is an out_of_order segment, force an ack. */
- tp->delayed_acks = MAX_DELAY_ACK;
+ tp->delayed_acks++;
+ tcp_enter_quickack_mode(tp);
/* Disable header predition. */
tp->pred_flags = 0;
@@ -1127,10 +1179,10 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
tp->rcv_nxt, skb->seq, skb->end_seq);
- if (skb_peek(&sk->out_of_order_queue) == NULL) {
- skb_queue_head(&sk->out_of_order_queue,skb);
+ if (skb_peek(&tp->out_of_order_queue) == NULL) {
+ skb_queue_head(&tp->out_of_order_queue,skb);
} else {
- for(skb1=sk->out_of_order_queue.prev; ; skb1 = skb1->prev) {
+ for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) {
/* Already there. */
if (skb->seq == skb1->seq && skb->len >= skb1->len) {
skb_append(skb1, skb);
@@ -1145,8 +1197,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
}
/* See if we've hit the start. If so insert. */
- if (skb1 == skb_peek(&sk->out_of_order_queue)) {
- skb_queue_head(&sk->out_of_order_queue,skb);
+ if (skb1 == skb_peek(&tp->out_of_order_queue)) {
+ skb_queue_head(&tp->out_of_order_queue,skb);
break;
}
}
@@ -1172,23 +1224,17 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
if (skb->len == 0 && !th->fin)
return(0);
- /* FIXME: don't accept data after the received fin.
- *
- * Would checking snd_seq against fin_seq be enough?
- * If so, how do we handle that case exactly? -DaveM
- */
-
/* We no longer have anyone receiving data on this connection. */
tcp_data_queue(sk, skb);
- if (before(tp->rcv_nxt, sk->copied_seq)) {
+ if (before(tp->rcv_nxt, tp->copied_seq)) {
printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n");
- tp->rcv_nxt = sk->copied_seq;
+ tp->rcv_nxt = tp->copied_seq;
}
- tp->delayed_acks++;
-
- /* Now tell the user we may have some data. */
+ /* Above, tcp_data_queue() increments delayed_acks appropriately.
+ * Now tell the user we may have some data.
+ */
if (!sk->dead) {
SOCK_DEBUG(sk, "Data wakeup.\n");
sk->data_ready(sk,0);
@@ -1204,23 +1250,10 @@ static void tcp_data_snd_check(struct sock *sk)
if ((skb = tp->send_head)) {
if (!after(skb->end_seq, tp->snd_una + tp->snd_wnd) &&
tp->packets_out < tp->snd_cwnd ) {
- /* Add more data to the send queue. */
-
- /* FIXME: the congestion window is checked
- * again in tcp_write_xmit anyway?! -- erics
- *
- * I think it must, it bumps tp->packets_out for
- * each packet it fires onto the wire. -DaveM
- */
+ /* Put more data onto the wire. */
tcp_write_xmit(sk);
- if(!sk->dead)
- sk->write_space(sk);
} else if (tp->packets_out == 0 && !tp->pending) {
- /* Data to queue but no room. */
-
- /* FIXME: Is it right to do a zero window probe into
- * a congestion window limited window??? -- erics
- */
+ /* Start probing the receiver's window. */
tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
}
}
@@ -1240,12 +1273,24 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk)
* - delay time <= 0.5 HZ
* - we don't have a window update to send
* - must send at least every 2 full sized packets
+ *
+ * With an extra heuristic to handle loss of packet
+ * situations and also helping the sender leave slow
+ * start in an expedient manner.
*/
- if (tp->delayed_acks >= MAX_DELAY_ACK || tcp_raise_window(sk))
+ /* Two full frames received or... */
+ if (((tp->rcv_nxt - tp->rcv_wup) >= (sk->mss << 1)) ||
+ /* We will update the window "significantly" or... */
+ tcp_raise_window(sk) ||
+ /* We entered "quick ACK" mode */
+ tcp_in_quickack_mode(tp)) {
+ /* Then ack it now */
tcp_send_ack(sk);
- else
- tcp_send_delayed_ack(sk, HZ/2);
+ } else {
+ /* Else, send delayed ack. */
+ tcp_send_delayed_ack(tp, HZ/2);
+ }
}
static __inline__ void tcp_ack_snd_check(struct sock *sk)
@@ -1279,11 +1324,11 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
ptr += ntohl(th->seq);
/* Ignore urgent data that we've already seen and read. */
- if (after(sk->copied_seq, ptr))
+ if (after(tp->copied_seq, ptr))
return;
/* Do we already have a newer (or duplicate) urgent pointer? */
- if (sk->urg_data && !after(ptr, sk->urg_seq))
+ if (tp->urg_data && !after(ptr, tp->urg_seq))
return;
/* Tell the world about our new urgent pointer. */
@@ -1296,14 +1341,14 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
/* We may be adding urgent data when the last byte read was
* urgent. To do this requires some care. We cannot just ignore
- * sk->copied_seq since we would read the last urgent byte again
+ * tp->copied_seq since we would read the last urgent byte again
* as data, nor can we alter copied_seq until this data arrives
* or we break the sematics of SIOCATMARK (and thus sockatmark())
*/
- if (sk->urg_seq == sk->copied_seq)
- sk->copied_seq++; /* Move the copied sequence on correctly */
- sk->urg_data = URG_NOTYET;
- sk->urg_seq = ptr;
+ if (tp->urg_seq == tp->copied_seq)
+ tp->copied_seq++; /* Move the copied sequence on correctly */
+ tp->urg_data = URG_NOTYET;
+ tp->urg_seq = ptr;
/* Disable header prediction. */
tp->pred_flags = 0;
@@ -1312,17 +1357,19 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
/* This is the 'fast' part of urgent handling. */
static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len)
{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
/* Check if we get a new urgent pointer - normally not. */
if (th->urg)
tcp_check_urg(sk,th);
/* Do we wait for any urgent data? - normally not... */
- if (sk->urg_data == URG_NOTYET) {
- u32 ptr = sk->urg_seq - ntohl(th->seq) + (th->doff*4);
+ if (tp->urg_data == URG_NOTYET) {
+ u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4);
/* Is the urgent pointer pointing into this packet? */
if (ptr < len) {
- sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
+ tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th);
if (!sk->dead)
sk->data_ready(sk,0);
}
@@ -1335,33 +1382,39 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len
*/
static void prune_queue(struct sock *sk)
{
- struct tcp_opt *tp;
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
struct sk_buff * skb;
- SOCK_DEBUG(sk, "prune_queue: c=%x\n", sk->copied_seq);
+ SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
/* First Clean the out_of_order queue. */
/* Start with the end because there are probably the least
* useful packets (crossing fingers).
*/
- while ((skb = skb_dequeue_tail(&sk->out_of_order_queue))) {
+ while ((skb = skb_dequeue_tail(&tp->out_of_order_queue))) {
kfree_skb(skb);
if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf)
return;
}
- tp = &sk->tp_pinfo.af_tcp;
-
/* Now continue with the receive queue if it wasn't enough */
while ((skb = skb_peek_tail(&sk->receive_queue))) {
+ /* Never toss anything when we've seen the FIN.
+ * It's just too complex to recover from it.
+ */
+ if(skb->h.th->fin)
+ break;
+
/* Never remove packets that have been already acked */
if (before(skb->end_seq, tp->last_ack_sent+1)) {
printk(KERN_DEBUG "prune_queue: hit acked data c=%x,%x,%x\n",
- sk->copied_seq, skb->end_seq, tp->last_ack_sent);
+ tp->copied_seq, skb->end_seq, tp->last_ack_sent);
break;
}
skb_unlink(skb);
tp->rcv_nxt = skb->seq;
+ SOCK_DEBUG(sk, "prune_queue: removing %x-%x (c=%x)\n",
+ skb->seq, skb->end_seq, tp->copied_seq);
kfree_skb(skb);
if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf)
break;
@@ -1429,7 +1482,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
}
} else if (skb->ack_seq == tp->snd_una) {
/* Bulk data transfer: receiver */
-
if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf)
goto discard;
@@ -1441,18 +1493,13 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
skb_queue_tail(&sk->receive_queue, skb);
tp->rcv_nxt = skb->end_seq;
+ /* FIN bit check is not done since if FIN is set in
+ * this frame, the pred_flags won't match up. -DaveM
+ */
sk->data_ready(sk, 0);
tcp_delack_estimator(tp);
-
-#if 1 /* This checks for required window updates too. */
tp->delayed_acks++;
__tcp_ack_snd_check(sk);
-#else
- if (tp->delayed_acks++ == 0)
- tcp_send_delayed_ack(sk, HZ/2);
- else
- tcp_send_ack(sk);
-#endif
return 0;
}
}
@@ -1469,7 +1516,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
}
}
- if(th->syn && skb->seq != sk->syn_seq) {
+ if(th->syn && skb->seq != tp->syn_seq) {
SOCK_DEBUG(sk, "syn in established state\n");
tcp_statistics.TcpInErrs++;
tcp_reset(sk, skb);
@@ -1490,10 +1537,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
/* step 7: process the segment text */
queued = tcp_data(skb, sk, len);
- /* step 8: check the FIN bit */
- if (th->fin)
- (void) tcp_fin(skb, sk, th);
-
tcp_data_snd_check(sk);
/* If our receive queue has grown past its limits shrink it */
@@ -1657,19 +1700,19 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tp->snd_wnd = htons(th->window) << tp->snd_wscale;
tp->snd_wl1 = skb->seq;
tp->snd_wl2 = skb->ack_seq;
-
tp->fin_seq = skb->seq;
tcp_set_state(sk, TCP_ESTABLISHED);
tcp_parse_options(th,tp,0);
- /* FIXME: need to make room for SACK still */
+
if (tp->wscale_ok == 0) {
tp->snd_wscale = tp->rcv_wscale = 0;
tp->window_clamp = min(tp->window_clamp,65535);
}
if (tp->tstamp_ok) {
- tp->tcp_header_len = sizeof(struct tcphdr) + 12; /* FIXME: Define constant! */
- sk->dummy_th.doff += 3; /* reserve space of options */
+ tp->tcp_header_len =
+ sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ sk->dummy_th.doff += (TCPOLEN_TSTAMP_ALIGNED >> 2);
} else
tp->tcp_header_len = sizeof(struct tcphdr);
if (tp->saw_tstamp) {
@@ -1680,14 +1723,30 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* Can't be earlier, doff would be wrong. */
tcp_send_ack(sk);
- if (tp->in_mss)
- sk->mss = min(sk->mss, tp->in_mss);
-
- /* Take out space for tcp options. */
- sk->mss -= tp->tcp_header_len - sizeof(struct tcphdr);
+ /* Check for the case where we tried to advertise
+ * a window including timestamp options, but did not
+ * end up using them for this connection.
+ */
+ if((tp->tstamp_ok == 0) && sysctl_tcp_timestamps)
+ sk->mss += TCPOLEN_TSTAMP_ALIGNED;
+ /* Now limit it if the other end negotiated a smaller
+ * value.
+ */
+ if (tp->in_mss) {
+ int real_mss = tp->in_mss;
+
+ /* We store MSS locally with the timestamp bytes
+ * subtracted, TCPs advertise it with them
+ * included. Account for this fact.
+ */
+ if(tp->tstamp_ok)
+ real_mss -= TCPOLEN_TSTAMP_ALIGNED;
+ sk->mss = min(sk->mss, real_mss);
+ }
+
sk->dummy_th.dest = th->source;
- sk->copied_seq = tp->rcv_nxt;
+ tp->copied_seq = tp->rcv_nxt;
if(!sk->dead) {
sk->state_change(sk);
@@ -1722,52 +1781,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
}
break;
-
- case TCP_TIME_WAIT:
- /* RFC 1122:
- * "When a connection is [...] on TIME-WAIT state [...]
- * [a TCP] MAY accept a new SYN from the remote TCP to
- * reopen the connection directly, if it:
- *
- * (1) assigns its initial sequence number for the new
- * connection to be larger than the largest sequence
- * number it used on the previous connection incarnation,
- * and
- *
- * (2) returns to TIME-WAIT state if the SYN turns out
- * to be an old duplicate".
- */
- if (th->syn && !th->rst && after(skb->seq, tp->rcv_nxt)) {
- __u32 isn;
-
- skb_orphan(skb);
- sk->err = ECONNRESET;
- tcp_set_state(sk, TCP_CLOSE);
- sk->shutdown = SHUTDOWN_MASK;
-
- isn = tp->rcv_nxt + 128000;
- if (isn == 0)
- isn++;
-
- sk = tp->af_specific->get_sock(skb, th);
-
- if (sk == NULL || !ipsec_sk_policy(sk,skb))
- goto discard;
-
- skb_set_owner_r(skb, sk);
- tp = &sk->tp_pinfo.af_tcp;
-
- if(tp->af_specific->conn_request(sk, skb, opt, isn) < 0)
- return 1;
-
- goto discard;
- }
-
- break;
}
/* Parse the tcp_options present on this header.
- * By this point we really only expect timestamps and SACKs.
+ * By this point we really only expect timestamps.
* Note that this really has to be here and not later for PAWS
* (RFC1323) to work.
*/
@@ -1819,7 +1836,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* original syn.
*/
- if (th->syn && skb->seq!=sk->syn_seq) {
+ if (th->syn && skb->seq!=tp->syn_seq) {
tcp_reset(sk, skb);
return 1;
}
@@ -1833,7 +1850,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
if (acceptable) {
tcp_set_state(sk, TCP_ESTABLISHED);
sk->dummy_th.dest=th->source;
- sk->copied_seq = tp->rcv_nxt;
+ tp->copied_seq = tp->rcv_nxt;
if(!sk->dead)
sk->state_change(sk);
@@ -1850,7 +1867,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
break;
case TCP_FIN_WAIT1:
- if (tp->snd_una == sk->write_seq) {
+ if (tp->snd_una == tp->write_seq) {
sk->shutdown |= SEND_SHUTDOWN;
tcp_set_state(sk, TCP_FIN_WAIT2);
if (!sk->dead)
@@ -1861,12 +1878,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
break;
case TCP_CLOSING:
- if (tp->snd_una == sk->write_seq)
+ if (tp->snd_una == tp->write_seq)
tcp_time_wait(sk);
break;
case TCP_LAST_ACK:
- if (tp->snd_una == sk->write_seq) {
+ if (tp->snd_una == tp->write_seq) {
sk->shutdown = SHUTDOWN_MASK;
tcp_set_state(sk,TCP_CLOSE);
if (!sk->dead)
@@ -1874,13 +1891,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
goto discard;
}
break;
-
- case TCP_TIME_WAIT:
- /* Keep us in TIME_WAIT until we stop getting
- * packets, reset the timeout.
- */
- tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
- break;
}
} else
goto discard;
@@ -1918,12 +1928,6 @@ step6:
break;
}
- /* step 8: check the FIN bit */
- if (th->fin) {
- if(tcp_fin(skb, sk, th) != 0)
- goto discard;
- }
-
tcp_data_snd_check(sk);
tcp_ack_snd_check(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e4f8981ac..91f21ff75 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_ipv4.c,v 1.79 1998/01/15 22:40:47 freitag Exp $
+ * Version: $Id: tcp_ipv4.c,v 1.109 1998/03/15 07:24:15 davem Exp $
*
* IPv4 specific functions
*
@@ -60,8 +60,6 @@
#include <linux/inet.h>
-extern int sysctl_tcp_sack;
-extern int sysctl_tcp_tsack;
extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
extern int sysctl_tcp_syncookies;
@@ -89,16 +87,19 @@ void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
*/
struct sock *tcp_established_hash[TCP_HTABLE_SIZE];
+/* Ok, let's try this, I give up, we do need a local binding
+ * TCP hash as well as the others for fast bind/connect.
+ */
+struct tcp_bind_bucket *tcp_bound_hash[TCP_BHTABLE_SIZE];
+
/* All sockets in TCP_LISTEN state will be in here. This is the only table
* where wildcard'd TCP sockets can exist. Hash function here is just local
* port number.
*/
struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE];
-/* Ok, let's try this, I give up, we do need a local binding
- * TCP hash as well as the others for fast bind/connect.
- */
-struct sock *tcp_bound_hash[TCP_BHTABLE_SIZE];
+/* Register cache. */
+struct sock *tcp_regs[TCP_NUM_REGS];
/*
* This array holds the first and last local port number.
@@ -106,6 +107,7 @@ struct sock *tcp_bound_hash[TCP_BHTABLE_SIZE];
* 32768-61000
*/
int sysctl_local_port_range[2] = { 1024, 4999 };
+int tcp_port_rover = (1024 - 1);
static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
__u32 faddr, __u16 fport)
@@ -123,155 +125,135 @@ static __inline__ int tcp_sk_hashfn(struct sock *sk)
return tcp_hashfn(laddr, lport, faddr, fport);
}
-static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum)
+/* Invariant, sk->num is non-zero. */
+void tcp_bucket_unlock(struct sock *sk)
{
- struct sock *sk2;
- int retval = 0, sk_reuse = sk->reuse;
+ struct tcp_bind_bucket *tb;
+ unsigned short snum = sk->num;
SOCKHASH_LOCK();
- sk2 = tcp_bound_hash[tcp_bhashfn(snum)];
- for(; sk2 != NULL; sk2 = sk2->bind_next) {
- if((sk2->num == snum) && (sk2 != sk)) {
- unsigned char state = sk2->state;
- int sk2_reuse = sk2->reuse;
-
- /* Two sockets can be bound to the same port if they're
- * bound to different interfaces.
- */
-
- if(sk->bound_dev_if != sk2->bound_dev_if)
- continue;
-
- if(!sk2->rcv_saddr || !sk->rcv_saddr) {
- if((!sk2_reuse) ||
- (!sk_reuse) ||
- (state == TCP_LISTEN)) {
- retval = 1;
- break;
- }
- } else if(sk2->rcv_saddr == sk->rcv_saddr) {
- if((!sk_reuse) ||
- (!sk2_reuse) ||
- (state == TCP_LISTEN)) {
- retval = 1;
- break;
- }
+ for(tb = tcp_bound_hash[tcp_bhashfn(snum)]; tb; tb = tb->next) {
+ if(tb->port == snum) {
+ if(tb->owners == NULL &&
+ (tb->flags & TCPB_FLAG_LOCKED)) {
+ tb->flags &= ~TCPB_FLAG_LOCKED;
+ tcp_inc_slow_timer(TCP_SLT_BUCKETGC);
}
+ break;
}
}
SOCKHASH_UNLOCK();
+}
- return retval;
+struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum)
+{
+ struct tcp_bind_bucket *tb;
+
+ tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
+ if(tb != NULL) {
+ struct tcp_bind_bucket **head =
+ &tcp_bound_hash[tcp_bhashfn(snum)];
+ tb->port = snum;
+ tb->flags = TCPB_FLAG_LOCKED;
+ tb->owners = NULL;
+ if((tb->next = *head) != NULL)
+ tb->next->pprev = &tb->next;
+ *head = tb;
+ tb->pprev = head;
+ }
+ return tb;
}
-static __inline__ int tcp_lport_inuse(int num)
+static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum)
{
- struct sock *sk = tcp_bound_hash[tcp_bhashfn(num)];
+ struct tcp_bind_bucket *tb;
+ int result = 0;
- for(; sk != NULL; sk = sk->bind_next) {
- if(sk->num == num)
- return 1;
+ SOCKHASH_LOCK();
+ for(tb = tcp_bound_hash[tcp_bhashfn(snum)];
+ (tb && (tb->port != snum));
+ tb = tb->next)
+ ;
+ if(tb && tb->owners) {
+ /* Fast path for reuse ports, see include/net/tcp.h for a very
+ * detailed description of why this works, and why it is worth
+ * the effort at all. -DaveM
+ */
+ if((tb->flags & TCPB_FLAG_FASTREUSE) &&
+ (sk->reuse != 0)) {
+ goto go_like_smoke;
+ } else {
+ struct sock *sk2;
+ int sk_reuse = sk->reuse;
+
+ /* We must walk the whole port owner list in this case. -DaveM */
+ for(sk2 = tb->owners; sk2; sk2 = sk2->bind_next) {
+ if(sk->bound_dev_if == sk2->bound_dev_if) {
+ if(!sk_reuse || !sk2->reuse || sk2->state == TCP_LISTEN) {
+ if(!sk2->rcv_saddr ||
+ !sk->rcv_saddr ||
+ (sk2->rcv_saddr == sk->rcv_saddr))
+ break;
+ }
+ }
+ }
+ if(sk2 != NULL)
+ result = 1;
+ }
}
- return 0;
+ if((result == 0) &&
+ (tb == NULL) &&
+ (tcp_bucket_create(snum) == NULL))
+ result = 1;
+go_like_smoke:
+ SOCKHASH_UNLOCK();
+ return result;
}
-/* Find a "good" local port, this is family independent.
- * There are several strategies working in unison here to
- * get the best possible performance. The current socket
- * load is kept track of, if it is zero there is a strong
- * likely hood that there is a zero length chain we will
- * find with a small amount of searching, else the load is
- * what we shoot for for when the chains all have at least
- * one entry. The base helps us walk the chains in an
- * order such that a good chain is found as quickly as possible. -DaveM
- */
unsigned short tcp_good_socknum(void)
{
- static int start = 0;
- static int binding_contour = 0;
- int best = 0;
- int size = 32767; /* a big num. */
- int retval = 0, i, end, bc;
+ struct tcp_bind_bucket *tb;
+ int low = sysctl_local_port_range[0];
+ int high = sysctl_local_port_range[1];
+ int remaining = high - low;
+ int rover;
SOCKHASH_LOCK();
- if (start > sysctl_local_port_range[1] || start < sysctl_local_port_range[0])
- start = sysctl_local_port_range[0];
- i = tcp_bhashfn(start);
- end = i + TCP_BHTABLE_SIZE;
- bc = binding_contour;
- do {
- struct sock *sk = tcp_bound_hash[i&(TCP_BHTABLE_SIZE-1)];
- if(!sk) {
- /* find the smallest value no smaller than start
- * that has this hash value.
- */
- retval = tcp_bhashnext(start-1,i&(TCP_BHTABLE_SIZE-1));
-
- /* Check for decreasing load. */
- if (bc != 0)
- binding_contour = 0;
- goto done;
- } else {
- int j = 0;
- do { sk = sk->bind_next; } while (++j < size && sk);
- if (j < size) {
- best = i&(TCP_BHTABLE_SIZE-1);
- size = j;
- if (bc && size <= bc)
- goto verify;
- }
- }
- } while(++i != end);
- i = best;
-
- /* Socket load is increasing, adjust our load average. */
- binding_contour = size;
-verify:
- if (size < binding_contour)
- binding_contour = size;
-
- retval = tcp_bhashnext(start-1,i);
-
- best = retval; /* mark the starting point to avoid infinite loops */
- while(tcp_lport_inuse(retval)) {
- retval = tcp_bhashnext(retval,i);
- if (retval > sysctl_local_port_range[1]) /* Upper bound */
- retval = tcp_bhashnext(sysctl_local_port_range[0],i);
- if (retval == best) {
- /* This hash chain is full. No answer. */
- retval = 0;
- break;
+ rover = tcp_port_rover;
+ do {
+ rover += 1;
+ if((rover < low) || (rover > high))
+ rover = low;
+ tb = tcp_bound_hash[tcp_bhashfn(rover)];
+ for( ; tb; tb = tb->next) {
+ if(tb->port == rover)
+ goto next;
}
- }
-
-done:
- start = (retval + 1);
+ break;
+ next:
+ } while(--remaining > 0);
+ tcp_port_rover = rover;
+ if((remaining <= 0) || (tcp_bucket_create(rover) == NULL))
+ rover = 0;
SOCKHASH_UNLOCK();
- return retval;
+ return rover;
}
static void tcp_v4_hash(struct sock *sk)
{
- unsigned char state;
-
- SOCKHASH_LOCK();
- state = sk->state;
- if(state != TCP_CLOSE || !sk->dead) {
+ if (sk->state != TCP_CLOSE) {
struct sock **skp;
- if(state == TCP_LISTEN)
- skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
- else
- skp = &tcp_established_hash[tcp_sk_hashfn(sk)];
-
+ SOCKHASH_LOCK();
+ skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))];
if((sk->next = *skp) != NULL)
(*skp)->pprev = &sk->next;
*skp = sk;
sk->pprev = skp;
tcp_sk_bindify(sk);
+ SOCKHASH_UNLOCK();
}
- SOCKHASH_UNLOCK();
}
static void tcp_v4_unhash(struct sock *sk)
@@ -282,6 +264,7 @@ static void tcp_v4_unhash(struct sock *sk)
sk->next->pprev = sk->pprev;
*sk->pprev = sk->next;
sk->pprev = NULL;
+ tcp_reg_zap(sk);
tcp_sk_unbindify(sk);
}
SOCKHASH_UNLOCK();
@@ -293,30 +276,27 @@ static void tcp_v4_rehash(struct sock *sk)
SOCKHASH_LOCK();
state = sk->state;
- if(sk->pprev) {
+ if(sk->pprev != NULL) {
if(sk->next)
sk->next->pprev = sk->pprev;
*sk->pprev = sk->next;
sk->pprev = NULL;
- tcp_sk_unbindify(sk);
+ tcp_reg_zap(sk);
}
- if(state != TCP_CLOSE || !sk->dead) {
+ if(state != TCP_CLOSE) {
struct sock **skp;
- if(state == TCP_LISTEN) {
+ if(state == TCP_LISTEN)
skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
- } else {
- int hash= tcp_sk_hashfn(sk);
- if(state == TCP_TIME_WAIT)
- hash += (TCP_HTABLE_SIZE/2);
- skp = &tcp_established_hash[hash];
- }
+ else
+ skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))];
if((sk->next = *skp) != NULL)
(*skp)->pprev = &sk->next;
*skp = sk;
sk->pprev = skp;
- tcp_sk_bindify(sk);
+ if(state == TCP_LISTEN)
+ tcp_sk_bindify(sk);
}
SOCKHASH_UNLOCK();
}
@@ -360,37 +340,64 @@ static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int d
return result;
}
+/* Until this is verified... -DaveM */
+/* #define USE_QUICKSYNS */
+
/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
* we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
+ * It is assumed that this code only gets called from within NET_BH.
*/
static inline struct sock *__tcp_v4_lookup(struct tcphdr *th,
- u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
+ u32 saddr, u16 sport,
+ u32 daddr, u16 dport, int dif)
{
unsigned short hnum = ntohs(dport);
struct sock *sk;
- int hash = tcp_hashfn(daddr, hnum, saddr, sport);
+ int hash;
+
+#ifdef USE_QUICKSYNS
+ /* Incoming connection short-cut. */
+ if (th && th->syn == 1 && th->ack == 0)
+ goto listener_shortcut;
+#endif
+
+ /* Check TCP register quick cache first. */
+ sk = TCP_RHASH(sport);
+ if(sk &&
+ sk->daddr == saddr && /* remote address */
+ sk->dummy_th.dest == sport && /* remote port */
+ sk->num == hnum && /* local port */
+ sk->rcv_saddr == daddr && /* local address */
+ (!sk->bound_dev_if || sk->bound_dev_if == dif))
+ goto hit;
/* Optimize here for direct hit, only listening connections can
- * have wildcards anyways. It is assumed that this code only
- * gets called from within NET_BH.
+ * have wildcards anyways.
*/
- for(sk = tcp_established_hash[hash]; sk; sk = sk->next)
+ hash = tcp_hashfn(daddr, hnum, saddr, sport);
+ for(sk = tcp_established_hash[hash]; sk; sk = sk->next) {
if(sk->daddr == saddr && /* remote address */
sk->dummy_th.dest == sport && /* remote port */
sk->num == hnum && /* local port */
sk->rcv_saddr == daddr && /* local address */
- (!sk->bound_dev_if || sk->bound_dev_if == dif))
+ (!sk->bound_dev_if || sk->bound_dev_if == dif)) {
+ if (sk->state == TCP_ESTABLISHED)
+ TCP_RHASH(sport) = sk;
goto hit; /* You sunk my battleship! */
-
+ }
+ }
/* Must check for a TIME_WAIT'er before going to listener hash. */
- for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next)
+ for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next) {
if(sk->daddr == saddr && /* remote address */
sk->dummy_th.dest == sport && /* remote port */
sk->num == hnum && /* local port */
sk->rcv_saddr == daddr && /* local address */
(!sk->bound_dev_if || sk->bound_dev_if == dif))
goto hit;
-
+ }
+#ifdef USE_QUICKSYNS
+listener_shortcut:
+#endif
sk = tcp_v4_lookup_listener(daddr, hnum, dif);
hit:
return sk;
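
The new wrinkle in this lookup is the per-port register cache consulted before any hash chain is walked, and refreshed only on ESTABLISHED hits. A rough userspace model of that fast path, using hypothetical names and a toy hash rather than the kernel structures:

#include <stdio.h>

#define RHASH_SIZE 32			/* stands in for TCP_RHASH() */
#define EHASH_SIZE 256			/* stands in for tcp_established_hash */

struct conn {
	unsigned int   saddr, daddr;	/* remote/local address */
	unsigned short sport, dport;	/* remote/local port */
	int established;
	struct conn *next;
};

static struct conn *ehash[EHASH_SIZE];
static struct conn *rcache[RHASH_SIZE];

static unsigned int ehashfn(unsigned int a, unsigned short p) { return (a ^ p) % EHASH_SIZE; }

/* Demultiplex a 4-tuple: quick cache first, then the established chain;
 * only fully established connections may refresh the cache. */
static struct conn *lookup(unsigned int saddr, unsigned short sport,
			   unsigned int daddr, unsigned short dport)
{
	struct conn *c = rcache[sport % RHASH_SIZE];

	if (c && c->saddr == saddr && c->sport == sport &&
	    c->daddr == daddr && c->dport == dport)
		return c;			/* cache hit */

	for (c = ehash[ehashfn(saddr, sport)]; c; c = c->next) {
		if (c->saddr == saddr && c->sport == sport &&
		    c->daddr == daddr && c->dport == dport) {
			if (c->established)
				rcache[sport % RHASH_SIZE] = c;
			return c;
		}
	}
	return NULL;				/* caller falls back to the listener lookup */
}

int main(void)
{
	struct conn c = { 0x0a000001, 0x0a000002, 1234, 80, 1, NULL };

	ehash[ehashfn(c.saddr, c.sport)] = &c;
	printf("slow path hit: %d\n", lookup(c.saddr, c.sport, c.daddr, c.dport) == &c);
	printf("cached hit:    %d\n", rcache[c.sport % RHASH_SIZE] == &c);
	return 0;
}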
@@ -402,20 +409,11 @@ __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport
}
#ifdef CONFIG_IP_TRANSPARENT_PROXY
-#define secondlist(hpnum, sk, fpass) \
-({ struct sock *s1; if(!(sk) && (fpass)--) \
- s1 = tcp_bound_hash[tcp_bhashfn(hpnum)]; \
- else \
- s1 = (sk); \
- s1; \
-})
-
-#define tcp_v4_proxy_loop_init(hnum, hpnum, sk, fpass) \
- secondlist((hpnum), tcp_bound_hash[tcp_bhashfn(hnum)],(fpass))
-
-#define tcp_v4_proxy_loop_next(hnum, hpnum, sk, fpass) \
- secondlist((hpnum),(sk)->bind_next,(fpass))
-
+/* Cleaned up a little and adapted to new bind bucket scheme.
+ * Oddly, this should increase performance here for
+ * transparent proxy, as tests within the inner loop have
+ * been eliminated. -DaveM
+ */
static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr,
unsigned short rnum, unsigned long laddr,
struct device *dev, unsigned short pnum,
@@ -436,51 +434,60 @@ static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr,
}
/* This code must run only from NET_BH. */
- for(s = tcp_v4_proxy_loop_init(hnum, hpnum, s, firstpass);
- s != NULL;
- s = tcp_v4_proxy_loop_next(hnum, hpnum, s, firstpass)) {
- if(s->num == hnum || s->num == hpnum) {
- int score = 0;
- if(s->dead && (s->state == TCP_CLOSE))
+ {
+ struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hnum)];
+ for( ; (tb && tb->port != hnum); tb = tb->next)
+ ;
+ if(tb == NULL)
+ goto next;
+ s = tb->owners;
+ }
+pass2:
+ for(; s; s = s->bind_next) {
+ int score = 0;
+ if(s->rcv_saddr) {
+ if((s->num != hpnum || s->rcv_saddr != paddr) &&
+ (s->num != hnum || s->rcv_saddr != laddr))
continue;
- if(s->rcv_saddr) {
- if((s->num != hpnum || s->rcv_saddr != paddr) &&
- (s->num != hnum || s->rcv_saddr != laddr))
- continue;
- score++;
- }
- if(s->daddr) {
- if(s->daddr != raddr)
- continue;
- score++;
- }
- if(s->dummy_th.dest) {
- if(s->dummy_th.dest != rnum)
- continue;
- score++;
- }
- if(s->bound_dev_if) {
- if(s->bound_dev_if != dif)
- continue;
- score++;
- }
- if(score == 4 && s->num == hnum) {
- result = s;
- break;
- } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) {
- result = s;
- badness = score;
- }
+ score++;
+ }
+ if(s->daddr) {
+ if(s->daddr != raddr)
+ continue;
+ score++;
+ }
+ if(s->dummy_th.dest) {
+ if(s->dummy_th.dest != rnum)
+ continue;
+ score++;
+ }
+ if(s->bound_dev_if) {
+ if(s->bound_dev_if != dif)
+ continue;
+ score++;
+ }
+ if(score == 4 && s->num == hnum) {
+ result = s;
+ goto gotit;
+ } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) {
+ result = s;
+ badness = score;
}
}
+next:
+ if(firstpass--) {
+ struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hpnum)];
+ for( ; (tb && tb->port != hpnum); tb = tb->next)
+ ;
+ if(tb) {
+ s = tb->owners;
+ goto pass2;
+ }
+ }
+gotit:
return result;
}
-
-#undef secondlist
-#undef tcp_v4_proxy_loop_init
-#undef tcp_v4_proxy_loop_next
-
-#endif
+#endif /* CONFIG_IP_TRANSPARENT_PROXY */
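
The rewritten proxy lookup scores each socket hanging off the bind bucket instead of re-testing list membership in the inner loop. A simplified standalone sketch of the scoring rule, with a hypothetical socket struct and ignoring the separate proxy-port address test of the real code:

#include <stdio.h>

struct csk {
	unsigned int   rcv_saddr;	/* bound local address, 0 = wildcard */
	unsigned int   daddr;		/* connected remote address, 0 = any */
	unsigned short dport;		/* connected remote port, 0 = any */
	int ifindex;			/* bound device, 0 = any */
};

/* Every specified field must match the incoming segment, each match is
 * worth one point, and -1 means the socket cannot receive it at all.
 * A perfect 4 on the right local port ends the search immediately;
 * otherwise the best-scoring candidate seen so far is remembered. */
static int proxy_score(const struct csk *s, unsigned int laddr,
		       unsigned int raddr, unsigned short rport, int dif)
{
	int score = 0;

	if (s->rcv_saddr) { if (s->rcv_saddr != laddr) return -1; score++; }
	if (s->daddr)     { if (s->daddr != raddr)     return -1; score++; }
	if (s->dport)     { if (s->dport != rport)     return -1; score++; }
	if (s->ifindex)   { if (s->ifindex != dif)     return -1; score++; }
	return score;
}

int main(void)
{
	struct csk wildcard = { 0, 0, 0, 0 };
	struct csk bound    = { 0x0a000001, 0x0a000002, 1234, 2 };

	printf("wildcard listener scores %d, fully bound socket scores %d\n",
	       proxy_score(&wildcard, 0x0a000001, 0x0a000002, 1234, 2),
	       proxy_score(&bound,    0x0a000001, 0x0a000002, 1234, 2));
	return 0;
}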
static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
@@ -495,41 +502,35 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
/*
* Check that a TCP address is unique, don't allow multiple
- * connects to/from the same address
+ * connects to/from the same address. Actually we can optimize
+ * quite a bit, since the socket about to connect is still
+ * in TCP_CLOSE, a tcp_bind_bucket for the local port he will
+ * use will exist, with a NULL owners list. So check for that.
+ * The good_socknum and verify_bind scheme we use makes this
+ * work.
*/
-static int tcp_unique_address(u32 saddr, u16 snum, u32 daddr, u16 dnum)
+static int tcp_unique_address(struct sock *sk)
{
- int retval = 1, hashent = tcp_hashfn(saddr, snum, daddr, dnum);
- struct sock * sk;
+ struct tcp_bind_bucket *tb;
+ unsigned short snum = sk->num;
+ int retval = 1;
- /* Make sure we are allowed to connect here.
- * But freeze the hash while we snoop around.
- */
+ /* Freeze the hash while we snoop around. */
SOCKHASH_LOCK();
- sk = tcp_established_hash[hashent];
- for (; sk != NULL; sk = sk->next) {
- if(sk->daddr == daddr && /* remote address */
- sk->dummy_th.dest == dnum && /* remote port */
- sk->num == snum && /* local port */
- sk->saddr == saddr) { /* local address */
- retval = 0;
- goto out;
- }
- }
-
- /* Must check TIME_WAIT'ers too. */
- sk = tcp_established_hash[hashent + (TCP_HTABLE_SIZE/2)];
- for (; sk != NULL; sk = sk->next) {
- if(sk->daddr == daddr && /* remote address */
- sk->dummy_th.dest == dnum && /* remote port */
- sk->num == snum && /* local port */
- sk->saddr == saddr) { /* local address */
- retval = 0;
- goto out;
+ tb = tcp_bound_hash[tcp_bhashfn(snum)];
+ for(; tb; tb = tb->next) {
+ if(tb->port == snum && tb->owners != NULL) {
+ /* Almost certainly the re-use port case, search the real hashes
+ * so it actually scales.
+ */
+ sk = __tcp_v4_lookup(NULL, sk->daddr, sk->dummy_th.dest,
+ sk->rcv_saddr, snum, sk->bound_dev_if);
+ if((sk != NULL) && (sk->state != TCP_LISTEN))
+ retval = 0;
+ break;
}
}
-out:
SOCKHASH_UNLOCK();
return retval;
}
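
In other words, the costly 4-tuple search only runs when the port's bind bucket already has owners; a tiny sketch of that short-circuit with hypothetical types:

#include <stdio.h>
#include <stddef.h>

struct owner  { int unused; };
struct bucket { unsigned short port; struct owner *owners; struct bucket *next; };

/* An empty owners list means the port exists only as the connecting
 * socket's own pre-allocated bucket, so uniqueness holds by construction. */
static int port_needs_full_check(struct bucket *chain, unsigned short port)
{
	for (; chain; chain = chain->next)
		if (chain->port == port && chain->owners != NULL)
			return 1;	/* shared port: search the established hash */
	return 0;
}

int main(void)
{
	struct owner o;
	struct bucket shared = { 8080, &o,   NULL };
	struct bucket ours   = { 4242, NULL, &shared };

	printf("port 4242 needs full check: %d\n", port_needs_full_check(&ours, 4242));
	printf("port 8080 needs full check: %d\n", port_needs_full_check(&ours, 8080));
	return 0;
}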
@@ -578,8 +579,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
return -ENETUNREACH;
}
- if (!tcp_unique_address(rt->rt_src, sk->num, rt->rt_dst,
- usin->sin_port)) {
+ if (!tcp_unique_address(sk)) {
ip_rt_put(rt);
return -EADDRNOTAVAIL;
}
@@ -587,7 +587,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
lock_sock(sk);
/* Do this early, so there is less state to unwind on failure. */
- buff = sock_wmalloc(sk, MAX_SYN_SIZE, 0, GFP_KERNEL);
+ buff = sock_wmalloc(sk, (MAX_SYN_SIZE + sizeof(struct sk_buff)),
+ 0, GFP_KERNEL);
if (buff == NULL) {
release_sock(sk);
ip_rt_put(rt);
@@ -605,15 +606,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
sk->dummy_th.dest = usin->sin_port;
- sk->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
+ tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
sk->dummy_th.source,
usin->sin_port);
-
tp->snd_wnd = 0;
tp->snd_wl1 = 0;
- tp->snd_wl2 = sk->write_seq;
- tp->snd_una = sk->write_seq;
-
+ tp->snd_wl2 = tp->write_seq;
+ tp->snd_una = tp->write_seq;
tp->rcv_nxt = 0;
sk->err = 0;
@@ -635,14 +634,22 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
/* No failure conditions can result past this point. */
+ /* We'll fix this up when we get a response from the other end.
+ * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
+ */
+ tp->tcp_header_len = sizeof(struct tcphdr) +
+ (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
+
th = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
buff->h.th = th;
memcpy(th,(void *)&(sk->dummy_th), sizeof(*th));
- buff->seq = sk->write_seq++;
+ /* th->doff gets fixed up below if we tack on options. */
+
+ buff->seq = tp->write_seq++;
th->seq = htonl(buff->seq);
- tp->snd_nxt = sk->write_seq;
- buff->end_seq = sk->write_seq;
+ tp->snd_nxt = tp->write_seq;
+ buff->end_seq = tp->write_seq;
th->ack = 0;
th->syn = 1;
@@ -656,11 +663,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if(sk->mtu < 64)
sk->mtu = 64; /* Sanity limit */
- if (sk->user_mss)
- sk->mss = sk->user_mss;
- else
- sk->mss = (sk->mtu - sizeof(struct iphdr) -
- sizeof(struct tcphdr));
+ sk->mss = (sk->mtu - sizeof(struct iphdr) - tp->tcp_header_len);
+ if(sk->user_mss)
+ sk->mss = min(sk->mss, sk->user_mss);
if (sk->mss < 1) {
printk(KERN_DEBUG "initial sk->mss below 1\n");
@@ -675,9 +680,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
&tp->rcv_wscale);
th->window = htons(tp->rcv_wnd);
- tmp = tcp_syn_build_options(buff, sk->mss, sysctl_tcp_sack,
- sysctl_tcp_timestamps,
- sysctl_tcp_window_scaling,tp->rcv_wscale);
+ tmp = tcp_syn_build_options(buff, sk->mss, sysctl_tcp_timestamps,
+ sysctl_tcp_window_scaling, tp->rcv_wscale);
buff->csum = 0;
th->doff = (sizeof(*th)+ tmp)>>2;
@@ -686,9 +690,10 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
tcp_set_state(sk,TCP_SYN_SENT);
/* Socket identity change complete, no longer
- * in TCP_CLOSE, so rehash.
+ * in TCP_CLOSE, so enter ourselves into the
+ * hash tables.
*/
- tcp_v4_rehash(sk);
+ tcp_v4_hash(sk);
tp->rto = rt->u.dst.rtt;
@@ -715,6 +720,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len)
{
+ struct tcp_opt *tp;
int retval = -EINVAL;
/* Do sanity checking for sendmsg/sendto/send. */
@@ -740,7 +746,10 @@ static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len)
lock_sock(sk);
retval = tcp_do_sendmsg(sk, msg->msg_iovlen, msg->msg_iov,
msg->msg_flags);
-
+ /* Push out partial tail frames if needed. */
+ tp = &(sk->tp_pinfo.af_tcp);
+ if(tp->send_head && tcp_snd_test(sk, tp->send_head))
+ tcp_write_xmit(sk);
release_sock(sk);
out:
@@ -854,7 +863,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
th = (struct tcphdr*)(dp+(iph->ihl<<2));
sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex);
- if (sk == NULL) {
+ if (sk == NULL || sk->state == TCP_TIME_WAIT) {
icmp_statistics.IcmpInErrors++;
return;
}
@@ -1011,7 +1020,8 @@ static void tcp_v4_send_reset(struct sk_buff *skb)
skb1->csum = csum_partial((u8 *) th1, sizeof(*th1), 0);
th1->check = tcp_v4_check(th1, sizeof(*th1), skb1->nh.iph->saddr,
skb1->nh.iph->daddr, skb1->csum);
- /* FIXME: should this carry an options packet? */
+
+ /* Do not place TCP options in a reset. */
ip_queue_xmit(skb1);
tcp_statistics.TcpOutSegs++;
tcp_statistics.TcpOutRsts++;
@@ -1063,6 +1073,14 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
mss = (skb->dst->pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
if (sk->user_mss)
mss = min(mss, sk->user_mss);
+ if(req->tstamp_ok)
+ mss -= TCPOLEN_TSTAMP_ALIGNED;
+ else
+ req->mss += TCPOLEN_TSTAMP_ALIGNED;
+
+ /* tcp_syn_build_options will do an skb_put() to obtain the TCP
+ * options bytes below.
+ */
skb->h.th = th = (struct tcphdr *) skb_put(skb, sizeof(struct tcphdr));
/* Don't offer more than they did.
@@ -1081,9 +1099,8 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
memset(th, 0, sizeof(struct tcphdr));
th->syn = 1;
th->ack = 1;
- th->source =
#ifdef CONFIG_IP_TRANSPARENT_PROXY
- req->lcl_port; /* LVE */
+ th->source = req->lcl_port; /* LVE */
#else
th->source = sk->dummy_th.source;
#endif
@@ -1104,16 +1121,7 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
req->rcv_wscale = rcv_wscale;
}
th->window = htons(req->rcv_wnd);
-
- /* XXX Partial csum of 4 byte quantity is itself! -DaveM
- * Yes, but it's a bit harder to special case now. It's
- * now computed inside the tcp_v4_send_check() to clean up
- * updating the options fields in the mainline send code.
- * If someone thinks this is really bad let me know and
- * I'll try to do it a different way. -- erics
- */
-
- tmp = tcp_syn_build_options(skb, req->mss, req->sack_ok, req->tstamp_ok,
+ tmp = tcp_syn_build_options(skb, req->mss, req->tstamp_ok,
req->wscale_ok,req->rcv_wscale);
skb->csum = 0;
th->doff = (sizeof(*th) + tmp)>>2;
@@ -1232,14 +1240,15 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
req->rcv_isn = skb->seq;
- tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
+ tp.tstamp_ok = tp.wscale_ok = tp.snd_wscale = 0;
tp.in_mss = 536;
tcp_parse_options(th,&tp,want_cookie);
- if (tp.saw_tstamp)
- req->ts_recent = tp.rcv_tsval;
req->mss = tp.in_mss;
+ if (tp.saw_tstamp) {
+ req->mss -= TCPOLEN_TSTAMP_ALIGNED;
+ req->ts_recent = tp.rcv_tsval;
+ }
req->tstamp_ok = tp.tstamp_ok;
- req->sack_ok = tp.sack_ok;
req->snd_wscale = tp.snd_wscale;
req->wscale_ok = tp.wscale_ok;
req->rmt_port = th->source;
@@ -1289,6 +1298,113 @@ error:
return 0;
}
+/* This is not only more efficient than what we used to do, it eliminates
+ * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
+ */
+struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
+{
+ struct sock *newsk = sk_alloc(AF_INET, GFP_ATOMIC, 0);
+
+ if(newsk != NULL) {
+ struct tcp_opt *newtp;
+
+ memcpy(newsk, sk, sizeof(*newsk));
+ newsk->sklist_next = NULL;
+ newsk->daddr = req->af.v4_req.rmt_addr;
+ newsk->rcv_saddr = req->af.v4_req.loc_addr;
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ newsk->num = ntohs(skb->h.th->dest);
+#endif
+ newsk->state = TCP_SYN_RECV;
+
+ /* Clone the TCP header template */
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ newsk->dummy_th.source = req->lcl_port;
+#endif
+ newsk->dummy_th.dest = req->rmt_port;
+ newsk->dummy_th.ack = 1;
+ newsk->dummy_th.doff = sizeof(struct tcphdr)>>2;
+
+ newsk->sock_readers = 0;
+ atomic_set(&newsk->rmem_alloc, 0);
+ skb_queue_head_init(&newsk->receive_queue);
+ atomic_set(&newsk->wmem_alloc, 0);
+ skb_queue_head_init(&newsk->write_queue);
+ newsk->saddr = req->af.v4_req.loc_addr;
+
+ newsk->done = 0;
+ newsk->proc = 0;
+ newsk->pair = NULL;
+ skb_queue_head_init(&newsk->back_log);
+ skb_queue_head_init(&newsk->error_queue);
+
+ /* Now setup tcp_opt */
+ newtp = &(newsk->tp_pinfo.af_tcp);
+ newtp->pred_flags = 0;
+ newtp->rcv_nxt = req->rcv_isn + 1;
+ newtp->snd_nxt = req->snt_isn + 1;
+ newtp->snd_una = req->snt_isn + 1;
+ newtp->srtt = 0;
+ newtp->ato = 0;
+ newtp->snd_wl1 = req->rcv_isn;
+ newtp->snd_wl2 = req->snt_isn;
+ newtp->snd_wnd = ntohs(skb->h.th->window);
+ newtp->max_window = newtp->snd_wnd;
+ newtp->pending = 0;
+ newtp->retransmits = 0;
+ newtp->last_ack_sent = req->rcv_isn + 1;
+ newtp->backoff = 0;
+ newtp->mdev = TCP_TIMEOUT_INIT;
+ newtp->snd_cwnd = 1;
+ newtp->rto = TCP_TIMEOUT_INIT;
+ newtp->packets_out = 0;
+ newtp->high_seq = 0;
+ newtp->snd_ssthresh = 0x7fffffff;
+ newtp->snd_cwnd_cnt = 0;
+ newtp->dup_acks = 0;
+ newtp->delayed_acks = 0;
+ init_timer(&newtp->retransmit_timer);
+ newtp->retransmit_timer.function = &tcp_retransmit_timer;
+ newtp->retransmit_timer.data = (unsigned long) newsk;
+ init_timer(&newtp->delack_timer);
+ newtp->delack_timer.function = &tcp_delack_timer;
+ newtp->delack_timer.data = (unsigned long) newsk;
+ skb_queue_head_init(&newtp->out_of_order_queue);
+ newtp->send_head = newtp->retrans_head = NULL;
+ newtp->rcv_wup = req->rcv_isn + 1;
+ newtp->write_seq = req->snt_isn + 1;
+ newtp->copied_seq = req->rcv_isn + 1;
+
+ newtp->saw_tstamp = 0;
+ newtp->in_mss = 536;
+
+ init_timer(&newtp->probe_timer);
+ newtp->probe_timer.function = &tcp_probe_timer;
+ newtp->probe_timer.data = (unsigned long) newsk;
+ newtp->probes_out = 0;
+ newtp->syn_seq = req->rcv_isn;
+ newtp->fin_seq = req->rcv_isn;
+ newtp->urg_data = 0;
+ tcp_synq_init(newtp);
+ newtp->syn_backlog = 0;
+
+ /* Back to base struct sock members. */
+ newsk->err = 0;
+ newsk->ack_backlog = 0;
+ newsk->max_ack_backlog = SOMAXCONN;
+ newsk->priority = 1;
+
+ /* IP layer stuff */
+ newsk->opt = req->af.v4_req.opt;
+ newsk->timeout = 0;
+ init_timer(&newsk->timer);
+ newsk->timer.function = &net_timer;
+ newsk->timer.data = (unsigned long) newsk;
+ newsk->socket = NULL;
+ }
+ return newsk;
+}
+
struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
struct open_request *req,
struct dst_entry *dst)
@@ -1301,98 +1417,14 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
if (sk->ack_backlog > sk->max_ack_backlog)
goto exit; /* head drop */
#endif
- newsk = sk_alloc(AF_INET, GFP_ATOMIC);
+ newsk = tcp_create_openreq_child(sk, req, skb);
if (!newsk)
goto exit;
#ifdef NEW_LISTEN
sk->ack_backlog++;
#endif
- memcpy(newsk, sk, sizeof(*newsk));
-
- /* Or else we die! -DaveM */
- newsk->sklist_next = NULL;
-
- newsk->opt = req->af.v4_req.opt;
- skb_queue_head_init(&newsk->write_queue);
- skb_queue_head_init(&newsk->receive_queue);
- skb_queue_head_init(&newsk->out_of_order_queue);
- skb_queue_head_init(&newsk->error_queue);
-
- /* Unused */
newtp = &(newsk->tp_pinfo.af_tcp);
- newtp->send_head = NULL;
- newtp->retrans_head = NULL;
-
- newtp->pending = 0;
-
- skb_queue_head_init(&newsk->back_log);
-
- newsk->prot->init(newsk);
-
- newtp->snd_cwnd_cnt = 0;
- newtp->backoff = 0;
- newsk->proc = 0;
- newsk->done = 0;
- newsk->pair = NULL;
- atomic_set(&newsk->wmem_alloc, 0);
- atomic_set(&newsk->rmem_alloc, 0);
- newsk->localroute = sk->localroute;
-
- newsk->err = 0;
- newsk->shutdown = 0;
- newsk->ack_backlog = 0;
-
- newtp->fin_seq = req->rcv_isn;
- newsk->syn_seq = req->rcv_isn;
- newsk->state = TCP_SYN_RECV;
- newsk->timeout = 0;
-
- newsk->write_seq = req->snt_isn;
-
- newtp->snd_wnd = ntohs(skb->h.th->window);
- newtp->max_window = newtp->snd_wnd;
- newtp->snd_wl1 = req->rcv_isn;
- newtp->snd_wl2 = newsk->write_seq;
- newtp->snd_una = newsk->write_seq++;
- newtp->snd_nxt = newsk->write_seq;
-
- newsk->urg_data = 0;
- newtp->packets_out = 0;
- newtp->retransmits = 0;
- newsk->linger=0;
- newsk->destroy = 0;
- init_timer(&newsk->timer);
- newsk->timer.data = (unsigned long) newsk;
- newsk->timer.function = &net_timer;
-
- tcp_init_xmit_timers(newsk);
-
- newsk->dummy_th.source =
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- req->lcl_port; /* LVE */
-#else
- sk->dummy_th.source;
-#endif
- newsk->dummy_th.dest = req->rmt_port;
- newsk->sock_readers=0;
-
- newtp->last_ack_sent = newtp->rcv_nxt = req->rcv_isn + 1;
- newtp->rcv_wup = req->rcv_isn + 1;
- newsk->copied_seq = req->rcv_isn + 1;
-
- newsk->socket = NULL;
-
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- /*
- * Deal with possibly redirected traffic by setting num to
- * the intended destination port of the received packet.
- */
- newsk->num = ntohs(skb->h.th->dest);
-#endif
- newsk->daddr = req->af.v4_req.rmt_addr;
- newsk->saddr = req->af.v4_req.loc_addr;
- newsk->rcv_saddr = req->af.v4_req.loc_addr;
/* options / mss / route_cache */
if (dst == NULL) {
@@ -1418,7 +1450,6 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
if (newsk->mtu < 64)
newsk->mtu = 64;
- newtp->sack_ok = req->sack_ok;
newtp->tstamp_ok = req->tstamp_ok;
newtp->window_clamp = req->window_clamp;
newtp->rcv_wnd = req->rcv_wnd;
@@ -1433,8 +1464,8 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
if (newtp->tstamp_ok) {
newtp->ts_recent = req->ts_recent;
newtp->ts_recent_stamp = jiffies;
- newtp->tcp_header_len = sizeof(struct tcphdr) + 12; /* FIXME: define constant! */
- newsk->dummy_th.doff += 3;
+ newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ newsk->dummy_th.doff += (TCPOLEN_TSTAMP_ALIGNED >> 2);
} else {
newtp->tcp_header_len = sizeof(struct tcphdr);
}
@@ -1446,13 +1477,13 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
/* Make sure our mtu is adjusted for headers. */
newsk->mss = min(req->mss, snd_mss) + sizeof(struct tcphdr) - newtp->tcp_header_len;
- tcp_v4_hash(newsk);
+ /* Must use the af_specific ops here for the case of IPv6 mapped. */
+ newsk->prot->hash(newsk);
add_to_prot_sklist(newsk);
return newsk;
exit:
- if (dst)
- dst_release(dst);
+ dst_release(dst);
return NULL;
}
@@ -1623,6 +1654,8 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
skb->used = 0;
+ if (sk->state == TCP_TIME_WAIT)
+ goto do_time_wait;
if (!sk->sock_readers)
return tcp_v4_do_rcv(sk, skb);
@@ -1636,6 +1669,12 @@ discard_it:
/* Discard frame. */
kfree_skb(skb);
return 0;
+
+do_time_wait:
+ if(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
+ skb, th, &(IPCB(skb)->opt), skb->len))
+ goto no_tcp_socket;
+ goto discard_it;
}
int tcp_v4_build_header(struct sock *sk, struct sk_buff *skb)
@@ -1770,33 +1809,21 @@ struct tcp_func ipv4_specific = {
sizeof(struct sockaddr_in)
};
+/* NOTE: A lot of things set to zero explicitly by call to
+ * sk_alloc() so need not be done here.
+ */
static int tcp_v4_init_sock(struct sock *sk)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- skb_queue_head_init(&sk->out_of_order_queue);
+ skb_queue_head_init(&tp->out_of_order_queue);
tcp_init_xmit_timers(sk);
- tp->srtt = 0;
tp->rto = TCP_TIMEOUT_INIT; /*TCP_WRITE_TIME*/
tp->mdev = TCP_TIMEOUT_INIT;
-
- tp->ato = 0;
- tp->iat = (HZ/5) << 3;
-
- /* FIXME: tie this to sk->rcvbuf? (May be unnecessary) */
- /* tp->rcv_wnd = 8192; */
- tp->tstamp_ok = 0;
- tp->sack_ok = 0;
- tp->wscale_ok = 0;
tp->in_mss = 536;
- tp->snd_wscale = 0;
- tp->sacks = 0;
- tp->saw_tstamp = 0;
- tp->syn_backlog = 0;
- /*
- * See draft-stevens-tcpca-spec-01 for discussion of the
+ /* See draft-stevens-tcpca-spec-01 for discussion of the
* initialization of these values.
*/
tp->snd_cwnd = 1;
@@ -1804,9 +1831,7 @@ static int tcp_v4_init_sock(struct sock *sk)
sk->priority = 1;
sk->state = TCP_CLOSE;
-
sk->max_ack_backlog = SOMAXCONN;
-
sk->mtu = 576;
sk->mss = 536;
@@ -1824,6 +1849,7 @@ static int tcp_v4_init_sock(struct sock *sk)
static int tcp_v4_destroy_sock(struct sock *sk)
{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct sk_buff *skb;
tcp_clear_xmit_timers(sk);
@@ -1836,9 +1862,17 @@ static int tcp_v4_destroy_sock(struct sock *sk)
kfree_skb(skb);
/* Cleans up our, hopefully empty, out_of_order_queue. */
- while((skb = skb_dequeue(&sk->out_of_order_queue)) != NULL)
+ while((skb = skb_dequeue(&tp->out_of_order_queue)) != NULL)
kfree_skb(skb);
+ /* Clean up a locked TCP bind bucket; this only happens if a
+ * port is allocated for a socket but it never fully connects,
+ * in which case we will find num to be non-zero and daddr to
+ * be zero.
+ */
+ if(sk->daddr == 0 && sk->num != 0)
+ tcp_bucket_unlock(sk);
+
return 0;
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fbae5cfa6..d8c3c6480 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_output.c,v 1.51 1998/01/15 22:40:39 freitag Exp $
+ * Version: $Id: tcp_output.c,v 1.65 1998/03/15 12:07:03 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -34,8 +34,6 @@
#include <net/tcp.h>
-extern int sysctl_tcp_sack;
-extern int sysctl_tcp_tsack;
extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
@@ -45,7 +43,8 @@ static __inline__ void clear_delayed_acks(struct sock * sk)
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
tp->delayed_acks = 0;
- sk->ack_backlog = 0;
+ if(tcp_in_quickack_mode(tp))
+ tp->ato = ((HZ/100)*2);
tcp_clear_xmit_timer(sk, TIME_DACK);
}
@@ -58,69 +57,26 @@ static __inline__ void update_send_head(struct sock *sk)
tp->send_head = NULL;
}
-static __inline__ int tcp_snd_test(struct sock *sk, struct sk_buff *skb)
-{
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- int nagle_check = 1;
- int len;
-
- /* RFC 1122 - section 4.2.3.4
- *
- * We must queue if
- *
- * a) The right edge of this frame exceeds the window
- * b) There are packets in flight and we have a small segment
- * [SWS avoidance and Nagle algorithm]
- * (part of SWS is done on packetization)
- * c) We are retransmiting [Nagle]
- * d) We have too many packets 'in flight'
- *
- * Don't use the nagle rule for urgent data.
- */
- len = skb->end_seq - skb->seq;
- if (!sk->nonagle && len < (sk->mss >> 1) && tp->packets_out &&
- !skb->h.th->urg)
- nagle_check = 0;
-
- return (nagle_check && tp->packets_out < tp->snd_cwnd &&
- !after(skb->end_seq, tp->snd_una + tp->snd_wnd) &&
- tp->retransmits == 0);
-}
-
/*
* This is the main buffer sending routine. We queue the buffer
* having checked it is sane seeming.
*/
-void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
+void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue)
{
- struct tcphdr * th = skb->h.th;
+ struct tcphdr *th = skb->h.th;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
int size;
/* Length of packet (not counting length of pre-tcp headers). */
size = skb->len - ((unsigned char *) th - skb->data);
- /* Sanity check it.. */
- if (size < sizeof(struct tcphdr) || size > skb->len) {
- printk(KERN_DEBUG "tcp_send_skb: bad skb "
- "(skb = %p, data = %p, th = %p, len = %u)\n",
- skb, skb->data, th, skb->len);
- kfree_skb(skb);
- return;
- }
-
- /* If we have queued a header size packet.. (these crash a few
- * tcp stacks if ack is not set)
- * FIXME: What is the equivalent below when we have options?
- */
- if (size == sizeof(struct tcphdr)) {
- /* If it's got a syn or fin discard. */
- if(!th->syn && !th->fin) {
- printk(KERN_DEBUG "tcp_send_skb: attempt to queue a bogon.\n");
- kfree_skb(skb);
- return;
- }
+ /* If there is a FIN or a SYN we add it onto the size. */
+ if (th->fin || th->syn) {
+ if(th->syn)
+ size++;
+ if(th->fin)
+ size++;
}
/* Actual processing. */
@@ -129,14 +85,14 @@ void tcp_send_skb(struct sock *sk, struct sk_buff *skb)
skb_queue_tail(&sk->write_queue, skb);
- if (tp->send_head == NULL && tcp_snd_test(sk, skb)) {
+ if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) {
struct sk_buff * buff;
/* This is going straight out. */
tp->last_ack_sent = tp->rcv_nxt;
th->ack_seq = htonl(tp->rcv_nxt);
th->window = htons(tcp_select_window(sk));
- tcp_update_options((__u32 *)(th+1),tp);
+ tcp_update_options((__u32 *)(th + 1),tp);
tp->af_specific->send_check(sk, th, size, skb);
@@ -165,11 +121,10 @@ queue:
/* Remember where we must start sending. */
if (tp->send_head == NULL)
tp->send_head = skb;
- if (tp->packets_out == 0 && !tp->pending) {
+ if (!force_queue && tp->packets_out == 0 && !tp->pending) {
tp->pending = TIME_PROBE0;
tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
}
- return;
}
/*
@@ -214,8 +169,6 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
buff->h.th = nth;
memcpy(nth, th, tp->tcp_header_len);
- /* FIXME: Make sure this gets tcp options right. */
-
/* Correct the new header. */
buff->seq = skb->seq + len;
buff->end_seq = skb->end_seq;
@@ -281,14 +234,6 @@ static int tcp_wrxmit_frag(struct sock *sk, struct sk_buff *skb, int size)
tp->send_head = skb;
tp->packets_out--;
return -1;
- } else {
-#if 0
- /* If tcp_fragment succeded then
- * the send head is the resulting
- * fragment
- */
- tp->send_head = skb->next;
-#endif
}
return 0;
}
@@ -346,9 +291,10 @@ void tcp_write_xmit(struct sock *sk)
size = skb->len - (((unsigned char*)th) - skb->data);
}
- tp->last_ack_sent = th->ack_seq = htonl(tp->rcv_nxt);
+ tp->last_ack_sent = tp->rcv_nxt;
+ th->ack_seq = htonl(tp->rcv_nxt);
th->window = rcv_wnd;
- tcp_update_options((__u32 *)(th+1),tp);
+ tcp_update_options((__u32 *)(th + 1),tp);
tp->af_specific->send_check(sk, th, size, skb);
@@ -437,128 +383,44 @@ void tcp_write_xmit(struct sock *sk)
* taken by headers, and the remaining space will be available for TCP data.
* This should be accounted for correctly instead.
*/
-unsigned short tcp_select_window(struct sock *sk)
+u32 __tcp_select_window(struct sock *sk)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- int mss = sk->mss;
- long free_space = sock_rspace(sk) / 2;
- long window, cur_win;
+ unsigned int mss = sk->mss;
+ unsigned int free_space;
+ u32 window, cur_win;
+ free_space = (sk->rcvbuf - atomic_read(&sk->rmem_alloc)) / 2;
if (tp->window_clamp) {
free_space = min(tp->window_clamp, free_space);
mss = min(tp->window_clamp, mss);
- }
-#ifdef NO_ANK_FIX
- /* I am tired of this message */
- else
- printk(KERN_DEBUG "Clamp failure. Water leaking.\n");
-#endif
+ } else {
+ printk("tcp_select_window: tp->window_clamp == 0.\n");
+ }
if (mss < 1) {
mss = 1;
- printk(KERN_DEBUG "tcp_select_window: mss fell to 0.\n");
+ printk("tcp_select_window: sk->mss fell to 0.\n");
}
- /* compute the actual window i.e.
- * old_window - received_bytes_on_that_win
- */
- cur_win = tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup);
- window = tp->rcv_wnd;
-
- if (cur_win < 0) {
- cur_win = 0;
-#ifdef NO_ANK_FIX
- /* And this too. */
- printk(KERN_DEBUG "TSW: win < 0 w=%d 1=%u 2=%u\n",
- tp->rcv_wnd, tp->rcv_nxt, tp->rcv_wup);
-#endif
- }
-
- if (free_space < sk->rcvbuf/4 && free_space < mss/2)
+ cur_win = tcp_receive_window(tp);
+ if (free_space < sk->rcvbuf/4 && free_space < mss/2) {
window = 0;
-
- /* Get the largest window that is a nice multiple of mss.
- * Window clamp already applied above.
- * If our current window offering is within 1 mss of the
- * free space we just keep it. This prevents the divide
- * and multiply from happening most of the time.
- * We also don't do any window rounding when the free space
- * is too small.
- */
- if (window < free_space - mss && free_space > mss)
- window = (free_space/mss)*mss;
-
- /* Never shrink the offered window */
- if (window < cur_win)
- window = cur_win;
-
- tp->rcv_wnd = window;
- tp->rcv_wup = tp->rcv_nxt;
- return window >> tp->rcv_wscale; /* RFC1323 scaling applied */
-}
-
-#if 0
-/* Old algorithm for window selection */
-unsigned short tcp_select_window(struct sock *sk)
-{
- struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- int mss = sk->mss;
- long free_space = sock_rspace(sk);
- long window, cur_win, usable;
-
- if (tp->window_clamp) {
- free_space = min(tp->window_clamp, free_space);
- mss = min(tp->window_clamp, mss);
- }
-
- /* compute the actual window i.e.
- * old_window - received_bytes_on_that_win
- */
- cur_win = tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup);
- window = tp->rcv_wnd;
-
- if (cur_win < 0) {
- cur_win = 0;
- printk(KERN_DEBUG "TSW: win < 0 w=%d 1=%u 2=%u\n",
- tp->rcv_wnd, tp->rcv_nxt, tp->rcv_wup);
- }
-
- /* RFC 1122:
- * "the suggested [SWS] avoidance algoritm for the receiver is to keep
- * RECV.NEXT + RCV.WIN fixed until:
- * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
- *
- * i.e. don't raise the right edge of the window until you can raise
- * it at least MSS bytes.
- */
-
- usable = free_space - cur_win;
- if (usable < 0)
- usable = 0;
-
- if (window < usable) {
- /* Window is not blocking the sender
- * and we have enough free space for it
- */
- if (cur_win > (sk->mss << 1))
- goto out;
- }
-
- if (window >= usable) {
- /* We are offering too much, cut it down...
- * but don't shrink the window
- */
- window = max(usable, cur_win);
} else {
- while ((usable - window) >= mss)
- window += mss;
+ /* Get the largest window that is a nice multiple of mss.
+ * Window clamp already applied above.
+ * If our current window offering is within 1 mss of the
+ * free space we just keep it. This prevents the divide
+ * and multiply from happening most of the time.
+ * We also don't do any window rounding when the free space
+ * is too small.
+ */
+ window = tp->rcv_wnd;
+ if ((window <= (free_space - mss)) || (window > free_space))
+ window = (free_space/mss)*mss;
}
-out:
- tp->rcv_wnd = window;
- tp->rcv_wup = tp->rcv_nxt;
return window;
}
-#endif
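
Numerically, the surviving branch offers an MSS-multiple of half the free receive buffer and keeps the previous offer when it is already within one MSS of that space. A standalone sketch under those assumptions (hypothetical helper, no socket structures, and it assumes free_space is at least one MSS once past the congestion test):

#include <stdio.h>

/* Mirror the shape of the new window choice: advertise zero when the
 * buffer is nearly full, otherwise round the free space down to a
 * multiple of the MSS unless the previous offer is already close enough. */
static unsigned int select_window(unsigned int free_space, unsigned int rcvbuf,
				  unsigned int mss, unsigned int prev_window)
{
	if (free_space < rcvbuf / 4 && free_space < mss / 2)
		return 0;				/* receiver congested */

	if (prev_window <= free_space - mss || prev_window > free_space)
		return (free_space / mss) * mss;	/* re-round to an MSS multiple */

	return prev_window;				/* close enough: skip the divide */
}

int main(void)
{
	/* 16384 bytes free with a 1460-byte MSS rounds down to 16060 (11 * 1460). */
	printf("%u\n", select_window(16384, 65536, 1460, 4096));
	/* A previous offer within one MSS of the free space is kept unchanged. */
	printf("%u\n", select_window(16384, 65536, 1460, 15500));
	return 0;
}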
static int tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb)
{
@@ -729,84 +591,123 @@ void tcp_do_retransmit(struct sock *sk, int all)
}
}
-/*
- * Send a fin.
+/* Send a fin. The caller locks the socket for us. This cannot be
+ * allowed to fail queueing a FIN frame under any circumstances.
*/
-
void tcp_send_fin(struct sock *sk)
{
- struct tcphdr *th =(struct tcphdr *)&sk->dummy_th;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- struct tcphdr *t1;
- struct sk_buff *buff;
- int tmp;
- buff = sock_wmalloc(sk, BASE_ACK_SIZE + tp->tcp_header_len, 1, GFP_KERNEL);
- if (buff == NULL) {
- /* FIXME: This is a disaster if it occurs. */
- printk(KERN_INFO "tcp_send_fin: Impossible malloc failure");
- return;
- }
+ /* Optimization, tack on the FIN if we have a queue of
+ * unsent frames.
+ */
+ if(tp->send_head != NULL) {
+ struct sk_buff *tail = skb_peek_tail(&sk->write_queue);
+ struct tcphdr *th = tail->h.th;
+ int data_len;
+
+ /* Unfortunately tcp_write_xmit won't check for going over
+ * the MSS due to the FIN sequence number, so we have to
+ * watch out for it here.
+ */
+ data_len = (tail->tail - (((unsigned char *)th)+tp->tcp_header_len));
+ if(data_len >= sk->mss)
+ goto build_new_frame; /* ho hum... */
- /* Administrivia. */
- buff->csum = 0;
+ /* tcp_write_xmit() will checksum the header etc. for us. */
+ th->fin = 1;
+ tail->end_seq++;
+ } else {
+ struct sk_buff *buff;
+ struct tcphdr *th;
- /* Put in the IP header and routing stuff. */
- tmp = tp->af_specific->build_net_header(sk, buff);
- if (tmp < 0) {
- int t;
+build_new_frame:
+ buff = sock_wmalloc(sk,
+ (BASE_ACK_SIZE + tp->tcp_header_len +
+ sizeof(struct sk_buff)),
+ 1, GFP_KERNEL);
+ if (buff == NULL) {
+ /* We can only fail due to low memory situations, not
+ * due to going over our sndbuf limits (due to the
+ * force flag passed to sock_wmalloc). So just keep
+ * trying. We cannot allow this to fail. The socket is
+ * still locked, so we need not check if the connection
+ * was reset in the meantime etc.
+ */
+ goto build_new_frame;
+ }
- /* FIXME: We must not throw this out. Eventually we must
- * put a FIN into the queue, otherwise it never gets queued.
- */
- kfree_skb(buff);
- sk->write_seq++;
- t = del_timer(&sk->timer);
- if (t)
- add_timer(&sk->timer);
- else
- tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
- return;
- }
-
- /* We ought to check if the end of the queue is a buffer and
- * if so simply add the fin to that buffer, not send it ahead.
- */
- t1 =(struct tcphdr *)skb_put(buff,tp->tcp_header_len);
- buff->h.th = t1;
- tcp_build_options((__u32 *)(t1+1),tp);
-
- memcpy(t1, th, sizeof(*t1));
- buff->seq = sk->write_seq;
- sk->write_seq++;
- buff->end_seq = sk->write_seq;
- t1->seq = htonl(buff->seq);
- t1->ack_seq = htonl(tp->rcv_nxt);
- t1->window = htons(tcp_select_window(sk));
- t1->fin = 1;
-
- tp->af_specific->send_check(sk, t1, tp->tcp_header_len, buff);
-
- /* The fin can only be transmited after the data. */
- skb_queue_tail(&sk->write_queue, buff);
- if (tp->send_head == NULL) {
- /* FIXME: BUG! we need to check if the fin fits into the window
- * here. If not we need to do window probing (sick, but true)
+ /* Administrivia. */
+ buff->csum = 0;
+
+ /* Put in the IP header and routing stuff.
+ *
+ * FIXME:
+ * We can fail if the interface for the route
+ * this socket takes goes down right before
+ * we get here. ANK is there a way to point
+ * this into a "black hole" route in such a
+ * case? Ideally, we should still be able to
+ * queue this and let the retransmit timer
+ * keep trying until the destination becomes
+ * reachable once more. -DaveM
*/
- struct sk_buff *skb1;
+ if(tp->af_specific->build_net_header(sk, buff) < 0) {
+ kfree_skb(buff);
+ goto update_write_seq;
+ }
+ th = (struct tcphdr *) skb_put(buff, tp->tcp_header_len);
+ buff->h.th = th;
- tp->packets_out++;
- tp->snd_nxt = sk->write_seq;
- buff->when = jiffies;
+ memcpy(th, (void *) &(sk->dummy_th), sizeof(*th));
+ th->seq = htonl(tp->write_seq);
+ th->fin = 1;
+ tcp_build_options((__u32 *)(th + 1), tp);
- skb1 = skb_clone(buff, GFP_KERNEL);
- if (skb1) {
- skb_set_owner_w(skb1, sk);
- tp->af_specific->queue_xmit(skb1);
- }
+ /* This makes sure we do things like abide by the congestion
+ * window and other constraints which prevent us from sending.
+ */
+ tcp_send_skb(sk, buff, 0);
+ }
+update_write_seq:
+ /* So that we recognize the ACK coming back for
+ * this FIN as being legitimate.
+ */
+ tp->write_seq++;
+}
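
The queueing decision above reduces to one test: piggy-back the FIN on the last unsent segment unless that segment already carries a full MSS of data. A minimal sketch of just that decision, using a hypothetical segment type:

#include <stdio.h>

struct seg {
	unsigned int end_seq;
	int data_len;		/* TCP payload bytes already in the frame */
	int fin;
};

/* Returns 1 if the FIN was tacked onto the tail segment, 0 if the caller
 * must build and queue a separate FIN-carrying frame. */
static int try_piggyback_fin(struct seg *tail, int mss)
{
	if (tail == NULL || tail->data_len >= mss)
		return 0;
	tail->fin = 1;
	tail->end_seq++;	/* the FIN consumes one sequence number */
	return 1;
}

int main(void)
{
	struct seg tail = { 1001, 536, 0 };

	if (try_piggyback_fin(&tail, 1460))
		printf("FIN added to tail, end_seq now %u\n", tail.end_seq);
	else
		printf("tail segment full, building a new FIN frame\n");
	return 0;
}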
- if (!tcp_timer_is_set(sk, TIME_RETRANS))
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+/* We get here when a process closes a file descriptor (either due to
+ * an explicit close() or as a byproduct of exit()'ing) and there
+ * was unread data in the receive queue. This behavior is recommended
+ * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
+ */
+void tcp_send_active_reset(struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct sk_buff *skb;
+ struct tcphdr *th;
+
+again:
+ /* NOTE: No TCP options attached and we never retransmit this. */
+ skb = sock_wmalloc(sk, (BASE_ACK_SIZE + sizeof(*th)), 1, GFP_KERNEL);
+ if(skb == NULL)
+ goto again;
+ skb->csum = 0;
+ if(tp->af_specific->build_net_header(sk, skb) < 0) {
+ kfree_skb(skb);
+ } else {
+ th = (struct tcphdr *) skb_put(skb, sizeof(*th));
+ memcpy(th, &(sk->dummy_th), sizeof(*th));
+ th->seq = htonl(tp->write_seq);
+ th->rst = 1;
+ th->doff = sizeof(*th) / 4;
+ tp->last_ack_sent = tp->rcv_nxt;
+ th->ack_seq = htonl(tp->rcv_nxt);
+ th->window = htons(tcp_select_window(sk));
+ tp->af_specific->send_check(sk, th, sizeof(*th), skb);
+ tp->af_specific->queue_xmit(skb);
+ tcp_statistics.TcpOutSegs++;
+ tcp_statistics.TcpOutRsts++;
}
}
@@ -814,6 +715,9 @@ void tcp_send_fin(struct sock *sk)
* a SYN packet that crossed the incoming SYN that caused this routine
* to get called. If this assumption fails then the initial rcv_wnd
* and rcv_wscale values will not be correct.
+ *
+ * XXX When you have time Dave, redo this to use tcp_send_skb() just
+ * XXX like tcp_send_fin() above now does.... -DaveM
*/
int tcp_send_synack(struct sock *sk)
{
@@ -823,7 +727,7 @@ int tcp_send_synack(struct sock *sk)
struct tcphdr *th;
int tmp;
- skb = sock_wmalloc(sk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
+ skb = sock_wmalloc(sk, MAX_SYN_SIZE + sizeof(struct sk_buff), 1, GFP_ATOMIC);
if (skb == NULL)
return -ENOMEM;
@@ -855,8 +759,7 @@ int tcp_send_synack(struct sock *sk)
tp->last_ack_sent = th->ack_seq = htonl(tp->rcv_nxt);
tmp = tcp_syn_build_options(skb, sk->mss,
- tp->sack_ok, tp->tstamp_ok,
- tp->wscale_ok,tp->rcv_wscale);
+ tp->tstamp_ok, tp->wscale_ok, tp->rcv_wscale);
skb->csum = 0;
th->doff = (sizeof(*th) + tmp)>>2;
@@ -880,31 +783,24 @@ int tcp_send_synack(struct sock *sk)
}
/*
- * Set up the timers for sending a delayed ack..
- *
- * rules for delaying an ack:
- * - delay time <= 0.5 HZ
- * - must send at least every 2 full sized packets
- * - we don't have a window update to send
+ * Send out a delayed ack, the caller does the policy checking
+ * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
+ * for details.
*/
-void tcp_send_delayed_ack(struct sock * sk, int max_timeout)
+void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout)
{
- struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- unsigned long timeout, now;
+ unsigned long timeout;
- /* Calculate new timeout. */
- now = jiffies;
+ /* Stay within the limit we were given */
timeout = tp->ato;
-
- if (timeout > max_timeout ||
- ((tp->rcv_nxt - tp->rcv_wup) > (sk->mss << 2)))
- timeout = now;
- else
- timeout += now;
+ if (timeout > max_timeout)
+ timeout = max_timeout;
+ timeout += jiffies;
/* Use new timeout only if there wasn't an older one earlier. */
- if (!del_timer(&tp->delack_timer) || timeout < tp->delack_timer.expires)
+ if ((!tp->delack_timer.prev || !del_timer(&tp->delack_timer)) ||
+ (timeout < tp->delack_timer.expires))
tp->delack_timer.expires = timeout;
add_timer(&tp->delack_timer);
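
The arming rule is now simply: clamp the adaptive ato to the caller's ceiling and never push an already-pending timer further into the future. A small sketch with a stand-in timer (hypothetical; the kernel uses del_timer/add_timer and the .prev test for pendingness):

#include <stdio.h>

struct simple_timer { int pending; unsigned long expires; };

static void arm_delack(struct simple_timer *t, unsigned long now,
		       unsigned long ato, unsigned long max_timeout)
{
	unsigned long timeout = (ato > max_timeout) ? max_timeout : ato;

	timeout += now;
	if (!t->pending || timeout < t->expires)
		t->expires = timeout;		/* only ever move the deadline earlier */
	t->pending = 1;
}

int main(void)
{
	struct simple_timer t = { 0, 0 };

	arm_delack(&t, 1000, 50, 25);		/* ato clamped to 25: expires at 1025 */
	arm_delack(&t, 1010, 5, 25);		/* earlier deadline wins: expires at 1015 */
	printf("delayed ACK fires at %lu\n", t.expires);
	return 0;
}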
@@ -928,8 +824,6 @@ void tcp_send_ack(struct sock *sk)
/* We need to grab some memory, and put together an ack,
* and then put it into the queue to be sent.
- * FIXME: is it better to waste memory here and use a
- * constant sized ACK?
*/
buff = sock_wmalloc(sk, BASE_ACK_SIZE + tp->tcp_header_len, 1, GFP_ATOMIC);
if (buff == NULL) {
@@ -938,7 +832,7 @@ void tcp_send_ack(struct sock *sk)
* bandwidth on slow links to send a spare ack than
* resend packets.
*/
- tcp_send_delayed_ack(sk, HZ/2);
+ tcp_send_delayed_ack(tp, HZ/2);
return;
}
@@ -956,22 +850,16 @@ void tcp_send_ack(struct sock *sk)
th = (struct tcphdr *)skb_put(buff,tp->tcp_header_len);
memcpy(th, &sk->dummy_th, sizeof(struct tcphdr));
- tcp_build_options((__u32 *)(th+1),tp);
/* Swap the send and the receive. */
th->window = ntohs(tcp_select_window(sk));
th->seq = ntohl(tp->snd_nxt);
tp->last_ack_sent = tp->rcv_nxt;
th->ack_seq = htonl(tp->rcv_nxt);
+ tcp_build_and_update_options((__u32 *)(th + 1), tp);
/* Fill in the packet and send it. */
tp->af_specific->send_check(sk, th, tp->tcp_header_len, buff);
-
-#if 0
- SOCK_DEBUG(sk, "\rtcp_send_ack: seq %x ack %x\n",
- tp->snd_nxt, tp->rcv_nxt);
-#endif
-
tp->af_specific->queue_xmit(buff);
tcp_statistics.TcpOutSegs++;
}
@@ -1017,6 +905,7 @@ void tcp_write_wakeup(struct sock *sk)
}
th = skb->h.th;
+ tcp_update_options((__u32 *)(th + 1), tp);
tp->af_specific->send_check(sk, th, th->doff * 4 + win_size, skb);
buff = skb_clone(skb, GFP_ATOMIC);
if (buff == NULL)
@@ -1047,25 +936,19 @@ void tcp_write_wakeup(struct sock *sk)
return;
}
- t1 = (struct tcphdr *) skb_put(buff, sizeof(struct tcphdr));
+ t1 = (struct tcphdr *) skb_put(buff, tp->tcp_header_len);
memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
- /* FIXME: should zero window probes have SACK and/or TIMESTAMP data?
- * If so we have to tack them on here.
- */
/* Use a previous sequence.
* This should cause the other end to send an ack.
*/
t1->seq = htonl(tp->snd_nxt-1);
-/* t1->fin = 0; -- We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */
t1->ack_seq = htonl(tp->rcv_nxt);
t1->window = htons(tcp_select_window(sk));
+ tcp_build_and_update_options((__u32 *)(t1 + 1), tp);
- /* Value from dummy_th may be larger. */
- t1->doff = sizeof(struct tcphdr)/4;
-
- tp->af_specific->send_check(sk, t1, sizeof(*t1), buff);
+ tp->af_specific->send_check(sk, t1, tp->tcp_header_len, buff);
}
/* Send it. */
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 76ccedab2..fdf8f50ec 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_timer.c,v 1.5 1998/03/03 01:23:44 ralf Exp $
+ * Version: $Id: tcp_timer.c,v 1.6 1998/03/17 22:18:35 ralf Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -31,6 +31,7 @@ int sysctl_tcp_retries2 = TCP_RETR2;
static void tcp_sltimer_handler(unsigned long);
static void tcp_syn_recv_timer(unsigned long);
static void tcp_keepalive(unsigned long data);
+static void tcp_bucketgc(unsigned long);
struct timer_list tcp_slow_timer = {
NULL, NULL,
@@ -41,7 +42,8 @@ struct timer_list tcp_slow_timer = {
struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = {
{ATOMIC_INIT(0), TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer},/* SYNACK */
- {ATOMIC_INIT(0), TCP_KEEPALIVE_PERIOD, 0, tcp_keepalive} /* KEEPALIVE */
+ {ATOMIC_INIT(0), TCP_KEEPALIVE_PERIOD, 0, tcp_keepalive}, /* KEEPALIVE */
+ {ATOMIC_INIT(0), TCP_BUCKETGC_PERIOD, 0, tcp_bucketgc} /* BUCKETGC */
};
const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
@@ -87,20 +89,24 @@ void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
* The delayed ack timer can be set if we are changing the
* retransmit timer when removing acked frames.
*/
- del_timer(&tp->probe_timer);
- del_timer(&tp->retransmit_timer);
+ if(tp->probe_timer.prev)
+ del_timer(&tp->probe_timer);
+ if(tp->retransmit_timer.prev)
+ del_timer(&tp->retransmit_timer);
tp->retransmit_timer.expires=jiffies+when;
add_timer(&tp->retransmit_timer);
break;
case TIME_DACK:
- del_timer(&tp->delack_timer);
+ if(tp->delack_timer.prev)
+ del_timer(&tp->delack_timer);
tp->delack_timer.expires=jiffies+when;
add_timer(&tp->delack_timer);
break;
case TIME_PROBE0:
- del_timer(&tp->probe_timer);
+ if(tp->probe_timer.prev)
+ del_timer(&tp->probe_timer);
tp->probe_timer.expires=jiffies+when;
add_timer(&tp->probe_timer);
break;
@@ -118,9 +124,12 @@ void tcp_clear_xmit_timers(struct sock *sk)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- del_timer(&tp->retransmit_timer);
- del_timer(&tp->delack_timer);
- del_timer(&tp->probe_timer);
+ if(tp->retransmit_timer.prev)
+ del_timer(&tp->retransmit_timer);
+ if(tp->delack_timer.prev)
+ del_timer(&tp->delack_timer);
+ if(tp->probe_timer.prev)
+ del_timer(&tp->probe_timer);
}
static int tcp_write_err(struct sock *sk, int force)
@@ -131,9 +140,8 @@ static int tcp_write_err(struct sock *sk, int force)
tcp_clear_xmit_timers(sk);
/* Time wait the socket. */
- if (!force && (1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING)) {
- tcp_set_state(sk,TCP_TIME_WAIT);
- tcp_reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
+ if (!force && ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING))) {
+ tcp_time_wait(sk);
} else {
/* Clean up time. */
tcp_set_state(sk, TCP_CLOSE);
@@ -173,9 +181,8 @@ static int tcp_write_timeout(struct sock *sk)
return 1;
}
-
-void tcp_delack_timer(unsigned long data) {
-
+void tcp_delack_timer(unsigned long data)
+{
struct sock *sk = (struct sock*)data;
if(sk->zapped)
@@ -185,8 +192,8 @@ void tcp_delack_timer(unsigned long data) {
tcp_read_wakeup(sk);
}
-void tcp_probe_timer(unsigned long data) {
-
+void tcp_probe_timer(unsigned long data)
+{
struct sock *sk = (struct sock*)data;
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
@@ -212,10 +219,9 @@ void tcp_probe_timer(unsigned long data) {
sk->err = ETIMEDOUT;
sk->error_report(sk);
- /* Time wait the socket. */
if ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING)) {
- tcp_set_state(sk, TCP_TIME_WAIT);
- tcp_reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
+ /* Time wait the socket. */
+ tcp_time_wait(sk);
} else {
/* Clean up time. */
tcp_set_state(sk, TCP_CLOSE);
@@ -252,6 +258,35 @@ static __inline__ int tcp_keepopen_proc(struct sock *sk)
return res;
}
+/* Garbage collect TCP bind buckets. */
+static void tcp_bucketgc(unsigned long __unused)
+{
+ int i;
+
+ for(i = 0; i < TCP_BHTABLE_SIZE; i++) {
+ struct tcp_bind_bucket *tb = tcp_bound_hash[i];
+
+ while(tb) {
+ struct tcp_bind_bucket *next = tb->next;
+
+ if((tb->owners == NULL) &&
+ !(tb->flags & TCPB_FLAG_LOCKED)) {
+ /* Eat timer reference. */
+ tcp_dec_slow_timer(TCP_SLT_BUCKETGC);
+
+ /* Unlink bucket. */
+ if(tb->next)
+ tb->next->pprev = tb->pprev;
+ *tb->pprev = tb->next;
+
+ /* Finally, free it up. */
+ kmem_cache_free(tcp_bucket_cachep, tb);
+ }
+ tb = next;
+ }
+ }
+}
+
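
The collector above relies on the pprev back-pointer so an unused bucket can be unlinked without knowing whether it sits at the head of its chain or in the middle. A self-contained sketch of that unlink, with a hypothetical bucket type where in_use stands in for the owners list and the LOCKED flag:

#include <stdio.h>
#include <stdlib.h>

struct bucket {
	int port;
	int in_use;
	struct bucket *next;
	struct bucket **pprev;		/* points at whatever points at us */
};

/* Sweep one chain: unlink and free every unused bucket via its pprev
 * back-pointer, saving the next pointer before the node disappears. */
static void sweep(struct bucket **head)
{
	struct bucket *tb = *head;

	while (tb) {
		struct bucket *next = tb->next;

		if (!tb->in_use) {
			if (tb->next)
				tb->next->pprev = tb->pprev;
			*tb->pprev = tb->next;
			free(tb);
		}
		tb = next;
	}
}

static void push(struct bucket **head, int port, int in_use)
{
	struct bucket *tb = malloc(sizeof(*tb));

	tb->port = port;
	tb->in_use = in_use;
	if ((tb->next = *head) != NULL)
		(*head)->pprev = &tb->next;
	*head = tb;
	tb->pprev = head;
}

int main(void)
{
	struct bucket *chain = NULL;

	push(&chain, 80, 1);
	push(&chain, 8080, 0);		/* empty bucket, will be collected */
	push(&chain, 443, 1);
	sweep(&chain);
	for (; chain; chain = chain->next)
		printf("kept port %d\n", chain->port);
	return 0;
}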
/*
* Check all sockets for keepalive timer
* Called every 75 seconds
diff --git a/net/ipv4/timer.c b/net/ipv4/timer.c
index fe02b3f4c..79ae3309e 100644
--- a/net/ipv4/timer.c
+++ b/net/ipv4/timer.c
@@ -5,7 +5,7 @@
*
* TIMER - implementation of software timers for IP.
*
- * Version: $Id: timer.c,v 1.7 1997/09/17 18:50:26 freitag Exp $
+ * Version: $Id: timer.c,v 1.2 1997/12/16 05:37:48 ralf Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -52,76 +52,52 @@
void net_delete_timer (struct sock *t)
{
- unsigned long flags;
-
- save_flags (flags);
- cli();
-
+ if(t->timer.prev)
+ del_timer (&t->timer);
t->timeout = 0;
- del_timer (&t->timer);
-
- restore_flags (flags);
}
void net_reset_timer (struct sock *t, int timeout, unsigned long len)
{
net_delete_timer (t);
t->timeout = timeout;
-#if 1
- /* FIXME: ??? */
- if ((int) len < 0) /* prevent close to infinite timers. THEY _DO_ */
- len = 3; /* happen (negative values ?) - don't ask me why ! -FB */
-#endif
t->timer.expires = jiffies+len;
add_timer (&t->timer);
}
-
-/*
- * Now we will only be called whenever we need to do
- * something, but we must be sure to process all of the
- * sockets that need it.
+/* Now we will only be called whenever we need to do
+ * something, but we must be sure to process all of the
+ * sockets that need it.
*/
-
void net_timer (unsigned long data)
{
struct sock *sk = (struct sock*)data;
int why = sk->timeout;
- /*
- * only process if socket is not in use
- */
-
- if (sk->sock_readers)
- {
+ /* Only process if socket is not in use. */
+ if (sk->sock_readers) {
sk->timer.expires = jiffies+HZ;
add_timer(&sk->timer);
- sti();
return;
}
/* Always see if we need to send an ack. */
-
- if (sk->ack_backlog && !sk->zapped)
- {
+ if (sk->tp_pinfo.af_tcp.delayed_acks && !sk->zapped) {
sk->prot->read_wakeup (sk);
- if (! sk->dead)
- sk->data_ready(sk,0);
+ if (!sk->dead)
+ sk->data_ready(sk,0);
}
/* Now we need to figure out why the socket was on the timer. */
-
- switch (why)
- {
+ switch (why) {
case TIME_DONE:
- /* If the socket hasn't been closed off, re-try a bit later */
+ /* If the socket hasn't been closed off, re-try a bit later. */
if (!sk->dead) {
net_reset_timer(sk, TIME_DONE, TCP_DONE_TIME);
break;
}
- if (sk->state != TCP_CLOSE)
- {
+ if (sk->state != TCP_CLOSE) {
printk (KERN_DEBUG "non CLOSE socket in time_done\n");
break;
}
@@ -129,11 +105,9 @@ void net_timer (unsigned long data)
break;
case TIME_DESTROY:
- /*
- * We've waited for a while for all the memory associated with
- * the socket to be freed.
- */
-
+ /* We've waited for a while for all the memory associated with
+ * the socket to be freed.
+ */
destroy_sock(sk);
break;
@@ -148,7 +122,8 @@ void net_timer (unsigned long data)
break;
default:
- printk (KERN_DEBUG "net_timer: timer expired - reason %d is unknown\n", why);
+ /* I want to see these... */
+ printk ("net_timer: timer expired - reason %d is unknown\n", why);
break;
}
}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f355caa85..6ba50b280 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -5,7 +5,7 @@
*
* The User Datagram Protocol (UDP).
*
- * Version: $Id: udp.c,v 1.3 1998/03/03 01:23:44 ralf Exp $
+ * Version: $Id: udp.c,v 1.4 1998/03/17 22:18:36 ralf Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -828,7 +828,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
* of this packet since that is all
* that will be read.
*/
- amount = skb->tail - skb->h.raw;
+ amount = skb->len - sizeof(struct udphdr);
}
return put_user(amount, (int *)arg);
}
@@ -1033,17 +1033,18 @@ static inline void udp_deliver(struct sock *sk, struct sk_buff *skb)
/*
* Multicasts and broadcasts go to each listener.
+ *
+ * Note: called only from the BH handler context,
+ * so we don't need to lock the hashes.
*/
static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
u32 saddr, u32 daddr)
{
struct sock *sk;
- int given = 0;
- SOCKHASH_LOCK();
sk = udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)];
sk = udp_v4_mcast_next(sk, uh->dest, saddr, uh->source, daddr);
- if(sk) {
+ if (sk) {
struct sock *sknext = NULL;
do {
@@ -1058,10 +1059,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
udp_deliver(sk, skb1);
sk = sknext;
} while(sknext);
- given = 1;
- }
- SOCKHASH_UNLOCK();
- if(!given)
+ } else
kfree_skb(skb);
return 0;
}