summaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c3
-rw-r--r--net/ipv4/arp.c5
-rw-r--r--net/ipv4/fib.c8
-rw-r--r--net/ipv4/icmp.c9
-rw-r--r--net/ipv4/ip_alias.c3
-rw-r--r--net/ipv4/ip_fragment.c451
-rw-r--r--net/ipv4/ip_fw.c3
-rw-r--r--net/ipv4/ip_masq.c3
-rw-r--r--net/ipv4/ip_masq_app.c3
-rw-r--r--net/ipv4/ip_masq_ftp.c3
-rw-r--r--net/ipv4/ip_masq_irc.c3
-rw-r--r--net/ipv4/ip_masq_quake.c3
-rw-r--r--net/ipv4/ip_masq_raudio.c5
-rw-r--r--net/ipv4/ip_options.c2
-rw-r--r--net/ipv4/ip_output.c18
-rw-r--r--net/ipv4/ip_sockglue.c18
-rw-r--r--net/ipv4/ipmr.c3
-rw-r--r--net/ipv4/rarp.c5
-rw-r--r--net/ipv4/route.c3
-rw-r--r--net/ipv4/sysctl_net_ipv4.c15
-rw-r--r--net/ipv4/tcp.c16
-rw-r--r--net/ipv4/tcp_input.c41
-rw-r--r--net/ipv4/tcp_ipv4.c111
-rw-r--r--net/ipv4/tcp_output.c146
-rw-r--r--net/ipv4/tcp_timer.c4
-rw-r--r--net/ipv4/udp.c52
-rw-r--r--net/ipv4/utils.c4
27 files changed, 525 insertions, 415 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d96910bb0..a3a126529 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -76,6 +76,7 @@
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
+#include <linux/init.h>
#include <asm/uaccess.h>
#include <asm/system.h>
@@ -1063,7 +1064,7 @@ extern void tcp_init(void);
* Called by socket.c on kernel startup.
*/
-void inet_proto_init(struct net_proto *pro)
+__initfunc(void inet_proto_init(struct net_proto *pro))
{
struct sk_buff *dummy_skb;
struct inet_protocol *p;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 8ef0be2af..ebf2c6c6b 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -90,6 +90,7 @@
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
+#include <linux/init.h>
#include <net/ip.h>
#include <net/icmp.h>
@@ -378,7 +379,7 @@ static void arp_neigh_destroy(struct neighbour *neigh)
extern atomic_t hh_count;
atomic_dec(&hh_count);
#endif
- kfree_s(hh, sizeof(struct(struct hh_cache)));
+ kfree_s(hh, sizeof(struct hh_cache));
}
}
}
@@ -1976,7 +1977,7 @@ static struct proc_dir_entry proc_net_arp = {
};
#endif
-void arp_init (void)
+__initfunc(void arp_init (void))
{
dev_add_pack(&arp_packet_type);
/* Start with the regular checks for expired arp entries. */
diff --git a/net/ipv4/fib.c b/net/ipv4/fib.c
index c2182728c..b25187a20 100644
--- a/net/ipv4/fib.c
+++ b/net/ipv4/fib.c
@@ -42,6 +42,7 @@
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
+#include <linux/init.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -1646,16 +1647,21 @@ int ip_rt_ioctl(unsigned int cmd, void *arg)
{
case SIOCADDRT: /* Add a route */
case SIOCDELRT: /* Delete a route */
+printk("ip_rt_ioctl() #1\n");
if (!suser())
return -EPERM;
+printk("ip_rt_ioctl() #2\n");
err = get_rt_from_user(&m.rtmsg, arg);
if (err)
return err;
+printk("ip_rt_ioctl() #3\n");
fib_lock();
+printk("ip_rt_ioctl() #4\n");
dummy_nlh.nlmsg_type = cmd == SIOCDELRT ? RTMSG_DELROUTE
: RTMSG_NEWROUTE;
err = rtmsg_process(&dummy_nlh, &m.rtmsg);
fib_unlock();
+printk("ip_rt_ioctl() #5: err == %d\n", err);
return err;
case SIOCRTMSG:
if (!suser())
@@ -2020,7 +2026,7 @@ int ip_rt_event(int event, struct device *dev)
}
-void ip_fib_init()
+__initfunc(void ip_fib_init(void))
{
struct in_rtrulemsg r;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 6b697d001..79bf058c5 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -256,6 +256,7 @@
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
+#include <linux/init.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <net/checksum.h>
@@ -373,7 +374,7 @@ struct socket *icmp_socket=&icmp_inode.u.socket_i;
#ifndef CONFIG_NO_ICMP_LIMIT
-static void xrlim_init(void)
+__initfunc(static void xrlim_init(void))
{
int type, entry;
struct icmp_xrlim *xr;
@@ -1020,7 +1021,7 @@ int icmp_chkaddr(struct sk_buff *skb)
{
struct tcphdr *th = (struct tcphdr *)(((unsigned char *)iph)+(iph->ihl<<2));
- sk = tcp_v4_lookup(iph->saddr, th->source, iph->daddr, th->dest);
+ sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source);
if (!sk) return 0;
if (sk->saddr != iph->saddr) return 0;
if (sk->daddr != iph->daddr) return 0;
@@ -1034,7 +1035,7 @@ int icmp_chkaddr(struct sk_buff *skb)
{
struct udphdr *uh = (struct udphdr *)(((unsigned char *)iph)+(iph->ihl<<2));
- sk = udp_v4_lookup(iph->saddr, uh->source, iph->daddr, uh->dest);
+ sk = udp_v4_lookup(iph->daddr, uh->dest, iph->saddr, uh->source);
if (!sk) return 0;
if (sk->saddr != iph->saddr && __ip_chk_addr(iph->saddr) != IS_MYADDR)
return 0;
@@ -1167,7 +1168,7 @@ static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1] = {
{ &icmp_statistics.IcmpOutAddrMaskReps, &icmp_statistics.IcmpInAddrMaskReps, icmp_address_reply, 0, NULL }
};
-void icmp_init(struct net_proto_family *ops)
+__initfunc(void icmp_init(struct net_proto_family *ops))
{
int err;
diff --git a/net/ipv4/ip_alias.c b/net/ipv4/ip_alias.c
index 74ff42a74..a78eef17a 100644
--- a/net/ipv4/ip_alias.c
+++ b/net/ipv4/ip_alias.c
@@ -26,6 +26,7 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/route.h>
+#include <linux/init.h>
#include <net/route.h>
#ifdef ALIAS_USER_LAND_DEBUG
@@ -137,7 +138,7 @@ struct net_alias_type ip_alias_type =
* ip_alias module initialization
*/
-int ip_alias_init(void)
+__initfunc(int ip_alias_init(void))
{
return register_net_alias_type(&ip_alias_type, AF_INET);
}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index bf549b047..290f871a1 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -5,11 +5,15 @@
*
* The IP fragmentation functionality.
*
+ * Version: $Id: ip_fragment.c,v 1.22 1997/05/17 05:21:56 freitag Exp $
+ *
* Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
* Alan Cox <Alan.Cox@linux.org>
*
* Fixes:
* Alan Cox : Split from ip.c , see ip_input.c for history.
+ * David S. Miller : Begin massive cleanup...
+ * Andi Kleen : Add sysctls.
*/
#include <linux/types.h>
@@ -29,31 +33,49 @@
#include <linux/ip_fw.h>
#include <net/checksum.h>
-/*
- * Fragment cache limits. We will commit 256K at one time. Should we
- * cross that limit we will prune down to 192K. This should cope with
- * even the most extreme cases without allowing an attacker to measurably
- * harm machine performance.
- */
-
-#define IPFRAG_HIGH_THRESH (256*1024)
-#define IPFRAG_LOW_THRESH (192*1024)
-
-/*
- * This fragment handler is a bit of a heap. On the other hand it works quite
- * happily and handles things quite well.
+/* Fragment cache limits. We will commit 256K at one time. Should we
+ * cross that limit we will prune down to 192K. This should cope with
+ * even the most extreme cases without allowing an attacker to measurably
+ * harm machine performance.
*/
-
-static struct ipq *ipqueue = NULL; /* IP fragment queue */
+int sysctl_ipfrag_high_thresh = 256*1024;
+int sysctl_ipfrag_low_thresh = 192*1024;
+
+/* Describe an IP fragment. */
+struct ipfrag {
+ int offset; /* offset of fragment in IP datagram */
+ int end; /* last byte of data in datagram */
+ int len; /* length of this fragment */
+ struct sk_buff *skb; /* complete received fragment */
+ unsigned char *ptr; /* pointer into real fragment data */
+ struct ipfrag *next; /* linked list pointers */
+ struct ipfrag *prev;
+};
+
+/* Describe an entry in the "incomplete datagrams" queue. */
+struct ipq {
+ struct iphdr *iph; /* pointer to IP header */
+ struct ipq *next; /* linked list pointers */
+ struct ipfrag *fragments; /* linked list of received fragments */
+ int len; /* total length of original datagram */
+ short ihlen; /* length of the IP header */
+ struct timer_list timer; /* when will this queue expire? */
+ struct ipq **pprev;
+ struct device *dev; /* Device - for icmp replies */
+};
+
+#define IPQ_HASHSZ 64
+
+struct ipq *ipq_hash[IPQ_HASHSZ];
+
+#define ipqhashfn(id, saddr, daddr, prot) \
+ ((((id) >> 1) ^ (saddr) ^ (daddr) ^ (prot)) & (IPQ_HASHSZ - 1))
atomic_t ip_frag_mem = ATOMIC_INIT(0); /* Memory used for fragments */
-char *in_ntoa(unsigned long in);
+char *in_ntoa(__u32 in);
-/*
- * Memory Tracking Functions
- */
-
+/* Memory Tracking Functions. */
extern __inline__ void frag_kfree_skb(struct sk_buff *skb, int type)
{
atomic_sub(skb->truesize, &ip_frag_mem);
@@ -69,28 +91,24 @@ extern __inline__ void frag_kfree_s(void *ptr, int len)
extern __inline__ void *frag_kmalloc(int size, int pri)
{
void *vp=kmalloc(size,pri);
+
if(!vp)
return NULL;
atomic_add(size, &ip_frag_mem);
return vp;
}
-/*
- * Create a new fragment entry.
- */
-
-static struct ipfrag *ip_frag_create(int offset, int end, struct sk_buff *skb, unsigned char *ptr)
+/* Create a new fragment entry. */
+static struct ipfrag *ip_frag_create(int offset, int end,
+ struct sk_buff *skb, unsigned char *ptr)
{
struct ipfrag *fp;
- unsigned long flags;
fp = (struct ipfrag *) frag_kmalloc(sizeof(struct ipfrag), GFP_ATOMIC);
- if (fp == NULL)
- {
+ if (fp == NULL) {
NETDEBUG(printk(KERN_ERR "IP: frag_create: no memory left !\n"));
return(NULL);
}
- memset(fp, 0, sizeof(struct ipfrag));
/* Fill in the structure. */
fp->offset = offset;
@@ -98,85 +116,63 @@ static struct ipfrag *ip_frag_create(int offset, int end, struct sk_buff *skb, u
fp->len = end - offset;
fp->skb = skb;
fp->ptr = ptr;
+ fp->next = fp->prev = NULL;
- /*
- * Charge for the SKB as well.
- */
-
- save_flags(flags);
- cli();
+ /* Charge for the SKB as well. */
atomic_add(skb->truesize, &ip_frag_mem);
- restore_flags(flags);
return(fp);
}
-
-/*
- * Find the correct entry in the "incomplete datagrams" queue for
- * this IP datagram, and return the queue entry address if found.
+/* Find the correct entry in the "incomplete datagrams" queue for
+ * this IP datagram, and return the queue entry address if found.
*/
-
-static struct ipq *ip_find(struct iphdr *iph)
+static inline struct ipq *ip_find(struct iphdr *iph)
{
+ __u16 id = iph->id;
+ __u32 saddr = iph->saddr;
+ __u32 daddr = iph->daddr;
+ __u8 protocol = iph->protocol;
+ unsigned int hash = ipqhashfn(id, saddr, daddr, protocol);
struct ipq *qp;
- struct ipq *qplast;
-
- cli();
- qplast = NULL;
- for(qp = ipqueue; qp != NULL; qplast = qp, qp = qp->next)
- {
- if (iph->id== qp->iph->id && iph->saddr == qp->iph->saddr &&
- iph->daddr == qp->iph->daddr && iph->protocol == qp->iph->protocol)
- {
- del_timer(&qp->timer); /* So it doesn't vanish on us. The timer will be reset anyway */
- sti();
- return(qp);
+
+ start_bh_atomic();
+ for(qp = ipq_hash[hash]; qp; qp = qp->next) {
+ if(qp->iph->id == id &&
+ qp->iph->saddr == saddr &&
+ qp->iph->daddr == daddr &&
+ qp->iph->protocol == protocol) {
+ del_timer(&qp->timer);
+ break;
}
}
- sti();
- return(NULL);
+ end_bh_atomic();
+ return qp;
}
-
-/*
- * Remove an entry from the "incomplete datagrams" queue, either
- * because we completed, reassembled and processed it, or because
- * it timed out.
+/* Remove an entry from the "incomplete datagrams" queue, either
+ * because we completed, reassembled and processed it, or because
+ * it timed out.
*/
-
static void ip_free(struct ipq *qp)
{
struct ipfrag *fp;
- struct ipfrag *xp;
-
- /*
- * Stop the timer for this entry.
- */
+ /* Stop the timer for this entry. */
del_timer(&qp->timer);
/* Remove this entry from the "incomplete datagrams" queue. */
- cli();
- if (qp->prev == NULL)
- {
- ipqueue = qp->next;
- if (ipqueue != NULL)
- ipqueue->prev = NULL;
- }
- else
- {
- qp->prev->next = qp->next;
- if (qp->next != NULL)
- qp->next->prev = qp->prev;
- }
+ start_bh_atomic();
+ if(qp->next)
+ qp->next->pprev = qp->pprev;
+ *qp->pprev = qp->next;
+ end_bh_atomic();
/* Release all fragment data. */
-
fp = qp->fragments;
- while (fp != NULL)
- {
- xp = fp->next;
+ while (fp) {
+ struct ipfrag *xp = fp->next;
+
frag_kfree_skb(fp->skb,FREE_READ);
frag_kfree_s(fp, sizeof(struct ipfrag));
fp = xp;
@@ -187,83 +183,65 @@ static void ip_free(struct ipq *qp)
/* Finally, release the queue descriptor itself. */
frag_kfree_s(qp, sizeof(struct ipq));
- sti();
}
-
-/*
- * Oops- a fragment queue timed out. Kill it and send an ICMP reply.
- */
-
+/* Oops, a fragment queue timed out. Kill it and send an ICMP reply. */
static void ip_expire(unsigned long arg)
{
- struct ipq *qp;
-
- qp = (struct ipq *)arg;
-
- /*
- * Send an ICMP "Fragment Reassembly Timeout" message.
- */
+ struct ipq *qp = (struct ipq *) arg;
+ /* Send an ICMP "Fragment Reassembly Timeout" message. */
ip_statistics.IpReasmTimeout++;
ip_statistics.IpReasmFails++;
- /* This if is always true... shrug */
- if(qp->fragments!=NULL)
- icmp_send(qp->fragments->skb,ICMP_TIME_EXCEEDED,
- ICMP_EXC_FRAGTIME, 0);
+ icmp_send(qp->fragments->skb,ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
- /*
- * Nuke the fragment queue.
- */
+ /* Nuke the fragment queue. */
ip_free(qp);
}
-/*
- * Memory limiting on fragments. Evictor trashes the oldest
- * fragment queue until we are back under the low threshold
+/* Memory limiting on fragments. Evictor trashes the oldest
+ * fragment queue until we are back under the low threshold.
*/
-
static void ip_evictor(void)
{
- while(atomic_read(&ip_frag_mem)>IPFRAG_LOW_THRESH)
- {
- if(!ipqueue)
+ while(atomic_read(&ip_frag_mem)>sysctl_ipfrag_low_thresh) {
+ int i;
+
+ /* FIXME: Make LRU queue of frag heads. -DaveM */
+ for(i = 0; i < IPQ_HASHSZ; i++)
+ if(ipq_hash[i])
+ break;
+ if(i >= IPQ_HASHSZ)
panic("ip_evictor: memcount");
- ip_free(ipqueue);
+ ip_free(ipq_hash[i]);
}
}
-/*
- * Add an entry to the 'ipq' queue for a newly received IP datagram.
- * We will (hopefully :-) receive all other fragments of this datagram
- * in time, so we just create a queue for this datagram, in which we
- * will insert the received fragments at their respective positions.
+/* Add an entry to the 'ipq' queue for a newly received IP datagram.
+ * We will (hopefully :-) receive all other fragments of this datagram
+ * in time, so we just create a queue for this datagram, in which we
+ * will insert the received fragments at their respective positions.
*/
-
static struct ipq *ip_create(struct sk_buff *skb, struct iphdr *iph)
{
struct ipq *qp;
+ unsigned int hash;
int ihlen;
qp = (struct ipq *) frag_kmalloc(sizeof(struct ipq), GFP_ATOMIC);
- if (qp == NULL)
- {
+ if (qp == NULL) {
NETDEBUG(printk(KERN_ERR "IP: create: no memory left !\n"));
return(NULL);
}
- memset(qp, 0, sizeof(struct ipq));
-
- /*
- * Allocate memory for the IP header (plus 8 octets for ICMP).
- */
+ /* Allocate memory for the IP header (plus 8 octets for ICMP). */
ihlen = iph->ihl * 4;
+
qp->iph = (struct iphdr *) frag_kmalloc(64 + 8, GFP_ATOMIC);
- if (qp->iph == NULL)
- {
+ if (qp->iph == NULL) {
NETDEBUG(printk(KERN_ERR "IP: create: no memory left !\n"));
frag_kfree_s(qp, sizeof(struct ipq));
- return(NULL);
+ return NULL;
}
memcpy(qp->iph, iph, ihlen + 8);
@@ -279,21 +257,19 @@ static struct ipq *ip_create(struct sk_buff *skb, struct iphdr *iph)
add_timer(&qp->timer);
/* Add this entry to the queue. */
- qp->prev = NULL;
- cli();
- qp->next = ipqueue;
- if (qp->next != NULL)
- qp->next->prev = qp;
- ipqueue = qp;
- sti();
- return(qp);
-}
+ hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
+ start_bh_atomic();
+ if((qp->next = ipq_hash[hash]) != NULL)
+ qp->next->pprev = &qp->next;
+ ipq_hash[hash] = qp;
+ qp->pprev = &ipq_hash[hash];
+ end_bh_atomic();
-/*
- * See if a fragment queue is complete.
- */
+ return qp;
+}
+/* See if a fragment queue is complete. */
static int ip_done(struct ipq *qp)
{
struct ipfrag *fp;
@@ -301,13 +277,12 @@ static int ip_done(struct ipq *qp)
/* Only possible if we received the final fragment. */
if (qp->len == 0)
- return(0);
+ return 0;
/* Check all fragment offsets to see if they connect. */
fp = qp->fragments;
offset = 0;
- while (fp != NULL)
- {
+ while (fp) {
if (fp->offset > offset)
return(0); /* fragment(s) missing */
offset = fp->end;
@@ -315,18 +290,15 @@ static int ip_done(struct ipq *qp)
}
/* All fragments are present. */
- return(1);
+ return 1;
}
-
-/*
- * Build a new IP datagram from all its fragments.
+/* Build a new IP datagram from all its fragments.
*
- * FIXME: We copy here because we lack an effective way of handling lists
- * of bits on input. Until the new skb data handling is in I'm not going
- * to touch this with a bargepole.
+ * FIXME: We copy here because we lack an effective way of handling lists
+ * of bits on input. Until the new skb data handling is in I'm not going
+ * to touch this with a bargepole.
*/
-
static struct sk_buff *ip_glue(struct ipq *qp)
{
struct sk_buff *skb;
@@ -335,25 +307,23 @@ static struct sk_buff *ip_glue(struct ipq *qp)
unsigned char *ptr;
int count, len;
- /*
- * Allocate a new buffer for the datagram.
- */
+ /* Allocate a new buffer for the datagram. */
len = qp->ihlen + qp->len;
- if(len>65535)
- {
- printk(KERN_INFO "Oversized IP packet from %s.\n", in_ntoa(qp->iph->saddr));
+ if(len>65535) {
+ printk(KERN_INFO "Oversized IP packet from %s.\n",
+ in_ntoa(qp->iph->saddr));
ip_statistics.IpReasmFails++;
ip_free(qp);
return NULL;
}
- if ((skb = dev_alloc_skb(len)) == NULL)
- {
+ if ((skb = dev_alloc_skb(len)) == NULL) {
ip_statistics.IpReasmFails++;
- NETDEBUG(printk(KERN_ERR "IP: queue_glue: no memory for gluing queue %p\n", qp));
+ NETDEBUG(printk(KERN_ERR "IP: queue_glue: no memory for gluing "
+ "queue %p\n", qp));
ip_free(qp);
- return(NULL);
+ return NULL;
}
/* Fill in the basic details. */
@@ -368,11 +338,10 @@ static struct sk_buff *ip_glue(struct ipq *qp)
/* Copy the data portions of all fragments into the new buffer. */
fp = qp->fragments;
- while(fp != NULL)
- {
- if(count+fp->len > skb->len)
- {
- NETDEBUG(printk(KERN_ERR "Invalid fragment list: Fragment over size.\n"));
+ while(fp) {
+ if(count+fp->len > skb->len) {
+ NETDEBUG(printk(KERN_ERR "Invalid fragment list: "
+ "Fragment over size.\n"));
ip_free(qp);
kfree_skb(skb,FREE_WRITE);
ip_statistics.IpReasmFails++;
@@ -396,14 +365,10 @@ static struct sk_buff *ip_glue(struct ipq *qp)
iph->tot_len = htons((iph->ihl * 4) + count);
ip_statistics.IpReasmOKs++;
- return(skb);
+ return skb;
}
-
-/*
- * Process an incoming IP datagram fragment.
- */
-
+/* Process an incoming IP datagram fragment. */
struct sk_buff *ip_defrag(struct sk_buff *skb)
{
struct iphdr *iph = skb->nh.iph;
@@ -417,45 +382,37 @@ struct sk_buff *ip_defrag(struct sk_buff *skb)
ip_statistics.IpReasmReqds++;
- /*
- * Start by cleaning up the memory
- */
-
- if(atomic_read(&ip_frag_mem)>IPFRAG_HIGH_THRESH)
+ /* Start by cleaning up the memory. */
+ if(atomic_read(&ip_frag_mem)>sysctl_ipfrag_high_thresh)
ip_evictor();
- /*
- * Find the entry of this IP datagram in the "incomplete datagrams" queue.
- */
-
+
+ /* Find the entry of this IP datagram in the "incomplete datagrams" queue. */
qp = ip_find(iph);
/* Is this a non-fragmented datagram? */
offset = ntohs(iph->frag_off);
flags = offset & ~IP_OFFSET;
offset &= IP_OFFSET;
- if (((flags & IP_MF) == 0) && (offset == 0))
- {
- if (qp != NULL)
- ip_free(qp); /* Fragmented frame replaced by full unfragmented copy */
- return(skb);
+ if (((flags & IP_MF) == 0) && (offset == 0)) {
+ if (qp != NULL) {
+ /* Fragmented frame replaced by full unfragmented copy. */
+ ip_free(qp);
+ }
+ return skb;
}
offset <<= 3; /* offset is in 8-byte chunks */
ihl = iph->ihl * 4;
- /*
- * If the queue already existed, keep restarting its timer as long
+ /* If the queue already existed, keep restarting its timer as long
* as we still are receiving fragments. Otherwise, create a fresh
* queue entry.
*/
-
- if (qp != NULL)
- {
+ if (qp) {
/* ANK. If the first fragment is received,
* we should remember the correct IP header (with options)
*/
- if (offset == 0)
- {
+ if (offset == 0) {
qp->ihlen = ihl;
memcpy(qp->iph, iph, ihl+8);
}
@@ -464,84 +421,59 @@ struct sk_buff *ip_defrag(struct sk_buff *skb)
qp->timer.data = (unsigned long) qp; /* pointer to queue */
qp->timer.function = ip_expire; /* expire function */
add_timer(&qp->timer);
- }
- else
- {
- /*
- * If we failed to create it, then discard the frame
- */
- if ((qp = ip_create(skb, iph)) == NULL)
- {
+ } else {
+ /* If we failed to create it, then discard the frame. */
+ if ((qp = ip_create(skb, iph)) == NULL) {
kfree_skb(skb, FREE_READ);
ip_statistics.IpReasmFails++;
return NULL;
}
}
- /*
- * Attempt to construct an oversize packet.
- */
-
- if(ntohs(iph->tot_len)+(int)offset>65535)
- {
- printk(KERN_INFO "Oversized packet received from %s\n",in_ntoa(iph->saddr));
+ /* Attempt to construct an oversize packet. */
+ if(ntohs(iph->tot_len)+(int)offset>65535) {
+ printk(KERN_INFO "Oversized packet received from %s\n",
+ in_ntoa(iph->saddr));
frag_kfree_skb(skb, FREE_READ);
ip_statistics.IpReasmFails++;
return NULL;
}
- /*
- * Determine the position of this fragment.
- */
-
+ /* Determine the position of this fragment. */
end = offset + ntohs(iph->tot_len) - ihl;
- /*
- * Point into the IP datagram 'data' part.
- */
-
+ /* Point into the IP datagram 'data' part. */
ptr = skb->data + ihl;
- /*
- * Is this the final fragment?
- */
-
+ /* Is this the final fragment? */
if ((flags & IP_MF) == 0)
qp->len = end;
- /*
- * Find out which fragments are in front and at the back of us
- * in the chain of fragments so far. We must know where to put
- * this fragment, right?
+ /* Find out which fragments are in front and at the back of us
+ * in the chain of fragments so far. We must know where to put
+ * this fragment, right?
*/
-
prev = NULL;
- for(next = qp->fragments; next != NULL; next = next->next)
- {
- if (next->offset > offset)
+ for(next = qp->fragments; next != NULL; next = next->next) {
+ if (next->offset >= offset)
break; /* bingo! */
prev = next;
}
- /*
- * We found where to put this one.
- * Check for overlap with preceding fragment, and, if needed,
- * align things so that any overlaps are eliminated.
+ /* We found where to put this one. Check for overlap with
+ * preceding fragment, and, if needed, align things so that
+ * any overlaps are eliminated.
*/
- if (prev != NULL && offset < prev->end)
- {
+ if (prev != NULL && offset < prev->end) {
i = prev->end - offset;
offset += i; /* ptr into datagram */
ptr += i; /* ptr into fragment data */
}
- /*
- * Look for overlap with succeeding segments.
+ /* Look for overlap with succeeding segments.
* If we can merge fragments, do it.
*/
-
- for(tmp=next; tmp != NULL; tmp = tfp)
- {
+ for(tmp=next; tmp != NULL; tmp = tfp) {
tfp = tmp->next;
if (tmp->offset >= end)
break; /* no overlaps at all */
@@ -550,12 +482,11 @@ struct sk_buff *ip_defrag(struct sk_buff *skb)
tmp->len -= i; /* so reduce size of */
tmp->offset += i; /* next fragment */
tmp->ptr += i;
- /*
- * If we get a frag size of <= 0, remove it and the packet
- * that it goes with.
+
+ /* If we get a frag size of <= 0, remove it and the packet
+ * that it goes with.
*/
- if (tmp->len <= 0)
- {
+ if (tmp->len <= 0) {
if (tmp->prev != NULL)
tmp->prev->next = tmp->next;
else
@@ -564,26 +495,20 @@ struct sk_buff *ip_defrag(struct sk_buff *skb)
if (tmp->next != NULL)
tmp->next->prev = tmp->prev;
- next=tfp; /* We have killed the original next frame */
+ /* We have killed the original next frame. */
+ next = tfp;
frag_kfree_skb(tmp->skb,FREE_READ);
frag_kfree_s(tmp, sizeof(struct ipfrag));
}
}
- /*
- * Insert this fragment in the chain of fragments.
- */
-
+ /* Insert this fragment in the chain of fragments. */
tfp = NULL;
tfp = ip_frag_create(offset, end, skb, ptr);
- /*
- * No memory to save the fragment - so throw the lot
- */
-
- if (!tfp)
- {
+ /* No memory to save the fragment - so throw the lot. */
+ if (!tfp) {
frag_kfree_skb(skb, FREE_READ);
return NULL;
}
@@ -597,16 +522,14 @@ struct sk_buff *ip_defrag(struct sk_buff *skb)
if (next != NULL)
next->prev = tfp;
- /*
- * OK, so we inserted this new fragment into the chain.
- * Check if we now have a full IP datagram which we can
- * bump up to the IP layer...
+ /* OK, so we inserted this new fragment into the chain.
+ * Check if we now have a full IP datagram which we can
+ * bump up to the IP layer...
*/
-
- if (ip_done(qp))
- {
- skb2 = ip_glue(qp); /* glue together the fragments */
+ if (ip_done(qp)) {
+ /* Glue together the fragments. */
+ skb2 = ip_glue(qp);
return(skb2);
}
- return(NULL);
+ return NULL;
}
diff --git a/net/ipv4/ip_fw.c b/net/ipv4/ip_fw.c
index e516a2baa..ea9fe48b0 100644
--- a/net/ipv4/ip_fw.c
+++ b/net/ipv4/ip_fw.c
@@ -107,6 +107,7 @@
#include <net/netlink.h>
#include <linux/firewall.h>
#include <linux/ip_fw.h>
+#include <linux/init.h>
#ifdef CONFIG_IP_MASQUERADE
#include <net/ip_masq.h>
@@ -1298,7 +1299,7 @@ static struct proc_dir_entry proc_net_ipfwfwd = {
#endif
-void ip_fw_init(void)
+__initfunc(void ip_fw_init(void))
{
#ifdef CONFIG_PROC_FS
#ifdef CONFIG_IP_ACCT
diff --git a/net/ipv4/ip_masq.c b/net/ipv4/ip_masq.c
index c5976614e..2d2fd3717 100644
--- a/net/ipv4/ip_masq.c
+++ b/net/ipv4/ip_masq.c
@@ -31,6 +31,7 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/inet.h>
+#include <linux/init.h>
#include <net/protocol.h>
#include <net/icmp.h>
#include <net/tcp.h>
@@ -1010,7 +1011,7 @@ static struct proc_dir_entry proc_net_ipmsqhst = {
/*
* Initialize ip masquerading
*/
-int ip_masq_init(void)
+__initfunc(int ip_masq_init(void))
{
#ifdef CONFIG_PROC_FS
proc_net_register(&proc_net_ipmsqhst);
diff --git a/net/ipv4/ip_masq_app.c b/net/ipv4/ip_masq_app.c
index 456888bc1..f7449e0ba 100644
--- a/net/ipv4/ip_masq_app.c
+++ b/net/ipv4/ip_masq_app.c
@@ -30,6 +30,7 @@
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/ip.h>
+#include <linux/init.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/udp.h>
@@ -482,7 +483,7 @@ static struct proc_dir_entry proc_net_ip_masq_app = {
* Initialization routine
*/
-int ip_masq_app_init(void)
+__initfunc(int ip_masq_app_init(void))
{
#ifdef CONFIG_PROC_FS
proc_net_register(&proc_net_ip_masq_app);
diff --git a/net/ipv4/ip_masq_ftp.c b/net/ipv4/ip_masq_ftp.c
index cc2481746..4d5568d0a 100644
--- a/net/ipv4/ip_masq_ftp.c
+++ b/net/ipv4/ip_masq_ftp.c
@@ -28,6 +28,7 @@
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/ip.h>
+#include <linux/init.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/ip_masq.h>
@@ -187,7 +188,7 @@ struct ip_masq_app ip_masq_ftp = {
* ip_masq_ftp initialization
*/
-int ip_masq_ftp_init(void)
+__initfunc(int ip_masq_ftp_init(void))
{
return register_ip_masq_app(&ip_masq_ftp, IPPROTO_TCP, 21);
}
diff --git a/net/ipv4/ip_masq_irc.c b/net/ipv4/ip_masq_irc.c
index e0b94f0d6..a1be56f81 100644
--- a/net/ipv4/ip_masq_irc.c
+++ b/net/ipv4/ip_masq_irc.c
@@ -29,6 +29,7 @@
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/ip.h>
+#include <linux/init.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/ip_masq.h>
@@ -238,7 +239,7 @@ struct ip_masq_app ip_masq_irc = {
* ip_masq_irc initialization
*/
-int ip_masq_irc_init(void)
+__initfunc(int ip_masq_irc_init(void))
{
return register_ip_masq_app(&ip_masq_irc, IPPROTO_TCP, 6667);
}
diff --git a/net/ipv4/ip_masq_quake.c b/net/ipv4/ip_masq_quake.c
index 3614f0cf5..08a062bc7 100644
--- a/net/ipv4/ip_masq_quake.c
+++ b/net/ipv4/ip_masq_quake.c
@@ -28,6 +28,7 @@
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/ip.h>
+#include <linux/init.h>
#include <net/protocol.h>
#include <net/udp.h>
#include <net/ip_masq.h>
@@ -279,7 +280,7 @@ struct ip_masq_app ip_masq_quakenew = {
* ip_masq_quake initialization
*/
-int ip_masq_quake_init(void)
+__initfunc(int ip_masq_quake_init(void))
{
return (register_ip_masq_app(&ip_masq_quake, IPPROTO_UDP, 26000) +
register_ip_masq_app(&ip_masq_quakenew, IPPROTO_UDP, 27000));
diff --git a/net/ipv4/ip_masq_raudio.c b/net/ipv4/ip_masq_raudio.c
index 85bba590e..52f439102 100644
--- a/net/ipv4/ip_masq_raudio.c
+++ b/net/ipv4/ip_masq_raudio.c
@@ -2,7 +2,7 @@
* IP_MASQ_RAUDIO - Real Audio masquerading module
*
*
- * Version: @(#)$Id: ip_masq_raudio.c,v 1.5 1997/04/03 08:52:02 davem Exp $
+ * Version: @(#)$Id: ip_masq_raudio.c,v 1.6 1997/04/29 09:38:26 mj Exp $
*
* Author: Nigel Metheringham
* [strongly based on ftp module by Juan Jose Ciarlante & Wouter Gadeyne]
@@ -45,6 +45,7 @@
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/ip.h>
+#include <linux/init.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/ip_masq.h>
@@ -200,7 +201,7 @@ struct ip_masq_app ip_masq_raudio = {
* ip_masq_raudio initialization
*/
-int ip_masq_raudio_init(void)
+__initfunc(int ip_masq_raudio_init(void))
{
return register_ip_masq_app(&ip_masq_raudio, IPPROTO_TCP, 7070);
}
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 2c7974506..80baf8364 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -505,7 +505,7 @@ int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, in
opt->is_data = 1;
opt->is_setbyuser = 1;
if (optlen && ip_options_compile(opt, NULL)) {
- kfree_s(opt, sizeof(struct options) + optlen);
+ kfree_s(opt, sizeof(struct ip_options) + optlen);
return -EINVAL;
}
*optp = opt;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 41e60de61..6558b56e4 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -27,6 +27,8 @@
* (in case if packet not accepted by
* output firewall rules)
* Alexey Kuznetsov: use new route cache
+ * Andi Kleen: Fix broken PMTU recovery and remove
+ * some redundant tests.
*/
#include <asm/uaccess.h>
@@ -47,6 +49,7 @@
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
+#include <linux/init.h>
#include <net/snmp.h>
#include <net/ip.h>
@@ -126,9 +129,8 @@ int ip_build_pkt(struct sk_buff *skb, struct sock *sk, u32 saddr, u32 daddr,
iph->ihl = 5;
iph->tos = sk->ip_tos;
iph->frag_off = 0;
- if (sk->ip_pmtudisc == IP_PMTUDISC_DONT ||
- (sk->ip_pmtudisc == IP_PMTUDISC_WANT &&
- rt->rt_flags&RTF_NOPMTUDISC))
+ if (sk->ip_pmtudisc == IP_PMTUDISC_WANT &&
+ !(rt->rt_flags & RTF_NOPMTUDISC))
iph->frag_off |= htons(IP_DF);
iph->ttl = sk->ip_ttl;
iph->daddr = rt->rt_dst;
@@ -207,9 +209,8 @@ int ip_build_header(struct sk_buff *skb, struct sock *sk)
iph->ihl = 5;
iph->tos = sk->ip_tos;
iph->frag_off = 0;
- if (sk->ip_pmtudisc == IP_PMTUDISC_DONT ||
- (sk->ip_pmtudisc == IP_PMTUDISC_WANT &&
- rt->rt_flags&RTF_NOPMTUDISC))
+ if (sk->ip_pmtudisc == IP_PMTUDISC_WANT &&
+ !(rt->rt_flags & RTF_NOPMTUDISC))
iph->frag_off |= htons(IP_DF);
iph->ttl = sk->ip_ttl;
iph->daddr = rt->rt_dst;
@@ -480,8 +481,7 @@ int ip_build_xmit(struct sock *sk,
#endif
if (sk->ip_pmtudisc == IP_PMTUDISC_DONT ||
- (sk->ip_pmtudisc == IP_PMTUDISC_WANT &&
- rt->rt_flags&RTF_NOPMTUDISC))
+ rt->rt_flags&RTF_NOPMTUDISC)
df = 0;
@@ -1036,7 +1036,7 @@ static struct proc_dir_entry proc_net_igmp = {
* IP registers the packet type and then calls the subprotocol initialisers
*/
-void ip_init(void)
+__initfunc(void ip_init(void))
{
dev_add_pack(&ip_packet_type);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 1689159ed..8c2463d04 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -126,26 +126,24 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc, struct device **de
for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
if (cmsg->cmsg_level != SOL_IP)
continue;
- switch (cmsg->cmsg_type)
- {
+ switch (cmsg->cmsg_type) {
case IP_LOCALADDR:
- if (cmsg->cmsg_len < sizeof(struct in_addr)+sizeof(*cmsg))
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_addr)))
return -EINVAL;
- memcpy(&ipc->addr, cmsg->cmsg_data, 4);
+ memcpy(&ipc->addr, CMSG_DATA(cmsg), sizeof(struct in_addr));
break;
case IP_RETOPTS:
- err = cmsg->cmsg_len - sizeof(*cmsg);
- err = ip_options_get(&ipc->opt, cmsg->cmsg_data,
- err < 40 ? err : 40, 0);
+ err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+ err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40, 0);
if (err)
return err;
break;
case IP_TXINFO:
{
struct in_pktinfo *info;
- if (cmsg->cmsg_len < sizeof(*info)+sizeof(*cmsg))
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo)))
return -EINVAL;
- info = (struct in_pktinfo*)cmsg->cmsg_data;
+ info = (struct in_pktinfo *)CMSG_DATA(cmsg);
if (info->ipi_ifindex && !devp)
return -EINVAL;
if ((*devp = dev_get_by_index(info->ipi_ifindex)) == NULL)
@@ -212,7 +210,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt
sk->opt = opt;
sti();
if (old_opt)
- kfree_s(old_opt, sizeof(struct optlen) + old_opt->optlen);
+ kfree_s(old_opt, sizeof(struct ip_options) + old_opt->optlen);
return 0;
}
case IP_RXINFO:
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index f76c5b52d..1a38c5275 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -47,6 +47,7 @@
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/mroute.h>
+#include <linux/init.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
@@ -1065,7 +1066,7 @@ static struct proc_dir_entry proc_net_ipmr_mfc = {
* Setup for IP multicast routing
*/
-void ip_mr_init(void)
+__initfunc(void ip_mr_init(void))
{
printk(KERN_INFO "Linux IP multicast router 0.06.\n");
register_netdevice_notifier(&ip_mr_notifier);
diff --git a/net/ipv4/rarp.c b/net/ipv4/rarp.c
index fb9e2a738..e0323bb85 100644
--- a/net/ipv4/rarp.c
+++ b/net/ipv4/rarp.c
@@ -45,6 +45,7 @@
#include <linux/if_arp.h>
#include <linux/in.h>
#include <linux/config.h>
+#include <linux/init.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -553,8 +554,8 @@ struct proc_dir_entry proc_net_rarp = {
rarp_get_info
};
-void
-rarp_init(void)
+__initfunc(void
+rarp_init(void))
{
proc_net_register(&proc_net_rarp);
rarp_ioctl_hook = rarp_ioctl;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 5ba6467d9..4a4c5321c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -69,6 +69,7 @@
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
+#include <linux/init.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
@@ -1379,7 +1380,7 @@ void ip_rt_multicast_event(struct device *dev)
rt_cache_flush(0);
}
-void ip_rt_init()
+__initfunc(void ip_rt_init(void))
{
ip_fib_init();
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 84ba6578b..18a8d2bf8 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -35,22 +35,27 @@ extern int sysctl_arp_check_interval;
extern int sysctl_arp_confirm_interval;
extern int sysctl_arp_confirm_timeout;
+/* From ip_fragment.c */
+extern int sysctl_ipfrag_low_thresh;
+extern int sysctl_ipfrag_high_thresh;
+
extern int sysctl_tcp_cong_avoidance;
extern int sysctl_tcp_hoe_retransmits;
extern int sysctl_tcp_sack;
extern int sysctl_tcp_tsack;
extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
+extern int sysctl_syn_retries;
extern int tcp_sysctl_congavoid(ctl_table *ctl, int write, struct file * filp,
void *buffer, size_t *lenp);
-struct ipv4_config ipv4_config = { 1, 1, 1, 1, };
+struct ipv4_config ipv4_config = { 1, 1, 1, 0, };
#ifdef CONFIG_SYSCTL
struct ipv4_config ipv4_def_router_config = { 0, 1, 1, 1, 1, 1, 1, };
-struct ipv4_config ipv4_def_host_config = { 1, 1, 1, 1, };
+struct ipv4_config ipv4_def_host_config = { 1, 1, 1, 0, };
int ipv4_sysctl_forwarding(ctl_table *ctl, int write, struct file * filp,
void *buffer, size_t *lenp)
@@ -144,6 +149,12 @@ ctl_table ipv4_table[] = {
{NET_IPV4_RFC1620_REDIRECTS, "ip_rfc1620_redirects",
&ipv4_config.rfc1620_redirects, sizeof(int), 0644, NULL,
&proc_dointvec},
+ {NET_TCP_SYN_RETRIES, "tcp_syn_retries",
+ &sysctl_syn_retries, sizeof(int), 0644, NULL, &proc_dointvec},
+ {NET_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh",
+ &sysctl_ipfrag_high_thresh, sizeof(int), 0644, NULL, &proc_dointvec},
+ {NET_IPFRAG_LOW_THRESH, "ipfrag_low_thresh",
+ &sysctl_ipfrag_low_thresh, sizeof(int), 0644, NULL, &proc_dointvec},
{0}
};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 420db4777..000813b94 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp.c,v 1.61 1997/04/22 02:53:10 davem Exp $
+ * Version: $Id: tcp.c,v 1.65 1997/05/06 09:31:43 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -424,6 +424,7 @@
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
+#include <linux/init.h>
#include <net/icmp.h>
#include <net/tcp.h>
@@ -849,7 +850,6 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags)
tcp_size = skb->tail -
((unsigned char *)(skb->h.th) + tp->tcp_header_len);
- /* printk("extending buffer\n"); */
/* This window_seq test is somewhat dangerous
* If the remote does SWS avoidance we should
* queue the best we can if not we should in
@@ -1100,6 +1100,9 @@ static void cleanup_rbuf(struct sock *sk)
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
__u32 rcv_wnd;
+ /* FIXME: double check this rule, then check against
+ * other use of similar rules. Abstract if possible.
+ */
rcv_wnd = tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup);
if ((rcv_wnd < sk->mss) && (sock_rspace(sk) > rcv_wnd))
@@ -1357,7 +1360,10 @@ static int tcp_close_state(struct sock *sk, int dead)
case TCP_CLOSE:
case TCP_LISTEN:
break;
- case TCP_LAST_ACK: /* Could have shutdown() then close() */
+ case TCP_LAST_ACK: /* Could have shutdown() then close()
+ * (but don't do send_fin again!) */
+ ns=TCP_LAST_ACK;
+ break;
case TCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and
wait only for the ACK */
ns=TCP_LAST_ACK;
@@ -1655,11 +1661,11 @@ void tcp_set_keepalive(struct sock *sk, int val)
tcp_dec_slow_timer(TCP_SLT_KEEPALIVE);
}
-void tcp_init(void)
+__initfunc(void tcp_init(void))
{
tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
sizeof(struct open_request),
- sizeof(long)*8, SLAB_HWCACHE_ALIGN,
+ 0, SLAB_HWCACHE_ALIGN,
NULL, NULL);
if(!tcp_openreq_cachep)
panic("tcp_init: Cannot alloc open_request cache.");
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ab2b1ef82..3ab1dee42 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_input.c,v 1.50 1997/04/22 02:53:12 davem Exp $
+ * Version: $Id: tcp_input.c,v 1.51 1997/04/27 19:24:40 schenk Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -321,8 +321,10 @@ void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp)
break;
case TCPOPT_WINDOW:
if(opsize==TCPOLEN_WINDOW && th->syn)
- if (sysctl_tcp_window_scaling)
+ if (sysctl_tcp_window_scaling) {
+ tp->wscale_ok = 1;
tp->snd_wscale = *(__u8 *)ptr;
+ }
break;
case TCPOPT_SACK_PERM:
if(opsize==TCPOLEN_SACK_PERM && th->syn)
@@ -816,7 +818,7 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th,
*/
if (before(tp->snd_wl1, ack_seq) ||
(tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) {
- unsigned long nwin = ntohs(th->window);
+ unsigned long nwin = ntohs(th->window) << tp->snd_wscale;
if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) {
flag |= FLAG_WIN_UPDATE;
@@ -1464,17 +1466,21 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
if(tp->af_specific->conn_request(sk, skb, opt, isn) < 0)
return 1;
- /* Now we have several options: In theory there is
- * nothing else in the frame. KA9Q has an option to
- * send data with the syn, BSD accepts data with the
- * syn up to the [to be] advertised window and
- * Solaris 2.1 gives you a protocol error. For now
- * we just ignore it, that fits the spec precisely
- * and avoids incompatibilities. It would be nice in
- * future to drop through and process the data.
+ /* Now we have several options: In theory there is
+ * nothing else in the frame. KA9Q has an option to
+ * send data with the syn, BSD accepts data with the
+ * syn up to the [to be] advertised window and
+ * Solaris 2.1 gives you a protocol error. For now
+ * we just ignore it, that fits the spec precisely
+ * and avoids incompatibilities. It would be nice in
+ * future to drop through and process the data.
*
- * Now that TTCP is starting to be used we ought to
- * queue this data.
+ * Now that TTCP is starting to be used we ought to
+ * queue this data.
+ * But, this leaves one open to an easy denial of
+ * service attack, and SYN cookies can't defend
+ * against this problem. So, we drop the data
+ * in the interest of security over speed.
*/
return 0;
}
@@ -1514,10 +1520,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* move to established.
*/
tp->rcv_nxt = skb->seq+1;
- tp->rcv_wnd = 0;
tp->rcv_wup = skb->seq+1;
- tp->snd_wnd = htons(th->window);
+ tp->snd_wnd = htons(th->window) << tp->snd_wscale;
tp->snd_wl1 = skb->seq;
tp->snd_wl2 = skb->ack_seq;
@@ -1526,6 +1531,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tcp_set_state(sk, TCP_ESTABLISHED);
tcp_parse_options(th,tp);
/* FIXME: need to make room for SACK still */
+ if (tp->wscale_ok == 0) {
+ tp->snd_wscale = tp->rcv_wscale = 0;
+ tp->window_clamp = min(tp->window_clamp,65535);
+ }
if (tp->tstamp_ok) {
tp->tcp_header_len = sizeof(struct tcphdr) + 12; /* FIXME: Define constant! */
sk->dummy_th.doff += 3; /* reserve space of options */
@@ -1695,7 +1704,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
sk->state_change(sk);
tp->snd_una = skb->ack_seq;
- tp->snd_wnd = htons(th->window);
+ tp->snd_wnd = htons(th->window) << tp->snd_wscale;
tp->snd_wl1 = skb->seq;
tp->snd_wl2 = skb->ack_seq;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f4528f552..c4d12a54f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_ipv4.c,v 1.39 1997/04/22 02:53:14 davem Exp $
+ * Version: $Id: tcp_ipv4.c,v 1.43 1997/05/06 09:31:44 davem Exp $
*
* IPv4 specific functions
*
@@ -465,7 +465,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
struct sk_buff *buff;
struct sk_buff *skb1;
int tmp;
- struct tcphdr *t1;
+ struct tcphdr *th;
struct rtable *rt;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
@@ -546,20 +546,17 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
return(-ENETUNREACH);
}
- t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
- buff->h.th = t1;
+ th = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
+ buff->h.th = th;
- memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
+ memcpy(th,(void *)&(sk->dummy_th), sizeof(*th));
buff->seq = sk->write_seq++;
- t1->seq = htonl(buff->seq);
+ th->seq = htonl(buff->seq);
tp->snd_nxt = sk->write_seq;
buff->end_seq = sk->write_seq;
- t1->ack = 0;
- t1->window = htons(512);
- t1->syn = 1;
+ th->ack = 0;
+ th->syn = 1;
- /* Use 512 or whatever user asked for. */
- tp->window_clamp = rt->u.dst.window;
sk->mtu = rt->u.dst.pmtu;
if ((sk->ip_pmtudisc == IP_PMTUDISC_DONT ||
@@ -577,13 +574,26 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
sk->mss = (sk->mtu - sizeof(struct iphdr) -
sizeof(struct tcphdr));
+ if (sk->mss < 1) {
+ printk(KERN_DEBUG "initial sk->mss below 1\n");
+ sk->mss = 1; /* Sanity limit */
+ }
+
+ tp->window_clamp = rt->u.dst.window;
+ tcp_select_initial_window(sock_rspace(sk)/2,sk->mss,
+ &tp->rcv_wnd,
+ &tp->window_clamp,
+ sysctl_tcp_window_scaling,
+ &tp->rcv_wscale);
+ th->window = htons(tp->rcv_wnd);
+
tmp = tcp_syn_build_options(buff, sk->mss, sysctl_tcp_sack,
sysctl_tcp_timestamps,
- sysctl_tcp_window_scaling?tp->rcv_wscale:0);
+ sysctl_tcp_window_scaling,tp->rcv_wscale);
buff->csum = 0;
- t1->doff = (sizeof(*t1)+ tmp)>>2;
+ th->doff = (sizeof(*th)+ tmp)>>2;
- tcp_v4_send_check(sk, t1, sizeof(struct tcphdr) + tmp, buff);
+ tcp_v4_send_check(sk, th, sizeof(struct tcphdr) + tmp, buff);
tcp_set_state(sk,TCP_SYN_SENT);
@@ -803,7 +813,6 @@ int tcp_chkaddr(struct sk_buff *skb)
static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
{
- struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
struct sk_buff * skb;
struct tcphdr *th;
int tmp;
@@ -829,6 +838,11 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
*/
req->mss = min(mss, req->mss);
+ if (req->mss < 1) {
+ printk(KERN_DEBUG "initial req->mss below 1\n");
+ req->mss = 1;
+ }
+
/* Yuck, make this header setup more efficient... -DaveM */
memset(th, 0, sizeof(struct tcphdr));
th->syn = 1;
@@ -839,7 +853,16 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
skb->end_seq = skb->seq + 1;
th->seq = ntohl(skb->seq);
th->ack_seq = htonl(req->rcv_isn + 1);
- th->window = ntohs(tp->rcv_wnd);
+ if (req->rcv_wnd == 0) {
+ /* Set this up on the first call only */
+ req->window_clamp = skb->dst->window;
+ tcp_select_initial_window(sock_rspace(sk)/2,req->mss,
+ &req->rcv_wnd,
+ &req->window_clamp,
+ req->wscale_ok,
+ &req->rcv_wscale);
+ }
+ th->window = htons(req->rcv_wnd);
/* XXX Partial csum of 4 byte quantity is itself! -DaveM
* Yes, but it's a bit harder to special case now. It's
@@ -850,7 +873,7 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
*/
tmp = tcp_syn_build_options(skb, req->mss, req->sack_ok, req->tstamp_ok,
- (req->snd_wscale)?tp->rcv_wscale:0);
+ req->wscale_ok,req->rcv_wscale);
skb->csum = 0;
th->doff = (sizeof(*th) + tmp)>>2;
th->check = tcp_v4_check(th, sizeof(*th) + tmp,
@@ -865,7 +888,7 @@ static void tcp_v4_or_free(struct open_request *req)
{
if(!req->sk && req->af.v4_req.opt)
kfree_s(req->af.v4_req.opt,
- sizeof(struct options) + req->af.v4_req.opt->optlen);
+ sizeof(struct ip_options) + req->af.v4_req.opt->optlen);
}
static struct or_calltable or_ipv4 = {
@@ -881,7 +904,7 @@ static int tcp_v4_syn_filter(struct sock *sk, struct sk_buff *skb, __u32 saddr)
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, __u32 isn)
{
struct ip_options *opt = (struct ip_options *) ptr;
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct tcp_opt tp;
struct open_request *req;
struct tcphdr *th = skb->h.th;
__u32 saddr = skb->nh.iph->saddr;
@@ -913,19 +936,20 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, __u32 i
sk->ack_backlog++;
+ req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
+
req->rcv_isn = skb->seq;
req->snt_isn = isn;
- tp->tstamp_ok = tp->sack_ok = tp->snd_wscale = 0;
- tcp_parse_options(th,tp);
- if (tp->saw_tstamp) {
- tp->ts_recent = tp->rcv_tsval;
- tp->ts_recent_stamp = jiffies;
- }
- req->mss = tp->in_mss;
- req->tstamp_ok = tp->tstamp_ok;
- req->sack_ok = tp->sack_ok;
- req->snd_wscale = tp->snd_wscale;
- req->ts_recent = tp->ts_recent;
+ tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
+ tp.in_mss = 536;
+ tcp_parse_options(th,&tp);
+ if (tp.saw_tstamp)
+ req->ts_recent = tp.rcv_tsval;
+ req->mss = tp.in_mss;
+ req->tstamp_ok = tp.tstamp_ok;
+ req->sack_ok = tp.sack_ok;
+ req->snd_wscale = tp.snd_wscale;
+ req->wscale_ok = tp.wscale_ok;
req->rmt_port = th->source;
req->af.v4_req.loc_addr = daddr;
req->af.v4_req.rmt_addr = saddr;
@@ -1004,8 +1028,6 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
atomic_set(&newsk->rmem_alloc, 0);
newsk->localroute = sk->localroute;
- newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
-
newsk->err = 0;
newsk->shutdown = 0;
newsk->ack_backlog = 0;
@@ -1060,7 +1082,6 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newsk->dst_cache = &rt->u.dst;
- newtp->window_clamp = rt->u.dst.window;
snd_mss = rt->u.dst.pmtu;
/* FIXME: is mtu really the same as snd_mss? */
@@ -1072,10 +1093,19 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newtp->sack_ok = req->sack_ok;
newtp->tstamp_ok = req->tstamp_ok;
- newtp->snd_wscale = req->snd_wscale;
- newtp->ts_recent = req->ts_recent;
- newtp->ts_recent_stamp = jiffies;
+ newtp->window_clamp = req->window_clamp;
+ newtp->rcv_wnd = req->rcv_wnd;
+ newtp->wscale_ok = req->wscale_ok;
+ if (newtp->wscale_ok) {
+ newtp->snd_wscale = req->snd_wscale;
+ newtp->rcv_wscale = req->rcv_wscale;
+ } else {
+ newtp->snd_wscale = newtp->rcv_wscale = 0;
+ newtp->window_clamp = min(newtp->window_clamp,65535);
+ }
if (newtp->tstamp_ok) {
+ newtp->ts_recent = req->ts_recent;
+ newtp->ts_recent_stamp = jiffies;
newtp->tcp_header_len = sizeof(struct tcphdr) + 12; /* FIXME: define constant! */
newsk->dummy_th.doff += 3;
} else {
@@ -1219,9 +1249,8 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
case CHECKSUM_HW:
if (tcp_v4_check(th,len,saddr,daddr,skb->csum)) {
struct iphdr * iph = skb->nh.iph;
- printk(KERN_DEBUG "TCPv4 bad checksum from %08x:%04x to %08x:%04x, ack = %u, seq = %u, len=%d/%d/%d\n",
+ printk(KERN_DEBUG "TCPv4 bad checksum from %08x:%04x to %08x:%04x, len=%d/%d/%d\n",
saddr, ntohs(th->source), daddr,
- ntohl(th->ack_seq), ntohl(th->seq),
ntohs(th->dest), len, skb->len, ntohs(iph->tot_len));
goto discard_it;
}
@@ -1346,10 +1375,12 @@ static int tcp_v4_init_sock(struct sock *sk)
tp->ato = 0;
tp->iat = (HZ/5) << 3;
- tp->rcv_wnd = 8192;
+ /* FIXME: tie this to sk->rcvbuf? (May be unnecessary) */
+ /* tp->rcv_wnd = 8192; */
tp->tstamp_ok = 0;
tp->sack_ok = 0;
- tp->in_mss = 0;
+ tp->wscale_ok = 0;
+ tp->in_mss = 536;
tp->snd_wscale = 0;
tp->sacks = 0;
tp->saw_tstamp = 0;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7f157abe2..bdc79525f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_output.c,v 1.42 1997/04/22 01:06:33 davem Exp $
+ * Version: $Id: tcp_output.c,v 1.43 1997/04/27 19:24:43 schenk Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -404,14 +404,115 @@ void tcp_write_xmit(struct sock *sk)
-/*
- * This function returns the amount that we can raise the
- * usable window based on the following constraints
+/* This function returns the amount that we can raise the
+ * usable window based on the following constraints
*
- * 1. The window can never be shrunk once it is offered (RFC 793)
- * 2. We limit memory per socket
+ * 1. The window can never be shrunk once it is offered (RFC 793)
+ * 2. We limit memory per socket
+ *
+ * RFC 1122:
+ * "the suggested [SWS] avoidance algorithm for the receiver is to keep
+ * RECV.NEXT + RCV.WIN fixed until:
+ * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
+ *
+ * i.e. don't raise the right edge of the window until you can raise
+ * it at least MSS bytes.
+ *
+ * Unfortunately, the recommended algorithm breaks header prediction,
+ * since header prediction assumes th->window stays fixed.
+ *
+ * Strictly speaking, keeping th->window fixed violates the receiver
+ * side SWS prevention criteria. The problem is that under this rule
+ * a stream of single byte packets will cause the right side of the
+ * window to always advance by a single byte.
+ *
+ * Of course, if the sender implements sender side SWS prevention
+ * then this will not be a problem.
+ *
+ * BSD seems to make the following compromise:
+ *
+ * If the free space is less than the 1/4 of the maximum
+ * space available and the free space is less than 1/2 mss,
+ * then set the window to 0.
+ * Otherwise, just prevent the window from shrinking
+ * and from being larger than the largest representable value.
+ *
+ * This prevents incremental opening of the window in the regime
+ * where TCP is limited by the speed of the reader side taking
+ * data out of the TCP receive queue. It does nothing about
+ * those cases where the window is constrained on the sender side
+ * because the pipeline is full.
+ *
+ * BSD also seems to "accidentally" limit itself to windows that are a
+ * multiple of MSS, at least until the free space gets quite small.
+ * This would appear to be a side effect of the mbuf implementation.
+ * Combining these two algorithms results in the observed behavior
+ * of having a fixed window size at almost all times.
+ *
+ * Below we obtain similar behavior by forcing the offered window to
+ * a multiple of the mss when it is feasible to do so.
+ *
+ * FIXME: In our current implementation the value returned by sock_rspace(sk)
+ * is the total space we have allocated to the socket to store skbuf's.
+ * The current design assumes that up to half of that space will be
+ * taken by headers, and the remaining space will be available for TCP data.
+ * This should be accounted for correctly instead.
*/
+unsigned short tcp_select_window(struct sock *sk)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ int mss = sk->mss;
+ long free_space = sock_rspace(sk)/2;
+ long window, cur_win;
+
+ if (tp->window_clamp) {
+ free_space = min(tp->window_clamp, free_space);
+ mss = min(tp->window_clamp, mss);
+ } else
+ printk(KERN_DEBUG "Clamp failure. Water leaking.\n");
+
+ if (mss < 1) {
+ mss = 1;
+ printk(KERN_DEBUG "tcp_select_window: mss fell to 0.\n");
+ }
+
+ /* compute the actual window i.e.
+ * old_window - received_bytes_on_that_win
+ */
+ cur_win = tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup);
+ window = tp->rcv_wnd;
+
+ if (cur_win < 0) {
+ cur_win = 0;
+ printk(KERN_DEBUG "TSW: win < 0 w=%d 1=%u 2=%u\n",
+ tp->rcv_wnd, tp->rcv_nxt, tp->rcv_wup);
+ }
+
+ if (free_space < sk->rcvbuf/4 && free_space < mss/2)
+ window = 0;
+
+ /* Get the largest window that is a nice multiple of mss.
+ * Window clamp already applied above.
+ * If our current window offering is within 1 mss of the
+ * free space we just keep it. This prevents the divide
+ * and multiply from happening most of the time.
+ * We also don't do any window rounding when the free space
+ * is too small.
+ */
+ if (window < free_space - mss && free_space > mss)
+ window = (free_space/mss)*mss;
+ /* Never shrink the offered window */
+ if (window < cur_win)
+ window = cur_win;
+
+ tp->rcv_wnd = window;
+ tp->rcv_wup = tp->rcv_nxt;
+ return window >> tp->rcv_wscale; /* RFC1323 scaling applied */
+}
+
+#if 0
+/* Old algorithm for window selection */
unsigned short tcp_select_window(struct sock *sk)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
@@ -427,37 +528,31 @@ unsigned short tcp_select_window(struct sock *sk)
/* compute the actual window i.e.
* old_window - received_bytes_on_that_win
*/
- cur_win = tp->rcv_wup - (tp->rcv_nxt - tp->rcv_wnd);
+ cur_win = tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup);
window = tp->rcv_wnd;
-
+
if (cur_win < 0) {
cur_win = 0;
printk(KERN_DEBUG "TSW: win < 0 w=%d 1=%u 2=%u\n",
tp->rcv_wnd, tp->rcv_nxt, tp->rcv_wup);
}
- /*
- * RFC 1122:
+ /* RFC 1122:
* "the suggested [SWS] avoidance algoritm for the receiver is to keep
* RECV.NEXT + RCV.WIN fixed until:
* RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
*
- * i.e. don't raise the right edge of the window until you can't raise
- * it MSS bytes
+ * i.e. don't raise the right edge of the window until you can raise
+ * it at least MSS bytes.
*/
- /* It would be a good idea if it didn't break header prediction.
- * and BSD made the header predition standard...
- * It expects the same value in the header i.e. th->window to be
- * constant
- */
usable = free_space - cur_win;
if (usable < 0)
usable = 0;
if (window < usable) {
/* Window is not blocking the sender
- * and we have enought free space for it
+ * and we have enough free space for it
*/
if (cur_win > (sk->mss << 1))
goto out;
@@ -469,7 +564,7 @@ unsigned short tcp_select_window(struct sock *sk)
*/
window = max(usable, cur_win);
} else {
- if ((usable - window) >= mss)
+ while ((usable - window) >= mss)
window += mss;
}
out:
@@ -477,6 +572,7 @@ out:
tp->rcv_wup = tp->rcv_nxt;
return window;
}
+#endif
static int tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb)
{
@@ -703,6 +799,11 @@ void tcp_send_fin(struct sock *sk)
}
}
+/* WARNING: This routine must only be called when we have already sent
+ * a SYN packet that crossed the incoming SYN that caused this routine
+ * to get called. If this assumption fails then the initial rcv_wnd
+ * and rcv_wscale values will not be correct.
+ */
int tcp_send_synack(struct sock *sk)
{
struct tcp_opt * tp = &(sk->tp_pinfo.af_tcp);
@@ -735,13 +836,16 @@ int tcp_send_synack(struct sock *sk)
skb->end_seq = skb->seq + 1 /* th->syn */ ;
th->seq = ntohl(skb->seq);
- th->window = ntohs(tp->rcv_wnd);
+ /* This is a resend of a previous SYN, now with an ACK.
+ * we must reuse the previously offered window.
+ */
+ th->window = htons(tp->rcv_wnd);
tp->last_ack_sent = th->ack_seq = htonl(tp->rcv_nxt);
tmp = tcp_syn_build_options(skb, sk->mss,
tp->sack_ok, tp->tstamp_ok,
- tp->snd_wscale?tp->rcv_wscale:0);
+ tp->wscale_ok,tp->rcv_wscale);
skb->csum = 0;
th->doff = (sizeof(*th) + tmp)>>2;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 365d3dac2..ce6c60feb 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,6 +22,8 @@
#include <net/tcp.h>
+int sysctl_syn_retries = TCP_SYN_RETRIES;
+
static void tcp_sltimer_handler(unsigned long);
static void tcp_syn_recv_timer(unsigned long);
static void tcp_keepalive(unsigned long data);
@@ -178,7 +180,7 @@ static int tcp_write_timeout(struct sock *sk)
}
/* Have we tried to SYN too many times (repent repent 8)) */
- if(tp->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT) {
+ if(tp->retransmits > sysctl_syn_retries && sk->state==TCP_SYN_SENT) {
if(sk->err_soft)
sk->err=sk->err_soft;
else
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 9ca5f3045..ed84d5b0f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -154,7 +154,7 @@ static int udp_v4_verify_bind(struct sock *sk, unsigned short snum)
return retval;
}
-static inline int udp_lport_inuse(int num)
+static inline int udp_lport_inuse(u16 num)
{
struct sock *sk = udp_hash[num & (UDP_HTABLE_SIZE - 1)];
@@ -168,36 +168,42 @@ static inline int udp_lport_inuse(int num)
/* Shared by v4/v6 tcp. */
unsigned short udp_good_socknum(void)
{
- static int start = 0;
- unsigned short base;
- int i, best = 0, size = 32767; /* a big num. */
int result;
-
- base = PROT_SOCK + (start & 1023) + 1;
+ static int start = 0;
+ int i, best, best_size_so_far;
SOCKHASH_LOCK();
- for(i = 0; i < UDP_HTABLE_SIZE; i++) {
- struct sock *sk = udp_hash[i];
- if(!sk) {
- start = (i + 1 + start) & 1023;
- result = i + base + 1;
+
+ /* Select initial not-so-random "best" */
+ best = PROT_SOCK + 1 + (start & 1023);
+ best_size_so_far = 32767; /* "big" num */
+ result = best;
+ for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) {
+ struct sock *sk;
+ int size;
+
+ sk = udp_hash[result & (UDP_HTABLE_SIZE - 1)];
+
+ /* No clashes - take it */
+ if (!sk)
goto out;
- } else {
- int j = 0;
- do {
- if(++j >= size)
- goto next;
- } while((sk = sk->next));
- best = i;
- size = j;
- }
- next:
+
+ /* Is this one better than our best so far? */
+ size = 0;
+ do {
+ if(++size >= best_size_so_far)
+ goto next;
+ } while((sk = sk->next) != NULL);
+ best_size_so_far = size;
+ best = result;
+next:
}
- while(udp_lport_inuse(base + best + 1))
+ while (udp_lport_inuse(best))
best += UDP_HTABLE_SIZE;
- result = (best + base + 1);
+ result = best;
out:
+ start = result;
SOCKHASH_UNLOCK();
return result;
}
diff --git a/net/ipv4/utils.c b/net/ipv4/utils.c
index cbce01b68..4253c85db 100644
--- a/net/ipv4/utils.c
+++ b/net/ipv4/utils.c
@@ -46,7 +46,7 @@
* Display an IP address in readable format.
*/
-char *in_ntoa(unsigned long in)
+char *in_ntoa(__u32 in)
{
static char buff[18];
char *p;
@@ -62,7 +62,7 @@ char *in_ntoa(unsigned long in)
* Convert an ASCII string to binary IP.
*/
-unsigned long in_aton(const char *str)
+__u32 in_aton(const char *str)
{
unsigned long l;
unsigned int val;