author | Ralf Baechle <ralf@linux-mips.org> | 1997-04-29 21:13:14 +0000 |
---|---|---|
committer | <ralf@linux-mips.org> | 1997-04-29 21:13:14 +0000 |
commit | 19c9bba94152148523ba0f7ef7cffe3d45656b11 (patch) | |
tree | 40b1cb534496a7f1ca0f5c314a523c69f1fee464 /net/ipv4/tcp_input.c | |
parent | 7206675c40394c78a90e74812bbdbf8cf3cca1be (diff) | |
Import of Linux/MIPS 2.1.36
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- | net/ipv4/tcp_input.c | 1706 |
1 files changed, 818 insertions, 888 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 076568961..ab2b1ef82 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: @(#)tcp_input.c 1.0.16 05/25/93 + * Version: $Id: tcp_input.c,v 1.50 1997/04/22 02:53:12 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -21,12 +21,6 @@ */ /* - * TODO - * - A better sock cache - * - */ - -/* * Changes: * Pedro Roque : Fast Retransmit/Recovery. * Two receive queues. @@ -42,14 +36,16 @@ * Eric Schenk : Yet another double ACK bug. * Eric Schenk : Delayed ACK bug fixes. * Eric Schenk : Floyd style fast retrans war avoidance. + * David S. Miller : Don't allow zero congestion window. + * Eric Schenk : Fix retransmitter so that it sends + * next packet on ack of previous packet. */ #include <linux/config.h> #include <linux/mm.h> #include <linux/sysctl.h> #include <net/tcp.h> - - +#include <linux/ipsec.h> typedef void (*tcp_sys_cong_ctl_t)(struct sock *sk, u32 seq, u32 ack, @@ -61,6 +57,12 @@ static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt); int sysctl_tcp_cong_avoidance = 0; +int sysctl_tcp_hoe_retransmits = 0; +int sysctl_tcp_sack = 0; +int sysctl_tcp_tsack = 0; +int sysctl_tcp_timestamps = 0; +int sysctl_tcp_window_scaling = 0; + static tcp_sys_cong_ctl_t tcp_sys_cong_ctl_f = &tcp_cong_avoid_vanj; @@ -76,9 +78,7 @@ static void tcp_delack_estimator(struct tcp_opt *tp) { int m; - /* - * Delayed ACK time estimator. - */ + /* Delayed ACK time estimator. */ m = jiffies - tp->lrcvtime; @@ -87,12 +87,10 @@ static void tcp_delack_estimator(struct tcp_opt *tp) if (m < 0) return; - /* - * if the mesured value is bigger than + /* if the mesured value is bigger than * twice the round trip time ignore it. */ - if ((m << 2) <= tp->srtt) - { + if ((m << 2) <= tp->srtt) { m -= (tp->iat >> 3); tp->iat += m; @@ -106,18 +104,21 @@ static void tcp_delack_estimator(struct tcp_opt *tp) if (tp->ato < HZ/50) tp->ato = HZ/50; - } - else + } else tp->ato = 0; } -/* - * Called on frames that were known _not_ to have been - * retransmitted [see Karn/Partridge Proceedings SIGCOMM 87]. - * The algorithm is from the SIGCOMM 88 piece by Van Jacobson. +/* Called to compute a smoothed rtt estimate. The data fed to this + * routine either comes from timestamps, or from segments that were + * known _not_ to have been retransmitted [see Karn/Partridge + * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 + * piece by Van Jacobson. + * NOTE: the next three routines used to be one big routine. + * To save cycles in the RFC 1323 implementation it was better to break + * it up into three procedures. -- erics */ -extern __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) +static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) { long m; /* @@ -126,8 +127,7 @@ extern __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) * are scaled versions of rtt and mean deviation. * This is designed to be as fast as possible * m stands for "measurement". - */ - /* + * * On a 1990 paper the rto value is changed to: * RTO = rtt + 4 * mdev */ @@ -144,55 +144,94 @@ extern __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) m -= (tp->mdev >> 2); /* similar update on mdev */ tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ } else { - /* no previous measure. */ + /* no previous measure. 
*/ tp->srtt = m<<3; /* take the measured time to be rtt */ tp->mdev = m<<2; /* make sure rto = 3*rtt */ } +} +/* Calculate rto without backoff. This is the second half of Van Jacobsons + * routine refered to above. + */ - /* - * Now update timeout. Note that this removes any backoff. - */ - +static __inline__ void tcp_set_rto(struct tcp_opt *tp) +{ tp->rto = (tp->srtt >> 3) + tp->mdev; + tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1)); +} + + +/* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound + * on packet lifetime in the internet. We need the HZ/5 lower + * bound to behave correctly against BSD stacks with a fixed + * delayed ack. + * FIXME: It's not entirely clear this lower bound is the best + * way to avoid the problem. Is it possible to drop the lower + * bound and still avoid trouble with BSD stacks? Perhaps + * some modification to the RTO calculation that takes delayed + * ack bais into account? This needs serious thought. -- erics + */ +static __inline__ void tcp_bound_rto(struct tcp_opt *tp) +{ if (tp->rto > 120*HZ) tp->rto = 120*HZ; - - /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */ if (tp->rto < HZ/5) tp->rto = HZ/5; +} + +/* WARNING: this must not be called if tp->saw_timestamp was false. */ - tp->backoff = 0; +extern __inline__ void tcp_replace_ts_recent(struct tcp_opt *tp, __u32 end_seq) +{ + /* From draft-ietf-tcplw-high-performance: the correct + * test is last_ack_sent <= end_seq. + * (RFC1323 stated last_ack_sent < end_seq.) + */ + if (!before(end_seq,tp->last_ack_sent)) { + tp->ts_recent = tp->rcv_tsval; + /* FIXME: need a corse timestamp. Days uptime + * would be good. + */ + tp->ts_recent_stamp = jiffies; + } +} + +extern __inline__ int tcp_paws_discard(struct tcp_opt *tp) +{ + /* FIXME: must check that ts_recent is not + * more than 24 days old here. Yuck. + */ + return (tp->rcv_tsval-tp->ts_recent < 0); +} + + +static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) +{ + u32 end_window = tp->rcv_wup + tp->rcv_wnd; + + if (tp->rcv_wnd) { + if (!before(seq, tp->rcv_nxt) && before(seq, end_window)) + return 1; + + if ((end_seq - seq) && after(end_seq, tp->rcv_nxt) && + !after(end_seq, end_window)) + return 1; + } + + return 0; } - /* * This functions checks to see if the tcp header is actually acceptable. */ -extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 seg_nxt) +extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) { - u32 end_window = tp->rcv_wup + tp->rcv_wnd; - u32 end_seq = seg_nxt; - - /* - * When the window is open (most common case) - * we want to accept segments if they have yet unseen data - * or in the case of a dataless segment if seg.seq == rcv.nxt - * this means: - * - * if (seq == end_seq) - * end_seq >= rcv.nxt - * else - * end_seq > rcv.nxt - */ - - if (seq == end_seq) - end_seq++; + if (seq == tp->rcv_nxt) + return (tp->rcv_wnd || (end_seq == seq)); - return ((before(seq, end_window) && after(end_seq, tp->rcv_nxt)) || - (seq == end_window && seq == end_seq)); + return __tcp_sequence(tp, seq, end_seq); } /* @@ -203,9 +242,8 @@ extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 seg_nxt) static int tcp_reset(struct sock *sk, struct sk_buff *skb) { sk->zapped = 1; - /* - * We want the right error as BSD sees it (and indeed as we do). - */ + + /* We want the right error as BSD sees it (and indeed as we do). 
*/ switch (sk->state) { case TCP_TIME_WAIT: break; @@ -217,7 +255,7 @@ static int tcp_reset(struct sock *sk, struct sk_buff *skb) break; default: sk->err = ECONNRESET; - } + }; #ifdef CONFIG_TCP_RFC1337 /* * Time wait assassination protection [RFC1337] @@ -227,8 +265,7 @@ static int tcp_reset(struct sock *sk, struct sk_buff *skb) * Ian Heavens has since shown this is an inadequate fix for the protocol * bug in question. */ - if(sk->state!=TCP_TIME_WAIT) - { + if(sk->state!=TCP_TIME_WAIT) { tcp_set_state(sk,TCP_CLOSE); sk->shutdown = SHUTDOWN_MASK; } @@ -242,34 +279,30 @@ static int tcp_reset(struct sock *sk, struct sk_buff *skb) return(0); } - /* - * Look for tcp options. Parses everything but only knows about MSS. - * This routine is always called with the packet containing the SYN. - * However it may also be called with the ack to the SYN. So you - * can't assume this is always the SYN. It's always called after - * we have set up sk->mtu to our own MTU. - * - * We need at minimum to add PAWS support here. Possibly large windows - * as Linux gets deployed on 100Mb/sec networks. + * Look for tcp options. Normally only called on SYN and SYNACK packets. + * But, this can also be called on packets in the established flow when + * the fast version below fails. + * FIXME: surely this can be more efficient. -- erics */ -int tcp_parse_options(struct tcphdr *th) +void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp) { unsigned char *ptr; int length=(th->doff*4)-sizeof(struct tcphdr); - int mss = 0; - + ptr = (unsigned char *)(th + 1); - - while(length>0) - { + tp->sacks = 0; + tp->saw_tstamp = 0; + + while(length>0) { int opcode=*ptr++; int opsize=*ptr++; - switch(opcode) - { + if (length - opsize < 0) /* Don't parse partial options */ + break; + switch(opcode) { case TCPOPT_EOL: - return 0; + return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; ptr--; /* the opsize=*ptr++ above was a mistake */ @@ -277,25 +310,86 @@ int tcp_parse_options(struct tcphdr *th) default: if(opsize<=2) /* Avoid silly options looping forever */ - return 0; - switch(opcode) - { + return; + switch(opcode) { case TCPOPT_MSS: - if(opsize==TCPOLEN_MSS && th->syn) - { - mss = ntohs(*(unsigned short *)ptr); - } + if(opsize==TCPOLEN_MSS && th->syn) { + tp->in_mss = ntohs(*(__u16 *)ptr); + if (tp->in_mss == 0) + tp->in_mss = 536; + } break; - /* Add other options here as people feel the urge to implement stuff like large windows */ + case TCPOPT_WINDOW: + if(opsize==TCPOLEN_WINDOW && th->syn) + if (sysctl_tcp_window_scaling) + tp->snd_wscale = *(__u8 *)ptr; + break; + case TCPOPT_SACK_PERM: + if(opsize==TCPOLEN_SACK_PERM && th->syn) + if (sysctl_tcp_sack) + tp->sack_ok = 1; + case TCPOPT_TIMESTAMP: + if(opsize==TCPOLEN_TIMESTAMP) { + /* Cheaper to set again then to + * test syn. Optimize this? + */ + if (sysctl_tcp_timestamps) + tp->tstamp_ok = 1; + tp->saw_tstamp = 1; + tp->rcv_tsval = ntohl(*(__u32 *)ptr); + tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4)); + } + break; + case TCPOPT_SACK: + tp->sacks = (opsize-2)>>3; + if (tp->sacks<<3 == opsize-2) { + int i; + for (i = 0; i < tp->sacks; i++) { + tp->left_sack[i] = ntohl(((__u32 *)ptr)[2*i]); + tp->right_sack[i] = ntohl(((__u32 *)ptr)[2*i+1]); + } + } else + tp->sacks = 0; } ptr+=opsize-2; length-=opsize; - } + }; } +} - return mss; +/* Fast parse options. This hopes to only see timestamps. + * If it is wrong it falls back on tcp_parse_option(). + * This should probably get extended for timestamps + SACK as well. + * Assembly code anyone? 
-- erics + */ +static __inline__ int tcp_fast_parse_options(struct tcphdr *th, struct tcp_opt *tp) +{ + if (tp->tcp_header_len == sizeof(struct tcphdr)) + return 0; + if (th->doff == sizeof(struct tcphdr)>>2) { + tp->saw_tstamp = 0; + tp->sacks = 0; + return 0; + } else if (th->doff == (sizeof(struct tcphdr)>>2)+3) { + __u32 *ptr = (__u32 *)(th + 1); + if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { + tp->saw_tstamp = 1; + tp->sacks = 0; + tp->rcv_tsval = ntohl(*++ptr); + tp->rcv_tsecr = ntohl(*++ptr); + return 1; + } + } + tcp_parse_options(th,tp); + return 1; } +#if 0 + +/* + * This is the old fast retransmit code. It will go away eventually. -- erics + */ /* * See draft-stevens-tcpca-spec-01 for documentation. @@ -305,6 +399,15 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) { struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + /* FIXME: if we are already retransmitting should this code + * be skipped? [Floyd high_seq check sort of does this] + * The case I'm worried about is falling into a fast + * retransmit on a link with a congestion window of 1 or 2. + * There was some evidence in 2.0.x that this was problem + * on really slow links (1200 or 2400 baud). I need to + * try this situation again and see what happens. + */ + /* * An ACK is a duplicate if: * (1) it has the same sequence number as the largest number we've @@ -316,55 +419,169 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) * The packet acked data after high_seq; */ - if (ack == tp->snd_una && sk->packets_out && (not_dup == 0) && - after(ack, tp->high_seq)) - { - - sk->dup_acks++; - - - /* - * 1. When the third duplicate ack is received, set ssthresh - * to one half the current congestion window, but no less - * than two segments. Retransmit the missing segment. + if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) { + /* 1. When the third duplicate ack is received, set ssthresh + * to one half the current congestion window, but no less + * than two segments. Retransmit the missing segment. */ - - if (sk->dup_acks == 3) - { - sk->ssthresh = max(sk->cong_window >> 1, 2); - sk->cong_window = sk->ssthresh + 3; - tcp_do_retransmit(sk, 0); + if (tp->high_seq == 0 || after(ack, tp->high_seq)) { + tp->dup_acks++; + + if (tp->dup_acks == 3) { + tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2); + tp->snd_cwnd = tp->snd_ssthresh + 3; + tcp_do_retransmit(sk, 0); + + /* Careful not to timeout just after fast + * retransmit! + */ + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + } } - /* - * 2. Each time another duplicate ACK arrives, increment - * cwnd by the segment size. [...] Transmit a packet... + /* 2. Each time another duplicate ACK arrives, increment + * cwnd by the segment size. [...] Transmit a packet... * - * Packet transmission will be done on normal flow processing - * since we're not in "retransmit mode" + * Packet transmission will be done on normal flow processing + * since we're not in "retransmit mode". */ - - if (sk->dup_acks > 3) - { - sk->cong_window++; + if (tp->dup_acks >= 3) { + tp->dup_acks++; + tp->snd_cwnd++; + } + } else { + /* 3. When the next ACK arrives that acknowledges new data, + * set cwnd to ssthresh. + */ + if (tp->dup_acks >= 3) { + tp->retrans_head = NULL; + tp->snd_cwnd = max(tp->snd_ssthresh, 1); + tp->retransmits = 0; } + tp->dup_acks = 0; + + /* FIXME: This is wrong if the new ack that arrives + * is below the value for high_seq. + */ + tp->high_seq = 0; } - else - { - /* - * 3. 
When the next ACK arrives that acknowledges new data, - * set cwnd to ssthresh +} +#endif + +#define FLAG_DATA 0x01 +#define FLAG_WIN_UPDATE 0x02 +#define FLAG_DATA_ACKED 0x04 + +static __inline__ void clear_fast_retransmit(struct sock *sk) { + struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + if (tp->dup_acks > 3) { + tp->retrans_head = NULL; + tp->snd_cwnd = max(tp->snd_ssthresh, 1); + } + tp->dup_acks = 0; +} + +/* + * NOTE: This code assumes that tp->dup_acks gets cleared when a + * retransmit timer fires. + */ + +static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) +{ + struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + + /* + * Note: If not_dup is set this implies we got a + * data carrying packet or a window update. + * This carries no new information about possible + * lost packets, so we have to ignore it for the purposes + * of counting duplicate acks. Ideally this does not imply we + * should stop our fast retransmit phase, more acks may come + * later without data to help us. Unfortunately this would make + * the code below much more complex. For now if I see such + * a packet I clear the fast retransmit phase. + */ + + if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) { + /* This is the standard reno style fast retransmit branch. */ + + /* 1. When the third duplicate ack is received, set ssthresh + * to one half the current congestion window, but no less + * than two segments. Retransmit the missing segment. + */ + if (tp->high_seq == 0 || after(ack, tp->high_seq)) { + tp->dup_acks++; + if (tp->dup_acks == 3) { + tp->dup_acks++; + tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2); + tp->snd_cwnd = tp->snd_ssthresh + 3; + tp->high_seq = tp->snd_nxt; + tcp_do_retransmit(sk, 0); + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + } + } + + /* 2. Each time another duplicate ACK arrives, increment + * cwnd by the segment size. [...] Transmit a packet... + * + * Packet transmission will be done on normal flow processing + * since we're not in "retransmit mode" + */ + if (tp->dup_acks > 3) + tp->snd_cwnd++; + } else if (tp->high_seq != 0) { + /* In this branch we deal with clearing the Floyd style + * block on duplicate fast retransmits, and if requested + * we do Hoe style secondary fast retransmits. */ + if (!before(ack,tp->high_seq) || (not_dup&FLAG_DATA) != 0) { + /* Once we have acked all the packets up to high_seq + * we are done this fast retransmit phase. + * Alternatively data arrived. In this case we + * Have to abort the fast retransmit attempt. + * Note that we do want to accept a window + * update since this is expected with Hoe's algorithm. + */ + clear_fast_retransmit(sk); - if (sk->dup_acks >= 3) - { - sk->tp_pinfo.af_tcp.retrans_head = NULL; - sk->cong_window = sk->ssthresh; - sk->retransmits = 0; + /* After we have cleared up to high_seq we can + * clear the Floyd style block. + */ + if (after(ack,tp->high_seq)) + tp->high_seq = 0; + } else if (tp->dup_acks >= 3) { + if (sysctl_tcp_hoe_retransmits) { + /* Hoe Style. We didn't ack the whole + * window. Take this as a cue that + * another packet was lost and retransmit it. + * Don't muck with the congestion window here. + * Note that we have to be careful not to + * act if this was a window update and it + * didn't ack new data, since this does + * not indicate a packet left the system. + * We can test this by just checking + * if ack changed from snd_una, since + * the only way to get here without changing + * advancing from snd_una is if this was a + * window update. 
+ */ + if (ack != tp->snd_una && before(ack,tp->high_seq)) { + tcp_do_retransmit(sk, 0); + tcp_reset_xmit_timer(sk, TIME_RETRANS, + tp->rto); + } + } else { + /* Reno style. We didn't ack the whole + * window, now we have to drop out of + * fast retransmit and wait for a timeout. + */ + clear_fast_retransmit(sk); + } } - sk->dup_acks = 0; + } else { + /* Clear any aborted fast retransmit starts. */ + tp->dup_acks = 0; } - } /* @@ -379,148 +596,114 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt) { - /* - * From: + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + unsigned int actual, expected; + unsigned int inv_rtt, inv_basertt, inv_basebd; + u32 snt_bytes; + + /* From: * TCP Vegas: New Techniques for Congestion * Detection and Avoidance. - * * * Warning: This code is a scratch implementation taken * from the paper only. The code they distribute seams * to have improved several things over the initial spec. */ - struct tcp_opt * tp; - unsigned int Actual, Expected; - unsigned int inv_rtt, inv_basertt; - u32 snt_bytes; - - - tp = &(sk->tp_pinfo.af_tcp); - if (!seq_rtt) seq_rtt = 1; - + if (tp->basertt) tp->basertt = min(seq_rtt, tp->basertt); else tp->basertt = seq_rtt; - /* - * - * Actual = throughput for this segment. - * Expected = number_of_bytes in transit / BaseRTT - * + /* actual = throughput for this segment. + * expected = number_of_bytes in transit / BaseRTT */ - snt_bytes = (ack - seq) << SHIFT_FACTOR; - inv_rtt = (1 << SHIFT_FACTOR) / seq_rtt; - - Actual = snt_bytes * inv_rtt; + snt_bytes = ack - seq; + inv_rtt = (1 << SHIFT_FACTOR) / seq_rtt; inv_basertt = (1 << SHIFT_FACTOR) / tp->basertt; - Expected = ((tp->snd_nxt - tp->snd_una) << SHIFT_FACTOR) * inv_basertt; - /* - * Slow Start - */ - - if (sk->cong_window < sk->ssthresh && + actual = snt_bytes * inv_rtt; + + expected = (tp->snd_nxt - tp->snd_una) * inv_basertt; + + /* XXX sk->mss should move into tcp_opt as well -DaveM */ + inv_basebd = sk->mss * inv_basertt; + + /* Slow Start */ + if (tp->snd_cwnd < tp->snd_ssthresh && (seq == tp->snd_nxt || - (((Expected - Actual) <= - ((TCP_VEGAS_GAMMA << SHIFT_FACTOR) * sk->mss * inv_basertt)) - ) - )) - { - /* - * "Vegas allows exponential growth only every other - * RTT" - */ - - if (!(sk->cong_count++)) - { - sk->cong_window++; - sk->cong_count = 0; + (expected - actual <= TCP_VEGAS_GAMMA * inv_basebd))) { + /* "Vegas allows exponential growth only every other RTT" */ + if (tp->snd_cwnd_cnt++) { + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; } - } - else - { - /* - * Congestion Avoidance - */ - - if (Expected - Actual <= - ((TCP_VEGAS_ALPHA << SHIFT_FACTOR) * sk->mss * inv_basertt)) - { + } else { + /* Congestion Avoidance */ + if (expected - actual <= TCP_VEGAS_ALPHA * inv_basebd) { /* Increase Linearly */ - - if (sk->cong_count++ >= sk->cong_window) - { - sk->cong_window++; - sk->cong_count = 0; + if (tp->snd_cwnd_cnt++ >= tp->snd_cwnd) { + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; } } - - if (Expected - Actual >= - ((TCP_VEGAS_BETA << SHIFT_FACTOR) * sk->mss * inv_basertt)) - { + + if (expected - actual >= TCP_VEGAS_BETA * inv_basebd) { /* Decrease Linearly */ - - if (sk->cong_count++ >= sk->cong_window) - { - sk->cong_window--; - sk->cong_count = 0; + if (tp->snd_cwnd_cnt++ >= tp->snd_cwnd) { + tp->snd_cwnd--; + tp->snd_cwnd_cnt = 0; } - - /* Never less than 2 segments */ - if (sk->cong_window < 2) - sk->cong_window = 2; + + /* Never less than 2 segments. 
*/ + if (tp->snd_cwnd < 2) + tp->snd_cwnd = 2; } } } static void tcp_cong_avoid_vanj(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - /* - * This is Jacobson's slow start and congestion avoidance. + /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. Because we keep cong_window in * integral mss's, we can't do cwnd += 1 / cwnd. * Instead, maintain a counter and increment it once every * cwnd times. + * FIXME: Check to be sure the mathematics works out right + * on this trick when we have to reduce the congestion window. + * The snd_cwnd_cnt has to be reset properly when reduction events + * happen. + * FIXME: What happens when the congestion window gets larger + * than the maximum receiver window by some large factor + * Suppose the pipeline never looses packets for a long + * period of time, then traffic increases causing packet loss. + * The congestion window should be reduced, but what it should + * be reduced to is not clear, since 1/2 the old window may + * still be larger than the maximum sending rate we ever achieved. */ - - if (sk->cong_window <= sk->ssthresh) - { - /* - * In "safe" area, increase - */ - - sk->cong_window++; - } - else - { - /* - * In dangerous area, increase slowly. - * In theory this is - * sk->cong_window += 1 / sk->cong_window + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* In "safe" area, increase. */ + tp->snd_cwnd++; + } else { + /* In dangerous area, increase slowly. In theory this is + * tp->snd_cwnd += 1 / tp->snd_cwnd */ - - if (sk->cong_count >= sk->cong_window) { - - sk->cong_window++; - sk->cong_count = 0; - } - else - sk->cong_count++; + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; } } -#define FLAG_DATA 0x01 -#define FLAG_WIN_UPDATE 0x02 -#define FLAG_DATA_ACKED 0x04 - static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq, __u32 *seq_rtt) { @@ -528,52 +711,46 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq, struct sk_buff *skb; unsigned long now = jiffies; int acked = 0; - - while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) - { + while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) { #ifdef TCP_DEBUG /* Check for a bug. */ - if (skb->next != (struct sk_buff*) &sk->write_queue && - after(skb->end_seq, skb->next->seq)) - printk("INET: tcp_input.c: *** " + after(skb->end_seq, skb->next->seq)) + printk(KERN_DEBUG "INET: tcp_input.c: *** " "bug send_list out of order.\n"); #endif - /* - * If our packet is before the ack sequence we can - * discard it as it's confirmed to have arrived the - * other end. + /* If our packet is before the ack sequence we can + * discard it as it's confirmed to have arrived the + * other end. */ - if (after(skb->end_seq, ack)) break; - - if (sk->debug) - { - printk(KERN_DEBUG "removing seg %x-%x from " - "retransmit queue\n", skb->seq, skb->end_seq); - } - + + SOCK_DEBUG(sk, "removing seg %x-%x from retransmit queue\n", + skb->seq, skb->end_seq); + acked = FLAG_DATA_ACKED; - atomic_dec(&sk->packets_out); + /* FIXME: packet counting may break if we have to + * do packet "repackaging" for stacks that don't + * like overlapping packets. 
+ */ + tp->packets_out--; *seq = skb->seq; *seq_rtt = now - skb->when; - - skb_unlink(skb); - skb->free = 1; + + skb_unlink(skb); kfree_skb(skb, FREE_WRITE); } - if (acked && !sk->dead) - { + if (acked) { tp->retrans_head = NULL; - sk->write_space(sk); + if (!sk->dead) + sk->write_space(sk); } - return acked; } @@ -581,27 +758,18 @@ static void tcp_ack_probe(struct sock *sk, __u32 ack) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - /* - * Our probe was answered - */ + /* Our probe was answered. */ tp->probes_out = 0; - /* - * Was it a usable window open ? - */ + /* Was it a usable window open? */ /* should always be non-null */ if (tp->send_head != NULL && - !before (ack + tp->snd_wnd, tp->send_head->end_seq)) - { + !before (ack + tp->snd_wnd, tp->send_head->end_seq)) { tp->backoff = 0; tp->pending = 0; - tcp_clear_xmit_timer(sk, TIME_PROBE0); - - } - else - { + } else { tcp_reset_xmit_timer(sk, TIME_PROBE0, min(tp->rto << tp->backoff, 120*HZ)); } @@ -614,192 +782,164 @@ static void tcp_ack_probe(struct sock *sk, __u32 ack) static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack_seq, u32 ack, int len) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int flag = 0; u32 seq = 0; u32 seq_rtt = 0; struct sk_buff *skb; - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - if(sk->zapped) return(1); /* Dead, can't ack any more so why bother */ - - if (tp->pending == TIME_KEEPOPEN) - { + if (tp->pending == TIME_KEEPOPEN) tp->probes_out = 0; - } tp->rcv_tstamp = jiffies; - - /* - * If the ack is newer than sent or older than previous acks - * then we can probably ignore it. + + /* If the ack is newer than sent or older than previous acks + * then we can probably ignore it. */ - if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una)) goto uninteresting_ack; - /* - * If there is data set flag 1 - */ - - if (len != th->doff*4) - { + /* If there is data set flag 1 */ + if (len != th->doff*4) { flag |= FLAG_DATA; tcp_delack_estimator(tp); } - /* - * Update our send window - */ + /* Update our send window. */ - /* - * This is the window update code as per RFC 793 - * snd_wl{1,2} are used to prevent unordered - * segments from shrinking the window + /* This is the window update code as per RFC 793 + * snd_wl{1,2} are used to prevent unordered + * segments from shrinking the window */ + if (before(tp->snd_wl1, ack_seq) || + (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) { + unsigned long nwin = ntohs(th->window); - if ((tp->snd_wl1 == 0) || before(tp->snd_wl1, ack_seq) || - (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) - { - tp->snd_wnd = ntohs(th->window); - tp->snd_wl1 = ack_seq; - tp->snd_wl2 = ack; + if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) { + flag |= FLAG_WIN_UPDATE; + tp->snd_wnd = nwin; - flag |= FLAG_WIN_UPDATE; + tp->snd_wl1 = ack_seq; + tp->snd_wl2 = ack; - if (tp->snd_wnd > sk->max_window) - { - sk->max_window = tp->snd_wnd; + if (nwin > tp->max_window) + tp->max_window = nwin; } } - - /* - * We passed data and got it acked, remove any soft error - * log. Something worked... + /* We passed data and got it acked, remove any soft error + * log. Something worked... */ - sk->err_soft = 0; - /* - * If this ack opens up a zero window, clear backoff. It was - * being used to time the probes, and is probably far higher than - * it needs to be for normal retransmission. + /* If this ack opens up a zero window, clear backoff. It was + * being used to time the probes, and is probably far higher than + * it needs to be for normal retransmission. 
*/ - if (tp->pending == TIME_PROBE0) - { tcp_ack_probe(sk, ack); - } - - /* - * See if we can take anything off of the retransmit queue. - */ + /* See if we can take anything off of the retransmit queue. */ if (tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt)) flag |= FLAG_DATA_ACKED; - - /* - * if we where retransmiting don't count rtt estimate - */ - - if (sk->retransmits) - { - if (sk->packets_out == 0) - sk->retransmits = 0; - } - else - { - /* - * Note that we only reset backoff and rto in the - * rtt recomputation code. And that doesn't happen - * if there were retransmissions in effect. So the - * first new packet after the retransmissions is - * sent with the backoff still in effect. Not until - * we get an ack from a non-retransmitted packet do - * we reset the backoff and rto. This allows us to deal - * with a situation where the network delay has increased - * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.) + /* If we have a timestamp, we always do rtt estimates. */ + if (tp->saw_tstamp) { + /* Read draft-ietf-tcplw-high-performance before mucking + * with this code. (Superceeds RFC1323) */ - - if (flag & FLAG_DATA_ACKED) - { - tcp_rtt_estimator(tp, seq_rtt); - - (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt); + seq_rtt = (jiffies-tp->rcv_tsecr); + tcp_rtt_estimator(tp, seq_rtt); + if (tp->retransmits) { + if (tp->packets_out == 0) { + tp->retransmits = 0; + tp->backoff = 0; + tcp_set_rto(tp); + } else { + /* Still retransmitting, use backoff */ + tcp_set_rto(tp); + tp->rto = tp->rto << tp->backoff; + } + } else { + tcp_set_rto(tp); + if (flag && FLAG_DATA_ACKED) + (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt); + } + /* NOTE: safe here so long as cong_ctl doesn't use rto */ + tcp_bound_rto(tp); + } else { + /* If we were retransmiting don't count rtt estimate. */ + if (tp->retransmits) { + if (tp->packets_out == 0) + tp->retransmits = 0; + } else { + /* We don't have a timestamp. Can only use + * packets that are not retransmitted to determine + * rtt estimates. Also, we must not reset the + * backoff for rto until we get a non-retransmitted + * packet. This allows us to deal with a situation + * where the network delay has increased suddenly. + * I.e. Karn's algorithm. (SIGCOMM '87, p5.) + */ + if (flag & FLAG_DATA_ACKED) { + tp->backoff = 0; + tcp_rtt_estimator(tp, seq_rtt); + tcp_set_rto(tp); + tcp_bound_rto(tp); + (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt); + } } } - -#ifdef TCP_DEBUG - - /* Sanity check out packets_out counter */ - if (skb_queue_len(&sk->write_queue) == 0 || - ack == tp->snd_nxt ) - { - if (sk->packets_out) - { - printk(KERN_DEBUG "tcp_ack: packets_out %d\n", - sk->packets_out); - sk->packets_out = 0; - } - } -#endif - - if (sk->packets_out) - { - if (flag & FLAG_DATA_ACKED) - { + if (tp->packets_out) { + if (flag & FLAG_DATA_ACKED) { long when; - + skb = skb_peek(&sk->write_queue); - when = tp->rto - (jiffies - skb->when); - - if (when <= 0) - { + + /* FIXME: This assumes that when we are retransmitting + * we should only ever respond with one packet. + * This means congestion windows should not grow + * during recovery. In 2.0.X we allow the congestion + * window to grow. It is not clear to me which + * decision is correct. The RFCs should be double + * checked as should the behavior of other stacks. + * Also note that if we do want to allow the + * congestion window to grow during retransmits + * we have to fix the call to congestion window + * updates so that it works during retransmission. 
+ */ + if (tp->retransmits) { tp->retrans_head = NULL; - /* - * This is tricky. We are retransmiting a + + /* This is tricky. We are retransmiting a * segment of a window when congestion occured. */ tcp_do_retransmit(sk, 0); - tcp_reset_xmit_timer(sk, TIME_RETRANS, - tp->rto); - } - else + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + } else tcp_reset_xmit_timer(sk, TIME_RETRANS, when); } - } - else + } else tcp_clear_xmit_timer(sk, TIME_RETRANS); - - - /* - * Remember the highest ack received. - */ - - tp->snd_una = ack; tcp_fast_retrans(sk, ack, (flag & (FLAG_DATA|FLAG_WIN_UPDATE))); + /* Remember the highest ack received. */ + tp->snd_una = ack; return 1; uninteresting_ack: - tcp_fast_retrans(sk, ack, 0); - - if(sk->debug) - printk("Ack ignored %u %u\n",ack,tp->snd_nxt); - + SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt); return 0; } - /* * Process the FIN bit. This now behaves as it is supposed to work * and the FIN takes effect when it is validly part of sequence @@ -813,53 +953,46 @@ uninteresting_ack: * close and we go into CLOSING (and later onto TIME-WAIT) * * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. - * */ static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) { - sk->fin_seq = skb->end_seq; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* XXX This fin_seq thing should disappear... -DaveM */ + tp->fin_seq = skb->end_seq; tcp_send_ack(sk); - if (!sk->dead) - { + if (!sk->dead) { sk->state_change(sk); sock_wake_async(sk->socket, 1); } - switch(sk->state) - { + switch(sk->state) { case TCP_SYN_RECV: case TCP_SYN_SENT: case TCP_ESTABLISHED: - /* - * move to CLOSE_WAIT - */ - + /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); - if (th->rst) sk->shutdown = SHUTDOWN_MASK; break; case TCP_CLOSE_WAIT: case TCP_CLOSING: - /* - * received a retransmission of the FIN, do + /* Received a retransmission of the FIN, do * nothing. */ break; case TCP_TIME_WAIT: - /* - * received a retransmission of the FIN, + /* Received a retransmission of the FIN, * restart the TIME_WAIT timer. */ tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); return(0); case TCP_FIN_WAIT1: - /* - * This case occurs when a simultaneous close + /* This case occurs when a simultaneous close * happens, we must ack the received FIN and * enter the CLOSING state. * @@ -869,202 +1002,128 @@ static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) * FIN lost hang). The TIME_WRITE code is already * correct for handling this timeout. */ - tcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: - /* - * received a FIN -- send ACK and enter TIME_WAIT - */ + /* Received a FIN -- send ACK and enter TIME_WAIT. */ tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); - sk->shutdown|=SHUTDOWN_MASK; + sk->shutdown |= SHUTDOWN_MASK; tcp_set_state(sk,TCP_TIME_WAIT); break; case TCP_CLOSE: - /* - * already in CLOSE - */ + /* Already in CLOSE. */ break; default: + /* FIXME: Document whats happening in this case. -DaveM */ tcp_set_state(sk,TCP_LAST_ACK); /* Start the timers. */ tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); return(0); - } + }; return(0); } - - - /* - * This one checks to see if we can put data from the - * out_of_order queue into the receive_queue - */ - -static void tcp_ofo_queue(struct sock *sk) +/* This one checks to see if we can put data from the + * out_of_order queue into the receive_queue. 
+ */ +static void tcp_ofo_queue(struct sock *sk) { - struct sk_buff * skb; - struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + struct sk_buff *skb; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + /* FIXME: out_of_order_queue is a strong tcp_opt candidate... -DaveM */ while ((skb = skb_peek(&sk->out_of_order_queue))) { - if (after(skb->seq, tp->rcv_nxt)) break; if (!after(skb->end_seq, tp->rcv_nxt)) { - - if (sk->debug) - printk("ofo packet was allready received \n"); - + SOCK_DEBUG(sk, "ofo packet was allready received \n"); skb_unlink(skb); kfree_skb(skb, FREE_READ); - continue; } + SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", + tp->rcv_nxt, skb->seq, skb->end_seq); - if (sk->debug) - printk("ofo requeuing : rcv_next %X seq %X - %X\n", - tp->rcv_nxt, skb->seq, skb->end_seq); - skb_unlink(skb); - - skb_queue_tail(&sk->receive_queue, skb); - - tp->rcv_nxt = skb->end_seq; } } static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { - struct sk_buff * skb1; - struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + struct sk_buff *skb1; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - /* - * Queue data for delivery to the user - * Packets in sequence go to the receive queue - * Out of sequence packets to out_of_order_queue + /* Queue data for delivery to the user. + * Packets in sequence go to the receive queue. + * Out of sequence packets to out_of_order_queue. */ - - if (skb->seq == tp->rcv_nxt) { - - /* - * Ok. In sequence. - */ - - + /* Ok. In sequence. */ +queue_and_out: skb_queue_tail(&sk->receive_queue, skb); - - tp->rcv_nxt = skb->end_seq; - tcp_ofo_queue(sk); - if (skb_queue_len(&sk->out_of_order_queue) == 0) tp->pred_flags = htonl((0x5010 << 16) | tp->snd_wnd); - return; } - /* - * Not in sequence - * either a retransmit or some packet got lost - */ - + /* Not in sequence, either a retransmit or some packet got lost. */ if (!after(skb->end_seq, tp->rcv_nxt)) { - - /* - * A retransmit. - * 2nd most common case. - * force an imediate ack - */ + /* A retransmit, 2nd most common case. Force an imediate ack. */ + SOCK_DEBUG(sk, "retransmit received: seq %X\n", skb->seq); - if (sk->debug) - printk("retransmit received: seq %X\n", skb->seq); - - sk->delayed_acks = MAX_DELAY_ACK; + tp->delayed_acks = MAX_DELAY_ACK; kfree_skb(skb, FREE_READ); - return; } - if (before(skb->seq, tp->rcv_nxt)) { + /* Partial packet, seq < rcv_next < end_seq */ + SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", + tp->rcv_nxt, skb->seq, skb->end_seq); - /* - * Partial packet - * seq < rcv_next < end_seq - */ - - if (sk->debug) - printk("partial packet: rcv_next %X seq %X - %X\n", - tp->rcv_nxt, skb->seq, skb->end_seq); - - skb_queue_tail(&sk->receive_queue, skb); - - - tp->rcv_nxt = skb->end_seq; - - tcp_ofo_queue(sk); - - if (skb_queue_len(&sk->out_of_order_queue) == 0) - tp->pred_flags = htonl((0x5010 << 16) | tp->snd_wnd); - - return; + goto queue_and_out; } - /* - * Ok. This is an out_of_order segment - */ - - /* Force an ack */ - - sk->delayed_acks = MAX_DELAY_ACK; - - /* - * disable header predition - */ + /* Ok. This is an out_of_order segment, force an ack. */ + tp->delayed_acks = MAX_DELAY_ACK; + /* Disable header predition. 
*/ tp->pred_flags = 0; - if (sk->debug) - printk("out of order segment: rcv_next %X seq %X - %X\n", - tp->rcv_nxt, skb->seq, skb->end_seq); + SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", + tp->rcv_nxt, skb->seq, skb->end_seq); if (skb_peek(&sk->out_of_order_queue) == NULL) { skb_queue_head(&sk->out_of_order_queue,skb); - } - else + } else { for(skb1=sk->out_of_order_queue.prev; ; skb1 = skb1->prev) { - - /* allready there */ - if (skb->seq==skb1->seq && skb->len>=skb1->len) - { - skb_append(skb1,skb); + /* Already there. */ + if (skb->seq == skb1->seq && skb->len >= skb1->len) { + skb_append(skb1, skb); skb_unlink(skb1); - kfree_skb(skb1,FREE_READ); + kfree_skb(skb1, FREE_READ); break; } - if (after(skb->seq, skb1->seq)) - { + if (after(skb->seq, skb1->seq)) { skb_append(skb1,skb); break; } - - /* - * See if we've hit the start. If so insert. - */ + + /* See if we've hit the start. If so insert. */ if (skb1 == skb_peek(&sk->out_of_order_queue)) { skb_queue_head(&sk->out_of_order_queue,skb); break; } } - + } } @@ -1077,53 +1136,36 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) { struct tcphdr *th; - struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); th = skb->h.th; - skb_pull(skb,th->doff*4); - skb_trim(skb,len-(th->doff*4)); + skb_pull(skb, th->doff*4); + skb_trim(skb, len - (th->doff*4)); if (skb->len == 0 && !th->fin) - { return(0); - } - - /* - * FIXME: don't accept data after the receved fin - */ - /* - * The bytes in the receive read/assembly queue has increased. - * Needed for the low memory discard algorithm - */ - - sk->bytes_rcv += skb->len; - - /* - * We no longer have anyone receiving data on this connection. + /* FIXME: don't accept data after the received fin. + * + * Would checking snd_seq against fin_seq be enough? + * If so, how do we handle that case exactly? -DaveM */ + /* We no longer have anyone receiving data on this connection. */ tcp_data_queue(sk, skb); - if (before(tp->rcv_nxt, sk->copied_seq)) - { - printk("*** tcp.c:tcp_data bug acked < copied\n"); + if (before(tp->rcv_nxt, sk->copied_seq)) { + printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n"); tp->rcv_nxt = sk->copied_seq; } - sk->delayed_acks++; - + tp->delayed_acks++; - /* - * Now tell the user we may have some data. - */ - - if (!sk->dead) - { - if(sk->debug) - printk("Data wakeup.\n"); + /* Now tell the user we may have some data. */ + if (!sk->dead) { + SOCK_DEBUG(sk, "Data wakeup.\n"); sk->data_ready(sk,0); - } + } return(1); } @@ -1132,51 +1174,52 @@ static void tcp_data_snd_check(struct sock *sk) struct sk_buff *skb; struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); - if ((skb = tp->send_head)) - { + if ((skb = tp->send_head)) { if (!after(skb->end_seq, tp->snd_una + tp->snd_wnd) && - sk->packets_out < sk->cong_window ) - { - /* - * Add more data to the send queue. - */ + tp->packets_out < tp->snd_cwnd ) { + /* Add more data to the send queue. */ + /* FIXME: the congestion window is checked + * again in tcp_write_xmit anyway?! -- erics + * + * I think it must, it bumps tp->packets_out for + * each packet it fires onto the wire. -DaveM + */ tcp_write_xmit(sk); - wake_up_interruptible(sk->sleep); - } - else if (sk->packets_out == 0 && !tp->pending) - { - /* - * Data to queue but no room. - */ + if(!sk->dead) + sk->write_space(sk); + } else if (tp->packets_out == 0 && !tp->pending) { + /* Data to queue but no room. 
*/ + + /* FIXME: Is it right to do a zero window probe into + * a congestion window limited window??? -- erics + */ tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); - } + } } -} +} static __inline__ void tcp_ack_snd_check(struct sock *sk) { - /* - * This also takes care of updating the window. - * This if statement needs to be simplified. + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* This also takes care of updating the window. + * This if statement needs to be simplified. * - * rules for delaying an ack: + * Rules for delaying an ack: * - delay time <= 0.5 HZ * - we don't have a window update to send * - must send at least every 2 full sized packets */ - - if (sk->delayed_acks == 0) + if (tp->delayed_acks == 0) { + /* We sent a data segment already. */ return; + } - if (sk->delayed_acks >= MAX_DELAY_ACK || tcp_raise_window(sk)) - { + if (tp->delayed_acks >= MAX_DELAY_ACK || tcp_raise_window(sk)) tcp_send_ack(sk); - } - else - { - tcp_send_delayed_ack(sk, HZ/2); - } + else + tcp_send_delayed_ack(sk, HZ/2); } /* @@ -1198,62 +1241,49 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) ptr--; ptr += ntohl(th->seq); - /* ignore urgent data that we've already seen and read */ + /* Ignore urgent data that we've already seen and read. */ if (after(sk->copied_seq, ptr)) return; - /* do we already have a newer (or duplicate) urgent pointer? */ + /* Do we already have a newer (or duplicate) urgent pointer? */ if (sk->urg_data && !after(ptr, sk->urg_seq)) return; - /* tell the world about our new urgent pointer */ + /* Tell the world about our new urgent pointer. */ if (sk->proc != 0) { - if (sk->proc > 0) { + if (sk->proc > 0) kill_proc(sk->proc, SIGURG, 1); - } else { + else kill_pg(-sk->proc, SIGURG, 1); - } } - /* - * We may be adding urgent data when the last byte read was - * urgent. To do this requires some care. We cannot just ignore - * sk->copied_seq since we would read the last urgent byte again - * as data, nor can we alter copied_seq until this data arrives - * or we break the sematics of SIOCATMARK (and thus sockatmark()) + + /* We may be adding urgent data when the last byte read was + * urgent. To do this requires some care. We cannot just ignore + * sk->copied_seq since we would read the last urgent byte again + * as data, nor can we alter copied_seq until this data arrives + * or we break the sematics of SIOCATMARK (and thus sockatmark()) */ if (sk->urg_seq == sk->copied_seq) sk->copied_seq++; /* Move the copied sequence on correctly */ sk->urg_data = URG_NOTYET; sk->urg_seq = ptr; - /* disable header prediction */ + /* Disable header prediction. */ tp->pred_flags = 0; } -/* - * This is the 'fast' part of urgent handling. - */ - +/* This is the 'fast' part of urgent handling. */ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len) { - /* - * Check if we get a new urgent pointer - normally not - */ - + /* Check if we get a new urgent pointer - normally not. */ if (th->urg) tcp_check_urg(sk,th); - /* - * Do we wait for any urgent data? - normally not - */ - + /* Do we wait for any urgent data? - normally not... */ if (sk->urg_data == URG_NOTYET) { - u32 ptr; + u32 ptr = sk->urg_seq - ntohl(th->seq) + (th->doff*4); - /* - * Is the urgent pointer pointing into this packet? - */ - ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4; + /* Is the urgent pointer pointing into this packet? 
*/ if (ptr < len) { sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th); if (!sk->dead) @@ -1262,26 +1292,19 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len } } - static void prune_queue(struct sock *sk) { struct sk_buff * skb; - /* - * clean the out_of_order queue - */ - + /* Clean the out_of_order queue. */ while ((skb = skb_dequeue(&sk->out_of_order_queue))) - { kfree_skb(skb, FREE_READ); - } } - -void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - struct tcphdr *th, __u16 len) +int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, __u16 len) { - struct tcp_opt *tp; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int queued = 0; u32 flg; @@ -1301,6 +1324,23 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ tp = &(sk->tp_pinfo.af_tcp); + + /* + * RFC1323: H1. Apply PAWS check first. + */ + if (tcp_fast_parse_options(th,tp)) { + if (tp->saw_tstamp) { + if (tcp_paws_discard(tp)) { + if (!th->rst) { + tcp_send_ack(sk); + kfree_skb(skb, FREE_READ); + return 0; + } + } + tcp_replace_ts_recent(tp,skb->end_seq); + } + } + flg = *(((u32 *)th) + 3); /* @@ -1311,127 +1351,88 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, * space for instance) */ - if (flg == tp->pred_flags && skb->seq == tp->rcv_nxt) - { - if (len <= sizeof(struct tcphdr)) - { - if (len == sizeof(struct tcphdr)) - { + if (flg == tp->pred_flags && skb->seq == tp->rcv_nxt) { + if (len <= th->doff*4) { + /* Bulk data transfer: sender */ + if (len == th->doff*4) { tcp_ack(sk, th, skb->seq, skb->ack_seq, len); + tcp_data_snd_check(sk); } - tcp_data_snd_check(sk); - kfree_skb(skb, FREE_READ); - return; - - } - else if (skb->ack_seq == tp->snd_una) - { - /* - * Bulk data transfer: receiver - */ + return 0; + } else if (skb->ack_seq == tp->snd_una) { + /* Bulk data transfer: receiver */ - skb_pull(skb,sizeof(struct tcphdr)); + skb_pull(skb,th->doff*4); skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = skb->end_seq; - sk->bytes_rcv += len - sizeof(struct tcphdr); - + sk->data_ready(sk, 0); tcp_delack_estimator(tp); - if (sk->delayed_acks++) - { + if (tp->delayed_acks++ == 0) tcp_send_delayed_ack(sk, HZ/2); - } else tcp_send_ack(sk); - return; + return 0; } } - if (!tcp_sequence(tp, skb->seq, skb->end_seq)) - { - if (!th->rst) - { - if (after(skb->seq, tp->rcv_nxt)) - { - printk(KERN_DEBUG "->seq:%d end:%d " - "wup:%d wnd:%d\n", - skb->seq, skb->end_seq, - tp->rcv_wup, tp->rcv_wnd); + if (!tcp_sequence(tp, skb->seq, skb->end_seq)) { + if (!th->rst) { + if (after(skb->seq, tp->rcv_nxt)) { + SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n", + skb->seq, skb->end_seq, + tp->rcv_wup, tp->rcv_wnd); } tcp_send_ack(sk); kfree_skb(skb, FREE_READ); - return; + return 0; } } - if(th->syn && skb->seq != sk->syn_seq) - { + if(th->syn && skb->seq != sk->syn_seq) { printk(KERN_DEBUG "syn in established state\n"); tcp_reset(sk, skb); kfree_skb(skb, FREE_READ); - return; + return 1; } - if(th->rst) - { + if(th->rst) { tcp_reset(sk,skb); kfree_skb(skb, FREE_READ); - return; + return 0; } if(th->ack) - { tcp_ack(sk, th, skb->seq, skb->ack_seq, len); - } - - /* - * Process urgent data - */ - + /* Process urgent data. 
*/ tcp_urg(sk, th, len); - /* - * step 7: process the segment text - */ - - + /* step 7: process the segment text */ queued = tcp_data(skb, sk, len); - /* - * step 8: check the FIN bit - */ - + /* step 8: check the FIN bit */ if (th->fin) - { tcp_fin(skb, sk, th); - } tcp_data_snd_check(sk); tcp_ack_snd_check(sk); - /* - * If our receive queue has grown past its limits, - * try to prune away duplicates etc.. + /* If our receive queue has grown past its limits, + * try to prune away duplicates etc.. */ - if (sk->rmem_alloc > sk->rcvbuf) + if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) prune_queue(sk); - /* - * And done - */ - - if (queued) - return; - - kfree_skb(skb, FREE_READ); + if (!queued) + kfree_skb(skb, FREE_READ); + return 0; } - /* * This function implements the receiving procedure of RFC 793. @@ -1444,49 +1445,26 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int queued = 0; - int rcv_mss; - - /* - * state == CLOSED - * tested in tcp_v{4,6}_rcv - */ + /* state == CLOSED, hash lookup always fails, so no worries. -DaveM */ switch (sk->state) { - - case TCP_LISTEN: - if (th->rst) goto discard; - /* - * These use the socket TOS.. + /* These use the socket TOS.. * might want to be the received TOS */ - if(th->ack) - { - /* - * send reset - */ - - return 1; - } - + return 1; /* send reset */ - if(th->syn) - { - int err; - __u32 isn; - - isn = tp->af_specific->init_sequence(sk, skb); - err = tp->af_specific->conn_request(sk, skb, opt, isn); + if(th->syn) { + __u32 isn = tp->af_specific->init_sequence(sk, skb); - if (err < 0) + if(tp->af_specific->conn_request(sk, skb, opt, isn) < 0) return 1; - /* - * Now we have several options: In theory there is + /* Now we have several options: In theory there is * nothing else in the frame. KA9Q has an option to * send data with the syn, BSD accepts data with the * syn up to the [to be] advertised window and @@ -1498,7 +1476,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * Now that TTCP is starting to be used we ought to * queue this data. */ - return 0; } @@ -1506,45 +1483,36 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, break; case TCP_SYN_SENT: - - /* - * SYN sent means we have to look for a suitable ack and - * either reset for bad matches or go to connected. - * The SYN_SENT case is unusual and should - * not be in line code. [AC] + /* SYN sent means we have to look for a suitable ack and + * either reset for bad matches or go to connected. + * The SYN_SENT case is unusual and should + * not be in line code. [AC] */ - - if(th->ack) - { - /* We got an ack, but it's not a good ack */ - if(!tcp_ack(sk,th, skb->seq, skb->ack_seq, len)) - { + if(th->ack) { + tp->snd_wl1 = skb->seq; + + /* We got an ack, but it's not a good ack. */ + if(!tcp_ack(sk,th, skb->seq, skb->ack_seq, len)) { tcp_statistics.TcpAttemptFails++; return 1; } - if(th->rst) - { + if(th->rst) { tcp_reset(sk,skb); goto discard; } - if(!th->syn) - { - /* - * A valid ack from a different connection - * start. Shouldn't happen but cover it + if(!th->syn) { + /* A valid ack from a different connection + * start. Shouldn't happen but cover it. */ tcp_statistics.TcpAttemptFails++; return 1; } - /* - * Ok.. it's good. Set up sequence - * numbers and - * move to established. + /* Ok.. it's good. Set up sequence numbers and + * move to established. 
*/ - tp->rcv_nxt = skb->seq+1; tp->rcv_wnd = 0; tp->rcv_wup = skb->seq+1; @@ -1553,43 +1521,53 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->snd_wl1 = skb->seq; tp->snd_wl2 = skb->ack_seq; - sk->fin_seq = skb->seq; - tcp_send_ack(sk); + tp->fin_seq = skb->seq; tcp_set_state(sk, TCP_ESTABLISHED); - rcv_mss = tcp_parse_options(th); - - if (rcv_mss == 0) - { - rcv_mss = 536; + tcp_parse_options(th,tp); + /* FIXME: need to make room for SACK still */ + if (tp->tstamp_ok) { + tp->tcp_header_len = sizeof(struct tcphdr) + 12; /* FIXME: Define constant! */ + sk->dummy_th.doff += 3; /* reserve space of options */ + } else + tp->tcp_header_len = sizeof(struct tcphdr); + if (tp->saw_tstamp) { + tp->ts_recent = tp->rcv_tsval; + tp->ts_recent_stamp = jiffies; } - sk->mss = min(sk->mss, rcv_mss); + /* Can't be earlier, doff would be wrong. */ + tcp_send_ack(sk); + + if (tp->in_mss) + sk->mss = min(sk->mss, tp->in_mss); + + /* Take out space for tcp options. */ + sk->mss -= tp->tcp_header_len - sizeof(struct tcphdr); sk->dummy_th.dest = th->source; sk->copied_seq = tp->rcv_nxt; - if(!sk->dead) - { + if(!sk->dead) { sk->state_change(sk); sock_wake_async(sk->socket, 0); } /* Drop through step 6 */ goto step6; - } - else - { - if(th->syn && !th->rst) - { - /* - * the previous version of the code + } else { + if(th->syn && !th->rst) { + /* The previous version of the code * checked for "connecting to self" * here. that check is done now in - * tcp_connect + * tcp_connect. */ - tcp_set_state(sk, TCP_SYN_RECV); + tcp_parse_options(th,tp); + if (tp->saw_tstamp) { + tp->ts_recent = tp->rcv_tsval; + tp->ts_recent_stamp = jiffies; + } tp->rcv_nxt = skb->seq + 1; tp->rcv_wup = skb->seq + 1; @@ -1605,8 +1583,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, break; case TCP_TIME_WAIT: - /* - * RFC 1122: + /* RFC 1122: * "When a connection is [...] on TIME-WAIT state [...] * [a TCP] MAY accept a new SYN from the remote TCP to * reopen the connection directly, if it: @@ -1619,14 +1596,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * (2) returns to TIME-WAIT state if the SYN turns out * to be an old duplicate". */ - - if (th->syn && !th->rst && after(skb->seq, tp->rcv_nxt)) - { + if (th->syn && !th->rst && after(skb->seq, tp->rcv_nxt)) { __u32 isn; - int err; - atomic_sub(skb->truesize, &sk->rmem_alloc); - skb->sk = NULL; + skb_orphan(skb); sk->err = ECONNRESET; tcp_set_state(sk, TCP_CLOSE); sk->shutdown = SHUTDOWN_MASK; @@ -1635,56 +1608,58 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, sk = tp->af_specific->get_sock(skb, th); - if (sk == NULL) + if (sk == NULL || !ipsec_sk_policy(sk,skb)) goto discard; - skb->sk = sk; + skb_set_owner_r(skb, sk); tp = &sk->tp_pinfo.af_tcp; - atomic_add(skb->truesize, &sk->rmem_alloc); - - err = tp->af_specific->conn_request(sk, skb, opt, isn); - if (err < 0) + if(tp->af_specific->conn_request(sk, skb, opt, isn) < 0) return 1; - return 0; } break; - } - /* - * step 1: check sequence number + /* Parse the tcp_options present on this header. + * By this point we really only expect timestamps and SACKs. + * Note that this really has to be here and not later for PAWS + * (RFC1323) to work. */ + if (tcp_fast_parse_options(th,tp)) { + /* NOTE: assumes saw_tstamp is never set if we didn't + * negotiate the option. tcp_fast_parse_options() must + * guarantee this. 
+ */ + if (tp->saw_tstamp) { + if (tcp_paws_discard(tp)) { + if (!th->rst) { + tcp_send_ack(sk); + goto discard; + } + } + tcp_replace_ts_recent(tp,skb->end_seq); + } + } - if (!tcp_sequence(tp, skb->seq, skb->end_seq)) - { - if (!th->rst) - { + /* step 1: check sequence number */ + if (!tcp_sequence(tp, skb->seq, skb->end_seq)) { + if (!th->rst) { tcp_send_ack(sk); goto discard; } } - - /* - * step 2: check RST bit - */ - - if(th->rst) - { + /* step 2: check RST bit */ + if(th->rst) { tcp_reset(sk,skb); goto discard; } - /* - * step 3: check security and precedence - * [ignored] - */ + /* step 3: check security and precedence [ignored] */ - /* - * step 4: + /* step 4: * * Check for a SYN, and ensure it matches the SYN we were * first sent. We have to handle the rather unusual (but valid) @@ -1700,24 +1675,18 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * original syn. */ - if (th->syn && skb->seq!=sk->syn_seq) - { + if (th->syn && skb->seq!=sk->syn_seq) { tcp_reset(sk, skb); return 1; } - /* - * step 5: check the ACK field - */ - - if (th->ack) - { + /* step 5: check the ACK field */ + if (th->ack) { int acceptable = tcp_ack(sk,th,skb->seq, skb->ack_seq,len); switch(sk->state) { case TCP_SYN_RECV: - if (acceptable) - { + if (acceptable) { tcp_set_state(sk, TCP_ESTABLISHED); sk->dummy_th.dest=th->source; sk->copied_seq = tp->rcv_nxt; @@ -1730,36 +1699,26 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->snd_wl1 = skb->seq; tp->snd_wl2 = skb->ack_seq; - } - else + } else return 1; break; case TCP_FIN_WAIT1: - - if (tp->snd_una == sk->write_seq) - { + if (tp->snd_una == sk->write_seq) { sk->shutdown |= SEND_SHUTDOWN; tcp_set_state(sk, TCP_FIN_WAIT2); - if (!sk->dead) + if (!sk->dead) sk->state_change(sk); } break; - case TCP_CLOSING: - - if (tp->snd_una == sk->write_seq) - { + case TCP_CLOSING: + if (tp->snd_una == sk->write_seq) tcp_time_wait(sk); - if (!sk->dead) - sk->state_change(sk); - } break; case TCP_LAST_ACK: - - if (tp->snd_una == sk->write_seq) - { + if (tp->snd_una == sk->write_seq) { sk->shutdown = SHUTDOWN_MASK; tcp_set_state(sk,TCP_CLOSE); if (!sk->dead) @@ -1769,49 +1728,34 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, break; case TCP_TIME_WAIT: - /* - * keep us in TIME_WAIT until we stop getting + /* Keep us in TIME_WAIT until we stop getting * packets, reset the timeout. */ tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); break; - } - } - else + } else goto discard; - step6: - - /* - * step 6: check the URG bit - */ - +step6: + /* step 6: check the URG bit */ tcp_urg(sk, th, len); - /* - * step 7: process the segment text - */ - + /* step 7: process the segment text */ switch (sk->state) { case TCP_CLOSE_WAIT: case TCP_CLOSING: - if (!before(skb->seq, sk->fin_seq)) + if (!before(skb->seq, tp->fin_seq)) break; case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: - - /* - * RFC 793 says to queue data in this states, - * RFC 1122 says we MUST send a reset. - * BSD 4.4 also does reset. + /* RFC 793 says to queue data in these states, + * RFC 1122 says we MUST send a reset. + * BSD 4.4 also does reset. 
*/ - - if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) - { - if (after(skb->end_seq - th->fin, tp->rcv_nxt)) - { + if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) { + if (after(skb->end_seq - th->fin, tp->rcv_nxt)) { tcp_reset(sk, skb); return 1; } @@ -1819,25 +1763,19 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, case TCP_ESTABLISHED: queued = tcp_data(skb, sk, len); - break; + break; } - /* - * step 8: check the FIN bit - */ - + /* step 8: check the FIN bit */ if (th->fin) - { tcp_fin(skb, sk, th); - } tcp_data_snd_check(sk); tcp_ack_snd_check(sk); if (queued) return 0; - discard: - +discard: kfree_skb(skb, FREE_READ); return 0; } @@ -1847,30 +1785,22 @@ int tcp_sysctl_congavoid(ctl_table *ctl, int write, struct file * filp, { int val = sysctl_tcp_cong_avoidance; int retv; - - retv = proc_dointvec(ctl, write, filp, buffer, lenp); - - if (write) - { + + retv = proc_dointvec(ctl, write, filp, buffer, lenp); + + if (write) { switch (sysctl_tcp_cong_avoidance) { - case 0: - tcp_sys_cong_ctl_f = &tcp_cong_avoid_vanj; - break; - case 1: - tcp_sys_cong_ctl_f = &tcp_cong_avoid_vegas; - break; - default: - retv = -EINVAL; - sysctl_tcp_cong_avoidance = val; - } + case 0: + tcp_sys_cong_ctl_f = &tcp_cong_avoid_vanj; + break; + case 1: + tcp_sys_cong_ctl_f = &tcp_cong_avoid_vegas; + break; + default: + retv = -EINVAL; + sysctl_tcp_cong_avoidance = val; + }; } - + return retv; } - -/* - * Local variables: - * compile-command: "gcc -D__KERNEL__ -I/usr/src/linux/include -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer -fno-strength-reduce -pipe -m486 -DCPU=486 -c -o tcp_input.o tcp_input.c" - * c-file-style: "Linux" - * End: - */ |
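
Below are a few standalone sketches of the algorithms this import reworks. First, the RTT machinery: the patch splits the old estimator into tcp_rtt_estimator(), tcp_set_rto() and tcp_bound_rto() so the RFC 1323 timestamp path can reuse the pieces. The following is a minimal model of that arithmetic, assuming an illustrative struct, HZ value and driver loop; it keeps only the classic rto = srtt + 4*mdev step and leaves out the extra congestion-window-dependent term the patch adds in tcp_set_rto().

```c
/* Standalone sketch of the SRTT/MDEV estimator and RTO clamping that the
 * patch splits into tcp_rtt_estimator()/tcp_set_rto()/tcp_bound_rto().
 * The struct, HZ value and main() below are illustrative, not the kernel's. */
#include <stdio.h>

#define HZ 100                      /* assumed tick rate for this example */

struct rtt_state {
	long srtt;                  /* smoothed RTT, scaled by 8 */
	long mdev;                  /* mean deviation, scaled by 4 */
	long rto;                   /* retransmission timeout, in ticks */
};

/* Van Jacobson, SIGCOMM '88: srtt += 1/8 err, mdev += 1/4 (|err| - mdev). */
static void rtt_estimator(struct rtt_state *tp, long mrtt)
{
	long m = mrtt ? mrtt : 1;   /* never feed a zero sample */

	if (tp->srtt != 0) {
		m -= (tp->srtt >> 3);   /* m is now the error in the estimate */
		tp->srtt += m;          /* srtt = 7/8 srtt + 1/8 new */
		if (m < 0)
			m = -m;
		m -= (tp->mdev >> 2);   /* similar update on mdev */
		tp->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
	} else {
		tp->srtt = m << 3;      /* first sample: take it as the rtt */
		tp->mdev = m << 2;      /* seed mdev so the first rto is generous */
	}
}

/* RTO = srtt + 4*mdev in unscaled units (backoff is handled elsewhere). */
static void set_rto(struct rtt_state *tp)
{
	tp->rto = (tp->srtt >> 3) + tp->mdev;
}

/* Clamp to [HZ/5, 120*HZ]: the floor works around fixed BSD delayed acks,
 * the ceiling matches the assumed maximum packet lifetime in the patch. */
static void bound_rto(struct rtt_state *tp)
{
	if (tp->rto > 120 * HZ)
		tp->rto = 120 * HZ;
	if (tp->rto < HZ / 5)
		tp->rto = HZ / 5;
}

int main(void)
{
	struct rtt_state tp = { 0, 0, 0 };
	long samples[] = { 30, 35, 28, 90, 32 };   /* RTT samples in ticks */

	for (int i = 0; i < 5; i++) {
		rtt_estimator(&tp, samples[i]);
		set_rto(&tp);
		bound_rto(&tp);
		printf("sample=%ld srtt=%ld mdev=%ld rto=%ld\n",
		       samples[i], tp.srtt >> 3, tp.mdev >> 2, tp.rto);
	}
	return 0;
}
```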
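Second, the RFC 1323 support added here includes tcp_fast_parse_options(), which short-circuits the common case of a header whose only option is a timestamp encoded as NOP, NOP, TIMESTAMP, length 10 in a single aligned 32-bit word. A hedged sketch of that test follows, using the standard RFC 793/1323 option codepoints; the struct and function names are placeholders, not the kernel's.

```c
/* Sketch of the aligned-timestamp fast path used by the new
 * tcp_fast_parse_options(): if the header carries exactly three extra
 * 32-bit words and the first is NOP,NOP,TIMESTAMP,10, the next two words
 * are tsval/tsecr. Anything else falls back to the slow option parser. */
#include <stdint.h>
#include <arpa/inet.h>

#define TCPOPT_NOP        1
#define TCPOPT_TIMESTAMP  8
#define TCPOLEN_TIMESTAMP 10

struct ts_opt {
	int      saw_tstamp;
	uint32_t rcv_tsval;
	uint32_t rcv_tsecr;
};

/* 'words' points at the option area, 'doff' is the data offset in 32-bit
 * words (as carried in the TCP header). Returns 1 on the fast path. */
static int fast_parse_timestamp(const uint32_t *words, int doff,
				struct ts_opt *tp)
{
	tp->saw_tstamp = 0;

	if (doff == 5)              /* bare 20-byte header, no options */
		return 0;

	if (doff == 5 + 3 &&
	    words[0] == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
			      (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
		tp->saw_tstamp = 1;
		tp->rcv_tsval = ntohl(words[1]);
		tp->rcv_tsecr = ntohl(words[2]);
		return 1;
	}

	return 0;                   /* caller falls back to the slow parser */
}

int main(void)
{
	struct ts_opt tp;
	uint32_t opts[3];

	opts[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
			(TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
	opts[1] = htonl(123456);    /* tsval */
	opts[2] = htonl(654321);    /* tsecr */

	return fast_parse_timestamp(opts, 8, &tp) ? 0 : 1;
}
```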
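Third, the rewritten tcp_fast_retrans() follows the Reno rules from draft-stevens-tcpca-spec-01 and, per the changelog, never lets the congestion window drop to zero. Below is a simplified model of just the duplicate-ACK window arithmetic; the Floyd-style high_seq block and the optional Hoe-style secondary retransmits from the patch are omitted, and all names are illustrative.

```c
/* Reno-style duplicate-ACK handling, modeled on the rewritten
 * tcp_fast_retrans(): on the 3rd dup ack halve the window into ssthresh,
 * inflate cwnd by 3 and retransmit; inflate by 1 per further dup ack;
 * deflate back to ssthresh (never below 1) once new data is acked. */
struct cwnd_state {
	unsigned int snd_cwnd;
	unsigned int snd_ssthresh;
	unsigned int dup_acks;
};

static unsigned int umax(unsigned int a, unsigned int b)
{
	return a > b ? a : b;
}

/* new_data_acked != 0 when the ack advances snd_una or carries data or a
 * window update; retransmit() stands in for tcp_do_retransmit(sk, 0). */
static void fast_retrans(struct cwnd_state *tp, int new_data_acked,
			 void (*retransmit)(void))
{
	if (!new_data_acked) {
		tp->dup_acks++;
		if (tp->dup_acks == 3) {
			/* ssthresh = max(cwnd/2, 2), then cwnd = ssthresh + 3 */
			tp->snd_ssthresh = umax(tp->snd_cwnd >> 1, 2);
			tp->snd_cwnd = tp->snd_ssthresh + 3;
			retransmit();
		} else if (tp->dup_acks > 3) {
			tp->snd_cwnd++;   /* one segment per further dup ack */
		}
	} else {
		if (tp->dup_acks >= 3)
			tp->snd_cwnd = umax(tp->snd_ssthresh, 1); /* never 0 */
		tp->dup_acks = 0;
	}
}

static void noop_retransmit(void) { }

int main(void)
{
	struct cwnd_state tp = { 10, 0, 0 };

	for (int i = 0; i < 4; i++)            /* four duplicate acks in a row */
		fast_retrans(&tp, 0, noop_retransmit);
	fast_retrans(&tp, 1, noop_retransmit); /* new data finally acked */
	return tp.snd_cwnd == 5 ? 0 : 1;       /* deflated back to ssthresh */
}
```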