author    | Ralf Baechle <ralf@linux-mips.org> | 2000-02-18 00:24:27 +0000
committer | Ralf Baechle <ralf@linux-mips.org> | 2000-02-18 00:24:27 +0000
commit    | b9558d5f86c471a125abf1fb3a3882fb053b1f8c (patch)
tree      | 707b53ec64e740a7da87d5f36485e3cd9b1c794e /net/ipv4/tcp_input.c
parent    | b3ac367c7a3e6047abe74817db27e34e759f279f (diff)
Merge with Linux 2.3.41.
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- | net/ipv4/tcp_input.c | 1370 |
1 file changed, 902 insertions, 468 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3b4ae64a2..d61a5df02 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.177 2000/01/09 02:19:39 davem Exp $ + * Version: $Id: tcp_input.c,v 1.183 2000/01/24 18:40:33 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -70,9 +70,6 @@ #define SYNC_INIT 1 #endif -extern int sysctl_tcp_fin_timeout; -extern int sysctl_tcp_keepalive_time; - /* These are on by default so the code paths get tested. * For the final 2.2 this may be undone at our discretion. -DaveM */ @@ -83,10 +80,108 @@ int sysctl_tcp_sack = 1; int sysctl_tcp_syncookies = SYNC_INIT; int sysctl_tcp_stdurg; int sysctl_tcp_rfc1337; -int sysctl_tcp_tw_recycle; +int sysctl_tcp_tw_recycle = 1; +int sysctl_tcp_abort_on_overflow = 0; +int sysctl_tcp_max_orphans = NR_FILE; +int sysctl_tcp_max_tw_buckets = NR_FILE*2; static int prune_queue(struct sock *sk); +/* + * Adapt the MSS value used to make delayed ack decision to the + * real world. + * + * The constant 536 hasn't any good meaning. In IPv4 world + * MTU may be smaller, though it contradicts to RFC1122, which + * states that MSS must be at least 536. + * We use the constant to do not ACK each second + * packet in a stream of tiny size packets. + * It means that super-low mtu links will be aggressively delacked. + * Seems, it is even good. If they have so low mtu, they are weirdly + * slow. + * + * AK: BTW it may be useful to add an option to lock the rcv_mss. + * this way the beowulf people wouldn't need ugly patches to get the + * ack frequencies they want and it would be an elegant way to tune delack. + */ +static __inline__ void tcp_measure_rcv_mss(struct tcp_opt *tp, struct sk_buff *skb) +{ + unsigned int len, lss; + + lss = tp->ack.last_seg_size; + tp->ack.last_seg_size = 0; + + /* skb->len may jitter because of SACKs, even if peer + * sends good full-sized frames. + */ + len = skb->len; + if (len >= tp->ack.rcv_mss) { + tp->ack.rcv_mss = len; + } else { + /* Otherwise, we make more careful check taking into account, + * that SACKs block is variable. + * + * "len" is invariant segment length, including TCP header. + */ + len = skb->tail - skb->h.raw; + if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr)) { + /* Subtract also invariant (if peer is RFC compliant), + * tcp header plus fixed timestamp option length. + * Resulting "len" is MSS free of SACK jitter. + */ + len -= tp->tcp_header_len; + if (len == lss) + tp->ack.rcv_mss = len; + tp->ack.last_seg_size = len; + } + +#if 0 + /* Tiny-grams with PSH set artifically deflate our + * ato measurement. + * + * Mmm... I copied this test from tcp_remember_ack(), but + * I did not understand this. Is it to speedup nagling sender? + * It does not because classic (non-Minshall) sender nagles + * guided by not-acked frames not depending on size. + * And it does not help NODELAY sender, because latency + * is too high in any case. The only result is timer trashing + * and redundant ACKs. Grr... Seems, I missed something. --ANK + * + * Let me to comment out this yet... TCP should work + * perfectly without this. 
--ANK + */ + if (len < (tp->ack.rcv_mss >> 1) && skb->h.th->psh) + tp->ack.ato = TCP_ATO_MIN; +#endif + } +} + + +static __inline__ void tcp_enter_quickack_mode(struct tcp_opt *tp) +{ + unsigned quickacks = tcp_receive_window(tp)/(2*tp->ack.rcv_mss); + + tp->ack.quick = max(min(quickacks, 127), 1); + + if (!tp->tstamp_ok && tp->ack.quick>2) { + /* Quick ACKs are _dangerous_, if RTTM is not used. + * See comment in tcp_init_metrics(). We still help + * them to overcome the most difficult, initial + * phase of slow start. + */ + tp->ack.quick = 2; + } +} + +/* Send ACKs quickly, if "quick" count is not ehausted + * and the session is not interactive. + */ + +static __inline__ int tcp_in_quickack_mode(struct tcp_opt *tp) +{ + return (tp->ack.quick && !tp->ack.pingpong); +} + /* There is something which you must keep in mind when you analyze the * behavior of the tp->ato delayed ack timeout interval. When a * connection starts up, we want to ack as quickly as possible. The @@ -97,53 +192,52 @@ static int prune_queue(struct sock *sk); * each ACK we send, he increments snd_cwnd and transmits more of his * queue. -DaveM */ -static void tcp_delack_estimator(struct tcp_opt *tp) +static void tcp_event_data_recv(struct tcp_opt *tp, struct sk_buff *skb) { - if(tp->ato == 0) { - tp->lrcvtime = tcp_time_stamp; + u32 now; - /* Help sender leave slow start quickly, - * and also makes sure we do not take this - * branch ever again for this connection. + tcp_measure_rcv_mss(tp, skb); + + tp->ack.pending = 1; + + now = tcp_time_stamp; + + if (!tp->ack.ato) { + /* The _first_ data packet received, initialize + * delayed ACK engine. */ - tp->ato = 1; + + /* Help sender leave slow start quickly. */ tcp_enter_quickack_mode(tp); + + /* Pingpong is off, session is not interactive by default */ + tp->ack.pingpong = 0; + + /* ATO is minimal */ + tp->ack.ato = TCP_ATO_MIN; } else { - int m = tcp_time_stamp - tp->lrcvtime; - - tp->lrcvtime = tcp_time_stamp; - if(m <= 0) - m = 1; - if(m > tp->rto) - tp->ato = tp->rto; - else { - /* This funny shift makes sure we - * clear the "quick ack mode" bit. + int m = now - tp->ack.lrcvtime; + + if (m > TCP_ATO_MAX/2) { + /* Do not touch ATO, if interval is out of bounds. + * It will be deflated by delack timer, if our peer + * really sends too rarely. */ - tp->ato = ((tp->ato << 1) >> 2) + m; + if (m > tp->rto) { + /* Too long gap. Apparently sender falled to + * restart window, so that we send ACKs quickly. + */ + tcp_enter_quickack_mode(tp); + } + } else { + if (m <= 0) + m = TCP_ATO_MIN/2; + tp->ack.ato = (tp->ack.ato >> 1) + m; } } + tp->ack.lrcvtime = now; } -/* - * Remember to send an ACK later. - */ -static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th, - struct sk_buff *skb) -{ - tp->delayed_acks++; - - /* Tiny-grams with PSH set artifically deflate our - * ato measurement, but with a lower bound. - */ - if(th->psh && (skb->len < (tp->rcv_mss >> 1))) { - /* Preserve the quickack state. */ - if((tp->ato & 0x7fffffff) > HZ/50) - tp->ato = ((tp->ato & 0x80000000) | - (HZ/50)); - } -} - /* Called to compute a smoothed rtt estimate. 
The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge @@ -209,10 +303,10 @@ static __inline__ void tcp_set_rto(struct tcp_opt *tp) */ static __inline__ void tcp_bound_rto(struct tcp_opt *tp) { - if (tp->rto > 120*HZ) - tp->rto = 120*HZ; - if (tp->rto < HZ/5) - tp->rto = HZ/5; + if (tp->rto < TCP_RTO_MIN) + tp->rto = TCP_RTO_MIN; + else if (tp->rto > TCP_RTO_MAX) + tp->rto = TCP_RTO_MAX; } /* Save metrics learned by this TCP session. @@ -224,7 +318,9 @@ static void tcp_update_metrics(struct sock *sk) struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct dst_entry *dst = __sk_dst_get(sk); - if (dst) { + dst_confirm(dst); + + if (dst && (dst->flags&DST_HOST)) { int m; if (tp->backoff || !tp->srtt) { @@ -237,8 +333,6 @@ static void tcp_update_metrics(struct sock *sk) return; } - dst_confirm(dst); - m = dst->rtt - tp->srtt; /* If newly calculated rtt larger than stored one, @@ -308,10 +402,18 @@ static void tcp_init_metrics(struct sock *sk) dst_confirm(dst); + if (dst->mxlock&(1<<RTAX_CWND)) + tp->snd_cwnd_clamp = dst->cwnd; + if (dst->ssthresh) { + tp->snd_ssthresh = dst->ssthresh; + if (tp->snd_ssthresh > tp->snd_cwnd_clamp) + tp->snd_ssthresh = tp->snd_cwnd_clamp; + } + if (dst->rtt == 0) goto reset; - if (!tp->srtt || !tp->saw_tstamp) + if (!tp->srtt && dst->rtt < (TCP_TIMEOUT_INIT<<3)) goto reset; /* Initial rtt is determined from SYN,SYN-ACK. @@ -334,14 +436,9 @@ static void tcp_init_metrics(struct sock *sk) tp->mdev = dst->rttvar; tcp_set_rto(tp); tcp_bound_rto(tp); - - if (dst->mxlock&(1<<RTAX_CWND)) - tp->snd_cwnd_clamp = dst->cwnd; - if (dst->ssthresh) { - tp->snd_ssthresh = dst->ssthresh; - if (tp->snd_ssthresh > tp->snd_cwnd_clamp) - tp->snd_ssthresh = tp->snd_cwnd_clamp; - } + if (tp->rto < TCP_TIMEOUT_INIT && !tp->saw_tstamp) + goto reset; + tp->snd_cwnd = tcp_init_cwnd(tp); return; @@ -357,9 +454,6 @@ reset: } } -#define PAWS_24DAYS (60 * 60 * 24 * 24) - - /* WARNING: this must not be called if tp->saw_tstamp was false. */ extern __inline__ void tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq) @@ -374,7 +468,7 @@ tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq) */ if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0 || - xtime.tv_sec >= tp->ts_recent_stamp + PAWS_24DAYS) { + xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS) { tp->ts_recent = tp->rcv_tsval; tp->ts_recent_stamp = xtime.tv_sec; } @@ -384,7 +478,7 @@ tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq) extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct sk_buff *skb) { return ((s32)(tp->rcv_tsval - tp->ts_recent) < 0 && - xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS + xtime.tv_sec < tp->ts_recent_stamp + TCP_PAWS_24DAYS /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM @@ -411,8 +505,13 @@ extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct sk_buff *skb) static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) { u32 end_window = tp->rcv_wup + tp->rcv_wnd; +#ifdef TCP_FORMAL_WINDOW + u32 rcv_wnd = tcp_receive_window(tp); +#else + u32 rcv_wnd = tp->rcv_wnd; +#endif - if (tp->rcv_wnd && + if (rcv_wnd && after(end_seq, tp->rcv_nxt) && before(seq, end_window)) return 1; @@ -424,8 +523,13 @@ static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) /* This functions checks to see if the tcp header is actually acceptable. 
*/ extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) { +#ifdef TCP_FORMAL_WINDOW + u32 rcv_wnd = tcp_receive_window(tp); +#else + u32 rcv_wnd = tp->rcv_wnd; +#endif if (seq == tp->rcv_nxt) - return (tp->rcv_wnd || (end_seq == seq)); + return (rcv_wnd || (end_seq == seq)); return __tcp_sequence(tp, seq, end_seq); } @@ -433,8 +537,6 @@ extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) /* When we get a reset we do this. */ static void tcp_reset(struct sock *sk) { - sk->zapped = 1; - /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->state) { case TCP_SYN_SENT: @@ -447,9 +549,8 @@ static void tcp_reset(struct sock *sk) return; default: sk->err = ECONNRESET; - }; - tcp_set_state(sk, TCP_CLOSE); - tcp_clear_xmit_timers(sk); + } + tcp_done(sk); } @@ -658,17 +759,18 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) if (tp->high_seq == 0 || after(ack, tp->high_seq)) { tp->dup_acks++; if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) { - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); - if (tp->snd_ssthresh > tp->snd_cwnd_clamp) - tp->snd_ssthresh = tp->snd_cwnd_clamp; - tp->snd_cwnd = (tp->snd_ssthresh + 3); - tp->high_seq = tp->snd_nxt; + __tcp_enter_cong_avoid(tp); + /* ... and account for 3 ACKs, which are + * already received to this time. + */ + tp->snd_cwnd += 3; + if(!tp->fackets_out) tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); else tcp_fack_retransmit(sk); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); } } else if (++tp->dup_acks > 3) { /* 2. Each time another duplicate ACK arrives, increment @@ -733,7 +835,7 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) if (ack != tp->snd_una && before(ack, tp->high_seq)) { tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); } } else { /* FACK style, fill any remaining holes in @@ -752,7 +854,8 @@ static __inline__ void tcp_cong_avoid(struct tcp_opt *tp) { if (tp->snd_cwnd <= tp->snd_ssthresh) { /* In "safe" area, increase. */ - tp->snd_cwnd++; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; } else { /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd @@ -826,23 +929,23 @@ static void tcp_ack_probe(struct sock *sk, __u32 ack) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - /* Our probe was answered. */ - tp->probes_out = 0; - /* Was it a usable window open? */ - /* should always be non-null */ - if (tp->send_head != NULL && - !before (ack + tp->snd_wnd, TCP_SKB_CB(tp->send_head)->end_seq)) { - tp->backoff = 0; - tp->pending = 0; - tcp_clear_xmit_timer(sk, TIME_PROBE0); - } else { - tcp_reset_xmit_timer(sk, TIME_PROBE0, - min(tp->rto << tp->backoff, 120*HZ)); + if (tp->send_head != NULL) { + if (!after(TCP_SKB_CB(tp->send_head)->end_seq, ack + tp->snd_wnd)) { + tp->backoff = 0; + tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0); + /* If packets_out==0, socket must be waked up by + * subsequent tcp_data_snd_check(). This function is + * not for random using! + */ + } else if (!tp->packets_out) { + tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, + min(tp->rto << tp->backoff, TCP_RTO_MAX)); + } } } - + /* Should we open up the congestion window? 
*/ static __inline__ int should_advance_cwnd(struct tcp_opt *tp, int flag) { @@ -914,18 +1017,30 @@ static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) { struct sk_buff *skb = skb_peek(&sk->write_queue); +#ifdef TCP_DEBUG + /* It occured in 2.3, because of racy timers. Namely, + * retransmit timer did not check packets_out and retransmitted + * send_head sometimes and, hence, messed all the write_queue. + * Now it is impossible, I bet. --ANK + */ + if (skb == NULL) { + printk("Sucks! packets_out=%d, sk=%p, %d\n", tp->packets_out, sk, sk->state); + return; + } +#endif + /* Some data was ACK'd, if still retransmitting (due to a * timeout), resend more of the retransmit queue. The * congestion window is handled properly by that code. */ if (tp->retransmits) { tcp_xmit_retransmit_queue(sk); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); } else { __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when); if ((__s32)when < 0) when = 1; - tcp_reset_xmit_timer(sk, TIME_RETRANS, when); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, when); } } @@ -938,13 +1053,8 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 seq = 0; u32 seq_rtt = 0; - if(sk->zapped) - return(1); /* Dead, can't ack any more so why bother */ - - if (tp->pending == TIME_KEEPOPEN) - tp->probes_out = 0; - - tp->rcv_tstamp = tcp_time_stamp; + if(sk->state == TCP_CLOSE) + return 1; /* Dead, can't ack any more so why bother */ /* If the ack is newer than sent or older than previous acks * then we can probably ignore it. @@ -953,10 +1063,8 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, goto uninteresting_ack; /* If there is data set flag 1 */ - if (len != th->doff*4) { + if (len != th->doff*4) flag |= FLAG_DATA; - tcp_delack_estimator(tp); - } /* Update our send window. */ @@ -970,31 +1078,53 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) { flag |= FLAG_WIN_UPDATE; - tp->snd_wnd = nwin; + if (tp->snd_wnd != nwin) { + tp->snd_wnd = nwin; + + /* Note, it is the only place, where + * fast path is recovered for sending TCP. + */ + if (skb_queue_len(&tp->out_of_order_queue) == 0 && +#ifdef TCP_FORMAL_WINDOW + tcp_receive_window(tp) && +#endif + !tp->urg_data) + tcp_fast_path_on(tp); + + if (nwin > tp->max_window) { + tp->max_window = nwin; + tcp_sync_mss(sk, tp->pmtu_cookie); + } + } tp->snd_wl1 = ack_seq; tp->snd_wl2 = ack; - - if (nwin > tp->max_window) - tp->max_window = nwin; } } + /* BEWARE! From this place and until return from this function + * snd_nxt and snd_wnd are out of sync. All the routines, called + * from here must get "ack" as argument or they should not depend + * on right edge of window. It is _UGLY_. It cries to be fixed. --ANK + */ + /* We passed data and got it acked, remove any soft error * log. Something worked... */ sk->err_soft = 0; + tp->probes_out = 0; + tp->rcv_tstamp = tcp_time_stamp; + + /* See if we can take anything off of the retransmit queue. */ + flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. */ - if (tp->pending == TIME_PROBE0) + if (tcp_timer_is_set(sk, TCP_TIME_PROBE0)) tcp_ack_probe(sk, ack); - /* See if we can take anything off of the retransmit queue. 
*/ - flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt); - /* We must do this here, before code below clears out important * state contained in tp->fackets_out and tp->retransmits. -DaveM */ @@ -1036,7 +1166,7 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, if (flag & FLAG_DATA_ACKED) tcp_ack_packets_out(sk, tp); } else { - tcp_clear_xmit_timer(sk, TIME_RETRANS); + tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS); } flag &= (FLAG_DATA | FLAG_WIN_UPDATE); @@ -1074,9 +1204,42 @@ uninteresting_ack: return 0; } +int tcp_paws_check(struct tcp_opt *tp, int rst) +{ + if ((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) + return 0; + if (xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS) + return 0; + + /* RST segments are not recommended to carry timestamp, + and, if they do, it is recommended to ignore PAWS because + "their cleanup function should take precedence over timestamps." + Certainly, it is mistake. It is necessary to understand the reasons + of this constraint to relax it: if peer reboots, clock may go + out-of-sync and half-open connections will not be reset. + Actually, the problem would be not existing if all + the implementations followed draft about maintaining clock + via reboots. Linux-2.2 DOES NOT! + + However, we can relax time bounds for RST segments to MSL. + */ + if (rst && xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_MSL) + return 0; + return 1; +} + +static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) +{ + if (seq == s_win) + return 1; + if (after(end_seq, s_win) && before(seq, e_win)) + return 1; + return (seq == e_win && seq == end_seq); +} + /* New-style handling of TIME_WAIT sockets. */ -/* Must be called only from BH context. */ +/* Must be called with locally disabled BHs. */ void tcp_timewait_kill(struct tcp_tw_bucket *tw) { struct tcp_ehash_bucket *ehead; @@ -1121,13 +1284,6 @@ void tcp_timewait_kill(struct tcp_tw_bucket *tw) tcp_tw_put(tw); } -/* We come here as a special case from the AF specific TCP input processing, - * and the SKB has no owner. Essentially handling this is very simple, - * we just keep silently eating rx'd packets until none show up for the - * entire timeout period. The only special cases are for BSD TIME_WAIT - * reconnects and SYN/RST bits being set in the TCP header. - */ - /* * * Main purpose of TIME-WAIT state is to close connection gracefully, * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN @@ -1149,6 +1305,12 @@ void tcp_timewait_kill(struct tcp_tw_bucket *tw) * The algorithm below is based on FORMAL INTERPRETATION of RFCs. * When you compare it to RFCs, please, read section SEGMENT ARRIVES * from the very beginning. + * + * NOTE. With recycling (and later with fin-wait-2) TW bucket + * is _not_ stateless. It means, that strictly speaking we must + * spinlock it. I do not want! Well, probability of misbehaviour + * is ridiculously low and, seems, we could use some mb() tricks + * to avoid misread sequence numbers, states etc. 
--ANK */ enum tcp_tw_status tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, @@ -1157,7 +1319,75 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, struct tcp_opt tp; int paws_reject = 0; - /* RFC 1122: + tp.saw_tstamp = 0; + if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) { + tcp_parse_options(NULL, th, &tp, 0); + + if (tp.saw_tstamp) { + tp.ts_recent = tw->ts_recent; + tp.ts_recent_stamp = tw->ts_recent_stamp; + paws_reject = tcp_paws_check(&tp, th->rst); + } + } + + if (tw->substate == TCP_FIN_WAIT2) { + /* Just repeat all the checks of tcp_rcv_state_process() */ + + /* Out of window, send ACK */ + if (paws_reject || + !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, + tw->rcv_nxt, tw->rcv_nxt + tw->rcv_wnd)) + return TCP_TW_ACK; + + if (th->rst) + goto kill; + + if (th->syn && TCP_SKB_CB(skb)->seq != tw->syn_seq) + goto kill_with_rst; + + /* Dup ACK? */ + if (!after(TCP_SKB_CB(skb)->end_seq, tw->rcv_nxt)) { + tcp_tw_put(tw); + return TCP_TW_SUCCESS; + } + + /* New data or FIN. If new data arrive after half-duplex close, + * reset. + */ + if (!th->fin || TCP_SKB_CB(skb)->end_seq != tw->rcv_nxt+1) { +kill_with_rst: + tcp_tw_deschedule(tw); + tcp_timewait_kill(tw); + tcp_tw_put(tw); + return TCP_TW_RST; + } + + /* FIN arrived, enter true time-wait state. */ + tw->substate = TCP_TIME_WAIT; + tw->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if (tp.saw_tstamp) { + tw->ts_recent_stamp = xtime.tv_sec; + tw->ts_recent = tp.rcv_tsval; + } + + /* I am shamed, but failed to make it more elegant. + * Yes, it is direct reference to IP, which is impossible + * to generalize to IPv6. Taking into account that IPv6 + * do not undertsnad recycling in any case, it not + * a big problem in practice. --ANK */ + if (tw->family == AF_INET && + sysctl_tcp_tw_recycle && tw->ts_recent_stamp && + tcp_v4_tw_remember_stamp(tw)) + tcp_tw_schedule(tw, tw->timeout); + else + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); + return TCP_TW_ACK; + } + + /* + * Now real TIME-WAIT state. + * + * RFC 1122: * "When a connection is [...] on TIME-WAIT state [...] * [a TCP] MAY accept a new SYN from the remote TCP to * reopen the connection directly, if it: @@ -1171,47 +1401,31 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, * to be an old duplicate". */ - tp.saw_tstamp = 0; - if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) { - tcp_parse_options(NULL, th, &tp, 0); - - paws_reject = tp.saw_tstamp && - ((s32)(tp.rcv_tsval - tw->ts_recent) < 0 && - xtime.tv_sec < tw->ts_recent_stamp + PAWS_24DAYS); - } - if (!paws_reject && (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq && TCP_SKB_CB(skb)->seq == tw->rcv_nxt)) { /* In window segment, it may be only reset or bare ack. */ if (th->rst) { -#ifdef CONFIG_TCP_TW_RECYCLE - /* When recycling, always follow rfc1337, - * but mark bucket as ready to recycling immediately. - */ - if (sysctl_tcp_tw_recycle) { - /* May kill it now. */ - tw->rto = 0; - tw->ttd = jiffies; - } else -#endif /* This is TIME_WAIT assasination, in two flavors. * Oh well... nobody has a sufficient solution to this * protocol bug yet. 
*/ - if(sysctl_tcp_rfc1337 == 0) { + if (sysctl_tcp_rfc1337 == 0) { +kill: tcp_tw_deschedule(tw); tcp_timewait_kill(tw); + tcp_tw_put(tw); + return TCP_TW_SUCCESS; } - } else { - tcp_tw_reschedule(tw); } + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); if (tp.saw_tstamp) { tw->ts_recent = tp.rcv_tsval; tw->ts_recent_stamp = xtime.tv_sec; } + tcp_tw_put(tw); return TCP_TW_SUCCESS; } @@ -1235,7 +1449,7 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, if (th->syn && !th->rst && !th->ack && !paws_reject && (after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt) || - (tp.saw_tstamp && tw->ts_recent != tp.rcv_tsval))) { + (tp.saw_tstamp && (s32)(tw->ts_recent - tp.rcv_tsval) < 0))) { u32 isn = tw->snd_nxt + 2; if (isn == 0) isn++; @@ -1243,20 +1457,18 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, return TCP_TW_SYN; } + if (paws_reject) + NET_INC_STATS_BH(PAWSEstabRejected); + if(!th->rst) { /* In this case we must reset the TIMEWAIT timer. - - If it is ACKless SYN it may be both old duplicate - and new good SYN with random sequence number <rcv_nxt. - Do not reschedule in the last case. + * + * If it is ACKless SYN it may be both old duplicate + * and new good SYN with random sequence number <rcv_nxt. + * Do not reschedule in the last case. */ - if (paws_reject || th->ack) { - tcp_tw_reschedule(tw); -#ifdef CONFIG_TCP_TW_RECYCLE - tw->rto = min(120*HZ, tw->rto<<1); - tw->ttd = jiffies + tw->rto; -#endif - } + if (paws_reject || th->ack) + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); /* Send ACK. Note, we do not put the bucket, * it will be released by caller. @@ -1267,8 +1479,8 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, return TCP_TW_SUCCESS; } -/* Enter the time wait state. This is always called from BH - * context. Essentially we whip up a timewait bucket, copy the +/* Enter the time wait state. This is called with locally disabled BH. + * Essentially we whip up a timewait bucket, copy the * relevant info into it from the SK, and mess with hash chains * and list linkage. */ @@ -1286,6 +1498,7 @@ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) sk->next->pprev = sk->pprev; *sk->pprev = sk->next; sk->pprev = NULL; + sock_prot_dec_use(sk->prot); } /* Step 2: Hash TW into TIMEWAIT half of established hash table. */ @@ -1312,41 +1525,49 @@ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) tw->tb->owners = (struct sock*)tw; tw->bind_pprev = &tw->tb->owners; spin_unlock(&bhead->lock); - - /* Step 4: Un-charge protocol socket in-use count. */ - sock_prot_dec_use(sk->prot); } /* - * Move a socket to time-wait. + * Move a socket to time-wait or dead fin-wait-2 state. */ -void tcp_time_wait(struct sock *sk) +void tcp_time_wait(struct sock *sk, int state, int timeo) { - struct tcp_tw_bucket *tw; + struct tcp_tw_bucket *tw = NULL; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int recycle_ok = 0; + + if (sysctl_tcp_tw_recycle && tp->ts_recent_stamp) + recycle_ok = tp->af_specific->remember_stamp(sk); + + if (tcp_tw_count < sysctl_tcp_max_tw_buckets) + tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); - tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); if(tw != NULL) { + int rto = (tp->rto<<2) - (tp->rto>>1); + /* Give us an identity. 
*/ tw->daddr = sk->daddr; tw->rcv_saddr = sk->rcv_saddr; tw->bound_dev_if= sk->bound_dev_if; tw->num = sk->num; tw->state = TCP_TIME_WAIT; + tw->substate = state; tw->sport = sk->sport; tw->dport = sk->dport; tw->family = sk->family; tw->reuse = sk->reuse; - tw->hashent = sk->hashent; - tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt; - tw->snd_nxt = sk->tp_pinfo.af_tcp.snd_nxt; - tw->ts_recent = sk->tp_pinfo.af_tcp.ts_recent; - tw->ts_recent_stamp= sk->tp_pinfo.af_tcp.ts_recent_stamp; -#ifdef CONFIG_TCP_TW_RECYCLE - tw->rto = sk->tp_pinfo.af_tcp.rto; - tw->ttd = jiffies + 2*tw->rto; -#endif + tw->rcv_wscale = tp->rcv_wscale; atomic_set(&tw->refcnt, 0); + tw->hashent = sk->hashent; + tw->rcv_nxt = tp->rcv_nxt; + tw->snd_nxt = tp->snd_nxt; + tw->rcv_wnd = tcp_receive_window(tp); + tw->syn_seq = tp->syn_seq; + tw->ts_recent = tp->ts_recent; + tw->ts_recent_stamp= tp->ts_recent_stamp; + tw->pprev_death = NULL; + #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if(tw->family == PF_INET6) { memcpy(&tw->v6_daddr, @@ -1361,22 +1582,28 @@ void tcp_time_wait(struct sock *sk) __tcp_tw_hashdance(sk, tw); /* Get the TIME_WAIT timeout firing. */ - tcp_tw_schedule(tw); + if (timeo < rto) + timeo = rto; - /* CLOSE the SK. */ - if(sk->state == TCP_ESTABLISHED) - tcp_statistics[smp_processor_id()*2].TcpCurrEstab--; - sk->state = TCP_CLOSE; + if (recycle_ok) { + tw->timeout = rto; + } else { + tw->timeout = TCP_TIMEWAIT_LEN; + if (state == TCP_TIME_WAIT) + timeo = TCP_TIMEWAIT_LEN; + } + + tcp_tw_schedule(tw, timeo); } else { - /* Sorry, we're out of memory, just CLOSE this + /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than * non-graceful socket closings. */ - tcp_set_state(sk, TCP_CLOSE); + if (net_ratelimit()) + printk(KERN_INFO "TCP: time wait bucket table overflow\n"); } tcp_update_metrics(sk); - tcp_clear_xmit_timers(sk); tcp_done(sk); } @@ -1397,10 +1624,13 @@ void tcp_time_wait(struct sock *sk) static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) { - sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + tp->fin_seq = TCP_SKB_CB(skb)->end_seq; tcp_send_ack(sk); + sk->shutdown |= RCV_SHUTDOWN; + switch(sk->state) { case TCP_SYN_RECV: case TCP_ESTABLISHED: @@ -1427,7 +1657,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) break; case TCP_FIN_WAIT2: /* Received a FIN -- send ACK and enter TIME_WAIT. */ - tcp_time_wait(sk); + tcp_time_wait(sk, TCP_TIME_WAIT, 0); break; default: /* Only TCP_LISTEN and TCP_CLOSE are left, in these @@ -1435,9 +1665,17 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) */ printk("tcp_fin: Impossible, sk->state=%d\n", sk->state); break; - } + }; + + /* It _is_ possible, that we have something out-of-order _after_ FIN. + * Probably, we should reset in this case. For now drop them. + */ + __skb_queue_purge(&tp->out_of_order_queue); + if (tp->sack_ok) + tp->num_sacks = 0; + if (!sk->dead) { - wake_up_interruptible(sk->sleep); + sk->state_change(sk); sock_wake_async(sk->socket, 1, POLL_HUP); } } @@ -1622,6 +1860,7 @@ static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sp->end_seq = TCP_SKB_CB(new_skb)->end_seq; } + /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. 
*/ @@ -1658,6 +1897,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct sk_buff *skb1; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int eaten = 0; /* Queue data for delivery to the user. * Packets in sequence go to the receive queue. @@ -1665,33 +1905,68 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { /* Ok. In sequence. */ - queue_and_out: + if (tp->ucopy.task == current && + tp->copied_seq == tp->rcv_nxt && + tp->ucopy.len && + sk->lock.users && + !tp->urg_data) { + int chunk = min(skb->len, tp->ucopy.len); + + local_bh_enable(); + if (memcpy_toiovec(tp->ucopy.iov, skb->data, chunk)) { + sk->err = EFAULT; + sk->error_report(sk); + } + local_bh_disable(); + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; + eaten = (chunk == skb->len && !skb->h.th->fin); + } + + if (!eaten) { +queue_and_out: + skb_set_owner_r(skb, sk); + __skb_queue_tail(&sk->receive_queue, skb); + } dst_confirm(sk->dst_cache); - __skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if(skb->h.th->fin) { + if(skb->len) + tcp_event_data_recv(tp, skb); + if(skb->h.th->fin) tcp_fin(skb, sk, skb->h.th); - } else { - tcp_remember_ack(tp, skb->h.th, skb); - } + /* This may have eaten into a SACK block. */ if(tp->sack_ok && tp->num_sacks) tcp_sack_remove_skb(tp, skb); tcp_ofo_queue(sk); /* Turn on fast path. */ - if (skb_queue_len(&tp->out_of_order_queue) == 0) - tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) | - ntohl(TCP_FLAG_ACK) | - tp->snd_wnd); + if (skb_queue_len(&tp->out_of_order_queue) == 0 && +#ifdef TCP_FORMAL_WINDOW + tcp_receive_window(tp) && +#endif + !tp->urg_data) + tcp_fast_path_on(tp); + + if (eaten) + kfree_skb(skb); + + if (!sk->dead) { + wake_up_interruptible(sk->sleep); + sock_wake_async(sk->socket,1, POLL_IN); + } return; } - + /* An old packet, either a retransmit or some packet got lost. */ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { - /* A retransmit, 2nd most common case. Force an imediate ack. */ - SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq); + /* A retransmit, 2nd most common case. Force an imediate ack. + * + * It is impossible, seq is checked by top level. + */ + NETDEBUG(printk("retransmit in tcp_data_queue: seq %X\n", TCP_SKB_CB(skb)->seq)); tcp_enter_quickack_mode(tp); + tp->ack.pending = 1; kfree_skb(skb); return; } @@ -1706,15 +1981,17 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) } /* Ok. This is an out_of_order segment, force an ack. */ - tp->delayed_acks++; - tcp_enter_quickack_mode(tp); + tp->ack.pending = 1; /* Disable header prediction. */ tp->pred_flags = 0; + SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + skb_set_owner_r(skb, sk); + if (skb_peek(&tp->out_of_order_queue) == NULL) { /* Initial out of order segment, build 1 SACK. */ if(tp->sack_ok) { @@ -1758,6 +2035,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) } } } + return; } @@ -1767,7 +2045,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) * room, then we will just have to discard the packet. 
*/ -static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) +static void tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) { struct tcphdr *th; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); @@ -1777,11 +2055,11 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) skb_trim(skb, len - (th->doff*4)); if (skb->len == 0 && !th->fin) - return(0); + goto drop; /* * If our receive queue has grown past its limits shrink it. - * Make sure to do this before moving snd_nxt, otherwise + * Make sure to do this before moving rcv_nxt, otherwise * data might be acked for that we don't have enough room. */ if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) { @@ -1789,7 +2067,7 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) /* Still not enough room. That can happen when * skb->true_size differs significantly from skb->len. */ - return 0; + goto drop; } } @@ -1799,29 +2077,20 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n"); tp->rcv_nxt = tp->copied_seq; } + return; - /* Above, tcp_data_queue() increments delayed_acks appropriately. - * Now tell the user we may have some data. - */ - if (!sk->dead) { - wake_up_interruptible(sk->sleep); - sock_wake_async(sk->socket,1, POLL_IN); - } - return(1); +drop: + kfree_skb(skb); } static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) && - tcp_packets_in_flight(tp) < tp->snd_cwnd) { - /* Put more data onto the wire. */ - tcp_write_xmit(sk); - } else if (tp->packets_out == 0 && !tp->pending) { - /* Start probing the receivers window. */ - tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); - } + if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || + tcp_packets_in_flight(tp) >= tp->snd_cwnd || + tcp_write_xmit(sk)) + tcp_check_probe_timer(sk, tp); } static __inline__ void tcp_data_snd_check(struct sock *sk) @@ -1832,57 +2101,6 @@ static __inline__ void tcp_data_snd_check(struct sock *sk) __tcp_data_snd_check(sk, skb); } -/* - * Adapt the MSS value used to make delayed ack decision to the - * real world. - * - * The constant 536 hasn't any good meaning. In IPv4 world - * MTU may be smaller, though it contradicts to RFC1122, which - * states that MSS must be at least 536. - * We use the constant to do not ACK each second - * packet in a stream of tiny size packets. - * It means that super-low mtu links will be aggressively delacked. - * Seems, it is even good. If they have so low mtu, they are weirdly - * slow. - * - * AK: BTW it may be useful to add an option to lock the rcv_mss. - * this way the beowulf people wouldn't need ugly patches to get the - * ack frequencies they want and it would be an elegant way to tune delack. - */ -static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - unsigned int len, lss; - - lss = tp->last_seg_size; - tp->last_seg_size = 0; - - /* skb->len may jitter because of SACKs, even if peer - * sends good full-sized frames. - */ - len = skb->len; - if (len >= tp->rcv_mss) { - tp->rcv_mss = len; - } else { - /* Otherwise, we make more careful check taking into account, - * that SACKs block is variable. - * - * "len" is invariant segment length, including TCP header. 
- */ - len = skb->tail - skb->h.raw; - if (len >= 536 + sizeof(struct tcphdr)) { - /* Subtract also invariant (if peer is RFC compliant), - * tcp header plus fixed timestamp option length. - * Resulting "len" is MSS free of SACK jitter. - */ - len -= tp->tcp_header_len; - if (len == lss) - tp->rcv_mss = len; - tp->last_seg_size = len; - } - } -} - /* * Check if sending an ack is needed. */ @@ -1904,26 +2122,25 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) * start in an expediant manner. */ - /* Two full frames received or... */ - if (((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) || - /* We will update the window "significantly" or... */ - tcp_raise_window(sk) || - /* We entered "quick ACK" mode or... */ + /* More than one full frame received or... */ + if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss) || + /* We ACK each frame or... */ tcp_in_quickack_mode(tp) || - /* We have out of order data */ - (ofo_possible && (skb_peek(&tp->out_of_order_queue) != NULL))) { + /* We have out of order data or */ + (ofo_possible && + skb_peek(&tp->out_of_order_queue) != NULL)) { /* Then ack it now */ tcp_send_ack(sk); } else { /* Else, send delayed ack. */ - tcp_send_delayed_ack(sk, HZ/2); + tcp_send_delayed_ack(sk); } } static __inline__ void tcp_ack_snd_check(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - if (tp->delayed_acks == 0) { + if (tp->ack.pending == 0) { /* We sent a data segment already. */ return; } @@ -1975,7 +2192,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) */ if (tp->urg_seq == tp->copied_seq) tp->copied_seq++; /* Move the copied sequence on correctly */ - tp->urg_data = URG_NOTYET; + tp->urg_data = TCP_URG_NOTYET; tp->urg_seq = ptr; /* Disable header prediction. */ @@ -1992,12 +2209,12 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len tcp_check_urg(sk,th); /* Do we wait for any urgent data? - normally not... */ - if (tp->urg_data == URG_NOTYET) { + if (tp->urg_data == TCP_URG_NOTYET) { u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4); /* Is the urgent pointer pointing into this packet? */ if (ptr < len) { - tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th); + tp->urg_data = TCP_URG_VALID | *(ptr + (unsigned char *) th); if (!sk->dead) sk->data_ready(sk,0); } @@ -2014,7 +2231,8 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len static int prune_queue(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - struct sk_buff * skb; + struct sk_buff *skb; + int pruned = 0; SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); @@ -2024,7 +2242,9 @@ static int prune_queue(struct sock *sk) skb = __skb_dequeue_tail(&tp->out_of_order_queue); if(skb != NULL) { /* Free it all. */ - do { net_statistics[smp_processor_id()*2].OfoPruned += skb->len; + do { + pruned += skb->len; + net_statistics[smp_processor_id()*2].OfoPruned += skb->len; kfree_skb(skb); skb = __skb_dequeue_tail(&tp->out_of_order_queue); } while(skb != NULL); @@ -2059,13 +2279,47 @@ static int prune_queue(struct sock *sk) * if we are really having our buffer space abused we stop accepting * new receive data. * + * 8) The arguments are interesting, but I even cannot imagine + * what kind of arguments could force us to drop NICE, ALREADY + * RECEIVED DATA only to get one more packet? --ANK + * * FIXME: it should recompute SACK state and only remove enough * buffers to get into bounds again. 
The current scheme loses - * badly sometimes on links with large RTT, especially when - * the driver has high overhead per skb. - * (increasing the rcvbuf is not enough because it inflates the - * the window too, disabling flow control effectively) -AK + * badly sometimes on links with large RTT, especially when + * the driver has high overhead per skb. + * (increasing the rcvbuf is not enough because it inflates the + * the window too, disabling flow control effectively) -AK + * + * Mmm... Why not to scale it seprately then? Just replace + * / WINDOW_ADVERTISE_DIVISOR with >> sk->window_advertise_scale + * and adjust it dynamically, when TCP window flow control + * fails? -ANK + */ + + /* F.e. one possible tactics is: */ + do { + u32 new_clamp = (tp->rcv_nxt-tp->copied_seq) + pruned; + + /* This guy is not a good guy. I bet, he martirized cats, + * when was child and grew up to finished sadist. Clamp him! + */ + if (new_clamp > 3*tp->ack.rcv_mss) + new_clamp -= tp->ack.rcv_mss; + else + new_clamp = 2*tp->ack.rcv_mss; + tp->window_clamp = min(tp->window_clamp, new_clamp); + } while (0); + /* Though it should be made earlier, when we are still not + * congested. This header prediction logic sucks + * without true implementation of VJ algorithm. + * I am really anxious. How was it possible to combine + * header prediction and sending ACKs outside of recvmsg() context? + * They _are_ incompatible. We should not advance window so + * brainlessly and we should not advertise so huge window from the very + * beginning. BTW window "prediction" does not speedup anything! + * SIlly, silly, silly. */ + if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1)) return 0; @@ -2073,6 +2327,57 @@ static int prune_queue(struct sock *sk) return -1; } +static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int chunk = skb->len - hlen; + int err; + + local_bh_enable(); + if (skb->ip_summed==CHECKSUM_UNNECESSARY) + err = memcpy_toiovec(tp->ucopy.iov, skb->h.raw + hlen, chunk); + else + err = copy_and_csum_toiovec(tp->ucopy.iov, skb, hlen); + + if (!err) { +update: + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; + local_bh_disable(); + return 0; + } + + if (err == -EFAULT) { + sk->err = EFAULT; + sk->error_report(sk); + goto update; + } + + local_bh_disable(); + return err; +} + +static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +{ + int result; + + if (sk->lock.users) { + local_bh_enable(); + result = __tcp_checksum_complete(skb); + local_bh_disable(); + } else { + result = __tcp_checksum_complete(skb); + } + return result; +} + +static __inline__ int +tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +{ + return skb->ip_summed != CHECKSUM_UNNECESSARY && + __tcp_checksum_complete_user(sk, skb); +} + /* * TCP receive function for the ESTABLISHED state. * @@ -2080,7 +2385,33 @@ static int prune_queue(struct sock *sk) * disabled when: * - A zero window was announced from us - zero window probing * is only handled properly in the slow path. - * - Out of order segments arrived. + * [ NOTE: actually, it was made incorrectly and nobody ever noticed + * this! Reason is clear: 1. Correct senders do not send + * to zero window. 2. Even if a sender sends to zero window, + * nothing terrible occurs. + * + * For now I cleaned this and fast path is really always disabled, + * when window is zero, but I would be more happy to remove these + * checks. Code will be only cleaner and _faster_. 
--ANK + * + * Later note. I've just found that slow path also accepts + * out of window segments, look at tcp_sequence(). So... + * it is the last argument: I repair all and comment out + * repaired code by TCP_FORMAL_WINDOW. + * [ I remember one rhyme from a chidren's book. (I apologize, + * the trasnlation is not rhymed 8)): people in one (jewish) village + * decided to build sauna, but divided to two parties. + * The first one insisted that battens should not be dubbed, + * another objected that foots will suffer of splinters, + * the first fended that dubbed wet battens are too slippy + * and people will fall and it is much more serious! + * Certaiinly, all they went to rabbi. + * After some thinking, he judged: "Do not be lazy! + * Certainly, dub the battens! But put them by dubbed surface down." + * ] + * ] + * + * - Out of order segments arrived. * - Urgent data is expected. * - There is no buffer space left * - Unexpected TCP flags/window values/header lengths are received @@ -2088,7 +2419,7 @@ static int prune_queue(struct sock *sk) * - Data is sent in both directions. Fast path only supports pure senders * or pure receivers (this means either the sequence number or the ack * value must stay constant) - * - Unexpected TCP option. + * - Unexpected TCP option. * * When these conditions are not satisfied it drops into a standard * receive procedure patterned after RFC793 to handle all cases. @@ -2116,7 +2447,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, * We do checksum and copy also but from device to kernel. */ - /* RED-PEN. Using static variables to pass function arguments * cannot be good idea... */ @@ -2133,13 +2463,12 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if ((tcp_flag_word(th) & ~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { - int tcp_header_len = th->doff*4; - - /* Timestamp header prediction */ + int tcp_header_len = tp->tcp_header_len; - /* Non-standard header f.e. SACKs -> slow path */ - if (tcp_header_len != tp->tcp_header_len) - goto slow_path; + /* Timestamp header prediction: tcp_header_len + * is automatically equal to th->doff*4 due to pred_flags + * match. + */ /* Check timestamp */ if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { @@ -2161,8 +2490,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, goto slow_path; /* Predicted packet is in window by definition. - seq == rcv_nxt and last_ack_sent <= rcv_nxt. - Hence, check seq<=last_ack_sent reduces to: + * seq == rcv_nxt and last_ack_sent <= rcv_nxt. + * Hence, check seq<=last_ack_sent reduces to: */ if (tp->rcv_nxt == tp->last_ack_sent) { tp->ts_recent = tp->rcv_tsval; @@ -2173,6 +2502,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (len <= tcp_header_len) { /* Bulk data transfer: sender */ if (len == tcp_header_len) { + /* We know that such packets are checksummed + * on entry. + */ tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len); kfree_skb(skb); @@ -2182,19 +2514,42 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, TCP_INC_STATS_BH(TcpInErrs); goto discard; } - } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una && - atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) { - /* Bulk data transfer: receiver */ - __skb_pull(skb,tcp_header_len); + } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una) { + int eaten = 0; - /* Is it possible to simplify this? 
*/ - tcp_measure_rcv_mss(sk, skb); + if (tp->ucopy.task == current && + tp->copied_seq == tp->rcv_nxt && + len - tcp_header_len <= tp->ucopy.len && + sk->lock.users) { + eaten = 1; + + NET_INC_STATS_BH(TCPHPHitsToUser); + + if (tcp_copy_to_iovec(sk, skb, tcp_header_len)) + goto csum_error; + + __skb_pull(skb,tcp_header_len); + } else { + if (tcp_checksum_complete_user(sk, skb)) + goto csum_error; + + if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) + goto step5; + + NET_INC_STATS_BH(TCPHPHits); + + /* Bulk data transfer: receiver */ + __skb_pull(skb,tcp_header_len); + + /* DO NOT notify forward progress here. + * It saves dozen of CPU instructions in fast path. --ANK + * And where is it signaled then ? -AK + * Nowhere. 8) --ANK + */ + __skb_queue_tail(&sk->receive_queue, skb); + skb_set_owner_r(skb, sk); + } - /* DO NOT notify forward progress here. - * It saves dozen of CPU instructions in fast path. --ANK - * And where is it signaled then ? -AK - */ - __skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; /* FIN bit check is not done since if FIN is set in @@ -2202,27 +2557,43 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ wake_up_interruptible(sk->sleep); sock_wake_async(sk->socket,1, POLL_IN); - tcp_delack_estimator(tp); - tcp_remember_ack(tp, th, skb); + tcp_event_data_recv(tp, skb); +#if 1/*def CONFIG_TCP_MORE_COARSE_ACKS*/ + if (eaten) { + if (tcp_in_quickack_mode(tp)) { + tcp_send_ack(sk); + } else { + tcp_send_delayed_ack(sk); + } + } else +#endif __tcp_ack_snd_check(sk, 0); + + if (eaten) + kfree_skb(skb); return 0; } /* Packet is in sequence, flags are trivial; - * only ACK is strange or we are tough on memory. - * Jump to step 5. + * only ACK is strange. Jump to step 5. */ + if (tcp_checksum_complete_user(sk, skb)) + goto csum_error; goto step5; } slow_path: + if (tcp_checksum_complete_user(sk, skb)) + goto csum_error; + /* * RFC1323: H1. Apply PAWS check first. */ if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp && tcp_paws_discard(tp, skb)) { if (!th->rst) { + NET_INC_STATS_BH(PAWSEstabRejected); tcp_send_ack(sk); goto discard; } @@ -2251,7 +2622,9 @@ slow_path: TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_wup, tp->rcv_wnd); } + tcp_enter_quickack_mode(tp); tcp_send_ack(sk); + NET_INC_STATS_BH(DelayedACKLost); goto discard; } @@ -2279,11 +2652,8 @@ step5: /* Process urgent data. */ tcp_urg(sk, th, len); - { /* step 7: process the segment text */ - int queued = tcp_data(skb, sk, len); - - tcp_measure_rcv_mss(sk, skb); + tcp_data(skb, sk, len); /* Be careful, tcp_data() may have put this into TIME_WAIT. 
*/ if(sk->state != TCP_CLOSE) { @@ -2291,12 +2661,13 @@ step5: tcp_ack_snd_check(sk); } - if (!queued) { - discard: - kfree_skb(skb); - } - } + return 0; + +csum_error: + TCP_INC_STATS_BH(TcpInErrs); +discard: + kfree_skb(skb); return 0; } @@ -2328,6 +2699,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newsk->dport = req->rmt_port; sock_lock_init(newsk); + bh_lock_sock(newsk); atomic_set(&newsk->rmem_alloc, 0); skb_queue_head_init(&newsk->receive_queue); @@ -2351,22 +2723,27 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->rcv_nxt = req->rcv_isn + 1; newtp->snd_nxt = req->snt_isn + 1; newtp->snd_una = req->snt_isn + 1; - newtp->srtt = 0; - newtp->ato = 0; + newtp->snd_sml = req->snt_isn + 1; + + tcp_delack_init(newtp); + if (skb->len >= 536) + newtp->ack.last_seg_size = skb->len; + + tcp_prequeue_init(newtp); + newtp->snd_wl1 = req->rcv_isn; newtp->snd_wl2 = req->snt_isn; - /* RFC1323: The window in SYN & SYN/ACK segments - * is never scaled. - */ - newtp->snd_wnd = ntohs(skb->h.th->window); - - newtp->max_window = newtp->snd_wnd; - newtp->pending = 0; newtp->retransmits = 0; - newtp->last_ack_sent = req->rcv_isn + 1; newtp->backoff = 0; + newtp->srtt = 0; newtp->mdev = TCP_TIMEOUT_INIT; + newtp->rto = TCP_TIMEOUT_INIT; + + newtp->packets_out = 0; + newtp->fackets_out = 0; + newtp->retrans_out = 0; + newtp->snd_ssthresh = 0x7fffffff; /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control @@ -2374,22 +2751,11 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, * efficiently to them. -DaveM */ newtp->snd_cwnd = 2; - - newtp->rto = TCP_TIMEOUT_INIT; - newtp->packets_out = 0; - newtp->fackets_out = 0; - newtp->retrans_out = 0; - newtp->high_seq = 0; - newtp->snd_ssthresh = 0x7fffffff; newtp->snd_cwnd_cnt = 0; + newtp->high_seq = 0; + newtp->dup_acks = 0; - newtp->delayed_acks = 0; - init_timer(&newtp->retransmit_timer); - newtp->retransmit_timer.function = &tcp_retransmit_timer; - newtp->retransmit_timer.data = (unsigned long) newsk; - init_timer(&newtp->delack_timer); - newtp->delack_timer.function = &tcp_delack_timer; - newtp->delack_timer.data = (unsigned long) newsk; + tcp_init_xmit_timers(newsk); skb_queue_head_init(&newtp->out_of_order_queue); newtp->send_head = newtp->retrans_head = NULL; newtp->rcv_wup = req->rcv_isn + 1; @@ -2397,31 +2763,25 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->copied_seq = req->rcv_isn + 1; newtp->saw_tstamp = 0; + newtp->last_ack_sent = req->rcv_isn + 1; - init_timer(&newtp->probe_timer); - newtp->probe_timer.function = &tcp_probe_timer; - newtp->probe_timer.data = (unsigned long) newsk; newtp->probes_out = 0; newtp->syn_seq = req->rcv_isn; newtp->fin_seq = req->rcv_isn; newtp->urg_data = 0; - tcp_synq_init(newtp); - newtp->syn_backlog = 0; - if (skb->len >= 536) - newtp->last_seg_size = skb->len; + newtp->listen_opt = NULL; + newtp->accept_queue = NULL; + /* Deinitialize syn_wait_lock to trap illegal accesses. */ + memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock)); /* Back to base struct sock members. 
*/ newsk->err = 0; - newsk->ack_backlog = 0; - newsk->max_ack_backlog = SOMAXCONN; newsk->priority = 0; atomic_set(&newsk->refcnt, 1); +#ifdef INET_REFCNT_DEBUG atomic_inc(&inet_sock_nr); +#endif - spin_lock_init(&sk->timer_lock); - init_timer(&newsk->timer); - newsk->timer.function = &tcp_keepalive_timer; - newsk->timer.data = (unsigned long) newsk; if (newsk->keepopen) tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); newsk->socket = NULL; @@ -2440,6 +2800,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->snd_wscale = newtp->rcv_wscale = 0; newtp->window_clamp = min(newtp->window_clamp,65535); } + newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->snd_wscale; + newtp->max_window = newtp->snd_wnd; + if (newtp->tstamp_ok) { newtp->ts_recent = req->ts_recent; newtp->ts_recent_stamp = xtime.tv_sec; @@ -2453,16 +2816,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, return newsk; } -static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) -{ - if (seq == s_win) - return 1; - if (after(end_seq, s_win) && before(seq, e_win)) - return 1; - return (seq == e_win && seq == end_seq); -} - - /* * Process an incoming packet for SYN_RECV sockets represented * as an open_request. @@ -2470,30 +2823,28 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, struct open_request *req, - struct open_request *prev) + struct open_request **prev) { struct tcphdr *th = skb->h.th; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); int paws_reject = 0; struct tcp_opt ttp; - - /* If socket has already been created, process - packet in its context. - - We fall here only due to race, when packets were enqueued - to backlog of listening socket. - */ - if (req->sk) - return req->sk; + struct sock *child; ttp.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { - tcp_parse_options(NULL, th, &ttp, 0); - paws_reject = ttp.saw_tstamp && - (s32)(ttp.rcv_tsval - req->ts_recent) < 0; + if (ttp.saw_tstamp) { + ttp.ts_recent = req->ts_recent; + /* We do not store true stamp, but it is not required, + * it can be estimated (approximately) + * from another data. + */ + ttp.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); + paws_reject = tcp_paws_check(&ttp, th->rst); + } } /* Check for pure retransmited SYN. */ @@ -2517,7 +2868,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, * Enforce "SYN-ACK" according to figure 8, figure 6 * of RFC793, fixed by RFC1122. */ - req->class->rtx_syn_ack(sk, req); + req->class->rtx_syn_ack(sk, req, NULL); return NULL; } @@ -2544,6 +2895,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST)) req->class->send_ack(skb, req); + if (paws_reject) + NET_INC_STATS_BH(PAWSEstabRejected); return NULL; } @@ -2572,35 +2925,78 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, /* Invalid ACK: reset will be sent by listening socket */ if (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1) return sk; - - /* OK, ACK is valid, create big socket and - feed this segment to it. It will repeat all - the tests. THIS SEGMENT MUST MOVE SOCKET TO - ESTABLISHED STATE. If it will be dropped after - socket is created, wait for troubles. 
+ /* Also, it would be not so bad idea to check rcv_tsecr, which + * is essentially ACK extension and too early or too late values + * should cause reset in unsynchronized states. */ - sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); - if (sk == NULL) + + /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ + if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) { + req->acked = 1; return NULL; + } - tcp_dec_slow_timer(TCP_SLT_SYNACK); - req->sk = sk; - return sk; + /* OK, ACK is valid, create big socket and + * feed this segment to it. It will repeat all + * the tests. THIS SEGMENT MUST MOVE SOCKET TO + * ESTABLISHED STATE. If it will be dropped after + * socket is created, wait for troubles. + */ + child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); + if (child == NULL) + goto listen_overflow; -embryonic_reset: tcp_synq_unlink(tp, req, prev); - tp->syn_backlog--; - tcp_dec_slow_timer(TCP_SLT_SYNACK); + tcp_synq_removed(sk, req); + + tcp_acceptq_queue(sk, req, child); + return child; +listen_overflow: + if (!sysctl_tcp_abort_on_overflow) { + req->acked = 1; + return NULL; + } + +embryonic_reset: NET_INC_STATS_BH(EmbryonicRsts); if (!(flg & TCP_FLAG_RST)) req->class->send_reset(skb); - req->class->destructor(req); - tcp_openreq_free(req); + tcp_synq_drop(sk, req, prev); return NULL; } +/* + * Queue segment on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket. + */ + +int tcp_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb) +{ + int ret = 0; + int state = child->state; + + if (child->lock.users == 0) { + ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len); + + /* Wakeup parent, send SIGIO */ + if (state == TCP_SYN_RECV && child->state != state) + parent->data_ready(parent, 0); + } else { + /* Alas, it is possible again, because we do lookup + * in main socket hash table and lock on listening + * socket does not protect us more. + */ + sk_add_backlog(child, skb); + } + + bh_unlock_sock(child); + return ret; +} + static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { @@ -2608,25 +3004,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_parse_options(sk, th, tp, 0); -#ifdef CONFIG_TCP_TW_RECYCLE - if (tp->ts_recent_stamp && tp->saw_tstamp && !th->rst && - (s32)(tp->rcv_tsval - tp->ts_recent) < 0 && - xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS) { - /* Old duplicate segment. We remember last - ts_recent from this host in timewait bucket. - - Actually, we could implement per host cache - to truncate timewait state after RTO. Paranoidal arguments - of rfc1337 are not enough to close this nice possibility. - */ - if (net_ratelimit()) - printk(KERN_DEBUG "TCP: tw recycle, PAWS worked. Good.\n"); - if (th->ack) - return 1; - goto discard; - } -#endif - if (th->ack) { /* rfc793: * "If the state is SYN-SENT then @@ -2646,10 +3023,36 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * We do not send data with SYN, so that RFC-correct * test reduces to: */ - if (sk->zapped || - TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) + if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) return 1; + /* Check not from any RFC, but it is evident consequence + * of combining PAWS and usual SYN-SENT logic: ACK _is_ + * checked in SYN-SENT unlike another states, hence + * echoed tstamp must be checked too. 
+ */ + if (tp->saw_tstamp) { + if (tp->rcv_tsecr == 0) { + /* Workaround for bug in linux-2.1 and early + * 2.2 kernels. Let's pretend that we did not + * see such timestamp to avoid bogus rtt value, + * calculated by tcp_ack(). + */ + tp->saw_tstamp = 0; + + /* But do not forget to store peer's timestamp! */ + if (th->syn) { + tp->ts_recent = tp->rcv_tsval; + tp->ts_recent_stamp = xtime.tv_sec; + } + } else if ((__s32)(tp->rcv_tsecr - tcp_time_stamp) > 0 || + (__s32)(tp->rcv_tsecr - tp->syn_stamp) < 0) { + NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "TCP: synsent reject.\n")); + NET_INC_STATS_BH(PAWSActiveRejected); + return 1; + } + } + /* Now ACK is acceptable. * * "If the RST bit is set @@ -2689,18 +3092,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * because tcp_ack check is too weak for SYN-SENT) * causes moving socket to invalid semi-SYN-SENT, * semi-ESTABLISHED state and connection hangs. - * - * There exist buggy stacks, which really send - * such ACKs: f.e. 202.226.91.94 (okigate.oki.co.jp) - * Actually, if this host did not try to get something - * from ftp.inr.ac.ru I'd never find this bug 8) - * * --ANK (990514) * - * I was wrong, I apologize. Bare ACK is valid. + * Bare ACK is valid, however. * Actually, RFC793 requires to send such ACK * in reply to any out of window packet. - * It is wrong, but Linux also does it sometimes. + * It is wrong, but Linux also send such + * useless ACKs sometimes. * --ANK (990724) */ @@ -2717,7 +3115,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. */ - tp->snd_wnd = htons(th->window); + tp->snd_wnd = ntohs(th->window); tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq; tp->fin_seq = TCP_SKB_CB(skb)->seq; @@ -2742,26 +3140,35 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_initialize_rcv_mss(sk); tcp_init_metrics(sk); + if (sk->keepopen) + tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); + + tp->copied_seq = tp->rcv_nxt; + __tcp_fast_path_on(tp, tp->snd_wnd); + + if(!sk->dead) { + sk->state_change(sk); + sock_wake_async(sk->socket, 0, POLL_OUT); + } + if (tp->write_pending) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. * - * How to make this correctly? + * It may be deleted, but with this feature tcpdumps + * look so _wonderfully_ clever, that I was not able + * to stand against the temptation 8) --ANK */ - tp->delayed_acks++; - if (tp->ato == 0) - tp->ato = tp->rto; - tcp_send_delayed_ack(sk, tp->rto); + tp->ack.pending = 1; + tp->ack.lrcvtime = tcp_time_stamp; + tcp_enter_quickack_mode(tp); + tp->ack.pingpong = 1; + tp->ack.ato = TCP_ATO_MIN; + tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MIN); + goto discard; } else { tcp_send_ack(sk); } - - tp->copied_seq = tp->rcv_nxt; - - if(!sk->dead) { - wake_up_interruptible(sk->sleep); - sock_wake_async(sk->socket, 0, POLL_OUT); - } return -1; } @@ -2777,6 +3184,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, goto discard; } + /* PAWS check. */ + if (tp->ts_recent_stamp && tp->saw_tstamp && tcp_paws_check(tp, 0)) + goto discard; + if (th->syn) { /* We see SYN without ACK. It is attempt of * simultaneous connect with crossed SYNs. @@ -2800,8 +3211,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. 
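The SYN-SENT path above now sanity-checks the echoed timestamp: a zero echo is treated as the known linux-2.1/early-2.2 bug, while an echo newer than our clock or older than the stamp carried in our SYN is rejected. A minimal restatement of that acceptance test follows; the *_example parameters stand in for the kernel's tcp_time_stamp and tp->syn_stamp.

/* Non-zero means the echoed timestamp is acceptable in SYN-SENT. */
static int tsecr_acceptable_example(unsigned int rcv_tsecr,
				    unsigned int syn_stamp_example,
				    unsigned int tcp_time_stamp_example)
{
	if (rcv_tsecr == 0)
		return 1;	/* buggy peers echo 0: treat as "no usable echo" */

	/* Signed differences cope with 32-bit wrap-around. */
	if ((int)(rcv_tsecr - tcp_time_stamp_example) > 0)
		return 0;	/* echo lies in the future */
	if ((int)(rcv_tsecr - syn_stamp_example) < 0)
		return 0;	/* echo predates the timestamp sent in our SYN */
	return 1;
}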
*/ - tp->snd_wnd = htons(th->window); + tp->snd_wnd = ntohs(th->window); tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tp->max_window = tp->snd_wnd; tcp_sync_mss(sk, tp->pmtu_cookie); tcp_initialize_rcv_mss(sk); @@ -2960,6 +3372,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, #endif ) { if (!th->rst) { + NET_INC_STATS_BH(DelayedACKLost); + tcp_enter_quickack_mode(tp); tcp_send_ack(sk); } goto discard; @@ -3011,28 +3425,29 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->copied_seq = tp->rcv_nxt; /* Note, that this wakeup is only for marginal - crossed SYN case. Passively open sockets - are not waked up, because sk->sleep == NULL - and sk->socket == NULL. + * crossed SYN case. Passively open sockets + * are not waked up, because sk->sleep == NULL + * and sk->socket == NULL. */ - if (!sk->dead && sk->sleep) { - wake_up_interruptible(sk->sleep); + if (!sk->dead) { + sk->state_change(sk); sock_wake_async(sk->socket,0,POLL_OUT); } tp->snd_una = TCP_SKB_CB(skb)->ack_seq; - tp->snd_wnd = htons(th->window) << tp->snd_wscale; + tp->snd_wnd = ntohs(th->window) << tp->snd_wscale; tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq; /* tcp_ack considers this ACK as duplicate - * and does not calculate rtt. It is wrong. + * and does not calculate rtt. * Fix it at least with timestamps. */ if (tp->saw_tstamp && !tp->srtt) tcp_ack_saw_tstamp(sk, tp, 0, 0, FLAG_SYN_ACKED); tcp_init_metrics(sk); + tcp_fast_path_on(tp); } else { SOCK_DEBUG(sk, "bad ack\n"); return 1; @@ -3041,26 +3456,50 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, case TCP_FIN_WAIT1: if (tp->snd_una == tp->write_seq) { - sk->shutdown |= SEND_SHUTDOWN; tcp_set_state(sk, TCP_FIN_WAIT2); - if (!sk->dead) - sk->state_change(sk); - else - tcp_reset_keepalive_timer(sk, sysctl_tcp_fin_timeout); + sk->shutdown |= SEND_SHUTDOWN; dst_confirm(sk->dst_cache); + + if (!sk->dead) { + /* Wake up lingering close() */ + sk->state_change(sk); + } else { + int tmo; + + if (tp->linger2 < 0 || + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { + tcp_done(sk); + return 1; + } + + tmo = tcp_fin_time(tp); + if (tmo > TCP_TIMEWAIT_LEN) { + tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + } else if (th->fin || sk->lock.users) { + /* Bad case. We could lose such FIN otherwise. + * It is not a big problem, but it looks confusing + * and not so rare event. We still can lose it now, + * if it spins in bh_lock_sock(), but it is really + * marginal case. + */ + tcp_reset_keepalive_timer(sk, tmo); + } else { + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto discard; + } + } } break; - case TCP_CLOSING: + case TCP_CLOSING: if (tp->snd_una == tp->write_seq) { - tcp_time_wait(sk); + tcp_time_wait(sk, TCP_TIME_WAIT, 0); goto discard; } break; case TCP_LAST_ACK: if (tp->snd_una == tp->write_seq) { - tcp_set_state(sk,TCP_CLOSE); tcp_update_metrics(sk); tcp_done(sk); goto discard; @@ -3080,27 +3519,22 @@ step6: case TCP_CLOSING: if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq)) break; - case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: /* RFC 793 says to queue data in these states, * RFC 1122 says we MUST send a reset. * BSD 4.4 also does reset. 
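For an orphaned socket reaching FIN_WAIT2 above, the patch chooses between keeping the full socket alive on a timer, handing it to the lightweight FIN_WAIT2 time-wait machinery, or dropping it outright when linger2 is negative (it also drops immediately if data arrived past rcv_nxt, which the condensed sketch below leaves out). The constants and helper name here are assumed stand-ins, not the kernel's.

#define TCP_TIMEWAIT_LEN_EXAMPLE (60 * 100)	/* 60 s in ticks at an assumed HZ=100 */

enum fin_wait2_action { DROP_NOW, ARM_TIMER, ENTER_TIMEWAIT };

static enum fin_wait2_action
fin_wait2_decide_example(int linger2, int tmo, int fin_seen, int lock_users)
{
	if (linger2 < 0)
		return DROP_NOW;		/* application opted out of lingering  */
	if (tmo > TCP_TIMEWAIT_LEN_EXAMPLE)
		return ARM_TIMER;		/* sleep off the excess, then re-check */
	if (fin_seen || lock_users)
		return ARM_TIMER;		/* don't risk losing a FIN in flight   */
	return ENTER_TIMEWAIT;			/* cheap FIN_WAIT2 time-wait bucket    */
}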
 		 */
-		if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) {
+		if (sk->shutdown & RCV_SHUTDOWN) {
 			if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
 				tcp_reset(sk);
 				return 1;
 			}
 		}
-
+		/* Fall through */
 	case TCP_ESTABLISHED:
-		queued = tcp_data(skb, sk, len);
-
-		/* This must be after tcp_data() does the skb_pull() to
-		 * remove the header size from skb->len.
-		 */
-		tcp_measure_rcv_mss(sk, skb);
+		tcp_data(skb, sk, len);
+		queued = 1;
 		break;
 	}
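The half-close and fall-through handling above leans on the stack's modular sequence-number comparisons (before()/after()). For readers following the arithmetic, a self-contained restatement with hypothetical *_example names:

/* True if seq1 is strictly earlier than seq2, modulo 2^32. */
static inline int before_example(unsigned int seq1, unsigned int seq2)
{
	return (int)(seq1 - seq2) < 0;
}

static inline int after_example(unsigned int seq1, unsigned int seq2)
{
	return before_example(seq2, seq1);
}

/* "New data arrived beyond what will ever be read" is then
 *	after_example(end_seq - fin_flag, rcv_nxt)
 * which is exactly the condition that triggers the reset once
 * RCV_SHUTDOWN is set in the hunk above.
 */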