1 files changed, 107 insertions, 156 deletions
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index e96089fab..365d3dac2 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -34,8 +34,8 @@ struct timer_list	tcp_slow_timer = {
 
 
 struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = {
-	{0, TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer},		/* SYNACK	*/
-	{0, TCP_KEEPALIVE_PERIOD, 0, tcp_keepalive}		/* KEEPALIVE	*/
+	{ATOMIC_INIT(0), TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer},/* SYNACK	*/
+	{ATOMIC_INIT(0), TCP_KEEPALIVE_PERIOD, 0, tcp_keepalive}   /* KEEPALIVE	*/
 };
 
 /*
@@ -67,16 +67,14 @@ void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
 {
 	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 
-	if((long)when <= 0)
-	{		
-		printk("xmit_timer <= 0 - timer:%d when:%lx\n", what, when);
+	if((long)when <= 0) {
+		printk(KERN_DEBUG "xmit_timer <= 0 - timer:%d when:%lx\n", what, when);
 		when=HZ/50;
 	}
 
 	switch (what) {
 	case TIME_RETRANS:
-		/*
-		 * When seting the transmit timer the probe timer 
+		/* When seting the transmit timer the probe timer 
 		 * should not be set.
 		 * The delayed ack timer can be set if we are changing the
 		 * retransmit timer when removing acked frames.
@@ -100,12 +98,12 @@ void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
 		break;	
 
 	case TIME_WRITE:
-		printk("bug: tcp_reset_xmit_timer TIME_WRITE\n");
+		printk(KERN_DEBUG "bug: tcp_reset_xmit_timer TIME_WRITE\n");
 		break;
 
 	default:
-		printk("bug: unknown timer value\n");
-	}
+		printk(KERN_DEBUG "bug: unknown timer value\n");
+	};
 }
 
 void tcp_clear_xmit_timer(struct sock *sk, int what)
@@ -123,8 +121,8 @@ void tcp_clear_xmit_timer(struct sock *sk, int what)
 		del_timer(&tp->probe_timer);
 		break;	
 	default:
-		printk("bug: unknown timer value\n");
-	}
+		printk(KERN_DEBUG "bug: unknown timer value\n");
+	};
 }
 
 int tcp_timer_is_set(struct sock *sk, int what)
@@ -142,8 +140,8 @@ int tcp_timer_is_set(struct sock *sk, int what)
 		return tp->probe_timer.next != NULL;
 		break;	
 	default:
-		printk("bug: unknown timer value\n");
-	}
+		printk(KERN_DEBUG "bug: unknown timer value\n");
+	};
 	return 0;
 }
 
@@ -162,25 +160,25 @@ void tcp_clear_xmit_timers(struct sock *sk)
 
 static int tcp_write_timeout(struct sock *sk)
 {
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
 	/*
 	 *	Look for a 'soft' timeout.
 	 */
-	if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7))
-		|| (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1)) 
-	{
-		/*
-		 *	Attempt to recover if arp has changed (unlikely!) or
+	if ((sk->state == TCP_ESTABLISHED &&
+
+	     /* Eric, what the heck is this doing?!?! */
+	     tp->retransmits && !(tp->retransmits & 7)) ||
+
+	    (sk->state != TCP_ESTABLISHED && tp->retransmits > TCP_RETR1)) {
+		/*	Attempt to recover if arp has changed (unlikely!) or
 		 *	a route has shifted (not supported prior to 1.3).
 		 */
-		ip_rt_advice(&sk->ip_route_cache, 0);
+		ip_rt_advice((struct rtable**)&sk->dst_cache, 0);
 	}
 	
-	/*
-	 *	Have we tried to SYN too many times (repent repent 8))
-	 */
-	 
-	if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT)
-	{
+	/* Have we tried to SYN too many times (repent repent 8)) */
+	if(tp->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT) {
 		if(sk->err_soft)
 			sk->err=sk->err_soft;
 		else
@@ -196,11 +194,9 @@ static int tcp_write_timeout(struct sock *sk)
 		/* Don't FIN, we got nothing back */
 		return 0;
 	}
-	/*
-	 *	Has it gone just too far ?
-	 */
-	if (sk->retransmits > TCP_RETR2) 
-	{
+
+	/* Has it gone just too far? */
+	if (tp->retransmits > TCP_RETR2) {
 		if(sk->err_soft)
 			sk->err = sk->err_soft;
 		else
@@ -209,19 +205,12 @@ static int tcp_write_timeout(struct sock *sk)
 
 		tcp_clear_xmit_timers(sk);
 
-		/*
-		 *	Time wait the socket 
-		 */
-		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING ) 
-		{
+		/* Time wait the socket. */
+		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING) {
 			tcp_set_state(sk,TCP_TIME_WAIT);
 			tcp_reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
-		}
-		else
-		{
-			/*
-			 *	Clean up time.
-			 */
+		} else {
+			/* Clean up time. */
 			tcp_set_state(sk, TCP_CLOSE);
 			return 0;
 		}
@@ -235,14 +224,10 @@ void tcp_delack_timer(unsigned long data) {
 	struct sock *sk = (struct sock*)data;
 
 	if(sk->zapped)
-	{
 		return;
-	}
 	
-	if (sk->delayed_acks)
-	{
+	if (sk->tp_pinfo.af_tcp.delayed_acks)
 		tcp_read_wakeup(sk); 		
-	}
 }
 
 void tcp_probe_timer(unsigned long data) {
@@ -251,16 +236,10 @@ void tcp_probe_timer(unsigned long data) {
 	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 
 	if(sk->zapped) 
-	{		
 		return;
-	}
 	
-	if (sk->users) 
-	{
-		/* 
-		 * Try again in second 
-		 */
-
+	if (sk->sock_readers) {
+		/* Try again in second. */
 		tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ);
 		return;
 	}
@@ -270,28 +249,20 @@ void tcp_probe_timer(unsigned long data) {
 	 *	FIXME: We ought not to do it, Solaris 2.5 actually has fixing
 	 *	this behaviour in Solaris down as a bug fix. [AC]
 	 */
-	if (tp->probes_out > TCP_RETR2) 
-	{
+	if (tp->probes_out > TCP_RETR2) {
 		if(sk->err_soft)
 			sk->err = sk->err_soft;
 		else
 			sk->err = ETIMEDOUT;
 		sk->error_report(sk);
 
-		/*
-		 *	Time wait the socket 
-		 */
+		/* Time wait the socket. */
 		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 
-		    || sk->state == TCP_CLOSING ) 
-		{
+		    || sk->state == TCP_CLOSING) {
 			tcp_set_state(sk, TCP_TIME_WAIT);
 			tcp_reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
-		}
-		else
-		{
-			/*
-			 *	Clean up time.
-			 */
+		} else {
+			/* Clean up time. */
 			tcp_set_state(sk, TCP_CLOSE);
 		}
 	}
@@ -303,24 +274,20 @@ static __inline__ int tcp_keepopen_proc(struct sock *sk)
 {
 	int res = 0;
 
-	if (sk->state == TCP_ESTABLISHED || sk->state == TCP_CLOSE_WAIT)
-	{
+	if (sk->state == TCP_ESTABLISHED || sk->state == TCP_CLOSE_WAIT ||
+	    sk->state == TCP_FIN_WAIT2) {
 		struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 		__u32 elapsed = jiffies - tp->rcv_tstamp;
 
-		if (elapsed >= TCP_KEEPALIVE_TIME)
-		{
-			if (tp->probes_out > TCP_KEEPALIVE_PROBES)
-			{
+		if (elapsed >= TCP_KEEPALIVE_TIME) {
+			if (tp->probes_out > TCP_KEEPALIVE_PROBES) {
 				if(sk->err_soft)
 					sk->err = sk->err_soft;
 				else
 					sk->err = ETIMEDOUT;
 
 				tcp_set_state(sk, TCP_CLOSE);
-			}
-			else
-			{
+			} else {
 				tp->probes_out++;
 				tp->pending = TIME_KEEPOPEN;
 				tcp_write_wakeup(sk);
@@ -347,28 +314,40 @@ static __inline__ int tcp_keepopen_proc(struct sock *sk)
  */
 #define MAX_KA_PROBES	5
 
+/* Keepopen's are only valid for "established" TCP's, nicely our listener
+ * hash gets rid of most of the useless testing, so we run through a couple
+ * of the established hash chains each clock tick.  -DaveM
+ *
+ * And now, even more magic... TIME_WAIT TCP's cannot have keepalive probes
+ * going off for them, so we only need check the first half of the established
+ * hash table, even less testing under heavy load.
+ *
+ * I _really_ would rather do this by adding a new timer_struct to struct sock,
+ * and this way only those who set the keepalive option will get the overhead.
+ * The idea is you set it for 2 hours when the sock is first connected, when it
+ * does fire off (if at all, most sockets die earlier) you check for the keepalive
+ * option and also if the sock has been idle long enough to start probing.
+ */
 static void tcp_keepalive(unsigned long data)
 {
-	struct sock *sk;
+	static int chain_start = 0;
 	int count = 0;
 	int i;
 	
-	for(i=0; i < SOCK_ARRAY_SIZE; i++)
-	{
-		sk = tcp_prot.sock_array[i];
-		while (sk)
-		{
-			if (sk->keepopen)
-			{
+	for(i = chain_start; i < (chain_start + ((TCP_HTABLE_SIZE/2) >> 2)); i++) {
+		struct sock *sk = tcp_established_hash[i];
+		while(sk) {
+			if(sk->keepopen) {
 				count += tcp_keepopen_proc(sk);
+				if(count == MAX_KA_PROBES)
+					goto out;
 			}
-
-			if (count == MAX_KA_PROBES)
-				return;
-			
-			sk = sk->next;	    
+			sk = sk->next;
 		}
 	}
+out:
+	chain_start = ((chain_start + ((TCP_HTABLE_SIZE/2)>>2)) &
+		       ((TCP_HTABLE_SIZE/2) - 1));
 }
 
 /*
@@ -389,47 +368,35 @@ void tcp_retransmit_timer(unsigned long data)
 	struct sock *sk = (struct sock*)data;
 	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 
-	/*
-	 *	We are reset. We will send no more retransmits.
-	 */
-
-	if(sk->zapped)
-	{
+	/* We are reset. We will send no more retransmits. */
+	if(sk->zapped) {
 		tcp_clear_xmit_timer(sk, TIME_RETRANS);
 		return;
 	}
 
-	/*
-	 * Clear delay ack timer
-	 */
+	lock_sock(sk);
 
+	/* Clear delay ack timer. */
 	tcp_clear_xmit_timer(sk, TIME_DACK);
 
-	/*
-	 *	Retransmission
-	 */
-
+	/* Retransmission. */
 	tp->retrans_head = NULL;
-	
-
-	if (sk->retransmits == 0)
-	{
-		/* 
-		 * remember window where we lost 
+	if (tp->retransmits == 0) {
+		/* remember window where we lost
 		 * "one half of the current window but at least 2 segments"
 		 */
-		
-		sk->ssthresh = max(sk->cong_window >> 1, 2); 
-		sk->cong_count = 0;
-		sk->cong_window = 1;
+		tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2);
+		tp->snd_cwnd_cnt = 0;
+		tp->snd_cwnd = 1;
 	}
 
-	atomic_inc(&sk->retransmits);
+	tp->retransmits++;
 
+	tp->dup_acks = 0;
+	tp->high_seq = tp->snd_nxt;
 	tcp_do_retransmit(sk, 0);
 
-	/*
-	 * Increase the timeout each time we retransmit.  Note that
+	/* Increase the timeout each time we retransmit.  Note that
 	 * we do not increase the rtt estimate.  rto is initialized
 	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
 	 * that doubling rto each time is the least we can get away with.
@@ -444,38 +411,35 @@ void tcp_retransmit_timer(unsigned long data)
 	 * implemented ftp to mars will work nicely. We will have to fix
 	 * the 120 second clamps though!
 	 */
-
-	tp->backoff++;
+	tp->backoff++;	/* FIXME: always same as retransmits? -- erics */
 	tp->rto = min(tp->rto << 1, 120*HZ);
 	tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
 
 	tcp_write_timeout(sk);
+
+	release_sock(sk);
 }
 
 /*
  *	Slow timer for SYN-RECV sockets
  */
 
+/* This now scales very nicely. -DaveM */
 static void tcp_syn_recv_timer(unsigned long data)
 {
 	struct sock *sk;
 	unsigned long now = jiffies;
 	int i;
 
-	for(i=0; i < SOCK_ARRAY_SIZE; i++)
-	{
-		sk = tcp_prot.sock_array[i];
-		while (sk)
-		{
+	for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
+		sk = tcp_listening_hash[i];
+
+		while(sk) {
 			struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 			
-			if (sk->state == TCP_LISTEN && !sk->users &&
-			    tp->syn_wait_queue)
-			{
-				struct open_request *req;
-
-				req = tp->syn_wait_queue;
-
+			/* TCP_LISTEN is implied. */
+			if (!sk->sock_readers && tp->syn_wait_queue) {
+				struct open_request *req = tp->syn_wait_queue;
 				do {
 					struct open_request *conn;
 				  
@@ -483,17 +447,13 @@ static void tcp_syn_recv_timer(unsigned long data)
 					req = req->dl_next;
 
 					if (conn->sk)
-					{
 						continue;
-					}
-					
+
 					if ((long)(now - conn->expires) <= 0)
 						break;
 
 					tcp_synq_unlink(tp, conn);
-					
-					if (conn->retrans >= TCP_RETR1)
-					{
+					if (conn->retrans >= TCP_RETR1) {
 #ifdef TCP_DEBUG
 						printk(KERN_DEBUG "syn_recv: "
 						       "too many retransmits\n");
@@ -501,20 +461,19 @@ static void tcp_syn_recv_timer(unsigned long data)
 						(*conn->class->destructor)(conn);
 						tcp_dec_slow_timer(TCP_SLT_SYNACK);
 						sk->ack_backlog--;
-						kfree(conn);
+						tcp_openreq_free(conn);
 
 						if (!tp->syn_wait_queue)
 							break;
-					}
-					else
-					{
+					} else {
 						__u32 timeo;
-						
+
 						(*conn->class->rtx_syn_ack)(sk, conn);
 
 						conn->retrans++;
 #ifdef TCP_DEBUG
-						printk(KERN_DEBUG "syn_ack rtx %d\n", conn->retrans);
+						printk(KERN_DEBUG "syn_ack rtx %d\n",
+						       conn->retrans);
 #endif
 						timeo = min((TCP_TIMEOUT_INIT 
 							     << conn->retrans),
@@ -522,9 +481,8 @@ static void tcp_syn_recv_timer(unsigned long data)
 						conn->expires = now + timeo;
 						tcp_synq_queue(tp, conn);
 					}
-				} while (req != tp->syn_wait_queue);
+				} while (req);
 			}
-			
 			sk = sk->next;
 		}
 	}
@@ -537,16 +495,13 @@ void tcp_sltimer_handler(unsigned long data)
 	unsigned long now = jiffies;
 	int i;
 
-	for (i=0; i < TCP_SLT_MAX; i++, slt++)
-	{
-		if (slt->count)
-		{
+	for (i=0; i < TCP_SLT_MAX; i++, slt++) {
+		if (atomic_read(&slt->count)) {
 			long trigger;
 
 			trigger = slt->period - ((long)(now - slt->last));
 
-			if (trigger <= 0)
-			{
+			if (trigger <= 0) {
 				(*slt->handler)((unsigned long) slt);
 				slt->last = now;
 				trigger = slt->period;
@@ -555,8 +510,7 @@ void tcp_sltimer_handler(unsigned long data)
 		}
 	}
 
-	if (next != ~0UL)
-	{
+	if (next != ~0UL) {
 		tcp_slow_timer.expires = now + next;
 		add_timer(&tcp_slow_timer);
 	}
@@ -572,13 +526,10 @@ void __tcp_inc_slow_timer(struct tcp_sl_timer *slt)
 		
 	when = now + slt->period;
 	if (del_timer(&tcp_slow_timer))
-	{
 		next = tcp_slow_timer.expires;
-	}
+
 	if (next && ((long)(next - when) < 0))
-	{
 		when = next;
-	}
 		
 	tcp_slow_timer.expires = when;
 	add_timer(&tcp_slow_timer);