author     Ralf Baechle <ralf@linux-mips.org>    2000-02-18 00:24:27 +0000
committer  Ralf Baechle <ralf@linux-mips.org>    2000-02-18 00:24:27 +0000
commit     b9558d5f86c471a125abf1fb3a3882fb053b1f8c (patch)
tree       707b53ec64e740a7da87d5f36485e3cd9b1c794e /net/ipv4/tcp_ipv4.c
parent     b3ac367c7a3e6047abe74817db27e34e759f279f (diff)
Merge with Linux 2.3.41.
Diffstat (limited to 'net/ipv4/tcp_ipv4.c')
-rw-r--r--    net/ipv4/tcp_ipv4.c    951
1 file changed, 456 insertions(+), 495 deletions(-)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 22c35a191..7420e268f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -5,7 +5,7 @@
  *
  *		Implementation of the Transmission Control Protocol(TCP).
  *
- * Version:	$Id: tcp_ipv4.c,v 1.194 2000/01/09 02:19:41 davem Exp $
+ * Version:	$Id: tcp_ipv4.c,v 1.197 2000/01/21 06:37:28 davem Exp $
  *
  *		IPv4 specific functions
  *
@@ -52,7 +52,6 @@
 #include <linux/fcntl.h>
 #include <linux/random.h>
 #include <linux/init.h>
-#include <linux/ipsec.h>
 
 #include <net/icmp.h>
 #include <net/tcp.h>
@@ -61,15 +60,9 @@
 #include <linux/inet.h>
 #include <linux/stddef.h>
 
+#include <linux/ipsec.h>
+
-extern int sysctl_tcp_timestamps;
-extern int sysctl_tcp_window_scaling;
-extern int sysctl_tcp_sack;
-extern int sysctl_tcp_syncookies;
-extern int sysctl_tcp_tw_recycle;
 extern int sysctl_ip_dynaddr;
-extern __u32 sysctl_wmem_max;
-extern __u32 sysctl_rmem_max;
 
 /* Check TCP sequence numbers in ICMP packets. */
 #define ICMP_MIN_LENGTH 8
@@ -319,89 +312,13 @@ void tcp_put_port(struct sock *sk)
 	local_bh_enable();
 }
 
-#ifdef CONFIG_TCP_TW_RECYCLE
-/*
-   Very stupid pseudo-"algoritm". If the approach will be successful
-   (and it will!), we have to make it more reasonable.
-   Now it eats lots of CPU, when we are tough on ports.
-
-   Apparently, it should be hash table indexed by daddr/dport.
-
-   How does it work? We allow to truncate time-wait state, if:
-   1. PAWS works on it.
-   2. timewait bucket did not receive data for timeout:
-      - initially timeout := 2*RTO, so that if our ACK to first
-        transmitted peer's FIN is lost, we will see first retransmit.
-      - if we receive anything, the timout is increased exponentially
-        to follow normal TCP backoff pattern.
-      It is important that minimal RTO (HZ/5) > minimal timestamp
-      step (1ms).
-   3. When creating new socket, we inherit sequence number
-      and ts_recent of time-wait bucket, increasinf them a bit.
-
-   These two conditions guarantee, that data will not be corrupted
-   both by retransmitted and by delayed segments. They do not guarantee
-   that peer will leave LAST-ACK/CLOSING state gracefully, it will be
-   reset sometimes, namely, when more than two our ACKs to its FINs are lost.
-   This reset is harmless and even good.
+/* This lock without TASK_EXCLUSIVE is good on UP and it can be very bad on SMP.
+ * Look, when several writers sleep and reader wakes them up, all but one
+ * immediately hit write lock and grab all the cpus. Exclusive sleep solves
+ * this, _but_ remember, it adds useless work on UP machines (wake up each
+ * exclusive lock release). It should be ifdefed really.
+ */
-int tcp_v4_tw_recycle(struct sock *sk, u32 daddr, u16 dport)
-{
-	static int tw_rover;
-
-	struct tcp_tw_bucket *tw;
-	struct tcp_bind_hashbucket *head;
-	struct tcp_bind_bucket *tb;
-
-	int low = sysctl_local_port_range[0];
-	int high = sysctl_local_port_range[1];
-	unsigned long now = jiffies;
-	int i, rover;
-
-	rover = tw_rover;
-
-	local_bh_disable();
-	for (i=0; i<tcp_bhash_size; i++, rover++) {
-		rover &= (tcp_bhash_size-1);
-		head = &tcp_bhash[rover];
-
-		spin_lock(&head->lock);
-		for (tb = head->chain; tb; tb = tb->next) {
-			tw = (struct tcp_tw_bucket*)tb->owners;
-
-			if (tw->state != TCP_TIME_WAIT ||
-			    tw->dport != dport ||
-			    tw->daddr != daddr ||
-			    tw->rcv_saddr != sk->rcv_saddr ||
-			    tb->port < low ||
-			    tb->port >= high ||
-			    !TCP_INET_FAMILY(tw->family) ||
-			    tw->ts_recent_stamp == 0 ||
-			    (long)(now - tw->ttd) <= 0)
-				continue;
-			tw_rover = rover;
-			goto hit;
-		}
-		spin_unlock(&head->lock);
-	}
-	local_bh_enable();
-	tw_rover = rover;
-	return -EAGAIN;
-
-hit:
-	sk->num = tw->num;
-	if ((sk->bind_next = tb->owners) != NULL)
-		tb->owners->bind_pprev = &sk->bind_next;
-	tb->owners = sk;
-	sk->bind_pprev = &tb->owners;
-	sk->prev = (struct sock *) tb;
-	spin_unlock_bh(&head->lock);
-	return 0;
-}
-#endif
-
-
 void tcp_listen_wlock(void)
 {
 	write_lock(&tcp_lhash_lock);
@@ -409,9 +326,9 @@ void tcp_listen_wlock(void)
 	if (atomic_read(&tcp_lhash_users)) {
 		DECLARE_WAITQUEUE(wait, current);
 
-		add_wait_queue(&tcp_lhash_wait, &wait);
+		add_wait_queue_exclusive(&tcp_lhash_wait, &wait);
 		for (;;) {
-			set_current_state(TASK_UNINTERRUPTIBLE);
+			set_current_state(TASK_UNINTERRUPTIBLE|TASK_EXCLUSIVE);
 			if (atomic_read(&tcp_lhash_users) == 0)
 				break;
 			write_unlock_bh(&tcp_lhash_lock);
@@ -445,6 +362,8 @@ static __inline__ void __tcp_v4_hash(struct sock *sk)
 	sk->pprev = skp;
 	sock_prot_inc_use(sk->prot);
 	write_unlock(lock);
+	if (sk->state == TCP_LISTEN)
+		wake_up(&tcp_lhash_wait);
 }
 
 static void tcp_v4_hash(struct sock *sk)
@@ -478,6 +397,8 @@ void tcp_unhash(struct sock *sk)
 		sock_prot_dec_use(sk->prot);
 	}
 	write_unlock_bh(lock);
+	if (sk->state == TCP_LISTEN)
+		wake_up(&tcp_lhash_wait);
 }
 
 /* Don't inline this cruft. Here are some nice properties to
@@ -546,8 +467,9 @@ sherry_cache:
  *
  * Local BH must be disabled here.
  */
-static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
-					   u32 daddr, u16 hnum, int dif)
+
+static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
+						       u32 daddr, u16 hnum, int dif)
 {
 	struct tcp_ehash_bucket *head;
 	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
@@ -572,7 +494,7 @@ static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
 		goto hit;
 	read_unlock(&head->lock);
 
-	return tcp_v4_lookup_listener(daddr, hnum, dif);
+	return NULL;
 
 hit:
 	sock_hold(sk);
@@ -580,6 +502,19 @@ hit:
 	return sk;
 }
 
+static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
+					   u32 daddr, u16 hnum, int dif)
+{
+	struct sock *sk;
+
+	sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif);
+
+	if (sk)
+		return sk;
+
+	return tcp_v4_lookup_listener(daddr, hnum, dif);
+}
+
 __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
 {
 	struct sock *sk;
@@ -609,21 +544,16 @@ static int tcp_v4_check_established(struct sock *sk)
 	int hash = tcp_hashfn(daddr, sk->num, saddr, sk->dport);
 	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
 	struct sock *sk2, **skp;
-#ifdef CONFIG_TCP_TW_RECYCLE
 	struct tcp_tw_bucket *tw;
-#endif
 
 	write_lock_bh(&head->lock);
 
 	/* Check TIME-WAIT sockets first. */
 	for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL;
 	    skp = &sk2->next) {
-#ifdef CONFIG_TCP_TW_RECYCLE
 		tw = (struct tcp_tw_bucket*)sk2;
-#endif
 
 		if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
-#ifdef CONFIG_TCP_TW_RECYCLE
 			struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 
 			/* With PAWS, it is safe from the viewpoint
@@ -631,12 +561,17 @@ static int tcp_v4_check_established(struct sock *sk)
 			   is safe provided sequence spaces do not overlap
 			   i.e. at data rates <= 80Mbit/sec.
 
-			   Actually, the idea is close to VJ's (rfc1332)
-			   one, only timestamp cache is held not per host,
+			   Actually, the idea is close to VJ's one,
+			   only timestamp cache is held not per host,
 			   but per port pair and TW bucket is used
 			   as state holder.
+
+			   If TW bucket has been already destroyed we
+			   fall back to VJ's scheme and use initial
+			   timestamp retrieved from peer table.
 			 */
-			if (sysctl_tcp_tw_recycle && tw->ts_recent_stamp) {
+			if (tw->substate == TCP_TIME_WAIT &&
+			    sysctl_tcp_tw_recycle && tw->ts_recent_stamp) {
 				if ((tp->write_seq = tw->snd_nxt + 2) == 0)
 					tp->write_seq = 1;
 				tp->ts_recent = tw->ts_recent;
@@ -645,13 +580,10 @@ static int tcp_v4_check_established(struct sock *sk)
 				skp = &head->chain;
 				goto unique;
 			} else
-#endif
-			goto not_unique;
+				goto not_unique;
 		}
 	}
-#ifdef CONFIG_TCP_TW_RECYCLE
 	tw = NULL;
-#endif
 
 	/* And established part... */
 	for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) {
@@ -659,9 +591,7 @@ static int tcp_v4_check_established(struct sock *sk)
 			goto not_unique;
 	}
 
-#ifdef CONFIG_TCP_TW_RECYCLE
 unique:
-#endif
 	BUG_TRAP(sk->pprev==NULL);
 	if ((sk->next = *skp) != NULL)
 		(*skp)->pprev = &sk->next;
@@ -671,17 +601,17 @@ unique:
 	sock_prot_inc_use(sk->prot);
 	write_unlock_bh(&head->lock);
 
-#ifdef CONFIG_TCP_TW_RECYCLE
 	if (tw) {
 		/* Silly. Should hash-dance instead... */
 		local_bh_disable();
 		tcp_tw_deschedule(tw);
 		tcp_timewait_kill(tw);
+		NET_INC_STATS_BH(TimeWaitRecycled);
 		local_bh_enable();
 
 		tcp_tw_put(tw);
 	}
-#endif
+
 	return 0;
 
 not_unique:
@@ -727,9 +657,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	int tmp;
 	int err;
 
-	if (sk->state != TCP_CLOSE)
-		return(-EISCONN);
-
 	if (addr_len < sizeof(struct sockaddr_in))
 		return(-EINVAL);
 
@@ -759,8 +686,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	daddr = rt->rt_dst;
 
 	err = -ENOBUFS;
-	buff = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
-			    0, GFP_KERNEL);
+	buff = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 0, GFP_KERNEL);
 
 	if (buff == NULL)
 		goto failure;
@@ -769,27 +695,28 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	sk->saddr = rt->rt_src;
 	sk->rcv_saddr = sk->saddr;
 
-	if (!sk->num) {
-		if (sk->prot->get_port(sk, 0)
-#ifdef CONFIG_TCP_TW_RECYCLE
-		    && (!sysctl_tcp_tw_recycle ||
-			tcp_v4_tw_recycle(sk, daddr, usin->sin_port))
-#endif
-		    ) {
-			kfree_skb(buff);
-			err = -EAGAIN;
-			goto failure;
-		}
-		sk->sport = htons(sk->num);
-	}
-#ifdef CONFIG_TCP_TW_RECYCLE
-	else if (tp->ts_recent_stamp && sk->daddr != daddr) {
+	if (tp->ts_recent_stamp && sk->daddr != daddr) {
 		/* Reset inherited state */
 		tp->ts_recent = 0;
 		tp->ts_recent_stamp = 0;
 		tp->write_seq = 0;
 	}
-#endif
+
+	if (sysctl_tcp_tw_recycle &&
+	    !tp->ts_recent_stamp &&
+	    rt->rt_dst == daddr) {
+		struct inet_peer *peer = rt_get_peer(rt);
+
+		/* VJ's idea. We save last timestamp seen from
+		 * the destination in peer table, when entering state TIME-WAIT
+		 * and initialize ts_recent from it, when trying new connection.
+		 */
+
+		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
+			tp->ts_recent_stamp = peer->tcp_ts_stamp;
+			tp->ts_recent = peer->tcp_ts;
+		}
+	}
 
 	sk->dport = usin->sin_port;
 	sk->daddr = daddr;
@@ -814,85 +741,62 @@ failure:
 	return err;
 }
 
-static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len)
+static __inline__ int tcp_v4_iif(struct sk_buff *skb)
 {
-	int retval = -EINVAL;
-
-	lock_sock(sk);
-
-	/* Do sanity checking for sendmsg/sendto/send. */
-	if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL))
-		goto out;
-	if (msg->msg_name) {
-		struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
-
-		if (msg->msg_namelen < sizeof(*addr))
-			goto out;
-		if (addr->sin_family && addr->sin_family != AF_INET)
-			goto out;
-		retval = -ENOTCONN;
-		if(sk->state == TCP_CLOSE)
-			goto out;
-		retval = -EISCONN;
-		if (addr->sin_port != sk->dport)
-			goto out;
-		if (addr->sin_addr.s_addr != sk->daddr)
-			goto out;
-	}
-	retval = tcp_do_sendmsg(sk, msg);
-
-out:
-	release_sock(sk);
-	return retval;
+	return ((struct rtable*)skb->dst)->rt_iif;
 }
 
+static __inline__ unsigned tcp_v4_synq_hash(u32 raddr, u16 rport)
+{
+	unsigned h = raddr ^ rport;
+	h ^= h>>16;
+	h ^= h>>8;
+	return h&(TCP_SYNQ_HSIZE-1);
+}
 
-/*
- *	Do a linear search in the socket open_request list.
- *	This should be replaced with a global hash table.
- */
 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
 					      struct iphdr *iph,
 					      struct tcphdr *th,
-					      struct open_request **prevp)
+					      struct open_request ***prevp)
 {
-	struct open_request *req, *prev;
-	__u16 rport = th->source;
-
-	/*	assumption: the socket is not in use.
-	 *	as we checked the user count on tcp_rcv and we're
-	 *	running from a soft interrupt.
-	 */
-	prev = (struct open_request *) (&tp->syn_wait_queue);
-	for (req = prev->dl_next; req; req = req->dl_next) {
-		if (req->af.v4_req.rmt_addr == iph->saddr &&
+	struct tcp_listen_opt *lopt = tp->listen_opt;
+	struct open_request *req, **prev;
+	__u16 rport = th->source;
+	__u32 raddr = iph->saddr;
+
+	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport)];
+	     (req = *prev) != NULL;
+	     prev = &req->dl_next) {
+		if (req->rmt_port == rport &&
+		    req->af.v4_req.rmt_addr == raddr &&
 		    req->af.v4_req.loc_addr == iph->daddr &&
-		    req->rmt_port == rport &&
 		    TCP_INET_FAMILY(req->class->family)) {
-			if (req->sk) {
-				/* Weird case: connection was established
-				   and then killed by RST before user accepted
-				   it. This connection is dead, but we cannot
-				   kill openreq to avoid blocking in accept().
-
-				   accept() will collect this garbage,
-				   but such reqs must be ignored, when talking
-				   to network.
-				 */
-				bh_lock_sock(req->sk);
-				BUG_TRAP(req->sk->lock.users==0);
-				if (req->sk->state == TCP_CLOSE) {
-					bh_unlock_sock(req->sk);
-					prev = req;
-					continue;
-				}
-			}
+			BUG_TRAP(req->sk == NULL);
 			*prevp = prev;
 			return req;
 		}
-		prev = req;
 	}
-	return NULL;
+
+	return NULL;
+}
+
+static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
+{
+	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+	struct tcp_listen_opt *lopt = tp->listen_opt;
+	unsigned h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port);
+
+	req->expires = jiffies + TCP_TIMEOUT_INIT;
+	req->retrans = 0;
+	req->sk = NULL;
+	req->index = h;
+	req->dl_next = lopt->syn_table[h];
+
+	write_lock(&tp->syn_wait_lock);
+	lopt->syn_table[h] = req;
+	write_unlock(&tp->syn_wait_lock);
+
+	tcp_synq_added(sk);
 }
 
 
@@ -984,7 +888,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
 
 	th = (struct tcphdr*)(dp+(iph->ihl<<2));
 
-	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex);
+	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb));
 	if (sk == NULL) {
 		ICMP_INC_STATS_BH(IcmpInErrors);
 		return;
@@ -1001,6 +905,9 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
 	if (sk->lock.users != 0)
 		NET_INC_STATS_BH(LockDroppedIcmps);
 
+	if (sk->state == TCP_CLOSE)
+		goto out;
+
 	tp = &sk->tp_pinfo.af_tcp;
 	seq = ntohl(th->seq);
 	if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
@@ -1010,14 +917,11 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
 
 	switch (type) {
 	case ICMP_SOURCE_QUENCH:
-#ifndef OLD_SOURCE_QUENCH /* This is deprecated */
-		if (sk->lock.users == 0) {
-			tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
-			tp->snd_cwnd = tp->snd_ssthresh;
-			tp->snd_cwnd_cnt = 0;
-			tp->high_seq = tp->snd_nxt;
-		}
-#endif
+		/* This is deprecated, but if someone generated it,
+		 * we have no reasons to ignore it.
+		 */
+		if (sk->lock.users == 0)
+			tcp_enter_cong_avoid(tp);
 		goto out;
 	case ICMP_PARAMETERPROB:
 		err = EPROTO;
@@ -1042,7 +946,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
 	}
 
 	switch (sk->state) {
-		struct open_request *req, *prev;
+		struct open_request *req, **prev;
 	case TCP_LISTEN:
 		if (sk->lock.users != 0)
 			goto out;
@@ -1060,47 +964,25 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
 		if (!req)
 			goto out;
 
-		if (req->sk) {
-			struct sock *nsk = req->sk;
-
-			/*
-			 * Already in ESTABLISHED and a big socket is created,
-			 * set error code there.
-			 * The error will _not_ be reported in the accept(),
-			 * but only with the next operation on the socket after
-			 * accept.
-			 */
-			sock_hold(nsk);
-			bh_unlock_sock(sk);
-			sock_put(sk);
-			sk = nsk;
-
-			BUG_TRAP(sk->lock.users == 0);
-			tp = &sk->tp_pinfo.af_tcp;
-			if (!between(seq, tp->snd_una, tp->snd_nxt)) {
-				NET_INC_STATS(OutOfWindowIcmps);
-				goto out;
-			}
-		} else {
-			if (seq != req->snt_isn) {
-				NET_INC_STATS(OutOfWindowIcmps);
-				goto out;
-			}
+		/* ICMPs are not backlogged, hence we cannot get
+		   an established socket here.
+		 */
+		BUG_TRAP(req->sk == NULL);
 
-			/*
-			 * Still in SYN_RECV, just remove it silently.
-			 * There is no good way to pass the error to the newly
-			 * created socket, and POSIX does not want network
-			 * errors returned from accept().
-			 */
-			tp->syn_backlog--;
-			tcp_synq_unlink(tp, req, prev);
-			tcp_dec_slow_timer(TCP_SLT_SYNACK);
-			req->class->destructor(req);
-			tcp_openreq_free(req);
+		if (seq != req->snt_isn) {
+			NET_INC_STATS_BH(OutOfWindowIcmps);
 			goto out;
 		}
-		break;
+
+		/*
+		 * Still in SYN_RECV, just remove it silently.
+		 * There is no good way to pass the error to the newly
+		 * created socket, and POSIX does not want network
+		 * errors returned from accept().
+		 */
+		tcp_synq_drop(sk, req, prev);
+		goto out;
+
 	case TCP_SYN_SENT:
 	case TCP_SYN_RECV:  /* Cannot happen.
 			       It can f.e. if SYNs crossed.
@@ -1110,10 +992,9 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
 			     */
 		if (sk->lock.users == 0) {
 			TCP_INC_STATS_BH(TcpAttemptFails);
 			sk->err = err;
-			/* Wake people up to see the error (see connect in sock.c) */
+
 			sk->error_report(sk);
 
-			tcp_set_state(sk, TCP_CLOSE);
 			tcp_done(sk);
 		} else {
 			sk->err_soft = err;
@@ -1270,28 +1151,23 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
 
-	tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt, 0, tw->ts_recent);
+	tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt,
+			tw->rcv_wnd>>tw->rcv_wscale, tw->ts_recent);
 
 	tcp_tw_put(tw);
 }
 
 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
 {
-	tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd, req->ts_recent);
+	tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd,
+			req->ts_recent);
 }
 
-/*
- *	Send a SYN-ACK after having received an ACK.
- *	This still operates on a open_request only, not on a big
- *	socket.
- */
-static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
+static struct dst_entry* tcp_v4_route_req(struct sock *sk, struct open_request *req)
 {
 	struct rtable *rt;
 	struct ip_options *opt;
-	struct sk_buff * skb;
 
-	/* First, grab a route. */
 	opt = req->af.v4_req.opt;
 	if(ip_route_output(&rt, ((opt && opt->srr) ?
 				 opt->faddr :
@@ -1300,15 +1176,33 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
 			   RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
			   sk->bound_dev_if)) {
 		IP_INC_STATS_BH(IpOutNoRoutes);
-		return;
+		return NULL;
 	}
-	if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
+	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
 		ip_rt_put(rt);
 		IP_INC_STATS_BH(IpOutNoRoutes);
-		return;
+		return NULL;
 	}
+	return &rt->u.dst;
+}
+
+/*
+ *	Send a SYN-ACK after having received an ACK.
+ *	This still operates on a open_request only, not on a big
+ *	socket.
+ */
+static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
+			      struct dst_entry *dst)
+{
+	int err = -1;
+	struct sk_buff * skb;
 
-	skb = tcp_make_synack(sk, &rt->u.dst, req);
+	/* First, grab a route. */
+	if (dst == NULL &&
	    (dst = tcp_v4_route_req(sk, req)) == NULL)
+		goto out;
+
+	skb = tcp_make_synack(sk, dst, req);
 	if (skb) {
 		struct tcphdr *th = skb->h.th;
 
@@ -1317,10 +1211,15 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
 					     req->af.v4_req.loc_addr,
 					     req->af.v4_req.rmt_addr,
 					     csum_partial((char *)th, skb->len, skb->csum));
 
-		ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
-				      req->af.v4_req.rmt_addr, req->af.v4_req.opt);
+		err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
+					    req->af.v4_req.rmt_addr, req->af.v4_req.opt);
+		if (err == NET_XMIT_CN)
+			err = 0;
 	}
-	ip_rt_put(rt);
+
+out:
+	dst_release(dst);
+	return err;
 }
 
 /*
@@ -1328,7 +1227,7 @@
 */
 static void tcp_v4_or_free(struct open_request *req)
 {
-	if(!req->sk && req->af.v4_req.opt)
+	if (req->af.v4_req.opt)
 		kfree_s(req->af.v4_req.opt, optlength(req->af.v4_req.opt));
 }
 
@@ -1372,8 +1271,14 @@ tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
 * It would be better to replace it with a global counter for all sockets
 * but then some measure against one socket starving all other sockets
 * would be needed.
+ *
+ * It was 128 by default. Experiments with real servers show, that
+ * it is absolutely not enough even at 100conn/sec. 256 cures most
+ * of problems. This value is adjusted to 128 for very small machines
+ * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
+ * Further increasing requires to change hash table size.
 */
-int sysctl_max_syn_backlog = 128;
+int sysctl_max_syn_backlog = 256;
 
 struct or_calltable or_ipv4 = {
	PF_INET,
@@ -1383,9 +1288,6 @@ struct or_calltable or_ipv4 = {
 	tcp_v4_send_reset
 };
 
-#define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */
-#define BACKLOGMAX(sk) sysctl_max_syn_backlog
-
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_opt tp;
@@ -1394,6 +1296,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	__u32 saddr = skb->nh.iph->saddr;
 	__u32 daddr = skb->nh.iph->daddr;
 	__u32 isn = TCP_SKB_CB(skb)->when;
+	struct dst_entry *dst = NULL;
 #ifdef CONFIG_SYN_COOKIES
 	int want_cookie = 0;
 #else
@@ -1405,84 +1308,108 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	    (RTCF_BROADCAST|RTCF_MULTICAST))
 		goto drop;
 
-	/* XXX: Check against a global syn pool counter. */
-	if (BACKLOG(sk) > BACKLOGMAX(sk)) {
+	/* TW buckets are converted to open requests without
+	 * limitations, they conserve resources and peer is
+	 * evidently real one.
+	 */
+	if (tcp_synq_is_full(sk) && !isn) {
 #ifdef CONFIG_SYN_COOKIES
-		if (sysctl_tcp_syncookies && !isn) {
-			syn_flood_warning(skb);
+		if (sysctl_tcp_syncookies) {
 			want_cookie = 1;
 		} else
 #endif
 		goto drop;
-	} else {
-		if (isn == 0)
-			isn = tcp_v4_init_sequence(sk, skb);
-		BACKLOG(sk)++;
 	}
 
-	req = tcp_openreq_alloc();
-	if (req == NULL) {
-		goto dropbacklog;
-	}
+	/* Accept backlog is full. If we have already queued enough
+	 * of warm entries in syn queue, drop request. It is better than
+	 * clogging syn queue with openreqs with exponentially increasing
+	 * timeout.
+	 */
+	if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
+		goto drop;
 
-	req->rcv_wnd = 0;		/* So that tcp_send_synack() knows! */
+	req = tcp_openreq_alloc();
+	if (req == NULL)
+		goto drop;
 
-	req->rcv_isn = TCP_SKB_CB(skb)->seq;
 	tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
-	tp.mss_clamp = 536;
+	tp.mss_clamp = 536;
 	tp.user_mss = sk->tp_pinfo.af_tcp.user_mss;
 
 	tcp_parse_options(NULL, th, &tp, want_cookie);
 
-	req->mss = tp.mss_clamp;
-	req->ts_recent = tp.saw_tstamp ? tp.rcv_tsval : 0;
-	req->tstamp_ok = tp.tstamp_ok;
-	req->sack_ok = tp.sack_ok;
-	req->snd_wscale = tp.snd_wscale;
-	req->wscale_ok = tp.wscale_ok;
-	req->rmt_port = th->source;
+	tcp_openreq_init(req, &tp, skb);
 
+	req->af.v4_req.loc_addr = daddr;
 	req->af.v4_req.rmt_addr = saddr;
+	req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
+	req->class = &or_ipv4;
 
-	/* Note that we ignore the isn passed from the TIME_WAIT
-	 * state here. That's the price we pay for cookies.
-	 *
-	 * RED-PEN. The price is high... Then we cannot kill TIME-WAIT
-	 * and should reject connection attempt, duplicates with random
-	 * sequence number can corrupt data. Right?
-	 * I disabled sending cookie to request matching to a timewait
-	 * bucket.
-	 */
-	if (want_cookie)
+	if (want_cookie) {
+#ifdef CONFIG_SYN_COOKIES
+		syn_flood_warning(skb);
+#endif
 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
+	} else if (isn == 0) {
+		struct inet_peer *peer = NULL;
 
-	req->snt_isn = isn;
-
-	req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
+		/* VJ's idea. We save last timestamp seen
+		 * from the destination in peer table, when entering
+		 * state TIME-WAIT, and check against it before
+		 * accepting new connection request.
+		 *
+		 * If "isn" is not zero, this request hit alive
+		 * timewait bucket, so that all the necessary checks
+		 * are made in the function processing timewait state.
+		 */
+		if (tp.saw_tstamp &&
		    sysctl_tcp_tw_recycle &&
+		    (dst = tcp_v4_route_req(sk, req)) != NULL &&
+		    (peer = rt_get_peer((struct rtable*)dst)) != NULL &&
+		    peer->v4daddr == saddr) {
+			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
+			    (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) {
+				NETDEBUG(printk(KERN_DEBUG "TW_REC: reject openreq %u/%u %08x/%u\n", peer->tcp_ts, req->ts_recent, saddr, ntohs(skb->h.th->source)));
+				NET_INC_STATS_BH(PAWSPassiveRejected);
+				dst_release(dst);
+				goto drop_and_free;
+			}
+		}
+		/* Kill the following clause, if you dislike this way. */
+		else if (!sysctl_tcp_syncookies &&
+			 (sysctl_max_syn_backlog - tcp_synq_len(sk)
+			  < (sysctl_max_syn_backlog>>2)) &&
+			 (!peer || !peer->tcp_ts_stamp) &&
+			 (!dst || !dst->rtt)) {
+			/* Without syncookies last quarter of
+			 * backlog is filled with destinations, proven to be alive.
+			 * It means that we continue to communicate
+			 * to destinations, already remembered
+			 * to the moment of synflood.
+			 */
+			NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "TCP: drop open request from %08x/%u\n", saddr, ntohs(skb->h.th->source)));
+			TCP_INC_STATS_BH(TcpAttemptFails);
+			dst_release(dst);
+			goto drop_and_free;
+		}
 
-	req->class = &or_ipv4;
-	req->retrans = 0;
-	req->sk = NULL;
+		isn = tcp_v4_init_sequence(sk, skb);
+	}
+	req->snt_isn = isn;
 
-	tcp_v4_send_synack(sk, req);
+	if (tcp_v4_send_synack(sk, req, dst))
+		goto drop_and_free;
 
 	if (want_cookie) {
-	   	if (req->af.v4_req.opt)
-			kfree(req->af.v4_req.opt);
-	   	tcp_v4_or_free(req);
 	   	tcp_openreq_free(req);
 	} else {
-		req->expires = jiffies + TCP_TIMEOUT_INIT;
-		tcp_inc_slow_timer(TCP_SLT_SYNACK);
-		tcp_synq_queue(&sk->tp_pinfo.af_tcp, req);
+		tcp_v4_synq_add(sk, req);
 	}
 
 	return 0;
 
-dropbacklog:
-	if (!want_cookie)
-		BACKLOG(sk)--;
+drop_and_free:
+	tcp_openreq_free(req);
 drop:
 	TCP_INC_STATS_BH(TcpAttemptFails);
 	return 0;
@@ -1497,29 +1424,20 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 				   struct open_request *req,
 				   struct dst_entry *dst)
 {
-	struct ip_options *opt = req->af.v4_req.opt;
 	struct tcp_opt *newtp;
 	struct sock *newsk;
 
-	if (sk->ack_backlog > sk->max_ack_backlog)
-		goto exit; /* head drop */
-	if (dst == NULL) {
-		struct rtable *rt;
-
-		if (ip_route_output(&rt,
-			opt && opt->srr ? opt->faddr : req->af.v4_req.rmt_addr,
-			req->af.v4_req.loc_addr, sk->protinfo.af_inet.tos|RTO_CONN, 0))
-			return NULL;
-		dst = &rt->u.dst;
-	}
+	if (tcp_acceptq_is_full(sk))
+		goto exit_overflow;
+
+	if (dst == NULL &&
	    (dst = tcp_v4_route_req(sk, req)) == NULL)
+		goto exit;
 
 	newsk = tcp_create_openreq_child(sk, req, skb);
 	if (!newsk)
 		goto exit;
 
-	sk->tp_pinfo.af_tcp.syn_backlog--;
-	sk->ack_backlog++;
-
 	newsk->dst_cache = dst;
 
 	newtp = &(newsk->tp_pinfo.af_tcp);
@@ -1527,7 +1445,8 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	newsk->saddr = req->af.v4_req.loc_addr;
 	newsk->rcv_saddr = req->af.v4_req.loc_addr;
 	newsk->protinfo.af_inet.opt = req->af.v4_req.opt;
-	newsk->protinfo.af_inet.mc_index = ((struct rtable*)skb->dst)->rt_iif;
+	req->af.v4_req.opt = NULL;
+	newsk->protinfo.af_inet.mc_index = tcp_v4_iif(skb);
 	newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl;
 	newtp->ext_header_len = 0;
 	if (newsk->protinfo.af_inet.opt)
@@ -1535,28 +1454,26 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	tcp_sync_mss(newsk, dst->pmtu);
 	tcp_initialize_rcv_mss(newsk);
+	newtp->advmss = dst->advmss;
 
-	if (newsk->rcvbuf < (3 * (dst->advmss+40+MAX_HEADER+15)))
-		newsk->rcvbuf = min ((3 * (dst->advmss+40+MAX_HEADER+15)), sysctl_rmem_max);
-	if (newsk->sndbuf < (3 * (newtp->mss_clamp+40+MAX_HEADER+15)))
-		newsk->sndbuf = min ((3 * (newtp->mss_clamp+40+MAX_HEADER+15)), sysctl_wmem_max);
+	tcp_init_buffer_space(newsk);
 
-	bh_lock_sock(newsk);
-
 	__tcp_v4_hash(newsk);
 	__tcp_inherit_port(sk, newsk);
 
 	return newsk;
 
+exit_overflow:
+	NET_INC_STATS_BH(ListenOverflows);
 exit:
+	NET_INC_STATS_BH(ListenDrops);
 	dst_release(dst);
 	return NULL;
 }
 
-
 static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
 {
-	struct open_request *req, *prev;
+	struct open_request *req, **prev;
 	struct tcphdr *th = skb->h.th;
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
@@ -1565,6 +1482,25 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
 	if (req)
 		return tcp_check_req(sk, skb, req, prev);
 
+	if (tp->accept_queue) {
+		struct sock *nsk;
+
+		nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
+						  th->source,
+						  skb->nh.iph->daddr,
+						  ntohs(th->dest),
+						  tcp_v4_iif(skb));
+
+		if (nsk) {
+			if (nsk->state != TCP_TIME_WAIT) {
+				bh_lock_sock(nsk);
+				return nsk;
+			}
+			tcp_tw_put((struct tcp_tw_bucket*)sk);
+			return NULL;
+		}
+	}
+
 #ifdef CONFIG_SYN_COOKIES
 	if (!th->rst && (th->syn || th->ack))
 		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
@@ -1572,27 +1508,26 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
 	return sk;
 }
 
-static int tcp_csum_verify(struct sk_buff *skb)
+static int tcp_v4_checksum_init(struct sk_buff *skb)
 {
-	switch (skb->ip_summed) {
-	case CHECKSUM_NONE:
-		skb->csum = csum_partial((char *)skb->h.th, skb->len, 0);
-	case CHECKSUM_HW:
-		if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,skb->nh.iph->daddr,skb->csum)) {
-			NETDEBUG(printk(KERN_DEBUG "TCPv4 bad checksum "
-					"from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, "
-					"len=%d/%d\n",
-					NIPQUAD(skb->nh.iph->saddr),
-					ntohs(skb->h.th->source),
-					NIPQUAD(skb->nh.iph->daddr),
-					ntohs(skb->h.th->dest),
-					skb->len,
-					ntohs(skb->nh.iph->tot_len)));
-			return 1;
+	if (skb->ip_summed == CHECKSUM_HW) {
+		if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
+				 skb->nh.iph->daddr,skb->csum)) {
+			NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
+			return -1;
 		}
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
-	default:
-		/* CHECKSUM_UNNECESSARY */
+	} else if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
+		if (skb->len <= 68) {
+			if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
+					 skb->nh.iph->daddr,
+					 csum_partial((char *)skb->h.th, skb->len, 0)))
+				return -1;
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+		} else {
+			skb->csum = ~tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
+						  skb->nh.iph->daddr,0);
+		}
 	}
 	return 0;
 }
@@ -1614,66 +1549,35 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 		goto discard;
 #endif /* CONFIG_FILTER */
 
-	/*
-	 *	This doesn't check if the socket has enough room for the packet.
-	 *	Either process the packet _without_ queueing it and then free it,
-	 *	or do the check later.
-	 */
-	skb_set_owner_r(skb, sk);
+	IP_INC_STATS_BH(IpInDelivers);
 
 	if (sk->state == TCP_ESTABLISHED) { /* Fast path */
-		/* Ready to move deeper ... */
-		if (tcp_csum_verify(skb))
-			goto csum_err;
+		TCP_CHECK_TIMER(sk);
 		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
 			goto reset;
+		TCP_CHECK_TIMER(sk);
 		return 0;
 	}
 
-	if (tcp_csum_verify(skb))
+	if (tcp_checksum_complete(skb))
 		goto csum_err;
 
 	if (sk->state == TCP_LISTEN) {
-		struct sock *nsk;
-
-		nsk = tcp_v4_hnd_req(sk, skb);
+		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
 		if (!nsk)
 			goto discard;
 
-		/*
-		 * Queue it on the new socket if the new socket is active,
-		 * otherwise we just shortcircuit this and continue with
-		 * the new socket..
-		 */
 		if (nsk != sk) {
-			int ret;
-			int state = nsk->state;
-
-			skb_orphan(skb);
-
-			BUG_TRAP(nsk->lock.users == 0);
-			skb_set_owner_r(skb, nsk);
-			ret = tcp_rcv_state_process(nsk, skb, skb->h.th, skb->len);
-
-			/* Wakeup parent, send SIGIO, if this packet changed
-			   socket state from SYN-RECV.
-
-			   It still looks ugly, however it is much better
-			   than miracleous double wakeup in syn_recv_sock()
-			   and tcp_rcv_state_process().
-			 */
-			if (state == TCP_SYN_RECV && nsk->state != state)
-				sk->data_ready(sk, 0);
-
-			bh_unlock_sock(nsk);
-			if (ret)
+			if (tcp_child_process(sk, nsk, skb))
 				goto reset;
 			return 0;
 		}
 	}
-
+
+	TCP_CHECK_TIMER(sk);
 	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
 		goto reset;
+	TCP_CHECK_TIMER(sk);
 	return 0;
 
 reset:
@@ -1716,6 +1620,9 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
 	if (len < sizeof(struct tcphdr))
 		goto bad_packet;
 
+	if (tcp_v4_checksum_init(skb) < 0)
+		goto bad_packet;
+
 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
 				    len - th->doff*4);
@@ -1724,7 +1631,7 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
 	skb->used = 0;
 
 	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
-			     skb->nh.iph->daddr, ntohs(th->dest), skb->dev->ifindex);
+			     skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
 
 	if (!sk)
 		goto no_tcp_socket;
@@ -1738,9 +1645,10 @@ process:
 	bh_lock_sock(sk);
 	ret = 0;
-	if (!sk->lock.users)
-		ret = tcp_v4_do_rcv(sk, skb);
-	else
+	if (!sk->lock.users) {
+		if (!tcp_prequeue(sk, skb))
+			ret = tcp_v4_do_rcv(sk, skb);
+	} else
 		sk_add_backlog(sk, skb);
 	bh_unlock_sock(sk);
 
@@ -1749,7 +1657,7 @@ process:
 	return ret;
 
 no_tcp_socket:
-	if (tcp_csum_verify(skb)) {
+	if (tcp_checksum_complete(skb)) {
 bad_packet:
 		TCP_INC_STATS_BH(TcpInErrs);
 	} else {
@@ -1766,7 +1674,7 @@ discard_and_relse:
 	goto discard_it;
 
 do_time_wait:
-	if (tcp_csum_verify(skb)) {
+	if (tcp_checksum_complete(skb)) {
 		TCP_INC_STATS_BH(TcpInErrs);
 		goto discard_and_relse;
 	}
@@ -1776,7 +1684,7 @@ do_time_wait:
 	{
 		struct sock *sk2;
 
-		sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), skb->dev->ifindex);
+		sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
 		if (sk2 != NULL) {
 			tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
 			tcp_timewait_kill((struct tcp_tw_bucket *)sk);
@@ -1796,36 +1704,39 @@ do_time_wait:
 	goto discard_it;
 }
 
+/* With per-bucket locks this operation is not-atomic, so that
+ * this version is not worse.
+ */
 static void __tcp_v4_rehash(struct sock *sk)
 {
-	struct tcp_ehash_bucket *oldhead = &tcp_ehash[sk->hashent];
-	struct tcp_ehash_bucket *head = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))];
-	struct sock **skp = &head->chain;
-
-	write_lock_bh(&oldhead->lock);
-	if(sk->pprev) {
-		if(sk->next)
-			sk->next->pprev = sk->pprev;
-		*sk->pprev = sk->next;
-		sk->pprev = NULL;
-	}
-	write_unlock(&oldhead->lock);
-	write_lock(&head->lock);
-	if((sk->next = *skp) != NULL)
-		(*skp)->pprev = &sk->next;
-	*skp = sk;
-	sk->pprev = skp;
-	write_unlock_bh(&head->lock);
+	sk->prot->unhash(sk);
+	sk->prot->hash(sk);
 }
 
 int tcp_v4_rebuild_header(struct sock *sk)
 {
-	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
+	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
 	__u32 new_saddr;
 	int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT;
 
-	if(rt == NULL)
-		return 0;
+	if (rt == NULL) {
+		int err;
+
+		u32 daddr = sk->daddr;
+
+		if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
+			daddr = sk->protinfo.af_inet.opt->faddr;
+
+		err = ip_route_output(&rt, daddr, sk->saddr,
+				      RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
+				      sk->bound_dev_if);
+		if (err) {
+			sk->err_soft=-err;
+			sk->error_report(sk);
+			return -1;
+		}
+		__sk_dst_set(sk, &rt->u.dst);
+	}
 
 	/* Force route checking if want_rewrite.
	 * The idea is good, the implementation is disguisting.
@@ -1855,16 +1766,6 @@ int tcp_v4_rebuild_header(struct sock *sk)
 			dst_release(&new_rt->u.dst);
 		}
 	}
-	if (rt->u.dst.obsolete) {
-		int err;
-		err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos|RTO_CONN, rt->key.oif);
-		if (err) {
-			sk->err_soft=-err;
-			sk->error_report(sk);
-			return -1;
-		}
-		__sk_dst_set(sk, &rt->u.dst);
-	}
 
 	return 0;
 
@@ -1877,7 +1778,7 @@ do_rewrite:
 			       "saddr=%08X rcv_saddr=%08X\n",
 			       ntohl(sk->saddr),
 			       ntohl(sk->rcv_saddr));
-		return 0;
+		return -1;
 	}
 
 	if (new_saddr != sk->saddr) {
@@ -1895,7 +1796,7 @@ do_rewrite:
 		 * XXX really change the sockets identity after
 		 * XXX it has entered the hashes. -DaveM
 		 *
-		 * Besides that, it does not check for connetion
+		 * Besides that, it does not check for connection
 		 * uniqueness. Wait for troubles.
 		 */
 		__tcp_v4_rehash(sk);
@@ -1913,6 +1814,63 @@ static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
 	sin->sin_port	= sk->dport;
 }
 
+/* VJ's idea. Save last timestamp seen from this destination
+ * and hold it at least for normal timewait interval to use for duplicate
+ * segment detection in subsequent connections, before they enter synchronized
+ * state.
+ */
+
+int tcp_v4_remember_stamp(struct sock *sk)
+{
+	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+	struct rtable *rt = (struct rtable*)__sk_dst_get(sk);
+	struct inet_peer *peer = NULL;
+	int release_it = 0;
+
+	if (rt == NULL || rt->rt_dst != sk->daddr) {
+		peer = inet_getpeer(sk->daddr, 1);
+		release_it = 1;
+	} else {
+		if (rt->peer == NULL)
+			rt_bind_peer(rt, 1);
+		peer = rt->peer;
+	}
+
+	if (peer) {
+		if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
+		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
+		     peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
+			peer->tcp_ts_stamp = tp->ts_recent_stamp;
+			peer->tcp_ts = tp->ts_recent;
+		}
+		if (release_it)
+			inet_putpeer(peer);
+		return 1;
+	}
+
+	return 0;
+}
+
+int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
+{
+	struct inet_peer *peer = NULL;
+
+	peer = inet_getpeer(tw->daddr, 1);
+
+	if (peer) {
+		if ((s32)(peer->tcp_ts - tw->ts_recent) <= 0 ||
+		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
+		     peer->tcp_ts_stamp <= tw->ts_recent_stamp)) {
+			peer->tcp_ts_stamp = tw->ts_recent_stamp;
+			peer->tcp_ts = tw->ts_recent;
+		}
+		inet_putpeer(peer);
+		return 1;
+	}
+
+	return 0;
+}
+
 struct tcp_func ipv4_specific = {
 	ip_queue_xmit,
 	tcp_v4_send_check,
@@ -1920,6 +1878,7 @@ struct tcp_func ipv4_specific = {
 	tcp_v4_conn_request,
 	tcp_v4_syn_recv_sock,
 	tcp_v4_hash_connecting,
+	tcp_v4_remember_stamp,
 	sizeof(struct iphdr),
 
 	ip_setsockopt,
@@ -1937,6 +1896,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 	skb_queue_head_init(&tp->out_of_order_queue);
 	tcp_init_xmit_timers(sk);
+	tcp_prequeue_init(tp);
 
 	tp->rto  = TCP_TIMEOUT_INIT;
 	tp->mdev = TCP_TIMEOUT_INIT;
@@ -1951,19 +1911,14 @@ static int tcp_v4_init_sock(struct sock *sk)
 	/* See draft-stevens-tcpca-spec-01 for discussion of the
 	 * initialization of these values.
 	 */
-	tp->snd_cwnd_cnt = 0;
 	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
 	tp->snd_cwnd_clamp = ~0;
 	tp->mss_cache = 536;
 
 	sk->state = TCP_CLOSE;
-	sk->max_ack_backlog = SOMAXCONN;
 
 	sk->write_space = tcp_write_space;
 
-	/* Init SYN queue. */
-	tcp_synq_init(tp);
-
 	sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
 
 	return 0;
@@ -1981,9 +1936,10 @@ static int tcp_v4_destroy_sock(struct sock *sk)
 	/* Cleans up our, hopefuly empty, out_of_order_queue. */
 	__skb_queue_purge(&tp->out_of_order_queue);
 
-	/* Clean up a referenced TCP bind bucket, this only happens if a
-	 * port is allocated for a socket, but it never fully connects.
-	 */
+	/* Clean prequeue, it must be empty really */
+	__skb_queue_purge(&tp->ucopy.prequeue);
+
+	/* Clean up a referenced TCP bind bucket. */
 	if(sk->prev != NULL)
 		tcp_put_port(sk);
 
@@ -1993,17 +1949,19 @@ static int tcp_v4_destroy_sock(struct sock *sk)
 /* Proc filesystem TCP sock list dumping. */
 static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i)
 {
-	sprintf(tmpbuf, "%4d: %08lX:%04X %08lX:%04X"
-		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
+	int ttd = req->expires - jiffies;
+
+	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
+		" %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
 		i,
-		(long unsigned int)req->af.v4_req.loc_addr,
+		req->af.v4_req.loc_addr,
 		ntohs(sk->sport),
-		(long unsigned int)req->af.v4_req.rmt_addr,
+		req->af.v4_req.rmt_addr,
 		ntohs(req->rmt_port),
 		TCP_SYN_RECV,
 		0,0, /* could print option size, but that is af dependent. */
 		1,   /* timers active (only the expire timer) */
-		(unsigned long)(req->expires - jiffies),
+		ttd,
 		req->retrans,
 		sk->socket ? sk->socket->inode->i_uid : 0,
 		0,  /* non standard timer */
@@ -2017,7 +1975,7 @@ static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
 {
 	unsigned int dest, src;
 	__u16 destp, srcp;
-	int timer_active, timer_active1, timer_active2;
+	int timer_active;
 	unsigned long timer_expires;
 	struct tcp_opt *tp = &sp->tp_pinfo.af_tcp;
 
@@ -2025,15 +1983,16 @@ static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
 	src   = sp->rcv_saddr;
 	destp = ntohs(sp->dport);
 	srcp  = ntohs(sp->sport);
-	timer_active1 = tp->retransmit_timer.prev != NULL;
-	timer_active2 = sp->timer.prev != NULL;
 	timer_active	= 0;
 	timer_expires	= (unsigned) -1;
-	if (timer_active1 && tp->retransmit_timer.expires < timer_expires) {
+	if (tp->retransmit_timer.prev != NULL && tp->retransmit_timer.expires < timer_expires) {
 		timer_active	= 1;
 		timer_expires	= tp->retransmit_timer.expires;
+	} else if (tp->probe_timer.prev != NULL && tp->probe_timer.expires < timer_expires) {
+		timer_active	= 4;
+		timer_expires	= tp->probe_timer.expires;
 	}
-	if (timer_active2 && sp->timer.expires < timer_expires) {
+	if (sp->timer.prev != NULL && sp->timer.expires < timer_expires) {
 		timer_active	= 2;
 		timer_expires	= sp->timer.expires;
 	}
@@ -2041,38 +2000,37 @@ static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
 		timer_expires = jiffies;
 
 	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
-		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld %d %p",
+		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld %d %p %u %u %u %u",
 		i, src, srcp, dest, destp, sp->state,
 		tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq,
 		timer_active, timer_expires-jiffies,
 		tp->retransmits,
 		sp->socket ? sp->socket->inode->i_uid : 0,
-		0,
+		tp->probes_out,
 		sp->socket ? sp->socket->inode->i_ino : 0,
-		atomic_read(&sp->refcnt), sp);
+		atomic_read(&sp->refcnt), sp,
+		tp->rto, tp->ack.ato, tp->ack.quick, tp->ack.pingpong
+		);
 }
 
 static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
 {
 	unsigned int dest, src;
 	__u16 destp, srcp;
-	int slot_dist;
+	int ttd = tw->ttd - jiffies;
+
+	if (ttd < 0)
+		ttd = 0;
 
 	dest  = tw->daddr;
 	src   = tw->rcv_saddr;
 	destp = ntohs(tw->dport);
 	srcp  = ntohs(tw->sport);
 
-	slot_dist = tw->death_slot;
-	if(slot_dist > tcp_tw_death_row_slot)
-		slot_dist = (TCP_TWKILL_SLOTS - slot_dist) + tcp_tw_death_row_slot;
-	else
-		slot_dist = tcp_tw_death_row_slot - slot_dist;
-
 	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
 		" %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
-		i, src, srcp, dest, destp, TCP_TIME_WAIT, 0, 0,
-		3, slot_dist * TCP_TWKILL_PERIOD, 0, 0, 0, 0,
+		i, src, srcp, dest, destp, tw->substate, 0, 0,
+		3, ttd, 0, 0, 0, 0,
 		atomic_read(&tw->refcnt), tw);
 }
 
@@ -2093,6 +2051,8 @@ int tcp_get_info(char *buffer, char **start, off_t offset, int length)
 	tcp_listen_lock();
 	for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
 		struct sock *sk = tcp_listening_hash[i];
+		struct tcp_listen_opt *lopt;
+		int k;
 
 		for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) {
 			struct open_request *req;
@@ -2112,25 +2072,30 @@ int tcp_get_info(char *buffer, char **start, off_t offset, int length)
 			}
 
 skip_listen:
-			lock_sock(sk);
-			for (req = tp->syn_wait_queue; req; req = req->dl_next, num++) {
-				if (req->sk)
-					continue;
-				if (!TCP_INET_FAMILY(req->class->family))
-					continue;
-
-				pos += 128;
-				if (pos < offset)
-					continue;
-				get_openreq(sk, req, tmpbuf, num);
-				len += sprintf(buffer+len, "%-127s\n", tmpbuf);
-				if(len >= length) {
-					tcp_listen_unlock();
-					release_sock(sk);
-					goto out_no_bh;
+			read_lock_bh(&tp->syn_wait_lock);
+			lopt = tp->listen_opt;
+			if (lopt && lopt->qlen != 0) {
+				for (k=0; k<TCP_SYNQ_HSIZE; k++) {
+					for (req = lopt->syn_table[k]; req; req = req->dl_next, num++) {
+						if (!TCP_INET_FAMILY(req->class->family))
+							continue;
+
+						pos += 128;
+						if (pos < offset)
+							continue;
+						get_openreq(sk, req, tmpbuf, num);
+						len += sprintf(buffer+len, "%-127s\n", tmpbuf);
+						if(len >= length) {
+							read_unlock_bh(&tp->syn_wait_lock);
+							tcp_listen_unlock();
+							goto out_no_bh;
+						}
+					}
 				}
 			}
-			release_sock(sk);
+			read_unlock_bh(&tp->syn_wait_lock);
+
+			/* Completed requests are in normal socket hash table */
 		}
 	}
 	tcp_listen_unlock();
@@ -2194,28 +2159,24 @@ struct proto tcp_prot = {
 	tcp_v4_connect,			/* connect */
 	tcp_disconnect,			/* disconnect */
 	tcp_accept,			/* accept */
-	NULL,				/* retransmit */
-	tcp_write_wakeup,		/* write_wakeup */
-	tcp_read_wakeup,		/* read_wakeup */
-	tcp_poll,			/* poll */
 	tcp_ioctl,			/* ioctl */
 	tcp_v4_init_sock,		/* init */
 	tcp_v4_destroy_sock,		/* destroy */
 	tcp_shutdown,			/* shutdown */
 	tcp_setsockopt,			/* setsockopt */
 	tcp_getsockopt,			/* getsockopt */
-	tcp_v4_sendmsg,			/* sendmsg */
+	tcp_sendmsg,			/* sendmsg */
 	tcp_recvmsg,			/* recvmsg */
 	NULL,				/* bind */
 	tcp_v4_do_rcv,			/* backlog_rcv */
 	tcp_v4_hash,			/* hash */
 	tcp_unhash,			/* unhash */
 	tcp_v4_get_port,		/* get_port */
-	128,				/* max_header */
-	0,				/* retransmits */
 	"TCP",				/* name */
 };
+
+
 
 void __init tcp_v4_init(struct net_proto_family *ops)
 {
	int err;
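
A note on the tcp_listen_wlock() change above: switching the waiters to an exclusive sleep means a wakeup releases one writer instead of the whole herd, all but one of which would otherwise immediately collide on the write lock. The same trade-off exists in user space, where pthread_cond_broadcast() wakes every waiter while pthread_cond_signal() plays the role of the exclusive wakeup. A rough sketch of the analogy under assumed names (this is not the kernel wait-queue API; link with -lpthread):

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int users;		/* analogue of tcp_lhash_users */

void writer_wait(void)
{
	pthread_mutex_lock(&lock);
	while (users != 0)	/* re-check the condition after every wakeup */
		pthread_cond_wait(&cond, &lock);
	/* ... take the write lock's role here ... */
	pthread_mutex_unlock(&lock);
}

void reader_release(void)
{
	pthread_mutex_lock(&lock);
	users--;
	if (users == 0)
		pthread_cond_signal(&cond);	/* exclusive: wake one writer */
	pthread_mutex_unlock(&lock);
}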
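
The tcp_v4_synq_hash() hunk above replaces the old linear walk of the SYN queue with a fixed-size hash table indexed by remote address and port. A minimal user-space sketch of the same fold; TCP_SYNQ_HSIZE is an assumed value here (the mask in the last line only works if it is a power of two):

#include <stdio.h>
#include <stdint.h>

#define TCP_SYNQ_HSIZE 512	/* assumed; must be a power of two */

static unsigned synq_hash(uint32_t raddr, uint16_t rport)
{
	unsigned h = raddr ^ rport;
	h ^= h >> 16;	/* fold high address bits into the low bits */
	h ^= h >> 8;
	return h & (TCP_SYNQ_HSIZE - 1);
}

int main(void)
{
	/* Two example peers, 192.0.2.1:1024 and 192.0.2.2:1024,
	 * land in different buckets despite sharing a port. */
	printf("%u\n", synq_hash(0xC0000201u, 1024));
	printf("%u\n", synq_hash(0xC0000202u, 1024));
	return 0;
}

The two XOR folds matter because the low bits of nearby addresses are often identical; mixing in the upper bytes keeps peers from clustering in a few buckets.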
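
Both the tcp_v4_connect() and tcp_v4_conn_request() hunks rely on VJ's per-peer timestamp cache: the last timestamp seen from a destination is stored in the inet_peer table when a socket enters TIME-WAIT, and a later SYN whose timestamp is older than the cached value is rejected while the entry is still fresh. A hedged user-space sketch of that acceptance test; the field names mirror the diff, but the TCP_PAWS_MSL and TCP_PAWS_WINDOW values are assumptions, not the kernel's definitions:

#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define TCP_PAWS_MSL	60	/* assumed: seconds a cached stamp stays valid */
#define TCP_PAWS_WINDOW	1	/* assumed: tolerated backward timestamp step */

struct peer_stamp {
	uint32_t tcp_ts;	/* last timestamp seen from this peer */
	time_t   tcp_ts_stamp;	/* wall-clock second it was recorded */
};

/* Nonzero means the SYN's timestamp proves it predates segments already
 * accepted from this peer, so the open request must be dropped. */
static int paws_reject(const struct peer_stamp *p, uint32_t req_ts, time_t now)
{
	return now < p->tcp_ts_stamp + TCP_PAWS_MSL &&
	       (int32_t)(p->tcp_ts - req_ts) > TCP_PAWS_WINDOW;
}

int main(void)
{
	struct peer_stamp p = { .tcp_ts = 1000, .tcp_ts_stamp = 100 };
	/* A SYN carrying timestamp 500 inside the hold interval is rejected. */
	printf("%d\n", paws_reject(&p, 500, 120));
	return 0;
}

The related "last quarter of backlog" clause in the same hunk is the complementary heuristic: without syncookies, once the SYN queue is three-quarters full, requests from destinations with no proven-alive history are dropped first.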
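
Finally, tcp_v4_checksum_init() above defers full verification for packets longer than 68 bytes: it stores only the complemented pseudo-header sum in skb->csum and lets a later copy-and-checksum pass finish the job. For reference, a self-contained sketch of the 16-bit one's-complement sum that tcp_v4_check() computes over the pseudo-header and segment; byte order is handled explicitly here, unlike the kernel's optimized csum helpers, and a return value of 0 means the segment (with its checksum field included) verifies:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

static uint16_t csum_fold32(uint32_t sum)
{
	/* Fold carries back into the low 16 bits, twice, then complement. */
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

static uint16_t tcp_checksum(uint32_t saddr, uint32_t daddr,
			     const uint8_t *seg, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	/* Pseudo-header: source, destination, protocol, TCP length. */
	sum += (saddr >> 16) + (saddr & 0xffff);
	sum += (daddr >> 16) + (daddr & 0xffff);
	sum += 6;		/* IPPROTO_TCP */
	sum += (uint32_t)len;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)(seg[i] << 8 | seg[i + 1]);
	if (i < len)		/* odd trailing byte is padded with zero */
		sum += (uint32_t)(seg[i] << 8);

	return csum_fold32(sum);
}

int main(void)
{
	const uint8_t seg[] = { 0x04, 0x00, 0x00, 0x50 };	/* toy bytes */
	printf("0x%04x\n", tcp_checksum(0xC0000201u, 0xC0000202u, seg, sizeof seg));
	return 0;
}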