 net/ipv4/tcp.c | 121 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 90 insertions(+), 31 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8c1c9f9be..779c31cef 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp.c,v 1.140 1999/04/22 10:34:31 davem Exp $
+ * Version: $Id: tcp.c,v 1.144 1999/05/27 01:03:37 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -416,6 +416,7 @@
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/init.h>
+#include <linux/smp_lock.h>
#include <net/icmp.h>
#include <net/tcp.h>
@@ -432,7 +433,7 @@ kmem_cache_t *tcp_timewait_cachep;
/*
* Find someone to 'accept'. Must be called with
- * the socket locked or with interrupts disabled
+ * the listening socket locked.
*/
static struct open_request *tcp_find_established(struct tcp_opt *tp,
@@ -441,10 +442,11 @@ static struct open_request *tcp_find_established(struct tcp_opt *tp,
struct open_request *req = tp->syn_wait_queue;
struct open_request *prev = (struct open_request *)&tp->syn_wait_queue;
while(req) {
- if (req->sk &&
- ((1 << req->sk->state) &
- ~(TCPF_SYN_SENT|TCPF_SYN_RECV)))
- break;
+ if (req->sk) {
+ if((1 << req->sk->state) &
+ ~(TCPF_SYN_SENT|TCPF_SYN_RECV))
+ break;
+ }
prev = req;
req = req->dl_next;
}
@@ -655,12 +657,13 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
/*
* Wait for a socket to get into the connected state
*
- * Note: must be called with the socket locked.
+ * Note: Must be called with the socket locked, and it
+ * runs with the kernel fully unlocked.
*/
static int wait_for_tcp_connect(struct sock * sk, int flags)
{
struct task_struct *tsk = current;
- struct wait_queue wait = { tsk, NULL };
+ DECLARE_WAITQUEUE(wait, tsk);
while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
if(sk->err)
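wait_for_tcp_connect() and the other sleepers touched below switch from the open-coded "struct wait_queue wait = { tsk, NULL };" initializer to the DECLARE_WAITQUEUE() macro; the surrounding sleep logic is unchanged. As a hedged sketch, the shape all of these call sites share looks roughly like this (should_stop_sleeping() is an illustrative predicate, not a real kernel helper):

    /* Sketch against the 2.3-era socket and wait-queue API; not a drop-in function. */
    static void sleep_on_sock(struct sock *sk)
    {
            DECLARE_WAITQUEUE(wait, current);       /* on-stack wait-queue entry for this task */

            add_wait_queue(sk->sleep, &wait);
            for (;;) {
                    current->state = TASK_INTERRUPTIBLE;
                    if (should_stop_sleeping(sk))   /* illustrative predicate */
                            break;
                    release_sock(sk);               /* drop the socket lock while asleep */
                    schedule();
                    lock_sock(sk);
            }
            current->state = TASK_RUNNING;
            remove_wait_queue(sk->sleep, &wait);
    }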
@@ -698,12 +701,14 @@ static inline int tcp_memory_free(struct sock *sk)
/*
* Wait for more memory for a socket
+ *
+ * NOTE: This runs with the kernel fully unlocked.
*/
static void wait_for_tcp_memory(struct sock * sk)
{
release_sock(sk);
if (!tcp_memory_free(sk)) {
- struct wait_queue wait = { current, NULL };
+ DECLARE_WAITQUEUE(wait, current);
sk->socket->flags &= ~SO_NOSPACE;
add_wait_queue(sk->sleep, &wait);
@@ -744,6 +749,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
int mss_now;
int err, copied;
+ unlock_kernel();
lock_sock(sk);
err = 0;
@@ -896,6 +902,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
err = -ERESTARTSYS;
goto do_interrupted;
}
+ tcp_push_pending_frames(sk, tp);
wait_for_tcp_memory(sk);
/* If SACK's were formed or PMTU events happened,
@@ -969,6 +976,7 @@ do_fault2:
out:
tcp_push_pending_frames(sk, tp);
release_sock(sk);
+ lock_kernel();
return err;
}
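The sendmsg path above, like tcp_recvmsg(), tcp_accept() and tcp_close() further down, now drops the big kernel lock on entry and retakes it just before returning, relying on the per-socket lock in between. A sketch of the bracketing pattern only (the function name is illustrative, not the actual sendmsg body):

    int tcp_locked_operation(struct sock *sk)
    {
            int err = 0;

            unlock_kernel();        /* entered from the syscall path with the BKL held */
            lock_sock(sk);          /* the per-socket lock now protects TCP state */

            /* ... work that may sleep, e.g. in wait_for_tcp_memory() ... */

            release_sock(sk);
            lock_kernel();          /* restore the BKL for the caller */
            return err;
    }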
@@ -1117,7 +1125,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
int len, int nonblock, int flags, int *addr_len)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- struct wait_queue wait = { current, NULL };
+ DECLARE_WAITQUEUE(wait, current);
int copied = 0;
u32 peek_seq;
volatile u32 *seq; /* So gcc doesn't overoptimise */
@@ -1148,6 +1156,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
if (flags & MSG_WAITALL)
target=len;
+ unlock_kernel();
add_wait_queue(sk->sleep, &wait);
lock_sock(sk);
@@ -1300,6 +1309,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
/* We now will not sleep again until we are finished
* with skb. Sorry if you are doing the SMP port
* but you'll just have to fix it neatly ;)
+ *
+ * Very funny Alan... -DaveM
*/
atomic_dec(&skb->users);
@@ -1344,6 +1355,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
/* Clean up data we have read: This will do ACK frames. */
cleanup_rbuf(sk, copied);
release_sock(sk);
+ lock_kernel();
return copied;
}
@@ -1415,16 +1427,15 @@ void tcp_shutdown(struct sock *sk, int how)
return;
/* If we've already sent a FIN, or it's a closed state, skip this. */
+ lock_sock(sk);
if ((1 << sk->state) &
(TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
- lock_sock(sk);
/* Clear out any half completed packets. FIN if needed. */
if (tcp_close_state(sk,0))
tcp_send_fin(sk);
-
- release_sock(sk);
}
+ release_sock(sk);
}
@@ -1471,13 +1482,6 @@ void tcp_close(struct sock *sk, long timeout)
struct sk_buff *skb;
int data_was_unread = 0;
- /*
- * Check whether the socket is locked ... supposedly
- * it's impossible to tcp_close() a locked socket.
- */
- if (atomic_read(&sk->sock_readers))
- printk("tcp_close: socket already locked!\n");
-
/* We need to grab some memory, and put together a FIN,
* and then put it into the queue to be sent.
*/
@@ -1491,6 +1495,8 @@ void tcp_close(struct sock *sk, long timeout)
return;
}
+ unlock_kernel();
+
/* It is questionable, what the role of this is now.
* In any event either it should be removed, or
* increment of SLT_KEEPALIVE be done, this is causing
@@ -1534,24 +1540,23 @@ void tcp_close(struct sock *sk, long timeout)
if (timeout) {
struct task_struct *tsk = current;
- struct wait_queue wait = { tsk, NULL };
+ DECLARE_WAITQUEUE(wait, current);
add_wait_queue(sk->sleep, &wait);
- release_sock(sk);
while (1) {
tsk->state = TASK_INTERRUPTIBLE;
if (!closing(sk))
break;
+ release_sock(sk);
timeout = schedule_timeout(timeout);
+ lock_sock(sk);
if (signal_pending(tsk) || !timeout)
break;
}
tsk->state = TASK_RUNNING;
remove_wait_queue(sk->sleep, &wait);
-
- lock_sock(sk);
}
/* Now that the socket is dead, if we are in the FIN_WAIT2 state
@@ -1559,23 +1564,40 @@ void tcp_close(struct sock *sk, long timeout)
*/
tcp_check_fin_timer(sk);
- release_sock(sk);
sk->dead = 1;
+
+ release_sock(sk);
+ lock_kernel();
}
/*
* Wait for an incoming connection, avoid race
- * conditions. This must be called with the socket locked.
+ * conditions. This must be called with the socket locked,
+ * and without the kernel lock held.
*/
static struct open_request * wait_for_connect(struct sock * sk,
struct open_request **pprev)
{
- struct wait_queue wait = { current, NULL };
+ DECLARE_WAITQUEUE(wait, current);
struct open_request *req;
- add_wait_queue(sk->sleep, &wait);
+ /*
+ * True wake-one mechanism for incoming connections: only
+ * one process gets woken up, not the 'whole herd'.
+ * Since we do not 'race & poll' for established sockets
+ * anymore, the common case will execute the loop only once.
+ *
+ * Subtle issue: "add_wait_queue_exclusive()" will be added
+ * after any current non-exclusive waiters, and we know that
+ * it will always _stay_ after any new non-exclusive waiters
+ * because all non-exclusive waiters are added at the
+ * beginning of the wait-queue. As such, it's ok to "drop"
+ * our exclusiveness temporarily when we get woken up without
+ * having to remove and re-insert us on the wait queue.
+ */
+ add_wait_queue_exclusive(sk->sleep, &wait);
for (;;) {
- current->state = TASK_INTERRUPTIBLE;
+ current->state = TASK_EXCLUSIVE | TASK_INTERRUPTIBLE;
release_sock(sk);
schedule();
lock_sock(sk);
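The wake-one comment above is the heart of this hunk: with add_wait_queue_exclusive() and the TASK_EXCLUSIVE state bit, only one of the sleeping accept()ers is woken per incoming connection instead of the whole herd. A minimal sketch of an exclusive waiter using the same primitives (work_available() is an illustrative predicate):

    DECLARE_WAITQUEUE(wait, current);

    add_wait_queue_exclusive(sk->sleep, &wait);     /* queued behind all non-exclusive waiters */
    for (;;) {
            /* TASK_EXCLUSIVE asks the waker to stop after waking one such sleeper. */
            current->state = TASK_EXCLUSIVE | TASK_INTERRUPTIBLE;
            if (work_available(sk))                 /* illustrative predicate */
                    break;
            schedule();
            if (signal_pending(current))
                    break;
    }
    current->state = TASK_RUNNING;
    remove_wait_queue(sk->sleep, &wait);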
@@ -1603,6 +1625,7 @@ struct sock *tcp_accept(struct sock *sk, int flags)
struct sock *newsk = NULL;
int error;
+ unlock_kernel();
lock_sock(sk);
/* We need to make sure that this socket is listening,
@@ -1633,16 +1656,17 @@ struct sock *tcp_accept(struct sock *sk, int flags)
sk->ack_backlog--;
if(sk->keepopen)
tcp_inc_slow_timer(TCP_SLT_KEEPALIVE);
-
release_sock(sk);
+ lock_kernel();
return newsk;
out:
/* sk should be in LISTEN state, thus accept can use sk->err for
- * internal purposes without stomping one anyone's feed.
+ * internal purposes without stomping on anyone's feed.
*/
sk->err = error;
release_sock(sk);
+ lock_kernel();
return newsk;
}
@@ -1765,6 +1789,8 @@ extern void __skb_cb_too_small_for_tcp(int, int);
void __init tcp_init(void)
{
struct sk_buff *skb = NULL;
+ unsigned long goal;
+ int order;
if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
@@ -1790,4 +1816,37 @@ void __init tcp_init(void)
NULL, NULL);
if(!tcp_timewait_cachep)
panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
+
+ /* Size and allocate the main established and bind bucket
+ * hash tables.
+ *
+ * The methodology is similar to that of the buffer cache.
+ */
+ goal = num_physpages >> (20 - PAGE_SHIFT);
+ for(order = 5; (1UL << order) < goal; order++)
+ ;
+ do {
+ tcp_ehash_size = (1UL << order) * PAGE_SIZE /
+ sizeof(struct sock *);
+ tcp_ehash = (struct sock **)
+ __get_free_pages(GFP_ATOMIC, order);
+ } while (tcp_ehash == NULL && --order > 4);
+
+ if (!tcp_ehash)
+ panic("Failed to allocate TCP established hash table\n");
+ memset(tcp_ehash, 0, tcp_ehash_size * sizeof(struct sock *));
+
+ do {
+ tcp_bhash_size = (1UL << order) * PAGE_SIZE /
+ sizeof(struct tcp_bind_bucket *);
+ tcp_bhash = (struct tcp_bind_bucket **)
+ __get_free_pages(GFP_ATOMIC, order);
+ } while (tcp_bhash == NULL && --order > 4);
+
+ if (!tcp_bhash)
+ panic("Failed to allocate TCP bind hash table\n");
+ memset(tcp_bhash, 0, tcp_bhash_size * sizeof(struct tcp_bind_bucket *));
+
+ printk("TCP: Hash tables configured (established %d bind %d)\n",
+ tcp_ehash_size, tcp_bhash_size);
}
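To make the sizing arithmetic concrete: goal works out to roughly one page of hash table per megabyte of physical memory, with a floor of 2^5 pages, and the allocation backs off to smaller orders when __get_free_pages() fails at the preferred size; the bind hash then reuses whatever order the established hash ended up with. The small user-space sketch below reproduces the same arithmetic under assumed values (4 KB pages, 4-byte pointers as on a 32-bit box, 128 MB of RAM); it prints goal = 128 pages, order = 7, i.e. 131072 established-hash buckets:

    #include <stdio.h>

    int main(void)
    {
            unsigned long page_size  = 4096;        /* assumed PAGE_SIZE */
            unsigned long page_shift = 12;          /* assumed PAGE_SHIFT */
            unsigned long ptr_size   = 4;           /* assumed sizeof(struct sock *) */
            unsigned long num_physpages = (128UL << 20) / page_size;   /* 128 MB machine */

            unsigned long goal = num_physpages >> (20 - page_shift);   /* pages of hash table */
            int order;

            for (order = 5; (1UL << order) < goal; order++)
                    ;

            printf("goal = %lu pages, order = %d, ehash buckets = %lu\n",
                   goal, order, (1UL << order) * page_size / ptr_size);
            return 0;
    }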