 net/ipv4/tcp.c | 121 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 90 insertions(+), 31 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8c1c9f9be..779c31cef 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp.c,v 1.140 1999/04/22 10:34:31 davem Exp $
+ * Version: $Id: tcp.c,v 1.144 1999/05/27 01:03:37 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -416,6 +416,7 @@
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/init.h>
+#include <linux/smp_lock.h>
#include <net/icmp.h>
#include <net/tcp.h>
@@ -432,7 +433,7 @@ kmem_cache_t *tcp_timewait_cachep;
/*
* Find someone to 'accept'. Must be called with
- * the socket locked or with interrupts disabled
+ * the listening socket locked.
*/
static struct open_request *tcp_find_established(struct tcp_opt *tp,
@@ -441,10 +442,11 @@ static struct open_request *tcp_find_established(struct tcp_opt *tp,
struct open_request *req = tp->syn_wait_queue;
struct open_request *prev = (struct open_request *)&tp->syn_wait_queue;
while(req) {
- if (req->sk &&
- ((1 << req->sk->state) &
- ~(TCPF_SYN_SENT|TCPF_SYN_RECV)))
- break;
+ if (req->sk) {
+ if((1 << req->sk->state) &
+ ~(TCPF_SYN_SENT|TCPF_SYN_RECV))
+ break;
+ }
prev = req;
req = req->dl_next;
}
@@ -655,12 +657,13 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
/*
* Wait for a socket to get into the connected state
*
- * Note: must be called with the socket locked.
+ * Note: Must be called with the socket locked, and it
+ * runs with the kernel fully unlocked.
*/
static int wait_for_tcp_connect(struct sock * sk, int flags)
{
struct task_struct *tsk = current;
- struct wait_queue wait = { tsk, NULL };
+ DECLARE_WAITQUEUE(wait, tsk);
while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
if(sk->err)
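wait_for_tcp_connect() and the other sleepers touched below switch from the open-coded "struct wait_queue wait = { tsk, NULL };" initializer to the DECLARE_WAITQUEUE() macro; the surrounding sleep logic is unchanged. As a hedged sketch, the shape all of these call sites share looks roughly like this (should_stop_sleeping() is an illustrative predicate, not a real kernel helper):

    /* Sketch against the 2.3-era socket and wait-queue API; not a drop-in function. */
    static void sleep_on_sock(struct sock *sk)
    {
            DECLARE_WAITQUEUE(wait, current);       /* on-stack wait-queue entry for this task */

            add_wait_queue(sk->sleep, &wait);
            for (;;) {
                    current->state = TASK_INTERRUPTIBLE;
                    if (should_stop_sleeping(sk))   /* illustrative predicate */
                            break;
                    release_sock(sk);               /* drop the socket lock while asleep */
                    schedule();
                    lock_sock(sk);
            }
            current->state = TASK_RUNNING;
            remove_wait_queue(sk->sleep, &wait);
    }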
@@ -698,12 +701,14 @@ static inline int tcp_memory_free(struct sock *sk)
/*
* Wait for more memory for a socket
+ *
+ * NOTE: This runs with the kernel fully unlocked.
*/
static void wait_for_tcp_memory(struct sock * sk)
{
release_sock(sk);
if (!tcp_memory_free(sk)) {
- struct wait_queue wait = { current, NULL };
+ DECLARE_WAITQUEUE(wait, current);
sk->socket->flags &= ~SO_NOSPACE;
add_wait_queue(sk->sleep, &wait);
@@ -744,6 +749,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
int mss_now;
int err, copied;
+ unlock_kernel();
lock_sock(sk);
err = 0;
@@ -896,6 +902,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
err = -ERESTARTSYS;
goto do_interrupted;
}
+ tcp_push_pending_frames(sk, tp);
wait_for_tcp_memory(sk);
/* If SACK's were formed or PMTU events happened,
@@ -969,6 +976,7 @@ do_fault2:
out:
tcp_push_pending_frames(sk, tp);
release_sock(sk);
+ lock_kernel();
return err;
}
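The sendmsg path above, like tcp_recvmsg(), tcp_accept() and tcp_close() further down, now drops the big kernel lock on entry and retakes it just before returning, relying on the per-socket lock in between. A sketch of the bracketing pattern only (the function name is illustrative, not the actual sendmsg body):

    int tcp_locked_operation(struct sock *sk)
    {
            int err = 0;

            unlock_kernel();        /* entered from the syscall path with the BKL held */
            lock_sock(sk);          /* the per-socket lock now protects TCP state */

            /* ... work that may sleep, e.g. in wait_for_tcp_memory() ... */

            release_sock(sk);
            lock_kernel();          /* restore the BKL for the caller */
            return err;
    }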
@@ -1117,7 +1125,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
int len, int nonblock, int flags, int *addr_len)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- struct wait_queue wait = { current, NULL };
+ DECLARE_WAITQUEUE(wait, current);
int copied = 0;
u32 peek_seq;
volatile u32 *seq; /* So gcc doesn't overoptimise */
@@ -1148,6 +1156,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
if (flags & MSG_WAITALL)
target=len;
+ unlock_kernel();
add_wait_queue(sk->sleep, &wait);
lock_sock(sk);
@@ -1300,6 +1309,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
/* We now will not sleep again until we are finished
* with skb. Sorry if you are doing the SMP port
* but you'll just have to fix it neatly ;)
+ *
+ * Very funny Alan... -DaveM
*/
atomic_dec(&skb->users);
@@ -1344,6 +1355,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
/* Clean up data we have read: This will do ACK frames. */
cleanup_rbuf(sk, copied);
release_sock(sk);
+ lock_kernel();
return copied;
}
@@ -1415,16 +1427,15 @@ void tcp_shutdown(struct sock *sk, int how)
return;
/* If we've already sent a FIN, or it's a closed state, skip this. */
+ lock_sock(sk);
if ((1 << sk->state) &
(TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
- lock_sock(sk);
/* Clear out any half completed packets. FIN if needed. */
if (tcp_close_state(sk,0))
tcp_send_fin(sk);
-
- release_sock(sk);
}
+ release_sock(sk);
}
@@ -1471,13 +1482,6 @@ void tcp_close(struct sock *sk, long timeout)
struct sk_buff *skb;
int data_was_unread = 0;
- /*
- * Check whether the socket is locked ... supposedly
- * it's impossible to tcp_close() a locked socket.
- */
- if (atomic_read(&sk->sock_readers))
- printk("tcp_close: socket already locked!\n");
-
/* We need to grab some memory, and put together a FIN,
* and then put it into the queue to be sent.
*/
@@ -1491,6 +1495,8 @@ void tcp_close(struct sock *sk, long timeout)
return;
}
+ unlock_kernel();
+
/* It is questionable, what the role of this is now.
* In any event either it should be removed, or
* increment of SLT_KEEPALIVE be done, this is causing
@@ -1534,24 +1540,23 @@ void tcp_close(struct sock *sk, long timeout)
if (timeout) {
struct task_struct *tsk = current;
- struct wait_queue wait = { tsk, NULL };
+ DECLARE_WAITQUEUE(wait, current);
add_wait_queue(sk->sleep, &wait);
- release_sock(sk);
while (1) {
tsk->state = TASK_INTERRUPTIBLE;
if (!closing(sk))
break;
+ release_sock(sk);
timeout = schedule_timeout(timeout);
+ lock_sock(sk);
if (signal_pending(tsk) || !timeout)
break;
}
tsk->state = TASK_RUNNING;
remove_wait_queue(sk->sleep, &wait);
-
- lock_sock(sk);
}
/* Now that the socket is dead, if we are in the FIN_WAIT2 state
@@ -1559,23 +1564,40 @@ void tcp_close(struct sock *sk, long timeout)
*/
tcp_check_fin_timer(sk);
- release_sock(sk);
sk->dead = 1;
+
+ release_sock(sk);
+ lock_kernel();
}
/*
* Wait for an incoming connection, avoid race
- * conditions. This must be called with the socket locked.
+ * conditions. This must be called with the socket locked,
+ * and without the kernel lock held.
*/
static struct open_request * wait_for_connect(struct sock * sk,
struct open_request **pprev)
{
- struct wait_queue wait = { current, NULL };
+ DECLARE_WAITQUEUE(wait, current);
struct open_request *req;
- add_wait_queue(sk->sleep, &wait);
+ /*
+ * True wake-one mechanism for incoming connections: only
+ * one process gets woken up, not the 'whole herd'.
+ * Since we do not 'race & poll' for established sockets
+ * anymore, the common case will execute the loop only once.
+ *
+ * Subtle issue: "add_wait_queue_exclusive()" will be added
+ * after any current non-exclusive waiters, and we know that
+ * it will always _stay_ after any new non-exclusive waiters
+ * because all non-exclusive waiters are added at the
+ * beginning of the wait-queue. As such, it's ok to "drop"
+ * our exclusiveness temporarily when we get woken up without
+ * having to remove and re-insert us on the wait queue.
+ */
+ add_wait_queue_exclusive(sk->sleep, &wait);
for (;;) {
- current->state = TASK_INTERRUPTIBLE;
+ current->state = TASK_EXCLUSIVE | TASK_INTERRUPTIBLE;
release_sock(sk);
schedule();
lock_sock(sk);
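The wake-one comment above is the heart of this hunk: with add_wait_queue_exclusive() and the TASK_EXCLUSIVE state bit, only one of the sleeping accept()ers is woken per incoming connection instead of the whole herd. A minimal sketch of an exclusive waiter using the same primitives (work_available() is an illustrative predicate):

    DECLARE_WAITQUEUE(wait, current);

    add_wait_queue_exclusive(sk->sleep, &wait);     /* queued behind all non-exclusive waiters */
    for (;;) {
            /* TASK_EXCLUSIVE asks the waker to stop after waking one such sleeper. */
            current->state = TASK_EXCLUSIVE | TASK_INTERRUPTIBLE;
            if (work_available(sk))                 /* illustrative predicate */
                    break;
            schedule();
            if (signal_pending(current))
                    break;
    }
    current->state = TASK_RUNNING;
    remove_wait_queue(sk->sleep, &wait);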
@@ -1603,6 +1625,7 @@ struct sock *tcp_accept(struct sock *sk, int flags)
struct sock *newsk = NULL;
int error;
+ unlock_kernel();
lock_sock(sk);
/* We need to make sure that this socket is listening,
@@ -1633,16 +1656,17 @@ struct sock *tcp_accept(struct sock *sk, int flags)
sk->ack_backlog--;
if(sk->keepopen)
tcp_inc_slow_timer(TCP_SLT_KEEPALIVE);
-
release_sock(sk);
+ lock_kernel();
return newsk;
out:
/* sk should be in LISTEN state, thus accept can use sk->err for
- * internal purposes without stomping one anyone's feed.
+ * internal purposes without stomping on anyone's feed.
*/
sk->err = error;
release_sock(sk);
+ lock_kernel();
return newsk;
}
@@ -1765,6 +1789,8 @@ extern void __skb_cb_too_small_for_tcp(int, int);
void __init tcp_init(void)
{
struct sk_buff *skb = NULL;
+ unsigned long goal;
+ int order;
if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
@@ -1790,4 +1816,37 @@ void __init tcp_init(void)
NULL, NULL);
if(!tcp_timewait_cachep)
panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
+
+ /* Size and allocate the main established and bind bucket
+ * hash tables.
+ *
+ * The methodology is similar to that of the buffer cache.
+ */
+ goal = num_physpages >> (20 - PAGE_SHIFT);
+ for(order = 5; (1UL << order) < goal; order++)
+ ;
+ do {
+ tcp_ehash_size = (1UL << order) * PAGE_SIZE /
+ sizeof(struct sock *);
+ tcp_ehash = (struct sock **)
+ __get_free_pages(GFP_ATOMIC, order);
+ } while (tcp_ehash == NULL && --order > 4);
+
+ if (!tcp_ehash)
+ panic("Failed to allocate TCP established hash table\n");
+ memset(tcp_ehash, 0, tcp_ehash_size * sizeof(struct sock *));
+
+ do {
+ tcp_bhash_size = (1UL << order) * PAGE_SIZE /
+ sizeof(struct tcp_bind_bucket *);
+ tcp_bhash = (struct tcp_bind_bucket **)
+ __get_free_pages(GFP_ATOMIC, order);
+ } while (tcp_bhash == NULL && --order > 4);
+
+ if (!tcp_bhash)
+ panic("Failed to allocate TCP bind hash table\n");
+ memset(tcp_bhash, 0, tcp_bhash_size * sizeof(struct tcp_bind_bucket *));
+
+ printk("TCP: Hash tables configured (established %d bind %d)\n",
+ tcp_ehash_size, tcp_bhash_size);
}
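To make the sizing arithmetic concrete: goal works out to roughly one page of hash table per megabyte of physical memory, with a floor of 2^5 pages, and the allocation backs off to smaller orders when __get_free_pages() fails at the preferred size; the bind hash then reuses whatever order the established hash ended up with. The small user-space sketch below reproduces the same arithmetic under assumed values (4 KB pages, 4-byte pointers as on a 32-bit box, 128 MB of RAM); it prints goal = 128 pages, order = 7, i.e. 131072 established-hash buckets:

    #include <stdio.h>

    int main(void)
    {
            unsigned long page_size  = 4096;        /* assumed PAGE_SIZE */
            unsigned long page_shift = 12;          /* assumed PAGE_SHIFT */
            unsigned long ptr_size   = 4;           /* assumed sizeof(struct sock *) */
            unsigned long num_physpages = (128UL << 20) / page_size;   /* 128 MB machine */

            unsigned long goal = num_physpages >> (20 - page_shift);   /* pages of hash table */
            int order;

            for (order = 5; (1UL << order) < goal; order++)
                    ;

            printf("goal = %lu pages, order = %d, ehash buckets = %lu\n",
                   goal, order, (1UL << order) * page_size / ptr_size);
            return 0;
    }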