author    Ralf Baechle <ralf@linux-mips.org>    1997-04-29 21:13:14 +0000
committer    <ralf@linux-mips.org>    1997-04-29 21:13:14 +0000
commit    19c9bba94152148523ba0f7ef7cffe3d45656b11 (patch)
tree    40b1cb534496a7f1ca0f5c314a523c69f1fee464 /net/ipv4/tcp.c
parent    7206675c40394c78a90e74812bbdbf8cf3cca1be (diff)
Import of Linux/MIPS 2.1.36
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r--    net/ipv4/tcp.c    856
1 file changed, 329 insertions, 527 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ac6e2ea53..420db4777 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: @(#)tcp.c 1.0.16 05/25/93
+ * Version: $Id: tcp.c,v 1.61 1997/04/22 02:53:10 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -30,7 +30,7 @@
* socket was looked up backwards. Nobody
* tested any icmp error code obviously.
* Alan Cox : tcp_err() now handled properly. It
- * wakes people on errors. select
+ * wakes people on errors. poll
* behaves and the icmp error race
* has gone by moving it into sock.c
* Alan Cox : tcp_send_reset() fixed to work for
@@ -102,12 +102,12 @@
* Alan Cox : BSD accept semantics.
* Alan Cox : Reset on closedown bug.
* Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
- * Michael Pall : Handle select() after URG properly in
+ * Michael Pall : Handle poll() after URG properly in
* all cases.
* Michael Pall : Undo the last fix in tcp_read_urg()
* (multi URG PUSH broke rlogin).
* Michael Pall : Fix the multi URG PUSH problem in
- * tcp_readable(), select() after URG
+ * tcp_readable(), poll() after URG
* works now.
* Michael Pall : recv(...,MSG_OOB) never blocks in the
* BSD api.
@@ -128,7 +128,7 @@
* Alan Cox : Reset tracing code.
* Alan Cox : Spurious resets on shutdown.
* Alan Cox : Giant 15 minute/60 second timer error
- * Alan Cox : Small whoops in selecting before an
+ * Alan Cox : Small whoops in polling before an
* accept.
* Alan Cox : Kept the state trace facility since
* it's handy for debugging.
@@ -162,7 +162,7 @@
* generates them.
* Alan Cox : Cache last socket.
* Alan Cox : Per route irtt.
- * Matt Day : Select() match BSD precisely on error
+ * Matt Day : poll()->select() match BSD precisely on error
* Alan Cox : New buffers
* Marc Tamsky : Various sk->prot->retransmits and
* sk->retransmits misupdating fixed.
@@ -196,6 +196,10 @@
* improvement.
* Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
* Willy Konynenberg : Transparent proxying support.
+ * Keith Owens : Do proper merging with partial SKB's in
+ * tcp_do_sendmsg to avoid burstiness.
+ * Eric Schenk : Fix fast close down bug with
+ * shutdown() followed by close().
*
* To Fix:
* Fast path the code. Two things here - fix the window calculation
@@ -204,7 +208,7 @@
*
* Rewrite output state machine to use a single queue.
* Speed up input assembly algorithm.
- * RFC1323 - PAWS and window scaling.
+ * RFC1323 - PAWS and window scaling. [Required for IPv6]
* User settable/learned rtt/max window/mtu
*
* Change the fundamental structure to a single send queue maintained
@@ -419,6 +423,7 @@
#include <linux/types.h>
#include <linux/fcntl.h>
+#include <linux/poll.h>
#include <net/icmp.h>
#include <net/tcp.h>
@@ -428,7 +433,7 @@
unsigned long seq_offset;
struct tcp_mib tcp_statistics;
-
+kmem_cache_t *tcp_openreq_cachep;
/*
* Find someone to 'accept'. Must be called with
@@ -437,26 +442,16 @@ struct tcp_mib tcp_statistics;
static struct open_request *tcp_find_established(struct tcp_opt *tp)
{
- struct open_request *req;
+ struct open_request *req = tp->syn_wait_queue;
- req = tp->syn_wait_queue;
-
- if (!req)
- return NULL;
-
- do {
+ while(req) {
if (req->sk &&
(req->sk->state == TCP_ESTABLISHED ||
req->sk->state >= TCP_FIN_WAIT1))
- {
- return req;
- }
-
+ break;
req = req->dl_next;
-
- } while (req != tp->syn_wait_queue);
-
- return NULL;
+ }
+ return req;
}
/*
@@ -467,14 +462,26 @@ static struct open_request *tcp_find_established(struct tcp_opt *tp)
static void tcp_close_pending (struct sock *sk)
{
- struct sk_buff *skb;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct open_request *req = tp->syn_wait_queue;
- while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
- {
- tcp_close(skb->sk, 0);
- kfree_skb(skb, FREE_READ);
+ while(req) {
+ struct open_request *iter;
+
+ if (req->sk)
+ tcp_close(req->sk, 0);
+
+ iter = req;
+ req = req->dl_next;
+
+ (*iter->class->destructor)(iter);
+ tcp_dec_slow_timer(TCP_SLT_SYNACK);
+ sk->ack_backlog--;
+ tcp_openreq_free(iter);
}
- return;
+
+ tp->syn_wait_queue = NULL;
+ tp->syn_wait_last = &tp->syn_wait_queue;
}
/*
@@ -505,48 +512,40 @@ static int tcp_readable(struct sock *sk)
int sum;
unsigned long flags;
- if(sk && sk->debug)
- printk("tcp_readable: %p - ",sk);
+ SOCK_DEBUG(sk, "tcp_readable: %p - ",sk);
save_flags(flags);
cli();
- if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
- {
+ if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL) {
restore_flags(flags);
- if(sk && sk->debug)
- printk("empty\n");
+ SOCK_DEBUG(sk, "empty\n");
return(0);
}
counted = sk->copied_seq; /* Where we are at the moment */
amount = 0;
- /*
- * Do until a push or until we are out of data.
- */
-
- do
- {
- /* Found a hole so stops here */
+ /* Do until a push or until we are out of data. */
+ do {
+ /* Found a hole so stops here. */
if (before(counted, skb->seq))
break;
- /*
- * Length - header but start from where we are up to
- * avoid overlaps
+
+ /* Length - header but start from where we are up to
+ * avoid overlaps.
*/
- sum = skb->len - (counted - skb->seq);
+ sum = skb->len - (counted - skb->seq);
if (skb->h.th->syn)
sum++;
- if (sum > 0)
- {
- /* Add it up, move on */
+ if (sum > 0) {
+ /* Add it up, move on. */
amount += sum;
if (skb->h.th->syn)
amount--;
counted += sum;
}
- /*
- * Don't count urg data ... but do it in the right place!
+
+ /* Don't count urg data ... but do it in the right place!
* Consider: "old_data (ptr is here) URG PUSH data"
* The old code would stop at the first push because
* it counted the urg (amount==1) and then does amount--
@@ -555,111 +554,89 @@ static int tcp_readable(struct sock *sk)
* though there was normal data available. If we subtract
* the urg data right here, we even get it to work for more
* than one URG PUSH skb without normal data.
- * This means that select() finally works now with urg data
+ * This means that poll() finally works now with urg data
* in the queue. Note that rlogin was never affected
- * because it doesn't use select(); it uses two processes
+ * because it doesn't use poll(); it uses two processes
* and a blocking read(). And the queue scan in tcp_read()
* was correct. Mike <pall@rz.uni-karlsruhe.de>
*/
- /* don't count urg data */
+ /* Don't count urg data. */
if (skb->h.th->urg)
amount--;
#if 0
if (amount && skb->h.th->psh) break;
#endif
skb = skb->next;
- }
- while(skb != (struct sk_buff *)&sk->receive_queue);
+ } while(skb != (struct sk_buff *)&sk->receive_queue);
restore_flags(flags);
- if(sk->debug)
- printk("got %lu bytes.\n",amount);
+ SOCK_DEBUG(sk, "got %lu bytes.\n",amount);
return(amount);
}
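
A minimal userspace sketch (not part of this patch) of the BSD semantics the accounting above preserves: the urgent byte is excluded from the readable count and fetched separately with MSG_OOB, which, per the changelog above, never blocks:

#include <sys/socket.h>

/* Fetch the pending out-of-band byte, if any.  Under the BSD
 * semantics this never blocks: it returns 1 with the byte in *c,
 * or -1 (EINVAL) when no urgent data is pending. */
int read_oob_byte(int fd, char *c)
{
	return recv(fd, c, 1, MSG_OOB);
}
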
/*
- * LISTEN is a special case for select..
+ * LISTEN is a special case for poll..
*/
-static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
+static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
{
- if (sel_type == SEL_IN) {
- struct open_request *req;
+ struct open_request *req;
- lock_sock(sk);
- req = tcp_find_established(&sk->tp_pinfo.af_tcp);
- release_sock(sk);
- if (req)
- return 1;
- select_wait(sk->sleep,wait);
- return 0;
- }
+ lock_sock(sk);
+ req = tcp_find_established(&sk->tp_pinfo.af_tcp);
+ release_sock(sk);
+ if (req)
+ return POLLIN | POLLRDNORM;
return 0;
}
/*
* Wait for a TCP event.
*
- * Note that we don't need to lock the socket, as the upper select layers
+ * Note that we don't need to lock the socket, as the upper poll layers
* take care of normal races (between the test and the event) and we don't
* go look at any of the socket buffers directly.
*/
-int tcp_select(struct sock *sk, int sel_type, select_table *wait)
+unsigned int tcp_poll(struct socket *sock, poll_table *wait)
{
+ unsigned int mask;
+ struct sock *sk = sock->sk;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ poll_wait(sk->sleep, wait);
if (sk->state == TCP_LISTEN)
- return tcp_listen_select(sk, sel_type, wait);
-
- switch(sel_type) {
- case SEL_IN:
- if (sk->err)
- return 1;
- if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
- break;
+ return tcp_listen_poll(sk, wait);
+ mask = 0;
+ if (sk->err)
+ mask = POLLERR;
+ /* Connected? */
+ if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) {
if (sk->shutdown & RCV_SHUTDOWN)
- return 1;
-
- if (tp->rcv_nxt == sk->copied_seq)
- break;
+ mask |= POLLHUP;
- if (sk->urg_seq != sk->copied_seq ||
- tp->rcv_nxt != sk->copied_seq+1 ||
- sk->urginline || !sk->urg_data)
- return 1;
- break;
+ if ((tp->rcv_nxt != sk->copied_seq) &&
+ (sk->urg_seq != sk->copied_seq ||
+ tp->rcv_nxt != sk->copied_seq+1 ||
+ sk->urginline || !sk->urg_data))
+ mask |= POLLIN | POLLRDNORM;
- case SEL_OUT:
- if (sk->err)
- return 1;
- if (sk->shutdown & SEND_SHUTDOWN)
- return 0;
- if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
- break;
- /*
- * This is now right thanks to a small fix
- * by Matt Dillon.
+ /* FIXME: this assumes sk->mtu is correctly maintained.
+ * I see no evidence this is the case. -- erics
*/
+ if (!(sk->shutdown & SEND_SHUTDOWN) &&
+ (sock_wspace(sk) >= sk->mtu+128+sk->prot->max_header))
+ mask |= POLLOUT | POLLWRNORM;
- if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
- break;
- return 1;
-
- case SEL_EX:
if (sk->urg_data)
- return 1;
- break;
+ mask |= POLLPRI;
}
- select_wait(sk->sleep, wait);
- return 0;
+ return mask;
}
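
A minimal userspace sketch (not part of this patch; `fd` is assumed to be a connected TCP socket) of how the mask computed above surfaces through poll(2):

#include <poll.h>
#include <stdio.h>

static void wait_tcp_events(int fd)
{
	struct pollfd pfd;

	pfd.fd = fd;
	pfd.events = POLLIN | POLLOUT | POLLPRI;
	pfd.revents = 0;

	if (poll(&pfd, 1, 5000) > 0) {		/* 5 second timeout */
		if (pfd.revents & POLLIN)	/* data to read */
			printf("readable\n");
		if (pfd.revents & POLLOUT)	/* sock_wspace() large enough */
			printf("writable\n");
		if (pfd.revents & POLLPRI)	/* sk->urg_data set */
			printf("urgent data pending\n");
		if (pfd.revents & (POLLERR | POLLHUP))
			printf("error or peer shutdown\n");
	}
}
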
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
- switch(cmd)
- {
-
+ switch(cmd) {
case TIOCINQ:
#ifdef FIXME /* FIXME: */
case FIONREAD:
@@ -690,28 +667,39 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
}
default:
return(-EINVAL);
- }
+ };
}
/*
* This routine builds a generic TCP header.
+ * It also builds in the RFC1323 Timestamp.
+ * It can't (unfortunately) do SACK as well.
*/
-extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
+extern __inline void tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
{
- struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
th->seq = htonl(sk->write_seq);
-
th->psh =(push == 0) ? 1 : 0;
-
- sk->bytes_rcv = 0;
- sk->ack_timed = 0;
th->ack_seq = htonl(tp->rcv_nxt);
th->window = htons(tcp_select_window(sk));
- return(sizeof(*th));
+ /* FIXME: could use the inline found in tcp_output.c as well.
+ * Probably that means we should move these up to an include file. --erics
+ */
+ if (tp->tstamp_ok) {
+ __u32 *ptr = (__u32 *)(th+1);
+ *ptr++ = ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+ | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
+ /* FIXME: Not sure it's worth setting these here already, but I'm
+ * also not sure we replace them on all paths later. --erics
+ */
+ *ptr++ = jiffies;
+ *ptr++ = tp->ts_recent;
+ }
}
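
For reference, the timestamp branch above appends the standard 12-byte RFC 1323 option: two NOPs (kind 1), then kind 8 with length 10, then TSval and TSecr. A standalone sketch of that layout with explicit byte order (illustration only; the constants are the RFC's, not new kernel API):

#include <stdint.h>
#include <arpa/inet.h>

/* Append NOP, NOP, TIMESTAMP(8), len 10, then the two timestamps. */
void build_timestamp_option(uint32_t *ptr, uint32_t tsval, uint32_t tsecr)
{
	*ptr++ = htonl((1 << 24) | (1 << 16) | (8 << 8) | 10);
	*ptr++ = htonl(tsval);	/* our clock; jiffies in the code above */
	*ptr++ = htonl(tsecr);	/* ts_recent: last timestamp from the peer */
}
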
/*
@@ -722,16 +710,14 @@ static void wait_for_tcp_connect(struct sock * sk)
release_sock(sk);
cli();
if (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT && sk->err == 0)
- {
interruptible_sleep_on(sk->sleep);
- }
sti();
lock_sock(sk);
}
static inline int tcp_memory_free(struct sock *sk)
{
- return sk->wmem_alloc < sk->sndbuf;
+ return atomic_read(&sk->wmem_alloc) < sk->sndbuf;
}
/*
@@ -770,22 +756,15 @@ static int tcp_append_tail(struct sock *sk, struct sk_buff *skb, u8 *from,
int fault;
int copy;
- /*
- * Add more stuff to the end
- * of the skb
- */
-
+ /* Add more stuff to the end of the skb. */
copy = min(sk->mss - tcp_size, skb_tailroom(skb));
copy = min(copy, seglen);
-
+
tcp_size += copy;
-
+
fault = copy_from_user(skb->tail, from, copy);
-
if (fault)
- {
return -1;
- }
skb_put(skb, copy);
skb->csum = csum_partial(skb->tail - tcp_size, tcp_size, 0);
@@ -801,32 +780,24 @@ static int tcp_append_tail(struct sock *sk, struct sk_buff *skb, u8 *from,
* and starts the transmit system.
*/
-int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov,
- int len, int nonblock, int flags)
+int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags)
{
+ int err = 0;
int copied = 0;
struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
- /*
- * Wait for a connection to finish.
- */
- while (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
- {
-
- if (copied)
- return copied;
-
- if (sk->err)
+ /* Wait for a connection to finish. */
+ while (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) {
+ if (sk->err)
return sock_error(sk);
-
- if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
- {
+
+ if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) {
if (sk->keepopen)
send_sig(SIGPIPE, current, 0);
return -EPIPE;
}
- if (nonblock)
+ if (flags&MSG_DONTWAIT)
return -EAGAIN;
if (current->signal & ~current->blocked)
@@ -834,163 +805,126 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov,
wait_for_tcp_connect(sk);
}
-
-
- /*
- * Ok commence sending
- */
-
- while(--iovlen >= 0)
- {
+
+ /* Ok commence sending. */
+ while(--iovlen >= 0) {
int seglen=iov->iov_len;
unsigned char * from=iov->iov_base;
- u32 actual_win;
iov++;
- while(seglen > 0)
- {
+ while(seglen > 0) {
+ unsigned int actual_win;
int copy;
int tmp;
struct sk_buff *skb;
- /*
- * Stop on errors
- */
- if (sk->err)
- {
+ if (err)
+ return (err);
+
+ /* Stop on errors. */
+ if (sk->err) {
if (copied)
return copied;
return sock_error(sk);
}
- /*
- * Make sure that we are established.
- */
- if (sk->shutdown & SEND_SHUTDOWN)
- {
+ /* Make sure that we are established. */
+ if (sk->shutdown & SEND_SHUTDOWN) {
if (copied)
return copied;
send_sig(SIGPIPE,current,0);
return -EPIPE;
}
- /*
- *Now we need to check if we have a half built packet.
- */
+ /* Now we need to check if we have a half built packet. */
- /* if we have queued packets */
- if (tp->send_head && !(flags & MSG_OOB) )
- {
+ /* If we have queued packets.. */
+ if (tp->send_head && !(flags & MSG_OOB)) {
int tcp_size;
/* Tail */
-
+
skb = sk->write_queue.prev;
- tcp_size = skb->tail -
- (unsigned char *)(skb->h.th + 1);
-
- /*
- * This window_seq test is somewhat dangerous
+ tcp_size = skb->tail -
+ ((unsigned char *)(skb->h.th) + tp->tcp_header_len);
+
+ /* printk("extending buffer\n"); */
+ /* This window_seq test is somewhat dangerous
* If the remote does SWS avoidance we should
- * queue the best we can
- * if not we should in fact send multiple
- * packets...
+ * queue the best we can if not we should in
+ * fact send multiple packets...
* a method for detecting this would be most
* welcome
*/
-
if (skb->end > skb->tail &&
sk->mss - tcp_size > 0 &&
- skb->end_seq < tp->snd_una + tp->snd_wnd)
- {
+ tp->snd_nxt < skb->end_seq) {
int tcopy;
-
+
tcopy = tcp_append_tail(sk, skb, from,
tcp_size,
seglen);
if (tcopy == -1)
- {
return -EFAULT;
- }
-
+
from += tcopy;
copied += tcopy;
- len -= tcopy;
seglen -= tcopy;
-
- /*
- * FIXME: if we're nagling we
+
+ /* FIXME: if we're nagling we
* should send here.
*/
continue;
}
}
-
- /*
- * We also need to worry about the window.
- * If window < 1/2 the maximum window we've seen from this
- * host, don't use it. This is sender side
- * silly window prevention, as specified in RFC1122.
- * (Note that this is different than earlier versions of
- * SWS prevention, e.g. RFC813.). What we actually do is
- * use the whole MSS. Since the results in the right
- * edge of the packet being outside the window, it will
- * be queued for later rather than sent.
- */
-
+ /* We also need to worry about the window.
+ * If window < 1/2 the maximum window we've seen from this
+ * host, don't use it. This is sender side
+ * silly window prevention, as specified in RFC1122.
+ * (Note that this is different than earlier versions of
+ * SWS prevention, e.g. RFC813.). What we actually do is
+ * use the whole MSS. Since this results in the right
+ * edge of the packet being outside the window, it will
+ * be queued for later rather than sent.
+ */
copy = min(seglen, sk->mss);
-
actual_win = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
- if (copy > actual_win &&
- (((long) actual_win) >= (sk->max_window >> 1)))
- {
+ if (copy > actual_win &&
+ (((int) actual_win) >= (tp->max_window >> 1)) &&
+ actual_win)
copy = actual_win;
- }
- if (copy <= 0)
- {
+ if (copy <= 0) {
printk(KERN_DEBUG "sendmsg: copy < 0\n");
return -EIO;
}
- /*
- * If sk->packets_out > 0 segment will be nagled
- * else we kick it right away
+ /* If tp->packets_out > 0 segment will be nagled
+ * else we kick it right away.
*/
-
tmp = MAX_HEADER + sk->prot->max_header +
sizeof(struct sk_buff) + 15;
- if (copy < min(sk->mss, sk->max_window >> 1) &&
- !(flags & MSG_OOB) && sk->packets_out)
- {
- tmp += min(sk->mss, sk->max_window);
- }
+ if (copy < min(sk->mss, tp->max_window >> 1) &&
+ !(flags & MSG_OOB) && tp->packets_out)
+ tmp += min(sk->mss, tp->max_window);
else
- {
tmp += copy;
- }
skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL);
-
- /*
- * If we didn't get any memory, we need to sleep.
- */
-
- if (skb == NULL)
- {
+
+ /* If we didn't get any memory, we need to sleep. */
+ if (skb == NULL) {
sk->socket->flags |= SO_NOSPACE;
- if (nonblock)
- {
+ if (flags&MSG_DONTWAIT) {
if (copied)
return copied;
return -EAGAIN;
}
- if (current->signal & ~current->blocked)
- {
+ if (current->signal & ~current->blocked) {
if (copied)
return copied;
return -ERESTARTSYS;
@@ -1000,54 +934,37 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov,
continue;
}
- skb->sk = sk;
- skb->free = 0;
- skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
-
- /*
- * FIXME: we need to optimize this.
+ /* FIXME: we need to optimize this.
* Perhaps some hints here would be good.
*/
-
tmp = tp->af_specific->build_net_header(sk, skb);
-
- if (tmp < 0)
- {
- sock_wfree(sk, skb);
+ if (tmp < 0) {
+ kfree_skb(skb, FREE_WRITE);
if (copied)
return(copied);
return(tmp);
}
- skb->h.th =(struct tcphdr *)
- skb_put(skb,sizeof(struct tcphdr));
+ skb->h.th =(struct tcphdr *)
+ skb_put(skb,tp->tcp_header_len);
seglen -= copy;
- tmp = tcp_build_header(skb->h.th, sk, seglen || iovlen);
-
- if (tmp < 0)
- {
- sock_wfree(sk, skb);
- if (copied)
- return(copied);
- return(tmp);
- }
-
- if (flags & MSG_OOB)
- {
+ tcp_build_header(skb->h.th, sk, seglen || iovlen);
+ /* FIXME: still need to think about SACK options here. */
+
+ if (flags & MSG_OOB) {
skb->h.th->urg = 1;
skb->h.th->urg_ptr = ntohs(copy);
}
- skb->csum = csum_partial_copy_fromuser(from,
- skb_put(skb, copy), copy, 0);
-
+ skb->csum = csum_partial_copy_from_user(from,
+ skb_put(skb, copy), copy, 0, &err);
+
from += copy;
copied += copy;
- len -= copy;
- skb->free = 0;
+
sk->write_seq += copy;
-
+
tcp_send_skb(sk, skb);
release_sock(sk);
@@ -1057,12 +974,12 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov,
sk->err = 0;
+ if (err)
+ return (err);
+
return copied;
}
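
With this change the old `nonblock` argument is gone and the per-call MSG_DONTWAIT flag selects non-blocking behaviour, returning -EAGAIN when the connect or a buffer allocation would sleep. A userspace sketch (hypothetical helper, not from the patch):

#include <errno.h>
#include <sys/types.h>
#include <sys/socket.h>

/* Try to queue data without sleeping; 0 means "no buffer space now". */
ssize_t send_nowait(int fd, const void *buf, size_t len)
{
	ssize_t n = send(fd, buf, len, MSG_DONTWAIT);

	if (n < 0 && errno == EAGAIN)
		return 0;
	return n;
}
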
-
-
-
/*
* Send an ack if one is backlogged at this point. Ought to merge
* this with tcp_send_ack().
@@ -1071,18 +988,15 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov,
void tcp_read_wakeup(struct sock *sk)
{
- /*
- * If we're closed, don't send an ack, or we'll get a RST
+ /* If we're closed, don't send an ack, or we'll get a RST
* from the closed destination.
*/
-
if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
return;
tcp_send_ack(sk);
}
-
/*
* Handle reading urgent data. BSD has very simple semantics for
* this, no blocking and very strange errors 8)
@@ -1095,33 +1009,28 @@ static int tcp_recv_urg(struct sock * sk, int nonblock,
int err=0;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- /*
- * No URG data to read
- */
+ /* No URG data to read. */
if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
return -EINVAL; /* Yes this is right ! */
if (sk->err)
return sock_error(sk);
- if (sk->state == TCP_CLOSE || sk->done)
- {
- if (!sk->done)
- {
+ if (sk->state == TCP_CLOSE || sk->done) {
+ if (!sk->done) {
sk->done = 1;
return 0;
}
return -ENOTCONN;
}
- if (sk->shutdown & RCV_SHUTDOWN)
- {
+ if (sk->shutdown & RCV_SHUTDOWN) {
sk->done = 1;
return 0;
}
+
lock_sock(sk);
- if (sk->urg_data & URG_VALID)
- {
+ if (sk->urg_data & URG_VALID) {
char c = sk->urg_data;
if (!(flags & MSG_PEEK))
sk->urg_data = URG_READ;
@@ -1132,23 +1041,20 @@ static int tcp_recv_urg(struct sock * sk, int nonblock,
msg->msg_flags|=MSG_TRUNC;
if(msg->msg_name)
- {
tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
msg->msg_name);
- }
+
if(addr_len)
- *addr_len= tp->af_specific->sockaddr_len;
- /*
- * Read urgent data
- */
+ *addr_len = tp->af_specific->sockaddr_len;
+
+ /* Read urgent data. */
msg->msg_flags|=MSG_OOB;
release_sock(sk);
return err ? -EFAULT : 1;
}
release_sock(sk);
- /*
- * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
+ /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
* the available implementations agree in this case:
* this call should never block, independent of the
* blocking state of the socket.
@@ -1165,9 +1071,8 @@ static int tcp_recv_urg(struct sock * sk, int nonblock,
static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
{
- sk->ack_backlog++;
+ sk->tp_pinfo.af_tcp.delayed_acks++;
- skb->sk = sk;
__skb_unlink(skb, &sk->receive_queue);
kfree_skb(skb, FREE_READ);
}
@@ -1176,36 +1081,31 @@ static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
static void cleanup_rbuf(struct sock *sk)
{
struct sk_buff *skb;
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- /*
- * NOTE! The socket must be locked, so that we don't get
+ /* NOTE! The socket must be locked, so that we don't get
* a messed-up receive queue.
*/
-
while ((skb=skb_peek(&sk->receive_queue)) != NULL) {
- if (!skb->used || skb->users)
+ if (!skb->used || atomic_read(&skb->users)>1)
break;
tcp_eat_skb(sk, skb);
}
-
- if(sk->debug)
- printk("sk->rspace = %lu\n", sock_rspace(sk));
-
- /*
- * We send a ACK if the sender is blocked
- * else let tcp_data deal with the acking policy.
- */
- if (sock_rspace(sk) > tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup) &&
- (tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup) < sk->mss))
- {
- /* Send an ack right now. */
- sk->delayed_acks++;
- tcp_read_wakeup(sk);
- }
-
-}
+ SOCK_DEBUG(sk, "sk->rspace = %lu\n", sock_rspace(sk));
+
+ /* We send an ACK if the sender is blocked
+ * else let tcp_data deal with the acking policy.
+ */
+ if (sk->tp_pinfo.af_tcp.delayed_acks) {
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ __u32 rcv_wnd;
+
+ rcv_wnd = tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup);
+
+ if ((rcv_wnd < sk->mss) && (sock_rspace(sk) > rcv_wnd))
+ tcp_read_wakeup(sk);
+ }
+}
/*
@@ -1227,47 +1127,34 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
if (sk->state == TCP_LISTEN)
return -ENOTCONN;
- /*
- * Urgent data needs to be handled specially.
- */
-
+ /* Urgent data needs to be handled specially. */
if (flags & MSG_OOB)
return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);
- /*
- * Copying sequence to update. This is volatile to handle
+ /* Copying sequence to update. This is volatile to handle
* the multi-reader case neatly (memcpy_to/fromfs might be
* inline and thus not flush cached variables otherwise).
*/
-
peek_seq = sk->copied_seq;
seq = &sk->copied_seq;
if (flags & MSG_PEEK)
seq = &peek_seq;
- /*
- * Handle the POSIX bogosity MSG_WAITALL
- */
-
+ /* Handle the POSIX bogosity MSG_WAITALL. */
if (flags & MSG_WAITALL)
target=len;
add_wait_queue(sk->sleep, &wait);
lock_sock(sk);
- while (len > 0)
- {
+ while (len > 0) {
struct sk_buff * skb;
u32 offset;
- /*
- * Are we at urgent data? Stop if we have read anything.
- */
-
+ /* Are we at urgent data? Stop if we have read anything. */
if (copied && sk->urg_data && sk->urg_seq == *seq)
break;
- /*
- * We need to check signals first, to get correct SIGURG
+ /* We need to check signals first, to get correct SIGURG
* handling. FIXME: Need to check this doesn't impact 1003.1g
* and move it down to the bottom of the loop
*/
@@ -1275,26 +1162,24 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
if (copied)
break;
copied = -ERESTARTSYS;
+ if (nonblock)
+ copied = -EAGAIN;
break;
}
- /*
- * Next get a buffer.
- */
-
+ /* Next get a buffer. */
current->state = TASK_INTERRUPTIBLE;
skb = skb_peek(&sk->receive_queue);
- do
- {
+ do {
if (!skb)
break;
- /*
- * now that we have two receive queues this
- * shouldn't happen
+
+ /* Now that we have two receive queues this
+ * shouldn't happen.
*/
if (before(*seq, skb->seq)) {
- printk("recvmsg bug: copied %X seq %X\n",
+ printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
*seq, skb->seq);
break;
}
@@ -1308,22 +1193,18 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
if (!(flags & MSG_PEEK))
skb->used = 1;
skb = skb->next;
- }
- while (skb != (struct sk_buff *)&sk->receive_queue);
+ } while (skb != (struct sk_buff *)&sk->receive_queue);
if (copied >= target)
break;
- if (sk->err && !(flags&MSG_PEEK))
- {
+ if (sk->err && !(flags&MSG_PEEK)) {
copied = sock_error(sk);
break;
}
- if (sk->state == TCP_CLOSE)
- {
- if (!sk->done)
- {
+ if (sk->state == TCP_CLOSE) {
+ if (!sk->done) {
sk->done = 1;
break;
}
@@ -1331,14 +1212,12 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
break;
}
- if (sk->shutdown & RCV_SHUTDOWN)
- {
+ if (sk->shutdown & RCV_SHUTDOWN) {
sk->done = 1;
break;
}
- if (nonblock)
- {
+ if (nonblock) {
copied = -EAGAIN;
break;
}
@@ -1352,97 +1231,75 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
continue;
found_ok_skb:
- /*
- * Lock the buffer. We can be fairly relaxed as
+ /* Lock the buffer. We can be fairly relaxed as
* an interrupt will never steal a buffer we are
* using unless I've missed something serious in
* tcp_data.
*/
+ atomic_inc(&skb->users);
- skb->users++;
-
- /*
- * Ok so how much can we use ?
- */
-
+ /* Ok so how much can we use? */
used = skb->len - offset;
if (len < used)
used = len;
- /*
- * Do we have urgent data here?
- */
- if (sk->urg_data)
- {
+ /* Do we have urgent data here? */
+ if (sk->urg_data) {
u32 urg_offset = sk->urg_seq - *seq;
- if (urg_offset < used)
- {
- if (!urg_offset)
- {
- if (!sk->urginline)
- {
+ if (urg_offset < used) {
+ if (!urg_offset) {
+ if (!sk->urginline) {
++*seq;
offset++;
used--;
}
- }
- else
+ } else
used = urg_offset;
}
}
- /*
- * Copy it - We _MUST_ update *seq first so that we
+ /* Copy it - We _MUST_ update *seq first so that we
* don't ever double read when we have dual readers
*/
-
*seq += used;
- /*
- * This memcpy_toiovec can sleep. If it sleeps and we
+ /* This memcpy_toiovec can sleep. If it sleeps and we
* do a second read it relies on the skb->users to avoid
* a crash when cleanup_rbuf() gets called.
*/
-
err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used);
- if (err)
- {
- /*
- * exception. bailout!
- */
+ if (err) {
+ /* Exception. Bailout! */
*seq -= err;
- skb->users--;
- return -EFAULT;
+ atomic_dec(&skb->users);
+ copied = -EFAULT;
+ break;
}
copied += used;
len -= used;
- /*
- * We now will not sleep again until we are finished
+ /* We now will not sleep again until we are finished
* with skb. Sorry if you are doing the SMP port
* but you'll just have to fix it neatly ;)
*/
-
- skb->users--;
+ atomic_dec(&skb->users);
if (after(sk->copied_seq,sk->urg_seq))
sk->urg_data = 0;
if (used + offset < skb->len)
continue;
- /*
- * Process the FIN. We may also need to handle PSH
- * here and make it break out of MSG_WAITALL
+ /* Process the FIN. We may also need to handle PSH
+ * here and make it break out of MSG_WAITALL.
*/
-
if (skb->h.th->fin)
goto found_fin_ok;
if (flags & MSG_PEEK)
continue;
skb->used = 1;
- if (!skb->users)
+ if (atomic_read(&skb->users) == 1)
tcp_eat_skb(sk, skb);
continue;
@@ -1451,35 +1308,28 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
if (flags & MSG_PEEK)
break;
- /*
- * All is done
- */
-
+ /* All is done. */
skb->used = 1;
sk->shutdown |= RCV_SHUTDOWN;
break;
-
}
if(copied > 0 && msg->msg_name)
- {
tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
msg->msg_name);
- }
+
if(addr_len)
- *addr_len= tp->af_specific->sockaddr_len;
+ *addr_len = tp->af_specific->sockaddr_len;
remove_wait_queue(sk->sleep, &wait);
current->state = TASK_RUNNING;
- /* Clean up data we have read: This will do ACK frames */
+ /* Clean up data we have read: This will do ACK frames. */
cleanup_rbuf(sk);
release_sock(sk);
return copied;
}
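
The `target = len` assignment above is all MSG_WAITALL amounts to: the receive loop keeps sleeping until the full request is satisfied. Equivalent userspace behaviour without the flag (sketch, hypothetical helper):

#include <sys/types.h>
#include <sys/socket.h>

/* Read exactly len bytes, as recv(fd, buf, len, MSG_WAITALL) would. */
ssize_t recv_exact(int fd, char *buf, size_t len)
{
	size_t done = 0;

	while (done < len) {
		ssize_t n = recv(fd, buf + done, len - done, 0);

		if (n <= 0)	/* error, or EOF from a FIN */
			return done ? (ssize_t) done : n;
		done += n;
	}
	return done;
}
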
-
-
/*
* State processing on a close. This implements the state shift for
* sending our FIN frame. Note that we only send a FIN for some
@@ -1491,8 +1341,7 @@ static int tcp_close_state(struct sock *sk, int dead)
{
int ns=TCP_CLOSE;
int send_fin=0;
- switch(sk->state)
- {
+ switch(sk->state) {
case TCP_SYN_SENT: /* No SYN back, no FIN needed */
break;
case TCP_SYN_RECV:
@@ -1508,16 +1357,16 @@ static int tcp_close_state(struct sock *sk, int dead)
case TCP_CLOSE:
case TCP_LISTEN:
break;
+ case TCP_LAST_ACK: /* Could have shutdown() then close() */
case TCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and
wait only for the ACK */
ns=TCP_LAST_ACK;
send_fin=1;
- }
+ };
tcp_set_state(sk,ns);
- /*
- * This is a (useful) BSD violating of the RFC. There is a
+ /* This is a (useful) BSD violation of the RFC. There is a
* problem with TCP as specified in that the other end could
* keep a socket open forever with no application left this end.
* We use a 3 minute timeout (about the same as BSD) then kill
@@ -1525,8 +1374,7 @@ static int tcp_close_state(struct sock *sk, int dead)
* that we won't make the old 4*rto = almost no time - whoops
* reset mistake.
*/
- if(dead && ns==TCP_FIN_WAIT2)
- {
+ if(dead && ns==TCP_FIN_WAIT2) {
int timer_active=del_timer(&sk->timer);
if(timer_active)
add_timer(&sk->timer);
@@ -1544,50 +1392,29 @@ static int tcp_close_state(struct sock *sk, int dead)
void tcp_shutdown(struct sock *sk, int how)
{
- /*
- * We need to grab some memory, and put together a FIN,
+ /* We need to grab some memory, and put together a FIN,
* and then put it into the queue to be sent.
* Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
*/
-
if (!(how & SEND_SHUTDOWN))
return;
- /*
- * If we've already sent a FIN, or it's a closed state
- */
-
- if (sk->state == TCP_FIN_WAIT1 ||
- sk->state == TCP_FIN_WAIT2 ||
- sk->state == TCP_CLOSING ||
- sk->state == TCP_LAST_ACK ||
- sk->state == TCP_TIME_WAIT ||
- sk->state == TCP_CLOSE ||
- sk->state == TCP_LISTEN
- )
- {
- return;
- }
- lock_sock(sk);
-
- /*
- * flag that the sender has shutdown
- */
+ /* If we've already sent a FIN, or it's a closed state, skip this. */
+ if (sk->state == TCP_ESTABLISHED ||
+ sk->state == TCP_SYN_SENT ||
+ sk->state == TCP_SYN_RECV ||
+ sk->state == TCP_CLOSE_WAIT) {
+ lock_sock(sk);
- sk->shutdown |= SEND_SHUTDOWN;
+ /* Flag that the sender has shutdown. */
+ sk->shutdown |= SEND_SHUTDOWN;
- /*
- * Clear out any half completed packets.
- */
+ /* Clear out any half completed packets. FIN if needed. */
+ if (tcp_close_state(sk,0))
+ tcp_send_fin(sk);
- /*
- * FIN if needed
- */
-
- if (tcp_close_state(sk,0))
- tcp_send_fin(sk);
-
- release_sock(sk);
+ release_sock(sk);
+ }
}
@@ -1602,7 +1429,7 @@ static inline int closing(struct sock * sk)
case TCP_CLOSING:
case TCP_LAST_ACK:
return 1;
- }
+ };
return 0;
}
@@ -1611,21 +1438,17 @@ void tcp_close(struct sock *sk, unsigned long timeout)
{
struct sk_buff *skb;
- /*
- * We need to grab some memory, and put together a FIN,
+ /* We need to grab some memory, and put together a FIN,
* and then put it into the queue to be sent.
*/
-
lock_sock(sk);
-
- tcp_cache_zap();
- if(sk->state == TCP_LISTEN)
- {
- /* Special case */
+ if(sk->state == TCP_LISTEN) {
+ /* Special case. */
tcp_set_state(sk, TCP_CLOSE);
tcp_close_pending(sk);
release_sock(sk);
sk->dead = 1;
+ sk->prot->unhash(sk);
return;
}
@@ -1635,54 +1458,37 @@ void tcp_close(struct sock *sk, unsigned long timeout)
if (!sk->dead)
sk->state_change(sk);
- /*
- * We need to flush the recv. buffs. We do this only on the
+ /* We need to flush the recv. buffs. We do this only on the
* descriptor close, not protocol-sourced closes, because the
* reader process may not have drained the data yet!
*/
-
while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
kfree_skb(skb, FREE_READ);
-
- /*
- * Timeout is not the same thing - however the code likes
- * to send both the same way (sigh).
+ /* Timeout is not the same thing - however the code likes
+ * to send both the same way (sigh).
*/
-
if (tcp_close_state(sk,1)==1)
- {
tcp_send_fin(sk);
- }
if (timeout) {
cli();
release_sock(sk);
current->timeout = timeout;
- while(closing(sk) && current->timeout)
- {
+ while(closing(sk) && current->timeout) {
interruptible_sleep_on(sk->sleep);
if (current->signal & ~current->blocked)
- {
break;
- }
}
current->timeout=0;
lock_sock(sk);
sti();
}
- /*
- * This will destroy it. The timers will take care of actually
- * free'ing up the memory.
- */
- tcp_cache_zap(); /* Kill the cache again. */
-
/* Now that the socket is dead, if we are in the FIN_WAIT2 state
* we may need to set up a timer.
*/
- if (sk->state==TCP_FIN_WAIT2)
- {
+ if (sk->state==TCP_FIN_WAIT2) {
int timer_active=del_timer(&sk->timer);
if(timer_active)
add_timer(&sk->timer);
@@ -1690,10 +1496,12 @@ void tcp_close(struct sock *sk, unsigned long timeout)
tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
}
- release_sock(sk);
sk->dead = 1;
-}
+ release_sock(sk);
+ if(sk->state == TCP_CLOSE)
+ sk->prot->unhash(sk);
+}
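
The `timeout` argument above is non-zero on a lingering close; assuming the usual inet release path, it comes from SO_LINGER, so a close like the following sketch is what exercises the closing() wait loop:

#include <sys/socket.h>
#include <unistd.h>

void close_lingering(int fd, int seconds)
{
	struct linger l;

	l.l_onoff = 1;		/* wait for the FIN handshake ... */
	l.l_linger = seconds;	/* ... up to this many seconds */
	setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
	close(fd);		/* may now block while closing() is true */
}
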
/*
* Wait for an incoming connection, avoid race
@@ -1720,7 +1528,6 @@ static struct open_request * wait_for_connect(struct sock * sk)
return req;
}
-
/*
* This will accept the next outstanding connection.
*
@@ -1734,11 +1541,9 @@ struct sock *tcp_accept(struct sock *sk, int flags)
struct sock *newsk = NULL;
int error;
- /*
- * We need to make sure that this socket is listening,
- * and that it has something pending.
- */
-
+ /* We need to make sure that this socket is listening,
+ * and that it has something pending.
+ */
error = EINVAL;
if (sk->state != TCP_LISTEN)
goto no_listen;
@@ -1750,7 +1555,7 @@ struct sock *tcp_accept(struct sock *sk, int flags)
got_new_connect:
tcp_synq_unlink(tp, req);
newsk = req->sk;
- kfree(req);
+ tcp_openreq_free(req);
sk->ack_backlog--;
error = 0;
out:
@@ -1770,7 +1575,6 @@ no_listen:
goto out;
}
-
/*
* Socket option code for TCP.
*/
@@ -1782,22 +1586,18 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
int val;
if (level != SOL_TCP)
- {
return tp->af_specific->setsockopt(sk, level, optname,
optval, optlen);
- }
-
- if (optval == NULL)
- return(-EINVAL);
+
+ if(optlen<sizeof(int))
+ return -EINVAL;
if (get_user(val, (int *)optval))
return -EFAULT;
- switch(optname)
- {
+ switch(optname) {
case TCP_MAXSEG:
-/*
- * values greater than interface MTU won't take effect. however at
+/* values greater than interface MTU won't take effect. however at
* the point when this call is done we typically don't yet know
* which interface is going to be used
*/
@@ -1810,23 +1610,26 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
return 0;
default:
return(-ENOPROTOOPT);
- }
+ };
}
int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
int *optlen)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- int val,err;
+ int val;
+ int len;
if(level != SOL_TCP)
- {
return tp->af_specific->getsockopt(sk, level, optname,
optval, optlen);
- }
+
+ if(get_user(len,optlen))
+ return -EFAULT;
+
+ len = min(len,sizeof(int));
- switch(optname)
- {
+ switch(optname) {
case TCP_MAXSEG:
val=sk->user_mss;
break;
@@ -1835,30 +1638,29 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
break;
default:
return(-ENOPROTOOPT);
- }
-
- err = put_user(sizeof(int),(int *) optlen);
- if (!err)
- err = put_user(val,(int *)optval);
+ };
- return err;
+ if(put_user(len, optlen))
+ return -EFAULT;
+ if(copy_to_user(optval, &val,len))
+ return -EFAULT;
+ return 0;
}
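
The rewritten getsockopt now clamps the copy to min(len, sizeof(int)) instead of forcing the caller's length. A userspace sketch querying TCP_MAXSEG, which reads back sk->user_mss above:

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

void show_user_mss(int fd)
{
	int mss = 0;
	socklen_t len = sizeof(mss);

	if (getsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, &len) == 0)
		printf("TCP_MAXSEG = %d\n", mss);
}
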
void tcp_set_keepalive(struct sock *sk, int val)
{
if (!sk->keepopen && val)
- {
tcp_inc_slow_timer(TCP_SLT_KEEPALIVE);
- }
else if (sk->keepopen && !val)
- {
tcp_dec_slow_timer(TCP_SLT_KEEPALIVE);
- }
}
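
tcp_set_keepalive() is reached from sock_setsockopt() when SO_KEEPALIVE toggles (an assumption about the call path, consistent with the slow-timer bookkeeping above). From userspace:

#include <sys/socket.h>

void enable_keepalive(int fd)
{
	int on = 1;

	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
}
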
-/*
- * Local variables:
- * compile-command: "gcc -D__KERNEL__ -I/usr/src/linux/include -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer -fno-strength-reduce -pipe -m486 -DCPU=486 -c -o tcp.o tcp.c"
- * c-file-style: "Linux"
- * End:
- */
+void tcp_init(void)
+{
+ tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
+ sizeof(struct open_request),
+ sizeof(long)*8, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ if(!tcp_openreq_cachep)
+ panic("tcp_init: Cannot alloc open_request cache.");
+}
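
Presumably (not shown in this diff) the tcp_openreq_free() calls earlier pair with thin wrappers over this new slab cache, along these lines:

/* Hypothetical wrappers over tcp_openreq_cachep; the names match the
 * calls seen earlier in this diff, the bodies are an assumption. */
static inline struct open_request *tcp_openreq_alloc(void)
{
	return (struct open_request *) kmem_cache_alloc(tcp_openreq_cachep,
							SLAB_ATOMIC);
}

static inline void tcp_openreq_free(struct open_request *req)
{
	kmem_cache_free(tcp_openreq_cachep, req);
}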