Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r-- | net/ipv4/tcp.c | 856
1 file changed, 329 insertions(+), 527 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ac6e2ea53..420db4777 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -5,7 +5,7 @@
  *
  *		Implementation of the Transmission Control Protocol(TCP).
  *
- * Version:	@(#)tcp.c	1.0.16	05/25/93
+ * Version:	$Id: tcp.c,v 1.61 1997/04/22 02:53:10 davem Exp $
  *
  * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -30,7 +30,7 @@
  *					socket was looked up backwards. Nobody
  *					tested any icmp error code obviously.
  *		Alan Cox	:	tcp_err() now handled properly. It
- *					wakes people on errors. select
+ *					wakes people on errors. poll
  *					behaves and the icmp error race
  *					has gone by moving it into sock.c
  *		Alan Cox	:	tcp_send_reset() fixed to work for
@@ -102,12 +102,12 @@
  *		Alan Cox	:	BSD accept semantics.
  *		Alan Cox	:	Reset on closedown bug.
  *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
- *	Michael Pall		:	Handle select() after URG properly in
+ *	Michael Pall		:	Handle poll() after URG properly in
  *					all cases.
  *	Michael Pall		:	Undo the last fix in tcp_read_urg()
  *					(multi URG PUSH broke rlogin).
  *	Michael Pall		:	Fix the multi URG PUSH problem in
- *					tcp_readable(), select() after URG
+ *					tcp_readable(), poll() after URG
  *					works now.
  *	Michael Pall		:	recv(...,MSG_OOB) never blocks in the
  *					BSD api.
@@ -128,7 +128,7 @@
  *		Alan Cox	:	Reset tracing code.
  *		Alan Cox	:	Spurious resets on shutdown.
  *		Alan Cox	:	Giant 15 minute/60 second timer error
- *		Alan Cox	:	Small whoops in selecting before an
+ *		Alan Cox	:	Small whoops in polling before an
  *					accept.
  *		Alan Cox	:	Kept the state trace facility since
  *					it's handy for debugging.
@@ -162,7 +162,7 @@
  *					generates them.
  *		Alan Cox	:	Cache last socket.
  *		Alan Cox	:	Per route irtt.
- *		Matt Day	:	Select() match BSD precisely on error
+ *		Matt Day	:	poll()->select() match BSD precisely on error
  *		Alan Cox	:	New buffers
  *		Marc Tamsky	:	Various sk->prot->retransmits and
  *					sk->retransmits misupdating fixed.
@@ -196,6 +196,10 @@
  *					improvement.
  *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
  *	Willy Konynenberg	:	Transparent proxying support.
+ *	Keith Owens		:	Do proper meging with partial SKB's in
+ *					tcp_do_sendmsg to avoid burstiness.
+ *	Eric Schenk		:	Fix fast close down bug with
+ *					shutdown() followed by close().
  *
  * To Fix:
  *		Fast path the code. Two things here - fix the window calculation
@@ -204,7 +208,7 @@
  *
  *		Rewrite output state machine to use a single queue.
  *		Speed up input assembly algorithm.
- *		RFC1323 - PAWS and window scaling.
+ *		RFC1323 - PAWS and window scaling. [Required for IPv6]
  *		User settable/learned rtt/max window/mtu
  *
  *		Change the fundamental structure to a single send queue maintained
@@ -419,6 +423,7 @@
 
 #include <linux/types.h>
 #include <linux/fcntl.h>
+#include <linux/poll.h>
 
 #include <net/icmp.h>
 #include <net/tcp.h>
@@ -428,7 +433,7 @@
 unsigned long seq_offset;
 struct tcp_mib	tcp_statistics;
 
-
+kmem_cache_t *tcp_openreq_cachep;
 
 /*
  *	Find someone to 'accept'.
 *	Must be called with
@@ -437,26 +442,16 @@ struct tcp_mib	tcp_statistics;
 
 static struct open_request *tcp_find_established(struct tcp_opt *tp)
 {
-	struct open_request *req;
+	struct open_request *req = tp->syn_wait_queue;
 
-	req = tp->syn_wait_queue;
-
-	if (!req)
-		return NULL;
-
-	do {
+	while(req) {
 		if (req->sk &&
 		    (req->sk->state == TCP_ESTABLISHED ||
 		     req->sk->state >= TCP_FIN_WAIT1))
-		{
-			return req;
-		}
-
+			break;
 		req = req->dl_next;
-
-	} while (req != tp->syn_wait_queue);
-
-	return NULL;
+	}
+	return req;
 }
 
 /*
@@ -467,14 +462,26 @@ static struct open_request *tcp_find_established(struct tcp_opt *tp)
 
 static void tcp_close_pending (struct sock *sk)
 {
-	struct sk_buff *skb;
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+	struct open_request *req = tp->syn_wait_queue;
 
-	while ((skb = skb_dequeue(&sk->receive_queue)) != NULL)
-	{
-		tcp_close(skb->sk, 0);
-		kfree_skb(skb, FREE_READ);
+	while(req) {
+		struct open_request *iter;
+
+		if (req->sk)
+			tcp_close(req->sk, 0);
+
+		iter = req;
+		req = req->dl_next;
+
+		(*iter->class->destructor)(iter);
+		tcp_dec_slow_timer(TCP_SLT_SYNACK);
+		sk->ack_backlog--;
+		tcp_openreq_free(iter);
 	}
-	return;
+
+	tp->syn_wait_queue = NULL;
+	tp->syn_wait_last = &tp->syn_wait_queue;
 }
 
 /*
@@ -505,48 +512,40 @@ static int tcp_readable(struct sock *sk)
 	int sum;
 	unsigned long flags;
 
-	if(sk && sk->debug)
-		printk("tcp_readable: %p - ",sk);
+	SOCK_DEBUG(sk, "tcp_readable: %p - ",sk);
 
 	save_flags(flags);
 	cli();
-	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL)
-	{
+	if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL) {
 		restore_flags(flags);
-		if(sk && sk->debug)
-			printk("empty\n");
+		SOCK_DEBUG(sk, "empty\n");
 		return(0);
 	}
 
 	counted = sk->copied_seq;	/* Where we are at the moment */
 	amount = 0;
 
-	/*
-	 *	Do until a push or until we are out of data.
-	 */
-
-	do
-	{
-		/* Found a hole so stops here */
+	/* Do until a push or until we are out of data. */
+	do {
+		/* Found a hole so stops here. */
 		if (before(counted, skb->seq))
 			break;
-		/*
-		 *	Length - header but start from where we are up to
-		 *	avoid overlaps
+
+		/* Length - header but start from where we are up to
+		 * avoid overlaps.
 		 */
-		sum = skb->len - (counted - skb->seq);
+		sum = skb->len - (counted - skb->seq);
 		if (skb->h.th->syn)
 			sum++;
-		if (sum > 0)
-		{
-			/* Add it up, move on */
+		if (sum > 0) {
+			/* Add it up, move on. */
 			amount += sum;
 			if (skb->h.th->syn)
 				amount--;
 			counted += sum;
 		}
-		/*
-		 * Don't count urg data ... but do it in the right place!
+
+		/* Don't count urg data ... but do it in the right place!
 		 * Consider: "old_data (ptr is here) URG PUSH data"
 		 * The old code would stop at the first push because
 		 * it counted the urg (amount==1) and then does amount--
@@ -555,111 +554,89 @@ static int tcp_readable(struct sock *sk)
 		 * though there was normal data available. If we subtract
 		 * the urg data right here, we even get it to work for more
 		 * than one URG PUSH skb without normal data.
-		 * This means that select() finally works now with urg data
+		 * This means that poll() finally works now with urg data
 		 * in the queue.  Note that rlogin was never affected
-		 * because it doesn't use select(); it uses two processes
+		 * because it doesn't use poll(); it uses two processes
 		 * and a blocking read().  And the queue scan in tcp_read()
 		 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
 		 */
-		/* don't count urg data */
+
+		/* Don't count urg data.
+		 */
 		if (skb->h.th->urg)
 			amount--;
 #if 0
 		if (amount && skb->h.th->psh) break;
 #endif
 		skb = skb->next;
-	}
-	while(skb != (struct sk_buff *)&sk->receive_queue);
+	} while(skb != (struct sk_buff *)&sk->receive_queue);
 
 	restore_flags(flags);
-	if(sk->debug)
-		printk("got %lu bytes.\n",amount);
+	SOCK_DEBUG(sk, "got %lu bytes.\n",amount);
 	return(amount);
 }
 
 /*
- * LISTEN is a special case for select..
+ * LISTEN is a special case for poll..
  */
-static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
+static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
 {
-	if (sel_type == SEL_IN) {
-		struct open_request *req;
+	struct open_request *req;
 
-		lock_sock(sk);
-		req = tcp_find_established(&sk->tp_pinfo.af_tcp);
-		release_sock(sk);
-		if (req)
-			return 1;
-		select_wait(sk->sleep,wait);
-		return 0;
-	}
+	lock_sock(sk);
+	req = tcp_find_established(&sk->tp_pinfo.af_tcp);
+	release_sock(sk);
+	if (req)
+		return POLLIN | POLLRDNORM;
 	return 0;
 }
 
 /*
  *	Wait for a TCP event.
  *
- *	Note that we don't need to lock the socket, as the upper select layers
+ *	Note that we don't need to lock the socket, as the upper poll layers
  *	take care of normal races (between the test and the event) and we don't
  *	go look at any of the socket buffers directly.
  */
-int tcp_select(struct sock *sk, int sel_type, select_table *wait)
+unsigned int tcp_poll(struct socket *sock, poll_table *wait)
 {
+	unsigned int mask;
+	struct sock *sk = sock->sk;
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 
+	poll_wait(sk->sleep, wait);
 	if (sk->state == TCP_LISTEN)
-		return tcp_listen_select(sk, sel_type, wait);
-
-	switch(sel_type) {
-	case SEL_IN:
-		if (sk->err)
-			return 1;
-		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
-			break;
+		return tcp_listen_poll(sk, wait);
 
+	mask = 0;
+	if (sk->err)
+		mask = POLLERR;
+	/* Connected? */
+	if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) {
 		if (sk->shutdown & RCV_SHUTDOWN)
-			return 1;
-
-		if (tp->rcv_nxt == sk->copied_seq)
-			break;
+			mask |= POLLHUP;
 
-		if (sk->urg_seq != sk->copied_seq ||
-		    tp->rcv_nxt != sk->copied_seq+1 ||
-		    sk->urginline || !sk->urg_data)
-			return 1;
-		break;
+		if ((tp->rcv_nxt != sk->copied_seq) &&
+		    (sk->urg_seq != sk->copied_seq ||
+		     tp->rcv_nxt != sk->copied_seq+1 ||
+		     sk->urginline || !sk->urg_data))
+			mask |= POLLIN | POLLRDNORM;
 
-	case SEL_OUT:
-		if (sk->err)
-			return 1;
-		if (sk->shutdown & SEND_SHUTDOWN)
-			return 0;
-		if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
-			break;
-		/*
-		 * This is now right thanks to a small fix
-		 * by Matt Dillon.
+		/* FIXME: this assumed sk->mtu is correctly maintained.
+		 * I see no evidence this is the case. -- erics
 		 */
+		if (!(sk->shutdown & SEND_SHUTDOWN) &&
+		    (sock_wspace(sk) >= sk->mtu+128+sk->prot->max_header))
+			mask |= POLLOUT | POLLWRNORM;
 
-		if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
-			break;
-		return 1;
-
-	case SEL_EX:
 		if (sk->urg_data)
-			return 1;
-		break;
+			mask |= POLLPRI;
 	}
-	select_wait(sk->sleep, wait);
-	return 0;
+	return mask;
 }
 
 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 {
-	switch(cmd)
-	{
-
+	switch(cmd) {
 		case TIOCINQ:
 #ifdef FIXME	/* FIXME: */
 		case FIONREAD:
@@ -690,28 +667,39 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 		}
 		default:
 			return(-EINVAL);
-	}
+	};
 }
 
 /*
  *	This routine builds a generic TCP header.
+ *	It also builds in the RFC1323 Timestamp.
+ *	It can't (unfortunately) do SACK as well.
  */
-extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
+extern __inline void tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
 {
-	struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
 	memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
 	th->seq = htonl(sk->write_seq);
-	th->psh =(push == 0) ? 1 : 0;
-
-	sk->bytes_rcv = 0;
-	sk->ack_timed = 0;
 	th->ack_seq = htonl(tp->rcv_nxt);
 	th->window = htons(tcp_select_window(sk));
-	return(sizeof(*th));
+
+	/* FIXME: could use the inline found in tcp_output.c as well.
+	 * Probably that means we should move these up to an include file. --erics
+	 */
+	if (tp->tstamp_ok) {
+		__u32 *ptr = (__u32 *)(th+1);
+		*ptr++ = ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+			| (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
+		/* FIXME: Not sure it's worth setting these here already, but I'm
+		 * also not sure we replace them on all paths later. --erics
+		 */
+		*ptr++ = jiffies;
+		*ptr++ = tp->ts_recent;
+	}
 }
 
 /*
@@ -722,16 +710,14 @@ static void wait_for_tcp_connect(struct sock * sk)
 	release_sock(sk);
 	cli();
 	if (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT && sk->err == 0)
-	{
 		interruptible_sleep_on(sk->sleep);
-	}
 	sti();
 	lock_sock(sk);
 }
 
 static inline int tcp_memory_free(struct sock *sk)
 {
-	return sk->wmem_alloc < sk->sndbuf;
+	return atomic_read(&sk->wmem_alloc) < sk->sndbuf;
}
 
 /*
@@ -770,22 +756,15 @@ static int tcp_append_tail(struct sock *sk, struct sk_buff *skb, u8 *from,
 	int fault;
 	int copy;
 
-	/*
-	 *	Add more stuff to the end
-	 *	of the skb
-	 */
-
+	/* Add more stuff to the end of the skb. */
 	copy = min(sk->mss - tcp_size, skb_tailroom(skb));
 	copy = min(copy, seglen);
-
+
 	tcp_size += copy;
-
+
 	fault = copy_from_user(skb->tail, from, copy);
-
 	if (fault)
-	{
 		return -1;
-	}
 
 	skb_put(skb, copy);
 	skb->csum = csum_partial(skb->tail - tcp_size, tcp_size, 0);
@@ -801,32 +780,24 @@ static int tcp_append_tail(struct sock *sk, struct sk_buff *skb, u8 *from,
  *	and starts the transmit system.
  */
 
-int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov,
-		   int len, int nonblock, int flags)
+int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags)
 {
+	int err = 0;
 	int copied = 0;
 	struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
 
-	/*
-	 *	Wait for a connection to finish.
-	 */
-	while (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
-	{
-
-		if (copied)
-			return copied;
-
-		if (sk->err)
+	/* Wait for a connection to finish. */
+	while (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) {
+		if (sk->err)
 			return sock_error(sk);
-
-		if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
-		{
+
+		if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) {
 			if (sk->keepopen)
 				send_sig(SIGPIPE, current, 0);
 			return -EPIPE;
 		}
 
-		if (nonblock)
+		if (flags&MSG_DONTWAIT)
 			return -EAGAIN;
 
 		if (current->signal & ~current->blocked)
@@ -834,163 +805,126 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov,
 		wait_for_tcp_connect(sk);
 	}
 
-
-
-	/*
-	 *	Ok commence sending
-	 */
-
-	while(--iovlen >= 0)
-	{
+
+	/* Ok commence sending. */
+	while(--iovlen >= 0) {
 		int seglen=iov->iov_len;
 		unsigned char * from=iov->iov_base;
-		u32 actual_win;
 
 		iov++;
 
-		while(seglen > 0)
-		{
+		while(seglen > 0) {
+			unsigned int actual_win;
 			int copy;
 			int tmp;
 			struct sk_buff *skb;
 
-			/*
-			 *	Stop on errors
-			 */
-			if (sk->err)
-			{
+			if (err)
+				return (err);
+
+			/* Stop on errors.
+			 */
+			if (sk->err) {
 				if (copied)
 					return copied;
 				return sock_error(sk);
 			}
 
-			/*
-			 *	Make sure that we are established.
-			 */
-			if (sk->shutdown & SEND_SHUTDOWN)
-			{
+			/* Make sure that we are established. */
+			if (sk->shutdown & SEND_SHUTDOWN) {
 				if (copied)
 					return copied;
 				send_sig(SIGPIPE,current,0);
 				return -EPIPE;
 			}
 
-			/*
-			 *Now we need to check if we have a half built packet.
-			 */
+			/* Now we need to check if we have a half built packet. */
 
-			/* if we have queued packets */
-			if (tp->send_head && !(flags & MSG_OOB) )
-			{
+			/* If we have queued packets.. */
+			if (tp->send_head && !(flags & MSG_OOB)) {
 				int tcp_size;
 
 				/* Tail */
-
+
 				skb = sk->write_queue.prev;
-				tcp_size = skb->tail -
-					(unsigned char *)(skb->h.th + 1);
-
-				/*
-				 * This window_seq test is somewhat dangerous
+				tcp_size = skb->tail -
+					((unsigned char *)(skb->h.th) + tp->tcp_header_len);
+
+				/* printk("extending buffer\n"); */
+				/* This window_seq test is somewhat dangerous
 				 * If the remote does SWS avoidance we should
-				 * queue the best we can
-				 * if not we should in fact send multiple
-				 * packets...
+				 * queue the best we can if not we should in
+				 * fact send multiple packets...
 				 * a method for detecting this would be most
 				 * welcome
 				 */
 				if (skb->end > skb->tail &&
 				    sk->mss - tcp_size > 0 &&
-				    skb->end_seq < tp->snd_una + tp->snd_wnd)
-				{
+				    skb->end_seq < tp->snd_una + tp->snd_wnd &&
+				    tp->snd_nxt < skb->end_seq) {
 					int tcopy;
-
+
 					tcopy = tcp_append_tail(sk, skb, from,
 								tcp_size, seglen);
 					if (tcopy == -1)
-					{
 						return -EFAULT;
-					}
-
+
 					from += tcopy;
 					copied += tcopy;
-					len -= tcopy;
 					seglen -= tcopy;
-
-					/*
-					 * FIXME: if we're nagling we
+
+					/* FIXME: if we're nagling we
 					 * should send here.
 					 */
 					continue;
 				}
 			}
 
-			/*
-			 * We also need to worry about the window.
-			 * If window < 1/2 the maximum window we've seen from this
-			 * host, don't use it.  This is sender side
-			 * silly window prevention, as specified in RFC1122.
-			 * (Note that this is different than earlier versions of
-			 * SWS prevention, e.g. RFC813.).  What we actually do is
-			 * use the whole MSS.  Since the results in the right
-			 * edge of the packet being outside the window, it will
-			 * be queued for later rather than sent.
-			 */
-
+			/* We also need to worry about the window.
+			 * If window < 1/2 the maximum window we've seen from this
+			 * host, don't use it.  This is sender side
+			 * silly window prevention, as specified in RFC1122.
+			 * (Note that this is different than earlier versions of
+			 * SWS prevention, e.g. RFC813.).  What we actually do is
+			 * use the whole MSS.  Since the results in the right
+			 * edge of the packet being outside the window, it will
+			 * be queued for later rather than sent.
+			 */
 			copy = min(seglen, sk->mss);
-
 			actual_win = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
 
-			if (copy > actual_win &&
-			    (((long) actual_win) >= (sk->max_window >> 1)))
-			{
+			if (copy > actual_win &&
+			    (((int) actual_win) >= (tp->max_window >> 1)) &&
+			    actual_win)
 				copy = actual_win;
-			}
 
-			if (copy <= 0)
-			{
+			if (copy <= 0) {
 				printk(KERN_DEBUG "sendmsg: copy < 0\n");
 				return -EIO;
 			}
 
-			/*
-			 *	If sk->packets_out > 0 segment will be nagled
-			 *	else we kick it right away
+			/* If tp->packets_out > 0 segment will be nagled
+			 * else we kick it right away.
 			 */
-
 			tmp = MAX_HEADER + sk->prot->max_header + sizeof(struct sk_buff) + 15;
-			if (copy < min(sk->mss, sk->max_window >> 1) &&
-			    !(flags & MSG_OOB) && sk->packets_out)
-			{
-				tmp += min(sk->mss, sk->max_window);
-			}
+			if (copy < min(sk->mss, tp->max_window >> 1) &&
+			    !(flags & MSG_OOB) && tp->packets_out)
+				tmp += min(sk->mss, tp->max_window);
 			else
-			{
 				tmp += copy;
-			}
 
 			skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL);
-
-			/*
-			 *	If we didn't get any memory, we need to sleep.
-			 */
-
-			if (skb == NULL)
-			{
+
+			/* If we didn't get any memory, we need to sleep. */
+			if (skb == NULL) {
 				sk->socket->flags |= SO_NOSPACE;
-				if (nonblock)
-				{
+				if (flags&MSG_DONTWAIT) {
 					if (copied)
 						return copied;
 					return -EAGAIN;
 				}
 
-				if (current->signal & ~current->blocked)
-				{
+				if (current->signal & ~current->blocked) {
 					if (copied)
 						return copied;
 					return -ERESTARTSYS;
@@ -1000,54 +934,37 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov,
 				continue;
 			}
 
-			skb->sk = sk;
-			skb->free = 0;
-			skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
-
-			/*
-			 *	FIXME: we need to optimize this.
+			/* FIXME: we need to optimize this.
 			 * Perhaps some hints here would be good.
 			 */
-
 			tmp = tp->af_specific->build_net_header(sk, skb);
-
-			if (tmp < 0)
-			{
-				sock_wfree(sk, skb);
+			if (tmp < 0) {
+				kfree_skb(skb, FREE_WRITE);
 				if (copied)
 					return(copied);
 				return(tmp);
 			}
 
-			skb->h.th =(struct tcphdr *)
-				skb_put(skb,sizeof(struct tcphdr));
+			skb->h.th =(struct tcphdr *)
+				skb_put(skb,tp->tcp_header_len);
 
 			seglen -= copy;
-			tmp = tcp_build_header(skb->h.th, sk, seglen || iovlen);
-
-			if (tmp < 0)
-			{
-				sock_wfree(sk, skb);
-				if (copied)
-					return(copied);
-				return(tmp);
-			}
-
-			if (flags & MSG_OOB)
-			{
+			tcp_build_header(skb->h.th, sk, seglen || iovlen);
+			/* FIXME: still need to think about SACK options here. */
+
+			if (flags & MSG_OOB) {
 				skb->h.th->urg = 1;
 				skb->h.th->urg_ptr = ntohs(copy);
 			}
 
-			skb->csum = csum_partial_copy_fromuser(from,
-					skb_put(skb, copy), copy, 0);
-
+			skb->csum = csum_partial_copy_from_user(from,
+					skb_put(skb, copy), copy, 0, &err);
+
 			from += copy;
 			copied += copy;
-			len -= copy;
-			skb->free = 0;
+
 			sk->write_seq += copy;
-
+
 			tcp_send_skb(sk, skb);
 
 			release_sock(sk);
@@ -1057,12 +974,12 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov,
 
 	sk->err = 0;
 
+	if (err)
+		return (err);
+
 	return copied;
 }
 
-
-
-
 /*
  *	Send an ack if one is backlogged at this point. Ought to merge
  *	this with tcp_send_ack().
@@ -1071,18 +988,15 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov,
 
 void tcp_read_wakeup(struct sock *sk)
 {
-	/*
-	 *	If we're closed, don't send an ack, or we'll get a RST
+	/* If we're closed, don't send an ack, or we'll get a RST
 	 * from the closed destination.
 	 */
-
 	if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
 		return;
 
 	tcp_send_ack(sk);
 }
 
-
 /*
  *	Handle reading urgent data. BSD has very simple semantics for
  *	this, no blocking and very strange errors 8)
@@ -1095,33 +1009,28 @@ static int tcp_recv_urg(struct sock * sk, int nonblock,
 	int err=0;
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 
-	/*
-	 *	No URG data to read
-	 */
+	/* No URG data to read. */
 	if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ)
 		return -EINVAL;	/* Yes this is right !
				 */
 
 	if (sk->err)
 		return sock_error(sk);
 
-	if (sk->state == TCP_CLOSE || sk->done)
-	{
-		if (!sk->done)
-		{
+	if (sk->state == TCP_CLOSE || sk->done) {
+		if (!sk->done) {
 			sk->done = 1;
 			return 0;
 		}
 		return -ENOTCONN;
 	}
 
-	if (sk->shutdown & RCV_SHUTDOWN)
-	{
+	if (sk->shutdown & RCV_SHUTDOWN) {
 		sk->done = 1;
 		return 0;
 	}
+
 	lock_sock(sk);
-	if (sk->urg_data & URG_VALID)
-	{
+	if (sk->urg_data & URG_VALID) {
 		char c = sk->urg_data;
 		if (!(flags & MSG_PEEK))
 			sk->urg_data = URG_READ;
@@ -1132,23 +1041,20 @@ static int tcp_recv_urg(struct sock * sk, int nonblock,
 			msg->msg_flags|=MSG_TRUNC;
 
 		if(msg->msg_name)
-		{
 			tp->af_specific->addr2sockaddr(sk,
 						       (struct sockaddr *) msg->msg_name);
-		}
+
 		if(addr_len)
-			*addr_len= tp->af_specific->sockaddr_len;
-		/*
-		 *	Read urgent data
-		 */
+			*addr_len = tp->af_specific->sockaddr_len;
+
+		/* Read urgent data. */
 		msg->msg_flags|=MSG_OOB;
 		release_sock(sk);
 		return err ? -EFAULT : 1;
 	}
 	release_sock(sk);
 
-	/*
-	 * Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
+	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
 	 * the available implementations agree in this case:
 	 * this call should never block, independent of the
	 * blocking state of the socket.
@@ -1165,9 +1071,8 @@ static int tcp_recv_urg(struct sock * sk, int nonblock,
 
 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
 {
-	sk->ack_backlog++;
+	sk->tp_pinfo.af_tcp.delayed_acks++;
 
-	skb->sk = sk;
 	__skb_unlink(skb, &sk->receive_queue);
 	kfree_skb(skb, FREE_READ);
 }
@@ -1176,36 +1081,31 @@ static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
 static void cleanup_rbuf(struct sock *sk)
 {
 	struct sk_buff *skb;
-	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 
-	/*
-	 * NOTE! The socket must be locked, so that we don't get
+	/* NOTE! The socket must be locked, so that we don't get
 	 * a messed-up receive queue.
 	 */
-
 	while ((skb=skb_peek(&sk->receive_queue)) != NULL) {
-		if (!skb->used || skb->users)
+		if (!skb->used || atomic_read(&skb->users)>1)
 			break;
 		tcp_eat_skb(sk, skb);
 	}
-
-	if(sk->debug)
-		printk("sk->rspace = %lu\n", sock_rspace(sk));
-
-	/*
-	 * We send a ACK if the sender is blocked
-	 * else let tcp_data deal with the acking policy.
-	 */
-	if (sock_rspace(sk) > tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup) &&
-	    (tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup) < sk->mss))
-	{
-		/* Send an ack right now. */
-		sk->delayed_acks++;
-		tcp_read_wakeup(sk);
-	}
-
-}
+	SOCK_DEBUG(sk, "sk->rspace = %lu\n", sock_rspace(sk));
+
+	/* We send a ACK if the sender is blocked
+	 * else let tcp_data deal with the acking policy.
+	 */
+	if (sk->tp_pinfo.af_tcp.delayed_acks) {
+		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+		__u32 rcv_wnd;
+
+		rcv_wnd = tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup);
+
+		if ((rcv_wnd < sk->mss) && (sock_rspace(sk) > rcv_wnd))
+			tcp_read_wakeup(sk);
+	}
+}
 
 
 /*
@@ -1227,47 +1127,34 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
 	if (sk->state == TCP_LISTEN)
 		return -ENOTCONN;
 
-	/*
-	 *	Urgent data needs to be handled specially.
-	 */
-
+	/* Urgent data needs to be handled specially. */
 	if (flags & MSG_OOB)
 		return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);
 
-	/*
-	 *	Copying sequence to update. This is volatile to handle
+	/* Copying sequence to update. This is volatile to handle
 	 * the multi-reader case neatly (memcpy_to/fromfs might be
 	 * inline and thus not flush cached variables otherwise.
	 */
-
 	peek_seq = sk->copied_seq;
 	seq = &sk->copied_seq;
 	if (flags & MSG_PEEK)
 		seq = &peek_seq;
 
-	/*
-	 *	Handle the POSIX bogosity MSG_WAITALL
-	 */
-
+	/* Handle the POSIX bogosity MSG_WAITALL. */
 	if (flags & MSG_WAITALL)
 		target=len;
 
 	add_wait_queue(sk->sleep, &wait);
 	lock_sock(sk);
-	while (len > 0)
-	{
+	while (len > 0) {
 		struct sk_buff * skb;
 		u32 offset;
 
-		/*
-		 * Are we at urgent data? Stop if we have read anything.
-		 */
-
+		/* Are we at urgent data? Stop if we have read anything. */
 		if (copied && sk->urg_data && sk->urg_seq == *seq)
 			break;
 
-		/*
-		 * We need to check signals first, to get correct SIGURG
+		/* We need to check signals first, to get correct SIGURG
		 * handling. FIXME: Need to check this doesnt impact 1003.1g
		 * and move it down to the bottom of the loop
		 */
@@ -1275,26 +1162,24 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
 			if (copied)
 				break;
 			copied = -ERESTARTSYS;
+			if (nonblock)
+				copied = -EAGAIN;
 			break;
 		}
 
-		/*
-		 *	Next get a buffer.
-		 */
-
+		/* Next get a buffer. */
 		current->state = TASK_INTERRUPTIBLE;
 
 		skb = skb_peek(&sk->receive_queue);
-		do
-		{
+		do {
 			if (!skb)
 				break;
-			/*
-			 * now that we have two receive queues this
-			 * shouldn't happen
+
+			/* Now that we have two receive queues this
			 * shouldn't happen.
			 */
			if (before(*seq, skb->seq)) {
-				printk("recvmsg bug: copied %X seq %X\n",
+				printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
 					*seq, skb->seq);
				break;
			}
@@ -1308,22 +1193,18 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
 			if (!(flags & MSG_PEEK))
 				skb->used = 1;
 			skb = skb->next;
-		}
-		while (skb != (struct sk_buff *)&sk->receive_queue);
+		} while (skb != (struct sk_buff *)&sk->receive_queue);
 
 		if (copied >= target)
 			break;
 
-		if (sk->err && !(flags&MSG_PEEK))
-		{
+		if (sk->err && !(flags&MSG_PEEK)) {
 			copied = sock_error(sk);
 			break;
 		}
 
-		if (sk->state == TCP_CLOSE)
-		{
-			if (!sk->done)
-			{
+		if (sk->state == TCP_CLOSE) {
+			if (!sk->done) {
 				sk->done = 1;
 				break;
 			}
@@ -1331,14 +1212,12 @@
 			break;
 		}
 
-		if (sk->shutdown & RCV_SHUTDOWN)
-		{
+		if (sk->shutdown & RCV_SHUTDOWN) {
 			sk->done = 1;
 			break;
 		}
 
-		if (nonblock)
-		{
+		if (nonblock) {
 			copied = -EAGAIN;
 			break;
 		}
@@ -1352,97 +1231,75 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
 			continue;
 
 		found_ok_skb:
-			/*
-			 * Lock the buffer. We can be fairly relaxed as
+			/* Lock the buffer. We can be fairly relaxed as
			 * an interrupt will never steal a buffer we are
			 * using unless I've missed something serious in
			 * tcp_data.
			 */
+			atomic_inc(&skb->users);
 
-			skb->users++;
-
-			/*
-			 *	Ok so how much can we use ?
-			 */
-
+			/* Ok so how much can we use? */
			used = skb->len - offset;
			if (len < used)
				used = len;
 
-			/*
-			 *	Do we have urgent data here?
-			 */
-			if (sk->urg_data)
-			{
+			/* Do we have urgent data here? */
+			if (sk->urg_data) {
				u32 urg_offset = sk->urg_seq - *seq;
-				if (urg_offset < used)
-				{
-					if (!urg_offset)
-					{
-						if (!sk->urginline)
-						{
+				if (urg_offset < used) {
+					if (!urg_offset) {
+						if (!sk->urginline) {
							++*seq;
							offset++;
							used--;
						}
-					}
-					else
+					} else
						used = urg_offset;
				}
			}
 
-			/*
-			 *	Copy it - We _MUST_ update *seq first so that we
+			/* Copy it - We _MUST_ update *seq first so that we
			 * don't ever double read when we have dual readers
			 */
-
			*seq += used;
-
-			/*
-			 *	This memcpy_toiovec can sleep. If it sleeps and we
+
+			/* This memcpy_toiovec can sleep. If it sleeps and we
			 * do a second read it relies on the skb->users to avoid
			 * a crash when cleanup_rbuf() gets called.
			 */
-
			err = memcpy_toiovec(msg->msg_iov,
					     ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset,
					     used);
 
-			if (err)
-			{
-				/*
-				 *	exception. bailout!
-				 */
+			if (err) {
+				/* Exception. Bailout! */
				*seq -= err;
-				skb->users--;
-				return -EFAULT;
+				atomic_dec(&skb->users);
+				copied = -EFAULT;
+				break;
			}
 
			copied += used;
			len -= used;
 
-			/*
-			 *	We now will not sleep again until we are finished
+			/* We now will not sleep again until we are finished
			 * with skb. Sorry if you are doing the SMP port
			 * but you'll just have to fix it neatly ;)
			 */
-
-			skb->users--;
+			atomic_dec(&skb->users);
 
			if (after(sk->copied_seq,sk->urg_seq))
				sk->urg_data = 0;
			if (used + offset < skb->len)
				continue;
 
-			/*
-			 *	Process the FIN. We may also need to handle PSH
-			 *	here and make it break out of MSG_WAITALL
+			/* Process the FIN. We may also need to handle PSH
			 * here and make it break out of MSG_WAITALL.
			 */
-
			if (skb->h.th->fin)
				goto found_fin_ok;
			if (flags & MSG_PEEK)
				continue;
			skb->used = 1;
-			if (!skb->users)
+			if (atomic_read(&skb->users) == 1)
				tcp_eat_skb(sk, skb);
			continue;
 
@@ -1451,35 +1308,28 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
			if (flags & MSG_PEEK)
				break;
 
-			/*
-			 *	All is done
-			 */
-
+			/* All is done. */
			skb->used = 1;
			sk->shutdown |= RCV_SHUTDOWN;
			break;
-
	}
 
	if(copied > 0 && msg->msg_name)
-	{
		tp->af_specific->addr2sockaddr(sk,
					       (struct sockaddr *) msg->msg_name);
-	}
+
	if(addr_len)
-		*addr_len= tp->af_specific->sockaddr_len;
+		*addr_len = tp->af_specific->sockaddr_len;
 
	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;
 
-	/* Clean up data we have read: This will do ACK frames */
+	/* Clean up data we have read: This will do ACK frames. */
	cleanup_rbuf(sk);
	release_sock(sk);
	return copied;
}
 
-
-
/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
@@ -1491,8 +1341,7 @@ static int tcp_close_state(struct sock *sk, int dead)
{
	int ns=TCP_CLOSE;
	int send_fin=0;
-	switch(sk->state)
-	{
+	switch(sk->state) {
		case TCP_SYN_SENT:	/* No SYN back, no FIN needed */
			break;
		case TCP_SYN_RECV:
@@ -1508,16 +1357,16 @@ static int tcp_close_state(struct sock *sk, int dead)
		case TCP_CLOSE:
		case TCP_LISTEN:
			break;
+		case TCP_LAST_ACK:	/* Could have shutdown() then close() */
		case TCP_CLOSE_WAIT:	/* They have FIN'd us. We send our FIN and
					   wait only for the ACK */
			ns=TCP_LAST_ACK;
			send_fin=1;
-	}
+	};
 
	tcp_set_state(sk,ns);
 
-	/*
-	 *	This is a (useful) BSD violating of the RFC. There is a
+	/* This is a (useful) BSD violating of the RFC. There is a
	 * problem with TCP as specified in that the other end could
	 * keep a socket open forever with no application left this end.
	 * We use a 3 minute timeout (about the same as BSD) then kill
@@ -1525,8 +1374,7 @@
	 * that we won't make the old 4*rto = almost no time - whoops
	 * reset mistake.
	 */
-	if(dead && ns==TCP_FIN_WAIT2)
-	{
+	if(dead && ns==TCP_FIN_WAIT2) {
		int timer_active=del_timer(&sk->timer);
		if(timer_active)
			add_timer(&sk->timer);
@@ -1544,50 +1392,29 @@
 
void tcp_shutdown(struct sock *sk, int how)
{
-	/*
-	 *	We need to grab some memory, and put together a FIN,
+	/* We need to grab some memory, and put together a FIN,
	 * and then put it into the queue to be sent.
	 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
	 */
-
	if (!(how & SEND_SHUTDOWN))
		return;
 
-	/*
-	 *	If we've already sent a FIN, or it's a closed state
-	 */
-
-	if (sk->state == TCP_FIN_WAIT1 ||
-	    sk->state == TCP_FIN_WAIT2 ||
-	    sk->state == TCP_CLOSING ||
-	    sk->state == TCP_LAST_ACK ||
-	    sk->state == TCP_TIME_WAIT ||
-	    sk->state == TCP_CLOSE ||
-	    sk->state == TCP_LISTEN
-	   )
-	{
-		return;
-	}
-	lock_sock(sk);
-
-	/*
-	 *	flag that the sender has shutdown
-	 */
+	/* If we've already sent a FIN, or it's a closed state, skip this. */
+	if (sk->state == TCP_ESTABLISHED ||
+	    sk->state == TCP_SYN_SENT ||
+	    sk->state == TCP_SYN_RECV ||
+	    sk->state == TCP_CLOSE_WAIT) {
+		lock_sock(sk);
 
-	sk->shutdown |= SEND_SHUTDOWN;
+		/* Flag that the sender has shutdown. */
+		sk->shutdown |= SEND_SHUTDOWN;
 
-	/*
-	 *	Clear out any half completed packets.
-	 */
+		/* Clear out any half completed packets.  FIN if needed. */
+		if (tcp_close_state(sk,0))
+			tcp_send_fin(sk);
 
-	/*
-	 *	FIN if needed
-	 */
-
-	if (tcp_close_state(sk,0))
-		tcp_send_fin(sk);
-
-	release_sock(sk);
+		release_sock(sk);
+	}
}
 
 
@@ -1602,7 +1429,7 @@ static inline int closing(struct sock * sk)
		case TCP_CLOSING:
		case TCP_LAST_ACK:
			return 1;
-	}
+	};
	return 0;
}
 
@@ -1611,21 +1438,17 @@
void tcp_close(struct sock *sk, unsigned long timeout)
{
	struct sk_buff *skb;
 
-	/*
-	 * We need to grab some memory, and put together a FIN,
+	/* We need to grab some memory, and put together a FIN,
	 * and then put it into the queue to be sent.
	 */
-
	lock_sock(sk);
-
-	tcp_cache_zap();
-	if(sk->state == TCP_LISTEN)
-	{
-		/* Special case */
+	if(sk->state == TCP_LISTEN) {
+		/* Special case. */
		tcp_set_state(sk, TCP_CLOSE);
		tcp_close_pending(sk);
		release_sock(sk);
		sk->dead = 1;
+		sk->prot->unhash(sk);
		return;
	}
 
@@ -1635,54 +1458,37 @@ void tcp_close(struct sock *sk, unsigned long timeout)
	if (!sk->dead)
		sk->state_change(sk);
 
-	/*
-	 *	We need to flush the recv. buffs.  We do this only on the
+	/* We need to flush the recv. buffs.  We do this only on the
	 * descriptor close, not protocol-sourced closes, because the
	 * reader process may not have drained the data yet!
	 */
-
	while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
		kfree_skb(skb, FREE_READ);
-
-	/*
-	 *	Timeout is not the same thing - however the code likes
-	 *	to send both the same way (sigh).
+
+	/* Timeout is not the same thing - however the code likes
	 * to send both the same way (sigh).
	 */
	if (tcp_close_state(sk,1)==1)
-	{
		tcp_send_fin(sk);
-	}
 
	if (timeout) {
		cli();
		release_sock(sk);
		current->timeout = timeout;
-		while(closing(sk) && current->timeout)
-		{
+		while(closing(sk) && current->timeout) {
			interruptible_sleep_on(sk->sleep);
			if (current->signal & ~current->blocked)
-			{
				break;
-			}
		}
		current->timeout=0;
		lock_sock(sk);
		sti();
	}
 
-	/*
-	 * This will destroy it. The timers will take care of actually
-	 * free'ing up the memory.
-	 */
-	tcp_cache_zap();	/* Kill the cache again. */
-
	/* Now that the socket is dead, if we are in the FIN_WAIT2 state
	 * we may need to set up a timer.
	 */
-	if (sk->state==TCP_FIN_WAIT2)
-	{
+	if (sk->state==TCP_FIN_WAIT2) {
		int timer_active=del_timer(&sk->timer);
		if(timer_active)
			add_timer(&sk->timer);
@@ -1690,10 +1496,12 @@ void tcp_close(struct sock *sk, unsigned long timeout)
			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
	}
 
-	release_sock(sk);
	sk->dead = 1;
-}
+	release_sock(sk);
 
+	if(sk->state == TCP_CLOSE)
+		sk->prot->unhash(sk);
+}
 
/*
 *	Wait for an incoming connection, avoid race
@@ -1720,7 +1528,6 @@ static struct open_request * wait_for_connect(struct sock * sk)
	return req;
}
 
-
/*
 *	This will accept the next outstanding connection.
 *
@@ -1734,11 +1541,9 @@ struct sock *tcp_accept(struct sock *sk, int flags)
	struct sock *newsk = NULL;
	int error;
 
-	/*
-	 * We need to make sure that this socket is listening,
-	 * and that it has something pending.
-	 */
-
+	/* We need to make sure that this socket is listening,
+	 * and that it has something pending.
+	 */
	error = EINVAL;
	if (sk->state != TCP_LISTEN)
		goto no_listen;
@@ -1750,7 +1555,7 @@ struct sock *tcp_accept(struct sock *sk, int flags)
got_new_connect:
	tcp_synq_unlink(tp, req);
	newsk = req->sk;
-	kfree(req);
+	tcp_openreq_free(req);
	sk->ack_backlog--;
	error = 0;
out:
@@ -1770,7 +1575,6 @@ no_listen:
	goto out;
}
 
-
/*
 *	Socket option code for TCP.
 */
@@ -1782,22 +1586,18 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
	int val;
 
	if (level != SOL_TCP)
-	{
		return tp->af_specific->setsockopt(sk, level, optname,
						   optval, optlen);
-	}
-
-	if (optval == NULL)
-		return(-EINVAL);
+
+	if(optlen<sizeof(int))
+		return -EINVAL;
 
	if (get_user(val, (int *)optval))
		return -EFAULT;
 
-	switch(optname)
-	{
+	switch(optname) {
		case TCP_MAXSEG:
-/*
- * values greater than interface MTU won't take effect.  however at
+/* values greater than interface MTU won't take effect.  however at
 * the point when this call is done we typically don't yet know
 * which interface is going to be used
 */
@@ -1810,23 +1610,26 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
			return 0;
		default:
			return(-ENOPROTOOPT);
-	}
+	};
}
 
int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
		   int *optlen)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
-	int val,err;
+	int val;
+	int len;
 
	if(level != SOL_TCP)
-	{
		return tp->af_specific->getsockopt(sk, level, optname,
						   optval, optlen);
-	}
+
+	if(get_user(len,optlen))
+		return -EFAULT;
+
+	len = min(len,sizeof(int));
 
-	switch(optname)
-	{
+	switch(optname) {
		case TCP_MAXSEG:
			val=sk->user_mss;
			break;
@@ -1835,30 +1638,29 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
			break;
		default:
			return(-ENOPROTOOPT);
-	}
-
-	err = put_user(sizeof(int),(int *) optlen);
-	if (!err)
-		err = put_user(val,(int *)optval);
+	};
 
-	return err;
+	if(put_user(len, optlen))
+		return -EFAULT;
+	if(copy_to_user(optval, &val,len))
+		return -EFAULT;
+	return 0;
}
 
void tcp_set_keepalive(struct sock *sk, int val)
{
	if (!sk->keepopen && val)
-	{
		tcp_inc_slow_timer(TCP_SLT_KEEPALIVE);
-	}
	else if (sk->keepopen && !val)
-	{
		tcp_dec_slow_timer(TCP_SLT_KEEPALIVE);
-	}
}
 
-/*
- * Local variables:
- *  compile-command: "gcc -D__KERNEL__ -I/usr/src/linux/include -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer -fno-strength-reduce -pipe -m486 -DCPU=486 -c -o tcp.o tcp.c"
- *  c-file-style: "Linux"
- * End:
- */
+void tcp_init(void)
+{
+	tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
+					       sizeof(struct open_request),
+					       sizeof(long)*8, SLAB_HWCACHE_ALIGN,
+					       NULL, NULL);
+	if(!tcp_openreq_cachep)
+		panic("tcp_init: Cannot alloc open_request cache.");
+}
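
Editor's note (not part of the patch): the replacement of tcp_select() with tcp_poll() above means userspace now sees TCP readiness as a poll(2) event mask rather than the three select() result sets. The kernel-side function sets POLLERR when sk->err is pending, POLLHUP when the receive side has been shut down, POLLIN|POLLRDNORM when new data has arrived past copied_seq, POLLOUT|POLLWRNORM when enough send space is free, and POLLPRI when urgent data is queued. The following is a minimal, illustrative userspace sketch of how those bits are consumed; it assumes "fd" is a connected TCP socket created elsewhere and uses only the standard poll() API.

#include <poll.h>
#include <stdio.h>

/* Wait up to five seconds for TCP events on a connected socket and
 * report which conditions tcp_poll() flagged.  Returns the revents
 * mask, or -1 on error.
 */
static int wait_for_tcp_events(int fd)
{
	struct pollfd pfd;

	pfd.fd = fd;
	pfd.events = POLLIN | POLLOUT | POLLPRI;	/* conditions we ask about */

	if (poll(&pfd, 1, 5000) < 0) {
		perror("poll");
		return -1;
	}

	if (pfd.revents & POLLERR)	/* a pending socket error (sk->err) */
		fprintf(stderr, "socket error pending\n");
	if (pfd.revents & POLLHUP)	/* receive side shut down, no more data coming */
		fprintf(stderr, "hangup on receive side\n");
	if (pfd.revents & POLLPRI)	/* urgent data queued (read with MSG_OOB) */
		fprintf(stderr, "urgent data available\n");
	if (pfd.revents & POLLIN)	/* normal data readable */
		fprintf(stderr, "data readable\n");
	if (pfd.revents & POLLOUT)	/* enough send buffer space to write */
		fprintf(stderr, "socket writable\n");

	return pfd.revents;
}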