Diffstat (limited to 'net')
47 files changed, 2138 insertions, 3509 deletions
diff --git a/net/core/datagram.c b/net/core/datagram.c index cdab70aba..186ccf81b 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -132,15 +132,13 @@ restart: unsigned long flags; save_flags(flags); cli(); - skb=skb_peek(&sk->receive_queue); + skb = skb_peek(&sk->receive_queue); if(skb!=NULL) atomic_inc(&skb->users); restore_flags(flags); - if(skb==NULL) /* shouldn't happen but .. */ - goto restart; - return skb; - } - skb = skb_dequeue(&sk->receive_queue); + } else + skb = skb_dequeue(&sk->receive_queue); + if (!skb) /* Avoid race if someone beats us to the data */ goto restart; return skb; @@ -163,30 +161,23 @@ void skb_free_datagram(struct sock * sk, struct sk_buff *skb) int skb_copy_datagram(struct sk_buff *skb, int offset, char *to, int size) { - int err; - err = copy_to_user(to, skb->h.raw+offset, size); - if (err) - { - err = -EFAULT; - } + int err = -EFAULT; + + if (!copy_to_user(to, skb->h.raw + offset, size)) + err = 0; return err; } /* * Copy a datagram to an iovec. + * Note: the iovec is modified during the copy. */ int skb_copy_datagram_iovec(struct sk_buff *skb, int offset, struct iovec *to, int size) { - int err; - err = memcpy_toiovec(to, skb->h.raw+offset, size); - if (err) - { - err = -EFAULT; - } - return err; + return memcpy_toiovec(to, skb->h.raw + offset, size); } /* diff --git a/net/core/dst.c b/net/core/dst.c index 4cad680c2..9007dde66 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -101,6 +101,14 @@ void * dst_alloc(int size, struct dst_ops * ops) void __dst_free(struct dst_entry * dst) { start_bh_atomic(); + /* The first case (dev==NULL) is required, when + protocol module is unloaded. + */ + if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { + dst->input = dst_discard; + dst->output = dst_blackhole; + dst->dev = &loopback_dev; + } dst->obsolete = 2; dst->next = dst_garbage_list; dst_garbage_list = dst; diff --git a/net/core/iovec.c b/net/core/iovec.c index 9e8873646..5b684a48f 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -30,7 +30,6 @@ /* * Verify iovec - * verify area does a simple check for completly bogus addresses * * Save time not doing verify_area. copy_*_user will make this work * in any case. @@ -79,22 +78,21 @@ out_free: } /* - * Copy kernel to iovec. + * Copy kernel to iovec. Returns -EFAULT on error. * * Note: this modifies the original iovec. */ int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) { - int err; + int err = -EFAULT; while(len>0) { if(iov->iov_len) { int copy = min(iov->iov_len, len); - err = copy_to_user(iov->iov_base, kdata, copy); - if (err) + if (copy_to_user(iov->iov_base, kdata, copy)) goto out; kdata+=copy; len-=copy; @@ -109,7 +107,7 @@ out: } /* - * Copy iovec to kernel. + * Copy iovec to kernel. Returns -EFAULT on error. * * Note: this modifies the original iovec. 
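The __dst_free() change above avoids freeing a routing-cache entry while packets may still hold a pointer to it: when the device is gone, the entry's input/output hooks are redirected to discard stubs and the entry is reparented onto the loopback device before being queued for garbage collection. A minimal user-space sketch of that function-pointer quiescing idiom (struct and names here are illustrative, not the kernel's):

#include <stdio.h>

struct entry {
	int (*output)(struct entry *);
	const char *dev;
};

static int real_output(struct entry *e)
{
	printf("tx via %s\n", e->dev);
	return 0;
}

static int blackhole_output(struct entry *e)
{
	/* Swallow the packet; the entry is dying but still reachable. */
	(void)e;
	return -1;
}

/* Called when the underlying device goes away: late callers hit a
 * harmless stub instead of freed state.
 */
static void quiesce(struct entry *e)
{
	e->output = blackhole_output;
	e->dev = "lo";
}

int main(void)
{
	struct entry e = { real_output, "eth0" };
	e.output(&e);	/* normal path */
	quiesce(&e);
	e.output(&e);	/* device gone: blackholed, not crashed */
	return 0;
}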
*/ @@ -147,35 +145,23 @@ int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, { int err = -EFAULT; - while(offset>0) + /* Skip over the finished iovecs */ + while(offset >= iov->iov_len) { - if (offset > iov->iov_len) - { - offset -= iov->iov_len; - } - else - { - u8 *base = iov->iov_base + offset; - int copy = min(len, iov->iov_len - offset); - - offset = 0; - - if (copy_from_user(kdata, base, copy)) - goto out; - len-=copy; - kdata+=copy; - } + offset -= iov->iov_len; iov++; } - while (len>0) + while (len > 0) { - int copy = min(len, iov->iov_len); + u8 *base = iov->iov_base + offset; + int copy = min(len, iov->iov_len - offset); - if (copy_from_user(kdata, iov->iov_base, copy)) + offset = 0; + if (copy_from_user(kdata, base, copy)) goto out; - len-=copy; - kdata+=copy; + len -= copy; + kdata += copy; iov++; } err = 0; @@ -195,51 +181,22 @@ out: int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, unsigned int len, int *csump) { - int partial_cnt = 0; - int err = 0; - int csum; + int csum = *csump; + int partial_cnt = 0, err = 0; - do { - int copy = iov->iov_len - offset; - - if (copy > 0) { - u8 *base = iov->iov_base + offset; - - /* Normal case (single iov component) is fastly detected */ - if (len <= copy) { - *csump = csum_and_copy_from_user(base, kdata, - len, *csump, &err); - goto out; - } - - partial_cnt = copy % 4; - if (partial_cnt) { - copy -= partial_cnt; - if (copy_from_user(kdata + copy, base + copy, - partial_cnt)) - goto out_fault; - } - - *csump = csum_and_copy_from_user(base, kdata, copy, - *csump, &err); - if (err) - goto out; - len -= copy + partial_cnt; - kdata += copy + partial_cnt; - iov++; - break; - } + /* Skip over the finished iovecs */ + while (offset >= iov->iov_len) + { + offset -= iov->iov_len; iov++; - offset = -copy; - } while (offset > 0); - - csum = *csump; + } while (len > 0) { - u8 *base = iov->iov_base; - unsigned int copy = min(len, iov->iov_len); + u8 *base = iov->iov_base + offset; + unsigned int copy = min(len, iov->iov_len - offset); + offset = 0; /* There is a remnant from previous iov. */ if (partial_cnt) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9180b8b54..57e58f85a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -16,6 +16,7 @@ * only put in the headers * Ray VanTassle : Fixed --skb->lock in free * Alan Cox : skb_copy copy arp field + * Andi Kleen : slabified it. * * NOTE: * The __skb_ routines should be called with interrupts @@ -45,6 +46,8 @@ #include <linux/netdevice.h> #include <linux/string.h> #include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/init.h> #include <net/ip.h> #include <net/protocol.h> @@ -57,6 +60,11 @@ #include <asm/system.h> /* + * Skb list spinlock + */ +spinlock_t skb_queue_lock = SPIN_LOCK_UNLOCKED; + +/* * Resource tracking variables */ @@ -66,6 +74,8 @@ static atomic_t net_fails = ATOMIC_INIT(0); extern atomic_t ip_frag_mem; +static kmem_cache_t *skbuff_head_cache; + /* * Strings we don't want inline's duplicating */ @@ -87,138 +97,119 @@ void show_net_buffers(void) #endif } -/* - * Free an sk_buff. Release anything attached to the buffer. - */ - -void __kfree_skb(struct sk_buff *skb) -{ - if (skb->list) - printk(KERN_WARNING "Warning: kfree_skb passed an skb still " - "on a list (from %p).\n", __builtin_return_address(0)); - - dst_release(skb->dst); - if(skb->destructor) - skb->destructor(skb); - kfree_skbmem(skb); -} - -/* - * Allocate a new skbuff. 
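The rewritten memcpy_fromiovecend() above first walks past iovec entries wholly consumed by the offset, then copies with the residual offset applied only to the first element. A self-contained user-space rendering of that loop (memcpy stands in for copy_from_user, and error handling is elided):

#include <string.h>
#include <sys/uio.h>

/* Gather 'len' bytes into 'kdata', starting 'offset' bytes into the
 * iovec array. Caller guarantees the iovecs cover offset+len bytes.
 */
static void copy_from_iovec_end(unsigned char *kdata,
				const struct iovec *iov,
				size_t offset, size_t len)
{
	/* Skip over the finished iovecs. */
	while (offset >= iov->iov_len) {
		offset -= iov->iov_len;
		iov++;
	}

	while (len > 0) {
		const unsigned char *base =
			(unsigned char *)iov->iov_base + offset;
		size_t copy = iov->iov_len - offset;

		if (copy > len)
			copy = len;
		offset = 0;	/* only the first element is offset */
		memcpy(kdata, base, copy);
		kdata += copy;
		len -= copy;
		iov++;
	}
}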
We do this ourselves so we can fill in a few 'private' - * fields and also do memory statistics to find all the [BEEP] leaks. - * - * Note: For now we put the header after the data to get better cache - * usage. Once we have a good cache aware kmalloc this will cease - * to be a good idea. +/* Allocate a new skbuff. We do this ourselves so we can fill in a few + * 'private' fields and also do memory statistics to find all the + * [BEEP] leaks. + * */ struct sk_buff *alloc_skb(unsigned int size,int gfp_mask) { struct sk_buff *skb; - unsigned char *bptr; - int len; + u8 *data; if (in_interrupt() && (gfp_mask & __GFP_WAIT)) { static int count = 0; if (++count < 5) { printk(KERN_ERR "alloc_skb called nonatomically " "from interrupt %p\n", __builtin_return_address(0)); - gfp_mask &= ~__GFP_WAIT; } + gfp_mask &= ~__GFP_WAIT; } - /* - * FIXME: We could do with an architecture dependent - * 'alignment mask'. - */ - - /* Allow for alignments. Make a multiple of 16 bytes */ - size = (size + 15) & ~15; - len = size; - - /* And stick the control itself on the end */ - size += sizeof(struct sk_buff); - - /* - * Allocate some space - */ - - bptr = kmalloc(size,gfp_mask); - if (bptr == NULL) { - atomic_inc(&net_fails); - return NULL; - } + /* Get the HEAD */ + skb = kmem_cache_alloc(skbuff_head_cache, gfp_mask); + if (skb == NULL) + goto nohead; - /* - * Now we play a little game with the caches. Linux kmalloc is - * a bit cache dumb, in fact its just about maximally non - * optimal for typical kernel buffers. We actually run faster - * by doing the following. Which is to deliberately put the - * skb at the _end_ not the start of the memory block. + /* Get the DATA. Size must match skb_add_mtu(). */ + size = ((size + 15) & ~15); + data = kmalloc(size + sizeof(atomic_t), gfp_mask); + if (data == NULL) + goto nodata; + + /* Note that this counter is useless now - you can just look in the + * skbuff_head entry in /proc/slabinfo. We keep it only for emergency + * cases. */ atomic_inc(&net_allocs); - - skb = (struct sk_buff *)(bptr + size) - 1; - atomic_set(&skb->count, 1); /* only one reference to this */ - skb->data_skb = skb; /* and we're our own data skb */ + skb->truesize = size; + + atomic_inc(&net_skbcount); + + /* Load the data pointers. */ + skb->head = data; + skb->data = data; + skb->tail = data; + skb->end = data + size; + + /* Set up other state */ + skb->len = 0; + skb->is_clone = 0; + skb->cloned = 0; + atomic_set(&skb->users, 1); + atomic_set(skb_datarefp(skb), 1); + return skb; + +nodata: + kmem_cache_free(skbuff_head_cache, skb); +nohead: + atomic_inc(&net_fails); + return NULL; +} + + +/* + * Slab constructor for a skb head. + */ +static inline void skb_headerinit(void *p, kmem_cache_t *cache, + unsigned long flags) +{ + struct sk_buff *skb = p; + + skb->destructor = NULL; skb->pkt_type = PACKET_HOST; /* Default type */ skb->pkt_bridged = 0; /* Not bridged */ skb->prev = skb->next = NULL; skb->list = NULL; skb->sk = NULL; - skb->truesize=size; skb->stamp.tv_sec=0; /* No idea about time */ skb->ip_summed = 0; skb->security = 0; /* By default packets are insecure */ skb->dst = NULL; - skb->destructor = NULL; memset(skb->cb, 0, sizeof(skb->cb)); skb->priority = 0; - atomic_inc(&net_skbcount); - atomic_set(&skb->users, 1); - - /* Load the data pointers. */ - skb->head = bptr; - skb->data = bptr; - skb->tail = bptr; - skb->end = bptr + len; - skb->len = 0; - skb->inclone = 0; - return skb; } /* - * Free an skbuff by memory + * Free an skbuff by memory without cleaning the state. 
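The new alloc_skb() above performs two allocations (a slab-cached head, then the data area) and unwinds with goto labels in reverse order on failure, the standard kernel idiom for multi-step setup. A stripped-down sketch of that shape, with malloc standing in for the allocators:

#include <stdlib.h>

struct pkt {
	unsigned char *data;
	size_t size;
};

static struct pkt *pkt_alloc(size_t size)
{
	struct pkt *p = malloc(sizeof(*p));	/* the "head" */
	if (!p)
		goto nohead;

	p->data = malloc(size);			/* the "data" */
	if (!p->data)
		goto nodata;

	p->size = size;
	return p;

nodata:
	free(p);	/* undo step 1 only */
nohead:
	return NULL;	/* nothing to undo */
}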
*/ - -extern inline void __kfree_skbmem(struct sk_buff *skb) +void kfree_skbmem(struct sk_buff *skb) { - /* don't do anything if somebody still uses us */ - if (atomic_dec_and_test(&skb->count)) { + if (!skb->cloned || atomic_dec_and_test(skb_datarefp(skb))) kfree(skb->head); - atomic_dec(&net_skbcount); - } + + kmem_cache_free(skbuff_head_cache, skb); + atomic_dec(&net_skbcount); } -void kfree_skbmem(struct sk_buff *skb) -{ - void * addr = skb->head; +/* + * Free an sk_buff. Release anything attached to the buffer. Clean the state. + */ - /* don't do anything if somebody still uses us */ - if (atomic_dec_and_test(&skb->count)) { - int free_head = (skb->inclone != SKB_CLONE_INLINE); +void __kfree_skb(struct sk_buff *skb) +{ + if (skb->list) + printk(KERN_WARNING "Warning: kfree_skb passed an skb still " + "on a list (from %p).\n", __builtin_return_address(0)); - /* free the skb that contains the actual data if we've clone()'d */ - if (skb->data_skb != skb) { - addr = skb; - __kfree_skbmem(skb->data_skb); - } - if (free_head) - kfree(addr); - atomic_dec(&net_skbcount); - } + dst_release(skb->dst); + if(skb->destructor) + skb->destructor(skb); + skb_headerinit(skb, NULL, 0); /* clean state */ + kfree_skbmem(skb); } /* @@ -228,32 +219,24 @@ void kfree_skbmem(struct sk_buff *skb) struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) { struct sk_buff *n; - int inbuff = 0; - if (!skb->inclone && skb_tailroom(skb) >= sizeof(struct sk_buff)) { - n = ((struct sk_buff *) skb->end) - 1; - skb->end -= sizeof(struct sk_buff); - skb->inclone = SKB_CLONE_ORIG; - inbuff = SKB_CLONE_INLINE; - } else { - n = kmalloc(sizeof(*n), gfp_mask); - if (!n) - return NULL; - } + n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); + if (!n) + return NULL; + memcpy(n, skb, sizeof(*n)); - atomic_set(&n->count, 1); - skb = skb->data_skb; - atomic_inc(&skb->count); + atomic_inc(skb_datarefp(skb)); + skb->cloned = 1; + atomic_inc(&net_allocs); atomic_inc(&net_skbcount); dst_clone(n->dst); - n->data_skb = skb; + n->cloned = 1; n->next = n->prev = NULL; n->list = NULL; n->sk = NULL; - n->tries = 0; + n->is_clone = 1; atomic_set(&n->users, 1); - n->inclone = inbuff; n->destructor = NULL; return n; } @@ -287,6 +270,7 @@ struct sk_buff *skb_copy(struct sk_buff *skb, int gfp_mask) skb_put(n,skb->len); /* Copy the bytes */ memcpy(n->head,skb->head,skb->end-skb->head); + n->csum = skb->csum; n->list=NULL; n->sk=NULL; n->when=skb->when; @@ -302,7 +286,7 @@ struct sk_buff *skb_copy(struct sk_buff *skb, int gfp_mask) n->ack_seq=skb->ack_seq; memcpy(n->cb, skb->cb, sizeof(skb->cb)); n->used=skb->used; - n->tries=0; + n->is_clone=0; atomic_set(&n->users, 1); n->pkt_type=skb->pkt_type; n->stamp=skb->stamp; @@ -321,7 +305,7 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, int newheadroom) * Allocate the copy buffer */ - n=alloc_skb(skb->truesize+newheadroom-headroom-sizeof(struct sk_buff), GFP_ATOMIC); + n=alloc_skb(skb->truesize+newheadroom-headroom, GFP_ATOMIC); if(n==NULL) return NULL; @@ -352,7 +336,7 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, int newheadroom) n->end_seq=skb->end_seq; n->ack_seq=skb->ack_seq; n->used=skb->used; - n->tries=0; + n->is_clone=0; atomic_set(&n->users, 1); n->pkt_type=skb->pkt_type; n->stamp=skb->stamp; @@ -361,3 +345,27 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, int newheadroom) return n; } + +#if 0 +/* + * Tune the memory allocator for a new MTU size. 
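The clone scheme in this file is copy-on-nothing: skb_clone() duplicates only the small header and takes a reference on the shared data, and kfree_skbmem() frees the data buffer only when the last reference drops. A user-space sketch of that split ownership (a plain int stands in for atomic_t, so this assumes a single thread; the kernel also stores the count inline after the data rather than in a separate allocation):

#include <stdlib.h>
#include <string.h>

struct buf {
	unsigned char *data;
	int *dataref;	/* shared count on 'data' */
	int cloned;
};

static struct buf *buf_alloc(size_t size)
{
	struct buf *b = malloc(sizeof(*b));	/* error handling elided */
	b->data = malloc(size);
	b->dataref = malloc(sizeof(int));
	*b->dataref = 1;
	b->cloned = 0;
	return b;
}

static struct buf *buf_clone(struct buf *b)
{
	struct buf *n = malloc(sizeof(*n));
	if (!n)
		return NULL;
	memcpy(n, b, sizeof(*n));	/* copy the header only */
	(*b->dataref)++;		/* data is now shared */
	b->cloned = n->cloned = 1;
	return n;
}

static void buf_free(struct buf *b)
{
	/* Free the payload only when the last user goes away. */
	if (!b->cloned || --(*b->dataref) == 0) {
		free(b->dataref);
		free(b->data);
	}
	free(b);
}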
+ */ +void skb_add_mtu(int mtu) +{ + /* Must match allocation in alloc_skb */ + mtu = ((mtu + 15) & ~15) + sizeof(atomic_t); + + kmem_add_cache_size(mtu); +} +#endif + +__initfunc(void skb_init(void)) +{ + skbuff_head_cache = kmem_cache_create("skbuff_head_cache", + sizeof(struct sk_buff), + 0, + SLAB_HWCACHE_ALIGN, + skb_headerinit, NULL); + if (!skbuff_head_cache) + panic("cannot create skbuff cache"); +} diff --git a/net/core/sock.c b/net/core/sock.c index f940e5a80..7707c70d0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -663,31 +663,13 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, unsigne goto failure; /* - * FIXME: Check 1003.1g should we deliver - * a signal here ??? + * We should send SIGPIPE in these cases according to + * 1003.1g draft 6.4. If we (the user) did a shutdown() + * call however we should not. * - * Alan, could we solve this question once and forever? - * - * I believe, datagram sockets should never - * generate SIGPIPE. Moreover, I DO think that - * TCP is allowed to generate it only on write() - * call, but never on send/sendto/sendmsg. - * (btw, Solaris generates it even on read() :-)) - * - * The reason is that SIGPIPE is global flag, - * so that library function using sockets (f.e. syslog()), - * must save/disable it on entry and restore on exit. - * As result, signal arriving for another thread will - * be lost. Generation it on write() is still necessary - * because a lot of stupid programs never check write() - * return value. - * - * Seems, SIGPIPE is very bad idea, sort of gets(). - * At least, we could have an option disabling - * this behaviour on per-socket and/or per-message base. - * BTW it is very easy - MSG_SIGPIPE flag, which - * always set by read/write and checked here. - * --ANK + * Note: This routine isnt just used for datagrams and + * anyway some datagram protocols have a notion of + * close down. */ err = -EPIPE; @@ -699,7 +681,7 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, unsigne else { /* The buffer get won't block, or use the atomic queue. It does produce annoying no free page messages still.... */ - skb = sock_wmalloc(sk, size, 0 , GFP_BUFFER); + skb = sock_wmalloc(sk, size, 0, GFP_BUFFER); if (!skb) skb=sock_wmalloc(sk, fallback, 0, sk->allocation); } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ef1c44620..6667b8d72 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -274,7 +274,7 @@ static int inet_autobind(struct sock *sk) sk->num = sk->prot->good_socknum(); if (sk->num == 0) return(-EAGAIN); - sk->dummy_th.source = htons(sk->num); + sk->sport = htons(sk->num); sk->prot->hash(sk); add_to_prot_sklist(sk); } @@ -304,6 +304,7 @@ int inet_listen(struct socket *sock, int backlog) if (sk->state != TCP_LISTEN) { sk->ack_backlog = 0; sk->state = TCP_LISTEN; + dst_release(xchg(&sk->dst_cache, NULL)); sk->prot->rehash(sk); add_to_prot_sklist(sk); } @@ -348,7 +349,6 @@ static int inet_create(struct socket *sock, int protocol) switch (sock->type) { case SOCK_STREAM: - /* Note for tcp that also wiped the dummy_th block for us. 
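skb_init() above creates a dedicated slab cache whose constructor (skb_headerinit) pre-initializes every header, and __kfree_skb() re-runs the constructor before returning an object to the cache, so the allocation fast path never pays for a full re-initialization. A toy free-list cache showing the same contract (names are illustrative):

#include <stdlib.h>

struct obj { int state; struct obj *free_next; };

static struct obj *freelist;

static void obj_ctor(struct obj *o)
{
	o->state = 0;	/* the "constructed" default */
}

static struct obj *cache_alloc(void)
{
	struct obj *o = freelist;
	if (o) {
		freelist = o->free_next;
		return o;	/* already constructed: no init needed */
	}
	o = malloc(sizeof(*o));
	if (o)
		obj_ctor(o);
	return o;
}

static void cache_free(struct obj *o)
{
	obj_ctor(o);	/* restore defaults before recycling */
	o->free_next = freelist;
	freelist = o;
}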
*/ if (protocol && protocol != IPPROTO_TCP) goto free_and_noproto; protocol = IPPROTO_TCP; @@ -412,17 +412,13 @@ static int inet_create(struct socket *sock, int protocol) sk->ip_mc_index=0; sk->ip_mc_list=NULL; - /* Speed up by setting some standard state for the dummy_th - * if TCP uses it (maybe move to tcp_init later) - */ - if (sk->num) { /* It assumes that any protocol which allows * the user to assign a number at socket * creation time automatically * shares. */ - sk->dummy_th.source = htons(sk->num); + sk->sport = htons(sk->num); /* Add to protocol hash chains. */ sk->prot->hash(sk); @@ -552,9 +548,9 @@ static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) return -EADDRINUSE; sk->num = snum; - sk->dummy_th.source = htons(snum); + sk->sport = htons(snum); sk->daddr = 0; - sk->dummy_th.dest = 0; + sk->dport = 0; sk->prot->rehash(sk); add_to_prot_sklist(sk); dst_release(sk->dst_cache); @@ -753,13 +749,13 @@ static int inet_getname(struct socket *sock, struct sockaddr *uaddr, if (peer) { if (!tcp_connected(sk->state)) return(-ENOTCONN); - sin->sin_port = sk->dummy_th.dest; + sin->sin_port = sk->dport; sin->sin_addr.s_addr = sk->daddr; } else { __u32 addr = sk->rcv_saddr; if (!addr) addr = sk->saddr; - sin->sin_port = sk->dummy_th.source; + sin->sin_port = sk->sport; sin->sin_addr.s_addr = addr; } *uaddr_len = sizeof(*sin); @@ -798,7 +794,8 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, int size, struct sock *sk = sock->sk; if (sk->shutdown & SEND_SHUTDOWN) { - send_sig(SIGPIPE, current, 1); + if (!(msg->msg_flags&MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 1); return(-EPIPE); } if (sk->prot->sendmsg == NULL) diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 7ec60a5be..cd9b5ba21 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: policy rules. * - * Version: $Id: fib_rules.c,v 1.3 1998/03/08 05:56:17 davem Exp $ + * Version: $Id: fib_rules.c,v 1.4 1998/03/21 07:27:58 davem Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * @@ -86,7 +86,7 @@ int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) rtm->rtm_dst_len == r->r_dst_len && (!rta[RTA_DST-1] || memcmp(RTA_DATA(rta[RTA_DST-1]), &r->r_dst, 4) == 0) && rtm->rtm_tos == r->r_tos && - rtm->rtm_type == r->r_action && + (!rtm->rtm_type || rtm->rtm_type == r->r_action) && (!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) && (!rta[RTA_IFNAME-1] || strcmp(RTA_DATA(rta[RTA_IFNAME-1]), r->r_ifname) == 0) && (!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) { diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index e6831adb8..21205362f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -5,7 +5,7 @@ * * The IP fragmentation functionality. * - * Version: $Id: ip_fragment.c,v 1.32 1998/03/08 05:56:21 davem Exp $ + * Version: $Id: ip_fragment.c,v 1.33 1998/03/19 08:34:08 davem Exp $ * * Authors: Fred N. 
van Kempen <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox <Alan.Cox@linux.org> @@ -430,11 +430,8 @@ struct sk_buff *ip_defrag(struct sk_buff *skb) qp->ihlen = ihl; memcpy(qp->iph, iph, ihl+8); } - del_timer(&qp->timer); - qp->timer.expires = jiffies + sysctl_ipfrag_time; /* about 30 seconds */ - qp->timer.data = (unsigned long) qp; /* pointer to queue */ - qp->timer.function = ip_expire; /* expire function */ - add_timer(&qp->timer); + /* about 30 seconds */ + mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time); } else { /* If we failed to create it, then discard the frame. */ if ((qp = ip_create(skb, iph)) == NULL) { diff --git a/net/ipv4/ip_fw.c b/net/ipv4/ip_fw.c index 4eb41c325..b364f66de 100644 --- a/net/ipv4/ip_fw.c +++ b/net/ipv4/ip_fw.c @@ -683,11 +683,6 @@ static int insert_in_chain(struct ip_fw *volatile* chainptr, struct ip_fw *frwl, if ((ftmp->fw_vianame)[0]) { if (!(ftmp->fw_viadev = dev_get(ftmp->fw_vianame))) ftmp->fw_viadev = (struct device *) -1; - } else if (ftmp->fw_via.s_addr) { - if (!(ftmp->fw_viadev = ip_dev_find(ftmp->fw_via.s_addr))) - ftmp->fw_viadev = (struct device *) -1; - else - memcpy(ftmp->fw_vianame, ftmp->fw_viadev->name, IFNAMSIZ); } else ftmp->fw_viadev = NULL; @@ -732,11 +727,6 @@ static int append_to_chain(struct ip_fw *volatile* chainptr, struct ip_fw *frwl, if ((ftmp->fw_vianame)[0]) { if (!(ftmp->fw_viadev = dev_get(ftmp->fw_vianame))) ftmp->fw_viadev = (struct device *) -1; - } else if (ftmp->fw_via.s_addr) { - if (!(ftmp->fw_viadev = ip_dev_find(ftmp->fw_via.s_addr))) - ftmp->fw_viadev = (struct device *) -1; - else - memcpy(ftmp->fw_vianame, ftmp->fw_viadev->name, IFNAMSIZ); } else ftmp->fw_viadev = NULL; diff --git a/net/ipv4/ip_masq.c b/net/ipv4/ip_masq.c index dc367a289..cf92b1638 100644 --- a/net/ipv4/ip_masq.c +++ b/net/ipv4/ip_masq.c @@ -1819,13 +1819,9 @@ int ip_masq_ctl(int optname, void *arg, int arglen) struct ip_fw_masqctl *mctl = arg; int ret = EINVAL; - ip_masq_lockz(&__ip_masq_lock, &masq_wait, 0); - if (1) /* (mctl->mctl_action == IP_MASQ_MOD_CTL) */ ret = ip_masq_mod_ctl(optname, mctl, arglen); - ip_masq_unlockz(&__ip_masq_lock, &masq_wait, 0); - return ret; } diff --git a/net/ipv4/ip_masq_autofw.c b/net/ipv4/ip_masq_autofw.c index 30493d4cd..27b98bb03 100644 --- a/net/ipv4/ip_masq_autofw.c +++ b/net/ipv4/ip_masq_autofw.c @@ -119,10 +119,8 @@ static __inline__ void ip_autofw_update_out (__u32 who, __u32 where, __u16 port, { if (af->flags & IP_AUTOFW_USETIME) { - if (af->timer.expires) - del_timer(&af->timer); - af->timer.expires=jiffies+IP_AUTOFW_EXPIRE; - add_timer(&af->timer); + mod_timer(&af->timer, + jiffies+IP_AUTOFW_EXPIRE); } af->flags|=IP_AUTOFW_ACTIVE; af->lastcontact=where; @@ -139,9 +137,7 @@ static __inline__ void ip_autofw_update_in (__u32 where, __u16 port, __u16 proto af=ip_autofw_check_range(where, port,protocol); if (af) { - del_timer(&af->timer); - af->timer.expires=jiffies+IP_AUTOFW_EXPIRE; - add_timer(&af->timer); + mod_timer(&af->timer, jiffies+IP_AUTOFW_EXPIRE); } } #endif diff --git a/net/ipv4/ip_masq_mod.c b/net/ipv4/ip_masq_mod.c index 2265161f3..f6a50dfc6 100644 --- a/net/ipv4/ip_masq_mod.c +++ b/net/ipv4/ip_masq_mod.c @@ -275,7 +275,7 @@ struct ip_masq_mod * ip_masq_mod_getbyname(const char *mmod_name) IP_MASQ_DEBUG(1, "searching mmod_name \"%s\"\n", mmod_name); - for (mmod=ip_masq_mod_reg_base; mmod ; mmod=mmod->next) { + for (mmod=ip_masq_mod_reg_base; mmod ; mmod=mmod->next_reg) { if (mmod->mmod_ctl && *(mmod_name) && (strcmp(mmod_name, mmod->mmod_name)==0)) { /* HIT */ diff --git a/net/ipv4/ip_output.c 
b/net/ipv4/ip_output.c index 63fbbfe1e..69179738e 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -81,46 +81,24 @@ int sysctl_ip_dynaddr = 0; int ip_id_count = 0; -int ip_build_pkt(struct sk_buff *skb, struct sock *sk, u32 saddr, u32 daddr, - struct ip_options *opt) +/* Generate a checksum for an outgoing IP datagram. */ +__inline__ void ip_send_check(struct iphdr *iph) { - struct rtable *rt; - u32 final_daddr = daddr; + iph->check = 0; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); +} + +void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, + u32 saddr, u32 daddr, struct ip_options *opt) +{ + struct rtable *rt = (struct rtable *)skb->dst; struct iphdr *iph; - int err; - if (opt && opt->srr) - daddr = opt->faddr; - - err = ip_route_output(&rt, daddr, saddr, RT_TOS(sk->ip_tos) | - RTO_CONN | sk->localroute, sk->bound_dev_if); - if (err) - { - ip_statistics.IpOutNoRoutes++; - return err; - } - - if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { - ip_rt_put(rt); - ip_statistics.IpOutNoRoutes++; - return -ENETUNREACH; - } - - skb->dst = dst_clone(&rt->u.dst); - skb_reserve(skb, (rt->u.dst.dev->hard_header_len+15)&~15); - - /* - * Now build the IP header. - */ - - /* - * Build the IP addresses - */ - + /* Build the IP header. */ if (opt) - iph=(struct iphdr *)skb_put(skb,sizeof(struct iphdr) + opt->optlen); + iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen); else - iph=(struct iphdr *)skb_put(skb,sizeof(struct iphdr)); + iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr)); iph->version = 4; iph->ihl = 5; @@ -133,92 +111,19 @@ int ip_build_pkt(struct sk_buff *skb, struct sock *sk, u32 saddr, u32 daddr, iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; iph->protocol = sk->protocol; + iph->tot_len = htons(skb->len); + iph->id = htons(ip_id_count++); skb->nh.iph = iph; - skb->h.raw = (unsigned char*)(iph+1); - if (opt && opt->optlen) - { + if (opt && opt->optlen) { iph->ihl += opt->optlen>>2; - skb->h.raw += opt->optlen; - ip_options_build(skb, opt, final_daddr, rt, 0); + ip_options_build(skb, opt, daddr, rt, 0); } - - ip_rt_put(rt); - return 0; -} -/* - * This routine builds the appropriate hardware/IP headers for - * the routine. - */ -int ip_build_header(struct sk_buff *skb, struct sock *sk) -{ - struct rtable *rt; - struct ip_options *opt = sk->opt; - u32 daddr = sk->daddr; - u32 final_daddr = daddr; - struct iphdr *iph; - int err; - - if (opt && opt->srr) - daddr = opt->faddr; - - rt = (struct rtable*)sk->dst_cache; - - if (!rt || rt->u.dst.obsolete) { - sk->dst_cache = NULL; - ip_rt_put(rt); - err = ip_route_output(&rt, daddr, sk->saddr, RT_TOS(sk->ip_tos) | - RTO_CONN | sk->localroute, sk->bound_dev_if); - if (err) - return err; - sk->dst_cache = &rt->u.dst; - } - - if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { - sk->dst_cache = NULL; - ip_rt_put(rt); - ip_statistics.IpOutNoRoutes++; - return -ENETUNREACH; - } - - skb->dst = dst_clone(sk->dst_cache); - skb_reserve(skb, MAX_HEADER); - - /* - * Now build the IP header. 
- */ - - /* - * Build the IP addresses - */ - - if (opt) - iph=(struct iphdr *)skb_put(skb,sizeof(struct iphdr) + opt->optlen); - else - iph=(struct iphdr *)skb_put(skb,sizeof(struct iphdr)); - - iph->version = 4; - iph->ihl = 5; - iph->tos = sk->ip_tos; - iph->frag_off = 0; - if (sk->ip_pmtudisc == IP_PMTUDISC_WANT && - !(rt->u.dst.mxlock&(1<<RTAX_MTU))) - iph->frag_off |= htons(IP_DF); - iph->ttl = sk->ip_ttl; - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; - iph->protocol = sk->protocol; - skb->nh.iph = iph; - skb->h.raw = (unsigned char*)(iph+1); - - if (!opt || !opt->optlen) - return 0; - iph->ihl += opt->optlen>>2; - skb->h.raw += opt->optlen; - ip_options_build(skb, opt, final_daddr, rt, 0); + ip_send_check(iph); - return 0; + /* Send it out. */ + skb->dst->output(skb); } int __ip_finish_output(struct sk_buff *skb) @@ -322,78 +227,101 @@ int ip_acct_output(struct sk_buff *skb) } #endif -/* - * Generate a checksum for an outgoing IP datagram. - */ - -void ip_send_check(struct iphdr *iph) -{ - iph->check = 0; - iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); -} - - - -/* - * Queues a packet to be sent, and starts the transmitter if necessary. +/* Queues a packet to be sent, and starts the transmitter if necessary. * This routine also needs to put in the total length and compute the - * checksum + * checksum. We use to do this in two stages, ip_build_header() then + * this, but that scheme created a mess when routes disappeared etc. + * So we do it all here, and the TCP send engine has been changed to + * match. (No more unroutable FIN disasters, etc. wheee...) This will + * most likely make other reliable transport layers above IP easier + * to implement under Linux. */ - void ip_queue_xmit(struct sk_buff *skb) { struct sock *sk = skb->sk; - struct rtable *rt = (struct rtable*)skb->dst; + struct ip_options *opt = sk->opt; + struct rtable *rt; struct device *dev; + struct iphdr *iph; unsigned int tot_len; - struct iphdr *iph = skb->nh.iph; - tot_len = skb->len; - iph->tot_len = htons(tot_len); - iph->id = htons(ip_id_count++); + /* Make sure we can route this packet. */ + rt = (struct rtable *) sk->dst_cache; + if(rt == NULL || rt->u.dst.obsolete) { + u32 daddr; - if (rt->u.dst.obsolete) { - /* Ugly... ugly... but what can I do? - Essentially it is "ip_reroute_output" function. --ANK - */ - struct rtable *nrt; - if (ip_route_output(&nrt, rt->key.dst, rt->key.src, - rt->key.tos | RTO_CONN, - sk?sk->bound_dev_if:0)) - goto drop; - skb->dst = &nrt->u.dst; + sk->dst_cache = NULL; ip_rt_put(rt); - rt = nrt; + + /* Use correct destination address if we have options. */ + daddr = sk->daddr; + if(opt && opt->srr) + daddr = opt->faddr; + + /* If this fails, retransmit mechanism of transport layer will + * keep trying until route appears or the connection times itself + * out. + */ + if(ip_route_output(&rt, daddr, sk->saddr, + RT_TOS(sk->ip_tos) | RTO_CONN | sk->localroute, + sk->bound_dev_if)) + goto drop; + sk->dst_cache = &rt->u.dst; + } + if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) + goto no_route; + + /* We have a route, so grab a reference. */ + skb->dst = dst_clone(sk->dst_cache); + + /* OK, we know where to send it, allocate and build IP header. */ + iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? 
opt->optlen : 0)); + iph->version = 4; + iph->ihl = 5; + iph->tos = sk->ip_tos; + iph->frag_off = 0; + if(sk->ip_pmtudisc == IP_PMTUDISC_WANT && !(rt->u.dst.mxlock & (1 << RTAX_MTU))) + iph->frag_off |= __constant_htons(IP_DF); + iph->ttl = sk->ip_ttl; + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + iph->protocol = sk->protocol; + skb->nh.iph = iph; + /* Transport layer set skb->h.foo itself. */ + + if(opt && opt->optlen) { + iph->ihl += opt->optlen >> 2; + ip_options_build(skb, opt, sk->daddr, rt, 0); } + tot_len = skb->len; + iph->tot_len = htons(tot_len); + iph->id = htons(ip_id_count++); + dev = rt->u.dst.dev; - if (call_out_firewall(PF_INET, dev, iph, NULL,&skb) < FW_ACCEPT) + if (call_out_firewall(PF_INET, dev, iph, NULL, &skb) < FW_ACCEPT) goto drop; #ifdef CONFIG_NET_SECURITY - /* - * Add an IP checksum (must do this before SECurity because - * of possible tunneling) + /* Add an IP checksum (must do this before SECurity because + * of possible tunneling). */ - ip_send_check(iph); - - if (call_out_firewall(PF_SECURITY, NULL, NULL, (void *) 4, &skb)<FW_ACCEPT) + if (call_out_firewall(PF_SECURITY, NULL, NULL, (void *) 4, &skb) < FW_ACCEPT) goto drop; - iph = skb->nh.iph; - /* don't update tot_len, as the dev->mtu is already decreased */ + /* Don't update tot_len, as the dev->mtu is already decreased. */ #endif - + /* This can happen when the transport layer has segments queued + * with a cached route, and by the time we get here things are + * re-routed to a device with a different MTU than the original + * device. Sick, but we must cover it. + */ if (skb_headroom(skb) < dev->hard_header_len && dev->hard_header) { struct sk_buff *skb2; - /* ANK: It is almost impossible, but - * if you loaded module device with hh_len > MAX_HEADER, - * and if a route changed to this device, - * and if (uh...) TCP had segments queued on this route... - */ - skb2 = skb_realloc_headroom(skb, (dev->hard_header_len+15)&~15); + + skb2 = skb_realloc_headroom(skb, (dev->hard_header_len + 15) & ~15); kfree_skb(skb); if (skb2 == NULL) return; @@ -401,40 +329,35 @@ void ip_queue_xmit(struct sk_buff *skb) iph = skb->nh.iph; } - /* - * Do we need to fragment. Again this is inefficient. - * We need to somehow lock the original buffer and use - * bits of it. + /* Do we need to fragment. Again this is inefficient. We + * need to somehow lock the original buffer and use bits of it. */ - if (tot_len > rt->u.dst.pmtu) goto fragment; #ifndef CONFIG_NET_SECURITY - /* - * Add an IP checksum - */ - + /* Add an IP checksum. */ ip_send_check(iph); #endif - - if (sk) - skb->priority = sk->priority; + skb->priority = sk->priority; skb->dst->output(skb); return; fragment: - if ((iph->frag_off & htons(IP_DF))) - { + if ((iph->frag_off & htons(IP_DF)) != 0) { printk(KERN_DEBUG "sending pkt_too_big to self\n"); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(rt->u.dst.pmtu)); goto drop; } - ip_fragment(skb, skb->dst->output); return; +no_route: + sk->dst_cache = NULL; + ip_rt_put(rt); + ip_statistics.IpOutNoRoutes++; + /* Fall through... */ drop: kfree_skb(skb); } @@ -948,14 +871,7 @@ struct sk_buff * ip_reply(struct sk_buff *skb, int payload) reply->dst = &rt->u.dst; skb_reserve(reply, (rt->u.dst.dev->hard_header_len+15)&~15); - /* - * Now build the IP header. - */ - - /* - * Build the IP addresses - */ - + /* Now build the IP header. 
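ip_send_check(), moved and inlined earlier in this file, computes the standard RFC 1071 Internet checksum: zero the check field, then take the one's-complement sum of the header as 16-bit words and complement the result. A portable (if slow, compared to ip_fast_csum) version:

#include <stddef.h>
#include <stdint.h>

/* One's-complement checksum over 'words' 16-bit words. */
static uint16_t ip_checksum(const uint16_t *data, size_t words)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i < words; i++)
		sum += data[i];
	/* Fold the carries back in until they are gone. */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

/* Usage, mirroring ip_send_check(): zero iph->check first, then
 * iph->check = ip_checksum((uint16_t *)iph, iph->ihl * 2);
 * (ihl counts 32-bit words, hence twice as many 16-bit words).
 */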
*/ reply->nh.iph = iph = (struct iphdr *)skb_put(reply, iphlen); iph->version = 4; @@ -966,6 +882,7 @@ struct sk_buff * ip_reply(struct sk_buff *skb, int payload) iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; iph->protocol = skb->nh.iph->protocol; + iph->id = htons(ip_id_count++); ip_options_build(reply, &replyopts.opt, daddr, rt, 0); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 221207205..0ea231adf 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -59,7 +59,7 @@ static inline void get__openreq(struct sock *sk, struct open_request *req, " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu", i, (long unsigned int)req->af.v4_req.loc_addr, - ntohs(sk->dummy_th.source), + ntohs(sk->sport), (long unsigned int)req->af.v4_req.rmt_addr, req->rmt_port, TCP_SYN_RECV, @@ -83,8 +83,8 @@ static inline void get__sock(struct sock *sp, char *tmpbuf, int i, int format) dest = sp->daddr; src = sp->rcv_saddr; - destp = sp->dummy_th.dest; - srcp = sp->dummy_th.source; + destp = sp->dport; + srcp = sp->sport; /* FIXME: The fact that retransmit_timer occurs as a field * in two different parts of the socket structure is, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 8ce4a95f4..464090776 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -104,6 +104,7 @@ int ip_rt_redirect_load = HZ/50; int ip_rt_redirect_silence = ((HZ/50) << (9+1)); int ip_rt_error_cost = HZ; int ip_rt_error_burst = 5*HZ; +int ip_rt_gc_elasticity = 8; static unsigned long rt_deadline = 0; @@ -398,10 +399,10 @@ static int rt_garbage_collect(void) last_gc = now; if (atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh) - expire = ip_rt_gc_timeout; + expire = ip_rt_gc_timeout>>1; out: - expire >>= 1; + expire -= expire>>ip_rt_gc_elasticity; end_bh_atomic(); return (atomic_read(&ipv4_dst_ops.entries) > ip_rt_max_size); } @@ -1740,6 +1741,9 @@ ctl_table ipv4_route_table[] = { {NET_IPV4_ROUTE_ERROR_BURST, "error_burst", &ip_rt_error_burst, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity", + &ip_rt_gc_elasticity, sizeof(int), 0644, NULL, + &proc_dointvec}, {0} }; #endif diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 767c5d00b..da64fc186 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -47,6 +47,7 @@ extern int sysctl_tcp_cong_avoidance; extern int sysctl_tcp_hoe_retransmits; extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; +extern int sysctl_tcp_sack; extern int sysctl_tcp_keepalive_time; extern int sysctl_tcp_keepalive_probes; extern int sysctl_tcp_max_ka_probes; @@ -104,6 +105,9 @@ ctl_table ipv4_table[] = { {NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling", &sysctl_tcp_window_scaling, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_TCP_SACK, "tcp_sack", + &sysctl_tcp_sack, sizeof(int), 0644, NULL, + &proc_dointvec}, {NET_IPV4_TCP_VEGAS_CONG_AVOID, "tcp_vegas_cong_avoid", &sysctl_tcp_cong_avoidance, sizeof(int), 0644, NULL, &tcp_sysctl_congavoid }, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b20df83d2..d57b7e3ef 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.96 1998/03/16 02:25:55 davem Exp $ + * Version: $Id: tcp.c,v 1.104 1998/03/22 22:10:30 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -668,7 +668,7 @@ static int wait_for_tcp_connect(struct sock * sk, int flags) return sock_error(sk); if((1 << sk->state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { - if(sk->keepopen) + if(sk->keepopen && !(flags&MSG_NOSIGNAL)) send_sig(SIGPIPE, tsk, 0); return -EPIPE; } @@ -733,15 +733,25 @@ static void wait_for_tcp_memory(struct sock * sk) int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int mss_now = sk->mss; int err = 0; int copied = 0; - struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); /* Wait for a connection to finish. */ if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) if((err = wait_for_tcp_connect(sk, flags)) != 0) return err; + /* The socket is locked, nothing can change the state of pending + * SACKs or IP options. + */ + if(tp->sack_ok && tp->num_sacks) + mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + + (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); + if(sk->opt && sk->opt->optlen) + mss_now -= (sk->opt->optlen); + /* Ok commence sending. */ while(--iovlen >= 0) { int seglen=iov->iov_len; @@ -769,22 +779,19 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) */ if (tp->send_head && !(flags & MSG_OOB)) { skb = sk->write_queue.prev; - copy = skb->tail - - ((unsigned char *)(skb->h.th) + - tp->tcp_header_len); - /* This window_seq test is somewhat dangerous - * If the remote does SWS avoidance we should + copy = skb->len; + /* If the remote does SWS avoidance we should * queue the best we can if not we should in * fact send multiple packets... - * a method for detecting this would be most - * welcome + * A method for detecting this would be most + * welcome. */ if (skb_tailroom(skb) > 0 && - (sk->mss - copy) > 0 && + (mss_now - copy) > 0 && tp->snd_nxt < skb->end_seq) { - int last_byte_was_odd = (copy & 1); + int last_byte_was_odd = (copy % 4); - copy = sk->mss - copy; + copy = mss_now - copy; if(copy > skb_tailroom(skb)) copy = skb_tailroom(skb); if(copy > seglen) @@ -793,12 +800,8 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) if(copy_from_user(skb_put(skb, copy), from, copy)) err = -EFAULT; - skb->csum = csum_partial( - (((unsigned char *)skb->h.th) + - tp->tcp_header_len), - (skb->tail - - (((unsigned char *)skb->h.th) + - tp->tcp_header_len)), 0); + skb->csum = csum_partial(skb->data, + skb->len, 0); } else { skb->csum = csum_and_copy_from_user( @@ -810,6 +813,8 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) from += copy; copied += copy; seglen -= copy; + if(!seglen && !iovlen) + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; continue; } } @@ -828,18 +833,17 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) */ copy = tp->snd_wnd - (tp->snd_nxt - tp->snd_una); if(copy >= (tp->max_window >> 1)) - copy = min(copy, sk->mss); + copy = min(copy, mss_now); else - copy = sk->mss; + copy = mss_now; if(copy > seglen) copy = seglen; - tmp = MAX_HEADER + sk->prot->max_header + - sizeof(struct sk_buff) + 15; + tmp = MAX_HEADER + sk->prot->max_header + 15; queue_it = 0; - if (copy < min(sk->mss, tp->max_window >> 1) && + if (copy < min(mss_now, tp->max_window >> 1) && !(flags & MSG_OOB)) { - tmp += min(sk->mss, tp->max_window); + tmp += min(mss_now, tp->max_window); /* What is happening here is that we want to * tack on later members of the users iovec @@ -869,35 +873,34 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) 
continue; } - /* FIXME: we need to optimize this. - * Perhaps some hints here would be good. - */ - tmp = tp->af_specific->build_net_header(sk, skb); - if (tmp < 0) { - kfree_skb(skb); - err = tmp; - goto do_interrupted; - } - - skb->h.th =(struct tcphdr *) - skb_put(skb,tp->tcp_header_len); - seglen -= copy; - tcp_build_header_data(skb->h.th, sk, seglen || iovlen); + /* Prepare control bits for TCP header creation engine. */ + TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | + ((!seglen && !iovlen) ? + TCPCB_FLAG_PSH : 0)); + TCP_SKB_CB(skb)->sacked = 0; if (flags & MSG_OOB) { - skb->h.th->urg = 1; - skb->h.th->urg_ptr = ntohs(copy); - } - + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_URG; + TCP_SKB_CB(skb)->urg_ptr = copy; + } else + TCP_SKB_CB(skb)->urg_ptr = 0; + + /* TCP data bytes are SKB_PUT() on top, later + * TCP+IP+DEV headers are SKB_PUSH()'d beneath. + * Reserve header space and checksum the data. + */ + skb_reserve(skb, MAX_HEADER + sk->prot->max_header); skb->csum = csum_and_copy_from_user(from, skb_put(skb, copy), copy, 0, &err); from += copy; copied += copy; - tp->write_seq += copy; + skb->seq = tp->write_seq; + skb->end_seq = skb->seq + copy; + /* This advances tp->write_seq for us. */ tcp_send_skb(sk, skb, queue_it); } } @@ -913,7 +916,8 @@ do_sock_err: do_shutdown: if(copied) return copied; - send_sig(SIGPIPE, current, 0); + if (!(flags&MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); return -EPIPE; do_interrupted: if(copied) @@ -1044,9 +1048,20 @@ static void cleanup_rbuf(struct sock *sk, int copied) /* We send an ACK if we can now advertise a non-zero window * which has been raised "significantly". */ - if((copied > 0) && - (copied >= tcp_receive_window(&sk->tp_pinfo.af_tcp))) - tcp_read_wakeup(sk); + if(copied > 0) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + __u32 rcv_window_now = tcp_receive_window(tp); + + /* We won't be raising the window any further than + * the window-clamp allows. Our window selection + * also keeps things a nice multiple of MSS. These + * checks are necessary to prevent spurious ACKs + * which don't advertize a larger window. + */ + if((copied >= rcv_window_now) && + ((rcv_window_now + sk->mss) <= tp->window_clamp)) + tcp_read_wakeup(sk); + } } @@ -1319,12 +1334,8 @@ static int tcp_close_state(struct sock *sk, int dead) * that we won't make the old 4*rto = almost no time - whoops * reset mistake. */ - if(dead && ns==TCP_FIN_WAIT2) { - if(sk->timer.prev && del_timer(&sk->timer)) - add_timer(&sk->timer); - else - tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout); - } + if(dead && ns == TCP_FIN_WAIT2 && !sk->timer.prev) + tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout); return send_fin; } @@ -1448,12 +1459,8 @@ void tcp_close(struct sock *sk, unsigned long timeout) /* Now that the socket is dead, if we are in the FIN_WAIT2 state * we may need to set up a timer. */ - if (sk->state==TCP_FIN_WAIT2) { - if(sk->timer.prev && del_timer(&sk->timer)) - add_timer(&sk->timer); - else - tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout); - } + if (sk->state == TCP_FIN_WAIT2 && !sk->timer.prev) + tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout); sk->dead = 1; release_sock(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4b7dcc9e9..1c34e6693 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). 
* - * Version: $Id: tcp_input.c,v 1.84 1998/03/15 03:23:20 davem Exp $ + * Version: $Id: tcp_input.c,v 1.98 1998/03/23 22:54:48 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -72,9 +72,10 @@ extern int sysctl_tcp_fin_timeout; */ int sysctl_tcp_timestamps = 1; int sysctl_tcp_window_scaling = 1; +int sysctl_tcp_sack = 1; +int sysctl_tcp_hoe_retransmits = 1; int sysctl_tcp_cong_avoidance; -int sysctl_tcp_hoe_retransmits; int sysctl_tcp_syncookies = SYNC_INIT; int sysctl_tcp_stdurg; int sysctl_tcp_rfc1337; @@ -177,7 +178,6 @@ static __inline__ void tcp_set_rto(struct tcp_opt *tp) * some modification to the RTO calculation that takes delayed * ack bais into account? This needs serious thought. -- erics */ - static __inline__ void tcp_bound_rto(struct tcp_opt *tp) { if (tp->rto > 120*HZ) @@ -187,7 +187,6 @@ static __inline__ void tcp_bound_rto(struct tcp_opt *tp) } /* WARNING: this must not be called if tp->saw_timestamp was false. */ - extern __inline__ void tcp_replace_ts_recent(struct tcp_opt *tp, __u32 end_seq) { /* From draft-ietf-tcplw-high-performance: the correct @@ -226,10 +225,7 @@ static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) return 0; } -/* - * This functions checks to see if the tcp header is actually acceptable. - */ - +/* This functions checks to see if the tcp header is actually acceptable. */ extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) { if (seq == tp->rcv_nxt) @@ -238,11 +234,7 @@ extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) return __tcp_sequence(tp, seq, end_seq); } -/* - * When we get a reset we do this. This probably is a tcp_output routine - * really. - */ - +/* When we get a reset we do this. */ static void tcp_reset(struct sock *sk, struct sk_buff *skb) { sk->zapped = 1; @@ -264,14 +256,36 @@ static void tcp_reset(struct sock *sk, struct sk_buff *skb) sk->state_change(sk); } -/* - * Look for tcp options. Normally only called on SYN and SYNACK packets. - * But, this can also be called on packets in the established flow when - * the fast version below fails. - * FIXME: surely this can be more efficient. -- erics +/* This tags the retransmission queue when SACKs arrive. */ +static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, int nsacks) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int i = nsacks; + + while(i--) { + struct sk_buff *skb = skb_peek(&sk->write_queue); + __u32 start_seq = ntohl(sp->start_seq); + __u32 end_seq = ntohl(sp->end_seq); + + while((skb != NULL) && + (skb != tp->send_head) && + (skb != (struct sk_buff *)&sk->write_queue)) { + /* We play conservative, we don't allow SACKS to partially + * tag a sequence space. + */ + if(!after(start_seq, skb->seq) && !before(end_seq, skb->end_seq)) + TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; + skb = skb->next; + } + sp++; /* Move on to the next SACK block. */ + } +} + +/* Look for tcp options. Normally only called on SYN and SYNACK packets. + * But, this can also be called on packets in the established flow when + * the fast version below fails. 
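The SACK tagging loop above leans on TCP's before()/after() helpers, which compare 32-bit sequence numbers modulo 2^32 so that wraparound is handled correctly. The whole trick is taking a signed view of the difference:

#include <stdint.h>

/* True if seq1 is strictly earlier than seq2 in sequence space. */
static int before(uint32_t seq1, uint32_t seq2)
{
	return (int32_t)(seq1 - seq2) < 0;
}

static int after(uint32_t seq1, uint32_t seq2)
{
	return before(seq2, seq1);
}

/* e.g. before(0xfffffff0, 0x10) is true: 0x10 is 32 bytes "later"
 * in sequence space, even though it is numerically smaller.
 */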
*/ - -void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp, int no_fancy) +void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, int no_fancy) { unsigned char *ptr; int length=(th->doff*4)-sizeof(struct tcphdr); @@ -281,49 +295,68 @@ void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp, int no_fancy) while(length>0) { int opcode=*ptr++; - int opsize=*ptr++; - if (length - opsize < 0) /* Don't parse partial options */ - break; - switch(opcode) { - case TCPOPT_EOL: - return; - case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ - length--; - ptr--; /* the opsize=*ptr++ above was a mistake */ - continue; - - default: - if(opsize<=2) /* Avoid silly options looping forever */ - return; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize=*ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + break; /* don't parse partial options */ switch(opcode) { - case TCPOPT_MSS: - if(opsize==TCPOLEN_MSS && th->syn) { - tp->in_mss = ntohs(*(__u16 *)ptr); - if (tp->in_mss == 0) - tp->in_mss = 536; + case TCPOPT_MSS: + if(opsize==TCPOLEN_MSS && th->syn) { + tp->in_mss = ntohs(*(__u16 *)ptr); + if (tp->in_mss == 0) + tp->in_mss = 536; + } + break; + case TCPOPT_WINDOW: + if(opsize==TCPOLEN_WINDOW && th->syn) + if (!no_fancy && sysctl_tcp_window_scaling) { + tp->wscale_ok = 1; + tp->snd_wscale = *(__u8 *)ptr; } - break; - case TCPOPT_WINDOW: - if(opsize==TCPOLEN_WINDOW && th->syn) - if (!no_fancy && sysctl_tcp_window_scaling) { - tp->wscale_ok = 1; - tp->snd_wscale = *(__u8 *)ptr; - } - break; - case TCPOPT_TIMESTAMP: - if(opsize==TCPOLEN_TIMESTAMP) { - /* Cheaper to set again then to - * test syn. Optimize this? - */ - if (sysctl_tcp_timestamps && !no_fancy) { - tp->tstamp_ok = 1; - tp->saw_tstamp = 1; - tp->rcv_tsval = ntohl(*(__u32 *)ptr); - tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4)); - } + break; + case TCPOPT_TIMESTAMP: + if(opsize==TCPOLEN_TIMESTAMP) { + if (sysctl_tcp_timestamps && !no_fancy) { + tp->tstamp_ok = 1; + tp->saw_tstamp = 1; + tp->rcv_tsval = ntohl(*(__u32 *)ptr); + tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4)); + } + } + break; + case TCPOPT_SACK_PERM: + if(opsize==TCPOLEN_SACK_PERM && th->syn) { + if (sysctl_tcp_sack && !no_fancy) { + tp->sack_ok = 1; + tp->num_sacks = 0; + } + } + break; + + case TCPOPT_SACK: + if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && + sysctl_tcp_sack && (sk != NULL) && !th->syn) { + int sack_bytes = opsize - TCPOLEN_SACK_BASE; + + if(!(sack_bytes % TCPOLEN_SACK_PERBLOCK)) { + int num_sacks = sack_bytes >> 3; + struct tcp_sack_block *sackp; + + sackp = (struct tcp_sack_block *)ptr; + tcp_sacktag_write_queue(sk, sackp, num_sacks); } - break; - } + } + }; ptr+=opsize-2; length-=opsize; }; @@ -331,13 +364,11 @@ void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp, int no_fancy) } /* Fast parse options. This hopes to only see timestamps. - * If it is wrong it falls back on tcp_parse_option(). - * This should probably get extended for timestamps as well. - * Assembly code anyone? -- erics + * If it is wrong it falls back on tcp_parse_options(). */ -static __inline__ int tcp_fast_parse_options(struct tcphdr *th, struct tcp_opt *tp) +static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp) { - /* If we didn't send out any options ignore them all */ + /* If we didn't send out any options ignore them all. 
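The reworked option parser above follows the classic TCP TLV walk: EOL ends parsing, NOP is a lone pad byte with no length octet, and everything else carries a length that must be at least 2 and fit in the remaining space. A compact, self-contained sketch of that skeleton:

#include <stddef.h>

#define TCPOPT_EOL 0
#define TCPOPT_NOP 1

/* Walk TCP options; returns 0 on a clean parse, -1 on malformed input. */
static int parse_options(const unsigned char *ptr, int length)
{
	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case TCPOPT_EOL:
			return 0;
		case TCPOPT_NOP:	/* one byte, no length octet */
			length--;
			continue;
		default:
			if (length < 2)
				return -1;	/* no room for a length octet */
			opsize = *ptr++;
			if (opsize < 2)		/* "silly options" would loop */
				return -1;
			if (opsize > length)
				return -1;	/* don't parse partial options */
			/* ... dispatch on opcode over opsize-2 payload bytes ... */
			ptr += opsize - 2;
			length -= opsize;
		}
	}
	return 0;
}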
*/ if (tp->tcp_header_len == sizeof(struct tcphdr)) return 0; if (th->doff == sizeof(struct tcphdr)>>2) { @@ -353,13 +384,14 @@ static __inline__ int tcp_fast_parse_options(struct tcphdr *th, struct tcp_opt * return 1; } } - tcp_parse_options(th,tp,0); + tcp_parse_options(sk, th, tp, 0); return 1; } -#define FLAG_DATA 0x01 -#define FLAG_WIN_UPDATE 0x02 -#define FLAG_DATA_ACKED 0x04 +#define FLAG_DATA 0x01 /* Incoming frame contained data. */ +#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ +#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ +#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ static __inline__ void clear_fast_retransmit(struct sock *sk) { @@ -372,11 +404,9 @@ static __inline__ void clear_fast_retransmit(struct sock *sk) tp->dup_acks = 0; } -/* - * NOTE: This code assumes that tp->dup_acks gets cleared when a +/* NOTE: This code assumes that tp->dup_acks gets cleared when a * retransmit timer fires. */ - static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) { struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); @@ -407,7 +437,7 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2); tp->snd_cwnd = tp->snd_ssthresh + 3; tp->high_seq = tp->snd_nxt; - tcp_do_retransmit(sk, 0); + tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); } } @@ -425,7 +455,7 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) * block on duplicate fast retransmits, and if requested * we do Hoe style secondary fast retransmits. */ - if (!before(ack,tp->high_seq) || (not_dup&FLAG_DATA) != 0) { + if (!before(ack, tp->high_seq) || (not_dup & FLAG_DATA) != 0) { /* Once we have acked all the packets up to high_seq * we are done this fast retransmit phase. * Alternatively data arrived. In this case we @@ -438,7 +468,7 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) /* After we have cleared up to high_seq we can * clear the Floyd style block. */ - if (after(ack,tp->high_seq)) + if (after(ack, tp->high_seq)) tp->high_seq = 0; } else if (tp->dup_acks >= 3) { if (sysctl_tcp_hoe_retransmits) { @@ -455,10 +485,9 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) * the only way to get here without advancing * from snd_una is if this was a window update. */ - if (ack != tp->snd_una && before(ack,tp->high_seq)) { - tcp_do_retransmit(sk, 0); - tcp_reset_xmit_timer(sk, TIME_RETRANS, - tp->rto); + if (ack != tp->snd_una && before(ack, tp->high_seq)) { + tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); } } else { /* Reno style. We didn't ack the whole @@ -589,9 +618,9 @@ static void tcp_cong_avoid_vanj(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt) } } - -static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq, - __u32 *seq_rtt) +/* Remove acknowledged frames from the retransmission queue. */ +static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, + __u32 *seq, __u32 *seq_rtt) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; @@ -600,8 +629,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq, while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) { /* If our packet is before the ack sequence we can - * discard it as it's confirmed to have arrived the - * other end. + * discard it as it's confirmed to have arrived at + * the other end. 
*/ if (after(skb->end_seq, ack)) break; @@ -613,26 +642,22 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq, * connection startup slow start one packet too * quickly. This is severely frowned upon behavior. */ - if(!skb->h.th->syn) - acked = FLAG_DATA_ACKED; - - /* FIXME: packet counting may break if we have to - * do packet "repackaging" for stacks that don't - * like overlapping packets. - */ + if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) { + acked |= FLAG_DATA_ACKED; + if(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) + acked |= FLAG_RETRANS_DATA_ACKED; + } else { + tp->retrans_head = NULL; + } tp->packets_out--; - *seq = skb->seq; *seq_rtt = now - skb->when; - skb_unlink(skb); - kfree_skb(skb); } if (acked) tp->retrans_head = NULL; - return acked; } @@ -686,41 +711,23 @@ static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp, static void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) { - struct sk_buff *skb; - long when; - - skb = skb_peek(&sk->write_queue); - when = tp->rto - (jiffies - skb->when); - - /* FIXME: This assumes that when we are retransmitting - * we should only ever respond with one packet. - * This means congestion windows should not grow - * during recovery. In 2.0.X we allow the congestion - * window to grow. It is not clear to me which - * decision is correct. The RFCs should be double - * checked as should the behavior of other stacks. - * Also note that if we do want to allow the - * congestion window to grow during retransmits - * we have to fix the call to congestion window - * updates so that it works during retransmission. + struct sk_buff *skb = skb_peek(&sk->write_queue); + long when = tp->rto - (jiffies - skb->when); + + /* Some data was ACK'd, if still retransmitting (due to a + * timeout), resend more of the retransmit queue. The + * congestion window is handled properly by that code. */ if (tp->retransmits) { tp->retrans_head = NULL; - - /* This is tricky. We are retransmiting a - * segment of a window when congestion occured. - */ - tcp_do_retransmit(sk, 0); + tcp_xmit_retransmit_queue(sk); tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); } else { tcp_reset_xmit_timer(sk, TIME_RETRANS, when); } } -/* - * This routine deals with incoming acks, but not outgoing ones. - */ - +/* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack_seq, u32 ack, int len) { @@ -805,7 +812,8 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, * where the network delay has increased suddenly. * I.e. Karn's algorithm. (SIGCOMM '87, p5.) */ - if (flag & FLAG_DATA_ACKED) { + if ((flag & FLAG_DATA_ACKED) && + !(flag & FLAG_RETRANS_DATA_ACKED)) { tp->backoff = 0; tcp_rtt_estimator(tp, seq_rtt); tcp_set_rto(tp); @@ -923,9 +931,7 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, } else { if(th->ack) { /* In this case we must reset the TIMEWAIT timer. */ - del_timer(&tw->timer); - tw->timer.expires = jiffies + TCP_TIMEWAIT_LEN; - add_timer(&tw->timer); + mod_timer(&tw->timer, jiffies + TCP_TIMEWAIT_LEN); } } return 0; /* Discard the frame. 
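A recurring cleanup in this patch (ip_defrag, ip_masq_autofw, and the TIME_WAIT hunk just above) replaces the three-step timer re-arm with a single mod_timer() call. The two forms, as they appear in the ip_defrag hunk (kernel context, not standalone code):

/* Before: three steps to re-arm a pending timer. */
del_timer(&qp->timer);
qp->timer.expires = jiffies + sysctl_ipfrag_time;
add_timer(&qp->timer);

/* After: one call; mod_timer() updates the expiry and re-adds the
 * timer whether or not it was pending.
 */
mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time);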
*/ @@ -981,9 +987,10 @@ void tcp_time_wait(struct sock *sk) tw->bound_dev_if= sk->bound_dev_if; tw->num = sk->num; tw->state = TCP_TIME_WAIT; + tw->sport = sk->sport; + tw->dport = sk->dport; tw->family = sk->family; - tw->source = sk->dummy_th.source; - tw->dest = sk->dummy_th.dest; + tw->reuse = sk->reuse; tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt; tw->af_specific = sk->tp_pinfo.af_tcp.af_specific; @@ -1098,6 +1105,175 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) }; } +/* These routines update the SACK block as out-of-order packets arrive or + * in-order packets close up the sequence space. + */ +static void tcp_sack_maybe_coalesce(struct tcp_opt *tp, struct tcp_sack_block *sp) +{ + int this_sack, num_sacks = tp->num_sacks; + struct tcp_sack_block *swalk = &tp->selective_acks[0]; + + /* If more than one SACK block, see if the recent change to SP eats into + * or hits the sequence space of other SACK blocks, if so coalesce. + */ + if(num_sacks != 1) { + for(this_sack = 0; this_sack < num_sacks; this_sack++, swalk++) { + if(swalk == sp) + continue; + + /* First case, bottom of SP moves into top of the + * sequence space of SWALK. + */ + if(between(sp->start_seq, swalk->start_seq, swalk->end_seq)) { + sp->start_seq = swalk->start_seq; + goto coalesce; + } + /* Second case, top of SP moves into bottom of the + * sequence space of SWALK. + */ + if(between(sp->end_seq, swalk->start_seq, swalk->end_seq)) { + sp->end_seq = swalk->end_seq; + goto coalesce; + } + } + } + /* SP is the only SACK, or no coalescing cases found. */ + return; + +coalesce: + /* Zap SWALK, by moving every further SACK up by one slot. + * Decrease num_sacks. + */ + for(this_sack += 1; this_sack < num_sacks; this_sack++, swalk++) { + struct tcp_sack_block *next = (swalk + 1); + swalk->start_seq = next->start_seq; + swalk->end_seq = next->end_seq; + } + tp->num_sacks--; +} + +static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) +{ + __u32 tmp; + + tmp = sack1->start_seq; + sack1->start_seq = sack2->start_seq; + sack2->start_seq = tmp; + + tmp = sack1->end_seq; + sack1->end_seq = sack2->end_seq; + sack2->end_seq = tmp; +} + +static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct tcp_sack_block *sp = &tp->selective_acks[0]; + + /* Optimize for the common case, new ofo frames arrive + * "in order". ;-) This also satisfies the requirements + * of RFC2018 about ordering of SACKs. + */ + if(sp->end_seq == skb->seq) { + sp->end_seq = skb->end_seq; + tcp_sack_maybe_coalesce(tp, sp); + } else if(sp->start_seq == skb->end_seq) { + /* Re-ordered arrival, in this case, can be optimized + * as well. + */ + sp->start_seq = skb->seq; + tcp_sack_maybe_coalesce(tp, sp); + } else { + int cur_sacks = tp->num_sacks; + int max_sacks = (tp->tstamp_ok ? 3 : 4); + + /* Oh well, we have to move things around. + * Try to find a SACK we can tack this onto. + */ + if(cur_sacks > 1) { + struct tcp_sack_block *swap = sp + 1; + int this_sack; + + for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) { + if((swap->end_seq == skb->seq) || + (swap->start_seq == skb->end_seq)) { + if(swap->end_seq == skb->seq) + swap->end_seq = skb->end_seq; + else + swap->start_seq = skb->seq; + tcp_sack_swap(sp, swap); + tcp_sack_maybe_coalesce(tp, sp); + return; + } + } + } + + /* Could not find an adjacent existing SACK, build a new one, + * put it at the front, and shift everyone else down. 
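tcp_sack_maybe_coalesce() above merges a freshly grown SACK block with any neighbor whose sequence range it now touches. The core test is plain interval overlap; a simplified merge for two intervals (ordinary comparisons for clarity, where the kernel uses the wrap-safe between() on sequence numbers):

#include <stdint.h>

struct sack_block { uint32_t start_seq, end_seq; };

/* Try to merge b into a; returns 1 and widens a if the ranges touch. */
static int sack_merge(struct sack_block *a, const struct sack_block *b)
{
	if (b->start_seq > a->end_seq || a->start_seq > b->end_seq)
		return 0;			/* disjoint */
	if (b->start_seq < a->start_seq)
		a->start_seq = b->start_seq;	/* bottom of a moves down */
	if (b->end_seq > a->end_seq)
		a->end_seq = b->end_seq;	/* top of a moves up */
	return 1;
}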
We + * always know there is at least one SACK present already here. + */ + while(cur_sacks >= 1) { + struct tcp_sack_block *this = &tp->selective_acks[cur_sacks]; + struct tcp_sack_block *prev = (this - 1); + this->start_seq = prev->start_seq; + this->end_seq = prev->end_seq; + cur_sacks--; + } + + /* Build head SACK, and we're done. */ + sp->start_seq = skb->seq; + sp->end_seq = skb->end_seq; + if(tp->num_sacks < max_sacks) + tp->num_sacks++; + } +} + +static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb) +{ + struct tcp_sack_block *sp = &tp->selective_acks[0]; + int num_sacks = tp->num_sacks; + int this_sack; + + /* We know this removed SKB will eat from the front of a SACK. */ + for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) { + if(sp->start_seq == skb->seq) + break; + } + + /* This should only happen if so many SACKs get built that some get + * pushed out before we get here, or we eat some in-sequence packets + * which are before the first SACK block. + */ + if(this_sack >= num_sacks) + return; + + sp->start_seq = skb->end_seq; + if(!before(sp->start_seq, sp->end_seq)) { + /* Zap this SACK, by moving forward any other SACKS. */ + for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) { + struct tcp_sack_block *next = (sp + 1); + sp->start_seq = next->start_seq; + sp->end_seq = next->end_seq; + } + tp->num_sacks--; + } +} + +static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct sk_buff *new_skb) +{ + struct tcp_sack_block *sp = &tp->selective_acks[0]; + int num_sacks = tp->num_sacks; + int this_sack; + + for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) { + if(sp->end_seq == old_skb->end_seq) + break; + } + if(this_sack >= num_sacks) + return; + sp->end_seq = new_skb->end_seq; +} + /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. */ @@ -1119,6 +1295,8 @@ static void tcp_ofo_queue(struct sock *sk) SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", tp->rcv_nxt, skb->seq, skb->end_seq); + if(tp->sack_ok) + tcp_sack_remove_skb(tp, skb); skb_unlink(skb); skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = skb->end_seq; @@ -1142,13 +1320,23 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) dst_confirm(sk->dst_cache); skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = skb->end_seq; - if(skb->h.th->fin) + if(skb->h.th->fin) { tcp_fin(skb, sk, skb->h.th); - else + } else { tp->delayed_acks++; + + /* Tiny-grams with PSH set make us ACK quickly. */ + if(skb->h.th->psh && (skb->len < (sk->mss >> 1))) + tp->ato = HZ/50; + } + /* This may have eaten into a SACK block. */ + if(tp->sack_ok && tp->num_sacks) + tcp_sack_remove_skb(tp, skb); tcp_ofo_queue(sk); if (skb_queue_len(&tp->out_of_order_queue) == 0) - tp->pred_flags = htonl((0x5010 << 16) | tp->snd_wnd); + tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) | + (0x10 << 16) | + tp->snd_wnd); return; } @@ -1180,25 +1368,44 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) tp->rcv_nxt, skb->seq, skb->end_seq); if (skb_peek(&tp->out_of_order_queue) == NULL) { + /* Initial out of order segment, build 1 SACK. */ + if(tp->sack_ok) { + tp->num_sacks = 1; + tp->selective_acks[0].start_seq = skb->seq; + tp->selective_acks[0].end_seq = skb->end_seq; + } skb_queue_head(&tp->out_of_order_queue,skb); } else { for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) { /* Already there.
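The tcp_sack_* helpers above keep at most four [start_seq,end_seq) blocks and fold a changed block into its neighbours. A standalone sketch of the merge rule on bare 32-bit sequence numbers (hypothetical types; wrap-safe comparison done with signed subtraction, in the spirit of the kernel's before()/between() macros):

#include <stdio.h>
#include <stdint.h>

/* Sequence-space comparisons must survive 32-bit wraparound, so "a
 * before b" is computed via signed subtraction. */
static int seq_before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static int seq_between(uint32_t s, uint32_t lo, uint32_t hi)
{
    return !seq_before(s, lo) && !seq_before(hi, s);
}

struct sack { uint32_t start, end; };

/* Grow blk by [start,end) if the ranges touch or overlap; 1 on merge. */
static int sack_try_merge(struct sack *blk, uint32_t start, uint32_t end)
{
    if (seq_between(start, blk->start, blk->end)) { /* bottom hits the block */
        if (seq_before(blk->end, end))
            blk->end = end;
        return 1;
    }
    if (seq_between(end, blk->start, blk->end)) {   /* top hits the block */
        if (seq_before(start, blk->start))
            blk->start = start;
        return 1;
    }
    return 0;
}

int main(void)
{
    struct sack s = { 1000, 2000 };
    int merged = sack_try_merge(&s, 1500, 2500);
    printf("merge [1500,2500): %d -> [%u,%u)\n",
           merged, (unsigned)s.start, (unsigned)s.end);
    return 0;
}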
*/ - if (skb->seq == skb1->seq && skb->len >= skb1->len) { - skb_append(skb1, skb); - skb_unlink(skb1); - kfree_skb(skb1); + if (skb->seq == skb1->seq) { + if (skb->len >= skb1->len) { + if(tp->sack_ok) + tcp_sack_extend(tp, skb1, skb); + skb_append(skb1, skb); + skb_unlink(skb1); + kfree_skb(skb1); + } else { + /* A duplicate, smaller than what is in the + * out-of-order queue right now, toss it. + */ + kfree_skb(skb); + } break; } if (after(skb->seq, skb1->seq)) { skb_append(skb1,skb); + if(tp->sack_ok) + tcp_sack_new_ofo_skb(sk, skb); break; } /* See if we've hit the start. If so insert. */ if (skb1 == skb_peek(&tp->out_of_order_queue)) { skb_queue_head(&tp->out_of_order_queue,skb); + if(tp->sack_ok) + tcp_sack_new_ofo_skb(sk, skb); break; } } @@ -1244,8 +1451,8 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) static void tcp_data_snd_check(struct sock *sk) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; - struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); if ((skb = tp->send_head)) { if (!after(skb->end_seq, tp->snd_una + tp->snd_wnd) && @@ -1273,6 +1480,7 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk) * - delay time <= 0.5 HZ * - we don't have a window update to send * - must send at least every 2 full sized packets + * - must send an ACK if we have any SACKs * * With an extra heuristic to handle loss of packet * situations and also helping the sender leave slow @@ -1283,8 +1491,10 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk) if (((tp->rcv_nxt - tp->rcv_wup) >= (sk->mss << 1)) || /* We will update the window "significantly" or... */ tcp_raise_window(sk) || - /* We entered "quick ACK" mode */ - tcp_in_quickack_mode(tp)) { + /* We entered "quick ACK" mode or... */ + tcp_in_quickack_mode(tp) || + /* We have pending SACKs */ + (tp->sack_ok && tp->num_sacks)) { /* Then ack it now */ tcp_send_ack(sk); } else { @@ -1446,7 +1656,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, /* * RFC1323: H1. Apply PAWS check first. */ - if (tcp_fast_parse_options(th,tp)) { + if (tcp_fast_parse_options(sk, th, tp)) { if (tp->saw_tstamp) { if (tcp_paws_discard(tp)) { if (!th->rst) { @@ -1460,10 +1670,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, flg = *(((u32 *)th) + 3); - /* - * pred_flags is 0x5?10 << 16 + snd_wnd + /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_predition is to be made - * ? will be 0 else it will be !0 + * 'S' will always be tp->tcp_header_len >> 2 + * '?' will be 0 else it will be !0 * (when there are holes in the receive * space for instance) */ @@ -1498,6 +1708,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ sk->data_ready(sk, 0); tcp_delack_estimator(tp); + + /* Tiny-grams with PSH set make us ACK quickly. 
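The pred_flags encoding documented above can be made concrete: the top nibble is the header length in 32-bit words (doff), the 0x10 is the ACK bit in the flags byte, and the low 16 bits are the expected window. A small sketch of the arithmetic in host byte order (the kernel stores the value htonl()'d and compares it against the raw fourth 32-bit word of the incoming header):

#include <stdio.h>
#include <stdint.h>

/* The 4th 32-bit word of a TCP header holds doff/reserved/flags/window.
 * If it matches a precomputed value (expected header length, ACK set,
 * nothing else, expected window), the segment can take the fast path. */
static uint32_t make_pred_flags(int tcp_header_len, uint16_t snd_wnd)
{
    return ((uint32_t)(tcp_header_len >> 2) << 28) |  /* doff nibble */
           (0x10u << 16) |                            /* ACK bit only */
           snd_wnd;                                   /* expected window */
}

int main(void)
{
    /* 20-byte base header + 12 bytes of aligned timestamp option = 32. */
    uint32_t pred = make_pred_flags(32, 4096);
    printf("pred_flags = 0x%08x\n", (unsigned)pred); /* 0x80101000: doff=8, ACK, wnd=4096 */
    return 0;
}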
*/ + if(th->psh && (skb->len < (sk->mss >> 1))) + tp->ato = HZ/50; + tp->delayed_acks++; __tcp_ack_snd_check(sk); return 0; @@ -1703,7 +1918,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->fin_seq = skb->seq; tcp_set_state(sk, TCP_ESTABLISHED); - tcp_parse_options(th,tp,0); + tcp_parse_options(sk, th, tp, 0); if (tp->wscale_ok == 0) { tp->snd_wscale = tp->rcv_wscale = 0; @@ -1712,7 +1927,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (tp->tstamp_ok) { tp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; - sk->dummy_th.doff += (TCPOLEN_TSTAMP_ALIGNED >> 2); } else tp->tcp_header_len = sizeof(struct tcphdr); if (tp->saw_tstamp) { @@ -1745,7 +1959,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, sk->mss = min(sk->mss, real_mss); } - sk->dummy_th.dest = th->source; + sk->dport = th->source; tp->copied_seq = tp->rcv_nxt; if(!sk->dead) { @@ -1763,7 +1977,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * tcp_connect. */ tcp_set_state(sk, TCP_SYN_RECV); - tcp_parse_options(th,tp,0); + tcp_parse_options(sk, th, tp, 0); if (tp->saw_tstamp) { tp->ts_recent = tp->rcv_tsval; tp->ts_recent_stamp = jiffies; @@ -1788,7 +2002,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * Note that this really has to be here and not later for PAWS * (RFC1323) to work. */ - if (tcp_fast_parse_options(th,tp)) { + if (tcp_fast_parse_options(sk, th, tp)) { /* NOTE: assumes saw_tstamp is never set if we didn't * negotiate the option. tcp_fast_parse_options() must * guarantee this. @@ -1849,7 +2063,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, case TCP_SYN_RECV: if (acceptable) { tcp_set_state(sk, TCP_ESTABLISHED); - sk->dummy_th.dest=th->source; + sk->dport = th->source; tp->copied_seq = tp->rcv_nxt; if(!sk->dead) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 91f21ff75..ee53f47d6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.109 1998/03/15 07:24:15 davem Exp $ + * Version: $Id: tcp_ipv4.c,v 1.119 1998/03/22 19:14:47 davem Exp $ * * IPv4 specific functions * @@ -62,16 +62,12 @@ extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; +extern int sysctl_tcp_sack; extern int sysctl_tcp_syncookies; extern int sysctl_ip_dynaddr; /* Check TCP sequence numbers in ICMP packets. 
*/ -#define ICMP_PARANOIA 1 -#ifndef ICMP_PARANOIA -#define ICMP_MIN_LENGTH 4 -#else #define ICMP_MIN_LENGTH 8 -#endif static void tcp_v4_send_reset(struct sk_buff *skb); @@ -120,7 +116,7 @@ static __inline__ int tcp_sk_hashfn(struct sock *sk) __u32 laddr = sk->rcv_saddr; __u16 lport = sk->num; __u32 faddr = sk->daddr; - __u16 fport = sk->dummy_th.dest; + __u16 fport = sk->dport; return tcp_hashfn(laddr, lport, faddr, fport); } @@ -365,7 +361,7 @@ static inline struct sock *__tcp_v4_lookup(struct tcphdr *th, sk = TCP_RHASH(sport); if(sk && sk->daddr == saddr && /* remote address */ - sk->dummy_th.dest == sport && /* remote port */ + sk->dport == sport && /* remote port */ sk->num == hnum && /* local port */ sk->rcv_saddr == daddr && /* local address */ (!sk->bound_dev_if || sk->bound_dev_if == dif)) @@ -377,7 +373,7 @@ static inline struct sock *__tcp_v4_lookup(struct tcphdr *th, hash = tcp_hashfn(daddr, hnum, saddr, sport); for(sk = tcp_established_hash[hash]; sk; sk = sk->next) { if(sk->daddr == saddr && /* remote address */ - sk->dummy_th.dest == sport && /* remote port */ + sk->dport == sport && /* remote port */ sk->num == hnum && /* local port */ sk->rcv_saddr == daddr && /* local address */ (!sk->bound_dev_if || sk->bound_dev_if == dif)) { @@ -389,7 +385,7 @@ static inline struct sock *__tcp_v4_lookup(struct tcphdr *th, /* Must check for a TIME_WAIT'er before going to listener hash. */ for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next) { if(sk->daddr == saddr && /* remote address */ - sk->dummy_th.dest == sport && /* remote port */ + sk->dport == sport && /* remote port */ sk->num == hnum && /* local port */ sk->rcv_saddr == daddr && /* local address */ (!sk->bound_dev_if || sk->bound_dev_if == dif)) @@ -456,8 +452,8 @@ pass2: continue; score++; } - if(s->dummy_th.dest) { - if(s->dummy_th.dest != rnum) + if(s->dport) { + if(s->dport != rnum) continue; score++; } @@ -496,12 +492,7 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) skb->h.th->source); } -/* - * From tcp.c - */ - -/* - * Check that a TCP address is unique, don't allow multiple +/* Check that a TCP address is unique, don't allow multiple * connects to/from the same address. Actually we can optimize * quite a bit, since the socket about to connect is still * in TCP_CLOSE, a tcp_bind_bucket for the local port he will @@ -509,8 +500,7 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) * The good_socknum and verify_bind scheme we use makes this * work. */ - -static int tcp_unique_address(struct sock *sk) +static int tcp_v4_unique_address(struct sock *sk) { struct tcp_bind_bucket *tb; unsigned short snum = sk->num; @@ -524,7 +514,7 @@ static int tcp_unique_address(struct sock *sk) /* Almost certainly the re-use port case, search the real hashes * so it actually scales. */ - sk = __tcp_v4_lookup(NULL, sk->daddr, sk->dummy_th.dest, + sk = __tcp_v4_lookup(NULL, sk->daddr, sk->dport, sk->rcv_saddr, snum, sk->bound_dev_if); if((sk != NULL) && (sk->state != TCP_LISTEN)) retval = 0; @@ -535,19 +525,15 @@ static int tcp_unique_address(struct sock *sk) return retval; } - -/* - * This will initiate an outgoing connection. - */ - +/* This will initiate an outgoing connection. 
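The lookup code above demultiplexes on the full (local address, local port, remote address, remote port) tuple, hashed into a power-of-two table of chains. A toy rendering of that scheme (the mixing function here is made up for illustration; the kernel's tcp_hashfn() differs, only the shape of the approach is the point):

#include <stdio.h>
#include <stdint.h>

#define HTABLE_SIZE 256  /* power of two, like TCP_HTABLE_SIZE */

/* Hypothetical 4-tuple hash: fold the addresses and ports together
 * and mask into the table; any even spread would do. */
static unsigned tuple_hash(uint32_t laddr, uint16_t lport,
                           uint32_t faddr, uint16_t fport)
{
    uint32_t h = laddr ^ lport;
    h ^= faddr ^ fport;
    h ^= (h >> 16);
    h ^= (h >> 8);
    return h & (HTABLE_SIZE - 1);
}

int main(void)
{
    /* 10.0.0.1:33000 <-> 10.0.0.2:80 */
    unsigned slot = tuple_hash(0x0a000001, 33000, 0x0a000002, 80);
    printf("chain slot = %u\n", slot);
    return 0;
}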
*/ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { - struct sk_buff *buff; - int tmp; - struct tcphdr *th; - struct rtable *rt; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; + struct sk_buff *buff; + struct rtable *rt; + int tmp; + int mss; if (sk->state != TCP_CLOSE) return(-EISCONN); @@ -567,8 +553,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) printk(KERN_DEBUG "%s forgot to set AF_INET in " __FUNCTION__ "\n", current->comm); } - dst_release(xchg(&sk->dst_cache, NULL)); - tmp = ip_route_connect(&rt, usin->sin_addr.s_addr, sk->saddr, RT_TOS(sk->ip_tos)|sk->localroute, sk->bound_dev_if); if (tmp < 0) @@ -579,143 +563,52 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) return -ENETUNREACH; } - if (!tcp_unique_address(sk)) { - ip_rt_put(rt); - return -EADDRNOTAVAIL; - } - - lock_sock(sk); + dst_release(xchg(&sk->dst_cache, rt)); - /* Do this early, so there is less state to unwind on failure. */ - buff = sock_wmalloc(sk, (MAX_SYN_SIZE + sizeof(struct sk_buff)), + buff = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header), 0, GFP_KERNEL); - if (buff == NULL) { - release_sock(sk); - ip_rt_put(rt); - return(-ENOBUFS); - } - sk->dst_cache = &rt->u.dst; + if (buff == NULL) + return -ENOBUFS; + + /* Socket has no identity, so lock_sock() is useless. Also + * since state==TCP_CLOSE (checked above) the socket cannot + * possibly be in the hashes. TCP hash locking is only + * needed while checking quickly for a unique address. + * However, the socket does need to be (and is) locked + * in tcp_connect(). + * Perhaps this addresses all of ANK's concerns. 8-) -DaveM + */ + sk->dport = usin->sin_port; sk->daddr = rt->rt_dst; if (!sk->saddr) sk->saddr = rt->rt_src; sk->rcv_saddr = sk->saddr; - if (sk->priority == 0) - sk->priority = rt->u.dst.priority; - - sk->dummy_th.dest = usin->sin_port; - - tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr, - sk->dummy_th.source, - usin->sin_port); - tp->snd_wnd = 0; - tp->snd_wl1 = 0; - tp->snd_wl2 = tp->write_seq; - tp->snd_una = tp->write_seq; - tp->rcv_nxt = 0; - - sk->err = 0; - - /* Put in the IP header and routing stuff. */ - tmp = ip_build_header(buff, sk); - if (tmp < 0) { - /* Caller has done ip_rt_put(rt) and set sk->dst_cache - * to NULL. We must unwind the half built TCP socket - * state so that this failure does not create a "stillborn" - * sock (ie. future re-tries of connect() would fail). - */ - sk->daddr = 0; - sk->saddr = sk->rcv_saddr = 0; + if (!tcp_v4_unique_address(sk)) { kfree_skb(buff); - release_sock(sk); - return(-ENETUNREACH); + return -EADDRNOTAVAIL; } - /* No failure conditions can result past this point. */ - - /* We'll fix this up when we get a response from the other end. - * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. - */ - tp->tcp_header_len = sizeof(struct tcphdr) + - (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0); - - th = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr)); - buff->h.th = th; - - memcpy(th,(void *)&(sk->dummy_th), sizeof(*th)); - /* th->doff gets fixed up below if we tack on options. 
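The rewritten connect path above derives the MSS from the route's PMTU minus the IP header and any IP options, leaving the TCP header and option bytes to be charged per segment later in tcp_connect(). A worked sketch of that arithmetic under assumed typical values:

#include <stdio.h>

/* Worked example of the connect-path MSS arithmetic: mss = pmtu - IP
 * header - IP options; TCP header/option bytes are subtracted later,
 * per segment, because they can vary (e.g. SACK blocks). */
int main(void)
{
    int pmtu = 1500;   /* typical Ethernet path */
    int iphdr = 20;    /* sizeof(struct iphdr) with no options */
    int ipopts = 0;    /* sk->opt->optlen if IP options are set */
    int mss = pmtu - iphdr - ipopts;
    printf("mss handed to tcp_connect() = %d\n", mss);          /* 1480 */
    printf("payload under a 20-byte TCP header = %d\n", mss - 20); /* 1460 */
    return 0;
}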
*/ - - buff->seq = tp->write_seq++; - th->seq = htonl(buff->seq); - tp->snd_nxt = tp->write_seq; - buff->end_seq = tp->write_seq; - th->ack = 0; - th->syn = 1; - sk->mtu = rt->u.dst.pmtu; if ((sk->ip_pmtudisc == IP_PMTUDISC_DONT || (sk->ip_pmtudisc == IP_PMTUDISC_WANT && (rt->u.dst.mxlock&(1<<RTAX_MTU)))) && - rt->u.dst.pmtu > 576) + rt->u.dst.pmtu > 576 && rt->rt_dst != rt->rt_gateway) sk->mtu = 576; - if(sk->mtu < 64) + if (sk->mtu < 64) sk->mtu = 64; /* Sanity limit */ - sk->mss = (sk->mtu - sizeof(struct iphdr) - tp->tcp_header_len); - if(sk->user_mss) - sk->mss = min(sk->mss, sk->user_mss); - - if (sk->mss < 1) { - printk(KERN_DEBUG "intial sk->mss below 1\n"); - sk->mss = 1; /* Sanity limit */ - } - - tp->window_clamp = rt->u.dst.window; - tcp_select_initial_window(sock_rspace(sk)/2,sk->mss, - &tp->rcv_wnd, - &tp->window_clamp, - sysctl_tcp_window_scaling, - &tp->rcv_wscale); - th->window = htons(tp->rcv_wnd); - - tmp = tcp_syn_build_options(buff, sk->mss, sysctl_tcp_timestamps, - sysctl_tcp_window_scaling, tp->rcv_wscale); - buff->csum = 0; - th->doff = (sizeof(*th)+ tmp)>>2; - - tcp_v4_send_check(sk, th, sizeof(struct tcphdr) + tmp, buff); - - tcp_set_state(sk,TCP_SYN_SENT); + mss = sk->mtu - sizeof(struct iphdr); + if (sk->opt) + mss -= sk->opt->optlen; - /* Socket identity change complete, no longer - * in TCP_CLOSE, so enter ourselves into the - * hash tables. - */ - tcp_v4_hash(sk); - - tp->rto = rt->u.dst.rtt; - - tcp_init_xmit_timers(sk); - - /* Now works the right way instead of a hacked initial setting. */ - tp->retransmits = 0; - - skb_queue_tail(&sk->write_queue, buff); - - tp->packets_out++; - buff->when = jiffies; - - ip_queue_xmit(skb_clone(buff, GFP_KERNEL)); + tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr, + sk->sport, usin->sin_port); - /* Timer for repeating the SYN until an answer. */ - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - tcp_statistics.TcpActiveOpens++; - tcp_statistics.TcpOutSegs++; - - release_sock(sk); - return(0); + tcp_connect(sk, buff, mss); + return 0; } static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len) @@ -724,7 +617,7 @@ static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len) int retval = -EINVAL; /* Do sanity checking for sendmsg/sendto/send. 
*/ - if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT)) + if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL)) goto out; if (msg->msg_name) { struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name; @@ -737,7 +630,7 @@ static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len) if(sk->state == TCP_CLOSE) goto out; retval = -EISCONN; - if (addr->sin_port != sk->dummy_th.dest) + if (addr->sin_port != sk->dport) goto out; if (addr->sin_addr.s_addr != sk->daddr) goto out; @@ -851,9 +744,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) int code = skb->h.icmph->code; struct sock *sk; int opening; -#ifdef ICMP_PARANOIA __u32 seq; -#endif if (len < (iph->ihl << 2) + ICMP_MIN_LENGTH) { icmp_statistics.IcmpInErrors++; @@ -869,7 +760,6 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) } tp = &sk->tp_pinfo.af_tcp; -#ifdef ICMP_PARANOIA seq = ntohl(th->seq); if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, max(tp->snd_una+32768,tp->snd_nxt))) { @@ -879,7 +769,6 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) (int)sk->state, seq, tp->snd_una, tp->snd_nxt); return; } -#endif switch (type) { case ICMP_SOURCE_QUENCH: @@ -927,7 +816,6 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) req = tcp_v4_search_req(tp, iph, th, &prev); if (!req) return; -#ifdef ICMP_PARANOIA if (seq != req->snt_isn) { if (net_ratelimit()) printk(KERN_DEBUG "icmp packet for openreq " @@ -935,7 +823,6 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) seq, req->snt_isn); return; } -#endif if (req->sk) { /* not yet accept()ed */ sk = req->sk; /* report error in accept */ } else { @@ -987,44 +874,50 @@ void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, static void tcp_v4_send_reset(struct sk_buff *skb) { - struct tcphdr *th = skb->h.th; - struct sk_buff *skb1; - struct tcphdr *th1; + struct tcphdr *th = skb->h.th; - if (th->rst) - return; + /* Never send a reset in response to a reset. */ + if (th->rst == 0) { + struct tcphdr *th = skb->h.th; + struct sk_buff *skb1 = ip_reply(skb, sizeof(struct tcphdr)); + struct tcphdr *th1; - skb1 = ip_reply(skb, sizeof(struct tcphdr)); - if (skb1 == NULL) - return; + if (skb1 == NULL) + return; - skb1->h.th = th1 = (struct tcphdr *)skb_put(skb1, sizeof(struct tcphdr)); - memset(th1, 0, sizeof(*th1)); - - /* Swap the send and the receive. */ - th1->dest = th->source; - th1->source = th->dest; - th1->doff = sizeof(*th1)/4; - th1->rst = 1; - - if (th->ack) - th1->seq = th->ack_seq; - else { - th1->ack = 1; - if (!th->syn) - th1->ack_seq = th->seq; - else - th1->ack_seq = htonl(ntohl(th->seq)+1); - } + skb1->h.th = th1 = (struct tcphdr *) + skb_put(skb1, sizeof(struct tcphdr)); + + /* Swap the send and the receive. */ + memset(th1, 0, sizeof(*th1)); + th1->dest = th->source; + th1->source = th->dest; + th1->doff = sizeof(*th1)/4; + th1->rst = 1; + + if (th->ack) { + th1->seq = th->ack_seq; + } else { + th1->ack = 1; + if (!th->syn) + th1->ack_seq = th->seq; + else + th1->ack_seq = htonl(ntohl(th->seq)+1); + } + skb1->csum = csum_partial((u8 *) th1, sizeof(*th1), 0); + th1->check = tcp_v4_check(th1, sizeof(*th1), skb1->nh.iph->saddr, + skb1->nh.iph->daddr, skb1->csum); - skb1->csum = csum_partial((u8 *) th1, sizeof(*th1), 0); - th1->check = tcp_v4_check(th1, sizeof(*th1), skb1->nh.iph->saddr, - skb1->nh.iph->daddr, skb1->csum); + /* Finish up some IP bits. 
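The reset builder above follows the RFC 793 rules: never answer a RST with a RST, echo the peer's ack_seq as our sequence number when the offending segment carried an ACK, and otherwise send an ACK covering what they sent (plus one for a SYN). A compact sketch, with a hypothetical host-order segment summary standing in for the kernel's sk_buff/tcphdr:

#include <stdio.h>
#include <stdint.h>

/* Hypothetical host-order segment summary. */
struct seg { uint16_t sport, dport; uint32_t seq, ack_seq; int syn, ack, rst; };

static int build_reset(const struct seg *in, struct seg *rst)
{
    if (in->rst)                /* never answer a reset with a reset */
        return -1;
    rst->sport = in->dport;     /* swap the direction */
    rst->dport = in->sport;
    rst->rst = 1;
    rst->syn = 0;
    if (in->ack) {              /* their ACK tells us our sequence */
        rst->seq = in->ack_seq;
        rst->ack = 0;
        rst->ack_seq = 0;
    } else {                    /* otherwise ACK what they sent */
        rst->seq = 0;
        rst->ack = 1;
        rst->ack_seq = in->seq + (in->syn ? 1 : 0); /* SYN eats a byte */
    }
    return 0;
}

int main(void)
{
    struct seg syn = { 33000, 80, 1000, 0, 1, 0, 0 }, rst;
    if (build_reset(&syn, &rst) == 0)
        printf("RST: seq=%u ack=%d ack_seq=%u\n",
               (unsigned)rst.seq, rst.ack, (unsigned)rst.ack_seq);
    return 0;
}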
*/ + skb1->nh.iph->tot_len = htons(skb1->len); + ip_send_check(skb1->nh.iph); - /* Do not place TCP options in a reset. */ - ip_queue_xmit(skb1); - tcp_statistics.TcpOutSegs++; - tcp_statistics.TcpOutRsts++; + /* All the other work was done by ip_reply(). */ + skb1->dst->output(skb1); + + tcp_statistics.TcpOutSegs++; + tcp_statistics.TcpOutRsts++; + } } #ifdef CONFIG_IP_TRANSPARENT_PROXY @@ -1055,82 +948,48 @@ int tcp_chkaddr(struct sk_buff *skb) static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) { + struct rtable *rt; + struct ip_options *opt; struct sk_buff * skb; - struct tcphdr *th; - int tmp; int mss; - skb = sock_wmalloc(sk, MAX_SYN_SIZE, 1, GFP_ATOMIC); - if (skb == NULL) + /* First, grab a route. */ + opt = req->af.v4_req.opt; + if(ip_route_output(&rt, ((opt && opt->srr) ? + opt->faddr : + req->af.v4_req.rmt_addr), + req->af.v4_req.loc_addr, + RT_TOS(sk->ip_tos) | RTO_CONN | sk->localroute, + sk->bound_dev_if)) { + ip_statistics.IpOutNoRoutes++; return; - - if(ip_build_pkt(skb, sk, req->af.v4_req.loc_addr, - req->af.v4_req.rmt_addr, req->af.v4_req.opt) < 0) { - kfree_skb(skb); + } + if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { + ip_rt_put(rt); + ip_statistics.IpOutNoRoutes++; return; } - - mss = (skb->dst->pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr)); - if (sk->user_mss) - mss = min(mss, sk->user_mss); - if(req->tstamp_ok) - mss -= TCPOLEN_TSTAMP_ALIGNED; - else - req->mss += TCPOLEN_TSTAMP_ALIGNED; - /* tcp_syn_build_options will do an skb_put() to obtain the TCP - * options bytes below. - */ - skb->h.th = th = (struct tcphdr *) skb_put(skb, sizeof(struct tcphdr)); + mss = (rt->u.dst.pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr)); + if (opt) + mss -= opt->optlen; - /* Don't offer more than they did. - * This way we don't have to memorize who said what. - * FIXME: maybe this should be changed for better performance - * with syncookies. - */ - req->mss = min(mss, req->mss); + skb = tcp_make_synack(sk, &rt->u.dst, req, mss); + if (skb) { + struct tcphdr *th = skb->h.th; - if (req->mss < 1) { - printk(KERN_DEBUG "initial req->mss below 1\n"); - req->mss = 1; - } - - /* Yuck, make this header setup more efficient... 
-DaveM */ - memset(th, 0, sizeof(struct tcphdr)); - th->syn = 1; - th->ack = 1; #ifdef CONFIG_IP_TRANSPARENT_PROXY - th->source = req->lcl_port; /* LVE */ -#else - th->source = sk->dummy_th.source; + th->source = req->lcl_port; /* LVE */ #endif - th->dest = req->rmt_port; - skb->seq = req->snt_isn; - skb->end_seq = skb->seq + 1; - th->seq = htonl(skb->seq); - th->ack_seq = htonl(req->rcv_isn + 1); - if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ - __u8 rcv_wscale; - /* Set this up on the first call only */ - req->window_clamp = skb->dst->window; - tcp_select_initial_window(sock_rspace(sk)/2,req->mss, - &req->rcv_wnd, - &req->window_clamp, - req->wscale_ok, - &rcv_wscale); - req->rcv_wscale = rcv_wscale; + + th->check = tcp_v4_check(th, skb->len, + req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr, + csum_partial((char *)th, skb->len, skb->csum)); + + ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr, + req->af.v4_req.rmt_addr, req->af.v4_req.opt); } - th->window = htons(req->rcv_wnd); - tmp = tcp_syn_build_options(skb, req->mss, req->tstamp_ok, - req->wscale_ok,req->rcv_wscale); - skb->csum = 0; - th->doff = (sizeof(*th) + tmp)>>2; - th->check = tcp_v4_check(th, sizeof(*th) + tmp, - req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr, - csum_partial((char *)th, sizeof(*th)+tmp, skb->csum)); - - ip_queue_xmit(skb); - tcp_statistics.TcpOutSegs++; + ip_rt_put(rt); } static void tcp_v4_or_free(struct open_request *req) @@ -1240,15 +1099,16 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ req->rcv_isn = skb->seq; - tp.tstamp_ok = tp.wscale_ok = tp.snd_wscale = 0; + tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0; tp.in_mss = 536; - tcp_parse_options(th,&tp,want_cookie); + tcp_parse_options(NULL, th, &tp, want_cookie); req->mss = tp.in_mss; if (tp.saw_tstamp) { req->mss -= TCPOLEN_TSTAMP_ALIGNED; req->ts_recent = tp.rcv_tsval; } req->tstamp_ok = tp.tstamp_ok; + req->sack_ok = tp.sack_ok; req->snd_wscale = tp.snd_wscale; req->wscale_ok = tp.wscale_ok; req->rmt_port = th->source; @@ -1300,8 +1160,11 @@ error: /* This is not only more efficient than what we used to do, it eliminates * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM + * + * This function wants to be moved to a common for IPv[46] file. 
--ANK */ -struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb) +struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb, + int snd_mss) { struct sock *newsk = sk_alloc(AF_INET, GFP_ATOMIC, 0); @@ -1310,27 +1173,16 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, memcpy(newsk, sk, sizeof(*newsk)); newsk->sklist_next = NULL; - newsk->daddr = req->af.v4_req.rmt_addr; - newsk->rcv_saddr = req->af.v4_req.loc_addr; -#ifdef CONFIG_IP_TRANSPARENT_PROXY - newsk->num = ntohs(skb->h.th->dest); -#endif newsk->state = TCP_SYN_RECV; /* Clone the TCP header template */ -#ifdef CONFIG_IP_TRANSPARENT_PROXY - newsk->dummy_th.source = req->lcl_port; -#endif - newsk->dummy_th.dest = req->rmt_port; - newsk->dummy_th.ack = 1; - newsk->dummy_th.doff = sizeof(struct tcphdr)>>2; + newsk->dport = req->rmt_port; newsk->sock_readers = 0; atomic_set(&newsk->rmem_alloc, 0); skb_queue_head_init(&newsk->receive_queue); atomic_set(&newsk->wmem_alloc, 0); skb_queue_head_init(&newsk->write_queue); - newsk->saddr = req->af.v4_req.loc_addr; newsk->done = 0; newsk->proc = 0; @@ -1395,12 +1247,40 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newsk->priority = 1; /* IP layer stuff */ - newsk->opt = req->af.v4_req.opt; newsk->timeout = 0; init_timer(&newsk->timer); newsk->timer.function = &net_timer; newsk->timer.data = (unsigned long) newsk; newsk->socket = NULL; + + newtp->tstamp_ok = req->tstamp_ok; + if((newtp->sack_ok = req->sack_ok) != 0) + newtp->num_sacks = 0; + newtp->window_clamp = req->window_clamp; + newtp->rcv_wnd = req->rcv_wnd; + newtp->wscale_ok = req->wscale_ok; + if (newtp->wscale_ok) { + newtp->snd_wscale = req->snd_wscale; + newtp->rcv_wscale = req->rcv_wscale; + } else { + newtp->snd_wscale = newtp->rcv_wscale = 0; + newtp->window_clamp = min(newtp->window_clamp,65535); + } + if (newtp->tstamp_ok) { + newtp->ts_recent = req->ts_recent; + newtp->ts_recent_stamp = jiffies; + newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + } else { + newtp->tcp_header_len = sizeof(struct tcphdr); + } + + snd_mss -= newtp->tcp_header_len; + + if (sk->user_mss) + snd_mss = min(snd_mss, sk->user_mss); + + newsk->mss = min(req->mss, snd_mss); + } return newsk; } @@ -1409,77 +1289,58 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, struct open_request *req, struct dst_entry *dst) { + struct ip_options *opt = req->af.v4_req.opt; struct tcp_opt *newtp; struct sock *newsk; int snd_mss; + int mtu; #ifdef NEW_LISTEN if (sk->ack_backlog > sk->max_ack_backlog) goto exit; /* head drop */ #endif - newsk = tcp_create_openreq_child(sk, req, skb); - if (!newsk) - goto exit; -#ifdef NEW_LISTEN - sk->ack_backlog++; -#endif - - newtp = &(newsk->tp_pinfo.af_tcp); - - /* options / mss / route_cache */ if (dst == NULL) { struct rtable *rt; if (ip_route_output(&rt, - newsk->opt && newsk->opt->srr ? - newsk->opt->faddr : newsk->daddr, - newsk->saddr, newsk->ip_tos|RTO_CONN, 0)) { - sk_free(newsk); + opt && opt->srr ? opt->faddr : req->af.v4_req.rmt_addr, + req->af.v4_req.loc_addr, sk->ip_tos|RTO_CONN, 0)) return NULL; - } dst = &rt->u.dst; - } - newsk->dst_cache = dst; - - snd_mss = dst->pmtu; - - /* FIXME: is mtu really the same as snd_mss? */ - newsk->mtu = snd_mss; - /* FIXME: where does mtu get used after this? 
*/ - /* sanity check */ - if (newsk->mtu < 64) - newsk->mtu = 64; - - newtp->tstamp_ok = req->tstamp_ok; - newtp->window_clamp = req->window_clamp; - newtp->rcv_wnd = req->rcv_wnd; - newtp->wscale_ok = req->wscale_ok; - if (newtp->wscale_ok) { - newtp->snd_wscale = req->snd_wscale; - newtp->rcv_wscale = req->rcv_wscale; - } else { - newtp->snd_wscale = newtp->rcv_wscale = 0; - newtp->window_clamp = min(newtp->window_clamp,65535); - } - if (newtp->tstamp_ok) { - newtp->ts_recent = req->ts_recent; - newtp->ts_recent_stamp = jiffies; - newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; - newsk->dummy_th.doff += (TCPOLEN_TSTAMP_ALIGNED >> 2); - } else { - newtp->tcp_header_len = sizeof(struct tcphdr); } - snd_mss -= sizeof(struct iphdr) + sizeof(struct tcphdr); - if (sk->user_mss) - snd_mss = min(snd_mss, sk->user_mss); +#ifdef NEW_LISTEN + sk->ack_backlog++; +#endif + + mtu = dst->pmtu; + if (mtu < 68) + mtu = 68; + snd_mss = mtu - sizeof(struct iphdr); + if (opt) + snd_mss -= opt->optlen; - /* Make sure our mtu is adjusted for headers. */ - newsk->mss = min(req->mss, snd_mss) + sizeof(struct tcphdr) - newtp->tcp_header_len; + newsk = tcp_create_openreq_child(sk, req, skb, snd_mss); + if (!newsk) + goto exit; + + newsk->dst_cache = dst; + + newtp = &(newsk->tp_pinfo.af_tcp); + newsk->daddr = req->af.v4_req.rmt_addr; + newsk->saddr = req->af.v4_req.loc_addr; + newsk->rcv_saddr = req->af.v4_req.loc_addr; +#ifdef CONFIG_IP_TRANSPARENT_PROXY + newsk->num = ntohs(skb->h.th->dest); + newsk->sport = req->lcl_port; +#endif + newsk->opt = req->af.v4_req.opt; + newsk->mtu = mtu; /* Must use the af_specific ops here for the case of IPv6 mapped. */ newsk->prot->hash(newsk); add_to_prot_sklist(newsk); + return newsk; exit: @@ -1677,106 +1538,82 @@ do_time_wait: goto discard_it; } -int tcp_v4_build_header(struct sock *sk, struct sk_buff *skb) -{ - return ip_build_header(skb, sk); -} - -int tcp_v4_rebuild_header(struct sock *sk, struct sk_buff *skb) +int tcp_v4_rebuild_header(struct sock *sk) { - struct rtable *rt; - struct iphdr *iph; - struct tcphdr *th; - int size; + struct rtable *rt = (struct rtable *)sk->dst_cache; + __u32 new_saddr; int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT; - /* Check route */ - - rt = (struct rtable*)skb->dst; + if(rt == NULL) + return 0; - /* Force route checking if want_rewrite */ - /* The idea is good, the implementation is disguisting. - Well, if I made bind on this socket, you cannot randomly ovewrite - its source address. --ANK + /* Force route checking if want_rewrite. + * The idea is good, the implementation is disgusting. + * Well, if I made bind on this socket, you cannot randomly overwrite + * its source address.
--ANK */ if (want_rewrite) { int tmp; + struct rtable *new_rt; __u32 old_saddr = rt->rt_src; - /* Query new route */ - tmp = ip_route_connect(&rt, rt->rt_dst, 0, + /* Query new route using another rt buffer */ + tmp = ip_route_connect(&new_rt, rt->rt_dst, 0, RT_TOS(sk->ip_tos)|sk->localroute, sk->bound_dev_if); /* Only useful if different source addrs */ - if (tmp == 0 || rt->rt_src != old_saddr ) { - dst_release(skb->dst); - skb->dst = &rt->u.dst; - } else { - want_rewrite = 0; - dst_release(&rt->u.dst); + if (tmp == 0) { + /* + * Only useful if different source addrs + */ + if (new_rt->rt_src != old_saddr ) { + dst_release(sk->dst_cache); + sk->dst_cache = &new_rt->u.dst; + rt = new_rt; + goto do_rewrite; + } + dst_release(&new_rt->u.dst); } - } else + } if (rt->u.dst.obsolete) { int err; err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos|RTO_CONN, rt->key.oif); if (err) { sk->err_soft=-err; - sk->error_report(skb->sk); + sk->error_report(sk); return -1; } - dst_release(skb->dst); - skb->dst = &rt->u.dst; + dst_release(xchg(&sk->dst_cache, &rt->u.dst)); } - iph = skb->nh.iph; - th = skb->h.th; - size = skb->tail - skb->h.raw; + return 0; - if (want_rewrite) { - __u32 new_saddr = rt->rt_src; +do_rewrite: + new_saddr = rt->rt_src; - /* - * Ouch!, this should not happen. - */ - if (!sk->saddr || !sk->rcv_saddr) { - printk(KERN_WARNING "tcp_v4_rebuild_header(): not valid sock addrs: saddr=%08lX rcv_saddr=%08lX\n", - ntohl(sk->saddr), - ntohl(sk->rcv_saddr)); - return 0; - } - - /* - * Maybe whe are in a skb chain loop and socket address has - * yet been 'damaged'. - */ - - if (new_saddr != sk->saddr) { - if (sysctl_ip_dynaddr > 1) { - printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr from %d.%d.%d.%d to %d.%d.%d.%d\n", - NIPQUAD(sk->saddr), - NIPQUAD(new_saddr)); - } + /* Ouch!, this should not happen. */ + if (!sk->saddr || !sk->rcv_saddr) { + printk(KERN_WARNING "tcp_v4_rebuild_header(): not valid sock addrs: " + "saddr=%08lX rcv_saddr=%08lX\n", + ntohl(sk->saddr), + ntohl(sk->rcv_saddr)); + return 0; + } - sk->saddr = new_saddr; - sk->rcv_saddr = new_saddr; - /* sk->prot->rehash(sk); */ - tcp_v4_rehash(sk); - } - - if (new_saddr != iph->saddr) { - if (sysctl_ip_dynaddr > 1) { - printk(KERN_INFO "tcp_v4_rebuild_header(): shifting iph->saddr from %d.%d.%d.%d to %d.%d.%d.%d\n", - NIPQUAD(iph->saddr), - NIPQUAD(new_saddr)); - } + if (new_saddr != sk->saddr) { + if (sysctl_ip_dynaddr > 1) { + printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr " + "from %d.%d.%d.%d to %d.%d.%d.%d\n", + NIPQUAD(sk->saddr), + NIPQUAD(new_saddr)); + } - iph->saddr = new_saddr; - ip_send_check(iph); - } + sk->saddr = new_saddr; + sk->rcv_saddr = new_saddr; + tcp_v4_rehash(sk); + } - } - return 0; } @@ -1792,11 +1629,10 @@ static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) sin->sin_family = AF_INET; sin->sin_addr.s_addr = sk->daddr; - sin->sin_port = sk->dummy_th.dest; + sin->sin_port = sk->dport; } struct tcp_func ipv4_specific = { - tcp_v4_build_header, ip_queue_xmit, tcp_v4_send_check, tcp_v4_rebuild_header, @@ -1835,10 +1671,6 @@ static int tcp_v4_init_sock(struct sock *sk) sk->mtu = 576; sk->mss = 536; - /* Speed up by setting some standard state for the dummy_th. */ - sk->dummy_th.ack=1; - sk->dummy_th.doff=sizeof(struct tcphdr)>>2; - /* Init SYN queue. 
*/ tcp_synq_init(tp); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d8c3c6480..465ee3fdc 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_output.c,v 1.65 1998/03/15 12:07:03 davem Exp $ + * Version: $Id: tcp_output.c,v 1.76 1998/03/22 22:10:24 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -29,6 +29,7 @@ * Linus Torvalds : send_delayed_ack * David S. Miller : Charge memory using the right skb * during syn/ack processing. + * David S. Miller : Output engine completely rewritten. * */ @@ -57,278 +58,227 @@ static __inline__ void update_send_head(struct sock *sk) tp->send_head = NULL; } -/* - * This is the main buffer sending routine. We queue the buffer - * having checked it is sane seeming. +/* This routine actually transmits TCP packets queued in by + * tcp_do_sendmsg(). This is used by both the initial + * transmission and possible later retransmissions. + * All SKB's seen here are completely headerless. It is our + * job to build the TCP header, and pass the packet down to + * IP so it can do the same plus pass the packet off to the + * device. + * + * We are working here with either a clone of the original + * SKB, or a fresh unique copy made by the retransmit engine. */ - -void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue) +void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) { - struct tcphdr *th = skb->h.th; - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int size; + if(skb != NULL) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + int tcp_header_size = tp->tcp_header_len; + struct tcphdr *th; - /* Length of packet (not counting length of pre-tcp headers). */ - size = skb->len - ((unsigned char *) th - skb->data); + if(tcb->flags & TCPCB_FLAG_SYN) { + tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; + if(sysctl_tcp_timestamps) + tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; + if(sysctl_tcp_window_scaling) + tcp_header_size += TCPOLEN_WSCALE_ALIGNED; + if(sysctl_tcp_sack && !sysctl_tcp_timestamps) + tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; + } else if(tp->sack_ok && tp->num_sacks) { + /* A SACK is 2 pad bytes, a 2 byte header, plus + * 2 32-bit sequence numbers for each SACK block. + */ + tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + + (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); + } + th = (struct tcphdr *) skb_push(skb, tcp_header_size); + skb->h.th = th; + skb_set_owner_w(skb, sk); + + /* Build TCP header and checksum it. */ + th->source = sk->sport; + th->dest = sk->dport; + th->seq = htonl(skb->seq); + th->ack_seq = htonl(tp->rcv_nxt); + th->doff = (tcp_header_size >> 2); + th->res1 = 0; + *(((__u8 *)th) + 13) = tcb->flags; + th->window = htons(tcp_select_window(sk)); + th->check = 0; + th->urg_ptr = ntohs(tcb->urg_ptr); + if(tcb->flags & TCPCB_FLAG_SYN) { + th->window = htons(tp->rcv_wnd); + tcp_syn_build_options((__u32 *)(th + 1), sk->mss, + sysctl_tcp_timestamps, + sysctl_tcp_sack, + sysctl_tcp_window_scaling, + tp->rcv_wscale, + skb->when); + } else { + tcp_build_and_update_options((__u32 *)(th + 1), + tp, skb->when); + } + tp->af_specific->send_check(sk, th, skb->len, skb); - /* If there is a FIN or a SYN we add it onto the size. 
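tcp_transmit_skb() above sizes the header from the negotiated options: a SYN carries the MSS option plus (optionally) timestamp, window-scale and SACK-permitted options, while an established-state segment carries the fixed negotiated header plus 2 pad + 2 header + 8 bytes per outstanding SACK block. A sketch of that arithmetic with the conventional option lengths spelled out as local defines (assumed values mirroring the TCPOLEN_* names):

#include <stdio.h>

/* Assumed option sizes in bytes, already padded to 4-byte alignment
 * where the name says ALIGNED; mirrors tcp_transmit_skb()'s sums. */
#define TCPHDR_LEN                20
#define TCPOLEN_MSS                4
#define TCPOLEN_TSTAMP_ALIGNED    12
#define TCPOLEN_WSCALE_ALIGNED     4
#define TCPOLEN_SACKPERM_ALIGNED   4
#define TCPOLEN_SACK_BASE_ALIGNED  4
#define TCPOLEN_SACK_PERBLOCK      8

static int header_len(int syn, int tstamps, int wscale, int sack, int nsacks)
{
    int len = TCPHDR_LEN;
    if (syn) {
        len += TCPOLEN_MSS;
        if (tstamps)
            len += TCPOLEN_TSTAMP_ALIGNED;
        if (wscale)
            len += TCPOLEN_WSCALE_ALIGNED;
        if (sack && !tstamps)   /* SACK-permitted rides in the tstamp pad otherwise */
            len += TCPOLEN_SACKPERM_ALIGNED;
    } else {
        if (tstamps)
            len += TCPOLEN_TSTAMP_ALIGNED;
        if (sack && nsacks)     /* 2 pad + 2 header + 8 per block */
            len += TCPOLEN_SACK_BASE_ALIGNED +
                   nsacks * TCPOLEN_SACK_PERBLOCK;
    }
    return len;
}

int main(void)
{
    int len = header_len(0, 1, 1, 1, 2);
    printf("data segment with 2 SACK blocks: %d bytes, doff=%d\n",
           len, len >> 2);  /* 52 bytes, doff=13 */
    return 0;
}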
*/ - if (th->fin || th->syn) { - if(th->syn) - size++; - if(th->fin) - size++; + clear_delayed_acks(sk); + tp->last_ack_sent = tp->rcv_nxt; + tcp_statistics.TcpOutSegs++; + tp->af_specific->queue_xmit(skb); } +} - /* Actual processing. */ - skb->seq = ntohl(th->seq); - skb->end_seq = skb->seq + size - 4*th->doff; +/* This is the main buffer sending routine. We queue the buffer + * and decide whether to queue or transmit now. + */ +void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + /* Advance write_seq and place onto the write_queue. */ + tp->write_seq += (skb->end_seq - skb->seq); skb_queue_tail(&sk->write_queue, skb); if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) { - struct sk_buff * buff; - - /* This is going straight out. */ - tp->last_ack_sent = tp->rcv_nxt; - th->ack_seq = htonl(tp->rcv_nxt); - th->window = htons(tcp_select_window(sk)); - tcp_update_options((__u32 *)(th + 1),tp); - - tp->af_specific->send_check(sk, th, size, skb); - - buff = skb_clone(skb, GFP_KERNEL); - if (buff == NULL) - goto queue; - - clear_delayed_acks(sk); - skb_set_owner_w(buff, sk); - + /* Send it out now. */ + skb->when = jiffies; tp->snd_nxt = skb->end_seq; tp->packets_out++; - - skb->when = jiffies; - - tcp_statistics.TcpOutSegs++; - tp->af_specific->queue_xmit(buff); - - if (!tcp_timer_is_set(sk, TIME_RETRANS)) + tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)); + if(!tcp_timer_is_set(sk, TIME_RETRANS)) tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - - return; - } - -queue: - /* Remember where we must start sending. */ - if (tp->send_head == NULL) - tp->send_head = skb; - if (!force_queue && tp->packets_out == 0 && !tp->pending) { - tp->pending = TIME_PROBE0; - tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); + } else { + /* Queue it, remembering where we must start sending. */ + if (tp->send_head == NULL) + tp->send_head = skb; + if (!force_queue && tp->packets_out == 0 && !tp->pending) { + tp->pending = TIME_PROBE0; + tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); + } } } -/* - * Function to create two new tcp segments. - * Shrinks the given segment to the specified size and appends a new - * segment with the rest of the packet to the list. - * This won't be called frenquently, I hope... +/* Function to create two new tcp segments. Shrinks the given segment + * to the specified size and appends a new segment with the rest of the + * packet to the list. This won't be called frequently, I hope... + * Remember, these are still header-less SKB's at this point. */ - static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *buff; - struct tcphdr *th, *nth; - int nsize; - int tmp; - - th = skb->h.th; - - /* Size of new segment. */ - nsize = skb->tail - ((unsigned char *)(th)+tp->tcp_header_len) - len; - if (nsize <= 0) { - printk(KERN_DEBUG "tcp_fragment: bug size <= 0\n"); - return -1; - } + int nsize = skb->len - len; + u16 flags; /* Get a new skb... force flag on. */ - buff = sock_wmalloc(sk, nsize + 128 + sk->prot->max_header + 15, 1, - GFP_ATOMIC); + buff = sock_wmalloc(sk, + (nsize + + MAX_HEADER + + sk->prot->max_header + 15), + 1, GFP_ATOMIC); if (buff == NULL) - return -1; + return -1; /* We'll just try again later. */ - /* Put headers on the new packet. */ - tmp = tp->af_specific->build_net_header(sk, buff); - if (tmp < 0) { - kfree_skb(buff); - return -1; - } + /* Reserve space for headers.
*/ + skb_reserve(buff, MAX_HEADER + sk->prot->max_header); - /* Move the TCP header over. */ - nth = (struct tcphdr *) skb_put(buff, tp->tcp_header_len); - buff->h.th = nth; - memcpy(nth, th, tp->tcp_header_len); - - /* Correct the new header. */ + /* Correct the sequence numbers. */ buff->seq = skb->seq + len; buff->end_seq = skb->end_seq; - nth->seq = htonl(buff->seq); - nth->check = 0; - nth->doff = th->doff; - /* urg data is always an headache */ - if (th->urg) { - if (th->urg_ptr > len) { - th->urg = 0; - nth->urg_ptr -= len; + /* PSH and FIN should only be set in the second packet. */ + flags = TCP_SKB_CB(skb)->flags; + TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH); + if(flags & TCPCB_FLAG_URG) { + u16 old_urg_ptr = TCP_SKB_CB(skb)->urg_ptr; + + /* Urgent data is always a pain in the ass. */ + if(old_urg_ptr > len) { + TCP_SKB_CB(skb)->flags &= ~(TCPCB_FLAG_URG); + TCP_SKB_CB(skb)->urg_ptr = 0; + TCP_SKB_CB(buff)->urg_ptr = old_urg_ptr - len; } else { - nth->urg = 0; + flags &= ~(TCPCB_FLAG_URG); } } + if(!(flags & TCPCB_FLAG_URG)) + TCP_SKB_CB(buff)->urg_ptr = 0; + TCP_SKB_CB(buff)->flags = flags; + TCP_SKB_CB(buff)->sacked = 0; - /* Copy data tail to our new buffer. */ - buff->csum = csum_partial_copy(((u8 *)(th)+tp->tcp_header_len) + len, - skb_put(buff, nsize), + /* Copy and checksum data tail into the new buffer. */ + buff->csum = csum_partial_copy(skb->data + len, skb_put(buff, nsize), nsize, 0); skb->end_seq -= nsize; skb_trim(skb, skb->len - nsize); - /* Remember to checksum this packet afterwards. */ - th->check = 0; - skb->csum = csum_partial((u8*)(th) + tp->tcp_header_len, skb->tail - ((u8 *) (th)+tp->tcp_header_len), - 0); + /* Rechecksum original buffer. */ + skb->csum = csum_partial(skb->data, skb->len, 0); + /* Link BUFF into the send queue. */ skb_append(skb, buff); return 0; } -static void tcp_wrxmit_prob(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - /* This is acked data. We can discard it. This cannot currently occur. */ - tp->retransmits = 0; - - printk(KERN_DEBUG "tcp_write_xmit: bug skb in write queue\n"); - - update_send_head(sk); - - skb_unlink(skb); - kfree_skb(skb); - - if (!sk->dead) - sk->write_space(sk); -} - -static int tcp_wrxmit_frag(struct sock *sk, struct sk_buff *skb, int size) -{ - struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - - SOCK_DEBUG(sk, "tcp_write_xmit: frag needed size=%d mss=%d\n", - size, sk->mss); - - if (tcp_fragment(sk, skb, sk->mss)) { - /* !tcp_frament Failed! */ - tp->send_head = skb; - tp->packets_out--; - return -1; - } - return 0; -} - -/* - * This routine writes packets to the network. - * It advances the send_head. - * This happens as incoming acks open up the remote window for us. +/* This routine writes packets to the network. It advances the + * send_head. This happens as incoming acks open up the remote + * window for us. */ - void tcp_write_xmit(struct sock *sk) { - struct sk_buff *skb; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - u16 rcv_wnd; - int sent_pkts = 0; + int mss_now = sk->mss; - /* The bytes will have to remain here. In time closedown will - * empty the write queue and all will be happy. + /* Account for SACKS, we may need to fragment due to this. + * It is just like the real MSS changing on us midstream. + * We also handle things correctly when the user adds some + * IP options mid-stream. Silly to do, but cover it. 
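tcp_fragment() above splits a queued, still header-less segment: the tail takes over [seq+len, end_seq) together with PSH/FIN, and URG travels with whichever half the urgent pointer actually falls in. A self-contained sketch of just that bookkeeping (hypothetical seg struct; flag values chosen to match the TCP flag byte):

#include <stdio.h>
#include <stdint.h>

#define F_FIN 0x01
#define F_PSH 0x08
#define F_URG 0x20

/* Hypothetical stand-in for a queued, header-less segment. */
struct seg { uint32_t seq, end_seq; uint16_t flags, urg_ptr; int len; };

/* Split s at 'len' payload bytes: PSH/FIN belong to the tail, URG
 * stays with the half the urgent pointer actually falls in. */
static void split(struct seg *s, struct seg *tail, int len)
{
    tail->len = s->len - len;
    tail->seq = s->seq + len;
    tail->end_seq = s->end_seq;
    tail->flags = s->flags & (F_FIN | F_PSH);
    s->flags &= ~(F_FIN | F_PSH);
    tail->urg_ptr = 0;
    if ((s->flags & F_URG) && s->urg_ptr > len) {
        tail->flags |= F_URG;           /* urgent data is in the tail */
        tail->urg_ptr = s->urg_ptr - len;
        s->flags &= ~F_URG;
        s->urg_ptr = 0;
    }
    s->len = len;
    s->end_seq = s->seq + len;
}

int main(void)
{
    struct seg s = { 1000, 3000, F_PSH | F_URG, 1500, 2000 }, tail;
    split(&s, &tail, 1460);
    printf("head [%u,%u) flags=%#x, tail [%u,%u) flags=%#x urg=%u\n",
           (unsigned)s.seq, (unsigned)s.end_seq, s.flags,
           (unsigned)tail.seq, (unsigned)tail.end_seq, tail.flags,
           tail.urg_ptr);
    return 0;
}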
*/ - if(sk->zapped) - return; - - /* Anything on the transmit queue that fits the window can - * be added providing we are: - * - * a) following SWS avoidance [and Nagle algorithm] - * b) not exceeding our congestion window. - * c) not retransmiting [Nagle] + if(tp->sack_ok && tp->num_sacks) + mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + + (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); + if(sk->opt && sk->opt->optlen) + mss_now -= sk->opt->optlen; + + /* If we are zapped, the bytes will have to remain here. + * In time closedown will empty the write queue and all + * will be happy. */ - rcv_wnd = htons(tcp_select_window(sk)); - while((skb = tp->send_head) && tcp_snd_test(sk, skb)) { - struct tcphdr *th; - struct sk_buff *buff; - int size; + if(!sk->zapped) { + struct sk_buff *skb; + int sent_pkts = 0; - /* See if we really need to send the packet. (debugging code) */ - if (!after(skb->end_seq, tp->snd_una)) { - tcp_wrxmit_prob(sk, skb); - continue; - } - - /* Put in the ack seq and window at this point rather - * than earlier, in order to keep them monotonic. - * We really want to avoid taking back window allocations. - * That's legal, but RFC1122 says it's frowned on. - * Ack and window will in general have changed since - * this packet was put on the write queue. + /* Anything on the transmit queue that fits the window can + * be added providing we are: + * + * a) following SWS avoidance [and Nagle algorithm] + * b) not exceeding our congestion window. + * c) not retransmiting [Nagle] */ - th = skb->h.th; - size = skb->len - (((unsigned char *) th) - skb->data); - if (size - (th->doff << 2) > sk->mss) { - if (tcp_wrxmit_frag(sk, skb, size)) - break; - size = skb->len - (((unsigned char*)th) - skb->data); - } - - tp->last_ack_sent = tp->rcv_nxt; - th->ack_seq = htonl(tp->rcv_nxt); - th->window = rcv_wnd; - tcp_update_options((__u32 *)(th + 1),tp); - - tp->af_specific->send_check(sk, th, size, skb); - -#ifdef TCP_DEBUG - if (before(skb->end_seq, tp->snd_nxt)) - printk(KERN_DEBUG "tcp_write_xmit:" - " sending already sent seq\n"); -#endif - - buff = skb_clone(skb, GFP_ATOMIC); - if (buff == NULL) - break; - - /* Advance the send_head. This one is going out. */ - update_send_head(sk); - clear_delayed_acks(sk); - - tp->packets_out++; - skb_set_owner_w(buff, sk); - - tp->snd_nxt = skb->end_seq; + while((skb = tp->send_head) && tcp_snd_test(sk, skb)) { + if (skb->len > mss_now) { + if (tcp_fragment(sk, skb, mss_now)) + break; + } - skb->when = jiffies; + /* Advance the send_head. This one is going out. */ + update_send_head(sk); + skb->when = jiffies; + tp->snd_nxt = skb->end_seq; + tp->packets_out++; + tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + sent_pkts = 1; + } - sent_pkts = 1; - tp->af_specific->queue_xmit(buff); + /* If we sent anything, make sure the retransmit + * timer is active. + */ + if (sent_pkts && !tcp_timer_is_set(sk, TIME_RETRANS)) + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); } - - if (sent_pkts && !tcp_timer_is_set(sk, TIME_RETRANS)) - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); } - - /* This function returns the amount that we can raise the * usable window based on the following constraints * @@ -377,11 +327,7 @@ void tcp_write_xmit(struct sock *sk) * Below we obtain similar behavior by forcing the offered window to * a multiple of the mss when it is feasible to do so. * - * FIXME: In our current implementation the value returned by sock_rpsace(sk) - * is the total space we have allocated to the socket to store skbuf's. 
- * The current design assumes that up to half of that space will be - * taken by headers, and the remaining space will be available for TCP data. - * This should be accounted for correctly instead. + * Note, we don't "adjust" for TIMESTAMP or SACK option bytes. */ u32 __tcp_select_window(struct sock *sk) { @@ -422,57 +368,72 @@ u32 __tcp_select_window(struct sock *sk) return window; } -static int tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb) +/* Attempt to collapse two adjacent SKB's during retransmission. */ +static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now) { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct tcphdr *th1, *th2; - int size1, size2, avail; - struct sk_buff *buff = skb->next; - - th1 = skb->h.th; - - if (th1->urg) - return -1; + struct sk_buff *next_skb = skb->next; - avail = skb_tailroom(skb); + /* The first test we must make is that neither of these two + * SKB's are still referenced by someone else. + */ + if(!skb_cloned(skb) && !skb_cloned(next_skb)) { + int skb_size = skb->len, next_skb_size = next_skb->len; + u16 flags = TCP_SKB_CB(skb)->flags; - /* Size of TCP payload. */ - size1 = skb->tail - ((u8 *) (th1)+(th1->doff<<2)); + /* Punt if the first SKB has URG set. */ + if(flags & TCPCB_FLAG_URG) + return; - th2 = buff->h.th; - size2 = buff->tail - ((u8 *) (th2)+(th2->doff<<2)); + /* Also punt if next skb has been SACK'd. */ + if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED) + return; - if (size2 > avail || size1 + size2 > sk->mss ) - return -1; + /* Punt if not enough space exists in the first SKB for + * the data in the second, or the total combined payload + * would exceed the MSS. + */ + if ((next_skb_size > skb_tailroom(skb)) || + ((skb_size + next_skb_size) > mss_now)) + return; - /* Ok. We will be able to collapse the packet. */ - skb_unlink(buff); - memcpy(skb_put(skb, size2), ((char *) th2) + (th2->doff << 2), size2); - - /* Update sizes on original skb, both TCP and IP. */ - skb->end_seq += buff->end_seq - buff->seq; - if (th2->urg) { - th1->urg = 1; - th1->urg_ptr = th2->urg_ptr + size1; - } - if (th2->fin) - th1->fin = 1; + /* Ok. We will be able to collapse the packet. */ + skb_unlink(next_skb); - /* ... and off you go. */ - kfree_skb(buff); - tp->packets_out--; + if(skb->len % 4) { + /* Must copy and rechecksum all data. */ + memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); + skb->csum = csum_partial(skb->data, skb->len, 0); + } else { + /* Optimize, actually we could also combine next_skb->csum + * to skb->csum using a single add w/carry operation too. + */ + skb->csum = csum_partial_copy(next_skb->data, + skb_put(skb, next_skb_size), + next_skb_size, skb->csum); + } + + /* Update sequence range on original skb. */ + skb->end_seq += next_skb->end_seq - next_skb->seq; + + /* Merge over control information. */ + flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */ + if(flags & TCPCB_FLAG_URG) { + u16 urgptr = TCP_SKB_CB(next_skb)->urg_ptr; + TCP_SKB_CB(skb)->urg_ptr = urgptr + skb_size; + } + TCP_SKB_CB(skb)->flags = flags; - /* Header checksum will be set by the retransmit procedure - * after calling rebuild header. - */ - th1->check = 0; - skb->csum = csum_partial((u8*)(th1)+(th1->doff<<2), size1 + size2, 0); - return 0; + /* All done, get rid of second SKB and account for it so + * packet counting does not break. 
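The collapse code above recomputes the checksum when the first buffer's length is not word-aligned, and the comment notes the cheaper alternative: the Internet checksum of a concatenation is the end-around-carry sum of the two partial checksums, which holds whenever the second piece starts at an even offset. A small demonstration (simplified csum_partial; not the kernel's optimized assembler):

#include <stdio.h>
#include <stdint.h>

/* End-around-carry (ones-complement) addition of two partial sums:
 * the "single add w/carry" the comment alludes to. */
static uint32_t csum_add(uint32_t a, uint32_t b)
{
    uint64_t s = (uint64_t)a + b;
    return (uint32_t)((s & 0xffffffffu) + (s >> 32));
}

/* Naive 16-bit-word Internet checksum accumulator. */
static uint32_t csum_partial(const uint8_t *p, int len, uint32_t sum)
{
    uint64_t s = sum;
    while (len > 1) { s += (uint32_t)(p[0] << 8 | p[1]); p += 2; len -= 2; }
    if (len) s += (uint32_t)(p[0] << 8);
    while (s >> 32) s = (s & 0xffffffffu) + (s >> 32);
    return (uint32_t)s;
}

int main(void)
{
    uint8_t data[] = "abcdefgh";    /* 8 bytes, split 4+4 at an even offset */
    uint32_t whole = csum_partial(data, 8, 0);
    uint32_t parts = csum_add(csum_partial(data, 4, 0),
                              csum_partial(data + 4, 4, 0));
    printf("whole=%#x combined=%#x\n", (unsigned)whole, (unsigned)parts);
    return 0;
}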
*/ + kfree_skb(next_skb); + sk->tp_pinfo.af_tcp.packets_out--; + } } /* Do a simple retransmit without using the backoff mechanisms in * tcp_timer. This is used to speed up path mtu recovery. Note that - * these simple retransmit aren't counted in the usual tcp retransmit + * these simple retransmits aren't counted in the usual tcp retransmit * backoff counters. * The socket is already locked here. */ @@ -480,114 +441,114 @@ void tcp_simple_retransmit(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - /* Clear delay ack timer. */ - tcp_clear_xmit_timer(sk, TIME_DACK); - - tp->retrans_head = NULL; /* Don't muck with the congestion window here. */ tp->dup_acks = 0; tp->high_seq = tp->snd_nxt; + /* FIXME: make the current rtt sample invalid */ - tcp_do_retransmit(sk, 0); + tp->retrans_head = NULL; + tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); } +static __inline__ void update_retrans_head(struct sock *sk) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + tp->retrans_head = tp->retrans_head->next; + if((tp->retrans_head == tp->send_head) || + (tp->retrans_head == (struct sk_buff *) &sk->write_queue)) + tp->retrans_head = NULL; +} -/* - * A socket has timed out on its send queue and wants to do a - * little retransmitting. - * retrans_head can be different from the head of the write_queue - * if we are doing fast retransmit. - */ +/* This retransmits one SKB. Policy decisions and retransmit queue + * state updates are done by the caller. Returns non-zero if an + * error occurred which prevented the send. + */ -void tcp_do_retransmit(struct sock *sk, int all) +int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) { - struct sk_buff * skb; - int ct=0; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int current_mss = sk->mss; - if (tp->retrans_head == NULL) - tp->retrans_head = skb_peek(&sk->write_queue); - - if (tp->retrans_head == tp->send_head) - tp->retrans_head = NULL; - - while ((skb = tp->retrans_head) != NULL) { - struct sk_buff *buff; - struct tcphdr *th; - int tcp_size; - int size; - - /* In general it's OK just to use the old packet. However we - * need to use the current ack and window fields. Urg and - * urg_ptr could possibly stand to be updated as well, but we - * don't keep the necessary data. That shouldn't be a problem, - * if the other end is doing the right thing. Since we're - * changing the packet, we have to issue a new IP identifier. - */ + /* Account for outgoing SACKS and IP options, if any. */ + if(tp->sack_ok && tp->num_sacks) + current_mss -= (TCPOLEN_SACK_BASE_ALIGNED + + (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); + if(sk->opt && sk->opt->optlen) + current_mss -= sk->opt->optlen; - th = skb->h.th; + if(skb->len > current_mss) { + if(tcp_fragment(sk, skb, current_mss)) + return 1; /* We'll try again later. */ - tcp_size = skb->tail - ((unsigned char *)(th)+tp->tcp_header_len); + /* New SKB created, account for it. */ + tp->packets_out++; + } - if (tcp_size > sk->mss) { - if (tcp_fragment(sk, skb, sk->mss)) { - printk(KERN_DEBUG "tcp_fragment failed\n"); - return; - } - tp->packets_out++; - } + /* Collapse two adjacent packets if worthwhile and we can.
*/ + if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && + (skb->len < (current_mss >> 1)) && + (skb->next != tp->send_head) && + (skb->next != (struct sk_buff *)&sk->write_queue)) + tcp_retrans_try_collapse(sk, skb, current_mss); - if (!th->syn && - tcp_size < (sk->mss >> 1) && - skb->next != tp->send_head && - skb->next != (struct sk_buff *)&sk->write_queue) - tcp_retrans_try_collapse(sk, skb); - - if (tp->af_specific->rebuild_header(sk, skb)) { -#ifdef TCP_DEBUG - printk(KERN_DEBUG "tcp_do_rebuild_header failed\n"); -#endif - break; - } + if(tp->af_specific->rebuild_header(sk)) + return 1; /* Routing failure or similar. */ - SOCK_DEBUG(sk, "retransmit sending seq=%x\n", skb->seq); + /* Ok, we're gonna send it out, update state. */ + TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_RETRANS; - /* Update ack and window. */ - tp->last_ack_sent = th->ack_seq = htonl(tp->rcv_nxt); - th->window = ntohs(tcp_select_window(sk)); - tcp_update_options((__u32 *)(th+1),tp); + /* Make a copy, if the first transmission SKB clone we made + * is still in somebody's hands, else make a clone. + */ + skb->when = jiffies; + if(skb_cloned(skb)) + skb = skb_copy(skb, GFP_ATOMIC); + else + skb = skb_clone(skb, GFP_ATOMIC); + tcp_transmit_skb(sk, skb); - size = skb->tail - (unsigned char *) th; - tp->af_specific->send_check(sk, th, size, skb); + /* Update global TCP statistics and return success. */ + sk->prot->retransmits++; + tcp_statistics.TcpRetransSegs++; - skb->when = jiffies; + return 0; +} - buff = skb_clone(skb, GFP_ATOMIC); - if (buff == NULL) - break; +/* This gets called after a retransmit timeout, and the initially + * retransmitted data is acknowledged. It tries to continue + * resending the rest of the retransmit queue, until either + * we've sent it all or the congestion window limit is reached. + */ +void tcp_xmit_retransmit_queue(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb; + int ct = 0; - skb_set_owner_w(buff, sk); + if (tp->retrans_head == NULL) + tp->retrans_head = skb_peek(&sk->write_queue); + if (tp->retrans_head == tp->send_head) + tp->retrans_head = NULL; - clear_delayed_acks(sk); - tp->af_specific->queue_xmit(buff); + while ((skb = tp->retrans_head) != NULL) { + /* If it has been ack'd by a SACK block, we don't + * retransmit it. + */ + if(!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { + /* Send it out, punt if error occurred. */ + if(tcp_retransmit_skb(sk, skb)) + break; - /* Count retransmissions. */ - ct++; - sk->prot->retransmits++; - tcp_statistics.TcpRetransSegs++; - - /* Only one retransmit requested. */ - if (!all) - break; - - /* This should cut it off before we send too many packets. */ - if (ct >= tp->snd_cwnd) - break; - - /* Advance the pointer. */ - tp->retrans_head = skb->next; - if ((tp->retrans_head == tp->send_head) || - (tp->retrans_head == (struct sk_buff *) &sk->write_queue)) - tp->retrans_head = NULL; + /* Count retransmissions locally. */ + ct++; + + /* Stop retransmitting if we've hit the congestion + * window limit. + */ + if (ct >= tp->snd_cwnd) + break; + } + update_retrans_head(sk); } } @@ -597,83 +558,44 @@ void tcp_do_retransmit(struct sock *sk, int all) void tcp_send_fin(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb = skb_peek_tail(&sk->write_queue); + int mss_now = sk->mss; /* Optimization, tack on the FIN if we have a queue of - * unsent frames. + * unsent frames. But be careful about outgoing SACKS + * and IP options.
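tcp_xmit_retransmit_queue() above walks from retrans_head, skips anything a SACK block already covers, and stops once a congestion window's worth of retransmissions is in flight. A toy, array-based rendering of the same control flow (hypothetical seg type and walk() helper):

#include <stdio.h>

#define SACKED_ACKED 0x1

/* Toy retransmit-queue walk: skip segments a SACK already covers,
 * respect the congestion window, and keep a persistent head index so
 * a later call resumes where this one stopped. */
struct seg { unsigned seq; int sacked; };

static int walk(struct seg *q, int n, int *head, int cwnd)
{
    int sent = 0;
    while (*head < n) {
        struct seg *s = &q[*head];
        if (!(s->sacked & SACKED_ACKED)) {
            printf("retransmit seq=%u\n", s->seq);
            if (++sent >= cwnd)     /* congestion window limit reached */
                break;
        }
        (*head)++;                  /* update_retrans_head() equivalent */
    }
    return sent;
}

int main(void)
{
    struct seg q[] = { {1000,0}, {2460,SACKED_ACKED}, {3920,0}, {5380,0} };
    int head = 0;
    walk(q, 4, &head, 2);   /* retransmits 1000 and 3920, skips the SACKed one */
    return 0;
}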
*/ - if(tp->send_head != NULL) { - struct sk_buff *tail = skb_peek_tail(&sk->write_queue); - struct tcphdr *th = tail->h.th; - int data_len; - - /* Unfortunately tcp_write_xmit won't check for going over - * the MSS due to the FIN sequence number, so we have to - * watch out for it here. - */ - data_len = (tail->tail - (((unsigned char *)th)+tp->tcp_header_len)); - if(data_len >= sk->mss) - goto build_new_frame; /* ho hum... */ - - /* tcp_write_xmit() will checksum the header etc. for us. */ - th->fin = 1; - tail->end_seq++; + if(tp->sack_ok && tp->num_sacks) + mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + + (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); + if(sk->opt && sk->opt->optlen) + mss_now -= sk->opt->optlen; + if((tp->send_head != NULL) && (skb->len < mss_now)) { + /* tcp_write_xmit() takes care of the rest. */ + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; + skb->end_seq++; + tp->write_seq++; } else { - struct sk_buff *buff; - struct tcphdr *th; - -build_new_frame: - buff = sock_wmalloc(sk, - (BASE_ACK_SIZE + tp->tcp_header_len + - sizeof(struct sk_buff)), - 1, GFP_KERNEL); - if (buff == NULL) { - /* We can only fail due to low memory situations, not - * due to going over our sndbuf limits (due to the - * force flag passed to sock_wmalloc). So just keep - * trying. We cannot allow this fail. The socket is - * still locked, so we need not check if the connection - * was reset in the meantime etc. - */ - goto build_new_frame; - } - - /* Administrivia. */ - buff->csum = 0; - - /* Put in the IP header and routing stuff. - * - * FIXME: - * We can fail if the interface for the route - * this socket takes goes down right before - * we get here. ANK is there a way to point - * this into a "black hole" route in such a - * case? Ideally, we should still be able to - * queue this and let the retransmit timer - * keep trying until the destination becomes - * reachable once more. -DaveM - */ - if(tp->af_specific->build_net_header(sk, buff) < 0) { - kfree_skb(buff); - goto update_write_seq; - } - th = (struct tcphdr *) skb_put(buff, tp->tcp_header_len); - buff->h.th = th; - - memcpy(th, (void *) &(sk->dummy_th), sizeof(*th)); - th->seq = htonl(tp->write_seq); - th->fin = 1; - tcp_build_options((__u32 *)(th + 1), tp); - - /* This makes sure we do things like abide by the congestion - * window and other constraints which prevent us from sending. - */ - tcp_send_skb(sk, buff, 0); + /* Socket is locked, keep trying until memory is available. */ + do { + skb = sock_wmalloc(sk, + (MAX_HEADER + + sk->prot->max_header), + 1, GFP_KERNEL); + } while (skb == NULL); + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb->csum = 0; + TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); + TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->urg_ptr = 0; + + /* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */ + skb->seq = tp->write_seq; + skb->end_seq = skb->seq + 1; + tcp_send_skb(sk, skb, 0); } -update_write_seq: - /* So that we recognize the ACK coming back for - * this FIN as being legitimate. - */ - tp->write_seq++; } /* We get here when a process closes a file descriptor (either due to @@ -685,109 +607,218 @@ void tcp_send_active_reset(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; - struct tcphdr *th; -again: /* NOTE: No TCP options attached and we never retransmit this. 
*/ - skb = sock_wmalloc(sk, (BASE_ACK_SIZE + sizeof(*th)), 1, GFP_KERNEL); - if(skb == NULL) - goto again; + do { + skb = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_KERNEL); + } while(skb == NULL); + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, MAX_HEADER + sk->prot->max_header); skb->csum = 0; - if(tp->af_specific->build_net_header(sk, skb) < 0) { - kfree_skb(skb); - } else { - th = (struct tcphdr *) skb_put(skb, sizeof(*th)); - memcpy(th, &(sk->dummy_th), sizeof(*th)); - th->seq = htonl(tp->write_seq); - th->rst = 1; - th->doff = sizeof(*th) / 4; - tp->last_ack_sent = tp->rcv_nxt; - th->ack_seq = htonl(tp->rcv_nxt); - th->window = htons(tcp_select_window(sk)); - tp->af_specific->send_check(sk, th, sizeof(*th), skb); - tp->af_specific->queue_xmit(skb); - tcp_statistics.TcpOutSegs++; - tcp_statistics.TcpOutRsts++; - } + TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST); + TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->urg_ptr = 0; + + /* Send it off. */ + skb->seq = tp->write_seq; + skb->end_seq = skb->seq; + skb->when = jiffies; + tcp_transmit_skb(sk, skb); } /* WARNING: This routine must only be called when we have already sent * a SYN packet that crossed the incoming SYN that caused this routine * to get called. If this assumption fails then the initial rcv_wnd * and rcv_wscale values will not be correct. - * - * XXX When you have time Dave, redo this to use tcp_send_skb() just - * XXX like tcp_send_fin() above now does.... -DaveM */ int tcp_send_synack(struct sock *sk) { - struct tcp_opt * tp = &(sk->tp_pinfo.af_tcp); - struct sk_buff * skb; - struct sk_buff * buff; - struct tcphdr *th; - int tmp; + struct tcp_opt* tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff* skb; - skb = sock_wmalloc(sk, MAX_SYN_SIZE + sizeof(struct sk_buff), 1, GFP_ATOMIC); + skb = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header), + 1, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; - tmp = tp->af_specific->build_net_header(sk, skb); - if (tmp < 0) { - kfree_skb(skb); - return tmp; + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + skb->csum = 0; + TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_SYN); + TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->urg_ptr = 0; + + /* SYN eats a sequence byte. */ + skb->seq = tp->snd_una; + skb->end_seq = skb->seq + 1; + skb_queue_tail(&sk->write_queue, skb); + skb->when = jiffies; + tp->packets_out++; + tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + return 0; +} + +struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, + struct open_request *req, int mss) +{ + struct tcphdr *th; + int tcp_header_size; + struct sk_buff *skb; + + skb = sock_wmalloc(sk, MAX_HEADER + sk->prot->max_header, 1, GFP_ATOMIC); + if (skb == NULL) + return NULL; + + /* Reserve space for headers. */ + skb_reserve(skb, MAX_HEADER + sk->prot->max_header); + + skb->dst = dst_clone(dst); + + if (sk->user_mss) + mss = min(mss, sk->user_mss); + if (req->tstamp_ok) + mss -= TCPOLEN_TSTAMP_ALIGNED; + else + req->mss += TCPOLEN_TSTAMP_ALIGNED; + + /* Don't offer more than they did. + * This way we don't have to memorize who said what. + * FIXME: maybe this should be changed for better performance + * with syncookies. 
+ */ + req->mss = min(mss, req->mss); + if (req->mss < 1) { + printk(KERN_DEBUG "initial req->mss below 1\n"); + req->mss = 1; } - th =(struct tcphdr *) skb_put(skb, sizeof(struct tcphdr)); - skb->h.th = th; - memset(th, 0, sizeof(struct tcphdr)); + tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS + + (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) + + (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) + + /* SACK_PERM is in the place of NOP NOP of TS */ + ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0)); + skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size); + memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; + th->source = sk->sport; + th->dest = req->rmt_port; + skb->seq = req->snt_isn; + skb->end_seq = skb->seq + 1; + th->seq = htonl(skb->seq); + th->ack_seq = htonl(req->rcv_isn + 1); + if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ + __u8 rcv_wscale; + /* Set this up on the first call only */ + req->window_clamp = skb->dst->window; + tcp_select_initial_window(sock_rspace(sk)/2,req->mss, + &req->rcv_wnd, + &req->window_clamp, + req->wscale_ok, + &rcv_wscale); + req->rcv_wscale = rcv_wscale; + } + th->window = htons(req->rcv_wnd); - th->source = sk->dummy_th.source; - th->dest = sk->dummy_th.dest; - - skb->seq = tp->snd_una; - skb->end_seq = skb->seq + 1 /* th->syn */ ; - th->seq = ntohl(skb->seq); + skb->when = jiffies; + tcp_syn_build_options((__u32 *)(th + 1), req->mss, req->tstamp_ok, + req->sack_ok, req->wscale_ok, req->rcv_wscale, + skb->when); - /* This is a resend of a previous SYN, now with an ACK. - * we must reuse the previously offered window. - */ - th->window = htons(tp->rcv_wnd); + skb->csum = 0; + th->doff = (tcp_header_size >> 2); + tcp_statistics.TcpOutSegs++; + return skb; +} - tp->last_ack_sent = th->ack_seq = htonl(tp->rcv_nxt); +void tcp_connect(struct sock *sk, struct sk_buff *buff, int mss) +{ + struct dst_entry *dst = sk->dst_cache; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - tmp = tcp_syn_build_options(skb, sk->mss, - tp->tstamp_ok, tp->wscale_ok, tp->rcv_wscale); - skb->csum = 0; - th->doff = (sizeof(*th) + tmp)>>2; + /* Reserve space for headers. */ + skb_reserve(buff, MAX_HEADER + sk->prot->max_header); - tp->af_specific->send_check(sk, th, sizeof(*th)+tmp, skb); + if (sk->priority == 0) + sk->priority = dst->priority; - skb_queue_tail(&sk->write_queue, skb); + tp->snd_wnd = 0; + tp->snd_wl1 = 0; + tp->snd_wl2 = tp->write_seq; + tp->snd_una = tp->write_seq; + tp->rcv_nxt = 0; + + sk->err = 0; - buff = skb_clone(skb, GFP_ATOMIC); - if (buff) { - skb_set_owner_w(buff, sk); + /* We'll fix this up when we get a response from the other end. + * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. + */ + tp->tcp_header_len = sizeof(struct tcphdr) + + (sysctl_tcp_timestamps ? 
TCPOLEN_TSTAMP_ALIGNED : 0);
 
-		tp->packets_out++;
-		skb->when = jiffies;
+	mss -= tp->tcp_header_len;
 
-		tp->af_specific->queue_xmit(buff);
-		tcp_statistics.TcpOutSegs++;
+	if (sk->user_mss)
+		mss = min(mss, sk->user_mss);
 
-		tcp_reset_xmit_timer(sk, TIME_RETRANS, TCP_TIMEOUT_INIT);
+	if (mss < 1) {
+		printk(KERN_DEBUG "initial sk->mss below 1\n");
+		mss = 1; /* Sanity limit */
 	}
-	return 0;
+
+	sk->mss = mss;
+
+	TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
+	TCP_SKB_CB(buff)->sacked = 0;
+	TCP_SKB_CB(buff)->urg_ptr = 0;
+	buff->csum = 0;
+	buff->seq = tp->write_seq++;
+	buff->end_seq = tp->write_seq;
+	tp->snd_nxt = buff->end_seq;
+
+	tp->window_clamp = dst->window;
+	tcp_select_initial_window(sock_rspace(sk)/2,sk->mss,
+		&tp->rcv_wnd,
+		&tp->window_clamp,
+		sysctl_tcp_window_scaling,
+		&tp->rcv_wscale);
+
+	/* Ok, now lock the socket before we make it visible to
+	 * the incoming packet engine.
+	 */
+	lock_sock(sk);
+
+	/* Socket identity change complete, no longer
+	 * in TCP_CLOSE, so enter ourselves into the
+	 * hash tables.
+	 */
+	tcp_set_state(sk,TCP_SYN_SENT);
+	sk->prot->hash(sk);
+
+	tp->rto = dst->rtt;
+	tcp_init_xmit_timers(sk);
+	tp->retransmits = 0;
+
+	/* Send it off. */
+	skb_queue_tail(&sk->write_queue, buff);
+	buff->when = jiffies;
+	tp->packets_out++;
+	tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
+	tcp_statistics.TcpActiveOpens++;
+
+	/* Timer for repeating the SYN until an answer. */
+	tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+
+	/* Now, it is safe to release the socket. */
+	release_sock(sk);
 }
 
-/*
- * Send out a delayed ack, the caller does the policy checking
+/* Send out a delayed ack, the caller does the policy checking
  * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
  * for details.
  */
-
 void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout)
 {
 	unsigned long timeout;
@@ -799,169 +830,120 @@ void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout)
 	timeout += jiffies;
 
 	/* Use new timeout only if there wasn't an older one earlier. */
-	if ((!tp->delack_timer.prev || !del_timer(&tp->delack_timer)) ||
-	    (timeout < tp->delack_timer.expires))
+	if (!tp->delack_timer.prev) {
 		tp->delack_timer.expires = timeout;
-
-	add_timer(&tp->delack_timer);
+		add_timer(&tp->delack_timer);
+	} else {
+		if (timeout < tp->delack_timer.expires)
+			mod_timer(&tp->delack_timer, timeout);
+	}
 }
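The tcp_send_delayed_ack() rewrite above encodes one rule: an unarmed timer is simply started, while a pending one may only be moved earlier, never pushed back. A small stand-alone sketch of that decision, with simplified timer primitives that merely model the kernel's (the struct and helpers are assumptions of this sketch):

#include <stdio.h>

/* Toy timer; `pending` stands in for the kernel's timer.prev test. */
struct toy_timer {
	int pending;
	unsigned long expires;
};

static void add_timer(struct toy_timer *t, unsigned long expires)
{
	t->expires = expires;
	t->pending = 1;
}

static void mod_timer(struct toy_timer *t, unsigned long expires)
{
	t->expires = expires;
	t->pending = 1;
}

/* Arm the delayed-ACK timer the way the rewritten function does:
 * a new deadline only ever moves the timer earlier.
 */
static void arm_delack(struct toy_timer *t, unsigned long timeout)
{
	if (!t->pending)
		add_timer(t, timeout);
	else if (timeout < t->expires)
		mod_timer(t, timeout);
}

int main(void)
{
	struct toy_timer delack = { 0, 0 };

	arm_delack(&delack, 500);	/* first arm */
	arm_delack(&delack, 700);	/* later deadline: ignored */
	arm_delack(&delack, 300);	/* earlier deadline: timer moves up */
	printf("expires at %lu\n", delack.expires);	/* prints 300 */
	return 0;
}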
-
-
-/*
- *	This routine sends an ack and also updates the window.
- */
-
+/* This routine sends an ack and also updates the window. */
 void tcp_send_ack(struct sock *sk)
 {
-	struct sk_buff *buff;
-	struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
-	struct tcphdr *th;
-	int tmp;
-
-	if(sk->zapped)
-		return;	/* We have been reset, we may not send again. */
+	/* If we have been reset, we may not send again. */
+	if(!sk->zapped) {
+		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+		struct sk_buff *buff;
 
-	/* We need to grab some memory, and put together an ack,
-	 * and then put it into the queue to be sent.
-	 */
-	buff = sock_wmalloc(sk, BASE_ACK_SIZE + tp->tcp_header_len, 1, GFP_ATOMIC);
-	if (buff == NULL) {
-		/* Force it to send an ack. We don't have to do this
-		 * (ACK is unreliable) but it's much better use of
-		 * bandwidth on slow links to send a spare ack than
-		 * resend packets.
+		/* We are not putting this on the write queue, so
+		 * tcp_transmit_skb() will set the ownership to this
+		 * sock.
 		 */
-		tcp_send_delayed_ack(tp, HZ/2);
-		return;
-	}
-
-	clear_delayed_acks(sk);
-
-	/* Assemble a suitable TCP frame.
	 */
-	buff->csum = 0;
+		buff = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_ATOMIC);
+		if (buff == NULL) {
+			/* Force it to send an ack. We don't have to do this
+			 * (ACK is unreliable) but it's much better use of
+			 * bandwidth on slow links to send a spare ack than
+			 * resend packets.
+			 */
+			tcp_send_delayed_ack(tp, HZ/2);
+			return;
+		}
 
-	/* Put in the IP header and routing stuff. */
-	tmp = tp->af_specific->build_net_header(sk, buff);
-	if (tmp < 0) {
-		kfree_skb(buff);
-		return;
+		/* Reserve space for headers and prepare control bits. */
+		skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
+		buff->csum = 0;
+		TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
+		TCP_SKB_CB(buff)->sacked = 0;
+		TCP_SKB_CB(buff)->urg_ptr = 0;
+
+		/* Send it off, this clears delayed acks for us. */
+		buff->seq = buff->end_seq = tp->snd_nxt;
+		buff->when = jiffies;
+		tcp_transmit_skb(sk, buff);
 	}
-
-	th = (struct tcphdr *)skb_put(buff,tp->tcp_header_len);
-	memcpy(th, &sk->dummy_th, sizeof(struct tcphdr));
-
-	/* Swap the send and the receive. */
-	th->window = ntohs(tcp_select_window(sk));
-	th->seq = ntohl(tp->snd_nxt);
-	tp->last_ack_sent = tp->rcv_nxt;
-	th->ack_seq = htonl(tp->rcv_nxt);
-	tcp_build_and_update_options((__u32 *)(th + 1), tp);
-
-	/* Fill in the packet and send it. */
-	tp->af_specific->send_check(sk, th, tp->tcp_header_len, buff);
-	tp->af_specific->queue_xmit(buff);
-	tcp_statistics.TcpOutSegs++;
 }
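The rewritten tcp_send_ack() above builds a zero-length segment with seq == end_seq == snd_nxt and hands it straight to tcp_transmit_skb() without queueing it. The sketch below illustrates why that is safe: a segment's claim on retransmission state is the sequence space it consumes, and a pure ACK consumes none, while SYN and FIN each eat one byte (as the tcp_send_fin() and tcp_send_synack() hunks note). The struct here is illustrative, not the kernel's sk_buff.

#include <stdio.h>

/* Toy segment descriptor; an assumption of this sketch. */
struct seg {
	unsigned long seq;
	unsigned long end_seq;
};

/* Sequence space a segment consumes: its data plus one byte for SYN or
 * FIN. A pure ACK built like tcp_send_ack() above, with
 * seq == end_seq == snd_nxt, consumes none, which is why it can be sent
 * without ever being queued for retransmission.
 */
static unsigned long seq_cost(const struct seg *s)
{
	return s->end_seq - s->seq;
}

int main(void)
{
	unsigned long snd_nxt = 1000;
	struct seg ack = { snd_nxt, snd_nxt };		/* bare ACK */
	struct seg fin = { snd_nxt, snd_nxt + 1 };	/* FIN eats a byte */

	printf("ack consumes %lu, fin consumes %lu\n",
	       seq_cost(&ack), seq_cost(&fin));	/* 0 and 1 */
	return 0;
}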
-/*
- *	This routine sends a packet with an out of date sequence
- *	number. It assumes the other end will try to ack it.
+/* This routine sends a packet with an out of date sequence
+ * number. It assumes the other end will try to ack it.
  */
-
 void tcp_write_wakeup(struct sock *sk)
 {
-	struct sk_buff *buff, *skb;
-	struct tcphdr *t1;
-	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
-	int tmp;
-
-	if (sk->zapped)
-		return;	/* After a valid reset we can send no more. */
-
-	/* Write data can still be transmitted/retransmitted in the
-	 * following states. If any other state is encountered, return.
-	 * [listen/close will never occur here anyway]
-	 */
-	if ((1 << sk->state) &
-	    ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|TCPF_LAST_ACK|TCPF_CLOSING))
-		return;
-
-	if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) && (skb=tp->send_head)) {
-		struct tcphdr *th;
-		unsigned long win_size;
-
-		/* We are probing the opening of a window
-		 * but the window size is != 0
-		 * must have been a result SWS avoidance ( sender )
-		 */
-		win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
-		if (win_size < skb->end_seq - skb->seq) {
-			if (tcp_fragment(sk, skb, win_size)) {
-				printk(KERN_DEBUG "tcp_write_wakeup: "
-				       "fragment failed\n");
-				return;
-			}
-		}
-
-		th = skb->h.th;
-		tcp_update_options((__u32 *)(th + 1), tp);
-		tp->af_specific->send_check(sk, th, th->doff * 4 + win_size, skb);
-		buff = skb_clone(skb, GFP_ATOMIC);
-		if (buff == NULL)
+	/* After a valid reset we can send no more. */
+	if (!sk->zapped) {
+		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+		struct sk_buff *skb;
+
+		/* Write data can still be transmitted/retransmitted in the
+		 * following states. If any other state is encountered, return.
+		 * [listen/close will never occur here anyway]
+		 */
+		if ((1 << sk->state) &
+		    ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
+		      TCPF_LAST_ACK|TCPF_CLOSING))
 			return;
 
-		skb_set_owner_w(buff, sk);
-		tp->packets_out++;
-
-		clear_delayed_acks(sk);
+		if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) &&
+		    ((skb = tp->send_head) != NULL)) {
+			unsigned long win_size;
 
-		if (!tcp_timer_is_set(sk, TIME_RETRANS))
-			tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
-
-		skb->when = jiffies;
-		update_send_head(sk);
-		tp->snd_nxt = skb->end_seq;
-	} else {
-		buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC);
-		if (buff == NULL)
-			return;
+			/* We are probing the opening of a window
+			 * but the window size is != 0
+			 * must have been a result of SWS avoidance (sender)
+			 */
+			win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
+			if (win_size < skb->end_seq - skb->seq) {
+				if (tcp_fragment(sk, skb, win_size))
+					return; /* Let a retransmit get it. */
+			}
+			update_send_head(sk);
+			skb->when = jiffies;
+			tp->snd_nxt = skb->end_seq;
+			tp->packets_out++;
+			tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+			if (!tcp_timer_is_set(sk, TIME_RETRANS))
+				tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+		} else {
+			/* We don't queue it, tcp_transmit_skb() sets ownership. */
+			skb = alloc_skb(MAX_HEADER + sk->prot->max_header,
+					GFP_ATOMIC);
+			if (skb == NULL)
+				return;
 
-		buff->csum = 0;
+			/* Reserve space for headers and set control bits. */
+			skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+			skb->csum = 0;
+			TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
+			TCP_SKB_CB(skb)->sacked = 0;
+			TCP_SKB_CB(skb)->urg_ptr = 0;
 
-		/* Put in the IP header and routing stuff. */
-		tmp = tp->af_specific->build_net_header(sk, buff);
-		if (tmp < 0) {
-			kfree_skb(buff);
-			return;
+			/* Use a previous sequence. This should cause the other
+			 * end to send an ack. Don't queue or clone SKB, just
+			 * send it.
+			 */
+			skb->seq = tp->snd_nxt - 1;
+			skb->end_seq = skb->seq;
+			skb->when = jiffies;
+			tcp_transmit_skb(sk, skb);
 		}
-
-		t1 = (struct tcphdr *) skb_put(buff, tp->tcp_header_len);
-		memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1));
-
-		/* Use a previous sequence.
-		 * This should cause the other end to send an ack.
-		 */
-
-		t1->seq = htonl(tp->snd_nxt-1);
-		t1->ack_seq = htonl(tp->rcv_nxt);
-		t1->window = htons(tcp_select_window(sk));
-		tcp_build_and_update_options((__u32 *)(t1 + 1), tp);
-
-		tp->af_specific->send_check(sk, t1, tp->tcp_header_len, buff);
 	}
-
-	/* Send it. */
-	tp->af_specific->queue_xmit(buff);
-	tcp_statistics.TcpOutSegs++;
 }
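tcp_write_wakeup() above now has exactly two outcomes: if the peer's window is partly open, transmit real data clipped to the remaining window (fragmenting first if the head segment is too long); if the window is shut, emit a zero-length segment carrying an already-acknowledged sequence number, forcing the peer to reply with an ACK that re-advertises its window. A toy model of that decision, with an invented struct conn standing in for the kernel's tcp_opt:

#include <stdio.h>

/* Field names follow the patch (snd_una, snd_nxt, snd_wnd); the struct
 * itself is an assumption of this sketch.
 */
struct conn {
	unsigned long snd_una;	/* oldest unacknowledged sequence */
	unsigned long snd_nxt;	/* next sequence to send */
	unsigned long snd_wnd;	/* peer's advertised window */
};

/* Mirror the branch structure of tcp_write_wakeup() above. */
static void write_wakeup(const struct conn *c, unsigned long queued)
{
	if (c->snd_nxt < c->snd_una + c->snd_wnd && queued > 0) {
		unsigned long win = c->snd_wnd - (c->snd_nxt - c->snd_una);

		printf("send %lu data bytes\n", queued < win ? queued : win);
	} else {
		/* Probe with an old sequence, like skb->seq = snd_nxt - 1. */
		printf("send probe with seq %lu\n", c->snd_nxt - 1);
	}
}

int main(void)
{
	struct conn open_win = { 1000, 1200, 500 };
	struct conn zero_win = { 1000, 1000, 0 };

	write_wakeup(&open_win, 800);	/* window open: sends 300 bytes */
	write_wakeup(&zero_win, 800);	/* window shut: probes with seq 999 */
	return 0;
}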
-/*
- * A window probe timeout has occurred.
- * If window is not closed send a partial packet
- * else a zero probe.
+/* A window probe timeout has occurred. If window is not closed send
+ * a partial packet else a zero probe.
  */
-
 void tcp_send_probe0(struct sock *sk)
 {
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index fdf8f50ec..54380b07d 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -77,11 +77,6 @@ void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
 {
 	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 
-	if((long)when <= 0) {
-		printk(KERN_DEBUG "xmit_timer <= 0 - timer:%d when:%lx\n", what, when);
-		when=HZ/50;
-	}
-
 	switch (what) {
 	case TIME_RETRANS:
 		/* When setting the transmit timer the probe timer
@@ -91,24 +86,15 @@ void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
 		 */
 		if(tp->probe_timer.prev)
 			del_timer(&tp->probe_timer);
-		if(tp->retransmit_timer.prev)
-			del_timer(&tp->retransmit_timer);
-		tp->retransmit_timer.expires=jiffies+when;
-		add_timer(&tp->retransmit_timer);
+		mod_timer(&tp->retransmit_timer, jiffies+when);
 		break;
 
 	case TIME_DACK:
-		if(tp->delack_timer.prev)
-			del_timer(&tp->delack_timer);
-		tp->delack_timer.expires=jiffies+when;
-		add_timer(&tp->delack_timer);
+		mod_timer(&tp->delack_timer, jiffies+when);
 		break;
 
 	case TIME_PROBE0:
-		if(tp->probe_timer.prev)
-			del_timer(&tp->probe_timer);
-		tp->probe_timer.expires=jiffies+when;
-		add_timer(&tp->probe_timer);
+		mod_timer(&tp->probe_timer, jiffies+when);
 		break;
 
 	case TIME_WRITE:
@@ -150,17 +136,12 @@ static int tcp_write_err(struct sock *sk, int force)
 	return 1;
 }
 
-/*
- * A write timeout has occurred. Process the after effects. BROKEN (badly)
- */
-
+/* A write timeout has occurred. Process the after effects. */
 static int tcp_write_timeout(struct sock *sk)
 {
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 
-	/*
-	 * Look for a 'soft' timeout.
-	 */
+	/* Look for a 'soft' timeout. */
 	if ((sk->state == TCP_ESTABLISHED &&
 	     tp->retransmits && (tp->retransmits % TCP_QUICK_TRIES) == 0) ||
 	    (sk->state != TCP_ESTABLISHED && tp->retransmits > sysctl_tcp_retries1)) {
@@ -206,11 +187,10 @@ void tcp_probe_timer(unsigned long data)
 		return;
 	}
 
-	/*
-	 * *WARNING* RFC 1122 forbids this
-	 * It doesn't AFAIK, because we kill the retransmit timer -AK
-	 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
-	 * this behaviour in Solaris down as a bug fix. [AC]
+	/* *WARNING* RFC 1122 forbids this
+	 * It doesn't AFAIK, because we kill the retransmit timer -AK
+	 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
+	 * this behaviour in Solaris down as a bug fix. [AC]
 	 */
 	if (tp->probes_out > sysctl_tcp_retries2) {
 		if(sk->err_soft)
@@ -226,9 +206,10 @@ void tcp_probe_timer(unsigned long data)
 			/* Clean up time. */
 			tcp_set_state(sk, TCP_CLOSE);
 		}
+	} else {
+		/* Only send another probe if we didn't close things up. */
+		tcp_send_probe0(sk);
 	}
-
-	tcp_send_probe0(sk);
 }
 
 static __inline__ int tcp_keepopen_proc(struct sock *sk)
@@ -375,6 +356,21 @@ void tcp_retransmit_timer(unsigned long data)
 	/* Clear delay ack timer. */
 	tcp_clear_xmit_timer(sk, TIME_DACK);
 
+	/* RFC 2018, clear all 'sacked' flags in retransmission queue,
+	 * the sender may have dropped out of order frames and we must
+	 * send them out should this timer fire on us.
+	 */
+	if(tp->sack_ok) {
+		struct sk_buff *skb = skb_peek(&sk->write_queue);
+
+		while((skb != NULL) &&
+		      (skb != tp->send_head) &&
+		      (skb != (struct sk_buff *)&sk->write_queue)) {
+			TCP_SKB_CB(skb)->sacked = 0;
+			skb = skb->next;
+		}
+	}
+
 	/* Retransmission.
*/ tp->retrans_head = NULL; if (tp->retransmits == 0) { @@ -390,7 +386,7 @@ void tcp_retransmit_timer(unsigned long data) tp->dup_acks = 0; tp->high_seq = tp->snd_nxt; - tcp_do_retransmit(sk, 0); + tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); /* Increase the timeout each time we retransmit. Note that * we do not increase the rtt estimate. rto is initialized @@ -407,7 +403,7 @@ void tcp_retransmit_timer(unsigned long data) * implemented ftp to mars will work nicely. We will have to fix * the 120 second clamps though! */ - tp->backoff++; /* FIXME: always same as retransmits? -- erics */ + tp->backoff++; tp->rto = min(tp->rto << 1, 120*HZ); tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); @@ -523,18 +519,18 @@ void tcp_sltimer_handler(unsigned long data) void __tcp_inc_slow_timer(struct tcp_sl_timer *slt) { unsigned long now = jiffies; - unsigned long next = 0; unsigned long when; slt->last = now; - + when = now + slt->period; - if (del_timer(&tcp_slow_timer)) - next = tcp_slow_timer.expires; - - if (next && ((long)(next - when) < 0)) - when = next; - - tcp_slow_timer.expires = when; - add_timer(&tcp_slow_timer); + + if (tcp_slow_timer.prev) { + if ((long)(tcp_slow_timer.expires - when) >= 0) { + mod_timer(&tcp_slow_timer, when); + } + } else { + tcp_slow_timer.expires = when; + add_timer(&tcp_slow_timer); + } } diff --git a/net/ipv4/timer.c b/net/ipv4/timer.c index 79ae3309e..5c5e5eeb3 100644 --- a/net/ipv4/timer.c +++ b/net/ipv4/timer.c @@ -59,10 +59,8 @@ void net_delete_timer (struct sock *t) void net_reset_timer (struct sock *t, int timeout, unsigned long len) { - net_delete_timer (t); t->timeout = timeout; - t->timer.expires = jiffies+len; - add_timer (&t->timer); + mod_timer(&t->timer, jiffies+len); } /* Now we will only be called whenever we need to do diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6ba50b280..a580b0010 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -315,8 +315,8 @@ struct sock *udp_v4_lookup_longway(u32 saddr, u16 sport, u32 daddr, u16 dport, i continue; score++; } - if(sk->dummy_th.dest) { - if(sk->dummy_th.dest != sport) + if(sk->dport) { + if(sk->dport != sport) continue; score++; } @@ -412,8 +412,8 @@ static struct sock *udp_v4_proxy_lookup(unsigned short num, unsigned long raddr, continue; score++; } - if(s->dummy_th.dest) { - if(s->dummy_th.dest != rnum) + if(s->dport) { + if(s->dport != rnum) continue; score++; } @@ -453,7 +453,7 @@ static inline struct sock *udp_v4_mcast_next(struct sock *sk, if ((s->num != hnum) || (s->dead && (s->state == TCP_CLOSE)) || (s->daddr && s->daddr!=raddr) || - (s->dummy_th.dest != rnum && s->dummy_th.dest != 0) || + (s->dport != rnum && s->dport != 0) || (s->rcv_saddr && s->rcv_saddr != laddr)) continue; break; @@ -644,12 +644,12 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) return -EOPNOTSUPP; #ifdef CONFIG_IP_TRANSPARENT_PROXY - if (msg->msg_flags&~(MSG_DONTROUTE|MSG_DONTWAIT|MSG_PROXY)) + if (msg->msg_flags&~(MSG_DONTROUTE|MSG_DONTWAIT|MSG_PROXY|MSG_NOSIGNAL)) return -EINVAL; if ((msg->msg_flags&MSG_PROXY) && !suser() ) return -EPERM; #else - if (msg->msg_flags&~(MSG_DONTROUTE|MSG_DONTWAIT)) + if (msg->msg_flags&~(MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL)) return -EINVAL; #endif @@ -686,7 +686,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) if (sk->state != TCP_ESTABLISHED) return -EINVAL; ufh.daddr = sk->daddr; - ufh.uh.dest = sk->dummy_th.dest; + ufh.uh.dest = sk->dport; /* BUGGG Khm... And who will validate it? Fixing it fastly... 
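The dummy_th.dest/dummy_th.source to dport/sport conversion above rides on the best-match idiom udp_v4_lookup_longway() already uses: a zero (wildcard) field matches anything, while a set field must match exactly and makes the socket more specific. The self-contained sketch below reproduces that scoring loop; struct usock and its layout are inventions for illustration, not kernel structures.

#include <stdio.h>

struct usock {
	unsigned short num;	/* bound local port (host order) */
	unsigned int daddr;	/* connected peer address, 0 = wildcard */
	unsigned short dport;	/* connected peer port, 0 = wildcard */
	struct usock *next;
};

/* Every non-wildcard field must match exactly and earns one point;
 * the most specific surviving socket wins.
 */
static struct usock *lookup(struct usock *list, unsigned short hnum,
			    unsigned int saddr, unsigned short sport)
{
	struct usock *best = NULL;
	int hiscore = -1;

	for (; list; list = list->next) {
		int score = 0;

		if (list->num != hnum)
			continue;
		if (list->daddr) {
			if (list->daddr != saddr)
				continue;
			score++;
		}
		if (list->dport) {
			if (list->dport != sport)
				continue;
			score++;
		}
		if (score > hiscore) {
			hiscore = score;
			best = list;
		}
	}
	return best;
}

int main(void)
{
	struct usock wild = { 53, 0, 0, NULL };
	struct usock conn = { 53, 0x0a000001, 1234, &wild };

	/* The fully specified socket beats the wildcard one. */
	printf("%s\n", lookup(&conn, 53, 0x0a000001, 1234) == &conn
		       ? "connected socket wins" : "wildcard wins");
	return 0;
}

The tcp_v6_lookup_listener() rewrite later in this patch applies the same scheme, with bound_dev_if as a third scored field.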
@@ -712,7 +712,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) #endif { ipc.addr = sk->saddr; - ufh.uh.source = sk->dummy_th.source; + ufh.uh.source = sk->sport; } ipc.opt = NULL; @@ -971,7 +971,7 @@ int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if(!sk->rcv_saddr) sk->rcv_saddr = rt->rt_src; sk->daddr = rt->rt_dst; - sk->dummy_th.dest = usin->sin_port; + sk->dport = usin->sin_port; sk->state = TCP_ESTABLISHED; if(uh_cache_sk == sk) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4a4060601..0241e0459 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: addrconf.c,v 1.37 1998/03/08 20:52:46 davem Exp $ + * $Id: addrconf.c,v 1.38 1998/03/20 09:12:14 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -298,10 +298,9 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) struct inet6_ifaddr *iter, **back; int hash; - ipv6_ifa_notify(RTM_DELADDR, ifp); - if (atomic_read(&addr_list_lock)) { ifp->flags |= ADDR_INVALID; + ipv6_ifa_notify(RTM_DELADDR, ifp); return; } @@ -330,6 +329,8 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) } back = &(iter->if_next); } + + ipv6_ifa_notify(RTM_DELADDR, ifp); kfree(ifp); } @@ -543,7 +544,7 @@ static int ipv6_generate_eui64(u8 *eui, struct device *dev) static void addrconf_prefix_route(struct in6_addr *pfx, int plen, struct device *dev, - unsigned long info) + unsigned long expires, unsigned flags) { struct in6_rtmsg rtmsg; int err; @@ -553,8 +554,8 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct device *dev, rtmsg.rtmsg_dst_len = plen; rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; rtmsg.rtmsg_ifindex = dev->ifindex; - rtmsg.rtmsg_info = info; - rtmsg.rtmsg_flags = RTF_UP|RTF_ADDRCONF; + rtmsg.rtmsg_info = expires; + rtmsg.rtmsg_flags = RTF_UP|flags; rtmsg.rtmsg_type = RTMSG_NEWROUTE; /* Prevent useless cloning on PtP SIT. 
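The reworked addrconf_prefix_route() above takes an expiry time and explicit route flags instead of a single info word, so callers can distinguish a permanent route (plain RTF_ADDRCONF, as addrconf_add_lroute() now passes) from one that ages out (RTF_ADDRCONF|RTF_EXPIRES for router-advertised prefixes). The sketch below models the expiry rule this enables, mirroring the RTF_EXPIRES tests added to ip6_fib.c later in this patch; the flag values and struct are stand-ins, not the kernel's.

#include <stdio.h>

#define RTF_ADDRCONF	0x1	/* illustrative values only */
#define RTF_EXPIRES	0x2

struct rt6_sketch {
	unsigned int flags;
	unsigned long expires;	/* only meaningful with RTF_EXPIRES */
};

/* A route may only age out if it was installed with RTF_EXPIRES;
 * permanent addrconf routes are immune to lifetime-based GC.
 */
static int route_expired(const struct rt6_sketch *rt, unsigned long now)
{
	return (rt->flags & RTF_EXPIRES) && rt->expires &&
	       (long)(now - rt->expires) > 0;
}

int main(void)
{
	/* Permanent link-local route vs. an RA prefix with a lifetime. */
	struct rt6_sketch lroute = { RTF_ADDRCONF, 0 };
	struct rt6_sketch proute = { RTF_ADDRCONF | RTF_EXPIRES, 1000 };

	printf("link-local expired: %d\n", route_expired(&lroute, 2000)); /* 0 */
	printf("prefix route expired: %d\n", route_expired(&proute, 2000)); /* 1 */
	return 0;
}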
@@ -608,7 +609,7 @@ static void addrconf_add_lroute(struct device *dev) struct in6_addr addr; ipv6_addr_set(&addr, __constant_htonl(0xFE800000), 0, 0, 0); - addrconf_prefix_route(&addr, 10, dev, 0); + addrconf_prefix_route(&addr, 10, dev, 0, RTF_ADDRCONF); } static struct inet6_dev *addrconf_add_dev(struct device *dev) @@ -688,18 +689,20 @@ void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len) else rt_expires = jiffies + valid_lft * HZ; - rt = rt6_lookup(&pinfo->prefix, NULL, dev, RTF_LINKRT); + rt = rt6_lookup(&pinfo->prefix, NULL, dev->ifindex, RTF_LINKRT); if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) { - if (pinfo->onlink == 0 || valid_lft == 0) { - ip6_del_rt(rt); - rt = NULL; - } else { - rt->rt6i_expires = rt_expires; + if (rt->rt6i_flags&RTF_EXPIRES) { + if (pinfo->onlink == 0 || valid_lft == 0) { + ip6_del_rt(rt); + rt = NULL; + } else { + rt->rt6i_expires = rt_expires; + } } } else if (pinfo->onlink && valid_lft) { addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, - dev, rt_expires); + dev, rt_expires, RTF_ADDRCONF|RTF_EXPIRES); } /* Try to figure out our local address for this prefix */ @@ -1265,8 +1268,8 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp) addrconf_join_solict(dev, &ifp->addr); - if (ifp->prefix_len != 128) - addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, 0); + if (ifp->prefix_len != 128 && (ifp->flags&ADDR_PERMANENT)) + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, 0, RTF_ADDRCONF); if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { start_bh_atomic(); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index bc5ba892a..6a24bea8b 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -7,7 +7,7 @@ * * Adapted from linux/net/ipv4/af_inet.c * - * $Id: af_inet6.c,v 1.28 1998/03/08 05:56:49 davem Exp $ + * $Id: af_inet6.c,v 1.29 1998/03/18 07:52:11 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -75,7 +75,6 @@ static int inet6_create(struct socket *sock, int protocol) if (sk == NULL) goto do_oom; - /* Note for tcp that also wiped the dummy_th block for us. */ if(sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET) { if (protocol && protocol != IPPROTO_TCP) goto free_and_noproto; @@ -138,7 +137,7 @@ static int inet6_create(struct socket *sock, int protocol) * the user to assign a number at socket * creation time automatically shares. 
*/ - sk->dummy_th.source = ntohs(sk->num); + sk->sport = ntohs(sk->num); sk->prot->hash(sk); add_to_prot_sklist(sk); } @@ -229,8 +228,8 @@ static int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) return -EADDRINUSE; sk->num = snum; - sk->dummy_th.source = ntohs(sk->num); - sk->dummy_th.dest = 0; + sk->sport = ntohs(sk->num); + sk->dport = 0; sk->daddr = 0; sk->prot->rehash(sk); add_to_prot_sklist(sk); @@ -259,7 +258,7 @@ static int inet6_getname(struct socket *sock, struct sockaddr *uaddr, if (peer) { if (!tcp_connected(sk->state)) return(-ENOTCONN); - sin->sin6_port = sk->dummy_th.dest; + sin->sin6_port = sk->dport; memcpy(&sin->sin6_addr, &sk->net_pinfo.af_inet6.daddr, sizeof(struct in6_addr)); } else { @@ -272,7 +271,7 @@ static int inet6_getname(struct socket *sock, struct sockaddr *uaddr, &sk->net_pinfo.af_inet6.rcv_saddr, sizeof(struct in6_addr)); - sin->sin6_port = sk->dummy_th.source; + sin->sin6_port = sk->sport; } *uaddr_len = sizeof(*sin); return(0); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 875e0f2ed..b87f31b06 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: datagram.c,v 1.13 1997/12/13 21:53:09 kuznet Exp $ + * $Id: datagram.c,v 1.14 1998/03/20 09:12:15 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -55,7 +55,7 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) return 0; } -int datagram_send_ctl(struct msghdr *msg, struct device **src_dev, +int datagram_send_ctl(struct msghdr *msg, int *oif, struct in6_addr **src_addr, struct ipv6_options *opt, int *hlimit) { @@ -81,15 +81,15 @@ int datagram_send_ctl(struct msghdr *msg, struct device **src_dev, src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); if (src_info->ipi6_ifindex) { - int index = src_info->ipi6_ifindex; - - *src_dev = dev_get_by_index(index); + if (*oif && src_info->ipi6_ifindex != *oif) + return -EINVAL; + *oif = src_info->ipi6_ifindex; } - + if (!ipv6_addr_any(&src_info->ipi6_addr)) { struct inet6_ifaddr *ifp; - ifp = ipv6_chk_addr(&src_info->ipi6_addr, *src_dev, 0); + ifp = ipv6_chk_addr(&src_info->ipi6_addr, NULL, 0); if (ifp == NULL) { err = -EINVAL; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 96867403b..f181aec52 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: icmp.c,v 1.13 1998/02/12 07:43:41 davem Exp $ + * $Id: icmp.c,v 1.15 1998/03/21 07:28:03 davem Exp $ * * Based on net/ipv4/icmp.c * @@ -153,7 +153,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, struct ipv6hdr *hdr = skb->nh.ipv6h; struct sock *sk = icmpv6_socket->sk; struct in6_addr *saddr = NULL; - struct device *src_dev = NULL; + int iif = 0; struct icmpv6_msg msg; struct flowi fl; int addr_type = 0; @@ -203,7 +203,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, */ if (addr_type & IPV6_ADDR_LINKLOCAL) - src_dev = skb->dev; + iif = skb->dev->ifindex; /* * Must not send if we know that source is Anycast also. @@ -251,12 +251,17 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, fl.proto = IPPROTO_ICMPV6; fl.nl_u.ip6_u.daddr = &hdr->saddr; fl.nl_u.ip6_u.saddr = saddr; - fl.dev = src_dev; + fl.oif = iif; fl.uli_u.icmpt.type = type; fl.uli_u.icmpt.code = code; ip6_build_xmit(sk, icmpv6_getfrag, &msg, &fl, len, NULL, -1, MSG_DONTWAIT); + + /* Oops! 
We must purge cached dst, otherwise + all the following ICMP messages will go there :) --ANK + */ + dst_release(xchg(&sk->dst_cache, NULL)); } static void icmpv6_echo_reply(struct sk_buff *skb) @@ -294,12 +299,17 @@ static void icmpv6_echo_reply(struct sk_buff *skb) fl.proto = IPPROTO_ICMPV6; fl.nl_u.ip6_u.daddr = &hdr->saddr; fl.nl_u.ip6_u.saddr = saddr; - fl.dev = skb->dev; + fl.oif = skb->dev->ifindex; fl.uli_u.icmpt.type = ICMPV6_ECHO_REPLY; fl.uli_u.icmpt.code = 0; ip6_build_xmit(sk, icmpv6_getfrag, &msg, &fl, len, NULL, -1, MSG_DONTWAIT); + + /* Oops! We must purge cached dst, otherwise + all the following ICMP messages will go there :) --ANK + */ + dst_release(xchg(&sk->dst_cache, NULL)); } static __inline__ int ipv6_ext_hdr(u8 nexthdr) @@ -317,7 +327,8 @@ static __inline__ int ipv6_ext_hdr(u8 nexthdr) } -static void icmpv6_notify(int type, int code, unsigned char *buff, int len, +static void icmpv6_notify(struct sk_buff *skb, + int type, int code, unsigned char *buff, int len, struct in6_addr *saddr, struct in6_addr *daddr, struct inet6_protocol *protocol) { @@ -367,7 +378,7 @@ static void icmpv6_notify(int type, int code, unsigned char *buff, int len, continue; if (ipprot->err_handler) - ipprot->err_handler(type, code, pbuff, info, + ipprot->err_handler(skb, type, code, pbuff, info, saddr, daddr, ipprot); return; } @@ -457,7 +468,7 @@ int icmpv6_rcv(struct sk_buff *skb, struct device *dev, case ICMPV6_TIME_EXCEED: case ICMPV6_PARAMPROB: - icmpv6_notify(hdr->icmp6_type, hdr->icmp6_code, + icmpv6_notify(skb, hdr->icmp6_type, hdr->icmp6_code, (char *) (hdr + 1), ulen, saddr, daddr, protocol); break; @@ -493,7 +504,7 @@ int icmpv6_rcv(struct sk_buff *skb, struct device *dev, * must pass to upper level */ - icmpv6_notify(hdr->icmp6_type, hdr->icmp6_code, + icmpv6_notify(skb, hdr->icmp6_type, hdr->icmp6_code, (char *) (hdr + 1), ulen, saddr, daddr, protocol); }; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 9fce1acca..735ceeb5f 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: ip6_fib.c,v 1.11 1998/03/08 05:56:50 davem Exp $ + * $Id: ip6_fib.c,v 1.12 1998/03/20 09:12:16 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -418,9 +418,13 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt) (iter->rt6i_flowr == rt->rt6i_flowr) && (ipv6_addr_cmp(&iter->rt6i_gateway, &rt->rt6i_gateway) == 0)) { - if (rt->rt6i_expires == 0 || - (long)(rt->rt6i_expires - iter->rt6i_expires) > 0) - rt->rt6i_expires = iter->rt6i_expires; + if (!(iter->rt6i_flags&RTF_EXPIRES)) + return -EEXIST; + iter->rt6i_expires = rt->rt6i_expires; + if (!(rt->rt6i_flags&RTF_EXPIRES)) { + iter->rt6i_flags &= ~RTF_EXPIRES; + iter->rt6i_expires = rt->rt6i_expires; + } return -EEXIST; } } @@ -931,7 +935,8 @@ static int fib6_gc_node(struct fib6_node *fn, int timeout) * Seems, radix tree walking is absolutely broken, * but we will try in any case --ANK */ - if (rt->rt6i_expires && (long)(now - rt->rt6i_expires) < 0) { + if ((rt->rt6i_flags&RTF_EXPIRES) && rt->rt6i_expires + && (long)(now - rt->rt6i_expires) > 0) { struct rt6_info *old; old = rt; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 13029e175..0f1c710d3 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: ip6_output.c,v 1.9 1998/03/08 05:56:50 davem Exp $ + * $Id: 
ip6_output.c,v 1.10 1998/03/20 09:12:17 davem Exp $ * * Based on linux/net/ipv4/ip_output.c * @@ -82,64 +82,43 @@ int ip6_output(struct sk_buff *skb) /* * xmit an sk_buff (used by TCP) - * sk can be NULL (for sending RESETs) */ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct ipv6_options *opt) { - struct ipv6_pinfo *np = NULL; - struct dst_entry *dst = NULL; + struct ipv6_pinfo * np = sk ? &sk->net_pinfo.af_inet6 : NULL; + struct dst_entry *dst = skb->dst; struct ipv6hdr *hdr; int seg_len; + int hlimit; - hdr = skb->nh.ipv6h; - - if (sk) { - np = &sk->net_pinfo.af_inet6; - - if (sk->dst_cache) { - /* - * dst_check returns NULL if route is no longer valid - */ - dst = dst_check(&sk->dst_cache, np->dst_cookie); - } - } - - if (dst == NULL) { - dst = ip6_route_output(sk, fl); + /* Do something with IPv6 options headers here. */ - if (dst->error) { - /* - * NETUNREACH usually - */ - dst_release(dst); - return dst->error; - } - } + seg_len = skb->len; - skb->dst = dst_clone(dst); - seg_len = skb->tail - ((unsigned char *) hdr); - hdr = skb->nh.ipv6h; + hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr)); /* * Fill in the IPv6 header */ hdr->version = 6; - hdr->priority = np ? np->priority : 0; - - if (np) + if (np) { + hdr->priority = np->priority; memcpy(hdr->flow_lbl, (void *) &np->flow_lbl, 3); - else + hlimit = np->hop_limit; + } else { + hdr->priority = 0; memset(hdr->flow_lbl, 0, 3); + hlimit = -1; + } + if (hlimit < 0) + hlimit = ((struct rt6_info*)dst)->rt6i_hoplimit; - hdr->payload_len = htons(seg_len - sizeof(struct ipv6hdr)); + hdr->payload_len = htons(seg_len); hdr->nexthdr = fl->proto; - if (np == NULL || np->hop_limit < 0) - hdr->hop_limit = ((struct rt6_info*)dst)->rt6i_hoplimit; - else - hdr->hop_limit = np->hop_limit; + hdr->hop_limit = hlimit; ipv6_addr_copy(&hdr->saddr, fl->nl_u.ip6_u.saddr); ipv6_addr_copy(&hdr->daddr, fl->nl_u.ip6_u.daddr); @@ -147,12 +126,6 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, ipv6_statistics.Ip6OutRequests++; dst->output(skb); - if (sk) { - if (sk->dst_cache == NULL) - ip6_dst_store(sk, dst); - } else - dst_release(dst); - return 0; } @@ -412,6 +385,9 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, } dst = NULL; + + if (!fl->oif && ipv6_addr_is_multicast(fl->nl_u.ip6_u.daddr)) + fl->oif = np->mcast_oif; if (sk->dst_cache) dst = dst_check(&sk->dst_cache, np->dst_cookie); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index c6714eea3..9bb2d4d3c 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -7,7 +7,7 @@ * * Based on linux/net/ipv4/ip_sockglue.c * - * $Id: ipv6_sockglue.c,v 1.17 1998/03/08 05:56:51 davem Exp $ + * $Id: ipv6_sockglue.c,v 1.18 1998/03/20 09:12:18 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -157,15 +157,13 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, case IPV6_MULTICAST_IF: { + int oif = 0; struct in6_addr addr; - err = copy_from_user(&addr, optval, sizeof(struct in6_addr)); - if(err) + if (copy_from_user(&addr, optval, sizeof(struct in6_addr))) return -EFAULT; - if (ipv6_addr_any(&addr)) { - np->oif = NULL; - } else { + if (!ipv6_addr_any(&addr)) { struct inet6_ifaddr *ifp; ifp = ipv6_chk_addr(&addr, NULL, 0); @@ -175,8 +173,13 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, break; } - np->oif = ifp->idev->dev; + oif = 
ifp->idev->dev->ifindex; + } + if (sk->bound_dev_if && sk->bound_dev_if != oif) { + retv = -EINVAL; + break; } + np->mcast_oif = oif; retv = 0; break; } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 3f881673c..407698eb1 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: mcast.c,v 1.13 1998/01/04 15:28:31 mj Exp $ + * $Id: mcast.c,v 1.14 1998/03/20 09:12:18 davem Exp $ * * Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c * @@ -91,7 +91,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr) if (ifindex == 0) { struct rt6_info *rt; - rt = rt6_lookup(addr, NULL, NULL, 0); + rt = rt6_lookup(addr, NULL, 0, 0); if (rt) dev = rt->rt6i_dev; } else diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index ce37117a3..2e437f2de 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -774,7 +774,7 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, int hlen; dev = skb->dev; - rt = rt6_lookup(&skb->nh.ipv6h->saddr, NULL, dev, 0); + rt = rt6_lookup(&skb->nh.ipv6h->saddr, NULL, dev->ifindex, 0); if (rt == NULL || rt->u.dst.error) { ND_PRINTK1("ndisc_send_redirect: hostunreach\n"); diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index b87d4696b..c010b0964 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -7,7 +7,7 @@ * PROC file system. This is very similar to the IPv4 version, * except it reports the sockets in the INET6 address family. * - * Version: $Id: proc.c,v 1.6 1998/03/13 08:02:19 davem Exp $ + * Version: $Id: proc.c,v 1.7 1998/03/18 07:52:13 davem Exp $ * * Authors: David S. Miller (davem@caip.rutgers.edu) * @@ -68,8 +68,8 @@ static int get__netinfo6(struct proto *pro, char *buffer, int format, char **sta dest = &sp->net_pinfo.af_inet6.daddr; src = &sp->net_pinfo.af_inet6.rcv_saddr; } - destp = ntohs(sp->dummy_th.dest); - srcp = ntohs(sp->dummy_th.source); + destp = ntohs(sp->dport); + srcp = ntohs(sp->sport); if((format == 0) && (sp->state == TCP_TIME_WAIT)) { timer_active1 = timer_active2 = 0; timer_active = 3; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 5b182b7ef..7429a9210 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -7,7 +7,7 @@ * * Adapted from linux/net/ipv4/raw.c * - * $Id: raw.c,v 1.18 1998/03/08 05:56:54 davem Exp $ + * $Id: raw.c,v 1.19 1998/03/20 09:12:20 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -349,7 +349,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len) struct sockaddr_in6 * sin6 = (struct sockaddr_in6 *) msg->msg_name; struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; struct ipv6_options *opt = NULL; - struct device *dev = NULL; struct in6_addr *saddr = NULL; struct flowi fl; int addr_len = msg->msg_namelen; @@ -419,15 +418,15 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len) return(-EINVAL); } + fl.oif = sk->bound_dev_if; + if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_options)); - err = datagram_send_ctl(msg, &dev, &saddr, opt, &hlimit); - if (err < 0) { - printk(KERN_DEBUG "invalid msg_control\n"); + err = datagram_send_ctl(msg, &fl.oif, &saddr, opt, &hlimit); + if (err < 0) return err; - } } raw_opt = &sk->tp_pinfo.tp_raw; @@ -435,7 +434,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len) fl.proto = proto; fl.nl_u.ip6_u.daddr = daddr; fl.nl_u.ip6_u.saddr = saddr; - fl.dev = dev; fl.uli_u.icmpt.type = 0; fl.uli_u.icmpt.code = 
0; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 5188de864..3015d254b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: route.c,v 1.25 1998/03/15 03:31:47 davem Exp $ + * $Id: route.c,v 1.27 1998/03/21 07:28:04 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -36,6 +36,7 @@ #include <net/ip6_route.h> #include <net/ndisc.h> #include <net/addrconf.h> +#include <net/tcp.h> #include <linux/netlink.h> #include <linux/rtnetlink.h> @@ -60,6 +61,7 @@ int ip6_rt_max_size = 4096; int ip6_rt_gc_min_interval = 5*HZ; int ip6_rt_gc_timeout = 60*HZ; int ip6_rt_gc_interval = 30*HZ; +int ip6_rt_gc_elasticity = 9; static struct rt6_info * ip6_rt_copy(struct rt6_info *ort); static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); @@ -205,21 +207,20 @@ static __inline__ void rt6_unlock(void) */ static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt, - struct device *dev, + int oif, int strict) { struct rt6_info *local = NULL; struct rt6_info *sprt; - RDBG(("rt6_device_match: (%p,%p,%d) ", rt, dev, strict)); - if (dev) { + if (oif) { for (sprt = rt; sprt; sprt = sprt->u.next) { - if (sprt->rt6i_dev == dev) { - RDBG(("match --> %p\n", sprt)); - return sprt; + if (sprt->rt6i_dev) { + if (sprt->rt6i_dev->ifindex == oif) + return sprt; + if (sprt->rt6i_dev->flags&IFF_LOOPBACK) + local = sprt; } - if (sprt->rt6i_dev && (sprt->rt6i_dev->flags&IFF_LOOPBACK)) - local = sprt; } if (local) @@ -239,13 +240,12 @@ static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt, */ static struct rt6_info *rt6_dflt_pointer = NULL; -static struct rt6_info *rt6_best_dflt(struct rt6_info *rt, struct device *dev) +static struct rt6_info *rt6_best_dflt(struct rt6_info *rt, int oif) { struct rt6_info *match = NULL; struct rt6_info *sprt; int mpri = 0; - RDBG(("rt6_best_dflt(%p,%p): ", rt, dev)); for (sprt = rt; sprt; sprt = sprt->u.next) { struct neighbour *neigh; @@ -278,8 +278,7 @@ static struct rt6_info *rt6_best_dflt(struct rt6_info *rt, struct device *dev) break; }; - if (dev && sprt->rt6i_dev == dev) { - RDBG(("dev&&sprt->rt6i_dev==dev(%p), m+=2, ", dev)); + if (oif && sprt->rt6i_dev && sprt->rt6i_dev->ifindex == oif) { m += 2; } @@ -316,17 +315,14 @@ out: } struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr, - struct device *dev, int flags) + int oif, int flags) { struct fib6_node *fn; struct rt6_info *rt; - RDBG(("rt6_lookup(%p,%p,%p,%x) from %p\n", - daddr, saddr, dev, flags, __builtin_return_address(0))); rt6_lock(); fn = fib6_lookup(&ip6_routing_table, daddr, saddr); - - rt = rt6_device_match(fn->leaf, dev, flags&RTF_LINKRT); + rt = rt6_device_match(fn->leaf, oif, flags&RTF_LINKRT); rt6_unlock(); return rt; } @@ -414,7 +410,7 @@ void ip6_route_input(struct sk_buff *skb) if ((rt->rt6i_flags & RTF_CACHE)) { if (ip6_rt_policy == 0) { - rt = rt6_device_match(rt, skb->dev, 0); + rt = rt6_device_match(rt, skb->dev->ifindex, 0); goto out; } @@ -432,7 +428,7 @@ void ip6_route_input(struct sk_buff *skb) #endif } - rt = rt6_device_match(rt, skb->dev, 0); + rt = rt6_device_match(rt, skb->dev->ifindex, 0); if (ip6_rt_policy == 0) { if (!rt->rt6i_nexthop && rt->rt6i_dev && @@ -462,44 +458,19 @@ struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl) struct dst_entry *dst; int strict; - RDBG(("ip6_route_output(%p,%p) from(%p)", sk, fl, - __builtin_return_address(0))); strict = 
ipv6_addr_type(fl->nl_u.ip6_u.daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL); rt6_lock(); -#if RT6_DEBUG >= 3 - RDBG(("lkup(")); - if(fl->nl_u.ip6_u.daddr) { - struct in6_addr *addr = fl->nl_u.ip6_u.daddr; - int i; - RDBG(("daddr[")); - for(i = 0; i < 8; i++) { - RDBG(("%04x%c", addr->s6_addr16[i], - i == 7 ? ']' : ':')); - } - } - if(fl->nl_u.ip6_u.saddr) { - struct in6_addr *addr = fl->nl_u.ip6_u.saddr; - int i; - RDBG(("saddr[")); - for(i = 0; i < 8; i++) { - RDBG(("%04x%c", addr->s6_addr16[i], - i == 7 ? ']' : ':')); - } - } -#endif fn = fib6_lookup(&ip6_routing_table, fl->nl_u.ip6_u.daddr, fl->nl_u.ip6_u.saddr); - RDBG(("-->(%p[%s])) ", fn, fn == &ip6_routing_table ? "ROOT" : "!ROOT")); - restart: rt = fn->leaf; if ((rt->rt6i_flags & RTF_CACHE)) { RDBG(("RTF_CACHE ")); if (ip6_rt_policy == 0) { - rt = rt6_device_match(rt, fl->dev, strict); + rt = rt6_device_match(rt, fl->oif, strict); /* BUGGGG! It is capital bug, that was hidden by not-cloning multicast routes. However, @@ -536,11 +507,11 @@ restart: if (rt->rt6i_flags & RTF_DEFAULT) { RDBG(("RTF_DEFAULT ")); if (rt->rt6i_metric >= IP6_RT_PRIO_ADDRCONF) { - rt = rt6_best_dflt(rt, fl->dev); + rt = rt6_best_dflt(rt, fl->oif); RDBG(("best_dflt(%p) ", rt)); } } else { - rt = rt6_device_match(rt, fl->dev, strict); + rt = rt6_device_match(rt, fl->oif, strict); RDBG(("!RTF_DEFAULT devmatch(%p) ", rt)); } @@ -638,10 +609,10 @@ static int ip6_dst_gc() fib6_run_gc(expire); last_gc = now; if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh) - expire = ip6_rt_gc_timeout; + expire = ip6_rt_gc_timeout>>1; out: - expire >>= 1; + expire -= expire>>ip6_rt_gc_elasticity; end_bh_atomic(); return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size); } @@ -780,7 +751,7 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) goto out; } - grt = rt6_lookup(gw_addr, NULL, dev, RTF_LINKRT); + grt = rt6_lookup(gw_addr, NULL, dev->ifindex, RTF_LINKRT); if (grt == NULL || (grt->rt6i_flags&RTF_GATEWAY)) { *err = -EHOSTUNREACH; @@ -814,6 +785,7 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) rt->rt6i_dev = dev; rt->u.dst.pmtu = ipv6_get_mtu(dev); + rt->u.dst.rtt = TCP_TIMEOUT_INIT; if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) rt->rt6i_hoplimit = IPV6_DEFAULT_MCASTHOPS; else @@ -1078,7 +1050,7 @@ struct rt6_info *rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr, struct rt6_info *rt, *nrt; /* Locate old route to this destination. 
*/ - rt = rt6_lookup(dest, NULL, dev, 0); + rt = rt6_lookup(dest, NULL, dev->ifindex, 0); if (rt == NULL || rt->u.dst.error) return NULL; @@ -1200,7 +1172,7 @@ void rt6_pmtu_discovery(struct in6_addr *addr, struct device *dev, int pmtu) return; } - rt = rt6_lookup(addr, NULL, dev, 0); + rt = rt6_lookup(addr, NULL, dev->ifindex, 0); if (rt == NULL || rt->u.dst.error) { #if RT6_DEBUG >= 2 @@ -1268,6 +1240,9 @@ static struct rt6_info * ip6_rt_copy(struct rt6_info *ort) rt->u.dst.output = ort->u.dst.output; rt->u.dst.pmtu = ort->u.dst.pmtu; + rt->u.dst.rtt = ort->u.dst.rtt; + rt->u.dst.window = ort->u.dst.window; + rt->u.dst.mxlock = ort->u.dst.mxlock; rt->rt6i_hoplimit = ort->rt6i_hoplimit; rt->rt6i_dev = ort->rt6i_dev; @@ -1472,6 +1447,7 @@ int ip6_rt_addr_add(struct in6_addr *addr, struct device *dev) rt->u.dst.input = ip6_input; rt->u.dst.output = ip6_output; rt->rt6i_dev = dev_get("lo"); + rt->u.dst.rtt = TCP_TIMEOUT_INIT; rt->u.dst.pmtu = ipv6_get_mtu(rt->rt6i_dev); rt->rt6i_hoplimit = ipv6_get_hoplimit(rt->rt6i_dev); rt->u.dst.obsolete = -1; @@ -1501,7 +1477,7 @@ int ip6_rt_addr_del(struct in6_addr *addr, struct device *dev) { struct rt6_info *rt; - rt = rt6_lookup(addr, NULL, dev_get("lo"), RTF_LINKRT); + rt = rt6_lookup(addr, NULL, loopback_dev.ifindex, RTF_LINKRT); if (rt && rt->rt6i_dst.plen == 128) return ip6_del_rt(rt); @@ -1811,6 +1787,8 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, #else mx = (struct rtattr*)skb->tail; RTA_PUT(skb, RTA_METRICS, 0, NULL); + if (rt->u.dst.mxlock) + RTA_PUT(skb, RTAX_LOCK, sizeof(unsigned), &rt->u.dst.mxlock); if (rt->u.dst.pmtu) RTA_PUT(skb, RTAX_MTU, sizeof(unsigned), &rt->u.dst.pmtu); if (rt->u.dst.window) @@ -2158,6 +2136,9 @@ ctl_table ipv6_route_table[] = { {NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval", &ip6_rt_gc_interval, sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity", + &ip6_rt_gc_elasticity, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, {0} }; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 1d082c195..4f176cd60 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: tcp_ipv6.c,v 1.60 1998/03/15 02:59:32 davem Exp $ + * $Id: tcp_ipv6.c,v 1.68 1998/03/22 19:14:50 davem Exp $ * * Based on: * linux/net/ipv4/tcp.c @@ -42,8 +42,6 @@ #include <asm/uaccess.h> -#define ICMP_PARANOIA - extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; @@ -52,7 +50,6 @@ static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, struct sk_buff *skb); static int tcp_v6_backlog_rcv(struct sock *sk, struct sk_buff *skb); -static int tcp_v6_build_header(struct sock *sk, struct sk_buff *skb); static void tcp_v6_xmit(struct sk_buff *skb); static struct open_request *tcp_v6_search_req(struct tcp_opt *tp, struct ipv6hdr *ip6h, @@ -79,7 +76,7 @@ static __inline__ int tcp_v6_sk_hashfn(struct sock *sk) struct in6_addr *laddr = &sk->net_pinfo.af_inet6.rcv_saddr; struct in6_addr *faddr = &sk->net_pinfo.af_inet6.daddr; __u16 lport = sk->num; - __u16 fport = sk->dummy_th.dest; + __u16 fport = sk->dport; return tcp_v6_hashfn(laddr, lport, faddr, fport); } @@ -113,12 +110,14 @@ static int tcp_v6_verify_bind(struct sock *sk, unsigned short snum) /* We must walk the whole port owner list in this case. 
-DaveM */ for(sk2 = tb->owners; sk2; sk2 = sk2->bind_next) { - if(!sk_reuse || !sk2->reuse || sk2->state == TCP_LISTEN) { - if(addr_type == IPV6_ADDR_ANY || - !sk2->rcv_saddr || - !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, - &sk2->net_pinfo.af_inet6.rcv_saddr)) - break; + if(sk->bound_dev_if == sk2->bound_dev_if) { + if(!sk_reuse || !sk2->reuse || sk2->state == TCP_LISTEN) { + if(addr_type == IPV6_ADDR_ANY || + !sk2->rcv_saddr || + !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, + &sk2->net_pinfo.af_inet6.rcv_saddr)) + break; + } } } if(sk2 != NULL) @@ -195,20 +194,35 @@ static void tcp_v6_rehash(struct sock *sk) SOCKHASH_UNLOCK(); } -static struct sock *tcp_v6_lookup_listener(struct in6_addr *daddr, unsigned short hnum) +static struct sock *tcp_v6_lookup_listener(struct in6_addr *daddr, unsigned short hnum, int dif) { struct sock *sk; struct sock *result = NULL; + int score, hiscore; + hiscore=0; sk = tcp_listening_hash[tcp_lhashfn(hnum)]; for(; sk; sk = sk->next) { if((sk->num == hnum) && (sk->family == AF_INET6)) { struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; + + score = 1; if(!ipv6_addr_any(&np->rcv_saddr)) { - if(!ipv6_addr_cmp(&np->rcv_saddr, daddr)) - return sk; /* Best possible match. */ - } else if(!result) + if(ipv6_addr_cmp(&np->rcv_saddr, daddr)) + continue; + score++; + } + if (sk->bound_dev_if) { + if (sk->bound_dev_if != dif) + continue; + score++; + } + if (score == 3) + return sk; + if (score > hiscore) { + hiscore = score; result = sk; + } } } return result; @@ -223,7 +237,8 @@ static struct sock *tcp_v6_lookup_listener(struct in6_addr *daddr, unsigned shor */ static inline struct sock *__tcp_v6_lookup(struct tcphdr *th, struct in6_addr *saddr, u16 sport, - struct in6_addr *daddr, u16 dport) + struct in6_addr *daddr, u16 dport, + int dif) { unsigned short hnum = ntohs(dport); struct sock *sk; @@ -240,9 +255,10 @@ static inline struct sock *__tcp_v6_lookup(struct tcphdr *th, if(sk && sk->num == hnum && /* local port */ sk->family == AF_INET6 && /* address family */ - sk->dummy_th.dest == sport && /* remote port */ + sk->dport == sport && /* remote port */ !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.daddr, saddr) && - !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, daddr)) + !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, daddr) && + (!sk->bound_dev_if || sk->bound_dev_if == dif)) goto hit; /* Optimize here for direct hit, only listening connections can @@ -253,9 +269,10 @@ static inline struct sock *__tcp_v6_lookup(struct tcphdr *th, /* For IPV6 do the cheaper port and family tests first. */ if(sk->num == hnum && /* local port */ sk->family == AF_INET6 && /* address family */ - sk->dummy_th.dest == sport && /* remote port */ + sk->dport == sport && /* remote port */ !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.daddr, saddr) && - !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, daddr)) { + !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, daddr) && + (!sk->bound_dev_if || sk->bound_dev_if == dif)) { if (sk->state == TCP_ESTABLISHED) TCP_RHASH(sport) = sk; goto hit; /* You sunk my battleship! 
*/ @@ -265,21 +282,22 @@ static inline struct sock *__tcp_v6_lookup(struct tcphdr *th, for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next) if(sk->num == hnum && /* local port */ sk->family == AF_INET6 && /* address family */ - sk->dummy_th.dest == sport) { /* remote port */ + sk->dport == sport) { /* remote port */ struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; if(!ipv6_addr_cmp(&tw->v6_daddr, saddr) && - !ipv6_addr_cmp(&tw->v6_rcv_saddr, daddr)) + !ipv6_addr_cmp(&tw->v6_rcv_saddr, daddr) && + (!sk->bound_dev_if || sk->bound_dev_if == dif)) goto hit; } #ifdef USE_QUICKSYNS listener_shortcut: #endif - sk = tcp_v6_lookup_listener(daddr, hnum); + sk = tcp_v6_lookup_listener(daddr, hnum, dif); hit: return sk; } -#define tcp_v6_lookup(sa, sp, da, dp) __tcp_v6_lookup((0),(sa),(sp),(da),(dp)) +#define tcp_v6_lookup(sa, sp, da, dp, dif) __tcp_v6_lookup((0),(sa),(sp),(da),(dp),(dif)) static __inline__ u16 tcp_v6_check(struct tcphdr *th, int len, struct in6_addr *saddr, @@ -323,8 +341,9 @@ static int tcp_v6_unique_address(struct sock *sk) * use passive ftp, I just cover this case for completeness) */ sk = __tcp_v6_lookup(NULL, &sk->net_pinfo.af_inet6.daddr, - sk->dummy_th.dest, - &sk->net_pinfo.af_inet6.rcv_saddr, snum); + sk->dport, + &sk->net_pinfo.af_inet6.rcv_saddr, snum, + sk->bound_dev_if); if((sk != NULL) && (sk->state != TCP_LISTEN)) retval = 0; break; @@ -344,11 +363,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct in6_addr *saddr = NULL; struct flowi fl; struct dst_entry *dst; - struct tcphdr *th; struct sk_buff *buff; - struct sk_buff *skb1; - int tmp; int addr_type; + int mss; if (sk->state != TCP_CLOSE) return(-EISCONN); @@ -383,7 +400,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, */ if (ipv6_addr_cmp(&usin->sin6_addr, &np->saddr) == 0 && - usin->sin6_port == sk->dummy_th.source) + usin->sin6_port == sk->sport) return (-EINVAL); memcpy(&np->daddr, &usin->sin6_addr, sizeof(struct in6_addr)); @@ -421,9 +438,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl.proto = IPPROTO_TCP; fl.nl_u.ip6_u.daddr = &np->daddr; fl.nl_u.ip6_u.saddr = saddr; - fl.dev = NULL; + fl.oif = sk->bound_dev_if; fl.uli_u.ports.dport = usin->sin6_port; - fl.uli_u.ports.sport = sk->dummy_th.source; + fl.uli_u.ports.sport = sk->sport; dst = ip6_route_output(sk, &fl); @@ -431,11 +448,23 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, dst_release(dst); return dst->error; } - + + if (dst->pmtu < 576) { + dst_release(dst); + return -EINVAL; + } + + if (fl.oif == 0 && addr_type&IPV6_ADDR_LINKLOCAL) { + /* Ough! This guy tries to connect to link local + * address and did not specify interface. 
+ * Actually we should kick him out, but + * we will be patient :) --ANK + */ + sk->bound_dev_if = dst->dev->ifindex; + } + ip6_dst_store(sk, dst); - np->oif = dst->dev; - if (saddr == NULL) { ifa = ipv6_get_saddr(dst, &np->daddr); @@ -449,117 +478,38 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, ipv6_addr_copy(&np->saddr, saddr); } - sk->dummy_th.dest = usin->sin6_port; - if (!tcp_v6_unique_address(sk)) + buff = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header), + 0, GFP_KERNEL); + + if (buff == NULL) + return -ENOBUFS; + + sk->dport = usin->sin6_port; + + if (!tcp_v6_unique_address(sk)) { + kfree_skb(buff); return -EADDRNOTAVAIL; + } /* * Init variables */ - lock_sock(sk); - tp->write_seq = secure_tcp_sequence_number(np->saddr.s6_addr32[3], np->daddr.s6_addr32[3], - sk->dummy_th.source, - sk->dummy_th.dest); - - tp->snd_wnd = 0; - tp->snd_wl1 = 0; - tp->snd_wl2 = tp->write_seq; - tp->snd_una = tp->write_seq; - - tp->rcv_nxt = 0; - - sk->err = 0; - - release_sock(sk); - - buff = sock_wmalloc(sk, (MAX_SYN_SIZE + sizeof(struct sk_buff)), - 0, GFP_KERNEL); - if (buff == NULL) { - /* FIXME: Free route references etc??? */ - return(-ENOMEM); - } - - lock_sock(sk); - - tcp_v6_build_header(sk, buff); - - tp->tcp_header_len = sizeof(struct tcphdr) + - (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0); - - /* build the tcp header */ - th = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr)); - buff->h.th = th; - - memcpy(th, (void *) &(sk->dummy_th), sizeof(*th)); - buff->seq = tp->write_seq++; - th->seq = htonl(buff->seq); - tp->snd_nxt = tp->write_seq; - buff->end_seq = tp->write_seq; - th->ack = 0; - th->syn = 1; - + sk->sport, sk->dport); sk->mtu = dst->pmtu; - sk->mss = (sk->mtu - sizeof(struct ipv6hdr) - tp->tcp_header_len); - - if (sk->mss < 1) { - printk(KERN_DEBUG "intial ipv6 sk->mss below 1\n"); - sk->mss = 1; /* Sanity limit */ - } - - tp->window_clamp = 0; /* FIXME: shouldn't ipv6 dst cache have this? */ - tcp_select_initial_window(sock_rspace(sk)/2,sk->mss, - &tp->rcv_wnd, - &tp->window_clamp, - sysctl_tcp_window_scaling, - &tp->rcv_wscale); - th->window = htons(tp->rcv_wnd); - - /* - * Put in the TCP options to say MTU. - */ - - tmp = tcp_syn_build_options(buff, sk->mss, sysctl_tcp_timestamps, - sysctl_tcp_window_scaling,tp->rcv_wscale); - th->doff = sizeof(*th)/4 + (tmp>>2); - buff->csum = 0; - tcp_v6_send_check(sk, th, sizeof(struct tcphdr) + tmp, buff); - - tcp_set_state(sk, TCP_SYN_SENT); - - /* Socket identity change complete, no longer - * in TCP_CLOSE, so enter ourselves into the - * hash tables. 
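This is the client half of the new interface scoping: a link-local destination only names a host relative to a device, so when the socket was never bound the connect path adopts the ifindex of the device the route resolved through. A small sketch of the decision, with a crude fe80: prefix test standing in for ipv6_addr_type() & IPV6_ADDR_LINKLOCAL:

    #include <stdio.h>
    #include <string.h>

    /* Pick the effective outgoing interface for a connect(): an
     * explicit binding always wins; otherwise a link-local peer
     * inherits the interface the route resolved to. */
    static int effective_oif(int bound_dev_if, const char *daddr, int route_ifindex)
    {
        if (bound_dev_if)
            return bound_dev_if;
        if (!strncmp(daddr, "fe80:", 5))   /* crude link-local test */
            return route_ifindex;
        return 0;                          /* global address: any device */
    }

    int main(void)
    {
        printf("%d\n", effective_oif(0, "fe80::1", 3));  /* 3: adopted from route */
        printf("%d\n", effective_oif(7, "fe80::1", 3));  /* 7: explicit bind wins */
        printf("%d\n", effective_oif(0, "3ffe::1", 3));  /* 0: unrestricted */
        return 0;
    }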
- */ - sk->prot->hash(sk); - - /* FIXME: should use dcache->rtt if availiable */ - tp->rto = TCP_TIMEOUT_INIT; - - tcp_init_xmit_timers(sk); - - tp->retransmits = 0; - - skb_queue_tail(&sk->write_queue, buff); - tp->packets_out++; - buff->when = jiffies; - skb1 = skb_clone(buff, GFP_KERNEL); - if(skb1 != NULL) { - skb_set_owner_w(skb1, sk); - tcp_v6_xmit(skb1); + mss = sk->mtu - sizeof(struct ipv6hdr); +#if 0 + if (np->opt) { + /* Adjust mss */ } +#endif - /* Timer for repeating the SYN until an answer */ - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - tcp_statistics.TcpActiveOpens++; - tcp_statistics.TcpOutSegs++; + tcp_connect(sk, buff, mss); - release_sock(sk); - - return(0); + return 0; } static int tcp_v6_sendmsg(struct sock *sk, struct msghdr *msg, int len) @@ -572,7 +522,7 @@ static int tcp_v6_sendmsg(struct sock *sk, struct msghdr *msg, int len) * Do sanity checking for sendmsg/sendto/send */ - if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT)) + if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL)) goto out; if (msg->msg_name) { struct sockaddr_in6 *addr=(struct sockaddr_in6 *)msg->msg_name; @@ -587,7 +537,7 @@ static int tcp_v6_sendmsg(struct sock *sk, struct msghdr *msg, int len) if(sk->state == TCP_CLOSE) goto out; retval = -EISCONN; - if (addr->sin6_port != sk->dummy_th.dest) + if (addr->sin6_port != sk->dport) goto out; if (ipv6_addr_cmp(&addr->sin6_addr, &np->daddr)) goto out; @@ -606,7 +556,7 @@ out: return retval; } -void tcp_v6_err(int type, int code, unsigned char *header, __u32 info, +void tcp_v6_err(struct sk_buff *skb, int type, int code, unsigned char *header, __u32 info, struct in6_addr *saddr, struct in6_addr *daddr, struct inet6_protocol *protocol) { @@ -616,13 +566,11 @@ void tcp_v6_err(int type, int code, unsigned char *header, __u32 info, int err; int opening; struct tcp_opt *tp; -#ifdef ICMP_PARANOIA __u32 seq; -#endif /* XXX: length check for tcphdr missing here */ - sk = tcp_v6_lookup(daddr, th->dest, saddr, th->source); + sk = tcp_v6_lookup(daddr, th->dest, saddr, th->source, skb->dev->ifindex); if (sk == NULL || sk->state == TCP_TIME_WAIT) { /* XXX: Update ICMP error count */ @@ -630,7 +578,6 @@ void tcp_v6_err(int type, int code, unsigned char *header, __u32 info, } tp = &sk->tp_pinfo.af_tcp; -#ifdef ICMP_PARANOIA seq = ntohl(th->seq); if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) { if (net_ratelimit()) @@ -639,8 +586,6 @@ void tcp_v6_err(int type, int code, unsigned char *header, __u32 info, (int)sk->state, seq, tp->snd_una, tp->snd_nxt); return; } -#endif - np = &sk->net_pinfo.af_inet6; if (type == ICMPV6_PKT_TOOBIG && sk->state != TCP_LISTEN) { @@ -656,9 +601,9 @@ void tcp_v6_err(int type, int code, unsigned char *header, __u32 info, fl.proto = IPPROTO_TCP; fl.nl_u.ip6_u.daddr = &np->daddr; fl.nl_u.ip6_u.saddr = &np->saddr; - fl.dev = np->oif; - fl.uli_u.ports.dport = sk->dummy_th.dest; - fl.uli_u.ports.sport = sk->dummy_th.source; + fl.oif = sk->bound_dev_if; + fl.uli_u.ports.dport = sk->dport; + fl.uli_u.ports.sport = sk->sport; dst = ip6_route_output(sk, &fl); @@ -696,7 +641,6 @@ void tcp_v6_err(int type, int code, unsigned char *header, __u32 info, req = tcp_v6_search_req(tp, &hd,th, &prev); if (!req) return; -#ifdef ICMP_PARANOIA if (seq != req->snt_isn) { if (net_ratelimit()) printk(KERN_DEBUG "icmp packet for openreq " @@ -704,7 +648,6 @@ void tcp_v6_err(int type, int code, unsigned char *header, __u32 info, seq, req->snt_isn); return; } -#endif if (req->sk) { sk = req->sk; /* report 
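With the hand-rolled SYN assembly gone, tcp_v6_connect() above now only derives an MSS budget and defers to the generic tcp_connect(); the SYN-ACK path further below also strips the base TCP header before tcp_make_synack(). The arithmetic, worked for a typical Ethernet path (the numbers are illustrative, not taken from the patch):

    #include <stdio.h>

    #define IPV6_HDR_LEN 40   /* fixed IPv6 header */
    #define TCP_HDR_LEN  20   /* TCP header without options */

    int main(void)
    {
        int pmtu = 1500;                              /* typical Ethernet path */

        /* connect path: pmtu minus the IPv6 header only */
        printf("connect mss budget: %d\n", pmtu - IPV6_HDR_LEN);
        /* SYN-ACK path: both network and base transport header removed */
        printf("syn-ack mss:        %d\n", pmtu - IPV6_HDR_LEN - TCP_HDR_LEN);
        return 0;
    }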
error in accept */ } else { @@ -739,87 +682,42 @@ void tcp_v6_err(int type, int code, unsigned char *header, __u32 info, static void tcp_v6_send_synack(struct sock *sk, struct open_request *req) { struct sk_buff * skb; - struct tcphdr *th; struct dst_entry *dst; struct flowi fl; - int tmp; - - skb = sock_wmalloc(sk, MAX_SYN_SIZE, 1, GFP_ATOMIC); - if (skb == NULL) - return; + int mss; fl.proto = IPPROTO_TCP; fl.nl_u.ip6_u.daddr = &req->af.v6_req.rmt_addr; fl.nl_u.ip6_u.saddr = &req->af.v6_req.loc_addr; - fl.dev = req->af.v6_req.dev; + fl.oif = req->af.v6_req.iif; fl.uli_u.ports.dport = req->rmt_port; - fl.uli_u.ports.sport = sk->dummy_th.source; + fl.uli_u.ports.sport = sk->sport; dst = ip6_route_output(sk, &fl); if (dst->error) { - kfree_skb(skb); dst_release(dst); return; } - skb->dev = dst->dev; - skb_reserve(skb, (skb->dev->hard_header_len + 15) & ~15); - skb->nh.ipv6h = (struct ipv6hdr *) skb_put(skb,sizeof(struct ipv6hdr)); - - skb->h.th = th = (struct tcphdr *) skb_put(skb, sizeof(struct tcphdr)); - - /* Yuck, make this header setup more efficient... -DaveM */ - memset(th, 0, sizeof(struct tcphdr)); - th->syn = 1; - th->ack = 1; - th->source = sk->dummy_th.source; - th->dest = req->rmt_port; - skb->seq = req->snt_isn; - skb->end_seq = skb->seq + 1; - th->seq = ntohl(skb->seq); - th->ack_seq = htonl(req->rcv_isn + 1); - - /* Don't offer more than they did. - * This way we don't have to memorize who said what. - * FIXME: the selection of initial mss here doesn't quite - * match what happens under IPV4. Figure out the right thing to do. - */ - req->mss = min(sk->mss, req->mss); - if(sk->user_mss) - req->mss = min(req->mss, sk->user_mss); - if(req->tstamp_ok == 0) - req->mss += TCPOLEN_TSTAMP_ALIGNED; - - if (req->rcv_wnd == 0) { - __u8 rcv_wscale; - /* Set this up on the first call only */ - req->window_clamp = 0; /* FIXME: should be in dst cache */ - tcp_select_initial_window(sock_rspace(sk)/2,req->mss, - &req->rcv_wnd, - &req->window_clamp, - req->wscale_ok, - &rcv_wscale); - req->rcv_wscale = rcv_wscale; + mss = dst->pmtu - sizeof(struct ipv6hdr) - sizeof(struct tcphdr); +#if 0 + /* Subtract option length... */ + if (opt) { + mss -= opt->optlen; } - th->window = htons(req->rcv_wnd); - - tmp = tcp_syn_build_options(skb, req->mss, req->tstamp_ok, - req->wscale_ok,req->rcv_wscale); - skb->csum = 0; - th->doff = (sizeof(*th) + tmp)>>2; - th->check = tcp_v6_check(th, sizeof(*th) + tmp, - &req->af.v6_req.loc_addr, &req->af.v6_req.rmt_addr, - csum_partial((char *)th, sizeof(*th)+tmp, skb->csum)); - - /* Actually we should not attach dst to socket in state LISTEN, - it results in stale destination per listen socket and - overflow of routing cache. - (IPv4 has the same flaw with more unpleasant consequences.) 
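The comment deleted just above records the motivation: caching each SYN's route on the listening socket pins one stale destination per listener and bloats the routing cache, so the rewritten tcp_v6_send_synack() releases the route as soon as the reply is built. A toy rendering of that borrow-and-release discipline, with refcounting reduced to a bare integer and all names invented:

    #include <stdio.h>

    struct dst_entry { int refcnt; int error; };

    static struct dst_entry route = { 0, 0 };

    static struct dst_entry *route_output_model(void)
    {
        route.refcnt++;                 /* lookup returns a held reference */
        return &route;
    }

    static void dst_release_model(struct dst_entry *dst)
    {
        dst->refcnt--;
    }

    /* Per-reply discipline: take a reference for this one transmit and
     * drop it before returning, instead of parking the route on the
     * listening socket. */
    static void send_synack_model(void)
    {
        struct dst_entry *dst = route_output_model();
        if (!dst->error)
            printf("xmit with refcnt %d\n", dst->refcnt);
        dst_release_model(dst);
    }

    int main(void)
    {
        send_synack_model();
        printf("after send: refcnt %d (no lingering listener ref)\n", route.refcnt);
        return 0;
    }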
- */ - ip6_dst_store(sk, dst); - ip6_xmit(sk, skb, &fl, req->af.v6_req.opt); +#endif + + skb = tcp_make_synack(sk, dst, req, mss); + if (skb) { + struct tcphdr *th = skb->h.th; - tcp_statistics.TcpOutSegs++; + th->check = tcp_v6_check(th, skb->len, + &req->af.v6_req.loc_addr, &req->af.v6_req.rmt_addr, + csum_partial((char *)th, skb->len, skb->csum)); + + ip6_xmit(sk, skb, &fl, req->af.v6_req.opt); + } + dst_release(dst); } static void tcp_v6_or_free(struct open_request *req) @@ -866,8 +764,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, req = tcp_openreq_alloc(); if (req == NULL) { - tcp_statistics.TcpAttemptFails++; - goto exit; } sk->ack_backlog++; @@ -876,22 +772,27 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, req->rcv_isn = skb->seq; req->snt_isn = isn; - tp.tstamp_ok = tp.wscale_ok = tp.snd_wscale = 0; + tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0; tp.in_mss = 536; - tcp_parse_options(skb->h.th,&tp,0); + tcp_parse_options(NULL, skb->h.th, &tp, 0); req->mss = tp.in_mss; if (tp.saw_tstamp) { req->mss -= TCPOLEN_TSTAMP_ALIGNED; req->ts_recent = tp.rcv_tsval; } req->tstamp_ok = tp.tstamp_ok; + req->sack_ok = tp.sack_ok; req->snd_wscale = tp.snd_wscale; req->wscale_ok = tp.wscale_ok; req->rmt_port = skb->h.th->source; ipv6_addr_copy(&req->af.v6_req.rmt_addr, &skb->nh.ipv6h->saddr); ipv6_addr_copy(&req->af.v6_req.loc_addr, &skb->nh.ipv6h->daddr); req->af.v6_req.opt = NULL; /* FIXME: options */ - req->af.v6_req.dev = skb->dev; /* So that link locals have meaning */ + req->af.v6_req.iif = sk->bound_dev_if; + + /* So that link locals have meaning */ + if (!sk->bound_dev_if && ipv6_addr_type(&req->af.v6_req.rmt_addr)&IPV6_ADDR_LINKLOCAL) + req->af.v6_req.iif = skb->dev->ifindex; req->class = &or_ipv6; req->retrans = 0; @@ -928,6 +829,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, struct flowi fl; struct tcp_opt *newtp; struct sock *newsk; + int mss; if (skb->protocol == __constant_htons(ETH_P_IP)) { /* @@ -955,13 +857,36 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, return newsk; } - newsk = tcp_create_openreq_child(sk, req, skb); - if (newsk == NULL) { - dst_release(dst); - return NULL; + + if (dst == NULL) { + /* + * options / mss / route cache + */ + + fl.proto = IPPROTO_TCP; + fl.nl_u.ip6_u.daddr = &req->af.v6_req.rmt_addr; + fl.nl_u.ip6_u.saddr = &req->af.v6_req.loc_addr; + fl.oif = sk->bound_dev_if; + fl.uli_u.ports.dport = req->rmt_port; + fl.uli_u.ports.sport = sk->sport; + + dst = ip6_route_output(sk, &fl); } - newsk->dst_cache = NULL; + if (dst->error || dst->pmtu < 576) + goto out; + + + mss = dst->pmtu - sizeof(struct ipv6hdr); +#if 0 + /* Adjust mss by option size */ +#endif + + newsk = tcp_create_openreq_child(sk, req, skb, mss); + if (newsk == NULL) + goto out; + + ip6_dst_store(newsk, dst); newtp = &(newsk->tp_pinfo.af_tcp); @@ -969,52 +894,9 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, ipv6_addr_copy(&np->daddr, &req->af.v6_req.rmt_addr); ipv6_addr_copy(&np->saddr, &req->af.v6_req.loc_addr); ipv6_addr_copy(&np->rcv_saddr, &req->af.v6_req.loc_addr); - np->oif = req->af.v6_req.dev; - - if (dst == NULL) { - /* - * options / mss / route cache - */ - - fl.proto = IPPROTO_TCP; - fl.nl_u.ip6_u.daddr = &np->daddr; - fl.nl_u.ip6_u.saddr = &np->saddr; - fl.dev = np->oif; - fl.uli_u.ports.dport = newsk->dummy_th.dest; - fl.uli_u.ports.sport = newsk->dummy_th.source; - - dst = 
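The conn_request hunk above is the server half of the scoping rule from the connect path: every open_request now records an interface index, normally the listener's own binding, falling back to the arriving device only for link-local peers. Modelled below with a boolean standing in for the address-type test:

    #include <stdio.h>
    #include <stdbool.h>

    /* Decide which ifindex to stamp into a new open_request. */
    static int request_iif(int listener_bound_dev_if, bool peer_is_linklocal,
                           int arriving_ifindex)
    {
        int iif = listener_bound_dev_if;
        if (!iif && peer_is_linklocal)
            iif = arriving_ifindex;   /* link-local peers only make sense per device */
        return iif;
    }

    int main(void)
    {
        printf("%d\n", request_iif(0, true, 4));   /* 4: pinned to arriving device */
        printf("%d\n", request_iif(2, true, 4));   /* 2: listener binding wins */
        printf("%d\n", request_iif(0, false, 4));  /* 0: global peer, unpinned */
        return 0;
    }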
ip6_route_output(newsk, &fl); - } - - ip6_dst_store(newsk, dst); - - newtp->tstamp_ok = req->tstamp_ok; - newtp->window_clamp = req->window_clamp; - newtp->rcv_wnd = req->rcv_wnd; - newtp->wscale_ok = req->wscale_ok; - if (newtp->wscale_ok) { - newtp->snd_wscale = req->snd_wscale; - newtp->rcv_wscale = req->rcv_wscale; - } else { - newtp->snd_wscale = newtp->rcv_wscale = 0; - newtp->window_clamp = min(newtp->window_clamp,65535); - } - if (newtp->tstamp_ok) { - newtp->ts_recent = req->ts_recent; - newtp->ts_recent_stamp = jiffies; - newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; - newsk->dummy_th.doff += (TCPOLEN_TSTAMP_ALIGNED >> 2); - } else { - newtp->tcp_header_len = sizeof(struct tcphdr); - } - - if (dst->error) - newsk->mtu = req->af.v6_req.dev->mtu; - else - newsk->mtu = dst->pmtu; - - newsk->mss = min(req->mss+sizeof(struct tcphdr)-newtp->tcp_header_len, - (newsk->mtu - sizeof(struct ipv6hdr) - newtp->tcp_header_len)); + newsk->bound_dev_if = req->af.v6_req.iif; + newsk->mtu = dst->pmtu; + newsk->opt = NULL; newsk->daddr = LOOPBACK4_IPV6; newsk->saddr = LOOPBACK4_IPV6; @@ -1023,6 +905,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->prot->hash(newsk); add_to_prot_sklist(newsk); return newsk; + +out: + dst_release(dst); + return NULL; } static void tcp_v6_send_reset(struct sk_buff *skb) @@ -1031,7 +917,7 @@ static void tcp_v6_send_reset(struct sk_buff *skb) struct sk_buff *buff; struct flowi fl; - if(th->rst) + if (th->rst) return; /* @@ -1039,21 +925,16 @@ static void tcp_v6_send_reset(struct sk_buff *skb) * and then put it into the queue to be sent. */ - buff = alloc_skb(MAX_RESET_SIZE, GFP_ATOMIC); + buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr), GFP_ATOMIC); if (buff == NULL) return; - buff->dev = skb->dev; + skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr)); - tcp_v6_build_header(NULL, buff); + t1 = (struct tcphdr *) skb_push(buff,sizeof(struct tcphdr)); - t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr)); + /* Swap the send and the receive. */ memset(t1, 0, sizeof(*t1)); - - /* - * Swap the send and the receive. - */ - t1->dest = th->source; t1->source = th->dest; t1->doff = sizeof(*t1)/4; @@ -1080,13 +961,21 @@ static void tcp_v6_send_reset(struct sk_buff *skb) buff->csum); fl.proto = IPPROTO_TCP; - fl.dev = skb->dev; + fl.oif = skb->dev->ifindex; fl.uli_u.ports.dport = t1->dest; fl.uli_u.ports.sport = t1->source; - ip6_xmit(NULL, buff, &fl, NULL); - tcp_statistics.TcpOutSegs++; - tcp_statistics.TcpOutRsts++; + /* sk = NULL, but it is safe for now. RST socket required. 
*/ + buff->dst = ip6_route_output(NULL, &fl); + + if (buff->dst->error == 0) { + ip6_xmit(NULL, buff, &fl, NULL); + tcp_statistics.TcpOutSegs++; + tcp_statistics.TcpOutRsts++; + return; + } + + kfree_skb(buff); } static struct open_request *tcp_v6_search_req(struct tcp_opt *tp, @@ -1182,7 +1071,7 @@ int tcp_v6_rcv(struct sk_buff *skb, struct device *dev, /* CHECKSUM_UNNECESSARY */ }; - sk = __tcp_v6_lookup(th, saddr, th->source, daddr, th->dest); + sk = __tcp_v6_lookup(th, saddr, th->source, daddr, th->dest, dev->ifindex); if (!sk) { printk(KERN_DEBUG "socket not found\n"); @@ -1267,37 +1156,35 @@ do_time_wait: goto discard_it; } -static int tcp_v6_rebuild_header(struct sock *sk, struct sk_buff *skb) +static int tcp_v6_rebuild_header(struct sock *sk) { + struct dst_entry *dst = NULL; struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; if (sk->dst_cache) - dst_check(&sk->dst_cache, np->dst_cookie); + dst = dst_check(&sk->dst_cache, np->dst_cookie); - if (sk->dst_cache == NULL) { + if (dst == NULL) { struct flowi fl; - struct dst_entry *dst; fl.proto = IPPROTO_TCP; fl.nl_u.ip6_u.daddr = &np->daddr; fl.nl_u.ip6_u.saddr = &np->saddr; - fl.dev = np->oif; - fl.uli_u.ports.dport = sk->dummy_th.dest; - fl.uli_u.ports.sport = sk->dummy_th.source; + fl.oif = sk->bound_dev_if; + fl.uli_u.ports.dport = sk->dport; + fl.uli_u.ports.sport = sk->sport; dst = ip6_route_output(sk, &fl); - ip6_dst_store(sk, dst); - } - if (sk->dst_cache->error) { - /* - * lost route to destination - */ - return -EHOSTUNREACH; + if (dst->error) { + dst_release(dst); + return dst->error; + } + + ip6_dst_store(sk, dst); } - skb_pull(skb, skb->nh.raw - skb->data); - return 0; + return dst->error; } static int tcp_v6_backlog_rcv(struct sock *sk, struct sk_buff *skb) @@ -1319,20 +1206,7 @@ static struct sock * tcp_v6_get_sock(struct sk_buff *skb, struct tcphdr *th) saddr = &skb->nh.ipv6h->saddr; daddr = &skb->nh.ipv6h->daddr; - return tcp_v6_lookup(saddr, th->source, daddr, th->dest); -} - -static int tcp_v6_build_header(struct sock *sk, struct sk_buff *skb) -{ - skb_reserve(skb, (MAX_HEADER + 15) & ~15); - skb->nh.raw = skb_put(skb, sizeof(struct ipv6hdr)); - - /* - * FIXME: reserve space for option headers - * length member of np->opt - */ - - return 0; + return tcp_v6_lookup(saddr, th->source, daddr, th->dest, skb->dev->ifindex); } static void tcp_v6_xmit(struct sk_buff *skb) @@ -1340,22 +1214,33 @@ static void tcp_v6_xmit(struct sk_buff *skb) struct sock *sk = skb->sk; struct ipv6_pinfo * np = &sk->net_pinfo.af_inet6; struct flowi fl; - int err; + struct dst_entry *dst = sk->dst_cache; fl.proto = IPPROTO_TCP; fl.nl_u.ip6_u.daddr = &np->daddr; fl.nl_u.ip6_u.saddr = &np->saddr; - fl.dev = np->oif; - fl.uli_u.ports.sport = sk->dummy_th.source; - fl.uli_u.ports.dport = sk->dummy_th.dest; + fl.oif = sk->bound_dev_if; + fl.uli_u.ports.sport = sk->sport; + fl.uli_u.ports.dport = sk->dport; - err = ip6_xmit(sk, skb, &fl, np->opt); + if (sk->dst_cache) + dst = dst_check(&sk->dst_cache, np->dst_cookie); - /* - * FIXME: check error handling. 
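tcp_v6_send_reset() above now resolves a route of its own, with sk == NULL, and transmits only when the lookup succeeds, freeing the buffer otherwise. The shape of that route-then-send-or-drop logic, with invented stand-in types:

    #include <stdio.h>

    struct dst { int error; };

    static struct dst ok_route   = { 0 };
    static struct dst dead_route = { -1 };

    /* A reset belongs to no socket, so the route decides whether the
     * packet goes out or is quietly thrown away. */
    static void send_orphan_reply(struct dst *dst)
    {
        if (dst->error == 0) {
            printf("RST sent\n");
            return;
        }
        printf("no route: RST dropped\n");   /* kfree_skb(buff) in the kernel */
    }

    int main(void)
    {
        send_orphan_reply(&ok_route);
        send_orphan_reply(&dead_route);
        return 0;
    }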
- */ + if (dst == NULL) { + dst = ip6_route_output(sk, &fl); - sk->err_soft = err; + if (dst->error) { + sk->err_soft = dst->error; + dst_release(dst); + return; + } + + ip6_dst_store(sk, dst); + } + + skb->dst = dst_clone(dst); + + ip6_xmit(sk, skb, &fl, np->opt); } static void v6_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) @@ -1365,11 +1250,10 @@ static void v6_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) sin6->sin6_family = AF_INET6; memcpy(&sin6->sin6_addr, &np->daddr, sizeof(struct in6_addr)); - sin6->sin6_port = sk->dummy_th.dest; + sin6->sin6_port = sk->dport; } static struct tcp_func ipv6_specific = { - tcp_v6_build_header, tcp_v6_xmit, tcp_v6_send_check, tcp_v6_rebuild_header, @@ -1387,7 +1271,6 @@ static struct tcp_func ipv6_specific = { */ static struct tcp_func ipv6_mapped = { - tcp_v4_build_header, ip_queue_xmit, tcp_v4_send_check, tcp_v4_rebuild_header, @@ -1425,11 +1308,6 @@ static int tcp_v6_init_sock(struct sock *sk) sk->max_ack_backlog = SOMAXCONN; sk->mtu = 576; sk->mss = 536; - sk->dummy_th.doff = sizeof(sk->dummy_th)/4; - - /* Speed up by setting some standard state for the dummy_th. */ - sk->dummy_th.ack=1; - sk->dummy_th.doff=sizeof(struct tcphdr)>>2; /* Init SYN queue. */ tcp_synq_init(tp); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 40e9b0233..6078ab679 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -7,7 +7,7 @@ * * Based on linux/ipv4/udp.c * - * $Id: udp.c,v 1.24 1998/03/12 03:20:21 davem Exp $ + * $Id: udp.c,v 1.27 1998/03/21 07:28:06 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -151,8 +151,8 @@ static struct sock *udp_v6_lookup(struct in6_addr *saddr, u16 sport, !(sk->dead && (sk->state == TCP_CLOSE))) { struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; int score = 0; - if(sk->dummy_th.dest) { - if(sk->dummy_th.dest != sport) + if(sk->dport) { + if(sk->dport != sport) continue; score++; } @@ -241,7 +241,7 @@ int udpv6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) ipv6_addr_copy(&np->daddr, daddr); - sk->dummy_th.dest = usin->sin6_port; + sk->dport = usin->sin6_port; /* * Check for a route to destination an obtain the @@ -251,9 +251,9 @@ int udpv6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) fl.proto = IPPROTO_UDP; fl.nl_u.ip6_u.daddr = daddr; fl.nl_u.ip6_u.saddr = NULL; - fl.dev = NULL; - fl.uli_u.ports.dport = sk->dummy_th.dest; - fl.uli_u.ports.sport = sk->dummy_th.source; + fl.oif = sk->bound_dev_if; + fl.uli_u.ports.dport = sk->dport; + fl.uli_u.ports.sport = sk->sport; dst = ip6_route_output(sk, &fl); @@ -363,7 +363,7 @@ out: return err; } -void udpv6_err(int type, int code, unsigned char *buff, __u32 info, +void udpv6_err(struct sk_buff *skb, int type, int code, unsigned char *buff, __u32 info, struct in6_addr *saddr, struct in6_addr *daddr, struct inet6_protocol *protocol) { @@ -428,8 +428,8 @@ static struct sock *udp_v6_mcast_next(struct sock *sk, if((s->num == num) && !(s->dead && (s->state == TCP_CLOSE))) { struct ipv6_pinfo *np = &s->net_pinfo.af_inet6; - if(s->dummy_th.dest) { - if(s->dummy_th.dest != rmt_port) + if(s->dport) { + if(s->dport != rmt_port) continue; } if(!ipv6_addr_any(&np->daddr) && @@ -644,7 +644,6 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen) struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) msg->msg_name; struct ipv6_options *opt = NULL; - struct device *dev = NULL; struct flowi fl; int 
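tcp_v6_rebuild_header() and tcp_v6_xmit() above now share one discipline: dst_check() validates the cached route against its cookie, and only a stale or missing cache triggers a fresh ip6_route_output() plus ip6_dst_store(). A userspace rendering of that check-or-recompute pattern, with the cookie reduced to a generation counter:

    #include <stdio.h>

    struct dst { int generation; };

    static int route_generation = 1;          /* bumped when routing changes */
    static struct dst table_route;

    static struct dst *route_output_model(void)
    {
        table_route.generation = route_generation;
        return &table_route;
    }

    /* dst_check(): a cached route is only usable while its generation
     * matches the table's; otherwise the caller must look up again. */
    static struct dst *dst_check_model(struct dst *cached)
    {
        if (cached && cached->generation == route_generation)
            return cached;
        return NULL;
    }

    int main(void)
    {
        struct dst *cache = route_output_model();       /* first xmit fills the cache */

        printf("hit: %s\n", dst_check_model(cache) ? "yes" : "no");
        route_generation++;                             /* routing change invalidates */
        printf("hit after change: %s\n", dst_check_model(cache) ? "yes" : "no");
        cache = route_output_model();                   /* re-route and re-store */
        printf("hit after refill: %s\n", dst_check_model(cache) ? "yes" : "no");
        return 0;
    }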
addr_len = msg->msg_namelen; struct in6_addr *daddr; @@ -692,7 +691,7 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen) if (sk->state != TCP_ESTABLISHED) return(-EINVAL); - udh.uh.dest = sk->dummy_th.dest; + udh.uh.dest = sk->dport; daddr = &sk->net_pinfo.af_inet6.daddr; } @@ -708,22 +707,21 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen) } udh.daddr = NULL; + fl.oif = sk->bound_dev_if; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_options)); - err = datagram_send_ctl(msg, &dev, &saddr, opt, &hlimit); - if (err < 0) { - printk(KERN_DEBUG "invalid msg_control\n"); + err = datagram_send_ctl(msg, &fl.oif, &saddr, opt, &hlimit); + if (err < 0) return err; - } if (opt->srcrt) udh.daddr = daddr; } - udh.uh.source = sk->dummy_th.source; + udh.uh.source = sk->sport; udh.uh.len = htons(len); udh.uh.check = 0; udh.iov = msg->msg_iov; @@ -733,7 +731,6 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen) fl.proto = IPPROTO_UDP; fl.nl_u.ip6_u.daddr = daddr; fl.nl_u.ip6_u.saddr = saddr; - fl.dev = dev; fl.uli_u.ports.dport = udh.uh.dest; fl.uli_u.ports.sport = udh.uh.source; diff --git a/net/netbeui/README b/net/netbeui/README deleted file mode 100644 index 02e270b5f..000000000 --- a/net/netbeui/README +++ /dev/null @@ -1,19 +0,0 @@ - -NetBEUI is a rather weird protocol. There are about three different set -of connection and name spaces here. - -Firstly we have an array of 802.2 LLC links acting as reliable inter node -links for the nodes we are talking to do. We create and tear these down as -needed. In effect it goes around pretending ethernet is a set of bits of -wire and running pseudo X.25 over it. The LLC code is elsewhere (net/802). - -Secondly we have the netbios name space. When we sit on multiple networks -we have fun. Netbios isnt routable, so we have to arse around looking on -all our devices for names. - -Thirdly we have logical netbeui sessions on top of the whole heap. - - *Don't blame us* - -We didn't design the protocol. - diff --git a/net/netbeui/af_netbeui.c b/net/netbeui/af_netbeui.c deleted file mode 100644 index 6769edde5..000000000 --- a/net/netbeui/af_netbeui.c +++ /dev/null @@ -1,659 +0,0 @@ - -#include <linux/config.h> -#include <linux/module.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/bitops.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> -#include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/route.h> -#include <linux/inet.h> -#include <linux/notifier.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/if_arp.h> -#include <linux/skbuff.h> -#include <linux/termios.h> /* For TIOCOUTQ/INQ */ -#include <linux/poll.h> -#include <net/datalink.h> -#include <net/p8022.h> -#include <net/psnap.h> -#include <net/sock.h> -#include <linux/proc_fs.h> -#include <linux/stat.h> -#include <linux/firewall.h> -#include <linux/init.h> - - -#undef NETBEUI_DEBUG - - -#ifdef NETBEUI_DEBUG -#define DPRINT(x) print(x) -#else -#define DPRINT(x) -#endif - -#define min(a,b) (((a)<(b))?(a):(b)) - -/***********************************************************************************************************************\ -* * -* Handlers for the socket list. 
* -* * -\***********************************************************************************************************************/ - -static netbeui_socket *netbeui_socket_list=NULL; - -/* - * Note: Sockets may not be removed _during_ an interrupt or inet_bh - * handler using this technique. They can be added although we do not - * use this facility. - */ - -extern inline void netbeui_remove_socket(netbeui_socket *sk) -{ - sklist_remove_socket(&netbeui_socket_list,sk); -} - -extenr inline void netbeui_insert_socket(netbeui_socket *sk) -{ - sklist_insert_socket(&netbeui_socket_list,sk); - netbeui_socket_list=sk; - restore_flags(flags); -} - -static void netbeui_destroy_socket(netbeui_socket *sk) -{ - /* - * Release netbios logical channels first - */ - if(sk->af_nb.nb_link) - { - netbeui_delete_channel(sk->af_nb.nb_link); - sk->af_nb.nb_link=NULL; - } - if(sk->af_nb.src_name) - { - netbeui_release_name(sk->af_nb.src_name); - sk->af_nb.src_name=NULL; - } - if(sk->af_nb.dst_name) - { - netbeui_release_name(sk->af_nb.dst_name); - sk->af_nb.dst_name=NULL; - } - netbeui_remove_listener(sk); - sklist_destroy_socket(&netbeui_socket,sk); -} - -/* - * Called from proc fs - */ - -int netbeui_get_info(char *buffer, char **start, off_t offset, int length, int dummy) -{ - return 0; -} - -/* - * A device event has occurred. Watch for devices going down and - * delete our use of them (iface and route). - */ - -static int nb_device_event(struct notifier_block *this, unsigned long event, void *ptr) -{ - if(event==NETDEV_DOWN) - { - /* Discard any use of this */ - netbeui_drop_device((struct device *)ptr); - } - return NOTIFY_DONE; -} - -/*******************************************************************************************************************\ -* * -* Handling for system calls applied via the various interfaces to a netbeui socket object * -* * -\*******************************************************************************************************************/ - -static int netbeui_listen(struct socket *sock, int backlog) -{ - struct sock *sk=(netbeui_socket *)sock->data; - if(sk->state!=TCP_CLOSED) - return -EINVAL; - if(backlog<0) - return -EINVAL; - if(backlog<128) - sk->backlog=backlog; - else - sk->backlog=128; - sk->state=TCP_LISTEN; - sk->state_change(sk); - netbeui_add_listener(sk); - return 0; -} - -/* - * Create a socket. Initialise the socket, blank the addresses - * set the state. - */ - -static int netbeui_create(struct socket *sock, int protocol) -{ - netbeui_socket *sk; - sk=(netbeui_socket *)sk_alloc(GFP_KERNEL, 1); - if(sk==NULL) - return(-ENOBUFS); - switch(sock->type) - { - case SOCK_DGRAM: - break; - case SOCK_SEQPACKET: - break; - default: - sk_free((void *)sk); - return(-ESOCKTNOSUPPORT); - } - - MOD_INC_USE_COUNT; - - sock_init_data(sock,sk); - sk->mtu=1500; - return(0); -} - -/* - * Copy a socket. No work needed. - */ - -static int netbeui_dup(struct socket *newsock,struct socket *oldsock) -{ - return(netbeui_create(newsock,oldsock->type)); -} - -/* - * Free a socket. No work needed - */ - -static int netbeui_release(struct socket *sock, struct socket *peer) -{ - netbeui_socket *sk=(netbeui_socket *)sock->data; - if(sk==NULL) - return(0); - if(!sk->dead) - sk->state_change(sk); - sk->dead=1; - sock->data=NULL; - netbeui_destroy_socket(sk); - return(0); -} - -/* - * Set the address 'our end' of the connection. 
- */ - -static int netbeui_bind(struct socket *sock, struct sockaddr *uaddr,size_t addr_len) -{ - netbeui_socket *sk; - struct sockaddr_netbeui *addr=(struct sockaddr_netbeui *)uaddr; - int err; - - sk=(netbeui_socket *)sock->data; - - if(sk->zapped==0) - return(-EINVAL); - - if(addr_len!=sizeof(struct sockaddr_netbeui)) - return -EINVAL; - - if(addr->snb_family!=AF_NETBEUI) - return -EAFNOSUPPORT; - - /* - * This will sleep. To meet POSIX it is non interruptible. - * Someone should give the 1003.1g authors an injection of - * imagination... - */ - - if(sk->af_nb.src_name!=NULL) - return -EINVAL; - - /* - * Try and get the name. It may return various 'invalid' name - * problem reports or EADDRINUSE if we or another node holds - * the desired name. - */ - - sk->af_nb.src_name=netbeui_alloc_name(addr, &err); - if(sk->af_nb.src_name==NULL) - return err; - /* - * Add us to the active socket list - */ - netbeui_insert_socket(sk); - sk->zapped=0; - return(0); -} - -/* - * Set the address we talk to. - */ - -static int netbeui_connect(struct socket *sock, struct sockaddr *uaddr, - size_t addr_len, int flags) -{ - netbeui_socket *sk=(netbeui_socket *)sock->data; - struct sockaddr_netbeui *addr=(struct sockaddr_netbeui *)uaddr; - - /* - * Check pending operations - */ - - if(sk->state==TCP_ESTABLISHED && sock->state == SS_CONNECTING) - { - sock->state==SS_CONNECTED; - return 0; - } - - if(sk->state == TCP_CLOSE & sock->state == SS_CONNECTING) - { - sock->state==SS_UNCONNECTED; - return -ECONNREFUSED; - } - - if(sock->state == SS_CONNECTING && (flags & O_NONBLOCK)) - return -EINPROGRESS; - - if(sk->state==TCP_ESTABLISHED) - return -EISCONN; - - /* - * If this is new it must really be new... - */ - - if(sk->af_nb.dst_name==NULL) - { - if(addr_len != sizeof(struct sockaddr_nb)) - return -EINVAL; - if(addr->snb_family!=AF_NETBEUI) - return -EAFNOSUPPORT; - /* - * Try and find the name - */ - } -} - -/* - * Not relevant - */ - -static int netbeui_socketpair(struct socket *sock1, struct socket *sock2) -{ - return(-EOPNOTSUPP); -} - -/* - * WRITE ME - */ - -static int netbeui_accept(struct socket *sock, struct socket *newsock, int flags) -{ - if(newsock->data) - sk_free(newsock->data); - return -EOPNOTSUPP; -} - -/* - * Find the name of a netbeui socket. Just copy the right - * fields into the sockaddr. - */ - -static int netbeui_getname(struct socket *sock, struct sockaddr *uaddr, - size_t *uaddr_len, int peer) -{ - struct sockaddr_netbeui snb; - netbeui_socket *sk; - - sk=(netbeui_socket *)sock->data; - if(sk->zapped) - { - return -EINVAL; - } - - *uaddr_len = sizeof(struct sockaddr_netbeui); - - if(peer) - { - if(sk->state!=TCP_ESTABLISHED) - return -ENOTCONN; - } - else - { - } - snb.snb_family = AF_NETBEUI; - memcpy(uaddr,&snb,sizeof(snb)); - return(0); -} - -/* - * Receive a packet (in skb) from device dev. - */ - -static int netbeui_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) -{ - return nb_llc_rcv(skb); -} - -static int netbeui_sendmsg(struct socket *sock, struct msghdr *msg, int len, int nonblock, int flags) -{ - netbeui_socket *sk=(netbeui_socket *)sock->data; - struct sockaddr_nb *usnb=(struct sockaddr_nb *)msg->msg_name; - struct sk_buff *skb; - struct device *dev; - struct nbhdr *nbp; - int size; - struct netbeui_route *rt; - int loopback=0; - int err; - - if(flags) - return -EINVAL; - - if(len>1500) /* - headers!! 
*/ - return -EMSGSIZE; - - if(usnb) - { - if(sk->zapped) - { - if(netbeui_autobind(sk)<0) - return -EBUSY; - } - - if(msg->msg_namelen <sizeof(*usnb)) - return(-EINVAL); - if(usnb->snb_family != AF_NETBEUI) - return -EINVAL; - /* Check broadcast */ - } - else - { - if(sk->state!=TCP_ESTABLISHED) - return -ENOTCONN; - /* Connected .. */ - } - - /* Build a packet */ - SOCK_DEBUG(sk, "SK %p: Got address.\n",sk); - size=sizeof(struct nbhdr)+len+nb_dl->header_length; /* For headers */ - - SOCK_DEBUG(sk, "SK %p: Size needed %d, device %s\n", sk, size, dev->name); - size += dev->hard_header_len; - skb = sock_alloc_send_skb(sk, size, 0, 0 , &err); - if(skb==NULL) - return err; - - skb->sk=sk; - skb->free=1; - skb->arp=1; - skb_reserve(skb,nb_dl->header_length); - skb_reserve(skb,dev->hard_header_len); - skb->dev=dev; - SOCK_DEBUG(sk, "SK %p: Begin build.\n", sk); - nbp=(struct nbhdr *)skb_put(skb,sizeof(struct nbhdr)); - SOCK_DEBUG(sk, "SK %p: Copy user data (%d bytes).\n", sk, len); - err = memcpy_fromiovec(skb_put(skb,len),msg->msg_iov,len); - if (err) - { - kfree_skb(skb); - return -EFAULT; - } - -#ifdef CONFIG_FIREWALL - - if(call_out_firewall(AF_NETBEUI, skb->dev, nbp, NULL)!=FW_ACCEPT) - { - kfree_skb(skb); - return -EPERM; - } - -#endif - - if(nb_send_low(dev,skb,&usat->sat_addr, NULL)==-1) - kfree_skb(skb); - SOCK_DEBUG(sk, "SK %p: Done write (%d).\n", sk, len); - return len; -} - - -static int netbeui_recvmsg(struct socket *sock, struct msghdr *msg, int size, int noblock, int flags, int *addr_len) -{ - netbeui_socket *sk=(netbeui_socket *)sock->data; - struct sockaddr_nb *snb=(struct sockaddr_nb *)msg->msg_name; - struct nbphdr *nbp = NULL; - int copied = 0; - struct sk_buff *skb; - int er = 0; - - if(addr_len) - *addr_len=sizeof(*snb); - - skb=skb_recv_datagram(sk,flags,noblock,&er); - if(skb==NULL) - return er; - - snb = (struct nbphdr *)(skb->h.raw); - if(sk->type==SOCK_RAW) - { - copied=skb->len - if(copied > size) - { - copied=size; - msg->msg_flags|=MSG_TRUNC; - } - er = skb_copy_datagram_iovec(skb,0,msg->msg_iov,copied); - if (er) - goto out; - } - else - { - copied=skb->len - sizeof(*nbp); - if (copied > size) - { - copied = size; - msg->msg_flags|=MSG_TRUNC; - } - er = skb_copy_datagram_iovec(skb,sizeof(*nbp),msg->msg_iov,copied); - if (er) - goto out; - } - if(snb) - { - sat->sat_family=AF_NETBEUI; - /* Copy name over */ - } -out: - skb_free_datagram(sk, skb); - return er ? er : (copied); -} - - -static int netbeui_shutdown(struct socket *sk,int how) -{ - return -EOPNOTSUPP; -} - -static int netbeui_poll(struct socket *sock, poll_table *wait) -{ - netbeui_socket *sk=(netbeui_socket *)sock->data; - - return datagram_poll(sk,wait); -} - -/* - * Netbeui ioctl calls. - */ - -static int netbeui_ioctl(struct socket *sock,unsigned int cmd, unsigned long arg) -{ - long amount=0; - netbeui_socket *sk=(netbeui_socket *)sock->data; - - switch(cmd) - { - /* - * Protocol layer - */ - case TIOCOUTQ: - amount = sk->sndbuf - atomic_read(&sk->wmem_alloc); - if(amount<0) - amount=0; - break; - case TIOCINQ: - { - struct sk_buff *skb; - /* These two are safe on a single CPU system as only user tasks fiddle here */ - if((skb=skb_peek(&sk->receive_queue))!=NULL) - amount=skb->len-sizeof(struct ddpehdr); - break; - } - case SIOCGSTAMP: - if (sk) - { - if(sk->stamp.tv_sec==0) - return -ENOENT; - return copy_to_user((void *)arg,&sk->stamp,sizeof(struct timeval)) ? 
-EFAULT : 0; - } - return -EINVAL; - /* - * Routing - */ - case SIOCADDRT: - case SIOCDELRT: - if(!suser()) - return -EPERM; - return(nbrtr_ioctl(cmd,(void *)arg)); - /* - * Interface - */ - case SIOCGIFADDR: - case SIOCSIFADDR: - case SIOCGIFBRDADDR: - return nbif_ioctl(cmd,(void *)arg); - /* - * Physical layer ioctl calls - */ - case SIOCSIFLINK: - case SIOCGIFHWADDR: - case SIOCSIFHWADDR: - case SIOCGIFFLAGS: - case SIOCSIFFLAGS: - case SIOCGIFMTU: - case SIOCGIFCONF: - case SIOCADDMULTI: - case SIOCDELMULTI: - - return(dev_ioctl(cmd,(void *) arg)); - - case SIOCSIFMETRIC: - case SIOCSIFBRDADDR: - case SIOCGIFNETMASK: - case SIOCSIFNETMASK: - case SIOCGIFMEM: - case SIOCSIFMEM: - case SIOCGIFDSTADDR: - case SIOCSIFDSTADDR: - return -EINVAL; - - default: - return -EINVAL; - } - return put_user(amount, (int *)arg); -} - -static struct proto_ops netbeui_proto_ops = { - AF_NETBEUI, - - netbeui_create, - netbeui_dup, - netbeui_release, - netbeui_bind, - netbeui_connect, - netbeui_socketpair, - netbeui_accept, - netbeui_getname, - netbeui_poll, - netbeui_ioctl, - netbeui_listen, - netbeui_shutdown, - sock_no_setsockopt, - sock_no_getsockopt, - sock_no_fcntl, - netbeui_sendmsg, - netbeui_recvmsg -}; - -static struct notifier_block nb_notifier={ - nb_device_event, - NULL, - 0 -}; - -static char nb_snap_id[]={0x08,0x00,0x07,0x80,0x9B}; - -#ifdef CONFIG_PROC_FS -static struct proc_dir_entry proc_netbeui = { - PROC_NET_NETBEUI, 9, "netbeui", - S_IFREG | S_IRUGO, 1, 0, 0 - 0, &proc_net_inode_operations, - netbeui_get_info -}; -#endif - -/* Called by proto.c on kernel start up */ - -__initfunc(void netbeui_proto_init(struct net_proto *pro)) -{ - (void) sock_register(netbeui_proto_ops.family, &netbeui_proto_ops); - if ((nb_dl = register_8022_client(nb_8022_id, netbeui_rcv)) == NULL) - printk(KERN_CRIT "Unable to register Netbeui with 802.2.\n"); - - register_netdevice_notifier(&nb_notifier); - -#ifdef CONFIG_PROC_FS - proc_net_register(&proc_netbeui); -#endif - - printk(KERN_INFO "NetBEUI 0.03 for Linux NET3.037\n"); -} - -#ifdef MODULE -EXPORT_NO_SYMBOLS; - -int init_module(void) -{ - netbeui_proto_init(NULL); - return 0; -} - -void cleanup_module(void) -{ - unsigned long flags; -#ifdef CONFIG_PROC_FS - proc_net_unregister(PROC_NET_NETBEUI); -#endif - unregister_netdevice_notifier(&nb_notifier); - unregister_snap_client(nb_snap_id); - sock_unregister(netbeui_proto_ops.family); -} - -#endif /* MODULE */ diff --git a/net/netbeui/netbeui_llc.c b/net/netbeui/netbeui_llc.c deleted file mode 100644 index 29edc5acf..000000000 --- a/net/netbeui/netbeui_llc.c +++ /dev/null @@ -1,265 +0,0 @@ -/* - * NET3: 802.2 LLC supervisor for the netbeui protocols. - * - * The basic aim is to provide a self managing link layer supervisor - * for netbeui. It creates and destroys the 802.2 virtual connections - * as needed, and copes with the various races when a link goes down - * just as its requested etc. - * - * The upper layers are presented with the notion of an nb_link which - * is a potentially shared object that represents a logical path - * between two hosts. Each nb_link has usage counts and users can - * treat it as if its their own. 
- */ - -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/notifier.h> -#include <linux/netdevice.h> -#include <linux/skbuff.h> -#include <net/datalink.h> -#include <net/p8022.h> -#include <net/psnap.h> -#include <net/sock.h> -#include <net/llc.h> -#include <net/netbeui.h> - - -/* - * When this routine is called the netbeui layer has decided to - * drop the link. There is a tiny risk that we might reuse the - * link after we decide. Thus before we blast the link into little - * tiny pieces we must check.... - */ - -static void netbeui_do_destroy(struct nb_link *nb) -{ - /* - * Are we wanted again. Bring it back. Sigh, wish people - * would make up their minds 8) - */ - if(nb->users>0) - { - nb->state=NETBEUI_CONNWAIT; - llc_connect_request(&nb->llc); - return; - } - /* - * Blam.... into oblivion it goes - */ - - llc_unregister(&nb->llc); - netbeui_free_link(nb); -} - -/* - * Handle netbeui events. Basically that means keep it up when it - * should be up, down when it should be down and handle all the data. - */ - -static void netbeui_event(llcptr llc) -{ - struct nb_link *nb=(struct nb_link *)llc; - - /* - * See what has occured - */ - - - /* - * Connect completion confirmation - */ - - if(llc->llc_callbacks&LLC_CONN_CONFIRM) - { - /* - * Link up if desired. Otherwise try frantically - * to close it. - */ - if(nb->state!=NETBEUI_DEADWAIT) - { - /* - * Wake pending writers - */ - nb->state=NETBEUI_OPEN; - netbeui_wakeup(nb); - } - else - llc_disconnect_request(llc); - } - - /* - * Data is passed to the upper netbeui layer - */ - - if(llc->llc_callbacks&LLC_DATA_INDIC) - { - netbeu_rcv_stream(llc,llc->inc_skb); - /* - * Frame free is controlled by our stream processor - */ - return; - } - - /* - * We got disconnected - */ - - if(llc->llc_callbacks&LLC_DISC_INDICATION) - { - if(nb->state==NETBEUI_DEADWAIT) - { - netbeui_do_destroy(nb); - return; - } - if(nb->state==NETBEUI_DISCWAIT) - { - llc_connect_request(llc); - nb->state=NETBEUI_CONNWAIT; - } - } - - /* - * Miscellaneous burps - */ - - if(llc->llc_callbacks&(LLC_RESET_INDIC_LOC|LLC_RESET_INDIC_REM| - LLC_RST_CONFIRM)) - { - /* - * Reset. - * Q: Is tearing the link down the right answer ? - * - * For now we just carry on - */ - } - - /* - * Track link busy status - */ - - if(llc->llc_callbacks&LLC_REMOTE_BUSY) - nb->busy=1; /* Send no more for a bit */ - if(llc->llc_callbacks&LLC_REMOTE_NOTBUSY) - { - /* Coming unbusy may wake sending threads */ - nb->busy=0; - netbeui_wakeup(nb); - } - /* - * UI frames are passed to the upper netbeui layer. - */ - if(llc->llc_callbacks&LLC_UI_DATA) - { - netbeui_rcv_dgram(llc,llc->inc_skb); - return; - } - - /* We ignore TST, XID, FRMR stuff */ - /* FIXME: We need to free frames here once I fix the callback! */ - if(llc->inc_skb) - kfree_skb(skb); -} - -/* - * Netbeui has created a new logical link. As a result we will - * need to find or create a suitable 802.2 LLC session and join - * it. - */ - -struct nb_link *netbeui_create_channel(struct device *dev, u8 *remote_mac, int pri) -{ - struct nb_link *nb=netbeui_find_channel(dev,remote_mac); - if(nb) - { - if(nb->state==NETBEUI_DEADWAIT) - { - /* - * We had commenced a final shutdown. We - * cannot abort that (we sent the packet) but - * we can shift the mode to DISCWAIT. That will - * cause the disconnect event to bounce us - * back into connected state. 
- */ - nb->state==NETBEUI_DISCWAIT; - } - nb->users++; - return nb; - } - nb=netbeui_alloc_link(pri); - if(nb==NULL) - return NULL; - - /* - * Internal book keeping - */ - - nb->dev=dev; - nb->users=1; - nb->busy=0; - nb->wakeup=NULL; - nb->state=NETBEUI_CONNWAIT; - memcpy(nb->remote_mac, remote_mac, ETH_ALEN); - - /* - * Now try and attach an LLC. - */ - - if(register_cl2llc_client(&nb->llc,dev->name,netbeui_event, - remote_mac, NETBEUI_SAP, NETBEUI_SAP)<0) - { - netbeui_free_link(nb); - return NULL; - } - - /* - * Commence connection establishment. - */ - - llc_connect_request(&nb->llc); - - /* - * Done - */ - - nb->next=nb_link_list; - nb_link_list=nb; - - return nb; -} - -/* - * A logical netbeui channel has died. If the channel has no - * further users we commence shutdown. - */ - -int netbeui_delete_channel(struct nb_link *nb) -{ - nb->users--; - - /* - * FIXME: Must remove ourselves from the nb_link chain when - * we add that bit - */ - - if(nb->users) - return 0; - - /* - * Ensure we drop soon. The disconnect confirm will let - * us fix the deletion. If someone wants the link at - * the wrong moment nothing bad will occur. The create - * or the do_destroy will sort it. - */ - - nb->state = NETBEUI_DEADWAIT; - llc_disconnect_request(lp); - return 0; -} - - diff --git a/net/netbeui/netbeui_name.c b/net/netbeui/netbeui_name.c deleted file mode 100644 index c5a579597..000000000 --- a/net/netbeui/netbeui_name.c +++ /dev/null @@ -1,163 +0,0 @@ -/* - * NetBIOS name handler - */ - -/* - * You must hold the netbios name lock before using these. - */ - -struct nb_name *nb_name_find(struct device *dev,const char * name) -{ - struct nb_name *nb=nb_name_list; - while(nb!=NULL) - { - if((dev==NULL || dev==nb->dev) && - strncmp(name,nb->name, NB_NAME_LEN)==0) - return nb; - nb=nb->next; - } - return NULL; -} - -int nb_name_add(struct device *dev, const char *name, int ours, int pri) -{ - struct nb_name *nb=kmalloc(sizeof(*nb), pri); - if(nb==NULL) - return NULL; - nb->dev=dev; - strncpy(nb->name,name,NB_NAME_LEN); - nb->name[NB_NAME_LEN-1]=0; - nb->next=nb_name_list; - nb->ours=ours; - nb_name_list=nb; -} - -void nb_name_delete(struct nb_name *nb) -{ - struct nb_name *i=&nb_name_list; - while((*i)!=NULL) - { - if(*i==nb) - { - *i=nb->next; - kfree_s(nb,sizeof(*nb)); - return; - } - i=&((*i)->next); - } - printk(KERN_ERR "nb_name_delete: bad name pointer!\n"); -} - -/* - * NETBIOS name handlers - */ - -static void nb_defend(struct device *dev, const char *name) -{ - struct sk_buff *nskb=nb_alloc_skb(NB_CONTROL_LEN, GFP_ATOMIC); - if(nskb==NULL) - return; - /* Build a name defence packet */ - nskb->dev = dev; - nskb->priority = TC_PRIO_CONTROL; - dev_queue_xmit(nskb); -} - -void netbeui_heard_name(struct device *dev, struct sk_buff *skb) -{ - struct nb_name *nb; - name=... - - if((nb=nb_name_find(dev,name))!=NULL) - { - /* - * If we own the name then defend it - */ - if(nb->our && !nb->state==NB_ACQUIRE) - nb_defend(dev,name); - /* - * A name has been resolved. Wake up pending - * connectors. - */ - if(nb->state==NB_QUERY) - { - nb->state=NB_OTHER; - nb_complete(nb,skb); - } - } - kfree_skb(skb); - return 0; -} - -/* - * Handle incoming name defences - */ - -void netbeui_name_defence(struct dev *dev, struct sk_buff *skb) -{ - struct nb_name *name; - name= - - if((nb=nb_name_find(dev,name))!=NULL) - { - if(nb->ours) - { - /* - * We wanted it, we got told its used - */ - if(nb->state==NB_ACQUIRE) - { - /* - * Fill in the record for its true - * owner. 
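The deleted LLC supervisor managed shared nb_link objects by usage count: create reuses a live link and bumps users, delete only begins teardown for the last user, and the DEADWAIT state covers the race where a dying link is wanted again. A stripped-down model of that lifecycle (states and fields invented, the LLC machinery omitted):

    #include <stdio.h>

    enum nb_state { NB_OPEN, NB_DEADWAIT };

    struct nb_link_model { int users; enum nb_state state; };

    static struct nb_link_model *create_channel(struct nb_link_model *link)
    {
        link->users++;               /* share the existing logical link */
        return link;
    }

    static void delete_channel(struct nb_link_model *link)
    {
        if (--link->users > 0)
            return;                  /* still shared: nothing to tear down */
        link->state = NB_DEADWAIT;   /* last user: begin disconnect */
    }

    int main(void)
    {
        struct nb_link_model link = { 0, NB_OPEN };

        create_channel(&link);
        create_channel(&link);       /* two sockets share one LLC session */
        delete_channel(&link);
        printf("users=%d state=%s\n", link.users,
               link.state == NB_OPEN ? "open" : "deadwait");
        delete_channel(&link);
        printf("users=%d state=%s\n", link.users,
               link.state == NB_OPEN ? "open" : "deadwait");
        return 0;
    }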
Set the state first as - * nb_complete may well delete the - * record. - */ - nb->state=NB_OTHER; - nb_complete(nb,skb); - nb_wakeup(); - } - /* - * We own it we got told its used. This is - * a deep cack even that can only occur when - * a bridge comes back and the net was split. - * Make sure both sides lose. - */ - if(nb->state==NB_OURS || nb->state==NB_COLLIDE) - { - nb->state=NR_COLLIDE; - nb_wakeup(); - /* - * Kill the other copy too - */ - nb_defend(dev,name); - /* - * Timer expiry will delete our - * record. - */ - nb_start_timer(nb, NB_TIME_COLLIDED); - } - } - } - kfree_skb(skb); -} - -void netbeui_name_query(struct dev *dev, struct sk_buff *skb) -{ - char *name=... - struct nb_name *nb=nb_find_name(dev,name); - - if(nb!=NULL && nb->ours) - { - struct sk_buff *nskb=nb_alloc_skb(NB_CONTROL_LEN, GFP_ATOMIC); - if(nskb!=NULL) - { - /* Build a name reply packet */ - nskb->dev = dev; - nskb->priority = TC_PRIO_CONTROL; - dev_queue_xmit(nskb); - } - } - kfree_skb(skb); -} - diff --git a/net/netsyms.c b/net/netsyms.c index ad51e9a3e..9ce58d285 100644 --- a/net/netsyms.c +++ b/net/netsyms.c @@ -273,7 +273,6 @@ EXPORT_SYMBOL(tcp_statistics); EXPORT_SYMBOL(tcp_rcv_state_process); EXPORT_SYMBOL(tcp_timewait_state_process); EXPORT_SYMBOL(tcp_do_sendmsg); -EXPORT_SYMBOL(tcp_v4_build_header); EXPORT_SYMBOL(tcp_v4_rebuild_header); EXPORT_SYMBOL(tcp_v4_send_check); EXPORT_SYMBOL(tcp_v4_conn_request); @@ -291,6 +290,9 @@ EXPORT_SYMBOL(tcp_prot); EXPORT_SYMBOL(tcp_openreq_cachep); EXPORT_SYMBOL(ipv4_specific); EXPORT_SYMBOL(tcp_simple_retransmit); +EXPORT_SYMBOL(tcp_transmit_skb); +EXPORT_SYMBOL(tcp_connect); +EXPORT_SYMBOL(tcp_make_synack); EXPORT_SYMBOL(xrlim_allow); @@ -374,6 +376,7 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); /* support for loadable net drivers */ #ifdef CONFIG_NET +EXPORT_SYMBOL(loopback_dev); EXPORT_SYMBOL(register_netdevice); EXPORT_SYMBOL(unregister_netdevice); EXPORT_SYMBOL(register_netdev); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 74fc7af82..858ea0e73 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -262,9 +262,9 @@ static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg, int len, struct scm_cookie *scm) { struct sock *sk = sock->sk; + struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name; struct sk_buff *skb; struct device *dev; - struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name; unsigned short proto=0; int err; @@ -309,6 +309,7 @@ static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg, int len, return -EMSGSIZE; dev_lock_list(); + err = -ENOBUFS; skb = sock_wmalloc(sk, len+dev->hard_header_len+15, 0, GFP_KERNEL); /* @@ -318,10 +319,7 @@ static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg, int len, */ if (skb == NULL) - { - dev_unlock_list(); - return(-ENOBUFS); - } + goto out_unlock; /* * Fill it in @@ -339,36 +337,32 @@ static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg, int len, skb->data -= dev->hard_header_len; skb->tail -= dev->hard_header_len; } + + /* Returns -EFAULT on error */ err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); skb->protocol = proto; skb->dev = dev; skb->priority = sk->priority; - dev_unlock_list(); + if (err) + goto out_free; + + err = -ENETDOWN; + if (!(dev->flags & IFF_UP)) + goto out_free; /* * Now send it */ - if (err) - { - err = -EFAULT; - } - else - { - if (!(dev->flags & IFF_UP)) - { - err = -ENETDOWN; - } - } - - if (err) - { - kfree_skb(skb); - return err; - } - + dev_unlock_list(); 
dev_queue_xmit(skb); return(len); + +out_free: + kfree_skb(skb); +out_unlock: + dev_unlock_list(); + return err; } #endif @@ -434,13 +428,12 @@ static int packet_sendmsg(struct socket *sock, struct msghdr *msg, int len, struct scm_cookie *scm) { struct sock *sk = sock->sk; + struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name; struct sk_buff *skb; struct device *dev; - struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name; unsigned short proto; - int ifindex; - int err; - int reserve = 0; + unsigned char *addr; + int ifindex, err, reserve = 0; /* * Check the flags. @@ -454,13 +447,15 @@ static int packet_sendmsg(struct socket *sock, struct msghdr *msg, int len, */ if (saddr == NULL) { - ifindex = sk->protinfo.af_packet->ifindex; - proto = sk->num; + ifindex = sk->protinfo.af_packet->ifindex; + proto = sk->num; + addr = NULL; } else { if (msg->msg_namelen < sizeof(struct sockaddr_ll)) return -EINVAL; - ifindex = saddr->sll_ifindex; - proto = saddr->sll_protocol; + ifindex = saddr->sll_ifindex; + proto = saddr->sll_protocol; + addr = saddr->sll_addr; } dev = dev_get_by_index(ifindex); @@ -474,55 +469,50 @@ static int packet_sendmsg(struct socket *sock, struct msghdr *msg, int len, dev_lock_list(); - skb = sock_alloc_send_skb(sk, len+dev->hard_header_len+15, 0, msg->msg_flags&MSG_DONTWAIT, &err); - - if (skb==NULL) { - dev_unlock_list(); - return err; - } + skb = sock_alloc_send_skb(sk, len+dev->hard_header_len+15, 0, + msg->msg_flags & MSG_DONTWAIT, &err); + if (skb==NULL) + goto out_unlock; skb_reserve(skb, (dev->hard_header_len+15)&~15); skb->nh.raw = skb->data; if (dev->hard_header) { - if (dev->hard_header(skb, dev, ntohs(proto), - saddr ? saddr->sll_addr : NULL, - NULL, len) < 0 - && sock->type == SOCK_DGRAM) { - kfree_skb(skb); - dev_unlock_list(); - return -EINVAL; - } + int res; + err = -EINVAL; + res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len); if (sock->type != SOCK_DGRAM) { skb->tail = skb->data; skb->len = 0; - } + } else if (res < 0) + goto out_free; } + /* Returns -EFAULT on error */ err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); skb->protocol = proto; skb->dev = dev; skb->priority = sk->priority; - dev_unlock_list(); + if (err) + goto out_free; + + err = -ENETDOWN; + if (!(dev->flags & IFF_UP)) + goto out_free; /* * Now send it */ - if (err) { - err = -EFAULT; - } else { - if (!(dev->flags & IFF_UP)) - err = -ENETDOWN; - } - - if (err) { - kfree_skb(skb); - return err; - } - + dev_unlock_list(); dev_queue_xmit(skb); return(len); + +out_free: + kfree_skb(skb); +out_unlock: + dev_unlock_list(); + return err; } static void packet_destroy_timer(unsigned long data) @@ -699,6 +689,7 @@ static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len static int packet_create(struct socket *sock, int protocol) { struct sock *sk; + int err; if (!suser()) return -EPERM; @@ -711,27 +702,23 @@ static int packet_create(struct socket *sock, int protocol) sock->state = SS_UNCONNECTED; MOD_INC_USE_COUNT; + + err = -ENOBUFS; sk = sk_alloc(AF_PACKET, GFP_KERNEL, 1); - if (sk == NULL) { - MOD_DEC_USE_COUNT; - return -ENOBUFS; - } + if (sk == NULL) + goto out; sk->reuse = 1; + sock->ops = &packet_ops; #ifdef CONFIG_SOCK_PACKET if (sock->type == SOCK_PACKET) sock->ops = &packet_ops_spkt; - else #endif - sock->ops = &packet_ops; sock_init_data(sock,sk); sk->protinfo.af_packet = kmalloc(sizeof(struct packet_opt), GFP_KERNEL); - if (sk->protinfo.af_packet == NULL) { - sk_free(sk); - MOD_DEC_USE_COUNT; - return -ENOBUFS; - } + if 
(sk->protinfo.af_packet == NULL) + goto out_free; memset(sk->protinfo.af_packet, 0, sizeof(struct packet_opt)); sk->zapped=0; sk->family = AF_PACKET; @@ -741,13 +728,11 @@ static int packet_create(struct socket *sock, int protocol) * Attach a protocol block */ + sk->protinfo.af_packet->prot_hook.func = packet_rcv; #ifdef CONFIG_SOCK_PACKET if (sock->type == SOCK_PACKET) sk->protinfo.af_packet->prot_hook.func = packet_rcv_spkt; - else #endif - sk->protinfo.af_packet->prot_hook.func = packet_rcv; - sk->protinfo.af_packet->prot_hook.data = (void *)sk; if (protocol) { @@ -758,6 +743,12 @@ static int packet_create(struct socket *sock, int protocol) sklist_insert_socket(&packet_sklist, sk); return(0); + +out_free: + sk_free(sk); +out: + MOD_DEC_USE_COUNT; + return err; } /* @@ -832,10 +823,8 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, int len, /* We can't use skb_copy_datagram here */ err = memcpy_toiovec(msg->msg_iov, skb->data, copied); - if (err) { - err = -EFAULT; + if (err) goto out_free; - } sk->stamp=skb->stamp; if (msg->msg_name) @@ -932,37 +921,39 @@ static void packet_dev_mclist(struct device *dev, struct packet_mclist *i, int w static int packet_mc_add(struct sock *sk, struct packet_mreq *mreq) { - int err; struct packet_mclist *ml, *i; struct device *dev; + int err; rtnl_shlock(); - dev = dev_get_by_index(mreq->mr_ifindex); - - i = NULL; err = -ENODEV; + dev = dev_get_by_index(mreq->mr_ifindex); if (!dev) goto done; + err = -EINVAL; if (mreq->mr_alen > dev->addr_len) goto done; + err = -ENOBUFS; i = (struct packet_mclist *)kmalloc(sizeof(*i), GFP_KERNEL); + if (i == NULL) + goto done; + err = 0; for (ml=sk->protinfo.af_packet->mclist; ml; ml=ml->next) { if (ml->ifindex == mreq->mr_ifindex && ml->type == mreq->mr_type && ml->alen == mreq->mr_alen && memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { ml->count++; - err = 0; + /* Free the new element ... 
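packet_create() above, like both packet sendmsg paths before it, is reshaped into the kernel's standard error unwinding: set err before each attempt, jump to the label that releases exactly what has been acquired so far, and let the labels fall through in reverse order of acquisition. The same shape in a runnable miniature (resources faked with malloc, the forced-failure flag is purely for demonstration):

    #include <stdio.h>
    #include <stdlib.h>

    static int create_thing(int force_failure)
    {
        int err = -1;                     /* stands in for -ENOBUFS */
        char *a, *b;

        a = malloc(16);
        if (a == NULL)
            goto out;
        b = malloc(16);
        if (b == NULL || force_failure)
            goto out_free;

        free(b);                          /* demo only: a real create hands these off */
        free(a);
        return 0;

    out_free:
        free(b);                          /* free(NULL) is a harmless no-op */
        free(a);
    out:
        return err;
    }

    int main(void)
    {
        printf("normal: %d\n", create_thing(0));
        printf("forced failure: %d\n", create_thing(1));
        return 0;
    }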
@@ -832,10 +823,8 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, int len,
 	/* We can't use skb_copy_datagram here */
 	err = memcpy_toiovec(msg->msg_iov, skb->data, copied);
-	if (err) {
-		err = -EFAULT;
+	if (err)
 		goto out_free;
-	}
 	sk->stamp=skb->stamp;
 
 	if (msg->msg_name)
@@ -932,37 +921,39 @@ static void packet_dev_mclist(struct device *dev, struct packet_mclist *i, int w
 
 static int packet_mc_add(struct sock *sk, struct packet_mreq *mreq)
 {
-	int err;
 	struct packet_mclist *ml, *i;
 	struct device *dev;
+	int err;
 
 	rtnl_shlock();
 
-	dev = dev_get_by_index(mreq->mr_ifindex);
-
-	i = NULL;
 	err = -ENODEV;
+	dev = dev_get_by_index(mreq->mr_ifindex);
 	if (!dev)
 		goto done;
 
+	err = -EINVAL;
 	if (mreq->mr_alen > dev->addr_len)
 		goto done;
 
+	err = -ENOBUFS;
 	i = (struct packet_mclist *)kmalloc(sizeof(*i), GFP_KERNEL);
+	if (i == NULL)
+		goto done;
 
+	err = 0;
 	for (ml=sk->protinfo.af_packet->mclist; ml; ml=ml->next) {
 		if (ml->ifindex == mreq->mr_ifindex &&
 		    ml->type == mreq->mr_type &&
 		    ml->alen == mreq->mr_alen &&
 		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
 			ml->count++;
-			err = 0;
+			/* Free the new element ... */
+			kfree(i);
 			goto done;
 		}
 	}
 
-	err = -ENOBUFS;
-	if (i == NULL)
-		goto done;
+
 	i->type = mreq->mr_type;
 	i->ifindex = mreq->mr_ifindex;
 	i->alen = mreq->mr_alen;
@@ -971,13 +962,9 @@ static int packet_mc_add(struct sock *sk, struct packet_mreq *mreq)
 	i->next = sk->protinfo.af_packet->mclist;
 	sk->protinfo.af_packet->mclist = i;
 	packet_dev_mc(dev, i, +1);
-	i = NULL;
-	err = 0;
 
 done:
 	rtnl_shunlock();
-	if (i)
-		kfree(i);
 	return err;
 }
@@ -1109,13 +1096,12 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg
 		case FIOGETOWN:
 		case SIOCGPGRP:
 			return put_user(sk->proc, (int *)arg);
-			return(0);
 		case SIOCGSTAMP:
 			if(sk->stamp.tv_sec==0)
 				return -ENOENT;
-			err = copy_to_user((void *)arg,&sk->stamp,sizeof(struct timeval));
-			if (err)
-				err = -EFAULT;
+			err = -EFAULT;
+			if (!copy_to_user((void *)arg, &sk->stamp, sizeof(struct timeval)))
+				err = 0;
 			return err;
 		case SIOCGIFFLAGS:
#ifndef CONFIG_INET
diff --git a/net/socket.c b/net/socket.c
index dc77ef3e8..6220cff45 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -646,15 +646,17 @@ asmlinkage int sys_socket(int family, int type, int protocol)
 		goto out;
 
 	retval = get_fd(sock->inode);
-	if (retval < 0) {
-		sock_release(sock);
-		goto out;
-	}
+	if (retval < 0)
+		goto out_release;
+	sock->file = fcheck(retval);
 
-	sock->file = current->files->fd[retval];
 out:
 	unlock_kernel();
 	return retval;
+
+out_release:
+	sock_release(sock);
+	goto out;
 }
 
 /*
@@ -787,9 +789,8 @@ asmlinkage int sys_accept(int fd, struct sockaddr *upeer_sockaddr, int *upeer_ad
 {
 	struct inode *inode;
 	struct socket *sock, *newsock;
-	int err;
+	int err, len;
 	char address[MAX_SOCK_ADDR];
-	int len;
 
 	lock_kernel();
 	sock = sockfd_lookup(fd, &err);
@@ -815,7 +816,7 @@ restart:
 	if ((err = get_fd(inode)) < 0)
 		goto out_release;
-	newsock->file = current->files->fd[err];
+	newsock->file = fcheck(err);
 
 	if (upeer_sockaddr)
 	{
@@ -1141,19 +1142,21 @@ asmlinkage int sys_sendmsg(int fd, struct msghdr *msg, unsigned flags)
 	char address[MAX_SOCK_ADDR];
 	struct iovec iov[UIO_FASTIOV];
 	unsigned char ctl[sizeof(struct cmsghdr) + 20];	/* 20 is size of ipv6_pktinfo */
-	struct msghdr msg_sys;
-	int err= -EINVAL;
-	int total_len;
 	unsigned char *ctl_buf = ctl;
+	struct msghdr msg_sys;
+	int err, total_len;
 
 	lock_kernel();
 
-	err=-EFAULT;
+	err = -EFAULT;
 	if (copy_from_user(&msg_sys,msg,sizeof(struct msghdr)))
 		goto out;
+
 	/* do not move before msg_sys is valid */
-	if (msg_sys.msg_iovlen>UIO_MAXIOV)
+	err = -EINVAL;
+	if (msg_sys.msg_iovlen > UIO_MAXIOV)
 		goto out;
+
 	/* This will also move the address data into kernel space */
 	err = verify_iovec(&msg_sys, iov, address, VERIFY_READ);
 	if (err < 0)
@@ -1163,7 +1166,7 @@ asmlinkage int sys_sendmsg(int fd, struct msghdr *msg, unsigned flags)
 
 	sock = sockfd_lookup(fd, &err);
 	if (!sock)
-		goto out;
+		goto out_freeiov;
 
 	if (msg_sys.msg_controllen)
 	{
@@ -1197,9 +1200,10 @@ failed:
 	if (ctl_buf != ctl)
 		sock_kfree_s(sock->sk, ctl_buf, msg_sys.msg_controllen);
 failed2:
+	sockfd_put(sock);
+out_freeiov:
 	if (msg_sys.msg_iov != iov)
 		kfree(msg_sys.msg_iov);
-	sockfd_put(sock);
 out:
 	unlock_kernel();
 	return err;
@@ -1228,16 +1232,13 @@ asmlinkage int sys_recvmsg(int fd, struct msghdr *msg, unsigned int flags)
 	int *uaddr_len;
 
 	lock_kernel();
+	err=-EFAULT;
 	if (copy_from_user(&msg_sys,msg,sizeof(struct msghdr)))
-	{
-		err=-EFAULT;
 		goto out;
-	}
-	if (msg_sys.msg_iovlen>UIO_MAXIOV)
-	{
-		err=-EINVAL;
+
+	err=-EINVAL;
+	if (msg_sys.msg_iovlen > UIO_MAXIOV)
 		goto out;
-	}
 
 	/*
 	 *	Save the user-mode address (verify_iovec will change the
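Several hunks above (SIOCGSTAMP, sys_sendmsg, sys_recvmsg) lean on one fact: copy_to_user()/copy_from_user() return the number of bytes left uncopied, not an errno, so the caller presets -EFAULT and clears it only on full success. A userspace sketch of the idiom, with the copy routine mocked (the mock and struct are illustrative, not the kernel helpers):

#include <stdio.h>
#include <string.h>

struct stamp { long tv_sec, tv_usec; };

/* Mock: returns bytes NOT copied; any non-zero result means a fault. */
static unsigned long copy_to_user_mock(void *to, const void *from,
				       unsigned long n)
{
	if (to == NULL)
		return n;		/* pretend the whole range faulted */
	memcpy(to, from, n);
	return 0;			/* 0 bytes left uncopied: success */
}

static int get_stamp(void *arg, const struct stamp *st)
{
	int err = -14;			/* -EFAULT, preset */

	if (!copy_to_user_mock(arg, st, sizeof(*st)))
		err = 0;		/* cleared only if every byte copied */
	return err;
}

int main(void)
{
	struct stamp st = { 1, 0 }, user;

	printf("%d\n", get_stamp(&user, &st));	/* 0 */
	printf("%d\n", get_stamp(NULL, &st));	/* -14 */
	return 0;
}

Treating the raw return value as an errno, as the old SIOCGSTAMP code almost did, would hand userspace a positive byte count instead of -EFAULT.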
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 761bfd242..0c4cc7f5a 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -14,11 +14,12 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/auth.h>
 
+#define NFS_NGROUPS	16
 struct unx_cred {
 	struct rpc_cred		uc_base;
 	uid_t			uc_fsuid;
 	gid_t			uc_gid, uc_fsgid;
-	gid_t			uc_gids[16];
+	gid_t			uc_gids[NFS_NGROUPS];
 };
 #define uc_uid			uc_base.cr_uid
 #define uc_count		uc_base.cr_count
@@ -84,12 +85,18 @@ unx_create_cred(struct rpc_task *task)
 		cred->uc_gid = cred->uc_fsgid = 0;
 		cred->uc_gids[0] = NOGROUP;
 	} else {
+		int groups = current->ngroups;
+		if (groups > NFS_NGROUPS)
+			groups = NFS_NGROUPS;
+
 		cred->uc_uid = current->uid;
 		cred->uc_gid = current->gid;
 		cred->uc_fsuid = current->fsuid;
 		cred->uc_fsgid = current->fsgid;
-		for (i = 0; i < 16 && i < NGROUPS; i++)
+		for (i = 0; i < groups; i++)
 			cred->uc_gids[i] = (gid_t) current->groups[i];
+		if (i < NFS_NGROUPS)
+			cred->uc_gids[i] = NOGROUP;
 	}
 
 	return (struct rpc_cred *) cred;
@@ -135,13 +142,18 @@ unx_match(struct rpc_task * task, struct rpc_cred *rcred)
 	int		i;
 
 	if (!RPC_DO_ROOTOVERRIDE(task)) {
+		int groups;
+
 		if (cred->uc_uid != current->uid
 		 || cred->uc_gid != current->gid
 		 || cred->uc_fsuid != current->fsuid
 		 || cred->uc_fsgid != current->fsgid)
 			return 0;
 
-		for (i = 0; i < 16 && i < NGROUPS; i++)
+		groups = current->ngroups;
+		if (groups > NFS_NGROUPS)
+			groups = NFS_NGROUPS;
+		for (i = 0; i < groups ; i++)
 			if (cred->uc_gids[i] != (gid_t) current->groups[i])
 				return 0;
 		return 1;
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index cec276857..47d1104dc 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -866,8 +866,7 @@ if (svsk->sk_sk == NULL)
 
 	/* Register socket with portmapper */
 	if (*errp >= 0 && pmap_register)
-		*errp = svc_register(serv, inet->protocol,
-				     ntohs(inet->dummy_th.source));
+		*errp = svc_register(serv, inet->protocol, ntohs(inet->sport));
 
 	if (*errp < 0) {
 		inet->user_data = NULL;
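The auth_unix.c hunks clamp the task's supplementary groups to the 16-entry gid array that AUTH_UNIX credentials carry on the wire, and terminate short lists with NOGROUP instead of reading past current->ngroups. A standalone sketch of the clamp; plain arrays stand in for current->groups/current->ngroups, and (unsigned int)-1 for NOGROUP:

#include <stdio.h>

#define NFS_NGROUPS	16
#define NOGROUP		((unsigned int)-1)

static void fill_gids(unsigned int *uc_gids,
		      const unsigned int *groups, int ngroups)
{
	int i, n = ngroups;

	if (n > NFS_NGROUPS)		/* never overrun uc_gids[] */
		n = NFS_NGROUPS;
	for (i = 0; i < n; i++)
		uc_gids[i] = groups[i];
	if (i < NFS_NGROUPS)		/* mark the end of a short list */
		uc_gids[i] = NOGROUP;
}

int main(void)
{
	unsigned int src[20], dst[NFS_NGROUPS];
	int i;

	for (i = 0; i < 20; i++)
		src[i] = 100 + i;
	fill_gids(dst, src, 20);	/* 20 groups: silently clamped to 16 */
	printf("last gid kept: %u\n", dst[NFS_NGROUPS - 1]);
	return 0;
}

Note that unx_match() applies the identical clamp, so a credential built from a truncated list still compares equal to the task that created it.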
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index b04072d80..624cbb8d8 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -687,17 +687,19 @@ static int unix_stream_connect1(struct socket *sock, struct msghdr *msg,
 	skb=sock_alloc_send_skb(sk, len, 0, nonblock, &err); /* Marker object */
 	if(skb==NULL)
-		return err;
+		goto out;
 	memcpy(&UNIXCB(skb), cmsg, sizeof(*cmsg));
-	if (len)
-		memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
+	if (len) {
+		err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov,
+				       len);
+		if (err)
+			goto out_free;
+	}
+
 	sk->state=TCP_CLOSE;
 	other=unix_find_other(sunaddr, addr_len, sk->type, hash, &err);
 	if(other==NULL)
-	{
-		kfree_skb(skb);
-		return err;
-	}
+		goto out_free;
 	other->ack_backlog++;
 	unix_peer(sk)=other;
 	skb_queue_tail(&other->receive_queue,skb);
@@ -738,6 +740,11 @@ static int unix_stream_connect1(struct socket *sock, struct msghdr *msg,
 	if (!sk->protinfo.af_unix.addr)
 		unix_autobind(sock);
 	return 0;
+
+out_free:
+	kfree_skb(skb);
+out:
+	return err;
 }
@@ -908,8 +915,8 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, int len,
 			      struct scm_cookie *scm)
 {
 	struct sock *sk = sock->sk;
-	unix_socket *other;
 	struct sockaddr_un *sunaddr=msg->msg_name;
+	unix_socket *other;
 	int namelen = 0; /* fake GCC */
 	int err;
 	unsigned hash;
@@ -918,7 +925,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, int len,
 	if (msg->msg_flags&MSG_OOB)
 		return -EOPNOTSUPP;
 
-	if (msg->msg_flags&~MSG_DONTWAIT)
+	if (msg->msg_flags&~(MSG_DONTWAIT|MSG_NOSIGNAL))
 		return -EINVAL;
 
 	if (msg->msg_namelen) {
@@ -935,9 +942,8 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, int len,
 		unix_autobind(sock);
 
 	skb = sock_alloc_send_skb(sk, len, 0, msg->msg_flags&MSG_DONTWAIT, &err);
-
-	if (skb==NULL)
-		return err;
+	if (skb==NULL)
+		goto out;
 
 	memcpy(UNIXCREDS(skb), &scm->creds, sizeof(struct ucred));
 	UNIXCB(skb).attr = msg->msg_flags;
@@ -945,7 +951,9 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, int len,
 		unix_attach_fds(scm, skb);
 
 	skb->h.raw = skb->data;
-	memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
+	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
+	if (err)
+		goto out_free;
 
 	other = unix_peer(sk);
 	if (other && other->dead)
@@ -957,26 +965,18 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, int len,
 		unix_unlock(other);
 		unix_peer(sk)=NULL;
 		other = NULL;
-		if (sunaddr == NULL) {
-			kfree_skb(skb);
-			return -ECONNRESET;
-		}
+		err = -ECONNRESET;
+		if (sunaddr == NULL)
+			goto out_free;
 	}
 
 	if (!other) {
 		other = unix_find_other(sunaddr, namelen, sk->type, hash, &err);
 		if (other==NULL)
-		{
-			kfree_skb(skb);
-			return err;
-		}
+			goto out_free;
+
+		err = -EINVAL;
 		if (!unix_may_send(sk, other))
-		{
-			unix_unlock(other);
-			kfree_skb(skb);
-			return -EINVAL;
-		}
+			goto out_unlock;
 	}
 
 	skb_queue_tail(&other->receive_queue, skb);
@@ -985,6 +985,13 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, int len,
 	if (!unix_peer(sk))
 		unix_unlock(other);
 	return len;
+
+out_unlock:
+	unix_unlock(other);
+out_free:
+	kfree_skb(skb);
+out:
+	return err;
 }
@@ -1005,7 +1012,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len,
 	if (msg->msg_flags&MSG_OOB)
 		return -EOPNOTSUPP;
 
-	if (msg->msg_flags&~MSG_DONTWAIT)
+	if (msg->msg_flags&~(MSG_DONTWAIT|MSG_NOSIGNAL))
 		return -EINVAL;
 
 	if (msg->msg_namelen) {
@@ -1020,7 +1027,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len,
 	}
 
 	if (sk->shutdown&SEND_SHUTDOWN) {
-		send_sig(SIGPIPE,current,0);
+		if (!(msg->msg_flags&MSG_NOSIGNAL))
+			send_sig(SIGPIPE,current,0);
 		return -EPIPE;
 	}
@@ -1085,7 +1093,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len,
 			kfree_skb(skb);
 			if(sent)
 				goto out;
-			send_sig(SIGPIPE,current,0);
+			if (!(msg->msg_flags&MSG_NOSIGNAL))
+				send_sig(SIGPIPE,current,0);
 			return -EPIPE;
 		}
@@ -1265,9 +1274,7 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, int size
 		}
 
 		chunk = min(skb->len, size);
-		/* N.B. This could fail with a non-zero value (which means -EFAULT
-		 * and the non-zero value is the number of bytes not copied).
-		 */
+		/* N.B. This could fail with -EFAULT */
 		memcpy_toiovec(msg->msg_iov, skb->data, chunk);
 		copied += chunk;
 		size -= chunk;
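The af_unix.c hunks teach both sendmsg paths about MSG_NOSIGNAL: with the flag set, a write to a shut-down peer returns -EPIPE without raising SIGPIPE. A small userspace demonstration of the flag (error handling trimmed):

#include <errno.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int sv[2];
	char c = 'x';

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0)
		return 1;
	close(sv[1]);			/* peer gone: the stream is broken */

	/* Without MSG_NOSIGNAL this send() would deliver SIGPIPE. */
	if (send(sv[0], &c, 1, MSG_NOSIGNAL) < 0 && errno == EPIPE)
		printf("EPIPE returned, no SIGPIPE raised\n");

	close(sv[0]);
	return 0;
}

This is handy for servers that would rather see the error code on the failing descriptor than install a process-wide SIGPIPE handler.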