| author | Ralf Baechle <ralf@linux-mips.org> | 1998-03-18 17:17:51 +0000 |
| committer | Ralf Baechle <ralf@linux-mips.org> | 1998-03-18 17:17:51 +0000 |
| commit | f1382dc4850bb459d24a81c6cb0ef93ea7bd4a79 (patch) | |
| tree | 225271a3d5dcd4e9dea5ee393556abd754c964b1 /net | |
| parent | 135b00fc2e90e605ac2a96b20b0ebd93851a3f89 (diff) | |
o Merge with Linux 2.1.90.
o Divide L1 cache sizes by 1024 before printing; this makes the numbers a
  bit more credible ...
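The cache-size change itself lands outside this net-only diffstat, presumably in arch setup code. As a rough, self-contained sketch of the idea only — the function and variable names below are illustrative assumptions, not the actual kernel routine — dividing the byte counts by 1024 before printing looks like this:

```c
#include <stdio.h>

/*
 * Illustrative sketch: cache sizes are tracked in bytes, so divide by
 * 1024 (here via >> 10) before printing to report them in KiB.
 * Names and the message format are assumptions, not real kernel code.
 */
static void report_l1_caches(unsigned long icache_bytes,
                             unsigned long dcache_bytes)
{
	printf("Primary instruction cache %lukb, data cache %lukb\n",
	       icache_bytes >> 10, dcache_bytes >> 10);
}

int main(void)
{
	report_l1_caches(32 * 1024, 32 * 1024);	/* prints 32kb / 32kb */
	return 0;
}
```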
Diffstat (limited to 'net')
72 files changed, 2278 insertions, 2110 deletions
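One change that recurs across the protocol families in the diff below is the extra argument to sk_alloc(): it now takes a zero_it flag and only clears the new struct sock when that flag is set, with the callers here updated to pass 1. A minimal userspace sketch of the new calling convention — using malloc() as a stand-in for the kernel's kmem_cache_alloc() and a drastically shortened struct sock, so purely illustrative — is:

```c
#include <stdlib.h>
#include <string.h>

/* Drastically shortened stand-in for the kernel's struct sock. */
struct sock {
	int family;
	/* ... many more fields in the real kernel ... */
};

/*
 * Sketch of the 2.1.90-era sk_alloc(): the new third argument, zero_it,
 * controls whether the freshly allocated sock is zeroed.  malloc()
 * replaces kmem_cache_alloc(sk_cachep, priority) here.
 */
static struct sock *sk_alloc(int family, int priority, int zero_it)
{
	struct sock *sk = malloc(sizeof(*sk));

	(void)priority;		/* GFP_KERNEL / GFP_ATOMIC in the kernel */
	if (sk && zero_it) {
		memset(sk, 0, sizeof(*sk));
		sk->family = family;
	}
	return sk;
}

int main(void)
{
	/* Callers in this diff pass 1 to keep the old zeroing behaviour. */
	struct sock *sk = sk_alloc(2 /* AF_INET */, 0, 1);

	free(sk);
	return sk == NULL;
}
```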
diff --git a/net/802/sysctl_net_802.c b/net/802/sysctl_net_802.c index f97141d3c..19cd47af5 100644 --- a/net/802/sysctl_net_802.c +++ b/net/802/sysctl_net_802.c @@ -23,5 +23,6 @@ extern int sysctl_tr_rif_timeout; ctl_table tr_table[] = { {NET_TR_RIF_TIMEOUT, "rif_timeout", &sysctl_tr_rif_timeout, sizeof(int), 0644, NULL, &proc_dointvec}, + {0} }; #endif diff --git a/net/802/tr.c b/net/802/tr.c index bf6cd83d7..3550b81ed 100644 --- a/net/802/tr.c +++ b/net/802/tr.c @@ -510,10 +510,18 @@ int rif_get_info(char *buffer,char **start, off_t offset, int length, int dummy) * Called during bootup. We don't actually have to initialise * too much for this. */ - + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry tr_rif_proc = { + PROC_NET_TR_RIF, 6, "tr_rif", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + rif_get_info +}; +#endif + __initfunc(void rif_init(struct net_proto *unused)) { - rif_timer.expires = RIF_TIMEOUT; rif_timer.data = 0L; rif_timer.function = rif_check_expire; @@ -521,11 +529,6 @@ __initfunc(void rif_init(struct net_proto *unused)) add_timer(&rif_timer); #ifdef CONFIG_PROC_FS - proc_net_register(&(struct proc_dir_entry) { - PROC_NET_TR_RIF, 6, "tr_rif", - S_IFREG | S_IRUGO, 1, 0, 0, - 0, &proc_net_inode_operations, - rif_get_info - }); + proc_net_register(&tr_rif_proc); #endif } diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 8b724361d..c56adc148 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -990,7 +990,7 @@ static int atalk_create(struct socket *sock, int protocol) { struct sock *sk; - sk = sk_alloc(AF_APPLETALK, GFP_KERNEL); + sk = sk_alloc(AF_APPLETALK, GFP_KERNEL, 1); if(sk == NULL) return (-ENOMEM); @@ -1404,6 +1404,31 @@ static int atalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type return (0); } +#if defined(CONFIG_IPDDP) || defined(CONFIG_IPDDP_MODULE) + /* + * Check if IP-over-DDP + */ + if(skb->data[12] == 22) + { + struct device *dev; + + /* This needs to be able to handle ipddp"N" devices */ + if((dev = dev_get("ipddp0")) == NULL) + return (-ENODEV); + + skb->protocol = htons(ETH_P_IP); + skb_pull(skb, 13); + skb->dev = dev; + skb->h.raw = skb->data; + + ((struct net_device_stats *)dev->priv)->rx_packets++; + ((struct net_device_stats *)dev->priv)->rx_bytes += skb->len+13; + netif_rx(skb); /* Send the SKB up to a higher place. */ + + return (0); + } +#endif + /* * Which socket - atalk_search_socket() looks for a *full match* * of the <net,node,port> tuple. @@ -1420,38 +1445,6 @@ static int atalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type return (0); } -#ifdef CONFIG_IPDDP - /* - * Check if IP-over-DDP - */ - if(skb->data[12] == 22) - { - struct device *dev; - struct net_device_stats *estats; - - if((dev = dev_get("ipddp0")) == NULL) - return (-ENODEV); - - estats = (struct net_device_stats *) dev->priv; - skb->protocol = htons(ETH_P_IP); - skb_pull(skb, 13); - skb->dev = dev; - skb->h.raw = skb->data; - skb->nh.raw = skb->data; - - /* printk("passing up ipddp, 0x%02x better be 45\n",skb->data[0]); - * printk("tot_len %d, skb->len %d\n", - * ntohs(skb->h.iph->tot_len),skb->len); - */ - - estats->rx_packets++; - estats->rx_bytes += skb->len + 13; - netif_rx(skb); /* Send the SKB up to a higher place. 
*/ - - return (0); - } -#endif /* CONFIG_IPDDP */ - /* * Queue packet (standard) */ diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 3a4196b3f..107f481d6 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -828,7 +828,7 @@ int ax25_create(struct socket *sock, int protocol) return -ESOCKTNOSUPPORT; } - if ((sk = sk_alloc(AF_AX25, GFP_ATOMIC)) == NULL) + if ((sk = sk_alloc(AF_AX25, GFP_ATOMIC, 1)) == NULL) return -ENOMEM; if ((ax25 = ax25_create_cb()) == NULL) { @@ -854,7 +854,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev) struct sock *sk; ax25_cb *ax25; - if ((sk = sk_alloc(AF_AX25, GFP_ATOMIC)) == NULL) + if ((sk = sk_alloc(AF_AX25, GFP_ATOMIC, 1)) == NULL) return NULL; if ((ax25 = ax25_create_cb()) == NULL) { @@ -1237,6 +1237,8 @@ static int ax25_accept(struct socket *sock, struct socket *newsock, int flags) newsk = skb->sk; newsk->pair = NULL; + newsk->socket = newsock; + newsk->sleep = &newsock->wait; sti(); /* Now attach up the new socket */ diff --git a/net/core/dev.c b/net/core/dev.c index b06d0053e..36efa363b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -50,6 +50,7 @@ * is no device open function. * Andi Kleen : Fix error reporting for SIOCGIFCONF * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF + * Cyrus Durgin : Cleaned for KMOD * */ @@ -81,7 +82,7 @@ #include <net/pkt_sched.h> #include <net/profile.h> #include <linux/init.h> -#include <linux/kerneld.h> +#include <linux/kmod.h> #ifdef CONFIG_NET_RADIO #include <linux/wireless.h> #endif /* CONFIG_NET_RADIO */ @@ -316,7 +317,7 @@ struct device *dev_alloc(const char *name, int *err) * Find and possibly load an interface. */ -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD void dev_load(const char *name) { @@ -398,20 +399,24 @@ int dev_open(struct device *dev) } #ifdef CONFIG_NET_FASTROUTE -void dev_clear_fastroute(struct device *dev) + +static __inline__ void dev_do_clear_fastroute(struct device *dev) { - int i; + if (dev->accept_fastpath) { + int i; - if (dev) { for (i=0; i<=NETDEV_FASTROUTE_HMASK; i++) dst_release(xchg(dev->fastpath+i, NULL)); + } +} + +void dev_clear_fastroute(struct device *dev) +{ + if (dev) { + dev_do_clear_fastroute(dev); } else { - for (dev = dev_base; dev; dev = dev->next) { - if (dev->accept_fastpath) { - for (i=0; i<=NETDEV_FASTROUTE_HMASK; i++) - dst_release(xchg(dev->fastpath+i, NULL)); - } - } + for (dev = dev_base; dev; dev = dev->next) + dev_do_clear_fastroute(dev); } } #endif @@ -643,7 +648,7 @@ int netdev_register_fc(struct device *dev, void (*stimul)(struct device *dev)) set_bit(bit, &netdev_fc_mask); clear_bit(bit, &netdev_fc_xoff); } - sti(); + restore_flags(flags); return bit; } @@ -659,7 +664,7 @@ void netdev_unregister_fc(int bit) clear_bit(bit, &netdev_fc_mask); clear_bit(bit, &netdev_fc_xoff); } - sti(); + restore_flags(flags); } static void netdev_wakeup(void) @@ -978,39 +983,6 @@ int register_gifconf(unsigned int family, gifconf_func_t * gifconf) /* - This ioctl is wrong by design. It really existed in some - old SYSV systems, only was named SIOCGIFNUM. - In multiprotocol environment it is just useless. - Well, SIOCGIFCONF is wrong too, but we have to preserve - it by compatibility reasons. - - If someone wants to achieve the same effect, please, use undocumented - feature of SIOCGIFCONF: it returns buffer length, if buffer - is not supplied. - - Let's remove it, until someone started to use it. --ANK - - In any case, if someone cannot live without it, it should - be renamed to SIOCGIFNUM. 
- */ - - -/* - * Count the installed interfaces (SIOCGIFCOUNT) - */ - -static int dev_ifcount(unsigned int *arg) -{ - struct device *dev; - unsigned int count = 0; - - for (dev = dev_base; dev != NULL; dev = dev->next) - count++; - - return put_user(count, arg); -} - -/* * Map an interface index to its name (SIOCGIFNAME) */ @@ -1022,6 +994,11 @@ static int dev_ifcount(unsigned int *arg) * Besides that, it is pretty silly to put "drawing" facility * to kernel, it is useful only to print ifindices * in readable form, is not it? --ANK + * + * We need this ioctl for efficient implementation of the + * if_indextoname() function required by the IPv6 API. Without + * it, we would have to search all the interfaces to find a + * match. --pb */ static int dev_ifname(struct ifreq *arg) @@ -1120,20 +1097,21 @@ static int sprintf_stats(char *buffer, struct device *dev) int size; if (stats) - size = sprintf(buffer, "%6s:%8lu %7lu %4lu %4lu %4lu %4lu %8lu %8lu %4lu %4lu %4lu %5lu %4lu %4lu\n", - dev->name, + size = sprintf(buffer, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu %8lu %8lu %4lu %4lu %4lu %5lu %4lu %4lu\n", + dev->name, stats->rx_bytes, stats->rx_packets, stats->rx_errors, stats->rx_dropped + stats->rx_missed_errors, stats->rx_fifo_errors, stats->rx_length_errors + stats->rx_over_errors + stats->rx_crc_errors + stats->rx_frame_errors, + stats->rx_compressed, stats->multicast, stats->tx_bytes, stats->tx_packets, stats->tx_errors, stats->tx_dropped, stats->tx_fifo_errors, stats->collisions, stats->tx_carrier_errors + stats->tx_aborted_errors + stats->tx_window_errors + stats->tx_heartbeat_errors, - stats->multicast); + stats->tx_compressed); else size = sprintf(buffer, "%6s: No statistics available.\n", dev->name); @@ -1156,8 +1134,8 @@ int dev_get_info(char *buffer, char **start, off_t offset, int length, int dummy size = sprintf(buffer, - "Inter-| Receive | Transmit\n" - " face |bytes packets errs drop fifo frame|bytes packets errs drop fifo colls carrier multicast\n"); + "Inter-| Receive | Transmit\n" + " face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n"); pos+=size; len+=size; @@ -1555,9 +1533,6 @@ int dev_ioctl(unsigned int cmd, void *arg) rtnl_shunlock(); return ret; } - if (cmd == SIOCGIFCOUNT) { - return dev_ifcount((unsigned int*)arg); - } if (cmd == SIOCGIFNAME) { return dev_ifname((struct ifreq *)arg); } diff --git a/net/core/dst.c b/net/core/dst.c index e94ef2967..4cad680c2 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -101,7 +101,7 @@ void * dst_alloc(int size, struct dst_ops * ops) void __dst_free(struct dst_entry * dst) { start_bh_atomic(); - dst->obsolete = 1; + dst->obsolete = 2; dst->next = dst_garbage_list; dst_garbage_list = dst; if (dst_gc_timer_inc > DST_GC_INC) { diff --git a/net/core/iovec.c b/net/core/iovec.c index 18a9a3b5b..9e8873646 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -80,18 +80,21 @@ out_free: /* * Copy kernel to iovec. + * + * Note: this modifies the original iovec. */ int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) { - int err = -EFAULT; + int err; while(len>0) { if(iov->iov_len) { int copy = min(iov->iov_len, len); - if (copy_to_user(iov->iov_base, kdata, copy)) + err = copy_to_user(iov->iov_base, kdata, copy); + if (err) goto out; kdata+=copy; len-=copy; @@ -107,6 +110,8 @@ out: /* * Copy iovec to kernel. + * + * Note: this modifies the original iovec. 
*/ int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) @@ -187,9 +192,8 @@ out: * call to this function will be unaligned also. */ -int csum_partial_copy_fromiovecend(unsigned char *kdata, - struct iovec *iov, int offset, - unsigned int len, int *csump) +int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov, + int offset, unsigned int len, int *csump) { int partial_cnt = 0; int err = 0; @@ -246,9 +250,9 @@ int csum_partial_copy_fromiovecend(unsigned char *kdata, if (copy_from_user(kdata, base, copy)) goto out_fault; kdata += copy; - base += copy; + base += copy; partial_cnt += copy; - len -= copy; + len -= copy; iov++; if (len) continue; @@ -260,9 +264,9 @@ int csum_partial_copy_fromiovecend(unsigned char *kdata, goto out_fault; csum = csum_partial(kdata - partial_cnt, 4, csum); kdata += par_len; - base += par_len; - copy -= par_len; - len -= par_len; + base += par_len; + copy -= par_len; + len -= par_len; partial_cnt = 0; } @@ -278,16 +282,12 @@ int csum_partial_copy_fromiovecend(unsigned char *kdata, } } - /* Why do we want to break?? There may be more to copy ... */ - if (copy == 0) { -if (len > partial_cnt) -printk("csum_iovec: early break? len=%d, partial=%d\n", len, partial_cnt); - break; + if (copy) { + csum = csum_and_copy_from_user(base, kdata, copy, + csum, &err); + if (err) + goto out; } - - csum = csum_and_copy_from_user(base, kdata, copy, csum, &err); - if (err) - goto out; len -= copy + partial_cnt; kdata += copy + partial_cnt; iov++; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 3de3743e0..a8d72604d 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -153,12 +153,14 @@ int neigh_ifdown(struct neigh_table *tbl, struct device *dev) static struct neighbour *neigh_alloc(struct neigh_table *tbl, int creat) { struct neighbour *n; + unsigned long now = jiffies; if (tbl->entries > tbl->gc_thresh1) { if (creat < 0) return NULL; - if (tbl->entries > tbl->gc_thresh2 || - jiffies - tbl->last_flush > 5*HZ) { + if (tbl->entries > tbl->gc_thresh3 || + (tbl->entries > tbl->gc_thresh2 && + now - tbl->last_flush > 5*HZ)) { if (neigh_forced_gc(tbl) == 0 && tbl->entries > tbl->gc_thresh3) return NULL; @@ -172,7 +174,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, int creat) memset(n, 0, tbl->entry_size); skb_queue_head_init(&n->arp_queue); - n->updated = n->used = jiffies; + n->updated = n->used = now; n->nud_state = NUD_NONE; n->output = neigh_blackhole; n->parms = &tbl->parms; @@ -666,8 +668,18 @@ int neigh_update(struct neighbour *neigh, u8 *lladdr, u8 new, int override, int neigh_suspect(neigh); if (!(old&NUD_VALID)) { struct sk_buff *skb; - while ((skb=__skb_dequeue(&neigh->arp_queue)) != NULL) - neigh->output(skb); + + /* Again: avoid dead loop if something went wrong */ + + while (neigh->nud_state&NUD_VALID && + (skb=__skb_dequeue(&neigh->arp_queue)) != NULL) { + struct neighbour *n1 = neigh; + /* On shaper/eql skb->dst->neighbour != neigh :( */ + if (skb->dst && skb->dst->neighbour) + n1 = skb->dst->neighbour; + n1->output(skb); + } + skb_queue_purge(&neigh->arp_queue); } return 0; } @@ -1228,7 +1240,7 @@ struct neigh_sysctl_table &proc_dointvec}, {0}}, - {{1, "default", NULL, 0, 0555, NULL},{0}}, + {{NET_PROTO_CONF_DEFAULT, "default", NULL, 0, 0555, NULL},{0}}, {{0, "neigh", NULL, 0, 0555, NULL},{0}}, {{0, NULL, NULL, 0, 0555, NULL},{0}}, {{CTL_NET, "net", NULL, 0, 0555, NULL},{0}} @@ -1243,10 +1255,11 @@ int neigh_sysctl_register(struct device *dev, struct neigh_parms *p, if (t == NULL) return 
-ENOBUFS; memcpy(t, &neigh_sysctl_template, sizeof(*t)); + t->neigh_vars[0].data = &p->mcast_probes; t->neigh_vars[1].data = &p->ucast_probes; t->neigh_vars[2].data = &p->app_probes; t->neigh_vars[3].data = &p->retrans_time; - t->neigh_vars[4].data = &p->reachable_time; + t->neigh_vars[4].data = &p->base_reachable_time; t->neigh_vars[5].data = &p->delay_probe_time; t->neigh_vars[6].data = &p->gc_staletime; t->neigh_vars[7].data = &p->queue_len; @@ -1256,7 +1269,7 @@ int neigh_sysctl_register(struct device *dev, struct neigh_parms *p, t->neigh_vars[11].data = &p->locktime; if (dev) { t->neigh_dev[0].procname = dev->name; - t->neigh_dev[0].ctl_name = dev->ifindex+1; + t->neigh_dev[0].ctl_name = dev->ifindex; memset(&t->neigh_vars[12], 0, sizeof(ctl_table)); } else { t->neigh_vars[12].data = (&p->locktime) + 1; diff --git a/net/core/sock.c b/net/core/sock.c index 6da5f5a0d..f940e5a80 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -137,6 +137,8 @@ __u32 sysctl_wmem_default = SK_WMEM_MAX; __u32 sysctl_rmem_default = SK_RMEM_MAX; int sysctl_core_destroy_delay = SOCK_DESTROY_TIME; +/* Maximal space eaten by iovec (still not made (2.1.88)!) plus some space */ +int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512); /* * This is meant for all protocols to use and covers goings on @@ -472,11 +474,11 @@ static kmem_cache_t *sk_cachep; * usage. */ -struct sock *sk_alloc(int family, int priority) +struct sock *sk_alloc(int family, int priority, int zero_it) { struct sock *sk = kmem_cache_alloc(sk_cachep, priority); - if(sk) { + if(sk && zero_it) { memset(sk, 0, sizeof(struct sock)); sk->family = family; } @@ -561,34 +563,22 @@ struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int void *sock_kmalloc(struct sock *sk, int size, int priority) { void *mem = NULL; - /* Always use wmem.. */ - if (atomic_read(&sk->wmem_alloc)+size < sk->sndbuf) { + if (atomic_read(&sk->omem_alloc)+size < sysctl_optmem_max) { /* First do the add, to avoid the race if kmalloc * might sleep. */ - atomic_add(size, &sk->wmem_alloc); + atomic_add(size, &sk->omem_alloc); mem = kmalloc(size, priority); - if (mem) - return mem; - atomic_sub(size, &sk->wmem_alloc); } return mem; } void sock_kfree_s(struct sock *sk, void *mem, int size) { -#if 1 /* Debug */ - if (atomic_read(&sk->wmem_alloc) < size) { - printk(KERN_DEBUG "sock_kfree_s: mem not accounted.\n"); - return; - } -#endif kfree_s(mem, size); - atomic_sub(size, &sk->wmem_alloc); - sk->write_space(sk); + atomic_sub(size, &sk->omem_alloc); } - /* FIXME: this is insane. We are trying suppose to be controlling how * how much space we have for data bytes, not packet headers. * This really points out that we need a better system for doing the @@ -633,6 +623,30 @@ unsigned long sock_wspace(struct sock *sk) return(0); } +/* It is almost wait_for_tcp_memory minus release_sock/lock_sock. + I think, these locks should be removed for datagram sockets. 
+ */ +static void sock_wait_for_wmem(struct sock * sk) +{ + struct wait_queue wait = { current, NULL }; + + sk->socket->flags &= ~SO_NOSPACE; + add_wait_queue(sk->sleep, &wait); + for (;;) { + if (signal_pending(current)) + break; + current->state = TASK_INTERRUPTIBLE; + if (atomic_read(&sk->wmem_alloc) < sk->sndbuf) + break; + if (sk->shutdown & SEND_SHUTDOWN) + break; + if (sk->err) + break; + schedule(); + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); +} /* @@ -641,94 +655,78 @@ unsigned long sock_wspace(struct sock *sk) struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, unsigned long fallback, int noblock, int *errcode) { + int err; struct sk_buff *skb; - do - { - if(sk->err!=0) - { - *errcode=xchg(&sk->err,0); - return NULL; - } - - if(sk->shutdown&SEND_SHUTDOWN) - { - /* - * FIXME: Check 1003.1g should we deliver - * a signal here ??? - */ - *errcode=-EPIPE; - return NULL; - } - - if(!fallback) + do { + if ((err = xchg(&sk->err,0)) != 0) + goto failure; + + /* + * FIXME: Check 1003.1g should we deliver + * a signal here ??? + * + * Alan, could we solve this question once and forever? + * + * I believe, datagram sockets should never + * generate SIGPIPE. Moreover, I DO think that + * TCP is allowed to generate it only on write() + * call, but never on send/sendto/sendmsg. + * (btw, Solaris generates it even on read() :-)) + * + * The reason is that SIGPIPE is global flag, + * so that library function using sockets (f.e. syslog()), + * must save/disable it on entry and restore on exit. + * As result, signal arriving for another thread will + * be lost. Generation it on write() is still necessary + * because a lot of stupid programs never check write() + * return value. + * + * Seems, SIGPIPE is very bad idea, sort of gets(). + * At least, we could have an option disabling + * this behaviour on per-socket and/or per-message base. + * BTW it is very easy - MSG_SIGPIPE flag, which + * always set by read/write and checked here. + * --ANK + */ + + err = -EPIPE; + if (sk->shutdown&SEND_SHUTDOWN) + goto failure; + + if (!fallback) skb = sock_wmalloc(sk, size, 0, sk->allocation); - else - { + else { /* The buffer get won't block, or use the atomic queue. It does produce annoying no free page messages still.... */ skb = sock_wmalloc(sk, size, 0 , GFP_BUFFER); - if(!skb) + if (!skb) skb=sock_wmalloc(sk, fallback, 0, sk->allocation); } - + /* * This means we have too many buffers for this socket already. */ - - if(skb==NULL) - { - unsigned long tmp; + /* The following code is stolen "as is" from tcp.c */ + + if (skb==NULL) { sk->socket->flags |= SO_NOSPACE; - if(noblock) - { - *errcode=-EAGAIN; - return NULL; - } - if(sk->shutdown&SEND_SHUTDOWN) - { - *errcode=-EPIPE; - return NULL; - } - tmp = atomic_read(&sk->wmem_alloc); - cli(); - if(sk->shutdown&SEND_SHUTDOWN) - { - sti(); - *errcode=-EPIPE; - return NULL; - } - -#if 1 - if( tmp <= atomic_read(&sk->wmem_alloc)) -#else - /* ANK: Line above seems either incorrect - * or useless. sk->wmem_alloc has a tiny chance to change - * between tmp = sk->w... and cli(), - * but it might(?) change earlier. In real life - * it does not (I never seen the message). 
- * In any case I'd delete this check at all, or - * change it to: - */ - if (atomic_read(&sk->wmem_alloc) >= sk->sndbuf) -#endif - { - sk->socket->flags &= ~SO_NOSPACE; - interruptible_sleep_on(sk->sleep); - if (signal_pending(current)) - { - sti(); - *errcode = -ERESTARTSYS; - return NULL; - } - } - sti(); + err = -EAGAIN; + if (noblock) + goto failure; + err = -ERESTARTSYS; + if (signal_pending(current)) + goto failure; + sock_wait_for_wmem(sk); } - } - while(skb==NULL); - + } while (skb==NULL); + return skb; + +failure: + *errcode = err; + return NULL; } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 1da2cc152..47c85d006 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -22,6 +22,7 @@ extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; extern int sysctl_core_destroy_delay; +extern int sysctl_optmem_max; ctl_table core_table[] = { {NET_CORE_WMEM_MAX, "wmem_max", @@ -53,6 +54,9 @@ ctl_table core_table[] = { {NET_CORE_MSG_BURST, "message_burst", &net_msg_burst, sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_CORE_OPTMEM_MAX, "optmem_max", + &sysctl_optmem_max, sizeof(int), 0644, NULL, + &proc_dointvec}, { 0 } }; #endif diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 584ad8c7a..ef1c44620 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -5,7 +5,7 @@ * * AF_INET protocol family socket handler. * - * Version: $Id: af_inet.c,v 1.5 1997/12/16 05:37:33 ralf Exp $ + * Version: $Id: af_inet.c,v 1.6 1998/03/17 22:18:20 ralf Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -52,6 +52,7 @@ * Willy Konynenberg : Transparent proxying support. * David S. Miller : New socket lookup architecture. * Some other random speedups. + * Cyrus Durgin : Cleaned up file for kmod hacks. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -107,8 +108,8 @@ #ifdef CONFIG_BRIDGE #include <net/br.h> #endif -#ifdef CONFIG_KERNELD -#include <linux/kerneld.h> +#ifdef CONFIG_KMOD +#include <linux/kmod.h> #endif #ifdef CONFIG_NET_RADIO #include <linux/wireless.h> @@ -327,7 +328,7 @@ static int inet_create(struct socket *sock, int protocol) static int warned; if (net_families[AF_PACKET]==NULL) { -#if defined(CONFIG_KERNELD) && defined(CONFIG_PACKET_MODULE) +#if defined(CONFIG_KMOD) && defined(CONFIG_PACKET_MODULE) char module_name[30]; sprintf(module_name,"net-pf-%d", AF_PACKET); request_module(module_name); @@ -341,7 +342,7 @@ static int inet_create(struct socket *sock, int protocol) } sock->state = SS_UNCONNECTED; - sk = sk_alloc(AF_INET, GFP_KERNEL); + sk = sk_alloc(AF_INET, GFP_KERNEL, 1); if (sk == NULL) goto do_oom; @@ -894,7 +895,7 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCDRARP: case SIOCGRARP: case SIOCSRARP: -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD if (rarp_ioctl_hook == NULL) request_module("rarp"); #endif @@ -928,7 +929,7 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) #ifdef CONFIG_DLCI_MODULE -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD if (dlci_ioctl_hook == NULL) request_module("dlci"); #endif diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 94ae4263e..dd7ce9e0f 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1,6 +1,6 @@ /* linux/net/inet/arp.c * - * Version: $Id: arp.c,v 1.4 1998/03/03 01:23:36 ralf Exp $ + * Version: $Id: arp.c,v 1.5 1998/03/17 22:18:21 ralf Exp $ * * Copyright (C) 1994 by Florian La Roche * @@ -189,7 +189,7 @@ struct neigh_table arp_tbl = NULL, parp_redo, { NULL, NULL, &arp_tbl, 0, NULL, NULL, - 30*HZ, 1*HZ, 60*HZ, 30*HZ, 5*HZ, 3, 3, 0, 3, 1*HZ, (8*HZ)/10, 1*HZ, 64 }, + 30*HZ, 1*HZ, 60*HZ, 30*HZ, 5*HZ, 3, 3, 0, 3, 1*HZ, (8*HZ)/10, 64, 1*HZ }, 30*HZ, 128, 512, 1024, }; @@ -954,6 +954,10 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy struct device *dev = n->dev; int hatype = dev->type; + /* Do not confuse users "arp -a" with magic entries */ + if (!(n->nud_state&~NUD_NOARP)) + continue; + /* I'd get great pleasure deleting this ugly code. Let's output it in hexadecimal format. "arp" utility will eventually repaired --ANK diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 7d5f0021f..87394f906 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1,7 +1,7 @@ /* * NET3 IP device support routines. * - * Version: $Id: devinet.c,v 1.3 1997/12/16 05:37:35 ralf Exp $ + * Version: $Id: devinet.c,v 1.4 1998/03/17 22:18:21 ralf Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -19,6 +19,7 @@ * * Changes: * Alexey Kuznetsov: pa_* fields are replaced with ifaddr lists. + Cyrus Durgin: updated for kmod */ #include <linux/config.h> @@ -49,8 +50,8 @@ #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif -#ifdef CONFIG_KERNELD -#include <linux/kerneld.h> +#ifdef CONFIG_KMOD +#include <linux/kmod.h> #endif #include <net/ip.h> @@ -157,28 +158,32 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy) { struct in_ifaddr *ifa1 = *ifap; - struct in_ifaddr *ifa; - - /* 1. Unlink it */ - *ifap = ifa1->ifa_next; - - /* 2. Deleting primary ifaddr forces deletion all secondaries */ + /* 1. 
Deleting primary ifaddr forces deletion all secondaries */ if (!(ifa1->ifa_flags&IFA_F_SECONDARY)) { - while ((ifa=*ifap) != NULL) { - if (ifa1->ifa_mask != ifa->ifa_mask || + struct in_ifaddr *ifa; + struct in_ifaddr **ifap1 = &ifa1->ifa_next; + + while ((ifa=*ifap1) != NULL) { + if (!(ifa->ifa_flags&IFA_F_SECONDARY) || + ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) { - ifap = &ifa->ifa_next; + ifap1 = &ifa->ifa_next; continue; } - *ifap = ifa->ifa_next; + *ifap1 = ifa->ifa_next; rtmsg_ifa(RTM_DELADDR, ifa); notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa); inet_free_ifa(ifa); } } + /* 2. Unlink it */ + + *ifap = ifa1->ifa_next; + + /* 3. Announce address deletion */ /* Send message first, then call notifier. @@ -232,10 +237,9 @@ inet_insert_ifa(struct in_device *in_dev, struct in_ifaddr *ifa) ifap = last_primary; } - cli(); ifa->ifa_next = *ifap; + /* ATOMIC_SET */ *ifap = ifa; - sti(); /* Send message first, then call notifier. Notifier will trigger FIB update, so that @@ -413,7 +417,7 @@ int devinet_ioctl(unsigned int cmd, void *arg) *colon = 0; #endif -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD dev_load(ifr.ifr_name); #endif @@ -960,6 +964,8 @@ static void devinet_sysctl_register(struct in_device *in_dev, struct ipv4_devcon t->sysctl_header = register_sysctl_table(t->devinet_root_dir, 0); if (t->sysctl_header == NULL) kfree(t); + else + p->sysctl = t; } static void devinet_sysctl_unregister(struct ipv4_devconf *p) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 409db8209..6350a6366 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: FIB frontend. * - * Version: $Id: fib_frontend.c,v 1.6 1997/12/13 21:52:48 kuznet Exp $ + * Version: $Id: fib_frontend.c,v 1.9 1998/03/08 20:52:36 davem Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * @@ -151,7 +151,6 @@ struct device * ip_dev_find(u32 addr) memset(&key, 0, sizeof(key)); key.dst = addr; - key.scope = RT_SCOPE_UNIVERSE; if (!local_table || local_table->tb_lookup(local_table, &key, &res) || res.type != RTN_LOCAL) @@ -344,6 +343,10 @@ int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) int s_t; struct fib_table *tb; + if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) && + ((struct rtmsg*)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED) + return ip_rt_dump(skb, cb); + s_t = cb->args[0]; if (s_t == 0) s_t = cb->args[0] = RT_TABLE_MIN; @@ -423,8 +426,13 @@ static void fib_add_ifaddr(struct in_ifaddr *ifa) u32 addr = ifa->ifa_local; u32 prefix = ifa->ifa_address&mask; - if (ifa->ifa_flags&IFA_F_SECONDARY) + if (ifa->ifa_flags&IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, prefix, mask); + if (prim == NULL) { + printk(KERN_DEBUG "fib_add_ifaddr: bug: prim == NULL\n"); + return; + } + } fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim); @@ -435,7 +443,8 @@ static void fib_add_ifaddr(struct in_ifaddr *ifa) if (ifa->ifa_broadcast && ifa->ifa_broadcast != 0xFFFFFFFF) fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); - if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY)) { + if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) && + (prefix != addr || ifa->ifa_prefixlen < 32)) { fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim); @@ -464,8 +473,13 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) if (!(ifa->ifa_flags&IFA_F_SECONDARY)) fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? 
RTN_LOCAL : RTN_UNICAST, any, ifa->ifa_prefixlen, prim); - else + else { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); + if (prim == NULL) { + printk(KERN_DEBUG "fib_del_ifaddr: bug: prim == NULL\n"); + return; + } + } /* Deletion is more complicated than add. We should take care of not to delete too much :-) diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 33bcf0321..4b89ab676 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -5,7 +5,7 @@ * * IPv4 FIB: lookup engine and maintenance routines. * - * Version: $Id: fib_hash.c,v 1.1 1997/11/09 19:53:13 kuznet Exp $ + * Version: $Id: fib_hash.c,v 1.3 1998/03/08 05:56:16 davem Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 3ffb404b5..7ec60a5be 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: policy rules. * - * Version: $Id: fib_rules.c,v 1.2 1997/10/10 22:40:49 davem Exp $ + * Version: $Id: fib_rules.c,v 1.3 1998/03/08 05:56:17 davem Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 3883fcba0..d2d37e11e 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: semantics. * - * Version: $Id: fib_semantics.c,v 1.6 1997/12/13 21:52:49 kuznet Exp $ + * Version: $Id: fib_semantics.c,v 1.7 1998/03/08 05:56:18 davem Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index b2c7151d1..e8f636e21 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -3,7 +3,7 @@ * * Alan Cox, <alan@cymru.net> * - * Version: $Id: icmp.c,v 1.4 1998/03/03 01:23:37 ralf Exp $ + * Version: $Id: icmp.c,v 1.5 1998/03/17 22:18:23 ralf Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -928,10 +928,8 @@ int icmp_chkaddr(struct sk_buff *skb) struct tcphdr *th = (struct tcphdr *)(((unsigned char *)iph)+(iph->ihl<<2)); sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex); - if (!sk) return 0; - if (sk->saddr != iph->saddr) return 0; - if (sk->daddr != iph->daddr) return 0; - if (sk->dummy_th.dest != th->dest) return 0; + if (!sk || (sk->state == TCP_LISTEN)) + return 0; /* * This packet came from us. */ diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 166b68b42..d3414a0fe 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -8,7 +8,7 @@ * the older version didn't come out right using gcc 2.5.8, the newer one * seems to fall out with gcc 2.6.2. * - * Version: $Id: igmp.c,v 1.3 1997/12/16 05:37:36 ralf Exp $ + * Version: $Id: igmp.c,v 1.4 1998/03/17 22:18:24 ralf Exp $ * * Authors: * Alan Cox <Alan.Cox@linux.org> diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 45a2ed588..8df8414cd 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -5,7 +5,7 @@ * * The IP forwarding functionality. * - * Version: $Id: ip_forward.c,v 1.3 1998/03/03 01:23:37 ralf Exp $ + * Version: $Id: ip_forward.c,v 1.4 1998/03/17 22:18:25 ralf Exp $ * * Authors: see ip.c * diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 9dccb5324..e6831adb8 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -5,7 +5,7 @@ * * The IP fragmentation functionality. 
* - * Version: $Id: ip_fragment.c,v 1.30 1997/12/29 19:52:32 kuznet Exp $ + * Version: $Id: ip_fragment.c,v 1.32 1998/03/08 05:56:21 davem Exp $ * * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox <Alan.Cox@linux.org> diff --git a/net/ipv4/ip_fw.c b/net/ipv4/ip_fw.c index d78aa0f66..4eb41c325 100644 --- a/net/ipv4/ip_fw.c +++ b/net/ipv4/ip_fw.c @@ -6,7 +6,7 @@ * license in recognition of the original copyright. * -- Alan Cox. * - * $Id: ip_fw.c,v 1.3 1997/12/16 05:37:37 ralf Exp $ + * $Id: ip_fw.c,v 1.4 1998/03/17 22:18:25 ralf Exp $ * * Ported from BSD to Linux, * Alan Cox 22/Nov/1994. @@ -392,6 +392,39 @@ int ip_fw_chk(struct iphdr *ip, struct device *rif, __u16 *redirport, struct ip_ continue; /* Mismatch */ } + /* This looks stupid, because we scan almost static + list, searching for static key. However, this way seems + to be only reasonable way of handling fw_via rules + (btw bsd makes the same thing). + + It will not affect performance if you will follow + the following simple rules: + + - if inteface is aliased, ALWAYS specify fw_viadev, + so that previous check will guarantee, that we will + not waste time when packet arrive on another interface. + + - avoid using fw_via.s_addr if fw_via.s_addr is owned + by an aliased interface. + + --ANK + */ + if (f->fw_via.s_addr && rif) { + struct in_ifaddr *ifa; + + if (rif->ip_ptr == NULL) + continue; /* Mismatch */ + + for (ifa = ((struct in_device*)(rif->ip_ptr))->ifa_list; + ifa; ifa = ifa->ifa_next) { + if (ifa->ifa_local == f->fw_via.s_addr) + goto ifa_ok; + } + continue; /* Mismatch */ + + ifa_ok: + } + /* * Ok the chain addresses match. */ diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 61c364542..fa8208959 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) module. * - * Version: $Id: ip_input.c,v 1.2 1997/12/16 05:37:38 ralf Exp $ + * Version: $Id: ip_input.c,v 1.3 1998/03/17 22:18:26 ralf Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> diff --git a/net/ipv4/ip_masq_mod.c b/net/ipv4/ip_masq_mod.c index 797f9112f..2265161f3 100644 --- a/net/ipv4/ip_masq_mod.c +++ b/net/ipv4/ip_masq_mod.c @@ -12,6 +12,8 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * + * Changes: + * Cyrus Durgin: fixed kerneld stuff for kmod. 
*/ #include <linux/config.h> @@ -21,8 +23,8 @@ #include <linux/errno.h> #include <net/ip_masq.h> #include <net/ip_masq_mod.h> -#ifdef CONFIG_KERNELD -#include <linux/kerneld.h> +#ifdef CONFIG_KMOD +#include <linux/kmod.h> #endif EXPORT_SYMBOL(register_ip_masq_mod); @@ -290,7 +292,7 @@ struct ip_masq_mod * ip_masq_mod_getbyname(const char *mmod_name) int ip_masq_mod_ctl(int optname, struct ip_fw_masqctl *mctl, int optlen) { struct ip_masq_mod * mmod; -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD char kmod_name[IP_MASQ_MOD_NMAX+8]; #endif /* tappo */ @@ -299,7 +301,7 @@ int ip_masq_mod_ctl(int optname, struct ip_fw_masqctl *mctl, int optlen) mmod = ip_masq_mod_getbyname(mctl->u.mod.name); if (mmod) return mmod->mmod_ctl(optname, mctl, optlen); -#ifdef CONFIG_KERNELD +#ifdef CONFIG_KMOD sprintf(kmod_name,"ip_masq_%s", mctl->u.mod.name); IP_MASQ_DEBUG(1, "About to request \"%s\" module\n", kmod_name); diff --git a/net/ipv4/ip_masq_raudio.c b/net/ipv4/ip_masq_raudio.c index f7e28f21a..377b8223e 100644 --- a/net/ipv4/ip_masq_raudio.c +++ b/net/ipv4/ip_masq_raudio.c @@ -2,7 +2,7 @@ * IP_MASQ_RAUDIO - Real Audio masquerading module * * - * Version: @(#)$Id: ip_masq_raudio.c,v 1.8 1997/11/28 15:32:32 alan Exp $ + * Version: @(#)$Id: ip_masq_raudio.c,v 1.9 1998/02/23 02:50:19 davem Exp $ * * Author: Nigel Metheringham * Real Time Streaming code by Progressive Networks diff --git a/net/ipv4/ip_nat_dumb.c b/net/ipv4/ip_nat_dumb.c index 06e9be8fb..def66858c 100644 --- a/net/ipv4/ip_nat_dumb.c +++ b/net/ipv4/ip_nat_dumb.c @@ -5,7 +5,7 @@ * * Dumb Network Address Translation. * - * Version: $Id: ip_nat_dumb.c,v 1.2 1997/10/10 22:41:05 davem Exp $ + * Version: $Id: ip_nat_dumb.c,v 1.2 1997/12/16 05:37:40 ralf Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * @@ -14,6 +14,9 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * + * Fixes: + * Rani Assaf : A zero checksum is a special case + * only in UDP * * NOTE: It is just working model of real NAT. */ @@ -49,7 +52,6 @@ ip_do_nat(struct sk_buff *skb) u32 odaddr = iph->daddr; u32 osaddr = iph->saddr; u16 check; - u16 *cksum = NULL; IPCB(skb)->flags |= IPSKB_TRANSLATED; @@ -62,17 +64,23 @@ ip_do_nat(struct sk_buff *skb) /* If it is the first fragment, rewrite protocol headers */ if (!(iph->frag_off & htons(IP_OFFSET))) { - /* Only plain TCP/UDP headers rewriting is implemented :-( */ - if (iph->protocol == IPPROTO_TCP) - cksum = (u16*)&((struct tcphdr*)(((char*)iph) + iph->ihl*4))->check; - else if (iph->protocol == IPPROTO_UDP) - cksum = (u16*)&((struct udphdr*)(((char*)iph) + iph->ihl*4))->check; - if (cksum && (check = *cksum) != 0) { - check = csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 0, ~check); - check = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0, ~check); - if (!check) - check = 0xFFFF; - *cksum = check; + u16 *cksum; + + switch(iph->protocol) { + case IPPROTO_TCP: + cksum = (u16*)&((struct tcphdr*)(((char*)iph) + iph->ihl*4))->check; + check = csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 0, ~(*cksum)); + *cksum = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0, ~check); + break; + case IPPROTO_UDP: + cksum = (u16*)&((struct udphdr*)(((char*)iph) + iph->ihl*4))->check; + if ((check = *cksum) != 0) { + check = csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 0, ~check); + check = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0, ~check); + *cksum = check ? 
: 0xFFFF; + } + default: + break; } } return 0; diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 53c680eed..d78cc1ff0 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -5,7 +5,7 @@ * * The options processing module for ip.c * - * Version: $Id: ip_options.c,v 1.2 1997/12/16 05:37:40 ralf Exp $ + * Version: $Id: ip_options.c,v 1.3 1998/03/17 22:18:28 ralf Exp $ * * Authors: A.N.Kuznetsov * diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index ac4ac22ae..63fbbfe1e 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) output module. * - * Version: $Id: ip_output.c,v 1.4 1998/03/03 01:23:41 ralf Exp $ + * Version: $Id: ip_output.c,v 1.5 1998/03/17 22:18:29 ralf Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index a500a72e5..1b7f44e8f 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -5,7 +5,7 @@ * * The IP to API glue. * - * Version: $Id: ip_sockglue.c,v 1.4 1998/03/03 01:23:41 ralf Exp $ + * Version: $Id: ip_sockglue.c,v 1.5 1998/03/17 22:18:29 ralf Exp $ * * Authors: see ip.c * diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 20521e643..1e44ae8aa 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1,5 +1,5 @@ /* - * $Id: ipconfig.c,v 1.6 1998/01/09 17:19:46 mj Exp $ + * $Id: ipconfig.c,v 1.11 1998/02/12 07:43:16 davem Exp $ * * Automatic Configuration of IP -- use BOOTP or RARP or user-supplied * information to configure own IP address and routes. diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 949661f41..ce071d406 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -1,7 +1,7 @@ /* * Linux NET3: IP/IP protocol decoder. * - * Version: $Id: ipip.c,v 1.4 1997/12/16 05:37:42 ralf Exp $ + * Version: $Id: ipip.c,v 1.5 1998/03/17 22:18:30 ralf Exp $ * * Authors: * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index d3c07dca3..1177f33ac 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -9,7 +9,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Version: $Id: ipmr.c,v 1.29 1997/12/13 21:52:55 kuznet Exp $ + * Version: $Id: ipmr.c,v 1.4 1998/03/17 22:18:31 ralf Exp $ * * Fixes: * Michael Chastain : Incorrect size of copying. 
@@ -1351,6 +1351,7 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) int ct; struct rtnexthop *nhp; struct device *dev = vif_table[c->mfc_parent].dev; + u8 *b = skb->tail; #ifdef CONFIG_RTNL_OLD_IFINFO if (dev) { @@ -1389,10 +1390,11 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) return 1; rtattr_failure: + skb_trim(skb, b - skb->data); return -EMSGSIZE; } -int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm) +int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) { struct mfc_cache *cache; struct rtable *rt = (struct rtable*)skb->dst; @@ -1400,10 +1402,16 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm) start_bh_atomic(); cache = ipmr_cache_find(rt->rt_src, rt->rt_dst); if (cache==NULL || (cache->mfc_flags&MFC_QUEUED)) { - struct device *dev = skb->dev; + struct device *dev; int vif; int err; + if (nowait) { + end_bh_atomic(); + return -EAGAIN; + } + + dev = skb->dev; if (dev == NULL || (vif = ipmr_find_vif(dev)) == ALL_VIFS) { end_bh_atomic(); return -ENODEV; @@ -1422,7 +1430,7 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm) */ end_bh_atomic(); - if (rtm->rtm_flags & RTM_F_NOTIFY) + if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) cache->mfc_flags |= MFC_NOTIFY; return ipmr_fill_mroute(skb, cache, rtm); } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 7f3b5f9bb..221207205 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -7,7 +7,7 @@ * PROC file system. It is mainly used for debugging and * statistics. * - * Version: $Id: proc.c,v 1.23 1997/10/30 23:52:20 davem Exp $ + * Version: $Id: proc.c,v 1.4 1997/12/16 05:37:43 ralf Exp $ * * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de> @@ -77,11 +77,12 @@ static inline void get__sock(struct sock *sp, char *tmpbuf, int i, int format) unsigned long dest, src; unsigned short destp, srcp; int timer_active, timer_active1, timer_active2; + int tw_bucket = 0; unsigned long timer_expires; struct tcp_opt *tp = &sp->tp_pinfo.af_tcp; dest = sp->daddr; - src = sp->saddr; + src = sp->rcv_saddr; destp = sp->dummy_th.dest; srcp = sp->dummy_th.source; @@ -96,30 +97,47 @@ static inline void get__sock(struct sock *sp, char *tmpbuf, int i, int format) destp = ntohs(destp); srcp = ntohs(srcp); - timer_active1 = del_timer(&tp->retransmit_timer); - timer_active2 = del_timer(&sp->timer); - if (!timer_active1) tp->retransmit_timer.expires=0; - if (!timer_active2) sp->timer.expires=0; - timer_active=0; - timer_expires=(unsigned)-1; + if((format == 0) && (sp->state == TCP_TIME_WAIT)) { + struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sp; + + tw_bucket = 1; + timer_active1 = timer_active2 = 0; + timer_active = 3; + timer_expires = tw->timer.expires; + } else { + timer_active1 = del_timer(&tp->retransmit_timer); + timer_active2 = del_timer(&sp->timer); + if (!timer_active1) tp->retransmit_timer.expires=0; + if (!timer_active2) sp->timer.expires=0; + timer_active = 0; + timer_expires = (unsigned) -1; + } if (timer_active1 && tp->retransmit_timer.expires < timer_expires) { - timer_active=timer_active1; - timer_expires=tp->retransmit_timer.expires; + timer_active = 1; + timer_expires = tp->retransmit_timer.expires; } if (timer_active2 && sp->timer.expires < timer_expires) { - timer_active=timer_active2; - timer_expires=sp->timer.expires; - } + timer_active = 2; + timer_expires = sp->timer.expires; + } + if(timer_active == 0) + timer_expires = jiffies; sprintf(tmpbuf, "%4d: 
%08lX:%04X %08lX:%04X" " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld", i, src, srcp, dest, destp, sp->state, - format==0?sp->write_seq-tp->snd_una:atomic_read(&sp->wmem_alloc), - format==0?tp->rcv_nxt-sp->copied_seq:atomic_read(&sp->rmem_alloc), - timer_active, timer_expires-jiffies, - tp->retransmits, - sp->socket ? sp->socket->inode->i_uid:0, - timer_active?sp->timeout:0, - sp->socket ? sp->socket->inode->i_ino:0); + (tw_bucket ? + 0 : + (format == 0) ? + tp->write_seq-tp->snd_una : atomic_read(&sp->wmem_alloc)), + (tw_bucket ? + 0 : + (format == 0) ? + tp->rcv_nxt-tp->copied_seq: atomic_read(&sp->rmem_alloc)), + timer_active, timer_expires-jiffies, + (tw_bucket ? 0 : tp->retransmits), + (!tw_bucket && sp->socket) ? sp->socket->inode->i_uid : 0, + (!tw_bucket && timer_active) ? sp->timeout : 0, + (!tw_bucket && sp->socket) ? sp->socket->inode->i_ino : 0); if (timer_active1) add_timer(&tp->retransmit_timer); if (timer_active2) add_timer(&sp->timer); diff --git a/net/ipv4/rarp.c b/net/ipv4/rarp.c index 9e944495f..e1eba43c5 100644 --- a/net/ipv4/rarp.c +++ b/net/ipv4/rarp.c @@ -3,7 +3,7 @@ * Copyright (C) 1994 by Ross Martin * Based on linux/net/inet/arp.c, Copyright (C) 1994 by Florian La Roche * - * $Id: rarp.c,v 1.3 1997/12/16 05:37:44 ralf Exp $ + * $Id: rarp.c,v 1.4 1998/03/17 22:18:31 ralf Exp $ * * This module implements the Reverse Address Resolution Protocol * (RARP, RFC 903), which is used to convert low level addresses such diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index b3644f10d..baebab777 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -5,7 +5,7 @@ * * RAW - implementation of IP "raw" sockets. * - * Version: $Id: raw.c,v 1.3 1997/12/16 05:37:44 ralf Exp $ + * Version: $Id: raw.c,v 1.4 1998/03/17 22:18:32 ralf Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b73c3ed11..8ce4a95f4 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -5,7 +5,7 @@ * * ROUTE - implementation of the IP router. * - * Version: $Id: route.c,v 1.4 1998/03/03 01:23:42 ralf Exp $ + * Version: $Id: route.c,v 1.5 1998/03/17 22:18:32 ralf Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -167,7 +167,7 @@ __u8 ip_tos2prio[16] = { static struct rtable *rt_hash_table[RT_HASH_DIVISOR]; -static struct rtable * rt_intern_hash(unsigned hash, struct rtable * rth, u16 protocol); +static struct rtable * rt_intern_hash(unsigned hash, struct rtable * rth); static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos) { @@ -301,6 +301,8 @@ static void rt_run_flush(unsigned long dummy) int i; struct rtable * rth, * next; + rt_deadline = 0; + for (i=0; i<RT_HASH_DIVISOR; i++) { int nr=0; @@ -322,37 +324,41 @@ static void rt_run_flush(unsigned long dummy) void rt_cache_flush(int delay) { + unsigned long now = jiffies; + int user_mode = !in_interrupt(); + if (delay < 0) delay = ip_rt_min_delay; start_bh_atomic(); if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) { - long tmo = (long)(rt_deadline - rt_flush_timer.expires); + long tmo = (long)(rt_deadline - now); /* If flush timer is already running and flush request is not immediate (delay > 0): - if deadline is not achieved, prolongate timer to "dealy", + if deadline is not achieved, prolongate timer to "delay", otherwise fire it at deadline time. 
*/ + if (user_mode && (long)(rt_deadline-now) < ip_rt_max_delay-ip_rt_min_delay) + tmo = 0; + if (delay > tmo) delay = tmo; } if (delay <= 0) { - rt_deadline = 0; end_bh_atomic(); - rt_run_flush(0); return; } if (rt_deadline == 0) - rt_deadline = jiffies + ip_rt_max_delay; + rt_deadline = now + ip_rt_max_delay; - rt_flush_timer.expires = jiffies + delay; + rt_flush_timer.expires = now + delay; add_timer(&rt_flush_timer); end_bh_atomic(); } @@ -400,7 +406,7 @@ out: return (atomic_read(&ipv4_dst_ops.entries) > ip_rt_max_size); } -static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt, u16 protocol) +static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt) { struct rtable *rth, **rthp; unsigned long now = jiffies; @@ -472,7 +478,9 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, goto reject_redirect; if (!IN_DEV_SHARED_MEDIA(in_dev)) { - if (ip_fib_check_default(new_gw, dev)) + if (!inet_addr_onlink(in_dev, new_gw, old_gw)) + goto reject_redirect; + if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) goto reject_redirect; } else { if (inet_addr_type(new_gw) != RTN_UNICAST) @@ -504,9 +512,13 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, rth->u.dst.dev != dev) break; + dst_clone(&rth->u.dst); + rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); - if (rt == NULL) + if (rt == NULL) { + ip_rt_put(rth); return; + } /* * Copy all the information. @@ -531,14 +543,16 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, if (rt->u.dst.neighbour) neigh_event_send(rt->u.dst.neighbour, NULL); ip_rt_put(rt); + ip_rt_put(rth); rt_free(rt); break; } *rthp = rth->u.rt_next; - rt_free(rth); - rt = rt_intern_hash(hash, rt, ETH_P_IP); + rt = rt_intern_hash(hash, rt); ip_rt_put(rt); + ip_rt_put(rth); + rt_free(rth); break; } } @@ -762,19 +776,45 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) u32 src; struct fib_result res; - if (rt->key.iif == 0) { - memcpy(addr, &rt->rt_src, 4); - return; - } - if (fib_lookup(&rt->key, &res) == 0) { + if (rt->key.iif == 0) + src = rt->rt_src; + else if (fib_lookup(&rt->key, &res) == 0) src = FIB_RES_PREFSRC(res); - memcpy(addr, &src, 4); - return; - } - src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); + else + src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); memcpy(addr, &src, 4); } +static void rt_set_nexthop(struct rtable *rt, struct fib_result *res) +{ + struct fib_info *fi = res->fi; + + if (fi) { + if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) + rt->rt_gateway = FIB_RES_GW(*res); +#ifndef CONFIG_RTNL_OLD_IFINFO + rt->u.dst.mxlock = fi->fib_metrics[RTAX_LOCK-1]; + rt->u.dst.pmtu = fi->fib_mtu; + if (fi->fib_mtu == 0) { + rt->u.dst.pmtu = rt->u.dst.dev->mtu; + if (rt->u.dst.mxlock&(1<<RTAX_MTU) && + rt->rt_gateway != rt->rt_dst && + rt->u.dst.pmtu > 576) + rt->u.dst.pmtu = 576; + } +#else + rt->u.dst.pmtu = fi->fib_mtu ? : rt->u.dst.dev->mtu; +#endif + rt->u.dst.window= fi->fib_window ? : 0; + rt->u.dst.rtt = fi->fib_rtt ? 
: TCP_TIMEOUT_INIT; + } else { + rt->u.dst.pmtu = rt->u.dst.dev->mtu; + rt->u.dst.window= 0; + rt->u.dst.rtt = TCP_TIMEOUT_INIT; + } + rt->rt_type = res->type; +} + static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, u8 tos, struct device *dev, int our) @@ -832,7 +872,7 @@ ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, #endif hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos); - skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, 0); + skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth); return 0; } @@ -990,18 +1030,9 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, rth->u.dst.input = ip_forward; rth->u.dst.output = ip_output; - rth->u.dst.pmtu = res.fi->fib_mtu ? : out_dev->dev->mtu; - rth->u.dst.window=res.fi->fib_window ? : 0; - rth->u.dst.rtt = res.fi->fib_rtt ? : TCP_TIMEOUT_INIT; -#ifndef CONFIG_RTNL_OLD_IFINFO - rth->u.dst.mxlock = res.fi->fib_metrics[RTAX_LOCK-1]; -#endif - - if (FIB_RES_GW(res) && FIB_RES_NH(res).nh_scope == RT_SCOPE_LINK) - rth->rt_gateway = FIB_RES_GW(res); + rt_set_nexthop(rth, &res); rth->rt_flags = flags; - rth->rt_type = res.type; #ifdef CONFIG_NET_FASTROUTE if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) { @@ -1014,7 +1045,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, } #endif - skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, ntohs(skb->protocol)); + skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth); return 0; brd_input: @@ -1062,7 +1093,7 @@ local_input: } rth->rt_flags = flags|RTCF_LOCAL; rth->rt_type = res.type; - skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, 0); + skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth); return 0; no_route: @@ -1362,7 +1393,7 @@ make_route: rth->rt_dst_map = key.dst; rth->rt_src_map = key.src; #endif - rth->rt_iif = dev_out->ifindex; + rth->rt_iif = oif ? : dev_out->ifindex; rth->u.dst.dev = dev_out; rth->rt_gateway = key.dst; rth->rt_spec_dst= key.src; @@ -1388,24 +1419,12 @@ make_route: #endif } - if (res.fi) { - if (FIB_RES_GW(res) && FIB_RES_NH(res).nh_scope == RT_SCOPE_LINK) - rth->rt_gateway = FIB_RES_GW(res); - rth->u.dst.pmtu = res.fi->fib_mtu ? : dev_out->mtu; - rth->u.dst.window=res.fi->fib_window ? : 0; - rth->u.dst.rtt = res.fi->fib_rtt ? : TCP_TIMEOUT_INIT; -#ifndef CONFIG_RTNL_OLD_IFINFO - rth->u.dst.mxlock = res.fi->fib_metrics[RTAX_LOCK-1]; -#endif - } else { - rth->u.dst.pmtu = dev_out->mtu; - rth->u.dst.window=0; - rth->u.dst.rtt = TCP_TIMEOUT_INIT; - } + rt_set_nexthop(rth, &res); + rth->rt_flags = flags; - rth->rt_type = res.type; + hash = rt_hash_code(daddr, saddr^(oif<<5), tos); - *rp = rt_intern_hash(hash, rth, ETH_P_IP); + *rp = rt_intern_hash(hash, rth); return 0; } @@ -1444,6 +1463,113 @@ int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif) #ifdef CONFIG_RTNETLINK +static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int nowait) +{ + struct rtable *rt = (struct rtable*)skb->dst; + struct rtmsg *r; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + struct rta_cacheinfo ci; +#ifdef CONFIG_IP_MROUTE + struct rtattr *eptr; +#endif +#ifdef CONFIG_RTNL_OLD_IFINFO + unsigned char *o; +#else + struct rtattr *mx; +#endif + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r)); + r = NLMSG_DATA(nlh); + nlh->nlmsg_flags = nowait ? 
NLM_F_MULTI : 0; + r->rtm_family = AF_INET; + r->rtm_dst_len = 32; + r->rtm_src_len = 32; + r->rtm_tos = rt->key.tos; + r->rtm_table = RT_TABLE_MAIN; + r->rtm_type = rt->rt_type; + r->rtm_scope = RT_SCOPE_UNIVERSE; + r->rtm_protocol = RTPROT_UNSPEC; + r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED; +#ifdef CONFIG_RTNL_OLD_IFINFO + r->rtm_nhs = 0; + + o = skb->tail; +#endif + RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst); + RTA_PUT(skb, RTA_SRC, 4, &rt->rt_src); + if (rt->u.dst.dev) + RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex); + if (rt->rt_dst != rt->rt_gateway) + RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway); +#ifdef CONFIG_RTNL_OLD_IFINFO + RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &rt->u.dst.pmtu); + RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &rt->u.dst.window); + RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &rt->u.dst.rtt); +#else + mx = (struct rtattr*)skb->tail; + RTA_PUT(skb, RTA_METRICS, 0, NULL); + if (rt->u.dst.mxlock) + RTA_PUT(skb, RTAX_LOCK, sizeof(unsigned), &rt->u.dst.mxlock); + if (rt->u.dst.pmtu) + RTA_PUT(skb, RTAX_MTU, sizeof(unsigned), &rt->u.dst.pmtu); + if (rt->u.dst.window) + RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), &rt->u.dst.window); + if (rt->u.dst.rtt) + RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt); + mx->rta_len = skb->tail - (u8*)mx; + if (mx->rta_len == RTA_LENGTH(0)) + skb_trim(skb, (u8*)mx - skb->data); +#endif + RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst); + ci.rta_lastuse = jiffies - rt->u.dst.lastuse; + ci.rta_used = atomic_read(&rt->u.dst.refcnt); + ci.rta_clntref = atomic_read(&rt->u.dst.use); + ci.rta_expires = 0; + ci.rta_error = rt->u.dst.error; +#ifdef CONFIG_IP_MROUTE + eptr = (struct rtattr*)skb->tail; +#endif + RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci); +#ifdef CONFIG_RTNL_OLD_IFINFO + r->rtm_optlen = skb->tail - o; +#endif + if (rt->key.iif) { +#ifdef CONFIG_IP_MROUTE + u32 dst = rt->rt_dst; + + if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) { + int err = ipmr_get_route(skb, r, nowait); + if (err <= 0) { + if (!nowait) { + if (err == 0) + return 0; + goto nlmsg_failure; + } else { + if (err == -EMSGSIZE) + goto nlmsg_failure; + ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err; + } + } + } else +#endif + { + RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif); +#ifdef CONFIG_RTNL_OLD_IFINFO + r->rtm_optlen = skb->tail - o; +#endif + } + } + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) { struct rtattr **rta = arg; @@ -1454,12 +1580,6 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) int iif = 0; int err; struct sk_buff *skb; - struct rta_cacheinfo ci; -#ifdef CONFIG_RTNL_OLD_IFINFO - unsigned char *o; -#else - struct rtattr *mx; -#endif skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (skb == NULL) @@ -1506,83 +1626,53 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; - nlh = NLMSG_PUT(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, - RTM_NEWROUTE, sizeof(*rtm)); - rtm = NLMSG_DATA(nlh); - nlh->nlmsg_flags = 0; - rtm->rtm_family = AF_INET; - rtm->rtm_dst_len = 32; - rtm->rtm_src_len = 32; - rtm->rtm_tos = rt->key.tos; - rtm->rtm_table = RT_TABLE_MAIN; - rtm->rtm_type = rt->rt_type; - rtm->rtm_scope = RT_SCOPE_UNIVERSE; - rtm->rtm_protocol = RTPROT_UNSPEC; - rtm->rtm_flags = (rt->rt_flags&~0xFFFF) | 
RTM_F_CLONED; -#ifdef CONFIG_RTNL_OLD_IFINFO - rtm->rtm_nhs = 0; + NETLINK_CB(skb).pid = NETLINK_CB(in_skb).pid; + + err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0); + if (err == 0) + return 0; + if (err < 0) + return -EMSGSIZE; - o = skb->tail; -#endif - RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst); - RTA_PUT(skb, RTA_SRC, 4, &rt->rt_src); - if (rt->u.dst.dev) - RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex); - if (rt->rt_dst != rt->rt_gateway) - RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway); -#ifdef CONFIG_RTNL_OLD_IFINFO - RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &rt->u.dst.pmtu); - RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &rt->u.dst.window); - RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &rt->u.dst.rtt); -#else - mx = (struct rtattr*)skb->tail; - RTA_PUT(skb, RTA_METRICS, 0, NULL); - if (rt->u.dst.mxlock) - RTA_PUT(skb, RTAX_LOCK, sizeof(unsigned), &rt->u.dst.mxlock); - if (rt->u.dst.pmtu) - RTA_PUT(skb, RTAX_MTU, sizeof(unsigned), &rt->u.dst.pmtu); - if (rt->u.dst.window) - RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), &rt->u.dst.window); - if (rt->u.dst.rtt) - RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt); - mx->rta_len = skb->tail - (u8*)mx; -#endif - RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst); - ci.rta_lastuse = jiffies - rt->u.dst.lastuse; - ci.rta_used = atomic_read(&rt->u.dst.refcnt); - ci.rta_clntref = atomic_read(&rt->u.dst.use); - ci.rta_expires = 0; - ci.rta_error = rt->u.dst.error; - RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci); -#ifdef CONFIG_RTNL_OLD_IFINFO - rtm->rtm_optlen = skb->tail - o; -#endif - if (iif) { -#ifdef CONFIG_IP_MROUTE - if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_devconf.mc_forwarding) { - NETLINK_CB(skb).pid = NETLINK_CB(in_skb).pid; - err = ipmr_get_route(skb, rtm); - if (err <= 0) - return err; - } else -#endif - { - RTA_PUT(skb, RTA_IIF, sizeof(int), &iif); -#ifdef CONFIG_RTNL_OLD_IFINFO - rtm->rtm_optlen = skb->tail - o; -#endif - } - } - nlh->nlmsg_len = skb->tail - (u8*)nlh; err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); if (err < 0) return err; return 0; +} -nlmsg_failure: -rtattr_failure: - kfree_skb(skb); - return -EMSGSIZE; + +int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct rtable *rt; + int h, s_h; + int idx, s_idx; + + s_h = cb->args[0]; + s_idx = idx = cb->args[1]; + for (h=0; h < RT_HASH_DIVISOR; h++) { + if (h < s_h) continue; + if (h > s_h) + memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(int)); + start_bh_atomic(); + for (rt = rt_hash_table[h], idx = 0; rt; rt = rt->u.rt_next, idx++) { + if (idx < s_idx) + continue; + skb->dst = dst_clone(&rt->u.dst); + if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) { + dst_release(xchg(&skb->dst, NULL)); + end_bh_atomic(); + goto done; + } + dst_release(xchg(&skb->dst, NULL)); + } + end_bh_atomic(); + } + +done: + cb->args[0] = h; + cb->args[1] = idx; + return skb->len; } #endif /* CONFIG_RTNETLINK */ diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 7d119716e..00dd0a8ef 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -9,7 +9,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * $Id: syncookies.c,v 1.3 1997/09/16 17:16:21 freitag Exp $ + * $Id: syncookies.c,v 1.4 1998/03/08 05:56:34 davem Exp $ * * Missing: IPv6 support. 
* Some counter so that the Administrator can see when the machine diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 3a8a7efb4..767c5d00b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1,7 +1,7 @@ /* * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem. * - * $Id: sysctl_net_ipv4.c,v 1.6 1998/03/03 01:23:42 ralf Exp $ + * $Id: sysctl_net_ipv4.c,v 1.7 1998/03/17 22:18:33 ralf Exp $ * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS] @@ -45,8 +45,6 @@ extern int sysctl_ip_masq_debug; extern int sysctl_tcp_cong_avoidance; extern int sysctl_tcp_hoe_retransmits; -extern int sysctl_tcp_sack; -extern int sysctl_tcp_tsack; extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_keepalive_time; @@ -57,7 +55,8 @@ extern int sysctl_tcp_retries2; extern int sysctl_tcp_fin_timeout; extern int sysctl_tcp_syncookies; extern int sysctl_tcp_syn_retries; -extern int sysctl_tcp_stdurg; +extern int sysctl_tcp_stdurg; +extern int sysctl_tcp_rfc1337; extern int sysctl_tcp_syn_taildrop; extern int sysctl_max_syn_backlog; @@ -99,12 +98,6 @@ ctl_table ipv4_table[] = { {NET_IPV4_TCP_HOE_RETRANSMITS, "tcp_hoe_retransmits", &sysctl_tcp_hoe_retransmits, sizeof(int), 0644, NULL, &proc_dointvec}, - {NET_IPV4_TCP_SACK, "tcp_sack", - &sysctl_tcp_sack, sizeof(int), 0644, NULL, - &proc_dointvec}, - {NET_IPV4_TCP_TSACK, "tcp_tsack", - &sysctl_tcp_tsack, sizeof(int), 0644, NULL, - &proc_dointvec}, {NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps", &sysctl_tcp_timestamps, sizeof(int), 0644, NULL, &proc_dointvec}, @@ -162,6 +155,8 @@ ctl_table ipv4_table[] = { #endif {NET_TCP_STDURG, "tcp_stdurg", &sysctl_tcp_stdurg, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_RFC1337, "tcp_rfc1337", &sysctl_tcp_rfc1337, + sizeof(int), 0644, NULL, &proc_dointvec}, {NET_TCP_SYN_TAILDROP, "tcp_syn_taildrop", &sysctl_tcp_syn_taildrop, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog", &sysctl_max_syn_backlog, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 17ec6def9..b20df83d2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.77 1998/01/15 22:40:18 freitag Exp $ + * Version: $Id: tcp.c,v 1.96 1998/03/16 02:25:55 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -169,7 +169,7 @@ * Fixed tcp_write_timeout: stuck close, * and TCP syn retries gets used now. * Mark Yarvis : In tcp_read_wakeup(), don't send an - * ack if stat is TCP_CLOSED. + * ack if state is TCP_CLOSED. * Alan Cox : Look up device on a retransmit - routes may * change. Doesn't yet cope with MSS shrink right * but its a start! @@ -425,6 +425,8 @@ int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; struct tcp_mib tcp_statistics; kmem_cache_t *tcp_openreq_cachep; +kmem_cache_t *tcp_bucket_cachep; +kmem_cache_t *tcp_timewait_cachep; /* * Find someone to 'accept'. Must be called with @@ -478,20 +480,6 @@ static void tcp_close_pending (struct sock *sk) } /* - * Enter the time wait state. - */ - -void tcp_time_wait(struct sock *sk) -{ - tcp_set_state(sk,TCP_TIME_WAIT); - sk->shutdown = SHUTDOWN_MASK; - if (!sk->dead) - sk->state_change(sk); - tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); -} - - -/* * Walk down the receive queue counting readable data. * * Must be called with the socket lock held. 
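The sysctl_net_ipv4.c hunk above wires the new tcp_rfc1337 switch into ipv4_table. A minimal stand-alone sketch of such an entry (neighbouring entries omitted; the extern declaration and the terminating zero entry are assumed from the usual ctl_table convention of this era, not shown in the hunk itself):

	extern int sysctl_tcp_rfc1337;

	/* One integer knob, world-readable, root-writable (0644),
	 * read and written through the generic proc_dointvec handler.
	 */
	static ctl_table rfc1337_table[] = {
		{NET_TCP_RFC1337, "tcp_rfc1337", &sysctl_tcp_rfc1337,
		 sizeof(int), 0644, NULL, &proc_dointvec},
		{0}
	};
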
@@ -512,7 +500,7 @@ static int tcp_readable(struct sock *sk) return(0); } - counted = sk->copied_seq; /* Where we are at the moment */ + counted = sk->tp_pinfo.af_tcp.copied_seq; /* Where we are at the moment */ amount = 0; /* Do until a push or until we are out of data. */ @@ -606,10 +594,10 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) if (sk->shutdown & RCV_SHUTDOWN) mask |= POLLHUP; - if ((tp->rcv_nxt != sk->copied_seq) && - (sk->urg_seq != sk->copied_seq || - tp->rcv_nxt != sk->copied_seq+1 || - sk->urginline || !sk->urg_data)) + if ((tp->rcv_nxt != tp->copied_seq) && + (tp->urg_seq != tp->copied_seq || + tp->rcv_nxt != tp->copied_seq+1 || + sk->urginline || !tp->urg_data)) mask |= POLLIN | POLLRDNORM; #if 1 /* This needs benchmarking and real world tests */ @@ -621,9 +609,9 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) space = atomic_read(&sk->wmem_alloc) / 2; #endif /* Always wake the user up when an error occured */ - if (sock_wspace(sk) >= space) + if (sock_wspace(sk) >= space || sk->err) mask |= POLLOUT | POLLWRNORM; - if (sk->urg_data) + if (tp->urg_data) mask |= POLLPRI; } return mask; @@ -649,7 +637,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) } case SIOCATMARK: { - int answ = sk->urg_data && sk->urg_seq == sk->copied_seq; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int answ = tp->urg_data && tp->urg_seq == tp->copied_seq; return put_user(answ,(int *) arg); } case TIOCOUTQ: @@ -669,21 +658,38 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) /* * Wait for a socket to get into the connected state */ -static void wait_for_tcp_connect(struct sock * sk) +static int wait_for_tcp_connect(struct sock * sk, int flags) { struct task_struct *tsk = current; struct wait_queue wait = { tsk, NULL }; - tsk->state = TASK_INTERRUPTIBLE; - add_wait_queue(sk->sleep, &wait); - release_sock(sk); + while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { + if(sk->err) + return sock_error(sk); + if((1 << sk->state) & + ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { + if(sk->keepopen) + send_sig(SIGPIPE, tsk, 0); + return -EPIPE; + } + if(flags & MSG_DONTWAIT) + return -EAGAIN; + if(signal_pending(tsk)) + return -ERESTARTSYS; - if (((1 << sk->state) & ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT)) && sk->err == 0) - schedule(); + tsk->state = TASK_INTERRUPTIBLE; + add_wait_queue(sk->sleep, &wait); + release_sock(sk); - tsk->state = TASK_RUNNING; - remove_wait_queue(sk->sleep, &wait); - lock_sock(sk); + if (((1 << sk->state) & ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT)) && + sk->err == 0) + schedule(); + + tsk->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); + lock_sock(sk); + } + return 0; } static inline int tcp_memory_free(struct sock *sk) @@ -720,32 +726,6 @@ static void wait_for_tcp_memory(struct sock * sk) lock_sock(sk); } - -static int tcp_append_tail(struct sock *sk, struct sk_buff *skb, u8 *from, - int tcp_size, int seglen) -{ - int fault; - int copy; - - /* Add more stuff to the end of the skb. */ - copy = min(sk->mss - tcp_size, skb_tailroom(skb)); - copy = min(copy, seglen); - - tcp_size += copy; - - fault = copy_from_user(skb->tail, from, copy); - if (fault) - return -1; - - skb_put(skb, copy); - skb->csum = csum_partial(skb->tail - tcp_size, tcp_size, 0); - - sk->write_seq += copy; - skb->end_seq += copy; - - return copy; -} - /* * This routine copies from a user buffer into a socket, * and starts the transmit system. 
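The rewritten wait_for_tcp_connect() above loops over the classic 2.1-era wait-queue sleep instead of sleeping once. A stripped-down sketch of that idiom, with the connection test factored into an illustrative cond() callback and the error handling reduced to the signal case:

	/* Sleep until cond(sk) holds or a signal arrives; the caller holds
	 * the socket lock, which is dropped around schedule() exactly as in
	 * wait_for_tcp_connect() above.
	 */
	static int wait_for_condition(struct sock *sk, int (*cond)(struct sock *))
	{
		struct wait_queue wait = { current, NULL };

		while (!cond(sk)) {
			if (signal_pending(current))
				return -ERESTARTSYS;
			current->state = TASK_INTERRUPTIBLE;
			add_wait_queue(sk->sleep, &wait);
			release_sock(sk);
			if (!cond(sk))
				schedule();
			current->state = TASK_RUNNING;
			remove_wait_queue(sk->sleep, &wait);
			lock_sock(sk);
		}
		return 0;
	}
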
@@ -758,24 +738,9 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); /* Wait for a connection to finish. */ - while ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { - if (sk->err) - return sock_error(sk); - - if ((1 << sk->state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { - if (sk->keepopen) - send_sig(SIGPIPE, current, 0); - return -EPIPE; - } - - if (flags&MSG_DONTWAIT) - return -EAGAIN; - - if (signal_pending(current)) - return -ERESTARTSYS; - - wait_for_tcp_connect(sk); - } + if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) + if((err = wait_for_tcp_connect(sk, flags)) != 0) + return err; /* Ok commence sending. */ while(--iovlen >= 0) { @@ -785,41 +750,28 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) iov++; while(seglen > 0) { - unsigned int actual_win; - int copy; - int tmp; + int copy, tmp, queue_it; struct sk_buff *skb; if (err) return -EFAULT; /* Stop on errors. */ - if (sk->err) { - if (copied) - return copied; - return sock_error(sk); - } + if (sk->err) + goto do_sock_err; /* Make sure that we are established. */ - if (sk->shutdown & SEND_SHUTDOWN) { - if (copied) - return copied; - send_sig(SIGPIPE,current,0); - return -EPIPE; - } + if (sk->shutdown & SEND_SHUTDOWN) + goto do_shutdown; - /* Now we need to check if we have a half built packet. */ - - /* If we have queued packets.. */ + /* Now we need to check if we have a half + * built packet we can tack some data onto. + */ if (tp->send_head && !(flags & MSG_OOB)) { - int tcp_size; - - /* Tail */ - skb = sk->write_queue.prev; - tcp_size = skb->tail - - ((unsigned char *)(skb->h.th) + tp->tcp_header_len); - + copy = skb->tail - + ((unsigned char *)(skb->h.th) + + tp->tcp_header_len); /* This window_seq test is somewhat dangerous * If the remote does SWS avoidance we should * queue the best we can if not we should in @@ -827,79 +779,92 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) * a method for detecting this would be most * welcome */ - if (skb->end > skb->tail && - sk->mss - tcp_size > 0 && + if (skb_tailroom(skb) > 0 && + (sk->mss - copy) > 0 && tp->snd_nxt < skb->end_seq) { - int tcopy; - - tcopy = tcp_append_tail(sk, skb, from, - tcp_size, - seglen); - if (tcopy == -1) - return -EFAULT; - - from += tcopy; - copied += tcopy; - seglen -= tcopy; - - /* FIXME: if we're nagling we - * should send here. - */ + int last_byte_was_odd = (copy & 1); + + copy = sk->mss - copy; + if(copy > skb_tailroom(skb)) + copy = skb_tailroom(skb); + if(copy > seglen) + copy = seglen; + if(last_byte_was_odd) { + if(copy_from_user(skb_put(skb, copy), + from, copy)) + err = -EFAULT; + skb->csum = csum_partial( + (((unsigned char *)skb->h.th) + + tp->tcp_header_len), + (skb->tail - + (((unsigned char *)skb->h.th) + + tp->tcp_header_len)), 0); + } else { + skb->csum = + csum_and_copy_from_user( + from, skb_put(skb, copy), + copy, skb->csum, &err); + } + tp->write_seq += copy; + skb->end_seq += copy; + from += copy; + copied += copy; + seglen -= copy; continue; } } - /* We also need to worry about the window. - * If window < 1/2 the maximum window we've seen from this - * host, don't use it. This is sender side - * silly window prevention, as specified in RFC1122. - * (Note that this is different than earlier versions of - * SWS prevention, e.g. RFC813.). What we actually do is - * use the whole MSS. 
Since the results in the right - * edge of the packet being outside the window, it will - * be queued for later rather than sent. + /* We also need to worry about the window. If + * window < 1/2 the maximum window we've seen + * from this host, don't use it. This is + * sender side silly window prevention, as + * specified in RFC1122. (Note that this is + * different than earlier versions of SWS + * prevention, e.g. RFC813.). What we + * actually do is use the whole MSS. Since + * the results in the right edge of the packet + * being outside the window, it will be queued + * for later rather than sent. */ - copy = min(seglen, sk->mss); - actual_win = tp->snd_wnd - (tp->snd_nxt - tp->snd_una); - - if (copy > actual_win && - (((int) actual_win) >= (tp->max_window >> 1)) && - actual_win) - copy = actual_win; - - if (copy <= 0) { - printk(KERN_DEBUG "sendmsg: copy < 0\n"); - return -EIO; - } + copy = tp->snd_wnd - (tp->snd_nxt - tp->snd_una); + if(copy >= (tp->max_window >> 1)) + copy = min(copy, sk->mss); + else + copy = sk->mss; + if(copy > seglen) + copy = seglen; - /* If tp->packets_out > 0 segment will be nagled - * else we kick it right away. - */ - tmp = MAX_HEADER + sk->prot->max_header + + tmp = MAX_HEADER + sk->prot->max_header + sizeof(struct sk_buff) + 15; - if (copy < min(sk->mss, tp->max_window >> 1) && - !(flags & MSG_OOB) && tp->packets_out) + queue_it = 0; + if (copy < min(sk->mss, tp->max_window >> 1) && + !(flags & MSG_OOB)) { tmp += min(sk->mss, tp->max_window); - else - tmp += copy; + /* What is happening here is that we want to + * tack on later members of the users iovec + * if possible into a single frame. When we + * leave this loop our caller checks to see if + * we can send queued frames onto the wire. + * See tcp_v[46]_sendmsg() for this. + */ + queue_it = 1; + } else { + tmp += copy; + } skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL); /* If we didn't get any memory, we need to sleep. */ if (skb == NULL) { sk->socket->flags |= SO_NOSPACE; if (flags&MSG_DONTWAIT) { - if (copied) - return copied; - return -EAGAIN; + err = -EAGAIN; + goto do_interrupted; } - if (signal_pending(current)) { - if (copied) - return copied; - return -ERESTARTSYS; + err = -ERESTARTSYS; + goto do_interrupted; } - wait_for_tcp_memory(sk); continue; } @@ -910,9 +875,8 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) tmp = tp->af_specific->build_net_header(sk, skb); if (tmp < 0) { kfree_skb(skb); - if (copied) - return(copied); - return(tmp); + err = tmp; + goto do_interrupted; } skb->h.th =(struct tcphdr *) @@ -920,7 +884,6 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) seglen -= copy; tcp_build_header_data(skb->h.th, sk, seglen || iovlen); - /* FIXME: still need to think about SACK options here. 
*/ if (flags & MSG_OOB) { skb->h.th->urg = 1; @@ -933,21 +896,29 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) from += copy; copied += copy; - sk->write_seq += copy; + tp->write_seq += copy; - tcp_send_skb(sk, skb); - - release_sock(sk); - lock_sock(sk); + tcp_send_skb(sk, skb, queue_it); } } - sk->err = 0; - if (err) return -EFAULT; - return copied; + +do_sock_err: + if(copied) + return copied; + return sock_error(sk); +do_shutdown: + if(copied) + return copied; + send_sig(SIGPIPE, current, 0); + return -EPIPE; +do_interrupted: + if(copied) + return copied; + return err; } /* @@ -980,7 +951,7 @@ static int tcp_recv_urg(struct sock * sk, int nonblock, struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* No URG data to read. */ - if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ) + if (sk->urginline || !tp->urg_data || tp->urg_data == URG_READ) return -EINVAL; /* Yes this is right ! */ if (sk->err) @@ -1000,18 +971,10 @@ static int tcp_recv_urg(struct sock * sk, int nonblock, } lock_sock(sk); - if (sk->urg_data & URG_VALID) { - char c = sk->urg_data; + if (tp->urg_data & URG_VALID) { + char c = tp->urg_data; if (!(flags & MSG_PEEK)) - sk->urg_data = URG_READ; - - if(len>0) - { - err = memcpy_toiovec(msg->msg_iov, &c, 1); - msg->msg_flags|=MSG_OOB; - } - else - msg->msg_flags|=MSG_TRUNC; + tp->urg_data = URG_READ; if(msg->msg_name) tp->af_specific->addr2sockaddr(sk, (struct sockaddr *) @@ -1023,6 +986,15 @@ static int tcp_recv_urg(struct sock * sk, int nonblock, /* Read urgent data. */ msg->msg_flags|=MSG_OOB; release_sock(sk); + + if(len>0) + { + err = memcpy_toiovec(msg->msg_iov, &c, 1); + msg->msg_flags|=MSG_OOB; + } + else + msg->msg_flags|=MSG_TRUNC; + return err ? -EFAULT : 1; } release_sock(sk); @@ -1044,45 +1016,37 @@ static int tcp_recv_urg(struct sock * sk, int nonblock, static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb) { - sk->tp_pinfo.af_tcp.delayed_acks++; - __skb_unlink(skb, &sk->receive_queue); kfree_skb(skb); } - -static void cleanup_rbuf(struct sock *sk) +/* Clean up the receive buffer for full frames taken by the user, + * then send an ACK if necessary. COPIED is the number of bytes + * tcp_recvmsg has given to the user so far, it speeds up the + * calculation of whether or not we must ACK for the sake of + * a window update. + */ +static void cleanup_rbuf(struct sock *sk, int copied) { struct sk_buff *skb; - struct tcp_opt *tp; /* NOTE! The socket must be locked, so that we don't get * a messed-up receive queue. */ while ((skb=skb_peek(&sk->receive_queue)) != NULL) { - if (!skb->used || atomic_read(&skb->users)>1) + if (!skb->used || atomic_read(&skb->users) > 1) break; tcp_eat_skb(sk, skb); } SOCK_DEBUG(sk, "sk->rspace = %lu\n", sock_rspace(sk)); - tp = &(sk->tp_pinfo.af_tcp); - - /* We send a ACK if the sender is blocked - * else let tcp_data deal with the acking policy. + /* We send an ACK if we can now advertise a non-zero window + * which has been raised "significantly". */ - if (tp->delayed_acks) { - __u32 rcv_wnd; - - /* FIXME: double check this rule, then check against - * other use of similar rules. Abtract if possible. 
- */ - rcv_wnd = tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup); - - if ((rcv_wnd < sk->mss) && (sock_rspace(sk) > rcv_wnd)) - tcp_read_wakeup(sk); - } + if((copied > 0) && + (copied >= tcp_receive_window(&sk->tp_pinfo.af_tcp))) + tcp_read_wakeup(sk); } @@ -1100,7 +1064,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, volatile u32 *seq; /* So gcc doesn't overoptimise */ unsigned long used; int err = 0; - int target = 1; /* Read at least this may bytes */ + int target = 1; /* Read at least this many bytes */ if (sk->state == TCP_LISTEN) return -ENOTCONN; @@ -1113,8 +1077,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, * the multi-reader case neatly (memcpy_to/fromfs might be * inline and thus not flush cached variables otherwise). */ - peek_seq = sk->copied_seq; - seq = &sk->copied_seq; + peek_seq = tp->copied_seq; + seq = &tp->copied_seq; if (flags & MSG_PEEK) seq = &peek_seq; @@ -1129,7 +1093,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, u32 offset; /* Are we at urgent data? Stop if we have read anything. */ - if (copied && sk->urg_data && sk->urg_seq == *seq) + if (copied && tp->urg_data && tp->urg_seq == *seq) break; /* We need to check signals first, to get correct SIGURG @@ -1200,7 +1164,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, break; } - cleanup_rbuf(sk); + cleanup_rbuf(sk, copied); release_sock(sk); sk->socket->flags |= SO_WAITDATA; schedule(); @@ -1222,8 +1186,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, used = len; /* Do we have urgent data here? */ - if (sk->urg_data) { - u32 urg_offset = sk->urg_seq - *seq; + if (tp->urg_data) { + u32 urg_offset = tp->urg_seq - *seq; if (urg_offset < used) { if (!urg_offset) { if (!sk->urginline) { @@ -1264,8 +1228,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, */ atomic_dec(&skb->users); - if (after(sk->copied_seq,sk->urg_seq)) - sk->urg_data = 0; + if (after(tp->copied_seq,tp->urg_seq)) + tp->urg_data = 0; if (used + offset < skb->len) continue; @@ -1303,7 +1267,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, current->state = TASK_RUNNING; /* Clean up data we have read: This will do ACK frames. */ - cleanup_rbuf(sk); + cleanup_rbuf(sk, copied); release_sock(sk); return copied; } @@ -1356,8 +1320,7 @@ static int tcp_close_state(struct sock *sk, int dead) * reset mistake. */ if(dead && ns==TCP_FIN_WAIT2) { - int timer_active=del_timer(&sk->timer); - if(timer_active) + if(sk->timer.prev && del_timer(&sk->timer)) add_timer(&sk->timer); else tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout); @@ -1410,6 +1373,7 @@ static inline int closing(struct sock * sk) void tcp_close(struct sock *sk, unsigned long timeout) { struct sk_buff *skb; + int data_was_unread = 0; /* We need to grab some memory, and put together a FIN, * and then put it into the queue to be sent. @@ -1421,7 +1385,6 @@ void tcp_close(struct sock *sk, unsigned long timeout) tcp_close_pending(sk); release_sock(sk); sk->dead = 1; - sk->prot->unhash(sk); return; } @@ -1435,14 +1398,30 @@ void tcp_close(struct sock *sk, unsigned long timeout) * descriptor close, not protocol-sourced closes, because the * reader process may not have drained the data yet! */ - while((skb=skb_dequeue(&sk->receive_queue))!=NULL) + while((skb=skb_dequeue(&sk->receive_queue))!=NULL) { + data_was_unread++; kfree_skb(skb); + } - /* Timeout is not the same thing - however the code likes - * to send both the same way (sigh). 
+ /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section + * 3.10, we send a RST here because data was lost. To + * witness the awful effects of the old behavior of always + * doing a FIN, run an older 2.1.x kernel or 2.0.x, start + * a bulk GET in an FTP client, suspend the process, wait + * for the client to advertise a zero window, then kill -9 + * the FTP client, wheee... Note: timeout is always zero + * in such a case. */ - if (tcp_close_state(sk,1)==1) + if(data_was_unread != 0) { + /* Unread data was tossed, zap the connection. */ + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk); + } else if (tcp_close_state(sk,1)) { + /* We FIN if the application ate all the data before + * zapping the connection. + */ tcp_send_fin(sk); + } if (timeout) { struct task_struct *tsk = current; @@ -1470,8 +1449,7 @@ void tcp_close(struct sock *sk, unsigned long timeout) * we may need to set up a timer. */ if (sk->state==TCP_FIN_WAIT2) { - int timer_active=del_timer(&sk->timer); - if(timer_active) + if(sk->timer.prev && del_timer(&sk->timer)) add_timer(&sk->timer); else tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout); @@ -1479,9 +1457,6 @@ void tcp_close(struct sock *sk, unsigned long timeout) sk->dead = 1; release_sock(sk); - - if(sk->state == TCP_CLOSE) - sk->prot->unhash(sk); } /* @@ -1538,13 +1513,12 @@ struct sock *tcp_accept(struct sock *sk, int flags) /* If this is a non blocking socket don't sleep */ error = EAGAIN; if (flags & O_NONBLOCK) - goto out; + goto out; error = ERESTARTSYS; req = wait_for_connect(sk, &prev); if (!req) - goto out; - error = 0; + goto out; } tcp_synq_unlink(tp, req, prev); @@ -1647,9 +1621,23 @@ void tcp_set_keepalive(struct sock *sk, int val) __initfunc(void tcp_init(void)) { tcp_openreq_cachep = kmem_cache_create("tcp_open_request", - sizeof(struct open_request), + sizeof(struct open_request), 0, SLAB_HWCACHE_ALIGN, NULL, NULL); if(!tcp_openreq_cachep) panic("tcp_init: Cannot alloc open_request cache."); + + tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket", + sizeof(struct tcp_bind_bucket), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if(!tcp_bucket_cachep) + panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); + + tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket", + sizeof(struct tcp_tw_bucket), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if(!tcp_timewait_cachep) + panic("tcp_init: Cannot alloc tcp_tw_bucket cache."); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 841359739..4b7dcc9e9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.66 1998/01/15 22:40:29 freitag Exp $ + * Version: $Id: tcp_input.c,v 1.84 1998/03/15 03:23:20 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -67,57 +67,54 @@ static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack, extern int sysctl_tcp_fin_timeout; +/* These are on by default so the code paths get tested. + * For the final 2.2 this may be undone at our discretion. 
-DaveM + */ +int sysctl_tcp_timestamps = 1; +int sysctl_tcp_window_scaling = 1; + int sysctl_tcp_cong_avoidance; int sysctl_tcp_hoe_retransmits; -int sysctl_tcp_sack; -int sysctl_tcp_tsack; -int sysctl_tcp_timestamps; -int sysctl_tcp_window_scaling; int sysctl_tcp_syncookies = SYNC_INIT; int sysctl_tcp_stdurg; +int sysctl_tcp_rfc1337; static tcp_sys_cong_ctl_t tcp_sys_cong_ctl_f = &tcp_cong_avoid_vanj; -/* - * Called each time to estimate the delayed ack timeout. This is - * how it should be done so a fast link isnt impacted by ack delay. - * - * I think we need a medium deviation here also... - * The estimated value is changing to fast +/* There is something which you must keep in mind when you analyze the + * behavior of the tp->ato delayed ack timeout interval. When a + * connection starts up, we want to ack as quickly as possible. The + * problem is that "good" TCP's do slow start at the beginning of data + * transmission. The means that until we send the first few ACK's the + * sender will sit on his end and only queue most of his data, because + * he can only send snd_cwnd unacked packets at any given time. For + * each ACK we send, he increments snd_cwnd and transmits more of his + * queue. -DaveM */ - static void tcp_delack_estimator(struct tcp_opt *tp) { - int m; - - /* Delayed ACK time estimator. */ - - m = jiffies - tp->lrcvtime; - - tp->lrcvtime = jiffies; + if(tp->ato == 0) { + tp->lrcvtime = jiffies; - if (m < 0) - return; - - /* if the mesured value is bigger than - * twice the round trip time ignore it. - */ - if ((m << 2) <= tp->srtt) { - m -= (tp->iat >> 3); - tp->iat += m; - - if (m <0) - m = -m; - - m -= (tp->iat_mdev >> 2); - tp->iat_mdev += m; + /* Help sender leave slow start quickly, + * this sets our initial ato value. + */ + tcp_enter_quickack_mode(tp); + } else { + int m = jiffies - tp->lrcvtime; - tp->ato = (tp->iat >> 3) + (tp->iat_mdev >> 2); + tp->lrcvtime = jiffies; + if(m <= 0) + m = 1; + if(m > tp->rto) + tp->ato = tp->rto; + else + tp->ato = (tp->ato >> 1) + m; - if (tp->ato < HZ/50) - tp->ato = HZ/50; - } else - tp->ato = 0; + /* We are not in "quick ack" mode. */ + if(tp->ato <= (HZ/100)) + tp->ato = ((HZ/100)*2); + } } /* Called to compute a smoothed rtt estimate. The data fed to this @@ -132,9 +129,9 @@ static void tcp_delack_estimator(struct tcp_opt *tp) static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) { - long m; - /* - * The following amusing code comes from Jacobson's + long m = mrtt; /* RTT */ + + /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. * This is designed to be as fast as possible @@ -143,12 +140,9 @@ static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) * On a 1990 paper the rto value is changed to: * RTO = rtt + 4 * mdev */ - - m = mrtt; /* RTT */ - + if(m == 0) + m = 1; if (tp->srtt != 0) { - if(m<=0) - m=1; /* IS THIS RIGHT FOR <0 ??? */ m -= (tp->srtt >> 3); /* m is now error in rtt est */ tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) @@ -202,19 +196,17 @@ extern __inline__ void tcp_replace_ts_recent(struct tcp_opt *tp, __u32 end_seq) */ if (!before(end_seq,tp->last_ack_sent)) { tp->ts_recent = tp->rcv_tsval; - /* FIXME: need a corse timestamp. Days uptime - * would be good. 
- */ tp->ts_recent_stamp = jiffies; } } +#define PAWS_24DAYS (HZ * 60 * 60 * 24 * 24) + extern __inline__ int tcp_paws_discard(struct tcp_opt *tp) { - /* FIXME: must check that ts_recent is not - * more than 24 days old here. Yuck. - */ - return ((s32)(tp->rcv_tsval-tp->ts_recent) < 0); + /* ts_recent must be younger than 24 days */ + return (((jiffies - tp->ts_recent_stamp) >= PAWS_24DAYS) || + ((s32)(tp->rcv_tsval-tp->ts_recent) < 0)); } @@ -257,8 +249,6 @@ static void tcp_reset(struct sock *sk, struct sk_buff *skb) /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->state) { - case TCP_TIME_WAIT: - break; case TCP_SYN_SENT: sk->err = ECONNREFUSED; break; @@ -268,23 +258,8 @@ static void tcp_reset(struct sock *sk, struct sk_buff *skb) default: sk->err = ECONNRESET; }; -#ifdef CONFIG_TCP_RFC1337 - /* - * Time wait assassination protection [RFC1337] - * - * This is a good idea, but causes more sockets to take time to close. - * - * Ian Heavens has since shown this is an inadequate fix for the protocol - * bug in question. - */ - if(sk->state!=TCP_TIME_WAIT) { - tcp_set_state(sk,TCP_CLOSE); - sk->shutdown = SHUTDOWN_MASK; - } -#else tcp_set_state(sk,TCP_CLOSE); sk->shutdown = SHUTDOWN_MASK; -#endif if (!sk->dead) sk->state_change(sk); } @@ -302,7 +277,6 @@ void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp, int no_fancy) int length=(th->doff*4)-sizeof(struct tcphdr); ptr = (unsigned char *)(th + 1); - tp->sacks = 0; tp->saw_tstamp = 0; while(length>0) { @@ -336,10 +310,6 @@ void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp, int no_fancy) tp->snd_wscale = *(__u8 *)ptr; } break; - case TCPOPT_SACK_PERM: - if(opsize==TCPOLEN_SACK_PERM && th->syn) - if (sysctl_tcp_sack && !no_fancy) - tp->sack_ok = 1; case TCPOPT_TIMESTAMP: if(opsize==TCPOLEN_TIMESTAMP) { /* Cheaper to set again then to @@ -353,18 +323,6 @@ void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp, int no_fancy) } } break; - case TCPOPT_SACK: - if (no_fancy || !sysctl_tcp_sack) - break; - tp->sacks = (opsize-2)>>3; - if (tp->sacks<<3 == opsize-2) { - int i; - for (i = 0; i < tp->sacks; i++) { - tp->left_sack[i] = ntohl(((__u32 *)ptr)[2*i]); - tp->right_sack[i] = ntohl(((__u32 *)ptr)[2*i+1]); - } - } else - tp->sacks = 0; } ptr+=opsize-2; length-=opsize; @@ -374,7 +332,7 @@ void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp, int no_fancy) /* Fast parse options. This hopes to only see timestamps. * If it is wrong it falls back on tcp_parse_option(). - * This should probably get extended for timestamps + SACK as well. + * This should probably get extended for timestamps as well. * Assembly code anyone? 
-- erics */ static __inline__ int tcp_fast_parse_options(struct tcphdr *th, struct tcp_opt *tp) @@ -384,14 +342,12 @@ static __inline__ int tcp_fast_parse_options(struct tcphdr *th, struct tcp_opt * return 0; if (th->doff == sizeof(struct tcphdr)>>2) { tp->saw_tstamp = 0; - tp->sacks = 0; return 0; - } else if (th->doff == (sizeof(struct tcphdr)>>2)+3) { + } else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { __u32 *ptr = (__u32 *)(th + 1); - if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) - | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { + if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { tp->saw_tstamp = 1; - tp->sacks = 0; tp->rcv_tsval = ntohl(*++ptr); tp->rcv_tsecr = ntohl(*++ptr); return 1; @@ -401,89 +357,6 @@ static __inline__ int tcp_fast_parse_options(struct tcphdr *th, struct tcp_opt * return 1; } -#if 0 - -/* - * This is the old fast retransmit code. It will go away eventually. -- erics - */ - -/* - * See draft-stevens-tcpca-spec-01 for documentation. - */ - -static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) -{ - struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); - - /* FIXME: if we are already retransmitting should this code - * be skipped? [Floyd high_seq check sort of does this] - * The case I'm worried about is falling into a fast - * retransmit on a link with a congestion window of 1 or 2. - * There was some evidence in 2.0.x that this was problem - * on really slow links (1200 or 2400 baud). I need to - * try this situation again and see what happens. - */ - - /* - * An ACK is a duplicate if: - * (1) it has the same sequence number as the largest number we've - * seen, - * (2) it has the same window as the last ACK, - * (3) we have outstanding data that has not been ACKed - * (4) The packet was not carrying any data. - * (5) [From Floyds paper on fast retransmit wars] - * The packet acked data after high_seq; - */ - - if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) { - /* 1. When the third duplicate ack is received, set ssthresh - * to one half the current congestion window, but no less - * than two segments. Retransmit the missing segment. - */ - if (tp->high_seq == 0 || after(ack, tp->high_seq)) { - tp->dup_acks++; - - if (tp->dup_acks == 3) { - tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2); - tp->snd_cwnd = tp->snd_ssthresh + 3; - tcp_do_retransmit(sk, 0); - - /* Careful not to timeout just after fast - * retransmit! - */ - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - } - } - - /* 2. Each time another duplicate ACK arrives, increment - * cwnd by the segment size. [...] Transmit a packet... - * - * Packet transmission will be done on normal flow processing - * since we're not in "retransmit mode". - */ - if (tp->dup_acks >= 3) { - tp->dup_acks++; - tp->snd_cwnd++; - } - } else { - /* 3. When the next ACK arrives that acknowledges new data, - * set cwnd to ssthresh. - */ - if (tp->dup_acks >= 3) { - tp->retrans_head = NULL; - tp->snd_cwnd = max(tp->snd_ssthresh, 1); - tp->retransmits = 0; - } - tp->dup_acks = 0; - - /* FIXME: This is wrong if the new ack that arrives - * is below the value for high_seq. - */ - tp->high_seq = 0; - } -} -#endif - #define FLAG_DATA 0x01 #define FLAG_WIN_UPDATE 0x02 #define FLAG_DATA_ACKED 0x04 @@ -579,9 +452,8 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) * not indicate a packet left the system. 
* We can test this by just checking * if ack changed from snd_una, since - * the only way to get here without changing - * advancing from snd_una is if this was a - * window update. + * the only way to get here without advancing + * from snd_una is if this was a window update. */ if (ack != tp->snd_una && before(ack,tp->high_seq)) { tcp_do_retransmit(sk, 0); @@ -596,9 +468,6 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) clear_fast_retransmit(sk); } } - } else { - /* Clear any aborted fast retransmit starts. */ - tp->dup_acks = 0; } } @@ -649,7 +518,6 @@ static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack, expected = (tp->snd_nxt - tp->snd_una) * inv_basertt; - /* XXX sk->mss should move into tcp_opt as well -DaveM */ inv_basebd = sk->mss * inv_basertt; /* Slow Start */ @@ -731,13 +599,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq, int acked = 0; while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) { -#ifdef TCP_DEBUG - /* Check for a bug. */ - if (skb->next != (struct sk_buff*) &sk->write_queue && - after(skb->end_seq, skb->next->seq)) - printk(KERN_DEBUG "INET: tcp_input.c: *** " - "bug send_list out of order.\n"); -#endif /* If our packet is before the ack sequence we can * discard it as it's confirmed to have arrived the * other end. @@ -745,12 +606,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq, if (after(skb->end_seq, ack)) break; -#if 0 - SOCK_DEBUG(sk, "removing seg %x-%x from retransmit queue\n", - skb->seq, skb->end_seq); -#endif - - acked = FLAG_DATA_ACKED; + /* Initial outgoing SYN's get put onto the write_queue + * just like anything else we transmit. It is not + * true data, and if we misinform our callers that + * this ACK acks real data, we will erroneously exit + * connection startup slow start one packet too + * quickly. This is severely frowned upon behavior. + */ + if(!skb->h.th->syn) + acked = FLAG_DATA_ACKED; /* FIXME: packet counting may break if we have to * do packet "repackaging" for stacks that don't @@ -766,11 +630,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq, kfree_skb(skb); } - if (acked) { + if (acked) tp->retrans_head = NULL; - if (!sk->dead) - sk->write_space(sk); - } + return acked; } @@ -795,6 +657,66 @@ static void tcp_ack_probe(struct sock *sk, __u32 ack) } } +/* Read draft-ietf-tcplw-high-performance before mucking + * with this code. (Superceeds RFC1323) + */ +static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp, + u32 seq, u32 ack, int flag) +{ + __u32 seq_rtt = (jiffies-tp->rcv_tsecr); + tcp_rtt_estimator(tp, seq_rtt); + if (tp->retransmits) { + if (tp->packets_out == 0) { + tp->retransmits = 0; + tp->backoff = 0; + tcp_set_rto(tp); + } else { + /* Still retransmitting, use backoff */ + tcp_set_rto(tp); + tp->rto = tp->rto << tp->backoff; + } + } else { + tcp_set_rto(tp); + if (flag & FLAG_DATA_ACKED) + (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt); + } + /* NOTE: safe here so long as cong_ctl doesn't use rto */ + tcp_bound_rto(tp); +} + +static void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) +{ + struct sk_buff *skb; + long when; + + skb = skb_peek(&sk->write_queue); + when = tp->rto - (jiffies - skb->when); + + /* FIXME: This assumes that when we are retransmitting + * we should only ever respond with one packet. + * This means congestion windows should not grow + * during recovery. In 2.0.X we allow the congestion + * window to grow. 
It is not clear to me which + * decision is correct. The RFCs should be double + * checked as should the behavior of other stacks. + * Also note that if we do want to allow the + * congestion window to grow during retransmits + * we have to fix the call to congestion window + * updates so that it works during retransmission. + */ + if (tp->retransmits) { + tp->retrans_head = NULL; + + /* This is tricky. We are retransmiting a + * segment of a window when congestion occured. + */ + tcp_do_retransmit(sk, 0); + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + } else { + tcp_reset_xmit_timer(sk, TIME_RETRANS, when); + } +} + /* * This routine deals with incoming acks, but not outgoing ones. */ @@ -806,7 +728,6 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, int flag = 0; u32 seq = 0; u32 seq_rtt = 0; - struct sk_buff *skb; if(sk->zapped) return(1); /* Dead, can't ack any more so why bother */ @@ -838,7 +759,7 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, */ if (before(tp->snd_wl1, ack_seq) || (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) { - unsigned long nwin = ntohs(th->window) << tp->snd_wscale; + u32 nwin = ntohs(th->window) << tp->snd_wscale; if ((tp->snd_wl2 != ack) || (nwin > tp->snd_wnd)) { flag |= FLAG_WIN_UPDATE; @@ -869,28 +790,7 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, /* If we have a timestamp, we always do rtt estimates. */ if (tp->saw_tstamp) { - /* Read draft-ietf-tcplw-high-performance before mucking - * with this code. (Superceeds RFC1323) - */ - seq_rtt = (jiffies-tp->rcv_tsecr); - tcp_rtt_estimator(tp, seq_rtt); - if (tp->retransmits) { - if (tp->packets_out == 0) { - tp->retransmits = 0; - tp->backoff = 0; - tcp_set_rto(tp); - } else { - /* Still retransmitting, use backoff */ - tcp_set_rto(tp); - tp->rto = tp->rto << tp->backoff; - } - } else { - tcp_set_rto(tp); - if (flag & FLAG_DATA_ACKED) - (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt); - } - /* NOTE: safe here so long as cong_ctl doesn't use rto */ - tcp_bound_rto(tp); + tcp_ack_saw_tstamp(sk, tp, seq, ack, flag); } else { /* If we were retransmiting don't count rtt estimate. */ if (tp->retransmits) { @@ -916,51 +816,217 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, } if (tp->packets_out) { - if (flag & FLAG_DATA_ACKED) { - long when; - - skb = skb_peek(&sk->write_queue); - when = tp->rto - (jiffies - skb->when); - - /* FIXME: This assumes that when we are retransmitting - * we should only ever respond with one packet. - * This means congestion windows should not grow - * during recovery. In 2.0.X we allow the congestion - * window to grow. It is not clear to me which - * decision is correct. The RFCs should be double - * checked as should the behavior of other stacks. - * Also note that if we do want to allow the - * congestion window to grow during retransmits - * we have to fix the call to congestion window - * updates so that it works during retransmission. - */ - if (tp->retransmits) { - tp->retrans_head = NULL; - - /* This is tricky. We are retransmiting a - * segment of a window when congestion occured. 
- */ - tcp_do_retransmit(sk, 0); - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); - } else - tcp_reset_xmit_timer(sk, TIME_RETRANS, when); - } - } else + if (flag & FLAG_DATA_ACKED) + tcp_ack_packets_out(sk, tp); + } else { tcp_clear_xmit_timer(sk, TIME_RETRANS); + } - tcp_fast_retrans(sk, ack, (flag & (FLAG_DATA|FLAG_WIN_UPDATE))); - + flag &= (FLAG_DATA | FLAG_WIN_UPDATE); + if ((ack == tp->snd_una && tp->packets_out && flag == 0) || + (tp->high_seq != 0)) { + tcp_fast_retrans(sk, ack, flag); + } else { + /* Clear any aborted fast retransmit starts. */ + tp->dup_acks = 0; + } /* Remember the highest ack received. */ tp->snd_una = ack; - return 1; uninteresting_ack: - SOCK_DEBUG(sk, "Ack ignored %u %u\n", ack, tp->snd_nxt); return 0; } +/* New-style handling of TIME_WAIT sockets. */ +static void tcp_timewait_kill(unsigned long __arg) +{ + struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)__arg; + + /* Zap the timer. */ + del_timer(&tw->timer); + + /* Unlink from various places. */ + if(tw->bind_next) + tw->bind_next->bind_pprev = tw->bind_pprev; + *(tw->bind_pprev) = tw->bind_next; + if(tw->tb->owners == NULL) + tcp_inc_slow_timer(TCP_SLT_BUCKETGC); + + if(tw->next) + tw->next->pprev = tw->pprev; + *tw->pprev = tw->next; + + /* We decremented the prot->inuse count when we entered TIME_WAIT + * and the sock from which this came was destroyed. + */ + tw->sklist_next->sklist_prev = tw->sklist_prev; + tw->sklist_prev->sklist_next = tw->sklist_next; + + /* Ok, now free it up. */ + kmem_cache_free(tcp_timewait_cachep, tw); +} + +/* We come here as a special case from the AF specific TCP input processing, + * and the SKB has no owner. Essentially handling this is very simple, + * we just keep silently eating rx'd packets until none show up for the + * entire timeout period. The only special cases are for BSD TIME_WAIT + * reconnects and SYN/RST bits being set in the TCP header. + */ +int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, + struct tcphdr *th, void *opt, __u16 len) +{ + /* RFC 1122: + * "When a connection is [...] on TIME-WAIT state [...] + * [a TCP] MAY accept a new SYN from the remote TCP to + * reopen the connection directly, if it: + * + * (1) assigns its initial sequence number for the new + * connection to be larger than the largest sequence + * number it used on the previous connection incarnation, + * and + * + * (2) returns to TIME-WAIT state if the SYN turns out + * to be an old duplicate". + */ + if(th->syn && !th->rst && after(skb->seq, tw->rcv_nxt)) { + struct sock *sk; + struct tcp_func *af_specific = tw->af_specific; + __u32 isn; + + isn = tw->rcv_nxt + 128000; + if(isn == 0) + isn++; + tcp_timewait_kill((unsigned long)tw); + sk = af_specific->get_sock(skb, th); + if(sk == NULL || !ipsec_sk_policy(sk,skb)) + return 0; + skb_set_owner_r(skb, sk); + af_specific = sk->tp_pinfo.af_tcp.af_specific; + if(af_specific->conn_request(sk, skb, opt, isn) < 0) + return 1; /* Toss a reset back. */ + return 0; /* Discard the frame. */ + } + + /* Check RST or SYN */ + if(th->rst || th->syn) { + /* This is TIME_WAIT assasination, in two flavors. + * Oh well... nobody has a sufficient solution to this + * protocol bug yet. + */ + if(sysctl_tcp_rfc1337 == 0) + tcp_timewait_kill((unsigned long)tw); + + if(!th->rst) + return 1; /* toss a reset back */ + } else { + if(th->ack) { + /* In this case we must reset the TIMEWAIT timer. 
*/ + del_timer(&tw->timer); + tw->timer.expires = jiffies + TCP_TIMEWAIT_LEN; + add_timer(&tw->timer); + } + } + return 0; /* Discard the frame. */ +} + +/* Enter the time wait state. This is always called from BH + * context. Essentially we whip up a timewait bucket, copy the + * relevant info into it from the SK, and mess with hash chains + * and list linkage. + */ +static __inline__ void tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) +{ + struct sock **head, *sktw; + + /* Step 1: Remove SK from established hash. */ + if(sk->next) + sk->next->pprev = sk->pprev; + *sk->pprev = sk->next; + sk->pprev = NULL; + tcp_reg_zap(sk); + + /* Step 2: Put TW into bind hash where SK was. */ + tw->tb = (struct tcp_bind_bucket *)sk->prev; + if((tw->bind_next = sk->bind_next) != NULL) + sk->bind_next->bind_pprev = &tw->bind_next; + tw->bind_pprev = sk->bind_pprev; + *sk->bind_pprev = (struct sock *)tw; + + /* Step 3: Same for the protocol sklist. */ + (tw->sklist_next = sk->sklist_next)->sklist_prev = (struct sock *)tw; + (tw->sklist_prev = sk->sklist_prev)->sklist_next = (struct sock *)tw; + sk->sklist_next = NULL; + sk->prot->inuse--; + + /* Step 4: Hash TW into TIMEWAIT half of established hash table. */ + head = &tcp_established_hash[sk->hashent + (TCP_HTABLE_SIZE/2)]; + sktw = (struct sock *)tw; + if((sktw->next = *head) != NULL) + (*head)->pprev = &sktw->next; + *head = sktw; + sktw->pprev = head; +} + +void tcp_time_wait(struct sock *sk) +{ + struct tcp_tw_bucket *tw; + + tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); + if(tw != NULL) { + /* Give us an identity. */ + tw->daddr = sk->daddr; + tw->rcv_saddr = sk->rcv_saddr; + tw->bound_dev_if= sk->bound_dev_if; + tw->num = sk->num; + tw->state = TCP_TIME_WAIT; + tw->family = sk->family; + tw->source = sk->dummy_th.source; + tw->dest = sk->dummy_th.dest; + tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt; + tw->af_specific = sk->tp_pinfo.af_tcp.af_specific; + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if(tw->family == AF_INET6) { + memcpy(&tw->v6_daddr, + &sk->net_pinfo.af_inet6.daddr, + sizeof(struct in6_addr)); + memcpy(&tw->v6_rcv_saddr, + &sk->net_pinfo.af_inet6.rcv_saddr, + sizeof(struct in6_addr)); + } +#endif + /* Linkage updates. */ + tcp_tw_hashdance(sk, tw); + + /* Get the TIME_WAIT timeout firing. */ + init_timer(&tw->timer); + tw->timer.function = tcp_timewait_kill; + tw->timer.data = (unsigned long) tw; + tw->timer.expires = jiffies + TCP_TIMEWAIT_LEN; + add_timer(&tw->timer); + + /* CLOSE the SK. */ + if(sk->state == TCP_ESTABLISHED) + tcp_statistics.TcpCurrEstab--; + sk->state = TCP_CLOSE; + net_reset_timer(sk, TIME_DONE, + min(sk->tp_pinfo.af_tcp.srtt * 2, TCP_DONE_TIME)); + } else { + /* Sorry, we're out of memory, just CLOSE this + * socket up. We've got bigger problems than + * non-graceful socket closings. + */ + tcp_set_state(sk, TCP_CLOSE); + } + + /* Prevent rcvmsg/sndmsg calls, and wake people up. */ + sk->shutdown = SHUTDOWN_MASK; + if(!sk->dead) + sk->state_change(sk); +} + /* * Process the FIN bit. This now behaves as it is supposed to work * and the FIN takes effect when it is validly part of sequence @@ -976,17 +1042,9 @@ uninteresting_ack: * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. */ -static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) +static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - if(sk->state == TCP_SYN_SENT) { - /* RFC793 says to drop the segment and return. 
*/ - return 1; - } - - /* XXX This fin_seq thing should disappear... -DaveM */ - tp->fin_seq = skb->end_seq; + sk->tp_pinfo.af_tcp.fin_seq = skb->end_seq; tcp_send_ack(sk); @@ -1013,12 +1071,6 @@ static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) case TCP_LAST_ACK: /* RFC793: Remain in the LAST-ACK state. */ break; - case TCP_TIME_WAIT: - /* Received a retransmission of the FIN, - * restart the TIME_WAIT timer. - */ - tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); - break; case TCP_FIN_WAIT1: /* This case occurs when a simultaneous close @@ -1035,21 +1087,15 @@ static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) break; case TCP_FIN_WAIT2: /* Received a FIN -- send ACK and enter TIME_WAIT. */ - tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); - sk->shutdown |= SHUTDOWN_MASK; - tcp_set_state(sk,TCP_TIME_WAIT); - break; - case TCP_CLOSE: - /* Already in CLOSE. */ + tcp_time_wait(sk); break; default: - /* Only TCP_LISTEN is left, in that case we should never - * reach this piece of code. + /* Only TCP_LISTEN and TCP_CLOSE are left, in these + * cases we should never reach this piece of code. */ printk("tcp_fin: Impossible, sk->state=%d\n", sk->state); break; }; - return 0; } /* This one checks to see if we can put data from the @@ -1060,7 +1106,7 @@ static void tcp_ofo_queue(struct sock *sk) struct sk_buff *skb; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - while ((skb = skb_peek(&sk->out_of_order_queue))) { + while ((skb = skb_peek(&tp->out_of_order_queue))) { if (after(skb->seq, tp->rcv_nxt)) break; @@ -1076,6 +1122,8 @@ static void tcp_ofo_queue(struct sock *sk) skb_unlink(skb); skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = skb->end_seq; + if(skb->h.th->fin) + tcp_fin(skb, sk, skb->h.th); } } @@ -1094,8 +1142,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) dst_confirm(sk->dst_cache); skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = skb->end_seq; + if(skb->h.th->fin) + tcp_fin(skb, sk, skb->h.th); + else + tp->delayed_acks++; tcp_ofo_queue(sk); - if (skb_queue_len(&sk->out_of_order_queue) == 0) + if (skb_queue_len(&tp->out_of_order_queue) == 0) tp->pred_flags = htonl((0x5010 << 16) | tp->snd_wnd); return; } @@ -1104,8 +1156,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (!after(skb->end_seq, tp->rcv_nxt)) { /* A retransmit, 2nd most common case. Force an imediate ack. */ SOCK_DEBUG(sk, "retransmit received: seq %X\n", skb->seq); - - tp->delayed_acks = MAX_DELAY_ACK; + tcp_enter_quickack_mode(tp); kfree_skb(skb); return; } @@ -1119,7 +1170,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) } /* Ok. This is an out_of_order segment, force an ack. */ - tp->delayed_acks = MAX_DELAY_ACK; + tp->delayed_acks++; + tcp_enter_quickack_mode(tp); /* Disable header predition. */ tp->pred_flags = 0; @@ -1127,10 +1179,10 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, skb->seq, skb->end_seq); - if (skb_peek(&sk->out_of_order_queue) == NULL) { - skb_queue_head(&sk->out_of_order_queue,skb); + if (skb_peek(&tp->out_of_order_queue) == NULL) { + skb_queue_head(&tp->out_of_order_queue,skb); } else { - for(skb1=sk->out_of_order_queue.prev; ; skb1 = skb1->prev) { + for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) { /* Already there. 
*/ if (skb->seq == skb1->seq && skb->len >= skb1->len) { skb_append(skb1, skb); @@ -1145,8 +1197,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) } /* See if we've hit the start. If so insert. */ - if (skb1 == skb_peek(&sk->out_of_order_queue)) { - skb_queue_head(&sk->out_of_order_queue,skb); + if (skb1 == skb_peek(&tp->out_of_order_queue)) { + skb_queue_head(&tp->out_of_order_queue,skb); break; } } @@ -1172,23 +1224,17 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) if (skb->len == 0 && !th->fin) return(0); - /* FIXME: don't accept data after the received fin. - * - * Would checking snd_seq against fin_seq be enough? - * If so, how do we handle that case exactly? -DaveM - */ - /* We no longer have anyone receiving data on this connection. */ tcp_data_queue(sk, skb); - if (before(tp->rcv_nxt, sk->copied_seq)) { + if (before(tp->rcv_nxt, tp->copied_seq)) { printk(KERN_DEBUG "*** tcp.c:tcp_data bug acked < copied\n"); - tp->rcv_nxt = sk->copied_seq; + tp->rcv_nxt = tp->copied_seq; } - tp->delayed_acks++; - - /* Now tell the user we may have some data. */ + /* Above, tcp_data_queue() increments delayed_acks appropriately. + * Now tell the user we may have some data. + */ if (!sk->dead) { SOCK_DEBUG(sk, "Data wakeup.\n"); sk->data_ready(sk,0); @@ -1204,23 +1250,10 @@ static void tcp_data_snd_check(struct sock *sk) if ((skb = tp->send_head)) { if (!after(skb->end_seq, tp->snd_una + tp->snd_wnd) && tp->packets_out < tp->snd_cwnd ) { - /* Add more data to the send queue. */ - - /* FIXME: the congestion window is checked - * again in tcp_write_xmit anyway?! -- erics - * - * I think it must, it bumps tp->packets_out for - * each packet it fires onto the wire. -DaveM - */ + /* Put more data onto the wire. */ tcp_write_xmit(sk); - if(!sk->dead) - sk->write_space(sk); } else if (tp->packets_out == 0 && !tp->pending) { - /* Data to queue but no room. */ - - /* FIXME: Is it right to do a zero window probe into - * a congestion window limited window??? -- erics - */ + /* Start probing the receivers window. */ tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); } } @@ -1240,12 +1273,24 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk) * - delay time <= 0.5 HZ * - we don't have a window update to send * - must send at least every 2 full sized packets + * + * With an extra heuristic to handle loss of packet + * situations and also helping the sender leave slow + * start in an expediant manner. */ - if (tp->delayed_acks >= MAX_DELAY_ACK || tcp_raise_window(sk)) + /* Two full frames received or... */ + if (((tp->rcv_nxt - tp->rcv_wup) >= (sk->mss << 1)) || + /* We will update the window "significantly" or... */ + tcp_raise_window(sk) || + /* We entered "quick ACK" mode */ + tcp_in_quickack_mode(tp)) { + /* Then ack it now */ tcp_send_ack(sk); - else - tcp_send_delayed_ack(sk, HZ/2); + } else { + /* Else, send delayed ack. */ + tcp_send_delayed_ack(tp, HZ/2); + } } static __inline__ void tcp_ack_snd_check(struct sock *sk) @@ -1279,11 +1324,11 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) ptr += ntohl(th->seq); /* Ignore urgent data that we've already seen and read. */ - if (after(sk->copied_seq, ptr)) + if (after(tp->copied_seq, ptr)) return; /* Do we already have a newer (or duplicate) urgent pointer? */ - if (sk->urg_data && !after(ptr, sk->urg_seq)) + if (tp->urg_data && !after(ptr, tp->urg_seq)) return; /* Tell the world about our new urgent pointer. 
*/ @@ -1296,14 +1341,14 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) /* We may be adding urgent data when the last byte read was * urgent. To do this requires some care. We cannot just ignore - * sk->copied_seq since we would read the last urgent byte again + * tp->copied_seq since we would read the last urgent byte again * as data, nor can we alter copied_seq until this data arrives * or we break the sematics of SIOCATMARK (and thus sockatmark()) */ - if (sk->urg_seq == sk->copied_seq) - sk->copied_seq++; /* Move the copied sequence on correctly */ - sk->urg_data = URG_NOTYET; - sk->urg_seq = ptr; + if (tp->urg_seq == tp->copied_seq) + tp->copied_seq++; /* Move the copied sequence on correctly */ + tp->urg_data = URG_NOTYET; + tp->urg_seq = ptr; /* Disable header prediction. */ tp->pred_flags = 0; @@ -1312,17 +1357,19 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) /* This is the 'fast' part of urgent handling. */ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + /* Check if we get a new urgent pointer - normally not. */ if (th->urg) tcp_check_urg(sk,th); /* Do we wait for any urgent data? - normally not... */ - if (sk->urg_data == URG_NOTYET) { - u32 ptr = sk->urg_seq - ntohl(th->seq) + (th->doff*4); + if (tp->urg_data == URG_NOTYET) { + u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4); /* Is the urgent pointer pointing into this packet? */ if (ptr < len) { - sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th); + tp->urg_data = URG_VALID | *(ptr + (unsigned char *) th); if (!sk->dead) sk->data_ready(sk,0); } @@ -1335,33 +1382,39 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len */ static void prune_queue(struct sock *sk) { - struct tcp_opt *tp; + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; struct sk_buff * skb; - SOCK_DEBUG(sk, "prune_queue: c=%x\n", sk->copied_seq); + SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); /* First Clean the out_of_order queue. */ /* Start with the end because there are probably the least * useful packets (crossing fingers). */ - while ((skb = skb_dequeue_tail(&sk->out_of_order_queue))) { + while ((skb = skb_dequeue_tail(&tp->out_of_order_queue))) { kfree_skb(skb); if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) return; } - tp = &sk->tp_pinfo.af_tcp; - /* Now continue with the receive queue if it wasn't enough */ while ((skb = skb_peek_tail(&sk->receive_queue))) { + /* Never toss anything when we've seen the FIN. + * It's just too complex to recover from it. 
+ */ + if(skb->h.th->fin) + break; + /* Never remove packets that have been already acked */ if (before(skb->end_seq, tp->last_ack_sent+1)) { printk(KERN_DEBUG "prune_queue: hit acked data c=%x,%x,%x\n", - sk->copied_seq, skb->end_seq, tp->last_ack_sent); + tp->copied_seq, skb->end_seq, tp->last_ack_sent); break; } skb_unlink(skb); tp->rcv_nxt = skb->seq; + SOCK_DEBUG(sk, "prune_queue: removing %x-%x (c=%x)\n", + skb->seq, skb->end_seq, tp->copied_seq); kfree_skb(skb); if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) break; @@ -1429,7 +1482,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, } } else if (skb->ack_seq == tp->snd_una) { /* Bulk data transfer: receiver */ - if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) goto discard; @@ -1441,18 +1493,13 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, skb_queue_tail(&sk->receive_queue, skb); tp->rcv_nxt = skb->end_seq; + /* FIN bit check is not done since if FIN is set in + * this frame, the pred_flags won't match up. -DaveM + */ sk->data_ready(sk, 0); tcp_delack_estimator(tp); - -#if 1 /* This checks for required window updates too. */ tp->delayed_acks++; __tcp_ack_snd_check(sk); -#else - if (tp->delayed_acks++ == 0) - tcp_send_delayed_ack(sk, HZ/2); - else - tcp_send_ack(sk); -#endif return 0; } } @@ -1469,7 +1516,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, } } - if(th->syn && skb->seq != sk->syn_seq) { + if(th->syn && skb->seq != tp->syn_seq) { SOCK_DEBUG(sk, "syn in established state\n"); tcp_statistics.TcpInErrs++; tcp_reset(sk, skb); @@ -1490,10 +1537,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, /* step 7: process the segment text */ queued = tcp_data(skb, sk, len); - /* step 8: check the FIN bit */ - if (th->fin) - (void) tcp_fin(skb, sk, th); - tcp_data_snd_check(sk); /* If our receive queue has grown past its limits shrink it */ @@ -1657,19 +1700,19 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->snd_wnd = htons(th->window) << tp->snd_wscale; tp->snd_wl1 = skb->seq; tp->snd_wl2 = skb->ack_seq; - tp->fin_seq = skb->seq; tcp_set_state(sk, TCP_ESTABLISHED); tcp_parse_options(th,tp,0); - /* FIXME: need to make room for SACK still */ + if (tp->wscale_ok == 0) { tp->snd_wscale = tp->rcv_wscale = 0; tp->window_clamp = min(tp->window_clamp,65535); } if (tp->tstamp_ok) { - tp->tcp_header_len = sizeof(struct tcphdr) + 12; /* FIXME: Define constant! */ - sk->dummy_th.doff += 3; /* reserve space of options */ + tp->tcp_header_len = + sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + sk->dummy_th.doff += (TCPOLEN_TSTAMP_ALIGNED >> 2); } else tp->tcp_header_len = sizeof(struct tcphdr); if (tp->saw_tstamp) { @@ -1680,14 +1723,30 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Can't be earlier, doff would be wrong. */ tcp_send_ack(sk); - if (tp->in_mss) - sk->mss = min(sk->mss, tp->in_mss); - - /* Take out space for tcp options. */ - sk->mss -= tp->tcp_header_len - sizeof(struct tcphdr); + /* Check for the case where we tried to advertise + * a window including timestamp options, but did not + * end up using them for this connection. + */ + if((tp->tstamp_ok == 0) && sysctl_tcp_timestamps) + sk->mss += TCPOLEN_TSTAMP_ALIGNED; + /* Now limit it if the other end negotiated a smaller + * value. + */ + if (tp->in_mss) { + int real_mss = tp->in_mss; + + /* We store MSS locally with the timestamp bytes + * subtracted, TCP's advertise it with them + * included. Account for this fact. 
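
The timestamp bookkeeping above replaces the old hard-coded "+ 12" with TCPOLEN_TSTAMP_ALIGNED; either way the option costs 12 bytes, i.e. three extra 32-bit words of doff. A trivial stand-alone check of that arithmetic (the constant's value is taken from the removed line, everything else is illustration):

#include <stdio.h>

#define TCP_BASE_HDR            20  /* sizeof(struct tcphdr) without options */
#define TCPOLEN_TSTAMP_ALIGNED  12

int main(void)
{
    int hdr_len = TCP_BASE_HDR + TCPOLEN_TSTAMP_ALIGNED;

    /* doff counts 32-bit words, so the option adds 12 >> 2 == 3 words. */
    printf("header length: %d bytes, doff: %d words\n", hdr_len, hdr_len >> 2);
    return 0;   /* prints: header length: 32 bytes, doff: 8 words */
}
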
+ */ + if(tp->tstamp_ok) + real_mss -= TCPOLEN_TSTAMP_ALIGNED; + sk->mss = min(sk->mss, real_mss); + } + sk->dummy_th.dest = th->source; - sk->copied_seq = tp->rcv_nxt; + tp->copied_seq = tp->rcv_nxt; if(!sk->dead) { sk->state_change(sk); @@ -1722,52 +1781,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, } break; - - case TCP_TIME_WAIT: - /* RFC 1122: - * "When a connection is [...] on TIME-WAIT state [...] - * [a TCP] MAY accept a new SYN from the remote TCP to - * reopen the connection directly, if it: - * - * (1) assigns its initial sequence number for the new - * connection to be larger than the largest sequence - * number it used on the previous connection incarnation, - * and - * - * (2) returns to TIME-WAIT state if the SYN turns out - * to be an old duplicate". - */ - if (th->syn && !th->rst && after(skb->seq, tp->rcv_nxt)) { - __u32 isn; - - skb_orphan(skb); - sk->err = ECONNRESET; - tcp_set_state(sk, TCP_CLOSE); - sk->shutdown = SHUTDOWN_MASK; - - isn = tp->rcv_nxt + 128000; - if (isn == 0) - isn++; - - sk = tp->af_specific->get_sock(skb, th); - - if (sk == NULL || !ipsec_sk_policy(sk,skb)) - goto discard; - - skb_set_owner_r(skb, sk); - tp = &sk->tp_pinfo.af_tcp; - - if(tp->af_specific->conn_request(sk, skb, opt, isn) < 0) - return 1; - - goto discard; - } - - break; } /* Parse the tcp_options present on this header. - * By this point we really only expect timestamps and SACKs. + * By this point we really only expect timestamps. * Note that this really has to be here and not later for PAWS * (RFC1323) to work. */ @@ -1819,7 +1836,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * original syn. */ - if (th->syn && skb->seq!=sk->syn_seq) { + if (th->syn && skb->seq!=tp->syn_seq) { tcp_reset(sk, skb); return 1; } @@ -1833,7 +1850,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (acceptable) { tcp_set_state(sk, TCP_ESTABLISHED); sk->dummy_th.dest=th->source; - sk->copied_seq = tp->rcv_nxt; + tp->copied_seq = tp->rcv_nxt; if(!sk->dead) sk->state_change(sk); @@ -1850,7 +1867,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, break; case TCP_FIN_WAIT1: - if (tp->snd_una == sk->write_seq) { + if (tp->snd_una == tp->write_seq) { sk->shutdown |= SEND_SHUTDOWN; tcp_set_state(sk, TCP_FIN_WAIT2); if (!sk->dead) @@ -1861,12 +1878,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, break; case TCP_CLOSING: - if (tp->snd_una == sk->write_seq) + if (tp->snd_una == tp->write_seq) tcp_time_wait(sk); break; case TCP_LAST_ACK: - if (tp->snd_una == sk->write_seq) { + if (tp->snd_una == tp->write_seq) { sk->shutdown = SHUTDOWN_MASK; tcp_set_state(sk,TCP_CLOSE); if (!sk->dead) @@ -1874,13 +1891,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; } break; - - case TCP_TIME_WAIT: - /* Keep us in TIME_WAIT until we stop getting - * packets, reset the timeout. - */ - tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); - break; } } else goto discard; @@ -1918,12 +1928,6 @@ step6: break; } - /* step 8: check the FIN bit */ - if (th->fin) { - if(tcp_fin(skb, sk, th) != 0) - goto discard; - } - tcp_data_snd_check(sk); tcp_ack_snd_check(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e4f8981ac..91f21ff75 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). 
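
One point worth illustrating from the tcp_input.c hunks just above: FIN_WAIT1, CLOSING and LAST_ACK all advance only once snd_una has caught up with write_seq, i.e. once everything we sent, including the FIN, has been acknowledged. A toy model of that check, with simplified types and states:

#include <stdio.h>
#include <stdint.h>

enum state { FIN_WAIT1, FIN_WAIT2, CLOSING, TIME_WAIT, LAST_ACK, CLOSE };

static enum state on_ack(enum state s, uint32_t snd_una, uint32_t write_seq)
{
    if (snd_una != write_seq)   /* our FIN (or data before it) not yet ACKed */
        return s;
    switch (s) {
    case FIN_WAIT1: return FIN_WAIT2;
    case CLOSING:   return TIME_WAIT;
    case LAST_ACK:  return CLOSE;
    default:        return s;
    }
}

int main(void)
{
    printf("%d\n", on_ack(FIN_WAIT1, 100, 100));  /* prints 1 == FIN_WAIT2 */
    return 0;
}
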
* - * Version: $Id: tcp_ipv4.c,v 1.79 1998/01/15 22:40:47 freitag Exp $ + * Version: $Id: tcp_ipv4.c,v 1.109 1998/03/15 07:24:15 davem Exp $ * * IPv4 specific functions * @@ -60,8 +60,6 @@ #include <linux/inet.h> -extern int sysctl_tcp_sack; -extern int sysctl_tcp_tsack; extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_syncookies; @@ -89,16 +87,19 @@ void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, */ struct sock *tcp_established_hash[TCP_HTABLE_SIZE]; +/* Ok, let's try this, I give up, we do need a local binding + * TCP hash as well as the others for fast bind/connect. + */ +struct tcp_bind_bucket *tcp_bound_hash[TCP_BHTABLE_SIZE]; + /* All sockets in TCP_LISTEN state will be in here. This is the only table * where wildcard'd TCP sockets can exist. Hash function here is just local * port number. */ struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE]; -/* Ok, let's try this, I give up, we do need a local binding - * TCP hash as well as the others for fast bind/connect. - */ -struct sock *tcp_bound_hash[TCP_BHTABLE_SIZE]; +/* Register cache. */ +struct sock *tcp_regs[TCP_NUM_REGS]; /* * This array holds the first and last local port number. @@ -106,6 +107,7 @@ struct sock *tcp_bound_hash[TCP_BHTABLE_SIZE]; * 32768-61000 */ int sysctl_local_port_range[2] = { 1024, 4999 }; +int tcp_port_rover = (1024 - 1); static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport, __u32 faddr, __u16 fport) @@ -123,155 +125,135 @@ static __inline__ int tcp_sk_hashfn(struct sock *sk) return tcp_hashfn(laddr, lport, faddr, fport); } -static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum) +/* Invariant, sk->num is non-zero. */ +void tcp_bucket_unlock(struct sock *sk) { - struct sock *sk2; - int retval = 0, sk_reuse = sk->reuse; + struct tcp_bind_bucket *tb; + unsigned short snum = sk->num; SOCKHASH_LOCK(); - sk2 = tcp_bound_hash[tcp_bhashfn(snum)]; - for(; sk2 != NULL; sk2 = sk2->bind_next) { - if((sk2->num == snum) && (sk2 != sk)) { - unsigned char state = sk2->state; - int sk2_reuse = sk2->reuse; - - /* Two sockets can be bound to the same port if they're - * bound to different interfaces. 
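
tcp_bucket_create() above inserts the new bucket at the head of its chain using the pprev back-pointer convention, so a later unlink needs no list walk. A self-contained user-space sketch of the same insertion, with a simplified bucket structure:

#include <stdio.h>
#include <stdlib.h>

struct bucket {
    unsigned short port;
    struct bucket *next;
    struct bucket **pprev;   /* points at whatever points at us */
};

#define HASH_SIZE 16
static struct bucket *bound_hash[HASH_SIZE];

static unsigned int bhashfn(unsigned short port)
{
    return port & (HASH_SIZE - 1);
}

static struct bucket *bucket_create(unsigned short port)
{
    struct bucket *b = malloc(sizeof(*b));
    struct bucket **head = &bound_hash[bhashfn(port)];

    if (!b)
        return NULL;
    b->port = port;
    if ((b->next = *head) != NULL)
        b->next->pprev = &b->next;
    *head = b;
    b->pprev = head;
    return b;
}

int main(void)
{
    bucket_create(8080);
    bucket_create(8096);     /* hashes to the same chain in this toy table */
    printf("chain head port: %d\n", bound_hash[bhashfn(8080)]->port);  /* 8096 */
    return 0;
}
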
- */ - - if(sk->bound_dev_if != sk2->bound_dev_if) - continue; - - if(!sk2->rcv_saddr || !sk->rcv_saddr) { - if((!sk2_reuse) || - (!sk_reuse) || - (state == TCP_LISTEN)) { - retval = 1; - break; - } - } else if(sk2->rcv_saddr == sk->rcv_saddr) { - if((!sk_reuse) || - (!sk2_reuse) || - (state == TCP_LISTEN)) { - retval = 1; - break; - } + for(tb = tcp_bound_hash[tcp_bhashfn(snum)]; tb; tb = tb->next) { + if(tb->port == snum) { + if(tb->owners == NULL && + (tb->flags & TCPB_FLAG_LOCKED)) { + tb->flags &= ~TCPB_FLAG_LOCKED; + tcp_inc_slow_timer(TCP_SLT_BUCKETGC); } + break; } } SOCKHASH_UNLOCK(); +} - return retval; +struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum) +{ + struct tcp_bind_bucket *tb; + + tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC); + if(tb != NULL) { + struct tcp_bind_bucket **head = + &tcp_bound_hash[tcp_bhashfn(snum)]; + tb->port = snum; + tb->flags = TCPB_FLAG_LOCKED; + tb->owners = NULL; + if((tb->next = *head) != NULL) + tb->next->pprev = &tb->next; + *head = tb; + tb->pprev = head; + } + return tb; } -static __inline__ int tcp_lport_inuse(int num) +static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum) { - struct sock *sk = tcp_bound_hash[tcp_bhashfn(num)]; + struct tcp_bind_bucket *tb; + int result = 0; - for(; sk != NULL; sk = sk->bind_next) { - if(sk->num == num) - return 1; + SOCKHASH_LOCK(); + for(tb = tcp_bound_hash[tcp_bhashfn(snum)]; + (tb && (tb->port != snum)); + tb = tb->next) + ; + if(tb && tb->owners) { + /* Fast path for reuse ports, see include/net/tcp.h for a very + * detailed description of why this works, and why it is worth + * the effort at all. -DaveM + */ + if((tb->flags & TCPB_FLAG_FASTREUSE) && + (sk->reuse != 0)) { + goto go_like_smoke; + } else { + struct sock *sk2; + int sk_reuse = sk->reuse; + + /* We must walk the whole port owner list in this case. -DaveM */ + for(sk2 = tb->owners; sk2; sk2 = sk2->bind_next) { + if(sk->bound_dev_if == sk2->bound_dev_if) { + if(!sk_reuse || !sk2->reuse || sk2->state == TCP_LISTEN) { + if(!sk2->rcv_saddr || + !sk->rcv_saddr || + (sk2->rcv_saddr == sk->rcv_saddr)) + break; + } + } + } + if(sk2 != NULL) + result = 1; + } } - return 0; + if((result == 0) && + (tb == NULL) && + (tcp_bucket_create(snum) == NULL)) + result = 1; +go_like_smoke: + SOCKHASH_UNLOCK(); + return result; } -/* Find a "good" local port, this is family independent. - * There are several strategies working in unison here to - * get the best possible performance. The current socket - * load is kept track of, if it is zero there is a strong - * likely hood that there is a zero length chain we will - * find with a small amount of searching, else the load is - * what we shoot for for when the chains all have at least - * one entry. The base helps us walk the chains in an - * order such that a good chain is found as quickly as possible. -DaveM - */ unsigned short tcp_good_socknum(void) { - static int start = 0; - static int binding_contour = 0; - int best = 0; - int size = 32767; /* a big num. 
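
The replacement tcp_good_socknum() below drops the old load-balancing heuristics in favour of a simple rover that walks the local port range and skips ports that already own a bind bucket. A rough user-space model of that rover (the in-use array stands in for the bind hash lookup and bucket creation):

#include <stdio.h>
#include <stdbool.h>

#define LOW  1024
#define HIGH 4999

static bool port_in_use[HIGH + 1];   /* stands in for the bind hash lookup */
static int rover = LOW - 1;

static int good_socknum(void)
{
    int remaining = HIGH - LOW;

    do {
        rover += 1;
        if (rover < LOW || rover > HIGH)
            rover = LOW;
        if (!port_in_use[rover]) {
            port_in_use[rover] = true;   /* models tcp_bucket_create() */
            return rover;
        }
    } while (--remaining > 0);
    return 0;                            /* range exhausted */
}

int main(void)
{
    int a, b;

    port_in_use[1024] = true;
    a = good_socknum();
    b = good_socknum();
    printf("%d %d\n", a, b);             /* prints 1025 1026 */
    return 0;
}
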
*/ - int retval = 0, i, end, bc; + struct tcp_bind_bucket *tb; + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int remaining = high - low; + int rover; SOCKHASH_LOCK(); - if (start > sysctl_local_port_range[1] || start < sysctl_local_port_range[0]) - start = sysctl_local_port_range[0]; - i = tcp_bhashfn(start); - end = i + TCP_BHTABLE_SIZE; - bc = binding_contour; - do { - struct sock *sk = tcp_bound_hash[i&(TCP_BHTABLE_SIZE-1)]; - if(!sk) { - /* find the smallest value no smaller than start - * that has this hash value. - */ - retval = tcp_bhashnext(start-1,i&(TCP_BHTABLE_SIZE-1)); - - /* Check for decreasing load. */ - if (bc != 0) - binding_contour = 0; - goto done; - } else { - int j = 0; - do { sk = sk->bind_next; } while (++j < size && sk); - if (j < size) { - best = i&(TCP_BHTABLE_SIZE-1); - size = j; - if (bc && size <= bc) - goto verify; - } - } - } while(++i != end); - i = best; - - /* Socket load is increasing, adjust our load average. */ - binding_contour = size; -verify: - if (size < binding_contour) - binding_contour = size; - - retval = tcp_bhashnext(start-1,i); - - best = retval; /* mark the starting point to avoid infinite loops */ - while(tcp_lport_inuse(retval)) { - retval = tcp_bhashnext(retval,i); - if (retval > sysctl_local_port_range[1]) /* Upper bound */ - retval = tcp_bhashnext(sysctl_local_port_range[0],i); - if (retval == best) { - /* This hash chain is full. No answer. */ - retval = 0; - break; + rover = tcp_port_rover; + do { + rover += 1; + if((rover < low) || (rover > high)) + rover = low; + tb = tcp_bound_hash[tcp_bhashfn(rover)]; + for( ; tb; tb = tb->next) { + if(tb->port == rover) + goto next; } - } - -done: - start = (retval + 1); + break; + next: + } while(--remaining > 0); + tcp_port_rover = rover; + if((remaining <= 0) || (tcp_bucket_create(rover) == NULL)) + rover = 0; SOCKHASH_UNLOCK(); - return retval; + return rover; } static void tcp_v4_hash(struct sock *sk) { - unsigned char state; - - SOCKHASH_LOCK(); - state = sk->state; - if(state != TCP_CLOSE || !sk->dead) { + if (sk->state != TCP_CLOSE) { struct sock **skp; - if(state == TCP_LISTEN) - skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; - else - skp = &tcp_established_hash[tcp_sk_hashfn(sk)]; - + SOCKHASH_LOCK(); + skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))]; if((sk->next = *skp) != NULL) (*skp)->pprev = &sk->next; *skp = sk; sk->pprev = skp; tcp_sk_bindify(sk); + SOCKHASH_UNLOCK(); } - SOCKHASH_UNLOCK(); } static void tcp_v4_unhash(struct sock *sk) @@ -282,6 +264,7 @@ static void tcp_v4_unhash(struct sock *sk) sk->next->pprev = sk->pprev; *sk->pprev = sk->next; sk->pprev = NULL; + tcp_reg_zap(sk); tcp_sk_unbindify(sk); } SOCKHASH_UNLOCK(); @@ -293,30 +276,27 @@ static void tcp_v4_rehash(struct sock *sk) SOCKHASH_LOCK(); state = sk->state; - if(sk->pprev) { + if(sk->pprev != NULL) { if(sk->next) sk->next->pprev = sk->pprev; *sk->pprev = sk->next; sk->pprev = NULL; - tcp_sk_unbindify(sk); + tcp_reg_zap(sk); } - if(state != TCP_CLOSE || !sk->dead) { + if(state != TCP_CLOSE) { struct sock **skp; - if(state == TCP_LISTEN) { + if(state == TCP_LISTEN) skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; - } else { - int hash= tcp_sk_hashfn(sk); - if(state == TCP_TIME_WAIT) - hash += (TCP_HTABLE_SIZE/2); - skp = &tcp_established_hash[hash]; - } + else + skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))]; if((sk->next = *skp) != NULL) (*skp)->pprev = &sk->next; *skp = sk; sk->pprev = skp; - tcp_sk_bindify(sk); + if(state == 
TCP_LISTEN) + tcp_sk_bindify(sk); } SOCKHASH_UNLOCK(); } @@ -360,37 +340,64 @@ static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int d return result; } +/* Until this is verified... -DaveM */ +/* #define USE_QUICKSYNS */ + /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM + * It is assumed that this code only gets called from within NET_BH. */ static inline struct sock *__tcp_v4_lookup(struct tcphdr *th, - u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) + u32 saddr, u16 sport, + u32 daddr, u16 dport, int dif) { unsigned short hnum = ntohs(dport); struct sock *sk; - int hash = tcp_hashfn(daddr, hnum, saddr, sport); + int hash; + +#ifdef USE_QUICKSYNS + /* Incomming connection short-cut. */ + if (th && th->syn == 1 && th->ack == 0) + goto listener_shortcut; +#endif + + /* Check TCP register quick cache first. */ + sk = TCP_RHASH(sport); + if(sk && + sk->daddr == saddr && /* remote address */ + sk->dummy_th.dest == sport && /* remote port */ + sk->num == hnum && /* local port */ + sk->rcv_saddr == daddr && /* local address */ + (!sk->bound_dev_if || sk->bound_dev_if == dif)) + goto hit; /* Optimize here for direct hit, only listening connections can - * have wildcards anyways. It is assumed that this code only - * gets called from within NET_BH. + * have wildcards anyways. */ - for(sk = tcp_established_hash[hash]; sk; sk = sk->next) + hash = tcp_hashfn(daddr, hnum, saddr, sport); + for(sk = tcp_established_hash[hash]; sk; sk = sk->next) { if(sk->daddr == saddr && /* remote address */ sk->dummy_th.dest == sport && /* remote port */ sk->num == hnum && /* local port */ sk->rcv_saddr == daddr && /* local address */ - (!sk->bound_dev_if || sk->bound_dev_if == dif)) + (!sk->bound_dev_if || sk->bound_dev_if == dif)) { + if (sk->state == TCP_ESTABLISHED) + TCP_RHASH(sport) = sk; goto hit; /* You sunk my battleship! */ - + } + } /* Must check for a TIME_WAIT'er before going to listener hash. */ - for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next) + for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next) { if(sk->daddr == saddr && /* remote address */ sk->dummy_th.dest == sport && /* remote port */ sk->num == hnum && /* local port */ sk->rcv_saddr == daddr && /* local address */ (!sk->bound_dev_if || sk->bound_dev_if == dif)) goto hit; - + } +#ifdef USE_QUICKSYNS +listener_shortcut: +#endif sk = tcp_v4_lookup_listener(daddr, hnum, dif); hit: return sk; @@ -402,20 +409,11 @@ __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport } #ifdef CONFIG_IP_TRANSPARENT_PROXY -#define secondlist(hpnum, sk, fpass) \ -({ struct sock *s1; if(!(sk) && (fpass)--) \ - s1 = tcp_bound_hash[tcp_bhashfn(hpnum)]; \ - else \ - s1 = (sk); \ - s1; \ -}) - -#define tcp_v4_proxy_loop_init(hnum, hpnum, sk, fpass) \ - secondlist((hpnum), tcp_bound_hash[tcp_bhashfn(hnum)],(fpass)) - -#define tcp_v4_proxy_loop_next(hnum, hpnum, sk, fpass) \ - secondlist((hpnum),(sk)->bind_next,(fpass)) - +/* Cleaned up a little and adapted to new bind bucket scheme. + * Oddly, this should increase performance here for + * transparent proxy, as tests within the inner loop have + * been eliminated. 
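
The new tcp_regs / TCP_RHASH cache consulted above is a one-entry-per-slot shortcut keyed on the remote port; a hit must still be verified against the full four-tuple before the hash walk is skipped. A simplified stand-alone model (the field names, struct layout and NUM_REGS value here are illustrative, not the kernel's):

#include <stdio.h>
#include <stdint.h>

struct sock {
    uint32_t saddr, daddr;
    uint16_t sport, dport;
};

#define NUM_REGS 32
static struct sock *regs[NUM_REGS];

#define RHASH(sport) regs[(sport) & (NUM_REGS - 1)]

static struct sock *cached_lookup(uint32_t saddr, uint16_t sport,
                                  uint32_t daddr, uint16_t dport)
{
    struct sock *sk = RHASH(sport);

    /* A cache hit must still be verified against the full 4-tuple. */
    if (sk && sk->saddr == saddr && sk->sport == sport &&
        sk->daddr == daddr && sk->dport == dport)
        return sk;
    return NULL;   /* caller falls back to the real hash tables */
}

int main(void)
{
    static struct sock s = { 0x0a000001, 0x0a000002, 12345, 80 };

    RHASH(s.sport) = &s;
    printf("hit: %p\n", (void *)cached_lookup(0x0a000001, 12345, 0x0a000002, 80));
    return 0;
}
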
-DaveM + */ static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr, unsigned short rnum, unsigned long laddr, struct device *dev, unsigned short pnum, @@ -436,51 +434,60 @@ static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr, } /* This code must run only from NET_BH. */ - for(s = tcp_v4_proxy_loop_init(hnum, hpnum, s, firstpass); - s != NULL; - s = tcp_v4_proxy_loop_next(hnum, hpnum, s, firstpass)) { - if(s->num == hnum || s->num == hpnum) { - int score = 0; - if(s->dead && (s->state == TCP_CLOSE)) + { + struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hnum)]; + for( ; (tb && tb->port != hnum); tb = tb->next) + ; + if(tb == NULL) + goto next; + s = tb->owners; + } +pass2: + for(; s; s = s->bind_next) { + int score = 0; + if(s->rcv_saddr) { + if((s->num != hpnum || s->rcv_saddr != paddr) && + (s->num != hnum || s->rcv_saddr != laddr)) continue; - if(s->rcv_saddr) { - if((s->num != hpnum || s->rcv_saddr != paddr) && - (s->num != hnum || s->rcv_saddr != laddr)) - continue; - score++; - } - if(s->daddr) { - if(s->daddr != raddr) - continue; - score++; - } - if(s->dummy_th.dest) { - if(s->dummy_th.dest != rnum) - continue; - score++; - } - if(s->bound_dev_if) { - if(s->bound_dev_if != dif) - continue; - score++; - } - if(score == 4 && s->num == hnum) { - result = s; - break; - } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) { - result = s; - badness = score; - } + score++; + } + if(s->daddr) { + if(s->daddr != raddr) + continue; + score++; + } + if(s->dummy_th.dest) { + if(s->dummy_th.dest != rnum) + continue; + score++; + } + if(s->bound_dev_if) { + if(s->bound_dev_if != dif) + continue; + score++; + } + if(score == 4 && s->num == hnum) { + result = s; + goto gotit; + } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) { + result = s; + badness = score; } } +next: + if(firstpass--) { + struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hpnum)]; + for( ; (tb && tb->port != hpnum); tb = tb->next) + ; + if(tb) { + s = tb->owners; + goto pass2; + } + } +gotit: return result; } - -#undef secondlist -#undef tcp_v4_proxy_loop_init -#undef tcp_v4_proxy_loop_next - -#endif +#endif /* CONFIG_IP_TRANSPARENT_PROXY */ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) { @@ -495,41 +502,35 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) /* * Check that a TCP address is unique, don't allow multiple - * connects to/from the same address + * connects to/from the same address. Actually we can optimize + * quite a bit, since the socket about to connect is still + * in TCP_CLOSE, a tcp_bind_bucket for the local port he will + * use will exist, with a NULL owners list. So check for that. + * The good_socknum and verify_bind scheme we use makes this + * work. */ -static int tcp_unique_address(u32 saddr, u16 snum, u32 daddr, u16 dnum) +static int tcp_unique_address(struct sock *sk) { - int retval = 1, hashent = tcp_hashfn(saddr, snum, daddr, dnum); - struct sock * sk; + struct tcp_bind_bucket *tb; + unsigned short snum = sk->num; + int retval = 1; - /* Make sure we are allowed to connect here. - * But freeze the hash while we snoop around. - */ + /* Freeze the hash while we snoop around. 
*/ SOCKHASH_LOCK(); - sk = tcp_established_hash[hashent]; - for (; sk != NULL; sk = sk->next) { - if(sk->daddr == daddr && /* remote address */ - sk->dummy_th.dest == dnum && /* remote port */ - sk->num == snum && /* local port */ - sk->saddr == saddr) { /* local address */ - retval = 0; - goto out; - } - } - - /* Must check TIME_WAIT'ers too. */ - sk = tcp_established_hash[hashent + (TCP_HTABLE_SIZE/2)]; - for (; sk != NULL; sk = sk->next) { - if(sk->daddr == daddr && /* remote address */ - sk->dummy_th.dest == dnum && /* remote port */ - sk->num == snum && /* local port */ - sk->saddr == saddr) { /* local address */ - retval = 0; - goto out; + tb = tcp_bound_hash[tcp_bhashfn(snum)]; + for(; tb; tb = tb->next) { + if(tb->port == snum && tb->owners != NULL) { + /* Almost certainly the re-use port case, search the real hashes + * so it actually scales. + */ + sk = __tcp_v4_lookup(NULL, sk->daddr, sk->dummy_th.dest, + sk->rcv_saddr, snum, sk->bound_dev_if); + if((sk != NULL) && (sk->state != TCP_LISTEN)) + retval = 0; + break; } } -out: SOCKHASH_UNLOCK(); return retval; } @@ -578,8 +579,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) return -ENETUNREACH; } - if (!tcp_unique_address(rt->rt_src, sk->num, rt->rt_dst, - usin->sin_port)) { + if (!tcp_unique_address(sk)) { ip_rt_put(rt); return -EADDRNOTAVAIL; } @@ -587,7 +587,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) lock_sock(sk); /* Do this early, so there is less state to unwind on failure. */ - buff = sock_wmalloc(sk, MAX_SYN_SIZE, 0, GFP_KERNEL); + buff = sock_wmalloc(sk, (MAX_SYN_SIZE + sizeof(struct sk_buff)), + 0, GFP_KERNEL); if (buff == NULL) { release_sock(sk); ip_rt_put(rt); @@ -605,15 +606,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk->dummy_th.dest = usin->sin_port; - sk->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr, + tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr, sk->dummy_th.source, usin->sin_port); - tp->snd_wnd = 0; tp->snd_wl1 = 0; - tp->snd_wl2 = sk->write_seq; - tp->snd_una = sk->write_seq; - + tp->snd_wl2 = tp->write_seq; + tp->snd_una = tp->write_seq; tp->rcv_nxt = 0; sk->err = 0; @@ -635,14 +634,22 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) /* No failure conditions can result past this point. */ + /* We'll fix this up when we get a response from the other end. + * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. + */ + tp->tcp_header_len = sizeof(struct tcphdr) + + (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0); + th = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr)); buff->h.th = th; memcpy(th,(void *)&(sk->dummy_th), sizeof(*th)); - buff->seq = sk->write_seq++; + /* th->doff gets fixed up below if we tack on options. 
*/ + + buff->seq = tp->write_seq++; th->seq = htonl(buff->seq); - tp->snd_nxt = sk->write_seq; - buff->end_seq = sk->write_seq; + tp->snd_nxt = tp->write_seq; + buff->end_seq = tp->write_seq; th->ack = 0; th->syn = 1; @@ -656,11 +663,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if(sk->mtu < 64) sk->mtu = 64; /* Sanity limit */ - if (sk->user_mss) - sk->mss = sk->user_mss; - else - sk->mss = (sk->mtu - sizeof(struct iphdr) - - sizeof(struct tcphdr)); + sk->mss = (sk->mtu - sizeof(struct iphdr) - tp->tcp_header_len); + if(sk->user_mss) + sk->mss = min(sk->mss, sk->user_mss); if (sk->mss < 1) { printk(KERN_DEBUG "intial sk->mss below 1\n"); @@ -675,9 +680,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) &tp->rcv_wscale); th->window = htons(tp->rcv_wnd); - tmp = tcp_syn_build_options(buff, sk->mss, sysctl_tcp_sack, - sysctl_tcp_timestamps, - sysctl_tcp_window_scaling,tp->rcv_wscale); + tmp = tcp_syn_build_options(buff, sk->mss, sysctl_tcp_timestamps, + sysctl_tcp_window_scaling, tp->rcv_wscale); buff->csum = 0; th->doff = (sizeof(*th)+ tmp)>>2; @@ -686,9 +690,10 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tcp_set_state(sk,TCP_SYN_SENT); /* Socket identity change complete, no longer - * in TCP_CLOSE, so rehash. + * in TCP_CLOSE, so enter ourselves into the + * hash tables. */ - tcp_v4_rehash(sk); + tcp_v4_hash(sk); tp->rto = rt->u.dst.rtt; @@ -715,6 +720,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len) { + struct tcp_opt *tp; int retval = -EINVAL; /* Do sanity checking for sendmsg/sendto/send. */ @@ -740,7 +746,10 @@ static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len) lock_sock(sk); retval = tcp_do_sendmsg(sk, msg->msg_iovlen, msg->msg_iov, msg->msg_flags); - + /* Push out partial tail frames if needed. */ + tp = &(sk->tp_pinfo.af_tcp); + if(tp->send_head && tcp_snd_test(sk, tp->send_head)) + tcp_write_xmit(sk); release_sock(sk); out: @@ -854,7 +863,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) th = (struct tcphdr*)(dp+(iph->ihl<<2)); sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex); - if (sk == NULL) { + if (sk == NULL || sk->state == TCP_TIME_WAIT) { icmp_statistics.IcmpInErrors++; return; } @@ -1011,7 +1020,8 @@ static void tcp_v4_send_reset(struct sk_buff *skb) skb1->csum = csum_partial((u8 *) th1, sizeof(*th1), 0); th1->check = tcp_v4_check(th1, sizeof(*th1), skb1->nh.iph->saddr, skb1->nh.iph->daddr, skb1->csum); - /* FIXME: should this carry an options packet? */ + + /* Do not place TCP options in a reset. */ ip_queue_xmit(skb1); tcp_statistics.TcpOutSegs++; tcp_statistics.TcpOutRsts++; @@ -1063,6 +1073,14 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) mss = (skb->dst->pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr)); if (sk->user_mss) mss = min(mss, sk->user_mss); + if(req->tstamp_ok) + mss -= TCPOLEN_TSTAMP_ALIGNED; + else + req->mss += TCPOLEN_TSTAMP_ALIGNED; + + /* tcp_syn_build_options will do an skb_put() to obtain the TCP + * options bytes below. + */ skb->h.th = th = (struct tcphdr *) skb_put(skb, sizeof(struct tcphdr)); /* Don't offer more than they did. 
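
tcp_v4_connect() now subtracts the full tp->tcp_header_len, timestamp option included, when deriving the MSS, and only then clamps by user_mss. The arithmetic with example numbers (the 20-byte IP header and 12-byte aligned timestamp option are assumptions for this illustration only):

#include <stdio.h>

int main(void)
{
    int mtu            = 1500;
    int ip_hdr         = 20;
    int tcp_header_len = 20 + 12;   /* base header + aligned timestamp option */
    int user_mss       = 0;         /* 0 means "no user override" */

    int mss = mtu - ip_hdr - tcp_header_len;
    if (user_mss && user_mss < mss)
        mss = user_mss;
    printf("advertised mss: %d\n", mss);   /* 1448 for these numbers */
    return 0;
}
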
@@ -1081,9 +1099,8 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; - th->source = #ifdef CONFIG_IP_TRANSPARENT_PROXY - req->lcl_port; /* LVE */ + th->source = req->lcl_port; /* LVE */ #else th->source = sk->dummy_th.source; #endif @@ -1104,16 +1121,7 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) req->rcv_wscale = rcv_wscale; } th->window = htons(req->rcv_wnd); - - /* XXX Partial csum of 4 byte quantity is itself! -DaveM - * Yes, but it's a bit harder to special case now. It's - * now computed inside the tcp_v4_send_check() to clean up - * updating the options fields in the mainline send code. - * If someone thinks this is really bad let me know and - * I'll try to do it a different way. -- erics - */ - - tmp = tcp_syn_build_options(skb, req->mss, req->sack_ok, req->tstamp_ok, + tmp = tcp_syn_build_options(skb, req->mss, req->tstamp_ok, req->wscale_ok,req->rcv_wscale); skb->csum = 0; th->doff = (sizeof(*th) + tmp)>>2; @@ -1232,14 +1240,15 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ req->rcv_isn = skb->seq; - tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0; + tp.tstamp_ok = tp.wscale_ok = tp.snd_wscale = 0; tp.in_mss = 536; tcp_parse_options(th,&tp,want_cookie); - if (tp.saw_tstamp) - req->ts_recent = tp.rcv_tsval; req->mss = tp.in_mss; + if (tp.saw_tstamp) { + req->mss -= TCPOLEN_TSTAMP_ALIGNED; + req->ts_recent = tp.rcv_tsval; + } req->tstamp_ok = tp.tstamp_ok; - req->sack_ok = tp.sack_ok; req->snd_wscale = tp.snd_wscale; req->wscale_ok = tp.wscale_ok; req->rmt_port = th->source; @@ -1289,6 +1298,113 @@ error: return 0; } +/* This is not only more efficient than what we used to do, it eliminates + * a lot of code duplication between IPv4/IPv6 SYN recv processing. 
-DaveM + */ +struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb) +{ + struct sock *newsk = sk_alloc(AF_INET, GFP_ATOMIC, 0); + + if(newsk != NULL) { + struct tcp_opt *newtp; + + memcpy(newsk, sk, sizeof(*newsk)); + newsk->sklist_next = NULL; + newsk->daddr = req->af.v4_req.rmt_addr; + newsk->rcv_saddr = req->af.v4_req.loc_addr; +#ifdef CONFIG_IP_TRANSPARENT_PROXY + newsk->num = ntohs(skb->h.th->dest); +#endif + newsk->state = TCP_SYN_RECV; + + /* Clone the TCP header template */ +#ifdef CONFIG_IP_TRANSPARENT_PROXY + newsk->dummy_th.source = req->lcl_port; +#endif + newsk->dummy_th.dest = req->rmt_port; + newsk->dummy_th.ack = 1; + newsk->dummy_th.doff = sizeof(struct tcphdr)>>2; + + newsk->sock_readers = 0; + atomic_set(&newsk->rmem_alloc, 0); + skb_queue_head_init(&newsk->receive_queue); + atomic_set(&newsk->wmem_alloc, 0); + skb_queue_head_init(&newsk->write_queue); + newsk->saddr = req->af.v4_req.loc_addr; + + newsk->done = 0; + newsk->proc = 0; + newsk->pair = NULL; + skb_queue_head_init(&newsk->back_log); + skb_queue_head_init(&newsk->error_queue); + + /* Now setup tcp_opt */ + newtp = &(newsk->tp_pinfo.af_tcp); + newtp->pred_flags = 0; + newtp->rcv_nxt = req->rcv_isn + 1; + newtp->snd_nxt = req->snt_isn + 1; + newtp->snd_una = req->snt_isn + 1; + newtp->srtt = 0; + newtp->ato = 0; + newtp->snd_wl1 = req->rcv_isn; + newtp->snd_wl2 = req->snt_isn; + newtp->snd_wnd = ntohs(skb->h.th->window); + newtp->max_window = newtp->snd_wnd; + newtp->pending = 0; + newtp->retransmits = 0; + newtp->last_ack_sent = req->rcv_isn + 1; + newtp->backoff = 0; + newtp->mdev = TCP_TIMEOUT_INIT; + newtp->snd_cwnd = 1; + newtp->rto = TCP_TIMEOUT_INIT; + newtp->packets_out = 0; + newtp->high_seq = 0; + newtp->snd_ssthresh = 0x7fffffff; + newtp->snd_cwnd_cnt = 0; + newtp->dup_acks = 0; + newtp->delayed_acks = 0; + init_timer(&newtp->retransmit_timer); + newtp->retransmit_timer.function = &tcp_retransmit_timer; + newtp->retransmit_timer.data = (unsigned long) newsk; + init_timer(&newtp->delack_timer); + newtp->delack_timer.function = &tcp_delack_timer; + newtp->delack_timer.data = (unsigned long) newsk; + skb_queue_head_init(&newtp->out_of_order_queue); + newtp->send_head = newtp->retrans_head = NULL; + newtp->rcv_wup = req->rcv_isn + 1; + newtp->write_seq = req->snt_isn + 1; + newtp->copied_seq = req->rcv_isn + 1; + + newtp->saw_tstamp = 0; + newtp->in_mss = 536; + + init_timer(&newtp->probe_timer); + newtp->probe_timer.function = &tcp_probe_timer; + newtp->probe_timer.data = (unsigned long) newsk; + newtp->probes_out = 0; + newtp->syn_seq = req->rcv_isn; + newtp->fin_seq = req->rcv_isn; + newtp->urg_data = 0; + tcp_synq_init(newtp); + newtp->syn_backlog = 0; + + /* Back to base struct sock members. 
*/ + newsk->err = 0; + newsk->ack_backlog = 0; + newsk->max_ack_backlog = SOMAXCONN; + newsk->priority = 1; + + /* IP layer stuff */ + newsk->opt = req->af.v4_req.opt; + newsk->timeout = 0; + init_timer(&newsk->timer); + newsk->timer.function = &net_timer; + newsk->timer.data = (unsigned long) newsk; + newsk->socket = NULL; + } + return newsk; +} + struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, struct open_request *req, struct dst_entry *dst) @@ -1301,98 +1417,14 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (sk->ack_backlog > sk->max_ack_backlog) goto exit; /* head drop */ #endif - newsk = sk_alloc(AF_INET, GFP_ATOMIC); + newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) goto exit; #ifdef NEW_LISTEN sk->ack_backlog++; #endif - memcpy(newsk, sk, sizeof(*newsk)); - - /* Or else we die! -DaveM */ - newsk->sklist_next = NULL; - - newsk->opt = req->af.v4_req.opt; - skb_queue_head_init(&newsk->write_queue); - skb_queue_head_init(&newsk->receive_queue); - skb_queue_head_init(&newsk->out_of_order_queue); - skb_queue_head_init(&newsk->error_queue); - - /* Unused */ newtp = &(newsk->tp_pinfo.af_tcp); - newtp->send_head = NULL; - newtp->retrans_head = NULL; - - newtp->pending = 0; - - skb_queue_head_init(&newsk->back_log); - - newsk->prot->init(newsk); - - newtp->snd_cwnd_cnt = 0; - newtp->backoff = 0; - newsk->proc = 0; - newsk->done = 0; - newsk->pair = NULL; - atomic_set(&newsk->wmem_alloc, 0); - atomic_set(&newsk->rmem_alloc, 0); - newsk->localroute = sk->localroute; - - newsk->err = 0; - newsk->shutdown = 0; - newsk->ack_backlog = 0; - - newtp->fin_seq = req->rcv_isn; - newsk->syn_seq = req->rcv_isn; - newsk->state = TCP_SYN_RECV; - newsk->timeout = 0; - - newsk->write_seq = req->snt_isn; - - newtp->snd_wnd = ntohs(skb->h.th->window); - newtp->max_window = newtp->snd_wnd; - newtp->snd_wl1 = req->rcv_isn; - newtp->snd_wl2 = newsk->write_seq; - newtp->snd_una = newsk->write_seq++; - newtp->snd_nxt = newsk->write_seq; - - newsk->urg_data = 0; - newtp->packets_out = 0; - newtp->retransmits = 0; - newsk->linger=0; - newsk->destroy = 0; - init_timer(&newsk->timer); - newsk->timer.data = (unsigned long) newsk; - newsk->timer.function = &net_timer; - - tcp_init_xmit_timers(newsk); - - newsk->dummy_th.source = -#ifdef CONFIG_IP_TRANSPARENT_PROXY - req->lcl_port; /* LVE */ -#else - sk->dummy_th.source; -#endif - newsk->dummy_th.dest = req->rmt_port; - newsk->sock_readers=0; - - newtp->last_ack_sent = newtp->rcv_nxt = req->rcv_isn + 1; - newtp->rcv_wup = req->rcv_isn + 1; - newsk->copied_seq = req->rcv_isn + 1; - - newsk->socket = NULL; - -#ifdef CONFIG_IP_TRANSPARENT_PROXY - /* - * Deal with possibly redirected traffic by setting num to - * the intended destination port of the received packet. 
- */ - newsk->num = ntohs(skb->h.th->dest); -#endif - newsk->daddr = req->af.v4_req.rmt_addr; - newsk->saddr = req->af.v4_req.loc_addr; - newsk->rcv_saddr = req->af.v4_req.loc_addr; /* options / mss / route_cache */ if (dst == NULL) { @@ -1418,7 +1450,6 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (newsk->mtu < 64) newsk->mtu = 64; - newtp->sack_ok = req->sack_ok; newtp->tstamp_ok = req->tstamp_ok; newtp->window_clamp = req->window_clamp; newtp->rcv_wnd = req->rcv_wnd; @@ -1433,8 +1464,8 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (newtp->tstamp_ok) { newtp->ts_recent = req->ts_recent; newtp->ts_recent_stamp = jiffies; - newtp->tcp_header_len = sizeof(struct tcphdr) + 12; /* FIXME: define constant! */ - newsk->dummy_th.doff += 3; + newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + newsk->dummy_th.doff += (TCPOLEN_TSTAMP_ALIGNED >> 2); } else { newtp->tcp_header_len = sizeof(struct tcphdr); } @@ -1446,13 +1477,13 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, /* Make sure our mtu is adjusted for headers. */ newsk->mss = min(req->mss, snd_mss) + sizeof(struct tcphdr) - newtp->tcp_header_len; - tcp_v4_hash(newsk); + /* Must use the af_specific ops here for the case of IPv6 mapped. */ + newsk->prot->hash(newsk); add_to_prot_sklist(newsk); return newsk; exit: - if (dst) - dst_release(dst); + dst_release(dst); return NULL; } @@ -1623,6 +1654,8 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) skb->used = 0; + if (sk->state == TCP_TIME_WAIT) + goto do_time_wait; if (!sk->sock_readers) return tcp_v4_do_rcv(sk, skb); @@ -1636,6 +1669,12 @@ discard_it: /* Discard frame. */ kfree_skb(skb); return 0; + +do_time_wait: + if(tcp_timewait_state_process((struct tcp_tw_bucket *)sk, + skb, th, &(IPCB(skb)->opt), skb->len)) + goto no_tcp_socket; + goto discard_it; } int tcp_v4_build_header(struct sock *sk, struct sk_buff *skb) @@ -1770,33 +1809,21 @@ struct tcp_func ipv4_specific = { sizeof(struct sockaddr_in) }; +/* NOTE: A lot of things set to zero explicitly by call to + * sk_alloc() so need not be done here. + */ static int tcp_v4_init_sock(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - skb_queue_head_init(&sk->out_of_order_queue); + skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); - tp->srtt = 0; tp->rto = TCP_TIMEOUT_INIT; /*TCP_WRITE_TIME*/ tp->mdev = TCP_TIMEOUT_INIT; - - tp->ato = 0; - tp->iat = (HZ/5) << 3; - - /* FIXME: tie this to sk->rcvbuf? (May be unnecessary) */ - /* tp->rcv_wnd = 8192; */ - tp->tstamp_ok = 0; - tp->sack_ok = 0; - tp->wscale_ok = 0; tp->in_mss = 536; - tp->snd_wscale = 0; - tp->sacks = 0; - tp->saw_tstamp = 0; - tp->syn_backlog = 0; - /* - * See draft-stevens-tcpca-spec-01 for discussion of the + /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. */ tp->snd_cwnd = 1; @@ -1804,9 +1831,7 @@ static int tcp_v4_init_sock(struct sock *sk) sk->priority = 1; sk->state = TCP_CLOSE; - sk->max_ack_backlog = SOMAXCONN; - sk->mtu = 576; sk->mss = 536; @@ -1824,6 +1849,7 @@ static int tcp_v4_init_sock(struct sock *sk) static int tcp_v4_destroy_sock(struct sock *sk) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; tcp_clear_xmit_timers(sk); @@ -1836,9 +1862,17 @@ static int tcp_v4_destroy_sock(struct sock *sk) kfree_skb(skb); /* Cleans up our, hopefuly empty, out_of_order_queue. 
*/ - while((skb = skb_dequeue(&sk->out_of_order_queue)) != NULL) + while((skb = skb_dequeue(&tp->out_of_order_queue)) != NULL) kfree_skb(skb); + /* Clean up a locked TCP bind bucket, this only happens if a + * port is allocated for a socket, but it never fully connects. + * In which case we will find num to be non-zero and daddr to + * be zero. + */ + if(sk->daddr == 0 && sk->num != 0) + tcp_bucket_unlock(sk); + return 0; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index fbae5cfa6..d8c3c6480 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_output.c,v 1.51 1998/01/15 22:40:39 freitag Exp $ + * Version: $Id: tcp_output.c,v 1.65 1998/03/15 12:07:03 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -34,8 +34,6 @@ #include <net/tcp.h> -extern int sysctl_tcp_sack; -extern int sysctl_tcp_tsack; extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; @@ -45,7 +43,8 @@ static __inline__ void clear_delayed_acks(struct sock * sk) struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); tp->delayed_acks = 0; - sk->ack_backlog = 0; + if(tcp_in_quickack_mode(tp)) + tp->ato = ((HZ/100)*2); tcp_clear_xmit_timer(sk, TIME_DACK); } @@ -58,69 +57,26 @@ static __inline__ void update_send_head(struct sock *sk) tp->send_head = NULL; } -static __inline__ int tcp_snd_test(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int nagle_check = 1; - int len; - - /* RFC 1122 - section 4.2.3.4 - * - * We must queue if - * - * a) The right edge of this frame exceeds the window - * b) There are packets in flight and we have a small segment - * [SWS avoidance and Nagle algorithm] - * (part of SWS is done on packetization) - * c) We are retransmiting [Nagle] - * d) We have too many packets 'in flight' - * - * Don't use the nagle rule for urgent data. - */ - len = skb->end_seq - skb->seq; - if (!sk->nonagle && len < (sk->mss >> 1) && tp->packets_out && - !skb->h.th->urg) - nagle_check = 0; - - return (nagle_check && tp->packets_out < tp->snd_cwnd && - !after(skb->end_seq, tp->snd_una + tp->snd_wnd) && - tp->retransmits == 0); -} - /* * This is the main buffer sending routine. We queue the buffer * having checked it is sane seeming. */ -void tcp_send_skb(struct sock *sk, struct sk_buff *skb) +void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue) { - struct tcphdr * th = skb->h.th; + struct tcphdr *th = skb->h.th; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int size; /* Length of packet (not counting length of pre-tcp headers). */ size = skb->len - ((unsigned char *) th - skb->data); - /* Sanity check it.. */ - if (size < sizeof(struct tcphdr) || size > skb->len) { - printk(KERN_DEBUG "tcp_send_skb: bad skb " - "(skb = %p, data = %p, th = %p, len = %u)\n", - skb, skb->data, th, skb->len); - kfree_skb(skb); - return; - } - - /* If we have queued a header size packet.. (these crash a few - * tcp stacks if ack is not set) - * FIXME: What is the equivalent below when we have options? - */ - if (size == sizeof(struct tcphdr)) { - /* If it's got a syn or fin discard. */ - if(!th->syn && !th->fin) { - printk(KERN_DEBUG "tcp_send_skb: attempt to queue a bogon.\n"); - kfree_skb(skb); - return; - } + /* If there is a FIN or a SYN we add it onto the size. */ + if (th->fin || th->syn) { + if(th->syn) + size++; + if(th->fin) + size++; } /* Actual processing. 
*/ @@ -129,14 +85,14 @@ void tcp_send_skb(struct sock *sk, struct sk_buff *skb) skb_queue_tail(&sk->write_queue, skb); - if (tp->send_head == NULL && tcp_snd_test(sk, skb)) { + if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) { struct sk_buff * buff; /* This is going straight out. */ tp->last_ack_sent = tp->rcv_nxt; th->ack_seq = htonl(tp->rcv_nxt); th->window = htons(tcp_select_window(sk)); - tcp_update_options((__u32 *)(th+1),tp); + tcp_update_options((__u32 *)(th + 1),tp); tp->af_specific->send_check(sk, th, size, skb); @@ -165,11 +121,10 @@ queue: /* Remember where we must start sending. */ if (tp->send_head == NULL) tp->send_head = skb; - if (tp->packets_out == 0 && !tp->pending) { + if (!force_queue && tp->packets_out == 0 && !tp->pending) { tp->pending = TIME_PROBE0; tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); } - return; } /* @@ -214,8 +169,6 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) buff->h.th = nth; memcpy(nth, th, tp->tcp_header_len); - /* FIXME: Make sure this gets tcp options right. */ - /* Correct the new header. */ buff->seq = skb->seq + len; buff->end_seq = skb->end_seq; @@ -281,14 +234,6 @@ static int tcp_wrxmit_frag(struct sock *sk, struct sk_buff *skb, int size) tp->send_head = skb; tp->packets_out--; return -1; - } else { -#if 0 - /* If tcp_fragment succeded then - * the send head is the resulting - * fragment - */ - tp->send_head = skb->next; -#endif } return 0; } @@ -346,9 +291,10 @@ void tcp_write_xmit(struct sock *sk) size = skb->len - (((unsigned char*)th) - skb->data); } - tp->last_ack_sent = th->ack_seq = htonl(tp->rcv_nxt); + tp->last_ack_sent = tp->rcv_nxt; + th->ack_seq = htonl(tp->rcv_nxt); th->window = rcv_wnd; - tcp_update_options((__u32 *)(th+1),tp); + tcp_update_options((__u32 *)(th + 1),tp); tp->af_specific->send_check(sk, th, size, skb); @@ -437,128 +383,44 @@ void tcp_write_xmit(struct sock *sk) * taken by headers, and the remaining space will be available for TCP data. * This should be accounted for correctly instead. */ -unsigned short tcp_select_window(struct sock *sk) +u32 __tcp_select_window(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - int mss = sk->mss; - long free_space = sock_rspace(sk) / 2; - long window, cur_win; + unsigned int mss = sk->mss; + unsigned int free_space; + u32 window, cur_win; + free_space = (sk->rcvbuf - atomic_read(&sk->rmem_alloc)) / 2; if (tp->window_clamp) { free_space = min(tp->window_clamp, free_space); mss = min(tp->window_clamp, mss); - } -#ifdef NO_ANK_FIX - /* I am tired of this message */ - else - printk(KERN_DEBUG "Clamp failure. Water leaking.\n"); -#endif + } else { + printk("tcp_select_window: tp->window_clamp == 0.\n"); + } if (mss < 1) { mss = 1; - printk(KERN_DEBUG "tcp_select_window: mss fell to 0.\n"); + printk("tcp_select_window: sk->mss fell to 0.\n"); } - /* compute the actual window i.e. - * old_window - received_bytes_on_that_win - */ - cur_win = tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup); - window = tp->rcv_wnd; - - if (cur_win < 0) { - cur_win = 0; -#ifdef NO_ANK_FIX - /* And this too. */ - printk(KERN_DEBUG "TSW: win < 0 w=%d 1=%u 2=%u\n", - tp->rcv_wnd, tp->rcv_nxt, tp->rcv_wup); -#endif - } - - if (free_space < sk->rcvbuf/4 && free_space < mss/2) + cur_win = tcp_receive_window(tp); + if (free_space < sk->rcvbuf/4 && free_space < mss/2) { window = 0; - - /* Get the largest window that is a nice multiple of mss. - * Window clamp already applied above. 
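
The rewritten __tcp_select_window(), continued just below, either offers a zero window when the receive buffer is nearly exhausted or rounds free space down to a multiple of the MSS, keeping the old offer when it is already within one MSS of free space. A user-space model with made-up buffer sizes (the window clamp handling is omitted for brevity):

#include <stdio.h>

int main(void)
{
    unsigned int rcvbuf = 65535, rmem_alloc = 16384;
    unsigned int mss = 1460, rcv_wnd = 20000;
    unsigned int free_space = (rcvbuf - rmem_alloc) / 2;
    unsigned int window;

    if (free_space < rcvbuf / 4 && free_space < mss / 2) {
        window = 0;                             /* receiver is out of room */
    } else {
        window = rcv_wnd;
        if (window <= free_space - mss || window > free_space)
            window = (free_space / mss) * mss;  /* nice multiple of mss */
    }
    printf("offered window: %u\n", window);     /* 23360 here: 16 * 1460 */
    return 0;
}
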
- * If our current window offering is within 1 mss of the - * free space we just keep it. This prevents the divide - * and multiply from happening most of the time. - * We also don't do any window rounding when the free space - * is too small. - */ - if (window < free_space - mss && free_space > mss) - window = (free_space/mss)*mss; - - /* Never shrink the offered window */ - if (window < cur_win) - window = cur_win; - - tp->rcv_wnd = window; - tp->rcv_wup = tp->rcv_nxt; - return window >> tp->rcv_wscale; /* RFC1323 scaling applied */ -} - -#if 0 -/* Old algorithm for window selection */ -unsigned short tcp_select_window(struct sock *sk) -{ - struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - int mss = sk->mss; - long free_space = sock_rspace(sk); - long window, cur_win, usable; - - if (tp->window_clamp) { - free_space = min(tp->window_clamp, free_space); - mss = min(tp->window_clamp, mss); - } - - /* compute the actual window i.e. - * old_window - received_bytes_on_that_win - */ - cur_win = tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup); - window = tp->rcv_wnd; - - if (cur_win < 0) { - cur_win = 0; - printk(KERN_DEBUG "TSW: win < 0 w=%d 1=%u 2=%u\n", - tp->rcv_wnd, tp->rcv_nxt, tp->rcv_wup); - } - - /* RFC 1122: - * "the suggested [SWS] avoidance algoritm for the receiver is to keep - * RECV.NEXT + RCV.WIN fixed until: - * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)" - * - * i.e. don't raise the right edge of the window until you can raise - * it at least MSS bytes. - */ - - usable = free_space - cur_win; - if (usable < 0) - usable = 0; - - if (window < usable) { - /* Window is not blocking the sender - * and we have enough free space for it - */ - if (cur_win > (sk->mss << 1)) - goto out; - } - - if (window >= usable) { - /* We are offering too much, cut it down... - * but don't shrink the window - */ - window = max(usable, cur_win); } else { - while ((usable - window) >= mss) - window += mss; + /* Get the largest window that is a nice multiple of mss. + * Window clamp already applied above. + * If our current window offering is within 1 mss of the + * free space we just keep it. This prevents the divide + * and multiply from happening most of the time. + * We also don't do any window rounding when the free space + * is too small. + */ + window = tp->rcv_wnd; + if ((window <= (free_space - mss)) || (window > free_space)) + window = (free_space/mss)*mss; } -out: - tp->rcv_wnd = window; - tp->rcv_wup = tp->rcv_nxt; return window; } -#endif static int tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb) { @@ -729,84 +591,123 @@ void tcp_do_retransmit(struct sock *sk, int all) } } -/* - * Send a fin. +/* Send a fin. The caller locks the socket for us. This cannot be + * allowed to fail queueing a FIN frame under any circumstances. */ - void tcp_send_fin(struct sock *sk) { - struct tcphdr *th =(struct tcphdr *)&sk->dummy_th; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct tcphdr *t1; - struct sk_buff *buff; - int tmp; - buff = sock_wmalloc(sk, BASE_ACK_SIZE + tp->tcp_header_len, 1, GFP_KERNEL); - if (buff == NULL) { - /* FIXME: This is a disaster if it occurs. */ - printk(KERN_INFO "tcp_send_fin: Impossible malloc failure"); - return; - } + /* Optimization, tack on the FIN if we have a queue of + * unsent frames. 
+ */ + if(tp->send_head != NULL) { + struct sk_buff *tail = skb_peek_tail(&sk->write_queue); + struct tcphdr *th = tail->h.th; + int data_len; + + /* Unfortunately tcp_write_xmit won't check for going over + * the MSS due to the FIN sequence number, so we have to + * watch out for it here. + */ + data_len = (tail->tail - (((unsigned char *)th)+tp->tcp_header_len)); + if(data_len >= sk->mss) + goto build_new_frame; /* ho hum... */ - /* Administrivia. */ - buff->csum = 0; + /* tcp_write_xmit() will checksum the header etc. for us. */ + th->fin = 1; + tail->end_seq++; + } else { + struct sk_buff *buff; + struct tcphdr *th; - /* Put in the IP header and routing stuff. */ - tmp = tp->af_specific->build_net_header(sk, buff); - if (tmp < 0) { - int t; +build_new_frame: + buff = sock_wmalloc(sk, + (BASE_ACK_SIZE + tp->tcp_header_len + + sizeof(struct sk_buff)), + 1, GFP_KERNEL); + if (buff == NULL) { + /* We can only fail due to low memory situations, not + * due to going over our sndbuf limits (due to the + * force flag passed to sock_wmalloc). So just keep + * trying. We cannot allow this fail. The socket is + * still locked, so we need not check if the connection + * was reset in the meantime etc. + */ + goto build_new_frame; + } - /* FIXME: We must not throw this out. Eventually we must - * put a FIN into the queue, otherwise it never gets queued. - */ - kfree_skb(buff); - sk->write_seq++; - t = del_timer(&sk->timer); - if (t) - add_timer(&sk->timer); - else - tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); - return; - } - - /* We ought to check if the end of the queue is a buffer and - * if so simply add the fin to that buffer, not send it ahead. - */ - t1 =(struct tcphdr *)skb_put(buff,tp->tcp_header_len); - buff->h.th = t1; - tcp_build_options((__u32 *)(t1+1),tp); - - memcpy(t1, th, sizeof(*t1)); - buff->seq = sk->write_seq; - sk->write_seq++; - buff->end_seq = sk->write_seq; - t1->seq = htonl(buff->seq); - t1->ack_seq = htonl(tp->rcv_nxt); - t1->window = htons(tcp_select_window(sk)); - t1->fin = 1; - - tp->af_specific->send_check(sk, t1, tp->tcp_header_len, buff); - - /* The fin can only be transmited after the data. */ - skb_queue_tail(&sk->write_queue, buff); - if (tp->send_head == NULL) { - /* FIXME: BUG! we need to check if the fin fits into the window - * here. If not we need to do window probing (sick, but true) + /* Administrivia. */ + buff->csum = 0; + + /* Put in the IP header and routing stuff. + * + * FIXME: + * We can fail if the interface for the route + * this socket takes goes down right before + * we get here. ANK is there a way to point + * this into a "black hole" route in such a + * case? Ideally, we should still be able to + * queue this and let the retransmit timer + * keep trying until the destination becomes + * reachable once more. -DaveM */ - struct sk_buff *skb1; + if(tp->af_specific->build_net_header(sk, buff) < 0) { + kfree_skb(buff); + goto update_write_seq; + } + th = (struct tcphdr *) skb_put(buff, tp->tcp_header_len); + buff->h.th = th; - tp->packets_out++; - tp->snd_nxt = sk->write_seq; - buff->when = jiffies; + memcpy(th, (void *) &(sk->dummy_th), sizeof(*th)); + th->seq = htonl(tp->write_seq); + th->fin = 1; + tcp_build_options((__u32 *)(th + 1), tp); - skb1 = skb_clone(buff, GFP_KERNEL); - if (skb1) { - skb_set_owner_w(skb1, sk); - tp->af_specific->queue_xmit(skb1); - } + /* This makes sure we do things like abide by the congestion + * window and other constraints which prevent us from sending. 
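
tcp_send_fin(), completed just below, prefers to piggy-back the FIN on the last unsent frame and only builds a dedicated frame when that frame already carries a full MSS of data. A toy model of just that decision, with simplified stand-in frames:

#include <stdio.h>
#include <stdbool.h>

struct frame { int data_len; bool fin; };

/* Returns true if a dedicated FIN frame had to be built. */
static bool queue_fin(struct frame *unsent_tail, int mss, struct frame *new_frame)
{
    if (unsent_tail && unsent_tail->data_len < mss) {
        unsent_tail->fin = true;     /* tack FIN onto the queued frame */
        return false;
    }
    new_frame->data_len = 0;         /* FIN-only frame */
    new_frame->fin = true;
    return true;
}

int main(void)
{
    struct frame tail = { .data_len = 512 }, extra;

    if (!queue_fin(&tail, 1460, &extra))
        printf("FIN piggy-backed on tail frame\n");
    return 0;
}
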
+ */ + tcp_send_skb(sk, buff, 0); + } +update_write_seq: + /* So that we recognize the ACK coming back for + * this FIN as being legitimate. + */ + tp->write_seq++; +} - if (!tcp_timer_is_set(sk, TIME_RETRANS)) - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); +/* We get here when a process closes a file descriptor (either due to + * an explicit close() or as a byproduct of exit()'ing) and there + * was unread data in the receive queue. This behavior is recommended + * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM + */ +void tcp_send_active_reset(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb; + struct tcphdr *th; + +again: + /* NOTE: No TCP options attached and we never retransmit this. */ + skb = sock_wmalloc(sk, (BASE_ACK_SIZE + sizeof(*th)), 1, GFP_KERNEL); + if(skb == NULL) + goto again; + skb->csum = 0; + if(tp->af_specific->build_net_header(sk, skb) < 0) { + kfree_skb(skb); + } else { + th = (struct tcphdr *) skb_put(skb, sizeof(*th)); + memcpy(th, &(sk->dummy_th), sizeof(*th)); + th->seq = htonl(tp->write_seq); + th->rst = 1; + th->doff = sizeof(*th) / 4; + tp->last_ack_sent = tp->rcv_nxt; + th->ack_seq = htonl(tp->rcv_nxt); + th->window = htons(tcp_select_window(sk)); + tp->af_specific->send_check(sk, th, sizeof(*th), skb); + tp->af_specific->queue_xmit(skb); + tcp_statistics.TcpOutSegs++; + tcp_statistics.TcpOutRsts++; } } @@ -814,6 +715,9 @@ void tcp_send_fin(struct sock *sk) * a SYN packet that crossed the incoming SYN that caused this routine * to get called. If this assumption fails then the initial rcv_wnd * and rcv_wscale values will not be correct. + * + * XXX When you have time Dave, redo this to use tcp_send_skb() just + * XXX like tcp_send_fin() above now does.... -DaveM */ int tcp_send_synack(struct sock *sk) { @@ -823,7 +727,7 @@ int tcp_send_synack(struct sock *sk) struct tcphdr *th; int tmp; - skb = sock_wmalloc(sk, MAX_SYN_SIZE, 1, GFP_ATOMIC); + skb = sock_wmalloc(sk, MAX_SYN_SIZE + sizeof(struct sk_buff), 1, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; @@ -855,8 +759,7 @@ int tcp_send_synack(struct sock *sk) tp->last_ack_sent = th->ack_seq = htonl(tp->rcv_nxt); tmp = tcp_syn_build_options(skb, sk->mss, - tp->sack_ok, tp->tstamp_ok, - tp->wscale_ok,tp->rcv_wscale); + tp->tstamp_ok, tp->wscale_ok, tp->rcv_wscale); skb->csum = 0; th->doff = (sizeof(*th) + tmp)>>2; @@ -880,31 +783,24 @@ int tcp_send_synack(struct sock *sk) } /* - * Set up the timers for sending a delayed ack.. - * - * rules for delaying an ack: - * - delay time <= 0.5 HZ - * - must send at least every 2 full sized packets - * - we don't have a window update to send + * Send out a delayed ack, the caller does the policy checking + * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check() + * for details. */ -void tcp_send_delayed_ack(struct sock * sk, int max_timeout) +void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout) { - struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - unsigned long timeout, now; + unsigned long timeout; - /* Calculate new timeout. */ - now = jiffies; + /* Stay within the limit we were given */ timeout = tp->ato; - - if (timeout > max_timeout || - ((tp->rcv_nxt - tp->rcv_wup) > (sk->mss << 2))) - timeout = now; - else - timeout += now; + if (timeout > max_timeout) + timeout = max_timeout; + timeout += jiffies; /* Use new timeout only if there wasn't a older one earlier. 
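
The simplified tcp_send_delayed_ack() above clamps the adaptive ato to the caller's limit and never pushes an already-pending timer later. A small stand-alone model of that policy (the timer struct and jiffies value are stand-ins for the kernel's):

#include <stdio.h>

struct timer { int pending; unsigned long expires; };

static void arm_delack(struct timer *t, unsigned long jiffies,
                       unsigned long ato, unsigned long max_timeout)
{
    unsigned long timeout = ato > max_timeout ? max_timeout : ato;

    timeout += jiffies;
    if (!t->pending || timeout < t->expires)
        t->expires = timeout;        /* take the earlier deadline */
    t->pending = 1;
}

int main(void)
{
    struct timer t = { .pending = 1, .expires = 1010 };

    arm_delack(&t, 1000, 40, 25);    /* clamped to 25 -> 1025, later than 1010 */
    printf("expires at %lu\n", t.expires);   /* prints 1010: earlier deadline wins */
    return 0;
}
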
*/ - if (!del_timer(&tp->delack_timer) || timeout < tp->delack_timer.expires) + if ((!tp->delack_timer.prev || !del_timer(&tp->delack_timer)) || + (timeout < tp->delack_timer.expires)) tp->delack_timer.expires = timeout; add_timer(&tp->delack_timer); @@ -928,8 +824,6 @@ void tcp_send_ack(struct sock *sk) /* We need to grab some memory, and put together an ack, * and then put it into the queue to be sent. - * FIXME: is it better to waste memory here and use a - * constant sized ACK? */ buff = sock_wmalloc(sk, BASE_ACK_SIZE + tp->tcp_header_len, 1, GFP_ATOMIC); if (buff == NULL) { @@ -938,7 +832,7 @@ void tcp_send_ack(struct sock *sk) * bandwidth on slow links to send a spare ack than * resend packets. */ - tcp_send_delayed_ack(sk, HZ/2); + tcp_send_delayed_ack(tp, HZ/2); return; } @@ -956,22 +850,16 @@ void tcp_send_ack(struct sock *sk) th = (struct tcphdr *)skb_put(buff,tp->tcp_header_len); memcpy(th, &sk->dummy_th, sizeof(struct tcphdr)); - tcp_build_options((__u32 *)(th+1),tp); /* Swap the send and the receive. */ th->window = ntohs(tcp_select_window(sk)); th->seq = ntohl(tp->snd_nxt); tp->last_ack_sent = tp->rcv_nxt; th->ack_seq = htonl(tp->rcv_nxt); + tcp_build_and_update_options((__u32 *)(th + 1), tp); /* Fill in the packet and send it. */ tp->af_specific->send_check(sk, th, tp->tcp_header_len, buff); - -#if 0 - SOCK_DEBUG(sk, "\rtcp_send_ack: seq %x ack %x\n", - tp->snd_nxt, tp->rcv_nxt); -#endif - tp->af_specific->queue_xmit(buff); tcp_statistics.TcpOutSegs++; } @@ -1017,6 +905,7 @@ void tcp_write_wakeup(struct sock *sk) } th = skb->h.th; + tcp_update_options((__u32 *)(th + 1), tp); tp->af_specific->send_check(sk, th, th->doff * 4 + win_size, skb); buff = skb_clone(skb, GFP_ATOMIC); if (buff == NULL) @@ -1047,25 +936,19 @@ void tcp_write_wakeup(struct sock *sk) return; } - t1 = (struct tcphdr *) skb_put(buff, sizeof(struct tcphdr)); + t1 = (struct tcphdr *) skb_put(buff, tp->tcp_header_len); memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1)); - /* FIXME: should zero window probes have SACK and/or TIMESTAMP data? - * If so we have to tack them on here. - */ /* Use a previous sequence. * This should cause the other end to send an ack. */ t1->seq = htonl(tp->snd_nxt-1); -/* t1->fin = 0; -- We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */ t1->ack_seq = htonl(tp->rcv_nxt); t1->window = htons(tcp_select_window(sk)); + tcp_build_and_update_options((__u32 *)(t1 + 1), tp); - /* Value from dummy_th may be larger. */ - t1->doff = sizeof(struct tcphdr)/4; - - tp->af_specific->send_check(sk, t1, sizeof(*t1), buff); + tp->af_specific->send_check(sk, t1, tp->tcp_header_len, buff); } /* Send it. */ diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 76ccedab2..fdf8f50ec 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_timer.c,v 1.5 1998/03/03 01:23:44 ralf Exp $ + * Version: $Id: tcp_timer.c,v 1.6 1998/03/17 22:18:35 ralf Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -31,6 +31,7 @@ int sysctl_tcp_retries2 = TCP_RETR2; static void tcp_sltimer_handler(unsigned long); static void tcp_syn_recv_timer(unsigned long); static void tcp_keepalive(unsigned long data); +static void tcp_bucketgc(unsigned long); struct timer_list tcp_slow_timer = { NULL, NULL, @@ -41,7 +42,8 @@ struct timer_list tcp_slow_timer = { struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = { {ATOMIC_INIT(0), TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer},/* SYNACK */ - {ATOMIC_INIT(0), TCP_KEEPALIVE_PERIOD, 0, tcp_keepalive} /* KEEPALIVE */ + {ATOMIC_INIT(0), TCP_KEEPALIVE_PERIOD, 0, tcp_keepalive}, /* KEEPALIVE */ + {ATOMIC_INIT(0), TCP_BUCKETGC_PERIOD, 0, tcp_bucketgc} /* BUCKETGC */ }; const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n"; @@ -87,20 +89,24 @@ void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when) * The delayed ack timer can be set if we are changing the * retransmit timer when removing acked frames. */ - del_timer(&tp->probe_timer); - del_timer(&tp->retransmit_timer); + if(tp->probe_timer.prev) + del_timer(&tp->probe_timer); + if(tp->retransmit_timer.prev) + del_timer(&tp->retransmit_timer); tp->retransmit_timer.expires=jiffies+when; add_timer(&tp->retransmit_timer); break; case TIME_DACK: - del_timer(&tp->delack_timer); + if(tp->delack_timer.prev) + del_timer(&tp->delack_timer); tp->delack_timer.expires=jiffies+when; add_timer(&tp->delack_timer); break; case TIME_PROBE0: - del_timer(&tp->probe_timer); + if(tp->probe_timer.prev) + del_timer(&tp->probe_timer); tp->probe_timer.expires=jiffies+when; add_timer(&tp->probe_timer); break; @@ -118,9 +124,12 @@ void tcp_clear_xmit_timers(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - del_timer(&tp->retransmit_timer); - del_timer(&tp->delack_timer); - del_timer(&tp->probe_timer); + if(tp->retransmit_timer.prev) + del_timer(&tp->retransmit_timer); + if(tp->delack_timer.prev) + del_timer(&tp->delack_timer); + if(tp->probe_timer.prev) + del_timer(&tp->probe_timer); } static int tcp_write_err(struct sock *sk, int force) @@ -131,9 +140,8 @@ static int tcp_write_err(struct sock *sk, int force) tcp_clear_xmit_timers(sk); /* Time wait the socket. */ - if (!force && (1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING)) { - tcp_set_state(sk,TCP_TIME_WAIT); - tcp_reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); + if (!force && ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING))) { + tcp_time_wait(sk); } else { /* Clean up time. */ tcp_set_state(sk, TCP_CLOSE); @@ -173,9 +181,8 @@ static int tcp_write_timeout(struct sock *sk) return 1; } - -void tcp_delack_timer(unsigned long data) { - +void tcp_delack_timer(unsigned long data) +{ struct sock *sk = (struct sock*)data; if(sk->zapped) @@ -185,8 +192,8 @@ void tcp_delack_timer(unsigned long data) { tcp_read_wakeup(sk); } -void tcp_probe_timer(unsigned long data) { - +void tcp_probe_timer(unsigned long data) +{ struct sock *sk = (struct sock*)data; struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; @@ -212,10 +219,9 @@ void tcp_probe_timer(unsigned long data) { sk->err = ETIMEDOUT; sk->error_report(sk); - /* Time wait the socket. */ if ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING)) { - tcp_set_state(sk, TCP_TIME_WAIT); - tcp_reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); + /* Time wait the socket. */ + tcp_time_wait(sk); } else { /* Clean up time. 
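The repeated "if (tp->xxx_timer.prev) del_timer(...)" guards above use the fact that a 2.1-era timer_list keeps a non-NULL .prev field only while the timer is actually queued, so an un-armed timer is never deleted. A simplified model of that re-arm idiom, using a plain pending flag in place of the list pointer:

/* Mock timer: 'pending' stands in for the .prev linkage check. */
struct mock_timer {
        int pending;
        unsigned long expires;
};

static void mock_del_timer(struct mock_timer *t)  { t->pending = 0; }
static void mock_add_timer(struct mock_timer *t)  { t->pending = 1; }

/* Re-arm a timer for now + when, deleting it first only if it is queued. */
static void rearm(struct mock_timer *t, unsigned long now, unsigned long when)
{
        if (t->pending)                 /* analogous to: if (tp->xxx_timer.prev)      */
                mock_del_timer(t);      /*                   del_timer(&tp->xxx_timer) */
        t->expires = now + when;
        mock_add_timer(t);
}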
*/ tcp_set_state(sk, TCP_CLOSE); @@ -252,6 +258,35 @@ static __inline__ int tcp_keepopen_proc(struct sock *sk) return res; } +/* Garbage collect TCP bind buckets. */ +static void tcp_bucketgc(unsigned long __unused) +{ + int i; + + for(i = 0; i < TCP_BHTABLE_SIZE; i++) { + struct tcp_bind_bucket *tb = tcp_bound_hash[i]; + + while(tb) { + struct tcp_bind_bucket *next = tb->next; + + if((tb->owners == NULL) && + !(tb->flags & TCPB_FLAG_LOCKED)) { + /* Eat timer reference. */ + tcp_dec_slow_timer(TCP_SLT_BUCKETGC); + + /* Unlink bucket. */ + if(tb->next) + tb->next->pprev = tb->pprev; + *tb->pprev = tb->next; + + /* Finally, free it up. */ + kmem_cache_free(tcp_bucket_cachep, tb); + } + tb = next; + } + } +} + /* * Check all sockets for keepalive timer * Called every 75 seconds diff --git a/net/ipv4/timer.c b/net/ipv4/timer.c index fe02b3f4c..79ae3309e 100644 --- a/net/ipv4/timer.c +++ b/net/ipv4/timer.c @@ -5,7 +5,7 @@ * * TIMER - implementation of software timers for IP. * - * Version: $Id: timer.c,v 1.7 1997/09/17 18:50:26 freitag Exp $ + * Version: $Id: timer.c,v 1.2 1997/12/16 05:37:48 ralf Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -52,76 +52,52 @@ void net_delete_timer (struct sock *t) { - unsigned long flags; - - save_flags (flags); - cli(); - + if(t->timer.prev) + del_timer (&t->timer); t->timeout = 0; - del_timer (&t->timer); - - restore_flags (flags); } void net_reset_timer (struct sock *t, int timeout, unsigned long len) { net_delete_timer (t); t->timeout = timeout; -#if 1 - /* FIXME: ??? */ - if ((int) len < 0) /* prevent close to infinite timers. THEY _DO_ */ - len = 3; /* happen (negative values ?) - don't ask me why ! -FB */ -#endif t->timer.expires = jiffies+len; add_timer (&t->timer); } - -/* - * Now we will only be called whenever we need to do - * something, but we must be sure to process all of the - * sockets that need it. +/* Now we will only be called whenever we need to do + * something, but we must be sure to process all of the + * sockets that need it. */ - void net_timer (unsigned long data) { struct sock *sk = (struct sock*)data; int why = sk->timeout; - /* - * only process if socket is not in use - */ - - if (sk->sock_readers) - { + /* Only process if socket is not in use. */ + if (sk->sock_readers) { sk->timer.expires = jiffies+HZ; add_timer(&sk->timer); - sti(); return; } /* Always see if we need to send an ack. */ - - if (sk->ack_backlog && !sk->zapped) - { + if (sk->tp_pinfo.af_tcp.delayed_acks && !sk->zapped) { sk->prot->read_wakeup (sk); - if (! sk->dead) - sk->data_ready(sk,0); + if (!sk->dead) + sk->data_ready(sk,0); } /* Now we need to figure out why the socket was on the timer. */ - - switch (why) - { + switch (why) { case TIME_DONE: - /* If the socket hasn't been closed off, re-try a bit later */ + /* If the socket hasn't been closed off, re-try a bit later. */ if (!sk->dead) { net_reset_timer(sk, TIME_DONE, TCP_DONE_TIME); break; } - if (sk->state != TCP_CLOSE) - { + if (sk->state != TCP_CLOSE) { printk (KERN_DEBUG "non CLOSE socket in time_done\n"); break; } @@ -129,11 +105,9 @@ void net_timer (unsigned long data) break; case TIME_DESTROY: - /* - * We've waited for a while for all the memory associated with - * the socket to be freed. - */ - + /* We've waited for a while for all the memory associated with + * the socket to be freed. 
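tcp_bucketgc() above reclaims idle bind buckets by unlinking them from a hash chain that carries a back-pointer (pprev) into the previous element's next field, then freeing them. A compilable sketch of that unlink step, with a simplified bucket type standing in for struct tcp_bind_bucket; the real code additionally skips buckets flagged TCPB_FLAG_LOCKED and drops a slow-timer reference.

#include <stdlib.h>

struct bucket {
        struct bucket *next;
        struct bucket **pprev;    /* points at whichever 'next' field points to us */
        int owners;               /* 0 means no sockets bound, so it is collectable */
};

/* Walk one hash chain and free every ownerless bucket.
 * Assumes pprev has been maintained as elements were inserted.
 */
static void chain_gc(struct bucket **head)
{
        struct bucket *b = *head;

        while (b) {
                struct bucket *next = b->next;

                if (b->owners == 0) {
                        if (b->next)
                                b->next->pprev = b->pprev;
                        *b->pprev = b->next;      /* works for head and middle alike */
                        free(b);                  /* kernel uses kmem_cache_free()   */
                }
                b = next;
        }
}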
+ */ destroy_sock(sk); break; @@ -148,7 +122,8 @@ void net_timer (unsigned long data) break; default: - printk (KERN_DEBUG "net_timer: timer expired - reason %d is unknown\n", why); + /* I want to see these... */ + printk ("net_timer: timer expired - reason %d is unknown\n", why); break; } } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f355caa85..6ba50b280 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -5,7 +5,7 @@ * * The User Datagram Protocol (UDP). * - * Version: $Id: udp.c,v 1.3 1998/03/03 01:23:44 ralf Exp $ + * Version: $Id: udp.c,v 1.4 1998/03/17 22:18:36 ralf Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -828,7 +828,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) * of this packet since that is all * that will be read. */ - amount = skb->tail - skb->h.raw; + amount = skb->len - sizeof(struct udphdr); } return put_user(amount, (int *)arg); } @@ -1033,17 +1033,18 @@ static inline void udp_deliver(struct sock *sk, struct sk_buff *skb) /* * Multicasts and broadcasts go to each listener. + * + * Note: called only from the BH handler context, + * so we don't need to lock the hashes. */ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh, u32 saddr, u32 daddr) { struct sock *sk; - int given = 0; - SOCKHASH_LOCK(); sk = udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]; sk = udp_v4_mcast_next(sk, uh->dest, saddr, uh->source, daddr); - if(sk) { + if (sk) { struct sock *sknext = NULL; do { @@ -1058,10 +1059,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh, udp_deliver(sk, skb1); sk = sknext; } while(sknext); - given = 1; - } - SOCKHASH_UNLOCK(); - if(!given) + } else kfree_skb(skb); return 0; } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c4faba4b7..4a4060601 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: addrconf.c,v 1.32 1997/12/27 20:41:18 kuznet Exp $ + * $Id: addrconf.c,v 1.37 1998/03/08 20:52:46 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -1753,6 +1753,8 @@ static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf t->sysctl_header = register_sysctl_table(t->addrconf_root_dir, 0); if (t->sysctl_header == NULL) kfree(t); + else + p->sysctl = t; } static void addrconf_sysctl_unregister(struct ipv6_devconf *p) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b0a0eb702..bc5ba892a 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -7,7 +7,7 @@ * * Adapted from linux/net/ipv4/af_inet.c * - * $Id: af_inet6.c,v 1.24 1997/12/13 21:53:08 kuznet Exp $ + * $Id: af_inet6.c,v 1.28 1998/03/08 05:56:49 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -71,7 +71,7 @@ static int inet6_create(struct socket *sock, int protocol) struct sock *sk; struct proto *prot; - sk = sk_alloc(AF_INET6, GFP_KERNEL); + sk = sk_alloc(AF_INET6, GFP_KERNEL, 1); if (sk == NULL) goto do_oom; @@ -139,8 +139,7 @@ static int inet6_create(struct socket *sock, int protocol) * creation time automatically shares. 
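udp_v4_mcast_deliver() above hands one received datagram to every matching listener: each listener except the last gets its own clone, the last one consumes the original, and the packet is simply freed when nobody matches. A toy user-space sketch of that fan-out pattern; pkt, pkt_clone and the deliver callback are invented stand-ins for the sk_buff handling.

#include <stdlib.h>
#include <string.h>

struct pkt { char data[64]; };

static struct pkt *pkt_clone(const struct pkt *p)
{
        struct pkt *c = malloc(sizeof(*c));

        if (c)
                memcpy(c, p, sizeof(*c));
        return c;
}

/* Deliver one packet to n listeners via the callback; ownership of the
 * clone or the original passes to the callback on each call.
 */
static void mcast_deliver(struct pkt *p,
                          void (*deliver)(int idx, struct pkt *), int n)
{
        int i;

        if (n == 0) {
                free(p);                        /* no listeners: drop it      */
                return;
        }
        for (i = 0; i < n - 1; i++) {
                struct pkt *c = pkt_clone(p);

                if (c)
                        deliver(i, c);          /* skip a listener if cloning fails */
        }
        deliver(n - 1, p);                      /* last listener gets the original  */
}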
*/ sk->dummy_th.source = ntohs(sk->num); - if(sk->prot->hash) - sk->prot->hash(sk); + sk->prot->hash(sk); add_to_prot_sklist(sk); } diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 6b7508666..af29057ec 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: exthdrs.c,v 1.4 1997/03/18 18:24:29 davem Exp $ + * $Id: exthdrs.c,v 1.5 1998/02/12 07:43:39 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index b84dc9268..96867403b 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: icmp.c,v 1.12 1997/12/13 21:53:10 kuznet Exp $ + * $Id: icmp.c,v 1.13 1998/02/12 07:43:41 davem Exp $ * * Based on net/ipv4/icmp.c * diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 15ce420ac..9fce1acca 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: ip6_fib.c,v 1.10 1997/12/13 21:53:10 kuznet Exp $ + * $Id: ip6_fib.c,v 1.11 1998/03/08 05:56:50 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/ipv6/ip6_fw.c b/net/ipv6/ip6_fw.c index 7316a30f1..3c3a0cfc5 100644 --- a/net/ipv6/ip6_fw.c +++ b/net/ipv6/ip6_fw.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: ip6_fw.c,v 1.8 1997/12/13 21:53:11 kuznet Exp $ + * $Id: ip6_fw.c,v 1.9 1998/02/12 07:43:42 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index ead32047a..71ad7e1a0 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -6,7 +6,7 @@ * Pedro Roque <roque@di.fc.ul.pt> * Ian P. 
Morris <I.P.Morris@soton.ac.uk> * - * $Id: ip6_input.c,v 1.7 1997/09/20 20:48:27 davem Exp $ + * $Id: ip6_input.c,v 1.8 1998/02/12 07:43:43 davem Exp $ * * Based in linux/net/ipv4/ip_input.c * diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 67b81d041..13029e175 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: ip6_output.c,v 1.7 1997/12/29 19:52:46 kuznet Exp $ + * $Id: ip6_output.c,v 1.9 1998/03/08 05:56:50 davem Exp $ * * Based on linux/net/ipv4/ip_output.c * diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index f2ef3fd76..c6714eea3 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -7,7 +7,7 @@ * * Based on linux/net/ipv4/ip_sockglue.c * - * $Id: ipv6_sockglue.c,v 1.16 1997/12/13 21:53:13 kuznet Exp $ + * $Id: ipv6_sockglue.c,v 1.17 1998/03/08 05:56:51 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 3fb0680bc..ce37117a3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -132,7 +132,7 @@ struct neigh_table nd_tbl = pndisc_destructor, pndisc_redo, { NULL, NULL, &nd_tbl, 0, NULL, NULL, - 30*HZ, 1*HZ, 60*HZ, 30*HZ, 5*HZ, 3, 3, 0, 3, 1*HZ, (8*HZ)/10, 0, 64 }, + 30*HZ, 1*HZ, 60*HZ, 30*HZ, 5*HZ, 3, 3, 0, 3, 1*HZ, (8*HZ)/10, 64, 0 }, 30*HZ, 128, 512, 1024, }; diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index b9b811e35..b87d4696b 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -7,7 +7,7 @@ * PROC file system. This is very similar to the IPv4 version, * except it reports the sockets in the INET6 address family. * - * Version: $Id: proc.c,v 1.4 1997/04/20 22:50:44 schenk Exp $ + * Version: $Id: proc.c,v 1.6 1998/03/13 08:02:19 davem Exp $ * * Authors: David S. Miller (davem@caip.rutgers.edu) * @@ -21,6 +21,7 @@ #include <linux/net.h> #include <linux/in6.h> #include <net/sock.h> +#include <net/tcp.h> #include <net/transp_v6.h> /* This is the main implementation workhorse of all these routines. 
*/ @@ -52,21 +53,35 @@ static int get__netinfo6(struct proto *pro, char *buffer, int format, char **sta SOCKHASH_LOCK(); sp = pro->sklist_next; while(sp != (struct sock *)pro) { + struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sp; + int tw_bucket = 0; + pos += 149; if(pos < offset) goto next; tp = &(sp->tp_pinfo.af_tcp); - dest = &sp->net_pinfo.af_inet6.daddr; - src = &sp->net_pinfo.af_inet6.rcv_saddr; + if((format == 0) && (sp->state == TCP_TIME_WAIT)) { + tw_bucket = 1; + dest = &tw->v6_daddr; + src = &tw->v6_rcv_saddr; + } else { + dest = &sp->net_pinfo.af_inet6.daddr; + src = &sp->net_pinfo.af_inet6.rcv_saddr; + } destp = ntohs(sp->dummy_th.dest); srcp = ntohs(sp->dummy_th.source); - - timer_active1 = del_timer(&tp->retransmit_timer); - timer_active2 = del_timer(&sp->timer); - if(!timer_active1) tp->retransmit_timer.expires = 0; - if(!timer_active2) sp->timer.expires = 0; - timer_active = 0; - timer_expires = (unsigned) -1; + if((format == 0) && (sp->state == TCP_TIME_WAIT)) { + timer_active1 = timer_active2 = 0; + timer_active = 3; + timer_expires = tw->timer.expires; + } else { + timer_active1 = del_timer(&tp->retransmit_timer); + timer_active2 = del_timer(&sp->timer); + if(!timer_active1) tp->retransmit_timer.expires = 0; + if(!timer_active2) sp->timer.expires = 0; + timer_active = 0; + timer_expires = (unsigned) -1; + } if(timer_active1 && tp->retransmit_timer.expires < timer_expires) { timer_active = timer_active1; timer_expires = tp->retransmit_timer.expires; @@ -75,6 +90,8 @@ static int get__netinfo6(struct proto *pro, char *buffer, int format, char **sta timer_active = timer_active2; timer_expires = sp->timer.expires; } + if(timer_active == 0) + timer_expires = jiffies; sprintf(tmpbuf, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5d %8d %ld", i, @@ -83,13 +100,23 @@ static int get__netinfo6(struct proto *pro, char *buffer, int format, char **sta dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, sp->state, - format==0?sp->write_seq-tp->snd_una:atomic_read(&sp->wmem_alloc), - format==0?tp->rcv_nxt-sp->copied_seq:atomic_read(&sp->rmem_alloc), + (tw_bucket ? + 0 : + (format == 0) ? + tp->write_seq-tp->snd_una : + atomic_read(&sp->wmem_alloc)), + (tw_bucket ? + 0 : + (format == 0) ? + tp->rcv_nxt-tp->copied_seq : + atomic_read(&sp->rmem_alloc)), timer_active, timer_expires-jiffies, - tp->retransmits, - sp->socket ? sp->socket->inode->i_uid:0, - timer_active?sp->timeout:0, - sp->socket ? sp->socket->inode->i_ino:0); + (tw_bucket ? 0 : tp->retransmits), + ((!tw_bucket && sp->socket) ? + sp->socket->inode->i_uid : 0), + (!tw_bucket && timer_active) ? sp->timeout : 0, + ((!tw_bucket && sp->socket) ? 
+ sp->socket->inode->i_ino : 0)); if(timer_active1) add_timer(&tp->retransmit_timer); if(timer_active2) add_timer(&sp->timer); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 4ee1b13ad..5b182b7ef 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -7,7 +7,7 @@ * * Adapted from linux/net/ipv4/raw.c * - * $Id: raw.c,v 1.16 1997/12/29 19:52:48 kuznet Exp $ + * $Id: raw.c,v 1.18 1998/03/08 05:56:54 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index aa027da14..55fecc676 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: reassembly.c,v 1.8 1997/12/29 19:52:50 kuznet Exp $ + * $Id: reassembly.c,v 1.9 1998/02/12 07:43:48 davem Exp $ * * Based on: net/ipv4/ip_fragment.c * diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 28ee43e78..5188de864 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: route.c,v 1.19 1997/12/13 21:53:16 kuznet Exp $ + * $Id: route.c,v 1.25 1998/03/15 03:31:47 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -85,18 +85,18 @@ struct dst_ops ip6_dst_ops = { }; struct rt6_info ip6_null_entry = { - {{NULL, ATOMIC_INIT(0), ATOMIC_INIT(0), NULL, + {{NULL, ATOMIC_INIT(1), ATOMIC_INIT(1), NULL, -1, 0, 0, 0, 0, 0, 0, 0, 0, -ENETUNREACH, NULL, NULL, ip6_pkt_discard, ip6_pkt_discard, &ip6_dst_ops}}, NULL, {{{0}}}, 256, RTF_REJECT|RTF_NONEXTHOP, ~0U, - 0, 255, {NULL}, {{{{0}}}, 128}, {{{{0}}}, 128} + 255, 0, {NULL}, {{{{0}}}, 0}, {{{{0}}}, 0} }; struct fib6_node ip6_routing_table = { NULL, NULL, NULL, NULL, &ip6_null_entry, - 0, RTN_ROOT|RTN_TL_ROOT, 0 + 0, RTN_ROOT|RTN_TL_ROOT|RTN_RTINFO, 0 }; #ifdef CONFIG_RT6_POLICY @@ -709,14 +709,14 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) if (rt == NULL) { RDBG(("dalloc fails, ")); *err = -ENOMEM; - goto out; + return NULL; } rt->u.dst.obsolete = -1; rt->rt6i_expires = rtmsg->rtmsg_info; addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst); - + if (addr_type & IPV6_ADDR_MULTICAST) { RDBG(("MCAST, ")); rt->u.dst.input = ip6_mc_input; @@ -743,6 +743,21 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) rt->rt6i_src.plen = rtmsg->rtmsg_src_len; ipv6_wash_prefix(&rt->rt6i_src.addr, rt->rt6i_src.plen); + /* We cannot add true routes via loopback here, + they would result in kernel looping; promote them to reject routes + */ + if ((rtmsg->rtmsg_flags&RTF_REJECT) || + (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) { + dev = dev_get("lo"); + rt->u.dst.output = ip6_pkt_discard; + rt->u.dst.input = ip6_pkt_discard; + rt->u.dst.error = -ENETUNREACH; + rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP; + rt->rt6i_metric = rtmsg->rtmsg_metric; + rt->rt6i_dev = dev; + goto install_route; + } + if (rtmsg->rtmsg_flags & RTF_GATEWAY) { struct in6_addr *gw_addr; int gwa_type; @@ -773,7 +788,7 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) } dev = grt->rt6i_dev; } - if (dev == NULL) { + if (dev == NULL || (dev->flags&IFF_LOOPBACK)) { *err = -EINVAL; goto out; } @@ -805,6 +820,7 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) rt->rt6i_hoplimit = ipv6_get_hoplimit(dev); rt->rt6i_flags = rtmsg->rtmsg_flags; +install_route: RDBG(("rt6ins(%p) ", rt)); rt6_lock(); @@ 
-1421,6 +1437,7 @@ int ipv6_route_ioctl(unsigned int cmd, void *arg) int ip6_pkt_discard(struct sk_buff *skb) { ipv6_statistics.Ip6OutNoRoutes++; + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev); kfree_skb(skb); return 0; } @@ -1671,7 +1688,8 @@ static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta, rtmsg->rtmsg_dst_len = r->rtm_dst_len; rtmsg->rtmsg_src_len = r->rtm_src_len; rtmsg->rtmsg_flags = RTF_UP; - rtmsg->rtmsg_metric = IP6_RT_PRIO_USER; + if (r->rtm_type == RTN_UNREACHABLE) + rtmsg->rtmsg_flags |= RTF_REJECT; if (rta[RTA_GATEWAY-1]) { if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16)) @@ -1754,7 +1772,12 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, rtm->rtm_src_len = rt->rt6i_src.plen; rtm->rtm_tos = 0; rtm->rtm_table = RT_TABLE_MAIN; - rtm->rtm_type = RTN_UNICAST; + if (rt->rt6i_flags&RTF_REJECT) + rtm->rtm_type = RTN_UNREACHABLE; + else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK)) + rtm->rtm_type = RTN_LOCAL; + else + rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; rtm->rtm_scope = RT_SCOPE_UNIVERSE; #ifdef CONFIG_RTNL_OLD_IFINFO @@ -1795,6 +1818,8 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, if (rt->u.dst.rtt) RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt); mx->rta_len = skb->tail - (u8*)mx; + if (mx->rta_len == RTA_LENGTH(0)) + skb_trim(skb, (u8*)mx - skb->data); #endif if (rt->u.dst.neighbour) RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index f029942df..577b85d0f 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -6,7 +6,7 @@ * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * - * $Id: sit.c,v 1.24 1997/12/13 21:53:17 kuznet Exp $ + * $Id: sit.c,v 1.27 1998/03/08 05:56:57 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f7a080a0d..1d082c195 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: tcp_ipv6.c,v 1.44 1997/12/13 21:53:18 kuznet Exp $ + * $Id: tcp_ipv6.c,v 1.60 1998/03/15 02:59:32 davem Exp $ * * Based on: * linux/net/ipv4/tcp.c @@ -44,7 +44,6 @@ #define ICMP_PARANOIA -extern int sysctl_tcp_sack; extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; @@ -86,62 +85,69 @@ static __inline__ int tcp_v6_sk_hashfn(struct sock *sk) /* Grrr, addr_type already calculated by caller, but I don't want * to add some silly "cookie" argument to this method just for that. + * But it doesn't matter, the recalculation is in the rarest path + * this function ever takes. 
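The rt6_fill_node() hunk above classifies a route for netlink dumps from its flags and device. A small sketch of that decision, using plain enums and a bitfield view in place of the kernel's RTN_, RTF_ and IFF_ constants:

enum route_type { RT_UNICAST, RT_LOCAL, RT_UNREACHABLE };

struct route_view {
        unsigned reject:1;        /* RTF_REJECT set on the route  */
        unsigned dev_loopback:1;  /* bound to a loopback device   */
};

static enum route_type classify(const struct route_view *r)
{
        if (r->reject)
                return RT_UNREACHABLE;  /* reject routes are reported as unreachable */
        if (r->dev_loopback)
                return RT_LOCAL;        /* loopback-bound routes are local           */
        return RT_UNICAST;              /* everything else is ordinary unicast       */
}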
*/ static int tcp_v6_verify_bind(struct sock *sk, unsigned short snum) { - struct sock *sk2; - int addr_type = ipv6_addr_type(&sk->net_pinfo.af_inet6.rcv_saddr); - int retval = 0, sk_reuse = sk->reuse; + struct tcp_bind_bucket *tb; + int result = 0; SOCKHASH_LOCK(); - sk2 = tcp_bound_hash[tcp_sk_bhashfn(sk)]; - for(; sk2 != NULL; sk2 = sk2->bind_next) { - if((sk2->num == snum) && (sk2 != sk)) { - unsigned char state = sk2->state; - int sk2_reuse = sk2->reuse; - if(addr_type == IPV6_ADDR_ANY || (!sk2->rcv_saddr)) { - if((!sk2_reuse) || - (!sk_reuse) || - (state == TCP_LISTEN)) { - retval = 1; - break; - } - } else if(!ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, - &sk2->net_pinfo.af_inet6.rcv_saddr)) { - if((!sk_reuse) || - (!sk2_reuse) || - (state == TCP_LISTEN)) { - retval = 1; - break; + for(tb = tcp_bound_hash[tcp_bhashfn(snum)]; + (tb && (tb->port != snum)); + tb = tb->next) + ; + if(tb && tb->owners) { + /* Fast path for reuse ports, see include/net/tcp.h for a very + * detailed description of why this works, and why it is worth + * the effort at all. -DaveM + */ + if((tb->flags & TCPB_FLAG_FASTREUSE) && + (sk->reuse != 0)) { + goto go_like_smoke; + } else { + struct sock *sk2; + int sk_reuse = sk->reuse; + int addr_type = ipv6_addr_type(&sk->net_pinfo.af_inet6.rcv_saddr); + + /* We must walk the whole port owner list in this case. -DaveM */ + for(sk2 = tb->owners; sk2; sk2 = sk2->bind_next) { + if(!sk_reuse || !sk2->reuse || sk2->state == TCP_LISTEN) { + if(addr_type == IPV6_ADDR_ANY || + !sk2->rcv_saddr || + !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, + &sk2->net_pinfo.af_inet6.rcv_saddr)) + break; } } + if(sk2 != NULL) + result = 1; } } + if((result == 0) && + (tb == NULL) && + (tcp_bucket_create(snum) == NULL)) + result = 1; +go_like_smoke: SOCKHASH_UNLOCK(); - - return retval; + return result; } static void tcp_v6_hash(struct sock *sk) { - unsigned char state; - - SOCKHASH_LOCK(); - state = sk->state; - if(state != TCP_CLOSE) { + if(sk->state != TCP_CLOSE) { struct sock **skp; - if(state == TCP_LISTEN) - skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; - else - skp = &tcp_established_hash[tcp_v6_sk_hashfn(sk)]; + SOCKHASH_LOCK(); + skp = &tcp_established_hash[(sk->hashent = tcp_v6_sk_hashfn(sk))]; if((sk->next = *skp) != NULL) (*skp)->pprev = &sk->next; *skp = sk; sk->pprev = skp; tcp_sk_bindify(sk); + SOCKHASH_UNLOCK(); } - SOCKHASH_UNLOCK(); } static void tcp_v6_unhash(struct sock *sk) @@ -153,6 +159,7 @@ static void tcp_v6_unhash(struct sock *sk) *sk->pprev = sk->next; sk->pprev = NULL; tcp_sk_unbindify(sk); + tcp_reg_zap(sk); } SOCKHASH_UNLOCK(); } @@ -163,29 +170,27 @@ static void tcp_v6_rehash(struct sock *sk) SOCKHASH_LOCK(); state = sk->state; - if(sk->pprev) { + if(sk->pprev != NULL) { if(sk->next) sk->next->pprev = sk->pprev; *sk->pprev = sk->next; sk->pprev = NULL; - tcp_sk_unbindify(sk); + tcp_reg_zap(sk); } if(state != TCP_CLOSE) { struct sock **skp; - if(state == TCP_LISTEN) { + if(state == TCP_LISTEN) skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; - } else { - int hash = tcp_v6_sk_hashfn(sk); - if(state == TCP_TIME_WAIT) - hash += (TCP_HTABLE_SIZE/2); - skp = &tcp_established_hash[hash]; - } + else + skp = &tcp_established_hash[(sk->hashent = tcp_v6_sk_hashfn(sk))]; + if((sk->next = *skp) != NULL) (*skp)->pprev = &sk->next; *skp = sk; sk->pprev = skp; - tcp_sk_bindify(sk); + if(state == TCP_LISTEN) + tcp_sk_bindify(sk); } SOCKHASH_UNLOCK(); } @@ -209,8 +214,12 @@ static struct sock *tcp_v6_lookup_listener(struct in6_addr *daddr, unsigned shor 
return result; } +/* Until this is verified... -DaveM */ +/* #define USE_QUICKSYNS */ + /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM + * It is assumed that this code only gets called from within NET_BH. */ static inline struct sock *__tcp_v6_lookup(struct tcphdr *th, struct in6_addr *saddr, u16 sport, @@ -218,30 +227,53 @@ static inline struct sock *__tcp_v6_lookup(struct tcphdr *th, { unsigned short hnum = ntohs(dport); struct sock *sk; - int hash = tcp_v6_hashfn(daddr, hnum, saddr, sport); + int hash; + +#ifdef USE_QUICKSYNS + /* Incomming connection short-cut. */ + if (th && th->syn == 1 && th->ack == 0) + goto listener_shortcut; +#endif + + /* Check TCP register quick cache first. */ + sk = TCP_RHASH(sport); + if(sk && + sk->num == hnum && /* local port */ + sk->family == AF_INET6 && /* address family */ + sk->dummy_th.dest == sport && /* remote port */ + !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.daddr, saddr) && + !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, daddr)) + goto hit; /* Optimize here for direct hit, only listening connections can - * have wildcards anyways. It is assumed that this code only - * gets called from within NET_BH. + * have wildcards anyways. */ - for(sk = tcp_established_hash[hash]; sk; sk = sk->next) + hash = tcp_v6_hashfn(daddr, hnum, saddr, sport); + for(sk = tcp_established_hash[hash]; sk; sk = sk->next) { /* For IPV6 do the cheaper port and family tests first. */ if(sk->num == hnum && /* local port */ sk->family == AF_INET6 && /* address family */ sk->dummy_th.dest == sport && /* remote port */ !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.daddr, saddr) && - !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, daddr)) + !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, daddr)) { + if (sk->state == TCP_ESTABLISHED) + TCP_RHASH(sport) = sk; goto hit; /* You sunk my battleship! */ - + } + } /* Must check for a TIME_WAIT'er before going to listener hash. */ for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next) if(sk->num == hnum && /* local port */ sk->family == AF_INET6 && /* address family */ - sk->dummy_th.dest == sport && /* remote port */ - !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.daddr, saddr) && - !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, daddr)) - goto hit; - + sk->dummy_th.dest == sport) { /* remote port */ + struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; + if(!ipv6_addr_cmp(&tw->v6_daddr, saddr) && + !ipv6_addr_cmp(&tw->v6_rcv_saddr, daddr)) + goto hit; + } +#ifdef USE_QUICKSYNS +listener_shortcut: +#endif sk = tcp_v6_lookup_listener(daddr, hnum); hit: return sk; @@ -275,6 +307,33 @@ static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb) skb->h.th->source); } +static int tcp_v6_unique_address(struct sock *sk) +{ + struct tcp_bind_bucket *tb; + unsigned short snum = sk->num; + int retval = 1; + + /* Freeze the hash while we snoop around. */ + SOCKHASH_LOCK(); + tb = tcp_bound_hash[tcp_bhashfn(snum)]; + for(; tb; tb = tb->next) { + if(tb->port == snum && tb->owners != NULL) { + /* Almost certainly the re-use port case, search the real hashes + * so it actually scales. 
(we hope that all ipv6 ftp servers will + * use passive ftp, I just cover this case for completeness) + */ + sk = __tcp_v6_lookup(NULL, &sk->net_pinfo.af_inet6.daddr, + sk->dummy_th.dest, + &sk->net_pinfo.af_inet6.rcv_saddr, snum); + if((sk != NULL) && (sk->state != TCP_LISTEN)) + retval = 0; + break; + } + } + SOCKHASH_UNLOCK(); + return retval; +} + static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -390,7 +449,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, ipv6_addr_copy(&np->saddr, saddr); } - /* FIXME: Need to do tcp_v6_unique_address() here! -DaveM */ + sk->dummy_th.dest = usin->sin6_port; + if (!tcp_v6_unique_address(sk)) + return -EADDRNOTAVAIL; /* * Init variables @@ -398,16 +459,15 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, lock_sock(sk); - sk->dummy_th.dest = usin->sin6_port; - sk->write_seq = secure_tcp_sequence_number(np->saddr.s6_addr32[3], + tp->write_seq = secure_tcp_sequence_number(np->saddr.s6_addr32[3], np->daddr.s6_addr32[3], sk->dummy_th.source, sk->dummy_th.dest); tp->snd_wnd = 0; tp->snd_wl1 = 0; - tp->snd_wl2 = sk->write_seq; - tp->snd_una = sk->write_seq; + tp->snd_wl2 = tp->write_seq; + tp->snd_una = tp->write_seq; tp->rcv_nxt = 0; @@ -415,30 +475,35 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, release_sock(sk); - buff = sock_wmalloc(sk, MAX_SYN_SIZE, 0, GFP_KERNEL); - - if (buff == NULL) + buff = sock_wmalloc(sk, (MAX_SYN_SIZE + sizeof(struct sk_buff)), + 0, GFP_KERNEL); + if (buff == NULL) { + /* FIXME: Free route references etc??? */ return(-ENOMEM); + } lock_sock(sk); tcp_v6_build_header(sk, buff); + tp->tcp_header_len = sizeof(struct tcphdr) + + (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0); + /* build the tcp header */ th = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr)); buff->h.th = th; memcpy(th, (void *) &(sk->dummy_th), sizeof(*th)); - buff->seq = sk->write_seq++; + buff->seq = tp->write_seq++; th->seq = htonl(buff->seq); - tp->snd_nxt = sk->write_seq; - buff->end_seq = sk->write_seq; + tp->snd_nxt = tp->write_seq; + buff->end_seq = tp->write_seq; th->ack = 0; th->syn = 1; sk->mtu = dst->pmtu; - sk->mss = sk->mtu - sizeof(struct ipv6hdr) - sizeof(struct tcphdr); + sk->mss = (sk->mtu - sizeof(struct ipv6hdr) - tp->tcp_header_len); if (sk->mss < 1) { printk(KERN_DEBUG "intial ipv6 sk->mss below 1\n"); @@ -457,8 +522,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, * Put in the TCP options to say MTU. */ - tmp = tcp_syn_build_options(buff, sk->mss, sysctl_tcp_sack, - sysctl_tcp_timestamps, + tmp = tcp_syn_build_options(buff, sk->mss, sysctl_tcp_timestamps, sysctl_tcp_window_scaling,tp->rcv_wscale); th->doff = sizeof(*th)/4 + (tmp>>2); buff->csum = 0; @@ -467,9 +531,10 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tcp_set_state(sk, TCP_SYN_SENT); /* Socket identity change complete, no longer - * in TCP_CLOSE, so rehash. + * in TCP_CLOSE, so enter ourselves into the + * hash tables. 
*/ - sk->prot->rehash(sk); + sk->prot->hash(sk); /* FIXME: should use dcache->rtt if availiable */ tp->rto = TCP_TIMEOUT_INIT; @@ -482,12 +547,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tp->packets_out++; buff->when = jiffies; skb1 = skb_clone(buff, GFP_KERNEL); - skb_set_owner_w(skb1, sk); - - tcp_v6_xmit(skb1); + if(skb1 != NULL) { + skb_set_owner_w(skb1, sk); + tcp_v6_xmit(skb1); + } /* Timer for repeating the SYN until an answer */ - tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); tcp_statistics.TcpActiveOpens++; tcp_statistics.TcpOutSegs++; @@ -499,6 +564,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, static int tcp_v6_sendmsg(struct sock *sk, struct msghdr *msg, int len) { + struct tcp_opt *tp; struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; int retval = -EINVAL; @@ -530,7 +596,10 @@ static int tcp_v6_sendmsg(struct sock *sk, struct msghdr *msg, int len) lock_sock(sk); retval = tcp_do_sendmsg(sk, msg->msg_iovlen, msg->msg_iov, msg->msg_flags); - + /* Push out partial tail frames if needed. */ + tp = &(sk->tp_pinfo.af_tcp); + if(tp->send_head && tcp_snd_test(sk, tp->send_head)) + tcp_write_xmit(sk); release_sock(sk); out: @@ -555,7 +624,7 @@ void tcp_v6_err(int type, int code, unsigned char *header, __u32 info, sk = tcp_v6_lookup(daddr, th->dest, saddr, th->source); - if (sk == NULL) { + if (sk == NULL || sk->state == TCP_TIME_WAIT) { /* XXX: Update ICMP error count */ return; } @@ -596,11 +665,14 @@ void tcp_v6_err(int type, int code, unsigned char *header, __u32 info, ip6_dst_store(sk, dst); } - if (sk->dst_cache->error) + if (sk->dst_cache->error) { sk->err_soft = sk->dst_cache->error; - else + } else { + /* FIXME: Reset sk->mss, taking into account TCP option + * bytes for timestamps. -DaveM + */ sk->mtu = sk->dst_cache->pmtu; - + } if (sk->sock_readers) { /* remove later */ printk(KERN_DEBUG "tcp_v6_err: pmtu disc: socket locked.\n"); return; @@ -713,11 +785,10 @@ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req) * match what happens under IPV4. Figure out the right thing to do. */ req->mss = min(sk->mss, req->mss); - - if (req->mss < 1) { - printk(KERN_DEBUG "initial req->mss below 1\n"); - req->mss = 1; - } + if(sk->user_mss) + req->mss = min(req->mss, sk->user_mss); + if(req->tstamp_ok == 0) + req->mss += TCPOLEN_TSTAMP_ALIGNED; if (req->rcv_wnd == 0) { __u8 rcv_wscale; @@ -732,7 +803,7 @@ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req) } th->window = htons(req->rcv_wnd); - tmp = tcp_syn_build_options(skb, req->mss, req->sack_ok, req->tstamp_ok, + tmp = tcp_syn_build_options(skb, req->mss, req->tstamp_ok, req->wscale_ok,req->rcv_wscale); skb->csum = 0; th->doff = (sizeof(*th) + tmp)>>2; @@ -740,9 +811,13 @@ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req) &req->af.v6_req.loc_addr, &req->af.v6_req.rmt_addr, csum_partial((char *)th, sizeof(*th)+tmp, skb->csum)); + /* Actually we should not attach dst to socket in state LISTEN, + it results in stale destination per listen socket and + overflow of routing cache. + (IPv4 has the same flaw with more unpleasant consequences.) 
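The SYN-ACK path above picks the MSS to advertise by clamping to our own MSS and any user-set value, then adds the 12 timestamp-option bytes back when the peer did not negotiate timestamps (sk->mss was computed with those bytes already subtracted). A sketch of that selection; synack_mss is an invented helper name, not a kernel function.

#define TSTAMP_ALIGNED 12     /* TCPOLEN_TSTAMP_ALIGNED */

static int min_int(int a, int b) { return a < b ? a : b; }

static int synack_mss(int our_mss, int peer_mss, int user_mss, int peer_tstamp_ok)
{
        int mss = min_int(our_mss, peer_mss);   /* never advertise more than we handle */

        if (user_mss)
                mss = min_int(mss, user_mss);   /* honour a user-imposed ceiling       */
        if (!peer_tstamp_ok)
                mss += TSTAMP_ALIGNED;          /* reclaim unused timestamp bytes      */
        return mss;
}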
+ */ ip6_dst_store(sk, dst); ip6_xmit(sk, skb, &fl, req->af.v6_req.opt); - dst_release(dst); tcp_statistics.TcpOutSegs++; } @@ -801,14 +876,15 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, req->rcv_isn = skb->seq; req->snt_isn = isn; - tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0; + tp.tstamp_ok = tp.wscale_ok = tp.snd_wscale = 0; tp.in_mss = 536; tcp_parse_options(skb->h.th,&tp,0); - if (tp.saw_tstamp) - req->ts_recent = tp.rcv_tsval; req->mss = tp.in_mss; + if (tp.saw_tstamp) { + req->mss -= TCPOLEN_TSTAMP_ALIGNED; + req->ts_recent = tp.rcv_tsval; + } req->tstamp_ok = tp.tstamp_ok; - req->sack_ok = tp.sack_ok; req->snd_wscale = tp.snd_wscale; req->wscale_ok = tp.wscale_ok; req->rmt_port = skb->h.th->source; @@ -879,92 +955,17 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, return newsk; } - newsk = sk_alloc(AF_INET6, GFP_ATOMIC); + newsk = tcp_create_openreq_child(sk, req, skb); if (newsk == NULL) { - if (dst) - dst_release(dst); + dst_release(dst); return NULL; } - memcpy(newsk, sk, sizeof(*newsk)); - - /* Or else we die! -DaveM */ - newsk->sklist_next = NULL; - - newsk->opt = NULL; newsk->dst_cache = NULL; - skb_queue_head_init(&newsk->write_queue); - skb_queue_head_init(&newsk->receive_queue); - skb_queue_head_init(&newsk->out_of_order_queue); - skb_queue_head_init(&newsk->error_queue); - - /* - * Unused - */ newtp = &(newsk->tp_pinfo.af_tcp); - np = &newsk->net_pinfo.af_inet6; - - newtp->send_head = NULL; - newtp->retrans_head = NULL; - - newtp->pending = 0; - - skb_queue_head_init(&newsk->back_log); - - newsk->prot->init(newsk); - - newtp->snd_cwnd_cnt = 0; -#if 0 /* Don't mess up the initialization we did in the init routine! */ - newtp->snd_ssthresh = 0; -#endif - newtp->backoff = 0; - newsk->proc = 0; - newsk->done = 0; - newsk->pair = NULL; - atomic_set(&newsk->wmem_alloc, 0); - atomic_set(&newsk->rmem_alloc, 0); - newsk->localroute = sk->localroute; - - newsk->err = 0; - newsk->shutdown = 0; - newsk->ack_backlog = 0; - - newtp->fin_seq = req->rcv_isn; - newsk->syn_seq = req->rcv_isn; - newsk->state = TCP_SYN_RECV; - newsk->timeout = 0; - - newsk->write_seq = req->snt_isn; - - newtp->snd_wnd = ntohs(skb->h.th->window); - newtp->max_window = newtp->snd_wnd; - newtp->snd_wl1 = req->rcv_isn; - newtp->snd_wl2 = newsk->write_seq; - newtp->snd_una = newsk->write_seq++; - newtp->snd_nxt = newsk->write_seq; - - newsk->urg_data = 0; - newtp->packets_out = 0; - newtp->retransmits = 0; - newsk->linger=0; - newsk->destroy = 0; - init_timer(&newsk->timer); - newsk->timer.data = (unsigned long) newsk; - newsk->timer.function = &net_timer; - - tcp_init_xmit_timers(newsk); - - newsk->dummy_th.source = sk->dummy_th.source; - newsk->dummy_th.dest = req->rmt_port; - newsk->sock_readers=0; - - newtp->rcv_nxt = req->rcv_isn + 1; - newtp->rcv_wup = req->rcv_isn + 1; - newsk->copied_seq = req->rcv_isn + 1; - - newsk->socket = NULL; + np = &newsk->net_pinfo.af_inet6; ipv6_addr_copy(&np->daddr, &req->af.v6_req.rmt_addr); ipv6_addr_copy(&np->saddr, &req->af.v6_req.loc_addr); ipv6_addr_copy(&np->rcv_saddr, &req->af.v6_req.loc_addr); @@ -987,14 +988,22 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, ip6_dst_store(newsk, dst); - newtp->sack_ok = req->sack_ok; newtp->tstamp_ok = req->tstamp_ok; - newtp->snd_wscale = req->snd_wscale; + newtp->window_clamp = req->window_clamp; + newtp->rcv_wnd = req->rcv_wnd; newtp->wscale_ok = req->wscale_ok; - newtp->ts_recent = req->ts_recent; + if 
(newtp->wscale_ok) { + newtp->snd_wscale = req->snd_wscale; + newtp->rcv_wscale = req->rcv_wscale; + } else { + newtp->snd_wscale = newtp->rcv_wscale = 0; + newtp->window_clamp = min(newtp->window_clamp,65535); + } if (newtp->tstamp_ok) { - newtp->tcp_header_len = sizeof(struct tcphdr) + 12; /* FIXME: define the contant. */ - newsk->dummy_th.doff += 3; + newtp->ts_recent = req->ts_recent; + newtp->ts_recent_stamp = jiffies; + newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + newsk->dummy_th.doff += (TCPOLEN_TSTAMP_ALIGNED >> 2); } else { newtp->tcp_header_len = sizeof(struct tcphdr); } @@ -1006,7 +1015,6 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->mss = min(req->mss+sizeof(struct tcphdr)-newtp->tcp_header_len, (newsk->mtu - sizeof(struct ipv6hdr) - newtp->tcp_header_len)); - /* XXX tp->window_clamp??? -DaveM */ newsk->daddr = LOOPBACK4_IPV6; newsk->saddr = LOOPBACK4_IPV6; @@ -1181,12 +1189,14 @@ int tcp_v6_rcv(struct sk_buff *skb, struct device *dev, goto no_tcp_socket; } - skb->sk = sk; skb->seq = ntohl(th->seq); skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4; skb->ack_seq = ntohl(th->ack_seq); - skb->used = 0; + if(sk->state == TCP_TIME_WAIT) + goto do_time_wait; + + skb->sk = sk; } /* @@ -1249,6 +1259,12 @@ discard_it: kfree_skb(skb); return 0; + +do_time_wait: + if(tcp_timewait_state_process((struct tcp_tw_bucket *)sk, + skb, th, &(IPCB(skb)->opt), skb->len)) + goto no_tcp_socket; + goto discard_it; } static int tcp_v6_rebuild_header(struct sock *sk, struct sk_buff *skb) @@ -1384,51 +1400,34 @@ static struct tcp_func ipv6_mapped = { sizeof(struct sockaddr_in6) }; +/* NOTE: A lot of things set to zero explicitly by call to + * sk_alloc() so need not be done here. + */ static int tcp_v6_init_sock(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - skb_queue_head_init(&sk->out_of_order_queue); + skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); - tp->srtt = 0; tp->rto = TCP_TIMEOUT_INIT; /*TCP_WRITE_TIME*/ tp->mdev = TCP_TIMEOUT_INIT; - - tp->ato = 0; - tp->iat = (HZ/5) << 3; - - /* FIXME: right thing? */ - tp->rcv_wnd = 0; tp->in_mss = 536; - /* tp->rcv_wnd = 8192; */ - tp->tstamp_ok = 0; - tp->sack_ok = 0; - tp->wscale_ok = 0; - tp->snd_wscale = 0; - tp->sacks = 0; - tp->saw_tstamp = 0; - tp->syn_backlog = 0; - - /* start with only sending one packet at a time. */ + + /* See draft-stevens-tcpca-spec-01 for discussion of the + * initialization of these values. + */ tp->snd_cwnd = 1; tp->snd_ssthresh = 0x7fffffff; - - sk->priority = 1; sk->state = TCP_CLOSE; - sk->max_ack_backlog = SOMAXCONN; - sk->mtu = 576; sk->mss = 536; - sk->dummy_th.doff = sizeof(sk->dummy_th)/4; - /* - * Speed up by setting some standard state for the dummy_th. - */ + /* Speed up by setting some standard state for the dummy_th. 
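The tcp_create_openreq_child()-based accept path above copies window-scaling state from the open request into the new socket: if scaling was negotiated the shift values are inherited, otherwise both shifts stay zero and the window clamp falls back to the unscaled 64K-1 limit. A sketch with simplified field names standing in for the kernel's tcp_opt and open_request:

struct req_opts {
        int wscale_ok;
        unsigned char snd_wscale, rcv_wscale;
};

struct child_opts {
        int wscale_ok;
        unsigned char snd_wscale, rcv_wscale;
        unsigned int window_clamp;
};

static void inherit_wscale(struct child_opts *child, const struct req_opts *req)
{
        child->wscale_ok = req->wscale_ok;
        if (child->wscale_ok) {
                /* both sides agreed to scale: copy the negotiated shifts */
                child->snd_wscale = req->snd_wscale;
                child->rcv_wscale = req->rcv_wscale;
        } else {
                /* no scaling: shifts are zero and the window cannot exceed 65535 */
                child->snd_wscale = child->rcv_wscale = 0;
                if (child->window_clamp > 65535)
                        child->window_clamp = 65535;
        }
}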
*/ sk->dummy_th.ack=1; sk->dummy_th.doff=sizeof(struct tcphdr)>>2; @@ -1442,6 +1441,7 @@ static int tcp_v6_init_sock(struct sock *sk) static int tcp_v6_destroy_sock(struct sock *sk) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; tcp_clear_xmit_timers(sk); @@ -1460,15 +1460,22 @@ static int tcp_v6_destroy_sock(struct sock *sk) * Cleans up our, hopefuly empty, out_of_order_queue */ - while((skb = skb_dequeue(&sk->out_of_order_queue)) != NULL) + while((skb = skb_dequeue(&tp->out_of_order_queue)) != NULL) kfree_skb(skb); /* * Release destination entry */ - dst_release(sk->dst_cache); - sk->dst_cache = NULL; + dst_release(xchg(&sk->dst_cache,NULL)); + + /* Clean up a locked TCP bind bucket, this only happens if a + * port is allocated for a socket, but it never fully connects. + * In which case we will find num to be non-zero and daddr to + * be zero. + */ + if(ipv6_addr_any(&(sk->net_pinfo.af_inet6.daddr)) && sk->num != 0) + tcp_bucket_unlock(sk); return 0; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b99dc19e3..40e9b0233 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -7,7 +7,7 @@ * * Based on linux/ipv4/udp.c * - * $Id: udp.c,v 1.21 1997/12/29 19:52:52 kuznet Exp $ + * $Id: udp.c,v 1.24 1998/03/12 03:20:21 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -448,32 +448,43 @@ static struct sock *udp_v6_mcast_next(struct sock *sk, return NULL; } +/* + * Note: called only from the BH handler context, + * so we don't need to lock the hashes. + */ static void udpv6_mcast_deliver(struct udphdr *uh, struct in6_addr *saddr, struct in6_addr *daddr, struct sk_buff *skb) { struct sock *sk, *sk2; + struct sk_buff *buff; - SOCKHASH_LOCK(); sk = udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]; sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr); - if(sk) { - sk2 = sk; - while((sk2 = udp_v6_mcast_next(sk2->next, - uh->dest, saddr, - uh->source, daddr))) { - struct sk_buff *buff = skb_clone(skb, GFP_ATOMIC); - if (buff && sock_queue_rcv_skb(sk2, buff) < 0) { - buff->sk = NULL; - kfree_skb(buff); - } + if (!sk) + goto free_skb; + + buff = NULL; + sk2 = sk; + while((sk2 = udp_v6_mcast_next(sk2->next, uh->dest, saddr, + uh->source, daddr))) { + if (!buff) { + buff = skb_clone(skb, GFP_ATOMIC); + if (!buff) + continue; } + if (sock_queue_rcv_skb(sk2, buff) >= 0) + buff = NULL; + } + if (buff) { + buff->sk = NULL; + kfree_skb(buff); } - if(!sk || sock_queue_rcv_skb(sk, skb) < 0) { + if (sock_queue_rcv_skb(sk, skb) < 0) { + free_skb: skb->sk = NULL; kfree_skb(skb); } - SOCKHASH_UNLOCK(); } int udpv6_rcv(struct sk_buff *skb, struct device *dev, diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index cf56df492..904fa1174 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1713,7 +1713,7 @@ static int ipx_getsockopt(struct socket *sock, int level, int optname, static int ipx_create(struct socket *sock, int protocol) { struct sock *sk; - sk=sk_alloc(AF_IPX, GFP_KERNEL); + sk=sk_alloc(AF_IPX, GFP_KERNEL, 1); if(sk==NULL) return(-ENOMEM); switch(sock->type) diff --git a/net/netbeui/af_netbeui.c b/net/netbeui/af_netbeui.c index 85bd8f4d1..6769edde5 100644 --- a/net/netbeui/af_netbeui.c +++ b/net/netbeui/af_netbeui.c @@ -150,7 +150,7 @@ static int netbeui_listen(struct socket *sock, int backlog) static int netbeui_create(struct socket *sock, int protocol) { netbeui_socket *sk; - sk=(netbeui_socket *)sk_alloc(GFP_KERNEL); + sk=(netbeui_socket *)sk_alloc(GFP_KERNEL, 1); 
if(sk==NULL) return(-ENOBUFS); switch(sock->type) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 3f02f4c3c..8b8e5a4b8 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -147,7 +147,7 @@ static int netlink_create(struct socket *sock, int protocol) sock->ops = &netlink_ops; - sk = sk_alloc(AF_NETLINK, GFP_KERNEL); + sk = sk_alloc(AF_NETLINK, GFP_KERNEL, 1); if (!sk) return -ENOMEM; diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index a84d1fd53..9d8a206da 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -98,7 +98,7 @@ static struct sock *nr_alloc_sock(void) struct sock *sk; nr_cb *nr; - if ((sk = sk_alloc(AF_NETROM, GFP_ATOMIC)) == NULL) + if ((sk = sk_alloc(AF_NETROM, GFP_ATOMIC, 1)) == NULL) return NULL; if ((nr = kmalloc(sizeof(*nr), GFP_ATOMIC)) == NULL) { @@ -759,6 +759,8 @@ static int nr_accept(struct socket *sock, struct socket *newsock, int flags) newsk = skb->sk; newsk->pair = NULL; + newsk->socket = newsock; + newsk->sleep = &newsock->wait; sti(); /* Now attach up the new socket */ diff --git a/net/netsyms.c b/net/netsyms.c index b7809863b..ad51e9a3e 100644 --- a/net/netsyms.c +++ b/net/netsyms.c @@ -244,7 +244,6 @@ EXPORT_SYMBOL(csum_partial_copy_fromiovecend); EXPORT_SYMBOL(__release_sock); EXPORT_SYMBOL(net_timer); /* UDP/TCP exported functions for TCPv6 */ -EXPORT_SYMBOL(sysctl_tcp_sack); EXPORT_SYMBOL(sysctl_tcp_timestamps); EXPORT_SYMBOL(sysctl_tcp_window_scaling); EXPORT_SYMBOL(sock_rspace); @@ -272,11 +271,15 @@ EXPORT_SYMBOL(tcp_slt_array); EXPORT_SYMBOL(__tcp_inc_slow_timer); EXPORT_SYMBOL(tcp_statistics); EXPORT_SYMBOL(tcp_rcv_state_process); +EXPORT_SYMBOL(tcp_timewait_state_process); EXPORT_SYMBOL(tcp_do_sendmsg); EXPORT_SYMBOL(tcp_v4_build_header); EXPORT_SYMBOL(tcp_v4_rebuild_header); EXPORT_SYMBOL(tcp_v4_send_check); EXPORT_SYMBOL(tcp_v4_conn_request); +EXPORT_SYMBOL(tcp_create_openreq_child); +EXPORT_SYMBOL(tcp_bucket_create); +EXPORT_SYMBOL(tcp_bucket_unlock); EXPORT_SYMBOL(tcp_v4_syn_recv_sock); EXPORT_SYMBOL(tcp_v4_do_rcv); EXPORT_SYMBOL(tcp_v4_connect); @@ -290,6 +293,11 @@ EXPORT_SYMBOL(ipv4_specific); EXPORT_SYMBOL(tcp_simple_retransmit); EXPORT_SYMBOL(xrlim_allow); + +EXPORT_SYMBOL(tcp_write_xmit); +EXPORT_SYMBOL(dev_loopback_xmit); +EXPORT_SYMBOL(tcp_regs); + #endif #ifdef CONFIG_NETLINK diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a098f59b9..74fc7af82 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -35,6 +35,7 @@ * Alan Cox : sendmsg/recvmsg support. * Alan Cox : Protocol setting support * Alexey Kuznetsov : Untied from IPv4 stack. + * Cyrus Durgin : Fixed kerneld for kmod. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -54,7 +55,7 @@ #include <linux/netdevice.h> #include <linux/if_packet.h> #include <linux/wireless.h> -#include <linux/kerneld.h> +#include <linux/kmod.h> #include <net/ip.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -710,7 +711,7 @@ static int packet_create(struct socket *sock, int protocol) sock->state = SS_UNCONNECTED; MOD_INC_USE_COUNT; - sk = sk_alloc(AF_PACKET, GFP_KERNEL); + sk = sk_alloc(AF_PACKET, GFP_KERNEL, 1); if (sk == NULL) { MOD_DEC_USE_COUNT; return -ENOBUFS; @@ -831,9 +832,10 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, int len, /* We can't use skb_copy_datagram here */ err = memcpy_toiovec(msg->msg_iov, skb->data, copied); - if (err) + if (err) { + err = -EFAULT; goto out_free; - + } sk->stamp=skb->stamp; if (msg->msg_name) diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index eeb396350..a575402c7 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -149,7 +149,7 @@ static struct sock *rose_alloc_sock(void) struct sock *sk; rose_cb *rose; - if ((sk = sk_alloc(AF_ROSE, GFP_ATOMIC)) == NULL) + if ((sk = sk_alloc(AF_ROSE, GFP_ATOMIC, 1)) == NULL) return NULL; if ((rose = kmalloc(sizeof(*rose), GFP_ATOMIC)) == NULL) { @@ -847,6 +847,8 @@ static int rose_accept(struct socket *sock, struct socket *newsock, int flags) newsk = skb->sk; newsk->pair = NULL; + newsk->socket = newsock; + newsk->sleep = &newsock->wait; sti(); /* Now attach up the new socket */ diff --git a/net/socket.c b/net/socket.c index 5c9534031..dc77ef3e8 100644 --- a/net/socket.c +++ b/net/socket.c @@ -76,8 +76,8 @@ #include <linux/init.h> #include <linux/poll.h> -#if defined(CONFIG_KERNELD) && defined(CONFIG_NET) -#include <linux/kerneld.h> +#if defined(CONFIG_KMOD) && defined(CONFIG_NET) +#include <linux/kmod.h> #endif #include <asm/system.h> @@ -577,7 +577,7 @@ int sock_create(int family, int type, int protocol, struct socket **res) if(family<0||family>=NPROTO) return -EINVAL; -#if defined(CONFIG_KERNELD) && defined(CONFIG_NET) +#if defined(CONFIG_KMOD) && defined(CONFIG_NET) /* Attempt to load a protocol module if the find failed. * * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user @@ -814,7 +814,7 @@ restart: newsock = socki_lookup(inode); if ((err = get_fd(inode)) < 0) - goto out_inval; + goto out_release; newsock->file = current->files->fd[err]; if (upeer_sockaddr) @@ -835,8 +835,6 @@ out: unlock_kernel(); return err; -out_inval: - err = -EINVAL; out_release: sock_release(newsock); goto out_put; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 2fbce16fe..b04072d80 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -364,7 +364,7 @@ static int unix_create(struct socket *sock, int protocol) default: return -ESOCKTNOSUPPORT; } - sk = sk_alloc(AF_UNIX, GFP_KERNEL); + sk = sk_alloc(AF_UNIX, GFP_KERNEL, 1); if (!sk) return -ENOMEM; @@ -1265,7 +1265,9 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, int size } chunk = min(skb->len, size); - /* N.B. This could fail with -EFAULT */ + /* N.B. This could fail with a non-zero value (which means -EFAULT + * and the non-zero value is the number of bytes not copied). 
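Both the af_packet fix and the af_unix comment above deal with the same convention: memcpy_toiovec()-style copy routines return zero on success and otherwise the number of bytes they failed to copy, which the caller folds into -EFAULT. A tiny illustration of the caller-side translation; to_errno is an invented name, not a kernel helper.

#include <errno.h>

/* not_copied is what a memcpy_toiovec()-style routine returns:
 * zero on full success, otherwise the count of bytes left uncopied.
 */
static int to_errno(unsigned long not_copied)
{
        return not_copied ? -EFAULT : 0;
}

The packet_recvmsg() hunk above performs exactly this translation before jumping to its error exit, instead of returning the raw byte count to user space.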
+ */ memcpy_toiovec(msg->msg_iov, skb->data, chunk); copied += chunk; size -= chunk; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 7e3c9cae2..a85aeea5f 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -423,7 +423,7 @@ static struct sock *x25_alloc_socket(void) struct sock *sk; x25_cb *x25; - if ((sk = sk_alloc(AF_X25, GFP_ATOMIC)) == NULL) + if ((sk = sk_alloc(AF_X25, GFP_ATOMIC, 1)) == NULL) return NULL; if ((x25 = kmalloc(sizeof(*x25), GFP_ATOMIC)) == NULL) { |
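Finally, the sk_alloc() calls changed throughout this merge gain a third argument; judging by the tcp_v6_init_sock() comment above ("a lot of things set to zero explicitly by call to sk_alloc()"), passing 1 asks the allocator to hand back a zeroed sock so each protocol no longer clears every field by hand. A user-space analogue of an allocator with such a zero_it flag; obj and obj_alloc are illustrative names, not the kernel API.

#include <stdlib.h>
#include <string.h>

struct obj {
        int family;
        /* ... many more fields that should start out as zero ... */
};

static struct obj *obj_alloc(int family, int zero_it)
{
        struct obj *o = malloc(sizeof(*o));

        if (o == NULL)
                return NULL;
        if (zero_it)
                memset(o, 0, sizeof(*o));   /* caller can skip per-field init */
        o->family = family;
        return o;
}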