Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/af_inet.c         |  57
-rw-r--r-- | net/ipv4/arp.c             |  84
-rw-r--r-- | net/ipv4/devinet.c         |  34
-rw-r--r-- | net/ipv4/fib_frontend.c    |   4
-rw-r--r-- | net/ipv4/fib_hash.c        |  62
-rw-r--r-- | net/ipv4/fib_rules.c       |  40
-rw-r--r-- | net/ipv4/icmp.c            |  23
-rw-r--r-- | net/ipv4/igmp.c            | 102
-rw-r--r-- | net/ipv4/ip_fragment.c     |  27
-rw-r--r-- | net/ipv4/ip_input.c        | 260
-rw-r--r-- | net/ipv4/ip_masq_mfw.c     |   4
-rw-r--r-- | net/ipv4/ip_masq_quake.c   |   4
-rw-r--r-- | net/ipv4/ip_masq_vdolive.c |   6
-rw-r--r-- | net/ipv4/ip_options.c      |   3
-rw-r--r-- | net/ipv4/ipconfig.c        |   8
-rw-r--r-- | net/ipv4/ipmr.c            |  11
-rw-r--r-- | net/ipv4/proc.c            |  15
-rw-r--r-- | net/ipv4/raw.c             |  94
-rw-r--r-- | net/ipv4/route.c           |  97
-rw-r--r-- | net/ipv4/tcp.c             | 121
-rw-r--r-- | net/ipv4/tcp_input.c       | 155
-rw-r--r-- | net/ipv4/tcp_ipv4.c        | 220
-rw-r--r-- | net/ipv4/tcp_output.c      |  11
-rw-r--r-- | net/ipv4/tcp_timer.c       | 165
-rw-r--r-- | net/ipv4/timer.c           |  17
-rw-r--r-- | net/ipv4/udp.c             | 128
-rw-r--r-- | net/ipv4/utils.c           |   7
27 files changed, 1093 insertions, 666 deletions
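
The common thread through the diffs below is locking: global start_bh_atomic()/end_bh_atomic() critical sections are replaced with per-structure protection (the neighbour table's read/write lock in arp.c, fib_hash_lock, fib_rules_lock, ip_mc_lock, ipfrag_lock, rt_hash_lock, and read-side uses of dev_base_lock). The following is a rough userspace sketch of that pattern, not kernel code: pthread_rwlock_t stands in for the kernel's rwlock_t, and node, table_lock, lookup and insert are illustrative names that do not appear in the patch.

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct node {
            unsigned int key;
            struct node *next;
    };

    #define HASHSZ 16
    static struct node *hash_table[HASHSZ];
    static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;

    /* Readers (the common case, e.g. lookups from packet processing)
     * take the lock shared, so they no longer exclude one another the
     * way a single global BH-atomic section did.
     */
    static int lookup(unsigned int key)
    {
            struct node *n;
            int found = 0;

            pthread_rwlock_rdlock(&table_lock);
            for (n = hash_table[key % HASHSZ]; n; n = n->next) {
                    if (n->key == key) {
                            found = 1;
                            break;
                    }
            }
            pthread_rwlock_unlock(&table_lock);
            return found;
    }

    /* Writers (insert/unlink, the rare case) take the lock exclusively,
     * which is all the old start_bh_atomic()/end_bh_atomic() pair really
     * guaranteed, but now scoped to this one table.
     */
    static void insert(struct node *n)
    {
            struct node **head = &hash_table[n->key % HASHSZ];

            pthread_rwlock_wrlock(&table_lock);
            n->next = *head;
            *head = n;
            pthread_rwlock_unlock(&table_lock);
    }

    int main(void)
    {
            struct node *n = malloc(sizeof(*n));

            n->key = 42;
            insert(n);
            printf("lookup(42) -> %d\n", lookup(42));
            free(n);
            return 0;
    }

The design point the patch keeps making is that lookups dominate, so a read/write lock lets the hot read paths run concurrently while insert, delete and rehash operations pay the exclusive-lock cost.
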
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 70fcf4024..ca0f27d0c 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -5,7 +5,7 @@ * * PF_INET protocol family socket handler. * - * Version: $Id: af_inet.c,v 1.87 1999/04/22 10:07:33 davem Exp $ + * Version: $Id: af_inet.c,v 1.91 1999/06/09 08:28:55 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -147,22 +147,17 @@ static __inline__ void kill_sk_queues(struct sock *sk) struct sk_buff *skb; /* First the read buffer. */ - while((skb = skb_dequeue(&sk->receive_queue)) != NULL) { - /* This will take care of closing sockets that were - * listening and didn't accept everything. - */ - if (skb->sk != NULL && skb->sk != sk) - skb->sk->prot->close(skb->sk, 0); + while((skb = skb_dequeue(&sk->receive_queue)) != NULL) kfree_skb(skb); - } /* Next, the error queue. */ while((skb = skb_dequeue(&sk->error_queue)) != NULL) kfree_skb(skb); - /* Now the backlog. */ - while((skb=skb_dequeue(&sk->back_log)) != NULL) - kfree_skb(skb); + /* It is _impossible_ for the backlog to contain anything + * when we get here. All user references to this socket + * have gone away, only the net layer knows can touch it. + */ } static __inline__ void kill_sk_now(struct sock *sk) @@ -195,14 +190,19 @@ static __inline__ void kill_sk_later(struct sock *sk) sk->destroy = 1; sk->ack_backlog = 0; - release_sock(sk); + bh_unlock_sock(sk); net_reset_timer(sk, TIME_DESTROY, SOCK_DESTROY_TIME); } +/* Callers must hold the BH spinlock. + * + * At this point, there should be no process reference to this + * socket, and thus no user references at all. Therefore we + * can assume the socket waitqueue is inactive and nobody will + * try to jump onto it. + */ void destroy_sock(struct sock *sk) { - lock_sock(sk); /* just to be safe. */ - /* Now we can no longer get new packets or once the * timers are killed, send them. */ @@ -213,12 +213,6 @@ void destroy_sock(struct sock *sk) kill_sk_queues(sk); - /* Now if it has a half accepted/ closed socket. */ - if (sk->pair) { - sk->pair->prot->close(sk->pair, 0); - sk->pair = NULL; - } - /* Now if everything is gone we can free the socket * structure, otherwise we need to keep it around until * everything is gone. @@ -284,6 +278,14 @@ static int inet_autobind(struct sock *sk) return 0; } +/* Listening INET sockets never sleep to wait for memory, so + * it is completely silly to wake them up on queue space + * available events. So we hook them up to this dummy callback. + */ +static void inet_listen_write_space(struct sock *sk) +{ +} + /* * Move a socket into listening state. 
*/ @@ -310,6 +312,7 @@ int inet_listen(struct socket *sock, int backlog) dst_release(xchg(&sk->dst_cache, NULL)); sk->prot->rehash(sk); add_to_prot_sklist(sk); + sk->write_space = inet_listen_write_space; } sk->socket->flags |= SO_ACCEPTCON; return(0); @@ -368,7 +371,7 @@ static int inet_create(struct socket *sock, int protocol) if (protocol && protocol != IPPROTO_UDP) goto free_and_noproto; protocol = IPPROTO_UDP; - sk->no_check = UDP_NO_CHECK; + sk->no_check = UDP_CSUM_DEFAULT; sk->ip_pmtudisc = IP_PMTUDISC_DONT; prot=&udp_prot; sock->ops = &inet_dgram_ops; @@ -578,7 +581,7 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, static void inet_wait_for_connect(struct sock *sk) { - struct wait_queue wait = { current, NULL }; + DECLARE_WAITQUEUE(wait, current); add_wait_queue(sk->sleep, &wait); current->state = TASK_INTERRUPTIBLE; @@ -684,14 +687,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags) if (sk1->prot->accept == NULL) goto do_err; - /* Restore the state if we have been interrupted, and then returned. */ - if (sk1->pair != NULL) { - sk2 = sk1->pair; - sk1->pair = NULL; - } else { - if((sk2 = sk1->prot->accept(sk1,flags)) == NULL) - goto do_sk1_err; - } + if((sk2 = sk1->prot->accept(sk1,flags)) == NULL) + goto do_sk1_err; /* * We've been passed an extra socket. diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 2c311f233..a3ca88701 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1,6 +1,6 @@ /* linux/net/inet/arp.c * - * Version: $Id: arp.c,v 1.77 1999/03/21 05:22:30 davem Exp $ + * Version: $Id: arp.c,v 1.78 1999/06/09 10:10:36 davem Exp $ * * Copyright (C) 1994 by Florian La Roche * @@ -119,6 +119,11 @@ #include <asm/system.h> #include <asm/uaccess.h> +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) +static char *ax2asc2(ax25_address *a, char *buf); +#endif + + /* * Interface to generic neighbour cache. */ @@ -304,7 +309,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) u8 *dst_ha = NULL; struct device *dev = neigh->dev; u32 target = *(u32*)neigh->primary_key; - int probes = neigh->probes; + int probes = atomic_read(&neigh->probes); if (skb && inet_addr_type(skb->nh.iph->saddr) == RTN_LOCAL) saddr = skb->nh.iph->saddr; @@ -315,6 +320,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) if (!(neigh->nud_state&NUD_VALID)) printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n"); dst_ha = neigh->ha; + read_lock_bh(&neigh->lock); } else if ((probes -= neigh->parms->app_probes) < 0) { #ifdef CONFIG_ARPD neigh_app_ns(neigh); @@ -324,6 +330,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, dst_ha, dev->dev_addr, NULL); + if (dst_ha) + read_unlock_bh(&neigh->lock); } /* OBSOLETE FUNCTIONS */ @@ -372,29 +380,25 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) if (arp_set_predefined(inet_addr_type(paddr), haddr, paddr, dev)) return 0; - start_bh_atomic(); n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); if (n) { n->used = jiffies; if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) { - memcpy(haddr, n->ha, dev->addr_len); + read_lock_bh(&n->lock); + memcpy(haddr, n->ha, dev->addr_len); + read_unlock_bh(&n->lock); neigh_release(n); - end_bh_atomic(); return 0; } + neigh_release(n); } else kfree_skb(skb); - neigh_release(n); - end_bh_atomic(); return 1; } /* END OF OBSOLETE FUNCTIONS */ -/* - * Note: requires bh_atomic locking. 
- */ int arp_bind_neighbour(struct dst_entry *dst) { struct device *dev = dst->dev; @@ -672,7 +676,8 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) (addr_type == RTN_UNICAST && rt->u.dst.dev != dev && (IN_DEV_PROXY_ARP(in_dev) || pneigh_lookup(&arp_tbl, &tip, dev, 0)))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); - neigh_release(n); + if (n) + neigh_release(n); if (skb->stamp.tv_sec == 0 || skb->pkt_type == PACKET_HOST || @@ -785,7 +790,6 @@ int arp_req_set(struct arpreq *r, struct device * dev) return -EINVAL; err = -ENOBUFS; - start_bh_atomic(); neigh = __neigh_lookup(&arp_tbl, &ip, dev, 1); if (neigh) { unsigned state = NUD_STALE; @@ -795,7 +799,6 @@ int arp_req_set(struct arpreq *r, struct device * dev) r->arp_ha.sa_data : NULL, state, 1, 0); neigh_release(neigh); } - end_bh_atomic(); return err; } @@ -819,17 +822,17 @@ static int arp_req_get(struct arpreq *r, struct device *dev) struct neighbour *neigh; int err = -ENXIO; - start_bh_atomic(); - neigh = __neigh_lookup(&arp_tbl, &ip, dev, 0); + neigh = neigh_lookup(&arp_tbl, &ip, dev); if (neigh) { + read_lock_bh(&neigh->lock); memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len); + r->arp_flags = arp_state_to_flags(neigh); + read_unlock_bh(&neigh->lock); r->arp_ha.sa_family = dev->type; strncpy(r->arp_dev, dev->name, sizeof(r->arp_dev)); - r->arp_flags = arp_state_to_flags(neigh); neigh_release(neigh); err = 0; } - end_bh_atomic(); return err; } @@ -867,14 +870,12 @@ int arp_req_delete(struct arpreq *r, struct device * dev) return -EINVAL; } err = -ENXIO; - start_bh_atomic(); - neigh = __neigh_lookup(&arp_tbl, &ip, dev, 0); + neigh = neigh_lookup(&arp_tbl, &ip, dev); if (neigh) { if (neigh->nud_state&~NUD_NOARP) err = neigh_update(neigh, NULL, NUD_FAILED, 1, 0); neigh_release(neigh); } - end_bh_atomic(); return err; } @@ -961,16 +962,16 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy char hbuffer[HBUFFERLEN]; int i,j,k; const char hexbuf[] = "0123456789ABCDEF"; + char abuf[16]; size = sprintf(buffer,"IP address HW type Flags HW address Mask Device\n"); pos+=size; len+=size; - neigh_table_lock(&arp_tbl); - - for(i=0; i<=NEIGH_HASHMASK; i++) { + for(i=0; i<=NEIGH_HASHMASK; i++) { struct neighbour *n; + read_lock_bh(&arp_tbl.lock); for (n=arp_tbl.hash_buckets[i]; n; n=n->next) { struct device *dev = n->dev; int hatype = dev->type; @@ -979,17 +980,14 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy if (!(n->nud_state&~NUD_NOARP)) continue; - /* I'd get great pleasure deleting - this ugly code. Let's output it in hexadecimal format. - "arp" utility will eventually repaired --ANK - */ -#if 1 /* UGLY CODE */ + read_lock(&n->lock); + /* * Convert hardware address to XX:XX:XX:XX ... form. 
*/ #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM) - strcpy(hbuffer,ax2asc((ax25_address *)n->ha)); + ax2asc2((ax25_address *)n->ha, hbuffer); else { #endif for (k=0,j=0;k<HBUFFERLEN-3 && j<dev->addr_len;j++) { @@ -998,37 +996,33 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy hbuffer[k++]=':'; } hbuffer[--k]=0; - + #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) } #endif -#else - if ((neigh->nud_state&NUD_VALID) && dev->addr_len) { - int j; - for (j=0; j < dev->addr_len; j++) - sprintf(hbuffer+2*j, "%02x", neigh->ha[j]); - } else - sprintf(hbuffer, "0"); -#endif size = sprintf(buffer+len, "%-17s0x%-10x0x%-10x%s", - in_ntoa(*(u32*)n->primary_key), + in_ntoa2(*(u32*)n->primary_key, abuf), hatype, arp_state_to_flags(n), hbuffer); size += sprintf(buffer+len+size, " %-17s %s\n", "*", dev->name); + read_unlock(&n->lock); len += size; pos += size; if (pos <= offset) len=0; - if (pos >= offset+length) - goto done; + if (pos >= offset+length) { + read_unlock_bh(&arp_tbl.lock); + goto done; + } } + read_unlock_bh(&arp_tbl.lock); } for (i=0; i<=PNEIGH_HASHMASK; i++) { @@ -1039,7 +1033,7 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy size = sprintf(buffer+len, "%-17s0x%-10x0x%-10x%s", - in_ntoa(*(u32*)n->key), + in_ntoa2(*(u32*)n->key, abuf), hatype, ATF_PUBL|ATF_PERM, "00:00:00:00:00:00"); @@ -1058,7 +1052,6 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy } done: - neigh_table_unlock(&arp_tbl); *start = buffer+len-(pos-offset); /* Start of wanted data */ len = pos-offset; /* Start slop */ @@ -1117,14 +1110,13 @@ __initfunc(void arp_init (void)) } -#ifdef CONFIG_AX25_MODULE +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) /* * ax25 -> ASCII conversion */ -char *ax2asc(ax25_address *a) +char *ax2asc2(ax25_address *a, char *buf) { - static char buf[11]; char c, *s; int n; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index c8b0fbbc8..ff2c930d1 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1,7 +1,7 @@ /* * NET3 IP device support routines. 
* - * Version: $Id: devinet.c,v 1.28 1999/05/08 20:00:16 davem Exp $ + * Version: $Id: devinet.c,v 1.32 1999/06/09 11:15:33 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -607,41 +607,39 @@ inet_gifconf(struct device *dev, char *buf, int len) { struct in_device *in_dev = dev->ip_ptr; struct in_ifaddr *ifa; - struct ifreq ifr; + struct ifreq *ifr = (struct ifreq *) buf; int done=0; if (in_dev==NULL || (ifa=in_dev->ifa_list)==NULL) return 0; for ( ; ifa; ifa = ifa->ifa_next) { - if (!buf) { + if (!ifr) { done += sizeof(ifr); continue; } if (len < (int) sizeof(ifr)) return done; - memset(&ifr, 0, sizeof(struct ifreq)); + memset(ifr, 0, sizeof(struct ifreq)); if (ifa->ifa_label) - strcpy(ifr.ifr_name, ifa->ifa_label); + strcpy(ifr->ifr_name, ifa->ifa_label); else - strcpy(ifr.ifr_name, dev->name); + strcpy(ifr->ifr_name, dev->name); - (*(struct sockaddr_in *) &ifr.ifr_addr).sin_family = AF_INET; - (*(struct sockaddr_in *) &ifr.ifr_addr).sin_addr.s_addr = ifa->ifa_local; + (*(struct sockaddr_in *) &ifr->ifr_addr).sin_family = AF_INET; + (*(struct sockaddr_in *) &ifr->ifr_addr).sin_addr.s_addr = ifa->ifa_local; - if (copy_to_user(buf, &ifr, sizeof(struct ifreq))) - return -EFAULT; - buf += sizeof(struct ifreq); + ifr++; len -= sizeof(struct ifreq); done += sizeof(struct ifreq); } return done; } -u32 inet_select_addr(struct device *dev, u32 dst, int scope) +u32 inet_select_addr(const struct device *dev, u32 dst, int scope) { u32 addr = 0; - struct in_device *in_dev = dev->ip_ptr; + const struct in_device *in_dev = dev->ip_ptr; if (in_dev == NULL) return 0; @@ -661,15 +659,19 @@ u32 inet_select_addr(struct device *dev, u32 dst, int scope) in this case. It is importnat that lo is the first interface in dev_base list. */ + read_lock(&dev_base_lock); for (dev=dev_base; dev; dev=dev->next) { if ((in_dev=dev->ip_ptr) == NULL) continue; for_primary_ifa(in_dev) { - if (ifa->ifa_scope <= scope) + if (ifa->ifa_scope <= scope) { + read_unlock(&dev_base_lock); return ifa->ifa_local; + } } endfor_ifa(in_dev); } + read_unlock(&dev_base_lock); return 0; } @@ -790,6 +792,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) s_idx = cb->args[0]; s_ip_idx = ip_idx = cb->args[1]; + read_lock(&dev_base_lock); for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { if (idx < s_idx) continue; @@ -807,6 +810,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) } } done: + read_unlock(&dev_base_lock); cb->args[0] = idx; cb->args[1] = ip_idx; @@ -881,11 +885,13 @@ void inet_forward_change() ipv4_devconf.accept_redirects = !on; ipv4_devconf_dflt.forwarding = on; + read_lock(&dev_base_lock); for (dev = dev_base; dev; dev = dev->next) { struct in_device *in_dev = dev->ip_ptr; if (in_dev) in_dev->cnf.forwarding = on; } + read_unlock(&dev_base_lock); rt_cache_flush(0); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index a17470483..d57d4daa9 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: FIB frontend. 
* - * Version: $Id: fib_frontend.c,v 1.15 1999/03/21 05:22:31 davem Exp $ + * Version: $Id: fib_frontend.c,v 1.16 1999/06/09 10:10:42 davem Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * @@ -123,13 +123,11 @@ fib_get_procinfo(char *buffer, char **start, off_t offset, int length, int dummy first = 0; } - /* rtnl_shlock(); -- it is pointless at the moment --ANK */ if (main_table && count > 0) { int n = main_table->tb_get_info(main_table, ptr, first, count); count -= n; ptr += n*128; } - /* rtnl_shunlock(); */ len = ptr - *start; if (len >= length) return length; diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index d9e029cef..0472f6118 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -5,7 +5,7 @@ * * IPv4 FIB: lookup engine and maintenance routines. * - * Version: $Id: fib_hash.c,v 1.8 1999/03/25 10:04:17 davem Exp $ + * Version: $Id: fib_hash.c,v 1.10 1999/06/09 10:10:45 davem Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * @@ -145,13 +145,16 @@ extern __inline__ int fn_key_leq(fn_key_t a, fn_key_t b) return a.datum <= b.datum; } +static rwlock_t fib_hash_lock = RW_LOCK_UNLOCKED; + #define FZ_MAX_DIVISOR 1024 #ifdef CONFIG_IP_ROUTE_LARGE_TABLES +/* The fib hash lock must be held when this is called. */ static __inline__ void fn_rebuild_zone(struct fn_zone *fz, - struct fib_node **old_ht, - int old_divisor) + struct fib_node **old_ht, + int old_divisor) { int i; struct fib_node *f, **fp, *next; @@ -198,13 +201,13 @@ static void fn_rehash_zone(struct fn_zone *fz) if (ht) { memset(ht, 0, new_divisor*sizeof(struct fib_node*)); - start_bh_atomic(); + write_lock_bh(&fib_hash_lock); old_ht = fz->fz_hash; fz->fz_hash = ht; fz->fz_hashmask = new_hashmask; fz->fz_divisor = new_divisor; fn_rebuild_zone(fz, old_ht, old_divisor); - end_bh_atomic(); + write_unlock_bh(&fib_hash_lock); kfree(old_ht); } } @@ -246,6 +249,7 @@ fn_new_zone(struct fn_hash *table, int z) for (i=z+1; i<=32; i++) if (table->fn_zones[i]) break; + write_lock_bh(&fib_hash_lock); if (i>32) { /* No more specific masks, we are the first. 
*/ fz->fz_next = table->fn_zone_list; @@ -255,6 +259,7 @@ fn_new_zone(struct fn_hash *table, int z) table->fn_zones[i]->fz_next = fz; } table->fn_zones[z] = fz; + write_unlock_bh(&fib_hash_lock); return fz; } @@ -265,6 +270,7 @@ fn_hash_lookup(struct fib_table *tb, const struct rt_key *key, struct fib_result struct fn_zone *fz; struct fn_hash *t = (struct fn_hash*)tb->tb_data; + read_lock(&fib_hash_lock); for (fz = t->fn_zone_list; fz; fz = fz->fz_next) { struct fib_node *f; fn_key_t k = fz_key(key->dst, fz); @@ -293,13 +299,16 @@ fn_hash_lookup(struct fib_table *tb, const struct rt_key *key, struct fib_result res->scope = f->fn_scope; res->prefixlen = fz->fz_order; res->prefix = &fz_prefix(f->fn_key, fz); - return 0; + goto out; } if (err < 0) - return err; + goto out; } } - return 1; + err = 1; +out: + read_unlock(&fib_hash_lock); + return err; } static int fn_hash_last_dflt=-1; @@ -344,6 +353,7 @@ fn_hash_select_default(struct fib_table *tb, const struct rt_key *key, struct fi last_resort = NULL; order = -1; + read_lock(&fib_hash_lock); for (f = fz->fz_hash[0]; f; f = f->fn_next) { struct fib_info *next_fi = FIB_INFO(f); @@ -364,7 +374,7 @@ fn_hash_select_default(struct fib_table *tb, const struct rt_key *key, struct fi } else if (!fib_detect_death(fi, order, &last_resort, &last_idx)) { res->fi = fi; fn_hash_last_dflt = order; - return; + goto out; } fi = next_fi; order++; @@ -372,18 +382,20 @@ fn_hash_select_default(struct fib_table *tb, const struct rt_key *key, struct fi if (order<=0 || fi==NULL) { fn_hash_last_dflt = -1; - return; + goto out; } if (!fib_detect_death(fi, order, &last_resort, &last_idx)) { res->fi = fi; fn_hash_last_dflt = order; - return; + goto out; } if (last_idx >= 0) res->fi = last_resort; fn_hash_last_dflt = last_idx; +out: + read_unlock(&fib_hash_lock); } #define FIB_SCAN(f, fp) \ @@ -457,6 +469,7 @@ rta->rta_prefsrc ? *(u32*)rta->rta_prefsrc : 0); fp = fz_chain_p(key, fz); + /* * Scan list to find the first route with the same destination */ @@ -560,14 +573,17 @@ replace: */ new_f->fn_next = f; + write_lock_bh(&fib_hash_lock); *fp = new_f; + write_unlock_bh(&fib_hash_lock); fz->fz_nent++; if (del_fp) { f = *del_fp; /* Unlink replaced node */ + write_lock_bh(&fib_hash_lock); *del_fp = f->fn_next; - synchronize_bh(); + write_unlock_bh(&fib_hash_lock); if (!(f->fn_state&FN_S_ZOMBIE)) rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req); @@ -619,11 +635,13 @@ FTprint("tb(%d)_delete: %d %08x/%d %d\n", tb->tb_id, r->rtm_type, rta->rta_dst ? fp = fz_chain_p(key, fz); + FIB_SCAN(f, fp) { if (fn_key_eq(f->fn_key, key)) break; - if (fn_key_leq(key, f->fn_key)) + if (fn_key_leq(key, f->fn_key)) { return -ESRCH; + } } #ifdef CONFIG_IP_ROUTE_TOS FIB_SCAN_KEY(f, fp, key) { @@ -637,9 +655,9 @@ FTprint("tb(%d)_delete: %d %08x/%d %d\n", tb->tb_id, r->rtm_type, rta->rta_dst ? FIB_SCAN_TOS(f, fp, key, tos) { struct fib_info * fi = FIB_INFO(f); - if (f->fn_state&FN_S_ZOMBIE) + if (f->fn_state&FN_S_ZOMBIE) { return -ESRCH; - + } matched++; if (del_fp == NULL && @@ -655,8 +673,9 @@ FTprint("tb(%d)_delete: %d %08x/%d %d\n", tb->tb_id, r->rtm_type, rta->rta_dst ? 
rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req); if (matched != 1) { + write_lock_bh(&fib_hash_lock); *del_fp = f->fn_next; - synchronize_bh(); + write_unlock_bh(&fib_hash_lock); if (f->fn_state&FN_S_ACCESSED) rt_cache_flush(-1); @@ -687,8 +706,9 @@ fn_flush_list(struct fib_node ** fp, int z, struct fn_hash *table) struct fib_info *fi = FIB_INFO(f); if (fi && ((f->fn_state&FN_S_ZOMBIE) || (fi->fib_flags&RTNH_F_DEAD))) { + write_lock_bh(&fib_hash_lock); *fp = f->fn_next; - synchronize_bh(); + write_unlock_bh(&fib_hash_lock); fn_free_node(f); found++; @@ -727,6 +747,7 @@ static int fn_hash_get_info(struct fib_table *tb, char *buffer, int first, int c int pos = 0; int n = 0; + read_lock(&fib_hash_lock); for (fz=table->fn_zone_list; fz; fz = fz->fz_next) { int i; struct fib_node *f; @@ -752,10 +773,12 @@ static int fn_hash_get_info(struct fib_table *tb, char *buffer, int first, int c FZ_MASK(fz), buffer); buffer += 128; if (++n >= count) - return n; + goto out; } } } +out: + read_unlock(&fib_hash_lock); return n; } #endif @@ -818,15 +841,18 @@ static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin struct fn_hash *table = (struct fn_hash*)tb->tb_data; s_m = cb->args[1]; + read_lock(&fib_hash_lock); for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) { if (m < s_m) continue; if (m > s_m) memset(&cb->args[2], 0, sizeof(cb->args) - 2*sizeof(cb->args[0])); if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) { cb->args[1] = m; + read_unlock(&fib_hash_lock); return -1; } } + read_unlock(&fib_hash_lock); cb->args[1] = m; return skb->len; } diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 868c44c31..97074198e 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: policy rules. 
* - * Version: $Id: fib_rules.c,v 1.9 1999/03/25 10:04:23 davem Exp $ + * Version: $Id: fib_rules.c,v 1.11 1999/06/09 10:10:47 davem Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * @@ -79,12 +79,14 @@ static struct fib_rule main_rule = { &default_rule, 0x7FFE, RT_TABLE_MAIN, RTN_U static struct fib_rule local_rule = { &main_rule, 0, RT_TABLE_LOCAL, RTN_UNICAST, }; static struct fib_rule *fib_rules = &local_rule; +static rwlock_t fib_rules_lock = RW_LOCK_UNLOCKED; int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { struct rtattr **rta = arg; struct rtmsg *rtm = NLMSG_DATA(nlh); struct fib_rule *r, **rp; + int err = -ESRCH; for (rp=&fib_rules; (r=*rp) != NULL; rp=&r->r_next) { if ((!rta[RTA_SRC-1] || memcmp(RTA_DATA(rta[RTA_SRC-1]), &r->r_src, 4) == 0) && @@ -99,18 +101,20 @@ int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) (!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) && (!rta[RTA_IIF-1] || strcmp(RTA_DATA(rta[RTA_IIF-1]), r->r_ifname) == 0) && (!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) { + err = -EPERM; if (r == &local_rule) - return -EPERM; + break; + write_lock_bh(&fib_rules_lock); *rp = r->r_next; - synchronize_bh(); - + write_unlock_bh(&fib_rules_lock); if (r != &default_rule && r != &main_rule) kfree(r); - return 0; + err = 0; + break; } } - return -ESRCH; + return err; } /* Allocate new unique table id */ @@ -205,7 +209,9 @@ int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) } new_r->r_next = r; + write_lock_bh(&fib_rules_lock); *rp = new_r; + write_unlock_bh(&fib_rules_lock); return 0; } @@ -250,8 +256,11 @@ static void fib_rules_detach(struct device *dev) struct fib_rule *r; for (r=fib_rules; r; r=r->r_next) { - if (r->r_ifindex == dev->ifindex) + if (r->r_ifindex == dev->ifindex) { + write_lock_bh(&fib_rules_lock); r->r_ifindex = -1; + write_unlock_bh(&fib_rules_lock); + } } } @@ -260,8 +269,11 @@ static void fib_rules_attach(struct device *dev) struct fib_rule *r; for (r=fib_rules; r; r=r->r_next) { - if (r->r_ifindex == -1 && strcmp(dev->name, r->r_ifname) == 0) + if (r->r_ifindex == -1 && strcmp(dev->name, r->r_ifname) == 0) { + write_lock_bh(&fib_rules_lock); r->r_ifindex = dev->ifindex; + write_unlock_bh(&fib_rules_lock); + } } } @@ -275,6 +287,7 @@ int fib_lookup(const struct rt_key *key, struct fib_result *res) u32 saddr = key->src; FRprintk("Lookup: %08x <- %08x ", key->dst, key->src); + read_lock(&fib_rules_lock); for (r = fib_rules; r; r=r->r_next) { if (((saddr^r->r_src) & r->r_srcmask) || ((daddr^r->r_dst) & r->r_dstmask) || @@ -294,11 +307,14 @@ FRprintk("tb %d r %d ", r->r_table, r->r_action); policy = r; break; case RTN_UNREACHABLE: + read_unlock(&fib_rules_lock); return -ENETUNREACH; default: case RTN_BLACKHOLE: + read_unlock(&fib_rules_lock); return -EINVAL; case RTN_PROHIBIT: + read_unlock(&fib_rules_lock); return -EACCES; } @@ -308,12 +324,16 @@ FRprintk("tb %d r %d ", r->r_table, r->r_action); if (err == 0) { FRprintk("ok\n"); res->r = policy; + read_unlock(&fib_rules_lock); return 0; } - if (err < 0 && err != -EAGAIN) + if (err < 0 && err != -EAGAIN) { + read_unlock(&fib_rules_lock); return err; + } } FRprintk("FAILURE\n"); + read_unlock(&fib_rules_lock); return -ENETUNREACH; } @@ -400,12 +420,14 @@ int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb) int s_idx = cb->args[0]; struct fib_rule *r; + read_lock(&fib_rules_lock); for (r=fib_rules, idx=0; r; r = r->r_next, idx++) { if (idx < s_idx) 
continue; if (inet_fill_rule(skb, r, cb) < 0) break; } + read_unlock(&fib_rules_lock); cb->args[0] = idx; return skb->len; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 199550ffb..9456c7f29 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1,9 +1,9 @@ /* * NET3: Implementation of the ICMP protocol layer. * - * Alan Cox, <alan@cymru.net> + * Alan Cox, <alan@redhat.com> * - * Version: $Id: icmp.c,v 1.52 1999/03/21 12:04:11 davem Exp $ + * Version: $Id: icmp.c,v 1.57 1999/06/09 10:10:50 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -699,8 +699,8 @@ static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb, int len) case ICMP_FRAG_NEEDED: if (ipv4_config.no_pmtu_disc) { if (net_ratelimit()) - printk(KERN_INFO "ICMP: %s: fragmentation needed and DF set.\n", - in_ntoa(iph->daddr)); + printk(KERN_INFO "ICMP: %d.%d.%d.%d: fragmentation needed and DF set.\n", + NIPQUAD(iph->daddr)); } else { unsigned short new_mtu; new_mtu = ip_rt_frag_needed(iph, ntohs(icmph->un.frag.mtu)); @@ -711,7 +711,7 @@ static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb, int len) break; case ICMP_SR_FAILED: if (net_ratelimit()) - printk(KERN_INFO "ICMP: %s: Source Route Failed.\n", in_ntoa(iph->daddr)); + printk(KERN_INFO "ICMP: %d.%d.%d.%d: Source Route Failed.\n", NIPQUAD(iph->daddr)); break; default: break; @@ -741,8 +741,8 @@ static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb, int len) if (inet_addr_type(iph->daddr) == RTN_BROADCAST) { if (net_ratelimit()) - printk(KERN_WARNING "%s sent an invalid ICMP error to a broadcast.\n", - in_ntoa(skb->nh.iph->saddr)); + printk(KERN_WARNING "%d.%d.%d.%d sent an invalid ICMP error to a broadcast.\n", + NIPQUAD(skb->nh.iph->saddr)); return; } } @@ -1142,6 +1142,8 @@ __initfunc(void icmp_init(struct net_proto_family *ops)) icmp_inode.i_sock = 1; icmp_inode.i_uid = 0; icmp_inode.i_gid = 0; + init_waitqueue_head(&icmp_inode.i_wait); + init_waitqueue_head(&icmp_inode.u.socket_i.wait); icmp_socket->inode = &icmp_inode; icmp_socket->state = SS_UNCONNECTED; @@ -1150,6 +1152,11 @@ __initfunc(void icmp_init(struct net_proto_family *ops)) if ((err=ops->create(icmp_socket, IPPROTO_ICMP))<0) panic("Failed to create the ICMP control socket.\n"); icmp_socket->sk->allocation=GFP_ATOMIC; - icmp_socket->sk->num = 256; /* Don't receive any data */ icmp_socket->sk->ip_ttl = MAXTTL; + + /* Unhash it so that IP input processing does not even + * see it, we do not wish this socket to see incoming + * packets. + */ + icmp_socket->sk->prot->unhash(icmp_socket->sk); } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 68e52633e..61c530418 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -8,7 +8,7 @@ * the older version didn't come out right using gcc 2.5.8, the newer one * seems to fall out with gcc 2.6.2. * - * Version: $Id: igmp.c,v 1.30 1999/03/25 10:04:10 davem Exp $ + * Version: $Id: igmp.c,v 1.32 1999/06/09 10:10:53 davem Exp $ * * Authors: * Alan Cox <Alan.Cox@linux.org> @@ -97,6 +97,15 @@ #include <linux/mroute.h> #endif +/* Big mc list lock for all the devices */ +static rwlock_t ip_mc_lock = RW_LOCK_UNLOCKED; +/* Big mc list semaphore for all the sockets. + We do not refer to this list in IP data paths or from BH, + so that semaphore is OK. 
+ */ +DECLARE_MUTEX(ip_sk_mc_sem); + + #define IP_MAX_MEMBERSHIPS 20 #ifdef CONFIG_IP_MULTICAST @@ -216,6 +225,8 @@ static void igmp_timer_expire(unsigned long data) struct in_device *in_dev = im->interface; int err; + read_lock(&ip_mc_lock); + im->tm_running=0; if (IGMP_V1_SEEN(in_dev)) @@ -234,6 +245,7 @@ static void igmp_timer_expire(unsigned long data) igmp_start_timer(im, IGMP_Unsolicited_Report_Interval); } im->reporter = 1; + read_unlock(&ip_mc_lock); } static void igmp_heard_report(struct in_device *in_dev, u32 group) @@ -245,14 +257,16 @@ static void igmp_heard_report(struct in_device *in_dev, u32 group) if (LOCAL_MCAST(group)) return; + read_lock(&ip_mc_lock); for (im=in_dev->mc_list; im!=NULL; im=im->next) { if (im->multiaddr == group) { igmp_stop_timer(im); im->reporter = 0; im->unsolicit_count = 0; - return; + break; } } + read_unlock(&ip_mc_lock); } static void igmp_heard_query(struct in_device *in_dev, unsigned char max_resp_time, @@ -281,6 +295,7 @@ static void igmp_heard_query(struct in_device *in_dev, unsigned char max_resp_ti * - Use the igmp->igmp_code field as the maximum * delay possible */ + read_lock(&ip_mc_lock); for (im=in_dev->mc_list; im!=NULL; im=im->next) { if (group && group != im->multiaddr) continue; @@ -291,6 +306,7 @@ static void igmp_heard_query(struct in_device *in_dev, unsigned char max_resp_ti igmp_stop_timer(im); igmp_start_timer(im, max_delay); } + read_unlock(&ip_mc_lock); } int igmp_rcv(struct sk_buff *skb, unsigned short len) @@ -380,9 +396,7 @@ static void igmp_group_dropped(struct ip_mc_list *im) if (LOCAL_MCAST(im->multiaddr)) return; - start_bh_atomic(); igmp_stop_timer(im); - end_bh_atomic(); if (im->reporter && !IGMP_V1_SEEN(im->interface)) igmp_send_report(im->interface->dev, im->multiaddr, IGMP_HOST_LEAVE_MESSAGE); @@ -400,9 +414,7 @@ static void igmp_group_added(struct ip_mc_list *im) if (LOCAL_MCAST(im->multiaddr)) return; - start_bh_atomic(); igmp_start_timer(im, IGMP_Initial_Report_Delay); - end_bh_atomic(); #endif } @@ -422,16 +434,17 @@ void ip_mc_inc_group(struct in_device *in_dev, u32 addr) im = (struct ip_mc_list *)kmalloc(sizeof(*im), GFP_KERNEL); + write_lock_bh(&ip_mc_lock); for (i=in_dev->mc_list; i; i=i->next) { if (i->multiaddr == addr) { i->users++; if (im) kfree(im); - return; + goto out; } } if (!im) - return; + goto out; im->users=1; im->interface=in_dev; im->multiaddr=addr; @@ -447,9 +460,13 @@ void ip_mc_inc_group(struct in_device *in_dev, u32 addr) im->next=in_dev->mc_list; in_dev->mc_list=im; igmp_group_added(im); + write_unlock_bh(&ip_mc_lock); if (in_dev->dev->flags & IFF_UP) ip_rt_multicast_event(in_dev); return; +out: + write_unlock_bh(&ip_mc_lock); + return; } /* @@ -458,22 +475,27 @@ void ip_mc_inc_group(struct in_device *in_dev, u32 addr) int ip_mc_dec_group(struct in_device *in_dev, u32 addr) { + int err = -ESRCH; struct ip_mc_list *i, **ip; + write_lock_bh(&ip_mc_lock); for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) { if (i->multiaddr==addr) { if (--i->users == 0) { *ip = i->next; - synchronize_bh(); - igmp_group_dropped(i); + + write_unlock_bh(&ip_mc_lock); if (in_dev->dev->flags & IFF_UP) ip_rt_multicast_event(in_dev); kfree_s(i, sizeof(*i)); + return 0; } - return 0; + err = 0; + break; } } + write_unlock_bh(&ip_mc_lock); return -ESRCH; } @@ -483,8 +505,10 @@ void ip_mc_down(struct in_device *in_dev) { struct ip_mc_list *i; + read_lock_bh(&ip_mc_lock); for (i=in_dev->mc_list; i; i=i->next) igmp_group_dropped(i); + read_unlock_bh(&ip_mc_lock); ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS); } @@ -497,8 
+521,10 @@ void ip_mc_up(struct in_device *in_dev) ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); + read_lock_bh(&ip_mc_lock); for (i=in_dev->mc_list; i; i=i->next) igmp_group_added(i); + read_unlock_bh(&ip_mc_lock); } /* @@ -509,11 +535,13 @@ void ip_mc_destroy_dev(struct in_device *in_dev) { struct ip_mc_list *i; + write_lock_bh(&ip_mc_lock); while ((i = in_dev->mc_list) != NULL) { in_dev->mc_list = i->next; igmp_group_dropped(i); kfree_s(i, sizeof(*i)); } + write_unlock_bh(&ip_mc_lock); } static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr) @@ -570,6 +598,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); err = -EADDRINUSE; + down(&ip_sk_mc_sem); for (i=sk->ip_mc_list; i; i=i->next) { if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) { /* New style additions are reference counted */ @@ -577,13 +606,13 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) i->count++; err = 0; } - goto done; + goto done_unlock; } count++; } err = -ENOBUFS; if (iml == NULL || count >= sysctl_igmp_max_memberships) - goto done; + goto done_unlock; memcpy(&iml->multi, imr, sizeof(*imr)); iml->next = sk->ip_mc_list; iml->count = 1; @@ -591,6 +620,9 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) ip_mc_inc_group(in_dev, addr); iml = NULL; err = 0; + +done_unlock: + up(&ip_sk_mc_sem); done: rtnl_shunlock(); if (iml) @@ -606,6 +638,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) { struct ip_mc_socklist *iml, **imlp; + down(&ip_sk_mc_sem); for (imlp=&sk->ip_mc_list; (iml=*imlp)!=NULL; imlp=&iml->next) { if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr && iml->multi.imr_address.s_addr==imr->imr_address.s_addr && @@ -615,7 +648,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) return 0; *imlp = iml->next; - synchronize_bh(); + up(&ip_sk_mc_sem); in_dev = inetdev_by_index(iml->multi.imr_ifindex); if (in_dev) @@ -624,6 +657,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) return 0; } } + up(&ip_sk_mc_sem); return -EADDRNOTAVAIL; } @@ -635,13 +669,37 @@ void ip_mc_drop_socket(struct sock *sk) { struct ip_mc_socklist *iml; + down(&ip_sk_mc_sem); while ((iml=sk->ip_mc_list) != NULL) { struct in_device *in_dev; sk->ip_mc_list = iml->next; + up(&ip_sk_mc_sem); + if ((in_dev = inetdev_by_index(iml->multi.imr_ifindex)) != NULL) ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); sock_kfree_s(sk, iml, sizeof(*iml)); + + down(&ip_sk_mc_sem); } + up(&ip_sk_mc_sem); +} + +int ip_check_mc(struct device *dev, u32 mc_addr) +{ + struct in_device *in_dev = dev->ip_ptr; + struct ip_mc_list *im; + + if (in_dev) { + read_lock(&ip_mc_lock); + for (im=in_dev->mc_list; im; im=im->next) { + if (im->multiaddr == mc_addr) { + read_unlock(&ip_mc_lock); + return 1; + } + } + read_unlock(&ip_mc_lock); + } + return 0; } @@ -653,11 +711,11 @@ int ip_mc_procinfo(char *buffer, char **start, off_t offset, int length, int dum struct ip_mc_list *im; int len=0; struct device *dev; - + len=sprintf(buffer,"Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n"); - - for(dev = dev_base; dev; dev = dev->next) - { + + read_lock(&dev_base_lock); + for(dev = dev_base; dev; dev = dev->next) { struct in_device *in_dev = dev->ip_ptr; char *querier = "NONE"; @@ -669,6 +727,7 @@ int ip_mc_procinfo(char *buffer, char **start, off_t offset, int length, int dum len+=sprintf(buffer+len,"%d\t%-10s: %5d %7s\n", dev->ifindex, dev->name, dev->mc_count, querier); + 
read_lock(&ip_mc_lock); for (im = in_dev->mc_list; im; im = im->next) { len+=sprintf(buffer+len, "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n", @@ -681,11 +740,16 @@ int ip_mc_procinfo(char *buffer, char **start, off_t offset, int length, int dum len=0; begin=pos; } - if(pos>offset+length) + if(pos>offset+length) { + read_unlock(&ip_mc_lock); goto done; + } } + read_unlock(&ip_mc_lock); } done: + read_unlock(&dev_base_lock); + *start=buffer+(offset-begin); len-=(offset-begin); if(len>length) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index f066e6073..29747fee6 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -5,7 +5,7 @@ * * The IP fragmentation functionality. * - * Version: $Id: ip_fragment.c,v 1.40 1999/03/20 23:58:34 davem Exp $ + * Version: $Id: ip_fragment.c,v 1.41 1999/05/27 00:38:07 davem Exp $ * * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox <Alan.Cox@linux.org> @@ -71,7 +71,8 @@ struct ipq { #define IPQ_HASHSZ 64 -struct ipq *ipq_hash[IPQ_HASHSZ]; +static struct ipq *ipq_hash[IPQ_HASHSZ]; +static spinlock_t ipfrag_lock = SPIN_LOCK_UNLOCKED; #define ipqhashfn(id, saddr, daddr, prot) \ ((((id) >> 1) ^ (saddr) ^ (daddr) ^ (prot)) & (IPQ_HASHSZ - 1)) @@ -141,7 +142,9 @@ static inline struct ipq *ip_find(struct iphdr *iph, struct dst_entry *dst) unsigned int hash = ipqhashfn(id, saddr, daddr, protocol); struct ipq *qp; - /* Always, we are in a BH context, so no locking. -DaveM */ + /* We are always in BH context, and protected by the + * ipfrag lock. + */ for(qp = ipq_hash[hash]; qp; qp = qp->next) { if(qp->iph->id == id && qp->iph->saddr == saddr && @@ -158,8 +161,9 @@ static inline struct ipq *ip_find(struct iphdr *iph, struct dst_entry *dst) * because we completed, reassembled and processed it, or because * it timed out. * - * This is called _only_ from BH contexts, on packet reception - * processing and from frag queue expiration timers. -DaveM + * This is called _only_ from BH contexts with the ipfrag lock held, + * on packet reception processing and from frag queue expiration + * timers. -DaveM */ static void ip_free(struct ipq *qp) { @@ -197,6 +201,7 @@ static void ip_expire(unsigned long arg) { struct ipq *qp = (struct ipq *) arg; + spin_lock(&ipfrag_lock); if(!qp->fragments) { #ifdef IP_EXPIRE_DEBUG @@ -213,10 +218,13 @@ static void ip_expire(unsigned long arg) out: /* Nuke the fragment queue. */ ip_free(qp); + spin_lock(&ipfrag_lock); } /* Memory limiting on fragments. Evictor trashes the oldest * fragment queue until we are back under the low threshold. + * + * We are always called in BH with the ipfrag lock held. */ static void ip_evictor(void) { @@ -229,9 +237,6 @@ restart: struct ipq *qp; if (atomic_read(&ip_frag_mem) <= sysctl_ipfrag_low_thresh) return; - /* We are in a BH context, so these queue - * accesses are safe. -DaveM - */ qp = ipq_hash[i]; if (qp) { /* find the oldest queue for this hash bucket */ @@ -283,7 +288,7 @@ static struct ipq *ip_create(struct sk_buff *skb, struct iphdr *iph) /* Add this entry to the queue. */ hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); - /* We are in a BH context, no locking necessary. -DaveM */ + /* In a BH context and ipfrag lock is held. -DaveM */ if((qp->next = ipq_hash[hash]) != NULL) qp->next->pprev = &qp->next; ipq_hash[hash] = qp; @@ -421,6 +426,8 @@ struct sk_buff *ip_defrag(struct sk_buff *skb) ip_statistics.IpReasmReqds++; + spin_lock(&ipfrag_lock); + /* Start by cleaning up the memory. 
*/ if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh) ip_evictor(); @@ -565,6 +572,7 @@ struct sk_buff *ip_defrag(struct sk_buff *skb) out_freequeue: ip_free(qp); out_skb: + spin_unlock(&ipfrag_lock); return skb; } @@ -574,6 +582,7 @@ out_skb: out_timer: mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time); /* ~ 30 seconds */ out: + spin_unlock(&ipfrag_lock); return NULL; /* diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 7a3e2618b..107ccaa16 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) module. * - * Version: $Id: ip_input.c,v 1.37 1999/04/22 10:38:36 davem Exp $ + * Version: $Id: ip_input.c,v 1.40 1999/06/09 10:10:55 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -154,44 +154,11 @@ struct ip_mib ip_statistics={2,IPDEFTTL,}; /* Forwarding=No, Default TTL=64 */ - -/* - * Handle the issuing of an ioctl() request - * for the ip device. This is scheduled to - * disappear - */ - -int ip_ioctl(struct sock *sk, int cmd, unsigned long arg) -{ - switch(cmd) - { - default: - return(-EINVAL); - } -} - - #if defined(CONFIG_IP_TRANSPARENT_PROXY) && !defined(CONFIG_IP_ALWAYS_DEFRAG) #define CONFIG_IP_ALWAYS_DEFRAG 1 #endif /* - * 0 - deliver - * 1 - block - */ -static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) -{ - int type; - - type = skb->h.icmph->type; - if (type < 32) - return test_bit(type, &sk->tp_pinfo.tp_raw4.filter); - - /* Do not block unknown ICMP types */ - return 0; -} - -/* * Process Router Attention IP option */ int ip_call_ra_chain(struct sk_buff *skb) @@ -224,16 +191,37 @@ int ip_call_ra_chain(struct sk_buff *skb) return 0; } +/* Handle this out of line, it is rare. */ +static int ip_run_ipprot(struct sk_buff *skb, struct iphdr *iph, + struct inet_protocol *ipprot, int force_copy) +{ + int ret = 0; + + do { + if (ipprot->protocol == iph->protocol) { + struct sk_buff *skb2 = skb; + if (ipprot->copy || force_copy) + skb2 = skb_clone(skb, GFP_ATOMIC); + if(skb2 != NULL) { + ret = 1; + ipprot->handler(skb2, + ntohs(iph->tot_len) - (iph->ihl * 4)); + } + } + ipprot = (struct inet_protocol *) ipprot->next; + } while(ipprot != NULL); + + return ret; +} + +extern struct sock *raw_v4_input(struct sk_buff *, struct iphdr *, int); + /* * Deliver IP Packets to the higher protocol layers. */ int ip_local_deliver(struct sk_buff *skb) { struct iphdr *iph = skb->nh.iph; - struct inet_protocol *ipprot; - struct sock *raw_sk=NULL; - unsigned char hash; - int flag = 0; #ifndef CONFIG_IP_ALWAYS_DEFRAG /* @@ -249,34 +237,29 @@ int ip_local_deliver(struct sk_buff *skb) #endif #ifdef CONFIG_IP_MASQUERADE - /* - * Do we need to de-masquerade this packet? - */ - { - int ret; - /* - * Some masq modules can re-inject packets if - * bad configured. + /* Do we need to de-masquerade this packet? */ + if((IPCB(skb)->flags&IPSKB_MASQUERADED)) { + /* Some masq modules can re-inject packets if + * bad configured. */ + printk(KERN_DEBUG "ip_input(): demasq recursion detected. " + "Check masq modules configuration\n"); + kfree_skb(skb); + return 0; + } else { + int ret = ip_fw_demasquerade(&skb); - if((IPCB(skb)->flags&IPSKB_MASQUERADED)) { - printk(KERN_DEBUG "ip_input(): demasq recursion detected. 
Check masq modules configuration\n"); - kfree_skb(skb); - return 0; - } - - ret = ip_fw_demasquerade(&skb); if (ret < 0) { kfree_skb(skb); return 0; } - if (ret) { - iph=skb->nh.iph; + iph = skb->nh.iph; IPCB(skb)->flags |= IPSKB_MASQUERADED; dst_release(skb->dst); skb->dst = NULL; - if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev)) { + if (ip_route_input(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev)) { kfree_skb(skb); return 0; } @@ -285,112 +268,50 @@ int ip_local_deliver(struct sk_buff *skb) } #endif - /* - * Point into the IP datagram, just past the header. - */ - + /* Point into the IP datagram, just past the header. */ skb->h.raw = skb->nh.raw + iph->ihl*4; - /* - * Deliver to raw sockets. This is fun as to avoid copies we want to make no - * surplus copies. - * - * RFC 1122: SHOULD pass TOS value up to the transport layer. - * -> It does. And not only TOS, but all IP header. - */ - - /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */ - hash = iph->protocol & (MAX_INET_PROTOS - 1); - - /* - * If there maybe a raw socket we must check - if not we don't care less - */ - - if((raw_sk = raw_v4_htable[hash]) != NULL) { - struct sock *sknext = NULL; - struct sk_buff *skb1; - raw_sk = raw_v4_lookup(raw_sk, iph->protocol, iph->saddr, iph->daddr, skb->dev->ifindex); - if(raw_sk) { /* Any raw sockets */ - do { - /* Find the next */ - sknext = raw_v4_lookup(raw_sk->next, iph->protocol, - iph->saddr, iph->daddr, skb->dev->ifindex); - if (iph->protocol != IPPROTO_ICMP || !icmp_filter(raw_sk, skb)) { - if (sknext == NULL) - break; - skb1 = skb_clone(skb, GFP_ATOMIC); - if(skb1) - { - raw_rcv(raw_sk, skb1); - } - } - raw_sk = sknext; - } while(raw_sk!=NULL); - - /* Here either raw_sk is the last raw socket, or NULL if - * none. We deliver to the last raw socket AFTER the - * protocol checks as it avoids a surplus copy. - */ - } - } - - /* - * skb->h.raw now points at the protocol beyond the IP header. - */ - - for (ipprot = (struct inet_protocol *)inet_protos[hash];ipprot != NULL;ipprot=(struct inet_protocol *)ipprot->next) { - struct sk_buff *skb2; - - if (ipprot->protocol != iph->protocol) - continue; - /* - * See if we need to make a copy of it. This will - * only be set if more than one protocol wants it. - * and then not for the last one. If there is a pending - * raw delivery wait for that + /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */ + int hash = iph->protocol & (MAX_INET_PROTOS - 1); + struct sock *raw_sk = raw_v4_htable[hash]; + struct inet_protocol *ipprot; + int flag; + + /* If there maybe a raw socket we must check - if not we + * don't care less */ - - if (ipprot->copy || raw_sk) - { - skb2 = skb_clone(skb, GFP_ATOMIC); - if(skb2==NULL) - continue; - } - else - { - skb2 = skb; - } - flag = 1; + if(raw_sk != NULL) + raw_sk = raw_v4_input(skb, iph, hash); + + ipprot = (struct inet_protocol *) inet_protos[hash]; + flag = 0; + if(ipprot != NULL) { + if(raw_sk == NULL && + ipprot->next == NULL && + ipprot->protocol == iph->protocol) { + /* Fast path... */ + return ipprot->handler(skb, (ntohs(iph->tot_len) - + (iph->ihl * 4))); + } else { + flag = ip_run_ipprot(skb, iph, ipprot, (raw_sk != NULL)); + } + } - /* - * Pass on the datagram to each protocol that wants it, - * based on the datagram protocol. We should really - * check the protocol handler's return values here... + /* All protocols checked. 
+ * If this packet was a broadcast, we may *not* reply to it, since that + * causes (proven, grin) ARP storms and a leakage of memory (i.e. all + * ICMP reply messages get queued up for transmission...) */ - - ipprot->handler(skb2, ntohs(iph->tot_len) - (iph->ihl * 4)); - } - - /* - * All protocols checked. - * If this packet was a broadcast, we may *not* reply to it, since that - * causes (proven, grin) ARP storms and a leakage of memory (i.e. all - * ICMP reply messages get queued up for transmission...) - */ - - if(raw_sk!=NULL) /* Shift to last raw user */ - { - raw_rcv(raw_sk, skb); - - } - else if (!flag) /* Free and report errors */ - { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); - kfree_skb(skb); + if(raw_sk != NULL) { /* Shift to last raw user */ + raw_rcv(raw_sk, skb); + } else if (!flag) { /* Free and report errors */ + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); + kfree_skb(skb); + } } - return(0); + return 0; } /* @@ -404,9 +325,8 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) u16 rport; #endif /* CONFIG_FIREWALL */ - /* - * When the interface is in promisc. mode, drop all the crap - * that it receives, do not try to analyse it. + /* When the interface is in promisc. mode, drop all the crap + * that it receives, do not try to analyse it. */ if (skb->pkt_type == PACKET_OTHERHOST) goto drop; @@ -430,17 +350,15 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) goto inhdr_error; { - __u32 len = ntohs(iph->tot_len); - if (skb->len < len) - goto inhdr_error; + __u32 len = ntohs(iph->tot_len); + if (skb->len < len) + goto inhdr_error; - /* - * Our transport medium may have padded the buffer out. Now we know it - * is IP we can trim to the true length of the frame. - * Note this now means skb->len holds ntohs(iph->tot_len). - */ - - __skb_trim(skb, len); + /* Our transport medium may have padded the buffer out. Now we know it + * is IP we can trim to the true length of the frame. + * Note this now means skb->len holds ntohs(iph->tot_len). 
+ */ + __skb_trim(skb, len); } #ifdef CONFIG_IP_ALWAYS_DEFRAG @@ -474,21 +392,17 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) if (skb->dst == NULL) { if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev)) goto drop; -#ifdef CONFIG_CPU_IS_SLOW - if (net_cpu_congestion > 10 && !(iph->tos&IPTOS_RELIABILITY) && - IPTOS_PREC(iph->tos) < IPTOS_PREC_INTERNETCONTROL) { - goto drop; - } -#endif } #ifdef CONFIG_NET_CLS_ROUTE if (skb->dst->tclassid) { u32 idx = skb->dst->tclassid; + write_lock(&ip_rt_acct_lock); ip_rt_acct[idx&0xFF].o_packets++; ip_rt_acct[idx&0xFF].o_bytes+=skb->len; ip_rt_acct[(idx>>16)&0xFF].i_packets++; ip_rt_acct[(idx>>16)&0xFF].i_bytes+=skb->len; + write_unlock(&ip_rt_acct_lock); } #endif diff --git a/net/ipv4/ip_masq_mfw.c b/net/ipv4/ip_masq_mfw.c index dc38b1712..ff07231fc 100644 --- a/net/ipv4/ip_masq_mfw.c +++ b/net/ipv4/ip_masq_mfw.c @@ -3,7 +3,7 @@ * * Does (reverse-masq) forwarding based on skb->fwmark value * - * $Id: ip_masq_mfw.c,v 1.3 1999/01/26 05:33:47 davem Exp $ + * $Id: ip_masq_mfw.c,v 1.4 1999/05/13 23:25:07 davem Exp $ * * Author: Juan Jose Ciarlante <jjciarla@raiz.uncu.edu.ar> * based on Steven Clarke's portfw @@ -79,7 +79,7 @@ struct ip_masq_mfw { }; -static struct semaphore mfw_sema = MUTEX; +static DECLARE_MUTEX(mfw_sema); #ifdef __SMP__ static rwlock_t mfw_lock = RW_LOCK_UNLOCKED; #endif diff --git a/net/ipv4/ip_masq_quake.c b/net/ipv4/ip_masq_quake.c index 165dd6bd5..17b11a799 100644 --- a/net/ipv4/ip_masq_quake.c +++ b/net/ipv4/ip_masq_quake.c @@ -12,6 +12,7 @@ * http://www.gamers.org/dEngine/quake/spec/ * Harald Hoyer : Check for QUAKE-STRING * Juan Jose Ciarlante : litl bits for 2.1 + * Horst von Brand : Add #include <linux/string.h> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -24,6 +25,7 @@ #include <linux/module.h> #include <asm/system.h> #include <linux/types.h> +#include <linux/string.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/in.h> @@ -44,7 +46,7 @@ typedef struct struct quake_priv_data { /* Have we seen a client connect message */ - char cl_connect; + signed char cl_connect; }; static int diff --git a/net/ipv4/ip_masq_vdolive.c b/net/ipv4/ip_masq_vdolive.c index 4724e3b93..2d8d672cc 100644 --- a/net/ipv4/ip_masq_vdolive.c +++ b/net/ipv4/ip_masq_vdolive.c @@ -2,7 +2,7 @@ * IP_MASQ_VDOLIVE - VDO Live masquerading module * * - * Version: @(#)$Id: ip_masq_vdolive.c,v 1.4 1998/10/06 04:49:07 davem Exp $ + * Version: @(#)$Id: ip_masq_vdolive.c,v 1.6 1999/06/09 08:29:03 davem Exp $ * * Author: Nigel Metheringham <Nigel.Metheringham@ThePLAnet.net> * PLAnet Online Ltd @@ -10,6 +10,9 @@ * Fixes: Minor changes for 2.1 by * Steven Clarke <Steven.Clarke@ThePlanet.Net>, Planet Online Ltd * + * Add missing #include <linux/string.h> + * Horst von Brand <vonbrand@sleipnir.valparaiso.cl> + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -25,6 +28,7 @@ #include <linux/config.h> #include <linux/module.h> #include <linux/types.h> +#include <linux/string.h> #include <linux/kernel.h> #include <asm/system.h> #include <linux/skbuff.h> diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index fae22cbe7..359926a4c 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -5,7 +5,7 @@ * * The options processing module for ip.c * - * Version: $Id: 
ip_options.c,v 1.16 1999/03/21 05:22:40 davem Exp $ + * Version: $Id: ip_options.c,v 1.18 1999/06/09 08:29:06 davem Exp $ * * Authors: A.N.Kuznetsov * @@ -452,7 +452,6 @@ eol: error: if (skb) { icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24)); - kfree_skb(skb); } return -EINVAL; } diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index abe93ec27..51e27ad67 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1,5 +1,5 @@ /* - * $Id: ipconfig.c,v 1.20 1999/03/28 10:18:28 davem Exp $ + * $Id: ipconfig.c,v 1.22 1999/06/09 10:10:57 davem Exp $ * * Automatic Configuration of IP -- use BOOTP or RARP or user-supplied * information to configure own IP address and routes. @@ -112,7 +112,8 @@ static int __init ic_open_devs(void) unsigned short oflags; last = &ic_first_dev; - for (dev = dev_base; dev; dev = dev->next) + read_lock(&dev_base_lock); + for (dev = dev_base; dev; dev = dev->next) { if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : (!(dev->flags & IFF_LOOPBACK) && (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) && @@ -142,6 +143,9 @@ static int __init ic_open_devs(void) ic_proto_have_if |= able; DBG(("IP-Config: Opened %s (able=%d)\n", dev->name, able)); } + } + read_unlock(&dev_base_lock); + *last = NULL; if (!ic_first_dev) { diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index d7db0c007..1034e0e7a 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1,7 +1,7 @@ /* * IP multicast routing support for mrouted 3.6/3.8 * - * (c) 1995 Alan Cox, <alan@cymru.net> + * (c) 1995 Alan Cox, <alan@redhat.com> * Linux Consultancy and Custom Driver Development * * This program is free software; you can redistribute it and/or @@ -9,7 +9,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Version: $Id: ipmr.c,v 1.40 1999/03/25 10:04:25 davem Exp $ + * Version: $Id: ipmr.c,v 1.43 1999/06/09 10:10:59 davem Exp $ * * Fixes: * Michael Chastain : Incorrect size of copying. @@ -23,6 +23,8 @@ * Brad Parker : Better behaviour on mrouted upcall * overflow. * Carlos Picoto : PIMv1 Support + * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header + * Relax this requrement to work with older peers. * */ @@ -431,7 +433,7 @@ static void ipmr_cache_resolve(struct mfc_cache *cache) skb_trim(skb, nlh->nlmsg_len); ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE; } - err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).pid, MSG_DONTWAIT); + err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT); } else #endif ip_mr_forward(skb, cache, 0); @@ -1343,7 +1345,8 @@ int pim_rcv(struct sk_buff * skb, unsigned short len) pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) || (pim->flags&PIM_NULL_REGISTER) || reg_dev == NULL || - ip_compute_csum((void *)pim, len)) { + (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && + ip_compute_csum((void *)pim, len))) { kfree_skb(skb); return -EINVAL; } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 1640a0560..52c5ee5a4 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -7,7 +7,7 @@ * PROC file system. It is mainly used for debugging and * statistics. * - * Version: $Id: proc.c,v 1.34 1999/02/08 11:20:34 davem Exp $ + * Version: $Id: proc.c,v 1.35 1999/05/27 00:37:38 davem Exp $ * * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Gerald J. 
Heim, <heim@peanuts.informatik.uni-tuebingen.de> @@ -114,10 +114,8 @@ static inline void get__sock(struct sock *sp, char *tmpbuf, int i, int format) slot_dist = tcp_tw_death_row_slot - slot_dist; timer_expires = jiffies + (slot_dist * TCP_TWKILL_PERIOD); } else { - timer_active1 = del_timer(&tp->retransmit_timer); - timer_active2 = del_timer(&sp->timer); - if (!timer_active1) tp->retransmit_timer.expires=0; - if (!timer_active2) sp->timer.expires=0; + timer_active1 = tp->retransmit_timer.prev != NULL; + timer_active2 = sp->timer.prev != NULL; timer_active = 0; timer_expires = (unsigned) -1; } @@ -147,9 +145,6 @@ static inline void get__sock(struct sock *sp, char *tmpbuf, int i, int format) (!tw_bucket && sp->socket) ? sp->socket->inode->i_uid : 0, (!tw_bucket && timer_active) ? sp->timeout : 0, (!tw_bucket && sp->socket) ? sp->socket->inode->i_ino : 0); - - if (timer_active1) add_timer(&tp->retransmit_timer); - if (timer_active2) add_timer(&sp->timer); } /* @@ -176,7 +171,7 @@ get__netinfo(struct proto *pro, char *buffer, int format, char **start, off_t of " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout inode"); pos = 128; - SOCKHASH_LOCK(); + SOCKHASH_LOCK_READ(); sp = pro->sklist_next; while(sp != (struct sock *)pro) { if (format == 0 && sp->state == TCP_LISTEN) { @@ -211,7 +206,7 @@ get__netinfo(struct proto *pro, char *buffer, int format, char **start, off_t of i++; } out: - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_READ(); begin = len - (pos - offset); *start = buffer + begin; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index fc6b1f2ee..dd2e7555e 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -5,7 +5,7 @@ * * RAW - implementation of IP "raw" sockets. * - * Version: $Id: raw.c,v 1.39 1998/11/08 11:17:04 davem Exp $ + * Version: $Id: raw.c,v 1.41 1999/05/30 01:16:19 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -75,11 +75,11 @@ static void raw_v4_hash(struct sock *sk) num &= (RAWV4_HTABLE_SIZE - 1); skp = &raw_v4_htable[num]; - SOCKHASH_LOCK(); + SOCKHASH_LOCK_WRITE(); sk->next = *skp; *skp = sk; sk->hashent = num; - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); } static void raw_v4_unhash(struct sock *sk) @@ -90,7 +90,7 @@ static void raw_v4_unhash(struct sock *sk) num &= (RAWV4_HTABLE_SIZE - 1); skp = &raw_v4_htable[num]; - SOCKHASH_LOCK(); + SOCKHASH_LOCK_WRITE(); while(*skp != NULL) { if(*skp == sk) { *skp = sk->next; @@ -98,7 +98,7 @@ static void raw_v4_unhash(struct sock *sk) } skp = &((*skp)->next); } - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); } static void raw_v4_rehash(struct sock *sk) @@ -110,7 +110,7 @@ static void raw_v4_rehash(struct sock *sk) num &= (RAWV4_HTABLE_SIZE - 1); skp = &raw_v4_htable[oldnum]; - SOCKHASH_LOCK(); + SOCKHASH_LOCK_WRITE(); while(*skp != NULL) { if(*skp == sk) { *skp = sk->next; @@ -121,16 +121,15 @@ static void raw_v4_rehash(struct sock *sk) sk->next = raw_v4_htable[num]; raw_v4_htable[num] = sk; sk->hashent = num; - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); } -/* Grumble... icmp and ip_input want to get at this... 
*/ -struct sock *raw_v4_lookup(struct sock *sk, unsigned short num, - unsigned long raddr, unsigned long laddr, int dif) +static __inline__ struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, + unsigned long raddr, unsigned long laddr, + int dif) { struct sock *s = sk; - SOCKHASH_LOCK(); for(s = sk; s; s = s->next) { if((s->num == num) && !(s->dead && (s->state == TCP_CLOSE)) && @@ -139,10 +138,79 @@ struct sock *raw_v4_lookup(struct sock *sk, unsigned short num, !(s->bound_dev_if && s->bound_dev_if != dif)) break; /* gotcha */ } - SOCKHASH_UNLOCK(); return s; } +struct sock *raw_v4_lookup(struct sock *sk, unsigned short num, + unsigned long raddr, unsigned long laddr, + int dif) +{ + SOCKHASH_LOCK_READ(); + sk = __raw_v4_lookup(sk, num, raddr, laddr, dif); + SOCKHASH_UNLOCK_READ(); + + return sk; +} + +/* + * 0 - deliver + * 1 - block + */ +static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) +{ + int type; + + type = skb->h.icmph->type; + if (type < 32) + return test_bit(type, &sk->tp_pinfo.tp_raw4.filter); + + /* Do not block unknown ICMP types */ + return 0; +} + +/* IP input processing comes here for RAW socket delivery. + * This is fun as to avoid copies we want to make no surplus + * copies. + * + * RFC 1122: SHOULD pass TOS value up to the transport layer. + * -> It does. And not only TOS, but all IP header. + */ +struct sock *raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) +{ + struct sock *sk; + + SOCKHASH_LOCK_READ_BH(); + if ((sk = raw_v4_htable[hash]) == NULL) + goto out; + sk = __raw_v4_lookup(sk, iph->protocol, + iph->saddr, iph->daddr, + skb->dev->ifindex); + while(sk != NULL) { + struct sock *sknext = __raw_v4_lookup(sk->next, iph->protocol, + iph->saddr, iph->daddr, + skb->dev->ifindex); + + if (iph->protocol != IPPROTO_ICMP || + ! icmp_filter(sk, skb)) { + struct sk_buff *clone; + + if(sknext == NULL) + break; + clone = skb_clone(skb, GFP_ATOMIC); + if(clone) { + SOCKHASH_UNLOCK_READ_BH(); + raw_rcv(sk, clone); + SOCKHASH_LOCK_READ_BH(); + } + } + sk = sknext; + } +out: + SOCKHASH_UNLOCK_READ_BH(); + + return sk; +} + void raw_err (struct sock *sk, struct sk_buff *skb) { int type = skb->h.icmph->type; @@ -402,6 +470,8 @@ done: static void raw_close(struct sock *sk, long timeout) { + bh_lock_sock(sk); + /* Observation: when raw_close is called, processes have no access to socket anymore. But net still has. Step one, detach it from networking: diff --git a/net/ipv4/route.c b/net/ipv4/route.c index dbde97b70..3d9e87de3 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -5,7 +5,7 @@ * * ROUTE - implementation of the IP router. * - * Version: $Id: route.c,v 1.67 1999/05/08 20:00:20 davem Exp $ + * Version: $Id: route.c,v 1.69 1999/06/09 10:11:02 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -174,7 +174,18 @@ __u8 ip_tos2prio[16] = { * Route cache. */ -struct rtable *rt_hash_table[RT_HASH_DIVISOR]; +/* The locking scheme is rather straight forward: + * + * 1) A BH protected rwlock protects the central route hash. + * 2) Only writers remove entries, and they hold the lock + * as they look at rtable reference counts. + * 3) Only readers acquire references to rtable entries, + * they do so with atomic increments and with the + * lock held. 
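 *
 * Condensed from ip_route_output() further down, the reader side
 * therefore always looks like this sketch:
 *
 *	read_lock_bh(&rt_hash_lock);
 *	for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) {
 *		if (rth->key.dst == daddr && rth->key.src == saddr && ...) {
 *			rth->u.dst.lastuse = jiffies;
 *			atomic_inc(&rth->u.dst.use);
 *			atomic_inc(&rth->u.dst.refcnt);
 *			break;
 *		}
 *	}
 *	read_unlock_bh(&rt_hash_lock);
 *
 * i.e. the reference counts are bumped while the read lock is still
 * held, never after it has been dropped.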
+ */ + +static struct rtable *rt_hash_table[RT_HASH_DIVISOR]; +static rwlock_t rt_hash_lock = RW_LOCK_UNLOCKED; static int rt_intern_hash(unsigned hash, struct rtable * rth, struct rtable ** res); @@ -204,7 +215,7 @@ static int rt_cache_get_info(char *buffer, char **start, off_t offset, int lengt } - start_bh_atomic(); + read_lock_bh(&rt_hash_lock); for (i = 0; i<RT_HASH_DIVISOR; i++) { for (r = rt_hash_table[i]; r; r = r->u.rt_next) { @@ -239,7 +250,7 @@ static int rt_cache_get_info(char *buffer, char **start, off_t offset, int lengt } done: - end_bh_atomic(); + read_unlock_bh(&rt_hash_lock); *start = buffer+len-(pos-offset); len = pos-offset; @@ -292,6 +303,7 @@ static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2) return 1; } +/* This runs via a timer and thus is always in BH context. */ static void rt_check_expire(unsigned long dummy) { int i; @@ -305,6 +317,7 @@ static void rt_check_expire(unsigned long dummy) rover = (rover + 1) & (RT_HASH_DIVISOR-1); rthp = &rt_hash_table[rover]; + write_lock(&rt_hash_lock); while ((rth = *rthp) != NULL) { if (rth->u.dst.expires) { /* Entrie is expired even if it is in use */ @@ -325,6 +338,7 @@ static void rt_check_expire(unsigned long dummy) *rthp = rth->u.rt_next; rt_free(rth); } + write_unlock(&rt_hash_lock); /* Fallback loop breaker. */ if ((jiffies - now) > 0) @@ -334,6 +348,9 @@ static void rt_check_expire(unsigned long dummy) add_timer(&rt_periodic_timer); } +/* This can run from both BH and non-BH contexts, the latter + * in the case of a forced flush event. + */ static void rt_run_flush(unsigned long dummy) { int i; @@ -341,23 +358,23 @@ static void rt_run_flush(unsigned long dummy) rt_deadline = 0; - start_bh_atomic(); for (i=0; i<RT_HASH_DIVISOR; i++) { - if ((rth = xchg(&rt_hash_table[i], NULL)) == NULL) - continue; - end_bh_atomic(); + write_lock_bh(&rt_hash_lock); + rth = rt_hash_table[i]; + if(rth != NULL) + rt_hash_table[i] = NULL; + write_unlock_bh(&rt_hash_lock); for (; rth; rth=next) { next = rth->u.rt_next; rth->u.rt_next = NULL; rt_free(rth); } - - start_bh_atomic(); } - end_bh_atomic(); } +static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED; + void rt_cache_flush(int delay) { unsigned long now = jiffies; @@ -366,7 +383,7 @@ void rt_cache_flush(int delay) if (delay < 0) delay = ip_rt_min_delay; - start_bh_atomic(); + spin_lock_bh(&rt_flush_lock); if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) { long tmo = (long)(rt_deadline - now); @@ -386,7 +403,7 @@ void rt_cache_flush(int delay) } if (delay <= 0) { - end_bh_atomic(); + spin_unlock_bh(&rt_flush_lock); rt_run_flush(0); return; } @@ -396,7 +413,7 @@ void rt_cache_flush(int delay) rt_flush_timer.expires = now + delay; add_timer(&rt_flush_timer); - end_bh_atomic(); + spin_unlock_bh(&rt_flush_lock); } /* @@ -459,7 +476,10 @@ static int rt_garbage_collect(void) do { int i, k; - start_bh_atomic(); + /* The write lock is held during the entire hash + * traversal to ensure consistent state of the rover. 
+ */ + write_lock_bh(&rt_hash_lock); for (i=0, k=rover; i<RT_HASH_DIVISOR; i++) { unsigned tmo = expire; @@ -480,7 +500,7 @@ static int rt_garbage_collect(void) break; } rover = k; - end_bh_atomic(); + write_unlock_bh(&rt_hash_lock); if (goal <= 0) goto work_done; @@ -530,10 +550,9 @@ static int rt_intern_hash(unsigned hash, struct rtable * rt, struct rtable ** rp int attempts = !in_interrupt(); restart: - start_bh_atomic(); - rthp = &rt_hash_table[hash]; + write_lock_bh(&rt_hash_lock); while ((rth = *rthp) != NULL) { if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) { /* Put it first */ @@ -544,7 +563,7 @@ restart: atomic_inc(&rth->u.dst.refcnt); atomic_inc(&rth->u.dst.use); rth->u.dst.lastuse = now; - end_bh_atomic(); + write_unlock_bh(&rt_hash_lock); rt_drop(rt); *rp = rth; @@ -559,7 +578,7 @@ restart: */ if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) { if (!arp_bind_neighbour(&rt->u.dst)) { - end_bh_atomic(); + write_unlock_bh(&rt_hash_lock); /* Neighbour tables are full and nothing can be released. Try to shrink route cache, @@ -594,7 +613,7 @@ restart: } #endif rt_hash_table[hash] = rt; - end_bh_atomic(); + write_unlock_bh(&rt_hash_lock); *rp = rt; return 0; } @@ -633,6 +652,7 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, rthp=&rt_hash_table[hash]; + write_lock_bh(&rt_hash_lock); while ( (rth = *rthp) != NULL) { struct rtable *rt; @@ -657,6 +677,7 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); if (rt == NULL) { ip_rt_put(rth); + write_unlock_bh(&rt_hash_lock); return; } @@ -688,11 +709,15 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, } *rthp = rth->u.rt_next; + write_unlock_bh(&rt_hash_lock); if (!rt_intern_hash(hash, rt, &rt)) ip_rt_put(rt); rt_drop(rth); - break; + goto do_next; } + write_unlock_bh(&rt_hash_lock); + do_next: + ; } } return; @@ -722,8 +747,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) #if RT_CACHE_DEBUG >= 1 printk(KERN_DEBUG "ip_rt_advice: redirect to %d.%d.%d.%d/%02x dropped\n", NIPQUAD(rt->rt_dst), rt->key.tos); #endif - start_bh_atomic(); ip_rt_put(rt); + write_lock_bh(&rt_hash_lock); for (rthp = &rt_hash_table[hash]; *rthp; rthp = &(*rthp)->u.rt_next) { if (*rthp == rt) { *rthp = rt->u.rt_next; @@ -731,7 +756,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) break; } } - end_bh_atomic(); + write_unlock_bh(&rt_hash_lock); return NULL; } } @@ -861,6 +886,7 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) for (i=0; i<2; i++) { unsigned hash = rt_hash_code(daddr, skeys[i], tos); + read_lock_bh(&rt_hash_lock); for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) { if (rth->key.dst == daddr && rth->key.src == skeys[i] && @@ -890,6 +916,7 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) } } } + read_unlock_bh(&rt_hash_lock); } return est_mtu ? 
: new_mtu; } @@ -1362,6 +1389,7 @@ int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr, tos &= IPTOS_TOS_MASK; hash = rt_hash_code(daddr, saddr^(iif<<5), tos); + read_lock_bh(&rt_hash_lock); for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) { if (rth->key.dst == daddr && rth->key.src == saddr && @@ -1374,10 +1402,12 @@ int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr, rth->u.dst.lastuse = jiffies; atomic_inc(&rth->u.dst.use); atomic_inc(&rth->u.dst.refcnt); + read_unlock_bh(&rt_hash_lock); skb->dst = (struct dst_entry*)rth; return 0; } } + read_unlock_bh(&rt_hash_lock); /* Multicast recognition logic is moved from route cache to here. The problem was that too many Ethernet cards have broken/missing @@ -1657,7 +1687,7 @@ int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif) hash = rt_hash_code(daddr, saddr^(oif<<5), tos); - start_bh_atomic(); + read_lock_bh(&rt_hash_lock); for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) { if (rth->key.dst == daddr && rth->key.src == saddr && @@ -1673,12 +1703,12 @@ int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif) rth->u.dst.lastuse = jiffies; atomic_inc(&rth->u.dst.use); atomic_inc(&rth->u.dst.refcnt); - end_bh_atomic(); + read_unlock_bh(&rt_hash_lock); *rp = rth; return 0; } } - end_bh_atomic(); + read_unlock_bh(&rt_hash_lock); return ip_route_output_slow(rp, daddr, saddr, tos, oif); } @@ -1821,9 +1851,7 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) return -ENODEV; skb->protocol = __constant_htons(ETH_P_IP); skb->dev = dev; - start_bh_atomic(); err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); - end_bh_atomic(); rt = (struct rtable*)skb->dst; if (!err && rt->u.dst.error) err = -rt->u.dst.error; @@ -1869,7 +1897,7 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) if (h < s_h) continue; if (h > s_h) s_idx = 0; - start_bh_atomic(); + read_lock_bh(&rt_hash_lock); for (rt = rt_hash_table[h], idx = 0; rt; rt = rt->u.rt_next, idx++) { if (idx < s_idx) continue; @@ -1877,12 +1905,12 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) { dst_release(xchg(&skb->dst, NULL)); - end_bh_atomic(); + read_unlock_bh(&rt_hash_lock); goto done; } dst_release(xchg(&skb->dst, NULL)); } - end_bh_atomic(); + read_unlock_bh(&rt_hash_lock); } done: @@ -1968,6 +1996,7 @@ ctl_table ipv4_route_table[] = { #ifdef CONFIG_NET_CLS_ROUTE struct ip_rt_acct ip_rt_acct[256]; +rwlock_t ip_rt_acct_lock = RW_LOCK_UNLOCKED; #ifdef CONFIG_PROC_FS static int ip_rt_acct_read(char *buffer, char **start, off_t offset, @@ -1980,9 +2009,9 @@ static int ip_rt_acct_read(char *buffer, char **start, off_t offset, *eof = 1; } if (length > 0) { - start_bh_atomic(); + read_lock_bh(&ip_rt_acct_lock); memcpy(buffer, ((u8*)&ip_rt_acct)+offset, length); - end_bh_atomic(); + read_unlock_bh(&ip_rt_acct_lock); return length; } return 0; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8c1c9f9be..779c31cef 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.140 1999/04/22 10:34:31 davem Exp $ + * Version: $Id: tcp.c,v 1.144 1999/05/27 01:03:37 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -416,6 +416,7 @@ #include <linux/fcntl.h> #include <linux/poll.h> #include <linux/init.h> +#include <linux/smp_lock.h> #include <net/icmp.h> #include <net/tcp.h> @@ -432,7 +433,7 @@ kmem_cache_t *tcp_timewait_cachep; /* * Find someone to 'accept'. Must be called with - * the socket locked or with interrupts disabled + * the listening socket locked. */ static struct open_request *tcp_find_established(struct tcp_opt *tp, @@ -441,10 +442,11 @@ static struct open_request *tcp_find_established(struct tcp_opt *tp, struct open_request *req = tp->syn_wait_queue; struct open_request *prev = (struct open_request *)&tp->syn_wait_queue; while(req) { - if (req->sk && - ((1 << req->sk->state) & - ~(TCPF_SYN_SENT|TCPF_SYN_RECV))) - break; + if (req->sk) { + if((1 << req->sk->state) & + ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) + break; + } prev = req; req = req->dl_next; } @@ -655,12 +657,13 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) /* * Wait for a socket to get into the connected state * - * Note: must be called with the socket locked. + * Note: Must be called with the socket locked, and it + * runs with the kernel fully unlocked. */ static int wait_for_tcp_connect(struct sock * sk, int flags) { struct task_struct *tsk = current; - struct wait_queue wait = { tsk, NULL }; + DECLARE_WAITQUEUE(wait, tsk); while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { if(sk->err) @@ -698,12 +701,14 @@ static inline int tcp_memory_free(struct sock *sk) /* * Wait for more memory for a socket + * + * NOTE: This runs with the kernel fully unlocked. */ static void wait_for_tcp_memory(struct sock * sk) { release_sock(sk); if (!tcp_memory_free(sk)) { - struct wait_queue wait = { current, NULL }; + DECLARE_WAITQUEUE(wait, current); sk->socket->flags &= ~SO_NOSPACE; add_wait_queue(sk->sleep, &wait); @@ -744,6 +749,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) int mss_now; int err, copied; + unlock_kernel(); lock_sock(sk); err = 0; @@ -896,6 +902,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) err = -ERESTARTSYS; goto do_interrupted; } + tcp_push_pending_frames(sk, tp); wait_for_tcp_memory(sk); /* If SACK's were formed or PMTU events happened, @@ -969,6 +976,7 @@ do_fault2: out: tcp_push_pending_frames(sk, tp); release_sock(sk); + lock_kernel(); return err; } @@ -1117,7 +1125,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, int len, int nonblock, int flags, int *addr_len) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct wait_queue wait = { current, NULL }; + DECLARE_WAITQUEUE(wait, current); int copied = 0; u32 peek_seq; volatile u32 *seq; /* So gcc doesn't overoptimise */ @@ -1148,6 +1156,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, if (flags & MSG_WAITALL) target=len; + unlock_kernel(); add_wait_queue(sk->sleep, &wait); lock_sock(sk); @@ -1300,6 +1309,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, /* We now will not sleep again until we are finished * with skb. Sorry if you are doing the SMP port * but you'll just have to fix it neatly ;) + * + * Very funny Alan... -DaveM */ atomic_dec(&skb->users); @@ -1344,6 +1355,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, /* Clean up data we have read: This will do ACK frames. */ cleanup_rbuf(sk, copied); release_sock(sk); + lock_kernel(); return copied; } @@ -1415,16 +1427,15 @@ void tcp_shutdown(struct sock *sk, int how) return; /* If we've already sent a FIN, or it's a closed state, skip this. 
*/ + lock_sock(sk); if ((1 << sk->state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) { - lock_sock(sk); /* Clear out any half completed packets. FIN if needed. */ if (tcp_close_state(sk,0)) tcp_send_fin(sk); - - release_sock(sk); } + release_sock(sk); } @@ -1471,13 +1482,6 @@ void tcp_close(struct sock *sk, long timeout) struct sk_buff *skb; int data_was_unread = 0; - /* - * Check whether the socket is locked ... supposedly - * it's impossible to tcp_close() a locked socket. - */ - if (atomic_read(&sk->sock_readers)) - printk("tcp_close: socket already locked!\n"); - /* We need to grab some memory, and put together a FIN, * and then put it into the queue to be sent. */ @@ -1491,6 +1495,8 @@ void tcp_close(struct sock *sk, long timeout) return; } + unlock_kernel(); + /* It is questionable, what the role of this is now. * In any event either it should be removed, or * increment of SLT_KEEPALIVE be done, this is causing @@ -1534,24 +1540,23 @@ void tcp_close(struct sock *sk, long timeout) if (timeout) { struct task_struct *tsk = current; - struct wait_queue wait = { tsk, NULL }; + DECLARE_WAITQUEUE(wait, current); add_wait_queue(sk->sleep, &wait); - release_sock(sk); while (1) { tsk->state = TASK_INTERRUPTIBLE; if (!closing(sk)) break; + release_sock(sk); timeout = schedule_timeout(timeout); + lock_sock(sk); if (signal_pending(tsk) || !timeout) break; } tsk->state = TASK_RUNNING; remove_wait_queue(sk->sleep, &wait); - - lock_sock(sk); } /* Now that the socket is dead, if we are in the FIN_WAIT2 state @@ -1559,23 +1564,40 @@ void tcp_close(struct sock *sk, long timeout) */ tcp_check_fin_timer(sk); - release_sock(sk); sk->dead = 1; + + release_sock(sk); + lock_kernel(); } /* * Wait for an incoming connection, avoid race - * conditions. This must be called with the socket locked. + * conditions. This must be called with the socket locked, + * and without the kernel lock held. */ static struct open_request * wait_for_connect(struct sock * sk, struct open_request **pprev) { - struct wait_queue wait = { current, NULL }; + DECLARE_WAITQUEUE(wait, current); struct open_request *req; - add_wait_queue(sk->sleep, &wait); + /* + * True wake-one mechanism for incoming connections: only + * one process gets woken up, not the 'whole herd'. + * Since we do not 'race & poll' for established sockets + * anymore, the common case will execute the loop only once. + * + * Subtle issue: "add_wait_queue_exclusive()" will be added + * after any current non-exclusive waiters, and we know that + * it will always _stay_ after any new non-exclusive waiters + * because all non-exclusive waiters are added at the + * beginning of the wait-queue. As such, it's ok to "drop" + * our exclusiveness temporarily when we get woken up without + * having to remove and re-insert us on the wait queue. 
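 *
 * The waker side needs nothing extra for this to work: the normal
 * wake_up_interruptible() on sk->sleep, issued when a child socket
 * reaches ESTABLISHED, first wakes whatever non-exclusive waiters
 * sit at the head of the queue (poll/select) and then wakes exactly
 * one exclusive accept() sleeper before stopping, which is the
 * wake-one behaviour described above.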
+ */ + add_wait_queue_exclusive(sk->sleep, &wait); for (;;) { - current->state = TASK_INTERRUPTIBLE; + current->state = TASK_EXCLUSIVE | TASK_INTERRUPTIBLE; release_sock(sk); schedule(); lock_sock(sk); @@ -1603,6 +1625,7 @@ struct sock *tcp_accept(struct sock *sk, int flags) struct sock *newsk = NULL; int error; + unlock_kernel(); lock_sock(sk); /* We need to make sure that this socket is listening, @@ -1633,16 +1656,17 @@ struct sock *tcp_accept(struct sock *sk, int flags) sk->ack_backlog--; if(sk->keepopen) tcp_inc_slow_timer(TCP_SLT_KEEPALIVE); - release_sock(sk); + lock_kernel(); return newsk; out: /* sk should be in LISTEN state, thus accept can use sk->err for - * internal purposes without stomping one anyone's feed. + * internal purposes without stomping on anyone's feed. */ sk->err = error; release_sock(sk); + lock_kernel(); return newsk; } @@ -1765,6 +1789,8 @@ extern void __skb_cb_too_small_for_tcp(int, int); void __init tcp_init(void) { struct sk_buff *skb = NULL; + unsigned long goal; + int order; if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)) __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), @@ -1790,4 +1816,37 @@ void __init tcp_init(void) NULL, NULL); if(!tcp_timewait_cachep) panic("tcp_init: Cannot alloc tcp_tw_bucket cache."); + + /* Size and allocate the main established and bind bucket + * hash tables. + * + * The methodology is similar to that of the buffer cache. + */ + goal = num_physpages >> (20 - PAGE_SHIFT); + for(order = 5; (1UL << order) < goal; order++) + ; + do { + tcp_ehash_size = (1UL << order) * PAGE_SIZE / + sizeof(struct sock *); + tcp_ehash = (struct sock **) + __get_free_pages(GFP_ATOMIC, order); + } while (tcp_ehash == NULL && --order > 4); + + if (!tcp_ehash) + panic("Failed to allocate TCP established hash table\n"); + memset(tcp_ehash, 0, tcp_ehash_size * sizeof(struct sock *)); + + do { + tcp_bhash_size = (1UL << order) * PAGE_SIZE / + sizeof(struct tcp_bind_bucket *); + tcp_bhash = (struct tcp_bind_bucket **) + __get_free_pages(GFP_ATOMIC, order); + } while (tcp_bhash == NULL && --order > 4); + + if (!tcp_bhash) + panic("Failed to allocate TCP bind hash table\n"); + memset(tcp_bhash, 0, tcp_bhash_size * sizeof(struct tcp_bind_bucket *)); + + printk("TCP: Hash tables configured (established %d bind %d)\n", + tcp_ehash_size, tcp_bhash_size); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4a607a749..af4165fce 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.164 1999/05/08 21:09:52 davem Exp $ + * Version: $Id: tcp_input.c,v 1.169 1999/06/09 08:29:13 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -748,7 +748,6 @@ static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp, static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) { struct sk_buff *skb = skb_peek(&sk->write_queue); - __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when); /* Some data was ACK'd, if still retransmitting (due to a * timeout), resend more of the retransmit queue. 
The @@ -758,6 +757,9 @@ static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) tcp_xmit_retransmit_queue(sk); tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); } else { + __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when); + if ((__s32)when < 0) + when = 1; tcp_reset_xmit_timer(sk, TIME_RETRANS, when); } } @@ -785,8 +787,6 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una)) goto uninteresting_ack; - dst_confirm(sk->dst_cache); - /* If there is data set flag 1 */ if (len != th->doff*4) { flag |= FLAG_DATA; @@ -882,6 +882,24 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, /* Clear any aborted fast retransmit starts. */ tp->dup_acks = 0; } + /* It is not a brain fart, I thought a bit now. 8) + * + * Forward progress is indicated, if: + * 1. the ack acknowledges new data. + * 2. or the ack is duplicate, but it is caused by new segment + * arrival. This case is filtered by: + * - it contains no data, syn or fin. + * - it does not update window. + * 3. or new SACK. It is difficult to check, so that we ignore it. + * + * Forward progress is also indicated by arrival new data, + * which was caused by window open from our side. This case is more + * difficult and it is made (alas, incorrectly) in tcp_data_queue(). + * --ANK (990513) + */ + if (ack != tp->snd_una || (flag == 0 && !th->fin)) + dst_confirm(sk->dst_cache); + /* Remember the highest ack received. */ tp->snd_una = ack; return 1; @@ -896,8 +914,11 @@ extern void tcp_tw_schedule(struct tcp_tw_bucket *tw); extern void tcp_tw_reschedule(struct tcp_tw_bucket *tw); extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw); +/* Must be called only from BH context. */ void tcp_timewait_kill(struct tcp_tw_bucket *tw) { + SOCKHASH_LOCK_WRITE_BH(); + /* Unlink from various places. */ if(tw->bind_next) tw->bind_next->bind_pprev = tw->bind_pprev; @@ -915,6 +936,8 @@ void tcp_timewait_kill(struct tcp_tw_bucket *tw) tw->sklist_next->sklist_prev = tw->sklist_prev; tw->sklist_prev->sklist_next = tw->sklist_next; + SOCKHASH_UNLOCK_WRITE_BH(); + /* Ok, now free it up. */ kmem_cache_free(tcp_timewait_cachep, tw); } @@ -945,6 +968,7 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, struct sock *sk; struct tcp_func *af_specific = tw->af_specific; __u32 isn; + int ret; isn = tw->rcv_nxt + 128000; if(isn == 0) @@ -953,14 +977,25 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, tcp_timewait_kill(tw); sk = af_specific->get_sock(skb, th); if(sk == NULL || - !ipsec_sk_policy(sk,skb) || - atomic_read(&sk->sock_readers) != 0) + !ipsec_sk_policy(sk,skb)) return 0; + + bh_lock_sock(sk); + + /* Default is to discard the frame. */ + ret = 0; + + if(sk->lock.users) + goto out_unlock; + skb_set_owner_r(skb, sk); af_specific = sk->tp_pinfo.af_tcp.af_specific; + if(af_specific->conn_request(sk, skb, isn) < 0) - return 1; /* Toss a reset back. */ - return 0; /* Discard the frame. */ + ret = 1; /* Toss a reset back. */ + out_unlock: + bh_unlock_sock(sk); + return ret; } /* Check RST or SYN */ @@ -1013,7 +1048,7 @@ static __inline__ void tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *t sk->prot->inuse--; /* Step 4: Hash TW into TIMEWAIT half of established hash table. 
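 *
 * With the dynamically sized table the split works out as: chains
 * [0, tcp_ehash_size/2) hold ESTABLISHED sockets (tcp_hashfn() masks
 * with (tcp_ehash_size >> 1) - 1), while chains [tcp_ehash_size/2,
 * tcp_ehash_size) hold TIME_WAIT buckets, so the TIME_WAIT chain is
 * simply the socket's established chain offset by
 * (tcp_ehash_size >> 1), as done below.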
*/ - head = &tcp_established_hash[sk->hashent + (TCP_HTABLE_SIZE/2)]; + head = &tcp_ehash[sk->hashent + (tcp_ehash_size >> 1)]; sktw = (struct sock *)tw; if((sktw->next = *head) != NULL) (*head)->pprev = &sktw->next; @@ -1051,7 +1086,9 @@ void tcp_time_wait(struct sock *sk) } #endif /* Linkage updates. */ + SOCKHASH_LOCK_WRITE(); tcp_tw_hashdance(sk, tw); + SOCKHASH_UNLOCK_WRITE(); /* Get the TIME_WAIT timeout firing. */ tcp_tw_schedule(tw); @@ -1801,7 +1838,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, } } - flg = *(((u32 *)th) + 3) & ~htonl(0x8 << 16); + flg = *(((u32 *)th) + 3) & ~htonl(0xFC8 << 16); /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_predition is to be made @@ -2031,8 +2068,26 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* These use the socket TOS.. * might want to be the received TOS */ - if(th->ack) - return 1; + if(th->ack) { + struct sock *realsk; + int ret; + + realsk = tp->af_specific->get_sock(skb, th); + if(realsk == sk) + return 1; + + bh_lock_sock(realsk); + ret = 0; + if(realsk->lock.users != 0) { + skb_orphan(skb); + sk_add_backlog(realsk, skb); + } else { + ret = tcp_rcv_state_process(realsk, skb, + skb->h.th, skb->len); + } + bh_unlock_sock(realsk); + return ret; + } if(th->syn) { if(tp->af_specific->conn_request(sk, skb, 0) < 0) @@ -2067,21 +2122,81 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * not be in line code. [AC] */ if(th->ack) { - tp->snd_wl1 = TCP_SKB_CB(skb)->seq; - - /* We got an ack, but it's not a good ack. */ - if(!tcp_ack(sk,th, TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->ack_seq, len)) + /* rfc793: + * "If the state is SYN-SENT then + * first check the ACK bit + * If the ACK bit is set + * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send + * a reset (unless the RST bit is set, if so drop + * the segment and return)" + * + * I cite this place to emphasize one essential + * detail, this check is different of one + * in established state: SND.UNA <= SEG.ACK <= SND.NXT. + * SEG_ACK == SND.UNA == ISS is invalid in SYN-SENT, + * because we have no previous data sent before SYN. + * --ANK(990513) + * + * We do not send data with SYN, so that RFC-correct + * test reduces to: + */ + if (sk->zapped || + TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) return 1; - if(th->rst) { + /* Now ACK is acceptable. + * + * "If the RST bit is set + * If the ACK was acceptable then signal the user "error: + * connection reset", drop the segment, enter CLOSED state, + * delete TCB, and return." + */ + + if (th->rst) { tcp_reset(sk); goto discard; } - if(!th->syn) + /* rfc793: + * "fifth, if neither of the SYN or RST bits is set then + * drop the segment and return." + * + * See note below! + * --ANK(990513) + */ + + if (!th->syn) goto discard; + /* rfc793: + * "If the SYN bit is on ... + * are acceptable then ... + * (our SYN has been ACKed), change the connection + * state to ESTABLISHED..." + * + * Do you see? SYN-less ACKs in SYN-SENT state are + * completely ignored. + * + * The bug causing stalled SYN-SENT sockets + * was here: tcp_ack advanced snd_una and canceled + * retransmit timer, so that bare ACK received + * in SYN-SENT state (even with invalid ack==ISS, + * because tcp_ack check is too weak for SYN-SENT) + * causes moving socket to invalid semi-SYN-SENT, + * semi-ESTABLISHED state and connection hangs. + * + * There exist buggy stacks, which really send + * such ACKs: f.e. 
202.226.91.94 (okigate.oki.co.jp) + * Actually, if this host did not try to get something + * from ftp.inr.ac.ru I'd never find this bug 8) + * + * --ANK (990514) + */ + + tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tcp_ack(sk,th, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->ack_seq, len); + /* Ok.. it's good. Set up sequence numbers and * move to established. */ @@ -2206,8 +2321,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, !(th->fin && TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)) { if (!th->rst) { tcp_send_ack(sk); - goto discard; } + goto discard; } /* step 2: check RST bit */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b5070c3a7..564e859f2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.175 1999/05/08 21:09:54 davem Exp $ + * Version: $Id: tcp_ipv4.c,v 1.180 1999/06/09 08:29:19 davem Exp $ * * IPv4 specific functions * @@ -90,12 +90,14 @@ void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, * First half of the table is for sockets not in TIME_WAIT, second half * is for TIME_WAIT sockets only. */ -struct sock *tcp_established_hash[TCP_HTABLE_SIZE]; +struct sock **tcp_ehash; +int tcp_ehash_size; /* Ok, let's try this, I give up, we do need a local binding * TCP hash as well as the others for fast bind/connect. */ -struct tcp_bind_bucket *tcp_bound_hash[TCP_BHTABLE_SIZE]; +struct tcp_bind_bucket **tcp_bhash; +int tcp_bhash_size; /* All sockets in TCP_LISTEN state will be in here. This is the only table * where wildcard'd TCP sockets can exist. Hash function here is just local @@ -117,7 +119,7 @@ int tcp_port_rover = (1024 - 1); static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport, __u32 faddr, __u16 fport) { - return ((laddr ^ lport) ^ (faddr ^ fport)) & ((TCP_HTABLE_SIZE/2) - 1); + return ((laddr ^ lport) ^ (faddr ^ fport)) & ((tcp_ehash_size >> 1) - 1); } static __inline__ int tcp_sk_hashfn(struct sock *sk) @@ -136,8 +138,8 @@ void tcp_bucket_unlock(struct sock *sk) struct tcp_bind_bucket *tb; unsigned short snum = sk->num; - SOCKHASH_LOCK(); - for(tb = tcp_bound_hash[tcp_bhashfn(snum)]; tb; tb = tb->next) { + SOCKHASH_LOCK_WRITE(); + for(tb = tcp_bhash[tcp_bhashfn(snum)]; tb; tb = tb->next) { if(tb->port == snum) { if(tb->owners == NULL && (tb->flags & TCPB_FLAG_LOCKED)) { @@ -148,9 +150,10 @@ void tcp_bucket_unlock(struct sock *sk) break; } } - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); } +/* The sockhash lock must be held as a writer here. 
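 * tcp_bucket_create() splices the new bucket onto the head of its
 * tcp_bhash chain, and callers such as tcp_bucket_check() rely on
 * the search-then-create sequence being atomic, so a read lock is
 * not sufficient here.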
*/ struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum) { struct tcp_bind_bucket *tb; @@ -158,7 +161,7 @@ struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum) tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC); if(tb != NULL) { struct tcp_bind_bucket **head = - &tcp_bound_hash[tcp_bhashfn(snum)]; + &tcp_bhash[tcp_bhashfn(snum)]; tb->port = snum; tb->flags = TCPB_FLAG_LOCKED; tb->owners = NULL; @@ -176,13 +179,18 @@ struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum) */ static __inline__ int tcp_bucket_check(unsigned short snum) { - struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(snum)]; + struct tcp_bind_bucket *tb; + int ret = 0; + + SOCKHASH_LOCK_WRITE(); + tb = tcp_bhash[tcp_bhashfn(snum)]; for( ; (tb && (tb->port != snum)); tb = tb->next) ; if(tb == NULL && tcp_bucket_create(snum) == NULL) - return 1; - else - return 0; + ret = 1; + SOCKHASH_UNLOCK_WRITE(); + + return ret; } #endif @@ -191,8 +199,8 @@ static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum) struct tcp_bind_bucket *tb; int result = 0; - SOCKHASH_LOCK(); - for(tb = tcp_bound_hash[tcp_bhashfn(snum)]; + SOCKHASH_LOCK_WRITE(); + for(tb = tcp_bhash[tcp_bhashfn(snum)]; (tb && (tb->port != snum)); tb = tb->next) ; @@ -256,7 +264,7 @@ static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum) } } go_like_smoke: - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); return result; } @@ -268,13 +276,13 @@ unsigned short tcp_good_socknum(void) int remaining = (high - low) + 1; int rover; - SOCKHASH_LOCK(); + SOCKHASH_LOCK_WRITE(); rover = tcp_port_rover; do { rover += 1; if((rover < low) || (rover > high)) rover = low; - tb = tcp_bound_hash[tcp_bhashfn(rover)]; + tb = tcp_bhash[tcp_bhashfn(rover)]; for( ; tb; tb = tb->next) { if(tb->port == rover) goto next; @@ -288,7 +296,7 @@ unsigned short tcp_good_socknum(void) rover = 0; if (tb != NULL) tb->flags |= TCPB_FLAG_GOODSOCKNUM; - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); return rover; } @@ -298,20 +306,20 @@ static void tcp_v4_hash(struct sock *sk) if (sk->state != TCP_CLOSE) { struct sock **skp; - SOCKHASH_LOCK(); - skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))]; + SOCKHASH_LOCK_WRITE(); + skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))]; if((sk->next = *skp) != NULL) (*skp)->pprev = &sk->next; *skp = sk; sk->pprev = skp; tcp_sk_bindify(sk); - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); } } static void tcp_v4_unhash(struct sock *sk) { - SOCKHASH_LOCK(); + SOCKHASH_LOCK_WRITE(); if(sk->pprev) { if(sk->next) sk->next->pprev = sk->pprev; @@ -320,14 +328,14 @@ static void tcp_v4_unhash(struct sock *sk) tcp_reg_zap(sk); tcp_sk_unbindify(sk); } - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); } static void tcp_v4_rehash(struct sock *sk) { unsigned char state; - SOCKHASH_LOCK(); + SOCKHASH_LOCK_WRITE(); state = sk->state; if(sk->pprev != NULL) { if(sk->next) @@ -342,7 +350,7 @@ static void tcp_v4_rehash(struct sock *sk) if(state == TCP_LISTEN) skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; else - skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))]; + skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))]; if((sk->next = *skp) != NULL) (*skp)->pprev = &sk->next; @@ -351,7 +359,7 @@ static void tcp_v4_rehash(struct sock *sk) if(state == TCP_LISTEN) tcp_sk_bindify(sk); } - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); } /* Don't inline this cruft. 
Here are some nice properties to @@ -395,10 +403,10 @@ static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int d /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM - * It is assumed that this code only gets called from within NET_BH. + * + * The sockhash lock must be held as a reader here. */ -static inline struct sock *__tcp_v4_lookup(struct tcphdr *th, - u32 saddr, u16 sport, +static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) { TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) @@ -416,7 +424,7 @@ static inline struct sock *__tcp_v4_lookup(struct tcphdr *th, * have wildcards anyways. */ hash = tcp_hashfn(daddr, hnum, saddr, sport); - for(sk = tcp_established_hash[hash]; sk; sk = sk->next) { + for(sk = tcp_ehash[hash]; sk; sk = sk->next) { if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) { if (sk->state == TCP_ESTABLISHED) TCP_RHASH(sport) = sk; @@ -424,7 +432,7 @@ static inline struct sock *__tcp_v4_lookup(struct tcphdr *th, } } /* Must check for a TIME_WAIT'er before going to listener hash. */ - for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next) + for(sk = tcp_ehash[hash+(tcp_ehash_size >> 1)]; sk; sk = sk->next) if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) goto hit; sk = tcp_v4_lookup_listener(daddr, hnum, dif); @@ -434,7 +442,13 @@ hit: __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) { - return __tcp_v4_lookup(0, saddr, sport, daddr, dport, dif); + struct sock *sk; + + SOCKHASH_LOCK_READ(); + sk = __tcp_v4_lookup(saddr, sport, daddr, dport, dif); + SOCKHASH_UNLOCK_READ(); + + return sk; } #ifdef CONFIG_IP_TRANSPARENT_PROXY @@ -462,9 +476,12 @@ static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr, paddr = idev->ifa_list->ifa_local; } - /* This code must run only from NET_BH. */ + /* We must obtain the sockhash lock here, we are always + * in BH context. + */ + SOCKHASH_LOCK_READ_BH(); { - struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hnum)]; + struct tcp_bind_bucket *tb = tcp_bhash[tcp_bhashfn(hnum)]; for( ; (tb && tb->port != hnum); tb = tb->next) ; if(tb == NULL) @@ -505,7 +522,7 @@ pass2: } next: if(firstpass--) { - struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hpnum)]; + struct tcp_bind_bucket *tb = tcp_bhash[tcp_bhashfn(hpnum)]; for( ; (tb && tb->port != hpnum); tb = tb->next) ; if(tb) { @@ -514,6 +531,7 @@ next: } } gotit: + SOCKHASH_UNLOCK_READ_BH(); return result; } #endif /* CONFIG_IP_TRANSPARENT_PROXY */ @@ -540,21 +558,23 @@ static int tcp_v4_unique_address(struct sock *sk) int retval = 1; /* Freeze the hash while we snoop around. */ - SOCKHASH_LOCK(); - tb = tcp_bound_hash[tcp_bhashfn(snum)]; + SOCKHASH_LOCK_READ(); + tb = tcp_bhash[tcp_bhashfn(snum)]; for(; tb; tb = tb->next) { if(tb->port == snum && tb->owners != NULL) { /* Almost certainly the re-use port case, search the real hashes * so it actually scales. 
*/ - sk = __tcp_v4_lookup(NULL, sk->daddr, sk->dport, + sk = __tcp_v4_lookup(sk->daddr, sk->dport, sk->rcv_saddr, snum, sk->bound_dev_if); + SOCKHASH_UNLOCK_READ(); + if((sk != NULL) && (sk->state != TCP_LISTEN)) retval = 0; - break; + return retval; } } - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_READ(); return retval; } @@ -727,16 +747,17 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - if (atomic_read(&sk->sock_readers)) - return; - - /* Don't interested in TCP_LISTEN and open_requests (SYN-ACKs + /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs * send out by Linux are always <576bytes so they should go through * unfragmented). */ if (sk->state == TCP_LISTEN) return; + bh_lock_sock(sk); + if(sk->lock.users != 0) + goto out; + /* We don't check in the destentry if pmtu discovery is forbidden * on this route. We just assume that no packet_to_big packets * are send back when pmtu discovery is not active. @@ -744,7 +765,8 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned * route, but I think that's acceptable. */ if (sk->dst_cache == NULL) - return; + goto out; + ip_rt_update_pmtu(sk->dst_cache, mtu); if (sk->ip_pmtudisc != IP_PMTUDISC_DONT && tp->pmtu_cookie > sk->dst_cache->pmtu) { @@ -757,6 +779,8 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned */ tcp_simple_retransmit(sk); } /* else let the usual retransmit timer handle it */ +out: + bh_unlock_sock(sk); } /* @@ -849,17 +873,6 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) switch (sk->state) { struct open_request *req, *prev; case TCP_LISTEN: - /* Prevent race conditions with accept() - - * ICMP is unreliable. - */ - if (atomic_read(&sk->sock_readers)) { - net_statistics.LockDroppedIcmps++; - /* If too many ICMPs get dropped on busy - * servers this needs to be solved differently. - */ - return; - } - /* The final ACK of the handshake should be already * handled in the new socket context, not here. * Strictly speaking - an ICMP error for the final @@ -869,12 +882,24 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) if (!no_flags && !th->syn && !th->ack) return; + /* Prevent race conditions with accept() - + * ICMP is unreliable. + */ + bh_lock_sock(sk); + if (sk->lock.users != 0) { + net_statistics.LockDroppedIcmps++; + /* If too many ICMPs get dropped on busy + * servers this needs to be solved differently. + */ + goto out_unlock; + } + req = tcp_v4_search_req(tp, iph, th, &prev); if (!req) - return; + goto out_unlock; if (seq != req->snt_isn) { net_statistics.OutOfWindowIcmps++; - return; + goto out_unlock; } if (req->sk) { /* @@ -884,6 +909,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) * but only with the next operation on the socket after * accept. 
*/ + bh_unlock_sock(sk); sk = req->sk; } else { /* @@ -896,6 +922,8 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) tcp_synq_unlink(tp, req, prev); req->class->destructor(req); tcp_openreq_free(req); + out_unlock: + bh_unlock_sock(sk); return; } break; @@ -1025,9 +1053,10 @@ static struct sock *tcp_v4_search_proxy_openreq(struct sk_buff *skb) { struct iphdr *iph = skb->nh.iph; struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4); - struct sock *sk; + struct sock *sk = NULL; int i; + SOCKHASH_LOCK_READ(); for (i=0; i<TCP_LHTABLE_SIZE; i++) { for(sk = tcp_listening_hash[i]; sk; sk = sk->next) { struct open_request *dummy; @@ -1035,10 +1064,12 @@ static struct sock *tcp_v4_search_proxy_openreq(struct sk_buff *skb) th, &dummy) && (!sk->bound_dev_if || sk->bound_dev_if == skb->dev->ifindex)) - return sk; + goto out; } } - return NULL; +out: + SOCKHASH_UNLOCK_READ(); + return sk; } /* @@ -1319,7 +1350,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, /* Clone the TCP header template */ newsk->dport = req->rmt_port; - atomic_set(&newsk->sock_readers, 0); + sock_lock_init(newsk); + atomic_set(&newsk->rmem_alloc, 0); skb_queue_head_init(&newsk->receive_queue); atomic_set(&newsk->wmem_alloc, 0); @@ -1328,9 +1360,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newsk->done = 0; newsk->proc = 0; - newsk->pair = NULL; - skb_queue_head_init(&newsk->back_log); + newsk->backlog.head = newsk->backlog.tail = NULL; skb_queue_head_init(&newsk->error_queue); + newsk->write_space = tcp_write_space; #ifdef CONFIG_FILTER if ((filter = newsk->filter) != NULL) sk_filter_charge(newsk, filter); @@ -1552,7 +1584,8 @@ static inline struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb) } /* Check for SYN|ACK */ - if (flg & __constant_htonl(0x00120000)) { + flg &= __constant_htonl(0x00120000); + if (flg) { struct open_request *req, *dummy; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); @@ -1570,8 +1603,17 @@ static inline struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb) return sk; } +/* The socket must have it's spinlock held when we get + * here. + * + * We have a potential double-lock case here, so even when + * doing backlog processing we use the BH locking scheme. + * This is because we cannot sleep with the original spinlock + * held. + */ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { + int need_unlock = 0; #ifdef CONFIG_FILTER struct sk_filter *filter = sk->filter; if (filter && sk_filter(skb, filter)) @@ -1591,7 +1633,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; } - if (sk->state == TCP_LISTEN) { struct sock *nsk; @@ -1604,17 +1645,22 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) * otherwise we just shortcircuit this and continue with * the new socket.. */ - if (atomic_read(&nsk->sock_readers)) { - skb_orphan(skb); - __skb_queue_tail(&nsk->back_log, skb); - return 0; + if (nsk != sk) { + bh_lock_sock(nsk); + if (nsk->lock.users != 0) { + skb_orphan(skb); + sk_add_backlog(nsk, skb); + bh_unlock_sock(nsk); + return 0; + } + need_unlock = 1; + sk = nsk; } - sk = nsk; } if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) goto reset; - return 0; + goto out_maybe_unlock; reset: tcp_v4_send_reset(skb); @@ -1625,6 +1671,9 @@ discard: * might be destroyed here. This current version compiles correctly, * but you have been warned. 
*/ +out_maybe_unlock: + if(need_unlock) + bh_unlock_sock(sk); return 0; } @@ -1636,6 +1685,7 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) { struct tcphdr *th; struct sock *sk; + int ret; if (skb->pkt_type!=PACKET_HOST) goto discard_it; @@ -1681,8 +1731,10 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) IPCB(skb)->redirport, skb->dev->ifindex); else { #endif - sk = __tcp_v4_lookup(th, skb->nh.iph->saddr, th->source, + SOCKHASH_LOCK_READ_BH(); + sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source, skb->nh.iph->daddr, th->dest, skb->dev->ifindex); + SOCKHASH_UNLOCK_READ_BH(); #ifdef CONFIG_IP_TRANSPARENT_PROXY if (!sk) sk = tcp_v4_search_proxy_openreq(skb); @@ -1702,11 +1754,16 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) if (sk->state == TCP_TIME_WAIT) goto do_time_wait; - if (!atomic_read(&sk->sock_readers)) - return tcp_v4_do_rcv(sk, skb); - __skb_queue_tail(&sk->back_log, skb); - return 0; + bh_lock_sock(sk); + ret = 0; + if (!sk->lock.users) + ret = tcp_v4_do_rcv(sk, skb); + else + sk_add_backlog(sk, skb); + bh_unlock_sock(sk); + + return ret; no_tcp_socket: tcp_v4_send_reset(skb); @@ -1944,6 +2001,8 @@ __initfunc(void tcp_v4_init(struct net_proto_family *ops)) tcp_inode.i_sock = 1; tcp_inode.i_uid = 0; tcp_inode.i_gid = 0; + init_waitqueue_head(&tcp_inode.i_wait); + init_waitqueue_head(&tcp_inode.u.socket_i.wait); tcp_socket->inode = &tcp_inode; tcp_socket->state = SS_UNCONNECTED; @@ -1952,6 +2011,11 @@ __initfunc(void tcp_v4_init(struct net_proto_family *ops)) if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0) panic("Failed to create the TCP control socket.\n"); tcp_socket->sk->allocation=GFP_ATOMIC; - tcp_socket->sk->num = 256; /* Don't receive any data */ tcp_socket->sk->ip_ttl = MAXTTL; + + /* Unhash it so that IP input processing does not even + * see it, we do not wish this socket to see incoming + * packets. + */ + tcp_socket->sk->prot->unhash(tcp_socket->sk); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9a096f0f3..18b5ebf80 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_output.c,v 1.108 1999/05/08 21:48:59 davem Exp $ + * Version: $Id: tcp_output.c,v 1.110 1999/05/27 00:37:45 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -36,6 +36,8 @@ #include <net/tcp.h> +#include <linux/smp_lock.h> + extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; @@ -240,6 +242,11 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) /* Rechecksum original buffer. */ skb->csum = csum_partial(skb->data, skb->len, 0); + /* Looks stupid, but our code really uses when of + * skbs, which it never sent before. --ANK + */ + TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; + /* Link BUFF into the send queue. */ __skb_append(skb, buff); @@ -961,6 +968,7 @@ void tcp_connect(struct sock *sk, struct sk_buff *buff, int mtu) /* Ok, now lock the socket before we make it visible to * the incoming packet engine. */ + unlock_kernel(); lock_sock(sk); /* Socket identity change complete, no longer @@ -988,6 +996,7 @@ void tcp_connect(struct sock *sk, struct sk_buff *buff, int mtu) /* Now, it is safe to release the socket. 
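 * (The lock_kernel() below pairs with the unlock_kernel() added just
 *  before lock_sock() earlier in this function; as in the other
 *  tcp.c paths in this patch, it is the socket lock rather than the
 *  big kernel lock that protects us once the socket is visible to
 *  the incoming packet engine.)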
*/ release_sock(sk); + lock_kernel(); } /* Send out a delayed ack, the caller does the policy checking diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index ad6ccace9..d23eef143 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_timer.c,v 1.62 1999/05/08 21:09:55 davem Exp $ + * Version: $Id: tcp_timer.c,v 1.64 1999/05/27 00:37:31 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -168,15 +168,16 @@ void tcp_delack_timer(unsigned long data) { struct sock *sk = (struct sock*)data; + bh_lock_sock(sk); if(!sk->zapped && sk->tp_pinfo.af_tcp.delayed_acks && sk->state != TCP_CLOSE) { - /* If socket is currently locked, defer the ACK. */ - if (!atomic_read(&sk->sock_readers)) + if (!sk->lock.users) tcp_send_ack(sk); else tcp_send_delayed_ack(&(sk->tp_pinfo.af_tcp), HZ/10); } + bh_unlock_sock(sk); } void tcp_probe_timer(unsigned long data) @@ -187,9 +188,11 @@ void tcp_probe_timer(unsigned long data) if(sk->zapped) return; - if (atomic_read(&sk->sock_readers)) { + bh_lock_sock(sk); + if (sk->lock.users) { /* Try again later. */ tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ/5); + bh_unlock_sock(sk); return; } @@ -216,6 +219,7 @@ void tcp_probe_timer(unsigned long data) /* Only send another probe if we didn't close things up. */ tcp_send_probe0(sk); } + bh_unlock_sock(sk); } static __inline__ int tcp_keepopen_proc(struct sock *sk) @@ -253,8 +257,9 @@ static void tcp_bucketgc(unsigned long data) { int i, reaped = 0;; - for(i = 0; i < TCP_BHTABLE_SIZE; i++) { - struct tcp_bind_bucket *tb = tcp_bound_hash[i]; + SOCKHASH_LOCK_WRITE_BH(); + for(i = 0; i < tcp_bhash_size; i++) { + struct tcp_bind_bucket *tb = tcp_bhash[i]; while(tb) { struct tcp_bind_bucket *next = tb->next; @@ -274,6 +279,8 @@ static void tcp_bucketgc(unsigned long data) tb = next; } } + SOCKHASH_UNLOCK_WRITE_BH(); + if(reaped != 0) { struct tcp_sl_timer *slt = (struct tcp_sl_timer *)data; @@ -294,8 +301,14 @@ static void tcp_twkill(unsigned long data) struct tcp_tw_bucket *tw; int killed = 0; + /* The death-row tw chains are only ever touched + * in BH context so no locking is needed. + */ tw = tcp_tw_death_row[tcp_tw_death_row_slot]; tcp_tw_death_row[tcp_tw_death_row_slot] = NULL; + tcp_tw_death_row_slot = + ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); + while(tw != NULL) { struct tcp_tw_bucket *next = tw->next_death; @@ -307,8 +320,6 @@ static void tcp_twkill(unsigned long data) struct tcp_sl_timer *slt = (struct tcp_sl_timer *)data; atomic_sub(killed, &slt->count); } - tcp_tw_death_row_slot = - ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); } /* These are always called from BH context. 
See callers in @@ -319,12 +330,14 @@ void tcp_tw_schedule(struct tcp_tw_bucket *tw) int slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1); struct tcp_tw_bucket **tpp = &tcp_tw_death_row[slot]; + SOCKHASH_LOCK_WRITE_BH(); if((tw->next_death = *tpp) != NULL) (*tpp)->pprev_death = &tw->next_death; *tpp = tw; tw->pprev_death = tpp; tw->death_slot = slot; + SOCKHASH_UNLOCK_WRITE_BH(); tcp_inc_slow_timer(TCP_SLT_TWKILL); } @@ -335,6 +348,7 @@ void tcp_tw_reschedule(struct tcp_tw_bucket *tw) struct tcp_tw_bucket **tpp; int slot; + SOCKHASH_LOCK_WRITE_BH(); if(tw->next_death) tw->next_death->pprev_death = tw->pprev_death; *tw->pprev_death = tw->next_death; @@ -348,16 +362,21 @@ void tcp_tw_reschedule(struct tcp_tw_bucket *tw) tw->pprev_death = tpp; tw->death_slot = slot; + SOCKHASH_UNLOCK_WRITE_BH(); + /* Timer was incremented when we first entered the table. */ } /* This is for handling early-kills of TIME_WAIT sockets. */ void tcp_tw_deschedule(struct tcp_tw_bucket *tw) { + SOCKHASH_LOCK_WRITE_BH(); if(tw->next_death) tw->next_death->pprev_death = tw->pprev_death; *tw->pprev_death = tw->next_death; tw->pprev_death = NULL; + SOCKHASH_UNLOCK_WRITE_BH(); + tcp_dec_slow_timer(TCP_SLT_TWKILL); } @@ -399,20 +418,30 @@ static void tcp_keepalive(unsigned long data) int count = 0; int i; - for(i = chain_start; i < (chain_start + ((TCP_HTABLE_SIZE/2) >> 2)); i++) { - struct sock *sk = tcp_established_hash[i]; + SOCKHASH_LOCK_READ_BH(); + for(i = chain_start; i < (chain_start + ((tcp_ehash_size >> 1) >> 2)); i++) { + struct sock *sk; + + sk = tcp_ehash[i]; while(sk) { - if(!atomic_read(&sk->sock_readers) && sk->keepopen) { + struct sock *next = sk->next; + + bh_lock_sock(sk); + if (sk->keepopen && !sk->lock.users) { + SOCKHASH_UNLOCK_READ_BH(); count += tcp_keepopen_proc(sk); - if(count == sysctl_tcp_max_ka_probes) - goto out; + SOCKHASH_LOCK_READ_BH(); } - sk = sk->next; + bh_unlock_sock(sk); + if(count == sysctl_tcp_max_ka_probes) + goto out; + sk = next; } } out: - chain_start = ((chain_start + ((TCP_HTABLE_SIZE/2)>>2)) & - ((TCP_HTABLE_SIZE/2) - 1)); + SOCKHASH_UNLOCK_READ_BH(); + chain_start = ((chain_start + ((tcp_ehash_size >> 1)>>2)) & + ((tcp_ehash_size >> 1) - 1)); } /* @@ -439,9 +468,11 @@ void tcp_retransmit_timer(unsigned long data) return; } - if (atomic_read(&sk->sock_readers)) { + bh_lock_sock(sk); + if (sk->lock.users) { /* Try again later */ tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ/20); + bh_unlock_sock(sk); return; } @@ -508,12 +539,51 @@ void tcp_retransmit_timer(unsigned long data) tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); tcp_write_timeout(sk); + + bh_unlock_sock(sk); } /* * Slow timer for SYN-RECV sockets */ +static void tcp_do_syn_queue(struct sock *sk, struct tcp_opt *tp, unsigned long now) +{ + struct open_request *prev, *req; + + prev = (struct open_request *) &tp->syn_wait_queue; + for(req = tp->syn_wait_queue; req; ) { + struct open_request *next = req->dl_next; + + if (! req->sk) { + tcp_synq_unlink(tp, req, prev); + if(req->retrans >= sysctl_tcp_retries1) { + (*req->class->destructor)(req); + tcp_dec_slow_timer(TCP_SLT_SYNACK); + tp->syn_backlog--; + tcp_openreq_free(req); + if (! 
tp->syn_wait_queue) + break; + } else { + unsigned long timeo; + struct open_request *rp; + + (*req->class->rtx_syn_ack)(sk, req); + req->retrans++; + timeo = min((TCP_TIMEOUT_INIT << req->retrans), + (120 * HZ)); + req->expires = now + timeo; + rp = prev->dl_next; + tcp_synq_queue(tp, req); + if(rp != prev->dl_next) + prev = prev->dl_next; + } + } else + prev = req; + req = next; + } +} + /* This now scales very nicely. -DaveM */ static void tcp_syn_recv_timer(unsigned long data) { @@ -521,70 +591,21 @@ static void tcp_syn_recv_timer(unsigned long data) unsigned long now = jiffies; int i; + SOCKHASH_LOCK_READ_BH(); for(i = 0; i < TCP_LHTABLE_SIZE; i++) { sk = tcp_listening_hash[i]; - while(sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; /* TCP_LISTEN is implied. */ - if (!atomic_read(&sk->sock_readers) && tp->syn_wait_queue) { - struct open_request *prev = (struct open_request *)(&tp->syn_wait_queue); - struct open_request *req = tp->syn_wait_queue; - do { - struct open_request *conn; - - conn = req; - req = req->dl_next; - - if (conn->sk) { - prev = conn; - continue; - } - - if ((long)(now - conn->expires) <= 0) - break; - - - tcp_synq_unlink(tp, conn, prev); - if (conn->retrans >= sysctl_tcp_retries1) { -#ifdef TCP_DEBUG - printk(KERN_DEBUG "syn_recv: " - "too many retransmits\n"); -#endif - (*conn->class->destructor)(conn); - tcp_dec_slow_timer(TCP_SLT_SYNACK); - tp->syn_backlog--; - tcp_openreq_free(conn); - - if (!tp->syn_wait_queue) - break; - } else { - unsigned long timeo; - struct open_request *op; - - (*conn->class->rtx_syn_ack)(sk, conn); - - conn->retrans++; -#ifdef TCP_DEBUG - printk(KERN_DEBUG "syn_ack rtx %d\n", - conn->retrans); -#endif - timeo = min((TCP_TIMEOUT_INIT - << conn->retrans), - 120*HZ); - conn->expires = now + timeo; - op = prev->dl_next; - tcp_synq_queue(tp, conn); - if (op != prev->dl_next) - prev = prev->dl_next; - } - /* old prev still valid here */ - } while (req); - } + bh_lock_sock(sk); + if (!sk->lock.users && tp->syn_wait_queue) + tcp_do_syn_queue(sk, tp, now); + bh_unlock_sock(sk); sk = sk->next; } } + SOCKHASH_UNLOCK_READ_BH(); } void tcp_sltimer_handler(unsigned long data) diff --git a/net/ipv4/timer.c b/net/ipv4/timer.c index 3821a7c4c..0487f5bfa 100644 --- a/net/ipv4/timer.c +++ b/net/ipv4/timer.c @@ -5,7 +5,7 @@ * * TIMER - implementation of software timers for IP. * - * Version: $Id: timer.c,v 1.15 1999/02/22 13:54:29 davem Exp $ + * Version: $Id: timer.c,v 1.16 1999/05/27 00:37:39 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -69,13 +69,15 @@ void net_reset_timer (struct sock *t, int timeout, unsigned long len) */ void net_timer (unsigned long data) { - struct sock *sk = (struct sock*)data; + struct sock *sk = (struct sock *) data; int why = sk->timeout; /* Only process if socket is not in use. */ - if (atomic_read(&sk->sock_readers)) { + bh_lock_sock(sk); + if (sk->lock.users) { /* Try again later. */ mod_timer(&sk->timer, jiffies+HZ/20); + bh_unlock_sock(sk); return; } @@ -99,15 +101,15 @@ void net_timer (unsigned long data) printk (KERN_DEBUG "non CLOSE socket in time_done\n"); break; } - destroy_sock (sk); - break; + destroy_sock(sk); + return; case TIME_DESTROY: /* We've waited for a while for all the memory associated with * the socket to be freed. */ destroy_sock(sk); - break; + return; case TIME_CLOSE: /* We've waited long enough, close the socket. 
*/ @@ -123,5 +125,8 @@ void net_timer (unsigned long data) printk ("net_timer: timer expired - reason %d is unknown\n", why); break; } + + /* We only need to unlock if the socket was not destroyed. */ + bh_unlock_sock(sk); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 5fcec9cf3..320e5151e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -5,7 +5,7 @@ * * The User Datagram Protocol (UDP). * - * Version: $Id: udp.c,v 1.66 1999/05/08 20:00:25 davem Exp $ + * Version: $Id: udp.c,v 1.69 1999/06/09 11:15:31 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -128,7 +128,7 @@ static int udp_v4_verify_bind(struct sock *sk, unsigned short snum) struct sock *sk2; int retval = 0, sk_reuse = sk->reuse; - SOCKHASH_LOCK(); + SOCKHASH_LOCK_READ(); for(sk2 = udp_hash[snum & (UDP_HTABLE_SIZE - 1)]; sk2 != NULL; sk2 = sk2->next) { if((sk2->num == snum) && (sk2 != sk)) { unsigned char state = sk2->state; @@ -158,7 +158,7 @@ static int udp_v4_verify_bind(struct sock *sk, unsigned short snum) } } } - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_READ(); return retval; } @@ -173,14 +173,14 @@ static inline int udp_lport_inuse(u16 num) return 0; } -/* Shared by v4/v6 tcp. */ +/* Shared by v4/v6 udp. */ unsigned short udp_good_socknum(void) { int result; static int start = 0; int i, best, best_size_so_far; - SOCKHASH_LOCK(); + SOCKHASH_LOCK_READ(); if (start > sysctl_local_port_range[1] || start < sysctl_local_port_range[0]) start = sysctl_local_port_range[0]; @@ -223,15 +223,10 @@ unsigned short udp_good_socknum(void) } out: start = result; - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_READ(); return result; } -/* Last hit UDP socket cache, this is ipv4 specific so make it static. */ -static u32 uh_cache_saddr, uh_cache_daddr; -static u16 uh_cache_dport, uh_cache_sport; -static struct sock *uh_cache_sk = NULL; - static void udp_v4_hash(struct sock *sk) { struct sock **skp; @@ -240,11 +235,11 @@ static void udp_v4_hash(struct sock *sk) num &= (UDP_HTABLE_SIZE - 1); skp = &udp_hash[num]; - SOCKHASH_LOCK(); + SOCKHASH_LOCK_WRITE(); sk->next = *skp; *skp = sk; sk->hashent = num; - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); } static void udp_v4_unhash(struct sock *sk) @@ -255,7 +250,7 @@ static void udp_v4_unhash(struct sock *sk) num &= (UDP_HTABLE_SIZE - 1); skp = &udp_hash[num]; - SOCKHASH_LOCK(); + SOCKHASH_LOCK_WRITE(); while(*skp != NULL) { if(*skp == sk) { *skp = sk->next; @@ -263,9 +258,7 @@ static void udp_v4_unhash(struct sock *sk) } skp = &((*skp)->next); } - if(uh_cache_sk == sk) - uh_cache_sk = NULL; - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); } static void udp_v4_rehash(struct sock *sk) @@ -277,7 +270,7 @@ static void udp_v4_rehash(struct sock *sk) num &= (UDP_HTABLE_SIZE - 1); skp = &udp_hash[oldnum]; - SOCKHASH_LOCK(); + SOCKHASH_LOCK_WRITE(); while(*skp != NULL) { if(*skp == sk) { *skp = sk->next; @@ -288,13 +281,11 @@ static void udp_v4_rehash(struct sock *sk) sk->next = udp_hash[num]; udp_hash[num] = sk; sk->hashent = num; - if(uh_cache_sk == sk) - uh_cache_sk = NULL; - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); } /* UDP is nearly always wildcards out the wazoo, it makes no sense to try - * harder than this here plus the last hit cache. -DaveM + * harder than this. 
-DaveM */ struct sock *udp_v4_lookup_longway(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) { @@ -341,21 +332,9 @@ __inline__ struct sock *udp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport { struct sock *sk; - if(!dif && uh_cache_sk && - uh_cache_saddr == saddr && - uh_cache_sport == sport && - uh_cache_dport == dport && - uh_cache_daddr == daddr) - return uh_cache_sk; - + SOCKHASH_LOCK_READ(); sk = udp_v4_lookup_longway(saddr, sport, daddr, dport, dif); - if(!dif) { - uh_cache_sk = sk; - uh_cache_saddr = saddr; - uh_cache_daddr = daddr; - uh_cache_sport = sport; - uh_cache_dport = dport; - } + SOCKHASH_UNLOCK_READ(); return sk; } @@ -393,7 +372,7 @@ static struct sock *udp_v4_proxy_lookup(unsigned short num, unsigned long raddr, paddr = idev->ifa_list->ifa_local; } - SOCKHASH_LOCK(); + SOCKHASH_LOCK_READ(); for(s = udp_v4_proxy_loop_init(hnum, hpnum, s, firstpass); s != NULL; s = udp_v4_proxy_loop_next(hnum, hpnum, s, firstpass)) { @@ -431,7 +410,7 @@ static struct sock *udp_v4_proxy_lookup(unsigned short num, unsigned long raddr, } } } - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_READ(); return result; } @@ -784,7 +763,10 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) /* 4.1.3.4. It's configurable by the application via setsockopt() */ /* (MAY) and it defaults to on (MUST). */ - err = ip_build_xmit(sk,sk->no_check ? udp_getfrag_nosum : udp_getfrag, + err = ip_build_xmit(sk, + (sk->no_check == UDP_CSUM_NOXMIT ? + udp_getfrag_nosum : + udp_getfrag), &ufh, ulen, &ipc, rt, msg->msg_flags); out: @@ -979,8 +961,6 @@ int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk->rcv_saddr=INADDR_ANY; sk->daddr=INADDR_ANY; sk->state = TCP_CLOSE; - if(uh_cache_sk == sk) - uh_cache_sk = NULL; return 0; } @@ -1005,9 +985,6 @@ int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk->dport = usin->sin_port; sk->state = TCP_ESTABLISHED; - if(uh_cache_sk == sk) - uh_cache_sk = NULL; - sk->dst_cache = &rt->u.dst; return(0); } @@ -1015,6 +992,8 @@ int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) static void udp_close(struct sock *sk, long timeout) { + bh_lock_sock(sk); + /* See for explanation: raw_close in ipv4/raw.c */ sk->state = TCP_CLOSE; udp_v4_unhash(sk); @@ -1117,6 +1096,33 @@ int udp_chkaddr(struct sk_buff *skb) } #endif +static int udp_checksum_verify(struct sk_buff *skb, struct udphdr *uh, + unsigned short ulen, u32 saddr, u32 daddr, + int full_csum_deferred) +{ + if (!full_csum_deferred) { + if (uh->check) { + if (skb->ip_summed == CHECKSUM_HW && + udp_check(uh, ulen, saddr, daddr, skb->csum)) + return -1; + if (skb->ip_summed == CHECKSUM_NONE && + udp_check(uh, ulen, saddr, daddr, + csum_partial((char *)uh, ulen, 0))) + return -1; + } + } else { + if (uh->check == 0) + skb->ip_summed = CHECKSUM_UNNECESSARY; + else if (skb->ip_summed == CHECKSUM_HW) { + if (udp_check(uh, ulen, saddr, daddr, skb->csum)) + return -1; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); + } + return 0; +} + /* * All we need to do is get the socket, and then do a checksum. 
*/ @@ -1158,25 +1164,18 @@ int udp_rcv(struct sk_buff *skb, unsigned short len) } skb_trim(skb, ulen); -#ifndef CONFIG_UDP_DELAY_CSUM - if (uh->check && - (((skb->ip_summed==CHECKSUM_HW)&&udp_check(uh,ulen,saddr,daddr,skb->csum)) || - ((skb->ip_summed==CHECKSUM_NONE) && - (udp_check(uh,ulen,saddr,daddr, csum_partial((char*)uh, ulen, 0)))))) - goto csum_error; + if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) { + int defer; + +#ifdef CONFIG_UDP_DELAY_CSUM + defer = 1; #else - if (uh->check==0) - skb->ip_summed = CHECKSUM_UNNECESSARY; - else if (skb->ip_summed==CHECKSUM_HW) { - if (udp_check(uh,ulen,saddr,daddr,skb->csum)) - goto csum_error; - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else if (skb->ip_summed != CHECKSUM_UNNECESSARY) - skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); + defer = 0; #endif - - if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) + if (udp_checksum_verify(skb, uh, ulen, saddr, daddr, defer)) + goto csum_error; return udp_v4_mcast_deliver(skb, uh, saddr, daddr); + } #ifdef CONFIG_IP_TRANSPARENT_PROXY if (IPCB(skb)->redirport) @@ -1203,6 +1202,15 @@ int udp_rcv(struct sk_buff *skb, unsigned short len) kfree_skb(skb); return(0); } + if (udp_checksum_verify(skb, uh, ulen, saddr, daddr, +#ifdef CONFIG_UDP_DELAY_CSUM + 1 +#else + (sk->no_check & UDP_CSUM_NORCV) != 0 +#endif + )) + goto csum_error; + udp_deliver(sk, skb); return 0; diff --git a/net/ipv4/utils.c b/net/ipv4/utils.c index ce74ade2a..5992cbc55 100644 --- a/net/ipv4/utils.c +++ b/net/ipv4/utils.c @@ -6,7 +6,7 @@ * Various kernel-resident INET utility functions; mainly * for format conversion and debugging output. * - * Version: $Id: utils.c,v 1.6 1997/12/13 21:53:03 kuznet Exp $ + * Version: $Id: utils.c,v 1.7 1999/06/09 10:11:05 davem Exp $ * * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * @@ -57,6 +57,11 @@ char *in_ntoa(__u32 in) return(buff); } +char *in_ntoa2(__u32 in, char *buff) +{ + sprintf(buff, "%d.%d.%d.%d", NIPQUAD(in)); + return buff; +} /* * Convert an ASCII string to binary IP. |
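
Editorial note on the TIME_WAIT changes above: tcp_tw_schedule() inserts the bucket one slot behind the current death-row position, relying on TCP_TWKILL_SLOTS being a power of two so that the bitwise AND wraps the index back into range and the bucket survives a full sweep of the wheel. A small stand-alone illustration of that wrap-around; the slot count below is chosen only for the example and is not taken from the kernel headers.

#include <stdio.h>

#define TWKILL_SLOTS 8	/* must be a power of two, as in the kernel */

int main(void)
{
	int current_slot;

	for (current_slot = 0; current_slot < TWKILL_SLOTS; current_slot++) {
		/* Same arithmetic as tcp_tw_schedule(): insert just behind
		 * the slot the collector will process next, so the new
		 * bucket waits a full rotation before being reaped.
		 */
		int insert_slot = (current_slot - 1) & (TWKILL_SLOTS - 1);

		printf("collector at %d -> schedule into %d\n",
		       current_slot, insert_slot);
	}
	return 0;
}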
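Both tcp_retransmit_timer() and net_timer() in the hunks above move from testing sk->sock_readers to the bh_lock_sock()/sk->lock.users pair: the BH-context handler grabs the socket spinlock, and if a process context currently owns the socket it simply re-arms itself a little later instead of touching socket state. Below is a minimal sketch of that pattern; the 50 ms retry interval and the lock calls are taken from the diff, while example_sock_timer() and do_timer_work() are hypothetical names, and the code only builds inside a 2.3-era kernel tree. Note that, as the timer.c hunk points out, a work path that destroys the socket must return without unlocking it.

static void example_sock_timer(unsigned long data)
{
	struct sock *sk = (struct sock *) data;

	bh_lock_sock(sk);
	if (sk->lock.users) {
		/* A process context owns the socket; retry shortly. */
		mod_timer(&sk->timer, jiffies + HZ/20);
		bh_unlock_sock(sk);
		return;
	}

	/* Safe to modify socket state here: we hold the BH lock and
	 * no user context is inside the socket.
	 */
	do_timer_work(sk);	/* hypothetical worker */

	bh_unlock_sock(sk);
}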
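The new tcp_do_syn_queue() helper retransmits the SYN-ACK with exponential backoff: the initial timeout is shifted left once per retransmission and capped at 120 seconds. A stand-alone arithmetic check of that schedule; HZ and the initial timeout are given example values here, the real constants live in the kernel headers.

#include <stdio.h>

int main(void)
{
	const unsigned long hz = 100;			/* example tick rate */
	const unsigned long timeout_init = 3 * hz;	/* example initial RTO */
	const unsigned long cap = 120 * hz;
	unsigned int retrans;

	for (retrans = 1; retrans <= 7; retrans++) {
		unsigned long timeo = timeout_init << retrans;

		if (timeo > cap)	/* same min(..., 120*HZ) cap as the patch */
			timeo = cap;
		printf("retrans %u -> wait %lu ticks (%lu s)\n",
		       retrans, timeo, timeo / hz);
	}
	return 0;
}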
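The udp.c hunks split the old SOCKHASH_LOCK() into read and write variants: lookups (udp_v4_verify_bind, udp_good_socknum, udp_v4_lookup, the proxy lookup) only need the read side, while udp_v4_hash/unhash/rehash, which rewrite the chain pointers, take the write side. The skeleton below restates that division of labour. It mirrors the insert and traversal code visible in the diff, assumes the same singly linked sk->next chains, and the example_* function names are hypothetical; as in the patched udp_v4_lookup, the returned socket is only safe to use because the callers run in BH context.

/* Insertion: chain pointers change, so take the write lock. */
static void example_hash_insert(struct sock *sk, struct sock **chain)
{
	SOCKHASH_LOCK_WRITE();
	sk->next = *chain;
	*chain = sk;
	SOCKHASH_UNLOCK_WRITE();
}

/* Lookup: the chain is only traversed, so the read lock suffices and
 * several lookups may proceed in parallel.
 */
static struct sock *example_hash_find(struct sock **chain, unsigned short num)
{
	struct sock *sk;

	SOCKHASH_LOCK_READ();
	for (sk = *chain; sk != NULL; sk = sk->next)
		if (sk->num == num)
			break;
	SOCKHASH_UNLOCK_READ();
	return sk;
}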
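The new udp_checksum_verify() helper centralises two cases: when checksumming is not deferred the datagram is verified immediately (a zero uh->check means the sender computed none), and in the deferred case only hardware-computed sums are checked up front while a pseudo-header seed is left in skb->csum for verification at copy time. For background, the sketch below shows how the UDP checksum over the IPv4 pseudo-header is formed in ordinary user-space C. It is an illustration of the RFC 768/1071 folding rules, not kernel code, and the function names are made up; the UDP buffer is assumed to have its checksum field zeroed before computing.

#include <stdint.h>
#include <stddef.h>

/* One's-complement sum over a buffer, RFC 1071 style. */
static uint32_t csum_add(uint32_t sum, const uint8_t *data, size_t len)
{
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)data[i] << 8 | data[i + 1];
	if (len & 1)	/* odd trailing byte, padded with zero */
		sum += (uint32_t)data[len - 1] << 8;
	return sum;
}

/* UDP checksum: pseudo-header (addresses, protocol, length) plus the
 * UDP header and payload.  A computed value of 0 is transmitted as
 * 0xFFFF, because an on-the-wire 0 means "no checksum was computed",
 * which is exactly the uh->check == 0 case the helper special-cases.
 */
static uint16_t udp_csum(uint32_t saddr, uint32_t daddr,
			 const uint8_t *udp, size_t len)
{
	uint32_t sum = 0;

	sum += (saddr >> 16) + (saddr & 0xFFFF);
	sum += (daddr >> 16) + (daddr & 0xFFFF);
	sum += 17;			/* IPPROTO_UDP */
	sum += (uint32_t)len;
	sum = csum_add(sum, udp, len);

	while (sum >> 16)		/* fold carries into 16 bits */
		sum = (sum & 0xFFFF) + (sum >> 16);

	sum = ~sum & 0xFFFF;
	return sum ? (uint16_t)sum : 0xFFFF;
}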
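Finally, utils.c gains in_ntoa2(), which differs from in_ntoa() only in writing into a caller-supplied buffer rather than a single static one, so concurrent callers can no longer clobber each other's result. A user-space equivalent for illustration, with the NIPQUAD byte extraction written out by hand since the kernel macro is not available here; note that reading the address bytes straight from memory makes the example output depend on host byte order, just as NIPQUAD expects the address in network order.

#include <stdio.h>
#include <stdint.h>

/* Caller provides the buffer; 16 bytes holds "255.255.255.255\0". */
static char *in_ntoa2(uint32_t in, char *buff)
{
	const unsigned char *p = (const unsigned char *)&in;

	sprintf(buff, "%d.%d.%d.%d", p[0], p[1], p[2], p[3]);
	return buff;
}

int main(void)
{
	char a[16], b[16];
	uint32_t one = 0x0100007f;	/* 127.0.0.1 on a little-endian host */
	uint32_t two = 0x08080808;	/* 8.8.8.8 */

	/* Unlike the static-buffer in_ntoa(), both results stay valid. */
	printf("%s %s\n", in_ntoa2(one, a), in_ntoa2(two, b));
	return 0;
}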