Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/af_inet.c         |  57
-rw-r--r-- | net/ipv4/arp.c             |  84
-rw-r--r-- | net/ipv4/devinet.c         |  34
-rw-r--r-- | net/ipv4/fib_frontend.c    |   4
-rw-r--r-- | net/ipv4/fib_hash.c        |  62
-rw-r--r-- | net/ipv4/fib_rules.c       |  40
-rw-r--r-- | net/ipv4/icmp.c            |  23
-rw-r--r-- | net/ipv4/igmp.c            | 102
-rw-r--r-- | net/ipv4/ip_fragment.c     |  27
-rw-r--r-- | net/ipv4/ip_input.c        | 260
-rw-r--r-- | net/ipv4/ip_masq_mfw.c     |   4
-rw-r--r-- | net/ipv4/ip_masq_quake.c   |   4
-rw-r--r-- | net/ipv4/ip_masq_vdolive.c |   6
-rw-r--r-- | net/ipv4/ip_options.c      |   3
-rw-r--r-- | net/ipv4/ipconfig.c        |   8
-rw-r--r-- | net/ipv4/ipmr.c            |  11
-rw-r--r-- | net/ipv4/proc.c            |  15
-rw-r--r-- | net/ipv4/raw.c             |  94
-rw-r--r-- | net/ipv4/route.c           |  97
-rw-r--r-- | net/ipv4/tcp.c             | 121
-rw-r--r-- | net/ipv4/tcp_input.c       | 155
-rw-r--r-- | net/ipv4/tcp_ipv4.c        | 220
-rw-r--r-- | net/ipv4/tcp_output.c      |  11
-rw-r--r-- | net/ipv4/tcp_timer.c       | 165
-rw-r--r-- | net/ipv4/timer.c           |  17
-rw-r--r-- | net/ipv4/udp.c             | 128
-rw-r--r-- | net/ipv4/utils.c           |   7
27 files changed, 1093 insertions, 666 deletions
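
The common thread through the diffs below is locking: global start_bh_atomic()/end_bh_atomic() critical sections are replaced with per-structure protection (the neighbour table's read/write lock in arp.c, fib_hash_lock, fib_rules_lock, ip_mc_lock, ipfrag_lock, rt_hash_lock, and read-side uses of dev_base_lock). The following is a rough userspace sketch of that pattern, not kernel code: pthread_rwlock_t stands in for the kernel's rwlock_t, and node, table_lock, lookup and insert are illustrative names that do not appear in the patch.

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct node {
            unsigned int key;
            struct node *next;
    };

    #define HASHSZ 16
    static struct node *hash_table[HASHSZ];
    static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;

    /* Readers (the common case, e.g. lookups from packet processing)
     * take the lock shared, so they no longer exclude one another the
     * way a single global BH-atomic section did.
     */
    static int lookup(unsigned int key)
    {
            struct node *n;
            int found = 0;

            pthread_rwlock_rdlock(&table_lock);
            for (n = hash_table[key % HASHSZ]; n; n = n->next) {
                    if (n->key == key) {
                            found = 1;
                            break;
                    }
            }
            pthread_rwlock_unlock(&table_lock);
            return found;
    }

    /* Writers (insert/unlink, the rare case) take the lock exclusively,
     * which is all the old start_bh_atomic()/end_bh_atomic() pair really
     * guaranteed, but now scoped to this one table.
     */
    static void insert(struct node *n)
    {
            struct node **head = &hash_table[n->key % HASHSZ];

            pthread_rwlock_wrlock(&table_lock);
            n->next = *head;
            *head = n;
            pthread_rwlock_unlock(&table_lock);
    }

    int main(void)
    {
            struct node *n = malloc(sizeof(*n));

            n->key = 42;
            insert(n);
            printf("lookup(42) -> %d\n", lookup(42));
            free(n);
            return 0;
    }

The design point the patch keeps making is that lookups dominate, so a read/write lock lets the hot read paths run concurrently while insert, delete and rehash operations pay the exclusive-lock cost.
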
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 70fcf4024..ca0f27d0c 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -5,7 +5,7 @@ * * PF_INET protocol family socket handler. * - * Version: $Id: af_inet.c,v 1.87 1999/04/22 10:07:33 davem Exp $ + * Version: $Id: af_inet.c,v 1.91 1999/06/09 08:28:55 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -147,22 +147,17 @@ static __inline__ void kill_sk_queues(struct sock *sk) struct sk_buff *skb; /* First the read buffer. */ - while((skb = skb_dequeue(&sk->receive_queue)) != NULL) { - /* This will take care of closing sockets that were - * listening and didn't accept everything. - */ - if (skb->sk != NULL && skb->sk != sk) - skb->sk->prot->close(skb->sk, 0); + while((skb = skb_dequeue(&sk->receive_queue)) != NULL) kfree_skb(skb); - } /* Next, the error queue. */ while((skb = skb_dequeue(&sk->error_queue)) != NULL) kfree_skb(skb); - /* Now the backlog. */ - while((skb=skb_dequeue(&sk->back_log)) != NULL) - kfree_skb(skb); + /* It is _impossible_ for the backlog to contain anything + * when we get here. All user references to this socket + * have gone away, only the net layer knows can touch it. + */ } static __inline__ void kill_sk_now(struct sock *sk) @@ -195,14 +190,19 @@ static __inline__ void kill_sk_later(struct sock *sk) sk->destroy = 1; sk->ack_backlog = 0; - release_sock(sk); + bh_unlock_sock(sk); net_reset_timer(sk, TIME_DESTROY, SOCK_DESTROY_TIME); } +/* Callers must hold the BH spinlock. + * + * At this point, there should be no process reference to this + * socket, and thus no user references at all. Therefore we + * can assume the socket waitqueue is inactive and nobody will + * try to jump onto it. + */ void destroy_sock(struct sock *sk) { - lock_sock(sk); /* just to be safe. */ - /* Now we can no longer get new packets or once the * timers are killed, send them. */ @@ -213,12 +213,6 @@ void destroy_sock(struct sock *sk) kill_sk_queues(sk); - /* Now if it has a half accepted/ closed socket. */ - if (sk->pair) { - sk->pair->prot->close(sk->pair, 0); - sk->pair = NULL; - } - /* Now if everything is gone we can free the socket * structure, otherwise we need to keep it around until * everything is gone. @@ -284,6 +278,14 @@ static int inet_autobind(struct sock *sk) return 0; } +/* Listening INET sockets never sleep to wait for memory, so + * it is completely silly to wake them up on queue space + * available events. So we hook them up to this dummy callback. + */ +static void inet_listen_write_space(struct sock *sk) +{ +} + /* * Move a socket into listening state. 
*/ @@ -310,6 +312,7 @@ int inet_listen(struct socket *sock, int backlog) dst_release(xchg(&sk->dst_cache, NULL)); sk->prot->rehash(sk); add_to_prot_sklist(sk); + sk->write_space = inet_listen_write_space; } sk->socket->flags |= SO_ACCEPTCON; return(0); @@ -368,7 +371,7 @@ static int inet_create(struct socket *sock, int protocol) if (protocol && protocol != IPPROTO_UDP) goto free_and_noproto; protocol = IPPROTO_UDP; - sk->no_check = UDP_NO_CHECK; + sk->no_check = UDP_CSUM_DEFAULT; sk->ip_pmtudisc = IP_PMTUDISC_DONT; prot=&udp_prot; sock->ops = &inet_dgram_ops; @@ -578,7 +581,7 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, static void inet_wait_for_connect(struct sock *sk) { - struct wait_queue wait = { current, NULL }; + DECLARE_WAITQUEUE(wait, current); add_wait_queue(sk->sleep, &wait); current->state = TASK_INTERRUPTIBLE; @@ -684,14 +687,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags) if (sk1->prot->accept == NULL) goto do_err; - /* Restore the state if we have been interrupted, and then returned. */ - if (sk1->pair != NULL) { - sk2 = sk1->pair; - sk1->pair = NULL; - } else { - if((sk2 = sk1->prot->accept(sk1,flags)) == NULL) - goto do_sk1_err; - } + if((sk2 = sk1->prot->accept(sk1,flags)) == NULL) + goto do_sk1_err; /* * We've been passed an extra socket. diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 2c311f233..a3ca88701 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1,6 +1,6 @@ /* linux/net/inet/arp.c * - * Version: $Id: arp.c,v 1.77 1999/03/21 05:22:30 davem Exp $ + * Version: $Id: arp.c,v 1.78 1999/06/09 10:10:36 davem Exp $ * * Copyright (C) 1994 by Florian La Roche * @@ -119,6 +119,11 @@ #include <asm/system.h> #include <asm/uaccess.h> +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) +static char *ax2asc2(ax25_address *a, char *buf); +#endif + + /* * Interface to generic neighbour cache. */ @@ -304,7 +309,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) u8 *dst_ha = NULL; struct device *dev = neigh->dev; u32 target = *(u32*)neigh->primary_key; - int probes = neigh->probes; + int probes = atomic_read(&neigh->probes); if (skb && inet_addr_type(skb->nh.iph->saddr) == RTN_LOCAL) saddr = skb->nh.iph->saddr; @@ -315,6 +320,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) if (!(neigh->nud_state&NUD_VALID)) printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n"); dst_ha = neigh->ha; + read_lock_bh(&neigh->lock); } else if ((probes -= neigh->parms->app_probes) < 0) { #ifdef CONFIG_ARPD neigh_app_ns(neigh); @@ -324,6 +330,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, dst_ha, dev->dev_addr, NULL); + if (dst_ha) + read_unlock_bh(&neigh->lock); } /* OBSOLETE FUNCTIONS */ @@ -372,29 +380,25 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) if (arp_set_predefined(inet_addr_type(paddr), haddr, paddr, dev)) return 0; - start_bh_atomic(); n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); if (n) { n->used = jiffies; if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) { - memcpy(haddr, n->ha, dev->addr_len); + read_lock_bh(&n->lock); + memcpy(haddr, n->ha, dev->addr_len); + read_unlock_bh(&n->lock); neigh_release(n); - end_bh_atomic(); return 0; } + neigh_release(n); } else kfree_skb(skb); - neigh_release(n); - end_bh_atomic(); return 1; } /* END OF OBSOLETE FUNCTIONS */ -/* - * Note: requires bh_atomic locking. 
- */ int arp_bind_neighbour(struct dst_entry *dst) { struct device *dev = dst->dev; @@ -672,7 +676,8 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) (addr_type == RTN_UNICAST && rt->u.dst.dev != dev && (IN_DEV_PROXY_ARP(in_dev) || pneigh_lookup(&arp_tbl, &tip, dev, 0)))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); - neigh_release(n); + if (n) + neigh_release(n); if (skb->stamp.tv_sec == 0 || skb->pkt_type == PACKET_HOST || @@ -785,7 +790,6 @@ int arp_req_set(struct arpreq *r, struct device * dev) return -EINVAL; err = -ENOBUFS; - start_bh_atomic(); neigh = __neigh_lookup(&arp_tbl, &ip, dev, 1); if (neigh) { unsigned state = NUD_STALE; @@ -795,7 +799,6 @@ int arp_req_set(struct arpreq *r, struct device * dev) r->arp_ha.sa_data : NULL, state, 1, 0); neigh_release(neigh); } - end_bh_atomic(); return err; } @@ -819,17 +822,17 @@ static int arp_req_get(struct arpreq *r, struct device *dev) struct neighbour *neigh; int err = -ENXIO; - start_bh_atomic(); - neigh = __neigh_lookup(&arp_tbl, &ip, dev, 0); + neigh = neigh_lookup(&arp_tbl, &ip, dev); if (neigh) { + read_lock_bh(&neigh->lock); memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len); + r->arp_flags = arp_state_to_flags(neigh); + read_unlock_bh(&neigh->lock); r->arp_ha.sa_family = dev->type; strncpy(r->arp_dev, dev->name, sizeof(r->arp_dev)); - r->arp_flags = arp_state_to_flags(neigh); neigh_release(neigh); err = 0; } - end_bh_atomic(); return err; } @@ -867,14 +870,12 @@ int arp_req_delete(struct arpreq *r, struct device * dev) return -EINVAL; } err = -ENXIO; - start_bh_atomic(); - neigh = __neigh_lookup(&arp_tbl, &ip, dev, 0); + neigh = neigh_lookup(&arp_tbl, &ip, dev); if (neigh) { if (neigh->nud_state&~NUD_NOARP) err = neigh_update(neigh, NULL, NUD_FAILED, 1, 0); neigh_release(neigh); } - end_bh_atomic(); return err; } @@ -961,16 +962,16 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy char hbuffer[HBUFFERLEN]; int i,j,k; const char hexbuf[] = "0123456789ABCDEF"; + char abuf[16]; size = sprintf(buffer,"IP address HW type Flags HW address Mask Device\n"); pos+=size; len+=size; - neigh_table_lock(&arp_tbl); - - for(i=0; i<=NEIGH_HASHMASK; i++) { + for(i=0; i<=NEIGH_HASHMASK; i++) { struct neighbour *n; + read_lock_bh(&arp_tbl.lock); for (n=arp_tbl.hash_buckets[i]; n; n=n->next) { struct device *dev = n->dev; int hatype = dev->type; @@ -979,17 +980,14 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy if (!(n->nud_state&~NUD_NOARP)) continue; - /* I'd get great pleasure deleting - this ugly code. Let's output it in hexadecimal format. - "arp" utility will eventually repaired --ANK - */ -#if 1 /* UGLY CODE */ + read_lock(&n->lock); + /* * Convert hardware address to XX:XX:XX:XX ... form. 
*/ #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM) - strcpy(hbuffer,ax2asc((ax25_address *)n->ha)); + ax2asc2((ax25_address *)n->ha, hbuffer); else { #endif for (k=0,j=0;k<HBUFFERLEN-3 && j<dev->addr_len;j++) { @@ -998,37 +996,33 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy hbuffer[k++]=':'; } hbuffer[--k]=0; - + #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) } #endif -#else - if ((neigh->nud_state&NUD_VALID) && dev->addr_len) { - int j; - for (j=0; j < dev->addr_len; j++) - sprintf(hbuffer+2*j, "%02x", neigh->ha[j]); - } else - sprintf(hbuffer, "0"); -#endif size = sprintf(buffer+len, "%-17s0x%-10x0x%-10x%s", - in_ntoa(*(u32*)n->primary_key), + in_ntoa2(*(u32*)n->primary_key, abuf), hatype, arp_state_to_flags(n), hbuffer); size += sprintf(buffer+len+size, " %-17s %s\n", "*", dev->name); + read_unlock(&n->lock); len += size; pos += size; if (pos <= offset) len=0; - if (pos >= offset+length) - goto done; + if (pos >= offset+length) { + read_unlock_bh(&arp_tbl.lock); + goto done; + } } + read_unlock_bh(&arp_tbl.lock); } for (i=0; i<=PNEIGH_HASHMASK; i++) { @@ -1039,7 +1033,7 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy size = sprintf(buffer+len, "%-17s0x%-10x0x%-10x%s", - in_ntoa(*(u32*)n->key), + in_ntoa2(*(u32*)n->key, abuf), hatype, ATF_PUBL|ATF_PERM, "00:00:00:00:00:00"); @@ -1058,7 +1052,6 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy } done: - neigh_table_unlock(&arp_tbl); *start = buffer+len-(pos-offset); /* Start of wanted data */ len = pos-offset; /* Start slop */ @@ -1117,14 +1110,13 @@ __initfunc(void arp_init (void)) } -#ifdef CONFIG_AX25_MODULE +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) /* * ax25 -> ASCII conversion */ -char *ax2asc(ax25_address *a) +char *ax2asc2(ax25_address *a, char *buf) { - static char buf[11]; char c, *s; int n; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index c8b0fbbc8..ff2c930d1 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1,7 +1,7 @@ /* * NET3 IP device support routines. 
* - * Version: $Id: devinet.c,v 1.28 1999/05/08 20:00:16 davem Exp $ + * Version: $Id: devinet.c,v 1.32 1999/06/09 11:15:33 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -607,41 +607,39 @@ inet_gifconf(struct device *dev, char *buf, int len) { struct in_device *in_dev = dev->ip_ptr; struct in_ifaddr *ifa; - struct ifreq ifr; + struct ifreq *ifr = (struct ifreq *) buf; int done=0; if (in_dev==NULL || (ifa=in_dev->ifa_list)==NULL) return 0; for ( ; ifa; ifa = ifa->ifa_next) { - if (!buf) { + if (!ifr) { done += sizeof(ifr); continue; } if (len < (int) sizeof(ifr)) return done; - memset(&ifr, 0, sizeof(struct ifreq)); + memset(ifr, 0, sizeof(struct ifreq)); if (ifa->ifa_label) - strcpy(ifr.ifr_name, ifa->ifa_label); + strcpy(ifr->ifr_name, ifa->ifa_label); else - strcpy(ifr.ifr_name, dev->name); + strcpy(ifr->ifr_name, dev->name); - (*(struct sockaddr_in *) &ifr.ifr_addr).sin_family = AF_INET; - (*(struct sockaddr_in *) &ifr.ifr_addr).sin_addr.s_addr = ifa->ifa_local; + (*(struct sockaddr_in *) &ifr->ifr_addr).sin_family = AF_INET; + (*(struct sockaddr_in *) &ifr->ifr_addr).sin_addr.s_addr = ifa->ifa_local; - if (copy_to_user(buf, &ifr, sizeof(struct ifreq))) - return -EFAULT; - buf += sizeof(struct ifreq); + ifr++; len -= sizeof(struct ifreq); done += sizeof(struct ifreq); } return done; } -u32 inet_select_addr(struct device *dev, u32 dst, int scope) +u32 inet_select_addr(const struct device *dev, u32 dst, int scope) { u32 addr = 0; - struct in_device *in_dev = dev->ip_ptr; + const struct in_device *in_dev = dev->ip_ptr; if (in_dev == NULL) return 0; @@ -661,15 +659,19 @@ u32 inet_select_addr(struct device *dev, u32 dst, int scope) in this case. It is importnat that lo is the first interface in dev_base list. */ + read_lock(&dev_base_lock); for (dev=dev_base; dev; dev=dev->next) { if ((in_dev=dev->ip_ptr) == NULL) continue; for_primary_ifa(in_dev) { - if (ifa->ifa_scope <= scope) + if (ifa->ifa_scope <= scope) { + read_unlock(&dev_base_lock); return ifa->ifa_local; + } } endfor_ifa(in_dev); } + read_unlock(&dev_base_lock); return 0; } @@ -790,6 +792,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) s_idx = cb->args[0]; s_ip_idx = ip_idx = cb->args[1]; + read_lock(&dev_base_lock); for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { if (idx < s_idx) continue; @@ -807,6 +810,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) } } done: + read_unlock(&dev_base_lock); cb->args[0] = idx; cb->args[1] = ip_idx; @@ -881,11 +885,13 @@ void inet_forward_change() ipv4_devconf.accept_redirects = !on; ipv4_devconf_dflt.forwarding = on; + read_lock(&dev_base_lock); for (dev = dev_base; dev; dev = dev->next) { struct in_device *in_dev = dev->ip_ptr; if (in_dev) in_dev->cnf.forwarding = on; } + read_unlock(&dev_base_lock); rt_cache_flush(0); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index a17470483..d57d4daa9 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: FIB frontend. 
* - * Version: $Id: fib_frontend.c,v 1.15 1999/03/21 05:22:31 davem Exp $ + * Version: $Id: fib_frontend.c,v 1.16 1999/06/09 10:10:42 davem Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * @@ -123,13 +123,11 @@ fib_get_procinfo(char *buffer, char **start, off_t offset, int length, int dummy first = 0; } - /* rtnl_shlock(); -- it is pointless at the moment --ANK */ if (main_table && count > 0) { int n = main_table->tb_get_info(main_table, ptr, first, count); count -= n; ptr += n*128; } - /* rtnl_shunlock(); */ len = ptr - *start; if (len >= length) return length; diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index d9e029cef..0472f6118 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -5,7 +5,7 @@ * * IPv4 FIB: lookup engine and maintenance routines. * - * Version: $Id: fib_hash.c,v 1.8 1999/03/25 10:04:17 davem Exp $ + * Version: $Id: fib_hash.c,v 1.10 1999/06/09 10:10:45 davem Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * @@ -145,13 +145,16 @@ extern __inline__ int fn_key_leq(fn_key_t a, fn_key_t b) return a.datum <= b.datum; } +static rwlock_t fib_hash_lock = RW_LOCK_UNLOCKED; + #define FZ_MAX_DIVISOR 1024 #ifdef CONFIG_IP_ROUTE_LARGE_TABLES +/* The fib hash lock must be held when this is called. */ static __inline__ void fn_rebuild_zone(struct fn_zone *fz, - struct fib_node **old_ht, - int old_divisor) + struct fib_node **old_ht, + int old_divisor) { int i; struct fib_node *f, **fp, *next; @@ -198,13 +201,13 @@ static void fn_rehash_zone(struct fn_zone *fz) if (ht) { memset(ht, 0, new_divisor*sizeof(struct fib_node*)); - start_bh_atomic(); + write_lock_bh(&fib_hash_lock); old_ht = fz->fz_hash; fz->fz_hash = ht; fz->fz_hashmask = new_hashmask; fz->fz_divisor = new_divisor; fn_rebuild_zone(fz, old_ht, old_divisor); - end_bh_atomic(); + write_unlock_bh(&fib_hash_lock); kfree(old_ht); } } @@ -246,6 +249,7 @@ fn_new_zone(struct fn_hash *table, int z) for (i=z+1; i<=32; i++) if (table->fn_zones[i]) break; + write_lock_bh(&fib_hash_lock); if (i>32) { /* No more specific masks, we are the first. 
*/ fz->fz_next = table->fn_zone_list; @@ -255,6 +259,7 @@ fn_new_zone(struct fn_hash *table, int z) table->fn_zones[i]->fz_next = fz; } table->fn_zones[z] = fz; + write_unlock_bh(&fib_hash_lock); return fz; } @@ -265,6 +270,7 @@ fn_hash_lookup(struct fib_table *tb, const struct rt_key *key, struct fib_result struct fn_zone *fz; struct fn_hash *t = (struct fn_hash*)tb->tb_data; + read_lock(&fib_hash_lock); for (fz = t->fn_zone_list; fz; fz = fz->fz_next) { struct fib_node *f; fn_key_t k = fz_key(key->dst, fz); @@ -293,13 +299,16 @@ fn_hash_lookup(struct fib_table *tb, const struct rt_key *key, struct fib_result res->scope = f->fn_scope; res->prefixlen = fz->fz_order; res->prefix = &fz_prefix(f->fn_key, fz); - return 0; + goto out; } if (err < 0) - return err; + goto out; } } - return 1; + err = 1; +out: + read_unlock(&fib_hash_lock); + return err; } static int fn_hash_last_dflt=-1; @@ -344,6 +353,7 @@ fn_hash_select_default(struct fib_table *tb, const struct rt_key *key, struct fi last_resort = NULL; order = -1; + read_lock(&fib_hash_lock); for (f = fz->fz_hash[0]; f; f = f->fn_next) { struct fib_info *next_fi = FIB_INFO(f); @@ -364,7 +374,7 @@ fn_hash_select_default(struct fib_table *tb, const struct rt_key *key, struct fi } else if (!fib_detect_death(fi, order, &last_resort, &last_idx)) { res->fi = fi; fn_hash_last_dflt = order; - return; + goto out; } fi = next_fi; order++; @@ -372,18 +382,20 @@ fn_hash_select_default(struct fib_table *tb, const struct rt_key *key, struct fi if (order<=0 || fi==NULL) { fn_hash_last_dflt = -1; - return; + goto out; } if (!fib_detect_death(fi, order, &last_resort, &last_idx)) { res->fi = fi; fn_hash_last_dflt = order; - return; + goto out; } if (last_idx >= 0) res->fi = last_resort; fn_hash_last_dflt = last_idx; +out: + read_unlock(&fib_hash_lock); } #define FIB_SCAN(f, fp) \ @@ -457,6 +469,7 @@ rta->rta_prefsrc ? *(u32*)rta->rta_prefsrc : 0); fp = fz_chain_p(key, fz); + /* * Scan list to find the first route with the same destination */ @@ -560,14 +573,17 @@ replace: */ new_f->fn_next = f; + write_lock_bh(&fib_hash_lock); *fp = new_f; + write_unlock_bh(&fib_hash_lock); fz->fz_nent++; if (del_fp) { f = *del_fp; /* Unlink replaced node */ + write_lock_bh(&fib_hash_lock); *del_fp = f->fn_next; - synchronize_bh(); + write_unlock_bh(&fib_hash_lock); if (!(f->fn_state&FN_S_ZOMBIE)) rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req); @@ -619,11 +635,13 @@ FTprint("tb(%d)_delete: %d %08x/%d %d\n", tb->tb_id, r->rtm_type, rta->rta_dst ? fp = fz_chain_p(key, fz); + FIB_SCAN(f, fp) { if (fn_key_eq(f->fn_key, key)) break; - if (fn_key_leq(key, f->fn_key)) + if (fn_key_leq(key, f->fn_key)) { return -ESRCH; + } } #ifdef CONFIG_IP_ROUTE_TOS FIB_SCAN_KEY(f, fp, key) { @@ -637,9 +655,9 @@ FTprint("tb(%d)_delete: %d %08x/%d %d\n", tb->tb_id, r->rtm_type, rta->rta_dst ? FIB_SCAN_TOS(f, fp, key, tos) { struct fib_info * fi = FIB_INFO(f); - if (f->fn_state&FN_S_ZOMBIE) + if (f->fn_state&FN_S_ZOMBIE) { return -ESRCH; - + } matched++; if (del_fp == NULL && @@ -655,8 +673,9 @@ FTprint("tb(%d)_delete: %d %08x/%d %d\n", tb->tb_id, r->rtm_type, rta->rta_dst ? 
rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req); if (matched != 1) { + write_lock_bh(&fib_hash_lock); *del_fp = f->fn_next; - synchronize_bh(); + write_unlock_bh(&fib_hash_lock); if (f->fn_state&FN_S_ACCESSED) rt_cache_flush(-1); @@ -687,8 +706,9 @@ fn_flush_list(struct fib_node ** fp, int z, struct fn_hash *table) struct fib_info *fi = FIB_INFO(f); if (fi && ((f->fn_state&FN_S_ZOMBIE) || (fi->fib_flags&RTNH_F_DEAD))) { + write_lock_bh(&fib_hash_lock); *fp = f->fn_next; - synchronize_bh(); + write_unlock_bh(&fib_hash_lock); fn_free_node(f); found++; @@ -727,6 +747,7 @@ static int fn_hash_get_info(struct fib_table *tb, char *buffer, int first, int c int pos = 0; int n = 0; + read_lock(&fib_hash_lock); for (fz=table->fn_zone_list; fz; fz = fz->fz_next) { int i; struct fib_node *f; @@ -752,10 +773,12 @@ static int fn_hash_get_info(struct fib_table *tb, char *buffer, int first, int c FZ_MASK(fz), buffer); buffer += 128; if (++n >= count) - return n; + goto out; } } } +out: + read_unlock(&fib_hash_lock); return n; } #endif @@ -818,15 +841,18 @@ static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin struct fn_hash *table = (struct fn_hash*)tb->tb_data; s_m = cb->args[1]; + read_lock(&fib_hash_lock); for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) { if (m < s_m) continue; if (m > s_m) memset(&cb->args[2], 0, sizeof(cb->args) - 2*sizeof(cb->args[0])); if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) { cb->args[1] = m; + read_unlock(&fib_hash_lock); return -1; } } + read_unlock(&fib_hash_lock); cb->args[1] = m; return skb->len; } diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 868c44c31..97074198e 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: policy rules. 
* - * Version: $Id: fib_rules.c,v 1.9 1999/03/25 10:04:23 davem Exp $ + * Version: $Id: fib_rules.c,v 1.11 1999/06/09 10:10:47 davem Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * @@ -79,12 +79,14 @@ static struct fib_rule main_rule = { &default_rule, 0x7FFE, RT_TABLE_MAIN, RTN_U static struct fib_rule local_rule = { &main_rule, 0, RT_TABLE_LOCAL, RTN_UNICAST, }; static struct fib_rule *fib_rules = &local_rule; +static rwlock_t fib_rules_lock = RW_LOCK_UNLOCKED; int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { struct rtattr **rta = arg; struct rtmsg *rtm = NLMSG_DATA(nlh); struct fib_rule *r, **rp; + int err = -ESRCH; for (rp=&fib_rules; (r=*rp) != NULL; rp=&r->r_next) { if ((!rta[RTA_SRC-1] || memcmp(RTA_DATA(rta[RTA_SRC-1]), &r->r_src, 4) == 0) && @@ -99,18 +101,20 @@ int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) (!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) && (!rta[RTA_IIF-1] || strcmp(RTA_DATA(rta[RTA_IIF-1]), r->r_ifname) == 0) && (!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) { + err = -EPERM; if (r == &local_rule) - return -EPERM; + break; + write_lock_bh(&fib_rules_lock); *rp = r->r_next; - synchronize_bh(); - + write_unlock_bh(&fib_rules_lock); if (r != &default_rule && r != &main_rule) kfree(r); - return 0; + err = 0; + break; } } - return -ESRCH; + return err; } /* Allocate new unique table id */ @@ -205,7 +209,9 @@ int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) } new_r->r_next = r; + write_lock_bh(&fib_rules_lock); *rp = new_r; + write_unlock_bh(&fib_rules_lock); return 0; } @@ -250,8 +256,11 @@ static void fib_rules_detach(struct device *dev) struct fib_rule *r; for (r=fib_rules; r; r=r->r_next) { - if (r->r_ifindex == dev->ifindex) + if (r->r_ifindex == dev->ifindex) { + write_lock_bh(&fib_rules_lock); r->r_ifindex = -1; + write_unlock_bh(&fib_rules_lock); + } } } @@ -260,8 +269,11 @@ static void fib_rules_attach(struct device *dev) struct fib_rule *r; for (r=fib_rules; r; r=r->r_next) { - if (r->r_ifindex == -1 && strcmp(dev->name, r->r_ifname) == 0) + if (r->r_ifindex == -1 && strcmp(dev->name, r->r_ifname) == 0) { + write_lock_bh(&fib_rules_lock); r->r_ifindex = dev->ifindex; + write_unlock_bh(&fib_rules_lock); + } } } @@ -275,6 +287,7 @@ int fib_lookup(const struct rt_key *key, struct fib_result *res) u32 saddr = key->src; FRprintk("Lookup: %08x <- %08x ", key->dst, key->src); + read_lock(&fib_rules_lock); for (r = fib_rules; r; r=r->r_next) { if (((saddr^r->r_src) & r->r_srcmask) || ((daddr^r->r_dst) & r->r_dstmask) || @@ -294,11 +307,14 @@ FRprintk("tb %d r %d ", r->r_table, r->r_action); policy = r; break; case RTN_UNREACHABLE: + read_unlock(&fib_rules_lock); return -ENETUNREACH; default: case RTN_BLACKHOLE: + read_unlock(&fib_rules_lock); return -EINVAL; case RTN_PROHIBIT: + read_unlock(&fib_rules_lock); return -EACCES; } @@ -308,12 +324,16 @@ FRprintk("tb %d r %d ", r->r_table, r->r_action); if (err == 0) { FRprintk("ok\n"); res->r = policy; + read_unlock(&fib_rules_lock); return 0; } - if (err < 0 && err != -EAGAIN) + if (err < 0 && err != -EAGAIN) { + read_unlock(&fib_rules_lock); return err; + } } FRprintk("FAILURE\n"); + read_unlock(&fib_rules_lock); return -ENETUNREACH; } @@ -400,12 +420,14 @@ int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb) int s_idx = cb->args[0]; struct fib_rule *r; + read_lock(&fib_rules_lock); for (r=fib_rules, idx=0; r; r = r->r_next, idx++) { if (idx < s_idx) 
continue; if (inet_fill_rule(skb, r, cb) < 0) break; } + read_unlock(&fib_rules_lock); cb->args[0] = idx; return skb->len; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 199550ffb..9456c7f29 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1,9 +1,9 @@ /* * NET3: Implementation of the ICMP protocol layer. * - * Alan Cox, <alan@cymru.net> + * Alan Cox, <alan@redhat.com> * - * Version: $Id: icmp.c,v 1.52 1999/03/21 12:04:11 davem Exp $ + * Version: $Id: icmp.c,v 1.57 1999/06/09 10:10:50 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -699,8 +699,8 @@ static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb, int len) case ICMP_FRAG_NEEDED: if (ipv4_config.no_pmtu_disc) { if (net_ratelimit()) - printk(KERN_INFO "ICMP: %s: fragmentation needed and DF set.\n", - in_ntoa(iph->daddr)); + printk(KERN_INFO "ICMP: %d.%d.%d.%d: fragmentation needed and DF set.\n", + NIPQUAD(iph->daddr)); } else { unsigned short new_mtu; new_mtu = ip_rt_frag_needed(iph, ntohs(icmph->un.frag.mtu)); @@ -711,7 +711,7 @@ static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb, int len) break; case ICMP_SR_FAILED: if (net_ratelimit()) - printk(KERN_INFO "ICMP: %s: Source Route Failed.\n", in_ntoa(iph->daddr)); + printk(KERN_INFO "ICMP: %d.%d.%d.%d: Source Route Failed.\n", NIPQUAD(iph->daddr)); break; default: break; @@ -741,8 +741,8 @@ static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb, int len) if (inet_addr_type(iph->daddr) == RTN_BROADCAST) { if (net_ratelimit()) - printk(KERN_WARNING "%s sent an invalid ICMP error to a broadcast.\n", - in_ntoa(skb->nh.iph->saddr)); + printk(KERN_WARNING "%d.%d.%d.%d sent an invalid ICMP error to a broadcast.\n", + NIPQUAD(skb->nh.iph->saddr)); return; } } @@ -1142,6 +1142,8 @@ __initfunc(void icmp_init(struct net_proto_family *ops)) icmp_inode.i_sock = 1; icmp_inode.i_uid = 0; icmp_inode.i_gid = 0; + init_waitqueue_head(&icmp_inode.i_wait); + init_waitqueue_head(&icmp_inode.u.socket_i.wait); icmp_socket->inode = &icmp_inode; icmp_socket->state = SS_UNCONNECTED; @@ -1150,6 +1152,11 @@ __initfunc(void icmp_init(struct net_proto_family *ops)) if ((err=ops->create(icmp_socket, IPPROTO_ICMP))<0) panic("Failed to create the ICMP control socket.\n"); icmp_socket->sk->allocation=GFP_ATOMIC; - icmp_socket->sk->num = 256; /* Don't receive any data */ icmp_socket->sk->ip_ttl = MAXTTL; + + /* Unhash it so that IP input processing does not even + * see it, we do not wish this socket to see incoming + * packets. + */ + icmp_socket->sk->prot->unhash(icmp_socket->sk); } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 68e52633e..61c530418 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -8,7 +8,7 @@ * the older version didn't come out right using gcc 2.5.8, the newer one * seems to fall out with gcc 2.6.2. * - * Version: $Id: igmp.c,v 1.30 1999/03/25 10:04:10 davem Exp $ + * Version: $Id: igmp.c,v 1.32 1999/06/09 10:10:53 davem Exp $ * * Authors: * Alan Cox <Alan.Cox@linux.org> @@ -97,6 +97,15 @@ #include <linux/mroute.h> #endif +/* Big mc list lock for all the devices */ +static rwlock_t ip_mc_lock = RW_LOCK_UNLOCKED; +/* Big mc list semaphore for all the sockets. + We do not refer to this list in IP data paths or from BH, + so that semaphore is OK. 
+ */ +DECLARE_MUTEX(ip_sk_mc_sem); + + #define IP_MAX_MEMBERSHIPS 20 #ifdef CONFIG_IP_MULTICAST @@ -216,6 +225,8 @@ static void igmp_timer_expire(unsigned long data) struct in_device *in_dev = im->interface; int err; + read_lock(&ip_mc_lock); + im->tm_running=0; if (IGMP_V1_SEEN(in_dev)) @@ -234,6 +245,7 @@ static void igmp_timer_expire(unsigned long data) igmp_start_timer(im, IGMP_Unsolicited_Report_Interval); } im->reporter = 1; + read_unlock(&ip_mc_lock); } static void igmp_heard_report(struct in_device *in_dev, u32 group) @@ -245,14 +257,16 @@ static void igmp_heard_report(struct in_device *in_dev, u32 group) if (LOCAL_MCAST(group)) return; + read_lock(&ip_mc_lock); for (im=in_dev->mc_list; im!=NULL; im=im->next) { if (im->multiaddr == group) { igmp_stop_timer(im); im->reporter = 0; im->unsolicit_count = 0; - return; + break; } } + read_unlock(&ip_mc_lock); } static void igmp_heard_query(struct in_device *in_dev, unsigned char max_resp_time, @@ -281,6 +295,7 @@ static void igmp_heard_query(struct in_device *in_dev, unsigned char max_resp_ti * - Use the igmp->igmp_code field as the maximum * delay possible */ + read_lock(&ip_mc_lock); for (im=in_dev->mc_list; im!=NULL; im=im->next) { if (group && group != im->multiaddr) continue; @@ -291,6 +306,7 @@ static void igmp_heard_query(struct in_device *in_dev, unsigned char max_resp_ti igmp_stop_timer(im); igmp_start_timer(im, max_delay); } + read_unlock(&ip_mc_lock); } int igmp_rcv(struct sk_buff *skb, unsigned short len) @@ -380,9 +396,7 @@ static void igmp_group_dropped(struct ip_mc_list *im) if (LOCAL_MCAST(im->multiaddr)) return; - start_bh_atomic(); igmp_stop_timer(im); - end_bh_atomic(); if (im->reporter && !IGMP_V1_SEEN(im->interface)) igmp_send_report(im->interface->dev, im->multiaddr, IGMP_HOST_LEAVE_MESSAGE); @@ -400,9 +414,7 @@ static void igmp_group_added(struct ip_mc_list *im) if (LOCAL_MCAST(im->multiaddr)) return; - start_bh_atomic(); igmp_start_timer(im, IGMP_Initial_Report_Delay); - end_bh_atomic(); #endif } @@ -422,16 +434,17 @@ void ip_mc_inc_group(struct in_device *in_dev, u32 addr) im = (struct ip_mc_list *)kmalloc(sizeof(*im), GFP_KERNEL); + write_lock_bh(&ip_mc_lock); for (i=in_dev->mc_list; i; i=i->next) { if (i->multiaddr == addr) { i->users++; if (im) kfree(im); - return; + goto out; } } if (!im) - return; + goto out; im->users=1; im->interface=in_dev; im->multiaddr=addr; @@ -447,9 +460,13 @@ void ip_mc_inc_group(struct in_device *in_dev, u32 addr) im->next=in_dev->mc_list; in_dev->mc_list=im; igmp_group_added(im); + write_unlock_bh(&ip_mc_lock); if (in_dev->dev->flags & IFF_UP) ip_rt_multicast_event(in_dev); return; +out: + write_unlock_bh(&ip_mc_lock); + return; } /* @@ -458,22 +475,27 @@ void ip_mc_inc_group(struct in_device *in_dev, u32 addr) int ip_mc_dec_group(struct in_device *in_dev, u32 addr) { + int err = -ESRCH; struct ip_mc_list *i, **ip; + write_lock_bh(&ip_mc_lock); for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) { if (i->multiaddr==addr) { if (--i->users == 0) { *ip = i->next; - synchronize_bh(); - igmp_group_dropped(i); + + write_unlock_bh(&ip_mc_lock); if (in_dev->dev->flags & IFF_UP) ip_rt_multicast_event(in_dev); kfree_s(i, sizeof(*i)); + return 0; } - return 0; + err = 0; + break; } } + write_unlock_bh(&ip_mc_lock); return -ESRCH; } @@ -483,8 +505,10 @@ void ip_mc_down(struct in_device *in_dev) { struct ip_mc_list *i; + read_lock_bh(&ip_mc_lock); for (i=in_dev->mc_list; i; i=i->next) igmp_group_dropped(i); + read_unlock_bh(&ip_mc_lock); ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS); } @@ -497,8 
+521,10 @@ void ip_mc_up(struct in_device *in_dev) ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); + read_lock_bh(&ip_mc_lock); for (i=in_dev->mc_list; i; i=i->next) igmp_group_added(i); + read_unlock_bh(&ip_mc_lock); } /* @@ -509,11 +535,13 @@ void ip_mc_destroy_dev(struct in_device *in_dev) { struct ip_mc_list *i; + write_lock_bh(&ip_mc_lock); while ((i = in_dev->mc_list) != NULL) { in_dev->mc_list = i->next; igmp_group_dropped(i); kfree_s(i, sizeof(*i)); } + write_unlock_bh(&ip_mc_lock); } static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr) @@ -570,6 +598,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); err = -EADDRINUSE; + down(&ip_sk_mc_sem); for (i=sk->ip_mc_list; i; i=i->next) { if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) { /* New style additions are reference counted */ @@ -577,13 +606,13 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) i->count++; err = 0; } - goto done; + goto done_unlock; } count++; } err = -ENOBUFS; if (iml == NULL || count >= sysctl_igmp_max_memberships) - goto done; + goto done_unlock; memcpy(&iml->multi, imr, sizeof(*imr)); iml->next = sk->ip_mc_list; iml->count = 1; @@ -591,6 +620,9 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) ip_mc_inc_group(in_dev, addr); iml = NULL; err = 0; + +done_unlock: + up(&ip_sk_mc_sem); done: rtnl_shunlock(); if (iml) @@ -606,6 +638,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) { struct ip_mc_socklist *iml, **imlp; + down(&ip_sk_mc_sem); for (imlp=&sk->ip_mc_list; (iml=*imlp)!=NULL; imlp=&iml->next) { if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr && iml->multi.imr_address.s_addr==imr->imr_address.s_addr && @@ -615,7 +648,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) return 0; *imlp = iml->next; - synchronize_bh(); + up(&ip_sk_mc_sem); in_dev = inetdev_by_index(iml->multi.imr_ifindex); if (in_dev) @@ -624,6 +657,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) return 0; } } + up(&ip_sk_mc_sem); return -EADDRNOTAVAIL; } @@ -635,13 +669,37 @@ void ip_mc_drop_socket(struct sock *sk) { struct ip_mc_socklist *iml; + down(&ip_sk_mc_sem); while ((iml=sk->ip_mc_list) != NULL) { struct in_device *in_dev; sk->ip_mc_list = iml->next; + up(&ip_sk_mc_sem); + if ((in_dev = inetdev_by_index(iml->multi.imr_ifindex)) != NULL) ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); sock_kfree_s(sk, iml, sizeof(*iml)); + + down(&ip_sk_mc_sem); } + up(&ip_sk_mc_sem); +} + +int ip_check_mc(struct device *dev, u32 mc_addr) +{ + struct in_device *in_dev = dev->ip_ptr; + struct ip_mc_list *im; + + if (in_dev) { + read_lock(&ip_mc_lock); + for (im=in_dev->mc_list; im; im=im->next) { + if (im->multiaddr == mc_addr) { + read_unlock(&ip_mc_lock); + return 1; + } + } + read_unlock(&ip_mc_lock); + } + return 0; } @@ -653,11 +711,11 @@ int ip_mc_procinfo(char *buffer, char **start, off_t offset, int length, int dum struct ip_mc_list *im; int len=0; struct device *dev; - + len=sprintf(buffer,"Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n"); - - for(dev = dev_base; dev; dev = dev->next) - { + + read_lock(&dev_base_lock); + for(dev = dev_base; dev; dev = dev->next) { struct in_device *in_dev = dev->ip_ptr; char *querier = "NONE"; @@ -669,6 +727,7 @@ int ip_mc_procinfo(char *buffer, char **start, off_t offset, int length, int dum len+=sprintf(buffer+len,"%d\t%-10s: %5d %7s\n", dev->ifindex, dev->name, dev->mc_count, querier); + 
read_lock(&ip_mc_lock); for (im = in_dev->mc_list; im; im = im->next) { len+=sprintf(buffer+len, "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n", @@ -681,11 +740,16 @@ int ip_mc_procinfo(char *buffer, char **start, off_t offset, int length, int dum len=0; begin=pos; } - if(pos>offset+length) + if(pos>offset+length) { + read_unlock(&ip_mc_lock); goto done; + } } + read_unlock(&ip_mc_lock); } done: + read_unlock(&dev_base_lock); + *start=buffer+(offset-begin); len-=(offset-begin); if(len>length) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index f066e6073..29747fee6 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -5,7 +5,7 @@ * * The IP fragmentation functionality. * - * Version: $Id: ip_fragment.c,v 1.40 1999/03/20 23:58:34 davem Exp $ + * Version: $Id: ip_fragment.c,v 1.41 1999/05/27 00:38:07 davem Exp $ * * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox <Alan.Cox@linux.org> @@ -71,7 +71,8 @@ struct ipq { #define IPQ_HASHSZ 64 -struct ipq *ipq_hash[IPQ_HASHSZ]; +static struct ipq *ipq_hash[IPQ_HASHSZ]; +static spinlock_t ipfrag_lock = SPIN_LOCK_UNLOCKED; #define ipqhashfn(id, saddr, daddr, prot) \ ((((id) >> 1) ^ (saddr) ^ (daddr) ^ (prot)) & (IPQ_HASHSZ - 1)) @@ -141,7 +142,9 @@ static inline struct ipq *ip_find(struct iphdr *iph, struct dst_entry *dst) unsigned int hash = ipqhashfn(id, saddr, daddr, protocol); struct ipq *qp; - /* Always, we are in a BH context, so no locking. -DaveM */ + /* We are always in BH context, and protected by the + * ipfrag lock. + */ for(qp = ipq_hash[hash]; qp; qp = qp->next) { if(qp->iph->id == id && qp->iph->saddr == saddr && @@ -158,8 +161,9 @@ static inline struct ipq *ip_find(struct iphdr *iph, struct dst_entry *dst) * because we completed, reassembled and processed it, or because * it timed out. * - * This is called _only_ from BH contexts, on packet reception - * processing and from frag queue expiration timers. -DaveM + * This is called _only_ from BH contexts with the ipfrag lock held, + * on packet reception processing and from frag queue expiration + * timers. -DaveM */ static void ip_free(struct ipq *qp) { @@ -197,6 +201,7 @@ static void ip_expire(unsigned long arg) { struct ipq *qp = (struct ipq *) arg; + spin_lock(&ipfrag_lock); if(!qp->fragments) { #ifdef IP_EXPIRE_DEBUG @@ -213,10 +218,13 @@ static void ip_expire(unsigned long arg) out: /* Nuke the fragment queue. */ ip_free(qp); + spin_lock(&ipfrag_lock); } /* Memory limiting on fragments. Evictor trashes the oldest * fragment queue until we are back under the low threshold. + * + * We are always called in BH with the ipfrag lock held. */ static void ip_evictor(void) { @@ -229,9 +237,6 @@ restart: struct ipq *qp; if (atomic_read(&ip_frag_mem) <= sysctl_ipfrag_low_thresh) return; - /* We are in a BH context, so these queue - * accesses are safe. -DaveM - */ qp = ipq_hash[i]; if (qp) { /* find the oldest queue for this hash bucket */ @@ -283,7 +288,7 @@ static struct ipq *ip_create(struct sk_buff *skb, struct iphdr *iph) /* Add this entry to the queue. */ hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); - /* We are in a BH context, no locking necessary. -DaveM */ + /* In a BH context and ipfrag lock is held. -DaveM */ if((qp->next = ipq_hash[hash]) != NULL) qp->next->pprev = &qp->next; ipq_hash[hash] = qp; @@ -421,6 +426,8 @@ struct sk_buff *ip_defrag(struct sk_buff *skb) ip_statistics.IpReasmReqds++; + spin_lock(&ipfrag_lock); + /* Start by cleaning up the memory. 
*/ if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh) ip_evictor(); @@ -565,6 +572,7 @@ struct sk_buff *ip_defrag(struct sk_buff *skb) out_freequeue: ip_free(qp); out_skb: + spin_unlock(&ipfrag_lock); return skb; } @@ -574,6 +582,7 @@ out_skb: out_timer: mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time); /* ~ 30 seconds */ out: + spin_unlock(&ipfrag_lock); return NULL; /* diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 7a3e2618b..107ccaa16 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) module. * - * Version: $Id: ip_input.c,v 1.37 1999/04/22 10:38:36 davem Exp $ + * Version: $Id: ip_input.c,v 1.40 1999/06/09 10:10:55 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -154,44 +154,11 @@ struct ip_mib ip_statistics={2,IPDEFTTL,}; /* Forwarding=No, Default TTL=64 */ - -/* - * Handle the issuing of an ioctl() request - * for the ip device. This is scheduled to - * disappear - */ - -int ip_ioctl(struct sock *sk, int cmd, unsigned long arg) -{ - switch(cmd) - { - default: - return(-EINVAL); - } -} - - #if defined(CONFIG_IP_TRANSPARENT_PROXY) && !defined(CONFIG_IP_ALWAYS_DEFRAG) #define CONFIG_IP_ALWAYS_DEFRAG 1 #endif /* - * 0 - deliver - * 1 - block - */ -static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) -{ - int type; - - type = skb->h.icmph->type; - if (type < 32) - return test_bit(type, &sk->tp_pinfo.tp_raw4.filter); - - /* Do not block unknown ICMP types */ - return 0; -} - -/* * Process Router Attention IP option */ int ip_call_ra_chain(struct sk_buff *skb) @@ -224,16 +191,37 @@ int ip_call_ra_chain(struct sk_buff *skb) return 0; } +/* Handle this out of line, it is rare. */ +static int ip_run_ipprot(struct sk_buff *skb, struct iphdr *iph, + struct inet_protocol *ipprot, int force_copy) +{ + int ret = 0; + + do { + if (ipprot->protocol == iph->protocol) { + struct sk_buff *skb2 = skb; + if (ipprot->copy || force_copy) + skb2 = skb_clone(skb, GFP_ATOMIC); + if(skb2 != NULL) { + ret = 1; + ipprot->handler(skb2, + ntohs(iph->tot_len) - (iph->ihl * 4)); + } + } + ipprot = (struct inet_protocol *) ipprot->next; + } while(ipprot != NULL); + + return ret; +} + +extern struct sock *raw_v4_input(struct sk_buff *, struct iphdr *, int); + /* * Deliver IP Packets to the higher protocol layers. */ int ip_local_deliver(struct sk_buff *skb) { struct iphdr *iph = skb->nh.iph; - struct inet_protocol *ipprot; - struct sock *raw_sk=NULL; - unsigned char hash; - int flag = 0; #ifndef CONFIG_IP_ALWAYS_DEFRAG /* @@ -249,34 +237,29 @@ int ip_local_deliver(struct sk_buff *skb) #endif #ifdef CONFIG_IP_MASQUERADE - /* - * Do we need to de-masquerade this packet? - */ - { - int ret; - /* - * Some masq modules can re-inject packets if - * bad configured. + /* Do we need to de-masquerade this packet? */ + if((IPCB(skb)->flags&IPSKB_MASQUERADED)) { + /* Some masq modules can re-inject packets if + * bad configured. */ + printk(KERN_DEBUG "ip_input(): demasq recursion detected. " + "Check masq modules configuration\n"); + kfree_skb(skb); + return 0; + } else { + int ret = ip_fw_demasquerade(&skb); - if((IPCB(skb)->flags&IPSKB_MASQUERADED)) { - printk(KERN_DEBUG "ip_input(): demasq recursion detected. 
Check masq modules configuration\n"); - kfree_skb(skb); - return 0; - } - - ret = ip_fw_demasquerade(&skb); if (ret < 0) { kfree_skb(skb); return 0; } - if (ret) { - iph=skb->nh.iph; + iph = skb->nh.iph; IPCB(skb)->flags |= IPSKB_MASQUERADED; dst_release(skb->dst); skb->dst = NULL; - if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev)) { + if (ip_route_input(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev)) { kfree_skb(skb); return 0; } @@ -285,112 +268,50 @@ int ip_local_deliver(struct sk_buff *skb) } #endif - /* - * Point into the IP datagram, just past the header. - */ - + /* Point into the IP datagram, just past the header. */ skb->h.raw = skb->nh.raw + iph->ihl*4; - /* - * Deliver to raw sockets. This is fun as to avoid copies we want to make no - * surplus copies. - * - * RFC 1122: SHOULD pass TOS value up to the transport layer. - * -> It does. And not only TOS, but all IP header. - */ - - /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */ - hash = iph->protocol & (MAX_INET_PROTOS - 1); - - /* - * If there maybe a raw socket we must check - if not we don't care less - */ - - if((raw_sk = raw_v4_htable[hash]) != NULL) { - struct sock *sknext = NULL; - struct sk_buff *skb1; - raw_sk = raw_v4_lookup(raw_sk, iph->protocol, iph->saddr, iph->daddr, skb->dev->ifindex); - if(raw_sk) { /* Any raw sockets */ - do { - /* Find the next */ - sknext = raw_v4_lookup(raw_sk->next, iph->protocol, - iph->saddr, iph->daddr, skb->dev->ifindex); - if (iph->protocol != IPPROTO_ICMP || !icmp_filter(raw_sk, skb)) { - if (sknext == NULL) - break; - skb1 = skb_clone(skb, GFP_ATOMIC); - if(skb1) - { - raw_rcv(raw_sk, skb1); - } - } - raw_sk = sknext; - } while(raw_sk!=NULL); - - /* Here either raw_sk is the last raw socket, or NULL if - * none. We deliver to the last raw socket AFTER the - * protocol checks as it avoids a surplus copy. - */ - } - } - - /* - * skb->h.raw now points at the protocol beyond the IP header. - */ - - for (ipprot = (struct inet_protocol *)inet_protos[hash];ipprot != NULL;ipprot=(struct inet_protocol *)ipprot->next) { - struct sk_buff *skb2; - - if (ipprot->protocol != iph->protocol) - continue; - /* - * See if we need to make a copy of it. This will - * only be set if more than one protocol wants it. - * and then not for the last one. If there is a pending - * raw delivery wait for that + /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */ + int hash = iph->protocol & (MAX_INET_PROTOS - 1); + struct sock *raw_sk = raw_v4_htable[hash]; + struct inet_protocol *ipprot; + int flag; + + /* If there maybe a raw socket we must check - if not we + * don't care less */ - - if (ipprot->copy || raw_sk) - { - skb2 = skb_clone(skb, GFP_ATOMIC); - if(skb2==NULL) - continue; - } - else - { - skb2 = skb; - } - flag = 1; + if(raw_sk != NULL) + raw_sk = raw_v4_input(skb, iph, hash); + + ipprot = (struct inet_protocol *) inet_protos[hash]; + flag = 0; + if(ipprot != NULL) { + if(raw_sk == NULL && + ipprot->next == NULL && + ipprot->protocol == iph->protocol) { + /* Fast path... */ + return ipprot->handler(skb, (ntohs(iph->tot_len) - + (iph->ihl * 4))); + } else { + flag = ip_run_ipprot(skb, iph, ipprot, (raw_sk != NULL)); + } + } - /* - * Pass on the datagram to each protocol that wants it, - * based on the datagram protocol. We should really - * check the protocol handler's return values here... + /* All protocols checked. 
+ * If this packet was a broadcast, we may *not* reply to it, since that + * causes (proven, grin) ARP storms and a leakage of memory (i.e. all + * ICMP reply messages get queued up for transmission...) */ - - ipprot->handler(skb2, ntohs(iph->tot_len) - (iph->ihl * 4)); - } - - /* - * All protocols checked. - * If this packet was a broadcast, we may *not* reply to it, since that - * causes (proven, grin) ARP storms and a leakage of memory (i.e. all - * ICMP reply messages get queued up for transmission...) - */ - - if(raw_sk!=NULL) /* Shift to last raw user */ - { - raw_rcv(raw_sk, skb); - - } - else if (!flag) /* Free and report errors */ - { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); - kfree_skb(skb); + if(raw_sk != NULL) { /* Shift to last raw user */ + raw_rcv(raw_sk, skb); + } else if (!flag) { /* Free and report errors */ + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); + kfree_skb(skb); + } } - return(0); + return 0; } /* @@ -404,9 +325,8 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) u16 rport; #endif /* CONFIG_FIREWALL */ - /* - * When the interface is in promisc. mode, drop all the crap - * that it receives, do not try to analyse it. + /* When the interface is in promisc. mode, drop all the crap + * that it receives, do not try to analyse it. */ if (skb->pkt_type == PACKET_OTHERHOST) goto drop; @@ -430,17 +350,15 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) goto inhdr_error; { - __u32 len = ntohs(iph->tot_len); - if (skb->len < len) - goto inhdr_error; + __u32 len = ntohs(iph->tot_len); + if (skb->len < len) + goto inhdr_error; - /* - * Our transport medium may have padded the buffer out. Now we know it - * is IP we can trim to the true length of the frame. - * Note this now means skb->len holds ntohs(iph->tot_len). - */ - - __skb_trim(skb, len); + /* Our transport medium may have padded the buffer out. Now we know it + * is IP we can trim to the true length of the frame. + * Note this now means skb->len holds ntohs(iph->tot_len). 
+ */ + __skb_trim(skb, len); } #ifdef CONFIG_IP_ALWAYS_DEFRAG @@ -474,21 +392,17 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) if (skb->dst == NULL) { if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev)) goto drop; -#ifdef CONFIG_CPU_IS_SLOW - if (net_cpu_congestion > 10 && !(iph->tos&IPTOS_RELIABILITY) && - IPTOS_PREC(iph->tos) < IPTOS_PREC_INTERNETCONTROL) { - goto drop; - } -#endif } #ifdef CONFIG_NET_CLS_ROUTE if (skb->dst->tclassid) { u32 idx = skb->dst->tclassid; + write_lock(&ip_rt_acct_lock); ip_rt_acct[idx&0xFF].o_packets++; ip_rt_acct[idx&0xFF].o_bytes+=skb->len; ip_rt_acct[(idx>>16)&0xFF].i_packets++; ip_rt_acct[(idx>>16)&0xFF].i_bytes+=skb->len; + write_unlock(&ip_rt_acct_lock); } #endif diff --git a/net/ipv4/ip_masq_mfw.c b/net/ipv4/ip_masq_mfw.c index dc38b1712..ff07231fc 100644 --- a/net/ipv4/ip_masq_mfw.c +++ b/net/ipv4/ip_masq_mfw.c @@ -3,7 +3,7 @@ * * Does (reverse-masq) forwarding based on skb->fwmark value * - * $Id: ip_masq_mfw.c,v 1.3 1999/01/26 05:33:47 davem Exp $ + * $Id: ip_masq_mfw.c,v 1.4 1999/05/13 23:25:07 davem Exp $ * * Author: Juan Jose Ciarlante <jjciarla@raiz.uncu.edu.ar> * based on Steven Clarke's portfw @@ -79,7 +79,7 @@ struct ip_masq_mfw { }; -static struct semaphore mfw_sema = MUTEX; +static DECLARE_MUTEX(mfw_sema); #ifdef __SMP__ static rwlock_t mfw_lock = RW_LOCK_UNLOCKED; #endif diff --git a/net/ipv4/ip_masq_quake.c b/net/ipv4/ip_masq_quake.c index 165dd6bd5..17b11a799 100644 --- a/net/ipv4/ip_masq_quake.c +++ b/net/ipv4/ip_masq_quake.c @@ -12,6 +12,7 @@ * http://www.gamers.org/dEngine/quake/spec/ * Harald Hoyer : Check for QUAKE-STRING * Juan Jose Ciarlante : litl bits for 2.1 + * Horst von Brand : Add #include <linux/string.h> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -24,6 +25,7 @@ #include <linux/module.h> #include <asm/system.h> #include <linux/types.h> +#include <linux/string.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/in.h> @@ -44,7 +46,7 @@ typedef struct struct quake_priv_data { /* Have we seen a client connect message */ - char cl_connect; + signed char cl_connect; }; static int diff --git a/net/ipv4/ip_masq_vdolive.c b/net/ipv4/ip_masq_vdolive.c index 4724e3b93..2d8d672cc 100644 --- a/net/ipv4/ip_masq_vdolive.c +++ b/net/ipv4/ip_masq_vdolive.c @@ -2,7 +2,7 @@ * IP_MASQ_VDOLIVE - VDO Live masquerading module * * - * Version: @(#)$Id: ip_masq_vdolive.c,v 1.4 1998/10/06 04:49:07 davem Exp $ + * Version: @(#)$Id: ip_masq_vdolive.c,v 1.6 1999/06/09 08:29:03 davem Exp $ * * Author: Nigel Metheringham <Nigel.Metheringham@ThePLAnet.net> * PLAnet Online Ltd @@ -10,6 +10,9 @@ * Fixes: Minor changes for 2.1 by * Steven Clarke <Steven.Clarke@ThePlanet.Net>, Planet Online Ltd * + * Add missing #include <linux/string.h> + * Horst von Brand <vonbrand@sleipnir.valparaiso.cl> + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -25,6 +28,7 @@ #include <linux/config.h> #include <linux/module.h> #include <linux/types.h> +#include <linux/string.h> #include <linux/kernel.h> #include <asm/system.h> #include <linux/skbuff.h> diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index fae22cbe7..359926a4c 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -5,7 +5,7 @@ * * The options processing module for ip.c * - * Version: $Id: 
ip_options.c,v 1.16 1999/03/21 05:22:40 davem Exp $ + * Version: $Id: ip_options.c,v 1.18 1999/06/09 08:29:06 davem Exp $ * * Authors: A.N.Kuznetsov * @@ -452,7 +452,6 @@ eol: error: if (skb) { icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24)); - kfree_skb(skb); } return -EINVAL; } diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index abe93ec27..51e27ad67 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1,5 +1,5 @@ /* - * $Id: ipconfig.c,v 1.20 1999/03/28 10:18:28 davem Exp $ + * $Id: ipconfig.c,v 1.22 1999/06/09 10:10:57 davem Exp $ * * Automatic Configuration of IP -- use BOOTP or RARP or user-supplied * information to configure own IP address and routes. @@ -112,7 +112,8 @@ static int __init ic_open_devs(void) unsigned short oflags; last = &ic_first_dev; - for (dev = dev_base; dev; dev = dev->next) + read_lock(&dev_base_lock); + for (dev = dev_base; dev; dev = dev->next) { if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : (!(dev->flags & IFF_LOOPBACK) && (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) && @@ -142,6 +143,9 @@ static int __init ic_open_devs(void) ic_proto_have_if |= able; DBG(("IP-Config: Opened %s (able=%d)\n", dev->name, able)); } + } + read_unlock(&dev_base_lock); + *last = NULL; if (!ic_first_dev) { diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index d7db0c007..1034e0e7a 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1,7 +1,7 @@ /* * IP multicast routing support for mrouted 3.6/3.8 * - * (c) 1995 Alan Cox, <alan@cymru.net> + * (c) 1995 Alan Cox, <alan@redhat.com> * Linux Consultancy and Custom Driver Development * * This program is free software; you can redistribute it and/or @@ -9,7 +9,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Version: $Id: ipmr.c,v 1.40 1999/03/25 10:04:25 davem Exp $ + * Version: $Id: ipmr.c,v 1.43 1999/06/09 10:10:59 davem Exp $ * * Fixes: * Michael Chastain : Incorrect size of copying. @@ -23,6 +23,8 @@ * Brad Parker : Better behaviour on mrouted upcall * overflow. * Carlos Picoto : PIMv1 Support + * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header + * Relax this requrement to work with older peers. * */ @@ -431,7 +433,7 @@ static void ipmr_cache_resolve(struct mfc_cache *cache) skb_trim(skb, nlh->nlmsg_len); ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE; } - err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).pid, MSG_DONTWAIT); + err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT); } else #endif ip_mr_forward(skb, cache, 0); @@ -1343,7 +1345,8 @@ int pim_rcv(struct sk_buff * skb, unsigned short len) pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) || (pim->flags&PIM_NULL_REGISTER) || reg_dev == NULL || - ip_compute_csum((void *)pim, len)) { + (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && + ip_compute_csum((void *)pim, len))) { kfree_skb(skb); return -EINVAL; } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 1640a0560..52c5ee5a4 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -7,7 +7,7 @@ * PROC file system. It is mainly used for debugging and * statistics. * - * Version: $Id: proc.c,v 1.34 1999/02/08 11:20:34 davem Exp $ + * Version: $Id: proc.c,v 1.35 1999/05/27 00:37:38 davem Exp $ * * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Gerald J. 
Heim, <heim@peanuts.informatik.uni-tuebingen.de> @@ -114,10 +114,8 @@ static inline void get__sock(struct sock *sp, char *tmpbuf, int i, int format) slot_dist = tcp_tw_death_row_slot - slot_dist; timer_expires = jiffies + (slot_dist * TCP_TWKILL_PERIOD); } else { - timer_active1 = del_timer(&tp->retransmit_timer); - timer_active2 = del_timer(&sp->timer); - if (!timer_active1) tp->retransmit_timer.expires=0; - if (!timer_active2) sp->timer.expires=0; + timer_active1 = tp->retransmit_timer.prev != NULL; + timer_active2 = sp->timer.prev != NULL; timer_active = 0; timer_expires = (unsigned) -1; } @@ -147,9 +145,6 @@ static inline void get__sock(struct sock *sp, char *tmpbuf, int i, int format) (!tw_bucket && sp->socket) ? sp->socket->inode->i_uid : 0, (!tw_bucket && timer_active) ? sp->timeout : 0, (!tw_bucket && sp->socket) ? sp->socket->inode->i_ino : 0); - - if (timer_active1) add_timer(&tp->retransmit_timer); - if (timer_active2) add_timer(&sp->timer); } /* @@ -176,7 +171,7 @@ get__netinfo(struct proto *pro, char *buffer, int format, char **start, off_t of " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout inode"); pos = 128; - SOCKHASH_LOCK(); + SOCKHASH_LOCK_READ(); sp = pro->sklist_next; while(sp != (struct sock *)pro) { if (format == 0 && sp->state == TCP_LISTEN) { @@ -211,7 +206,7 @@ get__netinfo(struct proto *pro, char *buffer, int format, char **start, off_t of i++; } out: - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_READ(); begin = len - (pos - offset); *start = buffer + begin; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index fc6b1f2ee..dd2e7555e 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -5,7 +5,7 @@ * * RAW - implementation of IP "raw" sockets. * - * Version: $Id: raw.c,v 1.39 1998/11/08 11:17:04 davem Exp $ + * Version: $Id: raw.c,v 1.41 1999/05/30 01:16:19 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -75,11 +75,11 @@ static void raw_v4_hash(struct sock *sk) num &= (RAWV4_HTABLE_SIZE - 1); skp = &raw_v4_htable[num]; - SOCKHASH_LOCK(); + SOCKHASH_LOCK_WRITE(); sk->next = *skp; *skp = sk; sk->hashent = num; - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); } static void raw_v4_unhash(struct sock *sk) @@ -90,7 +90,7 @@ static void raw_v4_unhash(struct sock *sk) num &= (RAWV4_HTABLE_SIZE - 1); skp = &raw_v4_htable[num]; - SOCKHASH_LOCK(); + SOCKHASH_LOCK_WRITE(); while(*skp != NULL) { if(*skp == sk) { *skp = sk->next; @@ -98,7 +98,7 @@ static void raw_v4_unhash(struct sock *sk) } skp = &((*skp)->next); } - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); } static void raw_v4_rehash(struct sock *sk) @@ -110,7 +110,7 @@ static void raw_v4_rehash(struct sock *sk) num &= (RAWV4_HTABLE_SIZE - 1); skp = &raw_v4_htable[oldnum]; - SOCKHASH_LOCK(); + SOCKHASH_LOCK_WRITE(); while(*skp != NULL) { if(*skp == sk) { *skp = sk->next; @@ -121,16 +121,15 @@ static void raw_v4_rehash(struct sock *sk) sk->next = raw_v4_htable[num]; raw_v4_htable[num] = sk; sk->hashent = num; - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); } -/* Grumble... icmp and ip_input want to get at this... 
*/ -struct sock *raw_v4_lookup(struct sock *sk, unsigned short num, - unsigned long raddr, unsigned long laddr, int dif) +static __inline__ struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, + unsigned long raddr, unsigned long laddr, + int dif) { struct sock *s = sk; - SOCKHASH_LOCK(); for(s = sk; s; s = s->next) { if((s->num == num) && !(s->dead && (s->state == TCP_CLOSE)) && @@ -139,10 +138,79 @@ struct sock *raw_v4_lookup(struct sock *sk, unsigned short num, !(s->bound_dev_if && s->bound_dev_if != dif)) break; /* gotcha */ } - SOCKHASH_UNLOCK(); return s; } +struct sock *raw_v4_lookup(struct sock *sk, unsigned short num, + unsigned long raddr, unsigned long laddr, + int dif) +{ + SOCKHASH_LOCK_READ(); + sk = __raw_v4_lookup(sk, num, raddr, laddr, dif); + SOCKHASH_UNLOCK_READ(); + + return sk; +} + +/* + * 0 - deliver + * 1 - block + */ +static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) +{ + int type; + + type = skb->h.icmph->type; + if (type < 32) + return test_bit(type, &sk->tp_pinfo.tp_raw4.filter); + + /* Do not block unknown ICMP types */ + return 0; +} + +/* IP input processing comes here for RAW socket delivery. + * This is fun as to avoid copies we want to make no surplus + * copies. + * + * RFC 1122: SHOULD pass TOS value up to the transport layer. + * -> It does. And not only TOS, but all IP header. + */ +struct sock *raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) +{ + struct sock *sk; + + SOCKHASH_LOCK_READ_BH(); + if ((sk = raw_v4_htable[hash]) == NULL) + goto out; + sk = __raw_v4_lookup(sk, iph->protocol, + iph->saddr, iph->daddr, + skb->dev->ifindex); + while(sk != NULL) { + struct sock *sknext = __raw_v4_lookup(sk->next, iph->protocol, + iph->saddr, iph->daddr, + skb->dev->ifindex); + + if (iph->protocol != IPPROTO_ICMP || + ! icmp_filter(sk, skb)) { + struct sk_buff *clone; + + if(sknext == NULL) + break; + clone = skb_clone(skb, GFP_ATOMIC); + if(clone) { + SOCKHASH_UNLOCK_READ_BH(); + raw_rcv(sk, clone); + SOCKHASH_LOCK_READ_BH(); + } + } + sk = sknext; + } +out: + SOCKHASH_UNLOCK_READ_BH(); + + return sk; +} + void raw_err (struct sock *sk, struct sk_buff *skb) { int type = skb->h.icmph->type; @@ -402,6 +470,8 @@ done: static void raw_close(struct sock *sk, long timeout) { + bh_lock_sock(sk); + /* Observation: when raw_close is called, processes have no access to socket anymore. But net still has. Step one, detach it from networking: diff --git a/net/ipv4/route.c b/net/ipv4/route.c index dbde97b70..3d9e87de3 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -5,7 +5,7 @@ * * ROUTE - implementation of the IP router. * - * Version: $Id: route.c,v 1.67 1999/05/08 20:00:20 davem Exp $ + * Version: $Id: route.c,v 1.69 1999/06/09 10:11:02 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -174,7 +174,18 @@ __u8 ip_tos2prio[16] = { * Route cache. */ -struct rtable *rt_hash_table[RT_HASH_DIVISOR]; +/* The locking scheme is rather straight forward: + * + * 1) A BH protected rwlock protects the central route hash. + * 2) Only writers remove entries, and they hold the lock + * as they look at rtable reference counts. + * 3) Only readers acquire references to rtable entries, + * they do so with atomic increments and with the + * lock held. 
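 *
 * Condensed from ip_route_output() further down, the reader side
 * therefore always looks like this sketch:
 *
 *	read_lock_bh(&rt_hash_lock);
 *	for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) {
 *		if (rth->key.dst == daddr && rth->key.src == saddr && ...) {
 *			rth->u.dst.lastuse = jiffies;
 *			atomic_inc(&rth->u.dst.use);
 *			atomic_inc(&rth->u.dst.refcnt);
 *			break;
 *		}
 *	}
 *	read_unlock_bh(&rt_hash_lock);
 *
 * i.e. the reference counts are bumped while the read lock is still
 * held, never after it has been dropped.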
+ */ + +static struct rtable *rt_hash_table[RT_HASH_DIVISOR]; +static rwlock_t rt_hash_lock = RW_LOCK_UNLOCKED; static int rt_intern_hash(unsigned hash, struct rtable * rth, struct rtable ** res); @@ -204,7 +215,7 @@ static int rt_cache_get_info(char *buffer, char **start, off_t offset, int lengt } - start_bh_atomic(); + read_lock_bh(&rt_hash_lock); for (i = 0; i<RT_HASH_DIVISOR; i++) { for (r = rt_hash_table[i]; r; r = r->u.rt_next) { @@ -239,7 +250,7 @@ static int rt_cache_get_info(char *buffer, char **start, off_t offset, int lengt } done: - end_bh_atomic(); + read_unlock_bh(&rt_hash_lock); *start = buffer+len-(pos-offset); len = pos-offset; @@ -292,6 +303,7 @@ static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2) return 1; } +/* This runs via a timer and thus is always in BH context. */ static void rt_check_expire(unsigned long dummy) { int i; @@ -305,6 +317,7 @@ static void rt_check_expire(unsigned long dummy) rover = (rover + 1) & (RT_HASH_DIVISOR-1); rthp = &rt_hash_table[rover]; + write_lock(&rt_hash_lock); while ((rth = *rthp) != NULL) { if (rth->u.dst.expires) { /* Entrie is expired even if it is in use */ @@ -325,6 +338,7 @@ static void rt_check_expire(unsigned long dummy) *rthp = rth->u.rt_next; rt_free(rth); } + write_unlock(&rt_hash_lock); /* Fallback loop breaker. */ if ((jiffies - now) > 0) @@ -334,6 +348,9 @@ static void rt_check_expire(unsigned long dummy) add_timer(&rt_periodic_timer); } +/* This can run from both BH and non-BH contexts, the latter + * in the case of a forced flush event. + */ static void rt_run_flush(unsigned long dummy) { int i; @@ -341,23 +358,23 @@ static void rt_run_flush(unsigned long dummy) rt_deadline = 0; - start_bh_atomic(); for (i=0; i<RT_HASH_DIVISOR; i++) { - if ((rth = xchg(&rt_hash_table[i], NULL)) == NULL) - continue; - end_bh_atomic(); + write_lock_bh(&rt_hash_lock); + rth = rt_hash_table[i]; + if(rth != NULL) + rt_hash_table[i] = NULL; + write_unlock_bh(&rt_hash_lock); for (; rth; rth=next) { next = rth->u.rt_next; rth->u.rt_next = NULL; rt_free(rth); } - - start_bh_atomic(); } - end_bh_atomic(); } +static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED; + void rt_cache_flush(int delay) { unsigned long now = jiffies; @@ -366,7 +383,7 @@ void rt_cache_flush(int delay) if (delay < 0) delay = ip_rt_min_delay; - start_bh_atomic(); + spin_lock_bh(&rt_flush_lock); if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) { long tmo = (long)(rt_deadline - now); @@ -386,7 +403,7 @@ void rt_cache_flush(int delay) } if (delay <= 0) { - end_bh_atomic(); + spin_unlock_bh(&rt_flush_lock); rt_run_flush(0); return; } @@ -396,7 +413,7 @@ void rt_cache_flush(int delay) rt_flush_timer.expires = now + delay; add_timer(&rt_flush_timer); - end_bh_atomic(); + spin_unlock_bh(&rt_flush_lock); } /* @@ -459,7 +476,10 @@ static int rt_garbage_collect(void) do { int i, k; - start_bh_atomic(); + /* The write lock is held during the entire hash + * traversal to ensure consistent state of the rover. 
+ */ + write_lock_bh(&rt_hash_lock); for (i=0, k=rover; i<RT_HASH_DIVISOR; i++) { unsigned tmo = expire; @@ -480,7 +500,7 @@ static int rt_garbage_collect(void) break; } rover = k; - end_bh_atomic(); + write_unlock_bh(&rt_hash_lock); if (goal <= 0) goto work_done; @@ -530,10 +550,9 @@ static int rt_intern_hash(unsigned hash, struct rtable * rt, struct rtable ** rp int attempts = !in_interrupt(); restart: - start_bh_atomic(); - rthp = &rt_hash_table[hash]; + write_lock_bh(&rt_hash_lock); while ((rth = *rthp) != NULL) { if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) { /* Put it first */ @@ -544,7 +563,7 @@ restart: atomic_inc(&rth->u.dst.refcnt); atomic_inc(&rth->u.dst.use); rth->u.dst.lastuse = now; - end_bh_atomic(); + write_unlock_bh(&rt_hash_lock); rt_drop(rt); *rp = rth; @@ -559,7 +578,7 @@ restart: */ if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) { if (!arp_bind_neighbour(&rt->u.dst)) { - end_bh_atomic(); + write_unlock_bh(&rt_hash_lock); /* Neighbour tables are full and nothing can be released. Try to shrink route cache, @@ -594,7 +613,7 @@ restart: } #endif rt_hash_table[hash] = rt; - end_bh_atomic(); + write_unlock_bh(&rt_hash_lock); *rp = rt; return 0; } @@ -633,6 +652,7 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, rthp=&rt_hash_table[hash]; + write_lock_bh(&rt_hash_lock); while ( (rth = *rthp) != NULL) { struct rtable *rt; @@ -657,6 +677,7 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); if (rt == NULL) { ip_rt_put(rth); + write_unlock_bh(&rt_hash_lock); return; } @@ -688,11 +709,15 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, } *rthp = rth->u.rt_next; + write_unlock_bh(&rt_hash_lock); if (!rt_intern_hash(hash, rt, &rt)) ip_rt_put(rt); rt_drop(rth); - break; + goto do_next; } + write_unlock_bh(&rt_hash_lock); + do_next: + ; } } return; @@ -722,8 +747,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) #if RT_CACHE_DEBUG >= 1 printk(KERN_DEBUG "ip_rt_advice: redirect to %d.%d.%d.%d/%02x dropped\n", NIPQUAD(rt->rt_dst), rt->key.tos); #endif - start_bh_atomic(); ip_rt_put(rt); + write_lock_bh(&rt_hash_lock); for (rthp = &rt_hash_table[hash]; *rthp; rthp = &(*rthp)->u.rt_next) { if (*rthp == rt) { *rthp = rt->u.rt_next; @@ -731,7 +756,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) break; } } - end_bh_atomic(); + write_unlock_bh(&rt_hash_lock); return NULL; } } @@ -861,6 +886,7 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) for (i=0; i<2; i++) { unsigned hash = rt_hash_code(daddr, skeys[i], tos); + read_lock_bh(&rt_hash_lock); for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) { if (rth->key.dst == daddr && rth->key.src == skeys[i] && @@ -890,6 +916,7 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) } } } + read_unlock_bh(&rt_hash_lock); } return est_mtu ? 
: new_mtu; } @@ -1362,6 +1389,7 @@ int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr, tos &= IPTOS_TOS_MASK; hash = rt_hash_code(daddr, saddr^(iif<<5), tos); + read_lock_bh(&rt_hash_lock); for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) { if (rth->key.dst == daddr && rth->key.src == saddr && @@ -1374,10 +1402,12 @@ int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr, rth->u.dst.lastuse = jiffies; atomic_inc(&rth->u.dst.use); atomic_inc(&rth->u.dst.refcnt); + read_unlock_bh(&rt_hash_lock); skb->dst = (struct dst_entry*)rth; return 0; } } + read_unlock_bh(&rt_hash_lock); /* Multicast recognition logic is moved from route cache to here. The problem was that too many Ethernet cards have broken/missing @@ -1657,7 +1687,7 @@ int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif) hash = rt_hash_code(daddr, saddr^(oif<<5), tos); - start_bh_atomic(); + read_lock_bh(&rt_hash_lock); for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) { if (rth->key.dst == daddr && rth->key.src == saddr && @@ -1673,12 +1703,12 @@ int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif) rth->u.dst.lastuse = jiffies; atomic_inc(&rth->u.dst.use); atomic_inc(&rth->u.dst.refcnt); - end_bh_atomic(); + read_unlock_bh(&rt_hash_lock); *rp = rth; return 0; } } - end_bh_atomic(); + read_unlock_bh(&rt_hash_lock); return ip_route_output_slow(rp, daddr, saddr, tos, oif); } @@ -1821,9 +1851,7 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) return -ENODEV; skb->protocol = __constant_htons(ETH_P_IP); skb->dev = dev; - start_bh_atomic(); err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); - end_bh_atomic(); rt = (struct rtable*)skb->dst; if (!err && rt->u.dst.error) err = -rt->u.dst.error; @@ -1869,7 +1897,7 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) if (h < s_h) continue; if (h > s_h) s_idx = 0; - start_bh_atomic(); + read_lock_bh(&rt_hash_lock); for (rt = rt_hash_table[h], idx = 0; rt; rt = rt->u.rt_next, idx++) { if (idx < s_idx) continue; @@ -1877,12 +1905,12 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) { dst_release(xchg(&skb->dst, NULL)); - end_bh_atomic(); + read_unlock_bh(&rt_hash_lock); goto done; } dst_release(xchg(&skb->dst, NULL)); } - end_bh_atomic(); + read_unlock_bh(&rt_hash_lock); } done: @@ -1968,6 +1996,7 @@ ctl_table ipv4_route_table[] = { #ifdef CONFIG_NET_CLS_ROUTE struct ip_rt_acct ip_rt_acct[256]; +rwlock_t ip_rt_acct_lock = RW_LOCK_UNLOCKED; #ifdef CONFIG_PROC_FS static int ip_rt_acct_read(char *buffer, char **start, off_t offset, @@ -1980,9 +2009,9 @@ static int ip_rt_acct_read(char *buffer, char **start, off_t offset, *eof = 1; } if (length > 0) { - start_bh_atomic(); + read_lock_bh(&ip_rt_acct_lock); memcpy(buffer, ((u8*)&ip_rt_acct)+offset, length); - end_bh_atomic(); + read_unlock_bh(&ip_rt_acct_lock); return length; } return 0; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8c1c9f9be..779c31cef 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.140 1999/04/22 10:34:31 davem Exp $ + * Version: $Id: tcp.c,v 1.144 1999/05/27 01:03:37 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -416,6 +416,7 @@ #include <linux/fcntl.h> #include <linux/poll.h> #include <linux/init.h> +#include <linux/smp_lock.h> #include <net/icmp.h> #include <net/tcp.h> @@ -432,7 +433,7 @@ kmem_cache_t *tcp_timewait_cachep; /* * Find someone to 'accept'. Must be called with - * the socket locked or with interrupts disabled + * the listening socket locked. */ static struct open_request *tcp_find_established(struct tcp_opt *tp, @@ -441,10 +442,11 @@ static struct open_request *tcp_find_established(struct tcp_opt *tp, struct open_request *req = tp->syn_wait_queue; struct open_request *prev = (struct open_request *)&tp->syn_wait_queue; while(req) { - if (req->sk && - ((1 << req->sk->state) & - ~(TCPF_SYN_SENT|TCPF_SYN_RECV))) - break; + if (req->sk) { + if((1 << req->sk->state) & + ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) + break; + } prev = req; req = req->dl_next; } @@ -655,12 +657,13 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) /* * Wait for a socket to get into the connected state * - * Note: must be called with the socket locked. + * Note: Must be called with the socket locked, and it + * runs with the kernel fully unlocked. */ static int wait_for_tcp_connect(struct sock * sk, int flags) { struct task_struct *tsk = current; - struct wait_queue wait = { tsk, NULL }; + DECLARE_WAITQUEUE(wait, tsk); while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { if(sk->err) @@ -698,12 +701,14 @@ static inline int tcp_memory_free(struct sock *sk) /* * Wait for more memory for a socket + * + * NOTE: This runs with the kernel fully unlocked. */ static void wait_for_tcp_memory(struct sock * sk) { release_sock(sk); if (!tcp_memory_free(sk)) { - struct wait_queue wait = { current, NULL }; + DECLARE_WAITQUEUE(wait, current); sk->socket->flags &= ~SO_NOSPACE; add_wait_queue(sk->sleep, &wait); @@ -744,6 +749,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) int mss_now; int err, copied; + unlock_kernel(); lock_sock(sk); err = 0; @@ -896,6 +902,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg) err = -ERESTARTSYS; goto do_interrupted; } + tcp_push_pending_frames(sk, tp); wait_for_tcp_memory(sk); /* If SACK's were formed or PMTU events happened, @@ -969,6 +976,7 @@ do_fault2: out: tcp_push_pending_frames(sk, tp); release_sock(sk); + lock_kernel(); return err; } @@ -1117,7 +1125,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, int len, int nonblock, int flags, int *addr_len) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct wait_queue wait = { current, NULL }; + DECLARE_WAITQUEUE(wait, current); int copied = 0; u32 peek_seq; volatile u32 *seq; /* So gcc doesn't overoptimise */ @@ -1148,6 +1156,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, if (flags & MSG_WAITALL) target=len; + unlock_kernel(); add_wait_queue(sk->sleep, &wait); lock_sock(sk); @@ -1300,6 +1309,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, /* We now will not sleep again until we are finished * with skb. Sorry if you are doing the SMP port * but you'll just have to fix it neatly ;) + * + * Very funny Alan... -DaveM */ atomic_dec(&skb->users); @@ -1344,6 +1355,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, /* Clean up data we have read: This will do ACK frames. */ cleanup_rbuf(sk, copied); release_sock(sk); + lock_kernel(); return copied; } @@ -1415,16 +1427,15 @@ void tcp_shutdown(struct sock *sk, int how) return; /* If we've already sent a FIN, or it's a closed state, skip this. 
*/ + lock_sock(sk); if ((1 << sk->state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) { - lock_sock(sk); /* Clear out any half completed packets. FIN if needed. */ if (tcp_close_state(sk,0)) tcp_send_fin(sk); - - release_sock(sk); } + release_sock(sk); } @@ -1471,13 +1482,6 @@ void tcp_close(struct sock *sk, long timeout) struct sk_buff *skb; int data_was_unread = 0; - /* - * Check whether the socket is locked ... supposedly - * it's impossible to tcp_close() a locked socket. - */ - if (atomic_read(&sk->sock_readers)) - printk("tcp_close: socket already locked!\n"); - /* We need to grab some memory, and put together a FIN, * and then put it into the queue to be sent. */ @@ -1491,6 +1495,8 @@ void tcp_close(struct sock *sk, long timeout) return; } + unlock_kernel(); + /* It is questionable, what the role of this is now. * In any event either it should be removed, or * increment of SLT_KEEPALIVE be done, this is causing @@ -1534,24 +1540,23 @@ void tcp_close(struct sock *sk, long timeout) if (timeout) { struct task_struct *tsk = current; - struct wait_queue wait = { tsk, NULL }; + DECLARE_WAITQUEUE(wait, current); add_wait_queue(sk->sleep, &wait); - release_sock(sk); while (1) { tsk->state = TASK_INTERRUPTIBLE; if (!closing(sk)) break; + release_sock(sk); timeout = schedule_timeout(timeout); + lock_sock(sk); if (signal_pending(tsk) || !timeout) break; } tsk->state = TASK_RUNNING; remove_wait_queue(sk->sleep, &wait); - - lock_sock(sk); } /* Now that the socket is dead, if we are in the FIN_WAIT2 state @@ -1559,23 +1564,40 @@ void tcp_close(struct sock *sk, long timeout) */ tcp_check_fin_timer(sk); - release_sock(sk); sk->dead = 1; + + release_sock(sk); + lock_kernel(); } /* * Wait for an incoming connection, avoid race - * conditions. This must be called with the socket locked. + * conditions. This must be called with the socket locked, + * and without the kernel lock held. */ static struct open_request * wait_for_connect(struct sock * sk, struct open_request **pprev) { - struct wait_queue wait = { current, NULL }; + DECLARE_WAITQUEUE(wait, current); struct open_request *req; - add_wait_queue(sk->sleep, &wait); + /* + * True wake-one mechanism for incoming connections: only + * one process gets woken up, not the 'whole herd'. + * Since we do not 'race & poll' for established sockets + * anymore, the common case will execute the loop only once. + * + * Subtle issue: "add_wait_queue_exclusive()" will be added + * after any current non-exclusive waiters, and we know that + * it will always _stay_ after any new non-exclusive waiters + * because all non-exclusive waiters are added at the + * beginning of the wait-queue. As such, it's ok to "drop" + * our exclusiveness temporarily when we get woken up without + * having to remove and re-insert us on the wait queue. 
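 *
 * The waker side needs nothing extra for this to work: the normal
 * wake_up_interruptible() on sk->sleep, issued when a child socket
 * reaches ESTABLISHED, first wakes whatever non-exclusive waiters
 * sit at the head of the queue (poll/select) and then wakes exactly
 * one exclusive accept() sleeper before stopping, which is the
 * wake-one behaviour described above.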
+ */ + add_wait_queue_exclusive(sk->sleep, &wait); for (;;) { - current->state = TASK_INTERRUPTIBLE; + current->state = TASK_EXCLUSIVE | TASK_INTERRUPTIBLE; release_sock(sk); schedule(); lock_sock(sk); @@ -1603,6 +1625,7 @@ struct sock *tcp_accept(struct sock *sk, int flags) struct sock *newsk = NULL; int error; + unlock_kernel(); lock_sock(sk); /* We need to make sure that this socket is listening, @@ -1633,16 +1656,17 @@ struct sock *tcp_accept(struct sock *sk, int flags) sk->ack_backlog--; if(sk->keepopen) tcp_inc_slow_timer(TCP_SLT_KEEPALIVE); - release_sock(sk); + lock_kernel(); return newsk; out: /* sk should be in LISTEN state, thus accept can use sk->err for - * internal purposes without stomping one anyone's feed. + * internal purposes without stomping on anyone's feed. */ sk->err = error; release_sock(sk); + lock_kernel(); return newsk; } @@ -1765,6 +1789,8 @@ extern void __skb_cb_too_small_for_tcp(int, int); void __init tcp_init(void) { struct sk_buff *skb = NULL; + unsigned long goal; + int order; if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)) __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), @@ -1790,4 +1816,37 @@ void __init tcp_init(void) NULL, NULL); if(!tcp_timewait_cachep) panic("tcp_init: Cannot alloc tcp_tw_bucket cache."); + + /* Size and allocate the main established and bind bucket + * hash tables. + * + * The methodology is similar to that of the buffer cache. + */ + goal = num_physpages >> (20 - PAGE_SHIFT); + for(order = 5; (1UL << order) < goal; order++) + ; + do { + tcp_ehash_size = (1UL << order) * PAGE_SIZE / + sizeof(struct sock *); + tcp_ehash = (struct sock **) + __get_free_pages(GFP_ATOMIC, order); + } while (tcp_ehash == NULL && --order > 4); + + if (!tcp_ehash) + panic("Failed to allocate TCP established hash table\n"); + memset(tcp_ehash, 0, tcp_ehash_size * sizeof(struct sock *)); + + do { + tcp_bhash_size = (1UL << order) * PAGE_SIZE / + sizeof(struct tcp_bind_bucket *); + tcp_bhash = (struct tcp_bind_bucket **) + __get_free_pages(GFP_ATOMIC, order); + } while (tcp_bhash == NULL && --order > 4); + + if (!tcp_bhash) + panic("Failed to allocate TCP bind hash table\n"); + memset(tcp_bhash, 0, tcp_bhash_size * sizeof(struct tcp_bind_bucket *)); + + printk("TCP: Hash tables configured (established %d bind %d)\n", + tcp_ehash_size, tcp_bhash_size); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4a607a749..af4165fce 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.164 1999/05/08 21:09:52 davem Exp $ + * Version: $Id: tcp_input.c,v 1.169 1999/06/09 08:29:13 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -748,7 +748,6 @@ static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp, static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) { struct sk_buff *skb = skb_peek(&sk->write_queue); - __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when); /* Some data was ACK'd, if still retransmitting (due to a * timeout), resend more of the retransmit queue. 
The @@ -758,6 +757,9 @@ static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) tcp_xmit_retransmit_queue(sk); tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); } else { + __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when); + if ((__s32)when < 0) + when = 1; tcp_reset_xmit_timer(sk, TIME_RETRANS, when); } } @@ -785,8 +787,6 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una)) goto uninteresting_ack; - dst_confirm(sk->dst_cache); - /* If there is data set flag 1 */ if (len != th->doff*4) { flag |= FLAG_DATA; @@ -882,6 +882,24 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, /* Clear any aborted fast retransmit starts. */ tp->dup_acks = 0; } + /* It is not a brain fart, I thought a bit now. 8) + * + * Forward progress is indicated, if: + * 1. the ack acknowledges new data. + * 2. or the ack is duplicate, but it is caused by new segment + * arrival. This case is filtered by: + * - it contains no data, syn or fin. + * - it does not update window. + * 3. or new SACK. It is difficult to check, so that we ignore it. + * + * Forward progress is also indicated by arrival new data, + * which was caused by window open from our side. This case is more + * difficult and it is made (alas, incorrectly) in tcp_data_queue(). + * --ANK (990513) + */ + if (ack != tp->snd_una || (flag == 0 && !th->fin)) + dst_confirm(sk->dst_cache); + /* Remember the highest ack received. */ tp->snd_una = ack; return 1; @@ -896,8 +914,11 @@ extern void tcp_tw_schedule(struct tcp_tw_bucket *tw); extern void tcp_tw_reschedule(struct tcp_tw_bucket *tw); extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw); +/* Must be called only from BH context. */ void tcp_timewait_kill(struct tcp_tw_bucket *tw) { + SOCKHASH_LOCK_WRITE_BH(); + /* Unlink from various places. */ if(tw->bind_next) tw->bind_next->bind_pprev = tw->bind_pprev; @@ -915,6 +936,8 @@ void tcp_timewait_kill(struct tcp_tw_bucket *tw) tw->sklist_next->sklist_prev = tw->sklist_prev; tw->sklist_prev->sklist_next = tw->sklist_next; + SOCKHASH_UNLOCK_WRITE_BH(); + /* Ok, now free it up. */ kmem_cache_free(tcp_timewait_cachep, tw); } @@ -945,6 +968,7 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, struct sock *sk; struct tcp_func *af_specific = tw->af_specific; __u32 isn; + int ret; isn = tw->rcv_nxt + 128000; if(isn == 0) @@ -953,14 +977,25 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, tcp_timewait_kill(tw); sk = af_specific->get_sock(skb, th); if(sk == NULL || - !ipsec_sk_policy(sk,skb) || - atomic_read(&sk->sock_readers) != 0) + !ipsec_sk_policy(sk,skb)) return 0; + + bh_lock_sock(sk); + + /* Default is to discard the frame. */ + ret = 0; + + if(sk->lock.users) + goto out_unlock; + skb_set_owner_r(skb, sk); af_specific = sk->tp_pinfo.af_tcp.af_specific; + if(af_specific->conn_request(sk, skb, isn) < 0) - return 1; /* Toss a reset back. */ - return 0; /* Discard the frame. */ + ret = 1; /* Toss a reset back. */ + out_unlock: + bh_unlock_sock(sk); + return ret; } /* Check RST or SYN */ @@ -1013,7 +1048,7 @@ static __inline__ void tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *t sk->prot->inuse--; /* Step 4: Hash TW into TIMEWAIT half of established hash table. 
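 *
 * With the dynamically sized table the split works out as: chains
 * [0, tcp_ehash_size/2) hold ESTABLISHED sockets (tcp_hashfn() masks
 * with (tcp_ehash_size >> 1) - 1), while chains [tcp_ehash_size/2,
 * tcp_ehash_size) hold TIME_WAIT buckets, so the TIME_WAIT chain is
 * simply the socket's established chain offset by
 * (tcp_ehash_size >> 1), as done below.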
*/ - head = &tcp_established_hash[sk->hashent + (TCP_HTABLE_SIZE/2)]; + head = &tcp_ehash[sk->hashent + (tcp_ehash_size >> 1)]; sktw = (struct sock *)tw; if((sktw->next = *head) != NULL) (*head)->pprev = &sktw->next; @@ -1051,7 +1086,9 @@ void tcp_time_wait(struct sock *sk) } #endif /* Linkage updates. */ + SOCKHASH_LOCK_WRITE(); tcp_tw_hashdance(sk, tw); + SOCKHASH_UNLOCK_WRITE(); /* Get the TIME_WAIT timeout firing. */ tcp_tw_schedule(tw); @@ -1801,7 +1838,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, } } - flg = *(((u32 *)th) + 3) & ~htonl(0x8 << 16); + flg = *(((u32 *)th) + 3) & ~htonl(0xFC8 << 16); /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_predition is to be made @@ -2031,8 +2068,26 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* These use the socket TOS.. * might want to be the received TOS */ - if(th->ack) - return 1; + if(th->ack) { + struct sock *realsk; + int ret; + + realsk = tp->af_specific->get_sock(skb, th); + if(realsk == sk) + return 1; + + bh_lock_sock(realsk); + ret = 0; + if(realsk->lock.users != 0) { + skb_orphan(skb); + sk_add_backlog(realsk, skb); + } else { + ret = tcp_rcv_state_process(realsk, skb, + skb->h.th, skb->len); + } + bh_unlock_sock(realsk); + return ret; + } if(th->syn) { if(tp->af_specific->conn_request(sk, skb, 0) < 0) @@ -2067,21 +2122,81 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * not be in line code. [AC] */ if(th->ack) { - tp->snd_wl1 = TCP_SKB_CB(skb)->seq; - - /* We got an ack, but it's not a good ack. */ - if(!tcp_ack(sk,th, TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->ack_seq, len)) + /* rfc793: + * "If the state is SYN-SENT then + * first check the ACK bit + * If the ACK bit is set + * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send + * a reset (unless the RST bit is set, if so drop + * the segment and return)" + * + * I cite this place to emphasize one essential + * detail, this check is different of one + * in established state: SND.UNA <= SEG.ACK <= SND.NXT. + * SEG_ACK == SND.UNA == ISS is invalid in SYN-SENT, + * because we have no previous data sent before SYN. + * --ANK(990513) + * + * We do not send data with SYN, so that RFC-correct + * test reduces to: + */ + if (sk->zapped || + TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) return 1; - if(th->rst) { + /* Now ACK is acceptable. + * + * "If the RST bit is set + * If the ACK was acceptable then signal the user "error: + * connection reset", drop the segment, enter CLOSED state, + * delete TCB, and return." + */ + + if (th->rst) { tcp_reset(sk); goto discard; } - if(!th->syn) + /* rfc793: + * "fifth, if neither of the SYN or RST bits is set then + * drop the segment and return." + * + * See note below! + * --ANK(990513) + */ + + if (!th->syn) goto discard; + /* rfc793: + * "If the SYN bit is on ... + * are acceptable then ... + * (our SYN has been ACKed), change the connection + * state to ESTABLISHED..." + * + * Do you see? SYN-less ACKs in SYN-SENT state are + * completely ignored. + * + * The bug causing stalled SYN-SENT sockets + * was here: tcp_ack advanced snd_una and canceled + * retransmit timer, so that bare ACK received + * in SYN-SENT state (even with invalid ack==ISS, + * because tcp_ack check is too weak for SYN-SENT) + * causes moving socket to invalid semi-SYN-SENT, + * semi-ESTABLISHED state and connection hangs. + * + * There exist buggy stacks, which really send + * such ACKs: f.e. 
202.226.91.94 (okigate.oki.co.jp) + * Actually, if this host did not try to get something + * from ftp.inr.ac.ru I'd never find this bug 8) + * + * --ANK (990514) + */ + + tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tcp_ack(sk,th, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->ack_seq, len); + /* Ok.. it's good. Set up sequence numbers and * move to established. */ @@ -2206,8 +2321,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, !(th->fin && TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)) { if (!th->rst) { tcp_send_ack(sk); - goto discard; } + goto discard; } /* step 2: check RST bit */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b5070c3a7..564e859f2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.175 1999/05/08 21:09:54 davem Exp $ + * Version: $Id: tcp_ipv4.c,v 1.180 1999/06/09 08:29:19 davem Exp $ * * IPv4 specific functions * @@ -90,12 +90,14 @@ void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, * First half of the table is for sockets not in TIME_WAIT, second half * is for TIME_WAIT sockets only. */ -struct sock *tcp_established_hash[TCP_HTABLE_SIZE]; +struct sock **tcp_ehash; +int tcp_ehash_size; /* Ok, let's try this, I give up, we do need a local binding * TCP hash as well as the others for fast bind/connect. */ -struct tcp_bind_bucket *tcp_bound_hash[TCP_BHTABLE_SIZE]; +struct tcp_bind_bucket **tcp_bhash; +int tcp_bhash_size; /* All sockets in TCP_LISTEN state will be in here. This is the only table * where wildcard'd TCP sockets can exist. Hash function here is just local @@ -117,7 +119,7 @@ int tcp_port_rover = (1024 - 1); static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport, __u32 faddr, __u16 fport) { - return ((laddr ^ lport) ^ (faddr ^ fport)) & ((TCP_HTABLE_SIZE/2) - 1); + return ((laddr ^ lport) ^ (faddr ^ fport)) & ((tcp_ehash_size >> 1) - 1); } static __inline__ int tcp_sk_hashfn(struct sock *sk) @@ -136,8 +138,8 @@ void tcp_bucket_unlock(struct sock *sk) struct tcp_bind_bucket *tb; unsigned short snum = sk->num; - SOCKHASH_LOCK(); - for(tb = tcp_bound_hash[tcp_bhashfn(snum)]; tb; tb = tb->next) { + SOCKHASH_LOCK_WRITE(); + for(tb = tcp_bhash[tcp_bhashfn(snum)]; tb; tb = tb->next) { if(tb->port == snum) { if(tb->owners == NULL && (tb->flags & TCPB_FLAG_LOCKED)) { @@ -148,9 +150,10 @@ void tcp_bucket_unlock(struct sock *sk) break; } } - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); } +/* The sockhash lock must be held as a writer here. 
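 * tcp_bucket_create() splices the new bucket onto the head of its
 * tcp_bhash chain, and callers such as tcp_bucket_check() rely on
 * the search-then-create sequence being atomic, so a read lock is
 * not sufficient here.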
*/ struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum) { struct tcp_bind_bucket *tb; @@ -158,7 +161,7 @@ struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum) tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC); if(tb != NULL) { struct tcp_bind_bucket **head = - &tcp_bound_hash[tcp_bhashfn(snum)]; + &tcp_bhash[tcp_bhashfn(snum)]; tb->port = snum; tb->flags = TCPB_FLAG_LOCKED; tb->owners = NULL; @@ -176,13 +179,18 @@ struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum) */ static __inline__ int tcp_bucket_check(unsigned short snum) { - struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(snum)]; + struct tcp_bind_bucket *tb; + int ret = 0; + + SOCKHASH_LOCK_WRITE(); + tb = tcp_bhash[tcp_bhashfn(snum)]; for( ; (tb && (tb->port != snum)); tb = tb->next) ; if(tb == NULL && tcp_bucket_create(snum) == NULL) - return 1; - else - return 0; + ret = 1; + SOCKHASH_UNLOCK_WRITE(); + + return ret; } #endif @@ -191,8 +199,8 @@ static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum) struct tcp_bind_bucket *tb; int result = 0; - SOCKHASH_LOCK(); - for(tb = tcp_bound_hash[tcp_bhashfn(snum)]; + SOCKHASH_LOCK_WRITE(); + for(tb = tcp_bhash[tcp_bhashfn(snum)]; (tb && (tb->port != snum)); tb = tb->next) ; @@ -256,7 +264,7 @@ static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum) } } go_like_smoke: - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); return result; } @@ -268,13 +276,13 @@ unsigned short tcp_good_socknum(void) int remaining = (high - low) + 1; int rover; - SOCKHASH_LOCK(); + SOCKHASH_LOCK_WRITE(); rover = tcp_port_rover; do { rover += 1; if((rover < low) || (rover > high)) rover = low; - tb = tcp_bound_hash[tcp_bhashfn(rover)]; + tb = tcp_bhash[tcp_bhashfn(rover)]; for( ; tb; tb = tb->next) { if(tb->port == rover) goto next; @@ -288,7 +296,7 @@ unsigned short tcp_good_socknum(void) rover = 0; if (tb != NULL) tb->flags |= TCPB_FLAG_GOODSOCKNUM; - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); return rover; } @@ -298,20 +306,20 @@ static void tcp_v4_hash(struct sock *sk) if (sk->state != TCP_CLOSE) { struct sock **skp; - SOCKHASH_LOCK(); - skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))]; + SOCKHASH_LOCK_WRITE(); + skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))]; if((sk->next = *skp) != NULL) (*skp)->pprev = &sk->next; *skp = sk; sk->pprev = skp; tcp_sk_bindify(sk); - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); } } static void tcp_v4_unhash(struct sock *sk) { - SOCKHASH_LOCK(); + SOCKHASH_LOCK_WRITE(); if(sk->pprev) { if(sk->next) sk->next->pprev = sk->pprev; @@ -320,14 +328,14 @@ static void tcp_v4_unhash(struct sock *sk) tcp_reg_zap(sk); tcp_sk_unbindify(sk); } - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); } static void tcp_v4_rehash(struct sock *sk) { unsigned char state; - SOCKHASH_LOCK(); + SOCKHASH_LOCK_WRITE(); state = sk->state; if(sk->pprev != NULL) { if(sk->next) @@ -342,7 +350,7 @@ static void tcp_v4_rehash(struct sock *sk) if(state == TCP_LISTEN) skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; else - skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))]; + skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))]; if((sk->next = *skp) != NULL) (*skp)->pprev = &sk->next; @@ -351,7 +359,7 @@ static void tcp_v4_rehash(struct sock *sk) if(state == TCP_LISTEN) tcp_sk_bindify(sk); } - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); } /* Don't inline this cruft. 
Here are some nice properties to @@ -395,10 +403,10 @@ static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int d /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM - * It is assumed that this code only gets called from within NET_BH. + * + * The sockhash lock must be held as a reader here. */ -static inline struct sock *__tcp_v4_lookup(struct tcphdr *th, - u32 saddr, u16 sport, +static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) { TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) @@ -416,7 +424,7 @@ static inline struct sock *__tcp_v4_lookup(struct tcphdr *th, * have wildcards anyways. */ hash = tcp_hashfn(daddr, hnum, saddr, sport); - for(sk = tcp_established_hash[hash]; sk; sk = sk->next) { + for(sk = tcp_ehash[hash]; sk; sk = sk->next) { if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) { if (sk->state == TCP_ESTABLISHED) TCP_RHASH(sport) = sk; @@ -424,7 +432,7 @@ static inline struct sock *__tcp_v4_lookup(struct tcphdr *th, } } /* Must check for a TIME_WAIT'er before going to listener hash. */ - for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next) + for(sk = tcp_ehash[hash+(tcp_ehash_size >> 1)]; sk; sk = sk->next) if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) goto hit; sk = tcp_v4_lookup_listener(daddr, hnum, dif); @@ -434,7 +442,13 @@ hit: __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) { - return __tcp_v4_lookup(0, saddr, sport, daddr, dport, dif); + struct sock *sk; + + SOCKHASH_LOCK_READ(); + sk = __tcp_v4_lookup(saddr, sport, daddr, dport, dif); + SOCKHASH_UNLOCK_READ(); + + return sk; } #ifdef CONFIG_IP_TRANSPARENT_PROXY @@ -462,9 +476,12 @@ static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr, paddr = idev->ifa_list->ifa_local; } - /* This code must run only from NET_BH. */ + /* We must obtain the sockhash lock here, we are always + * in BH context. + */ + SOCKHASH_LOCK_READ_BH(); { - struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hnum)]; + struct tcp_bind_bucket *tb = tcp_bhash[tcp_bhashfn(hnum)]; for( ; (tb && tb->port != hnum); tb = tb->next) ; if(tb == NULL) @@ -505,7 +522,7 @@ pass2: } next: if(firstpass--) { - struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hpnum)]; + struct tcp_bind_bucket *tb = tcp_bhash[tcp_bhashfn(hpnum)]; for( ; (tb && tb->port != hpnum); tb = tb->next) ; if(tb) { @@ -514,6 +531,7 @@ next: } } gotit: + SOCKHASH_UNLOCK_READ_BH(); return result; } #endif /* CONFIG_IP_TRANSPARENT_PROXY */ @@ -540,21 +558,23 @@ static int tcp_v4_unique_address(struct sock *sk) int retval = 1; /* Freeze the hash while we snoop around. */ - SOCKHASH_LOCK(); - tb = tcp_bound_hash[tcp_bhashfn(snum)]; + SOCKHASH_LOCK_READ(); + tb = tcp_bhash[tcp_bhashfn(snum)]; for(; tb; tb = tb->next) { if(tb->port == snum && tb->owners != NULL) { /* Almost certainly the re-use port case, search the real hashes * so it actually scales. 
*/ - sk = __tcp_v4_lookup(NULL, sk->daddr, sk->dport, + sk = __tcp_v4_lookup(sk->daddr, sk->dport, sk->rcv_saddr, snum, sk->bound_dev_if); + SOCKHASH_UNLOCK_READ(); + if((sk != NULL) && (sk->state != TCP_LISTEN)) retval = 0; - break; + return retval; } } - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_READ(); return retval; } @@ -727,16 +747,17 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; - if (atomic_read(&sk->sock_readers)) - return; - - /* Don't interested in TCP_LISTEN and open_requests (SYN-ACKs + /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs * send out by Linux are always <576bytes so they should go through * unfragmented). */ if (sk->state == TCP_LISTEN) return; + bh_lock_sock(sk); + if(sk->lock.users != 0) + goto out; + /* We don't check in the destentry if pmtu discovery is forbidden * on this route. We just assume that no packet_to_big packets * are send back when pmtu discovery is not active. @@ -744,7 +765,8 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned * route, but I think that's acceptable. */ if (sk->dst_cache == NULL) - return; + goto out; + ip_rt_update_pmtu(sk->dst_cache, mtu); if (sk->ip_pmtudisc != IP_PMTUDISC_DONT && tp->pmtu_cookie > sk->dst_cache->pmtu) { @@ -757,6 +779,8 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned */ tcp_simple_retransmit(sk); } /* else let the usual retransmit timer handle it */ +out: + bh_unlock_sock(sk); } /* @@ -849,17 +873,6 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) switch (sk->state) { struct open_request *req, *prev; case TCP_LISTEN: - /* Prevent race conditions with accept() - - * ICMP is unreliable. - */ - if (atomic_read(&sk->sock_readers)) { - net_statistics.LockDroppedIcmps++; - /* If too many ICMPs get dropped on busy - * servers this needs to be solved differently. - */ - return; - } - /* The final ACK of the handshake should be already * handled in the new socket context, not here. * Strictly speaking - an ICMP error for the final @@ -869,12 +882,24 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) if (!no_flags && !th->syn && !th->ack) return; + /* Prevent race conditions with accept() - + * ICMP is unreliable. + */ + bh_lock_sock(sk); + if (sk->lock.users != 0) { + net_statistics.LockDroppedIcmps++; + /* If too many ICMPs get dropped on busy + * servers this needs to be solved differently. + */ + goto out_unlock; + } + req = tcp_v4_search_req(tp, iph, th, &prev); if (!req) - return; + goto out_unlock; if (seq != req->snt_isn) { net_statistics.OutOfWindowIcmps++; - return; + goto out_unlock; } if (req->sk) { /* @@ -884,6 +909,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) * but only with the next operation on the socket after * accept. 
*/ + bh_unlock_sock(sk); sk = req->sk; } else { /* @@ -896,6 +922,8 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) tcp_synq_unlink(tp, req, prev); req->class->destructor(req); tcp_openreq_free(req); + out_unlock: + bh_unlock_sock(sk); return; } break; @@ -1025,9 +1053,10 @@ static struct sock *tcp_v4_search_proxy_openreq(struct sk_buff *skb) { struct iphdr *iph = skb->nh.iph; struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4); - struct sock *sk; + struct sock *sk = NULL; int i; + SOCKHASH_LOCK_READ(); for (i=0; i<TCP_LHTABLE_SIZE; i++) { for(sk = tcp_listening_hash[i]; sk; sk = sk->next) { struct open_request *dummy; @@ -1035,10 +1064,12 @@ static struct sock *tcp_v4_search_proxy_openreq(struct sk_buff *skb) th, &dummy) && (!sk->bound_dev_if || sk->bound_dev_if == skb->dev->ifindex)) - return sk; + goto out; } } - return NULL; +out: + SOCKHASH_UNLOCK_READ(); + return sk; } /* @@ -1319,7 +1350,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, /* Clone the TCP header template */ newsk->dport = req->rmt_port; - atomic_set(&newsk->sock_readers, 0); + sock_lock_init(newsk); + atomic_set(&newsk->rmem_alloc, 0); skb_queue_head_init(&newsk->receive_queue); atomic_set(&newsk->wmem_alloc, 0); @@ -1328,9 +1360,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newsk->done = 0; newsk->proc = 0; - newsk->pair = NULL; - skb_queue_head_init(&newsk->back_log); + newsk->backlog.head = newsk->backlog.tail = NULL; skb_queue_head_init(&newsk->error_queue); + newsk->write_space = tcp_write_space; #ifdef CONFIG_FILTER if ((filter = newsk->filter) != NULL) sk_filter_charge(newsk, filter); @@ -1552,7 +1584,8 @@ static inline struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb) } /* Check for SYN|ACK */ - if (flg & __constant_htonl(0x00120000)) { + flg &= __constant_htonl(0x00120000); + if (flg) { struct open_request *req, *dummy; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); @@ -1570,8 +1603,17 @@ static inline struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb) return sk; } +/* The socket must have it's spinlock held when we get + * here. + * + * We have a potential double-lock case here, so even when + * doing backlog processing we use the BH locking scheme. + * This is because we cannot sleep with the original spinlock + * held. + */ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { + int need_unlock = 0; #ifdef CONFIG_FILTER struct sk_filter *filter = sk->filter; if (filter && sk_filter(skb, filter)) @@ -1591,7 +1633,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; } - if (sk->state == TCP_LISTEN) { struct sock *nsk; @@ -1604,17 +1645,22 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) * otherwise we just shortcircuit this and continue with * the new socket.. */ - if (atomic_read(&nsk->sock_readers)) { - skb_orphan(skb); - __skb_queue_tail(&nsk->back_log, skb); - return 0; + if (nsk != sk) { + bh_lock_sock(nsk); + if (nsk->lock.users != 0) { + skb_orphan(skb); + sk_add_backlog(nsk, skb); + bh_unlock_sock(nsk); + return 0; + } + need_unlock = 1; + sk = nsk; } - sk = nsk; } if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) goto reset; - return 0; + goto out_maybe_unlock; reset: tcp_v4_send_reset(skb); @@ -1625,6 +1671,9 @@ discard: * might be destroyed here. This current version compiles correctly, * but you have been warned. 
*/ +out_maybe_unlock: + if(need_unlock) + bh_unlock_sock(sk); return 0; } @@ -1636,6 +1685,7 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) { struct tcphdr *th; struct sock *sk; + int ret; if (skb->pkt_type!=PACKET_HOST) goto discard_it; @@ -1681,8 +1731,10 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) IPCB(skb)->redirport, skb->dev->ifindex); else { #endif - sk = __tcp_v4_lookup(th, skb->nh.iph->saddr, th->source, + SOCKHASH_LOCK_READ_BH(); + sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source, skb->nh.iph->daddr, th->dest, skb->dev->ifindex); + SOCKHASH_UNLOCK_READ_BH(); #ifdef CONFIG_IP_TRANSPARENT_PROXY if (!sk) sk = tcp_v4_search_proxy_openreq(skb); @@ -1702,11 +1754,16 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) if (sk->state == TCP_TIME_WAIT) goto do_time_wait; - if (!atomic_read(&sk->sock_readers)) - return tcp_v4_do_rcv(sk, skb); - __skb_queue_tail(&sk->back_log, skb); - return 0; + bh_lock_sock(sk); + ret = 0; + if (!sk->lock.users) + ret = tcp_v4_do_rcv(sk, skb); + else + sk_add_backlog(sk, skb); + bh_unlock_sock(sk); + + return ret; no_tcp_socket: tcp_v4_send_reset(skb); @@ -1944,6 +2001,8 @@ __initfunc(void tcp_v4_init(struct net_proto_family *ops)) tcp_inode.i_sock = 1; tcp_inode.i_uid = 0; tcp_inode.i_gid = 0; + init_waitqueue_head(&tcp_inode.i_wait); + init_waitqueue_head(&tcp_inode.u.socket_i.wait); tcp_socket->inode = &tcp_inode; tcp_socket->state = SS_UNCONNECTED; @@ -1952,6 +2011,11 @@ __initfunc(void tcp_v4_init(struct net_proto_family *ops)) if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0) panic("Failed to create the TCP control socket.\n"); tcp_socket->sk->allocation=GFP_ATOMIC; - tcp_socket->sk->num = 256; /* Don't receive any data */ tcp_socket->sk->ip_ttl = MAXTTL; + + /* Unhash it so that IP input processing does not even + * see it, we do not wish this socket to see incoming + * packets. + */ + tcp_socket->sk->prot->unhash(tcp_socket->sk); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9a096f0f3..18b5ebf80 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_output.c,v 1.108 1999/05/08 21:48:59 davem Exp $ + * Version: $Id: tcp_output.c,v 1.110 1999/05/27 00:37:45 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -36,6 +36,8 @@ #include <net/tcp.h> +#include <linux/smp_lock.h> + extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; @@ -240,6 +242,11 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) /* Rechecksum original buffer. */ skb->csum = csum_partial(skb->data, skb->len, 0); + /* Looks stupid, but our code really uses when of + * skbs, which it never sent before. --ANK + */ + TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; + /* Link BUFF into the send queue. */ __skb_append(skb, buff); @@ -961,6 +968,7 @@ void tcp_connect(struct sock *sk, struct sk_buff *buff, int mtu) /* Ok, now lock the socket before we make it visible to * the incoming packet engine. */ + unlock_kernel(); lock_sock(sk); /* Socket identity change complete, no longer @@ -988,6 +996,7 @@ void tcp_connect(struct sock *sk, struct sk_buff *buff, int mtu) /* Now, it is safe to release the socket. 
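 * (The lock_kernel() below pairs with the unlock_kernel() added just
 *  before lock_sock() earlier in this function; as in the other
 *  tcp.c paths in this patch, it is the socket lock rather than the
 *  big kernel lock that protects us once the socket is visible to
 *  the incoming packet engine.)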
*/ release_sock(sk); + lock_kernel(); } /* Send out a delayed ack, the caller does the policy checking diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index ad6ccace9..d23eef143 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_timer.c,v 1.62 1999/05/08 21:09:55 davem Exp $ + * Version: $Id: tcp_timer.c,v 1.64 1999/05/27 00:37:31 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -168,15 +168,16 @@ void tcp_delack_timer(unsigned long data) { struct sock *sk = (struct sock*)data; + bh_lock_sock(sk); if(!sk->zapped && sk->tp_pinfo.af_tcp.delayed_acks && sk->state != TCP_CLOSE) { - /* If socket is currently locked, defer the ACK. */ - if (!atomic_read(&sk->sock_readers)) + if (!sk->lock.users) tcp_send_ack(sk); else tcp_send_delayed_ack(&(sk->tp_pinfo.af_tcp), HZ/10); } + bh_unlock_sock(sk); } void tcp_probe_timer(unsigned long data) @@ -187,9 +188,11 @@ void tcp_probe_timer(unsigned long data) if(sk->zapped) return; - if (atomic_read(&sk->sock_readers)) { + bh_lock_sock(sk); + if (sk->lock.users) { /* Try again later. */ tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ/5); + bh_unlock_sock(sk); return; } @@ -216,6 +219,7 @@ void tcp_probe_timer(unsigned long data) /* Only send another probe if we didn't close things up. */ tcp_send_probe0(sk); } + bh_unlock_sock(sk); } static __inline__ int tcp_keepopen_proc(struct sock *sk) @@ -253,8 +257,9 @@ static void tcp_bucketgc(unsigned long data) { int i, reaped = 0;; - for(i = 0; i < TCP_BHTABLE_SIZE; i++) { - struct tcp_bind_bucket *tb = tcp_bound_hash[i]; + SOCKHASH_LOCK_WRITE_BH(); + for(i = 0; i < tcp_bhash_size; i++) { + struct tcp_bind_bucket *tb = tcp_bhash[i]; while(tb) { struct tcp_bind_bucket *next = tb->next; @@ -274,6 +279,8 @@ static void tcp_bucketgc(unsigned long data) tb = next; } } + SOCKHASH_UNLOCK_WRITE_BH(); + if(reaped != 0) { struct tcp_sl_timer *slt = (struct tcp_sl_timer *)data; @@ -294,8 +301,14 @@ static void tcp_twkill(unsigned long data) struct tcp_tw_bucket *tw; int killed = 0; + /* The death-row tw chains are only ever touched + * in BH context so no locking is needed. + */ tw = tcp_tw_death_row[tcp_tw_death_row_slot]; tcp_tw_death_row[tcp_tw_death_row_slot] = NULL; + tcp_tw_death_row_slot = + ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); + while(tw != NULL) { struct tcp_tw_bucket *next = tw->next_death; @@ -307,8 +320,6 @@ static void tcp_twkill(unsigned long data) struct tcp_sl_timer *slt = (struct tcp_sl_timer *)data; atomic_sub(killed, &slt->count); } - tcp_tw_death_row_slot = - ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); } /* These are always called from BH context. 
See callers in @@ -319,12 +330,14 @@ void tcp_tw_schedule(struct tcp_tw_bucket *tw) int slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1); struct tcp_tw_bucket **tpp = &tcp_tw_death_row[slot]; + SOCKHASH_LOCK_WRITE_BH(); if((tw->next_death = *tpp) != NULL) (*tpp)->pprev_death = &tw->next_death; *tpp = tw; tw->pprev_death = tpp; tw->death_slot = slot; + SOCKHASH_UNLOCK_WRITE_BH(); tcp_inc_slow_timer(TCP_SLT_TWKILL); } @@ -335,6 +348,7 @@ void tcp_tw_reschedule(struct tcp_tw_bucket *tw) struct tcp_tw_bucket **tpp; int slot; + SOCKHASH_LOCK_WRITE_BH(); if(tw->next_death) tw->next_death->pprev_death = tw->pprev_death; *tw->pprev_death = tw->next_death; @@ -348,16 +362,21 @@ void tcp_tw_reschedule(struct tcp_tw_bucket *tw) tw->pprev_death = tpp; tw->death_slot = slot; + SOCKHASH_UNLOCK_WRITE_BH(); + /* Timer was incremented when we first entered the table. */ } /* This is for handling early-kills of TIME_WAIT sockets. */ void tcp_tw_deschedule(struct tcp_tw_bucket *tw) { + SOCKHASH_LOCK_WRITE_BH(); if(tw->next_death) tw->next_death->pprev_death = tw->pprev_death; *tw->pprev_death = tw->next_death; tw->pprev_death = NULL; + SOCKHASH_UNLOCK_WRITE_BH(); + tcp_dec_slow_timer(TCP_SLT_TWKILL); } @@ -399,20 +418,30 @@ static void tcp_keepalive(unsigned long data) int count = 0; int i; - for(i = chain_start; i < (chain_start + ((TCP_HTABLE_SIZE/2) >> 2)); i++) { - struct sock *sk = tcp_established_hash[i]; + SOCKHASH_LOCK_READ_BH(); + for(i = chain_start; i < (chain_start + ((tcp_ehash_size >> 1) >> 2)); i++) { + struct sock *sk; + + sk = tcp_ehash[i]; while(sk) { - if(!atomic_read(&sk->sock_readers) && sk->keepopen) { + struct sock *next = sk->next; + + bh_lock_sock(sk); + if (sk->keepopen && !sk->lock.users) { + SOCKHASH_UNLOCK_READ_BH(); count += tcp_keepopen_proc(sk); - if(count == sysctl_tcp_max_ka_probes) - goto out; + SOCKHASH_LOCK_READ_BH(); } - sk = sk->next; + bh_unlock_sock(sk); + if(count == sysctl_tcp_max_ka_probes) + goto out; + sk = next; } } out: - chain_start = ((chain_start + ((TCP_HTABLE_SIZE/2)>>2)) & - ((TCP_HTABLE_SIZE/2) - 1)); + SOCKHASH_UNLOCK_READ_BH(); + chain_start = ((chain_start + ((tcp_ehash_size >> 1)>>2)) & + ((tcp_ehash_size >> 1) - 1)); } /* @@ -439,9 +468,11 @@ void tcp_retransmit_timer(unsigned long data) return; } - if (atomic_read(&sk->sock_readers)) { + bh_lock_sock(sk); + if (sk->lock.users) { /* Try again later */ tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ/20); + bh_unlock_sock(sk); return; } @@ -508,12 +539,51 @@ void tcp_retransmit_timer(unsigned long data) tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); tcp_write_timeout(sk); + + bh_unlock_sock(sk); } /* * Slow timer for SYN-RECV sockets */ +static void tcp_do_syn_queue(struct sock *sk, struct tcp_opt *tp, unsigned long now) +{ + struct open_request *prev, *req; + + prev = (struct open_request *) &tp->syn_wait_queue; + for(req = tp->syn_wait_queue; req; ) { + struct open_request *next = req->dl_next; + + if (! req->sk) { + tcp_synq_unlink(tp, req, prev); + if(req->retrans >= sysctl_tcp_retries1) { + (*req->class->destructor)(req); + tcp_dec_slow_timer(TCP_SLT_SYNACK); + tp->syn_backlog--; + tcp_openreq_free(req); + if (! 
tp->syn_wait_queue) + break; + } else { + unsigned long timeo; + struct open_request *rp; + + (*req->class->rtx_syn_ack)(sk, req); + req->retrans++; + timeo = min((TCP_TIMEOUT_INIT << req->retrans), + (120 * HZ)); + req->expires = now + timeo; + rp = prev->dl_next; + tcp_synq_queue(tp, req); + if(rp != prev->dl_next) + prev = prev->dl_next; + } + } else + prev = req; + req = next; + } +} + /* This now scales very nicely. -DaveM */ static void tcp_syn_recv_timer(unsigned long data) { @@ -521,70 +591,21 @@ static void tcp_syn_recv_timer(unsigned long data) unsigned long now = jiffies; int i; + SOCKHASH_LOCK_READ_BH(); for(i = 0; i < TCP_LHTABLE_SIZE; i++) { sk = tcp_listening_hash[i]; - while(sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; /* TCP_LISTEN is implied. */ - if (!atomic_read(&sk->sock_readers) && tp->syn_wait_queue) { - struct open_request *prev = (struct open_request *)(&tp->syn_wait_queue); - struct open_request *req = tp->syn_wait_queue; - do { - struct open_request *conn; - - conn = req; - req = req->dl_next; - - if (conn->sk) { - prev = conn; - continue; - } - - if ((long)(now - conn->expires) <= 0) - break; - - - tcp_synq_unlink(tp, conn, prev); - if (conn->retrans >= sysctl_tcp_retries1) { -#ifdef TCP_DEBUG - printk(KERN_DEBUG "syn_recv: " - "too many retransmits\n"); -#endif - (*conn->class->destructor)(conn); - tcp_dec_slow_timer(TCP_SLT_SYNACK); - tp->syn_backlog--; - tcp_openreq_free(conn); - - if (!tp->syn_wait_queue) - break; - } else { - unsigned long timeo; - struct open_request *op; - - (*conn->class->rtx_syn_ack)(sk, conn); - - conn->retrans++; -#ifdef TCP_DEBUG - printk(KERN_DEBUG "syn_ack rtx %d\n", - conn->retrans); -#endif - timeo = min((TCP_TIMEOUT_INIT - << conn->retrans), - 120*HZ); - conn->expires = now + timeo; - op = prev->dl_next; - tcp_synq_queue(tp, conn); - if (op != prev->dl_next) - prev = prev->dl_next; - } - /* old prev still valid here */ - } while (req); - } + bh_lock_sock(sk); + if (!sk->lock.users && tp->syn_wait_queue) + tcp_do_syn_queue(sk, tp, now); + bh_unlock_sock(sk); sk = sk->next; } } + SOCKHASH_UNLOCK_READ_BH(); } void tcp_sltimer_handler(unsigned long data) diff --git a/net/ipv4/timer.c b/net/ipv4/timer.c index 3821a7c4c..0487f5bfa 100644 --- a/net/ipv4/timer.c +++ b/net/ipv4/timer.c @@ -5,7 +5,7 @@ * * TIMER - implementation of software timers for IP. * - * Version: $Id: timer.c,v 1.15 1999/02/22 13:54:29 davem Exp $ + * Version: $Id: timer.c,v 1.16 1999/05/27 00:37:39 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -69,13 +69,15 @@ void net_reset_timer (struct sock *t, int timeout, unsigned long len) */ void net_timer (unsigned long data) { - struct sock *sk = (struct sock*)data; + struct sock *sk = (struct sock *) data; int why = sk->timeout; /* Only process if socket is not in use. */ - if (atomic_read(&sk->sock_readers)) { + bh_lock_sock(sk); + if (sk->lock.users) { /* Try again later. */ mod_timer(&sk->timer, jiffies+HZ/20); + bh_unlock_sock(sk); return; } @@ -99,15 +101,15 @@ void net_timer (unsigned long data) printk (KERN_DEBUG "non CLOSE socket in time_done\n"); break; } - destroy_sock (sk); - break; + destroy_sock(sk); + return; case TIME_DESTROY: /* We've waited for a while for all the memory associated with * the socket to be freed. */ destroy_sock(sk); - break; + return; case TIME_CLOSE: /* We've waited long enough, close the socket. 
*/ @@ -123,5 +125,8 @@ void net_timer (unsigned long data) printk ("net_timer: timer expired - reason %d is unknown\n", why); break; } + + /* We only need to unlock if the socket was not destroyed. */ + bh_unlock_sock(sk); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 5fcec9cf3..320e5151e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -5,7 +5,7 @@ * * The User Datagram Protocol (UDP). * - * Version: $Id: udp.c,v 1.66 1999/05/08 20:00:25 davem Exp $ + * Version: $Id: udp.c,v 1.69 1999/06/09 11:15:31 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -128,7 +128,7 @@ static int udp_v4_verify_bind(struct sock *sk, unsigned short snum) struct sock *sk2; int retval = 0, sk_reuse = sk->reuse; - SOCKHASH_LOCK(); + SOCKHASH_LOCK_READ(); for(sk2 = udp_hash[snum & (UDP_HTABLE_SIZE - 1)]; sk2 != NULL; sk2 = sk2->next) { if((sk2->num == snum) && (sk2 != sk)) { unsigned char state = sk2->state; @@ -158,7 +158,7 @@ static int udp_v4_verify_bind(struct sock *sk, unsigned short snum) } } } - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_READ(); return retval; } @@ -173,14 +173,14 @@ static inline int udp_lport_inuse(u16 num) return 0; } -/* Shared by v4/v6 tcp. */ +/* Shared by v4/v6 udp. */ unsigned short udp_good_socknum(void) { int result; static int start = 0; int i, best, best_size_so_far; - SOCKHASH_LOCK(); + SOCKHASH_LOCK_READ(); if (start > sysctl_local_port_range[1] || start < sysctl_local_port_range[0]) start = sysctl_local_port_range[0]; @@ -223,15 +223,10 @@ unsigned short udp_good_socknum(void) } out: start = result; - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_READ(); return result; } -/* Last hit UDP socket cache, this is ipv4 specific so make it static. */ -static u32 uh_cache_saddr, uh_cache_daddr; -static u16 uh_cache_dport, uh_cache_sport; -static struct sock *uh_cache_sk = NULL; - static void udp_v4_hash(struct sock *sk) { struct sock **skp; @@ -240,11 +235,11 @@ static void udp_v4_hash(struct sock *sk) num &= (UDP_HTABLE_SIZE - 1); skp = &udp_hash[num]; - SOCKHASH_LOCK(); + SOCKHASH_LOCK_WRITE(); sk->next = *skp; *skp = sk; sk->hashent = num; - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); } static void udp_v4_unhash(struct sock *sk) @@ -255,7 +250,7 @@ static void udp_v4_unhash(struct sock *sk) num &= (UDP_HTABLE_SIZE - 1); skp = &udp_hash[num]; - SOCKHASH_LOCK(); + SOCKHASH_LOCK_WRITE(); while(*skp != NULL) { if(*skp == sk) { *skp = sk->next; @@ -263,9 +258,7 @@ static void udp_v4_unhash(struct sock *sk) } skp = &((*skp)->next); } - if(uh_cache_sk == sk) - uh_cache_sk = NULL; - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); } static void udp_v4_rehash(struct sock *sk) @@ -277,7 +270,7 @@ static void udp_v4_rehash(struct sock *sk) num &= (UDP_HTABLE_SIZE - 1); skp = &udp_hash[oldnum]; - SOCKHASH_LOCK(); + SOCKHASH_LOCK_WRITE(); while(*skp != NULL) { if(*skp == sk) { *skp = sk->next; @@ -288,13 +281,11 @@ static void udp_v4_rehash(struct sock *sk) sk->next = udp_hash[num]; udp_hash[num] = sk; sk->hashent = num; - if(uh_cache_sk == sk) - uh_cache_sk = NULL; - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_WRITE(); } /* UDP is nearly always wildcards out the wazoo, it makes no sense to try - * harder than this here plus the last hit cache. -DaveM + * harder than this. 
-DaveM */ struct sock *udp_v4_lookup_longway(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) { @@ -341,21 +332,9 @@ __inline__ struct sock *udp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport { struct sock *sk; - if(!dif && uh_cache_sk && - uh_cache_saddr == saddr && - uh_cache_sport == sport && - uh_cache_dport == dport && - uh_cache_daddr == daddr) - return uh_cache_sk; - + SOCKHASH_LOCK_READ(); sk = udp_v4_lookup_longway(saddr, sport, daddr, dport, dif); - if(!dif) { - uh_cache_sk = sk; - uh_cache_saddr = saddr; - uh_cache_daddr = daddr; - uh_cache_sport = sport; - uh_cache_dport = dport; - } + SOCKHASH_UNLOCK_READ(); return sk; } @@ -393,7 +372,7 @@ static struct sock *udp_v4_proxy_lookup(unsigned short num, unsigned long raddr, paddr = idev->ifa_list->ifa_local; } - SOCKHASH_LOCK(); + SOCKHASH_LOCK_READ(); for(s = udp_v4_proxy_loop_init(hnum, hpnum, s, firstpass); s != NULL; s = udp_v4_proxy_loop_next(hnum, hpnum, s, firstpass)) { @@ -431,7 +410,7 @@ static struct sock *udp_v4_proxy_lookup(unsigned short num, unsigned long raddr, } } } - SOCKHASH_UNLOCK(); + SOCKHASH_UNLOCK_READ(); return result; } @@ -784,7 +763,10 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) /* 4.1.3.4. It's configurable by the application via setsockopt() */ /* (MAY) and it defaults to on (MUST). */ - err = ip_build_xmit(sk,sk->no_check ? udp_getfrag_nosum : udp_getfrag, + err = ip_build_xmit(sk, + (sk->no_check == UDP_CSUM_NOXMIT ? + udp_getfrag_nosum : + udp_getfrag), &ufh, ulen, &ipc, rt, msg->msg_flags); out: @@ -979,8 +961,6 @@ int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk->rcv_saddr=INADDR_ANY; sk->daddr=INADDR_ANY; sk->state = TCP_CLOSE; - if(uh_cache_sk == sk) - uh_cache_sk = NULL; return 0; } @@ -1005,9 +985,6 @@ int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk->dport = usin->sin_port; sk->state = TCP_ESTABLISHED; - if(uh_cache_sk == sk) - uh_cache_sk = NULL; - sk->dst_cache = &rt->u.dst; return(0); } @@ -1015,6 +992,8 @@ int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) static void udp_close(struct sock *sk, long timeout) { + bh_lock_sock(sk); + /* See for explanation: raw_close in ipv4/raw.c */ sk->state = TCP_CLOSE; udp_v4_unhash(sk); @@ -1117,6 +1096,33 @@ int udp_chkaddr(struct sk_buff *skb) } #endif +static int udp_checksum_verify(struct sk_buff *skb, struct udphdr *uh, + unsigned short ulen, u32 saddr, u32 daddr, + int full_csum_deferred) +{ + if (!full_csum_deferred) { + if (uh->check) { + if (skb->ip_summed == CHECKSUM_HW && + udp_check(uh, ulen, saddr, daddr, skb->csum)) + return -1; + if (skb->ip_summed == CHECKSUM_NONE && + udp_check(uh, ulen, saddr, daddr, + csum_partial((char *)uh, ulen, 0))) + return -1; + } + } else { + if (uh->check == 0) + skb->ip_summed = CHECKSUM_UNNECESSARY; + else if (skb->ip_summed == CHECKSUM_HW) { + if (udp_check(uh, ulen, saddr, daddr, skb->csum)) + return -1; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); + } + return 0; +} + /* * All we need to do is get the socket, and then do a checksum. 
*/ @@ -1158,25 +1164,18 @@ int udp_rcv(struct sk_buff *skb, unsigned short len) } skb_trim(skb, ulen); -#ifndef CONFIG_UDP_DELAY_CSUM - if (uh->check && - (((skb->ip_summed==CHECKSUM_HW)&&udp_check(uh,ulen,saddr,daddr,skb->csum)) || - ((skb->ip_summed==CHECKSUM_NONE) && - (udp_check(uh,ulen,saddr,daddr, csum_partial((char*)uh, ulen, 0)))))) - goto csum_error; + if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) { + int defer; + +#ifdef CONFIG_UDP_DELAY_CSUM + defer = 1; #else - if (uh->check==0) - skb->ip_summed = CHECKSUM_UNNECESSARY; - else if (skb->ip_summed==CHECKSUM_HW) { - if (udp_check(uh,ulen,saddr,daddr,skb->csum)) - goto csum_error; - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else if (skb->ip_summed != CHECKSUM_UNNECESSARY) - skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); + defer = 0; #endif - - if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) + if (udp_checksum_verify(skb, uh, ulen, saddr, daddr, defer)) + goto csum_error; return udp_v4_mcast_deliver(skb, uh, saddr, daddr); + } #ifdef CONFIG_IP_TRANSPARENT_PROXY if (IPCB(skb)->redirport) @@ -1203,6 +1202,15 @@ int udp_rcv(struct sk_buff *skb, unsigned short len) kfree_skb(skb); return(0); } + if (udp_checksum_verify(skb, uh, ulen, saddr, daddr, +#ifdef CONFIG_UDP_DELAY_CSUM + 1 +#else + (sk->no_check & UDP_CSUM_NORCV) != 0 +#endif + )) + goto csum_error; + udp_deliver(sk, skb); return 0; diff --git a/net/ipv4/utils.c b/net/ipv4/utils.c index ce74ade2a..5992cbc55 100644 --- a/net/ipv4/utils.c +++ b/net/ipv4/utils.c @@ -6,7 +6,7 @@ * Various kernel-resident INET utility functions; mainly * for format conversion and debugging output. * - * Version: $Id: utils.c,v 1.6 1997/12/13 21:53:03 kuznet Exp $ + * Version: $Id: utils.c,v 1.7 1999/06/09 10:11:05 davem Exp $ * * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * @@ -57,6 +57,11 @@ char *in_ntoa(__u32 in) return(buff); } +char *in_ntoa2(__u32 in, char *buff) +{ + sprintf(buff, "%d.%d.%d.%d", NIPQUAD(in)); + return buff; +} /* * Convert an ASCII string to binary IP. |
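
Editorial note on the TIME_WAIT changes above: tcp_tw_schedule() inserts the bucket one slot behind the current death-row position, relying on TCP_TWKILL_SLOTS being a power of two so that the bitwise AND wraps the index back into range and the bucket survives a full sweep of the wheel. A small stand-alone illustration of that wrap-around; the slot count below is chosen only for the example and is not taken from the kernel headers.

#include <stdio.h>

#define TWKILL_SLOTS 8	/* must be a power of two, as in the kernel */

int main(void)
{
	int current_slot;

	for (current_slot = 0; current_slot < TWKILL_SLOTS; current_slot++) {
		/* Same arithmetic as tcp_tw_schedule(): insert just behind
		 * the slot the collector will process next, so the new
		 * bucket waits a full rotation before being reaped.
		 */
		int insert_slot = (current_slot - 1) & (TWKILL_SLOTS - 1);

		printf("collector at %d -> schedule into %d\n",
		       current_slot, insert_slot);
	}
	return 0;
}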
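Both tcp_retransmit_timer() and net_timer() in the hunks above move from testing sk->sock_readers to the bh_lock_sock()/sk->lock.users pair: the BH-context handler grabs the socket spinlock, and if a process context currently owns the socket it simply re-arms itself a little later instead of touching socket state. Below is a minimal sketch of that pattern; the 50 ms retry interval and the lock calls are taken from the diff, while example_sock_timer() and do_timer_work() are hypothetical names, and the code only builds inside a 2.3-era kernel tree. Note that, as the timer.c hunk points out, a work path that destroys the socket must return without unlocking it.

static void example_sock_timer(unsigned long data)
{
	struct sock *sk = (struct sock *) data;

	bh_lock_sock(sk);
	if (sk->lock.users) {
		/* A process context owns the socket; retry shortly. */
		mod_timer(&sk->timer, jiffies + HZ/20);
		bh_unlock_sock(sk);
		return;
	}

	/* Safe to modify socket state here: we hold the BH lock and
	 * no user context is inside the socket.
	 */
	do_timer_work(sk);	/* hypothetical worker */

	bh_unlock_sock(sk);
}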
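The new tcp_do_syn_queue() helper retransmits the SYN-ACK with exponential backoff: the initial timeout is shifted left once per retransmission and capped at 120 seconds. A stand-alone arithmetic check of that schedule; HZ and the initial timeout are given example values here, the real constants live in the kernel headers.

#include <stdio.h>

int main(void)
{
	const unsigned long hz = 100;			/* example tick rate */
	const unsigned long timeout_init = 3 * hz;	/* example initial RTO */
	const unsigned long cap = 120 * hz;
	unsigned int retrans;

	for (retrans = 1; retrans <= 7; retrans++) {
		unsigned long timeo = timeout_init << retrans;

		if (timeo > cap)	/* same min(..., 120*HZ) cap as the patch */
			timeo = cap;
		printf("retrans %u -> wait %lu ticks (%lu s)\n",
		       retrans, timeo, timeo / hz);
	}
	return 0;
}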
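The udp.c hunks split the old SOCKHASH_LOCK() into read and write variants: lookups (udp_v4_verify_bind, udp_good_socknum, udp_v4_lookup, the proxy lookup) only need the read side, while udp_v4_hash/unhash/rehash, which rewrite the chain pointers, take the write side. The skeleton below restates that division of labour. It mirrors the insert and traversal code visible in the diff, assumes the same singly linked sk->next chains, and the example_* function names are hypothetical; as in the patched udp_v4_lookup, the returned socket is only safe to use because the callers run in BH context.

/* Insertion: chain pointers change, so take the write lock. */
static void example_hash_insert(struct sock *sk, struct sock **chain)
{
	SOCKHASH_LOCK_WRITE();
	sk->next = *chain;
	*chain = sk;
	SOCKHASH_UNLOCK_WRITE();
}

/* Lookup: the chain is only traversed, so the read lock suffices and
 * several lookups may proceed in parallel.
 */
static struct sock *example_hash_find(struct sock **chain, unsigned short num)
{
	struct sock *sk;

	SOCKHASH_LOCK_READ();
	for (sk = *chain; sk != NULL; sk = sk->next)
		if (sk->num == num)
			break;
	SOCKHASH_UNLOCK_READ();
	return sk;
}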
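The new udp_checksum_verify() helper centralises two cases: when checksumming is not deferred the datagram is verified immediately (a zero uh->check means the sender computed none), and in the deferred case only hardware-computed sums are checked up front while a pseudo-header seed is left in skb->csum for verification at copy time. For background, the sketch below shows how the UDP checksum over the IPv4 pseudo-header is formed in ordinary user-space C. It is an illustration of the RFC 768/1071 folding rules, not kernel code, and the function names are made up; the UDP buffer is assumed to have its checksum field zeroed before computing.

#include <stdint.h>
#include <stddef.h>

/* One's-complement sum over a buffer, RFC 1071 style. */
static uint32_t csum_add(uint32_t sum, const uint8_t *data, size_t len)
{
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)data[i] << 8 | data[i + 1];
	if (len & 1)	/* odd trailing byte, padded with zero */
		sum += (uint32_t)data[len - 1] << 8;
	return sum;
}

/* UDP checksum: pseudo-header (addresses, protocol, length) plus the
 * UDP header and payload.  A computed value of 0 is transmitted as
 * 0xFFFF, because an on-the-wire 0 means "no checksum was computed",
 * which is exactly the uh->check == 0 case the helper special-cases.
 */
static uint16_t udp_csum(uint32_t saddr, uint32_t daddr,
			 const uint8_t *udp, size_t len)
{
	uint32_t sum = 0;

	sum += (saddr >> 16) + (saddr & 0xFFFF);
	sum += (daddr >> 16) + (daddr & 0xFFFF);
	sum += 17;			/* IPPROTO_UDP */
	sum += (uint32_t)len;
	sum = csum_add(sum, udp, len);

	while (sum >> 16)		/* fold carries into 16 bits */
		sum = (sum & 0xFFFF) + (sum >> 16);

	sum = ~sum & 0xFFFF;
	return sum ? (uint16_t)sum : 0xFFFF;
}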
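Finally, utils.c gains in_ntoa2(), which differs from in_ntoa() only in writing into a caller-supplied buffer rather than a single static one, so concurrent callers can no longer clobber each other's result. A user-space equivalent for illustration, with the NIPQUAD byte extraction written out by hand since the kernel macro is not available here; note that reading the address bytes straight from memory makes the example output depend on host byte order, just as NIPQUAD expects the address in network order.

#include <stdio.h>
#include <stdint.h>

/* Caller provides the buffer; 16 bytes holds "255.255.255.255\0". */
static char *in_ntoa2(uint32_t in, char *buff)
{
	const unsigned char *p = (const unsigned char *)&in;

	sprintf(buff, "%d.%d.%d.%d", p[0], p[1], p[2], p[3]);
	return buff;
}

int main(void)
{
	char a[16], b[16];
	uint32_t one = 0x0100007f;	/* 127.0.0.1 on a little-endian host */
	uint32_t two = 0x08080808;	/* 8.8.8.8 */

	/* Unlike the static-buffer in_ntoa(), both results stay valid. */
	printf("%s %s\n", in_ntoa2(one, a), in_ntoa2(two, b));
	return 0;
}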