Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/af_inet.c          |  57
-rw-r--r--  net/ipv4/arp.c              |  84
-rw-r--r--  net/ipv4/devinet.c          |  34
-rw-r--r--  net/ipv4/fib_frontend.c     |   4
-rw-r--r--  net/ipv4/fib_hash.c         |  62
-rw-r--r--  net/ipv4/fib_rules.c        |  40
-rw-r--r--  net/ipv4/icmp.c             |  23
-rw-r--r--  net/ipv4/igmp.c             | 102
-rw-r--r--  net/ipv4/ip_fragment.c      |  27
-rw-r--r--  net/ipv4/ip_input.c         | 260
-rw-r--r--  net/ipv4/ip_masq_mfw.c      |   4
-rw-r--r--  net/ipv4/ip_masq_quake.c    |   4
-rw-r--r--  net/ipv4/ip_masq_vdolive.c  |   6
-rw-r--r--  net/ipv4/ip_options.c       |   3
-rw-r--r--  net/ipv4/ipconfig.c         |   8
-rw-r--r--  net/ipv4/ipmr.c             |  11
-rw-r--r--  net/ipv4/proc.c             |  15
-rw-r--r--  net/ipv4/raw.c              |  94
-rw-r--r--  net/ipv4/route.c            |  97
-rw-r--r--  net/ipv4/tcp.c              | 121
-rw-r--r--  net/ipv4/tcp_input.c        | 155
-rw-r--r--  net/ipv4/tcp_ipv4.c         | 220
-rw-r--r--  net/ipv4/tcp_output.c       |  11
-rw-r--r--  net/ipv4/tcp_timer.c        | 165
-rw-r--r--  net/ipv4/timer.c            |  17
-rw-r--r--  net/ipv4/udp.c              | 128
-rw-r--r--  net/ipv4/utils.c            |   7
27 files changed, 1093 insertions(+), 666 deletions(-)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 70fcf4024..ca0f27d0c 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -5,7 +5,7 @@
*
* PF_INET protocol family socket handler.
*
- * Version: $Id: af_inet.c,v 1.87 1999/04/22 10:07:33 davem Exp $
+ * Version: $Id: af_inet.c,v 1.91 1999/06/09 08:28:55 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -147,22 +147,17 @@ static __inline__ void kill_sk_queues(struct sock *sk)
struct sk_buff *skb;
/* First the read buffer. */
- while((skb = skb_dequeue(&sk->receive_queue)) != NULL) {
- /* This will take care of closing sockets that were
- * listening and didn't accept everything.
- */
- if (skb->sk != NULL && skb->sk != sk)
- skb->sk->prot->close(skb->sk, 0);
+ while((skb = skb_dequeue(&sk->receive_queue)) != NULL)
kfree_skb(skb);
- }
/* Next, the error queue. */
while((skb = skb_dequeue(&sk->error_queue)) != NULL)
kfree_skb(skb);
- /* Now the backlog. */
- while((skb=skb_dequeue(&sk->back_log)) != NULL)
- kfree_skb(skb);
+ /* It is _impossible_ for the backlog to contain anything
+ * when we get here. All user references to this socket
+ * have gone away, only the net layer knows can touch it.
+ */
}
static __inline__ void kill_sk_now(struct sock *sk)
@@ -195,14 +190,19 @@ static __inline__ void kill_sk_later(struct sock *sk)
sk->destroy = 1;
sk->ack_backlog = 0;
- release_sock(sk);
+ bh_unlock_sock(sk);
net_reset_timer(sk, TIME_DESTROY, SOCK_DESTROY_TIME);
}
+/* Callers must hold the BH spinlock.
+ *
+ * At this point, there should be no process reference to this
+ * socket, and thus no user references at all. Therefore we
+ * can assume the socket waitqueue is inactive and nobody will
+ * try to jump onto it.
+ */
void destroy_sock(struct sock *sk)
{
- lock_sock(sk); /* just to be safe. */
-
/* Now we can no longer get new packets or once the
* timers are killed, send them.
*/
@@ -213,12 +213,6 @@ void destroy_sock(struct sock *sk)
kill_sk_queues(sk);
- /* Now if it has a half accepted/ closed socket. */
- if (sk->pair) {
- sk->pair->prot->close(sk->pair, 0);
- sk->pair = NULL;
- }
-
/* Now if everything is gone we can free the socket
* structure, otherwise we need to keep it around until
* everything is gone.
@@ -284,6 +278,14 @@ static int inet_autobind(struct sock *sk)
return 0;
}
+/* Listening INET sockets never sleep to wait for memory, so
+ * it is completely silly to wake them up on queue space
+ * available events. So we hook them up to this dummy callback.
+ */
+static void inet_listen_write_space(struct sock *sk)
+{
+}
+
/*
* Move a socket into listening state.
*/
@@ -310,6 +312,7 @@ int inet_listen(struct socket *sock, int backlog)
dst_release(xchg(&sk->dst_cache, NULL));
sk->prot->rehash(sk);
add_to_prot_sklist(sk);
+ sk->write_space = inet_listen_write_space;
}
sk->socket->flags |= SO_ACCEPTCON;
return(0);
@@ -368,7 +371,7 @@ static int inet_create(struct socket *sock, int protocol)
if (protocol && protocol != IPPROTO_UDP)
goto free_and_noproto;
protocol = IPPROTO_UDP;
- sk->no_check = UDP_NO_CHECK;
+ sk->no_check = UDP_CSUM_DEFAULT;
sk->ip_pmtudisc = IP_PMTUDISC_DONT;
prot=&udp_prot;
sock->ops = &inet_dgram_ops;
@@ -578,7 +581,7 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
static void inet_wait_for_connect(struct sock *sk)
{
- struct wait_queue wait = { current, NULL };
+ DECLARE_WAITQUEUE(wait, current);
add_wait_queue(sk->sleep, &wait);
current->state = TASK_INTERRUPTIBLE;
@@ -684,14 +687,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
if (sk1->prot->accept == NULL)
goto do_err;
- /* Restore the state if we have been interrupted, and then returned. */
- if (sk1->pair != NULL) {
- sk2 = sk1->pair;
- sk1->pair = NULL;
- } else {
- if((sk2 = sk1->prot->accept(sk1,flags)) == NULL)
- goto do_sk1_err;
- }
+ if((sk2 = sk1->prot->accept(sk1,flags)) == NULL)
+ goto do_sk1_err;
/*
* We've been passed an extra socket.
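
The inet_wait_for_connect hunk above replaces the old open-coded struct wait_queue initializer with DECLARE_WAITQUEUE(), matching the 2.3 wait-queue rework. A minimal sketch of the resulting sleep loop, assuming the 2.3-era wait-queue and scheduler API (<linux/wait.h>, <linux/sched.h>); the state checks are illustrative, not copied from the patch:

/* Sketch: sleep until the connect either completes or is interrupted. */
static void demo_wait_for_connect(struct sock *sk)
{
	DECLARE_WAITQUEUE(wait, current);	/* was: struct wait_queue wait = { current, NULL }; */

	add_wait_queue(sk->sleep, &wait);
	for (;;) {
		current->state = TASK_INTERRUPTIBLE;
		if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
			break;			/* connected, or failed */
		if (signal_pending(current))
			break;			/* interrupted by a signal */
		schedule();
	}
	current->state = TASK_RUNNING;
	remove_wait_queue(sk->sleep, &wait);
}
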
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 2c311f233..a3ca88701 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1,6 +1,6 @@
/* linux/net/inet/arp.c
*
- * Version: $Id: arp.c,v 1.77 1999/03/21 05:22:30 davem Exp $
+ * Version: $Id: arp.c,v 1.78 1999/06/09 10:10:36 davem Exp $
*
* Copyright (C) 1994 by Florian La Roche
*
@@ -119,6 +119,11 @@
#include <asm/system.h>
#include <asm/uaccess.h>
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+static char *ax2asc2(ax25_address *a, char *buf);
+#endif
+
+
/*
* Interface to generic neighbour cache.
*/
@@ -304,7 +309,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
u8 *dst_ha = NULL;
struct device *dev = neigh->dev;
u32 target = *(u32*)neigh->primary_key;
- int probes = neigh->probes;
+ int probes = atomic_read(&neigh->probes);
if (skb && inet_addr_type(skb->nh.iph->saddr) == RTN_LOCAL)
saddr = skb->nh.iph->saddr;
@@ -315,6 +320,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
if (!(neigh->nud_state&NUD_VALID))
printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n");
dst_ha = neigh->ha;
+ read_lock_bh(&neigh->lock);
} else if ((probes -= neigh->parms->app_probes) < 0) {
#ifdef CONFIG_ARPD
neigh_app_ns(neigh);
@@ -324,6 +330,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
dst_ha, dev->dev_addr, NULL);
+ if (dst_ha)
+ read_unlock_bh(&neigh->lock);
}
/* OBSOLETE FUNCTIONS */
@@ -372,29 +380,25 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
if (arp_set_predefined(inet_addr_type(paddr), haddr, paddr, dev))
return 0;
- start_bh_atomic();
n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
if (n) {
n->used = jiffies;
if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) {
- memcpy(haddr, n->ha, dev->addr_len);
+ read_lock_bh(&n->lock);
+ memcpy(haddr, n->ha, dev->addr_len);
+ read_unlock_bh(&n->lock);
neigh_release(n);
- end_bh_atomic();
return 0;
}
+ neigh_release(n);
} else
kfree_skb(skb);
- neigh_release(n);
- end_bh_atomic();
return 1;
}
/* END OF OBSOLETE FUNCTIONS */
-/*
- * Note: requires bh_atomic locking.
- */
int arp_bind_neighbour(struct dst_entry *dst)
{
struct device *dev = dst->dev;
@@ -672,7 +676,8 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
(addr_type == RTN_UNICAST && rt->u.dst.dev != dev &&
(IN_DEV_PROXY_ARP(in_dev) || pneigh_lookup(&arp_tbl, &tip, dev, 0)))) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
- neigh_release(n);
+ if (n)
+ neigh_release(n);
if (skb->stamp.tv_sec == 0 ||
skb->pkt_type == PACKET_HOST ||
@@ -785,7 +790,6 @@ int arp_req_set(struct arpreq *r, struct device * dev)
return -EINVAL;
err = -ENOBUFS;
- start_bh_atomic();
neigh = __neigh_lookup(&arp_tbl, &ip, dev, 1);
if (neigh) {
unsigned state = NUD_STALE;
@@ -795,7 +799,6 @@ int arp_req_set(struct arpreq *r, struct device * dev)
r->arp_ha.sa_data : NULL, state, 1, 0);
neigh_release(neigh);
}
- end_bh_atomic();
return err;
}
@@ -819,17 +822,17 @@ static int arp_req_get(struct arpreq *r, struct device *dev)
struct neighbour *neigh;
int err = -ENXIO;
- start_bh_atomic();
- neigh = __neigh_lookup(&arp_tbl, &ip, dev, 0);
+ neigh = neigh_lookup(&arp_tbl, &ip, dev);
if (neigh) {
+ read_lock_bh(&neigh->lock);
memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
+ r->arp_flags = arp_state_to_flags(neigh);
+ read_unlock_bh(&neigh->lock);
r->arp_ha.sa_family = dev->type;
strncpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
- r->arp_flags = arp_state_to_flags(neigh);
neigh_release(neigh);
err = 0;
}
- end_bh_atomic();
return err;
}
@@ -867,14 +870,12 @@ int arp_req_delete(struct arpreq *r, struct device * dev)
return -EINVAL;
}
err = -ENXIO;
- start_bh_atomic();
- neigh = __neigh_lookup(&arp_tbl, &ip, dev, 0);
+ neigh = neigh_lookup(&arp_tbl, &ip, dev);
if (neigh) {
if (neigh->nud_state&~NUD_NOARP)
err = neigh_update(neigh, NULL, NUD_FAILED, 1, 0);
neigh_release(neigh);
}
- end_bh_atomic();
return err;
}
@@ -961,16 +962,16 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy
char hbuffer[HBUFFERLEN];
int i,j,k;
const char hexbuf[] = "0123456789ABCDEF";
+ char abuf[16];
size = sprintf(buffer,"IP address HW type Flags HW address Mask Device\n");
pos+=size;
len+=size;
- neigh_table_lock(&arp_tbl);
-
- for(i=0; i<=NEIGH_HASHMASK; i++) {
+ for(i=0; i<=NEIGH_HASHMASK; i++) {
struct neighbour *n;
+ read_lock_bh(&arp_tbl.lock);
for (n=arp_tbl.hash_buckets[i]; n; n=n->next) {
struct device *dev = n->dev;
int hatype = dev->type;
@@ -979,17 +980,14 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy
if (!(n->nud_state&~NUD_NOARP))
continue;
- /* I'd get great pleasure deleting
- this ugly code. Let's output it in hexadecimal format.
- "arp" utility will eventually repaired --ANK
- */
-#if 1 /* UGLY CODE */
+ read_lock(&n->lock);
+
/*
* Convert hardware address to XX:XX:XX:XX ... form.
*/
#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
- strcpy(hbuffer,ax2asc((ax25_address *)n->ha));
+ ax2asc2((ax25_address *)n->ha, hbuffer);
else {
#endif
for (k=0,j=0;k<HBUFFERLEN-3 && j<dev->addr_len;j++) {
@@ -998,37 +996,33 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy
hbuffer[k++]=':';
}
hbuffer[--k]=0;
-
+
#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
}
#endif
-#else
- if ((neigh->nud_state&NUD_VALID) && dev->addr_len) {
- int j;
- for (j=0; j < dev->addr_len; j++)
- sprintf(hbuffer+2*j, "%02x", neigh->ha[j]);
- } else
- sprintf(hbuffer, "0");
-#endif
size = sprintf(buffer+len,
"%-17s0x%-10x0x%-10x%s",
- in_ntoa(*(u32*)n->primary_key),
+ in_ntoa2(*(u32*)n->primary_key, abuf),
hatype,
arp_state_to_flags(n),
hbuffer);
size += sprintf(buffer+len+size,
" %-17s %s\n",
"*", dev->name);
+ read_unlock(&n->lock);
len += size;
pos += size;
if (pos <= offset)
len=0;
- if (pos >= offset+length)
- goto done;
+ if (pos >= offset+length) {
+ read_unlock_bh(&arp_tbl.lock);
+ goto done;
+ }
}
+ read_unlock_bh(&arp_tbl.lock);
}
for (i=0; i<=PNEIGH_HASHMASK; i++) {
@@ -1039,7 +1033,7 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy
size = sprintf(buffer+len,
"%-17s0x%-10x0x%-10x%s",
- in_ntoa(*(u32*)n->key),
+ in_ntoa2(*(u32*)n->key, abuf),
hatype,
ATF_PUBL|ATF_PERM,
"00:00:00:00:00:00");
@@ -1058,7 +1052,6 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy
}
done:
- neigh_table_unlock(&arp_tbl);
*start = buffer+len-(pos-offset); /* Start of wanted data */
len = pos-offset; /* Start slop */
@@ -1117,14 +1110,13 @@ __initfunc(void arp_init (void))
}
-#ifdef CONFIG_AX25_MODULE
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
/*
* ax25 -> ASCII conversion
*/
-char *ax2asc(ax25_address *a)
+char *ax2asc2(ax25_address *a, char *buf)
{
- static char buf[11];
char c, *s;
int n;
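
The arp_find and arp_req_get hunks above drop the global start_bh_atomic()/end_bh_atomic() bracket and instead copy the hardware address out under the per-neighbour rwlock. A minimal sketch of that pattern, assuming the 2.3-era neighbour API (neigh_lookup, neigh_release, neigh->lock); the helper name is illustrative and error handling is simplified:

/* Sketch: fetch a resolved hardware address safely against BH updates. */
static int demo_copy_hwaddr(u32 paddr, struct device *dev, unsigned char *haddr)
{
	struct neighbour *n = neigh_lookup(&arp_tbl, &paddr, dev);

	if (n == NULL)
		return -ENXIO;

	read_lock_bh(&n->lock);		/* n->ha can be rewritten from BH context */
	memcpy(haddr, n->ha, dev->addr_len);
	read_unlock_bh(&n->lock);

	neigh_release(n);		/* drop the reference taken by the lookup */
	return 0;
}
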
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index c8b0fbbc8..ff2c930d1 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1,7 +1,7 @@
/*
* NET3 IP device support routines.
*
- * Version: $Id: devinet.c,v 1.28 1999/05/08 20:00:16 davem Exp $
+ * Version: $Id: devinet.c,v 1.32 1999/06/09 11:15:33 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -607,41 +607,39 @@ inet_gifconf(struct device *dev, char *buf, int len)
{
struct in_device *in_dev = dev->ip_ptr;
struct in_ifaddr *ifa;
- struct ifreq ifr;
+ struct ifreq *ifr = (struct ifreq *) buf;
int done=0;
if (in_dev==NULL || (ifa=in_dev->ifa_list)==NULL)
return 0;
for ( ; ifa; ifa = ifa->ifa_next) {
- if (!buf) {
+ if (!ifr) {
done += sizeof(ifr);
continue;
}
if (len < (int) sizeof(ifr))
return done;
- memset(&ifr, 0, sizeof(struct ifreq));
+ memset(ifr, 0, sizeof(struct ifreq));
if (ifa->ifa_label)
- strcpy(ifr.ifr_name, ifa->ifa_label);
+ strcpy(ifr->ifr_name, ifa->ifa_label);
else
- strcpy(ifr.ifr_name, dev->name);
+ strcpy(ifr->ifr_name, dev->name);
- (*(struct sockaddr_in *) &ifr.ifr_addr).sin_family = AF_INET;
- (*(struct sockaddr_in *) &ifr.ifr_addr).sin_addr.s_addr = ifa->ifa_local;
+ (*(struct sockaddr_in *) &ifr->ifr_addr).sin_family = AF_INET;
+ (*(struct sockaddr_in *) &ifr->ifr_addr).sin_addr.s_addr = ifa->ifa_local;
- if (copy_to_user(buf, &ifr, sizeof(struct ifreq)))
- return -EFAULT;
- buf += sizeof(struct ifreq);
+ ifr++;
len -= sizeof(struct ifreq);
done += sizeof(struct ifreq);
}
return done;
}
-u32 inet_select_addr(struct device *dev, u32 dst, int scope)
+u32 inet_select_addr(const struct device *dev, u32 dst, int scope)
{
u32 addr = 0;
- struct in_device *in_dev = dev->ip_ptr;
+ const struct in_device *in_dev = dev->ip_ptr;
if (in_dev == NULL)
return 0;
@@ -661,15 +659,19 @@ u32 inet_select_addr(struct device *dev, u32 dst, int scope)
in this case. It is important that lo is the first interface
in dev_base list.
*/
+ read_lock(&dev_base_lock);
for (dev=dev_base; dev; dev=dev->next) {
if ((in_dev=dev->ip_ptr) == NULL)
continue;
for_primary_ifa(in_dev) {
- if (ifa->ifa_scope <= scope)
+ if (ifa->ifa_scope <= scope) {
+ read_unlock(&dev_base_lock);
return ifa->ifa_local;
+ }
} endfor_ifa(in_dev);
}
+ read_unlock(&dev_base_lock);
return 0;
}
@@ -790,6 +792,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
s_idx = cb->args[0];
s_ip_idx = ip_idx = cb->args[1];
+ read_lock(&dev_base_lock);
for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
if (idx < s_idx)
continue;
@@ -807,6 +810,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
}
}
done:
+ read_unlock(&dev_base_lock);
cb->args[0] = idx;
cb->args[1] = ip_idx;
@@ -881,11 +885,13 @@ void inet_forward_change()
ipv4_devconf.accept_redirects = !on;
ipv4_devconf_dflt.forwarding = on;
+ read_lock(&dev_base_lock);
for (dev = dev_base; dev; dev = dev->next) {
struct in_device *in_dev = dev->ip_ptr;
if (in_dev)
in_dev->cnf.forwarding = on;
}
+ read_unlock(&dev_base_lock);
rt_cache_flush(0);
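
The devinet.c hunks take dev_base_lock for reading wherever the device list is walked (inet_select_addr, inet_dump_ifaddr, inet_forward_change) instead of assuming single-threaded BH execution. A minimal sketch of the walk, assuming the 2.3-era dev_base/dev_base_lock globals; the per-device work is a placeholder:

/* Sketch: count devices that have IPv4 configuration attached. */
static int demo_count_inet_devices(void)
{
	struct device *dev;
	int count = 0;

	read_lock(&dev_base_lock);		/* the list can change otherwise */
	for (dev = dev_base; dev; dev = dev->next)
		if (dev->ip_ptr != NULL)	/* an in_device is attached */
			count++;
	read_unlock(&dev_base_lock);

	return count;
}
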
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index a17470483..d57d4daa9 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -5,7 +5,7 @@
*
* IPv4 Forwarding Information Base: FIB frontend.
*
- * Version: $Id: fib_frontend.c,v 1.15 1999/03/21 05:22:31 davem Exp $
+ * Version: $Id: fib_frontend.c,v 1.16 1999/06/09 10:10:42 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
@@ -123,13 +123,11 @@ fib_get_procinfo(char *buffer, char **start, off_t offset, int length, int dummy
first = 0;
}
- /* rtnl_shlock(); -- it is pointless at the moment --ANK */
if (main_table && count > 0) {
int n = main_table->tb_get_info(main_table, ptr, first, count);
count -= n;
ptr += n*128;
}
- /* rtnl_shunlock(); */
len = ptr - *start;
if (len >= length)
return length;
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index d9e029cef..0472f6118 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -5,7 +5,7 @@
*
* IPv4 FIB: lookup engine and maintenance routines.
*
- * Version: $Id: fib_hash.c,v 1.8 1999/03/25 10:04:17 davem Exp $
+ * Version: $Id: fib_hash.c,v 1.10 1999/06/09 10:10:45 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
@@ -145,13 +145,16 @@ extern __inline__ int fn_key_leq(fn_key_t a, fn_key_t b)
return a.datum <= b.datum;
}
+static rwlock_t fib_hash_lock = RW_LOCK_UNLOCKED;
+
#define FZ_MAX_DIVISOR 1024
#ifdef CONFIG_IP_ROUTE_LARGE_TABLES
+/* The fib hash lock must be held when this is called. */
static __inline__ void fn_rebuild_zone(struct fn_zone *fz,
- struct fib_node **old_ht,
- int old_divisor)
+ struct fib_node **old_ht,
+ int old_divisor)
{
int i;
struct fib_node *f, **fp, *next;
@@ -198,13 +201,13 @@ static void fn_rehash_zone(struct fn_zone *fz)
if (ht) {
memset(ht, 0, new_divisor*sizeof(struct fib_node*));
- start_bh_atomic();
+ write_lock_bh(&fib_hash_lock);
old_ht = fz->fz_hash;
fz->fz_hash = ht;
fz->fz_hashmask = new_hashmask;
fz->fz_divisor = new_divisor;
fn_rebuild_zone(fz, old_ht, old_divisor);
- end_bh_atomic();
+ write_unlock_bh(&fib_hash_lock);
kfree(old_ht);
}
}
@@ -246,6 +249,7 @@ fn_new_zone(struct fn_hash *table, int z)
for (i=z+1; i<=32; i++)
if (table->fn_zones[i])
break;
+ write_lock_bh(&fib_hash_lock);
if (i>32) {
/* No more specific masks, we are the first. */
fz->fz_next = table->fn_zone_list;
@@ -255,6 +259,7 @@ fn_new_zone(struct fn_hash *table, int z)
table->fn_zones[i]->fz_next = fz;
}
table->fn_zones[z] = fz;
+ write_unlock_bh(&fib_hash_lock);
return fz;
}
@@ -265,6 +270,7 @@ fn_hash_lookup(struct fib_table *tb, const struct rt_key *key, struct fib_result
struct fn_zone *fz;
struct fn_hash *t = (struct fn_hash*)tb->tb_data;
+ read_lock(&fib_hash_lock);
for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
struct fib_node *f;
fn_key_t k = fz_key(key->dst, fz);
@@ -293,13 +299,16 @@ fn_hash_lookup(struct fib_table *tb, const struct rt_key *key, struct fib_result
res->scope = f->fn_scope;
res->prefixlen = fz->fz_order;
res->prefix = &fz_prefix(f->fn_key, fz);
- return 0;
+ goto out;
}
if (err < 0)
- return err;
+ goto out;
}
}
- return 1;
+ err = 1;
+out:
+ read_unlock(&fib_hash_lock);
+ return err;
}
static int fn_hash_last_dflt=-1;
@@ -344,6 +353,7 @@ fn_hash_select_default(struct fib_table *tb, const struct rt_key *key, struct fi
last_resort = NULL;
order = -1;
+ read_lock(&fib_hash_lock);
for (f = fz->fz_hash[0]; f; f = f->fn_next) {
struct fib_info *next_fi = FIB_INFO(f);
@@ -364,7 +374,7 @@ fn_hash_select_default(struct fib_table *tb, const struct rt_key *key, struct fi
} else if (!fib_detect_death(fi, order, &last_resort, &last_idx)) {
res->fi = fi;
fn_hash_last_dflt = order;
- return;
+ goto out;
}
fi = next_fi;
order++;
@@ -372,18 +382,20 @@ fn_hash_select_default(struct fib_table *tb, const struct rt_key *key, struct fi
if (order<=0 || fi==NULL) {
fn_hash_last_dflt = -1;
- return;
+ goto out;
}
if (!fib_detect_death(fi, order, &last_resort, &last_idx)) {
res->fi = fi;
fn_hash_last_dflt = order;
- return;
+ goto out;
}
if (last_idx >= 0)
res->fi = last_resort;
fn_hash_last_dflt = last_idx;
+out:
+ read_unlock(&fib_hash_lock);
}
#define FIB_SCAN(f, fp) \
@@ -457,6 +469,7 @@ rta->rta_prefsrc ? *(u32*)rta->rta_prefsrc : 0);
fp = fz_chain_p(key, fz);
+
/*
* Scan list to find the first route with the same destination
*/
@@ -560,14 +573,17 @@ replace:
*/
new_f->fn_next = f;
+ write_lock_bh(&fib_hash_lock);
*fp = new_f;
+ write_unlock_bh(&fib_hash_lock);
fz->fz_nent++;
if (del_fp) {
f = *del_fp;
/* Unlink replaced node */
+ write_lock_bh(&fib_hash_lock);
*del_fp = f->fn_next;
- synchronize_bh();
+ write_unlock_bh(&fib_hash_lock);
if (!(f->fn_state&FN_S_ZOMBIE))
rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req);
@@ -619,11 +635,13 @@ FTprint("tb(%d)_delete: %d %08x/%d %d\n", tb->tb_id, r->rtm_type, rta->rta_dst ?
fp = fz_chain_p(key, fz);
+
FIB_SCAN(f, fp) {
if (fn_key_eq(f->fn_key, key))
break;
- if (fn_key_leq(key, f->fn_key))
+ if (fn_key_leq(key, f->fn_key)) {
return -ESRCH;
+ }
}
#ifdef CONFIG_IP_ROUTE_TOS
FIB_SCAN_KEY(f, fp, key) {
@@ -637,9 +655,9 @@ FTprint("tb(%d)_delete: %d %08x/%d %d\n", tb->tb_id, r->rtm_type, rta->rta_dst ?
FIB_SCAN_TOS(f, fp, key, tos) {
struct fib_info * fi = FIB_INFO(f);
- if (f->fn_state&FN_S_ZOMBIE)
+ if (f->fn_state&FN_S_ZOMBIE) {
return -ESRCH;
-
+ }
matched++;
if (del_fp == NULL &&
@@ -655,8 +673,9 @@ FTprint("tb(%d)_delete: %d %08x/%d %d\n", tb->tb_id, r->rtm_type, rta->rta_dst ?
rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req);
if (matched != 1) {
+ write_lock_bh(&fib_hash_lock);
*del_fp = f->fn_next;
- synchronize_bh();
+ write_unlock_bh(&fib_hash_lock);
if (f->fn_state&FN_S_ACCESSED)
rt_cache_flush(-1);
@@ -687,8 +706,9 @@ fn_flush_list(struct fib_node ** fp, int z, struct fn_hash *table)
struct fib_info *fi = FIB_INFO(f);
if (fi && ((f->fn_state&FN_S_ZOMBIE) || (fi->fib_flags&RTNH_F_DEAD))) {
+ write_lock_bh(&fib_hash_lock);
*fp = f->fn_next;
- synchronize_bh();
+ write_unlock_bh(&fib_hash_lock);
fn_free_node(f);
found++;
@@ -727,6 +747,7 @@ static int fn_hash_get_info(struct fib_table *tb, char *buffer, int first, int c
int pos = 0;
int n = 0;
+ read_lock(&fib_hash_lock);
for (fz=table->fn_zone_list; fz; fz = fz->fz_next) {
int i;
struct fib_node *f;
@@ -752,10 +773,12 @@ static int fn_hash_get_info(struct fib_table *tb, char *buffer, int first, int c
FZ_MASK(fz), buffer);
buffer += 128;
if (++n >= count)
- return n;
+ goto out;
}
}
}
+out:
+ read_unlock(&fib_hash_lock);
return n;
}
#endif
@@ -818,15 +841,18 @@ static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin
struct fn_hash *table = (struct fn_hash*)tb->tb_data;
s_m = cb->args[1];
+ read_lock(&fib_hash_lock);
for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) {
if (m < s_m) continue;
if (m > s_m)
memset(&cb->args[2], 0, sizeof(cb->args) - 2*sizeof(cb->args[0]));
if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
cb->args[1] = m;
+ read_unlock(&fib_hash_lock);
return -1;
}
}
+ read_unlock(&fib_hash_lock);
cb->args[1] = m;
return skb->len;
}
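
fib_hash.c swaps start_bh_atomic()/synchronize_bh() for a single fib_hash_lock rwlock: lookups and dumps take it for reading, node insertion and unlinking take it for writing with BHs disabled. A minimal sketch of both sides, assuming the 2.3-era rwlock primitives; the demo_ names and the simplified node handling are illustrative:

static rwlock_t demo_hash_lock = RW_LOCK_UNLOCKED;

/* Reader: the lookup copies out what it needs before dropping the lock. */
static int demo_lookup_scope(struct fib_node *head, fn_key_t key, int *scope)
{
	struct fib_node *f;
	int err = 1;				/* 1 == not found, as in fn_hash_lookup */

	read_lock(&demo_hash_lock);
	for (f = head; f; f = f->fn_next) {
		if (fn_key_eq(f->fn_key, key)) {
			*scope = f->fn_scope;
			err = 0;
			break;
		}
	}
	read_unlock(&demo_hash_lock);
	return err;
}

/* Writer: unlink a node while excluding the BH readers above. */
static void demo_unlink(struct fib_node **fp)
{
	write_lock_bh(&demo_hash_lock);
	*fp = (*fp)->fn_next;
	write_unlock_bh(&demo_hash_lock);
}
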
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 868c44c31..97074198e 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -5,7 +5,7 @@
*
* IPv4 Forwarding Information Base: policy rules.
*
- * Version: $Id: fib_rules.c,v 1.9 1999/03/25 10:04:23 davem Exp $
+ * Version: $Id: fib_rules.c,v 1.11 1999/06/09 10:10:47 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
@@ -79,12 +79,14 @@ static struct fib_rule main_rule = { &default_rule, 0x7FFE, RT_TABLE_MAIN, RTN_U
static struct fib_rule local_rule = { &main_rule, 0, RT_TABLE_LOCAL, RTN_UNICAST, };
static struct fib_rule *fib_rules = &local_rule;
+static rwlock_t fib_rules_lock = RW_LOCK_UNLOCKED;
int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
{
struct rtattr **rta = arg;
struct rtmsg *rtm = NLMSG_DATA(nlh);
struct fib_rule *r, **rp;
+ int err = -ESRCH;
for (rp=&fib_rules; (r=*rp) != NULL; rp=&r->r_next) {
if ((!rta[RTA_SRC-1] || memcmp(RTA_DATA(rta[RTA_SRC-1]), &r->r_src, 4) == 0) &&
@@ -99,18 +101,20 @@ int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
(!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) &&
(!rta[RTA_IIF-1] || strcmp(RTA_DATA(rta[RTA_IIF-1]), r->r_ifname) == 0) &&
(!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) {
+ err = -EPERM;
if (r == &local_rule)
- return -EPERM;
+ break;
+ write_lock_bh(&fib_rules_lock);
*rp = r->r_next;
- synchronize_bh();
-
+ write_unlock_bh(&fib_rules_lock);
if (r != &default_rule && r != &main_rule)
kfree(r);
- return 0;
+ err = 0;
+ break;
}
}
- return -ESRCH;
+ return err;
}
/* Allocate new unique table id */
@@ -205,7 +209,9 @@ int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
}
new_r->r_next = r;
+ write_lock_bh(&fib_rules_lock);
*rp = new_r;
+ write_unlock_bh(&fib_rules_lock);
return 0;
}
@@ -250,8 +256,11 @@ static void fib_rules_detach(struct device *dev)
struct fib_rule *r;
for (r=fib_rules; r; r=r->r_next) {
- if (r->r_ifindex == dev->ifindex)
+ if (r->r_ifindex == dev->ifindex) {
+ write_lock_bh(&fib_rules_lock);
r->r_ifindex = -1;
+ write_unlock_bh(&fib_rules_lock);
+ }
}
}
@@ -260,8 +269,11 @@ static void fib_rules_attach(struct device *dev)
struct fib_rule *r;
for (r=fib_rules; r; r=r->r_next) {
- if (r->r_ifindex == -1 && strcmp(dev->name, r->r_ifname) == 0)
+ if (r->r_ifindex == -1 && strcmp(dev->name, r->r_ifname) == 0) {
+ write_lock_bh(&fib_rules_lock);
r->r_ifindex = dev->ifindex;
+ write_unlock_bh(&fib_rules_lock);
+ }
}
}
@@ -275,6 +287,7 @@ int fib_lookup(const struct rt_key *key, struct fib_result *res)
u32 saddr = key->src;
FRprintk("Lookup: %08x <- %08x ", key->dst, key->src);
+ read_lock(&fib_rules_lock);
for (r = fib_rules; r; r=r->r_next) {
if (((saddr^r->r_src) & r->r_srcmask) ||
((daddr^r->r_dst) & r->r_dstmask) ||
@@ -294,11 +307,14 @@ FRprintk("tb %d r %d ", r->r_table, r->r_action);
policy = r;
break;
case RTN_UNREACHABLE:
+ read_unlock(&fib_rules_lock);
return -ENETUNREACH;
default:
case RTN_BLACKHOLE:
+ read_unlock(&fib_rules_lock);
return -EINVAL;
case RTN_PROHIBIT:
+ read_unlock(&fib_rules_lock);
return -EACCES;
}
@@ -308,12 +324,16 @@ FRprintk("tb %d r %d ", r->r_table, r->r_action);
if (err == 0) {
FRprintk("ok\n");
res->r = policy;
+ read_unlock(&fib_rules_lock);
return 0;
}
- if (err < 0 && err != -EAGAIN)
+ if (err < 0 && err != -EAGAIN) {
+ read_unlock(&fib_rules_lock);
return err;
+ }
}
FRprintk("FAILURE\n");
+ read_unlock(&fib_rules_lock);
return -ENETUNREACH;
}
@@ -400,12 +420,14 @@ int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb)
int s_idx = cb->args[0];
struct fib_rule *r;
+ read_lock(&fib_rules_lock);
for (r=fib_rules, idx=0; r; r = r->r_next, idx++) {
if (idx < s_idx)
continue;
if (inet_fill_rule(skb, r, cb) < 0)
break;
}
+ read_unlock(&fib_rules_lock);
cb->args[0] = idx;
return skb->len;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 199550ffb..9456c7f29 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1,9 +1,9 @@
/*
* NET3: Implementation of the ICMP protocol layer.
*
- * Alan Cox, <alan@cymru.net>
+ * Alan Cox, <alan@redhat.com>
*
- * Version: $Id: icmp.c,v 1.52 1999/03/21 12:04:11 davem Exp $
+ * Version: $Id: icmp.c,v 1.57 1999/06/09 10:10:50 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -699,8 +699,8 @@ static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb, int len)
case ICMP_FRAG_NEEDED:
if (ipv4_config.no_pmtu_disc) {
if (net_ratelimit())
- printk(KERN_INFO "ICMP: %s: fragmentation needed and DF set.\n",
- in_ntoa(iph->daddr));
+ printk(KERN_INFO "ICMP: %d.%d.%d.%d: fragmentation needed and DF set.\n",
+ NIPQUAD(iph->daddr));
} else {
unsigned short new_mtu;
new_mtu = ip_rt_frag_needed(iph, ntohs(icmph->un.frag.mtu));
@@ -711,7 +711,7 @@ static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb, int len)
break;
case ICMP_SR_FAILED:
if (net_ratelimit())
- printk(KERN_INFO "ICMP: %s: Source Route Failed.\n", in_ntoa(iph->daddr));
+ printk(KERN_INFO "ICMP: %d.%d.%d.%d: Source Route Failed.\n", NIPQUAD(iph->daddr));
break;
default:
break;
@@ -741,8 +741,8 @@ static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb, int len)
if (inet_addr_type(iph->daddr) == RTN_BROADCAST)
{
if (net_ratelimit())
- printk(KERN_WARNING "%s sent an invalid ICMP error to a broadcast.\n",
- in_ntoa(skb->nh.iph->saddr));
+ printk(KERN_WARNING "%d.%d.%d.%d sent an invalid ICMP error to a broadcast.\n",
+ NIPQUAD(skb->nh.iph->saddr));
return;
}
}
@@ -1142,6 +1142,8 @@ __initfunc(void icmp_init(struct net_proto_family *ops))
icmp_inode.i_sock = 1;
icmp_inode.i_uid = 0;
icmp_inode.i_gid = 0;
+ init_waitqueue_head(&icmp_inode.i_wait);
+ init_waitqueue_head(&icmp_inode.u.socket_i.wait);
icmp_socket->inode = &icmp_inode;
icmp_socket->state = SS_UNCONNECTED;
@@ -1150,6 +1152,11 @@ __initfunc(void icmp_init(struct net_proto_family *ops))
if ((err=ops->create(icmp_socket, IPPROTO_ICMP))<0)
panic("Failed to create the ICMP control socket.\n");
icmp_socket->sk->allocation=GFP_ATOMIC;
- icmp_socket->sk->num = 256; /* Don't receive any data */
icmp_socket->sk->ip_ttl = MAXTTL;
+
+ /* Unhash it so that IP input processing does not even
+ * see it, we do not wish this socket to see incoming
+ * packets.
+ */
+ icmp_socket->sk->prot->unhash(icmp_socket->sk);
}
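
The icmp.c diagnostics move from in_ntoa(), which formats into a shared static buffer, to the NIPQUAD() macro, which hands printk the four address bytes directly. A minimal sketch of the two styles, assuming the kernel's NIPQUAD macro of that era:

/* old: shared static buffer, racy if two reporters overlap */
printk(KERN_INFO "ICMP: %s: Source Route Failed.\n", in_ntoa(iph->daddr));

/* new: no intermediate buffer, four ints straight into the format string */
printk(KERN_INFO "ICMP: %d.%d.%d.%d: Source Route Failed.\n",
       NIPQUAD(iph->daddr));
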
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 68e52633e..61c530418 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -8,7 +8,7 @@
* the older version didn't come out right using gcc 2.5.8, the newer one
* seems to fall out with gcc 2.6.2.
*
- * Version: $Id: igmp.c,v 1.30 1999/03/25 10:04:10 davem Exp $
+ * Version: $Id: igmp.c,v 1.32 1999/06/09 10:10:53 davem Exp $
*
* Authors:
* Alan Cox <Alan.Cox@linux.org>
@@ -97,6 +97,15 @@
#include <linux/mroute.h>
#endif
+/* Big mc list lock for all the devices */
+static rwlock_t ip_mc_lock = RW_LOCK_UNLOCKED;
+/* Big mc list semaphore for all the sockets.
+ We do not refer to this list in IP data paths or from BH,
+ so that semaphore is OK.
+ */
+DECLARE_MUTEX(ip_sk_mc_sem);
+
+
#define IP_MAX_MEMBERSHIPS 20
#ifdef CONFIG_IP_MULTICAST
@@ -216,6 +225,8 @@ static void igmp_timer_expire(unsigned long data)
struct in_device *in_dev = im->interface;
int err;
+ read_lock(&ip_mc_lock);
+
im->tm_running=0;
if (IGMP_V1_SEEN(in_dev))
@@ -234,6 +245,7 @@ static void igmp_timer_expire(unsigned long data)
igmp_start_timer(im, IGMP_Unsolicited_Report_Interval);
}
im->reporter = 1;
+ read_unlock(&ip_mc_lock);
}
static void igmp_heard_report(struct in_device *in_dev, u32 group)
@@ -245,14 +257,16 @@ static void igmp_heard_report(struct in_device *in_dev, u32 group)
if (LOCAL_MCAST(group))
return;
+ read_lock(&ip_mc_lock);
for (im=in_dev->mc_list; im!=NULL; im=im->next) {
if (im->multiaddr == group) {
igmp_stop_timer(im);
im->reporter = 0;
im->unsolicit_count = 0;
- return;
+ break;
}
}
+ read_unlock(&ip_mc_lock);
}
static void igmp_heard_query(struct in_device *in_dev, unsigned char max_resp_time,
@@ -281,6 +295,7 @@ static void igmp_heard_query(struct in_device *in_dev, unsigned char max_resp_ti
* - Use the igmp->igmp_code field as the maximum
* delay possible
*/
+ read_lock(&ip_mc_lock);
for (im=in_dev->mc_list; im!=NULL; im=im->next) {
if (group && group != im->multiaddr)
continue;
@@ -291,6 +306,7 @@ static void igmp_heard_query(struct in_device *in_dev, unsigned char max_resp_ti
igmp_stop_timer(im);
igmp_start_timer(im, max_delay);
}
+ read_unlock(&ip_mc_lock);
}
int igmp_rcv(struct sk_buff *skb, unsigned short len)
@@ -380,9 +396,7 @@ static void igmp_group_dropped(struct ip_mc_list *im)
if (LOCAL_MCAST(im->multiaddr))
return;
- start_bh_atomic();
igmp_stop_timer(im);
- end_bh_atomic();
if (im->reporter && !IGMP_V1_SEEN(im->interface))
igmp_send_report(im->interface->dev, im->multiaddr, IGMP_HOST_LEAVE_MESSAGE);
@@ -400,9 +414,7 @@ static void igmp_group_added(struct ip_mc_list *im)
if (LOCAL_MCAST(im->multiaddr))
return;
- start_bh_atomic();
igmp_start_timer(im, IGMP_Initial_Report_Delay);
- end_bh_atomic();
#endif
}
@@ -422,16 +434,17 @@ void ip_mc_inc_group(struct in_device *in_dev, u32 addr)
im = (struct ip_mc_list *)kmalloc(sizeof(*im), GFP_KERNEL);
+ write_lock_bh(&ip_mc_lock);
for (i=in_dev->mc_list; i; i=i->next) {
if (i->multiaddr == addr) {
i->users++;
if (im)
kfree(im);
- return;
+ goto out;
}
}
if (!im)
- return;
+ goto out;
im->users=1;
im->interface=in_dev;
im->multiaddr=addr;
@@ -447,9 +460,13 @@ void ip_mc_inc_group(struct in_device *in_dev, u32 addr)
im->next=in_dev->mc_list;
in_dev->mc_list=im;
igmp_group_added(im);
+ write_unlock_bh(&ip_mc_lock);
if (in_dev->dev->flags & IFF_UP)
ip_rt_multicast_event(in_dev);
return;
+out:
+ write_unlock_bh(&ip_mc_lock);
+ return;
}
/*
@@ -458,22 +475,27 @@ void ip_mc_inc_group(struct in_device *in_dev, u32 addr)
int ip_mc_dec_group(struct in_device *in_dev, u32 addr)
{
+ int err = -ESRCH;
struct ip_mc_list *i, **ip;
+ write_lock_bh(&ip_mc_lock);
for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) {
if (i->multiaddr==addr) {
if (--i->users == 0) {
*ip = i->next;
- synchronize_bh();
-
igmp_group_dropped(i);
+
+ write_unlock_bh(&ip_mc_lock);
if (in_dev->dev->flags & IFF_UP)
ip_rt_multicast_event(in_dev);
kfree_s(i, sizeof(*i));
+ return 0;
}
- return 0;
+ err = 0;
+ break;
}
}
+ write_unlock_bh(&ip_mc_lock);
return -ESRCH;
}
@@ -483,8 +505,10 @@ void ip_mc_down(struct in_device *in_dev)
{
struct ip_mc_list *i;
+ read_lock_bh(&ip_mc_lock);
for (i=in_dev->mc_list; i; i=i->next)
igmp_group_dropped(i);
+ read_unlock_bh(&ip_mc_lock);
ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);
}
@@ -497,8 +521,10 @@ void ip_mc_up(struct in_device *in_dev)
ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
+ read_lock_bh(&ip_mc_lock);
for (i=in_dev->mc_list; i; i=i->next)
igmp_group_added(i);
+ read_unlock_bh(&ip_mc_lock);
}
/*
@@ -509,11 +535,13 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
{
struct ip_mc_list *i;
+ write_lock_bh(&ip_mc_lock);
while ((i = in_dev->mc_list) != NULL) {
in_dev->mc_list = i->next;
igmp_group_dropped(i);
kfree_s(i, sizeof(*i));
}
+ write_unlock_bh(&ip_mc_lock);
}
static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr)
@@ -570,6 +598,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
err = -EADDRINUSE;
+ down(&ip_sk_mc_sem);
for (i=sk->ip_mc_list; i; i=i->next) {
if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) {
/* New style additions are reference counted */
@@ -577,13 +606,13 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
i->count++;
err = 0;
}
- goto done;
+ goto done_unlock;
}
count++;
}
err = -ENOBUFS;
if (iml == NULL || count >= sysctl_igmp_max_memberships)
- goto done;
+ goto done_unlock;
memcpy(&iml->multi, imr, sizeof(*imr));
iml->next = sk->ip_mc_list;
iml->count = 1;
@@ -591,6 +620,9 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
ip_mc_inc_group(in_dev, addr);
iml = NULL;
err = 0;
+
+done_unlock:
+ up(&ip_sk_mc_sem);
done:
rtnl_shunlock();
if (iml)
@@ -606,6 +638,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
{
struct ip_mc_socklist *iml, **imlp;
+ down(&ip_sk_mc_sem);
for (imlp=&sk->ip_mc_list; (iml=*imlp)!=NULL; imlp=&iml->next) {
if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr &&
iml->multi.imr_address.s_addr==imr->imr_address.s_addr &&
@@ -615,7 +648,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
return 0;
*imlp = iml->next;
- synchronize_bh();
+ up(&ip_sk_mc_sem);
in_dev = inetdev_by_index(iml->multi.imr_ifindex);
if (in_dev)
@@ -624,6 +657,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
return 0;
}
}
+ up(&ip_sk_mc_sem);
return -EADDRNOTAVAIL;
}
@@ -635,13 +669,37 @@ void ip_mc_drop_socket(struct sock *sk)
{
struct ip_mc_socklist *iml;
+ down(&ip_sk_mc_sem);
while ((iml=sk->ip_mc_list) != NULL) {
struct in_device *in_dev;
sk->ip_mc_list = iml->next;
+ up(&ip_sk_mc_sem);
+
if ((in_dev = inetdev_by_index(iml->multi.imr_ifindex)) != NULL)
ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
sock_kfree_s(sk, iml, sizeof(*iml));
+
+ down(&ip_sk_mc_sem);
}
+ up(&ip_sk_mc_sem);
+}
+
+int ip_check_mc(struct device *dev, u32 mc_addr)
+{
+ struct in_device *in_dev = dev->ip_ptr;
+ struct ip_mc_list *im;
+
+ if (in_dev) {
+ read_lock(&ip_mc_lock);
+ for (im=in_dev->mc_list; im; im=im->next) {
+ if (im->multiaddr == mc_addr) {
+ read_unlock(&ip_mc_lock);
+ return 1;
+ }
+ }
+ read_unlock(&ip_mc_lock);
+ }
+ return 0;
}
@@ -653,11 +711,11 @@ int ip_mc_procinfo(char *buffer, char **start, off_t offset, int length, int dum
struct ip_mc_list *im;
int len=0;
struct device *dev;
-
+
len=sprintf(buffer,"Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n");
-
- for(dev = dev_base; dev; dev = dev->next)
- {
+
+ read_lock(&dev_base_lock);
+ for(dev = dev_base; dev; dev = dev->next) {
struct in_device *in_dev = dev->ip_ptr;
char *querier = "NONE";
@@ -669,6 +727,7 @@ int ip_mc_procinfo(char *buffer, char **start, off_t offset, int length, int dum
len+=sprintf(buffer+len,"%d\t%-10s: %5d %7s\n",
dev->ifindex, dev->name, dev->mc_count, querier);
+ read_lock(&ip_mc_lock);
for (im = in_dev->mc_list; im; im = im->next) {
len+=sprintf(buffer+len,
"\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n",
@@ -681,11 +740,16 @@ int ip_mc_procinfo(char *buffer, char **start, off_t offset, int length, int dum
len=0;
begin=pos;
}
- if(pos>offset+length)
+ if(pos>offset+length) {
+ read_unlock(&ip_mc_lock);
goto done;
+ }
}
+ read_unlock(&ip_mc_lock);
}
done:
+ read_unlock(&dev_base_lock);
+
*start=buffer+(offset-begin);
len-=(offset-begin);
if(len>length)
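
igmp.c ends up with two kinds of protection: ip_mc_lock, an rwlock for the per-device mc_list that is also touched from BH, and ip_sk_mc_sem, a semaphore for the per-socket membership list that is only used from process context and may sleep. A minimal sketch of the semaphore side, assuming the 2.3-era DECLARE_MUTEX/down/up API; the demo_ helper and its simplified list handling are illustrative:

static DECLARE_MUTEX(demo_mc_sem);	/* process context only, may sleep */

/* Sketch: add a membership record to a socket's list. */
static void demo_add_membership(struct sock *sk, struct ip_mc_socklist *iml)
{
	down(&demo_mc_sem);		/* serialize concurrent setsockopt() callers */
	iml->next = sk->ip_mc_list;
	sk->ip_mc_list = iml;
	up(&demo_mc_sem);
}
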
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index f066e6073..29747fee6 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -5,7 +5,7 @@
*
* The IP fragmentation functionality.
*
- * Version: $Id: ip_fragment.c,v 1.40 1999/03/20 23:58:34 davem Exp $
+ * Version: $Id: ip_fragment.c,v 1.41 1999/05/27 00:38:07 davem Exp $
*
* Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
* Alan Cox <Alan.Cox@linux.org>
@@ -71,7 +71,8 @@ struct ipq {
#define IPQ_HASHSZ 64
-struct ipq *ipq_hash[IPQ_HASHSZ];
+static struct ipq *ipq_hash[IPQ_HASHSZ];
+static spinlock_t ipfrag_lock = SPIN_LOCK_UNLOCKED;
#define ipqhashfn(id, saddr, daddr, prot) \
((((id) >> 1) ^ (saddr) ^ (daddr) ^ (prot)) & (IPQ_HASHSZ - 1))
@@ -141,7 +142,9 @@ static inline struct ipq *ip_find(struct iphdr *iph, struct dst_entry *dst)
unsigned int hash = ipqhashfn(id, saddr, daddr, protocol);
struct ipq *qp;
- /* Always, we are in a BH context, so no locking. -DaveM */
+ /* We are always in BH context, and protected by the
+ * ipfrag lock.
+ */
for(qp = ipq_hash[hash]; qp; qp = qp->next) {
if(qp->iph->id == id &&
qp->iph->saddr == saddr &&
@@ -158,8 +161,9 @@ static inline struct ipq *ip_find(struct iphdr *iph, struct dst_entry *dst)
* because we completed, reassembled and processed it, or because
* it timed out.
*
- * This is called _only_ from BH contexts, on packet reception
- * processing and from frag queue expiration timers. -DaveM
+ * This is called _only_ from BH contexts with the ipfrag lock held,
+ * on packet reception processing and from frag queue expiration
+ * timers. -DaveM
*/
static void ip_free(struct ipq *qp)
{
@@ -197,6 +201,7 @@ static void ip_expire(unsigned long arg)
{
struct ipq *qp = (struct ipq *) arg;
+ spin_lock(&ipfrag_lock);
if(!qp->fragments)
{
#ifdef IP_EXPIRE_DEBUG
@@ -213,10 +218,13 @@ static void ip_expire(unsigned long arg)
out:
/* Nuke the fragment queue. */
ip_free(qp);
+ spin_unlock(&ipfrag_lock);
}
/* Memory limiting on fragments. Evictor trashes the oldest
* fragment queue until we are back under the low threshold.
+ *
+ * We are always called in BH with the ipfrag lock held.
*/
static void ip_evictor(void)
{
@@ -229,9 +237,6 @@ restart:
struct ipq *qp;
if (atomic_read(&ip_frag_mem) <= sysctl_ipfrag_low_thresh)
return;
- /* We are in a BH context, so these queue
- * accesses are safe. -DaveM
- */
qp = ipq_hash[i];
if (qp) {
/* find the oldest queue for this hash bucket */
@@ -283,7 +288,7 @@ static struct ipq *ip_create(struct sk_buff *skb, struct iphdr *iph)
/* Add this entry to the queue. */
hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
- /* We are in a BH context, no locking necessary. -DaveM */
+ /* In a BH context and ipfrag lock is held. -DaveM */
if((qp->next = ipq_hash[hash]) != NULL)
qp->next->pprev = &qp->next;
ipq_hash[hash] = qp;
@@ -421,6 +426,8 @@ struct sk_buff *ip_defrag(struct sk_buff *skb)
ip_statistics.IpReasmReqds++;
+ spin_lock(&ipfrag_lock);
+
/* Start by cleaning up the memory. */
if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh)
ip_evictor();
@@ -565,6 +572,7 @@ struct sk_buff *ip_defrag(struct sk_buff *skb)
out_freequeue:
ip_free(qp);
out_skb:
+ spin_unlock(&ipfrag_lock);
return skb;
}
@@ -574,6 +582,7 @@ out_skb:
out_timer:
mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time); /* ~ 30 seconds */
out:
+ spin_unlock(&ipfrag_lock);
return NULL;
/*
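
ip_fragment.c now serializes all access to the reassembly hash with the ipfrag_lock spinlock, taken both in ip_defrag and in the expiry timer, since either can run in BH context on any CPU. A minimal sketch of the timer side, assuming 2.3-era spinlock and timer APIs; the demo_ names are illustrative and the ICMP/statistics work is elided:

static spinlock_t demo_frag_lock = SPIN_LOCK_UNLOCKED;

/* Sketch: frag-queue expiry runs from a timer (BH), so spin_lock is enough. */
static void demo_expire(unsigned long arg)
{
	struct ipq *qp = (struct ipq *) arg;

	spin_lock(&demo_frag_lock);
	/* ... bump ReasmTimeout/ReasmFails, possibly send ICMP_TIME_EXCEEDED ... */
	ip_free(qp);			/* unlink and free the queue under the lock */
	spin_unlock(&demo_frag_lock);
}
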
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 7a3e2618b..107ccaa16 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -5,7 +5,7 @@
*
* The Internet Protocol (IP) module.
*
- * Version: $Id: ip_input.c,v 1.37 1999/04/22 10:38:36 davem Exp $
+ * Version: $Id: ip_input.c,v 1.40 1999/06/09 10:10:55 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -154,44 +154,11 @@
struct ip_mib ip_statistics={2,IPDEFTTL,}; /* Forwarding=No, Default TTL=64 */
-
-/*
- * Handle the issuing of an ioctl() request
- * for the ip device. This is scheduled to
- * disappear
- */
-
-int ip_ioctl(struct sock *sk, int cmd, unsigned long arg)
-{
- switch(cmd)
- {
- default:
- return(-EINVAL);
- }
-}
-
-
#if defined(CONFIG_IP_TRANSPARENT_PROXY) && !defined(CONFIG_IP_ALWAYS_DEFRAG)
#define CONFIG_IP_ALWAYS_DEFRAG 1
#endif
/*
- * 0 - deliver
- * 1 - block
- */
-static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
-{
- int type;
-
- type = skb->h.icmph->type;
- if (type < 32)
- return test_bit(type, &sk->tp_pinfo.tp_raw4.filter);
-
- /* Do not block unknown ICMP types */
- return 0;
-}
-
-/*
* Process Router Attention IP option
*/
int ip_call_ra_chain(struct sk_buff *skb)
@@ -224,16 +191,37 @@ int ip_call_ra_chain(struct sk_buff *skb)
return 0;
}
+/* Handle this out of line, it is rare. */
+static int ip_run_ipprot(struct sk_buff *skb, struct iphdr *iph,
+ struct inet_protocol *ipprot, int force_copy)
+{
+ int ret = 0;
+
+ do {
+ if (ipprot->protocol == iph->protocol) {
+ struct sk_buff *skb2 = skb;
+ if (ipprot->copy || force_copy)
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if(skb2 != NULL) {
+ ret = 1;
+ ipprot->handler(skb2,
+ ntohs(iph->tot_len) - (iph->ihl * 4));
+ }
+ }
+ ipprot = (struct inet_protocol *) ipprot->next;
+ } while(ipprot != NULL);
+
+ return ret;
+}
+
+extern struct sock *raw_v4_input(struct sk_buff *, struct iphdr *, int);
+
/*
* Deliver IP Packets to the higher protocol layers.
*/
int ip_local_deliver(struct sk_buff *skb)
{
struct iphdr *iph = skb->nh.iph;
- struct inet_protocol *ipprot;
- struct sock *raw_sk=NULL;
- unsigned char hash;
- int flag = 0;
#ifndef CONFIG_IP_ALWAYS_DEFRAG
/*
@@ -249,34 +237,29 @@ int ip_local_deliver(struct sk_buff *skb)
#endif
#ifdef CONFIG_IP_MASQUERADE
- /*
- * Do we need to de-masquerade this packet?
- */
- {
- int ret;
- /*
- * Some masq modules can re-inject packets if
- * bad configured.
+ /* Do we need to de-masquerade this packet? */
+ if((IPCB(skb)->flags&IPSKB_MASQUERADED)) {
+ /* Some masq modules can re-inject packets if
+ * bad configured.
*/
+ printk(KERN_DEBUG "ip_input(): demasq recursion detected. "
+ "Check masq modules configuration\n");
+ kfree_skb(skb);
+ return 0;
+ } else {
+ int ret = ip_fw_demasquerade(&skb);
- if((IPCB(skb)->flags&IPSKB_MASQUERADED)) {
- printk(KERN_DEBUG "ip_input(): demasq recursion detected. Check masq modules configuration\n");
- kfree_skb(skb);
- return 0;
- }
-
- ret = ip_fw_demasquerade(&skb);
if (ret < 0) {
kfree_skb(skb);
return 0;
}
-
if (ret) {
- iph=skb->nh.iph;
+ iph = skb->nh.iph;
IPCB(skb)->flags |= IPSKB_MASQUERADED;
dst_release(skb->dst);
skb->dst = NULL;
- if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev)) {
+ if (ip_route_input(skb, iph->daddr, iph->saddr,
+ iph->tos, skb->dev)) {
kfree_skb(skb);
return 0;
}
@@ -285,112 +268,50 @@ int ip_local_deliver(struct sk_buff *skb)
}
#endif
- /*
- * Point into the IP datagram, just past the header.
- */
-
+ /* Point into the IP datagram, just past the header. */
skb->h.raw = skb->nh.raw + iph->ihl*4;
- /*
- * Deliver to raw sockets. This is fun as to avoid copies we want to make no
- * surplus copies.
- *
- * RFC 1122: SHOULD pass TOS value up to the transport layer.
- * -> It does. And not only TOS, but all IP header.
- */
-
- /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
- hash = iph->protocol & (MAX_INET_PROTOS - 1);
-
- /*
- * If there maybe a raw socket we must check - if not we don't care less
- */
-
- if((raw_sk = raw_v4_htable[hash]) != NULL) {
- struct sock *sknext = NULL;
- struct sk_buff *skb1;
- raw_sk = raw_v4_lookup(raw_sk, iph->protocol, iph->saddr, iph->daddr, skb->dev->ifindex);
- if(raw_sk) { /* Any raw sockets */
- do {
- /* Find the next */
- sknext = raw_v4_lookup(raw_sk->next, iph->protocol,
- iph->saddr, iph->daddr, skb->dev->ifindex);
- if (iph->protocol != IPPROTO_ICMP || !icmp_filter(raw_sk, skb)) {
- if (sknext == NULL)
- break;
- skb1 = skb_clone(skb, GFP_ATOMIC);
- if(skb1)
- {
- raw_rcv(raw_sk, skb1);
- }
- }
- raw_sk = sknext;
- } while(raw_sk!=NULL);
-
- /* Here either raw_sk is the last raw socket, or NULL if
- * none. We deliver to the last raw socket AFTER the
- * protocol checks as it avoids a surplus copy.
- */
- }
- }
-
- /*
- * skb->h.raw now points at the protocol beyond the IP header.
- */
-
- for (ipprot = (struct inet_protocol *)inet_protos[hash];ipprot != NULL;ipprot=(struct inet_protocol *)ipprot->next)
{
- struct sk_buff *skb2;
-
- if (ipprot->protocol != iph->protocol)
- continue;
- /*
- * See if we need to make a copy of it. This will
- * only be set if more than one protocol wants it.
- * and then not for the last one. If there is a pending
- * raw delivery wait for that
+ /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
+ int hash = iph->protocol & (MAX_INET_PROTOS - 1);
+ struct sock *raw_sk = raw_v4_htable[hash];
+ struct inet_protocol *ipprot;
+ int flag;
+
+ /* If there maybe a raw socket we must check - if not we
+ * don't care less
*/
-
- if (ipprot->copy || raw_sk)
- {
- skb2 = skb_clone(skb, GFP_ATOMIC);
- if(skb2==NULL)
- continue;
- }
- else
- {
- skb2 = skb;
- }
- flag = 1;
+ if(raw_sk != NULL)
+ raw_sk = raw_v4_input(skb, iph, hash);
+
+ ipprot = (struct inet_protocol *) inet_protos[hash];
+ flag = 0;
+ if(ipprot != NULL) {
+ if(raw_sk == NULL &&
+ ipprot->next == NULL &&
+ ipprot->protocol == iph->protocol) {
+ /* Fast path... */
+ return ipprot->handler(skb, (ntohs(iph->tot_len) -
+ (iph->ihl * 4)));
+ } else {
+ flag = ip_run_ipprot(skb, iph, ipprot, (raw_sk != NULL));
+ }
+ }
- /*
- * Pass on the datagram to each protocol that wants it,
- * based on the datagram protocol. We should really
- * check the protocol handler's return values here...
+ /* All protocols checked.
+ * If this packet was a broadcast, we may *not* reply to it, since that
+ * causes (proven, grin) ARP storms and a leakage of memory (i.e. all
+ * ICMP reply messages get queued up for transmission...)
*/
-
- ipprot->handler(skb2, ntohs(iph->tot_len) - (iph->ihl * 4));
- }
-
- /*
- * All protocols checked.
- * If this packet was a broadcast, we may *not* reply to it, since that
- * causes (proven, grin) ARP storms and a leakage of memory (i.e. all
- * ICMP reply messages get queued up for transmission...)
- */
-
- if(raw_sk!=NULL) /* Shift to last raw user */
- {
- raw_rcv(raw_sk, skb);
-
- }
- else if (!flag) /* Free and report errors */
- {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);
- kfree_skb(skb);
+ if(raw_sk != NULL) { /* Shift to last raw user */
+ raw_rcv(raw_sk, skb);
+ } else if (!flag) { /* Free and report errors */
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);
+ kfree_skb(skb);
+ }
}
- return(0);
+ return 0;
}
/*
@@ -404,9 +325,8 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
u16 rport;
#endif /* CONFIG_FIREWALL */
- /*
- * When the interface is in promisc. mode, drop all the crap
- * that it receives, do not try to analyse it.
+ /* When the interface is in promisc. mode, drop all the crap
+ * that it receives, do not try to analyse it.
*/
if (skb->pkt_type == PACKET_OTHERHOST)
goto drop;
@@ -430,17 +350,15 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
goto inhdr_error;
{
- __u32 len = ntohs(iph->tot_len);
- if (skb->len < len)
- goto inhdr_error;
+ __u32 len = ntohs(iph->tot_len);
+ if (skb->len < len)
+ goto inhdr_error;
- /*
- * Our transport medium may have padded the buffer out. Now we know it
- * is IP we can trim to the true length of the frame.
- * Note this now means skb->len holds ntohs(iph->tot_len).
- */
-
- __skb_trim(skb, len);
+ /* Our transport medium may have padded the buffer out. Now we know it
+ * is IP we can trim to the true length of the frame.
+ * Note this now means skb->len holds ntohs(iph->tot_len).
+ */
+ __skb_trim(skb, len);
}
#ifdef CONFIG_IP_ALWAYS_DEFRAG
@@ -474,21 +392,17 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
if (skb->dst == NULL) {
if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))
goto drop;
-#ifdef CONFIG_CPU_IS_SLOW
- if (net_cpu_congestion > 10 && !(iph->tos&IPTOS_RELIABILITY) &&
- IPTOS_PREC(iph->tos) < IPTOS_PREC_INTERNETCONTROL) {
- goto drop;
- }
-#endif
}
#ifdef CONFIG_NET_CLS_ROUTE
if (skb->dst->tclassid) {
u32 idx = skb->dst->tclassid;
+ write_lock(&ip_rt_acct_lock);
ip_rt_acct[idx&0xFF].o_packets++;
ip_rt_acct[idx&0xFF].o_bytes+=skb->len;
ip_rt_acct[(idx>>16)&0xFF].i_packets++;
ip_rt_acct[(idx>>16)&0xFF].i_bytes+=skb->len;
+ write_unlock(&ip_rt_acct_lock);
}
#endif
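
ip_local_deliver is restructured so that the common case, a single registered handler for the protocol and no raw sockets, hands the skb straight to that handler without cloning; everything else drops into the out-of-line ip_run_ipprot(), and raw delivery moves to raw_v4_input() in raw.c. A minimal sketch of that dispatch decision, assembled from the hunk above; variable names follow the patch:

int hash = iph->protocol & (MAX_INET_PROTOS - 1);
struct inet_protocol *ipprot = (struct inet_protocol *) inet_protos[hash];
int flag = 0;

if (ipprot != NULL) {
	if (raw_sk == NULL && ipprot->next == NULL &&
	    ipprot->protocol == iph->protocol) {
		/* Fast path: exactly one handler, no copies needed. */
		return ipprot->handler(skb, ntohs(iph->tot_len) - (iph->ihl * 4));
	}
	/* Slow path: clone once per extra consumer (and for a pending raw_sk). */
	flag = ip_run_ipprot(skb, iph, ipprot, raw_sk != NULL);
}
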
diff --git a/net/ipv4/ip_masq_mfw.c b/net/ipv4/ip_masq_mfw.c
index dc38b1712..ff07231fc 100644
--- a/net/ipv4/ip_masq_mfw.c
+++ b/net/ipv4/ip_masq_mfw.c
@@ -3,7 +3,7 @@
*
* Does (reverse-masq) forwarding based on skb->fwmark value
*
- * $Id: ip_masq_mfw.c,v 1.3 1999/01/26 05:33:47 davem Exp $
+ * $Id: ip_masq_mfw.c,v 1.4 1999/05/13 23:25:07 davem Exp $
*
* Author: Juan Jose Ciarlante <jjciarla@raiz.uncu.edu.ar>
* based on Steven Clarke's portfw
@@ -79,7 +79,7 @@ struct ip_masq_mfw {
};
-static struct semaphore mfw_sema = MUTEX;
+static DECLARE_MUTEX(mfw_sema);
#ifdef __SMP__
static rwlock_t mfw_lock = RW_LOCK_UNLOCKED;
#endif
diff --git a/net/ipv4/ip_masq_quake.c b/net/ipv4/ip_masq_quake.c
index 165dd6bd5..17b11a799 100644
--- a/net/ipv4/ip_masq_quake.c
+++ b/net/ipv4/ip_masq_quake.c
@@ -12,6 +12,7 @@
* http://www.gamers.org/dEngine/quake/spec/
* Harald Hoyer : Check for QUAKE-STRING
* Juan Jose Ciarlante : litl bits for 2.1
+ * Horst von Brand : Add #include <linux/string.h>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -24,6 +25,7 @@
#include <linux/module.h>
#include <asm/system.h>
#include <linux/types.h>
+#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/in.h>
@@ -44,7 +46,7 @@ typedef struct
struct quake_priv_data {
/* Have we seen a client connect message */
- char cl_connect;
+ signed char cl_connect;
};
static int
diff --git a/net/ipv4/ip_masq_vdolive.c b/net/ipv4/ip_masq_vdolive.c
index 4724e3b93..2d8d672cc 100644
--- a/net/ipv4/ip_masq_vdolive.c
+++ b/net/ipv4/ip_masq_vdolive.c
@@ -2,7 +2,7 @@
* IP_MASQ_VDOLIVE - VDO Live masquerading module
*
*
- * Version: @(#)$Id: ip_masq_vdolive.c,v 1.4 1998/10/06 04:49:07 davem Exp $
+ * Version: @(#)$Id: ip_masq_vdolive.c,v 1.6 1999/06/09 08:29:03 davem Exp $
*
* Author: Nigel Metheringham <Nigel.Metheringham@ThePLAnet.net>
* PLAnet Online Ltd
@@ -10,6 +10,9 @@
* Fixes: Minor changes for 2.1 by
* Steven Clarke <Steven.Clarke@ThePlanet.Net>, Planet Online Ltd
*
+ * Add missing #include <linux/string.h>
+ * Horst von Brand <vonbrand@sleipnir.valparaiso.cl>
+ *
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
@@ -25,6 +28,7 @@
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
+#include <linux/string.h>
#include <linux/kernel.h>
#include <asm/system.h>
#include <linux/skbuff.h>
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index fae22cbe7..359926a4c 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -5,7 +5,7 @@
*
* The options processing module for ip.c
*
- * Version: $Id: ip_options.c,v 1.16 1999/03/21 05:22:40 davem Exp $
+ * Version: $Id: ip_options.c,v 1.18 1999/06/09 08:29:06 davem Exp $
*
* Authors: A.N.Kuznetsov
*
@@ -452,7 +452,6 @@ eol:
error:
if (skb) {
icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24));
- kfree_skb(skb);
}
return -EINVAL;
}
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index abe93ec27..51e27ad67 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1,5 +1,5 @@
/*
- * $Id: ipconfig.c,v 1.20 1999/03/28 10:18:28 davem Exp $
+ * $Id: ipconfig.c,v 1.22 1999/06/09 10:10:57 davem Exp $
*
* Automatic Configuration of IP -- use BOOTP or RARP or user-supplied
* information to configure own IP address and routes.
@@ -112,7 +112,8 @@ static int __init ic_open_devs(void)
unsigned short oflags;
last = &ic_first_dev;
- for (dev = dev_base; dev; dev = dev->next)
+ read_lock(&dev_base_lock);
+ for (dev = dev_base; dev; dev = dev->next) {
if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
(!(dev->flags & IFF_LOOPBACK) &&
(dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) &&
@@ -142,6 +143,9 @@ static int __init ic_open_devs(void)
ic_proto_have_if |= able;
DBG(("IP-Config: Opened %s (able=%d)\n", dev->name, able));
}
+ }
+ read_unlock(&dev_base_lock);
+
*last = NULL;
if (!ic_first_dev) {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index d7db0c007..1034e0e7a 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1,7 +1,7 @@
/*
* IP multicast routing support for mrouted 3.6/3.8
*
- * (c) 1995 Alan Cox, <alan@cymru.net>
+ * (c) 1995 Alan Cox, <alan@redhat.com>
* Linux Consultancy and Custom Driver Development
*
* This program is free software; you can redistribute it and/or
@@ -9,7 +9,7 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Version: $Id: ipmr.c,v 1.40 1999/03/25 10:04:25 davem Exp $
+ * Version: $Id: ipmr.c,v 1.43 1999/06/09 10:10:59 davem Exp $
*
* Fixes:
* Michael Chastain : Incorrect size of copying.
@@ -23,6 +23,8 @@
* Brad Parker : Better behaviour on mrouted upcall
* overflow.
* Carlos Picoto : PIMv1 Support
+ * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header
+ * Relax this requirement to work with older peers.
*
*/
@@ -431,7 +433,7 @@ static void ipmr_cache_resolve(struct mfc_cache *cache)
skb_trim(skb, nlh->nlmsg_len);
((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE;
}
- err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+ err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
} else
#endif
ip_mr_forward(skb, cache, 0);
@@ -1343,7 +1345,8 @@ int pim_rcv(struct sk_buff * skb, unsigned short len)
pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
(pim->flags&PIM_NULL_REGISTER) ||
reg_dev == NULL ||
- ip_compute_csum((void *)pim, len)) {
+ (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
+ ip_compute_csum((void *)pim, len))) {
kfree_skb(skb);
return -EINVAL;
}
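
The pim_rcv change accepts a PIMv2 Register whose checksum covers only the PIM header (as newer implementations send) or the whole packet (older peers), and rejects it only when both computations fail. A minimal sketch of that acceptance test, assuming ip_compute_csum() returns 0 for a valid checksum; the helper name and the explicit hdr_len parameter are illustrative (the patch uses sizeof(*pim)):

/* Sketch: a register passes if either checksum interpretation is valid. */
static int demo_pim_csum_ok(void *pim, int hdr_len, int len)
{
	return ip_compute_csum(pim, hdr_len) == 0 ||
	       ip_compute_csum(pim, len) == 0;
}
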
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 1640a0560..52c5ee5a4 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -7,7 +7,7 @@
* PROC file system. It is mainly used for debugging and
* statistics.
*
- * Version: $Id: proc.c,v 1.34 1999/02/08 11:20:34 davem Exp $
+ * Version: $Id: proc.c,v 1.35 1999/05/27 00:37:38 davem Exp $
*
* Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de>
@@ -114,10 +114,8 @@ static inline void get__sock(struct sock *sp, char *tmpbuf, int i, int format)
slot_dist = tcp_tw_death_row_slot - slot_dist;
timer_expires = jiffies + (slot_dist * TCP_TWKILL_PERIOD);
} else {
- timer_active1 = del_timer(&tp->retransmit_timer);
- timer_active2 = del_timer(&sp->timer);
- if (!timer_active1) tp->retransmit_timer.expires=0;
- if (!timer_active2) sp->timer.expires=0;
+ timer_active1 = tp->retransmit_timer.prev != NULL;
+ timer_active2 = sp->timer.prev != NULL;
timer_active = 0;
timer_expires = (unsigned) -1;
}
@@ -147,9 +145,6 @@ static inline void get__sock(struct sock *sp, char *tmpbuf, int i, int format)
(!tw_bucket && sp->socket) ? sp->socket->inode->i_uid : 0,
(!tw_bucket && timer_active) ? sp->timeout : 0,
(!tw_bucket && sp->socket) ? sp->socket->inode->i_ino : 0);
-
- if (timer_active1) add_timer(&tp->retransmit_timer);
- if (timer_active2) add_timer(&sp->timer);
}
/*
@@ -176,7 +171,7 @@ get__netinfo(struct proto *pro, char *buffer, int format, char **start, off_t of
" sl local_address rem_address st tx_queue "
"rx_queue tr tm->when retrnsmt uid timeout inode");
pos = 128;
- SOCKHASH_LOCK();
+ SOCKHASH_LOCK_READ();
sp = pro->sklist_next;
while(sp != (struct sock *)pro) {
if (format == 0 && sp->state == TCP_LISTEN) {
@@ -211,7 +206,7 @@ get__netinfo(struct proto *pro, char *buffer, int format, char **start, off_t of
i++;
}
out:
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_READ();
begin = len - (pos - offset);
*start = buffer + begin;
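
The get__sock() change above stops deleting and re-adding the retransmit and keepalive timers just to learn whether they are running; with the timer implementation of this era a pending timer is simply one that is linked into the timer list, so its prev pointer is non-NULL, and that can be read without disturbing a live timer. A toy model of that read-only test follows; the list layout and names are illustrative only.

#include <stddef.h>

/* Toy doubly-linked timer list: a timer is "pending" exactly when it is
 * linked in, i.e. its prev pointer is non-NULL. The proc.c hunk above
 * relies on this instead of the old del_timer()/add_timer() round trip,
 * which could momentarily stop a running timer just to print /proc.
 */
struct toy_timer {
	struct toy_timer *next, *prev;
	unsigned long expires;
};

static int toy_timer_pending(const struct toy_timer *t)
{
	return t->prev != NULL;
}

static void toy_add_timer(struct toy_timer *head, struct toy_timer *t)
{
	t->next = head->next;
	t->prev = head;
	if (head->next)
		head->next->prev = t;
	head->next = t;
}

static void toy_del_timer(struct toy_timer *t)
{
	if (!t->prev)
		return;
	t->prev->next = t->next;
	if (t->next)
		t->next->prev = t->prev;
	t->next = t->prev = NULL;
}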
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index fc6b1f2ee..dd2e7555e 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -5,7 +5,7 @@
*
* RAW - implementation of IP "raw" sockets.
*
- * Version: $Id: raw.c,v 1.39 1998/11/08 11:17:04 davem Exp $
+ * Version: $Id: raw.c,v 1.41 1999/05/30 01:16:19 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -75,11 +75,11 @@ static void raw_v4_hash(struct sock *sk)
num &= (RAWV4_HTABLE_SIZE - 1);
skp = &raw_v4_htable[num];
- SOCKHASH_LOCK();
+ SOCKHASH_LOCK_WRITE();
sk->next = *skp;
*skp = sk;
sk->hashent = num;
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_WRITE();
}
static void raw_v4_unhash(struct sock *sk)
@@ -90,7 +90,7 @@ static void raw_v4_unhash(struct sock *sk)
num &= (RAWV4_HTABLE_SIZE - 1);
skp = &raw_v4_htable[num];
- SOCKHASH_LOCK();
+ SOCKHASH_LOCK_WRITE();
while(*skp != NULL) {
if(*skp == sk) {
*skp = sk->next;
@@ -98,7 +98,7 @@ static void raw_v4_unhash(struct sock *sk)
}
skp = &((*skp)->next);
}
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_WRITE();
}
static void raw_v4_rehash(struct sock *sk)
@@ -110,7 +110,7 @@ static void raw_v4_rehash(struct sock *sk)
num &= (RAWV4_HTABLE_SIZE - 1);
skp = &raw_v4_htable[oldnum];
- SOCKHASH_LOCK();
+ SOCKHASH_LOCK_WRITE();
while(*skp != NULL) {
if(*skp == sk) {
*skp = sk->next;
@@ -121,16 +121,15 @@ static void raw_v4_rehash(struct sock *sk)
sk->next = raw_v4_htable[num];
raw_v4_htable[num] = sk;
sk->hashent = num;
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_WRITE();
}
-/* Grumble... icmp and ip_input want to get at this... */
-struct sock *raw_v4_lookup(struct sock *sk, unsigned short num,
- unsigned long raddr, unsigned long laddr, int dif)
+static __inline__ struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num,
+ unsigned long raddr, unsigned long laddr,
+ int dif)
{
struct sock *s = sk;
- SOCKHASH_LOCK();
for(s = sk; s; s = s->next) {
if((s->num == num) &&
!(s->dead && (s->state == TCP_CLOSE)) &&
@@ -139,10 +138,79 @@ struct sock *raw_v4_lookup(struct sock *sk, unsigned short num,
!(s->bound_dev_if && s->bound_dev_if != dif))
break; /* gotcha */
}
- SOCKHASH_UNLOCK();
return s;
}
+struct sock *raw_v4_lookup(struct sock *sk, unsigned short num,
+ unsigned long raddr, unsigned long laddr,
+ int dif)
+{
+ SOCKHASH_LOCK_READ();
+ sk = __raw_v4_lookup(sk, num, raddr, laddr, dif);
+ SOCKHASH_UNLOCK_READ();
+
+ return sk;
+}
+
+/*
+ * 0 - deliver
+ * 1 - block
+ */
+static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
+{
+ int type;
+
+ type = skb->h.icmph->type;
+ if (type < 32)
+ return test_bit(type, &sk->tp_pinfo.tp_raw4.filter);
+
+ /* Do not block unknown ICMP types */
+ return 0;
+}
+
+/* IP input processing comes here for RAW socket delivery.
+ * This is fun, as to avoid copies we want to make no surplus
+ * copies.
+ *
+ * RFC 1122: SHOULD pass TOS value up to the transport layer.
+ * -> It does. And not only TOS, but the whole IP header.
+ */
+struct sock *raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
+{
+ struct sock *sk;
+
+ SOCKHASH_LOCK_READ_BH();
+ if ((sk = raw_v4_htable[hash]) == NULL)
+ goto out;
+ sk = __raw_v4_lookup(sk, iph->protocol,
+ iph->saddr, iph->daddr,
+ skb->dev->ifindex);
+ while(sk != NULL) {
+ struct sock *sknext = __raw_v4_lookup(sk->next, iph->protocol,
+ iph->saddr, iph->daddr,
+ skb->dev->ifindex);
+
+ if (iph->protocol != IPPROTO_ICMP ||
+ ! icmp_filter(sk, skb)) {
+ struct sk_buff *clone;
+
+ if(sknext == NULL)
+ break;
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if(clone) {
+ SOCKHASH_UNLOCK_READ_BH();
+ raw_rcv(sk, clone);
+ SOCKHASH_LOCK_READ_BH();
+ }
+ }
+ sk = sknext;
+ }
+out:
+ SOCKHASH_UNLOCK_READ_BH();
+
+ return sk;
+}
+
void raw_err (struct sock *sk, struct sk_buff *skb)
{
int type = skb->h.icmph->type;
@@ -402,6 +470,8 @@ done:
static void raw_close(struct sock *sk, long timeout)
{
+ bh_lock_sock(sk);
+
/* Observation: when raw_close is called, processes have
no access to socket anymore. But net still has.
Step one, detach it from networking:
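
raw_v4_input() above walks every raw socket bound to the protocol, cloning the skb for each matching socket except the last one, which is returned so the caller can deliver the original. For ICMP it first consults a per-socket filter through icmp_filter(): a set bit N in the 32-bit mask means "block ICMP type N", and unknown types (32 and above) are always delivered. A self-contained sketch of that bitmap test, with stand-in names rather than kernel API:

#include <stdint.h>
#include <stdio.h>

/* Per-socket ICMP filter: one bit per ICMP type 0..31.
 * A set bit means "block this type"; types >= 32 always pass,
 * matching the icmp_filter() logic in the raw.c hunk above.
 */
struct icmp_type_filter {
	uint32_t blocked;
};

static int icmp_blocked(const struct icmp_type_filter *f, unsigned type)
{
	if (type < 32)
		return (f->blocked >> type) & 1;
	return 0;		/* do not block unknown ICMP types */
}

int main(void)
{
	struct icmp_type_filter f = { 0 };

	f.blocked |= 1u << 8;	/* filter out echo requests, for example */
	printf("type 8 blocked: %d\n", icmp_blocked(&f, 8));
	printf("type 3 blocked: %d\n", icmp_blocked(&f, 3));
	printf("type 40 blocked: %d\n", icmp_blocked(&f, 40));
	return 0;
}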
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index dbde97b70..3d9e87de3 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -5,7 +5,7 @@
*
* ROUTE - implementation of the IP router.
*
- * Version: $Id: route.c,v 1.67 1999/05/08 20:00:20 davem Exp $
+ * Version: $Id: route.c,v 1.69 1999/06/09 10:11:02 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -174,7 +174,18 @@ __u8 ip_tos2prio[16] = {
* Route cache.
*/
-struct rtable *rt_hash_table[RT_HASH_DIVISOR];
+/* The locking scheme is rather straightforward:
+ *
+ * 1) A BH protected rwlock protects the central route hash.
+ * 2) Only writers remove entries, and they hold the lock
+ * as they look at rtable reference counts.
+ * 3) Only readers acquire references to rtable entries,
+ * they do so with atomic increments and with the
+ * lock held.
+ */
+
+static struct rtable *rt_hash_table[RT_HASH_DIVISOR];
+static rwlock_t rt_hash_lock = RW_LOCK_UNLOCKED;
static int rt_intern_hash(unsigned hash, struct rtable * rth, struct rtable ** res);
@@ -204,7 +215,7 @@ static int rt_cache_get_info(char *buffer, char **start, off_t offset, int lengt
}
- start_bh_atomic();
+ read_lock_bh(&rt_hash_lock);
for (i = 0; i<RT_HASH_DIVISOR; i++) {
for (r = rt_hash_table[i]; r; r = r->u.rt_next) {
@@ -239,7 +250,7 @@ static int rt_cache_get_info(char *buffer, char **start, off_t offset, int lengt
}
done:
- end_bh_atomic();
+ read_unlock_bh(&rt_hash_lock);
*start = buffer+len-(pos-offset);
len = pos-offset;
@@ -292,6 +303,7 @@ static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2)
return 1;
}
+/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
int i;
@@ -305,6 +317,7 @@ static void rt_check_expire(unsigned long dummy)
rover = (rover + 1) & (RT_HASH_DIVISOR-1);
rthp = &rt_hash_table[rover];
+ write_lock(&rt_hash_lock);
while ((rth = *rthp) != NULL) {
if (rth->u.dst.expires) {
/* Entry is expired even if it is in use */
@@ -325,6 +338,7 @@ static void rt_check_expire(unsigned long dummy)
*rthp = rth->u.rt_next;
rt_free(rth);
}
+ write_unlock(&rt_hash_lock);
/* Fallback loop breaker. */
if ((jiffies - now) > 0)
@@ -334,6 +348,9 @@ static void rt_check_expire(unsigned long dummy)
add_timer(&rt_periodic_timer);
}
+/* This can run from both BH and non-BH contexts, the latter
+ * in the case of a forced flush event.
+ */
static void rt_run_flush(unsigned long dummy)
{
int i;
@@ -341,23 +358,23 @@ static void rt_run_flush(unsigned long dummy)
rt_deadline = 0;
- start_bh_atomic();
for (i=0; i<RT_HASH_DIVISOR; i++) {
- if ((rth = xchg(&rt_hash_table[i], NULL)) == NULL)
- continue;
- end_bh_atomic();
+ write_lock_bh(&rt_hash_lock);
+ rth = rt_hash_table[i];
+ if(rth != NULL)
+ rt_hash_table[i] = NULL;
+ write_unlock_bh(&rt_hash_lock);
for (; rth; rth=next) {
next = rth->u.rt_next;
rth->u.rt_next = NULL;
rt_free(rth);
}
-
- start_bh_atomic();
}
- end_bh_atomic();
}
+static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;
+
void rt_cache_flush(int delay)
{
unsigned long now = jiffies;
@@ -366,7 +383,7 @@ void rt_cache_flush(int delay)
if (delay < 0)
delay = ip_rt_min_delay;
- start_bh_atomic();
+ spin_lock_bh(&rt_flush_lock);
if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
long tmo = (long)(rt_deadline - now);
@@ -386,7 +403,7 @@ void rt_cache_flush(int delay)
}
if (delay <= 0) {
- end_bh_atomic();
+ spin_unlock_bh(&rt_flush_lock);
rt_run_flush(0);
return;
}
@@ -396,7 +413,7 @@ void rt_cache_flush(int delay)
rt_flush_timer.expires = now + delay;
add_timer(&rt_flush_timer);
- end_bh_atomic();
+ spin_unlock_bh(&rt_flush_lock);
}
/*
@@ -459,7 +476,10 @@ static int rt_garbage_collect(void)
do {
int i, k;
- start_bh_atomic();
+ /* The write lock is held during the entire hash
+ * traversal to ensure consistent state of the rover.
+ */
+ write_lock_bh(&rt_hash_lock);
for (i=0, k=rover; i<RT_HASH_DIVISOR; i++) {
unsigned tmo = expire;
@@ -480,7 +500,7 @@ static int rt_garbage_collect(void)
break;
}
rover = k;
- end_bh_atomic();
+ write_unlock_bh(&rt_hash_lock);
if (goal <= 0)
goto work_done;
@@ -530,10 +550,9 @@ static int rt_intern_hash(unsigned hash, struct rtable * rt, struct rtable ** rp
int attempts = !in_interrupt();
restart:
- start_bh_atomic();
-
rthp = &rt_hash_table[hash];
+ write_lock_bh(&rt_hash_lock);
while ((rth = *rthp) != NULL) {
if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
/* Put it first */
@@ -544,7 +563,7 @@ restart:
atomic_inc(&rth->u.dst.refcnt);
atomic_inc(&rth->u.dst.use);
rth->u.dst.lastuse = now;
- end_bh_atomic();
+ write_unlock_bh(&rt_hash_lock);
rt_drop(rt);
*rp = rth;
@@ -559,7 +578,7 @@ restart:
*/
if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
if (!arp_bind_neighbour(&rt->u.dst)) {
- end_bh_atomic();
+ write_unlock_bh(&rt_hash_lock);
/* Neighbour tables are full and nothing
can be released. Try to shrink route cache,
@@ -594,7 +613,7 @@ restart:
}
#endif
rt_hash_table[hash] = rt;
- end_bh_atomic();
+ write_unlock_bh(&rt_hash_lock);
*rp = rt;
return 0;
}
@@ -633,6 +652,7 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
rthp=&rt_hash_table[hash];
+ write_lock_bh(&rt_hash_lock);
while ( (rth = *rthp) != NULL) {
struct rtable *rt;
@@ -657,6 +677,7 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
if (rt == NULL) {
ip_rt_put(rth);
+ write_unlock_bh(&rt_hash_lock);
return;
}
@@ -688,11 +709,15 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
}
*rthp = rth->u.rt_next;
+ write_unlock_bh(&rt_hash_lock);
if (!rt_intern_hash(hash, rt, &rt))
ip_rt_put(rt);
rt_drop(rth);
- break;
+ goto do_next;
}
+ write_unlock_bh(&rt_hash_lock);
+ do_next:
+ ;
}
}
return;
@@ -722,8 +747,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
#if RT_CACHE_DEBUG >= 1
printk(KERN_DEBUG "ip_rt_advice: redirect to %d.%d.%d.%d/%02x dropped\n", NIPQUAD(rt->rt_dst), rt->key.tos);
#endif
- start_bh_atomic();
ip_rt_put(rt);
+ write_lock_bh(&rt_hash_lock);
for (rthp = &rt_hash_table[hash]; *rthp; rthp = &(*rthp)->u.rt_next) {
if (*rthp == rt) {
*rthp = rt->u.rt_next;
@@ -731,7 +756,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
break;
}
}
- end_bh_atomic();
+ write_unlock_bh(&rt_hash_lock);
return NULL;
}
}
@@ -861,6 +886,7 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
for (i=0; i<2; i++) {
unsigned hash = rt_hash_code(daddr, skeys[i], tos);
+ read_lock_bh(&rt_hash_lock);
for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) {
if (rth->key.dst == daddr &&
rth->key.src == skeys[i] &&
@@ -890,6 +916,7 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
}
}
}
+ read_unlock_bh(&rt_hash_lock);
}
return est_mtu ? : new_mtu;
}
@@ -1362,6 +1389,7 @@ int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
tos &= IPTOS_TOS_MASK;
hash = rt_hash_code(daddr, saddr^(iif<<5), tos);
+ read_lock_bh(&rt_hash_lock);
for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
if (rth->key.dst == daddr &&
rth->key.src == saddr &&
@@ -1374,10 +1402,12 @@ int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
rth->u.dst.lastuse = jiffies;
atomic_inc(&rth->u.dst.use);
atomic_inc(&rth->u.dst.refcnt);
+ read_unlock_bh(&rt_hash_lock);
skb->dst = (struct dst_entry*)rth;
return 0;
}
}
+ read_unlock_bh(&rt_hash_lock);
/* Multicast recognition logic is moved from route cache to here.
The problem was that too many Ethernet cards have broken/missing
@@ -1657,7 +1687,7 @@ int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
- start_bh_atomic();
+ read_lock_bh(&rt_hash_lock);
for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
if (rth->key.dst == daddr &&
rth->key.src == saddr &&
@@ -1673,12 +1703,12 @@ int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
rth->u.dst.lastuse = jiffies;
atomic_inc(&rth->u.dst.use);
atomic_inc(&rth->u.dst.refcnt);
- end_bh_atomic();
+ read_unlock_bh(&rt_hash_lock);
*rp = rth;
return 0;
}
}
- end_bh_atomic();
+ read_unlock_bh(&rt_hash_lock);
return ip_route_output_slow(rp, daddr, saddr, tos, oif);
}
@@ -1821,9 +1851,7 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
return -ENODEV;
skb->protocol = __constant_htons(ETH_P_IP);
skb->dev = dev;
- start_bh_atomic();
err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
- end_bh_atomic();
rt = (struct rtable*)skb->dst;
if (!err && rt->u.dst.error)
err = -rt->u.dst.error;
@@ -1869,7 +1897,7 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
if (h < s_h) continue;
if (h > s_h)
s_idx = 0;
- start_bh_atomic();
+ read_lock_bh(&rt_hash_lock);
for (rt = rt_hash_table[h], idx = 0; rt; rt = rt->u.rt_next, idx++) {
if (idx < s_idx)
continue;
@@ -1877,12 +1905,12 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) {
dst_release(xchg(&skb->dst, NULL));
- end_bh_atomic();
+ read_unlock_bh(&rt_hash_lock);
goto done;
}
dst_release(xchg(&skb->dst, NULL));
}
- end_bh_atomic();
+ read_unlock_bh(&rt_hash_lock);
}
done:
@@ -1968,6 +1996,7 @@ ctl_table ipv4_route_table[] = {
#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct ip_rt_acct[256];
+rwlock_t ip_rt_acct_lock = RW_LOCK_UNLOCKED;
#ifdef CONFIG_PROC_FS
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
@@ -1980,9 +2009,9 @@ static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
*eof = 1;
}
if (length > 0) {
- start_bh_atomic();
+ read_lock_bh(&ip_rt_acct_lock);
memcpy(buffer, ((u8*)&ip_rt_acct)+offset, length);
- end_bh_atomic();
+ read_unlock_bh(&ip_rt_acct_lock);
return length;
}
return 0;
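
The comment block added near the top of route.c spells out the discipline that the rest of the file now follows: readers hold the hash rwlock only long enough to find an entry and take a reference with an atomic increment, while writers hold it while unlinking entries and inspecting reference counts. A user-space sketch of that pattern with POSIX rwlocks and a plain atomic counter follows; the types and names are stand-ins for the rtable/dst bookkeeping, not kernel API, and eviction of still-referenced entries is deliberately left out.

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct entry {
	struct entry *next;
	int key;
	atomic_int refcnt;	/* stands in for dst use/refcnt */
};

#define HASH_SIZE 256
static struct entry *hash_table[HASH_SIZE];
static pthread_rwlock_t hash_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Reader: look up under the read lock and take a reference before
 * dropping it, so the entry cannot be freed out from under us.
 */
static struct entry *cache_lookup(int key)
{
	struct entry *e;

	pthread_rwlock_rdlock(&hash_lock);
	for (e = hash_table[key & (HASH_SIZE - 1)]; e; e = e->next) {
		if (e->key == key) {
			atomic_fetch_add(&e->refcnt, 1);
			break;
		}
	}
	pthread_rwlock_unlock(&hash_lock);
	return e;
}

/* Writers also insert; shown without duplicate handling. */
static void cache_insert(struct entry *e)
{
	pthread_rwlock_wrlock(&hash_lock);
	e->next = hash_table[e->key & (HASH_SIZE - 1)];
	hash_table[e->key & (HASH_SIZE - 1)] = e;
	pthread_rwlock_unlock(&hash_lock);
}

/* Writer: only writers unlink entries, and they check the reference
 * count while still holding the lock, mirroring rule 2 of the route.c
 * comment. Entries that are still referenced are left alone here.
 */
static void cache_expire(int key)
{
	struct entry **ep, *e;

	pthread_rwlock_wrlock(&hash_lock);
	ep = &hash_table[key & (HASH_SIZE - 1)];
	while ((e = *ep) != NULL) {
		if (e->key == key && atomic_load(&e->refcnt) == 0) {
			*ep = e->next;
			free(e);
			continue;
		}
		ep = &e->next;
	}
	pthread_rwlock_unlock(&hash_lock);
}

The point of rule 2 is that a writer who observes a zero reference count while holding the write lock knows no reader can gain a new reference before the entry is unlinked.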
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8c1c9f9be..779c31cef 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp.c,v 1.140 1999/04/22 10:34:31 davem Exp $
+ * Version: $Id: tcp.c,v 1.144 1999/05/27 01:03:37 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -416,6 +416,7 @@
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/init.h>
+#include <linux/smp_lock.h>
#include <net/icmp.h>
#include <net/tcp.h>
@@ -432,7 +433,7 @@ kmem_cache_t *tcp_timewait_cachep;
/*
* Find someone to 'accept'. Must be called with
- * the socket locked or with interrupts disabled
+ * the listening socket locked.
*/
static struct open_request *tcp_find_established(struct tcp_opt *tp,
@@ -441,10 +442,11 @@ static struct open_request *tcp_find_established(struct tcp_opt *tp,
struct open_request *req = tp->syn_wait_queue;
struct open_request *prev = (struct open_request *)&tp->syn_wait_queue;
while(req) {
- if (req->sk &&
- ((1 << req->sk->state) &
- ~(TCPF_SYN_SENT|TCPF_SYN_RECV)))
- break;
+ if (req->sk) {
+ if((1 << req->sk->state) &
+ ~(TCPF_SYN_SENT|TCPF_SYN_RECV))
+ break;
+ }
prev = req;
req = req->dl_next;
}
@@ -655,12 +657,13 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
/*
* Wait for a socket to get into the connected state
*
- * Note: must be called with the socket locked.
+ * Note: Must be called with the socket locked, and it
+ * runs with the kernel fully unlocked.
*/
static int wait_for_tcp_connect(struct sock * sk, int flags)
{
struct task_struct *tsk = current;
- struct wait_queue wait = { tsk, NULL };
+ DECLARE_WAITQUEUE(wait, tsk);
while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
if(sk->err)
@@ -698,12 +701,14 @@ static inline int tcp_memory_free(struct sock *sk)
/*
* Wait for more memory for a socket
+ *
+ * NOTE: This runs with the kernel fully unlocked.
*/
static void wait_for_tcp_memory(struct sock * sk)
{
release_sock(sk);
if (!tcp_memory_free(sk)) {
- struct wait_queue wait = { current, NULL };
+ DECLARE_WAITQUEUE(wait, current);
sk->socket->flags &= ~SO_NOSPACE;
add_wait_queue(sk->sleep, &wait);
@@ -744,6 +749,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
int mss_now;
int err, copied;
+ unlock_kernel();
lock_sock(sk);
err = 0;
@@ -896,6 +902,7 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
err = -ERESTARTSYS;
goto do_interrupted;
}
+ tcp_push_pending_frames(sk, tp);
wait_for_tcp_memory(sk);
/* If SACK's were formed or PMTU events happened,
@@ -969,6 +976,7 @@ do_fault2:
out:
tcp_push_pending_frames(sk, tp);
release_sock(sk);
+ lock_kernel();
return err;
}
@@ -1117,7 +1125,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
int len, int nonblock, int flags, int *addr_len)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- struct wait_queue wait = { current, NULL };
+ DECLARE_WAITQUEUE(wait, current);
int copied = 0;
u32 peek_seq;
volatile u32 *seq; /* So gcc doesn't overoptimise */
@@ -1148,6 +1156,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
if (flags & MSG_WAITALL)
target=len;
+ unlock_kernel();
add_wait_queue(sk->sleep, &wait);
lock_sock(sk);
@@ -1300,6 +1309,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
/* We now will not sleep again until we are finished
* with skb. Sorry if you are doing the SMP port
* but you'll just have to fix it neatly ;)
+ *
+ * Very funny Alan... -DaveM
*/
atomic_dec(&skb->users);
@@ -1344,6 +1355,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
/* Clean up data we have read: This will do ACK frames. */
cleanup_rbuf(sk, copied);
release_sock(sk);
+ lock_kernel();
return copied;
}
@@ -1415,16 +1427,15 @@ void tcp_shutdown(struct sock *sk, int how)
return;
/* If we've already sent a FIN, or it's a closed state, skip this. */
+ lock_sock(sk);
if ((1 << sk->state) &
(TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
- lock_sock(sk);
/* Clear out any half completed packets. FIN if needed. */
if (tcp_close_state(sk,0))
tcp_send_fin(sk);
-
- release_sock(sk);
}
+ release_sock(sk);
}
@@ -1471,13 +1482,6 @@ void tcp_close(struct sock *sk, long timeout)
struct sk_buff *skb;
int data_was_unread = 0;
- /*
- * Check whether the socket is locked ... supposedly
- * it's impossible to tcp_close() a locked socket.
- */
- if (atomic_read(&sk->sock_readers))
- printk("tcp_close: socket already locked!\n");
-
/* We need to grab some memory, and put together a FIN,
* and then put it into the queue to be sent.
*/
@@ -1491,6 +1495,8 @@ void tcp_close(struct sock *sk, long timeout)
return;
}
+ unlock_kernel();
+
/* It is questionable, what the role of this is now.
* In any event either it should be removed, or
* increment of SLT_KEEPALIVE be done, this is causing
@@ -1534,24 +1540,23 @@ void tcp_close(struct sock *sk, long timeout)
if (timeout) {
struct task_struct *tsk = current;
- struct wait_queue wait = { tsk, NULL };
+ DECLARE_WAITQUEUE(wait, current);
add_wait_queue(sk->sleep, &wait);
- release_sock(sk);
while (1) {
tsk->state = TASK_INTERRUPTIBLE;
if (!closing(sk))
break;
+ release_sock(sk);
timeout = schedule_timeout(timeout);
+ lock_sock(sk);
if (signal_pending(tsk) || !timeout)
break;
}
tsk->state = TASK_RUNNING;
remove_wait_queue(sk->sleep, &wait);
-
- lock_sock(sk);
}
/* Now that the socket is dead, if we are in the FIN_WAIT2 state
@@ -1559,23 +1564,40 @@ void tcp_close(struct sock *sk, long timeout)
*/
tcp_check_fin_timer(sk);
- release_sock(sk);
sk->dead = 1;
+
+ release_sock(sk);
+ lock_kernel();
}
/*
* Wait for an incoming connection, avoid race
- * conditions. This must be called with the socket locked.
+ * conditions. This must be called with the socket locked,
+ * and without the kernel lock held.
*/
static struct open_request * wait_for_connect(struct sock * sk,
struct open_request **pprev)
{
- struct wait_queue wait = { current, NULL };
+ DECLARE_WAITQUEUE(wait, current);
struct open_request *req;
- add_wait_queue(sk->sleep, &wait);
+ /*
+ * True wake-one mechanism for incoming connections: only
+ * one process gets woken up, not the 'whole herd'.
+ * Since we do not 'race & poll' for established sockets
+ * anymore, the common case will execute the loop only once.
+ *
+ * Subtle issue: "add_wait_queue_exclusive()" will be added
+ * after any current non-exclusive waiters, and we know that
+ * it will always _stay_ after any new non-exclusive waiters
+ * because all non-exclusive waiters are added at the
+ * beginning of the wait-queue. As such, it's ok to "drop"
+ * our exclusiveness temporarily when we get woken up without
+ * having to remove and re-insert us on the wait queue.
+ */
+ add_wait_queue_exclusive(sk->sleep, &wait);
for (;;) {
- current->state = TASK_INTERRUPTIBLE;
+ current->state = TASK_EXCLUSIVE | TASK_INTERRUPTIBLE;
release_sock(sk);
schedule();
lock_sock(sk);
@@ -1603,6 +1625,7 @@ struct sock *tcp_accept(struct sock *sk, int flags)
struct sock *newsk = NULL;
int error;
+ unlock_kernel();
lock_sock(sk);
/* We need to make sure that this socket is listening,
@@ -1633,16 +1656,17 @@ struct sock *tcp_accept(struct sock *sk, int flags)
sk->ack_backlog--;
if(sk->keepopen)
tcp_inc_slow_timer(TCP_SLT_KEEPALIVE);
-
release_sock(sk);
+ lock_kernel();
return newsk;
out:
/* sk should be in LISTEN state, thus accept can use sk->err for
- * internal purposes without stomping one anyone's feed.
+ * internal purposes without stomping on anyone's feet.
*/
sk->err = error;
release_sock(sk);
+ lock_kernel();
return newsk;
}
@@ -1765,6 +1789,8 @@ extern void __skb_cb_too_small_for_tcp(int, int);
void __init tcp_init(void)
{
struct sk_buff *skb = NULL;
+ unsigned long goal;
+ int order;
if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
@@ -1790,4 +1816,37 @@ void __init tcp_init(void)
NULL, NULL);
if(!tcp_timewait_cachep)
panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
+
+ /* Size and allocate the main established and bind bucket
+ * hash tables.
+ *
+ * The methodology is similar to that of the buffer cache.
+ */
+ goal = num_physpages >> (20 - PAGE_SHIFT);
+ for(order = 5; (1UL << order) < goal; order++)
+ ;
+ do {
+ tcp_ehash_size = (1UL << order) * PAGE_SIZE /
+ sizeof(struct sock *);
+ tcp_ehash = (struct sock **)
+ __get_free_pages(GFP_ATOMIC, order);
+ } while (tcp_ehash == NULL && --order > 4);
+
+ if (!tcp_ehash)
+ panic("Failed to allocate TCP established hash table\n");
+ memset(tcp_ehash, 0, tcp_ehash_size * sizeof(struct sock *));
+
+ do {
+ tcp_bhash_size = (1UL << order) * PAGE_SIZE /
+ sizeof(struct tcp_bind_bucket *);
+ tcp_bhash = (struct tcp_bind_bucket **)
+ __get_free_pages(GFP_ATOMIC, order);
+ } while (tcp_bhash == NULL && --order > 4);
+
+ if (!tcp_bhash)
+ panic("Failed to allocate TCP bind hash table\n");
+ memset(tcp_bhash, 0, tcp_bhash_size * sizeof(struct tcp_bind_bucket *));
+
+ printk("TCP: Hash tables configured (established %d bind %d)\n",
+ tcp_ehash_size, tcp_bhash_size);
}
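
tcp_init() above sizes the new established and bind hash tables from physical memory: it picks the smallest page order whose allocation reaches roughly one page of table per megabyte of RAM, then retries with smaller orders if the allocation fails, and finally panics if even that cannot be satisfied. Here is a compact user-space model of the sizing loop for the established table only; num_physpages, PAGE_SIZE and the allocator are mocked, whereas the real code uses __get_free_pages().

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Mocked machine size: pretend we have 128 MB of RAM. */
static unsigned long num_physpages = (128UL << 20) >> PAGE_SHIFT;

/* Stand-in for __get_free_pages(): may fail for large orders. */
static void *alloc_pages_order(int order)
{
	return malloc(PAGE_SIZE << order);
}

int main(void)
{
	unsigned long goal, ehash_size;
	void *ehash;
	int order;

	/* Aim for about one page of hash table per megabyte of memory,
	 * as in the tcp_init() hunk above.
	 */
	goal = num_physpages >> (20 - PAGE_SHIFT);
	for (order = 5; (1UL << order) < goal; order++)
		;

	/* Retry with smaller orders if the big allocation fails. */
	do {
		ehash_size = (PAGE_SIZE << order) / sizeof(void *);
		ehash = alloc_pages_order(order);
	} while (ehash == NULL && --order > 4);

	if (!ehash)
		return 1;
	printf("order %d, %lu hash buckets\n", order, ehash_size);
	free(ehash);
	return 0;
}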
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4a607a749..af4165fce 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_input.c,v 1.164 1999/05/08 21:09:52 davem Exp $
+ * Version: $Id: tcp_input.c,v 1.169 1999/06/09 08:29:13 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -748,7 +748,6 @@ static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp,
static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
{
struct sk_buff *skb = skb_peek(&sk->write_queue);
- __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when);
/* Some data was ACK'd, if still retransmitting (due to a
* timeout), resend more of the retransmit queue. The
@@ -758,6 +757,9 @@ static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
tcp_xmit_retransmit_queue(sk);
tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
} else {
+ __u32 when = tp->rto - (tcp_time_stamp - TCP_SKB_CB(skb)->when);
+ if ((__s32)when < 0)
+ when = 1;
tcp_reset_xmit_timer(sk, TIME_RETRANS, when);
}
}
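
The tcp_ack_packets_out() hunk above recomputes the retransmit timer as the rto minus the time the oldest unacked segment has already spent in flight, and clamps the result to at least one tick when that difference has gone negative. A small stand-alone version of just that clamp, with the jiffies arithmetic spelled out:

#include <stdio.h>
#include <stdint.h>

/* Remaining retransmit timeout for the oldest unacked segment: rto
 * minus the time the segment has been in flight. The signed cast
 * matters because the subtraction wraps negative when the segment is
 * already older than one full rto, which is exactly the case the
 * hunk above guards against.
 */
static uint32_t remaining_rto(uint32_t rto, uint32_t now, uint32_t sent_when)
{
	uint32_t when = rto - (now - sent_when);

	if ((int32_t)when < 0)
		when = 1;
	return when;
}

int main(void)
{
	/* Segment sent 500 ticks ago with an rto of 300: clamp to 1. */
	printf("%u\n", remaining_rto(300, 1000, 500));
	/* Segment sent 100 ticks ago with an rto of 300: 200 left. */
	printf("%u\n", remaining_rto(300, 1000, 900));
	return 0;
}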
@@ -785,8 +787,6 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th,
if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una))
goto uninteresting_ack;
- dst_confirm(sk->dst_cache);
-
/* If there is data set flag 1 */
if (len != th->doff*4) {
flag |= FLAG_DATA;
@@ -882,6 +882,24 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th,
/* Clear any aborted fast retransmit starts. */
tp->dup_acks = 0;
}
+ /* It is not a brain fart, I thought a bit now. 8)
+ *
+ * Forward progress is indicated, if:
+ * 1. the ack acknowledges new data.
+ * 2. or the ack is duplicate, but it is caused by new segment
+ * arrival. This case is filtered by:
+ * - it contains no data, syn or fin.
+ * - it does not update window.
+ * 3. or new SACK. It is difficult to check, so that we ignore it.
+ *
+ * Forward progress is also indicated by the arrival of new data,
+ * which was caused by a window opening from our side. This case is more
+ * difficult and it is made (alas, incorrectly) in tcp_data_queue().
+ * --ANK (990513)
+ */
+ if (ack != tp->snd_una || (flag == 0 && !th->fin))
+ dst_confirm(sk->dst_cache);
+
/* Remember the highest ack received. */
tp->snd_una = ack;
return 1;
@@ -896,8 +914,11 @@ extern void tcp_tw_schedule(struct tcp_tw_bucket *tw);
extern void tcp_tw_reschedule(struct tcp_tw_bucket *tw);
extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw);
+/* Must be called only from BH context. */
void tcp_timewait_kill(struct tcp_tw_bucket *tw)
{
+ SOCKHASH_LOCK_WRITE_BH();
+
/* Unlink from various places. */
if(tw->bind_next)
tw->bind_next->bind_pprev = tw->bind_pprev;
@@ -915,6 +936,8 @@ void tcp_timewait_kill(struct tcp_tw_bucket *tw)
tw->sklist_next->sklist_prev = tw->sklist_prev;
tw->sklist_prev->sklist_next = tw->sklist_next;
+ SOCKHASH_UNLOCK_WRITE_BH();
+
/* Ok, now free it up. */
kmem_cache_free(tcp_timewait_cachep, tw);
}
@@ -945,6 +968,7 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
struct sock *sk;
struct tcp_func *af_specific = tw->af_specific;
__u32 isn;
+ int ret;
isn = tw->rcv_nxt + 128000;
if(isn == 0)
@@ -953,14 +977,25 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
tcp_timewait_kill(tw);
sk = af_specific->get_sock(skb, th);
if(sk == NULL ||
- !ipsec_sk_policy(sk,skb) ||
- atomic_read(&sk->sock_readers) != 0)
+ !ipsec_sk_policy(sk,skb))
return 0;
+
+ bh_lock_sock(sk);
+
+ /* Default is to discard the frame. */
+ ret = 0;
+
+ if(sk->lock.users)
+ goto out_unlock;
+
skb_set_owner_r(skb, sk);
af_specific = sk->tp_pinfo.af_tcp.af_specific;
+
if(af_specific->conn_request(sk, skb, isn) < 0)
- return 1; /* Toss a reset back. */
- return 0; /* Discard the frame. */
+ ret = 1; /* Toss a reset back. */
+ out_unlock:
+ bh_unlock_sock(sk);
+ return ret;
}
/* Check RST or SYN */
@@ -1013,7 +1048,7 @@ static __inline__ void tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *t
sk->prot->inuse--;
/* Step 4: Hash TW into TIMEWAIT half of established hash table. */
- head = &tcp_established_hash[sk->hashent + (TCP_HTABLE_SIZE/2)];
+ head = &tcp_ehash[sk->hashent + (tcp_ehash_size >> 1)];
sktw = (struct sock *)tw;
if((sktw->next = *head) != NULL)
(*head)->pprev = &sktw->next;
@@ -1051,7 +1086,9 @@ void tcp_time_wait(struct sock *sk)
}
#endif
/* Linkage updates. */
+ SOCKHASH_LOCK_WRITE();
tcp_tw_hashdance(sk, tw);
+ SOCKHASH_UNLOCK_WRITE();
/* Get the TIME_WAIT timeout firing. */
tcp_tw_schedule(tw);
@@ -1801,7 +1838,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
}
}
- flg = *(((u32 *)th) + 3) & ~htonl(0x8 << 16);
+ flg = *(((u32 *)th) + 3) & ~htonl(0xFC8 << 16);
/* pred_flags is 0xS?10 << 16 + snd_wnd
* if header_predition is to be made
@@ -2031,8 +2068,26 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* These use the socket TOS..
* might want to be the received TOS
*/
- if(th->ack)
- return 1;
+ if(th->ack) {
+ struct sock *realsk;
+ int ret;
+
+ realsk = tp->af_specific->get_sock(skb, th);
+ if(realsk == sk)
+ return 1;
+
+ bh_lock_sock(realsk);
+ ret = 0;
+ if(realsk->lock.users != 0) {
+ skb_orphan(skb);
+ sk_add_backlog(realsk, skb);
+ } else {
+ ret = tcp_rcv_state_process(realsk, skb,
+ skb->h.th, skb->len);
+ }
+ bh_unlock_sock(realsk);
+ return ret;
+ }
if(th->syn) {
if(tp->af_specific->conn_request(sk, skb, 0) < 0)
@@ -2067,21 +2122,81 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* not be in line code. [AC]
*/
if(th->ack) {
- tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
-
- /* We got an ack, but it's not a good ack. */
- if(!tcp_ack(sk,th, TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(skb)->ack_seq, len))
+ /* rfc793:
+ * "If the state is SYN-SENT then
+ * first check the ACK bit
+ * If the ACK bit is set
+ * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
+ * a reset (unless the RST bit is set, if so drop
+ * the segment and return)"
+ *
+ * I cite this place to emphasize one essential
+ * detail: this check differs from the one used
+ * in established state: SND.UNA <= SEG.ACK <= SND.NXT.
+ * SEG_ACK == SND.UNA == ISS is invalid in SYN-SENT,
+ * because we have no previous data sent before SYN.
+ * --ANK(990513)
+ *
+ * We do not send data with SYN, so that RFC-correct
+ * test reduces to:
+ */
+ if (sk->zapped ||
+ TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
return 1;
- if(th->rst) {
+ /* Now ACK is acceptable.
+ *
+ * "If the RST bit is set
+ * If the ACK was acceptable then signal the user "error:
+ * connection reset", drop the segment, enter CLOSED state,
+ * delete TCB, and return."
+ */
+
+ if (th->rst) {
tcp_reset(sk);
goto discard;
}
- if(!th->syn)
+ /* rfc793:
+ * "fifth, if neither of the SYN or RST bits is set then
+ * drop the segment and return."
+ *
+ * See note below!
+ * --ANK(990513)
+ */
+
+ if (!th->syn)
goto discard;
+ /* rfc793:
+ * "If the SYN bit is on ...
+ * are acceptable then ...
+ * (our SYN has been ACKed), change the connection
+ * state to ESTABLISHED..."
+ *
+ * Do you see? SYN-less ACKs in SYN-SENT state are
+ * completely ignored.
+ *
+ * The bug causing stalled SYN-SENT sockets
+ * was here: tcp_ack advanced snd_una and canceled
+ * retransmit timer, so that bare ACK received
+ * in SYN-SENT state (even with invalid ack==ISS,
+ * because tcp_ack check is too weak for SYN-SENT)
+ * causes moving socket to invalid semi-SYN-SENT,
+ * semi-ESTABLISHED state and connection hangs.
+ *
+ * There exist buggy stacks, which really send
+ * such ACKs: f.e. 202.226.91.94 (okigate.oki.co.jp)
+ * Actually, if this host did not try to get something
+ * from ftp.inr.ac.ru I'd never find this bug 8)
+ *
+ * --ANK (990514)
+ */
+
+ tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+ tcp_ack(sk,th, TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->ack_seq, len);
+
/* Ok.. it's good. Set up sequence numbers and
* move to established.
*/
@@ -2206,8 +2321,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
!(th->fin && TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)) {
if (!th->rst) {
tcp_send_ack(sk);
- goto discard;
}
+ goto discard;
}
/* step 2: check RST bit */
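
The rewritten SYN-SENT branch above enforces the RFC 793 rule quoted in its comment: since no data is sent before the SYN, the only acceptable ACK value is snd_nxt, an acceptable ACK carrying RST kills the connection, and a bare ACK without SYN is silently ignored rather than being allowed to move the socket into a half-established state. The following predicate is a simplified model of that acceptance logic only; sequence-number wraparound, the zapped check and the rest of the state machine are deliberately left out.

#include <stdint.h>

/* Just the flags and sequence values the SYN-SENT test looks at. */
struct synsent_seg {
	uint32_t ack_seq;
	int ack, rst, syn;
};

enum synsent_action {
	SYNSENT_SEND_RESET,	/* answer with a RST */
	SYNSENT_DISCARD,	/* drop silently */
	SYNSENT_CONN_RESET,	/* acceptable ACK + RST: kill the connection */
	SYNSENT_ESTABLISH	/* SYN (with acceptable ACK): complete handshake */
};

/* Order of tests mirrors the rewritten branch above: an ACK other than
 * SND.NXT is answered with a reset unless RST is set, an acceptable ACK
 * carrying RST resets the connection, anything without SYN is ignored,
 * and only then do we move toward ESTABLISHED.
 */
static enum synsent_action synsent_check(const struct synsent_seg *seg,
					 uint32_t snd_nxt)
{
	if (seg->ack && seg->ack_seq != snd_nxt)
		return seg->rst ? SYNSENT_DISCARD : SYNSENT_SEND_RESET;
	if (seg->rst)
		return seg->ack ? SYNSENT_CONN_RESET : SYNSENT_DISCARD;
	if (!seg->syn)
		return SYNSENT_DISCARD;
	return SYNSENT_ESTABLISH;
}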
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b5070c3a7..564e859f2 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_ipv4.c,v 1.175 1999/05/08 21:09:54 davem Exp $
+ * Version: $Id: tcp_ipv4.c,v 1.180 1999/06/09 08:29:19 davem Exp $
*
* IPv4 specific functions
*
@@ -90,12 +90,14 @@ void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
* First half of the table is for sockets not in TIME_WAIT, second half
* is for TIME_WAIT sockets only.
*/
-struct sock *tcp_established_hash[TCP_HTABLE_SIZE];
+struct sock **tcp_ehash;
+int tcp_ehash_size;
/* Ok, let's try this, I give up, we do need a local binding
* TCP hash as well as the others for fast bind/connect.
*/
-struct tcp_bind_bucket *tcp_bound_hash[TCP_BHTABLE_SIZE];
+struct tcp_bind_bucket **tcp_bhash;
+int tcp_bhash_size;
/* All sockets in TCP_LISTEN state will be in here. This is the only table
* where wildcard'd TCP sockets can exist. Hash function here is just local
@@ -117,7 +119,7 @@ int tcp_port_rover = (1024 - 1);
static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
__u32 faddr, __u16 fport)
{
- return ((laddr ^ lport) ^ (faddr ^ fport)) & ((TCP_HTABLE_SIZE/2) - 1);
+ return ((laddr ^ lport) ^ (faddr ^ fport)) & ((tcp_ehash_size >> 1) - 1);
}
static __inline__ int tcp_sk_hashfn(struct sock *sk)
@@ -136,8 +138,8 @@ void tcp_bucket_unlock(struct sock *sk)
struct tcp_bind_bucket *tb;
unsigned short snum = sk->num;
- SOCKHASH_LOCK();
- for(tb = tcp_bound_hash[tcp_bhashfn(snum)]; tb; tb = tb->next) {
+ SOCKHASH_LOCK_WRITE();
+ for(tb = tcp_bhash[tcp_bhashfn(snum)]; tb; tb = tb->next) {
if(tb->port == snum) {
if(tb->owners == NULL &&
(tb->flags & TCPB_FLAG_LOCKED)) {
@@ -148,9 +150,10 @@ void tcp_bucket_unlock(struct sock *sk)
break;
}
}
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_WRITE();
}
+/* The sockhash lock must be held as a writer here. */
struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum)
{
struct tcp_bind_bucket *tb;
@@ -158,7 +161,7 @@ struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum)
tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
if(tb != NULL) {
struct tcp_bind_bucket **head =
- &tcp_bound_hash[tcp_bhashfn(snum)];
+ &tcp_bhash[tcp_bhashfn(snum)];
tb->port = snum;
tb->flags = TCPB_FLAG_LOCKED;
tb->owners = NULL;
@@ -176,13 +179,18 @@ struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum)
*/
static __inline__ int tcp_bucket_check(unsigned short snum)
{
- struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(snum)];
+ struct tcp_bind_bucket *tb;
+ int ret = 0;
+
+ SOCKHASH_LOCK_WRITE();
+ tb = tcp_bhash[tcp_bhashfn(snum)];
for( ; (tb && (tb->port != snum)); tb = tb->next)
;
if(tb == NULL && tcp_bucket_create(snum) == NULL)
- return 1;
- else
- return 0;
+ ret = 1;
+ SOCKHASH_UNLOCK_WRITE();
+
+ return ret;
}
#endif
@@ -191,8 +199,8 @@ static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum)
struct tcp_bind_bucket *tb;
int result = 0;
- SOCKHASH_LOCK();
- for(tb = tcp_bound_hash[tcp_bhashfn(snum)];
+ SOCKHASH_LOCK_WRITE();
+ for(tb = tcp_bhash[tcp_bhashfn(snum)];
(tb && (tb->port != snum));
tb = tb->next)
;
@@ -256,7 +264,7 @@ static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum)
}
}
go_like_smoke:
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_WRITE();
return result;
}
@@ -268,13 +276,13 @@ unsigned short tcp_good_socknum(void)
int remaining = (high - low) + 1;
int rover;
- SOCKHASH_LOCK();
+ SOCKHASH_LOCK_WRITE();
rover = tcp_port_rover;
do {
rover += 1;
if((rover < low) || (rover > high))
rover = low;
- tb = tcp_bound_hash[tcp_bhashfn(rover)];
+ tb = tcp_bhash[tcp_bhashfn(rover)];
for( ; tb; tb = tb->next) {
if(tb->port == rover)
goto next;
@@ -288,7 +296,7 @@ unsigned short tcp_good_socknum(void)
rover = 0;
if (tb != NULL)
tb->flags |= TCPB_FLAG_GOODSOCKNUM;
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_WRITE();
return rover;
}
@@ -298,20 +306,20 @@ static void tcp_v4_hash(struct sock *sk)
if (sk->state != TCP_CLOSE) {
struct sock **skp;
- SOCKHASH_LOCK();
- skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))];
+ SOCKHASH_LOCK_WRITE();
+ skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))];
if((sk->next = *skp) != NULL)
(*skp)->pprev = &sk->next;
*skp = sk;
sk->pprev = skp;
tcp_sk_bindify(sk);
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_WRITE();
}
}
static void tcp_v4_unhash(struct sock *sk)
{
- SOCKHASH_LOCK();
+ SOCKHASH_LOCK_WRITE();
if(sk->pprev) {
if(sk->next)
sk->next->pprev = sk->pprev;
@@ -320,14 +328,14 @@ static void tcp_v4_unhash(struct sock *sk)
tcp_reg_zap(sk);
tcp_sk_unbindify(sk);
}
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_WRITE();
}
static void tcp_v4_rehash(struct sock *sk)
{
unsigned char state;
- SOCKHASH_LOCK();
+ SOCKHASH_LOCK_WRITE();
state = sk->state;
if(sk->pprev != NULL) {
if(sk->next)
@@ -342,7 +350,7 @@ static void tcp_v4_rehash(struct sock *sk)
if(state == TCP_LISTEN)
skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
else
- skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))];
+ skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))];
if((sk->next = *skp) != NULL)
(*skp)->pprev = &sk->next;
@@ -351,7 +359,7 @@ static void tcp_v4_rehash(struct sock *sk)
if(state == TCP_LISTEN)
tcp_sk_bindify(sk);
}
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_WRITE();
}
/* Don't inline this cruft. Here are some nice properties to
@@ -395,10 +403,10 @@ static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int d
/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
* we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
- * It is assumed that this code only gets called from within NET_BH.
+ *
+ * The sockhash lock must be held as a reader here.
*/
-static inline struct sock *__tcp_v4_lookup(struct tcphdr *th,
- u32 saddr, u16 sport,
+static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
u32 daddr, u16 dport, int dif)
{
TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
@@ -416,7 +424,7 @@ static inline struct sock *__tcp_v4_lookup(struct tcphdr *th,
* have wildcards anyways.
*/
hash = tcp_hashfn(daddr, hnum, saddr, sport);
- for(sk = tcp_established_hash[hash]; sk; sk = sk->next) {
+ for(sk = tcp_ehash[hash]; sk; sk = sk->next) {
if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) {
if (sk->state == TCP_ESTABLISHED)
TCP_RHASH(sport) = sk;
@@ -424,7 +432,7 @@ static inline struct sock *__tcp_v4_lookup(struct tcphdr *th,
}
}
/* Must check for a TIME_WAIT'er before going to listener hash. */
- for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next)
+ for(sk = tcp_ehash[hash+(tcp_ehash_size >> 1)]; sk; sk = sk->next)
if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
goto hit;
sk = tcp_v4_lookup_listener(daddr, hnum, dif);
@@ -434,7 +442,13 @@ hit:
__inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
{
- return __tcp_v4_lookup(0, saddr, sport, daddr, dport, dif);
+ struct sock *sk;
+
+ SOCKHASH_LOCK_READ();
+ sk = __tcp_v4_lookup(saddr, sport, daddr, dport, dif);
+ SOCKHASH_UNLOCK_READ();
+
+ return sk;
}
#ifdef CONFIG_IP_TRANSPARENT_PROXY
@@ -462,9 +476,12 @@ static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr,
paddr = idev->ifa_list->ifa_local;
}
- /* This code must run only from NET_BH. */
+ /* We must obtain the sockhash lock here, we are always
+ * in BH context.
+ */
+ SOCKHASH_LOCK_READ_BH();
{
- struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hnum)];
+ struct tcp_bind_bucket *tb = tcp_bhash[tcp_bhashfn(hnum)];
for( ; (tb && tb->port != hnum); tb = tb->next)
;
if(tb == NULL)
@@ -505,7 +522,7 @@ pass2:
}
next:
if(firstpass--) {
- struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hpnum)];
+ struct tcp_bind_bucket *tb = tcp_bhash[tcp_bhashfn(hpnum)];
for( ; (tb && tb->port != hpnum); tb = tb->next)
;
if(tb) {
@@ -514,6 +531,7 @@ next:
}
}
gotit:
+ SOCKHASH_UNLOCK_READ_BH();
return result;
}
#endif /* CONFIG_IP_TRANSPARENT_PROXY */
@@ -540,21 +558,23 @@ static int tcp_v4_unique_address(struct sock *sk)
int retval = 1;
/* Freeze the hash while we snoop around. */
- SOCKHASH_LOCK();
- tb = tcp_bound_hash[tcp_bhashfn(snum)];
+ SOCKHASH_LOCK_READ();
+ tb = tcp_bhash[tcp_bhashfn(snum)];
for(; tb; tb = tb->next) {
if(tb->port == snum && tb->owners != NULL) {
/* Almost certainly the re-use port case, search the real hashes
* so it actually scales.
*/
- sk = __tcp_v4_lookup(NULL, sk->daddr, sk->dport,
+ sk = __tcp_v4_lookup(sk->daddr, sk->dport,
sk->rcv_saddr, snum, sk->bound_dev_if);
+ SOCKHASH_UNLOCK_READ();
+
if((sk != NULL) && (sk->state != TCP_LISTEN))
retval = 0;
- break;
+ return retval;
}
}
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_READ();
return retval;
}
@@ -727,16 +747,17 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- if (atomic_read(&sk->sock_readers))
- return;
-
- /* Don't interested in TCP_LISTEN and open_requests (SYN-ACKs
+ /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
* send out by Linux are always <576bytes so they should go through
* unfragmented).
*/
if (sk->state == TCP_LISTEN)
return;
+ bh_lock_sock(sk);
+ if(sk->lock.users != 0)
+ goto out;
+
/* We don't check in the destentry if pmtu discovery is forbidden
* on this route. We just assume that no packet_to_big packets
* are send back when pmtu discovery is not active.
@@ -744,7 +765,8 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned
* route, but I think that's acceptable.
*/
if (sk->dst_cache == NULL)
- return;
+ goto out;
+
ip_rt_update_pmtu(sk->dst_cache, mtu);
if (sk->ip_pmtudisc != IP_PMTUDISC_DONT &&
tp->pmtu_cookie > sk->dst_cache->pmtu) {
@@ -757,6 +779,8 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned
*/
tcp_simple_retransmit(sk);
} /* else let the usual retransmit timer handle it */
+out:
+ bh_unlock_sock(sk);
}
/*
@@ -849,17 +873,6 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
switch (sk->state) {
struct open_request *req, *prev;
case TCP_LISTEN:
- /* Prevent race conditions with accept() -
- * ICMP is unreliable.
- */
- if (atomic_read(&sk->sock_readers)) {
- net_statistics.LockDroppedIcmps++;
- /* If too many ICMPs get dropped on busy
- * servers this needs to be solved differently.
- */
- return;
- }
-
/* The final ACK of the handshake should be already
* handled in the new socket context, not here.
* Strictly speaking - an ICMP error for the final
@@ -869,12 +882,24 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
if (!no_flags && !th->syn && !th->ack)
return;
+ /* Prevent race conditions with accept() -
+ * ICMP is unreliable.
+ */
+ bh_lock_sock(sk);
+ if (sk->lock.users != 0) {
+ net_statistics.LockDroppedIcmps++;
+ /* If too many ICMPs get dropped on busy
+ * servers this needs to be solved differently.
+ */
+ goto out_unlock;
+ }
+
req = tcp_v4_search_req(tp, iph, th, &prev);
if (!req)
- return;
+ goto out_unlock;
if (seq != req->snt_isn) {
net_statistics.OutOfWindowIcmps++;
- return;
+ goto out_unlock;
}
if (req->sk) {
/*
@@ -884,6 +909,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
* but only with the next operation on the socket after
* accept.
*/
+ bh_unlock_sock(sk);
sk = req->sk;
} else {
/*
@@ -896,6 +922,8 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
tcp_synq_unlink(tp, req, prev);
req->class->destructor(req);
tcp_openreq_free(req);
+ out_unlock:
+ bh_unlock_sock(sk);
return;
}
break;
@@ -1025,9 +1053,10 @@ static struct sock *tcp_v4_search_proxy_openreq(struct sk_buff *skb)
{
struct iphdr *iph = skb->nh.iph;
struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4);
- struct sock *sk;
+ struct sock *sk = NULL;
int i;
+ SOCKHASH_LOCK_READ();
for (i=0; i<TCP_LHTABLE_SIZE; i++) {
for(sk = tcp_listening_hash[i]; sk; sk = sk->next) {
struct open_request *dummy;
@@ -1035,10 +1064,12 @@ static struct sock *tcp_v4_search_proxy_openreq(struct sk_buff *skb)
th, &dummy) &&
(!sk->bound_dev_if ||
sk->bound_dev_if == skb->dev->ifindex))
- return sk;
+ goto out;
}
}
- return NULL;
+out:
+ SOCKHASH_UNLOCK_READ();
+ return sk;
}
/*
@@ -1319,7 +1350,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
/* Clone the TCP header template */
newsk->dport = req->rmt_port;
- atomic_set(&newsk->sock_readers, 0);
+ sock_lock_init(newsk);
+
atomic_set(&newsk->rmem_alloc, 0);
skb_queue_head_init(&newsk->receive_queue);
atomic_set(&newsk->wmem_alloc, 0);
@@ -1328,9 +1360,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
newsk->done = 0;
newsk->proc = 0;
- newsk->pair = NULL;
- skb_queue_head_init(&newsk->back_log);
+ newsk->backlog.head = newsk->backlog.tail = NULL;
skb_queue_head_init(&newsk->error_queue);
+ newsk->write_space = tcp_write_space;
#ifdef CONFIG_FILTER
if ((filter = newsk->filter) != NULL)
sk_filter_charge(newsk, filter);
@@ -1552,7 +1584,8 @@ static inline struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
}
/* Check for SYN|ACK */
- if (flg & __constant_htonl(0x00120000)) {
+ flg &= __constant_htonl(0x00120000);
+ if (flg) {
struct open_request *req, *dummy;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
@@ -1570,8 +1603,17 @@ static inline struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
return sk;
}
+/* The socket must have its spinlock held when we get
+ * here.
+ *
+ * We have a potential double-lock case here, so even when
+ * doing backlog processing we use the BH locking scheme.
+ * This is because we cannot sleep with the original spinlock
+ * held.
+ */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
+ int need_unlock = 0;
#ifdef CONFIG_FILTER
struct sk_filter *filter = sk->filter;
if (filter && sk_filter(skb, filter))
@@ -1591,7 +1633,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
}
-
if (sk->state == TCP_LISTEN) {
struct sock *nsk;
@@ -1604,17 +1645,22 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
* otherwise we just shortcircuit this and continue with
* the new socket..
*/
- if (atomic_read(&nsk->sock_readers)) {
- skb_orphan(skb);
- __skb_queue_tail(&nsk->back_log, skb);
- return 0;
+ if (nsk != sk) {
+ bh_lock_sock(nsk);
+ if (nsk->lock.users != 0) {
+ skb_orphan(skb);
+ sk_add_backlog(nsk, skb);
+ bh_unlock_sock(nsk);
+ return 0;
+ }
+ need_unlock = 1;
+ sk = nsk;
}
- sk = nsk;
}
if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
goto reset;
- return 0;
+ goto out_maybe_unlock;
reset:
tcp_v4_send_reset(skb);
@@ -1625,6 +1671,9 @@ discard:
* might be destroyed here. This current version compiles correctly,
* but you have been warned.
*/
+out_maybe_unlock:
+ if(need_unlock)
+ bh_unlock_sock(sk);
return 0;
}
@@ -1636,6 +1685,7 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
{
struct tcphdr *th;
struct sock *sk;
+ int ret;
if (skb->pkt_type!=PACKET_HOST)
goto discard_it;
@@ -1681,8 +1731,10 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
IPCB(skb)->redirport, skb->dev->ifindex);
else {
#endif
- sk = __tcp_v4_lookup(th, skb->nh.iph->saddr, th->source,
+ SOCKHASH_LOCK_READ_BH();
+ sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
skb->nh.iph->daddr, th->dest, skb->dev->ifindex);
+ SOCKHASH_UNLOCK_READ_BH();
#ifdef CONFIG_IP_TRANSPARENT_PROXY
if (!sk)
sk = tcp_v4_search_proxy_openreq(skb);
@@ -1702,11 +1754,16 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
if (sk->state == TCP_TIME_WAIT)
goto do_time_wait;
- if (!atomic_read(&sk->sock_readers))
- return tcp_v4_do_rcv(sk, skb);
- __skb_queue_tail(&sk->back_log, skb);
- return 0;
+ bh_lock_sock(sk);
+ ret = 0;
+ if (!sk->lock.users)
+ ret = tcp_v4_do_rcv(sk, skb);
+ else
+ sk_add_backlog(sk, skb);
+ bh_unlock_sock(sk);
+
+ return ret;
no_tcp_socket:
tcp_v4_send_reset(skb);
@@ -1944,6 +2001,8 @@ __initfunc(void tcp_v4_init(struct net_proto_family *ops))
tcp_inode.i_sock = 1;
tcp_inode.i_uid = 0;
tcp_inode.i_gid = 0;
+ init_waitqueue_head(&tcp_inode.i_wait);
+ init_waitqueue_head(&tcp_inode.u.socket_i.wait);
tcp_socket->inode = &tcp_inode;
tcp_socket->state = SS_UNCONNECTED;
@@ -1952,6 +2011,11 @@ __initfunc(void tcp_v4_init(struct net_proto_family *ops))
if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
panic("Failed to create the TCP control socket.\n");
tcp_socket->sk->allocation=GFP_ATOMIC;
- tcp_socket->sk->num = 256; /* Don't receive any data */
tcp_socket->sk->ip_ttl = MAXTTL;
+
+ /* Unhash it so that IP input processing does not even
+ * see it, we do not wish this socket to see incoming
+ * packets.
+ */
+ tcp_socket->sk->prot->unhash(tcp_socket->sk);
}
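
The tcp_v4_rcv() change above is the core of the new softirq locking: the bottom half takes the socket spinlock, runs tcp_v4_do_rcv() directly when no process currently owns the socket (lock.users is zero), and otherwise parks the frame on the backlog for the owner to process when it releases the socket. A stripped-down user-space model of that "process now or backlog for the lock owner" decision follows; the socket, queue and handler are mocked, and nothing here is kernel API.

#include <pthread.h>
#include <stddef.h>

struct packet {
	struct packet *next;
};

struct mini_sock {
	pthread_mutex_t lock;		/* plays the role of the socket spinlock */
	int users;			/* non-zero while a process owns the socket */
	struct packet *backlog_head, *backlog_tail;
};

static void mini_sock_init(struct mini_sock *sk)
{
	pthread_mutex_init(&sk->lock, NULL);
	sk->users = 0;
	sk->backlog_head = sk->backlog_tail = NULL;
}

static void backlog_add(struct mini_sock *sk, struct packet *p)
{
	p->next = NULL;
	if (sk->backlog_tail)
		sk->backlog_tail->next = p;
	else
		sk->backlog_head = p;
	sk->backlog_tail = p;
}

static int do_rcv(struct mini_sock *sk, struct packet *p)
{
	(void)sk; (void)p;		/* real work (tcp_v4_do_rcv) goes here */
	return 0;
}

/* Bottom-half receive: process at once if nobody owns the socket,
 * otherwise defer the frame to the backlog, as in tcp_v4_rcv() above.
 */
static int pkt_rcv(struct mini_sock *sk, struct packet *p)
{
	int ret = 0;

	pthread_mutex_lock(&sk->lock);
	if (sk->users == 0)
		ret = do_rcv(sk, p);
	else
		backlog_add(sk, p);
	pthread_mutex_unlock(&sk->lock);
	return ret;
}

/* The owning process drains the backlog when it lets go of the socket,
 * which is what release_sock() does for sk_add_backlog()'ed frames.
 */
static void mini_release_sock(struct mini_sock *sk)
{
	struct packet *p;

	pthread_mutex_lock(&sk->lock);
	while ((p = sk->backlog_head) != NULL) {
		sk->backlog_head = p->next;
		if (!sk->backlog_head)
			sk->backlog_tail = NULL;
		pthread_mutex_unlock(&sk->lock);
		do_rcv(sk, p);		/* never process while holding the lock */
		pthread_mutex_lock(&sk->lock);
	}
	sk->users = 0;
	pthread_mutex_unlock(&sk->lock);
}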
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9a096f0f3..18b5ebf80 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_output.c,v 1.108 1999/05/08 21:48:59 davem Exp $
+ * Version: $Id: tcp_output.c,v 1.110 1999/05/27 00:37:45 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -36,6 +36,8 @@
#include <net/tcp.h>
+#include <linux/smp_lock.h>
+
extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
extern int sysctl_tcp_sack;
@@ -240,6 +242,11 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
/* Rechecksum original buffer. */
skb->csum = csum_partial(skb->data, skb->len, 0);
+	/* Looks stupid, but our code really uses the 'when' field of
+	 * skbs which it has never sent before. --ANK
+ */
+ TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
+
/* Link BUFF into the send queue. */
__skb_append(skb, buff);
@@ -961,6 +968,7 @@ void tcp_connect(struct sock *sk, struct sk_buff *buff, int mtu)
/* Ok, now lock the socket before we make it visible to
* the incoming packet engine.
*/
+ unlock_kernel();
lock_sock(sk);
/* Socket identity change complete, no longer
@@ -988,6 +996,7 @@ void tcp_connect(struct sock *sk, struct sk_buff *buff, int mtu)
/* Now, it is safe to release the socket. */
release_sock(sk);
+ lock_kernel();
}
/* Send out a delayed ack, the caller does the policy checking
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index ad6ccace9..d23eef143 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_timer.c,v 1.62 1999/05/08 21:09:55 davem Exp $
+ * Version: $Id: tcp_timer.c,v 1.64 1999/05/27 00:37:31 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -168,15 +168,16 @@ void tcp_delack_timer(unsigned long data)
{
struct sock *sk = (struct sock*)data;
+ bh_lock_sock(sk);
if(!sk->zapped &&
sk->tp_pinfo.af_tcp.delayed_acks &&
sk->state != TCP_CLOSE) {
- /* If socket is currently locked, defer the ACK. */
- if (!atomic_read(&sk->sock_readers))
+ if (!sk->lock.users)
tcp_send_ack(sk);
else
tcp_send_delayed_ack(&(sk->tp_pinfo.af_tcp), HZ/10);
}
+ bh_unlock_sock(sk);
}
void tcp_probe_timer(unsigned long data)
@@ -187,9 +188,11 @@ void tcp_probe_timer(unsigned long data)
if(sk->zapped)
return;
- if (atomic_read(&sk->sock_readers)) {
+ bh_lock_sock(sk);
+ if (sk->lock.users) {
/* Try again later. */
tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ/5);
+ bh_unlock_sock(sk);
return;
}
@@ -216,6 +219,7 @@ void tcp_probe_timer(unsigned long data)
/* Only send another probe if we didn't close things up. */
tcp_send_probe0(sk);
}
+ bh_unlock_sock(sk);
}
static __inline__ int tcp_keepopen_proc(struct sock *sk)
@@ -253,8 +257,9 @@ static void tcp_bucketgc(unsigned long data)
{
int i, reaped = 0;;
- for(i = 0; i < TCP_BHTABLE_SIZE; i++) {
- struct tcp_bind_bucket *tb = tcp_bound_hash[i];
+ SOCKHASH_LOCK_WRITE_BH();
+ for(i = 0; i < tcp_bhash_size; i++) {
+ struct tcp_bind_bucket *tb = tcp_bhash[i];
while(tb) {
struct tcp_bind_bucket *next = tb->next;
@@ -274,6 +279,8 @@ static void tcp_bucketgc(unsigned long data)
tb = next;
}
}
+ SOCKHASH_UNLOCK_WRITE_BH();
+
if(reaped != 0) {
struct tcp_sl_timer *slt = (struct tcp_sl_timer *)data;
@@ -294,8 +301,14 @@ static void tcp_twkill(unsigned long data)
struct tcp_tw_bucket *tw;
int killed = 0;
+ /* The death-row tw chains are only ever touched
+ * in BH context so no locking is needed.
+ */
tw = tcp_tw_death_row[tcp_tw_death_row_slot];
tcp_tw_death_row[tcp_tw_death_row_slot] = NULL;
+ tcp_tw_death_row_slot =
+ ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
+
while(tw != NULL) {
struct tcp_tw_bucket *next = tw->next_death;
@@ -307,8 +320,6 @@ static void tcp_twkill(unsigned long data)
struct tcp_sl_timer *slt = (struct tcp_sl_timer *)data;
atomic_sub(killed, &slt->count);
}
- tcp_tw_death_row_slot =
- ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
}
/* These are always called from BH context. See callers in
@@ -319,12 +330,14 @@ void tcp_tw_schedule(struct tcp_tw_bucket *tw)
int slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1);
struct tcp_tw_bucket **tpp = &tcp_tw_death_row[slot];
+ SOCKHASH_LOCK_WRITE_BH();
if((tw->next_death = *tpp) != NULL)
(*tpp)->pprev_death = &tw->next_death;
*tpp = tw;
tw->pprev_death = tpp;
tw->death_slot = slot;
+ SOCKHASH_UNLOCK_WRITE_BH();
tcp_inc_slow_timer(TCP_SLT_TWKILL);
}
@@ -335,6 +348,7 @@ void tcp_tw_reschedule(struct tcp_tw_bucket *tw)
struct tcp_tw_bucket **tpp;
int slot;
+ SOCKHASH_LOCK_WRITE_BH();
if(tw->next_death)
tw->next_death->pprev_death = tw->pprev_death;
*tw->pprev_death = tw->next_death;
@@ -348,16 +362,21 @@ void tcp_tw_reschedule(struct tcp_tw_bucket *tw)
tw->pprev_death = tpp;
tw->death_slot = slot;
+ SOCKHASH_UNLOCK_WRITE_BH();
+
/* Timer was incremented when we first entered the table. */
}
/* This is for handling early-kills of TIME_WAIT sockets. */
void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
{
+ SOCKHASH_LOCK_WRITE_BH();
if(tw->next_death)
tw->next_death->pprev_death = tw->pprev_death;
*tw->pprev_death = tw->next_death;
tw->pprev_death = NULL;
+ SOCKHASH_UNLOCK_WRITE_BH();
+
tcp_dec_slow_timer(TCP_SLT_TWKILL);
}
@@ -399,20 +418,30 @@ static void tcp_keepalive(unsigned long data)
int count = 0;
int i;
- for(i = chain_start; i < (chain_start + ((TCP_HTABLE_SIZE/2) >> 2)); i++) {
- struct sock *sk = tcp_established_hash[i];
+ SOCKHASH_LOCK_READ_BH();
+ for(i = chain_start; i < (chain_start + ((tcp_ehash_size >> 1) >> 2)); i++) {
+ struct sock *sk;
+
+ sk = tcp_ehash[i];
while(sk) {
- if(!atomic_read(&sk->sock_readers) && sk->keepopen) {
+ struct sock *next = sk->next;
+
+ bh_lock_sock(sk);
+ if (sk->keepopen && !sk->lock.users) {
+ SOCKHASH_UNLOCK_READ_BH();
count += tcp_keepopen_proc(sk);
- if(count == sysctl_tcp_max_ka_probes)
- goto out;
+ SOCKHASH_LOCK_READ_BH();
}
- sk = sk->next;
+ bh_unlock_sock(sk);
+ if(count == sysctl_tcp_max_ka_probes)
+ goto out;
+ sk = next;
}
}
out:
- chain_start = ((chain_start + ((TCP_HTABLE_SIZE/2)>>2)) &
- ((TCP_HTABLE_SIZE/2) - 1));
+ SOCKHASH_UNLOCK_READ_BH();
+ chain_start = ((chain_start + ((tcp_ehash_size >> 1)>>2)) &
+ ((tcp_ehash_size >> 1) - 1));
}
/*
@@ -439,9 +468,11 @@ void tcp_retransmit_timer(unsigned long data)
return;
}
- if (atomic_read(&sk->sock_readers)) {
+ bh_lock_sock(sk);
+ if (sk->lock.users) {
/* Try again later */
tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ/20);
+ bh_unlock_sock(sk);
return;
}
@@ -508,12 +539,51 @@ void tcp_retransmit_timer(unsigned long data)
tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
tcp_write_timeout(sk);
+
+ bh_unlock_sock(sk);
}
/*
* Slow timer for SYN-RECV sockets
*/
+static void tcp_do_syn_queue(struct sock *sk, struct tcp_opt *tp, unsigned long now)
+{
+ struct open_request *prev, *req;
+
+ prev = (struct open_request *) &tp->syn_wait_queue;
+ for(req = tp->syn_wait_queue; req; ) {
+ struct open_request *next = req->dl_next;
+
+ if (! req->sk) {
+ tcp_synq_unlink(tp, req, prev);
+ if(req->retrans >= sysctl_tcp_retries1) {
+ (*req->class->destructor)(req);
+ tcp_dec_slow_timer(TCP_SLT_SYNACK);
+ tp->syn_backlog--;
+ tcp_openreq_free(req);
+ if (! tp->syn_wait_queue)
+ break;
+ } else {
+ unsigned long timeo;
+ struct open_request *rp;
+
+ (*req->class->rtx_syn_ack)(sk, req);
+ req->retrans++;
+ timeo = min((TCP_TIMEOUT_INIT << req->retrans),
+ (120 * HZ));
+ req->expires = now + timeo;
+ rp = prev->dl_next;
+ tcp_synq_queue(tp, req);
+ if(rp != prev->dl_next)
+ prev = prev->dl_next;
+ }
+ } else
+ prev = req;
+ req = next;
+ }
+}
+
/* This now scales very nicely. -DaveM */
static void tcp_syn_recv_timer(unsigned long data)
{
@@ -521,70 +591,21 @@ static void tcp_syn_recv_timer(unsigned long data)
unsigned long now = jiffies;
int i;
+ SOCKHASH_LOCK_READ_BH();
for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
sk = tcp_listening_hash[i];
-
while(sk) {
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
/* TCP_LISTEN is implied. */
- if (!atomic_read(&sk->sock_readers) && tp->syn_wait_queue) {
- struct open_request *prev = (struct open_request *)(&tp->syn_wait_queue);
- struct open_request *req = tp->syn_wait_queue;
- do {
- struct open_request *conn;
-
- conn = req;
- req = req->dl_next;
-
- if (conn->sk) {
- prev = conn;
- continue;
- }
-
- if ((long)(now - conn->expires) <= 0)
- break;
-
-
- tcp_synq_unlink(tp, conn, prev);
- if (conn->retrans >= sysctl_tcp_retries1) {
-#ifdef TCP_DEBUG
- printk(KERN_DEBUG "syn_recv: "
- "too many retransmits\n");
-#endif
- (*conn->class->destructor)(conn);
- tcp_dec_slow_timer(TCP_SLT_SYNACK);
- tp->syn_backlog--;
- tcp_openreq_free(conn);
-
- if (!tp->syn_wait_queue)
- break;
- } else {
- unsigned long timeo;
- struct open_request *op;
-
- (*conn->class->rtx_syn_ack)(sk, conn);
-
- conn->retrans++;
-#ifdef TCP_DEBUG
- printk(KERN_DEBUG "syn_ack rtx %d\n",
- conn->retrans);
-#endif
- timeo = min((TCP_TIMEOUT_INIT
- << conn->retrans),
- 120*HZ);
- conn->expires = now + timeo;
- op = prev->dl_next;
- tcp_synq_queue(tp, conn);
- if (op != prev->dl_next)
- prev = prev->dl_next;
- }
- /* old prev still valid here */
- } while (req);
- }
+ bh_lock_sock(sk);
+ if (!sk->lock.users && tp->syn_wait_queue)
+ tcp_do_syn_queue(sk, tp, now);
+ bh_unlock_sock(sk);
sk = sk->next;
}
}
+ SOCKHASH_UNLOCK_READ_BH();
}
void tcp_sltimer_handler(unsigned long data)
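The factored-out tcp_do_syn_queue() keeps the old exponential backoff for SYN-ACK retransmits: each retry doubles the interval (TCP_TIMEOUT_INIT << req->retrans) and clamps it at 120 * HZ. A rough user-space sketch of that schedule, assuming stand-in values of HZ = 100 and TCP_TIMEOUT_INIT = 3 * HZ:

#include <stdio.h>

#define HZ			100		/* assumed; architecture-dependent in the kernel */
#define TCP_TIMEOUT_INIT	(3 * HZ)	/* assumed initial retransmission timeout */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Mirrors: timeo = min((TCP_TIMEOUT_INIT << req->retrans), (120 * HZ)) */
	for (int retrans = 1; retrans <= 8; retrans++) {
		unsigned long timeo = min_ul((unsigned long)TCP_TIMEOUT_INIT << retrans,
					     120UL * HZ);
		printf("retrans %d -> next SYN-ACK in %lu jiffies (%.1f s)\n",
		       retrans, timeo, (double)timeo / HZ);
	}
	return 0;
}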
diff --git a/net/ipv4/timer.c b/net/ipv4/timer.c
index 3821a7c4c..0487f5bfa 100644
--- a/net/ipv4/timer.c
+++ b/net/ipv4/timer.c
@@ -5,7 +5,7 @@
*
* TIMER - implementation of software timers for IP.
*
- * Version: $Id: timer.c,v 1.15 1999/02/22 13:54:29 davem Exp $
+ * Version: $Id: timer.c,v 1.16 1999/05/27 00:37:39 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -69,13 +69,15 @@ void net_reset_timer (struct sock *t, int timeout, unsigned long len)
*/
void net_timer (unsigned long data)
{
- struct sock *sk = (struct sock*)data;
+ struct sock *sk = (struct sock *) data;
int why = sk->timeout;
/* Only process if socket is not in use. */
- if (atomic_read(&sk->sock_readers)) {
+ bh_lock_sock(sk);
+ if (sk->lock.users) {
/* Try again later. */
mod_timer(&sk->timer, jiffies+HZ/20);
+ bh_unlock_sock(sk);
return;
}
@@ -99,15 +101,15 @@ void net_timer (unsigned long data)
printk (KERN_DEBUG "non CLOSE socket in time_done\n");
break;
}
- destroy_sock (sk);
- break;
+ destroy_sock(sk);
+ return;
case TIME_DESTROY:
/* We've waited for a while for all the memory associated with
* the socket to be freed.
*/
destroy_sock(sk);
- break;
+ return;
case TIME_CLOSE:
/* We've waited long enough, close the socket. */
@@ -123,5 +125,8 @@ void net_timer (unsigned long data)
printk ("net_timer: timer expired - reason %d is unknown\n", why);
break;
}
+
+ /* We only need to unlock if the socket was not destroyed. */
+ bh_unlock_sock(sk);
}
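The same locking pattern recurs in each of these handlers: take the BH lock, and if a user context still holds the socket (sk->lock.users) re-arm the timer and try again later; the destroy paths return without unlocking because the socket no longer exists. A simplified user-space model of that control flow, using stand-in types rather than kernel API:

#include <stdio.h>

enum { TIME_DESTROY = 1, TIME_CLOSE };

struct fake_sock {
	int lock_users;		/* stand-in for sk->lock.users */
	int timeout;		/* stand-in for sk->timeout */
};

/* Shape of net_timer() after the patch: lock, defer if the socket is
 * in use, and skip the unlock once the socket has been destroyed. */
static void timer_handler(struct fake_sock *sk)
{
	/* bh_lock_sock(sk) would go here */
	if (sk->lock_users) {
		printf("socket busy, re-arming timer\n");
		/* bh_unlock_sock(sk) */
		return;
	}

	switch (sk->timeout) {
	case TIME_DESTROY:
		printf("destroying socket\n");
		return;		/* no unlock: the socket is gone */
	case TIME_CLOSE:
		printf("closing socket\n");
		break;
	}
	/* bh_unlock_sock(sk): only reached when the socket survived */
}

int main(void)
{
	struct fake_sock busy = { 1, TIME_DESTROY };
	struct fake_sock idle = { 0, TIME_CLOSE };

	timer_handler(&busy);
	timer_handler(&idle);
	return 0;
}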
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5fcec9cf3..320e5151e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -5,7 +5,7 @@
*
* The User Datagram Protocol (UDP).
*
- * Version: $Id: udp.c,v 1.66 1999/05/08 20:00:25 davem Exp $
+ * Version: $Id: udp.c,v 1.69 1999/06/09 11:15:31 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -128,7 +128,7 @@ static int udp_v4_verify_bind(struct sock *sk, unsigned short snum)
struct sock *sk2;
int retval = 0, sk_reuse = sk->reuse;
- SOCKHASH_LOCK();
+ SOCKHASH_LOCK_READ();
for(sk2 = udp_hash[snum & (UDP_HTABLE_SIZE - 1)]; sk2 != NULL; sk2 = sk2->next) {
if((sk2->num == snum) && (sk2 != sk)) {
unsigned char state = sk2->state;
@@ -158,7 +158,7 @@ static int udp_v4_verify_bind(struct sock *sk, unsigned short snum)
}
}
}
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_READ();
return retval;
}
@@ -173,14 +173,14 @@ static inline int udp_lport_inuse(u16 num)
return 0;
}
-/* Shared by v4/v6 tcp. */
+/* Shared by v4/v6 udp. */
unsigned short udp_good_socknum(void)
{
int result;
static int start = 0;
int i, best, best_size_so_far;
- SOCKHASH_LOCK();
+ SOCKHASH_LOCK_READ();
if (start > sysctl_local_port_range[1] || start < sysctl_local_port_range[0])
start = sysctl_local_port_range[0];
@@ -223,15 +223,10 @@ unsigned short udp_good_socknum(void)
}
out:
start = result;
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_READ();
return result;
}
-/* Last hit UDP socket cache, this is ipv4 specific so make it static. */
-static u32 uh_cache_saddr, uh_cache_daddr;
-static u16 uh_cache_dport, uh_cache_sport;
-static struct sock *uh_cache_sk = NULL;
-
static void udp_v4_hash(struct sock *sk)
{
struct sock **skp;
@@ -240,11 +235,11 @@ static void udp_v4_hash(struct sock *sk)
num &= (UDP_HTABLE_SIZE - 1);
skp = &udp_hash[num];
- SOCKHASH_LOCK();
+ SOCKHASH_LOCK_WRITE();
sk->next = *skp;
*skp = sk;
sk->hashent = num;
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_WRITE();
}
static void udp_v4_unhash(struct sock *sk)
@@ -255,7 +250,7 @@ static void udp_v4_unhash(struct sock *sk)
num &= (UDP_HTABLE_SIZE - 1);
skp = &udp_hash[num];
- SOCKHASH_LOCK();
+ SOCKHASH_LOCK_WRITE();
while(*skp != NULL) {
if(*skp == sk) {
*skp = sk->next;
@@ -263,9 +258,7 @@ static void udp_v4_unhash(struct sock *sk)
}
skp = &((*skp)->next);
}
- if(uh_cache_sk == sk)
- uh_cache_sk = NULL;
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_WRITE();
}
static void udp_v4_rehash(struct sock *sk)
@@ -277,7 +270,7 @@ static void udp_v4_rehash(struct sock *sk)
num &= (UDP_HTABLE_SIZE - 1);
skp = &udp_hash[oldnum];
- SOCKHASH_LOCK();
+ SOCKHASH_LOCK_WRITE();
while(*skp != NULL) {
if(*skp == sk) {
*skp = sk->next;
@@ -288,13 +281,11 @@ static void udp_v4_rehash(struct sock *sk)
sk->next = udp_hash[num];
udp_hash[num] = sk;
sk->hashent = num;
- if(uh_cache_sk == sk)
- uh_cache_sk = NULL;
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_WRITE();
}
/* UDP nearly always wildcards out the wazoo, so it makes no sense to try
- * harder than this here plus the last hit cache. -DaveM
+ * harder than this. -DaveM
*/
struct sock *udp_v4_lookup_longway(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
{
@@ -341,21 +332,9 @@ __inline__ struct sock *udp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport
{
struct sock *sk;
- if(!dif && uh_cache_sk &&
- uh_cache_saddr == saddr &&
- uh_cache_sport == sport &&
- uh_cache_dport == dport &&
- uh_cache_daddr == daddr)
- return uh_cache_sk;
-
+ SOCKHASH_LOCK_READ();
sk = udp_v4_lookup_longway(saddr, sport, daddr, dport, dif);
- if(!dif) {
- uh_cache_sk = sk;
- uh_cache_saddr = saddr;
- uh_cache_daddr = daddr;
- uh_cache_sport = sport;
- uh_cache_dport = dport;
- }
+ SOCKHASH_UNLOCK_READ();
return sk;
}
@@ -393,7 +372,7 @@ static struct sock *udp_v4_proxy_lookup(unsigned short num, unsigned long raddr,
paddr = idev->ifa_list->ifa_local;
}
- SOCKHASH_LOCK();
+ SOCKHASH_LOCK_READ();
for(s = udp_v4_proxy_loop_init(hnum, hpnum, s, firstpass);
s != NULL;
s = udp_v4_proxy_loop_next(hnum, hpnum, s, firstpass)) {
@@ -431,7 +410,7 @@ static struct sock *udp_v4_proxy_lookup(unsigned short num, unsigned long raddr,
}
}
}
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_READ();
return result;
}
@@ -784,7 +763,10 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
/* 4.1.3.4. It's configurable by the application via setsockopt() */
/* (MAY) and it defaults to on (MUST). */
- err = ip_build_xmit(sk,sk->no_check ? udp_getfrag_nosum : udp_getfrag,
+ err = ip_build_xmit(sk,
+ (sk->no_check == UDP_CSUM_NOXMIT ?
+ udp_getfrag_nosum :
+ udp_getfrag),
&ufh, ulen, &ipc, rt, msg->msg_flags);
out:
@@ -979,8 +961,6 @@ int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
sk->rcv_saddr=INADDR_ANY;
sk->daddr=INADDR_ANY;
sk->state = TCP_CLOSE;
- if(uh_cache_sk == sk)
- uh_cache_sk = NULL;
return 0;
}
@@ -1005,9 +985,6 @@ int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
sk->dport = usin->sin_port;
sk->state = TCP_ESTABLISHED;
- if(uh_cache_sk == sk)
- uh_cache_sk = NULL;
-
sk->dst_cache = &rt->u.dst;
return(0);
}
@@ -1015,6 +992,8 @@ int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
static void udp_close(struct sock *sk, long timeout)
{
+ bh_lock_sock(sk);
+
/* See for explanation: raw_close in ipv4/raw.c */
sk->state = TCP_CLOSE;
udp_v4_unhash(sk);
@@ -1117,6 +1096,33 @@ int udp_chkaddr(struct sk_buff *skb)
}
#endif
+static int udp_checksum_verify(struct sk_buff *skb, struct udphdr *uh,
+ unsigned short ulen, u32 saddr, u32 daddr,
+ int full_csum_deferred)
+{
+ if (!full_csum_deferred) {
+ if (uh->check) {
+ if (skb->ip_summed == CHECKSUM_HW &&
+ udp_check(uh, ulen, saddr, daddr, skb->csum))
+ return -1;
+ if (skb->ip_summed == CHECKSUM_NONE &&
+ udp_check(uh, ulen, saddr, daddr,
+ csum_partial((char *)uh, ulen, 0)))
+ return -1;
+ }
+ } else {
+ if (uh->check == 0)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ else if (skb->ip_summed == CHECKSUM_HW) {
+ if (udp_check(uh, ulen, saddr, daddr, skb->csum))
+ return -1;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ } else if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+ skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
+ }
+ return 0;
+}
+
/*
* All we need to do is get the socket, and then do a checksum.
*/
@@ -1158,25 +1164,18 @@ int udp_rcv(struct sk_buff *skb, unsigned short len)
}
skb_trim(skb, ulen);
-#ifndef CONFIG_UDP_DELAY_CSUM
- if (uh->check &&
- (((skb->ip_summed==CHECKSUM_HW)&&udp_check(uh,ulen,saddr,daddr,skb->csum)) ||
- ((skb->ip_summed==CHECKSUM_NONE) &&
- (udp_check(uh,ulen,saddr,daddr, csum_partial((char*)uh, ulen, 0))))))
- goto csum_error;
+ if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) {
+ int defer;
+
+#ifdef CONFIG_UDP_DELAY_CSUM
+ defer = 1;
#else
- if (uh->check==0)
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- else if (skb->ip_summed==CHECKSUM_HW) {
- if (udp_check(uh,ulen,saddr,daddr,skb->csum))
- goto csum_error;
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- } else if (skb->ip_summed != CHECKSUM_UNNECESSARY)
- skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
+ defer = 0;
#endif
-
- if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
+ if (udp_checksum_verify(skb, uh, ulen, saddr, daddr, defer))
+ goto csum_error;
return udp_v4_mcast_deliver(skb, uh, saddr, daddr);
+ }
#ifdef CONFIG_IP_TRANSPARENT_PROXY
if (IPCB(skb)->redirport)
@@ -1203,6 +1202,15 @@ int udp_rcv(struct sk_buff *skb, unsigned short len)
kfree_skb(skb);
return(0);
}
+ if (udp_checksum_verify(skb, uh, ulen, saddr, daddr,
+#ifdef CONFIG_UDP_DELAY_CSUM
+ 1
+#else
+ (sk->no_check & UDP_CSUM_NORCV) != 0
+#endif
+ ))
+ goto csum_error;
+
udp_deliver(sk, skb);
return 0;
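udp_checksum_verify() collapses the two former #ifdef branches into one helper: with deferral off it verifies the datagram immediately, while with deferral on it accepts a zero checksum (RFC 768's "no checksum sent" value), trusts a hardware sum when one is present, and otherwise seeds skb->csum with the pseudo-header sum so the check can be completed during the copy to user space. As a rough user-space illustration of the 16-bit one's-complement sum over the IPv4 pseudo-header, UDP header, and payload that udp_check() ultimately validates (the addresses and ports below are made up):

#include <stdio.h>
#include <stdint.h>

/* 16-bit one's-complement sum with end-around carry, big-endian word order. */
static uint16_t csum(const uint8_t *p, size_t len)
{
	uint32_t sum = 0;

	while (len > 1) {
		sum += (p[0] << 8) | p[1];
		p += 2;
		len -= 2;
	}
	if (len)
		sum += p[0] << 8;		/* odd trailing byte, zero-padded */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* Pseudo-header (src 192.0.2.1, dst 192.0.2.2, proto 17, length 10)
	 * followed by a UDP header (sport 12345, dport 53, checksum 0)
	 * and the two-byte payload "hi". */
	uint8_t dgram[] = {
		192, 0, 2, 1,  192, 0, 2, 2,  0, 17,  0, 10,	/* pseudo-header */
		0x30, 0x39,  0x00, 0x35,  0, 10,  0, 0,		/* UDP header */
		'h', 'i'					/* payload */
	};
	uint16_t check = csum(dgram, sizeof(dgram));

	if (check == 0)
		check = 0xffff;	/* RFC 768 reserves 0 for "no checksum sent" */
	printf("UDP checksum: 0x%04x\n", check);
	return 0;
}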
diff --git a/net/ipv4/utils.c b/net/ipv4/utils.c
index ce74ade2a..5992cbc55 100644
--- a/net/ipv4/utils.c
+++ b/net/ipv4/utils.c
@@ -6,7 +6,7 @@
* Various kernel-resident INET utility functions; mainly
* for format conversion and debugging output.
*
- * Version: $Id: utils.c,v 1.6 1997/12/13 21:53:03 kuznet Exp $
+ * Version: $Id: utils.c,v 1.7 1999/06/09 10:11:05 davem Exp $
*
* Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
*
@@ -57,6 +57,11 @@ char *in_ntoa(__u32 in)
return(buff);
}
+char *in_ntoa2(__u32 in, char *buff)
+{
+ sprintf(buff, "%d.%d.%d.%d", NIPQUAD(in));
+ return buff;
+}
/*
* Convert an ASCII string to binary IP.