author     Ralf Baechle <ralf@linux-mips.org>   1999-10-09 00:00:47 +0000
committer  Ralf Baechle <ralf@linux-mips.org>   1999-10-09 00:00:47 +0000
commit     d6434e1042f3b0a6dfe1b1f615af369486f9b1fa
tree       e2be02f33984c48ec019c654051d27964e42c441  /net/core
parent     609d1e803baf519487233b765eb487f9ec227a18
Merge with 2.3.19.
Diffstat (limited to 'net/core')
-rw-r--r--  net/core/.cvsignore     2
-rw-r--r--  net/core/Makefile       4
-rw-r--r--  net/core/datagram.c   127
-rw-r--r--  net/core/dev.c        477
-rw-r--r--  net/core/dev_mcast.c   22
-rw-r--r--  net/core/dst.c         44
-rw-r--r--  net/core/filter.c      10
-rw-r--r--  net/core/firewall.c   160
-rw-r--r--  net/core/iovec.c        1
-rw-r--r--  net/core/neighbour.c  214
-rw-r--r--  net/core/netfilter.c  630
-rw-r--r--  net/core/profile.c     22
-rw-r--r--  net/core/rtnetlink.c   73
-rw-r--r--  net/core/scm.c          6
-rw-r--r--  net/core/skbuff.c      63
-rw-r--r--  net/core/sock.c       181
-rw-r--r--  net/core/utils.c       17
17 files changed, 1388 insertions(+), 665 deletions(-)
diff --git a/net/core/.cvsignore b/net/core/.cvsignore deleted file mode 100644 index 857dd22e9..000000000 --- a/net/core/.cvsignore +++ /dev/null @@ -1,2 +0,0 @@ -.depend -.*.flags diff --git a/net/core/Makefile b/net/core/Makefile index 5df65cd22..7ee0db3fd 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -25,8 +25,8 @@ ifdef CONFIG_NET O_OBJS += dev.o dev_mcast.o dst.o neighbour.o rtnetlink.o utils.o -ifdef CONFIG_FIREWALL -OX_OBJS += firewall.o +ifdef CONFIG_NETFILTER +OX_OBJS += netfilter.o endif endif diff --git a/net/core/datagram.c b/net/core/datagram.c index 98233a224..4c200cf3d 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -46,33 +46,62 @@ /* + * Is a socket 'connection oriented' ? + */ + +static inline int connection_based(struct sock *sk) +{ + return (sk->type==SOCK_SEQPACKET || sk->type==SOCK_STREAM); +} + + +/* * Wait for a packet.. - * - * Interrupts off so that no packet arrives before we begin sleeping. - * Otherwise we might miss our wake up */ -static inline void wait_for_packet(struct sock * sk) +static int wait_for_packet(struct sock * sk, int *err) { + int error; + DECLARE_WAITQUEUE(wait, current); + __set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(sk->sleep, &wait); - current->state = TASK_INTERRUPTIBLE; - if (skb_peek(&sk->receive_queue) == NULL) - schedule(); + /* Socket errors? */ + error = sock_error(sk); + if (error) + goto out; + + if (!skb_queue_empty(&sk->receive_queue)) + goto ready; + + /* Socket shut down? */ + if (sk->shutdown & RCV_SHUTDOWN) + goto out; + + /* Sequenced packets can come disconnected. If so we report the problem */ + error = -ENOTCONN; + if(connection_based(sk) && sk->state!=TCP_ESTABLISHED) + goto out; + + /* handle signals */ + error = -ERESTARTSYS; + if (signal_pending(current)) + goto out; + schedule(); + +ready: current->state = TASK_RUNNING; remove_wait_queue(sk->sleep, &wait); -} + return 0; -/* - * Is a socket 'connection oriented' ? - */ - -static inline int connection_based(struct sock *sk) -{ - return (sk->type==SOCK_SEQPACKET || sk->type==SOCK_STREAM); +out: + current->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); + *err = error; + return error; } /* @@ -108,64 +137,36 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, if (error) goto no_packet; -restart: - while(skb_queue_empty(&sk->receive_queue)) /* No data */ - { - /* Socket errors? */ - error = sock_error(sk); - if (error) - goto no_packet; + do { + /* Again only user level code calls this function, so nothing interrupt level + will suddenly eat the receive_queue. - /* Socket shut down? */ - if (sk->shutdown & RCV_SHUTDOWN) - goto no_packet; + Look at current nfs client by the way... + However, this function was corrent in any case. 8) + */ + if (flags & MSG_PEEK) + { + unsigned long cpu_flags; - /* Sequenced packets can come disconnected. 
If so we report the problem */ - error = -ENOTCONN; - if(connection_based(sk) && sk->state!=TCP_ESTABLISHED) - goto no_packet; + spin_lock_irqsave(&sk->receive_queue.lock, cpu_flags); + skb = skb_peek(&sk->receive_queue); + if(skb!=NULL) + atomic_inc(&skb->users); + spin_unlock_irqrestore(&sk->receive_queue.lock, cpu_flags); + } else + skb = skb_dequeue(&sk->receive_queue); - /* handle signals */ - error = -ERESTARTSYS; - if (signal_pending(current)) - goto no_packet; + if (skb) + return skb; /* User doesn't want to wait */ error = -EAGAIN; if (noblock) goto no_packet; - wait_for_packet(sk); - } + } while (wait_for_packet(sk, err) == 0); - /* Again only user level code calls this function, so nothing interrupt level - will suddenly eat the receive_queue */ - if (flags & MSG_PEEK) - { - unsigned long cpu_flags; - - /* It is the only POTENTIAL race condition - in this function. skb may be stolen by - another receiver after peek, but before - incrementing use count, provided kernel - is reentearble (it is not) or this function - is called by interrupts. - - Protect it with skb queue spinlock, - though for now even this is overkill. - --ANK (980728) - */ - spin_lock_irqsave(&sk->receive_queue.lock, cpu_flags); - skb = skb_peek(&sk->receive_queue); - if(skb!=NULL) - atomic_inc(&skb->users); - spin_unlock_irqrestore(&sk->receive_queue.lock, cpu_flags); - } else - skb = skb_dequeue(&sk->receive_queue); - - if (!skb) /* Avoid race if someone beats us to the data */ - goto restart; - return skb; + return NULL; no_packet: *err = error; diff --git a/net/core/dev.c b/net/core/dev.c index b9bd18343..955497d90 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -134,13 +134,6 @@ static struct packet_type *ptype_all = NULL; /* Taps */ static rwlock_t ptype_lock = RW_LOCK_UNLOCKED; /* - * Device list lock. Setting it provides that interface - * will not disappear unexpectedly while kernel sleeps. - */ - -atomic_t dev_lockct = ATOMIC_INIT(0); - -/* * Our notifier list */ @@ -159,7 +152,7 @@ int netdev_fastroute_obstacles; struct net_fastroute_stats dev_fastroute_stat; #endif -static void dev_clear_backlog(struct device *dev); +static void dev_clear_backlog(struct net_device *dev); /****************************************************************************************** @@ -256,50 +249,101 @@ void dev_remove_pack(struct packet_type *pt) ******************************************************************************************/ /* - * Find an interface by name. + * Find an interface by name. May be called under rtnl semaphore + * or dev_base_lock. */ -struct device *dev_get(const char *name) + +struct net_device *__dev_get_by_name(const char *name) { - struct device *dev; + struct net_device *dev; - read_lock(&dev_base_lock); for (dev = dev_base; dev != NULL; dev = dev->next) { if (strcmp(dev->name, name) == 0) - goto out; + return dev; } -out: + return NULL; +} + +/* + * Find an interface by name. Any context, dev_put() to release. + */ + +struct net_device *dev_get_by_name(const char *name) +{ + struct net_device *dev; + + read_lock(&dev_base_lock); + dev = __dev_get_by_name(name); + if (dev) + dev_hold(dev); read_unlock(&dev_base_lock); return dev; } -struct device * dev_get_by_index(int ifindex) +/* + Return value is changed to int to prevent illegal usage in future. + It is still legal to use to check for device existance. 
+ */ + +int dev_get(const char *name) { - struct device *dev; + struct net_device *dev; read_lock(&dev_base_lock); + dev = __dev_get_by_name(name); + read_unlock(&dev_base_lock); + return dev != NULL; +} + +/* + * Find an interface by index. May be called under rtnl semaphore + * or dev_base_lock. + */ + +struct net_device * __dev_get_by_index(int ifindex) +{ + struct net_device *dev; + for (dev = dev_base; dev != NULL; dev = dev->next) { if (dev->ifindex == ifindex) - goto out; + return dev; } -out: + return NULL; +} + +/* + * Find an interface by index. Any context, dev_put() to release. + */ + +struct net_device * dev_get_by_index(int ifindex) +{ + struct net_device *dev; + + read_lock(&dev_base_lock); + dev = __dev_get_by_index(ifindex); + if (dev) + dev_hold(dev); read_unlock(&dev_base_lock); return dev; } -struct device *dev_getbyhwaddr(unsigned short type, char *ha) +/* + * Find an interface by ll addr. May be called only under rtnl semaphore. + */ + +struct net_device *dev_getbyhwaddr(unsigned short type, char *ha) { - struct device *dev; + struct net_device *dev; + + ASSERT_RTNL(); - read_lock(&dev_base_lock); for (dev = dev_base; dev != NULL; dev = dev->next) { if (dev->type == type && memcmp(dev->dev_addr, ha, dev->addr_len) == 0) - goto out; + return dev; } -out: - read_unlock(&dev_base_lock); - return dev; + return NULL; } /* @@ -307,7 +351,7 @@ out: * id. Not efficient for many devices, not called a lot.. */ -int dev_alloc_name(struct device *dev, const char *name) +int dev_alloc_name(struct net_device *dev, const char *name) { int i; /* @@ -316,15 +360,15 @@ int dev_alloc_name(struct device *dev, const char *name) for(i=0;i<100;i++) { sprintf(dev->name,name,i); - if(dev_get(dev->name)==NULL) + if(__dev_get_by_name(dev->name)==NULL) return i; } return -ENFILE; /* Over 100 of the things .. bail out! */ } -struct device *dev_alloc(const char *name, int *err) +struct net_device *dev_alloc(const char *name, int *err) { - struct device *dev=kmalloc(sizeof(struct device)+16, GFP_KERNEL); + struct net_device *dev=kmalloc(sizeof(struct net_device)+16, GFP_KERNEL); if(dev==NULL) { *err=-ENOBUFS; @@ -340,7 +384,7 @@ struct device *dev_alloc(const char *name, int *err) return dev; } -void netdev_state_change(struct device *dev) +void netdev_state_change(struct net_device *dev) { if (dev->flags&IFF_UP) notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); @@ -355,7 +399,7 @@ void netdev_state_change(struct device *dev) void dev_load(const char *name) { - if(!dev_get(name) && capable(CAP_SYS_MODULE)) + if(!__dev_get_by_name(name) && capable(CAP_SYS_MODULE)) request_module(name); } @@ -376,7 +420,7 @@ static int default_rebuild_header(struct sk_buff *skb) * Prepare an interface for use. 
*/ -int dev_open(struct device *dev) +int dev_open(struct net_device *dev) { int ret = 0; @@ -434,17 +478,25 @@ int dev_open(struct device *dev) #ifdef CONFIG_NET_FASTROUTE -static __inline__ void dev_do_clear_fastroute(struct device *dev) +static void dev_do_clear_fastroute(struct net_device *dev) { if (dev->accept_fastpath) { int i; - for (i=0; i<=NETDEV_FASTROUTE_HMASK; i++) - dst_release_irqwait(xchg(dev->fastpath+i, NULL)); + for (i=0; i<=NETDEV_FASTROUTE_HMASK; i++) { + struct dst_entry *dst; + + write_lock_irq(&dev->fastpath_lock); + dst = dev->fastpath[i]; + dev->fastpath[i] = NULL; + write_unlock_irq(&dev->fastpath_lock); + + dst_release(dst); + } } } -void dev_clear_fastroute(struct device *dev) +void dev_clear_fastroute(struct net_device *dev) { if (dev) { dev_do_clear_fastroute(dev); @@ -461,15 +513,13 @@ void dev_clear_fastroute(struct device *dev) * Completely shutdown an interface. */ -int dev_close(struct device *dev) +int dev_close(struct net_device *dev) { if (!(dev->flags&IFF_UP)) return 0; dev_deactivate(dev); - dev_lock_wait(); - /* * Call the device specific close. This cannot fail. * Only if device is UP @@ -520,7 +570,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb) * taps currently in use. */ -void dev_queue_xmit_nit(struct sk_buff *skb, struct device *dev) +void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { struct packet_type *ptype; get_fast_time(&skb->stamp); @@ -538,16 +588,7 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct device *dev) if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) break; - /* Code, following below is wrong. - - The only reason, why it does work is that - ONLY packet sockets receive outgoing - packets. If such a packet will be (occasionally) - received by normal packet handler, which expects - that mac header is pulled... - */ - - /* More sensible variant. skb->nh should be correctly + /* skb->nh should be correctly set by sender, so that the second statement is just protection against buggy protocols. */ @@ -563,6 +604,8 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct device *dev) skb2->h.raw = skb2->nh.raw; skb2->pkt_type = PACKET_OUTGOING; + skb2->rx_dev = skb->dev; + dev_hold(skb2->rx_dev); ptype->func(skb2, skb->dev, ptype); } } @@ -590,26 +633,25 @@ void dev_loopback_xmit(struct sk_buff *skb) int dev_queue_xmit(struct sk_buff *skb) { - struct device *dev = skb->dev; + struct net_device *dev = skb->dev; struct Qdisc *q; /* Grab device queue */ spin_lock_bh(&dev->queue_lock); q = dev->qdisc; if (q->enqueue) { - q->enqueue(skb, q); + int ret = q->enqueue(skb, q); /* If the device is not busy, kick it. * Otherwise or if queue is not empty after kick, * add it to run list. */ - if (dev->tbusy || qdisc_restart(dev)) - qdisc_run(dev->qdisc); + if (dev->tbusy || __qdisc_wakeup(dev)) + qdisc_run(q); spin_unlock_bh(&dev->queue_lock); - return 0; + return ret; } - spin_unlock_bh(&dev->queue_lock); /* The device has no queue. Common case for software devices: loopback, all the sorts of tunnels... 
@@ -623,13 +665,13 @@ int dev_queue_xmit(struct sk_buff *skb) Either shot noqueue qdisc, it is even simpler 8) */ if (dev->flags&IFF_UP) { - if (netdev_nit) - dev_queue_xmit_nit(skb,dev); - - local_bh_disable(); if (dev->xmit_lock_owner != smp_processor_id()) { + spin_unlock(&dev->queue_lock); spin_lock(&dev->xmit_lock); dev->xmit_lock_owner = smp_processor_id(); + + if (netdev_nit) + dev_queue_xmit_nit(skb,dev); if (dev->hard_start_xmit(skb, dev) == 0) { dev->xmit_lock_owner = -1; spin_unlock_bh(&dev->xmit_lock); @@ -639,16 +681,18 @@ int dev_queue_xmit(struct sk_buff *skb) spin_unlock_bh(&dev->xmit_lock); if (net_ratelimit()) printk(KERN_DEBUG "Virtual device %s asks to queue packet!\n", dev->name); + kfree_skb(skb); + return -ENETDOWN; } else { /* Recursion is detected! It is possible, unfortunately */ - local_bh_enable(); if (net_ratelimit()) printk(KERN_DEBUG "Dead loop on virtual device %s, fix it urgently!\n", dev->name); } } + spin_unlock_bh(&dev->queue_lock); kfree_skb(skb); - return 0; + return -ENETDOWN; } @@ -664,20 +708,20 @@ atomic_t netdev_rx_dropped; int netdev_throttle_events; static unsigned long netdev_fc_mask = 1; unsigned long netdev_fc_xoff = 0; +spinlock_t netdev_fc_lock = SPIN_LOCK_UNLOCKED; static struct { - void (*stimul)(struct device *); - struct device *dev; + void (*stimul)(struct net_device *); + struct net_device *dev; } netdev_fc_slots[32]; -int netdev_register_fc(struct device *dev, void (*stimul)(struct device *dev)) +int netdev_register_fc(struct net_device *dev, void (*stimul)(struct net_device *dev)) { int bit = 0; unsigned long flags; - save_flags(flags); - cli(); + spin_lock_irqsave(&netdev_fc_lock, flags); if (netdev_fc_mask != ~0UL) { bit = ffz(netdev_fc_mask); netdev_fc_slots[bit].stimul = stimul; @@ -685,7 +729,7 @@ int netdev_register_fc(struct device *dev, void (*stimul)(struct device *dev)) set_bit(bit, &netdev_fc_mask); clear_bit(bit, &netdev_fc_xoff); } - restore_flags(flags); + spin_unlock_irqrestore(&netdev_fc_lock, flags); return bit; } @@ -693,22 +737,21 @@ void netdev_unregister_fc(int bit) { unsigned long flags; - save_flags(flags); - cli(); + spin_lock_irqsave(&netdev_fc_lock, flags); if (bit > 0) { netdev_fc_slots[bit].stimul = NULL; netdev_fc_slots[bit].dev = NULL; clear_bit(bit, &netdev_fc_mask); clear_bit(bit, &netdev_fc_xoff); } - restore_flags(flags); + spin_unlock_irqrestore(&netdev_fc_lock, flags); } static void netdev_wakeup(void) { unsigned long xoff; - cli(); + spin_lock_irq(&netdev_fc_lock); xoff = netdev_fc_xoff; netdev_fc_xoff = 0; netdev_dropping = 0; @@ -718,47 +761,46 @@ static void netdev_wakeup(void) xoff &= ~(1<<i); netdev_fc_slots[i].stimul(netdev_fc_slots[i].dev); } - sti(); + spin_unlock_irq(&netdev_fc_lock); } #endif -static void dev_clear_backlog(struct device *dev) +static void dev_clear_backlog(struct net_device *dev) { - struct sk_buff *prev, *curr; + struct sk_buff_head garbage; /* * * Let now clear backlog queue. -AS * - * We are competing here both with netif_rx() and net_bh(). - * We don't want either of those to mess with skb ptrs - * while we work on them, thus cli()/sti(). - * - * It looks better to use net_bh trick, at least - * to be sure, that we keep interrupt latency really low. 
--ANK (980727) - */ + */ + + skb_queue_head_init(&garbage); + spin_lock_irq(&backlog.lock); if (backlog.qlen) { - start_bh_atomic(); + struct sk_buff *prev, *curr; curr = backlog.next; - while ( curr != (struct sk_buff *)(&backlog) ) { - unsigned long flags; + + while (curr != (struct sk_buff *)(&backlog)) { curr=curr->next; - if ( curr->prev->dev == dev ) { + if (curr->prev->dev == dev) { prev = curr->prev; - spin_lock_irqsave(&backlog.lock, flags); __skb_unlink(prev, &backlog); - spin_unlock_irqrestore(&backlog.lock, flags); - kfree_skb(prev); + __skb_queue_tail(&garbage, prev); } } - end_bh_atomic(); + } + spin_unlock_irq(&backlog.lock); + + if (garbage.qlen) { #ifdef CONFIG_NET_HW_FLOWCONTROL if (netdev_dropping) netdev_wakeup(); #else netdev_dropping = 0; #endif + skb_queue_purge(&garbage); } } @@ -769,12 +811,8 @@ static void dev_clear_backlog(struct device *dev) void netif_rx(struct sk_buff *skb) { -#ifndef CONFIG_CPU_IS_SLOW if(skb->stamp.tv_sec==0) get_fast_time(&skb->stamp); -#else - skb->stamp = xtime; -#endif /* The code is rearranged so that the path is the most short when CPU is congested, but is still operating. @@ -783,6 +821,10 @@ void netif_rx(struct sk_buff *skb) if (backlog.qlen <= netdev_max_backlog) { if (backlog.qlen) { if (netdev_dropping == 0) { + if (skb->rx_dev) + dev_put(skb->rx_dev); + skb->rx_dev = skb->dev; + dev_hold(skb->rx_dev); skb_queue_tail(&backlog,skb); mark_bh(NET_BH); return; @@ -797,6 +839,10 @@ void netif_rx(struct sk_buff *skb) #else netdev_dropping = 0; #endif + if (skb->rx_dev) + dev_put(skb->rx_dev); + skb->rx_dev = skb->dev; + dev_hold(skb->rx_dev); skb_queue_tail(&backlog,skb); mark_bh(NET_BH); return; @@ -938,9 +984,15 @@ void net_bh(void) if (!ptype->dev || ptype->dev == skb->dev) { if(pt_prev) { - struct sk_buff *skb2=skb_clone(skb, GFP_ATOMIC); + struct sk_buff *skb2; + if (pt_prev->data == NULL) + skb2 = skb_clone(skb, GFP_ATOMIC); + else { + skb2 = skb; + atomic_inc(&skb2->users); + } if(skb2) - pt_prev->func(skb2,skb->dev, pt_prev); + pt_prev->func(skb2, skb->dev, pt_prev); } pt_prev=ptype; } @@ -958,7 +1010,12 @@ void net_bh(void) { struct sk_buff *skb2; - skb2=skb_clone(skb, GFP_ATOMIC); + if (pt_prev->data == NULL) + skb2 = skb_clone(skb, GFP_ATOMIC); + else { + skb2 = skb; + atomic_inc(&skb2->users); + } /* * Kick the protocol handler. This should be fast @@ -988,7 +1045,7 @@ void net_bh(void) } read_unlock(&ptype_lock); } /* End of queue loop */ - + /* * We have emptied the queue */ @@ -1041,26 +1098,29 @@ int register_gifconf(unsigned int family, gifconf_func_t * gifconf) static int dev_ifname(struct ifreq *arg) { - struct device *dev; + struct net_device *dev; struct ifreq ifr; - int err; /* * Fetch the caller's info block. 
*/ - err = copy_from_user(&ifr, arg, sizeof(struct ifreq)); - if (err) + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) return -EFAULT; - dev = dev_get_by_index(ifr.ifr_ifindex); - if (!dev) + read_lock(&dev_base_lock); + dev = __dev_get_by_index(ifr.ifr_ifindex); + if (!dev) { + read_unlock(&dev_base_lock); return -ENODEV; + } strcpy(ifr.ifr_name, dev->name); + read_unlock(&dev_base_lock); - err = copy_to_user(arg, &ifr, sizeof(struct ifreq)); - return (err)?-EFAULT:0; + if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + return 0; } /* @@ -1072,7 +1132,7 @@ static int dev_ifname(struct ifreq *arg) static int dev_ifconf(char *arg) { struct ifconf ifc; - struct device *dev; + struct net_device *dev; char *pos; int len; int total; @@ -1085,20 +1145,14 @@ static int dev_ifconf(char *arg) if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) return -EFAULT; + pos = ifc.ifc_buf; len = ifc.ifc_len; - if (ifc.ifc_buf) { - pos = (char *) kmalloc(len, GFP_KERNEL); - if(pos == NULL) - return -ENOBUFS; - } else - pos = NULL; /* * Loop over the interfaces, and write an info block for each. */ total = 0; - read_lock(&dev_base_lock); for (dev = dev_base; dev != NULL; dev = dev->next) { for (i=0; i<NPROTO; i++) { if (gifconf_list[i]) { @@ -1108,19 +1162,13 @@ static int dev_ifconf(char *arg) } else { done = gifconf_list[i](dev, pos+total, len-total); } + if (done<0) { + return -EFAULT; + } total += done; } } } - read_unlock(&dev_base_lock); - - if(pos != NULL) { - int err = copy_to_user(ifc.ifc_buf, pos, total); - - kfree(pos); - if(err) - return -EFAULT; - } /* * All done. Write the updated control block back to the caller. @@ -1142,7 +1190,8 @@ static int dev_ifconf(char *arg) */ #ifdef CONFIG_PROC_FS -static int sprintf_stats(char *buffer, struct device *dev) + +static int sprintf_stats(char *buffer, struct net_device *dev) { struct net_device_stats *stats = (dev->get_stats ? dev->get_stats(dev): NULL); int size; @@ -1181,7 +1230,7 @@ int dev_get_info(char *buffer, char **start, off_t offset, int length, int dummy off_t pos=0; int size; - struct device *dev; + struct net_device *dev; size = sprintf(buffer, @@ -1206,11 +1255,13 @@ int dev_get_info(char *buffer, char **start, off_t offset, int length, int dummy break; } read_unlock(&dev_base_lock); - + *start=buffer+(offset-begin); /* Start of wanted data */ len-=(offset-begin); /* Start slop */ if(len>length) len=length; /* Ending slop */ + if (len<0) + len=0; return len; } @@ -1258,7 +1309,7 @@ static int dev_proc_stats(char *buffer, char **start, off_t offset, * Print one entry of /proc/net/wireless * This is a clone of /proc/net/dev (just above) */ -static int sprintf_wireless_stats(char *buffer, struct device *dev) +static int sprintf_wireless_stats(char *buffer, struct net_device *dev) { /* Get stats from the driver */ struct iw_statistics *stats = (dev->get_wireless_stats ? 
@@ -1298,7 +1349,7 @@ int dev_get_wireless_info(char * buffer, char **start, off_t offset, off_t pos = 0; int size; - struct device * dev; + struct net_device * dev; size = sprintf(buffer, "Inter-|sta| Quality | Discarded packets\n" @@ -1326,13 +1377,15 @@ int dev_get_wireless_info(char * buffer, char **start, off_t offset, len -= (offset - begin); /* Start slop */ if(len > length) len = length; /* Ending slop */ + if (len<0) + len=0; return len; } #endif /* CONFIG_PROC_FS */ #endif /* CONFIG_NET_RADIO */ -void dev_set_promiscuity(struct device *dev, int inc) +void dev_set_promiscuity(struct net_device *dev, int inc) { unsigned short old_flags = dev->flags; @@ -1353,7 +1406,7 @@ void dev_set_promiscuity(struct device *dev, int inc) } } -void dev_set_allmulti(struct device *dev, int inc) +void dev_set_allmulti(struct net_device *dev, int inc) { unsigned short old_flags = dev->flags; @@ -1364,7 +1417,7 @@ void dev_set_allmulti(struct device *dev, int inc) dev_mc_upload(dev); } -int dev_change_flags(struct device *dev, unsigned flags) +int dev_change_flags(struct net_device *dev, unsigned flags) { int ret; int old_flags = dev->flags; @@ -1428,10 +1481,10 @@ int dev_change_flags(struct device *dev, unsigned flags) static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) { - struct device *dev; + struct net_device *dev; int err; - if ((dev = dev_get(ifr->ifr_name)) == NULL) + if ((dev = __dev_get_by_name(ifr->ifr_name)) == NULL) return -ENODEV; switch(cmd) @@ -1543,7 +1596,7 @@ static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) case SIOCSIFNAME: if (dev->flags&IFF_UP) return -EBUSY; - if (dev_get(ifr->ifr_newname)) + if (__dev_get_by_name(ifr->ifr_newname)) return -EEXIST; memcpy(dev->name, ifr->ifr_newname, IFNAMSIZ); dev->name[IFNAMSIZ-1] = 0; @@ -1632,7 +1685,9 @@ int dev_ioctl(unsigned int cmd, void *arg) case SIOCGIFINDEX: case SIOCGIFTXQLEN: dev_load(ifr.ifr_name); + read_lock(&dev_base_lock); ret = dev_ifsioc(&ifr, cmd); + read_unlock(&dev_base_lock); if (!ret) { if (colon) *colon = ':'; @@ -1716,7 +1771,7 @@ int dev_new_index(void) for (;;) { if (++ifindex <= 0) ifindex=1; - if (dev_get_by_index(ifindex) == NULL) + if (__dev_get_by_index(ifindex) == NULL) return ifindex; } } @@ -1724,13 +1779,16 @@ int dev_new_index(void) static int dev_boot_phase = 1; -int register_netdevice(struct device *dev) +int register_netdevice(struct net_device *dev) { - struct device *d, **dp; + struct net_device *d, **dp; spin_lock_init(&dev->queue_lock); spin_lock_init(&dev->xmit_lock); dev->xmit_lock_owner = -1; +#ifdef CONFIG_NET_FASTROUTE + dev->fastpath_lock=RW_LOCK_UNLOCKED; +#endif if (dev_boot_phase) { /* This is NOT bug, but I am not sure, that all the @@ -1755,6 +1813,7 @@ int register_netdevice(struct device *dev) dev->next = NULL; write_lock_bh(&dev_base_lock); *dp = dev; + dev_hold(dev); write_unlock_bh(&dev_base_lock); return 0; } @@ -1775,10 +1834,20 @@ int register_netdevice(struct device *dev) return -EEXIST; } } + /* + * nil rebuild_header routine, + * that should be never called and used as just bug trap. + */ + + if (dev->rebuild_header == NULL) + dev->rebuild_header = default_rebuild_header; + dev->next = NULL; dev_init_scheduler(dev); write_lock_bh(&dev_base_lock); *dp = dev; + dev_hold(dev); + dev->deadbeaf = 0; write_unlock_bh(&dev_base_lock); /* Notify protocols, that a new device appeared. 
*/ @@ -1787,37 +1856,51 @@ int register_netdevice(struct device *dev) return 0; } -int unregister_netdevice(struct device *dev) +int netdev_finish_unregister(struct net_device *dev) { - struct device *d, **dp; + BUG_TRAP(dev->ip_ptr==NULL); + BUG_TRAP(dev->ip6_ptr==NULL); + BUG_TRAP(dev->dn_ptr==NULL); + + if (!dev->deadbeaf) { + printk("Freeing alive device %p, %s\n", dev, dev->name); + return 0; + } +#ifdef NET_REFCNT_DEBUG + printk(KERN_DEBUG "netdev_finish_unregister: %s%s.\n", dev->name, dev->new_style?"":", old style"); +#endif + if (dev->destructor) + dev->destructor(dev); + if (dev->new_style) + kfree(dev); + return 0; +} + +int unregister_netdevice(struct net_device *dev) +{ + unsigned long now; + struct net_device *d, **dp; /* If device is running, close it first. */ if (dev->flags & IFF_UP) dev_close(dev); + BUG_TRAP(dev->deadbeaf==0); + dev->deadbeaf = 1; + /* And unlink it from device chain. */ for (dp = &dev_base; (d=*dp) != NULL; dp=&d->next) { if (d == dev) { write_lock_bh(&dev_base_lock); *dp = d->next; write_unlock_bh(&dev_base_lock); - - /* Sorry. It is known "feature". The race is clear. - Keep it after device reference counting will - be complete. - */ - synchronize_bh(); break; } } - if (d == NULL) + if (d == NULL) { + printk(KERN_DEBUG "unregister_netdevice: device %s/%p never was registered\n", dev->name, dev); return -ENODEV; - - /* It is "synchronize_bh" to those of guys, who overslept - in skb_alloc/page fault etc. that device is off-line. - Again, it can be removed only if devices are refcounted. - */ - dev_lock_wait(); + } if (dev_boot_phase == 0) { #ifdef CONFIG_NET_FASTROUTE @@ -1838,8 +1921,68 @@ int unregister_netdevice(struct device *dev) dev_mc_discard(dev); } - if (dev->destructor) - dev->destructor(dev); + if (dev->uninit) + dev->uninit(dev); + + if (dev->new_style) { +#ifdef NET_REFCNT_DEBUG + if (atomic_read(&dev->refcnt) != 1) + printk(KERN_DEBUG "unregister_netdevice: holding %s refcnt=%d\n", dev->name, atomic_read(&dev->refcnt)-1); +#endif + dev_put(dev); + return 0; + } + + /* Last reference is our one */ + if (atomic_read(&dev->refcnt) == 1) { + dev_put(dev); + return 0; + } + +#ifdef NET_REFCNT_DEBUG + printk("unregister_netdevice: waiting %s refcnt=%d\n", dev->name, atomic_read(&dev->refcnt)); +#endif + + /* EXPLANATION. If dev->refcnt is not 1 now (1 is our own reference) + it means that someone in the kernel still has reference + to this device and we cannot release it. + + "New style" devices have destructors, hence we can return from this + function and destructor will do all the work later. + + "Old style" devices expect that device is free of any references + upon exit from this function. WE CANNOT MAKE such release + without delay. Note that it is not new feature. Referencing devices + after they are released occured in 2.0 and 2.2. + Now we just can know about each fact of illegal usage. + + So, we linger for 10*HZ (it is an arbitrary number) + + After 1 second, we start to rebroadcast unregister notifications + in hope that careless clients will release the device. + + If timeout expired, we have no choice how to cross fingers + and return. Real alternative would be block here forever + and we will make it eventually, when all peaceful citizens + will be notified and repaired. 
+ */ + + now = jiffies; + while (atomic_read(&dev->refcnt) != 1) { + if ((jiffies - now) > 1*HZ) { + /* Rebroadcast unregister notification */ + notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev); + } + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ/4); + current->state = TASK_RUNNING; + if ((jiffies - now) > 10*HZ) + break; + } + + if (atomic_read(&dev->refcnt) != 1) + printk("unregister_netdevice: Old style device %s leaked(refcnt=%d). Wait for crash.\n", dev->name, atomic_read(&dev->refcnt)-1); + dev_put(dev); return 0; } @@ -1856,11 +1999,6 @@ extern int scc_init(void); extern void sdla_setup(void); extern void dlci_setup(void); extern int dmascc_init(void); -extern int sm_init(void); - -extern int baycom_ser_fdx_init(void); -extern int baycom_ser_hdx_init(void); -extern int baycom_par_init(void); extern int lapbeth_init(void); extern void arcnet_init(void); @@ -1889,9 +2027,9 @@ static struct proc_dir_entry proc_net_wireless = { #endif /* CONFIG_PROC_FS */ #endif /* CONFIG_NET_RADIO */ -__initfunc(int net_dev_init(void)) +int __init net_dev_init(void) { - struct device *dev, **dp; + struct net_device *dev, **dp; #ifdef CONFIG_NET_SCHED pktsched_init(); @@ -1932,18 +2070,6 @@ __initfunc(int net_dev_init(void)) #if defined(CONFIG_SDLA) sdla_setup(); #endif -#if defined(CONFIG_BAYCOM_PAR) - baycom_par_init(); -#endif -#if defined(CONFIG_BAYCOM_SER_FDX) - baycom_ser_fdx_init(); -#endif -#if defined(CONFIG_BAYCOM_SER_HDX) - baycom_ser_hdx_init(); -#endif -#if defined(CONFIG_SOUNDMODEM) - sm_init(); -#endif #if defined(CONFIG_LAPBETHER) lapbeth_init(); #endif @@ -1993,18 +2119,23 @@ __initfunc(int net_dev_init(void)) spin_lock_init(&dev->xmit_lock); dev->xmit_lock_owner = -1; dev->iflink = -1; + dev_hold(dev); if (dev->init && dev->init(dev)) { /* * It failed to come up. Unhook it. */ write_lock_bh(&dev_base_lock); *dp = dev->next; + dev->deadbeaf = 1; write_unlock_bh(&dev_base_lock); + dev_put(dev); } else { dp = &dev->next; dev->ifindex = dev_new_index(); if (dev->iflink == -1) dev->iflink = dev->ifindex; + if (dev->rebuild_header == NULL) + dev->rebuild_header = default_rebuild_header; dev_init_scheduler(dev); } } diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c index f7fcb1f87..c52df0507 100644 --- a/net/core/dev_mcast.c +++ b/net/core/dev_mcast.c @@ -68,7 +68,7 @@ static rwlock_t dev_mc_lock = RW_LOCK_UNLOCKED; * Update the multicast list into the physical NIC controller. */ -void dev_mc_upload(struct device *dev) +void dev_mc_upload(struct net_device *dev) { /* Don't do anything till we up the interface [dev_open will call this function so the list will @@ -97,7 +97,7 @@ void dev_mc_upload(struct device *dev) * Delete a device level multicast */ -int dev_mc_delete(struct device *dev, void *addr, int alen, int glbl) +int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl) { int err = 0; struct dev_mc_list *dmi, **dmip; @@ -123,13 +123,14 @@ int dev_mc_delete(struct device *dev, void *addr, int alen, int glbl) */ *dmip = dmi->next; dev->mc_count--; + write_unlock_bh(&dev_mc_lock); + kfree_s(dmi,sizeof(*dmi)); + /* * We have altered the list, so the card * loaded filter is now wrong. Fix it */ - write_unlock_bh(&dev_mc_lock); - dev_mc_upload(dev); return 0; } @@ -144,15 +145,12 @@ done: * Add a device level multicast */ -int dev_mc_add(struct device *dev, void *addr, int alen, int glbl) +int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl) { int err = 0; struct dev_mc_list *dmi, *dmi1; - /* RED-PEN: does gfp_any() work now? 
It requires - true local_bh_disable rather than global. - */ - dmi1 = (struct dev_mc_list *)kmalloc(sizeof(*dmi), gfp_any()); + dmi1 = (struct dev_mc_list *)kmalloc(sizeof(*dmi), GFP_ATOMIC); write_lock_bh(&dev_mc_lock); for(dmi=dev->mc_list; dmi!=NULL; dmi=dmi->next) { @@ -194,7 +192,7 @@ done: * Discard multicast list when a device is downed */ -void dev_mc_discard(struct device *dev) +void dev_mc_discard(struct net_device *dev) { write_lock_bh(&dev_mc_lock); while (dev->mc_list!=NULL) { @@ -215,7 +213,7 @@ static int dev_mc_read_proc(char *buffer, char **start, off_t offset, off_t pos=0, begin=0; struct dev_mc_list *m; int len=0; - struct device *dev; + struct net_device *dev; read_lock(&dev_base_lock); for (dev = dev_base; dev; dev = dev->next) { @@ -257,7 +255,7 @@ done: } #endif -__initfunc(void dev_mcast_init(void)) +void __init dev_mcast_init(void) { #ifdef CONFIG_PROC_FS struct proc_dir_entry *ent; diff --git a/net/core/dst.c b/net/core/dst.c index 92dd0941a..990d86682 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -5,6 +5,7 @@ * */ +#include <asm/segment.h> #include <asm/system.h> #include <asm/bitops.h> #include <linux/types.h> @@ -50,10 +51,11 @@ static void dst_run_gc(unsigned long dummy) return; } + del_timer(&dst_gc_timer); dstp = &dst_garbage_list; while ((dst = *dstp) != NULL) { - if (atomic_read(&dst->use)) { + if (atomic_read(&dst->__refcnt)) { dstp = &dst->next; delayed++; continue; @@ -91,7 +93,7 @@ static int dst_blackhole(struct sk_buff *skb) return 0; } -void * dst_alloc(int size, struct dst_ops * ops) +void * dst_alloc(struct dst_ops * ops) { struct dst_entry * dst; @@ -99,12 +101,11 @@ void * dst_alloc(int size, struct dst_ops * ops) if (ops->gc()) return NULL; } - dst = kmalloc(size, GFP_ATOMIC); + dst = kmem_cache_alloc(ops->kmem_cachep, SLAB_ATOMIC); if (!dst) return NULL; - memset(dst, 0, size); + memset(dst, 0, ops->entry_size); dst->ops = ops; - atomic_set(&dst->refcnt, 0); dst->lastuse = jiffies; dst->input = dst_discard; dst->output = dst_blackhole; @@ -123,7 +124,6 @@ void __dst_free(struct dst_entry * dst) if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { dst->input = dst_discard; dst->output = dst_blackhole; - dst->dev = &loopback_dev; } dst->obsolete = 2; dst->next = dst_garbage_list; @@ -157,13 +157,15 @@ void dst_destroy(struct dst_entry * dst) if (dst->ops->destroy) dst->ops->destroy(dst); + if (dst->dev) + dev_put(dst->dev); atomic_dec(&dst_total); - kfree(dst); + kmem_cache_free(dst->ops->kmem_cachep, dst); } static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct device *dev = ptr; + struct net_device *dev = ptr; struct dst_entry *dst; switch (event) { @@ -172,9 +174,27 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event, void spin_lock_bh(&dst_lock); for (dst = dst_garbage_list; dst; dst = dst->next) { if (dst->dev == dev) { - dst->input = dst_discard; - dst->output = dst_blackhole; - dst->dev = &loopback_dev; + /* Dirty hack. We did it in 2.2 (in __dst_free), + we have _very_ good reasons not to repeat + this mistake in 2.3, but we have no choice + now. _It_ _is_ _explicit_ _deliberate_ + _race_ _condition_. 
+ */ + if (event!=NETDEV_DOWN && !dev->new_style && + dst->output == dst_blackhole) { + dst->dev = &loopback_dev; + dev_put(dev); + dev_hold(&loopback_dev); + dst->output = dst_discard; + if (dst->neighbour && dst->neighbour->dev == dev) { + dst->neighbour->dev = &loopback_dev; + dev_put(dev); + dev_hold(&loopback_dev); + } + } else { + dst->input = dst_discard; + dst->output = dst_blackhole; + } } } spin_unlock_bh(&dst_lock); @@ -189,7 +209,7 @@ struct notifier_block dst_dev_notifier = { 0 }; -__initfunc(void dst_init(void)) +void __init dst_init(void) { register_netdevice_notifier(&dst_dev_notifier); } diff --git a/net/core/filter.c b/net/core/filter.c index 8e1ffb628..d9939e3a4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -49,7 +49,7 @@ static u8 *load_pointer(struct sk_buff *skb, int k) else if (k>=SKF_LL_OFF) ptr = skb->mac.raw + k - SKF_LL_OFF; - if (ptr<skb->head && ptr < skb->tail) + if (ptr >= skb->head && ptr < skb->tail) return ptr; return NULL; } @@ -248,6 +248,7 @@ load_b: continue; } } + return 0; case BPF_LD|BPF_W|BPF_LEN: A = len; @@ -440,9 +441,12 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) fp->len = fprog->len; if ((err = sk_chk_filter(fp->insns, fp->len))==0) { - struct sk_filter *old_fp = sk->filter; + struct sk_filter *old_fp; + + spin_lock_bh(&sk->lock.slock); + old_fp = sk->filter; sk->filter = fp; - synchronize_bh(); + spin_unlock_bh(&sk->lock.slock); fp = old_fp; } diff --git a/net/core/firewall.c b/net/core/firewall.c deleted file mode 100644 index 7ca90f49a..000000000 --- a/net/core/firewall.c +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Generic loadable firewalls. At the moment only IP will actually - * use these, but people can add the others as they are needed. - * - * Authors: Dave Bonn (for IP) - * much hacked by: Alan Cox - */ - -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/firewall.h> -#include <linux/init.h> -#include <linux/interrupt.h> -#include <asm/semaphore.h> - -DECLARE_MUTEX(firewall_sem); -static int firewall_policy[NPROTO]; -static struct firewall_ops *firewall_chain[NPROTO]; - -/* - * Register a firewall - */ - -int register_firewall(int pf, struct firewall_ops *fw) -{ - struct firewall_ops **p; - - if(pf<0||pf>=NPROTO) - return -EINVAL; - - /* - * Don't allow two people to adjust at once. - */ - - down(&firewall_sem); - - p=&firewall_chain[pf]; - - while(*p) - { - if(fw->fw_priority > (*p)->fw_priority) - break; - p=&((*p)->next); - } - - /* - * We need to use a memory barrier to make sure that this - * works correctly even in SMP with weakly ordered writes. - * - * This is atomic wrt interrupts (and generally walking the - * chain), but not wrt itself (so you can't call this from - * an interrupt. Not that you'd want to). - */ - - fw->next=*p; - mb(); - *p = fw; - - /* - * And release the sleep lock - */ - - up(&firewall_sem); - return 0; -} - -/* - * Unregister a firewall - */ - -int unregister_firewall(int pf, struct firewall_ops *fw) -{ - struct firewall_ops **nl; - - if(pf<0||pf>=NPROTO) - return -EINVAL; - - /* - * Don't allow two people to adjust at once. 
- */ - - down(&firewall_sem); - - nl=&firewall_chain[pf]; - - while(*nl!=NULL) - { - if(*nl==fw) - { - struct firewall_ops *f=fw->next; - *nl = f; - up(&firewall_sem); - synchronize_bh(); - return 0; - } - nl=&((*nl)->next); - } - up(&firewall_sem); - return -ENOENT; -} - -int call_fw_firewall(int pf, struct device *dev, void *phdr, void *arg, struct sk_buff **skb) -{ - struct firewall_ops *fw=firewall_chain[pf]; - - while(fw!=NULL) - { - int rc=fw->fw_forward(fw,pf,dev,phdr,arg,skb); - if(rc!=FW_SKIP) - return rc; - fw=fw->next; - } - return firewall_policy[pf]; -} - -/* - * Actual invocation of the chains - */ - -int call_in_firewall(int pf, struct device *dev, void *phdr, void *arg, struct sk_buff **skb) -{ - struct firewall_ops *fw=firewall_chain[pf]; - - while(fw!=NULL) - { - int rc=fw->fw_input(fw,pf,dev,phdr,arg,skb); - if(rc!=FW_SKIP) - return rc; - fw=fw->next; - } - return firewall_policy[pf]; -} - -int call_out_firewall(int pf, struct device *dev, void *phdr, void *arg, struct sk_buff **skb) -{ - struct firewall_ops *fw=firewall_chain[pf]; - - while(fw!=NULL) - { - int rc=fw->fw_output(fw,pf,dev,phdr,arg,skb); - if(rc!=FW_SKIP) - return rc; - fw=fw->next; - } - /* alan, is this right? */ - return firewall_policy[pf]; -} - -EXPORT_SYMBOL(register_firewall); -EXPORT_SYMBOL(unregister_firewall); -EXPORT_SYMBOL(call_in_firewall); -EXPORT_SYMBOL(call_out_firewall); -EXPORT_SYMBOL(call_fw_firewall); - -__initfunc(void fwchain_init(void)) -{ - int i; - for(i=0;i<NPROTO;i++) - firewall_policy[i]=FW_ACCEPT; -} diff --git a/net/core/iovec.c b/net/core/iovec.c index c20f85303..07970a18e 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -27,6 +27,7 @@ #include <asm/uaccess.h> #include <asm/byteorder.h> #include <net/checksum.h> +#include <net/sock.h> /* * Verify iovec. The caller must ensure that the iovec is big enough diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 6124fcfc3..0ce941a35 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -49,7 +49,7 @@ static void neigh_timer_handler(unsigned long arg); #ifdef CONFIG_ARPD static void neigh_app_notify(struct neighbour *n); #endif -static int pneigh_ifdown(struct neigh_table *tbl, struct device *dev); +static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev); static int neigh_glbl_allocs; static struct neigh_table *neigh_tables; @@ -89,7 +89,6 @@ static struct neigh_table *neigh_tables; The last lock is neigh_tbl_lock. It is pure SMP lock, protecting list of neighbour tables. This list is used only in process context, - so that this lock is useless with big kernel lock. */ static rwlock_t neigh_tbl_lock = RW_LOCK_UNLOCKED; @@ -134,16 +133,15 @@ static int neigh_forced_gc(struct neigh_table *tbl) or flooding. 
*/ write_lock(&n->lock); - if (atomic_read(&n->refcnt) == 0 && + if (atomic_read(&n->refcnt) == 1 && !(n->nud_state&NUD_PERMANENT) && (n->nud_state != NUD_INCOMPLETE || jiffies - n->used > n->parms->retrans_time)) { *np = n->next; - n->tbl = NULL; - tbl->entries--; + n->dead = 1; shrunk = 1; write_unlock(&n->lock); - neigh_destroy(n); + neigh_release(n); continue; } write_unlock(&n->lock); @@ -156,7 +154,18 @@ static int neigh_forced_gc(struct neigh_table *tbl) return shrunk; } -int neigh_ifdown(struct neigh_table *tbl, struct device *dev) +static int neigh_del_timer(struct neighbour *n) +{ + if (n->nud_state & NUD_IN_TIMER) { + if (del_timer(&n->timer)) { + neigh_release(n); + return 1; + } + } + return 0; +} + +int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) { int i; @@ -173,9 +182,10 @@ int neigh_ifdown(struct neigh_table *tbl, struct device *dev) } *np = n->next; write_lock(&n->lock); - n->tbl = NULL; - tbl->entries--; - if (atomic_read(&n->refcnt)) { + neigh_del_timer(n); + n->dead = 1; + + if (atomic_read(&n->refcnt) != 1) { /* The most unpleasant situation. We must destroy neighbour entry, but someone still uses it. @@ -185,8 +195,6 @@ int neigh_ifdown(struct neigh_table *tbl, struct device *dev) we must kill timers etc. and move it to safe state. */ - if (n->nud_state & NUD_IN_TIMER) - del_timer(&n->timer); n->parms = &tbl->parms; skb_queue_purge(&n->arp_queue); n->output = neigh_blackhole; @@ -195,11 +203,9 @@ int neigh_ifdown(struct neigh_table *tbl, struct device *dev) else n->nud_state = NUD_NONE; NEIGH_PRINTK2("neigh %p is stray.\n", n); - write_unlock(&n->lock); - } else { - write_unlock(&n->lock); - neigh_destroy(n); } + write_unlock(&n->lock); + neigh_release(n); } } @@ -223,7 +229,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl) return NULL; } - n = kmalloc(tbl->entry_size, GFP_ATOMIC); + n = kmem_cache_alloc(tbl->kmem_cachep, SLAB_ATOMIC); if (n == NULL) return NULL; @@ -240,27 +246,27 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl) n->timer.data = (unsigned long)n; tbl->stats.allocs++; neigh_glbl_allocs++; + tbl->entries++; + n->tbl = tbl; + atomic_set(&n->refcnt, 1); + n->dead = 1; return n; } struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, - struct device *dev) + struct net_device *dev) { struct neighbour *n; u32 hash_val; int key_len = tbl->key_len; - hash_val = *(u32*)(pkey + key_len - 4); - hash_val ^= (hash_val>>16); - hash_val ^= hash_val>>8; - hash_val ^= hash_val>>3; - hash_val = (hash_val^dev->ifindex)&NEIGH_HASHMASK; + hash_val = tbl->hash(pkey, dev); read_lock_bh(&tbl->lock); for (n = tbl->hash_buckets[hash_val]; n; n = n->next) { if (dev == n->dev && memcmp(n->primary_key, pkey, key_len) == 0) { - atomic_inc(&n->refcnt); + neigh_hold(n); break; } } @@ -269,7 +275,7 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, } struct neighbour * neigh_create(struct neigh_table *tbl, const void *pkey, - struct device *dev) + struct net_device *dev) { struct neighbour *n, *n1; u32 hash_val; @@ -281,50 +287,46 @@ struct neighbour * neigh_create(struct neigh_table *tbl, const void *pkey, memcpy(n->primary_key, pkey, key_len); n->dev = dev; + dev_hold(dev); /* Protocol specific setup. */ if (tbl->constructor && tbl->constructor(n) < 0) { - neigh_destroy(n); + neigh_release(n); return NULL; } /* Device specific setup. 
*/ if (n->parms && n->parms->neigh_setup && n->parms->neigh_setup(n) < 0) { - neigh_destroy(n); + neigh_release(n); return NULL; } n->confirmed = jiffies - (n->parms->base_reachable_time<<1); - hash_val = *(u32*)(pkey + key_len - 4); - hash_val ^= (hash_val>>16); - hash_val ^= hash_val>>8; - hash_val ^= hash_val>>3; - hash_val = (hash_val^dev->ifindex)&NEIGH_HASHMASK; + hash_val = tbl->hash(pkey, dev); write_lock_bh(&tbl->lock); for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) { if (dev == n1->dev && memcmp(n1->primary_key, pkey, key_len) == 0) { - atomic_inc(&n1->refcnt); + neigh_hold(n1); write_unlock_bh(&tbl->lock); - neigh_destroy(n); + neigh_release(n); return n1; } } - tbl->entries++; - n->tbl = tbl; - atomic_set(&n->refcnt, 1); n->next = tbl->hash_buckets[hash_val]; tbl->hash_buckets[hash_val] = n; + n->dead = 0; + neigh_hold(n); write_unlock_bh(&tbl->lock); NEIGH_PRINTK2("neigh %p is created.\n", n); return n; } struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey, - struct device *dev, int creat) + struct net_device *dev, int creat) { struct pneigh_entry *n; u32 hash_val; @@ -336,11 +338,16 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey, hash_val ^= hash_val>>4; hash_val &= PNEIGH_HASHMASK; + read_lock_bh(&tbl->lock); + for (n = tbl->phash_buckets[hash_val]; n; n = n->next) { if (memcmp(n->key, pkey, key_len) == 0 && - (n->dev == dev || !n->dev)) + (n->dev == dev || !n->dev)) { + read_unlock_bh(&tbl->lock); return n; + } } + read_unlock_bh(&tbl->lock); if (!creat) return NULL; @@ -356,13 +363,15 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey, return NULL; } + write_lock_bh(&tbl->lock); n->next = tbl->phash_buckets[hash_val]; tbl->phash_buckets[hash_val] = n; + write_unlock_bh(&tbl->lock); return n; } -int pneigh_delete(struct neigh_table *tbl, const void *pkey, struct device *dev) +int pneigh_delete(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { struct pneigh_entry *n, **np; u32 hash_val; @@ -376,8 +385,9 @@ int pneigh_delete(struct neigh_table *tbl, const void *pkey, struct device *dev) for (np = &tbl->phash_buckets[hash_val]; (n=*np) != NULL; np = &n->next) { if (memcmp(n->key, pkey, key_len) == 0 && n->dev == dev) { + write_lock_bh(&tbl->lock); *np = n->next; - synchronize_bh(); + write_unlock_bh(&tbl->lock); if (tbl->pdestructor) tbl->pdestructor(n); kfree(n); @@ -387,7 +397,7 @@ int pneigh_delete(struct neigh_table *tbl, const void *pkey, struct device *dev) return -ENOENT; } -static int pneigh_ifdown(struct neigh_table *tbl, struct device *dev) +static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev) { struct pneigh_entry *n, **np; u32 h; @@ -397,7 +407,6 @@ static int pneigh_ifdown(struct neigh_table *tbl, struct device *dev) for (np = &tbl->phash_buckets[h]; (n=*np) != NULL; np = &n->next) { if (n->dev == dev || dev == NULL) { *np = n->next; - synchronize_bh(); if (tbl->pdestructor) tbl->pdestructor(n); kfree(n); @@ -418,14 +427,14 @@ void neigh_destroy(struct neighbour *neigh) { struct hh_cache *hh; - if (neigh->tbl || atomic_read(&neigh->refcnt)) { - NEIGH_PRINTK1("neigh_destroy: neighbour is use tbl=%p, ref=%d: " - "called from %p\n", neigh->tbl, atomic_read(&neigh->refcnt), __builtin_return_address(0)); + if (!neigh->dead) { + printk("Destroying alive neighbour %p from %08lx\n", neigh, + *(((unsigned long*)&neigh)-1)); return; } - if (neigh->nud_state&NUD_IN_TIMER) - del_timer(&neigh->timer); + if (neigh_del_timer(neigh)) + 
printk("Impossible event.\n"); while ((hh = neigh->hh) != NULL) { neigh->hh = hh->hh_next; @@ -442,10 +451,13 @@ void neigh_destroy(struct neighbour *neigh) skb_queue_purge(&neigh->arp_queue); + dev_put(neigh->dev); + NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); neigh_glbl_allocs--; - kfree(neigh); + neigh->tbl->entries--; + kmem_cache_free(neigh->tbl->kmem_cachep, neigh); } /* Neighbour state is suspicious; @@ -514,8 +526,7 @@ static void neigh_sync(struct neighbour *n) } } else if (state&NUD_VALID) { if (now - n->confirmed < n->parms->reachable_time) { - if (state&NUD_IN_TIMER) - del_timer(&n->timer); + neigh_del_timer(n); n->nud_state = NUD_REACHABLE; neigh_connect(n); } @@ -560,14 +571,12 @@ static void neigh_periodic_timer(unsigned long arg) if ((long)(n->used - n->confirmed) < 0) n->used = n->confirmed; - if (atomic_read(&n->refcnt) == 0 && + if (atomic_read(&n->refcnt) == 1 && (state == NUD_FAILED || now - n->used > n->parms->gc_staletime)) { *np = n->next; - n->tbl = NULL; - n->next = NULL; - tbl->entries--; + n->dead = 1; write_unlock(&n->lock); - neigh_destroy(n); + neigh_release(n); continue; } @@ -605,12 +614,13 @@ static void neigh_timer_handler(unsigned long arg) int notify = 0; write_lock(&neigh->lock); - atomic_inc(&neigh->refcnt); state = neigh->nud_state; if (!(state&NUD_IN_TIMER)) { - NEIGH_PRINTK1("neigh: timer & !nud_in_timer\n"); +#ifndef __SMP__ + printk("neigh: timer & !nud_in_timer\n"); +#endif goto out; } @@ -655,7 +665,6 @@ static void neigh_timer_handler(unsigned long arg) neigh->ops->solicit(neigh, skb_peek(&neigh->arp_queue)); atomic_inc(&neigh->probes); - neigh_release(neigh); return; out: @@ -672,16 +681,10 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) write_lock_bh(&neigh->lock); if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE))) { if (!(neigh->nud_state&(NUD_STALE|NUD_INCOMPLETE))) { - if (neigh->tbl == NULL) { - NEIGH_PRINTK2("neigh %p used after death.\n", neigh); - if (skb) - kfree_skb(skb); - write_unlock_bh(&neigh->lock); - return 1; - } if (neigh->parms->mcast_probes + neigh->parms->app_probes) { atomic_set(&neigh->probes, neigh->parms->ucast_probes); neigh->nud_state = NUD_INCOMPLETE; + neigh_hold(neigh); neigh->timer.expires = jiffies + neigh->parms->retrans_time; add_timer(&neigh->timer); write_unlock_bh(&neigh->lock); @@ -712,6 +715,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) } if (neigh->nud_state == NUD_STALE) { NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); + neigh_hold(neigh); neigh->nud_state = NUD_DELAY; neigh->timer.expires = jiffies + neigh->parms->delay_probe_time; add_timer(&neigh->timer); @@ -724,7 +728,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) static __inline__ void neigh_update_hhs(struct neighbour *neigh) { struct hh_cache *hh; - void (*update)(struct hh_cache*, struct device*, unsigned char*) = + void (*update)(struct hh_cache*, struct net_device*, unsigned char*) = neigh->dev->header_cache_update; if (update) { @@ -747,12 +751,12 @@ static __inline__ void neigh_update_hhs(struct neighbour *neigh) Caller MUST hold reference count on the entry. 
*/ -int neigh_update(struct neighbour *neigh, u8 *lladdr, u8 new, int override, int arp) +int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, int override, int arp) { u8 old; int err; int notify = 0; - struct device *dev = neigh->dev; + struct net_device *dev = neigh->dev; write_lock_bh(&neigh->lock); old = neigh->nud_state; @@ -762,8 +766,7 @@ int neigh_update(struct neighbour *neigh, u8 *lladdr, u8 new, int override, int goto out; if (!(new&NUD_VALID)) { - if (old&NUD_IN_TIMER) - del_timer(&neigh->timer); + neigh_del_timer(neigh); if (old&NUD_CONNECTED) neigh_suspect(neigh); neigh->nud_state = new; @@ -813,8 +816,7 @@ int neigh_update(struct neighbour *neigh, u8 *lladdr, u8 new, int override, int if (new == old || (new == NUD_STALE && (old&NUD_CONNECTED))) goto out; } - if (old&NUD_IN_TIMER) - del_timer(&neigh->timer); + neigh_del_timer(neigh); neigh->nud_state = new; if (lladdr != neigh->ha) { memcpy(&neigh->ha, lladdr, dev->addr_len); @@ -858,7 +860,7 @@ out: struct neighbour * neigh_event_ns(struct neigh_table *tbl, u8 *lladdr, void *saddr, - struct device *dev) + struct net_device *dev) { struct neighbour *neigh; @@ -871,7 +873,7 @@ struct neighbour * neigh_event_ns(struct neigh_table *tbl, static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, u16 protocol) { struct hh_cache *hh = NULL; - struct device *dev = dst->dev; + struct net_device *dev = dst->dev; for (hh=n->hh; hh; hh = hh->hh_next) if (hh->hh_type == protocol) @@ -908,7 +910,7 @@ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, u16 protoc int neigh_compat_output(struct sk_buff *skb) { - struct device *dev = skb->dev; + struct net_device *dev = skb->dev; __skb_pull(skb, skb->nh.raw - skb->data); @@ -934,7 +936,7 @@ int neigh_resolve_output(struct sk_buff *skb) if (neigh_event_send(neigh, skb) == 0) { int err; - struct device *dev = neigh->dev; + struct net_device *dev = neigh->dev; if (dev->hard_header_cache && dst->hh == NULL) { write_lock_bh(&neigh->lock); if (dst->hh == NULL) @@ -966,7 +968,7 @@ int neigh_connected_output(struct sk_buff *skb) int err; struct dst_entry *dst = skb->dst; struct neighbour *neigh = dst->neighbour; - struct device *dev = neigh->dev; + struct net_device *dev = neigh->dev; __skb_pull(skb, skb->nh.raw - skb->data); @@ -1032,7 +1034,7 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, } -struct neigh_parms *neigh_parms_alloc(struct device *dev, struct neigh_table *tbl) +struct neigh_parms *neigh_parms_alloc(struct net_device *dev, struct neigh_table *tbl) { struct neigh_parms *p; p = kmalloc(sizeof(*p), GFP_KERNEL); @@ -1073,7 +1075,7 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) } } write_unlock_bh(&tbl->lock); - NEIGH_PRINTK1("neigh_release_parms: not found\n"); + NEIGH_PRINTK1("neigh_parms_release: not found\n"); } @@ -1083,6 +1085,12 @@ void neigh_table_init(struct neigh_table *tbl) tbl->parms.reachable_time = neigh_rand_reach_time(tbl->parms.base_reachable_time); + if (tbl->kmem_cachep == NULL) + tbl->kmem_cachep = kmem_cache_create(tbl->id, + (tbl->entry_size+15)&~15, + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + init_timer(&tbl->gc_timer); tbl->lock = RW_LOCK_UNLOCKED; tbl->gc_timer.data = (unsigned long)tbl; @@ -1135,7 +1143,8 @@ int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) struct ndmsg *ndm = NLMSG_DATA(nlh); struct rtattr **nda = arg; struct neigh_table *tbl; - struct device *dev = NULL; + struct net_device *dev = NULL; + int err = 0; if (ndm->ndm_ifindex) { if 
((dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL) @@ -1144,19 +1153,21 @@ int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) read_lock(&neigh_tbl_lock); for (tbl=neigh_tables; tbl; tbl = tbl->next) { - int err = 0; struct neighbour *n; if (tbl->family != ndm->ndm_family) continue; read_unlock(&neigh_tbl_lock); + err = -EINVAL; if (nda[NDA_DST-1] == NULL || nda[NDA_DST-1]->rta_len != RTA_LENGTH(tbl->key_len)) - return -EINVAL; + goto out; - if (ndm->ndm_flags&NTF_PROXY) - return pneigh_delete(tbl, RTA_DATA(nda[NDA_DST-1]), dev); + if (ndm->ndm_flags&NTF_PROXY) { + err = pneigh_delete(tbl, RTA_DATA(nda[NDA_DST-1]), dev); + goto out; + } if (dev == NULL) return -EINVAL; @@ -1166,10 +1177,16 @@ int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) err = neigh_update(n, NULL, NUD_FAILED, 1, 0); neigh_release(n); } +out: + if (dev) + dev_put(dev); return err; } read_unlock(&neigh_tbl_lock); + if (dev) + dev_put(dev); + return -EADDRNOTAVAIL; } @@ -1178,7 +1195,7 @@ int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) struct ndmsg *ndm = NLMSG_DATA(nlh); struct rtattr **nda = arg; struct neigh_table *tbl; - struct device *dev = NULL; + struct net_device *dev = NULL; if (ndm->ndm_ifindex) { if ((dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL) @@ -1194,19 +1211,22 @@ int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) continue; read_unlock(&neigh_tbl_lock); + err = -EINVAL; if (nda[NDA_DST-1] == NULL || nda[NDA_DST-1]->rta_len != RTA_LENGTH(tbl->key_len)) - return -EINVAL; + goto out; if (ndm->ndm_flags&NTF_PROXY) { + err = -ENOBUFS; if (pneigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 1)) - return 0; - return -ENOBUFS; + err = 0; + goto out; } if (dev == NULL) return -EINVAL; + err = -EINVAL; if (nda[NDA_LLADDR-1] != NULL && nda[NDA_LLADDR-1]->rta_len != RTA_LENGTH(dev->addr_len)) - return -EINVAL; + goto out; n = neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev); if (n) { if (nlh->nlmsg_flags&NLM_F_EXCL) @@ -1225,10 +1245,15 @@ int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) } if (n) neigh_release(n); +out: + if (dev) + dev_put(dev); return err; } read_unlock(&neigh_tbl_lock); + if (dev) + dev_put(dev); return -EADDRNOTAVAIL; } @@ -1241,6 +1266,7 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n, struct nlmsghdr *nlh; unsigned char *b = skb->tail; struct nda_cacheinfo ci; + int locked = 0; nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ndm)); ndm = NLMSG_DATA(nlh); @@ -1250,20 +1276,24 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n, ndm->ndm_ifindex = n->dev->ifindex; RTA_PUT(skb, NDA_DST, n->tbl->key_len, n->primary_key); read_lock_bh(&n->lock); + locked=1; ndm->ndm_state = n->nud_state; if (n->nud_state&NUD_VALID) RTA_PUT(skb, NDA_LLADDR, n->dev->addr_len, n->ha); ci.ndm_used = now - n->used; ci.ndm_confirmed = now - n->confirmed; ci.ndm_updated = now - n->updated; - ci.ndm_refcnt = atomic_read(&n->refcnt); + ci.ndm_refcnt = atomic_read(&n->refcnt) - 1; read_unlock_bh(&n->lock); + locked=0; RTA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci); nlh->nlmsg_len = skb->tail - b; return skb->len; nlmsg_failure: rtattr_failure: + if (locked) + read_unlock_bh(&n->lock); skb_trim(skb, b - skb->data); return -1; } @@ -1443,7 +1473,7 @@ struct neigh_sysctl_table {{CTL_NET, "net", NULL, 0, 0555, NULL},{0}} }; -int neigh_sysctl_register(struct device *dev, struct neigh_parms *p, +int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, int p_id, int pdev_id, char 
*p_name) { struct neigh_sysctl_table *t; diff --git a/net/core/netfilter.c b/net/core/netfilter.c new file mode 100644 index 000000000..a6472a7de --- /dev/null +++ b/net/core/netfilter.c @@ -0,0 +1,630 @@ +/* netfilter.c: look after the filters for various protocols. + * Heavily influenced by the old firewall.c by David Bonn and Alan Cox. + * + * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any + * way. + * + * Rusty Russell (C)1998 -- This code is GPL. + */ +#include <linux/config.h> +#include <linux/netfilter.h> +#include <net/protocol.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/wait.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/if.h> +#include <linux/netdevice.h> +#include <linux/spinlock.h> + +#define __KERNEL_SYSCALLS__ +#include <linux/unistd.h> + +/* In this code, we can be waiting indefinitely for userspace to + * service a packet if a hook returns NF_QUEUE. We could keep a count + * of skbuffs queued for userspace, and not deregister a hook unless + * this is zero, but that sucks. Now, we simply check when the + * packets come back: if the hook is gone, the packet is discarded. */ +#ifdef CONFIG_NETFILTER_DEBUG +#define NFDEBUG(format, args...) printk(format , ## args) +#else +#define NFDEBUG(format, args...) +#endif + +/* Each queued (to userspace) skbuff has one of these. */ +struct nf_info +{ + /* The ops struct which sent us to userspace. */ + struct nf_hook_ops *elem; + + /* If we're sent to userspace, this keeps housekeeping info */ + int pf; + unsigned long mark; + unsigned int hook; + struct net_device *indev, *outdev; + int (*okfn)(struct sk_buff *); +}; + +static rwlock_t nf_lock = RW_LOCK_UNLOCKED; +static DECLARE_MUTEX(nf_sockopt_mutex); + +struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; +static LIST_HEAD(nf_sockopts); +static LIST_HEAD(nf_interested); + +int nf_register_hook(struct nf_hook_ops *reg) +{ + struct list_head *i; + +#ifdef CONFIG_NETFILTER_DEBUG + if (reg->pf<0 || reg->pf>=NPROTO || reg->hooknum >= NF_MAX_HOOKS) { + NFDEBUG("nf_register_hook: bad vals: pf=%i, hooknum=%u.\n", + reg->pf, reg->hooknum); + return -EINVAL; + } +#endif + NFDEBUG("nf_register_hook: pf=%i hook=%u.\n", reg->pf, reg->hooknum); + + write_lock_bh(&nf_lock); + for (i = nf_hooks[reg->pf][reg->hooknum].next; + i != &nf_hooks[reg->pf][reg->hooknum]; + i = i->next) { + if (reg->priority < ((struct nf_hook_ops *)i)->priority) + break; + } + list_add(®->list, i->prev); + write_unlock_bh(&nf_lock); + return 0; +} + +void nf_unregister_hook(struct nf_hook_ops *reg) +{ +#ifdef CONFIG_NETFILTER_DEBUG + if (reg->pf<0 || reg->pf>=NPROTO || reg->hooknum >= NF_MAX_HOOKS) { + NFDEBUG("nf_unregister_hook: bad vals: pf=%i, hooknum=%u.\n", + reg->pf, reg->hooknum); + return; + } +#endif + write_lock_bh(&nf_lock); + list_del(®->list); + write_unlock_bh(&nf_lock); +} + +/* Do exclusive ranges overlap? */ +static inline int overlap(int min1, int max1, int min2, int max2) +{ + return (min1 >= min2 && min1 < max2) + || (max1 > min2 && max1 <= max2); +} + +/* Functions to register sockopt ranges (exclusive). 
*/ +int nf_register_sockopt(struct nf_sockopt_ops *reg) +{ + struct list_head *i; + int ret = 0; + +#ifdef CONFIG_NETFILTER_DEBUG + if (reg->pf<0 || reg->pf>=NPROTO) { + NFDEBUG("nf_register_sockopt: bad val: pf=%i.\n", reg->pf); + return -EINVAL; + } + if (reg->set_optmin > reg->set_optmax) { + NFDEBUG("nf_register_sockopt: bad set val: min=%i max=%i.\n", + reg->set_optmin, reg->set_optmax); + return -EINVAL; + } + if (reg->get_optmin > reg->get_optmax) { + NFDEBUG("nf_register_sockopt: bad get val: min=%i max=%i.\n", + reg->get_optmin, reg->get_optmax); + return -EINVAL; + } +#endif + if (down_interruptible(&nf_sockopt_mutex) != 0) + return -EINTR; + + for (i = nf_sockopts.next; i != &nf_sockopts; i = i->next) { + struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i; + if (ops->pf == reg->pf + && (overlap(ops->set_optmin, ops->set_optmax, + reg->set_optmin, reg->set_optmax) + || overlap(ops->get_optmin, ops->get_optmax, + reg->get_optmin, reg->get_optmax))) { + NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n", + ops->set_optmin, ops->set_optmax, + ops->get_optmin, ops->get_optmax, + reg->set_optmin, reg->set_optmax, + reg->get_optmin, reg->get_optmax); + ret = -EBUSY; + goto out; + } + } + + list_add(®->list, &nf_sockopts); +out: + up(&nf_sockopt_mutex); + return ret; +} + +void nf_unregister_sockopt(struct nf_sockopt_ops *reg) +{ +#ifdef CONFIG_NETFILTER_DEBUG + if (reg->pf<0 || reg->pf>=NPROTO) { + NFDEBUG("nf_register_sockopt: bad val: pf=%i.\n", reg->pf); + return; + } +#endif + /* No point being interruptible: we're probably in cleanup_module() */ + down(&nf_sockopt_mutex); + list_del(®->list); + up(&nf_sockopt_mutex); +} + +#ifdef CONFIG_NETFILTER_DEBUG +#include <net/ip.h> +#include <net/route.h> +#include <net/tcp.h> +#include <linux/netfilter_ipv4.h> + +void nf_dump_skb(int pf, struct sk_buff *skb) +{ + printk("skb: pf=%i %s dev=%s len=%u\n", + pf, + skb->sk ? "(owned)" : "(unowned)", + skb->dev ? skb->dev->name : "(no dev)", + skb->len); + switch (pf) { + case PF_INET: { + const struct iphdr *ip = skb->nh.iph; + __u32 *opt = (__u32 *) (ip + 1); + int opti; + __u16 src_port = 0, dst_port = 0; + + if (ip->protocol == IPPROTO_TCP + || ip->protocol == IPPROTO_UDP) { + struct tcphdr *tcp=(struct tcphdr *)((__u32 *)ip+ip->ihl); + src_port = ntohs(tcp->source); + dst_port = ntohs(tcp->dest); + } + + printk("PROTO=%d %ld.%ld.%ld.%ld:%hu %ld.%ld.%ld.%ld:%hu" + " L=%hu S=0x%2.2hX I=%hu F=0x%4.4hX T=%hu", + ip->protocol, + (ntohl(ip->saddr)>>24)&0xFF, + (ntohl(ip->saddr)>>16)&0xFF, + (ntohl(ip->saddr)>>8)&0xFF, + (ntohl(ip->saddr))&0xFF, + src_port, + (ntohl(ip->daddr)>>24)&0xFF, + (ntohl(ip->daddr)>>16)&0xFF, + (ntohl(ip->daddr)>>8)&0xFF, + (ntohl(ip->daddr))&0xFF, + dst_port, + ntohs(ip->tot_len), ip->tos, ntohs(ip->id), + ntohs(ip->frag_off), ip->ttl); + + for (opti = 0; opti < (ip->ihl - sizeof(struct iphdr) / 4); opti++) + printk(" O=0x%8.8X", *opt++); + printk("\n"); + } + } +} + +void nf_debug_ip_local_deliver(struct sk_buff *skb) +{ + /* If it's a loopback packet, it must have come through + * NF_IP_LOCAL_OUT, NF_IP_RAW_INPUT, NF_IP_PRE_ROUTING and + * NF_IP_LOCAL_IN. Otherwise, must have gone through + * NF_IP_RAW_INPUT and NF_IP_PRE_ROUTING. 
*/ + if (!skb->dev) { + printk("ip_local_deliver: skb->dev is NULL.\n"); + } + else if (strcmp(skb->dev->name, "lo") == 0) { + if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT) + | (1 << NF_IP_POST_ROUTING) + | (1 << NF_IP_PRE_ROUTING) + | (1 << NF_IP_LOCAL_IN))) { + printk("ip_local_deliver: bad loopback skb: "); + debug_print_hooks_ip(skb->nf_debug); + nf_dump_skb(PF_INET, skb); + } + } + else { + if (skb->nf_debug != ((1<<NF_IP_PRE_ROUTING) + | (1<<NF_IP_LOCAL_IN))) { + printk("ip_local_deliver: bad non-lo skb: "); + debug_print_hooks_ip(skb->nf_debug); + nf_dump_skb(PF_INET, skb); + } + } +} + +void nf_debug_ip_loopback_xmit(struct sk_buff *newskb) +{ + if (newskb->nf_debug != ((1 << NF_IP_LOCAL_OUT) + | (1 << NF_IP_POST_ROUTING))) { + printk("ip_dev_loopback_xmit: bad owned skb = %p: ", + newskb); + debug_print_hooks_ip(newskb->nf_debug); + nf_dump_skb(PF_INET, newskb); + } + /* Clear to avoid confusing input check */ + newskb->nf_debug = 0; +} + +void nf_debug_ip_finish_output2(struct sk_buff *skb) +{ + /* If it's owned, it must have gone through the + * NF_IP_LOCAL_OUT and NF_IP_POST_ROUTING. + * Otherwise, must have gone through NF_IP_RAW_INPUT, + * NF_IP_PRE_ROUTING, NF_IP_FORWARD and NF_IP_POST_ROUTING. + */ + if (skb->sk) { + if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT) + | (1 << NF_IP_POST_ROUTING))) { + printk("ip_finish_output: bad owned skb = %p: ", skb); + debug_print_hooks_ip(skb->nf_debug); + nf_dump_skb(PF_INET, skb); + } + } else { + if (skb->nf_debug != ((1 << NF_IP_PRE_ROUTING) +#ifdef CONFIG_IP_NETFILTER_RAW_INPUT + | (1 << NF_IP_RAW_INPUT) +#endif + | (1 << NF_IP_FORWARD) + | (1 << NF_IP_POST_ROUTING))) { + printk("ip_finish_output: bad unowned skb = %p: ",skb); + debug_print_hooks_ip(skb->nf_debug); + nf_dump_skb(PF_INET, skb); + } + } +} + + +#endif /*CONFIG_NETFILTER_DEBUG*/ + +void nf_cacheflush(int pf, unsigned int hook, const void *packet, + const struct net_device *indev, const struct net_device *outdev, + __u32 packetcount, __u32 bytecount) +{ + struct list_head *i; + + read_lock_bh(&nf_lock); + for (i = nf_hooks[pf][hook].next; + i != &nf_hooks[pf][hook]; + i = i->next) { + if (((struct nf_hook_ops *)i)->flush) + ((struct nf_hook_ops *)i)->flush(packet, indev, + outdev, + packetcount, + bytecount); + } + read_unlock_bh(&nf_lock); +} + +/* Call get/setsockopt() */ +static int nf_sockopt(struct sock *sk, int pf, int val, + char *opt, int *len, int get) +{ + struct list_head *i; + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (down_interruptible(&nf_sockopt_mutex) != 0) + return -EINTR; + + for (i = nf_sockopts.next; i != &nf_sockopts; i = i->next) { + struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i; + if (ops->pf == pf) { + if (get) { + if (val >= ops->get_optmin + && val < ops->get_optmax) { + ret = ops->get(sk, val, opt, len); + goto out; + } + } else { + if (val >= ops->set_optmin + && val < ops->set_optmax) { + ret = ops->set(sk, val, opt, *len); + goto out; + } + } + } + } + ret = -ENOPROTOOPT; + out: + up(&nf_sockopt_mutex); + return ret; +} + +int nf_setsockopt(struct sock *sk, int pf, int val, char *opt, + int len) +{ + return nf_sockopt(sk, pf, val, opt, &len, 0); +} + +int nf_getsockopt(struct sock *sk, int pf, int val, char *opt, int *len) +{ + return nf_sockopt(sk, pf, val, opt, len, 1); +} + +static unsigned int nf_iterate(struct list_head *head, + struct sk_buff **skb, + int hook, + const struct net_device *indev, + const struct net_device *outdev, + struct list_head **i) +{ + for (*i = (*i)->next; *i != head; *i = 
(*i)->next) { + struct nf_hook_ops *elem = (struct nf_hook_ops *)*i; + switch (elem->hook(hook, skb, indev, outdev)) { + case NF_QUEUE: + NFDEBUG("nf_iterate: NF_QUEUE for %p.\n", *skb); + return NF_QUEUE; + + case NF_STOLEN: + NFDEBUG("nf_iterate: NF_STOLEN for %p.\n", *skb); + return NF_STOLEN; + + case NF_DROP: + NFDEBUG("nf_iterate: NF_DROP for %p.\n", *skb); + return NF_DROP; + +#ifdef CONFIG_NETFILTER_DEBUG + case NF_ACCEPT: + break; + + default: + NFDEBUG("Evil return from %p(%u).\n", + elem->hook, hook); +#endif + } + } + return NF_ACCEPT; +} + +static void nf_queue(struct sk_buff *skb, + struct list_head *elem, + int pf, unsigned int hook, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *)) +{ + struct list_head *i; + + struct nf_info *info = kmalloc(sizeof(*info), GFP_ATOMIC); + if (!info) { + NFDEBUG("nf_hook: OOM.\n"); + kfree_skb(skb); + return; + } + + /* Can't do struct assignments with arrays in them. Damn. */ + info->elem = (struct nf_hook_ops *)elem; + info->mark = skb->nfmark; + info->pf = pf; + info->hook = hook; + info->okfn = okfn; + info->indev = indev; + info->outdev = outdev; + skb->nfmark = (unsigned long)info; + + /* Bump dev refs so they don't vanish while packet is out */ + if (indev) dev_hold(indev); + if (outdev) dev_hold(outdev); + + for (i = nf_interested.next; i != &nf_interested; i = i->next) { + struct nf_interest *recip = (struct nf_interest *)i; + + if ((recip->hookmask & (1 << info->hook)) + && info->pf == recip->pf + && (!recip->mark || info->mark == recip->mark) + && (!recip->reason || skb->nfreason == recip->reason)) { + /* FIXME: Andi says: use netlink. Hmmm... --RR */ + if (skb_queue_len(&recip->wake->skbq) >= 100) { + NFDEBUG("nf_hook: queue to long.\n"); + goto free_discard; + } + /* Hand it to userspace for collection */ + skb_queue_tail(&recip->wake->skbq, skb); + NFDEBUG("Waking up pf=%i hook=%u mark=%lu reason=%u\n", + pf, hook, skb->nfmark, skb->nfreason); + wake_up_interruptible(&recip->wake->sleep); + + return; + } + } + NFDEBUG("nf_hook: noone wants the packet.\n"); + + free_discard: + if (indev) dev_put(indev); + if (outdev) dev_put(outdev); + + kfree_s(info, sizeof(*info)); + kfree_skb(skb); +} + +/* nf_hook() doesn't have lock, so may give false positive. */ +int nf_hook_slow(int pf, unsigned int hook, struct sk_buff *skb, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *)) +{ + struct list_head *elem; + unsigned int verdict; + int ret = 0; + +#ifdef CONFIG_NETFILTER_DEBUG + if (pf < 0 || pf >= NPROTO || hook >= NF_MAX_HOOKS) { + NFDEBUG("nf_hook: bad vals: pf=%i, hook=%u.\n", + pf, hook); + kfree_skb(skb); + return -EINVAL; /* -ECODERFUCKEDUP ?*/ + } + + if (skb->nf_debug & (1 << hook)) { + NFDEBUG("nf_hook: hook %i already set.\n", hook); + nf_dump_skb(pf, skb); + } + skb->nf_debug |= (1 << hook); +#endif + read_lock_bh(&nf_lock); + elem = &nf_hooks[pf][hook]; + verdict = nf_iterate(&nf_hooks[pf][hook], &skb, hook, indev, + outdev, &elem); + if (verdict == NF_QUEUE) { + NFDEBUG("nf_hook: Verdict = QUEUE.\n"); + nf_queue(skb, elem, pf, hook, indev, outdev, okfn); + } + read_unlock_bh(&nf_lock); + + switch (verdict) { + case NF_ACCEPT: + ret = okfn(skb); + break; + + case NF_DROP: + kfree_skb(skb); + ret = -EPERM; + break; + } + + return ret; +} + +struct nf_waitinfo { + unsigned int verdict; + struct task_struct *owner; +}; + +/* For netfilter device. */ +void nf_register_interest(struct nf_interest *interest) +{ + /* First in, best dressed. 
*/ + write_lock_bh(&nf_lock); + list_add(&interest->list, &nf_interested); + write_unlock_bh(&nf_lock); +} + +void nf_unregister_interest(struct nf_interest *interest) +{ + struct sk_buff *skb; + + write_lock_bh(&nf_lock); + list_del(&interest->list); + write_unlock_bh(&nf_lock); + + /* Blow away any queued skbs; this is overzealous. */ + while ((skb = skb_dequeue(&interest->wake->skbq)) != NULL) + nf_reinject(skb, 0, NF_DROP); +} + +void nf_getinfo(const struct sk_buff *skb, + struct net_device **indev, + struct net_device **outdev, + unsigned long *mark) +{ + const struct nf_info *info = (const struct nf_info *)skb->nfmark; + + *indev = info->indev; + *outdev = info->outdev; + *mark = info->mark; +} + +void nf_reinject(struct sk_buff *skb, unsigned long mark, unsigned int verdict) +{ + struct nf_info *info = (struct nf_info *)skb->nfmark; + struct list_head *elem = &info->elem->list; + struct list_head *i; + + read_lock_bh(&nf_lock); + + for (i = nf_hooks[info->pf][info->hook].next; i != elem; i = i->next) { + if (i == &nf_hooks[info->pf][info->hook]) { + /* The module which sent it to userspace is gone. */ + verdict = NF_DROP; + break; + } + } + + /* Continue traversal iff userspace said ok, and devices still + exist... */ + if (verdict == NF_ACCEPT) { + skb->nfmark = mark; + verdict = nf_iterate(&nf_hooks[info->pf][info->hook], + &skb, info->hook, + info->indev, info->outdev, &elem); + } + + if (verdict == NF_QUEUE) { + nf_queue(skb, elem, info->pf, info->hook, + info->indev, info->outdev, info->okfn); + } + read_unlock_bh(&nf_lock); + + switch (verdict) { + case NF_ACCEPT: + local_bh_disable(); + info->okfn(skb); + local_bh_enable(); + break; + + case NF_DROP: + kfree_skb(skb); + break; + } + + /* Release those devices we held, or Alexey will kill me. */ + if (info->indev) dev_put(info->indev); + if (info->outdev) dev_put(info->outdev); + + kfree_s(info, sizeof(*info)); + return; +} + +/* FIXME: Before cache is ever used, this must be implemented for real. */ +void nf_invalidate_cache(int pf) +{ +} + +#ifdef CONFIG_NETFILTER_DEBUG + +void debug_print_hooks_ip(unsigned int nf_debug) +{ + if (nf_debug & (1 << NF_IP_PRE_ROUTING)) { + printk("PRE_ROUTING "); + nf_debug ^= (1 << NF_IP_PRE_ROUTING); + } + if (nf_debug & (1 << NF_IP_LOCAL_IN)) { + printk("LOCAL_IN "); + nf_debug ^= (1 << NF_IP_LOCAL_IN); + } + if (nf_debug & (1 << NF_IP_FORWARD)) { + printk("FORWARD "); + nf_debug ^= (1 << NF_IP_FORWARD); + } + if (nf_debug & (1 << NF_IP_LOCAL_OUT)) { + printk("LOCAL_OUT "); + nf_debug ^= (1 << NF_IP_LOCAL_OUT); + } + if (nf_debug & (1 << NF_IP_POST_ROUTING)) { + printk("POST_ROUTING "); + nf_debug ^= (1 << NF_IP_POST_ROUTING); + } + if (nf_debug) + printk("Crap bits: 0x%04X", nf_debug); + printk("\n"); +} +#endif /* CONFIG_NETFILTER_DEBUG */ + +void __init netfilter_init(void) +{ + int i, h; + + for (i = 0; i < NPROTO; i++) + for (h = 0; h < NF_MAX_HOOKS; h++) + INIT_LIST_HEAD(&nf_hooks[i][h]); +} diff --git a/net/core/profile.c b/net/core/profile.c index fc7464b7a..e43a3d6e1 100644 --- a/net/core/profile.c +++ b/net/core/profile.c @@ -126,10 +126,8 @@ done: len-=(offset-begin); if(len>length) len=length; - if (len < 0) { + if (len < 0) len = 0; - printk(KERN_CRIT "Yep, guys... 
our template for proc_*_read is crappy :-)\n"); - } if (offset == 0) { cli(); net_prof_total.active = 0; @@ -144,7 +142,7 @@ done: struct iphdr whitehole_iph; int whitehole_count; -static int whitehole_xmit(struct sk_buff *skb, struct device *dev) +static int whitehole_xmit(struct sk_buff *skb, struct net_device *dev) { struct net_device_stats *stats; dev_kfree_skb(skb); @@ -156,15 +154,15 @@ static int whitehole_xmit(struct sk_buff *skb, struct device *dev) } static void whitehole_inject(unsigned long); -int whitehole_init(struct device *dev); +int whitehole_init(struct net_device *dev); static struct timer_list whitehole_timer = { NULL, NULL, 0, 0L, whitehole_inject }; -static struct device whitehole_dev = { +static struct net_device whitehole_dev = { "whitehole", 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, whitehole_init, }; -static int whitehole_open(struct device *dev) +static int whitehole_open(struct net_device *dev) { whitehole_count = 100000; whitehole_timer.expires = jiffies + 5*HZ; @@ -172,7 +170,7 @@ static int whitehole_open(struct device *dev) return 0; } -static int whitehole_close(struct device *dev) +static int whitehole_close(struct net_device *dev) { del_timer(&whitehole_timer); return 0; @@ -206,13 +204,13 @@ static void whitehole_inject(unsigned long dummy) } } -static struct net_device_stats *whitehole_get_stats(struct device *dev) +static struct net_device_stats *whitehole_get_stats(struct net_device *dev) { struct net_device_stats *stats = (struct net_device_stats *) dev->priv; return stats; } -__initfunc(int whitehole_init(struct device *dev)) +int __init whitehole_init(struct net_device *dev) { dev->priv = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL); if (dev->priv == NULL) @@ -262,7 +260,7 @@ int net_profile_unregister(struct net_profile_slot *slot) } -__initfunc(int net_profile_init(void)) +int __init net_profile_init(void) { int i; @@ -282,7 +280,6 @@ __initfunc(int net_profile_init(void)) return -1; } #endif - start_bh_atomic(); #ifdef __alpha__ alpha_tick(0); #endif @@ -298,7 +295,6 @@ __initfunc(int net_profile_init(void)) } net_prof_total.hits = 0; net_profile_stamp(&net_prof_total.entered); - end_bh_atomic(); return 0; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index dad9ee252..b4d858210 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -64,8 +64,6 @@ void rtnl_unlock(void) rtnl_shunlock(); } - - int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len) { memset(tb, 0, sizeof(struct rtattr*)*maxattr); @@ -136,8 +134,29 @@ int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo) return err; } -static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct device *dev, - int type, u32 pid, u32 seq) +int rtnetlink_put_metrics(struct sk_buff *skb, unsigned *metrics) +{ + struct rtattr *mx = (struct rtattr*)skb->tail; + int i; + + RTA_PUT(skb, RTA_METRICS, 0, NULL); + for (i=0; i<RTAX_MAX; i++) { + if (metrics[i]) + RTA_PUT(skb, i+1, sizeof(unsigned), metrics+i); + } + mx->rta_len = skb->tail - (u8*)mx; + if (mx->rta_len == RTA_LENGTH(0)) + skb_trim(skb, (u8*)mx - skb->data); + return 0; + +rtattr_failure: + skb_trim(skb, (u8*)mx - skb->data); + return -1; +} + + +static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, + int type, u32 pid, u32 seq, u32 change) { struct ifinfomsg *r; struct nlmsghdr *nlh; @@ -150,7 +169,7 @@ static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct device *dev, r->ifi_type = dev->type; r->ifi_index = dev->ifindex; r->ifi_flags = 
dev->flags; - r->ifi_change = ~0U; + r->ifi_change = change; RTA_PUT(skb, IFLA_IFNAME, strlen(dev->name)+1, dev->name); if (dev->addr_len) { @@ -185,13 +204,13 @@ int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { int idx; int s_idx = cb->args[0]; - struct device *dev; + struct net_device *dev; read_lock(&dev_base_lock); for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { if (idx < s_idx) continue; - if (rtnetlink_fill_ifinfo(skb, dev, RTM_NEWLINK, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq) <= 0) + if (rtnetlink_fill_ifinfo(skb, dev, RTM_NEWLINK, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, 0) <= 0) break; } read_unlock(&dev_base_lock); @@ -224,7 +243,7 @@ int rtnetlink_dump_all(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -void rtmsg_ifinfo(int type, struct device *dev) +void rtmsg_ifinfo(int type, struct net_device *dev) { struct sk_buff *skb; int size = NLMSG_GOODSIZE; @@ -233,7 +252,7 @@ void rtmsg_ifinfo(int type, struct device *dev) if (!skb) return; - if (rtnetlink_fill_ifinfo(skb, dev, type, 0, 0) < 0) { + if (rtnetlink_fill_ifinfo(skb, dev, type, 0, 0, ~0U) < 0) { kfree_skb(skb); return; } @@ -414,23 +433,25 @@ extern __inline__ int rtnetlink_rcv_skb(struct sk_buff *skb) static void rtnetlink_rcv(struct sock *sk, int len) { - struct sk_buff *skb; - - if (rtnl_shlock_nowait()) - return; - - while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) { - if (rtnetlink_rcv_skb(skb)) { - if (skb->len) - skb_queue_head(&sk->receive_queue, skb); - else - kfree_skb(skb); - break; + do { + struct sk_buff *skb; + + if (rtnl_shlock_nowait()) + return; + + while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) { + if (rtnetlink_rcv_skb(skb)) { + if (skb->len) + skb_queue_head(&sk->receive_queue, skb); + else + kfree_skb(skb); + break; + } + kfree_skb(skb); } - kfree_skb(skb); - } - rtnl_shunlock(); + up(&rtnl_sem); + } while (rtnl && rtnl->receive_queue.qlen); } static struct rtnetlink_link link_rtnetlink_table[RTM_MAX-RTM_BASE+1] = @@ -464,7 +485,7 @@ static struct rtnetlink_link link_rtnetlink_table[RTM_MAX-RTM_BASE+1] = static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct device *dev = ptr; + struct net_device *dev = ptr; switch (event) { case NETDEV_UNREGISTER: rtmsg_ifinfo(RTM_DELLINK, dev); @@ -483,7 +504,7 @@ struct notifier_block rtnetlink_dev_notifier = { }; -__initfunc(void rtnetlink_init(void)) +void __init rtnetlink_init(void) { #ifdef RTNL_DEBUG printk("Initializing RT netlink socket\n"); diff --git a/net/core/scm.c b/net/core/scm.c index e2073166f..a29c21a8a 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -29,7 +29,6 @@ #include <linux/inet.h> #include <net/ip.h> #include <net/protocol.h> -#include <net/rarp.h> #include <net/tcp.h> #include <net/udp.h> #include <linux/skbuff.h> @@ -162,11 +161,6 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) kfree(p->fp); p->fp = NULL; } - - err = -EINVAL; - if (msg->msg_flags & MSG_CTLFLAGS) - goto error; - return 0; error: diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5ea21d7b4..58aeb6cc9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4,7 +4,7 @@ * Authors: Alan Cox <iiitac@pyr.swan.ac.uk> * Florian La Roche <rzsfl@rz.uni-sb.de> * - * Version: $Id: skbuff.c,v 1.56 1999/05/29 23:20:42 davem Exp $ + * Version: $Id: skbuff.c,v 1.60 1999/08/23 07:02:01 davem Exp $ * * Fixes: * Alan Cox : Fixed the worst of the load balancer bugs. 
@@ -61,6 +61,10 @@ #include <asm/uaccess.h> #include <asm/system.h> +#ifdef CONFIG_ATM +#include <linux/atmdev.h> +#endif + /* * Resource tracking variables */ @@ -81,14 +85,16 @@ static kmem_cache_t *skbuff_head_cache; void skb_over_panic(struct sk_buff *skb, int sz, void *here) { - panic("skput:over: %p:%d put:%d dev:%s", + printk("skput:over: %p:%d put:%d dev:%s", here, skb->len, sz, skb->dev ? skb->dev->name : "<NULL>"); + *(int*)0 = 0; } void skb_under_panic(struct sk_buff *skb, int sz, void *here) { - panic("skput:under: %p:%d put:%d dev:%s", + printk("skput:under: %p:%d put:%d dev:%s", here, skb->len, sz, skb->dev ? skb->dev->name : "<NULL>"); + *(int*)0 = 0; } void show_net_buffers(void) @@ -120,7 +126,8 @@ struct sk_buff *alloc_skb(unsigned int size,int gfp_mask) static int count = 0; if (++count < 5) { printk(KERN_ERR "alloc_skb called nonatomically " - "from interrupt %p\n", __builtin_return_address(0)); + "from interrupt %p\n", NET_CALLER(size)); + *(int*)0 = 0; } gfp_mask &= ~__GFP_WAIT; } @@ -142,7 +149,8 @@ struct sk_buff *alloc_skb(unsigned int size,int gfp_mask) */ atomic_inc(&net_allocs); - skb->truesize = size; + /* XXX: does not include slab overhead */ + skb->truesize = size + sizeof(struct sk_buff); atomic_inc(&net_skbcount); @@ -157,6 +165,10 @@ struct sk_buff *alloc_skb(unsigned int size,int gfp_mask) skb->is_clone = 0; skb->cloned = 0; +#ifdef CONFIG_ATM + ATM_SKB(skb)->iovcnt = 0; +#endif + atomic_set(&skb->users, 1); atomic_set(skb_datarefp(skb), 1); return skb; @@ -187,8 +199,12 @@ static inline void skb_headerinit(void *p, kmem_cache_t *cache, skb->ip_summed = 0; skb->security = 0; /* By default packets are insecure */ skb->dst = NULL; -#ifdef CONFIG_IP_FIREWALL - skb->fwmark = 0; + skb->rx_dev = NULL; +#ifdef CONFIG_NETFILTER + skb->nfmark = skb->nfreason = skb->nfcache = 0; +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug = 0; +#endif #endif memset(skb->cb, 0, sizeof(skb->cb)); skb->priority = 0; @@ -212,13 +228,17 @@ void kfree_skbmem(struct sk_buff *skb) void __kfree_skb(struct sk_buff *skb) { - if (skb->list) + if (skb->list) { printk(KERN_WARNING "Warning: kfree_skb passed an skb still " - "on a list (from %p).\n", __builtin_return_address(0)); + "on a list (from %p).\n", NET_CALLER(skb)); + *(int*)0 = 0; + } dst_release(skb->dst); if(skb->destructor) skb->destructor(skb); + if(skb->rx_dev) + dev_put(skb->rx_dev); skb_headerinit(skb, NULL, 0); /* clean state */ kfree_skbmem(skb); } @@ -242,6 +262,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) atomic_inc(&net_allocs); atomic_inc(&net_skbcount); dst_clone(n->dst); + n->rx_dev = NULL; n->cloned = 1; n->next = n->prev = NULL; n->list = NULL; @@ -285,6 +306,7 @@ struct sk_buff *skb_copy(struct sk_buff *skb, int gfp_mask) n->list=NULL; n->sk=NULL; n->dev=skb->dev; + n->rx_dev=NULL; n->priority=skb->priority; n->protocol=skb->protocol; n->dst=dst_clone(skb->dst); @@ -299,8 +321,13 @@ struct sk_buff *skb_copy(struct sk_buff *skb, int gfp_mask) n->stamp=skb->stamp; n->destructor = NULL; n->security=skb->security; -#ifdef CONFIG_IP_FIREWALL - n->fwmark = skb->fwmark; +#ifdef CONFIG_NETFILTER + n->nfmark=skb->nfmark; + n->nfreason=skb->nfreason; + n->nfcache=skb->nfcache; +#ifdef CONFIG_NETFILTER_DEBUG + n->nf_debug=skb->nf_debug; +#endif #endif return n; } @@ -309,13 +336,12 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, int newheadroom) { struct sk_buff *n; unsigned long offset; - int headroom = skb_headroom(skb); /* * Allocate the copy buffer */ - 
n=alloc_skb(skb->truesize+newheadroom-headroom, GFP_ATOMIC); + n=alloc_skb((skb->end-skb->data)+newheadroom, GFP_ATOMIC); if(n==NULL) return NULL; @@ -336,6 +362,7 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, int newheadroom) n->priority=skb->priority; n->protocol=skb->protocol; n->dev=skb->dev; + n->rx_dev=NULL; n->dst=dst_clone(skb->dst); n->h.raw=skb->h.raw+offset; n->nh.raw=skb->nh.raw+offset; @@ -348,10 +375,14 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, int newheadroom) n->stamp=skb->stamp; n->destructor = NULL; n->security=skb->security; -#ifdef CONFIG_IP_FIREWALL - n->fwmark = skb->fwmark; +#ifdef CONFIG_NETFILTER + n->nfmark=skb->nfmark; + n->nfreason=skb->nfreason; + n->nfcache=skb->nfcache; +#ifdef CONFIG_NETFILTER_DEBUG + n->nf_debug=skb->nf_debug; +#endif #endif - return n; } diff --git a/net/core/sock.c b/net/core/sock.c index c38e92e93..2b0018ec9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -7,7 +7,7 @@ * handler for protocols to use and generic option handler. * * - * Version: $Id: sock.c,v 1.82 1999/05/27 00:37:03 davem Exp $ + * Version: $Id: sock.c,v 1.86 1999/09/01 08:11:49 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -116,7 +116,6 @@ #include <net/ip.h> #include <net/protocol.h> #include <net/arp.h> -#include <net/rarp.h> #include <net/route.h> #include <net/tcp.h> #include <net/udp.h> @@ -180,7 +179,9 @@ int sock_setsockopt(struct socket *sock, int level, int optname, return err; valbool = val?1:0; - + + lock_sock(sk); + switch(optname) { case SO_DEBUG: @@ -257,14 +258,15 @@ int sock_setsockopt(struct socket *sock, int level, int optname, if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN)) sk->priority = val; else - return(-EPERM); + ret = -EPERM; break; case SO_LINGER: - if(optlen<sizeof(ling)) - return -EINVAL; /* 1003.1g */ - err = copy_from_user(&ling,optval,sizeof(ling)); - if (err) + if(optlen<sizeof(ling)) { + ret = -EINVAL; /* 1003.1g */ + break; + } + if (copy_from_user(&ling,optval,sizeof(ling))) { ret = -EFAULT; break; @@ -293,8 +295,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname, char devname[IFNAMSIZ]; /* Sorry... */ - if (!capable(CAP_NET_RAW)) - return -EPERM; + if (!capable(CAP_NET_RAW)) { + ret = -EPERM; + break; + } /* Bind this socket to a particular device like "eth0", * as specified in the passed interface name. If the @@ -307,24 +311,27 @@ int sock_setsockopt(struct socket *sock, int level, int optname, } else { if (optlen > IFNAMSIZ) optlen = IFNAMSIZ; - if (copy_from_user(devname, optval, optlen)) - return -EFAULT; + if (copy_from_user(devname, optval, optlen)) { + ret = -EFAULT; + break; + } /* Remove any cached route for this socket. 
*/ - lock_sock(sk); - dst_release(xchg(&sk->dst_cache, NULL)); - release_sock(sk); + sk_dst_reset(sk); if (devname[0] == '\0') { sk->bound_dev_if = 0; } else { - struct device *dev = dev_get(devname); - if (!dev) - return -EINVAL; + struct net_device *dev = dev_get_by_name(devname); + if (!dev) { + ret = -ENODEV; + break; + } sk->bound_dev_if = dev->ifindex; + dev_put(dev); } - return 0; } + break; } #endif @@ -344,20 +351,25 @@ int sock_setsockopt(struct socket *sock, int level, int optname, break; case SO_DETACH_FILTER: + spin_lock_bh(&sk->lock.slock); filter = sk->filter; - if(filter) { + if (filter) { sk->filter = NULL; - synchronize_bh(); + spin_unlock_bh(&sk->lock.slock); sk_filter_release(sk, filter); - return 0; + break; } - return -ENOENT; + spin_unlock_bh(&sk->lock.slock); + ret = -ENONET; + break; #endif /* We implement the SO_SNDLOWAT etc to not be settable (1003.1g 5.3) */ default: - return(-ENOPROTOOPT); + ret = -ENOPROTOOPT; + break; } + release_sock(sk); return ret; } @@ -501,6 +513,7 @@ void sk_free(struct sock *sk) #ifdef CONFIG_FILTER struct sk_filter *filter; #endif + if (sk->destruct) sk->destruct(sk); @@ -540,6 +553,7 @@ void sock_wfree(struct sk_buff *skb) /* In case it might be waiting for more memory. */ atomic_sub(skb->truesize, &sk->wmem_alloc); sk->write_space(sk); + sock_put(sk); } /* @@ -552,6 +566,10 @@ void sock_rfree(struct sk_buff *skb) atomic_sub(skb->truesize, &sk->rmem_alloc); } +void sock_cfree(struct sk_buff *skb) +{ + sock_put(skb->sk); +} /* * Allocate a skb from the socket's send buffer. @@ -561,9 +579,7 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int if (force || atomic_read(&sk->wmem_alloc) < sk->sndbuf) { struct sk_buff * skb = alloc_skb(size, priority); if (skb) { - atomic_add(skb->truesize, &sk->wmem_alloc); - skb->destructor = sock_wfree; - skb->sk = sk; + skb_set_owner_w(skb, sk); return skb; } } @@ -578,9 +594,7 @@ struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int if (force || atomic_read(&sk->rmem_alloc) < sk->rcvbuf) { struct sk_buff *skb = alloc_skb(size, priority); if (skb) { - atomic_add(skb->truesize, &sk->rmem_alloc); - skb->destructor = sock_rfree; - skb->sk = sk; + skb_set_owner_r(skb, sk); return skb; } } @@ -592,7 +606,8 @@ struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int */ void *sock_kmalloc(struct sock *sk, int size, int priority) { - if (atomic_read(&sk->omem_alloc)+size < sysctl_optmem_max) { + if ((unsigned)size <= sysctl_optmem_max && + atomic_read(&sk->omem_alloc)+size < sysctl_optmem_max) { void *mem; /* First do the add, to avoid the race if kmalloc * might sleep. 
@@ -657,7 +672,7 @@ static void sock_wait_for_wmem(struct sock * sk) for (;;) { if (signal_pending(current)) break; - current->state = TASK_INTERRUPTIBLE; + set_current_state(TASK_INTERRUPTIBLE); if (atomic_read(&sk->wmem_alloc) < sk->sndbuf) break; if (sk->shutdown & SEND_SHUTDOWN) @@ -666,7 +681,7 @@ static void sock_wait_for_wmem(struct sock * sk) break; schedule(); } - current->state = TASK_RUNNING; + __set_current_state(TASK_RUNNING); remove_wait_queue(sk->sleep, &wait); } @@ -736,62 +751,57 @@ failure: return NULL; } -void lock_sock(struct sock *sk) +void __lock_sock(struct sock *sk) { - spin_lock_bh(&sk->lock.slock); - if(sk->lock.users != 0) { - DECLARE_WAITQUEUE(wait, current); + DECLARE_WAITQUEUE(wait, current); - add_wait_queue_exclusive(&sk->lock.wq, &wait); - for(;;) { - current->state = TASK_EXCLUSIVE | TASK_UNINTERRUPTIBLE; - spin_unlock_bh(&sk->lock.slock); - schedule(); - spin_lock_bh(&sk->lock.slock); - if(!sk->lock.users) - break; - } - current->state = TASK_RUNNING; - remove_wait_queue(&sk->lock.wq, &wait); + add_wait_queue_exclusive(&sk->lock.wq, &wait); + for(;;) { + current->state = TASK_EXCLUSIVE | TASK_UNINTERRUPTIBLE; + spin_unlock_bh(&sk->lock.slock); + schedule(); + spin_lock_bh(&sk->lock.slock); + if(!sk->lock.users) + break; } - sk->lock.users = 1; - spin_unlock_bh(&sk->lock.slock); + current->state = TASK_RUNNING; + remove_wait_queue(&sk->lock.wq, &wait); } -void release_sock(struct sock *sk) +void __release_sock(struct sock *sk) { - spin_lock_bh(&sk->lock.slock); - sk->lock.users = 0; - if(sk->backlog.tail != NULL) { - struct sk_buff *skb = sk->backlog.head; - do { struct sk_buff *next = skb->next; - skb->next = NULL; - sk->backlog_rcv(sk, skb); - skb = next; - } while(skb != NULL); - sk->backlog.head = sk->backlog.tail = NULL; - } - wake_up(&sk->lock.wq); - spin_unlock_bh(&sk->lock.slock); + struct sk_buff *skb = sk->backlog.head; + do { + struct sk_buff *next = skb->next; + skb->next = NULL; + sk->backlog_rcv(sk, skb); + skb = next; + } while(skb != NULL); + sk->backlog.head = sk->backlog.tail = NULL; } /* * Generic socket manager library. Most simpler socket families * use this to manage their socket lists. At some point we should * hash these. By making this generic we get the lot hashed for free. + * + * It is broken by design. All the protocols using it must be fixed. --ANK */ + +rwlock_t net_big_sklist_lock = RW_LOCK_UNLOCKED; void sklist_remove_socket(struct sock **list, struct sock *sk) { struct sock *s; - start_bh_atomic(); + write_lock_bh(&net_big_sklist_lock); s= *list; if(s==sk) { *list = s->next; - end_bh_atomic(); + write_unlock_bh(&net_big_sklist_lock); + sock_put(sk); return; } while(s && s->next) @@ -803,15 +813,16 @@ void sklist_remove_socket(struct sock **list, struct sock *sk) } s=s->next; } - end_bh_atomic(); + write_unlock_bh(&net_big_sklist_lock); } void sklist_insert_socket(struct sock **list, struct sock *sk) { - start_bh_atomic(); + write_lock_bh(&net_big_sklist_lock); sk->next= *list; *list=sk; - end_bh_atomic(); + sock_hold(sk); + write_unlock_bh(&net_big_sklist_lock); } /* @@ -853,7 +864,7 @@ void sklist_destroy_socket(struct sock **list,struct sock *sk) atomic_read(&sk->rmem_alloc) == 0 && sk->dead) { - sk_free(sk); + sock_put(sk); } else { @@ -875,14 +886,7 @@ void sklist_destroy_socket(struct sock **list,struct sock *sk) * function, some default processing is provided. 
*/ -int sock_no_dup(struct socket *newsock, struct socket *oldsock) -{ - struct sock *sk = oldsock->sk; - - return net_families[sk->family]->create(newsock, sk->protocol); -} - -int sock_no_release(struct socket *sock, struct socket *peersock) +int sock_no_release(struct socket *sock) { return 0; } @@ -986,7 +990,11 @@ int sock_no_recvmsg(struct socket *sock, struct msghdr *m, int flags, return -EOPNOTSUPP; } - +int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) +{ + /* Mirror missing mmap method error code */ + return -ENODEV; +} /* * Default Socket Callbacks @@ -994,28 +1002,36 @@ int sock_no_recvmsg(struct socket *sock, struct msghdr *m, int flags, void sock_def_wakeup(struct sock *sk) { + read_lock(&sk->callback_lock); if(!sk->dead) wake_up_interruptible(sk->sleep); + read_unlock(&sk->callback_lock); } void sock_def_error_report(struct sock *sk) { + read_lock(&sk->callback_lock); if (!sk->dead) { wake_up_interruptible(sk->sleep); sock_wake_async(sk->socket,0); } + read_unlock(&sk->callback_lock); } void sock_def_readable(struct sock *sk, int len) { + read_lock(&sk->callback_lock); if(!sk->dead) { wake_up_interruptible(sk->sleep); sock_wake_async(sk->socket,1); } + read_unlock(&sk->callback_lock); } void sock_def_write_space(struct sock *sk) { + read_lock(&sk->callback_lock); + /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ @@ -1027,6 +1043,7 @@ void sock_def_write_space(struct sock *sk) if (sock_writeable(sk)) sock_wake_async(sk->socket, 2); } + read_unlock(&sk->callback_lock); } void sock_def_destruct(struct sock *sk) @@ -1040,7 +1057,8 @@ void sock_init_data(struct socket *sock, struct sock *sk) skb_queue_head_init(&sk->receive_queue); skb_queue_head_init(&sk->write_queue); skb_queue_head_init(&sk->error_queue); - + + spin_lock_init(&sk->timer_lock); init_timer(&sk->timer); sk->allocation = GFP_KERNEL; @@ -1058,6 +1076,8 @@ void sock_init_data(struct socket *sock, struct sock *sk) } else sk->sleep = NULL; + sk->callback_lock = RW_LOCK_UNLOCKED; + sk->state_change = sock_def_wakeup; sk->data_ready = sock_def_readable; sk->write_space = sock_def_write_space; @@ -1068,4 +1088,5 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->peercred.uid = -1; sk->peercred.gid = -1; + atomic_set(&sk->refcnt, 1); } diff --git a/net/core/utils.c b/net/core/utils.c index 415926b8e..310393453 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -46,21 +46,28 @@ int net_msg_burst = 10*5*HZ; */ int net_ratelimit(void) { + static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED; static unsigned long toks = 10*5*HZ; static unsigned long last_msg; static int missed; + unsigned long flags; unsigned long now = jiffies; - toks += now - xchg(&last_msg, now); + spin_lock_irqsave(&ratelimit_lock, flags); + toks += now - last_msg; + last_msg = now; if (toks > net_msg_burst) toks = net_msg_burst; if (toks >= net_msg_cost) { - toks -= net_msg_cost; - if (missed) - printk(KERN_WARNING "NET: %d messages suppressed.\n", missed); + int lost = missed; missed = 0; + toks -= net_msg_cost; + spin_unlock_irqrestore(&ratelimit_lock, flags); + if (lost) + printk(KERN_WARNING "NET: %d messages suppressed.\n", lost); return 1; } - missed++; + missed++; + spin_unlock_irqrestore(&ratelimit_lock, flags); return 0; } |
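
The new net/core/netfilter.c above defines the hook API that replaces firewall.c: nf_register_hook() keeps each nf_hooks[pf][hooknum] list sorted by ascending priority, and nf_hook_slow() walks that list until a hook returns NF_DROP, NF_STOLEN or NF_QUEUE. For illustration only, a minimal client of that API might look like the sketch below; the four-argument hook prototype is inferred from the call made in nf_iterate(), and the module entry points are assumptions, not part of this merge.

/* Hypothetical example module: drop every forwarded IPv4 packet.
 * Sketch only -- the prototype mirrors the call elem->hook(hook, skb,
 * indev, outdev) in nf_iterate(); field names are taken from the
 * references in nf_register_hook() and nf_queue() above. */
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>

static unsigned int drop_forward(int hooknum,
				 struct sk_buff **skb,
				 const struct net_device *in,
				 const struct net_device *out)
{
	return NF_DROP;			/* nf_hook_slow() frees the skb */
}

static struct nf_hook_ops drop_ops;

int init_module(void)
{
	drop_ops.hook     = drop_forward;
	drop_ops.flush    = NULL;
	drop_ops.pf       = PF_INET;
	drop_ops.hooknum  = NF_IP_FORWARD;
	drop_ops.priority = 0;		/* lists are kept sorted, lowest first */
	return nf_register_hook(&drop_ops);
}

void cleanup_module(void)
{
	nf_unregister_hook(&drop_ops);
}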
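
nf_register_sockopt() gives each caller a private window of setsockopt()/getsockopt() option numbers; overlap() treats [optmin, optmax) as half-open, so ranges may touch but never intersect, and a collision is rejected with -EBUSY. A sketch of such a registration is shown below; the option numbers and handler prototypes are assumptions inferred from the ops->set()/ops->get() calls in nf_sockopt() above.

/* Hypothetical registration of four set and four get option numbers. */
#include <linux/netfilter.h>
#include <net/sock.h>

#define MYPROTO_BASE	64		/* assumed, not allocated anywhere */

static int myproto_set(struct sock *sk, int optval, char *user, int len)
{
	return 0;			/* copy_from_user() and apply here */
}

static int myproto_get(struct sock *sk, int optval, char *user, int *len)
{
	return 0;			/* fill in and copy_to_user() here */
}

static struct nf_sockopt_ops myproto_sockopts;

int myproto_register_sockopts(void)
{
	myproto_sockopts.pf         = PF_INET;
	myproto_sockopts.set_optmin = MYPROTO_BASE;		/* inclusive */
	myproto_sockopts.set_optmax = MYPROTO_BASE + 4;		/* exclusive */
	myproto_sockopts.get_optmin = MYPROTO_BASE;
	myproto_sockopts.get_optmax = MYPROTO_BASE + 4;
	myproto_sockopts.set        = myproto_set;
	myproto_sockopts.get        = myproto_get;
	return nf_register_sockopt(&myproto_sockopts);	/* -EBUSY on overlap */
}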
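
In net/core/sock.c, sock_wmalloc() and sock_rmalloc() now delegate the open-coded buffer accounting to skb_set_owner_w()/skb_set_owner_r(), and sock_wfree() gains a sock_put(). Those helpers live in include/net/sock.h and are not part of this hunk; their presumed shape, shown purely for context, is:

/* Presumed shape only -- the real definitions are in include/net/sock.h.
 * The sock_hold() here pairs with the sock_put() added to sock_wfree()
 * above, keeping the socket alive while it still owns transmit buffers. */
static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
	sock_hold(sk);
	skb->sk = sk;
	skb->destructor = sock_wfree;
	atomic_add(skb->truesize, &sk->wmem_alloc);
}

static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
	skb->sk = sk;
	skb->destructor = sock_rfree;
	atomic_add(skb->truesize, &sk->rmem_alloc);
}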
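
net_ratelimit() in net/core/utils.c is a token bucket, now guarded by its own spinlock so two CPUs cannot spend the same tokens: tokens accrue one per jiffy up to net_msg_burst, and each printed message costs net_msg_cost of them. The stand-alone C rendering below reproduces that arithmetic; HZ and the cost value are assumptions (only the burst initializer of 10*5*HZ is visible in the hunk above).

#include <stdio.h>

#define HZ	100			/* assumed */
#define COST	(5 * HZ)		/* assumed net_msg_cost default */
#define BURST	(10 * 5 * HZ)		/* net_msg_burst, as initialized above */

static unsigned long toks = BURST;
static unsigned long last_msg;
static int missed;

int ratelimit(unsigned long now)	/* `now' plays the role of jiffies */
{
	toks += now - last_msg;		/* accrue one token per elapsed tick */
	last_msg = now;
	if (toks > BURST)
		toks = BURST;		/* cap the burst */
	if (toks >= COST) {
		toks -= COST;
		if (missed)
			printf("NET: %d messages suppressed.\n", missed);
		missed = 0;
		return 1;		/* caller may log */
	}
	missed++;
	return 0;			/* suppress this message */
}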