Diffstat (limited to 'net/core')
-rw-r--r--  net/core/.cvsignore        |    1
-rw-r--r--  net/core/Makefile          |   10
-rw-r--r--  net/core/datagram.c        |    6
-rw-r--r--  net/core/dev.c             |  459
-rw-r--r--  net/core/dev_mcast.c       |  130
-rw-r--r--  net/core/dst.c             |   39
-rw-r--r--  net/core/filter.c          |  366
-rw-r--r--  net/core/firewall.c        |    1
-rw-r--r--  net/core/iovec.c           |  169
-rw-r--r--  net/core/neighbour.c       | 1369
-rw-r--r--  net/core/profile.c         |  304
-rw-r--r--  net/core/rtnetlink.c       |  315
-rw-r--r--  net/core/scm.c             |  141
-rw-r--r--  net/core/skbuff.c          |   31
-rw-r--r--  net/core/sock.c            |  151
-rw-r--r--  net/core/sysctl_net_core.c |   19
-rw-r--r--  net/core/utils.c           |   66
17 files changed, 2873 insertions(+), 704 deletions(-)
diff --git a/net/core/.cvsignore b/net/core/.cvsignore index 4671378ae..857dd22e9 100644 --- a/net/core/.cvsignore +++ b/net/core/.cvsignore @@ -1 +1,2 @@ .depend +.*.flags diff --git a/net/core/Makefile b/net/core/Makefile index 2ae776157..fc9dc31c4 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -10,12 +10,16 @@ O_TARGET := core.o O_OBJS := sock.o skbuff.o iovec.o datagram.o dst.o scm.o \ - neighbour.o rtnetlink.o + neighbour.o rtnetlink.o utils.o ifeq ($(CONFIG_SYSCTL),y) O_OBJS += sysctl_net_core.o endif +ifdef CONFIG_FILTER +O_OBJS += filter.o +endif + ifdef CONFIG_NET O_OBJS += dev.o dev_mcast.o @@ -26,6 +30,10 @@ endif endif +ifdef CONFIG_NET_PROFILE +OX_OBJS += profile.o +endif + include $(TOPDIR)/Rules.make tar: diff --git a/net/core/datagram.c b/net/core/datagram.c index cd6e95000..cdab70aba 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -153,7 +153,7 @@ no_packet: void skb_free_datagram(struct sock * sk, struct sk_buff *skb) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); release_sock(sk); } @@ -195,12 +195,12 @@ int skb_copy_datagram_iovec(struct sk_buff *skb, int offset, struct iovec *to, * is only ever holding data ready to receive. */ -unsigned int datagram_poll(struct socket *sock, poll_table *wait) +unsigned int datagram_poll(struct file * file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; unsigned int mask; - poll_wait(sk->sleep, wait); + poll_wait(file, sk->sleep, wait); mask = 0; /* exceptional events? */ diff --git a/net/core/dev.c b/net/core/dev.c index 8d94f6817..b06d0053e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -48,6 +48,8 @@ * 1 device. * Thomas Bogendoerfer : Return ENODEV for dev_open, if there * is no device open function. + * Andi Kleen : Fix error reporting for SIOCGIFCONF + * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF * */ @@ -75,11 +77,11 @@ #include <linux/proc_fs.h> #include <linux/stat.h> #include <net/br.h> +#include <net/dst.h> #include <net/pkt_sched.h> +#include <net/profile.h> #include <linux/init.h> -#ifdef CONFIG_KERNELD #include <linux/kerneld.h> -#endif #ifdef CONFIG_NET_RADIO #include <linux/wireless.h> #endif /* CONFIG_NET_RADIO */ @@ -87,6 +89,10 @@ extern int plip_init(void); #endif +NET_PROFILE_DEFINE(dev_queue_xmit) +NET_PROFILE_DEFINE(net_bh) +NET_PROFILE_DEFINE(net_bh_skb) + const char *if_port_text[] = { "unknown", @@ -141,6 +147,13 @@ static struct notifier_block *netdev_chain=NULL; static struct sk_buff_head backlog; +#ifdef CONFIG_NET_FASTROUTE +int netdev_fastroute; +int netdev_fastroute_obstacles; +struct net_fastroute_stats dev_fastroute_stat; +#endif + + /****************************************************************************************** Protocol management and registration routines @@ -162,6 +175,13 @@ int netdev_nit=0; void dev_add_pack(struct packet_type *pt) { int hash; +#ifdef CONFIG_NET_FASTROUTE + /* Hack to detect packet socket */ + if (pt->data) { + netdev_fastroute_obstacles++; + dev_clear_fastroute(pt->dev); + } +#endif if(pt->type==htons(ETH_P_ALL)) { netdev_nit++; @@ -196,6 +216,10 @@ void dev_remove_pack(struct packet_type *pt) if(pt==(*pt1)) { *pt1=pt->next; +#ifdef CONFIG_NET_FASTROUTE + if (pt->data) + netdev_fastroute_obstacles--; +#endif return; } } @@ -296,17 +320,20 @@ struct device *dev_alloc(const char *name, int *err) void dev_load(const char *name) { - if(!dev_get(name)) + if(!dev_get(name) && suser()) request_module(name); } +#else + +extern inline void dev_load(const char *unused){;} + #endif -static int 
-default_rebuild_header(struct sk_buff *skb) +static int default_rebuild_header(struct sk_buff *skb) { - printk(KERN_DEBUG "%s: !skb->arp & !rebuild_header -- BUG!\n", skb->dev->name); - kfree_skb(skb, FREE_WRITE); + printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n", skb->dev ? skb->dev->name : "NULL!!!"); + kfree_skb(skb); return 1; } @@ -370,6 +397,24 @@ int dev_open(struct device *dev) return(ret); } +#ifdef CONFIG_NET_FASTROUTE +void dev_clear_fastroute(struct device *dev) +{ + int i; + + if (dev) { + for (i=0; i<=NETDEV_FASTROUTE_HMASK; i++) + dst_release(xchg(dev->fastpath+i, NULL)); + } else { + for (dev = dev_base; dev; dev = dev->next) { + if (dev->accept_fastpath) { + for (i=0; i<=NETDEV_FASTROUTE_HMASK; i++) + dst_release(xchg(dev->fastpath+i, NULL)); + } + } + } +} +#endif /* * Completely shutdown an interface. @@ -400,6 +445,9 @@ int dev_close(struct device *dev) */ dev->flags&=~(IFF_UP|IFF_RUNNING); +#ifdef CONFIG_NET_FASTROUTE + dev_clear_fastroute(dev); +#endif /* * Tell people we are going down @@ -488,7 +536,9 @@ void dev_loopback_xmit(struct sk_buff *skb) if (newskb==NULL) return; + newskb->mac.raw = newskb->data; skb_pull(newskb, newskb->nh.raw - newskb->data); + newskb->pkt_type = PACKET_LOOPBACK; newskb->ip_summed = CHECKSUM_UNNECESSARY; if (newskb->dst==NULL) printk(KERN_DEBUG "BUG: packet without dst looped back 1\n"); @@ -500,24 +550,23 @@ int dev_queue_xmit(struct sk_buff *skb) struct device *dev = skb->dev; struct Qdisc *q; - /* - * If the address has not been resolved. Call the device header rebuilder. - * This can cover all protocols and technically not just ARP either. - * - * This call must be moved to protocol layer. - * Now it works only for IPv6 and for IPv4 in - * some unusual curcumstances (eql device). --ANK - */ - - if (!skb->arp && dev->rebuild_header(skb)) - return 0; +#ifdef CONFIG_NET_PROFILE + start_bh_atomic(); + NET_PROFILE_ENTER(dev_queue_xmit); +#endif + start_bh_atomic(); q = dev->qdisc; if (q->enqueue) { - start_bh_atomic(); q->enqueue(skb, q); qdisc_wakeup(dev); end_bh_atomic(); + +#ifdef CONFIG_NET_PROFILE + NET_PROFILE_LEAVE(dev_queue_xmit); + end_bh_atomic(); +#endif + return 0; } @@ -530,18 +579,30 @@ int dev_queue_xmit(struct sk_buff *skb) made by us here. 
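dev_queue_xmit() now expects a fully resolved skb and either enqueues it on dev->qdisc or hands it straight to the driver. A minimal caller, sketched against the 2.1-era interfaces (the protocol and the priority value are illustrative):

	static int toy_output(struct sk_buff *skb, struct device *dev)
	{
		skb->dev = dev;
		skb->priority = 1;		/* selects a qdisc band */
		return dev_queue_xmit(skb);	/* queue, or transmit directly */
	}
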
*/ if (dev->flags&IFF_UP) { - start_bh_atomic(); if (netdev_nit) dev_queue_xmit_nit(skb,dev); if (dev->hard_start_xmit(skb, dev) == 0) { end_bh_atomic(); + +#ifdef CONFIG_NET_PROFILE + NET_PROFILE_LEAVE(dev_queue_xmit); + end_bh_atomic(); +#endif + return 0; } if (net_ratelimit()) printk(KERN_DEBUG "Virtual device %s asks to queue packet!\n", dev->name); - end_bh_atomic(); } - kfree_skb(skb, FREE_WRITE); + end_bh_atomic(); + + kfree_skb(skb); + +#ifdef CONFIG_NET_PROFILE + NET_PROFILE_LEAVE(dev_queue_xmit); + end_bh_atomic(); +#endif + return 0; } @@ -551,7 +612,74 @@ int dev_queue_xmit(struct sk_buff *skb) =======================================================================*/ int netdev_dropping = 0; +int netdev_max_backlog = 300; atomic_t netdev_rx_dropped; +#ifdef CONFIG_CPU_IS_SLOW +int net_cpu_congestion; +#endif + +#ifdef CONFIG_NET_HW_FLOWCONTROL +int netdev_throttle_events; +static unsigned long netdev_fc_mask = 1; +unsigned long netdev_fc_xoff = 0; + +static struct +{ + void (*stimul)(struct device *); + struct device *dev; +} netdev_fc_slots[32]; + +int netdev_register_fc(struct device *dev, void (*stimul)(struct device *dev)) +{ + int bit = 0; + unsigned long flags; + + save_flags(flags); + cli(); + if (netdev_fc_mask != ~0UL) { + bit = ffz(netdev_fc_mask); + netdev_fc_slots[bit].stimul = stimul; + netdev_fc_slots[bit].dev = dev; + set_bit(bit, &netdev_fc_mask); + clear_bit(bit, &netdev_fc_xoff); + } + sti(); + return bit; +} + +void netdev_unregister_fc(int bit) +{ + unsigned long flags; + + save_flags(flags); + cli(); + if (bit > 0) { + netdev_fc_slots[bit].stimul = NULL; + netdev_fc_slots[bit].dev = NULL; + clear_bit(bit, &netdev_fc_mask); + clear_bit(bit, &netdev_fc_xoff); + } + sti(); +} + +static void netdev_wakeup(void) +{ + unsigned long xoff; + + cli(); + xoff = netdev_fc_xoff; + netdev_fc_xoff = 0; + netdev_dropping = 0; + netdev_throttle_events++; + while (xoff) { + int i = ffz(~xoff); + xoff &= ~(1<<i); + netdev_fc_slots[i].stimul(netdev_fc_slots[i].dev); + } + sti(); +} +#endif + /* * Receive a packet from a device driver and queue it for the upper @@ -560,42 +688,45 @@ atomic_t netdev_rx_dropped; void netif_rx(struct sk_buff *skb) { +#ifndef CONFIG_CPU_IS_SLOW if(skb->stamp.tv_sec==0) get_fast_time(&skb->stamp); +#else + skb->stamp = xtime; +#endif - /* - * Check that we aren't overdoing things. + /* The code is rearranged so that the path is the most + short when CPU is congested, but is still operating. */ - if (!backlog.qlen) - netdev_dropping = 0; - else if (backlog.qlen > 300) - netdev_dropping = 1; - - if (netdev_dropping) - { - atomic_inc(&netdev_rx_dropped); - kfree_skb(skb, FREE_READ); + if (backlog.qlen <= netdev_max_backlog) { + if (backlog.qlen) { + if (netdev_dropping == 0) { + skb_queue_tail(&backlog,skb); + mark_bh(NET_BH); + return; + } + atomic_inc(&netdev_rx_dropped); + kfree_skb(skb); + return; + } +#ifdef CONFIG_NET_HW_FLOWCONTROL + if (netdev_dropping) + netdev_wakeup(); +#else + netdev_dropping = 0; +#endif + skb_queue_tail(&backlog,skb); + mark_bh(NET_BH); return; } - - /* - * Add it to the "backlog" queue. - */ - - skb_queue_tail(&backlog,skb); - - /* - * If any packet arrived, mark it for processing after the - * hardware interrupt returns. 
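The CONFIG_NET_HW_FLOWCONTROL slots above let a driver be restarted once the stack stops throttling; netdev_wakeup() walks the xoff bits and calls each registered stimulus. A driver-side sketch, with the device and its helpers purely hypothetical:

	static int toy_fc_bit;

	static void toy_xon(struct device *dev)
	{
		/* invoked from netdev_wakeup() when the backlog drains */
		toy_enable_rx(dev);		/* hypothetical restart helper */
	}

	static int toy_open(struct device *dev)
	{
		toy_fc_bit = netdev_register_fc(dev, toy_xon);
		return 0;
	}

	static void toy_rx_congested(struct device *dev)
	{
		/* park until the stack signals XON through toy_xon() */
		if (toy_fc_bit)
			set_bit(toy_fc_bit, &netdev_fc_xoff);
	}
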
- */ - - mark_bh(NET_BH); - return; + netdev_dropping = 1; + atomic_inc(&netdev_rx_dropped); + kfree_skb(skb); } #ifdef CONFIG_BRIDGE -static inline void handle_bridge(struct skbuff *skb, unsigned short type) +static inline void handle_bridge(struct sk_buff *skb, unsigned short type) { if (br_stats.flags & BR_UP && br_protocol_ok(ntohs(type))) { @@ -610,7 +741,7 @@ static inline void handle_bridge(struct skbuff *skb, unsigned short type) if(br_receive_frame(skb)) { sti(); - continue; + return; } /* * Pull the MAC header off for the copy going to @@ -622,9 +753,6 @@ static inline void handle_bridge(struct skbuff *skb, unsigned short type) } #endif -#ifdef CONFIG_CPU_IS_SLOW -int net_cpu_congestion; -#endif /* * When we are called the queue is ready to grab, the interrupts are @@ -649,6 +777,7 @@ void net_bh(void) net_cpu_congestion = ave_busy>>8; #endif + NET_PROFILE_ENTER(net_bh); /* * Can we send anything now? We want to clear the * decks for any more sends that get done as we @@ -677,11 +806,9 @@ void net_bh(void) { struct sk_buff * skb = backlog.next; - if (jiffies - start_time > 1) { - /* Give chance to other bottom halves to run */ - mark_bh(NET_BH); - return; - } + /* Give chance to other bottom halves to run */ + if (jiffies - start_time > 1) + goto net_bh_break; /* * We have a packet. Therefore the queue has shrunk @@ -692,14 +819,24 @@ void net_bh(void) #ifdef CONFIG_CPU_IS_SLOW if (ave_busy > 128*16) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); while ((skb = skb_dequeue(&backlog)) != NULL) - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); break; } #endif - + +#if 0 + NET_PROFILE_SKB_PASSED(skb, net_bh_skb); +#endif +#ifdef CONFIG_NET_FASTROUTE + if (skb->pkt_type == PACKET_FASTROUTE) { + dev_queue_xmit(skb); + continue; + } +#endif + /* * Fetch the packet protocol ID. */ @@ -726,6 +863,12 @@ void net_bh(void) /* XXX until we figure out every place to modify.. */ skb->h.raw = skb->nh.raw = skb->data; + if (skb->mac.raw < skb->head || skb->mac.raw > skb->data) { + printk(KERN_CRIT "%s: wrong mac.raw ptr, proto=%04x\n", skb->dev->name, skb->protocol); + kfree_skb(skb); + continue; + } + /* * We got a packet ID. Now loop over the "known protocols" * list. There are two lists. 
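The two lists mentioned here are the ETH_P_ALL taps (ptype_all) and the hashed per-protocol handlers, both populated through dev_add_pack(). A registration sketch with an illustrative receiver:

	static int toy_rcv(struct sk_buff *skb, struct device *dev,
			   struct packet_type *pt)
	{
		/* a real handler would parse the frame before freeing it */
		kfree_skb(skb);
		return 0;
	}

	static struct packet_type toy_ptype = {
		0,		/* type, filled in at init time */
		NULL,		/* dev: NULL means all devices */
		toy_rcv,
		NULL,		/* data: non-NULL marks a fastroute obstacle */
		NULL
	};

	void toy_proto_init(void)
	{
		toy_ptype.type = htons(ETH_P_IP); /* or ETH_P_ALL for a tap */
		dev_add_pack(&toy_ptype);
	}
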
The ptype_all list of taps (normally empty) @@ -784,7 +927,7 @@ void net_bh(void) */ else { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); } } /* End of queue loop */ @@ -800,23 +943,36 @@ void net_bh(void) qdisc_run_queues(); #ifdef CONFIG_CPU_IS_SLOW -{ - unsigned long start_idle = jiffies; - ave_busy += ((start_idle - start_busy)<<3) - (ave_busy>>4); - start_busy = 0; -} + if (1) { + unsigned long start_idle = jiffies; + ave_busy += ((start_idle - start_busy)<<3) - (ave_busy>>4); + start_busy = 0; + } +#endif +#ifdef CONFIG_NET_HW_FLOWCONTROL + if (netdev_dropping) + netdev_wakeup(); +#else + netdev_dropping = 0; #endif + NET_PROFILE_LEAVE(net_bh); + return; + +net_bh_break: + mark_bh(NET_BH); + NET_PROFILE_LEAVE(net_bh); + return; } /* Protocol dependent address dumping routines */ -static int (*gifconf[NPROTO])(struct device *dev, char *bufptr, int len); +static gifconf_func_t * gifconf_list [NPROTO]; -int register_gifconf(int family, int (*func)(struct device *dev, char *bufptr, int len)) +int register_gifconf(unsigned int family, gifconf_func_t * gifconf) { - if (family<0 || family>=NPROTO) + if (family>=NPROTO) return -EINVAL; - gifconf[family] = func; + gifconf_list[family] = gifconf; return 0; } @@ -903,58 +1059,53 @@ static int dev_ifconf(char *arg) struct ifconf ifc; struct device *dev; char *pos; - unsigned int len; - int err; + int len; + int total; + int i; /* * Fetch the caller's info block. */ - err = copy_from_user(&ifc, arg, sizeof(struct ifconf)); - if (err) + if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) return -EFAULT; pos = ifc.ifc_buf; - if (pos==NULL) - ifc.ifc_len=0; len = ifc.ifc_len; /* * Loop over the interfaces, and write an info block for each. */ + total = 0; for (dev = dev_base; dev != NULL; dev = dev->next) { - int i; for (i=0; i<NPROTO; i++) { - int done; - - if (gifconf[i] == NULL) - continue; - - done = gifconf[i](dev, pos, len); - - if (done<0) - return -EFAULT; - - len -= done; - if (pos) - pos += done; + if (gifconf_list[i]) { + int done; + if (pos==NULL) { + done = gifconf_list[i](dev, NULL, 0); + } else { + done = gifconf_list[i](dev, pos+total, len-total); + } + if (done<0) + return -EFAULT; + total += done; + } } } /* * All done. Write the updated control block back to the caller. */ - ifc.ifc_len -= len; + ifc.ifc_len = total; if (copy_to_user(arg, &ifc, sizeof(struct ifconf))) return -EFAULT; - /* - * Report how much was filled in + /* + * Both BSD and Solaris return 0 here, so we do too. 
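From user space this supports the usual two-pass idiom: call once with ifc_buf == NULL so the kernel only computes the size, then call again with a buffer. A sketch, error handling trimmed:

	#include <stdlib.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <net/if.h>

	int dump_interfaces(int fd)
	{
		struct ifconf ifc;

		memset(&ifc, 0, sizeof(ifc));
		ifc.ifc_buf = NULL;
		if (ioctl(fd, SIOCGIFCONF, &ifc) < 0)	/* sizing pass */
			return -1;
		ifc.ifc_buf = malloc(ifc.ifc_len);
		if (ioctl(fd, SIOCGIFCONF, &ifc) < 0)	/* filling pass */
			return -1;
		/* ifc.ifc_len now holds the bytes actually written */
		free(ifc.ifc_buf);
		return 0;
	}
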
*/ - - return ifc.ifc_len; + return 0; } /* @@ -1006,7 +1157,7 @@ int dev_get_info(char *buffer, char **start, off_t offset, int length, int dummy size = sprintf(buffer, "Inter-| Receive | Transmit\n" - " face |bytes packets errs drop fifo frame|bytes packets errs drop fifo colls carrier\n"); + " face |bytes packets errs drop fifo frame|bytes packets errs drop fifo colls carrier multicast\n"); pos+=size; len+=size; @@ -1033,6 +1184,41 @@ int dev_get_info(char *buffer, char **start, off_t offset, int length, int dummy len=length; /* Ending slop */ return len; } + +static int dev_proc_stats(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + int len; + + len = sprintf(buffer, "%08x %08x %08x %08x %08x\n", + atomic_read(&netdev_rx_dropped), +#ifdef CONFIG_NET_HW_FLOWCONTROL + netdev_throttle_events, +#else + 0, +#endif +#ifdef CONFIG_NET_FASTROUTE + dev_fastroute_stat.hits, + dev_fastroute_stat.succeed, + dev_fastroute_stat.deferred +#else + 0, 0, 0 +#endif + ); + + len -= offset; + + if (len > length) + len = length; + if(len < 0) + len = 0; + + *start = buffer + offset; + *eof = 1; + + return len; +} + #endif /* CONFIG_PROC_FS */ @@ -1125,9 +1311,16 @@ void dev_set_promiscuity(struct device *dev, int inc) if ((dev->promiscuity += inc) == 0) dev->flags &= ~IFF_PROMISC; if (dev->flags^old_flags) { +#ifdef CONFIG_NET_FASTROUTE + if (dev->flags&IFF_PROMISC) { + netdev_fastroute_obstacles++; + dev_clear_fastroute(dev); + } else + netdev_fastroute_obstacles--; +#endif dev_mc_upload(dev); printk(KERN_INFO "device %s %s promiscuous mode\n", - dev->name, (dev->flags&IFF_PROMISC) ? "entered" : "leaved"); + dev->name, (dev->flags&IFF_PROMISC) ? "entered" : "left"); } } @@ -1305,6 +1498,16 @@ static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) ifr->ifr_ifindex = dev->ifindex; return 0; + case SIOCGIFTXQLEN: + ifr->ifr_qlen = dev->tx_queue_len; + return 0; + + case SIOCSIFTXQLEN: + if(ifr->ifr_qlen<2 || ifr->ifr_qlen>1024) + return -EINVAL; + dev->tx_queue_len = ifr->ifr_qlen; + return 0; + /* * Unknown or private ioctl */ @@ -1339,9 +1542,7 @@ int dev_ioctl(unsigned int cmd, void *arg) { struct ifreq ifr; int ret; -#ifdef CONFIG_NET_ALIAS char *colon; -#endif /* One special case: SIOCGIFCONF takes ifconf argument and requires shared lock, because it sleeps writing @@ -1350,9 +1551,9 @@ int dev_ioctl(unsigned int cmd, void *arg) if (cmd == SIOCGIFCONF) { rtnl_shlock(); - dev_ifconf((char *) arg); + ret = dev_ifconf((char *) arg); rtnl_shunlock(); - return 0; + return ret; } if (cmd == SIOCGIFCOUNT) { return dev_ifcount((unsigned int*)arg); @@ -1366,20 +1567,14 @@ int dev_ioctl(unsigned int cmd, void *arg) ifr.ifr_name[IFNAMSIZ-1] = 0; -#ifdef CONFIG_NET_ALIAS colon = strchr(ifr.ifr_name, ':'); if (colon) *colon = 0; -#endif /* * See which interface the caller is talking about. 
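The new SIOCGIFTXQLEN/SIOCSIFTXQLEN pair is driven with a plain ifreq; a user-space sketch, assuming the headers export the ifr_qlen alias:

	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <net/if.h>

	void tune_txqlen(int fd)
	{
		struct ifreq ifr;

		memset(&ifr, 0, sizeof(ifr));
		strncpy(ifr.ifr_name, "eth0", IFNAMSIZ);
		if (ioctl(fd, SIOCGIFTXQLEN, &ifr) == 0)
			printf("tx queue len: %d\n", ifr.ifr_qlen);
		ifr.ifr_qlen = 100;		/* must stay within 2..1024 */
		ioctl(fd, SIOCSIFTXQLEN, &ifr);	/* setting needs superuser */
	}
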
*/ -#ifdef CONFIG_KERNELD - dev_load(ifr.ifr_name); -#endif - switch(cmd) { /* @@ -1396,9 +1591,15 @@ int dev_ioctl(unsigned int cmd, void *arg) case SIOCGIFSLAVE: case SIOCGIFMAP: case SIOCGIFINDEX: + case SIOCGIFTXQLEN: + dev_load(ifr.ifr_name); ret = dev_ifsioc(&ifr, cmd); - if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq))) - return -EFAULT; + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + } return ret; /* @@ -1417,8 +1618,10 @@ int dev_ioctl(unsigned int cmd, void *arg) case SIOCADDMULTI: case SIOCDELMULTI: case SIOCSIFHWBROADCAST: + case SIOCSIFTXQLEN: if (!suser()) return -EPERM; + dev_load(ifr.ifr_name); rtnl_lock(); ret = dev_ifsioc(&ifr, cmd); rtnl_unlock(); @@ -1439,6 +1642,7 @@ int dev_ioctl(unsigned int cmd, void *arg) default: if (cmd >= SIOCDEVPRIVATE && cmd <= SIOCDEVPRIVATE + 15) { + dev_load(ifr.ifr_name); rtnl_lock(); ret = dev_ifsioc(&ifr, cmd); rtnl_unlock(); @@ -1448,6 +1652,7 @@ int dev_ioctl(unsigned int cmd, void *arg) } #ifdef CONFIG_NET_RADIO if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { + dev_load(ifr.ifr_name); if (IW_IS_SET(cmd)) { if (!suser()) return -EPERM; @@ -1466,7 +1671,7 @@ int dev_ioctl(unsigned int cmd, void *arg) } } -int dev_new_index() +int dev_new_index(void) { static int ifindex; for (;;) { @@ -1534,6 +1739,10 @@ int unregister_netdevice(struct device *dev) if (dev->flags & IFF_UP) dev_close(dev); +#ifdef CONFIG_NET_FASTROUTE + dev_clear_fastroute(dev); +#endif + /* Shutdown queueing discipline. */ dev_shutdown(dev); @@ -1579,11 +1788,10 @@ extern void sdla_setup(void); extern void dlci_setup(void); extern int dmascc_init(void); extern int sm_init(void); -extern int baycom_ser_fdx_init(void); -extern int baycom_ser_hdx_init(void); -extern int baycom_par_init(void); +extern int baycom_init(void); extern int lapbeth_init(void); extern void arcnet_init(void); +extern void ip_auto_config(void); #ifdef CONFIG_PROC_FS static struct proc_dir_entry proc_net_dev = { @@ -1649,14 +1857,8 @@ __initfunc(int net_dev_init(void)) #if defined(CONFIG_SDLA) sdla_setup(); #endif -#if defined(CONFIG_BAYCOM_PAR) - baycom_par_init(); -#endif -#if defined(CONFIG_BAYCOM_SER_FDX) - baycom_ser_fdx_init(); -#endif -#if defined(CONFIG_BAYCOM_SER_HDX) - baycom_ser_hdx_init(); +#if defined(CONFIG_BAYCOM) + baycom_init(); #endif #if defined(CONFIG_SOUNDMODEM) sm_init(); @@ -1680,7 +1882,14 @@ __initfunc(int net_dev_init(void)) slhc_install(); #endif - +#ifdef CONFIG_NET_PROFILE + net_profile_init(); + NET_PROFILE_REGISTER(dev_queue_xmit); + NET_PROFILE_REGISTER(net_bh); +#if 0 + NET_PROFILE_REGISTER(net_bh_skb); +#endif +#endif /* * Add the devices. * If the call to dev->init fails, the dev is removed @@ -1711,6 +1920,10 @@ __initfunc(int net_dev_init(void)) #ifdef CONFIG_PROC_FS proc_net_register(&proc_net_dev); + { + struct proc_dir_entry *ent = create_proc_entry("net/dev_stat", 0, 0); + ent->read_proc = dev_proc_stats; + } #endif #ifdef CONFIG_NET_RADIO @@ -1723,6 +1936,8 @@ __initfunc(int net_dev_init(void)) dev_boot_phase = 0; + dev_mcast_init(); + #ifdef CONFIG_IP_PNP ip_auto_config(); #endif diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c index eaa1bd058..a724497e0 100644 --- a/net/core/dev_mcast.c +++ b/net/core/dev_mcast.c @@ -19,7 +19,8 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
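The refcounted add/delete API below is what protocols use to pin device-level multicast addresses; a sketch with an illustrative Ethernet group address:

	static char toy_group[ETH_ALEN] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x01 };

	int toy_join(struct device *dev)
	{
		/* glbl=0: take an ordinary per-protocol reference */
		return dev_mc_add(dev, toy_group, ETH_ALEN, 0);
	}

	void toy_leave(struct device *dev)
	{
		dev_mc_delete(dev, toy_group, ETH_ALEN, 0);
	}
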
*/ - + +#include <linux/config.h> #include <asm/uaccess.h> #include <asm/system.h> #include <asm/bitops.h> @@ -37,6 +38,8 @@ #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> +#include <linux/proc_fs.h> +#include <linux/init.h> #include <net/ip.h> #include <net/route.h> #include <linux/skbuff.h> @@ -52,6 +55,9 @@ * that a casual user application can add/delete multicasts used by * protocols without doing damage to the protocols when it deletes the * entries. It also helps IP as it tracks overlapping maps. + * + * BUGGGG! IPv6 calls dev_mac_add/delete from BH, it means + * that all the functions in this file are racy. [NOT FIXED] --ANK */ @@ -82,64 +88,81 @@ void dev_mc_upload(struct device *dev) * Delete a device level multicast */ -void dev_mc_delete(struct device *dev, void *addr, int alen, int all) +int dev_mc_delete(struct device *dev, void *addr, int alen, int glbl) { - struct dev_mc_list **dmi; + struct dev_mc_list *dmi, **dmip; - for(dmi=&dev->mc_list;*dmi!=NULL;dmi=&(*dmi)->next) - { + for (dmip=&dev->mc_list; (dmi=*dmip)!=NULL; dmip=&dmi->next) { /* * Find the entry we want to delete. The device could * have variable length entries so check these too. */ - if(memcmp((*dmi)->dmi_addr,addr,(*dmi)->dmi_addrlen)==0 && alen==(*dmi)->dmi_addrlen) - { - struct dev_mc_list *tmp= *dmi; - if(--(*dmi)->dmi_users && !all) - return; + if (memcmp(dmi->dmi_addr,addr,dmi->dmi_addrlen)==0 && alen==dmi->dmi_addrlen) { + if (glbl) { + int old_glbl = dmi->dmi_gusers; + dmi->dmi_gusers = 0; + if (old_glbl == 0) + return -ENOENT; + } + if(--dmi->dmi_users) + return 0; + /* * Last user. So delete the entry. */ - *dmi=(*dmi)->next; + *dmip = dmi->next; dev->mc_count--; - kfree_s(tmp,sizeof(*tmp)); + kfree_s(dmi,sizeof(*dmi)); /* * We have altered the list, so the card * loaded filter is now wrong. Fix it */ dev_mc_upload(dev); - return; + return 0; } } + return -ENOENT; } /* * Add a device level multicast */ -void dev_mc_add(struct device *dev, void *addr, int alen, int newonly) +int dev_mc_add(struct device *dev, void *addr, int alen, int glbl) { struct dev_mc_list *dmi; - for(dmi=dev->mc_list;dmi!=NULL;dmi=dmi->next) - { - if(memcmp(dmi->dmi_addr,addr,dmi->dmi_addrlen)==0 && dmi->dmi_addrlen==alen) - { - if(!newonly) - dmi->dmi_users++; - return; + for(dmi=dev->mc_list; dmi!=NULL; dmi=dmi->next) { + if (memcmp(dmi->dmi_addr,addr,dmi->dmi_addrlen)==0 && dmi->dmi_addrlen==alen) { + if (glbl) { + int old_glbl = dmi->dmi_gusers; + dmi->dmi_gusers = 1; + if (old_glbl) + return 0; + } + dmi->dmi_users++; + return 0; } } - dmi=(struct dev_mc_list *)kmalloc(sizeof(*dmi),GFP_KERNEL); - if(dmi==NULL) - return; /* GFP_KERNEL so can't happen anyway */ + + /* GFP_ATOMIC!! It is used by IPv6 from interrupt, + when new address arrives. + + Particularly, it means that this part of code is weirdly + racy, and needs numerous *_bh_atomic --ANK + */ + dmi=(struct dev_mc_list *)kmalloc(sizeof(*dmi), GFP_ATOMIC); + if (dmi==NULL) + return -ENOBUFS; memcpy(dmi->dmi_addr, addr, alen); dmi->dmi_addrlen=alen; dmi->next=dev->mc_list; dmi->dmi_users=1; + dmi->dmi_gusers=glbl ? 
1 : 0; dev->mc_list=dmi; dev->mc_count++; dev_mc_upload(dev); + return 0; } /* @@ -148,13 +171,64 @@ void dev_mc_add(struct device *dev, void *addr, int alen, int newonly) void dev_mc_discard(struct device *dev) { - while(dev->mc_list!=NULL) - { + while (dev->mc_list!=NULL) { struct dev_mc_list *tmp=dev->mc_list; - dev->mc_list=dev->mc_list->next; - if (tmp->dmi_users) + dev->mc_list=tmp->next; + if (tmp->dmi_users > tmp->dmi_gusers) printk("dev_mc_discard: multicast leakage! dmi_users=%d\n", tmp->dmi_users); kfree_s(tmp,sizeof(*tmp)); } dev->mc_count=0; } + +#ifdef CONFIG_PROC_FS +static int dev_mc_read_proc(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + off_t pos=0, begin=0; + struct dev_mc_list *m; + int len=0; + struct device *dev; + + for (dev = dev_base; dev; dev = dev->next) { + for (m = dev->mc_list; m; m = m->next) { + int i; + + len += sprintf(buffer+len,"%-4d %-15s %-5d %-5d ", dev->ifindex, dev->name, + m->dmi_users, m->dmi_gusers); + + for (i=0; i<m->dmi_addrlen; i++) + len += sprintf(buffer+len, "%02x", m->dmi_addr[i]); + + len+=sprintf(buffer+len, "\n"); + + pos=begin+len; + if (pos < offset) { + len=0; + begin=pos; + } + if (pos > offset+length) + goto done; + } + } + *eof = 1; + +done: + *start=buffer+(offset-begin); + len-=(offset-begin); + if(len>length) + len=length; + return len; +} +#endif + +__initfunc(void dev_mcast_init(void)) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *ent; + + ent = create_proc_entry("net/dev_mcast", 0, 0); + ent->read_proc = dev_mc_read_proc; +#endif +} + diff --git a/net/core/dst.c b/net/core/dst.c index 8ebdb0bb5..e94ef2967 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -58,38 +58,43 @@ static void dst_run_gc(unsigned long dummy) dst_gc_timer_inc += DST_GC_INC; dst_gc_timer.expires = jiffies + dst_gc_timer_expires; #if RT_CACHE_DEBUG >= 2 - printk("dst_total: %d/%d/%d %ld\n", - atomic_read(&dst_total), delayed, - atomic_read(&hh_count), dst_gc_timer_expires); + printk("dst_total: %d/%d %ld\n", + atomic_read(&dst_total), delayed, dst_gc_timer_expires); #endif add_timer(&dst_gc_timer); } static int dst_discard(struct sk_buff *skb) { - kfree_skb(skb, FREE_READ); + kfree_skb(skb); return 0; } static int dst_blackhole(struct sk_buff *skb) { - kfree_skb(skb, FREE_WRITE); + kfree_skb(skb); return 0; } void * dst_alloc(int size, struct dst_ops * ops) { struct dst_entry * dst; + + if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) { + if (ops->gc()) + return NULL; + } dst = kmalloc(size, GFP_ATOMIC); if (!dst) return NULL; memset(dst, 0, size); dst->ops = ops; - atomic_set(&dst->refcnt, 1); + atomic_set(&dst->refcnt, 0); dst->lastuse = jiffies; dst->input = dst_discard; dst->output = dst_blackhole; atomic_inc(&dst_total); + atomic_inc(&ops->entries); return dst; } @@ -108,3 +113,25 @@ void __dst_free(struct dst_entry * dst) } end_bh_atomic(); } + +void dst_destroy(struct dst_entry * dst) +{ + struct neighbour *neigh = dst->neighbour; + struct hh_cache *hh = dst->hh; + + dst->hh = NULL; + if (hh && atomic_dec_and_test(&hh->hh_refcnt)) + kfree(hh); + + if (neigh) { + dst->neighbour = NULL; + neigh_release(neigh); + } + + atomic_dec(&dst->ops->entries); + + if (dst->ops->destroy) + dst->ops->destroy(dst); + atomic_dec(&dst_total); + kfree(dst); +} diff --git a/net/core/filter.c b/net/core/filter.c new file mode 100644 index 000000000..a60d8f1e5 --- /dev/null +++ b/net/core/filter.c @@ -0,0 +1,366 @@ +/* + * Linux Socket Filter - Kernel level socket filtering + * + * Author: + * Jay Schulist 
<Jay.Schulist@spacs.k12.wi.us> + * + * Based on the design of: + * - The Berkeley Packet Filter + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#if defined(CONFIG_FILTER) + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/fcntl.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/if_packet.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <linux/filter.h> + +/* + * Decode and apply filter instructions to the skb->data. + * Return length to keep, 0 for none. skb is the data we are + * filtering, filter is the array of filter instructions, and + * len is the number of filter blocks in the array. + */ + +int sk_run_filter(unsigned char *data, int len, struct sock_filter *filter, int flen) +{ + struct sock_filter *fentry; /* We walk down these */ + u32 A = 0; /* Accumulator */ + u32 X = 0; /* Index Register */ + u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ + int k; + int pc; + int *t; + + /* + * Process array of filter instructions. + */ + + for(pc = 0; pc < flen; pc++) + { + fentry = &filter[pc]; + if(fentry->code & BPF_X) + t=&X; + else + t=&fentry->k; + + switch(fentry->code) + { + case BPF_ALU|BPF_ADD|BPF_X: + case BPF_ALU|BPF_ADD|BPF_K: + A += *t; + continue; + + case BPF_ALU|BPF_SUB|BPF_X: + case BPF_ALU|BPF_SUB|BPF_K: + A -= *t; + continue; + + case BPF_ALU|BPF_MUL|BPF_X: + case BPF_ALU|BPF_MUL|BPF_K: + A *= *t; + continue; + + case BPF_ALU|BPF_DIV|BPF_X: + case BPF_ALU|BPF_DIV|BPF_K: + if(*t == 0) + return (0); + A /= *t; + continue; + + case BPF_ALU|BPF_AND|BPF_X: + case BPF_ALU|BPF_AND|BPF_K: + A &= *t; + continue; + + case BPF_ALU|BPF_OR|BPF_X: + case BPF_ALU|BPF_OR|BPF_K: + A |= *t; + continue; + + case BPF_ALU|BPF_LSH|BPF_X: + case BPF_ALU|BPF_LSH|BPF_K: + A <<= *t; + continue; + + case BPF_ALU|BPF_RSH|BPF_X: + case BPF_ALU|BPF_RSH|BPF_K: + A >>= *t; + continue; + + case BPF_ALU|BPF_NEG: + A = -A; + continue; + + case BPF_JMP|BPF_JA: + pc += fentry->k; + continue; + + case BPF_JMP|BPF_JGT|BPF_K: + pc += (A > fentry->k) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JGE|BPF_K: + pc += (A >= fentry->k) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JEQ|BPF_K: + pc += (A == fentry->k) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JSET|BPF_K: + pc += (A & fentry->k) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JGT|BPF_X: + pc += (A > X) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JGE|BPF_X: + pc += (A >= X) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JEQ|BPF_X: + pc += (A == X) ? fentry->jt : fentry->jf; + continue; + + case BPF_JMP|BPF_JSET|BPF_X: + pc += (A & X) ? 
fentry->jt : fentry->jf; + continue; + case BPF_LD|BPF_W|BPF_ABS: + k = fentry->k; + if(k + sizeof(long) > len) + return (0); + A = ntohl(*(long*)&data[k]); + continue; + + case BPF_LD|BPF_H|BPF_ABS: + k = fentry->k; + if(k + sizeof(short) > len) + return (0); + A = ntohs(*(short*)&data[k]); + continue; + + case BPF_LD|BPF_B|BPF_ABS: + k = fentry->k; + if(k >= len) + return (0); + A = data[k]; + continue; + + case BPF_LD|BPF_W|BPF_LEN: + A = len; + continue; + + case BPF_LDX|BPF_W|BPF_LEN: + X = len; + continue; + + case BPF_LD|BPF_W|BPF_IND: + k = X + fentry->k; + if(k + sizeof(u32) > len) + return (0); + A = ntohl(*(u32 *)&data[k]); + continue; + + case BPF_LD|BPF_H|BPF_IND: + k = X + fentry->k; + if(k + sizeof(u16) > len) + return (0); + A = ntohs(*(u16*)&data[k]); + continue; + + case BPF_LD|BPF_B|BPF_IND: + k = X + fentry->k; + if(k >= len) + return (0); + A = data[k]; + continue; + + case BPF_LDX|BPF_B|BPF_MSH: + /* + * Hack for BPF to handle TOS etc + */ + k = fentry->k; + if(k >= len) + return (0); + X = (data[fentry->k] & 0xf) << 2; + continue; + + case BPF_LD|BPF_IMM: + A = fentry->k; + continue; + + case BPF_LDX|BPF_IMM: + X = fentry->k; + continue; + + case BPF_LD|BPF_MEM: + A = mem[fentry->k]; + continue; + + case BPF_LDX|BPF_MEM: + X = mem[fentry->k]; + continue; + + case BPF_MISC|BPF_TAX: + X = A; + continue; + + case BPF_MISC|BPF_TXA: + A = X; + continue; + + case BPF_RET|BPF_K: + return ((unsigned int)fentry->k); + + case BPF_RET|BPF_A: + return ((unsigned int)A); + + case BPF_ST: + mem[fentry->k] = A; + continue; + + case BPF_STX: + mem[fentry->k] = X; + continue; + + + + default: + /* Invalid instruction counts as RET */ + return (0); + } + } + + printk(KERN_ERR "Filter ruleset ran off the end.\n"); + return (0); +} + +/* + * Check the user's filter code. If we let some ugly + * filter code slip through kaboom! + */ + +int sk_chk_filter(struct sock_filter *filter, int flen) +{ + struct sock_filter *ftest; + int pc; + + /* + * Check the filter code now. + */ + for(pc = 0; pc < flen; pc++) + { + /* + * All jumps are forward as they are not signed + */ + + ftest = &filter[pc]; + if(BPF_CLASS(ftest->code) == BPF_JMP) + { + /* + * But they mustn't jump off the end. + */ + if(BPF_OP(ftest->code) == BPF_JA) + { + if(pc + ftest->k + 1>= (unsigned)flen) + return (-EINVAL); + } + else + { + /* + * For conditionals both must be safe + */ + if(pc + ftest->jt +1 >= flen || pc + ftest->jf +1 >= flen) + return (-EINVAL); + } + } + + /* + * Check that memory operations use valid addresses. + */ + + if(ftest->k <0 || ftest->k >= BPF_MEMWORDS) + { + /* + * But it might not be a memory operation... + */ + + if (BPF_CLASS(ftest->code) == BPF_ST) + return -EINVAL; + if((BPF_CLASS(ftest->code) == BPF_LD) && + (BPF_MODE(ftest->code) == BPF_MEM)) + return (-EINVAL); + } + } + + /* + * The program must end with a return. We don't care where they + * jumped within the script (its always forwards) but in the + * end they _will_ hit this. + */ + + return (BPF_CLASS(filter[flen - 1].code) == BPF_RET)?0:-EINVAL; +} + +/* + * Attach the user's filter code. We first run some sanity checks on + * it to make sure it does not explode on us later. + */ + +int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) +{ + struct sock_filter *fp, *old_filter; + int fsize = sizeof(struct sock_filter) * fprog->len; + int err; + + /* Make sure new filter is there and in the right amounts. 
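User space reaches the checker and sk_attach_filter() below through setsockopt(SO_ATTACH_FILTER). A minimal program that keeps IPv4 frames and drops everything else, assuming classic Ethernet framing:

	#include <sys/socket.h>
	#include <linux/filter.h>

	int attach_ipv4_filter(int fd)
	{
		static struct sock_filter prog[] = {
			{ BPF_LD  | BPF_H   | BPF_ABS, 0, 0, 12     }, /* A = ethertype */
			{ BPF_JMP | BPF_JEQ | BPF_K,   0, 1, 0x0800 }, /* IPv4? */
			{ BPF_RET | BPF_K,             0, 0, 0xffff }, /* keep */
			{ BPF_RET | BPF_K,             0, 0, 0      }, /* drop */
		};
		struct sock_fprog fprog = { 4, prog };

		return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
				  &fprog, sizeof(fprog));
	}
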
*/ + if(fprog->filter == NULL || fprog->len == 0 || fsize > BPF_MAXINSNS) + return (-EINVAL); + + if((err = sk_chk_filter(fprog->filter, fprog->len))==0) + { + /* If existing filter, remove it first */ + if(sk->filter) + { + old_filter = sk->filter_data; + kfree_s(old_filter, (sizeof(old_filter) * sk->filter)); + sk->filter_data = NULL; + } + + fp = (struct sock_filter *)kmalloc(fsize, GFP_KERNEL); + if(fp == NULL) + return (-ENOMEM); + + memset(fp,0,sizeof(*fp)); + memcpy(fp, fprog->filter, fsize); /* Copy instructions */ + + sk->filter = fprog->len; /* Number of filter blocks */ + sk->filter_data = fp; /* Filter instructions */ + } + + return (err); +} +#endif /* CONFIG_FILTER */ diff --git a/net/core/firewall.c b/net/core/firewall.c index 44e0709cf..5d685b0d2 100644 --- a/net/core/firewall.c +++ b/net/core/firewall.c @@ -6,7 +6,6 @@ * much hacked by: Alan Cox */ -#include <linux/config.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/firewall.h> diff --git a/net/core/iovec.c b/net/core/iovec.c index bff328b19..18a9a3b5b 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -26,13 +26,7 @@ #include <linux/in6.h> #include <asm/uaccess.h> #include <asm/byteorder.h> -#include <asm/checksum.h> - -extern inline int min(int x, int y) -{ - return x>y?y:x; -} - +#include <net/checksum.h> /* * Verify iovec @@ -44,9 +38,8 @@ extern inline int min(int x, int y) int verify_iovec(struct msghdr *m, struct iovec *iov, char *address, int mode) { - int err=0; - int len=0; - int ct; + int size = m->msg_iovlen * sizeof(struct iovec); + int err, ct; if(m->msg_namelen) { @@ -54,7 +47,7 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, char *address, int mode) { err=move_addr_to_kernel(m->msg_name, m->msg_namelen, address); if(err<0) - return err; + goto out; } m->msg_name = address; @@ -63,24 +56,26 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, char *address, int mode) if (m->msg_iovlen > UIO_FASTIOV) { - iov = kmalloc(m->msg_iovlen*sizeof(struct iovec), GFP_KERNEL); + err = -ENOMEM; + iov = kmalloc(size, GFP_KERNEL); if (!iov) - return -ENOMEM; + goto out; } - err = copy_from_user(iov, m->msg_iov, sizeof(struct iovec)*m->msg_iovlen); - if (err) - { - if (m->msg_iovlen > UIO_FASTIOV) - kfree(iov); - return -EFAULT; - } + if (copy_from_user(iov, m->msg_iov, size)) + goto out_free; + m->msg_iov=iov; - for(ct=0;ct<m->msg_iovlen;ct++) - len+=iov[ct].iov_len; + for (err = 0, ct = 0; ct < m->msg_iovlen; ct++) + err += iov[ct].iov_len; +out: + return err; - m->msg_iov=iov; - return len; +out_free: + err = -EFAULT; + if (m->msg_iovlen > UIO_FASTIOV) + kfree(iov); + goto out; } /* @@ -89,15 +84,15 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, char *address, int mode) int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) { - int err; + int err = -EFAULT; + while(len>0) { if(iov->iov_len) { - int copy = min(iov->iov_len,len); - err = copy_to_user(iov->iov_base,kdata,copy); - if (err) - return err; + int copy = min(iov->iov_len, len); + if (copy_to_user(iov->iov_base, kdata, copy)) + goto out; kdata+=copy; len-=copy; iov->iov_len-=copy; @@ -105,7 +100,9 @@ int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) } iov++; } - return 0; + err = 0; +out: + return err; } /* @@ -114,17 +111,15 @@ int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) { - int err; + int err = -EFAULT; + while(len>0) { if(iov->iov_len) { - int copy=min(len,iov->iov_len); - err = 
copy_from_user(kdata, iov->iov_base, copy); - if (err) - { - return -EFAULT; - } + int copy = min(len, iov->iov_len); + if (copy_from_user(kdata, iov->iov_base, copy)) + goto out; len-=copy; kdata+=copy; iov->iov_base+=copy; @@ -132,7 +127,9 @@ int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) } iov++; } - return 0; + err = 0; +out: + return err; } @@ -143,28 +140,23 @@ int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, int len) { - int err; + int err = -EFAULT; + while(offset>0) { if (offset > iov->iov_len) { offset -= iov->iov_len; - } else { - u8 *base; - int copy; + u8 *base = iov->iov_base + offset; + int copy = min(len, iov->iov_len - offset); - base = iov->iov_base + offset; - copy = min(len, iov->iov_len - offset); offset = 0; - err = copy_from_user(kdata, base, copy); - if (err) - { - return -EFAULT; - } + if (copy_from_user(kdata, base, copy)) + goto out; len-=copy; kdata+=copy; } @@ -173,17 +165,17 @@ int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, while (len>0) { - int copy=min(len, iov->iov_len); - err = copy_from_user(kdata, iov->iov_base, copy); - if (err) - { - return -EFAULT; - } + int copy = min(len, iov->iov_len); + + if (copy_from_user(kdata, iov->iov_base, copy)) + goto out; len-=copy; kdata+=copy; iov++; } - return 0; + err = 0; +out: + return err; } /* @@ -206,25 +198,28 @@ int csum_partial_copy_fromiovecend(unsigned char *kdata, do { int copy = iov->iov_len - offset; - if (copy >= 0) { + if (copy > 0) { u8 *base = iov->iov_base + offset; /* Normal case (single iov component) is fastly detected */ if (len <= copy) { - *csump = csum_partial_copy_from_user(base, kdata, - len, *csump, &err); - return err; + *csump = csum_and_copy_from_user(base, kdata, + len, *csump, &err); + goto out; } partial_cnt = copy % 4; if (partial_cnt) { copy -= partial_cnt; - err |= copy_from_user(kdata+copy, base+copy, partial_cnt); + if (copy_from_user(kdata + copy, base + copy, + partial_cnt)) + goto out_fault; } - *csump = csum_partial_copy_from_user(base, kdata, - copy, *csump, &err); - + *csump = csum_and_copy_from_user(base, kdata, copy, + *csump, &err); + if (err) + goto out; len -= copy + partial_cnt; kdata += copy + partial_cnt; iov++; @@ -236,19 +231,11 @@ int csum_partial_copy_fromiovecend(unsigned char *kdata, csum = *csump; - while (len>0) + while (len > 0) { u8 *base = iov->iov_base; unsigned int copy = min(len, iov->iov_len); - /* FIXME: more sanity checking is needed here, because - * the iovs are copied from the user. - */ - if (base == NULL) { - printk(KERN_DEBUG "%s: iov too short\n",current->comm); - return -EINVAL; - } - /* There is a remnant from previous iov. */ if (partial_cnt) { @@ -256,23 +243,26 @@ int csum_partial_copy_fromiovecend(unsigned char *kdata, /* iov component is too short ... 
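For contrast with this checksum-and-copy variant, the plain helpers earlier in the file are what a sendmsg() path typically calls; a sketch assuming the caller has already sized the skb:

	static int toy_fill_skb(struct sk_buff *skb, struct msghdr *msg, int len)
	{
		/* copies from user space; 0 on success, -EFAULT on a bad iovec */
		return memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
	}
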
*/ if (par_len > copy) { - err |= copy_from_user(kdata, base, copy); + if (copy_from_user(kdata, base, copy)) + goto out_fault; + kdata += copy; base += copy; partial_cnt += copy; - kdata += copy; len -= copy; iov++; if (len) continue; - *csump = csum_partial(kdata-partial_cnt, partial_cnt, csum); - return err; + *csump = csum_partial(kdata - partial_cnt, + partial_cnt, csum); + goto out; } - err |= copy_from_user(kdata, base, par_len); - csum = csum_partial(kdata-partial_cnt, 4, csum); + if (copy_from_user(kdata, base, par_len)) + goto out_fault; + csum = csum_partial(kdata - partial_cnt, 4, csum); + kdata += par_len; base += par_len; copy -= par_len; len -= par_len; - kdata += par_len; partial_cnt = 0; } @@ -282,18 +272,31 @@ int csum_partial_copy_fromiovecend(unsigned char *kdata, if (partial_cnt) { copy -= partial_cnt; - err |= copy_from_user(kdata+copy, base + copy, partial_cnt); + if (copy_from_user(kdata + copy, base + copy, + partial_cnt)) + goto out_fault; } } - if (copy == 0) + /* Why do we want to break?? There may be more to copy ... */ + if (copy == 0) { +if (len > partial_cnt) +printk("csum_iovec: early break? len=%d, partial=%d\n", len, partial_cnt); break; + } - csum = csum_partial_copy_from_user(base, kdata, copy, csum, &err); + csum = csum_and_copy_from_user(base, kdata, copy, csum, &err); + if (err) + goto out; len -= copy + partial_cnt; kdata += copy + partial_cnt; iov++; } *csump = csum; +out: return err; + +out_fault: + err = -EFAULT; + goto out; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 427189234..3de3743e0 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1,8 +1,9 @@ /* - * Generic address resultion entity + * Generic address resolution entity * * Authors: - * Pedro Roque <roque@di.fc.ul.pt> + * Pedro Roque <roque@di.fc.ul.pt> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -10,144 +11,293 @@ * 2 of the License, or (at your option) any later version. */ +#include <linux/config.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/socket.h> #include <linux/sched.h> #include <linux/netdevice.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif #include <net/neighbour.h> +#include <net/dst.h> +#include <linux/rtnetlink.h> +#define NEIGH_DEBUG 1 -static void neigh_purge_send_q(struct neighbour *neigh); +#define NEIGH_PRINTK(x...) printk(x) +#define NEIGH_NOPRINTK(x...) 
do { ; } while(0) +#define NEIGH_PRINTK0 NEIGH_PRINTK +#define NEIGH_PRINTK1 NEIGH_NOPRINTK +#define NEIGH_PRINTK2 NEIGH_NOPRINTK -void neigh_table_init(struct neigh_table *tbl, struct neigh_ops *ops, int size) -{ - int bmemlen; +#if NEIGH_DEBUG >= 1 +#undef NEIGH_PRINTK1 +#define NEIGH_PRINTK1 NEIGH_PRINTK +#endif +#if NEIGH_DEBUG >= 2 +#undef NEIGH_PRINTK2 +#define NEIGH_PRINTK2 NEIGH_PRINTK +#endif - memset(tbl, 0, sizeof(struct neigh_table)); - - tbl->tbl_size = size; - tbl->neigh_ops = ops; - - /* - * This should only be called on initialization - * And interrupts should be on - */ +static void neigh_timer_handler(unsigned long arg); +#ifdef CONFIG_ARPD +static void neigh_app_notify(struct neighbour *n); +#endif - bmemlen = size * sizeof(struct neighbour *); - tbl->hash_buckets = kmalloc(bmemlen, GFP_KERNEL); +static int neigh_glbl_allocs; +static struct neigh_table *neigh_tables; - if (tbl->hash_buckets == NULL) - { - panic("unable to initialize neigh_table"); - } +static int neigh_blackhole(struct sk_buff *skb) +{ + kfree_skb(skb); + return -ENETDOWN; +} + +/* + * It is random distribution in the interval (1/2)*base...(3/2)*base. + * It corresponds to default IPv6 settings and is not overridable, + * because it is really reasonbale choice. + */ - memset(tbl->hash_buckets, 0, bmemlen); +unsigned long neigh_rand_reach_time(unsigned long base) +{ + return (net_random() % base) + (base>>1); } -struct neighbour *neigh_alloc(int size, struct neigh_ops *ops) + +static int neigh_forced_gc(struct neigh_table *tbl) { - struct neighbour *neigh; - - neigh = kmalloc(size, GFP_ATOMIC); - if (neigh == NULL) - { - return NULL; - } + int shrunk = 0; + int i; + + if (atomic_read(&tbl->lock)) + return 0; - memset(neigh, 0, size); + for (i=0; i<=NEIGH_HASHMASK; i++) { + struct neighbour *n, **np; + + np = &tbl->hash_buckets[i]; + while ((n = *np) != NULL) { + if (atomic_read(&n->refcnt) == 0 && + !(n->nud_state&NUD_PERMANENT)) { + *np = n->next; + n->tbl = NULL; + tbl->entries--; + shrunk = 1; + neigh_destroy(n); + continue; + } + np = &n->next; + } + } - skb_queue_head_init(&neigh->arp_queue); - neigh->ops = ops; - return neigh; + tbl->last_flush = jiffies; + return shrunk; } -void neigh_queue_ins(struct neigh_table *tbl, struct neighbour *neigh) +int neigh_ifdown(struct neigh_table *tbl, struct device *dev) { - struct neighbour *entry, **head; - entry = tbl->request_queue; + int i; - head = &tbl->request_queue; - - for (; entry; entry = entry->next) - { - head = &entry->next; + if (atomic_read(&tbl->lock)) { + NEIGH_PRINTK1("neigh_ifdown: impossible event 1763\n"); + return -EBUSY; + } + + start_bh_atomic(); + for (i=0; i<=NEIGH_HASHMASK; i++) { + struct neighbour *n, **np; + + np = &tbl->hash_buckets[i]; + while ((n = *np) != NULL) { + if (dev && n->dev != dev) { + np = &n->next; + continue; + } + *np = n->next; + n->tbl = NULL; + tbl->entries--; + if (atomic_read(&n->refcnt)) { + /* The most unpleasant situation. + We must destroy neighbour entry, + but someone still uses it. + + The destroy will be delayed until + the last user releases us, but + we must kill timers etc. and move + it to safe state. 
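Entries in the hash below are handed out referenced by __neigh_lookup(); a resolve-side sketch (arp_tbl is IPv4's table, the key an illustrative address):

	static void toy_resolve(u32 daddr, struct device *dev)
	{
		struct neighbour *n;

		n = __neigh_lookup(&arp_tbl, &daddr, dev, 1); /* 1: create */
		if (n != NULL) {
			/* use n->ha, queue packets, etc. */
			neigh_release(n);	/* drop the reference */
		}
	}
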
+ */ + if (n->nud_state & NUD_IN_TIMER) + del_timer(&n->timer); + n->parms = &tbl->parms; + skb_queue_purge(&n->arp_queue); + n->output = neigh_blackhole; + if (n->nud_state&NUD_VALID) + n->nud_state = NUD_NOARP; + else + n->nud_state = NUD_NONE; + NEIGH_PRINTK2("neigh %p is stray.\n", n); + } else + neigh_destroy(n); + } } - *head = neigh; - neigh->next = neigh->prev = NULL; + del_timer(&tbl->proxy_timer); + skb_queue_purge(&tbl->proxy_queue); + end_bh_atomic(); + return 0; } -static struct neighbour *neigh_dequeue(struct neigh_table *tbl) +static struct neighbour *neigh_alloc(struct neigh_table *tbl, int creat) { - struct neighbour *neigh; + struct neighbour *n; - if ((neigh = tbl->request_queue)) - { - tbl->request_queue = neigh->next; + if (tbl->entries > tbl->gc_thresh1) { + if (creat < 0) + return NULL; + if (tbl->entries > tbl->gc_thresh2 || + jiffies - tbl->last_flush > 5*HZ) { + if (neigh_forced_gc(tbl) == 0 && + tbl->entries > tbl->gc_thresh3) + return NULL; + } } - return neigh; + + n = kmalloc(tbl->entry_size, GFP_ATOMIC); + if (n == NULL) + return NULL; + + memset(n, 0, tbl->entry_size); + + skb_queue_head_init(&n->arp_queue); + n->updated = n->used = jiffies; + n->nud_state = NUD_NONE; + n->output = neigh_blackhole; + n->parms = &tbl->parms; + init_timer(&n->timer); + n->timer.function = neigh_timer_handler; + n->timer.data = (unsigned long)n; + tbl->stats.allocs++; + neigh_glbl_allocs++; + return n; } -void neigh_table_ins(struct neigh_table *tbl, struct neighbour *neigh) + +struct neighbour * __neigh_lookup(struct neigh_table *tbl, const void *pkey, + struct device *dev, int creat) { - unsigned int hash_val; - struct neighbour **head; - - hash_val = tbl->neigh_ops->hash(neigh->primary_key) % tbl->tbl_size; - - neigh->tbl = tbl; - - head = &tbl->hash_buckets[hash_val]; - - if (!(*head)) - { - neigh->next = neigh; - neigh->prev = neigh; + struct neighbour *n; + u32 hash_val; + int key_len = tbl->key_len; + + hash_val = *(u32*)(pkey + key_len - 4); + hash_val ^= (hash_val>>16); + hash_val ^= hash_val>>8; + hash_val ^= hash_val>>3; + hash_val = (hash_val^dev->ifindex)&NEIGH_HASHMASK; + + for (n = tbl->hash_buckets[hash_val]; n; n = n->next) { + if (dev == n->dev && + memcmp(n->primary_key, pkey, key_len) == 0) { + atomic_inc(&n->refcnt); + return n; + } } - else - { - struct neighbour *prev; - struct neighbour *next; - - next = *head; - prev = next->prev; - + if (!creat) + return NULL; + + n = neigh_alloc(tbl, creat); + if (n == NULL) + return NULL; - neigh->next = next; - neigh->prev = prev; - next->prev = neigh; - prev->next = neigh; + memcpy(n->primary_key, pkey, key_len); + n->dev = dev; + + /* Protocol specific setup. */ + if (tbl->constructor && tbl->constructor(n) < 0) { + neigh_destroy(n); + return NULL; } - - *head = neigh; + + /* Device specific setup. 
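The protocol-specific setup invoked just above comes from tbl->constructor; a sketch of what one usually wires, with the ops table hypothetical:

	static int toy_neigh_construct(struct neighbour *n)
	{
		n->ops = &toy_neigh_ops;	/* hypothetical neigh_ops */
		n->output = n->ops->output;	/* start on the slow path */
		return 0;			/* non-zero aborts creation */
	}
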
*/ + if (n->parms && n->parms->neigh_setup && n->parms->neigh_setup(n) < 0) { + neigh_destroy(n); + return NULL; + } + + n->confirmed = jiffies - (n->parms->base_reachable_time<<1); + atomic_set(&n->refcnt, 1); + tbl->entries++; + n->next = tbl->hash_buckets[hash_val]; + tbl->hash_buckets[hash_val] = n; + n->tbl = tbl; + NEIGH_PRINTK2("neigh %p is created.\n", n); + return n; } -struct neighbour * neigh_lookup(struct neigh_table *tbl, void *pkey, - int key_len, struct device *dev) +struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey, + struct device *dev, int creat) { - struct neighbour *neigh, *head; - unsigned int hash_val; - - hash_val = tbl->neigh_ops->hash(pkey) % tbl->tbl_size; - head = tbl->hash_buckets[hash_val]; + struct pneigh_entry *n; + u32 hash_val; + int key_len = tbl->key_len; - neigh = head; + hash_val = *(u32*)(pkey + key_len - 4); + hash_val ^= (hash_val>>16); + hash_val ^= hash_val>>8; + hash_val ^= hash_val>>4; + hash_val &= PNEIGH_HASHMASK; - if (neigh) - { - do { - if (memcmp(neigh->primary_key, pkey, key_len) == 0) - { - if (!dev || dev == neigh->dev) - return neigh; - } - neigh = neigh->next; + for (n = tbl->phash_buckets[hash_val]; n; n = n->next) { + if (memcmp(n->key, pkey, key_len) == 0 && + (n->dev == dev || !n->dev)) + return n; + } + if (!creat) + return NULL; + + n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL); + if (n == NULL) + return NULL; + + memcpy(n->key, pkey, key_len); + n->dev = dev; - } while (neigh != head); + if (tbl->pconstructor && tbl->pconstructor(n)) { + kfree(n); + return NULL; } - return NULL; + n->next = tbl->phash_buckets[hash_val]; + tbl->phash_buckets[hash_val] = n; + return n; +} + + +int pneigh_delete(struct neigh_table *tbl, const void *pkey, struct device *dev) +{ + struct pneigh_entry *n, **np; + u32 hash_val; + int key_len = tbl->key_len; + + hash_val = *(u32*)(pkey + key_len - 4); + hash_val ^= (hash_val>>16); + hash_val ^= hash_val>>8; + hash_val ^= hash_val>>4; + hash_val &= PNEIGH_HASHMASK; + + for (np = &tbl->phash_buckets[hash_val]; (n=*np) != NULL; np = &n->next) { + if (memcmp(n->key, pkey, key_len) == 0 && n->dev == dev) { + *np = n->next; + if (tbl->pdestructor) + tbl->pdestructor(n); + kfree(n); + return 0; + } + } + return -ENOENT; } /* @@ -156,132 +306,991 @@ struct neighbour * neigh_lookup(struct neigh_table *tbl, void *pkey, */ void neigh_destroy(struct neighbour *neigh) { - if (neigh->tbl) - { - printk(KERN_DEBUG "neigh_destroy: neighbour still in table. " - "called from %p\n", __builtin_return_address(0)); + struct hh_cache *hh; + + if (neigh->tbl || atomic_read(&neigh->refcnt)) { + NEIGH_PRINTK1("neigh_destroy: neighbour is use tbl=%p, ref=%d: " + "called from %p\n", neigh->tbl, atomic_read(&neigh->refcnt), __builtin_return_address(0)); + return; } - if (neigh->ops->destructor) - { - (neigh->ops->destructor)(neigh); + if (neigh->nud_state&NUD_IN_TIMER) + del_timer(&neigh->timer); + + while ((hh = neigh->hh) != NULL) { + neigh->hh = hh->hh_next; + hh->hh_next = NULL; + hh->hh_output = neigh_blackhole; + if (atomic_dec_and_test(&hh->hh_refcnt)) + kfree(hh); } - neigh_purge_send_q(neigh); + if (neigh->ops && neigh->ops->destructor) + (neigh->ops->destructor)(neigh); + + skb_queue_purge(&neigh->arp_queue); + + NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); + neigh_glbl_allocs--; kfree(neigh); } -void neigh_unlink(struct neighbour *neigh) +/* Neighbour state is suspicious; + disable fast path. 
+ */ +static void neigh_suspect(struct neighbour *neigh) { - struct neigh_table *tbl; - struct neighbour **head; - unsigned int hash_val; - struct neighbour *next, *prev; - - tbl = neigh->tbl; - neigh->tbl = NULL; + struct hh_cache *hh; - hash_val = neigh->ops->hash(neigh->primary_key) % tbl->tbl_size; + NEIGH_PRINTK2("neigh %p is suspecteded.\n", neigh); - head = &tbl->hash_buckets[hash_val]; - tbl->tbl_entries--; + neigh->output = neigh->ops->output; - next = neigh->next; - if (neigh == (*head)) - { - if (next == neigh) - { - *head = NULL; - goto out; - } - *head = next; - } - - prev = neigh->prev; - next->prev = prev; - prev->next = next; - out: - neigh->next = neigh->prev = NULL; + for (hh = neigh->hh; hh; hh = hh->hh_next) + hh->hh_output = neigh->ops->output; +} + +/* Neighbour state is OK; + enable fast path. + */ +static void neigh_connect(struct neighbour *neigh) +{ + struct hh_cache *hh; + + NEIGH_PRINTK2("neigh %p is connected.\n", neigh); + + neigh->output = neigh->ops->connected_output; + + for (hh = neigh->hh; hh; hh = hh->hh_next) + hh->hh_output = neigh->ops->hh_output; } /* - * Must only be called with an exclusive lock and bh disabled - * + Transitions NUD_STALE <-> NUD_REACHABLE do not occur + when fast path is built: we have no timers assotiated with + these states, we do not have time to check state when sending. + neigh_periodic_timer check periodically neigh->confirmed + time and moves NUD_REACHABLE -> NUD_STALE. + + If a routine wants to know TRUE entry state, it calls + neigh_sync before checking state. */ -void ntbl_walk_table(struct neigh_table *tbl, ntbl_examine_t func, - unsigned long filter, int max, void *args) +static void neigh_sync(struct neighbour *n) { + unsigned long now = jiffies; + u8 state = n->nud_state; + + if (state&(NUD_NOARP|NUD_PERMANENT)) + return; + if (state&NUD_REACHABLE) { + if (now - n->confirmed > n->parms->reachable_time) { + n->nud_state = NUD_STALE; + neigh_suspect(n); + } + } else if (state&NUD_VALID) { + if (now - n->confirmed < n->parms->reachable_time) { + if (state&NUD_IN_TIMER) + del_timer(&n->timer); + n->nud_state = NUD_REACHABLE; + neigh_connect(n); + } + } +} + +static void neigh_periodic_timer(unsigned long arg) +{ + struct neigh_table *tbl = (struct neigh_table*)arg; + unsigned long now = jiffies; int i; - if (max == 0) - max = tbl->tbl_size; + if (atomic_read(&tbl->lock)) { + tbl->gc_timer.expires = now + 1*HZ; + add_timer(&tbl->gc_timer); + return; + } + + /* + * periodicly recompute ReachableTime from random function + */ + + if (now - tbl->last_rand > 300*HZ) { + struct neigh_parms *p; + tbl->last_rand = now; + for (p=&tbl->parms; p; p = p->next) + p->reachable_time = neigh_rand_reach_time(p->base_reachable_time); + } + + for (i=0; i <= NEIGH_HASHMASK; i++) { + struct neighbour *n, **np; - for (i=0; i < max; i++) - { - struct neighbour **head; - struct neighbour *entry; + np = &tbl->hash_buckets[i]; + while ((n = *np) != NULL) { + unsigned state = n->nud_state; - head = &tbl->hash_buckets[i]; - entry = *head; + if (state&(NUD_PERMANENT|NUD_IN_TIMER)) + goto next_elt; - if (!entry) - continue; + if ((long)(n->used - n->confirmed) < 0) + n->used = n->confirmed; + + if (atomic_read(&n->refcnt) == 0 && + (state == NUD_FAILED || now - n->used > n->parms->gc_staletime)) { + *np = n->next; + n->tbl = NULL; + n->next = NULL; + tbl->entries--; + neigh_destroy(n); + continue; + } + + if (n->nud_state&NUD_REACHABLE && + now - n->confirmed > n->parms->reachable_time) { + n->nud_state = NUD_STALE; + neigh_suspect(n); + } + 
+next_elt: + np = &n->next; + } + } + + tbl->gc_timer.expires = now + tbl->gc_interval; + add_timer(&tbl->gc_timer); +} + +static __inline__ int neigh_max_probes(struct neighbour *n) +{ + struct neigh_parms *p = n->parms; + return p->ucast_probes + p->app_probes + p->mcast_probes; +} + + +/* Called when a timer expires for a neighbour entry. */ - do { - if (entry->flags & (~filter)) - { - int ret; - ret = (*func)(entry, args); +static void neigh_timer_handler(unsigned long arg) +{ + unsigned long now = jiffies; + struct neighbour *neigh = (struct neighbour*)arg; + unsigned state = neigh->nud_state; - if (ret) - { - struct neighbour *curp; + if (!(state&NUD_IN_TIMER)) { + NEIGH_PRINTK1("neigh: timer & !nud_in_timer\n"); + return; + } - curp = entry; - entry = curp->next; + if ((state&NUD_VALID) && + now - neigh->confirmed < neigh->parms->reachable_time) { + neigh->nud_state = NUD_REACHABLE; + NEIGH_PRINTK2("neigh %p is still alive.\n", neigh); + neigh_connect(neigh); + return; + } + if (state == NUD_DELAY) { + NEIGH_PRINTK2("neigh %p is probed.\n", neigh); + neigh->nud_state = NUD_PROBE; + neigh->probes = 0; + } + + if (neigh->probes >= neigh_max_probes(neigh)) { + struct sk_buff *skb; + + neigh->nud_state = NUD_FAILED; + neigh->tbl->stats.res_failed++; + NEIGH_PRINTK2("neigh %p is failed.\n", neigh); + + /* It is very thin place. report_unreachable is very complicated + routine. Particularly, it can hit the same neighbour entry! + + So that, we try to be accurate and avoid dead loop. --ANK + */ + while(neigh->nud_state==NUD_FAILED && (skb=__skb_dequeue(&neigh->arp_queue)) != NULL) + neigh->ops->error_report(neigh, skb); + skb_queue_purge(&neigh->arp_queue); + return; + } - neigh_unlink(curp); - neigh_destroy(curp); + neigh->probes++; + neigh->timer.expires = now + neigh->parms->retrans_time; + add_timer(&neigh->timer); - if ((*head) == NULL) - break; - continue; + neigh->ops->solicit(neigh, skb_peek(&neigh->arp_queue)); +} + +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +{ + start_bh_atomic(); + if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE))) { + if (!(neigh->nud_state&(NUD_STALE|NUD_INCOMPLETE))) { + if (neigh->tbl == NULL) { + NEIGH_PRINTK2("neigh %p used after death.\n", neigh); + if (skb) + kfree_skb(skb); + end_bh_atomic(); + return 1; + } + if (neigh->parms->mcast_probes + neigh->parms->app_probes) { + neigh->probes = neigh->parms->ucast_probes; + neigh->nud_state = NUD_INCOMPLETE; + neigh->timer.expires = jiffies + neigh->parms->retrans_time; + add_timer(&neigh->timer); + + neigh->ops->solicit(neigh, skb); + } else { + neigh->nud_state = NUD_FAILED; + if (skb) + kfree_skb(skb); + end_bh_atomic(); + return 1; + } + } + if (neigh->nud_state == NUD_INCOMPLETE) { + if (skb) { + if (skb_queue_len(&neigh->arp_queue) >= neigh->parms->queue_len) { + struct sk_buff *buff; + buff = neigh->arp_queue.prev; + __skb_unlink(buff, &neigh->arp_queue); + kfree_skb(buff); } + __skb_queue_head(&neigh->arp_queue, skb); } - entry = entry->next; + end_bh_atomic(); + return 1; + } + if (neigh->nud_state == NUD_STALE) { + NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); + neigh->nud_state = NUD_DELAY; + neigh->timer.expires = jiffies + neigh->parms->delay_probe_time; + add_timer(&neigh->timer); + } + } + end_bh_atomic(); + return 0; +} + +static __inline__ void neigh_update_hhs(struct neighbour *neigh) +{ + struct hh_cache *hh; + void (*update)(struct hh_cache*, struct device*, unsigned char*) = + neigh->dev->header_cache_update; - } while (entry != *head); + if (update) 
 {
+		for (hh=neigh->hh; hh; hh=hh->hh_next)
+			update(hh, neigh->dev, neigh->ha);
 	}
 }
-void neigh_tbl_run_bh(struct neigh_table *tbl)
-{
-	if ((tbl->tbl_bh_mask & NT_MASK_QUEUE))
-	{
-		struct neighbour *neigh;
-		while((neigh = neigh_dequeue(tbl)))
-		{
-			neigh_table_ins(tbl, neigh);
+
+/* Generic update routine.
+   -- lladdr is the new lladdr, or NULL if none was supplied.
+   -- new    is the new state.
+   -- override==1 allows an existing, different lladdr to be replaced.
+   -- arp==0 means that the change is administrative.
+ */
+
+int neigh_update(struct neighbour *neigh, u8 *lladdr, u8 new, int override, int arp)
+{
+	u8 old = neigh->nud_state;
+	struct device *dev = neigh->dev;
+
+	if (arp && (old&(NUD_NOARP|NUD_PERMANENT)))
+		return -EPERM;
+
+	if (!(new&NUD_VALID)) {
+		if (old&NUD_IN_TIMER)
+			del_timer(&neigh->timer);
+		if (old&NUD_CONNECTED)
+			neigh_suspect(neigh);
+		neigh->nud_state = new;
+		return 0;
+	}
+
+	/* Compare new lladdr with cached one */
+	if (dev->addr_len == 0) {
+		/* First case: device needs no address. */
+		lladdr = neigh->ha;
+	} else if (lladdr) {
+		/* The second case: if something is already cached
+		   and a new address is proposed:
+		   - compare new & old
+		   - if they are different, check override flag
+		 */
+		if (old&NUD_VALID) {
+			if (memcmp(lladdr, neigh->ha, dev->addr_len) == 0)
+				lladdr = neigh->ha;
+			else if (!override)
+				return -EPERM;
 		}
-		tbl->tbl_bh_mask &= ~NT_MASK_QUEUE;
+	} else {
+		/* No address is supplied; if we know something,
+		   use it, otherwise discard the request.
+		 */
+		if (!(old&NUD_VALID))
+			return -EINVAL;
+		lladdr = neigh->ha;
+	}
+
+	neigh_sync(neigh);
+	old = neigh->nud_state;
+	if (new&NUD_CONNECTED)
+		neigh->confirmed = jiffies;
+	neigh->updated = jiffies;
+
+	/* If the entry was valid and the address is unchanged,
+	   do not change the entry state if the new one is STALE.
+	 */
+	if (old&NUD_VALID) {
+		if (lladdr == neigh->ha)
+			if (new == old || (new == NUD_STALE && (old&NUD_CONNECTED)))
+				return 0;
 	}
+	if (old&NUD_IN_TIMER)
+		del_timer(&neigh->timer);
+	neigh->nud_state = new;
+	if (lladdr != neigh->ha) {
+		memcpy(neigh->ha, lladdr, dev->addr_len);
+		neigh_update_hhs(neigh);
+		neigh->confirmed = jiffies - (neigh->parms->base_reachable_time<<1);
+#ifdef CONFIG_ARPD
+		if (neigh->parms->app_probes)
+			neigh_app_notify(neigh);
+#endif
+	}
+	if (new == old)
+		return 0;
+	if (new&NUD_CONNECTED)
+		neigh_connect(neigh);
+	else
+		neigh_suspect(neigh);
+	if (!(old&NUD_VALID)) {
+		struct sk_buff *skb;
+		while ((skb=__skb_dequeue(&neigh->arp_queue)) != NULL)
+			neigh->output(skb);
+	}
+	return 0;
 }
-/*
- *	Purge all linked skb's of the entry.
+struct neighbour * neigh_event_ns(struct neigh_table *tbl,
+				   u8 *lladdr, void *saddr,
+				   struct device *dev)
+{
+	struct neighbour *neigh;
+
+	neigh = __neigh_lookup(tbl, saddr, dev, lladdr || !dev->addr_len);
+	if (neigh)
+		neigh_update(neigh, lladdr, NUD_STALE, 1, 1);
+	return neigh;
+}
+
+static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, u16 protocol)
+{
+	struct hh_cache *hh = NULL;
+	struct device *dev = dst->dev;
+
+	for (hh=n->hh; hh; hh = hh->hh_next)
+		if (hh->hh_type == protocol)
+			break;
+
+	if (!hh && (hh = kmalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) {
+		memset(hh, 0, sizeof(struct hh_cache));
+		hh->hh_type = protocol;
+		atomic_set(&hh->hh_refcnt, 0);
+		hh->hh_next = NULL;
+		if (dev->hard_header_cache(n, hh)) {
+			kfree(hh);
+			hh = NULL;
+		} else {
+			atomic_inc(&hh->hh_refcnt);
+			hh->hh_next = n->hh;
+			n->hh = hh;
+			if (n->nud_state&NUD_CONNECTED)
+				hh->hh_output = n->ops->hh_output;
+			else
+				hh->hh_output = n->ops->output;
+		}
+	}
+	if (hh) {
+		atomic_inc(&hh->hh_refcnt);
+		dst->hh = hh;
+	}
+}
+
+/* This function can be used in contexts where only the old
+   dev_queue_xmit worked, e.g. if you want to override the normal
+   output path (eql, shaper), but resolution is not yet complete.
 */
-static void neigh_purge_send_q(struct neighbour *neigh)
+int neigh_compat_output(struct sk_buff *skb)
+{
+	struct device *dev = skb->dev;
+
+	__skb_pull(skb, skb->nh.raw - skb->data);
+
+	if (dev->hard_header &&
+	    dev->hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, skb->len) < 0 &&
+	    dev->rebuild_header(skb))
+		return 0;
+
+	return dev_queue_xmit(skb);
+}
+
+/* Slow and careful. */
+
+int neigh_resolve_output(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb->dst;
+	struct neighbour *neigh;
+
+	if (!dst || !(neigh = dst->neighbour))
+		goto discard;
+
+	__skb_pull(skb, skb->nh.raw - skb->data);
+
+	if (neigh_event_send(neigh, skb) == 0) {
+		struct device *dev = neigh->dev;
+		if (dev->hard_header_cache) {
+			start_bh_atomic();
+			if (dst->hh == NULL)
+				neigh_hh_init(neigh, dst, dst->ops->protocol);
+			end_bh_atomic();
+		}
+		if (dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len) >= 0)
+			return neigh->ops->queue_xmit(skb);
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+	return 0;
+
+discard:
+	NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n", dst, dst ?
dst->neighbour : NULL); + kfree_skb(skb); + return -EINVAL; +} + +/* As fast as possible without hh cache */ + +int neigh_connected_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct neighbour *neigh = dst->neighbour; + struct device *dev = neigh->dev; + + __skb_pull(skb, skb->nh.raw - skb->data); + + if (dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len) >= 0) + return neigh->ops->queue_xmit(skb); + kfree_skb(skb); + return -EINVAL; +} + +static void neigh_proxy_process(unsigned long arg) +{ + struct neigh_table *tbl = (struct neigh_table *)arg; + long sched_next = 0; + unsigned long now = jiffies; + struct sk_buff *skb = tbl->proxy_queue.next; + + while (skb != (struct sk_buff*)&tbl->proxy_queue) { + struct sk_buff *back = skb; + long tdif = back->stamp.tv_usec - now; + + skb = skb->next; + if (tdif <= 0) { + __skb_unlink(back, &tbl->proxy_queue); + if (tbl->proxy_redo) + tbl->proxy_redo(back); + else + kfree_skb(back); + } else if (!sched_next || tdif < sched_next) + sched_next = tdif; + } + del_timer(&tbl->proxy_timer); + if (sched_next) { + tbl->proxy_timer.expires = jiffies + sched_next; + add_timer(&tbl->proxy_timer); + } +} + +void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, + struct sk_buff *skb) +{ + unsigned long now = jiffies; + long sched_next = net_random()%p->proxy_delay; + + if (tbl->proxy_queue.qlen > p->proxy_qlen) { + kfree_skb(skb); + return; + } + skb->stamp.tv_sec = 0; + skb->stamp.tv_usec = now + sched_next; + if (del_timer(&tbl->proxy_timer)) { + long tval = tbl->proxy_timer.expires - now; + if (tval < sched_next) + sched_next = tval; + } + tbl->proxy_timer.expires = now + sched_next; + dst_release(skb->dst); + skb->dst = NULL; + __skb_queue_tail(&tbl->proxy_queue, skb); + add_timer(&tbl->proxy_timer); +} + + +struct neigh_parms *neigh_parms_alloc(struct device *dev, struct neigh_table *tbl) +{ + struct neigh_parms *p; + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (p) { + memcpy(p, &tbl->parms, sizeof(*p)); + p->tbl = tbl; + p->reachable_time = neigh_rand_reach_time(p->base_reachable_time); + if (dev && dev->neigh_setup) { + if (dev->neigh_setup(dev, p)) { + kfree(p); + return NULL; + } + } + p->next = tbl->parms.next; + /* ATOMIC_SET */ + tbl->parms.next = p; + } + return p; +} + +void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) +{ + struct neigh_parms **p; + + if (parms == NULL || parms == &tbl->parms) + return; + for (p = &tbl->parms.next; *p; p = &(*p)->next) { + if (*p == parms) { + /* ATOMIC_SET */ + *p = parms->next; +#ifdef CONFIG_SYSCTL + neigh_sysctl_unregister(parms); +#endif + kfree(parms); + return; + } + } + NEIGH_PRINTK1("neigh_release_parms: not found\n"); +} + + +void neigh_table_init(struct neigh_table *tbl) +{ + unsigned long now = jiffies; + + tbl->parms.reachable_time = neigh_rand_reach_time(tbl->parms.base_reachable_time); + + init_timer(&tbl->gc_timer); + tbl->gc_timer.data = (unsigned long)tbl; + tbl->gc_timer.function = neigh_periodic_timer; + tbl->gc_timer.expires = now + tbl->gc_interval + tbl->parms.reachable_time; + add_timer(&tbl->gc_timer); + + init_timer(&tbl->proxy_timer); + tbl->proxy_timer.data = (unsigned long)tbl; + tbl->proxy_timer.function = neigh_proxy_process; + skb_queue_head_init(&tbl->proxy_queue); + + tbl->last_flush = now; + tbl->last_rand = now + tbl->parms.reachable_time*20; + tbl->next = neigh_tables; + neigh_tables = tbl; +} + +int neigh_table_clear(struct neigh_table *tbl) +{ + struct neigh_table **tp; + + 
start_bh_atomic(); + del_timer(&tbl->gc_timer); + del_timer(&tbl->proxy_timer); + skb_queue_purge(&tbl->proxy_queue); + if (tbl->entries) + neigh_ifdown(tbl, NULL); + end_bh_atomic(); + if (tbl->entries) + printk(KERN_CRIT "neighbour leakage\n"); + for (tp = &neigh_tables; *tp; tp = &(*tp)->next) { + if (*tp == tbl) { + *tp = tbl->next; + break; + } + } +#ifdef CONFIG_SYSCTL + neigh_sysctl_unregister(&tbl->parms); +#endif + return 0; +} + +#ifdef CONFIG_RTNETLINK + + +int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct ndmsg *ndm = NLMSG_DATA(nlh); + struct rtattr **nda = arg; + struct neigh_table *tbl; + struct device *dev = NULL; + + if (ndm->ndm_ifindex) { + if ((dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL) + return -ENODEV; + } + + for (tbl=neigh_tables; tbl; tbl = tbl->next) { + int err = 0; + struct neighbour *n; + + if (tbl->family != ndm->ndm_family) + continue; + + if (nda[NDA_DST-1] == NULL || + nda[NDA_DST-1]->rta_len != RTA_LENGTH(tbl->key_len)) + return -EINVAL; + + if (ndm->ndm_flags&NTF_PROXY) + return pneigh_delete(tbl, RTA_DATA(nda[NDA_DST-1]), dev); + + if (dev == NULL) + return -EINVAL; + + start_bh_atomic(); + n = neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev); + if (n) { + err = neigh_update(n, NULL, NUD_FAILED, 1, 0); + neigh_release(n); + } + end_bh_atomic(); + return err; + } + + return -EADDRNOTAVAIL; +} + +int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct ndmsg *ndm = NLMSG_DATA(nlh); + struct rtattr **nda = arg; + struct neigh_table *tbl; + struct device *dev = NULL; + + if (ndm->ndm_ifindex) { + if ((dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL) + return -ENODEV; + } + + for (tbl=neigh_tables; tbl; tbl = tbl->next) { + int err = 0; + struct neighbour *n; + + if (tbl->family != ndm->ndm_family) + continue; + if (nda[NDA_DST-1] == NULL || + nda[NDA_DST-1]->rta_len != RTA_LENGTH(tbl->key_len)) + return -EINVAL; + if (ndm->ndm_flags&NTF_PROXY) { + if (pneigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 1)) + return 0; + return -ENOBUFS; + } + if (dev == NULL) + return -EINVAL; + if (nda[NDA_LLADDR-1] != NULL && + nda[NDA_LLADDR-1]->rta_len != RTA_LENGTH(dev->addr_len)) + return -EINVAL; + start_bh_atomic(); + n = neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev); + if (n) { + if (nlh->nlmsg_flags&NLM_F_EXCL) + err = -EEXIST; + } else if (!(nlh->nlmsg_flags&NLM_F_CREATE)) + err = -ENOENT; + else { + n = __neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 1); + if (n == NULL) + err = -ENOBUFS; + } + if (err == 0) { + err = neigh_update(n, nda[NDA_LLADDR-1] ? 
RTA_DATA(nda[NDA_LLADDR-1]) : NULL,
+					   ndm->ndm_state,
+					   nlh->nlmsg_flags&NLM_F_REPLACE, 0);
+		}
+		neigh_release(n);
+		end_bh_atomic();
+		return err;
+	}
+
+	return -EADDRNOTAVAIL;
+}
+
+
+static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n,
+			   pid_t pid, u32 seq, int event)
+{
+	unsigned long now = jiffies;
+	struct ndmsg *ndm;
+	struct nlmsghdr *nlh;
+	unsigned char *b = skb->tail;
+	struct nda_cacheinfo ci;
+
+	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ndm));
+	ndm = NLMSG_DATA(nlh);
+	ndm->ndm_family = n->ops->family;
+	ndm->ndm_flags = n->flags;
+	ndm->ndm_type = n->type;
+	ndm->ndm_state = n->nud_state;
+	ndm->ndm_ifindex = n->dev->ifindex;
+	RTA_PUT(skb, NDA_DST, n->tbl->key_len, n->primary_key);
+	if (n->nud_state&NUD_VALID)
+		RTA_PUT(skb, NDA_LLADDR, n->dev->addr_len, n->ha);
+	ci.ndm_used = now - n->used;
+	ci.ndm_confirmed = now - n->confirmed;
+	ci.ndm_updated = now - n->updated;
+	ci.ndm_refcnt = atomic_read(&n->refcnt);
+	RTA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci);
+	nlh->nlmsg_len = skb->tail - b;
+	return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+
+static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct neighbour *n;
+	int h, s_h;
+	int idx, s_idx;
+
+	s_h = cb->args[1];
+	s_idx = idx = cb->args[2];
+	for (h=0; h <= NEIGH_HASHMASK; h++) {
+		if (h < s_h) continue;
+		if (h > s_h)
+			memset(&cb->args[2], 0, sizeof(cb->args) - 2*sizeof(int));
+		start_bh_atomic();
+		for (n = tbl->hash_buckets[h], idx = 0; n;
+		     n = n->next, idx++) {
+			if (idx < s_idx)
+				continue;
+			if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
+					    cb->nlh->nlmsg_seq, RTM_NEWNEIGH) <= 0) {
+				end_bh_atomic();
+				goto done;
+			}
+		}
+		end_bh_atomic();
+	}
+done:
+	cb->args[1] = h;
+	cb->args[2] = idx;
+	return skb->len;
+}
+
+int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	int t;
+	int s_t;
+	struct neigh_table *tbl;
+	int family = ((struct rtgenmsg*)NLMSG_DATA(cb->nlh))->rtgen_family;
+
+	s_t = cb->args[0];
+
+	for (tbl=neigh_tables, t=0; tbl; tbl = tbl->next, t++) {
+		if (t < s_t) continue;
+		if (family && tbl->family != family)
+			continue;
+		if (t > s_t)
+			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int));
+		if (neigh_dump_table(tbl, skb, cb) < 0)
+			break;
+	}
+
+	cb->args[0] = t;
+
+	return skb->len;
+}
+
+#ifdef CONFIG_ARPD
+void neigh_app_ns(struct neighbour *n)
+{
+	struct sk_buff *skb;
+	struct nlmsghdr *nlh;
+	int size = NLMSG_SPACE(sizeof(struct ndmsg)+256);
+
+	skb = alloc_skb(size, GFP_ATOMIC);
+	if (!skb)
+		return;
-	/* Release the list of `skb' pointers.
*/ - while ((skb = skb_dequeue(&neigh->arp_queue))) - { - dev_kfree_skb(skb, FREE_WRITE); + if (neigh_fill_info(skb, n, 0, 0, RTM_GETNEIGH) < 0) { + kfree_skb(skb); + return; } - return; + nlh = (struct nlmsghdr*)skb->data; + nlh->nlmsg_flags = NLM_F_REQUEST; + NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH; + netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC); } + +static void neigh_app_notify(struct neighbour *n) +{ + struct sk_buff *skb; + struct nlmsghdr *nlh; + int size = NLMSG_SPACE(sizeof(struct ndmsg)+256); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + return; + + if (neigh_fill_info(skb, n, 0, 0, RTM_NEWNEIGH) < 0) { + kfree_skb(skb); + return; + } + nlh = (struct nlmsghdr*)skb->data; + NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH; + netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC); +} + + + +#endif + + +#endif + +#ifdef CONFIG_SYSCTL + +struct neigh_sysctl_table +{ + struct ctl_table_header *sysctl_header; + ctl_table neigh_vars[17]; + ctl_table neigh_dev[2]; + ctl_table neigh_neigh_dir[2]; + ctl_table neigh_proto_dir[2]; + ctl_table neigh_root_dir[2]; +} neigh_sysctl_template = { + NULL, + {{NET_NEIGH_MCAST_SOLICIT, "mcast_solicit", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_UCAST_SOLICIT, "ucast_solicit", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_APP_SOLICIT, "app_solicit", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_RETRANS_TIME, "retrans_time", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_REACHABLE_TIME, "base_reachable_time", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_NEIGH_GC_STALE_TIME, "gc_stale_time", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_NEIGH_UNRES_QLEN, "unres_qlen", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_PROXY_QLEN, "proxy_qlen", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_ANYCAST_DELAY, "anycast_delay", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_PROXY_DELAY, "proxy_delay", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_LOCKTIME, "locktime", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_GC_INTERVAL, "gc_interval", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec_jiffies}, + {NET_NEIGH_GC_THRESH1, "gc_thresh1", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_GC_THRESH2, "gc_thresh2", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_NEIGH_GC_THRESH3, "gc_thresh3", + NULL, sizeof(int), 0644, NULL, + &proc_dointvec}, + {0}}, + + {{1, "default", NULL, 0, 0555, NULL},{0}}, + {{0, "neigh", NULL, 0, 0555, NULL},{0}}, + {{0, NULL, NULL, 0, 0555, NULL},{0}}, + {{CTL_NET, "net", NULL, 0, 0555, NULL},{0}} +}; + +int neigh_sysctl_register(struct device *dev, struct neigh_parms *p, + int p_id, int pdev_id, char *p_name) +{ + struct neigh_sysctl_table *t; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (t == NULL) + return -ENOBUFS; + memcpy(t, &neigh_sysctl_template, sizeof(*t)); + t->neigh_vars[1].data = &p->ucast_probes; + t->neigh_vars[2].data = &p->app_probes; + t->neigh_vars[3].data = &p->retrans_time; + t->neigh_vars[4].data = &p->reachable_time; + t->neigh_vars[5].data = &p->delay_probe_time; + t->neigh_vars[6].data = &p->gc_staletime; + t->neigh_vars[7].data = &p->queue_len; + t->neigh_vars[8].data = &p->proxy_qlen; + t->neigh_vars[9].data = &p->anycast_delay; + 
t->neigh_vars[10].data = &p->proxy_delay; + t->neigh_vars[11].data = &p->locktime; + if (dev) { + t->neigh_dev[0].procname = dev->name; + t->neigh_dev[0].ctl_name = dev->ifindex+1; + memset(&t->neigh_vars[12], 0, sizeof(ctl_table)); + } else { + t->neigh_vars[12].data = (&p->locktime) + 1; + t->neigh_vars[13].data = (&p->locktime) + 2; + t->neigh_vars[14].data = (&p->locktime) + 3; + t->neigh_vars[15].data = (&p->locktime) + 4; + } + t->neigh_neigh_dir[0].ctl_name = pdev_id; + + t->neigh_proto_dir[0].procname = p_name; + t->neigh_proto_dir[0].ctl_name = p_id; + + t->neigh_dev[0].child = t->neigh_vars; + t->neigh_neigh_dir[0].child = t->neigh_dev; + t->neigh_proto_dir[0].child = t->neigh_neigh_dir; + t->neigh_root_dir[0].child = t->neigh_proto_dir; + + t->sysctl_header = register_sysctl_table(t->neigh_root_dir, 0); + if (t->sysctl_header == NULL) { + kfree(t); + return -ENOBUFS; + } + p->sysctl_table = t; + return 0; +} + +void neigh_sysctl_unregister(struct neigh_parms *p) +{ + if (p->sysctl_table) { + struct neigh_sysctl_table *t = p->sysctl_table; + p->sysctl_table = NULL; + unregister_sysctl_table(t->sysctl_header); + kfree(t); + } +} + +#endif /* CONFIG_SYSCTL */ diff --git a/net/core/profile.c b/net/core/profile.c new file mode 100644 index 000000000..54fc57662 --- /dev/null +++ b/net/core/profile.c @@ -0,0 +1,304 @@ +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/string.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/ip.h> +#include <linux/inet.h> +#include <net/checksum.h> + +#include <asm/uaccess.h> +#include <asm/system.h> + +#include <net/profile.h> + +#ifdef CONFIG_NET_PROFILE + +atomic_t net_profile_active; +struct timeval net_profile_adjust; + +NET_PROFILE_DEFINE(total); + +struct net_profile_slot *net_profile_chain = &net_prof_total; + +#ifdef __alpha__ +__u32 alpha_lo; +long alpha_hi; + +static void alpha_tick(unsigned long); + +static struct timer_list alpha_timer = + { NULL, NULL, 0, 0L, alpha_tick }; + +void alpha_tick(unsigned long dummy) +{ + struct timeval dummy_stamp; + net_profile_stamp(&dummy_stamp); + alpha_timer.expires = jiffies + 4*HZ; + add_timer(&alpha_timer); +} + +#endif + +void net_profile_irq_adjust(struct timeval *entered, struct timeval* leaved) +{ + struct net_profile_slot *s; + + net_profile_sub(entered, leaved); + for (s = net_profile_chain; s; s = s->next) { + if (s->active) + net_profile_add(leaved, &s->irq); + } +} + + +#ifdef CONFIG_PROC_FS +static int profile_read_proc(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + off_t pos=0; + off_t begin=0; + int len=0; + struct net_profile_slot *s; + + len+= sprintf(buffer, "Slot Hits Hi Lo OnIrqHi OnIrqLo Ufl\n"); + + if (offset == 0) { + cli(); + net_prof_total.active = 1; + atomic_inc(&net_profile_active); + NET_PROFILE_LEAVE(total); + sti(); + } + for (s = net_profile_chain; s; s = s->next) { + struct net_profile_slot tmp; + + cli(); + tmp = *s; + + /* Wrong, but pretty close to truth */ + + s->accumulator.tv_sec = 0; + s->accumulator.tv_usec = 0; + s->irq.tv_sec = 0; + s->irq.tv_usec = 0; + s->hits = 0; + s->underflow = 0; + /* Repair active count, it is possible, only if code has a bug */ + if (s->active) { + s->active = 0; + atomic_dec(&net_profile_active); + } + sti(); + + net_profile_sub(&tmp.irq, &tmp.accumulator); + + len += sprintf(buffer+len,"%-15s %-10d 
%-10ld %-10lu %-10lu %-10lu %d/%d", + tmp.id, + tmp.hits, + tmp.accumulator.tv_sec, + tmp.accumulator.tv_usec, + tmp.irq.tv_sec, + tmp.irq.tv_usec, + tmp.underflow, tmp.active); + + buffer[len++]='\n'; + + pos=begin+len; + if(pos<offset) { + len=0; + begin=pos; + } + if(pos>offset+length) + goto done; + } + *eof = 1; + +done: + *start=buffer+(offset-begin); + len-=(offset-begin); + if(len>length) + len=length; + if (len < 0) { + len = 0; + printk(KERN_CRIT "Yep, guys... our template for proc_*_read is crappy :-)\n"); + } + if (offset == 0) { + cli(); + net_prof_total.active = 0; + net_prof_total.hits = 0; + net_profile_stamp(&net_prof_total.entered); + sti(); + } + return len; +} +#endif + +struct iphdr whitehole_iph; +int whitehole_count; + +static int whitehole_xmit(struct sk_buff *skb, struct device *dev) +{ + struct net_device_stats *stats; + dev_kfree_skb(skb); + stats = (struct net_device_stats *)dev->priv; + stats->tx_packets++; + stats->tx_bytes+=skb->len; + + return 0; +} + +static void whitehole_inject(unsigned long); +int whitehole_init(struct device *dev); + +static struct timer_list whitehole_timer = + { NULL, NULL, 0, 0L, whitehole_inject }; + +static struct device whitehole_dev = { + "whitehole", 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, whitehole_init, }; + +static int whitehole_open(struct device *dev) +{ + whitehole_count = 100000; + whitehole_timer.expires = jiffies + 5*HZ; + add_timer(&whitehole_timer); + return 0; +} + +static int whitehole_close(struct device *dev) +{ + del_timer(&whitehole_timer); + return 0; +} + +static void whitehole_inject(unsigned long dummy) +{ + struct net_device_stats *stats = (struct net_device_stats *)whitehole_dev.priv; + extern int netdev_dropping; + + do { + struct iphdr *iph; + struct sk_buff *skb = alloc_skb(128, GFP_ATOMIC); + if (!skb) + break; + skb_reserve(skb, 32); + iph = (struct iphdr*)skb_put(skb, sizeof(*iph)); + skb->mac.raw = ((u8*)iph) - 14; + memcpy(iph, &whitehole_iph, sizeof(*iph)); + skb->protocol = __constant_htons(ETH_P_IP); + skb->dev = &whitehole_dev; + skb->pkt_type = PACKET_HOST; + stats->rx_packets++; + stats->rx_bytes += skb->len; + netif_rx(skb); + whitehole_count--; + } while (netdev_dropping == 0 && whitehole_count>0); + if (whitehole_count > 0) { + whitehole_timer.expires = jiffies + 1; + add_timer(&whitehole_timer); + } +} + +static struct net_device_stats *whitehole_get_stats(struct device *dev) +{ + struct net_device_stats *stats = (struct net_device_stats *) dev->priv; + return stats; +} + +__initfunc(int whitehole_init(struct device *dev)) +{ + dev->priv = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL); + if (dev->priv == NULL) + return -ENOBUFS; + memset(dev->priv, 0, sizeof(struct net_device_stats)); + dev->get_stats = whitehole_get_stats; + dev->hard_start_xmit = whitehole_xmit; + dev->open = whitehole_open; + dev->stop = whitehole_close; + ether_setup(dev); + dev->tx_queue_len = 0; + dev->flags |= IFF_NOARP; + dev->flags &= ~(IFF_BROADCAST|IFF_MULTICAST); + dev->iflink = 0; + whitehole_iph.ihl = 5; + whitehole_iph.version = 4; + whitehole_iph.ttl = 2; + whitehole_iph.saddr = in_aton("193.233.7.21"); + whitehole_iph.daddr = in_aton("193.233.7.10"); + whitehole_iph.tot_len = htons(20); + whitehole_iph.check = ip_compute_csum((void *)&whitehole_iph, 20); + return 0; +} + +int net_profile_register(struct net_profile_slot *slot) +{ + cli(); + slot->next = net_profile_chain; + net_profile_chain = slot; + sti(); + return 0; +} + +int net_profile_unregister(struct net_profile_slot *slot) +{ + struct 
net_profile_slot **sp, *s; + + for (sp = &net_profile_chain; (s = *sp) != NULL; sp = &s->next) { + if (s == slot) { + cli(); + *sp = s->next; + sti(); + return 0; + } + } + return -ESRCH; +} + + +__initfunc(int net_profile_init(void)) +{ + int i; + +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *ent; + + ent = create_proc_entry("net/profile", 0, 0); + ent->read_proc = profile_read_proc; +#endif + + register_netdevice(&whitehole_dev); + + printk("Evaluating net profiler cost ..."); +#if CPU == 586 || CPU == 686 + if (!(boot_cpu_data.x86_capability & 16)) { + panic("Sorry, you CPU does not support tsc. I am dying...\n"); + return -1; + } +#endif + start_bh_atomic(); +#ifdef __alpha__ + alpha_tick(0); +#endif + for (i=0; i<1024; i++) { + NET_PROFILE_ENTER(total); + NET_PROFILE_LEAVE(total); + } + if (net_prof_total.accumulator.tv_sec) { + printk(" too high!\n"); + } else { + net_profile_adjust.tv_usec = net_prof_total.accumulator.tv_usec>>10; + printk("%ld units\n", net_profile_adjust.tv_usec); + } + net_prof_total.hits = 0; + net_profile_stamp(&net_prof_total.entered); + end_bh_atomic(); + return 0; +} + +#endif diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 795e0d062..cf7fe8ff8 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -74,65 +74,29 @@ struct rtnetlink_link * rtnetlink_links[NPROTO]; #define _X 2 /* exclusive access to tables required */ #define _G 4 /* GET request */ -static unsigned char rtm_properties[RTM_MAX-RTM_BASE+1] = +static const int rtm_min[(RTM_MAX+1-RTM_BASE)/4] = { - _S|_X, /* RTM_NEWLINK */ - _S|_X, /* RTM_DELLINK */ - _G, /* RTM_GETLINK */ - 0, - - _S|_X, /* RTM_NEWADDR */ - _S|_X, /* RTM_DELADDR */ - _G, /* RTM_GETADDR */ - 0, - - _S|_X, /* RTM_NEWROUTE */ - _S|_X, /* RTM_DELROUTE */ - _G, /* RTM_GETROUTE */ - 0, - - _S|_X, /* RTM_NEWNEIGH */ - _S|_X, /* RTM_DELNEIGH */ - _G, /* RTM_GETNEIGH */ - 0, - - _S|_X, /* RTM_NEWRULE */ - _S|_X, /* RTM_DELRULE */ - _G, /* RTM_GETRULE */ - 0 + NLMSG_LENGTH(sizeof(struct ifinfomsg)), + NLMSG_LENGTH(sizeof(struct ifaddrmsg)), + NLMSG_LENGTH(sizeof(struct rtmsg)), + NLMSG_LENGTH(sizeof(struct ndmsg)), + NLMSG_LENGTH(sizeof(struct rtmsg)), + NLMSG_LENGTH(sizeof(struct tcmsg)), + NLMSG_LENGTH(sizeof(struct tcmsg)), + NLMSG_LENGTH(sizeof(struct tcmsg)) }; -static int rtnetlink_get_rta(struct kern_rta *rta, struct rtattr *attr, int attrlen) -{ - void **rta_data = (void**)rta; - - while (RTA_OK(attr, attrlen)) { - int type = attr->rta_type; - if (type != RTA_UNSPEC) { - if (type > RTA_MAX) - return -EINVAL; - rta_data[type-1] = RTA_DATA(attr); - } - attr = RTA_NEXT(attr, attrlen); - } - return 0; -} - -static int rtnetlink_get_ifa(struct kern_ifa *ifa, struct rtattr *attr, int attrlen) +static const int rta_max[(RTM_MAX+1-RTM_BASE)/4] = { - void **ifa_data = (void**)ifa; - - while (RTA_OK(attr, attrlen)) { - int type = attr->rta_type; - if (type != IFA_UNSPEC) { - if (type > IFA_MAX) - return -EINVAL; - ifa_data[type-1] = RTA_DATA(attr); - } - attr = RTA_NEXT(attr, attrlen); - } - return 0; -} + IFLA_MAX, + IFA_MAX, + RTA_MAX, + NDA_MAX, + RTA_MAX, + TCA_MAX, + TCA_MAX, + TCA_MAX +}; void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { @@ -145,11 +109,13 @@ void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data memcpy(RTA_DATA(rta), data, attrlen); } +#ifdef CONFIG_RTNL_OLD_IFINFO static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct device *dev, int type, pid_t pid, u32 seq) { struct ifinfomsg *r; struct nlmsghdr *nlh; + unsigned char *b = 
skb->tail; nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*r)); if (pid) nlh->nlmsg_flags |= NLM_F_MULTI; @@ -168,11 +134,65 @@ static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct device *dev, r->ifi_qdisc = dev->qdisc_sleeping->handle; if (dev->qdisc_sleeping->ops) strcpy(r->ifi_qdiscname, dev->qdisc_sleeping->ops->id); + if (dev->get_stats) { + struct net_device_stats *stats = dev->get_stats(dev); + if (stats) + RTA_PUT(skb, IFLA_STATS, sizeof(*stats), stats); + } + nlh->nlmsg_len = skb->tail - b; return skb->len; nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); return -1; } +#else +static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct device *dev, + int type, pid_t pid, u32 seq) +{ + struct ifinfomsg *r; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*r)); + if (pid) nlh->nlmsg_flags |= NLM_F_MULTI; + r = NLMSG_DATA(nlh); + r->ifi_family = AF_UNSPEC; + r->ifi_type = dev->type; + r->ifi_index = dev->ifindex; + r->ifi_flags = dev->flags; + r->ifi_change = ~0U; + + RTA_PUT(skb, IFLA_IFNAME, strlen(dev->name)+1, dev->name); + if (dev->addr_len) { + RTA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); + RTA_PUT(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast); + } + if (1) { + unsigned mtu = dev->mtu; + RTA_PUT(skb, IFLA_MTU, sizeof(mtu), &mtu); + } + if (dev->ifindex != dev->iflink) + RTA_PUT(skb, IFLA_LINK, sizeof(int), &dev->iflink); + if (dev->qdisc_sleeping->ops) + RTA_PUT(skb, IFLA_QDISC, + strlen(dev->qdisc_sleeping->ops->id) + 1, + dev->qdisc_sleeping->ops->id); + if (dev->get_stats) { + struct net_device_stats *stats = dev->get_stats(dev); + if (stats) + RTA_PUT(skb, IFLA_STATS, sizeof(*stats), stats); + } + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { @@ -191,17 +211,48 @@ int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +int rtnetlink_dump_all(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx; + int s_idx = cb->family; + + if (s_idx == 0) + s_idx = 1; + for (idx=1; idx<NPROTO; idx++) { + int type = cb->nlh->nlmsg_type-RTM_BASE; + if (idx < s_idx || idx == AF_PACKET) + continue; + if (rtnetlink_links[idx] == NULL || + rtnetlink_links[idx][type].dumpit == NULL) + continue; + if (idx > s_idx) + memset(&cb->args[0], 0, sizeof(cb->args)); + if (rtnetlink_links[idx][type].dumpit(skb, cb) == 0) + continue; + if (skb_tailroom(skb) < 256) + break; + } + cb->family = idx; + + return skb->len; +} + void rtmsg_ifinfo(int type, struct device *dev) { struct sk_buff *skb; - int size = NLMSG_SPACE(sizeof(struct ifinfomsg)); +#ifdef CONFIG_RTNL_OLD_IFINFO + int size = NLMSG_SPACE(sizeof(struct ifinfomsg)+ + RTA_LENGTH(sizeof(struct net_device_stats))); +#else + int size = NLMSG_GOODSIZE; +#endif skb = alloc_skb(size, GFP_KERNEL); if (!skb) return; if (rtnetlink_fill_ifinfo(skb, dev, type, 0, 0) < 0) { - kfree_skb(skb, 0); + kfree_skb(skb); return; } NETLINK_CB(skb).dst_groups = RTMGRP_LINK; @@ -220,47 +271,68 @@ static int rtnetlink_done(struct netlink_callback *cb) extern __inline__ int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) { - union { - struct kern_rta rta; - struct kern_ifa ifa; - } u; - struct rtmsg *rtm; - struct ifaddrmsg *ifm; + struct rtnetlink_link *link; + struct rtnetlink_link *link_tab; + struct rtattr *rta[RTATTR_MAX]; + int 
exclusive = 0; + int sz_idx, kind; + int min_len; int family; int type; int err; + /* Only requests are handled by kernel now */ if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) return 0; + type = nlh->nlmsg_type; + + /* A control message: ignore them */ if (type < RTM_BASE) return 0; + + /* Unknown message: reply with EINVAL */ if (type > RTM_MAX) goto err_inval; + type -= RTM_BASE; + + /* All the messages must have at least 1 byte length */ if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg))) return 0; + family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family; - if (family > NPROTO || rtnetlink_links[family] == NULL) { + if (family > NPROTO) { *errp = -EAFNOSUPPORT; return -1; } - if (rtm_properties[type-RTM_BASE]&_S) { - if (NETLINK_CREDS(skb)->uid) { - *errp = -EPERM; - return -1; - } + + link_tab = rtnetlink_links[family]; + if (link_tab == NULL) + link_tab = rtnetlink_links[AF_UNSPEC]; + link = &link_tab[type]; + + sz_idx = type>>2; + kind = type&3; + + if (kind != 2 && NETLINK_CREDS(skb)->uid) { + *errp = -EPERM; + return -1; } - if (rtm_properties[type-RTM_BASE]&_G && nlh->nlmsg_flags&NLM_F_DUMP) { - if (rtnetlink_links[family][type-RTM_BASE].dumpit == NULL) + + if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { + if (link->dumpit == NULL) + link = &(rtnetlink_links[AF_UNSPEC][type]); + + if (link->dumpit == NULL) goto err_inval; /* Super-user locks all the tables to get atomic snapshot */ if (NETLINK_CREDS(skb)->uid == 0 && nlh->nlmsg_flags&NLM_F_ATOMIC) atomic_inc(&rtnl_rlockct); if ((*errp = netlink_dump_start(rtnl, skb, nlh, - rtnetlink_links[family][type-RTM_BASE].dumpit, + link->dumpit, rtnetlink_done)) != 0) { if (NETLINK_CREDS(skb)->uid == 0 && nlh->nlmsg_flags&NLM_F_ATOMIC) atomic_dec(&rtnl_rlockct); @@ -269,59 +341,41 @@ rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) skb_pull(skb, NLMSG_ALIGN(nlh->nlmsg_len)); return -1; } - if (rtm_properties[type-RTM_BASE]&_X) { + + if (kind != 2) { if (rtnl_exlock_nowait()) { *errp = 0; return -1; } exclusive = 1; } - - memset(&u, 0, sizeof(u)); - - switch (nlh->nlmsg_type) { - case RTM_NEWROUTE: - case RTM_DELROUTE: - case RTM_GETROUTE: - case RTM_NEWRULE: - case RTM_DELRULE: - case RTM_GETRULE: - rtm = NLMSG_DATA(nlh); - if (nlh->nlmsg_len < sizeof(*rtm)) - goto err_inval; - if (rtm->rtm_optlen && - rtnetlink_get_rta(&u.rta, RTM_RTA(rtm), rtm->rtm_optlen) < 0) - goto err_inval; - break; - - case RTM_NEWADDR: - case RTM_DELADDR: - case RTM_GETADDR: - ifm = NLMSG_DATA(nlh); - if (nlh->nlmsg_len < sizeof(*ifm)) - goto err_inval; + memset(&rta, 0, sizeof(rta)); - if (nlh->nlmsg_len > NLMSG_LENGTH(sizeof(*ifm)) && - rtnetlink_get_ifa(&u.ifa, IFA_RTA(ifm), - nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*ifm))) < 0) - goto err_inval; - break; - - case RTM_NEWLINK: - case RTM_DELLINK: - case RTM_GETLINK: - case RTM_NEWNEIGH: - case RTM_DELNEIGH: - case RTM_GETNEIGH: - /* Not urgent and even not necessary */ - default: + min_len = rtm_min[sz_idx]; + if (nlh->nlmsg_len < min_len) goto err_inval; + + if (nlh->nlmsg_len > min_len) { + int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); + struct rtattr *attr = (void*)nlh + NLMSG_ALIGN(min_len); + + while (RTA_OK(attr, attrlen)) { + unsigned flavor = attr->rta_type; + if (flavor) { + if (flavor > rta_max[sz_idx]) + goto err_inval; + rta[flavor-1] = attr; + } + attr = RTA_NEXT(attr, attrlen); + } } - if (rtnetlink_links[family][type-RTM_BASE].doit == NULL) + if (link->doit == NULL) + link = &(rtnetlink_links[AF_UNSPEC][type]); + if (link->doit == NULL) goto err_inval; - err = 
rtnetlink_links[family][type-RTM_BASE].doit(skb, nlh, (void *)&u); + err = link->doit(skb, nlh, (void *)&rta); if (exclusive) rtnl_exunlock(); @@ -390,15 +444,44 @@ static void rtnetlink_rcv(struct sock *sk, int len) if (skb->len) skb_queue_head(&sk->receive_queue, skb); else - kfree_skb(skb, FREE_READ); + kfree_skb(skb); break; } - kfree_skb(skb, FREE_READ); + kfree_skb(skb); } rtnl_shunlock(); } +static struct rtnetlink_link link_rtnetlink_table[RTM_MAX-RTM_BASE+1] = +{ + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, rtnetlink_dump_ifinfo, }, + { NULL, NULL, }, + + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, rtnetlink_dump_all, }, + { NULL, NULL, }, + + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, rtnetlink_dump_all, }, + { NULL, NULL, }, + + { neigh_add, NULL, }, + { neigh_delete, NULL, }, + { NULL, neigh_dump_info, }, + { NULL, NULL, }, + + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, +}; + + static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) { struct device *dev = ptr; @@ -429,6 +512,8 @@ __initfunc(void rtnetlink_init(void)) if (rtnl == NULL) panic("rtnetlink_init: cannot initialize rtnetlink\n"); register_netdevice_notifier(&rtnetlink_dev_notifier); + rtnetlink_links[AF_UNSPEC] = link_rtnetlink_table; + rtnetlink_links[AF_PACKET] = link_rtnetlink_table; } diff --git a/net/core/scm.c b/net/core/scm.c index 5a6d24c40..ac4aefda0 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -17,6 +17,7 @@ #include <linux/major.h> #include <linux/stat.h> #include <linux/socket.h> +#include <linux/file.h> #include <linux/fcntl.h> #include <linux/net.h> #include <linux/interrupt.h> @@ -44,6 +45,7 @@ static __inline__ int scm_check_creds(struct ucred *creds) { + /* N.B. The test for suser should follow the credential check */ if (suser()) return 0; if (creds->pid != current->pid || @@ -58,11 +60,10 @@ static __inline__ int scm_check_creds(struct ucred *creds) static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) { - int num; + int *fdp = (int*)CMSG_DATA(cmsg); struct scm_fp_list *fpl = *fplp; struct file **fpp; - int *fdp = (int*)CMSG_DATA(cmsg); - int i; + int i, num; num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int); @@ -86,41 +87,41 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) return -EINVAL; /* - * Verify the descriptors. + * Verify the descriptors and increment the usage count. 
*/ for (i=0; i< num; i++) { - int fd; - - fd = fdp[i]; - if (fd < 0 || fd >= NR_OPEN) - return -EBADF; - if (current->files->fd[fd]==NULL) + int fd = fdp[i]; + struct file *file; + + if (fd < 0 || !(file = fget(fd))) return -EBADF; - fpp[i] = current->files->fd[fd]; + *fpp++ = file; + fpl->count++; } - - /* add another reference to these files */ - for (i=0; i< num; i++, fpp++) - (*fpp)->f_count++; - fpl->count += num; - return num; } void __scm_destroy(struct scm_cookie *scm) { - int i; struct scm_fp_list *fpl = scm->fp; + struct file *file; + int i; - if (!fpl) - return; - - for (i=fpl->count-1; i>=0; i--) - close_fp(fpl->fp[i]); + if (fpl) { + scm->fp = NULL; + for (i=fpl->count-1; i>=0; i--) + fput(fpl->fp[i]); + kfree(fpl); + } - kfree(fpl); + file = scm->file; + if (file) { + scm->sock = NULL; + scm->file = NULL; + fput(file); + } } @@ -133,11 +134,10 @@ extern __inline__ int not_one_bit(unsigned val) int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) { - int err; struct cmsghdr *cmsg; struct file *file; - int acc_fd; - unsigned scm_flags=0; + int acc_fd, err; + unsigned int scm_flags=0; for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { @@ -169,14 +169,19 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) memcpy(&acc_fd, CMSG_DATA(cmsg), sizeof(int)); p->sock = NULL; if (acc_fd != -1) { - if (acc_fd < 0 || acc_fd >= NR_OPEN || - (file=current->files->fd[acc_fd])==NULL) - return -EBADF; - if (!file->f_dentry->d_inode || !file->f_dentry->d_inode->i_sock) - return -ENOTSOCK; + err = -EBADF; + file = fget(acc_fd); + if (!file) + goto error; + p->file = file; + err = -ENOTSOCK; + if (!file->f_dentry->d_inode || + !file->f_dentry->d_inode->i_sock) + goto error; p->sock = &file->f_dentry->d_inode->u.socket_i; + err = -EINVAL; if (p->sock->state != SS_UNCONNECTED) - return -EINVAL; + goto error; } scm_flags |= MSG_SYN; break; @@ -223,14 +228,17 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) cmhdr.cmsg_level = level; cmhdr.cmsg_type = type; cmhdr.cmsg_len = cmlen; - err = copy_to_user(cm, &cmhdr, sizeof cmhdr); - if (!err) - err = copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr)); - if (!err) { - cmlen = CMSG_SPACE(len); - msg->msg_control += cmlen; - msg->msg_controllen -= cmlen; - } + + err = -EFAULT; + if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) + goto out; + if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr))) + goto out; + cmlen = CMSG_SPACE(len); + msg->msg_control += cmlen; + msg->msg_controllen -= cmlen; + err = 0; +out: return err; } @@ -240,21 +248,28 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) int fdmax = (msg->msg_controllen - sizeof(struct cmsghdr))/sizeof(int); int fdnum = scm->fp->count; - int *cmfptr; - int err = 0; - int i; struct file **fp = scm->fp->fp; + int *cmfptr; + int err = 0, i; if (fdnum < fdmax) fdmax = fdnum; for (i=0, cmfptr=(int*)CMSG_DATA(cm); i<fdmax; i++, cmfptr++) { - int new_fd = get_unused_fd(); - if (new_fd < 0) + int new_fd; + err = get_unused_fd(); + if (err < 0) break; - current->files->fd[new_fd] = fp[i]; + new_fd = err; err = put_user(new_fd, cmfptr); + if (err) { + put_unused_fd(new_fd); + break; + } + /* Bump the usage count and install the file. 
*/ + fp[i]->f_count++; + current->files->fd[new_fd] = fp[i]; } if (i > 0) @@ -272,38 +287,30 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) msg->msg_controllen -= cmlen; } } - - if (err) - i = 0; + if (i < fdnum) + msg->msg_flags |= MSG_CTRUNC; /* - * Dump those that don't fit. + * All of the files that fit in the message have had their + * usage counts incremented, so we just free the list. */ - for ( ; i < fdnum; i++) { - msg->msg_flags |= MSG_CTRUNC; - close_fp(fp[i]); - } - - kfree (scm->fp); - scm->fp = NULL; + __scm_destroy(scm); } struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) { - int i; struct scm_fp_list *new_fpl; + int i; if (!fpl) return NULL; - new_fpl = kmalloc(fpl->count*sizeof(int) + sizeof(*fpl), GFP_KERNEL); - if (!new_fpl) - return NULL; - - memcpy(new_fpl, fpl, fpl->count*sizeof(int) + sizeof(*fpl)); - - for (i=fpl->count-1; i>=0; i--) - fpl->fp[i]->f_count++; + new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL); + if (new_fpl) { + memcpy(new_fpl, fpl, sizeof(*fpl)); + for (i=fpl->count-1; i>=0; i--) + fpl->fp[i]->f_count++; + } return new_fpl; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6baf37c03..9180b8b54 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -64,7 +64,6 @@ static atomic_t net_skbcount = ATOMIC_INIT(0); static atomic_t net_allocs = ATOMIC_INIT(0); static atomic_t net_fails = ATOMIC_INIT(0); - extern atomic_t ip_frag_mem; /* @@ -113,23 +112,23 @@ void __kfree_skb(struct sk_buff *skb) * to be a good idea. */ -struct sk_buff *alloc_skb(unsigned int size,int priority) +struct sk_buff *alloc_skb(unsigned int size,int gfp_mask) { struct sk_buff *skb; unsigned char *bptr; int len; - if (in_interrupt() && priority!=GFP_ATOMIC) { + if (in_interrupt() && (gfp_mask & __GFP_WAIT)) { static int count = 0; if (++count < 5) { printk(KERN_ERR "alloc_skb called nonatomically " "from interrupt %p\n", __builtin_return_address(0)); - priority = GFP_ATOMIC; + gfp_mask &= ~__GFP_WAIT; } } /* - * FIXME: We could do with an architecture dependant + * FIXME: We could do with an architecture dependent * 'alignment mask'. */ @@ -144,7 +143,7 @@ struct sk_buff *alloc_skb(unsigned int size,int priority) * Allocate some space */ - bptr = kmalloc(size,priority); + bptr = kmalloc(size,gfp_mask); if (bptr == NULL) { atomic_inc(&net_fails); return NULL; @@ -226,7 +225,7 @@ void kfree_skbmem(struct sk_buff *skb) * Duplicate an sk_buff. The new one is not owned by a socket. 
*/ -struct sk_buff *skb_clone(struct sk_buff *skb, int priority) +struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) { struct sk_buff *n; int inbuff = 0; @@ -237,7 +236,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int priority) skb->inclone = SKB_CLONE_ORIG; inbuff = SKB_CLONE_INLINE; } else { - n = kmalloc(sizeof(*n), priority); + n = kmalloc(sizeof(*n), gfp_mask); if (!n) return NULL; } @@ -263,7 +262,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int priority) * This is slower, and copies the whole data area */ -struct sk_buff *skb_copy(struct sk_buff *skb, int priority) +struct sk_buff *skb_copy(struct sk_buff *skb, int gfp_mask) { struct sk_buff *n; unsigned long offset; @@ -272,7 +271,7 @@ struct sk_buff *skb_copy(struct sk_buff *skb, int priority) * Allocate the copy buffer */ - n=alloc_skb(skb->end - skb->head, priority); + n=alloc_skb(skb->end - skb->head, gfp_mask); if(n==NULL) return NULL; @@ -303,7 +302,6 @@ struct sk_buff *skb_copy(struct sk_buff *skb, int priority) n->ack_seq=skb->ack_seq; memcpy(n->cb, skb->cb, sizeof(skb->cb)); n->used=skb->used; - n->arp=skb->arp; n->tries=0; atomic_set(&n->users, 1); n->pkt_type=skb->pkt_type; @@ -354,7 +352,6 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, int newheadroom) n->end_seq=skb->end_seq; n->ack_seq=skb->ack_seq; n->used=skb->used; - n->arp=skb->arp; n->tries=0; atomic_set(&n->users, 1); n->pkt_type=skb->pkt_type; @@ -364,13 +361,3 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, int newheadroom) return n; } - -struct sk_buff *dev_alloc_skb(unsigned int length) -{ - struct sk_buff *skb; - - skb = alloc_skb(length+16, GFP_ATOMIC); - if (skb) - skb_reserve(skb,16); - return skb; -} diff --git a/net/core/sock.c b/net/core/sock.c index 725474887..6da5f5a0d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -76,6 +76,8 @@ * Steve Whitehouse: Added various other default routines * common to several socket families. * Chris Evans : Call suser() check last on F_SETOWN + * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. + * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() * * To Fix: * @@ -122,6 +124,10 @@ #include <net/icmp.h> #include <linux/ipsec.h> +#ifdef CONFIG_FILTER +#include <linux/filter.h> +#endif + #define min(a,b) ((a)<(b)?(a):(b)) /* Run time adjustable parameters. */ @@ -147,6 +153,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname, struct linger ling; struct ifreq req; int ret = 0; + +#ifdef CONFIG_FILTER + struct sock_fprog fprog; +#endif /* * Options without arguments @@ -278,48 +288,6 @@ int sock_setsockopt(struct socket *sock, int level, int optname, break; -#ifdef CONFIG_NET_SECURITY - /* - * FIXME: make these error things that are not - * available! 
- */ - - case SO_SECURITY_AUTHENTICATION: - if(val<=IPSEC_LEVEL_DEFAULT) - { - sk->authentication=val; - return 0; - } - if(net_families[sock->ops->family]->authentication) - sk->authentication=val; - else - return -EINVAL; - break; - - case SO_SECURITY_ENCRYPTION_TRANSPORT: - if(val<=IPSEC_LEVEL_DEFAULT) - { - sk->encryption=val; - return 0; - } - if(net_families[sock->ops->family]->encryption) - sk->encryption = val; - else - return -EINVAL; - break; - - case SO_SECURITY_ENCRYPTION_NETWORK: - if(val<=IPSEC_LEVEL_DEFAULT) - { - sk->encrypt_net=val; - return 0; - } - if(net_families[sock->ops->family]->encrypt_net) - sk->encrypt_net = val; - else - return -EINVAL; - break; -#endif case SO_BINDTODEVICE: /* Bind this socket to a particular device like "eth0", * as specified in an ifreq structure. If the device @@ -330,36 +298,51 @@ int sock_setsockopt(struct socket *sock, int level, int optname, sk->bound_dev_if = 0; } else { - if (copy_from_user(&req, optval, sizeof(req)) < 0) + if (copy_from_user(&req, optval, sizeof(req))) return -EFAULT; /* Remove any cached route for this socket. */ - if (sk->dst_cache) { - ip_rt_put((struct rtable*)sk->dst_cache); - sk->dst_cache = NULL; - } + dst_release(xchg(&sk->dst_cache, NULL)); if (req.ifr_ifrn.ifrn_name[0] == '\0') { sk->bound_dev_if = 0; - } - else { + } else { struct device *dev = dev_get(req.ifr_ifrn.ifrn_name); if (!dev) return -EINVAL; sk->bound_dev_if = dev->ifindex; - if (sk->daddr) { - int ret; - ret = ip_route_output((struct rtable**)&sk->dst_cache, - sk->daddr, sk->saddr, - sk->ip_tos, sk->bound_dev_if); - if (ret) - return ret; - } } } return 0; +#ifdef CONFIG_FILTER + case SO_ATTACH_FILTER: + if(optlen < sizeof(struct sock_fprog)) + return -EINVAL; + + if(copy_from_user(&fprog, optval, sizeof(fprog))) + { + ret = -EFAULT; + break; + } + + ret = sk_attach_filter(&fprog, sk); + break; + + case SO_DETACH_FILTER: + if(sk->filter) + { + fprog.filter = sk->filter_data; + kfree_s(fprog.filter, (sizeof(fprog.filter) * sk->filter)); + sk->filter_data = NULL; + sk->filter = 0; + return 0; + } + else + return -EINVAL; + break; +#endif /* We implement the SO_SNDLOWAT etc to not be settable (1003.1g 5.3) */ default: @@ -470,20 +453,6 @@ int sock_getsockopt(struct socket *sock, int level, int optname, return -EFAULT; goto lenout; -#ifdef CONFIG_NET_SECURITY - - case SO_SECURITY_AUTHENTICATION: - v.val = sk->authentication; - break; - - case SO_SECURITY_ENCRYPTION_TRANSPORT: - v.val = sk->encryption; - break; - - case SO_SECURITY_ENCRYPTION_NETWORK: - v.val = sk->encrypt_net; - break; -#endif default: return(-ENOPROTOOPT); } @@ -589,6 +558,36 @@ struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int return NULL; } +void *sock_kmalloc(struct sock *sk, int size, int priority) +{ + void *mem = NULL; + /* Always use wmem.. */ + if (atomic_read(&sk->wmem_alloc)+size < sk->sndbuf) { + /* First do the add, to avoid the race if kmalloc + * might sleep. + */ + atomic_add(size, &sk->wmem_alloc); + mem = kmalloc(size, priority); + if (mem) + return mem; + atomic_sub(size, &sk->wmem_alloc); + } + return mem; +} + +void sock_kfree_s(struct sock *sk, void *mem, int size) +{ +#if 1 /* Debug */ + if (atomic_read(&sk->wmem_alloc) < size) { + printk(KERN_DEBUG "sock_kfree_s: mem not accounted.\n"); + return; + } +#endif + kfree_s(mem, size); + atomic_sub(size, &sk->wmem_alloc); + sk->write_space(sk); +} + /* FIXME: this is insane. We are trying suppose to be controlling how * how much space we have for data bytes, not packet headers. 
@@ -627,7 +626,7 @@ unsigned long sock_wspace(struct sock *sk)
 	if (sk != NULL) {
 		if (sk->shutdown & SEND_SHUTDOWN)
 			return(0);
-		if (atomic_read(&sk->wmem_alloc) >= sk->sndbuf)
+		if (atomic_read(&sk->wmem_alloc) >= sk->sndbuf)
 			return(0);
 		return sk->sndbuf - atomic_read(&sk->wmem_alloc);
 	}
@@ -827,7 +826,7 @@ void sklist_destroy_socket(struct sock **list,struct sock *sk)
 
 	while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
 	{
-		kfree_skb(skb,FREE_READ);
+		kfree_skb(skb);
 	}
 
 	if(atomic_read(&sk->wmem_alloc) == 0 &&
@@ -895,7 +894,7 @@ int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
 	return -EOPNOTSUPP;
 }
 
-unsigned int sock_no_poll(struct socket *sock, poll_table *pt)
+unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
 {
 	return -EOPNOTSUPP;
 }
@@ -1009,8 +1008,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	init_timer(&sk->timer);
 	
 	sk->allocation = GFP_KERNEL;
-	sk->rcvbuf = sysctl_rmem_default*2;
-	sk->sndbuf = sysctl_wmem_default*2;
+	sk->rcvbuf = sysctl_rmem_default;
+	sk->sndbuf = sysctl_wmem_default;
 	sk->state = TCP_CLOSE;
 	sk->zapped = 1;
 	sk->socket = sock;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index b684fba33..1da2cc152 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -11,6 +11,11 @@
 
 #ifdef CONFIG_SYSCTL
 
+extern int netdev_max_backlog;
+extern int netdev_fastroute;
+extern int net_msg_cost;
+extern int net_msg_burst;
+
 extern __u32 sysctl_wmem_max;
 extern __u32 sysctl_rmem_max;
 extern __u32 sysctl_wmem_default;
@@ -34,6 +39,20 @@ ctl_table core_table[] = {
 	{NET_CORE_DESTROY_DELAY, "destroy_delay",
 	 &sysctl_core_destroy_delay, sizeof(int), 0644, NULL,
 	 &proc_dointvec_jiffies},
+	{NET_CORE_MAX_BACKLOG, "netdev_max_backlog",
+	 &netdev_max_backlog, sizeof(int), 0644, NULL,
+	 &proc_dointvec},
+#ifdef CONFIG_NET_FASTROUTE
+	{NET_CORE_FASTROUTE, "netdev_fastroute",
+	 &netdev_fastroute, sizeof(int), 0644, NULL,
+	 &proc_dointvec},
+#endif
+	{NET_CORE_MSG_COST, "message_cost",
+	 &net_msg_cost, sizeof(int), 0644, NULL,
+	 &proc_dointvec_jiffies},
+	{NET_CORE_MSG_BURST, "message_burst",
+	 &net_msg_burst, sizeof(int), 0644, NULL,
+	 &proc_dointvec_jiffies},
 	{ 0 }
 };
 #endif
diff --git a/net/core/utils.c b/net/core/utils.c
new file mode 100644
index 000000000..415926b8e
--- /dev/null
+++ b/net/core/utils.c
@@ -0,0 +1,66 @@
+/*
+ *	Generic address resolution entity
+ *
+ *	Authors:
+ *	net_random	Alan Cox
+ *	net_ratelimit	Andi Kleen
+ *
+ *	Created by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+
+static unsigned long net_rand_seed = 152L;
+
+unsigned long net_random(void)
+{
+	net_rand_seed=net_rand_seed*69069L+1;
+	return net_rand_seed^jiffies;
+}
+
+void net_srandom(unsigned long entropy)
+{
+	net_rand_seed ^= entropy;
+	net_random();
+}
+
+int net_msg_cost = 5*HZ;
+int net_msg_burst = 10*5*HZ;
+
+/*
+ * This enforces a rate limit: not more than one kernel message
+ * every 5 seconds, to make a denial-of-service attack impossible.
+ *
+ * All warning printk()s should be guarded by this function.
+ */ +int net_ratelimit(void) +{ + static unsigned long toks = 10*5*HZ; + static unsigned long last_msg; + static int missed; + unsigned long now = jiffies; + + toks += now - xchg(&last_msg, now); + if (toks > net_msg_burst) + toks = net_msg_burst; + if (toks >= net_msg_cost) { + toks -= net_msg_cost; + if (missed) + printk(KERN_WARNING "NET: %d messages suppressed.\n", missed); + missed = 0; + return 1; + } + missed++; + return 0; +} |