From 06615f62b17d7de6e12d2f5ec6b88cf30af08413 Mon Sep 17 00:00:00 2001
From: Ralf Baechle
Date: Thu, 23 Nov 2000 02:00:47 +0000
Subject: Merge with Linux 2.4.0-test10.

---
 net/core/Makefile          |   4 +
 net/core/dev.c             | 195 +++++++++++++++-
 net/core/dev_mcast.c       |  54 +++--
 net/core/dv.c              | 551 +++++++++++++++++++++++++++++++++++++++++++++
 net/core/sysctl_net_core.c |  25 ++
 5 files changed, 793 insertions(+), 36 deletions(-)
 create mode 100644 net/core/dv.c

(limited to 'net/core')

diff --git a/net/core/Makefile b/net/core/Makefile
index 7ee0db3fd..af3c74091 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -29,6 +29,10 @@ ifdef CONFIG_NETFILTER
 OX_OBJS += netfilter.o
 endif
 
+ifeq ($(CONFIG_NET_DIVERT),y)
+O_OBJS += dv.o
+endif
+
 endif
 
 ifdef CONFIG_NET_PROFILE
diff --git a/net/core/dev.c b/net/core/dev.c
index e6f440cf4..17fae7a1e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -59,6 +59,8 @@
  *	Paul Rusty Russell	:	SIOCSIFNAME
  *	Pekka Riikonen		:	Netdev boot-time settings code
  *	Andrew Morton		:	Make unregister_netdevice wait indefinitely on dev->refcnt
+ *	J Hadi Salim		:	- Backlog queue sampling
+ *					- netif_rx() feedback
  */
 
 #include 
@@ -85,6 +87,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -97,6 +100,18 @@
 extern int plip_init(void);
 #endif
 
+/* This define, if set, will randomly drop a packet when congestion
+ * is more than moderate. It helps fairness in the multi-interface
+ * case when one of them is a hog, but it kills performance for the
+ * single interface case so it is off now by default.
+ */
+#undef RAND_LIE
+
+/* Setting this will sample the queue lengths and thus congestion
+ * via a timer instead of as each packet is received.
+ */
+#undef OFFLINE_SAMPLE
+
 NET_PROFILE_DEFINE(dev_queue_xmit)
 NET_PROFILE_DEFINE(softnet_process)
@@ -133,6 +148,11 @@ const char *if_port_text[] = {
 static struct packet_type *ptype_base[16];	/* 16 way hashed list */
 static struct packet_type *ptype_all = NULL;	/* Taps */
 
+#ifdef OFFLINE_SAMPLE
+static void sample_queue(unsigned long dummy);
+static struct timer_list samp_timer = { function: sample_queue };
+#endif
+
 /*
  *	Our notifier list
  */
@@ -933,12 +953,20 @@ int dev_queue_xmit(struct sk_buff *skb)
 =======================================================================*/
 
 int netdev_max_backlog = 300;
+/* These numbers are selected based on intuition and some
+ * experimentatiom, if you have more scientific way of doing this
+ * please go ahead and fix things.
+ */
+int no_cong_thresh = 10;
+int no_cong = 20;
+int lo_cong = 100;
+int mod_cong = 290;
 
 struct netif_rx_stats netdev_rx_stat[NR_CPUS];
 
 #ifdef CONFIG_NET_HW_FLOWCONTROL
-static atomic_t netdev_dropping = ATOMIC_INIT(0);
+atomic_t netdev_dropping = ATOMIC_INIT(0);
 static unsigned long netdev_fc_mask = 1;
 unsigned long netdev_fc_xoff = 0;
 spinlock_t netdev_fc_lock = SPIN_LOCK_UNLOCKED;
@@ -996,6 +1024,56 @@ static void netdev_wakeup(void)
 }
 #endif
 
+static void get_sample_stats(int cpu)
+{
+#ifdef RAND_LIE
+	unsigned long rd;
+	int rq;
+#endif
+	int blog = softnet_data[cpu].input_pkt_queue.qlen;
+	int avg_blog = softnet_data[cpu].avg_blog;
+
+	avg_blog = (avg_blog >> 1)+ (blog >> 1);
+
+	if (avg_blog > mod_cong) {
+		/* Above moderate congestion levels. */
+		softnet_data[cpu].cng_level = NET_RX_CN_HIGH;
+#ifdef RAND_LIE
+		rd = net_random();
+		rq = rd % netdev_max_backlog;
+		if (rq < avg_blog) /* unlucky bastard */
+			softnet_data[cpu].cng_level = NET_RX_DROP;
+#endif
+	} else if (avg_blog > lo_cong) {
+		softnet_data[cpu].cng_level = NET_RX_CN_MOD;
+#ifdef RAND_LIE
+		rd = net_random();
+		rq = rd % netdev_max_backlog;
+		if (rq < avg_blog) /* unlucky bastard */
+			softnet_data[cpu].cng_level = NET_RX_CN_HIGH;
+#endif
+	} else if (avg_blog > no_cong)
+		softnet_data[cpu].cng_level = NET_RX_CN_LOW;
+	else /* no congestion */
+		softnet_data[cpu].cng_level = NET_RX_SUCCESS;
+
+	softnet_data[cpu].avg_blog = avg_blog;
+}
+
+#ifdef OFFLINE_SAMPLE
+static void sample_queue(unsigned long dummy)
+{
+/* 10 ms 0r 1ms -- i dont care -- JHS */
+	int next_tick = 1;
+	int cpu = smp_processor_id();
+
+	get_sample_stats(cpu);
+	next_tick += jiffies;
+	mod_timer(&samp_timer, next_tick);
+}
+#endif
+
+
 /**
  *	netif_rx	-	post buffer to the network code
  *	@skb: buffer to post
@@ -1004,9 +1082,18 @@ static void netdev_wakeup(void)
  *	the upper (protocol) levels to process. It always succeeds. The buffer
  *	may be dropped during processing for congestion control or by the
  *	protocol layers.
+ *
+ *	return values:
+ *	NET_RX_SUCCESS	(no congestion)
+ *	NET_RX_CN_LOW	(low congestion)
+ *	NET_RX_CN_MOD	(moderate congestion)
+ *	NET_RX_CN_HIGH	(high congestion)
+ *	NET_RX_DROP	(packet was dropped)
+ *
+ *
  */
 
-void netif_rx(struct sk_buff *skb)
+int netif_rx(struct sk_buff *skb)
 {
 	int this_cpu = smp_processor_id();
 	struct softnet_data *queue;
@@ -1036,7 +1123,10 @@ enqueue:
 			__skb_queue_tail(&queue->input_pkt_queue,skb);
 			__cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
 			local_irq_restore(flags);
-			return;
+#ifndef OFFLINE_SAMPLE
+			get_sample_stats(this_cpu);
+#endif
+			return softnet_data[this_cpu].cng_level;
 		}
 
 		if (queue->throttle) {
@@ -1062,19 +1152,22 @@ drop:
 	local_irq_restore(flags);
 
 	kfree_skb(skb);
+	return NET_RX_DROP;
 }
 
 /* Deliver skb to an old protocol, which is not threaded well
    or which do not understand shared skbs.
  */
-static void deliver_to_old_ones(struct packet_type *pt, struct sk_buff *skb, int last)
+static int deliver_to_old_ones(struct packet_type *pt, struct sk_buff *skb, int last)
 {
 	static spinlock_t net_bh_lock = SPIN_LOCK_UNLOCKED;
+	int ret = NET_RX_DROP;
+
 	if (!last) {
 		skb = skb_clone(skb, GFP_ATOMIC);
 		if (skb == NULL)
-			return;
+			return ret;
 	}
 
 	/* The assumption (correct one) is that old protocols
@@ -1087,10 +1180,11 @@ static void deliver_to_old_ones(struct packet_type *pt, struct sk_buff *skb, int
 	/* Disable timers and wait for all timers completion */
 	tasklet_disable(bh_task_vec+TIMER_BH);
 
-	pt->func(skb, skb->dev, pt);
+	ret = pt->func(skb, skb->dev, pt);
 
 	tasklet_enable(bh_task_vec+TIMER_BH);
 	spin_unlock(&net_bh_lock);
+	return ret;
 }
 
 /* Reparent skb to master device. This function is called
@@ -1173,22 +1267,35 @@ void net_call_rx_atomic(void (*fn)(void))
 void (*br_handle_frame_hook)(struct sk_buff *skb) = NULL;
 #endif
 
-static void __inline__ handle_bridge(struct sk_buff *skb,
+static int __inline__ handle_bridge(struct sk_buff *skb,
 				     struct packet_type *pt_prev)
 {
+	int ret = NET_RX_DROP;
+
 	if (pt_prev) {
 		if (!pt_prev->data)
-			deliver_to_old_ones(pt_prev, skb, 0);
+			ret = deliver_to_old_ones(pt_prev, skb, 0);
 		else {
 			atomic_inc(&skb->users);
-			pt_prev->func(skb, skb->dev, pt_prev);
+			ret = pt_prev->func(skb, skb->dev, pt_prev);
 		}
 	}
 
 	br_handle_frame_hook(skb);
+	return ret;
 }
 
+#ifdef CONFIG_NET_DIVERT
+static inline void handle_diverter(struct sk_buff *skb)
+{
+	/* if diversion is supported on device, then divert */
+	if (skb->dev->divert && skb->dev->divert->divert)
+		divert_frame(skb);
+}
+#endif /* CONFIG_NET_DIVERT */
+
+
 static void net_rx_action(struct softirq_action *h)
 {
 	int this_cpu = smp_processor_id();
@@ -1239,6 +1346,12 @@ static void net_rx_action(struct softirq_action *h)
 				}
 			}
 
+#ifdef CONFIG_NET_DIVERT
+			if (skb->dev->divert && skb->dev->divert->divert)
+				handle_diverter(skb);
+#endif /* CONFIG_NET_DIVERT */
+
+
 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
 			if (skb->dev->br_port != NULL &&
 			    br_handle_frame_hook != NULL) {
@@ -1275,6 +1388,17 @@ static void net_rx_action(struct softirq_action *h)
 
 		if (bugdet-- < 0 || jiffies - start_time > 1)
 			goto softnet_break;
+
+#ifdef CONFIG_NET_HW_FLOWCONTROL
+		if (queue->throttle && queue->input_pkt_queue.qlen < no_cong_thresh ) {
+			if (atomic_dec_and_test(&netdev_dropping)) {
+				queue->throttle = 0;
+				netdev_wakeup();
+				goto softnet_break;
+			}
+		}
+#endif
+
 	}
 	br_read_unlock(BR_NETPROTO_LOCK);
 
@@ -2113,9 +2237,9 @@ int dev_ioctl(unsigned int cmd, void *arg)
 /**
  *	dev_new_index	-	allocate an ifindex
  *
- *	Returns a suitable unique value for a new device interface number.
- *	The caller must hold the rtnl semaphore to be sure it remains
- *	unique.
+ *	Returns a suitable unique value for a new device interface
+ *	number. The caller must hold the rtnl semaphore or the
+ *	dev_base_lock to be sure it remains unique.
  */
 
 int dev_new_index(void)
@@ -2140,6 +2264,10 @@ static int dev_boot_phase = 1;
  *	chain. 0 is returned on success. A negative errno code is returned
  *	on a failure to set up the device, or if the name is a duplicate.
  *
+ *	Callers must hold the rtnl semaphore. See the comment at the
+ *	end of Space.c for details about the locking. You may want
+ *	register_netdev() instead of this.
+ *
  *	BUGS:
  *	The locking appears insufficient to guarantee two parallel registers
  *	will not get the same name.
@@ -2148,6 +2276,9 @@ static int dev_boot_phase = 1;
 int register_netdevice(struct net_device *dev)
 {
 	struct net_device *d, **dp;
+#ifdef CONFIG_NET_DIVERT
+	int ret;
+#endif
 
 	spin_lock_init(&dev->queue_lock);
 	spin_lock_init(&dev->xmit_lock);
@@ -2182,6 +2313,12 @@ int register_netdevice(struct net_device *dev)
 	dev_hold(dev);
 	write_unlock_bh(&dev_base_lock);
 
+#ifdef CONFIG_NET_DIVERT
+	ret = alloc_divert_blk(dev);
+	if (ret)
+		return ret;
+#endif /* CONFIG_NET_DIVERT */
+
 	/*
 	 *	Default initial state at registry is that the
 	 *	device is present.
@@ -2231,6 +2368,12 @@ int register_netdevice(struct net_device *dev)
 	dev->deadbeaf = 0;
 	write_unlock_bh(&dev_base_lock);
 
+#ifdef CONFIG_NET_DIVERT
+	ret = alloc_divert_blk(dev);
+	if (ret)
+		return ret;
+#endif /* CONFIG_NET_DIVERT */
+
 	/* Notify protocols, that a new device appeared. */
 	notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
 
@@ -2272,6 +2415,10 @@ int netdev_finish_unregister(struct net_device *dev)
  *	This function shuts down a device interface and removes it
  *	from the kernel tables. On success 0 is returned, on a failure
  *	a negative errno code is returned.
+ *
+ *	Callers must hold the rtnl semaphore. See the comment at the
+ *	end of Space.c for details about the locking. You may want
+ *	unregister_netdev() instead of this.
  */
 
 int unregister_netdevice(struct net_device *dev)
@@ -2325,6 +2472,10 @@ int unregister_netdevice(struct net_device *dev)
 	/* Notifier chain MUST detach us from master device. */
 	BUG_TRAP(dev->master==NULL);
 
+#ifdef CONFIG_NET_DIVERT
+	free_divert_blk(dev);
+#endif
+
 	if (dev->new_style) {
 #ifdef NET_REFCNT_DEBUG
 		if (atomic_read(&dev->refcnt) != 1)
@@ -2397,7 +2548,15 @@ int unregister_netdevice(struct net_device *dev)
 extern void net_device_init(void);
 extern void ip_auto_config(void);
 
+#ifdef CONFIG_NET_DIVERT
+extern void dv_init(void);
+#endif /* CONFIG_NET_DIVERT */
+
+/*
+ *	Callers must hold the rtnl semaphore. See the comment at the
+ *	end of Space.c for details about the locking.
+ */
 int __init net_dev_init(void)
 {
 	struct net_device *dev, **dp;
@@ -2407,6 +2566,10 @@ int __init net_dev_init(void)
 	pktsched_init();
 #endif
 
+#ifdef CONFIG_NET_DIVERT
+	dv_init();
+#endif /* CONFIG_NET_DIVERT */
+
 	/*
 	 *	Initialise the packet receive queues.
 	 */
@@ -2417,6 +2580,8 @@ int __init net_dev_init(void)
 		queue = &softnet_data[i];
 		skb_queue_head_init(&queue->input_pkt_queue);
 		queue->throttle = 0;
+		queue->cng_level = 0;
+		queue->avg_blog = 10; /* arbitrary non-zero */
 		queue->completion_queue = NULL;
 	}
 
@@ -2425,6 +2590,12 @@ int __init net_dev_init(void)
 	NET_PROFILE_REGISTER(dev_queue_xmit);
 	NET_PROFILE_REGISTER(softnet_process);
 #endif
+
+#ifdef OFFLINE_SAMPLE
+	samp_timer.expires = jiffies + (10 * HZ);
+	add_timer(&samp_timer);
+#endif
+
 	/*
 	 *	Add the devices.
 	 *	If the call to dev->init fails, the dev is removed
diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c
index cf590db04..6b24a1416 100644
--- a/net/core/dev_mcast.c
+++ b/net/core/dev_mcast.c
@@ -13,6 +13,7 @@
  *		rather than any time but...
  *	Alan Cox	:	IFF_ALLMULTI support.
  *	Alan Cox	:	New format set_multicast_list() calls.
+ *	Gleb Natapov	:	Remove dev_mc_lock.
  *
  *	This program is free software; you can redistribute it and/or
  *	modify it under the terms of the GNU General Public License
@@ -59,16 +60,14 @@
  *	Device mc lists are changed by bh at least if IPv6 is enabled,
  *	so that it must be bh protected.
  *
- *	We protect all mc lists with global rw lock
- *	and block accesses to device mc filters with dev->xmit_lock.
+ *	We block accesses to device mc filters with dev->xmit_lock.
  */
 
-static rwlock_t dev_mc_lock = RW_LOCK_UNLOCKED;
 
 /*
  *	Update the multicast list into the physical NIC controller.
  */
 
-void dev_mc_upload(struct net_device *dev)
+static void __dev_mc_upload(struct net_device *dev)
 {
 	/* Don't do anything till we up the interface
 	 * [dev_open will call this function so the list will
@@ -87,13 +86,14 @@ void dev_mc_upload(struct net_device *dev)
 	    !netif_device_present(dev))
 		return;
 
-	read_lock_bh(&dev_mc_lock);
-	spin_lock(&dev->xmit_lock);
-	dev->xmit_lock_owner = smp_processor_id();
 	dev->set_multicast_list(dev);
-	dev->xmit_lock_owner = -1;
-	spin_unlock(&dev->xmit_lock);
-	read_unlock_bh(&dev_mc_lock);
+}
+
+void dev_mc_upload(struct net_device *dev)
+{
+	spin_lock_bh(&dev->xmit_lock);
+	__dev_mc_upload(dev);
+	spin_unlock_bh(&dev->xmit_lock);
 }
 
 /*
@@ -105,7 +105,8 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl)
 	int err = 0;
 	struct dev_mc_list *dmi, **dmip;
 
-	write_lock_bh(&dev_mc_lock);
+	spin_lock_bh(&dev->xmit_lock);
+
 	for (dmip = &dev->mc_list; (dmi = *dmip) != NULL; dmip = &dmi->next) {
 		/*
 		 *	Find the entry we want to delete. The device could
@@ -127,7 +128,6 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl)
 			 */
 			*dmip = dmi->next;
 			dev->mc_count--;
-			write_unlock_bh(&dev_mc_lock);
 
 			kfree(dmi);
 
@@ -135,13 +135,15 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl)
 			 *	We have altered the list, so the card
 			 *	loaded filter is now wrong. Fix it
 			 */
-			dev_mc_upload(dev);
+			__dev_mc_upload(dev);
+
+			spin_unlock_bh(&dev->xmit_lock);
 			return 0;
 		}
 	}
 	err = -ENOENT;
 done:
-	write_unlock_bh(&dev_mc_lock);
+	spin_unlock_bh(&dev->xmit_lock);
 	return err;
 }
 
@@ -156,7 +158,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
 
 	dmi1 = (struct dev_mc_list *)kmalloc(sizeof(*dmi), GFP_ATOMIC);
 
-	write_lock_bh(&dev_mc_lock);
+	spin_lock_bh(&dev->xmit_lock);
 	for (dmi = dev->mc_list; dmi != NULL; dmi = dmi->next) {
 		if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 &&
 		    dmi->dmi_addrlen == alen) {
@@ -172,7 +174,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
 	}
 
 	if ((dmi = dmi1) == NULL) {
-		write_unlock_bh(&dev_mc_lock);
+		spin_unlock_bh(&dev->xmit_lock);
 		return -ENOMEM;
 	}
 	memcpy(dmi->dmi_addr, addr, alen);
@@ -182,12 +184,14 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
 	dmi->dmi_gusers = glbl ? 1 : 0;
 	dev->mc_list = dmi;
 	dev->mc_count++;
-	write_unlock_bh(&dev_mc_lock);
-	dev_mc_upload(dev);
+
+	__dev_mc_upload(dev);
+
+	spin_unlock_bh(&dev->xmit_lock);
 	return 0;
 
 done:
-	write_unlock_bh(&dev_mc_lock);
+	spin_unlock_bh(&dev->xmit_lock);
 	if (dmi1)
 		kfree(dmi1);
 	return err;
@@ -199,7 +203,8 @@ done:
 
 void dev_mc_discard(struct net_device *dev)
 {
-	write_lock_bh(&dev_mc_lock);
+	spin_lock_bh(&dev->xmit_lock);
+
 	while (dev->mc_list != NULL) {
 		struct dev_mc_list *tmp = dev->mc_list;
 		dev->mc_list = tmp->next;
@@ -208,7 +213,8 @@ void dev_mc_discard(struct net_device *dev)
 		kfree(tmp);
 	}
 	dev->mc_count = 0;
-	write_unlock_bh(&dev_mc_lock);
+
+	spin_unlock_bh(&dev->xmit_lock);
 }
 
 #ifdef CONFIG_PROC_FS
@@ -222,7 +228,7 @@ static int dev_mc_read_proc(char *buffer, char **start, off_t offset,
 
 	read_lock(&dev_base_lock);
 	for (dev = dev_base; dev; dev = dev->next) {
-		read_lock_bh(&dev_mc_lock);
+		spin_lock_bh(&dev->xmit_lock);
 		for (m = dev->mc_list; m; m = m->next) {
 			int i;
 
@@ -240,11 +246,11 @@ static int dev_mc_read_proc(char *buffer, char **start, off_t offset,
 				begin = pos;
 			}
 			if (pos > offset + length) {
-				read_unlock_bh(&dev_mc_lock);
+				spin_unlock_bh(&dev->xmit_lock);
 				goto done;
 			}
 		}
-		read_unlock_bh(&dev_mc_lock);
+		spin_unlock_bh(&dev->xmit_lock);
 	}
 	*eof = 1;
 
diff --git a/net/core/dv.c b/net/core/dv.c
new file mode 100644
index 000000000..4df7747b8
--- /dev/null
+++ b/net/core/dv.c
@@ -0,0 +1,551 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Generic frame diversion
+ *
+ * Version:	@(#)eth.c	0.41	09/09/2000
+ *
+ * Authors:
+ * 		Benoit LOCHER:	initial integration within the kernel with support for ethernet
+ * 		Dave Miller:	improvement on the code (correctness, performance and source files)
+ *
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+const char sysctl_divert_version[32]="0.46";	/* Current version */
+
+int __init dv_init(void)
+{
+	printk(KERN_INFO "NET4: Frame Diverter %s\n", sysctl_divert_version);
+	return 0;
+}
+
+/*
+ * Allocate a divert_blk for a device. This must be an ethernet nic.
+ */
+int alloc_divert_blk(struct net_device *dev)
+{
+	int alloc_size = (sizeof(struct divert_blk) + 3) & ~3;
+
+	if (!strncmp(dev->name, "eth", 3)) {
+		printk(KERN_DEBUG "divert: allocating divert_blk for %s\n",
+		       dev->name);
+
+		dev->divert = (struct divert_blk *)
+			kmalloc(alloc_size, GFP_KERNEL);
+		if (dev->divert == NULL) {
+			printk(KERN_DEBUG "divert: unable to allocate divert_blk for %s\n",
+			       dev->name);
+			return -EFAULT;
+		} else {
+			memset(dev->divert, 0, sizeof(struct divert_blk));
+		}
+	} else {
+		printk(KERN_DEBUG "divert: not allocating divert_blk for non-ethernet device %s\n",
+		       dev->name);
+
+		dev->divert = NULL;
+	}
+	return 0;
+}
+
+/*
+ * Free a divert_blk allocated by the above function, if it was
+ * allocated on that device.
+ */
+void free_divert_blk(struct net_device *dev)
+{
+	if (dev->divert) {
+		kfree(dev->divert);
+		dev->divert=NULL;
+		printk(KERN_DEBUG "divert: freeing divert_blk for %s\n",
+		       dev->name);
+	} else {
+		printk(KERN_DEBUG "divert: no divert_blk to free, %s not ethernet\n",
+		       dev->name);
+	}
+}
+
+/*
+ * Adds a tcp/udp (source or dest) port to an array
+ */
+int add_port(u16 ports[], u16 port)
+{
+	int i;
+
+	if (port == 0)
+		return -EINVAL;
+
+	/* Storing directly in network format for performance,
+	 * thanks Dave :)
+	 */
+	port = htons(port);
+
+	for (i = 0; i < MAX_DIVERT_PORTS; i++) {
+		if (ports[i] == port)
+			return -EALREADY;
+	}
+
+	for (i = 0; i < MAX_DIVERT_PORTS; i++) {
+		if (ports[i] == 0) {
+			ports[i] = port;
+			return 0;
+		}
+	}
+
+	return -ENOBUFS;
+}
+
+/*
+ * Removes a port from an array tcp/udp (source or dest)
+ */
+int remove_port(u16 ports[], u16 port)
+{
+	int i;
+
+	if (port == 0)
+		return -EINVAL;
+
+	/* Storing directly in network format for performance,
+	 * thanks Dave !
+	 */
+	port = htons(port);
+
+	for (i = 0; i < MAX_DIVERT_PORTS; i++) {
+		if (ports[i] == port) {
+			ports[i] = 0;
+			return 0;
+		}
+	}
+
+	return -EINVAL;
+}
+
+/* Some basic sanity checks on the arguments passed to divert_ioctl() */
+int check_args(struct divert_cf *div_cf, struct net_device **dev)
+{
+	char devname[32];
+
+	if (dev == NULL)
+		return -EFAULT;
+
+	/* GETVERSION: all other args are unused */
+	if (div_cf->cmd == DIVCMD_GETVERSION)
+		return 0;
+
+	/* Network device index should reasonably be between 0 and 1000 :) */
+	if (div_cf->dev_index < 0 || div_cf->dev_index > 1000)
+		return -EINVAL;
+
+	/* Let's try to find the ifname */
+	sprintf(devname, "eth%d", div_cf->dev_index);
+	*dev = dev_get_by_name(devname);
+
+	/* dev should NOT be null */
+	if (*dev == NULL)
+		return -EINVAL;
+
+	/* user issuing the ioctl must be a super one :) */
+	if (!suser())
+		return -EPERM;
+
+	/* Device must have a divert_blk member NOT null */
+	if ((*dev)->divert == NULL)
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * control function of the diverter
+ */
+#define	DVDBG(a)	\
+	printk(KERN_DEBUG "divert_ioctl() line %d %s\n", __LINE__, (a))
+
+int divert_ioctl(unsigned int cmd, struct divert_cf *arg)
+{
+	struct divert_cf	div_cf;
+	struct divert_blk	*div_blk;
+	struct net_device	*dev;
+	int			ret;
+
+	switch (cmd) {
+	case SIOCGIFDIVERT:
+		DVDBG("SIOCGIFDIVERT, copy_from_user");
+		if (copy_from_user(&div_cf, arg, sizeof(struct divert_cf)))
+			return -EFAULT;
+		DVDBG("before check_args");
+		ret = check_args(&div_cf, &dev);
+		if (ret)
+			return ret;
+		DVDBG("after checkargs");
+		div_blk = dev->divert;
+
+		DVDBG("befre switch()");
+		switch (div_cf.cmd) {
+		case DIVCMD_GETSTATUS:
+			/* Now, just give the user the raw divert block
+			 * for him to play with :)
+			 */
+			if (copy_to_user(div_cf.arg1.ptr, dev->divert,
+					 sizeof(struct divert_blk)))
+				return -EFAULT;
+			break;
+
+		case DIVCMD_GETVERSION:
+			DVDBG("GETVERSION: checking ptr");
+			if (div_cf.arg1.ptr == NULL)
+				return -EINVAL;
+			DVDBG("GETVERSION: copying data to userland");
+			if (copy_to_user(div_cf.arg1.ptr,
+					 sysctl_divert_version, 32))
+				return -EFAULT;
+			DVDBG("GETVERSION: data copied");
+			break;
+
+		default:
+			return -EINVAL;
+		};
+
+		break;
+
+	case SIOCSIFDIVERT:
+		if (copy_from_user(&div_cf, arg, sizeof(struct divert_cf)))
+			return -EFAULT;
+
+		ret = check_args(&div_cf, &dev);
+		if (ret)
+			return ret;
+
+		div_blk = dev->divert;
+
+		switch(div_cf.cmd) {
+		case DIVCMD_RESET:
+			div_blk->divert = 0;
+			div_blk->protos = DIVERT_PROTO_NONE;
+			memset(div_blk->tcp_dst, 0,
+			       MAX_DIVERT_PORTS * sizeof(u16));
+			memset(div_blk->tcp_src, 0,
+			       MAX_DIVERT_PORTS * sizeof(u16));
+			memset(div_blk->udp_dst, 0,
+			       MAX_DIVERT_PORTS * sizeof(u16));
+			memset(div_blk->udp_src, 0,
+			       MAX_DIVERT_PORTS * sizeof(u16));
+			return 0;
+
+		case DIVCMD_DIVERT:
+			switch(div_cf.arg1.int32) {
+			case DIVARG1_ENABLE:
+				if (div_blk->divert)
+					return -EALREADY;
+				div_blk->divert = 1;
+				break;
+
+			case DIVARG1_DISABLE:
+				if (!div_blk->divert)
+					return -EALREADY;
+				div_blk->divert = 0;
+				break;
+
+			default:
+				return -EINVAL;
+			};
+
+			break;
+
+		case DIVCMD_IP:
+			switch(div_cf.arg1.int32) {
+			case DIVARG1_ENABLE:
+				if (div_blk->protos & DIVERT_PROTO_IP)
+					return -EALREADY;
+				div_blk->protos |= DIVERT_PROTO_IP;
+				break;
+
+			case DIVARG1_DISABLE:
+				if (!(div_blk->protos & DIVERT_PROTO_IP))
+					return -EALREADY;
+				div_blk->protos &= ~DIVERT_PROTO_IP;
+				break;
+
+			default:
+				return -EINVAL;
+			};
+
+			break;
+
+		case DIVCMD_TCP:
+			switch(div_cf.arg1.int32) {
+			case DIVARG1_ENABLE:
+				if (div_blk->protos & DIVERT_PROTO_TCP)
+					return -EALREADY;
+				div_blk->protos |= DIVERT_PROTO_TCP;
+				break;
+
+			case DIVARG1_DISABLE:
+				if (!(div_blk->protos & DIVERT_PROTO_TCP))
+					return -EALREADY;
+				div_blk->protos &= ~DIVERT_PROTO_TCP;
+				break;
+
+			default:
+				return -EINVAL;
+			};
+
+			break;
+
+		case DIVCMD_TCPDST:
+			switch(div_cf.arg1.int32) {
+			case DIVARG1_ADD:
+				return add_port(div_blk->tcp_dst,
+						div_cf.arg2.uint16);
+
+			case DIVARG1_REMOVE:
+				return remove_port(div_blk->tcp_dst,
+						   div_cf.arg2.uint16);
+
+			default:
+				return -EINVAL;
+			};
+
+			break;
+
+		case DIVCMD_TCPSRC:
+			switch(div_cf.arg1.int32) {
+			case DIVARG1_ADD:
+				return add_port(div_blk->tcp_src,
+						div_cf.arg2.uint16);
+
+			case DIVARG1_REMOVE:
+				return remove_port(div_blk->tcp_src,
+						   div_cf.arg2.uint16);
+
+			default:
+				return -EINVAL;
+			};
+
+			break;
+
+		case DIVCMD_UDP:
+			switch(div_cf.arg1.int32) {
+			case DIVARG1_ENABLE:
+				if (div_blk->protos & DIVERT_PROTO_UDP)
+					return -EALREADY;
+				div_blk->protos |= DIVERT_PROTO_UDP;
+				break;
+
+			case DIVARG1_DISABLE:
+				if (!(div_blk->protos & DIVERT_PROTO_UDP))
+					return -EALREADY;
+				div_blk->protos &= ~DIVERT_PROTO_UDP;
+				break;
+
+			default:
+				return -EINVAL;
+			};
+
+			break;
+
+		case DIVCMD_UDPDST:
+			switch(div_cf.arg1.int32) {
+			case DIVARG1_ADD:
+				return add_port(div_blk->udp_dst,
+						div_cf.arg2.uint16);
+
+			case DIVARG1_REMOVE:
+				return remove_port(div_blk->udp_dst,
+						   div_cf.arg2.uint16);
+
+			default:
+				return -EINVAL;
+			};
+
+			break;
+
+		case DIVCMD_UDPSRC:
+			switch(div_cf.arg1.int32) {
+			case DIVARG1_ADD:
+				return add_port(div_blk->udp_src,
+						div_cf.arg2.uint16);
+
+			case DIVARG1_REMOVE:
+				return remove_port(div_blk->udp_src,
+						   div_cf.arg2.uint16);
+
+			default:
+				return -EINVAL;
+			};
+
+			break;
+
+		case DIVCMD_ICMP:
+			switch(div_cf.arg1.int32) {
+			case DIVARG1_ENABLE:
+				if (div_blk->protos & DIVERT_PROTO_ICMP)
+					return -EALREADY;
+				div_blk->protos |= DIVERT_PROTO_ICMP;
+				break;
+
+			case DIVARG1_DISABLE:
+				if (!(div_blk->protos & DIVERT_PROTO_ICMP))
+					return -EALREADY;
+				div_blk->protos &= ~DIVERT_PROTO_ICMP;
+				break;
+
+			default:
+				return -EINVAL;
+			};
+
+			break;
+
+		default:
+			return -EINVAL;
+		};
+
+		break;
+
+	default:
+		return -EINVAL;
+	};
+
+	return 0;
+}
+
+
+/*
+ * Check if packet should have its dest mac address set to the box itself
+ * for diversion
+ */
+
+#define	ETH_DIVERT_FRAME(skb) \
+	memcpy(skb->mac.ethernet, skb->dev->dev_addr, ETH_ALEN); \
+	skb->pkt_type=PACKET_HOST
+
+void divert_frame(struct sk_buff *skb)
+{
+	struct ethhdr			*eth = skb->mac.ethernet;
+	struct iphdr			*iph;
+	struct tcphdr			*tcph;
+	struct udphdr			*udph;
+	struct divert_blk		*divert = skb->dev->divert;
+	int				i, src, dst;
+	unsigned char			*skb_data_end = skb->data + skb->len;
+
+	/* Packet is already aimed at us, return */
+	if (!memcmp(eth, skb->dev->dev_addr, ETH_ALEN))
+		return;
+
+	/* proto is not IP, do nothing */
+	if (eth->h_proto != htons(ETH_P_IP))
+		return;
+
+	/* Divert all IP frames ? */
+	if (divert->protos & DIVERT_PROTO_IP) {
+		ETH_DIVERT_FRAME(skb);
+		return;
+	}
+
+	/* Check for possible (maliciously) malformed IP frame (thanks Dave) */
+	iph = (struct iphdr *) skb->data;
+	if (((iph->ihl<<2)+(unsigned char*)(iph)) >= skb_data_end) {
+		printk(KERN_INFO "divert: malformed IP packet !\n");
+		return;
+	}
+
+	switch (iph->protocol) {
+	/* Divert all ICMP frames ? */
+	case IPPROTO_ICMP:
+		if (divert->protos & DIVERT_PROTO_ICMP) {
+			ETH_DIVERT_FRAME(skb);
+			return;
+		}
+		break;
+
+	/* Divert all TCP frames ? */
+	case IPPROTO_TCP:
+		if (divert->protos & DIVERT_PROTO_TCP) {
+			ETH_DIVERT_FRAME(skb);
+			return;
+		}
+
+		/* Check for possible (maliciously) malformed IP
+		 * frame (thanx Dave)
+		 */
+		tcph = (struct tcphdr *)
+			(((unsigned char *)iph) + (iph->ihl<<2));
+		if (((unsigned char *)(tcph+1)) >= skb_data_end) {
+			printk(KERN_INFO "divert: malformed TCP packet !\n");
+			return;
+		}
+
+		/* Divert some tcp dst/src ports only ?*/
+		for (i = 0; i < MAX_DIVERT_PORTS; i++) {
+			dst = divert->tcp_dst[i];
+			src = divert->tcp_src[i];
+			if ((dst && dst == tcph->dest) ||
+			    (src && src == tcph->source)) {
+				ETH_DIVERT_FRAME(skb);
+				return;
+			}
+		}
+		break;
+
+	/* Divert all UDP frames ? */
+	case IPPROTO_UDP:
+		if (divert->protos & DIVERT_PROTO_UDP) {
+			ETH_DIVERT_FRAME(skb);
+			return;
+		}
+
+		/* Check for possible (maliciously) malformed IP
+		 * packet (thanks Dave)
+		 */
+		udph = (struct udphdr *)
+			(((unsigned char *)iph) + (iph->ihl<<2));
+		if (((unsigned char *)(udph+1)) >= skb_data_end) {
+			printk(KERN_INFO
+			       "divert: malformed UDP packet !\n");
+			return;
+		}
+
+		/* Divert some udp dst/src ports only ? */
+		for (i = 0; i < MAX_DIVERT_PORTS; i++) {
+			dst = divert->udp_dst[i];
+			src = divert->udp_src[i];
+			if ((dst && dst == udph->dest) ||
+			    (src && src == udph->source)) {
+				ETH_DIVERT_FRAME(skb);
+				return;
+			}
+		}
+		break;
+	};
+
+	return;
+}
+
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 4ea599a88..2f6090a2f 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -12,6 +12,10 @@
 #ifdef CONFIG_SYSCTL
 
 extern int netdev_max_backlog;
+extern int no_cong_thresh;
+extern int no_cong;
+extern int lo_cong;
+extern int mod_cong;
 extern int netdev_fastroute;
 extern int net_msg_cost;
 extern int net_msg_burst;
@@ -25,6 +29,10 @@ extern int sysctl_core_destroy_delay;
 extern int sysctl_optmem_max;
 extern int sysctl_hot_list_len;
 
+#ifdef CONFIG_NET_DIVERT
+extern char sysctl_divert_version[];
+#endif /* CONFIG_NET_DIVERT */
+
 ctl_table core_table[] = {
 #ifdef CONFIG_NET
 	{NET_CORE_WMEM_MAX, "wmem_max",
@@ -42,6 +50,18 @@ ctl_table core_table[] = {
 	{NET_CORE_MAX_BACKLOG, "netdev_max_backlog",
 	 &netdev_max_backlog, sizeof(int), 0644, NULL,
 	 &proc_dointvec},
+	{NET_CORE_NO_CONG_THRESH, "no_cong_thresh",
+	 &no_cong, sizeof(int), 0644, NULL,
+	 &proc_dointvec},
+	{NET_CORE_NO_CONG, "no_cong",
+	 &no_cong, sizeof(int), 0644, NULL,
+	 &proc_dointvec},
+	{NET_CORE_LO_CONG, "lo_cong",
+	 &lo_cong, sizeof(int), 0644, NULL,
+	 &proc_dointvec},
+	{NET_CORE_MOD_CONG, "mod_cong",
+	 &mod_cong, sizeof(int), 0644, NULL,
+	 &proc_dointvec},
 #ifdef CONFIG_NET_FASTROUTE
 	{NET_CORE_FASTROUTE, "netdev_fastroute",
 	 &netdev_fastroute, sizeof(int), 0644, NULL,
@@ -59,6 +79,11 @@ ctl_table core_table[] = {
 	{NET_CORE_HOT_LIST_LENGTH, "hot_list_length",
 	 &sysctl_hot_list_len, sizeof(int), 0644, NULL,
 	 &proc_dointvec},
+#ifdef CONFIG_NET_DIVERT
+	{NET_CORE_DIVERT_VERSION, "divert_version",
+	 (void *)sysctl_divert_version, 32, 0444, NULL,
+	 &proc_dostring},
+#endif /* CONFIG_NET_DIVERT */
 #endif /* CONFIG_NET */
 	{ 0 }
 };
-- 
cgit v1.2.3