author     Ralf Baechle <ralf@linux-mips.org>    2000-11-23 02:00:47 +0000
committer  Ralf Baechle <ralf@linux-mips.org>    2000-11-23 02:00:47 +0000
commit     06615f62b17d7de6e12d2f5ec6b88cf30af08413 (patch)
tree       8766f208847d4876a6db619aebbf54d53b76eb44 /net/core
parent     fa9bdb574f4febb751848a685d9a9017e04e1d53 (diff)
Merge with Linux 2.4.0-test10.
Diffstat (limited to 'net/core')
-rw-r--r--  net/core/Makefile             4
-rw-r--r--  net/core/dev.c              195
-rw-r--r--  net/core/dev_mcast.c         54
-rw-r--r--  net/core/dv.c               551
-rw-r--r--  net/core/sysctl_net_core.c   25
5 files changed, 793 insertions(+), 36 deletions(-)
diff --git a/net/core/Makefile b/net/core/Makefile
index 7ee0db3fd..af3c74091 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -29,6 +29,10 @@ ifdef CONFIG_NETFILTER
OX_OBJS += netfilter.o
endif
+ifeq ($(CONFIG_NET_DIVERT),y)
+O_OBJS += dv.o
+endif
+
endif
ifdef CONFIG_NET_PROFILE
diff --git a/net/core/dev.c b/net/core/dev.c
index e6f440cf4..17fae7a1e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -59,6 +59,8 @@
* Paul Rusty Russell : SIOCSIFNAME
* Pekka Riikonen : Netdev boot-time settings code
* Andrew Morton : Make unregister_netdevice wait indefinitely on dev->refcnt
+ * J Hadi Salim : - Backlog queue sampling
+ * - netif_rx() feedback
*/
#include <asm/uaccess.h>
@@ -85,6 +87,7 @@
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/if_bridge.h>
+#include <linux/divert.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/profile.h>
@@ -97,6 +100,18 @@
extern int plip_init(void);
#endif
+/* This define, if set, will randomly drop a packet when congestion
+ * is more than moderate. It helps fairness in the multi-interface
+ * case when one of them is a hog, but it kills performance for the
+ * single interface case so it is off now by default.
+ */
+#undef RAND_LIE
+
+/* Setting this will sample the queue lengths and thus congestion
+ * via a timer instead of as each packet is received.
+ */
+#undef OFFLINE_SAMPLE
+
NET_PROFILE_DEFINE(dev_queue_xmit)
NET_PROFILE_DEFINE(softnet_process)
@@ -133,6 +148,11 @@ const char *if_port_text[] = {
static struct packet_type *ptype_base[16]; /* 16 way hashed list */
static struct packet_type *ptype_all = NULL; /* Taps */
+#ifdef OFFLINE_SAMPLE
+static void sample_queue(unsigned long dummy);
+static struct timer_list samp_timer = { function: sample_queue };
+#endif
+
/*
* Our notifier list
*/
@@ -933,12 +953,20 @@ int dev_queue_xmit(struct sk_buff *skb)
=======================================================================*/
int netdev_max_backlog = 300;
+/* These numbers are selected based on intuition and some
+ * experimentation; if you have a more scientific way of doing this
+ * please go ahead and fix things.
+ */
+int no_cong_thresh = 10;
+int no_cong = 20;
+int lo_cong = 100;
+int mod_cong = 290;
struct netif_rx_stats netdev_rx_stat[NR_CPUS];
#ifdef CONFIG_NET_HW_FLOWCONTROL
-static atomic_t netdev_dropping = ATOMIC_INIT(0);
+atomic_t netdev_dropping = ATOMIC_INIT(0);
static unsigned long netdev_fc_mask = 1;
unsigned long netdev_fc_xoff = 0;
spinlock_t netdev_fc_lock = SPIN_LOCK_UNLOCKED;
@@ -996,6 +1024,56 @@ static void netdev_wakeup(void)
}
#endif
+static void get_sample_stats(int cpu)
+{
+#ifdef RAND_LIE
+ unsigned long rd;
+ int rq;
+#endif
+ int blog = softnet_data[cpu].input_pkt_queue.qlen;
+ int avg_blog = softnet_data[cpu].avg_blog;
+
+ avg_blog = (avg_blog >> 1)+ (blog >> 1);
+
+ if (avg_blog > mod_cong) {
+ /* Above moderate congestion levels. */
+ softnet_data[cpu].cng_level = NET_RX_CN_HIGH;
+#ifdef RAND_LIE
+ rd = net_random();
+ rq = rd % netdev_max_backlog;
+ if (rq < avg_blog) /* unlucky bastard */
+ softnet_data[cpu].cng_level = NET_RX_DROP;
+#endif
+ } else if (avg_blog > lo_cong) {
+ softnet_data[cpu].cng_level = NET_RX_CN_MOD;
+#ifdef RAND_LIE
+ rd = net_random();
+ rq = rd % netdev_max_backlog;
+ if (rq < avg_blog) /* unlucky bastard */
+ softnet_data[cpu].cng_level = NET_RX_CN_HIGH;
+#endif
+ } else if (avg_blog > no_cong)
+ softnet_data[cpu].cng_level = NET_RX_CN_LOW;
+ else /* no congestion */
+ softnet_data[cpu].cng_level = NET_RX_SUCCESS;
+
+ softnet_data[cpu].avg_blog = avg_blog;
+}
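
The sampling logic above keeps a running average of the per-CPU backlog (avg_blog is halved and the current queue length blended in) and maps it onto the NET_RX_* congestion levels via the no_cong/lo_cong/mod_cong thresholds. Here is a minimal user-space sketch of that mapping, reusing the patch's thresholds and seed value but otherwise purely illustrative:

    /* Stand-alone sketch of the backlog averaging in get_sample_stats():
     * avg_blog is a running average of the input queue length, and the
     * congestion level is picked from the same thresholds the patch adds.
     */
    #include <stdio.h>

    enum { RX_SUCCESS, RX_CN_LOW, RX_CN_MOD, RX_CN_HIGH };

    static int no_cong = 20, lo_cong = 100, mod_cong = 290;

    static int congestion_level(int *avg_blog, int blog)
    {
    	*avg_blog = (*avg_blog >> 1) + (blog >> 1);  /* same EWMA as the patch */
    	if (*avg_blog > mod_cong)
    		return RX_CN_HIGH;
    	if (*avg_blog > lo_cong)
    		return RX_CN_MOD;
    	if (*avg_blog > no_cong)
    		return RX_CN_LOW;
    	return RX_SUCCESS;
    }

    int main(void)
    {
    	int avg = 10;  /* same arbitrary non-zero seed as net_dev_init() */
    	int samples[] = { 5, 50, 200, 400, 400, 50 };

    	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
    		printf("qlen=%3d -> level %d (avg %d)\n",
    		       samples[i], congestion_level(&avg, samples[i]), avg);
    	return 0;
    }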
+
+#ifdef OFFLINE_SAMPLE
+static void sample_queue(unsigned long dummy)
+{
+/* 10 ms or 1 ms -- I don't care -- JHS */
+ int next_tick = 1;
+ int cpu = smp_processor_id();
+
+ get_sample_stats(cpu);
+ next_tick += jiffies;
+ mod_timer(&samp_timer, next_tick);
+}
+#endif
+
+
/**
* netif_rx - post buffer to the network code
* @skb: buffer to post
@@ -1004,9 +1082,18 @@ static void netdev_wakeup(void)
* the upper (protocol) levels to process. It always succeeds. The buffer
* may be dropped during processing for congestion control or by the
* protocol layers.
+ *
+ * return values:
+ * NET_RX_SUCCESS (no congestion)
+ * NET_RX_CN_LOW (low congestion)
+ * NET_RX_CN_MOD (moderate congestion)
+ * NET_RX_CN_HIGH (high congestion)
+ * NET_RX_DROP (packet was dropped)
+ *
+ *
*/
-void netif_rx(struct sk_buff *skb)
+int netif_rx(struct sk_buff *skb)
{
int this_cpu = smp_processor_id();
struct softnet_data *queue;
@@ -1036,7 +1123,10 @@ enqueue:
__skb_queue_tail(&queue->input_pkt_queue,skb);
__cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
local_irq_restore(flags);
- return;
+#ifndef OFFLINE_SAMPLE
+ get_sample_stats(this_cpu);
+#endif
+ return softnet_data[this_cpu].cng_level;
}
if (queue->throttle) {
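
With netif_rx() now returning a congestion level instead of void, a driver can use the result as back-pressure. A hypothetical driver receive path is sketched below; struct example_priv and example_throttle_rx() are made-up names, and only the NET_RX_* values and the return semantics come from this patch:

    #include <linux/netdevice.h>
    #include <linux/skbuff.h>

    struct example_priv {
    	struct net_device_stats stats;
    	int rx_throttled;
    };

    static void example_throttle_rx(struct example_priv *priv)
    {
    	priv->rx_throttled = 1;		/* hypothetical back-off flag */
    }

    static void example_rx_packet(struct net_device *dev, struct sk_buff *skb)
    {
    	struct example_priv *priv = (struct example_priv *) dev->priv;

    	switch (netif_rx(skb)) {
    	case NET_RX_DROP:
    		priv->stats.rx_dropped++;	/* skb already freed by the core */
    		break;
    	case NET_RX_CN_MOD:
    	case NET_RX_CN_HIGH:
    		example_throttle_rx(priv);	/* slow refills until pressure drops */
    		break;
    	default:
    		break;				/* NET_RX_SUCCESS / NET_RX_CN_LOW */
    	}
    }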
@@ -1062,19 +1152,22 @@ drop:
local_irq_restore(flags);
kfree_skb(skb);
+ return NET_RX_DROP;
}
/* Deliver skb to an old protocol, which is not threaded well
or which do not understand shared skbs.
*/
-static void deliver_to_old_ones(struct packet_type *pt, struct sk_buff *skb, int last)
+static int deliver_to_old_ones(struct packet_type *pt, struct sk_buff *skb, int last)
{
static spinlock_t net_bh_lock = SPIN_LOCK_UNLOCKED;
+ int ret = NET_RX_DROP;
+
if (!last) {
skb = skb_clone(skb, GFP_ATOMIC);
if (skb == NULL)
- return;
+ return ret;
}
/* The assumption (correct one) is that old protocols
@@ -1087,10 +1180,11 @@ static void deliver_to_old_ones(struct packet_type *pt, struct sk_buff *skb, int
/* Disable timers and wait for all timers completion */
tasklet_disable(bh_task_vec+TIMER_BH);
- pt->func(skb, skb->dev, pt);
+ ret = pt->func(skb, skb->dev, pt);
tasklet_enable(bh_task_vec+TIMER_BH);
spin_unlock(&net_bh_lock);
+ return ret;
}
/* Reparent skb to master device. This function is called
@@ -1173,22 +1267,35 @@ void net_call_rx_atomic(void (*fn)(void))
void (*br_handle_frame_hook)(struct sk_buff *skb) = NULL;
#endif
-static void __inline__ handle_bridge(struct sk_buff *skb,
+static int __inline__ handle_bridge(struct sk_buff *skb,
struct packet_type *pt_prev)
{
+ int ret = NET_RX_DROP;
+
if (pt_prev) {
if (!pt_prev->data)
- deliver_to_old_ones(pt_prev, skb, 0);
+ ret = deliver_to_old_ones(pt_prev, skb, 0);
else {
atomic_inc(&skb->users);
- pt_prev->func(skb, skb->dev, pt_prev);
+ ret = pt_prev->func(skb, skb->dev, pt_prev);
}
}
br_handle_frame_hook(skb);
+ return ret;
}
+#ifdef CONFIG_NET_DIVERT
+static inline void handle_diverter(struct sk_buff *skb)
+{
+ /* if diversion is supported on device, then divert */
+ if (skb->dev->divert && skb->dev->divert->divert)
+ divert_frame(skb);
+}
+#endif /* CONFIG_NET_DIVERT */
+
+
static void net_rx_action(struct softirq_action *h)
{
int this_cpu = smp_processor_id();
@@ -1239,6 +1346,12 @@ static void net_rx_action(struct softirq_action *h)
}
}
+#ifdef CONFIG_NET_DIVERT
+ if (skb->dev->divert && skb->dev->divert->divert)
+ handle_diverter(skb);
+#endif /* CONFIG_NET_DIVERT */
+
+
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
if (skb->dev->br_port != NULL &&
br_handle_frame_hook != NULL) {
@@ -1275,6 +1388,17 @@ static void net_rx_action(struct softirq_action *h)
if (bugdet-- < 0 || jiffies - start_time > 1)
goto softnet_break;
+
+#ifdef CONFIG_NET_HW_FLOWCONTROL
+ if (queue->throttle && queue->input_pkt_queue.qlen < no_cong_thresh ) {
+ if (atomic_dec_and_test(&netdev_dropping)) {
+ queue->throttle = 0;
+ netdev_wakeup();
+ goto softnet_break;
+ }
+ }
+#endif
+
}
br_read_unlock(BR_NETPROTO_LOCK);
@@ -2113,9 +2237,9 @@ int dev_ioctl(unsigned int cmd, void *arg)
/**
* dev_new_index - allocate an ifindex
*
- * Returns a suitable unique value for a new device interface number.
- * The caller must hold the rtnl semaphore to be sure it remains
- * unique.
+ * Returns a suitable unique value for a new device interface
+ * number. The caller must hold the rtnl semaphore or the
+ * dev_base_lock to be sure it remains unique.
*/
int dev_new_index(void)
@@ -2140,6 +2264,10 @@ static int dev_boot_phase = 1;
* chain. 0 is returned on success. A negative errno code is returned
* on a failure to set up the device, or if the name is a duplicate.
*
+ * Callers must hold the rtnl semaphore. See the comment at the
+ * end of Space.c for details about the locking. You may want
+ * register_netdev() instead of this.
+ *
* BUGS:
* The locking appears insufficient to guarantee two parallel registers
* will not get the same name.
@@ -2148,6 +2276,9 @@ static int dev_boot_phase = 1;
int register_netdevice(struct net_device *dev)
{
struct net_device *d, **dp;
+#ifdef CONFIG_NET_DIVERT
+ int ret;
+#endif
spin_lock_init(&dev->queue_lock);
spin_lock_init(&dev->xmit_lock);
@@ -2182,6 +2313,12 @@ int register_netdevice(struct net_device *dev)
dev_hold(dev);
write_unlock_bh(&dev_base_lock);
+#ifdef CONFIG_NET_DIVERT
+ ret = alloc_divert_blk(dev);
+ if (ret)
+ return ret;
+#endif /* CONFIG_NET_DIVERT */
+
/*
* Default initial state at registry is that the
* device is present.
@@ -2231,6 +2368,12 @@ int register_netdevice(struct net_device *dev)
dev->deadbeaf = 0;
write_unlock_bh(&dev_base_lock);
+#ifdef CONFIG_NET_DIVERT
+ ret = alloc_divert_blk(dev);
+ if (ret)
+ return ret;
+#endif /* CONFIG_NET_DIVERT */
+
/* Notify protocols, that a new device appeared. */
notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
@@ -2272,6 +2415,10 @@ int netdev_finish_unregister(struct net_device *dev)
* This function shuts down a device interface and removes it
* from the kernel tables. On success 0 is returned, on a failure
* a negative errno code is returned.
+ *
+ * Callers must hold the rtnl semaphore. See the comment at the
+ * end of Space.c for details about the locking. You may want
+ * unregister_netdev() instead of this.
*/
int unregister_netdevice(struct net_device *dev)
@@ -2325,6 +2472,10 @@ int unregister_netdevice(struct net_device *dev)
/* Notifier chain MUST detach us from master device. */
BUG_TRAP(dev->master==NULL);
+#ifdef CONFIG_NET_DIVERT
+ free_divert_blk(dev);
+#endif
+
if (dev->new_style) {
#ifdef NET_REFCNT_DEBUG
if (atomic_read(&dev->refcnt) != 1)
@@ -2397,7 +2548,15 @@ int unregister_netdevice(struct net_device *dev)
extern void net_device_init(void);
extern void ip_auto_config(void);
+#ifdef CONFIG_NET_DIVERT
+extern void dv_init(void);
+#endif /* CONFIG_NET_DIVERT */
+
+/*
+ * Callers must hold the rtnl semaphore. See the comment at the
+ * end of Space.c for details about the locking.
+ */
int __init net_dev_init(void)
{
struct net_device *dev, **dp;
@@ -2407,6 +2566,10 @@ int __init net_dev_init(void)
pktsched_init();
#endif
+#ifdef CONFIG_NET_DIVERT
+ dv_init();
+#endif /* CONFIG_NET_DIVERT */
+
/*
* Initialise the packet receive queues.
*/
@@ -2417,6 +2580,8 @@ int __init net_dev_init(void)
queue = &softnet_data[i];
skb_queue_head_init(&queue->input_pkt_queue);
queue->throttle = 0;
+ queue->cng_level = 0;
+ queue->avg_blog = 10; /* arbitrary non-zero */
queue->completion_queue = NULL;
}
@@ -2425,6 +2590,12 @@ int __init net_dev_init(void)
NET_PROFILE_REGISTER(dev_queue_xmit);
NET_PROFILE_REGISTER(softnet_process);
#endif
+
+#ifdef OFFLINE_SAMPLE
+ samp_timer.expires = jiffies + (10 * HZ);
+ add_timer(&samp_timer);
+#endif
+
/*
* Add the devices.
* If the call to dev->init fails, the dev is removed
diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c
index cf590db04..6b24a1416 100644
--- a/net/core/dev_mcast.c
+++ b/net/core/dev_mcast.c
@@ -13,6 +13,7 @@
* rather than any time but...
* Alan Cox : IFF_ALLMULTI support.
* Alan Cox : New format set_multicast_list() calls.
+ * Gleb Natapov : Remove dev_mc_lock.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -59,16 +60,14 @@
* Device mc lists are changed by bh at least if IPv6 is enabled,
* so that it must be bh protected.
*
- * We protect all mc lists with global rw lock
- * and block accesses to device mc filters with dev->xmit_lock.
+ * We block accesses to device mc filters with dev->xmit_lock.
*/
-static rwlock_t dev_mc_lock = RW_LOCK_UNLOCKED;
/*
* Update the multicast list into the physical NIC controller.
*/
-void dev_mc_upload(struct net_device *dev)
+static void __dev_mc_upload(struct net_device *dev)
{
/* Don't do anything till we up the interface
* [dev_open will call this function so the list will
@@ -87,13 +86,14 @@ void dev_mc_upload(struct net_device *dev)
!netif_device_present(dev))
return;
- read_lock_bh(&dev_mc_lock);
- spin_lock(&dev->xmit_lock);
- dev->xmit_lock_owner = smp_processor_id();
dev->set_multicast_list(dev);
- dev->xmit_lock_owner = -1;
- spin_unlock(&dev->xmit_lock);
- read_unlock_bh(&dev_mc_lock);
+}
+
+void dev_mc_upload(struct net_device *dev)
+{
+ spin_lock_bh(&dev->xmit_lock);
+ __dev_mc_upload(dev);
+ spin_unlock_bh(&dev->xmit_lock);
}
/*
@@ -105,7 +105,8 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl)
int err = 0;
struct dev_mc_list *dmi, **dmip;
- write_lock_bh(&dev_mc_lock);
+ spin_lock_bh(&dev->xmit_lock);
+
for (dmip = &dev->mc_list; (dmi = *dmip) != NULL; dmip = &dmi->next) {
/*
* Find the entry we want to delete. The device could
@@ -127,7 +128,6 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl)
*/
*dmip = dmi->next;
dev->mc_count--;
- write_unlock_bh(&dev_mc_lock);
kfree(dmi);
@@ -135,13 +135,15 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl)
* We have altered the list, so the card
* loaded filter is now wrong. Fix it
*/
- dev_mc_upload(dev);
+ __dev_mc_upload(dev);
+
+ spin_unlock_bh(&dev->xmit_lock);
return 0;
}
}
err = -ENOENT;
done:
- write_unlock_bh(&dev_mc_lock);
+ spin_unlock_bh(&dev->xmit_lock);
return err;
}
@@ -156,7 +158,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
dmi1 = (struct dev_mc_list *)kmalloc(sizeof(*dmi), GFP_ATOMIC);
- write_lock_bh(&dev_mc_lock);
+ spin_lock_bh(&dev->xmit_lock);
for (dmi = dev->mc_list; dmi != NULL; dmi = dmi->next) {
if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 &&
dmi->dmi_addrlen == alen) {
@@ -172,7 +174,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
}
if ((dmi = dmi1) == NULL) {
- write_unlock_bh(&dev_mc_lock);
+ spin_unlock_bh(&dev->xmit_lock);
return -ENOMEM;
}
memcpy(dmi->dmi_addr, addr, alen);
@@ -182,12 +184,14 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl)
dmi->dmi_gusers = glbl ? 1 : 0;
dev->mc_list = dmi;
dev->mc_count++;
- write_unlock_bh(&dev_mc_lock);
- dev_mc_upload(dev);
+
+ __dev_mc_upload(dev);
+
+ spin_unlock_bh(&dev->xmit_lock);
return 0;
done:
- write_unlock_bh(&dev_mc_lock);
+ spin_unlock_bh(&dev->xmit_lock);
if (dmi1)
kfree(dmi1);
return err;
@@ -199,7 +203,8 @@ done:
void dev_mc_discard(struct net_device *dev)
{
- write_lock_bh(&dev_mc_lock);
+ spin_lock_bh(&dev->xmit_lock);
+
while (dev->mc_list != NULL) {
struct dev_mc_list *tmp = dev->mc_list;
dev->mc_list = tmp->next;
@@ -208,7 +213,8 @@ void dev_mc_discard(struct net_device *dev)
kfree(tmp);
}
dev->mc_count = 0;
- write_unlock_bh(&dev_mc_lock);
+
+ spin_unlock_bh(&dev->xmit_lock);
}
#ifdef CONFIG_PROC_FS
@@ -222,7 +228,7 @@ static int dev_mc_read_proc(char *buffer, char **start, off_t offset,
read_lock(&dev_base_lock);
for (dev = dev_base; dev; dev = dev->next) {
- read_lock_bh(&dev_mc_lock);
+ spin_lock_bh(&dev->xmit_lock);
for (m = dev->mc_list; m; m = m->next) {
int i;
@@ -240,11 +246,11 @@ static int dev_mc_read_proc(char *buffer, char **start, off_t offset,
begin = pos;
}
if (pos > offset + length) {
- read_unlock_bh(&dev_mc_lock);
+ spin_unlock_bh(&dev->xmit_lock);
goto done;
}
}
- read_unlock_bh(&dev_mc_lock);
+ spin_unlock_bh(&dev->xmit_lock);
}
*eof = 1;
diff --git a/net/core/dv.c b/net/core/dv.c
new file mode 100644
index 000000000..4df7747b8
--- /dev/null
+++ b/net/core/dv.c
@@ -0,0 +1,551 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Generic frame diversion
+ *
+ * Version: @(#)eth.c 0.41 09/09/2000
+ *
+ * Authors:
+ * Benoit LOCHER: initial integration within the kernel with support for ethernet
+ * Dave Miller: improvement on the code (correctness, performance and source files)
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <net/dst.h>
+#include <net/arp.h>
+#include <net/sock.h>
+#include <net/ipv6.h>
+#include <net/ip.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/checksum.h>
+#include <linux/divert.h>
+#include <linux/sockios.h>
+
+const char sysctl_divert_version[32]="0.46"; /* Current version */
+
+int __init dv_init(void)
+{
+ printk(KERN_INFO "NET4: Frame Diverter %s\n", sysctl_divert_version);
+ return 0;
+}
+
+/*
+ * Allocate a divert_blk for a device. This must be an ethernet nic.
+ */
+int alloc_divert_blk(struct net_device *dev)
+{
+ int alloc_size = (sizeof(struct divert_blk) + 3) & ~3;
+
+ if (!strncmp(dev->name, "eth", 3)) {
+ printk(KERN_DEBUG "divert: allocating divert_blk for %s\n",
+ dev->name);
+
+ dev->divert = (struct divert_blk *)
+ kmalloc(alloc_size, GFP_KERNEL);
+ if (dev->divert == NULL) {
+ printk(KERN_DEBUG "divert: unable to allocate divert_blk for %s\n",
+ dev->name);
+ return -EFAULT;
+ } else {
+ memset(dev->divert, 0, sizeof(struct divert_blk));
+ }
+ } else {
+ printk(KERN_DEBUG "divert: not allocating divert_blk for non-ethernet device %s\n",
+ dev->name);
+
+ dev->divert = NULL;
+ }
+ return 0;
+}
+
+/*
+ * Free a divert_blk allocated by the above function, if it was
+ * allocated on that device.
+ */
+void free_divert_blk(struct net_device *dev)
+{
+ if (dev->divert) {
+ kfree(dev->divert);
+ dev->divert=NULL;
+ printk(KERN_DEBUG "divert: freeing divert_blk for %s\n",
+ dev->name);
+ } else {
+ printk(KERN_DEBUG "divert: no divert_blk to free, %s not ethernet\n",
+ dev->name);
+ }
+}
+
+/*
+ * Adds a tcp/udp (source or dest) port to an array
+ */
+int add_port(u16 ports[], u16 port)
+{
+ int i;
+
+ if (port == 0)
+ return -EINVAL;
+
+ /* Storing directly in network format for performance,
+ * thanks Dave :)
+ */
+ port = htons(port);
+
+ for (i = 0; i < MAX_DIVERT_PORTS; i++) {
+ if (ports[i] == port)
+ return -EALREADY;
+ }
+
+ for (i = 0; i < MAX_DIVERT_PORTS; i++) {
+ if (ports[i] == 0) {
+ ports[i] = port;
+ return 0;
+ }
+ }
+
+ return -ENOBUFS;
+}
+
+/*
+ * Removes a port from an array tcp/udp (source or dest)
+ */
+int remove_port(u16 ports[], u16 port)
+{
+ int i;
+
+ if (port == 0)
+ return -EINVAL;
+
+ /* Storing directly in network format for performance,
+ * thanks Dave !
+ */
+ port = htons(port);
+
+ for (i = 0; i < MAX_DIVERT_PORTS; i++) {
+ if (ports[i] == port) {
+ ports[i] = 0;
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
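
Both helpers keep the ports in network byte order, as the comments note, so the per-packet loops in divert_frame() can compare a stored slot directly against the on-wire tcph->dest or udph->source without an ntohs() per packet. A small stand-alone demonstration of that comparison (user-space, illustrative only):

    #include <stdio.h>
    #include <string.h>
    #include <arpa/inet.h>

    #define MAX_PORTS 8

    int main(void)
    {
    	unsigned short ports[MAX_PORTS];
    	unsigned short wire_dest = htons(80);	/* as it appears in the TCP header */

    	memset(ports, 0, sizeof(ports));
    	ports[0] = htons(80);			/* what add_port() stores */

    	for (int i = 0; i < MAX_PORTS; i++)
    		if (ports[i] && ports[i] == wire_dest)
    			printf("port %d diverted\n", (int) ntohs(ports[i]));
    	return 0;
    }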
+
+/* Some basic sanity checks on the arguments passed to divert_ioctl() */
+int check_args(struct divert_cf *div_cf, struct net_device **dev)
+{
+ char devname[32];
+
+ if (dev == NULL)
+ return -EFAULT;
+
+ /* GETVERSION: all other args are unused */
+ if (div_cf->cmd == DIVCMD_GETVERSION)
+ return 0;
+
+ /* Network device index should reasonably be between 0 and 1000 :) */
+ if (div_cf->dev_index < 0 || div_cf->dev_index > 1000)
+ return -EINVAL;
+
+ /* Let's try to find the ifname */
+ sprintf(devname, "eth%d", div_cf->dev_index);
+ *dev = dev_get_by_name(devname);
+
+ /* dev should NOT be null */
+ if (*dev == NULL)
+ return -EINVAL;
+
+ /* user issuing the ioctl must be a super one :) */
+ if (!suser())
+ return -EPERM;
+
+ /* Device must have a divert_blk member NOT null */
+ if ((*dev)->divert == NULL)
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * control function of the diverter
+ */
+#define DVDBG(a) \
+ printk(KERN_DEBUG "divert_ioctl() line %d %s\n", __LINE__, (a))
+
+int divert_ioctl(unsigned int cmd, struct divert_cf *arg)
+{
+ struct divert_cf div_cf;
+ struct divert_blk *div_blk;
+ struct net_device *dev;
+ int ret;
+
+ switch (cmd) {
+ case SIOCGIFDIVERT:
+ DVDBG("SIOCGIFDIVERT, copy_from_user");
+ if (copy_from_user(&div_cf, arg, sizeof(struct divert_cf)))
+ return -EFAULT;
+ DVDBG("before check_args");
+ ret = check_args(&div_cf, &dev);
+ if (ret)
+ return ret;
+ DVDBG("after checkargs");
+ div_blk = dev->divert;
+
+ DVDBG("befre switch()");
+ switch (div_cf.cmd) {
+ case DIVCMD_GETSTATUS:
+ /* Now, just give the user the raw divert block
+ * for him to play with :)
+ */
+ if (copy_to_user(div_cf.arg1.ptr, dev->divert,
+ sizeof(struct divert_blk)))
+ return -EFAULT;
+ break;
+
+ case DIVCMD_GETVERSION:
+ DVDBG("GETVERSION: checking ptr");
+ if (div_cf.arg1.ptr == NULL)
+ return -EINVAL;
+ DVDBG("GETVERSION: copying data to userland");
+ if (copy_to_user(div_cf.arg1.ptr,
+ sysctl_divert_version, 32))
+ return -EFAULT;
+ DVDBG("GETVERSION: data copied");
+ break;
+
+ default:
+ return -EINVAL;
+ };
+
+ break;
+
+ case SIOCSIFDIVERT:
+ if (copy_from_user(&div_cf, arg, sizeof(struct divert_cf)))
+ return -EFAULT;
+
+ ret = check_args(&div_cf, &dev);
+ if (ret)
+ return ret;
+
+ div_blk = dev->divert;
+
+ switch(div_cf.cmd) {
+ case DIVCMD_RESET:
+ div_blk->divert = 0;
+ div_blk->protos = DIVERT_PROTO_NONE;
+ memset(div_blk->tcp_dst, 0,
+ MAX_DIVERT_PORTS * sizeof(u16));
+ memset(div_blk->tcp_src, 0,
+ MAX_DIVERT_PORTS * sizeof(u16));
+ memset(div_blk->udp_dst, 0,
+ MAX_DIVERT_PORTS * sizeof(u16));
+ memset(div_blk->udp_src, 0,
+ MAX_DIVERT_PORTS * sizeof(u16));
+ return 0;
+
+ case DIVCMD_DIVERT:
+ switch(div_cf.arg1.int32) {
+ case DIVARG1_ENABLE:
+ if (div_blk->divert)
+ return -EALREADY;
+ div_blk->divert = 1;
+ break;
+
+ case DIVARG1_DISABLE:
+ if (!div_blk->divert)
+ return -EALREADY;
+ div_blk->divert = 0;
+ break;
+
+ default:
+ return -EINVAL;
+ };
+
+ break;
+
+ case DIVCMD_IP:
+ switch(div_cf.arg1.int32) {
+ case DIVARG1_ENABLE:
+ if (div_blk->protos & DIVERT_PROTO_IP)
+ return -EALREADY;
+ div_blk->protos |= DIVERT_PROTO_IP;
+ break;
+
+ case DIVARG1_DISABLE:
+ if (!(div_blk->protos & DIVERT_PROTO_IP))
+ return -EALREADY;
+ div_blk->protos &= ~DIVERT_PROTO_IP;
+ break;
+
+ default:
+ return -EINVAL;
+ };
+
+ break;
+
+ case DIVCMD_TCP:
+ switch(div_cf.arg1.int32) {
+ case DIVARG1_ENABLE:
+ if (div_blk->protos & DIVERT_PROTO_TCP)
+ return -EALREADY;
+ div_blk->protos |= DIVERT_PROTO_TCP;
+ break;
+
+ case DIVARG1_DISABLE:
+ if (!(div_blk->protos & DIVERT_PROTO_TCP))
+ return -EALREADY;
+ div_blk->protos &= ~DIVERT_PROTO_TCP;
+ break;
+
+ default:
+ return -EINVAL;
+ };
+
+ break;
+
+ case DIVCMD_TCPDST:
+ switch(div_cf.arg1.int32) {
+ case DIVARG1_ADD:
+ return add_port(div_blk->tcp_dst,
+ div_cf.arg2.uint16);
+
+ case DIVARG1_REMOVE:
+ return remove_port(div_blk->tcp_dst,
+ div_cf.arg2.uint16);
+
+ default:
+ return -EINVAL;
+ };
+
+ break;
+
+ case DIVCMD_TCPSRC:
+ switch(div_cf.arg1.int32) {
+ case DIVARG1_ADD:
+ return add_port(div_blk->tcp_src,
+ div_cf.arg2.uint16);
+
+ case DIVARG1_REMOVE:
+ return remove_port(div_blk->tcp_src,
+ div_cf.arg2.uint16);
+
+ default:
+ return -EINVAL;
+ };
+
+ break;
+
+ case DIVCMD_UDP:
+ switch(div_cf.arg1.int32) {
+ case DIVARG1_ENABLE:
+ if (div_blk->protos & DIVERT_PROTO_UDP)
+ return -EALREADY;
+ div_blk->protos |= DIVERT_PROTO_UDP;
+ break;
+
+ case DIVARG1_DISABLE:
+ if (!(div_blk->protos & DIVERT_PROTO_UDP))
+ return -EALREADY;
+ div_blk->protos &= ~DIVERT_PROTO_UDP;
+ break;
+
+ default:
+ return -EINVAL;
+ };
+
+ break;
+
+ case DIVCMD_UDPDST:
+ switch(div_cf.arg1.int32) {
+ case DIVARG1_ADD:
+ return add_port(div_blk->udp_dst,
+ div_cf.arg2.uint16);
+
+ case DIVARG1_REMOVE:
+ return remove_port(div_blk->udp_dst,
+ div_cf.arg2.uint16);
+
+ default:
+ return -EINVAL;
+ };
+
+ break;
+
+ case DIVCMD_UDPSRC:
+ switch(div_cf.arg1.int32) {
+ case DIVARG1_ADD:
+ return add_port(div_blk->udp_src,
+ div_cf.arg2.uint16);
+
+ case DIVARG1_REMOVE:
+ return remove_port(div_blk->udp_src,
+ div_cf.arg2.uint16);
+
+ default:
+ return -EINVAL;
+ };
+
+ break;
+
+ case DIVCMD_ICMP:
+ switch(div_cf.arg1.int32) {
+ case DIVARG1_ENABLE:
+ if (div_blk->protos & DIVERT_PROTO_ICMP)
+ return -EALREADY;
+ div_blk->protos |= DIVERT_PROTO_ICMP;
+ break;
+
+ case DIVARG1_DISABLE:
+ if (!(div_blk->protos & DIVERT_PROTO_ICMP))
+ return -EALREADY;
+ div_blk->protos &= ~DIVERT_PROTO_ICMP;
+ break;
+
+ default:
+ return -EINVAL;
+ };
+
+ break;
+
+ default:
+ return -EINVAL;
+ };
+
+ break;
+
+ default:
+ return -EINVAL;
+ };
+
+ return 0;
+}
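
From user space the diverter is driven through the SIOCGIFDIVERT/SIOCSIFDIVERT ioctls with a struct divert_cf describing the sub-command. A minimal sketch follows, assuming the usual convention of issuing device ioctls on an AF_INET socket; the field names mirror the div_cf usage above and error handling is kept to a minimum:

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <linux/divert.h>
    #include <linux/sockios.h>

    int main(void)
    {
    	struct divert_cf cf;
    	int fd = socket(AF_INET, SOCK_DGRAM, 0);

    	if (fd < 0) {
    		perror("socket");
    		return 1;
    	}

    	/* Enable diversion on eth0. */
    	memset(&cf, 0, sizeof(cf));
    	cf.cmd = DIVCMD_DIVERT;
    	cf.dev_index = 0;			/* eth0, per check_args() */
    	cf.arg1.int32 = DIVARG1_ENABLE;
    	if (ioctl(fd, SIOCSIFDIVERT, &cf) < 0)
    		perror("SIOCSIFDIVERT (enable)");

    	/* Divert TCP frames with destination port 8080. */
    	memset(&cf, 0, sizeof(cf));
    	cf.cmd = DIVCMD_TCPDST;
    	cf.dev_index = 0;
    	cf.arg1.int32 = DIVARG1_ADD;
    	cf.arg2.uint16 = 8080;
    	if (ioctl(fd, SIOCSIFDIVERT, &cf) < 0)
    		perror("SIOCSIFDIVERT (tcp dst)");

    	close(fd);
    	return 0;
    }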
+
+
+/*
+ * Check if packet should have its dest mac address set to the box itself
+ * for diversion
+ */
+
+#define ETH_DIVERT_FRAME(skb) \
+ memcpy(skb->mac.ethernet, skb->dev->dev_addr, ETH_ALEN); \
+ skb->pkt_type=PACKET_HOST
+
+void divert_frame(struct sk_buff *skb)
+{
+ struct ethhdr *eth = skb->mac.ethernet;
+ struct iphdr *iph;
+ struct tcphdr *tcph;
+ struct udphdr *udph;
+ struct divert_blk *divert = skb->dev->divert;
+ int i, src, dst;
+ unsigned char *skb_data_end = skb->data + skb->len;
+
+ /* Packet is already aimed at us, return */
+ if (!memcmp(eth, skb->dev->dev_addr, ETH_ALEN))
+ return;
+
+ /* proto is not IP, do nothing */
+ if (eth->h_proto != htons(ETH_P_IP))
+ return;
+
+ /* Divert all IP frames ? */
+ if (divert->protos & DIVERT_PROTO_IP) {
+ ETH_DIVERT_FRAME(skb);
+ return;
+ }
+
+ /* Check for possible (maliciously) malformed IP frame (thanks Dave) */
+ iph = (struct iphdr *) skb->data;
+ if (((iph->ihl<<2)+(unsigned char*)(iph)) >= skb_data_end) {
+ printk(KERN_INFO "divert: malformed IP packet !\n");
+ return;
+ }
+
+ switch (iph->protocol) {
+ /* Divert all ICMP frames ? */
+ case IPPROTO_ICMP:
+ if (divert->protos & DIVERT_PROTO_ICMP) {
+ ETH_DIVERT_FRAME(skb);
+ return;
+ }
+ break;
+
+ /* Divert all TCP frames ? */
+ case IPPROTO_TCP:
+ if (divert->protos & DIVERT_PROTO_TCP) {
+ ETH_DIVERT_FRAME(skb);
+ return;
+ }
+
+ /* Check for possible (maliciously) malformed IP
+ * frame (thanx Dave)
+ */
+ tcph = (struct tcphdr *)
+ (((unsigned char *)iph) + (iph->ihl<<2));
+ if (((unsigned char *)(tcph+1)) >= skb_data_end) {
+ printk(KERN_INFO "divert: malformed TCP packet !\n");
+ return;
+ }
+
+ /* Divert some tcp dst/src ports only ?*/
+ for (i = 0; i < MAX_DIVERT_PORTS; i++) {
+ dst = divert->tcp_dst[i];
+ src = divert->tcp_src[i];
+ if ((dst && dst == tcph->dest) ||
+ (src && src == tcph->source)) {
+ ETH_DIVERT_FRAME(skb);
+ return;
+ }
+ }
+ break;
+
+ /* Divert all UDP frames ? */
+ case IPPROTO_UDP:
+ if (divert->protos & DIVERT_PROTO_UDP) {
+ ETH_DIVERT_FRAME(skb);
+ return;
+ }
+
+ /* Check for possible (maliciously) malformed IP
+ * packet (thanks Dave)
+ */
+ udph = (struct udphdr *)
+ (((unsigned char *)iph) + (iph->ihl<<2));
+ if (((unsigned char *)(udph+1)) >= skb_data_end) {
+ printk(KERN_INFO
+ "divert: malformed UDP packet !\n");
+ return;
+ }
+
+ /* Divert some udp dst/src ports only ? */
+ for (i = 0; i < MAX_DIVERT_PORTS; i++) {
+ dst = divert->udp_dst[i];
+ src = divert->udp_src[i];
+ if ((dst && dst == udph->dest) ||
+ (src && src == udph->source)) {
+ ETH_DIVERT_FRAME(skb);
+ return;
+ }
+ }
+ break;
+ };
+
+ return;
+}
+
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 4ea599a88..2f6090a2f 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -12,6 +12,10 @@
#ifdef CONFIG_SYSCTL
extern int netdev_max_backlog;
+extern int no_cong_thresh;
+extern int no_cong;
+extern int lo_cong;
+extern int mod_cong;
extern int netdev_fastroute;
extern int net_msg_cost;
extern int net_msg_burst;
@@ -25,6 +29,10 @@ extern int sysctl_core_destroy_delay;
extern int sysctl_optmem_max;
extern int sysctl_hot_list_len;
+#ifdef CONFIG_NET_DIVERT
+extern char sysctl_divert_version[];
+#endif /* CONFIG_NET_DIVERT */
+
ctl_table core_table[] = {
#ifdef CONFIG_NET
{NET_CORE_WMEM_MAX, "wmem_max",
@@ -42,6 +50,18 @@ ctl_table core_table[] = {
{NET_CORE_MAX_BACKLOG, "netdev_max_backlog",
&netdev_max_backlog, sizeof(int), 0644, NULL,
&proc_dointvec},
+ {NET_CORE_NO_CONG_THRESH, "no_cong_thresh",
+ &no_cong_thresh, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_CORE_NO_CONG, "no_cong",
+ &no_cong, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_CORE_LO_CONG, "lo_cong",
+ &lo_cong, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_CORE_MOD_CONG, "mod_cong",
+ &mod_cong, sizeof(int), 0644, NULL,
+ &proc_dointvec},
#ifdef CONFIG_NET_FASTROUTE
{NET_CORE_FASTROUTE, "netdev_fastroute",
&netdev_fastroute, sizeof(int), 0644, NULL,
@@ -59,6 +79,11 @@ ctl_table core_table[] = {
{NET_CORE_HOT_LIST_LENGTH, "hot_list_length",
&sysctl_hot_list_len, sizeof(int), 0644, NULL,
&proc_dointvec},
+#ifdef CONFIG_NET_DIVERT
+ {NET_CORE_DIVERT_VERSION, "divert_version",
+ (void *)sysctl_divert_version, 32, 0444, NULL,
+ &proc_dostring},
+#endif /* CONFIG_NET_DIVERT */
#endif /* CONFIG_NET */
{ 0 }
};
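
The new entries surface through procfs alongside the existing core tunables, so the congestion thresholds can be adjusted and the diverter version read without an ioctl. A small sketch, assuming the standard /proc/sys/net/core/ paths implied by the table above:

    #include <stdio.h>

    int main(void)
    {
    	char version[32] = "";
    	FILE *f = fopen("/proc/sys/net/core/divert_version", "r");

    	if (f) {
    		if (fgets(version, sizeof(version), f))
    			printf("frame diverter %s", version);
    		fclose(f);
    	}

    	/* Raise the moderate-congestion threshold. */
    	f = fopen("/proc/sys/net/core/mod_cong", "w");
    	if (f) {
    		fprintf(f, "%d\n", 350);
    		fclose(f);
    	}
    	return 0;
    }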