author     Ralf Baechle <ralf@linux-mips.org>    1998-05-07 02:55:41 +0000
committer  Ralf Baechle <ralf@linux-mips.org>    1998-05-07 02:55:41 +0000
commit     dcec8a13bf565e47942a1751a9cec21bec5648fe (patch)
tree       548b69625b18cc2e88c3e68d0923be546c9ebb03 /net
parent     2e0f55e79c49509b7ff70ff1a10e1e9e90a3dfd4 (diff)
o Merge with Linux 2.1.99.
o Fix ancient bug in the ELF loader making ldd crash.
o Fix ancient bug in the keyboard code for SGI, SNI and Jazz.
Diffstat (limited to 'net')
86 files changed, 12165 insertions, 2977 deletions
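A large share of the net/ changes below rework the Ethernet bridge's spanning-tree handling. Two of the recurring fixes called out in the br.c changelog — building the 802.1D port identifier from a user-settable priority byte plus the port number, and comparing bridge identifiers in network byte order so the STP interoperates with other vendors — are small enough to sketch in isolation. The userspace sketch below mirrors the patched make_port_id() and br_cmp() helpers; the surrounding declarations (the ALL_PORTS size, the two-word bridge-id layout, the example values in main()) are simplified assumptions for illustration, not the kernel's actual definitions.

```c
#include <stdio.h>
#include <arpa/inet.h>   /* htonl() */

#define ALL_PORTS 8                      /* assumption: small fixed port table */

static unsigned char port_priority[ALL_PORTS + 1];   /* user-settable, patch defaults it to 128 */

/* 802.1D port identifier: priority in the high byte, port number in the low
 * byte, as done by make_port_id() in the patched br.c. */
static unsigned short make_port_id(int port_no)
{
	return (port_priority[port_no] << 8) | port_no;
}

/* A bridge identifier (2 bytes priority + 6 bytes MAC) stored as two 32-bit
 * words in memory byte order.  Applying htonl() before comparing makes the
 * comparison behave like a byte-wise priority-then-MAC comparison on both
 * little- and big-endian hosts -- the interoperability fix the patched
 * br_cmp() applies. */
static int br_cmp(unsigned int *a, unsigned int *b)
{
	int i;

	for (i = 0; i < 2; i++) {
		if (htonl(a[i]) < htonl(b[i]))
			return -1;
		if (htonl(a[i]) > htonl(b[i]))
			return 1;
	}
	return 0;
}

int main(void)
{
	/* two example bridge ids: same priority (0x8000), MACs differing in the last byte */
	unsigned int id_a[2] = { htonl(0x80000000), htonl(0x00102030) };
	unsigned int id_b[2] = { htonl(0x80000000), htonl(0x00102031) };

	port_priority[1] = 128;
	printf("port_id(1) = 0x%04x\n", make_port_id(1));   /* 0x8001 */
	printf("br_cmp     = %d\n", br_cmp(id_a, id_b));    /* -1: id_a is the better (lower) id */
	return 0;
}
```

Because htonl() reduces the comparison to most-significant-byte-first ordering, two bridges that disagree on host endianness still agree on which bridge id is lower and therefore elect the same root.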
diff --git a/net/Config.in b/net/Config.in index b4547e569..62dfd430f 100644 --- a/net/Config.in +++ b/net/Config.in @@ -42,6 +42,11 @@ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then # if [ "$CONFIG_LLC" = "y" ]; then # bool 'Netbeui (EXPERIMENTAL)' CONFIG_NETBEUI # fi + tristate 'Acorn Econet/AUN protocols (EXPERIMENTAL)' CONFIG_ECONET + if [ "$CONFIG_ECONET" != "n" ]; then + bool ' AUN over UDP' CONFIG_ECONET_AUNUDP + bool ' Native Econet' CONFIG_ECONET_NATIVE + fi tristate 'WAN router' CONFIG_WAN_ROUTER bool 'Fast switching (read help!)' CONFIG_NET_FASTROUTE bool 'Forwarding between high speed interfaces' CONFIG_NET_HW_FLOWCONTROL diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 107f481d6..d8160d1ec 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1622,7 +1622,7 @@ static int ax25_get_info(char *buffer, char **start, off_t offset, int length, i cli(); - len += sprintf(buffer, "dest_addr src_addr dev st vs vr va t1 t2 t3 idle n2 rtt wnd paclen Snd-Q Rcv-Q\n"); + len += sprintf(buffer, "dest_addr src_addr dev st vs vr va t1 t2 t3 idle n2 rtt wnd paclen Snd-Q Rcv-Q inode\n"); for (ax25 = ax25_list; ax25 != NULL; ax25 = ax25->next) { if (ax25->ax25_dev == NULL) @@ -1658,9 +1658,10 @@ static int ax25_get_info(char *buffer, char **start, off_t offset, int length, i ax25->paclen); if (ax25->sk != NULL) { - len += sprintf(buffer + len, " %5d %5d\n", + len += sprintf(buffer + len, " %5d %5d %ld\n", atomic_read(&ax25->sk->wmem_alloc), - atomic_read(&ax25->sk->rmem_alloc)); + atomic_read(&ax25->sk->rmem_alloc), + ax25->sk->socket != NULL ? ax25->sk->socket->inode->i_ino : 0L); } else { len += sprintf(buffer + len, "\n"); } diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c index 71eb5cfc3..8e330af23 100644 --- a/net/ax25/ax25_out.c +++ b/net/ax25/ax25_out.c @@ -362,7 +362,7 @@ void ax25_transmit_buffer(ax25_cb *ax25, struct sk_buff *skb, int type) ax25_addr_build(ptr, &ax25->source_addr, &ax25->dest_addr, ax25->digipeat, type, ax25->modulus); - skb->dev = ax25->ax25_dev->dev; + skb->dev = ax25->ax25_dev->dev; ax25_queue_xmit(skb); } diff --git a/net/bridge/Makefile b/net/bridge/Makefile index bc432f316..bcccefb75 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -1,5 +1,5 @@ # -# Makefile for the Linux TCP/IP (INET) layer. +# Makefile for the Linux Bridge layer. # # Note! Dependencies are done automagically by 'make dep', which also # removes any old dependencies. DON'T put your own dependencies here diff --git a/net/bridge/br.c b/net/bridge/br.c index 2961ff3c6..014453f8c 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -13,8 +13,31 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
* - * Fixes: - * Yury Shevchuk : Bridge with non bridging ports + * Fixes: + * Yury Shevchuk : Bridge with non bridging ports + * Jean-Rene Peulve: jr.peulve@aix.pacwan.net Jan/Feb 98 + * support Linux 2.0 + * Handle Receive config bpdu + * kick mark_bh to send Spanning Tree pdus + * bridgeId comparison using htonl() + * make STP interoperable with other vendors + * wrong test in root_selection() + * add more STP debug info + * some performance improvments + * do not clear bridgeId.mac while setting priority + * do not reset port priority when starting bridge + * make port priority from user value and port number + * maintains user port state out of device state + * broacast/multicast storm limitation + * forwarding statistics + * stop br_tick when bridge is turn off + * add local MACs in avl_tree to forward up stack + * fake receive on right port for IP/ARP + * ages tree even if packet does not cross bridge + * add BRCMD_DISPLAY_FDB (ioctl for now) + * + * Alan Cox: Merged Jean-Rene's stuff, reformatted stuff a bit + * so blame me first if its broken ;) * * Todo: * Don't bring up devices automatically. Start ports disabled @@ -42,11 +65,17 @@ #include <linux/string.h> #include <linux/skbuff.h> #include <linux/if_arp.h> +#include <linux/ip.h> +#include <linux/version.h> #include <linux/init.h> #include <asm/uaccess.h> #include <asm/system.h> #include <net/br.h> +#ifndef min +#define min(a, b) (((a) <= (b)) ? (a) : (b)) +#endif + static void transmit_config(int port_no); static int root_bridge(void); static int supersedes_port_info(int port_no, Config_bpdu *config); @@ -80,7 +109,7 @@ static void br_init_port(int port_no); static void enable_port(int port_no); static void disable_port(int port_no); static void set_bridge_priority(bridge_id_t *new_bridge_id); -static void set_port_priority(int port_no, unsigned short new_port_id); +static void set_port_priority(int port_no); static void set_path_cost(int port_no, unsigned short path_cost); static void start_hello_timer(void); static void stop_hello_timer(void); @@ -104,11 +133,12 @@ static int br_device_event(struct notifier_block *dnot, unsigned long event, voi static void br_tick(unsigned long arg); static int br_forward(struct sk_buff *skb, int port); /* 3.7 */ static int br_port_cost(struct device *dev); /* 4.10.2 */ -static void br_bpdu(struct sk_buff *skb); /* consumes skb */ +static void br_bpdu(struct sk_buff *skb, int port); /* consumes skb */ static int br_cmp(unsigned int *a, unsigned int *b); static int send_tcn_bpdu(int port_no, Tcn_bpdu *bpdu); static int send_config_bpdu(int port_no, Config_bpdu *config_bpdu); static int find_port(struct device *dev); +static void br_add_local_mac(unsigned char *mac); static int br_flood(struct sk_buff *skb, int port); static int br_drop(struct sk_buff *skb); static int br_learn(struct sk_buff *skb, int port); /* 3.8 */ @@ -116,8 +146,21 @@ static int br_learn(struct sk_buff *skb, int port); /* 3.8 */ static unsigned char bridge_ula[ETH_ALEN] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; static Bridge_data bridge_info; /* (4.5.3) */ Port_data port_info[All_ports]; /* (4.5.5) */ -Config_bpdu config_bpdu[All_ports]; -Tcn_bpdu tcn_bpdu[All_ports]; + +/* JRP: fdb cache 1/port save kmalloc/kfree on every frame */ +struct fdb *newfdb[All_ports]; +int allocated_fdb_cnt = 0; + +/* broacast/multicast storm limitation */ +int max_mcast_per_period = MAX_MCAST_PER_PERIOD; +int mcast_hold_time = MCAST_HOLD_TIME; + +/* JRP: next two bpdu are copied to skbuff so we need only 1 of each */ +static Config_bpdu 
config_bpdu; +static Tcn_bpdu tcn_bpdu; +static unsigned char port_priority[All_ports]; +static unsigned char user_port_state[All_ports]; + static Timer hello_timer; /* (4.5.4.1) */ static Timer tcn_timer; /* (4.5.4.2) */ static Timer topology_change_timer; /* (4.5.4.3) */ @@ -129,6 +172,7 @@ static Timer hold_timer[All_ports]; /* (4.5.6.3) */ unsigned int fdb_aging_time = FDB_TIMEOUT; struct br_stat br_stats; +#define br_stats_cnt br_stats.packet_cnts static struct timer_list tl; /* for 1 second timer... */ @@ -154,23 +198,28 @@ static struct notifier_block br_dev_notifier={ #define BR_PROTOCOL_HASH(x) (x % BR_MAX_PROTOCOLS) /* Checks if that protocol type is to be bridged */ + int br_protocol_ok(unsigned short protocol) { unsigned x; /* See if protocol statistics are to be kept */ if (br_stats.flags & BR_PROT_STATS) - { for(x=0;x<BR_MAX_PROT_STATS && - br_stats.prot_id[x]!=protocol && br_stats.prot_id[x];x++) ; - if (x<BR_MAX_PROT_STATS) - { br_stats.prot_id[x]=protocol;br_stats.prot_counter[x]++; - } + { + for(x=0;x<BR_MAX_PROT_STATS && br_stats.prot_id[x]!=protocol && br_stats.prot_id[x];x++); + if (x<BR_MAX_PROT_STATS) + { + br_stats.prot_id[x]=protocol;br_stats.prot_counter[x]++; + } } - for (x=BR_PROTOCOL_HASH(protocol); br_stats.protocols[x]!=0;) { - if (br_stats.protocols[x]==protocol) return !br_stats.policy; + for (x=BR_PROTOCOL_HASH(protocol); br_stats.protocols[x]!=0;) + { + if (br_stats.protocols[x]==protocol) + return !br_stats.policy; x++; - if (x==BR_MAX_PROTOCOLS) x=0; + if (x==BR_MAX_PROTOCOLS) + x=0; } return br_stats.policy; } @@ -209,7 +258,7 @@ static int br_set_policy(int policy) /* * this section of code was graciously borrowed from the IEEE 802.1d * specification section 4.9.1 starting on pg 69. It has been - * modified somewhat to fit within out framework and structure. It + * modified somewhat to fit within our framework and structure. It * implements the spanning tree algorithm that is the heart of the * 802.1d bridging protocol. */ @@ -219,42 +268,44 @@ static void transmit_config(int port_no) /* (4.6.1) */ if (hold_timer[port_no].active) { /* (4.6.1.3.1) */ port_info[port_no].config_pending = TRUE; /* (4.6.1.3.1) */ } else { /* (4.6.1.3.2) */ - config_bpdu[port_no].type = BPDU_TYPE_CONFIG; - config_bpdu[port_no].root_id = bridge_info.designated_root; + config_bpdu.type = BPDU_TYPE_CONFIG; + config_bpdu.root_id = bridge_info.designated_root; /* (4.6.1.3.2(1)) */ - config_bpdu[port_no].root_path_cost = bridge_info.root_path_cost; + config_bpdu.root_path_cost = bridge_info.root_path_cost; /* (4.6.1.3.2(2)) */ - config_bpdu[port_no].bridge_id = bridge_info.bridge_id; + config_bpdu.bridge_id = bridge_info.bridge_id; /* (4.6.1.3.2(3)) */ - config_bpdu[port_no].port_id = port_info[port_no].port_id; + config_bpdu.port_id = port_info[port_no].port_id; /* * (4.6.1.3.2(4)) */ if (root_bridge()) { - config_bpdu[port_no].message_age = Zero; /* (4.6.1.3.2(5)) */ + config_bpdu.message_age = Zero; /* (4.6.1.3.2(5)) */ } else { - config_bpdu[port_no].message_age + config_bpdu.message_age = message_age_timer[bridge_info.root_port].value + Message_age_increment; /* (4.6.1.3.2(6)) */ } - config_bpdu[port_no].max_age = bridge_info.max_age; /* (4.6.1.3.2(7)) */ - config_bpdu[port_no].hello_time = bridge_info.hello_time; - config_bpdu[port_no].forward_delay = bridge_info.forward_delay; - config_bpdu[port_no].flags = 0; - config_bpdu[port_no].flags |= - port_info[port_no].top_change_ack ? 
TOPOLOGY_CHANGE_ACK : 0; - /* (4.6.1.3.2(8)) */ + config_bpdu.max_age = bridge_info.max_age;/* (4.6.1.3.2(7)) */ + config_bpdu.hello_time = bridge_info.hello_time; + config_bpdu.forward_delay = bridge_info.forward_delay; + config_bpdu.top_change_ack = + port_info[port_no].top_change_ack; + /* (4.6.1.3.2(8)) */ port_info[port_no].top_change_ack = 0; - /* (4.6.1.3.2(8)) */ - config_bpdu[port_no].flags |= - bridge_info.top_change ? TOPOLOGY_CHANGE : 0; - /* (4.6.1.3.2(9)) */ - send_config_bpdu(port_no, &config_bpdu[port_no]); + config_bpdu.top_change = + bridge_info.top_change; /* (4.6.1.3.2(9)) */ + + send_config_bpdu(port_no, &config_bpdu); port_info[port_no].config_pending = FALSE; /* (4.6.1.3.2(10)) */ start_hold_timer(port_no); /* (4.6.1.3.2(11)) */ } +/* JRP: we want the frame to be xmitted even if no other traffic. + * net_bh() will do a dev_transmit() that kicks all devices + */ + mark_bh(NET_BH); } static int root_bridge(void) @@ -314,8 +365,7 @@ static void record_config_timeout_values(Config_bpdu *config) /* (4.6.3) */ bridge_info.max_age = config->max_age; /* (4.6.3.3) */ bridge_info.hello_time = config->hello_time; bridge_info.forward_delay = config->forward_delay; - if (config->flags & TOPOLOGY_CHANGE) - bridge_info.top_change = 1; + bridge_info.top_change = config->top_change; } static void config_bpdu_generation(void) @@ -353,8 +403,8 @@ static void transmit_tcn(void) int port_no; port_no = bridge_info.root_port; - tcn_bpdu[port_no].type = BPDU_TYPE_TOPO_CHANGE; - send_tcn_bpdu(port_no, &tcn_bpdu[bridge_info.root_port]); /* (4.6.6.3) */ + tcn_bpdu.type = BPDU_TYPE_TOPO_CHANGE; + send_tcn_bpdu(port_no, &tcn_bpdu); /* (4.6.6.3) */ } static void configuration_update(void) /* (4.6.7) */ @@ -420,7 +470,7 @@ static void root_selection(void) ) /* (4.6.8.3.1(4)) */ || ((port_info[port_no].designated_port - = port_info[root_port].designated_port +/* JRP: was missing an "=" ! 
*/ == port_info[root_port].designated_port ) && (port_info[port_no].port_id @@ -433,6 +483,10 @@ static void root_selection(void) bridge_info.root_port = root_port; /* (4.6.8.3.1) */ if (root_port == No_port) { /* (4.6.8.3.2) */ +#ifdef DEBUG_STP + if (br_stats.flags & BR_DEBUG) + printk(KERN_DEBUG "root_selection: becomes root\n"); +#endif bridge_info.designated_root = bridge_info.bridge_id; /* (4.6.8.3.2(1)) */ bridge_info.root_path_cost = Zero;/* (4.6.8.3.2(2)) */ @@ -450,6 +504,8 @@ static void designated_port_selection(void) int port_no; for (port_no = One; port_no <= No_of_ports; port_no++) { /* (4.6.9.3) */ + if(port_info[port_no].state == Disabled) + continue; if (designated_port(port_no) /* (4.6.9.3.1) */ || ( @@ -498,19 +554,32 @@ static void become_designated_port(int port_no) static void port_state_selection(void) { /* (4.6.11) */ int port_no; + char *state_str; for (port_no = One; port_no <= No_of_ports; port_no++) { + + if(port_info[port_no].state == Disabled) + continue; if (port_no == bridge_info.root_port) { /* (4.6.11.3.1) */ - port_info[port_no].config_pending = FALSE; /* (4.6.11.3~1(1)) */ + state_str = "root"; + port_info[port_no].config_pending = FALSE; /* (4.6.11.3.1(1)) */ port_info[port_no].top_change_ack = 0; make_forwarding(port_no); /* (4.6.11.3.1(2)) */ } else if (designated_port(port_no)) { /* (4.6.11.3.2) */ + state_str = "designated"; stop_message_age_timer(port_no); /* (4.6.11.3.2(1)) */ make_forwarding(port_no); /* (4.6.11.3.2(2)) */ } else { /* (4.6.11.3.3) */ + state_str = "blocking"; port_info[port_no].config_pending = FALSE; /* (4.6.11.3.3(1)) */ port_info[port_no].top_change_ack = 0; make_blocking(port_no); /* (4.6.11.3.3(2)) */ } +#ifdef DEBUG_STP + if (br_stats.flags & BR_DEBUG) + printk(KERN_DEBUG "port_state_selection: becomes %s port %d\n", + state_str, port_no); +#endif + } } @@ -525,6 +594,11 @@ static void make_forwarding(int port_no) static void topology_change_detection(void) { /* (4.6.14) */ +#ifdef DEBUG_STP + if ((br_stats.flags & BR_DEBUG) + && (bridge_info.top_change_detected == 0)) + printk(KERN_DEBUG "topology_change_detected\n"); +#endif if (root_bridge()) { /* (4.6.14.3.1) */ bridge_info.top_change = 1; start_topology_change_timer(); /* (4.6.14.3.1(2)) */ @@ -532,12 +606,16 @@ static void topology_change_detection(void) transmit_tcn(); /* (4.6.14.3.2(1)) */ start_tcn_timer(); /* (4.6.14.3.2(2)) */ } - bridge_info.top_change = 1; + bridge_info.top_change_detected = 1; /* (4.6.14.3.3) */ } static void topology_change_acknowledged(void) { /* (4.6.15) */ - bridge_info.top_change_detected = 0; +#ifdef DEBUG_STP + if (br_stats.flags & BR_DEBUG) + printk(KERN_DEBUG "topology_change_acked\n"); +#endif + bridge_info.top_change_detected = 0; /* (4.6.15.3.1) */ stop_tcn_timer(); /* (4.6.15.3.2) */ } @@ -574,10 +652,16 @@ static void set_port_state(int port_no, int state) static void received_config_bpdu(int port_no, Config_bpdu *config) /* (4.7.1) */ { - int root; + int root; root = root_bridge(); if (port_info[port_no].state != Disabled) { + +#ifdef DEBUG_STP + if (br_stats.flags & BR_DEBUG) + printk(KERN_DEBUG "received_config_bpdu: port %d\n", + port_no); +#endif if (supersedes_port_info(port_no, config)) { /* (4.7.1.1) *//* (4. 
* 6.2.2) */ record_config_information(port_no, config); /* (4.7.1.1.1) */ @@ -588,7 +672,7 @@ static void received_config_bpdu(int port_no, Config_bpdu *config) /* (4.7.1) /* (4.6.11.2.1) */ if ((!root_bridge()) && root) { /* (4.7.1.1.4) */ stop_hello_timer(); - if (bridge_info.top_change_detected) { /* (4.7.1.1.5~ */ + if (bridge_info.top_change_detected) { /* (4.7.1.1.5 */ stop_topology_change_timer(); transmit_tcn(); /* (4.6.6.1) */ start_tcn_timer(); @@ -598,7 +682,7 @@ static void received_config_bpdu(int port_no, Config_bpdu *config) /* (4.7.1) record_config_timeout_values(config); /* (4.7.1.1.6) */ /* (4.6.3.2) */ config_bpdu_generation(); /* (4.6.4.2.1) */ - if (config->flags & TOPOLOGY_CHANGE_ACK) { /* (4.7.1.1.7) */ + if (config->top_change_ack) { /* (4.7.1.1.7) */ topology_change_acknowledged(); /* (4.6.15.2) */ } } @@ -612,6 +696,11 @@ static void received_config_bpdu(int port_no, Config_bpdu *config) /* (4.7.1) static void received_tcn_bpdu(int port_no, Tcn_bpdu *tcn) /* (4.7.2) */ { if (port_info[port_no].state != Disabled) { +#ifdef DEBUG_STP + if (br_stats.flags & BR_DEBUG) + printk(KERN_DEBUG "received_tcn_bpdu: port %d\n", + port_no); +#endif if (designated_port(port_no)) { topology_change_detection(); /* (4.7.2.1) */ /* (4.6.14.2.1) */ @@ -628,9 +717,14 @@ static void hello_timer_expiry(void) static void message_age_timer_expiry(int port_no) /* (4.7.4) */ { - int root; + int root; root = root_bridge(); +#ifdef DEBUG_STP + if (br_stats.flags & BR_DEBUG) + printk(KERN_DEBUG "message_age_timer_expiry: port %d\n", + port_no); +#endif become_designated_port(port_no); /* (4.7.4.1) */ /* (4.6.10.2.1) */ configuration_update(); /* (4.7.4.2) */ @@ -653,12 +747,17 @@ static void message_age_timer_expiry(int port_no) /* (4.7.4) */ static void forward_delay_timer_expiry(int port_no) /* (4.7.5) */ { - if (port_info[port_no].state == Listening) { /* (4.7.5.1) */ + if (port_info[port_no].state == Listening) + { /* (4.7.5.1) */ set_port_state(port_no, Learning); /* (4.7.5.1.1) */ start_forward_delay_timer(port_no); /* (4.7.5.1.2) */ - } else if (port_info[port_no].state == Learning) { /* (4.7.5.2) */ + } + else if (port_info[port_no].state == Learning) + { + /* (4.7.5.2) */ set_port_state(port_no, Forwarding); /* (4.7.5.2.1) */ - if (designated_for_some_port()) { /* (4.7.5.2.2) */ + if (designated_for_some_port()) + { /* (4.7.5.2.2) */ topology_change_detection(); /* (4.6.14.2.2) */ } @@ -667,13 +766,15 @@ static void forward_delay_timer_expiry(int port_no) /* (4.7.5) */ static int designated_for_some_port(void) { - int port_no; - + int port_no; - for (port_no = One; port_no <= No_of_ports; port_no++) { + for (port_no = One; port_no <= No_of_ports; port_no++) + { + if(port_info[port_no].state == Disabled) + continue; if ((br_cmp(port_info[port_no].designated_bridge.BRIDGE_ID, - bridge_info.bridge_id.BRIDGE_ID) == 0) - ) { + bridge_info.bridge_id.BRIDGE_ID) == 0)) + { return (TRUE); } } @@ -688,26 +789,38 @@ static void tcn_timer_expiry(void) static void topology_change_timer_expiry(void) { /* (4.7.7) */ - bridge_info.top_change_detected = 0; + bridge_info.top_change_detected = 0; /* (4.7.7.1) */ bridge_info.top_change = 0; /* (4.7.7.2) */ } static void hold_timer_expiry(int port_no) /* (4.7.8) */ { - if (port_info[port_no].config_pending) { + if (port_info[port_no].config_pending) + { transmit_config(port_no); /* (4.7.8.1) */ } /* (4.6.1.2.3) */ } __initfunc(void br_init(void)) { /* (4.8.1) */ - int port_no; + int port_no; + + printk(KERN_INFO "Ethernet Bridge 005 for NET3.037 (Linux 
2.1)\n"); + + /* + * Form initial topology change time. + * The topology change timer is only used if this is the root bridge. + */ + + bridge_info.topology_change_time = BRIDGE_MAX_AGE + BRIDGE_FORWARD_DELAY; /* (4.5.3.13) */ - printk(KERN_INFO "Ethernet Bridge 003 for NET3.037 (Linux 2.1)\n"); bridge_info.designated_root = bridge_info.bridge_id; /* (4.8.1.1) */ bridge_info.root_path_cost = Zero; bridge_info.root_port = No_port; +#ifdef DEBUG_STP + printk(KERN_INFO "br_init: becomes root\n"); +#endif bridge_info.bridge_max_age = BRIDGE_MAX_AGE; bridge_info.bridge_hello_time = BRIDGE_HELLO_TIME; @@ -722,17 +835,22 @@ __initfunc(void br_init(void)) bridge_info.top_change = 0; stop_tcn_timer(); stop_topology_change_timer(); + memset(newfdb, 0, sizeof(newfdb)); for (port_no = One; port_no <= No_of_ports; port_no++) { /* (4.8.1.4) */ + /* initial state = Enable */ + user_port_state[port_no] = ~Disabled; + port_priority[port_no] = 128; br_init_port(port_no); disable_port(port_no); } +#if 0 /* JRP: We are not UP ! Wait for the start command */ port_state_selection(); /* (4.8.1.5) */ config_bpdu_generation(); /* (4.8.1.6) */ - /* initialize system timer */ tl.expires = jiffies+HZ; /* 1 second */ tl.function = br_tick; add_timer(&tl); +#endif register_netdevice_notifier(&br_dev_notifier); br_stats.flags = 0; /*BR_UP | BR_DEBUG*/; /* enable bridge */ @@ -741,8 +859,14 @@ __initfunc(void br_init(void)) /*start_hello_timer();*/ } +static inline unsigned short make_port_id(int port_no) +{ + return (port_priority[port_no] << 8) | port_no; +} + static void br_init_port(int port_no) { + port_info[port_no].port_id = make_port_id(port_no); become_designated_port(port_no); /* (4.8.1.4.1) */ set_port_state(port_no, Blocking); /* (4.8.1.4.2) */ port_info[port_no].top_change_ack = 0; @@ -787,10 +911,12 @@ static void set_bridge_priority(bridge_id_t *new_bridge_id) /* (4.8.4) */ { - int root; - int port_no; + int root; + int port_no; root = root_bridge(); for (port_no = One; port_no <= No_of_ports; port_no++) { /* (4.8.4.2) */ + if(port_info[port_no].state == Disabled) + continue; if (designated_port(port_no)) { port_info[port_no].designated_bridge = *new_bridge_id; } @@ -810,9 +936,10 @@ static void set_bridge_priority(bridge_id_t *new_bridge_id) } } -static void set_port_priority(int port_no, unsigned short new_port_id) +static void set_port_priority(int port_no) /* (4.8.5) */ -{ +{int new_port_id = make_port_id(port_no); + if (designated_port(port_no)) { /* (4.8.5.2) */ port_info[port_no].designated_port = new_port_id; } @@ -825,7 +952,8 @@ static void set_port_priority(int port_no, unsigned short new_port_id) < port_info[port_no].designated_port ) - ) { + ) + { become_designated_port(port_no); /* (4.8.5.4.1) */ port_state_selection(); /* (4.8.5.4.2) */ } @@ -841,27 +969,33 @@ static void set_path_cost(int port_no, unsigned short path_cost) static void br_tick(unsigned long arg) { - int port_no; + int port_no; + + if(!(br_stats.flags & BR_UP)) + return; /* JRP: we have been shot down */ - if (hello_timer_expired()) { + if (hello_timer_expired()) hello_timer_expiry(); - } - if (tcn_timer_expired()) { + + if (tcn_timer_expired()) tcn_timer_expiry(); - } - if (topology_change_timer_expired()) { + + if (topology_change_timer_expired()) topology_change_timer_expiry(); - } - for (port_no = One; port_no <= No_of_ports; port_no++) { - if (forward_delay_timer_expired(port_no)) { + + for (port_no = One; port_no <= No_of_ports; port_no++) + { + if(port_info[port_no].state == Disabled) + continue; + + if 
(forward_delay_timer_expired(port_no)) forward_delay_timer_expiry(port_no); - } - if (message_age_timer_expired(port_no)) { + + if (message_age_timer_expired(port_no)) message_age_timer_expiry(port_no); - } - if (hold_timer_expired(port_no)) { + + if (hold_timer_expired(port_no)) hold_timer_expiry(port_no); - } } /* call me again sometime... */ tl.expires = jiffies+HZ; /* 1 second */ @@ -882,7 +1016,8 @@ static void stop_hello_timer(void) static int hello_timer_expired(void) { - if (hello_timer.active && (++hello_timer.value >= bridge_info.hello_time)) { + if (hello_timer.active && (++hello_timer.value >= bridge_info.hello_time)) + { hello_timer.active = FALSE; return (TRUE); } @@ -902,8 +1037,8 @@ static void stop_tcn_timer(void) static int tcn_timer_expired(void) { - if (tcn_timer.active && (++tcn_timer.value >= - bridge_info.bridge_hello_time)) { + if (tcn_timer.active && (++tcn_timer.value >= bridge_info.bridge_hello_time)) + { tcn_timer.active = FALSE; return (TRUE); } @@ -925,9 +1060,8 @@ static void stop_topology_change_timer(void) static int topology_change_timer_expired(void) { if (topology_change_timer.active - && (++topology_change_timer.value - >= bridge_info.topology_change_time - )) { + && (++topology_change_timer.value >= bridge_info.topology_change_time )) + { topology_change_timer.active = FALSE; return (TRUE); } @@ -947,8 +1081,8 @@ static void stop_message_age_timer(int port_no) static int message_age_timer_expired(int port_no) { - if (message_age_timer[port_no].active && - (++message_age_timer[port_no].value >= bridge_info.max_age)) { + if (message_age_timer[port_no].active && (++message_age_timer[port_no].value >= bridge_info.max_age)) + { message_age_timer[port_no].active = FALSE; return (TRUE); } @@ -968,12 +1102,12 @@ static void stop_forward_delay_timer(int port_no) static int forward_delay_timer_expired(int port_no) { - if (forward_delay_timer[port_no].active && - (++forward_delay_timer[port_no].value >= bridge_info.forward_delay)) { - forward_delay_timer[port_no].active = FALSE; - return (TRUE); - } - return (FALSE); + if (forward_delay_timer[port_no].active && (++forward_delay_timer[port_no].value >= bridge_info.forward_delay)) + { + forward_delay_timer[port_no].active = FALSE; + return (TRUE); + } + return (FALSE); } static void start_hold_timer(int port_no) @@ -990,7 +1124,8 @@ static void stop_hold_timer(int port_no) static int hold_timer_expired(int port_no) { if (hold_timer[port_no].active && - (++hold_timer[port_no].value >= bridge_info.hold_time)) { + (++hold_timer[port_no].value >= bridge_info.hold_time)) + { hold_timer[port_no].active = FALSE; return (TRUE); } @@ -998,113 +1133,112 @@ static int hold_timer_expired(int port_no) } -static int send_config_bpdu(int port_no, Config_bpdu *config_bpdu) +static struct sk_buff *alloc_bridge_skb(int port_no, int pdu_size, char *pdu_name) { struct sk_buff *skb; struct device *dev = port_info[port_no].dev; - int size; struct ethhdr *eth; - - if (port_info[port_no].state == Disabled) { - printk(KERN_DEBUG "send_config_bpdu: port %i not valid\n",port_no); - return(-1); + int size = dev->hard_header_len + BRIDGE_LLC1_HS + pdu_size; + unsigned char *llc_buffer; + int pad_size = 60 - size; + + size = 60; /* minimum Ethernet frame - CRC */ + + if (port_info[port_no].state == Disabled) + { + printk(KERN_DEBUG "send_%s_bpdu: port %i not valid\n", pdu_name, port_no); + return NULL; } + + skb = alloc_skb(size, GFP_ATOMIC); + if (skb == NULL) + { + printk(KERN_DEBUG "send_%s_bpdu: no skb available\n", pdu_name); + return 
NULL; + } + skb->dev = dev; + skb->mac.raw = skb->h.raw = skb_put(skb,size); + memset(skb->h.raw + 60 - pad_size, 0xa5, pad_size); + eth = skb->mac.ethernet; + memcpy(eth->h_dest, bridge_ula, ETH_ALEN); + memcpy(eth->h_source, dev->dev_addr, ETH_ALEN); + + if (br_stats.flags & BR_DEBUG) + printk("send_%s_bpdu: port %i src %02x:%02x:%02x:%02x:%02x:%02x\n", + pdu_name, + port_no, + eth->h_source[0], + eth->h_source[1], + eth->h_source[2], + eth->h_source[3], + eth->h_source[4], + eth->h_source[5]); +#if 0 + /* 8038 is used in older DEC spanning tree protocol which uses a + * different pdu layout as well + */ + eth->h_proto = htons(0x8038); +#endif + eth->h_proto = htons(pdu_size + BRIDGE_LLC1_HS); + + skb->h.raw += skb->dev->hard_header_len; + llc_buffer = skb->h.raw; + *llc_buffer++ = BRIDGE_LLC1_DSAP; + *llc_buffer++ = BRIDGE_LLC1_SSAP; + *llc_buffer++ = BRIDGE_LLC1_CTRL; + /* set h.raw to where the bpdu starts */ + skb->h.raw += BRIDGE_LLC1_HS; + + /* mark that we've been here... */ + skb->pkt_bridged = IS_BRIDGED; + return skb; +} + +static int send_config_bpdu(int port_no, Config_bpdu *config_bpdu) +{ + struct sk_buff *skb; - if (br_stats.flags & BR_DEBUG) - printk("send_config_bpdu: "); /* - * create and send the message + * Create and send the message */ - size = dev->hard_header_len + sizeof(Config_bpdu); - skb = alloc_skb(size, GFP_ATOMIC); - if (skb == NULL) - { - printk(KERN_DEBUG "send_config_bpdu: no skb available\n"); + + skb = alloc_bridge_skb(port_no, BRIDGE_BPDU_8021_CONFIG_SIZE, + "config"); + if (skb == NULL) return(-1); - } - skb->dev = dev; - skb->mac.raw = skb->h.raw = skb_put(skb, size); - eth = skb->mac.ethernet; - memcpy(eth->h_dest, bridge_ula, ETH_ALEN); - memcpy(eth->h_source, dev->dev_addr, ETH_ALEN); - if (br_stats.flags & BR_DEBUG) - printk("port %i src %02x:%02x:%02x:%02x:%02x:%02x\ - dest %02x:%02x:%02x:%02x:%02x:%02x\n", - port_no, - eth->h_source[0], - eth->h_source[1], - eth->h_source[2], - eth->h_source[3], - eth->h_source[4], - eth->h_source[5], - eth->h_dest[0], - eth->h_dest[1], - eth->h_dest[2], - eth->h_dest[3], - eth->h_dest[4], - eth->h_dest[5]); - eth->h_proto = htons(0x8038); - skb->h.raw += skb->dev->hard_header_len; - memcpy(skb->h.raw, config_bpdu, sizeof(Config_bpdu)); + /* copy fields before "flags" */ + memcpy(skb->h.raw, config_bpdu, BRIDGE_BPDU_8021_CONFIG_FLAG_OFFSET); - /* won't get bridged again... 
*/ - skb->pkt_bridged = IS_BRIDGED; - skb->dev=dev; - dev_queue_xmit(skb); - return(0); -} + /* build the "flags" field */ + *(skb->h.raw+BRIDGE_BPDU_8021_CONFIG_FLAG_OFFSET) = 0; + if (config_bpdu->top_change_ack) + *(skb->h.raw+BRIDGE_BPDU_8021_CONFIG_FLAG_OFFSET) |= 0x80; + if (config_bpdu->top_change) + *(skb->h.raw+BRIDGE_BPDU_8021_CONFIG_FLAG_OFFSET) |= 0x01; + + config_bpdu_hton(config_bpdu); + /* copy the rest */ + memcpy(skb->h.raw+BRIDGE_BPDU_8021_CONFIG_FLAG_OFFSET+1, + (char*)&(config_bpdu->root_id), + BRIDGE_BPDU_8021_CONFIG_SIZE-1-BRIDGE_BPDU_8021_CONFIG_FLAG_OFFSET); + dev_queue_xmit(skb); + return(0); +} + static int send_tcn_bpdu(int port_no, Tcn_bpdu *bpdu) { struct sk_buff *skb; - struct device *dev = port_info[port_no].dev; - int size; - struct ethhdr *eth; - - if (port_info[port_no].state == Disabled) { - printk(KERN_DEBUG "send_tcn_bpdu: port %i not valid\n",port_no); - return(-1); - } - if (br_stats.flags & BR_DEBUG) - printk("send_tcn_bpdu: "); - size = sizeof(Tcn_bpdu) + dev->hard_header_len; - skb = alloc_skb(size, GFP_ATOMIC); - if (skb == NULL) { - printk(KERN_DEBUG "send_tcn_bpdu: no skb available\n"); - return(-1); - } - skb->dev = dev; - skb->mac.raw = skb->h.raw = skb_put(skb,size); - eth = skb->mac.ethernet; - memcpy(eth->h_dest, bridge_ula, ETH_ALEN); - memcpy(eth->h_source, dev->dev_addr, ETH_ALEN); - if (br_stats.flags & BR_DEBUG) - printk("port %i src %02x:%02x:%02x:%02x:%02x:%02x\ - dest %02x:%02x:%02x:%02x:%02x:%02x\n", - port_no, - eth->h_source[0], - eth->h_source[1], - eth->h_source[2], - eth->h_source[3], - eth->h_source[4], - eth->h_source[5], - eth->h_dest[0], - eth->h_dest[1], - eth->h_dest[2], - eth->h_dest[3], - eth->h_dest[4], - eth->h_dest[5]); - eth->h_proto = htons(0x8038); - - skb->h.raw += skb->dev->hard_header_len; - memcpy(skb->h.raw, bpdu, sizeof(Tcn_bpdu)); - - /* mark that we've been here... 
*/ - skb->pkt_bridged = IS_BRIDGED; - skb->dev=dev; - dev_queue_xmit(skb); - return(0); + + skb = alloc_bridge_skb(port_no, sizeof(Tcn_bpdu), "tcn"); + if (skb == NULL) + return(-1); + + memcpy(skb->h.raw, bpdu, sizeof(Tcn_bpdu)); + + dev_queue_xmit(skb); + return(0); } static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr) @@ -1116,52 +1250,59 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v if (dev->flags & IFF_LOOPBACK) return(NOTIFY_DONE); - switch (event) { - case NETDEV_DOWN: - if (br_stats.flags & BR_DEBUG) - printk("br_device_event: NETDEV_DOWN...\n"); - /* find our device and mark it down */ - for (i = One; i <= No_of_ports; i++) { - if (port_info[i].dev == dev) { - disable_port(i); - return NOTIFY_DONE; - break; + switch (event) + { + case NETDEV_DOWN: + if (br_stats.flags & BR_DEBUG) + printk(KERN_DEBUG "br_device_event: NETDEV_DOWN...\n"); + /* find our device and mark it down */ + for (i = One; i <= No_of_ports; i++) + { + if (port_info[i].dev == dev) + { + disable_port(i); + return NOTIFY_DONE; + break; + } } - } - break; - case NETDEV_UP: - if (br_stats.flags & BR_DEBUG) - printk("br_device_event: NETDEV_UP...\n"); - /* Only handle ethernet ports */ - if(dev->type!=ARPHRD_ETHER && dev->type!=ARPHRD_LOOPBACK) - return NOTIFY_DONE; - /* look up an unused device and enable it */ - for (i = One; i <= No_of_ports; i++) { - if ((port_info[i].dev == (struct device *)0) || - (port_info[i].dev == dev)) { - port_info[i].dev = dev; - enable_port(i); - set_path_cost(i, br_port_cost(dev)); - set_port_priority(i, 128); - port_info[i].port_id = i; - /* set bridge addr from 1st device addr */ - if ((bridge_info.bridge_id.BRIDGE_ID[0] == 0) && - (bridge_info.bridge_id.BRIDGE_ID[1] == 0)) { - memcpy(bridge_info.bridge_id.BRIDGE_ID_ULA, dev->dev_addr, 6); - bridge_info.bridge_id.BRIDGE_PRIORITY = port_info[i].port_id; - set_bridge_priority(&bridge_info.bridge_id); - } - make_forwarding(i); + break; + case NETDEV_UP: + if (br_stats.flags & BR_DEBUG) + printk(KERN_DEBUG "br_device_event: NETDEV_UP...\n"); + /* Only handle ethernet ports */ + if(dev->type!=ARPHRD_ETHER && dev->type!=ARPHRD_LOOPBACK) return NOTIFY_DONE; - break; + /* look up an unused device and enable it */ + for (i = One; i <= No_of_ports; i++) + { + if (port_info[i].dev == NULL || port_info[i].dev == dev) + { + port_info[i].dev = dev; + port_info[i].port_id = i; + /* set bridge addr from 1st device addr */ + if (((htonl(bridge_info.bridge_id.BRIDGE_ID[0])&0xffff) == 0) && + (bridge_info.bridge_id.BRIDGE_ID[1] == 0)) + { + memcpy(bridge_info.bridge_id.BRIDGE_ID_ULA, dev->dev_addr, 6); + if(bridge_info.bridge_id.BRIDGE_PRIORITY == 0) + bridge_info.bridge_id.BRIDGE_PRIORITY = htons(32768); + set_bridge_priority(&bridge_info.bridge_id); + } + br_add_local_mac(dev->dev_addr); + if((br_stats.flags & BR_UP) && + (user_port_state[i] != Disabled)) + { + /* don't start if user said so */ + enable_port(i); + set_path_cost(i, br_port_cost(dev)); + set_port_priority(i); + make_forwarding(i); + } + return NOTIFY_DONE; + break; + } } - } - break; -#if 0 - default: - printk("br_device_event: unknown event [%x]\n", - (unsigned int)event); -#endif + break; } return NOTIFY_DONE; } @@ -1175,10 +1316,9 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v int br_receive_frame(struct sk_buff *skb) /* 3.5 */ { int port; + Port_data *p; struct ethhdr *eth; - if (br_stats.flags & BR_DEBUG) - printk("br_receive_frame: "); /* sanity */ if (!skb) { 
printk(KERN_CRIT "br_receive_frame: no skb!\n"); @@ -1189,88 +1329,79 @@ int br_receive_frame(struct sk_buff *skb) /* 3.5 */ /* check for loopback */ if (skb->dev->flags & IFF_LOOPBACK) - return(0); + return 0 ; port = find_port(skb->dev); + if(!port) + return 0; + skb->h.raw = skb->mac.raw; eth = skb->mac.ethernet; - if (br_stats.flags & BR_DEBUG) - printk("port %i src %02x:%02x:%02x:%02x:%02x:%02x\ - dest %02x:%02x:%02x:%02x:%02x:%02x\n", - port, - eth->h_source[0], - eth->h_source[1], - eth->h_source[2], - eth->h_source[3], - eth->h_source[4], - eth->h_source[5], - eth->h_dest[0], - eth->h_dest[1], - eth->h_dest[2], - eth->h_dest[3], - eth->h_dest[4], - eth->h_dest[5]); + p = &port_info[port]; + + if(p->state == Disabled) + { + /* We are here if BR_UP even if this port is Disabled. + * Send everything up + */ + skb->pkt_type = PACKET_HOST; + ++br_stats_cnt.port_disable_up_stack; + return(0); /* pass frame up our stack (this will */ + /* happen in net_bh() in dev.c) */ + } + + /* Here only if not disable. + * Remark: only frames going up will show up in NIT (tcpdump) + */ - if (!port) { - if(br_stats.flags&BR_DEBUG) - printk("\nbr_receive_frame: no port!\n"); - return(0); + /* JRP: even if port is Blocking we need to process the Spanning Tree + * frames to keep the port in that state + */ + if (memcmp(eth->h_dest, bridge_ula, ETH_ALEN) == 0) + { + ++br_stats_cnt.rcv_bpdu; + br_bpdu(skb, port); /* br_bpdu consumes skb */ + return(1); } - - switch (port_info[port].state) + switch (p->state) { case Learning: - (void) br_learn(skb, port); /* 3.8 */ + if(br_learn(skb, port)) + { /* 3.8 */ + ++br_stats_cnt.drop_multicast; + return br_drop(skb); + } /* fall through */ case Listening: - /* process BPDUs */ - if (memcmp(eth->h_dest, bridge_ula, 6) == 0) { - br_bpdu(skb); - return(1); /* br_bpdu consumes skb */ - } /* fall through */ case Blocking: - /* fall through */ - case Disabled: - /* should drop frames, but for now, we let - * them get passed up to the next higher layer + ++br_stats_cnt.notForwarding; return(br_drop(skb)); - */ - return(0); /* pass frame up stack */ + /* + case Disabled: is now handled before this switch ! + Keep the break to allow GCC to use a jmp table. + */ break; case Forwarding: - (void) br_learn(skb, port); /* 3.8 */ - /* process BPDUs */ - if (memcmp(eth->h_dest, bridge_ula, - ETH_ALEN) == 0) - { - /*printk("frame bpdu processor for me!!!\n");*/ - br_bpdu(skb); - return(1); /* br_bpdu consumes skb */ - } - /* is frame for me? */ - if (memcmp(eth->h_dest, - port_info[port].dev->dev_addr, - ETH_ALEN) == 0) - { - /* Packet is for us */ - skb->pkt_type = PACKET_HOST; - return(0); /* pass frame up our stack (this will */ - /* happen in net_bh() in dev.c) */ + if(br_learn(skb, port)) { /* 3.8 */ + ++br_stats_cnt.drop_multicast; + return br_drop(skb); } /* Now this frame came from one of bridged - ports, and it appears to be not for me; - this means we should attempt to forward it. - But actually this frame can still be for me - [as well] if it is destined to one of our - multicast groups. br_forward() will not - consume the frame if this is the case */ + ports this means we should attempt to forward it. + JRP: local addresses are now in the AVL tree, + br_forward will pass frames up if it matches + one of our local MACs or if it is a multicast + group address. 
+ br_forward() will not consume the frame if this + is the case */ return(br_forward(skb, port)); default: printk(KERN_DEBUG "br_receive_frame: port [%i] unknown state [%i]\n", - port, port_info[port].state); - return(0); /* pass frame up stack? */ + port, p->state); + ++br_stats_cnt.unknown_state; + return(br_drop(skb)); /* discard frame */ } } @@ -1304,15 +1435,17 @@ int br_tx_frame(struct sk_buff *skb) /* 3.5 */ /* if bridging is not enabled on the port we are going to send to, we have nothing to do with this frame, hands off */ - if (! find_port(skb->dev)) + if (((port=find_port(skb->dev))==0)||(port_info[port].state==Disabled)) { + ++br_stats_cnt.port_disable; return(0); - + } + ++br_stats_cnt.port_not_disable; skb->mac.raw = skb->h.raw = skb->data; eth = skb->mac.ethernet; - port = 0; /* an impossible port */ + port = 0; /* an impossible port (locally generated) */ if (br_stats.flags & BR_DEBUG) - printk("br_tx_fr : port %i src %02x:%02x:%02x:%02x:%02x:%02x\ - dest %02x:%02x:%02x:%02x:%02x:%02x\n", + printk("br_tx_fr : port %i src %02x:%02x:%02x:%02x:%02x:%02x" + " dest %02x:%02x:%02x:%02x:%02x:%02x\n", port, eth->h_source[0], eth->h_source[1], @@ -1329,55 +1462,135 @@ int br_tx_frame(struct sk_buff *skb) /* 3.5 */ return(br_forward(skb, port)); } +static void br_add_local_mac(unsigned char *mac) +{ + struct fdb *f; + f = (struct fdb *)kmalloc(sizeof(struct fdb), GFP_ATOMIC); + if (!f) + { + printk(KERN_CRIT "br_add_local_mac: unable to malloc fdb\n"); + return; + } + f->port = 0; /* dest port == 0 =>local */ + memcpy(f->ula, mac, 6); + f->timer = 0; /* will not aged anyway */ + f->flags = 0; /* not valid => br_forward special route */ + /* + * add entity to AVL tree. If entity already + * exists in the tree, update the fields with + * what we have here. + */ + if (br_avl_insert(f) != NULL) + { + /* Already in */ + kfree(f); + } +} + +/* Avoid broadcast loop by limiting the number of broacast frames per + * period. The idea is to limit this per source + * returns: 0 if limit is not reached + * 1 if frame should be dropped + */ + +static inline int mcast_quench(struct fdb *f) +{ + if(f->mcast_count++ == 0) /* first time */ + f->mcast_timer = jiffies; + else { + if(f->mcast_count > max_mcast_per_period) { + if(jiffies > (f->mcast_timer + mcast_hold_time)) + f->mcast_count = 0; + else return 1; + } + } + return 0; +} + /* * this routine returns 0 when it learns (or updates) from the - * frame, and -1 if the frame is simply discarded due to port - * state or lack of resources... + * frame, and 1 if we must dropped the frame. */ static int br_learn(struct sk_buff *skb, int port) /* 3.8 */ { - struct fdb *f; + struct fdb *f, *oldfdb; + Port_data *p = &port_info[port]; + struct ethhdr *eth = skb->mac.ethernet; - switch (port_info[port].state) { - case Listening: - case Blocking: - case Disabled: - default: - return(-1); - /* break; */ - case Learning: - case Forwarding: - /* don't keep group addresses in the tree */ - if (skb->mac.ethernet->h_source[0] & 0x01) - return(-1); - - f = (struct fdb *)kmalloc(sizeof(struct fdb), - GFP_ATOMIC); + /* JRP: no reason to check port state again. We are called by + * br_receive_frame() only when in Learning or Forwarding + * Remark: code not realigned yet to keep diffs smaller + */ - if (!f) { - printk(KERN_DEBUG "br_learn: unable to malloc fdb\n"); - return(-1); - } - f->port = port; /* source port */ - memcpy(f->ula, skb->mac.ethernet->h_source, 6); - f->timer = CURRENT_TIME; - f->flags = FDB_ENT_VALID; - /* - * add entity to AVL tree. 
If entity already - * exists in the tree, update the fields with - * what we have here. - */ - if (br_avl_insert(f) == 0) { /* update */ - kfree(f); - return(0); - } - /* add to head of port chain */ - f->fdb_next = port_info[port].fdb; - port_info[port].fdb = f; - return(0); - /* break */ + /* don't keep group addresses in the tree */ + if (eth->h_source[0] & 0x01) + return 0; + + if((f= newfdb[port]) == NULL) + { + newfdb[port] = f = (struct fdb *)kmalloc(sizeof(struct fdb), GFP_ATOMIC); + if (!f) + { + printk(KERN_DEBUG "br_learn: unable to malloc fdb\n"); + return(-1); /* this drop the frame */ + } + } + f->port = port; /* source port */ + memcpy(f->ula, eth->h_source, 6); + f->timer = CURRENT_TIME; + f->flags = FDB_ENT_VALID; + /* + * add entity to AVL tree. If entity already + * exists in the tree, update the fields with + * what we have here. + */ + if ((oldfdb = br_avl_insert(f))) + { + /* update if !NULL */ + if((eth->h_dest[0] & 0x01) && /* multicast */ mcast_quench(oldfdb)) + return 1; + return 0; } + newfdb[port] = NULL; /* force kmalloc next time */ + f->mcast_count = 0; + /* add to head of port chain */ + f->fdb_next = p->fdb; + p->fdb = f; + allocated_fdb_cnt++; + return 0; +} + +/* JRP: always called under br_receive_frame(). No need for Q protection. */ + +void requeue_fdb(struct fdb *node, int new_port) +{ + Port_data *p = &port_info[node->port]; + + /* dequeue */ + if(p->fdb == node) + p->fdb = node->fdb_next; + else + { + struct fdb *prev; + + for(prev = p->fdb; prev; prev = prev->fdb_next) + if (prev->fdb_next == node) + break; + + if(prev != NULL) + prev->fdb_next = node->fdb_next; + else + { + /* Forget about this update. */ + printk(KERN_ERR "br:requeue_fdb\n"); + return; + } + } + /* enqueue */ + node->port = new_port; + node->fdb_next = port_info[new_port].fdb; + port_info[new_port].fdb = node; } /* @@ -1429,26 +1642,43 @@ static int br_forward(struct sk_buff *skb, int port) /* 3.7 */ * This probably should be dropped since the flood will * have sent it anyway. */ - if (port == 0) /* locally generated */ + if (port == 0) + { + /* Locally generated */ + ++br_stats_cnt.local_multicast; return(br_dev_drop(skb)); + } + ++br_stats_cnt.forwarded_multicast; return(0); - } else { - /* locate port to forward to */ + } + else + { + /* unicast frame, locate port to forward to */ f = br_avl_find_addr(skb->mac.ethernet->h_dest); /* * Send flood and drop. */ - if (!f || !(f->flags & FDB_ENT_VALID)) { - /* not found; flood all ports */ + if (!f || !(f->flags & FDB_ENT_VALID)) + { + if(f && (f->port == 0)) + { + skb->pkt_type = PACKET_HOST; + ++br_stats_cnt.forwarded_unicast_up_stack; + return(0); + } + /* not found or too old; flood all ports */ + ++br_stats_cnt.flood_unicast; br_flood(skb, port); return(br_dev_drop(skb)); } /* * Sending */ - if (f->port!=port && port_info[f->port].state == Forwarding) { - /* has entry expired? */ - if (f->timer + fdb_aging_time < CURRENT_TIME) { + if (f->port!=port && port_info[f->port].state == Forwarding) + { + /* Has entry expired? */ + if (f->timer + fdb_aging_time < CURRENT_TIME) + { /* timer expired, invalidate entry */ f->flags &= ~FDB_ENT_VALID; if (br_stats.flags & BR_DEBUG) @@ -1456,9 +1686,11 @@ static int br_forward(struct sk_buff *skb, int port) /* 3.7 */ /* * Send flood and drop original */ + ++br_stats_cnt.aged_flood_unicast; br_flood(skb, port); return(br_dev_drop(skb)); } + ++br_stats_cnt.forwarded_unicast; /* mark that's we've been here... 
*/ skb->pkt_bridged = IS_BRIDGED; @@ -1477,7 +1709,25 @@ static int br_forward(struct sk_buff *skb, int port) /* 3.7 */ skb->priority = 1; dev_queue_xmit(skb); return(1); /* skb has been consumed */ - } else { + } + else + { + /* JRP: Needs to aged entry as well, if topology changes + * the entry would not age. Got this while swapping + * two cables ! + * + * Has entry expired? + */ + + if (f->timer + fdb_aging_time < CURRENT_TIME) + { + /* timer expired, invalidate entry */ + f->flags &= ~FDB_ENT_VALID; + if (br_stats.flags & BR_DEBUG) + printk("fdb entry expired...\n"); + ++br_stats_cnt.drop_same_port_aged; + } + else ++br_stats_cnt.drop_same_port; /* * Arrived on the right port, we discard */ @@ -1499,7 +1749,7 @@ static int br_flood(struct sk_buff *skb, int port) for (i = One; i <= No_of_ports; i++) { - if (i == port) + if (i == port) /* don't send back where we got it */ continue; if (port_info[i].state == Forwarding) { @@ -1515,8 +1765,12 @@ static int br_flood(struct sk_buff *skb, int port) /* printk("Flood to port %d\n",i);*/ nskb->h.raw = nskb->data + ETH_HLEN; +#if LINUX_VERSION_CODE >= 0x20100 nskb->priority = 1; dev_queue_xmit(nskb); +#else + dev_queue_xmit(nskb,nskb->dev,1); +#endif } } return(0); @@ -1527,12 +1781,16 @@ static int find_port(struct device *dev) int i; for (i = One; i <= No_of_ports; i++) - if ((port_info[i].dev == dev) && - (port_info[i].state != Disabled)) + if (port_info[i].dev == dev) return(i); return(0); } +/* + * FIXME: This needs to come from the device structs, eg for + * 10,100,1Gbit ethernet. + */ + static int br_port_cost(struct device *dev) /* 4.10.2 */ { if (strncmp(dev->name, "eth", 3) == 0) /* ethernet */ @@ -1546,43 +1804,103 @@ static int br_port_cost(struct device *dev) /* 4.10.2 */ * this routine always consumes the skb */ -static void br_bpdu(struct sk_buff *skb) /* consumes skb */ +static void br_bpdu(struct sk_buff *skb, int port) /* consumes skb */ { - Tcn_bpdu *bpdu; - int port; - - port = find_port(skb->dev); - if (port == 0) { /* unknown port */ - br_drop(skb); - return; - } - - bpdu = (Tcn_bpdu *) (skb->data + ETH_HLEN); - switch (bpdu->type) { - case BPDU_TYPE_CONFIG: - received_config_bpdu(port, (Config_bpdu *)bpdu); - break; - case BPDU_TYPE_TOPO_CHANGE: - received_tcn_bpdu(port, bpdu); - break; - default: - printk(KERN_DEBUG "br_bpdu: received unknown bpdu, type = %i\n", - bpdu->type); + char *bufp = skb->data + ETH_HLEN; + Tcn_bpdu *bpdu = (Tcn_bpdu *) (bufp + BRIDGE_LLC1_HS); + Config_bpdu rcv_bpdu; + + if((*bufp++ == BRIDGE_LLC1_DSAP) && (*bufp++ == BRIDGE_LLC1_SSAP) && + (*bufp++ == BRIDGE_LLC1_CTRL) && + (bpdu->protocol_id == BRIDGE_BPDU_8021_PROTOCOL_ID) && + (bpdu->protocol_version_id == BRIDGE_BPDU_8021_PROTOCOL_VERSION_ID)) + { + + switch (bpdu->type) + { + case BPDU_TYPE_CONFIG: + /* realign for portability to RISC */ + memcpy((char*)&rcv_bpdu, bufp, + BRIDGE_BPDU_8021_CONFIG_FLAG_OFFSET); + bufp+= BRIDGE_BPDU_8021_CONFIG_FLAG_OFFSET; + rcv_bpdu.top_change_ack = + (*bufp & TOPOLOGY_CHANGE_ACK) != 0; + rcv_bpdu.top_change = + (*bufp & TOPOLOGY_CHANGE) != 0; + bufp++; + memcpy((char*)&rcv_bpdu.root_id, bufp, + BRIDGE_BPDU_8021_CONFIG_SIZE-1 + -BRIDGE_BPDU_8021_CONFIG_FLAG_OFFSET); + config_bpdu_ntoh(&rcv_bpdu); + received_config_bpdu(port, &rcv_bpdu); + break; + + case BPDU_TYPE_TOPO_CHANGE: + received_tcn_bpdu(port, bpdu); + break; + default: + printk(KERN_DEBUG "br_bpdu: received unknown bpdu, type = %i\n", bpdu->type); /* break; */ + } } br_drop(skb); } +struct fdb_info *get_fdb_info(int user_buf_size, int *copied,int 
*notcopied) +{ + int fdb_size, i, built = 0; + struct fdb_info *fdbi, *fdbis; + + *copied = user_buf_size - sizeof(struct fdb_info_hdr); + *copied /= sizeof(struct fdb_info); + *copied = min(*copied, allocated_fdb_cnt); + *notcopied = allocated_fdb_cnt - *copied; + if(*copied == 0) + return NULL; + fdb_size = *copied * sizeof(struct fdb_info); + fdbis = kmalloc(fdb_size, GFP_KERNEL); + if(fdbis == NULL) + return NULL; + fdbi = fdbis; + + for(i=One; i<=No_of_ports;i++) + { + struct fdb *fdb; + + cli(); + fdb = port_info[i].fdb; + while(fdb) + { + memcpy(fdbi->ula, fdb->ula, ETH_ALEN); + fdbi->port = fdb->port; + fdbi->flags = fdb->flags; + fdbi->timer = fdb->timer; + fdbi++; + if(++built == *copied) + { + sti(); + return fdbis; + } + fdb = fdb->fdb_next; + } + sti(); + } + printk(KERN_DEBUG "get_fdb_info: built=%d\n", built); + return fdbis; +} + int br_ioctl(unsigned int cmd, void *arg) { - int err; + int err, i; struct br_cf bcf; + bridge_id_t new_id; switch(cmd) { case SIOCGIFBR: /* get bridging control blocks */ memcpy(&br_stats.bridge_data, &bridge_info, sizeof(Bridge_data)); memcpy(&br_stats.port_data, &port_info, sizeof(Port_data)*No_of_ports); + err = copy_to_user(arg, &br_stats, sizeof(struct br_stat)); if (err) { @@ -1590,17 +1908,33 @@ int br_ioctl(unsigned int cmd, void *arg) } return err; case SIOCSIFBR: - if (!suser()) - return -EPERM; err = copy_from_user(&bcf, arg, sizeof(struct br_cf)); if (err) return -EFAULT; - switch (bcf.cmd) { + if (bcf.cmd != BRCMD_DISPLAY_FDB && !suser()) + return -EPERM; + switch (bcf.cmd) + { case BRCMD_BRIDGE_ENABLE: if (br_stats.flags & BR_UP) return(-EALREADY); printk(KERN_DEBUG "br: enabling bridging function\n"); br_stats.flags |= BR_UP; /* enable bridge */ + for(i=One;i<=No_of_ports; i++) + { + /* don't start if user said so */ + if((user_port_state[i] != Disabled) + && port_info[i].dev) + { + enable_port(i); + } + } + port_state_selection(); /* (4.8.1.5) */ + config_bpdu_generation(); /* (4.8.1.6) */ + /* initialize system timer */ + tl.expires = jiffies+HZ; /* 1 second */ + tl.function = br_tick; + add_timer(&tl); start_hello_timer(); break; case BRCMD_BRIDGE_DISABLE: @@ -1609,35 +1943,41 @@ int br_ioctl(unsigned int cmd, void *arg) printk(KERN_DEBUG "br: disabling bridging function\n"); br_stats.flags &= ~BR_UP; /* disable bridge */ stop_hello_timer(); -#if 0 for (i = One; i <= No_of_ports; i++) if (port_info[i].state != Disabled) disable_port(i); -#endif break; case BRCMD_PORT_ENABLE: if (port_info[bcf.arg1].dev == 0) return(-EINVAL); - if (port_info[bcf.arg1].state != Disabled) + if (user_port_state[bcf.arg1] != Disabled) return(-EALREADY); printk(KERN_DEBUG "br: enabling port %i\n",bcf.arg1); - enable_port(bcf.arg1); + user_port_state[bcf.arg1] = ~Disabled; + if(br_stats.flags & BR_UP) + enable_port(bcf.arg1); break; case BRCMD_PORT_DISABLE: if (port_info[bcf.arg1].dev == 0) return(-EINVAL); - if (port_info[bcf.arg1].state == Disabled) + if (user_port_state[bcf.arg1] == Disabled) return(-EALREADY); printk(KERN_DEBUG "br: disabling port %i\n",bcf.arg1); - disable_port(bcf.arg1); + user_port_state[bcf.arg1] = Disabled; + if(br_stats.flags & BR_UP) + disable_port(bcf.arg1); break; case BRCMD_SET_BRIDGE_PRIORITY: - set_bridge_priority((bridge_id_t *)&bcf.arg1); + new_id = bridge_info.bridge_id; + new_id.BRIDGE_PRIORITY = htons(bcf.arg1); + set_bridge_priority(&new_id); break; case BRCMD_SET_PORT_PRIORITY: - if (port_info[bcf.arg1].dev == 0) + if((port_info[bcf.arg1].dev == 0) + || (bcf.arg2 & ~0xff)) return(-EINVAL); - 
set_port_priority(bcf.arg1, bcf.arg2); + port_priority[bcf.arg1] = bcf.arg2; + set_port_priority(bcf.arg1); break; case BRCMD_SET_PATH_COST: if (port_info[bcf.arg1].dev == 0) @@ -1664,6 +2004,36 @@ int br_ioctl(unsigned int cmd, void *arg) memset(&br_stats.prot_id,0,sizeof(br_stats.prot_id)); memset(&br_stats.prot_counter,0,sizeof(br_stats.prot_counter)); break; + case BRCMD_DISPLAY_FDB: + { + struct fdb_info_hdr *user_buf = (void*) bcf.arg1; + struct fdb_info *u_fdbs, *fdbis; + int copied, notcopied; + u32 j = CURRENT_TIME; + + if(bcf.arg2<sizeof(struct fdb_info_hdr)) + return -EINVAL; + put_user(j, &user_buf->cmd_time); + if(allocated_fdb_cnt == 0) + { + put_user(0, &user_buf->copied); + put_user(0, &user_buf->not_copied); + return 0; + } + fdbis = get_fdb_info(bcf.arg2, &copied, ¬copied); + put_user(copied, &user_buf->copied); + put_user(notcopied, &user_buf->not_copied); + if(!fdbis) + return -ENOMEM; + u_fdbs = (struct fdb_info *) (user_buf+1); + err = copy_to_user(u_fdbs, fdbis, copied*sizeof(struct fdb_info)); + kfree(fdbis); + if (err) + { + err = -EFAULT; + } + return err; + } default: return -EINVAL; } @@ -1680,12 +2050,13 @@ static int br_cmp(unsigned int *a, unsigned int *b) int i; for (i=0; i<2; i++) { - if (a[i] == b[i]) - continue; - if (a[i] < b[i]) - return(1); - if (a[i] > b[i]) + /* JRP: compares prty then MAC address in memory byte order + * OK optimizer does htonl() only once per long ! + */ + if (htonl(a[i]) < htonl(b[i])) return(-1); + if (htonl(a[i]) > htonl(b[i])) + return(1); } return(0); } diff --git a/net/bridge/br_tree.c b/net/bridge/br_tree.c index 8234249c5..709bafb2b 100644 --- a/net/bridge/br_tree.c +++ b/net/bridge/br_tree.c @@ -1,6 +1,7 @@ /* - * this code is derived from the avl functions in mmap.c + * This code is derived from the avl functions in mmap.c */ + #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> @@ -16,6 +17,10 @@ * Written by Bruno Haible <haible@ma2s2.mathematik.uni-karlsruhe.de>. * Taken from mmap.c, extensively modified by John Hayes * <hayes@netplumbing.com> + * 98-02 Modified by Jean-Rene Peulve jr.peulve@aix.pacwan.net + * update port number when topology change + * return oldfdb when updating, for broadcast storm checking + * call addr_cmp once per node */ static struct fdb fdb_head; @@ -50,8 +55,7 @@ static int addr_cmp(unsigned char *a1, unsigned char *a2); * foreach node in tree->fdb_avl_right: node->fdb_avl_key >= tree->fdb_avl_key. */ -static int -fdb_init(void) +static int fdb_init(void) { fdb_head.fdb_avl_height = 0; fdb_head.fdb_avl_left = (struct fdb *)0; @@ -109,7 +113,6 @@ struct fdb *br_avl_find_addr(unsigned char addr[6]) } -#if (0) /* * Rebalance a tree. * After inserting or deleting a node of a tree we have a sequence of subtrees @@ -196,10 +199,14 @@ static void br_avl_rebalance (struct fdb *** nodeplaces_ptr, int count) printk_avl(&fdb_head); #endif /* DEBUG_AVL */ } -#endif /* (0) */ -/* Insert a node into a tree. */ -int br_avl_insert (struct fdb * new_node) +/* Insert a node into a tree. + * Performance improvement: + * call addr_cmp() only once per node and use result in a switch. 
+ * Return old node address if we knew that MAC address already + * Return NULL if we insert the new node + */ +struct fdb *br_avl_insert (struct fdb * new_node) { struct fdb ** nodeplace = fhpp; struct fdb ** stack[avl_maxheight]; @@ -214,15 +221,38 @@ int br_avl_insert (struct fdb * new_node) if (node == avl_br_empty) break; *stack_ptr++ = nodeplace; stack_count++; - if (addr_cmp(new_node->ula, node->ula) == 0) { /* update */ + switch(addr_cmp(new_node->ula, node->ula)) { + case 0: /* update */ + if (node->port == new_node->port) { node->flags = new_node->flags; node->timer = new_node->timer; - return(0); - } - if (addr_cmp(new_node->ula, node->ula) < 0) { - nodeplace = &node->fdb_avl_left; - } else { - nodeplace = &node->fdb_avl_right; + } else if (!(node->flags & FDB_ENT_VALID) && + node->port) { + /* update fdb but never for local interfaces */ +#if (DEBUG_AVL) + printk("node 0x%x:port changed old=%d new=%d\n", + (unsigned int)node, node->port,new_node->port); +#endif + /* JRP: update port as well if the topology change ! + * Don't do this while entry is still valid otherwise + * a broadcast that we flooded and is reentered by another + * port would mess up the good port number. + * The fdb list per port needs to be updated as well. + */ + requeue_fdb(node, new_node->port); + node->flags = new_node->flags; + node->timer = new_node->timer; +#if (DEBUG_AVL) + printk_avl(&fdb_head); +#endif /* DEBUG_AVL */ + } + return node; /* pass old fdb to caller */ + + case 1: /* new_node->ula > node->ula */ + nodeplace = &node->fdb_avl_right; + break; + default: /* -1 => new_node->ula < node->ula */ + nodeplace = &node->fdb_avl_left; } } #if (DEBUG_AVL) @@ -239,17 +269,14 @@ int br_avl_insert (struct fdb * new_node) new_node->fdb_avl_right = avl_br_empty; new_node->fdb_avl_height = 1; *nodeplace = new_node; -#if (0) br_avl_rebalance(stack_ptr,stack_count); -#endif /* (0) */ #ifdef DEBUG_AVL printk_avl(&fdb_head); #endif /* DEBUG_AVL */ - return(1); + return NULL; /* this is a new node */ } -#if (0) /* Removes a node out of a tree. 
*/ static int br_avl_remove (struct fdb * node_to_delete) { @@ -302,7 +329,6 @@ static int br_avl_remove (struct fdb * node_to_delete) br_avl_rebalance(stack_ptr,stack_count); return(0); } -#endif /* (0) */ #ifdef DEBUG_AVL @@ -311,13 +337,14 @@ static void printk_avl (struct fdb * tree) { if (tree != avl_br_empty) { printk("("); - printk("%02x:%02x:%02x:%02x:%02x:%02x", + printk("%02x:%02x:%02x:%02x:%02x:%02x(%d)", tree->ula[0], tree->ula[1], tree->ula[2], tree->ula[3], tree->ula[4], - tree->ula[5]); + tree->ula[5], + tree->port); if (tree->fdb_avl_left != avl_br_empty) { printk_avl(tree->fdb_avl_left); printk("<"); @@ -330,7 +357,6 @@ static void printk_avl (struct fdb * tree) } } -#if (0) static char *avl_check_point = "somewhere"; /* check a tree's consistency and balancing */ @@ -387,7 +413,6 @@ static void avl_checkorder (struct fdb * tree) avl_checkright(tree->fdb_avl_right,tree->fdb_avl_key); } -#endif /* (0) */ #endif /* DEBUG_AVL */ static int addr_cmp(unsigned char a1[], unsigned char a2[]) diff --git a/net/core/Makefile b/net/core/Makefile index fc9dc31c4..ecbe9d99a 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -9,8 +9,7 @@ O_TARGET := core.o -O_OBJS := sock.o skbuff.o iovec.o datagram.o dst.o scm.o \ - neighbour.o rtnetlink.o utils.o +O_OBJS := sock.o skbuff.o iovec.o datagram.o scm.o ifeq ($(CONFIG_SYSCTL),y) O_OBJS += sysctl_net_core.o @@ -22,7 +21,7 @@ endif ifdef CONFIG_NET -O_OBJS += dev.o dev_mcast.o +O_OBJS += dev.o dev_mcast.o dst.o neighbour.o rtnetlink.o utils.o ifdef CONFIG_FIREWALL OX_OBJS += firewall.o diff --git a/net/core/dev.c b/net/core/dev.c index 36efa363b..85312b12c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1342,10 +1342,7 @@ int dev_change_flags(struct device *dev, unsigned flags) ret = 0; if ((old_flags^flags)&IFF_UP) /* Bit is different ? */ { - if(old_flags&IFF_UP) /* Gone down */ - ret=dev_close(dev); - else /* Come up */ - ret=dev_open(dev); + ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev); if (ret == 0) dev_mc_upload(dev); @@ -1792,7 +1789,9 @@ __initfunc(int net_dev_init(void)) { struct device *dev, **dp; +#ifdef CONFIG_NET_SCHED pktsched_init(); +#endif /* * Initialise the packet receive queue. diff --git a/net/core/iovec.c b/net/core/iovec.c index 5b684a48f..67f7a6f2b 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -29,7 +29,8 @@ #include <net/checksum.h> /* - * Verify iovec + * Verify iovec. The caller must ensure that the iovec is big enough + * to hold the message iovec. * * Save time not doing verify_area. copy_*_user will make this work * in any case. 
@@ -37,8 +38,7 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, char *address, int mode) { - int size = m->msg_iovlen * sizeof(struct iovec); - int err, ct; + int size, err, ct; if(m->msg_namelen) { @@ -53,28 +53,16 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, char *address, int mode) } else m->msg_name = NULL; - if (m->msg_iovlen > UIO_FASTIOV) - { - err = -ENOMEM; - iov = kmalloc(size, GFP_KERNEL); - if (!iov) - goto out; - } - + err = -EFAULT; + size = m->msg_iovlen * sizeof(struct iovec); if (copy_from_user(iov, m->msg_iov, size)) - goto out_free; + goto out; m->msg_iov=iov; for (err = 0, ct = 0; ct < m->msg_iovlen; ct++) err += iov[ct].iov_len; out: return err; - -out_free: - err = -EFAULT; - if (m->msg_iovlen > UIO_FASTIOV) - kfree(iov); - goto out; } /* diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index cf7fe8ff8..4bbe84cac 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -63,6 +63,19 @@ void rtnl_unlock() rtnl_shunlock(); } +int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len) +{ + memset(tb, 0, sizeof(struct rtattr*)*maxattr); + + while (RTA_OK(rta, len)) { + unsigned flavor = rta->rta_type; + if (flavor && flavor <= maxattr) + tb[flavor-1] = rta; + rta = RTA_NEXT(rta, len); + } + return 0; +} + #ifdef CONFIG_RTNETLINK struct sock *rtnl; @@ -109,6 +122,19 @@ void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data memcpy(RTA_DATA(rta), data, attrlen); } +int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo) +{ + int err = 0; + + NETLINK_CB(skb).dst_groups = group; + if (echo) + atomic_inc(&skb->users); + netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL); + if (echo) + err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); + return err; +} + #ifdef CONFIG_RTNL_OLD_IFINFO static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct device *dev, int type, pid_t pid, u32 seq) @@ -132,7 +158,7 @@ static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct device *dev, strncpy(r->ifi_name, dev->name, IFNAMSIZ-1); r->ifi_qdiscname[0] = 0; r->ifi_qdisc = dev->qdisc_sleeping->handle; - if (dev->qdisc_sleeping->ops) + if (dev->qdisc_sleeping) strcpy(r->ifi_qdiscname, dev->qdisc_sleeping->ops->id); if (dev->get_stats) { struct net_device_stats *stats = dev->get_stats(dev); @@ -175,7 +201,7 @@ static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct device *dev, } if (dev->ifindex != dev->iflink) RTA_PUT(skb, IFLA_LINK, sizeof(int), &dev->iflink); - if (dev->qdisc_sleeping->ops) + if (dev->qdisc_sleeping) RTA_PUT(skb, IFLA_QDISC, strlen(dev->qdisc_sleeping->ops->id) + 1, dev->qdisc_sleeping->ops->id); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 57e58f85a..abad1e217 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -273,7 +273,6 @@ struct sk_buff *skb_copy(struct sk_buff *skb, int gfp_mask) n->csum = skb->csum; n->list=NULL; n->sk=NULL; - n->when=skb->when; n->dev=skb->dev; n->priority=skb->priority; n->protocol=skb->protocol; @@ -281,9 +280,6 @@ struct sk_buff *skb_copy(struct sk_buff *skb, int gfp_mask) n->h.raw=skb->h.raw+offset; n->nh.raw=skb->nh.raw+offset; n->mac.raw=skb->mac.raw+offset; - n->seq=skb->seq; - n->end_seq=skb->end_seq; - n->ack_seq=skb->ack_seq; memcpy(n->cb, skb->cb, sizeof(skb->cb)); n->used=skb->used; n->is_clone=0; @@ -323,7 +319,6 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, int newheadroom) memcpy(n->data,skb->data,skb->len); n->list=NULL; n->sk=NULL; - n->when=skb->when; n->priority=skb->priority; 
n->protocol=skb->protocol; n->dev=skb->dev; @@ -332,9 +327,6 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, int newheadroom) n->nh.raw=skb->nh.raw+offset; n->mac.raw=skb->mac.raw+offset; memcpy(n->cb, skb->cb, sizeof(skb->cb)); - n->seq=skb->seq; - n->end_seq=skb->end_seq; - n->ack_seq=skb->ack_seq; n->used=skb->used; n->is_clone=0; atomic_set(&n->users, 1); diff --git a/net/core/sock.c b/net/core/sock.c index 7707c70d0..30e5d3e77 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -290,6 +290,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, break; +#ifdef CONFIG_NETDEVICES case SO_BINDTODEVICE: /* Bind this socket to a particular device like "eth0", * as specified in an ifreq structure. If the device @@ -316,6 +317,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, } } return 0; +#endif #ifdef CONFIG_FILTER @@ -589,38 +591,37 @@ void sock_kfree_s(struct sock *sk, void *mem, int size) */ unsigned long sock_rspace(struct sock *sk) { - int amt; + int amt = 0; if (sk != NULL) { - /* This used to have some bizzare complications that + /* This used to have some bizarre complications that * to attempt to reserve some amount of space. This doesn't * make sense, since the number returned here does not * actually reflect allocated space, but rather the amount * of space we committed to. We gamble that we won't * run out of memory, and returning a smaller number does - * not change the gamble. If we loose the gamble tcp still + * not change the gamble. If we lose the gamble tcp still * works, it may just slow down for retransmissions. */ amt = sk->rcvbuf - atomic_read(&sk->rmem_alloc); if (amt < 0) - return(0); - return(amt); + amt = 0; } - return(0); + return amt; } /* FIXME: this is also insane. See above comment */ unsigned long sock_wspace(struct sock *sk) { - if (sk != NULL) { - if (sk->shutdown & SEND_SHUTDOWN) - return(0); - if (atomic_read(&sk->wmem_alloc) >= sk->sndbuf) - return(0); - return sk->sndbuf - atomic_read(&sk->wmem_alloc); + int amt = 0; + + if (sk != NULL && !(sk->shutdown & SEND_SHUTDOWN)) { + amt = sk->sndbuf - atomic_read(&sk->wmem_alloc); + if (amt < 0) + amt = 0; } - return(0); + return amt; } /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. @@ -653,13 +654,17 @@ static void sock_wait_for_wmem(struct sock * sk) * Generic send/receive buffer handlers */ -struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, unsigned long fallback, int noblock, int *errcode) +struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, + unsigned long fallback, int noblock, int *errcode) { int err; struct sk_buff *skb; - do { - if ((err = xchg(&sk->err,0)) != 0) + while (1) { + unsigned long try_size = size; + + err = sock_error(sk); + if (err != 0) goto failure; /* @@ -676,33 +681,32 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, unsigne if (sk->shutdown&SEND_SHUTDOWN) goto failure; - if (!fallback) - skb = sock_wmalloc(sk, size, 0, sk->allocation); - else { - /* The buffer get won't block, or use the atomic queue. It does - produce annoying no free page messages still.... */ + if (fallback) { + /* The buffer get won't block, or use the atomic queue. + * It does produce annoying no free page messages still. 
+ */ skb = sock_wmalloc(sk, size, 0, GFP_BUFFER); - if (!skb) - skb=sock_wmalloc(sk, fallback, 0, sk->allocation); + if (skb) + break; + try_size = fallback; } + skb = sock_wmalloc(sk, try_size, 0, sk->allocation); + if (skb) + break; /* * This means we have too many buffers for this socket already. */ - /* The following code is stolen "as is" from tcp.c */ - - if (skb==NULL) { - sk->socket->flags |= SO_NOSPACE; - err = -EAGAIN; - if (noblock) - goto failure; - err = -ERESTARTSYS; - if (signal_pending(current)) - goto failure; - sock_wait_for_wmem(sk); - } - } while (skb==NULL); + sk->socket->flags |= SO_NOSPACE; + err = -EAGAIN; + if (noblock) + goto failure; + err = -ERESTARTSYS; + if (signal_pending(current)) + goto failure; + sock_wait_for_wmem(sk); + } return skb; diff --git a/net/econet/.cvsignore b/net/econet/.cvsignore new file mode 100644 index 000000000..857dd22e9 --- /dev/null +++ b/net/econet/.cvsignore @@ -0,0 +1,2 @@ +.depend +.*.flags diff --git a/net/econet/Makefile b/net/econet/Makefile new file mode 100644 index 000000000..367584873 --- /dev/null +++ b/net/econet/Makefile @@ -0,0 +1,23 @@ +# +# Makefile for Econet support code. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definition is now in the main makefile... + +MOD_LIST_NAME := NET_MISC_MODULES + +O_OBJS := +M_OBJS := + +ifeq ($(CONFIG_ECONET),y) + O_OBJS += econet.o +else + ifeq ($(CONFIG_ECONET), m) + M_OBJS += econet.o + endif +endif + +include $(TOPDIR)/Rules.make diff --git a/net/econet/econet.c b/net/econet/econet.c new file mode 100644 index 000000000..9bfbfd921 --- /dev/null +++ b/net/econet/econet.c @@ -0,0 +1,1108 @@ +/* + * An implementation of the Acorn Econet and AUN protocols. + * Philip Blundell <philb@gnu.org> + * + * Fixes: + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/config.h> +#include <linux/module.h> + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/route.h> +#include <linux/inet.h> +#include <linux/etherdevice.h> +#include <linux/if_arp.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/inet_common.h> +#include <linux/stat.h> +#include <linux/init.h> +#include <linux/if_ec.h> +#include <net/udp.h> +#include <net/ip.h> +#include <asm/spinlock.h> +#include <linux/inetdevice.h> + +static struct proto_ops econet_ops; +static struct sock *econet_sklist; + +#ifdef CONFIG_ECONET_AUNUDP +static struct socket *udpsock; +#define AUN_PORT 0x8000 + +struct aunhdr +{ + unsigned char code; /* AUN magic protocol byte */ + unsigned char port; + unsigned char cb; + unsigned char pad; + unsigned long handle; +}; + +static unsigned long aun_seq = 0; + +/* Queue of packets waiting to be transmitted. 
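For orientation: the AUN emulation carries each Econet datagram inside a UDP packet to port 0x8000, prefixed by the header defined above (code, port, control byte and pad, plus a 32-bit handle on the 32-bit targets this was written for). A sketch of how a data frame's header ends up filled in, mirroring the transmit path further down; the local variable names are invented for the example:

	struct aunhdr ah;

	ah.code = 2;		/* 2 = data, 3 = ack, 4 = "not listening" */
	ah.port = port;		/* Econet port number */
	ah.cb = cb & 0x7f;	/* control byte, top bit stripped for AUN */
	ah.pad = 0;
	ah.handle = seq;	/* matched against the acknowledgement later */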
*/ +static struct sk_buff_head aun_queue; +static struct timer_list ab_cleanup_timer; + +#endif /* CONFIG_ECONET_AUNUDP */ + +/* Per-packet information */ +struct ec_cb +{ + struct sockaddr_ec sec; + unsigned long cookie; /* Supplied by user. */ +#ifdef CONFIG_ECONET_AUNUDP + int done; + unsigned long seq; /* Sequencing */ + unsigned long timeout; /* Timeout */ + unsigned long start; /* jiffies */ +#endif +#ifdef CONFIG_ECONET_NATIVE + void (*sent)(struct sk_buff *, int result); +#endif +}; + +struct ec_device +{ + struct device *dev; /* Real device structure */ + unsigned char station, net; /* Econet protocol address */ + struct ec_device *prev, *next; /* Linked list */ +}; + +static struct ec_device *edevlist = NULL; + +static spinlock_t edevlist_lock; + +/* + * Faster version of edev_get - call with IRQs off + */ + +static __inline__ struct ec_device *__edev_get(struct device *dev) +{ + struct ec_device *edev; + for (edev = edevlist; edev; edev = edev->next) + { + if (edev->dev == dev) + break; + } + return edev; +} + +/* + * Find an Econet device given its `dev' pointer. This is IRQ safe. + */ + +static struct ec_device *edev_get(struct device *dev) +{ + struct ec_device *edev; + unsigned long flags; + spin_lock_irqsave(&edevlist_lock, flags); + edev = __edev_get(dev); + spin_unlock_irqrestore(&edevlist_lock, flags); + return edev; +} + +/* + * Pull a packet from our receive queue and hand it to the user. + * If necessary we block. + */ + +static int econet_recvmsg(struct socket *sock, struct msghdr *msg, int len, + int flags, struct scm_cookie *scm) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + int copied, err; + + msg->msg_namelen = sizeof(struct sockaddr_ec); + + /* + * Call the generic datagram receiver. This handles all sorts + * of horrible races and re-entrancy so we can forget about it + * in the protocol layers. + * + * Now it will return ENETDOWN, if device have just gone down, + * but then it will block. + */ + + skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err); + + /* + * An error occurred so return it. Because skb_recv_datagram() + * handles the blocking we don't see and worry about blocking + * retries. + */ + + if(skb==NULL) + goto out; + + /* + * You lose any data beyond the buffer you gave. If it worries a + * user program they can ask the device for its MTU anyway. + */ + + copied = skb->len; + if (copied > len) + { + copied=len; + msg->msg_flags|=MSG_TRUNC; + } + + /* We can't use skb_copy_datagram here */ + err = memcpy_toiovec(msg->msg_iov, skb->data, copied); + if (err) + goto out_free; + sk->stamp=skb->stamp; + + if (msg->msg_name) + memcpy(msg->msg_name, skb->cb, msg->msg_namelen); + + /* + * Free or return the buffer as appropriate. Again this + * hides all the races and re-entrancy issues from us. + */ + err = copied; + +out_free: + skb_free_datagram(sk, skb); +out: + return err; +} + +/* + * Bind an Econet socket. + */ + +static int econet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_ec *sec = (struct sockaddr_ec *)uaddr; + struct sock *sk=sock->sk; + + /* + * Check legality + */ + + if (addr_len < sizeof(struct sockaddr_ec)) + return -EINVAL; + if (sec->sec_family != AF_ECONET) + return -EINVAL; + + sk->protinfo.af_econet->cb = sec->cb; + sk->protinfo.af_econet->port = sec->port; + sk->protinfo.af_econet->station = sec->addr.station; + sk->protinfo.af_econet->net = sec->addr.net; + + return 0; +} + +/* + * Queue a transmit result for the user to be told about. 
+ */ + +static void tx_result(struct sock *sk, unsigned long cookie, int result) +{ + struct sk_buff *skb = alloc_skb(0, GFP_ATOMIC); + struct ec_cb *eb; + struct sockaddr_ec *sec; + + if (skb == NULL) + { + printk(KERN_DEBUG "ec: memory squeeze, transmit result dropped.\n"); + return; + } + + eb = (struct ec_cb *)&skb->cb; + sec = (struct sockaddr_ec *)&eb->sec; + memset(sec, 0, sizeof(struct sockaddr_ec)); + sec->cookie = cookie; + sec->type = ECTYPE_TRANSMIT_STATUS | result; + sec->sec_family = AF_ECONET; + + if (sock_queue_rcv_skb(sk, skb) < 0) + kfree_skb(skb); +} + +#ifdef CONFIG_ECONET_NATIVE +/* + * Called by the Econet hardware driver when a packet transmit + * has completed. Tell the user. + */ + +static void ec_tx_done(struct sk_buff *skb, int result) +{ + struct ec_cb *eb = (struct ec_cb *)&skb->cb; + tx_result(skb->sk, eb->cookie, result); +} +#endif + +/* + * Send a packet. We have to work out which device it's going out on + * and hence whether to use real Econet or the UDP emulation. + */ + +static int econet_sendmsg(struct socket *sock, struct msghdr *msg, int len, + struct scm_cookie *scm) +{ + struct sock *sk = sock->sk; + struct sockaddr_ec *saddr=(struct sockaddr_ec *)msg->msg_name; + struct device *dev; + struct ec_addr addr; + struct ec_device *edev; + int err; + unsigned char port, cb; + struct sk_buff *skb; + struct ec_cb *eb; +#ifdef CONFIG_ECONET_NATIVE + unsigned short proto = 0; +#endif +#ifdef CONFIG_ECONET_AUNUDP + struct msghdr udpmsg; + struct iovec iov[msg->msg_iovlen+1]; + struct aunhdr ah; + struct sockaddr_in udpdest; + __kernel_size_t size; + int i; + mm_segment_t oldfs; +#endif + + /* + * Check the flags. + */ + + if (msg->msg_flags&~MSG_DONTWAIT) + return(-EINVAL); + + /* + * Get and verify the address. + */ + + if (saddr == NULL) { + addr.station = sk->protinfo.af_econet->station; + addr.net = sk->protinfo.af_econet->net; + port = sk->protinfo.af_econet->port; + cb = sk->protinfo.af_econet->cb; + } else { + if (msg->msg_namelen < sizeof(struct sockaddr_ec)) + return -EINVAL; + addr.station = saddr->addr.station; + addr.net = saddr->addr.net; + port = saddr->port; + cb = saddr->cb; + } + + /* Look for a device with the right network number. */ + for (edev = edevlist; edev && (edev->net != addr.net); + edev = edev->next); + + /* Bridge? What's that? */ + if (edev == NULL) + return -ENETUNREACH; + + dev = edev->dev; + + if (dev->type == ARPHRD_ECONET) + { + /* Real hardware Econet. We're not worthy etc. */ +#ifdef CONFIG_ECONET_NATIVE + unsigned char *p; + + dev_lock_list(); + + skb = sock_alloc_send_skb(sk, len+dev->hard_header_len+15, 0, + msg->msg_flags & MSG_DONTWAIT, &err); + if (skb==NULL) + goto out_unlock; + + skb_reserve(skb, (dev->hard_header_len+15)&~15); + skb->nh.raw = skb->data; + + eb = (struct ec_cb *)&skb->cb; + + eb->cookie = saddr->cookie; + eb->sec = *saddr; + eb->sent - ec_tx_done; + + if (dev->hard_header) { + int res; + err = -EINVAL; + res = dev->hard_header(skb, dev, ntohs(proto), &addr, NULL, len); + if (sock->type != SOCK_DGRAM) { + skb->tail = skb->data; + skb->len = 0; + } else if (res < 0) + goto out_free; + } + + /* Copy the data. 
Returns -EFAULT on error */ + err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); + skb->protocol = proto; + skb->dev = dev; + skb->priority = sk->priority; + if (err) + goto out_free; + + err = -ENETDOWN; + if (!(dev->flags & IFF_UP)) + goto out_free; + + /* + * Now send it + */ + + dev_unlock_list(); + dev_queue_xmit(skb); + return(len); + + out_free: + kfree_skb(skb); + out_unlock: + dev_unlock_list(); +#else + err = -EPROTOTYPE; +#endif + return err; + } + +#ifdef CONFIG_ECONET_AUNUDP + /* AUN virtual Econet. */ + + if (udpsock == NULL) + return -ENETDOWN; /* No socket - can't send */ + + /* Make up a UDP datagram and hand it off to some higher intellect. */ + + memset(&udpdest, 0, sizeof(udpdest)); + udpdest.sin_family = AF_INET; + udpdest.sin_port = htons(AUN_PORT); + + /* At the moment we use the stupid Acorn scheme of Econet address + y.x maps to IP a.b.c.x. This should be replaced with something + more flexible and more aware of subnet masks. */ + { + struct in_device *idev = (struct in_device *)dev->ip_ptr; + unsigned long network = ntohl(idev->ifa_list->ifa_address) & + 0xffffff00; /* !!! */ + udpdest.sin_addr.s_addr = htonl(network | addr.station); + } + + ah.port = port; + ah.cb = cb & 0x7f; + ah.code = 2; /* magic */ + ah.pad = 0; + + /* tack our header on the front of the iovec */ + size = sizeof(struct aunhdr); + iov[0].iov_base = (void *)&ah; + iov[0].iov_len = size; + for (i = 0; i < msg->msg_iovlen; i++) { + void *base = msg->msg_iov[i].iov_base; + size_t len = msg->msg_iov[i].iov_len; + /* Check it now since we switch to KERNEL_DS later. */ + if ((err = verify_area(VERIFY_READ, base, len)) < 0) + return err; + iov[i+1].iov_base = base; + iov[i+1].iov_len = len; + size += len; + } + + /* Get a skbuff (no data, just holds our cb information) */ + if ((skb = sock_alloc_send_skb(sk, 0, 0, + msg->msg_flags & MSG_DONTWAIT, &err)) == NULL) + return err; + + eb = (struct ec_cb *)&skb->cb; + + eb->cookie = saddr->cookie; + eb->timeout = (5*HZ); + eb->start = jiffies; + ah.handle = aun_seq; + eb->seq = (aun_seq++); + eb->sec = *saddr; + + skb_queue_tail(&aun_queue, skb); + + udpmsg.msg_name = (void *)&udpdest; + udpmsg.msg_namelen = sizeof(udpdest); + udpmsg.msg_iov = &iov[0]; + udpmsg.msg_iovlen = msg->msg_iovlen + 1; + udpmsg.msg_control = NULL; + udpmsg.msg_controllen = 0; + udpmsg.msg_flags=0; + + oldfs = get_fs(); set_fs(KERNEL_DS); /* More privs :-) */ + err = sock_sendmsg(udpsock, &udpmsg, size); + set_fs(oldfs); +#else + err = -EPROTOTYPE; +#endif + return err; +} + +/* + * Look up the address of a socket. + */ + +static int econet_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sock *sk = sock->sk; + struct sockaddr_ec *sec = (struct sockaddr_ec *)uaddr; + + if (peer) + return -EOPNOTSUPP; + + sec->sec_family = AF_ECONET; + sec->port = sk->protinfo.af_econet->port; + sec->addr.station = sk->protinfo.af_econet->station; + sec->addr.net = sk->protinfo.af_econet->net; + + *uaddr_len = sizeof(*sec); + return 0; +} + +static void econet_destroy_timer(unsigned long data) +{ + struct sock *sk=(struct sock *)data; + + if (!atomic_read(&sk->wmem_alloc) && !atomic_read(&sk->rmem_alloc)) { + sk_free(sk); + MOD_DEC_USE_COUNT; + return; + } + + sk->timer.expires=jiffies+10*HZ; + add_timer(&sk->timer); + printk(KERN_DEBUG "econet socket destroy delayed\n"); +} + +/* + * Close an econet socket. 
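The "y.x maps to a.b.c.x" convention used above boils down to replacing the final octet of the local interface's IPv4 address with the Econet station number. A one-function sketch of that mapping (the function name is made up; the logic mirrors the code above):

static __u32 aun_map_addr(__u32 local_ifa, unsigned char station)
{
	unsigned long network = ntohl(local_ifa) & 0xffffff00;

	/* e.g. local address 192.168.0.1, station 45 -> 192.168.0.45 */
	return htonl(network | station);
}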
+ */ + +static int econet_release(struct socket *sock, struct socket *peersock) +{ + struct sk_buff *skb; + struct sock *sk = sock->sk; + + if (!sk) + return 0; + + sklist_remove_socket(&econet_sklist, sk); + + /* + * Now the socket is dead. No more input will appear. + */ + + sk->state_change(sk); /* It is useless. Just for sanity. */ + + sock->sk = NULL; + sk->socket = NULL; + sk->dead = 1; + + /* Purge queues */ + + while ((skb=skb_dequeue(&sk->receive_queue))!=NULL) + kfree_skb(skb); + + if (atomic_read(&sk->rmem_alloc) || atomic_read(&sk->wmem_alloc)) { + sk->timer.data=(unsigned long)sk; + sk->timer.expires=jiffies+HZ; + sk->timer.function=econet_destroy_timer; + add_timer(&sk->timer); + return 0; + } + + sk_free(sk); + MOD_DEC_USE_COUNT; + return 0; +} + +/* + * Create an Econet socket + */ + +static int econet_create(struct socket *sock, int protocol) +{ + struct sock *sk; + int err; + + /* Econet only provides datagram services. */ + if (sock->type != SOCK_DGRAM) + return -ESOCKTNOSUPPORT; + + sock->state = SS_UNCONNECTED; + MOD_INC_USE_COUNT; + + err = -ENOBUFS; + sk = sk_alloc(AF_ECONET, GFP_KERNEL, 1); + if (sk == NULL) + goto out; + + sk->reuse = 1; + sock->ops = &econet_ops; + sock_init_data(sock,sk); + + sk->protinfo.af_econet = kmalloc(sizeof(struct econet_opt), GFP_KERNEL); + if (sk->protinfo.af_econet == NULL) + goto out_free; + memset(sk->protinfo.af_econet, 0, sizeof(struct econet_opt)); + sk->zapped=0; + sk->family = AF_ECONET; + sk->num = protocol; + + sklist_insert_socket(&econet_sklist, sk); + return(0); + +out_free: + sk_free(sk); +out: + MOD_DEC_USE_COUNT; + return err; +} + +/* + * Handle Econet specific ioctls + */ + +static int ec_dev_ioctl(struct socket *sock, unsigned int cmd, void *arg) +{ + struct ifreq ifr; + struct ec_device *edev; + struct device *dev; + unsigned long flags; + struct sockaddr_ec *sec; + + /* + * Fetch the caller's info block into kernel space + */ + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return -EFAULT; + + if ((dev = dev_get(ifr.ifr_name)) == NULL) + return -ENODEV; + + sec = (struct sockaddr_ec *)&ifr.ifr_addr; + + switch (cmd) + { + case SIOCSIFADDR: + spin_lock_irqsave(&edevlist_lock, flags); + edev = __edev_get(dev); + if (edev == NULL) + { + /* Magic up a new one. 
*/ + edev = kmalloc(GFP_KERNEL, sizeof(struct ec_device)); + if (edev == NULL) { + printk("af_ec: memory squeeze.\n"); + spin_unlock_irqrestore(&edevlist_lock, flags); + return -ENOMEM; + } + memset(edev, 0, sizeof(struct ec_device)); + edev->dev = dev; + edev->next = edevlist; + edevlist = edev; + } + edev->station = sec->addr.station; + edev->net = sec->addr.net; + spin_unlock_irqrestore(&edevlist_lock, flags); + return 0; + + case SIOCGIFADDR: + spin_lock_irqsave(&edevlist_lock, flags); + edev = __edev_get(dev); + if (edev == NULL) + { + spin_unlock_irqrestore(&edevlist_lock, flags); + return -ENODEV; + } + memset(sec, 0, sizeof(struct sockaddr_ec)); + sec->addr.station = edev->station; + sec->addr.net = edev->net; + sec->sec_family = AF_ECONET; + spin_unlock_irqrestore(&edevlist_lock, flags); + if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + return 0; + } + + return -EINVAL; +} + +/* + * Handle generic ioctls + */ + +static int econet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + int err; + int pid; + + switch(cmd) + { + case FIOSETOWN: + case SIOCSPGRP: + err = get_user(pid, (int *) arg); + if (err) + return err; + if (current->pid != pid && current->pgrp != -pid && !suser()) + return -EPERM; + sk->proc = pid; + return(0); + case FIOGETOWN: + case SIOCGPGRP: + return put_user(sk->proc, (int *)arg); + case SIOCGSTAMP: + if(sk->stamp.tv_sec==0) + return -ENOENT; + err = -EFAULT; + if (!copy_to_user((void *)arg, &sk->stamp, sizeof(struct timeval))) + err = 0; + return err; + case SIOCGIFFLAGS: + case SIOCSIFFLAGS: + case SIOCGIFCONF: + case SIOCGIFMETRIC: + case SIOCSIFMETRIC: + case SIOCGIFMEM: + case SIOCSIFMEM: + case SIOCGIFMTU: + case SIOCSIFMTU: + case SIOCSIFLINK: + case SIOCGIFHWADDR: + case SIOCSIFHWADDR: + case SIOCSIFMAP: + case SIOCGIFMAP: + case SIOCSIFSLAVE: + case SIOCGIFSLAVE: + case SIOCGIFINDEX: + case SIOCGIFNAME: + case SIOCGIFCOUNT: + case SIOCSIFHWBROADCAST: + return(dev_ioctl(cmd,(void *) arg)); + + + case SIOCSIFADDR: + case SIOCGIFADDR: + return ec_dev_ioctl(sock, cmd, (void *)arg); + break; + + default: + if ((cmd >= SIOCDEVPRIVATE) && + (cmd <= (SIOCDEVPRIVATE + 15))) + return(dev_ioctl(cmd,(void *) arg)); + +#ifdef CONFIG_NET_RADIO + if((cmd >= SIOCIWFIRST) && (cmd <= SIOCIWLAST)) + return(dev_ioctl(cmd,(void *) arg)); +#endif + return -EOPNOTSUPP; + } + /*NOTREACHED*/ + return 0; +} + +static struct net_proto_family econet_family_ops = { + AF_ECONET, + econet_create +}; + +static struct proto_ops econet_ops = { + AF_ECONET, + + sock_no_dup, + econet_release, + econet_bind, + sock_no_connect, + NULL, + NULL, + econet_getname, + datagram_poll, + econet_ioctl, + sock_no_listen, + sock_no_shutdown, + sock_no_setsockopt, + sock_no_getsockopt, + sock_no_fcntl, + econet_sendmsg, + econet_recvmsg +}; + +/* + * Find the listening socket, if any, for the given data. + */ + +static struct sock *ec_listening_socket(unsigned char port, unsigned char + station, unsigned char net) +{ + struct sock *sk = econet_sklist; + + while (sk) + { + struct econet_opt *opt = sk->protinfo.af_econet; + if ((opt->port == port || opt->port == 0) && + (opt->station == station || opt->station == 0) && + (opt->net == net || opt->net == 0)) + return sk; + sk = sk->sklist_next; + } + + return NULL; +} + +#ifdef CONFIG_ECONET_AUNUDP + +/* + * Send an AUN protocol response. 
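Seen from user space, the proto_ops table above gives AF_ECONET sockets a conventional datagram interface. A hedged sketch of sending one packet through it, assuming AF_ECONET and struct sockaddr_ec are visible to the application via <linux/if_ec.h>; the function name and the control-byte value are illustrative:

#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/if_ec.h>

int econet_send_example(unsigned char net, unsigned char station,
			unsigned char port, const void *buf, size_t len)
{
	struct sockaddr_ec sec;
	struct iovec iov = { (void *)buf, len };
	struct msghdr msg;
	int fd, ret;

	fd = socket(AF_ECONET, SOCK_DGRAM, 0);
	if (fd < 0)
		return -1;

	memset(&sec, 0, sizeof(sec));
	sec.sec_family = AF_ECONET;
	sec.port = port;		/* e.g. 0x99, the fileserver port */
	sec.cb = 0x80;			/* control byte */
	sec.addr.net = net;
	sec.addr.station = station;

	memset(&msg, 0, sizeof(msg));
	msg.msg_name = &sec;
	msg.msg_namelen = sizeof(sec);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	ret = sendmsg(fd, &msg, 0);
	close(fd);
	return ret < 0 ? -1 : 0;
}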
+ */ + +static void aun_send_response(__u32 addr, unsigned long seq, int code, int cb) +{ + struct sockaddr_in sin; + struct iovec iov; + struct aunhdr ah; + struct msghdr udpmsg; + int err; + mm_segment_t oldfs; + + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_port = htons(AUN_PORT); + sin.sin_addr.s_addr = addr; + + ah.code = code; + ah.pad = 0; + ah.port = 0; + ah.cb = cb; + ah.handle = seq; + + iov.iov_base = (void *)&ah; + iov.iov_len = sizeof(ah); + + udpmsg.msg_name = (void *)&sin; + udpmsg.msg_namelen = sizeof(sin); + udpmsg.msg_iov = &iov; + udpmsg.msg_iovlen = 1; + udpmsg.msg_control = NULL; + udpmsg.msg_controllen = 0; + udpmsg.msg_flags=0; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = sock_sendmsg(udpsock, &udpmsg, sizeof(ah)); + set_fs(oldfs); +} + +/* + * Handle incoming AUN packets. Work out if anybody wants them, + * and send positive or negative acknowledgements as appropriate. + */ + +static void aun_incoming(struct sk_buff *skb, struct aunhdr *ah, size_t len) +{ + struct ec_device *edev = edev_get(skb->dev); + struct iphdr *ip = skb->nh.iph; + unsigned char stn = ntohl(ip->saddr) & 0xff; + struct sock *sk; + struct sk_buff *newskb; + struct ec_cb *eb; + struct sockaddr_ec *sec; + + if (edev == NULL) + return; /* Device not configured for AUN */ + + if ((sk = ec_listening_socket(ah->port, stn, edev->net)) == NULL) + goto bad; /* Nobody wants it */ + + newskb = alloc_skb((len - sizeof(struct aunhdr) + 15) & ~15, + GFP_ATOMIC); + if (newskb == NULL) + { + printk(KERN_DEBUG "AUN: memory squeeze, dropping packet.\n"); + /* Send nack and hope sender tries again */ + goto bad; + } + + eb = (struct ec_cb *)&newskb->cb; + sec = (struct sockaddr_ec *)&eb->sec; + memset(sec, 0, sizeof(struct sockaddr_ec)); + sec->sec_family = AF_ECONET; + sec->type = ECTYPE_PACKET_RECEIVED; + sec->port = ah->port; + sec->cb = ah->cb; + sec->addr.net = edev->net; + sec->addr.station = stn; + + memcpy(skb_put(newskb, len - sizeof(struct aunhdr)), (void *)(ah+1), + len - sizeof(struct aunhdr)); + + if (sock_queue_rcv_skb(sk, newskb) < 0) + { + /* Socket is bankrupt. */ + kfree_skb(newskb); + goto bad; + } + + aun_send_response(ip->saddr, ah->handle, 3, 0); + return; + +bad: + aun_send_response(ip->saddr, ah->handle, 4, 0); +} + +/* + * Handle incoming AUN transmit acknowledgements. If the sequence + * number matches something in our backlog then kill it and tell + * the user. If the remote took too long to reply then we may have + * dropped the packet already. + */ + +static void aun_tx_ack(unsigned long seq, int result) +{ + struct sk_buff *skb; + unsigned long flags; + struct ec_cb *eb; + + spin_lock_irqsave(&aun_queue_lock, flags); + skb = skb_peek(&aun_queue); + while (skb && skb != (struct sk_buff *)&aun_queue) + { + struct sk_buff *newskb = skb->next; + eb = (struct ec_cb *)&skb->cb; + if (eb->seq == seq) + goto foundit; + + skb = newskb; + } + spin_unlock_irqrestore(&aun_queue_lock, flags); + printk(KERN_DEBUG "AUN: unknown sequence %ld\n", seq); + return; + +foundit: + tx_result(skb->sk, eb->cookie, result); + skb_unlink(skb); + spin_unlock_irqrestore(&aun_queue_lock, flags); +} + +/* + * Deal with received AUN frames - sort out what type of thing it is + * and hand it to the right function. 
+ */ + +static void aun_data_available(struct sock *sk, int slen) +{ + int err; + struct sk_buff *skb; + unsigned char *data; + struct aunhdr *ah; + struct iphdr *ip; + size_t len; + + while ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL) { + if (err == -EAGAIN) { + printk(KERN_ERR "AUN: no data available?!"); + return; + } + printk(KERN_DEBUG "AUN: recvfrom() error %d\n", -err); + } + + data = skb->h.raw + sizeof(struct udphdr); + ah = (struct aunhdr *)data; + len = skb->len - sizeof(struct udphdr); + ip = skb->nh.iph; + + switch (ah->code) + { + case 2: + aun_incoming(skb, ah, len); + break; + case 3: + aun_tx_ack(ah->handle, ECTYPE_TRANSMIT_OK); + break; + case 4: + aun_tx_ack(ah->handle, ECTYPE_TRANSMIT_NOT_LISTENING); + break; +#if 0 + /* This isn't quite right yet. */ + case 5: + aun_send_response(ip->saddr, ah->handle, 6, ah->cb); + break; +#endif + default: + printk(KERN_DEBUG "unknown AUN packet (type %d)\n", data[0]); + } + + skb_free_datagram(sk, skb); +} + +/* + * Called by the timer to manage the AUN transmit queue. If a packet + * was sent to a dead or nonexistent host then we will never get an + * acknowledgement back. After a few seconds we need to spot this and + * drop the packet. + */ + +static spinlock_t aun_queue_lock; + +static void ab_cleanup(unsigned long h) +{ + struct sk_buff *skb; + unsigned long flags; + + spin_lock_irqsave(&aun_queue_lock, flags); + skb = skb_peek(&aun_queue); + while (skb && skb != (struct sk_buff *)&aun_queue) + { + struct sk_buff *newskb = skb->next; + struct ec_cb *eb = (struct ec_cb *)&skb->cb; + if ((jiffies - eb->start) > eb->timeout) + { + tx_result(skb->sk, eb->cookie, + ECTYPE_TRANSMIT_NOT_PRESENT); + skb_unlink(skb); + } + skb = newskb; + } + spin_unlock_irqrestore(&aun_queue_lock, flags); + + mod_timer(&ab_cleanup_timer, jiffies + (HZ*2)); +} + +__initfunc(static int aun_udp_initialise(void)) +{ + int error; + struct sockaddr_in sin; + + skb_queue_head_init(&aun_queue); + spin_lock_init(&aun_queue_lock); + init_timer(&ab_cleanup_timer); + ab_cleanup_timer.expires = jiffies + (HZ*2); + ab_cleanup_timer.function = ab_cleanup; + add_timer(&ab_cleanup_timer); + + memset(&sin, 0, sizeof(sin)); + sin.sin_port = htons(AUN_PORT); + + /* We can count ourselves lucky Acorn machines are too dim to + speak IPv6. :-) */ + if ((error = sock_create(AF_INET, SOCK_DGRAM, 0, &udpsock)) < 0) + { + printk("AUN: socket error %d\n", -error); + return error; + } + + udpsock->sk->reuse = 1; + udpsock->sk->allocation = GFP_ATOMIC; /* we're going to call it + from interrupts */ + + error = udpsock->ops->bind(udpsock, (struct sockaddr *)&sin, + sizeof(sin)); + if (error < 0) + { + printk("AUN: bind error %d\n", -error); + goto release; + } + + udpsock->sk->data_ready = aun_data_available; + + return 0; + +release: + sock_release(udpsock); + udpsock = NULL; + return error; +} +#endif + +static int econet_notifier(struct notifier_block *this, unsigned long msg, void *data) +{ + struct device *dev = (struct device *)data; + struct ec_device *edev; + unsigned long flags; + + switch (msg) { + case NETDEV_UNREGISTER: + /* A device has gone down - kill any data we hold for it. 
*/ + spin_lock_irqsave(&edevlist_lock, flags); + for (edev = edevlist; edev; edev = edev->next) + { + if (edev->dev == dev) + { + if (edev->prev) + edev->prev->next = edev->next; + else + edevlist = edev->next; + if (edev->next) + edev->next->prev = edev->prev; + kfree(edev); + break; + } + } + spin_unlock_irqrestore(&edevlist_lock, flags); + break; + } + + return NOTIFY_DONE; +} + +struct notifier_block econet_netdev_notifier={ + econet_notifier, + NULL, + 0 +}; + +#ifdef MODULE +void cleanup_module(void) +{ +#ifdef CONFIG_ECONET_AUNUDP + del_timer(&ab_cleanup_timer); + if (udpsock) + sock_release(udpsock); +#endif + unregister_netdevice_notifier(&econet_netdev_notifier); + sock_unregister(econet_family_ops.family); + return; +} + +int init_module(void) +#else +__initfunc(void econet_proto_init(struct net_proto *pro)) +#endif +{ + spin_lock_init(&edevlist_lock); + spin_lock_init(&aun_queue_lock); + /* Stop warnings from happening on UP systems. */ + (void)edevlist_lock; + (void)aun_queue_lock; + sock_register(&econet_family_ops); +#ifdef CONFIG_ECONET_AUNUDP + aun_udp_initialise(); +#endif + register_netdevice_notifier(&econet_netdev_notifier); +#ifdef MODULE + return 0; +#endif +} diff --git a/net/ethernet/Makefile b/net/ethernet/Makefile index 8c9041b4c..193d6af8b 100644 --- a/net/ethernet/Makefile +++ b/net/ethernet/Makefile @@ -1,5 +1,5 @@ # -# Makefile for the Linux TCP/IP (INET) layer. +# Makefile for the Linux Ethernet layer. # # Note! Dependencies are done automagically by 'make dep', which also # removes any old dependencies. DON'T put your own dependencies here diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6667b8d72..ce177c56b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -117,10 +117,13 @@ #define min(a,b) ((a)<(b)?(a):(b)) +struct linux_mib net_statistics; + extern int sysctl_core_destroy_delay; extern int raw_get_info(char *, char **, off_t, int, int); extern int snmp_get_info(char *, char **, off_t, int, int); +extern int netstat_get_info(char *, char **, off_t, int, int); extern int afinet_get_info(char *, char **, off_t, int, int); extern int tcp_get_info(char *, char **, off_t, int, int); extern int udp_get_info(char *, char **, off_t, int, int); @@ -352,7 +355,6 @@ static int inet_create(struct socket *sock, int protocol) if (protocol && protocol != IPPROTO_TCP) goto free_and_noproto; protocol = IPPROTO_TCP; - sk->no_check = TCP_NO_CHECK; if (ipv4_config.no_pmtu_disc) sk->ip_pmtudisc = IP_PMTUDISC_DONT; else @@ -1017,6 +1019,12 @@ static struct proc_dir_entry proc_net_raw = { 0, &proc_net_inode_operations, raw_get_info }; +static struct proc_dir_entry proc_net_netstat = { + PROC_NET_NETSTAT, 7, "netstat", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + netstat_get_info +}; static struct proc_dir_entry proc_net_snmp = { PROC_NET_SNMP, 4, "snmp", S_IFREG | S_IRUGO, 1, 0, 0, @@ -1044,6 +1052,8 @@ static struct proc_dir_entry proc_net_udp = { #endif /* CONFIG_PROC_FS */ extern void tcp_init(void); +extern void tcp_v4_init(struct net_proto_family *); + /* * Called by socket.c on kernel startup. @@ -1093,9 +1103,12 @@ __initfunc(void inet_proto_init(struct net_proto *pro)) ip_init(); + tcp_v4_init(&inet_family_ops); + /* Setup TCP slab cache for open requests. 
*/ tcp_init(); + /* * Set the ICMP layer up */ @@ -1142,6 +1155,7 @@ __initfunc(void inet_proto_init(struct net_proto *pro)) #endif /* RARP */ proc_net_register(&proc_net_raw); proc_net_register(&proc_net_snmp); + proc_net_register(&proc_net_netstat); proc_net_register(&proc_net_sockstat); proc_net_register(&proc_net_tcp); proc_net_register(&proc_net_udp); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 87394f906..79eb9a1d4 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -19,7 +19,7 @@ * * Changes: * Alexey Kuznetsov: pa_* fields are replaced with ifaddr lists. - Cyrus Durgin: updated for kmod + * Cyrus Durgin: updated for kmod */ #include <linux/config.h> diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index cd9b5ba21..592ff5ffb 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: policy rules. * - * Version: $Id: fib_rules.c,v 1.4 1998/03/21 07:27:58 davem Exp $ + * Version: $Id: fib_rules.c,v 1.5 1998/04/28 06:21:57 davem Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * @@ -65,6 +65,9 @@ struct fib_rule u8 r_flags; u8 r_tos; int r_ifindex; +#ifdef CONFIG_NET_CLS_ROUTE + __u32 r_tclassid; +#endif char r_ifname[IFNAMSIZ]; }; @@ -165,6 +168,10 @@ int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (dev) new_r->r_ifindex = dev->ifindex; } +#ifdef CONFIG_NET_CLS_ROUTE + if (rta[RTA_FLOW-1]) + memcpy(&new_r->r_tclassid, RTA_DATA(rta[RTA_FLOW-1]), 4); +#endif rp = &fib_rules; if (!new_r->r_preference) { @@ -213,6 +220,16 @@ u32 fib_rules_policy(u32 saddr, struct fib_result *res, unsigned *flags) return saddr; } +#ifdef CONFIG_NET_CLS_ROUTE +u32 fib_rules_tclass(struct fib_result *res) +{ + if (res->r) + return res->r->r_tclassid; + return 0; +} +#endif + + static void fib_rules_detach(struct device *dev) { struct fib_rule *r; @@ -246,7 +263,7 @@ FRprintk("Lookup: %08x <- %08x ", key->dst, key->src); for (r = fib_rules; r; r=r->r_next) { if (((saddr^r->r_src) & r->r_srcmask) || ((daddr^r->r_dst) & r->r_dstmask) || -#ifdef CONFIG_IP_TOS_ROUTING +#ifdef CONFIG_IP_ROUTE_TOS (r->r_tos && r->r_tos != key->tos) || #endif (r->r_ifindex && r->r_ifindex != key->iif)) @@ -339,6 +356,10 @@ extern __inline__ int inet_fill_rule(struct sk_buff *skb, RTA_PUT(skb, RTA_PRIORITY, 4, &r->r_preference); if (r->r_srcmap) RTA_PUT(skb, RTA_GATEWAY, 4, &r->r_srcmap); +#ifdef CONFIG_NET_CLS_ROUTE + if (r->r_tclassid) + RTA_PUT(skb, RTA_FLOW, 4, &r->r_tclassid); +#endif nlh->nlmsg_len = skb->tail - b; return skb->len; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d2d37e11e..107f07791 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: semantics. 
* - * Version: $Id: fib_semantics.c,v 1.7 1998/03/08 05:56:18 davem Exp $ + * Version: $Id: fib_semantics.c,v 1.8 1998/04/28 06:21:58 davem Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * @@ -124,6 +124,9 @@ extern __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info * #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->nh_weight != onh->nh_weight || #endif +#ifdef CONFIG_NET_CLS_ROUTE + nh->nh_tclassid != onh->nh_tclassid || +#endif ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD)) return -1; onh++; @@ -217,8 +220,12 @@ fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags; nh->nh_oif = nhp->rtnh_ifindex; nh->nh_weight = nhp->rtnh_hops + 1; - if (attrlen) + if (attrlen) { nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY); +#ifdef CONFIG_NET_CLS_ROUTE + nh->nh_tclassid = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW); +#endif + } nhp = RTNH_NEXT(nhp); } endfor_nexthops(fi); return 0; @@ -267,6 +274,11 @@ int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta, gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY); if (gw && gw != nh->nh_gw) return 1; +#ifdef CONFIG_NET_CLS_ROUTE + gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW); + if (gw && gw != nh->nh_tclassid) + return 1; +#endif } nhp = RTNH_NEXT(nhp); } endfor_nexthops(fi); @@ -459,6 +471,10 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta, goto err_inval; if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 4)) goto err_inval; +#ifdef CONFIG_NET_CLS_ROUTE + if (rta->rta_flow && memcmp(&fi->fib_nh->nh_tclassid, rta->rta_flow, 4)) + goto err_inval; +#endif #else goto err_inval; #endif @@ -468,6 +484,10 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta, nh->nh_oif = *rta->rta_oif; if (rta->rta_gw) memcpy(&nh->nh_gw, rta->rta_gw, 4); +#ifdef CONFIG_NET_CLS_ROUTE + if (rta->rta_flow) + memcpy(&nh->nh_tclassid, rta->rta_flow, 4); +#endif nh->nh_flags = r->rtm_flags; #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->nh_weight = 1; @@ -654,6 +674,10 @@ fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, if (fi->fib_rtt) RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &fi->fib_rtt); #else +#ifdef CONFIG_NET_CLS_ROUTE + if (fi->fib_nh[0].nh_tclassid) + RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid); +#endif if (fi->fib_mtu || fi->fib_window || fi->fib_rtt) { int i; struct rtattr *mx = (struct rtattr *)skb->tail; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index e8f636e21..7ce08cdd4 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -680,7 +680,7 @@ static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb, int len) if (inet_addr_type(iph->daddr) == RTN_BROADCAST) { if (net_ratelimit()) - printk("%s sent an invalid ICMP error to a broadcast.\n", + printk(KERN_WARNING "%s sent an invalid ICMP error to a broadcast.\n", in_ntoa(skb->nh.iph->saddr)); return; } @@ -856,6 +856,9 @@ static void icmp_timestamp(struct icmphdr *icmph, struct sk_buff *skb, int len) * All these rules are so bizarre, that I removed kernel addrmask * support at all. It is wrong, it is obsolete, nobody uses it in * any case. --ANK + * + * Furthermore you can do it with a usermode address agent program + * anyway... */ static void icmp_address(struct icmphdr *icmph, struct sk_buff *skb, int len) @@ -1026,7 +1029,6 @@ static unsigned long dummy; * dst_entry gets expired too early. The same should happen when * the cache grows too big. 
*/ -int sysctl_icmp_sourcequench_time = 1*HZ; int sysctl_icmp_destunreach_time = 1*HZ; int sysctl_icmp_timeexceed_time = 1*HZ; int sysctl_icmp_paramprob_time = 1*HZ; @@ -1044,7 +1046,7 @@ static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1] = { /* DEST UNREACH (3) */ { &icmp_statistics.IcmpOutDestUnreachs, &icmp_statistics.IcmpInDestUnreachs, icmp_unreach, 1, &sysctl_icmp_destunreach_time }, /* SOURCE QUENCH (4) */ - { &icmp_statistics.IcmpOutSrcQuenchs, &icmp_statistics.IcmpInSrcQuenchs, icmp_unreach, 1, &sysctl_icmp_sourcequench_time }, + { &icmp_statistics.IcmpOutSrcQuenchs, &icmp_statistics.IcmpInSrcQuenchs, icmp_unreach, 1, }, /* REDIRECT (5) */ { &icmp_statistics.IcmpOutRedirects, &icmp_statistics.IcmpInRedirects, icmp_redirect, 1, }, { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, }, diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 21205362f..1641e5c3d 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -5,7 +5,7 @@ * * The IP fragmentation functionality. * - * Version: $Id: ip_fragment.c,v 1.33 1998/03/19 08:34:08 davem Exp $ + * Version: $Id: ip_fragment.c,v 1.36 1998/04/18 02:13:07 davem Exp $ * * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox <Alan.Cox@linux.org> @@ -346,10 +346,9 @@ static struct sk_buff *ip_glue(struct ipq *qp) memcpy(ptr, qp->iph, qp->ihlen); ptr += qp->ihlen; - count = 0; - /* Copy the data portions of all fragments into the new buffer. */ fp = qp->fragments; + count = qp->ihlen; while(fp) { if (fp->len < 0 || count+fp->len > skb->len) { NETDEBUG(printk(KERN_ERR "Invalid fragment list: " @@ -360,7 +359,7 @@ static struct sk_buff *ip_glue(struct ipq *qp) return NULL; } memcpy((ptr + fp->offset), fp->ptr, fp->len); - if (!count) { + if (count == qp->ihlen) { skb->dst = dst_clone(fp->skb->dst); skb->dev = fp->skb->dev; } @@ -376,7 +375,7 @@ static struct sk_buff *ip_glue(struct ipq *qp) /* Done with all fragments. Fixup the new IP header. 
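A quick worked example of the length accounting change above: count now starts at qp->ihlen instead of 0, so a datagram with a 20-byte header reassembled from fragments carrying 1480 and 520 bytes of payload finishes with count = 20 + 1480 + 520 = 2020. That is already the complete datagram length, which is why the htons(count) assignment that follows no longer adds iph->ihl * 4.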
*/ iph = skb->nh.iph; iph->frag_off = 0; - iph->tot_len = htons((iph->ihl * 4) + count); + iph->tot_len = htons(count); ip_statistics.IpReasmOKs++; return skb; diff --git a/net/ipv4/ip_masq_app.c b/net/ipv4/ip_masq_app.c index 8772bd58c..b620bc82a 100644 --- a/net/ipv4/ip_masq_app.c +++ b/net/ipv4/ip_masq_app.c @@ -506,7 +506,7 @@ static struct sk_buff * skb_replace(struct sk_buff *skb, int pri, char *o_buf, i struct sk_buff *n_skb; int offset; - maxsize = skb->truesize - sizeof(struct sk_buff); + maxsize = skb->truesize; diff = n_len - o_len; o_offset = o_buf - (char*) skb->data; @@ -547,7 +547,6 @@ static struct sk_buff * skb_replace(struct sk_buff *skb, int pri, char *o_buf, i offset = n_skb->data - skb->data; n_skb->nh.raw = skb->nh.raw + offset; n_skb->h.raw = skb->h.raw + offset; - n_skb->when = skb->when; n_skb->dev = skb->dev; n_skb->mac.raw = skb->mac.raw + offset; n_skb->pkt_type = skb->pkt_type; diff --git a/net/ipv4/ip_masq_mod.c b/net/ipv4/ip_masq_mod.c index f6a50dfc6..7319a2624 100644 --- a/net/ipv4/ip_masq_mod.c +++ b/net/ipv4/ip_masq_mod.c @@ -2,10 +2,9 @@ * IP_MASQ_MOD masq modules support * * - * Version: @(#)ip_masq_mod.c 0.02 97/10/30 - * * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> * + * $Id: ip_masq_mod.c,v 1.4 1998/03/27 07:02:45 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -287,7 +286,6 @@ struct ip_masq_mod * ip_masq_mod_getbyname(const char *mmod_name) /* * Module control entry - * no need to lock (already locked in ip_masq.c) */ int ip_masq_mod_ctl(int optname, struct ip_fw_masqctl *mctl, int optlen) { diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 69179738e..7a1c141bb 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -31,6 +31,10 @@ * Andi Kleen: Fix broken PMTU recovery and remove * some redundant tests. * Vitaly E. Lavrov : Transparent proxy revived after year coma. + * Andi Kleen : Replace ip_reply with ip_send_reply. + * Andi Kleen : Split fast and slow ip_build_xmit path + * for decreased register pressure on x86 + * and more readibility. */ #include <asm/uaccess.h> @@ -70,7 +74,6 @@ #include <linux/firewall.h> #include <linux/mroute.h> #include <linux/netlink.h> -#include <linux/ipsec.h> /* * Shall we try to damage output packets if routing dev changes? @@ -88,6 +91,9 @@ __inline__ void ip_send_check(struct iphdr *iph) iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } +/* + * Add an ip header to a skbuff and send it out. + */ void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, u32 saddr, u32 daddr, struct ip_options *opt) { @@ -303,16 +309,6 @@ void ip_queue_xmit(struct sk_buff *skb) if (call_out_firewall(PF_INET, dev, iph, NULL, &skb) < FW_ACCEPT) goto drop; -#ifdef CONFIG_NET_SECURITY - /* Add an IP checksum (must do this before SECurity because - * of possible tunneling). - */ - ip_send_check(iph); - if (call_out_firewall(PF_SECURITY, NULL, NULL, (void *) 4, &skb) < FW_ACCEPT) - goto drop; - iph = skb->nh.iph; - /* Don't update tot_len, as the dev->mtu is already decreased. */ -#endif /* This can happen when the transport layer has segments queued * with a cached route, and by the time we get here things are * re-routed to a device with a different MTU than the original @@ -335,10 +331,9 @@ void ip_queue_xmit(struct sk_buff *skb) if (tot_len > rt->u.dst.pmtu) goto fragment; -#ifndef CONFIG_NET_SECURITY /* Add an IP checksum. 
*/ ip_send_check(iph); -#endif + skb->priority = sk->priority; skb->dst->output(skb); return; @@ -382,7 +377,7 @@ drop: * length to be copied. */ -int ip_build_xmit(struct sock *sk, +int ip_build_xmit_slow(struct sock *sk, int getfrag (const void *, char *, unsigned int, @@ -397,92 +392,16 @@ int ip_build_xmit(struct sock *sk, int err; int offset, mf; unsigned short id; - struct iphdr *iph; + int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15; int nfrags=0; struct ip_options *opt = ipc->opt; int df = htons(IP_DF); -#ifdef CONFIG_NET_SECURITY - int fw_res; -#endif if (sk->ip_pmtudisc == IP_PMTUDISC_DONT || - (rt->u.dst.mxlock&(1<<RTAX_MTU))) + (rt->u.dst.mxlock&(1<<RTAX_MTU))) df = 0; - - - /* - * Try the simple case first. This leaves fragmented frames, and by - * choice RAW frames within 20 bytes of maximum size(rare) to the long path - */ - - if (!sk->ip_hdrincl) - length += sizeof(struct iphdr); - - if (length <= rt->u.dst.pmtu && opt == NULL) { - int error; - struct sk_buff *skb=sock_alloc_send_skb(sk, length+hh_len+15, - 0, flags&MSG_DONTWAIT, &error); - if(skb==NULL) { - ip_statistics.IpOutDiscards++; - return error; - } - - skb->when=jiffies; - skb->priority = sk->priority; - skb->dst = dst_clone(&rt->u.dst); - skb_reserve(skb, hh_len); - - skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length); - - dev_lock_list(); - - if(!sk->ip_hdrincl) { - iph->version=4; - iph->ihl=5; - iph->tos=sk->ip_tos; - iph->tot_len = htons(length); - iph->id=htons(ip_id_count++); - iph->frag_off = df; - iph->ttl=sk->ip_mc_ttl; - if (rt->rt_type != RTN_MULTICAST) - iph->ttl=sk->ip_ttl; - iph->protocol=sk->protocol; - iph->saddr=rt->rt_src; - iph->daddr=rt->rt_dst; - iph->check=0; - iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); - err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4); - } - else - err = getfrag(frag, (void *)iph, 0, length); - dev_unlock_list(); - - if (err) - err = -EFAULT; - - if(!err && call_out_firewall(PF_INET, rt->u.dst.dev, iph, NULL, &skb) < FW_ACCEPT) - err = -EPERM; -#ifdef CONFIG_NET_SECURITY - if ((fw_res=call_out_firewall(PF_SECURITY, NULL, NULL, (void *) 5, &skb))<FW_ACCEPT) - { - kfree_skb(skb); - if (fw_res != FW_QUEUE) - return -EPERM; - else - return 0; - } -#endif - - if (err) - { - kfree_skb(skb); - return err; - } - - return rt->u.dst.output(skb); - } - + if (!sk->ip_hdrincl) length -= sizeof(struct iphdr); @@ -498,7 +417,7 @@ int ip_build_xmit(struct sock *sk, */ maxfraglen = ((rt->u.dst.pmtu-sizeof(struct iphdr)) & ~7) + fragheaderlen; - } + } if (length + fragheaderlen > 0xFFFF) return -EMSGSIZE; @@ -552,9 +471,9 @@ int ip_build_xmit(struct sock *sk, */ do { - struct sk_buff * skb; int error; char *data; + struct sk_buff * skb; /* * Get the memory we require with some space left for alignment. 
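To put numbers on the fragment sizing above: with no IP options, fragheaderlen is 20, so a path MTU of 576 gives maxfraglen = ((576 - 20) & ~7) + 20 = 572, i.e. 552 data bytes per non-final fragment (a multiple of 8, as the fragment-offset field requires); an MTU of 1500 gives the familiar 1480 data bytes per fragment.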
@@ -573,7 +492,6 @@ int ip_build_xmit(struct sock *sk, * Fill in the control structures */ - skb->when = jiffies; skb->priority = sk->priority; skb->dst = dst_clone(&rt->u.dst); skb_reserve(skb, hh_len); @@ -583,13 +501,15 @@ int ip_build_xmit(struct sock *sk, */ data = skb_put(skb, fraglen); - skb->nh.iph = iph = (struct iphdr *)data; + skb->nh.iph = (struct iphdr *)data; /* * Only write IP header onto non-raw packets */ if(!sk->ip_hdrincl) { + struct iphdr *iph = (struct iphdr *)data; + iph->version = 4; iph->ihl = 5; if (opt) { @@ -624,49 +544,148 @@ int ip_build_xmit(struct sock *sk, * User data callback */ - err = getfrag(frag, data, offset, fraglen-fragheaderlen); - if (err) + err = 0; + if (getfrag(frag, data, offset, fraglen-fragheaderlen)) err = -EFAULT; /* * Account for the fragment. */ - - if(!err && !offset && call_out_firewall(PF_INET, rt->u.dst.dev, iph, NULL, &skb) < FW_ACCEPT) + + if(!err && offset == 0 && + call_out_firewall(PF_INET, rt->u.dst.dev, skb->nh.iph, NULL, &skb) < FW_ACCEPT) err = -EPERM; -#ifdef CONFIG_NET_SECURITY - if ((fw_res=call_out_firewall(PF_SECURITY, NULL, NULL, (void *) 6, &skb))<FW_ACCEPT) - { - if (fw_res != FW_QUEUE) - err= -EPERM; - } -#endif - if (err) - { + + if (err) { + ip_statistics.IpOutDiscards++; kfree_skb(skb); dev_unlock_list(); - return err; - } + return err; + } + + offset -= (maxfraglen-fragheaderlen); fraglen = maxfraglen; - nfrags++; - + + err = 0; if (rt->u.dst.output(skb)) { - if (nfrags>1) - ip_statistics.IpFragCreates += nfrags; - dev_unlock_list(); - return -ENETDOWN; + err = -ENETDOWN; + ip_statistics.IpOutDiscards++; + break; } } while (offset >= 0); if (nfrags>1) ip_statistics.IpFragCreates += nfrags; + dev_unlock_list(); + return err; +} + + +/* + * Fast path for unfragmented packets. + */ +int ip_build_xmit(struct sock *sk, + int getfrag (const void *, + char *, + unsigned int, + unsigned int), + const void *frag, + unsigned length, + struct ipcm_cookie *ipc, + struct rtable *rt, + int flags) +{ + int err; + struct sk_buff *skb; + int df; + struct iphdr *iph; + + /* + * Try the simple case first. This leaves fragmented frames, and by + * choice RAW frames within 20 bytes of maximum size(rare) to the long path + */ + + if (!sk->ip_hdrincl) + length += sizeof(struct iphdr); + + /* + * Check for slow path. + */ + if (length > rt->u.dst.pmtu || ipc->opt != NULL) + return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags); + + /* + * Do path mtu discovery if needed. + */ + df = htons(IP_DF); + if (sk->ip_pmtudisc == IP_PMTUDISC_DONT || + (rt->u.dst.mxlock&(1<<RTAX_MTU))) + df = 0; + + /* + * Fast path for unfragmented frames without options. 
+ */ + { + int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15; + + skb = sock_alloc_send_skb(sk, length+hh_len+15, + 0, flags&MSG_DONTWAIT, &err); + if(skb==NULL) + goto error; + skb_reserve(skb, hh_len); + } + + skb->priority = sk->priority; + skb->dst = dst_clone(&rt->u.dst); + + skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length); + + dev_lock_list(); + + if(!sk->ip_hdrincl) { + iph->version=4; + iph->ihl=5; + iph->tos=sk->ip_tos; + iph->tot_len = htons(length); + iph->id=htons(ip_id_count++); + iph->frag_off = df; + iph->ttl=sk->ip_mc_ttl; + if (rt->rt_type != RTN_MULTICAST) + iph->ttl=sk->ip_ttl; + iph->protocol=sk->protocol; + iph->saddr=rt->rt_src; + iph->daddr=rt->rt_dst; + iph->check=0; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4); + } + else + err = getfrag(frag, (void *)iph, 0, length); dev_unlock_list(); - return 0; + + if (err) + err = -EFAULT; + + if(!err && call_out_firewall(PF_INET, rt->u.dst.dev, iph, NULL, &skb) < FW_ACCEPT) + err = -EPERM; + + if (err) { + kfree_skb(skb); + goto error; + } + + return rt->u.dst.output(skb); + +error: + ip_statistics.IpOutDiscards++; + return err; } + + /* * This IP datagram is too large to be sent in one piece. Break it up into @@ -684,7 +703,7 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) unsigned char *ptr; struct device *dev; struct sk_buff *skb2; - int left, mtu, hlen, len; + unsigned int mtu, hlen, left, len; int offset; int not_last_frag; u16 dont_fragment; @@ -714,11 +733,8 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) * in this case we were fortunate it didn't happen */ - if (mtu<8) { - ip_statistics.IpFragFails++; - kfree_skb(skb); - return; - } + if (mtu<8) + goto fail; /* * Fragment the datagram. @@ -747,8 +763,7 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) /* IF: we are not sending upto and including the packet end then align the next start on an eight byte boundary */ if (len < left) { - len/=8; - len*=8; + len &= ~7; } /* * Allocate buffer. @@ -756,16 +771,13 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15,GFP_ATOMIC)) == NULL) { NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n")); - ip_statistics.IpFragFails++; - kfree_skb(skb); - return; + goto fail; } /* * Set up data on packet */ - skb2->when = skb->when; skb2->pkt_type = skb->pkt_type; skb2->priority = skb->priority; skb_reserve(skb2, (dev->hard_header_len+15)&~15); @@ -832,61 +844,96 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) } kfree_skb(skb); ip_statistics.IpFragOKs++; + return; + +fail: + kfree_skb(skb); + ip_statistics.IpFragFails++; } -struct sk_buff * ip_reply(struct sk_buff *skb, int payload) +/* + * Fetch data from kernel space and fill in checksum if needed. + */ +static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset, + unsigned int fraglen) +{ + struct ip_reply_arg *dp = (struct ip_reply_arg*)dptr; + u16 *pktp = (u16 *)to; + struct iovec *iov; + int len; + int hdrflag = 1; + +#if 0 + printk("ip_reply_glue_bits: offset=%u,flen=%u iov[0].l=%u,iov[1].len=%u\n", + offset,fraglen,dp->iov[0].iov_len,dp->iov[1].iov_len); +#endif + + iov = &dp->iov[0]; + if (offset >= iov->iov_len) { + offset -= iov->iov_len; + iov++; + hdrflag = 0; + } + len = iov->iov_len - offset; + if (fraglen > len) { /* overlapping. 
*/ +#if 1 + if (iov > &dp->iov[0]) { + printk("frag too long! (o=%u,fl=%u)\n",offset,fraglen); + return -1; + } +#endif + dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len, + dp->csum); + offset = 0; + fraglen -= len; + to += len; + iov++; + } + + dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, fraglen, + dp->csum); + + if (hdrflag && dp->csumoffset) + *(pktp + dp->csumoffset) = csum_fold(dp->csum); /* fill in checksum */ + return 0; +} + +/* + * Generic function to send a packet as reply to another packet. + * Used to send TCP resets so far. ICMP should use this function too. + * + * Should run single threaded per socket because it uses the sock + * structure to pass arguments. + */ +void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, + unsigned int len) { struct { struct ip_options opt; char data[40]; } replyopts; - - struct rtable *rt = (struct rtable*)skb->dst; - struct sk_buff *reply; - int iphlen; - struct iphdr *iph; - struct ipcm_cookie ipc; u32 daddr; - + struct rtable *rt = (struct rtable*)skb->dst; + if (ip_options_echo(&replyopts.opt, skb)) - return NULL; + return; + + sk->ip_tos = skb->nh.iph->tos; + sk->priority = skb->priority; + sk->protocol = skb->nh.iph->protocol; daddr = ipc.addr = rt->rt_src; ipc.opt = &replyopts.opt; + if (ipc.opt->srr) daddr = replyopts.opt.faddr; - if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0)) - return NULL; - - iphlen = sizeof(struct iphdr) + replyopts.opt.optlen; - reply = alloc_skb(rt->u.dst.dev->hard_header_len+15+iphlen+payload, GFP_ATOMIC); - if (reply == NULL) { - ip_rt_put(rt); - return NULL; - } - - reply->priority = skb->priority; - reply->dst = &rt->u.dst; - skb_reserve(reply, (rt->u.dst.dev->hard_header_len+15)&~15); - - /* Now build the IP header. */ - reply->nh.iph = iph = (struct iphdr *)skb_put(reply, iphlen); - - iph->version = 4; - iph->ihl = iphlen>>2; - iph->tos = skb->nh.iph->tos; - iph->frag_off = 0; - iph->ttl = MAXTTL; - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; - iph->protocol = skb->nh.iph->protocol; - iph->id = htons(ip_id_count++); - - ip_options_build(reply, &replyopts.opt, daddr, rt, 0); + return; - return reply; + /* And let IP do all the hard work. */ + ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT); + ip_rt_put(rt); } /* diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 1177f33ac..d3c7503df 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -703,8 +703,7 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen) mrtsock_destruct(sk); return -EADDRINUSE; case MRT_DONE: - mrtsock_destruct(sk); - return 0; + return ip_ra_control(sk, 0, NULL); case MRT_ADD_VIF: case MRT_DEL_VIF: if(optlen!=sizeof(vif)) diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 0ea231adf..fc6176c0f 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -25,6 +25,7 @@ * Alan Cox : Allow inode to be NULL (kernel socket) * Andi Kleen : Add support for open_requests and * split functions for more readibility. 
+ * Andi Kleen : Add support for /proc/net/netstat * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -98,12 +99,19 @@ static inline void get__sock(struct sock *sp, char *tmpbuf, int i, int format) destp = ntohs(destp); srcp = ntohs(srcp); if((format == 0) && (sp->state == TCP_TIME_WAIT)) { + extern int tcp_tw_death_row_slot; struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sp; + int slot_dist; tw_bucket = 1; timer_active1 = timer_active2 = 0; timer_active = 3; - timer_expires = tw->timer.expires; + slot_dist = tw->death_slot; + if(slot_dist > tcp_tw_death_row_slot) + slot_dist = (TCP_TWKILL_SLOTS - slot_dist) + tcp_tw_death_row_slot; + else + slot_dist = tcp_tw_death_row_slot - slot_dist; + timer_expires = jiffies + (slot_dist * TCP_TWKILL_PERIOD); } else { timer_active1 = del_timer(&tp->retransmit_timer); timer_active2 = del_timer(&sp->timer); @@ -337,3 +345,33 @@ int snmp_get_info(char *buffer, char **start, off_t offset, int length, int dumm len = length; return len; } + +/* + * Output /proc/net/netstat + */ + +int netstat_get_info(char *buffer, char **start, off_t offset, int length, int dummy) +{ + extern struct linux_mib net_statistics; + int len; + + len = sprintf(buffer, + "TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed" + "EmbryonicRsts\n" + "TcpExt: %lu %lu %lu %lu\n", + net_statistics.SyncookiesSent, + net_statistics.SyncookiesRecv, + net_statistics.SyncookiesFailed, + net_statistics.EmbryonicRsts); + + if (offset >= len) + { + *start = buffer; + return 0; + } + *start = buffer + offset; + len -= offset; + if (len > length) + len = length; + return len; +} diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 464090776..7d21af4a8 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -577,7 +577,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) if (rt != NULL) { if (dst->obsolete || rt->rt_flags&RTCF_REDIRECTED) { #if RT_CACHE_DEBUG >= 1 - printk(KERN_DEBUG "ip_rt_advice: redirect to %08x/%02x dropped\n", rt->rt_dst, rt->key.tos); + printk(KERN_DEBUG "ip_rt_advice: redirect to %d.%d.%d.%d/%02x dropped\n", NIPQUAD(rt->rt_dst), rt->key.tos); #endif ip_rt_put(rt); rt_cache_flush(0); @@ -725,11 +725,11 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) mtu = guess_mtu(old_mtu); } - if (mtu < rth->u.dst.pmtu) { - /* New mtu received -> path was valid */ - dst_confirm(&rth->u.dst); - - rth->u.dst.pmtu = mtu; + if (mtu <= rth->u.dst.pmtu) { + if (mtu < rth->u.dst.pmtu) { + dst_confirm(&rth->u.dst); + rth->u.dst.pmtu = mtu; + } est_mtu = mtu; } } @@ -808,11 +808,18 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res) #endif rt->u.dst.window= fi->fib_window ? : 0; rt->u.dst.rtt = fi->fib_rtt ? : TCP_TIMEOUT_INIT; +#ifdef CONFIG_NET_CLS_ROUTE + rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid; +#endif } else { rt->u.dst.pmtu = rt->u.dst.dev->mtu; rt->u.dst.window= 0; rt->u.dst.rtt = TCP_TIMEOUT_INIT; } +#ifdef CONFIG_NET_CLS_ROUTE + if (rt->u.dst.tclassid == 0) + rt->u.dst.tclassid = fib_rules_tclass(res); +#endif rt->rt_type = res->type; } @@ -1205,6 +1212,9 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int key.oif = oif; key.scope = (tos&RTO_ONLINK) ? 
RT_SCOPE_LINK : RT_SCOPE_UNIVERSE; res.fi = NULL; +#ifdef CONFIG_IP_MULTIPLE_TABLES + res.r = NULL; +#endif if (saddr) { if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr)) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 00dd0a8ef..a3e3be0f1 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -9,11 +9,9 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * $Id: syncookies.c,v 1.4 1998/03/08 05:56:34 davem Exp $ + * $Id: syncookies.c,v 1.5 1998/04/03 09:49:46 freitag Exp $ * * Missing: IPv6 support. - * Some counter so that the Administrator can see when the machine - * is under a syn flood attack. */ #include <linux/config.h> @@ -88,6 +86,8 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, found: *mssp = w[-1]; + net_statistics.SyncookiesSent++; + isn |= i; return isn; } @@ -110,8 +110,9 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie) __u32 seq; if ((jiffies - tcp_lastsynq_overflow) > TCP_TIMEOUT_INIT - && tcp_lastsynq_overflow) + && tcp_lastsynq_overflow) { return 0; + } mssind = cookie & 7; cookie &= ~7; @@ -157,8 +158,12 @@ cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt) return sk; mss = cookie_check(skb, cookie); - if (mss == 0) + if (mss == 0) { + net_statistics.SyncookiesFailed++; return sk; + } + + net_statistics.SyncookiesRecv++; req = tcp_openreq_alloc(); if (req == NULL) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index da64fc186..92b980b55 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -48,6 +48,7 @@ extern int sysctl_tcp_hoe_retransmits; extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; +extern int sysctl_tcp_retrans_collapse; extern int sysctl_tcp_keepalive_time; extern int sysctl_tcp_keepalive_probes; extern int sysctl_tcp_max_ka_probes; @@ -62,7 +63,6 @@ extern int sysctl_tcp_syn_taildrop; extern int sysctl_max_syn_backlog; /* From icmp.c */ -extern int sysctl_icmp_sourcequench_time; extern int sysctl_icmp_destunreach_time; extern int sysctl_icmp_timeexceed_time; extern int sysctl_icmp_paramprob_time; @@ -70,9 +70,6 @@ extern int sysctl_icmp_echoreply_time; int tcp_retr1_max = 255; -extern int tcp_sysctl_congavoid(ctl_table *ctl, int write, struct file * filp, - void *buffer, size_t *lenp); - struct ipv4_config ipv4_config; extern ctl_table ipv4_route_table[]; @@ -108,9 +105,9 @@ ctl_table ipv4_table[] = { {NET_IPV4_TCP_SACK, "tcp_sack", &sysctl_tcp_sack, sizeof(int), 0644, NULL, &proc_dointvec}, - {NET_IPV4_TCP_VEGAS_CONG_AVOID, "tcp_vegas_cong_avoid", - &sysctl_tcp_cong_avoidance, sizeof(int), 0644, - NULL, &tcp_sysctl_congavoid }, + {NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse", + &sysctl_tcp_retrans_collapse, sizeof(int), 0644, NULL, + &proc_dointvec}, {NET_IPV4_FORWARD, "ip_forward", &ipv4_devconf.forwarding, sizeof(int), 0644, NULL, &ipv4_sysctl_forward}, @@ -161,8 +158,6 @@ ctl_table ipv4_table[] = { sizeof(int), 0644, NULL, &proc_dointvec}, {NET_TCP_RFC1337, "tcp_rfc1337", &sysctl_tcp_rfc1337, sizeof(int), 0644, NULL, &proc_dointvec}, - {NET_TCP_SYN_TAILDROP, "tcp_syn_taildrop", &sysctl_tcp_syn_taildrop, - sizeof(int), 0644, NULL, &proc_dointvec}, {NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog", &sysctl_max_syn_backlog, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range", @@ -174,8 +169,6 @@ ctl_table ipv4_table[] = { 
{NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts", &sysctl_icmp_echo_ignore_broadcasts, sizeof(int), 0644, NULL, &proc_dointvec}, - {NET_IPV4_ICMP_SOURCEQUENCH_RATE, "icmp_sourcequench_rate", - &sysctl_icmp_sourcequench_time, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ICMP_DESTUNREACH_RATE, "icmp_destunreach_rate", &sysctl_icmp_destunreach_time, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ICMP_TIMEEXCEED_RATE, "icmp_timeexceed_rate", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d57b7e3ef..fd4284af9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.104 1998/03/22 22:10:30 davem Exp $ + * Version: $Id: tcp.c,v 1.114 1998/04/26 01:11:33 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -197,7 +197,7 @@ * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD * Willy Konynenberg : Transparent proxying support. * Mike McLagan : Routing by source - * Keith Owens : Do proper meging with partial SKB's in + * Keith Owens : Do proper merging with partial SKB's in * tcp_do_sendmsg to avoid burstiness. * Eric Schenk : Fix fast close down bug with * shutdown() followed by close(). @@ -451,35 +451,6 @@ static struct open_request *tcp_find_established(struct tcp_opt *tp, } /* - * This routine closes sockets which have been at least partially - * opened, but not yet accepted. Currently it is only called by - * tcp_close, and timeout mirrors the value there. - */ - -static void tcp_close_pending (struct sock *sk) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct open_request *req = tp->syn_wait_queue; - - while(req) { - struct open_request *iter; - - if (req->sk) - tcp_close(req->sk, 0); - - iter = req; - req = req->dl_next; - - (*iter->class->destructor)(iter); - tcp_dec_slow_timer(TCP_SLT_SYNACK); - sk->ack_backlog--; - tcp_openreq_free(iter); - } - - tcp_synq_init(tp); -} - -/* * Walk down the receive queue counting readable data. * * Must be called with the socket lock held. @@ -506,21 +477,19 @@ static int tcp_readable(struct sock *sk) /* Do until a push or until we are out of data. */ do { /* Found a hole so stops here. */ - if (before(counted, skb->seq)) /* should not happen */ + if (before(counted, TCP_SKB_CB(skb)->seq)) /* should not happen */ break; /* Length - header but start from where we are up to * avoid overlaps. */ - sum = skb->len - (counted - skb->seq); - if (skb->h.th->syn) - sum++; - if (sum > 0) { + sum = skb->len - (counted - TCP_SKB_CB(skb)->seq); + if (sum >= 0) { /* Add it up, move on. */ amount += sum; - if (skb->h.th->syn) - amount--; counted += sum; + if (skb->h.th->syn) + counted++; } /* Don't count urg data ... but do it in the right place! @@ -608,7 +577,7 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) /* More than half of the socket queue free? 
*/ space = atomic_read(&sk->wmem_alloc) / 2; #endif - /* Always wake the user up when an error occured */ + /* Always wake the user up when an error occurred */ if (sock_wspace(sk) >= space || sk->err) mask |= POLLOUT | POLLWRNORM; if (tp->urg_data) @@ -619,44 +588,41 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) { + int answ; + switch(cmd) { - case TIOCINQ: + case TIOCINQ: #ifdef FIXME /* FIXME: */ - case FIONREAD: + case FIONREAD: #endif - { - unsigned long amount; - - if (sk->state == TCP_LISTEN) - return(-EINVAL); - - lock_sock(sk); - amount = tcp_readable(sk); - release_sock(sk); - return put_user(amount, (int *)arg); - } - case SIOCATMARK: + if (sk->state == TCP_LISTEN) + return(-EINVAL); + lock_sock(sk); + answ = tcp_readable(sk); + release_sock(sk); + break; + case SIOCATMARK: { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int answ = tp->urg_data && tp->urg_seq == tp->copied_seq; - return put_user(answ,(int *) arg); - } - case TIOCOUTQ: - { - unsigned long amount; - - if (sk->state == TCP_LISTEN) - return(-EINVAL); - amount = sock_wspace(sk); - return put_user(amount, (int *)arg); + answ = tp->urg_data && tp->urg_seq == tp->copied_seq; + break; } - default: - return(-ENOIOCTLCMD); + case TIOCOUTQ: + if (sk->state == TCP_LISTEN) + return(-EINVAL); + answ = sock_wspace(sk); + break; + default: + return(-ENOIOCTLCMD); }; + + return put_user(answ, (int *)arg); } /* * Wait for a socket to get into the connected state + * + * Note: must be called with the socket locked. */ static int wait_for_tcp_connect(struct sock * sk, int flags) { @@ -729,28 +695,27 @@ static void wait_for_tcp_memory(struct sock * sk) /* * This routine copies from a user buffer into a socket, * and starts the transmit system. + * + * Note: must be called with the socket locked. */ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int mss_now = sk->mss; + int mss_now; int err = 0; int copied = 0; + /* Verify that the socket is locked */ + if (!sk->sock_readers) + printk("tcp_do_sendmsg: socket not locked!\n"); + /* Wait for a connection to finish. */ if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) if((err = wait_for_tcp_connect(sk, flags)) != 0) return err; - /* The socket is locked, nothing can change the state of pending - * SACKs or IP options. - */ - if(tp->sack_ok && tp->num_sacks) - mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + - (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); - if(sk->opt && sk->opt->optlen) - mss_now -= (sk->opt->optlen); + mss_now = tcp_current_mss(sk); /* Ok commence sending. 
*/ while(--iovlen >= 0) { @@ -788,7 +753,7 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) */ if (skb_tailroom(skb) > 0 && (mss_now - copy) > 0 && - tp->snd_nxt < skb->end_seq) { + tp->snd_nxt < TCP_SKB_CB(skb)->end_seq) { int last_byte_was_odd = (copy % 4); copy = mss_now - copy; @@ -809,7 +774,7 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) copy, skb->csum, &err); } tp->write_seq += copy; - skb->end_seq += copy; + TCP_SKB_CB(skb)->end_seq += copy; from += copy; copied += copy; seglen -= copy; @@ -839,7 +804,7 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) if(copy > seglen) copy = seglen; - tmp = MAX_HEADER + sk->prot->max_header + 15; + tmp = MAX_HEADER + sk->prot->max_header; queue_it = 0; if (copy < min(mss_now, tp->max_window >> 1) && !(flags & MSG_OOB)) { @@ -870,6 +835,11 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) goto do_interrupted; } wait_for_tcp_memory(sk); + + /* If SACK's were formed or PMTU events happened, + * we must find out about it. + */ + mss_now = tcp_current_mss(sk); continue; } @@ -897,8 +867,8 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) from += copy; copied += copy; - skb->seq = tp->write_seq; - skb->end_seq = skb->seq + copy; + TCP_SKB_CB(skb)->seq = tp->write_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy; /* This advances tp->write_seq for us. */ tcp_send_skb(sk, skb, queue_it); @@ -936,10 +906,8 @@ void tcp_read_wakeup(struct sock *sk) /* If we're closed, don't send an ack, or we'll get a RST * from the closed destination. */ - if ((1 << sk->state) & (TCPF_CLOSE|TCPF_TIME_WAIT)) - return; - - tcp_send_ack(sk); + if (sk->state != TCP_CLOSE) + tcp_send_ack(sk); } /* @@ -951,7 +919,6 @@ static int tcp_recv_urg(struct sock * sk, int nonblock, struct msghdr *msg, int len, int flags, int *addr_len) { - int err=0; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* No URG data to read. */ @@ -961,22 +928,19 @@ static int tcp_recv_urg(struct sock * sk, int nonblock, if (sk->err) return sock_error(sk); - if (sk->state == TCP_CLOSE || sk->done) { - if (!sk->done) { - sk->done = 1; - return 0; - } + if (sk->done) return -ENOTCONN; - } - if (sk->shutdown & RCV_SHUTDOWN) { + if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN)) { sk->done = 1; return 0; } lock_sock(sk); if (tp->urg_data & URG_VALID) { + int err = 0; char c = tp->urg_data; + if (!(flags & MSG_PEEK)) tp->urg_data = URG_READ; @@ -994,11 +958,13 @@ static int tcp_recv_urg(struct sock * sk, int nonblock, if(len>0) { err = memcpy_toiovec(msg->msg_iov, &c, 1); + /* N.B. already set above ... */ msg->msg_flags|=MSG_OOB; } else msg->msg_flags|=MSG_TRUNC; + /* N.B. Is this right?? If len == 0 we didn't read any data */ return err ? -EFAULT : 1; } release_sock(sk); @@ -1135,12 +1101,12 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, /* Now that we have two receive queues this * shouldn't happen. 
*/ - if (before(*seq, skb->seq)) { + if (before(*seq, TCP_SKB_CB(skb)->seq)) { printk(KERN_INFO "recvmsg bug: copied %X seq %X\n", - *seq, skb->seq); + *seq, TCP_SKB_CB(skb)->seq); break; } - offset = *seq - skb->seq; + offset = *seq - TCP_SKB_CB(skb)->seq; if (skb->h.th->syn) offset--; if (offset < skb->len) @@ -1160,6 +1126,11 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, break; } + if (sk->shutdown & RCV_SHUTDOWN) { + sk->done = 1; + break; + } + if (sk->state == TCP_CLOSE) { if (!sk->done) { sk->done = 1; @@ -1169,11 +1140,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, break; } - if (sk->shutdown & RCV_SHUTDOWN) { - sk->done = 1; - break; - } - if (nonblock) { copied = -EAGAIN; break; @@ -1225,10 +1191,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, * a crash when cleanup_rbuf() gets called. */ err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used); - if (err) { /* Exception. Bailout! */ - *seq -= err; atomic_dec(&skb->users); copied = -EFAULT; break; @@ -1288,43 +1252,43 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, } /* + * Check whether to renew the timer. + */ +static inline void tcp_check_fin_timer(struct sock *sk) +{ + if (sk->state == TCP_FIN_WAIT2 && !sk->timer.prev) + tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout); +} + +/* * State processing on a close. This implements the state shift for * sending our FIN frame. Note that we only send a FIN for some * states. A shutdown() may have already sent the FIN, or we may be * closed. */ +static unsigned char new_state[16] = { + /* current state: new state: action: */ + /* (Invalid) */ TCP_CLOSE, + /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, + /* TCP_SYN_SENT */ TCP_CLOSE, + /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, + /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1, + /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2, + /* TCP_TIME_WAIT */ TCP_CLOSE, + /* TCP_CLOSE */ TCP_CLOSE, + /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN, + /* TCP_LAST_ACK */ TCP_LAST_ACK, + /* TCP_LISTEN */ TCP_CLOSE, + /* TCP_CLOSING */ TCP_CLOSING, +}; + static int tcp_close_state(struct sock *sk, int dead) { - int ns=TCP_CLOSE; - int send_fin=0; - switch(sk->state) { - case TCP_SYN_SENT: /* No SYN back, no FIN needed */ - break; - case TCP_SYN_RECV: - case TCP_ESTABLISHED: /* Closedown begin */ - ns=TCP_FIN_WAIT1; - send_fin=1; - break; - case TCP_FIN_WAIT1: /* Already closing, or FIN sent: no change */ - case TCP_FIN_WAIT2: - case TCP_CLOSING: - ns=sk->state; - break; - case TCP_CLOSE: - case TCP_LISTEN: - break; - case TCP_LAST_ACK: /* Could have shutdown() then close() - * (but don't do send_fin again!) */ - ns=TCP_LAST_ACK; - break; - case TCP_CLOSE_WAIT: /* They have FIN'd us. We send our FIN and - wait only for the ACK */ - ns=TCP_LAST_ACK; - send_fin=1; - }; + int next = (int) new_state[sk->state]; + int ns = (next & TCP_STATE_MASK); - tcp_set_state(sk,ns); + tcp_set_state(sk, ns); /* This is a (useful) BSD violating of the RFC. There is a * problem with TCP as specified in that the other end could @@ -1334,10 +1298,10 @@ static int tcp_close_state(struct sock *sk, int dead) * that we won't make the old 4*rto = almost no time - whoops * reset mistake. 
*/ - if(dead && ns == TCP_FIN_WAIT2 && !sk->timer.prev) - tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout); + if (dead) + tcp_check_fin_timer(sk); - return send_fin; + return (next & TCP_ACTION_FIN); } /* @@ -1380,12 +1344,47 @@ static inline int closing(struct sock * sk) return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK)); } +/* + * This routine closes sockets which have been at least partially + * opened, but not yet accepted. Currently it is only called by + * tcp_close, and timeout mirrors the value there. + */ + +static void tcp_close_pending (struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct open_request *req = tp->syn_wait_queue; + + while(req) { + struct open_request *iter; + + if (req->sk) + tcp_close(req->sk, 0); + + iter = req; + req = req->dl_next; + + (*iter->class->destructor)(iter); + tcp_dec_slow_timer(TCP_SLT_SYNACK); + sk->ack_backlog--; + tcp_openreq_free(iter); + } + + tcp_synq_init(tp); +} void tcp_close(struct sock *sk, unsigned long timeout) { struct sk_buff *skb; int data_was_unread = 0; + /* + * Check whether the socket is locked ... supposedly + * it's impossible to tcp_close() a locked socket. + */ + if (sk->sock_readers) + printk("tcp_close: socket already locked!\n"); + /* We need to grab some memory, and put together a FIN, * and then put it into the queue to be sent. */ @@ -1399,7 +1398,12 @@ void tcp_close(struct sock *sk, unsigned long timeout) return; } - sk->keepopen = 1; + /* It is questionable, what the role of this is now. + * In any event either it should be removed, or + * increment of SLT_KEEPALIVE be done, this is causing + * big problems. For now I comment it out. -DaveM + */ + /* sk->keepopen = 1; */ sk->shutdown = SHUTDOWN_MASK; if (!sk->dead) @@ -1409,7 +1413,7 @@ void tcp_close(struct sock *sk, unsigned long timeout) * descriptor close, not protocol-sourced closes, because the * reader process may not have drained the data yet! */ - while((skb=skb_dequeue(&sk->receive_queue))!=NULL) { + while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) { data_was_unread++; kfree_skb(skb); } @@ -1438,12 +1442,14 @@ void tcp_close(struct sock *sk, unsigned long timeout) struct task_struct *tsk = current; struct wait_queue wait = { tsk, NULL }; - tsk->state = TASK_INTERRUPTIBLE; tsk->timeout = timeout; add_wait_queue(sk->sleep, &wait); release_sock(sk); - while (closing(sk)) { + while (1) { + tsk->state = TASK_INTERRUPTIBLE; + if (!closing(sk)) + break; schedule(); if (signal_pending(tsk) || !tsk->timeout) break; @@ -1459,8 +1465,7 @@ void tcp_close(struct sock *sk, unsigned long timeout) /* Now that the socket is dead, if we are in the FIN_WAIT2 state * we may need to set up a timer. 
*/ - if (sk->state == TCP_FIN_WAIT2 && !sk->timer.prev) - tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout); + tcp_check_fin_timer(sk); sk->dead = 1; release_sock(sk); @@ -1474,7 +1479,7 @@ static struct open_request * wait_for_connect(struct sock * sk, struct open_request **pprev) { struct wait_queue wait = { current, NULL }; - struct open_request *req = NULL; + struct open_request *req; add_wait_queue(sk->sleep, &wait); for (;;) { @@ -1488,6 +1493,7 @@ static struct open_request * wait_for_connect(struct sock * sk, if (signal_pending(current)) break; } + current->state = TASK_RUNNING; remove_wait_queue(sk->sleep, &wait); return req; } @@ -1517,21 +1523,21 @@ struct sock *tcp_accept(struct sock *sk, int flags) /* Find already established connection */ req = tcp_find_established(tp, &prev); if (!req) { - /* If this is a non blocking socket don't sleep */ - error = EAGAIN; - if (flags & O_NONBLOCK) + /* If this is a non blocking socket don't sleep */ + error = EAGAIN; + if (flags & O_NONBLOCK) goto out; - error = ERESTARTSYS; - req = wait_for_connect(sk, &prev); - if (!req) + error = ERESTARTSYS; + req = wait_for_connect(sk, &prev); + if (!req) goto out; } tcp_synq_unlink(tp, req, prev); newsk = req->sk; tcp_openreq_free(req); - sk->ack_backlog--; /* XXX */ + sk->ack_backlog--; /* FIXME: need to check here if newsk has already * an soft_err or err set. @@ -1625,8 +1631,16 @@ void tcp_set_keepalive(struct sock *sk, int val) tcp_dec_slow_timer(TCP_SLT_KEEPALIVE); } +extern void __skb_cb_too_small_for_tcp(int, int); + __initfunc(void tcp_init(void)) { + struct sk_buff *skb = NULL; + + if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)) + __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), + sizeof(skb->cb)); + tcp_openreq_cachep = kmem_cache_create("tcp_open_request", sizeof(struct open_request), 0, SLAB_HWCACHE_ALIGN, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1c34e6693..d5b0b15c6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.98 1998/03/23 22:54:48 davem Exp $ + * Version: $Id: tcp_input.c,v 1.114 1998/04/28 06:42:22 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -42,6 +42,14 @@ * Andi Kleen : Moved open_request checking here * and process RSTs for open_requests. * Andi Kleen : Better prune_queue, and other fixes. + * Andrey Savochkin: Fix RTT measurements in the presnce of + * timestamps. + * Andrey Savochkin: Check sequence numbers correctly when + * removing SACKs due to in sequence incoming + * data segments. + * Andi Kleen: Make sure we never ack data there is not + * enough room for. Also make this condition + * a fatal error if it might still happen. 
*/ #include <linux/config.h> @@ -50,15 +58,6 @@ #include <net/tcp.h> #include <linux/ipsec.h> -typedef void (*tcp_sys_cong_ctl_t)(struct sock *sk, - u32 seq, u32 ack, - u32 seq_rtt); - -static void tcp_cong_avoid_vanj(struct sock *sk, u32 seq, u32 ack, - u32 seq_rtt); -static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack, - u32 seq_rtt); - #ifdef CONFIG_SYSCTL #define SYNC_INIT 0 /* let the user enable it */ #else @@ -80,7 +79,7 @@ int sysctl_tcp_syncookies = SYNC_INIT; int sysctl_tcp_stdurg; int sysctl_tcp_rfc1337; -static tcp_sys_cong_ctl_t tcp_sys_cong_ctl_f = &tcp_cong_avoid_vanj; +static int prune_queue(struct sock *sk); /* There is something which you must keep in mind when you analyze the * behavior of the tp->ato delayed ack timeout interval. When a @@ -164,7 +163,7 @@ static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) static __inline__ void tcp_set_rto(struct tcp_opt *tp) { tp->rto = (tp->srtt >> 3) + tp->mdev; - tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1)); + tp->rto += (tp->rto >> 2) + (tp->rto >> ((tp->snd_cwnd>>TCP_CWND_SHIFT)-1)); } @@ -176,7 +175,7 @@ static __inline__ void tcp_set_rto(struct tcp_opt *tp) * way to avoid the problem. Is it possible to drop the lower * bound and still avoid trouble with BSD stacks? Perhaps * some modification to the RTO calculation that takes delayed - * ack bais into account? This needs serious thought. -- erics + * ack bias into account? This needs serious thought. -- erics */ static __inline__ void tcp_bound_rto(struct tcp_opt *tp) { @@ -193,19 +192,27 @@ extern __inline__ void tcp_replace_ts_recent(struct tcp_opt *tp, __u32 end_seq) * test is last_ack_sent <= end_seq. * (RFC1323 stated last_ack_sent < end_seq.) */ - if (!before(end_seq,tp->last_ack_sent)) { - tp->ts_recent = tp->rcv_tsval; - tp->ts_recent_stamp = jiffies; + if (!before(end_seq, tp->last_ack_sent)) { + /* PAWS bug workaround wrt. ACK frames, the PAWS discard + * extra check below makes sure this can only happen + * for pure ACK frames. -DaveM + */ + if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) { + tp->ts_recent = tp->rcv_tsval; + tp->ts_recent_stamp = jiffies; + } } } #define PAWS_24DAYS (HZ * 60 * 60 * 24 * 24) -extern __inline__ int tcp_paws_discard(struct tcp_opt *tp) +extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, __u16 len) { /* ts_recent must be younger than 24 days */ return (((jiffies - tp->ts_recent_stamp) >= PAWS_24DAYS) || - ((s32)(tp->rcv_tsval-tp->ts_recent) < 0)); + (((s32)(tp->rcv_tsval-tp->ts_recent) < 0) && + /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM */ + (len != (th->doff * 4)))); } @@ -266,15 +273,34 @@ static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, struct sk_buff *skb = skb_peek(&sk->write_queue); __u32 start_seq = ntohl(sp->start_seq); __u32 end_seq = ntohl(sp->end_seq); + int fack_count = 0; while((skb != NULL) && (skb != tp->send_head) && (skb != (struct sk_buff *)&sk->write_queue)) { + /* The retransmission queue is always in order, so + * we can short-circuit the walk early. + */ + if(!before(start_seq, TCP_SKB_CB(skb)->end_seq)) + break; + /* We play conservative, we don't allow SACKS to partially * tag a sequence space. */ - if(!after(start_seq, skb->seq) && !before(end_seq, skb->end_seq)) + fack_count++; + if(!after(start_seq, TCP_SKB_CB(skb)->seq) && + !before(end_seq, TCP_SKB_CB(skb)->end_seq)) { + /* If this was a retransmitted frame, account for it. 
*/ + if(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) + tp->retrans_out--; TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; + + /* RULE: All new SACKs will either decrease retrans_out + * or advance fackets_out. + */ + if(fack_count > tp->fackets_out) + tp->fackets_out = fack_count; + } skb = skb->next; } sp++; /* Move on to the next SACK block. */ @@ -322,6 +348,13 @@ void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, i if (!no_fancy && sysctl_tcp_window_scaling) { tp->wscale_ok = 1; tp->snd_wscale = *(__u8 *)ptr; + if(tp->snd_wscale > 14) { + if(net_ratelimit()) + printk("tcp_parse_options: Illegal window " + "scaling value %d >14 received.", + tp->snd_wscale); + tp->snd_wscale = 14; + } } break; case TCPOPT_TIMESTAMP: @@ -388,19 +421,43 @@ static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, return 1; } +#if 0 /* Not working yet... -DaveM */ +static void tcp_compute_tsack(struct sock *sk, struct tcp_opt *tp) +{ + struct sk_buff *skb = skb_peek(&sk->write_queue); + __u32 tstamp = tp->rcv_tsecr; + int fack_count = 0; + + while((skb != NULL) && + (skb != tp->send_head) && + (skb != (struct sk_buff *)&sk->write_queue)) { + if(TCP_SKB_CB(skb)->when == tstamp) { + __u8 sacked = TCP_SKB_CB(skb)->sacked; + + sacked |= TCPCB_SACKED_ACKED; + if(sacked & TCPCB_SACKED_RETRANS) + tp->retrans_out--; + TCP_SKB_CB(skb)->sacked = sacked; + } + if(!before(TCP_SKB_CB(skb)->when, tstamp)) + fack_count++; + skb = skb->next; + } + if(fack_count > tp->fackets_out) + tp->fackets_out = fack_count; +} +#endif + #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ #define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ -static __inline__ void clear_fast_retransmit(struct sock *sk) +static __inline__ void clear_fast_retransmit(struct tcp_opt *tp) { - struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + if (tp->dup_acks > 3) + tp->snd_cwnd = (tp->snd_ssthresh << TCP_CWND_SHIFT); - if (tp->dup_acks > 3) { - tp->retrans_head = NULL; - tp->snd_cwnd = max(tp->snd_ssthresh, 1); - } tp->dup_acks = 0; } @@ -409,10 +466,9 @@ static __inline__ void clear_fast_retransmit(struct sock *sk) */ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) { - struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - /* - * Note: If not_dup is set this implies we got a + /* Note: If not_dup is set this implies we got a * data carrying packet or a window update. * This carries no new information about possible * lost packets, so we have to ignore it for the purposes @@ -422,22 +478,31 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) * the code below much more complex. For now if I see such * a packet I clear the fast retransmit phase. */ - if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) { /* This is the standard reno style fast retransmit branch. */ +#if 0 /* Not working yet... -DaveM */ + /* If not doing SACK, but doing timestamps, compute timestamp + * based pseudo-SACKs when we see duplicate ACKs. + */ + if(!tp->sack_ok && tp->saw_tstamp) + tcp_compute_tsack(sk, tp); +#endif /* 1. When the third duplicate ack is received, set ssthresh * to one half the current congestion window, but no less * than two segments. Retransmit the missing segment. 
*/ if (tp->high_seq == 0 || after(ack, tp->high_seq)) { tp->dup_acks++; - if (tp->dup_acks == 3) { + if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) { tp->dup_acks++; - tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2); - tp->snd_cwnd = tp->snd_ssthresh + 3; + tp->snd_ssthresh = max(tp->snd_cwnd >> (TCP_CWND_SHIFT + 1), 2); + tp->snd_cwnd = (tp->snd_ssthresh + 3) << TCP_CWND_SHIFT; tp->high_seq = tp->snd_nxt; - tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); + if(!tp->fackets_out) + tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); + else + tcp_fack_retransmit(sk); tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); } } @@ -446,10 +511,22 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) * cwnd by the segment size. [...] Transmit a packet... * * Packet transmission will be done on normal flow processing - * since we're not in "retransmit mode" + * since we're not in "retransmit mode". We do not use duplicate + * ACKs to artificially inflate the congestion window when + * doing FACK. */ - if (tp->dup_acks > 3) - tp->snd_cwnd++; + if (tp->dup_acks > 3) { + if(!tp->fackets_out) { + tp->snd_cwnd += (1 << TCP_CWND_SHIFT); + } else { + /* Fill any further holes which may have appeared. + * We may want to change this to run every further + * multiple-of-3 dup ack increments, to be more robust + * against out-of-order packet delivery. -DaveM + */ + tcp_fack_retransmit(sk); + } + } } else if (tp->high_seq != 0) { /* In this branch we deal with clearing the Floyd style * block on duplicate fast retransmits, and if requested @@ -463,15 +540,17 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) * Note that we do want to accept a window * update since this is expected with Hoe's algorithm. */ - clear_fast_retransmit(sk); + clear_fast_retransmit(tp); /* After we have cleared up to high_seq we can * clear the Floyd style block. */ - if (after(ack, tp->high_seq)) + if (!before(ack, tp->high_seq)) { tp->high_seq = 0; + tp->fackets_out = 0; + } } else if (tp->dup_acks >= 3) { - if (sysctl_tcp_hoe_retransmits) { + if (!tp->fackets_out) { /* Hoe Style. We didn't ack the whole * window. Take this as a cue that * another packet was lost and retransmit it. @@ -490,131 +569,34 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); } } else { - /* Reno style. We didn't ack the whole - * window, now we have to drop out of - * fast retransmit and wait for a timeout. + /* FACK style, fill any remaining holes in + * receiver's queue. */ - clear_fast_retransmit(sk); + tcp_fack_retransmit(sk); } } } } -/* - * TCP slow start and congestion avoidance in two flavors: - * RFC 1122 and TCP Vegas. +/* This is Jacobson's slow start and congestion avoidance. + * SIGCOMM '88, p. 328. * - * This is a /proc/sys configurable option. + * FIXME: What happens when the congestion window gets larger + * than the maximum receiver window by some large factor + * Suppose the pipeline never looses packets for a long + * period of time, then traffic increases causing packet loss. + * The congestion window should be reduced, but what it should + * be reduced to is not clear, since 1/2 the old window may + * still be larger than the maximum sending rate we ever achieved. 
*/ - -#define SHIFT_FACTOR 16 - -static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack, - u32 seq_rtt) +static void tcp_cong_avoid(struct tcp_opt *tp, u32 seq, u32 ack, u32 seq_rtt) { - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - unsigned int actual, expected; - unsigned int inv_rtt, inv_basertt, inv_basebd; - u32 snt_bytes; - - /* From: - * TCP Vegas: New Techniques for Congestion - * Detection and Avoidance. - * - * Warning: This code is a scratch implementation taken - * from the paper only. The code they distribute seams - * to have improved several things over the initial spec. - */ - - if (!seq_rtt) - seq_rtt = 1; - - if (tp->basertt) - tp->basertt = min(seq_rtt, tp->basertt); - else - tp->basertt = seq_rtt; - - /* actual = throughput for this segment. - * expected = number_of_bytes in transit / BaseRTT - */ - - snt_bytes = ack - seq; - - inv_rtt = (1 << SHIFT_FACTOR) / seq_rtt; - inv_basertt = (1 << SHIFT_FACTOR) / tp->basertt; - - actual = snt_bytes * inv_rtt; - - expected = (tp->snd_nxt - tp->snd_una) * inv_basertt; - - inv_basebd = sk->mss * inv_basertt; - - /* Slow Start */ - if (tp->snd_cwnd < tp->snd_ssthresh && - (seq == tp->snd_nxt || - (expected - actual <= TCP_VEGAS_GAMMA * inv_basebd))) { - /* "Vegas allows exponential growth only every other RTT" */ - if (tp->snd_cwnd_cnt++) { - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } - } else { - /* Congestion Avoidance */ - if (expected - actual <= TCP_VEGAS_ALPHA * inv_basebd) { - /* Increase Linearly */ - if (tp->snd_cwnd_cnt++ >= tp->snd_cwnd) { - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } - } - - if (expected - actual >= TCP_VEGAS_BETA * inv_basebd) { - /* Decrease Linearly */ - if (tp->snd_cwnd_cnt++ >= tp->snd_cwnd) { - tp->snd_cwnd--; - tp->snd_cwnd_cnt = 0; - } - - /* Never less than 2 segments. */ - if (tp->snd_cwnd < 2) - tp->snd_cwnd = 2; - } - } -} - -static void tcp_cong_avoid_vanj(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt) -{ - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - /* This is Jacobson's slow start and congestion avoidance. - * SIGCOMM '88, p. 328. Because we keep cong_window in - * integral mss's, we can't do cwnd += 1 / cwnd. - * Instead, maintain a counter and increment it once every - * cwnd times. - * FIXME: Check to be sure the mathematics works out right - * on this trick when we have to reduce the congestion window. - * The snd_cwnd_cnt has to be reset properly when reduction events - * happen. - * FIXME: What happens when the congestion window gets larger - * than the maximum receiver window by some large factor - * Suppose the pipeline never looses packets for a long - * period of time, then traffic increases causing packet loss. - * The congestion window should be reduced, but what it should - * be reduced to is not clear, since 1/2 the old window may - * still be larger than the maximum sending rate we ever achieved. - */ - if (tp->snd_cwnd <= tp->snd_ssthresh) { + if ((tp->snd_cwnd>>TCP_CWND_SHIFT) <= tp->snd_ssthresh) { /* In "safe" area, increase. */ - tp->snd_cwnd++; + tp->snd_cwnd += (1 << TCP_CWND_SHIFT); } else { - /* In dangerous area, increase slowly. In theory this is - * tp->snd_cwnd += 1 / tp->snd_cwnd - */ - if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } else - tp->snd_cwnd_cnt++; + /* In dangerous area, increase slowly. 
*/ + tp->snd_cwnd += 1; } } @@ -628,11 +610,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, int acked = 0; while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) { + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + /* If our packet is before the ack sequence we can * discard it as it's confirmed to have arrived at * the other end. */ - if (after(skb->end_seq, ack)) + if (after(scb->end_seq, ack)) break; /* Initial outgoing SYN's get put onto the write_queue @@ -642,17 +626,31 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, * connection startup slow start one packet too * quickly. This is severely frowned upon behavior. */ - if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) { + if(!(scb->flags & TCPCB_FLAG_SYN)) { + __u8 sacked = scb->sacked; + acked |= FLAG_DATA_ACKED; - if(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) + if(sacked & TCPCB_SACKED_RETRANS) { acked |= FLAG_RETRANS_DATA_ACKED; + + /* XXX The race is, fast retrans frame --> + * XXX retrans timeout sends older frame --> + * XXX ACK arrives for fast retrans frame --> + * XXX retrans_out goes negative --> splat. + * XXX Please help me find a better way -DaveM + */ + if(tp->retrans_out) + tp->retrans_out--; + } + if(tp->fackets_out) + tp->fackets_out--; } else { tp->retrans_head = NULL; } tp->packets_out--; - *seq = skb->seq; - *seq_rtt = now - skb->when; - skb_unlink(skb); + *seq = scb->seq; + *seq_rtt = now - scb->when; + __skb_unlink(skb, skb->list); kfree_skb(skb); } @@ -672,7 +670,7 @@ static void tcp_ack_probe(struct sock *sk, __u32 ack) /* should always be non-null */ if (tp->send_head != NULL && - !before (ack + tp->snd_wnd, tp->send_head->end_seq)) { + !before (ack + tp->snd_wnd, TCP_SKB_CB(tp->send_head)->end_seq)) { tp->backoff = 0; tp->pending = 0; tcp_clear_xmit_timer(sk, TIME_PROBE0); @@ -688,11 +686,26 @@ static void tcp_ack_probe(struct sock *sk, __u32 ack) static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp, u32 seq, u32 ack, int flag) { - __u32 seq_rtt = (jiffies-tp->rcv_tsecr); + __u32 seq_rtt; + + /* RTTM Rule: A TSecr value received in a segment is used to + * update the averaged RTT measurement only if the segment + * acknowledges some new data, i.e., only if it advances the + * left edge of the send window. + * + * See draft-ietf-tcplw-high-performance-00, section 3.3. + * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> + */ + if (!(flag & FLAG_DATA_ACKED)) + return; + + seq_rtt = jiffies-tp->rcv_tsecr; tcp_rtt_estimator(tp, seq_rtt); if (tp->retransmits) { if (tp->packets_out == 0) { tp->retransmits = 0; + tp->fackets_out = 0; + tp->retrans_out = 0; tp->backoff = 0; tcp_set_rto(tp); } else { @@ -702,8 +715,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp, } } else { tcp_set_rto(tp); - if (flag & FLAG_DATA_ACKED) - (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt); + tcp_cong_avoid(tp, seq, ack, seq_rtt); } /* NOTE: safe here so long as cong_ctl doesn't use rto */ tcp_bound_rto(tp); @@ -712,7 +724,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp, static void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) { struct sk_buff *skb = skb_peek(&sk->write_queue); - long when = tp->rto - (jiffies - skb->when); + long when = tp->rto - (jiffies - TCP_SKB_CB(skb)->when); /* Some data was ACK'd, if still retransmitting (due to a * timeout), resend more of the retransmit queue. The @@ -801,8 +813,11 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, } else { /* If we were retransmiting don't count rtt estimate. 
*/ if (tp->retransmits) { - if (tp->packets_out == 0) + if (tp->packets_out == 0) { tp->retransmits = 0; + tp->fackets_out = 0; + tp->retrans_out = 0; + } } else { /* We don't have a timestamp. Can only use * packets that are not retransmitted to determine @@ -812,13 +827,14 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, * where the network delay has increased suddenly. * I.e. Karn's algorithm. (SIGCOMM '87, p5.) */ - if ((flag & FLAG_DATA_ACKED) && - !(flag & FLAG_RETRANS_DATA_ACKED)) { - tp->backoff = 0; - tcp_rtt_estimator(tp, seq_rtt); - tcp_set_rto(tp); - tcp_bound_rto(tp); - (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt); + if (flag & FLAG_DATA_ACKED) { + if(!(flag & FLAG_RETRANS_DATA_ACKED)) { + tp->backoff = 0; + tcp_rtt_estimator(tp, seq_rtt); + tcp_set_rto(tp); + tcp_bound_rto(tp); + } + tcp_cong_avoid(tp, seq, ack, seq_rtt); } } } @@ -848,13 +864,12 @@ uninteresting_ack: } /* New-style handling of TIME_WAIT sockets. */ -static void tcp_timewait_kill(unsigned long __arg) -{ - struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)__arg; - - /* Zap the timer. */ - del_timer(&tw->timer); +extern void tcp_tw_schedule(struct tcp_tw_bucket *tw); +extern void tcp_tw_reschedule(struct tcp_tw_bucket *tw); +extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw); +void tcp_timewait_kill(struct tcp_tw_bucket *tw) +{ /* Unlink from various places. */ if(tw->bind_next) tw->bind_next->bind_pprev = tw->bind_pprev; @@ -898,7 +913,7 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, * (2) returns to TIME-WAIT state if the SYN turns out * to be an old duplicate". */ - if(th->syn && !th->rst && after(skb->seq, tw->rcv_nxt)) { + if(th->syn && !th->rst && after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt)) { struct sock *sk; struct tcp_func *af_specific = tw->af_specific; __u32 isn; @@ -906,7 +921,8 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, isn = tw->rcv_nxt + 128000; if(isn == 0) isn++; - tcp_timewait_kill((unsigned long)tw); + tcp_tw_deschedule(tw); + tcp_timewait_kill(tw); sk = af_specific->get_sock(skb, th); if(sk == NULL || !ipsec_sk_policy(sk,skb)) return 0; @@ -923,16 +939,16 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ - if(sysctl_tcp_rfc1337 == 0) - tcp_timewait_kill((unsigned long)tw); - + if(sysctl_tcp_rfc1337 == 0) { + tcp_tw_deschedule(tw); + tcp_timewait_kill(tw); + } if(!th->rst) return 1; /* toss a reset back */ } else { - if(th->ack) { - /* In this case we must reset the TIMEWAIT timer. */ - mod_timer(&tw->timer, jiffies + TCP_TIMEWAIT_LEN); - } + /* In this case we must reset the TIMEWAIT timer. */ + if(th->ack) + tcp_tw_reschedule(tw); } return 0; /* Discard the frame. */ } @@ -1008,11 +1024,7 @@ void tcp_time_wait(struct sock *sk) tcp_tw_hashdance(sk, tw); /* Get the TIME_WAIT timeout firing. */ - init_timer(&tw->timer); - tw->timer.function = tcp_timewait_kill; - tw->timer.data = (unsigned long) tw; - tw->timer.expires = jiffies + TCP_TIMEWAIT_LEN; - add_timer(&tw->timer); + tcp_tw_schedule(tw); /* CLOSE the SK. 
*/ if(sk->state == TCP_ESTABLISHED) @@ -1051,7 +1063,7 @@ void tcp_time_wait(struct sock *sk) static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) { - sk->tp_pinfo.af_tcp.fin_seq = skb->end_seq; + sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq; tcp_send_ack(sk); @@ -1174,14 +1186,14 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb) * "in order". ;-) This also satisfies the requirements * of RFC2018 about ordering of SACKs. */ - if(sp->end_seq == skb->seq) { - sp->end_seq = skb->end_seq; + if(sp->end_seq == TCP_SKB_CB(skb)->seq) { + sp->end_seq = TCP_SKB_CB(skb)->end_seq; tcp_sack_maybe_coalesce(tp, sp); - } else if(sp->start_seq == skb->end_seq) { + } else if(sp->start_seq == TCP_SKB_CB(skb)->end_seq) { /* Re-ordered arrival, in this case, can be optimized * as well. */ - sp->start_seq = skb->seq; + sp->start_seq = TCP_SKB_CB(skb)->seq; tcp_sack_maybe_coalesce(tp, sp); } else { int cur_sacks = tp->num_sacks; @@ -1195,12 +1207,12 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb) int this_sack; for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) { - if((swap->end_seq == skb->seq) || - (swap->start_seq == skb->end_seq)) { - if(swap->end_seq == skb->seq) - swap->end_seq = skb->end_seq; + if((swap->end_seq == TCP_SKB_CB(skb)->seq) || + (swap->start_seq == TCP_SKB_CB(skb)->end_seq)) { + if(swap->end_seq == TCP_SKB_CB(skb)->seq) + swap->end_seq = TCP_SKB_CB(skb)->end_seq; else - swap->start_seq = skb->seq; + swap->start_seq = TCP_SKB_CB(skb)->seq; tcp_sack_swap(sp, swap); tcp_sack_maybe_coalesce(tp, sp); return; @@ -1221,8 +1233,8 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb) } /* Build head SACK, and we're done. */ - sp->start_seq = skb->seq; - sp->end_seq = skb->end_seq; + sp->start_seq = TCP_SKB_CB(skb)->seq; + sp->end_seq = TCP_SKB_CB(skb)->end_seq; if(tp->num_sacks < max_sacks) tp->num_sacks++; } @@ -1234,9 +1246,14 @@ static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb) int num_sacks = tp->num_sacks; int this_sack; - /* We know this removed SKB will eat from the front of a SACK. */ + /* This is an in order data segment _or_ an out-of-order SKB being + * moved to the receive queue, so we know this removed SKB will eat + * from the front of a SACK. + */ for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) { - if(sp->start_seq == skb->seq) + /* Check if the start of the sack is covered by skb. */ + if(!before(sp->start_seq, TCP_SKB_CB(skb)->seq) && + before(sp->start_seq, TCP_SKB_CB(skb)->end_seq)) break; } @@ -1247,7 +1264,7 @@ static void tcp_sack_remove_skb(struct tcp_opt *tp, struct sk_buff *skb) if(this_sack >= num_sacks) return; - sp->start_seq = skb->end_seq; + sp->start_seq = TCP_SKB_CB(skb)->end_seq; if(!before(sp->start_seq, sp->end_seq)) { /* Zap this SACK, by moving forward any other SACKS. 
*/ for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) { @@ -1266,12 +1283,12 @@ static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct int this_sack; for(this_sack = 0; this_sack < num_sacks; this_sack++, tp++) { - if(sp->end_seq == old_skb->end_seq) + if(sp->end_seq == TCP_SKB_CB(old_skb)->end_seq) break; } if(this_sack >= num_sacks) return; - sp->end_seq = new_skb->end_seq; + sp->end_seq = TCP_SKB_CB(new_skb)->end_seq; } /* This one checks to see if we can put data from the @@ -1283,23 +1300,24 @@ static void tcp_ofo_queue(struct sock *sk) struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); while ((skb = skb_peek(&tp->out_of_order_queue))) { - if (after(skb->seq, tp->rcv_nxt)) + if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; - if (!after(skb->end_seq, tp->rcv_nxt)) { + if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { SOCK_DEBUG(sk, "ofo packet was already received \n"); - skb_unlink(skb); + __skb_unlink(skb, skb->list); kfree_skb(skb); continue; } SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", - tp->rcv_nxt, skb->seq, skb->end_seq); + tp->rcv_nxt, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq); if(tp->sack_ok) tcp_sack_remove_skb(tp, skb); - skb_unlink(skb); - skb_queue_tail(&sk->receive_queue, skb); - tp->rcv_nxt = skb->end_seq; + __skb_unlink(skb, skb->list); + __skb_queue_tail(&sk->receive_queue, skb); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; if(skb->h.th->fin) tcp_fin(skb, sk, skb->h.th); } @@ -1314,12 +1332,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) * Packets in sequence go to the receive queue. * Out of sequence packets to out_of_order_queue. */ - if (skb->seq == tp->rcv_nxt) { + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { /* Ok. In sequence. */ queue_and_out: dst_confirm(sk->dst_cache); - skb_queue_tail(&sk->receive_queue, skb); - tp->rcv_nxt = skb->end_seq; + __skb_queue_tail(&sk->receive_queue, skb); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; if(skb->h.th->fin) { tcp_fin(skb, sk, skb->h.th); } else { @@ -1341,18 +1359,19 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) } /* An old packet, either a retransmit or some packet got lost. */ - if (!after(skb->end_seq, tp->rcv_nxt)) { + if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { /* A retransmit, 2nd most common case. Force an imediate ack. */ - SOCK_DEBUG(sk, "retransmit received: seq %X\n", skb->seq); + SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq); tcp_enter_quickack_mode(tp); kfree_skb(skb); return; } - if (before(skb->seq, tp->rcv_nxt)) { + if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { /* Partial packet, seq < rcv_next < end_seq */ SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", - tp->rcv_nxt, skb->seq, skb->end_seq); + tp->rcv_nxt, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq); goto queue_and_out; } @@ -1365,25 +1384,25 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) tp->pred_flags = 0; SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", - tp->rcv_nxt, skb->seq, skb->end_seq); + tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); if (skb_peek(&tp->out_of_order_queue) == NULL) { /* Initial out of order segment, build 1 SACK. 
*/ if(tp->sack_ok) { tp->num_sacks = 1; - tp->selective_acks[0].start_seq = skb->seq; - tp->selective_acks[0].end_seq = skb->end_seq; + tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; + tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq; } - skb_queue_head(&tp->out_of_order_queue,skb); + __skb_queue_head(&tp->out_of_order_queue,skb); } else { for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) { /* Already there. */ - if (skb->seq == skb1->seq) { + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb1)->seq) { if (skb->len >= skb1->len) { if(tp->sack_ok) tcp_sack_extend(tp, skb1, skb); - skb_append(skb1, skb); - skb_unlink(skb1); + __skb_append(skb1, skb); + __skb_unlink(skb1, skb1->list); kfree_skb(skb1); } else { /* A duplicate, smaller than what is in the @@ -1394,8 +1413,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) break; } - if (after(skb->seq, skb1->seq)) { - skb_append(skb1,skb); + if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) { + __skb_append(skb1, skb); if(tp->sack_ok) tcp_sack_new_ofo_skb(sk, skb); break; @@ -1403,7 +1422,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) /* See if we've hit the start. If so insert. */ if (skb1 == skb_peek(&tp->out_of_order_queue)) { - skb_queue_head(&tp->out_of_order_queue,skb); + __skb_queue_head(&tp->out_of_order_queue,skb); if(tp->sack_ok) tcp_sack_new_ofo_skb(sk, skb); break; @@ -1431,6 +1450,20 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) if (skb->len == 0 && !th->fin) return(0); + /* + * If our receive queue has grown past its limits shrink it. + * Make sure to do this before moving snd_nxt, otherwise + * data might be acked for that we don't have enough room. + */ + if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) { + if (prune_queue(sk) < 0) { + /* Still not enough room. That can happen when + * skb->true_size differs significantly from skb->len. + */ + return 0; + } + } + /* We no longer have anyone receiving data on this connection. */ tcp_data_queue(sk, skb); @@ -1455,8 +1488,8 @@ static void tcp_data_snd_check(struct sock *sk) struct sk_buff *skb; if ((skb = tp->send_head)) { - if (!after(skb->end_seq, tp->snd_una + tp->snd_wnd) && - tp->packets_out < tp->snd_cwnd ) { + if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) && + tcp_packets_in_flight(tp) < (tp->snd_cwnd >> TCP_CWND_SHIFT)) { /* Put more data onto the wire. */ tcp_write_xmit(sk); } else if (tp->packets_out == 0 && !tp->pending) { @@ -1488,7 +1521,7 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk) */ /* Two full frames received or... */ - if (((tp->rcv_nxt - tp->rcv_wup) >= (sk->mss << 1)) || + if (((tp->rcv_nxt - tp->rcv_wup) >= sk->mss * MAX_DELAY_ACK) || /* We will update the window "significantly" or... */ tcp_raise_window(sk) || /* We entered "quick ACK" mode or... */ @@ -1590,7 +1623,7 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len * Clean first the out_of_order queue, then the receive queue until * the socket is in its memory limits again. */ -static void prune_queue(struct sock *sk) +static int prune_queue(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; struct sk_buff * skb; @@ -1601,10 +1634,10 @@ static void prune_queue(struct sock *sk) /* Start with the end because there are probably the least * useful packets (crossing fingers). 
*/ - while ((skb = skb_dequeue_tail(&tp->out_of_order_queue))) { + while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue))) { kfree_skb(skb); if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) - return; + return 0; } /* Now continue with the receive queue if it wasn't enough */ @@ -1616,19 +1649,22 @@ static void prune_queue(struct sock *sk) break; /* Never remove packets that have been already acked */ - if (before(skb->end_seq, tp->last_ack_sent+1)) { - printk(KERN_DEBUG "prune_queue: hit acked data c=%x,%x,%x\n", - tp->copied_seq, skb->end_seq, tp->last_ack_sent); - break; + if (before(TCP_SKB_CB(skb)->end_seq, tp->last_ack_sent+1)) { + SOCK_DEBUG(sk, "prune_queue: hit acked data c=%x,%x,%x\n", + tp->copied_seq, TCP_SKB_CB(skb)->end_seq, + tp->last_ack_sent); + return -1; } - skb_unlink(skb); - tp->rcv_nxt = skb->seq; + __skb_unlink(skb, skb->list); + tp->rcv_nxt = TCP_SKB_CB(skb)->seq; SOCK_DEBUG(sk, "prune_queue: removing %x-%x (c=%x)\n", - skb->seq, skb->end_seq, tp->copied_seq); + TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, + tp->copied_seq); kfree_skb(skb); if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) break; } + return 0; } int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, @@ -1658,13 +1694,13 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ if (tcp_fast_parse_options(sk, th, tp)) { if (tp->saw_tstamp) { - if (tcp_paws_discard(tp)) { + if (tcp_paws_discard(tp, th, len)) { if (!th->rst) { tcp_send_ack(sk); goto discard; } } - tcp_replace_ts_recent(tp,skb->end_seq); + tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->end_seq); } } @@ -1678,11 +1714,12 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, * space for instance) */ - if (flg == tp->pred_flags && skb->seq == tp->rcv_nxt) { + if (flg == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { if (len <= th->doff*4) { /* Bulk data transfer: sender */ if (len == th->doff*4) { - tcp_ack(sk, th, skb->seq, skb->ack_seq, len); + tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->ack_seq, len); kfree_skb(skb); tcp_data_snd_check(sk); return 0; @@ -1690,7 +1727,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_statistics.TcpInErrs++; goto discard; } - } else if (skb->ack_seq == tp->snd_una) { + } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una) { /* Bulk data transfer: receiver */ if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) goto discard; @@ -1700,8 +1737,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, /* DO NOT notify forward progress here. * It saves dozen of CPU instructions in fast path. --ANK */ - skb_queue_tail(&sk->receive_queue, skb); - tp->rcv_nxt = skb->end_seq; + __skb_queue_tail(&sk->receive_queue, skb); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; /* FIN bit check is not done since if FIN is set in * this frame, the pred_flags won't match up. 
-DaveM @@ -1719,11 +1756,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, } } - if (!tcp_sequence(tp, skb->seq, skb->end_seq)) { + if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { if (!th->rst) { - if (after(skb->seq, tp->rcv_nxt)) { + if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n", - skb->seq, skb->end_seq, + TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_wup, tp->rcv_wnd); } tcp_send_ack(sk); @@ -1731,7 +1768,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, } } - if(th->syn && skb->seq != tp->syn_seq) { + if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) { SOCK_DEBUG(sk, "syn in established state\n"); tcp_statistics.TcpInErrs++; tcp_reset(sk, skb); @@ -1744,7 +1781,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, } if(th->ack) - tcp_ack(sk, th, skb->seq, skb->ack_seq, len); + tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len); /* Process urgent data. */ tcp_urg(sk, th, len); @@ -1752,13 +1789,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, /* step 7: process the segment text */ queued = tcp_data(skb, sk, len); - tcp_data_snd_check(sk); - - /* If our receive queue has grown past its limits shrink it */ - if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) - prune_queue(sk); - - tcp_ack_snd_check(sk); + /* Be careful, tcp_data() may have put this into TIME_WAIT. */ + if(sk->state != TCP_CLOSE) { + tcp_data_snd_check(sk); + tcp_ack_snd_check(sk); + } if (!queued) { discard: @@ -1768,42 +1803,44 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, return 0; } -/* Shared between IPv4 and IPv6 now. */ -struct sock * -tcp_check_req(struct sock *sk, struct sk_buff *skb, struct open_request *req) +/* + * Process an incoming SYN or SYN-ACK. + */ + +struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, + struct open_request *req) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + u32 flg; /* assumption: the socket is not in use. * as we checked the user count on tcp_rcv and we're * running from a soft interrupt. */ + /* Check for syn retransmission */ + flg = *(((u32 *)skb->h.th) + 3); + + flg &= __constant_htonl(0x00170000); + /* Only SYN set? */ + if (flg == __constant_htonl(0x00020000)) { + if (!after(TCP_SKB_CB(skb)->seq, req->rcv_isn)) { + /* retransmited syn. + */ + req->class->rtx_syn_ack(sk, req); + return NULL; + } else { + return sk; /* Pass new SYN to the listen socket. */ + } + } + + /* We know it's an ACK here */ if (req->sk) { /* socket already created but not * yet accepted()... */ sk = req->sk; } else { - u32 flg; - - /* Check for syn retransmission */ - flg = *(((u32 *)skb->h.th) + 3); - - flg &= __constant_htonl(0x00170000); - /* Only SYN set? */ - if (flg == __constant_htonl(0x00020000)) { - if (!after(skb->seq, req->rcv_isn)) { - /* retransmited syn. - */ - req->class->rtx_syn_ack(sk, req); - return NULL; - } else { - return sk; /* New SYN */ - } - } - - /* We know it's an ACK here */ /* In theory the packet could be for a cookie, but * TIME_WAIT should guard us against this. * XXX: Nevertheless check for cookies? @@ -1811,8 +1848,8 @@ tcp_check_req(struct sock *sk, struct sk_buff *skb, struct open_request *req) * but we do it here to prevent syn flood attackers * from creating big SYN_RECV sockets. 
*/ - if (!between(skb->ack_seq, req->snt_isn, req->snt_isn+1) || - !between(skb->seq, req->rcv_isn, + if (!between(TCP_SKB_CB(skb)->ack_seq, req->snt_isn, req->snt_isn+1) || + !between(TCP_SKB_CB(skb)->seq, req->rcv_isn, req->rcv_isn+1+req->rcv_wnd)) { req->class->send_reset(skb); return NULL; @@ -1885,10 +1922,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * not be in line code. [AC] */ if(th->ack) { - tp->snd_wl1 = skb->seq; + tp->snd_wl1 = TCP_SKB_CB(skb)->seq; /* We got an ack, but it's not a good ack. */ - if(!tcp_ack(sk,th, skb->seq, skb->ack_seq, len)) { + if(!tcp_ack(sk,th, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->ack_seq, len)) { + sk->err = ECONNRESET; + sk->state_change(sk); tcp_statistics.TcpAttemptFails++; return 1; } @@ -1902,6 +1942,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* A valid ack from a different connection * start. Shouldn't happen but cover it. */ + sk->err = ECONNRESET; + sk->state_change(sk); tcp_statistics.TcpAttemptFails++; return 1; } @@ -1909,13 +1951,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Ok.. it's good. Set up sequence numbers and * move to established. */ - tp->rcv_nxt = skb->seq+1; - tp->rcv_wup = skb->seq+1; + tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1; + tp->rcv_wup = TCP_SKB_CB(skb)->seq+1; - tp->snd_wnd = htons(th->window) << tp->snd_wscale; - tp->snd_wl1 = skb->seq; - tp->snd_wl2 = skb->ack_seq; - tp->fin_seq = skb->seq; + /* RFC1323: The window in SYN & SYN/ACK segments is + * never scaled. + */ + tp->snd_wnd = htons(th->window); + tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq; + tp->fin_seq = TCP_SKB_CB(skb)->seq; tcp_set_state(sk, TCP_ESTABLISHED); tcp_parse_options(sk, th, tp, 0); @@ -1924,6 +1969,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->snd_wscale = tp->rcv_wscale = 0; tp->window_clamp = min(tp->window_clamp,65535); } + if (tp->tstamp_ok) { tp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; @@ -1983,11 +2029,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tp->ts_recent_stamp = jiffies; } - tp->rcv_nxt = skb->seq + 1; - tp->rcv_wup = skb->seq + 1; + tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; + /* RFC1323: The window in SYN & SYN/ACK segments is + * never scaled. + */ tp->snd_wnd = htons(th->window); - tp->snd_wl1 = skb->seq; + tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tcp_send_synack(sk); goto discard; @@ -2008,18 +2057,18 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * guarantee this. */ if (tp->saw_tstamp) { - if (tcp_paws_discard(tp)) { + if (tcp_paws_discard(tp, th, len)) { if (!th->rst) { tcp_send_ack(sk); goto discard; } } - tcp_replace_ts_recent(tp,skb->end_seq); + tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->end_seq); } } /* step 1: check sequence number */ - if (!tcp_sequence(tp, skb->seq, skb->end_seq)) { + if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { if (!th->rst) { tcp_send_ack(sk); goto discard; @@ -2050,14 +2099,15 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * original syn. 
*/ - if (th->syn && skb->seq!=tp->syn_seq) { + if (th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) { tcp_reset(sk, skb); return 1; } /* step 5: check the ACK field */ if (th->ack) { - int acceptable = tcp_ack(sk,th,skb->seq, skb->ack_seq,len); + int acceptable = tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->ack_seq, len); switch(sk->state) { case TCP_SYN_RECV: @@ -2069,10 +2119,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if(!sk->dead) sk->state_change(sk); - tp->snd_una = skb->ack_seq; + tp->snd_una = TCP_SKB_CB(skb)->ack_seq; tp->snd_wnd = htons(th->window) << tp->snd_wscale; - tp->snd_wl1 = skb->seq; - tp->snd_wl2 = skb->ack_seq; + tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq; } else { SOCK_DEBUG(sk, "bad ack\n"); @@ -2092,8 +2142,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, break; case TCP_CLOSING: - if (tp->snd_una == tp->write_seq) + if (tp->snd_una == tp->write_seq) { tcp_time_wait(sk); + goto discard; + } break; case TCP_LAST_ACK: @@ -2117,7 +2169,7 @@ step6: switch (sk->state) { case TCP_CLOSE_WAIT: case TCP_CLOSING: - if (!before(skb->seq, tp->fin_seq)) + if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq)) break; case TCP_FIN_WAIT1: @@ -2127,7 +2179,7 @@ step6: * BSD 4.4 also does reset. */ if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) { - if (after(skb->end_seq - th->fin, tp->rcv_nxt)) { + if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { tcp_reset(sk, skb); return 1; } @@ -2135,10 +2187,6 @@ step6: case TCP_ESTABLISHED: queued = tcp_data(skb, sk, len); - - /* This can only happen when MTU+skbheader > rcvbuf */ - if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) - prune_queue(sk); break; } @@ -2151,26 +2199,3 @@ discard: } return 0; } - -int tcp_sysctl_congavoid(ctl_table *ctl, int write, struct file * filp, - void *buffer, size_t *lenp) -{ - int val = sysctl_tcp_cong_avoidance; - int retv; - static tcp_sys_cong_ctl_t tab[] = { - tcp_cong_avoid_vanj, - tcp_cong_avoid_vegas - }; - - retv = proc_dointvec(ctl, write, filp, buffer, lenp); - - if (write) { - if ((unsigned)sysctl_tcp_cong_avoidance > 1) { - retv = -EINVAL; - sysctl_tcp_cong_avoidance = val; - } else { - tcp_sys_cong_ctl_f = tab[sysctl_tcp_cong_avoidance]; - } - } - return retv; -} diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ee53f47d6..08ca40a4b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.119 1998/03/22 19:14:47 davem Exp $ + * Version: $Id: tcp_ipv4.c,v 1.142 1998/04/30 12:00:45 davem Exp $ * * IPv4 specific functions * @@ -38,18 +38,20 @@ * open_request handling and moved * most of it into the af independent code. * Added tail drop and some other bugfixes. - * Added new listen sematics (ifdefed by - * NEW_LISTEN for now) + * Added new listen sematics. * Mike McLagan : Routing by source * Juan Jose Ciarlante: ip_dynaddr bits * Andi Kleen: various fixes. * Vitaly E. Lavrov : Transparent proxy revived after year coma. + * Andi Kleen : Fix new listen. */ #include <linux/config.h> #include <linux/types.h> +#include <linux/stddef.h> #include <linux/fcntl.h> #include <linux/random.h> +#include <linux/init.h> #include <linux/ipsec.h> #include <net/icmp.h> @@ -69,6 +71,10 @@ extern int sysctl_ip_dynaddr; /* Check TCP sequence numbers in ICMP packets. 
*/ #define ICMP_MIN_LENGTH 8 +/* Socket used for sending RSTs */ +struct inode tcp_inode; +struct socket *tcp_socket=&tcp_inode.u.socket_i; + static void tcp_v4_send_reset(struct sk_buff *skb); void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, @@ -160,6 +166,18 @@ struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum) return tb; } +/* Ensure that the bound bucket for the port exists. + * Return 0 on success. + */ +static __inline__ int tcp_bucket_check(unsigned short snum) +{ + if (tcp_bound_hash[tcp_bhashfn(snum)] == NULL && + tcp_bucket_create(snum) == NULL) + return 1; + else + return 0; +} + static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum) { struct tcp_bind_bucket *tb; @@ -336,9 +354,6 @@ static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int d return result; } -/* Until this is verified... -DaveM */ -/* #define USE_QUICKSYNS */ - /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM * It is assumed that this code only gets called from within NET_BH. @@ -347,24 +362,15 @@ static inline struct sock *__tcp_v4_lookup(struct tcphdr *th, u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) { - unsigned short hnum = ntohs(dport); + TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) + __u16 hnum = ntohs(dport); + __u32 ports = TCP_COMBINED_PORTS(sport, hnum); struct sock *sk; int hash; -#ifdef USE_QUICKSYNS - /* Incomming connection short-cut. */ - if (th && th->syn == 1 && th->ack == 0) - goto listener_shortcut; -#endif - /* Check TCP register quick cache first. */ sk = TCP_RHASH(sport); - if(sk && - sk->daddr == saddr && /* remote address */ - sk->dport == sport && /* remote port */ - sk->num == hnum && /* local port */ - sk->rcv_saddr == daddr && /* local address */ - (!sk->bound_dev_if || sk->bound_dev_if == dif)) + if(sk && TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) goto hit; /* Optimize here for direct hit, only listening connections can @@ -372,28 +378,16 @@ static inline struct sock *__tcp_v4_lookup(struct tcphdr *th, */ hash = tcp_hashfn(daddr, hnum, saddr, sport); for(sk = tcp_established_hash[hash]; sk; sk = sk->next) { - if(sk->daddr == saddr && /* remote address */ - sk->dport == sport && /* remote port */ - sk->num == hnum && /* local port */ - sk->rcv_saddr == daddr && /* local address */ - (!sk->bound_dev_if || sk->bound_dev_if == dif)) { + if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) { if (sk->state == TCP_ESTABLISHED) TCP_RHASH(sport) = sk; goto hit; /* You sunk my battleship! */ } } /* Must check for a TIME_WAIT'er before going to listener hash. 
*/ - for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next) { - if(sk->daddr == saddr && /* remote address */ - sk->dport == sport && /* remote port */ - sk->num == hnum && /* local port */ - sk->rcv_saddr == daddr && /* local address */ - (!sk->bound_dev_if || sk->bound_dev_if == dif)) + for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next) + if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) goto hit; - } -#ifdef USE_QUICKSYNS -listener_shortcut: -#endif sk = tcp_v4_lookup_listener(daddr, hnum, dif); hit: return sk; @@ -601,8 +595,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk->mtu = 64; /* Sanity limit */ mss = sk->mtu - sizeof(struct iphdr); - if (sk->opt) - mss -= sk->opt->optlen; tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr, sk->sport, usin->sin_port); @@ -773,8 +765,8 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) switch (type) { case ICMP_SOURCE_QUENCH: #ifndef OLD_SOURCE_QUENCH /* This is deprecated */ - tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2); - tp->snd_cwnd = tp->snd_ssthresh; + tp->snd_ssthresh = max(tp->snd_cwnd >> (1 + TCP_CWND_SHIFT), 2); + tp->snd_cwnd = (tp->snd_ssthresh << TCP_CWND_SHIFT); tp->high_seq = tp->snd_nxt; #endif return; @@ -826,6 +818,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) if (req->sk) { /* not yet accept()ed */ sk = req->sk; /* report error in accept */ } else { + tp->syn_backlog--; tcp_synq_unlink(tp, req, prev); req->class->destructor(req); tcp_openreq_free(req); @@ -875,49 +868,42 @@ void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, static void tcp_v4_send_reset(struct sk_buff *skb) { struct tcphdr *th = skb->h.th; + struct tcphdr rth; + struct ip_reply_arg arg; /* Never send a reset in response to a reset. */ - if (th->rst == 0) { - struct tcphdr *th = skb->h.th; - struct sk_buff *skb1 = ip_reply(skb, sizeof(struct tcphdr)); - struct tcphdr *th1; - - if (skb1 == NULL) - return; - - skb1->h.th = th1 = (struct tcphdr *) - skb_put(skb1, sizeof(struct tcphdr)); - - /* Swap the send and the receive. */ - memset(th1, 0, sizeof(*th1)); - th1->dest = th->source; - th1->source = th->dest; - th1->doff = sizeof(*th1)/4; - th1->rst = 1; - - if (th->ack) { - th1->seq = th->ack_seq; - } else { - th1->ack = 1; - if (!th->syn) - th1->ack_seq = th->seq; - else - th1->ack_seq = htonl(ntohl(th->seq)+1); - } - skb1->csum = csum_partial((u8 *) th1, sizeof(*th1), 0); - th1->check = tcp_v4_check(th1, sizeof(*th1), skb1->nh.iph->saddr, - skb1->nh.iph->daddr, skb1->csum); - - /* Finish up some IP bits. */ - skb1->nh.iph->tot_len = htons(skb1->len); - ip_send_check(skb1->nh.iph); + if (th->rst) + return; - /* All the other work was done by ip_reply(). */ - skb1->dst->output(skb1); + /* Swap the send and the receive. */ + memset(&rth, 0, sizeof(struct tcphdr)); + rth.dest = th->source; + rth.source = th->dest; + rth.doff = sizeof(struct tcphdr)/4; + rth.rst = 1; - tcp_statistics.TcpOutSegs++; - tcp_statistics.TcpOutRsts++; + if (th->ack) { + rth.seq = th->ack_seq; + } else { + rth.ack = 1; + rth.ack_seq = th->syn ? 
htonl(ntohl(th->seq)+1) : th->seq; } + + memset(&arg, 0, sizeof arg); + arg.iov[0].iov_base = (unsigned char *)&rth; + arg.iov[0].iov_len = sizeof rth; + arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr, + skb->nh.iph->saddr, /*XXX*/ + sizeof(struct tcphdr), + IPPROTO_TCP, + 0); + arg.n_iov = 1; + arg.csumoffset = offsetof(struct tcphdr, check) / sizeof(u16); + + ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth); + + tcp_statistics.TcpOutSegs++; + tcp_statistics.TcpOutRsts++; } #ifdef CONFIG_IP_TRANSPARENT_PROXY @@ -971,8 +957,6 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) } mss = (rt->u.dst.pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr)); - if (opt) - mss -= opt->optlen; skb = tcp_make_synack(sk, &rt->u.dst, req, mss); if (skb) { @@ -1033,7 +1017,6 @@ tcp_v4_save_options(struct sock *sk, struct sk_buff *skb, } int sysctl_max_syn_backlog = 1024; -int sysctl_tcp_syn_taildrop = 1; struct or_calltable or_ipv4 = { tcp_v4_send_synack, @@ -1041,13 +1024,8 @@ struct or_calltable or_ipv4 = { tcp_v4_send_reset }; -#ifdef NEW_LISTEN #define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */ #define BACKLOGMAX(sk) sysctl_max_syn_backlog -#else -#define BACKLOG(sk) ((sk)->ack_backlog) -#define BACKLOGMAX(sk) ((sk)->max_ack_backlog) -#endif int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, __u32 isn) @@ -1073,17 +1051,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, if (sysctl_tcp_syncookies) { syn_flood_warning(skb); want_cookie = 1; - } else + } else #endif - if (sysctl_tcp_syn_taildrop) { - struct open_request *req; - - req = tcp_synq_unlink_tail(&sk->tp_pinfo.af_tcp); - tcp_openreq_free(req); - tcp_statistics.TcpAttemptFails++; - } else { - goto error; - } + goto drop; } else { if (isn == 0) isn = tcp_v4_init_sequence(sk, skb); @@ -1092,13 +1062,12 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, req = tcp_openreq_alloc(); if (req == NULL) { - if (!want_cookie) BACKLOG(sk)--; - goto error; + goto dropbacklog; } req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ - req->rcv_isn = skb->seq; + req->rcv_isn = TCP_SKB_CB(skb)->seq; tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0; tp.in_mss = 536; tcp_parse_options(NULL, th, &tp, want_cookie); @@ -1153,7 +1122,10 @@ dead: tcp_statistics.TcpAttemptFails++; return -ENOTCONN; /* send reset */ -error: +dropbacklog: + if (!want_cookie) + BACKLOG(sk)--; +drop: tcp_statistics.TcpAttemptFails++; return 0; } @@ -1200,19 +1172,25 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->ato = 0; newtp->snd_wl1 = req->rcv_isn; newtp->snd_wl2 = req->snt_isn; + + /* RFC1323: The window in SYN & SYN/ACK segments + * is never scaled. 
+ */ newtp->snd_wnd = ntohs(skb->h.th->window); + newtp->max_window = newtp->snd_wnd; newtp->pending = 0; newtp->retransmits = 0; newtp->last_ack_sent = req->rcv_isn + 1; newtp->backoff = 0; newtp->mdev = TCP_TIMEOUT_INIT; - newtp->snd_cwnd = 1; + newtp->snd_cwnd = (1 << TCP_CWND_SHIFT); newtp->rto = TCP_TIMEOUT_INIT; newtp->packets_out = 0; + newtp->fackets_out = 0; + newtp->retrans_out = 0; newtp->high_seq = 0; newtp->snd_ssthresh = 0x7fffffff; - newtp->snd_cwnd_cnt = 0; newtp->dup_acks = 0; newtp->delayed_acks = 0; init_timer(&newtp->retransmit_timer); @@ -1285,6 +1263,10 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, return newsk; } +/* + * The three way handshake has completed - we got a valid synack - + * now create the new socket. + */ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, struct open_request *req, struct dst_entry *dst) @@ -1295,35 +1277,40 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, int snd_mss; int mtu; -#ifdef NEW_LISTEN if (sk->ack_backlog > sk->max_ack_backlog) goto exit; /* head drop */ -#endif if (dst == NULL) { struct rtable *rt; if (ip_route_output(&rt, - opt && opt->srr ? opt->faddr : req->af.v4_req.rmt_addr, - req->af.v4_req.loc_addr, sk->ip_tos|RTO_CONN, 0)) + opt && opt->srr ? opt->faddr : req->af.v4_req.rmt_addr, + req->af.v4_req.loc_addr, sk->ip_tos|RTO_CONN, 0)) return NULL; dst = &rt->u.dst; } - -#ifdef NEW_LISTEN - sk->ack_backlog++; +#ifdef CONFIG_IP_TRANSPARENT_PROXY + /* The new socket created for transparent proxy may fall + * into a non-existed bind bucket because sk->num != newsk->num. + * Ensure existance of the bucket now. The placement of the check + * later will require to destroy just created newsk in the case of fail. + * 1998/04/22 Andrey V. Savochkin <saw@msu.ru> + */ + if (tcp_bucket_check(ntohs(skb->h.th->dest))) + goto exit; #endif mtu = dst->pmtu; - if (mtu < 68) + if (mtu < 68) /* XXX: we should turn pmtu disc off when this happens. */ mtu = 68; snd_mss = mtu - sizeof(struct iphdr); - if (opt) - snd_mss -= opt->optlen; newsk = tcp_create_openreq_child(sk, req, skb, snd_mss); if (!newsk) goto exit; + sk->tp_pinfo.af_tcp.syn_backlog--; + sk->ack_backlog++; + newsk->dst_cache = dst; newtp = &(newsk->tp_pinfo.af_tcp); @@ -1337,8 +1324,7 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->opt = req->af.v4_req.opt; newsk->mtu = mtu; - /* Must use the af_specific ops here for the case of IPv6 mapped. */ - newsk->prot->hash(newsk); + tcp_v4_hash(newsk); add_to_prot_sklist(newsk); return newsk; @@ -1357,11 +1343,15 @@ static void tcp_v4_rst_req(struct sock *sk, struct sk_buff *skb) if (!req) return; /* Sequence number check required by RFC793 */ - if (before(skb->seq, req->snt_isn) || after(skb->seq, req->snt_isn+1)) + if (before(TCP_SKB_CB(skb)->seq, req->snt_isn) || + after(TCP_SKB_CB(skb)->seq, req->snt_isn+1)) return; tcp_synq_unlink(tp, req, prev); + (req->sk ? 
sk->ack_backlog : tp->syn_backlog)--; req->class->destructor(req); tcp_openreq_free(req); + + net_statistics.EmbryonicRsts++; } /* Check for embryonic sockets (open_requests) We check packets with @@ -1391,9 +1381,9 @@ static inline struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb) sk = tcp_check_req(sk, skb, req); } #ifdef CONFIG_SYN_COOKIES - else { + else { sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); - } + } #endif } return sk; @@ -1485,9 +1475,14 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) skb->csum = csum_partial((char *)th, len, 0); case CHECKSUM_HW: if (tcp_v4_check(th,len,skb->nh.iph->saddr,skb->nh.iph->daddr,skb->csum)) { - printk(KERN_DEBUG "TCPv4 bad checksum from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, len=%d/%d/%d\n", - NIPQUAD(skb->nh.iph->saddr), ntohs(th->source), NIPQUAD(skb->nh.iph->daddr), - ntohs(th->dest), len, skb->len, ntohs(skb->nh.iph->tot_len)); + printk(KERN_DEBUG "TCPv4 bad checksum from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, " + "len=%d/%d/%d\n", + NIPQUAD(skb->nh.iph->saddr), + ntohs(th->source), + NIPQUAD(skb->nh.iph->daddr), + ntohs(th->dest), + len, skb->len, + ntohs(skb->nh.iph->tot_len)); tcp_statistics.TcpInErrs++; goto discard_it; } @@ -1509,9 +1504,10 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) if(!ipsec_sk_policy(sk,skb)) goto discard_it; - skb->seq = ntohl(th->seq); - skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4; - skb->ack_seq = ntohl(th->ack_seq); + TCP_SKB_CB(skb)->seq = ntohl(th->seq); + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + + len - th->doff*4); + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); skb->used = 0; @@ -1658,11 +1654,11 @@ static int tcp_v4_init_sock(struct sock *sk) tp->rto = TCP_TIMEOUT_INIT; /*TCP_WRITE_TIME*/ tp->mdev = TCP_TIMEOUT_INIT; tp->in_mss = 536; - + /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. */ - tp->snd_cwnd = 1; + tp->snd_cwnd = (1 << TCP_CWND_SHIFT); tp->snd_ssthresh = 0x7fffffff; /* Infinity */ sk->priority = 1; @@ -1690,11 +1686,11 @@ static int tcp_v4_destroy_sock(struct sock *sk) tcp_dec_slow_timer(TCP_SLT_KEEPALIVE); /* Cleanup up the write buffer. */ - while((skb = skb_dequeue(&sk->write_queue)) != NULL) + while((skb = __skb_dequeue(&sk->write_queue)) != NULL) kfree_skb(skb); /* Cleans up our, hopefuly empty, out_of_order_queue. */ - while((skb = skb_dequeue(&tp->out_of_order_queue)) != NULL) + while((skb = __skb_dequeue(&tp->out_of_order_queue)) != NULL) kfree_skb(skb); /* Clean up a locked TCP bind bucket, this only happens if a @@ -1739,3 +1735,25 @@ struct proto tcp_prot = { 0, /* inuse */ 0 /* highestinuse */ }; + + + +__initfunc(void tcp_v4_init(struct net_proto_family *ops)) +{ + int err; + + tcp_inode.i_mode = S_IFSOCK; + tcp_inode.i_sock = 1; + tcp_inode.i_uid = 0; + tcp_inode.i_gid = 0; + + tcp_socket->inode = &tcp_inode; + tcp_socket->state = SS_UNCONNECTED; + tcp_socket->type=SOCK_RAW; + + if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0) + panic("Failed to create the TCP control socket.\n"); + tcp_socket->sk->allocation=GFP_ATOMIC; + tcp_socket->sk->num = 256; /* Don't receive any data */ + tcp_socket->sk->ip_ttl = MAXTTL; +} diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 465ee3fdc..482ca262c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). 
* - * Version: $Id: tcp_output.c,v 1.76 1998/03/22 22:10:24 davem Exp $ + * Version: $Id: tcp_output.c,v 1.87 1998/04/26 01:11:35 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -37,6 +37,10 @@ extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; +extern int sysctl_tcp_sack; + +/* People can turn this off for buggy TCP's found in printers etc. */ +int sysctl_tcp_retrans_collapse = 1; /* Get rid of any delayed acks, we sent one already.. */ static __inline__ void clear_delayed_acks(struct sock * sk) @@ -99,25 +103,29 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) /* Build TCP header and checksum it. */ th->source = sk->sport; th->dest = sk->dport; - th->seq = htonl(skb->seq); + th->seq = htonl(TCP_SKB_CB(skb)->seq); th->ack_seq = htonl(tp->rcv_nxt); th->doff = (tcp_header_size >> 2); th->res1 = 0; *(((__u8 *)th) + 13) = tcb->flags; - th->window = htons(tcp_select_window(sk)); + if(!(tcb->flags & TCPCB_FLAG_SYN)) + th->window = htons(tcp_select_window(sk)); th->check = 0; th->urg_ptr = ntohs(tcb->urg_ptr); if(tcb->flags & TCPCB_FLAG_SYN) { + /* RFC1323: The window in SYN & SYN/ACK segments + * is never scaled. + */ th->window = htons(tp->rcv_wnd); tcp_syn_build_options((__u32 *)(th + 1), sk->mss, sysctl_tcp_timestamps, sysctl_tcp_sack, sysctl_tcp_window_scaling, tp->rcv_wscale, - skb->when); + TCP_SKB_CB(skb)->when); } else { tcp_build_and_update_options((__u32 *)(th + 1), - tp, skb->when); + tp, TCP_SKB_CB(skb)->when); } tp->af_specific->send_check(sk, th, skb->len, skb); @@ -136,13 +144,13 @@ void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue) struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); /* Advance write_seq and place onto the write_queue. */ - tp->write_seq += (skb->end_seq - skb->seq); - skb_queue_tail(&sk->write_queue, skb); + tp->write_seq += (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq); + __skb_queue_tail(&sk->write_queue, skb); if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) { /* Send it out now. */ - skb->when = jiffies; - tp->snd_nxt = skb->end_seq; + TCP_SKB_CB(skb)->when = jiffies; + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; tp->packets_out++; tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)); if(!tcp_timer_is_set(sk, TIME_RETRANS)) @@ -171,9 +179,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) /* Get a new skb... force flag on. */ buff = sock_wmalloc(sk, - (nsize + - MAX_HEADER + - sk->prot->max_header + 15), + (nsize + MAX_HEADER + sk->prot->max_header), 1, GFP_ATOMIC); if (buff == NULL) return -1; /* We'll just try again later. */ @@ -182,8 +188,8 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) skb_reserve(buff, MAX_HEADER + sk->prot->max_header); /* Correct the sequence numbers. */ - buff->seq = skb->seq + len; - buff->end_seq = skb->end_seq; + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)->flags; @@ -209,14 +215,14 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) buff->csum = csum_partial_copy(skb->data + len, skb_put(buff, nsize), nsize, 0); - skb->end_seq -= nsize; + TCP_SKB_CB(skb)->end_seq -= nsize; skb_trim(skb, skb->len - nsize); /* Rechecksum original buffer. */ skb->csum = csum_partial(skb->data, skb->len, 0); /* Link BUFF into the send queue. 
*/ - skb_append(skb, buff); + __skb_append(skb, buff); return 0; } @@ -228,18 +234,14 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) void tcp_write_xmit(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int mss_now = sk->mss; + unsigned int mss_now; /* Account for SACKS, we may need to fragment due to this. * It is just like the real MSS changing on us midstream. * We also handle things correctly when the user adds some * IP options mid-stream. Silly to do, but cover it. */ - if(tp->sack_ok && tp->num_sacks) - mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + - (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); - if(sk->opt && sk->opt->optlen) - mss_now -= sk->opt->optlen; + mss_now = tcp_current_mss(sk); /* If we are zapped, the bytes will have to remain here. * In time closedown will empty the write queue and all @@ -264,8 +266,8 @@ void tcp_write_xmit(struct sock *sk) /* Advance the send_head. This one is going out. */ update_send_head(sk); - skb->when = jiffies; - tp->snd_nxt = skb->end_seq; + TCP_SKB_CB(skb)->when = jiffies; + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; tp->packets_out++; tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); sent_pkts = 1; @@ -397,7 +399,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m return; /* Ok. We will be able to collapse the packet. */ - skb_unlink(next_skb); + __skb_unlink(next_skb, next_skb->list); if(skb->len % 4) { /* Must copy and rechecksum all data. */ @@ -413,7 +415,8 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m } /* Update sequence range on original skb. */ - skb->end_seq += next_skb->end_seq - next_skb->seq; + TCP_SKB_CB(skb)->end_seq += + TCP_SKB_CB(next_skb)->end_seq - TCP_SKB_CB(next_skb)->seq; /* Merge over control information. */ flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */ @@ -432,22 +435,28 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m } /* Do a simple retransmit without using the backoff mechanisms in - * tcp_timer. This is used to speed up path mtu recovery. Note that - * these simple retransmits aren't counted in the usual tcp retransmit - * backoff counters. + * tcp_timer. This is used for path mtu discovery. * The socket is already locked here. */ void tcp_simple_retransmit(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb; + unsigned int mss = tcp_current_mss(sk); /* Don't muck with the congestion window here. */ tp->dup_acks = 0; tp->high_seq = tp->snd_nxt; - - /* FIXME: make the current rtt sample invalid */ tp->retrans_head = NULL; - tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); + + /* Input control flow will see that this was retransmitted + * and not use it for RTT calculation in the absence of + * the timestamp option. + */ + for (skb = skb_peek(&sk->write_queue); skb != tp->send_head; + skb = skb->next) + if (skb->len > mss) + tcp_retransmit_skb(sk, skb); } static __inline__ void update_retrans_head(struct sock *sk) @@ -467,17 +476,10 @@ static __inline__ void update_retrans_head(struct sock *sk) int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int current_mss = sk->mss; - - /* Account for outgoing SACKS and IP options, if any. 
*/ - if(tp->sack_ok && tp->num_sacks) - current_mss -= (TCPOLEN_SACK_BASE_ALIGNED + - (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); - if(sk->opt && sk->opt->optlen) - current_mss -= sk->opt->optlen; + unsigned int cur_mss = tcp_current_mss(sk); - if(skb->len > current_mss) { - if(tcp_fragment(sk, skb, current_mss)) + if(skb->len > cur_mss) { + if(tcp_fragment(sk, skb, cur_mss)) return 1; /* We'll try again later. */ /* New SKB created, account for it. */ @@ -486,21 +488,23 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) /* Collapse two adjacent packets if worthwhile and we can. */ if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && - (skb->len < (current_mss >> 1)) && + (skb->len < (cur_mss >> 1)) && (skb->next != tp->send_head) && - (skb->next != (struct sk_buff *)&sk->write_queue)) - tcp_retrans_try_collapse(sk, skb, current_mss); + (skb->next != (struct sk_buff *)&sk->write_queue) && + (sysctl_tcp_retrans_collapse != 0)) + tcp_retrans_try_collapse(sk, skb, cur_mss); if(tp->af_specific->rebuild_header(sk)) return 1; /* Routing failure or similar. */ /* Ok, we're gonna send it out, update state. */ TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_RETRANS; + tp->retrans_out++; /* Make a copy, if the first transmission SKB clone we made * is still in somebodies hands, else make a clone. */ - skb->when = jiffies; + TCP_SKB_CB(skb)->when = jiffies; if(skb_cloned(skb)) skb = skb_copy(skb, GFP_ATOMIC); else @@ -518,12 +522,14 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) * retransmitted data is acknowledged. It tries to continue * resending the rest of the retransmit queue, until either * we've sent it all or the congestion window limit is reached. + * If doing SACK, the first ACK which comes back for a timeout + * based retransmit packet might feed us FACK information again. + * If so, we use it to avoid unnecessarily retransmissions. */ void tcp_xmit_retransmit_queue(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; - int ct = 0; if (tp->retrans_head == NULL) tp->retrans_head = skb_peek(&sk->write_queue); @@ -539,19 +545,48 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if(tcp_retransmit_skb(sk, skb)) break; - /* Count retransmissions locally. */ - ct++; - /* Stop retransmitting if we've hit the congestion * window limit. */ - if (ct >= tp->snd_cwnd) + if (tp->retrans_out >= (tp->snd_cwnd >> TCP_CWND_SHIFT)) break; } update_retrans_head(sk); } } +/* Using FACK information, retransmit all missing frames at the receiver + * up to the forward most SACK'd packet (tp->fackets_out) if the packet + * has not been retransmitted already. + */ +void tcp_fack_retransmit(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb = skb_peek(&sk->write_queue); + int packet_cnt = 0; + + while((skb != NULL) && + (skb != tp->send_head) && + (skb != (struct sk_buff *)&sk->write_queue)) { + __u8 sacked = TCP_SKB_CB(skb)->sacked; + + if(sacked & (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS)) + goto next_packet; + + /* Ok, retransmit it. */ + if(tcp_retransmit_skb(sk, skb)) + break; + + if(tcp_packets_in_flight(tp) >= (tp->snd_cwnd >> TCP_CWND_SHIFT)) + break; +next_packet: + packet_cnt++; + if(packet_cnt >= tp->fackets_out) + break; + skb = skb->next; + } +} + /* Send a fin. The caller locks the socket for us. This cannot be * allowed to fail queueing a FIN frame under any circumstances. 
*/ @@ -559,22 +594,37 @@ void tcp_send_fin(struct sock *sk) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb = skb_peek_tail(&sk->write_queue); - int mss_now = sk->mss; + unsigned int mss_now; /* Optimization, tack on the FIN if we have a queue of * unsent frames. But be careful about outgoing SACKS * and IP options. */ - if(tp->sack_ok && tp->num_sacks) - mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + - (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); - if(sk->opt && sk->opt->optlen) - mss_now -= sk->opt->optlen; + mss_now = tcp_current_mss(sk); + if((tp->send_head != NULL) && (skb->len < mss_now)) { /* tcp_write_xmit() takes care of the rest. */ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; - skb->end_seq++; + TCP_SKB_CB(skb)->end_seq++; tp->write_seq++; + + /* Special case to avoid Nagle bogosity. If this + * segment is the last segment, and it was queued + * due to Nagle/SWS-avoidance, send it out now. + */ + if(tp->send_head == skb && + !sk->nonagle && + skb->len < (sk->mss >> 1) && + tp->packets_out && + !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) { + update_send_head(sk); + TCP_SKB_CB(skb)->when = jiffies; + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tp->packets_out++; + tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + if(!tcp_timer_is_set(sk, TIME_RETRANS)) + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + } } else { /* Socket is locked, keep trying until memory is available. */ do { @@ -592,8 +642,8 @@ void tcp_send_fin(struct sock *sk) TCP_SKB_CB(skb)->urg_ptr = 0; /* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */ - skb->seq = tp->write_seq; - skb->end_seq = skb->seq + 1; + TCP_SKB_CB(skb)->seq = tp->write_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; tcp_send_skb(sk, skb, 0); } } @@ -621,9 +671,9 @@ void tcp_send_active_reset(struct sock *sk) TCP_SKB_CB(skb)->urg_ptr = 0; /* Send it off. */ - skb->seq = tp->write_seq; - skb->end_seq = skb->seq; - skb->when = jiffies; + TCP_SKB_CB(skb)->seq = tp->write_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; + TCP_SKB_CB(skb)->when = jiffies; tcp_transmit_skb(sk, skb); } @@ -650,15 +700,18 @@ int tcp_send_synack(struct sock *sk) TCP_SKB_CB(skb)->urg_ptr = 0; /* SYN eats a sequence byte. */ - skb->seq = tp->snd_una; - skb->end_seq = skb->seq + 1; - skb_queue_tail(&sk->write_queue, skb); - skb->when = jiffies; + TCP_SKB_CB(skb)->seq = tp->snd_una; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; + __skb_queue_tail(&sk->write_queue, skb); + TCP_SKB_CB(skb)->when = jiffies; tp->packets_out++; tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); return 0; } +/* + * Prepare a SYN-ACK. + */ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct open_request *req, int mss) { @@ -705,9 +758,9 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, th->ack = 1; th->source = sk->sport; th->dest = req->rmt_port; - skb->seq = req->snt_isn; - skb->end_seq = skb->seq + 1; - th->seq = htonl(skb->seq); + TCP_SKB_CB(skb)->seq = req->snt_isn; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; + th->seq = htonl(TCP_SKB_CB(skb)->seq); th->ack_seq = htonl(req->rcv_isn + 1); if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ __u8 rcv_wscale; @@ -720,16 +773,18 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, &rcv_wscale); req->rcv_wscale = rcv_wscale; } + + /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. 
*/ th->window = htons(req->rcv_wnd); - skb->when = jiffies; + TCP_SKB_CB(skb)->when = jiffies; tcp_syn_build_options((__u32 *)(th + 1), req->mss, req->tstamp_ok, req->sack_ok, req->wscale_ok, req->rcv_wscale, - skb->when); + TCP_SKB_CB(skb)->when); skb->csum = 0; th->doff = (tcp_header_size >> 2); - tcp_statistics.TcpOutSegs++; + tcp_statistics.TcpOutSegs++; return skb; } @@ -774,9 +829,9 @@ void tcp_connect(struct sock *sk, struct sk_buff *buff, int mss) TCP_SKB_CB(buff)->sacked = 0; TCP_SKB_CB(buff)->urg_ptr = 0; buff->csum = 0; - buff->seq = tp->write_seq++; - buff->end_seq = tp->write_seq; - tp->snd_nxt = buff->end_seq; + TCP_SKB_CB(buff)->seq = tp->write_seq++; + TCP_SKB_CB(buff)->end_seq = tp->write_seq; + tp->snd_nxt = TCP_SKB_CB(buff)->end_seq; tp->window_clamp = dst->window; tcp_select_initial_window(sock_rspace(sk)/2,sk->mss, @@ -784,7 +839,6 @@ void tcp_connect(struct sock *sk, struct sk_buff *buff, int mss) &tp->window_clamp, sysctl_tcp_window_scaling, &tp->rcv_wscale); - /* Ok, now lock the socket before we make it visible to * the incoming packet engine. */ @@ -800,10 +854,12 @@ void tcp_connect(struct sock *sk, struct sk_buff *buff, int mss) tp->rto = dst->rtt; tcp_init_xmit_timers(sk); tp->retransmits = 0; + tp->fackets_out = 0; + tp->retrans_out = 0; /* Send it off. */ - skb_queue_tail(&sk->write_queue, buff); - buff->when = jiffies; + __skb_queue_tail(&sk->write_queue, buff); + TCP_SKB_CB(buff)->when = jiffies; tp->packets_out++; tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); tcp_statistics.TcpActiveOpens++; @@ -870,8 +926,8 @@ void tcp_send_ack(struct sock *sk) TCP_SKB_CB(buff)->urg_ptr = 0; /* Send it off, this clears delayed acks for us. */ - buff->seq = buff->end_seq = tp->snd_nxt; - buff->when = jiffies; + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tp->snd_nxt; + TCP_SKB_CB(buff)->when = jiffies; tcp_transmit_skb(sk, buff); } } @@ -904,13 +960,13 @@ void tcp_write_wakeup(struct sock *sk) * must have been a result SWS avoidance ( sender ) */ win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una); - if (win_size < skb->end_seq - skb->seq) { + if (win_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) { if (tcp_fragment(sk, skb, win_size)) return; /* Let a retransmit get it. */ } update_send_head(sk); - skb->when = jiffies; - tp->snd_nxt = skb->end_seq; + TCP_SKB_CB(skb)->when = jiffies; + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; tp->packets_out++; tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); if (!tcp_timer_is_set(sk, TIME_RETRANS)) @@ -933,9 +989,9 @@ void tcp_write_wakeup(struct sock *sk) * end to send an ack. Don't queue or clone SKB, just * send it. 
*/ - skb->seq = tp->snd_nxt - 1; - skb->end_seq = skb->seq; - skb->when = jiffies; + TCP_SKB_CB(skb)->seq = tp->snd_nxt - 1; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; + TCP_SKB_CB(skb)->when = jiffies; tcp_transmit_skb(sk, skb); } } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 54380b07d..9bf74f472 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -32,6 +32,7 @@ static void tcp_sltimer_handler(unsigned long); static void tcp_syn_recv_timer(unsigned long); static void tcp_keepalive(unsigned long data); static void tcp_bucketgc(unsigned long); +static void tcp_twkill(unsigned long); struct timer_list tcp_slow_timer = { NULL, NULL, @@ -43,6 +44,7 @@ struct timer_list tcp_slow_timer = { struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = { {ATOMIC_INIT(0), TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer},/* SYNACK */ {ATOMIC_INIT(0), TCP_KEEPALIVE_PERIOD, 0, tcp_keepalive}, /* KEEPALIVE */ + {ATOMIC_INIT(0), TCP_TWKILL_PERIOD, 0, tcp_twkill}, /* TWKILL */ {ATOMIC_INIT(0), TCP_BUCKETGC_PERIOD, 0, tcp_bucketgc} /* BUCKETGC */ }; @@ -166,11 +168,10 @@ void tcp_delack_timer(unsigned long data) { struct sock *sk = (struct sock*)data; - if(sk->zapped) - return; - - if (sk->tp_pinfo.af_tcp.delayed_acks) - tcp_read_wakeup(sk); + if(!sk->zapped && + sk->tp_pinfo.af_tcp.delayed_acks && + sk->state != TCP_CLOSE) + tcp_send_ack(sk); } void tcp_probe_timer(unsigned long data) @@ -240,9 +241,9 @@ static __inline__ int tcp_keepopen_proc(struct sock *sk) } /* Garbage collect TCP bind buckets. */ -static void tcp_bucketgc(unsigned long __unused) +static void tcp_bucketgc(unsigned long data) { - int i; + int i, reaped = 0;; for(i = 0; i < TCP_BHTABLE_SIZE; i++) { struct tcp_bind_bucket *tb = tcp_bound_hash[i]; @@ -252,8 +253,7 @@ static void tcp_bucketgc(unsigned long __unused) if((tb->owners == NULL) && !(tb->flags & TCPB_FLAG_LOCKED)) { - /* Eat timer reference. */ - tcp_dec_slow_timer(TCP_SLT_BUCKETGC); + reaped++; /* Unlink bucket. */ if(tb->next) @@ -266,6 +266,92 @@ static void tcp_bucketgc(unsigned long __unused) tb = next; } } + if(reaped != 0) { + struct tcp_sl_timer *slt = (struct tcp_sl_timer *)data; + + /* Eat timer references. */ + atomic_sub(reaped, &slt->count); + } +} + +/* Kill off TIME_WAIT sockets once their lifetime has expired. */ +int tcp_tw_death_row_slot = 0; +static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS] = + { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }; + +extern void tcp_timewait_kill(struct tcp_tw_bucket *tw); + +static void tcp_twkill(unsigned long data) +{ + struct tcp_tw_bucket *tw; + int killed = 0; + + tw = tcp_tw_death_row[tcp_tw_death_row_slot]; + tcp_tw_death_row[tcp_tw_death_row_slot] = NULL; + while(tw != NULL) { + struct tcp_tw_bucket *next = tw->next_death; + + tcp_timewait_kill(tw); + killed++; + tw = next; + } + if(killed != 0) { + struct tcp_sl_timer *slt = (struct tcp_sl_timer *)data; + atomic_sub(killed, &slt->count); + } + tcp_tw_death_row_slot = + ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); +} + +/* These are always called from BH context. See callers in + * tcp_input.c to verify this. + */ +void tcp_tw_schedule(struct tcp_tw_bucket *tw) +{ + int slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1); + + tw->death_slot = slot; + tw->next_death = tcp_tw_death_row[slot]; + tcp_tw_death_row[slot] = tw; + tcp_inc_slow_timer(TCP_SLT_TWKILL); +} + +/* Happens rarely if at all, no care about scalability here. 
*/ +void tcp_tw_reschedule(struct tcp_tw_bucket *tw) +{ + struct tcp_tw_bucket *walk; + int slot = tw->death_slot; + + walk = tcp_tw_death_row[slot]; + if(walk == tw) { + tcp_tw_death_row[slot] = tw->next_death; + } else { + while(walk->next_death != tw) + walk = walk->next_death; + walk->next_death = tw->next_death; + } + slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1); + tw->death_slot = slot; + tw->next_death = tcp_tw_death_row[slot]; + tcp_tw_death_row[slot] = tw; + /* Timer was incremented when we first entered the table. */ +} + +/* This is for handling early-kills of TIME_WAIT sockets. */ +void tcp_tw_deschedule(struct tcp_tw_bucket *tw) +{ + struct tcp_tw_bucket *walk; + int slot = tw->death_slot; + + walk = tcp_tw_death_row[slot]; + if(walk == tw) { + tcp_tw_death_row[slot] = tw->next_death; + } else { + while(walk->next_death != tw) + walk = walk->next_death; + walk->next_death = tw->next_death; + } + tcp_dec_slow_timer(TCP_SLT_TWKILL); } /* @@ -362,13 +448,17 @@ void tcp_retransmit_timer(unsigned long data) */ if(tp->sack_ok) { struct sk_buff *skb = skb_peek(&sk->write_queue); + __u8 toclear = TCPCB_SACKED_ACKED; + if(tp->retransmits == 0) + toclear |= TCPCB_SACKED_RETRANS; while((skb != NULL) && (skb != tp->send_head) && (skb != (struct sk_buff *)&sk->write_queue)) { - TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->sacked &= ~(toclear); skb = skb->next; } + tp->fackets_out = 0; } /* Retransmission. */ @@ -377,9 +467,9 @@ void tcp_retransmit_timer(unsigned long data) /* remember window where we lost * "one half of the current window but at least 2 segments" */ - tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2); - tp->snd_cwnd_cnt = 0; - tp->snd_cwnd = 1; + tp->retrans_out = 0; + tp->snd_ssthresh = max(tp->snd_cwnd >> (1 + TCP_CWND_SHIFT), 2); + tp->snd_cwnd = (1 << TCP_CWND_SHIFT); } tp->retransmits++; @@ -447,6 +537,7 @@ static void tcp_syn_recv_timer(unsigned long data) if ((long)(now - conn->expires) <= 0) break; + tcp_synq_unlink(tp, conn, prev); if (conn->retrans >= sysctl_tcp_retries1) { #ifdef TCP_DEBUG @@ -455,7 +546,7 @@ static void tcp_syn_recv_timer(unsigned long data) #endif (*conn->class->destructor)(conn); tcp_dec_slow_timer(TCP_SLT_SYNACK); - sk->ack_backlog--; + tp->syn_backlog--; tcp_openreq_free(conn); if (!tp->syn_wait_queue) @@ -506,14 +597,14 @@ void tcp_sltimer_handler(unsigned long data) slt->last = now; trigger = slt->period; } - next = min(next, trigger); - } - } - if (next != ~0UL) { - tcp_slow_timer.expires = now + next; - add_timer(&tcp_slow_timer); + /* Only reschedule if some events remain. 
*/ + if (atomic_read(&slt->count)) + next = min(next, trigger); + } } + if (next != ~0UL) + mod_timer(&tcp_slow_timer, (now + next)); } void __tcp_inc_slow_timer(struct tcp_sl_timer *slt) @@ -526,9 +617,8 @@ void __tcp_inc_slow_timer(struct tcp_sl_timer *slt) when = now + slt->period; if (tcp_slow_timer.prev) { - if ((long)(tcp_slow_timer.expires - when) >= 0) { + if ((long)(tcp_slow_timer.expires - when) >= 0) mod_timer(&tcp_slow_timer, when); - } } else { tcp_slow_timer.expires = when; add_timer(&tcp_slow_timer); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 6a24bea8b..902274ecb 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -7,7 +7,7 @@ * * Adapted from linux/net/ipv4/af_inet.c * - * $Id: af_inet6.c,v 1.29 1998/03/18 07:52:11 davem Exp $ + * $Id: af_inet6.c,v 1.30 1998/03/25 00:23:05 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -79,7 +79,6 @@ static int inet6_create(struct socket *sock, int protocol) if (protocol && protocol != IPPROTO_TCP) goto free_and_noproto; protocol = IPPROTO_TCP; - sk->no_check = TCP_NO_CHECK; prot = &tcpv6_prot; sock->ops = &inet6_stream_ops; } else if(sock->type == SOCK_DGRAM) { diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 735ceeb5f..693caaf3b 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: ip6_fib.c,v 1.12 1998/03/20 09:12:16 davem Exp $ + * $Id: ip6_fib.c,v 1.13 1998/04/28 06:22:03 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -694,8 +694,13 @@ static void fib6_del_2(struct fib6_node *fn) /* * We can't tidy a case of two children. */ - - if (children > 1 || (fn->fn_flags & RTN_RTINFO)) + if (children > 1) { + if (fn->leaf == NULL) + goto split_repair; + break; + } + + if (fn->fn_flags & RTN_RTINFO) break; /* @@ -765,6 +770,8 @@ static void fib6_del_2(struct fib6_node *fn) stree_node: rt6_release(fn->leaf); + +split_repair: rt = fib6_find_prefix(fn); if (rt == NULL) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 0f1c710d3..eb3984f55 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: ip6_output.c,v 1.10 1998/03/20 09:12:17 davem Exp $ + * $Id: ip6_output.c,v 1.12 1998/04/11 22:11:06 davem Exp $ * * Based on linux/net/ipv4/ip_output.c * @@ -75,7 +75,6 @@ int ip6_output(struct sk_buff *skb) } else if (dst->neighbour) return dst->neighbour->output(skb); - printk(KERN_DEBUG "khm\n"); kfree_skb(skb); return -EINVAL; } @@ -265,7 +264,6 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, return err; last_skb->dst = dst_clone(dst); - last_skb->when = jiffies; skb_reserve(last_skb, (dst->dev->hard_header_len + 15) & ~15); @@ -461,8 +459,6 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, dev = dst->dev; skb->dst = dst_clone(dst); - skb->when = jiffies; - skb_reserve(skb, (dev->hard_header_len + 15) & ~15); hdr = (struct ipv6hdr *) skb->tail; diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index c010b0964..9b24b4948 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -7,7 +7,7 @@ * PROC file system. This is very similar to the IPv4 version, * except it reports the sockets in the INET6 address family. 
* - * Version: $Id: proc.c,v 1.7 1998/03/18 07:52:13 davem Exp $ + * Version: $Id: proc.c,v 1.8 1998/04/13 17:06:03 davem Exp $ * * Authors: David S. Miller (davem@caip.rutgers.edu) * @@ -71,9 +71,17 @@ static int get__netinfo6(struct proto *pro, char *buffer, int format, char **sta destp = ntohs(sp->dport); srcp = ntohs(sp->sport); if((format == 0) && (sp->state == TCP_TIME_WAIT)) { + extern int tcp_tw_death_row_slot; + int slot_dist; + timer_active1 = timer_active2 = 0; timer_active = 3; - timer_expires = tw->timer.expires; + slot_dist = tw->death_slot; + if(slot_dist > tcp_tw_death_row_slot) + slot_dist = (TCP_TWKILL_SLOTS - slot_dist) + tcp_tw_death_row_slot; + else + slot_dist = tcp_tw_death_row_slot - slot_dist; + timer_expires = jiffies + (slot_dist * TCP_TWKILL_PERIOD); } else { timer_active1 = del_timer(&tp->retransmit_timer); timer_active2 = del_timer(&sp->timer); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3015d254b..a71c9c0e5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: route.c,v 1.27 1998/03/21 07:28:04 davem Exp $ + * $Id: route.c,v 1.28 1998/04/28 06:22:04 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -90,7 +90,11 @@ struct rt6_info ip6_null_entry = { {{NULL, ATOMIC_INIT(1), ATOMIC_INIT(1), NULL, -1, 0, 0, 0, 0, 0, 0, 0, 0, -ENETUNREACH, NULL, NULL, - ip6_pkt_discard, ip6_pkt_discard, &ip6_dst_ops}}, + ip6_pkt_discard, ip6_pkt_discard, +#ifdef CONFIG_NET_CLS_ROUTE + 0, +#endif + &ip6_dst_ops}}, NULL, {{{0}}}, 256, RTF_REJECT|RTF_NONEXTHOP, ~0U, 255, 0, {NULL}, {{{{0}}}, 0}, {{{{0}}}, 0} }; @@ -751,7 +755,7 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) goto out; } - grt = rt6_lookup(gw_addr, NULL, dev->ifindex, RTF_LINKRT); + grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, RTF_LINKRT); if (grt == NULL || (grt->rt6i_flags&RTF_GATEWAY)) { *err = -EHOSTUNREACH; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4f176cd60..721677fa6 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: tcp_ipv6.c,v 1.68 1998/03/22 19:14:50 davem Exp $ + * $Id: tcp_ipv6.c,v 1.78 1998/04/16 16:29:22 freitag Exp $ * * Based on: * linux/net/ipv4/tcp.c @@ -42,14 +42,13 @@ #include <asm/uaccess.h> -extern int sysctl_tcp_timestamps; -extern int sysctl_tcp_window_scaling; +extern int sysctl_max_syn_backlog; static void tcp_v6_send_reset(struct sk_buff *skb); static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, struct sk_buff *skb); -static int tcp_v6_backlog_rcv(struct sock *sk, struct sk_buff *skb); +static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); static void tcp_v6_xmit(struct sk_buff *skb); static struct open_request *tcp_v6_search_req(struct tcp_opt *tp, struct ipv6hdr *ip6h, @@ -228,9 +227,6 @@ static struct sock *tcp_v6_lookup_listener(struct in6_addr *daddr, unsigned shor return result; } -/* Until this is verified... -DaveM */ -/* #define USE_QUICKSYNS */ - /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM * It is assumed that this code only gets called from within NET_BH. 
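The TIME_WAIT rework above replaces the per-socket 2MSL timer with a kill wheel: tcp_tw_schedule() files each tcp_tw_bucket into one of TCP_TWKILL_SLOTS slots and tcp_twkill() reaps one slot every TCP_TWKILL_PERIOD, which is why the proc code can only estimate an expiry time from the slot distance. A minimal standalone sketch of that arithmetic, using the constants this patch introduces; the helper name is illustrative and not part of the patch:

/* Approximate jiffies at which a bucket sitting in death_slot will be
 * reaped, given the slot tcp_twkill() will service next.  Hypothetical
 * helper; it mirrors the slot_dist calculation in the proc.c hunk above.
 */
static unsigned long tcp_tw_expiry_estimate(int death_slot, int next_kill_slot)
{
	int slot_dist;

	if (death_slot > next_kill_slot)	/* wheel wrap-around */
		slot_dist = (TCP_TWKILL_SLOTS - death_slot) + next_kill_slot;
	else
		slot_dist = next_kill_slot - death_slot;

	return jiffies + (slot_dist * TCP_TWKILL_PERIOD);
}

tcp_tw_reschedule() simply refiles a bucket into the slot farthest from the reaper, so a rescheduled socket gets a fresh full period rather than a finely adjusted timeout.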
@@ -240,25 +236,14 @@ static inline struct sock *__tcp_v6_lookup(struct tcphdr *th, struct in6_addr *daddr, u16 dport, int dif) { - unsigned short hnum = ntohs(dport); struct sock *sk; + __u16 hnum = ntohs(dport); + __u32 ports = TCP_COMBINED_PORTS(sport, hnum); int hash; -#ifdef USE_QUICKSYNS - /* Incomming connection short-cut. */ - if (th && th->syn == 1 && th->ack == 0) - goto listener_shortcut; -#endif - /* Check TCP register quick cache first. */ sk = TCP_RHASH(sport); - if(sk && - sk->num == hnum && /* local port */ - sk->family == AF_INET6 && /* address family */ - sk->dport == sport && /* remote port */ - !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.daddr, saddr) && - !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, daddr) && - (!sk->bound_dev_if || sk->bound_dev_if == dif)) + if(sk && TCP_IPV6_MATCH(sk, saddr, daddr, ports, dif)) goto hit; /* Optimize here for direct hit, only listening connections can @@ -267,31 +252,23 @@ static inline struct sock *__tcp_v6_lookup(struct tcphdr *th, hash = tcp_v6_hashfn(daddr, hnum, saddr, sport); for(sk = tcp_established_hash[hash]; sk; sk = sk->next) { /* For IPV6 do the cheaper port and family tests first. */ - if(sk->num == hnum && /* local port */ - sk->family == AF_INET6 && /* address family */ - sk->dport == sport && /* remote port */ - !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.daddr, saddr) && - !ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, daddr) && - (!sk->bound_dev_if || sk->bound_dev_if == dif)) { + if(TCP_IPV6_MATCH(sk, saddr, daddr, ports, dif)) { if (sk->state == TCP_ESTABLISHED) TCP_RHASH(sport) = sk; goto hit; /* You sunk my battleship! */ } } /* Must check for a TIME_WAIT'er before going to listener hash. */ - for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next) - if(sk->num == hnum && /* local port */ - sk->family == AF_INET6 && /* address family */ - sk->dport == sport) { /* remote port */ + for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next) { + if(*((__u32 *)&(sk->dport)) == ports && + sk->family == AF_INET6) { struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; if(!ipv6_addr_cmp(&tw->v6_daddr, saddr) && !ipv6_addr_cmp(&tw->v6_rcv_saddr, daddr) && (!sk->bound_dev_if || sk->bound_dev_if == dif)) goto hit; } -#ifdef USE_QUICKSYNS -listener_shortcut: -#endif + } sk = tcp_v6_lookup_listener(daddr, hnum, dif); hit: return sk; @@ -426,7 +403,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (err) { sk->tp_pinfo.af_tcp.af_specific = &ipv6_specific; - sk->backlog_rcv = tcp_v6_backlog_rcv; + sk->backlog_rcv = tcp_v6_do_rcv; } return err; @@ -651,6 +628,7 @@ void tcp_v6_err(struct sk_buff *skb, int type, int code, unsigned char *header, if (req->sk) { sk = req->sk; /* report error in accept */ } else { + tp->syn_backlog--; tcp_synq_unlink(tp, req, prev); req->class->destructor(req); tcp_openreq_free(req); @@ -676,9 +654,6 @@ void tcp_v6_err(struct sk_buff *skb, int type, int code, unsigned char *header, } -/* FIXME: this is substantially similar to the ipv4 code. - * Can some kind of merge be done? -- erics - */ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req) { struct sk_buff * skb; @@ -730,6 +705,9 @@ static struct or_calltable or_ipv6 = { tcp_v6_send_reset }; +#define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */ +#define BACKLOGMAX(sk) sysctl_max_syn_backlog + /* FIXME: this is substantially similar to the ipv4 code. * Can some kind of merge be done? 
-- erics */ @@ -755,22 +733,22 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, /* * There are no SYN attacks on IPv6, yet... */ - if (sk->ack_backlog >= sk->max_ack_backlog) { + if (BACKLOG(sk) >= BACKLOGMAX(sk)) { printk(KERN_DEBUG "droping syn ack:%d max:%d\n", - sk->ack_backlog, sk->max_ack_backlog); - tcp_statistics.TcpAttemptFails++; - goto exit; + BACKLOG(sk), BACKLOGMAX(sk)); + goto drop; } req = tcp_openreq_alloc(); if (req == NULL) { + goto drop; } - sk->ack_backlog++; + BACKLOG(sk)++; req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ - req->rcv_isn = skb->seq; + req->rcv_isn = TCP_SKB_CB(skb)->seq; req->snt_isn = isn; tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0; tp.in_mss = 536; @@ -806,8 +784,11 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, sk->data_ready(sk, 0); -exit: return 0; + +drop: + tcp_statistics.TcpAttemptFails++; + return 0; /* don't send reset */ } static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, @@ -830,7 +811,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, struct tcp_opt *newtp; struct sock *newsk; int mss; - + if (skb->protocol == __constant_htons(ETH_P_IP)) { /* * v6 mapped @@ -858,6 +839,9 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, } + if (sk->ack_backlog > sk->max_ack_backlog) + return NULL; + if (dst == NULL) { /* * options / mss / route cache @@ -876,6 +860,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (dst->error || dst->pmtu < 576) goto out; + sk->tp_pinfo.af_tcp.syn_backlog--; + sk->ack_backlog++; mss = dst->pmtu - sizeof(struct ipv6hdr); #if 0 @@ -1012,132 +998,171 @@ static void tcp_v6_rst_req(struct sock *sk, struct sk_buff *skb) if (!req) return; /* Sequence number check required by RFC793 */ - if (before(skb->seq, req->snt_isn) || after(skb->seq, req->snt_isn+1)) + if (before(TCP_SKB_CB(skb)->seq, req->snt_isn) || + after(TCP_SKB_CB(skb)->seq, req->snt_isn+1)) return; + if(req->sk) + sk->ack_backlog--; + else + tp->syn_backlog--; tcp_synq_unlink(tp, req, prev); req->class->destructor(req); tcp_openreq_free(req); + net_statistics.EmbryonicRsts++; } -int tcp_v6_rcv(struct sk_buff *skb, struct device *dev, - struct in6_addr *saddr, struct in6_addr *daddr, - struct ipv6_options *opt, unsigned short len, - int redo, struct inet6_protocol *protocol) +static inline struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) { - struct tcphdr *th; - struct sock *sk; + struct tcphdr *th = skb->h.th; + u32 flg = ((u32 *)th)[3]; - /* - * "redo" is 1 if we have already seen this skb but couldn't - * use it at that time (the socket was locked). In that case - * we have already done a lot of the work (looked up the socket - * etc). + /* Check for RST */ + if (flg & __constant_htonl(0x00040000)) { + tcp_v6_rst_req(sk, skb); + return NULL; + } + + /* Check SYN|ACK */ + if (flg & __constant_htonl(0x00120000)) { + struct open_request *req, *dummy; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + req = tcp_v6_search_req(tp, skb->nh.ipv6h,th, &dummy); + if (req) { + sk = tcp_check_req(sk, skb, req); + } +#if 0 /*def CONFIG_SYN_COOKIES */ + else { + sk = cookie_v6_check(sk, skb, (struct ipv6_options *) skb->cb); + } +#endif + } + return sk; +} + +static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) +{ + /* Imagine: socket is IPv6. IPv4 packet arrives, + goes to IPv4 receive handler and backlogged. 
+ From backlog it always goes here. Kerboom... + Fortunately, tcp_rcv_established and rcv_established + handle them correctly, but it is not case with + tcp_v6_hnd_req and tcp_v6_send_reset(). --ANK */ - th = skb->h.th; + if (skb->protocol == __constant_htons(ETH_P_IP)) + return tcp_v4_do_rcv(sk, skb); - sk = skb->sk; + /* + * socket locking is here for SMP purposes as backlog rcv + * is currently called with bh processing disabled. + */ - if (!redo) { - if (skb->pkt_type != PACKET_HOST) - goto discard_it; + /* XXX We need to think more about socket locking + * XXX wrt. backlog queues, __release_sock(), etc. -DaveM + */ + lock_sock(sk); - /* - * Pull up the IP header. - */ + /* + * This doesn't check if the socket has enough room for the packet. + * Either process the packet _without_ queueing it and then free it, + * or do the check later. + */ + skb_set_owner_r(skb, sk); - __skb_pull(skb, skb->h.raw - skb->data); + if (sk->state == TCP_ESTABLISHED) { /* Fast path */ + if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) + goto reset; + release_sock(sk); + return 0; + } - /* - * Count it even if it's bad. - */ + if (sk->state == TCP_LISTEN) { + struct sock *nsk; + + nsk = tcp_v6_hnd_req(sk, skb); + if (!nsk) + goto discard; + lock_sock(nsk); + release_sock(sk); + sk = nsk; + } - tcp_statistics.TcpInSegs++; + if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->cb, skb->len)) + goto reset; + release_sock(sk); + return 0; - /* - * Try to use the device checksum if provided. - */ +reset: + tcp_v6_send_reset(skb); +discard: + kfree_skb(skb); + release_sock(sk); + return 0; +} - switch (skb->ip_summed) { - case CHECKSUM_NONE: - skb->csum = csum_partial((char *)th, len, 0); - case CHECKSUM_HW: - if (tcp_v6_check(th,len,saddr,daddr,skb->csum)) { - printk(KERN_DEBUG "tcp csum failed\n"); - tcp_statistics.TcpInErrs++; - goto discard_it; - } - default: - /* CHECKSUM_UNNECESSARY */ - }; +int tcp_v6_rcv(struct sk_buff *skb, struct device *dev, + struct in6_addr *saddr, struct in6_addr *daddr, + struct ipv6_options *opt, unsigned short len, + int redo, struct inet6_protocol *protocol) +{ + struct tcphdr *th; + struct sock *sk; - sk = __tcp_v6_lookup(th, saddr, th->source, daddr, th->dest, dev->ifindex); + th = skb->h.th; - if (!sk) { - printk(KERN_DEBUG "socket not found\n"); - goto no_tcp_socket; - } + if (skb->pkt_type != PACKET_HOST) + goto discard_it; - skb->seq = ntohl(th->seq); - skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4; - skb->ack_seq = ntohl(th->ack_seq); - skb->used = 0; - if(sk->state == TCP_TIME_WAIT) - goto do_time_wait; + /* + * Pull up the IP header. + */ - skb->sk = sk; - } + __skb_pull(skb, skb->h.raw - skb->data); /* - * We may need to add it to the backlog here. + * Count it even if it's bad. */ - if (sk->sock_readers) { - __skb_queue_tail(&sk->back_log, skb); - return(0); - } + tcp_statistics.TcpInSegs++; - skb_set_owner_r(skb, sk); + /* + * Try to use the device checksum if provided. 
+ */ - if (sk->state == TCP_ESTABLISHED) { - if (tcp_rcv_established(sk, skb, th, len)) - goto no_tcp_socket; - return 0; - } + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = csum_partial((char *)th, len, 0); + case CHECKSUM_HW: + if (tcp_v6_check(th,len,saddr,daddr,skb->csum)) { + printk(KERN_DEBUG "tcp csum failed\n"); + tcp_statistics.TcpInErrs++; + goto discard_it; + } + default: + /* CHECKSUM_UNNECESSARY */ + }; - if (sk->state == TCP_LISTEN) { - __u32 flg = ((u32 *)th)[3]; + sk = __tcp_v6_lookup(th, saddr, th->source, daddr, th->dest, dev->ifindex); - /* Check for RST */ - if (flg & __constant_htonl(0x00040000)) { - tcp_v6_rst_req(sk, skb); - } - - /* Check SYN|ACK */ - if (flg & __constant_htonl(0x00120000)) { - struct open_request *req, *prev; - struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - - req = tcp_v6_search_req(tp, skb->nh.ipv6h,th,&prev); - if (req) { - sk = tcp_check_req(sk, skb, req); - } - /* else do syncookies (add them here) */ - if (sk == NULL) - goto discard_it; - } - } + if (!sk) + goto no_tcp_socket; - if (tcp_rcv_state_process(sk, skb, th, opt, len) == 0) - return 0; + TCP_SKB_CB(skb)->seq = ntohl(th->seq); + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + + len - th->doff*4); + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + skb->used = 0; + if(sk->state == TCP_TIME_WAIT) + goto do_time_wait; -no_tcp_socket: + if (!sk->sock_readers) + return tcp_v6_do_rcv(sk, skb); - /* - * No such TCB. If th->rst is 0 send a reset - * (checked in tcp_v6_send_reset) - */ + __skb_queue_tail(&sk->back_log, skb); + return(0); +no_tcp_socket: tcp_v6_send_reset(skb); discard_it: @@ -1187,18 +1212,6 @@ static int tcp_v6_rebuild_header(struct sock *sk) return dst->error; } -static int tcp_v6_backlog_rcv(struct sock *sk, struct sk_buff *skb) -{ - int res; - - res = tcp_v6_rcv(skb, skb->dev, - &skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr, - (struct ipv6_options *) skb->cb, - skb->len, 1, - (struct inet6_protocol *) sk->pair); - return res; -} - static struct sock * tcp_v6_get_sock(struct sk_buff *skb, struct tcphdr *th) { struct in6_addr *saddr; @@ -1300,7 +1313,7 @@ static int tcp_v6_init_sock(struct sock *sk) /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. */ - tp->snd_cwnd = 1; + tp->snd_cwnd = (1 << TCP_CWND_SHIFT); tp->snd_ssthresh = 0x7fffffff; sk->priority = 1; @@ -1331,14 +1344,14 @@ static int tcp_v6_destroy_sock(struct sock *sk) * Cleanup up the write buffer. 
*/ - while((skb = skb_dequeue(&sk->write_queue)) != NULL) + while((skb = __skb_dequeue(&sk->write_queue)) != NULL) kfree_skb(skb); /* * Cleans up our, hopefuly empty, out_of_order_queue */ - while((skb = skb_dequeue(&tp->out_of_order_queue)) != NULL) + while((skb = __skb_dequeue(&tp->out_of_order_queue)) != NULL) kfree_skb(skb); /* @@ -1377,7 +1390,7 @@ struct proto tcpv6_prot = { tcp_v6_sendmsg, /* sendmsg */ tcp_recvmsg, /* recvmsg */ NULL, /* bind */ - tcp_v6_backlog_rcv, /* backlog_rcv */ + tcp_v6_do_rcv, /* backlog_rcv */ tcp_v6_hash, /* hash */ tcp_v6_unhash, /* unhash */ tcp_v6_rehash, /* rehash */ diff --git a/net/ipx/Config.in b/net/ipx/Config.in index d35afbac0..17080b0c6 100644 --- a/net/ipx/Config.in +++ b/net/ipx/Config.in @@ -2,5 +2,7 @@ # IPX configuration # -comment 'IPX options' -bool 'Full internal IPX network' CONFIG_IPX_INTERN +bool 'IPX: Full internal IPX network' CONFIG_IPX_INTERN +if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + dep_tristate 'IPX: SPX networking (EXPERIMENTAL)' CONFIG_SPX $CONFIG_IPX +fi diff --git a/net/ipx/Makefile b/net/ipx/Makefile index b9d337a8a..39639c6dc 100644 --- a/net/ipx/Makefile +++ b/net/ipx/Makefile @@ -17,6 +17,14 @@ ifeq ($(CONFIG_SYSCTL),y) O_OBJS += sysctl_net_ipx.o endif +ifeq ($(CONFIG_SPX),y) +OX_OBJS += af_spx.o +else + ifeq ($(CONFIG_SPX),m) + MX_OBJS += af_spx.o + endif +endif + include $(TOPDIR)/Rules.make tar: diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 904fa1174..f035e8c62 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1,5 +1,5 @@ /* - * Implements an IPX socket layer (badly - but I'm working on it). + * Implements an IPX socket layer. * * This code is derived from work by * Ross Biro : Writing the original IP stack @@ -47,6 +47,8 @@ * Revision 0.36: Internal bump up for 2.1 * Revision 0.37: Began adding POSIXisms. * Revision 0.38: Asynchronous socket stuff made current. + * Revision 0.39: SPX interfaces + * Revision 0.40: Tiny SIOCGSTAMP fix (chris@cybernet.co.nz) * * Protect the module by a MOD_INC_USE_COUNT/MOD_DEC_USE_COUNT * pair. Also, now usage count is managed this way @@ -111,6 +113,8 @@ static struct datalink_proto *pSNAP_datalink = NULL; static struct proto_ops ipx_dgram_ops; +static struct net_proto_family *spx_family_ops; + static ipx_route *ipx_routes = NULL; static ipx_interface *ipx_interfaces = NULL; static ipx_interface *ipx_primary_net = NULL; @@ -163,7 +167,7 @@ static int ipxcfg_get_config_data(ipx_config_data *arg) * use this facility. 
*/ -static void ipx_remove_socket(struct sock *sk) +void ipx_remove_socket(struct sock *sk) { struct sock *s; ipx_interface *intrfc; @@ -624,6 +628,14 @@ static int ipxitf_send(ipx_interface *intrfc, struct sk_buff *skb, char *node) if (ipx->ipx_source.net != intrfc->if_netnum) { + /* + * Unshare the buffer before modifying the count in + * case its a flood or tcpdump + */ + skb=skb_unshare(skb, GFP_ATOMIC); + if(!skb) + return 0; + ipx = skb->nh.ipxh; if (++(ipx->ipx_tctrl) > ipxcfg_max_hops) send_to_wire = 0; } @@ -722,7 +734,7 @@ static int ipxitf_rcv(ipx_interface *intrfc, struct sk_buff *skb) } } - if( ipx->ipx_type == IPX_TYPE_PPROP && ipx->ipx_tctrl < 8 && skb->pkt_type == PACKET_HOST ) + if( ipx->ipx_type == IPX_TYPE_PPROP && ipx->ipx_tctrl < 8 && skb->pkt_type != PACKET_OTHERHOST ) { int i; ipx_interface *ifcs; @@ -762,8 +774,8 @@ static int ipxitf_rcv(ipx_interface *intrfc, struct sk_buff *skb) if (call_fw_firewall(PF_IPX, skb->dev, ipx, NULL, &skb)==FW_ACCEPT) { skb2 = skb_clone(skb, GFP_ATOMIC); - ipxrtr_route_skb(skb2); - } + ipxrtr_route_skb(skb2); + } } } /* @@ -1264,6 +1276,9 @@ static __u16 ipx_set_checksum(struct ipxhdr *packet,int length) */ __u32 i=length>>1; + char hops = packet->ipx_tctrl; + + packet->ipx_tctrl = 0; /* hop count excluded from checksum calc */ /* * Loop through all complete words except the checksum field @@ -1279,6 +1294,7 @@ static __u16 ipx_set_checksum(struct ipxhdr *packet,int length) if(packet->ipx_pktsize&htons(1)) sum+=ntohs(0xff00)&*p; + packet->ipx_tctrl = hops; /* * Do final fixup */ @@ -1713,19 +1729,24 @@ static int ipx_getsockopt(struct socket *sock, int level, int optname, static int ipx_create(struct socket *sock, int protocol) { struct sock *sk; - sk=sk_alloc(AF_IPX, GFP_KERNEL, 1); - if(sk==NULL) - return(-ENOMEM); switch(sock->type) { case SOCK_DGRAM: + sk=sk_alloc(AF_IPX, GFP_KERNEL, 1); + if(sk==NULL) + return(-ENOMEM); sock->ops = &ipx_dgram_ops; break; - case SOCK_STREAM: /* Allow higher levels to piggyback */ case SOCK_SEQPACKET: - printk(KERN_CRIT "IPX: _create-ing non_DGRAM socket\n"); + /* + * From this point on SPX sockets are handled + * by af_spx.c and the methods replaced. 
+ */ + if(spx_family_ops) + return spx_family_ops->create(sock,protocol); + /* Fall through if SPX is not loaded */ + case SOCK_STREAM: /* Allow higher levels to piggyback */ default: - sk_free(sk); return(-ESOCKTNOSUPPORT); } sock_init_data(sock,sk); @@ -2157,6 +2178,7 @@ static int ipx_recvmsg(struct socket *sock, struct msghdr *msg, int size, copied); if (err) goto out_free; + sk->stamp=skb->stamp; msg->msg_namelen = sizeof(*sipx); @@ -2249,6 +2271,34 @@ static int ipx_ioctl(struct socket *sock,unsigned int cmd, unsigned long arg) return(0); } +/* + * SPX interface support + */ + +int ipx_register_spx(struct proto_ops **p, struct net_proto_family *spx) +{ + if(spx_family_ops!=NULL) + return -EBUSY; + cli(); + MOD_INC_USE_COUNT; + *p=&ipx_dgram_ops; + spx_family_ops=spx; + sti(); + return 0; +} + +int ipx_unregister_spx(void) +{ + spx_family_ops=NULL; + MOD_DEC_USE_COUNT; + return 0; +} + + +/* + * Socket family declarations + */ + static struct net_proto_family ipx_family_ops = { AF_IPX, ipx_create @@ -2256,7 +2306,6 @@ static struct net_proto_family ipx_family_ops = { static struct proto_ops ipx_dgram_ops = { AF_IPX, - sock_no_dup, ipx_release, ipx_bind, @@ -2280,7 +2329,7 @@ static struct proto_ops ipx_dgram_ops = { static struct packet_type ipx_8023_packet_type = { - 0, /* MUTTER ntohs(ETH_P_8023),*/ + 0, /* MUTTER ntohs(ETH_P_802_3),*/ NULL, /* All devices */ ipx_rcv, NULL, @@ -2371,6 +2420,10 @@ int ipx_if_offset(unsigned long ipx_net_number) /* Export symbols for higher layers */ EXPORT_SYMBOL(ipxrtr_route_skb); EXPORT_SYMBOL(ipx_if_offset); +EXPORT_SYMBOL(ipx_remove_socket); +EXPORT_SYMBOL(ipx_register_spx); +EXPORT_SYMBOL(ipx_unregister_spx); + #ifdef MODULE /* Note on MOD_{INC,DEC}_USE_COUNT: @@ -2386,8 +2439,9 @@ EXPORT_SYMBOL(ipx_if_offset); * sockets be closed from user space. */ -__initfunc(static void ipx_proto_finito(void)) -{ ipx_interface *ifc; +static void ipx_proto_finito(void) +{ + ipx_interface *ifc; while (ipx_interfaces) { ifc = ipx_interfaces; diff --git a/net/ipx/af_spx.c b/net/ipx/af_spx.c new file mode 100644 index 000000000..a14ad0a31 --- /dev/null +++ b/net/ipx/af_spx.c @@ -0,0 +1,872 @@ +/* + * This module implements the (SPP-derived) Sequenced Packet eXchange + * (SPX) protocol for Linux 2.1.X as specified in + * NetWare SPX Services Specification, Semantics and API + * Revision: 1.00 + * Revision Date: February 9, 1993 + * + * Developers: + * Jay Schulist <Jay.Schulist@spacs.k12.wi.us> + * Jim Freeman <jfree@caldera.com> + * + * Changes: + * Alan Cox : Fixed an skb_unshare check for NULL + * that crashed it under load. Renamed and + * made static the ipx ops. Removed the hack + * ipx methods interface. Dropped AF_SPX - its + * the wrong abstraction. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * None of the authors or maintainers or their employers admit + * liability nor provide warranty for any of this software. + * This material is provided "as is" and at no charge. 
+ */ + +#include <linux/config.h> +#if defined(CONFIG_SPX) || defined(CONFIG_SPX_MODULE) +#include <linux/module.h> +#include <net/ipx.h> +#include <net/spx.h> +#include <net/sock.h> +#include <asm/byteorder.h> +#include <asm/uaccess.h> +#include <linux/uio.h> +#include <linux/unistd.h> +#include <linux/firewall.h> + +static struct proto_ops *ipx_operations; +static struct proto_ops spx_operations; +static __u16 connids; + +/* Functions needed for SPX connection start up */ +static int spx_transmit(struct sock *sk,struct sk_buff *skb,int type,int len); +static void spx_retransmit(unsigned long data); +static void spx_watchdog(unsigned long data); +void spx_rcv(struct sock *sk, int bytes); + +/* Create the SPX specific data */ +static int spx_sock_init(struct sock *sk) +{ + struct spx_opt *pdata = &sk->tp_pinfo.af_spx; + + pdata->state = SPX_CLOSED; + pdata->sequence = 0; + pdata->acknowledge = 0; + pdata->source_connid = htons(connids); + pdata->rmt_seq = 0; + connids++; + + pdata->owner = (void *)sk; + pdata->sndbuf = sk->sndbuf; + + pdata->watchdog.function = spx_watchdog; + pdata->watchdog.data = (unsigned long)sk; + pdata->wd_interval = VERIFY_TIMEOUT; + pdata->retransmit.function = spx_retransmit; + pdata->retransmit.data = (unsigned long)sk; + pdata->retransmits = 0; + pdata->retries = 0; + pdata->max_retries = RETRY_COUNT; + + skb_queue_head_init(&pdata->rcv_queue); + skb_queue_head_init(&pdata->transmit_queue); + skb_queue_head_init(&pdata->retransmit_queue); + + return (0); +} + +static int spx_create(struct socket *sock, int protocol) +{ + struct sock *sk; + + sk = sk_alloc(AF_IPX, GFP_KERNEL, 1); + if(sk == NULL) + return (-ENOMEM); + + switch(sock->type) + { + case SOCK_SEQPACKET: + sock->ops = &spx_operations; + break; + default: + sk_free(sk); + return (-ESOCKTNOSUPPORT); + } + + sock_init_data(sock, sk); + spx_sock_init(sk); + sk->data_ready = spx_rcv; + sk->destruct = NULL; + sk->mtu = IPX_MTU; + sk->no_check = 1; + + MOD_INC_USE_COUNT; + + return (0); +} + +static int spx_shutdown(struct socket *sk,int how) +{ + return (-EOPNOTSUPP); +} + +void spx_close_socket(struct sock *sk) +{ + struct spx_opt *pdata = &sk->tp_pinfo.af_spx; + + pdata->state = SPX_CLOSED; + sk->state = TCP_CLOSE; + del_timer(&pdata->retransmit); + del_timer(&pdata->watchdog); +} + +void spx_destroy_socket(struct sock *sk) +{ + struct spx_opt *pdata = &sk->tp_pinfo.af_spx; + struct sk_buff *skb; + + ipx_remove_socket(sk); + while((skb = skb_dequeue(&sk->receive_queue)) != NULL) + kfree_skb(skb); + while((skb = skb_dequeue(&pdata->transmit_queue)) != NULL) + kfree_skb(skb); + while((skb = skb_dequeue(&pdata->retransmit_queue)) != NULL) + kfree_skb(skb); + while((skb = skb_dequeue(&pdata->rcv_queue)) != NULL) + kfree_skb(skb); + + sk_free(sk); + MOD_DEC_USE_COUNT; +} + +/* Release an SPX socket */ +static int spx_release(struct socket *sock, struct socket *peer) +{ + struct sock *sk = sock->sk; + struct spx_opt *pdata = &sk->tp_pinfo.af_spx; + + if(sk == NULL) + return (0); + if(!sk->dead) + sk->state_change(sk); + sk->dead = 1; + + if(pdata->state != SPX_CLOSED) + { + spx_transmit(sk, NULL, DISCON, 0); + spx_close_socket(sk); + } + + sock->sk = NULL; + sk->socket = NULL; + spx_destroy_socket(sk); + + return (0); +} + +/* Move a socket into listening state. 
*/ +static int spx_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + + if(sock->state != SS_UNCONNECTED) + return (-EINVAL); + if(sock->type != SOCK_SEQPACKET) + return (-EOPNOTSUPP); + if(sk->zapped != 0) + return (-EAGAIN); + + if((unsigned) backlog == 0) /* BSDism */ + backlog = 1; + if((unsigned) backlog > SOMAXCONN) + backlog = SOMAXCONN; + sk->max_ack_backlog = backlog; + if(sk->state != TCP_LISTEN) + { + sk->ack_backlog = 0; + sk->state = TCP_LISTEN; + } + sk->socket->flags |= SO_ACCEPTCON; + + return (0); +} + +/* Accept a pending SPX connection */ +static int spx_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct sock *sk; + struct sock *newsk; + struct sk_buff *skb; + int err; + + if(newsock->sk != NULL) + spx_destroy_socket(newsock->sk); + newsock->sk = NULL; + + if(sock->sk == NULL) + return (-EINVAL); + sk = sock->sk; + + if((sock->state != SS_UNCONNECTED) || !(sock->flags & SO_ACCEPTCON)) + return (-EINVAL); + if(sock->type != SOCK_SEQPACKET) + return (-EOPNOTSUPP); + if(sk->state != TCP_LISTEN) + return (-EINVAL); + + cli(); + do { + skb = skb_dequeue(&sk->receive_queue); + if(skb == NULL) + { + if(flags & O_NONBLOCK) + { + sti(); + return (-EWOULDBLOCK); + } + interruptible_sleep_on(sk->sleep); + if(signal_pending(current)) + { + sti(); + return (-ERESTARTSYS); + } + } + } while (skb == NULL); + + newsk = skb->sk; + newsk->pair = NULL; + sti(); + + err = spx_transmit(newsk, skb, CONACK, 0); /* Connection ACK */ + if(err) + return (err); + + /* Now attach up the new socket */ + sock->sk = NULL; + sk->ack_backlog--; + newsock->sk = newsk; + newsk->state = TCP_ESTABLISHED; + newsk->protinfo.af_ipx.dest_addr = newsk->tp_pinfo.af_spx.dest_addr; + + return (0); +} + +/* Build a connection to an SPX socket */ +static int spx_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + struct spx_opt *pdata = &sk->tp_pinfo.af_spx; + struct sockaddr_ipx src; + struct sk_buff *skb; + int size, err; + + size = sizeof(src); + err = ipx_operations->getname(sock, (struct sockaddr *)&src, &size, 0); + if(err) + return (err); + + pdata->source_addr.net = src.sipx_network; + memcpy(pdata->source_addr.node, src.sipx_node, IPX_NODE_LEN); + pdata->source_addr.sock = (unsigned short)src.sipx_port; + + err = ipx_operations->connect(sock, uaddr, addr_len, flags); + if(err) + return (err); + + pdata->dest_addr = sk->protinfo.af_ipx.dest_addr; + pdata->state = SPX_CONNECTING; + sock->state = SS_CONNECTING; + sk->state = TCP_SYN_SENT; + + /* Send Connection request */ + err = spx_transmit(sk, NULL, CONREQ, 0); + if(err) + return (err); + + cli(); + do { + skb = skb_dequeue(&sk->receive_queue); + if(skb == NULL) + { + if(flags & O_NONBLOCK) + { + sti(); + return (-EWOULDBLOCK); + } + interruptible_sleep_on(sk->sleep); + if(signal_pending(current)) + { + sti(); + return (-ERESTARTSYS); + } + } + } while (skb == NULL); + + if(pdata->state == SPX_CLOSED) + { + sti(); + del_timer(&pdata->watchdog); + return (-ETIMEDOUT); + } + + sock->state = SS_CONNECTED; + sk->state = TCP_ESTABLISHED; + kfree_skb(skb); + sti(); + + return (0); +} + +/* + * Calculate the timeout for a packet. Thankfully SPX has a large + * fudge factor (3/4 secs) and does not pay much attention to RTT. + * As we simply have a default retry time of 1*HZ and a max retry + * time of 5*HZ. Between those values we increase the timeout based + * on the number of retransmit tries. 
+ */ +static inline unsigned long spx_calc_rtt(int tries) +{ + if(tries < 1) + return (RETRY_TIME); + if(tries > 5) + return (MAX_RETRY_DELAY); + return (tries * HZ); +} + +static int spx_route_skb(struct spx_opt *pdata, struct sk_buff *skb, int type) +{ + struct sk_buff *skb2; + int err = 0; + + skb = skb_unshare(skb, GFP_ATOMIC); + if(skb==NULL) + return -ENOBUFS; + + switch(type) + { + case (DATA): + if(!skb_queue_empty(&pdata->retransmit_queue)) + { + skb_queue_tail(&pdata->transmit_queue, skb); + return 0; + } + + case (TQUEUE): + pdata->retransmit.expires = jiffies + spx_calc_rtt(0); + add_timer(&pdata->retransmit); + + skb2 = skb_clone(skb, GFP_BUFFER); + if(skb2 == NULL) + return -ENOBUFS; + skb_queue_tail(&pdata->retransmit_queue, skb2); + + case (ACK): + case (CONREQ): + case (CONACK): + case (WDREQ): + case (WDACK): + case (DISCON): + case (DISACK): + case (RETRAN): + default: + /* Send data */ + err = ipxrtr_route_skb(skb); + if(err) + kfree_skb(skb); + } + + return (err); +} + +/* SPX packet transmit engine */ +static int spx_transmit(struct sock *sk, struct sk_buff *skb, int type, int len) +{ + struct spx_opt *pdata = &sk->tp_pinfo.af_spx; + struct ipxspxhdr *ipxh; + int flags, err; + + if(skb == NULL) + { + int offset = ipx_if_offset(pdata->dest_addr.net); + int size = offset + sizeof(struct ipxspxhdr); + + save_flags(flags); + cli(); + skb = sock_alloc_send_skb(sk, size, 0, 0, &err); + if(skb == NULL) + return (-ENOMEM); + skb_reserve(skb, offset); + skb->nh.raw = skb_put(skb, sizeof(struct ipxspxhdr)); + restore_flags(flags); + } + + /* IPX header */ + ipxh = (struct ipxspxhdr *)skb->nh.raw; + ipxh->ipx.ipx_checksum = 0xFFFF; + ipxh->ipx.ipx_pktsize = htons(SPX_SYS_PKT_LEN); + ipxh->ipx.ipx_tctrl = 0; + ipxh->ipx.ipx_type = IPX_TYPE_SPX; + ipxh->ipx.ipx_dest = pdata->dest_addr; + ipxh->ipx.ipx_source = pdata->source_addr; + + /* SPX header */ + ipxh->spx.dtype = 0; + ipxh->spx.sequence = htons(pdata->sequence); + ipxh->spx.ackseq = htons(pdata->rmt_seq); + ipxh->spx.sconn = pdata->source_connid; + ipxh->spx.dconn = pdata->dest_connid; + ipxh->spx.allocseq = htons(pdata->alloc); + + /* Reset/Set WD timer */ + del_timer(&pdata->watchdog); + pdata->watchdog.expires = jiffies + VERIFY_TIMEOUT; + add_timer(&pdata->watchdog); + + switch(type) + { + case (DATA): /* Data */ + ipxh->ipx.ipx_pktsize = htons(SPX_SYS_PKT_LEN + len); + ipxh->spx.cctl = (CCTL_ACK | CCTL_EOM); + pdata->sequence++; + break; + + case (ACK): /* Connection/WD/Data ACK */ + pdata->rmt_seq++; + case (WDACK): + case (CONACK): + ipxh->spx.cctl = CCTL_SYS; + ipxh->spx.ackseq = htons(pdata->rmt_seq); + break; + + case (CONREQ): /* Connection Request */ + del_timer(&pdata->watchdog); + case (WDREQ): /* WD Request */ + pdata->source_connid = htons(connids++); + pdata->dest_connid = 0xFFFF; + pdata->alloc = 3 + pdata->rmt_seq; + ipxh->spx.cctl = (CCTL_ACK | CCTL_SYS); + ipxh->spx.sconn = pdata->source_connid; + ipxh->spx.dconn = pdata->dest_connid; + ipxh->spx.allocseq = htons(pdata->alloc); + break; + + case (DISCON): /* Informed Disconnect */ + ipxh->spx.cctl = CCTL_ACK; + ipxh->spx.dtype = SPX_DTYPE_ECONN; + break; + + case (DISACK): /* Informed Disconnect ACK */ + ipxh->spx.cctl = 0; + ipxh->spx.dtype = SPX_DTYPE_ECACK; + ipxh->spx.sequence = 0; + ipxh->spx.ackseq = htons(pdata->rmt_seq++); + break; + + default: + return (-EOPNOTSUPP); + } + + /* Send data */ + spx_route_skb(pdata, skb, type); + + return (0); +} + +/* Check the state of the connection and send a WD request if needed. 
*/ +static void spx_watchdog(unsigned long data) +{ + struct sock *sk = (struct sock*)data; + struct spx_opt *pdata = &sk->tp_pinfo.af_spx; + + del_timer(&pdata->watchdog); + if(pdata->retries > pdata->max_retries) + { + spx_close_socket(sk); /* Unilateral Abort */ + return; + } + + /* Send WD request */ + spx_transmit(sk, NULL, WDREQ, 0); + pdata->retries++; + + return; +} + +static void spx_retransmit(unsigned long data) +{ + struct sock *sk = (struct sock*)data; + struct spx_opt *pdata = &sk->tp_pinfo.af_spx; + struct sk_buff *skb; + int err; + + del_timer(&pdata->retransmit); + if(pdata->retransmits > RETRY_COUNT) + { + spx_close_socket(sk); /* Unilateral Abort */ + return; + } + + /* need to leave skb on the queue! */ + skb = skb_peek(&pdata->retransmit_queue); + if(skb_cloned(skb)) + skb = skb_copy(skb, GFP_ATOMIC); + else + skb = skb_clone(skb, GFP_ATOMIC); + + pdata->retransmit.expires = jiffies + spx_calc_rtt(pdata->retransmits); + add_timer(&pdata->retransmit); + + err = spx_route_skb(pdata, skb, RETRAN); + pdata->retransmits++; + + return; +} + +/* SPX packet receive engine */ +void spx_rcv(struct sock *sk, int bytes) +{ + struct sk_buff *skb; + struct sk_buff *skb2; + struct ipxspxhdr *ipxh; + struct ipxspxhdr *ipxh2; + struct spx_opt *pdata = &sk->tp_pinfo.af_spx; + + skb = skb_dequeue(&sk->receive_queue); + if(skb == NULL) + return; + ipxh = (struct ipxspxhdr *)skb->nh.raw; + + /* Can't receive on a closed connection */ + if((pdata->state == SPX_CLOSED) && (ipxh->spx.sequence != 0)) + return; + if(ntohs(ipxh->ipx.ipx_pktsize) < SPX_SYS_PKT_LEN) + return; + if(ipxh->ipx.ipx_type != IPX_TYPE_SPX) + return; + + /* insanity - rcv'd ACK of unsent data ?? */ + if(ntohs(ipxh->spx.ackseq) > pdata->sequence) + return; + + /* Reset WD timer on any received packet */ + del_timer(&pdata->watchdog); + pdata->retries = 0; + pdata->watchdog.expires = jiffies + ABORT_TIMEOUT; + add_timer(&pdata->watchdog); + + switch(ipxh->spx.cctl) + { + case (CCTL_SYS | CCTL_ACK): + if((ipxh->spx.sequence == 0) /* ConReq */ + && (ipxh->spx.ackseq == 0) + && (ipxh->spx.dconn == 0xFFFF)) + { + pdata->state = SPX_CONNECTED; + pdata->dest_addr = ipxh->ipx.ipx_source; + pdata->source_addr = ipxh->ipx.ipx_dest; + pdata->dest_connid = ipxh->spx.sconn; + pdata->alloc = 3 + ntohs(ipxh->spx.sequence); + + skb_queue_tail(&sk->receive_queue, skb); + wake_up_interruptible(sk->sleep); + } + else /* WD Request */ + spx_transmit(sk, skb, WDACK, 0); + break; + + case CCTL_SYS: /* ACK */ + if((ipxh->spx.dtype == 0) /* ConReq ACK */ + && (ipxh->spx.sconn != 0xFFFF) + && (ipxh->spx.dconn != 0xFFFF) + && (ipxh->spx.sequence == 0) + && (ipxh->spx.ackseq == 0) + && (pdata->state != SPX_CONNECTED)) + { + pdata->state = SPX_CONNECTED; + + skb_queue_tail(&sk->receive_queue, skb); + wake_up_interruptible(sk->sleep); + break; + } + + /* Check Data/ACK seq */ + skb2 = skb_dequeue(&pdata->retransmit_queue); + if(skb2) + { + ipxh2 = (struct ipxspxhdr *)skb2->nh.raw; + if((ntohs(ipxh2->spx.sequence) + == (ntohs(ipxh->spx.ackseq) - 1)) + || (ntohs(ipxh2->spx.sequence) == 65535 + && ntohs(ipxh->spx.ackseq) == 0)) + { + del_timer(&pdata->retransmit); + pdata->retransmits = 0; + kfree_skb(skb2); + if(skb_queue_empty(&pdata->retransmit_queue)) + { + skb2 = skb_dequeue(&pdata->transmit_queue); + if(skb2 != NULL) + spx_route_skb(pdata, skb2, TQUEUE); + } + } + else /* Out of Seq - ERROR! 
*/ + skb_queue_head(&pdata->retransmit_queue, skb2); + } + + kfree_skb(skb); + break; + + case (CCTL_ACK): /* Informed Disconnect */ + if(ipxh->spx.dtype == SPX_DTYPE_ECONN) + { + spx_transmit(sk, skb, DISACK, 0); + spx_close_socket(sk); + } + break; + + default: + if(ntohs(ipxh->spx.sequence) == pdata->rmt_seq) + { + pdata->rmt_seq = ntohs(ipxh->spx.sequence); + skb_queue_tail(&pdata->rcv_queue, skb); + wake_up_interruptible(sk->sleep); + spx_transmit(sk, NULL, ACK, 0); + break; + } + + /* Catch All */ + kfree_skb(skb); + break; + } + + return; +} + +/* Get message/packet data from user-land */ +static int spx_sendmsg(struct socket *sock, struct msghdr *msg, int len, + struct scm_cookie *scm) +{ + struct sock *sk = sock->sk; + int flags = msg->msg_flags; + struct sk_buff *skb; + int err, offset, size; + + if(len > 534) + return (-EMSGSIZE); + if(sk->zapped) + return (-ENOTCONN); /* Socket not bound */ + if(flags&~MSG_DONTWAIT) + return (-EINVAL); + + offset = ipx_if_offset(sk->tp_pinfo.af_spx.dest_addr.net); + size = offset + sizeof(struct ipxspxhdr) + len; + skb = sock_alloc_send_skb(sk, size, 0, flags&MSG_DONTWAIT, &err); + if(skb == NULL) + return (err); + + skb->sk = sk; + skb_reserve(skb, offset); + skb->nh.raw = skb_put(skb, sizeof(struct ipxspxhdr)); + + err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); + if(err) + { + kfree_skb(skb); + return (-EFAULT); + } + + err = spx_transmit(sk, skb, DATA, len); + if(err) + return (-EAGAIN); + + return (len); +} + +/* Send message/packet data to user-land */ +static int spx_recvmsg(struct socket *sock, struct msghdr *msg, int size, + int flags, struct scm_cookie *scm) +{ + struct sk_buff *skb; + struct ipxspxhdr *ispxh; + struct sock *sk = sock->sk; + struct spx_opt *pdata = &sk->tp_pinfo.af_spx; + struct sockaddr_ipx *sipx = (struct sockaddr_ipx *)msg->msg_name; + int copied, err; + + if(sk->zapped) + return (-ENOTCONN); /* Socket not bound */ + + lock_sock(sk); +restart: + while(skb_queue_empty(&pdata->rcv_queue)) /* No data */ + { + /* Socket errors? */ + err = sock_error(sk); + if(err) + return (err); + + /* Socket shut down? 
*/ + if(sk->shutdown & RCV_SHUTDOWN) + return (-ESHUTDOWN); + + /* handle signals */ + if(signal_pending(current)) + return (-ERESTARTSYS); + + /* User doesn't want to wait */ + if(flags&MSG_DONTWAIT) + return (-EAGAIN); + + release_sock(sk); + save_flags(flags); + cli(); + if(skb_peek(&pdata->rcv_queue) == NULL) + interruptible_sleep_on(sk->sleep); + restore_flags(flags); + lock_sock(sk); + } + + skb = skb_dequeue(&pdata->rcv_queue); + if(skb == NULL) + goto restart; + + ispxh = (struct ipxspxhdr *)skb->nh.raw; + copied = ntohs(ispxh->ipx.ipx_pktsize) - SPX_SYS_PKT_LEN; + if(copied > size) + { + copied = size; + msg->msg_flags |= MSG_TRUNC; + } + + err = memcpy_toiovec(msg->msg_iov, skb->nh.raw+SPX_SYS_PKT_LEN, copied); + if(err) + return (-EFAULT); + + msg->msg_namelen = sizeof(*sipx); + if(sipx) + { + sipx->sipx_family = AF_IPX; + sipx->sipx_port = ispxh->ipx.ipx_source.sock; + memcpy(sipx->sipx_node,ispxh->ipx.ipx_source.node,IPX_NODE_LEN); + sipx->sipx_network = ispxh->ipx.ipx_source.net; + sipx->sipx_type = ispxh->ipx.ipx_type; + } + kfree_skb(skb); + release_sock(sk); + + return (copied); +} + +/* + * Functions which just wrap their IPX cousins + */ + +static int spx_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + int err; + err = ipx_operations->bind(sock, uaddr, addr_len); + return (err); +} + +static int spx_getname (struct socket *sock, struct sockaddr *uaddr, + int *usockaddr_len, int peer) +{ + int err; + err = ipx_operations->getname(sock, uaddr, usockaddr_len, peer); + return (err); +} + +static int spx_ioctl (struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + int err; + err = ipx_operations->ioctl(sock, cmd, arg); + return (err); +} + +static int spx_setsockopt(struct socket *sock, int level, int optname, + char *optval, int optlen) +{ + int err; + err = ipx_operations->setsockopt(sock, level, optname, optval, optlen); + return (err); +} + +static int spx_getsockopt(struct socket *sock, int level, int optname, + char *optval, int *optlen) +{ + int err; + err = ipx_operations->getsockopt(sock, level, optname, optval, optlen); + return (err); +} + +static struct proto_ops spx_operations = { + AF_IPX, + sock_no_dup, + spx_release, + spx_bind, + spx_connect, + sock_no_socketpair, + spx_accept, + spx_getname, + datagram_poll, /* this does seqpacket too */ + spx_ioctl, + spx_listen, + spx_shutdown, + spx_setsockopt, + spx_getsockopt, + sock_no_fcntl, + spx_sendmsg, + spx_recvmsg +}; + +static struct net_proto_family spx_family_ops= +{ + AF_IPX, + spx_create +}; + + +void spx_proto_init(void) +{ + int error; + + connids = (__u16)jiffies; /* initalize random */ + + error = ipx_register_spx(&ipx_operations, &spx_family_ops); + if (error) + printk(KERN_ERR "SPX: unable to register with IPX.\n"); + + /* route socket(AF_IPX, SOCK_SEQPACKET) calls through spx_create() */ + + printk(KERN_INFO "Sequenced Packet eXchange (SPX) 0.01 for Linux NET3.037\n"); + return; +} + +void spx_proto_finito(void) +{ + ipx_unregister_spx(); + return; +} + +#ifdef MODULE + +int init_module(void) +{ + spx_proto_init(); + return 0; +} + +void cleanup_module(void) +{ + spx_proto_finito(); + return; +} + +#endif /* MODULE */ +#endif /* CONFIG_SPX || CONFIG_SPX_MODULE */ diff --git a/net/netrom/Makefile b/net/netrom/Makefile index 4ac78639b..1afcfd8e7 100644 --- a/net/netrom/Makefile +++ b/net/netrom/Makefile @@ -8,7 +8,8 @@ # Note 2! The CFLAGS definition is now in the main makefile... 
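The af_ipx.c and af_spx.c changes above wire SPX into IPX through a registration hook: af_spx.c calls ipx_register_spx() at init time, and ipx_create() then hands SOCK_SEQPACKET sockets to the registered family while falling back to -ESOCKTNOSUPPORT when no SPX module is loaded. A compilable sketch of that dispatch pattern, using stand-in types and names rather than the kernel's:

/* Sketch only: models the SOCK_SEQPACKET hand-off set up by the hunks
 * above.  All identifiers here are illustrative stand-ins. */
#define SOCK_DGRAM      1
#define SOCK_SEQPACKET  5
#define ESOCKTNOSUPPORT 94

struct fake_socket { int type; };

struct fake_family_ops {
        int (*create)(struct fake_socket *sock, int protocol);
};

static struct fake_family_ops *spx_family;      /* filled in when the SPX module registers */

int fake_ipx_create(struct fake_socket *sock, int protocol)
{
        switch (sock->type) {
        case SOCK_DGRAM:
                return 0;                       /* plain IPX datagram socket */
        case SOCK_SEQPACKET:
                if (spx_family)                 /* SPX loaded: delegate creation */
                        return spx_family->create(sock, protocol);
                /* no SPX module loaded: fall through and refuse */
        default:
                return -ESOCKTNOSUPPORT;
        }
}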
O_TARGET := netrom.o -O_OBJS := af_netrom.o nr_dev.o nr_in.o nr_out.o nr_route.o nr_subr.o nr_timer.o +O_OBJS := af_netrom.o nr_dev.o nr_in.o nr_loopback.o nr_out.o nr_route.o \ + nr_subr.o nr_timer.o M_OBJS := $(O_TARGET) ifeq ($(CONFIG_SYSCTL),y) diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 9d8a206da..59d3dacfb 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -259,6 +259,28 @@ static struct sock *nr_find_peer(unsigned char index, unsigned char id) } /* + * Find next free circuit ID. + */ +static unsigned short nr_find_next_circuit(void) +{ + unsigned short id = circuit; + unsigned char i, j; + + for (;;) { + i = id / 256; + j = id % 256; + + if (i != 0 && j != 0) + if (nr_find_socket(i, j) == NULL) + break; + + id++; + } + + return id; +} + +/* * Deferred destroy. */ void nr_destroy_socket(struct sock *); @@ -535,12 +557,12 @@ static int nr_release(struct socket *sock, struct socket *peer) switch (sk->protinfo.nr->state) { case NR_STATE_0: + case NR_STATE_1: case NR_STATE_2: nr_disconnect(sk, 0); nr_destroy_socket(sk); break; - case NR_STATE_1: case NR_STATE_3: nr_clear_queues(sk); sk->protinfo.nr->n2count = 0; @@ -670,8 +692,7 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr, sk->protinfo.nr->dest_addr = addr->sax25_call; - while (nr_find_socket((unsigned char)circuit / 256, (unsigned char)circuit % 256) != NULL) - circuit++; + circuit = nr_find_next_circuit(); sk->protinfo.nr->my_index = circuit / 256; sk->protinfo.nr->my_id = circuit % 256; @@ -764,7 +785,6 @@ static int nr_accept(struct socket *sock, struct socket *newsock, int flags) sti(); /* Now attach up the new socket */ - skb->sk = NULL; kfree_skb(skb); sk->ack_backlog--; newsock->sk = newsk; @@ -802,7 +822,8 @@ int nr_rx_frame(struct sk_buff *skb, struct device *dev) struct sock *make; ax25_address *src, *dest, *user; unsigned short circuit_index, circuit_id; - unsigned short frametype, window, timeout; + unsigned short peer_circuit_index, peer_circuit_id; + unsigned short frametype, flags, window, timeout; skb->sk = NULL; /* Initially we don't know who it's for */ @@ -813,28 +834,46 @@ int nr_rx_frame(struct sk_buff *skb, struct device *dev) src = (ax25_address *)(skb->data + 0); dest = (ax25_address *)(skb->data + 7); - circuit_index = skb->data[15]; - circuit_id = skb->data[16]; - frametype = skb->data[19] & 0x0F; + circuit_index = skb->data[15]; + circuit_id = skb->data[16]; + peer_circuit_index = skb->data[17]; + peer_circuit_id = skb->data[18]; + frametype = skb->data[19] & 0x0F; + flags = skb->data[19] & 0xF0; #ifdef CONFIG_INET /* * Check for an incoming IP over NET/ROM frame. */ - if (frametype == NR_PROTOEXT && circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) { + if (frametype == NR_PROTOEXT && circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) { skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN); - skb->h.raw = skb->data; + skb->h.raw = skb->data; return nr_rx_ip(skb, dev); - } + } #endif /* * Find an existing socket connection, based on circuit ID, if it's * a Connect Request base it on their circuit ID. + * + * Circuit ID 0/0 is not valid but it could still be a "reset" for a + * circuit that no longer exists at the other end ... 
*/ - if ((frametype != NR_CONNREQ && (sk = nr_find_socket(circuit_index, circuit_id)) != NULL) || - (frametype == NR_CONNREQ && (sk = nr_find_peer(circuit_index, circuit_id)) != NULL)) { + + sk = NULL; + + if (circuit_index == 0 && circuit_id == 0) { + if (frametype == NR_CONNACK && flags == NR_CHOKE_FLAG) + sk = nr_find_peer(peer_circuit_index, peer_circuit_id); + } else { + if (frametype == NR_CONNREQ) + sk = nr_find_peer(circuit_index, circuit_id); + else + sk = nr_find_socket(circuit_index, circuit_id); + } + + if (sk != NULL) { skb->h.raw = skb->data; if (frametype == NR_CONNACK && skb->len == 22) @@ -845,15 +884,17 @@ int nr_rx_frame(struct sk_buff *skb, struct device *dev) return nr_process_rx_frame(sk, skb); } - switch (frametype) { - case NR_CONNREQ: - break; - case NR_DISCREQ: - case NR_DISCACK: - return 0; - default: - nr_transmit_dm(skb); - return 0; + /* + * Now it should be a CONNREQ. + */ + if (frametype != NR_CONNREQ) { + /* + * Never reply to a CONNACK/CHOKE. + */ + if (frametype != NR_CONNACK || flags != NR_CHOKE_FLAG) + nr_transmit_refusal(skb, 1); + + return 0; } sk = nr_find_listener(dest); @@ -861,7 +902,7 @@ int nr_rx_frame(struct sk_buff *skb, struct device *dev) user = (ax25_address *)(skb->data + 21); if (sk == NULL || sk->ack_backlog == sk->max_ack_backlog || (make = nr_make_new(sk)) == NULL) { - nr_transmit_dm(skb); + nr_transmit_refusal(skb, 0); return 0; } @@ -878,6 +919,8 @@ int nr_rx_frame(struct sk_buff *skb, struct device *dev) make->protinfo.nr->your_index = circuit_index; make->protinfo.nr->your_id = circuit_id; + circuit = nr_find_next_circuit(); + make->protinfo.nr->my_index = circuit / 256; make->protinfo.nr->my_id = circuit % 256; @@ -1131,7 +1174,7 @@ static int nr_get_info(char *buffer, char **start, off_t offset, int length, int cli(); - len += sprintf(buffer, "user_addr dest_node src_node dev my your st vs vr va t1 t2 t4 idle n2 wnd Snd-Q Rcv-Q\n"); + len += sprintf(buffer, "user_addr dest_node src_node dev my your st vs vr va t1 t2 t4 idle n2 wnd Snd-Q Rcv-Q inode\n"); for (s = nr_list; s != NULL; s = s->next) { if ((dev = s->protinfo.nr->device) == NULL) @@ -1143,7 +1186,7 @@ static int nr_get_info(char *buffer, char **start, off_t offset, int length, int ax2asc(&s->protinfo.nr->user_addr)); len += sprintf(buffer + len, "%-9s ", ax2asc(&s->protinfo.nr->dest_addr)); - len += sprintf(buffer + len, "%-9s %-3s %02X/%02X %02X/%02X %2d %3d %3d %3d %3lu/%03lu %2lu/%02lu %3lu/%03lu %3lu/%03lu %2d/%02d %3d %5d %5d\n", + len += sprintf(buffer + len, "%-9s %-3s %02X/%02X %02X/%02X %2d %3d %3d %3d %3lu/%03lu %2lu/%02lu %3lu/%03lu %3lu/%03lu %2d/%02d %3d %5d %5d %ld\n", ax2asc(&s->protinfo.nr->source_addr), devname, s->protinfo.nr->my_index, @@ -1166,7 +1209,8 @@ static int nr_get_info(char *buffer, char **start, off_t offset, int length, int s->protinfo.nr->n2, s->protinfo.nr->window, atomic_read(&s->wmem_alloc), - atomic_read(&s->rmem_alloc)); + atomic_read(&s->rmem_alloc), + s->socket != NULL ? 
s->socket->inode->i_ino : 0L); pos = begin + len; @@ -1273,6 +1317,8 @@ __initfunc(void nr_proto_init(struct net_proto *pro)) nr_register_sysctl(); #endif + nr_loopback_init(); + #ifdef CONFIG_PROC_FS proc_net_register(&proc_net_nr); proc_net_register(&proc_net_nr_neigh); @@ -1305,6 +1351,8 @@ void cleanup_module(void) proc_net_unregister(PROC_NET_NR_NEIGH); proc_net_unregister(PROC_NET_NR_NODES); #endif + nr_loopback_clear(); + nr_rt_free(); ax25_protocol_release(AX25_P_NETROM); diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c index ac32cd704..fadf69de8 100644 --- a/net/netrom/nr_in.c +++ b/net/netrom/nr_in.c @@ -129,6 +129,10 @@ static int nr_state2_machine(struct sock *sk, struct sk_buff *skb, int frametype { switch (frametype) { + case NR_CONNACK | NR_CHOKE_FLAG: + nr_disconnect(sk, ECONNRESET); + break; + case NR_DISCREQ: nr_write_internal(sk, NR_DISCACK); @@ -170,6 +174,7 @@ static int nr_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype nr_disconnect(sk, 0); break; + case NR_CONNACK | NR_CHOKE_FLAG: case NR_DISCACK: nr_disconnect(sk, ECONNRESET); break; diff --git a/net/netrom/nr_loopback.c b/net/netrom/nr_loopback.c new file mode 100644 index 000000000..ba9644cbe --- /dev/null +++ b/net/netrom/nr_loopback.c @@ -0,0 +1,107 @@ +/* + * NET/ROM release 007 + * + * This code REQUIRES 2.1.15 or higher/ NET3.038 + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * NET/ROM 007 Tomi(OH2BNS) Created this file. + * + */ + +#include <linux/config.h> +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/timer.h> +#include <net/ax25.h> +#include <linux/skbuff.h> +#include <net/netrom.h> + +static struct sk_buff_head loopback_queue; +static struct timer_list loopback_timer; + +static void nr_set_loopback_timer(void); + +void nr_loopback_init(void) +{ + skb_queue_head_init(&loopback_queue); + + init_timer(&loopback_timer); +} + +static int nr_loopback_running(void) +{ + return (loopback_timer.prev != NULL || loopback_timer.next != NULL); +} + +int nr_loopback_queue(struct sk_buff *skb) +{ + struct sk_buff *skbn; + + skbn = skb_clone(skb, GFP_ATOMIC); + + kfree_skb(skb); + + if (skbn != NULL) { + skb_queue_tail(&loopback_queue, skbn); + + if (!nr_loopback_running()) + nr_set_loopback_timer(); + } + + return 1; +} + +static void nr_loopback_timer(unsigned long); + +static void nr_set_loopback_timer(void) +{ + del_timer(&loopback_timer); + + loopback_timer.data = 0; + loopback_timer.function = &nr_loopback_timer; + loopback_timer.expires = jiffies + 10; + + add_timer(&loopback_timer); +} + +static void nr_loopback_timer(unsigned long param) +{ + struct sk_buff *skb; + ax25_address *nr_dest; + struct device *dev; + + while ((skb = skb_dequeue(&loopback_queue)) != NULL) { + nr_dest = (ax25_address *)(skb->data + 7); + + if ((dev = nr_dev_get(nr_dest)) == NULL) { + kfree_skb(skb); + continue; + } + + if (nr_rx_frame(skb, dev) == 0) + kfree_skb(skb); + } +} + +#ifdef MODULE + +void nr_loopback_clear(void) +{ + struct sk_buff *skb; + + del_timer(&loopback_timer); + + while ((skb = skb_dequeue(&loopback_queue)) != NULL) + kfree_skb(skb); +} + +#endif + +#endif diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index ffbb240c4..26f5ac8dd 100644 --- 
a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -697,8 +697,12 @@ int nr_route_frame(struct sk_buff *skb, ax25_cb *ax25) nr_add_node(nr_src, "", &ax25->dest_addr, ax25->digipeat, ax25->ax25_dev->dev, 0, sysctl_netrom_obsolescence_count_initialiser); - if ((dev = nr_dev_get(nr_dest)) != NULL) /* Its for me */ - return nr_rx_frame(skb, dev); + if ((dev = nr_dev_get(nr_dest)) != NULL) { /* Its for me */ + if (ax25 == NULL) /* Its from me */ + return nr_loopback_queue(skb); + else + return nr_rx_frame(skb, dev); + } if (!sysctl_netrom_routing_control && ax25 != NULL) return 0; diff --git a/net/netrom/nr_subr.c b/net/netrom/nr_subr.c index 7ae69fe07..096ca3a8f 100644 --- a/net/netrom/nr_subr.c +++ b/net/netrom/nr_subr.c @@ -229,7 +229,7 @@ void nr_write_internal(struct sock *sk, int frametype) * This routine is called when a Connect Acknowledge with the Choke Flag * set is needed to refuse a connection. */ -void nr_transmit_dm(struct sk_buff *skb) +void nr_transmit_refusal(struct sk_buff *skb, int mine) { struct sk_buff *skbn; unsigned char *dptr; @@ -258,10 +258,18 @@ void nr_transmit_dm(struct sk_buff *skb) *dptr++ = sysctl_netrom_network_ttl_initialiser; - *dptr++ = skb->data[15]; - *dptr++ = skb->data[16]; - *dptr++ = 0; - *dptr++ = 0; + if (mine) { + *dptr++ = 0; + *dptr++ = 0; + *dptr++ = skb->data[15]; + *dptr++ = skb->data[16]; + } else { + *dptr++ = skb->data[15]; + *dptr++ = skb->data[16]; + *dptr++ = 0; + *dptr++ = 0; + } + *dptr++ = NR_CONNACK | NR_CHOKE_FLAG; *dptr++ = 0; diff --git a/net/netsyms.c b/net/netsyms.c index 9ce58d285..5d380fbb6 100644 --- a/net/netsyms.c +++ b/net/netsyms.c @@ -15,6 +15,7 @@ #include <linux/trdevice.h> #include <linux/ioport.h> #include <net/neighbour.h> +#include <net/snmp.h> #ifdef CONFIG_INET #include <linux/ip.h> @@ -41,6 +42,8 @@ extern struct net_proto_family inet_family_ops; #include <net/ndisc.h> #include <net/dst.h> #include <net/transp_v6.h> + +extern int tcp_tw_death_row_slot; #endif #endif @@ -70,6 +73,12 @@ extern void destroy_8023_client(struct datalink_proto *); #include <net/sock.h> #endif +#ifdef CONFIG_SYSCTL +extern int sysctl_max_syn_backlog; +#endif + +EXPORT_SYMBOL(dev_lockct); + /* Skbuff symbols. 
*/ EXPORT_SYMBOL(skb_push_errstr); EXPORT_SYMBOL(skb_put_errstr); @@ -120,6 +129,7 @@ EXPORT_SYMBOL(put_cmsg); EXPORT_SYMBOL(net_families); EXPORT_SYMBOL(sock_kmalloc); EXPORT_SYMBOL(sock_kfree_s); +EXPORT_SYMBOL(skb_queue_lock); #ifdef CONFIG_FILTER EXPORT_SYMBOL(sk_run_filter); @@ -176,17 +186,10 @@ EXPORT_SYMBOL(make_EII_client); EXPORT_SYMBOL(destroy_EII_client); #endif -#ifdef CONFIG_ATALK_MODULE EXPORT_SYMBOL(sklist_destroy_socket); -#endif - -#if defined(CONFIG_ATALK_MODULE) || defined(CONFIG_PACKET_MODULE) EXPORT_SYMBOL(sklist_insert_socket); -#endif -#ifdef CONFIG_SMB_FS_MODULE EXPORT_SYMBOL(scm_detach_fds); -#endif #ifdef CONFIG_INET /* Internet layer registration */ @@ -210,6 +213,7 @@ EXPORT_SYMBOL(ip_mc_inc_group); EXPORT_SYMBOL(ip_mc_dec_group); EXPORT_SYMBOL(__ip_finish_output); EXPORT_SYMBOL(inet_dgram_ops); +EXPORT_SYMBOL(__release_sock); /* needed for ip_gre -cw */ EXPORT_SYMBOL(ip_statistics); @@ -241,11 +245,8 @@ EXPORT_SYMBOL(destroy_sock); EXPORT_SYMBOL(ip_queue_xmit); EXPORT_SYMBOL(memcpy_fromiovecend); EXPORT_SYMBOL(csum_partial_copy_fromiovecend); -EXPORT_SYMBOL(__release_sock); EXPORT_SYMBOL(net_timer); /* UDP/TCP exported functions for TCPv6 */ -EXPORT_SYMBOL(sysctl_tcp_timestamps); -EXPORT_SYMBOL(sysctl_tcp_window_scaling); EXPORT_SYMBOL(sock_rspace); EXPORT_SYMBOL(udp_ioctl); EXPORT_SYMBOL(udp_connect); @@ -293,6 +294,8 @@ EXPORT_SYMBOL(tcp_simple_retransmit); EXPORT_SYMBOL(tcp_transmit_skb); EXPORT_SYMBOL(tcp_connect); EXPORT_SYMBOL(tcp_make_synack); +EXPORT_SYMBOL(tcp_tw_death_row_slot); +EXPORT_SYMBOL(net_statistics); EXPORT_SYMBOL(xrlim_allow); @@ -300,6 +303,9 @@ EXPORT_SYMBOL(tcp_write_xmit); EXPORT_SYMBOL(dev_loopback_xmit); EXPORT_SYMBOL(tcp_regs); +#ifdef CONFIG_SYSCTL +EXPORT_SYMBOL(sysctl_max_syn_backlog); +#endif #endif #ifdef CONFIG_NETLINK @@ -317,6 +323,7 @@ EXPORT_SYMBOL(netlink_post); #endif #ifdef CONFIG_RTNETLINK +EXPORT_SYMBOL(rtattr_parse); EXPORT_SYMBOL(rtnetlink_links); EXPORT_SYMBOL(__rta_fill); EXPORT_SYMBOL(rtnetlink_dump_ifinfo); @@ -327,18 +334,16 @@ EXPORT_SYMBOL(neigh_add); EXPORT_SYMBOL(neigh_dump_info); #endif -#ifdef CONFIG_PACKET_MODULE EXPORT_SYMBOL(dev_set_allmulti); EXPORT_SYMBOL(dev_set_promiscuity); EXPORT_SYMBOL(sklist_remove_socket); EXPORT_SYMBOL(rtnl_wait); EXPORT_SYMBOL(rtnl_rlockct); -#endif +EXPORT_SYMBOL(rtnl_lock); +EXPORT_SYMBOL(rtnl_unlock); -#if defined(CONFIG_IPV6_MODULE) || defined(CONFIG_PACKET_MODULE) -EXPORT_SYMBOL(dev_lockct); EXPORT_SYMBOL(sock_wmalloc); -#endif +EXPORT_SYMBOL(sock_rmalloc); #if defined(CONFIG_ULTRA) || defined(CONFIG_WD80x3) || \ defined(CONFIG_EL2) || defined(CONFIG_NE2000) || \ @@ -424,9 +429,6 @@ EXPORT_SYMBOL(ip_rcv); EXPORT_SYMBOL(arp_rcv); EXPORT_SYMBOL(dev_mc_delete); -EXPORT_SYMBOL(rtnl_lock); -EXPORT_SYMBOL(rtnl_unlock); - EXPORT_SYMBOL(if_port_text); #if defined(CONFIG_ATALK) || defined(CONFIG_ATALK_MODULE) @@ -444,9 +446,31 @@ EXPORT_SYMBOL(qdisc_destroy); EXPORT_SYMBOL(qdisc_reset); EXPORT_SYMBOL(qdisc_restart); EXPORT_SYMBOL(qdisc_head); +EXPORT_SYMBOL(qdisc_create_dflt); +EXPORT_SYMBOL(noop_qdisc); +#ifdef CONFIG_NET_SCHED +EXPORT_SYMBOL(pfifo_qdisc_ops); EXPORT_SYMBOL(register_qdisc); EXPORT_SYMBOL(unregister_qdisc); -EXPORT_SYMBOL(noop_qdisc); +EXPORT_SYMBOL(qdisc_get_rtab); +EXPORT_SYMBOL(qdisc_put_rtab); +#ifdef CONFIG_NET_ESTIMATOR +EXPORT_SYMBOL(qdisc_new_estimator); +EXPORT_SYMBOL(qdisc_kill_estimator); +#endif +#ifdef CONFIG_NET_POLICE +EXPORT_SYMBOL(tcf_police); +EXPORT_SYMBOL(tcf_police_locate); +EXPORT_SYMBOL(tcf_police_destroy); +#ifdef CONFIG_RTNETLINK 
+EXPORT_SYMBOL(tcf_police_dump); +#endif +#endif +#endif +#ifdef CONFIG_NET_CLS +EXPORT_SYMBOL(register_tcf_proto_ops); +EXPORT_SYMBOL(unregister_tcf_proto_ops); +#endif EXPORT_SYMBOL(register_gifconf); diff --git a/net/rose/Makefile b/net/rose/Makefile index 7eb55881e..de3f1b257 100644 --- a/net/rose/Makefile +++ b/net/rose/Makefile @@ -8,7 +8,8 @@ # Note 2! The CFLAGS definition is now in the main makefile... O_TARGET := rose.o -O_OBJS := af_rose.o rose_dev.o rose_in.o rose_link.o rose_out.o rose_route.o rose_subr.o rose_timer.o +O_OBJS := af_rose.o rose_dev.o rose_in.o rose_link.o rose_loopback.o \ + rose_out.o rose_route.o rose_subr.o rose_timer.o M_OBJS := $(O_TARGET) ifeq ($(CONFIG_SYSCTL),y) diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index a575402c7..286a2aa68 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1261,7 +1261,7 @@ static int rose_get_info(char *buffer, char **start, off_t offset, int length, i cli(); - len += sprintf(buffer, "dest_addr dest_call src_addr src_call dev lci st vs vr va t t1 t2 t3 hb idle Snd-Q Rcv-Q\n"); + len += sprintf(buffer, "dest_addr dest_call src_addr src_call dev lci st vs vr va t t1 t2 t3 hb idle Snd-Q Rcv-Q inode\n"); for (s = rose_list; s != NULL; s = s->next) { if ((dev = s->protinfo.rose->device) == NULL) @@ -1278,7 +1278,7 @@ static int rose_get_info(char *buffer, char **start, off_t offset, int length, i else callsign = ax2asc(&s->protinfo.rose->source_call); - len += sprintf(buffer + len, "%-10s %-9s %-5s %3.3X %d %d %d %d %3lu %3lu %3lu %3lu %3lu %3lu/%03lu %5d %5d\n", + len += sprintf(buffer + len, "%-10s %-9s %-5s %3.3X %d %d %d %d %3lu %3lu %3lu %3lu %3lu %3lu/%03lu %5d %5d %ld\n", rose2asc(&s->protinfo.rose->source_addr), callsign, devname, @@ -1295,7 +1295,8 @@ static int rose_get_info(char *buffer, char **start, off_t offset, int length, i ax25_display_timer(&s->protinfo.rose->idletimer) / (60 * HZ), s->protinfo.rose->idle / (60 * HZ), atomic_read(&s->wmem_alloc), - atomic_read(&s->rmem_alloc)); + atomic_read(&s->rmem_alloc), + s->socket != NULL ? 
s->socket->inode->i_ino : 0L); pos = begin + len; @@ -1408,6 +1409,9 @@ __initfunc(void rose_proto_init(struct net_proto *pro)) #ifdef CONFIG_SYSCTL rose_register_sysctl(); #endif + rose_loopback_init(); + + rose_add_loopback_neigh(); #ifdef CONFIG_PROC_FS proc_net_register(&proc_net_rose); @@ -1443,6 +1447,8 @@ void cleanup_module(void) proc_net_unregister(PROC_NET_RS_NODES); proc_net_unregister(PROC_NET_RS_ROUTES); #endif + rose_loopback_clear(); + rose_rt_free(); ax25_protocol_release(AX25_P_ROSE); diff --git a/net/rose/rose_dev.c b/net/rose/rose_dev.c index 0cc81c464..702a55931 100644 --- a/net/rose/rose_dev.c +++ b/net/rose/rose_dev.c @@ -134,11 +134,11 @@ static int rose_set_mac_address(struct device *dev, void *addr) { struct sockaddr *sa = addr; - ax25_listen_release((ax25_address *)dev->dev_addr, NULL); + rose_del_loopback_node((rose_address *)dev->dev_addr); memcpy(dev->dev_addr, sa->sa_data, dev->addr_len); - ax25_listen_register((ax25_address *)dev->dev_addr, NULL); + rose_add_loopback_node((rose_address *)dev->dev_addr); return 0; } @@ -150,7 +150,7 @@ static int rose_open(struct device *dev) MOD_INC_USE_COUNT; - ax25_listen_register((ax25_address *)dev->dev_addr, NULL); + rose_add_loopback_node((rose_address *)dev->dev_addr); return 0; } @@ -162,7 +162,7 @@ static int rose_close(struct device *dev) MOD_DEC_USE_COUNT; - ax25_listen_release((ax25_address *)dev->dev_addr, NULL); + rose_del_loopback_node((rose_address *)dev->dev_addr); return 0; } diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c index de412d3c4..be86c9e16 100644 --- a/net/rose/rose_in.c +++ b/net/rose/rose_in.c @@ -141,10 +141,6 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety case ROSE_RR: case ROSE_RNR: - if (frametype == ROSE_RNR) - sk->protinfo.rose->condition |= ROSE_COND_PEER_RX_BUSY; - else - sk->protinfo.rose->condition &= ~ROSE_COND_PEER_RX_BUSY; if (!rose_validate_nr(sk, nr)) { rose_write_internal(sk, ROSE_RESET_REQUEST); sk->protinfo.rose->condition = 0x00; @@ -157,8 +153,11 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety rose_stop_idletimer(sk); } else { rose_frames_acked(sk, nr); - if (frametype == ROSE_RNR) - rose_requeue_frames(sk); + if (frametype == ROSE_RNR) { + sk->protinfo.rose->condition |= ROSE_COND_PEER_RX_BUSY; + } else { + sk->protinfo.rose->condition &= ~ROSE_COND_PEER_RX_BUSY; + } } break; @@ -177,16 +176,26 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety break; } rose_frames_acked(sk, nr); - if (sk->protinfo.rose->condition & ROSE_COND_OWN_RX_BUSY) - break; if (ns == sk->protinfo.rose->vr) { rose_start_idletimer(sk); if (sock_queue_rcv_skb(sk, skb) == 0) { sk->protinfo.rose->vr = (sk->protinfo.rose->vr + 1) % ROSE_MODULUS; queued = 1; } else { - sk->protinfo.rose->condition |= ROSE_COND_OWN_RX_BUSY; + /* Should never happen ! 
*/ + rose_write_internal(sk, ROSE_RESET_REQUEST); + sk->protinfo.rose->condition = 0x00; + sk->protinfo.rose->vs = 0; + sk->protinfo.rose->vr = 0; + sk->protinfo.rose->va = 0; + sk->protinfo.rose->vl = 0; + sk->protinfo.rose->state = ROSE_STATE_4; + rose_start_t2timer(sk); + rose_stop_idletimer(sk); + break; } + if (atomic_read(&sk->rmem_alloc) > (sk->rcvbuf / 2)) + sk->protinfo.rose->condition |= ROSE_COND_OWN_RX_BUSY; } /* * If the window is full, ack the frame, else start the diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c index c462fa696..33cc2f990 100644 --- a/net/rose/rose_link.c +++ b/net/rose/rose_link.c @@ -113,7 +113,7 @@ static int rose_send_frame(struct sk_buff *skb, struct rose_neigh *neigh) else rose_call = &rose_callsign; - neigh->ax25 = ax25_send_frame(skb, 0, rose_call, &neigh->callsign, neigh->digipeat, neigh->dev); + neigh->ax25 = ax25_send_frame(skb, 260, rose_call, &neigh->callsign, neigh->digipeat, neigh->dev); return (neigh->ax25 != NULL); } @@ -296,6 +296,11 @@ void rose_transmit_link(struct sk_buff *skb, struct rose_neigh *neigh) return; } + if (neigh->loopback) { + rose_loopback_queue(skb, neigh); + return; + } + if (!rose_link_up(neigh)) neigh->restarted = 0; diff --git a/net/rose/rose_loopback.c b/net/rose/rose_loopback.c new file mode 100644 index 000000000..ce66a9911 --- /dev/null +++ b/net/rose/rose_loopback.c @@ -0,0 +1,126 @@ +/* + * ROSE release 003 + * + * This code REQUIRES 2.1.15 or higher/ NET3.038 + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * ROSE 003 Jonathan(G4KLX) Created this file from nr_loopback.c. 
+ * + */ + +#include <linux/config.h> +#if defined(CONFIG_ROSE) || defined(CONFIG_ROSE_MODULE) +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/timer.h> +#include <net/ax25.h> +#include <linux/skbuff.h> +#include <net/rose.h> + +static struct sk_buff_head loopback_queue; +static struct timer_list loopback_timer; + +static void rose_set_loopback_timer(void); + +void rose_loopback_init(void) +{ + skb_queue_head_init(&loopback_queue); + + init_timer(&loopback_timer); +} + +static int rose_loopback_running(void) +{ + return (loopback_timer.prev != NULL || loopback_timer.next != NULL); +} + +int rose_loopback_queue(struct sk_buff *skb, struct rose_neigh *neigh) +{ + struct sk_buff *skbn; + + skbn = skb_clone(skb, GFP_ATOMIC); + + kfree_skb(skb); + + if (skbn != NULL) { + skb_queue_tail(&loopback_queue, skbn); + + if (!rose_loopback_running()) + rose_set_loopback_timer(); + } + + return 1; +} + +static void rose_loopback_timer(unsigned long); + +static void rose_set_loopback_timer(void) +{ + del_timer(&loopback_timer); + + loopback_timer.data = 0; + loopback_timer.function = &rose_loopback_timer; + loopback_timer.expires = jiffies + 10; + + add_timer(&loopback_timer); +} + +static void rose_loopback_timer(unsigned long param) +{ + struct sk_buff *skb; + struct device *dev; + rose_address *dest; + struct sock *sk; + unsigned short frametype; + unsigned int lci_i, lci_o; + + while ((skb = skb_dequeue(&loopback_queue)) != NULL) { + lci_i = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF); + frametype = skb->data[2]; + dest = (rose_address *)(skb->data + 4); + lci_o = sysctl_rose_maximum_vcs - lci_i + 1; + + skb->h.raw = skb->data; + + if ((sk = rose_find_socket(lci_o, rose_loopback_neigh)) != NULL) { + if (rose_process_rx_frame(sk, skb) == 0) + kfree_skb(skb); + continue; + } + + if (frametype == ROSE_CALL_REQUEST) { + if ((dev = rose_dev_get(dest)) != NULL) { + if (rose_rx_call_request(skb, dev, rose_loopback_neigh, lci_o) == 0) + kfree_skb(skb); + } else { + kfree_skb(skb); + } + } else { + kfree_skb(skb); + } + } +} + +#ifdef MODULE + +void rose_loopback_clear(void) +{ + struct sk_buff *skb; + + del_timer(&loopback_timer); + + while ((skb = skb_dequeue(&loopback_queue)) != NULL) { + skb->sk = NULL; + kfree_skb(skb); + } +} + +#endif + +#endif diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 917846bf7..2d6d23230 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -55,6 +55,8 @@ static struct rose_node *rose_node_list = NULL; static struct rose_neigh *rose_neigh_list = NULL; static struct rose_route *rose_route_list = NULL; +struct rose_neigh *rose_loopback_neigh = NULL; + static void rose_remove_neigh(struct rose_neigh *); /* @@ -72,6 +74,9 @@ static int rose_add_node(struct rose_route_struct *rose_route, struct device *de if ((rose_node->mask == rose_route->mask) && (rosecmpm(&rose_route->address, &rose_node->address, rose_route->mask) == 0)) break; + if (rose_node != NULL && rose_node->loopback) + return -EINVAL; + for (rose_neigh = rose_neigh_list; rose_neigh != NULL; rose_neigh = rose_neigh->next) if (ax25cmp(&rose_route->neighbour, &rose_neigh->callsign) == 0 && rose_neigh->dev == dev) break; @@ -87,6 +92,7 @@ static int rose_add_node(struct rose_route_struct *rose_route, struct device *de rose_neigh->count = 0; rose_neigh->use = 0; rose_neigh->dce_mode = 0; + rose_neigh->loopback = 0; rose_neigh->number = rose_neigh_no++; rose_neigh->restarted = 0; @@ -123,6 +129,7 @@ static int rose_add_node(struct rose_route_struct 
*rose_route, struct device *de rose_node->address = rose_route->address; rose_node->mask = rose_route->mask; rose_node->count = 1; + rose_node->loopback = 0; rose_node->neighbour[0] = rose_neigh; save_flags(flags); cli(); @@ -263,6 +270,8 @@ static int rose_del_node(struct rose_route_struct *rose_route, struct device *de if (rose_node == NULL) return -EINVAL; + if (rose_node->loopback) return -EINVAL; + for (rose_neigh = rose_neigh_list; rose_neigh != NULL; rose_neigh = rose_neigh->next) if (ax25cmp(&rose_route->neighbour, &rose_neigh->callsign) == 0 && rose_neigh->dev == dev) break; @@ -299,6 +308,86 @@ static int rose_del_node(struct rose_route_struct *rose_route, struct device *de } /* + * Add the loopback neighbour. + */ +int rose_add_loopback_neigh(void) +{ + unsigned long flags; + + if ((rose_loopback_neigh = kmalloc(sizeof(struct rose_neigh), GFP_ATOMIC)) == NULL) + return -ENOMEM; + + rose_loopback_neigh->callsign = null_ax25_address; + rose_loopback_neigh->digipeat = NULL; + rose_loopback_neigh->ax25 = NULL; + rose_loopback_neigh->dev = NULL; + rose_loopback_neigh->count = 0; + rose_loopback_neigh->use = 0; + rose_loopback_neigh->dce_mode = 1; + rose_loopback_neigh->loopback = 1; + rose_loopback_neigh->number = rose_neigh_no++; + rose_loopback_neigh->restarted = 1; + + save_flags(flags); cli(); + rose_loopback_neigh->next = rose_neigh_list; + rose_neigh_list = rose_loopback_neigh; + restore_flags(flags); + + return 0; +} + +/* + * Add a loopback node. + */ +int rose_add_loopback_node(rose_address *address) +{ + struct rose_node *rose_node; + unsigned long flags; + + for (rose_node = rose_node_list; rose_node != NULL; rose_node = rose_node->next) + if ((rose_node->mask == 10) && (rosecmpm(address, &rose_node->address, 10) == 0) && rose_node->loopback) + break; + + if (rose_node != NULL) return 0; + + if ((rose_node = kmalloc(sizeof(*rose_node), GFP_ATOMIC)) == NULL) + return -ENOMEM; + + rose_node->address = *address; + rose_node->mask = 10; + rose_node->count = 1; + rose_node->loopback = 1; + rose_node->neighbour[0] = rose_loopback_neigh; + + save_flags(flags); cli(); + rose_node->next = rose_node_list; + rose_node_list = rose_node; + restore_flags(flags); + + rose_loopback_neigh->count++; + + return 0; +} + +/* + * Delete a loopback node. + */ +void rose_del_loopback_node(rose_address *address) +{ + struct rose_node *rose_node; + + for (rose_node = rose_node_list; rose_node != NULL; rose_node = rose_node->next) + if ((rose_node->mask == 10) && (rosecmpm(address, &rose_node->address, 10) == 0) && rose_node->loopback) + break; + + if (rose_node == NULL) return; + + rose_remove_node(rose_node); + + rose_loopback_neigh->count--; +} + +/* * A device has been removed. Remove its routes and neighbours. 
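[Editor's note] The loopback support added above pairs the two ends of an internal call by mirroring the logical channel: rose_loopback_timer() derives the outgoing LCI as sysctl_rose_maximum_vcs - lci_i + 1. A small stand-alone sketch of that mapping, assuming an example maximum of 50 virtual circuits (the real value comes from the sysctl); it also checks that the mapping is its own inverse.

/* Illustrative model: the loopback code above pairs local circuit lci_i
 * with its mirror lci_o = max_vcs - lci_i + 1, so the two ends of an
 * internal call never collide on the same channel number. */
#include <assert.h>
#include <stdio.h>

static unsigned int mirror_lci(unsigned int lci, unsigned int max_vcs)
{
    return max_vcs - lci + 1;
}

int main(void)
{
    unsigned int max_vcs = 50;   /* stands in for sysctl_rose_maximum_vcs */
    for (unsigned int lci = 1; lci <= max_vcs; lci++) {
        unsigned int other = mirror_lci(lci, max_vcs);
        assert(other >= 1 && other <= max_vcs);
        assert(mirror_lci(other, max_vcs) == lci);  /* its own inverse */
    }
    printf("lci 1 <-> %u, lci %u <-> 1\n", mirror_lci(1, max_vcs), max_vcs);
    return 0;
}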
*/ void rose_rt_device_down(struct device *dev) @@ -723,16 +812,12 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) rosecmp(src_addr, &rose_route->src_addr) == 0 && ax25cmp(&facilities.dest_call, &rose_route->src_call) == 0 && ax25cmp(&facilities.source_call, &rose_route->dest_call) == 0) { - printk(KERN_DEBUG "ROSE: routing loop from %s\n", rose2asc(src_addr)); - printk(KERN_DEBUG "ROSE: to %s\n", rose2asc(dest_addr)); rose_transmit_clear_request(rose_neigh, lci, ROSE_NOT_OBTAINABLE, 120); return 0; } } if ((new_neigh = rose_get_neigh(dest_addr, &cause, &diagnostic)) == NULL) { - if (cause == ROSE_NOT_OBTAINABLE) - printk(KERN_DEBUG "ROSE: no route to %s\n", rose2asc(dest_addr)); rose_transmit_clear_request(rose_neigh, lci, cause, diagnostic); return 0; } @@ -788,16 +873,22 @@ int rose_nodes_get_info(char *buffer, char **start, off_t offset, len += sprintf(buffer, "address mask n neigh neigh neigh\n"); for (rose_node = rose_node_list; rose_node != NULL; rose_node = rose_node->next) { - len += sprintf(buffer + len, "%-10s %04d %d", - rose2asc(&rose_node->address), - rose_node->mask, - rose_node->count); + if (rose_node->loopback) { + len += sprintf(buffer + len, "%-10s %04d 1 loopback\n", + rose2asc(&rose_node->address), + rose_node->mask); + } else { + len += sprintf(buffer + len, "%-10s %04d %d", + rose2asc(&rose_node->address), + rose_node->mask, + rose_node->count); - for (i = 0; i < rose_node->count; i++) - len += sprintf(buffer + len, " %05d", - rose_node->neighbour[i]->number); + for (i = 0; i < rose_node->count; i++) + len += sprintf(buffer + len, " %05d", + rose_node->neighbour[i]->number); - len += sprintf(buffer + len, "\n"); + len += sprintf(buffer + len, "\n"); + } pos = begin + len; @@ -834,33 +925,35 @@ int rose_neigh_get_info(char *buffer, char **start, off_t offset, len += sprintf(buffer, "addr callsign dev count use mode restart t0 tf digipeaters\n"); for (rose_neigh = rose_neigh_list; rose_neigh != NULL; rose_neigh = rose_neigh->next) { - len += sprintf(buffer + len, "%05d %-9s %-4s %3d %3d %3s %3s %3lu %3lu", - rose_neigh->number, - ax2asc(&rose_neigh->callsign), - rose_neigh->dev ? rose_neigh->dev->name : "???", - rose_neigh->count, - rose_neigh->use, - (rose_neigh->dce_mode) ? "DCE" : "DTE", - (rose_neigh->restarted) ? "yes" : "no", - ax25_display_timer(&rose_neigh->t0timer) / HZ, - ax25_display_timer(&rose_neigh->ftimer) / HZ); - - if (rose_neigh->digipeat != NULL) { - for (i = 0; i < rose_neigh->digipeat->ndigi; i++) - len += sprintf(buffer + len, " %s", ax2asc(&rose_neigh->digipeat->calls[i])); - } + if (!rose_neigh->loopback) { + len += sprintf(buffer + len, "%05d %-9s %-4s %3d %3d %3s %3s %3lu %3lu", + rose_neigh->number, + ax2asc(&rose_neigh->callsign), + rose_neigh->dev ? rose_neigh->dev->name : "???", + rose_neigh->count, + rose_neigh->use, + (rose_neigh->dce_mode) ? "DCE" : "DTE", + (rose_neigh->restarted) ? 
"yes" : "no", + ax25_display_timer(&rose_neigh->t0timer) / HZ, + ax25_display_timer(&rose_neigh->ftimer) / HZ); + + if (rose_neigh->digipeat != NULL) { + for (i = 0; i < rose_neigh->digipeat->ndigi; i++) + len += sprintf(buffer + len, " %s", ax2asc(&rose_neigh->digipeat->calls[i])); + } - len += sprintf(buffer + len, "\n"); + len += sprintf(buffer + len, "\n"); - pos = begin + len; + pos = begin + len; - if (pos < offset) { - len = 0; - begin = pos; - } + if (pos < offset) { + len = 0; + begin = pos; + } - if (pos > offset + length) - break; + if (pos > offset + length) + break; + } } sti(); diff --git a/net/sched/Config.in b/net/sched/Config.in index d1287a781..052b62281 100644 --- a/net/sched/Config.in +++ b/net/sched/Config.in @@ -3,9 +3,28 @@ # tristate 'CBQ packet scheduler' CONFIG_NET_SCH_CBQ tristate 'CSZ packet scheduler' CONFIG_NET_SCH_CSZ -#tristate 'HFQ packet scheduler' CONFIG_NET_SCH_HFQ -tristate 'RED queueing discipline' CONFIG_NET_SCH_RED -tristate 'SFQ queueing discipline' CONFIG_NET_SCH_SFQ -tristate 'auxiliary TBF queue' CONFIG_NET_SCH_TBF -tristate 'auxiliary FIFO queue' CONFIG_NET_SCH_PFIFO -tristate 'auxiliary PRIO queue' CONFIG_NET_SCH_PRIO +#tristate 'H-PFQ packet scheduler' CONFIG_NET_SCH_HPFQ +#tristate 'H-FSC packet scheduler' CONFIG_NET_SCH_HFCS +tristate 'The simplest PRIO pseudoscheduler' CONFIG_NET_SCH_PRIO +tristate 'RED queue' CONFIG_NET_SCH_RED +tristate 'SFQ queue' CONFIG_NET_SCH_SFQ +tristate 'TEQL queue' CONFIG_NET_SCH_TEQL +tristate 'TBF queue' CONFIG_NET_SCH_TBF +bool 'QoS support' CONFIG_NET_QOS +if [ "$CONFIG_NET_QOS" = "y" ]; then + bool 'Rate estimator' CONFIG_NET_ESTIMATOR +fi +if [ "$CONFIG_IP_MULTIPLE_TABLES" = "y" ]; then + bool 'Packet classifier API' CONFIG_NET_CLS +fi +if [ "$CONFIG_NET_CLS" = "y" ]; then + bool 'Routing tables based classifier' CONFIG_NET_CLS_ROUTE +# bool 'Firewall based classifier' CONFIG_NET_CLS_FW + tristate 'U32 classifier' CONFIG_NET_CLS_U32 + if [ "$CONFIG_NET_QOS" = "y" ]; then + tristate 'Special RSVP classifier' CONFIG_NET_CLS_RSVP + tristate 'Special RSVP classifier for IPv6' CONFIG_NET_CLS_RSVP6 + bool 'Ingres traffic policing' CONFIG_NET_CLS_POLICE + fi +fi + diff --git a/net/sched/Makefile b/net/sched/Makefile index cbb6704c1..21a1cf07a 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -11,6 +11,23 @@ O_TARGET := sched.o O_OBJS := sch_generic.o +ifeq ($(CONFIG_NET_SCHED), y) + +O_OBJS += sch_api.o sch_fifo.o + +ifeq ($(CONFIG_NET_ESTIMATOR), y) +O_OBJS += estimator.o +endif + +ifeq ($(CONFIG_NET_CLS), y) +O_OBJS += cls_api.o + +ifeq ($(CONFIG_NET_CLS_POLICE), y) +O_OBJS += police.o +endif + +endif + ifeq ($(CONFIG_NET_SCH_CBQ), y) O_OBJS += sch_cbq.o else @@ -27,6 +44,23 @@ else endif endif +ifeq ($(CONFIG_NET_SCH_HPFQ), y) +O_OBJS += sch_hpfq.o +else + ifeq ($(CONFIG_NET_SCH_HPFQ), m) + M_OBJS += sch_hpfq.o + endif +endif + +ifeq ($(CONFIG_NET_SCH_HFSC), y) +O_OBJS += sch_hfsc.o +else + ifeq ($(CONFIG_NET_SCH_HFSC), m) + M_OBJS += sch_hfsc.o + endif +endif + + ifeq ($(CONFIG_NET_SCH_SFQ), y) O_OBJS += sch_sfq.o else @@ -51,21 +85,54 @@ else endif endif +ifeq ($(CONFIG_NET_SCH_PRIO), y) +O_OBJS += sch_prio.o +else + ifeq ($(CONFIG_NET_SCH_PRIO), m) + M_OBJS += sch_prio.o + endif +endif -ifeq ($(CONFIG_NET_SCH_PFIFO), y) -O_OBJS += sch_fifo.o +ifeq ($(CONFIG_NET_SCH_TEQL), y) +O_OBJS += sch_teql.o else - ifeq ($(CONFIG_NET_SCH_PFIFO), m) - M_OBJS += sch_fifo.o + ifeq ($(CONFIG_NET_SCH_TEQL), m) + M_OBJS += sch_teql.o endif endif -ifeq ($(CONFIG_NET_SCH_PRIO), y) -O_OBJS += sch_prio.o +ifeq 
($(CONFIG_NET_CLS_U32), y) +O_OBJS += cls_u32.o else - ifeq ($(CONFIG_NET_SCH_PRIO), m) - M_OBJS += sch_prio.o + ifeq ($(CONFIG_NET_CLS_U32), m) + M_OBJS += cls_u32.o + endif +endif + +ifeq ($(CONFIG_NET_CLS_RSVP), y) +O_OBJS += cls_rsvp.o +else + ifeq ($(CONFIG_NET_CLS_RSVP), m) + M_OBJS += cls_rsvp.o endif endif +ifeq ($(CONFIG_NET_CLS_RSVP6), y) +O_OBJS += cls_rsvp6.o +else + ifeq ($(CONFIG_NET_CLS_RSVP6), m) + M_OBJS += cls_rsvp6.o + endif +endif + +ifeq ($(CONFIG_NET_CLS_ROUTE), y) +O_OBJS += cls_route.o +endif + +ifeq ($(CONFIG_NET_CLS_FW), y) +O_OBJS += cls_fw.o +endif + +endif + include $(TOPDIR)/Rules.make diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c new file mode 100644 index 000000000..6eae05d7b --- /dev/null +++ b/net/sched/cls_api.c @@ -0,0 +1,432 @@ +/* + * net/sched/cls_api.c Packet classifier API. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +/* The list of all installed classifier types */ + +static struct tcf_proto_ops *tcf_proto_base; + + +/* Find classifier type by string name */ + +struct tcf_proto_ops * tcf_proto_lookup_ops(struct rtattr *kind) +{ + struct tcf_proto_ops *t; + + if (kind) { + for (t = tcf_proto_base; t; t = t->next) { + if (rtattr_strcmp(kind, t->kind) == 0) + return t; + } + } + return NULL; +} + +/* Register(unregister) new classifier type */ + +int register_tcf_proto_ops(struct tcf_proto_ops *ops) +{ + struct tcf_proto_ops *t, **tp; + + for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next) + if (strcmp(ops->kind, t->kind) == 0) + return -EEXIST; + + ops->next = NULL; + *tp = ops; + return 0; +} + +int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) +{ + struct tcf_proto_ops *t, **tp; + + for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next) + if (t == ops) + break; + + if (!t) + return -ENOENT; + *tp = t->next; + return 0; +} + +#ifdef CONFIG_RTNETLINK + +static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, + struct tcf_proto *tp, unsigned long fh, int event); + + +/* Select new prio value from the range, managed by kernel. 
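[Editor's note] cls_api.c above keeps the installed classifier types on a singly linked list keyed by their "kind" string, with duplicate registration refused. The following user-space sketch models just that register/lookup idiom; the cls_ops structure and function names are simplified stand-ins for tcf_proto_ops and its helpers, not the kernel API.

/* Illustrative model of the classifier-type registry in cls_api.c. */
#include <stdio.h>
#include <string.h>

struct cls_ops {
    struct cls_ops *next;
    const char *kind;
};

static struct cls_ops *cls_base;

static int cls_register(struct cls_ops *ops)
{
    struct cls_ops **tp;
    for (tp = &cls_base; *tp; tp = &(*tp)->next)
        if (strcmp(ops->kind, (*tp)->kind) == 0)
            return -1;            /* -EEXIST in the kernel version */
    ops->next = NULL;
    *tp = ops;                    /* append at the tail */
    return 0;
}

static struct cls_ops *cls_lookup(const char *kind)
{
    struct cls_ops *t;
    for (t = cls_base; t; t = t->next)
        if (strcmp(kind, t->kind) == 0)
            return t;
    return NULL;
}

int main(void)
{
    static struct cls_ops u32_ops = { NULL, "u32" };
    static struct cls_ops route_ops = { NULL, "route" };

    cls_register(&u32_ops);
    cls_register(&route_ops);
    printf("lookup u32 -> %s\n", cls_lookup("u32") ? "found" : "missing");
    printf("second u32 register -> %d\n", cls_register(&u32_ops));
    return 0;
}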
*/ + +static __inline__ u32 tcf_auto_prio(struct tcf_proto *tp, u32 prio) +{ + u32 first = TC_H_MAKE(0xC0000000U,0U); + + if (!tp || tp->next == NULL) + return first; + + if (prio == TC_H_MAKE(0xFFFF0000U,0U)) + first = tp->prio+1; + else + first = tp->prio-1; + + if (first == prio) + first = tp->prio; + + return first; +} + +/* Add/change/delete/get a filter node */ + +static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct rtattr **tca = arg; + struct tcmsg *t = NLMSG_DATA(n); + u32 protocol = TC_H_MIN(t->tcm_info); + u32 prio = TC_H_MAJ(t->tcm_info); + u32 nprio = prio; + struct device *dev; + struct Qdisc *q; + struct tcf_proto **back, **chain; + struct tcf_proto *tp = NULL; + struct tcf_proto_ops *tp_ops; + struct Qdisc_class_ops *cops; + unsigned long cl = 0; + unsigned long fh; + int err; + + if (prio == 0) { + /* If no priority is given, user wants we allocated it. */ + if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + return -ENOENT; + if (n->nlmsg_flags&NLM_F_APPEND) + prio = TC_H_MAKE(0xFFFF0000U,0U); + else + prio = TC_H_MAKE(0x80000000U,0U); + } + + /* Find head of filter chain. */ + + /* Find link */ + if ((dev = dev_get_by_index(t->tcm_ifindex)) == NULL) + return -ENODEV; + + /* Find qdisc */ + if (!t->tcm_parent) + q = dev->qdisc_sleeping; + else if ((q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent))) == NULL) + return -EINVAL; + + /* Is it classful? */ + if ((cops = q->ops->cl_ops) == NULL) + return -EINVAL; + + /* Do we search for filter, attached to class? */ + if (TC_H_MIN(t->tcm_parent)) { + cl = cops->get(q, t->tcm_parent); + if (cl == 0) + return -ENOENT; + } + + /* And the last stroke */ + chain = cops->tcf_chain(q, cl); + err = -EINVAL; + if (chain == NULL) + goto errout; + + /* Check the chain for existence of proto-tcf with this priority */ + for (back = chain; (tp=*back) != NULL; back = &tp->next) { + if (tp->prio >= prio) { + if (tp->prio == prio) { + if (!nprio || (tp->protocol != protocol && protocol)) + goto errout; + } else + tp = NULL; + break; + } + } + + if (tp == NULL) { + /* Proto-tcf does not exist, create new one */ + + if (tca[TCA_KIND-1] == NULL || !protocol) + goto errout; + + err = -ENOENT; + if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + goto errout; + + + /* Create new proto tcf */ + + err = -ENOBUFS; + if ((tp = kmalloc(sizeof(*tp), GFP_KERNEL)) == NULL) + goto errout; + tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND-1]); + if (tp_ops == NULL) { + err = -EINVAL; + kfree(tp); + goto errout; + } + memset(tp, 0, sizeof(*tp)); + tp->ops = tp_ops; + tp->protocol = protocol; + tp->prio = nprio ? 
: tcf_auto_prio(*back, prio); + tp->q = q; + tp->classify = tp_ops->classify; + tp->classid = t->tcm_parent; + err = tp_ops->init(tp); + if (err) { + kfree(tp); + goto errout; + } + tp->next = *back; + *back = tp; + } else if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], tp->ops->kind)) + goto errout; + + fh = tp->ops->get(tp, t->tcm_handle); + + if (fh == 0) { + if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { + *back = tp->next; + tp->ops->destroy(tp); + kfree(tp); + err = 0; + goto errout; + } + + err = -ENOENT; + if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + goto errout; + } else { + switch (n->nlmsg_type) { + case RTM_NEWTFILTER: + err = -EEXIST; + if (n->nlmsg_flags&NLM_F_EXCL) + goto errout; + break; + case RTM_DELTFILTER: + err = tp->ops->delete(tp, fh); + goto errout; + case RTM_GETTFILTER: + err = tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER); + goto errout; + default: + err = -EINVAL; + goto errout; + } + } + + err = tp->ops->change(tp, t->tcm_handle, tca, &fh); + if (err == 0) + tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER); + +errout: + if (cl) + cops->put(q, cl); + return err; +} + +static int +tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh, + u32 pid, u32 seq, unsigned flags, int event) +{ + struct tcmsg *tcm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); + nlh->nlmsg_flags = flags; + tcm = NLMSG_DATA(nlh); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm_ifindex = tp->q->dev->ifindex; + tcm->tcm_parent = tp->classid; + tcm->tcm_handle = 0; + tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); + RTA_PUT(skb, TCA_KIND, IFNAMSIZ, tp->ops->kind); + if (tp->ops->dump && tp->ops->dump(tp, fh, skb, tcm) < 0) + goto rtattr_failure; + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, + struct tcf_proto *tp, unsigned long fh, int event) +{ + struct sk_buff *skb; + pid_t pid = oskb ? 
NETLINK_CB(oskb).pid : 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (tcf_fill_node(skb, tp, fh, pid, n->nlmsg_seq, 0, event) <= 0) { + kfree_skb(skb); + return -EINVAL; + } + + return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); +} + +struct tcf_dump_args +{ + struct tcf_walker w; + struct sk_buff *skb; + struct netlink_callback *cb; +}; + +static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, struct tcf_walker *arg) +{ + struct tcf_dump_args *a = (void*)arg; + + return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).pid, + a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER); +} + +static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) +{ + int t; + int s_t; + struct device *dev; + struct Qdisc *q; + struct tcf_proto *tp, **chain; + struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); + unsigned long cl = 0; + struct Qdisc_class_ops *cops; + struct tcf_dump_args arg; + + if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) + return skb->len; + if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) + return skb->len; + if ((q = qdisc_lookup(dev, tcm->tcm_parent)) == NULL) + return skb->len; + cops = q->ops->cl_ops; + if (TC_H_MIN(tcm->tcm_parent)) { + if (cops) + cl = cops->get(q, tcm->tcm_parent); + if (cl == 0) + goto errout; + } + chain = cops->tcf_chain(q, cl); + if (chain == NULL) + goto errout; + + s_t = cb->args[0]; + + for (tp=*chain, t=0; tp; tp = tp->next, t++) { + if (t < s_t) continue; + if (TC_H_MAJ(tcm->tcm_info) && + TC_H_MAJ(tcm->tcm_info) != tp->prio) + continue; + if (TC_H_MIN(tcm->tcm_info) && + TC_H_MIN(tcm->tcm_info) != tp->protocol) + continue; + if (t > s_t) + memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int)); + if (cb->args[1] == 0) { + if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) { + break; + } + cb->args[1] = 1; + } + if (tp->ops->walk == NULL) + continue; + arg.w.fn = tcf_node_dump; + arg.skb = skb; + arg.cb = cb; + arg.w.stop = 0; + arg.w.skip = cb->args[1]-1; + arg.w.count = 0; + tp->ops->walk(tp, &arg.w); + cb->args[1] = arg.w.count+1; + if (arg.w.stop) + break; + } + + cb->args[0] = t; + +errout: + if (cl) + cops->put(q, cl); + + return skb->len; +} + +#endif + + +__initfunc(int tc_filter_init(void)) +{ +#ifdef CONFIG_RTNETLINK + struct rtnetlink_link *link_p = rtnetlink_links[AF_UNSPEC]; + + /* Setup rtnetlink links. It is made here to avoid + exporting large number of public symbols. + */ + + if (link_p) { + link_p[RTM_NEWTFILTER-RTM_BASE].doit = tc_ctl_tfilter; + link_p[RTM_DELTFILTER-RTM_BASE].doit = tc_ctl_tfilter; + link_p[RTM_GETTFILTER-RTM_BASE].doit = tc_ctl_tfilter; + link_p[RTM_GETTFILTER-RTM_BASE].dumpit = tc_dump_tfilter; + } +#endif +#define INIT_TC_FILTER(name) { \ + extern struct tcf_proto_ops cls_##name##_ops; \ + register_tcf_proto_ops(&cls_##name##_ops); \ + } + +#ifdef CONFIG_NET_CLS_U32 + INIT_TC_FILTER(u32); +#endif +#ifdef CONFIG_NET_CLS_ROUTE + INIT_TC_FILTER(route); +#endif +#ifdef CONFIG_NET_CLS_FW + INIT_TC_FILTER(fw); +#endif +#ifdef CONFIG_NET_CLS_RSVP + INIT_TC_FILTER(rsvp); +#endif +#ifdef CONFIG_NET_CLS_RSVP6 + INIT_TC_FILTER(rsvp6); +#endif + return 0; +} diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c new file mode 100644 index 000000000..ff831817a --- /dev/null +++ b/net/sched/cls_fw.c @@ -0,0 +1,96 @@ +/* + * net/sched/cls_fw.c Routing table based packet classifier. 
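[Editor's note] tc_dump_tfilter() above follows the usual rtnetlink dump convention: the callback is invoked repeatedly, and the position already emitted is parked in cb->args[] so the next pass resumes where the previous one stopped. A minimal user-space model of that resumable-dump pattern; the item count and per-pass limit are made up.

/* Illustrative model of a resumable dump driven by saved cursor state. */
#include <stdio.h>

#define N_ITEMS   10
#define PER_CALL  4                    /* pretend only 4 items fit per pass */

struct dump_cb { long args[2]; };      /* models netlink_callback.args      */

static int dump_chunk(struct dump_cb *cb)
{
    int emitted = 0;
    long s_t = cb->args[0];            /* first index not yet sent */
    long t;

    for (t = s_t; t < N_ITEMS && emitted < PER_CALL; t++, emitted++)
        printf("  item %ld\n", t);

    cb->args[0] = t;                   /* resume point for the next call */
    return t < N_ITEMS;                /* non-zero: call again           */
}

int main(void)
{
    struct dump_cb cb = { { 0, 0 } };
    int pass = 1;
    do {
        printf("dump pass %d:\n", pass++);
    } while (dump_chunk(&cb));
    return 0;
}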
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + + +static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ +#if 0 /* XXX skb->fwmark, where is it? -DaveM */ + u32 clid = skb->fwmark; + + if (clid && (TC_H_MAJ(clid) == 0 || + !(TC_H_MAJ(clid^tp->q->handle)))) { + res->classid = clid; + res->class = 0; + return 0; + } +#endif + return -1; +} + +static unsigned long fw_get(struct tcf_proto *tp, u32 handle) +{ + return 0; +} + +static void fw_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int fw_init(struct tcf_proto *tp) +{ + return 0; +} + +static void fw_destroy(struct tcf_proto *tp) +{ +} + +static int fw_delete(struct tcf_proto *tp, unsigned long arg) +{ + return -EINVAL; +} + +static int fw_change(struct tcf_proto *tp, u32 handle, + struct rtattr **tca, + unsigned long *arg) +{ + return handle ? -EINVAL : 0; +} + +struct tcf_proto_ops fw_cls_ops = { + NULL, + "fw", + fw_classify, + fw_init, + fw_destroy, + + fw_get, + fw_put, + fw_change, + fw_delete, + NULL, +}; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c new file mode 100644 index 000000000..a78f2090e --- /dev/null +++ b/net/sched/cls_route.c @@ -0,0 +1,98 @@ +/* + * net/sched/cls_route.c Routing table based packet classifier. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + + +static int route_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + struct dst_entry *dst = skb->dst; + + if (dst) { + u32 clid = dst->tclassid; + + if (clid && (TC_H_MAJ(clid) == 0 || + !(TC_H_MAJ(clid^tp->q->handle)))) { + res->classid = clid; + res->class = 0; + return 0; + } + } + return -1; +} + +static unsigned long route_get(struct tcf_proto *tp, u32 handle) +{ + return 0; +} + +static void route_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int route_init(struct tcf_proto *tp) +{ + return 0; +} + +static void route_destroy(struct tcf_proto *tp) +{ +} + +static int route_delete(struct tcf_proto *tp, unsigned long arg) +{ + return -EINVAL; +} + +static int route_change(struct tcf_proto *tp, u32 handle, + struct rtattr **tca, + unsigned long *arg) +{ + return handle ? -EINVAL : 0; +} + +struct tcf_proto_ops cls_route_ops = { + NULL, + "route", + route_classify, + route_init, + route_destroy, + + route_get, + route_put, + route_change, + route_delete, + NULL, +}; diff --git a/net/sched/cls_rsvp.c b/net/sched/cls_rsvp.c new file mode 100644 index 000000000..8388aee4c --- /dev/null +++ b/net/sched/cls_rsvp.c @@ -0,0 +1,41 @@ +/* + * net/sched/cls_rsvp.c Special RSVP packet classifier for IPv4. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +#define RSVP_DST_LEN 1 +#define RSVP_ID "rsvp" +#define RSVP_OPS cls_rsvp_ops + +#include "cls_rsvp.h" diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h new file mode 100644 index 000000000..9e12a806a --- /dev/null +++ b/net/sched/cls_rsvp.h @@ -0,0 +1,672 @@ +/* + * net/sched/cls_rsvp.h Template file for RSVPv[46] classifiers. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
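[Editor's note] route_classify() above accepts dst->tclassid only when its upper half is zero (a wildcard) or equal to the upper half of the handle of the qdisc the filter hangs off. The sketch below assumes the conventional 16:16 major:minor split behind the TC_H_* macros; the handles used are examples only.

/* Illustrative model of the classid test in route_classify(). */
#include <stdio.h>
#include <stdint.h>

#define MAJ(h)  ((h) & 0xFFFF0000u)    /* assumed TC_H_MAJ behaviour */

static int classid_matches(uint32_t clid, uint32_t qdisc_handle)
{
    return clid && (MAJ(clid) == 0 || !(MAJ(clid ^ qdisc_handle)));
}

int main(void)
{
    uint32_t q = 0x00010000u;                        /* qdisc handle 1:          */
    printf("%d\n", classid_matches(0x00010002u, q)); /* 1: same major, class 1:2 */
    printf("%d\n", classid_matches(0x00000007u, q)); /* 1: major 0 is a wildcard */
    printf("%d\n", classid_matches(0x00020001u, q)); /* 0: belongs to qdisc 2:   */
    return 0;
}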
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +/* + Comparing to general packet classification problem, + RSVP needs only sevaral relatively simple rules: + + * (dst, protocol) are always specified, + so that we are able to hash them. + * src may be exact, and may be wildcard, so that + we can keep hash table plus one wildcard entry. + * source port (or flow label) is important only if src is given. + + IMPLEMENTATION. + + We use two level hash table: top level is keyed by + destination address and protocol ID, every bucket contains list of + "rsvp sessions", identified by destination address, protocol + and DPI(="Destination Port ID"): triple (key, mask, offset). + + Every bucket has smaller hash table keyed by source address + (cf. RSVP flowspec) and one wildcard entry for wildcard reservations. + Every bucket is again list of "RSVP flows", selected by + source address and SPI(="Source Port ID" here rather than + "security parameter index"): triple (key, mask, offset). + + + NOTE 1. All the packets with IPv6 extension headers (but AH and ESP) + and all fragmented packets go to best-effort traffic class. + + + NOTE 2. Two "port id"'s seems to be redundant, rfc2207 requires + only one "Generalized Port Identifier". So that for classic + ah, esp (and udp,tcp) both *pi should coincide or one of them + should be wildcard. + + From the first sight, this redundancy is just waste of CPU + resources. But, DPI and SPI add possibility to assign different + priorities to GPIs. Look also note 4 about tunnels below. + + + NOTE 3. One complication is the case of tunneled packets. + We implement it as the following: if the first lookup + matches special session with "tunnelhdr" value not zero, + flowid contains not true flow ID, but tunnel ID (1...255). + In this case, we pull tunnelhdr bytes and restart lookup + with tunnel ID added to list of keys. Simple and stupid 8)8) + It's enough for PIMREG and IPIP. + + + NOTE 4. Two GPIs make possible to parse even GRE packets. + F.e. DPI can select ETH_P_IP (and necessary flags to make + tunnelhdr correct) in GRE protocol field and SPI matches + GRE key. Is it not nice? 8)8) + + + Well, as result, despite of simplicity, we get pretty + powerful clsssification engine. 
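[Editor's note] The comment above describes DPI and SPI as (key, mask, offset) triples matched against the transport header; rsvp_classify() below applies them as a masked 32-bit compare. A stand-alone sketch of that generalized port match, using a hand-built UDP header and port numbers chosen only for illustration.

/* Illustrative model of the (key, mask, offset) "port id" match:
 * the masked 32-bit word at the given offset must equal the key. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

struct gpi { uint32_t key; uint32_t mask; int offset; };

static int gpi_match(const struct gpi *pi, const uint8_t *xprt)
{
    uint32_t word;
    memcpy(&word, xprt + pi->offset, 4);   /* avoids unaligned access */
    return !(pi->mask & (word ^ pi->key));
}

int main(void)
{
    /* Fake UDP header: source port 4000, destination port 1700. */
    uint8_t udp[8] = { 0 };
    uint16_t sport = htons(4000), dport = htons(1700);
    memcpy(udp, &sport, 2);
    memcpy(udp + 2, &dport, 2);

    /* DPI selecting "UDP destination port 1700": low 16 bits of word 0. */
    struct gpi dpi = { htonl(1700), htonl(0x0000FFFF), 0 };

    printf("dport 1700 matches: %d\n", gpi_match(&dpi, udp));
    return 0;
}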
+ */ + +#include <linux/config.h> + +struct rsvp_head +{ + u32 tmap[256/32]; + u32 hgenerator; + u8 tgenerator; + struct rsvp_session *ht[256]; +}; + +struct rsvp_session +{ + struct rsvp_session *next; + u32 dst[RSVP_DST_LEN]; + struct tc_rsvp_gpi dpi; + u8 protocol; + u8 tunnelid; + /* 16 (src,sport) hash slots, and one wildcard source slot */ + struct rsvp_filter *ht[16+1]; +}; + + +struct rsvp_filter +{ + struct rsvp_filter *next; + u32 src[RSVP_DST_LEN]; + struct tc_rsvp_gpi spi; + u8 tunnelhdr; + + struct tcf_result res; +#ifdef CONFIG_NET_CLS_POLICE + struct tcf_police *police; +#endif + + u32 handle; + struct rsvp_session *sess; +}; + +static __inline__ unsigned hash_dst(u32 *dst, u8 protocol, u8 tunnelid) +{ + unsigned h = dst[RSVP_DST_LEN-1]; + h ^= h>>16; + h ^= h>>8; + return (h ^ protocol ^ tunnelid) & 0xFF; +} + +static __inline__ unsigned hash_src(u32 *src) +{ + unsigned h = src[RSVP_DST_LEN-1]; + h ^= h>>16; + h ^= h>>8; + h ^= h>>4; + return h & 0xF; +} + +static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; + struct rsvp_session *s; + struct rsvp_filter *f; + unsigned h1, h2; + u32 *dst, *src; + u8 protocol; + u8 tunnelid = 0; + u8 *xprt; +#if RSVP_DST_LEN == 4 + struct ipv6hdr *nhptr = skb->nh.ipv6h; +#else + struct iphdr *nhptr = skb->nh.iph; +#endif + +#ifndef __i386__ + if ((unsigned long)nhptr & 3) + return -1; +#endif + +restart: + +#if RSVP_DST_LEN == 4 + src = &nhptr->saddr.s6_addr32[0]; + dst = &nhptr->daddr.s6_addr32[0]; + protocol = nhptr->nexthdr; + xprt = ((u8*)nhptr) + sizeof(struct ipv6hdr); +#else + src = &nhptr->saddr; + dst = &nhptr->daddr; + protocol = nhptr->protocol; + xprt = ((u8*)nhptr) + (nhptr->ihl<<2); + if (nhptr->frag_off&__constant_htons(IP_MF|IP_OFFSET)) + return -1; +#endif + + h1 = hash_dst(dst, protocol, tunnelid); + h2 = hash_src(src); + + for (s = sht[h1]; s; s = s->next) { + if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && + protocol == s->protocol && + !(s->dpi.mask & (*(u32*)(xprt+s->dpi.offset)^s->dpi.key)) +#if RSVP_DST_LEN == 4 + && dst[0] == s->dst[0] + && dst[1] == s->dst[1] + && dst[2] == s->dst[2] +#endif + && tunnelid == s->tunnelid) { + + for (f = s->ht[h2]; f; f = f->next) { + if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN-1] && + !(f->spi.mask & (*(u32*)(xprt+f->spi.offset)^f->spi.key)) +#if RSVP_DST_LEN == 4 + && src[0] == f->src[0] + && src[1] == f->src[1] + && src[2] == f->src[2] +#endif + ) { +matched: + if (f->tunnelhdr == 0) { + *res = f->res; +#ifdef CONFIG_NET_CLS_POLICE + if (f->police) + return tcf_police(skb, f->police); +#endif + return 0; + } else { + tunnelid = f->res.classid; + nhptr = (void*)(xprt + f->tunnelhdr - sizeof(*nhptr)); + goto restart; + } + } + } + + /* And wildcard bucket... 
*/ + if ((f = s->ht[16]) != NULL) + goto matched; + return -1; + } + } + return -1; +} + +static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle) +{ + struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; + struct rsvp_session *s; + struct rsvp_filter *f; + unsigned h1 = handle&0xFF; + unsigned h2 = (handle>>8)&0xFF; + + if (h2 > 16) + return 0; + + for (s = sht[h1]; s; s = s->next) { + for (f = s->ht[h2]; f; f = f->next) { + if (f->handle == handle) + return (unsigned long)f; + } + } + return 0; +} + +static void rsvp_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int rsvp_init(struct tcf_proto *tp) +{ + struct rsvp_head *data; + + MOD_INC_USE_COUNT; + data = kmalloc(sizeof(struct rsvp_head), GFP_KERNEL); + if (data) { + memset(data, 0, sizeof(struct rsvp_head)); + tp->root = data; + return 0; + } + MOD_DEC_USE_COUNT; + return -ENOBUFS; +} + +static void rsvp_destroy(struct tcf_proto *tp) +{ + struct rsvp_head *data = xchg(&tp->root, NULL); + struct rsvp_session **sht; + int h1, h2; + + if (data == NULL) + return; + + sht = data->ht; + + for (h1=0; h1<256; h1++) { + struct rsvp_session *s; + + while ((s = sht[h1]) != NULL) { + + sht[h1] = s->next; + + for (h2=0; h2<=16; h2++) { + struct rsvp_filter *f; + + while ((f = s->ht[h2]) != NULL) { + unsigned long cl; + + s->ht[h2] = f->next; + if ((cl = xchg(&f->res.class, 0)) != 0) + tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); +#ifdef CONFIG_NET_CLS_POLICE + tcf_police_release(f->police); +#endif + kfree(f); + } + } + kfree(s); + } + } + kfree(data); + MOD_DEC_USE_COUNT; +} + +static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct rsvp_filter **fp, *f = (struct rsvp_filter*)arg; + unsigned h = f->handle; + struct rsvp_session **sp; + struct rsvp_session *s = f->sess; + int i; + + for (fp = &s->ht[(h>>8)&0xFF]; *fp; fp = &(*fp)->next) { + if (*fp == f) { + unsigned long cl; + + *fp = f->next; + if ((cl = xchg(&f->res.class, 0)) != 0) + tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); + +#ifdef CONFIG_NET_CLS_POLICE + tcf_police_release(f->police); +#endif + + kfree(f); + + /* Strip tree */ + + for (i=0; i<=16; i++) + if (s->ht[i]) + return 0; + + /* OK, session has no flows */ + for (sp = &((struct rsvp_head*)tp->root)->ht[h&0xFF]; + *sp; sp = &(*sp)->next) { + if (*sp == s) { + *sp = s->next; + kfree(s); + return 0; + } + } + + return 0; + } + } + return 0; +} + +static unsigned gen_handle(struct tcf_proto *tp, unsigned salt) +{ + struct rsvp_head *data = tp->root; + int i = 0xFFFF; + + while (i-- > 0) { + u32 h; + if ((data->hgenerator += 0x10000) == 0) + data->hgenerator = 0x10000; + h = data->hgenerator|salt; + if (rsvp_get(tp, h) == 0) + return h; + } + return 0; +} + +static int tunnel_bts(struct rsvp_head *data) +{ + int n = data->tgenerator>>5; + u32 b = 1<<(data->tgenerator&0x1F); + + if (data->tmap[n]&b) + return 0; + data->tmap[n] |= b; + return 1; +} + +static void tunnel_recycle(struct rsvp_head *data) +{ + struct rsvp_session **sht = data->ht; + u32 tmap[256/32]; + int h1, h2; + + memset(tmap, 0, sizeof(tmap)); + + for (h1=0; h1<256; h1++) { + struct rsvp_session *s; + for (s = sht[h1]; s; s = s->next) { + for (h2=0; h2<=16; h2++) { + struct rsvp_filter *f; + + for (f = s->ht[h2]; f; f = f->next) { + if (f->tunnelhdr == 0) + continue; + data->tgenerator = f->res.classid; + tunnel_bts(data); + } + } + } + } + + memcpy(data->tmap, tmap, sizeof(tmap)); +} + +static u32 gen_tunnel(struct rsvp_head *data) +{ + int i, k; + + for (k=0; k<2; k++) { + for (i=255; i>0; i--) { + if (++data->tgenerator 
== 0) + data->tgenerator = 1; + if (tunnel_bts(data)) + return data->tgenerator; + } + tunnel_recycle(data); + } + return 0; +} + +static int rsvp_change(struct tcf_proto *tp, u32 handle, + struct rtattr **tca, + unsigned long *arg) +{ + struct rsvp_head *data = tp->root; + struct rsvp_filter *f, **fp; + struct rsvp_session *s, **sp; + struct tc_rsvp_pinfo *pinfo = NULL; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_RSVP_MAX]; + unsigned h1, h2; + u32 *dst; + int err; + + if (opt == NULL) + return -EINVAL; + + if (rtattr_parse(tb, TCA_RSVP_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0) + return -EINVAL; + + if ((f = (struct rsvp_filter*)*arg) != NULL) { + /* Node exists: adjust only classid */ + + if (f->handle != handle && handle) + return -EINVAL; + if (tb[TCA_RSVP_CLASSID-1]) { + unsigned long cl = xchg(&f->res.class, 0); + if (cl) + tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); + f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]); + f->res.class = tp->q->ops->cl_ops->bind_tcf(tp->q, f->res.classid); + } +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_RSVP_POLICE-1]) { + struct tcf_police *police = tcf_police_locate(tb[TCA_RSVP_POLICE-1]); + + tcf_police_release(xchg(&f->police, police)); + } +#endif + return 0; + } + + /* Now more serious part... */ + if (handle) + return -EINVAL; + if (tb[TCA_RSVP_DST-1] == NULL) + return -EINVAL; + + f = kmalloc(sizeof(struct rsvp_filter), GFP_KERNEL); + if (f == NULL) + return -ENOBUFS; + + memset(f, 0, sizeof(*f)); + h2 = 16; + if (tb[TCA_RSVP_SRC-1]) { + err = -EINVAL; + if (RTA_PAYLOAD(tb[TCA_RSVP_SRC-1]) != sizeof(f->src)) + goto errout; + memcpy(f->src, RTA_DATA(tb[TCA_RSVP_SRC-1]), sizeof(f->src)); + h2 = hash_src(f->src); + } + if (tb[TCA_RSVP_PINFO-1]) { + err = -EINVAL; + if (RTA_PAYLOAD(tb[TCA_RSVP_PINFO-1]) < sizeof(struct tc_rsvp_pinfo)) + goto errout; + pinfo = RTA_DATA(tb[TCA_RSVP_PINFO-1]); + f->spi = pinfo->spi; + f->tunnelhdr = pinfo->tunnelhdr; + } + if (tb[TCA_RSVP_CLASSID-1]) { + err = -EINVAL; + if (RTA_PAYLOAD(tb[TCA_RSVP_CLASSID-1]) != 4) + goto errout; + f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]); + } + + err = -EINVAL; + if (RTA_PAYLOAD(tb[TCA_RSVP_DST-1]) != sizeof(f->src)) + goto errout; + dst = RTA_DATA(tb[TCA_RSVP_DST-1]); + h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0); + + err = -ENOMEM; + if ((f->handle = gen_handle(tp, h1 | (h2<<8))) == 0) + goto errout; + + if (f->tunnelhdr) { + err = -EINVAL; + if (f->res.classid > 255) + goto errout; + + err = -ENOMEM; + if (f->res.classid == 0 && + (f->res.classid = gen_tunnel(data)) == 0) + goto errout; + } + + for (sp = &data->ht[h1]; (s=*sp) != NULL; sp = &s->next) { + if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && + pinfo->protocol == s->protocol && + memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 +#if RSVP_DST_LEN == 4 + && dst[0] == s->dst[0] + && dst[1] == s->dst[1] + && dst[2] == s->dst[2] +#endif + && pinfo->tunnelid == s->tunnelid) { + +insert: + /* OK, we found appropriate session */ + + fp = &s->ht[h2]; + + f->sess = s; + if (f->tunnelhdr == 0) + f->res.class = tp->q->ops->cl_ops->bind_tcf(tp->q, f->res.classid); +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_RSVP_POLICE-1]) + f->police = tcf_police_locate(tb[TCA_RSVP_POLICE-1]); +#endif + + for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next) + if (((*fp)->spi.mask&f->spi.mask) != f->spi.mask) + break; + f->next = *fp; + *fp = f; + return 0; + } + } + + /* No session found. Create new one. 
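[Editor's note] gen_tunnel() and tunnel_bts() above hand out tunnel IDs from a 256-bit bitmap: id>>5 picks one of eight 32-bit words and 1<<(id&0x1F) the bit within it, with tunnel_recycle() rebuilding the map from live filters when the space runs out. A small model of the claim operation; names and the sample IDs are illustrative.

/* Illustrative model of the 256-entry tunnel-ID bitmap. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

static uint32_t tmap[256 / 32];

static int claim_id(uint8_t id)
{
    int n = id >> 5;                 /* which 32-bit word        */
    uint32_t b = 1u << (id & 0x1F);  /* which bit inside it      */

    if (tmap[n] & b)
        return 0;                    /* already in use           */
    tmap[n] |= b;
    return 1;                        /* freshly claimed          */
}

int main(void)
{
    memset(tmap, 0, sizeof(tmap));
    printf("claim 37: %d\n", claim_id(37));   /* 1: free, now taken          */
    printf("claim 37: %d\n", claim_id(37));   /* 0: second claim fails       */
    printf("claim 38: %d\n", claim_id(38));   /* 1: neighbouring bit is free */
    return 0;
}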
*/ + + err = -ENOBUFS; + s = kmalloc(sizeof(struct rsvp_session), GFP_KERNEL); + if (s == NULL) + goto errout; + memset(s, 0, sizeof(*s)); + memcpy(s->dst, dst, sizeof(*dst)); + s->dpi = pinfo->dpi; + s->protocol = pinfo->protocol; + s->tunnelid = pinfo->tunnelid; + for (sp = &data->ht[h1]; *sp; sp = &(*sp)->next) { + if (((*sp)->dpi.mask&s->dpi.mask) != s->dpi.mask) + break; + } + s->next = *sp; + *sp = s; + goto insert; + +errout: + if (f) + kfree(f); + return err; +} + +static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct rsvp_head *head = tp->root; + unsigned h, h1; + + if (arg->stop) + return; + + for (h = 0; h < 256; h++) { + struct rsvp_session *s; + + for (s = head->ht[h]; s; s = s->next) { + for (h1 = 0; h1 <= 16; h1++) { + struct rsvp_filter *f; + + for (f = s->ht[h1]; f; f = f->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(tp, (unsigned long)f, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } + } + } + } +} + +#ifdef CONFIG_RTNETLINK +static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct rsvp_head *head = tp->root; + struct rsvp_filter *f = (struct rsvp_filter*)fh; + struct rsvp_session *s; + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_rsvp_pinfo pinfo; + + if (f == NULL) + return skb->len; + s = f->sess; + + t->tcm_handle = f->handle; + + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + RTA_PUT(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst); + pinfo.dpi = s->dpi; + pinfo.spi = f->spi; + pinfo.protocol = s->protocol; + pinfo.tunnelid = s->tunnelid; + pinfo.tunnelhdr = f->tunnelhdr; + RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); + if (f->res.classid) + RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid); + if (((f->handle>>8)&0xFF) != 16) + RTA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src); +#ifdef CONFIG_NET_CLS_POLICE + if (f->police) { + struct rtattr * p_rta = (struct rtattr*)skb->tail; + + RTA_PUT(skb, TCA_RSVP_POLICE, 0, NULL); + + if (tcf_police_dump(skb, f->police) < 0) + goto rtattr_failure; + + p_rta->rta_len = skb->tail - (u8*)p_rta; + } +#endif + + rta->rta_len = skb->tail - b; + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif + +struct tcf_proto_ops RSVP_OPS = { + NULL, + RSVP_ID, + rsvp_classify, + rsvp_init, + rsvp_destroy, + + rsvp_get, + rsvp_put, + rsvp_change, + rsvp_delete, + rsvp_walk, +#ifdef CONFIG_RTNETLINK + rsvp_dump +#else + NULL +#endif +}; + +#ifdef MODULE +int init_module(void) +{ + return register_tcf_proto_ops(&RSVP_OPS); +} + +void cleanup_module(void) +{ + unregister_tcf_proto_ops(&RSVP_OPS); +} +#endif diff --git a/net/sched/cls_rsvp6.c b/net/sched/cls_rsvp6.c new file mode 100644 index 000000000..069960213 --- /dev/null +++ b/net/sched/cls_rsvp6.c @@ -0,0 +1,42 @@ +/* + * net/sched/cls_rsvp6.c Special RSVP packet classifier for IPv6. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
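[Editor's note] When rsvp_change() above links a new filter (and, via the dpi mask, a new session), it walks past entries whose mask still covers every bit of the new one and inserts in front of the first entry that does not, so tightly masked entries tend to stay near the head of each bucket. A user-space sketch of that insertion rule with three example masks; the entry type is a stand-in.

/* Illustrative model of the mask-ordered insertion used for RSVP
 * filters and sessions above. */
#include <stdio.h>
#include <stdint.h>

struct ent { struct ent *next; uint32_t mask; };

static void insert_by_mask(struct ent **head, struct ent *e)
{
    struct ent **fp;
    for (fp = head; *fp; fp = &(*fp)->next)
        if (((*fp)->mask & e->mask) != e->mask)  /* existing no longer covers new */
            break;
    e->next = *fp;
    *fp = e;
}

int main(void)
{
    static struct ent a = { NULL, 0xFFFFFFFFu };  /* exact match      */
    static struct ent b = { NULL, 0x0000FFFFu };  /* low 16 bits only */
    static struct ent c = { NULL, 0xFFFF0000u };  /* high 16 bits only */
    struct ent *head = NULL, *p;

    insert_by_mask(&head, &b);
    insert_by_mask(&head, &a);
    insert_by_mask(&head, &c);

    for (p = head; p; p = p->next)
        printf("%08x\n", p->mask);   /* the full mask ends up first in this run */
    return 0;
}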
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <linux/ipv6.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +#define RSVP_DST_LEN 4 +#define RSVP_ID "rsvp6" +#define RSVP_OPS cls_rsvp6_ops + +#include "cls_rsvp.h" diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c new file mode 100644 index 000000000..10e355201 --- /dev/null +++ b/net/sched/cls_u32.c @@ -0,0 +1,704 @@ +/* + * net/sched/cls_u32.c Ugly (or Universal) 32bit key Packet Classifier. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * The filters are packed to hash tables of key nodes + * with a set of 32bit key/mask pairs at every node. + * Nodes reference next level hash tables etc. + * + * This scheme is the best universal classifier + * I managed to invent; it is not super-fast, but it is not slow + * (provided you programmed it correctly), and enough general. + * And its relative speed grows, when number of rules becomes larger. + * + * Seems, it presents the best middle point between speed and + * managability both by human and by machine. + * + * It is especially useful for link sharing and link sharing, combined + * with QoS; pure RSVP need not such general approach and can use + * much simpler (and faster) schemes, sort of cls_rsvp.c. 
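[Editor's note] The comment above sums up the u32 scheme: each key node carries 32-bit value/mask pairs applied at fixed offsets into the packet, and u32_classify() below rejects a node as soon as one masked word differs. The sketch models a single key that selects the IPv4 protocol byte (offset 8, second byte of that word) and asks for TCP; the header bytes and the key struct are illustrative stand-ins for tc_u32_key.

/* Illustrative model of one u32 key: ((word ^ val) & mask) must be 0. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

struct u32_key { uint32_t mask; uint32_t val; int off; };

static int key_match(const struct u32_key *k, const uint8_t *iph)
{
    uint32_t w;
    memcpy(&w, iph + k->off, 4);
    return ((w ^ k->val) & k->mask) == 0;
}

int main(void)
{
    /* Minimal fake IPv4 header: byte 9 is the protocol field. */
    uint8_t iph[20] = { 0x45, 0, 0, 40, 0, 0, 0, 0, 64, 6 /* TCP */, 0, 0 };

    /* Key: protocol == 6, i.e. mask the second byte of the word at offset 8. */
    struct u32_key k = { htonl(0x00FF0000), htonl(0x00060000), 8 };

    printf("protocol==TCP matches: %d\n", key_match(&k, iph));
    iph[9] = 17;                                   /* switch to UDP */
    printf("after change to UDP:   %d\n", key_match(&k, iph));
    return 0;
}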
+ */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <linux/rtnetlink.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +#define BUG_TRAP(x) if (!(x)) { printk("Assertion (" #x ") failed at " __FILE__ "(%d):" __FUNCTION__ "\n", __LINE__); } + + +struct tc_u_knode +{ + struct tc_u_knode *next; + u32 handle; + struct tc_u_hnode *ht_up; +#ifdef CONFIG_NET_CLS_POLICE + struct tcf_police *police; +#endif + struct tcf_result res; + struct tc_u_hnode *ht_down; + struct tc_u32_sel sel; +}; + +struct tc_u_hnode +{ + struct tc_u_hnode *next; + u32 handle; + struct tc_u_common *tp_c; + int refcnt; + unsigned divisor; + u32 hgenerator; + struct tc_u_knode *ht[1]; +}; + +struct tc_u_common +{ + struct tc_u_common *next; + struct tc_u_hnode *hlist; + struct Qdisc *q; + int refcnt; + u32 hgenerator; +}; + +static struct tc_u_common *u32_list; + +static __inline__ unsigned u32_hash_fold(u32 key, struct tc_u32_sel *sel) +{ + unsigned h = key & sel->hmask; + + h ^= h>>16; + h ^= h>>8; + return h; +} + +static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res) +{ + struct { + struct tc_u_knode *knode; + u8 *ptr; + } stack[TC_U32_MAXDEPTH]; + + struct tc_u_hnode *ht = (struct tc_u_hnode*)tp->root; + u8 *ptr = skb->nh.raw; + struct tc_u_knode *n; + int sdepth = 0; + int off2 = 0; + int sel = 0; + int i; + +#ifndef __i386__ + if ((unsigned long)ptr & 3) + return -1; +#endif + +next_ht: + n = ht->ht[sel]; + +next_knode: + if (n) { + struct tc_u32_key *key = n->sel.keys; + + for (i = n->sel.nkeys; i>0; i--, key++) { + if ((*(u32*)(ptr+key->off+(off2&key->offmask))^key->val)&key->mask) { + n = n->next; + goto next_knode; + } + } + if (n->ht_down == NULL) { +check_terminal: + if (n->sel.flags&TC_U32_TERMINAL) { + *res = n->res; +#ifdef CONFIG_NET_CLS_POLICE + if (n->police) + return tcf_police(skb, n->police); +#endif + return 0; + } + n = n->next; + goto next_knode; + } + + /* PUSH */ + if (sdepth >= TC_U32_MAXDEPTH) + goto deadloop; + stack[sdepth].knode = n; + stack[sdepth].ptr = ptr; + sdepth++; + + ht = n->ht_down; + sel = 0; + if (ht->divisor) + sel = ht->divisor&u32_hash_fold(*(u32*)(ptr+n->sel.hoff), &n->sel); + + if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT))) + goto next_ht; + + if (n->sel.flags&(TC_U32_EAT|TC_U32_VAROFFSET)) { + off2 = n->sel.off + 3; + if (n->sel.flags&TC_U32_VAROFFSET) + off2 += ntohs(n->sel.offmask & *(u16*)(ptr+n->sel.offoff)) >>n->sel.offshift; + off2 &= ~3; + } + if (n->sel.flags&TC_U32_EAT) { + ptr += off2; + off2 = 0; + } + + if (ptr < skb->tail) + goto next_ht; + } + + /* POP */ + if (sdepth--) { + n = stack[sdepth].knode; + ht = n->ht_up; + ptr = stack[sdepth].ptr; + goto check_terminal; + } + return -1; + +deadloop: + if (net_ratelimit()) + printk("cls_u32: dead loop\n"); + return -1; +} + +static __inline__ struct tc_u_hnode * +u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) +{ + struct tc_u_hnode *ht; + + for (ht = tp_c->hlist; ht; ht = ht->next) + if 
(ht->handle == handle) + break; + + return ht; +} + +static __inline__ struct tc_u_knode * +u32_lookup_key(struct tc_u_hnode *ht, u32 handle) +{ + unsigned sel; + struct tc_u_knode *n; + + sel = TC_U32_HASH(handle); + if (sel > ht->divisor) + return 0; + + for (n = ht->ht[sel]; n; n = n->next) + if (n->handle == handle) + return n; + + return NULL; +} + + +static unsigned long u32_get(struct tcf_proto *tp, u32 handle) +{ + struct tc_u_hnode *ht; + struct tc_u_common *tp_c = tp->data; + + if (TC_U32_HTID(handle) == TC_U32_ROOT) + ht = tp->root; + else + ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle)); + + if (!ht) + return 0; + + if (TC_U32_KEY(handle) == 0) + return (unsigned long)ht; + + return (unsigned long)u32_lookup_key(ht, handle); +} + +static void u32_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static u32 gen_new_htid(struct tc_u_common *tp_c) +{ + int i = 0x800; + + do { + if (++tp_c->hgenerator == 0x7FF) + tp_c->hgenerator = 1; + } while (i>0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20)); + + return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0; +} + +static int u32_init(struct tcf_proto *tp) +{ + struct tc_u_hnode *root_ht; + struct tc_u_common *tp_c; + + MOD_INC_USE_COUNT; + + for (tp_c = u32_list; tp_c; tp_c = tp_c->next) + if (tp_c->q == tp->q) + break; + + root_ht = kmalloc(sizeof(*root_ht), GFP_KERNEL); + if (root_ht == NULL) { + MOD_DEC_USE_COUNT; + return -ENOBUFS; + } + memset(root_ht, 0, sizeof(*root_ht)); + root_ht->divisor = 0; + root_ht->refcnt++; + root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000; + + if (tp_c == NULL) { + tp_c = kmalloc(sizeof(*tp_c), GFP_KERNEL); + if (tp_c == NULL) { + kfree(root_ht); + MOD_DEC_USE_COUNT; + return -ENOBUFS; + } + memset(tp_c, 0, sizeof(*tp_c)); + tp_c->q = tp->q; + tp_c->next = u32_list; + u32_list = tp_c; + } + + tp_c->refcnt++; + root_ht->next = tp_c->hlist; + tp_c->hlist = root_ht; + root_ht->tp_c = tp_c; + + tp->root = root_ht; + tp->data = tp_c; + return 0; +} + +static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n) +{ + unsigned long cl; + + if ((cl = xchg(&n->res.class, 0)) != 0) + tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); +#ifdef CONFIG_NET_CLS_POLICE + tcf_police_release(n->police); +#endif + if (n->ht_down) + n->ht_down->refcnt--; + kfree(n); + return 0; +} + +static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key) +{ + struct tc_u_knode **kp; + struct tc_u_hnode *ht = key->ht_up; + + if (ht) { + for (kp = &ht->ht[TC_U32_HASH(key->handle)]; *kp; kp = &(*kp)->next) { + if (*kp == key) { + *kp = key->next; + u32_destroy_key(tp, key); + return 0; + } + } + } + BUG_TRAP(0); + return 0; +} + +static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) +{ + struct tc_u_knode *n; + unsigned h; + + for (h=0; h<=ht->divisor; h++) { + while ((n = ht->ht[h]) != NULL) { + ht->ht[h] = n->next; + u32_destroy_key(tp, n); + } + } +} + +static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode **hn; + + BUG_TRAP(!ht->refcnt); + + u32_clear_hnode(tp, ht); + + for (hn = &tp_c->hlist; *hn; hn = &(*hn)->next) { + if (*hn == ht) { + *hn = ht->next; + kfree(ht); + return 0; + } + } + + BUG_TRAP(0); + return -ENOENT; +} + +static void u32_destroy(struct tcf_proto *tp) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode *root_ht = xchg(&tp->root, NULL); + + BUG_TRAP(root_ht != NULL); + + if (root_ht && --root_ht->refcnt == 0) + u32_destroy_hnode(tp, root_ht); + + if (--tp_c->refcnt == 0) { + 
struct tc_u_hnode *ht; + struct tc_u_common **tp_cp; + + for (tp_cp = &u32_list; *tp_cp; tp_cp = &(*tp_cp)->next) { + if (*tp_cp == tp_c) { + *tp_cp = tp_c->next; + break; + } + } + + for (ht=tp_c->hlist; ht; ht = ht->next) + u32_clear_hnode(tp, ht); + + while ((ht = tp_c->hlist) != NULL) { + tp_c->hlist = ht->next; + + BUG_TRAP(ht->refcnt == 0); + + kfree(ht); + }; + + kfree(tp_c); + } + + tp->data = NULL; +} + +static int u32_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct tc_u_hnode *ht = (struct tc_u_hnode*)arg; + + if (ht == NULL) + return 0; + + if (TC_U32_KEY(ht->handle)) + return u32_delete_key(tp, (struct tc_u_knode*)ht); + + if (tp->root == ht) + return -EINVAL; + + if (--ht->refcnt == 0) + u32_destroy_hnode(tp, ht); + + return 0; +} + +static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) +{ + struct tc_u_knode *n; + unsigned i = 0x7FF; + + for (n=ht->ht[TC_U32_HASH(handle)]; n; n = n->next) + if (i < TC_U32_NODE(n->handle)) + i = TC_U32_NODE(n->handle); + i++; + + return handle|(i>0xFFF ? 0xFFF : i); +} + +static int u32_set_parms(struct Qdisc *q, struct tc_u_hnode *ht, + struct tc_u_knode *n, struct rtattr **tb) +{ + if (tb[TCA_U32_LINK-1]) { + u32 handle = *(u32*)RTA_DATA(tb[TCA_U32_LINK-1]); + struct tc_u_hnode *ht_down = NULL; + + if (TC_U32_KEY(handle)) + return -EINVAL; + + if (handle) { + ht_down = u32_lookup_ht(ht->tp_c, handle); + + if (ht_down == NULL) + return -EINVAL; + ht_down->refcnt++; + } + + ht_down = xchg(&n->ht_down, ht_down); + + if (ht_down) + ht_down->refcnt--; + } + if (tb[TCA_U32_CLASSID-1]) { + unsigned long cl = xchg(&n->res.class, 0); + if (cl) + q->ops->cl_ops->unbind_tcf(q, cl); + n->res.classid = *(u32*)RTA_DATA(tb[TCA_U32_CLASSID-1]); + n->res.class = q->ops->cl_ops->bind_tcf(q, n->res.classid); + } +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_U32_POLICE-1]) { + struct tcf_police *police = tcf_police_locate(tb[TCA_U32_POLICE-1]); + + tcf_police_release(xchg(&n->police, police)); + } +#endif + return 0; +} + +static int u32_change(struct tcf_proto *tp, u32 handle, + struct rtattr **tca, + unsigned long *arg) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode *ht; + struct tc_u_knode *n; + struct tc_u32_sel *s; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_U32_MAX]; + u32 htid; + int err; + + if (opt == NULL) + return handle ? 
-EINVAL : 0; + + if (rtattr_parse(tb, TCA_U32_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0) + return -EINVAL; + + if ((n = (struct tc_u_knode*)*arg) != NULL) { + if (TC_U32_KEY(n->handle) == 0) + return -EINVAL; + + return u32_set_parms(tp->q, n->ht_up, n, tb); + } + + if (tb[TCA_U32_DIVISOR-1]) { + unsigned divisor = *(unsigned*)RTA_DATA(tb[TCA_U32_DIVISOR-1]); + + if (--divisor > 0x100) + return -EINVAL; + if (TC_U32_KEY(handle)) + return -EINVAL; + if (handle == 0) { + handle = gen_new_htid(tp->data); + if (handle == 0) + return -ENOMEM; + } + ht = kmalloc(sizeof(*ht) + divisor*sizeof(void*), GFP_KERNEL); + if (ht == NULL) + return -ENOBUFS; + memset(ht, 0, sizeof(*ht) + divisor*sizeof(void*)); + ht->tp_c = tp_c; + ht->refcnt = 0; + ht->divisor = divisor; + ht->handle = handle; + ht->next = tp_c->hlist; + tp_c->hlist = ht; + *arg = (unsigned long)ht; + return 0; + } + + if (tb[TCA_U32_HASH-1]) { + htid = *(unsigned*)RTA_DATA(tb[TCA_U32_HASH-1]); + if (TC_U32_HTID(handle) == TC_U32_ROOT) { + ht = tp->root; + htid = ht->handle; + } else { + ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid)); + if (ht == NULL) + return -EINVAL; + } + } else { + ht = tp->root; + htid = ht->handle; + } + + if (ht->divisor < TC_U32_HASH(htid)) + return -EINVAL; + + if (handle) { + if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid)) + return -EINVAL; + if (TC_U32_HASH(handle) && TC_U32_HASH(handle^htid)) + return -EINVAL; + handle = htid | TC_U32_NODE(handle); + } else + handle = gen_new_kid(ht, htid); + + if (tb[TCA_U32_SEL-1] == 0 || + RTA_PAYLOAD(tb[TCA_U32_SEL-1]) < sizeof(struct tc_u32_sel)) + return -EINVAL; + + s = RTA_DATA(tb[TCA_U32_SEL-1]); + n = kmalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL); + if (n == NULL) + return -ENOBUFS; + memset(n, 0, sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key)); + memcpy(&n->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); + n->ht_up = ht; + n->handle = handle; + err = u32_set_parms(tp->q, ht, n, tb); + if (err == 0) { + struct tc_u_knode **ins; + for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next) + if (TC_U32_NODE(handle) >= TC_U32_NODE((*ins)->handle)) + break; + n->next = *ins; + *ins = n; + *arg = (unsigned long)n; + return 0; + } + kfree(n); + return err; +} + +static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode *ht; + struct tc_u_knode *n; + unsigned h; + + if (arg->stop) + return; + + for (ht = tp_c->hlist; ht; ht = ht->next) { + if (arg->count >= arg->skip) { + if (arg->fn(tp, (unsigned long)ht, arg) < 0) { + arg->stop = 1; + return; + } + } + arg->count++; + for (h = 0; h <= ht->divisor; h++) { + for (n = ht->ht[h]; n; n = n->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(tp, (unsigned long)n, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } + } +} + +#ifdef CONFIG_RTNETLINK +static int u32_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct tc_u_knode *n = (struct tc_u_knode*)fh; + unsigned char *b = skb->tail; + struct rtattr *rta; + + if (n == NULL) + return skb->len; + + t->tcm_handle = n->handle; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + if (TC_U32_KEY(n->handle) == 0) { + struct tc_u_hnode *ht = (struct tc_u_hnode*)fh; + u32 divisor = ht->divisor+1; + RTA_PUT(skb, TCA_U32_DIVISOR, 4, &divisor); + } else { + RTA_PUT(skb, TCA_U32_SEL, + sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), + &n->sel); + if 
(n->ht_up) { + u32 htid = n->handle & 0xFFFFF000; + RTA_PUT(skb, TCA_U32_HASH, 4, &htid); + } + if (n->res.classid) + RTA_PUT(skb, TCA_U32_CLASSID, 4, &n->res.classid); + if (n->ht_down) + RTA_PUT(skb, TCA_U32_LINK, 4, &n->ht_down->handle); +#ifdef CONFIG_NET_CLS_POLICE + if (n->police) { + struct rtattr * p_rta = (struct rtattr*)skb->tail; + + RTA_PUT(skb, TCA_U32_POLICE, 0, NULL); + + if (tcf_police_dump(skb, n->police) < 0) + goto rtattr_failure; + + p_rta->rta_len = skb->tail - (u8*)p_rta; + } +#endif + } + + rta->rta_len = skb->tail - b; + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif + +struct tcf_proto_ops cls_u32_ops = { + NULL, + "u32", + u32_classify, + u32_init, + u32_destroy, + + u32_get, + u32_put, + u32_change, + u32_delete, + u32_walk, +#ifdef CONFIG_RTNETLINK + u32_dump +#else + NULL +#endif +}; + +#ifdef MODULE +int init_module(void) +{ + return register_tcf_proto_ops(&cls_u32_ops); +} + +void cleanup_module(void) +{ + unregister_tcf_proto_ops(&cls_u32_ops); +} +#endif diff --git a/net/sched/estimator.c b/net/sched/estimator.c new file mode 100644 index 000000000..06defeec5 --- /dev/null +++ b/net/sched/estimator.c @@ -0,0 +1,183 @@ +/* + * net/sched/estimator.c Simple rate estimator. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +/* + This code is NOT intended to be used for statistics collection; + its purpose is to provide a base for statistical multiplexing + for controlled load service. + If you need only statistics, run a user level daemon which will + periodically read the byte counters. + + Unfortunately, rate estimation is not a very easy task. + F.e. I did not find a simple way to estimate the current peak rate, + and even failed to formulate the problem 8)8) + + So I preferred not to build the estimator into the scheduler, + but to run this task separately. + Ideally, it should be kernel thread(s), but for now it runs + from timers. This puts an apparent upper bound on the number of rated + flows, but keeps the overhead minimal for sets of aggregates that are + small, yet large enough to handle controlled load service. + + We measure the rate over A=(1<<interval) seconds and evaluate the EWMA: + + avrate = avrate*(1-W) + rate*W + + where W is chosen as a negative power of 2: W = 2^(-ewma_log) + + The resulting time constant is: + + T = A/(-ln(1-W)) + + + NOTES. + + * The stored value for avbps is scaled by 2^5, so that the maximal + rate is ~1Gbit; avpps is scaled by 2^10. + + * The minimal interval is HZ/4=250msec (it is the least integer divisor + for both HZ=100 and HZ=1024 8)), the maximal interval + is (HZ/4)*2^EST_MAX_INTERVAL = 8sec. Shorter intervals + are too expensive, longer ones can be implemented + at user level painlessly. + */ + +#if (HZ%4) != 0 +#error Bad HZ value.
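The scaled EWMA described in the comment above can be exercised in isolation. The following user-level sketch is illustrative only: the 2^5 scaling and W = 2^(-ewma_log) follow the comment, while the ewma_log value and the sample rates are made up for the example.

#include <stdio.h>

int main(void)
{
	unsigned int ewma_log = 3;	/* W = 2^-3 = 1/8 */
	long avbps = 0;			/* running average, scaled by 2^5 */
	long samples[] = { 1000000, 1000000, 500000, 500000 };	/* measured bytes/sec */
	int i;

	for (i = 0; i < 4; i++) {
		long rate = samples[i] << 5;		/* scale the new measurement */
		avbps += (rate - avbps) >> ewma_log;	/* avrate += W*(rate - avrate) */
		printf("estimated rate: %ld bytes/sec\n", avbps >> 5);
	}
	return 0;
}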
+#endif + +#define EST_MAX_INTERVAL 5 + +struct qdisc_estimator +{ + struct qdisc_estimator *next; + struct tc_stats *stats; + unsigned interval; + int ewma_log; + u64 last_bytes; + u32 last_packets; + u32 avpps; + u32 avbps; +}; + +struct qdisc_estimator_head +{ + struct timer_list timer; + struct qdisc_estimator *list; +}; + +static struct qdisc_estimator_head elist[EST_MAX_INTERVAL+1]; + +static void est_timer(unsigned long arg) +{ + int idx = (int)arg; + struct qdisc_estimator *e; + + for (e = elist[idx].list; e; e = e->next) { + u64 nbytes = e->stats->bytes; + u32 npackets = e->stats->packets; + u32 rate; + + rate = (nbytes - e->last_bytes)<<(7 - idx); + e->last_bytes = nbytes; + e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log; + e->stats->bps = (e->avbps+0xF)>>5; + + rate = (npackets - e->last_packets)<<(12 - idx); + e->last_packets = npackets; + e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log; + e->stats->pps = (e->avpps+0x1FF)>>10; + } + + elist[idx].timer.expires = jiffies + ((HZ/4)<<idx); + add_timer(&elist[idx].timer); +} + +int qdisc_new_estimator(struct tc_stats *stats, struct rtattr *opt) +{ + struct qdisc_estimator *est; + struct tc_estimator *parm = RTA_DATA(opt); + + if (RTA_PAYLOAD(opt) < sizeof(*parm)) + return -EINVAL; + + if (parm->interval < -2 || parm->interval > 3) + return -EINVAL; + + est = kmalloc(sizeof(*est), GFP_KERNEL); + if (est == NULL) + return -ENOBUFS; + + memset(est, 0, sizeof(*est)); + est->interval = parm->interval + 2; + est->stats = stats; + est->ewma_log = parm->ewma_log; + est->last_bytes = stats->bytes; + est->avbps = stats->bps<<5; + est->last_packets = stats->packets; + est->avpps = stats->pps<<10; + + est->next = elist[est->interval].list; + if (est->next == NULL) { + init_timer(&elist[est->interval].timer); + elist[est->interval].timer.data = est->interval; + elist[est->interval].timer.expires = jiffies + ((HZ/4)<<est->interval); + elist[est->interval].timer.function = est_timer; + add_timer(&elist[est->interval].timer); + } + elist[est->interval].list = est; + return 0; +} + +void qdisc_kill_estimator(struct tc_stats *stats) +{ + int idx; + struct qdisc_estimator *est, **pest; + + for (idx=0; idx <= EST_MAX_INTERVAL; idx++) { + int killed = 0; + pest = &elist[idx].list; + while ((est=*pest) != NULL) { + if (est->stats != stats) { + pest = &est->next; + continue; + } + /* ATOMIC_SET */ + *pest = est->next; + kfree(est); + killed++; + } + if (killed && elist[idx].list == NULL) + del_timer(&elist[idx].timer); + } +} + diff --git a/net/sched/police.c b/net/sched/police.c new file mode 100644 index 000000000..13599ac49 --- /dev/null +++ b/net/sched/police.c @@ -0,0 +1,196 @@ +/* + * net/sched/police.c Input police filter. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +#define BUG_TRAP(x) if (!(x)) { printk("Assertion (" #x ") failed at " __FILE__ "(%d):" __FUNCTION__ "\n", __LINE__); } + +#define L2T(p,L) ((p)->R_tab->data[(L)>>(p)->R_tab->rate.cell_log]) +#define L2T_P(p,L) ((p)->P_tab->data[(L)>>(p)->P_tab->rate.cell_log]) + +static u32 idx_gen; +static struct tcf_police *tcf_police_ht[16]; + +static __inline__ unsigned tcf_police_hash(u32 index) +{ + return index&0xF; +} + +static __inline__ struct tcf_police * tcf_police_lookup(u32 index) +{ + struct tcf_police *p; + + for (p = tcf_police_ht[tcf_police_hash(index)]; p; p = p->next) { + if (p->index == index) + return p; + } + return NULL; +} + +static __inline__ u32 tcf_police_new_index(void) +{ + do { + if (++idx_gen == 0) + idx_gen = 1; + } while (tcf_police_lookup(idx_gen)); + + return idx_gen; +} + + +void tcf_police_destroy(struct tcf_police *p) +{ + unsigned h = tcf_police_hash(p->index); + struct tcf_police **p1p; + + for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->next) { + if (*p1p == p) { + *p1p = p->next; + if (p->R_tab) + qdisc_put_rtab(p->R_tab); + if (p->P_tab) + qdisc_put_rtab(p->P_tab); + kfree(p); + return; + } + } + BUG_TRAP(0); +} + +struct tcf_police * tcf_police_locate(struct rtattr *rta) +{ + unsigned h; + struct tcf_police *p; + struct rtattr *tb[TCA_POLICE_MAX]; + struct tc_police *parm; + + if (rtattr_parse(tb, TCA_POLICE_MAX, RTA_DATA(rta), RTA_PAYLOAD(rta)) < 0) + return NULL; + + if (tb[TCA_POLICE_TBF-1] == NULL) + return NULL; + + parm = RTA_DATA(tb[TCA_POLICE_TBF-1]); + + if (parm->index && (p = tcf_police_lookup(parm->index)) != NULL) { + p->refcnt++; + return p; + } + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (p == NULL) + return NULL; + + memset(p, 0, sizeof(*p)); + p->refcnt = 1; + if ((p->R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1])) == NULL) + goto failure; + if (parm->peakrate.rate && + (p->P_tab = qdisc_get_rtab(&parm->peakrate, tb[TCA_POLICE_PEAKRATE-1])) == NULL) + goto failure; + p->toks = p->burst = parm->burst; + p->mtu = parm->mtu; + if (p->mtu == 0) + p->mtu = 255<<p->R_tab->rate.cell_log; + if (p->P_tab) + p->ptoks = L2T_P(p, p->mtu); + PSCHED_GET_TIME(p->t_c); + p->index = parm->index ? 
: tcf_police_new_index(); + p->action = parm->action; + h = tcf_police_hash(p->index); + p->next = tcf_police_ht[h]; + tcf_police_ht[h] = p; + return p; + +failure: + if (p->R_tab) + qdisc_put_rtab(p->R_tab); + kfree(p); + return NULL; +} + +int tcf_police(struct sk_buff *skb, struct tcf_police *p) +{ + psched_time_t now; + long toks; + long ptoks = 0; + + if (skb->len <= p->mtu) { + PSCHED_GET_TIME(now); + + toks = PSCHED_TDIFF_SAFE(now, p->t_c, p->burst, 0); + + if (p->P_tab) { + ptoks = toks + p->ptoks; + if (ptoks > (long)L2T_P(p, p->mtu)) + ptoks = (long)L2T_P(p, p->mtu); + ptoks -= L2T_P(p, skb->len); + } + toks += p->toks; + if (toks > (long)p->burst) + toks = p->burst; + toks -= L2T(p, skb->len); + + if ((toks|ptoks) >= 0) { + p->t_c = now; + p->toks = toks; + p->ptoks = ptoks; + return TC_POLICE_OK; + } + } + + return p->action; +} + +#ifdef CONFIG_RTNETLINK +int tcf_police_dump(struct sk_buff *skb, struct tcf_police *p) +{ + unsigned char *b = skb->tail; + struct tc_police opt; + + opt.index = p->index; + opt.action = p->action; + opt.mtu = p->mtu; + opt.burst = p->burst; + opt.rate = p->R_tab->rate; + if (p->P_tab) + opt.peakrate = p->P_tab->rate; + else + memset(&opt.peakrate, 0, sizeof(opt.peakrate)); + RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif + diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c new file mode 100644 index 000000000..6d36af30d --- /dev/null +++ b/net/sched/sch_api.c @@ -0,0 +1,994 @@ +/* + * net/sched/sch_api.c Packet scheduler API. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + + +#define BUG_TRAP(x) if (!(x)) { printk("Assertion (" #x ") failed at " __FILE__ "(%d):" __FUNCTION__ "\n", __LINE__); } + +#ifdef CONFIG_RTNETLINK +static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, + struct Qdisc *old, struct Qdisc *new); +static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, + struct Qdisc *q, unsigned long cl, int event); +#endif + +/* + + Short review. + ------------- + + This file consists of two interrelated parts: + + 1. queueing disciplines manager frontend. + 2. traffic classes manager frontend. + + Generally, queueing discipline ("qdisc") is a black box, + which is able to enqueue packets and to dequeue them (when + device is ready to send something) in order and at times + determined by algorithm hidden in it. + + qdisc's are divided to two categories: + - "queues", which have no internal structure visible from outside. 
+ - "schedulers", which split all the packets into "traffic classes", + using "packet classifiers" (look at cls_api.c) + + In turn, classes may have child qdiscs (as a rule, queues) + attached to them etc. etc. etc. + + The goal of the routines in this file is to translate + the information supplied by the user in the form of handles + into a form more intelligible to the kernel, to perform some sanity + checks and the parts of the work that are common to all qdiscs, + and to provide rtnetlink notifications. + + All real intelligent work is done inside qdisc modules. + + + + Every discipline has two major routines: enqueue and dequeue. + + ---dequeue + + dequeue usually returns a skb to send. It is allowed to return NULL, + but this does not mean that the queue is empty; it just means that + the discipline does not want to send anything this time. + The queue is really empty if q->q.qlen == 0. + For complicated disciplines with multiple queues, q->q is not + the real packet queue, but q->q.qlen must nevertheless be valid. + + ---enqueue + + enqueue returns the number of enqueued packets, i.e. this number is 1 + if the packet was enqueued successfully and <1 if something (not + necessarily THIS packet) was dropped. + + Auxiliary routines: + + ---requeue + + requeues a packet that was dequeued once. It is used for non-standard or + just buggy devices, which can defer output even if dev->tbusy=0. + + ---reset + + returns the qdisc to its initial state: purges all buffers, clears all + timers, counters (except for statistics) etc. + + ---init + + initializes a newly created qdisc. + + ---destroy + + destroys resources allocated by init and during the lifetime of the qdisc. + */ + +/************************************************ + * Queueing disciplines manipulation. * + ************************************************/ + + +/* The list of all installed queueing disciplines. */ + +static struct Qdisc_ops *qdisc_base = NULL; + +/* Register/unregister queueing discipline */ + +int register_qdisc(struct Qdisc_ops *qops) +{ + struct Qdisc_ops *q, **qp; + + for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) + if (strcmp(qops->id, q->id) == 0) + return -EEXIST; + + if (qops->enqueue == NULL) + qops->enqueue = noop_qdisc_ops.enqueue; + if (qops->requeue == NULL) + qops->requeue = noop_qdisc_ops.requeue; + if (qops->dequeue == NULL) + qops->dequeue = noop_qdisc_ops.dequeue; + + qops->next = NULL; + *qp = qops; + return 0; +} + +int unregister_qdisc(struct Qdisc_ops *qops) +{ + struct Qdisc_ops *q, **qp; + for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) + if (q == qops) + break; + if (!q) + return -ENOENT; + *qp = q->next; + q->next = NULL; + return 0; +} + +/* We know handle. Find qdisc among all qdiscs attached to device + (root qdisc, all its children, children of children etc.) + */ + +struct Qdisc *qdisc_lookup(struct device *dev, u32 handle) +{ + struct Qdisc *q; + + for (q = dev->qdisc_list; q; q = q->next) { + if (q->handle == handle) + return q; + } + return NULL; +} + +/* We know classid. Find qdisc among all qdiscs attached to device + (root qdisc, all its children, children of children etc.)
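The enqueue/dequeue conventions spelled out in the comment above can be modelled outside the kernel. The toy buffer below is purely illustrative (none of its names come from this patch); it only demonstrates the return-value contract: 1 from a successful enqueue, <1 when something is dropped, and a qlen counter as the authoritative emptiness test.

#include <stdio.h>
#include <stddef.h>

#define LIMIT 2			/* deliberately tiny, to force a drop */

static int ring[LIMIT];
static int head, tail, qlen;

static int toy_enqueue(int v)
{
	if (qlen >= LIMIT)
		return 0;		/* <1: something was dropped */
	ring[tail] = v;
	tail = (tail + 1) % LIMIT;
	qlen++;
	return 1;			/* enqueued successfully */
}

static int *toy_dequeue(void)
{
	static int v;

	if (qlen == 0)
		return NULL;		/* a real qdisc may also return NULL
					   while qlen != 0, when it simply does
					   not want to send right now */
	v = ring[head];
	head = (head + 1) % LIMIT;
	qlen--;
	return &v;
}

int main(void)
{
	int i, *p;

	for (i = 1; i <= 3; i++)
		printf("enqueue(%d) = %d\n", i, toy_enqueue(i));
	while ((p = toy_dequeue()) != NULL)
		printf("dequeue -> %d\n", *p);
	printf("qlen = %d\n", qlen);
	return 0;
}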
+ */ + +struct Qdisc *qdisc_lookup_class(struct device *dev, u32 classid) +{ + struct Qdisc *q; + + for (q = dev->qdisc_list; q; q = q->next) { + if (q->classid == classid) + return q; + } + return NULL; +} + + +/* Find queueing discipline by name */ + +struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind) +{ + struct Qdisc_ops *q; + + if (kind) { + for (q = qdisc_base; q; q = q->next) { + if (rtattr_strcmp(kind, q->id) == 0) + return q; + } + } + return NULL; +} + +static struct qdisc_rate_table *qdisc_rtab_list; + +struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab) +{ + struct qdisc_rate_table *rtab; + + for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { + if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) { + rtab->refcnt++; + return rtab; + } + } + + if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024) + return NULL; + + rtab = kmalloc(sizeof(*rtab), GFP_KERNEL); + if (rtab) { + rtab->rate = *r; + rtab->refcnt = 1; + memcpy(rtab->data, RTA_DATA(tab), 1024); + rtab->next = qdisc_rtab_list; + qdisc_rtab_list = rtab; + } + return rtab; +} + +void qdisc_put_rtab(struct qdisc_rate_table *tab) +{ + struct qdisc_rate_table *rtab, **rtabp; + + if (!tab || --tab->refcnt) + return; + + for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) { + if (rtab == tab) { + *rtabp = rtab->next; + kfree(rtab); + return; + } + } +} + + +/* Allocate an unique handle from space managed by kernel */ + +u32 qdisc_alloc_handle(struct device *dev) +{ + int i = 0x10000; + static u32 autohandle = TC_H_MAKE(0x80000000U, 0); + + do { + autohandle += TC_H_MAKE(0x10000U, 0); + if (autohandle == TC_H_MAKE(TC_H_ROOT, 0)) + autohandle = TC_H_MAKE(0x80000000U, 0); + } while (qdisc_lookup(dev, autohandle) && --i > 0); + + return i>0 ? autohandle : 0; +} + +/* Graft qdisc "new" to class "classid" of qdisc "parent" or + to device "dev". + + Old qdisc is not destroyed but returned in *old. + */ + +int qdisc_graft(struct device *dev, struct Qdisc *parent, u32 classid, + struct Qdisc *new, struct Qdisc **old) +{ + int err = 0; + + if (parent == NULL) { + BUG_TRAP(classid == TC_H_ROOT); + if (new) { + new->parent = NULL; + new->classid = TC_H_ROOT; + } + *old = dev_set_scheduler(dev, new); + } else { + struct Qdisc_class_ops *cops = parent->ops->cl_ops; + + BUG_TRAP(classid != TC_H_ROOT); + + err = -EINVAL; + + if (cops) { + unsigned long cl = cops->get(parent, classid); + if (cl) { + err = cops->graft(parent, cl, new, old); + cops->put(parent, cl); + } + } + } + return err; +} + +#ifdef CONFIG_RTNETLINK + +/* + Allocate and initialize new qdisc. + + Parameters are passed via opt. + */ + +static struct Qdisc * +qdisc_create(struct device *dev, struct Qdisc_ops *ops, u32 handle, + u32 parentid, struct rtattr **tca, int *errp) +{ + int err; + struct rtattr *kind = tca[TCA_KIND-1]; + struct Qdisc *sch = NULL; + int size; + int new = 0; + + if (ops == NULL) { + ops = qdisc_lookup_ops(kind); + err = -EINVAL; + if (ops == NULL) + goto err_out; + new = 1; + } + + size = sizeof(*sch) + ops->priv_size; + + sch = kmalloc(size, GFP_KERNEL); + err = -ENOBUFS; + if (!sch) + goto err_out; + + /* Grrr... 
Resolve race condition with module unload */ + + err = -EINVAL; + if (new) { + if (ops != qdisc_lookup_ops(kind)) + goto err_out; + } else if (kind) { + if (rtattr_strcmp(kind, ops->id)) + goto err_out; + } + + memset(sch, 0, size); + + skb_queue_head_init(&sch->q); + sch->ops = ops; + sch->enqueue = ops->enqueue; + sch->dequeue = ops->dequeue; + sch->dev = dev; + if (handle == 0) { + handle = qdisc_alloc_handle(dev); + err = -ENOMEM; + if (handle == 0) + goto err_out; + } + sch->handle = handle; + sch->classid = parentid; + + if (ops->init && (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { + sch->next = dev->qdisc_list; + dev->qdisc_list = sch; +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) + qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]); +#endif + return sch; + } + +err_out: + *errp = err; + if (sch) + kfree(sch); + return NULL; +} + + +/* + Create/delete/change/get qdisc. + */ + +static int tc_ctl_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct tcmsg *tcm = NLMSG_DATA(n); + struct rtattr **tca = arg; + struct device *dev; + u32 clid = tcm->tcm_parent; + struct Qdisc *old_q; + struct Qdisc *q = NULL; + struct Qdisc *p = NULL; + struct Qdisc *leaf = NULL; + struct Qdisc_ops *qops = NULL; + int err; + + /* Find device */ + if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) + return -ENODEV; + + /* If parent is specified, it must exist + and tcm_parent selects a class in parent which + new qdisc will be attached to. + + The place may be already busy by another qdisc, + remember this fact, if it was not auto-created discipline. + */ + if (clid) { + if (clid != TC_H_ROOT) { + p = qdisc_lookup(dev, TC_H_MAJ(clid)); + if (p == NULL) + return -ENOENT; + leaf = qdisc_lookup_class(dev, clid); + } else + leaf = dev->qdisc_sleeping; + + if (leaf && leaf->flags&TCQ_F_DEFAULT && n->nlmsg_type == RTM_NEWQDISC) + leaf = NULL; + + /* + Also, leaf may be exactly that qdisc, which we want + to control. Remember this to avoid one more qdisc_lookup. + */ + + if (leaf && leaf->handle == tcm->tcm_handle) + q = leaf; + } + + /* Try to locate the discipline */ + if (tcm->tcm_handle && q == NULL) { + if (TC_H_MIN(tcm->tcm_handle)) + return -EINVAL; + q = qdisc_lookup(dev, tcm->tcm_handle); + } + + /* If discipline already exists, check that its real parent + matches to one selected by tcm_parent. + */ + + if (q) { + if (clid && p != q->parent) + return -EINVAL; + BUG_TRAP(!leaf || leaf == q); + if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) + return -EINVAL; + clid = q->classid; + goto process_existing; + } + + /* The discipline is known not to exist. + If parent was not selected too, return error. + */ + if (clid == 0) + return tcm->tcm_handle ? -ENOENT : -EINVAL; + + /* Check for the case when leaf is exactly the thing, + that you want. 
+ */ + + if (leaf && tcm->tcm_handle == 0) { + q = leaf; + if (!tca[TCA_KIND-1] || rtattr_strcmp(tca[TCA_KIND-1], q->ops->id) == 0) + goto process_existing; + } + + if (n->nlmsg_type != RTM_NEWQDISC || !(n->nlmsg_flags&NLM_F_CREATE)) + return -ENOENT; + if (leaf && n->nlmsg_flags&NLM_F_EXCL) + return -EEXIST; + +create_and_graft: + q = qdisc_create(dev, qops, tcm->tcm_handle, clid, tca, &err); + if (q == NULL) + return err; + +graft: + err = qdisc_graft(dev, p, clid, q, &old_q); + if (err) { + if (q) + qdisc_destroy(q); + return err; + } + qdisc_notify(skb, n, old_q, q); + if (old_q) + qdisc_destroy(old_q); + return 0; + +process_existing: + + switch (n->nlmsg_type) { + case RTM_NEWQDISC: + if (n->nlmsg_flags&NLM_F_EXCL) + return -EEXIST; + qops = q->ops; + goto create_and_graft; + case RTM_GETQDISC: + qdisc_notify(skb, n, NULL, q); + return 0; + case RTM_DELQDISC: + q = NULL; + goto graft; + default: + return -EINVAL; + } +} + +static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, + pid_t pid, u32 seq, unsigned flags, int event) +{ + struct tcmsg *tcm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); + nlh->nlmsg_flags = flags; + tcm = NLMSG_DATA(nlh); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0; + tcm->tcm_parent = q->classid; + tcm->tcm_handle = q->handle; + tcm->tcm_info = 0; + RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id); + if (q->ops->dump && q->ops->dump(q, skb) < 0) + goto rtattr_failure; + q->stats.qlen = q->q.qlen; + RTA_PUT(skb, TCA_STATS, sizeof(q->stats), &q->stats); + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, + struct Qdisc *old, struct Qdisc *new) +{ + struct sk_buff *skb; + pid_t pid = oskb ? NETLINK_CB(oskb).pid : 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (old && !(old->flags&TCQ_F_DEFAULT)) { + if (tc_fill_qdisc(skb, old, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0) + goto err_out; + } + if (new) { + if (tc_fill_qdisc(skb, new, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) + goto err_out; + } + + if (skb->len) + return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + +err_out: + kfree_skb(skb); + return -EINVAL; +} + +static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx, q_idx; + int s_idx, s_q_idx; + struct device *dev; + struct Qdisc *q; + + s_idx = cb->args[0]; + s_q_idx = q_idx = cb->args[1]; + for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { + if (idx < s_idx) + continue; + if (idx > s_idx) + s_q_idx = 0; + for (q = dev->qdisc_list, q_idx = 0; q; + q = q->next, q_idx++) { + if (q_idx < s_q_idx) + continue; + if (tc_fill_qdisc(skb, q, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) + goto done; + } + } + +done: + cb->args[0] = idx; + cb->args[1] = q_idx; + + return skb->len; +} + + + +/************************************************ + * Traffic classes manipulation. 
* + ************************************************/ + + + +static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct tcmsg *tcm = NLMSG_DATA(n); + struct rtattr **tca = arg; + struct device *dev; + struct Qdisc *q = NULL; + struct Qdisc_class_ops *cops; + unsigned long cl = 0; + unsigned long new_cl; + u32 pid = tcm->tcm_parent; + u32 clid = tcm->tcm_handle; + u32 qid = TC_H_MAJ(clid); + int err; + + if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) + return -ENODEV; + + /* + parent == TC_H_UNSPEC - unspecified parent. + parent == TC_H_ROOT - class is root, which has no parent. + parent == X:0 - parent is root class. + parent == X:Y - parent is a node in hierarchy. + parent == 0:Y - parent is X:Y, where X:0 is qdisc. + + handle == 0:0 - generate handle from kernel pool. + handle == 0:Y - class is X:Y, where X:0 is qdisc. + handle == X:Y - clear. + handle == X:0 - root class. + */ + + /* Step 1. Determine qdisc handle X:0 */ + + if (pid != TC_H_ROOT) { + u32 qid1 = TC_H_MAJ(pid); + + if (qid && qid1) { + /* If both majors are known, they must be identical. */ + if (qid != qid1) + return -EINVAL; + } else if (qid1) { + qid = qid1; + } else if (qid == 0) + qid = dev->qdisc_sleeping->handle; + + /* Now qid is genuine qdisc handle consistent + both with parent and child. + + TC_H_MAJ(pid) still may be unspecified, complete it now. + */ + if (pid) + pid = TC_H_MAKE(qid, pid); + } else { + if (qid == 0) + qid = dev->qdisc_sleeping->handle; + } + + /* OK. Locate qdisc */ + if ((q = qdisc_lookup(dev, qid)) == NULL) + return -ENOENT; + + /* An check that it supports classes */ + cops = q->ops->cl_ops; + if (cops == NULL) + return -EINVAL; + + /* Now try to get class */ + if (clid == 0) { + if (pid == TC_H_ROOT) + clid = qid; + } else + clid = TC_H_MAKE(qid, clid); + + if (clid) + cl = cops->get(q, clid); + + if (cl == 0) { + err = -ENOENT; + if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE)) + goto out; + } else { + switch (n->nlmsg_type) { + case RTM_NEWTCLASS: + err = -EEXIST; + if (n->nlmsg_flags&NLM_F_EXCL) + goto out; + break; + case RTM_DELTCLASS: + err = cops->delete(q, cl); + if (err == 0) + tclass_notify(skb, n, q, cl, RTM_DELTCLASS); + goto out; + case RTM_GETTCLASS: + err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS); + goto out; + default: + err = -EINVAL; + goto out; + } + } + + new_cl = cl; + err = cops->change(q, clid, pid, tca, &new_cl); + if (err == 0) + tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS); + +out: + if (cl) + cops->put(q, cl); + + return err; +} + + +static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, + unsigned long cl, + pid_t pid, u32 seq, unsigned flags, int event) +{ + struct tcmsg *tcm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); + nlh->nlmsg_flags = flags; + tcm = NLMSG_DATA(nlh); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0; + tcm->tcm_parent = q->handle; + tcm->tcm_handle = q->handle; + tcm->tcm_info = 0; + RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id); + if (q->ops->cl_ops->dump && q->ops->cl_ops->dump(q, cl, skb, tcm) < 0) + goto rtattr_failure; + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, + struct Qdisc *q, unsigned long cl, int event) +{ + struct sk_buff *skb; + pid_t pid = oskb ? 
NETLINK_CB(oskb).pid : 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) { + kfree_skb(skb); + return -EINVAL; + } + + return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); +} + +struct qdisc_dump_args +{ + struct qdisc_walker w; + struct sk_buff *skb; + struct netlink_callback *cb; +}; + +static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg) +{ + struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg; + + return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid, + a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS); +} + +static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) +{ + int t; + int s_t; + struct device *dev; + struct Qdisc *q; + struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); + struct qdisc_dump_args arg; + + if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) + return 0; + if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) + return 0; + + s_t = cb->args[0]; + + for (q=dev->qdisc_list, t=0; q; q = q->next, t++) { + if (t < s_t) continue; + if (!q->ops->cl_ops) continue; + if (tcm->tcm_parent && TC_H_MAJ(tcm->tcm_parent) != q->handle + && (tcm->tcm_parent != TC_H_ROOT || q->parent != NULL)) + continue; + if (t > s_t) + memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int)); + arg.w.fn = qdisc_class_dump; + arg.skb = skb; + arg.cb = cb; + arg.w.stop = 0; + arg.w.skip = cb->args[1]; + arg.w.count = 0; + q->ops->cl_ops->walk(q, &arg.w); + cb->args[1] = arg.w.count; + if (arg.w.stop) + break; + } + + cb->args[0] = t; + + return skb->len; +} +#endif + +int psched_us_per_tick = 1; +int psched_tick_per_us = 1; + +#ifdef CONFIG_PROC_FS +static int psched_read_proc(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + int len; + + len = sprintf(buffer, "%08x %08x\n", + psched_tick_per_us, psched_us_per_tick); + + len -= offset; + + if (len > length) + len = length; + if(len < 0) + len = 0; + + *start = buffer + offset; + *eof = 1; + + return len; +} +#endif + +psched_time_t psched_time_base; + +#if PSCHED_CLOCK_SOURCE == PSCHED_CPU +psched_tdiff_t psched_clock_per_hz; +int psched_clock_scale; +#endif + +#ifdef PSCHED_WATCHER +u32 psched_time_mark; + +static void psched_tick(unsigned long); + +static struct timer_list psched_timer = + { NULL, NULL, 0, 0L, psched_tick }; + +static void psched_tick(unsigned long dummy) +{ +#if PSCHED_CLOCK_SOURCE == PSCHED_CPU + psched_time_t dummy_stamp; + PSCHED_GET_TIME(dummy_stamp); + psched_timer.expires = jiffies + 4*HZ; +#else + unsigned long jiffies = now; + psched_time_base = ((u64)now)<<PSCHED_JSCALE; + psched_time_mark = now; + psched_timer.expires = jiffies + 60*60*HZ; +#endif + add_timer(&psched_timer); +} +#endif + +#if PSCHED_CLOCK_SOURCE == PSCHED_CPU +__initfunc(int psched_calibrate_clock(void)) +{ + psched_time_t stamp, stamp1; + struct timeval tv, tv1; + psched_tdiff_t delay; + long rdelay; + unsigned long stop; + +#if CPU == 586 || CPU == 686 + if (!(boot_cpu_data.x86_capability & 16)) + return -1; +#endif + + start_bh_atomic(); +#ifdef PSCHED_WATCHER + psched_tick(0); +#endif + stop = jiffies + HZ/10; + PSCHED_GET_TIME(stamp); + do_gettimeofday(&tv); + while (jiffies < stop) + boundary(); + PSCHED_GET_TIME(stamp1); + do_gettimeofday(&tv1); + end_bh_atomic(); + + delay = PSCHED_TDIFF(stamp1, stamp); + rdelay = tv1.tv_usec - tv.tv_usec; + rdelay += (tv1.tv_sec - tv.tv_sec)*1000000; + if (rdelay > delay) + return -1; 
+ delay /= rdelay; + psched_tick_per_us = delay; + while ((delay>>=1) != 0) + psched_clock_scale++; + psched_us_per_tick = 1<<psched_clock_scale; + psched_clock_per_hz = (delay*(1000000/HZ))>>psched_clock_scale; + return 0; +} +#endif + +__initfunc(int pktsched_init(void)) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *ent; +#endif + +#if PSCHED_CLOCK_SOURCE == PSCHED_CPU + if (psched_calibrate_clock() < 0) + return -1; +#elif PSCHED_CLOCK_SOURCE == PSCHED_JIFFIES + psched_tick_per_us = HZ<<PSCHED_JSCALE; + psched_us_per_tick = 1000000; +#endif + +#ifdef CONFIG_RTNETLINK + struct rtnetlink_link *link_p = rtnetlink_links[AF_UNSPEC]; + + /* Setup rtnetlink links. It is made here to avoid + exporting large number of public symbols. + */ + + if (link_p) { + link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_ctl_qdisc; + link_p[RTM_DELQDISC-RTM_BASE].doit = tc_ctl_qdisc; + link_p[RTM_GETQDISC-RTM_BASE].doit = tc_ctl_qdisc; + link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc; + link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass; + link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass; + link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass; + link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass; + } +#endif + +#define INIT_QDISC(name) { \ + extern struct Qdisc_ops name##_qdisc_ops; \ + register_qdisc(&##name##_qdisc_ops); \ + } + + INIT_QDISC(pfifo); + INIT_QDISC(bfifo); + +#ifdef CONFIG_NET_SCH_CBQ + INIT_QDISC(cbq); +#endif +#ifdef CONFIG_NET_SCH_CSZ + INIT_QDISC(csz); +#endif +#ifdef CONFIG_NET_SCH_HPFQ + INIT_QDISC(hpfq); +#endif +#ifdef CONFIG_NET_SCH_HFSC + INIT_QDISC(hfsc); +#endif +#ifdef CONFIG_NET_SCH_RED + INIT_QDISC(red); +#endif +#ifdef CONFIG_NET_SCH_SFQ + INIT_QDISC(sfq); +#endif +#ifdef CONFIG_NET_SCH_TBF + INIT_QDISC(tbf); +#endif +#ifdef CONFIG_NET_SCH_TEQL + teql_init(); +#endif +#ifdef CONFIG_NET_SCH_PRIO + INIT_QDISC(prio); +#endif +#ifdef CONFIG_NET_CLS + tc_filter_init(); +#endif + +#ifdef CONFIG_PROC_FS + ent = create_proc_entry("net/psched", 0, 0); + ent->read_proc = psched_read_proc; +#endif + + return 0; +} diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 626afe555..759ef4d57 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -10,6 +10,8 @@ * */ +#include <linux/config.h> +#include <linux/module.h> #include <asm/uaccess.h> #include <asm/system.h> #include <asm/bitops.h> @@ -47,222 +49,279 @@ [3] Sally Floyd, "Notes on Class-Based Queueing: Setting Parameters", 1996 - Algorithm skeleton is taken from from NS simulator cbq.cc. + [4] Sally Floyd and Michael Speer, "Experimental Results + for Class-Based Queueing", 1998, not published. ----------------------------------------------------------------------- - Differences from NS version. - - --- WRR algorith is different. Our version looks more reasonable :-) - and fair when quanta are allowed to be less than MTU. - - --- cl->aveidle is REALLY limited from below by cl->minidle. - Seems, it was bug in NS. - - --- Purely lexical change: "depth" -> "level", "maxdepth" -> "toplevel". - When depth increases we expect, that the thing becomes lower, does not it? :-) - Besides that, "depth" word is semantically overloaded --- - "token bucket depth", "sfq depth"... Besides that, the algorithm - was called "top-LEVEL sharing". - - PROBLEM. - - --- Linux has no EOI event at the moment, so that we cannot - estimate true class idle time. Three workarounds are possible, - all of them have drawbacks: - - 1. (as now) Consider the next dequeue event as sign that - previous packet is finished. 
It is wrong because of ping-pong - buffers, but on permanently loaded link it is true. - 2. (NS approach) Use as link busy time estimate skb->leb/"physical - bandwidth". Even more wrong f.e. on ethernet real busy time much - higher because of collisions. - 3. (seems, the most clever) Split net bh to two parts: - NETRX_BH (for received packets) and preserve NET_BH for transmitter. - It will not require driver changes (NETRX_BH flag will be set - in netif_rx), but will allow to trace EOIs more precisely - and will save useless checks in net_bh. Besides that we will - have to eliminate random calling hard_start_xmit with dev->tbusy flag - (done) and to drop failure_q --- i.e. if !dev->tbusy hard_start_xmit - MUST succeed; failed packets will be dropped on the floor. + Algorithm skeleton is taken from from NS simulator cbq.cc. + If someone wants to check this text against LBL version, + he should take into account that ONLY skeleton is borrowed, + implementation is different. Particularly: + + --- WRR algorithm is different. Our version looks + more reasonable (I hope) and works when quanta are allowed + to be less than MTU, which always is the case, when real time + classes have small rates. Note, that the statement of [3] is incomplete, + Actually delay may be estimated even if class per-round allotment + less than MTU. Namely, if per-round allotment is W*r_i, + and r_1+...+r_k = r < 1 + + delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B + + In the worst case we have IntServ estimate with D = W*r+k*MTU + and C = MTU*r. The proof (if correct at all) is trivial. + + + --- Seems, cbq-2.0 is not very accurate. At least, I cannot + interpret some places, which look like wrong translation + from NS. Anyone is advertised to found these differences + and explain me, why I am wrong 8). + + --- Linux has no EOI event, so that we cannot estimate true class + idle time. Workaround is to consider the next dequeue event + as sign that previous packet is finished. It is wrong because of + internal device queueing, but on permanently loaded link it is true. + Moreover, combined with clock integrator, this scheme looks + very close to ideal solution. */ -#define CBQ_TOPLEVEL_SHARING -/* #define CBQ_NO_TRICKERY */ +struct cbq_sched_data; -#define CBQ_CLASSIFIER(skb, q) ((q)->fallback_class) struct cbq_class { + struct cbq_class *next; /* hash table link */ + struct cbq_class *next_alive; /* next class with backlog in this priority band */ + /* Parameters */ - int priority; /* priority */ -#ifdef CBQ_TOPLEVEL_SHARING - int level; /* level of the class in hierarchy: - 0 for leaf classes, and maximal - level of childrens + 1 for nodes. - */ + u32 classid; + unsigned char priority; /* class priority */ + unsigned char priority2; /* priority to be used after overlimit */ + unsigned char ewma_log; /* time constant for idle time calculation */ + unsigned char ovl_strategy; +#ifdef CONFIG_NET_CLS_POLICE + unsigned char police; #endif + u32 defmap; + + /* Link-sharing scheduler parameters */ long maxidle; /* Class paramters: see below. 
*/ + long offtime; long minidle; - int filter_log; -#ifndef CBQ_NO_TRICKERY - long extradelay; -#endif + u32 avpkt; + struct qdisc_rate_table *R_tab; - long quantum; /* Allotment per WRR round */ - long rquantum; /* Relative allotment: see below */ + /* Overlimit strategy parameters */ + void (*overlimit)(struct cbq_class *cl); + long penalty; - int cell_log; - unsigned long L_tab[256]; + /* General scheduler (WRR) parameters */ + long allot; + long quantum; /* Allotment per WRR round */ + long weight; /* Relative allotment: see below */ - struct Qdisc *qdisc; /* ptr to CBQ discipline */ - struct cbq_class *root; /* Ptr to root class; - root can be not unique. - */ - struct cbq_class *parent; /* Ptr to parent in the class tree */ + struct Qdisc *qdisc; /* Ptr to CBQ discipline */ + struct cbq_class *split; /* Ptr to split node */ + struct cbq_class *share; /* Ptr to LS parent in the class tree */ + struct cbq_class *tparent; /* Ptr to tree parent in the class tree */ struct cbq_class *borrow; /* NULL if class is bandwidth limited; parent otherwise */ + struct cbq_class *sibling; /* Sibling chain */ + struct cbq_class *children; /* Pointer to children chain */ struct Qdisc *q; /* Elementary queueing discipline */ - struct cbq_class *next; /* next class in this priority band */ - struct cbq_class *next_alive; /* next class with backlog in this priority band */ /* Variables */ - psched_time_t last; + unsigned char cpriority; /* Effective priority */ + unsigned char delayed; + unsigned char level; /* level of the class in hierarchy: + 0 for leaf classes, and maximal + level of children + 1 for nodes. + */ + + psched_time_t last; /* Last end of service */ psched_time_t undertime; long avgidle; long deficit; /* Saved deficit for WRR */ - char awake; /* Class is in alive list */ + unsigned long penalized; + struct tc_stats stats; + struct tc_cbq_xstats xstats; -#if 0 - void (*overlimit)(struct cbq_class *cl); -#endif -}; + struct tcf_proto *filter_list; -#define L2T(cl,len) ((cl)->L_tab[(len)>>(cl)->cell_log]) + int refcnt; + int filters; + + struct cbq_class *defaults[TC_PRIO_MAX+1]; +}; struct cbq_sched_data { - struct cbq_class *classes[CBQ_MAXPRIO]; /* List of all classes */ - int nclasses[CBQ_MAXPRIO]; - unsigned quanta[CBQ_MAXPRIO]; - unsigned mtu; - int cell_log; - unsigned long L_tab[256]; - struct cbq_class *fallback_class; + struct cbq_class *classes[16]; /* Hash table of all classes */ + int nclasses[TC_CBQ_MAXPRIO+1]; + unsigned quanta[TC_CBQ_MAXPRIO+1]; + + struct cbq_class link; unsigned activemask; - struct cbq_class *active[CBQ_MAXPRIO]; /* List of all classes - with backlog */ - struct cbq_class *last_sent; - int last_sent_len; + struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes + with backlog */ + struct cbq_class *tx_class; + struct cbq_class *tx_borrowed; + int tx_len; psched_time_t now; /* Cached timestamp */ + unsigned pmask; + struct timer_list delay_timer; struct timer_list wd_timer; /* Wathchdog timer, that started when CBQ has backlog, but cannot transmit just now */ - unsigned long wd_expires; -#ifdef CBQ_TOPLEVEL_SHARING - struct cbq_class *borrowed; + long wd_expires; int toplevel; -#endif + u32 hgenerator; }; -/* - WRR quanta - ---------- - cl->quantum is number added to class allotment on every round. - cl->rquantum is "relative" quantum. 
+#define L2T(cl,len) ((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log]) - For real-time classes: +#define BUG_TRAP(x) if (!(x)) { printk("Assertion (" #x ") failed at " __FILE__ "(%d):" __FUNCTION__ "\n", __LINE__); } - cl->quantum = (cl->rquantum*q->nclasses[prio]*q->mtu)/q->quanta[prio] - where q->quanta[prio] is sum of all rquanta for given priority. - cl->rquantum can be identified with absolute rate of the class - in arbitrary units (f.e. bytes/sec) +static __inline__ unsigned cbq_hash(u32 h) +{ + h ^= h>>8; + h ^= h>>4; + return h&0xF; +} - In this case, delay introduced by round-robin was estimated by - Sally Floyd [2] as: +static __inline__ struct cbq_class * +cbq_class_lookup(struct cbq_sched_data *q, u32 classid) +{ + struct cbq_class *cl; - D = q->nclasses*q->mtu/(bandwidth/2) + for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next) + if (cl->classid == classid) + return cl; + return NULL; +} - Note, that D does not depend on class rate (it is very bad), - but not much worse than Gallager-Parekh estimate for CSZ - C/R = q->mtu/rate, when real-time classes have close rates. +#ifdef CONFIG_NET_CLS_POLICE - For not real-time classes this folmula is not necessary, - so that cl->quantum can be set to any reasonable not zero value. - Apparently, it should be proportional to class rate, if the - rate is not zero. -*/ +static struct cbq_class * +cbq_reclassify(struct sk_buff *skb, struct cbq_class *this) +{ + struct cbq_class *cl, *new; -/* - maxidle, minidle, extradelay - ---------------------------- - - CBQ estimator calculates smoothed class idle time cl->aveidle, - considering class as virtual interface with corresponding bandwidth. - When cl->aveidle wants to be less than zero, class is overlimit. - When it is positive, class is underlimit. - - * maxidle bounds aveidle from above. - It controls maximal length of burst in this class after - long period of idle time. Burstness of active class - is controlled by filter constant cl->filter_log, - but this number is related to burst length only indirectly. - - * minidle is a negative number, normally set to zero. - Setting it to not zero value allows avgidle to drop - below zero, effectively penalizing class, when it is overlimit. - When the class load will decrease, it will take a time to - raise negative avgidle to put the class at limit. - It should be set to zero for leaf classes. - - * extradelay is penalty in delay, when a class goes overlimit. - I believe this parameter is useless and confusing. - Setting it to not zero forces class to accumulate - its "idleness" for extradelay and then send BURST of packets - until going to overlimit again. Non-sense. - - For details see [1] and [3]. - - Really, minidle and extradelay are irrelevant to real scheduling - task. As I understand, SF&VJ introduced them to experiment - with CBQ simulator in attempts to fix erratic behaviour - of ancestor-only (and, partially, top-level) algorithm. - - WARNING. - - User passes them measured in usecs, but cl->minidle, - cl->maxidle and cl->aveidle are scaled with cl->filter_log - in the text of the scheduler. -*/ + for (cl = this->tparent; cl; cl = cl->tparent) + if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this) + return new; + + return NULL; +} + +#endif + +/* Classify packet. The procedure is pretty complicated, but + it allows us to combine link sharing and priority scheduling + transparently. + + Namely, you can put link sharing rules (f.e. route based) at root of CBQ, + so that it resolves to split nodes. 
Then packeta are classified + by logical priority, or more specific classifier may be attached + to split node. + */ + +static struct cbq_class * +cbq_classify(struct sk_buff *skb, struct Qdisc *sch) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data; + struct cbq_class *head = &q->link; + struct cbq_class **defmap; + struct cbq_class *cl = NULL; + u32 prio = skb->priority; + struct tcf_result res; + + /* + * Step 1. If skb->priority points to one of our classes, use it. + */ + if (TC_H_MAJ(prio^sch->handle) == 0 && + (cl = cbq_class_lookup(q, prio)) != NULL) + return cl; + + for (;;) { + int result = 0; + + defmap = head->defaults; + + /* + * Step 2+n. Apply classifier. + */ + if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0) + goto fallback; + + if ((cl = (void*)res.class) == NULL) { + if (TC_H_MAJ(res.classid)) + cl = cbq_class_lookup(q, res.classid); + else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL) + cl = defmap[TC_PRIO_BESTEFFORT]; + + if (cl == NULL) + goto fallback; + } + + if (cl->level == 0) { +#ifdef CONFIG_NET_CLS_POLICE + if (result) + return cbq_reclassify(skb, cl); +#endif + return cl; + } + + /* + * Step 3+n. If classifier selected link sharing class, + * apply agency specific classifier. + * Repeat this procdure until we hit leaf node. + */ + head = cl; + } + +fallback: + cl = head; + + /* + * Step 4. No success... + */ + if (TC_H_MAJ(prio) == 0 && + !(cl = head->defaults[prio&TC_PRIO_MAX]) && + !(cl = head->defaults[TC_PRIO_BESTEFFORT])) + return head; + + return cl; +} /* A packet has just been enqueued on the empty class. - cbq_wakeup_class adds it to the tail of active class list + cbq_activate_class adds it to the tail of active class list of its priority band. */ -static __inline__ void cbq_wakeup_class(struct cbq_class *cl) +static __inline__ void cbq_activate_class(struct cbq_class *cl) { struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data; - int prio = cl->priority; + int prio = cl->cpriority; struct cbq_class *cl_tail; - cl->awake = 1; - cl_tail = q->active[prio]; q->active[prio] = cl; if (cl_tail != NULL) { cl->next_alive = cl_tail->next_alive; + cl_tail->next_alive = cl; cl->deficit = 0; } else { cl->next_alive = cl; @@ -271,58 +330,353 @@ static __inline__ void cbq_wakeup_class(struct cbq_class *cl) } } +/* + Unlink class from active chain. + Note, that the same procedure is made directly in cbq_dequeue* + during round-robin procedure. 
+ */ + +static void cbq_deactivate_class(struct cbq_class *this) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data; + int prio = this->cpriority; + struct cbq_class *cl; + struct cbq_class *cl_prev = q->active[prio]; + + do { + cl = cl_prev->next_alive; + if (cl == this) { + cl_prev->next_alive = cl->next_alive; + cl->next_alive = NULL; + + if (cl == q->active[prio]) { + q->active[prio] = cl_prev; + if (cl == q->active[prio]) { + q->active[prio] = NULL; + q->activemask &= ~(1<<prio); + return; + } + } + + cl = cl_prev->next_alive; + cl->deficit += cl->quantum; + return; + } + } while ((cl_prev = cl) != q->active[prio]); +} + +static __inline__ void +cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) +{ + if (q->toplevel > 0) { + psched_time_t now; + PSCHED_GET_TIME(now); + if (PSCHED_TLESS(now, q->now)) + now = q->now; + if (PSCHED_TLESS(cl->undertime, now)) { + q->toplevel = 0; + return; + } + while ((cl = cl->borrow) != NULL + && q->toplevel > cl->level) { + if (PSCHED_TLESS(cl->borrow->undertime, now)) { + q->toplevel = cl->level; + return; + } + } + } +} + static int cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; - struct cbq_class *cl = CBQ_CLASSIFIER(skb, q); + struct cbq_class *cl = cbq_classify(skb, sch); + int len = skb->len; - if (cl->q->enqueue(skb, cl->q) == 1) { + if (cl && cl->q->enqueue(skb, cl->q) == 1) { sch->q.qlen++; + sch->stats.packets++; + cl->stats.packets++; + sch->stats.bytes+=len; + cl->stats.bytes+=len; + cbq_mark_toplevel(q, cl); + if (!cl->next_alive) + cbq_activate_class(cl); + return 1; + } -#ifdef CBQ_TOPLEVEL_SHARING - if (q->toplevel > 0) { - psched_time_t now; - PSCHED_GET_TIME(now); - if (PSCHED_TLESS(cl->undertime, now)) - q->toplevel = 0; - else if (q->toplevel > 1 && cl->borrow && - PSCHED_TLESS(cl->borrow->undertime, now)) - q->toplevel = 1; - } -#endif - if (!cl->awake) - cbq_wakeup_class(cl); + sch->stats.drops++; + if (cl == NULL) + kfree_skb(skb); + else + cl->stats.drops++; + return 0; +} + +static int +cbq_requeue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl; + + if ((cl = q->tx_class) == NULL) { + kfree_skb(skb); + sch->stats.drops++; + return 0; + } + q->tx_class = NULL; + + if (cl->q->ops->requeue(skb, cl->q) == 1) { + sch->q.qlen++; + cbq_mark_toplevel(q, cl); + if (!cl->next_alive) + cbq_activate_class(cl); return 1; } + sch->stats.drops++; + cl->stats.drops++; return 0; } -static __inline__ void cbq_delay(struct cbq_sched_data *q, struct cbq_class *cl) +/* Overlimit actions */ + +/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */ + +static void cbq_ovl_classic(struct cbq_class *cl) { - long delay; + struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data; - delay = PSCHED_TDIFF(cl->undertime, q->now); - if (q->wd_expires == 0 || q->wd_expires - delay > 0) - q->wd_expires = delay; + if (!cl->delayed) { + psched_tdiff_t delay; + + delay = PSCHED_TDIFF(cl->undertime, q->now); + delay += cl->offtime; + + /* + Class goes to sleep, so that it will have no + chance to work avgidle. Let's forgive it 8) + + BTW cbq-2.0 has a crap in this + place, apparently they forgot to shift it by cl->ewma_log. 
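One step that the comments leave implicit can be spelled out, using only relations stated elsewhere in this file (see the cbq_update comment further down): cl->avgidle is the true average idle time scaled by 1/W, i.e. cl->avgidle == true_avgidle/W with W = 2^(-ewma_log). Therefore

	(1/W - 1)*(-true_avgidle) = ((1 - W)/W)*(-true_avgidle)
	                          = (1 - W)*(-cl->avgidle)
	                          = (-cl->avgidle) - ((-cl->avgidle) >> ewma_log),

which is exactly the shifted expression subtracted from the delay just below.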
+ */ + if (cl->avgidle < 0) + delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); + if (cl->avgidle < cl->minidle) + cl->avgidle = cl->minidle; + if (delay < 0) + delay = 0; + PSCHED_TADD2(q->now, delay, cl->undertime); + + if (q->wd_expires == 0 || q->wd_expires > delay) + q->wd_expires = delay; + cl->xstats.overactions++; + cl->delayed = 1; + } +} + +/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when + they go overlimit + */ + +static void cbq_ovl_rclassic(struct cbq_class *cl) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data; + + while (cl && cl->delayed) { + cl = cl->borrow; + if (cl->level > q->toplevel) + return; + } + + if (cl) + cbq_ovl_classic(cl); +} + +/* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */ + +static void cbq_ovl_delay(struct cbq_class *cl) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data; + + if (!cl->delayed) { + psched_tdiff_t delay; + unsigned long sched = jiffies; + + delay = PSCHED_TDIFF(cl->undertime, q->now); + delay += cl->offtime; + if (cl->avgidle < 0) + delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); + if (cl->avgidle < cl->minidle) + cl->avgidle = cl->minidle; + PSCHED_TADD2(q->now, delay, cl->undertime); + + if (delay > 0) { + sched += PSCHED_US2JIFFIE(delay) + cl->penalty; + cl->penalized = sched; + cl->cpriority = TC_CBQ_MAXPRIO; + q->pmask |= (1<<TC_CBQ_MAXPRIO); + if (del_timer(&q->delay_timer) && + (long)(q->delay_timer.expires - sched) > 0) + q->delay_timer.expires = sched; + add_timer(&q->delay_timer); + cl->delayed = 1; + cl->xstats.overactions++; + } + } +} + +/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */ + +static void cbq_ovl_lowprio(struct cbq_class *cl) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data; + + cl->penalized = jiffies + cl->penalty; + + if (cl->cpriority != cl->priority2) { + cl->cpriority = cl->priority2; + q->pmask |= (1<<cl->cpriority); + cl->xstats.overactions++; + } + cbq_ovl_classic(cl); +} + +/* TC_CBQ_OVL_DROP: penalize class by dropping */ + +static void cbq_ovl_drop(struct cbq_class *cl) +{ + if (cl->q->ops->drop) + if (cl->q->ops->drop(cl->q)) + cl->qdisc->q.qlen--; + cl->xstats.overactions++; + cbq_ovl_classic(cl); } static void cbq_watchdog(unsigned long arg) { struct Qdisc *sch = (struct Qdisc*)arg; + qdisc_wakeup(sch->dev); +} + +static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio) +{ + struct cbq_class *cl; + struct cbq_class *cl_prev = q->active[prio]; + unsigned long now = jiffies; + unsigned long sched = now; + + if (cl_prev == NULL) + return now; + + do { + cl = cl_prev->next_alive; + if ((long)(now - cl->penalized) > 0) { + cl_prev->next_alive = cl->next_alive; + cl->next_alive = NULL; + cl->cpriority = cl->priority; + cl->delayed = 0; + cbq_activate_class(cl); + + if (cl == q->active[prio]) { + q->active[prio] = cl_prev; + if (cl == q->active[prio]) { + q->active[prio] = NULL; + return 0; + } + } + + cl = cl_prev->next_alive; + } else if ((long)(sched - cl->penalized) > 0) + sched = cl->penalized; + } while ((cl_prev = cl) != q->active[prio]); + + return (long)(sched - now); +} + +static void cbq_undelay(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data; + long delay = 0; + unsigned pmask; + + pmask = q->pmask; + q->pmask = 0; + + while (pmask) { + int prio = ffz(~pmask); + long tmp; + + pmask &= ~(1<<prio); + + tmp = cbq_undelay_prio(q, prio); + if (tmp > 0) { + 
q->pmask |= 1<<prio; + if (tmp < delay || delay == 0) + delay = tmp; + } + } + + if (delay) { + q->delay_timer.expires = jiffies + delay; + add_timer(&q->delay_timer); + } - q->wd_timer.expires = 0; - q->wd_timer.function = NULL; qdisc_wakeup(sch->dev); } + +#ifdef CONFIG_NET_CLS_POLICE + +static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) +{ + int len = skb->len; + struct Qdisc *sch = child->parent; + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl = cbq_class_lookup(q, child->classid); + + if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) { + if (cl->q->enqueue(skb, cl->q) == 1) { + sch->q.qlen++; + sch->stats.packets++; + cl->stats.packets++; + sch->stats.bytes+=len; + cl->stats.bytes+=len; + cbq_mark_toplevel(q, cl); + if (!cl->next_alive) + cbq_activate_class(cl); + return 0; + } + sch->stats.drops++; + return 0; + } + + sch->stats.drops++; + return -1; +} +#endif + +static __inline__ void +cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) +{ + if (cl && q->toplevel >= cl->level) { + if (cl->q->q.qlen <= 1 || PSCHED_TLESS(q->now, cl->undertime)) + q->toplevel = TC_CBQ_MAXLEVEL; + else /* BUGGGG? if (cl != this) */ + q->toplevel = cl->level; + } +} + static __inline__ void cbq_update(struct cbq_sched_data *q) { - struct cbq_class *cl; + struct cbq_class *cl = q->tx_class; + int len = q->tx_len; + + q->tx_class = NULL; - for (cl = q->last_sent; cl; cl = cl->parent) { + for ( ; cl; cl = cl->share) { long avgidle = cl->avgidle; long idle; @@ -333,26 +687,17 @@ cbq_update(struct cbq_sched_data *q) idle = (now - last) - last_pktlen/rate */ - idle = PSCHED_TDIFF(q->now, cl->last) - - L2T(cl, q->last_sent_len); + idle = PSCHED_TDIFF(q->now, cl->last) - L2T(cl, len); /* true_avgidle := (1-W)*true_avgidle + W*idle, - where W=2^{-filter_log}. But cl->avgidle is scaled: + where W=2^{-ewma_log}. But cl->avgidle is scaled: cl->avgidle == true_avgidle/W, hence: */ - avgidle += idle - (avgidle>>cl->filter_log); + avgidle += idle - (avgidle>>cl->ewma_log); if (avgidle <= 0) { /* Overlimit or at-limit */ -#ifdef CBQ_NO_TRICKERY - avgidle = 0; -#else - if (avgidle < cl->minidle) - avgidle = cl->minidle; -#endif - - /* This line was missing in NS. */ cl->avgidle = avgidle; /* Calculate expected time, when this class @@ -362,29 +707,24 @@ cbq_update(struct cbq_sched_data *q) idle = (1/W - 1)*(-true_avgidle) or idle = (1 - W)*(-cl->avgidle); + */ + idle = (-avgidle) - ((-avgidle) >> cl->ewma_log); + /* That is not all. - We want to set undertime to the moment, when - the class is allowed to start next transmission i.e. - (undertime + next_pktlen/phys_bandwidth) - - now - next_pktlen/rate = idle - or - undertime = now + idle + next_pktlen/rate - - next_pktlen/phys_bandwidth - - We do not know next packet length, but can - estimate it with average packet length - or current packet_length. + To maintain rate allocated to class, + we add to undertime virtual clock, + necassry to complete transmitted packet. 
+ (len/phys_bandwidth has been already passed + to the moment of cbq_update) */ - idle = (-avgidle) - ((-avgidle) >> cl->filter_log); - idle += L2T(q, q->last_sent_len); - idle -= L2T(cl, q->last_sent_len); + idle -= L2T(&q->link, len); + idle += L2T(cl, len); + + PSCHED_AUDIT_TDIFF(idle); + PSCHED_TADD2(q->now, idle, cl->undertime); -#ifndef CBQ_NO_TRICKERY - /* Do not forget extra delay :-) */ - PSCHED_TADD(cl->undertime, cl->extradelay); -#endif } else { /* Underlimit */ @@ -393,60 +733,44 @@ cbq_update(struct cbq_sched_data *q) cl->avgidle = cl->maxidle; else cl->avgidle = avgidle; + } cl->last = q->now; } -#ifdef CBQ_TOPLEVEL_SHARING - cl = q->last_sent; - - if (q->borrowed && q->toplevel >= q->borrowed->level) { - if (cl->q->q.qlen <= 1 || PSCHED_TLESS(q->now, q->borrowed->undertime)) - q->toplevel = CBQ_MAXLEVEL; - else if (q->borrowed != cl) - q->toplevel = q->borrowed->level; - } -#endif - - q->last_sent = NULL; + cbq_update_toplevel(q, q->tx_borrowed); } -static __inline__ int +static __inline__ struct cbq_class * cbq_under_limit(struct cbq_class *cl) { struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data; struct cbq_class *this_cl = cl; - if (PSCHED_IS_PASTPERFECT(cl->undertime) || cl->parent == NULL) - return 1; + if (cl->tparent == NULL) + return cl; - if (PSCHED_TLESS(cl->undertime, q->now)) { - q->borrowed = cl; - return 1; + if (PSCHED_IS_PASTPERFECT(cl->undertime) || + PSCHED_TLESS(cl->undertime, q->now)) { + cl->delayed = 0; + return cl; } while (!PSCHED_IS_PASTPERFECT(cl->undertime) && PSCHED_TLESS(q->now, cl->undertime)) { - cl = cl->borrow; - if (cl == NULL -#ifdef CBQ_TOPLEVEL_SHARING - || cl->level > q->toplevel -#endif - ) { -#if 0 + if ((cl = cl->borrow) == NULL || cl->level > q->toplevel) { + this_cl->stats.overlimits++; this_cl->overlimit(this_cl); -#else - cbq_delay(q, this_cl); -#endif - return 0; + return NULL; } } - q->borrowed = cl; - return 1; + this_cl->xstats.borrows++; + cl->xstats.borrows++; + return cl; } static __inline__ struct sk_buff * -cbq_dequeue_prio(struct Qdisc *sch, int prio, int fallback) +cbq_dequeue_prio(struct Qdisc *sch, int prio) { struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; struct cbq_class *cl_tail, *cl_prev, *cl; @@ -461,23 +785,14 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio, int fallback) /* Start round */ do { + struct cbq_class *borrow; + /* Class is empty */ - if (cl->q->q.qlen == 0) + if (cl->q->q.qlen == 0) goto skip_class; - - if (fallback) { - /* Fallback pass: all classes are overlimit; - we send from the first class that is allowed - to borrow. - */ - if (cl->borrow == NULL) - goto skip_class; - } else { - /* Normal pass: check that class is under limit */ - if (!cbq_under_limit(cl)) - goto skip_class; - } + if ((borrow = cbq_under_limit(cl)) == NULL) + goto skip_class; if (cl->deficit <= 0) { /* Class exhausted its allotment per this @@ -496,8 +811,9 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio, int fallback) goto skip_class; cl->deficit -= skb->len; - q->last_sent = cl; - q->last_sent_len = skb->len; + q->tx_class = cl; + q->tx_borrowed = borrow; + q->tx_len = skb->len; if (cl->deficit <= 0) { q->active[prio] = cl; @@ -509,10 +825,12 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio, int fallback) skip_class: cl->deficit = 0; - if (cl->q->q.qlen == 0) { - /* Class is empty, declare it dead */ + if (cl->q->q.qlen == 0 || prio != cl->cpriority) { + /* Class is empty or penalized. + Unlink it from active chain. 
+ */ cl_prev->next_alive = cl->next_alive; - cl->awake = 0; + cl->next_alive = NULL; /* Did cl_tail point to it? */ if (cl == cl_tail) { @@ -524,9 +842,17 @@ skip_class: /* Kill the band! */ q->active[prio] = NULL; q->activemask &= ~(1<<prio); + if (cl->q->q.qlen) + cbq_activate_class(cl); return NULL; } + + q->active[prio] = cl_tail; } + if (cl->q->q.qlen) + cbq_activate_class(cl); + + cl = cl_prev; } next_class: @@ -537,22 +863,22 @@ next_class: } while (deficit); q->active[prio] = cl_prev; - + return NULL; } static __inline__ struct sk_buff * -cbq_dequeue_1(struct Qdisc *sch, int fallback) +cbq_dequeue_1(struct Qdisc *sch) { struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; struct sk_buff *skb; unsigned activemask; - activemask = q->activemask; + activemask = q->activemask&0xFF; while (activemask) { int prio = ffz(~activemask); activemask &= ~(1<<prio); - skb = cbq_dequeue_prio(sch, prio, fallback); + skb = cbq_dequeue_prio(sch, prio); if (skb) return skb; } @@ -564,40 +890,73 @@ cbq_dequeue(struct Qdisc *sch) { struct sk_buff *skb; struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + psched_time_t now; - PSCHED_GET_TIME(q->now); + PSCHED_GET_TIME(now); - if (q->last_sent) + if (q->tx_class) { + /* Time integrator. We calculate EOS time + by adding expected packet transmittion time. + If real time is greater, we warp artificial clock, + so that: + + cbq_time = max(real_time, work); + */ + PSCHED_TADD(q->now, L2T(&q->link, q->tx_len)); + if (PSCHED_TLESS(q->now, now)) + q->now = now; cbq_update(q); + } else if (PSCHED_TLESS(q->now, now)) + q->now = now; - q->wd_expires = 0; + for (;;) { + q->wd_expires = 0; - skb = cbq_dequeue_1(sch, 0); - if (skb) - return skb; + skb = cbq_dequeue_1(sch); + if (skb) { + sch->q.qlen--; + return skb; + } - /* All the classes are overlimit. - Search for overlimit class, which is allowed to borrow - and use it as fallback case. - */ + /* All the classes are overlimit. -#ifdef CBQ_TOPLEVEL_SHARING - q->toplevel = CBQ_MAXLEVEL; -#endif + It is possible, if: - skb = cbq_dequeue_1(sch, 1); - if (skb) - return skb; + 1. Scheduler is empty. + 2. Toplevel cutoff inhibited borrowing. + 3. Root class is overlimit. + + Reset 2d and 3d conditions and retry. + + Note, that NS and cbq-2.0 are buggy, peeking + an arbitrary class is appropriate for ancestor-only + sharing, but not for toplevel algorithm. + + Our version is better, but slower, because requires + two passes, but it is inavoidable with top-level sharing. + */ + + if (q->toplevel == TC_CBQ_MAXLEVEL && + PSCHED_IS_PASTPERFECT(q->link.undertime)) + break; + + q->toplevel = TC_CBQ_MAXLEVEL; + PSCHED_SET_PASTPERFECT(q->link.undertime); + } /* No packets in scheduler or nobody wants to give them to us :-( Sigh... start watchdog timer in the last case. 
*/ - if (sch->q.qlen && q->wd_expires) { - if (q->wd_timer.function) + if (sch->q.qlen) { + sch->stats.overlimits++; + if (q->wd_expires && !sch->dev->tbusy) { + long delay = PSCHED_US2JIFFIE(q->wd_expires); del_timer(&q->wd_timer); - q->wd_timer.function = cbq_watchdog; - q->wd_timer.expires = jiffies + PSCHED_US2JIFFIE(q->wd_expires); - add_timer(&q->wd_timer); + if (delay <= 0) + delay = 1; + q->wd_timer.expires = jiffies + delay; + add_timer(&q->wd_timer); + } } return NULL; } @@ -606,234 +965,974 @@ cbq_dequeue(struct Qdisc *sch) static void cbq_adjust_levels(struct cbq_class *this) { - struct cbq_class *cl; + if (this == NULL) + return; - for (cl = this->parent; cl; cl = cl->parent) { - if (cl->level > this->level) - return; - cl->level = this->level + 1; - this = cl; - } + do { + int level = 0; + struct cbq_class *cl; + + if ((cl = this->children) != NULL) { + do { + if (cl->level > level) + level = cl->level; + } while ((cl = cl->sibling) != this->children); + } + this->level = level+1; + } while ((this = this->tparent) != NULL); } static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) { struct cbq_class *cl; + unsigned h; if (q->quanta[prio] == 0) return; - for (cl = q->classes[prio]; cl; cl = cl->next) { - if (cl->rquantum) - cl->quantum = (cl->rquantum*q->mtu*q->nclasses[prio])/ - q->quanta[prio]; + for (h=0; h<16; h++) { + for (cl = q->classes[h]; cl; cl = cl->next) { + /* BUGGGG... Beware! This expression suffer of + arithmetic overflows! + */ + if (cl->priority == prio) { + cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ + q->quanta[prio]; + } + if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) { + printk("Damn! %08x cl->quantum==%ld\n", cl->classid, cl->quantum); + cl->quantum = 1; + } + } } } -static __inline__ int cbq_unlink_class(struct cbq_class *this) +static void cbq_sync_defmap(struct cbq_class *cl) { - struct cbq_class *cl, **clp; - struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data; + struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data; + struct cbq_class *split = cl->split; + unsigned h; + int i; - for (clp = &q->classes[this->priority]; (cl = *clp) != NULL; - clp = &cl->next) { - if (cl == this) { - *clp = cl->next; - return 0; - } - } - return -ENOENT; -} + if (split == NULL) + return; -static int cbq_prune(struct cbq_class *this) -{ - struct cbq_class *cl; - int prio = this->priority; - struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data; + for (i=0; i<=TC_PRIO_MAX; i++) { + if (split->defaults[i] == cl && !(cl->defmap&(1<<i))) + split->defaults[i] = NULL; + } - qdisc_reset(this->q); + for (i=0; i<=TC_PRIO_MAX; i++) { + int level = split->level; - if (cbq_unlink_class(this)) - return -ENOENT; + if (split->defaults[i]) + continue; - if (this->awake) { - struct cbq_class *cl_prev = q->active[prio]; - do { - cl = cl_prev->next_alive; - if (cl == this) { - cl_prev->next_alive = cl->next_alive; + for (h=0; h<16; h++) { + struct cbq_class *c; - if (cl == q->active[prio]) { - q->active[prio] = cl; - if (cl == q->active[prio]) { - q->active[prio] = NULL; - q->activemask &= ~(1<<prio); - break; - } + for (c = q->classes[h]; c; c = c->next) { + if (c->split == split && c->level < level && + c->defmap&(1<<i)) { + split->defaults[i] = c; + level = c->level; } - - cl = cl->next_alive; - cl->deficit += cl->quantum; - break; } - } while ((cl_prev = cl) != q->active[prio]); + } } +} - --q->nclasses[prio]; - if (this->rquantum) { - q->quanta[prio] -= this->rquantum; - cbq_normalize_quanta(q, prio); 
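
The cbq_normalize_quanta() rewrite above computes cl->quantum as weight*allot*nclasses/quanta, and the patch itself flags the expression as suffering from arithmetic overflow. The stand-alone sketch below is user-space C with illustrative values only (not code from this patch); it shows how quickly the 32-bit product wraps and how a 64-bit intermediate sidesteps the problem.

/* Illustration only: same formula as cbq_normalize_quanta(), once with
 * 32-bit intermediates (wraps) and once widened to 64 bits.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t quantum32(uint32_t weight, uint32_t allot,
                          uint32_t nclasses, uint32_t quanta)
{
        /* product wraps once weight*allot*nclasses exceeds 2^32 */
        return weight * allot * nclasses / quanta;
}

static uint32_t quantum64(uint32_t weight, uint32_t allot,
                          uint32_t nclasses, uint32_t quanta)
{
        return (uint32_t)((uint64_t)weight * allot * nclasses / quanta);
}

int main(void)
{
        /* ten classes of equal weight sharing a 1514-byte allotment */
        uint32_t w = 1000000, allot = 1514, n = 10, q = 10 * w;

        printf("32-bit: %u\n", quantum32(w, allot, n, q)); /* wrapped, wrong */
        printf("64-bit: %u\n", quantum64(w, allot, n, q)); /* 1514, correct */
        return 0;
}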
+static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask) +{ + struct cbq_class *split = NULL; + + if (splitid == 0) { + if ((split = cl->split) == NULL) + return; + splitid = split->classid; } - if (q->fallback_class == this) - q->fallback_class = NULL; + if (split == NULL || split->classid != splitid) { + for (split = cl->tparent; split; split = split->tparent) + if (split->classid == splitid) + break; + } - this->parent = NULL; - this->borrow = NULL; - this->root = this; - this->qdisc = NULL; - return 0; + if (split == NULL) + return; + + if (cl->split != split) { + cl->defmap = 0; + cbq_sync_defmap(cl); + cl->split = split; + cl->defmap = def&mask; + } else + cl->defmap = (cl->defmap&~mask)|(def&mask); + + cbq_sync_defmap(cl); } -static int cbq_graft(struct cbq_class *this, struct cbq_class *parent) +static void cbq_unlink_class(struct cbq_class *this) { struct cbq_class *cl, **clp; - int prio = this->priority; struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data; - qdisc_reset(this->q); + for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) { + if (cl == this) { + *clp = cl->next; + cl->next = NULL; + break; + } + } + if (this->tparent) { + clp=&this->sibling; + cl = *clp; + do { + if (cl == this) { + *clp = cl->sibling; + break; + } + clp = &cl->sibling; + } while ((cl = *clp) != this->sibling); - for (clp = &q->classes[prio]; (cl = *clp) != NULL; clp = &cl->next) { - if (cl == this) - return -EBUSY; + if (this->tparent->children == this) { + this->tparent->children = this->sibling; + if (this->sibling == this) + this->tparent->children = NULL; + } + } else { + BUG_TRAP(this->sibling == this); } +} - cl->next = NULL; - *clp = cl; - - cl->parent = parent; - cl->borrow = parent; - cl->root = parent ? 
parent->root : cl; +static void cbq_link_class(struct cbq_class *this) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data; + unsigned h = cbq_hash(this->classid); + struct cbq_class *parent = this->tparent; + + this->sibling = this; + this->next = q->classes[h]; + q->classes[h] = this; + + if (parent == NULL) + return; - ++q->nclasses[prio]; - if (this->rquantum) { - q->quanta[prio] += this->rquantum; - cbq_normalize_quanta(q, prio); + if (parent->children == NULL) { + parent->children = this; + } else { + this->sibling = parent->children->sibling; + parent->children->sibling = this; } - - cbq_adjust_levels(this); +} + +static int cbq_drop(struct Qdisc* sch) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl; + int h; + for (h = TC_CBQ_MAXPRIO; h >= 0; h++) { + for (cl = q->classes[h]; cl; cl = cl->next) { + if (cl->q->ops->drop && cl->q->ops->drop(cl->q)) + return 1; + } + } return 0; } - static void cbq_reset(struct Qdisc* sch) { struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; struct cbq_class *cl; int prio; + unsigned h; q->activemask = 0; - q->last_sent = NULL; - if (q->wd_timer.function) { - del_timer(&q->wd_timer); - q->wd_timer.expires = 0; - q->wd_timer.function = NULL; - } -#ifdef CBQ_TOPLEVEL_SHARING - q->toplevel = CBQ_MAXLEVEL; -#endif - - for (prio = 0; prio < CBQ_MAXPRIO; prio++) { + q->pmask = 0; + q->tx_class = NULL; + q->tx_borrowed = NULL; + del_timer(&q->wd_timer); + del_timer(&q->delay_timer); + q->toplevel = TC_CBQ_MAXLEVEL; + + for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++) q->active[prio] = NULL; - - for (cl = q->classes[prio]; cl; cl = cl->next) { + + for (h = 0; h < 16; h++) { + for (cl = q->classes[h]; cl; cl = cl->next) { qdisc_reset(cl->q); cl->next_alive = NULL; PSCHED_SET_PASTPERFECT(cl->undertime); cl->avgidle = 0; cl->deficit = 0; - cl->awake = 0; + cl->cpriority = cl->priority; } } + sch->q.qlen = 0; +} + + +static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss) +{ + if (lss->change&TCF_CBQ_LSS_FLAGS) { + cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; + cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? 
NULL : cl->tparent; + } + if (lss->change&TCF_CBQ_LSS_EWMA) + cl->ewma_log = lss->ewma_log; + if (lss->change&TCF_CBQ_LSS_AVPKT) + cl->avpkt = lss->avpkt; + if (lss->change&TCF_CBQ_LSS_MINIDLE) + cl->minidle = -(long)lss->minidle; + if (lss->change&TCF_CBQ_LSS_MAXIDLE) + cl->maxidle = lss->maxidle; + if (lss->change&TCF_CBQ_LSS_OFFTIME) + cl->offtime = lss->offtime; + return 0; +} + +static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl) +{ + q->nclasses[cl->priority]--; + q->quanta[cl->priority] -= cl->weight; + cbq_normalize_quanta(q, cl->priority); +} + +static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl) +{ + q->nclasses[cl->priority]++; + q->quanta[cl->priority] += cl->weight; + cbq_normalize_quanta(q, cl->priority); +} + +static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data; + + if (wrr->allot) + cl->allot = wrr->allot; + if (wrr->weight) + cl->weight = wrr->weight; + if (wrr->priority) { + cl->priority = wrr->priority-1; + cl->cpriority = cl->priority; + if (cl->priority >= cl->priority2) + cl->priority2 = TC_CBQ_MAXPRIO-1; + } + + cbq_addprio(q, cl); + return 0; +} + +static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl) +{ + switch (ovl->strategy) { + case TC_CBQ_OVL_CLASSIC: + cl->overlimit = cbq_ovl_classic; + break; + case TC_CBQ_OVL_DELAY: + cl->overlimit = cbq_ovl_delay; + break; + case TC_CBQ_OVL_LOWPRIO: + if (ovl->priority2-1 >= TC_CBQ_MAXPRIO || + ovl->priority2-1 <= cl->priority) + return -EINVAL; + cl->priority2 = ovl->priority2-1; + cl->overlimit = cbq_ovl_lowprio; + break; + case TC_CBQ_OVL_DROP: + cl->overlimit = cbq_ovl_drop; + break; + case TC_CBQ_OVL_RCLASSIC: + cl->overlimit = cbq_ovl_rclassic; + break; + default: + return -EINVAL; + } + cl->penalty = (ovl->penalty*HZ)/1000; + return 0; +} + +#ifdef CONFIG_NET_CLS_POLICE +static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p) +{ + cl->police = p->police; + + if (!(cl->q->flags&TCQ_F_DEFAULT)) { + if (p->police == TC_POLICE_RECLASSIFY) + cl->q->reshape_fail = cbq_reshape_fail; + else + cl->q->reshape_fail = NULL; + } + return 0; +} +#endif + +static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt) +{ + cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange); + return 0; +} + +static int cbq_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data; + struct rtattr *tb[TCA_CBQ_MAX]; + struct tc_ratespec *r; + + if (rtattr_parse(tb, TCA_CBQ_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0 || + tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || + RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) + return -EINVAL; + + if (tb[TCA_CBQ_LSSOPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) + return -EINVAL; + + r = RTA_DATA(tb[TCA_CBQ_RATE-1]); + + MOD_INC_USE_COUNT; + if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL) { + MOD_DEC_USE_COUNT; + return -EINVAL; + } + + q->link.sibling = &q->link; + q->link.classid = sch->handle; + q->link.qdisc = sch; + if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) + q->link.q = &noop_qdisc; + + q->link.priority = TC_CBQ_MAXPRIO-1; + q->link.priority2 = TC_CBQ_MAXPRIO-1; + q->link.cpriority = TC_CBQ_MAXPRIO-1; + q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC; + q->link.overlimit = cbq_ovl_classic; + q->link.allot = psched_mtu(sch->dev); + q->link.quantum = q->link.allot; + 
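
cbq_set_overlimit() above turns the user-supplied TC_CBQ_OVL_* strategy into a per-class function pointer and rescales the penalty from milliseconds to jiffies. The fragment below is only a user-space sketch of that dispatch idea; the handler bodies, the struct layout and HZ=100 are stand-ins, not taken from the patch.

/* Sketch of the strategy -> function-pointer dispatch used per class. */
#include <stdio.h>

enum { OVL_CLASSIC, OVL_DELAY };

struct class {
        void (*overlimit)(struct class *);
        int penalty_jiffies;
};

static void ovl_classic(struct class *c) { (void)c; printf("classic\n"); }
static void ovl_delay(struct class *c)   { (void)c; printf("delay\n"); }

#define HZ 100  /* assumed tick rate for the ms -> jiffies conversion */

static int set_overlimit(struct class *c, int strategy, int penalty_ms)
{
        switch (strategy) {
        case OVL_CLASSIC: c->overlimit = ovl_classic; break;
        case OVL_DELAY:   c->overlimit = ovl_delay;   break;
        default:          return -1;    /* unknown strategy, reject */
        }
        /* same scaling as the patch: user units are milliseconds */
        c->penalty_jiffies = penalty_ms * HZ / 1000;
        return 0;
}

int main(void)
{
        struct class c;

        if (set_overlimit(&c, OVL_DELAY, 30) == 0)
                c.overlimit(&c);                          /* prints "delay" */
        printf("penalty = %d jiffies\n", c.penalty_jiffies); /* 3 */
        return 0;
}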
q->link.weight = q->link.R_tab->rate.rate; + + q->link.ewma_log = TC_CBQ_DEF_EWMA; + q->link.avpkt = q->link.allot/2; + q->link.minidle = -0x7FFFFFFF; + + init_timer(&q->wd_timer); + q->wd_timer.data = (unsigned long)sch; + q->wd_timer.function = cbq_watchdog; + init_timer(&q->delay_timer); + q->delay_timer.data = (unsigned long)sch; + q->delay_timer.function = cbq_undelay; + q->toplevel = TC_CBQ_MAXLEVEL; + + cbq_link_class(&q->link); + + if (tb[TCA_CBQ_LSSOPT-1]) + cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + + cbq_addprio(q, &q->link); + return 0; +} + +#ifdef CONFIG_RTNETLINK + +static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + + RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_lssopt opt; + + opt.flags = 0; + if (cl->borrow == NULL) + opt.flags |= TCF_CBQ_LSS_BOUNDED; + if (cl->share == NULL) + opt.flags |= TCF_CBQ_LSS_ISOLATED; + opt.ewma_log = cl->ewma_log; + opt.level = cl->level; + opt.avpkt = cl->avpkt; + opt.maxidle = cl->maxidle; + opt.minidle = (u32)(-cl->minidle); + opt.offtime = cl->offtime; + opt.change = ~0; + RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_wrropt opt; + + opt.flags = 0; + opt.allot = cl->allot; + opt.priority = cl->priority+1; + opt.cpriority = cl->cpriority+1; + opt.weight = cl->weight; + RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_ovl opt; + + opt.strategy = cl->ovl_strategy; + opt.priority2 = cl->priority2+1; + opt.penalty = (cl->penalty*1000)/HZ; + RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_fopt opt; + + if (cl->split || cl->defmap) { + opt.split = cl->split ? 
cl->split->classid : 0; + opt.defmap = cl->defmap; + opt.defchange = ~0; + RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt); + } + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +#ifdef CONFIG_NET_CLS_POLICE +static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_police opt; + + if (cl->police) { + opt.police = cl->police; + RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); + } + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif + +static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) +{ + if (cbq_dump_lss(skb, cl) < 0 || + cbq_dump_rate(skb, cl) < 0 || + cbq_dump_wrr(skb, cl) < 0 || + cbq_dump_ovl(skb, cl) < 0 || +#ifdef CONFIG_NET_CLS_POLICE + cbq_dump_police(skb, cl) < 0 || +#endif + cbq_dump_fopt(skb, cl) < 0) + return -1; + return 0; +} + +static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data; + unsigned char *b = skb->tail; + struct rtattr *rta; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + if (cbq_dump_attr(skb, &q->link) < 0) + goto rtattr_failure; + rta->rta_len = skb->tail - b; + q->link.xstats.avgidle = q->link.avgidle; + RTA_PUT(skb, TCA_XSTATS, sizeof(q->link.xstats), &q->link.xstats); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int +cbq_dump_class(struct Qdisc *sch, unsigned long arg, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data; + struct cbq_class *cl = (struct cbq_class*)arg; + unsigned char *b = skb->tail; + struct rtattr *rta; + + if (cl->tparent) + tcm->tcm_parent = cl->tparent->classid; + else + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle = cl->classid; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + if (cbq_dump_attr(skb, cl) < 0) + goto rtattr_failure; + rta->rta_len = skb->tail - b; + cl->stats.qlen = cl->q->q.qlen; + RTA_PUT(skb, TCA_STATS, sizeof(cl->stats), &cl->stats); + cl->xstats.avgidle = cl->avgidle; + cl->xstats.undertime = 0; + if (!PSCHED_IS_PASTPERFECT(cl->undertime)) + cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now); + RTA_PUT(skb, TCA_XSTATS, sizeof(cl->xstats), &cl->xstats); + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +#endif + +static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct cbq_class *cl = (struct cbq_class*)arg; + + if (cl) { + if (new == NULL) { + if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)) == NULL) + return -ENOBUFS; + } else { +#ifdef CONFIG_NET_CLS_POLICE + if (cl->police == TC_POLICE_RECLASSIFY) + new->reshape_fail = cbq_reshape_fail; +#endif + } + if ((*old = xchg(&cl->q, new)) != NULL) + qdisc_reset(*old); + + return 0; + } + return -ENOENT; +} + +static unsigned long cbq_get(struct Qdisc *sch, u32 classid) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl = cbq_class_lookup(q, classid); + + if (cl) { + cl->refcnt++; + return (unsigned long)cl; + } + return 0; +} + +static void cbq_destroy_filters(struct cbq_class *cl) +{ + struct tcf_proto *tp; + + while ((tp = cl->filter_list) != NULL) { + cl->filter_list = tp->next; + tp->ops->destroy(tp); + } +} + +static void cbq_destroy_class(struct cbq_class *cl) +{ + cbq_destroy_filters(cl); + qdisc_destroy(cl->q); + 
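
Each cbq_dump_*() helper above follows the same rtnetlink pattern: remember skb->tail, append type/length/value attributes with RTA_PUT, and on the rtattr_failure path trim the buffer back to the remembered tail. The miniature TLV writer below mimics that rollback discipline in plain user-space C; the buffer layout and names are illustrative, not the kernel's struct rtattr.

/* Toy TLV append with the same save-tail / trim-on-failure shape. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct buf { unsigned char data[64]; size_t len; };

static int tlv_put(struct buf *b, uint16_t type, const void *v, uint16_t vlen)
{
        if (b->len + 4 + vlen > sizeof(b->data))
                return -1;                       /* caller rolls back */
        memcpy(b->data + b->len, &type, 2);
        memcpy(b->data + b->len + 2, &vlen, 2);
        memcpy(b->data + b->len + 4, v, vlen);
        b->len += 4 + vlen;
        return 0;
}

int main(void)
{
        struct buf b = { .len = 0 };
        size_t mark = b.len;                     /* like "b = skb->tail"      */
        uint32_t rate = 125000;

        if (tlv_put(&b, 1, &rate, sizeof(rate)) < 0)
                b.len = mark;                    /* like skb_trim() on failure */
        printf("dumped %zu bytes\n", b.len);
        return 0;
}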
qdisc_put_rtab(cl->R_tab); +#ifdef CONFIG_NET_ESTIMATOR + qdisc_kill_estimator(&cl->stats); +#endif } static void cbq_destroy(struct Qdisc* sch) { struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; - struct cbq_class *cl, **clp; - int prio; + struct cbq_class *cl; + unsigned h; + + for (h = 0; h < 16; h++) { + for (cl = q->classes[h]; cl; cl = cl->next) + cbq_destroy_filters(cl); + } + + for (h = 0; h < 16; h++) { + for (cl = q->classes[h]; cl; cl = cl->next) + if (cl != &q->link) + cbq_destroy_class(cl); + } + + qdisc_put_rtab(q->link.R_tab); +} + +static void cbq_put(struct Qdisc *q, unsigned long arg) +{ + struct cbq_class *cl = (struct cbq_class*)arg; + + if (--cl->refcnt == 0) + cbq_destroy_class(cl); + return; +} + +static int +cbq_change(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca, + unsigned long *arg) +{ + int err; + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl = (struct cbq_class*)*arg; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_CBQ_MAX]; + struct cbq_class *parent; + struct qdisc_rate_table *rtab = NULL; + + if (opt==NULL || + rtattr_parse(tb, TCA_CBQ_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt))) + return -EINVAL; + + if (tb[TCA_CBQ_OVL_STRATEGY-1] && + RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl)) + return -EINVAL; + + if (tb[TCA_CBQ_FOPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt)) + return -EINVAL; + + if (tb[TCA_CBQ_RATE-1] && + RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) + return -EINVAL; + + if (tb[TCA_CBQ_LSSOPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) + return -EINVAL; + + if (tb[TCA_CBQ_WRROPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt)) + return -EINVAL; + +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_CBQ_POLICE-1] && + RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police)) + return -EINVAL; +#endif + + if (cl) { + /* Check parent */ + if (parentid) { + if (cl->tparent && cl->tparent->classid != parentid) + return -EINVAL; + if (!cl->tparent && parentid != TC_H_ROOT) + return -EINVAL; + } + + if (tb[TCA_CBQ_RATE-1]) { + rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); + if (rtab == NULL) + return -EINVAL; + } + + /* Change class parameters */ + start_bh_atomic(); + + if (cl->next_alive != NULL) + cbq_deactivate_class(cl); + + if (rtab) { + rtab = xchg(&cl->R_tab, rtab); + qdisc_put_rtab(rtab); + } + + if (tb[TCA_CBQ_LSSOPT-1]) + cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + + if (tb[TCA_CBQ_WRROPT-1]) { + cbq_rmprio(q, cl); + cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); + } + + if (tb[TCA_CBQ_OVL_STRATEGY-1]) + cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); + +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_CBQ_POLICE-1]) + cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); +#endif + + if (tb[TCA_CBQ_FOPT-1]) + cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); + + if (cl->q->q.qlen) + cbq_activate_class(cl); + + end_bh_atomic(); + +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) { + qdisc_kill_estimator(&cl->stats); + qdisc_new_estimator(&cl->stats, tca[TCA_RATE-1]); + } +#endif + return 0; + } + + if (parentid == TC_H_ROOT) + return -EINVAL; + + if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || + tb[TCA_CBQ_LSSOPT-1] == NULL) + return -EINVAL; + + rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); + if (rtab == NULL) + return -EINVAL; - for (prio = 0; prio < 
CBQ_MAXPRIO; prio++) { - struct cbq_class *cl_head = q->classes[prio]; - - for (clp = &cl_head; (cl=*clp) != NULL; clp = &cl->next) { - qdisc_destroy(cl->q); - kfree(cl); + if (classid) { + err = -EINVAL; + if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid)) + goto failure; + } else { + int i; + classid = TC_H_MAKE(sch->handle,0x8000); + + for (i=0; i<0x8000; i++) { + if (++q->hgenerator >= 0x8000) + q->hgenerator = 1; + if (cbq_class_lookup(q, classid|q->hgenerator) == NULL) + break; } + err = -ENOSR; + if (i >= 0x8000) + goto failure; + classid = classid|q->hgenerator; } + + parent = &q->link; + if (parentid) { + parent = cbq_class_lookup(q, parentid); + err = -EINVAL; + if (parent == NULL) + goto failure; + } + + err = -ENOBUFS; + cl = kmalloc(sizeof(*cl), GFP_KERNEL); + if (cl == NULL) + goto failure; + memset(cl, 0, sizeof(*cl)); + cl->R_tab = rtab; + rtab = NULL; + cl->refcnt = 1; + if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) + cl->q = &noop_qdisc; + cl->classid = classid; + cl->tparent = parent; + cl->qdisc = sch; + cl->allot = parent->allot; + cl->quantum = cl->allot; + cl->weight = cl->R_tab->rate.rate; + + start_bh_atomic(); + cbq_link_class(cl); + cl->borrow = cl->tparent; + if (cl->tparent != &q->link) + cl->share = cl->tparent; + cl->minidle = -0x7FFFFFFF; + cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); + if (cl->ewma_log==0) + cl->ewma_log = q->link.ewma_log; + if (cl->maxidle==0) + cl->maxidle = q->link.maxidle; + if (cl->avpkt==0) + cl->avpkt = q->link.avpkt; + cl->overlimit = cbq_ovl_classic; + if (tb[TCA_CBQ_OVL_STRATEGY-1]) + cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_CBQ_POLICE-1]) + cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); +#endif + if (tb[TCA_CBQ_FOPT-1]) + cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); + cbq_adjust_levels(parent); + end_bh_atomic(); + +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) + qdisc_new_estimator(&cl->stats, tca[TCA_RATE-1]); +#endif + + *arg = (unsigned long)cl; + return 0; + +failure: + qdisc_put_rtab(rtab); + return err; } -static int cbq_control(struct Qdisc *sch, void *arg) +static int cbq_delete(struct Qdisc *sch, unsigned long arg) { - struct cbq_sched_data *q; + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl = (struct cbq_class*)arg; + + if (cl->filters || cl->children || cl == &q->link) + return -EBUSY; + + start_bh_atomic(); + + if (cl->next_alive) + cbq_deactivate_class(cl); + + if (q->tx_class == cl) + q->tx_class = cl->borrow; + if (q->tx_borrowed == cl) + q->tx_borrowed = q->tx_class; - q = (struct cbq_sched_data *)sch->data; + cbq_unlink_class(cl); + cbq_adjust_levels(cl->tparent); + cl->defmap = 0; + cbq_sync_defmap(cl); - /* Do attachment here. It is the last thing to do. 
*/ + cbq_rmprio(q, cl); - return -EINVAL; + if (--cl->refcnt == 0) + cbq_destroy_class(cl); + + end_bh_atomic(); + + return 0; } -static int cbq_init(struct Qdisc *sch, void *arg) +static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg) { - struct cbq_sched_data *q; - struct cbqctl *ctl = (struct cbqctl*)arg; + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl = (struct cbq_class *)arg; - q = (struct cbq_sched_data *)sch->data; - init_timer(&q->wd_timer); - q->wd_timer.data = (unsigned long)sch; -#ifdef CBQ_TOPLEVEL_SHARING - q->toplevel = CBQ_MAXLEVEL; -#endif + if (cl == NULL) + cl = &q->link; + + return &cl->filter_list; +} + +static unsigned long cbq_bind_filter(struct Qdisc *sch, u32 classid) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl = cbq_class_lookup(q, classid); + if (cl) { + cl->filters++; + return (unsigned long)cl; + } return 0; } +static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_class *cl = (struct cbq_class*)arg; + + cl->filters--; +} + +static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + unsigned h; + + if (arg->stop) + return; + + for (h = 0; h < 16; h++) { + struct cbq_class *cl; + + for (cl = q->classes[h]; cl; cl = cl->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, (unsigned long)cl, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } + } +} -struct Qdisc_ops cbq_ops = +static struct Qdisc_class_ops cbq_class_ops = +{ + cbq_graft, + cbq_get, + cbq_put, + cbq_change, + cbq_delete, + cbq_walk, + + cbq_find_tcf, + cbq_bind_filter, + cbq_unbind_filter, + +#ifdef CONFIG_RTNETLINK + cbq_dump_class, +#endif +}; + +struct Qdisc_ops cbq_qdisc_ops = { NULL, + &cbq_class_ops, "cbq", - 0, sizeof(struct cbq_sched_data), + cbq_enqueue, cbq_dequeue, + cbq_requeue, + cbq_drop, + + cbq_init, cbq_reset, cbq_destroy, - cbq_init, - cbq_control, + +#ifdef CONFIG_RTNETLINK + cbq_dump, +#endif }; #ifdef MODULE int init_module(void) { - int err; - - /* Load once and never free it. */ - MOD_INC_USE_COUNT; - - err = register_qdisc(&cbq_ops); - if (err) - MOD_DEC_USE_COUNT; - return err; + return register_qdisc(&cbq_qdisc_ops); } void cleanup_module(void) { + unregister_qdisc(&cbq_qdisc_ops); } #endif diff --git a/net/sched/sch_csz.c b/net/sched/sch_csz.c index 5e10ac097..c21d8ac43 100644 --- a/net/sched/sch_csz.c +++ b/net/sched/sch_csz.c @@ -10,6 +10,8 @@ * */ +#include <linux/config.h> +#include <linux/module.h> #include <asm/uaccess.h> #include <asm/system.h> #include <asm/bitops.h> @@ -48,16 +50,16 @@ but it has pretty poor delay characteristics. Round-robin scheduling and link-sharing goals apparently contradict to minimization of network delay and jitter. - Moreover, correct handling of predicted flows seems to be + Moreover, correct handling of predictive flows seems to be impossible in CBQ. CSZ presents more precise but less flexible and less efficient approach. As I understand, the main idea is to create WFQ flows for each guaranteed service and to allocate the rest of bandwith to dummy flow-0. 
Flow-0 comprises - the predicted services and the best effort traffic; + the predictive services and the best effort traffic; it is handled by a priority scheduler with the highest - priority band allocated for predicted services, and the rest --- + priority band allocated for predictive services, and the rest --- to the best effort packets. Note, that in CSZ flows are NOT limited to their bandwidth. @@ -67,14 +69,16 @@ will introduce undesired delays and raise jitter. At the moment CSZ is the only scheduler that provides - real guaranteed service. Another schemes (including CBQ) + true guaranteed service. Another schemes (including CBQ) do not provide guaranteed delay and randomize jitter. There exists the statement (Sally Floyd), that delay can be estimated by a IntServ compliant formulae. This result is true formally, but it is wrong in principle. - At first, it ignores delays introduced by link sharing. - And the second (and main) it limits bandwidth, - it is fatal flaw. + It takes into account only round-robin delays, + ignoring delays introduced by link sharing i.e. overlimiting. + Note, that temporary overlimits are inevitable because + real links are not ideal, and true algorithm must take it + into account. ALGORITHM. @@ -204,9 +208,8 @@ /* This number is arbitrary */ -#define CSZ_MAX_GUARANTEED 16 - -#define CSZ_FLOW_ID(skb) (CSZ_MAX_GUARANTEED) +#define CSZ_GUARANTEED 16 +#define CSZ_FLOWS (CSZ_GUARANTEED+4) struct csz_head { @@ -224,12 +227,15 @@ struct csz_flow struct csz_head *fprev; /* Parameters */ - unsigned long rate; /* Flow rate. Fixed point is at rate_log */ - unsigned long *L_tab; /* Lookup table for L/(B*r_a) values */ - unsigned long max_bytes; /* Maximal length of queue */ + struct tc_ratespec rate; + struct tc_ratespec slice; + u32 *L_tab; /* Lookup table for L/(B*r_a) values */ + unsigned long limit; /* Maximal length of queue */ #ifdef CSZ_PLUS_TBF - unsigned long depth; /* Depth of token bucket, normalized + struct tc_ratespec peakrate; + __u32 buffer; /* Depth of token bucket, normalized as L/(B*r_a) */ + __u32 mtu; #endif /* Variables */ @@ -246,12 +252,11 @@ struct csz_flow struct sk_buff_head q; /* FIFO queue */ }; -#define L2R(q,f,L) ((f)->L_tab[(L)>>(q)->cell_log]) +#define L2R(f,L) ((f)->L_tab[(L)>>(f)->slice.cell_log]) struct csz_sched_data { /* Parameters */ - unsigned char cell_log; /* 1<<cell_log is quantum of packet size */ unsigned char rate_log; /* fixed point position for rate; * really we need not it */ unsigned char R_log; /* fixed point position for round number */ @@ -259,6 +264,8 @@ struct csz_sched_data * 21 <-> 2.1sec is MAXIMAL value */ /* Variables */ + struct tcf_proto *filter_list; + u8 prio2band[TC_PRIO_MAX+1]; #ifdef CSZ_PLUS_TBF struct timer_list wd_timer; long wd_expires; @@ -270,8 +277,8 @@ struct csz_sched_data struct csz_head f; /* Flows sorted by "finish" */ struct sk_buff_head other[4];/* Predicted (0) and the best efforts - classes (1,2,3) */ - struct csz_flow flow[CSZ_MAX_GUARANTEED]; /* Array of flows */ + classes (1,2,3) */ + struct csz_flow flow[CSZ_GUARANTEED]; /* Array of flows */ }; /* These routines (csz_insert_finish and csz_insert_start) are @@ -353,7 +360,11 @@ extern __inline__ void csz_insert_start(struct csz_head *b, It is another time consuming part, but it is impossible to avoid it. - Fixed point arithmetic is not ... does not ... Well, it is just CRAP. + It costs O(N) that make all the algorithm useful only + to play with closest to ideal fluid model. 
+ + There exist less academic, but more practical modifications, + which might have even better characteristics (WF2Q+, HPFQ, HFSC) */ static unsigned long csz_update(struct Qdisc *sch) @@ -430,9 +441,9 @@ do_reset: tmp = ((F-q->R_c)*q->rate)<<q->R_log; R_c = F; - q->rate -= a->rate; + q->rate -= a->slice.rate; - if (delay - tmp >= 0) { + if ((long)(delay - tmp) >= 0) { delay -= tmp; continue; } @@ -443,35 +454,41 @@ do_reset: return tmp; } +unsigned csz_classify(struct sk_buff *skb, struct csz_sched_data *q) +{ + return CSZ_GUARANTEED; +} + static int csz_enqueue(struct sk_buff *skb, struct Qdisc* sch) { struct csz_sched_data *q = (struct csz_sched_data *)sch->data; - unsigned flow_id = CSZ_FLOW_ID(skb); + unsigned flow_id = csz_classify(skb, q); unsigned long R; - int prio; + int prio = 0; struct csz_flow *this; - if (flow_id >= CSZ_MAX_GUARANTEED) { - prio = flow_id - CSZ_MAX_GUARANTEED; + if (flow_id >= CSZ_GUARANTEED) { + prio = flow_id - CSZ_GUARANTEED; flow_id = 0; } this = &q->flow[flow_id]; - if (this->q.qlen >= this->max_bytes || this->L_tab == NULL) { + if (this->q.qlen >= this->limit || this->L_tab == NULL) { + sch->stats.drops++; kfree_skb(skb); return 0; } R = csz_update(sch); - if (this->finish - R >= 0) { + if ((long)(this->finish - R) >= 0) { /* It was active */ - this->finish += L2R(q,this,skb->len); + this->finish += L2R(this,skb->len); } else { /* It is inactive; activate it */ - this->finish = R + L2R(q,this,skb->len); - q->rate += this->rate; + this->finish = R + L2R(this,skb->len); + q->rate += this->slice.rate; csz_insert_finish(&q->f, this); } @@ -486,6 +503,8 @@ csz_enqueue(struct sk_buff *skb, struct Qdisc* sch) else skb_queue_tail(&q->other[prio], skb); sch->q.qlen++; + sch->stats.bytes += skb->len; + sch->stats.packets++; return 1; } @@ -524,10 +543,6 @@ skb_peek_best(struct csz_sched_data * q) static void csz_watchdog(unsigned long arg) { struct Qdisc *sch = (struct Qdisc*)arg; - struct csz_sched_data *q = (struct csz_sched_data*)sch->data; - - q->wd_timer.expires = 0; - q->wd_timer.function = NULL; qdisc_wakeup(sch->dev); } @@ -568,7 +583,7 @@ static __inline__ int csz_enough_tokens(struct csz_sched_data *q, if (toks >= 0) { /* Now we have enough tokens to proceed */ - this->tokens = toks <= this->depth ? toks ? this->depth; + this->tokens = toks <= this->depth ? toks : this->depth; this->t_tbf = now; if (!this->throttled) @@ -601,7 +616,7 @@ static __inline__ int csz_enough_tokens(struct csz_sched_data *q, This apriory shift in R will be adjusted later to reflect real delay. We cannot avoid it because of: - throttled flow continues to be active from the viewpoint - of CSZ, so that it would acquire highest priority, + of CSZ, so that it would acquire the highest priority, if you not adjusted start numbers. - Eventually, finish number would become less than round number and flow were declared inactive. 
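
csz_enqueue() above keeps the classic WFQ finish-number invariant: a flow that is still active extends its finish time by len/rate, while an idle flow restarts from the current round number R returned by csz_update(). Below is a floating-point toy model of that bookkeeping (assumed names and units, not the patch's scaled fixed-point arithmetic).

/* Toy finish-number update, doubles instead of scaled round numbers. */
#include <stdio.h>

struct flow { double finish; double rate; int active; };

static void enqueue(struct flow *f, double R, double len)
{
        if (f->active && f->finish >= R)
                f->finish += len / f->rate;      /* still active: back to back */
        else {
                f->finish = R + len / f->rate;   /* idle: restart from round R */
                f->active = 1;
        }
}

int main(void)
{
        struct flow f = { 0.0, 125000.0 /* bytes/sec */, 0 };

        enqueue(&f, 10.0, 1500);   /* idle   -> finish = 10.012 */
        enqueue(&f, 10.0, 1500);   /* active -> finish = 10.024 */
        printf("finish = %f\n", f.finish);
        return 0;
}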
@@ -654,7 +669,7 @@ csz_dequeue(struct Qdisc* sch) #endif if (this->q.qlen) { struct sk_buff *nskb = skb_peek(&this->q); - this->start += L2R(q,this,nskb->len); + this->start += L2R(this,nskb->len); csz_insert_start(&q->s, this); } sch->q.qlen--; @@ -668,7 +683,7 @@ csz_dequeue(struct Qdisc* sch) if (--this->q.qlen) { struct sk_buff *nskb; - unsigned dequeued = L2R(q,this,skb->len); + unsigned dequeued = L2R(this,skb->len); /* We got not the same thing that peeked earlier; adjust start number @@ -677,7 +692,7 @@ csz_dequeue(struct Qdisc* sch) this->start += dequeued - peeked; nskb = skb_peek_best(q); - peeked = L2R(q,this,nskb->len); + peeked = L2R(this,nskb->len); this->start += peeked; this->peeked = peeked; csz_insert_start(&q->s, this); @@ -692,11 +707,13 @@ csz_dequeue(struct Qdisc* sch) Schedule watchdog timer, if it occured because of shaping. */ if (q->wd_expires) { - if (q->wd_timer.function) - del_timer(&q->wd_timer); - q->wd_timer.function = csz_watchdog; - q->wd_timer.expires = jiffies + PSCHED_US2JIFFIE(q->wd_expires); + unsigned long delay = PSCHED_US2JIFFIE(q->wd_expires); + del_timer(&q->wd_timer); + if (delay == 0) + delay = 1; + q->wd_timer.expires = jiffies + delay; add_timer(&q->wd_timer); + sch->stats.overlimits++; } #endif return NULL; @@ -706,17 +723,14 @@ static void csz_reset(struct Qdisc* sch) { struct csz_sched_data *q = (struct csz_sched_data *)sch->data; - struct sk_buff *skb; int i; for (i=0; i<4; i++) - while ((skb=skb_dequeue(&q->other[i])) != NULL) - kfree_skb(skb); + skb_queue_purge(&q->other[i]); - for (i=0; i<CSZ_MAX_GUARANTEED; i++) { + for (i=0; i<CSZ_GUARANTEED; i++) { struct csz_flow *this = q->flow + i; - while ((skb = skb_dequeue(&this->q)) != NULL) - kfree_skb(skb); + skb_queue_purge(&this->q); this->snext = this->sprev = this->fnext = this->fprev = (struct csz_head*)this; this->start = this->finish = 0; @@ -727,10 +741,7 @@ csz_reset(struct Qdisc* sch) #ifdef CSZ_PLUS_TBF PSCHED_GET_TIME(&q->t_tbf); q->tokens = q->depth; - if (q->wd_timer.function) { - del_timer(&q->wd_timer); - q->wd_timer.function = NULL; - } + del_timer(&q->wd_timer); #endif sch->q.qlen = 0; } @@ -738,25 +749,34 @@ csz_reset(struct Qdisc* sch) static void csz_destroy(struct Qdisc* sch) { -/* - struct csz_sched_data *q = (struct csz_sched_data *)sch->data; - int i; - - for (i=0; i<4; i++) - qdisc_destroy(q->other[i]); - */ + MOD_DEC_USE_COUNT; } -static int csz_init(struct Qdisc *sch, void *arg) +static int csz_init(struct Qdisc *sch, struct rtattr *opt) { struct csz_sched_data *q = (struct csz_sched_data *)sch->data; - struct cszinitctl *ctl = (struct cszinitctl*)arg; + struct rtattr *tb[TCA_CSZ_PTAB]; + struct tc_csz_qopt *qopt; int i; + rtattr_parse(tb, TCA_CSZ_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)); + if (tb[TCA_CSZ_PARMS-1] == NULL || + RTA_PAYLOAD(tb[TCA_CSZ_PARMS-1]) < sizeof(*qopt)) + return -EINVAL; + qopt = RTA_DATA(tb[TCA_CSZ_PARMS-1]); + + q->R_log = qopt->R_log; + q->delta_log = qopt->delta_log; + for (i=0; i<=TC_PRIO_MAX; i++) { + if (qopt->priomap[i] >= CSZ_FLOWS) + return -EINVAL; + q->prio2band[i] = qopt->priomap[i]; + } + for (i=0; i<4; i++) skb_queue_head_init(&q->other[i]); - for (i=0; i<CSZ_MAX_GUARANTEED; i++) { + for (i=0; i<CSZ_GUARANTEED; i++) { struct csz_flow *this = q->flow + i; skb_queue_head_init(&this->q); this->snext = this->sprev = @@ -769,64 +789,268 @@ static int csz_init(struct Qdisc *sch, void *arg) #ifdef CSZ_PLUS_TBF init_timer(&q->wd_timer); q->wd_timer.data = (unsigned long)sch; + q->wd_timer.function = csz_watchdog; +#endif + 
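
Several comparisons in the hunks above (csz_update(), csz_enqueue(), and cbq_undelay_prio() earlier) are rewritten as (long)(a - b) tests. With free-running unsigned counters such as jiffies or the round number, only the signed difference tells whether one value is "after" the other; a direct unsigned comparison gives the wrong answer across a wrap. A minimal user-space demonstration, with arbitrarily chosen values:

/* Wraparound-safe ordering via signed difference. */
#include <stdio.h>

int main(void)
{
        unsigned long b = (unsigned long)-10;   /* counter just before wrap   */
        unsigned long a = 5;                    /* same counter, 15 ticks later */

        /* naive unsigned comparison claims a is "before" b */
        printf("a >= b          : %d\n", a >= b);             /* 0, wrong   */

        /* signed-difference test used in the patch: a is after b */
        printf("(long)(a-b) > 0 : %d\n", (long)(a - b) > 0);  /* 1, correct */
        printf("(long)(b-a) > 0 : %d\n", (long)(b - a) > 0);  /* 0, correct */
        return 0;
}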
MOD_INC_USE_COUNT; + return 0; +} + +#ifdef CONFIG_RTNETLINK +static int csz_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_csz_qopt opt; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + opt.flows = CSZ_FLOWS; + memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1); + RTA_PUT(skb, TCA_CSZ_PARMS, sizeof(opt), &opt); + rta->rta_len = skb->tail - b; + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} #endif - if (ctl) { - if (ctl->flows != CSZ_MAX_GUARANTEED) + + +static int csz_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, + struct Qdisc **old) +{ + return -EINVAL; +} + +static unsigned long csz_get(struct Qdisc *sch, u32 classid) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + unsigned long band = TC_H_MIN(classid) - 1; + + if (band >= CSZ_FLOWS) + return 0; + + if (band < CSZ_GUARANTEED && q->flow[band].L_tab == NULL) + return 0; + + return band+1; +} + +static void csz_put(struct Qdisc *sch, unsigned long cl) +{ + return; +} + +static int csz_change(struct Qdisc *sch, u32 handle, u32 parent, struct rtattr **tca, unsigned long *arg) +{ + unsigned long cl = *arg; + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_CSZ_PTAB]; + struct tc_csz_copt *copt; + + rtattr_parse(tb, TCA_CSZ_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)); + if (tb[TCA_CSZ_PARMS-1] == NULL || + RTA_PAYLOAD(tb[TCA_CSZ_PARMS-1]) < sizeof(*copt)) + return -EINVAL; + copt = RTA_DATA(tb[TCA_CSZ_PARMS-1]); + + if (tb[TCA_CSZ_RTAB-1] && + RTA_PAYLOAD(tb[TCA_CSZ_RTAB-1]) < 1024) + return -EINVAL; + + if (cl) { + struct csz_flow *a; + cl--; + if (cl >= CSZ_FLOWS) + return -ENOENT; + if (cl >= CSZ_GUARANTEED || q->flow[cl].L_tab == NULL) return -EINVAL; - q->cell_log = ctl->cell_log; + + a = &q->flow[cl]; + + start_bh_atomic(); +#if 0 + a->rate_log = copt->rate_log; +#endif +#ifdef CSZ_PLUS_TBF + a->limit = copt->limit; + a->rate = copt->rate; + a->buffer = copt->buffer; + a->mtu = copt->mtu; +#endif + + if (tb[TCA_CSZ_RTAB-1]) + memcpy(a->L_tab, RTA_DATA(tb[TCA_CSZ_RTAB-1]), 1024); + + end_bh_atomic(); + return 0; } + /* NI */ return 0; } -static int csz_control(struct Qdisc *sch, struct pschedctl *gctl) +static int csz_delete(struct Qdisc *sch, unsigned long cl) { -/* struct csz_sched_data *q = (struct csz_sched_data *)sch->data; - struct cszctl *ctl = (struct cszctl*)gctl->arg; - struct sk_buff *skb; - int i; + struct csz_flow *a; + + cl--; + + if (cl >= CSZ_FLOWS) + return -ENOENT; + if (cl >= CSZ_GUARANTEED || q->flow[cl].L_tab == NULL) + return -EINVAL; + + a = &q->flow[cl]; + + start_bh_atomic(); + a->fprev->fnext = a->fnext; + a->fnext->fprev = a->fprev; + a->sprev->snext = a->snext; + a->snext->sprev = a->sprev; + a->start = a->finish = 0; + kfree(xchg(&q->flow[cl].L_tab, NULL)); + end_bh_atomic(); - if (op == PSCHED_TC_ATTACH) { - - } -*/ return 0; } +#ifdef CONFIG_RTNETLINK +static int csz_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_csz_copt opt; + + tcm->tcm_handle = sch->handle|cl; + + cl--; + + if (cl > CSZ_FLOWS) + goto rtattr_failure; + + if (cl < CSZ_GUARANTEED) { + struct csz_flow *f = &q->flow[cl]; + + if (f->L_tab == NULL) + goto 
rtattr_failure; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + opt.limit = f->limit; + opt.rate = f->rate; + opt.slice = f->slice; + memset(&opt.peakrate, 0, sizeof(opt.peakrate)); +#ifdef CSZ_PLUS_TBF + opt.buffer = f->buffer; + opt.mtu = f->mtu; +#else + opt.buffer = 0; + opt.mtu = 0; +#endif + + RTA_PUT(skb, TCA_CSZ_PARMS, sizeof(opt), &opt); + rta->rta_len = skb->tail - b; + } + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif + +static void csz_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + int prio = 0; + + if (arg->stop) + return; + + for (prio = 0; prio < CSZ_FLOWS; prio++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (prio < CSZ_GUARANTEED && q->flow[prio].L_tab == NULL) { + arg->count++; + continue; + } + if (arg->fn(sch, prio+1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static struct tcf_proto ** csz_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + + if (cl) + return NULL; + return &q->filter_list; +} +struct Qdisc_class_ops csz_class_ops = +{ + csz_graft, + csz_get, + csz_put, + csz_change, + csz_delete, + csz_walk, + + csz_find_tcf, + csz_get, + csz_put, + +#ifdef CONFIG_RTNETLINK + csz_dump_class, +#endif +}; -struct Qdisc_ops csz_ops = +struct Qdisc_ops csz_qdisc_ops = { NULL, + &csz_class_ops, "csz", - 0, sizeof(struct csz_sched_data), + csz_enqueue, csz_dequeue, + NULL, + NULL, + + csz_init, csz_reset, csz_destroy, - csz_init, - csz_control, + +#ifdef CONFIG_RTNETLINK + csz_dump, +#endif }; #ifdef MODULE -#include <linux/module.h> int init_module(void) { - int err; - - /* Load once and never free it. */ - MOD_INC_USE_COUNT; - - err = register_qdisc(&csz_ops); - if (err) - MOD_DEC_USE_COUNT; - return err; + return register_qdisc(&csz_qdisc_ops); } void cleanup_module(void) { + unregister_qdisc(&csz_qdisc_ops); } #endif diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index af44d4e75..14bc8bb8b 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -1,9 +1,15 @@ /* - * net/sched/sch_fifo.c Simple FIFO "scheduler" + * net/sched/sch_fifo.c The simplest FIFO queue. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
* * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ +#include <linux/config.h> #include <asm/uaccess.h> #include <asm/system.h> #include <asm/bitops.h> @@ -32,9 +38,7 @@ struct fifo_sched_data { - int qmaxbytes; - int qmaxlen; - int qbytes; + unsigned limit; }; static int @@ -42,41 +46,62 @@ bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) { struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data; - if (q->qbytes <= q->qmaxbytes) { - skb_queue_tail(&sch->q, skb); - q->qbytes += skb->len; - return 0; + if (sch->stats.backlog <= q->limit) { + __skb_queue_tail(&sch->q, skb); + sch->stats.backlog += skb->len; + sch->stats.bytes += skb->len; + sch->stats.packets++; + return 1; } - kfree_skb(skb); + sch->stats.drops++; +#ifdef CONFIG_NET_CLS_POLICE + if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch)) +#endif + kfree_skb(skb); + return 0; +} + +static int +bfifo_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + __skb_queue_head(&sch->q, skb); + sch->stats.backlog += skb->len; return 1; } static struct sk_buff * bfifo_dequeue(struct Qdisc* sch) { - struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data; struct sk_buff *skb; - skb = skb_dequeue(&sch->q); + skb = __skb_dequeue(&sch->q); if (skb) - q->qbytes -= skb->len; + sch->stats.backlog -= skb->len; return skb; } -static void -bfifo_reset(struct Qdisc* sch) +static int +fifo_drop(struct Qdisc* sch) { - struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data; struct sk_buff *skb; - while((skb=skb_dequeue(&sch->q)) != NULL) { - q->qbytes -= skb->len; + skb = __skb_dequeue_tail(&sch->q); + if (skb) { + sch->stats.backlog -= skb->len; kfree_skb(skb); + return 1; } - if (q->qbytes) { - printk("fifo_reset: qbytes=%d\n", q->qbytes); - q->qbytes = 0; - } + return 0; +} + +static void +fifo_reset(struct Qdisc* sch) +{ + struct sk_buff *skb; + + while ((skb=__skb_dequeue(&sch->q)) != NULL) + kfree_skb(skb); + sch->stats.backlog = 0; } static int @@ -84,96 +109,106 @@ pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) { struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data; - if (sch->q.qlen <= q->qmaxlen) { - skb_queue_tail(&sch->q, skb); - return 0; + if (sch->q.qlen <= q->limit) { + __skb_queue_tail(&sch->q, skb); + sch->stats.bytes += skb->len; + sch->stats.packets++; + return 1; } - kfree_skb(skb); + sch->stats.drops++; +#ifdef CONFIG_NET_CLS_POLICE + if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch)) +#endif + kfree_skb(skb); + return 0; +} + +static int +pfifo_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + __skb_queue_head(&sch->q, skb); return 1; } + static struct sk_buff * pfifo_dequeue(struct Qdisc* sch) { - return skb_dequeue(&sch->q); + return __skb_dequeue(&sch->q); } -static void -pfifo_reset(struct Qdisc* sch) -{ - struct sk_buff *skb; - while((skb=skb_dequeue(&sch->q))!=NULL) - kfree_skb(skb); +static int fifo_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct fifo_sched_data *q = (void*)sch->data; + + if (opt == NULL) { + q->limit = sch->dev->tx_queue_len; + if (sch->ops == &bfifo_qdisc_ops) + q->limit *= sch->dev->mtu; + } else { + struct tc_fifo_qopt *ctl = RTA_DATA(opt); + if (opt->rta_len < RTA_LENGTH(sizeof(*ctl))) + return -EINVAL; + q->limit = ctl->limit; + } + return 0; } - -static int fifo_init(struct Qdisc *sch, void *arg /* int bytes, int pkts */) +#ifdef CONFIG_RTNETLINK +static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb) { - struct fifo_sched_data *q; -/* - struct device *dev = sch->dev; - */ + struct fifo_sched_data *q = 
(void*)sch->data; + unsigned char *b = skb->tail; + struct tc_fifo_qopt opt; - q = (struct fifo_sched_data *)sch->data; -/* - if (pkts<0) - pkts = dev->tx_queue_len; - if (bytes<0) - bytes = pkts*dev->mtu; - q->qmaxbytes = bytes; - q->qmaxlen = pkts; - */ - return 0; + opt.limit = q->limit; + RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; } +#endif -struct Qdisc_ops pfifo_ops = +struct Qdisc_ops pfifo_qdisc_ops = { NULL, + NULL, "pfifo", - 0, sizeof(struct fifo_sched_data), + pfifo_enqueue, pfifo_dequeue, - pfifo_reset, - NULL, + pfifo_requeue, + fifo_drop, + fifo_init, + fifo_reset, + NULL, +#ifdef CONFIG_RTNETLINK + fifo_dump, +#endif }; -struct Qdisc_ops bfifo_ops = +struct Qdisc_ops bfifo_qdisc_ops = { NULL, - "pfifo", - 0, + NULL, + "bfifo", sizeof(struct fifo_sched_data), + bfifo_enqueue, bfifo_dequeue, - bfifo_reset, - NULL, - fifo_init, -}; - -#ifdef MODULE -#include <linux/module.h> -int init_module(void) -{ - int err; - - /* Load once and never free it. */ - MOD_INC_USE_COUNT; + bfifo_requeue, + fifo_drop, - err = register_qdisc(&pfifo_ops); - if (err == 0) { - err = register_qdisc(&bfifo_ops); - if (err) - unregister_qdisc(&pfifo_ops); - } - if (err) - MOD_DEC_USE_COUNT; - return err; -} - -void cleanup_module(void) -{ -} + fifo_init, + fifo_reset, + NULL, +#ifdef CONFIG_RTNETLINK + fifo_dump, #endif +}; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index c3399f9c1..5e07bced8 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -30,66 +30,116 @@ #include <net/sock.h> #include <net/pkt_sched.h> +#define BUG_TRAP(x) if (!(x)) { printk("Assertion (" #x ") failed at " __FILE__ "(%d):" __FUNCTION__ "\n", __LINE__); } + +/* Main transmission queue. */ + struct Qdisc_head qdisc_head = { &qdisc_head }; -static struct Qdisc_ops *qdisc_base = NULL; +/* Kick device. + Note, that this procedure can be called by watchdog timer, so that + we do not check dev->tbusy flag here. + + Returns: 0 - queue is empty. + >0 - queue is not empty, but throttled. + <0 - queue is not empty. Device is throttled, if dev->tbusy != 0. + + NOTE: Called only from NET BH +*/ -static int default_requeue(struct sk_buff *skb, struct Qdisc* qdisc); +int qdisc_restart(struct device *dev) +{ + struct Qdisc *q = dev->qdisc; + struct sk_buff *skb; + if ((skb = q->dequeue(q)) != NULL) { + if (netdev_nit) + dev_queue_xmit_nit(skb, dev); -/* NOTES. + if (dev->hard_start_xmit(skb, dev) == 0) { + q->tx_last = jiffies; + return -1; + } - Every discipline has two major routines: enqueue and dequeue. + /* Device kicked us out :( + It is possible in three cases: - ---dequeue + 1. fastroute is enabled + 2. device cannot determine busy state + before start of transmission (f.e. dialout) + 3. device is buggy (ppp) + */ - dequeue usually returns a skb to send. It is allowed to return NULL, - but it does not mean that queue is empty, it just means that - discipline does not want to send anything this time. - Queue is really empty if q->q.qlen == 0. - For complicated disciplines with multiple queues q->q is not - real packet queue, but however q->q.qlen must be valid. + q->ops->requeue(skb, q); + return -1; + } + return q->q.qlen; +} - ---enqueue +/* Scan transmission queue and kick devices. - enqueue returns number of enqueued packets i.e. this number is 1, - if packet was enqueued sucessfully and <1 if something (not - necessary THIS packet) was dropped. 
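
Back in the sch_fifo.c hunks above, the two flavours differ only in what they count against q->limit: bfifo_enqueue() compares the byte backlog, pfifo_enqueue() the packet count, and fifo_init() derives the default limit from tx_queue_len (multiplied by the MTU for the byte variant). A compact user-space stand-in for that behaviour follows; the field names are ours, not the kernel's.

/* Byte-limited vs packet-limited FIFO admission, sketched in user space. */
#include <stdio.h>

struct fifo { unsigned limit; unsigned backlog; unsigned qlen; int bytes; };

static int enqueue(struct fifo *f, unsigned pkt_len)
{
        unsigned fill = f->bytes ? f->backlog : f->qlen;

        if (fill > f->limit)
                return 0;                /* drop, like the patch's return 0 */
        f->qlen++;
        f->backlog += pkt_len;
        return 1;                        /* accepted */
}

int main(void)
{
        unsigned tx_queue_len = 100, mtu = 1500;
        struct fifo pfifo = { tx_queue_len,       0, 0, 0 };  /* packets */
        struct fifo bfifo = { tx_queue_len * mtu, 0, 0, 1 };  /* bytes   */

        printf("pfifo accepts: %d\n", enqueue(&pfifo, 1500));
        printf("bfifo accepts: %d\n", enqueue(&bfifo, 1500));
        return 0;
}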
+ Deficiency: slow devices (ppp) and fast ones (100Mb ethernet) + share one queue. It means, that if we have a lot of loaded ppp channels, + we will scan a long list on every 100Mb EOI. + I have no idea how to solve it using only "anonymous" Linux mark_bh(). + To change queue from device interrupt? Ough... only not this... */ -int register_qdisc(struct Qdisc_ops *qops) +void qdisc_run_queues(void) { - struct Qdisc_ops *q, **qp; - for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) - if (strcmp(qops->id, q->id) == 0) - return -EEXIST; - qops->next = NULL; - qops->refcnt = 0; - *qp = qops; - return 0; -} + struct Qdisc_head **hp, *h; -int unregister_qdisc(struct Qdisc_ops *qops) -{ - struct Qdisc_ops *q, **qp; - for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) - if (q == qops) - break; - if (!q) - return -ENOENT; - if (q->requeue == NULL) - q->requeue = default_requeue; - *qp = q->next; - return 0; + hp = &qdisc_head.forw; + while ((h = *hp) != &qdisc_head) { + int res = -1; + struct Qdisc *q = (struct Qdisc*)h; + struct device *dev = q->dev; + + while (!dev->tbusy && (res = qdisc_restart(dev)) < 0) + /* NOTHING */; + + /* The explanation is necessary here. + qdisc_restart called dev->hard_start_xmit, + if device is virtual, it could trigger one more + dev_queue_xmit and new device could appear + in active chain. In this case we cannot unlink + empty queue, because we lost back pointer. + No problem, we will unlink it during the next round. + */ + + if (res == 0 && *hp == h) { + *hp = h->forw; + h->forw = NULL; + continue; + } + hp = &h->forw; + } } -struct Qdisc *qdisc_lookup(int handle) +/* Periodic watchdoc timer to recover of hard/soft device bugs. */ + +static void dev_do_watchdog(unsigned long dummy); + +static struct timer_list dev_watchdog = + { NULL, NULL, 0L, 0L, &dev_do_watchdog }; + +static void dev_do_watchdog(unsigned long dummy) { - return NULL; + struct Qdisc_head *h; + + for (h = qdisc_head.forw; h != &qdisc_head; h = h->forw) { + struct Qdisc *q = (struct Qdisc*)h; + struct device *dev = q->dev; + if (dev->tbusy && jiffies - q->tx_last > q->tx_timeo) + qdisc_restart(dev); + } + dev_watchdog.expires = jiffies + 5*HZ; + add_timer(&dev_watchdog); } + /* "NOOP" scheduler: the best scheduler, recommended for all interfaces in all curcumstances. It is difficult to invent anything more fast or cheap. @@ -108,11 +158,48 @@ noop_dequeue(struct Qdisc * qdisc) return NULL; } +static int +noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc) +{ + if (net_ratelimit()) + printk(KERN_DEBUG "%s deferred output. It is buggy.\n", skb->dev->name); + kfree_skb(skb); + return 0; +} + +struct Qdisc_ops noop_qdisc_ops = +{ + NULL, + NULL, + "noop", + 0, + + noop_enqueue, + noop_dequeue, + noop_requeue, +}; + struct Qdisc noop_qdisc = { { NULL }, noop_enqueue, noop_dequeue, + TCQ_F_DEFAULT|TCQ_F_BUILTIN, + &noop_qdisc_ops, +}; + + +struct Qdisc_ops noqueue_qdisc_ops = +{ + NULL, + NULL, + "noqueue", + 0, + + noop_enqueue, + noop_dequeue, + noop_requeue, + }; struct Qdisc noqueue_qdisc = @@ -120,25 +207,32 @@ struct Qdisc noqueue_qdisc = { NULL }, NULL, NULL, + TCQ_F_DEFAULT|TCQ_F_BUILTIN, + &noqueue_qdisc_ops, }; +static const u8 prio2band[TC_PRIO_MAX+1] = +{ 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 }; -/* 3-band FIFO queue: old style, but should be a bit faster (several CPU insns) */ +/* 3-band FIFO queue: old style, but should be a bit faster than + generic prio+fifo combination. 
+ */ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) { - const static u8 prio2band[8] = { 1, 2, 2, 2, 1, 2, 0, 0 }; struct sk_buff_head *list; - list = ((struct sk_buff_head*)qdisc->data) + prio2band[skb->priority&7]; + list = ((struct sk_buff_head*)qdisc->data) + + prio2band[skb->priority&TC_PRIO_MAX]; if (list->qlen <= skb->dev->tx_queue_len) { __skb_queue_tail(list, skb); + qdisc->q.qlen++; return 1; } - qdisc->dropped++; + qdisc->stats.drops++; kfree_skb(skb); return 0; } @@ -152,8 +246,10 @@ pfifo_fast_dequeue(struct Qdisc* qdisc) for (prio = 0; prio < 3; prio++, list++) { skb = __skb_dequeue(list); - if (skb) + if (skb) { + qdisc->q.qlen--; return skb; + } } return NULL; } @@ -161,12 +257,13 @@ pfifo_fast_dequeue(struct Qdisc* qdisc) static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc) { - const static u8 prio2band[8] = { 1, 2, 2, 2, 1, 2, 0, 0 }; struct sk_buff_head *list; - list = ((struct sk_buff_head*)qdisc->data) + prio2band[skb->priority&7]; + list = ((struct sk_buff_head*)qdisc->data) + + prio2band[skb->priority&TC_PRIO_MAX]; __skb_queue_head(list, skb); + qdisc->q.qlen++; return 1; } @@ -178,16 +275,17 @@ pfifo_fast_reset(struct Qdisc* qdisc) for (prio=0; prio < 3; prio++) skb_queue_purge(list+prio); + qdisc->q.qlen = 0; } -static int pfifo_fast_init(struct Qdisc *qdisc, void *arg) +static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt) { int i; struct sk_buff_head *list; list = ((struct sk_buff_head*)qdisc->data); - for(i=0; i<3; i++) + for (i=0; i<3; i++) skb_queue_head_init(list+i); return 0; @@ -196,29 +294,20 @@ static int pfifo_fast_init(struct Qdisc *qdisc, void *arg) static struct Qdisc_ops pfifo_fast_ops = { NULL, + NULL, "pfifo_fast", - 1, 3 * sizeof(struct sk_buff_head), + pfifo_fast_enqueue, pfifo_fast_dequeue, - pfifo_fast_reset, + pfifo_fast_requeue, NULL, + pfifo_fast_init, - NULL, - pfifo_fast_requeue + pfifo_fast_reset, }; -static int -default_requeue(struct sk_buff *skb, struct Qdisc* qdisc) -{ - if (net_ratelimit()) - printk(KERN_DEBUG "%s deferred output. 
It is buggy.\n", skb->dev->name); - kfree_skb(skb); - return 0; -} - -static struct Qdisc * -qdisc_alloc(struct device *dev, struct Qdisc_ops *ops, void *arg) +struct Qdisc * qdisc_create_dflt(struct device *dev, struct Qdisc_ops *ops) { struct Qdisc *sch; int size = sizeof(*sch) + ops->priv_size; @@ -233,56 +322,48 @@ qdisc_alloc(struct device *dev, struct Qdisc_ops *ops, void *arg) sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev = dev; - if (ops->init && ops->init(sch, arg)) - return NULL; - ops->refcnt++; - return sch; + sch->flags |= TCQ_F_DEFAULT; + if (ops->init && ops->init(sch, NULL) == 0) + return sch; + + kfree(sch); + return NULL; } void qdisc_reset(struct Qdisc *qdisc) { struct Qdisc_ops *ops = qdisc->ops; - if (ops) { - start_bh_atomic(); - if (ops->reset) - ops->reset(qdisc); - end_bh_atomic(); - } + start_bh_atomic(); + if (ops->reset) + ops->reset(qdisc); + end_bh_atomic(); } void qdisc_destroy(struct Qdisc *qdisc) { struct Qdisc_ops *ops = qdisc->ops; - if (ops) { - start_bh_atomic(); - if (ops->reset) - ops->reset(qdisc); - if (ops->destroy) - ops->destroy(qdisc); - ops->refcnt--; - end_bh_atomic(); - kfree(qdisc); - } -} - -static void dev_do_watchdog(unsigned long dummy); - -static struct timer_list dev_watchdog = - { NULL, NULL, 0L, 0L, &dev_do_watchdog }; - -static void dev_do_watchdog(unsigned long dummy) -{ - struct Qdisc_head *h; - - for (h = qdisc_head.forw; h != &qdisc_head; h = h->forw) { - struct Qdisc *q = (struct Qdisc*)h; - struct device *dev = q->dev; - if (dev->tbusy && jiffies - q->tx_last > q->tx_timeo) { - qdisc_restart(dev); - } +#ifdef CONFIG_NET_SCHED + if (qdisc->dev) { + struct Qdisc *q, **qp; + for (qp = &qdisc->dev->qdisc_list; (q=*qp) != NULL; qp = &q->next) + if (q == qdisc) { + *qp = q->next; + q->next = NULL; + break; + } } - dev_watchdog.expires = jiffies + 5*HZ; - add_timer(&dev_watchdog); +#ifdef CONFIG_NET_ESTIMATOR + qdisc_kill_estimator(&qdisc->stats); +#endif +#endif + start_bh_atomic(); + if (ops->reset) + ops->reset(qdisc); + if (ops->destroy) + ops->destroy(qdisc); + end_bh_atomic(); + if (!(qdisc->flags&TCQ_F_BUILTIN)) + kfree(qdisc); } @@ -291,15 +372,17 @@ void dev_activate(struct device *dev) /* No queueing discipline is attached to device; create default one i.e. 
pfifo_fast for devices, which need queueing and noqueue_qdisc for - virtual intrfaces + virtual interfaces */ if (dev->qdisc_sleeping == &noop_qdisc) { if (dev->tx_queue_len) { struct Qdisc *qdisc; - qdisc = qdisc_alloc(dev, &pfifo_fast_ops, NULL); - if (qdisc == NULL) + qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops); + if (qdisc == NULL) { + printk(KERN_INFO "%s: activation failed\n", dev->name); return; + } dev->qdisc_sleeping = qdisc; } else dev->qdisc_sleeping = &noqueue_qdisc; @@ -309,10 +392,9 @@ void dev_activate(struct device *dev) if ((dev->qdisc = dev->qdisc_sleeping) != &noqueue_qdisc) { dev->qdisc->tx_timeo = 5*HZ; dev->qdisc->tx_last = jiffies - dev->qdisc->tx_timeo; - if (!dev_watchdog.expires) { + if (!del_timer(&dev_watchdog)) dev_watchdog.expires = jiffies + 5*HZ; - add_timer(&dev_watchdog); - } + add_timer(&dev_watchdog); } end_bh_atomic(); } @@ -323,8 +405,7 @@ void dev_deactivate(struct device *dev) start_bh_atomic(); - qdisc = dev->qdisc; - dev->qdisc = &noop_qdisc; + qdisc = xchg(&dev->qdisc, &noop_qdisc); qdisc_reset(qdisc); @@ -346,6 +427,7 @@ void dev_init_scheduler(struct device *dev) { dev->qdisc = &noop_qdisc; dev->qdisc_sleeping = &noop_qdisc; + dev->qdisc_list = NULL; } void dev_shutdown(struct device *dev) @@ -354,12 +436,15 @@ void dev_shutdown(struct device *dev) start_bh_atomic(); qdisc = dev->qdisc_sleeping; + dev->qdisc = &noop_qdisc; dev->qdisc_sleeping = &noop_qdisc; - qdisc_destroy(qdisc); + qdisc_destroy(qdisc); + BUG_TRAP(dev->qdisc_list == NULL); + dev->qdisc_list = NULL; end_bh_atomic(); } -void dev_set_scheduler(struct device *dev, struct Qdisc *qdisc) +struct Qdisc * dev_set_scheduler(struct device *dev, struct Qdisc *qdisc) { struct Qdisc *oqdisc; @@ -369,195 +454,20 @@ void dev_set_scheduler(struct device *dev, struct Qdisc *qdisc) start_bh_atomic(); oqdisc = dev->qdisc_sleeping; - /* Destroy old scheduler */ + /* Prune old scheduler */ if (oqdisc) - qdisc_destroy(oqdisc); + qdisc_reset(oqdisc); - /* ... and attach new one */ + /* ... and graft new one */ + if (qdisc == NULL) + qdisc = &noop_qdisc; dev->qdisc_sleeping = qdisc; dev->qdisc = &noop_qdisc; end_bh_atomic(); if (dev->flags & IFF_UP) dev_activate(dev); -} - -/* Kick the queue "q". - Note, that this procedure is called by watchdog timer, so that - we do not check dev->tbusy flag here. - Returns: 0 - queue is empty. - >0 - queue is not empty, but throttled. - <0 - queue is not empty. Device is throttled, if dev->tbusy != 0. - - NOTE: Called only from NET BH -*/ - - -int qdisc_restart(struct device *dev) -{ - struct Qdisc *q = dev->qdisc; - struct sk_buff *skb; - - if ((skb = q->dequeue(q)) != NULL) { - if (netdev_nit) - dev_queue_xmit_nit(skb, dev); - - if (dev->hard_start_xmit(skb, dev) == 0) { - q->tx_last = jiffies; - return -1; - } - - if (q->ops) { - q->ops->requeue(skb, q); - return -1; - } - - printk(KERN_DEBUG "%s: it is impossible!!!\n", dev->name); - kfree_skb(skb); - } - return q->q.qlen; + return oqdisc; } -void qdisc_run_queues(void) -{ - struct Qdisc_head **hp, *h; - - hp = &qdisc_head.forw; - while ((h = *hp) != &qdisc_head) { - int res = -1; - struct Qdisc *q = (struct Qdisc*)h; - struct device *dev = q->dev; - - while (!dev->tbusy && (res = qdisc_restart(dev)) < 0) - /* NOTHING */; - - /* The explanation is necessary here. - qdisc_restart called dev->hard_start_xmit, - if device is virtual, it could trigger one more - dev_queue_xmit and new device could appear - in active chain. In this case we cannot unlink - empty queue, because we lost back pointer. 
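The paragraph above is the reason qdisc_run_queues() only unlinks a drained queue when *hp still points at the element being serviced; if transmission recursed into dev_queue_xmit and new entries were pushed onto the head of the active list, the iterator has lost its back pointer and simply leaves the empty entry for the next pass. A small self-contained model of that scan (toy types; the throttled, non-empty case is left out because the toy always drains its queue):

    #include <stdio.h>
    #include <stddef.h>

    struct node {                        /* stands in for struct Qdisc_head */
            struct node *forw;
            int work;                    /* packets left to send */
    };

    static struct node head = { &head, 0 };

    static void scan(void)               /* stands in for qdisc_run_queues() */
    {
            struct node **hp = &head.forw, *h;

            while ((h = *hp) != &head) {
                    while (h->work > 0)  /* stands in for the qdisc_restart() loop */
                            h->work--;

                    /* unlink only if nothing was inserted ahead of us meanwhile */
                    if (*hp == h) {
                            *hp = h->forw;
                            h->forw = NULL;
                            continue;
                    }
                    hp = &h->forw;
            }
    }

    int main(void)
    {
            struct node a = { &head, 3 };

            head.forw = &a;
            scan();
            printf("queue a unlinked: %s\n", a.forw == NULL ? "yes" : "no");
            return 0;
    }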
- No problem, we will unlink it during the next round. - */ - - if (res == 0 && *hp == h) { - *hp = h->forw; - h->forw = NULL; - continue; - } - hp = &h->forw; - } -} - - -int tc_init(struct pschedctl *pctl) -{ - struct Qdisc *q; - struct Qdisc_ops *qops; - - if (pctl->handle) { - q = qdisc_lookup(pctl->handle); - if (q == NULL) - return -ENOENT; - qops = q->ops; - if (pctl->ifindex && q->dev->ifindex != pctl->ifindex) - return -EINVAL; - } - return -EINVAL; -} - -int tc_destroy(struct pschedctl *pctl) -{ - return -EINVAL; -} - -int tc_attach(struct pschedctl *pctl) -{ - return -EINVAL; -} - -int tc_detach(struct pschedctl *pctl) -{ - return -EINVAL; -} - - -int psched_ioctl(void *arg) -{ - struct pschedctl ctl; - struct pschedctl *pctl = &ctl; - int err; - - if (copy_from_user(&ctl, arg, sizeof(ctl))) - return -EFAULT; - - if (ctl.arglen > 0) { - pctl = kmalloc(sizeof(ctl) + ctl.arglen, GFP_KERNEL); - if (pctl == NULL) - return -ENOBUFS; - memcpy(pctl, &ctl, sizeof(ctl)); - if (copy_from_user(pctl->args, ((struct pschedctl*)arg)->args, ctl.arglen)) { - kfree(pctl); - return -EFAULT; - } - } - - rtnl_lock(); - - switch (ctl.command) { - case PSCHED_TC_INIT: - err = tc_init(pctl); - break; - case PSCHED_TC_DESTROY: - err = tc_destroy(pctl); - break; - case PSCHED_TC_ATTACH: - err = tc_attach(pctl); - break; - case PSCHED_TC_DETACH: - err = tc_detach(pctl); - break; - default: - err = -EINVAL; - } - - rtnl_unlock(); - - if (pctl != &ctl) - kfree(pctl); - return err; -} - -__initfunc(int pktsched_init(void)) -{ -#define INIT_QDISC(name) { \ - extern struct Qdisc_ops name##_ops; \ - register_qdisc(&##name##_ops); \ - } - - register_qdisc(&pfifo_fast_ops); -#ifdef CONFIG_NET_SCH_CBQ - INIT_QDISC(cbq); -#endif -#ifdef CONFIG_NET_SCH_CSZ - INIT_QDISC(csz); -#endif -#ifdef CONFIG_NET_SCH_RED - INIT_QDISC(red); -#endif -#ifdef CONFIG_NET_SCH_SFQ - INIT_QDISC(sfq); -#endif -#ifdef CONFIG_NET_SCH_TBF - INIT_QDISC(tbf); -#endif -#ifdef CONFIG_NET_SCH_PFIFO - INIT_QDISC(pfifo); - INIT_QDISC(bfifo); -#endif -#ifdef CONFIG_NET_SCH_PRIO - INIT_QDISC(prio); -#endif - return 0; -} diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index a3806eda4..5b7b39fea 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -1,9 +1,16 @@ /* * net/sched/sch_prio.c Simple 3-band priority "scheduler". * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ +#include <linux/config.h> +#include <linux/module.h> #include <asm/uaccess.h> #include <asm/system.h> #include <asm/bitops.h> @@ -28,32 +35,69 @@ #include <net/sock.h> #include <net/pkt_sched.h> -/* New N-band generic scheduler */ struct prio_sched_data { - int qbytes; int bands; - u8 prio2band[8]; - struct Qdisc *queues[8]; + struct tcf_proto *filter_list; + u8 prio2band[TC_PRIO_MAX+1]; + struct Qdisc *queues[TCQ_PRIO_BANDS]; }; + +static __inline__ unsigned prio_classify(struct sk_buff *skb, struct Qdisc *sch) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + struct tcf_result res; + + res.classid = skb->priority; + if (TC_H_MAJ(res.classid) != sch->handle) { + if (!q->filter_list || tc_classify(skb, q->filter_list, &res)) { + if (TC_H_MAJ(res.classid)) + res.classid = 0; + res.classid = q->prio2band[res.classid&TC_PRIO_MAX] + 1; + } + } + + return res.classid - 1; +} + static int prio_enqueue(struct sk_buff *skb, struct Qdisc* sch) { struct prio_sched_data *q = (struct prio_sched_data *)sch->data; - int prio = q->prio2band[skb->priority&7]; struct Qdisc *qdisc; - qdisc = q->queues[prio]; - if (qdisc->enqueue(skb, qdisc) == 0) { - q->qbytes += skb->len; + qdisc = q->queues[prio_classify(skb, sch)]; + + if (qdisc->enqueue(skb, qdisc) == 1) { + sch->stats.bytes += skb->len; + sch->stats.packets++; sch->q.qlen++; - return 0; + return 1; + } + sch->stats.drops++; + return 0; +} + + +static int +prio_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + struct Qdisc *qdisc; + + qdisc = q->queues[prio_classify(skb, sch)]; + + if (qdisc->ops->requeue(skb, qdisc) == 1) { + sch->q.qlen++; + return 1; } - return 1; + sch->stats.drops++; + return 0; } + static struct sk_buff * prio_dequeue(struct Qdisc* sch) { @@ -66,7 +110,6 @@ prio_dequeue(struct Qdisc* sch) qdisc = q->queues[prio]; skb = qdisc->dequeue(qdisc); if (skb) { - q->qbytes -= skb->len; sch->q.qlen--; return skb; } @@ -75,6 +118,24 @@ prio_dequeue(struct Qdisc* sch) } +static int +prio_drop(struct Qdisc* sch) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + int prio; + struct Qdisc *qdisc; + + for (prio = q->bands-1; prio >= 0; prio--) { + qdisc = q->queues[prio]; + if (qdisc->ops->drop(qdisc)) { + sch->q.qlen--; + return 1; + } + } + return 0; +} + + static void prio_reset(struct Qdisc* sch) { @@ -83,7 +144,7 @@ prio_reset(struct Qdisc* sch) for (prio=0; prio<q->bands; prio++) qdisc_reset(q->queues[prio]); - q->qbytes = 0; + sch->q.qlen = 0; } static void @@ -96,51 +157,205 @@ prio_destroy(struct Qdisc* sch) qdisc_destroy(q->queues[prio]); q->queues[prio] = &noop_qdisc; } + MOD_DEC_USE_COUNT; } -static int prio_init(struct Qdisc *sch, void *arg) +static int prio_init(struct Qdisc *sch, struct rtattr *opt) { - const static u8 prio2band[8] = { 1, 2, 2, 2, 1, 2, 0, 0 }; - struct prio_sched_data *q; + static const u8 prio2band[TC_PRIO_MAX+1] = + { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }; + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + unsigned mask = 0; int i; - q = (struct prio_sched_data *)sch->data; - q->bands = 3; - memcpy(q->prio2band, prio2band, sizeof(prio2band)); - for (i=0; i<q->bands; i++) - q->queues[i] = &noop_qdisc; + if (opt == NULL) { + q->bands = 3; + memcpy(q->prio2band, prio2band, sizeof(prio2band)); + mask = 7; + } else { + struct tc_prio_qopt *qopt = RTA_DATA(opt); + + if (opt->rta_len < RTA_LENGTH(sizeof(*qopt))) + return 
-EINVAL; + if (qopt->bands > TCQ_PRIO_BANDS) + return -EINVAL; + q->bands = qopt->bands; + for (i=0; i<=TC_PRIO_MAX; i++) { + if (qopt->priomap[i] >= q->bands) + return -EINVAL; + q->prio2band[i] = qopt->priomap[i]; + mask |= (1<<qopt->priomap[i]); + } + } + for (i=0; i<TCQ_PRIO_BANDS; i++) { + if (mask&(1<<i)) + q->queues[i] = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + if (q->queues[i] == NULL) + q->queues[i] = &noop_qdisc; + } + MOD_INC_USE_COUNT; + return 0; +} + +#ifdef CONFIG_RTNETLINK +static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + unsigned char *b = skb->tail; + struct tc_prio_qopt opt; + + opt.bands = q->bands; + memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1); + RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif + +static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + unsigned long band = arg - 1; + + if (band >= q->bands) + return -EINVAL; + + if (new == NULL) + new = &noop_qdisc; + + *old = xchg(&q->queues[band], new); + return 0; } -struct Qdisc_ops prio_ops = +static unsigned long prio_get(struct Qdisc *sch, u32 classid) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + unsigned long band = TC_H_MIN(classid); + + if (band - 1 >= q->bands) + return 0; + return band; +} + +static void prio_put(struct Qdisc *q, unsigned long cl) +{ + return; +} + +static int prio_change(struct Qdisc *sch, u32 handle, u32 parent, struct rtattr **tca, unsigned long *arg) +{ + unsigned long cl = *arg; + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + + if (cl - 1 > q->bands) + return -ENOENT; + return 0; +} + +static int prio_delete(struct Qdisc *sch, unsigned long cl) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + if (cl - 1 > q->bands) + return -ENOENT; + return 0; +} + + +#ifdef CONFIG_RTNETLINK +static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + + if (cl - 1 > q->bands) + return -ENOENT; + return 0; +} +#endif + +static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + int prio; + + if (arg->stop) + return; + + for (prio = 0; prio < q->bands; prio++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, prio+1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + + if (cl) + return NULL; + return &q->filter_list; +} + +static struct Qdisc_class_ops prio_class_ops = +{ + prio_graft, + prio_get, + prio_put, + prio_change, + prio_delete, + prio_walk, + + prio_find_tcf, + prio_get, + prio_put, + +#ifdef CONFIG_RTNETLINK + prio_dump_class, +#endif +}; + +struct Qdisc_ops prio_qdisc_ops = { NULL, + &prio_class_ops, "prio", - 0, sizeof(struct prio_sched_data), + prio_enqueue, prio_dequeue, + prio_requeue, + prio_drop, + + prio_init, prio_reset, prio_destroy, - prio_init, + +#ifdef CONFIG_RTNETLINK + prio_dump, +#endif }; #ifdef MODULE -#include <linux/module.h> + int init_module(void) { - int err; - - /* Load once and never free it. 
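prio_init() above accepts a tc_prio_qopt only if bands does not exceed TCQ_PRIO_BANDS and every priomap entry names an existing band, falling back to the classic three-band map when no option is supplied. A userspace sketch of the same acceptance test; the struct below mirrors only the two fields the patch reads, TC_PRIO_MAX is assumed to be 15, and the value 16 is merely a stand-in for TCQ_PRIO_BANDS:

    #include <stdio.h>

    #define TC_PRIO_MAX 15                 /* assumed: 16-entry priomap as in the patch */

    struct tc_prio_qopt {                  /* mirrors the fields prio_init() reads */
            int bands;
            unsigned char priomap[TC_PRIO_MAX + 1];
    };

    /* same checks as prio_init(): 0 if the option would be accepted, -1 otherwise */
    static int prio_opt_ok(const struct tc_prio_qopt *qopt, int max_bands)
    {
            int i;

            if (qopt->bands > max_bands)
                    return -1;
            for (i = 0; i <= TC_PRIO_MAX; i++)
                    if (qopt->priomap[i] >= qopt->bands)
                            return -1;
            return 0;
    }

    int main(void)
    {
            struct tc_prio_qopt ok  = { 3, { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 } };
            struct tc_prio_qopt bad = { 2, { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 } };

            printf("default map, 3 bands: %s\n", prio_opt_ok(&ok, 16)  ? "rejected" : "accepted");
            printf("same map, 2 bands:    %s\n", prio_opt_ok(&bad, 16) ? "rejected" : "accepted");
            return 0;
    }

(The rtattr length check that precedes these tests in the kernel is omitted here.)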
*/ - MOD_INC_USE_COUNT; - - err = register_qdisc(&prio_ops); - if (err) - MOD_DEC_USE_COUNT; - return err; + return register_qdisc(&prio_qdisc_ops); } void cleanup_module(void) { + unregister_qdisc(&prio_qdisc_ops); } + #endif diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 637288d99..56d1651f3 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -1,5 +1,5 @@ /* - * net/sched/sch_red.c Random Early Detection scheduler. + * net/sched/sch_red.c Random Early Detection queue. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -9,6 +9,8 @@ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ +#include <linux/config.h> +#include <linux/module.h> #include <asm/uaccess.h> #include <asm/system.h> #include <asm/bitops.h> @@ -62,32 +64,42 @@ Short description. and mark (drop) packet with this probability. Pb changes from 0 (at avg==th_min) to max_P (avg==th_max). - max_P should be small (not 1!). - - NB. SF&VJ assumed that Pb[avg] is linear function. I think it - is wrong. I'd make: - P[th_min] = 0, P[th_max] = 1; - dP/davg[th_min] = 0, dP/davg[th_max] = infinity, or a large number. - - I choose max_P as a number between 0.01 and 0.1, so that - C1 = max_P/(th_max-th_min) is power of two: C1 = 2^(-C1log) - - Parameters, settable by user (with default values): - - qmaxbytes=256K - hard limit on queue length, should be chosen >qth_max - to allow packet bursts. This parameter does not - affect algorithm behaviour and can be chosen - arbitrarily high (well, less than ram size) - Really, this limit will never be achieved - if RED works correctly. - qth_min=32K - qth_max=128K - qth_max should be at least 2*qth_min - Wlog=8 - log(1/W). - Alog=Wlog - fixed point position in th_min and th_max. - Rlog=10 - C1log=24 - C1log = trueC1log+Alog-Rlog - so that trueC1log=22 and max_P~0.02 - + max_P should be small (not 1), usually 0.01..0.02 is good value. + + max_P is chosen as a number, so that max_P/(th_max-th_min) + is negative power of two in order arithmetics to contain + only shifts. + + + Parameters, settable by user: + ----------------------------- + + limit - bytes (must be > qth_max + burst) + + Hard limit on queue length, should be chosen >qth_max + to allow packet bursts. This parameter does not + affect algorithm behaviour and can be chosen + arbitrarily high (well, less than ram size) + Really, this limit will never be achieved + if RED works correctly. + + qth_min - bytes (should be < qth_max/2) + qth_max - bytes (should be at least 2*qth_min and less limit) + Wlog - bits (<32) log(1/W). + Plog - bits (<32) + + Plog is related to max_P by formula: + + max_P = (qth_max-qth_min)/2^Plog; + + F.e. if qth_max=128K and qth_min=32K, then Plog=22 + corresponds to max_P=0.02 + + Scell_log + Stab + + Lookup table for log((1-W)^(t/t_ave). + NOTES: @@ -97,10 +109,10 @@ Upper bound on W. If you want to allow bursts of L packets of size S, you should choose W: - L + 1 -th_min/S < (1-(1-W)^L)/W - - For th_min/S = 32 + L + 1 - th_min/S < (1-(1-W)^L)/W + th_min/S = 32 th_min/S = 4 + log(W) L -1 33 -2 35 @@ -117,33 +129,24 @@ Upper bound on W. 
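The worked example quoted in the comment is easy to re-check: max_P = (qth_max - qth_min) / 2^Plog, so 128K/32K thresholds with Plog = 22 give roughly 0.023, i.e. the ~0.02 the text mentions. A trivial confirmation:

    #include <stdio.h>

    int main(void)
    {
            unsigned long qth_min = 32  * 1024;   /* bytes */
            unsigned long qth_max = 128 * 1024;   /* bytes */
            unsigned int  Plog    = 22;

            /* max_P = (qth_max - qth_min) / 2^Plog, as stated in the comment */
            double max_P = (double)(qth_max - qth_min) / (double)(1UL << Plog);

            printf("max_P = %.4f (comment says ~0.02)\n", max_P);
            return 0;
    }

Keeping that divisor a power of two is what lets red_enqueue() compare a shifted average against a random number masked with Rmask = 2^Plog - 1 instead of ever dividing.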
struct red_sched_data { /* Parameters */ - unsigned long qmaxbytes; /* HARD maximal queue length */ - unsigned long qth_min; /* Min average length threshold: A scaled */ - unsigned long qth_max; /* Max average length threshold: A scaled */ - char Alog; /* Point position in average lengths */ + u32 limit; /* HARD maximal queue length */ + u32 qth_min; /* Min average length threshold: A scaled */ + u32 qth_max; /* Max average length threshold: A scaled */ + u32 Rmask; + u32 Scell_max; char Wlog; /* log(W) */ - char Rlog; /* random number bits */ - char C1log; /* log(1/C1) */ - char Slog; - char Stab[256]; + char Plog; /* random number bits */ + char Scell_log; + u8 Stab[256]; /* Variables */ - unsigned long qbytes; /* Queue length in bytes */ unsigned long qave; /* Average queue length: A scaled */ int qcount; /* Packets since last random number generation */ - unsigned qR; /* Cached random number [0..1<Rlog) */ + u32 qR; /* Cached random number */ + psched_time_t qidlestart; /* Start of idle period */ }; -/* Stolen from igmp.c. */ - -static __inline__ unsigned red_random(int log) -{ - static unsigned long seed=152L; - seed=seed*69069L+1; - return (seed^jiffies)&((1<<log)-1); -} - static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch) { @@ -155,17 +158,15 @@ red_enqueue(struct sk_buff *skb, struct Qdisc* sch) long us_idle; PSCHED_SET_PASTPERFECT(q->qidlestart); PSCHED_GET_TIME(now); - us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, (256<<q->Slog)-1, 0); - -/* It is wrong, but I do not think that SF+VJ proposal is reasonable - and did not invented anything more clever 8) + us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max, 0); +/* The problem: ideally, average length queue recalcultion should be done over constant clock intervals. It is too expensive, so that calculation is driven by outgoing packets. When queue is idle we have to model this clock by hands. - SF+VJ proposed to "generate" m = (idletime/bandwidth)*average_pkt_size + SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth) dummy packets as burst after idle time, i.e. q->qave *= (1-W)^m @@ -175,129 +176,193 @@ red_enqueue(struct sk_buff *skb, struct Qdisc* sch) I believe, that a simpler model may be used here, but it is field for experiments. 
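The averaging itself is the update a few lines below, q->qave += backlog - (q->qave >> Wlog), which is an exponentially weighted moving average with weight W = 2^-Wlog kept in fixed point, scaled by 2^Wlog (the same scaling the init code applies to qth_min and qth_max). A quick numeric check that the integer update and the textbook EWMA agree; Wlog and the backlog figure are arbitrary:

    #include <stdio.h>

    int main(void)
    {
            unsigned int Wlog = 8;                    /* W = 2^-Wlog, per "Wlog ... log(1/W)" */
            unsigned long qave = 0;                   /* scaled average, as in the kernel */
            double avg = 0.0, W = 1.0 / (1 << Wlog);
            unsigned long backlog = 60000;            /* pretend the backlog sits at 60000 bytes */
            int i;

            for (i = 0; i < 2000; i++) {
                    qave += backlog - (qave >> Wlog);     /* the integer update from red_enqueue() */
                    avg = (1.0 - W) * avg + W * backlog;  /* textbook EWMA with the same W */
            }
            printf("kernel-style avg = %lu, float EWMA = %.1f\n", qave >> Wlog, avg);
            return 0;
    }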
*/ - q->qave >>= q->Stab[(us_idle>>q->Slog)&0xFF]; + q->qave >>= q->Stab[(us_idle>>q->Scell_log)&0xFF]; } - q->qave += ((q->qbytes<<q->Alog) - q->qave) >> q->Wlog; + q->qave += sch->stats.backlog - (q->qave >> q->Wlog); if (q->qave < q->qth_min) { enqueue: q->qcount = -1; - if (q->qbytes <= q->qmaxbytes) { - skb_queue_tail(&sch->q, skb); - q->qbytes += skb->len; + if (sch->stats.backlog <= q->limit) { + __skb_queue_tail(&sch->q, skb); + sch->stats.backlog += skb->len; + sch->stats.bytes += skb->len; + sch->stats.packets++; return 1; } drop: kfree_skb(skb); + sch->stats.drops++; return 0; } if (q->qave >= q->qth_max) { q->qcount = -1; + sch->stats.overlimits++; goto drop; } - q->qcount++; - if (q->qcount++) { - if ((((q->qave - q->qth_min)*q->qcount)>>q->C1log) < q->qR) + if (++q->qcount) { + if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR) goto enqueue; q->qcount = 0; - q->qR = red_random(q->Rlog); + q->qR = net_random()&q->Rmask; + sch->stats.overlimits++; goto drop; } - q->qR = red_random(q->Rlog); + q->qR = net_random()&q->Rmask; goto enqueue; } +static int +red_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct red_sched_data *q = (struct red_sched_data *)sch->data; + + PSCHED_SET_PASTPERFECT(q->qidlestart); + + __skb_queue_head(&sch->q, skb); + sch->stats.backlog += skb->len; + return 1; +} + static struct sk_buff * red_dequeue(struct Qdisc* sch) { struct sk_buff *skb; struct red_sched_data *q = (struct red_sched_data *)sch->data; - skb = skb_dequeue(&sch->q); + skb = __skb_dequeue(&sch->q); if (skb) { - q->qbytes -= skb->len; + sch->stats.backlog -= skb->len; return skb; } PSCHED_GET_TIME(q->qidlestart); return NULL; } -static void -red_reset(struct Qdisc* sch) +static int +red_drop(struct Qdisc* sch) { - struct red_sched_data *q = (struct red_sched_data *)sch->data; struct sk_buff *skb; + struct red_sched_data *q = (struct red_sched_data *)sch->data; - while((skb=skb_dequeue(&sch->q))!=NULL) { - q->qbytes -= skb->len; + skb = __skb_dequeue_tail(&sch->q); + if (skb) { + sch->stats.backlog -= skb->len; + sch->stats.drops++; kfree_skb(skb); + return 1; } - if (q->qbytes) { - printk("red_reset: qbytes=%lu\n", q->qbytes); - q->qbytes = 0; - } + PSCHED_GET_TIME(q->qidlestart); + return 0; +} + +static void red_reset(struct Qdisc* sch) +{ + struct red_sched_data *q = (struct red_sched_data *)sch->data; + struct sk_buff *skb; + + while((skb=__skb_dequeue(&sch->q))!=NULL) + kfree_skb(skb); + sch->stats.backlog = 0; PSCHED_SET_PASTPERFECT(q->qidlestart); q->qave = 0; q->qcount = -1; } -static int red_init(struct Qdisc *sch, struct pschedctl *pctl) +static int red_init(struct Qdisc *sch, struct rtattr *opt) { - struct red_sched_data *q; - struct redctl *ctl = (struct redctl*)pctl->args; - - q = (struct red_sched_data *)sch->data; - - if (pctl->arglen < sizeof(struct redctl)) + struct red_sched_data *q = (struct red_sched_data *)sch->data; + struct rtattr *tb[TCA_RED_STAB]; + struct tc_red_qopt *ctl; + + if (opt == NULL || + rtattr_parse(tb, TCA_RED_STAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) || + tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 || + RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) || + RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256) return -EINVAL; + ctl = RTA_DATA(tb[TCA_RED_PARMS-1]); + q->Wlog = ctl->Wlog; - q->Alog = ctl->Alog; - q->Rlog = ctl->Rlog; - q->C1log = ctl->C1log; - q->Slog = ctl->Slog; - q->qth_min = ctl->qth_min; - q->qth_max = ctl->qth_max; - q->qmaxbytes = ctl->qmaxbytes; - memcpy(q->Stab, ctl->Stab, 256); + q->Plog = ctl->Plog; + q->Rmask = ctl->Plog < 
32 ? ((1<<ctl->Plog) - 1) : ~0UL; + q->Scell_log = ctl->Scell_log; + q->Scell_max = (256<<q->Scell_log)-1; + q->qth_min = ctl->qth_min<<ctl->Wlog; + q->qth_max = ctl->qth_max<<ctl->Wlog; + q->limit = ctl->limit; + memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256); q->qcount = -1; PSCHED_SET_PASTPERFECT(q->qidlestart); + MOD_INC_USE_COUNT; return 0; } -struct Qdisc_ops red_ops = +#ifdef CONFIG_RTNETLINK +static int red_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct red_sched_data *q = (struct red_sched_data *)sch->data; + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_red_qopt opt; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + opt.limit = q->limit; + opt.qth_min = q->qth_min>>q->Wlog; + opt.qth_max = q->qth_max>>q->Wlog; + opt.Wlog = q->Wlog; + opt.Plog = q->Plog; + opt.Scell_log = q->Scell_log; + RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt); + rta->rta_len = skb->tail - b; + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif + +static void red_destroy(struct Qdisc *sch) +{ + MOD_DEC_USE_COUNT; +} + +struct Qdisc_ops red_qdisc_ops = { NULL, + NULL, "red", - 0, sizeof(struct red_sched_data), + red_enqueue, red_dequeue, - red_reset, - NULL, + red_requeue, + red_drop, + red_init, - NULL + red_reset, + red_destroy, + +#ifdef CONFIG_RTNETLINK + red_dump, +#endif }; #ifdef MODULE -#include <linux/module.h> int init_module(void) { - int err; - - /* Load once and never free it. */ - MOD_INC_USE_COUNT; - - err = register_qdisc(&red_ops); - if (err) - MOD_DEC_USE_COUNT; - return err; + return register_qdisc(&red_qdisc_ops); } void cleanup_module(void) { + unregister_qdisc(&red_qdisc_ops); } #endif diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 7a90df655..7cc2b6e5f 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -1,5 +1,5 @@ /* - * net/sched/sch_sfq.c Stochastic Fairness Queueing scheduler. + * net/sched/sch_sfq.c Stochastic Fairness Queueing discipline. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -9,6 +9,8 @@ * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ +#include <linux/config.h> +#include <linux/module.h> #include <asm/uaccess.h> #include <asm/system.h> #include <asm/bitops.h> @@ -30,6 +32,7 @@ #include <linux/notifier.h> #include <linux/init.h> #include <net/ip.h> +#include <linux/ipv6.h> #include <net/route.h> #include <linux/skbuff.h> #include <net/sock.h> @@ -84,14 +87,12 @@ scattered over different locations. It is not good, but it allowed to put it into 4K. - It is easy to increase these values. + It is easy to increase these values, but not in flight. */ #define SFQ_DEPTH 128 #define SFQ_HASH_DIVISOR 1024 -#define SFQ_HASH(a) 0 - /* This type should contain at least SFQ_DEPTH*2 values */ typedef unsigned char sfq_index; @@ -104,9 +105,12 @@ struct sfq_head struct sfq_sched_data { /* Parameters */ + int perturb_period; unsigned quantum; /* Allotment per round: MUST BE >= MTU */ /* Variables */ + struct timer_list perturb_timer; + int perturbation; sfq_index tail; /* Index of current slot in round */ sfq_index max_depth; /* Maximal depth */ @@ -118,10 +122,59 @@ struct sfq_sched_data struct sfq_head dep[SFQ_DEPTH*2]; /* Linked list of slots, indexed by depth */ }; +static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1) +{ + int pert = q->perturbation; + + /* Have we any rotation primitives? If not, WHY? 
*/ + h ^= (h1<<pert) ^ (h1>>(0x1F - pert)); + h ^= h>>10; + return h & 0x3FF; +} + +#ifndef IPPROTO_ESP +#define IPPROTO_ESP 50 +#endif + +static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) +{ + u32 h, h2; + + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + { + struct iphdr *iph = skb->nh.iph; + h = iph->daddr; + h2 = iph->saddr^iph->protocol; + if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && + (iph->protocol == IPPROTO_TCP || + iph->protocol == IPPROTO_UDP || + iph->protocol == IPPROTO_ESP)) + h2 ^= *(((u32*)iph) + iph->ihl); + break; + } + case __constant_htons(ETH_P_IPV6): + { + struct ipv6hdr *iph = skb->nh.ipv6h; + h = iph->daddr.s6_addr32[3]; + h2 = iph->saddr.s6_addr32[3]^iph->nexthdr; + if (iph->nexthdr == IPPROTO_TCP || + iph->nexthdr == IPPROTO_UDP || + iph->nexthdr == IPPROTO_ESP) + h2 ^= *(u32*)&iph[1]; + break; + } + default: + h = (u32)(unsigned long)skb->dst^skb->protocol; + h2 = (u32)(unsigned long)skb->sk; + } + return sfq_fold_hash(q, h, h2); +} + extern __inline__ void sfq_link(struct sfq_sched_data *q, sfq_index x) { sfq_index p, n; - int d = q->qs[x].qlen; + int d = q->qs[x].qlen + SFQ_DEPTH; p = d; n = q->dep[d].next; @@ -161,47 +214,49 @@ extern __inline__ void sfq_inc(struct sfq_sched_data *q, sfq_index x) sfq_link(q, x); } -static __inline__ void sfq_drop(struct sfq_sched_data *q) +static int sfq_drop(struct Qdisc *sch) { - struct sk_buff *skb; + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; sfq_index d = q->max_depth; + struct sk_buff *skb; /* Queue is full! Find the longest slot and drop a packet from it */ - if (d != 1) { - sfq_index x = q->dep[d].next; + if (d > 1) { + sfq_index x = q->dep[d+SFQ_DEPTH].next; skb = q->qs[x].prev; __skb_unlink(skb, &q->qs[x]); kfree_skb(skb); sfq_dec(q, x); -/* sch->q.qlen--; - */ - return; + sch->stats.drops++; + return 1; } - /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */ + if (d == 1) { + /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. 
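For reference, sfq_hash() above reduces an IPv4 packet to the pair (daddr, saddr ^ protocol, plus the port dword for unfragmented TCP/UDP/ESP) and folds it into one of SFQ_HASH_DIVISOR = 1024 buckets using the current perturbation, so re-randomizing the perturbation from the timer reshuffles which flows share a bucket. A standalone re-run of the fold with made-up addresses and ports:

    #include <stdio.h>

    typedef unsigned int u32;

    /* same folding as sfq_fold_hash() in the patch: bucket in [0, 1024) */
    static unsigned fold(u32 h, u32 h1, int pert)
    {
            h ^= (h1 << pert) ^ (h1 >> (0x1F - pert));
            h ^= h >> 10;
            return h & 0x3FF;
    }

    int main(void)
    {
            /* illustrative IPv4 flow key: h = daddr, h1 = saddr ^ protocol ^ ports */
            u32 h  = 0xc0a80001;                   /* made-up destination address */
            u32 h1 = 0xc0a80002 ^ 6 ^ 0x0050239a;  /* made-up source, TCP, made-up ports */
            int pert;

            for (pert = 0; pert < 4; pert++)
                    printf("perturbation %d -> bucket %u\n", pert, fold(h, h1, pert));
            return 0;
    }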
*/ + d = q->next[q->tail]; + q->next[q->tail] = q->next[d]; + q->allot[q->next[d]] += q->quantum; + skb = q->qs[d].prev; + __skb_unlink(skb, &q->qs[d]); + kfree_skb(skb); + sfq_dec(q, d); + sch->q.qlen--; + q->ht[q->hash[d]] = SFQ_DEPTH; + sch->stats.drops++; + return 1; + } - d = q->next[q->tail]; - q->next[q->tail] = q->next[d]; - q->allot[q->next[d]] += q->quantum; - skb = q->qs[d].prev; - __skb_unlink(skb, &q->qs[d]); - kfree_skb(skb); - sfq_dec(q, d); -/* - sch->q.qlen--; - */ - q->ht[q->hash[d]] = SFQ_DEPTH; - return; + return 0; } static int sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch) { struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; - unsigned hash = SFQ_HASH(skb); + unsigned hash = sfq_hash(q, skb); sfq_index x; x = q->ht[hash]; @@ -222,13 +277,52 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch) q->tail = x; } } + if (++sch->q.qlen < SFQ_DEPTH-1) { + sch->stats.bytes += skb->len; + sch->stats.packets++; + return 1; + } + + sfq_drop(sch); + return 0; +} + +static int +sfq_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; + unsigned hash = sfq_hash(q, skb); + sfq_index x; + + x = q->ht[hash]; + if (x == SFQ_DEPTH) { + q->ht[hash] = x = q->dep[SFQ_DEPTH].next; + q->hash[x] = hash; + } + __skb_queue_head(&q->qs[x], skb); + sfq_inc(q, x); + if (q->qs[x].qlen == 1) { /* The flow is new */ + if (q->tail == SFQ_DEPTH) { /* It is the first flow */ + q->tail = x; + q->next[x] = x; + q->allot[x] = q->quantum; + } else { + q->next[x] = q->next[q->tail]; + q->next[q->tail] = x; + q->tail = x; + } + } if (++sch->q.qlen < SFQ_DEPTH-1) return 1; - sfq_drop(q); + sch->stats.drops++; + sfq_drop(sch); return 0; } + + + static struct sk_buff * sfq_dequeue(struct Qdisc* sch) { @@ -273,13 +367,28 @@ sfq_reset(struct Qdisc* sch) kfree_skb(skb); } +static void sfq_perturbation(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; + + q->perturbation = net_random()&0x1F; + q->perturb_timer.expires = jiffies + q->perturb_period; -static int sfq_open(struct Qdisc *sch, void *arg) + if (q->perturb_period) { + q->perturb_timer.expires = jiffies + q->perturb_period; + add_timer(&q->perturb_timer); + } +} + +static int sfq_init(struct Qdisc *sch, struct rtattr *opt) { - struct sfq_sched_data *q; + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; int i; - q = (struct sfq_sched_data *)sch->data; + q->perturb_timer.data = (unsigned long)sch; + q->perturb_timer.function = sfq_perturbation; + init_timer(&q->perturb_timer); for (i=0; i<SFQ_HASH_DIVISOR; i++) q->ht[i] = SFQ_DEPTH; @@ -290,43 +399,89 @@ static int sfq_open(struct Qdisc *sch, void *arg) } q->max_depth = 0; q->tail = SFQ_DEPTH; - q->quantum = sch->dev->mtu; - if (sch->dev->hard_header) - q->quantum += sch->dev->hard_header_len; + if (opt == NULL) { + q->quantum = sch->dev->mtu; + q->perturb_period = 0; + if (sch->dev->hard_header) + q->quantum += sch->dev->hard_header_len; + } else { + struct tc_sfq_qopt *ctl = RTA_DATA(opt); + if (opt->rta_len < RTA_LENGTH(sizeof(*ctl))) + return -EINVAL; + q->quantum = ctl->quantum ? 
: psched_mtu(sch->dev); + q->perturb_period = ctl->perturb_period*HZ; + /* The rest is compiled in */ + } for (i=0; i<SFQ_DEPTH; i++) sfq_link(q, i); + if (q->perturb_period) { + q->perturb_timer.expires = jiffies + q->perturb_period; + add_timer(&q->perturb_timer); + } + MOD_INC_USE_COUNT; return 0; } +static void sfq_destroy(struct Qdisc *sch) +{ + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; + del_timer(&q->perturb_timer); + MOD_DEC_USE_COUNT; +} + +#ifdef CONFIG_RTNETLINK +static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; + unsigned char *b = skb->tail; + struct tc_sfq_qopt opt; + + opt.quantum = q->quantum; + opt.perturb_period = q->perturb_period/HZ; + + opt.limit = SFQ_DEPTH; + opt.divisor = SFQ_HASH_DIVISOR; + opt.flows = SFQ_DEPTH; + + RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif -struct Qdisc_ops sfq_ops = +struct Qdisc_ops sfq_qdisc_ops = { NULL, + NULL, "sfq", - 0, sizeof(struct sfq_sched_data), + sfq_enqueue, sfq_dequeue, + sfq_requeue, + sfq_drop, + + sfq_init, sfq_reset, - NULL, - sfq_open, + sfq_destroy, + +#ifdef CONFIG_RTNETLINK + sfq_dump, +#endif }; #ifdef MODULE int init_module(void) { - int err; - - /* Load once and never free it. */ - MOD_INC_USE_COUNT; - - err = register_qdisc(&sfq_ops); - if (err) - MOD_DEC_USE_COUNT; - return err; + return register_qdisc(&sfq_qdisc_ops); } void cleanup_module(void) { + unregister_qdisc(&sfq_qdisc_ops); } #endif diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index b4f141761..109ae7bec 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -1,5 +1,5 @@ /* - * net/sched/sch_tbf.c Token Bucket Filter. + * net/sched/sch_tbf.c Token Bucket Filter queue. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -10,6 +10,8 @@ * */ +#include <linux/config.h> +#include <linux/module.h> #include <asm/uaccess.h> #include <asm/system.h> #include <asm/bitops.h> @@ -39,69 +41,91 @@ ======================================= SOURCE. + ------- None. - ALGORITHM. + Description. + ------------ + + Data flow obeys TBF with rate R and depth B, if for any + time interval t_i...t_f number of transmitted bits + does not exceed B + R*(t_f-t_i). + + Packetized version of this definition: + sequence of packets of sizes s_i served at moments t_i + obeys TBF, if for any i<=k: + + s_i+....+s_k <= B + R*(t_k - t_i) + + Algorithm. + ---------- + + Let N(t_i) be B/R initially and N(t) grows continuously with time as: + + N(t+delta) = min{B/R, N(t) + delta} + + If the first packet in queue has length S, it may be + transmited only at the time t_* when S/R <= N(t_*), + and in this case N(t) jumps: + + N(t_* + 0) = N(t_* - 0) - S/R. + + - Sequence of packets satisfy token bucket filter with - rate $r$ and depth $b$, if all the numbers defined by: - \begin{eqnarray*} - n_0 &=& b, \\ - n_i &=& {\rm max} ( b, n_{i-1} + r*(t_i-t_{i-1}) - L_i ), - \end{eqnarray*} - where $t_i$ --- departure time of $i$-th packet and - $L_i$ -- its length, never less than zero. + Actually, QoS requires two TBF to be applied to data stream. + One of them controls steady state burst size, another + with rate P (peak rate) and depth M (equal to link MTU) + limits bursts at smaller time scale. 
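The defining inequality above is worth seeing with numbers: over any interval t a conforming flow may move at most B + R*t, however the packets are spaced (units only need to be consistent; bytes and seconds are used below, and the figures are purely illustrative):

    #include <stdio.h>

    int main(void)
    {
            double R = 125000.0;    /* illustrative rate: 1 Mbit/s in bytes per second */
            double B = 10000.0;     /* illustrative bucket depth in bytes */
            int i;

            for (i = 0; i <= 5; i++) {
                    double t = 0.02 * i;
                    /* bytes admitted over an interval t may not exceed B + R*t */
                    printf("t = %4.2fs -> at most %6.0f bytes\n", t, B + R * t);
            }
            return 0;
    }

tbf_dequeue() below does the same bookkeeping in the scheduler's time domain: L2T() indexes the configured rate table by packet length, toks is clipped to q->buffer and, when a peak rate table is set, ptoks is clipped to q->mtu.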
- It is convenient to rescale $n_i$ by factor $r$, so - that the sequence has "canonical" form: - \[ - n_0 = b/r, - n_i = max { b/r, n_{i-1} + t_i - t_{i-1} - L_i/r }, - \] + Apparently, P>R, and B>M. If P is infinity, this double + TBF is equivalent to single one. + + When TBF works in reshaping mode, latency is estimated as: + + lat = max ((L-B)/R, (L-M)/P) - If a packet has n_i < 0, we throttle filter - by $-n_i$ usecs. NOTES. + ------ If TBF throttles, it starts watchdog timer, which will wake up it - after 0...10 msec. + when it will be ready to transmit. + Note, that minimal timer resolution is 1/HZ. If no new packets will arrive during this period, or device will not be awaken by EOI for previous packet, - tbf could stop its activity for 10 msec. + tbf could stop its activity for 1/HZ. + - It means that tbf will sometimes introduce pathological - 10msec delays to flow corresponding to rate*10msec bytes. - For 10Mbit/sec flow it is about 12Kb, on 100Mbit/sec -- ~100Kb. - This number puts lower reasonbale bound on token bucket depth, - but even if depth is larger traffic is erratic at large rates. + It means, that with depth B, the maximal rate is - This problem is not specific for THIS implementation. Really, - there exists statement that any attempt to shape traffic - in transit will increase delays and jitter much more than - we expected naively. + R_crit = B*HZ - Particularily, it means that delay/jitter sensitive traffic - MUST NOT be shaped. Cf. CBQ (wrong) and CSZ (correct) approaches. + F.e. for 10Mbit ethernet and HZ=100 minimal allowed B is ~10Kbytes. + + Note, that peak rate TBF is much more tough: with MTU 1500 + P_crit = 150Kbytes/sec. So that, if you need greater peak + rates, use alpha with HZ=1000 :-) */ struct tbf_sched_data { /* Parameters */ - int cell_log; /* 1<<cell_log is quantum of packet size */ - unsigned long L_tab[256]; /* Lookup table for L/B values */ - unsigned long depth; /* Token bucket depth/B: MUST BE >= MTU/B */ - unsigned long max_bytes; /* Maximal length of backlog: bytes */ + u32 limit; /* Maximal length of backlog: bytes */ + u32 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */ + u32 mtu; + struct qdisc_rate_table *R_tab; + struct qdisc_rate_table *P_tab; /* Variables */ - unsigned long bytes; /* Current length of backlog */ - unsigned long tokens; /* Current number of tokens */ + long tokens; /* Current number of B tokens */ + long ptokens; /* Current number of P tokens */ psched_time_t t_c; /* Time check-point */ struct timer_list wd_timer; /* Watchdog timer */ }; -#define L2T(q,L) ((q)->L_tab[(L)>>(q)->cell_log]) +#define L2T(q,L) ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log]) +#define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log]) static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch) @@ -109,30 +133,56 @@ tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch) struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; __skb_queue_tail(&sch->q, skb); - if ((q->bytes += skb->len) <= q->max_bytes) + if ((sch->stats.backlog += skb->len) <= q->limit) { + sch->stats.bytes += skb->len; + sch->stats.packets++; return 1; + } /* Drop action: undo the things that we just made, * i.e. 
make tail drop */ __skb_unlink(skb, &sch->q); - q->bytes -= skb->len; - kfree_skb(skb); + sch->stats.backlog -= skb->len; + sch->stats.drops++; +#ifdef CONFIG_NET_CLS_POLICE + if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch)) +#endif + kfree_skb(skb); + return 0; +} + +static int +tbf_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + __skb_queue_head(&sch->q, skb); + sch->stats.backlog += skb->len; + return 1; +} + +static int +tbf_drop(struct Qdisc* sch) +{ + struct sk_buff *skb; + + skb = __skb_dequeue_tail(&sch->q); + if (skb) { + sch->stats.backlog -= skb->len; + sch->stats.drops++; + kfree_skb(skb); + return 1; + } return 0; } static void tbf_watchdog(unsigned long arg) { struct Qdisc *sch = (struct Qdisc*)arg; - struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; - - q->wd_timer.function = NULL; qdisc_wakeup(sch->dev); } - static struct sk_buff * tbf_dequeue(struct Qdisc* sch) { @@ -144,19 +194,42 @@ tbf_dequeue(struct Qdisc* sch) if (skb) { psched_time_t now; long toks; + long ptoks = 0; PSCHED_GET_TIME(now); - toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->depth, 0) - + q->tokens - L2T(q,skb->len); + toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->buffer, 0); - if (toks >= 0) { + if (q->P_tab) { + ptoks = toks + q->ptokens; + if (ptoks > (long)q->mtu) + ptoks = q->mtu; + ptoks -= L2T_P(q, skb->len); + } + toks += q->tokens; + if (toks > (long)q->buffer) + toks = q->buffer; + toks -= L2T(q, skb->len); + + if ((toks|ptoks) >= 0) { q->t_c = now; - q->tokens = toks <= q->depth ? toks : q->depth; - q->bytes -= skb->len; + q->tokens = toks; + q->ptokens = ptoks; + sch->stats.backlog -= skb->len; return skb; } + if (!sch->dev->tbusy) { + long delay = PSCHED_US2JIFFIE(max(-toks, -ptoks)); + + if (delay == 0) + delay = 1; + + del_timer(&q->wd_timer); + q->wd_timer.expires = jiffies + delay; + add_timer(&q->wd_timer); + } + /* Maybe, we have in queue a shorter packet, which can be sent now. It sounds cool, but, however, wrong in principle. @@ -164,17 +237,12 @@ tbf_dequeue(struct Qdisc* sch) Really, if we splitted flow to independent subflows, it would be very good solution. - Look at sch_csz.c. + It is main idea of all FQ algorithms + (cf. 
CSZ, HPFQ, HFCS) */ __skb_queue_head(&sch->q, skb); - if (!sch->dev->tbusy) { - if (q->wd_timer.function) - del_timer(&q->wd_timer); - q->wd_timer.function = tbf_watchdog; - q->wd_timer.expires = jiffies + PSCHED_US2JIFFIE(-toks); - add_timer(&q->wd_timer); - } + sch->stats.overlimits++; } return NULL; } @@ -184,69 +252,135 @@ static void tbf_reset(struct Qdisc* sch) { struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; - struct sk_buff *skb; - while ((skb = __skb_dequeue(&sch->q)) != NULL) - kfree_skb(skb); - q->bytes = 0; + skb_queue_purge(&sch->q); + sch->stats.backlog = 0; PSCHED_GET_TIME(q->t_c); - q->tokens = q->depth; - if (q->wd_timer.function) { - del_timer(&q->wd_timer); - q->wd_timer.function = NULL; - } + q->tokens = q->buffer; + q->ptokens = q->mtu; + del_timer(&q->wd_timer); } -static int tbf_init(struct Qdisc* sch, void *arg) +static int tbf_init(struct Qdisc* sch, struct rtattr *opt) { struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; - struct tbfctl *ctl = (struct tbfctl*)arg; + struct rtattr *tb[TCA_TBF_PTAB]; + struct tc_tbf_qopt *qopt; + + MOD_INC_USE_COUNT; + + if (opt == NULL || + rtattr_parse(tb, TCA_TBF_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) || + tb[TCA_TBF_PARMS-1] == NULL || + RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt)) { + MOD_DEC_USE_COUNT; + return -EINVAL; + } + + qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]); + q->R_tab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]); + if (q->R_tab == NULL) { + MOD_DEC_USE_COUNT; + return -EINVAL; + } + + if (qopt->peakrate.rate) { + q->P_tab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_PTAB-1]); + if (q->P_tab == NULL) { + MOD_DEC_USE_COUNT; + qdisc_put_rtab(q->R_tab); + return -EINVAL; + } + } PSCHED_GET_TIME(q->t_c); init_timer(&q->wd_timer); - q->wd_timer.function = NULL; + q->wd_timer.function = tbf_watchdog; q->wd_timer.data = (unsigned long)sch; - if (ctl) { - q->max_bytes = ctl->bytes; - q->depth = ctl->depth; - q->tokens = q->tokens; - q->cell_log = ctl->cell_log; - memcpy(q->L_tab, ctl->L_tab, 256*sizeof(unsigned long)); - } + q->limit = qopt->limit; + q->mtu = qopt->mtu; + if (q->mtu == 0) + q->mtu = psched_mtu(sch->dev); + q->buffer = qopt->buffer; + q->tokens = q->buffer; + q->ptokens = q->mtu; return 0; } -struct Qdisc_ops tbf_ops = +static void tbf_destroy(struct Qdisc *sch) +{ + struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; + + del_timer(&q->wd_timer); + + if (q->P_tab) + qdisc_put_rtab(q->P_tab); + if (q->R_tab) + qdisc_put_rtab(q->R_tab); + + MOD_DEC_USE_COUNT; +} + +#ifdef CONFIG_RTNETLINK +static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) { + struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data; + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_tbf_qopt opt; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + opt.limit = q->limit; + opt.rate = q->R_tab->rate; + if (q->P_tab) + opt.peakrate = q->P_tab->rate; + else + memset(&opt.peakrate, 0, sizeof(opt.peakrate)); + opt.mtu = q->mtu; + opt.buffer = q->buffer; + RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt); + rta->rta_len = skb->tail - b; + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif + +struct Qdisc_ops tbf_qdisc_ops = +{ + NULL, NULL, "tbf", - 0, sizeof(struct tbf_sched_data), + tbf_enqueue, tbf_dequeue, - tbf_reset, - NULL, + tbf_requeue, + tbf_drop, + tbf_init, - NULL, + tbf_reset, + tbf_destroy, + +#ifdef CONFIG_RTNETLINK + tbf_dump, +#endif }; #ifdef MODULE -#include <linux/module.h> int init_module(void) { - int 
err; - - /* Load once and never free it. */ - MOD_INC_USE_COUNT; - - err = register_qdisc(&tbf_ops); - if (err) - MOD_DEC_USE_COUNT; - return err; + return register_qdisc(&tbf_qdisc_ops); } void cleanup_module(void) { + unregister_qdisc(&tbf_qdisc_ops); } #endif diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c new file mode 100644 index 000000000..414bbdfb1 --- /dev/null +++ b/net/sched/sch_teql.c @@ -0,0 +1,474 @@ +/* net/sched/sch_teql.c "True" (or "trivial") link equalizer. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <linux/init.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +/* + How to setup it. + ---------------- + + After loading this module you will find new device teqlN + and new qdisc with the same name. To join a slave to equalizer + you should just set this qdisc on a device f.e. + + # tc qdisc add dev eth0 root teql0 + # tc qdisc add dev eth1 root teql0 + + That's all. Full PnP 8) + + Applicability. + -------------- + + 1. Slave devices MUST be active devices i.e. must raise tbusy + signal and generate EOI event. If you want to equalize virtual devices + sort of tunnels, use normal eql device. + 2. This device puts no limitations on physical slave characteristics + f.e. it will equalize 9600baud line and 100Mb ethernet perfectly :-) + Certainly, large difference in link speeds will make resulting eqalized + link unusable, because of huge packet reordering. I estimated upper + useful difference as ~10 times. + 3. If slave requires address resolution, only protocols using + neighbour cache (IPv4/IPv6) will work over equalized link. + Another protocols still are allowed to use slave device directly, + which will not break load balancing, though native slave + traffic will have the highest priority. 
+ */ + +struct teql_master +{ + struct Qdisc_ops qops; + struct device dev; + struct Qdisc *slaves; + struct net_device_stats stats; + char name[IFNAMSIZ]; +}; + +struct teql_sched_data +{ + struct Qdisc *next; + struct teql_master *m; + struct neighbour *ncache; + struct sk_buff_head q; +}; + +#define NEXT_SLAVE(q) (((struct teql_sched_data*)((q)->data))->next) + +#define FMASK (IFF_BROADCAST|IFF_POINTOPOINT|IFF_BROADCAST) + +/* "teql*" qdisc routines */ + +static int +teql_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct device *dev = sch->dev; + struct teql_sched_data *q = (struct teql_sched_data *)sch->data; + + __skb_queue_tail(&q->q, skb); + if (q->q.qlen <= dev->tx_queue_len) { + sch->stats.bytes += skb->len; + sch->stats.packets++; + return 1; + } + + __skb_unlink(skb, &q->q); + kfree_skb(skb); + sch->stats.drops++; + return 0; +} + +static int +teql_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct teql_sched_data *q = (struct teql_sched_data *)sch->data; + + __skb_queue_head(&q->q, skb); + return 1; +} + +static struct sk_buff * +teql_dequeue(struct Qdisc* sch) +{ + struct teql_sched_data *dat = (struct teql_sched_data *)sch->data; + struct sk_buff *skb; + + skb = __skb_dequeue(&dat->q); + if (skb == NULL) { + struct device *m = dat->m->dev.qdisc->dev; + if (m) { + m->tbusy = 0; + dat->m->slaves = sch; + qdisc_restart(m); + } + } + sch->q.qlen = dat->q.qlen + dat->m->dev.qdisc->q.qlen; + return skb; +} + +static __inline__ void +teql_neigh_release(struct neighbour *n) +{ + if (n) + neigh_release(n); +} + +static void +teql_reset(struct Qdisc* sch) +{ + struct teql_sched_data *dat = (struct teql_sched_data *)sch->data; + + skb_queue_purge(&dat->q); + sch->q.qlen = 0; + teql_neigh_release(xchg(&dat->ncache, NULL)); +} + +static void +teql_destroy(struct Qdisc* sch) +{ + struct Qdisc *q, *prev; + struct teql_sched_data *dat = (struct teql_sched_data *)sch->data; + struct teql_master *master = dat->m; + + if ((prev = master->slaves) != NULL) { + do { + q = NEXT_SLAVE(prev); + if (q == sch) { + NEXT_SLAVE(prev) = NEXT_SLAVE(q); + if (q == master->slaves) { + master->slaves = NEXT_SLAVE(q); + if (q == master->slaves) + master->slaves = NULL; + } + skb_queue_purge(&dat->q); + teql_neigh_release(xchg(&dat->ncache, NULL)); + break; + } + + } while ((prev = q) != master->slaves); + } + + MOD_DEC_USE_COUNT; +} + +static int teql_qdisc_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct device *dev = sch->dev; + struct teql_master *m = (struct teql_master*)sch->ops; + struct teql_sched_data *q = (struct teql_sched_data *)sch->data; + + if (dev->hard_header_len > m->dev.hard_header_len) + return -EINVAL; + + q->m = m; + + skb_queue_head_init(&q->q); + + if (m->slaves) { + if (m->dev.flags & IFF_UP) { + if ((m->dev.flags&IFF_POINTOPOINT && !(dev->flags&IFF_POINTOPOINT)) + || (m->dev.flags&IFF_BROADCAST && !(dev->flags&IFF_BROADCAST)) + || (m->dev.flags&IFF_MULTICAST && !(dev->flags&IFF_MULTICAST)) + || dev->mtu < m->dev.mtu) + return -EINVAL; + } else { + if (!(dev->flags&IFF_POINTOPOINT)) + m->dev.flags &= ~IFF_POINTOPOINT; + if (!(dev->flags&IFF_BROADCAST)) + m->dev.flags &= ~IFF_BROADCAST; + if (!(dev->flags&IFF_MULTICAST)) + m->dev.flags &= ~IFF_MULTICAST; + if (dev->mtu < m->dev.mtu) + m->dev.mtu = dev->mtu; + } + q->next = NEXT_SLAVE(m->slaves); + NEXT_SLAVE(m->slaves) = sch; + } else { + q->next = sch; + m->slaves = sch; + m->dev.mtu = dev->mtu; + m->dev.flags = (m->dev.flags&~FMASK)|(dev->flags&FMASK); + } + + MOD_INC_USE_COUNT; + return 0; +} + +/* "teql*" 
netdevice routines */ + +static int +__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct device *dev) +{ + struct teql_sched_data *q = (void*)dev->qdisc->data; + struct neighbour *mn = skb->dst->neighbour; + struct neighbour *n = q->ncache; + + if (mn->tbl == NULL) + return -EINVAL; + if (n && n->tbl == mn->tbl && + memcmp(n->primary_key, mn->primary_key, mn->tbl->key_len) == 0) { + atomic_inc(&n->refcnt); + } else { + n = __neigh_lookup(mn->tbl, mn->primary_key, dev, 1); + if (n == NULL) + return -ENOBUFS; + } + if (neigh_event_send(n, skb_res) == 0) { + if (dev->hard_header(skb, dev, ntohs(skb->protocol), n->ha, NULL, skb->len) < 0) { + neigh_release(n); + return -EINVAL; + } + teql_neigh_release(xchg(&q->ncache, n)); + return 0; + } + neigh_release(n); + return (skb_res != NULL); +} + +static __inline__ int +teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct device *dev) +{ + if (dev->hard_header == NULL || + skb->dst == NULL || + skb->dst->neighbour == NULL) + return 0; + return __teql_resolve(skb, skb_res, dev); +} + +static int teql_master_xmit(struct sk_buff *skb, struct device *dev) +{ + struct teql_master *master = (void*)dev->priv; + struct Qdisc *start, *q; + int busy; + int nores; + struct sk_buff *skb_res = NULL; + + dev->tbusy = 1; + + start = master->slaves; + +restart: + nores = 0; + busy = 1; + + if ((q = start) == NULL) + goto drop; + + do { + struct device *slave = q->dev; + + if (!slave->tbusy && slave->qdisc_sleeping == q) { + busy = 0; + + if (q->h.forw == NULL) { + q->h.forw = qdisc_head.forw; + qdisc_head.forw = &q->h; + } + + switch (teql_resolve(skb, skb_res, slave)) { + case 0: + if (slave->hard_start_xmit(skb, slave) == 0) { + master->slaves = NEXT_SLAVE(q); + dev->tbusy = 0; + return 0; + } + break; + case 1: + nores = 1; + break; + default: + master->slaves = NEXT_SLAVE(q); + dev->tbusy = 0; + return 0; + } + __skb_pull(skb, skb->nh.raw - skb->data); + } + } while ((q = NEXT_SLAVE(q)) != start); + + if (nores && skb_res == NULL) { + skb_res = skb; + goto restart; + } + + dev->tbusy = busy; + if (busy) + return 1; + +drop: + dev_kfree_skb(skb); + return 0; +} + +static int teql_master_open(struct device *dev) +{ + struct Qdisc * q; + struct teql_master *m = (void*)dev->priv; + int mtu = 0xFFFE; + unsigned flags = IFF_NOARP|IFF_MULTICAST; + + if (m->slaves == NULL) + return -EUNATCH; + + flags = FMASK; + + q = m->slaves; + do { + struct device *slave = q->dev; + + if (slave == NULL) + return -EUNATCH; + + if (slave->mtu < mtu) + mtu = slave->mtu; + if (slave->hard_header_len > LL_MAX_HEADER) + return -EINVAL; + + /* If all the slaves are BROADCAST, master is BROADCAST + If all the slaves are PtP, master is PtP + Otherwise, master is NBMA. 
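Said in code, the rule the comment describes is an intersection: start from the full capability set and strip every bit that any slave lacks, so an all-ethernet bundle stays broadcast capable while a mixed bundle degenerates to NBMA. A tiny standalone illustration with stand-in flag bits (not the real IFF_* values, and sketching the stated rule rather than copying the kernel's FMASK handling):

    #include <stdio.h>

    /* illustrative stand-ins for the kernel IFF_* bits */
    #define F_PTP   0x1
    #define F_BCAST 0x2
    #define F_MCAST 0x4

    static unsigned master_flags(const unsigned *slave_flags, int n)
    {
            unsigned flags = F_PTP | F_BCAST | F_MCAST;
            int i;

            /* keep only the capabilities every slave has */
            for (i = 0; i < n; i++)
                    flags &= slave_flags[i];
            return flags;
    }

    int main(void)
    {
            unsigned all_bcast[] = { F_BCAST | F_MCAST, F_BCAST | F_MCAST };
            unsigned mixed[]     = { F_BCAST | F_MCAST, F_PTP };

            printf("ethernet-only bundle: %#x (broadcast kept)\n", master_flags(all_bcast, 2));
            printf("mixed bundle:         %#x (NBMA: nothing left)\n", master_flags(mixed, 2));
            return 0;
    }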
+ */ + if (!(slave->flags&IFF_POINTOPOINT)) + flags &= ~IFF_POINTOPOINT; + if (!(slave->flags&IFF_BROADCAST)) + flags &= ~IFF_BROADCAST; + if (!(slave->flags&IFF_MULTICAST)) + flags &= ~IFF_MULTICAST; + } while ((q = NEXT_SLAVE(q)) != m->slaves); + + m->dev.mtu = mtu; + m->dev.flags = (m->dev.flags&~FMASK) | flags; + MOD_INC_USE_COUNT; + return 0; +} + +static int teql_master_close(struct device *dev) +{ + MOD_DEC_USE_COUNT; + return 0; +} + +static struct net_device_stats *teql_master_stats(struct device *dev) +{ + struct teql_master *m = (void*)dev->priv; + return &m->stats; +} + +static int teql_master_mtu(struct device *dev, int new_mtu) +{ + struct teql_master *m = (void*)dev->priv; + struct Qdisc *q; + + if (new_mtu < 68) + return -EINVAL; + + q = m->slaves; + if (q) { + do { + if (new_mtu > q->dev->mtu) + return -EINVAL; + } while ((q=NEXT_SLAVE(q)) != m->slaves); + } + + dev->mtu = new_mtu; + return 0; +} + +static int teql_master_init(struct device *dev) +{ + dev->open = teql_master_open; + dev->hard_start_xmit = teql_master_xmit; + dev->stop = teql_master_close; + dev->get_stats = teql_master_stats; + dev->change_mtu = teql_master_mtu; + dev->type = 0; + dev->mtu = 1500; + dev->tx_queue_len = 100; + dev->flags = IFF_NOARP; + dev->hard_header_len = LL_MAX_HEADER; + return 0; +} + +static struct teql_master the_master = { +{ + NULL, + NULL, + "", + sizeof(struct teql_sched_data), + + teql_enqueue, + teql_dequeue, + teql_requeue, + NULL, + + teql_qdisc_init, + teql_reset, + teql_destroy, +},}; + + +#ifdef MODULE +int init_module(void) +#else +__initfunc(int teql_init(void)) +#endif +{ + int err; + + rtnl_lock(); + + the_master.dev.priv = (void*)&the_master; + the_master.dev.name = (void*)&the_master.name; + err = dev_alloc_name(&the_master.dev, "teql%d"); + if (err < 0) + return err; + memcpy(the_master.qops.id, the_master.name, IFNAMSIZ); + the_master.dev.init = teql_master_init; + + err = register_netdevice(&the_master.dev); + if (err == 0) { + err = register_qdisc(&the_master.qops); + if (err) + unregister_netdevice(&the_master.dev); + } + rtnl_unlock(); + return err; +} + +#ifdef MODULE +void cleanup_module(void) +{ + rtnl_lock(); + unregister_qdisc(&the_master.qops); + unregister_netdevice(&the_master.dev); + rtnl_unlock(); +} +#endif diff --git a/net/socket.c b/net/socket.c index 6220cff45..6a2624058 100644 --- a/net/socket.c +++ b/net/socket.c @@ -547,20 +547,19 @@ int sock_wake_async(struct socket *sock, int how) return -1; switch (how) { - case 0: - kill_fasync(sock->fasync_list, SIGIO); + case 1: + if (sock->flags & SO_WAITDATA) break; - case 1: - if (!(sock->flags & SO_WAITDATA)) - kill_fasync(sock->fasync_list, SIGIO); - break; - case 2: - if (sock->flags & SO_NOSPACE) - { - kill_fasync(sock->fasync_list, SIGIO); - sock->flags &= ~SO_NOSPACE; - } + goto call_kill; + case 2: + if (!(sock->flags & SO_NOSPACE)) break; + sock->flags &= ~SO_NOSPACE; + /* fall through */ + case 0: + call_kill: + kill_fasync(sock->fasync_list, SIGIO); + break; } return 0; } @@ -827,6 +826,7 @@ restart: sys_close(err); goto restart; } + /* N.B. 
diff --git a/net/socket.c b/net/socket.c
index 6220cff45..6a2624058 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -547,20 +547,19 @@ int sock_wake_async(struct socket *sock, int how)
 		return -1;
 	switch (how)
 	{
-	case 0:
-		kill_fasync(sock->fasync_list, SIGIO);
+	case 1:
+		if (sock->flags & SO_WAITDATA)
 			break;
-	case 1:
-		if (!(sock->flags & SO_WAITDATA))
-			kill_fasync(sock->fasync_list, SIGIO);
-		break;
-	case 2:
-		if (sock->flags & SO_NOSPACE)
-		{
-			kill_fasync(sock->fasync_list, SIGIO);
-			sock->flags &= ~SO_NOSPACE;
-		}
+			goto call_kill;
+	case 2:
+		if (!(sock->flags & SO_NOSPACE))
 			break;
+		sock->flags &= ~SO_NOSPACE;
+		/* fall through */
+	case 0:
+	call_kill:
+		kill_fasync(sock->fasync_list, SIGIO);
+		break;
 	}
 	return 0;
 }
@@ -827,6 +826,7 @@ restart:
 			sys_close(err);
 			goto restart;
 		}
+		/* N.B. Should check for errors here */
 		move_addr_to_user(address, len, upeer_sockaddr, upeer_addrlen);
 	}
@@ -912,13 +912,13 @@ asmlinkage int sys_getpeername(int fd, struct sockaddr *usockaddr, int *usockadd
 {
 	struct socket *sock;
 	char address[MAX_SOCK_ADDR];
-	int len;
-	int err;
+	int len, err;
 
 	lock_kernel();
 	if ((sock = sockfd_lookup(fd, &err))!=NULL)
 	{
-		if((err=sock->ops->getname(sock, (struct sockaddr *)address, &len, 1))==0)
+		err = sock->ops->getname(sock, (struct sockaddr *)address, &len, 1);
+		if (!err)
 			err=move_addr_to_user(address,len, usockaddr, usockaddr_len);
 		sockfd_put(sock);
 	}
@@ -940,28 +940,22 @@ asmlinkage int sys_send(int fd, void * buff, size_t len, unsigned flags)
 
 	lock_kernel();
 	sock = sockfd_lookup(fd, &err);
-	if (!sock)
-		goto out;
-	err = -EINVAL;
-	if (len < 0)
-		goto out_put;
-
-	iov.iov_base=buff;
-	iov.iov_len=len;
-	msg.msg_name=NULL;
-	msg.msg_namelen=0;
-	msg.msg_iov=&iov;
-	msg.msg_iovlen=1;
-	msg.msg_control=NULL;
-	msg.msg_controllen=0;
-	if (sock->file->f_flags & O_NONBLOCK)
-		flags |= MSG_DONTWAIT;
-	msg.msg_flags = flags;
-	err = sock_sendmsg(sock, &msg, len);
+	if (sock) {
+		iov.iov_base=buff;
+		iov.iov_len=len;
+		msg.msg_name=NULL;
+		msg.msg_namelen=0;
+		msg.msg_iov=&iov;
+		msg.msg_iovlen=1;
+		msg.msg_control=NULL;
+		msg.msg_controllen=0;
+		if (sock->file->f_flags & O_NONBLOCK)
+			flags |= MSG_DONTWAIT;
+		msg.msg_flags = flags;
+		err = sock_sendmsg(sock, &msg, len);
 
-out_put:
-	sockfd_put(sock);
-out:
+		sockfd_put(sock);
+	}
 	unlock_kernel();
 	return err;
 }
@@ -1140,11 +1134,11 @@ asmlinkage int sys_sendmsg(int fd, struct msghdr *msg, unsigned flags)
 {
 	struct socket *sock;
 	char address[MAX_SOCK_ADDR];
-	struct iovec iov[UIO_FASTIOV];
+	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
 	unsigned char ctl[sizeof(struct cmsghdr) + 20];	/* 20 is size of ipv6_pktinfo */
 	unsigned char *ctl_buf = ctl;
 	struct msghdr msg_sys;
-	int err, total_len;
+	int err, ctl_len, iov_size, total_len;
 
 	lock_kernel();
 
@@ -1152,25 +1146,34 @@ asmlinkage int sys_sendmsg(int fd, struct msghdr *msg, unsigned flags)
 	if (copy_from_user(&msg_sys,msg,sizeof(struct msghdr)))
 		goto out;
 
+	sock = sockfd_lookup(fd, &err);
+	if (!sock)
+		goto out;
+	/* do not move before msg_sys is valid */
 	err = -EINVAL;
 	if (msg_sys.msg_iovlen > UIO_MAXIOV)
-		goto out;
+		goto out_put;
+
+	/* Check whether to allocate the iovec area*/
+	err = -ENOMEM;
+	iov_size = msg_sys.msg_iovlen * sizeof(struct iovec);
+	if (msg_sys.msg_iovlen > 1 /* UIO_FASTIOV */) {
+		iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL);
+		if (!iov)
+			goto out_put;
+	}
 
 	/* This will also move the address data into kernel space */
 	err = verify_iovec(&msg_sys, iov, address, VERIFY_READ);
 	if (err < 0)
-		goto out;
-
-	total_len=err;
-
-	sock = sockfd_lookup(fd, &err);
-	if (!sock)
 		goto out_freeiov;
+	total_len = err;
 
-	if (msg_sys.msg_controllen)
+	ctl_len = msg_sys.msg_controllen;
+	if (ctl_len)
 	{
-		if (msg_sys.msg_controllen > sizeof(ctl))
+		if (ctl_len > sizeof(ctl))
 		{
 			/* Suggested by the Advanced Sockets API for IPv6 draft:
 			 * Limit the msg_controllen size by the SO_SNDBUF size.
@@ -1179,15 +1182,13 @@ asmlinkage int sys_sendmsg(int fd, struct msghdr *msg, unsigned flags)
 			 * SMP machines you have a race to fix here.
 			 */
 			err = -ENOBUFS;
-			ctl_buf = sock_kmalloc(sock->sk, msg_sys.msg_controllen,
-					       GFP_KERNEL);
+			ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
 			if (ctl_buf == NULL)
-				goto failed2;
+				goto out_freeiov;
 		}
 		err = -EFAULT;
-		if (copy_from_user(ctl_buf, msg_sys.msg_control,
-				   msg_sys.msg_controllen))
-			goto failed;
+		if (copy_from_user(ctl_buf, msg_sys.msg_control, ctl_len))
+			goto out_freectl;
 		msg_sys.msg_control = ctl_buf;
 	}
 	msg_sys.msg_flags = flags;
@@ -1196,14 +1197,14 @@ asmlinkage int sys_sendmsg(int fd, struct msghdr *msg, unsigned flags)
 		msg_sys.msg_flags |= MSG_DONTWAIT;
 	err = sock_sendmsg(sock, &msg_sys, total_len);
 
-failed:
+out_freectl:
 	if (ctl_buf != ctl)
-		sock_kfree_s(sock->sk, ctl_buf, msg_sys.msg_controllen);
-failed2:
-	sockfd_put(sock);
+		sock_kfree_s(sock->sk, ctl_buf, ctl_len);
 out_freeiov:
-	if (msg_sys.msg_iov != iov)
-		kfree(msg_sys.msg_iov);
+	if (iov != iovstack)
+		sock_kfree_s(sock->sk, iov, iov_size);
+out_put:
+	sockfd_put(sock);
 out:
 	unlock_kernel();
 	return err;
@@ -1220,9 +1221,7 @@ asmlinkage int sys_recvmsg(int fd, struct msghdr *msg, unsigned int flags)
 	struct iovec *iov=iovstack;
 	struct msghdr msg_sys;
 	unsigned long cmsg_ptr;
-	int err;
-	int total_len;
-	int len = 0;
+	int err, iov_size, total_len, len;
 
 	/* kernel mode address */
 	char addr[MAX_SOCK_ADDR];
@@ -1236,10 +1235,23 @@ asmlinkage int sys_recvmsg(int fd, struct msghdr *msg, unsigned int flags)
 	if (copy_from_user(&msg_sys,msg,sizeof(struct msghdr)))
 		goto out;
 
-	err=-EINVAL;
-	if (msg_sys.msg_iovlen > UIO_MAXIOV)
+	sock = sockfd_lookup(fd, &err);
+	if (!sock)
 		goto out;
+
+	err = -EINVAL;
+	if (msg_sys.msg_iovlen > UIO_MAXIOV)
+		goto out_put;
+	/* Check whether to allocate the iovec area*/
+	err = -ENOMEM;
+	iov_size = msg_sys.msg_iovlen * sizeof(struct iovec);
+	if (msg_sys.msg_iovlen > UIO_FASTIOV) {
+		iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL);
+		if (!iov)
+			goto out_put;
+	}
+
 	/*
 	 *	Save the user-mode address (verify_iovec will change the
 	 *	kernel msghdr to use the kernel address space)
@@ -1247,41 +1259,43 @@ asmlinkage int sys_recvmsg(int fd, struct msghdr *msg, unsigned int flags)
 	uaddr = msg_sys.msg_name;
 	uaddr_len = &msg->msg_namelen;
-	err=verify_iovec(&msg_sys, iov, addr, VERIFY_WRITE);
-	if (err<0)
-		goto out;
-
+	err = verify_iovec(&msg_sys, iov, addr, VERIFY_WRITE);
+	if (err < 0)
+		goto out_freeiov;
 	total_len=err;
 
 	cmsg_ptr = (unsigned long)msg_sys.msg_control;
 	msg_sys.msg_flags = 0;
 
-	if ((sock = sockfd_lookup(fd, &err))!=NULL)
-	{
-		if (sock->file->f_flags & O_NONBLOCK)
-			flags |= MSG_DONTWAIT;
-		err=sock_recvmsg(sock, &msg_sys, total_len, flags);
-		if(err>=0)
-			len=err;
-		sockfd_put(sock);
-	}
-	if (msg_sys.msg_iov != iov)
-		kfree(msg_sys.msg_iov);
+	if (sock->file->f_flags & O_NONBLOCK)
+		flags |= MSG_DONTWAIT;
+	err = sock_recvmsg(sock, &msg_sys, total_len, flags);
+	if (err < 0)
+		goto out_freeiov;
+	len = err;
 
-	if (uaddr != NULL && err>=0)
+	if (uaddr != NULL) {
 		err = move_addr_to_user(addr, msg_sys.msg_namelen, uaddr, uaddr_len);
-	if (err < 0)
-		goto out;
+		if (err < 0)
+			goto out_freeiov;
+	}
 	err = __put_user(msg_sys.msg_flags, &msg->msg_flags);
 	if (err)
-		goto out;
+		goto out_freeiov;
 	err = __put_user((unsigned long)msg_sys.msg_control-cmsg_ptr, &msg->msg_controllen);
+	if (err)
+		goto out_freeiov;
+	err = len;
+
+out_freeiov:
+	if (iov != iovstack)
+		sock_kfree_s(sock->sk, iov, iov_size);
+out_put:
+	sockfd_put(sock);
 out:
 	unlock_kernel();
-	if(err<0)
-		return err;
-	return len;
+	return err;
 }
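A note on the sock_wake_async() hunk above: the rewritten switch uses a goto and a fall-through, but its behaviour is simple to state. The sketch below is behaviour-equivalent and illustrative only; the helper name is invented, and struct socket, SO_WAITDATA, SO_NOSPACE and kill_fasync() are assumed from the kernel headers exactly as used in the hunk.

/*
 * Same decision logic as the new switch in sock_wake_async():
 *   how == 0: always signal SIGIO;
 *   how == 1: signal unless the socket is sleeping in a data wait;
 *   how == 2: signal only if a no-space condition was flagged, and clear it.
 */
static void sock_wake_async_equiv(struct socket *sock, int how)
{
	int do_kill = 0;

	if (how == 0)
		do_kill = 1;
	else if (how == 1)
		do_kill = !(sock->flags & SO_WAITDATA);
	else if (how == 2 && (sock->flags & SO_NOSPACE)) {
		sock->flags &= ~SO_NOSPACE;
		do_kill = 1;
	}

	if (do_kill)
		kill_fasync(sock->fasync_list, SIGIO);
}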
diff --git a/net/unix/Makefile b/net/unix/Makefile
index f0bebfae3..a335b486d 100644
--- a/net/unix/Makefile
+++ b/net/unix/Makefile
@@ -1,5 +1,5 @@
 #
-# Makefile for the Linux TCP/IP (INET) layer.
+# Makefile for the Linux unix domain socket layer.
 #
 # Note! Dependencies are done automagically by 'make dep', which also
 # removes any old dependencies. DON'T put your own dependencies here
diff --git a/net/wanrouter/Makefile b/net/wanrouter/Makefile
index 12afaee5d..beafe5059 100644
--- a/net/wanrouter/Makefile
+++ b/net/wanrouter/Makefile
@@ -8,7 +8,8 @@
 # Note 2! The CFLAGS definition is now in the main makefile...
 
 O_TARGET := wanrouter.o
-O_OBJS := wanmain.o wanproc.o
+OX_OBJS := wanmain.o
+O_OBJS := wanproc.o
 M_OBJS := $(O_TARGET)
 
 include $(TOPDIR)/Rules.make
diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c
index f92ac29bb..30e2c2034 100644
--- a/net/wanrouter/wanmain.c
+++ b/net/wanrouter/wanmain.c
@@ -18,11 +18,12 @@
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 * ============================================================================
-* Oct 15, 1997	Farhan Thawar	changed wan_encapsulate to add a pad byte of 0
-* Jun 27, 1997	Alan Cox	realigned with vendor code
+* Dec 27, 1996	Gene Kozin	Initial version (based on Sangoma's WANPIPE)
 * Jan 16, 1997	Gene Kozin	router_devlist made public
 * Jan 31, 1997	Alan Cox	Hacked it about a bit for 2.1
-* Dec 27, 1996	Gene Kozin	Initial version (based on Sangoma's WANPIPE)
+* Jun 27, 1997	Alan Cox	realigned with vendor code
+* Oct 15, 1997	Farhan Thawar	changed wan_encapsulate to add a pad byte of 0
+* Apr 20, 1998	Alan Cox	Fixed 2.1 symbols
 *****************************************************************************/
 
 #include <linux/stddef.h>	/* offsetof(), etc. */
@@ -165,6 +166,7 @@ __initfunc(void wanrouter_init(void))
 *	Context:	process
 */
+
 int register_wan_device(wan_device_t* wandev)
 {
 	int err, namelen;
@@ -223,6 +225,7 @@ int register_wan_device(wan_device_t* wandev)
 *	<0	error.
 *	Context:	process
 */
+
 int unregister_wan_device(char* name)
 {
@@ -269,6 +272,7 @@ int unregister_wan_device(char* name)
 *	1. This function may be called on interrupt context.
 */
+
 int wanrouter_encapsulate (struct sk_buff* skb, struct device* dev)
 {
 	int hdr_len = 0;
@@ -310,6 +314,7 @@ int wanrouter_encapsulate (struct sk_buff* skb, struct device* dev)
 *	1. This function may be called on interrupt context.
 */
+
 unsigned short wanrouter_type_trans (struct sk_buff* skb, struct device* dev)
 {
 	int cnt = skb->data[0] ? 0 : 1;	/* there may be a pad present */
@@ -679,6 +684,14 @@ static int delete_interface (wan_device_t* wandev, char* name, int force)
 	return 0;
 }
 
+#ifdef MODULE
+EXPORT_SYMBOL(register_wan_device);
+EXPORT_SYMBOL(unregister_wan_device);
+EXPORT_SYMBOL(wanrouter_encapsulate);
+EXPORT_SYMBOL(wanrouter_type_trans);
+#endif
+
 /*
 *	End
 */
+
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index a85aeea5f..163960409 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -1118,13 +1118,14 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 			struct x25_facilities facilities;
 			if (copy_from_user(&facilities, (void *)arg, sizeof(facilities)))
 				return -EFAULT;
-			if (sk->state != TCP_LISTEN)
+			if (sk->state != TCP_LISTEN && sk->state != TCP_CLOSE)
 				return -EINVAL;
 			if (facilities.pacsize_in < X25_PS16 || facilities.pacsize_in > X25_PS4096)
 				return -EINVAL;
 			if (facilities.pacsize_out < X25_PS16 || facilities.pacsize_out > X25_PS4096)
 				return -EINVAL;
-			if (sk->protinfo.x25->neighbour->extended) {
+			if (sk->state == TCP_CLOSE || sk->protinfo.x25->neighbour->extended)
+			{
 				if (facilities.winsize_in < 1 || facilities.winsize_in > 127)
 					return -EINVAL;
 				if (facilities.winsize_out < 1 || facilities.winsize_out > 127)
@@ -1188,7 +1189,7 @@ static int x25_get_info(char *buffer, char **start, off_t offset, int length, in
 
 	cli();
 
-	len += sprintf(buffer, "dest_addr src_addr dev lci st vs vr va t t2 t21 t22 t23 Snd-Q Rcv-Q\n");
+	len += sprintf(buffer, "dest_addr src_addr dev lci st vs vr va t t2 t21 t22 t23 Snd-Q Rcv-Q inode\n");
 
 	for (s = x25_list; s != NULL; s = s->next) {
 		if (s->protinfo.x25->neighbour == NULL || (dev = s->protinfo.x25->neighbour->dev) == NULL)
@@ -1196,7 +1197,7 @@ static int x25_get_info(char *buffer, char **start, off_t offset, int length, in
 		else
 			devname = s->protinfo.x25->neighbour->dev->name;
 
-		len += sprintf(buffer + len, "%-10s %-10s %-5s %3.3X %d %d %d %d %3lu %3lu %3lu %3lu %3lu %5d %5d\n",
+		len += sprintf(buffer + len, "%-10s %-10s %-5s %3.3X %d %d %d %d %3lu %3lu %3lu %3lu %3lu %5d %5d %ld\n",
 			(s->protinfo.x25->dest_addr.x25_addr[0] == '\0') ? "*" : s->protinfo.x25->dest_addr.x25_addr,
 			(s->protinfo.x25->source_addr.x25_addr[0] == '\0') ? "*" : s->protinfo.x25->source_addr.x25_addr,
 			devname,
@@ -1211,7 +1212,8 @@ static int x25_get_info(char *buffer, char **start, off_t offset, int length, in
 			s->protinfo.x25->t22 / HZ,
 			s->protinfo.x25->t23 / HZ,
 			atomic_read(&s->wmem_alloc),
-			atomic_read(&s->rmem_alloc));
+			atomic_read(&s->rmem_alloc),
+			s->socket != NULL ? s->socket->inode->i_ino : 0L);
 
 		pos = begin + len;
 
diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c
index b9a66103c..ae98e95ec 100644
--- a/net/x25/x25_in.c
+++ b/net/x25/x25_in.c
@@ -184,11 +184,6 @@ static int x25_state3_machine(struct sock *sk, struct sk_buff *skb, int frametyp
 		case X25_RR:
 		case X25_RNR:
-			if (frametype == X25_RNR) {
-				sk->protinfo.x25->condition |= X25_COND_PEER_RX_BUSY;
-			} else {
-				sk->protinfo.x25->condition &= ~X25_COND_PEER_RX_BUSY;
-			}
 			if (!x25_validate_nr(sk, nr)) {
 				x25_clear_queues(sk);
 				x25_write_internal(sk, X25_RESET_REQUEST);
@@ -201,8 +196,11 @@ static int x25_state3_machine(struct sock *sk, struct sk_buff *skb, int frametyp
 				sk->protinfo.x25->state = X25_STATE_4;
 			} else {
 				x25_frames_acked(sk, nr);
-				if (frametype == X25_RNR)
-					x25_requeue_frames(sk);
+				if (frametype == X25_RNR) {
+					sk->protinfo.x25->condition |= X25_COND_PEER_RX_BUSY;
+				} else {
+					sk->protinfo.x25->condition &= ~X25_COND_PEER_RX_BUSY;
+				}
 			}
 			break;
@@ -221,15 +219,25 @@ static int x25_state3_machine(struct sock *sk, struct sk_buff *skb, int frametyp
 				break;
 			}
 			x25_frames_acked(sk, nr);
-			if (sk->protinfo.x25->condition & X25_COND_OWN_RX_BUSY)
-				break;
 			if (ns == sk->protinfo.x25->vr) {
 				if (x25_queue_rx_frame(sk, skb, m) == 0) {
 					sk->protinfo.x25->vr = (sk->protinfo.x25->vr + 1) % modulus;
 					queued = 1;
 				} else {
-					sk->protinfo.x25->condition |= X25_COND_OWN_RX_BUSY;
+					/* Should never happen */
+					x25_clear_queues(sk);
+					x25_write_internal(sk, X25_RESET_REQUEST);
+					x25_start_t22timer(sk);
+					sk->protinfo.x25->condition = 0x00;
+					sk->protinfo.x25->vs = 0;
+					sk->protinfo.x25->vr = 0;
+					sk->protinfo.x25->va = 0;
+					sk->protinfo.x25->vl = 0;
+					sk->protinfo.x25->state = X25_STATE_4;
+					break;
 				}
+				if (atomic_read(&sk->rmem_alloc) > (sk->rcvbuf / 2))
+					sk->protinfo.x25->condition |= X25_COND_OWN_RX_BUSY;
 			}
 			/*
 			 * If the window is full Ack it immediately, else