diff options
Diffstat (limited to 'net')
101 files changed, 7617 insertions, 9687 deletions
diff --git a/net/Config.in b/net/Config.in index b57dc9e3d..e1c9487bf 100644 --- a/net/Config.in +++ b/net/Config.in @@ -3,9 +3,11 @@ # mainmenu_option next_comment comment 'Networking options' -bool 'Kernel/User network link driver' CONFIG_NETLINK +tristate 'Packet socket' CONFIG_PACKET +bool 'Kernel/User netlink socket' CONFIG_NETLINK if [ "$CONFIG_NETLINK" = "y" ]; then bool 'Routing messages' CONFIG_RTNETLINK + tristate 'Netlink device emulation' CONFIG_NETLINK_DEV fi bool 'Network firewalls' CONFIG_FIREWALL if [ "$CONFIG_FIREWALL" = "y" ]; then @@ -14,11 +16,15 @@ if [ "$CONFIG_FIREWALL" = "y" ]; then fi fi bool 'Network aliasing' CONFIG_NET_ALIAS +tristate 'BSD Unix domain sockets' CONFIG_UNIX bool 'TCP/IP networking' CONFIG_INET if [ "$CONFIG_INET" = "y" ]; then source net/ipv4/Config.in if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then tristate 'The IPv6 protocol (EXPERIMENTAL)' CONFIG_IPV6 + if [ "$CONFIG_IPV6" != "n" ]; then + source net/ipv6/Config.in + fi fi fi @@ -28,13 +34,6 @@ if [ "$CONFIG_IPX" != "n" ]; then bool 'Full internal IPX network' CONFIG_IPX_INTERN fi tristate 'Appletalk DDP' CONFIG_ATALK -tristate 'Amateur Radio AX.25 Level 2' CONFIG_AX25 -if [ "$CONFIG_AX25" != "n" ]; then - bool 'AX.25 DAMA Slave support' CONFIG_AX25_DAMA_SLAVE -# bool 'AX.25 DAMA Master support' CONFIG_AX25_DAMA_MASTER - dep_tristate 'Amateur Radio NET/ROM' CONFIG_NETROM $CONFIG_AX25 - dep_tristate 'Amateur Radio X.25 PLP (Rose)' CONFIG_ROSE $CONFIG_AX25 -fi if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then # tristate 'DECnet Support (NOT YET FUNCTIONAL)' CONFIG_DECNET # if [ "$CONFIG_DECNET" != "n" ]; then @@ -48,5 +47,19 @@ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then # bool 'Netbeui (EXPERIMENTAL)' CONFIG_NETBEUI # fi tristate 'WAN router' CONFIG_WAN_ROUTER + bool 'CPU is too slow to handle full bandwidth' CONFIG_CPU_IS_SLOW + if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + bool 'QoS and/or fair queueing' CONFIG_NET_SCHED + if [ "$CONFIG_NET_SCHED" = "y" ]; then + tristate 'CBQ packet 
scheduler' CONFIG_NET_SCH_CBQ + tristate 'CSZ packet scheduler' CONFIG_NET_SCH_CSZ + tristate 'HFQ packet scheduler' CONFIG_NET_SCH_HFQ + tristate 'RED queueing discipline' CONFIG_NET_SCH_RED + tristate 'SFQ queueing discipline' CONFIG_NET_SCH_SFQ + tristate 'auxiliary TBF queue' CONFIG_NET_SCH_TBF + tristate 'auxiliary FIFO queue' CONFIG_NET_SCH_PFIFO + tristate 'auxiliary PRIO queue' CONFIG_NET_SCH_PRIO + fi + fi fi endmenu diff --git a/net/Makefile b/net/Makefile index 09924ff89..0f32c8397 100644 --- a/net/Makefile +++ b/net/Makefile @@ -9,8 +9,8 @@ MOD_SUB_DIRS := ipv4 ALL_SUB_DIRS := 802 ax25 bridge core ethernet ipv4 ipv6 ipx unix appletalk \ - netrom rose lapb x25 wanrouter sunrpc #decnet -SUB_DIRS := core ethernet unix + netrom rose lapb x25 wanrouter netlink sched packet sunrpc #decnet +SUB_DIRS := core ethernet sched MOD_LIST_NAME := NET_MISC_MODULES ifeq ($(CONFIG_NET),y) @@ -21,6 +21,14 @@ ifeq ($(CONFIG_INET),y) SUB_DIRS += ipv4 endif +ifeq ($(CONFIG_UNIX),y) +SUB_DIRS += unix +else + ifeq ($(CONFIG_UNIX),m) + MOD_SUB_DIRS += unix + endif +endif + ifeq ($(CONFIG_IPV6),y) SUB_DIRS += ipv6 else @@ -29,6 +37,25 @@ else endif endif +ifeq ($(CONFIG_NETLINK),y) +SUB_DIRS += netlink + ifeq ($(CONFIG_NETLINK_DEV),m) + MOD_SUB_DIRS += netlink + endif +endif + +ifeq ($(CONFIG_PACKET),y) +SUB_DIRS += packet +else + ifeq ($(CONFIG_PACKET),m) + MOD_SUB_DIRS += packet + endif +endif + +ifeq ($(CONFIG_NET_SCHED),y) + MOD_SUB_DIRS += sched +endif + ifeq ($(CONFIG_BRIDGE),y) SUB_DIRS += bridge endif @@ -135,31 +162,4 @@ ifeq ($(CONFIG_SYSCTL),y) L_OBJS += sysctl_net.o endif -CONFIG_NETLINK_BUILTIN := -CONFIG_NETLINK_MODULE := - -ifeq ($(CONFIG_NETLINK), y) - CONFIG_NETLINK_BUILTIN = y -endif - -ifeq ($(CONFIG_IPV6), y) - CONFIG_NETLINK_BUILTIN = y -endif - -ifeq ($(CONFIG_NETLINK), m) - CONFIG_NETLINK_MODULE = y -endif - -ifeq ($(CONFIG_IPV6), m) - CONFIG_NETLINK_MODULE = y -endif - -ifdef CONFIG_NETLINK_BUILTIN -L_OBJS += netlink.o -else - ifdef CONFIG_NETLINK_MODULE 
- M_OBJS += netlink.o - endif -endif - include $(TOPDIR)/Rules.make diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 2fa92c4ad..c12b9fd13 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -954,6 +954,7 @@ static int ax25_release(struct socket *sock, struct socket *peer) #ifdef AX25_CONFIG_DAMA_SLAVE case AX25_PROTO_DAMA_SLAVE: ax25_stop_t3timer(sk->protinfo.ax25); + ax25_stop_idletimer(sk->protinfo.ax25); break; #endif } @@ -1412,7 +1413,6 @@ static int ax25_sendmsg(struct socket *sock, struct msghdr *msg, int len, struct /* Datagram frames go straight out of the door as UI */ skb->dev = sk->protinfo.ax25->ax25_dev->dev; - skb->priority = SOPRI_NORMAL; ax25_queue_xmit(skb); diff --git a/net/ax25/ax25_ds_subr.c b/net/ax25/ax25_ds_subr.c index 89ca64f3f..941a41f99 100644 --- a/net/ax25/ax25_ds_subr.c +++ b/net/ax25/ax25_ds_subr.c @@ -115,7 +115,10 @@ void ax25_ds_enquiry_response(ax25_cb *ax25) if (ax25o->state == AX25_STATE_1 || ax25o->state == AX25_STATE_2 || skb_peek(&ax25o->ack_queue) != NULL) ax25_ds_t1_timeout(ax25o); - ax25_start_t3timer(ax25o); + /* do not start T3 for listening sockets (tnx DD8NE) */ + + if (ax25o->state != AX25_STATE_0) + ax25_start_t3timer(ax25o); } } @@ -154,7 +157,6 @@ static void ax25_kiss_cmd(ax25_dev *ax25_dev, unsigned char cmd, unsigned char p skb->arp = 1; skb->dev = ax25_dev->dev; - skb->priority = SOPRI_NORMAL; skb->protocol = htons(ETH_P_AX25); dev_queue_xmit(skb); @@ -211,8 +213,8 @@ void ax25_dama_on(ax25_cb *ax25) void ax25_dama_off(ax25_cb *ax25) { - ax25_dev_dama_off(ax25->ax25_dev); ax25->condition &= ~AX25_COND_DAMA_MODE; + ax25_dev_dama_off(ax25->ax25_dev); } #endif diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c index 3a8594fba..a50822b90 100644 --- a/net/ax25/ax25_ip.c +++ b/net/ax25/ax25_ip.c @@ -177,7 +177,6 @@ int ax25_rebuild_header(struct sk_buff *skb) } skb->dev = dev; - skb->priority = SOPRI_NORMAL; ax25_queue_xmit(skb); diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c index 
4550302d7..787a645de 100644 --- a/net/ax25/ax25_out.c +++ b/net/ax25/ax25_out.c @@ -27,6 +27,9 @@ * Joerg(DL1BKE) Fixed a problem with buffer allocation * for fragments. * AX.25 037 Jonathan(G4KLX) New timer architecture. + * Joerg(DL1BKE) Fixed DAMA Slave mode: will work + * on non-DAMA interfaces like AX25L2V2 + * again (this behaviour is _required_). */ #include <linux/config.h> @@ -58,8 +61,16 @@ ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, ax25_address *src, ax2 ax25_dev *ax25_dev; ax25_cb *ax25; - if (skb == NULL) - return 0; + /* + * Take the default packet length for the device if zero is + * specified. + */ + if (paclen == 0) { + if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) + return NULL; + + paclen = ax25_dev->values[AX25_VALUES_PACLEN]; + } /* * Look for an existing connection. @@ -188,9 +199,22 @@ void ax25_output(ax25_cb *ax25, int paclen, struct sk_buff *skb) skb_queue_tail(&ax25->write_queue, skb); /* Throw it on the queue */ } - if (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL] == AX25_PROTO_STD_SIMPLEX || - ax25->ax25_dev->values[AX25_VALUES_PROTOCOL] == AX25_PROTO_STD_DUPLEX) - ax25_kick(ax25); + switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { + case AX25_PROTO_STD_SIMPLEX: + case AX25_PROTO_STD_DUPLEX: + ax25_kick(ax25); + break; + +#ifdef CONFIG_AX25_DAMA_SLAVE + /* + * A DAMA slave is _required_ to work as normal AX.25L2V2 + * if no DAMA master is available. 
+ */ + case AX25_PROTO_DAMA_SLAVE: + if (!ax25->ax25_dev->dama.slave) ax25_kick(ax25); + break; +#endif + } } /* @@ -339,7 +363,6 @@ void ax25_transmit_buffer(ax25_cb *ax25, struct sk_buff *skb, int type) ax25_addr_build(ptr, &ax25->source_addr, &ax25->dest_addr, ax25->digipeat, type, ax25->modulus); skb->dev = ax25->ax25_dev->dev; - skb->priority = SOPRI_NORMAL; ax25_queue_xmit(skb); } diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c index 39dfd7d42..98a977182 100644 --- a/net/ax25/ax25_subr.c +++ b/net/ax25/ax25_subr.c @@ -252,7 +252,6 @@ void ax25_return_dm(struct device *dev, ax25_address *src, ax25_address *dest, a dptr += ax25_addr_build(dptr, dest, src, &retdigi, AX25_RESPONSE, AX25_MODULUS); skb->dev = dev; - skb->priority = SOPRI_NORMAL; ax25_queue_xmit(skb); } diff --git a/net/core/Makefile b/net/core/Makefile index b7efbe6b4..2ae776157 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -1,5 +1,5 @@ # -# Makefile for the Linux TCP/IP (INET) layer. +# Makefile for the Linux networking core. # # Note! Dependencies are done automagically by 'make dep', which also # removes any old dependencies. DON'T put your own dependencies here @@ -10,7 +10,7 @@ O_TARGET := core.o O_OBJS := sock.o skbuff.o iovec.o datagram.o dst.o scm.o \ - neighbour.o + neighbour.o rtnetlink.o ifeq ($(CONFIG_SYSCTL),y) O_OBJS += sysctl_net_core.o @@ -24,10 +24,6 @@ ifdef CONFIG_FIREWALL OX_OBJS += firewall.o endif -ifdef CONFIG_NET_ALIAS -O_OBJS += net_alias.o -endif - endif include $(TOPDIR)/Rules.make diff --git a/net/core/dev.c b/net/core/dev.c index 5970c5bab..8d94f6817 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -15,6 +15,7 @@ * Florian la Roche <rzsfl@rz.uni-sb.de> * Alan Cox <gw4pts@gw4pts.ampr.org> * David Hinds <dhinds@allegro.stanford.edu> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * * Changes: * Alan Cox : device private ioctl copies fields back. 
@@ -61,24 +62,20 @@ #include <linux/mm.h> #include <linux/socket.h> #include <linux/sockios.h> -#include <linux/in.h> #include <linux/errno.h> #include <linux/interrupt.h> #include <linux/if_ether.h> -#include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/notifier.h> -#include <net/ip.h> -#include <net/route.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <net/arp.h> +#include <linux/rtnetlink.h> #include <net/slhc.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <net/br.h> -#include <linux/net_alias.h> +#include <net/pkt_sched.h> #include <linux/init.h> #ifdef CONFIG_KERNELD #include <linux/kerneld.h> @@ -90,6 +87,7 @@ extern int plip_init(void); #endif + const char *if_port_text[] = { "unknown", "BNC", @@ -101,12 +99,6 @@ const char *if_port_text[] = { }; /* - * The list of devices, that are able to output. - */ - -static struct device *dev_up_base; - -/* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. * @@ -130,16 +122,17 @@ struct packet_type *ptype_base[16]; /* 16 way hashed list */ struct packet_type *ptype_all = NULL; /* Taps */ /* - * Device list lock + * Device list lock. Setting it provides that interface + * will not disappear unexpectedly while kernel sleeps. */ atomic_t dev_lockct = ATOMIC_INIT(0); - + /* * Our notifier list */ -struct notifier_block *netdev_chain=NULL; +static struct notifier_block *netdev_chain=NULL; /* * Device drivers call our routines to queue packets here. We empty the @@ -148,14 +141,6 @@ struct notifier_block *netdev_chain=NULL; static struct sk_buff_head backlog; -/* - * We don't overdo the queue or we will thrash memory badly. 
- */ - -static int backlog_size = 0; - - - /****************************************************************************************** Protocol management and registration routines @@ -166,7 +151,7 @@ static int backlog_size = 0; * For efficiency */ -static int dev_nit=0; +int netdev_nit=0; /* * Add a protocol ID to the list. Now that the input handler is @@ -179,7 +164,7 @@ void dev_add_pack(struct packet_type *pt) int hash; if(pt->type==htons(ETH_P_ALL)) { - dev_nit++; + netdev_nit++; pt->next=ptype_all; ptype_all=pt; } @@ -201,7 +186,7 @@ void dev_remove_pack(struct packet_type *pt) struct packet_type **pt1; if(pt->type==htons(ETH_P_ALL)) { - dev_nit--; + netdev_nit--; pt1=&ptype_all; } else @@ -258,7 +243,6 @@ struct device *dev_getbyhwaddr(unsigned short type, char *ha) for (dev = dev_base; dev != NULL; dev = dev->next) { if (dev->type == type && - !(dev->flags&(IFF_LOOPBACK|IFF_NOARP)) && memcmp(dev->dev_addr, ha, dev->addr_len) == 0) return(dev); } @@ -312,19 +296,20 @@ struct device *dev_alloc(const char *name, int *err) void dev_load(const char *name) { - if(!dev_get(name)) { -#ifdef CONFIG_NET_ALIAS - const char *sptr; - - for (sptr=name ; *sptr ; sptr++) if(*sptr==':') break; - if (!(*sptr && *(sptr+1))) -#endif + if(!dev_get(name)) request_module(name); - } } #endif - + +static int +default_rebuild_header(struct sk_buff *skb) +{ + printk(KERN_DEBUG "%s: !skb->arp & !rebuild_header -- BUG!\n", skb->dev->name); + kfree_skb(skb, FREE_WRITE); + return 1; +} + /* * Prepare an interface for use. */ @@ -334,6 +319,13 @@ int dev_open(struct device *dev) int ret = 0; /* + * Is it already up? + */ + + if (dev->flags&IFF_UP) + return 0; + + /* * Call device private open method */ @@ -341,29 +333,39 @@ int dev_open(struct device *dev) ret = dev->open(dev); /* - * If it went open OK then set the flags + * If it went open OK then: */ if (ret == 0) { + /* + * nil rebuild_header routine, + * that should be never called and used as just bug trap. 
+ */ + + if (dev->rebuild_header == NULL) + dev->rebuild_header = default_rebuild_header; + + /* + * Set the flags. + */ dev->flags |= (IFF_UP | IFF_RUNNING); + /* - * Initialise multicasting status + * Initialize multicasting status */ dev_mc_upload(dev); - notifier_call_chain(&netdev_chain, NETDEV_UP, dev); - + /* - * Passive non transmitting devices (including - * aliases) need not be on this chain. + * Wakeup transmit queue engine */ - if (!net_alias_is(dev) && dev->tx_queue_len) - { - cli(); - dev->next_up = dev_up_base; - dev_up_base = dev; - sti(); - } + dev_activate(dev); + + /* + * ... and announce new interface. + */ + notifier_call_chain(&netdev_chain, NETDEV_UP, dev); + } return(ret); } @@ -375,17 +377,24 @@ int dev_open(struct device *dev) int dev_close(struct device *dev) { - int ct=0; - struct device **devp; + if (!(dev->flags&IFF_UP)) + return 0; + + dev_deactivate(dev); + + dev_lock_wait(); /* * Call the device specific close. This cannot fail. * Only if device is UP */ - if ((dev->flags & IFF_UP) && dev->stop) + if (dev->stop) dev->stop(dev); + if (dev->start) + printk("dev_close: bug %s still running\n", dev->name); + /* * Device is now down. */ @@ -397,36 +406,7 @@ int dev_close(struct device *dev) */ notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev); - /* - * Flush the multicast chain - */ - dev_mc_discard(dev); - - /* - * Purge any queued packets when we down the link - */ - while(ct<DEV_NUMBUFFS) - { - struct sk_buff *skb; - while((skb=skb_dequeue(&dev->buffs[ct]))!=NULL) - kfree_skb(skb,FREE_WRITE); - ct++; - } - /* - * The device is no longer up. Drop it from the list. - */ - - devp = &dev_up_base; - while (*devp) - { - if (*devp == dev) - { - *devp = dev->next_up; - break; - } - devp = &(*devp)->next_up; - } return(0); } @@ -451,7 +431,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb) * taps currently in use. 
*/ -static void queue_xmit_nit(struct sk_buff *skb, struct device *dev) +void dev_queue_xmit_nit(struct sk_buff *skb, struct device *dev) { struct packet_type *ptype; get_fast_time(&skb->stamp); @@ -467,180 +447,111 @@ static void queue_xmit_nit(struct sk_buff *skb, struct device *dev) struct sk_buff *skb2; if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) break; - skb2->mac.raw = skb2->data; - skb2->nh.raw = skb2->h.raw = skb2->data + dev->hard_header_len; - ptype->func(skb2, skb->dev, ptype); - } - } -} - -/* - * Send (or queue for sending) a packet. - * - * IMPORTANT: When this is called to resend frames. The caller MUST - * already have locked the sk_buff. Apart from that we do the - * rest of the magic. - */ - -static void do_dev_queue_xmit(struct sk_buff *skb, struct device *dev, int pri) -{ - unsigned long flags; - struct sk_buff_head *list; - int retransmission = 0; /* used to say if the packet should go */ - /* at the front or the back of the */ - /* queue - front is a retransmit try */ - /* - * Negative priority is used to flag a frame that is being pulled from the - * queue front as a retransmit attempt. It therefore goes back on the queue - * start on a failure. - */ - - if (pri < 0) - { - pri = -pri-1; - retransmission = 1; - } + /* Code, following below is wrong. -#ifdef CONFIG_NET_DEBUG - if (pri >= DEV_NUMBUFFS) - { - printk(KERN_WARNING "bad priority in do_dev_queue_xmit.\n"); - pri = 1; - } -#endif - - /* - * If we are bridging and this is directly generated output - * pass the frame via the bridge. - */ - -#ifdef CONFIG_BRIDGE - if(skb->pkt_bridged!=IS_BRIDGED && br_stats.flags & BR_UP) - { - if(br_tx_frame(skb)) - return; - } -#endif - - list = dev->buffs + pri; - - save_flags(flags); - - /* - * If this isn't a retransmission, use the first packet instead. - * Note: We don't do strict priority ordering here. We will in - * fact kick the queue that is our priority. The dev_tint reload - * does strict priority queueing. 
In effect what we are doing here - * is to add some random jitter to the queues and to do so by - * saving clocks. Doing a perfect priority queue isn't a good idea - * as you get some fascinating timing interactions. - */ + The only reason, why it does work is that + ONLY packet sockets receive outgoing + packets. If such a packet will be (occasionally) + received by normal packet handler, which expects + that mac header is pulled... + */ - if (!retransmission) - { - /* avoid overrunning the device queue.. */ - if (skb_queue_len(list) > dev->tx_queue_len) - { - dev_kfree_skb(skb, FREE_WRITE); - return; - } + /* More sensible variant. skb->nh should be correctly + set by sender, so that the second statement is + just protection against buggy protocols. + */ + skb2->mac.raw = skb2->data; - /* copy outgoing packets to any sniffer packet handlers */ - if (dev_nit) - queue_xmit_nit(skb,dev); + if (skb2->nh.raw < skb2->data || skb2->nh.raw >= skb2->tail) { + if (net_ratelimit()) + printk(KERN_DEBUG "protocol %04x is buggy, dev %s\n", skb2->protocol, dev->name); + skb2->nh.raw = skb2->data; + if (dev->hard_header) + skb2->nh.raw += dev->hard_header_len; + } - if (skb_queue_len(list)) { - cli(); - __skb_queue_tail(list, skb); - skb = __skb_dequeue(list); - restore_flags(flags); + skb2->h.raw = skb2->nh.raw; + skb2->pkt_type = PACKET_OUTGOING; + ptype->func(skb2, skb->dev, ptype); } } - if (dev->hard_start_xmit(skb, dev) == 0) { - /* - * Packet is now solely the responsibility of the driver - */ - return; - } - - /* - * Transmission failed, put skb back into a list. Once on the list it's safe and - * no longer device locked (it can be freed safely from the device queue) - */ - cli(); - __skb_queue_head(list,skb); - restore_flags(flags); } /* - * Entry point for transmitting frames. + * Fast path for loopback frames. 
*/ +void dev_loopback_xmit(struct sk_buff *skb) +{ + struct sk_buff *newskb=skb_clone(skb, GFP_ATOMIC); + if (newskb==NULL) + return; + + skb_pull(newskb, newskb->nh.raw - newskb->data); + newskb->ip_summed = CHECKSUM_UNNECESSARY; + if (newskb->dst==NULL) + printk(KERN_DEBUG "BUG: packet without dst looped back 1\n"); + netif_rx(newskb); +} + int dev_queue_xmit(struct sk_buff *skb) { struct device *dev = skb->dev; - - start_bh_atomic(); + struct Qdisc *q; /* * If the address has not been resolved. Call the device header rebuilder. * This can cover all protocols and technically not just ARP either. + * + * This call must be moved to protocol layer. + * Now it works only for IPv6 and for IPv4 in + * some unusual curcumstances (eql device). --ANK */ - if (!skb->arp) - { - /* - * FIXME: we should make the printk for no rebuild - * header a default rebuild_header routine and drop - * this call. Similarly we should make hard_header - * have a default NULL operation not check conditions. - */ - if (dev->rebuild_header) - { - if (dev->rebuild_header(skb)) - { - end_bh_atomic(); - return 0; - } - } - else - printk(KERN_DEBUG "%s: !skb->arp & !rebuild_header!\n", dev->name); + if (!skb->arp && dev->rebuild_header(skb)) + return 0; + + q = dev->qdisc; + if (q->enqueue) { + start_bh_atomic(); + q->enqueue(skb, q); + qdisc_wakeup(dev); + end_bh_atomic(); + return 0; } - /* - * - * If dev is an alias, switch to its main device. - * "arp" resolution has been made with alias device, so - * arp entries refer to alias, not main. - * - */ - - if (net_alias_is(dev)) - skb->dev = dev = net_alias_main_dev(dev); + /* The device has no queue. Common case for software devices: + loopback, all the sorts of tunnels... - do_dev_queue_xmit(skb, dev, skb->priority); - end_bh_atomic(); + Really, it is unlikely that bh protection is necessary here: + virtual devices do not generate EOI events. + However, it is possible, that they rely on bh protection + made by us here. 
+ */ + if (dev->flags&IFF_UP) { + start_bh_atomic(); + if (netdev_nit) + dev_queue_xmit_nit(skb,dev); + if (dev->hard_start_xmit(skb, dev) == 0) { + end_bh_atomic(); + return 0; + } + if (net_ratelimit()) + printk(KERN_DEBUG "Virtual device %s asks to queue packet!\n", dev->name); + end_bh_atomic(); + } + kfree_skb(skb, FREE_WRITE); return 0; } -/* - * Fast path for loopback frames. - */ - -void dev_loopback_xmit(struct sk_buff *skb) -{ - struct sk_buff *newskb=skb_clone(skb, GFP_ATOMIC); - if (newskb==NULL) - return; - skb_pull(newskb, newskb->nh.raw - newskb->data); - newskb->ip_summed = CHECKSUM_UNNECESSARY; - if (newskb->dst==NULL) - printk(KERN_DEBUG "BUG: packet without dst looped back 1\n"); - netif_rx(newskb); -} +/*======================================================================= + Receiver rotutines + =======================================================================*/ +int netdev_dropping = 0; +atomic_t netdev_rx_dropped; /* * Receive a packet from a device driver and queue it for the upper @@ -649,15 +560,6 @@ void dev_loopback_xmit(struct sk_buff *skb) void netif_rx(struct sk_buff *skb) { - static int dropping = 0; - - /* - * Any received buffers are un-owned and should be discarded - * when freed. These will be updated later as the frames get - * owners. - */ - - skb->sk = NULL; if(skb->stamp.tv_sec==0) get_fast_time(&skb->stamp); @@ -665,13 +567,14 @@ void netif_rx(struct sk_buff *skb) * Check that we aren't overdoing things. 
*/ - if (!backlog_size) - dropping = 0; - else if (backlog_size > 300) - dropping = 1; + if (!backlog.qlen) + netdev_dropping = 0; + else if (backlog.qlen > 300) + netdev_dropping = 1; - if (dropping) + if (netdev_dropping) { + atomic_inc(&netdev_rx_dropped); kfree_skb(skb, FREE_READ); return; } @@ -681,7 +584,6 @@ void netif_rx(struct sk_buff *skb) */ skb_queue_tail(&backlog,skb); - backlog_size++; /* * If any packet arrived, mark it for processing after the @@ -692,32 +594,37 @@ void netif_rx(struct sk_buff *skb) return; } -/* - * This routine causes all interfaces to try to send some data. - */ - -static void dev_transmit(void) +#ifdef CONFIG_BRIDGE +static inline void handle_bridge(struct skbuff *skb, unsigned short type) { - struct device *dev; - - for (dev = dev_up_base; dev != NULL; dev = dev->next_up) + if (br_stats.flags & BR_UP && br_protocol_ok(ntohs(type))) { - if (dev->flags != 0 && !dev->tbusy) + /* + * We pass the bridge a complete frame. This means + * recovering the MAC header first. + */ + + int offset=skb->data-skb->mac.raw; + cli(); + skb_push(skb,offset); /* Put header back on for bridge */ + if(br_receive_frame(skb)) { - /* - * Kick the device - */ - dev_tint(dev); + sti(); + continue; } + /* + * Pull the MAC header off for the copy going to + * the upper layers. 
+ */ + skb_pull(skb,offset); + sti(); } } +#endif - -/********************************************************************************** - - Receive Queue Processor - -***********************************************************************************/ +#ifdef CONFIG_CPU_IS_SLOW +int net_cpu_congestion; +#endif /* * When we are called the queue is ready to grab, the interrupts are @@ -732,7 +639,15 @@ void net_bh(void) struct packet_type *ptype; struct packet_type *pt_prev; unsigned short type; - int nit = 301; + unsigned long start_time = jiffies; +#ifdef CONFIG_CPU_IS_SLOW + static unsigned long start_busy = 0; + static unsigned long ave_busy = 0; + + if (start_busy == 0) + start_busy = start_time; + net_cpu_congestion = ave_busy>>8; +#endif /* * Can we send anything now? We want to clear the @@ -741,7 +656,8 @@ void net_bh(void) * latency on a transmit interrupt bh. */ - dev_transmit(); + if (qdisc_head.forw != &qdisc_head) + qdisc_run_queues(); /* * Any data left to process. This may occur because a @@ -761,55 +677,43 @@ void net_bh(void) { struct sk_buff * skb = backlog.next; + if (jiffies - start_time > 1) { + /* Give chance to other bottom halves to run */ + mark_bh(NET_BH); + return; + } + /* * We have a packet. Therefore the queue has shrunk */ cli(); __skb_unlink(skb, &backlog); - backlog_size--; sti(); - /* - * We do not want to spin in net_bh infinitely. --ANK - */ - if (--nit <= 0) - { - if (nit == 0) - printk(KERN_WARNING "net_bh: too many loops, dropping...\n"); +#ifdef CONFIG_CPU_IS_SLOW + if (ave_busy > 128*16) { kfree_skb(skb, FREE_WRITE); - continue; + while ((skb = skb_dequeue(&backlog)) != NULL) + kfree_skb(skb, FREE_WRITE); + break; } +#endif + -#ifdef CONFIG_BRIDGE + /* + * Fetch the packet protocol ID. + */ + + type = skb->protocol; + +#ifdef CONFIG_BRIDGE /* * If we are bridging then pass the frame up to the * bridging code (if this protocol is to be bridged). 
* If it is bridged then move on */ - - if (br_stats.flags & BR_UP && br_protocol_ok(ntohs(skb->protocol))) - { - /* - * We pass the bridge a complete frame. This means - * recovering the MAC header first. - */ - - int offset=skb->data-skb->mac.raw; - cli(); - skb_push(skb,offset); /* Put header back on for bridge */ - if(br_receive_frame(skb)) - { - sti(); - continue; - } - /* - * Pull the MAC header off for the copy going to - * the upper layers. - */ - skb_pull(skb,offset); - sti(); - } + handle_bridge(skb, type); #endif /* @@ -823,12 +727,6 @@ void net_bh(void) skb->h.raw = skb->nh.raw = skb->data; /* - * Fetch the packet protocol ID. - */ - - type = skb->protocol; - - /* * We got a packet ID. Now loop over the "known protocols" * list. There are two lists. The ptype_all list of taps (normally empty) * and the main protocol list which is hashed perfectly for normal protocols. @@ -837,15 +735,17 @@ void net_bh(void) pt_prev = NULL; for (ptype = ptype_all; ptype!=NULL; ptype=ptype->next) { - if(pt_prev) - { - struct sk_buff *skb2=skb_clone(skb, GFP_ATOMIC); - if(skb2) - pt_prev->func(skb2,skb->dev, pt_prev); + if (!ptype->dev || ptype->dev == skb->dev) { + if(pt_prev) + { + struct sk_buff *skb2=skb_clone(skb, GFP_ATOMIC); + if(skb2) + pt_prev->func(skb2,skb->dev, pt_prev); + } + pt_prev=ptype; } - pt_prev=ptype; } - + for (ptype = ptype_base[ntohs(type)&15]; ptype != NULL; ptype = ptype->next) { if (ptype->type == type && (!ptype->dev || ptype->dev==skb->dev)) @@ -872,7 +772,7 @@ void net_bh(void) pt_prev=ptype; } } /* End of protocol list loop */ - + /* * Is there a last item to send to ? */ @@ -883,16 +783,9 @@ void net_bh(void) * Has an unknown packet has been received ? */ - else + else { kfree_skb(skb, FREE_WRITE); - /* - * Again, see if we can transmit anything now. 
- * [Ought to take this out judging by tests it slows - * us down not speeds us up] - */ -#ifdef XMIT_EVERY - dev_transmit(); -#endif + } } /* End of queue loop */ /* @@ -903,64 +796,47 @@ void net_bh(void) * One last output flush. */ - dev_transmit(); + if (qdisc_head.forw != &qdisc_head) + qdisc_run_queues(); + +#ifdef CONFIG_CPU_IS_SLOW +{ + unsigned long start_idle = jiffies; + ave_busy += ((start_idle - start_busy)<<3) - (ave_busy>>4); + start_busy = 0; +} +#endif } +/* Protocol dependent address dumping routines */ -/* - * This routine is called when an device driver (i.e. an - * interface) is ready to transmit a packet. - */ - -void dev_tint(struct device *dev) +static int (*gifconf[NPROTO])(struct device *dev, char *bufptr, int len); + +int register_gifconf(int family, int (*func)(struct device *dev, char *bufptr, int len)) { - int i; - unsigned long flags; - struct sk_buff_head * head; - - /* - * aliases do not transmit (for now :) ) - */ + if (family<0 || family>=NPROTO) + return -EINVAL; + gifconf[family] = func; + return 0; +} - if (net_alias_is(dev)) { - printk(KERN_DEBUG "net alias %s transmits\n", dev->name); - return; - } - head = dev->buffs; - save_flags(flags); - cli(); +/* + This ioctl is wrong by design. It really existed in some + old SYSV systems, only was named SIOCGIFNUM. + In multiprotocol environment it is just useless. + Well, SIOCGIFCONF is wrong too, but we have to preserve + it by compatibility reasons. - /* - * Work the queues in priority order - */ - for(i = 0;i < DEV_NUMBUFFS; i++,head++) - { + If someone wants to achieve the same effect, please, use undocumented + feature of SIOCGIFCONF: it returns buffer length, if buffer + is not supplied. - while (!skb_queue_empty(head)) { - struct sk_buff *skb; + Let's remove it, until someone started to use it. 
--ANK - skb = head->next; - __skb_unlink(skb, head); - /* - * Stop anyone freeing the buffer while we retransmit it - */ - restore_flags(flags); - /* - * Feed them to the output stage and if it fails - * indicate they re-queue at the front. - */ - do_dev_queue_xmit(skb,dev,-i - 1); - /* - * If we can take no more then stop here. - */ - if (dev->tbusy) - return; - cli(); - } - } - restore_flags(flags); -} + In any case, if someone cannot live without it, it should + be renamed to SIOCGIFNUM. + */ /* @@ -970,20 +846,26 @@ void dev_tint(struct device *dev) static int dev_ifcount(unsigned int *arg) { struct device *dev; - int err; unsigned int count = 0; for (dev = dev_base; dev != NULL; dev = dev->next) count++; - err = copy_to_user(arg, &count, sizeof(unsigned int)); - if (err) - return -EFAULT; - return 0; + return put_user(count, arg); } /* - * Map an interface index to its name (SIOGIFNAME) + * Map an interface index to its name (SIOCGIFNAME) + */ + +/* + * This call is useful, but I'd remove it too. + * + * The reason is purely aestetical, it is the only call + * from SIOC* family using struct ifreq in reversed manner. + * Besides that, it is pretty silly to put "drawing" facility + * to kernel, it is useful only to print ifindices + * in readable form, is not it? --ANK */ static int dev_ifname(struct ifreq *arg) @@ -1019,7 +901,6 @@ static int dev_ifname(struct ifreq *arg) static int dev_ifconf(char *arg) { struct ifconf ifc; - struct ifreq ifr; struct device *dev; char *pos; unsigned int len; @@ -1031,68 +912,51 @@ static int dev_ifconf(char *arg) err = copy_from_user(&ifc, arg, sizeof(struct ifconf)); if (err) - return -EFAULT; - len = ifc.ifc_len; + return -EFAULT; + pos = ifc.ifc_buf; + if (pos==NULL) + ifc.ifc_len=0; + len = ifc.ifc_len; /* - * We now walk the device list filling each active device - * into the array. - */ - - /* * Loop over the interfaces, and write an info block for each. 
*/ - - dev_lock_wait(); - dev_lock_list(); - for (dev = dev_base; dev != NULL; dev = dev->next) - { - /* - * Have we run out of space here ? - */ - - if (len < sizeof(struct ifreq)) - break; + for (dev = dev_base; dev != NULL; dev = dev->next) { + int i; + for (i=0; i<NPROTO; i++) { + int done; + + if (gifconf[i] == NULL) + continue; - memset(&ifr, 0, sizeof(struct ifreq)); - strcpy(ifr.ifr_name, dev->name); - (*(struct sockaddr_in *) &ifr.ifr_addr).sin_family = dev->family; - (*(struct sockaddr_in *) &ifr.ifr_addr).sin_addr.s_addr = dev->pa_addr; + done = gifconf[i](dev, pos, len); + if (done<0) + return -EFAULT; - /* - * Write this block to the caller's space. - */ - - err = copy_to_user(pos, &ifr, sizeof(struct ifreq)); - if (err) - return -EFAULT; - pos += sizeof(struct ifreq); - len -= sizeof(struct ifreq); + len -= done; + if (pos) + pos += done; + } } - dev_unlock_list(); - /* * All done. Write the updated control block back to the caller. */ - - ifc.ifc_len = (pos - ifc.ifc_buf); - ifc.ifc_req = (struct ifreq *) ifc.ifc_buf; - err = copy_to_user(arg, &ifc, sizeof(struct ifconf)); - if (err) + ifc.ifc_len -= len; + + if (copy_to_user(arg, &ifc, sizeof(struct ifconf))) return -EFAULT; /* * Report how much was filled in */ - return(pos - arg); + return ifc.ifc_len; } - /* * This is invoked by the /proc filesystem handler to display a device * in detail. 
@@ -1105,7 +969,7 @@ static int sprintf_stats(char *buffer, struct device *dev) int size; if (stats) - size = sprintf(buffer, "%6s:%8lu %7lu %4lu %4lu %4lu %4lu %8lu %8lu %4lu %4lu %4lu %5lu %4lu\n", + size = sprintf(buffer, "%6s:%8lu %7lu %4lu %4lu %4lu %4lu %8lu %8lu %4lu %4lu %4lu %5lu %4lu %4lu\n", dev->name, stats->rx_bytes, stats->rx_packets, stats->rx_errors, @@ -1117,7 +981,8 @@ static int sprintf_stats(char *buffer, struct device *dev) stats->tx_packets, stats->tx_errors, stats->tx_dropped, stats->tx_fifo_errors, stats->collisions, stats->tx_carrier_errors + stats->tx_aborted_errors - + stats->tx_window_errors + stats->tx_heartbeat_errors); + + stats->tx_window_errors + stats->tx_heartbeat_errors, + stats->multicast); else size = sprintf(buffer, "%6s: No statistics available.\n", dev->name); @@ -1252,272 +1117,216 @@ int dev_get_wireless_info(char * buffer, char **start, off_t offset, #endif /* CONFIG_PROC_FS */ #endif /* CONFIG_NET_RADIO */ +void dev_set_promiscuity(struct device *dev, int inc) +{ + unsigned short old_flags = dev->flags; -/* - * Perform the SIOCxIFxxx calls. - * - * The socket layer has seen an ioctl the address family thinks is - * for the device. At this point we get invoked to make a decision - */ - -static int dev_ifsioc(void *arg, unsigned int getset) + dev->flags |= IFF_PROMISC; + if ((dev->promiscuity += inc) == 0) + dev->flags &= ~IFF_PROMISC; + if (dev->flags^old_flags) { + dev_mc_upload(dev); + printk(KERN_INFO "device %s %s promiscuous mode\n", + dev->name, (dev->flags&IFF_PROMISC) ? 
"entered" : "leaved"); + } +} + +void dev_set_allmulti(struct device *dev, int inc) { - struct ifreq ifr; - struct device *dev; - int ret, err; + unsigned short old_flags = dev->flags; + + dev->flags |= IFF_ALLMULTI; + if ((dev->allmulti += inc) == 0) + dev->flags &= ~IFF_ALLMULTI; + if (dev->flags^old_flags) + dev_mc_upload(dev); +} + +int dev_change_flags(struct device *dev, unsigned flags) +{ + int ret; + int old_flags = dev->flags; /* - * Fetch the caller's info block into kernel space + * Set the flags on our device. */ - - err = copy_from_user(&ifr, arg, sizeof(struct ifreq)); - if (err) - return -EFAULT; + + dev->flags = (flags & (IFF_DEBUG|IFF_NOTRAILERS|IFF_RUNNING|IFF_NOARP| + IFF_SLAVE|IFF_MASTER| + IFF_MULTICAST|IFF_PORTSEL|IFF_AUTOMEDIA)) | + (dev->flags & (IFF_UP|IFF_VOLATILE|IFF_PROMISC)); /* - * See which interface the caller is talking about. - */ - + * Load in the correct multicast list now the flags have changed. + */ + + dev_mc_upload(dev); + /* - * - * net_alias_dev_get(): dev_get() with added alias naming magic. - * only allow alias creation/deletion if (getset==SIOCSIFADDR) - * + * Have we downed the interface. We handle IFF_UP ourselves + * according to user attempts to set it, rather than blindly + * setting it. */ - -#ifdef CONFIG_KERNELD - dev_load(ifr.ifr_name); -#endif -#ifdef CONFIG_NET_ALIAS - if ((dev = net_alias_dev_get(ifr.ifr_name, getset == SIOCSIFADDR, &err, NULL, NULL)) == NULL) - return(err); -#else - if ((dev = dev_get(ifr.ifr_name)) == NULL) - return(-ENODEV); -#endif - switch(getset) + ret = 0; + if ((old_flags^flags)&IFF_UP) /* Bit is different ? 
*/ + { + if(old_flags&IFF_UP) /* Gone down */ + ret=dev_close(dev); + else /* Come up */ + ret=dev_open(dev); + + if (ret == 0) + dev_mc_upload(dev); + } + + if (dev->flags&IFF_UP && + ((old_flags^dev->flags)&~(IFF_UP|IFF_RUNNING|IFF_PROMISC|IFF_VOLATILE))) { + printk(KERN_DEBUG "SIFFL %s(%s)\n", dev->name, current->comm); + notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); + } + + if ((flags^dev->gflags)&IFF_PROMISC) { + int inc = (flags&IFF_PROMISC) ? +1 : -1; + dev->gflags ^= IFF_PROMISC; + dev_set_promiscuity(dev, inc); + } + + return ret; +} + +/* + * Perform the SIOCxIFxxx calls. + */ + +static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) +{ + struct device *dev; + int err; + + if ((dev = dev_get(ifr->ifr_name)) == NULL) + return -ENODEV; + + switch(cmd) { case SIOCGIFFLAGS: /* Get interface flags */ - ifr.ifr_flags = dev->flags; - goto rarok; + ifr->ifr_flags = (dev->flags&~IFF_PROMISC)|(dev->gflags&IFF_PROMISC); + return 0; case SIOCSIFFLAGS: /* Set interface flags */ - { - int old_flags = dev->flags; - - /* - * We are not allowed to potentially close/unload - * a device until we get this lock. - */ - - dev_lock_wait(); - dev_lock_list(); - - /* - * Set the flags on our device. - */ - - dev->flags = (ifr.ifr_flags & ( - IFF_BROADCAST | IFF_DEBUG | IFF_LOOPBACK | IFF_PORTSEL | - IFF_POINTOPOINT | IFF_NOTRAILERS | IFF_RUNNING | IFF_AUTOMEDIA | - IFF_NOARP | IFF_PROMISC | IFF_ALLMULTI | IFF_SLAVE | IFF_MASTER - | IFF_MULTICAST)) | (dev->flags & IFF_UP); - /* - * Load in the correct multicast list now the flags have changed. - */ - - dev_mc_upload(dev); - - /* - * Have we downed the interface. We handle IFF_UP ourselves - * according to user attempts to set it, rather than blindly - * setting it. - */ - - if ((old_flags^ifr.ifr_flags)&IFF_UP) /* Bit is different ? 
*/ - { - if(old_flags&IFF_UP) /* Gone down */ - ret=dev_close(dev); - else /* Come up */ - { - ret=dev_open(dev); - if(ret<0) - dev->flags&=~IFF_UP; /* Open failed */ - } - } - else - ret=0; - /* - * Load in the correct multicast list now the flags have changed. - */ - - dev_mc_upload(dev); - if ((dev->flags&IFF_UP) && ((old_flags^dev->flags)&~(IFF_UP|IFF_RUNNING|IFF_PROMISC))) - { - printk(KERN_DEBUG "SIFFL %s(%s)\n", dev->name, current->comm); - notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); - } - if ((dev->flags^old_flags)&IFF_PROMISC) { - if (dev->flags&IFF_PROMISC) - printk(KERN_INFO "%s enters promiscuous mode.\n", dev->name); - else - printk(KERN_INFO "%s leave promiscuous mode.\n", dev->name); - } - dev_unlock_list(); - } - break; + return dev_change_flags(dev, ifr->ifr_flags); case SIOCGIFMETRIC: /* Get the metric on the interface (currently unused) */ - - ifr.ifr_metric = dev->metric; - goto rarok; + ifr->ifr_metric = dev->metric; + return 0; case SIOCSIFMETRIC: /* Set the metric on the interface (currently unused) */ - dev->metric = ifr.ifr_metric; - ret=0; - break; + dev->metric = ifr->ifr_metric; + return 0; case SIOCGIFMTU: /* Get the MTU of a device */ - ifr.ifr_mtu = dev->mtu; - goto rarok; + ifr->ifr_mtu = dev->mtu; + return 0; case SIOCSIFMTU: /* Set the MTU of a device */ - - if (ifr.ifr_mtu == dev->mtu) { - ret = 0; - break; - } + if (ifr->ifr_mtu == dev->mtu) + return 0; /* * MTU must be positive. */ - if(ifr.ifr_mtu<68) + if (ifr->ifr_mtu<0) return -EINVAL; if (dev->change_mtu) - ret = dev->change_mtu(dev, ifr.ifr_mtu); - else - { - dev->mtu = ifr.ifr_mtu; - ret = 0; + err = dev->change_mtu(dev, ifr->ifr_mtu); + else { + dev->mtu = ifr->ifr_mtu; + err = 0; } - if (!ret && dev->flags&IFF_UP) { + if (!err && dev->flags&IFF_UP) { printk(KERN_DEBUG "SIFMTU %s(%s)\n", dev->name, current->comm); notifier_call_chain(&netdev_chain, NETDEV_CHANGEMTU, dev); } - break; - - case SIOCGIFMEM: /* Get the per device memory space. 
We can add this but currently - do not support it */ - ret = -EINVAL; - break; - - case SIOCSIFMEM: /* Set the per device memory buffer space. Not applicable in our case */ - ret = -EINVAL; - break; + return err; case SIOCGIFHWADDR: - memcpy(ifr.ifr_hwaddr.sa_data,dev->dev_addr, MAX_ADDR_LEN); - ifr.ifr_hwaddr.sa_family=dev->type; - goto rarok; + memcpy(ifr->ifr_hwaddr.sa_data,dev->dev_addr, MAX_ADDR_LEN); + ifr->ifr_hwaddr.sa_family=dev->type; + return 0; case SIOCSIFHWADDR: if(dev->set_mac_address==NULL) return -EOPNOTSUPP; - if(ifr.ifr_hwaddr.sa_family!=dev->type) + if(ifr->ifr_hwaddr.sa_family!=dev->type) return -EINVAL; - ret=dev->set_mac_address(dev,&ifr.ifr_hwaddr); - if (!ret) + err=dev->set_mac_address(dev,&ifr->ifr_hwaddr); + if (!err) notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); - break; + return err; + case SIOCSIFHWBROADCAST: + if(ifr->ifr_hwaddr.sa_family!=dev->type) + return -EINVAL; + memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, MAX_ADDR_LEN); + notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); + return 0; + case SIOCGIFMAP: - ifr.ifr_map.mem_start=dev->mem_start; - ifr.ifr_map.mem_end=dev->mem_end; - ifr.ifr_map.base_addr=dev->base_addr; - ifr.ifr_map.irq=dev->irq; - ifr.ifr_map.dma=dev->dma; - ifr.ifr_map.port=dev->if_port; - goto rarok; + ifr->ifr_map.mem_start=dev->mem_start; + ifr->ifr_map.mem_end=dev->mem_end; + ifr->ifr_map.base_addr=dev->base_addr; + ifr->ifr_map.irq=dev->irq; + ifr->ifr_map.dma=dev->dma; + ifr->ifr_map.port=dev->if_port; + return 0; case SIOCSIFMAP: - if(dev->set_config==NULL) - return -EOPNOTSUPP; - return dev->set_config(dev,&ifr.ifr_map); + if (dev->set_config) + return dev->set_config(dev,&ifr->ifr_map); + return -EOPNOTSUPP; case SIOCADDMULTI: - if(dev->set_multicast_list==NULL) - return -EINVAL; - if(ifr.ifr_hwaddr.sa_family!=AF_UNSPEC) + if(dev->set_multicast_list==NULL || + ifr->ifr_hwaddr.sa_family!=AF_UNSPEC) return -EINVAL; - dev_mc_add(dev,ifr.ifr_hwaddr.sa_data, dev->addr_len, 1); + 
dev_mc_add(dev,ifr->ifr_hwaddr.sa_data, dev->addr_len, 1); return 0; case SIOCDELMULTI: - if(dev->set_multicast_list==NULL) + if(dev->set_multicast_list==NULL || + ifr->ifr_hwaddr.sa_family!=AF_UNSPEC) return -EINVAL; - if(ifr.ifr_hwaddr.sa_family!=AF_UNSPEC) - return -EINVAL; - dev_mc_delete(dev,ifr.ifr_hwaddr.sa_data,dev->addr_len, 1); + dev_mc_delete(dev,ifr->ifr_hwaddr.sa_data,dev->addr_len, 1); return 0; - case SIOGIFINDEX: - ifr.ifr_ifindex = dev->ifindex; - goto rarok; - + case SIOCGIFINDEX: + ifr->ifr_ifindex = dev->ifindex; + return 0; /* * Unknown or private ioctl */ default: - if((getset >= SIOCDEVPRIVATE) && - (getset <= (SIOCDEVPRIVATE + 15))) { - if(dev->do_ioctl==NULL) - return -EOPNOTSUPP; - ret = dev->do_ioctl(dev, &ifr, getset); - if (!ret) - { - err = copy_to_user(arg,&ifr,sizeof(struct ifreq)); - if (err) - ret = -EFAULT; - } - break; + if(cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15) { + if (dev->do_ioctl) + return dev->do_ioctl(dev, ifr, cmd); + return -EOPNOTSUPP; } #ifdef CONFIG_NET_RADIO - if((getset >= SIOCIWFIRST) && (getset <= SIOCIWLAST)) - { - if(dev->do_ioctl==NULL) - return -EOPNOTSUPP; - /* Perform the ioctl */ - ret=dev->do_ioctl(dev, &ifr, getset); - /* If return args... */ - if(IW_IS_GET(getset)) - { - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - { - ret = -EFAULT; - } - } - break; + if(cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { + if (dev->do_ioctl) + return dev->do_ioctl(dev, ifr, cmd); + return -EOPNOTSUPP; } #endif /* CONFIG_NET_RADIO */ - ret = -EINVAL; } - return(ret); -/* - * The load of calls that return an ifreq and ok (saves memory). 
- */ -rarok: - err = copy_to_user(arg, &ifr, sizeof(struct ifreq)); - if (err) - err = -EFAULT; - return err; + return -EINVAL; } @@ -1528,47 +1337,98 @@ rarok: int dev_ioctl(unsigned int cmd, void *arg) { + struct ifreq ifr; + int ret; +#ifdef CONFIG_NET_ALIAS + char *colon; +#endif + + /* One special case: SIOCGIFCONF takes ifconf argument + and requires shared lock, because it sleeps writing + to user space. + */ + + if (cmd == SIOCGIFCONF) { + rtnl_shlock(); + dev_ifconf((char *) arg); + rtnl_shunlock(); + return 0; + } + if (cmd == SIOCGIFCOUNT) { + return dev_ifcount((unsigned int*)arg); + } + if (cmd == SIOCGIFNAME) { + return dev_ifname((struct ifreq *)arg); + } + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return -EFAULT; + + ifr.ifr_name[IFNAMSIZ-1] = 0; + +#ifdef CONFIG_NET_ALIAS + colon = strchr(ifr.ifr_name, ':'); + if (colon) + *colon = 0; +#endif + + /* + * See which interface the caller is talking about. + */ + +#ifdef CONFIG_KERNELD + dev_load(ifr.ifr_name); +#endif + switch(cmd) { - case SIOCGIFCONF: - (void) dev_ifconf((char *) arg); - return 0; - case SIOCGIFCOUNT: - return dev_ifcount((unsigned int *) arg); - case SIOGIFNAME: - return dev_ifname((struct ifreq *)arg); - /* - * Ioctl calls that can be done by all. + * These ioctl calls: + * - can be done by all. + * - atomic and do not require locking. + * - return a value */ case SIOCGIFFLAGS: case SIOCGIFMETRIC: case SIOCGIFMTU: - case SIOCGIFMEM: case SIOCGIFHWADDR: case SIOCGIFSLAVE: case SIOCGIFMAP: - case SIOGIFINDEX: - return dev_ifsioc(arg, cmd); + case SIOCGIFINDEX: + ret = dev_ifsioc(&ifr, cmd); + if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + return ret; /* - * Ioctl calls requiring the power of a superuser + * These ioctl calls: + * - require superuser power. + * - require strict serialization. 
+ * - do not return a value */ case SIOCSIFFLAGS: case SIOCSIFMETRIC: case SIOCSIFMTU: - case SIOCSIFMEM: - case SIOCSIFHWADDR: case SIOCSIFMAP: + case SIOCSIFHWADDR: case SIOCSIFSLAVE: case SIOCADDMULTI: case SIOCDELMULTI: + case SIOCSIFHWBROADCAST: if (!suser()) return -EPERM; - return dev_ifsioc(arg, cmd); + rtnl_lock(); + ret = dev_ifsioc(&ifr, cmd); + rtnl_unlock(); + return ret; + case SIOCGIFMEM: + /* Get the per device memory space. We can add this but currently + do not support it */ + case SIOCSIFMEM: + /* Set the per device memory buffer space. Not applicable in our case */ case SIOCSIFLINK: return -EINVAL; @@ -1577,16 +1437,29 @@ int dev_ioctl(unsigned int cmd, void *arg) */ default: - if((cmd >= SIOCDEVPRIVATE) && - (cmd <= (SIOCDEVPRIVATE + 15))) { - return dev_ifsioc(arg, cmd); + if (cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15) { + rtnl_lock(); + ret = dev_ifsioc(&ifr, cmd); + rtnl_unlock(); + if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + return ret; } #ifdef CONFIG_NET_RADIO - if((cmd >= SIOCIWFIRST) && (cmd <= SIOCIWLAST)) - { - if((IW_IS_SET(cmd)) && (!suser())) - return -EPERM; - return dev_ifsioc(arg, cmd); + if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { + if (IW_IS_SET(cmd)) { + if (!suser()) + return -EPERM; + rtnl_lock(); + } + ret = dev_ifsioc(&ifr, cmd); + if (IW_IS_SET(cmd)) + rtnl_unlock(); + if (!ret && IW_IS_GET(cmd) && + copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + return ret; } #endif /* CONFIG_NET_RADIO */ return -EINVAL; @@ -1596,9 +1469,103 @@ int dev_ioctl(unsigned int cmd, void *arg) int dev_new_index() { static int ifindex; - return ++ifindex; + for (;;) { + if (++ifindex <= 0) + ifindex=1; + if (dev_get_by_index(ifindex) == NULL) + return ifindex; + } +} + +static int dev_boot_phase = 1; + + +int register_netdevice(struct device *dev) +{ + struct device *d, **dp; + + if (dev_boot_phase) { + printk(KERN_INFO "early initialization of device %s is deferred\n", 
dev->name); + + /* Check for existence, and append to tail of chain */ + for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) { + if (d == dev || strcmp(d->name, dev->name) == 0) + return -EEXIST; + } + dev->next = NULL; + *dp = dev; + return 0; + } + + dev->iflink = -1; + + /* Init, if this function is available */ + if (dev->init && dev->init(dev) != 0) + return -EIO; + + /* Check for existence, and append to tail of chain */ + for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) { + if (d == dev || strcmp(d->name, dev->name) == 0) + return -EEXIST; + } + dev->next = NULL; + dev_init_scheduler(dev); + dev->ifindex = dev_new_index(); + if (dev->iflink == -1) + dev->iflink = dev->ifindex; + *dp = dev; + + /* Notify protocols, that a new device appeared. */ + notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev); + + return 0; +} + +int unregister_netdevice(struct device *dev) +{ + struct device *d, **dp; + + if (dev_boot_phase == 0) { + /* If device is running, close it. + It is very bad idea, really we should + complain loudly here, but random hackery + in linux/drivers/net likes it. + */ + if (dev->flags & IFF_UP) + dev_close(dev); + + /* Shutdown queueing discipline. */ + dev_shutdown(dev); + + /* Notify protocols, that we are about to destroy + this device. They should clean all the things. + */ + notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev); + + /* + * Flush the multicast chain + */ + dev_mc_discard(dev); + + /* To avoid pointers looking to nowhere, + we wait for end of critical section */ + dev_lock_wait(); + } + + /* And unlink it from device chain. */ + for (dp = &dev_base; (d=*dp) != NULL; dp=&d->next) { + if (d == dev) { + *dp = d->next; + d->next = NULL; + if (dev->destructor) + dev->destructor(dev); + return 0; + } + } + return -ENODEV; } + /* * Initialize the DEV module. 
At boot time this walks the device list and * unhooks any devices that fail to initialise (normally hardware not @@ -1606,14 +1573,15 @@ int dev_new_index() * */ extern int lance_init(void); -extern int pi_init(void); extern int bpq_init(void); extern int scc_init(void); extern void sdla_setup(void); extern void dlci_setup(void); -extern int pt_init(void); +extern int dmascc_init(void); extern int sm_init(void); -extern int baycom_init(void); +extern int baycom_ser_fdx_init(void); +extern int baycom_ser_hdx_init(void); +extern int baycom_par_init(void); extern int lapbeth_init(void); extern void arcnet_init(void); @@ -1641,6 +1609,8 @@ __initfunc(int net_dev_init(void)) { struct device *dev, **dp; + pktsched_init(); + /* * Initialise the packet receive queue. */ @@ -1660,18 +1630,16 @@ __initfunc(int net_dev_init(void)) * * Some devices want to be initialized early.. */ + #if defined(CONFIG_LANCE) lance_init(); #endif -#if defined(CONFIG_PI) - pi_init(); -#endif #if defined(CONFIG_SCC) scc_init(); #endif -#if defined(CONFIG_PT) - pt_init(); -#endif +#if defined(CONFIG_DMASCC) + dmascc_init(); +#endif #if defined(CONFIG_BPQETHER) bpq_init(); #endif @@ -1681,8 +1649,14 @@ __initfunc(int net_dev_init(void)) #if defined(CONFIG_SDLA) sdla_setup(); #endif -#if defined(CONFIG_BAYCOM) - baycom_init(); +#if defined(CONFIG_BAYCOM_PAR) + baycom_par_init(); +#endif +#if defined(CONFIG_BAYCOM_SER_FDX) + baycom_ser_fdx_init(); +#endif +#if defined(CONFIG_BAYCOM_SER_HDX) + baycom_ser_hdx_init(); #endif #if defined(CONFIG_SOUNDMODEM) sm_init(); @@ -1706,6 +1680,7 @@ __initfunc(int net_dev_init(void)) slhc_install(); #endif + /* * Add the devices. 
* If the call to dev->init fails, the dev is removed @@ -1716,11 +1691,7 @@ __initfunc(int net_dev_init(void)) dp = &dev_base; while ((dev = *dp) != NULL) { - int i; - for (i = 0; i < DEV_NUMBUFFS; i++) { - skb_queue_head_init(dev->buffs + i); - } - + dev->iflink = -1; if (dev->init && dev->init(dev)) { /* @@ -1732,6 +1703,9 @@ __initfunc(int net_dev_init(void)) { dp = &dev->next; dev->ifindex = dev_new_index(); + if (dev->iflink == -1) + dev->iflink = dev->ifindex; + dev_init_scheduler(dev); } } @@ -1745,18 +1719,13 @@ __initfunc(int net_dev_init(void)) #endif /* CONFIG_PROC_FS */ #endif /* CONFIG_NET_RADIO */ - /* - * Initialise net_alias engine - * - * - register net_alias device notifier - * - register proc entries: /proc/net/alias_types - * /proc/net/aliases - */ + init_bh(NET_BH, net_bh); -#ifdef CONFIG_NET_ALIAS - net_alias_init(); + dev_boot_phase = 0; + +#ifdef CONFIG_IP_PNP + ip_auto_config(); #endif - init_bh(NET_BH, net_bh); return 0; } diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c index 4aa6cbb0c..eaa1bd058 100644 --- a/net/core/dev_mcast.c +++ b/net/core/dev_mcast.c @@ -42,7 +42,6 @@ #include <linux/skbuff.h> #include <net/sock.h> #include <net/arp.h> -#include <linux/net_alias.h> /* @@ -70,19 +69,6 @@ void dev_mc_upload(struct device *dev) return; /* - * An aliased device should end up with the combined - * multicast list of all its aliases. - * Really, multicasting with logical interfaces is very - * subtle question. Now we DO forward multicast packets - * to logical interfcases, that doubles multicast - * traffic but allows mrouted to work. 
- * Alas, mrouted does not understand aliases even - * in 4.4BSD --ANK - */ - - dev = net_alias_main_dev(dev); - - /* * Devices with no set multicast don't get set */ @@ -99,7 +85,6 @@ void dev_mc_upload(struct device *dev) void dev_mc_delete(struct device *dev, void *addr, int alen, int all) { struct dev_mc_list **dmi; - dev = net_alias_main_dev(dev); for(dmi=&dev->mc_list;*dmi!=NULL;dmi=&(*dmi)->next) { @@ -136,8 +121,6 @@ void dev_mc_add(struct device *dev, void *addr, int alen, int newonly) { struct dev_mc_list *dmi; - dev = net_alias_main_dev(dev); - for(dmi=dev->mc_list;dmi!=NULL;dmi=dmi->next) { if(memcmp(dmi->dmi_addr,addr,dmi->dmi_addrlen)==0 && dmi->dmi_addrlen==alen) @@ -165,12 +148,12 @@ void dev_mc_add(struct device *dev, void *addr, int alen, int newonly) void dev_mc_discard(struct device *dev) { - if (net_alias_is(dev)) - return; while(dev->mc_list!=NULL) { struct dev_mc_list *tmp=dev->mc_list; dev->mc_list=dev->mc_list->next; + if (tmp->dmi_users) + printk("dev_mc_discard: multicast leakage! dmi_users=%d\n", tmp->dmi_users); kfree_s(tmp,sizeof(*tmp)); } dev->mc_count=0; diff --git a/net/core/iovec.c b/net/core/iovec.c index 9bc21ffc5..10aa7a4cc 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -192,69 +192,78 @@ int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, * * ip_build_xmit must ensure that when fragmenting only the last * call to this function will be unaligned also. - * - * FIXME: add an error handling path when a copy/checksum from - * user space failed because of a invalid pointer. 
*/ -unsigned int csum_partial_copy_fromiovecend(unsigned char *kdata, - struct iovec *iov, int offset, - int len, int csum) +int csum_partial_copy_fromiovecend(unsigned char *kdata, + struct iovec *iov, int offset, + int len, int *csump) { - __u32 partial; - __u32 partial_cnt = 0; + int partial_cnt = 0; + int err = 0; + int csum; - while(offset>0) - { - if (offset > iov->iov_len) - { - offset -= iov->iov_len; + do { + int copy = iov->iov_len - offset; - } - else - { - u8 *base; - int copy; + if (copy >= 0) { + u8 *base = iov->iov_base + offset; - base = iov->iov_base + offset; - copy = min(len, iov->iov_len - offset); - offset = 0; + /* Normal case (single iov component) is fastly detected */ + if (len <= copy) { + *csump = csum_partial_copy_from_user(base, kdata, + len, *csump, &err); + return err; + } partial_cnt = copy % 4; - if (partial_cnt) - { + if (partial_cnt) { copy -= partial_cnt; - copy_from_user(&partial, base + copy, - partial_cnt); + err |= copy_from_user(kdata+copy, base+copy, partial_cnt); } - /* - * FIXME: add exception handling to the - * csum functions and set *err when an - * exception occurs. - */ - csum = csum_partial_copy_fromuser(base, kdata, - copy, csum); + *csump = csum_partial_copy_from_user(base, kdata, + copy, *csump, &err); len -= copy + partial_cnt; kdata += copy + partial_cnt; + iov++; + break; } - iov++; - } + iov++; + offset = -copy; + } while (offset > 0); + + csum = *csump; while (len>0) { u8 *base = iov->iov_base; - int copy=min(len, iov->iov_len); + int copy = min(len, iov->iov_len); + /* There is a remnant from previous iov. */ if (partial_cnt) { int par_len = 4 - partial_cnt; - copy_from_user(&partial, base + partial_cnt, par_len); - csum = csum_partial((u8*) &partial, 4, csum); + /* iov component is too short ... 
*/ + if (par_len > copy) { + err |= copy_from_user(kdata, base, copy); + base += copy; + partial_cnt += copy; + kdata += copy; + len -= copy; + iov++; + if (len) + continue; + *csump = csum_partial(kdata-partial_cnt, partial_cnt, csum); + return err; + } + err |= copy_from_user(kdata, base, par_len); + csum = csum_partial(kdata-partial_cnt, 4, csum); base += par_len; copy -= par_len; + len -= par_len; + kdata += par_len; partial_cnt = 0; } @@ -264,16 +273,15 @@ unsigned int csum_partial_copy_fromiovecend(unsigned char *kdata, if (partial_cnt) { copy -= partial_cnt; - copy_from_user(&partial, base + copy, - partial_cnt); + err |= copy_from_user(kdata+copy, base + copy, partial_cnt); } } - csum = csum_partial_copy_fromuser(base, kdata, copy, csum); + csum = csum_partial_copy_from_user(base, kdata, copy, csum, &err); len -= copy + partial_cnt; kdata += copy + partial_cnt; iov++; } - - return csum; + *csump = csum; + return err; } diff --git a/net/core/net_alias.c b/net/core/net_alias.c index 807c2e935..e69de29bb 100644 --- a/net/core/net_alias.c +++ b/net/core/net_alias.c @@ -1,1464 +0,0 @@ -/* - * NET_ALIAS network device aliasing module. - * - * - * Version: @(#)net_alias.c 0.43 12/20/95 - * - * Authors: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> - * Marcelo Fabian Roccasalva, <mfroccas@raiz.uncu.edu.ar> - * - * Features: - * - AF_ independent: net_alias_type objects - * - AF_INET optimized - * - ACTUAL alias devices inserted in dev chain - * - fast hashed alias address lookup - * - net_alias_type objs registration/unreg., module-ables. - * - /proc/net/aliases & /proc/net/alias_types entries - * Fixes: - * JJC : several net_alias_type func. renamed. - * JJC : net_alias_type object methods now pass - * *this. - * JJC : xxx_rcv device selection based on <src,dst> - * addrs - * Andreas Schultz : Kerneld support. - * - * FIXME: - * - User calls sleep/wake_up locking. 
- * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include <linux/config.h> -#include <linux/types.h> -#include <linux/errno.h> -#include <linux/netdevice.h> -#include <linux/notifier.h> -#include <linux/if.h> -#include <linux/inet.h> -#include <linux/in.h> -#include <linux/proc_fs.h> -#include <linux/stat.h> -#include <linux/init.h> - -#include <linux/net_alias.h> - -#ifdef CONFIG_KERNELD -#include <linux/kerneld.h> -#endif - -/* - * Only allow the following flags to pass from main device to aliases - */ - -#define NET_ALIAS_IFF_MASK (IFF_UP|IFF_RUNNING|IFF_NOARP|IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_MULTICAST) - -static struct net_alias_type * nat_getbytype(int type); -static int nat_attach_chg(struct net_alias_type *nat, int delta); -static int nat_bind(struct net_alias_type *nat,struct net_alias *alias, struct sockaddr *sa); -static int nat_unbind(struct net_alias_type *nat, struct net_alias *alias); - -static int net_alias_devinit(struct device *dev); -static int net_alias_hard_start_xmit(struct sk_buff *skb, struct device *dev); -static int net_alias_devsetup(struct net_alias *alias, struct net_alias_type *nat, struct sockaddr *sa); -static struct net_alias **net_alias_slow_findp(struct net_alias_info *alias_info, struct net_alias *alias); -static struct device *net_alias_dev_create(struct device *main_dev, int slot, int *err, struct sockaddr *sa, void *data); -static struct device *net_alias_dev_delete(struct device *main_dev, int slot, int *err); -static void net_alias_free(struct device *dev); - -/* - * net_alias_type base array, will hold net_alias_type obj hashed list - * heads. 
- */ - -struct net_alias_type *nat_base[16]; - - -/* - * Get net_alias_type ptr by type - */ - -extern __inline__ struct net_alias_type *nat_getbytype(int type) -{ - struct net_alias_type *nat; - for(nat = nat_base[type & 0x0f]; nat ; nat = nat->next) - { - if (nat->type == type) - return nat; - } - return NULL; -} - - -/* - * Get addr32 representation (pre-hashing) of address. - * If NULL nat->get_addr32, assume sockaddr_in struct (IP-ish). - */ - -extern __inline__ __u32 nat_addr32(struct net_alias_type *nat, struct sockaddr *sa) -{ - if (nat->get_addr32) - return nat->get_addr32(nat, sa); - else - return (*(struct sockaddr_in *)sa).sin_addr.s_addr; -} - - -/* - * Hashing code for alias_info->hash_tab entries - * 4 bytes -> 1/2 byte using xor complemented by af - */ - -extern __inline__ unsigned HASH(__u32 addr, int af) -{ - unsigned tmp = addr ^ (addr>>16); /* 4 -> 2 */ - tmp ^= (tmp>>8); /* 2 -> 1 */ - return (tmp^(tmp>>4)^af) & 0x0f; /* 1 -> 1/2 */ -} - - -/* - * get hash key for supplied net alias type and address - * nat must be !NULL - * the purpose here is to map a net_alias_type and a generic - * address to a hash code. 
- */ - -extern __inline__ int nat_hash_key(struct net_alias_type *nat, struct sockaddr *sa) -{ - return HASH(nat_addr32(nat,sa), sa->sa_family); -} - - -/* - * Change net_alias_type number of attachments (bindings) - */ - -static int nat_attach_chg(struct net_alias_type *nat, int delta) -{ - unsigned long flags; - int n_at; - if (!nat) - return -1; - save_flags(flags); - cli(); - n_at = nat->n_attach + delta; - if (n_at < 0) - { - restore_flags(flags); - printk(KERN_WARNING - "net_alias: tried to set n_attach < 0 for (family==%d) nat object.\n", - nat->type); - return -1; - } - nat->n_attach = n_at; - restore_flags(flags); - return 0; -} - - -/* - * Bind alias to its type (family) object and call initialization hook - */ - -extern __inline__ int nat_bind(struct net_alias_type *nat, - struct net_alias *alias, struct sockaddr *sa) -{ - if (nat->alias_init_1) - nat->alias_init_1(nat, alias, sa); - return nat_attach_chg(nat, +1); -} - - -/* - * Unbind alias from type object and call alias destructor - */ - -extern __inline__ int nat_unbind(struct net_alias_type *nat, - struct net_alias *alias) -{ - if (nat->alias_done_1) - nat->alias_done_1(nat, alias); - return nat_attach_chg(nat, -1); -} - - -/* - * Compare device address with given. if NULL nat->dev_addr_chk, - * compare dev->pa_addr with (sockaddr_in) 32 bits address (IP-ish) - */ - -static __inline__ int nat_dev_addr_chk_1(struct net_alias_type *nat, - struct device *dev, struct sockaddr *sa) -{ - if (nat->dev_addr_chk) - return nat->dev_addr_chk(nat, dev, sa); - else - return (dev->pa_addr == (*(struct sockaddr_in *)sa).sin_addr.s_addr); -} - - -/* - * Alias device init() - * do nothing. - */ - -static int net_alias_devinit(struct device *dev) -{ -#ifdef ALIAS_USER_LAND_DEBUG - printk("net_alias_devinit(%s) called.\n", dev->name); -#endif - return 0; -} - - -/* - * 2 options for multicast: - * 1) fake it for aliases. - * 2) allow aliases and actual device to set it. 
- * current choice: option 1 - */ -static void net_alias_setmulticast(struct device *dev) -{ -} - - -/* - * Hard_start_xmit() should not be called. - * ignore ... but shout!. - */ - -static int net_alias_hard_start_xmit(struct sk_buff *skb, struct device *dev) -{ - printk(KERN_WARNING "net_alias: net_alias_hard_start_xmit() for %s called (ignored)!!\n", dev->name); - dev_kfree_skb(skb, FREE_WRITE); - return 0; -} - - -static int net_alias_open(struct device * dev) -{ - return 0; -} - -static int net_alias_close(struct device * dev) -{ - return 0; -} - -/* - * setups a new (alias) device - */ - -static int net_alias_devsetup(struct net_alias *alias, - struct net_alias_type *nat, struct sockaddr *sa) -{ - struct device *main_dev; - struct device *dev; - int family; - int i; - - /* - * - * generic device setup based on main_dev info - * - * FIXME: is NULL bitwise 0 for all Linux platforms? - */ - - main_dev = alias->main_dev; - dev = &alias->dev; - memset(dev, '\0', sizeof(struct device)); - family = (sa)? 
sa->sa_family : main_dev->family; - - dev->alias_info = NULL; /* no aliasing recursion */ - dev->my_alias = alias; /* point to alias */ - dev->name = alias->name; - dev->type = main_dev->type; - dev->open = net_alias_open; - dev->stop = net_alias_close; - if (main_dev->set_multicast_list) - dev->set_multicast_list = net_alias_setmulticast; - dev->hard_header_len = main_dev->hard_header_len; - memcpy(dev->broadcast, main_dev->broadcast, MAX_ADDR_LEN); - memcpy(dev->dev_addr, main_dev->dev_addr, MAX_ADDR_LEN); - dev->addr_len = main_dev->addr_len; - dev->init = net_alias_devinit; - dev->hard_start_xmit = net_alias_hard_start_xmit; - dev->flags = main_dev->flags & NET_ALIAS_IFF_MASK & ~IFF_UP; - dev->ifindex = dev_new_index(); - - /* - * Only makes sense if same family (arguable) - */ - - if (family == main_dev->family) - { - dev->metric = main_dev->metric; - dev->mtu = main_dev->mtu; - dev->pa_alen = main_dev->pa_alen; - dev->hard_header = main_dev->hard_header; - dev->hard_header_cache = main_dev->hard_header_cache; - dev->header_cache_update = main_dev->header_cache_update; - dev->rebuild_header = main_dev->rebuild_header; - } - - /* - * Fill in the generic fields of the device structure. - * not actually used, avoids some dev.c #ifdef's - */ - - for (i = 0; i < DEV_NUMBUFFS; i++) - skb_queue_head_init(&dev->buffs[i]); - - dev->family = family; - return 0; -} - - -/* - * Slow alias find (parse the whole hash_tab) - * returns: alias' pointer address - */ - -static struct net_alias **net_alias_slow_findp(struct net_alias_info - *alias_info, struct net_alias *alias) -{ - unsigned idx, n_aliases; - struct net_alias **aliasp; - - /* - * For each alias_info's hash_tab entry, for every alias ... 
- */ - - n_aliases = alias_info->n_aliases; - for (idx=0; idx < 16 ; idx++) - { - for (aliasp = &alias_info->hash_tab[idx];*aliasp; - aliasp = &(*aliasp)->next) - { - if (*aliasp == alias) - return aliasp; - else - if (--n_aliases == 0) - break; /* faster give up */ - } - } - return NULL; -} - - -/* - * Create alias device for main_dev with given slot num. - * if sa==NULL will create a same_family alias device. - */ - -static struct device *net_alias_dev_create(struct device *main_dev, int slot, - int *err, struct sockaddr *sa, void *data) -{ - struct net_alias_info *alias_info; - struct net_alias *alias, **aliasp; - struct net_alias_type *nat; - struct device *dev; - unsigned long flags; - int family; - __u32 addr32; - - /* FIXME: lock */ - - alias_info = main_dev->alias_info; - - /* - * If NULL address given, take family from main_dev - */ - - family = (sa)? sa->sa_family : main_dev->family; - - /* - * Check if wanted family has a net_alias_type object registered - */ - - nat = nat_getbytype(family); - if (!nat) - { -#ifdef CONFIG_KERNELD - char modname[20]; - sprintf (modname,"netalias-%d", family); - request_module(modname); - - nat = nat_getbytype(family); - if (!nat) - { -#endif - printk(KERN_WARNING "net_alias_dev_create(%s:%d): unregistered family==%d\n", - main_dev->name, slot, family); - /* *err = -EAFNOSUPPORT; */ - *err = -EINVAL; - return NULL; -#ifdef CONFIG_KERNELD - } -#endif - } - - /* - * Do not allow creation over downed devices - */ - - *err = -EIO; - - if (! 
(main_dev->flags & IFF_UP) ) - return NULL; - - /* - * If first alias, must also create alias_info - */ - - *err = -ENOMEM; - - if (!alias_info) - { - alias_info = kmalloc(sizeof(struct net_alias_info), GFP_KERNEL); - if (!alias_info) - return NULL; /* ENOMEM */ - memset(alias_info, 0, sizeof(struct net_alias_info)); - } - - if (!(alias = kmalloc(sizeof(struct net_alias), GFP_KERNEL))) - return NULL; /* ENOMEM */ - - memset(alias, 0, sizeof(struct net_alias)); - alias->slot = slot; - alias->main_dev = main_dev; - alias->nat = nat; - alias->next = NULL; - alias->data = data; - sprintf(alias->name, "%s:%d", main_dev->name, slot); - - /* - * Initialise alias' device structure - */ - - net_alias_devsetup(alias, nat, sa); - - dev = &alias->dev; - - save_flags(flags); - cli(); - - /* - * bind alias to its object type - * nat_bind calls nat->alias_init_1 - */ - - nat_bind(nat, alias, sa); - - /* - * If no address passed, take from device (could have been - * set by nat->alias_init_1) - */ - - addr32 = (sa)? nat_addr32(nat, sa) : alias->dev.pa_addr; - - /* - * Store hash key in alias: will speed-up rehashing and deletion - */ - - alias->hash = HASH(addr32, family); - - /* - * Insert alias in hashed linked list - */ - - aliasp = &alias_info->hash_tab[alias->hash]; - alias->next = *aliasp; - *aliasp = alias; - - /* - * If first alias ... 
- */ - - if (!alias_info->n_aliases++) - { - alias_info->taildev = main_dev; - main_dev->alias_info = alias_info; - } - - /* - * add device at tail (just after last main_dev alias) - */ - - dev->next = alias_info->taildev->next; - alias_info->taildev->next = dev; - alias_info->taildev = dev; - restore_flags(flags); - return dev; -} - - -/* - * Delete one main_dev alias (referred by its slot num) - */ - -static struct device *net_alias_dev_delete(struct device *main_dev, int slot, - int *err) -{ - struct net_alias_info *alias_info; - struct net_alias *alias, **aliasp; - struct device *dev; - unsigned n_aliases; - unsigned long flags; - struct net_alias_type *nat; - struct device *prevdev; - - /* FIXME: lock */ - *err = -ENODEV; - - if (main_dev == NULL) - return NULL; - - /* - * Does main_dev have aliases? - */ - - alias_info = main_dev->alias_info; - if (!alias_info) - return NULL; /* ENODEV */ - - n_aliases = alias_info->n_aliases; - - /* - * Find device that holds the same slot number (could also - * be strcmp() ala dev_get). - */ - - for (prevdev=main_dev, alias = NULL; - prevdev->next && n_aliases; prevdev = prevdev->next) - { - if (!(alias = prevdev->next->my_alias)) - { - printk(KERN_ERR "net_alias_dev_delete(): incorrect non-alias device after maindev\n"); - continue; /* or should give up? 
*/ - } - if (alias->slot == slot) - break; - alias = NULL; - n_aliases--; - } - - if (!alias) - return NULL; /* ENODEV */ - - dev = &alias->dev; - - /* - * Find alias hashed entry - */ - - for(aliasp = &alias_info->hash_tab[alias->hash]; *aliasp; - aliasp = &(*aliasp)->next) - { - if(*aliasp == alias) - break; - } - - /* - * If not found (???), try a full search - */ - - if (*aliasp != alias) - { - if ((aliasp = net_alias_slow_findp(alias_info, alias))) - printk(KERN_WARNING "net_alias_dev_delete(%s): bad hashing recovered\n", alias->name); - else - { - printk(KERN_ERR "net_alias_dev_delete(%s): unhashed alias!\n",alias->name); - return NULL; /* ENODEV */ - } - } - nat = alias->nat; - - save_flags(flags); - cli(); - - /* - * Unbind alias from alias_type obj. - */ - - nat_unbind(nat, alias); - - /* - * Is alias at tail? - */ - - if ( dev == alias_info->taildev ) - alias_info->taildev = prevdev; - - /* - * Unlink and close device - */ - prevdev->next = dev->next; - dev_close(dev); - - /* - * Unlink alias - */ - - *aliasp = (*aliasp)->next; - if (--alias_info->n_aliases == 0) /* last alias */ - main_dev->alias_info = NULL; - - restore_flags(flags); - - /* - * Now free structures - */ - - kfree_s(alias, sizeof(struct net_alias)); - if (main_dev->alias_info == NULL) - kfree_s(alias_info, sizeof(struct net_alias_info)); - - /* - * Deletion ok (*err=0), NULL device returned. - */ - - *err = 0; - return NULL; -} - -/* - * Free all main device aliasing stuff - * will be called on dev_close(main_dev) - */ - -static void net_alias_free(struct device *main_dev) -{ - struct net_alias_info *alias_info; - struct net_alias *alias; - struct net_alias_type *nat; - struct device *dev; - unsigned long flags; - - /* - * Do I really have aliases? 
- */ - - if (!(alias_info = main_dev->alias_info)) - return; - - /* - * Fast device link "short-circuit": set main_dev->next to - * device after last alias - */ - - save_flags(flags); - cli(); - - dev = main_dev->next; - main_dev->next = alias_info->taildev->next; - main_dev->alias_info = NULL; - alias_info->taildev->next = NULL; - - restore_flags(flags); - - /* - * Loop over alias devices, free and dev_close() - */ - - while (dev) - { - if (net_alias_is(dev)) - { - alias = dev->my_alias; - if (alias->main_dev == main_dev) - { - /* - * unbind alias from alias_type object - */ - nat = alias->nat; - if (nat) - { - nat_unbind(nat, alias); - } /* else error/printk ??? */ - - dev_close(dev); - dev = dev->next; - - kfree_s(alias, sizeof(struct net_alias)); - continue; - } - else - printk(KERN_ERR "net_alias_free(%s): '%s' is not my alias\n", - main_dev->name, alias->name); - } - else - { - printk(KERN_ERR "net_alias_free(%s): found a non-alias after device!\n", - main_dev->name); - } - dev = dev->next; - } - - kfree_s(alias_info, sizeof(alias_info)); - return; -} - -/* - * dev_get() with added alias naming magic. - */ - -struct device *net_alias_dev_get(char *dev_name, int aliasing_ok, int *err, - struct sockaddr *sa, void *data) -{ - struct device *dev; - char *sptr,*eptr; - int slot = 0; - int delete = 0; - - *err = -ENODEV; - if ((dev=dev_get(dev_name))) - return dev; - - /* - * Want alias naming magic? 
- */ - - if (!aliasing_ok) - return NULL; - - if (!dev_name || !*dev_name) - return NULL; - - /* - * Find the first ':' , must be followed by, at least, 1 char - */ - - sptr=strchr(dev_name,':'); - if (sptr==NULL || !sptr[1]) - return NULL; - -#if 0 - for (sptr=dev_name ; *sptr ; sptr++) - if(*sptr==':') - break; - if (!*sptr || !*(sptr+1)) - return NULL; -#endif - /* - * Seems to be an alias name, fetch main device - */ - - *sptr='\0'; - if (!(dev=dev_get(dev_name))) - return NULL; - *sptr++=':'; - - /* - * Fetch slot number - */ - - slot = simple_strtoul(sptr,&eptr,10); - if (slot >= NET_ALIAS_MAX_SLOT) - return NULL; - - /* - * If last char is '-', it is a deletion request - */ - - if (eptr[0] == '-' && !eptr[1] ) - delete++; - else if (eptr[0]) - return NULL; - - /* - * Well... let's work. - */ - - if (delete) - return net_alias_dev_delete(dev, slot, err); - else - return net_alias_dev_create(dev, slot, err, sa, data); -} - - -/* - * Rehash alias device with address supplied. - */ - -int net_alias_dev_rehash(struct device *dev, struct sockaddr *sa) -{ - struct net_alias_info *alias_info; - struct net_alias *alias, **aliasp; - struct device *main_dev; - unsigned long flags; - struct net_alias_type *o_nat, *n_nat; - unsigned n_hash; - - /* - * Defensive ... - */ - - if (dev == NULL) - return -1; - if ( (alias = dev->my_alias) == NULL ) - return -1; - - if (!sa) - { - printk(KERN_ERR "net_alias_rehash(): NULL sockaddr passed\n"); - return -1; - } - - /* - * Defensive. should not happen. - */ - - if ( (main_dev = alias->main_dev) == NULL ) - { - printk(KERN_ERR "net_alias_rehash for %s: NULL maindev\n", alias->name); - return -1; - } - - /* - * Defensive. should not happen. - */ - - if (!(alias_info=main_dev->alias_info)) - { - printk(KERN_ERR "net_alias_rehash for %s: NULL alias_info\n", alias->name); - return -1; - } - - /* - * Will the request also change device family? 
- */ - - o_nat = alias->nat; - if (!o_nat) - { - printk(KERN_ERR "net_alias_rehash(%s): unbound alias.\n", alias->name); - return -1; - } - - /* - * Point to new alias_type obj. - */ - - if (o_nat->type == sa->sa_family) - n_nat = o_nat; - else - { - n_nat = nat_getbytype(sa->sa_family); - if (!n_nat) - { - printk(KERN_ERR "net_alias_rehash(%s): unreg family==%d.\n", alias->name, sa->sa_family); - return -1; - } - } - - /* - * New hash key. if same as old AND same type (family) return; - */ - - n_hash = nat_hash_key(n_nat, sa); - if (n_hash == alias->hash && o_nat == n_nat ) - return 0; - - /* - * Find alias in hashed list - */ - - for (aliasp = &alias_info->hash_tab[alias->hash]; *aliasp; - aliasp = &(*aliasp)->next) - { - if (*aliasp == alias) - break; - } - - /* - * Not found (???). try a full search - */ - - if(!*aliasp) - { - if ((aliasp = net_alias_slow_findp(alias_info, alias))) - { - printk(KERN_WARNING - "net_alias_rehash(%s): bad hashing recovered\n", alias->name); - } - else - { - printk(KERN_ERR "net_alias_rehash(%s): unhashed alias!\n", alias->name); - return -1; - } - } - - save_flags(flags); - cli(); - - /* - * If type (family) changed, unlink from old type object (o_nat) - * Will call o_nat->alias_done_1() - */ - - if (o_nat != n_nat) - nat_unbind(o_nat, alias); - - /* - * If diff hash key, change alias position in hashed list - */ - - if (n_hash != alias->hash) - { - *aliasp = (*aliasp)->next; - alias->hash = n_hash; - aliasp = &alias_info->hash_tab[n_hash]; - alias->next = *aliasp; - *aliasp = alias; - } - - /* - * If type (family) changed link to new type object (n_nat) - * will call n_nat->alias_init_1() - */ - - if (o_nat != n_nat) - nat_bind(n_nat, alias, sa); - - restore_flags(flags); - return 0; -} - - - - -/* - * Implements /proc/net/alias_types entry - * Shows net_alias_type objects registered. 
- */ - -int net_alias_types_getinfo(char *buffer, char **start, off_t offset, int length, int dummy) -{ - off_t pos=0, begin=0; - int len=0; - struct net_alias_type *nat; - unsigned idx; - len=sprintf(buffer,"type name n_attach\n"); - for (idx=0 ; idx < 16 ; idx++) - { - for (nat = nat_base[idx]; nat ; nat = nat->next) - { - len += sprintf(buffer+len, "%-7d %-15s %-7d\n", - nat->type, nat->name,nat->n_attach); - pos=begin+len; - if(pos<offset) - { - len=0; - begin=pos; - } - if(pos>offset+length) - break; - } - } - *start=buffer+(offset-begin); - len-=(offset-begin); - if(len>length) - len=length; - return len; -} - - -/* - * Implements /proc/net/aliases entry, shows alias devices. - * calls alias nat->alias_print_1 if not NULL and formats everything - * to a fixed rec. size without using local (stack) buffers - * - */ - -#define NET_ALIASES_RECSIZ 64 - -int net_alias_getinfo(char *buffer, char **start, off_t offset, - int length, int dummy) -{ - off_t pos=0, begin=0; - int len=0; - int dlen; - struct net_alias_type *nat; - struct net_alias *alias; - struct device *dev; - - len=sprintf(buffer,"%-*s\n",NET_ALIASES_RECSIZ-1,"device family address"); - for (dev = dev_base; dev ; dev = dev->next) - { - if (net_alias_is(dev)) - { - alias = dev->my_alias; - nat = alias->nat; - dlen=sprintf(buffer+len, "%-16s %-6d ", alias->name, alias->dev.family); - - /* - * Call alias_type specific print function. 
- */ - - if (nat->alias_print_1) - dlen += nat->alias_print_1(nat, alias, buffer+len+dlen, NET_ALIASES_RECSIZ - dlen); - else - dlen += sprintf(buffer+len+dlen, "-"); - - /* - * Fill with spaces if needed - */ - - if (dlen < NET_ALIASES_RECSIZ) - memset(buffer+len+dlen, ' ', NET_ALIASES_RECSIZ - dlen); - - /* - * Truncate to NET_ALIASES_RECSIZ - */ - - len += NET_ALIASES_RECSIZ; - buffer[len-1] = '\n'; - - pos=begin+len; - if(pos<offset) - { - len=0; - begin=pos; - } - if(pos>offset+length) - break; - } - } - *start=buffer+(offset-begin); - len-=(offset-begin); - if(len>length) - len=length; - return len; -} - - -/* - * Notifier for devices events - */ - -int net_alias_device_event(struct notifier_block *this, unsigned long event, void *ptr) -{ - struct device *dev = ptr; - - if (event == NETDEV_DOWN) - { -#ifdef ALIAS_USER_LAND_DEBUG - printk("net_alias: NETDEV_DOWN for %s received\n", dev->name); -#endif - if (net_alias_has(dev)) - net_alias_free(dev); - } - - if (event == NETDEV_UP) - { -#ifdef ALIAS_USER_LAND_DEBUG - printk("net_alias: NETDEV_UP for %s received\n", dev->name); -#endif - dev->alias_info = 0; - } - - return NOTIFY_DONE; -} - - -/* - * Device aliases address comparison workhorse - * No checks for nat and alias_info, must be !NULL - */ - -extern __inline__ struct device *nat_addr_chk(struct net_alias_type *nat, - struct net_alias_info *alias_info, struct sockaddr *sa, int flags_on, int flags_off) -{ - struct net_alias *alias; - for(alias = alias_info->hash_tab[nat_hash_key(nat,sa)]; - alias; alias = alias->next) - { - if (alias->dev.family != sa->sa_family) - continue; - - /* - * Nat_dev_addr_chk_1 will call type specific address - * cmp function. - */ - - if (alias->dev.flags & flags_on && - !(alias->dev.flags & flags_off) && - nat_dev_addr_chk_1(nat,&alias->dev,sa)) - return &alias->dev; - } - return NULL; -} - -/* - * Nat_addr_chk enough for protocols whose addr is (fully) stored at - * pa_addr. 
Note that nat pointer is ignored because of static comparison. - */ - -extern __inline__ struct device *nat_addr_chk32(struct net_alias_type *nat, - struct net_alias_info *alias_info, int family, __u32 addr32, - int flags_on, int flags_off) -{ - struct net_alias *alias; - for (alias=alias_info->hash_tab[HASH(addr32,family)]; - alias; alias=alias->next) - { - if (alias->dev.family != family) - continue; - /* - * "hard" (static) comparison between addr32 and pa_addr. - */ - - if (alias->dev.flags & flags_on && !(alias->dev.flags & flags_off) && - addr32 == alias->dev.pa_addr) - return &alias->dev; - } - return NULL; -} - -/* - * Returns alias device with specified address AND flags_on AND flags_off, - * else NULL. - * Intended for main devices. - */ - -struct device *net_alias_dev_chk(struct device *main_dev, - struct sockaddr *sa,int flags_on, int flags_off) -{ - struct net_alias_info *alias_info = main_dev->alias_info; - struct net_alias_type *nat; - - /* - * Only if main_dev has aliases - */ - - if (!alias_info) - return NULL; - - /* - * Get alias_type object for sa->sa_family. - */ - - nat = nat_getbytype(sa->sa_family); - if (!nat) - return NULL; - - return nat_addr_chk(nat, alias_info, sa, flags_on, flags_off); -} - -/* - * net_alias_dev_chk enough for protocols whose addr is (fully) stored - * at pa_addr. - */ - -struct device *net_alias_dev_chk32(struct device *main_dev, int family, - __u32 addr32, int flags_on, int flags_off) -{ - struct net_alias_info *alias_info = main_dev->alias_info; - - /* - * only if main_dev has aliases - */ - - if (!alias_info) - return NULL; - return nat_addr_chk32(NULL, alias_info, family, addr32, - flags_on, flags_off); -} - - -/* - * Select closest (main or alias) device to <src,dst> addresses given. If - * there is no further info available, return main_dev (for easier - * calling arrangement). 
- * - * Should be called early at xxx_rcv() time for device selection - */ - -struct device *net_alias_dev_rcv_sel(struct device *main_dev, - struct sockaddr *sa_src, struct sockaddr *sa_dst) -{ - int family; - struct net_alias_type *nat; - struct net_alias_info *alias_info; - struct device *dev; - - if (main_dev == NULL) - return NULL; - - /* - * If not aliased, don't bother any more - */ - - if ((alias_info = main_dev->alias_info) == NULL) - return main_dev; - - /* - * Find out family - */ - - family = (sa_src)? sa_src->sa_family : - ((sa_dst)? sa_dst->sa_family : AF_UNSPEC); - - if (family == AF_UNSPEC) - return main_dev; - - /* - * Get net_alias_type object for this family - */ - - if ( (nat = nat_getbytype(family)) == NULL ) - return main_dev; - - /* - * First step: find out if dst addr is main_dev's or one of its - * aliases' - */ - - if (sa_dst) - { - if (nat_dev_addr_chk_1(nat, main_dev,sa_dst)) - return main_dev; - - dev = nat_addr_chk(nat, alias_info, sa_dst, IFF_UP, 0); - - if (dev != NULL) - return dev; - } - - /* - * Second step: find the rcv addr 'closest' alias through nat - * method call - */ - - if ( sa_src == NULL || nat->dev_select == NULL) - return main_dev; - - dev = nat->dev_select(nat, main_dev, sa_src); - - if (dev == NULL || dev->family != family) - return main_dev; - - /* - * Dev ok only if it is alias of main_dev - */ - - dev = net_alias_is(dev)? - ( (dev->my_alias->main_dev == main_dev)? dev : NULL) : NULL; - - /* - * Do not return NULL. - */ - - return (dev)? dev : main_dev; - -} - -/* - * dev_rcv_sel32: dev_rcv_sel for 'pa_addr' protocols. 
- */ - -struct device *net_alias_dev_rcv_sel32(struct device *main_dev, int family, - __u32 src, __u32 dst) -{ - struct net_alias_type *nat; - struct net_alias_info *alias_info; - struct sockaddr_in sin_src; - struct device *dev; - - if (main_dev == NULL) - return NULL; - - /* - * If not aliased, don't bother any more - */ - - if ((alias_info = main_dev->alias_info) == NULL) - return main_dev; - - /* - * Early return if dst is main_dev's address - */ - - if (dst == main_dev->pa_addr) - return main_dev; - - if (family == AF_UNSPEC) - return main_dev; - - /* - * Get net_alias_type object for this family - */ - - if ( (nat = nat_getbytype(family)) == NULL ) - return main_dev; - - /* - * First step: find out if dst address one of main_dev aliases' - */ - - if (dst) - { - dev = nat_addr_chk32(nat, alias_info, family, dst, IFF_UP, 0); - if (dev) - return dev; - } - - /* - * Second step: find the rcv addr 'closest' alias through nat - * method call - */ - - if ( src == 0 || nat->dev_select == NULL) - return main_dev; - - sin_src.sin_family = family; - sin_src.sin_addr.s_addr = src; - - dev = nat->dev_select(nat, main_dev, (struct sockaddr *)&sin_src); - - if (dev == NULL || dev->family != family) - return main_dev; - - /* - * Dev ok only if it is alias of main_dev - */ - - dev = net_alias_is(dev)? - ( (dev->my_alias->main_dev == main_dev)? dev : NULL) : NULL; - - /* - * Do not return NULL. - */ - - return (dev)? 
dev : main_dev; -} - - -/* - * Device event hook - */ - -static struct notifier_block net_alias_dev_notifier = -{ - net_alias_device_event, - NULL, - 0 -}; - -#ifndef ALIAS_USER_LAND_DEBUG -#ifdef CONFIG_PROC_FS -static struct proc_dir_entry proc_net_alias_types = { - PROC_NET_ALIAS_TYPES, 11, "alias_types", - S_IFREG | S_IRUGO, 1, 0, 0, - 0, &proc_net_inode_operations, - net_alias_types_getinfo -}; -static struct proc_dir_entry proc_net_aliases = { - PROC_NET_ALIASES, 7, "aliases", - S_IFREG | S_IRUGO, 1, 0, 0, - 0, &proc_net_inode_operations, - net_alias_getinfo -}; -#endif -#endif - -/* - * Net_alias initialisation called from net_dev_init(). - */ - -__initfunc(void net_alias_init(void)) -{ - - /* - * Register device events notifier - */ - - register_netdevice_notifier(&net_alias_dev_notifier); - - /* - * Register /proc/net entries - */ - -#ifndef ALIAS_USER_LAND_DEBUG -#ifdef CONFIG_PROC_FS - proc_net_register(&proc_net_alias_types); - proc_net_register(&proc_net_aliases); -#endif -#endif - -} - -/* - * Net_alias type object registering func. - */ - -int register_net_alias_type(struct net_alias_type *nat, int type) -{ - unsigned hash; - unsigned long flags; - if (!nat) - { - printk(KERN_ERR "register_net_alias_type(): NULL arg\n"); - return -EINVAL; - } - nat->type = type; - nat->n_attach = 0; - hash = nat->type & 0x0f; - save_flags(flags); - cli(); - nat->next = nat_base[hash]; - nat_base[hash] = nat; - restore_flags(flags); - return 0; -} - -/* - * Net_alias type object unreg. - */ - -int unregister_net_alias_type(struct net_alias_type *nat) -{ - struct net_alias_type **natp; - unsigned hash; - unsigned long flags; - - if (!nat) - { - printk(KERN_ERR "unregister_net_alias_type(): NULL arg\n"); - return -EINVAL; - } - - /* - * Only allow unregistration if it has no attachments - */ - - if (nat->n_attach) - { - printk(KERN_ERR "unregister_net_alias_type(): has %d attachments. 
failed\n", - nat->n_attach); - return -EINVAL; - } - hash = nat->type & 0x0f; - save_flags(flags); - cli(); - for (natp = &nat_base[hash]; *natp ; natp = &(*natp)->next) - { - if (nat==(*natp)) - { - *natp = nat->next; - restore_flags(flags); - return 0; - } - } - restore_flags(flags); - printk(KERN_ERR "unregister_net_alias_type(type=%d): not found!\n", nat->type); - return -EINVAL; -} - diff --git a/net/core/scm.c b/net/core/scm.c index e5fa793a7..5a6d24c40 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -205,25 +205,25 @@ error: return err; } -void put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) +int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) { struct cmsghdr *cm = (struct cmsghdr*)msg->msg_control; + struct cmsghdr cmhdr; int cmlen = CMSG_LEN(len); int err; if (cm==NULL || msg->msg_controllen < sizeof(*cm)) { msg->msg_flags |= MSG_CTRUNC; - return; + return 0; /* XXX: return error? check spec. */ } if (msg->msg_controllen < cmlen) { msg->msg_flags |= MSG_CTRUNC; cmlen = msg->msg_controllen; } - err = put_user(level, &cm->cmsg_level); - if (!err) - err = put_user(type, &cm->cmsg_type); - if (!err) - err = put_user(cmlen, &cm->cmsg_len); + cmhdr.cmsg_level = level; + cmhdr.cmsg_type = type; + cmhdr.cmsg_len = cmlen; + err = copy_to_user(cm, &cmhdr, sizeof cmhdr); if (!err) err = copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr)); if (!err) { @@ -231,6 +231,7 @@ void put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) msg->msg_control += cmlen; msg->msg_controllen -= cmlen; } + return err; } void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 06c321e4f..6baf37c03 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -176,7 +176,7 @@ struct sk_buff *alloc_skb(unsigned int size,int priority) skb->dst = NULL; skb->destructor = NULL; memset(skb->cb, 0, sizeof(skb->cb)); - skb->priority = SOPRI_NORMAL; + 
skb->priority = 0; atomic_inc(&net_skbcount); atomic_set(&skb->users, 1); diff --git a/net/core/sock.c b/net/core/sock.c index 65cee3b62..725474887 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -75,6 +75,7 @@ * protocol private data. * Steve Whitehouse: Added various other default routines * common to several socket families. + * Chris Evans : Call suser() check last on F_SETOWN * * To Fix: * @@ -101,6 +102,7 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/interrupt.h> +#include <linux/poll.h> #include <asm/uaccess.h> #include <asm/system.h> @@ -143,6 +145,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, int valbool; int err; struct linger ling; + struct ifreq req; int ret = 0; /* @@ -241,7 +244,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, break; case SO_PRIORITY: - if (val >= 0 && val < DEV_NUMBUFFS) + if (val >= 0 && val <= 7) sk->priority = val; else return(-EINVAL); @@ -317,6 +320,46 @@ int sock_setsockopt(struct socket *sock, int level, int optname, return -EINVAL; break; #endif + case SO_BINDTODEVICE: + /* Bind this socket to a particular device like "eth0", + * as specified in an ifreq structure. If the device + * is "", socket is NOT bound to a device. + */ + + if (!valbool) { + sk->bound_dev_if = 0; + } + else { + if (copy_from_user(&req, optval, sizeof(req)) < 0) + return -EFAULT; + + /* Remove any cached route for this socket. 
*/ + if (sk->dst_cache) { + ip_rt_put((struct rtable*)sk->dst_cache); + sk->dst_cache = NULL; + } + + if (req.ifr_ifrn.ifrn_name[0] == '\0') { + sk->bound_dev_if = 0; + } + else { + struct device *dev = dev_get(req.ifr_ifrn.ifrn_name); + if (!dev) + return -EINVAL; + sk->bound_dev_if = dev->ifindex; + if (sk->daddr) { + int ret; + ret = ip_route_output((struct rtable**)&sk->dst_cache, + sk->daddr, sk->saddr, + sk->ip_tos, sk->bound_dev_if); + if (ret) + return ret; + } + } + } + return 0; + + /* We implement the SO_SNDLOWAT etc to not be settable (1003.1g 5.3) */ default: @@ -627,7 +670,7 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, unsigne produce annoying no free page messages still.... */ skb = sock_wmalloc(sk, size, 0 , GFP_BUFFER); if(!skb) - skb=sock_wmalloc(sk, fallback, 0, GFP_KERNEL); + skb=sock_wmalloc(sk, fallback, 0, sk->allocation); } /* @@ -669,7 +712,7 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, unsigne * In any case I'd delete this check at all, or * change it to: */ - if (atomic_read(&sk->wmem_alloc) + size >= sk->sndbuf) + if (atomic_read(&sk->wmem_alloc) >= sk->sndbuf) #endif { sk->socket->flags &= ~SO_NOSPACE; @@ -896,8 +939,9 @@ int sock_no_fcntl(struct socket *sock, unsigned int cmd, unsigned long arg) * way to make sure that you can't send a sigurg to * another process. 
*/ - if (!suser() && current->pgrp != -arg && - current->pid != arg) return(-EPERM); + if (current->pgrp != -arg && + current->pid != arg && + !suser()) return(-EPERM); sk->proc = arg; return(0); case F_GETOWN: @@ -967,7 +1011,6 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->allocation = GFP_KERNEL; sk->rcvbuf = sysctl_rmem_default*2; sk->sndbuf = sysctl_wmem_default*2; - sk->priority = SOPRI_NORMAL; sk->state = TCP_CLOSE; sk->zapped = 1; sk->socket = sock; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index bdc6b37fd..47417a27a 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -232,6 +232,13 @@ unsigned short eth_type_trans(struct sk_buff *skb, struct device *dev) return htons(ETH_P_802_2); } +int eth_header_parse(struct sk_buff *skb, unsigned char *haddr) +{ + struct ethhdr *eth = skb->mac.ethernet; + memcpy(haddr, eth->h_source, ETH_ALEN); + return ETH_ALEN; +} + int eth_header_cache(struct dst_entry *dst, struct neighbour *neigh, struct hh_cache *hh) { diff --git a/net/ipv4/Config.in b/net/ipv4/Config.in index 3a5ac3b04..ea50576ab 100644 --- a/net/ipv4/Config.in +++ b/net/ipv4/Config.in @@ -2,6 +2,25 @@ # IP configuration # bool 'IP: multicasting' CONFIG_IP_MULTICAST +bool 'IP: advanced router' CONFIG_IP_ADVANCED_ROUTER +if [ "$CONFIG_IP_ADVANCED_ROUTER" = "y" ]; then + define_bool CONFIG_RTNETLINK y + bool 'IP: policy routing' CONFIG_IP_MULTIPLE_TABLES + bool 'IP: equal cost multipath' CONFIG_IP_ROUTE_MULTIPATH + bool 'IP: use TOS value as routing key' CONFIG_IP_ROUTE_TOS + bool 'IP: verbose route monitoring' CONFIG_IP_ROUTE_VERBOSE + bool 'IP: large routing tables' CONFIG_IP_ROUTE_LARGE_TABLES + if [ "$CONFIG_IP_MULTIPLE_TABLES" = "y" ]; then + bool 'IP: fast network address translation' CONFIG_IP_ROUTE_NAT + fi +fi +bool 'IP: kernel level autoconfiguration' CONFIG_IP_PNP +if [ "$CONFIG_IP_PNP" = "y" ]; then + bool ' BOOTP support' CONFIG_IP_PNP_BOOTP + bool ' RARP support' CONFIG_IP_PNP_RARP +# not yet ready.. 
+# bool ' ARP support' CONFIG_IP_PNP_ARP +fi if [ "$CONFIG_FIREWALL" = "y" ]; then bool 'IP: firewalling' CONFIG_IP_FIREWALL if [ "$CONFIG_IP_FIREWALL" = "y" ]; then @@ -9,23 +28,29 @@ if [ "$CONFIG_FIREWALL" = "y" ]; then bool 'IP: firewall packet netlink device' CONFIG_IP_FIREWALL_NETLINK fi bool 'IP: firewall packet logging' CONFIG_IP_FIREWALL_VERBOSE - bool 'IP: masquerading' CONFIG_IP_MASQUERADE - if [ "$CONFIG_IP_MASQUERADE" != "n" ]; then - comment 'Protocol-specific masquerading support will be built as modules.' - fi bool 'IP: transparent proxy support' CONFIG_IP_TRANSPARENT_PROXY bool 'IP: always defragment' CONFIG_IP_ALWAYS_DEFRAG fi fi bool 'IP: accounting' CONFIG_IP_ACCT +bool 'IP: masquerading' CONFIG_IP_MASQUERADE +if [ "$CONFIG_IP_MASQUERADE" != "n" ]; then + comment 'Protocol-specific masquerading support will be built as modules.' +fi bool 'IP: optimize as router not host' CONFIG_IP_ROUTER tristate 'IP: tunneling' CONFIG_NET_IPIP +tristate 'IP: GRE tunnels over IP' CONFIG_NET_IPGRE if [ "$CONFIG_IP_MULTICAST" = "y" ]; then + if [ "$CONFIG_NET_IPGRE" != "n" ]; then + bool 'IP: broadcast GRE over IP' CONFIG_NET_IPGRE_BROADCAST + fi bool 'IP: multicast routing' CONFIG_IP_MROUTE + if [ "$CONFIG_IP_MROUTE" = "y" ]; then + bool 'IP: PIM-SM version 1 support' CONFIG_IP_PIMSM_V1 + bool 'IP: PIM-SM version 2 support' CONFIG_IP_PIMSM_V2 + fi fi -if [ "$CONFIG_NET_ALIAS" = "y" ]; then - tristate 'IP: aliasing support' CONFIG_IP_ALIAS -fi +tristate 'IP: aliasing support' CONFIG_IP_ALIAS if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then if [ "$CONFIG_NETLINK" = "y" ]; then bool 'IP: ARP daemon support (EXPERIMENTAL)' CONFIG_ARPD @@ -33,9 +58,9 @@ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then fi bool 'IP: TCP syncookie support (not enabled per default) ' CONFIG_SYN_COOKIES comment '(it is safe to leave these untouched)' -bool 'IP: PC/TCP compatibility mode' CONFIG_INET_PCTCP +#bool 'IP: PC/TCP compatibility mode' CONFIG_INET_PCTCP tristate 'IP: Reverse ARP' 
CONFIG_INET_RARP -bool 'IP: Path MTU Discovery (normally enabled)' CONFIG_PATH_MTU_DISCOVERY +#bool 'IP: Path MTU Discovery (normally enabled)' CONFIG_PATH_MTU_DISCOVERY #bool 'IP: Disable NAGLE algorithm (normally enabled)' CONFIG_TCP_NAGLE_OFF bool 'IP: Drop source routed frames' CONFIG_IP_NOSR bool 'IP: Allow large windows (not recommended if <16Mb of memory)' CONFIG_SKB_LARGE diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 2428ccc55..759def7ea 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -8,17 +8,25 @@ # Note 2! The CFLAGS definition is now in the main makefile... O_TARGET := ipv4.o -IPV4_OBJS := utils.o route.o proc.o timer.o protocol.o packet.o \ +IPV4_OBJS := utils.o route.o proc.o timer.o protocol.o \ ip_input.o ip_fragment.o ip_forward.o ip_options.o \ ip_output.o ip_sockglue.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o\ raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o ip_fw.o \ - sysctl_net_ipv4.o fib.o ip_nat_dumb.o + sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o IPV4X_OBJS := MOD_LIST_NAME := IPV4_MODULES M_OBJS := +ifeq ($(CONFIG_IP_MULTIPLE_TABLES),y) +IPV4_OBJS += fib_rules.o +endif + +ifeq ($(CONFIG_IP_ROUTE_NAT),y) +IPV4_OBJS += ip_nat_dumb.o +endif + ifeq ($(CONFIG_IP_MROUTE),y) IPV4_OBJS += ipmr.o endif @@ -32,10 +40,18 @@ else endif ifeq ($(CONFIG_NET_IPIP),y) -IPV4_OBJS += ipip.o +IPV4X_OBJS += ipip.o else ifeq ($(CONFIG_NET_IPIP),m) - M_OBJS += ipip.o + MX_OBJS += ipip.o + endif +endif + +ifeq ($(CONFIG_NET_IPGRE),y) +IPV4X_OBJS += ip_gre.o +else + ifeq ($(CONFIG_NET_IPGRE),m) + MX_OBJS += ip_gre.o endif endif @@ -44,19 +60,15 @@ IPV4X_OBJS += ip_masq.o ip_masq_app.o M_OBJS += ip_masq_ftp.o ip_masq_irc.o ip_masq_raudio.o ip_masq_quake.o endif -ifeq ($(CONFIG_IP_ALIAS),y) -IPV4_OBJS += ip_alias.o -else - ifeq ($(CONFIG_IP_ALIAS),m) - M_OBJS += ip_alias.o - endif -endif - ifeq ($(CONFIG_SYN_COOKIES),y) IPV4_OBJS += syncookies.o # module not supported, because it would be too messy. 
endif +ifeq ($(CONFIG_IP_PNP),y) +IPV4_OBJS += ipconfig.o +endif + ifdef CONFIG_INET O_OBJS := $(IPV4_OBJS) OX_OBJS := $(IPV4X_OBJS) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 4bf4bf166..ca3ff3213 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -5,7 +5,7 @@ * * AF_INET protocol family socket handler. * - * Version: @(#)af_inet.c (from sock.c) 1.0.17 06/02/93 + * Version: $Id: af_inet.c,v 1.58 1997/10/29 20:27:21 kuznet Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -77,6 +77,7 @@ #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/init.h> +#include <linux/poll.h> #include <asm/uaccess.h> #include <asm/system.h> @@ -94,14 +95,15 @@ #include <net/sock.h> #include <net/raw.h> #include <net/icmp.h> +#include <net/ipip.h> #include <net/inet_common.h> #include <linux/ip_fw.h> +#ifdef CONFIG_IP_MROUTE +#include <linux/mroute.h> +#endif #ifdef CONFIG_IP_MASQUERADE #include <net/ip_masq.h> #endif -#ifdef CONFIG_IP_ALIAS -#include <net/ip_alias.h> -#endif #ifdef CONFIG_BRIDGE #include <net/br.h> #endif @@ -115,13 +117,13 @@ #define min(a,b) ((a)<(b)?(a):(b)) extern int sysctl_core_destroy_delay; -extern struct proto packet_prot; + extern int raw_get_info(char *, char **, off_t, int, int); extern int snmp_get_info(char *, char **, off_t, int, int); extern int afinet_get_info(char *, char **, off_t, int, int); extern int tcp_get_info(char *, char **, off_t, int, int); extern int udp_get_info(char *, char **, off_t, int, int); - +extern void ip_mc_drop_socket(struct sock *sk); #ifdef CONFIG_DLCI extern int dlci_ioctl(unsigned int, void*); @@ -165,9 +167,8 @@ static __inline__ void kill_sk_now(struct sock *sk) /* No longer exists. */ del_from_prot_sklist(sk); - /* This is gross, but needed for SOCK_PACKET -DaveM */ - if(sk->prot->unhash) - sk->prot->unhash(sk); + /* Remove from protocol hash chains. 
*/ + sk->prot->unhash(sk); if(sk->opt) kfree(sk->opt); @@ -321,13 +322,24 @@ static int inet_create(struct socket *sock, int protocol) struct sock *sk; struct proto *prot; + /* Compatibility */ + if (sock->type == SOCK_PACKET) { + static int warned; + if (net_families[AF_PACKET]==NULL) + return -ESOCKTNOSUPPORT; + if (!warned++) + printk(KERN_INFO "%s uses obsolete (AF_INET,SOCK_PACKET)\n", current->comm); + return net_families[AF_PACKET]->create(sock, protocol); + } + sock->state = SS_UNCONNECTED; sk = sk_alloc(AF_INET, GFP_KERNEL); if (sk == NULL) goto do_oom; - /* Note for tcp that also wiped the dummy_th block for us. */ - if(sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET) { + switch (sock->type) { + case SOCK_STREAM: + /* Note for tcp that also wiped the dummy_th block for us. */ if (protocol && protocol != IPPROTO_TCP) goto free_and_noproto; protocol = IPPROTO_TCP; @@ -338,7 +350,10 @@ static int inet_create(struct socket *sock, int protocol) sk->ip_pmtudisc = IP_PMTUDISC_WANT; prot = &tcp_prot; sock->ops = &inet_stream_ops; - } else if(sock->type == SOCK_DGRAM) { + break; + case SOCK_SEQPACKET: + goto free_and_badtype; + case SOCK_DGRAM: if (protocol && protocol != IPPROTO_UDP) goto free_and_noproto; protocol = IPPROTO_UDP; @@ -346,21 +361,26 @@ static int inet_create(struct socket *sock, int protocol) sk->ip_pmtudisc = IP_PMTUDISC_DONT; prot=&udp_prot; sock->ops = &inet_dgram_ops; - } else if(sock->type == SOCK_RAW || sock->type == SOCK_PACKET) { + break; + case SOCK_RAW: if (!suser()) goto free_and_badperm; if (!protocol) goto free_and_noproto; - prot = (sock->type == SOCK_RAW) ? 
&raw_prot : &packet_prot; + prot = &raw_prot; sk->reuse = 1; sk->ip_pmtudisc = IP_PMTUDISC_DONT; sk->num = protocol; sock->ops = &inet_dgram_ops; - } else { + if (protocol == IPPROTO_RAW) + sk->ip_hdrincl = 1; + break; + default: goto free_and_badtype; } sock_init_data(sock,sk); + sk->destruct = NULL; sk->zapped=0; @@ -378,11 +398,6 @@ static int inet_create(struct socket *sock, int protocol) sk->ip_ttl=ip_statistics.IpDefaultTTL; - if(sk->type==SOCK_RAW && protocol==IPPROTO_RAW) - sk->ip_hdrincl=1; - else - sk->ip_hdrincl=0; - sk->ip_mc_loop=1; sk->ip_mc_ttl=1; sk->ip_mc_index=0; @@ -398,11 +413,10 @@ static int inet_create(struct socket *sock, int protocol) * creation time automatically * shares. */ - sk->dummy_th.source = ntohs(sk->num); + sk->dummy_th.source = htons(sk->num); - /* This is gross, but needed for SOCK_PACKET -DaveM */ - if(sk->prot->hash) - sk->prot->hash(sk); + /* Add to protocol hash chains. */ + sk->prot->hash(sk); add_to_prot_sklist(sk); } @@ -482,7 +496,7 @@ static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) unsigned short snum; int chk_addr_ret; - /* If the socket has its own bind function then use it. (RAW and PACKET) */ + /* If the socket has its own bind function then use it. (RAW) */ if(sk->prot->bind) return sk->prot->bind(sk, uaddr, addr_len); @@ -503,12 +517,12 @@ static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (snum < PROT_SOCK && !suser()) return(-EACCES); - chk_addr_ret = __ip_chk_addr(addr->sin_addr.s_addr); - if (addr->sin_addr.s_addr != 0 && chk_addr_ret != IS_MYADDR && - chk_addr_ret != IS_MULTICAST && chk_addr_ret != IS_BROADCAST) { + chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr); + if (addr->sin_addr.s_addr != 0 && chk_addr_ret != RTN_LOCAL && + chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) { #ifdef CONFIG_IP_TRANSPARENT_PROXY /* Superuser may bind to any address to allow transparent proxying. 
*/ - if(!suser()) + if(chk_addr_ret != RTN_UNICAST || !suser()) #endif return -EADDRNOTAVAIL; /* Source address MUST be ours! */ } @@ -521,7 +535,7 @@ static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * which case the sending device address is used. */ sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr; - if(chk_addr_ret == IS_MULTICAST || chk_addr_ret == IS_BROADCAST) + if(chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) sk->saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. */ @@ -529,7 +543,7 @@ static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) return -EADDRINUSE; sk->num = snum; - sk->dummy_th.source = ntohs(snum); + sk->dummy_th.source = htons(snum); sk->daddr = 0; sk->dummy_th.dest = 0; sk->prot->rehash(sk); @@ -868,9 +882,6 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCDARP: case SIOCGARP: case SIOCSARP: - case OLD_SIOCDARP: - case OLD_SIOCGARP: - case OLD_SIOCSARP: return(arp_ioctl(cmd,(void *) arg)); case SIOCDRARP: case SIOCGRARP: @@ -889,10 +900,12 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCSIFNETMASK: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: + case SIOCSIFPFLAGS: + case SIOCGIFPFLAGS: + case SIOCSIFFLAGS: return(devinet_ioctl(cmd,(void *) arg)); case SIOCGIFCONF: case SIOCGIFFLAGS: - case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: case SIOCGIFMETRIC: @@ -908,9 +921,10 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCGIFMAP: case SIOCSIFSLAVE: case SIOCGIFSLAVE: - case SIOGIFINDEX: - case SIOGIFNAME: - case SIOCGIFCOUNT: + case SIOCGIFINDEX: + case SIOCGIFNAME: + case SIOCGIFCOUNT: + case SIOCSIFHWBROADCAST: return(dev_ioctl(cmd,(void *) arg)); case SIOCGIFBR: @@ -1105,6 +1119,16 @@ __initfunc(void inet_proto_init(struct net_proto *pro)) icmp_init(&inet_family_ops); + /* I wish inet_add_protocol had no constructor 
hook... + I had to move IPIP from net/ipv4/protocol.c :-( --ANK + */ +#ifdef CONFIG_NET_IPIP + ipip_init(); +#endif +#ifdef CONFIG_NET_IPGRE + ipgre_init(); +#endif + /* * Set the firewalling up */ @@ -1114,21 +1138,13 @@ __initfunc(void inet_proto_init(struct net_proto *pro)) #ifdef CONFIG_IP_MASQUERADE ip_masq_init(); #endif - + /* * Initialise the multicast router */ #if defined(CONFIG_IP_MROUTE) ip_mr_init(); #endif - - /* - * Initialise AF_INET alias type (register net_alias_type) - */ - -#if defined(CONFIG_IP_ALIAS) - ip_alias_init(); -#endif #ifdef CONFIG_INET_RARP rarp_ioctl_hook = rarp_ioctl; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 472f64811..26cc21977 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1,5 +1,7 @@ /* linux/net/inet/arp.c * + * Version: $Id: arp.c,v 1.56 1997/11/24 12:51:47 freitag Exp $ + * * Copyright (C) 1994 by Florian La Roche * * This module implements the Address Resolution Protocol ARP (RFC 826), @@ -58,6 +60,8 @@ * folded into the mainstream FDDI code. * Ack spit, Linus how did you allow that * one in... + * Jes Sorensen : Make FDDI work again in 2.1.x and + * clean up the APFDDI & gen. FDDI bits. 
*/ /* RFC1122 Status: @@ -105,7 +109,6 @@ #include <net/netrom.h> #endif #endif -#include <linux/net_alias.h> #ifdef CONFIG_ARPD #include <net/netlink.h> #endif @@ -251,6 +254,7 @@ static atomic_t arp_unres_size = ATOMIC_INIT(0); #ifdef CONFIG_ARPD static int arpd_not_running; static int arpd_stamp; +struct sock *arpd_sk; #endif static void arp_check_expire (unsigned long); @@ -428,8 +432,6 @@ static void arpd_send(int req, u32 addr, struct device * dev, char *ha, static __inline__ void arpd_update(u32 ip, struct device *dev, char *ha) { - if (arpd_not_running) - return; arpd_send(ARPD_UPDATE, ip, dev, ha, jiffies); } @@ -440,8 +442,6 @@ static __inline__ void arpd_update(u32 ip, struct device *dev, char *ha) static __inline__ void arpd_lookup(u32 addr, struct device * dev) { - if (arpd_not_running) - return; arpd_send(ARPD_LOOKUP, addr, dev, NULL, 0); } @@ -451,13 +451,11 @@ static __inline__ void arpd_lookup(u32 addr, struct device * dev) static __inline__ void arpd_flush(struct device * dev) { - if (arpd_not_running) - return; arpd_send(ARPD_FLUSH, 0, dev, NULL, 0); } -static int arpd_callback(int minor, struct sk_buff *skb) +static int arpd_callback(struct sk_buff *skb, struct sock *sk) { struct device * dev; struct arpd_request *retreq; @@ -484,7 +482,9 @@ static int arpd_callback(int minor, struct sk_buff *skb) /* * Invalid mapping: drop it and send ARP broadcast. 
*/ - arp_send(ARPOP_REQUEST, ETH_P_ARP, retreq->ip, dev, dev->pa_addr, NULL, + arp_send(ARPOP_REQUEST, ETH_P_ARP, retreq->ip, dev, + inet_select_addr(dev, retreq->ip, RT_SCOPE_LINK), + NULL, dev->dev_addr, NULL); } else @@ -658,8 +658,8 @@ static void arp_check_expire(unsigned long dummy) entry->timer.expires = jiffies + ARP_CONFIRM_TIMEOUT; add_timer(&entry->timer); arp_send(ARPOP_REQUEST, ETH_P_ARP, entry->ip, - dev, dev->pa_addr, entry->u.neigh.ha, - dev->dev_addr, NULL); + dev, inet_select_addr(dev, entry->ip, RT_SCOPE_LINK), + entry->u.neigh.ha, dev->dev_addr, NULL); #if RT_CACHE_DEBUG >= 2 printk("arp_expire: %08x requires confirmation\n", entry->ip); #endif @@ -710,7 +710,8 @@ static void arp_expire_request (unsigned long arg) /* Set new timer. */ entry->timer.expires = jiffies + sysctl_arp_res_time; add_timer(&entry->timer); - arp_send(ARPOP_REQUEST, ETH_P_ARP, entry->ip, dev, dev->pa_addr, + arp_send(ARPOP_REQUEST, ETH_P_ARP, entry->ip, dev, + inet_select_addr(dev, entry->ip, RT_SCOPE_LINK), entry->retries > sysctl_arp_max_tries ? 
entry->u.neigh.ha : NULL, dev->dev_addr, NULL); return; @@ -749,7 +750,8 @@ static void arp_expire_request (unsigned long arg) entry->timer.expires = jiffies + sysctl_arp_dead_res_time; add_timer(&entry->timer); - arp_send(ARPOP_REQUEST, ETH_P_ARP, entry->ip, dev, dev->pa_addr, + arp_send(ARPOP_REQUEST, ETH_P_ARP, entry->ip, dev, + inet_select_addr(dev, entry->ip, RT_SCOPE_LINK), NULL, dev->dev_addr, NULL); return; } @@ -797,9 +799,7 @@ static struct arp_table * arp_alloc(int how) entry = (struct arp_table *)neigh_alloc(sizeof(struct arp_table), &arp_neigh_ops); - - if (entry != NULL) - { + if (entry != NULL) { atomic_set(&entry->u.neigh.refcnt, 1); if (how) @@ -953,19 +953,19 @@ static __inline__ struct arp_table *arp_lookup(u32 paddr, struct device * dev) for (entry = arp_tables[HASH(paddr)]; entry != NULL; entry = entry->u.next) if (entry->ip == paddr && entry->u.neigh.dev == dev) - return entry; - return NULL; + break; + return entry; } static int arp_set_predefined(int addr_hint, unsigned char * haddr, u32 paddr, struct device * dev) { switch (addr_hint) { - case IS_MYADDR: + case RTN_LOCAL: printk(KERN_DEBUG "ARP: arp called for own IP address\n"); memcpy(haddr, dev->dev_addr, dev->addr_len); return 1; - case IS_MULTICAST: + case RTN_MULTICAST: if(dev->type==ARPHRD_ETHER || dev->type==ARPHRD_IEEE802 || dev->type==ARPHRD_FDDI) { @@ -985,7 +985,7 @@ static int arp_set_predefined(int addr_hint, unsigned char * haddr, u32 paddr, s * If a device does not support multicast broadcast the stuff (eg AX.25 for now) */ - case IS_BROADCAST: + case RTN_BROADCAST: memcpy(haddr, dev->broadcast, dev->addr_len); return 1; } @@ -1007,11 +1007,17 @@ static void arp_start_resolution(struct arp_table *entry) else #endif arp_send(ARPOP_REQUEST, ETH_P_ARP, entry->ip, dev, - dev->pa_addr, NULL, dev->dev_addr, NULL); + inet_select_addr(dev, entry->ip, RT_SCOPE_LINK), NULL, + dev->dev_addr, NULL); } /* * Create a new unresolved entry. 
+ * + * NOTE: Always make sure no possibility of sleeping is introduced here, + * since nearly all callers are inside of BH atomic. Don't let + * the arp_alloc() fool you, at neigh_alloc() it is using GFP_ATOMIC + * always. */ struct arp_table * arp_new_entry(u32 paddr, struct device *dev, struct sk_buff *skb) @@ -1049,7 +1055,6 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) struct device *dev = skb->dev; u32 paddr; struct arp_table *entry; - unsigned long hash; if (!skb->dst) { printk(KERN_DEBUG "arp_find called with dst==NULL\n"); @@ -1058,14 +1063,11 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) paddr = ((struct rtable*)skb->dst)->rt_gateway; - if (arp_set_predefined(__ip_chk_addr(paddr), haddr, paddr, dev)) { - if (skb) - skb->arp = 1; + if (arp_set_predefined(inet_addr_type(paddr), haddr, paddr, dev)) { + skb->arp = 1; return 0; } - hash = HASH(paddr); - start_bh_atomic(); /* @@ -1079,8 +1081,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) { entry->u.neigh.lastused = jiffies; memcpy(haddr, entry->u.neigh.ha, dev->addr_len); - if (skb) - skb->arp = 1; + skb->arp = 1; end_bh_atomic(); return 0; } @@ -1090,24 +1091,17 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) * queue the packet with the previous attempt */ - if (skb != NULL) - { - if (entry->last_updated) - { - if (entry->u.neigh.arp_queue.qlen < ARP_MAX_UNRES_PACKETS) - skb_queue_tail(&entry->u.neigh.arp_queue, skb); - else - kfree_skb(skb, FREE_WRITE); - } - /* - * If last_updated==0 host is dead, so - * drop skb's and set socket error. - */ + if (entry->last_updated) { + if (entry->u.neigh.arp_queue.qlen < ARP_MAX_UNRES_PACKETS) + skb_queue_tail(&entry->u.neigh.arp_queue, skb); else - { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); kfree_skb(skb, FREE_WRITE); - } + } else { + /* If last_updated==0 host is dead, so + * drop skb's and set socket error. 
+ */ + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); + kfree_skb(skb, FREE_WRITE); } end_bh_atomic(); return 1; @@ -1115,7 +1109,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) entry = arp_new_entry(paddr, dev, skb); - if (skb != NULL && !entry) + if (entry == NULL) kfree_skb(skb, FREE_WRITE); end_bh_atomic(); @@ -1129,12 +1123,13 @@ int arp_find_1(unsigned char *haddr, struct dst_entry *dst, struct device *dev = dst->dev; u32 paddr = rt->rt_gateway; struct arp_table *entry; - unsigned long hash; if (!neigh) { - if ((rt->rt_flags & RTF_MULTICAST) && - (dev->type==ARPHRD_ETHER || dev->type==ARPHRD_IEEE802)) + if (rt->rt_type == RTN_MULTICAST && + (dev->type == ARPHRD_ETHER || + dev->type == ARPHRD_IEEE802 || + dev->type == ARPHRD_FDDI)) { u32 taddr; haddr[0]=0x01; @@ -1148,12 +1143,12 @@ int arp_find_1(unsigned char *haddr, struct dst_entry *dst, haddr[3]=taddr&0x7f; return 1; } - if (rt->rt_flags & (RTF_BROADCAST|RTF_MULTICAST)) + if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) { memcpy(haddr, dev->broadcast, dev->addr_len); return 1; } - if (rt->rt_flags & RTF_LOCAL) + if (rt->rt_flags & RTCF_LOCAL) { printk(KERN_DEBUG "ARP: arp called for own IP address\n"); memcpy(haddr, dev->dev_addr, dev->addr_len); @@ -1162,8 +1157,6 @@ int arp_find_1(unsigned char *haddr, struct dst_entry *dst, return 0; } - hash = HASH(paddr); - start_bh_atomic(); entry = (struct arp_table*)neigh; @@ -1187,17 +1180,14 @@ struct neighbour* arp_find_neighbour(struct dst_entry *dst, int resolve) struct device *dev = rt->u.dst.dev; u32 paddr = rt->rt_gateway; struct arp_table *entry; - unsigned long hash; if (dst->ops->family != AF_INET) return NULL; if ((dev->flags & (IFF_LOOPBACK|IFF_NOARP)) || - (rt->rt_flags & (RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST))) + (rt->rt_flags & (RTCF_LOCAL|RTCF_BROADCAST|RTCF_MULTICAST))) return NULL; - hash = HASH(paddr); - start_bh_atomic(); /* @@ -1213,8 +1203,10 @@ struct neighbour* arp_find_neighbour(struct dst_entry *dst, int 
resolve) return (struct neighbour*)entry; } - if (!resolve) + if (!resolve) { + end_bh_atomic(); return NULL; + } entry = arp_new_entry(paddr, dev, NULL); @@ -1256,17 +1248,19 @@ void arp_send(int type, int ptype, u32 dest_ip, */ skb = alloc_skb(sizeof(struct arphdr)+ 2*(dev->addr_len+4) - + dev->hard_header_len, GFP_ATOMIC); + + dev->hard_header_len + 15, GFP_ATOMIC); if (skb == NULL) { printk(KERN_DEBUG "ARP: no memory to send an arp packet\n"); return; } - skb_reserve(skb, dev->hard_header_len); + + skb_reserve(skb, (dev->hard_header_len+15)&~15); + skb->nh.raw = skb->data; arp = (struct arphdr *) skb_put(skb,sizeof(struct arphdr) + 2*(dev->addr_len+4)); skb->arp = 1; skb->dev = dev; - skb->protocol = htons (ETH_P_ARP); + skb->protocol = __constant_htons (ETH_P_ARP); /* * Fill the device header for the ARP frame @@ -1295,7 +1289,7 @@ void arp_send(int type, int ptype, u32 dest_ip, arp->ar_pro = (dev->type != ARPHRD_AX25) ? htons(ETH_P_IP) : htons(AX25_P_IP); #endif #else - arp->ar_pro = htons(ETH_P_IP); + arp->ar_pro = __constant_htons(ETH_P_IP); #endif arp->ar_hln = dev->addr_len; arp->ar_pln = 4; @@ -1319,6 +1313,20 @@ void arp_send(int type, int ptype, u32 dest_ip, dev_queue_xmit(skb); } +static __inline__ int arp_check_published(u32 tip, struct device *dev) +{ + struct arp_table *entry; + + for (entry = arp_proxy_list; entry; entry = entry->u.next) { + if (!((entry->ip^tip)&entry->mask) && + ((!entry->u.neigh.dev && + (!(entry->flags & ATF_COM) || entry->hatype == dev->type)) + || entry->u.neigh.dev == dev) ) + break; + } + + return entry && !(entry->flags & ATF_DONTPUB); +} /* * Receive an arp request by the device layer. 
@@ -1331,6 +1339,7 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) struct rtable *rt; unsigned char *sha, *tha; u32 sip, tip; + u16 dev_type = dev->type; /* * The hardware length of the packet should match the hardware length @@ -1339,45 +1348,38 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) * is not from an IP number. We can't currently handle this, so toss * it. */ -#if defined(CONFIG_FDDI) || defined(CONFIG_AP1000) - if (dev->type == ARPHRD_FDDI) +#if defined(CONFIG_FDDI) + if (dev_type == ARPHRD_FDDI) { /* * According to RFC 1390, FDDI devices should accept ARP hardware types * of 1 (Ethernet). However, to be more robust, we'll accept hardware * types of either 1 (Ethernet) or 6 (IEEE 802.2). */ + if (arp->ar_hln != dev->addr_len || ((ntohs(arp->ar_hrd) != ARPHRD_ETHER) && (ntohs(arp->ar_hrd) != ARPHRD_IEEE802)) || dev->flags & IFF_NOARP || skb->pkt_type == PACKET_OTHERHOST || arp->ar_pln != 4) - { - kfree_skb(skb, FREE_READ); - return 0; - } + goto out; } else { if (arp->ar_hln != dev->addr_len || - dev->type != ntohs(arp->ar_hrd) || + dev_type != ntohs(arp->ar_hrd) || dev->flags & IFF_NOARP || skb->pkt_type == PACKET_OTHERHOST || arp->ar_pln != 4) - { - kfree_skb(skb, FREE_READ); - return 0; - } + goto out; } #else if (arp->ar_hln != dev->addr_len || - dev->type != ntohs(arp->ar_hrd) || + dev_type != ntohs(arp->ar_hrd) || dev->flags & IFF_NOARP || skb->pkt_type == PACKET_OTHERHOST || - arp->ar_pln != 4) { - kfree_skb(skb, FREE_READ); - return 0; - } + arp->ar_pln != 4) + goto out; #endif /* @@ -1387,24 +1389,18 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) * problem, so toss the packet. 
*/ - switch (dev->type) + switch (dev_type) { #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) case ARPHRD_AX25: if(arp->ar_pro != htons(AX25_P_IP)) - { - kfree_skb(skb, FREE_READ); - return 0; - } + goto out; break; #endif #if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) case ARPHRD_NETROM: if(arp->ar_pro != htons(AX25_P_IP)) - { - kfree_skb(skb, FREE_READ); - return 0; - } + goto out; break; #endif case ARPHRD_ETHER: @@ -1412,23 +1408,19 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) case ARPHRD_METRICOM: case ARPHRD_IEEE802: case ARPHRD_FDDI: + case ARPHRD_IPGRE: if(arp->ar_pro != htons(ETH_P_IP)) - { - kfree_skb(skb, FREE_READ); - return 0; - } + goto out; break; default: printk(KERN_ERR "ARP: dev->type mangled!\n"); - kfree_skb(skb, FREE_READ); - return 0; + goto out; } /* * Extract fields */ - sha=arp_ptr; arp_ptr += dev->addr_len; memcpy(&sip, arp_ptr, 4); @@ -1440,21 +1432,8 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) * Check for bad requests for 127.x.x.x and requests for multicast * addresses. If this is one such, delete it. */ - if (LOOPBACK(tip) || MULTICAST(tip)) { - kfree_skb(skb, FREE_READ); - return 0; - } - if (ip_route_input(skb, tip, sip, 0, dev)) { - kfree_skb(skb, FREE_READ); - return 0; - } - dev = skb->dev; - rt = (struct rtable*)skb->dst; - if (dev->type != ntohs(arp->ar_hrd) || dev->flags&IFF_NOARP || - rt->rt_flags&RTF_BROADCAST) { - kfree_skb(skb, FREE_READ); - return 0; - } + if (LOOPBACK(tip) || MULTICAST(tip)) + goto out; /* * Process entry. The idea here is we want to send a reply if it is a @@ -1472,31 +1451,31 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) * and in the case of requests for us we add the requester to the arp * cache. 
*/ - if (arp->ar_op == htons(ARPOP_REQUEST)) { - struct arp_table *entry; - - for (entry = arp_proxy_list; entry; entry = entry->u.next) { - if (!((entry->ip^tip)&entry->mask) && - ((!entry->u.neigh.dev && - (!(entry->flags & ATF_COM) || entry->hatype == dev->type)) - || entry->u.neigh.dev == dev) ) - break; - } - - if (entry && !(entry->flags & ATF_DONTPUB)) { - char *ha = (entry->flags & ATF_COM) ? entry->u.neigh.ha : dev->dev_addr; - - if (rt->rt_flags&(RTF_LOCAL|RTF_NAT) || - (!(rt->rt_flags&RTCF_DOREDIRECT) && - rt->u.dst.dev != dev)) - arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,ha,sha); - } + int addr_type; + struct in_device *in_dev = dev->ip_ptr; + + if (ip_route_input(skb, tip, sip, 0, dev)) + goto out; + rt = (struct rtable*)skb->dst; + addr_type = rt->rt_type; + + if (addr_type == RTN_LOCAL || (rt->rt_flags&RTCF_DNAT) || + (addr_type == RTN_UNICAST && rt->u.dst.dev != dev && + ((in_dev && IN_DEV_PROXY_ARP(in_dev) && IN_DEV_FORWARD(in_dev)) || + arp_check_published(tip, dev)))) + arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); + } else { + if (arp->ar_op != htons(ARPOP_REPLY) || + inet_addr_type(sip) != RTN_UNICAST) + goto out; } start_bh_atomic(); - arp_update(sip, sha, dev, 0, !RT_LOCALADDR(rt->rt_flags) && dev->type != ARPHRD_METRICOM); + arp_update(sip, sha, dev, 0, arp->ar_op == htons(ARPOP_REPLY)); end_bh_atomic(); + +out: kfree_skb(skb, FREE_READ); return 0; } @@ -1554,13 +1533,13 @@ int arp_req_set(struct arpreq *r, struct device * dev) if ((r->arp_flags & ATF_PERM) && !(r->arp_flags & ATF_COM)) return -EINVAL; - err = ip_route_output(&rt, ip, 0, 1, dev); + err = ip_route_output(&rt, ip, 0, 1, dev ? 
dev->ifindex : 0); if (err) return err; if (!dev) dev = rt->u.dst.dev; - if (rt->rt_flags&(RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST|RTCF_NAT)) { - if (rt->rt_flags&RTF_BROADCAST && + if (rt->rt_flags&(RTCF_LOCAL|RTCF_BROADCAST|RTCF_MULTICAST|RTCF_DNAT)) { + if (rt->rt_flags&RTCF_BROADCAST && dev->type == ARPHRD_METRICOM && r->arp_ha.sa_family == ARPHRD_METRICOM) { memcpy(dev->broadcast, r->arp_ha.sa_data, dev->addr_len); @@ -1578,7 +1557,7 @@ int arp_req_set(struct arpreq *r, struct device * dev) if (dev && r->arp_ha.sa_family != dev->type) return -EINVAL; - + start_bh_atomic(); if (!(r->arp_flags & ATF_PUBL)) @@ -1991,7 +1970,7 @@ __initfunc(void arp_init (void)) #endif #ifdef CONFIG_ARPD - netlink_attach(NETLINK_ARPD, arpd_callback); + arpd_sk = netlink_kernel_create(NETLINK_ARPD, arpd_callback); #endif } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index c12417c52..269361e35 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1,6 +1,8 @@ /* * NET3 IP device support routines. * + * Version: $Id: devinet.c,v 1.14 1997/10/10 22:40:44 davem Exp $ + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -13,9 +15,13 @@ * * Additional Authors: * Alan Cox, <gw4pts@gw4pts.ampr.org> + * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * Changes: + * Alexey Kuznetsov: pa_* fields are replaced with ifaddr lists. 
*/ -#include <linux/config.h> /* For CONFIG_IP_CLASSLESS */ +#include <linux/config.h> #include <asm/uaccess.h> #include <asm/system.h> @@ -34,72 +40,336 @@ #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> -#include <linux/if_arp.h> -#include <net/ip.h> -#include <net/route.h> -#include <net/protocol.h> -#include <net/tcp.h> #include <linux/skbuff.h> -#include <net/sock.h> -#include <net/arp.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> #include <linux/notifier.h> -#include <linux/net_alias.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> #ifdef CONFIG_KERNELD #include <linux/kerneld.h> #endif -extern struct notifier_block *netdev_chain; +#include <net/ip.h> +#include <net/route.h> +#include <net/ip_fib.h> -/* - * Determine a default network mask, based on the IP address. +#ifdef CONFIG_RTNETLINK +static void rtmsg_ifa(int event, struct in_ifaddr *); +#else +#define rtmsg_ifa(a,b) do { } while(0) +#endif + +static struct notifier_block *inetaddr_chain; +static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy); + + +int inet_ifa_count; +int inet_dev_count; + +static struct in_ifaddr * inet_alloc_ifa(void) +{ + struct in_ifaddr *ifa; + + ifa = kmalloc(sizeof(*ifa), GFP_KERNEL); + if (ifa) { + memset(ifa, 0, sizeof(*ifa)); + inet_ifa_count++; + } + + return ifa; +} + +static __inline__ void inet_free_ifa(struct in_ifaddr *ifa) +{ + kfree_s(ifa, sizeof(*ifa)); + inet_ifa_count--; +} + +struct in_device *inetdev_init(struct device *dev) +{ + struct in_device *in_dev; + + in_dev = kmalloc(sizeof(*in_dev), GFP_KERNEL); + if (!in_dev) + return NULL; + inet_dev_count++; + memset(in_dev, 0, sizeof(*in_dev)); + in_dev->dev = dev; + dev->ip_ptr = in_dev; + ip_mc_init_dev(in_dev); + return in_dev; +} + +static void inetdev_destroy(struct in_device *in_dev) +{ + struct in_ifaddr *ifa; + + ip_mc_destroy_dev(in_dev); + + while ((ifa = in_dev->ifa_list) != NULL) { + inet_del_ifa(in_dev, 
&in_dev->ifa_list, 0); + inet_free_ifa(ifa); + } + + in_dev->dev->ip_ptr = NULL; + kfree(in_dev); +} + +struct in_ifaddr * inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b) +{ + for_primary_ifa(in_dev) { + if (inet_ifa_match(a, ifa)) { + if (!b || inet_ifa_match(b, ifa)) + return ifa; + } + } endfor_ifa(in_dev); + return NULL; +} + +static void +inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy) +{ + struct in_ifaddr *ifa1 = *ifap; + struct in_ifaddr *ifa; + + /* 1. Unlink it */ + + *ifap = ifa1->ifa_next; + + /* 2. Deleting primary ifaddr forces deletion all secondaries */ + + if (!(ifa1->ifa_flags&IFA_F_SECONDARY)) { + while ((ifa=*ifap) != NULL) { + if (ifa1->ifa_mask != ifa->ifa_mask || + !inet_ifa_match(ifa1->ifa_address, ifa)) { + ifap = &ifa->ifa_next; + continue; + } + *ifap = ifa->ifa_next; + rtmsg_ifa(RTM_DELADDR, ifa); + notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa); + inet_free_ifa(ifa); + } + } + + /* 3. Announce address deletion */ + + /* Send message first, then call notifier. + At first sight, FIB update triggered by notifier + will refer to already deleted ifaddr, that could confuse + netlink listeners. It is not true: look, gated sees + that route deleted and if it still thinks that ifaddr + is valid, it will try to restore deleted routes... Grr. + So that, this order is correct. 
+ */ + rtmsg_ifa(RTM_DELADDR, ifa1); + notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); + if (destroy) { + inet_free_ifa(ifa1); + if (in_dev->ifa_list == NULL) + inetdev_destroy(in_dev); + } +} + +static int +inet_insert_ifa(struct in_device *in_dev, struct in_ifaddr *ifa) +{ + struct in_ifaddr *ifa1, **ifap, **last_primary; + + if (ifa->ifa_local == 0) { + inet_free_ifa(ifa); + return 0; + } + + ifa->ifa_flags &= ~IFA_F_SECONDARY; + last_primary = &in_dev->ifa_list; + + for (ifap=&in_dev->ifa_list; (ifa1=*ifap)!=NULL; ifap=&ifa1->ifa_next) { + if (!(ifa1->ifa_flags&IFA_F_SECONDARY) && ifa->ifa_scope <= ifa1->ifa_scope) + last_primary = &ifa1->ifa_next; + if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) { + if (ifa1->ifa_local == ifa->ifa_local) { + inet_free_ifa(ifa); + return -EEXIST; + } + if (ifa1->ifa_scope != ifa->ifa_scope) { + inet_free_ifa(ifa); + return -EINVAL; + } + ifa->ifa_flags |= IFA_F_SECONDARY; + } + } + + if (!(ifa->ifa_flags&IFA_F_SECONDARY)) + ifap = last_primary; + + cli(); + ifa->ifa_next = *ifap; + *ifap = ifa; + sti(); + + /* Send message first, then call notifier. 
+ Notifier will trigger FIB update, so that + listeners of netlink will know about new ifaddr */ + rtmsg_ifa(RTM_NEWADDR, ifa); + notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); + + return 0; +} + +static int +inet_set_ifa(struct device *dev, struct in_ifaddr *ifa) +{ + struct in_device *in_dev = dev->ip_ptr; + + if (in_dev == NULL) { + in_dev = inetdev_init(dev); + if (in_dev == NULL) { + inet_free_ifa(ifa); + return -ENOBUFS; + } + } + ifa->ifa_dev = in_dev; + if (LOOPBACK(ifa->ifa_local)) + ifa->ifa_scope = RT_SCOPE_HOST; + return inet_insert_ifa(in_dev, ifa); +} + +struct in_device *inetdev_by_index(int ifindex) +{ + struct device *dev; + dev = dev_get_by_index(ifindex); + if (dev) + return dev->ip_ptr; + return NULL; +} + +struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, u32 prefix, u32 mask) +{ + for_primary_ifa(in_dev) { + if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa)) + return ifa; + } endfor_ifa(in_dev); + return NULL; +} + +#ifdef CONFIG_RTNETLINK + +/* rtm_{add|del} functions are not reenterable, so that + this structure can be made static */ -static unsigned long ip_get_mask(unsigned long addr) +int +inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { - unsigned long dst; + struct kern_ifa *k_ifa = arg; + struct in_device *in_dev; + struct ifaddrmsg *ifm = NLMSG_DATA(nlh); + struct in_ifaddr *ifa, **ifap; - if (ZERONET(addr)) - return(0L); /* special case */ - - dst = ntohl(addr); - if (IN_CLASSA(dst)) - return(htonl(IN_CLASSA_NET)); - if (IN_CLASSB(dst)) - return(htonl(IN_CLASSB_NET)); - if (IN_CLASSC(dst)) - return(htonl(IN_CLASSC_NET)); - - /* - * Something else, probably a multicast. 
- */ - - return(0); + if ((in_dev = inetdev_by_index(ifm->ifa_index)) == NULL) + return -EADDRNOTAVAIL; + + for (ifap=&in_dev->ifa_list; (ifa=*ifap)!=NULL; ifap=&ifa->ifa_next) { + if ((k_ifa->ifa_local && memcmp(k_ifa->ifa_local, &ifa->ifa_local, 4)) || + (k_ifa->ifa_label && strcmp(k_ifa->ifa_label, ifa->ifa_label)) || + (k_ifa->ifa_address && + (ifm->ifa_prefixlen != ifa->ifa_prefixlen || + !inet_ifa_match(*(u32*)k_ifa->ifa_address, ifa)))) + continue; + inet_del_ifa(in_dev, ifap, 1); + return 0; + } + + return -EADDRNOTAVAIL; } +int +inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct kern_ifa *k_ifa = arg; + struct device *dev; + struct in_device *in_dev; + struct ifaddrmsg *ifm = NLMSG_DATA(nlh); + struct in_ifaddr *ifa; -/* - * This checks bitmasks for the ioctl calls for devices. + if (ifm->ifa_prefixlen > 32 || k_ifa->ifa_local == NULL) + return -EINVAL; + + if ((dev = dev_get_by_index(ifm->ifa_index)) == NULL) + return -ENODEV; + + if ((in_dev = dev->ip_ptr) == NULL) { + in_dev = inetdev_init(dev); + if (!in_dev) + return -ENOBUFS; + } + + if ((ifa = inet_alloc_ifa()) == NULL) + return -ENOBUFS; + + if (k_ifa->ifa_address == NULL) + k_ifa->ifa_address = k_ifa->ifa_local; + memcpy(&ifa->ifa_local, k_ifa->ifa_local, 4); + memcpy(&ifa->ifa_address, k_ifa->ifa_address, 4); + ifa->ifa_prefixlen = ifm->ifa_prefixlen; + ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); + if (k_ifa->ifa_broadcast) + memcpy(&ifa->ifa_broadcast, k_ifa->ifa_broadcast, 4); + if (k_ifa->ifa_anycast) + memcpy(&ifa->ifa_anycast, k_ifa->ifa_anycast, 4); + ifa->ifa_flags = ifm->ifa_flags; + ifa->ifa_scope = ifm->ifa_scope; + ifa->ifa_dev = in_dev; + if (k_ifa->ifa_label) + memcpy(ifa->ifa_label, k_ifa->ifa_label, IFNAMSIZ); + else + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + + return inet_insert_ifa(in_dev, ifa); +} + +#endif + +/* + * Determine a default network mask, based on the IP address. 
*/ - -static inline int bad_mask(__u32 mask, __u32 addr) + +static __inline__ int inet_abc_len(u32 addr) { - if (addr & (mask = ~mask)) - return 1; - mask = ntohl(mask); - if (mask & (mask+1)) - return 1; - return 0; + if (ZERONET(addr)) + return 0; + + addr = ntohl(addr); + if (IN_CLASSA(addr)) + return 8; + if (IN_CLASSB(addr)) + return 16; + if (IN_CLASSC(addr)) + return 24; + + /* + * Something else, probably a multicast. + */ + + return -1; } - + int devinet_ioctl(unsigned int cmd, void *arg) { struct ifreq ifr; + struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr; + struct in_device *in_dev; + struct in_ifaddr **ifap = NULL; + struct in_ifaddr *ifa = NULL; struct device *dev; - __u32 addr; -#ifdef CONFIG_NET_ALIAS - int err; +#ifdef CONFIG_IP_ALIAS + char *colon; #endif + int exclusive = 0; + int ret = 0; /* * Fetch the caller's info block into kernel space @@ -107,191 +377,483 @@ int devinet_ioctl(unsigned int cmd, void *arg) if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) return -EFAULT; + ifr.ifr_name[IFNAMSIZ-1] = 0; + +#ifdef CONFIG_IP_ALIAS + colon = strchr(ifr.ifr_name, ':'); + if (colon) + *colon = 0; +#endif - /* - * See which interface the caller is talking about. - */ - - /* - * - * net_alias_dev_get(): dev_get() with added alias naming magic. 
- * only allow alias creation/deletion if (getset==SIOCSIFADDR) - * - */ - #ifdef CONFIG_KERNELD dev_load(ifr.ifr_name); -#endif +#endif -#ifdef CONFIG_NET_ALIAS - if ((dev = net_alias_dev_get(ifr.ifr_name, cmd == SIOCSIFADDR, &err, NULL, NULL)) == NULL) - return(err); -#else - if ((dev = dev_get(ifr.ifr_name)) == NULL) - return(-ENODEV); + switch(cmd) { + case SIOCGIFADDR: /* Get interface address */ + case SIOCGIFBRDADDR: /* Get the broadcast address */ + case SIOCGIFDSTADDR: /* Get the destination address */ + case SIOCGIFNETMASK: /* Get the netmask for the interface */ + case SIOCGIFPFLAGS: /* Get per device sysctl controls */ + /* Note that this ioctls will not sleep, + so that we do not impose a lock. + One day we will be forced to put shlock here (I mean SMP) + */ + memset(sin, 0, sizeof(*sin)); + sin->sin_family = AF_INET; + break; + + case SIOCSIFFLAGS: + if (!suser()) + return -EACCES; + rtnl_lock(); + exclusive = 1; + break; + case SIOCSIFADDR: /* Set interface address (and family) */ + case SIOCSIFBRDADDR: /* Set the broadcast address */ + case SIOCSIFDSTADDR: /* Set the destination address */ + case SIOCSIFNETMASK: /* Set the netmask for the interface */ + case SIOCSIFPFLAGS: /* Set per device sysctl controls */ + if (!suser()) + return -EACCES; + if (sin->sin_family != AF_INET) + return -EINVAL; + rtnl_lock(); + exclusive = 1; + break; + default: + return -EINVAL; + } + + + if ((dev = dev_get(ifr.ifr_name)) == NULL) { + ret = -ENODEV; + goto done; + } + +#ifdef CONFIG_IP_ALIAS + if (colon) + *colon = ':'; #endif - if (cmd != SIOCSIFADDR && dev->family != AF_INET) - return(-EINVAL); + if ((in_dev=dev->ip_ptr) != NULL) { + for (ifap=&in_dev->ifa_list; (ifa=*ifap) != NULL; ifap=&ifa->ifa_next) + if (strcmp(ifr.ifr_name, ifa->ifa_label) == 0) + break; + } - switch(cmd) - { - case SIOCGIFADDR: /* Get interface address (and family) */ - if (ifr.ifr_addr.sa_family == AF_UNSPEC) - { - memcpy(ifr.ifr_hwaddr.sa_data, dev->dev_addr, MAX_ADDR_LEN); - 
ifr.ifr_hwaddr.sa_family = dev->type; - } - else - { - (*(struct sockaddr_in *) - &ifr.ifr_addr).sin_addr.s_addr = dev->pa_addr; - (*(struct sockaddr_in *) - &ifr.ifr_addr).sin_family = dev->family; - (*(struct sockaddr_in *) - &ifr.ifr_addr).sin_port = 0; - } - break; - - case SIOCSIFADDR: /* Set interface address (and family) */ - - if (!suser()) - return -EPERM; + if (ifa == NULL && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS) { + ret = -EADDRNOTAVAIL; + goto done; + } - /* - * BSDism. SIOCSIFADDR family=AF_UNSPEC sets the - * physical address. We can cope with this now. - */ - - if(ifr.ifr_addr.sa_family==AF_UNSPEC) - { - int ret; - if(dev->set_mac_address==NULL) - return -EOPNOTSUPP; - ret = dev->set_mac_address(dev,&ifr.ifr_addr); - if (!ret) - notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); - return ret; - } - if(ifr.ifr_addr.sa_family!=AF_INET) - return -EINVAL; + switch(cmd) { + case SIOCGIFADDR: /* Get interface address */ + sin->sin_addr.s_addr = ifa->ifa_local; + goto rarok; - addr = (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr; + case SIOCGIFBRDADDR: /* Get the broadcast address */ + sin->sin_addr.s_addr = ifa->ifa_broadcast; + goto rarok; - dev_lock_wait(); - dev_lock_list(); + case SIOCGIFDSTADDR: /* Get the destination address */ + sin->sin_addr.s_addr = ifa->ifa_address; + goto rarok; - if (dev->family == AF_INET && addr == dev->pa_addr) { - dev_unlock_list(); - return 0; - } + case SIOCGIFNETMASK: /* Get the netmask for the interface */ + sin->sin_addr.s_addr = ifa->ifa_mask; + goto rarok; - if (dev->flags & IFF_UP) - notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev); + case SIOCGIFPFLAGS: + ifr.ifr_flags = in_dev->flags; + goto rarok; - /* - * if dev is an alias, must rehash to update - * address change - */ + case SIOCSIFFLAGS: +#ifdef CONFIG_IP_ALIAS + if (colon) { + if (ifa == NULL) { + ret = -EADDRNOTAVAIL; + break; + } + if (!(ifr.ifr_flags&IFF_UP)) + inet_del_ifa(in_dev, ifap, 1); + break; + } +#endif + ret = 
dev_change_flags(dev, ifr.ifr_flags); + break; + + case SIOCSIFPFLAGS: + in_dev->flags = ifr.ifr_flags; + break; -#ifdef CONFIG_NET_ALIAS - if (net_alias_is(dev)) - net_alias_dev_rehash(dev, &ifr.ifr_addr); + case SIOCSIFADDR: /* Set interface address (and family) */ + if (inet_abc_len(sin->sin_addr.s_addr) < 0) { + ret = -EINVAL; + break; + } + + if (!ifa) { + if ((ifa = inet_alloc_ifa()) == NULL) { + ret = -ENOBUFS; + break; + } +#ifdef CONFIG_IP_ALIAS + if (colon) + memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ); + else #endif - dev->pa_addr = addr; - dev->ip_flags |= IFF_IP_ADDR_OK; - dev->ip_flags &= ~(IFF_IP_BRD_OK|IFF_IP_MASK_OK); - dev->family = AF_INET; - if (dev->flags & IFF_POINTOPOINT) { - dev->pa_mask = 0xFFFFFFFF; - dev->pa_brdaddr = 0xFFFFFFFF; + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); } else { - dev->pa_mask = ip_get_mask(dev->pa_addr); - dev->pa_brdaddr = dev->pa_addr|~dev->pa_mask; + ret = 0; + if (ifa->ifa_local == sin->sin_addr.s_addr) + break; + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_broadcast = 0; + ifa->ifa_anycast = 0; + ifa->ifa_prefixlen = 32; + ifa->ifa_mask = inet_make_mask(32); } - if (dev->flags & IFF_UP) - notifier_call_chain(&netdev_chain, NETDEV_UP, dev); - dev_unlock_list(); - return 0; - - case SIOCGIFBRDADDR: /* Get the broadcast address */ - (*(struct sockaddr_in *) - &ifr.ifr_broadaddr).sin_addr.s_addr = dev->pa_brdaddr; - (*(struct sockaddr_in *) - &ifr.ifr_broadaddr).sin_family = dev->family; - (*(struct sockaddr_in *) - &ifr.ifr_broadaddr).sin_port = 0; + + ifa->ifa_address = + ifa->ifa_local = sin->sin_addr.s_addr; + + if (!(dev->flags&IFF_POINTOPOINT)) { + ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address); + ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen); + if ((dev->flags&IFF_BROADCAST) && ifa->ifa_prefixlen < 31) + ifa->ifa_broadcast = ifa->ifa_address|~ifa->ifa_mask; + } + ret = inet_set_ifa(dev, ifa); break; case SIOCSIFBRDADDR: /* Set the broadcast address */ - if (!suser()) - return -EPERM; - - addr = 
(*(struct sockaddr_in *)&ifr.ifr_broadaddr).sin_addr.s_addr; - - if (dev->flags & IFF_UP) - ip_rt_change_broadcast(dev, addr); - dev->pa_brdaddr = addr; - dev->ip_flags |= IFF_IP_BRD_OK; - return 0; - - case SIOCGIFDSTADDR: /* Get the destination address (for point-to-point links) */ - (*(struct sockaddr_in *) - &ifr.ifr_dstaddr).sin_addr.s_addr = dev->pa_dstaddr; - (*(struct sockaddr_in *) - &ifr.ifr_dstaddr).sin_family = dev->family; - (*(struct sockaddr_in *) - &ifr.ifr_dstaddr).sin_port = 0; + if (ifa->ifa_broadcast != sin->sin_addr.s_addr) { + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_broadcast = sin->sin_addr.s_addr; + inet_insert_ifa(in_dev, ifa); + } break; - case SIOCSIFDSTADDR: /* Set the destination address (for point-to-point links) */ - if (!suser()) - return -EPERM; - addr = (*(struct sockaddr_in *)&ifr.ifr_dstaddr).sin_addr.s_addr; - if (addr == dev->pa_dstaddr) - return 0; - if (dev->flags & IFF_UP) - ip_rt_change_dstaddr(dev, addr); - dev->pa_dstaddr = addr; - return 0; - - case SIOCGIFNETMASK: /* Get the netmask for the interface */ - (*(struct sockaddr_in *) - &ifr.ifr_netmask).sin_addr.s_addr = dev->pa_mask; - (*(struct sockaddr_in *) - &ifr.ifr_netmask).sin_family = dev->family; - (*(struct sockaddr_in *) - &ifr.ifr_netmask).sin_port = 0; + case SIOCSIFDSTADDR: /* Set the destination address */ + if (ifa->ifa_address != sin->sin_addr.s_addr) { + if (inet_abc_len(sin->sin_addr.s_addr) < 0) { + ret = -EINVAL; + break; + } + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_address = sin->sin_addr.s_addr; + inet_insert_ifa(in_dev, ifa); + } break; case SIOCSIFNETMASK: /* Set the netmask for the interface */ - if (!suser()) - return -EPERM; - addr = (*(struct sockaddr_in *)&ifr.ifr_netmask).sin_addr.s_addr; - - if (addr == dev->pa_mask) { - dev->ip_flags |= IFF_IP_MASK_OK; - return 0; - } /* * The mask we set must be legal. 
*/ - if (bad_mask(addr, 0)) - return -EINVAL; - if (addr == htonl(0xFFFFFFFE)) - return -EINVAL; - if (dev->flags & IFF_UP) - ip_rt_change_netmask(dev, addr); - dev->pa_mask = addr; - dev->ip_flags |= IFF_IP_MASK_OK; - dev->ip_flags &= ~IFF_IP_BRD_OK; - return 0; - default: - return -EINVAL; - + if (bad_mask(sin->sin_addr.s_addr, 0)) { + ret = -EINVAL; + break; + } + + if (ifa->ifa_mask != sin->sin_addr.s_addr) { + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_mask = sin->sin_addr.s_addr; + ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask); + inet_set_ifa(dev, ifa); + } + break; } +done: + if (exclusive) + rtnl_unlock(); + return ret; + +rarok: if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) return -EFAULT; return 0; } + +static int +inet_gifconf(struct device *dev, char *buf, int len) +{ + struct in_device *in_dev = dev->ip_ptr; + struct in_ifaddr *ifa; + struct ifreq ifr; + int done=0; + + if (in_dev==NULL || (ifa=in_dev->ifa_list)==NULL) + return 0; + + for ( ; ifa; ifa = ifa->ifa_next) { + if (!buf) { + done += sizeof(ifr); + continue; + } + if (len < sizeof(ifr)) + return done; + memset(&ifr, 0, sizeof(struct ifreq)); + if (ifa->ifa_label) + strcpy(ifr.ifr_name, ifa->ifa_label); + else + strcpy(ifr.ifr_name, dev->name); + + (*(struct sockaddr_in *) &ifr.ifr_addr).sin_family = AF_INET; + (*(struct sockaddr_in *) &ifr.ifr_addr).sin_addr.s_addr = ifa->ifa_local; + + if (copy_to_user(buf, &ifr, sizeof(struct ifreq))) + return -EFAULT; + buf += sizeof(struct ifreq); + len -= sizeof(struct ifreq); + done += sizeof(struct ifreq); + } + return done; +} + +u32 inet_select_addr(struct device *dev, u32 dst, int scope) +{ + u32 addr = 0; + struct in_device *in_dev = dev->ip_ptr; + + if (in_dev == NULL) + return 0; + + for_primary_ifa(in_dev) { + if (ifa->ifa_scope > scope) + continue; + addr = ifa->ifa_local; + if (!dst || inet_ifa_match(dst, ifa)) + return addr; + } endfor_ifa(in_dev); + + return addr; +} + +/* + * Device notifier + */ + +int 
register_inetaddr_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&inetaddr_chain, nb); +} + +int unregister_inetaddr_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&inetaddr_chain,nb); +} + +static int inetdev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct device *dev = ptr; + struct in_device *in_dev = dev->ip_ptr; + + if (in_dev == NULL) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_REGISTER: + if (in_dev) + printk(KERN_DEBUG "inetdev_event: bug\n"); + dev->ip_ptr = NULL; + break; + case NETDEV_UP: + if (dev == &loopback_dev) { + struct in_ifaddr *ifa; + if ((ifa = inet_alloc_ifa()) != NULL) { + ifa->ifa_local = + ifa->ifa_address = htonl(INADDR_LOOPBACK); + ifa->ifa_prefixlen = 8; + ifa->ifa_mask = inet_make_mask(8); + ifa->ifa_dev = in_dev; + ifa->ifa_scope = RT_SCOPE_HOST; + inet_insert_ifa(in_dev, ifa); + } + } + ip_mc_up(in_dev); + break; + case NETDEV_DOWN: + ip_mc_down(in_dev); + break; + case NETDEV_UNREGISTER: + inetdev_destroy(in_dev); + break; + } + + return NOTIFY_DONE; +} + +struct notifier_block ip_netdev_notifier={ + inetdev_event, + NULL, + 0 +}; + +#ifdef CONFIG_RTNETLINK + +static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, + pid_t pid, u32 seq, int event) +{ + struct ifaddrmsg *ifm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm)); + ifm = NLMSG_DATA(nlh); + ifm->ifa_family = AF_INET; + ifm->ifa_prefixlen = ifa->ifa_prefixlen; + ifm->ifa_flags = ifa->ifa_flags; + ifm->ifa_scope = ifa->ifa_scope; + ifm->ifa_index = ifa->ifa_dev->dev->ifindex; + if (ifa->ifa_prefixlen) + RTA_PUT(skb, IFA_ADDRESS, 4, &ifa->ifa_address); + if (ifa->ifa_local) + RTA_PUT(skb, IFA_LOCAL, 4, &ifa->ifa_local); + if (ifa->ifa_broadcast) + RTA_PUT(skb, IFA_BROADCAST, 4, &ifa->ifa_broadcast); + if (ifa->ifa_anycast) + RTA_PUT(skb, IFA_ANYCAST, 4, &ifa->ifa_anycast); + if (ifa->ifa_label[0]) + 
RTA_PUT(skb, IFA_LABEL, IFNAMSIZ, &ifa->ifa_label); + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_put(skb, b - skb->tail); + return -1; +} + +static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx, ip_idx; + int s_idx, s_ip_idx; + struct device *dev; + struct in_device *in_dev; + struct in_ifaddr *ifa; + + s_idx = cb->args[0]; + s_ip_idx = ip_idx = cb->args[1]; + for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { + if (idx < s_idx) + continue; + if (idx > s_idx) + s_ip_idx = 0; + if ((in_dev = dev->ip_ptr) == NULL) + continue; + for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; + ifa = ifa->ifa_next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWADDR) <= 0) + goto done; + } + } +done: + cb->args[0] = idx; + cb->args[1] = ip_idx; + + return skb->len; +} + +static void rtmsg_ifa(int event, struct in_ifaddr * ifa) +{ + struct sk_buff *skb; + int size = NLMSG_SPACE(sizeof(struct ifaddrmsg)+128); + + skb = alloc_skb(size, GFP_KERNEL); + if (!skb) { + netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS); + return; + } + if (inet_fill_ifaddr(skb, ifa, 0, 0, event) < 0) { + kfree_skb(skb, 0); + netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_IFADDR; + netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV4_IFADDR, GFP_KERNEL); +} + + +static struct rtnetlink_link inet_rtnetlink_table[RTM_MAX-RTM_BASE+1] = +{ + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, rtnetlink_dump_ifinfo, }, + { NULL, NULL, }, + + { inet_rtm_newaddr, NULL, }, + { inet_rtm_deladdr, NULL, }, + { NULL, inet_dump_ifaddr, }, + { NULL, NULL, }, + + { inet_rtm_newroute, NULL, }, + { inet_rtm_delroute, NULL, }, + { inet_rtm_getroute, inet_dump_fib, }, + { NULL, NULL, }, + + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, + +#ifdef CONFIG_IP_MULTIPLE_TABLES + 
{ inet_rtm_newrule, NULL, }, + { inet_rtm_delrule, NULL, }, + { NULL, inet_dump_rules, }, + { NULL, NULL, }, +#else + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, + { NULL, NULL, }, +#endif +}; + +#endif /* CONFIG_RTNETLINK */ + +#ifdef CONFIG_IP_PNP_BOOTP + +/* + * Addition and deletion of fake interface addresses + * for sending of BOOTP packets. In this case, we must + * set the local address to zero which is not permitted + * otherwise. + */ + +__initfunc(int inet_add_bootp_addr(struct device *dev)) +{ + struct in_device *in_dev = dev->ip_ptr; + struct in_ifaddr *ifa; + + if (!in_dev && !(in_dev = inetdev_init(dev))) + return -ENOBUFS; + if (!(ifa = inet_alloc_ifa())) + return -ENOBUFS; + ifa->ifa_dev = in_dev; + in_dev->ifa_list = ifa; + rtmsg_ifa(RTM_NEWADDR, ifa); + notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); + return 0; +} + +__initfunc(void inet_del_bootp_addr(struct device *dev)) +{ + if (dev->ip_ptr) + inetdev_destroy(dev->ip_ptr); +} + +#endif + +__initfunc(void devinet_init(void)) +{ + register_gifconf(AF_INET, inet_gifconf); + register_netdevice_notifier(&ip_netdev_notifier); +#ifdef CONFIG_RTNETLINK + rtnetlink_links[AF_INET] = inet_rtnetlink_table; +#endif +} diff --git a/net/ipv4/fib.c b/net/ipv4/fib.c index f444718a7..e69de29bb 100644 --- a/net/ipv4/fib.c +++ b/net/ipv4/fib.c @@ -1,2077 +0,0 @@ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * IPv4 Forwarding Information Base. - * - * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * - * NOTE: This file is scheduled to be removed from kernel. 
- * The natural place for router FIB is user level - * routing daemon (it has to keep its copy in any case) - * - * Kernel should keep only interface routes and, - * if host is not router, default gateway. - * - * We have good proof that it is feasible and efficient - - * multicast routing. - */ - -#include <linux/config.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/bitops.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/errno.h> -#include <linux/in.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/if_arp.h> -#include <linux/proc_fs.h> -#include <linux/skbuff.h> -#include <linux/init.h> - -#include <net/ip.h> -#include <net/protocol.h> -#include <net/route.h> -#include <net/tcp.h> -#include <net/sock.h> -#include <net/icmp.h> -#include <net/arp.h> -#include <net/netlink.h> -#include <net/ip_fib.h> -#include <net/dst.h> -#include <linux/net_alias.h> - -static struct fib_class local_class = {RT_CLASS_LOCAL, }; -static struct fib_class default_class = {RT_CLASS_DEFAULT, }; -static struct fib_class main_class = {RT_CLASS_MAIN, }; -static struct fib_class *fib_classes[RT_CLASS_MAX+1]; - -static struct fib_rule *fib_rules; - -static struct fib_info *fib_info_list; - -static int fib_stamp; - -static int rtmsg_process(struct nlmsghdr *n, struct in_rtmsg *r); - - -#ifdef CONFIG_RTNETLINK - -static unsigned rt_nl_flags; -static int rt_nl_owner = -1; - -/* - * Default mode is delayed for 0.5sec batch delivery. - * If someone starts to use user->level calls, - * we turn on synchronous message passing. 
- */ - -#define RTMSG_DELAY (HZ/2) - -static struct nlmsg_ctl rtmsg_ctl = { - { NULL, NULL, 0, 0L, NULL }, - NULL, - NETLINK_ROUTE, - RTMSG_DELAY, - NLMSG_GOODSIZE, - 0, 0, 0, 0 -}; - -static void __rtmsg_ack(struct nlmsghdr *n, int err); - -static __inline__ void rtmsg_ack(struct nlmsghdr *n, int err) -{ - if (n->nlmsg_seq && rt_nl_flags&RTCTL_ACK) - __rtmsg_ack(n, err); -} - -static void rtmsg_fib(unsigned long type, struct fib_node *f, int logmask, - struct fib_class *class, struct nlmsghdr *n); -static void rtmsg_dev(unsigned long type, struct device *dev, struct nlmsghdr *n); -#define rtmsg_kick() ({ if (rtmsg_ctl.nlmsg_skb) nlmsg_transmit(&rtmsg_ctl); }) - -#else -#define rtmsg_fib(a,b,c,d,e) -#define rtmsg_dev(a,b,c) -#define rtmsg_ack(a,b) -#define rtmsg_kick() -#endif - - -/* - * FIB locking. - */ - -static struct wait_queue *fib_wait; -static atomic_t fib_users = ATOMIC_INIT(0); - -static void fib_lock(void) -{ - while (atomic_read(&fib_users)) - sleep_on(&fib_wait); - atomic_inc(&fib_users); - dev_lock_list(); -} - -static void fib_unlock(void) -{ - dev_unlock_list(); - if (atomic_dec_and_test(&fib_users)) { - rtmsg_kick(); - wake_up(&fib_wait); - } -} - -/* - * Check if a mask is acceptable. - */ - -static __inline__ int bad_mask(u32 mask, u32 addr) -{ - if (addr & (mask = ~mask)) - return 1; - mask = ntohl(mask); - if (mask & (mask+1)) - return 1; - return 0; -} - -/* - * Evaluate mask length. - */ - -static __inline__ int fib_logmask(u32 mask) -{ - if (!(mask = ntohl(mask))) - return 32; - return ffz(~mask); -} - -/* - * Create mask from mask length. 
- */ - -static __inline__ u32 fib_mask(int logmask) -{ - if (logmask >= 32) - return 0; - return htonl(~((1<<logmask)-1)); -} - -static __inline__ u32 fib_netmask(int logmask) -{ - return fib_mask(32-logmask); -} - - -static struct fib_class *fib_alloc_class(int id) -{ - struct fib_class *class; - - if (fib_classes[id]) - return fib_classes[id]; - - class = kmalloc(sizeof(*class), GFP_KERNEL); - if (!class) - return NULL; - memset(class, 0, sizeof(*class)); - class->cl_id = id; - fib_classes[id] = class; - return class; -} - -static struct fib_class *fib_empty_class(void) -{ - int id; - for (id = 1; id <= RT_CLASS_MAX; id++) - if (fib_classes[id] == NULL) - return fib_alloc_class(id); - return NULL; -} - -static int fib_rule_delete(struct in_rtrulemsg *r, struct device *dev, struct nlmsghdr *n) -{ - u32 src = r->rtrmsg_src.s_addr; - u32 dst = r->rtrmsg_dst.s_addr; - u32 srcmask = fib_netmask(r->rtrmsg_srclen); - u32 dstmask = fib_netmask(r->rtrmsg_dstlen); - struct fib_rule *cl, **clp; - - for (clp=&fib_rules; (cl=*clp) != NULL; clp=&cl->cl_next) { - if (src == cl->cl_src && - srcmask == cl->cl_srcmask && - dst == cl->cl_dst && - dstmask == cl->cl_dstmask && - r->rtrmsg_tos == cl->cl_tos && - dev == cl->cl_dev && - r->rtrmsg_action == cl->cl_action && - (!r->rtrmsg_preference || r->rtrmsg_preference == cl->cl_preference) && - (!r->rtrmsg_class || (cl && r->rtrmsg_class == cl->cl_class->cl_id))) { - cli(); - *clp = cl->cl_next; - sti(); - if (cl->cl_class) - cl->cl_class->cl_users--; - kfree(cl); - return 0; - } - } - return -ESRCH; -} - -static int fib_rule_add(struct in_rtrulemsg *r, struct device *dev, struct nlmsghdr *n) -{ - u32 src = r->rtrmsg_src.s_addr; - u32 dst = r->rtrmsg_dst.s_addr; - u32 srcmask = fib_netmask(r->rtrmsg_srclen); - u32 dstmask = fib_netmask(r->rtrmsg_dstlen); - - struct fib_rule *cl, *new_cl, **clp; - struct fib_class *class = NULL; - - if ((src&~srcmask) || (dst&~dstmask)) - return -EINVAL; - if (dev && net_alias_main_dev(dev) != dev) - 
return -ENODEV; - - if (!r->rtrmsg_class) { - if (r->rtrmsg_action==RTP_GO || r->rtrmsg_action==RTP_NAT - || r->rtrmsg_action==RTP_MASQUERADE) { - if ((class = fib_empty_class()) == NULL) - return -ENOMEM; - class->cl_auto = 1; - } else if (r->rtrmsg_rtmsgs) - return -EINVAL; - } else if ((class = fib_alloc_class(r->rtrmsg_class)) == NULL) - return -ENOMEM; - - new_cl = kmalloc(sizeof(*new_cl), GFP_KERNEL); - if (!new_cl) - return -ENOMEM; - new_cl->cl_src = src; - new_cl->cl_srcmask = srcmask; - new_cl->cl_dst = dst; - new_cl->cl_dstmask = dstmask; - new_cl->cl_dev = dev; - new_cl->cl_srcmap = r->rtrmsg_srcmap.s_addr; - new_cl->cl_tos = r->rtrmsg_tos; - new_cl->cl_action = r->rtrmsg_action; - new_cl->cl_flags = r->rtrmsg_flags; - new_cl->cl_preference = r->rtrmsg_preference; - new_cl->cl_class = class; - if (class) - class->cl_users++; - - clp = &fib_rules; - - if (!new_cl->cl_preference) { - cl = fib_rules; - if (cl && (cl = cl->cl_next) != NULL) { - clp = &fib_rules->cl_next; - if (cl->cl_preference) - new_cl->cl_preference = cl->cl_preference - 1; - } - } - - while ( (cl = *clp) != NULL ) { - if (cl->cl_preference >= new_cl->cl_preference) - break; - clp = &cl->cl_next; - } - - new_cl->cl_next = cl; - cli(); - *clp = new_cl; - sti(); - - if (r->rtrmsg_rtmsgs) { - n->nlmsg_type = RTMSG_NEWROUTE; - r->rtrmsg_rtmsg->rtmsg_class = class->cl_id; - return rtmsg_process(n, r->rtrmsg_rtmsg); - } - return 0; -} - - -#define FZ_MAX_DIVISOR 1024 - -static __inline__ u32 fib_hash(u32 key, u32 mask) -{ - u32 h; - h = key^(key>>20); - h = h^(h>>10); - h = h^(h>>5); - return h & mask; -} - -static __inline__ struct fib_node ** fz_hash_p(u32 key, struct fib_zone *fz) -{ - return &fz->fz_hash[fib_hash(key, fz->fz_hashmask)]; -} - -static __inline__ struct fib_node * fz_hash(u32 key, struct fib_zone *fz) -{ - return fz->fz_hash[fib_hash(key, fz->fz_hashmask)]; -} - -/* - * Free FIB node. 
- */ - -static void fib_free_node(struct fib_node * f) -{ - struct fib_info * fi = f->fib_info; - if (fi && !--fi->fib_refcnt) { -#if RT_CACHE_DEBUG >= 2 - printk("fib_free_node: fi %08x/%s is free\n", fi->fib_gateway, fi->fib_dev ? fi->fib_dev->name : "null"); -#endif - if (fi->fib_next) - fi->fib_next->fib_prev = fi->fib_prev; - if (fi->fib_prev) - fi->fib_prev->fib_next = fi->fib_next; - if (fi == fib_info_list) - fib_info_list = fi->fib_next; - } - kfree_s(f, sizeof(struct fib_node)); -} - -static __inline__ int fib_flags_trans(unsigned flags) -{ - if (flags & RTF_BROADCAST) - return IS_BROADCAST; - if (flags & RTF_MULTICAST) - return IS_MULTICAST; - if (flags & RTF_LOCAL) - return IS_MYADDR; - return 0; -} - -unsigned ip_fib_chk_addr(u32 addr) -{ - struct fib_zone * fz; - struct fib_node * f; - - /* - * Accept both `all ones' and `all zeros' as BROADCAST. - * (Support old BSD in other words). This old BSD - * support will go very soon as it messes other things - * up. - */ - - if (addr == INADDR_ANY || addr == 0xFFFFFFFF) - return RTF_LOCAL|RTF_BROADCAST; - - if ((addr & htonl(0x7F000000L)) == htonl(0x7F000000L)) - return RTF_LOCAL|RTF_INTERFACE; - - if (MULTICAST(addr)) - return RTF_MULTICAST; - - addr = ntohl(addr); - for (fz = local_class.fib_zone_list; fz; fz = fz->fz_next) { - u32 key = (addr&fz->fz_mask)>>fz->fz_logmask; - for (f = fz_hash(key, fz); f; f = f->fib_next) { - if (key != f->fib_key || (f->fib_flag & FIBFLG_DOWN)) - continue; - if (!f->fib_info) - return 0; - return f->fib_info->fib_flags&RTF_ADDRCLASSMASK; - } - } - - return 0; -} - -int __ip_chk_addr(unsigned long addr) -{ - return fib_flags_trans(ip_fib_chk_addr(addr)); -} - -/* - * Find the first device with a given source address. 
- */ - -struct device *ip_dev_find(unsigned long addr, char *name) -{ - struct fib_zone * fz = local_class.fib_zones[0]; - u32 key; - struct fib_node * f; - - key = (ntohl(addr)&fz->fz_mask)>>fz->fz_logmask; - for (f = fz_hash(key, fz); f; f = f->fib_next) { - if (key == f->fib_key && - !(f->fib_flag & (FIBFLG_DOWN|FIBFLG_REJECT|FIBFLG_THROW)) && - f->fib_info->fib_flags == (RTF_IFLOCAL&~RTF_UP)) { - if (!name || strcmp(name, f->fib_info->fib_dev->name) == 0) - return f->fib_info->fib_dev; - } - } - - return NULL; -} - -/* - * Find tunnel with a given source and destination. - */ - -struct device *ip_dev_find_tunnel(u32 daddr, u32 saddr) -{ - struct fib_zone * fz = local_class.fib_zones[0]; - u32 key; - struct fib_node * f; - - key = (ntohl(daddr)&fz->fz_mask)>>fz->fz_logmask; - for (f = fz_hash(key, fz); f; f = f->fib_next) { - if (key == f->fib_key && - !(f->fib_flag & (FIBFLG_DOWN|FIBFLG_REJECT|FIBFLG_THROW)) && - f->fib_info->fib_flags == (RTF_IFLOCAL&~RTF_UP)) { - struct device *dev = f->fib_info->fib_dev; - if (dev->type == ARPHRD_TUNNEL && - dev->pa_dstaddr == saddr) - return dev; - } - if (!f->fib_info) - return NULL; - } - - return NULL; -} - - -int ip_fib_chk_default_gw(u32 addr, struct device *dev) -{ - struct fib_rule *cl; - struct fib_node * f; - - for (cl = fib_rules; cl; cl = cl->cl_next) { - if (cl->cl_srcmask || cl->cl_dstmask || cl->cl_tos || - cl->cl_dev || cl->cl_action != RTP_GO || !cl->cl_class || - !cl->cl_class->fib_zones[32]) - continue; - for (f = cl->cl_class->fib_zones[32]->fz_hash[0]; f; f = f->fib_next) { - struct fib_info *fi = f->fib_info; - if (!(f->fib_flag & (FIBFLG_DOWN|FIBFLG_REJECT|FIBFLG_THROW)) && - fi->fib_gateway == addr && - fi->fib_dev == dev && - fi->fib_flags&RTF_GATEWAY) - return 0; - } - } - return -1; -} - - -/* - * Main lookup routine. 
- */ - - -int -fib_lookup(struct fib_result *res, u32 daddr, u32 src, u8 tos, - struct device *devin, struct device *devout) -{ - struct fib_node * f; - struct fib_rule * cl; - u32 dst; - int local = tos & 1; - - tos &= IPTOS_TOS_MASK; - dst = ntohl(daddr); - - for (cl = fib_rules; cl; cl=cl->cl_next) { - struct fib_zone * fz; - - if (((src^cl->cl_src) & cl->cl_srcmask) || - ((daddr^cl->cl_dst) & cl->cl_dstmask) || - (cl->cl_tos && cl->cl_tos != tos) || - (cl->cl_dev && cl->cl_dev != devin)) - continue; - - switch (cl->cl_action) { - case RTP_GO: - case RTP_NAT: - case RTP_MASQUERADE: - default: - break; - case RTP_UNREACHABLE: - return -ENETUNREACH; - case RTP_DROP: - return -EINVAL; - case RTP_PROHIBIT: - return -EACCES; - } - - for (fz = cl->cl_class->fib_zone_list; fz; fz = fz->fz_next) { - u32 key = (dst&fz->fz_mask)>>fz->fz_logmask; - - for (f = fz_hash(key, fz); f; f = f->fib_next) { - if (key != f->fib_key || - (f->fib_flag & FIBFLG_DOWN) || - (f->fib_tos && f->fib_tos != tos)) - continue; - if (f->fib_flag & FIBFLG_THROW) - goto next_class; - if (f->fib_flag & FIBFLG_REJECT) - return -ENETUNREACH; - if (devout && f->fib_info->fib_dev != devout) - continue; - if (!local || !(f->fib_info->fib_flags&RTF_GATEWAY)) { - res->f = f; - res->fr = cl; - res->fm = fz->fz_logmask; - return 0; - } - } - } -next_class: - } - return -ENETUNREACH; -} - -static int fib_autopublish(int op, struct fib_node *f, int logmask) -{ - struct fib_zone *fz; - struct fib_node *f1; - struct arpreq r; - u32 addr = htonl(f->fib_key<<logmask); - - if (f->fib_flag || LOOPBACK(addr) || - (!RT_LOCALADDR(f->fib_info->fib_flags) && - !(f->fib_info->fib_flags&RTF_NAT))) - return 0; - - memset(&r, 0, sizeof(struct arpreq)); - r.arp_flags = ATF_PUBL|ATF_PERM|ATF_MAGIC; - if (logmask) - r.arp_flags |= ATF_NETMASK; - ((struct sockaddr_in*)&r.arp_pa)->sin_family = AF_INET; - ((struct sockaddr_in*)&r.arp_pa)->sin_addr.s_addr = addr; - ((struct sockaddr_in*)&r.arp_netmask)->sin_family = AF_INET; - 
((struct sockaddr_in*)&r.arp_netmask)->sin_addr.s_addr = fib_mask(logmask); - - if (op) - return arp_req_set(&r, NULL); - - fz = local_class.fib_zones[logmask]; - - for (f1 = fz_hash(f->fib_key, fz); f1; f1=f1->fib_next) { - if (f->fib_key != f1->fib_key || f1->fib_flag || - (!RT_LOCALADDR(f1->fib_info->fib_flags) && - !(f1->fib_info->fib_flags&RTF_NAT))) - continue; - return 0; - } - - return arp_req_delete(&r, NULL); -} - -#define FIB_SCAN(f, fp) \ -for ( ; ((f) = *(fp)) != NULL; (fp) = &(f)->fib_next) - -#define FIB_SCAN_KEY(f, fp, key) \ -for ( ; ((f) = *(fp)) != NULL && (f)->fib_key == (key); (fp) = &(f)->fib_next) - -#define FIB_CONTINUE(f, fp) \ -{ \ - fp = &f->fib_next; \ - continue; \ -} - -static int fib_delete(struct in_rtmsg * r, struct device *dev, - struct fib_class *class, struct nlmsghdr *n) -{ - struct fib_node **fp, *f; - struct fib_zone *fz = class->fib_zones[32-r->rtmsg_prefixlen]; - int logmask = 32 - r->rtmsg_prefixlen; - u32 dst = ntohl(r->rtmsg_prefix.s_addr); - u32 gw = r->rtmsg_gateway.s_addr; - short metric = r->rtmsg_metric; - u8 tos = r->rtmsg_tos; - u8 fibflg = 0; - int found=0; - unsigned flags; - u32 key; - - flags = r->rtmsg_flags; - if (flags & RTF_REJECT) - fibflg |= FIBFLG_REJECT; - else if (flags & RTF_THROW) - fibflg |= FIBFLG_THROW; - flags &= ~(RTF_UP|RTF_REJECT|RTF_THROW); - - if (fz != NULL) { - key = (dst&fz->fz_mask)>>logmask; - fp = fz_hash_p(key, fz); - - FIB_SCAN(f, fp) { - if (f->fib_key == key) - break; - } - FIB_SCAN_KEY(f, fp, key) { - if (f->fib_tos == tos) - break; - } - - while ((f = *fp) != NULL && f->fib_key == key && f->fib_tos == tos) { - struct fib_info * fi = f->fib_info; - - /* - * If metric was not specified (<0), match all metrics. 
- */ - if (metric >= 0 && f->fib_metric != metric) - FIB_CONTINUE(f, fp); - - if (flags & RTF_MAGIC) { - /* "Magic" deletions require exact match */ - if (!fi || (fi->fib_flags^flags) || - fi->fib_dev != dev || - fi->fib_gateway != gw) - FIB_CONTINUE(f, fp); - } else { - /* - * Device, gateway, reject and throw are - * also checked if specified. - */ - if ((dev && fi && fi->fib_dev != dev) || - (gw && fi && fi->fib_gateway != gw) || - (fibflg && (f->fib_flag^fibflg)&~FIBFLG_DOWN)) - FIB_CONTINUE(f, fp); - } - cli(); - /* It's interesting, can this operation be not atomic? */ - *fp = f->fib_next; - sti(); - if (class == &local_class) - fib_autopublish(0, f, logmask); - rtmsg_fib(RTMSG_DELROUTE, f, logmask, class, n); - fib_free_node(f); - found++; - } - fz->fz_nent -= found; - } - - if (found) { - fib_stamp++; - rt_cache_flush(0); - rtmsg_ack(n, 0); - return 0; - } - rtmsg_ack(n, ESRCH); - return -ESRCH; -} - -static struct fib_info * fib_create_info(struct device * dev, struct in_rtmsg *r) -{ - struct fib_info * fi; - unsigned flags = r->rtmsg_flags; - u32 gw = r->rtmsg_gateway.s_addr; - unsigned short mtu; - unsigned short irtt; - unsigned long window; - - mtu = dev ? dev->mtu : 0; - if (flags&RTF_MSS && r->rtmsg_mtu < mtu && r->rtmsg_mtu >= 68) - mtu = r->rtmsg_mtu; - window = (flags & RTF_WINDOW) ? r->rtmsg_window : 0; - irtt = (flags & RTF_IRTT) ? r->rtmsg_rtt : TCP_TIMEOUT_INIT; - - flags &= RTF_FIB; - - for (fi=fib_info_list; fi; fi = fi->fib_next) { - if (fi->fib_gateway != gw || - fi->fib_dev != dev || - fi->fib_flags != flags || - fi->fib_mtu != mtu || - fi->fib_window != window || - fi->fib_irtt != irtt) - continue; - fi->fib_refcnt++; -#if RT_CACHE_DEBUG >= 2 - printk("fib_create_info: fi %08x/%s/%04x is duplicate\n", fi->fib_gateway, fi->fib_dev ? 
fi->fib_dev->name : "null", fi->fib_flags); -#endif - return fi; - } - fi = (struct fib_info*)kmalloc(sizeof(struct fib_info), GFP_KERNEL); - if (!fi) - return NULL; - memset(fi, 0, sizeof(struct fib_info)); - fi->fib_flags = flags; - fi->fib_dev = dev; - fi->fib_gateway = gw; - fi->fib_mtu = mtu; - fi->fib_window = window; - fi->fib_refcnt++; - fi->fib_next = fib_info_list; - fi->fib_prev = NULL; - fi->fib_irtt = irtt; - if (fib_info_list) - fib_info_list->fib_prev = fi; - fib_info_list = fi; -#if RT_CACHE_DEBUG >= 2 - printk("fib_create_info: fi %08x/%s/%04x is created\n", fi->fib_gateway, fi->fib_dev ? fi->fib_dev->name : "null", fi->fib_flags); -#endif - return fi; -} - -static __inline__ void fib_rebuild_zone(struct fib_zone *fz, - struct fib_node **old_ht, - int old_divisor) -{ - int i; - struct fib_node **ht = fz->fz_hash; - u32 hashmask = fz->fz_hashmask; - struct fib_node *f, **fp, *next; - unsigned hash; - - for (i=0; i<old_divisor; i++) { - for (f=old_ht[i]; f; f=next) { - next = f->fib_next; - f->fib_next = NULL; - hash = fib_hash(f->fib_key, hashmask); - for (fp = &ht[hash]; *fp; fp = &(*fp)->fib_next) - /* NONE */; - *fp = f; - } - } -} - -static void fib_rehash_zone(struct fib_zone *fz) -{ - struct fib_node **ht, **old_ht; - int old_divisor, new_divisor; - u32 new_hashmask; - - old_divisor = fz->fz_divisor; - - switch (old_divisor) { - case 16: - new_divisor = 256; - new_hashmask = 0xFF; - break; - case 256: - new_divisor = 1024; - new_hashmask = 0x3FF; - break; - default: - printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor); - return; - } -#if RT_CACHE_DEBUG >= 2 - printk("fib_rehash_zone: hash for zone %d grows from %d\n", fz->fz_logmask, old_divisor); -#endif - - ht = kmalloc(new_divisor*sizeof(struct rtable*), GFP_KERNEL); - - if (ht) { - memset(ht, 0, new_divisor*sizeof(struct fib_node*)); - start_bh_atomic(); - old_ht = fz->fz_hash; - fz->fz_hash = ht; - fz->fz_hashmask = new_hashmask; - fz->fz_divisor = new_divisor; - 
fib_rebuild_zone(fz, old_ht, old_divisor); - fib_stamp++; - end_bh_atomic(); - kfree(old_ht); - } -} - -static struct fib_zone * -fib_new_zone(struct fib_class *class, int logmask) -{ - int i; - struct fib_zone *fz = kmalloc(sizeof(struct fib_zone), GFP_KERNEL); - if (!fz) - return NULL; - - memset(fz, 0, sizeof(struct fib_zone)); - if (logmask < 32) { - fz->fz_divisor = 16; - fz->fz_hashmask = 0xF; - } else { - fz->fz_divisor = 1; - fz->fz_hashmask = 0; - } - fz->fz_hash = kmalloc(fz->fz_divisor*sizeof(struct fib_node*), GFP_KERNEL); - if (!fz->fz_hash) { - kfree(fz); - return NULL; - } - memset(fz->fz_hash, 0, fz->fz_divisor*sizeof(struct fib_node*)); - fz->fz_logmask = logmask; - fz->fz_mask = ntohl(fib_mask(logmask)); - for (i=logmask-1; i>=0; i--) - if (class->fib_zones[i]) - break; - start_bh_atomic(); - if (i<0) { - fz->fz_next = class->fib_zone_list; - class->fib_zone_list = fz; - } else { - fz->fz_next = class->fib_zones[i]->fz_next; - class->fib_zones[i]->fz_next = fz; - } - class->fib_zones[logmask] = fz; - fib_stamp++; - end_bh_atomic(); - return fz; -} - -static int fib_create(struct in_rtmsg *r, struct device *dev, - struct fib_class *class, struct nlmsghdr *n) -{ - struct fib_node *f, *f1, **fp; - struct fib_node **dup_fp = NULL; - struct fib_zone * fz; - struct fib_info * fi; - - long logmask = 32L - r->rtmsg_prefixlen; /* gcc bug work-around: must be "L" and "long" */ - u32 dst = ntohl(r->rtmsg_prefix.s_addr); - u32 gw = r->rtmsg_gateway.s_addr; - short metric = r->rtmsg_metric; - unsigned flags = r->rtmsg_flags; - u8 tos = r->rtmsg_tos; - u8 fibflg = 0; - u32 key; - - /* - * Allocate an entry and fill it in. 
- */ - - f = (struct fib_node *) kmalloc(sizeof(struct fib_node), GFP_KERNEL); - if (f == NULL) { - rtmsg_ack(n, ENOMEM); - return -ENOMEM; - } - - memset(f, 0, sizeof(struct fib_node)); - - if (!(flags & RTF_UP)) - fibflg = FIBFLG_DOWN; - if (flags & RTF_REJECT) - fibflg |= FIBFLG_REJECT; - else if (flags & RTF_THROW) - fibflg |= FIBFLG_THROW; - - flags &= ~(RTF_UP|RTF_REJECT|RTF_THROW); - r->rtmsg_flags = flags; - - fi = NULL; - if (!(fibflg & (FIBFLG_REJECT|FIBFLG_THROW))) { - if ((fi = fib_create_info(dev, r)) == NULL) { - kfree_s(f, sizeof(struct fib_node)); - rtmsg_ack(n, ENOMEM); - return -ENOMEM; - } - f->fib_info = fi; - flags = fi->fib_flags; - } - - f->fib_key = key = dst>>logmask; - f->fib_metric = metric; - f->fib_tos = tos; - f->fib_flag = fibflg; - fz = class->fib_zones[logmask]; - - if (!fz && !(fz = fib_new_zone(class, logmask))) { - fib_free_node(f); - rtmsg_ack(n, ENOMEM); - return -ENOMEM; - } - - if (fz->fz_nent > (fz->fz_divisor<<2) && - fz->fz_divisor < FZ_MAX_DIVISOR && - (!logmask || (1<<(32-logmask)) > fz->fz_divisor)) - fib_rehash_zone(fz); - - fp = fz_hash_p(key, fz); - - /* - * Scan list to find the first route with the same destination - */ - FIB_SCAN(f1, fp) { - if (f1->fib_key == key) - break; - } - - /* - * Find route with the same destination and tos. - */ - FIB_SCAN_KEY(f1, fp, dst) { - if (f1->fib_tos <= tos) - break; - } - - /* - * Find route with the same destination/tos and less (or equal) metric. - * "Magic" additions go to the end of list. - */ - for ( ; (f1 = *fp) != NULL && f1->fib_key == key && f1->fib_tos == tos; - fp = &f1->fib_next) { - if (f1->fib_metric >= metric && metric != MAGIC_METRIC) - break; - - /* - * Record route with the same destination/tos/gateway/dev, - * but less metric. 
- */ - if (!dup_fp) { - struct fib_info *fi1 = f1->fib_info; - - if ((fibflg^f1->fib_flag) & ~FIBFLG_DOWN) - continue; - if (fi == fi1 || - (fi && fi1 && - fi->fib_dev == fi1->fib_dev && - fi->fib_gateway == fi1->fib_gateway && - !(flags&RTF_MAGIC))) - dup_fp = fp; - } - } - - /* - * Is it already present? - */ - - if (f1 && f1->fib_key == key && f1->fib_tos == tos && - f1->fib_metric == metric && f1->fib_info == fi) { - fib_free_node(f); - - if (fibflg == f1->fib_flag) { - rtmsg_ack(n, EEXIST); - return -EEXIST; - } else { - fib_stamp++; - f1->fib_flag = fibflg; - rt_cache_flush(0); - rtmsg_ack(n, 0); - return 0; - } - } - - /* - * Do not add "magic" route, if better one is already present. - */ - if ((flags & RTF_MAGIC) && dup_fp) { - fib_free_node(f); - rtmsg_ack(n, EEXIST); - return -EEXIST; - } - - /* - * Insert new entry to the list. - */ - - cli(); - f->fib_next = f1; - *fp = f; - sti(); - fz->fz_nent++; - if (class == &local_class && !dup_fp) - fib_autopublish(1, f, logmask); - rtmsg_fib(RTMSG_NEWROUTE, f, logmask, class, n); - - if (flags & RTF_MAGIC) { - fib_stamp++; - rt_cache_flush(0); - rtmsg_ack(n, 0); - return 0; - } - - /* - * Clean routes with the same destination,tos,gateway and device, - * but different metric. - */ - fp = dup_fp ? 
: &f->fib_next; - - while ((f1 = *fp) != NULL && f1->fib_key == key && f1->fib_tos == tos) { - if (f1 == f || ((f1->fib_flag^fibflg)&~FIBFLG_DOWN)) - FIB_CONTINUE(f1, fp); - - if (f1->fib_info != fi && - (!fi || !f1->fib_info || - f1->fib_info->fib_gateway != gw || - f1->fib_info->fib_dev != dev)) - FIB_CONTINUE(f1, fp); - - cli(); - *fp = f1->fib_next; - sti(); - fz->fz_nent--; - rtmsg_fib(RTMSG_DELROUTE, f1, logmask, class, n); - fib_free_node(f1); - } - fib_stamp++; - rt_cache_flush(0); - rtmsg_ack(n, 0); - return 0; -} - -static int fib_flush_list(struct fib_node ** fp, struct device *dev, - int logmask, struct fib_class *class) -{ - int found = 0; - struct fib_node *f; - - while ((f = *fp) != NULL) { - if (!f->fib_info || f->fib_info->fib_dev != dev) - FIB_CONTINUE(f, fp); - cli(); - *fp = f->fib_next; - sti(); - if (class == &local_class) - fib_autopublish(0, f, logmask); -#ifdef CONFIG_RTNETLINK - if (rt_nl_flags&RTCTL_FLUSH) - rtmsg_fib(RTMSG_DELROUTE, f, logmask, class, 0); -#endif - fib_free_node(f); - found++; - } - return found; -} - -static void fib_flush(struct device *dev) -{ - struct fib_class *class; - struct fib_rule *cl, **clp; - struct fib_zone *fz; - int found = 0; - int i, tmp, cl_id; - - - for (cl_id = RT_CLASS_MAX; cl_id>=0; cl_id--) { - if ((class = fib_classes[cl_id])==NULL) - continue; - for (fz = class->fib_zone_list; fz; fz = fz->fz_next) { - tmp = 0; - for (i=fz->fz_divisor-1; i>=0; i--) - tmp += fib_flush_list(&fz->fz_hash[i], dev, - fz->fz_logmask, class); - fz->fz_nent -= tmp; - found += tmp; - } - } - - clp = &fib_rules; - while ( (cl=*clp) != NULL) { - if (cl->cl_dev != dev) { - clp = &cl->cl_next; - continue; - } - found++; - cli(); - *clp = cl->cl_next; - sti(); - kfree(cl); - } - - if (found) { - fib_stamp++; - rt_cache_flush(1); - } -} - -#ifdef CONFIG_PROC_FS - -static unsigned __inline__ fib_flag_trans(u8 fibflg) -{ - unsigned ret = RTF_UP; - if (!fibflg) - return ret; - if (fibflg & FIBFLG_DOWN) - ret &= ~RTF_UP; - if 
(fibflg & FIBFLG_REJECT) - ret |= RTF_REJECT; - if (fibflg & FIBFLG_THROW) - ret |= RTF_THROW; - return ret; -} - -/* - * Called from the PROCfs module. This outputs /proc/net/route. - * - * We preserve the old format but pad the buffers out. This means that - * we can spin over the other entries as we read them. Remember the - * gated BGP4 code could need to read 60,000+ routes on occasion (that's - * about 7Mb of data). To do that ok we will need to also cache the - * last route we got to (reads will generally be following on from - * one another without gaps). - */ - -static int fib_get_info(char *buffer, char **start, off_t offset, int length, int dummy) -{ - struct fib_class *class; - struct fib_zone *fz; - struct fib_node *f; - int len=0; - off_t pos=0; - char temp[129]; - int i; - int cl_id; - - pos = 128; - - if (offset<128) - { - sprintf(buffer,"%-127s\n","Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT\tTOS\tClass"); - len = 128; - } - - fib_lock(); - - for (cl_id=RT_CLASS_MAX-1; cl_id >= 0; cl_id--) { - class = fib_classes[cl_id]; - if (!class) - continue; - for (fz=class->fib_zone_list; fz; fz = fz->fz_next) - { - int maxslot; - struct fib_node ** fp; - - if (fz->fz_nent == 0) - continue; - - if (pos + 128*fz->fz_nent <= offset) { - pos += 128*fz->fz_nent; - len = 0; - continue; - } - - maxslot = fz->fz_divisor; - fp = fz->fz_hash; - - for (i=0; i < maxslot; i++, fp++) { - - for (f = *fp; f; f = f->fib_next) - { - struct fib_info * fi; - unsigned flags; - - /* - * Spin through entries until we are ready - */ - pos += 128; - - if (pos <= offset) - { - len=0; - continue; - } - - fi = f->fib_info; - flags = fib_flag_trans(f->fib_flag); - - if (fi) - flags |= fi->fib_flags; - sprintf(temp, "%s\t%08lX\t%08X\t%04X\t%d\t%u\t%d\t%08lX\t%d\t%lu\t%u\t%02x\t%02x", - fi && fi->fib_dev ? fi->fib_dev->name : "*", htonl(f->fib_key<<fz->fz_logmask), fi ? fi->fib_gateway : 0, - flags, 0, 0, f->fib_metric, - htonl(fz->fz_mask), fi ? 
(int)fi->fib_mtu : 0, fi ? fi->fib_window : 0, fi ? (int)fi->fib_irtt : 0, f->fib_tos, class->cl_id); - sprintf(buffer+len,"%-127s\n",temp); - - len += 128; - if (pos >= offset+length) - goto done; - } - } - } - } - -done: - fib_unlock(); - - *start = buffer+len-(pos-offset); - len = pos - offset; - if (len>length) - len = length; - return len; -} - -static int fib_local_get_info(char *buffer, char **start, off_t offset, int length, int dummy) -{ - struct fib_zone *fz; - struct fib_node *f; - int len=0; - off_t pos=0; - char temp[129]; - int i; - - pos = 128; - - if (offset<128) - { - sprintf(buffer,"%-127s\n","Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT\tTOS\tClass"); - len = 128; - } - - fib_lock(); - - for (fz=local_class.fib_zone_list; fz; fz = fz->fz_next) - { - int maxslot; - struct fib_node ** fp; - - if (fz->fz_nent == 0) - continue; - - if (pos + 128*fz->fz_nent <= offset) - { - pos += 128*fz->fz_nent; - len = 0; - continue; - } - - maxslot = fz->fz_divisor; - fp = fz->fz_hash; - - for (i=0; i < maxslot; i++, fp++) - { - - for (f = *fp; f; f = f->fib_next) - { - unsigned flags; - struct fib_info * fi; - - /* - * Spin through entries until we are ready - */ - pos += 128; - - if (pos <= offset) - { - len=0; - continue; - } - - fi = f->fib_info; - flags = fib_flag_trans(f->fib_flag); - - if (fi) - flags |= fi->fib_flags; - sprintf(temp, "%s\t%08lX\t%08X\t%X\t%d\t%u\t%d\t%08lX\t%d\t%lu\t%u\t%02x\t%02x", - fi && fi->fib_dev ? fi->fib_dev->name : "*", - htonl(f->fib_key<<fz->fz_logmask), - fi ? fi->fib_gateway : 0, - flags, 0, 0, f->fib_metric, - htonl(fz->fz_mask), fi ? (int)fi->fib_mtu : 0, fi ? fi->fib_window : 0, fi ? 
(int)fi->fib_irtt : 0, f->fib_tos, RT_CLASS_LOCAL); - sprintf(buffer+len,"%-127s\n",temp); - - len += 128; - if (pos >= offset+length) - goto done; - } - } - } - -done: - fib_unlock(); - - *start = buffer+len-(pos-offset); - len = pos - offset; - if (len>length) - len = length; - return len; -} - -static int fib_rules_get_info(char *buffer, char **start, off_t offset, int length, int dummy) -{ - int len=0; - off_t pos=0; - char temp[129]; - struct fib_rule *cl; - - pos = 128; - - if (offset<128) { - sprintf(buffer,"%-127s\n","Pref\tSource\t\tSrcMask\t\tDst\t\tDstMask\t\tIface\tTOS\tClass\tFlags\tSrcMap\n"); - len = 128; - } - - - fib_lock(); - - for (cl = fib_rules; cl; cl = cl->cl_next) { - /* - * Spin through entries until we are ready - */ - pos += 128; - - if (pos <= offset) { - len = 0; - continue; - } - - sprintf(temp, "%d\t%08X\t%08X\t%08X\t%08X\t%s\t%02X\t%02x\t%02X\t%02X\t%08X", - cl->cl_preference, - cl->cl_src, cl->cl_srcmask, - cl->cl_dst, cl->cl_dstmask, - cl->cl_dev ? cl->cl_dev->name : "*", - cl->cl_tos, cl->cl_class ? 
cl->cl_class->cl_id : 0, - cl->cl_flags, cl->cl_action, cl->cl_srcmap - ); - sprintf(buffer+len,"%-127s\n",temp); - len += 128; - if (pos >= offset+length) - goto done; - } - -done: - fib_unlock(); - - *start = buffer+len-(pos-offset); - len = pos-offset; - if (len>length) - len = length; - return len; -} - -static int fib_class_get_info(char *buffer, char **start, off_t offset, int length, int dummy) -{ - int len=0; - off_t pos=0; - char temp[129]; - int i; - struct fib_class *cl; - - pos = 128; - - if (offset<128) - { - sprintf(buffer,"%-127s\n","Class\tSize\n"); - len = 128; - } - - - fib_lock(); - - for (i = RT_CLASS_MAX; i>=0; i--) - { - int sz = 0; - struct fib_zone *fz; - - if ((cl=fib_classes[i])==NULL) - continue; - - for (fz=cl->fib_zone_list; fz; fz=fz->fz_next) - sz += fz->fz_nent; - - /* - * Spin through entries until we are ready - */ - pos += 128; - - if (pos <= offset) - { - len = 0; - continue; - } - - sprintf(temp, "%d\t%d\n", cl->cl_id, sz); - sprintf(buffer+len,"%-127s\n",temp); - len += 128; - if (pos >= offset+length) - goto done; - } - -done: - fib_unlock(); - - *start = buffer+len-(pos-offset); - len = pos-offset; - if (len>length) - len = length; - return len; -} - -#endif - -static int rtmsg_process(struct nlmsghdr *n, struct in_rtmsg *r) -{ - unsigned long cmd=n->nlmsg_type; - struct device * dev = NULL; - struct fib_class *class; - - if ((cmd != RTMSG_NEWROUTE && cmd != RTMSG_DELROUTE) || - (r->rtmsg_flags & (RTF_MAGIC|RTF_XRESOLVE|RTF_REINSTATE)) || - r->rtmsg_prefixlen > 32 || - (r->rtmsg_tos & ~IPTOS_TOS_MASK)) { - rtmsg_ack(n, EINVAL); - return -EINVAL; - } - - /* Reject/throw directives have no interface/gateway specification */ - - if (r->rtmsg_flags & (RTF_REJECT|RTF_THROW)) { - r->rtmsg_ifindex = 0; - r->rtmsg_gateway.s_addr = 0; - r->rtmsg_flags &= ~RTF_GATEWAY; - } - - /* Silly metric hack, it is preserved for "compatibility", - * though I do not know any program using it. 
- */ - - r->rtmsg_metric--; - if (cmd == RTMSG_NEWROUTE && r->rtmsg_metric < 0) - r->rtmsg_metric = 0; - - if (cmd == RTMSG_DELROUTE) - r->rtmsg_flags &= RTF_FIB; - - if (r->rtmsg_ifindex) { - dev = dev_get_by_index(r->rtmsg_ifindex); - if (!dev) { - rtmsg_ack(n, ENODEV); - return -ENODEV; - } - } - - if (r->rtmsg_gateway.s_addr && !(r->rtmsg_flags&RTF_NAT)) { - struct fib_info *fi; - - fi = fib_lookup_info(r->rtmsg_gateway.s_addr, 0, 1, - &loopback_dev, dev); - if (fi) { - if (fi->fib_flags&(RTF_BROADCAST|RTF_MULTICAST) && - cmd != RTMSG_DELROUTE) - return -EINVAL; - dev = fi->fib_dev; - if (fi->fib_flags&RTF_LOCAL) { - r->rtmsg_flags &= ~RTF_GATEWAY; - r->rtmsg_gateway.s_addr = 0; - } - } else if (cmd != RTMSG_DELROUTE) - return -ENETUNREACH; - - /* If gateway is not found in routing table, - * we could assume that user knows that he does. - * It is link layer problem to decide reachable - * this gateway or not. Good example is tunnel interface. - * Another example is ethernet, ARP could (in theory) - * resolve addresses, even if we had no routes. - */ - } - - if (dev && (dev->flags&IFF_LOOPBACK)) { - if (r->rtmsg_flags&RTF_GATEWAY) - return -EINVAL; - /* - * Loopback routes: we declare them local addresses. - * It is the only reasonable solution to avoid - * loopback routing loops. 
- */ - r->rtmsg_flags |= RTF_LOCAL|RTF_INTERFACE; - } - - if (r->rtmsg_flags&RTF_GATEWAY) { - if (!dev && cmd != RTMSG_DELROUTE) { - rtmsg_ack(n, ENETUNREACH); - return -ENETUNREACH; - } - } else { - if (!dev && !(r->rtmsg_flags & (RTF_NAT|RTF_REJECT|RTF_THROW)) && - cmd != RTMSG_DELROUTE) { - rtmsg_ack(n, ENODEV); - return -ENODEV; - } - } - - if (dev && dev->family != AF_INET) - { - rtmsg_ack(n, ENODEV); - return -ENODEV; - } - - if (r->rtmsg_class == 0) { - if (r->rtmsg_flags&(RTF_LOCAL|RTF_NAT)) - r->rtmsg_class = RT_CLASS_LOCAL; - else if ((r->rtmsg_flags&RTF_GATEWAY) && - (ipv4_config.fib_model==2 || - (ipv4_config.fib_model==1 && !r->rtmsg_prefixlen))) - r->rtmsg_class = RT_CLASS_DEFAULT; - else - r->rtmsg_class = RT_CLASS_MAIN; - } - - if ((class = fib_classes[r->rtmsg_class]) == NULL) - { - rtmsg_ack(n, EINVAL); - return -EINVAL; - } - - return (cmd == RTMSG_NEWROUTE ? fib_create : fib_delete)(r, dev, class, n); -} - - -static int rtrulemsg_process(struct nlmsghdr *n, struct in_rtrulemsg *r) -{ - unsigned long cmd=n->nlmsg_type; - struct device * dev = NULL; - - if ((cmd != RTMSG_NEWRULE && cmd != RTMSG_DELRULE) || - r->rtrmsg_srclen > 32 || r->rtrmsg_dstlen > 32 || - (r->rtrmsg_tos & ~IPTOS_TOS_MASK)) - return -EINVAL; - - if (r->rtrmsg_ifindex) { - dev = dev_get_by_index(r->rtrmsg_ifindex); - if (!dev) - return -ENODEV; - if (dev->family != AF_INET) - return -ENODEV; - } - - if (cmd == RTMSG_DELRULE) - return fib_rule_delete(r, dev, n); - - return fib_rule_add(r, dev, n); -} - - -static int ifmsg_process(struct nlmsghdr *n, struct in_ifmsg *r) -{ - unsigned long cmd=n->nlmsg_type; - - if (cmd != RTMSG_NEWDEVICE && cmd != RTMSG_DELDEVICE) { - rtmsg_ack(n, EINVAL); - return -EINVAL; - } - rtmsg_ack(n, EINVAL); - return -EINVAL; -} - -static int rtcmsg_process(struct nlmsghdr *n, struct in_rtctlmsg *r) -{ -#ifdef CONFIG_RTNETLINK - if (r->rtcmsg_flags&RTCTL_DELAY) - rtmsg_ctl.nlmsg_delay = r->rtcmsg_delay; - if (r->rtcmsg_flags&RTCTL_OWNER) - rt_nl_owner = 
n->nlmsg_pid; - rt_nl_flags = r->rtcmsg_flags; - return 0; -#else - return -EINVAL; -#endif -} - -static int get_rt_from_user(struct in_rtmsg *rtm, void *arg) -{ - struct rtentry r; - - if (copy_from_user(&r, arg, sizeof(struct rtentry))) - return -EFAULT; - if (r.rt_dev) { - struct device *dev; - char devname[16]; - - if (copy_from_user(devname, r.rt_dev, 15)) - return -EFAULT; - devname[15] = 0; - dev = dev_get(devname); - if (!dev) - return -ENODEV; - rtm->rtmsg_ifindex = dev->ifindex; - } - - rtm->rtmsg_flags = r.rt_flags; - - if (r.rt_dst.sa_family != AF_INET) - return -EAFNOSUPPORT; - rtm->rtmsg_prefix = ((struct sockaddr_in*)&r.rt_dst)->sin_addr; - - if (rtm->rtmsg_flags&RTF_HOST) { - rtm->rtmsg_flags &= ~RTF_HOST; - rtm->rtmsg_prefixlen = 32; - } else { - u32 mask = ((struct sockaddr_in*)&r.rt_genmask)->sin_addr.s_addr; - if (r.rt_genmask.sa_family != AF_INET) { - printk(KERN_DEBUG "%s forgot to specify route netmask.\n", current->comm); - if (r.rt_genmask.sa_family) - return -EAFNOSUPPORT; - } - if (bad_mask(mask, rtm->rtmsg_prefix.s_addr)) - return -EINVAL; - rtm->rtmsg_prefixlen = 32 - fib_logmask(mask); - } - if ((rtm->rtmsg_flags & RTF_GATEWAY) && - r.rt_gateway.sa_family != AF_INET) - return -EAFNOSUPPORT; - rtm->rtmsg_gateway = ((struct sockaddr_in*)&r.rt_gateway)->sin_addr; - rtm->rtmsg_rtt = r.rt_irtt; - rtm->rtmsg_window = r.rt_window; - rtm->rtmsg_mtu = r.rt_mtu; - rtm->rtmsg_class = r.rt_class; - rtm->rtmsg_metric = r.rt_metric; - rtm->rtmsg_tos = r.rt_tos; - return 0; -} - - -/* - * Handle IP routing ioctl calls. 
These are used to manipulate the routing tables - */ - -int ip_rt_ioctl(unsigned int cmd, void *arg) -{ - int err; - union - { - struct in_rtmsg rtmsg; - struct in_ifmsg ifmsg; - struct in_rtrulemsg rtrmsg; - struct in_rtctlmsg rtcmsg; - } m; - struct nlmsghdr dummy_nlh; - - memset(&m, 0, sizeof(m)); - dummy_nlh.nlmsg_seq = 0; - dummy_nlh.nlmsg_pid = current->pid; - - switch (cmd) - { - case SIOCADDRT: /* Add a route */ - case SIOCDELRT: /* Delete a route */ - if (!suser()) - return -EPERM; - err = get_rt_from_user(&m.rtmsg, arg); - if (err) - return err; - fib_lock(); - dummy_nlh.nlmsg_type = cmd == SIOCDELRT ? RTMSG_DELROUTE - : RTMSG_NEWROUTE; - err = rtmsg_process(&dummy_nlh, &m.rtmsg); - fib_unlock(); - return err; - case SIOCRTMSG: - if (!suser()) - return -EPERM; - if (copy_from_user(&dummy_nlh, arg, sizeof(dummy_nlh))) - return -EFAULT; - switch (dummy_nlh.nlmsg_type) - { - case RTMSG_NEWROUTE: - case RTMSG_DELROUTE: - if (dummy_nlh.nlmsg_len < sizeof(m.rtmsg) + sizeof(dummy_nlh)) - return -EINVAL; - if (copy_from_user(&m.rtmsg, arg+sizeof(dummy_nlh), sizeof(m.rtmsg))) - return -EFAULT; - fib_lock(); - err = rtmsg_process(&dummy_nlh, &m.rtmsg); - fib_unlock(); - return err; - case RTMSG_NEWRULE: - case RTMSG_DELRULE: - if (dummy_nlh.nlmsg_len < sizeof(m.rtrmsg) + sizeof(dummy_nlh)) - return -EINVAL; - if (copy_from_user(&m.rtrmsg, arg+sizeof(dummy_nlh), sizeof(m.rtrmsg))) - return -EFAULT; - fib_lock(); - err = rtrulemsg_process(&dummy_nlh, &m.rtrmsg); - fib_unlock(); - return err; - case RTMSG_NEWDEVICE: - case RTMSG_DELDEVICE: - if (dummy_nlh.nlmsg_len < sizeof(m.ifmsg) + sizeof(dummy_nlh)) - return -EINVAL; - if (copy_from_user(&m.ifmsg, arg+sizeof(dummy_nlh), sizeof(m.ifmsg))) - return -EFAULT; - fib_lock(); - err = ifmsg_process(&dummy_nlh, &m.ifmsg); - fib_unlock(); - return err; - case RTMSG_CONTROL: - if (dummy_nlh.nlmsg_len < sizeof(m.rtcmsg) + sizeof(dummy_nlh)) - return -EINVAL; - if (copy_from_user(&m.rtcmsg, arg+sizeof(dummy_nlh), 
sizeof(m.rtcmsg))) - return -EFAULT; - fib_lock(); - err = rtcmsg_process(&dummy_nlh, &m.rtcmsg); - fib_unlock(); - return err; - default: - return -EINVAL; - } - } - - return -EINVAL; -} - -#ifdef CONFIG_RTNETLINK - -/* - * Netlink hooks for IP - */ - - -static void -rtmsg_fib(unsigned long type, struct fib_node *f, int logmask, - struct fib_class *class, struct nlmsghdr *n) -{ - struct in_rtmsg *r; - struct fib_info *fi; - - if (n && !(rt_nl_flags&RTCTL_ECHO) && rt_nl_owner == n->nlmsg_pid) - return; - - start_bh_atomic(); - r = nlmsg_send(&rtmsg_ctl, type, sizeof(*r), n ? n->nlmsg_seq : 0, - n ? n->nlmsg_pid : 0); - if (r) { - r->rtmsg_prefix.s_addr = htonl(f->fib_key<<logmask); - r->rtmsg_prefixlen = 32 - logmask; - r->rtmsg_metric= f->fib_metric; - r->rtmsg_tos = f->fib_tos; - r->rtmsg_class=class->cl_id; - r->rtmsg_flags = fib_flag_trans(f->fib_flag); - - if ((fi = f->fib_info) != NULL) { - r->rtmsg_gateway.s_addr = fi->fib_gateway; - r->rtmsg_flags |= fi->fib_flags; - r->rtmsg_mtu = fi->fib_mtu; - r->rtmsg_window = fi->fib_window; - r->rtmsg_rtt = fi->fib_irtt; - r->rtmsg_ifindex = fi->fib_dev ? fi->fib_dev->ifindex : 0; - } - } - end_bh_atomic(); -} - -static void -__rtmsg_ack(struct nlmsghdr *n, int err) -{ - nlmsg_ack(&rtmsg_ctl, n->nlmsg_seq, n->nlmsg_pid, err); -} - - -static void -rtmsg_dev(unsigned long type, struct device *dev, struct nlmsghdr *n) -{ - struct in_ifmsg *r; - - start_bh_atomic(); - r = nlmsg_send(&rtmsg_ctl, type, sizeof(*r), n ? n->nlmsg_seq : 0, - n ? 
n->nlmsg_pid : 0); - if (r) - { - memset(r, 0, sizeof(*r)); - r->ifmsg_lladdr.sa_family = dev->type; - memcpy(&r->ifmsg_lladdr.sa_data, dev->dev_addr, dev->addr_len); - r->ifmsg_prefix.s_addr = dev->pa_addr; - if (dev->flags & IFF_POINTOPOINT || dev->type == ARPHRD_TUNNEL) - r->ifmsg_brd.s_addr = dev->pa_dstaddr; - else - r->ifmsg_brd.s_addr = dev->pa_brdaddr; - r->ifmsg_flags = dev->flags; - r->ifmsg_mtu = dev->mtu; - r->ifmsg_metric = dev->metric; - r->ifmsg_prefixlen = 32 - fib_logmask(dev->pa_mask); - r->ifmsg_index = dev->ifindex; - strcpy(r->ifmsg_name, dev->name); - } - end_bh_atomic(); -} - -static int fib_netlink_call(int minor, struct sk_buff *skb) -{ - struct nlmsghdr *nlh; - int totlen = 0; - int err = 0; - - fib_lock(); - while (skb->len >= sizeof(*nlh)) { - int rlen; - nlh = (struct nlmsghdr *)skb->data; - rlen = NLMSG_ALIGN(nlh->nlmsg_len); - if (skb->len < rlen) - break; - totlen += rlen; - err = 0; - skb_pull(skb, rlen); - switch (nlh->nlmsg_type) { - case RTMSG_NEWROUTE: - case RTMSG_DELROUTE: - if (nlh->nlmsg_len < sizeof(*nlh)+sizeof(struct in_rtmsg)) { - rtmsg_ack(nlh, EINVAL); - err = -EINVAL; - break; - } - err = rtmsg_process(nlh, (struct in_rtmsg*)nlh->nlmsg_data); - break; - case RTMSG_NEWRULE: - case RTMSG_DELRULE: - if (nlh->nlmsg_len < sizeof(*nlh)+sizeof(struct in_rtrulemsg)) { - rtmsg_ack(nlh, EINVAL); - err = -EINVAL; - break; - } - err = rtrulemsg_process(nlh, (struct in_rtrulemsg*)nlh->nlmsg_data); - break; - case RTMSG_NEWDEVICE: - case RTMSG_DELDEVICE: - if (nlh->nlmsg_len < sizeof(*nlh)+sizeof(struct in_ifmsg)) { - rtmsg_ack(nlh, EINVAL); - err = -EINVAL; - break; - } - err = ifmsg_process(nlh, (struct in_ifmsg*)nlh->nlmsg_data); - break; - case RTMSG_CONTROL: - if (nlh->nlmsg_len < sizeof(*nlh)+sizeof(struct in_rtctlmsg)) { - rtmsg_ack(nlh, EINVAL); - err = -EINVAL; - break; - } - err = rtcmsg_process(nlh, (struct in_rtctlmsg*)nlh->nlmsg_data); - break; - default: - break; - } - } - kfree_skb(skb, FREE_READ); - fib_unlock(); - 
if (!err || rt_nl_flags&RTCTL_ACK) - return totlen; - return err; -} - -#endif - - -static int fib_magic(int op, unsigned flags, u32 dst, u32 mask, struct device *dev) -{ - struct nlmsghdr n; - struct in_rtmsg r; - memset(&r, 0, sizeof(r)); - n.nlmsg_seq=0; - n.nlmsg_pid=0; - r.rtmsg_metric = MAGIC_METRIC; - r.rtmsg_prefix.s_addr = dst; - if (dev->flags&IFF_LOOPBACK) - flags |= RTF_LOCAL; - r.rtmsg_flags = flags; - r.rtmsg_prefixlen = 32 - fib_logmask(mask); - - return (op == RTMSG_NEWROUTE ? fib_create : fib_delete) - (&r, dev, (flags&RTF_LOCAL) ? &local_class : &main_class, &n); -} - -static void ip_rt_del_broadcasts(struct device *dev) -{ - u32 net = dev->pa_addr&dev->pa_mask; - - fib_magic(RTMSG_DELROUTE, RTF_IFBRD, dev->pa_brdaddr, ~0, dev); - fib_magic(RTMSG_DELROUTE, RTF_IFBRD, net, ~0, dev); - fib_magic(RTMSG_DELROUTE, RTF_IFBRD, net|~dev->pa_mask, ~0, dev); -} - -static void ip_rt_add_broadcasts(struct device *dev, u32 brd, u32 mask) -{ - u32 net = dev->pa_addr&mask; - - if (dev->flags&IFF_BROADCAST) - fib_magic(RTMSG_NEWROUTE, RTF_IFBRD, brd, ~0, dev); - - if (net && !(mask&htonl(1))) { - fib_magic(RTMSG_NEWROUTE, RTF_IFBRD, net, ~0, dev); - fib_magic(RTMSG_NEWROUTE, RTF_IFBRD, net|~mask, ~0, dev); - } -} - -void ip_rt_change_broadcast(struct device *dev, u32 new_brd) -{ - fib_lock(); - printk(KERN_DEBUG "%s changes brd %08X -> %08X\n", - dev->name, (u32)dev->pa_brdaddr, new_brd); - if (!ZERONET(dev->pa_addr) && dev->flags&IFF_BROADCAST) { - fib_magic(RTMSG_DELROUTE, RTF_IFBRD, dev->pa_brdaddr, ~0, dev); - rtmsg_dev(RTMSG_DELDEVICE, dev, NULL); - rtmsg_dev(RTMSG_NEWDEVICE, dev, NULL); - ip_rt_add_broadcasts(dev, new_brd, dev->pa_mask); - } - fib_unlock(); -} - -void ip_rt_change_dstaddr(struct device *dev, u32 dstaddr) -{ - fib_lock(); - if (!ZERONET(dev->pa_addr) && (dev->flags&IFF_POINTOPOINT) && dev->type != ARPHRD_TUNNEL) { - printk(KERN_DEBUG "%s changes dst %08X -> %08X\n", - dev->name, (u32)dev->pa_dstaddr, dstaddr); - fib_magic(RTMSG_DELROUTE, 
RTF_IFPREFIX, dev->pa_dstaddr, ~0, dev); - rtmsg_dev(RTMSG_DELDEVICE, dev, NULL); - rtmsg_dev(RTMSG_NEWDEVICE, dev, NULL); - if (dstaddr) - fib_magic(RTMSG_NEWROUTE, RTF_IFPREFIX, dstaddr, ~0, dev); - } - fib_unlock(); -} - -void ip_rt_change_netmask(struct device *dev, u32 mask) -{ - u32 net; - - fib_lock(); - printk(KERN_DEBUG "%s changes netmask %08X -> %08X\n", - dev->name, (u32)dev->pa_mask, mask); - if (ZERONET(dev->pa_addr)) { - fib_unlock(); - return; - } - net = dev->pa_addr&dev->pa_mask; - fib_magic(RTMSG_DELROUTE, RTF_IFPREFIX, net, dev->pa_mask, dev); - ip_rt_del_broadcasts(dev); - if (mask != 0xFFFFFFFF && dev->flags&IFF_POINTOPOINT) - fib_magic(RTMSG_DELROUTE, RTF_IFPREFIX, dev->pa_dstaddr, ~0, dev); - rtmsg_dev(RTMSG_DELDEVICE, dev, NULL); - - if (mask != 0xFFFFFFFF) - dev->flags &= ~IFF_POINTOPOINT; - - rtmsg_dev(RTMSG_NEWDEVICE, dev, NULL); - net = dev->pa_addr&mask; - if (net) - fib_magic(RTMSG_NEWROUTE, RTF_IFPREFIX, net, mask, dev); - ip_rt_add_broadcasts(dev, dev->pa_addr, mask); - fib_unlock(); -} - -int ip_rt_event(int event, struct device *dev) -{ - fib_lock(); - if (event == NETDEV_DOWN) { - fib_flush(dev); - rtmsg_dev(RTMSG_DELDEVICE, dev, NULL); - fib_unlock(); - return NOTIFY_DONE; - } - if (event == NETDEV_CHANGE) { - printk(KERN_DEBUG "%s(%s) changes state fl=%08x pa=%08X/%08X brd=%08X dst=%08X\n", - dev->name, current->comm, dev->flags, (u32)dev->pa_addr, (u32)dev->pa_mask, - (u32)dev->pa_brdaddr, (u32)dev->pa_dstaddr); - if (!(dev->flags&IFF_BROADCAST)) - fib_magic(RTMSG_DELROUTE, RTF_IFBRD, dev->pa_brdaddr, ~0, dev); - if (!(dev->flags&IFF_POINTOPOINT)) - fib_magic(RTMSG_DELROUTE, RTF_IFPREFIX, dev->pa_dstaddr, ~0, dev); - else { - u32 net = dev->pa_addr&dev->pa_mask; - fib_magic(RTMSG_DELROUTE, RTF_IFPREFIX, net, dev->pa_mask, dev); - ip_rt_del_broadcasts(dev); - } - rtmsg_dev(RTMSG_DELDEVICE, dev, NULL); - } - - if ((event == NETDEV_UP || event == NETDEV_CHANGE) && !ZERONET(dev->pa_addr)) { - if (dev->flags&IFF_POINTOPOINT) { - 
dev->pa_mask = 0xFFFFFFFF; - dev->ip_flags &= ~IFF_IP_MASK_OK; - dev->flags &= ~IFF_BROADCAST; - dev->pa_brdaddr = 0; - } - - if (event == NETDEV_UP) - printk(KERN_DEBUG "%s UP fl=%08x pa=%08X/%08X brd=%08X dst=%08X\n", - dev->name, dev->flags, (u32)dev->pa_addr, - (u32)dev->pa_mask, (u32)dev->pa_brdaddr, (u32)dev->pa_dstaddr); - - rtmsg_dev(RTMSG_NEWDEVICE, dev, NULL); - - if (dev->flags&IFF_POINTOPOINT) { - if (dev->pa_dstaddr && dev->type != ARPHRD_TUNNEL) - fib_magic(RTMSG_NEWROUTE, RTF_IFPREFIX, dev->pa_dstaddr, ~0, dev); - } else { - u32 net = dev->pa_addr&dev->pa_mask; - - if (net) - fib_magic(RTMSG_NEWROUTE, RTF_IFPREFIX, net, dev->pa_mask, dev); - ip_rt_add_broadcasts(dev, dev->pa_brdaddr, dev->pa_mask); - } - fib_magic(RTMSG_NEWROUTE, RTF_IFLOCAL, dev->pa_addr, ~0, dev); - if (dev == &loopback_dev) { - if (dev->pa_addr != htonl(INADDR_LOOPBACK)) { - u32 mask = htonl(0xFF000000); - fib_magic(RTMSG_NEWROUTE, RTF_IFPREFIX, - htonl(INADDR_LOOPBACK)&mask, - mask, dev); - fib_magic(RTMSG_NEWROUTE, RTF_IFLOCAL, - htonl(INADDR_LOOPBACK), - mask, dev); - } - } - } - if (event == NETDEV_CHANGEMTU || event == NETDEV_CHANGEADDR) - rtmsg_dev(RTMSG_NEWDEVICE, dev, NULL); - fib_unlock(); - return NOTIFY_DONE; -} - - -__initfunc(void ip_fib_init(void)) -{ - struct in_rtrulemsg r; - -#ifdef CONFIG_PROC_FS - proc_net_register(&(struct proc_dir_entry) { - PROC_NET_ROUTE, 5, "route", - S_IFREG | S_IRUGO, 1, 0, 0, - 0, &proc_net_inode_operations, - fib_get_info - }); - proc_net_register(&(struct proc_dir_entry) { - PROC_NET_RTCLASSES, 10, "rt_classes", - S_IFREG | S_IRUGO, 1, 0, 0, - 0, &proc_net_inode_operations, - fib_class_get_info - }); - proc_net_register(&(struct proc_dir_entry) { - PROC_NET_RTLOCAL, 8, "rt_local", - S_IFREG | S_IRUGO, 1, 0, 0, - 0, &proc_net_inode_operations, - fib_local_get_info - }); - proc_net_register(&(struct proc_dir_entry) { - PROC_NET_RTRULES, 8, "rt_rules", - S_IFREG | S_IRUGO, 1, 0, 0, - 0, &proc_net_inode_operations, - fib_rules_get_info - 
}); -#endif /* CONFIG_PROC_FS */ - - fib_classes[RT_CLASS_LOCAL] = &local_class; - fib_classes[RT_CLASS_MAIN] = &main_class; - fib_classes[RT_CLASS_DEFAULT] = &default_class; - - memset(&r, 0, sizeof(r)); - r.rtrmsg_class = RT_CLASS_LOCAL; - r.rtrmsg_preference = 0; - fib_rule_add(&r, NULL, NULL); - - memset(&r, 0, sizeof(r)); - r.rtrmsg_class = RT_CLASS_DEFAULT; - r.rtrmsg_preference = 255; - fib_rule_add(&r, NULL, NULL); - - memset(&r, 0, sizeof(r)); - r.rtrmsg_class = RT_CLASS_MAIN; - r.rtrmsg_preference = 254; - fib_rule_add(&r, NULL, NULL); - -#ifdef CONFIG_RTNETLINK - netlink_attach(NETLINK_ROUTE, fib_netlink_call); -#endif -} diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 667d2352c..e66efde90 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -3,6 +3,8 @@ * * Alan Cox, <alan@cymru.net> * + * Version: $Id: icmp.c,v 1.35 1997/10/19 18:17:13 freitag Exp $ + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -41,6 +43,10 @@ * Andi Kleen : Check all packet lengths properly * and moved all kfree_skb() up to * icmp_rcv. + * Andi Kleen : Move the rate limit bookkeeping + * into the dest entry and use a tocken + * bucket filter (thanks to ANK). Make + * the rates sysctl configurable. * * RFC1122 (Host Requirements -- Comm. Layer) Status: * (boy, are there a lot of rules for ICMP) @@ -77,7 +83,7 @@ * [Solaris 2.X seems to assert EPROTO when this occurs] -- AC * 3.2.2.6 (Echo Request/Reply) * MUST reply to ECHO_REQUEST, and give app to do ECHO stuff (OK, OK) - * MAY discard broadcast ECHO_REQUESTs. (We don't, but that's OK.) + * MAY discard broadcast ECHO_REQUESTs. (Configurable with a sysctl.) * MUST reply using same source address as the request was sent to. * We're OK for unicast ECHOs, and it doesn't say anything about * how to handle broadcast ones, since it's optional. 
@@ -293,39 +299,9 @@ struct icmp_err icmp_err_convert[] = { { EHOSTUNREACH, 1 } /* ICMP_PREC_CUTOFF */ }; -/* - * A spare long used to speed up statistics updating - */ - -unsigned long dummy; - -/* - * ICMP transmit rate limit control structures. We use a relatively simple - * approach to the problem: For each type of ICMP message with rate limit - * we count the number of messages sent during some time quantum. If this - * count exceeds given maximal value, we ignore all messages not separated - * from the last message sent at least by specified time. - */ - -#define XRLIM_CACHE_SIZE 16 /* How many destination hosts do we cache */ - -struct icmp_xrl_cache /* One entry of the ICMP rate cache */ -{ - __u32 daddr; /* Destination address */ - unsigned long counter; /* Message counter */ - unsigned long next_reset; /* Time of next reset of the counter */ - unsigned long last_access; /* Time of last access to this entry (LRU) */ - unsigned int restricted; /* Set if we're in restricted mode */ - unsigned long next_packet; /* When we'll allow a next packet if restricted */ -}; - -struct icmp_xrlim -{ - unsigned long timeout; /* Time quantum for rate measuring */ - unsigned long limit; /* Maximal number of messages per time quantum allowed */ - unsigned long delay; /* How long we wait between packets when restricting */ - struct icmp_xrl_cache cache[XRLIM_CACHE_SIZE]; /* Rate cache */ -}; +/* Control parameters for ECHO replies. */ +int sysctl_icmp_echo_ignore_all = 0; +int sysctl_icmp_echo_ignore_broadcasts = 0; /* * ICMP control array. This specifies what to do with each ICMP. 
@@ -336,8 +312,8 @@ struct icmp_control unsigned long *output; /* Address to increment on output */ unsigned long *input; /* Address to increment on input */ void (*handler)(struct icmphdr *icmph, struct sk_buff *skb, int len); - unsigned long error; /* This ICMP is classed as an error message */ - struct icmp_xrlim *xrlim; /* Transmit rate limit control structure or NULL for no limits */ + short error; /* This ICMP is classed as an error message */ + int *timeout; /* Rate limit */ }; static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; @@ -369,100 +345,47 @@ struct socket *icmp_socket=&icmp_inode.u.socket_i; * Send an ICMP frame. */ - -/* - * Initialize the transmit rate limitation mechanism. - */ - -#ifndef CONFIG_NO_ICMP_LIMIT - -__initfunc(static void xrlim_init(void)) -{ - int type, entry; - struct icmp_xrlim *xr; - - for (type=0; type<=NR_ICMP_TYPES; type++) { - xr = icmp_pointers[type].xrlim; - if (xr) { - for (entry=0; entry<XRLIM_CACHE_SIZE; entry++) - xr->cache[entry].daddr = INADDR_NONE; - } - } -} - /* * Check transmit rate limitation for given message. + * The rate information is held in the destination cache now. + * This function is generic and could be used for other purposes + * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. * * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate - * SHOULD allow setting of rate limits (we allow - * in the source) + * SHOULD allow setting of rate limits + * + * Shared between ICMPv4 and ICMPv6. 
*/ - -static int xrlim_allow(int type, __u32 addr) +#define XRLIM_BURST_FACTOR 6 +int xrlim_allow(struct dst_entry *dst, int timeout) { - struct icmp_xrlim *r; - struct icmp_xrl_cache *c; unsigned long now; - if (type > NR_ICMP_TYPES) /* No time limit present */ - return 1; - r = icmp_pointers[type].xrlim; - if (!r) + now = jiffies; + dst->rate_tokens += now - dst->rate_last; + if (dst->rate_tokens > 6*timeout) + dst->rate_tokens = XRLIM_BURST_FACTOR*timeout; + if (dst->rate_tokens >= timeout) { + dst->rate_tokens -= timeout; return 1; + } + return 0; +} - for (c = r->cache; c < &r->cache[XRLIM_CACHE_SIZE]; c++) - /* Cache lookup */ - if (c->daddr == addr) - break; - - now = jiffies; /* Cache current time (saves accesses to volatile variable) */ +static inline int icmpv4_xrlim_allow(struct rtable *rt, int type, int code) +{ + struct dst_entry *dst = &rt->u.dst; - if (c == &r->cache[XRLIM_CACHE_SIZE]) { /* Cache miss */ - unsigned long oldest = now; /* Find the oldest entry to replace */ - struct icmp_xrl_cache *d; - c = r->cache; - for (d = r->cache; d < &r->cache[XRLIM_CACHE_SIZE]; d++) - if (!d->daddr) { /* Unused entry */ - c = d; - break; - } else if (d->last_access < oldest) { - oldest = d->last_access; - c = d; - } - c->last_access = now; /* Fill the entry with new data */ - c->daddr = addr; - c->counter = 1; - c->next_reset = now + r->timeout; - c->restricted = 0; + if (type > NR_ICMP_TYPES || !icmp_pointers[type].timeout) return 1; - } - c->last_access = now; - if (c->next_reset > now) { /* Let's increment the counter */ - c->counter++; - if (c->counter == r->limit) { /* Limit exceeded, start restrictions */ - c->restricted = 1; - c->next_packet = now + r->delay; - return 0; - } - if (c->restricted) { /* Any restrictions pending? 
*/ - if (c->next_packet > now) - return 0; - c->next_packet = now + r->delay; - return 1; - } - } else { /* Reset the counter */ - if (c->counter < r->limit) /* Switch off all restrictions */ - c->restricted = 0; - c->next_reset = now + r->timeout; - c->counter = 0; - } + /* Don't limit PMTU discovery. */ + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) + return 1; - return 1; /* Send the packet */ + return xrlim_allow(dst, *(icmp_pointers[type].timeout)); } -#endif /* CONFIG_NO_ICMP_LIMIT */ - /* * Maintain the counters used in the SNMP statistics for outgoing ICMP */ @@ -530,7 +453,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) ipc.opt = &icmp_param->replyopts; if (ipc.opt->srr) daddr = icmp_param->replyopts.faddr; - if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), NULL)) + if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0)) return; ip_build_xmit(sk, icmp_glue_bits, icmp_param, icmp_param->data_len+sizeof(struct icmphdr), @@ -578,7 +501,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info) */ if (!rt) return; - if (rt->rt_flags&(RTF_BROADCAST|RTF_MULTICAST)) + if (rt->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST)) return; @@ -610,34 +533,30 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info) } } - /* - * Check the rate limit - */ - -#ifndef CONFIG_NO_ICMP_LIMIT - if (!xrlim_allow(type, iph->saddr)) - return; -#endif /* * Construct source address and options. */ saddr = iph->daddr; - if (!(rt->rt_flags&RTF_LOCAL)) + if (!(rt->rt_flags&RTCF_LOCAL)) saddr = 0; tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) | IPTOS_PREC_INTERNETCONTROL) : iph->tos; - if (ip_route_output(&rt, iph->saddr, saddr, RT_TOS(tos), NULL)) + /* XXX: use a more aggressive expire for routes created by + * this call (not longer than the rate limit timeout). 
+ * It could be also worthwhile to not put them into ipv4 + * fast routing cache at first. + */ + if (ip_route_output(&rt, iph->saddr, saddr, RT_TOS(tos), 0)) return; - if (ip_options_echo(&icmp_param.replyopts, skb_in)) { - ip_rt_put(rt); - return; - } + if (ip_options_echo(&icmp_param.replyopts, skb_in)) + goto ende; + /* * Prepare data for ICMP header. @@ -655,10 +574,13 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info) ipc.opt = &icmp_param.replyopts; if (icmp_param.replyopts.srr) { ip_rt_put(rt); - if (ip_route_output(&rt, icmp_param.replyopts.faddr, saddr, RT_TOS(tos), NULL)) + if (ip_route_output(&rt, icmp_param.replyopts.faddr, saddr, RT_TOS(tos), 0)) return; } + if (!icmpv4_xrlim_allow(rt, type, code)) + goto ende; + /* RFC says return as much as we can without exceeding 576 bytes. */ room = rt->u.dst.pmtu; @@ -674,6 +596,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info) icmp_param.data_len+sizeof(struct icmphdr), &ipc, rt, MSG_DONTWAIT); +ende: ip_rt_put(rt); } @@ -753,7 +676,7 @@ static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb, int len) * get the other vendor to fix their kit. 
*/ - if(__ip_chk_addr(iph->daddr)==IS_BROADCAST) + if (inet_addr_type(iph->daddr) == RTN_BROADCAST) { if (net_ratelimit()) printk("%s sent an invalid ICMP error to a broadcast.\n", @@ -770,12 +693,12 @@ static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb, int len) hash = iph->protocol & (MAX_INET_PROTOS - 1); if ((raw_sk = raw_v4_htable[hash]) != NULL) { - raw_sk = raw_v4_lookup(raw_sk, iph->protocol, iph->saddr, iph->daddr); + raw_sk = raw_v4_lookup(raw_sk, iph->protocol, iph->saddr, iph->daddr, skb->dev->ifindex); while (raw_sk) { raw_err(raw_sk, skb); raw_sk = raw_v4_lookup(raw_sk->next, iph->protocol, - iph->saddr, iph->daddr); + iph->saddr, iph->daddr, skb->dev->ifindex); } } @@ -797,7 +720,7 @@ static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb, int len) /* appropriate protocol layer (MUST), as per 3.2.2. */ if (iph->protocol == ipprot->protocol && ipprot->err_handler) - ipprot->err_handler(skb, dp); + ipprot->err_handler(skb, dp, len); ipprot = nextip; } @@ -850,18 +773,18 @@ static void icmp_redirect(struct icmphdr *icmph, struct sk_buff *skb, int len) * RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring echo requests, MUST have default=NOT. * See also WRT handling of options once they are done and working. 
*/ - + static void icmp_echo(struct icmphdr *icmph, struct sk_buff *skb, int len) { -#ifndef CONFIG_IP_IGNORE_ECHO_REQUESTS - struct icmp_bxm icmp_param; - - icmp_param.icmph=*icmph; - icmp_param.icmph.type=ICMP_ECHOREPLY; - icmp_param.data_ptr=(icmph+1); - icmp_param.data_len=len; - icmp_reply(&icmp_param, skb); -#endif + if (!sysctl_icmp_echo_ignore_all) { + struct icmp_bxm icmp_param; + + icmp_param.icmph=*icmph; + icmp_param.icmph.type=ICMP_ECHOREPLY; + icmp_param.data_ptr=(icmph+1); + icmp_param.data_len=len; + icmp_reply(&icmp_param, skb); + } } /* @@ -928,32 +851,16 @@ static void icmp_timestamp(struct icmphdr *icmph, struct sk_buff *skb, int len) * Gratuitous mask announcements suffer from the same problem. * RFC1812 explains it, but still allows to use ADDRMASK, * that is pretty silly. --ANK + * + * All these rules are so bizarre, that I removed kernel addrmask + * support at all. It is wrong, it is obsolete, nobody uses it in + * any case. --ANK */ - + static void icmp_address(struct icmphdr *icmph, struct sk_buff *skb, int len) { - struct icmp_bxm icmp_param; - struct rtable *rt = (struct rtable*)skb->dst; - struct device *dev = skb->dev; - - if (!ipv4_config.addrmask_agent || - len < 4 || - ZERONET(rt->rt_src) || - rt->rt_src_dev != rt->u.dst.dev || - !(rt->rt_flags&RTCF_DIRECTSRC) || - (rt->rt_flags&RTF_GATEWAY) || - !(dev->ip_flags&IFF_IP_ADDR_OK) || - !(dev->ip_flags&IFF_IP_MASK_OK)) { - icmp_statistics.IcmpInErrors++; - return; - } - - icmp_param.icmph.type=ICMP_ADDRESSREPLY; - icmp_param.icmph.code=0; - icmp_param.icmph.un.echo = icmph->un.echo; - icmp_param.data_ptr=&dev->pa_mask; - icmp_param.data_len=4; - icmp_reply(&icmp_param, skb); + if (net_ratelimit()) + printk(KERN_DEBUG "a guy asks for address mask. 
Who is it?\n"); } /* @@ -965,27 +872,29 @@ static void icmp_address_reply(struct icmphdr *icmph, struct sk_buff *skb, int l { struct rtable *rt = (struct rtable*)skb->dst; struct device *dev = skb->dev; + struct in_device *in_dev = dev->ip_ptr; + struct in_ifaddr *ifa; u32 mask; if (!ipv4_config.log_martians || + !IS_ROUTER || + !in_dev || !in_dev->ifa_list || len < 4 || - !(rt->rt_flags&RTCF_DIRECTSRC) || - (rt->rt_flags&RTF_GATEWAY) || - !(dev->ip_flags&IFF_IP_ADDR_OK) || - !(dev->ip_flags&IFF_IP_MASK_OK)) { - icmp_statistics.IcmpInErrors++; + !(rt->rt_flags&RTCF_DIRECTSRC)) return; - } mask = *(u32*)&icmph[1]; - if (mask != dev->pa_mask && net_ratelimit()) + for (ifa=in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + if (mask == ifa->ifa_mask && inet_ifa_match(rt->rt_src, ifa)) + return; + } + if (net_ratelimit()) printk(KERN_INFO "Wrong address mask %08lX from %08lX/%s\n", ntohl(mask), ntohl(rt->rt_src), dev->name); } static void icmp_discard(struct icmphdr *icmph, struct sk_buff *skb, int len) { - return; } #ifdef CONFIG_IP_TRANSPARENT_PROXY @@ -1000,8 +909,8 @@ static void icmp_discard(struct icmphdr *icmph, struct sk_buff *skb, int len) */ /* This should work with the new hashes now. 
-DaveM */ -extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport); -extern struct sock *udp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport); +extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif); +extern struct sock *udp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif); int icmp_chkaddr(struct sk_buff *skb) { @@ -1017,7 +926,7 @@ int icmp_chkaddr(struct sk_buff *skb) { struct tcphdr *th = (struct tcphdr *)(((unsigned char *)iph)+(iph->ihl<<2)); - sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source); + sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex); if (!sk) return 0; if (sk->saddr != iph->saddr) return 0; if (sk->daddr != iph->daddr) return 0; @@ -1031,9 +940,9 @@ int icmp_chkaddr(struct sk_buff *skb) { struct udphdr *uh = (struct udphdr *)(((unsigned char *)iph)+(iph->ihl<<2)); - sk = udp_v4_lookup(iph->daddr, uh->dest, iph->saddr, uh->source); + sk = udp_v4_lookup(iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex); if (!sk) return 0; - if (sk->saddr != iph->saddr && __ip_chk_addr(iph->saddr) != IS_MYADDR) + if (sk->saddr != iph->saddr && inet_addr_type(iph->saddr) != RTN_LOCAL) return 0; /* * This packet may have come from us. @@ -1067,46 +976,59 @@ int icmp_rcv(struct sk_buff *skb, unsigned short len) if(len < sizeof(struct icmphdr) || ip_compute_csum((unsigned char *) icmph, len) || icmph->type > NR_ICMP_TYPES) - { - icmp_statistics.IcmpInErrors++; - kfree_skb(skb, FREE_READ); - return 0; - } + goto error; /* * Parse the ICMP message */ - if (rt->rt_flags&(RTF_BROADCAST|RTF_MULTICAST)) { + if (rt->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST)) { /* - * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be silently ignored (we don't as it is used - * by some network mapping tools). - * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently discarded if to broadcast/multicast. 
+ * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be + * silently ignored (we let user decide with a sysctl). + * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently + * discarded if to broadcast/multicast. */ + if (icmph->type == ICMP_ECHO && + sysctl_icmp_echo_ignore_broadcasts) { + goto error; + } if (icmph->type != ICMP_ECHO && icmph->type != ICMP_TIMESTAMP && icmph->type != ICMP_ADDRESS && icmph->type != ICMP_ADDRESSREPLY) { - icmp_statistics.IcmpInErrors++; - kfree_skb(skb, FREE_READ); - return(0); + goto error; } } len -= sizeof(struct icmphdr); (*icmp_pointers[icmph->type].input)++; (icmp_pointers[icmph->type].handler)(icmph, skb, len); + +drop: kfree_skb(skb, FREE_READ); return 0; +error: + icmp_statistics.IcmpInErrors++; + goto drop; } /* - * This table defined limits of ICMP sending rate for various ICMP messages. + * A spare long used to speed up statistics updating */ + +static unsigned long dummy; -static struct icmp_xrlim - xrl_unreach = { 4*HZ, 80, HZ/4 }, /* Host Unreachable */ - xrl_generic = { 3*HZ, 30, HZ/4 }; /* All other errors */ +/* + * Configurable rate limits. + * Send at most one packet per time interval. + * Someone should check if these default values are correct. + */ +int sysctl_icmp_sourcequench_time = 1*HZ; +int sysctl_icmp_destunreach_time = 1*HZ; +int sysctl_icmp_timeexceed_time = 1*HZ; +int sysctl_icmp_paramprob_time = 1*HZ; +int sysctl_icmp_echoreply_time = 0; /* don't limit it per default. */ /* * This table is the definition of how we handle ICMP. 
@@ -1114,38 +1036,38 @@ static struct icmp_xrlim static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1] = { /* ECHO REPLY (0) */ - { &icmp_statistics.IcmpOutEchoReps, &icmp_statistics.IcmpInEchoReps, icmp_discard, 0, NULL }, - { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, NULL }, - { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, NULL }, + { &icmp_statistics.IcmpOutEchoReps, &icmp_statistics.IcmpInEchoReps, icmp_discard, 0, &sysctl_icmp_echoreply_time}, + { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, }, + { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, }, /* DEST UNREACH (3) */ - { &icmp_statistics.IcmpOutDestUnreachs, &icmp_statistics.IcmpInDestUnreachs, icmp_unreach, 1, &xrl_unreach }, + { &icmp_statistics.IcmpOutDestUnreachs, &icmp_statistics.IcmpInDestUnreachs, icmp_unreach, 1, &sysctl_icmp_destunreach_time }, /* SOURCE QUENCH (4) */ - { &icmp_statistics.IcmpOutSrcQuenchs, &icmp_statistics.IcmpInSrcQuenchs, icmp_unreach, 1, NULL }, + { &icmp_statistics.IcmpOutSrcQuenchs, &icmp_statistics.IcmpInSrcQuenchs, icmp_unreach, 1, &sysctl_icmp_sourcequench_time }, /* REDIRECT (5) */ - { &icmp_statistics.IcmpOutRedirects, &icmp_statistics.IcmpInRedirects, icmp_redirect, 1, NULL }, - { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, NULL }, - { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, NULL }, + { &icmp_statistics.IcmpOutRedirects, &icmp_statistics.IcmpInRedirects, icmp_redirect, 1, }, + { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, }, + { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, }, /* ECHO (8) */ - { &icmp_statistics.IcmpOutEchos, &icmp_statistics.IcmpInEchos, icmp_echo, 0, NULL }, - { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, NULL }, - { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, NULL }, + { &icmp_statistics.IcmpOutEchos, &icmp_statistics.IcmpInEchos, icmp_echo, 0, }, + { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, }, + { &dummy, 
&icmp_statistics.IcmpInErrors, icmp_discard, 1, }, /* TIME EXCEEDED (11) */ - { &icmp_statistics.IcmpOutTimeExcds, &icmp_statistics.IcmpInTimeExcds, icmp_unreach, 1, &xrl_generic }, + { &icmp_statistics.IcmpOutTimeExcds, &icmp_statistics.IcmpInTimeExcds, icmp_unreach, 1, &sysctl_icmp_timeexceed_time }, /* PARAMETER PROBLEM (12) */ /* FIXME: RFC1122 3.2.2.5 - MUST pass PARAM_PROB messages to transport layer */ - { &icmp_statistics.IcmpOutParmProbs, &icmp_statistics.IcmpInParmProbs, icmp_discard, 1, &xrl_generic }, + { &icmp_statistics.IcmpOutParmProbs, &icmp_statistics.IcmpInParmProbs, icmp_discard, 1, &sysctl_icmp_paramprob_time }, /* TIMESTAMP (13) */ - { &icmp_statistics.IcmpOutTimestamps, &icmp_statistics.IcmpInTimestamps, icmp_timestamp, 0, NULL }, + { &icmp_statistics.IcmpOutTimestamps, &icmp_statistics.IcmpInTimestamps, icmp_timestamp, 0, }, /* TIMESTAMP REPLY (14) */ - { &icmp_statistics.IcmpOutTimestampReps, &icmp_statistics.IcmpInTimestampReps, icmp_discard, 0, NULL }, + { &icmp_statistics.IcmpOutTimestampReps, &icmp_statistics.IcmpInTimestampReps, icmp_discard, 0, }, /* INFO (15) */ - { &dummy, &dummy, icmp_discard, 0, NULL }, + { &dummy, &dummy, icmp_discard, 0, }, /* INFO REPLY (16) */ - { &dummy, &dummy, icmp_discard, 0, NULL }, + { &dummy, &dummy, icmp_discard, 0, }, /* ADDR MASK (17) */ - { &icmp_statistics.IcmpOutAddrMasks, &icmp_statistics.IcmpInAddrMasks, icmp_address, 0, NULL }, + { &icmp_statistics.IcmpOutAddrMasks, &icmp_statistics.IcmpInAddrMasks, icmp_address, 0, }, /* ADDR MASK REPLY (18) */ - { &icmp_statistics.IcmpOutAddrMaskReps, &icmp_statistics.IcmpInAddrMaskReps, icmp_address_reply, 0, NULL } + { &icmp_statistics.IcmpOutAddrMaskReps, &icmp_statistics.IcmpInAddrMaskReps, icmp_address_reply, 0, } }; __initfunc(void icmp_init(struct net_proto_family *ops)) @@ -1166,8 +1088,4 @@ __initfunc(void icmp_init(struct net_proto_family *ops)) icmp_socket->sk->allocation=GFP_ATOMIC; icmp_socket->sk->num = 256; /* Don't receive any data */ 
icmp_socket->sk->ip_ttl = MAXTTL; -#ifndef CONFIG_NO_ICMP_LIMIT - xrlim_init(); -#endif } - diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index fbc5403fc..1c59f5462 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -8,6 +8,8 @@ * the older version didn't come out right using gcc 2.5.8, the newer one * seems to fall out with gcc 2.6.2. * + * Version: $Id: igmp.c,v 1.22 1997/10/29 20:27:24 kuznet Exp $ + * * Authors: * Alan Cox <Alan.Cox@linux.org> * @@ -65,9 +67,11 @@ * fix from pending 2.1.x patches. * Alan Cox: Forget to enable FDDI support earlier. * Alexey Kuznetsov: Fixed leaving groups on device down. + * Alexey Kuznetsov: Accordance to igmp-v2-06 draft. */ +#include <linux/config.h> #include <asm/uaccess.h> #include <asm/system.h> #include <linux/types.h> @@ -79,141 +83,52 @@ #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> #include <linux/if_arp.h> +#include <linux/rtnetlink.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> -#include <linux/skbuff.h> #include <net/sock.h> -#include <linux/igmp.h> #include <net/checksum.h> +#ifdef CONFIG_IP_MROUTE +#include <linux/mroute.h> +#endif -int sysctl_igmp_max_host_report_delay = IGMP_MAX_HOST_REPORT_DELAY; -int sysctl_igmp_timer_scale = IGMP_TIMER_SCALE; -int sysctl_igmp_age_threshold = IGMP_AGE_THRESHOLD; - -/* - * If time expired, change the router type to IGMP_NEW_ROUTER. 
- */ - -static void ip_router_timer_expire(unsigned long data) -{ - struct ip_router_info *i=(struct ip_router_info *)data; - - del_timer(&i->timer); - i->type=IGMP_NEW_ROUTER; /* Revert to new multicast router */ - i->time=0; -} - -/* - * Multicast router info manager - */ -struct ip_router_info *ip_router_info_head=(struct ip_router_info *)0; +#ifdef CONFIG_IP_MULTICAST -/* - * Get the multicast router info on that device - */ +/* Parameter names and values are taken from igmp-v2-06 draft */ -static struct ip_router_info *igmp_get_mrouter_info(struct device *dev) -{ - register struct ip_router_info *i; +#define IGMP_V1_Router_Present_Timeout (400*HZ) +#define IGMP_Unsolicited_Report_Interval (10*HZ) +#define IGMP_Query_Response_Interval (10*HZ) +#define IGMP_Unsolicited_Report_Count 2 - for(i=ip_router_info_head;i!=NULL;i=i->next) - { - if (i->dev == dev) - { - return i; - } - } - /* - * Not found. Create a new entry. The default is IGMP V2 router - */ - - i=(struct ip_router_info *)kmalloc(sizeof(*i), GFP_ATOMIC); - if(i==NULL) - return NULL; - i->dev = dev; - i->type = IGMP_NEW_ROUTER; - i->time = sysctl_igmp_age_threshold; - i->next = ip_router_info_head; - ip_router_info_head = i; - - init_timer(&i->timer); - i->timer.data=(unsigned long)i; - i->timer.function=&ip_router_timer_expire; - - return i; -} +#define IGMP_Initial_Report_Delay (1*HZ) -/* - * Set the multicast router info on that device +/* IGMP_Initial_Report_Delay is not from IGMP specs! + * IGMP specs require to report membership immediately after + * joining a group, but we delay the first report by a + * small interval. It seems more natural and still does not + * contradict to specs provided this delay is small enough. 
*/ -static struct ip_router_info *igmp_set_mrouter_info(struct device *dev,int type,int time) -{ - register struct ip_router_info *i; - - for(i=ip_router_info_head;i!=NULL;i=i->next) - { - if (i->dev == dev) - { - if(i->type==IGMP_OLD_ROUTER) - { - del_timer(&i->timer); - } - - i->type = type; - i->time = time; - - if(i->type==IGMP_OLD_ROUTER) - { - i->timer.expires=jiffies+i->time*HZ; - add_timer(&i->timer); - } - return i; - } - } - - /* - * Not found. Create a new entry. - */ - i=(struct ip_router_info *)kmalloc(sizeof(*i), GFP_ATOMIC); - if(i==NULL) - return NULL; - i->dev = dev; - i->type = type; - i->time = time; - i->next = ip_router_info_head; - ip_router_info_head = i; - - init_timer(&i->timer); - i->timer.data=(unsigned long)i; - i->timer.function=&ip_router_timer_expire; - if(i->type==IGMP_OLD_ROUTER) - { - i->timer.expires=jiffies+i->time*HZ; - add_timer(&i->timer); - } - - return i; -} - +#define IGMP_V1_SEEN(in_dev) ((in_dev)->mr_v1_seen && jiffies - (in_dev)->mr_v1_seen < 0) /* * Timer management */ -static void igmp_stop_timer(struct ip_mc_list *im) +static __inline__ void igmp_stop_timer(struct ip_mc_list *im) { - if (im->tm_running) - { - del_timer(&im->timer); - im->tm_running=0; - } - else - printk(KERN_DEBUG "igmp_stop_timer() called with timer not running by %p\n",__builtin_return_address(0)); + if (im->tm_running) { + del_timer(&im->timer); + im->tm_running=0; + } } extern __inline__ unsigned int random(void) @@ -223,17 +138,13 @@ extern __inline__ unsigned int random(void) return seed^jiffies; } -/* - * Inlined as it's only called once. 
- */ - -static void igmp_start_timer(struct ip_mc_list *im,unsigned char max_resp_time) +static __inline__ void igmp_start_timer(struct ip_mc_list *im, int max_delay) { int tv; - if(im->tm_running) + if (im->tm_running) return; - tv=random()%(max_resp_time*HZ/sysctl_igmp_timer_scale); /* Pick a number any number 8) */ - im->timer.expires=jiffies+tv; + tv=random() % max_delay; + im->timer.expires=jiffies+tv+2; im->tm_running=1; add_timer(&im->timer); } @@ -244,20 +155,32 @@ static void igmp_start_timer(struct ip_mc_list *im,unsigned char max_resp_time) #define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4) -static void igmp_send_report(struct device *dev, u32 group, int type) +static int igmp_send_report(struct device *dev, u32 group, int type) { struct sk_buff *skb; struct iphdr *iph; struct igmphdr *ih; struct rtable *rt; + u32 dst; - if (ip_route_output(&rt, group, 0, 0, dev)) - return; + /* According to IGMPv2 specs, LEAVE messages are + * sent to all-routers group. + */ + dst = group; + if (type == IGMP_HOST_LEAVE_MESSAGE) + dst = IGMP_ALL_ROUTER; + + if (ip_route_output(&rt, dst, 0, 0, dev->ifindex)) + return -1; + if (rt->rt_src == 0) { + ip_rt_put(rt); + return -1; + } skb=alloc_skb(IGMP_SIZE+dev->hard_header_len+15, GFP_ATOMIC); if (skb == NULL) { ip_rt_put(rt); - return; + return -1; } skb->dst = &rt->u.dst; @@ -272,7 +195,7 @@ static void igmp_send_report(struct device *dev, u32 group, int type) iph->tos = 0; iph->frag_off = 0; iph->ttl = 1; - iph->daddr = group; + iph->daddr = dst; iph->saddr = rt->rt_src; iph->protocol = IPPROTO_IGMP; iph->tot_len = htons(IGMP_SIZE); @@ -290,115 +213,140 @@ static void igmp_send_report(struct device *dev, u32 group, int type) ih->group=group; ih->csum=ip_compute_csum((void *)ih, sizeof(struct igmphdr)); - skb->dst->output(skb); + return skb->dst->output(skb); } static void igmp_timer_expire(unsigned long data) { struct ip_mc_list *im=(struct ip_mc_list *)data; - struct ip_router_info *r; + struct in_device 
*in_dev = im->interface; + int err; im->tm_running=0; - r=igmp_get_mrouter_info(im->interface); - if(r==NULL) - return; - if(r->type==IGMP_NEW_ROUTER) - igmp_send_report(im->interface, im->multiaddr, IGMP_HOST_NEW_MEMBERSHIP_REPORT); + + if (IGMP_V1_SEEN(in_dev)) + err = igmp_send_report(in_dev->dev, im->multiaddr, IGMP_HOST_MEMBERSHIP_REPORT); else - igmp_send_report(im->interface, im->multiaddr, IGMP_HOST_MEMBERSHIP_REPORT); - im->reporter = 1; -} + err = igmp_send_report(in_dev->dev, im->multiaddr, IGMP_HOST_NEW_MEMBERSHIP_REPORT); -static void igmp_init_timer(struct ip_mc_list *im) -{ - im->tm_running=0; - init_timer(&im->timer); - im->timer.data=(unsigned long)im; - im->timer.function=&igmp_timer_expire; -} + /* Failed. Retry later. */ + if (err) { + igmp_start_timer(im, IGMP_Unsolicited_Report_Interval); + return; + } + if (im->unsolicit_count) { + im->unsolicit_count--; + igmp_start_timer(im, IGMP_Unsolicited_Report_Interval); + } + im->reporter = 1; +} -static void igmp_heard_report(struct device *dev, u32 group, u32 source) +static void igmp_heard_report(struct in_device *in_dev, u32 group) { struct ip_mc_list *im; /* Timers are only set for non-local groups */ + if (LOCAL_MCAST(group)) return; - for (im=dev->ip_mc_list; im!=NULL; im=im->next) { + for (im=in_dev->mc_list; im!=NULL; im=im->next) { if (im->multiaddr == group) { - if (im->tm_running) - igmp_stop_timer(im); - if (source != dev->pa_addr) - im->reporter = 0; + igmp_stop_timer(im); + im->reporter = 0; + im->unsolicit_count = 0; return; } } } -static void igmp_heard_query(struct device *dev, unsigned char max_resp_time, +static void igmp_heard_query(struct in_device *in_dev, unsigned char max_resp_time, u32 group) { - struct ip_mc_list *im; - int mrouter_type; + struct ip_mc_list *im; + int max_delay; - /* - * The max_resp_time is in units of 1/10 second. 
- */ - if(max_resp_time>0) { - mrouter_type=IGMP_NEW_ROUTER; + max_delay = max_resp_time*(HZ/IGMP_TIMER_SCALE); - if (igmp_set_mrouter_info(dev,mrouter_type,0)==NULL) - return; - /* - * - Start the timers in all of our membership records - * that the query applies to for the interface on - * which the query arrived excl. those that belong - * to a "local" group (224.0.0.X) - * - For timers already running check if they need to - * be reset. - * - Use the igmp->igmp_code field as the maximum - * delay possible - */ - for(im=dev->ip_mc_list;im!=NULL;im=im->next) { - if (group && group != im->multiaddr) - continue; - if(im->tm_running) { - if(im->timer.expires>jiffies+max_resp_time*HZ/sysctl_igmp_timer_scale) { - igmp_stop_timer(im); - igmp_start_timer(im,max_resp_time); - } - } else if (!LOCAL_MCAST(im->multiaddr)) - igmp_start_timer(im,max_resp_time); - } - } else { - mrouter_type=IGMP_OLD_ROUTER; - max_resp_time=sysctl_igmp_max_host_report_delay*sysctl_igmp_timer_scale; + if (max_resp_time == 0) { + /* Alas, old v1 router presents here. */ - if(igmp_set_mrouter_info(dev,mrouter_type,sysctl_igmp_age_threshold)==NULL) - return; + max_delay = IGMP_Query_Response_Interval; + in_dev->mr_v1_seen = jiffies + IGMP_V1_Router_Present_Timeout; + group = 0; + } + + /* + * - Start the timers in all of our membership records + * that the query applies to for the interface on + * which the query arrived excl. those that belong + * to a "local" group (224.0.0.X) + * - For timers already running check if they need to + * be reset. 
+ * - Use the igmp->igmp_code field as the maximum + * delay possible + */ + for (im=in_dev->mc_list; im!=NULL; im=im->next) { + if (group && group != im->multiaddr) + continue; + if (LOCAL_MCAST(im->multiaddr)) + continue; + im->unsolicit_count = 0; + if (im->tm_running && im->timer.expires-jiffies > max_delay) + igmp_stop_timer(im); + igmp_start_timer(im, max_delay); + } +} - /* - * Start the timers in all of our membership records for - * the interface on which the query arrived, except those - * that are already running and those that belong to a - * "local" group (224.0.0.X). - */ +int igmp_rcv(struct sk_buff *skb, unsigned short len) +{ + /* This basically follows the spec line by line -- see RFC1112 */ + struct igmphdr *ih = skb->h.igmph; + struct in_device *in_dev = skb->dev->ip_ptr; - for(im=dev->ip_mc_list;im!=NULL;im=im->next) { - if(!im->tm_running && !LOCAL_MCAST(im->multiaddr)) - igmp_start_timer(im,max_resp_time); - } + if (len < sizeof(struct igmphdr) || ip_compute_csum((void *)ih, len) + || in_dev==NULL) { + kfree_skb(skb, FREE_READ); + return 0; + } + + switch (ih->type) { + case IGMP_HOST_MEMBERSHIP_QUERY: + igmp_heard_query(in_dev, ih->code, ih->group); + break; + case IGMP_HOST_MEMBERSHIP_REPORT: + case IGMP_HOST_NEW_MEMBERSHIP_REPORT: + /* Is it our report looped back? */ + if (((struct rtable*)skb->dst)->key.iif == 0) + break; + igmp_heard_report(in_dev, ih->group); + break; + case IGMP_PIM: +#ifdef CONFIG_IP_PIMSM_V1 + return pim_rcv_v1(skb, len); +#endif + case IGMP_DVMRP: + case IGMP_TRACE: + case IGMP_HOST_LEAVE_MESSAGE: + case IGMP_MTRACE: + case IGMP_MTRACE_RESP: + break; + default: + NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type)); } + kfree_skb(skb, FREE_READ); + return 0; } +#endif + /* * Map a multicast IP onto multicast MAC for type ethernet. 
*/ -extern __inline__ void ip_mc_map(unsigned long addr, char *buf) +extern __inline__ void ip_mc_map(u32 addr, char *buf) { addr=ntohl(addr); buf[0]=0x01; @@ -415,15 +363,16 @@ extern __inline__ void ip_mc_map(unsigned long addr, char *buf) * Add a filter to a device */ -void ip_mc_filter_add(struct device *dev, unsigned long addr) +static void ip_mc_filter_add(struct in_device *in_dev, u32 addr) { char buf[6]; - ip_rt_multicast_event(dev); - if(!(dev->flags & IFF_MULTICAST)) + struct device *dev = in_dev->dev; + + if (!(dev->flags & IFF_MULTICAST)) return; - if(dev->type!=ARPHRD_ETHER && dev->type!=ARPHRD_FDDI) + if (dev->type!=ARPHRD_ETHER && dev->type!=ARPHRD_FDDI) return; /* Only do ethernet or FDDI for now */ - ip_mc_map(addr,buf); + ip_mc_map(addr, buf); dev_mc_add(dev,buf,ETH_ALEN,0); } @@ -431,70 +380,49 @@ void ip_mc_filter_add(struct device *dev, unsigned long addr) * Remove a filter from a device */ -void ip_mc_filter_del(struct device *dev, unsigned long addr) +static void ip_mc_filter_del(struct in_device *in_dev, u32 addr) { char buf[6]; - ip_rt_multicast_event(dev); - if(dev->type!=ARPHRD_ETHER && dev->type!=ARPHRD_FDDI) + struct device *dev = in_dev->dev; + + if (dev->type!=ARPHRD_ETHER && dev->type!=ARPHRD_FDDI) return; /* Only do ethernet or FDDI for now */ ip_mc_map(addr,buf); dev_mc_delete(dev,buf,ETH_ALEN,0); } -extern __inline__ void igmp_group_dropped(struct ip_mc_list *im) +static void igmp_group_dropped(struct ip_mc_list *im) { - del_timer(&im->timer); - if (im->reporter) - igmp_send_report(im->interface, im->multiaddr, IGMP_HOST_LEAVE_MESSAGE); ip_mc_filter_del(im->interface, im->multiaddr); -} -extern __inline__ void igmp_group_added(struct ip_mc_list *im) -{ - struct ip_router_info *r; - igmp_init_timer(im); - ip_mc_filter_add(im->interface, im->multiaddr); - r=igmp_get_mrouter_info(im->interface); - if(r==NULL) +#ifdef CONFIG_IP_MULTICAST + if (LOCAL_MCAST(im->multiaddr)) return; - if(r->type==IGMP_NEW_ROUTER) - 
igmp_send_report(im->interface, im->multiaddr, IGMP_HOST_NEW_MEMBERSHIP_REPORT); - else - igmp_send_report(im->interface, im->multiaddr, IGMP_HOST_MEMBERSHIP_REPORT); + + start_bh_atomic(); + igmp_stop_timer(im); + end_bh_atomic(); + + if (im->reporter && !IGMP_V1_SEEN(im->interface)) + igmp_send_report(im->interface->dev, im->multiaddr, IGMP_HOST_LEAVE_MESSAGE); +#endif } -int igmp_rcv(struct sk_buff *skb, unsigned short len) +static void igmp_group_added(struct ip_mc_list *im) { - /* This basically follows the spec line by line -- see RFC1112 */ - struct igmphdr *ih = skb->h.igmph; + ip_mc_filter_add(im->interface, im->multiaddr); - if (len < sizeof(struct igmphdr) || ip_compute_csum((void *)ih, len)) { - kfree_skb(skb, FREE_READ); - return 0; - } - - switch (ih->type) { - case IGMP_HOST_MEMBERSHIP_QUERY: - igmp_heard_query(skb->dev, ih->code, ih->group); - break; - case IGMP_HOST_MEMBERSHIP_REPORT: - case IGMP_HOST_NEW_MEMBERSHIP_REPORT: - igmp_heard_report(skb->dev, ih->group, skb->nh.iph->saddr); - break; - case IGMP_DVMRP: - case IGMP_PIM: - case IGMP_TRACE: - case IGMP_HOST_LEAVE_MESSAGE: - case IGMP_MTRACE: - case IGMP_MTRACE_RESP: - break; - default: - NETDEBUG(printk(KERN_DEBUG "Unknown IGMP type=%d\n", ih->type)); - } - kfree_skb(skb, FREE_READ); - return 0; +#ifdef CONFIG_IP_MULTICAST + if (LOCAL_MCAST(im->multiaddr)) + return; + + start_bh_atomic(); + igmp_start_timer(im, IGMP_Initial_Report_Delay); + end_bh_atomic(); +#endif } + /* * Multicast list managers */ @@ -504,143 +432,210 @@ int igmp_rcv(struct sk_buff *skb, unsigned short len) * A socket has joined a multicast group on device dev. 
*/ -static void ip_mc_inc_group(struct device *dev, unsigned long addr) +void ip_mc_inc_group(struct in_device *in_dev, u32 addr) { - struct ip_mc_list *i; - for(i=dev->ip_mc_list;i!=NULL;i=i->next) - { - if(i->multiaddr==addr) - { + struct ip_mc_list *i, *im; + + im = (struct ip_mc_list *)kmalloc(sizeof(*im), GFP_KERNEL); + + for (i=in_dev->mc_list; i; i=i->next) { + if (i->multiaddr == addr) { i->users++; + if (im) + kfree(im); return; } } - i=(struct ip_mc_list *)kmalloc(sizeof(*i), GFP_KERNEL); - if(!i) + if (!im) return; - i->users=1; - i->interface=dev; - i->multiaddr=addr; - i->next=dev->ip_mc_list; - igmp_group_added(i); - dev->ip_mc_list=i; + im->users=1; + im->interface=in_dev; + im->multiaddr=addr; +#ifdef CONFIG_IP_MULTICAST + im->tm_running=0; + init_timer(&im->timer); + im->timer.data=(unsigned long)im; + im->timer.function=&igmp_timer_expire; + im->unsolicit_count = IGMP_Unsolicited_Report_Count; + im->reporter = 0; +#endif + im->next=in_dev->mc_list; + in_dev->mc_list=im; + if (in_dev->dev->flags & IFF_UP) { + igmp_group_added(im); + ip_rt_multicast_event(in_dev); + } + return; } /* * A socket has left a multicast group on device dev */ -static void ip_mc_dec_group(struct device *dev, unsigned long addr) +int ip_mc_dec_group(struct in_device *in_dev, u32 addr) { - struct ip_mc_list **i; - for(i=&(dev->ip_mc_list);(*i)!=NULL;i=&(*i)->next) - { - if((*i)->multiaddr==addr) - { - if(--((*i)->users) == 0) - { - struct ip_mc_list *tmp= *i; - igmp_group_dropped(tmp); - *i=(*i)->next; - kfree_s(tmp,sizeof(*tmp)); + struct ip_mc_list *i, **ip; + + for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) { + if (i->multiaddr==addr) { + if (--i->users == 0) { + *ip = i->next; + if (in_dev->dev->flags & IFF_UP) { + igmp_group_dropped(i); + ip_rt_multicast_event(in_dev); + } + kfree_s(i, sizeof(*i)); } - return; + return 0; } } + return -ESRCH; } -/* - * Device going down: Clean up. 
- */ +/* Device going down */ -void ip_mc_drop_device(struct device *dev) +void ip_mc_down(struct in_device *in_dev) { struct ip_mc_list *i; - struct ip_mc_list *j; - start_bh_atomic(); - for(i=dev->ip_mc_list;i!=NULL;i=j) - { - j=i->next; - if(i->tm_running) - del_timer(&i->timer); - kfree_s(i,sizeof(*i)); - } - dev->ip_mc_list=NULL; - end_bh_atomic(); + + for (i=in_dev->mc_list; i; i=i->next) + igmp_group_dropped(i); +} + +/* Device going up */ + +void ip_mc_up(struct in_device *in_dev) +{ + struct ip_mc_list *i; + + for (i=in_dev->mc_list; i; i=i->next) + igmp_group_added(i); } /* - * Device going up. Make sure it is in all hosts + * Device is about to be destroyed: clean up. */ -void ip_mc_allhost(struct device *dev) +void ip_mc_destroy_dev(struct in_device *in_dev) { struct ip_mc_list *i; - for(i=dev->ip_mc_list;i!=NULL;i=i->next) - if(i->multiaddr==IGMP_ALL_HOSTS) - return; - i=(struct ip_mc_list *)kmalloc(sizeof(*i), GFP_KERNEL); - if(!i) - return; - i->users=1; - i->interface=dev; - i->multiaddr=IGMP_ALL_HOSTS; - i->tm_running=0; - i->next=dev->ip_mc_list; - dev->ip_mc_list=i; - ip_mc_filter_add(i->interface, i->multiaddr); + + while ((i = in_dev->mc_list) != NULL) { + in_dev->mc_list = i->next; + kfree_s(i, sizeof(*i)); + } +} + +/* Initialize multicasting on an IP interface */ + +void ip_mc_init_dev(struct in_device *in_dev) +{ + in_dev->mc_list = NULL; + in_dev->mr_v1_seen = 0; + ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); +} + +static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr) +{ + struct rtable *rt; + struct device *dev = NULL; + + if (imr->imr_address.s_addr) { + dev = ip_dev_find(imr->imr_address.s_addr); + if (!dev) + return NULL; + } + + if (!dev && !ip_route_output(&rt, imr->imr_multiaddr.s_addr, 0, 0, 0)) { + dev = rt->u.dst.dev; + ip_rt_put(rt); + } + if (dev) { + imr->imr_ifindex = dev->ifindex; + return dev->ip_ptr; + } + return NULL; } /* * Join a socket to a group */ -int ip_mc_join_group(struct sock *sk , struct device *dev, 
unsigned long addr) +int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) { - int unused= -1; - int i; - if(!MULTICAST(addr)) + int err; + u32 addr = imr->imr_multiaddr.s_addr; + struct ip_mc_socklist *iml, *i; + struct in_device *in_dev; + int count = 0; + + if (!MULTICAST(addr)) return -EINVAL; - if(sk->ip_mc_list==NULL) - { - if((sk->ip_mc_list=(struct ip_mc_socklist *)kmalloc(sizeof(*sk->ip_mc_list), GFP_KERNEL))==NULL) - return -ENOMEM; - memset(sk->ip_mc_list,'\0',sizeof(*sk->ip_mc_list)); - } - for(i=0;i<IP_MAX_MEMBERSHIPS;i++) - { - if(sk->ip_mc_list->multiaddr[i]==addr && sk->ip_mc_list->multidev[i]==dev) - return -EADDRINUSE; - if(sk->ip_mc_list->multidev[i]==NULL) - unused=i; + + rtnl_shlock(); + + if (!imr->imr_ifindex) + in_dev = ip_mc_find_dev(imr); + else + in_dev = inetdev_by_index(imr->imr_ifindex); + + if (!in_dev) { + iml = NULL; + err = -ENODEV; + goto done; } - if(unused==-1) - return -ENOBUFS; - sk->ip_mc_list->multiaddr[unused]=addr; - sk->ip_mc_list->multidev[unused]=dev; - ip_mc_inc_group(dev,addr); - return 0; + iml = (struct ip_mc_socklist *)kmalloc(sizeof(*iml), GFP_KERNEL); + + err = -EADDRINUSE; + for (i=sk->ip_mc_list; i; i=i->next) { + if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) { + /* New style additions are reference counted */ + if (imr->imr_address.s_addr == 0) { + i->count++; + err = 0; + } + goto done; + } + count++; + } + err = -ENOBUFS; + if (iml == NULL || count >= IP_MAX_MEMBERSHIPS) + goto done; + memcpy(&iml->multi, imr, sizeof(*imr)); + iml->next = sk->ip_mc_list; + iml->count = 1; + sk->ip_mc_list = iml; + ip_mc_inc_group(in_dev, addr); + iml = NULL; + err = 0; +done: + rtnl_shunlock(); + if (iml) + kfree(iml); + return err; } /* * Ask a socket to leave a group. 
*/ -int ip_mc_leave_group(struct sock *sk, struct device *dev, unsigned long addr) +int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) { - int i; - if(!MULTICAST(addr)) - return -EINVAL; - if(sk->ip_mc_list==NULL) - return -EADDRNOTAVAIL; - - for(i=0;i<IP_MAX_MEMBERSHIPS;i++) - { - if(sk->ip_mc_list->multiaddr[i]==addr && sk->ip_mc_list->multidev[i]==dev) - { - sk->ip_mc_list->multidev[i]=NULL; - ip_mc_dec_group(dev,addr); + struct ip_mc_socklist *iml, **imlp; + + for (imlp=&sk->ip_mc_list; (iml=*imlp)!=NULL; imlp=&iml->next) { + if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr && + iml->multi.imr_address.s_addr==imr->imr_address.s_addr && + (!imr->imr_ifindex || iml->multi.imr_ifindex==imr->imr_ifindex)) { + struct in_device *in_dev; + if (--iml->count) + return 0; + *imlp = iml->next; + in_dev = inetdev_by_index(iml->multi.imr_ifindex); + if (in_dev) + ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr); + kfree_s(iml, sizeof(*iml)); return 0; } } @@ -653,69 +648,63 @@ int ip_mc_leave_group(struct sock *sk, struct device *dev, unsigned long addr) void ip_mc_drop_socket(struct sock *sk) { - int i; - - if(sk->ip_mc_list==NULL) - return; - - for(i=0;i<IP_MAX_MEMBERSHIPS;i++) - { - if(sk->ip_mc_list->multidev[i]) - { - ip_mc_dec_group(sk->ip_mc_list->multidev[i], sk->ip_mc_list->multiaddr[i]); - sk->ip_mc_list->multidev[i]=NULL; - } + struct ip_mc_socklist *iml; + + while ((iml=sk->ip_mc_list) != NULL) { + struct in_device *in_dev; + sk->ip_mc_list = iml->next; + if ((in_dev = inetdev_by_index(iml->multi.imr_ifindex)) != NULL) + ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); + kfree_s(iml, sizeof(*iml)); } - kfree_s(sk->ip_mc_list,sizeof(*sk->ip_mc_list)); - sk->ip_mc_list=NULL; } -/* - * Write an multicast group list table for the IGMP daemon to - * read. 
- */ +#ifdef CONFIG_IP_MULTICAST int ip_mc_procinfo(char *buffer, char **start, off_t offset, int length, int dummy) { off_t pos=0, begin=0; struct ip_mc_list *im; - unsigned long flags; int len=0; struct device *dev; - len=sprintf(buffer,"Device : Count\tGroup Users Timer\tReporter\n"); - save_flags(flags); - cli(); + len=sprintf(buffer,"Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n"); for(dev = dev_base; dev; dev = dev->next) { - if(dev->flags&IFF_UP) - { - len+=sprintf(buffer+len,"%-10s: %5d\n", - dev->name, dev->mc_count); - for(im = dev->ip_mc_list; im; im = im->next) - { - len+=sprintf(buffer+len, - "\t\t\t%08lX %5d %d:%08lX\t%d\n", - im->multiaddr, im->users, - im->tm_running, im->timer.expires-jiffies, im->reporter); - pos=begin+len; - if(pos<offset) - { - len=0; - begin=pos; - } - if(pos>offset+length) - break; - } - } + struct in_device *in_dev = dev->ip_ptr; + char *querier = "NONE"; + + if (in_dev == NULL) + continue; + + querier = IGMP_V1_SEEN(in_dev) ? "V1" : "V2"; + + len+=sprintf(buffer+len,"%d\t%-10s: %5d %7s\n", + dev->ifindex, dev->name, dev->mc_count, querier); + + for (im = in_dev->mc_list; im; im = im->next) { + len+=sprintf(buffer+len, + "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n", + im->multiaddr, im->users, + im->tm_running, im->timer.expires-jiffies, im->reporter); + + pos=begin+len; + if(pos<offset) + { + len=0; + begin=pos; + } + if(pos>offset+length) + break; + } } - restore_flags(flags); *start=buffer+(offset-begin); len-=(offset-begin); if(len>length) - len=length; + len=length; return len; } +#endif diff --git a/net/ipv4/ip_alias.c b/net/ipv4/ip_alias.c index a78eef17a..e69de29bb 100644 --- a/net/ipv4/ip_alias.c +++ b/net/ipv4/ip_alias.c @@ -1,170 +0,0 @@ -/* - * IP_ALIAS (AF_INET) aliasing module. - * - * - * Version: @(#)ip_alias.c 0.43 12/20/95 - * - * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> - * - * Fixes: - * JJC : ip_alias_dev_select method. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include <linux/module.h> - -#include <linux/types.h> -#include <linux/errno.h> -#include <linux/netdevice.h> -#include <linux/if.h> -#include <linux/inet.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/route.h> -#include <linux/init.h> -#include <net/route.h> - -#ifdef ALIAS_USER_LAND_DEBUG -#include "net_alias.h" -#include "ip_alias.h" -#include "user_stubs.h" -#endif - -#include <linux/net_alias.h> -#include <net/ip_alias.h> - -/* - * AF_INET alias init - */ - -static int ip_alias_init_1(struct net_alias_type *this, struct net_alias *alias, struct sockaddr *sa) -{ -#ifdef ALIAS_USER_LAND_DEBUG - printk("alias_init(%s) called.\n", alias->name); -#endif - MOD_INC_USE_COUNT; - return 0; -} - -/* - * AF_INET alias done - */ - -static int ip_alias_done_1(struct net_alias_type *this, struct net_alias *alias) -{ -#ifdef ALIAS_USER_LAND_DEBUG - printk("alias_done(%s) called.\n", alias->name); -#endif - MOD_DEC_USE_COUNT; - return 0; -} - -/* - * Print alias address info - */ - -int ip_alias_print_1(struct net_alias_type *this, struct net_alias *alias, char *buf, int len) -{ - char *p; - - p = (char *) &alias->dev.pa_addr; - return sprintf(buf, "%d.%d.%d.%d", - (p[0] & 255), (p[1] & 255), (p[2] & 255), (p[3] & 255)); -} - -struct device *ip_alias_dev_select(struct net_alias_type *this, struct device *main_dev, struct sockaddr *sa) -{ - __u32 addr; -#if 0 - struct rtable *rt; -#endif - struct device *dev=NULL; - - /* - * Defensive... - */ - - if (main_dev == NULL) - return NULL; - - /* - * Get u32 address. - */ - - addr = (sa)? (*(struct sockaddr_in *)sa).sin_addr.s_addr : 0; - if (addr == 0) - return NULL; - - /* - * Find 'closest' device to address given. any other suggestions? ... 
- * net_alias module will check if returned device is main_dev's alias - */ - -#if 0 - rt = ip_rt_route(addr, 0); - if(rt) - { - dev=rt->rt_dev; - ip_rt_put(rt); - } -#endif - return dev; -} - -/* - * net_alias AF_INET type defn. - */ - -struct net_alias_type ip_alias_type = -{ - AF_INET, /* type */ - 0, /* n_attach */ - "ip", /* name */ - NULL, /* get_addr32() */ - NULL, /* dev_addr_chk() */ - ip_alias_dev_select, /* dev_select() */ - ip_alias_init_1, /* alias_init_1() */ - ip_alias_done_1, /* alias_done_1() */ - ip_alias_print_1, /* alias_print_1() */ - NULL /* next */ -}; - -/* - * ip_alias module initialization - */ - -__initfunc(int ip_alias_init(void)) -{ - return register_net_alias_type(&ip_alias_type, AF_INET); -} - -/* - * ip_alias module done - */ - -int ip_alias_done(void) -{ - return unregister_net_alias_type(&ip_alias_type); -} - -#ifdef MODULE - -int init_module(void) -{ - if (ip_alias_init() != 0) - return -EIO; - return 0; -} - -void cleanup_module(void) -{ - if (ip_alias_done() != 0) - printk(KERN_INFO "ip_alias: can't remove module"); -} - -#endif /* MODULE */ diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 0726f3bb4..8f48894a4 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -5,6 +5,8 @@ * * The IP forwarding functionality. * + * Version: $Id: ip_forward.c,v 1.32 1997/10/24 17:16:06 kuznet Exp $ + * * Authors: see ip.c * * Fixes: @@ -76,10 +78,13 @@ int ip_forward(struct sk_buff *skb) int fw_res = 0; #endif - if (skb->pkt_type != PACKET_HOST) { - kfree_skb(skb,FREE_WRITE); - return 0; + if (IPCB(skb)->opt.router_alert) { + if (ip_call_ra_chain(skb)) + return 0; } + + if (skb->pkt_type != PACKET_HOST) + goto drop; /* * According to the RFC, we must first decrease the TTL field. 
If @@ -90,27 +95,25 @@ int ip_forward(struct sk_buff *skb) iph = skb->nh.iph; rt = (struct rtable*)skb->dst; +#ifdef CONFIG_CPU_IS_SLOW + if (net_cpu_congestion > 1 && !(iph->tos&IPTOS_RELIABILITY) && + IPTOS_PREC(iph->tos) < IPTOS_PREC_INTERNETCONTROL) { + if (((xtime.tv_usec&0xF)<<net_cpu_congestion) > 0x1C) + goto drop; + } +#endif + + #ifdef CONFIG_TRANSPARENT_PROXY if (ip_chk_sock(skb)) - return ip_local_deliver(skb); + goto local_pkt; #endif - if (ip_decrease_ttl(iph) <= 0) { - /* Tell the sender its packet died... */ - icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); - kfree_skb(skb, FREE_WRITE); - return -1; - } - - if (opt->is_strictroute && (rt->rt_flags&RTF_GATEWAY)) { - /* - * Strict routing permits no gatewaying - */ - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0); - kfree_skb(skb, FREE_WRITE); - return -1; - } + if (ip_decrease_ttl(iph) <= 0) + goto too_many_hops; + if (opt->is_strictroute && (rt->rt_flags&RTF_GATEWAY)) + goto sr_failed; /* * Having picked a route we can now send the frame out @@ -139,19 +142,23 @@ int ip_forward(struct sk_buff *skb) */ if (dev2->flags & IFF_UP) { - if (skb->len > mtu && (ntohs(iph->frag_off) & IP_DF)) { - ip_statistics.IpFragFails++; - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); - kfree_skb(skb, FREE_WRITE); - return -1; - } + if (skb->len > mtu && (ntohs(iph->frag_off) & IP_DF)) + goto frag_needed; - if (rt->rt_flags&RTCF_NAT) { +#ifdef CONFIG_IP_ROUTE_NAT + if (rt->rt_flags & RTCF_NAT) { + if (skb_headroom(skb) < dev2->hard_header_len || skb_cloned(skb)) { + struct sk_buff *skb2; + skb2 = skb_realloc_headroom(skb, (dev2->hard_header_len + 15)&~15); + kfree_skb(skb, FREE_WRITE); + skb = skb2; + } if (ip_do_nat(skb)) { kfree_skb(skb, FREE_WRITE); return -1; } } +#endif #ifdef CONFIG_IP_MASQUERADE if(!(IPCB(skb)->flags&IPSKB_MASQUERADED)) { @@ -168,7 +175,7 @@ int ip_forward(struct sk_buff *skb) * and skip the firewall checks */ if (iph->protocol == IPPROTO_ICMP) { - if ((fw_res = 
ip_fw_masq_icmp(&skb, dev2)) < 0) { + if ((fw_res = ip_fw_masq_icmp(&skb)) < 0) { kfree_skb(skb, FREE_READ); return -1; } @@ -179,7 +186,8 @@ int ip_forward(struct sk_buff *skb) } if (rt->rt_flags&RTCF_MASQ) goto skip_call_fw_firewall; -#endif +#endif /* CONFIG_IP_MASQUERADE */ + #ifdef CONFIG_FIREWALL fw_res=call_fw_firewall(PF_INET, dev2, iph, NULL, &skb); switch (fw_res) { @@ -205,7 +213,16 @@ skip_call_fw_firewall: */ if (!(IPCB(skb)->flags&IPSKB_MASQUERADED) && (fw_res==FW_MASQUERADE || rt->rt_flags&RTCF_MASQ)) { - if (ip_fw_masquerade(&skb, dev2) < 0) { + u32 maddr; + +#ifdef CONFIG_IP_ROUTE_NAT + maddr = (rt->rt_flags&RTCF_MASQ) ? rt->rt_src_map : 0; + + if (maddr == 0) +#endif + maddr = inet_select_addr(dev2, rt->rt_gateway, RT_SCOPE_UNIVERSE); + + if (ip_fw_masquerade(&skb, maddr) < 0) { kfree_skb(skb, FREE_READ); return -1; } @@ -238,10 +255,36 @@ skip_call_fw_firewall: ip_statistics.IpForwDatagrams++; - if (opt->optlen) - ip_forward_options(skb); - + if (opt->optlen == 0) { + ip_send(skb); + return 0; + } + ip_forward_options(skb); ip_send(skb); } return 0; + +#ifdef CONFIG_TRANSPARENT_PROXY +local_pkt: +#endif + return ip_local_deliver(skb); + +frag_needed: + ip_statistics.IpFragFails++; + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + goto drop; + +sr_failed: + /* + * Strict routing permits no gatewaying + */ + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0); + goto drop; + +too_many_hops: + /* Tell the sender its packet died... */ + icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); +drop: + kfree_skb(skb,FREE_WRITE); + return -1; } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 5edcb4a9c..637fe022e 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -5,7 +5,7 @@ * * The IP fragmentation functionality. * - * Version: $Id: ip_fragment.c,v 1.26 1997/09/04 22:35:00 davem Exp $ + * Version: $Id: ip_fragment.c,v 1.29 1997/11/22 12:31:05 freitag Exp $ * * Authors: Fred N. 
van Kempen <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox <Alan.Cox@linux.org> @@ -130,7 +130,7 @@ static struct ipfrag *ip_frag_create(int offset, int end, /* Find the correct entry in the "incomplete datagrams" queue for * this IP datagram, and return the queue entry address if found. */ -static inline struct ipq *ip_find(struct iphdr *iph) +static inline struct ipq *ip_find(struct iphdr *iph, struct dst_entry *dst) { __u16 id = iph->id; __u32 saddr = iph->saddr; @@ -314,7 +314,8 @@ static struct sk_buff *ip_glue(struct ipq *qp) len = qp->ihlen + qp->len; if(len>65535) { - printk(KERN_INFO "Oversized IP packet from %d.%d.%d.%d.\n", NIPQUAD(qp->iph->saddr)); + if (net_ratelimit()) + printk(KERN_INFO "Oversized IP packet from %d.%d.%d.%d.\n", NIPQUAD(qp->iph->saddr)); ip_statistics.IpReasmFails++; ip_free(qp); return NULL; @@ -322,7 +323,7 @@ static struct sk_buff *ip_glue(struct ipq *qp) if ((skb = dev_alloc_skb(len)) == NULL) { ip_statistics.IpReasmFails++; - NETDEBUG(printk(KERN_ERR "IP: queue_glue: no memory for gluing queue %p\n", qp)); + NETDEBUG(printk(KERN_ERR "IP: queue_glue: no memory for gluing queue %p\n", qp)); ip_free(qp); return NULL; } @@ -390,7 +391,7 @@ struct sk_buff *ip_defrag(struct sk_buff *skb) ip_evictor(); /* Find the entry of this IP datagram in the "incomplete datagrams" queue. */ - qp = ip_find(iph); + qp = ip_find(iph, skb->dst); /* Is this a non-fragmented datagram? */ offset = ntohs(iph->frag_off); @@ -435,7 +436,8 @@ struct sk_buff *ip_defrag(struct sk_buff *skb) /* Attempt to construct an oversize packet. 
*/ if(ntohs(iph->tot_len)+(int)offset>65535) { - printk(KERN_INFO "Oversized packet received from %d.%d.%d.%d\n", NIPQUAD(iph->saddr)); + if (net_ratelimit()) + printk(KERN_INFO "Oversized packet received from %d.%d.%d.%d\n", NIPQUAD(iph->saddr)); frag_kfree_skb(skb, FREE_READ); ip_statistics.IpReasmFails++; return NULL; diff --git a/net/ipv4/ip_fw.c b/net/ipv4/ip_fw.c index fa5917957..9f8123afd 100644 --- a/net/ipv4/ip_fw.c +++ b/net/ipv4/ip_fw.c @@ -6,6 +6,8 @@ * license in recognition of the original copyright. * -- Alan Cox. * + * $Id: ip_fw.c,v 1.29 1997/10/10 22:41:01 davem Exp $ + * * Ported from BSD to Linux, * Alan Cox 22/Nov/1994. * Zeroing /proc and other additions @@ -104,7 +106,7 @@ #include <net/udp.h> #include <net/sock.h> #include <net/icmp.h> -#include <net/netlink.h> +#include <linux/netlink.h> #include <linux/firewall.h> #include <linux/ip_fw.h> #include <linux/init.h> @@ -165,6 +167,10 @@ static int *policies[] = #endif +#ifdef CONFIG_IP_FIREWALL_NETLINK +struct sock *ipfwsk; +#endif + /* * Returns 1 if the port is matched by the vector, 0 otherwise */ @@ -376,15 +382,6 @@ int ip_fw_chk(struct iphdr *ip, struct device *rif, __u16 *redirport, struct ip_ continue; /* - * Look for a VIA address match - */ - if(f->fw_via.s_addr && rif) - { - if(rif->pa_addr!=f->fw_via.s_addr) - continue; /* Mismatch */ - } - - /* * Look for a VIA device match */ if(f->fw_viadev) @@ -651,6 +648,11 @@ static int insert_in_chain(struct ip_fw *volatile* chainptr, struct ip_fw *frwl, if ((ftmp->fw_vianame)[0]) { if (!(ftmp->fw_viadev = dev_get(ftmp->fw_vianame))) ftmp->fw_viadev = (struct device *) -1; + } else if (ftmp->fw_via.s_addr) { + if (!(ftmp->fw_viadev = ip_dev_find(ftmp->fw_via.s_addr))) + ftmp->fw_viadev = (struct device *) -1; + else + memcpy(ftmp->fw_vianame, ftmp->fw_viadev->name, IFNAMSIZ); } else ftmp->fw_viadev = NULL; @@ -695,6 +697,11 @@ static int append_to_chain(struct ip_fw *volatile* chainptr, struct ip_fw *frwl, if ((ftmp->fw_vianame)[0]) { if 
(!(ftmp->fw_viadev = dev_get(ftmp->fw_vianame))) ftmp->fw_viadev = (struct device *) -1; + } else if (ftmp->fw_via.s_addr) { + if (!(ftmp->fw_viadev = ip_dev_find(ftmp->fw_via.s_addr))) + ftmp->fw_viadev = (struct device *) -1; + else + memcpy(ftmp->fw_vianame, ftmp->fw_viadev->name, IFNAMSIZ); } else ftmp->fw_viadev = NULL; @@ -957,12 +964,6 @@ int ip_fw_ctl(int stage, void *m, int len) printk("ip_fw_ctl: invalid device \"%s\"\n", ipfwp->fwp_vianame); #endif return(EINVAL); - } else if ( viadev->pa_addr != ipfwp->fwp_via.s_addr ) { -#ifdef DEBUG_IP_FIREWALL - printk("ip_fw_ctl: device \"%s\" has another IP address\n", - ipfwp->fwp_vianame); -#endif - return(EINVAL); } else if ( ip->ihl != sizeof(struct iphdr) / sizeof(int)) { #ifdef DEBUG_IP_FIREWALL printk("ip_fw_ctl: ip->ihl=%d, want %d\n",ip->ihl, @@ -1066,6 +1067,7 @@ int ip_fw_ctl(int stage, void *m, int len) } #endif /* CONFIG_IP_FIREWALL */ +#ifdef CONFIG_PROC_FS #if defined(CONFIG_IP_FIREWALL) || defined(CONFIG_IP_ACCT) static int ip_chain_procinfo(int stage, char *buffer, char **start, @@ -1120,9 +1122,9 @@ static int ip_chain_procinfo(int stage, char *buffer, char **start, ntohl(i->fw_dst.s_addr),ntohl(i->fw_dmsk.s_addr), (i->fw_vianame)[0] ? 
i->fw_vianame : "-", ntohl(i->fw_via.s_addr),i->fw_flg); - /* 9 is enough for a 32 bit box but the counters are 64bit on + /* 10 is enough for a 32 bit box but the counters are 64bit on the Alpha and Ultrapenguin */ - len+=sprintf(buffer+len,"%u %u %-19lu %-19lu", + len+=sprintf(buffer+len,"%u %u %-20lu %-20lu", i->fw_nsp,i->fw_ndp, i->fw_pcnt,i->fw_bcnt); for (p = 0; p < IP_FW_MAX_PORTS; p++) len+=sprintf(buffer+len, " %u", i->fw_pts[p]); @@ -1192,6 +1194,7 @@ static int ip_fw_fwd_procinfo(char *buffer, char **start, off_t offset, reset); } #endif +#endif #ifdef CONFIG_IP_FIREWALL @@ -1323,8 +1326,7 @@ __initfunc(void ip_fw_init(void)) /* Register for device up/down reports */ register_netdevice_notifier(&ipfw_dev_notifier); #endif - #ifdef CONFIG_IP_FIREWALL_NETLINK - netlink_attach(NETLINK_FIREWALL, netlink_donothing); /* XXX */ -#endif /* CONFIG_IP_FIREWALL_NETLINK */ + ipfwsk = netlink_kernel_create(NETLINK_FIREWALL, NULL); +#endif } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 2642832e3..1c3c2da7a 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) module. * - * Version: @(#)ip.c 1.0.16b 9/1/93 + * Version: $Id: ip_input.c,v 1.24 1997/10/24 17:15:58 kuznet Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -153,8 +153,7 @@ #endif #include <linux/firewall.h> #include <linux/mroute.h> -#include <net/netlink.h> -#include <linux/net_alias.h> +#include <linux/netlink.h> #include <linux/ipsec.h> /* @@ -184,13 +183,55 @@ int ip_ioctl(struct sock *sk, int cmd, unsigned long arg) #define CONFIG_IP_ALWAYS_DEFRAG 1 #endif +/* + * 0 - deliver + * 1 - block + */ +static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) +{ + int type; + + type = skb->h.icmph->type; + if (type < 32) + return test_bit(type, &sk->tp_pinfo.tp_raw4.filter); + + /* Do not block unknown ICMP types */ + return 0; +} + +int ip_call_ra_chain(struct sk_buff *skb) +{ + struct ip_ra_chain *ra; + u8 protocol = skb->nh.iph->protocol; + struct sock *last = NULL; + + for (ra = ip_ra_chain; ra; ra = ra->next) { + struct sock *sk = ra->sk; + if (sk && sk->num == protocol) { + if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + skb = ip_defrag(skb); + if (skb == NULL) + return 1; + } + if (last) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + raw_rcv(last, skb2); + } + last = sk; + } + } + + if (last) { + raw_rcv(last, skb); + return 1; + } + return 0; +} int ip_local_deliver(struct sk_buff *skb) { struct iphdr *iph = skb->nh.iph; -#ifdef CONFIG_IP_MASQUERADE - struct device *dev = skb->dev; -#endif struct inet_protocol *ipprot; struct sock *raw_sk=NULL; unsigned char hash; @@ -214,7 +255,7 @@ int ip_local_deliver(struct sk_buff *skb) * Do we need to de-masquerade this packet? 
*/ { - int ret = ip_fw_demasquerade(&skb, dev); + int ret = ip_fw_demasquerade(&skb); if (ret < 0) { kfree_skb(skb, FREE_WRITE); return 0; @@ -256,22 +297,23 @@ int ip_local_deliver(struct sk_buff *skb) if((raw_sk = raw_v4_htable[hash]) != NULL) { struct sock *sknext = NULL; struct sk_buff *skb1; - raw_sk = raw_v4_lookup(raw_sk, iph->protocol, iph->saddr, iph->daddr); + raw_sk = raw_v4_lookup(raw_sk, iph->protocol, iph->saddr, iph->daddr, skb->dev->ifindex); if(raw_sk) { /* Any raw sockets */ do { /* Find the next */ sknext = raw_v4_lookup(raw_sk->next, iph->protocol, - iph->saddr, iph->daddr); - if(sknext) + iph->saddr, iph->daddr, skb->dev->ifindex); + if (iph->protocol != IPPROTO_ICMP || !icmp_filter(raw_sk, skb)) { + if (sknext == NULL) + break; skb1 = skb_clone(skb, GFP_ATOMIC); - else - break; /* One pending raw socket left */ - if(skb1) - { - if(ipsec_sk_policy(raw_sk,skb1)) - raw_rcv(raw_sk, skb1); - else - kfree_skb(skb1, FREE_WRITE); + if(skb1) + { + if(ipsec_sk_policy(raw_sk,skb1)) + raw_rcv(raw_sk, skb1); + else + kfree_skb(skb1, FREE_WRITE); + } } raw_sk = sknext; } while(raw_sk!=NULL); @@ -350,15 +392,6 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) struct ip_options * opt = NULL; int err; -#ifdef CONFIG_NET_IPV6 - /* - * Intercept IPv6 frames. We dump ST-II and invalid types just below.. - */ - - if(iph->version == 6) - return ipv6_rcv(skb,dev,pt); -#endif - /* * When interface is in promisc. mode, drop all the crap * that it receives, do not truing to analyse it. @@ -398,13 +431,18 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) * is IP we can trim to the true length of the frame. * Note this now means skb->len holds ntohs(iph->tot_len). 
*/ - - skb_trim(skb, ntohs(iph->tot_len)); + __skb_trim(skb, ntohs(iph->tot_len)); if (skb->dst == NULL) { err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev); if (err) goto drop; +#ifdef CONFIG_CPU_IS_SLOW + if (net_cpu_congestion > 10 && !(iph->tos&IPTOS_RELIABILITY) && + IPTOS_PREC(iph->tos) < IPTOS_PREC_INTERNETCONTROL) { + goto drop; + } +#endif } #ifdef CONFIG_IP_ALWAYS_DEFRAG @@ -425,12 +463,12 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) opt = &(IPCB(skb)->opt); if (opt->srr) { if (!ipv4_config.source_route) { - if (ipv4_config.log_martians) + if (ipv4_config.log_martians && net_ratelimit()) printk(KERN_INFO "source route option %08lx -> %08lx\n", ntohl(iph->saddr), ntohl(iph->daddr)); goto drop; } - if (RT_LOCALADDR(((struct rtable*)skb->dst)->rt_flags) && + if (((struct rtable*)skb->dst)->rt_type == RTN_LOCAL && ip_options_rcv_srr(skb)) goto drop; } diff --git a/net/ipv4/ip_masq.c b/net/ipv4/ip_masq.c index 2d2fd3717..8c300e155 100644 --- a/net/ipv4/ip_masq.c +++ b/net/ipv4/ip_masq.c @@ -339,7 +339,7 @@ static void masq_expire(unsigned long data) * given boundaries MASQ_BEGIN and MASQ_END. 
*/ -struct ip_masq * ip_masq_new(struct device *dev, int proto, __u32 saddr, __u16 sport, __u32 daddr, __u16 dport, unsigned mflags) +struct ip_masq * ip_masq_new(__u32 maddr, int proto, __u32 saddr, __u16 sport, __u32 daddr, __u16 dport, unsigned mflags) { struct ip_masq *ms, *mst; int ports_tried, *free_ports_p; @@ -377,7 +377,7 @@ struct ip_masq * ip_masq_new(struct device *dev, int proto, __u32 saddr, __u16 s ms->flags |= IP_MASQ_F_NO_DADDR; /* get masq address from rif */ - ms->maddr = dev->pa_addr; + ms->maddr = maddr; for (ports_tried = 0; ports_tried < *free_ports_p; ports_tried++){ save_flags(flags); @@ -449,7 +449,7 @@ static void recalc_check(struct udphdr *uh, __u32 saddr, uh->check=0xFFFF; } -int ip_fw_masquerade(struct sk_buff **skb_ptr, struct device *dev) +int ip_fw_masquerade(struct sk_buff **skb_ptr, __u32 maddr) { struct sk_buff *skb=*skb_ptr; struct iphdr *iph = skb->nh.iph; @@ -489,7 +489,7 @@ int ip_fw_masquerade(struct sk_buff **skb_ptr, struct device *dev) if (ms==NULL) { - ms = ip_masq_new(dev, iph->protocol, + ms = ip_masq_new(maddr, iph->protocol, iph->saddr, portptr[0], iph->daddr, portptr[1], 0); @@ -512,7 +512,7 @@ int ip_fw_masquerade(struct sk_buff **skb_ptr, struct device *dev) * Attempt ip_masq_app call. * will fix ip_masq and iph seq stuff */ - if (ip_masq_app_pkt_out(ms, skb_ptr, dev) != 0) + if (ip_masq_app_pkt_out(ms, skb_ptr, maddr) != 0) { /* * skb has possibly changed, update pointers. 
@@ -572,7 +572,7 @@ int ip_fw_masquerade(struct sk_buff **skb_ptr, struct device *dev) ip_send_check(iph); #ifdef DEBUG_CONFIG_IP_MASQUERADE - printk("O-routed from %lX:%X over %s\n",ntohl(ms->maddr),ntohs(ms->mport),dev->name); + printk("O-routed from %lX:%X via %lX\n",ntohl(ms->maddr),ntohs(ms->mport),ntohl(maddr)); #endif return 0; @@ -586,7 +586,7 @@ int ip_fw_masquerade(struct sk_buff **skb_ptr, struct device *dev) * Currently handles error types - unreachable, quench, ttl exceeded */ -int ip_fw_masq_icmp(struct sk_buff **skb_p, struct device *dev) +int ip_fw_masq_icmp(struct sk_buff **skb_p) { struct sk_buff *skb = *skb_p; struct iphdr *iph = skb->nh.iph; @@ -685,7 +685,7 @@ int ip_fw_masq_icmp(struct sk_buff **skb_p, struct device *dev) * Currently handles error types - unreachable, quench, ttl exceeded */ -int ip_fw_demasq_icmp(struct sk_buff **skb_p, struct device *dev) +int ip_fw_demasq_icmp(struct sk_buff **skb_p) { struct sk_buff *skb = *skb_p; struct iphdr *iph = skb->nh.iph; @@ -778,7 +778,7 @@ int ip_fw_demasq_icmp(struct sk_buff **skb_p, struct device *dev) * this function. */ -int ip_fw_demasquerade(struct sk_buff **skb_p, struct device *dev) +int ip_fw_demasquerade(struct sk_buff **skb_p) { struct sk_buff *skb = *skb_p; struct iphdr *iph = skb->nh.iph; @@ -789,7 +789,7 @@ int ip_fw_demasquerade(struct sk_buff **skb_p, struct device *dev) switch (iph->protocol) { case IPPROTO_ICMP: - return(ip_fw_demasq_icmp(skb_p, dev)); + return(ip_fw_demasq_icmp(skb_p)); case IPPROTO_TCP: case IPPROTO_UDP: /* Make sure packet is in the masq range */ @@ -869,7 +869,7 @@ int ip_fw_demasquerade(struct sk_buff **skb_p, struct device *dev) * will fix ip_masq and iph ack_seq stuff */ - if (ip_masq_app_pkt_in(ms, skb_p, dev) != 0) + if (ip_masq_app_pkt_in(ms, skb_p) != 0) { /* * skb has changed, update pointers. 
@@ -937,6 +937,7 @@ int ip_fw_demasquerade(struct sk_buff **skb_p, struct device *dev) return 0; } +#ifdef CONFIG_PROC_FS /* * /proc/net entry */ @@ -999,7 +1000,6 @@ done: return len; } -#ifdef CONFIG_PROC_FS static struct proc_dir_entry proc_net_ipmsqhst = { PROC_NET_IPMSQHST, 13, "ip_masquerade", S_IFREG | S_IRUGO, 1, 0, 0, diff --git a/net/ipv4/ip_masq_app.c b/net/ipv4/ip_masq_app.c index f7449e0ba..f03aef04b 100644 --- a/net/ipv4/ip_masq_app.c +++ b/net/ipv4/ip_masq_app.c @@ -306,7 +306,7 @@ static __inline__ void masq_seq_update(struct ip_masq *ms, struct ip_masq_seq *m * returns (new - old) skb->len diff. */ -int ip_masq_app_pkt_out(struct ip_masq *ms, struct sk_buff **skb_p, struct device *dev) +int ip_masq_app_pkt_out(struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr) { struct ip_masq_app * mapp; struct iphdr *iph; @@ -351,7 +351,7 @@ int ip_masq_app_pkt_out(struct ip_masq *ms, struct sk_buff **skb_p, struct devic if ( mapp->pkt_out == NULL ) return 0; - diff = mapp->pkt_out(mapp, ms, skb_p, dev); + diff = mapp->pkt_out(mapp, ms, skb_p, maddr); /* * Update ip_masq seq stuff if len has changed. @@ -369,7 +369,7 @@ int ip_masq_app_pkt_out(struct ip_masq *ms, struct sk_buff **skb_p, struct devic * returns (new - old) skb->len diff. */ -int ip_masq_app_pkt_in(struct ip_masq *ms, struct sk_buff **skb_p, struct device *dev) +int ip_masq_app_pkt_in(struct ip_masq *ms, struct sk_buff **skb_p) { struct ip_masq_app * mapp; struct iphdr *iph; @@ -414,7 +414,7 @@ int ip_masq_app_pkt_in(struct ip_masq *ms, struct sk_buff **skb_p, struct device if ( mapp->pkt_in == NULL ) return 0; - diff = mapp->pkt_in(mapp, ms, skb_p, dev); + diff = mapp->pkt_in(mapp, ms, skb_p); /* * Update ip_masq seq stuff if len has changed. 
diff --git a/net/ipv4/ip_masq_ftp.c b/net/ipv4/ip_masq_ftp.c index 4d5568d0a..4cb88d925 100644 --- a/net/ipv4/ip_masq_ftp.c +++ b/net/ipv4/ip_masq_ftp.c @@ -50,7 +50,7 @@ masq_ftp_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms) } int -masq_ftp_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, struct device *dev) +masq_ftp_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr) { struct sk_buff *skb; struct iphdr *iph; @@ -118,7 +118,7 @@ masq_ftp_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb ip_masq_set_expire(n_ms,0); } else { - n_ms = ip_masq_new(dev, IPPROTO_TCP, + n_ms = ip_masq_new(maddr, IPPROTO_TCP, htonl(from), htons(port), iph->daddr, 0, IP_MASQ_F_NO_DPORT); diff --git a/net/ipv4/ip_masq_irc.c b/net/ipv4/ip_masq_irc.c index a1be56f81..b2e325ce6 100644 --- a/net/ipv4/ip_masq_irc.c +++ b/net/ipv4/ip_masq_irc.c @@ -51,7 +51,7 @@ masq_irc_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms) } int -masq_irc_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, struct device *dev) +masq_irc_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr) { struct sk_buff *skb; struct iphdr *iph; @@ -167,7 +167,7 @@ masq_irc_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb * connection is requested by another client. 
*/ - n_ms = ip_masq_new(dev, IPPROTO_TCP, + n_ms = ip_masq_new(maddr, IPPROTO_TCP, htonl(s_addr),htons(s_port), 0, 0, IP_MASQ_F_NO_DPORT|IP_MASQ_F_NO_DADDR diff --git a/net/ipv4/ip_masq_quake.c b/net/ipv4/ip_masq_quake.c index 08a062bc7..482096f2b 100644 --- a/net/ipv4/ip_masq_quake.c +++ b/net/ipv4/ip_masq_quake.c @@ -73,7 +73,7 @@ masq_quake_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms) } int -masq_quake_in (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, struct device *dev) +masq_quake_in (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p) { struct sk_buff *skb; struct iphdr *iph; @@ -158,7 +158,7 @@ masq_quake_in (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **sk } int -masq_quake_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, struct device *dev) +masq_quake_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr) { struct sk_buff *skb; struct iphdr *iph; @@ -234,7 +234,7 @@ masq_quake_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **s memcpy(&udp_port, data, 2); - n_ms = ip_masq_new(dev, IPPROTO_UDP, + n_ms = ip_masq_new(maddr, IPPROTO_UDP, ms->saddr, htons(udp_port), ms->daddr, ms->dport, 0); diff --git a/net/ipv4/ip_masq_raudio.c b/net/ipv4/ip_masq_raudio.c index 52f439102..26b5cd4da 100644 --- a/net/ipv4/ip_masq_raudio.c +++ b/net/ipv4/ip_masq_raudio.c @@ -2,7 +2,7 @@ * IP_MASQ_RAUDIO - Real Audio masquerading module * * - * Version: @(#)$Id: ip_masq_raudio.c,v 1.6 1997/04/29 09:38:26 mj Exp $ + * Version: @(#)$Id: ip_masq_raudio.c,v 1.7 1997/09/16 18:43:40 kuznet Exp $ * * Author: Nigel Metheringham * [strongly based on ftp module by Juan Jose Ciarlante & Wouter Gadeyne] @@ -88,7 +88,7 @@ masq_raudio_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms) } int -masq_raudio_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, struct device *dev) +masq_raudio_out (struct ip_masq_app *mapp, struct 
ip_masq *ms, struct sk_buff **skb_p, __u32 maddr) { struct sk_buff *skb; struct iphdr *iph; @@ -154,7 +154,7 @@ masq_raudio_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff ** if (ntohs(msg_id) == 1) { /* This is a message detailing the UDP port to be used */ memcpy(&udp_port, p, 2); - n_ms = ip_masq_new(dev, IPPROTO_UDP, + n_ms = ip_masq_new(maddr, IPPROTO_UDP, ms->saddr, udp_port, ms->daddr, 0, IP_MASQ_F_NO_DPORT); diff --git a/net/ipv4/ip_nat_dumb.c b/net/ipv4/ip_nat_dumb.c index 1d510af42..06e9be8fb 100644 --- a/net/ipv4/ip_nat_dumb.c +++ b/net/ipv4/ip_nat_dumb.c @@ -5,6 +5,8 @@ * * Dumb Network Address Translation. * + * Version: $Id: ip_nat_dumb.c,v 1.2 1997/10/10 22:41:05 davem Exp $ + * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 80baf8364..14b423f2f 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -5,6 +5,8 @@ * * The options processing module for ip.c * + * Version: $Id: ip_options.c,v 1.12 1997/10/10 22:41:08 davem Exp $ + * * Authors: A.N.Kuznetsov * */ @@ -15,10 +17,10 @@ #include <linux/ip.h> #include <linux/icmp.h> #include <linux/netdevice.h> +#include <linux/rtnetlink.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> -#include <linux/net_alias.h> /* * Write options to IP header, record destination address to @@ -32,7 +34,7 @@ */ void ip_options_build(struct sk_buff * skb, struct ip_options * opt, - u32 daddr, u32 saddr, int is_frag) + u32 daddr, struct rtable *rt, int is_frag) { unsigned char * iph = skb->nh.raw; @@ -46,9 +48,9 @@ void ip_options_build(struct sk_buff * skb, struct ip_options * opt, if (!is_frag) { if (opt->rr_needaddr) - memcpy(iph+opt->rr+iph[opt->rr+2]-5, &saddr, 4); + ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, rt); if (opt->ts_needaddr) - memcpy(iph+opt->ts+iph[opt->ts+2]-9, &saddr, 4); + ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, 
rt); if (opt->ts_needtime) { struct timeval tv; __u32 midtime; @@ -147,7 +149,7 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) if (((struct timestamp*)(dptr+1))->flags == IPOPT_TS_PRESPEC) { __u32 addr; memcpy(&addr, sptr+soffset-9, 4); - if (__ip_chk_addr(addr) == 0) { + if (inet_addr_type(addr) == RTN_UNICAST) { dopt->ts_needtime = 0; dopt->ts_needaddr = 0; soffset -= 8; @@ -248,6 +250,7 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb) unsigned char * optptr; int optlen; unsigned char * pp_ptr = NULL; + struct rtable *rt = skb ? (struct rtable*)skb->dst : NULL; if (!opt) { opt = &(IPCB(skb)->opt); @@ -328,7 +331,7 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb) goto error; } if (skb) { - memcpy(&optptr[optptr[2]-1], &skb->dev->pa_addr, 4); + memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); opt->is_changed = 1; } optptr[2] += 4; @@ -371,7 +374,7 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb) } opt->ts = optptr - iph; if (skb) { - memcpy(&optptr[ts->ptr-1], &skb->dev->pa_addr, 4); + memcpy(&optptr[ts->ptr-1], &rt->rt_spec_dst, 4); timeptr = (__u32*)&optptr[ts->ptr+3]; } opt->ts_needaddr = 1; @@ -387,7 +390,7 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb) { u32 addr; memcpy(&addr, &optptr[ts->ptr-1], 4); - if (__ip_chk_addr(addr) == 0) + if (inet_addr_type(addr) == RTN_UNICAST) break; if (skb) timeptr = (__u32*)&optptr[ts->ptr+3]; @@ -521,7 +524,7 @@ void ip_forward_options(struct sk_buff *skb) if (opt->rr_needaddr) { optptr = (unsigned char *)raw + opt->rr; - memcpy(&optptr[optptr[2]-5], &rt->u.dst.dev->pa_addr, 4); + ip_rt_get_source(&optptr[optptr[2]-5], rt); opt->is_changed = 1; } if (opt->srr_is_hit) { @@ -540,20 +543,20 @@ void ip_forward_options(struct sk_buff *skb) } if (srrptr + 3 <= srrspace) { opt->is_changed = 1; - memcpy(&optptr[srrptr-1], &rt->u.dst.dev->pa_addr, 4); + ip_rt_get_source(&optptr[srrptr-1], rt); skb->nh.iph->daddr 
= rt->rt_dst; optptr[2] = srrptr+4; } else printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); if (opt->ts_needaddr) { optptr = raw + opt->ts; - memcpy(&optptr[optptr[2]-9], &rt->u.dst.dev->pa_addr, 4); + ip_rt_get_source(&optptr[optptr[2]-9], rt); opt->is_changed = 1; } - if (opt->is_changed) { - opt->is_changed = 0; - ip_send_check(skb->nh.iph); - } + } + if (opt->is_changed) { + opt->is_changed = 0; + ip_send_check(skb->nh.iph); } } @@ -571,16 +574,16 @@ int ip_options_rcv_srr(struct sk_buff *skb) if (!opt->srr) return 0; - if (rt->rt_flags&(RTF_BROADCAST|RTF_MULTICAST|RTF_NAT) - || skb->pkt_type != PACKET_HOST) + if (skb->pkt_type != PACKET_HOST) return -EINVAL; - - if (!(rt->rt_flags & RTF_LOCAL)) { + if (rt->rt_type == RTN_UNICAST) { if (!opt->is_strictroute) return 0; icmp_send(skb, ICMP_PARAMETERPROB, 0, 16); return -EINVAL; } + if (rt->rt_type != RTN_LOCAL) + return -EINVAL; for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) { if (srrptr + 3 > srrspace) { @@ -591,16 +594,15 @@ int ip_options_rcv_srr(struct sk_buff *skb) rt = (struct rtable*)skb->dst; skb->dst = NULL; - err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, - net_alias_main_dev(skb->dev)); + err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev); rt2 = (struct rtable*)skb->dst; - if (err || rt2->rt_flags&(RTF_BROADCAST|RTF_MULTICAST|RTF_NAT)) { + if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { ip_rt_put(rt2); skb->dst = &rt->u.dst; return -EINVAL; } ip_rt_put(rt); - if (!(rt2->rt_flags&RTF_LOCAL)) + if (rt2->rt_type != RTN_LOCAL) break; /* Superfast 8) loopback forward */ memcpy(&iph->daddr, &optptr[srrptr-1], 4); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 4f070ed0b..106236c93 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) output module. 
* - * Version: @(#)ip.c 1.0.16b 9/1/93 + * Version: $Id: ip_output.c,v 1.40 1997/10/12 17:01:48 kuznet Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -67,7 +67,7 @@ #include <linux/ip_fw.h> #include <linux/firewall.h> #include <linux/mroute.h> -#include <net/netlink.h> +#include <linux/netlink.h> #include <linux/ipsec.h> static void __inline__ ip_ll_header_reserve(struct sk_buff *skb) @@ -92,7 +92,7 @@ int ip_build_pkt(struct sk_buff *skb, struct sock *sk, u32 saddr, u32 daddr, daddr = opt->faddr; err = ip_route_output(&rt, daddr, saddr, RT_TOS(sk->ip_tos) | - (sk->localroute||0), NULL); + (sk->localroute||0), sk->bound_dev_if); if (err) { ip_statistics.IpOutNoRoutes++; @@ -130,7 +130,7 @@ int ip_build_pkt(struct sk_buff *skb, struct sock *sk, u32 saddr, u32 daddr, iph->tos = sk->ip_tos; iph->frag_off = 0; if (sk->ip_pmtudisc == IP_PMTUDISC_WANT && - !(rt->rt_flags & RTF_NOPMTUDISC)) + !(rt->rt_flags & RTCF_NOPMTUDISC)) iph->frag_off |= htons(IP_DF); iph->ttl = sk->ip_ttl; iph->daddr = rt->rt_dst; @@ -143,8 +143,7 @@ int ip_build_pkt(struct sk_buff *skb, struct sock *sk, u32 saddr, u32 daddr, { iph->ihl += opt->optlen>>2; skb->h.raw += opt->optlen; - ip_options_build(skb, opt, final_daddr, - rt->u.dst.dev->pa_addr, 0); + ip_options_build(skb, opt, final_daddr, rt, 0); } ip_rt_put(rt); @@ -170,9 +169,10 @@ int ip_build_header(struct sk_buff *skb, struct sock *sk) rt = (struct rtable*)sk->dst_cache; if (!rt || rt->u.dst.obsolete) { + sk->dst_cache = NULL; ip_rt_put(rt); err = ip_route_output(&rt, daddr, sk->saddr, RT_TOS(sk->ip_tos) | - (sk->localroute||0), NULL); + (sk->localroute||0), sk->bound_dev_if); if (err) return err; sk->dst_cache = &rt->u.dst; @@ -210,7 +210,7 @@ int ip_build_header(struct sk_buff *skb, struct sock *sk) iph->tos = sk->ip_tos; iph->frag_off = 0; if (sk->ip_pmtudisc == IP_PMTUDISC_WANT && - !(rt->rt_flags & RTF_NOPMTUDISC)) + !(rt->rt_flags & RTCF_NOPMTUDISC)) iph->frag_off |= 
htons(IP_DF); iph->ttl = sk->ip_ttl; iph->daddr = rt->rt_dst; @@ -223,7 +223,7 @@ int ip_build_header(struct sk_buff *skb, struct sock *sk) return 0; iph->ihl += opt->optlen>>2; skb->h.raw += opt->optlen; - ip_options_build(skb, opt, final_daddr, rt->u.dst.dev->pa_addr, 0); + ip_options_build(skb, opt, final_daddr, rt, 0); return 0; } @@ -242,17 +242,35 @@ int ip_mc_output(struct sk_buff *skb) #ifdef CONFIG_IP_ACCT ip_fw_chk(skb->nh.iph, skb->dev,NULL,ip_acct_chain,0,IP_FW_MODE_ACCT_OUT); #endif - +#ifdef CONFIG_IP_ROUTE_NAT if (rt->rt_flags & RTCF_NAT) ip_do_nat(skb); +#endif /* * Multicasts are looped back for other local users */ - - if (rt->rt_flags&RTF_MULTICAST && !(dev->flags&IFF_LOOPBACK)) { - if (sk==NULL || sk->ip_mc_loop) - dev_loopback_xmit(skb); + + if (rt->rt_flags&RTCF_MULTICAST && (!sk || sk->ip_mc_loop)) { +#ifndef CONFIG_IP_MROUTE +#if 1 + /* It should never occur. Delete it eventually. --ANK */ + if (!(rt->rt_flags&RTCF_LOCAL) || (dev->flags&IFF_LOOPBACK)) + printk(KERN_DEBUG "ip_mc_output (mc): it should never occur\n"); + else +#endif +#else + /* Small optimization: do not loopback not local frames, + which returned after forwarding; they will be dropped + by ip_mr_input in any case. + Note, that local frames are looped back to be delivered + to local recipients. + + This check is duplicated in ip_mr_input at the moment. + */ + if ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED)) +#endif + dev_loopback_xmit(skb); /* Multicasts with ttl 0 must not go beyond the host */ @@ -262,9 +280,15 @@ int ip_mc_output(struct sk_buff *skb) } } - if ((rt->rt_flags&(RTF_LOCAL|RTF_BROADCAST)) == (RTF_LOCAL|RTF_BROADCAST) && - !(dev->flags&IFF_LOOPBACK)) + if (rt->rt_flags&RTCF_BROADCAST) { +#if 1 + /* It should never occur. Delete it eventually. 
--ANK */ + if (!(rt->rt_flags&RTCF_LOCAL) || (dev->flags&IFF_LOOPBACK)) + printk(KERN_DEBUG "ip_mc_output (brd): it should never occur!\n"); + else +#endif dev_loopback_xmit(skb); + } if (dev->flags & IFF_UP) { dev_queue_xmit(skb); @@ -291,8 +315,10 @@ int ip_output(struct sk_buff *skb) ip_fw_chk(skb->nh.iph, skb->dev,NULL,ip_acct_chain,0,IP_FW_MODE_ACCT_OUT); #endif +#ifdef CONFIG_IP_ROUTE_NAT if (rt->rt_flags&RTCF_NAT) ip_do_nat(skb); +#endif if (dev->flags & IFF_UP) { dev_queue_xmit(skb); @@ -431,8 +457,7 @@ check_route: */ { struct rtable *nrt; - if (ip_route_output(&nrt, rt->key.dst, rt->key.src, - rt->key.tos, NULL)) { + if (ip_route_output(&nrt, rt->key.dst, rt->key.src, rt->key.tos, sk?sk->bound_dev_if:0)) { kfree_skb(skb, 0); return; } @@ -500,14 +525,13 @@ int ip_build_xmit(struct sock *sk, int hh_len = rt->u.dst.dev->hard_header_len; int nfrags=0; struct ip_options *opt = ipc->opt; - struct device *dev = rt->u.dst.dev; int df = htons(IP_DF); #ifdef CONFIG_NET_SECURITY int fw_res; #endif if (sk->ip_pmtudisc == IP_PMTUDISC_DONT || - rt->rt_flags&RTF_NOPMTUDISC) + rt->rt_flags&RTCF_NOPMTUDISC) df = 0; @@ -546,7 +570,7 @@ int ip_build_xmit(struct sock *sk, iph->id=htons(ip_id_count++); iph->frag_off = df; iph->ttl=sk->ip_mc_ttl; - if (!(rt->rt_flags&RTF_MULTICAST)) + if (rt->rt_type != RTN_MULTICAST) iph->ttl=sk->ip_ttl; iph->protocol=sk->protocol; iph->saddr=rt->rt_src; @@ -695,14 +719,14 @@ int ip_build_xmit(struct sock *sk, if (opt) { iph->ihl += opt->optlen>>2; ip_options_build(skb, opt, - ipc->addr, dev->pa_addr, offset); + ipc->addr, rt, offset); } iph->tos = sk->ip_tos; iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4); iph->id = id; iph->frag_off = htons(offset>>3); iph->frag_off |= mf|df; - if (rt->rt_flags&RTF_MULTICAST) + if (rt->rt_type == RTN_MULTICAST) iph->ttl = sk->ip_mc_ttl; else iph->ttl = sk->ip_ttl; @@ -966,7 +990,7 @@ struct sk_buff * ip_reply(struct sk_buff *skb, int payload) if (ipc.opt->srr) daddr = replyopts.opt.faddr; - if 
(ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), NULL)) + if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0)) return NULL; iphlen = sizeof(struct iphdr) + replyopts.opt.optlen; @@ -1000,7 +1024,7 @@ struct sk_buff * ip_reply(struct sk_buff *skb, int payload) iph->saddr = rt->rt_src; iph->protocol = skb->nh.iph->protocol; - ip_options_build(reply, &replyopts.opt, daddr, rt->u.dst.dev->pa_addr, 0); + ip_options_build(reply, &replyopts.opt, daddr, rt, 0); return reply; } @@ -1019,43 +1043,16 @@ static struct packet_type ip_packet_type = }; -/* - * Device notifier - */ - -static int ip_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) -{ - struct device *dev=ptr; - - if (dev->family != AF_INET) - return NOTIFY_DONE; - - if(event==NETDEV_UP) - { - /* - * Join the initial group if multicast. - */ - ip_mc_allhost(dev); - } - if(event==NETDEV_DOWN) - ip_mc_drop_device(dev); - - return ip_rt_event(event, dev); -} - -struct notifier_block ip_netdev_notifier={ - ip_netdev_event, - NULL, - 0 -}; #ifdef CONFIG_PROC_FS +#ifdef CONFIG_IP_MULTICAST static struct proc_dir_entry proc_net_igmp = { PROC_NET_IGMP, 4, "igmp", S_IFREG | S_IRUGO, 1, 0, 0, 0, &proc_net_inode_operations, ip_mc_procinfo }; +#endif #endif /* @@ -1068,11 +1065,10 @@ __initfunc(void ip_init(void)) ip_rt_init(); - /* So we flush routes and multicast lists when a device is downed */ - register_netdevice_notifier(&ip_netdev_notifier); - #ifdef CONFIG_PROC_FS +#ifdef CONFIG_IP_MULTICAST proc_net_register(&proc_net_igmp); +#endif #endif } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 366ce9fb9..080452dd3 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -5,6 +5,8 @@ * * The IP to API glue. 
* + * Version: $Id: ip_sockglue.c,v 1.28 1997/11/17 17:36:08 kuznet Exp $ + * * Authors: see ip.c * * Fixes: @@ -27,6 +29,7 @@ #include <net/icmp.h> #include <linux/tcp.h> #include <linux/udp.h> +#include <linux/igmp.h> #include <linux/firewall.h> #include <linux/ip_fw.h> #include <net/checksum.h> @@ -36,34 +39,47 @@ #include <asm/uaccess.h> +#define IP_CMSG_PKTINFO 1 +#define IP_CMSG_TTL 2 +#define IP_CMSG_TOS 4 +#define IP_CMSG_RECVOPTS 8 +#define IP_CMSG_RETOPTS 16 + /* * SOL_IP control messages. */ -static void ip_cmsg_recv_rxinfo(struct msghdr *msg, struct sk_buff *skb) +static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) { struct in_pktinfo info; struct rtable *rt = (struct rtable *)skb->dst; - info.ipi_ifindex = skb->dev->ifindex; info.ipi_addr.s_addr = skb->nh.iph->daddr; - info.ipi_spec_dst.s_addr = rt->rt_spec_dst; + if (rt) { + info.ipi_ifindex = rt->rt_iif; + info.ipi_spec_dst.s_addr = rt->rt_spec_dst; + } else { + info.ipi_ifindex = 0; + info.ipi_spec_dst.s_addr = 0; + } - put_cmsg(msg, SOL_IP, IP_RXINFO, sizeof(info), &info); + put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); } -static void ip_cmsg_recv_localaddr(struct msghdr *msg, struct sk_buff *skb, int local) +static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb) { - struct in_addr addr; + if (IPCB(skb)->opt.optlen == 0) + return; - addr.s_addr = skb->nh.iph->daddr; + put_cmsg(msg, SOL_IP, IP_TTL, 1, &skb->nh.iph->ttl); +} - if (local) { - struct rtable *rt = (struct rtable *)skb->dst; - addr.s_addr = rt->rt_spec_dst; - } - put_cmsg(msg, SOL_IP, local ? 
IP_LOCALADDR : IP_RECVDSTADDR, - sizeof(addr), &addr); +static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb) +{ + if (IPCB(skb)->opt.optlen == 0) + return; + + put_cmsg(msg, SOL_IP, IP_TOS, 1, &skb->nh.iph->tos); } static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb) @@ -99,26 +115,30 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) /* Ordered by supposed usage frequency */ if (flags & 1) - ip_cmsg_recv_rxinfo(msg, skb); + ip_cmsg_recv_pktinfo(msg, skb); if ((flags>>=1) == 0) return; + if (flags & 1) - ip_cmsg_recv_localaddr(msg, skb, 1); + ip_cmsg_recv_ttl(msg, skb); if ((flags>>=1) == 0) return; + if (flags & 1) - ip_cmsg_recv_opts(msg, skb); + ip_cmsg_recv_tos(msg, skb); if ((flags>>=1) == 0) return; + if (flags & 1) - ip_cmsg_recv_retopts(msg, skb); + ip_cmsg_recv_opts(msg, skb); if ((flags>>=1) == 0) return; + if (flags & 1) - ip_cmsg_recv_localaddr(msg, skb, 0); + ip_cmsg_recv_retopts(msg, skb); } -int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc, struct device **devp) +int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc) { int err; struct cmsghdr *cmsg; @@ -127,27 +147,19 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc, struct device **de if (cmsg->cmsg_level != SOL_IP) continue; switch (cmsg->cmsg_type) { - case IP_LOCALADDR: - if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_addr))) - return -EINVAL; - memcpy(&ipc->addr, CMSG_DATA(cmsg), sizeof(struct in_addr)); - break; case IP_RETOPTS: err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? 
err : 40, 0); if (err) return err; break; - case IP_TXINFO: + case IP_PKTINFO: { struct in_pktinfo *info; if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo))) return -EINVAL; info = (struct in_pktinfo *)CMSG_DATA(cmsg); - if (info->ipi_ifindex && !devp) - return -EINVAL; - if ((*devp = dev_get_by_index(info->ipi_ifindex)) == NULL) - return -ENODEV; + ipc->oif = info->ipi_ifindex; ipc->addr = info->ipi_spec_dst.s_addr; break; } @@ -158,6 +170,53 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc, struct device **de return 0; } + +/* Special input handler for packets catched by router alert option. + They are selected only by protocol field, and then processed likely + local ones; but only if someone wants them! Otherwise, router + not running rsvpd will kill RSVP. + + It is user level problem, what it will make with them. + I have no idea, how it will masquearde or NAT them (it is joke, joke :-)), + but receiver should be enough clever f.e. to forward mtrace requests, + sent to multicast group to reach destination designated router. + */ +struct ip_ra_chain *ip_ra_chain; + +int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *)) +{ + struct ip_ra_chain *ra, *new_ra, **rap; + + if (sk->type != SOCK_RAW || sk->num == IPPROTO_RAW) + return -EINVAL; + + new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; + + for (rap = &ip_ra_chain; (ra=*rap) != NULL; rap = &ra->next) { + if (ra->sk == sk) { + if (on) { + if (new_ra) + kfree(new_ra); + return -EADDRINUSE; + } + *rap = ra->next; + if (ra->destructor) + ra->destructor(sk); + kfree(ra); + return 0; + } + } + if (new_ra == NULL) + return -ENOBUFS; + new_ra->sk = sk; + new_ra->destructor = destructor; + start_bh_atomic(); + new_ra->next = ra; + *rap = new_ra; + end_bh_atomic(); + return 0; +} + /* * Socket option code for IP. This is the end of the line after any TCP,UDP etc options on * an IP socket. 
@@ -168,7 +227,6 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc, struct device **de int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { int val=0,err; - unsigned char ucval = 0; #if defined(CONFIG_IP_FIREWALL) || defined(CONFIG_IP_ACCT) struct ip_fw tmp_fw; #endif @@ -177,9 +235,12 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt if(get_user(val, (int *) optval)) return -EFAULT; } else if(optlen>=sizeof(char)) { + unsigned char ucval; if(get_user(ucval, (unsigned char *) optval)) return -EFAULT; + val = (int)ucval; } + /* If optlen==0, it is equivalent to val == 0 */ if(level!=SOL_IP) return -ENOPROTOOPT; @@ -213,50 +274,38 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt kfree_s(old_opt, sizeof(struct ip_options) + old_opt->optlen); return 0; } - case IP_RXINFO: - if (optlen<4) - return -EINVAL; + case IP_PKTINFO: if (val) - sk->ip_cmsg_flags |= 1; + sk->ip_cmsg_flags |= IP_CMSG_PKTINFO; else - sk->ip_cmsg_flags &= ~1; + sk->ip_cmsg_flags &= ~IP_CMSG_PKTINFO; return 0; - case IP_LOCALADDR: - if (optlen<4) - return -EINVAL; + case IP_RECVTTL: if (val) - sk->ip_cmsg_flags |= 2; + sk->ip_cmsg_flags |= IP_CMSG_TTL; else - sk->ip_cmsg_flags &= ~2; + sk->ip_cmsg_flags &= ~IP_CMSG_TTL; return 0; - case IP_RECVOPTS: - if (optlen<4) - return -EINVAL; + case IP_RECVTOS: if (val) - sk->ip_cmsg_flags |= 4; + sk->ip_cmsg_flags |= IP_CMSG_TOS; else - sk->ip_cmsg_flags &= ~4; + sk->ip_cmsg_flags &= ~IP_CMSG_TOS; return 0; - case IP_RETOPTS: - if (optlen<4) - return -EINVAL; + case IP_RECVOPTS: if (val) - sk->ip_cmsg_flags |= 8; + sk->ip_cmsg_flags |= IP_CMSG_RECVOPTS; else - sk->ip_cmsg_flags &= ~8; + sk->ip_cmsg_flags &= ~IP_CMSG_RECVOPTS; return 0; - case IP_RECVDSTADDR: - if (optlen<4) - return -EINVAL; + case IP_RETOPTS: if (val) - sk->ip_cmsg_flags |= 0x10; + sk->ip_cmsg_flags |= IP_CMSG_RETOPTS; else - sk->ip_cmsg_flags &= ~0x10; + sk->ip_cmsg_flags &= 
~IP_CMSG_RETOPTS; return 0; case IP_TOS: /* This sets both TOS and Precedence */ /* Reject setting of unused bits */ - if (optlen<4) - return -EINVAL; if (val & ~(IPTOS_TOS_MASK|IPTOS_PREC_MASK)) return -EINVAL; if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP && !suser()) @@ -274,29 +323,25 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt sk->priority = rt_tos2priority(val); return 0; case IP_TTL: - if (optlen<4) + if (optlen<1) return -EINVAL; + if(val==-1) + val = ip_statistics.IpDefaultTTL; if(val<1||val>255) return -EINVAL; sk->ip_ttl=val; return 0; case IP_HDRINCL: - if (optlen<4) - return -EINVAL; if(sk->type!=SOCK_RAW) return -ENOPROTOOPT; sk->ip_hdrincl=val?1:0; return 0; case IP_PMTUDISC: - if (optlen<4) - return -EINVAL; if (val<0 || val>2) return -EINVAL; sk->ip_pmtudisc = val; return 0; case IP_RECVERR: - if (optlen<4) - return -EINVAL; if (sk->type==SOCK_STREAM) return -ENOPROTOOPT; lock_sock(sk); @@ -312,211 +357,81 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt case IP_MULTICAST_TTL: if (optlen<1) return -EINVAL; - sk->ip_mc_ttl=(int)ucval; + if (val==-1) + val = 1; + if (val < 0 || val > 255) + return -EINVAL; + sk->ip_mc_ttl=val; return 0; case IP_MULTICAST_LOOP: if (optlen<1) return -EINVAL; - if(ucval!=0 && ucval!=1) - return -EINVAL; - sk->ip_mc_loop=(int)ucval; + sk->ip_mc_loop = val ? 
1 : 0; return 0; case IP_MULTICAST_IF: { - struct in_addr addr; + struct ip_mreqn mreq; struct device *dev = NULL; /* * Check the arguments are allowable */ - if(optlen<sizeof(addr)) - return -EINVAL; - - if(copy_from_user(&addr,optval,sizeof(addr))) - return -EFAULT; - - - - /* - * What address has been requested - */ - - if (addr.s_addr==INADDR_ANY) /* Default */ - { - sk->ip_mc_index = 0; - return 0; - } - - /* - * Find the device - */ - - dev=ip_dev_find(addr.s_addr, NULL); - - /* - * Did we find one - */ - - if(dev) - { - sk->ip_mc_index = dev->ifindex; - return 0; + if (optlen >= sizeof(struct ip_mreqn)) { + if (copy_from_user(&mreq,optval,sizeof(mreq))) + return -EFAULT; + } else { + memset(&mreq, 0, sizeof(mreq)); + if (optlen >= sizeof(struct in_addr) && + copy_from_user(&mreq.imr_address,optval,sizeof(struct in_addr))) + return -EFAULT; } - return -EADDRNOTAVAIL; - } - - - case IP_ADD_MEMBERSHIP: - { - -/* - * FIXME: Add/Del membership should have a semaphore protecting them from re-entry - */ - struct ip_mreq mreq; - struct rtable *rt; - struct device *dev=NULL; - - /* - * Check the arguments. - */ - - if(optlen<sizeof(mreq)) - return -EINVAL; - if(copy_from_user(&mreq,optval,sizeof(mreq))) - return -EFAULT; - - /* - * Get device for use later - */ - - if (mreq.imr_interface.s_addr==INADDR_ANY) { - err = ip_route_output(&rt, mreq.imr_multiaddr.s_addr, 0, 1, NULL); - if (err) - return err; - dev = rt->u.dst.dev; - ip_rt_put(rt); - } else - dev = ip_dev_find(mreq.imr_interface.s_addr, NULL); - - /* - * No device, no cookies. - */ - - if(!dev) - return -ENODEV; - - /* - * Join group. 
- */ - - return ip_mc_join_group(sk,dev,mreq.imr_multiaddr.s_addr); - } - - case IP_DROP_MEMBERSHIP: - { - struct ip_mreq mreq; - struct rtable *rt; - struct device *dev=NULL; - - /* - * Check the arguments - */ - - if(optlen<sizeof(mreq)) - return -EINVAL; - if(copy_from_user(&mreq,optval,sizeof(mreq))) - return -EFAULT; - - /* - * Get device for use later - */ - - if (mreq.imr_interface.s_addr==INADDR_ANY) { - err = ip_route_output(&rt, mreq.imr_multiaddr.s_addr, 0, 1, NULL); - if (err) - return err; - dev = rt->u.dst.dev; - ip_rt_put(rt); - } else - dev = ip_dev_find(mreq.imr_interface.s_addr, NULL); - - /* - * Did we find a suitable device. - */ - - if(!dev) - return -ENODEV; - - /* - * Leave group - */ - - return ip_mc_leave_group(sk,dev,mreq.imr_multiaddr.s_addr); - } - - case IP_MULTICAST_IFN: - { - struct ip_mreqn mreq; - struct device *dev = NULL; - - if(optlen<sizeof(mreq)) - return -EINVAL; - if(copy_from_user(&mreq,optval,sizeof(mreq))) - return -EFAULT; if (!mreq.imr_ifindex) { - if (!mreq.imr_address.s_addr) { + if (!mreq.imr_address.s_addr == INADDR_ANY) { sk->ip_mc_index = 0; sk->ip_mc_addr = 0; return 0; } - dev = ip_dev_find(mreq.imr_address.s_addr, NULL); + dev = ip_dev_find(mreq.imr_address.s_addr); } else dev = dev_get_by_index(mreq.imr_ifindex); if (!dev) - return -ENODEV; + return -EADDRNOTAVAIL; + + if (sk->bound_dev_if && dev->ifindex != sk->bound_dev_if) + return -EINVAL; sk->ip_mc_index = mreq.imr_ifindex; sk->ip_mc_addr = mreq.imr_address.s_addr; return 0; } - case IP_ADD_MEMBERSHIPN: - { - struct ip_mreqn mreq; - struct device *dev = NULL; - if(optlen<sizeof(mreq)) - return -EINVAL; - if(copy_from_user(&mreq,optval,sizeof(mreq))) - return -EFAULT; - dev = dev_get_by_index(mreq.imr_ifindex); - if (!dev) - return -ENODEV; - return ip_mc_join_group(sk,dev,mreq.imr_multiaddr.s_addr); - } - - case IP_DROP_MEMBERSHIPN: + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: { struct ip_mreqn mreq; - struct device *dev=NULL; - - /* - * Check the 
arguments - */ - - if(optlen<sizeof(mreq)) - return -EINVAL; - if(copy_from_user(&mreq,optval,sizeof(mreq))) - return -EFAULT; + + if (optlen < sizeof(struct ip_mreq)) + return -EINVAL; + if (optlen >= sizeof(struct ip_mreqn)) { + if(copy_from_user(&mreq,optval,sizeof(mreq))) + return -EFAULT; + } else { + memset(&mreq, 0, sizeof(mreq)); + if (copy_from_user(&mreq,optval,sizeof(struct ip_mreq))) + return -EFAULT; + } - dev=dev_get_by_index(mreq.imr_ifindex); - if(!dev) - return -ENODEV; - - return ip_mc_leave_group(sk,dev,mreq.imr_multiaddr.s_addr); + if (optname == IP_ADD_MEMBERSHIP) + return ip_mc_join_group(sk,&mreq); + else + return ip_mc_leave_group(sk,&mreq); } + case IP_ROUTER_ALERT: + return ip_ra_control(sk, val ? 1 : 0, NULL); + #ifdef CONFIG_IP_FIREWALL case IP_FW_INSERT_IN: case IP_FW_INSERT_OUT: @@ -616,21 +531,21 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *op return -EFAULT; return 0; } - case IP_RXINFO: - val = (sk->ip_cmsg_flags & 1) != 0; - return 0; - case IP_LOCALADDR: - val = (sk->ip_cmsg_flags & 2) != 0; - return 0; + case IP_PKTINFO: + val = (sk->ip_cmsg_flags & IP_CMSG_PKTINFO) != 0; + break; + case IP_RECVTTL: + val = (sk->ip_cmsg_flags & IP_CMSG_TTL) != 0; + break; + case IP_RECVTOS: + val = (sk->ip_cmsg_flags & IP_CMSG_TOS) != 0; + break; case IP_RECVOPTS: - val = (sk->ip_cmsg_flags & 4) != 0; - return 0; + val = (sk->ip_cmsg_flags & IP_CMSG_RECVOPTS) != 0; + break; case IP_RETOPTS: - val = (sk->ip_cmsg_flags & 8) != 0; - return 0; - case IP_RECVDSTADDR: - val = (sk->ip_cmsg_flags & 0x10) != 0; - return 0; + val = (sk->ip_cmsg_flags & IP_CMSG_RETOPTS) != 0; + break; case IP_TOS: val=sk->ip_tos; break; @@ -642,17 +557,18 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *op break; case IP_PMTUDISC: val=sk->ip_pmtudisc; - return 0; + break; case IP_RECVERR: val=sk->ip_recverr; - return 0; + break; case IP_MULTICAST_TTL: val=sk->ip_mc_ttl; break; case IP_MULTICAST_LOOP: 
val=sk->ip_mc_loop; break; - case IP_MULTICAST_IFN: +#if 0 + case IP_MULTICAST_IF: { struct ip_mreqn mreq; len = min(len,sizeof(struct ip_mreqn)); @@ -665,9 +581,13 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *op return -EFAULT; return 0; } +#endif case IP_MULTICAST_IF: { struct device *dev = dev_get_by_index(sk->ip_mc_index); + + printk(KERN_INFO "application %s uses old get IP_MULTICAST_IF. Please, report!\n", current->comm); + if (dev == NULL) { len = 0; @@ -689,11 +609,19 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *op return(-ENOPROTOOPT); } - len=min(sizeof(int),len); - - if(put_user(len, optlen)) - return -EFAULT; - if(copy_to_user(optval,&val,len)) - return -EFAULT; + if (len < sizeof(int) && len > 0 && val>=0 && val<255) { + unsigned char ucval = (unsigned char)val; + len = 1; + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval,&ucval,1)) + return -EFAULT; + } else { + len=min(sizeof(int),len); + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval,&val,len)) + return -EFAULT; + } return 0; } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 75346d6dc..565116ffc 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -1,6 +1,8 @@ /* * Linux NET3: IP/IP protocol decoder. * + * Version: $Id: ipip.c,v 1.19 1997/11/08 17:50:21 kuznet Exp $ + * * Authors: * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 * @@ -11,6 +13,11 @@ * to keep ip_forward happy. * Alan Cox : More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8). * Kai Schulte : Fixed #defines for IP_FIREWALL->FIREWALL + * David Woodhouse : Perform some basic ICMP handling. + * IPIP Routing without decapsulation. + * Carlos Picoto : GRE over IP support + * Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c. + * I do not want to merge them together. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -18,12 +25,80 @@ * 2 of the License, or (at your option) any later version. * */ + +/* tunnel.c: an IP tunnel driver + + The purpose of this driver is to provide an IP tunnel through + which you can tunnel network traffic transparently across subnets. + + This was written by looking at Nick Holloway's dummy driver + Thanks for the great code! + + -Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 + + Minor tweaks: + Cleaned up the code a little and added some pre-1.3.0 tweaks. + dev->hard_header/hard_header_len changed to use no headers. + Comments/bracketing tweaked. + Made the tunnels use dev->name not tunnel: when error reporting. + Added tx_dropped stat + + -Alan Cox (Alan.Cox@linux.org) 21 March 95 + + Reworked: + Changed to tunnel to destination gateway in addition to the + tunnel's pointopoint address + Almost completely rewritten + Note: There is currently no firewall or ICMP handling done. + + -Sam Lantinga (slouken@cs.ucdavis.edu) 02/13/96 + +*/ + +/* Things I wish I had known when writing the tunnel driver: + + When the tunnel_xmit() function is called, the skb contains the + packet to be sent (plus a great deal of extra info), and dev + contains the tunnel device that _we_ are. + + When we are passed a packet, we are expected to fill in the + source address with our source IP address. + + What is the proper way to allocate, copy and free a buffer? + After you allocate it, it is a "0 length" chunk of memory + starting at zero. If you want to add headers to the buffer + later, you'll have to call "skb_reserve(skb, amount)" with + the amount of memory you want reserved. Then, you call + "skb_put(skb, amount)" with the amount of space you want in + the buffer. skb_put() returns a pointer to the top (#0) of + that buffer. skb->len is set to the amount of space you have + "allocated" with skb_put(). 
You can then write up to skb->len + bytes to that buffer. If you need more, you can call skb_put() + again with the additional amount of space you need. You can + find out how much more space you can allocate by calling + "skb_tailroom(skb)". + Now, to add header space, call "skb_push(skb, header_len)". + This creates space at the beginning of the buffer and returns + a pointer to this new space. If later you need to strip a + header from a buffer, call "skb_pull(skb, header_len)". + skb_headroom() will return how much space is left at the top + of the buffer (before the main data). Remember, this headroom + space must be reserved before the skb_put() function is called. + */ + +/* + This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c + + For comments look at net/ipv4/ip_gre.c --ANK + */ + -#include <linux/module.h> #include <linux/config.h> +#include <linux/module.h> #include <linux/types.h> #include <linux/sched.h> #include <linux/kernel.h> +#include <asm/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> @@ -31,91 +106,673 @@ #include <linux/udp.h> #include <linux/if_arp.h> #include <linux/mroute.h> +#include <linux/init.h> -#include <net/datalink.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/protocol.h> #include <net/ipip.h> -void ipip_err(struct sk_buff *skb, unsigned char *dp) +#define HASH_SIZE 16 +#define HASH(addr) ((addr^(addr>>4))&0xF) + +static int ipip_fb_tunnel_init(struct device *dev); +static int ipip_tunnel_init(struct device *dev); + +static struct device ipip_fb_tunnel_dev = { + NULL, 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipip_fb_tunnel_init, +}; + +static struct ip_tunnel ipip_fb_tunnel = { + NULL, &ipip_fb_tunnel_dev, {0, }, 0, 0, 0, 0, 0, 0, 0, {"tunl0", } +}; + +static struct ip_tunnel *tunnels_r_l[HASH_SIZE]; +static struct ip_tunnel *tunnels_r[HASH_SIZE]; +static struct ip_tunnel *tunnels_l[HASH_SIZE]; +static struct ip_tunnel *tunnels_wc[1]; +static 
struct ip_tunnel **tunnels[4] = { tunnels_wc, tunnels_l, tunnels_r, tunnels_r_l }; + +static struct ip_tunnel * ipip_tunnel_lookup(u32 remote, u32 local) { - /* NI */ - return; + unsigned h0 = HASH(remote); + unsigned h1 = HASH(local); + struct ip_tunnel *t; + + for (t = tunnels_r_l[h0^h1]; t; t = t->next) { + if (local == t->parms.iph.saddr && + remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + return t; + } + for (t = tunnels_r[h0]; t; t = t->next) { + if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + return t; + } + for (t = tunnels_l[h1]; t; t = t->next) { + if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) + return t; + } + if ((t = tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP)) + return t; + return NULL; } -/* - * The IPIP protocol driver. - * - * On entry here - * skb->data is the original IP header - * skb->nh points to the initial IP header. - * skb->h points at the new header. +struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create) +{ + u32 remote = parms->iph.daddr; + u32 local = parms->iph.saddr; + struct ip_tunnel *t, **tp, *nt; + struct device *dev; + unsigned h = 0; + int prio = 0; + + if (remote) { + prio |= 2; + h ^= HASH(remote); + } + if (local) { + prio |= 1; + h ^= HASH(local); + } + for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) { + if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) + return t; + } + if (!create) + return NULL; + + MOD_INC_USE_COUNT; + dev = kmalloc(sizeof(*dev) + sizeof(*t), GFP_KERNEL); + if (dev == NULL) { + MOD_DEC_USE_COUNT; + return NULL; + } + memset(dev, 0, sizeof(*dev) + sizeof(*t)); + dev->priv = (void*)(dev+1); + nt = (struct ip_tunnel*)dev->priv; + nt->dev = dev; + dev->name = nt->parms.name; + dev->init = ipip_tunnel_init; + memcpy(&nt->parms, parms, sizeof(*parms)); + if (dev->name[0] == 0) { + int i; + for (i=1; i<100; i++) { + sprintf(dev->name, "tunl%d", i); + if (dev_get(dev->name) == NULL) + break; + } + if (i==100) + goto 
failed; + memcpy(parms->name, dev->name, IFNAMSIZ); + } + if (register_netdevice(dev) < 0) + goto failed; + + start_bh_atomic(); + nt->next = t; + *tp = nt; + end_bh_atomic(); + /* Do not decrement MOD_USE_COUNT here. */ + return nt; + +failed: + kfree(dev); + MOD_DEC_USE_COUNT; + return NULL; +} + +static void ipip_tunnel_destroy(struct device *dev) +{ + struct ip_tunnel *t, **tp; + struct ip_tunnel *t0 = (struct ip_tunnel*)dev->priv; + u32 remote = t0->parms.iph.daddr; + u32 local = t0->parms.iph.saddr; + unsigned h = 0; + int prio = 0; + + if (dev == &ipip_fb_tunnel_dev) { + tunnels_wc[0] = NULL; + return; + } + + if (remote) { + prio |= 2; + h ^= HASH(remote); + } + if (local) { + prio |= 1; + h ^= HASH(local); + } + for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) { + if (t == t0) { + *tp = t->next; + kfree(dev); + MOD_DEC_USE_COUNT; + break; + } + } +} + + +void ipip_err(struct sk_buff *skb, unsigned char *dp, int len) +{ +#ifndef I_WISH_WORLD_WERE_PERFECT + +/* It is not :-( All the routers (except for Linux) return only + 8 bytes of packet payload. It means, that precise relaying of + ICMP in the real Internet is absolutely infeasible. */ + struct iphdr *iph = (struct iphdr*)dp; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + struct ip_tunnel *t; + + if (len < sizeof(struct iphdr)) + return; + + switch (type) { + default: + case ICMP_PARAMETERPROB: + return; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* Soft state for pmtu is maintained by IP core. */ + return; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe they are just ether pollution. 
--ANK + */ + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + break; + } + + t = ipip_tunnel_lookup(iph->daddr, iph->saddr); + if (t == NULL || t->parms.iph.daddr == 0) + return; + if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) + return; + + if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) + t->err_count++; + else + t->err_count = 1; + t->err_time = jiffies; + return; +#else + struct iphdr *iph = (struct iphdr*)dp; + int hlen = iph->ihl<<2; + struct iphdr *eiph; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + int rel_type = 0; + int rel_code = 0; + int rel_info = 0; + struct sk_buff *skb2; + struct rtable *rt; + + if (len < hlen + sizeof(struct iphdr)) + return; + eiph = (struct iphdr*)(dp + hlen); + + switch (type) { + default: + return; + case ICMP_PARAMETERPROB: + if (skb->h.icmph->un.gateway < hlen) + return; + + /* So... This guy found something strange INSIDE encapsulated + packet. Well, he is fool, but what can we do ? + */ + rel_type = ICMP_PARAMETERPROB; + rel_info = skb->h.icmph->un.gateway - hlen; + break; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* And it is the only really necesary thing :-) */ + rel_info = ntohs(skb->h.icmph->un.frag.mtu); + if (rel_info < hlen+68) + return; + rel_info -= hlen; + /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */ + if (rel_info > ntohs(eiph->tot_len)) + return; + break; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe, it is just ether pollution. 
--ANK + */ + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_HOST_UNREACH; + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + break; + } + + /* Prepare fake skb to feed it to icmp_send */ + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2 == NULL) + return; + dst_release(skb2->dst); + skb2->dst = NULL; + skb_pull(skb2, skb->data - (u8*)eiph); + skb2->nh.raw = skb2->data; + + /* Try to guess incoming interface */ + if (ip_route_output(&rt, eiph->saddr, 0, RT_TOS(eiph->tos), 0)) { + kfree_skb(skb2, FREE_WRITE); + return; + } + skb2->dev = rt->u.dst.dev; + + /* route "incoming" packet */ + if (rt->rt_flags&RTCF_LOCAL) { + ip_rt_put(rt); + rt = NULL; + if (ip_route_output(&rt, eiph->daddr, eiph->saddr, eiph->tos, 0) || + rt->u.dst.dev->type != ARPHRD_IPGRE) { + ip_rt_put(rt); + kfree_skb(skb2, FREE_WRITE); + return; + } + } else { + ip_rt_put(rt); + if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) || + skb2->dst->dev->type != ARPHRD_IPGRE) { + kfree_skb(skb2, FREE_WRITE); + return; + } + } + + /* change mtu on this route */ + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { + if (rel_info > skb2->dst->pmtu) { + kfree_skb(skb2, FREE_WRITE); + return; + } + skb2->dst->pmtu = rel_info; + rel_info = htonl(rel_info); + } else if (type == ICMP_TIME_EXCEEDED) { + struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv; + if (t->parms.iph.ttl) { + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_HOST_UNREACH; + } + } + + icmp_send(skb2, rel_type, rel_code, rel_info); + kfree_skb(skb2, FREE_WRITE); + return; +#endif +} int ipip_rcv(struct sk_buff *skb, unsigned short len) { - struct device *dev; struct iphdr *iph; + struct ip_tunnel *tunnel; -#ifdef TUNNEL_DEBUG - printk("ipip_rcv: got a packet!\n"); -#endif - /* - * Discard the original IP header - */ - - skb_pull(skb, skb->h.raw - skb->nh.raw); - - /* - * Adjust pointers - */ - iph = skb->nh.iph; - skb->nh.iph = skb->h.ipiph; + skb->mac.raw = skb->nh.raw; 
+ skb->nh.raw = skb_pull(skb, skb->h.raw - skb->data); memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); - - /* - * If you want to add LZ compressed IP or things like that here, - * and in drivers/net/tunnel.c are the places to add. - */ - - skb->protocol = htons(ETH_P_IP); + skb->protocol = __constant_htons(ETH_P_IP); skb->ip_summed = 0; skb->pkt_type = PACKET_HOST; + if ((tunnel = ipip_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) { + tunnel->stat.rx_packets++; + tunnel->stat.rx_bytes += skb->len; + skb->dev = tunnel->dev; + dst_release(skb->dst); + skb->dst = NULL; + netif_rx(skb); + return 0; + } + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); + kfree_skb(skb, FREE_READ); + return 0; +} + +/* + * This function assumes it is being called from dev_queue_xmit() + * and that skb is filled properly by that function. + */ + +static int ipip_tunnel_xmit(struct sk_buff *skb, struct device *dev) +{ + struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + struct net_device_stats *stats = &tunnel->stat; + struct iphdr *tiph = &tunnel->parms.iph; + u8 tos = tunnel->parms.iph.tos; + u16 df = tiph->frag_off; + struct rtable *rt; /* Route to the other host */ + struct device *tdev; /* Device to other host */ + struct iphdr *old_iph = skb->nh.iph; + struct iphdr *iph; /* Our new IP header */ + int max_headroom; /* The extra header space needed */ + u32 dst = tiph->daddr; + int mtu; + + if (tunnel->recursion++) { + tunnel->stat.collisions++; + goto tx_error; + } + + if (skb->protocol != __constant_htons(ETH_P_IP)) + goto tx_error; + + if (tos&1) + tos = old_iph->tos; + + if (!dst) { + /* NBMA tunnel */ + if ((rt = (struct rtable*)skb->dst) == NULL) { + tunnel->stat.tx_fifo_errors++; + goto tx_error; + } + if ((dst = rt->rt_gateway) == 0) + goto tx_error_icmp; + } + + if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link)) { + tunnel->stat.tx_carrier_errors++; + goto tx_error_icmp; + } + tdev = rt->u.dst.dev; + + if (tdev == dev) { 
+ ip_rt_put(rt); + tunnel->stat.collisions++; + goto tx_error; + } + + mtu = rt->u.dst.pmtu - sizeof(struct iphdr); + if (mtu < 68) { + tunnel->stat.collisions++; + ip_rt_put(rt); + goto tx_error; + } + if (skb->dst && mtu < skb->dst->pmtu) + skb->dst->pmtu = mtu; + + df |= (old_iph->frag_off&__constant_htons(IP_DF)); + + if ((old_iph->frag_off&__constant_htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + goto tx_error; + } + + if (tunnel->err_count > 0) { + if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { + tunnel->err_count--; + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); + } else + tunnel->err_count = 0; + } + + skb->h.raw = skb->nh.raw; + /* - * Is it draconic? I do not think so. --ANK + * Okay, now see if we can stuff it in the buffer as-is. */ - dev = ip_dev_find_tunnel(iph->daddr, iph->saddr); - if (dev == NULL) { -#ifdef CONFIG_IP_MROUTE - int vif; - - if (!MULTICAST(skb->nh.iph->daddr) || - !ipv4_config.multicast_route || - LOCAL_MCAST(skb->nh.iph->daddr) || - (vif=ip_mr_find_tunnel(iph->daddr, iph->saddr)) < 0) - { -#endif - kfree_skb(skb, FREE_READ); - return -EINVAL; -#ifdef CONFIG_IP_MROUTE + max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr)); + + if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + ip_rt_put(rt); + stats->tx_dropped++; + dev_kfree_skb(skb, FREE_WRITE); + tunnel->recursion--; + return 0; } - IPCB(skb)->flags |= IPSKB_TUNNELED; - IPCB(skb)->vif = vif; - dev = skb->dev; -#endif + dev_kfree_skb(skb, FREE_WRITE); + skb = new_skb; } - skb->dev = dev; + + skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); dst_release(skb->dst); - skb->dst = NULL; - netif_rx(skb); - return(0); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. 
+ */ + + iph = skb->nh.iph; + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; + iph->frag_off = df; + iph->protocol = IPPROTO_IPIP; + iph->tos = tos; + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + + if ((iph->ttl = tiph->ttl) == 0) + iph->ttl = old_iph->ttl; + + iph->tot_len = htons(skb->len); + iph->id = htons(ip_id_count++); + ip_send_check(iph); + + stats->tx_bytes += skb->len; + stats->tx_packets++; + ip_send(skb); + tunnel->recursion--; + return 0; + +tx_error_icmp: + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); +tx_error: + stats->tx_errors++; + dev_kfree_skb(skb, FREE_WRITE); + tunnel->recursion--; + return 0; +} + +static int +ipip_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + struct ip_tunnel_parm p; + struct ip_tunnel *t; + + MOD_INC_USE_COUNT; + + switch (cmd) { + case SIOCGETTUNNEL: + t = NULL; + if (dev == &ipip_fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + err = -EFAULT; + break; + } + t = ipip_tunnel_locate(&p, 0); + } + if (t == NULL) + t = (struct ip_tunnel*)dev->priv; + memcpy(&p, &t->parms, sizeof(p)); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + break; + + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + + err = -EINVAL; + if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || + p.iph.ihl != 5 || (p.iph.frag_off&__constant_htons(~IP_DF))) + goto done; + if (p.iph.ttl) + p.iph.frag_off |= __constant_htons(IP_DF); + + t = ipip_tunnel_locate(&p, cmd == SIOCADDTUNNEL); + + if (t) { + err = 0; + if (cmd == SIOCCHGTUNNEL) { + t->parms.iph.ttl = p.iph.ttl; + t->parms.iph.tos = p.iph.tos; + t->parms.iph.frag_off = p.iph.frag_off; + } + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) + err = -EFAULT; + } else + err = (cmd == SIOCADDTUNNEL ? 
-ENOBUFS : -ENOENT); + break; + + case SIOCDELTUNNEL: + if (dev == &ipip_fb_tunnel_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + err = -ENOENT; + if ((t = ipip_tunnel_locate(&p, 0)) == NULL) + goto done; + err = -EPERM; + if (t == &ipip_fb_tunnel) + goto done; + } + err = unregister_netdevice(dev); + break; + + default: + err = -EINVAL; + } + +done: + MOD_DEC_USE_COUNT; + return err; +} + +static struct net_device_stats *ipip_tunnel_get_stats(struct device *dev) +{ + return &(((struct ip_tunnel*)dev->priv)->stat); +} + +static int ipip_tunnel_change_mtu(struct device *dev, int new_mtu) +{ + if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr)) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +static void ipip_tunnel_init_gen(struct device *dev) +{ + struct ip_tunnel *t = (struct ip_tunnel*)dev->priv; + + dev->destructor = ipip_tunnel_destroy; + dev->hard_start_xmit = ipip_tunnel_xmit; + dev->get_stats = ipip_tunnel_get_stats; + dev->do_ioctl = ipip_tunnel_ioctl; + dev->change_mtu = ipip_tunnel_change_mtu; + + dev_init_buffers(dev); + + dev->type = ARPHRD_TUNNEL; + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); + dev->mtu = 1500 - sizeof(struct iphdr); + dev->flags = IFF_NOARP; + dev->iflink = 0; + dev->addr_len = 4; + memcpy(dev->dev_addr, &t->parms.iph.saddr, 4); + memcpy(dev->broadcast, &t->parms.iph.daddr, 4); +} + +static int ipip_tunnel_init(struct device *dev) +{ + struct device *tdev = NULL; + struct ip_tunnel *tunnel; + struct iphdr *iph; + + tunnel = (struct ip_tunnel*)dev->priv; + iph = &tunnel->parms.iph; + + ipip_tunnel_init_gen(dev); + + if (iph->daddr) { + struct rtable *rt; + if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link)) { + tdev = rt->u.dst.dev; + ip_rt_put(rt); + } + dev->flags |= IFF_POINTOPOINT; + } + + if (!tdev && tunnel->parms.link) + tdev = dev_get_by_index(tunnel->parms.link); + + if (tdev) { + dev->hard_header_len = 
tdev->hard_header_len + sizeof(struct iphdr); + dev->mtu = tdev->mtu - sizeof(struct iphdr); + } + dev->iflink = tunnel->parms.link; + + return 0; } #ifdef MODULE +static int ipip_fb_tunnel_open(struct device *dev) +{ + MOD_INC_USE_COUNT; + return 0; +} + +static int ipip_fb_tunnel_close(struct device *dev) +{ + MOD_DEC_USE_COUNT; + return 0; +} +#endif + +__initfunc(int ipip_fb_tunnel_init(struct device *dev)) +{ + struct iphdr *iph; + + ipip_tunnel_init_gen(dev); +#ifdef MODULE + dev->open = ipip_fb_tunnel_open; + dev->stop = ipip_fb_tunnel_close; +#endif + + iph = &ipip_fb_tunnel.parms.iph; + iph->version = 4; + iph->protocol = IPPROTO_IPIP; + iph->ihl = 5; + + tunnels_wc[0] = &ipip_fb_tunnel; + return 0; +} static struct inet_protocol ipip_protocol = { ipip_rcv, /* IPIP handler */ @@ -127,21 +784,34 @@ static struct inet_protocol ipip_protocol = { "IPIP" /* name */ }; +#ifdef MODULE +int init_module(void) +#else +__initfunc(int ipip_init(void)) +#endif +{ + printk(KERN_INFO "IPv4 over IPv4 tunneling driver\n"); -/* - * And now the modules code and kernel interface. - */ + ipip_fb_tunnel_dev.priv = (void*)&ipip_fb_tunnel; + ipip_fb_tunnel_dev.name = ipip_fb_tunnel.parms.name; +#ifdef MODULE + register_netdev(&ipip_fb_tunnel_dev); +#else + register_netdevice(&ipip_fb_tunnel_dev); +#endif -int init_module( void) -{ inet_add_protocol(&ipip_protocol); return 0; } -void cleanup_module( void) +#ifdef MODULE + +void cleanup_module(void) { if ( inet_del_protocol(&ipip_protocol) < 0 ) printk(KERN_INFO "ipip close: can't remove protocol\n"); + + unregister_netdevice(&ipip_fb_tunnel_dev); } #endif diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 20246148a..9909f32b0 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -9,6 +9,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
* + * Version: $Id: ipmr.c,v 1.28 1997/10/30 00:43:16 davem Exp $ * * Fixes: * Michael Chastain : Incorrect size of copying. @@ -20,14 +21,8 @@ * Alexey Kuznetsov : Status, optimisations and more. * Brad Parker : Better behaviour on mrouted upcall * overflow. + * Carlos Picoto : PIMv1 Support * - * Status: - * Cache manager under test. Forwarding in vague test mode - * Todo: - * Flow control - * Finish Tunnels - * Debug cache ttl handling properly - * Resolve IFF_ALLMULTI for rest of cards */ #include <linux/config.h> @@ -45,6 +40,8 @@ #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> #include <linux/proc_fs.h> #include <linux/mroute.h> #include <linux/init.h> @@ -54,9 +51,16 @@ #include <net/sock.h> #include <net/icmp.h> #include <net/udp.h> +#include <net/raw.h> #include <linux/notifier.h> +#include <linux/if_arp.h> +#include <net/ipip.h> #include <net/checksum.h> +#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) +#define CONFIG_IP_PIMSM 1 +#endif + /* * Multicast router control variables */ @@ -64,10 +68,133 @@ static struct vif_device vif_table[MAXVIFS]; /* Devices */ static unsigned long vifc_map; /* Active device map */ static int maxvif; -int mroute_do_pim = 0; /* Set in PIM assert */ +int mroute_do_assert = 0; /* Set in PIM assert */ +int mroute_do_pim = 0; static struct mfc_cache *mfc_cache_array[MFC_LINES]; /* Forwarding cache */ int cache_resolve_queue_len = 0; /* Size of unresolved */ +static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local); +static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert); +static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); + +extern struct inet_protocol pim_protocol; + +static +struct device *ipmr_new_tunnel(struct vifctl *v) +{ + struct device *dev = NULL; + + rtnl_lock(); + dev = dev_get("tunl0"); + + if (dev) { + int err; + struct ifreq ifr; + 
mm_segment_t oldfs; + struct ip_tunnel_parm p; + struct in_device *in_dev; + + memset(&p, 0, sizeof(p)); + p.iph.daddr = v->vifc_rmt_addr.s_addr; + p.iph.saddr = v->vifc_lcl_addr.s_addr; + p.iph.version = 4; + p.iph.ihl = 5; + p.iph.protocol = IPPROTO_IPIP; + sprintf(p.name, "dvmrp%d", v->vifc_vifi); + ifr.ifr_ifru.ifru_data = (void*)&p; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL); + set_fs(oldfs); + + if (err == 0 && (dev = dev_get(p.name)) != NULL) { + dev->flags |= IFF_MULTICAST; + + in_dev = dev->ip_ptr; + if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL) + goto failure; + + if (dev_open(dev)) + goto failure; + } + } + rtnl_unlock(); + return dev; + +failure: + unregister_netdevice(dev); + rtnl_unlock(); + return NULL; +} + +#ifdef CONFIG_IP_PIMSM + +static int reg_vif_num = -1; +static struct device * reg_dev; + +static int reg_vif_xmit(struct sk_buff *skb, struct device *dev) +{ + ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT); + kfree_skb(skb, FREE_WRITE); + return 0; +} + +static struct net_device_stats *reg_vif_get_stats(struct device *dev) +{ + return (struct net_device_stats*)dev->priv; +} + +static +struct device *ipmr_reg_vif(struct vifctl *v) +{ + struct device *dev; + struct in_device *in_dev; + int size; + + size = sizeof(*dev) + IFNAMSIZ + sizeof(struct net_device_stats); + dev = kmalloc(size, GFP_KERNEL); + if (!dev) + return NULL; + + memset(dev, 0, size); + + dev->priv = dev + 1; + dev->name = dev->priv + sizeof(struct net_device_stats); + + strcpy(dev->name, "pimreg"); + + dev->type = ARPHRD_PIMREG; + dev->mtu = 1500 - sizeof(struct iphdr) - 8; + dev->flags = IFF_NOARP; + dev->hard_start_xmit = reg_vif_xmit; + dev->get_stats = reg_vif_get_stats; + + rtnl_lock(); + + if (register_netdevice(dev)) { + rtnl_unlock(); + kfree(dev); + return NULL; + } + + if ((in_dev = inetdev_init(dev)) == NULL) + goto failure; + + if (dev_open(dev)) + goto failure; + + rtnl_unlock(); + reg_dev = dev; + 
return dev; + +failure: + unregister_netdevice(dev); + rtnl_unlock(); + kfree(dev); + return NULL; +} +#endif + /* * Delete a VIF entry */ @@ -75,27 +202,35 @@ int cache_resolve_queue_len = 0; /* Size of unresolved */ static int vif_delete(int vifi) { struct vif_device *v; + struct device *dev; + struct in_device *in_dev; if (vifi < 0 || vifi >= maxvif || !(vifc_map&(1<<vifi))) return -EADDRNOTAVAIL; v = &vif_table[vifi]; - start_bh_atomic(); + dev = v->dev; + v->dev = NULL; + vifc_map &= ~(1<<vifi); - if (!(v->flags&VIFF_TUNNEL)) { - v->u.dev->flags &= ~IFF_ALLMULTI; - dev_mc_upload(v->u.dev); - ip_rt_multicast_event(v->u.dev); - v->u.dev = NULL; - } else { - ip_rt_put(v->u.rt); - v->u.rt = NULL; - } + if ((in_dev = dev->ip_ptr) != NULL) + in_dev->flags &= ~IFF_IP_MFORWARD; - vifc_map&=~(1<<vifi); + dev_set_allmulti(dev, -1); + ip_rt_multicast_event(in_dev); - end_bh_atomic(); + if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER)) { +#ifdef CONFIG_IP_PIMSM + if (vifi == reg_vif_num) { + reg_vif_num = -1; + reg_dev = NULL; + } +#endif + unregister_netdevice(dev); + if (v->flags&VIFF_REGISTER) + kfree(dev); + } if (vifi+1 == maxvif) { int tmp; @@ -108,21 +243,27 @@ static int vif_delete(int vifi) return 0; } -static void ipmr_set_bounds(struct mfc_cache *cache) +static void ipmr_update_threshoulds(struct mfc_cache *cache, unsigned char *ttls) { int vifi; + + start_bh_atomic(); + + cache->mfc_minvif = MAXVIFS; + cache->mfc_maxvif = 0; + memset(cache->mfc_ttls, 255, MAXVIFS); + for (vifi=0; vifi<maxvif; vifi++) { - if (vifc_map&(1<<vifi) && cache->mfc_ttls[vifi]) { - cache->mfc_minvif = vifi; - cache->mfc_maxvif = vifi+1; + if (vifc_map&(1<<vifi) && ttls[vifi] && ttls[vifi] < 255) { + cache->mfc_ttls[vifi] = ttls[vifi]; + if (cache->mfc_minvif > vifi) + cache->mfc_minvif = vifi; + if (cache->mfc_maxvif <= vifi) + cache->mfc_maxvif = vifi + 1; vifi++; - break; } } - for ( ; vifi<maxvif; vifi++) { - if (vifc_map&(1<<vifi) && cache->mfc_ttls[vifi]) - cache->mfc_maxvif = vifi+1; - } 
+ end_bh_atomic(); } /* @@ -148,7 +289,7 @@ static void ipmr_cache_delete(struct mfc_cache *cache) /* * Unlink the buffer */ - + while(*cp!=NULL) { if(*cp==cache) @@ -158,7 +299,7 @@ static void ipmr_cache_delete(struct mfc_cache *cache) } cp=&((*cp)->next); } - + /* * Free the buffer. If it is a pending resolution * clean up the other resources. @@ -167,8 +308,19 @@ static void ipmr_cache_delete(struct mfc_cache *cache) if(cache->mfc_flags&MFC_QUEUED) { cache_resolve_queue_len--; - while((skb=skb_dequeue(&cache->mfc_unresolved))) + while((skb=skb_dequeue(&cache->mfc_unresolved))) { +#ifdef CONFIG_RTNETLINK + if (skb->nh.iph->version == 0) { + struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); + nlh->nlmsg_type = NLMSG_ERROR; + nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + skb_trim(skb, nlh->nlmsg_len); + ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT; + netlink_unicast(rtnl, skb, NETLINK_CB(skb).pid, MSG_DONTWAIT); + } else +#endif kfree_skb(skb, FREE_WRITE); + } } kfree_s(cache,sizeof(cache)); } @@ -222,14 +374,12 @@ static struct mfc_cache *ipmr_cache_alloc(int priority) struct mfc_cache *c=(struct mfc_cache *)kmalloc(sizeof(struct mfc_cache), priority); if(c==NULL) return NULL; - c->mfc_queuelen=0; + memset(c, 0, sizeof(*c)); skb_queue_head_init(&c->mfc_unresolved); init_timer(&c->mfc_timer); c->mfc_timer.data=(long)c; c->mfc_timer.function=ipmr_cache_timer; - c->mfc_last_assert=0; c->mfc_minvif = MAXVIFS; - c->mfc_maxvif = 0; return c; } @@ -259,8 +409,26 @@ static void ipmr_cache_resolve(struct mfc_cache *cache) /* * Play the pending entries through our router */ - while((skb=skb_dequeue(&cache->mfc_unresolved))) - ip_mr_input(skb); + while((skb=skb_dequeue(&cache->mfc_unresolved))) { +#ifdef CONFIG_RTNETLINK + if (skb->nh.iph->version == 0) { + int err; + struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); + + if (ipmr_fill_mroute(skb, cache, NLMSG_DATA(nlh)) > 0) { + nlh->nlmsg_len 
= skb->tail - (u8*)nlh; + } else { + nlh->nlmsg_type = NLMSG_ERROR; + nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + skb_trim(skb, nlh->nlmsg_len); + ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE; + } + err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).pid, MSG_DONTWAIT); + if (err < 0) printk(KERN_DEBUG "Err=%d", err); + } else +#endif + ip_mr_forward(skb, cache, 0); + } } /* @@ -270,15 +438,40 @@ static void ipmr_cache_resolve(struct mfc_cache *cache) static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) { - struct sk_buff *skb = alloc_skb(128, GFP_ATOMIC); + struct sk_buff *skb; int ihl = pkt->nh.iph->ihl<<2; struct igmphdr *igmp; struct igmpmsg *msg; int ret; +#ifdef CONFIG_IP_PIMSM + if (assert == IGMPMSG_WHOLEPKT) + skb = skb_realloc_headroom(pkt, sizeof(struct iphdr)); + else +#endif + skb = alloc_skb(128, GFP_ATOMIC); + if(!skb) - return -ENOMEM; - + return -ENOBUFS; + +#ifdef CONFIG_IP_PIMSM + if (assert == IGMPMSG_WHOLEPKT) { + /* Ugly, but we have no choice with this interface. + Duplicate old header, fix ihl, length etc. 
+ And all this only to mangle msg->im_msgtype and + to set msg->im_mbz to "mbz" :-) + */ + msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr)); + skb->nh.raw = skb->h.raw = (u8*)msg; + memcpy(msg, pkt->nh.raw, sizeof(struct iphdr)); + msg->im_msgtype = IGMPMSG_WHOLEPKT; + msg->im_mbz = 0; + msg->im_vif = reg_vif_num; + skb->nh.iph->ihl = sizeof(struct iphdr) >> 2; + skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr)); + } else { +#endif + /* * Copy the IP header */ @@ -287,33 +480,30 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) memcpy(skb->data,pkt->data,ihl); skb->nh.iph->protocol = 0; /* Flag to the kernel this is a route add */ msg = (struct igmpmsg*)skb->nh.iph; - if (assert) - msg->im_vif = vifi; - + msg->im_vif = vifi; + skb->dst = dst_clone(pkt->dst); + /* * Add our header */ - + igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr)); igmp->type = - msg->im_msgtype = assert ? IGMPMSG_WRONGVIF : IGMPMSG_NOCACHE; + msg->im_msgtype = assert; igmp->code = 0; skb->nh.iph->tot_len=htons(skb->len); /* Fix the length */ skb->h.raw = skb->nh.raw; +#ifdef CONFIG_IP_PIMSM + } +#endif /* * Deliver to mrouted */ - if((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) - { - static unsigned long last_warn; - if(jiffies-last_warn>10*HZ) - { - last_warn=jiffies; - printk("mroute: pending queue full, dropping entries.\n"); - } + if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) { + if (net_ratelimit()) + printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); kfree_skb(skb, FREE_READ); - return ret; } return ret; @@ -323,7 +513,7 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) * Queue a packet for resolution */ -static void ipmr_cache_unresolved(struct mfc_cache *cache, vifi_t vifi, struct sk_buff *skb) +static int ipmr_cache_unresolved(struct mfc_cache *cache, vifi_t vifi, struct sk_buff *skb) { if(cache==NULL) { @@ -333,12 +523,12 @@ static void 
ipmr_cache_unresolved(struct mfc_cache *cache, vifi_t vifi, struct s if(cache_resolve_queue_len>=10 || (cache=ipmr_cache_alloc(GFP_ATOMIC))==NULL) { kfree_skb(skb, FREE_WRITE); - return; + return -ENOBUFS; } /* * Fill in the new cache entry */ - cache->mfc_parent=vifi; + cache->mfc_parent=ALL_VIFS; cache->mfc_origin=skb->nh.iph->saddr; cache->mfc_mcastgrp=skb->nh.iph->daddr; cache->mfc_flags=MFC_QUEUED; @@ -358,9 +548,16 @@ static void ipmr_cache_unresolved(struct mfc_cache *cache, vifi_t vifi, struct s if(mroute_socket) { /* If the report failed throw the cache entry - out - Brad Parker */ - if(ipmr_cache_report(skb, vifi, 0)<0) + out - Brad Parker + + OK, OK, Brad. Only do not forget to free skb + and return :-) --ANK + */ + if (ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE)<0) { ipmr_cache_delete(cache); + kfree_skb(skb, FREE_WRITE); + return -ENOBUFS; + } } } /* @@ -369,10 +566,11 @@ static void ipmr_cache_unresolved(struct mfc_cache *cache, vifi_t vifi, struct s if(cache->mfc_queuelen>3) { kfree_skb(skb, FREE_WRITE); - return; + return -ENOBUFS; } cache->mfc_queuelen++; skb_queue_tail(&cache->mfc_unresolved,skb); + return 0; } /* @@ -416,8 +614,7 @@ int ipmr_mfc_modify(int action, struct mfcctl *mfc) cache->mfc_flags|=MFC_RESOLVED; cache->mfc_parent=mfc->mfcc_parent; - memcpy(cache->mfc_ttls, mfc->mfcc_ttls,sizeof(cache->mfc_ttls)); - ipmr_set_bounds(cache); + ipmr_update_threshoulds(cache, mfc->mfcc_ttls); /* * Check to see if we resolved a queued list. 
If so we @@ -445,13 +642,21 @@ int ipmr_mfc_modify(int action, struct mfcctl *mfc) cache->mfc_origin=mfc->mfcc_origin.s_addr; cache->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr; cache->mfc_parent=mfc->mfcc_parent; - memcpy(cache->mfc_ttls, mfc->mfcc_ttls,sizeof(cache->mfc_ttls)); - ipmr_set_bounds(cache); + ipmr_update_threshoulds(cache, mfc->mfcc_ttls); ipmr_cache_insert(cache); end_bh_atomic(); return 0; } - + +static void mrtsock_destruct(struct sock *sk) +{ + if (sk == mroute_socket) { + ipv4_config.multicast_route = 0; + mroute_socket=NULL; + mroute_close(sk); + } +} + /* * Socket options and virtual interface manipulation. The whole * virtual interface system is a complete heap, but unfortunately @@ -461,7 +666,6 @@ int ipmr_mfc_modify(int action, struct mfcctl *mfc) int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen) { - int err; struct vifctl vif; struct mfcctl mfc; @@ -480,9 +684,8 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen) return -ENOPROTOOPT; { int opt; - err = get_user(opt,(int *)optval); - if (err) - return err; + if (get_user(opt,(int *)optval)) + return -EFAULT; if (opt != 1) return -ENOPROTOOPT; } @@ -490,78 +693,101 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen) return -EADDRINUSE; mroute_socket=sk; ipv4_config.multicast_route = 1; - /* Initialise state */ - return 0; + if (ip_ra_control(sk, 1, mrtsock_destruct) == 0) + return 0; + mrtsock_destruct(sk); + return -EADDRINUSE; case MRT_DONE: - ipv4_config.multicast_route = 0; - mroute_close(sk); - mroute_socket=NULL; + mrtsock_destruct(sk); return 0; case MRT_ADD_VIF: case MRT_DEL_VIF: if(optlen!=sizeof(vif)) return -EINVAL; - err = copy_from_user(&vif,optval,sizeof(vif)); - if (err) + if (copy_from_user(&vif,optval,sizeof(vif))) return -EFAULT; - if(vif.vifc_vifi > MAXVIFS) + if(vif.vifc_vifi >= MAXVIFS) return -ENFILE; if(optname==MRT_ADD_VIF) { struct vif_device *v=&vif_table[vif.vifc_vifi]; struct device 
*dev; - /* Empty vif ? */ - if(vifc_map&(1<<vif.vifc_vifi)) + struct in_device *in_dev; + + /* Is vif busy ? */ + if (vifc_map&(1<<vif.vifc_vifi)) return -EADDRINUSE; - /* Find the interface */ - dev=ip_dev_find(vif.vifc_lcl_addr.s_addr, NULL); - if(!dev) - return -EADDRNOTAVAIL; - /* Must be tunnelled or multicastable */ - if(vif.vifc_flags&VIFF_TUNNEL) - { - if(vif.vifc_flags&VIFF_SRCRT) - return -EOPNOTSUPP; - } - else - { - if(dev->flags&IFF_MULTICAST) - { - /* Most ethernet cards don't know - how to do this yet.. */ - dev->flags|=IFF_ALLMULTI; - dev_mc_upload(dev); - ip_rt_multicast_event(dev); - } - else - { - /* We are stuck.. */ - return -EOPNOTSUPP; + + switch (vif.vifc_flags) { +#ifdef CONFIG_IP_PIMSM + case VIFF_REGISTER: + + /* + * Special Purpose VIF in PIM + * All the packets will be sent to the daemon + */ + if (reg_vif_num >= 0) + return -EADDRINUSE; + reg_vif_num = vif.vifc_vifi; + dev = ipmr_reg_vif(&vif); + if (!dev) { + reg_vif_num = -1; + return -ENOBUFS; } + break; +#endif + case VIFF_TUNNEL: + dev = ipmr_new_tunnel(&vif); + if (!dev) + return -ENOBUFS; + break; + case 0: + dev=ip_dev_find(vif.vifc_lcl_addr.s_addr); + if (!dev) + return -EADDRNOTAVAIL; + break; + default: + printk(KERN_DEBUG "ipmr_add_vif: flags %02x\n", vif.vifc_flags); + return -EINVAL; } + + if ((in_dev = dev->ip_ptr) == NULL) + return -EADDRNOTAVAIL; + if (in_dev->flags & IFF_IP_MFORWARD) + return -EADDRINUSE; + in_dev->flags |= IFF_IP_MFORWARD; + dev_set_allmulti(dev, +1); + ip_rt_multicast_event(in_dev); + /* * Fill in the VIF structures */ - cli(); + start_bh_atomic(); v->rate_limit=vif.vifc_rate_limit; v->local=vif.vifc_lcl_addr.s_addr; v->remote=vif.vifc_rmt_addr.s_addr; v->flags=vif.vifc_flags; v->threshold=vif.vifc_threshold; - v->u.dev=NULL; - if (!(vif.vifc_flags&VIFF_TUNNEL)) - v->u.dev=dev; + v->dev=dev; v->bytes_in = 0; v->bytes_out = 0; v->pkt_in = 0; v->pkt_out = 0; + v->link = dev->ifindex; + if (vif.vifc_flags&(VIFF_TUNNEL|VIFF_REGISTER)) + v->link = 
dev->iflink; vifc_map|=(1<<vif.vifc_vifi); if (vif.vifc_vifi+1 > maxvif) maxvif = vif.vifc_vifi+1; - sti(); + end_bh_atomic(); return 0; - } else - return vif_delete(vif.vifc_vifi); + } else { + int ret; + rtnl_lock(); + ret = vif_delete(vif.vifc_vifi); + rtnl_unlock(); + return ret; + } /* * Manipulate the forwarding caches. These live @@ -571,8 +797,9 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen) case MRT_DEL_MFC: if(optlen!=sizeof(mfc)) return -EINVAL; - err = copy_from_user(&mfc,optval, sizeof(mfc)); - return err ? -EFAULT : ipmr_mfc_modify(optname, &mfc); + if (copy_from_user(&mfc,optval, sizeof(mfc))) + return -EFAULT; + return ipmr_mfc_modify(optname, &mfc); /* * Control PIM assert. */ @@ -581,9 +808,29 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen) int v; if(get_user(v,(int *)optval)) return -EFAULT; - mroute_do_pim=(v)?1:0; + mroute_do_assert=(v)?1:0; return 0; } +#ifdef CONFIG_IP_PIMSM + case MRT_PIM: + { + int v; + if(get_user(v,(int *)optval)) + return -EFAULT; + v = (v)?1:0; + if (v != mroute_do_pim) { + mroute_do_pim = v; + mroute_do_assert = v; +#ifdef CONFIG_IP_PIMSM_V2 + if (mroute_do_pim) + inet_add_protocol(&pim_protocol); + else + inet_del_protocol(&pim_protocol); +#endif + } + return 0; + } +#endif /* * Spurious command, or MRT_VERSION which you cannot * set. 
@@ -604,7 +851,11 @@ int ip_mroute_getsockopt(struct sock *sk,int optname,char *optval,int *optlen) if(sk!=mroute_socket) return -EACCES; - if(optname!=MRT_VERSION && optname!=MRT_ASSERT) + if(optname!=MRT_VERSION && +#ifdef CONFIG_IP_PIMSM + optname!=MRT_PIM && +#endif + optname!=MRT_ASSERT) return -ENOPROTOOPT; if(get_user(olr, optlen)) @@ -615,8 +866,12 @@ int ip_mroute_getsockopt(struct sock *sk,int optname,char *optval,int *optlen) return -EFAULT; if(optname==MRT_VERSION) val=0x0305; - else +#ifdef CONFIG_IP_PIMSM + else if(optname==MRT_PIM) val=mroute_do_pim; +#endif + else + val=mroute_do_assert; if(copy_to_user(optval,&val,olr)) return -EFAULT; return 0; @@ -628,7 +883,6 @@ int ip_mroute_getsockopt(struct sock *sk,int optname,char *optval,int *optlen) int ipmr_ioctl(struct sock *sk, int cmd, unsigned long arg) { - int err; struct sioc_sg_req sr; struct sioc_vif_req vr; struct vif_device *vif; @@ -637,8 +891,7 @@ int ipmr_ioctl(struct sock *sk, int cmd, unsigned long arg) switch(cmd) { case SIOCGETVIFCNT: - err = copy_from_user(&vr,(void *)arg,sizeof(vr)); - if (err) + if (copy_from_user(&vr,(void *)arg,sizeof(vr))) return -EFAULT; if(vr.vifi>=maxvif) return -EINVAL; @@ -649,16 +902,13 @@ int ipmr_ioctl(struct sock *sk, int cmd, unsigned long arg) vr.ocount=vif->pkt_out; vr.ibytes=vif->bytes_in; vr.obytes=vif->bytes_out; - err = copy_to_user((void *)arg,&vr,sizeof(vr)); - if (err) - err = -EFAULT; - return err; + if (copy_to_user((void *)arg,&vr,sizeof(vr))) + return -EFAULT; return 0; } return -EADDRNOTAVAIL; case SIOCGETSGCNT: - err = copy_from_user(&sr,(void *)arg,sizeof(sr)); - if (err) + if (copy_from_user(&sr,(void *)arg,sizeof(sr))) return -EFAULT; for (c = mfc_cache_array[MFC_HASH(sr.grp.s_addr, sr.src.s_addr)]; c; c = c->next) { @@ -667,10 +917,8 @@ int ipmr_ioctl(struct sock *sk, int cmd, unsigned long arg) sr.pktcnt = c->mfc_pkt; sr.bytecnt = c->mfc_bytes; sr.wrong_if = c->mfc_wrong_if; - err = copy_to_user((void *)arg,&sr,sizeof(sr)); - if (err) 
- err = -EFAULT; - return err; + if (copy_to_user((void *)arg,&sr,sizeof(sr))) + return -EFAULT; return 0; } } @@ -691,9 +939,10 @@ void mroute_close(struct sock *sk) /* * Shut down all active vif entries */ - + rtnl_lock(); for(i=0; i<maxvif; i++) vif_delete(i); + rtnl_unlock(); /* * Wipe the cache @@ -711,12 +960,11 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v { struct vif_device *v; int ct; - if(event!=NETDEV_DOWN) + if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; v=&vif_table[0]; - for(ct=0;ct<maxvif;ct++) - { - if(vifc_map&(1<<ct) && !(v->flags&VIFF_TUNNEL) && v->u.dev==ptr) + for(ct=0;ct<maxvif;ct++) { + if (vifc_map&(1<<ct) && v->dev==ptr) vif_delete(ct); v++; } @@ -769,26 +1017,24 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, struct rtable *rt; int encap = 0; struct sk_buff *skb2; - int err; - + +#ifdef CONFIG_IP_PIMSM + if (vif->flags & VIFF_REGISTER) { + vif->pkt_out++; + vif->bytes_out+=skb->len; + ((struct net_device_stats*)vif->dev->priv)->tx_bytes += skb->len; + ((struct net_device_stats*)vif->dev->priv)->tx_packets++; + ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT); + return; + } +#endif + if (vif->flags&VIFF_TUNNEL) { - rt = vif->u.rt; - if (!rt || rt->u.dst.obsolete) { - ip_rt_put(rt); - vif->u.rt = NULL; - err = ip_route_output(&rt, vif->remote, vif->local, RT_TOS(iph->tos), NULL); - if (err) - return; - vif->u.rt = rt; - } - dst_clone(&rt->u.dst); + if (ip_route_output(&rt, vif->remote, vif->local, RT_TOS(iph->tos), vif->link)) + return; encap = sizeof(struct iphdr); } else { - dev = vif->u.dev; - if (dev == NULL) - return; - err = ip_route_output(&rt, iph->daddr, 0, RT_TOS(iph->tos), dev); - if (err) + if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(iph->tos), vif->link)) return; } @@ -807,10 +1053,14 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, return; } - if (skb_headroom(skb) < encap || (encap && !last)) + if (skb_headroom(skb) < encap || 
skb_cloned(skb) || !last) skb2 = skb_realloc_headroom(skb, (encap + 15)&~15); - else + else if (atomic_read(&skb->users) != 1) skb2 = skb_clone(skb, GFP_ATOMIC); + else { + atomic_inc(&skb->users); + skb2 = skb; + } if (skb2 == NULL) { ip_rt_put(rt); @@ -826,34 +1076,45 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, iph = skb2->nh.iph; ip_decrease_ttl(iph); - if (vif->flags & VIFF_TUNNEL) + if (vif->flags & VIFF_TUNNEL) { ip_encap(skb2, vif->local, vif->remote); + ((struct ip_tunnel *)vif->dev->priv)->stat.tx_packets++; + ((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb2->len; + } + + IPCB(skb2)->flags |= IPSKB_FORWARDED; - ip_send(skb2); + /* + * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally + * not only before forwarding, but after forwarding on all output + * interfaces. It is clear, if mrouter runs a multicasting + * program, it should receive packets not depending to what interface + * program is joined. + * If we will not make it, the program will have to join on all + * interfaces. On the other hand, multihoming host (or router, but + * not mrouter) cannot join to more than one interface - it will + * result in receiving multiple packets. 
+ */ + ip_ll_header(skb2); + skb2->dst->output(skb2); } -/* - * Multicast packets for forwarding arrive here - */ +int ipmr_find_vif(struct device *dev) +{ + int ct; + for (ct=0; ct<maxvif; ct++) { + if (vifc_map&(1<<ct) && vif_table[ct].dev == dev) + return ct; + } + return ALL_VIFS; +} -int ip_mr_input(struct sk_buff *skb) +/* "local" means that we should preserve one skb (for local delivery) */ + +int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local) { - struct mfc_cache *cache; int psend = -1; int vif, ct; - int local = 0; - int tunneled = IPCB(skb)->flags&IPSKB_TUNNELED; - - cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr); - - /* - * No usable cache entry - */ - - if (cache==NULL || (cache->mfc_flags&MFC_QUEUED)) { - ipmr_cache_unresolved(cache, ALL_VIFS, skb); - return -EAGAIN; - } vif = cache->mfc_parent; cache->mfc_pkt++; @@ -862,75 +1123,290 @@ int ip_mr_input(struct sk_buff *skb) /* * Wrong interface: drop packet and (maybe) send PIM assert. */ - if (vif >= maxvif || !(vifc_map&(1<<vif)) || - (tunneled && IPCB(skb)->vif != vif) || - (!tunneled && (vif_table[vif].flags&VIFF_TUNNEL || - vif_table[vif].u.dev != skb->dev))) { + if (vif_table[vif].dev != skb->dev) { + int true_vifi; + + if (((struct rtable*)skb->dst)->key.iif == 0) { + /* It is our own packet, looped back. + Very complicated situation... + + The best workaround until routing daemons will be + fixed is not to redistribute packet, if it was + send through wrong interface. It means, that + multicast applications WILL NOT work for + (S,G), which have default multicast route pointing + to wrong oif. In any case, it is not a good + idea to use multicasting applications on router. 
+ */ + goto dont_forward; + } + cache->mfc_wrong_if++; - if (vif < MAXVIFS && mroute_do_pim && - !(vif_table[vif].flags&VIFF_TUNNEL) && - skb->dev->flags&IFF_BROADCAST && + true_vifi = ipmr_find_vif(skb->dev); + + if (true_vifi < MAXVIFS && mroute_do_assert && + /* pimsm uses asserts, when switching from RPT to SPT, + so that we cannot check that packet arrived on an oif. + It is bad, but otherwise we would need to move pretty + large chunk of pimd to kernel. Ough... --ANK + */ + (mroute_do_pim || cache->mfc_ttls[true_vifi] < 255) && jiffies - cache->mfc_last_assert > MFC_ASSERT_THRESH) { cache->mfc_last_assert = jiffies; - /* - * It is wrong! Routing daemon can - * determine vif itself, but it cannot - * determine REAL device. - * BSD bug. Fix it later, PIM does not - * work in any case 8) _ANK_ - */ - ipmr_cache_report(skb, vif, 1); + ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF); } - kfree_skb(skb, FREE_WRITE); - return -EINVAL; + goto dont_forward; } vif_table[vif].pkt_in++; vif_table[vif].bytes_in+=skb->len; - if (IPCB(skb)->opt.router_alert || - ((struct rtable*)skb->dst)->rt_flags&RTF_LOCAL || - skb->nh.iph->protocol == IPPROTO_IGMP) - local = 1; - /* * Forward the frame */ - ct = cache->mfc_maxvif-1; - while (ct>=cache->mfc_minvif) { - /* - * 0 means don't do it. Silly idea, 255 as don't do it would be cleaner! 
- */ - if (skb->nh.iph->ttl > cache->mfc_ttls[ct] && cache->mfc_ttls[ct]>0) { + for (ct = cache->mfc_maxvif-1; ct >= cache->mfc_minvif; ct--) { + if (skb->nh.iph->ttl > cache->mfc_ttls[ct]) { if (psend != -1) ipmr_queue_xmit(skb, cache, psend, 0); psend=ct; } - ct--; } if (psend != -1) - ipmr_queue_xmit(skb, cache, psend, 1); + ipmr_queue_xmit(skb, cache, psend, !local); + +dont_forward: + if (!local) + kfree_skb(skb, FREE_WRITE); + return 0; +} + + +/* + * Multicast packets for forwarding arrive here + */ + +int ip_mr_input(struct sk_buff *skb) +{ + struct mfc_cache *cache; + int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL; + + /* Packet is looped back after forward, it should not be + forwarded second time, but still can be delivered locally. + */ + if (IPCB(skb)->flags&IPSKB_FORWARDED) + goto dont_forward; + if (!local) { + if (IPCB(skb)->opt.router_alert) { + if (ip_call_ra_chain(skb)) + return 0; + } else if (skb->nh.iph->protocol == IPPROTO_IGMP && mroute_socket) { + /* IGMPv1 (and broken IGMPv2 implementations sort of + Cisco IOS <= 11.2(8)) do not put router alert + option to IGMP packets destined to routable + groups. It is very bad, because it means + that we can forward NO IGMP messages. 
+ */ + raw_rcv(mroute_socket, skb); + return 0; + } + } + + cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr); + + /* + * No usable cache entry + */ + + if (cache==NULL || (cache->mfc_flags&MFC_QUEUED)) { + int vif; + + if (local) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + ip_local_deliver(skb); + if (skb2 == NULL) + return -ENOBUFS; + skb = skb2; + } + + vif = ipmr_find_vif(skb->dev); + if (vif != ALL_VIFS) { + ipmr_cache_unresolved(cache, vif, skb); + return -EAGAIN; + } kfree_skb(skb, FREE_READ); return 0; } - return ip_local_deliver(skb); + + ip_mr_forward(skb, cache, local); + + if (local) + return ip_local_deliver(skb); + return 0; + +dont_forward: + if (local) + return ip_local_deliver(skb); + kfree_skb(skb, FREE_READ); + return 0; +} + +#ifdef CONFIG_IP_PIMSM_V1 +/* + * Handle IGMP messages of PIMv1 + */ + +int pim_rcv_v1(struct sk_buff * skb, unsigned short len) +{ + struct igmphdr *pim = (struct igmphdr*)skb->h.raw; + struct iphdr *encap; + + if (!mroute_do_pim || + len < sizeof(*pim) + sizeof(*encap) || + pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER || + reg_dev == NULL) { + kfree_skb(skb, FREE_READ); + return -EINVAL; + } + + encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr)); + /* + Check that: + a. packet is really destinted to a multicast group + b. packet is not a NULL-REGISTER + c. 
packet is not truncated + */ + if (!MULTICAST(encap->daddr) || + ntohs(encap->tot_len) == 0 || + ntohs(encap->tot_len) + sizeof(*pim) > len) { + kfree_skb(skb, FREE_READ); + return -EINVAL; + } + skb_pull(skb, (u8*)encap - skb->data); + skb->nh.iph = (struct iphdr *)skb->data; + skb->dev = reg_dev; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->protocol = __constant_htons(ETH_P_IP); + skb->ip_summed = 0; + skb->pkt_type = PACKET_HOST; + dst_release(skb->dst); + skb->dst = NULL; + ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len; + ((struct net_device_stats*)reg_dev->priv)->rx_packets++; + netif_rx(skb); + return 0; +} +#endif + +#ifdef CONFIG_IP_PIMSM_V2 +int pim_rcv(struct sk_buff * skb, unsigned short len) +{ + struct pimreghdr *pim = (struct pimreghdr*)skb->h.raw; + struct iphdr *encap; + + if (len < sizeof(*pim) + sizeof(*encap) || + pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) || + (pim->flags&PIM_NULL_REGISTER) || + reg_dev == NULL || + ip_compute_csum((void *)pim, len)) { + kfree_skb(skb, FREE_READ); + return -EINVAL; + } + + /* check if the inner packet is destined to mcast group */ + encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr)); + if (!MULTICAST(encap->daddr) || + ntohs(encap->tot_len) == 0 || + ntohs(encap->tot_len) + sizeof(*pim) > len) { + kfree_skb(skb, FREE_READ); + return -EINVAL; + } + skb_pull(skb, (u8*)encap - skb->data); + skb->nh.iph = (struct iphdr *)skb->data; + skb->dev = reg_dev; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->protocol = __constant_htons(ETH_P_IP); + skb->ip_summed = 0; + skb->pkt_type = PACKET_HOST; + dst_release(skb->dst); + ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len; + ((struct net_device_stats*)reg_dev->priv)->rx_packets++; + skb->dst = NULL; + netif_rx(skb); + return 0; } +#endif -int ip_mr_find_tunnel(u32 local, u32 remote) +#ifdef CONFIG_RTNETLINK + +static int +ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct 
rtmsg *rtm) { int ct; - struct vif_device *vif; + struct rtnexthop *nhp; + struct device *dev = vif_table[c->mfc_parent].dev; - for (ct=0; ct<maxvif; ct++) { - vif = &vif_table[ct]; - if (vifc_map&(1<<ct) && vif->flags&VIFF_TUNNEL && - vif->local == local && vif->remote == remote) - return ct; + if (dev) { + u8 *o = skb->tail; + RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex); + rtm->rtm_optlen += skb->tail - o; + } + + for (ct = c->mfc_minvif; ct < c->mfc_maxvif; ct++) { + if (c->mfc_ttls[ct] < 255) { + if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) + goto rtattr_failure; + nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); + nhp->rtnh_flags = 0; + nhp->rtnh_hops = c->mfc_ttls[ct]; + nhp->rtnh_ifindex = vif_table[ct].dev->ifindex; + nhp->rtnh_len = sizeof(*nhp); + rtm->rtm_nhs++; + } } - return -1; + rtm->rtm_type = RTN_MULTICAST; + return 1; + +rtattr_failure: + return -EMSGSIZE; } +int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm) +{ + struct mfc_cache *cache; + struct rtable *rt = (struct rtable*)skb->dst; + + start_bh_atomic(); + cache = ipmr_cache_find(rt->rt_src, rt->rt_dst); + if (cache==NULL || (cache->mfc_flags&MFC_QUEUED)) { + struct device *dev = skb->dev; + int vif; + int err; + + if (dev == NULL || (vif = ipmr_find_vif(dev)) == ALL_VIFS) { + end_bh_atomic(); + return -ENODEV; + } + skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + skb->nh.iph->ihl = sizeof(struct iphdr)>>2; + skb->nh.iph->saddr = rt->rt_src; + skb->nh.iph->daddr = rt->rt_dst; + skb->nh.iph->version = 0; + err = ipmr_cache_unresolved(cache, vif, skb); + end_bh_atomic(); + return err; + } + /* Resolved cache entry is not changed by net bh, + so that we are allowed to enable it. 
+ */ + end_bh_atomic(); + + if (rtm->rtm_flags & RTM_F_NOTIFY) + cache->mfc_flags |= MFC_NOTIFY; + return ipmr_fill_mroute(skb, cache, rtm); +} +#endif + /* * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif */ @@ -945,16 +1421,19 @@ int ipmr_vif_info(char *buffer, char **start, off_t offset, int length, int dumm int ct; len += sprintf(buffer, - "Interface Bytes In Pkts In Bytes Out Pkts Out Flags Local Remote\n"); + "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n"); pos=len; for (ct=0;ct<maxvif;ct++) { + char *name = "none"; vif=&vif_table[ct]; if(!(vifc_map&(1<<ct))) continue; - size = sprintf(buffer+len, "%2d %-10s %8ld %7ld %8ld %7ld %05X %08lX %08lX\n", - ct, vif->flags&VIFF_TUNNEL ? "Tunnel" : vif->u.dev->name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, + if (vif->dev) + name = vif->dev->name; + size = sprintf(buffer+len, "%2d %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", + ct, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, vif->flags, vif->local, vif->remote); len+=size; pos+=size; @@ -984,7 +1463,7 @@ int ipmr_mfc_info(char *buffer, char **start, off_t offset, int length, int dumm int ct; len += sprintf(buffer, - "Group Origin SrcIface Pkts Bytes Wrong VifTtls\n"); + "Group Origin Iif Pkts Bytes Wrong Oifs\n"); pos=len; for (ct=0;ct<MFC_LINES;ct++) @@ -993,33 +1472,22 @@ int ipmr_mfc_info(char *buffer, char **start, off_t offset, int length, int dumm mfc=mfc_cache_array[ct]; while(mfc!=NULL) { - char *name="none"; int n; - /* - * Device name - */ - if(mfc->mfc_parent < maxvif && vifc_map&(1<<mfc->mfc_parent)) { - if (vif_table[mfc->mfc_parent].flags&VIFF_TUNNEL) - name="Tunnel"; - else - name=vif_table[mfc->mfc_parent].u.dev->name; - } + /* * Interface forwarding map */ - size = sprintf(buffer+len, "%08lX %08lX %-8s %8ld %8ld %8ld", + size = sprintf(buffer+len, "%08lX %08lX %-3d %8ld %8ld %8ld", (unsigned long)mfc->mfc_mcastgrp, (unsigned long)mfc->mfc_origin, - name, + 
mfc->mfc_parent == ALL_VIFS ? -1 : mfc->mfc_parent, + (mfc->mfc_flags & MFC_QUEUED) ? mfc->mfc_unresolved.qlen : mfc->mfc_pkt, mfc->mfc_bytes, - mfc->mfc_pkt, mfc->mfc_wrong_if); - for(n=0;n<maxvif;n++) + for(n=mfc->mfc_minvif;n<mfc->mfc_maxvif;n++) { - if(vifc_map&(1<<n)) - size += sprintf(buffer+len+size, " %-3d", mfc->mfc_ttls[n]); - else - size += sprintf(buffer+len+size, " --- "); + if(vifc_map&(1<<n) && mfc->mfc_ttls[n] < 255) + size += sprintf(buffer+len+size, " %2d:%-3d", n, mfc->mfc_ttls[n]); } size += sprintf(buffer+len+size, "\n"); len+=size; @@ -1043,6 +1511,10 @@ done: len-=(offset-begin); if(len>length) len=length; + if (len < 0) { + len = 0; + printk(KERN_CRIT "Yep, guys... our template for proc_*_read is crappy :-)\n"); + } return len; } @@ -1061,6 +1533,19 @@ static struct proc_dir_entry proc_net_ipmr_mfc = { }; #endif +#ifdef CONFIG_IP_PIMSM_V2 +struct inet_protocol pim_protocol = +{ + pim_rcv, /* PIM handler */ + NULL, /* PIM error control */ + NULL, /* next */ + IPPROTO_PIM, /* protocol ID */ + 0, /* copy */ + NULL, /* data */ + "PIM" /* name */ +}; +#endif + /* * Setup for IP multicast routing @@ -1068,7 +1553,7 @@ static struct proc_dir_entry proc_net_ipmr_mfc = { __initfunc(void ip_mr_init(void)) { - printk(KERN_INFO "Linux IP multicast router 0.06.\n"); + printk(KERN_INFO "Linux IP multicast router 0.06 plus PIM-SM\n"); register_netdevice_notifier(&ip_mr_notifier); #ifdef CONFIG_PROC_FS proc_net_register(&proc_net_ipmr_vif); diff --git a/net/ipv4/packet.c b/net/ipv4/packet.c index f69449e76..e69de29bb 100644 --- a/net/ipv4/packet.c +++ b/net/ipv4/packet.c @@ -1,528 +0,0 @@ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * PACKET - implements raw packet sockets. - * - * Doesn't belong in IP but it's currently too hooked into ip - * to separate. 
- * - * Version: @(#)packet.c 1.0.6 05/25/93 - * - * Authors: Ross Biro, <bir7@leland.Stanford.Edu> - * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> - * Alan Cox, <gw4pts@gw4pts.ampr.org> - * - * Fixes: - * Alan Cox : verify_area() now used correctly - * Alan Cox : new skbuff lists, look ma no backlogs! - * Alan Cox : tidied skbuff lists. - * Alan Cox : Now uses generic datagram routines I - * added. Also fixed the peek/read crash - * from all old Linux datagram code. - * Alan Cox : Uses the improved datagram code. - * Alan Cox : Added NULL's for socket options. - * Alan Cox : Re-commented the code. - * Alan Cox : Use new kernel side addressing - * Rob Janssen : Correct MTU usage. - * Dave Platt : Counter leaks caused by incorrect - * interrupt locking and some slightly - * dubious gcc output. Can you read - * compiler: it said _VOLATILE_ - * Richard Kooijman : Timestamp fixes. - * Alan Cox : New buffers. Use sk->mac.raw. - * Alan Cox : sendmsg/recvmsg support. - * Alan Cox : Protocol setting support - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include <linux/types.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/fcntl.h> -#include <linux/socket.h> -#include <linux/in.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/if_packet.h> -#include <net/ip.h> -#include <net/protocol.h> -#include <linux/skbuff.h> -#include <net/sock.h> -#include <linux/errno.h> -#include <linux/timer.h> -#include <asm/system.h> -#include <asm/uaccess.h> - - -/* - * This should be the easiest of all, all we do is copy it into a buffer. 
- */ - -int packet_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) -{ - struct sock *sk; - - /* - * When we registered the protocol we saved the socket in the data - * field for just this event. - */ - - sk = (struct sock *) pt->data; - - /* - * Yank back the headers [hope the device set this - * right or kerboom...] - */ - - skb_push(skb,skb->data-skb->mac.raw); - - /* - * The SOCK_PACKET socket receives _all_ frames. - */ - - skb->dev = dev; - - /* - * Charge the memory to the socket. This is done specifically - * to prevent sockets using all the memory up. - */ - - if(sock_queue_rcv_skb(sk,skb)<0) - { - kfree_skb(skb, FREE_READ); - return 0; - } - /* - * Processing complete. - */ - - return(0); -} - - -/* - * Output a raw packet to a device layer. This bypasses all the other - * protocol layers and you must therefore supply it with a complete frame - */ - -static int packet_sendmsg(struct sock *sk, struct msghdr *msg, int len) -{ - struct sk_buff *skb; - struct device *dev; - struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name; - unsigned short proto=0; - int err; - - /* - * Check the flags. - */ - - if (msg->msg_flags&~MSG_DONTWAIT) - return(-EINVAL); - - /* - * Get and verify the address. - */ - - if (saddr) - { - if (msg->msg_namelen < sizeof(struct sockaddr)) - return(-EINVAL); - if (msg->msg_namelen==sizeof(struct sockaddr_pkt)) - proto=saddr->spkt_protocol; - } - else - return(-ENOTCONN); /* SOCK_PACKET must be sent giving an address */ - - /* - * Find the device first to size check it - */ - - saddr->spkt_device[13] = 0; - dev = dev_get(saddr->spkt_device); - if (dev == NULL) - { - return(-ENODEV); - } - - /* - * You may not queue a frame bigger than the mtu. This is the lowest level - * raw protocol and you must do your own fragmentation at this level. 
- */ - - if(len>dev->mtu+dev->hard_header_len) - return -EMSGSIZE; - - skb = sock_wmalloc(sk, len+dev->hard_header_len, 0, GFP_KERNEL); - - /* - * If the write buffer is full, then tough. At this level the user gets to - * deal with the problem - do your own algorithmic backoffs. That's far - * more flexible. - */ - - if (skb == NULL) - { - return(-ENOBUFS); - } - - /* - * Fill it in - */ - - /* FIXME: Save some space for broken drivers that write a - * hard header at transmission time by themselves. PPP is the - * notable one here. This should really be fixed at the driver level. - */ - skb_reserve(skb,dev->hard_header_len); - err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); - skb->arp = 1; /* No ARP needs doing on this (complete) frame */ - skb->protocol = proto; - skb->dev = dev; - skb->priority = sk->priority; - - /* - * Now send it - */ - - if (err) - { - err = -EFAULT; - } - else - { - if (!(dev->flags & IFF_UP)) - { - err = -ENODEV; - } - } - - if (err) - { - kfree_skb(skb, FREE_WRITE); - return err; - } - - dev_queue_xmit(skb); - return(len); -} - -/* - * Close a SOCK_PACKET socket. This is fairly simple. We immediately go - * to 'closed' state and remove our protocol entry in the device list. - * The release_sock() will destroy the socket if a user has closed the - * file side of the object. - */ - -static void packet_close(struct sock *sk, unsigned long timeout) -{ - /* - * Stop more data and kill the socket off. - */ - - lock_sock(sk); - sk->state = TCP_CLOSE; - - /* - * Unhook the notifier - */ - - unregister_netdevice_notifier(&sk->protinfo.af_packet.notifier); - - if(sk->protinfo.af_packet.prot_hook) - { - /* - * Remove the protocol hook - */ - - dev_remove_pack((struct packet_type *)sk->protinfo.af_packet.prot_hook); - - /* - * Dispose of litter carefully. 
- */ - - kfree_s((void *)sk->protinfo.af_packet.prot_hook, sizeof(struct packet_type)); - sk->protinfo.af_packet.prot_hook = NULL; - } - - release_sock(sk); - sk->dead = 1; - destroy_sock(sk); -} - -/* - * Attach a packet hook to a device. - */ - -int packet_attach(struct sock *sk, struct device *dev) -{ - struct packet_type *p = (struct packet_type *) kmalloc(sizeof(*p), GFP_KERNEL); - if (p == NULL) - return(-ENOMEM); - - p->func = packet_rcv; - p->type = sk->num; - p->data = (void *)sk; - p->dev = dev; - dev_add_pack(p); - - /* - * We need to remember this somewhere. - */ - - sk->protinfo.af_packet.prot_hook = p; - sk->protinfo.af_packet.bound_dev = dev; - return 0; -} - -/* - * Bind a packet socket to a device - */ - -static int packet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) -{ - char name[15]; - struct device *dev; - - /* - * Check legality - */ - - if(addr_len!=sizeof(struct sockaddr)) - return -EINVAL; - strncpy(name,uaddr->sa_data,14); - name[14]=0; - - /* - * Lock the device chain while we sanity check - * the bind request. - */ - - dev_lock_list(); - dev=dev_get(name); - if(dev==NULL) - { - dev_unlock_list(); - return -ENODEV; - } - - if(!(dev->flags&IFF_UP)) - { - dev_unlock_list(); - return -ENETDOWN; - } - - /* - * Perform the request. - */ - - memcpy(sk->protinfo.af_packet.device_name,name,15); - - /* - * Rewrite an existing hook if present. - */ - - if(sk->protinfo.af_packet.prot_hook) - { - dev_remove_pack(sk->protinfo.af_packet.prot_hook); - sk->protinfo.af_packet.prot_hook->dev=dev; - sk->protinfo.af_packet.bound_dev=dev; - dev_add_pack(sk->protinfo.af_packet.prot_hook); - } - else - { - int err=packet_attach(sk, dev); - if(err) - { - dev_unlock_list(); - return err; - } - } - /* - * Now the notifier is set up right this lot is safe. - */ - dev_unlock_list(); - return 0; -} - -/* - * This hook is called when a device goes up or down so that - * SOCK_PACKET sockets can come unbound properly. 
- */ - -static int packet_unbind(struct notifier_block *this, unsigned long msg, void *data) -{ - struct inet_packet_opt *ipo=(struct inet_packet_opt *)this; - if(msg==NETDEV_DOWN && data==ipo->bound_dev) - { - /* - * Our device has gone down. - */ - ipo->bound_dev=NULL; - dev_remove_pack(ipo->prot_hook); - kfree(ipo->prot_hook); - ipo->prot_hook=NULL; - } - return NOTIFY_DONE; -} - - -/* - * Create a packet of type SOCK_PACKET. - */ - -static int packet_init(struct sock *sk) -{ - /* - * Attach a protocol block - */ - - int err=packet_attach(sk, NULL); - if(err) - return err; - - /* - * Set up the per socket notifier. - */ - - sk->protinfo.af_packet.notifier.notifier_call=packet_unbind; - sk->protinfo.af_packet.notifier.priority=0; - - register_netdevice_notifier(&sk->protinfo.af_packet.notifier); - - return(0); -} - - -/* - * Pull a packet from our receive queue and hand it to the user. - * If necessary we block. - */ - -int packet_recvmsg(struct sock *sk, struct msghdr *msg, int len, - int noblock, int flags,int *addr_len) -{ - int copied=0; - struct sk_buff *skb; - struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name; - int err; - - /* - * If there is no protocol hook then the device is down. - */ - - if(sk->protinfo.af_packet.prot_hook==NULL) - return -ENETDOWN; - - /* - * If the address length field is there to be filled in, we fill - * it in now. - */ - - if (addr_len) - *addr_len=sizeof(*saddr); - - /* - * Call the generic datagram receiver. This handles all sorts - * of horrible races and re-entrancy so we can forget about it - * in the protocol layers. - */ - - skb=skb_recv_datagram(sk,flags,noblock,&err); - - /* - * An error occurred so return it. Because skb_recv_datagram() - * handles the blocking we don't see and worry about blocking - * retries. - */ - - if(skb==NULL) - return err; - - /* - * You lose any data beyond the buffer you gave. If it worries a - * user program they can ask the device for its MTU anyway. 
- */ - - copied = skb->len; - if(copied>len) - { - copied=len; - msg->msg_flags|=MSG_TRUNC; - } - - /* We can't use skb_copy_datagram here */ - err = memcpy_toiovec(msg->msg_iov, skb->data, copied); - if (err) - { - return -EFAULT; - } - - sk->stamp=skb->stamp; - - /* - * Copy the address. - */ - - if (saddr) - { - saddr->spkt_family = skb->dev->type; - strncpy(saddr->spkt_device,skb->dev->name, 15); - saddr->spkt_protocol = skb->protocol; - } - - /* - * Free or return the buffer as appropriate. Again this hides all the - * races and re-entrancy issues from us. - */ - - skb_free_datagram(sk, skb); - - return(copied); -} - -/* - * This structure declares to the lower layer socket subsystem currently - * incorrectly embedded in the IP code how to behave. This interface needs - * a lot of work and will change. - */ - -struct proto packet_prot = -{ - (struct sock *)&packet_prot, /* sklist_next */ - (struct sock *)&packet_prot, /* sklist_prev */ - packet_close, /* close */ - NULL, /* connect */ - NULL, /* accept */ - NULL, /* retransmit */ - NULL, /* write_wakeup */ - NULL, /* read_wakeup */ - datagram_poll, /* poll */ - NULL, /* ioctl */ - packet_init, /* init */ - NULL, /* destroy */ - NULL, /* shutdown */ - NULL, /* setsockopt */ - NULL, /* getsockopt */ - packet_sendmsg, /* Sendmsg */ - packet_recvmsg, /* Recvmsg */ - packet_bind, /* bind */ - NULL, /* backlog_rcv */ - NULL, /* hash */ - NULL, /* unhash */ - NULL, /* rehash */ - NULL, /* good_socknum */ - NULL, /* verify_bind */ - 128, /* max_header */ - 0, /* retransmits */ - "PACKET", /* name */ - 0, /* inuse */ - 0 /* highestinuse */ -}; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 0ce80fec4..7f3b5f9bb 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -7,7 +7,7 @@ * PROC file system. It is mainly used for debugging and * statistics. * - * Version: @(#)proc.c 1.0.5 05/27/93 + * Version: $Id: proc.c,v 1.23 1997/10/30 23:52:20 davem Exp $ * * Authors: Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de> @@ -221,7 +221,6 @@ int afinet_get_info(char *buffer, char **start, off_t offset, int length, int du { /* From net/socket.c */ extern int socket_get_info(char *, char **, off_t, int); - extern struct proto packet_prot; int len = socket_get_info(buffer,start,offset,length); @@ -231,8 +230,6 @@ int afinet_get_info(char *buffer, char **start, off_t offset, int length, int du udp_prot.inuse, udp_prot.highestinuse); len += sprintf(buffer+len,"RAW: inuse %d highest %d\n", raw_prot.inuse, raw_prot.highestinuse); - len += sprintf(buffer+len,"PAC: inuse %d highest %d\n", - packet_prot.inuse, packet_prot.highestinuse); if (offset >= len) { *start = buffer; @@ -291,14 +288,15 @@ int snmp_get_info(char *buffer, char **start, off_t offset, int length, int dumm icmp_statistics.IcmpOutAddrMasks, icmp_statistics.IcmpOutAddrMaskReps); len += sprintf (buffer + len, - "Tcp: RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs\n" - "Tcp: %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + "Tcp: RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts\n" + "Tcp: %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", tcp_statistics.TcpRtoAlgorithm, tcp_statistics.TcpRtoMin, tcp_statistics.TcpRtoMax, tcp_statistics.TcpMaxConn, tcp_statistics.TcpActiveOpens, tcp_statistics.TcpPassiveOpens, tcp_statistics.TcpAttemptFails, tcp_statistics.TcpEstabResets, tcp_statistics.TcpCurrEstab, tcp_statistics.TcpInSegs, - tcp_statistics.TcpOutSegs, tcp_statistics.TcpRetransSegs); + tcp_statistics.TcpOutSegs, tcp_statistics.TcpRetransSegs, + tcp_statistics.TcpInErrs, tcp_statistics.TcpOutRsts); len += sprintf (buffer + len, "Udp: InDatagrams NoPorts InErrors OutDatagrams\nUdp: %lu %lu %lu %lu\n", diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c 
index 5c7d6ca75..b47480be5 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -5,7 +5,7 @@ * * INET protocol dispatch tables. * - * Version: @(#)protocol.c 1.0.5 05/25/93 + * Version: $Id: protocol.c,v 1.9 1997/10/29 20:27:34 kuznet Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -45,20 +45,23 @@ #include <net/ipip.h> #include <linux/igmp.h> +#define IPPROTO_PREVIOUS NULL -#ifdef CONFIG_NET_IPIP +#ifdef CONFIG_IP_MULTICAST -static struct inet_protocol ipip_protocol = +static struct inet_protocol igmp_protocol = { - ipip_rcv, /* IPIP handler */ - ipip_err, /* TUNNEL error control */ - 0, /* next */ - IPPROTO_IPIP, /* protocol ID */ - 0, /* copy */ - NULL, /* data */ - "IPIP" /* name */ + igmp_rcv, /* IGMP handler */ + NULL, /* IGMP error control */ + IPPROTO_PREVIOUS, /* next */ + IPPROTO_IGMP, /* protocol ID */ + 0, /* copy */ + NULL, /* data */ + "IGMP" /* name */ }; +#undef IPPROTO_PREVIOUS +#define IPPROTO_PREVIOUS &igmp_protocol #endif @@ -66,52 +69,47 @@ static struct inet_protocol tcp_protocol = { tcp_v4_rcv, /* TCP handler */ tcp_v4_err, /* TCP error control */ -#ifdef CONFIG_NET_IPIP - &ipip_protocol, -#else - NULL, /* next */ -#endif + IPPROTO_PREVIOUS, IPPROTO_TCP, /* protocol ID */ 0, /* copy */ NULL, /* data */ "TCP" /* name */ }; +#undef IPPROTO_PREVIOUS +#define IPPROTO_PREVIOUS &tcp_protocol + static struct inet_protocol udp_protocol = { udp_rcv, /* UDP handler */ udp_err, /* UDP error control */ - &tcp_protocol, /* next */ + IPPROTO_PREVIOUS, /* next */ IPPROTO_UDP, /* protocol ID */ 0, /* copy */ NULL, /* data */ "UDP" /* name */ }; +#undef IPPROTO_PREVIOUS +#define IPPROTO_PREVIOUS &udp_protocol + static struct inet_protocol icmp_protocol = { icmp_rcv, /* ICMP handler */ NULL, /* ICMP error control */ - &udp_protocol, /* next */ + IPPROTO_PREVIOUS, /* next */ IPPROTO_ICMP, /* protocol ID */ 0, /* copy */ NULL, /* data */ "ICMP" /* name */ }; -static struct inet_protocol 
igmp_protocol = -{ - igmp_rcv, /* IGMP handler */ - NULL, /* IGMP error control */ - &icmp_protocol, /* next */ - IPPROTO_IGMP, /* protocol ID */ - 0, /* copy */ - NULL, /* data */ - "IGMP" /* name */ -}; +#undef IPPROTO_PREVIOUS +#define IPPROTO_PREVIOUS &icmp_protocol + -struct inet_protocol *inet_protocol_base = &igmp_protocol; +struct inet_protocol *inet_protocol_base = IPPROTO_PREVIOUS; struct inet_protocol *inet_protos[MAX_INET_PROTOS] = { diff --git a/net/ipv4/rarp.c b/net/ipv4/rarp.c index d2e6ad5c4..f7ab4ddc3 100644 --- a/net/ipv4/rarp.c +++ b/net/ipv4/rarp.c @@ -3,6 +3,8 @@ * Copyright (C) 1994 by Ross Martin * Based on linux/net/inet/arp.c, Copyright (C) 1994 by Florian La Roche * + * $Id: rarp.c,v 1.21 1997/10/27 09:13:16 geert Exp $ + * * This module implements the Reverse Address Resolution Protocol * (RARP, RFC 903), which is used to convert low level addresses such * as ethernet addresses into high level addresses such as IP addresses. @@ -119,20 +121,20 @@ static void rarp_destroy(unsigned long ip_addr) struct rarp_table *entry; struct rarp_table **pentry; - cli(); + start_bh_atomic(); pentry = &rarp_tables; while ((entry = *pentry) != NULL) { if (entry->ip == ip_addr) { *pentry = entry->next; - sti(); + end_bh_atomic(); rarp_release_entry(entry); return; } pentry = &entry->next; } - sti(); + end_bh_atomic(); } /* @@ -144,7 +146,7 @@ static void rarp_destroy_dev(struct device *dev) struct rarp_table *entry; struct rarp_table **pentry; - cli(); + start_bh_atomic(); pentry = &rarp_tables; while ((entry = *pentry) != NULL) { @@ -156,7 +158,7 @@ static void rarp_destroy_dev(struct device *dev) else pentry = &entry->next; } - sti(); + end_bh_atomic(); } static int rarp_device_event(struct notifier_block *this, unsigned long event, void *ptr) @@ -176,6 +178,8 @@ static struct notifier_block rarp_dev_notifier={ NULL, 0 }; + +static int rarp_pkt_inited=0; static void rarp_init_pkt (void) { @@ -183,8 +187,19 @@ static void rarp_init_pkt (void) 
rarp_packet_type.type=htons(ETH_P_RARP); dev_add_pack(&rarp_packet_type); register_netdevice_notifier(&rarp_dev_notifier); + rarp_pkt_inited=1; } +static void rarp_end_pkt(void) +{ + if(!rarp_pkt_inited) + return; + dev_remove_pack(&rarp_packet_type); + unregister_netdevice_notifier(&rarp_dev_notifier); + rarp_pkt_inited=0; +} + + /* * Receive an arp request by the device layer. Maybe it should be * rewritten to use the incoming packet for the reply. The current @@ -199,6 +214,7 @@ static int rarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type struct arphdr *rarp = (struct arphdr *) skb->data; unsigned char *rarp_ptr = skb_pull(skb,sizeof(struct arphdr)); struct rarp_table *entry; + struct in_device *in_dev = dev->ip_ptr; long sip,tip; unsigned char *sha,*tha; /* s for "source", t for "target" */ @@ -207,7 +223,7 @@ static int rarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type */ if (rarp->ar_hln != dev->addr_len || dev->type != ntohs(rarp->ar_hrd) - || dev->flags&IFF_NOARP) + || dev->flags&IFF_NOARP || !in_dev || !in_dev->ifa_list) { kfree_skb(skb, FREE_READ); return 0; @@ -256,7 +272,6 @@ static int rarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type * Process entry. Use tha for table lookup according to RFC903. */ - cli(); for (entry = rarp_tables; entry != NULL; entry = entry->next) if (!memcmp(entry->ha, tha, rarp->ar_hln)) break; @@ -264,13 +279,10 @@ static int rarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type if (entry != NULL) { sip=entry->ip; - sti(); - arp_send(ARPOP_RREPLY, ETH_P_RARP, sip, dev, dev->pa_addr, sha, + arp_send(ARPOP_RREPLY, ETH_P_RARP, sip, dev, in_dev->ifa_list->ifa_address, sha, dev->dev_addr, sha); } - else - sti(); kfree_skb(skb, FREE_READ); return 0; @@ -331,10 +343,10 @@ static int rarp_req_set(struct arpreq *req) * Is it reachable directly ? 
*/ - err = ip_route_output(&rt, ip, 0, 1, NULL); + err = ip_route_output(&rt, ip, 0, 1, 0); if (err) return err; - if (rt->rt_flags&(RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST|RTF_NAT)) { + if (rt->rt_flags&(RTCF_LOCAL|RTCF_BROADCAST|RTCF_MULTICAST|RTCF_DNAT)) { ip_rt_put(rt); return -EINVAL; } @@ -344,7 +356,6 @@ static int rarp_req_set(struct arpreq *req) * Is there an existing entry for this address? Find out... */ - cli(); for (entry = rarp_tables; entry != NULL; entry = entry->next) if (entry->ip == ip) break; @@ -359,7 +370,6 @@ static int rarp_req_set(struct arpreq *req) GFP_ATOMIC); if (entry == NULL) { - sti(); return -ENOMEM; } if (initflag) @@ -368,21 +378,23 @@ static int rarp_req_set(struct arpreq *req) initflag=0; } + /* Block interrupts until table modification is finished */ + + cli(); entry->next = rarp_tables; rarp_tables = entry; } - + cli(); entry->ip = ip; entry->hlen = hlen; entry->htype = htype; memcpy(&entry->ha, &r.arp_ha.sa_data, hlen); entry->dev = dev; + sti(); /* Don't unlink if we have entries to serve. 
*/ MOD_INC_USE_COUNT; - sti(); - return 0; } @@ -417,14 +429,12 @@ static int rarp_req_get(struct arpreq *req) si = (struct sockaddr_in *) &r.arp_pa; ip = si->sin_addr.s_addr; - cli(); for (entry = rarp_tables; entry != NULL; entry = entry->next) if (entry->ip == ip) break; if (entry == NULL) { - sti(); return -ENXIO; } @@ -434,7 +444,6 @@ static int rarp_req_get(struct arpreq *req) memcpy(r.arp_ha.sa_data, &entry->ha, entry->hlen); r.arp_ha.sa_family = entry->htype; - sti(); /* * Copy the information back @@ -483,6 +492,7 @@ int rarp_ioctl(unsigned int cmd, void *arg) return 0; } +#ifdef CONFIG_PROC_FS int rarp_get_info(char *buffer, char **start, off_t offset, int length, int dummy) { int len=0; @@ -505,7 +515,6 @@ int rarp_get_info(char *buffer, char **start, off_t offset, int length, int dumm pos+=size; len+=size; - cli(); for(entry=rarp_tables; entry!=NULL; entry=entry->next) { netip=htonl(entry->ip); /* switch to network order */ @@ -537,7 +546,6 @@ int rarp_get_info(char *buffer, char **start, off_t offset, int length, int dumm if(pos>offset+length) break; } - sti(); } *start = buffer+(offset-begin); /* Start of wanted data */ @@ -553,11 +561,14 @@ struct proc_dir_entry proc_net_rarp = { 0, &proc_net_inode_operations, rarp_get_info }; +#endif __initfunc(void rarp_init(void)) { +#ifdef CONFIG_PROC_FS proc_net_register(&proc_net_rarp); +#endif rarp_ioctl_hook = rarp_ioctl; } @@ -572,7 +583,9 @@ int init_module(void) void cleanup_module(void) { struct rarp_table *rt, *rt_next; +#ifdef CONFIG_PROC_FS proc_net_unregister(PROC_NET_RARP); +#endif rarp_ioctl_hook = NULL; cli(); /* Destroy the RARP-table */ @@ -584,5 +597,6 @@ void cleanup_module(void) rt_next = rt->next; rarp_release_entry(rt); } + rarp_end_pkt(); } #endif diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 0d51af255..2f4de9fbd 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -5,7 +5,7 @@ * * RAW - implementation of IP "raw" sockets. 
* - * Version: @(#)raw.c 1.0.4 05/25/93 + * Version: $Id: raw.c,v 1.32 1997/10/24 17:16:00 kuznet Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -126,7 +126,7 @@ static void raw_v4_rehash(struct sock *sk) /* Grumble... icmp and ip_input want to get at this... */ struct sock *raw_v4_lookup(struct sock *sk, unsigned short num, - unsigned long raddr, unsigned long laddr) + unsigned long raddr, unsigned long laddr, int dif) { struct sock *s = sk; @@ -135,7 +135,8 @@ struct sock *raw_v4_lookup(struct sock *sk, unsigned short num, if((s->num == num) && !(s->dead && (s->state == TCP_CLOSE)) && !(s->daddr && s->daddr != raddr) && - !(s->rcv_saddr && s->rcv_saddr != laddr)) + !(s->rcv_saddr && s->rcv_saddr != laddr) && + !(s->bound_dev_if && s->bound_dev_if != dif)) break; /* gotcha */ } SOCKHASH_UNLOCK(); @@ -203,7 +204,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) struct rawfakehdr { - const unsigned char *from; + struct iovec *iov; u32 saddr; }; @@ -218,7 +219,7 @@ struct rawfakehdr static int raw_getfrag(const void *p, char *to, unsigned int offset, unsigned int fraglen) { struct rawfakehdr *rfh = (struct rawfakehdr *) p; - return copy_from_user(to, rfh->from + offset, fraglen); + return memcpy_fromiovecend(to, rfh->iov, offset, fraglen); } /* @@ -229,8 +230,9 @@ static int raw_getrawfrag(const void *p, char *to, unsigned int offset, unsigned { struct rawfakehdr *rfh = (struct rawfakehdr *) p; - if (copy_from_user(to, rfh->from + offset, fraglen)) + if (memcpy_fromiovecend(to, rfh->iov, offset, fraglen)) return -EFAULT; + if (offset==0) { struct iphdr *iph = (struct iphdr *)to; if (!iph->saddr) @@ -249,10 +251,8 @@ static int raw_getrawfrag(const void *p, char *to, unsigned int offset, unsigned return 0; } -static int raw_sendto(struct sock *sk, const unsigned char *from, - int len, struct msghdr *msg) +static int raw_sendmsg(struct sock *sk, struct msghdr *msg, int len) { - struct device *dev = 
NULL; struct ipcm_cookie ipc; struct rawfakehdr rfh; struct rtable *rt; @@ -302,9 +302,10 @@ static int raw_sendto(struct sock *sk, const unsigned char *from, ipc.addr = sk->saddr; ipc.opt = NULL; + ipc.oif = sk->bound_dev_if; if (msg->msg_controllen) { - int tmp = ip_cmsg_send(msg, &ipc, &dev); + int tmp = ip_cmsg_send(msg, &ipc); if (tmp) return tmp; if (ipc.opt && sk->ip_hdrincl) { @@ -327,23 +328,27 @@ static int raw_sendto(struct sock *sk, const unsigned char *from, } tos = RT_TOS(sk->ip_tos) | (sk->localroute || (msg->msg_flags&MSG_DONTROUTE)); - if (MULTICAST(daddr) && sk->ip_mc_index && dev==NULL) - err = ip_route_output_dev(&rt, daddr, rfh.saddr, tos, sk->ip_mc_index); - else - err = ip_route_output(&rt, daddr, rfh.saddr, tos, dev); + if (MULTICAST(daddr)) { + if (!ipc.oif) + ipc.oif = sk->ip_mc_index; + if (!rfh.saddr) + rfh.saddr = sk->ip_mc_addr; + } + + err = ip_route_output(&rt, daddr, rfh.saddr, tos, ipc.oif); if (err) { if (free) kfree(ipc.opt); return err; } - if (rt->rt_flags&RTF_BROADCAST && !sk->broadcast) { + if (rt->rt_flags&RTCF_BROADCAST && !sk->broadcast) { if (free) kfree(ipc.opt); ip_rt_put(rt); return -EACCES; } - rfh.from = from; + rfh.iov = msg->msg_iov; rfh.saddr = rt->rt_src; if (!ipc.addr) ipc.addr = rt->rt_dst; @@ -363,56 +368,10 @@ static int raw_sendto(struct sock *sk, const unsigned char *from, return err<0 ? err : len; } -/* - * Temporary - */ - -static int raw_sendmsg(struct sock *sk, struct msghdr *msg, int len) -{ - if (msg->msg_iovlen==1) - return raw_sendto(sk, msg->msg_iov[0].iov_base,len, msg); - else { - /* - * For awkward cases we linearise the buffer first. In theory this is only frames - * whose iovec's don't split on 4 byte boundaries, and soon encrypted stuff (to keep - * skip happy). We are a bit more general about it. 
- */ - - unsigned char *buf; - int err; - if(len>65515) - return -EMSGSIZE; - buf=kmalloc(len, GFP_KERNEL); - if(buf==NULL) - return -ENOBUFS; - err = memcpy_fromiovec(buf, msg->msg_iov, len); - if (!err) - { - unsigned long fs; - fs=get_fs(); - set_fs(get_ds()); - err=raw_sendto(sk,buf,len, msg); - set_fs(fs); - } - else - err = -EFAULT; - - kfree_s(buf,len); - return err; - } -} - static void raw_close(struct sock *sk, unsigned long timeout) { sk->state = TCP_CLOSE; -#ifdef CONFIG_IP_MROUTE - if(sk==mroute_socket) - { - ipv4_config.multicast_route = 0; - mroute_close(sk); - mroute_socket=NULL; - } -#endif + ip_ra_control(sk, 0, NULL); sk->dead=1; destroy_sock(sk); } @@ -425,17 +384,17 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) if((sk->state != TCP_CLOSE) || (addr_len < sizeof(struct sockaddr_in))) return -EINVAL; - chk_addr_ret = __ip_chk_addr(addr->sin_addr.s_addr); - if(addr->sin_addr.s_addr != 0 && chk_addr_ret != IS_MYADDR && - chk_addr_ret != IS_MULTICAST && chk_addr_ret != IS_BROADCAST) { + chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr); + if(addr->sin_addr.s_addr != 0 && chk_addr_ret != RTN_LOCAL && + chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) { #ifdef CONFIG_IP_TRANSPARENT_PROXY /* Superuser may bind to any address to allow transparent proxying. 
*/ - if(!suser()) + if(chk_addr_ret != RTN_UNICAST || !suser()) #endif return -EADDRNOTAVAIL; } sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr; - if(chk_addr_ret == IS_MULTICAST || chk_addr_ret == IS_BROADCAST) + if(chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) sk->saddr = 0; /* Use device */ dst_release(sk->dst_cache); sk->dst_cache = NULL; @@ -448,7 +407,7 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) */ int raw_recvmsg(struct sock *sk, struct msghdr *msg, int len, - int noblock, int flags,int *addr_len) + int noblock, int flags,int *addr_len) { int copied=0; struct sk_buff *skb; @@ -500,6 +459,75 @@ int raw_recvmsg(struct sock *sk, struct msghdr *msg, int len, return err ? err : (copied); } +static int raw_init(struct sock *sk) +{ + struct raw_opt *tp = &(sk->tp_pinfo.tp_raw4); + if (sk->num == IPPROTO_ICMP) { + memset(&tp->filter, 0, sizeof(tp->filter)); + + /* By default block ECHO and TIMESTAMP requests */ + + set_bit(ICMP_ECHO, &tp->filter); + set_bit(ICMP_TIMESTAMP, &tp->filter); + } + return 0; +} + +static int raw_seticmpfilter(struct sock *sk, char *optval, int optlen) +{ + if (optlen > sizeof(struct icmp_filter)) + optlen = sizeof(struct icmp_filter); + if (copy_from_user(&sk->tp_pinfo.tp_raw4.filter, optval, optlen)) + return -EFAULT; + return 0; +} + +static int raw_geticmpfilter(struct sock *sk, char *optval, int *optlen) +{ + int len; + + if (get_user(len,optlen)) + return -EFAULT; + if (len > sizeof(struct icmp_filter)) + len = sizeof(struct icmp_filter); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &sk->tp_pinfo.tp_raw4.filter, len)) + return -EFAULT; + return 0; +} + +static int raw_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + if (level != SOL_RAW) + return ip_setsockopt(sk, level, optname, optval, optlen); + + switch (optname) { + case ICMP_FILTER: + if (sk->num != IPPROTO_ICMP) + return -EOPNOTSUPP; + return 
raw_seticmpfilter(sk, optval, optlen); + }; + + return -ENOPROTOOPT; +} + +static int raw_getsockopt(struct sock *sk, int level, int optname, + char *optval, int *optlen) +{ + if (level != SOL_RAW) + return ip_getsockopt(sk, level, optname, optval, optlen); + + switch (optname) { + case ICMP_FILTER: + if (sk->num != IPPROTO_ICMP) + return -EOPNOTSUPP; + return raw_geticmpfilter(sk, optval, optlen); + }; + + return -ENOPROTOOPT; +} struct proto raw_prot = { (struct sock *)&raw_prot, /* sklist_next */ @@ -516,11 +544,11 @@ struct proto raw_prot = { #else NULL, /* ioctl */ #endif - NULL, /* init */ + raw_init, /* init */ NULL, /* destroy */ NULL, /* shutdown */ - ip_setsockopt, /* setsockopt */ - ip_getsockopt, /* getsockopt */ + raw_setsockopt, /* setsockopt */ + raw_getsockopt, /* getsockopt */ raw_sendmsg, /* sendmsg */ raw_recvmsg, /* recvmsg */ raw_bind, /* bind */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b55fb7666..046c60beb 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -5,7 +5,7 @@ * * ROUTE - implementation of the IP router. * - * Version: @(#)route.c 1.0.14 05/31/93 + * Version: $Id: route.c,v 1.33 1997/10/24 17:16:08 kuznet Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -68,27 +68,27 @@ #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> -#include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/init.h> -#include <net/ip.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#include <linux/pkt_sched.h> +#include <linux/mroute.h> #include <net/protocol.h> +#include <net/ip.h> #include <net/route.h> +#include <net/sock.h> +#include <net/ip_fib.h> #include <net/arp.h> #include <net/tcp.h> -#include <linux/skbuff.h> -#include <net/sock.h> #include <net/icmp.h> -#include <linux/net_alias.h> - -/* Compile time configuretion flags */ -#define CONFIG_IP_LOCAL_RT_POLICY 1 +#define RTprint(a...) printk(KERN_DEBUG a) -static void rt_run_flush(unsigned long); - static struct timer_list rt_flush_timer = - { NULL, NULL, RT_FLUSH_DELAY, 0L, rt_run_flush }; + { NULL, NULL, RT_FLUSH_DELAY, 0L, NULL }; /* * Interface to generic destination cache. @@ -108,6 +108,24 @@ struct dst_ops ipv4_dst_ops = ipv4_dst_destroy }; +__u8 ip_tos2prio[16] = { + TC_PRIO_FILLER, + TC_PRIO_BESTEFFORT, + TC_PRIO_FILLER, + TC_PRIO_FILLER, + TC_PRIO_BULK, + TC_PRIO_FILLER, + TC_PRIO_BULK, + TC_PRIO_FILLER, + TC_PRIO_INTERACTIVE, + TC_PRIO_FILLER, + TC_PRIO_INTERACTIVE, + TC_PRIO_FILLER, + TC_PRIO_INTERACTIVE_BULK, + TC_PRIO_FILLER, + TC_PRIO_INTERACTIVE_BULK, + TC_PRIO_FILLER +}; /* * Route cache. @@ -162,8 +180,10 @@ static int rt_cache_get_info(char *buffer, char **start, off_t offset, int lengt r->u.dst.dev ? 
r->u.dst.dev->name : "*", (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway, - r->rt_flags, atomic_read(&r->u.dst.refcnt), - atomic_read(&r->u.dst.use), 0, + r->rt_flags, + atomic_read(&r->u.dst.use), + atomic_read(&r->u.dst.refcnt), + 0, (unsigned long)r->rt_src, (int)r->u.dst.pmtu, r->u.dst.window, (int)r->u.dst.rtt, r->key.tos, @@ -202,8 +222,6 @@ void ip_rt_check_expire() struct rtable *rth, **rthp; unsigned long now = jiffies; - start_bh_atomic(); - for (i=0; i<RT_HASH_DIVISOR/5; i++) { rover = (rover + 1) & (RT_HASH_DIVISOR-1); rthp = &rt_hash_table[rover]; @@ -229,61 +247,24 @@ void ip_rt_check_expire() if (!rth_next) break; - /* - * Pseudo-LRU ordering. - * Really we should teach it to move - * rarely used but permanently living entries - * (f.e. rdisc, igmp etc.) to the end of list. - */ - if ( rth_next->u.dst.lastuse - rth->u.dst.lastuse > RT_CACHE_BUBBLE_THRESHOLD || (rth->u.dst.lastuse - rth_next->u.dst.lastuse < 0 && - atomic_read(&rth->u.dst.use) < atomic_read(&rth_next->u.dst.use))) { + atomic_read(&rth->u.dst.refcnt) < atomic_read(&rth_next->u.dst.refcnt))) { #if RT_CACHE_DEBUG >= 2 printk("rt_check_expire bubbled %02x@%08x<->%08x\n", rover, rth->rt_dst, rth_next->rt_dst); #endif *rthp = rth_next; rth->u.rt_next = rth_next->u.rt_next; rth_next->u.rt_next = rth; - sti(); rthp = &rth_next->u.rt_next; continue; } rthp = &rth->u.rt_next; } } - - end_bh_atomic(); -} - - -void rt_cache_flush(int how) -{ - start_bh_atomic(); - if (rt_flush_timer.expires) { - if (jiffies - rt_flush_timer.expires > 0 || - rt_flush_timer.expires - jiffies > RT_FLUSH_DELAY/2) - how = 1; - } - if (how) { - if (rt_flush_timer.expires) - del_timer(&rt_flush_timer); - rt_flush_timer.expires = 0; - end_bh_atomic(); - rt_run_flush(0); - return; - } - if (rt_flush_timer.expires) { - end_bh_atomic(); - return; - } - del_timer(&rt_flush_timer); - rt_flush_timer.expires = jiffies + RT_FLUSH_DELAY; - add_timer(&rt_flush_timer); - end_bh_atomic(); } - -void rt_run_flush(unsigned long 
dummy) + +static void rt_run_flush(unsigned long dummy) { int i; struct rtable * rth, * next; @@ -313,6 +294,30 @@ void rt_run_flush(unsigned long dummy) #endif } } + +void rt_cache_flush(int delay) +{ + start_bh_atomic(); + if (delay && rt_flush_timer.function && + rt_flush_timer.expires - jiffies < delay) { + end_bh_atomic(); + return; + } + if (rt_flush_timer.function) { + del_timer(&rt_flush_timer); + rt_flush_timer.function = NULL; + } + if (delay == 0) { + end_bh_atomic(); + rt_run_flush(0); + return; + } + rt_flush_timer.function = rt_run_flush; + rt_flush_timer.expires = jiffies + delay; + add_timer(&rt_flush_timer); + end_bh_atomic(); +} + static void rt_garbage_collect(void) { @@ -327,7 +332,7 @@ static void rt_garbage_collect(void) /* * Garbage collection is pretty expensive, - * do not make it too frequently. + * do not make it too frequently, but just increase expire strength. */ if (now - last_gc < 1*HZ) { expire >>= 1; @@ -342,7 +347,7 @@ static void rt_garbage_collect(void) continue; for (rthp=&rt_hash_table[i]; (rth=*rthp); rthp=&rth->u.rt_next) { if (atomic_read(&rth->u.dst.use) || - (now - rth->u.dst.lastuse > expire)) + now - rth->u.dst.lastuse < expire) continue; atomic_dec(&rt_cache_size); *rthp = rth->u.rt_next; @@ -465,115 +470,94 @@ static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt, u16 prot void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, u32 saddr, u8 tos, struct device *dev) { - int i; - int off_link = 0; - struct fib_info *fi; + int i, k; + struct in_device *in_dev = dev->ip_ptr; struct rtable *rth, **rthp; - u32 skeys[2] = { saddr, 0, }; - struct device *pdev = net_alias_main_dev(dev); + u32 skeys[2] = { saddr, 0 }; + int ikeys[2] = { dev->ifindex, 0 }; tos &= IPTOS_TOS_MASK; - if (new_gw == old_gw || !ipv4_config.accept_redirects + if (!in_dev || new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw)) goto reject_redirect; - if 
((new_gw^dev->pa_addr)&dev->pa_mask) - off_link = 1; - - if (!ipv4_config.rfc1620_redirects) { - if (off_link) + if (!IN_DEV_SHARED_MEDIA(in_dev)) { + if (ip_fib_check_default(new_gw, dev)) goto reject_redirect; - if (ipv4_config.secure_redirects && ip_fib_chk_default_gw(new_gw, dev)) + } else { + if (inet_addr_type(new_gw) != RTN_UNICAST) goto reject_redirect; } - fi = fib_lookup_info(new_gw, 0, 0, &loopback_dev, NULL); - if (fi == NULL || fi->fib_flags&(RTF_LOCAL|RTF_BROADCAST|RTF_NAT)) - goto reject_redirect; - for (i=0; i<2; i++) { - unsigned hash = rt_hash_code(daddr, skeys[i], tos); + for (k=0; k<2; k++) { + unsigned hash = rt_hash_code(daddr, skeys[i]^(ikeys[k]<<5), tos); - rthp=&rt_hash_table[hash]; + rthp=&rt_hash_table[hash]; - while ( (rth = *rthp) != NULL) { - struct rtable *rt; + while ( (rth = *rthp) != NULL) { + struct rtable *rt; - if (rth->key.dst != daddr || - rth->key.src != skeys[i] || - rth->key.tos != tos || - rth->key.dst_dev != NULL || - rth->key.src_dev != NULL) { - rthp = &rth->u.rt_next; - continue; - } + if (rth->key.dst != daddr || + rth->key.src != skeys[i] || + rth->key.tos != tos || + rth->key.oif != ikeys[k] || + rth->key.iif != 0) { + rthp = &rth->u.rt_next; + continue; + } - if (rth->rt_dst != daddr || - rth->rt_src != saddr || - rth->rt_flags&RTF_REJECT || - rth->rt_gateway != old_gw || - rth->u.dst.dev != dev) - break; + if (rth->rt_dst != daddr || + rth->rt_src != saddr || + rth->u.dst.error || + rth->rt_gateway != old_gw || + rth->u.dst.dev != dev) + break; - rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); - if (rt == NULL) - return; + rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); + if (rt == NULL) + return; - /* - * Copy all the information. 
- */ - atomic_set(&rt->u.dst.refcnt, 1); - rt->u.dst.dev = dev; - rt->u.dst.input = rth->u.dst.input; - rt->u.dst.output = rth->u.dst.output; - rt->u.dst.pmtu = dev->mtu; - rt->u.dst.rtt = TCP_TIMEOUT_INIT; - rt->u.dst.window = 0; - atomic_set(&rt->u.dst.use, 1); - rt->u.dst.lastuse = jiffies; - - rt->rt_flags = rth->rt_flags|RTF_DYNAMIC|RTF_MODIFIED; - rt->rt_flags &= ~RTF_GATEWAY; - if (new_gw != daddr) - rt->rt_flags |= RTF_GATEWAY; - - rt->rt_src = rth->rt_src; - rt->rt_dst = rth->rt_dst; - rt->rt_src_dev = rth->rt_src_dev; - rt->rt_spec_dst = rth->rt_spec_dst; - rt->key = rth->key; - - /* But gateway is different ... */ - rt->rt_gateway = new_gw; - - if (off_link) { - if (fi->fib_dev != dev && - net_alias_main_dev(fi->fib_dev) == pdev) - rt->u.dst.dev = fi->fib_dev; - } + /* + * Copy all the information. + */ + *rt = *rth; + atomic_set(&rt->u.dst.refcnt, 1); + atomic_set(&rt->u.dst.use, 1); + rt->u.dst.lastuse = jiffies; + rt->u.dst.neighbour = NULL; + rt->u.dst.hh = NULL; + + rt->rt_flags |= RTCF_REDIRECTED; + + /* Gateway is different ... */ + rt->rt_gateway = new_gw; + + if (!rt_ll_bind(rt)) { + ip_rt_put(rt); + rt_free(rt); + break; + } - if (ipv4_config.rfc1620_redirects && !rt_ll_bind(rt)) { + *rthp = rth->u.rt_next; + rt_free(rth); + rt = rt_intern_hash(hash, rt, ETH_P_IP); ip_rt_put(rt); - rt_free(rt); break; } - - *rthp = rth->u.rt_next; - rt_free(rth); - rt = rt_intern_hash(hash, rt, ETH_P_IP); - ip_rt_put(rt); - break; } } return; reject_redirect: +#ifdef CONFIG_IP_ROUTE_VERBOSE if (ipv4_config.log_martians && net_ratelimit()) printk(KERN_INFO "Redirect from %lX/%s to %lX ignored." 
"Path = %lX -> %lX, tos %02x\n", ntohl(old_gw), dev->name, ntohl(new_gw), ntohl(saddr), ntohl(daddr), tos); +#endif } @@ -585,7 +569,7 @@ void ip_rt_advice(struct rtable **rp, int advice) return; start_bh_atomic(); - if ((rt = *rp) != NULL && (rt->rt_flags&(RTF_DYNAMIC|RTF_MODIFIED))) { + if ((rt = *rp) != NULL && (rt->rt_flags&RTCF_REDIRECTED)) { #if RT_CACHE_DEBUG >= 1 printk(KERN_DEBUG "ip_rt_advice: redirect to %08x/%02x dropped\n", rt->rt_dst, rt->key.tos); #endif @@ -602,7 +586,7 @@ void ip_rt_advice(struct rtable **rp, int advice) * 1. The first RT_REDIRECT_NUMBER redirects are sent * with exponential backoff, then we stop sending them at all, * assuming that the host ignores our redirects. - * 2. If we did not see a packets requiring redirects + * 2. If we did not see packets requiring redirects * during RT_REDIRECT_SILENCE, we assume that the host * forgot redirected route and start to send redirects again. * @@ -637,9 +621,12 @@ void ip_rt_send_redirect(struct sk_buff *skb) if (jiffies - rt->last_error > (RT_REDIRECT_LOAD<<rt->errors)) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); rt->last_error = jiffies; - if (ipv4_config.log_martians && ++rt->errors == RT_REDIRECT_NUMBER && net_ratelimit()) - printk(KERN_WARNING "host %08x/%s ignores redirects for %08x to %08x.\n", - rt->rt_src, rt->rt_src_dev->name, rt->rt_dst, rt->rt_gateway); + ++rt->errors; +#ifdef CONFIG_IP_ROUTE_VERBOSE + if (ipv4_config.log_martians && rt->errors == RT_REDIRECT_NUMBER && net_ratelimit()) + printk(KERN_WARNING "host %08x/if%d ignores redirects for %08x to %08x.\n", + rt->rt_src, rt->rt_iif, rt->rt_dst, rt->rt_gateway); +#endif } } @@ -653,6 +640,9 @@ static int ip_error(struct sk_buff *skb) default: kfree_skb(skb, FREE_READ); return 0; + case EHOSTUNREACH: + code = ICMP_HOST_UNREACH; + break; case ENETUNREACH: code = ICMP_NET_UNREACH; break; @@ -668,37 +658,24 @@ static int ip_error(struct sk_buff *skb) return 0; } +/* + * The last two values are not from the 
RFC but + * are needed for AMPRnet AX.25 paths. + */ + +static unsigned short mtu_plateau[] = +{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; static __inline__ unsigned short guess_mtu(unsigned short old_mtu) { - if (old_mtu > 32000) - return 32000; - else if (old_mtu > 17914) - return 17914; - else if (old_mtu > 8166) - return 8166; - else if (old_mtu > 4352) - return 4352; - else if (old_mtu > 2002) - return 2002; - else if (old_mtu > 1492) - return 1492; - else if (old_mtu > 576) - return 576; - else if (old_mtu > 296) - return 296; - /* - * These two are not from the RFC but - * are needed for AMPRnet AX.25 paths. - */ - else if (old_mtu > 216) - return 216; - else if (old_mtu > 128) - return 128; + int i; + + for (i = 0; i < sizeof(mtu_plateau)/sizeof(mtu_plateau[0]); i++) + if (old_mtu > mtu_plateau[i]) + return mtu_plateau[i]; return 68; } - unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) { int i; @@ -721,8 +698,8 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) rth->rt_dst == daddr && rth->rt_src == iph->saddr && rth->key.tos == tos && - !rth->key.src_dev && - !(rth->rt_flags&RTF_NOPMTUDISC)) { + rth->key.iif == 0 && + !(rth->rt_flags&RTCF_NOPMTUDISC)) { unsigned short mtu = new_mtu; if (new_mtu < 68 || new_mtu >= old_mtu) { @@ -770,177 +747,227 @@ static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst, return NULL; } -int -ip_check_mc(struct device *dev, u32 mc_addr) +static int ip_rt_bug(struct sk_buff *skb) { - struct ip_mc_list *ip_mc; + printk(KERN_DEBUG "ip_rt_bug: %08x -> %08x, %s\n", skb->nh.iph->saddr, + skb->nh.iph->daddr, skb->dev ? skb->dev->name : "?"); + kfree_skb(skb, FREE_WRITE); + return 0; +} - if (mc_addr==htonl(INADDR_ALLHOSTS_GROUP)) - return 1; +/* + We do not cache source address of outgoing interface, + because it is used only by IP RR, TS and SRR options, + so that it out of fast path. 
- for (ip_mc=dev->ip_mc_list; ip_mc; ip_mc=ip_mc->next) - if (ip_mc->multiaddr == mc_addr) - return 1; - return 0; + BTW remember: "addr" is allowed to be not aligned + in IP options! + */ + +void ip_rt_get_source(u8 *addr, struct rtable *rt) +{ + u32 src; + struct fib_result res; + + if (rt->key.iif == 0) { + memcpy(addr, &rt->rt_src, 4); + return; + } + if (fib_lookup(&rt->key, &res) == 0) { + src = FIB_RES_PREFSRC(res); + memcpy(addr, &src, 4); + return; + } + src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); + memcpy(addr, &src, 4); } -static int ip_rt_bug(struct sk_buff *skb) +static int +ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, + u8 tos, struct device *dev, int our) { - kfree_skb(skb, FREE_WRITE); - printk(KERN_DEBUG "ip_rt_bug: %08x -> %08x, %s\n", skb->nh.iph->saddr, - skb->nh.iph->daddr, skb->dev ? skb->dev->name : "?"); + unsigned hash; + struct rtable *rth; + u32 spec_dst; + struct in_device *in_dev = dev->ip_ptr; + + /* Primary sanity checks. 
*/ + + if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) || + in_dev == NULL || skb->protocol != __constant_htons(ETH_P_IP)) + return -EINVAL; + + if (ZERONET(saddr)) { + if (!LOCAL_MCAST(daddr)) + return -EINVAL; + spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); + } else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst) < 0) + return -EINVAL; + + rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); + if (!rth) + return -ENOBUFS; + + rth->u.dst.output= ip_rt_bug; + + atomic_set(&rth->u.dst.use, 1); + rth->key.dst = daddr; + rth->rt_dst = daddr; + rth->key.tos = tos; + rth->key.src = saddr; + rth->rt_src = saddr; +#ifdef CONFIG_IP_ROUTE_NAT + rth->rt_dst_map = daddr; + rth->rt_src_map = saddr; +#endif + rth->rt_iif = + rth->key.iif = dev->ifindex; + rth->u.dst.dev = &loopback_dev; + rth->key.oif = 0; + rth->rt_gateway = daddr; + rth->rt_spec_dst= spec_dst; + rth->rt_type = RTN_MULTICAST; + rth->rt_flags = RTCF_MULTICAST; + if (our) { + rth->u.dst.input= ip_local_deliver; + rth->rt_flags |= RTCF_LOCAL; + } + +#ifdef CONFIG_IP_MROUTE + if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) + rth->u.dst.input = ip_mr_input; +#endif + + hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos); + skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, 0); return 0; } /* - * This function is called ONLY FROM NET BH. No locking! - * * NOTE. We drop all the packets that has local source * addresses, because every properly looped back packet * must have correct destination already attached by output routine. * * Such approach solves two big problems: - * 1. Not simplex devices (if they exist 8)) are handled properly. + * 1. Not simplex devices are handled properly. * 2. IP spoofing attempts are filtered with 100% of guarantee. 
*/ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, - u8 tos, struct device *pdev) + u8 tos, struct device *dev) { - struct device * dev = pdev; - struct fib_info *fi = NULL; - struct fib_info *src_fi = NULL; + struct rt_key key; + struct fib_result res; + struct in_device *in_dev = dev->ip_ptr; + struct in_device *out_dev; unsigned flags = 0; - struct device *devout; struct rtable * rth; unsigned hash; - struct fib_result res; - u32 src_key = saddr; - u32 dst_key = daddr; - int err = -EINVAL; - int log = 0; + u32 spec_dst; + int err = -EINVAL; + + /* + * IP on this device is disabled. + */ - hash = rt_hash_code(daddr, saddr^(unsigned long)pdev, tos); + if (!in_dev) + return -EINVAL; - /* Check for martians... */ + key.dst = daddr; + key.src = saddr; + key.tos = tos; + key.iif = dev->ifindex; + key.oif = 0; + key.scope = RT_SCOPE_UNIVERSE; + + hash = rt_hash_code(daddr, saddr^(key.iif<<5), tos); + + /* Check for the most weird martians, which can be not detected + by fib_lookup. + */ if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr)) goto martian_source; - if (MULTICAST(daddr) || daddr == 0xFFFFFFFF) - goto mc_input; - /* Accept zero addresses only to limited broadcast/multicasts; - * I even do not know to fix it or not. + if (daddr == 0xFFFFFFFF) + goto brd_input; + + /* Accept zero addresses only to limited broadcast; + * I even do not know to fix it or not. Waiting for complains :-) */ if (ZERONET(saddr)) goto martian_source; + if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr)) goto martian_destination; /* - * Device is not yet initialized, accept all addresses as ours. + * Now we are ready to route packet. */ - if (ZERONET(dev->pa_addr)) - goto promisc_ip; - - /* - * Now we are able to route packet. 
- */ - if ((err = fib_lookup(&res, daddr, saddr, tos, pdev, NULL)) < 0) { - if (!IS_ROUTER) + if ((err = fib_lookup(&key, &res))) { + if (!IN_DEV_FORWARD(in_dev)) return -EINVAL; goto no_route; } - fi = res.f->fib_info; - flags = fi->fib_flags; - devout = fi->fib_dev; - - if (flags&RTF_NAT) { - daddr = htonl((ntohl(daddr)&((1<<res.fm)-1)))|fi->fib_gateway; - fi = fib_lookup_info(daddr, saddr, tos, pdev, NULL); - if (!fi || fi->fib_flags&(RTF_NAT|RTF_LOCAL|RTF_MULTICAST|RTF_BROADCAST)) - return -EINVAL; - devout = fi->fib_dev; - flags = fi->fib_flags|RTCF_NAT|RTF_NAT; - } +#ifdef CONFIG_IP_ROUTE_NAT + /* Policy is applied before mapping destination, + but rerouting after map should be made with old source. + */ - switch (res.fr->cl_action) { - case RTP_NAT: - /* Packet is from translated source; remember it */ - saddr = (saddr&~res.fr->cl_srcmask)|res.fr->cl_srcmap; - flags |= RTCF_NAT; - break; - case RTP_MASQUERADE: - /* Packet is from masqueraded source; remember it */ - flags |= RTCF_MASQ; - break; - default: - } - log = res.fr->cl_flags&RTRF_LOG; + if (1) { + u32 src_map = saddr; + if (res.r) + src_map = fib_rules_policy(saddr, &res, &flags); - if (!(flags & RTF_LOCAL)) { - if (!IS_ROUTER || flags&RTF_NOFORWARD) - return -EINVAL; - } else { - fi = NULL; - devout = &loopback_dev; - if (flags&RTF_BROADCAST) - goto mc_input; + if (res.type == RTN_NAT) { + key.dst = fib_rules_map_destination(daddr, &res); + if (fib_lookup(&key, &res) || res.type != RTN_UNICAST) + return -EINVAL; + flags |= RTCF_DNAT; + } + key.src = src_map; } - -#ifndef CONFIG_IP_LOCAL_RT_POLICY - if (flags&RTF_LOCAL) - src_fi = fib_lookup_info(src_key, 0, tos, &loopback_dev, NULL); - else #endif - if (fib_lookup(&res, src_key, daddr, tos, net_alias_main_dev(devout), NULL) == 0) { - src_fi = res.f->fib_info; - /* Destination is on masqueraded network: - * if it is real incoming frame, ip_forward will drop it. 
- */ - if (res.fr->cl_flags&RTRF_VALVE) - flags |= RTCF_VALVE; - } - if (src_fi) { - if (src_fi->fib_flags&(RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST|RTF_NAT)) + if (res.type == RTN_BROADCAST) + goto brd_input; + + if (res.type == RTN_LOCAL) { + spec_dst = daddr; + if (inet_addr_type(saddr) != RTN_UNICAST) goto martian_source; + goto local_input; + } - if (!(src_fi->fib_flags&RTF_GATEWAY)) - flags |= RTCF_DIRECTSRC; + if (!IN_DEV_FORWARD(in_dev)) + return -EINVAL; + if (res.type != RTN_UNICAST) + goto martian_destination; - if (net_alias_main_dev(src_fi->fib_dev) == pdev) - skb->dev = dev = src_fi->fib_dev; - else { - /* Route to packet source goes via - different interface; rfc1812 proposes - to drop them. - It is dangerous on not-stub/transit networks - because of path asymmetry. - */ - if (ipv4_config.rfc1812_filter >= 2) - goto martian_source; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (res.fi->fib_nhs > 1 && key.oif == 0) + fib_select_multipath(&key, &res); +#endif + out_dev = FIB_RES_DEV(res)->ip_ptr; - /* Weaker form of rfc1812 filtering. - If source is on directly connected network, - it can mean either local network configuration error - (the most probable case) or real IP spoofing attempt. - */ - if (ipv4_config.rfc1812_filter >= 1 && !(flags&RTCF_DIRECTSRC)) - goto martian_source; - } - } else if (ipv4_config.rfc1812_filter >= 1) + err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst); + if (err < 0) goto martian_source; -make_route: + if (err) + flags |= RTCF_DIRECTSRC; + + if (out_dev == in_dev && err && !(flags&RTCF_NAT) && + (IN_DEV_SHARED_MEDIA(out_dev) + || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res)))) + flags |= RTCF_DOREDIRECT; + if (skb->protocol != __constant_htons(ETH_P_IP)) { - /* ARP request. Do not make route for invalid destination or - * if it is redirected. + /* Not IP (i.e. ARP). Do not make route for invalid + * destination or if it is redirected. 
*/ - if (flags&(RTF_REJECT|RTF_BROADCAST|RTF_MULTICAST) || - skb->pkt_type == PACKET_OTHERHOST || - (devout == dev && !(flags&(RTF_LOCAL|RTCF_NAT)))) + if (out_dev == in_dev && flags&RTCF_DOREDIRECT) return -EINVAL; } @@ -948,147 +975,105 @@ make_route: if (!rth) return -ENOBUFS; - rth->u.dst.output= ip_rt_bug; - atomic_set(&rth->u.dst.use, 1); - rth->key.dst = dst_key; - rth->rt_dst = dst_key; - rth->rt_dst_map = daddr; + rth->key.dst = daddr; + rth->rt_dst = daddr; rth->key.tos = tos; - rth->key.src = src_key; - rth->rt_src = src_key; - rth->rt_src_map = saddr; - rth->rt_src_dev = dev; - rth->key.src_dev= pdev; - rth->u.dst.dev = devout; - rth->key.dst_dev= NULL; + rth->key.src = saddr; + rth->rt_src = saddr; rth->rt_gateway = daddr; - rth->rt_spec_dst= daddr; - - if (!(flags&RTF_REJECT)) { - if (flags&RTF_LOCAL) - rth->u.dst.input= ip_local_deliver; - if (!(flags&(RTF_NOFORWARD|RTF_BROADCAST))) { - if (flags&RTF_MULTICAST) { -#ifdef CONFIG_IP_MROUTE - if (!LOCAL_MCAST(daddr) && ipv4_config.multicast_route) { - rth->u.dst.input = ip_mr_input; - rth->u.dst.output = ip_output; - } +#ifdef CONFIG_IP_ROUTE_NAT + rth->rt_src_map = key.src; + rth->rt_dst_map = key.dst; + if (flags&RTCF_DNAT) + rth->rt_gateway = key.dst; #endif - } else if (!(flags&RTF_LOCAL)) { - rth->u.dst.input = ip_forward; - rth->u.dst.output = ip_output; - } - } - } else if (IS_ROUTER && !(flags&(RTF_MULTICAST|RTF_BROADCAST))) { - rth->u.dst.input= ip_error; - rth->u.dst.error= -err; - } - - if ((flags&(RTF_BROADCAST|RTF_MULTICAST)) || !(flags&RTF_LOCAL)) - rth->rt_spec_dst= dev->pa_addr; + rth->rt_iif = + rth->key.iif = dev->ifindex; + rth->u.dst.dev = out_dev->dev; + rth->key.oif = 0; + rth->rt_spec_dst= spec_dst; - if (fi) { - rth->u.dst.pmtu = fi->fib_mtu; - rth->u.dst.window=fi->fib_window; - rth->u.dst.rtt = fi->fib_irtt; - if (flags & RTF_GATEWAY) - rth->rt_gateway = fi->fib_gateway; - } else { - rth->u.dst.pmtu = devout->mtu; - rth->u.dst.window=0; - rth->u.dst.rtt = TCP_TIMEOUT_INIT; - } 
+ rth->u.dst.input = ip_forward; + rth->u.dst.output = ip_output; - if (!(flags&(RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST|RTCF_NAT)) && - flags&RTCF_DIRECTSRC && - (devout == dev || (ipv4_config.rfc1620_redirects && - net_alias_main_dev(devout) == pdev))) - flags |= RTCF_DOREDIRECT; + rth->u.dst.pmtu = res.fi->fib_mtu ? : out_dev->dev->mtu; + rth->u.dst.window=res.fi->fib_window ? : 0; + rth->u.dst.rtt = res.fi->fib_rtt ? : TCP_TIMEOUT_INIT; + if (FIB_RES_GW(res) && FIB_RES_NH(res).nh_scope == RT_SCOPE_LINK) + rth->rt_gateway = FIB_RES_GW(res); rth->rt_flags = flags; + rth->rt_type = res.type; - if (log) - printk(KERN_INFO "installing route %08lX -> %08lX\n", ntohl(rth->rt_src), ntohl(rth->rt_dst)); - - if (flags&(RTF_LOCAL|RTF_MULTICAST|RTF_BROADCAST|RTF_REJECT)) { - skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, 0); - return 0; - } - skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, __constant_ntohs(skb->protocol)); + skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, ntohs(skb->protocol)); return 0; -mc_input: +brd_input: if (skb->protocol != __constant_htons(ETH_P_IP)) return -EINVAL; if (ZERONET(saddr)) { - if (!ipv4_config.bootp_agent) - goto martian_source; - flags |= RTF_NOFORWARD|RTF_LOCAL; + spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); } else { - src_fi = fib_lookup_info(saddr, 0, tos, &loopback_dev, NULL); - if (!src_fi) - goto martian_source; - - if (src_fi->fib_flags&(RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST|RTF_NAT)) + err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst); + if (err < 0) goto martian_source; - - if (!(src_fi->fib_flags&RTF_GATEWAY)) + if (err) flags |= RTCF_DIRECTSRC; - - if (!MULTICAST(daddr) || !ipv4_config.multicast_route || - LOCAL_MCAST(daddr)) { - if (net_alias_main_dev(src_fi->fib_dev) == pdev) { - skb->dev = dev = src_fi->fib_dev; - } else { - /* Fascist not-unicast filtering 8) */ - goto martian_source; - } - } - } - - if (!MULTICAST(daddr)) { - flags |= RTF_LOCAL|RTF_BROADCAST|RTF_NOFORWARD; - 
devout = dev; - goto make_route; } + flags |= RTCF_BROADCAST; - flags |= RTF_MULTICAST|RTF_LOCAL; +local_input: + rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); + if (!rth) + return -ENOBUFS; - if (ip_check_mc(dev, daddr) == 0) { - flags &= ~RTF_LOCAL; + rth->u.dst.output= ip_rt_bug; - if (!ipv4_config.multicast_route || !(dev->flags&IFF_ALLMULTI)) - goto no_route; + atomic_set(&rth->u.dst.use, 1); + rth->key.dst = daddr; + rth->rt_dst = daddr; + rth->key.tos = tos; + rth->key.src = saddr; + rth->rt_src = saddr; +#ifdef CONFIG_IP_ROUTE_NAT + rth->rt_dst_map = key.dst; + rth->rt_src_map = key.src; +#endif + rth->rt_iif = + rth->key.iif = dev->ifindex; + rth->u.dst.dev = &loopback_dev; + rth->key.oif = 0; + rth->rt_gateway = daddr; + rth->rt_spec_dst= spec_dst; + rth->u.dst.input= ip_local_deliver; + if (res.type == RTN_UNREACHABLE) { + rth->u.dst.input= ip_error; + rth->u.dst.error= err; } - devout = dev; - goto make_route; - -promisc_ip: - flags |= RTF_LOCAL|RTF_NOFORWARD; - if (MULTICAST(daddr)) - flags |= RTF_MULTICAST; - else - flags |= RTF_BROADCAST; - devout = dev; - goto make_route; + rth->rt_flags = flags|RTCF_LOCAL; + rth->rt_type = res.type; + skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, 0); + return 0; no_route: - flags |= RTF_REJECT; - devout = dev; - goto make_route; + spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + res.type = RTN_UNREACHABLE; + goto local_input; /* * Do not cache martian addresses: they should be logged (RFC1812) */ martian_destination: +#ifdef CONFIG_IP_ROUTE_VERBOSE if (ipv4_config.log_martians && net_ratelimit()) printk(KERN_WARNING "martian destination %08x from %08x, dev %s\n", daddr, saddr, dev->name); +#endif return -EINVAL; martian_source: +#ifdef CONFIG_IP_ROUTE_VERBOSE if (ipv4_config.log_martians && net_ratelimit()) { /* * RFC1812 recommenadtion, if source is martian, @@ -1104,6 +1089,7 @@ martian_source: printk("\n"); } } +#endif return -EINVAL; } @@ -1112,224 +1098,298 @@ int 
ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr, { struct rtable * rth; unsigned hash; - - if (skb->dst) - return 0; - -#if RT_CACHE_DEBUG >= 1 - if (dev->flags & IFF_LOOPBACK) { - printk(KERN_DEBUG "ip_route_input: bug: packet is looped back\n"); - return -EINVAL; - } - if (net_alias_main_dev(dev) != dev) - printk(KERN_DEBUG "ip_route_input: bug: packet is received on alias %s\n", dev->name); -#endif + int iif = dev->ifindex; tos &= IPTOS_TOS_MASK; - hash = rt_hash_code(daddr, saddr^(unsigned long)dev, tos); - skb->dev = dev; + hash = rt_hash_code(daddr, saddr^(iif<<5), tos); for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) { if (rth->key.dst == daddr && rth->key.src == saddr && - rth->key.src_dev == dev && - rth->key.dst_dev == NULL && + rth->key.iif == iif && + rth->key.oif == 0 && rth->key.tos == tos) { rth->u.dst.lastuse = jiffies; atomic_inc(&rth->u.dst.use); atomic_inc(&rth->u.dst.refcnt); skb->dst = (struct dst_entry*)rth; - skb->dev = rth->rt_src_dev; return 0; } } + + /* Multicast recognition logic is moved from route cache to here. + The problem was that too many ethernet cards have broken/missing + hardware multicast filters :-( As result the host on multicasting + network acquires a lot of useless route cache entries, sort of + SDR messages from all the world. Now we try to get rid of them. + Really, provided software IP multicast filter is organized + reasonably (at least, hashed), it does not result in a slowdown + comparing with route cache reject entries. + Note, that multicast routers are not affected, because + route cache entry is created eventually. 
+ */ + if (MULTICAST(daddr)) { + int our = ip_check_mc(dev, daddr); + if (!our +#ifdef CONFIG_IP_MROUTE + && (LOCAL_MCAST(daddr) || !dev->ip_ptr || + !IN_DEV_MFORWARD((struct in_device*)dev->ip_ptr)) +#endif + ) return -EINVAL; + return ip_route_input_mc(skb, daddr, saddr, tos, dev, our); + } return ip_route_input_slow(skb, daddr, saddr, tos, dev); } - /* * Major route resolver routine. */ -int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, - struct device *dev_out) +int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, int oif) { - u32 src_key = saddr; - u32 dst_key = daddr; - u32 dst_map; - struct device *dst_dev_key = dev_out; + struct rt_key key; + struct fib_result res; unsigned flags = 0; - struct fib_info *fi = NULL; struct rtable *rth; -#ifdef CONFIG_IP_LOCAL_RT_POLICY - struct fib_result res; -#endif + struct device *dev_out = NULL; unsigned hash; tos &= IPTOS_TOS_MASK|1; + key.dst = daddr; + key.src = saddr; + key.tos = tos&IPTOS_TOS_MASK; + key.iif = loopback_dev.ifindex; + key.oif = oif; + key.scope = (tos&1) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE; + res.fi = NULL; if (saddr) { - if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr) || - __ip_chk_addr(saddr) != IS_MYADDR) + if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr)) return -EINVAL; - if (dev_out == NULL && (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) - dev_out = ip_dev_find(saddr, NULL); + + /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ + dev_out = ip_dev_find(saddr); + if (dev_out == NULL) + return -EINVAL; + + /* I removed check for oif == dev_out->oif here. + It was wrong by three reasons: + 1. ip_dev_find(saddr) can return wrong iface, if saddr is + assigned to multiple interfaces. + 2. Moreover, we are allowed to send packets with saddr + of another iface. 
--ANK + */ + + if (oif == 0 && (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) { + /* Special hack: user can direct multicasts + and limited broadcast via necessary interface + without fiddling with IP_MULTICAST_IF or IP_TXINFO. + This hack is not just for fun, it allows + vic,vat and friends to work. + They bind socket to loopback, set ttl to zero + and expect that it will work. + From the viewpoint of routing cache they are broken, + because we are not allowed to build multicast path + with loopback source addr (look, routing cache + cannot know, that ttl is zero, so that packet + will not leave this host and route is valid). + Luckily, this hack is good workaround. + */ + + key.oif = dev_out->ifindex; + goto make_route; + } + dev_out = NULL; } - if (!daddr) - daddr = saddr; - - if (dev_out) { - if (!saddr) { - saddr = dev_out->pa_addr; - if (!daddr) - daddr = saddr; + if (oif) { + dev_out = dev_get_by_index(oif); + if (dev_out == NULL) + return -ENODEV; + if (dev_out->ip_ptr == NULL) + return -ENODEV; /* Wrong error code */ + + if (LOCAL_MCAST(daddr) || daddr == 0xFFFFFFFF) { + key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); + goto make_route; } - dst_map = daddr; - if (MULTICAST(daddr) || daddr == 0xFFFFFFFF) + if (MULTICAST(daddr)) { + key.src = inet_select_addr(dev_out, 0, key.scope); goto make_route; + } + if (!daddr) + key.src = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } - if (!daddr) - daddr = htonl(INADDR_LOOPBACK); + if (!key.dst) { + key.dst = key.src; + if (!key.dst) + key.dst = key.src = htonl(INADDR_LOOPBACK); + dev_out = &loopback_dev; + key.oif = loopback_dev.ifindex; + flags |= RTCF_LOCAL; + goto make_route; + } -#ifdef CONFIG_IP_LOCAL_RT_POLICY - if (fib_lookup(&res, daddr, saddr, tos, &loopback_dev, dev_out)) + if (fib_lookup(&key, &res)) { + res.fi = NULL; + if (oif) { + /* Apparently, routing tables are wrong. Assume, + that the destination is on link. + + WHY? DW. 
+ Because we are allowed to send to iface + even if it has NO routes and NO assigned + addresses. When oif is specified, routing + tables are looked up with only one purpose: + to catch if destination is gatewayed, rather than + direct. Moreover, if MSG_DONTROUTE is set, + we send packet, no matter of routing tables + of ifaddr state. --ANK + + + We could make it even if oif is unknown, + likely IPv6, but we do not. + */ + + printk(KERN_DEBUG "Dest not on link. Forcing...\n"); + if (key.src == 0) + key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); + goto make_route; + } return -ENETUNREACH; - fi = res.f->fib_info; - dst_map = daddr; + } - if (fi->fib_flags&RTF_NAT) + if (res.type == RTN_NAT) return -EINVAL; - if (!saddr) { - saddr = fi->fib_dev->pa_addr; + if (!key.src) { + key.src = FIB_RES_PREFSRC(res); + +#ifdef CONFIG_IP_MULTIPLE_TABLES /* * "Stabilization" of route. * This step is necessary, if locally originated packets - * are subjected to source routing, else we could get + * are subjected to policy routing, otherwise we could get * route flapping. 
*/ - fi = fib_lookup_info(dst_map, saddr, tos, &loopback_dev, dev_out); - if (!fi) + if (fib_lookup(&key, &res)) return -ENETUNREACH; +#endif } -#else - fi = fib_lookup_info(daddr, 0, tos, &loopback_dev, dev_out); - if (!fi) - return -ENETUNREACH; - - if (fi->fib_flags&RTF_NAT) - return -EINVAL; - dst_map = daddr; - if (!saddr) - saddr = fi->fib_dev->pa_addr; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (res.fi->fib_nhs > 1 && key.oif == 0) + fib_select_multipath(&key, &res); #endif - flags |= fi->fib_flags; - dev_out = fi->fib_dev; + dev_out = FIB_RES_DEV(res); - if (RT_LOCALADDR(flags)) { + if (res.type == RTN_LOCAL) { dev_out = &loopback_dev; - fi = NULL; + key.oif = dev_out->ifindex; + res.fi = NULL; + flags |= RTCF_LOCAL; } - if (dst_dev_key && dev_out != dst_dev_key) - return -EINVAL; + key.oif = dev_out->ifindex; make_route: - if (LOOPBACK(saddr) && !(dev_out->flags&IFF_LOOPBACK)) { - printk(KERN_DEBUG "this guy talks to %08x from loopback\n", daddr); + if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK)) { + printk(KERN_DEBUG "this guy talks to %08x from loopback\n", key.dst); return -EINVAL; } - if (daddr == 0xFFFFFFFF) - flags |= RTF_BROADCAST; - else if (MULTICAST(daddr)) - flags |= RTF_MULTICAST; - else if (BADCLASS(daddr) || ZERONET(daddr)) + if (key.dst == 0xFFFFFFFF) + res.type = RTN_BROADCAST; + else if (MULTICAST(key.dst)) + res.type = RTN_MULTICAST; + else if (BADCLASS(key.dst) || ZERONET(key.dst)) return -EINVAL; - if (flags&RTF_BROADCAST && (dev_out->flags&IFF_LOOPBACK || - !(dev_out->flags&IFF_BROADCAST))) - flags &= ~RTF_LOCAL; - else if (flags&RTF_MULTICAST) { + if (res.type == RTN_BROADCAST) { + flags |= RTCF_BROADCAST; + if (!(dev_out->flags&IFF_LOOPBACK) && dev_out->flags&IFF_BROADCAST) + flags |= RTCF_LOCAL; + } else if (res.type == RTN_MULTICAST) { + flags |= RTCF_MULTICAST; if (ip_check_mc(dev_out, daddr)) - flags |= RTF_LOCAL; + flags |= RTCF_LOCAL; } - + rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); if (!rth) return -ENOBUFS; 
atomic_set(&rth->u.dst.use, 1); - rth->key.dst = dst_key; + rth->key.dst = daddr; rth->key.tos = tos; - rth->key.src = src_key; - rth->key.src_dev= NULL; - rth->key.dst_dev= dst_dev_key; - rth->rt_dst = daddr; - rth->rt_dst_map = dst_map; - rth->rt_src = saddr; - rth->rt_src_map = saddr; - rth->rt_src_dev = dev_out; + rth->key.src = saddr; + rth->key.iif = 0; + rth->key.oif = oif; + rth->rt_dst = key.dst; + rth->rt_src = key.src; +#ifdef CONFIG_IP_ROUTE_NAT + rth->rt_dst_map = key.dst; + rth->rt_src_map = key.src; +#endif + rth->rt_iif = dev_out->ifindex; rth->u.dst.dev = dev_out; - rth->rt_gateway = dst_map; - rth->rt_spec_dst= dev_out->pa_addr; + rth->rt_gateway = key.dst; + rth->rt_spec_dst= key.src; rth->u.dst.output=ip_output; - if (flags&RTF_LOCAL) { + if (flags&RTCF_LOCAL) { rth->u.dst.input = ip_local_deliver; - rth->rt_spec_dst = daddr; + rth->rt_spec_dst = key.dst; } - if (flags&(RTF_BROADCAST|RTF_MULTICAST)) { - rth->rt_spec_dst = dev_out->pa_addr; - flags &= ~RTF_GATEWAY; - if (flags&RTF_LOCAL) + if (flags&(RTCF_BROADCAST|RTCF_MULTICAST)) { + rth->rt_spec_dst = key.src; + if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK)) rth->u.dst.output = ip_mc_output; - if (flags&RTF_MULTICAST) { - if (dev_out->flags&IFF_ALLMULTI) - rth->u.dst.output = ip_mc_output; #ifdef CONFIG_IP_MROUTE - if (ipv4_config.multicast_route && !LOCAL_MCAST(daddr)) + if (res.type == RTN_MULTICAST && dev_out->ip_ptr) { + struct in_device *in_dev = dev_out->ip_ptr; + if (IN_DEV_MFORWARD(in_dev) && !LOCAL_MCAST(daddr)) { rth->u.dst.input = ip_mr_input; -#endif + rth->u.dst.output = ip_mc_output; + } } +#endif } - if (fi) { - if (flags&RTF_GATEWAY) - rth->rt_gateway = fi->fib_gateway; - rth->u.dst.pmtu = fi->fib_mtu; - rth->u.dst.window=fi->fib_window; - rth->u.dst.rtt = fi->fib_irtt; + if (res.fi) { + if (FIB_RES_GW(res) && FIB_RES_NH(res).nh_scope == RT_SCOPE_LINK) + rth->rt_gateway = FIB_RES_GW(res); + rth->u.dst.pmtu = res.fi->fib_mtu ? 
: dev_out->mtu; + rth->u.dst.window=res.fi->fib_window ? : 0; + rth->u.dst.rtt = res.fi->fib_rtt ? : TCP_TIMEOUT_INIT; } else { rth->u.dst.pmtu = dev_out->mtu; rth->u.dst.window=0; rth->u.dst.rtt = TCP_TIMEOUT_INIT; } rth->rt_flags = flags; - hash = rt_hash_code(dst_key, dst_dev_key ? src_key^(dst_dev_key->ifindex<<5) : src_key, tos); + rth->rt_type = res.type; + hash = rt_hash_code(daddr, saddr^(oif<<5), tos); *rp = rt_intern_hash(hash, rth, ETH_P_IP); return 0; } -int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, struct device *dev_out) +int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, int oif) { unsigned hash; struct rtable *rth; - hash = rt_hash_code(daddr, dev_out ? saddr^(dev_out->ifindex<<5) - : saddr, tos); + hash = rt_hash_code(daddr, saddr^(oif<<5), tos); start_bh_atomic(); for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) { if (rth->key.dst == daddr && rth->key.src == saddr && - rth->key.src_dev == NULL && - rth->key.dst_dev == dev_out && + rth->key.iif == 0 && + rth->key.oif == oif && rth->key.tos == tos) { rth->u.dst.lastuse = jiffies; atomic_inc(&rth->u.dst.use); @@ -1341,48 +1401,126 @@ int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, struct dev } end_bh_atomic(); - return ip_route_output_slow(rp, daddr, saddr, tos, dev_out); + return ip_route_output_slow(rp, daddr, saddr, tos, oif); } -int ip_route_output_dev(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, int ifindex) +#ifdef CONFIG_RTNETLINK + +int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) { - unsigned hash; - struct rtable *rth; - struct device *dev_out; + struct kern_rta *rta = arg; + struct rtmsg *rtm = NLMSG_DATA(nlh); + struct rtable *rt = NULL; + u32 dst = 0; + u32 src = 0; + int err; + struct sk_buff *skb; + u8 *o; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + return -ENOBUFS; - hash = rt_hash_code(daddr, saddr^(ifindex<<5), tos); + /* Reserve room for dummy headers, 
this skb can pass + through good chunk of routing engine. + */ + skb->mac.raw = skb->data; + skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); + + if (rta->rta_dst) + memcpy(&dst, rta->rta_dst, 4); + if (rta->rta_src) + memcpy(&src, rta->rta_src, 4); + + if (rta->rta_iif) { + struct device *dev; + dev = dev_get_by_index(*rta->rta_iif); + if (!dev) + return -ENODEV; + skb->protocol = __constant_htons(ETH_P_IP); + skb->dev = dev; + start_bh_atomic(); + err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); + end_bh_atomic(); + rt = (struct rtable*)skb->dst; + if (!err && rt->u.dst.error) + err = rt->u.dst.error; + } else { + err = ip_route_output(&rt, dst, src, rtm->rtm_tos, + rta->rta_oif ? *rta->rta_oif : 0); + } + if (err) { + kfree_skb(skb, FREE_WRITE); + return err; + } - start_bh_atomic(); - for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) { - if (rth->key.dst == daddr && - rth->key.src == saddr && - rth->key.src_dev == NULL && - rth->key.tos == tos && - rth->key.dst_dev && - rth->key.dst_dev->ifindex == ifindex) { - rth->u.dst.lastuse = jiffies; - atomic_inc(&rth->u.dst.use); - atomic_inc(&rth->u.dst.refcnt); - end_bh_atomic(); - *rp = rth; - return 0; + skb->dst = &rt->u.dst; + if (rtm->rtm_flags & RTM_F_NOTIFY) + rt->rt_flags |= RTCF_NOTIFY; + + nlh = NLMSG_PUT(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, + RTM_NEWROUTE, sizeof(*rtm)); + rtm = NLMSG_DATA(nlh); + nlh->nlmsg_flags = 0; + rtm->rtm_family = AF_INET; + rtm->rtm_dst_len = 32; + rtm->rtm_src_len = 32; + rtm->rtm_tos = rt->key.tos; + rtm->rtm_table = RT_TABLE_MAIN; + rtm->rtm_type = rt->rt_type; + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + rtm->rtm_protocol = RTPROT_UNSPEC; + rtm->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED; + rtm->rtm_nhs = 0; + + o = skb->tail; + RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst); + RTA_PUT(skb, RTA_SRC, 4, &rt->rt_src); + if (rt->u.dst.dev) + RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex); + if (rt->rt_dst != rt->rt_gateway) + RTA_PUT(skb, RTA_GATEWAY, 
4, &rt->rt_gateway); + RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &rt->u.dst.pmtu); + RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &rt->u.dst.window); + RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &rt->u.dst.rtt); + RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst); + rtm->rtm_optlen = skb->tail - o; + if (rta->rta_iif) { +#ifdef CONFIG_IP_MROUTE + if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_config.multicast_route) { + NETLINK_CB(skb).pid = NETLINK_CB(in_skb).pid; + err = ipmr_get_route(skb, rtm); + if (err <= 0) + return err; + } else +#endif + { + RTA_PUT(skb, RTA_IIF, 4, rta->rta_iif); + rtm->rtm_optlen = skb->tail - o; } } - end_bh_atomic(); + nlh->nlmsg_len = skb->tail - (u8*)nlh; + err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); + if (err < 0) + return err; + return 0; - dev_out = dev_get_by_index(ifindex); - if (!dev_out) - return -ENODEV; - return ip_route_output_slow(rp, daddr, saddr, tos, dev_out); +nlmsg_failure: +rtattr_failure: + kfree_skb(skb, FREE_WRITE); + return -EMSGSIZE; } -void ip_rt_multicast_event(struct device *dev) +#endif /* CONFIG_RTNETLINK */ + +void ip_rt_multicast_event(struct in_device *in_dev) { - rt_cache_flush(0); + rt_cache_flush(1*HZ); } __initfunc(void ip_rt_init(void)) { + devinet_init(); ip_fib_init(); #ifdef CONFIG_PROC_FS diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index c175f30f3..d3e018be8 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -9,7 +9,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * $Id: syncookies.c,v 1.2 1997/08/22 19:15:08 freitag Exp $ + * $Id: syncookies.c,v 1.3 1997/09/16 17:16:21 freitag Exp $ * * Missing: IPv6 support. * Some counter so that the Administrator can see when the machine @@ -200,9 +200,11 @@ cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt) * no easy way to do this. */ if (ip_route_output(&rt, - opt && opt->srr ? 
opt->faddr : - req->af.v4_req.rmt_addr,req->af.v4_req.loc_addr, - sk->ip_tos, NULL)) { + opt && + opt->srr ? opt->faddr : req->af.v4_req.rmt_addr, + req->af.v4_req.loc_addr, + sk->ip_tos, + 0)) { tcp_openreq_free(req); return NULL; } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e710235a1..f49514171 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1,6 +1,8 @@ /* * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem. * + * $Id: sysctl_net_ipv4.c,v 1.21 1997/10/17 01:21:18 davem Exp $ + * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS] */ @@ -36,16 +38,15 @@ extern int sysctl_arp_confirm_interval; extern int sysctl_arp_confirm_timeout; extern int sysctl_arp_max_pings; +/* From icmp.c */ +extern int sysctl_icmp_echo_ignore_all; +extern int sysctl_icmp_echo_ignore_broadcasts; + /* From ip_fragment.c */ extern int sysctl_ipfrag_low_thresh; extern int sysctl_ipfrag_high_thresh; extern int sysctl_ipfrag_time; -/* From igmp.c */ -extern int sysctl_igmp_max_host_report_delay; -extern int sysctl_igmp_timer_scale; -extern int sysctl_igmp_age_threshold; - extern int sysctl_tcp_cong_avoidance; extern int sysctl_tcp_hoe_retransmits; extern int sysctl_tcp_sack; @@ -65,6 +66,13 @@ extern int sysctl_tcp_stdurg; extern int sysctl_tcp_syn_taildrop; extern int sysctl_max_syn_backlog; +/* From icmp.c */ +extern int sysctl_icmp_sourcequench_time; +extern int sysctl_icmp_destunreach_time; +extern int sysctl_icmp_timeexceed_time; +extern int sysctl_icmp_paramprob_time; +extern int sysctl_icmp_echoreply_time; + int tcp_retr1_max = 255; extern int tcp_sysctl_congavoid(ctl_table *ctl, int write, struct file * filp, @@ -77,6 +85,7 @@ struct ipv4_config ipv4_config = { 1, 1, 1, 0, }; struct ipv4_config ipv4_def_router_config = { 0, 1, 1, 1, 1, 1, 1, }; struct ipv4_config ipv4_def_host_config = { 1, 1, 1, 0, }; +static int ipv4_sysctl_forwarding(ctl_table *ctl, int write, struct file 
* filp, void *buffer, size_t *lenp) { @@ -95,6 +104,15 @@ int ipv4_sysctl_forwarding(ctl_table *ctl, int write, struct file * filp, return ret; } +static +int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp, + void *buffer, size_t *lenp) +{ + if (write) + rt_cache_flush(0); + return 0; +} + ctl_table ipv4_table[] = { {NET_IPV4_ARP_RES_TIME, "arp_res_time", &sysctl_arp_res_time, sizeof(int), 0644, NULL, &proc_dointvec}, @@ -147,17 +165,17 @@ ctl_table ipv4_table[] = { {NET_IPV4_SOURCE_ROUTE, "ip_source_route", &ipv4_config.source_route, sizeof(int), 0644, NULL, &proc_dointvec}, - {NET_IPV4_ADDRMASK_AGENT, "ip_addrmask_agent", - &ipv4_config.addrmask_agent, sizeof(int), 0644, NULL, + {NET_IPV4_SEND_REDIRECTS, "ip_send_redirects", + &ipv4_config.send_redirects, sizeof(int), 0644, NULL, &proc_dointvec}, - {NET_IPV4_BOOTP_AGENT, "ip_bootp_agent", - &ipv4_config.bootp_agent, sizeof(int), 0644, NULL, + {NET_IPV4_AUTOCONFIG, "ip_autoconfig", + &ipv4_config.autoconfig, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_BOOTP_RELAY, "ip_bootp_relay", &ipv4_config.bootp_relay, sizeof(int), 0644, NULL, &proc_dointvec}, - {NET_IPV4_FIB_MODEL, "ip_fib_model", - &ipv4_config.fib_model, sizeof(int), 0644, NULL, + {NET_IPV4_PROXY_ARP, "ip_proxy_arp", + &ipv4_config.proxy_arp, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc", &ipv4_config.no_pmtu_disc, sizeof(int), 0644, NULL, @@ -171,6 +189,9 @@ ctl_table ipv4_table[] = { {NET_IPV4_RFC1620_REDIRECTS, "ip_rfc1620_redirects", &ipv4_config.rfc1620_redirects, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_RTCACHE_FLUSH, "ip_rtcache_flush", + NULL, sizeof(int), 0644, NULL, + &ipv4_sysctl_rtcache_flush}, {NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries", &sysctl_tcp_syn_retries, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh", @@ -197,17 +218,6 @@ ctl_table ipv4_table[] = { {NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout", 
&sysctl_tcp_fin_timeout, sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, - {NET_IPV4_IGMP_MAX_HOST_REPORT_DELAY, "igmp_max_host_report_delay", - &sysctl_igmp_max_host_report_delay, sizeof(int), 0644, NULL, - &proc_dointvec}, - {NET_IPV4_IGMP_TIMER_SCALE, "igmp_timer_scale", - &sysctl_igmp_timer_scale, sizeof(int), 0644, NULL, &proc_dointvec}, -#if 0 - /* This one shouldn't be exposed to the user (too implementation - specific): */ - {NET_IPV4_IGMP_AGE_THRESHOLD, "igmp_age_threshold", - &sysctl_igmp_age_threshold, sizeof(int), 0644, NULL, &proc_dointvec}, -#endif #ifdef CONFIG_SYN_COOKIES {NET_TCP_SYNCOOKIES, "tcp_syncookies", &sysctl_tcp_syncookies, sizeof(int), 0644, NULL, &proc_dointvec}, @@ -218,6 +228,25 @@ ctl_table ipv4_table[] = { sizeof(int), 0644, NULL, &proc_dointvec}, {NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog", &sysctl_max_syn_backlog, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range", + &sysctl_local_port_range, sizeof(sysctl_local_port_range), 0644, + NULL, &proc_dointvec}, + {NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all", + &sysctl_icmp_echo_ignore_all, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts", + &sysctl_icmp_echo_ignore_broadcasts, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_ICMP_SOURCEQUENCH_RATE, "icmp_sourcequench_rate", + &sysctl_icmp_sourcequench_time, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_ICMP_DESTUNREACH_RATE, "icmp_destunreach_rate", + &sysctl_icmp_destunreach_time, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_ICMP_TIMEEXCEED_RATE, "icmp_timeexceed_rate", + &sysctl_icmp_timeexceed_time, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_ICMP_PARAMPROB_RATE, "icmp_paramprob_rate", + &sysctl_icmp_paramprob_time, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_ICMP_ECHOREPLY_RATE, "icmp_echoreply_rate", + &sysctl_icmp_echoreply_time, sizeof(int), 0644, NULL, 
&proc_dointvec}, {0} }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b62035e3b..eff309bcf 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.71 1997/09/06 05:11:45 davem Exp $ + * Version: $Id: tcp.c,v 1.75 1997/10/16 02:57:34 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -437,8 +437,8 @@ static struct open_request *tcp_find_established(struct tcp_opt *tp, struct open_request *prev = (struct open_request *)&tp->syn_wait_queue; while(req) { if (req->sk && - (req->sk->state == TCP_ESTABLISHED || - req->sk->state >= TCP_FIN_WAIT1)) + ((1 << req->sk->state) & + ~(TCPF_SYN_SENT|TCPF_SYN_RECV))) break; prev = req; req = req->dl_next; @@ -603,7 +603,7 @@ unsigned int tcp_poll(struct socket *sock, poll_table *wait) if (sk->err) mask = POLLERR; /* Connected? */ - if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) { + if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) { if (sk->shutdown & RCV_SHUTDOWN) mask |= POLLHUP; @@ -653,7 +653,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) { unsigned long amount; - if (sk->state == TCP_LISTEN) return(-EINVAL); + if (sk->state == TCP_LISTEN) + return(-EINVAL); amount = sock_wspace(sk); return put_user(amount, (int *)arg); } @@ -701,7 +702,8 @@ static void wait_for_tcp_connect(struct sock * sk) { release_sock(sk); cli(); - if (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT && sk->err == 0) + if (((1 << sk->state) & ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT)) && + sk->err == 0) interruptible_sleep_on(sk->sleep); sti(); lock_sock(sk); @@ -779,11 +781,11 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); /* Wait for a connection to finish. 
*/ - while (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) { + while ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { if (sk->err) return sock_error(sk); - if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) { + if ((1 << sk->state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (sk->keepopen) send_sig(SIGPIPE, current, 0); return -EPIPE; @@ -982,7 +984,7 @@ void tcp_read_wakeup(struct sock *sk) /* If we're closed, don't send an ack, or we'll get a RST * from the closed destination. */ - if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT)) + if ((1 << sk->state) & (TCPF_CLOSE|TCPF_TIME_WAIT)) return; tcp_send_ack(sk); @@ -1400,10 +1402,8 @@ void tcp_shutdown(struct sock *sk, int how) return; /* If we've already sent a FIN, or it's a closed state, skip this. */ - if (sk->state == TCP_ESTABLISHED || - sk->state == TCP_SYN_SENT || - sk->state == TCP_SYN_RECV || - sk->state == TCP_CLOSE_WAIT) { + if ((1 << sk->state) & + (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) { lock_sock(sk); /* Flag that the sender has shutdown. */ @@ -1424,9 +1424,7 @@ void tcp_shutdown(struct sock *sk, int how) static inline int closing(struct sock * sk) { - return ((1 << sk->state) & ((1 << TCP_FIN_WAIT1)| - (1 << TCP_CLOSING)| - (1 << TCP_LAST_ACK))); + return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK)); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7c6fbec56..e9f936f82 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.56 1997/08/31 08:24:54 freitag Exp $ + * Version: $Id: tcp_input.c,v 1.64 1997/10/30 23:52:24 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -64,6 +64,8 @@ static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack, #define SYNC_INIT 1 #endif +extern int sysctl_tcp_fin_timeout; + int sysctl_tcp_cong_avoidance; int sysctl_tcp_hoe_retransmits; int sysctl_tcp_sack; @@ -249,7 +251,7 @@ extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) * really. */ -static int tcp_reset(struct sock *sk, struct sk_buff *skb) +static void tcp_reset(struct sock *sk, struct sk_buff *skb) { sk->zapped = 1; @@ -285,8 +287,6 @@ static int tcp_reset(struct sock *sk, struct sk_buff *skb) #endif if (!sk->dead) sk->state_change(sk); - - return(0); } /* @@ -345,15 +345,16 @@ void tcp_parse_options(struct tcphdr *th, struct tcp_opt *tp, int no_fancy) /* Cheaper to set again then to * test syn. Optimize this? */ - if (sysctl_tcp_timestamps && !no_fancy) + if (sysctl_tcp_timestamps && !no_fancy) { tp->tstamp_ok = 1; - tp->saw_tstamp = 1; - tp->rcv_tsval = ntohl(*(__u32 *)ptr); - tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4)); + tp->saw_tstamp = 1; + tp->rcv_tsval = ntohl(*(__u32 *)ptr); + tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4)); + } } break; case TCPOPT_SACK: - if (no_fancy) + if (no_fancy || !sysctl_tcp_sack) break; tp->sacks = (opsize-2)>>3; if (tp->sacks<<3 == opsize-2) { @@ -486,8 +487,10 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) #define FLAG_WIN_UPDATE 0x02 #define FLAG_DATA_ACKED 0x04 -static __inline__ void clear_fast_retransmit(struct sock *sk) { +static __inline__ void clear_fast_retransmit(struct sock *sk) +{ struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + if (tp->dup_acks > 3) { tp->retrans_head = NULL; tp->snd_cwnd = max(tp->snd_ssthresh, 1); @@ -857,8 +860,7 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, tcp_ack_probe(sk, ack); /* See if we can take anything off of the retransmit queue. 
*/ - if (tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt)) - flag |= FLAG_DATA_ACKED; + flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt); /* If we have a timestamp, we always do rtt estimates. */ if (tp->saw_tstamp) { @@ -879,7 +881,7 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, } } else { tcp_set_rto(tp); - if (flag && FLAG_DATA_ACKED) + if (flag & FLAG_DATA_ACKED) (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt); } /* NOTE: safe here so long as cong_ctl doesn't use rto */ @@ -973,6 +975,11 @@ static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + if(sk->state == TCP_SYN_SENT) { + /* RFC793 says to drop the segment and return. */ + return 1; + } + /* XXX This fin_seq thing should disappear... -DaveM */ tp->fin_seq = skb->end_seq; @@ -985,7 +992,6 @@ static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) switch(sk->state) { case TCP_SYN_RECV: - case TCP_SYN_SENT: case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); @@ -999,12 +1005,16 @@ static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) * nothing. */ break; + case TCP_LAST_ACK: + /* RFC793: Remain in the LAST-ACK state. */ + break; case TCP_TIME_WAIT: /* Received a retransmission of the FIN, * restart the TIME_WAIT timer. */ tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); - return(0); + break; + case TCP_FIN_WAIT1: /* This case occurs when a simultaneous close * happens, we must ack the received FIN and @@ -1028,15 +1038,13 @@ static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) /* Already in CLOSE. */ break; default: - /* FIXME: Document whats happening in this case. -DaveM */ - tcp_set_state(sk,TCP_LAST_ACK); - - /* Start the timers. */ - tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); - return(0); + /* Only TCP_LISTEN is left, in that case we should never + * reach this piece of code. 
+ */ + printk("tcp_fin: Impossible, sk->state=%d\n", sk->state); + break; }; - - return(0); + return 0; } /* This one checks to see if we can put data from the @@ -1337,8 +1345,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, * We do checksum and copy also but from device to kernel. */ - tp = &(sk->tp_pinfo.af_tcp); - /* * RFC1323: H1. Apply PAWS check first. */ @@ -1373,6 +1379,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_data_snd_check(sk); } + tcp_statistics.TcpInErrs++; kfree_skb(skb, FREE_READ); return 0; } else if (skb->ack_seq == tp->snd_una) { @@ -1409,6 +1416,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if(th->syn && skb->seq != sk->syn_seq) { SOCK_DEBUG(sk, "syn in established state\n"); + tcp_statistics.TcpInErrs++; tcp_reset(sk, skb); return 1; } @@ -1430,7 +1438,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, /* step 8: check the FIN bit */ if (th->fin) - tcp_fin(skb, sk, th); + (void) tcp_fin(skb, sk, th); tcp_data_snd_check(sk); tcp_ack_snd_check(sk); @@ -1449,82 +1457,67 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, /* Shared between IPv4 and IPv6 now. */ struct sock * -tcp_check_req(struct sock *sk, struct sk_buff *skb, void *opt) +tcp_check_req(struct sock *sk, struct sk_buff *skb, struct open_request *req) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct open_request *dummy, *req; /* assumption: the socket is not in use. * as we checked the user count on tcp_rcv and we're * running from a soft interrupt. */ - req = tp->af_specific->search_open_req(tp, (void *)skb->nh.raw, skb->h.th, - &dummy); - if (req) { - if (req->sk) { - /* socket already created but not - * yet accepted()... - */ - sk = req->sk; - } else { - u32 flg; - /* Check for syn retransmission */ - flg = *(((u32 *)skb->h.th) + 3); + if (req->sk) { + /* socket already created but not + * yet accepted()... 
+ */ + sk = req->sk; + } else { + u32 flg; - flg &= __constant_htonl(0x00170000); - if ((flg == __constant_htonl(0x00020000)) && - (!after(skb->seq, req->rcv_isn))) { + /* Check for syn retransmission */ + flg = *(((u32 *)skb->h.th) + 3); + + flg &= __constant_htonl(0x00170000); + /* Only SYN set? */ + if (flg == __constant_htonl(0x00020000)) { + if (!after(skb->seq, req->rcv_isn)) { /* retransmited syn. */ req->class->rtx_syn_ack(sk, req); return NULL; + } else { + return sk; /* New SYN */ } - - /* In theory the packet could be for a cookie, but - * TIME_WAIT should guard us against this. - * XXX: Nevertheless check for cookies? - */ - if (skb->ack_seq != req->snt_isn+1) { - tp->af_specific->send_reset(skb); - return NULL; - } - - sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); - tcp_dec_slow_timer(TCP_SLT_SYNACK); - if (sk == NULL) - return NULL; + } - req->expires = 0UL; - req->sk = sk; + /* We know it's an ACK here */ + /* In theory the packet could be for a cookie, but + * TIME_WAIT should guard us against this. + * XXX: Nevertheless check for cookies? + * This sequence number check is done again later, + * but we do it here to prevent syn flood attackers + * from creating big SYN_RECV sockets. 
+ */ + if (!between(skb->ack_seq, req->snt_isn, req->snt_isn+1) || + !between(skb->seq, req->rcv_isn, + req->rcv_isn+1+req->rcv_wnd)) { + req->class->send_reset(skb); + return NULL; } - } -#ifdef CONFIG_SYNCOOKIES - else { - sk = tp->af_specific->cookie_check(sk, skb, opt); + + sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); + tcp_dec_slow_timer(TCP_SLT_SYNACK); if (sk == NULL) - return NULL; + return NULL; + + req->expires = 0UL; + req->sk = sk; } -#endif skb_orphan(skb); skb_set_owner_r(skb, sk); return sk; } - -static void tcp_rst_req(struct tcp_opt *tp, struct sk_buff *skb) -{ - struct open_request *req, *prev; - - req = tp->af_specific->search_open_req(tp,skb->nh.iph,skb->h.th,&prev); - if (!req) - return; - /* Sequence number check required by RFC793 */ - if (before(skb->seq, req->snt_isn) || after(skb->seq, req->snt_isn+1)) - return; - tcp_synq_unlink(tp, req, prev); -} - /* * This function implements the receiving procedure of RFC 793. * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be @@ -1540,16 +1533,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* state == CLOSED, hash lookup always fails, so no worries. -DaveM */ switch (sk->state) { case TCP_LISTEN: - if (th->rst) { - tcp_rst_req(tp, skb); - goto discard; - } - /* These use the socket TOS.. 
* might want to be the received TOS */ - if(th->ack) - return 1; + if(th->ack) + return 1; if(th->syn) { if(tp->af_specific->conn_request(sk, skb, opt, 0) < 0) @@ -1812,6 +1800,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tcp_set_state(sk, TCP_FIN_WAIT2); if (!sk->dead) sk->state_change(sk); + else + tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout); } break; @@ -1870,8 +1860,10 @@ step6: } /* step 8: check the FIN bit */ - if (th->fin) - tcp_fin(skb, sk, th); + if (th->fin) { + if(tcp_fin(skb, sk, th) != 0) + goto discard; + } tcp_data_snd_check(sk); tcp_ack_snd_check(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f8cb36894..10c7cd4f4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.62 1997/09/04 22:34:59 davem Exp $ + * Version: $Id: tcp_ipv4.c,v 1.74 1997/10/30 23:52:27 davem Exp $ * * IPv4 specific functions * @@ -60,8 +60,13 @@ extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_syncookies; -/* Define this to check TCP sequence numbers in ICMP packets. */ -#define ICMP_PARANOIA 1 +/* Check TCP sequence numbers in ICMP packets. */ +#define ICMP_PARANOIA 1 +#ifndef ICMP_PARANOIA +#define ICMP_MIN_LENGTH 4 +#else +#define ICMP_MIN_LENGTH 8 +#endif static void tcp_v4_send_reset(struct sk_buff *skb); @@ -88,6 +93,13 @@ struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE]; */ struct sock *tcp_bound_hash[TCP_BHTABLE_SIZE]; +/* + * This array holds the first and last local port number. 
+ * For high-usage systems, use sysctl to change this to + * 32768-61000 + */ +int sysctl_local_port_range[2] = { 1024, 4999 }; + static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport, __u32 faddr, __u16 fport) { @@ -116,6 +128,13 @@ static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum) unsigned char state = sk2->state; int sk2_reuse = sk2->reuse; + /* Two sockets can be bound to the same port if they're + * bound to different interfaces. + */ + + if(sk->bound_dev_if != sk2->bound_dev_if) + continue; + if(!sk2->rcv_saddr || !sk->rcv_saddr) { if((!sk2_reuse) || (!sk_reuse) || @@ -161,13 +180,15 @@ static __inline__ int tcp_lport_inuse(int num) */ unsigned short tcp_good_socknum(void) { - static int start = PROT_SOCK; + static int start = 0; static int binding_contour = 0; int best = 0; int size = 32767; /* a big num. */ int retval = 0, i, end, bc; SOCKHASH_LOCK(); + if (start > sysctl_local_port_range[1] || start < sysctl_local_port_range[0]) + start = sysctl_local_port_range[0]; i = tcp_bhashfn(start); end = i + TCP_BHTABLE_SIZE; bc = binding_contour; @@ -207,8 +228,8 @@ verify: best = retval; /* mark the starting point to avoid infinite loops */ while(tcp_lport_inuse(retval)) { retval = tcp_bhashnext(retval,i); - if (retval > 32767) /* Upper bound */ - retval = tcp_bhashnext(PROT_SOCK,i); + if (retval > sysctl_local_port_range[1]) /* Upper bound */ + retval = tcp_bhashnext(sysctl_local_port_range[0],i); if (retval == best) { /* This hash chain is full. No answer. */ retval = 0; @@ -218,8 +239,6 @@ verify: done: start = (retval + 1); - if (start > 32767 || start < PROT_SOCK) - start = PROT_SOCK; SOCKHASH_UNLOCK(); return retval; @@ -301,20 +320,34 @@ static void tcp_v4_rehash(struct sock *sk) * connection. So always assume those are both wildcarded * during the search since they can never be otherwise. 
*/ -static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum) +static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif) { struct sock *sk; struct sock *result = NULL; + int score, hiscore; + hiscore=0; for(sk = tcp_listening_hash[tcp_lhashfn(hnum)]; sk; sk = sk->next) { if(sk->num == hnum) { __u32 rcv_saddr = sk->rcv_saddr; + score = 1; if(rcv_saddr) { - if(rcv_saddr == daddr) - return sk; /* Best possible match. */ - } else if(!result) + if (rcv_saddr != daddr) + continue; + score++; + } + if (sk->bound_dev_if) { + if (sk->bound_dev_if != dif) + continue; + score++; + } + if (score == 3) + return sk; + if (score > hiscore) { + hiscore = score; result = sk; + } } } return result; @@ -324,7 +357,7 @@ static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum) * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM */ static inline struct sock *__tcp_v4_lookup(struct tcphdr *th, - u32 saddr, u16 sport, u32 daddr, u16 dport) + u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) { unsigned short hnum = ntohs(dport); struct sock *sk; @@ -338,7 +371,8 @@ static inline struct sock *__tcp_v4_lookup(struct tcphdr *th, if(sk->daddr == saddr && /* remote address */ sk->dummy_th.dest == sport && /* remote port */ sk->num == hnum && /* local port */ - sk->rcv_saddr == daddr) /* local address */ + sk->rcv_saddr == daddr && /* local address */ + (!sk->bound_dev_if || sk->bound_dev_if == dif)) goto hit; /* You sunk my battleship! */ /* Must check for a TIME_WAIT'er before going to listener hash. 
*/ @@ -346,17 +380,18 @@ static inline struct sock *__tcp_v4_lookup(struct tcphdr *th, if(sk->daddr == saddr && /* remote address */ sk->dummy_th.dest == sport && /* remote port */ sk->num == hnum && /* local port */ - sk->rcv_saddr == daddr) /* local address */ + sk->rcv_saddr == daddr && /* local address */ + (!sk->bound_dev_if || sk->bound_dev_if == dif)) goto hit; - sk = tcp_v4_lookup_listener(daddr, hnum); + sk = tcp_v4_lookup_listener(daddr, hnum, dif); hit: return sk; } -__inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport) +__inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) { - return __tcp_v4_lookup(0, saddr, sport, daddr, dport); + return __tcp_v4_lookup(0, saddr, sport, daddr, dport, dif); } #ifdef CONFIG_IP_TRANSPARENT_PROXY @@ -374,16 +409,25 @@ __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport #define tcp_v4_proxy_loop_next(hnum, hpnum, sk, fpass) \ secondlist((hpnum),(sk)->bind_next,(fpass)) -struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr, - unsigned short rnum, unsigned long laddr, - unsigned long paddr, unsigned short pnum) +static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr, + unsigned short rnum, unsigned long laddr, + struct device *dev, unsigned short pnum, + int dif) { struct sock *s, *result = NULL; int badness = -1; + u32 paddr = 0; unsigned short hnum = ntohs(num); unsigned short hpnum = ntohs(pnum); int firstpass = 1; + if(dev && dev->ip_ptr) { + struct in_device *idev = dev->ip_ptr; + + if(idev->ifa_list) + paddr = idev->ifa_list->ifa_local; + } + /* This code must run only from NET_BH. 
*/ for(s = tcp_v4_proxy_loop_init(hnum, hpnum, s, firstpass); s != NULL; @@ -408,7 +452,12 @@ struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr, continue; score++; } - if(score == 3 && s->num == hnum) { + if(s->bound_dev_if) { + if(s->bound_dev_if != dif) + continue; + score++; + } + if(score == 4 && s->num == hnum) { result = s; break; } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) { @@ -486,7 +535,6 @@ out: int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sk_buff *buff; - struct sk_buff *skb1; int tmp; struct tcphdr *th; struct rtable *rt; @@ -517,11 +565,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) } tmp = ip_route_connect(&rt, usin->sin_addr.s_addr, sk->saddr, - RT_TOS(sk->ip_tos)|(sk->localroute || 0)); + RT_TOS(sk->ip_tos)|(sk->localroute || 0), sk->bound_dev_if); if (tmp < 0) return tmp; - if (rt->rt_flags&(RTF_MULTICAST|RTF_BROADCAST)) { + if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) { ip_rt_put(rt); return -ENETUNREACH; } @@ -533,13 +581,22 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) } lock_sock(sk); + + /* Do this early, so there is less state to unwind on failure. */ + buff = sock_wmalloc(sk, MAX_SYN_SIZE, 0, GFP_KERNEL); + if (buff == NULL) { + release_sock(sk); + ip_rt_put(rt); + return(-ENOBUFS); + } + sk->dst_cache = &rt->u.dst; sk->daddr = rt->rt_dst; if (!sk->saddr) sk->saddr = rt->rt_src; sk->rcv_saddr = sk->saddr; - if (sk->priority == SOPRI_NORMAL) + if (sk->priority == 0) sk->priority = rt->u.dst.priority; sk->dummy_th.dest = usin->sin_port; @@ -557,20 +614,23 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk->err = 0; - buff = sock_wmalloc(sk, MAX_SYN_SIZE, 0, GFP_KERNEL); - if (buff == NULL) { - release_sock(sk); - return(-ENOBUFS); - } - /* Put in the IP header and routing stuff. 
*/ tmp = ip_build_header(buff, sk); if (tmp < 0) { + /* Caller has done ip_rt_put(rt) and set sk->dst_cache + * to NULL. We must unwind the half built TCP socket + * state so that this failure does not create a "stillborn" + * sock (ie. future re-tries of connect() would fail). + */ + sk->daddr = 0; + sk->saddr = sk->rcv_saddr = 0; kfree_skb(buff, FREE_WRITE); release_sock(sk); return(-ENETUNREACH); } + /* No failure conditions can result past this point. */ + th = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr)); buff->h.th = th; @@ -582,11 +642,10 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) th->ack = 0; th->syn = 1; - sk->mtu = rt->u.dst.pmtu; if ((sk->ip_pmtudisc == IP_PMTUDISC_DONT || (sk->ip_pmtudisc == IP_PMTUDISC_WANT && - rt->rt_flags&RTF_NOPMTUDISC)) && + rt->rt_flags&RTCF_NOPMTUDISC)) && rt->u.dst.pmtu > 576) sk->mtu = 576; @@ -639,8 +698,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tp->packets_out++; buff->when = jiffies; - skb1 = skb_clone(buff, GFP_KERNEL); - ip_queue_xmit(skb1); + ip_queue_xmit(skb_clone(buff, GFP_KERNEL)); /* Timer for repeating the SYN until an answer. */ tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); @@ -691,11 +749,10 @@ out: * This should be replaced with a global hash table. */ static struct open_request *tcp_v4_search_req(struct tcp_opt *tp, - void *header, - struct tcphdr *th, - struct open_request **prevp) + struct iphdr *iph, + struct tcphdr *th, + struct open_request **prevp) { - struct iphdr *iph = header; struct open_request *req, *prev; __u16 rport = th->source; @@ -720,9 +777,7 @@ static struct open_request *tcp_v4_search_req(struct tcp_opt *tp, /* * This routine does path mtu discovery as defined in RFC1197. 
*/ -static inline void do_pmtu_discovery(struct sock *sk, - struct iphdr *ip, - struct tcphdr *th) +static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip) { int new_mtu; struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; @@ -750,7 +805,8 @@ static inline void do_pmtu_discovery(struct sock *sk, * dropped. This is the new "fast" path mtu * discovery. */ - tcp_simple_retransmit(sk); + if (!sk->sock_readers) + tcp_simple_retransmit(sk); } } } @@ -764,7 +820,7 @@ static inline void do_pmtu_discovery(struct sock *sk, * to find the appropriate port. */ -void tcp_v4_err(struct sk_buff *skb, unsigned char *dp) +void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) { struct iphdr *iph = (struct iphdr*)dp; struct tcphdr *th; @@ -772,19 +828,19 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp) int type = skb->h.icmph->type; int code = skb->h.icmph->code; struct sock *sk; - __u32 seq; + int opening; +#ifdef ICMP_PARANOIA + __u32 seq; +#endif -#if 0 - /* check wrong - icmp.c should pass in len */ - if (skb->len < 8+(iph->ihl << 2)+sizeof(struct tcphdr)) { - icmp_statistics.IcmpInErrors++; + if (len < (iph->ihl << 2) + ICMP_MIN_LENGTH) { + icmp_statistics.IcmpInErrors++; return; } -#endif th = (struct tcphdr*)(dp+(iph->ihl<<2)); - sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source); + sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex); if (sk == NULL) { icmp_statistics.IcmpInErrors++; return; @@ -793,19 +849,38 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp) /* pointless, because we have no way to retry when sk is locked. But the socket should be really locked here for better interaction with the socket layer. This needs to be solved for SMP - (I would prefer an "ICMP backlog"). */ - /* lock_sock(sk); */ + (I would prefer an "ICMP backlog"). 
+ + tcp_v4_err is called only from bh, so that lock_sock is pointless, + even in commented form :-) --ANK + + Note "for SMP" ;) -AK + + Couple of notes about backlogging: + - error_queue could be used for it. + - could, but MUST NOT :-), because: + a) it is not clear, + who will process deferred messages. + b) ICMP is not reliable by design, so that you can safely + drop ICMP messages. Besides that, if ICMP really arrived + it is very unlikely, that socket is locked. --ANK + + I don't think it's unlikely that sk is locked. With the + open_request stuff there is much more stress on the main + LISTEN socket. I just want to make sure that all ICMP unreachables + destroy unneeded open_requests as reliable as possible (for + syn flood protection) -AK + */ tp = &sk->tp_pinfo.af_tcp; - - seq = ntohl(th->seq); - #ifdef ICMP_PARANOIA - if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) { + seq = ntohl(th->seq); + if (sk->state != TCP_LISTEN && + !between(seq, tp->snd_una, max(tp->snd_una+32768,tp->snd_nxt))) { if (net_ratelimit()) printk(KERN_DEBUG "icmp packet outside the tcp window:" " s:%d %u,%u,%u\n", (int)sk->state, seq, tp->snd_una, tp->snd_nxt); - goto out; + return; } #endif @@ -814,15 +889,15 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp) tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2); tp->snd_cwnd = tp->snd_ssthresh; tp->high_seq = tp->snd_nxt; - goto out; + return; case ICMP_PARAMETERPROB: sk->err=EPROTO; sk->error_report(sk); break; case ICMP_DEST_UNREACH: if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ - do_pmtu_discovery(sk, iph, th); - goto out; + do_pmtu_discovery(sk, iph); + return; } break; } @@ -830,62 +905,62 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp) /* If we've already connected we will keep trying * until we time out, or the user gives up. 
*/ - if (code <= NR_ICMP_UNREACH) { - int fatal = 0; - - if (sk->state == TCP_LISTEN) { - struct open_request *req, *prev; - - /* Prevent race conditions with accept() - * icmp is unreliable. - * This is the easiest solution for now - for - * very big servers it might prove inadequate. - */ - if (sk->sock_readers) { - /* XXX: add a counter here to profile this. - * If too many ICMPs get dropped on busy - * servers this needs to be solved differently. - */ - goto out; - } + if (code > NR_ICMP_UNREACH) + return; - req = tcp_v4_search_req(tp, iph, th, &prev); - if (!req) - goto out; + opening = 0; + switch (sk->state) { + struct open_request *req, *prev; + case TCP_LISTEN: + /* Prevent race conditions with accept() - + * ICMP is unreliable. + */ + if (sk->sock_readers) { + /* XXX: add a counter here to profile this. + * If too many ICMPs get dropped on busy + * servers this needs to be solved differently. + */ + return; + } + + if (!th->syn && !th->ack) + return; + req = tcp_v4_search_req(tp, iph, th, &prev); + if (!req) + return; #ifdef ICMP_PARANOIA - if (seq != req->snt_isn) { - if (net_ratelimit()) - printk(KERN_DEBUG "icmp packet for openreq " - "with wrong seq number:%d:%d\n", - seq, req->snt_isn); - goto out; - } + if (seq != req->snt_isn) { + if (net_ratelimit()) + printk(KERN_DEBUG "icmp packet for openreq " + "with wrong seq number:%d:%d\n", + seq, req->snt_isn); + return; + } #endif - if (req->sk) { /* not yet accept()ed */ - sk = req->sk; - } else { - tcp_synq_unlink(tp, req, prev); - tcp_openreq_free(req); - fatal = 1; - } - } else if (sk->state == TCP_SYN_SENT - || sk->state == TCP_SYN_RECV) - fatal = 1; - - if(icmp_err_convert[code].fatal || fatal) { - sk->err = icmp_err_convert[code].errno; - if (fatal) { - tcp_statistics.TcpAttemptFails++; - if (sk->state != TCP_LISTEN) - tcp_set_state(sk,TCP_CLOSE); - sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ - } - } else /* Only an error on timeout */ - sk->err_soft = 
icmp_err_convert[code].errno; + if (req->sk) { /* not yet accept()ed */ + sk = req->sk; /* report error in accept */ + } else { + tcp_synq_unlink(tp, req, prev); + req->class->destructor(req); + tcp_openreq_free(req); + } + /* FALL THOUGH */ + case TCP_SYN_SENT: + case TCP_SYN_RECV: + opening = 1; + break; } - -out: - /* release_sock(sk); */ + + if(icmp_err_convert[code].fatal || opening) { + sk->err = icmp_err_convert[code].errno; + if (opening) { + tcp_statistics.TcpAttemptFails++; + if (sk->state != TCP_LISTEN) + tcp_set_state(sk,TCP_CLOSE); + sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ + } + } else /* Only an error on timeout */ + sk->err_soft = icmp_err_convert[code].errno; } /* This routine computes an IPv4 TCP checksum. */ @@ -948,6 +1023,7 @@ static void tcp_v4_send_reset(struct sk_buff *skb) /* FIXME: should this carry an options packet? */ ip_queue_xmit(skb1); tcp_statistics.TcpOutSegs++; + tcp_statistics.TcpOutRsts++; } #ifdef CONFIG_IP_TRANSPARENT_PROXY @@ -962,7 +1038,7 @@ int tcp_chkaddr(struct sk_buff *skb) struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4); struct sock *sk; - sk = tcp_v4_lookup(iph->saddr, th->source, iph->daddr, th->dest); + sk = tcp_v4_lookup(iph->saddr, th->source, iph->daddr, th->dest, skb->dev->ifindex); if (!sk) return 0; @@ -992,7 +1068,7 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req) kfree_skb(skb, FREE_WRITE); return; } - + mss = (skb->dst->pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr)); if (sk->user_mss) mss = min(mss, sk->user_mss); @@ -1077,7 +1153,8 @@ int sysctl_tcp_syn_taildrop = 1; struct or_calltable or_ipv4 = { tcp_v4_send_synack, - tcp_v4_or_free + tcp_v4_or_free, + tcp_v4_send_reset }; #ifdef NEW_LISTEN @@ -1304,7 +1381,7 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (ip_route_output(&rt, newsk->opt && newsk->opt->srr ? 
newsk->opt->faddr : newsk->daddr, - newsk->saddr, newsk->ip_tos, NULL)) { + newsk->saddr, newsk->ip_tos, 0)) { sk_free(newsk); return NULL; } @@ -1359,6 +1436,57 @@ exit: return NULL; } +static void tcp_v4_rst_req(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct open_request *req, *prev; + + req = tcp_v4_search_req(tp,skb->nh.iph, skb->h.th, &prev); + if (!req) + return; + /* Sequence number check required by RFC793 */ + if (before(skb->seq, req->snt_isn) || after(skb->seq, req->snt_isn+1)) + return; + tcp_synq_unlink(tp, req, prev); + req->class->destructor(req); + tcp_openreq_free(req); +} + +/* Check for embryonic sockets (open_requests) We check packets with + * only the SYN bit set against the open_request queue too: This + * increases connection latency a bit, but is required to detect + * retransmitted SYNs. + */ +static inline struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb) +{ + struct tcphdr *th = skb->h.th; + u32 flg = ((u32 *)th)[3]; + + /* Check for RST */ + if (flg & __constant_htonl(0x00040000)) { + tcp_v4_rst_req(sk, skb); + return NULL; + } + + /* Check for SYN|ACK */ + if (flg & __constant_htonl(0x00120000)) { + struct open_request *req, *dummy; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* Find possible connection requests. */ + req = tcp_v4_search_req(tp, skb->nh.iph, th, &dummy); + if (req) { + sk = tcp_check_req(sk, skb, req); + } +#ifdef CONFIG_SYN_COOKIES + else { + sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); + } +#endif + } + return sk; +} + int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { skb_set_owner_r(skb, sk); @@ -1368,49 +1496,42 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) * is currently called with bh processing disabled. 
*/ lock_sock(sk); - + if (sk->state == TCP_ESTABLISHED) { /* Fast path */ if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) goto reset; - } else { - /* Check for embryonic sockets (open_requests) - * We check packets with only the SYN bit set - * against the open_request queue too: This - * increases connection latency a bit, but is - * required to detect retransmitted SYNs. - */ - /* FIXME: need to check for multicast syns - * here to satisfy RFC1122 4.2.3.10, p. 104: - * discard bcast/mcast SYN. I'm not sure if - * they're filtered out at the IP layer (I - * think not) - */ - if (sk->state == TCP_LISTEN && - ((u32 *)skb->h.th)[3] & __constant_htonl(0x00120000)) { - struct sock *nsk; - - /* Find possible connection requests. */ - nsk = tcp_check_req(sk, skb, &(IPCB(skb)->opt)); - if (nsk == NULL) - goto discard; - - release_sock(sk); - lock_sock(nsk); - sk = nsk; - } + release_sock(sk); + return 0; + } + - if (tcp_rcv_state_process(sk, skb, skb->h.th, - &(IPCB(skb)->opt), skb->len)) - goto reset; + if (sk->state == TCP_LISTEN) { + struct sock *nsk; + + nsk = tcp_v4_hnd_req(sk, skb); + if (!nsk) + goto discard; + lock_sock(nsk); + release_sock(sk); + sk = nsk; } + + if (tcp_rcv_state_process(sk, skb, skb->h.th, + &(IPCB(skb)->opt), skb->len)) + goto reset; release_sock(sk); return 0; reset: tcp_v4_send_reset(skb); discard: - kfree_skb(skb, FREE_READ); - release_sock(sk); + kfree_skb(skb, FREE_READ); + /* Be careful here. If this function gets more complicated and + * gcc suffers from register pressure on the x86, sk (in %ebx) + * might be destroyed here. This current version compiles correctly, + * but you have been warned. + */ + release_sock(sk); return 0; } @@ -1422,42 +1543,43 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) { struct tcphdr *th; struct sock *sk; - u32 saddr = skb->nh.iph->saddr; - u32 daddr = skb->nh.iph->daddr; - - th = skb->h.th; if (skb->pkt_type!=PACKET_HOST) goto discard_it; + th = skb->h.th; + /* Pull up the IP header. 
*/ - skb_pull(skb, skb->h.raw-skb->data); + __skb_pull(skb, skb->h.raw - skb->data); + + /* Count it even if it's bad */ + tcp_statistics.TcpInSegs++; /* Try to use the device checksum if provided. */ switch (skb->ip_summed) { case CHECKSUM_NONE: skb->csum = csum_partial((char *)th, len, 0); case CHECKSUM_HW: - if (tcp_v4_check(th,len,saddr,daddr,skb->csum)) { - struct iphdr * iph = skb->nh.iph; + if (tcp_v4_check(th,len,skb->nh.iph->saddr,skb->nh.iph->daddr,skb->csum)) { printk(KERN_DEBUG "TCPv4 bad checksum from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, len=%d/%d/%d\n", - NIPQUAD(saddr), ntohs(th->source), NIPQUAD(daddr), - ntohs(th->dest), len, skb->len, ntohs(iph->tot_len)); - goto discard_it; + NIPQUAD(skb->nh.iph->saddr), ntohs(th->source), NIPQUAD(skb->nh.iph->daddr), + ntohs(th->dest), len, skb->len, ntohs(skb->nh.iph->tot_len)); + tcp_statistics.TcpInErrs++; + goto discard_it; } default: /* CHECKSUM_UNNECESSARY */ } - tcp_statistics.TcpInSegs++; - #ifdef CONFIG_IP_TRANSPARENT_PROXY if (IPCB(skb)->redirport) - sk = tcp_v4_proxy_lookup(th->dest, saddr, th->source, daddr, - skb->dev->pa_addr, IPCB(skb)->redirport); + sk = tcp_v4_proxy_lookup(th->dest, skb->nh.iph->saddr, th->source, + skb->nh.iph->daddr, skb->dev, + IPCB(skb)->redirport, skb->dev->ifindex); else #endif - sk = __tcp_v4_lookup(th, saddr, th->source, daddr, th->dest); + sk = __tcp_v4_lookup(th, skb->nh.iph->saddr, th->source, + skb->nh.iph->daddr, th->dest, skb->dev->ifindex); if (!sk) goto no_tcp_socket; if(!ipsec_sk_policy(sk,skb)) @@ -1501,7 +1623,7 @@ int tcp_v4_rebuild_header(struct sock *sk, struct sk_buff *skb) rt = (struct rtable*)skb->dst; if (rt->u.dst.obsolete) { int err; - err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos, rt->key.dst_dev); + err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos, rt->key.oif); if (err) { sk->err_soft=-err; sk->error_report(skb->sk); @@ -1524,7 +1646,7 @@ int tcp_v4_rebuild_header(struct sock *sk, struct sk_buff *skb) static struct 
sock * tcp_v4_get_sock(struct sk_buff *skb, struct tcphdr *th) { return tcp_v4_lookup(skb->nh.iph->saddr, th->source, - skb->nh.iph->daddr, th->dest); + skb->nh.iph->daddr, th->dest, skb->dev->ifindex); } static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) @@ -1547,13 +1669,6 @@ struct tcp_func ipv4_specific = { ip_setsockopt, ip_getsockopt, v4_addr2sockaddr, - tcp_v4_send_reset, - tcp_v4_search_req, -#ifdef CONFIG_SYNCOOKIES - cookie_v4_check, -#else - NULL, -#endif sizeof(struct sockaddr_in) }; @@ -1592,8 +1707,6 @@ static int tcp_v4_init_sock(struct sock *sk) sk->priority = 1; sk->state = TCP_CLOSE; - /* This is how many unacked bytes we will accept for this socket. */ - sk->max_unacked = 2048; /* needs to be at most 2 full packets. */ sk->max_ack_backlog = SOMAXCONN; sk->mtu = 576; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8e60f1a50..f9ffb1517 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_output.c,v 1.46 1997/08/24 16:22:28 freitag Exp $ + * Version: $Id: tcp_output.c,v 1.50 1997/10/15 19:13:02 freitag Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -74,9 +74,12 @@ static __inline__ int tcp_snd_test(struct sock *sk, struct sk_buff *skb) * (part of SWS is done on packetization) * c) We are retransmiting [Nagle] * d) We have too many packets 'in flight' + * + * Don't use the nagle rule for urgent data. 
*/ len = skb->end_seq - skb->seq; - if (!sk->nonagle && len < (sk->mss >> 1) && tp->packets_out) + if (!sk->nonagle && len < (sk->mss >> 1) && tp->packets_out && + !skb->h.th->urg) nagle_check = 0; return (nagle_check && tp->packets_out < tp->snd_cwnd && @@ -471,8 +474,12 @@ unsigned short tcp_select_window(struct sock *sk) if (tp->window_clamp) { free_space = min(tp->window_clamp, free_space); mss = min(tp->window_clamp, mss); - } else + } +#ifdef NO_ANK_FIX + /* I am tired of this message */ + else printk(KERN_DEBUG "Clamp failure. Water leaking.\n"); +#endif if (mss < 1) { mss = 1; @@ -487,8 +494,11 @@ unsigned short tcp_select_window(struct sock *sk) if (cur_win < 0) { cur_win = 0; +#ifdef NO_ANK_FIX + /* And this too. */ printk(KERN_DEBUG "TSW: win < 0 w=%d 1=%u 2=%u\n", tp->rcv_wnd, tp->rcv_nxt, tp->rcv_wup); +#endif } if (free_space < sk->rcvbuf/4 && free_space < mss/2) @@ -610,9 +620,8 @@ static int tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb) th1->urg = 1; th1->urg_ptr = th2->urg_ptr + size1; } - if (th2->fin) { + if (th2->fin) th1->fin = 1; - } /* ... and off you go. */ kfree_skb(buff, FREE_WRITE); @@ -1007,11 +1016,8 @@ void tcp_write_wakeup(struct sock *sk) * following states. If any other state is encountered, return. * [listen/close will never occur here anyway] */ - if (sk->state != TCP_ESTABLISHED && - sk->state != TCP_CLOSE_WAIT && - sk->state != TCP_FIN_WAIT1 && - sk->state != TCP_LAST_ACK && - sk->state != TCP_CLOSING) + if ((1 << sk->state) & + ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|TCPF_LAST_ACK|TCPF_CLOSING)) return; if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) && (skb=tp->send_head)) { diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index cf6fcfbe7..5cb05d55b 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). 
* - * Version: @(#)tcp.c 1.0.16 05/25/93 + * Version: $Id: tcp_timer.c,v 1.31 1997/11/05 08:14:01 freitag Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -212,7 +212,7 @@ static int tcp_write_timeout(struct sock *sk) tcp_clear_xmit_timers(sk); /* Time wait the socket. */ - if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING) { + if ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING)) { tcp_set_state(sk,TCP_TIME_WAIT); tcp_reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); } else { @@ -263,8 +263,7 @@ void tcp_probe_timer(unsigned long data) { sk->error_report(sk); /* Time wait the socket. */ - if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 - || sk->state == TCP_CLOSING) { + if ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING)) { tcp_set_state(sk, TCP_TIME_WAIT); tcp_reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); } else { @@ -280,8 +279,7 @@ static __inline__ int tcp_keepopen_proc(struct sock *sk) { int res = 0; - if (sk->state == TCP_ESTABLISHED || sk->state == TCP_CLOSE_WAIT || - sk->state == TCP_FIN_WAIT2) { + if ((1<<sk->state) & (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT2)) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; __u32 elapsed = jiffies - tp->rcv_tstamp; @@ -382,6 +380,11 @@ void tcp_retransmit_timer(unsigned long data) return; } + if (sk->sock_readers) { + /* Try again in a second. */ + tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ); + return; + } lock_sock(sk); /* Clear delay ack timer. */ diff --git a/net/ipv4/timer.c b/net/ipv4/timer.c index 3a2927528..fe02b3f4c 100644 --- a/net/ipv4/timer.c +++ b/net/ipv4/timer.c @@ -5,7 +5,7 @@ * * TIMER - implementation of software timers for IP. * - * Version: @(#)timer.c 1.0.7 05/25/93 + * Version: $Id: timer.c,v 1.7 1997/09/17 18:50:26 freitag Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ed84d5b0f..42a3df7ca 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -5,7 +5,7 @@ * * The User Datagram Protocol (UDP). * - * Version: @(#)udp.c 1.0.13 06/02/93 + * Version: $Id: udp.c,v 1.44 1997/10/15 19:56:35 freitag Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -81,8 +81,7 @@ when application doesn't choose (NOT YET - doesn't seem to be in the BSD API) [Does opening a SOCK_PACKET and snooping your output count 8)] 4.1.3.6 (Invalid Addresses) - MUST discard invalid source addresses (NOT YET -- will be implemented - in IP, so UDP will eventually be OK. Right now it's a violation.) + MUST discard invalid source addresses (OK -- done in the new routing code) MUST only send datagrams with one of our addresses (NOT YET - ought to be OK ) 950728 -- MS */ @@ -133,6 +132,13 @@ static int udp_v4_verify_bind(struct sock *sk, unsigned short snum) unsigned char state = sk2->state; int sk2_reuse = sk2->reuse; + /* Two sockets can be bound to the same port if they're + * bound to different interfaces. 
+ */ + + if(sk2->bound_dev_if != sk->bound_dev_if) + continue; + if(!sk2->rcv_saddr || !sk->rcv_saddr) { if((!sk2_reuse) || (!sk_reuse) || @@ -173,20 +179,24 @@ unsigned short udp_good_socknum(void) int i, best, best_size_so_far; SOCKHASH_LOCK(); + if (start > sysctl_local_port_range[1] || start < sysctl_local_port_range[0]) + start = sysctl_local_port_range[0]; - /* Select initial not-so-random "best" */ - best = PROT_SOCK + 1 + (start & 1023); best_size_so_far = 32767; /* "big" num */ - result = best; - for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) { + best = result = start; + + for(i = 0; i < UDP_HTABLE_SIZE; i++, result++) { struct sock *sk; int size; sk = udp_hash[result & (UDP_HTABLE_SIZE - 1)]; - /* No clashes - take it */ - if (!sk) + if(!sk) { + if (result > sysctl_local_port_range[1]) + result = sysctl_local_port_range[0] + + ((result - sysctl_local_port_range[0]) & (UDP_HTABLE_SIZE - 1)); goto out; + } /* Is this one better than our best so far? */ size = 0; @@ -196,12 +206,19 @@ unsigned short udp_good_socknum(void) } while((sk = sk->next) != NULL); best_size_so_far = size; best = result; -next: + next: } - while (udp_lport_inuse(best)) - best += UDP_HTABLE_SIZE; result = best; + + for(;; result += UDP_HTABLE_SIZE) { + /* Get into range (but preserve hash bin)... */ + if (result > sysctl_local_port_range[1]) + result = sysctl_local_port_range[0] + + ((result - sysctl_local_port_range[0]) & (UDP_HTABLE_SIZE - 1)); + if (!udp_lport_inuse(result)) + break; + } out: start = result; SOCKHASH_UNLOCK(); @@ -277,7 +294,7 @@ static void udp_v4_rehash(struct sock *sk) /* UDP is nearly always wildcards out the wazoo, it makes no sense to try * harder than this here plus the last hit cache. 
-DaveM */ -struct sock *udp_v4_lookup_longway(u32 saddr, u16 sport, u32 daddr, u16 dport) +struct sock *udp_v4_lookup_longway(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) { struct sock *sk, *result = NULL; unsigned short hnum = ntohs(dport); @@ -301,7 +318,12 @@ struct sock *udp_v4_lookup_longway(u32 saddr, u16 sport, u32 daddr, u16 dport) continue; score++; } - if(score == 3) { + if(sk->bound_dev_if) { + if(sk->bound_dev_if != dif) + continue; + score++; + } + if(score == 4) { result = sk; break; } else if(score > badness) { @@ -313,23 +335,25 @@ struct sock *udp_v4_lookup_longway(u32 saddr, u16 sport, u32 daddr, u16 dport) return result; } -__inline__ struct sock *udp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport) +__inline__ struct sock *udp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) { struct sock *sk; - if(uh_cache_sk && + if(!dif && uh_cache_sk && uh_cache_saddr == saddr && uh_cache_sport == sport && uh_cache_dport == dport && uh_cache_daddr == daddr) return uh_cache_sk; - sk = udp_v4_lookup_longway(saddr, sport, daddr, dport); - uh_cache_sk = sk; - uh_cache_saddr = saddr; - uh_cache_daddr = daddr; - uh_cache_sport = sport; - uh_cache_dport = dport; + sk = udp_v4_lookup_longway(saddr, sport, daddr, dport, dif); + if(!dif) { + uh_cache_sk = sk; + uh_cache_saddr = saddr; + uh_cache_daddr = daddr; + uh_cache_sport = sport; + uh_cache_dport = dport; + } return sk; } @@ -348,16 +372,25 @@ __inline__ struct sock *udp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport #define udp_v4_proxy_loop_next(hnum, hpnum, sk, fpass) \ secondlist((hpnum),(sk)->next,(fpass)) -struct sock *udp_v4_proxy_lookup(unsigned short num, unsigned long raddr, - unsigned short rnum, unsigned long laddr, - unsigned long paddr, unsigned short pnum) +static struct sock *udp_v4_proxy_lookup(unsigned short num, unsigned long raddr, + unsigned short rnum, unsigned long laddr, + struct device *dev, unsigned short pnum, + int dif) { struct sock *s, *result = 
NULL; int badness = -1; + u32 paddr = 0; unsigned short hnum = ntohs(num); unsigned short hpnum = ntohs(pnum); int firstpass = 1; + if(dev && dev->ip_ptr) { + struct in_device *idev = dev->ip_ptr; + + if(idev->ifa_list) + paddr = idev->ifa_list->ifa_local; + } + SOCKHASH_LOCK(); for(s = udp_v4_proxy_loop_init(hnum, hpnum, s, firstpass); s != NULL; @@ -382,7 +415,12 @@ struct sock *udp_v4_proxy_lookup(unsigned short num, unsigned long raddr, continue; score++; } - if(score == 3 && s->num == hnum) { + if(s->bound_dev_if) { + if(s->bound_dev_if != dif) + continue; + score++; + } + if(score == 4 && s->num == hnum) { result = s; break; } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) { @@ -434,7 +472,7 @@ static inline struct sock *udp_v4_mcast_next(struct sock *sk, * to find the appropriate port. */ -void udp_err(struct sk_buff *skb, unsigned char *dp) +void udp_err(struct sk_buff *skb, unsigned char *dp, int len) { struct iphdr *iph = (struct iphdr*)dp; struct udphdr *uh = (struct udphdr*)(dp+(iph->ihl<<2)); @@ -442,9 +480,16 @@ void udp_err(struct sk_buff *skb, unsigned char *dp) int code = skb->h.icmph->code; struct sock *sk; - sk = udp_v4_lookup(iph->daddr, uh->dest, iph->saddr, uh->source); - if (sk == NULL) - return; /* No socket for error */ + if (len < (iph->ihl<<2)+sizeof(struct udphdr)) { + icmp_statistics.IcmpInErrors++; + return; + } + + sk = udp_v4_lookup(iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex); + if (sk == NULL) { + icmp_statistics.IcmpInErrors++; + return; /* No socket for error */ + } if (sk->ip_recverr && !sk->sock_readers) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); @@ -519,7 +564,6 @@ struct udpfakehdr u32 daddr; u32 other; struct iovec *iov; - int nriov; u32 wcheck; }; @@ -533,46 +577,23 @@ struct udpfakehdr static int udp_getfrag(const void *p, char * to, unsigned int offset, unsigned int fraglen) { struct udpfakehdr *ufh = (struct udpfakehdr *)p; - struct iovec *iov; - char *src; - char *dst = to; - 
unsigned int len; - - if (offset == 0) { - fraglen -= sizeof(struct udphdr); - dst += sizeof(struct udphdr); - } - - iov = ufh->iov; - do { - if ((len = iov->iov_len) > fraglen) - len = fraglen; - src = (char *) iov->iov_base + iov->iov_len - len; - ufh->wcheck = csum_partial_copy_fromuser(src, - dst + fraglen - len, len, - ufh->wcheck); - if ((iov->iov_len -= len) == 0) { - if (--(ufh->nriov) < 0) { - printk(KERN_NOTICE "udp_getfrag: nriov = %d\n", - ufh->nriov); - return -EINVAL; - } - iov--; - } - fraglen -= len; - } while (fraglen); - ufh->iov = iov; - - if (offset == 0) { + if (offset==0) { + if (csum_partial_copy_fromiovecend(to+sizeof(struct udphdr), ufh->iov, offset, + fraglen-sizeof(struct udphdr), &ufh->wcheck)) + return -EFAULT; ufh->wcheck = csum_partial((char *)ufh, sizeof(struct udphdr), - ufh->wcheck); + ufh->wcheck); ufh->uh.check = csum_tcpudp_magic(ufh->saddr, ufh->daddr, ntohs(ufh->uh.len), IPPROTO_UDP, ufh->wcheck); if (ufh->uh.check == 0) ufh->uh.check = -1; memcpy(to, ufh, sizeof(struct udphdr)); + return 0; } + if (csum_partial_copy_fromiovecend(to, ufh->iov, offset-sizeof(struct udphdr), + fraglen, &ufh->wcheck)) + return -EFAULT; return 0; } @@ -586,45 +607,19 @@ static int udp_getfrag(const void *p, char * to, unsigned int offset, unsigned i static int udp_getfrag_nosum(const void *p, char * to, unsigned int offset, unsigned int fraglen) { struct udpfakehdr *ufh = (struct udpfakehdr *)p; - struct iovec *iov; - char *src; - char *dst = to; - int err; - unsigned int len; - - if (offset == 0) { - fraglen -= sizeof(struct udphdr); - dst += sizeof(struct udphdr); - } - - iov = ufh->iov; - do { - if ((len = iov->iov_len) > fraglen) - len = fraglen; - src = (char *) iov->iov_base + iov->iov_len - len; - err = copy_from_user(dst + fraglen - len, src, len); - fraglen -= len; - if ((iov->iov_len -= len) == 0) { - if (--(ufh->nriov) < 0) { - printk(KERN_NOTICE "udp_getfrag: nriov = %d\n", - ufh->nriov); - return -EINVAL; - } - iov--; - } - } while 
(fraglen && err >= 0); - ufh->iov = iov; - if (offset == 0) + if (offset==0) { memcpy(to, ufh, sizeof(struct udphdr)); - return err; + return memcpy_fromiovecend(to+sizeof(struct udphdr), ufh->iov, offset, + fraglen-sizeof(struct udphdr)); + } + return memcpy_fromiovecend(to, ufh->iov, offset-sizeof(struct udphdr), + fraglen); } - int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) { int ulen = len + sizeof(struct udphdr); - struct device *dev = NULL; struct ipcm_cookie ipc; struct udpfakehdr ufh; struct rtable *rt; @@ -674,8 +669,9 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) ipc.addr = sk->saddr; ipc.opt = NULL; + ipc.oif = sk->bound_dev_if; if (msg->msg_controllen) { - err = ip_cmsg_send(msg, &ipc, &dev); + err = ip_cmsg_send(msg, &ipc); if (err) return err; if (ipc.opt) @@ -695,17 +691,21 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) tos = RT_TOS(sk->ip_tos) | (sk->localroute || (msg->msg_flags&MSG_DONTROUTE) || (ipc.opt && ipc.opt->is_strictroute)); - if (MULTICAST(daddr) && sk->ip_mc_index && dev == NULL) - err = ip_route_output_dev(&rt, daddr, ufh.saddr, tos, sk->ip_mc_index); - else - err = ip_route_output(&rt, daddr, ufh.saddr, tos, dev); + if (MULTICAST(daddr)) { + if (!ipc.oif) + ipc.oif = sk->ip_mc_index; + if (!ufh.saddr) + ufh.saddr = sk->ip_mc_addr; + } + + err = ip_route_output(&rt, daddr, ufh.saddr, tos, ipc.oif); if (err) { if (free) kfree(ipc.opt); return err; } - if (rt->rt_flags&RTF_BROADCAST && !sk->broadcast) { + if (rt->rt_flags&RTCF_BROADCAST && !sk->broadcast) { if (free) kfree(ipc.opt); ip_rt_put(rt); return -EACCES; @@ -718,8 +718,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len) ufh.uh.len = htons(ulen); ufh.uh.check = 0; ufh.other = (htons(ulen) << 16) + IPPROTO_UDP*256; - ufh.iov = msg->msg_iov + msg->msg_iovlen - 1; - ufh.nriov = msg->msg_iovlen; + ufh.iov = msg->msg_iov; ufh.wcheck = 0; /* RFC1122: OK. 
Provides the checksumming facility (MUST) as per */ @@ -907,10 +906,10 @@ int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) return(-EAFNOSUPPORT); err = ip_route_connect(&rt, usin->sin_addr.s_addr, sk->saddr, - sk->ip_tos|sk->localroute); + sk->ip_tos|sk->localroute, sk->bound_dev_if); if (err) return err; - if ((rt->rt_flags&RTF_BROADCAST) && !sk->broadcast) { + if ((rt->rt_flags&RTCF_BROADCAST) && !sk->broadcast) { ip_rt_put(rt); return -EACCES; } @@ -1024,7 +1023,7 @@ int udp_chkaddr(struct sk_buff *skb) struct udphdr *uh = (struct udphdr *)(skb->nh.raw + iph->ihl*4); struct sock *sk; - sk = udp_v4_lookup(iph->saddr, uh->source, iph->daddr, uh->dest); + sk = udp_v4_lookup(iph->saddr, uh->source, iph->daddr, uh->dest, skb->dev->ifindex); if (!sk) return 0; @@ -1113,17 +1112,17 @@ int udp_rcv(struct sk_buff *skb, unsigned short len) skb_trim(skb,len); - if(rt->rt_flags & (RTF_BROADCAST|RTF_MULTICAST)) + if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return udp_v4_mcast_deliver(skb, uh, saddr, daddr); #ifdef CONFIG_IP_TRANSPARENT_PROXY if (IPCB(skb)->redirport) sk = udp_v4_proxy_lookup(uh->dest, saddr, uh->source, - daddr, skb->dev->pa_addr, - IPCB(skb)->redirport); + daddr, skb->dev, IPCB(skb)->redirport, + skb->dev->ifindex); else #endif - sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest); + sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex); if (sk == NULL) { udp_statistics.UdpNoPorts++; diff --git a/net/ipv4/utils.c b/net/ipv4/utils.c index d2b8e0089..0f463d0ee 100644 --- a/net/ipv4/utils.c +++ b/net/ipv4/utils.c @@ -6,7 +6,7 @@ * Various kernel-resident INET utility functions; mainly * for format conversion and debugging output. * - * Version: @(#)utils.c 1.0.7 05/18/93 + * Version: $Id: utils.c,v 1.5 1997/09/17 18:50:31 freitag Exp $ * * Author: Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 359de74f0..6e69b8813 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: addrconf.c,v 1.21 1997/08/09 03:44:24 davem Exp $ + * $Id: addrconf.c,v 1.28 1997/11/05 20:20:43 kuznet Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -31,6 +31,7 @@ #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/route.h> +#include <linux/inetdevice.h> #include <linux/init.h> #include <linux/proc_fs.h> @@ -42,7 +43,8 @@ #include <net/ndisc.h> #include <net/ip6_route.h> #include <net/addrconf.h> -#include <net/sit.h> +#include <net/ip.h> +#include <linux/if_tunnel.h> #include <asm/uaccess.h> @@ -92,12 +94,11 @@ int ipv6_addr_type(struct in6_addr *addr) st = addr->s6_addr32[0]; - /* - * UCast Provider Based Address - * 0x4/3 + /* Consider all addresses with the first three bits different of + 000 and 111 as unicasts. */ - - if ((st & __constant_htonl(0xE0000000)) == __constant_htonl(0x40000000)) + if ((st & __constant_htonl(0xE0000000)) != __constant_htonl(0x00000000) && + (st & __constant_htonl(0xE0000000)) != __constant_htonl(0xE0000000)) return IPV6_ADDR_UNICAST; if ((st & __constant_htonl(0xFF000000)) == __constant_htonl(0xFF000000)) { @@ -184,6 +185,8 @@ void addrconf_forwarding_on(void) printk(KERN_DEBUG "joining all-routers\n"); #endif idev->router = 1; + + /* Wrong. It is user level function. 
*/ ipv6_addr_all_routers(&maddr); ipv6_dev_mc_inc(idev->dev, &maddr); } @@ -222,6 +225,7 @@ struct inet6_ifaddr * ipv6_add_addr(struct inet6_dev *idev, memcpy(&ifa->addr, addr, sizeof(struct in6_addr)); init_timer(&ifa->timer); + ifa->timer.data = (unsigned long) ifa; ifa->scope = scope; ifa->idev = idev; @@ -361,7 +365,7 @@ struct inet6_ifaddr * ipv6_get_saddr(struct dst_entry *dst, } out: - if (ifp == NULL && match) + if (ifp == NULL) ifp = match; atomic_dec(&addr_list_lock); return ifp; @@ -410,6 +414,157 @@ struct inet6_ifaddr * ipv6_chk_addr(struct in6_addr *addr) return ifp; } +/* Join to solicited addr multicast group. */ + +static void addrconf_join_solict(struct device *dev, struct in6_addr *addr) +{ + struct in6_addr maddr; + + addrconf_addr_solict_mult(addr, &maddr); + ipv6_dev_mc_inc(dev, &maddr); +} + +#ifdef CONFIG_IPV6_EUI64 +static int ipv6_generate_eui64(u8 *eui, struct device *dev) +{ + switch (dev->type) { + case ARPHRD_ETHER: + if (dev->addr_len != ETH_ALEN) + return -1; + memcpy(eui, dev->dev_addr, 3); + memcpy(eui + 5, dev->dev_addr+3, 3); + eui[3] = 0xFF; + eui[4] = 0xFE; + eui[0] ^= 2; + return 0; + } + return -1; +} +#endif + +/* + * Add prefix route. + */ + +static void +addrconf_prefix_route(struct in6_addr *pfx, int plen, struct device *dev, + unsigned long info) +{ + struct in6_rtmsg rtmsg; + int err; + + memset(&rtmsg, 0, sizeof(rtmsg)); + memcpy(&rtmsg.rtmsg_dst, pfx, sizeof(struct in6_addr)); + rtmsg.rtmsg_dst_len = plen; + rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; + rtmsg.rtmsg_ifindex = dev->ifindex; + rtmsg.rtmsg_info = info; + rtmsg.rtmsg_flags = RTF_UP|RTF_ADDRCONF; + + /* Prevent useless cloning on PtP SIT. + This thing is done here expecting that the whole + class of non-broadcast devices need not cloning. 
+ */ + if (dev->type == ARPHRD_SIT && (dev->flags&IFF_POINTOPOINT)) + rtmsg.rtmsg_flags |= RTF_NONEXTHOP; + rtmsg.rtmsg_type = RTMSG_NEWROUTE; + + ip6_route_add(&rtmsg, &err); + + if (err) + printk(KERN_DEBUG "IPv6: error %d adding prefix route\n", err); +} + +/* Create "default" multicast route to the interface */ + +static void addrconf_add_mroute(struct device *dev) +{ + struct in6_rtmsg rtmsg; + struct rt6_info *rt; + int err; + + memset(&rtmsg, 0, sizeof(rtmsg)); + ipv6_addr_set(&rtmsg.rtmsg_dst, + __constant_htonl(0xFF000000), 0, 0, 0); + rtmsg.rtmsg_dst_len = 8; + rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; + rtmsg.rtmsg_ifindex = dev->ifindex; + rtmsg.rtmsg_flags = RTF_UP|RTF_ADDRCONF; + rtmsg.rtmsg_type = RTMSG_NEWROUTE; + + rt = ip6_route_add(&rtmsg, &err); + + /* + * Pedro makes interesting thing here, he attached + * fake nexthop to multicast route. + * It is trick to avoid cloning, ugly, but efficient. --ANK + */ + + if (err) + printk(KERN_DEBUG "IPv6: error %d adding mroute\n", err); + else + rt->rt6i_nexthop = ndisc_get_neigh(dev, &rtmsg.rtmsg_dst); +} + +static void sit_route_add(struct device *dev) +{ + struct in6_rtmsg rtmsg; + struct rt6_info *rt; + int err; + + memset(&rtmsg, 0, sizeof(rtmsg)); + + rtmsg.rtmsg_type = RTMSG_NEWROUTE; + rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; + + /* prefix length - 96 bytes "::d.d.d.d" */ + rtmsg.rtmsg_dst_len = 96; + rtmsg.rtmsg_flags = RTF_UP; + rtmsg.rtmsg_ifindex = dev->ifindex; + + rt = ip6_route_add(&rtmsg, &err); + + /* See comment in addrconf_add_mroute. + * It is the same trick, but to avoid cloning for direct + * sit routes i.e. IPv4 comaptible destinations. 
+ */ + if (err) + printk(KERN_DEBUG "sit_route_add: error %d in route_add\n", err); + else + rt->rt6i_nexthop = ndisc_get_neigh(dev, &rtmsg.rtmsg_dst); +} + +static void addrconf_add_lroute(struct device *dev) +{ + struct in6_addr addr; + + ipv6_addr_set(&addr, __constant_htonl(0xFE800000), 0, 0, 0); + addrconf_prefix_route(&addr, 10, dev, 0); +} + +static struct inet6_dev *addrconf_add_dev(struct device *dev) +{ + struct in6_addr maddr; + struct inet6_dev *idev; + + if ((idev = ipv6_get_idev(dev)) == NULL) { + idev = ipv6_add_dev(dev); + if (idev == NULL) + return NULL; + } + + /* Add default multicast route */ + addrconf_add_mroute(dev); + + /* Add link local route */ + addrconf_add_lroute(dev); + + /* Join to all nodes multicast group. */ + ipv6_addr_all_nodes(&maddr); + ipv6_dev_mc_inc(dev, &maddr); + return idev; +} + void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len) { struct prefix_info *pinfo; @@ -432,7 +587,7 @@ void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len) addr_type = ipv6_addr_type(&pinfo->prefix); - if (addr_type & IPV6_ADDR_LINKLOCAL) + if (addr_type & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)) return; valid_lft = ntohl(pinfo->valid); @@ -470,23 +625,12 @@ void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len) rt->rt6i_expires = rt_expires; } } else if (pinfo->onlink && valid_lft) { - struct in6_rtmsg rtmsg; - int err; - - memset(&rtmsg, 0, sizeof(rtmsg)); - - printk(KERN_DEBUG "adding on link route\n"); - - ipv6_addr_copy(&rtmsg.rtmsg_dst, &pinfo->prefix); - rtmsg.rtmsg_dst_len = pinfo->prefix_len; - rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; - rtmsg.rtmsg_ifindex = dev->ifindex; - rtmsg.rtmsg_flags = RTF_UP | RTF_ADDRCONF; - rtmsg.rtmsg_info = rt_expires; - - ip6_route_add(&rtmsg, &err); + addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, + dev, rt_expires); } + /* Try to figure out our local address for this prefix */ + if (pinfo->autoconf && ipv6_config.autoconf) { struct inet6_ifaddr * ifp; struct 
in6_addr addr; @@ -494,33 +638,41 @@ void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len) plen = pinfo->prefix_len >> 3; - if (plen + dev->addr_len == sizeof(struct in6_addr)) { +#ifdef CONFIG_IPV6_EUI64 + if (pinfo->prefix_len == 64) { + memcpy(&addr, &pinfo->prefix, 8); + if (ipv6_generate_eui64(addr.s6_addr + 8, dev)) + return; + goto ok; + } +#endif +#ifndef CONFIG_IPV6_NO_PB + if (pinfo->prefix_len == ((sizeof(struct in6_addr) - dev->addr_len)<<3)) { memcpy(&addr, &pinfo->prefix, plen); memcpy(addr.s6_addr + plen, dev->dev_addr, dev->addr_len); - } else { - ADBG(("addrconf: prefix_len invalid\n")); - return; + goto ok; } +#endif + printk(KERN_DEBUG "IPv6 addrconf: prefix with wrong length %d\n", pinfo->prefix_len); + return; +ok: ifp = ipv6_chk_addr(&addr); if (ifp == NULL && valid_lft) { struct inet6_dev *in6_dev = ipv6_get_idev(dev); - if (in6_dev == NULL) - ADBG(("addrconf: device not configured\n")); - + if (in6_dev == NULL) { + printk(KERN_DEBUG "addrconf: device %s not configured\n", dev->name); + return; + } + ifp = ipv6_add_addr(in6_dev, &addr, addr_type & IPV6_ADDR_SCOPE_MASK); - if (dev->flags & IFF_MULTICAST) { - struct in6_addr maddr; - - /* Join to solicited addr multicast group. 
*/ - addrconf_addr_solict_mult(&addr, &maddr); - ipv6_dev_mc_inc(dev, &maddr); - } + if (ifp == NULL) + return; ifp->prefix_len = pinfo->prefix_len; @@ -564,17 +716,32 @@ int addrconf_set_dstaddr(void *arg) } if (dev->type == ARPHRD_SIT) { - struct device *dev; - + struct ifreq ifr; + mm_segment_t oldfs; + struct ip_tunnel_parm p; + if (!(ipv6_addr_type(&ireq.ifr6_addr) & IPV6_ADDR_COMPATv4)) return -EADDRNOTAVAIL; - - dev = sit_add_tunnel(ireq.ifr6_addr.s6_addr32[3]); - - if (dev == NULL) - err = -ENODEV; - else - err = 0; + + memset(&p, 0, sizeof(p)); + p.iph.daddr = ireq.ifr6_addr.s6_addr32[3]; + p.iph.saddr = 0; + p.iph.version = 4; + p.iph.ihl = 5; + p.iph.protocol = IPPROTO_IPV6; + p.iph.ttl = 64; + ifr.ifr_ifru.ifru_data = (void*)&p; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL); + set_fs(oldfs); + + if (err == 0) { + err = -ENOBUFS; + if ((dev = dev_get(p.name)) == NULL) + goto err_exit; + err = dev_open(dev); + } } err_exit: @@ -595,38 +762,27 @@ int addrconf_add_ifaddr(void *arg) if (!suser()) return -EPERM; - if(copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) + if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) return -EFAULT; - if((dev = dev_get_by_index(ireq.ifr6_ifindex)) == NULL) - return -EINVAL; + if ((dev = dev_get_by_index(ireq.ifr6_ifindex)) == NULL) + return -ENODEV; + + if (!(dev->flags&IFF_UP)) + return -ENETDOWN; - if ((idev = ipv6_get_idev(dev)) == NULL) - return -EINVAL; + if ((idev = addrconf_add_dev(dev)) == NULL) + return -ENOBUFS; scope = ipv6_addr_scope(&ireq.ifr6_addr); if((ifp = ipv6_add_addr(idev, &ireq.ifr6_addr, scope)) == NULL) return -ENOMEM; - ifp->prefix_len = 128; - - if (dev->flags & IFF_MULTICAST) { - struct in6_addr maddr; - - /* Join to solicited addr multicast group. 
*/ - addrconf_addr_solict_mult(&ireq.ifr6_addr, &maddr); - ipv6_dev_mc_inc(dev, &maddr); - } - ifp->prefix_len = ireq.ifr6_prefixlen; ifp->flags |= ADDR_PERMANENT; - if (!(dev->flags & (IFF_NOARP|IFF_LOOPBACK))) - addrconf_dad_start(ifp); - else - ip6_rt_addr_add(&ifp->addr, dev); - + addrconf_dad_start(ifp); return 0; } @@ -645,90 +801,22 @@ int addrconf_del_ifaddr(void *arg) return -EFAULT; if ((dev = dev_get_by_index(ireq.ifr6_ifindex)) == NULL) - return -EINVAL; + return -ENODEV; if ((idev = ipv6_get_idev(dev)) == NULL) - return -EINVAL; + return -ENXIO; scope = ipv6_addr_scope(&ireq.ifr6_addr); for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { - if (ifp->scope == scope && - (!memcmp(&ireq.ifr6_addr, &ifp->addr, sizeof(struct in6_addr)))) { - ipv6_del_addr(ifp); - break; - } - } - - return 0; -} - -static void sit_route_add(struct device *dev) -{ - struct in6_rtmsg rtmsg; - struct rt6_info *rt; - int err; - - ADBG(("sit_route_add(%s): ", dev->name)); - memset(&rtmsg, 0, sizeof(rtmsg)); - - rtmsg.rtmsg_type = RTMSG_NEWROUTE; - rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; - - if (dev->pa_dstaddr == 0) { - ADBG(("pa_dstaddr=0, ")); - /* prefix length - 96 bytes "::d.d.d.d" */ - rtmsg.rtmsg_dst_len = 96; - rtmsg.rtmsg_flags = RTF_NONEXTHOP|RTF_UP; - } else { - ADBG(("pa_dstaddr=%08x, ", dev->pa_dstaddr)); - rtmsg.rtmsg_dst_len = 10; - rtmsg.rtmsg_dst.s6_addr32[0] = __constant_htonl(0xfe800000); - rtmsg.rtmsg_dst.s6_addr32[3] = dev->pa_dstaddr; - rtmsg.rtmsg_gateway.s6_addr32[3]= dev->pa_dstaddr; - rtmsg.rtmsg_flags = RTF_UP; - } - - rtmsg.rtmsg_ifindex = dev->ifindex; - ADBG(("doing ip6_route_add()\n")); - rt = ip6_route_add(&rtmsg, &err); - - if (err) { -#if ACONF_DEBUG >= 1 - printk(KERN_DEBUG "sit_route_add: error %d in route_add\n", err); -#endif - } - - ADBG(("sit_route_add(cont): ")); - if (dev->pa_dstaddr) { - struct rt6_info *mrt; - - ADBG(("pa_dstaddr != 0, ")); - rt->rt6i_nexthop = ndisc_get_neigh(dev, &rtmsg.rtmsg_gateway); - if (rt->rt6i_nexthop == NULL) 
{ - ADBG(("can't get neighbour\n")); - printk(KERN_DEBUG "sit_route: get_neigh failed\n"); + if (ifp->scope == scope && + (!memcmp(&ireq.ifr6_addr, &ifp->addr, sizeof(struct in6_addr)))) { + ipv6_del_addr(ifp); + break; } - - /* - * Add multicast route. - */ - ADBG(("add MULT, ")); - ipv6_addr_set(&rtmsg.rtmsg_dst, __constant_htonl(0xFF000000), 0, 0, 0); - - rtmsg.rtmsg_dst_len = 8; - rtmsg.rtmsg_flags = RTF_UP; - rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; - - memset(&rtmsg.rtmsg_gateway, 0, sizeof(struct in6_addr)); - ADBG(("doing ip6_route_add()\n")); - mrt = ip6_route_add(&rtmsg, &err); - - if (mrt) - mrt->rt6i_nexthop = ndisc_get_neigh(dev, &rtmsg.rtmsg_dst); - } else { - ADBG(("pa_dstaddr==0\n")); } + + return 0; } static void sit_add_v4_addrs(struct inet6_dev *idev) @@ -739,34 +827,55 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) int scope; memset(&addr, 0, sizeof(struct in6_addr)); + memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4); - if (idev->dev->pa_dstaddr) { + if (idev->dev->flags&IFF_POINTOPOINT) { addr.s6_addr32[0] = __constant_htonl(0xfe800000); scope = IFA_LINK; } else { scope = IPV6_ADDR_COMPATv4; } + if (addr.s6_addr32[3]) { + ifp = ipv6_add_addr(idev, &addr, scope); + if (ifp) { + ifp->flags |= ADDR_PERMANENT; + ifp->prefix_len = 128; + ip6_rt_addr_add(&ifp->addr, idev->dev); + } + return; + } + for (dev = dev_base; dev != NULL; dev = dev->next) { - if (dev->family == AF_INET && (dev->flags & IFF_UP)) { + if (dev->ip_ptr && (dev->flags & IFF_UP)) { + struct in_device * in_dev = dev->ip_ptr; + struct in_ifaddr * ifa; + int flag = scope; - - addr.s6_addr32[3] = dev->pa_addr; - if (dev->flags & IFF_LOOPBACK) { - if (idev->dev->pa_dstaddr) - continue; + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + addr.s6_addr32[3] = ifa->ifa_local; - flag |= IFA_HOST; - } - - ifp = ipv6_add_addr(idev, &addr, flag); + if (ifa->ifa_scope == RT_SCOPE_LINK) + continue; + if (ifa->ifa_scope >= RT_SCOPE_HOST) { + if 
(idev->dev->flags&IFF_POINTOPOINT) + continue; + flag |= IFA_HOST; + } - if (ifp == NULL) - continue; + ifp = ipv6_add_addr(idev, &addr, flag); + + if (ifp == NULL) + continue; - ifp->flags |= ADDR_PERMANENT; - ip6_rt_addr_add(&ifp->addr, dev); + if (idev->dev->flags&IFF_POINTOPOINT) + ifp->prefix_len = 10; + else + ifp->prefix_len = 96; + ifp->flags |= ADDR_PERMANENT; + ip6_rt_addr_add(&ifp->addr, dev); + } } } } @@ -804,56 +913,98 @@ static void init_loopback(struct device *dev) printk(KERN_DEBUG "init_loopback: error in route_add\n"); } -static void addrconf_eth_config(struct device *dev) +static void addrconf_add_linklocal(struct inet6_dev *idev, struct in6_addr *addr) +{ + struct inet6_ifaddr * ifp; + + ifp = ipv6_add_addr(idev, addr, IFA_LINK); + if (ifp == NULL) + return; + + ifp->flags = ADDR_PERMANENT; + ifp->prefix_len = 10; + + addrconf_dad_start(ifp); +} + +static void addrconf_dev_config(struct device *dev) { struct in6_addr addr; struct in6_addr maddr; - struct inet6_ifaddr * ifp; struct inet6_dev * idev; + if (dev->type != ARPHRD_ETHER) { + /* Alas, we support only ethernet autoconfiguration. */ + return; + } + + idev = addrconf_add_dev(dev); + if (idev == NULL) + return; + +#ifdef CONFIG_IPV6_EUI64 memset(&addr, 0, sizeof(struct in6_addr)); - /* Generate link local address. */ addr.s6_addr[0] = 0xFE; addr.s6_addr[1] = 0x80; - memcpy(addr.s6_addr + (sizeof(struct in6_addr) - dev->addr_len), - dev->dev_addr, dev->addr_len); + if (ipv6_generate_eui64(addr.s6_addr + 8, dev) == 0) + addrconf_add_linklocal(idev, &addr); +#endif - idev = ipv6_add_dev(dev); - if (idev == NULL) - return; - - ifp = ipv6_add_addr(idev, &addr, IFA_LINK); - if (ifp == NULL) - return; +#ifndef CONFIG_IPV6_NO_PB + memset(&addr, 0, sizeof(struct in6_addr)); - ifp->flags = ADDR_PERMANENT; - ifp->prefix_len = 10; + addr.s6_addr[0] = 0xFE; + addr.s6_addr[1] = 0x80; - /* Join to all nodes multicast group. 
*/ - ipv6_addr_all_nodes(&maddr); - ipv6_dev_mc_inc(dev, &maddr); + memcpy(addr.s6_addr + (sizeof(struct in6_addr) - dev->addr_len), + dev->dev_addr, dev->addr_len); + addrconf_add_linklocal(idev, &addr); +#endif if (ipv6_config.forwarding) { idev->router = 1; + + /* It is wrong. + It is routing daemon or radvd that must make it, + rather than kernel. + */ ipv6_addr_all_routers(&maddr); ipv6_dev_mc_inc(dev, &maddr); } +} - /* Join to solicited addr multicast group. */ - addrconf_addr_solict_mult(&addr, &maddr); - ipv6_dev_mc_inc(dev, &maddr); +static void addrconf_sit_config(struct device *dev) +{ + struct inet6_dev *idev; - /* Start duplicate address detection. */ - addrconf_dad_start(ifp); + /* + * Configure the tunnel with one of our IPv4 + * addresses... we should configure all of + * our v4 addrs in the tunnel + */ + + idev = ipv6_add_dev(dev); + if (idev == NULL) { + printk(KERN_DEBUG "init sit: add_dev failed\n"); + return; + } + + sit_add_v4_addrs(idev); + + if (dev->flags&IFF_POINTOPOINT) { + addrconf_add_mroute(dev); + addrconf_add_lroute(dev); + } else + sit_route_add(dev); } + int addrconf_notify(struct notifier_block *this, unsigned long event, void * data) { struct device *dev; - struct inet6_dev * idev; dev = (struct device *) data; @@ -861,34 +1012,15 @@ int addrconf_notify(struct notifier_block *this, unsigned long event, case NETDEV_UP: switch(dev->type) { case ARPHRD_SIT: - - printk(KERN_DEBUG "sit device up: %s\n", dev->name); - - /* - * Configure the tunnel with one of our IPv4 - * addresses... we should configure all of - * our v4 addrs in the tunnel - */ - - idev = ipv6_add_dev(dev); - - sit_add_v4_addrs(idev); - - /* - * we do an hack for now to configure the tunnel - * route. 
- */ - - sit_route_add(dev); + addrconf_sit_config(dev); break; case ARPHRD_LOOPBACK: init_loopback(dev); break; - case ARPHRD_ETHER: - printk(KERN_DEBUG "Configuring eth interface\n"); - addrconf_eth_config(dev); + default: + addrconf_dev_config(dev); break; }; @@ -934,7 +1066,6 @@ static int addrconf_ifdown(struct device *dev) } if (idev == NULL) { - printk(KERN_DEBUG "addrconf_ifdown: device not found\n"); end_bh_atomic(); return -ENODEV; } @@ -958,8 +1089,8 @@ static int addrconf_ifdown(struct device *dev) ifa = *bifa; continue; } - ifa = ifa->lst_next; bifa = &ifa->lst_next; + ifa = *bifa; } } @@ -968,6 +1099,7 @@ static int addrconf_ifdown(struct device *dev) return 0; } + static void addrconf_rs_timer(unsigned long data) { struct inet6_ifaddr *ifp; @@ -1003,10 +1135,8 @@ static void addrconf_rs_timer(unsigned long data) struct in6_rtmsg rtmsg; int err; -#if ACONF_DEBUG >= 2 printk(KERN_DEBUG "%s: no IPv6 routers present\n", ifp->idev->dev->name); -#endif memset(&rtmsg, 0, sizeof(struct in6_rtmsg)); rtmsg.rtmsg_type = RTMSG_NEWROUTE; @@ -1031,27 +1161,17 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp) dev = ifp->idev->dev; - if (dev->flags & IFF_MULTICAST) { - struct in6_rtmsg rtmsg; - struct rt6_info *mrt; - int err; - - memset(&rtmsg, 0, sizeof(rtmsg)); - ipv6_addr_set(&rtmsg.rtmsg_dst, - __constant_htonl(0xFF000000), 0, 0, 0); - - rtmsg.rtmsg_dst_len = 8; - rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; - rtmsg.rtmsg_ifindex = dev->ifindex; - - rtmsg.rtmsg_flags = RTF_UP; + addrconf_join_solict(dev, &ifp->addr); - mrt = ip6_route_add(&rtmsg, &err); + if (ifp->prefix_len != 128) + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, 0); - if (err) - printk(KERN_DEBUG "dad_start: mcast route add failed\n"); - else - mrt->rt6i_nexthop = ndisc_get_neigh(dev, &rtmsg.rtmsg_dst); + if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { + start_bh_atomic(); + ifp->flags &= ~DAD_INCOMPLETE; + addrconf_dad_completed(ifp); + end_bh_atomic(); + return; } if (rand_seed) 
{ @@ -1059,15 +1179,12 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp) nd_rand_seed = ifp->addr.s6_addr32[3]; } - init_timer(&ifp->timer); - ifp->probes = ipv6_config.dad_transmits; ifp->flags |= DAD_INCOMPLETE; rand_num = ipv6_random() % ipv6_config.rtr_solicit_delay; ifp->timer.function = addrconf_dad_timer; - ifp->timer.data = (unsigned long) ifp; ifp->timer.expires = jiffies + rand_num; add_timer(&ifp->timer); @@ -1105,62 +1222,41 @@ static void addrconf_dad_timer(unsigned long data) static void addrconf_dad_completed(struct inet6_ifaddr *ifp) { - struct device *dev; - int err; + struct device * dev = ifp->idev->dev; - dev = ifp->idev->dev; + /* + * Configure the address for reception. Now it is valid. + */ - if (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) { - struct in6_rtmsg rtmsg; - struct in6_addr all_routers; + ip6_rt_addr_add(&ifp->addr, dev); - /* - * 1) configure a link route for this interface - * 2) send a (delayed) router solicitation - */ + /* If added prefix is link local and forwarding is off, + start sending router solicitations. + */ - memset(&rtmsg, 0, sizeof(rtmsg)); - - memcpy(&rtmsg.rtmsg_dst, &ifp->addr, sizeof(struct in6_addr)); + if (ipv6_config.forwarding == 0 && + (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) == 0 && + (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) { + struct in6_addr all_routers; - rtmsg.rtmsg_dst_len = ifp->prefix_len; - rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; - rtmsg.rtmsg_ifindex = dev->ifindex; + ipv6_addr_set(&all_routers, + __constant_htonl(0xff020000U), 0, 0, + __constant_htonl(0x2U)); - rtmsg.rtmsg_flags = RTF_UP; + /* + * If a host as already performed a random delay + * [...] as part of DAD [...] 
there is no need + * to delay again before sending the first RS + */ + ndisc_send_rs(ifp->idev->dev, &ifp->addr, &all_routers); - ip6_route_add(&rtmsg, &err); - - if (err) - printk(KERN_DEBUG "dad_complete: error in route_add\n"); - - if (ipv6_config.forwarding == 0) { - ipv6_addr_set(&all_routers, - __constant_htonl(0xff020000U), 0, 0, - __constant_htonl(0x2U)); - - /* - * If a host as already performed a random delay - * [...] as part of DAD [...] there is no need - * to delay again before sending the first RS - */ - ndisc_send_rs(ifp->idev->dev, &ifp->addr, - &all_routers); - - ifp->probes = 1; - ifp->timer.function = addrconf_rs_timer; - ifp->timer.expires = (jiffies + - ipv6_config.rtr_solicit_interval); - ifp->idev->if_flags |= IF_RS_SENT; - add_timer(&ifp->timer); - } + ifp->probes = 1; + ifp->timer.function = addrconf_rs_timer; + ifp->timer.expires = (jiffies + + ipv6_config.rtr_solicit_interval); + ifp->idev->if_flags |= IF_RS_SENT; + add_timer(&ifp->timer); } - - /* - * configure the address for reception - */ - - ip6_rt_addr_add(&ifp->addr, dev); } #ifdef CONFIG_PROC_FS @@ -1251,7 +1347,9 @@ void addrconf_verify(unsigned long foo) __initfunc(void addrconf_init(void)) { +#ifdef MODULE struct device *dev; +#endif /* * init address and device hash lists @@ -1263,24 +1361,25 @@ __initfunc(void addrconf_init(void)) memset(inet6_dev_lst, 0, IN6_ADDR_HSIZE * sizeof(struct inet6_dev *)); - /* - * Init loopback device - */ - - dev = dev_get("lo"); - - if (dev && (dev->flags & IFF_UP)) - init_loopback(dev); - - /* - * and maybe: - * search availiable AF_INET devs and try to configure them - */ +#ifdef MODULE + /* This takes sense only during module load. 
*/ - dev = dev_get("eth0"); + for (dev = dev_base; dev; dev = dev->next) { + if (!(dev->flags&IFF_UP)) + continue; - if (dev && (dev->flags & IFF_UP)) - addrconf_eth_config(dev); + switch (dev->type) { + case ARPHRD_LOOPBACK: + init_loopback(dev); + break; + case ARPHRD_ETHER: + addrconf_dev_config(dev); + break; + default: + /* Ignore all other */ + } + } +#endif #ifdef CONFIG_PROC_FS proc_net_register(&iface_proc_entry); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 8d2755b09..9f707272f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -7,7 +7,7 @@ * * Adapted from linux/net/ipv4/af_inet.c * - * $Id: af_inet6.c,v 1.21 1997/08/20 11:25:00 alan Exp $ + * $Id: af_inet6.c,v 1.23 1997/10/29 20:27:52 kuznet Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -44,7 +44,7 @@ #include <net/ipv6.h> #include <net/udp.h> #include <net/tcp.h> -#include <net/sit.h> +#include <net/ipip.h> #include <net/protocol.h> #include <net/inet_common.h> #include <net/transp_v6.h> @@ -200,7 +200,7 @@ static int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* Check if the address belongs to the host. 
*/ if (addr_type == IPV6_ADDR_MAPPED) { v4addr = addr->sin6_addr.s6_addr32[3]; - if (__ip_chk_addr(v4addr) != IS_MYADDR) + if (inet_addr_type(v4addr) != RTN_LOCAL) return(-EADDRNOTAVAIL); } else { if (addr_type != IPV6_ADDR_ANY) { @@ -354,8 +354,8 @@ static int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCGIFMAP: case SIOCSIFSLAVE: case SIOCGIFSLAVE: - case SIOGIFINDEX: - case SIOGIFNAME: + case SIOCGIFINDEX: + case SIOCGIFNAME: case SIOCGIFCOUNT: return(dev_ioctl(cmd,(void *) arg)); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 36eb01ddc..28d9af57e 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: icmp.c,v 1.10 1997/06/05 11:07:20 schenk Exp $ + * $Id: icmp.c,v 1.11 1997/09/20 20:48:26 davem Exp $ * * Based on net/ipv4/icmp.c * diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 0ad79f211..6c9f24492 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: ip6_fib.c,v 1.7 1997/04/12 04:32:46 davem Exp $ + * $Id: ip6_fib.c,v 1.9 1997/09/20 20:48:27 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -28,7 +28,6 @@ #include <net/ipv6.h> #include <net/ndisc.h> #include <net/addrconf.h> -#include <net/netlink.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> diff --git a/net/ipv6/ip6_fw.c b/net/ipv6/ip6_fw.c index 5a47cc251..ddce1ccfa 100644 --- a/net/ipv6/ip6_fw.c +++ b/net/ipv6/ip6_fw.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: ip6_fw.c,v 1.5 1997/04/29 09:38:44 mj Exp $ + * $Id: ip6_fw.c,v 1.7 1997/10/06 23:09:54 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -13,6 +13,7 @@ * 2 of the License, or (at your option) any later version. 
*/ +#include <linux/config.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> @@ -368,12 +369,16 @@ static void ip6_fw_destroy(struct flow_rule *rl) __initfunc(void ip6_fw_init(void)) { +#ifdef CONFIG_NETLINK netlink_attach(NETLINK_IP6_FW, ip6_fw_msgrcv); +#endif } #ifdef MODULE void module_cleanup(void) { +#ifdef CONFIG_NETLINK netlink_detach(NETLINK_IP6_FW); +#endif } #endif diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 534ebc66a..72ce290ae 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -6,7 +6,7 @@ * Pedro Roque <roque@di.fc.ul.pt> * Ian P. Morris <I.P.Morris@soton.ac.uk> * - * $Id: ip6_input.c,v 1.6 1997/05/11 16:06:52 davem Exp $ + * $Id: ip6_input.c,v 1.7 1997/09/20 20:48:27 davem Exp $ * * Based in linux/net/ipv4/ip_input.c * diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 7a865296f..e0b20e066 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: ip6_output.c,v 1.3 1997/03/18 18:24:37 davem Exp $ + * $Id: ip6_output.c,v 1.5 1997/09/21 18:33:14 kuznet Exp $ * * Based on linux/net/ipv4/ip_output.c * @@ -540,6 +540,11 @@ int ip6_forward(struct sk_buff *skb) struct ipv6hdr *hdr = skb->nh.ipv6h; int size; + if (ipv6_config.forwarding == 0) { + kfree_skb(skb, FREE_READ); + return -EINVAL; + } + /* * check hop-by-hop options present */ diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 64cfb00d5..98d8339b2 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -7,7 +7,7 @@ * * Based on linux/net/ipv4/ip_sockglue.c * - * $Id: ipv6_sockglue.c,v 1.13 1997/05/15 18:55:10 davem Exp $ + * $Id: ipv6_sockglue.c,v 1.15 1997/10/29 20:27:54 kuznet Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -43,7 +43,6 @@ #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/inet_common.h> 
-#include <net/sit.h> #include <net/tcp.h> #include <net/udp.h> @@ -111,6 +110,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, sk->prot = &tcp_prot; tp->af_specific = &ipv4_specific; sk->socket->ops = &inet_stream_ops; + sk->family = AF_INET; } else { sk->prot = &udp_prot; sk->socket->ops = &inet_dgram_ops; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 637f434d4..eae3efed6 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: mcast.c,v 1.10 1997/05/07 09:40:22 davem Exp $ + * $Id: mcast.c,v 1.11 1997/10/29 20:27:50 kuznet Exp $ * * Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c * @@ -417,7 +417,10 @@ void igmp6_send(struct in6_addr *addr, struct device *dev, int type) skb_reserve(skb, (dev->hard_header_len + 15) & ~15); if (dev->hard_header) { unsigned char ha[MAX_ADDR_LEN]; - ipv6_mc_map(addr, ha); + if (dev->type == ARPHRD_ETHER) + ipv6_mc_map(addr, ha); + else + memcpy(ha, dev->broadcast, dev->addr_len); dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, plen); skb->arp = 1; } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 83b5cf3bc..04d92b6b9 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -6,8 +6,6 @@ * Pedro Roque <roque@di.fc.ul.pt> * Mike Shaver <shaver@ingenia.com> * - * $Id: ndisc.c,v 1.15 1997/04/29 09:38:48 mj Exp $ - * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -24,7 +22,7 @@ */ /* Set to 3 to get tracing... 
*/ -#define ND_DEBUG 2 +#define ND_DEBUG 1 #if ND_DEBUG >= 3 #define NDBG(x) printk x @@ -396,7 +394,10 @@ int ndisc_eth_resolv(unsigned char *h_dest, struct sk_buff *skb) struct in6_addr *daddr; daddr = &skb->nh.ipv6h->daddr; - ipv6_mc_map(daddr, h_dest); + if (skb->dev->type == ARPHRD_ETHER) + ipv6_mc_map(daddr, h_dest); + else + memcpy(h_dest, skb->dev->broadcast, skb->dev->addr_len); return 0; } @@ -434,6 +435,54 @@ int ndisc_eth_resolv(unsigned char *h_dest, struct sk_buff *skb) return 1; } +static int +ndisc_build_ll_hdr(struct sk_buff *skb, struct device *dev, + struct in6_addr *daddr, struct neighbour *neigh, int len) +{ + unsigned char ha[MAX_ADDR_LEN]; + unsigned char *h_dest = NULL; + + skb->arp = 1; + if (dev->hard_header_len) { + skb_reserve(skb, (dev->hard_header_len + 15) & ~15); + + if (dev->hard_header) { + if (ipv6_addr_type(daddr) & IPV6_ADDR_MULTICAST) { + nd_stats.snt_probes_mcast++; + if (dev->type == ARPHRD_ETHER) + ipv6_mc_map(daddr, ha); + else + memcpy(ha, dev->broadcast, dev->addr_len); + h_dest = ha; + } else if (neigh) { + h_dest = neigh->ha; + nd_stats.snt_probes_ucast++; + } else { + struct nd_neigh *ndn; + + neigh_table_lock(&nd_tbl); + + neigh = neigh_lookup(&nd_tbl, (void *) daddr, + sizeof(struct in6_addr), dev); + if (neigh) { + ndn = (struct nd_neigh*)neigh; + if (ndn->ndn_flags&NTF_COMPLETE) { + memcpy(ha, ndn->ndn_ha, dev->addr_len); + h_dest = ha; + } + } + neigh_table_unlock(&nd_tbl); + } + + if (dev->hard_header(skb, dev, ETH_P_IPV6, h_dest, NULL, len) < 0) + skb->arp = 0; + } + } + + return skb->arp; +} + + /* * Send a Neighbour Advertisement */ @@ -486,17 +535,10 @@ void ndisc_send_na(struct device *dev, struct nd_neigh *ndn, printk(KERN_DEBUG "send_na: alloc skb failed\n"); return; } - /* - * build the MAC header - */ - - if (dev->hard_header_len) { - skb_reserve(skb, (dev->hard_header_len + 15) & ~15); - if (dev->hard_header) { - dev->hard_header(skb, dev, ETH_P_IPV6, ndn->ndn_ha, - NULL, len); - skb->arp = 1; - } + + 
if (ndisc_build_ll_hdr(skb, dev, daddr, (struct neighbour*)ndn, len) == 0) { + kfree_skb(skb, FREE_WRITE); + return; } ip6_nd_hdr(sk, skb, dev, solicited_addr, daddr, IPPROTO_ICMPV6, len); @@ -540,12 +582,10 @@ void ndisc_send_ns(struct device *dev, struct neighbour *neigh, struct in6_addr *solicit, struct in6_addr *daddr, struct in6_addr *saddr) { - unsigned char ha[MAX_ADDR_LEN]; struct sock *sk = ndisc_socket->sk; struct sk_buff *skb; struct nd_msg *msg; int len, opt_len; - void *h_dest; int err; NDBG(("ndisc_send_ns(%s,%p): ", (dev ? dev->name : "[NULL]"), neigh)); @@ -581,7 +621,11 @@ void ndisc_send_ns(struct device *dev, struct neighbour *neigh, return; } +#if 0 + /* Why Pedro did it? Is it remnant of early + attempts to avoid looping back? I have no idea. --ANK */ skb->pkt_type = PACKET_NDISC; +#endif if (saddr == NULL) { struct inet6_ifaddr *ifa; @@ -593,29 +637,9 @@ void ndisc_send_ns(struct device *dev, struct neighbour *neigh, saddr = &ifa->addr; } - if ((ipv6_addr_type(daddr) & IPV6_ADDR_MULTICAST)) { - nd_stats.snt_probes_mcast++; - ipv6_mc_map(daddr, ha); - h_dest = ha; - } else { - if (neigh == NULL) { -#if ND_DEBUG >= 1 - printk(KERN_DEBUG "send_ns: ucast destination " - "with null neighbour\n"); -#endif - return; - } - h_dest = neigh->ha; - nd_stats.snt_probes_ucast++; - } - - if (dev->hard_header_len) { - skb_reserve(skb, (dev->hard_header_len + 15) & ~15); - if (dev->hard_header) { - dev->hard_header(skb, dev, ETH_P_IPV6, h_dest, NULL, - len); - skb->arp = 1; - } + if (ndisc_build_ll_hdr(skb, dev, daddr, neigh, len) == 0) { + kfree_skb(skb, FREE_WRITE); + return; } ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len); @@ -684,15 +708,9 @@ void ndisc_send_rs(struct device *dev, struct in6_addr *saddr, return; } - if (dev->hard_header_len) { - skb_reserve(skb, (dev->hard_header_len + 15) & ~15); - if (dev->hard_header) { - unsigned char ha[MAX_ADDR_LEN]; - - ipv6_mc_map(daddr, ha); - dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, len); - 
skb->arp = 1; - } + if (ndisc_build_ll_hdr(skb, dev, daddr, NULL, len) == 0) { + kfree_skb(skb, FREE_WRITE); + return; } ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len); @@ -783,15 +801,19 @@ static void ndisc_timer_handler(unsigned long arg) ntimer = min(ntimer, time); } ndn = (struct nd_neigh *) ndn->neigh.next; - } while (ndn != head); } if (ntimer != (~0UL)) { - ndisc_timer.expires = now + ntimer; + unsigned long tval = jiffies + ntimer; + if (del_timer(&ndisc_timer)) { + if (ndisc_timer.expires - tval < 0) + tval = ndisc_timer.expires; + } + ndisc_timer.expires = tval; add_timer(&ndisc_timer); } - + neigh_table_unlock(&nd_tbl); } @@ -1238,14 +1260,12 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) NDBG(("ndisc_redirect_rcv(%p)\n", skb)); if (skb->nh.ipv6h->hop_limit != 255) { - printk(KERN_WARNING - "NDISC: fake ICMP redirect received\n"); + printk(KERN_WARNING "NDISC: fake ICMP redirect received\n"); return; } if (!(ipv6_addr_type(&skb->nh.ipv6h->saddr) & IPV6_ADDR_LINKLOCAL)) { - printk(KERN_WARNING - "ICMP redirect: source address is not linklocal\n"); + printk(KERN_WARNING "ICMP redirect: source address is not linklocal\n"); return; } @@ -1269,19 +1289,15 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) if (ipv6_addr_cmp(dest, target) == 0) { on_link = 1; } else if (!(ipv6_addr_type(target) & IPV6_ADDR_LINKLOCAL)) { - printk(KERN_WARNING - "ICMP redirect: target address is not linklocal\n"); + printk(KERN_WARNING "ICMP redirect: target address is not linklocal\n"); return; } /* passed validation tests */ - rt = rt6_redirect(dest, &skb->nh.ipv6h->saddr, target, skb->dev, - on_link); + rt = rt6_redirect(dest, &skb->nh.ipv6h->saddr, target, skb->dev, on_link); - if (rt == NULL) { - printk(KERN_WARNING "ICMP redirect: no route to host\n"); + if (rt == NULL) return; - } ndn = (struct nd_neigh *) rt->rt6i_nexthop; @@ -1365,13 +1381,9 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, hlen = 0; - if 
(dev->hard_header_len) { - skb_reserve(buff, (dev->hard_header_len + 15) & ~15); - if (dev->hard_header) { - dev->hard_header(buff, dev, ETH_P_IPV6, ndn->ndn_ha, - NULL, len); - buff->arp = 1; - } + if (ndisc_build_ll_hdr(buff, dev, &skb->nh.ipv6h->saddr, NULL, len) == 0) { + kfree_skb(buff, FREE_WRITE); + return; } ip6_nd_hdr(sk, buff, dev, &ifp->addr, &skb->nh.ipv6h->saddr, @@ -1471,25 +1483,32 @@ int ndisc_rcv(struct sk_buff *skb, struct device *dev, switch (msg->icmph.icmp6_type) { case NDISC_NEIGHBOUR_SOLICITATION: NDBG(("NS ")); - if ((ifp = ipv6_chk_addr(&msg->target))) { - int addr_type; + if ((ifp = ipv6_chk_addr(&msg->target)) != NULL) { + int addr_type = ipv6_addr_type(saddr); if (ifp->flags & DAD_INCOMPLETE) { - /* - * DAD failed + /* Address is tentative. If the source + is unspecified address, it is someone + does DAD, otherwise we ignore solicitations + until DAD timer expires. */ + if (addr_type == IPV6_ADDR_ANY) { + printk(KERN_INFO "%s: duplicate address detected!\n", + ifp->idev->dev->name); + del_timer(&ifp->timer); + } + return 0; + } - /* XXX Check if this came in over same interface - * XXX we just sent an NS from! That is valid! 
-DaveM - */ + if (addr_type == IPV6_ADDR_ANY) { + struct in6_addr maddr; - printk(KERN_DEBUG "%s: duplicate address\n", - ifp->idev->dev->name); - del_timer(&ifp->timer); + ipv6_addr_all_nodes(&maddr); + ndisc_send_na(dev, NULL, &maddr, &ifp->addr, + ifp->idev->router, 0, 1, 1); return 0; } - addr_type = ipv6_addr_type(saddr); if (addr_type & IPV6_ADDR_UNICAST) { int inc; @@ -1512,7 +1531,6 @@ int ndisc_rcv(struct sk_buff *skb, struct device *dev, ifp->idev->router, 1, inc, inc); } else { #if ND_DEBUG >= 1 - /* FIXME */ printk(KERN_DEBUG "ns: non unicast saddr\n"); #endif } @@ -1521,6 +1539,28 @@ int ndisc_rcv(struct sk_buff *skb, struct device *dev, case NDISC_NEIGHBOUR_ADVERTISEMENT: NDBG(("NA ")); + if ((ipv6_addr_type(saddr)&IPV6_ADDR_MULTICAST) && + msg->icmph.icmp6_solicited) { + printk(KERN_DEBUG "NDISC: solicited NA is multicasted\n"); + return 0; + } + if ((ifp = ipv6_chk_addr(&msg->target))) { + if (ifp->flags & DAD_INCOMPLETE) { + /* Address is duplicate. */ + printk(KERN_INFO "%s: duplicate address detected!\n", + ifp->idev->dev->name); + del_timer(&ifp->timer); + return 0; + } + /* What should we make now? The advertisement + is invalid, but ndisc specs say nothing + about it. 
It could be misconfiguration, or + an smart proxy agent tries to help us :-) + */ + printk(KERN_DEBUG "%s: someone avertise our address!\n", + ifp->idev->dev->name); + return 0; + } neigh_table_lock(&nd_tbl); ndn = (struct nd_neigh *) neigh_lookup(&nd_tbl, (void *) &msg->target, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 303649705..17af36fe6 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -7,7 +7,7 @@ * * Adapted from linux/net/ipv4/raw.c * - * $Id: raw.c,v 1.12 1997/04/01 02:23:34 davem Exp $ + * $Id: raw.c,v 1.13 1997/09/14 08:32:14 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -156,7 +156,7 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) /* Check if the address belongs to the host. */ if (addr_type == IPV6_ADDR_MAPPED) { v4addr = addr->sin6_addr.s6_addr32[3]; - if (__ip_chk_addr(v4addr) != IS_MYADDR) + if (inet_addr_type(v4addr) != RTN_LOCAL) return(-EADDRNOTAVAIL); } else { if (addr_type != IPV6_ADDR_ANY) { @@ -307,8 +307,9 @@ static int rawv6_frag_cksum(const void *data, struct in6_addr *addr, { struct rawv6_fakehdr *hdr = (struct rawv6_fakehdr *) data; - hdr->cksum = csum_partial_copy_fromiovecend(buff, hdr->iov, offset, - len, hdr->cksum); + if (csum_partial_copy_fromiovecend(buff, hdr->iov, offset, + len, &hdr->cksum)) + return -EFAULT; if (offset == 0) { struct sock *sk; @@ -461,28 +462,49 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len) static int rawv6_seticmpfilter(struct sock *sk, int level, int optname, char *optval, int optlen) { - struct raw6_opt *opt = &sk->tp_pinfo.tp_raw; - int err = 0; + switch (optname) { + case ICMPV6_FILTER: + if (optlen > sizeof(struct icmp6_filter)) + optlen = sizeof(struct icmp6_filter); + if (copy_from_user(&sk->tp_pinfo.tp_raw.filter, optval, optlen)) + return -EFAULT; + return 0; + default: + return -ENOPROTOOPT; + }; + + return 0; +} + +static int 
rawv6_geticmpfilter(struct sock *sk, int level, int optname, + char *optval, int *optlen) +{ + int len; switch (optname) { - case ICMPV6_FILTER: - err = copy_from_user(&opt->filter, optval, - sizeof(struct icmp6_filter)); - if (err) - err = -EFAULT; - break; - default: - err = -ENOPROTOOPT; + case ICMPV6_FILTER: + if (get_user(len, optlen)) + return -EFAULT; + if (len > sizeof(struct icmp6_filter)) + len = sizeof(struct icmp6_filter); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &sk->tp_pinfo.tp_raw.filter, len)) + return -EFAULT; + return 0; + default: + return -ENOPROTOOPT; }; - return err; + return 0; } + static int rawv6_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { struct raw6_opt *opt = &sk->tp_pinfo.tp_raw; - int val, err; + int val; switch(level) { case SOL_RAW: @@ -501,12 +523,8 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname, optlen); }; - if (optval == NULL) - return(-EINVAL); - - err = get_user(val, (int *)optval); - if(err) - return err; + if (get_user(val, (int *)optval)) + return -EFAULT; switch (optname) { case IPV6_CHECKSUM: @@ -525,6 +543,53 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname, } } +static int rawv6_getsockopt(struct sock *sk, int level, int optname, + char *optval, int *optlen) +{ + struct raw6_opt *opt = &sk->tp_pinfo.tp_raw; + int val, len; + + switch(level) { + case SOL_RAW: + break; + + case SOL_ICMPV6: + if (sk->num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + return rawv6_geticmpfilter(sk, level, optname, optval, + optlen); + case SOL_IPV6: + if (optname == IPV6_CHECKSUM) + break; + default: + return ipv6_getsockopt(sk, level, optname, optval, + optlen); + }; + + if (get_user(len,optlen)) + return -EFAULT; + + switch (optname) { + case IPV6_CHECKSUM: + if (opt->checksum == 0) + val = -1; + else + val = opt->offset; + + default: + return -ENOPROTOOPT; + } + + len=min(sizeof(int),len); + + if (put_user(len, optlen)) + return 
-EFAULT; + if (copy_to_user(optval,&val,len)) + return -EFAULT; + return 0; +} + + static void rawv6_close(struct sock *sk, unsigned long timeout) { struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; @@ -558,7 +623,7 @@ struct proto rawv6_prot = { NULL, /* destroy */ NULL, /* shutdown */ rawv6_setsockopt, /* setsockopt */ - ipv6_getsockopt, /* getsockopt - FIXME */ + rawv6_getsockopt, /* getsockopt */ rawv6_sendmsg, /* sendmsg */ rawv6_recvmsg, /* recvmsg */ rawv6_bind, /* bind */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 90a8caf09..6a412d423 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: route.c,v 1.13 1997/07/19 11:11:35 davem Exp $ + * $Id: route.c,v 1.18 1997/10/17 00:15:05 freitag Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -23,6 +23,8 @@ #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/init.h> +#include <linux/netlink.h> +#include <linux/if_arp.h> #ifdef CONFIG_PROC_FS #include <linux/proc_fs.h> @@ -34,7 +36,7 @@ #include <net/ip6_route.h> #include <net/ndisc.h> #include <net/addrconf.h> -#include <net/netlink.h> +#include <linux/netlink.h> #include <asm/uaccess.h> @@ -64,7 +66,7 @@ struct dst_ops ip6_dst_ops = { struct rt6_info ip6_null_entry = { {{NULL, ATOMIC_INIT(0), ATOMIC_INIT(0), NULL, - 0, 0, 0, 0, 0, 0, 0, 0, -ENETUNREACH, NULL, NULL, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -ENETUNREACH, NULL, NULL, ip6_pkt_discard, ip6_pkt_discard, &ip6_dst_ops}}, NULL, {{{0}}}, 256, RTF_REJECT|RTF_NONEXTHOP, ~0UL, 0, {NULL}, {{{{0}}}, 128}, {{{{0}}}, 128} @@ -297,7 +299,7 @@ struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr, rt6_lock(); fn = fib6_lookup(&ip6_routing_table, daddr, saddr); - rt = rt6_device_match(fn->leaf, dev, 0); + rt = rt6_device_match(fn->leaf, dev, flags&RTF_LINKRT); rt6_unlock(); return rt; } @@ -314,6 +316,9 @@ static struct 
rt6_info *rt6_cow(struct rt6_info *rt, struct in6_addr *daddr, if (rt) { ipv6_addr_copy(&rt->rt6i_dst.addr, daddr); + if (!(rt->rt6i_flags&RTF_GATEWAY)) + ipv6_addr_copy(&rt->rt6i_gateway, daddr); + rt->rt6i_dst.plen = 128; rt->rt6i_flags |= RTF_CACHE; @@ -322,7 +327,7 @@ static struct rt6_info *rt6_cow(struct rt6_info *rt, struct in6_addr *daddr, rt->rt6i_src.plen = 128; } - rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, daddr); + rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); rtreq_add(rt, RT_OPER_ADD); } else { @@ -556,6 +561,23 @@ struct dst_entry *ip6_dst_reroute(struct dst_entry *dst, struct sk_buff *skb) return NULL; } +/* Clean host part of a prefix. Not necessary in radix tree, + but results in cleaner routing tables. + + Remove it only when all the things will work! + */ + +static void ipv6_wash_prefix(struct in6_addr *pfx, int plen) +{ + int b = plen&0x7; + int o = (plen + 7)>>3; + + if (o < 16) + memset(pfx->s6_addr + o, 0, 16 - o); + if (b != 0) + pfx->s6_addr[plen>>3] &= (0xFF<<(8-b)); +} + /* * */ @@ -566,7 +588,11 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) struct device *dev = NULL; int addr_type; - RDBG(("ip6_route_add(%p)[%p] ", rtmsg, __builtin_return_address(0))); + if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128) { + *err = -EINVAL; + return NULL; + } + *err = 0; rt = dst_alloc(sizeof(struct rt6_info), &ip6_dst_ops); @@ -577,29 +603,6 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) goto out; } - /* - * default... this should be chosen according to route flags - */ - -#if RT6_DEBUG >= 3 - { - struct in6_addr *addr = &rtmsg->rtmsg_dst; - int i; - - RDBG(("daddr[")); - for(i = 0; i < 8; i++) { - RDBG(("%04x%c", addr->s6_addr16[i], - i == 7 ? ']' : ':')); - } - addr = &rtmsg->rtmsg_src; - RDBG(("saddr[")); - for(i = 0; i < 8; i++) { - RDBG(("%04x%c", addr->s6_addr16[i], - i == 7 ? 
']' : ':')); - } - } -#endif - addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst); if (addr_type & IPV6_ADDR_MULTICAST) { @@ -609,71 +612,58 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) RDBG(("!MCAST ")); rt->u.dst.input = ip6_forward; } - + rt->u.dst.output = dev_queue_xmit; - - if (rtmsg->rtmsg_ifindex) + + if (rtmsg->rtmsg_ifindex) { dev = dev_get_by_index(rtmsg->rtmsg_ifindex); - if(dev) - RDBG(("d[%s] ", dev->name)); + if (dev == NULL) { + *err = -ENODEV; + goto out; + } + } ipv6_addr_copy(&rt->rt6i_dst.addr, &rtmsg->rtmsg_dst); rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len; + ipv6_wash_prefix(&rt->rt6i_dst.addr, rt->rt6i_dst.plen); - /* XXX Figure out what really is supposed to be happening here -DaveM */ ipv6_addr_copy(&rt->rt6i_src.addr, &rtmsg->rtmsg_src); rt->rt6i_src.plen = rtmsg->rtmsg_src_len; - - if ((rt->rt6i_src.plen = rtmsg->rtmsg_src_len)) { - RDBG(("splen, ")); - ipv6_addr_copy(&rt->rt6i_src.addr, &rtmsg->rtmsg_src); - } else { - RDBG(("!splen, ")); - } - /* XXX */ + ipv6_wash_prefix(&rt->rt6i_src.addr, rt->rt6i_src.plen); - if (rtmsg->rtmsg_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) { - struct rt6_info *grt; + if (rtmsg->rtmsg_flags & RTF_GATEWAY) { struct in6_addr *gw_addr; - u32 flags = 0; - - RDBG(("RTF_GATEWAY, ")); - /* - * 1. gateway route lookup - * 2. ndisc_get_neigh - */ + int gwa_type; gw_addr = &rtmsg->rtmsg_gateway; - -#if RT6_DEBUG >= 3 - { - struct in6_addr *addr = gw_addr; - int i; - - RDBG(("gwaddr[")); - for(i = 0; i < 8; i++) { - RDBG(("%04x%c", addr->s6_addr16[i], - i == 7 ? ']' : ':')); + ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway); + gwa_type = ipv6_addr_type(gw_addr); + + if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { + struct rt6_info *grt; + + /* IPv6 strictly prohibits using non-link-local + addresses as the nexthop address. + That is very good, but in some (rare!) circumstances + (SIT, NBMA NOARP links) it is handy to allow + some exceptions. 
+ */ + if (!(gwa_type&IPV6_ADDR_UNICAST)) { + *err = -EINVAL; + goto out; } - } -#endif - if ((rtmsg->rtmsg_flags & RTF_GATEWAY) && - (rtmsg->rtmsg_flags & RTF_ADDRCONF) == 0) { - RDBG(("RTF_GATEWAY && !RTF_ADDRCONF, ")); - if (dev) - flags |= RTF_LINKRT; + grt = rt6_lookup(gw_addr, NULL, dev, RTF_LINKRT); - grt = rt6_lookup(gw_addr, NULL, dev, flags); - - if (grt == NULL) - { - RDBG(("!grt, ")); + if (grt == NULL || (grt->rt6i_flags&RTF_GATEWAY)) { *err = -EHOSTUNREACH; goto out; } dev = grt->rt6i_dev; - RDBG(("grt(d=%s), ", dev ? dev->name : "NULL")); + } + if (dev == NULL) { + *err = -EINVAL; + goto out; } rt->rt6i_nexthop = ndisc_get_neigh(dev, gw_addr); @@ -739,20 +729,26 @@ int ip6_route_del(struct in6_rtmsg *rtmsg) /* * Find device */ - if(rtmsg->rtmsg_ifindex) + if(rtmsg->rtmsg_ifindex) { dev=dev_get_by_index(rtmsg->rtmsg_ifindex); + if (dev == NULL) + return -ENODEV; + } /* * Find route */ - rt=rt6_lookup(&rtmsg->rtmsg_dst, &rtmsg->rtmsg_src, dev, rtmsg->rtmsg_flags); - + rt=rt6_lookup(&rtmsg->rtmsg_dst, &rtmsg->rtmsg_src, dev, dev ? 
RTF_LINKRT : 0); + /* * Blow it away */ - if(rt) + if(rt && rt->rt6i_dst.plen == rtmsg->rtmsg_dst_len && + rt->rt6i_src.plen == rtmsg->rtmsg_src_len) { ip6_del_rt(rt); + return 0; + } - return 0; + return -ESRCH; } @@ -777,6 +773,7 @@ void __rt6_run_bh(void) rt6_bh_mask = 0; } +#ifdef CONFIG_NETLINK /* * NETLINK interface * routing socket moral equivalent @@ -815,6 +812,7 @@ out: kfree_skb(skb, FREE_READ); return count; } +#endif /* CONFIG_NETLINK */ static void rt6_sndrtmsg(struct in6_rtmsg *rtmsg) { @@ -827,7 +825,9 @@ static void rt6_sndrtmsg(struct in6_rtmsg *rtmsg) memcpy(skb_put(skb, sizeof(struct in6_rtmsg)), &rtmsg, sizeof(struct in6_rtmsg)); +#ifdef CONFIG_NETLINK if (netlink_post(NETLINK_ROUTE6, skb)) +#endif kfree_skb(skb, FREE_WRITE); } @@ -867,7 +867,9 @@ void rt6_sndmsg(int type, struct in6_addr *dst, struct in6_addr *src, msg->rtmsg_flags = flags; +#ifdef CONFIG_NETLINK if (netlink_post(NETLINK_ROUTE6, skb)) +#endif kfree_skb(skb, FREE_WRITE); } @@ -878,54 +880,28 @@ struct rt6_info *rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr, struct in6_addr *target, struct device *dev, int on_link) { - struct rt6_info *rt, *tgtr, *nrt; + struct rt6_info *rt, *nrt; - RDBG(("rt6_redirect(%s)[%p]: ", - dev ? dev->name : "NULL", - __builtin_return_address(0))); + /* Locate old route to this destination. */ rt = rt6_lookup(dest, NULL, dev, 0); - if (rt == NULL || rt->u.dst.error) { - RDBG(("!rt\n")); - printk(KERN_DEBUG "rt6_redirect: no route to destination\n"); + if (rt == NULL || rt->u.dst.error) return NULL; - } - if (rt->rt6i_flags & RTF_GATEWAY) { - /* - * This can happen due to misconfiguration - * if we are dealing with an "on link" redirect. - */ - RDBG(("RTF_GATEWAY\n")); - printk(KERN_DEBUG "rt6_redirect: destination not directly " - "connected\n"); + /* Duplicate redirect: silently ignore. 
*/ + if (ipv6_addr_cmp(target, &rt->rt6i_gateway) == 0) return NULL; - } - RDBG(("tgt_lkup, ")); - tgtr = rt6_lookup(target, NULL, dev, 0); - if (tgtr == NULL || tgtr->u.dst.error) { - /* - * duh?! no route to redirect target. - * How where we talking to it in the first place ? - */ - RDBG(("!tgtr||dsterr\n")); - printk(KERN_DEBUG "rt6_redirect: no route to target\n"); + /* Current route is on-link; redirect is always invalid. */ + if (!(rt->rt6i_flags&RTF_GATEWAY)) return NULL; - } - - if ((tgtr->rt6i_flags & RTF_GATEWAY) && - ipv6_addr_cmp(dest, &tgtr->rt6i_gateway) == 0) { - RDBG(("tgt RTF_GATEWAY && dstmatch, dup\n")); - /* - * Check if we already have the right route. - */ -#if RT6_DEBUG >= 1 - printk(KERN_DEBUG "rt6_redirect: duplicate\n"); -#endif - return NULL; - } +#if !defined(CONFIG_IPV6_EUI64) || defined(CONFIG_IPV6_NO_PB) + /* + * During the transition, gateways have more than + * one link-local address. Certainly, it is a violation + * of basic principles, but it is temporary. + */ /* * RFC 1970 specifies that redirects should only be * accepted if they come from the nexthop to the target. @@ -934,62 +910,57 @@ struct rt6_info *rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr, * routers. */ - if (ipv6_addr_cmp(saddr, &tgtr->rt6i_gateway)) { - RDBG(("saddr/tgt->gway match, ")); - if (tgtr->rt6i_flags & RTF_DEFAULT) { - tgtr = ip6_routing_table.leaf; + if (ipv6_addr_cmp(saddr, &rt->rt6i_gateway)) { + if (rt->rt6i_flags & RTF_DEFAULT) { + rt = ip6_routing_table.leaf; - for (; tgtr; tgtr = tgtr->u.next) { - if (!ipv6_addr_cmp(saddr, &tgtr->rt6i_gateway)) { - RDBG(("found srcok, ")); + for (; rt; rt = rt->u.next) { + if (!ipv6_addr_cmp(saddr, &rt->rt6i_gateway)) goto source_ok; - } } } - RDBG(("!dflt||!srcok, ")); printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop " - "for redirect target\n"); + "for redirect target\n"); + return NULL; } source_ok: +#endif /* * We have finally decided to accept it. 
*/ - RDBG(("srcok: ")); - if ((tgtr->rt6i_flags & RTF_HOST)) { + if (rt->rt6i_dst.plen == 128) { /* * Already a host route. * */ - RDBG(("hralready, ")); - if (tgtr->rt6i_nexthop) { - RDBG(("nrel(nxthop) ")); - neigh_release(tgtr->rt6i_nexthop); - } + if (rt->rt6i_nexthop) + neigh_release(rt->rt6i_nexthop); /* * purge hh_cache */ - tgtr->rt6i_flags |= RTF_MODIFIED | RTF_CACHE; - ipv6_addr_copy(&tgtr->rt6i_gateway, dest); - tgtr->rt6i_nexthop = ndisc_get_neigh(tgtr->rt6i_dev, dest); - RDBG(("hhpurge, getnewneigh, ret(%p)\n", tgtr)); - return tgtr; + rt->rt6i_flags |= RTF_MODIFIED | RTF_CACHE; + if (on_link) + rt->rt6i_flags &= ~RTF_GATEWAY; + ipv6_addr_copy(&rt->rt6i_gateway, target); + rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, target); + return rt; } - nrt = ip6_rt_copy(tgtr); - nrt->rt6i_flags = RTF_GATEWAY|RTF_HOST|RTF_UP|RTF_DYNAMIC|RTF_CACHE; + nrt = ip6_rt_copy(rt); + nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; + if (on_link) + nrt->rt6i_flags &= ~RTF_GATEWAY; - ipv6_addr_copy(&nrt->rt6i_dst.addr, target); + ipv6_addr_copy(&nrt->rt6i_dst.addr, dest); nrt->rt6i_dst.plen = 128; - ipv6_addr_copy(&nrt->rt6i_gateway, dest); - nrt->rt6i_nexthop = ndisc_get_neigh(nrt->rt6i_dev, dest); + ipv6_addr_copy(&nrt->rt6i_gateway, target); + nrt->rt6i_nexthop = ndisc_get_neigh(nrt->rt6i_dev, target); nrt->rt6i_dev = dev; nrt->u.dst.pmtu = dev->mtu; - RDBG(("rt6_ins(%p)\n", nrt)); - rt6_lock(); rt6_ins(nrt); rt6_unlock(); @@ -1023,7 +994,15 @@ void rt6_pmtu_discovery(struct in6_addr *addr, struct device *dev, int pmtu) return; } - if (rt->rt6i_flags & RTF_HOST) { + /* It is wrong, but I plugged the hole here. 
+ On-link routes are cloned differently, + look at rt6_redirect --ANK + */ + if (!(rt->rt6i_flags&RTF_GATEWAY)) { + return; + } + + if (rt->rt6i_dst.plen == 128) { /* * host route */ @@ -1037,7 +1016,7 @@ void rt6_pmtu_discovery(struct in6_addr *addr, struct device *dev, int pmtu) ipv6_addr_copy(&rt->rt6i_dst.addr, addr); rt->rt6i_dst.plen = 128; - rt->rt6i_flags |= (RTF_HOST | RTF_DYNAMIC | RTF_CACHE); + rt->rt6i_flags |= (RTF_DYNAMIC | RTF_CACHE); rt6_lock(); rt6_ins(rt); @@ -1065,7 +1044,7 @@ struct rt6_info * ip6_rt_copy(struct rt6_info *ort) rt->rt6i_keylen = ort->rt6i_keylen; rt->rt6i_flags = ort->rt6i_flags; rt->rt6i_metric = ort->rt6i_metric; - + memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); } @@ -1257,7 +1236,7 @@ int ip6_rt_addr_add(struct in6_addr *addr, struct device *dev) rt->rt6i_dev = dev_get("lo"); rt->u.dst.pmtu = rt->rt6i_dev->mtu; - rt->rt6i_flags = RTF_HOST | RTF_LOCAL | RTF_UP | RTF_NONEXTHOP; + rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; ipv6_addr_copy(&rt->rt6i_dst.addr, addr); rt->rt6i_dst.plen = 128; @@ -1600,7 +1579,9 @@ __initfunc(void ip6_route_init(void)) proc_net_register(&proc_rt6_stats); proc_net_register(&proc_rt6_tree); #endif +#ifdef CONFIG_NETLINK netlink_attach(NETLINK_ROUTE6, rt6_msgrcv); +#endif } #ifdef MODULE @@ -1611,7 +1592,9 @@ void ip6_route_cleanup(void) proc_net_unregister(PROC_NET_RT6_TREE); proc_net_unregister(PROC_NET_RT6_STATS); #endif +#ifdef CONFIG_NETLINK netlink_detach(NETLINK_ROUTE6); +#endif #if 0 fib6_flush(); #endif diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index d818bc777..4ff6e28d8 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -4,8 +4,9 @@ * * Authors: * Pedro Roque <roque@di.fc.ul.pt> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * - * $Id: sit.c,v 1.14 1997/04/29 09:38:52 mj Exp $ + * $Id: sit.c,v 1.23 1997/11/08 18:15:49 kuznet Exp $ * * This program is free software; you can redistribute it and/or * modify it under the 
terms of the GNU General Public License @@ -13,6 +14,9 @@ * 2 of the License, or (at your option) any later version. */ +#include <linux/config.h> +#define __NO_VERSION__ +#include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> @@ -23,6 +27,7 @@ #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/icmp.h> +#include <asm/uaccess.h> #include <linux/init.h> #include <net/sock.h> @@ -31,385 +36,363 @@ #include <net/ipv6.h> #include <net/protocol.h> #include <net/transp_v6.h> +#include <net/ip6_fib.h> +#include <net/ip6_route.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/ip.h> #include <net/udp.h> -#include <net/sit.h> +#include <net/icmp.h> +#include <net/ipip.h> +/* + This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c -static int sit_init_dev(struct device *dev); - -static struct device sit_device = { - "sit0", - 0, 0, 0, 0, - 0x0, 0, - 0, 0, 0, NULL, sit_init_dev -}; - -static unsigned long sit_gc_last_run; -static void sit_mtu_cache_gc(void); - -static int sit_xmit(struct sk_buff *skb, - struct device *dev); -static int sit_rcv(struct sk_buff *skb, unsigned short len); -static void sit_err(struct sk_buff *skb, unsigned char *dp); - -static int sit_open(struct device *dev); -static int sit_close(struct device *dev); + For comments look at net/ipv4/ip_gre.c --ANK + */ -static struct net_device_stats *sit_get_stats(struct device *dev); +#define HASH_SIZE 16 +#define HASH(addr) ((addr^(addr>>4))&0xF) -extern void udp_err(struct sk_buff *, unsigned char *); +static int ipip6_fb_tunnel_init(struct device *dev); +static int ipip6_tunnel_init(struct device *dev); -static struct inet_protocol sit_protocol = { - sit_rcv, - sit_err, - 0, - IPPROTO_IPV6, - 0, - NULL, - "IPv6" +static struct device ipip6_fb_tunnel_dev = { + NULL, 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipip6_fb_tunnel_init, }; -#define SIT_NUM_BUCKETS 16 - -struct sit_mtu_info *sit_mtu_cache[SIT_NUM_BUCKETS]; - 
-static int vif_num = 0; -static struct sit_vif *vif_list = NULL; +static struct ip_tunnel ipip6_fb_tunnel = { + NULL, &ipip6_fb_tunnel_dev, {0, }, 0, 0, 0, 0, 0, 0, 0, {"sit0", } +}; -static __inline__ __u32 sit_addr_hash(__u32 addr) -{ - - __u32 hash_val; - - hash_val = addr; - - hash_val ^= hash_val >> 16; - hash_val ^= hash_val >> 8; - - return (hash_val & (SIT_NUM_BUCKETS - 1)); -} +static struct ip_tunnel *tunnels_r_l[HASH_SIZE]; +static struct ip_tunnel *tunnels_r[HASH_SIZE]; +static struct ip_tunnel *tunnels_l[HASH_SIZE]; +static struct ip_tunnel *tunnels_wc[1]; +static struct ip_tunnel **tunnels[4] = { tunnels_wc, tunnels_l, tunnels_r, tunnels_r_l }; -static void sit_cache_insert(__u32 addr, int mtu) +static struct ip_tunnel * ipip6_tunnel_lookup(u32 remote, u32 local) { - struct sit_mtu_info *minfo; - int hash; - - minfo = kmalloc(sizeof(struct sit_mtu_info), GFP_ATOMIC); - - if (minfo == NULL) - return; - - minfo->addr = addr; - minfo->tstamp = jiffies; - minfo->mtu = mtu; - - hash = sit_addr_hash(addr); - - minfo->next = sit_mtu_cache[hash]; - sit_mtu_cache[hash] = minfo; + unsigned h0 = HASH(remote); + unsigned h1 = HASH(local); + struct ip_tunnel *t; + + for (t = tunnels_r_l[h0^h1]; t; t = t->next) { + if (local == t->parms.iph.saddr && + remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + return t; + } + for (t = tunnels_r[h0]; t; t = t->next) { + if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + return t; + } + for (t = tunnels_l[h1]; t; t = t->next) { + if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) + return t; + } + if ((t = tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP)) + return t; + return NULL; } -static struct sit_mtu_info * sit_mtu_lookup(__u32 addr) +struct ip_tunnel * ipip6_tunnel_locate(struct ip_tunnel_parm *parms, int create) { - struct sit_mtu_info *iter; - int hash; - - hash = sit_addr_hash(addr); + u32 remote = parms->iph.daddr; + u32 local = parms->iph.saddr; + struct ip_tunnel *t, **tp, *nt; + struct 
device *dev; + unsigned h = 0; + int prio = 0; - for(iter = sit_mtu_cache[hash]; iter; iter=iter->next) { - if (iter->addr == addr) { - iter->tstamp = jiffies; - break; - } + if (remote) { + prio |= 2; + h ^= HASH(remote); } - - /* - * run garbage collector - */ - - if (jiffies - sit_gc_last_run > SIT_GC_FREQUENCY) { - sit_mtu_cache_gc(); - sit_gc_last_run = jiffies; + if (local) { + prio |= 1; + h ^= HASH(local); } + for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) { + if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) + return t; + } + if (!create) + return NULL; - return iter; -} - -static void sit_mtu_cache_gc(void) -{ - struct sit_mtu_info *iter, *back; - unsigned long now = jiffies; - int i; - - for (i=0; i < SIT_NUM_BUCKETS; i++) { - back = NULL; - for (iter = sit_mtu_cache[i]; iter;) { - if (now - iter->tstamp > SIT_GC_TIMEOUT) { - struct sit_mtu_info *old; - - old = iter; - iter = iter->next; - - if (back) - back->next = iter; - else - sit_mtu_cache[i] = iter; - - kfree(old); - continue; - } - back = iter; - iter = iter->next; + MOD_INC_USE_COUNT; + dev = kmalloc(sizeof(*dev) + sizeof(*t), GFP_KERNEL); + if (dev == NULL) { + MOD_DEC_USE_COUNT; + return NULL; + } + memset(dev, 0, sizeof(*dev) + sizeof(*t)); + dev->priv = (void*)(dev+1); + nt = (struct ip_tunnel*)dev->priv; + nt->dev = dev; + dev->name = nt->parms.name; + dev->init = ipip6_tunnel_init; + memcpy(&nt->parms, parms, sizeof(*parms)); + if (dev->name[0] == 0) { + int i; + for (i=1; i<100; i++) { + sprintf(dev->name, "sit%d", i); + if (dev_get(dev->name) == NULL) + break; } + if (i==100) + goto failed; + memcpy(parms->name, dev->name, IFNAMSIZ); } + if (register_netdevice(dev) < 0) + goto failed; + + start_bh_atomic(); + nt->next = t; + *tp = nt; + end_bh_atomic(); + /* Do not decrement MOD_USE_COUNT here. 
*/ + return nt; + +failed: + kfree(dev); + MOD_DEC_USE_COUNT; + return NULL; } -static int sit_init_dev(struct device *dev) -{ - int i; - - dev->open = sit_open; - dev->stop = sit_close; - - dev->hard_start_xmit = sit_xmit; - dev->get_stats = sit_get_stats; - - dev->priv = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL); - - if (dev->priv == NULL) - return -ENOMEM; - - memset(dev->priv, 0, sizeof(struct net_device_stats)); - - - for (i = 0; i < DEV_NUMBUFFS; i++) - skb_queue_head_init(&dev->buffs[i]); - - dev->hard_header = NULL; - dev->rebuild_header = NULL; - dev->set_mac_address = NULL; - dev->hard_header_cache = NULL; - dev->header_cache_update= NULL; - - dev->type = ARPHRD_SIT; - - dev->hard_header_len = MAX_HEADER; - dev->mtu = 1500 - sizeof(struct iphdr); - dev->addr_len = 0; - dev->tx_queue_len = 0; - - memset(dev->broadcast, 0, MAX_ADDR_LEN); - memset(dev->dev_addr, 0, MAX_ADDR_LEN); - - dev->flags = IFF_NOARP; - - dev->family = AF_INET6; - dev->pa_addr = 0; - dev->pa_brdaddr = 0; - dev->pa_dstaddr = 0; - dev->pa_mask = 0; - dev->pa_alen = 4; - - return 0; -} - -static int sit_init_vif(struct device *dev) +static void ipip6_tunnel_destroy(struct device *dev) { - int i; - - dev->flags = IFF_NOARP|IFF_POINTOPOINT|IFF_MULTICAST; - dev->priv = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL); - - if (dev->priv == NULL) - return -ENOMEM; - - memset(dev->priv, 0, sizeof(struct net_device_stats)); - - for (i = 0; i < DEV_NUMBUFFS; i++) - skb_queue_head_init(&dev->buffs[i]); - - return 0; -} - -static int sit_open(struct device *dev) -{ - return 0; -} + struct ip_tunnel *t, **tp; + struct ip_tunnel *t0 = (struct ip_tunnel*)dev->priv; + u32 remote = t0->parms.iph.daddr; + u32 local = t0->parms.iph.saddr; + unsigned h = 0; + int prio = 0; + + if (dev == &ipip6_fb_tunnel_dev) { + tunnels_wc[0] = NULL; + return; + } -static int sit_close(struct device *dev) -{ - return 0; + if (remote) { + prio |= 2; + h ^= HASH(remote); + } + if (local) { + prio |= 1; + h ^= 
HASH(local); + } + for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) { + if (t == t0) { + *tp = t->next; + kfree(dev); + MOD_DEC_USE_COUNT; + break; + } + } } -__initfunc(int sit_init(void)) -{ - int i; - - /* register device */ - - if (register_netdev(&sit_device) != 0) - return -EIO; - - inet_add_protocol(&sit_protocol); - - for (i=0; i < SIT_NUM_BUCKETS; i++) - sit_mtu_cache[i] = NULL; - - sit_gc_last_run = jiffies; - return 0; -} - -struct device *sit_add_tunnel(__u32 dstaddr) +void ipip6_err(struct sk_buff *skb, unsigned char *dp, int len) { - struct sit_vif *vif; - struct device *dev; +#ifndef I_WISH_WORLD_WERE_PERFECT - if ((sit_device.flags & IFF_UP) == 0) - return NULL; - - vif = kmalloc(sizeof(struct sit_vif), GFP_KERNEL); - if (vif == NULL) - return NULL; - - /* - * Create PtoP configured tunnel - */ - - dev = kmalloc(sizeof(struct device), GFP_KERNEL); - if (dev == NULL) - return NULL; - - memcpy(dev, &sit_device, sizeof(struct device)); - dev->init = sit_init_vif; - dev->pa_dstaddr = dstaddr; - - dev->name = vif->name; - sprintf(vif->name, "sit%d", ++vif_num); - - register_netdev(dev); +/* It is not :-( All the routers (except for Linux) return only + 8 bytes of packet payload. It means, that precise relaying of + ICMP in the real Internet is absolutely infeasible. + */ + struct iphdr *iph = (struct iphdr*)dp; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + struct ip_tunnel *t; - vif->dev = dev; - vif->next = vif_list; - vif_list = vif; + if (len < sizeof(struct iphdr)) + return; - return dev; -} + switch (type) { + default: + case ICMP_PARAMETERPROB: + return; -void sit_cleanup(void) -{ - struct sit_vif *vif; - - for (vif = vif_list; vif;) { - struct device *dev = vif->dev; - struct sit_vif *cur; - - unregister_netdev(dev); - kfree(dev->priv); - kfree(dev); - - cur = vif; - vif = vif->next; + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. 
*/ + return; + case ICMP_FRAG_NEEDED: + /* Soft state for pmtu is maintained by IP core. */ + return; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe they are just ether pollution. --ANK + */ + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + break; } - vif_list = NULL; - - unregister_netdev(&sit_device); - inet_del_protocol(&sit_protocol); - -} - -/* - * receive IPv4 ICMP messages - */ + t = ipip6_tunnel_lookup(iph->daddr, iph->saddr); + if (t == NULL || t->parms.iph.daddr == 0) + return; + if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) + return; -static void sit_err(struct sk_buff *skb, unsigned char *dp) -{ + if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) + t->err_count++; + else + t->err_count = 1; + t->err_time = jiffies; + return; +#else struct iphdr *iph = (struct iphdr*)dp; + int hlen = iph->ihl<<2; + struct ipv6hdr *iph6; int type = skb->h.icmph->type; int code = skb->h.icmph->code; + int rel_type = 0; + int rel_code = 0; + int rel_info = 0; + struct sk_buff *skb2; + struct rt6_info *rt6i; - if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { - struct sit_mtu_info *minfo; - unsigned short info = skb->h.icmph->un.frag.mtu - sizeof(struct iphdr); - - minfo = sit_mtu_lookup(iph->daddr); - - printk(KERN_DEBUG "sit: %08lx pmtu = %ul\n", ntohl(iph->saddr), - info); - - if (minfo == NULL) { - minfo = kmalloc(sizeof(struct sit_mtu_info), - GFP_ATOMIC); + if (len < hlen + sizeof(struct ipv6hdr)) + return; + iph6 = (struct ipv6hdr*)(dp + hlen); - if (minfo == NULL) - return; + switch (type) { + default: + return; + case ICMP_PARAMETERPROB: + if (skb->h.icmph->un.gateway < hlen) + return; + + /* So... This guy found something strange INSIDE encapsulated + packet. Well, he is fool, but what can we do ? 
+ */ + rel_type = ICMPV6_PARAMPROB; + rel_info = skb->h.icmph->un.gateway - hlen; + break; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* Too complicated case ... */ + return; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe, it is just ether pollution. --ANK + */ + rel_type = ICMPV6_DEST_UNREACH; + rel_code = ICMPV6_ADDR_UNREACH; + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + rel_type = ICMPV6_TIME_EXCEED; + rel_code = ICMPV6_EXC_HOPLIMIT; + break; + } - start_bh_atomic(); - sit_cache_insert(iph->daddr, info); - end_bh_atomic(); - } else { - minfo->mtu = info; + /* Prepare fake skb to feed it to icmpv6_send */ + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2 == NULL) + return; + dst_release(skb2->dst); + skb2->dst = NULL; + skb_pull(skb2, skb->data - (u8*)iph6); + skb2->nh.raw = skb2->data; + + /* Try to guess incoming interface */ + rt6i = rt6_lookup(&iph6->saddr, NULL, NULL, 0); + if (rt6i && rt6i->rt6i_dev) { + skb2->dev = rt6i->rt6i_dev; + + rt6i = rt6_lookup(&iph6->daddr, &iph6->saddr, NULL, 0); + + if (rt6i && rt6i->rt6i_dev && rt6i->rt6i_dev->type == ARPHRD_SIT) { + struct ip_tunnel * t = (struct ip_tunnel*)rt6i->rt6i_dev->priv; + if (rel_type == ICMPV6_TIME_EXCEED && t->parms.iph.ttl) { + rel_type = ICMPV6_DEST_UNREACH; + rel_code = ICMPV6_ADDR_UNREACH; + } + icmpv6_send(skb2, rel_type, rel_code, rel_info, skb2->dev); } } + kfree_skb(skb2, FREE_WRITE); + return; +#endif } -static int sit_rcv(struct sk_buff *skb, unsigned short len) +int ipip6_rcv(struct sk_buff *skb, unsigned short len) { - struct net_device_stats *stats; - struct device *dev = NULL; - struct sit_vif *vif; - __u32 saddr = skb->nh.iph->saddr; - - skb->h.raw = skb->nh.raw = skb_pull(skb, skb->h.raw - skb->data); - - skb->protocol = __constant_htons(ETH_P_IPV6); - - 
for (vif = vif_list; vif; vif = vif->next) { - if (saddr == vif->dev->pa_dstaddr) { - dev = vif->dev; - break; - } + struct iphdr *iph; + struct ip_tunnel *tunnel; + + iph = skb->nh.iph; + + if ((tunnel = ipip6_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) { + skb->mac.raw = skb->nh.raw; + skb->nh.raw = skb_pull(skb, skb->h.raw - skb->data); + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->protocol = __constant_htons(ETH_P_IPV6); + skb->ip_summed = 0; + skb->pkt_type = PACKET_HOST; + tunnel->stat.rx_packets++; + tunnel->stat.rx_bytes += skb->len; + skb->dev = tunnel->dev; + dst_release(skb->dst); + skb->dst = NULL; + netif_rx(skb); + return 0; } - if (dev == NULL) - dev = &sit_device; - - skb->dev = dev; - skb->ip_summed = CHECKSUM_NONE; - - stats = (struct net_device_stats *)dev->priv; - stats->rx_bytes += len; - stats->rx_packets++; - - ipv6_rcv(skb, dev, NULL); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); + kfree_skb(skb, FREE_READ); return 0; } -static int sit_xmit(struct sk_buff *skb, struct device *dev) +/* + * This function assumes it is being called from dev_queue_xmit() + * and that skb is filled properly by that function. 
+ */ + +static int ipip6_tunnel_xmit(struct sk_buff *skb, struct device *dev) { - struct net_device_stats *stats; - struct sit_mtu_info *minfo; + struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + struct net_device_stats *stats = &tunnel->stat; + struct iphdr *tiph = &tunnel->parms.iph; + struct ipv6hdr *iph6 = skb->nh.ipv6h; + u8 tos = tunnel->parms.iph.tos; + struct rtable *rt; /* Route to the other host */ + struct device *tdev; /* Device to other host */ + struct iphdr *iph; /* Our new IP header */ + int max_headroom; /* The extra header space needed */ + u32 dst = tiph->daddr; + int mtu; struct in6_addr *addr6; - struct rtable *rt; - struct iphdr *iph; - __u32 saddr; - __u32 daddr; int addr_type; - int mtu; - int headroom; - /* - * Make sure we are not busy (check lock variable) - */ + if (tunnel->recursion++) { + tunnel->stat.collisions++; + goto tx_error; + } - stats = (struct net_device_stats *)dev->priv; + if (skb->protocol != __constant_htons(ETH_P_IPV6)) + goto tx_error; - daddr = dev->pa_dstaddr; - if (daddr == 0) { + if (!dst) { struct nd_neigh *neigh = NULL; if (skb->dst) @@ -417,9 +400,9 @@ static int sit_xmit(struct sk_buff *skb, struct device *dev) if (neigh == NULL) { printk(KERN_DEBUG "sit: nexthop == NULL\n"); - goto on_error; + goto tx_error; } - + addr6 = &neigh->ndn_addr; addr_type = ipv6_addr_type(addr6); @@ -428,88 +411,329 @@ static int sit_xmit(struct sk_buff *skb, struct device *dev) addr_type = ipv6_addr_type(addr6); } - if ((addr_type & IPV6_ADDR_COMPATv4) == 0) { - printk(KERN_DEBUG "sit_xmit: non v4 address\n"); - goto on_error; - } - daddr = addr6->s6_addr32[3]; - } + if ((addr_type & IPV6_ADDR_COMPATv4) == 0) + goto tx_error_icmp; - if (ip_route_output(&rt, daddr, 0, 0, NULL)) { - printk(KERN_DEBUG "sit: no route to host\n"); - goto on_error; + dst = addr6->s6_addr32[3]; } - minfo = sit_mtu_lookup(daddr); + if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link)) { + tunnel->stat.tx_carrier_errors++; + 
goto tx_error_icmp; + } + tdev = rt->u.dst.dev; - /* IP should calculate pmtu correctly, - * let's check it... - */ -#if 0 - if (minfo) - mtu = minfo->mtu; - else -#endif - mtu = rt->u.dst.pmtu; + if (tdev == dev) { + ip_rt_put(rt); + tunnel->stat.collisions++; + goto tx_error; + } - if (mtu > 576 && skb->tail - (skb->data + sizeof(struct ipv6hdr)) > mtu) { - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); + mtu = rt->u.dst.pmtu - sizeof(struct iphdr); + if (mtu < 68) { + tunnel->stat.collisions++; ip_rt_put(rt); - goto on_error; + goto tx_error; + } + if (mtu >= 576) { + if (skb->dst && mtu < skb->dst->pmtu) { + struct rt6_info *rt6 = (struct rt6_info*)skb->dst; + if (mtu < rt6->u.dst.pmtu) { + if (tunnel->parms.iph.daddr || rt6->rt6i_dst.plen == 128) { + rt6->rt6i_flags |= RTF_MODIFIED; + rt6->u.dst.pmtu = mtu; + } + } + } + if (skb->len > mtu) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); + ip_rt_put(rt); + goto tx_error; + } } - headroom = ((rt->u.dst.dev->hard_header_len+15)&~15)+sizeof(struct iphdr); + if (tunnel->err_count > 0) { + if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { + tunnel->err_count--; + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev); + } else + tunnel->err_count = 0; + } - if (skb_headroom(skb) < headroom || skb_shared(skb)) { - struct sk_buff *new_skb = skb_realloc_headroom(skb, headroom); + skb->h.raw = skb->nh.raw; + + /* + * Okay, now see if we can stuff it in the buffer as-is. 
+ */ + max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr)); + + if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); - goto on_error; + stats->tx_dropped++; + dev_kfree_skb(skb, FREE_WRITE); + tunnel->recursion--; + return 0; } dev_kfree_skb(skb, FREE_WRITE); skb = new_skb; } - - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - - iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr)); - skb->nh.iph = iph; - saddr = rt->rt_src; + skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); dst_release(skb->dst); skb->dst = &rt->u.dst; - iph->version = 4; - iph->ihl = 5; - iph->tos = 0; /* tos set to 0... */ + /* + * Push down and install the IPIP header. + */ + iph = skb->nh.iph; + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; if (mtu > 576) - iph->frag_off = htons(IP_DF); + iph->frag_off = __constant_htons(IP_DF); else - iph->frag_off = 0; - - iph->ttl = 64; - iph->saddr = saddr; - iph->daddr = daddr; - iph->protocol = IPPROTO_IPV6; - iph->tot_len = htons(skb->len); - iph->id = htons(ip_id_count++); - ip_send_check(iph); + iph->frag_off = 0; - ip_send(skb); + iph->protocol = IPPROTO_IPV6; + iph->tos = tos; + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + + if ((iph->ttl = tiph->ttl) == 0) + iph->ttl = iph6->hop_limit; + + iph->tot_len = htons(skb->len); + iph->id = htons(ip_id_count++); + ip_send_check(iph); stats->tx_bytes += skb->len; stats->tx_packets++; + ip_send(skb); + tunnel->recursion--; return 0; -on_error: - dev_kfree_skb(skb, FREE_WRITE); +tx_error_icmp: + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, dev); +tx_error: stats->tx_errors++; - return 0; + dev_kfree_skb(skb, FREE_WRITE); + tunnel->recursion--; + return 0; +} + +static int +ipip6_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + 
struct ip_tunnel_parm p; + struct ip_tunnel *t; + + MOD_INC_USE_COUNT; + + switch (cmd) { + case SIOCGETTUNNEL: + t = NULL; + if (dev == &ipip6_fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + err = -EFAULT; + break; + } + t = ipip6_tunnel_locate(&p, 0); + } + if (t == NULL) + t = (struct ip_tunnel*)dev->priv; + memcpy(&p, &t->parms, sizeof(p)); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + break; + + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + + err = -EINVAL; + if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPV6 || + p.iph.ihl != 5 || (p.iph.frag_off&__constant_htons(~IP_DF))) + goto done; + if (p.iph.ttl) + p.iph.frag_off |= __constant_htons(IP_DF); + + t = ipip6_tunnel_locate(&p, cmd == SIOCADDTUNNEL); + + if (t) { + err = 0; + if (cmd == SIOCCHGTUNNEL) { + t->parms.iph.ttl = p.iph.ttl; + t->parms.iph.tos = p.iph.tos; + } + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) + err = -EFAULT; + } else + err = (cmd == SIOCADDTUNNEL ? 
-ENOBUFS : -ENOENT); + break; + + case SIOCDELTUNNEL: + if (dev == &ipip6_fb_tunnel_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + err = -ENOENT; + if ((t = ipip6_tunnel_locate(&p, 0)) == NULL) + goto done; + err = -EPERM; + if (t == &ipip6_fb_tunnel) + goto done; + } + err = unregister_netdevice(dev); + break; + + default: + err = -EINVAL; + } + +done: + MOD_DEC_USE_COUNT; + return err; +} + +static struct net_device_stats *ipip6_tunnel_get_stats(struct device *dev) +{ + return &(((struct ip_tunnel*)dev->priv)->stat); } -static struct net_device_stats *sit_get_stats(struct device *dev) +static int ipip6_tunnel_change_mtu(struct device *dev, int new_mtu) { - return((struct net_device_stats *) dev->priv); + if (new_mtu < 576 || new_mtu > 0xFFF8 - sizeof(struct iphdr)) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +static void ipip6_tunnel_init_gen(struct device *dev) +{ + struct ip_tunnel *t = (struct ip_tunnel*)dev->priv; + + dev->destructor = ipip6_tunnel_destroy; + dev->hard_start_xmit = ipip6_tunnel_xmit; + dev->get_stats = ipip6_tunnel_get_stats; + dev->do_ioctl = ipip6_tunnel_ioctl; + dev->change_mtu = ipip6_tunnel_change_mtu; + + dev_init_buffers(dev); + + dev->type = ARPHRD_SIT; + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); + dev->mtu = 1500 - sizeof(struct iphdr); + dev->flags = IFF_NOARP; + dev->iflink = 0; + dev->addr_len = 4; + memcpy(dev->dev_addr, &t->parms.iph.saddr, 4); + memcpy(dev->broadcast, &t->parms.iph.daddr, 4); +} + +static int ipip6_tunnel_init(struct device *dev) +{ + struct device *tdev = NULL; + struct ip_tunnel *tunnel; + struct iphdr *iph; + + tunnel = (struct ip_tunnel*)dev->priv; + iph = &tunnel->parms.iph; + + ipip6_tunnel_init_gen(dev); + + if (iph->daddr) { + struct rtable *rt; + if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link)) { + tdev = rt->u.dst.dev; + ip_rt_put(rt); + } + dev->flags |= IFF_POINTOPOINT; + } + 
+ if (!tdev && tunnel->parms.link) + tdev = dev_get_by_index(tunnel->parms.link); + + if (tdev) { + dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); + dev->mtu = tdev->mtu - sizeof(struct iphdr); + if (dev->mtu < 576) + dev->mtu = 576; + } + dev->iflink = tunnel->parms.link; + + return 0; +} + +#ifdef MODULE +static int ipip6_fb_tunnel_open(struct device *dev) +{ + MOD_INC_USE_COUNT; + return 0; +} + +static int ipip6_fb_tunnel_close(struct device *dev) +{ + MOD_DEC_USE_COUNT; + return 0; +} +#endif + +__initfunc(int ipip6_fb_tunnel_init(struct device *dev)) +{ + struct iphdr *iph; + + ipip6_tunnel_init_gen(dev); +#ifdef MODULE + dev->open = ipip6_fb_tunnel_open; + dev->stop = ipip6_fb_tunnel_close; +#endif + + iph = &ipip6_fb_tunnel.parms.iph; + iph->version = 4; + iph->protocol = IPPROTO_IPV6; + iph->ihl = 5; + iph->ttl = 64; + + tunnels_wc[0] = &ipip6_fb_tunnel; + return 0; +} + +static struct inet_protocol sit_protocol = { + ipip6_rcv, + ipip6_err, + 0, + IPPROTO_IPV6, + 0, + NULL, + "IPv6" +}; + +#ifdef MODULE +void sit_cleanup(void) +{ + inet_del_protocol(&sit_protocol); + unregister_netdevice(&ipip6_fb_tunnel_dev); +} +#endif + +__initfunc(int sit_init(void)) +{ + printk(KERN_INFO "IPv6 over IPv4 tunneling driver\n"); + + ipip6_fb_tunnel_dev.priv = (void*)&ipip6_fb_tunnel; + ipip6_fb_tunnel_dev.name = ipip6_fb_tunnel.parms.name; +#ifdef MODULE + register_netdev(&ipip6_fb_tunnel_dev); +#else + register_netdevice(&ipip6_fb_tunnel_dev); +#endif + inet_add_protocol(&sit_protocol); + return 0; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7fba7c526..b6559565b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: tcp_ipv6.c,v 1.37 1997/08/22 19:15:40 freitag Exp $ + * $Id: tcp_ipv6.c,v 1.43 1997/10/30 23:52:34 davem Exp $ * * Based on: * linux/net/ipv4/tcp.c @@ -42,22 +42,23 @@ #include <asm/uaccess.h> +#define ICMP_PARANOIA + extern int 
sysctl_tcp_sack; extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; -static void tcp_v6_send_reset(struct in6_addr *saddr, - struct in6_addr *daddr, - struct tcphdr *th, struct proto *prot, - struct ipv6_options *opt, - struct device *dev, int pri, int hop_limit); - +static void tcp_v6_send_reset(struct sk_buff *skb); static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, struct sk_buff *skb); static int tcp_v6_backlog_rcv(struct sock *sk, struct sk_buff *skb); static int tcp_v6_build_header(struct sock *sk, struct sk_buff *skb); static void tcp_v6_xmit(struct sk_buff *skb); +static struct open_request *tcp_v6_search_req(struct tcp_opt *tp, + struct ipv6hdr *ip6h, + struct tcphdr *th, + struct open_request **prevp); static struct tcp_func ipv6_mapped; static struct tcp_func ipv6_specific; @@ -536,7 +537,6 @@ out: return retval; } -/* XXX: this functions needs to be updated like tcp_v4_err. */ void tcp_v6_err(int type, int code, unsigned char *header, __u32 info, struct in6_addr *saddr, struct in6_addr *daddr, struct inet6_protocol *protocol) @@ -546,14 +546,34 @@ void tcp_v6_err(int type, int code, unsigned char *header, __u32 info, struct sock *sk; int err; int opening; + struct tcp_opt *tp; +#ifdef ICMP_PARANOIA + __u32 seq; +#endif + + /* XXX: length check for tcphdr missing here */ sk = tcp_v6_lookup(daddr, th->dest, saddr, th->source); - if (sk == NULL) + if (sk == NULL) { + /* XXX: Update ICMP error count */ return; + } + + tp = &sk->tp_pinfo.af_tcp; +#ifdef ICMP_PARANOIA + seq = ntohl(th->seq); + if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) { + if (net_ratelimit()) + printk(KERN_DEBUG "icmp packet outside the tcp window:" + " s:%d %u,%u,%u\n", + (int)sk->state, seq, tp->snd_una, tp->snd_nxt); + return; + } +#endif - np = &sk->net_pinfo.af_inet6; + np = &sk->net_pinfo.af_inet6; if (type == ICMPV6_PKT_TOOBIG && sk->state != TCP_LISTEN) { /* icmp should have updated the destination cache entry 
*/ @@ -580,12 +600,52 @@ void tcp_v6_err(int type, int code, unsigned char *header, __u32 info, else sk->mtu = np->dst->pmtu; - release_sock(sk); + if (sk->sock_readers) { /* remove later */ + printk(KERN_DEBUG "tcp_v6_err: pmtu disc: socket locked.\n"); + return; + } + tcp_simple_retransmit(sk); return; } - /* FIXME: This is wrong. Need to check for open_requests here. */ - opening = (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV); + opening = 0; + /* Might be for an open_request */ + switch (sk->state) { + struct open_request *req, *prev; + struct ipv6hdr hd; + case TCP_LISTEN: + if (sk->sock_readers) + return; + + /* Grrrr - fix this later. */ + ipv6_addr_copy(&hd.saddr, saddr); + ipv6_addr_copy(&hd.daddr, daddr); + req = tcp_v6_search_req(tp, &hd,th, &prev); + if (!req) + return; +#ifdef ICMP_PARANOIA + if (seq != req->snt_isn) { + if (net_ratelimit()) + printk(KERN_DEBUG "icmp packet for openreq " + "with wrong seq number:%d:%d\n", + seq, req->snt_isn); + return; + } +#endif + if (req->sk) { + sk = req->sk; /* report error in accept */ + } else { + tcp_synq_unlink(tp, req, prev); + req->class->destructor(req); + tcp_openreq_free(req); + } + /* FALL THROUGH */ + case TCP_SYN_SENT: + case TCP_SYN_RECV: + opening = 1; + break; + } + if (icmpv6_err_convert(type, code, &err) || opening) { sk->err = err; @@ -692,7 +752,8 @@ static void tcp_v6_or_free(struct open_request *req) static struct or_calltable or_ipv6 = { tcp_v6_send_synack, - tcp_v6_or_free + tcp_v6_or_free, + tcp_v6_send_reset }; /* FIXME: this is substantially similar to the ipv4 code. 
@@ -864,8 +925,6 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, atomic_set(&newsk->rmem_alloc, 0); newsk->localroute = sk->localroute; - newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF; - newsk->err = 0; newsk->shutdown = 0; newsk->ack_backlog = 0; @@ -957,17 +1016,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, return newsk; } -static void tcp_v6_reply_reset(struct sk_buff *skb) -{ -} - -static void tcp_v6_send_reset(struct in6_addr *saddr, struct in6_addr *daddr, - struct tcphdr *th, struct proto *prot, - struct ipv6_options *opt, - struct device *dev, int pri, int hop_limit) +static void tcp_v6_send_reset(struct sk_buff *skb) { + struct tcphdr *th = skb->h.th, *t1; struct sk_buff *buff; - struct tcphdr *t1; struct flowi fl; if(th->rst) @@ -982,7 +1034,7 @@ static void tcp_v6_send_reset(struct in6_addr *saddr, struct in6_addr *daddr, if (buff == NULL) return; - buff->dev = dev; + buff->dev = skb->dev; tcp_v6_build_header(NULL, buff); @@ -1009,29 +1061,32 @@ static void tcp_v6_send_reset(struct in6_addr *saddr, struct in6_addr *daddr, } buff->csum = csum_partial((char *)t1, sizeof(*t1), 0); - - t1->check = csum_ipv6_magic(saddr, daddr, sizeof(*t1), IPPROTO_TCP, + + fl.nl_u.ip6_u.daddr = &skb->nh.ipv6h->daddr; + fl.nl_u.ip6_u.saddr = &skb->nh.ipv6h->saddr; + + t1->check = csum_ipv6_magic(fl.nl_u.ip6_u.saddr, + fl.nl_u.ip6_u.daddr, + sizeof(*t1), IPPROTO_TCP, buff->csum); fl.proto = IPPROTO_TCP; - fl.nl_u.ip6_u.daddr = daddr; - fl.nl_u.ip6_u.saddr = saddr; - fl.dev = dev; + fl.dev = skb->dev; fl.uli_u.ports.dport = th->dest; fl.uli_u.ports.sport = th->source; ip6_xmit(NULL, buff, &fl, NULL); tcp_statistics.TcpOutSegs++; + tcp_statistics.TcpOutRsts++; } static struct open_request *tcp_v6_search_req(struct tcp_opt *tp, - void *header, + struct ipv6hdr *ip6h, struct tcphdr *th, struct open_request **prevp) { - struct ipv6hdr *ip6h = header; struct open_request *req, *prev; - __u16 rport = 
th->source; + __u16 rport = th->source; /* assumption: the socket is not in use. * as we checked the user count on tcp_rcv and we're @@ -1050,6 +1105,22 @@ static struct open_request *tcp_v6_search_req(struct tcp_opt *tp, return NULL; } +static void tcp_v6_rst_req(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct open_request *req, *prev; + + req = tcp_v6_search_req(tp,skb->nh.ipv6h,skb->h.th,&prev); + if (!req) + return; + /* Sequence number check required by RFC793 */ + if (before(skb->seq, req->snt_isn) || after(skb->seq, req->snt_isn+1)) + return; + tcp_synq_unlink(tp, req, prev); + req->class->destructor(req); + tcp_openreq_free(req); +} + int tcp_v6_rcv(struct sk_buff *skb, struct device *dev, struct in6_addr *saddr, struct in6_addr *daddr, struct ipv6_options *opt, unsigned short len, @@ -1077,7 +1148,13 @@ int tcp_v6_rcv(struct sk_buff *skb, struct device *dev, * Pull up the IP header. */ - skb_pull(skb, skb->h.raw - skb->data); + __skb_pull(skb, skb->h.raw - skb->data); + + /* + * Count it even if it's bad. + */ + + tcp_statistics.TcpInSegs++; /* * Try to use the device checksum if provided. @@ -1089,14 +1166,13 @@ int tcp_v6_rcv(struct sk_buff *skb, struct device *dev, case CHECKSUM_HW: if (tcp_v6_check(th,len,saddr,daddr,skb->csum)) { printk(KERN_DEBUG "tcp csum failed\n"); + tcp_statistics.TcpInErrs++; goto discard_it; } default: /* CHECKSUM_UNNECESSARY */ }; - tcp_statistics.TcpInSegs++; - sk = __tcp_v6_lookup(th, saddr, th->source, daddr, th->dest); if (!sk) { @@ -1137,28 +1213,35 @@ int tcp_v6_rcv(struct sk_buff *skb, struct device *dev, } } - if (!sk->prot) { - printk(KERN_DEBUG "tcp_rcv: sk->prot == NULL\n"); - return(0); - } - skb_set_owner_r(skb, sk); - /* I don't understand why lock_sock()/release_sock() is not - * called here. IPv4 does this. It looks like a bug to me. 
-AK - */ if (sk->state == TCP_ESTABLISHED) { if (tcp_rcv_established(sk, skb, th, len)) goto no_tcp_socket; return 0; } + if (sk->state == TCP_LISTEN) { + __u32 flg = ((u32 *)th)[3]; - if (sk->state == TCP_LISTEN && - ((u32 *)th)[3] & __constant_htonl(0x00120000)) { - sk = tcp_check_req(sk, skb, opt); - if (sk == NULL) - goto discard_it; + /* Check for RST */ + if (flg & __constant_htonl(0x00040000)) { + tcp_v6_rst_req(sk, skb); + } + + /* Check SYN|ACK */ + if (flg & __constant_htonl(0x00120000)) { + struct open_request *req, *prev; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + req = tcp_v6_search_req(tp, skb->nh.ipv6h,th,&prev); + if (req) { + sk = tcp_check_req(sk, skb, req); + } + /* else do syncookies (add them here) */ + if (sk == NULL) + goto discard_it; + } } if (tcp_rcv_state_process(sk, skb, th, opt, len) == 0) @@ -1168,11 +1251,10 @@ no_tcp_socket: /* * No such TCB. If th->rst is 0 send a reset - * (checked in tcp_send_reset) + * (checked in tcp_v6_send_reset) */ - tcp_v6_send_reset(daddr, saddr, th, &tcpv6_prot, opt, dev, - skb->nh.ipv6h->priority, 255); + tcp_v6_send_reset(skb); discard_it: @@ -1285,12 +1367,6 @@ static void v6_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) sin6->sin6_port = sk->dummy_th.dest; } -static struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb, - void *opt) -{ - return sk; /* dummy */ -} - static struct tcp_func ipv6_specific = { tcp_v6_build_header, tcp_v6_xmit, @@ -1302,9 +1378,6 @@ static struct tcp_func ipv6_specific = { ipv6_setsockopt, ipv6_getsockopt, v6_addr2sockaddr, - tcp_v6_reply_reset, - tcp_v6_search_req, - /* not implemented yet: */ cookie_v6_check, sizeof(struct sockaddr_in6) }; @@ -1323,9 +1396,6 @@ static struct tcp_func ipv6_mapped = { ipv6_setsockopt, ipv6_getsockopt, v6_addr2sockaddr, - tcp_v6_reply_reset, - tcp_v6_search_req, - cookie_v6_check, /* not implemented yet. 
*/ sizeof(struct sockaddr_in6) }; @@ -1364,8 +1434,6 @@ static int tcp_v6_init_sock(struct sock *sk) sk->priority = 1; sk->state = TCP_CLOSE; - /* this is how many unacked bytes we will accept for this socket. */ - sk->max_unacked = 2048; /* needs to be at most 2 full packets. */ sk->max_ack_backlog = SOMAXCONN; sk->mtu = 576; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index f18f5a6f8..aed22f964 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -7,7 +7,7 @@ * * Based on linux/ipv4/udp.c * - * $Id: udp.c,v 1.17 1997/04/29 09:38:55 mj Exp $ + * $Id: udp.c,v 1.18 1997/09/14 08:32:24 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -601,8 +601,9 @@ static int udpv6_getfrag(const void *data, struct in6_addr *addr, clen -= sizeof(struct udphdr); } - udh->wcheck = csum_partial_copy_fromiovecend(dst, udh->iov, offset, - clen, udh->wcheck); + if (csum_partial_copy_fromiovecend(dst, udh->iov, offset, + clen, &udh->wcheck)) + return -EFAULT; if (final) { struct in6_addr *daddr; diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 13d9528e6..2a46c5270 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -694,7 +694,6 @@ static int ipxitf_send(ipx_interface *intrfc, struct sk_buff *skb, char *node) /* * Send it out */ - skb->priority = SOPRI_NORMAL; dev_queue_xmit(skb); return 0; } diff --git a/net/netlink.c b/net/netlink.c index d2128c180..e69de29bb 100644 --- a/net/netlink.c +++ b/net/netlink.c @@ -1,475 +0,0 @@ -/* - * NETLINK An implementation of a loadable kernel mode driver providing - * multiple kernel/user space bidirectional communications links. - * - * Author: Alan Cox <alan@cymru.net> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- * - */ - -#include <linux/module.h> - -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/major.h> -#include <linux/sched.h> -#include <linux/malloc.h> -#include <linux/ioport.h> -#include <linux/fcntl.h> -#include <linux/delay.h> -#include <linux/interrupt.h> -#include <linux/skbuff.h> -#include <linux/init.h> - -#include <net/netlink.h> - -#include <asm/poll.h> -#include <asm/io.h> -#include <asm/uaccess.h> -#include <asm/system.h> - -static int (*netlink_handler[MAX_LINKS])(int minor, struct sk_buff *skb); -static struct sk_buff_head skb_queue_rd[MAX_LINKS]; -static int rdq_size[MAX_LINKS]; -static struct wait_queue *read_space_wait[MAX_LINKS]; - -static unsigned long active_map = 0; -static unsigned long open_map = 0; - -/* - * Device operations - */ - -/* - * Default write handler. - */ - -static int netlink_err(int minor, struct sk_buff *skb) -{ - kfree_skb(skb, FREE_READ); - return -EUNATCH; -} - -/* - * Exported do nothing receiver for one way - * interfaces. - */ - -int netlink_donothing(int minor, struct sk_buff *skb) -{ - kfree_skb(skb, FREE_READ); - return -EINVAL; -} - -static unsigned int netlink_poll(struct file *file, poll_table * wait) -{ - unsigned int mask; - unsigned int minor = MINOR(file->f_dentry->d_inode->i_rdev); - - poll_wait(&read_space_wait[minor], wait); - mask = POLLOUT | POLLWRNORM; - if (skb_peek(&skb_queue_rd[minor])) - mask |= POLLIN | POLLRDNORM; - return mask; -} - -/* - * Write a message to the kernel side of a communication link - */ - -static ssize_t netlink_write(struct file * file, const char * buf, - size_t count,loff_t *ppos) -{ - int err; - unsigned int minor = MINOR(file->f_dentry->d_inode->i_rdev); - struct sk_buff *skb; - skb=alloc_skb(count, GFP_KERNEL); - err = copy_from_user(skb_put(skb,count),buf, count); - return err ? 
-EFAULT : (netlink_handler[minor])(minor,skb); -} - -/* - * Read a message from the kernel side of the communication link - */ - -static ssize_t netlink_read(struct file * file, char * buf, - size_t count,loff_t *ppos) -{ - int err; - unsigned int minor = MINOR(file->f_dentry->d_inode->i_rdev); - struct sk_buff *skb; - cli(); - while((skb=skb_dequeue(&skb_queue_rd[minor]))==NULL) - { - if(file->f_flags&O_NONBLOCK) - { - sti(); - return -EAGAIN; - } - interruptible_sleep_on(&read_space_wait[minor]); - if(signal_pending(current)) - { - sti(); - return -ERESTARTSYS; - } - } - rdq_size[minor]-=skb->len; - sti(); - if(skb->len<count) - count=skb->len; - err = copy_to_user(buf,skb->data,count); - kfree_skb(skb, FREE_READ); - return err ? -EFAULT : count; -} - -static long long netlink_lseek(struct file * file, long long offset, int origin) -{ - return -ESPIPE; -} - -static int netlink_open(struct inode * inode, struct file * file) -{ - unsigned int minor = MINOR(inode->i_rdev); - - if(minor>=MAX_LINKS) - return -ENODEV; - if(active_map&(1<<minor)) - { - if (file->f_mode & FMODE_READ) - { - if (open_map&(1<<minor)) - return -EBUSY; - open_map|=(1<<minor); - } - MOD_INC_USE_COUNT; - return 0; - } - return -EUNATCH; -} - -static int netlink_release(struct inode * inode, struct file * file) -{ - unsigned int minor = MINOR(inode->i_rdev); - if (file->f_mode & FMODE_READ) - open_map&=~(1<<minor); - MOD_DEC_USE_COUNT; - return 0; -} - - -static int netlink_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) -{ - unsigned int minor = MINOR(inode->i_rdev); - int retval = 0; - - if (minor >= MAX_LINKS) - return -ENODEV; - switch ( cmd ) { - default: - retval = -EINVAL; - } - return retval; -} - - -static struct file_operations netlink_fops = { - netlink_lseek, - netlink_read, - netlink_write, - NULL, /* netlink_readdir */ - netlink_poll, - netlink_ioctl, - NULL, /* netlink_mmap */ - netlink_open, - netlink_release -}; - -/* - * We export these 
functions to other modules. They provide a - * complete set of kernel non-blocking support for message - * queueing. - */ - -int netlink_attach(int unit, int (*function)(int minor, struct sk_buff *skb)) -{ - if(unit>=MAX_LINKS) - return -ENODEV; - if(active_map&(1<<unit)) - return -EBUSY; - active_map|=(1<<unit); - netlink_handler[unit]=function; - return 0; -} - -void netlink_detach(int unit) -{ - active_map&=~(1<<unit); - netlink_handler[unit]=netlink_err; -} - -int netlink_post(int unit, struct sk_buff *skb) -{ - unsigned long flags; - int ret=-EUNATCH; - if(open_map&(1<<unit)) - { - save_flags(flags); - cli(); - if(rdq_size[unit]+skb->len>MAX_QBYTES) - ret=-EAGAIN; - else - { - skb_queue_tail(&skb_queue_rd[unit], skb); - rdq_size[unit]+=skb->len; - ret=0; - wake_up_interruptible(&read_space_wait[unit]); - } - restore_flags(flags); - } - return ret; -} - - -/* - * "High" level netlink interface. (ANK) - * - * Features: - * - standard message format. - * - pseudo-reliable delivery. Messages can be still lost, but - * user level will know that they were lost and can - * recover (f.e. gated could reread FIB and device list) - * - messages are batched. - * - if user is not attached, we do not make useless work. - * - * Examples: - * - netlink_post equivalent (but with pseudo-reliable delivery) - * ctl.nlmsg_delay = 0; - * ctl.nlmsg_maxsize = <one message size>; - * .... - * msg = nlmsg_send(&ctl, ...); - * if (msg) { - * ... make it ... - * nlmsg_transmit(&ctl); - * } - * - * - batched messages. - * if nlmsg_delay==0, messages are delivered only - * by nlmsg_transmit, or when batch is completed, - * otherwise nlmsg_transmit is noop (only starts - * timer) - * - * ctl.nlmsg_delay = ...; - * ctl.nlmsg_maxsize = <one batch size>; - * .... - * msg = nlmsg_send(&ctl, ...); - * if (msg) - * ... make it ... - * .... - * msg = nlmsg_send(&ctl, ...); - * if (msg) - * ... make it ... - * .... 
- * if (ctl.nlmsg_skb) - * nlmsg_transmit(&ctl); - * - */ - -/* - * Try to deliver queued messages. - * If the delivery fails (netlink is not attached or congested), - * do not free skb to avoid useless new message creation. - * - * Notes: - * - timer should be already stopped. - * - NET SPL. - */ - -void nlmsg_flush(struct nlmsg_ctl *ctl) -{ - if (ctl->nlmsg_skb == NULL) - return; - - if (netlink_post(ctl->nlmsg_unit, ctl->nlmsg_skb) == 0) - { - ctl->nlmsg_skb = NULL; - return; - } - - ctl->nlmsg_timer.expires = jiffies + NLMSG_RECOVERY_TIMEO; - ctl->nlmsg_timer.data = (unsigned long)ctl; - ctl->nlmsg_timer.function = (void (*)(unsigned long))nlmsg_flush; - add_timer(&ctl->nlmsg_timer); - return; -} - - -/* - * Allocate room for new message. If it is impossible, - * start "overrun" mode and return NULL. - * - * Notes: - * - NET SPL. - */ - -void* nlmsg_send(struct nlmsg_ctl *ctl, unsigned long type, int len, - unsigned long seq, unsigned long pid) -{ - struct nlmsghdr *nlh; - struct sk_buff *skb; - int rlen; - - static __inline__ void nlmsg_lost(struct nlmsg_ctl *ctl, - unsigned long seq) - { - if (!ctl->nlmsg_overrun) - { - ctl->nlmsg_overrun_start = seq; - ctl->nlmsg_overrun_end = seq; - ctl->nlmsg_overrun = 1; - return; - } - if (!ctl->nlmsg_overrun_start) - ctl->nlmsg_overrun_start = seq; - if (seq) - ctl->nlmsg_overrun_end = seq; - } - - if (!(open_map&(1<<ctl->nlmsg_unit))) - { - nlmsg_lost(ctl, seq); - return NULL; - } - - rlen = NLMSG_ALIGN(len + sizeof(struct nlmsghdr)); - - if (rlen > ctl->nlmsg_maxsize) - { - printk(KERN_ERR "nlmsg_send: too big message\n"); - return NULL; - } - - if ((skb=ctl->nlmsg_skb) == NULL || skb_tailroom(skb) < rlen) - { - if (skb) - { - ctl->nlmsg_force++; - nlmsg_flush(ctl); - ctl->nlmsg_force--; - } - - if (ctl->nlmsg_skb || - (skb=alloc_skb(ctl->nlmsg_maxsize, GFP_ATOMIC)) == NULL) - { - printk (KERN_WARNING "nlmsg at unit %d overrunned\n", ctl->nlmsg_unit); - nlmsg_lost(ctl, seq); - return NULL; - } - - ctl->nlmsg_skb = 
skb; - - if (ctl->nlmsg_overrun) - { - int *seqp; - nlh = (struct nlmsghdr*)skb_put(skb, sizeof(struct nlmsghdr) + 2*sizeof(unsigned long)); - nlh->nlmsg_type = NLMSG_OVERRUN; - nlh->nlmsg_len = sizeof(struct nlmsghdr) + 2*sizeof(unsigned long); - nlh->nlmsg_seq = 0; - nlh->nlmsg_pid = 0; - seqp = (int*)nlh->nlmsg_data; - seqp[0] = ctl->nlmsg_overrun_start; - seqp[1] = ctl->nlmsg_overrun_end; - ctl->nlmsg_overrun = 0; - } - if (ctl->nlmsg_timer.function) - { - del_timer(&ctl->nlmsg_timer); - ctl->nlmsg_timer.function = NULL; - } - if (ctl->nlmsg_delay) - { - ctl->nlmsg_timer.expires = jiffies + ctl->nlmsg_delay; - ctl->nlmsg_timer.function = (void (*)(unsigned long))nlmsg_flush; - ctl->nlmsg_timer.data = (unsigned long)ctl; - add_timer(&ctl->nlmsg_timer); - } - } - - nlh = (struct nlmsghdr*)skb_put(skb, rlen); - nlh->nlmsg_type = type; - nlh->nlmsg_len = sizeof(struct nlmsghdr) + len; - nlh->nlmsg_seq = seq; - nlh->nlmsg_pid = pid; - return nlh->nlmsg_data; -} - -/* - * Kick message queue. - * Two modes: - * - synchronous (delay==0). Messages are delivered immediately. - * - delayed. Do not deliver, but start delivery timer. 
- */ - -void nlmsg_transmit(struct nlmsg_ctl *ctl) -{ - start_bh_atomic(); - - if (!ctl->nlmsg_delay) - { - if (ctl->nlmsg_timer.function) - { - del_timer(&ctl->nlmsg_timer); - ctl->nlmsg_timer.function = NULL; - } - ctl->nlmsg_force++; - nlmsg_flush(ctl); - ctl->nlmsg_force--; - end_bh_atomic(); - return; - } - if (!ctl->nlmsg_timer.function) - { - ctl->nlmsg_timer.expires = jiffies + ctl->nlmsg_delay; - ctl->nlmsg_timer.function = (void (*)(unsigned long))nlmsg_flush; - ctl->nlmsg_timer.data = (unsigned long)ctl; - add_timer(&ctl->nlmsg_timer); - } - - end_bh_atomic(); -} - - -__initfunc(int init_netlink(void)) -{ - int ct; - - if(register_chrdev(NETLINK_MAJOR,"netlink", &netlink_fops)) { - printk(KERN_ERR "netlink: unable to get major %d\n", NETLINK_MAJOR); - return -EIO; - } - for(ct=0;ct<MAX_LINKS;ct++) - { - skb_queue_head_init(&skb_queue_rd[ct]); - netlink_handler[ct]=netlink_err; - } - return 0; -} - -#ifdef MODULE - -int init_module(void) -{ - printk(KERN_INFO "Network Kernel/User communications module 0.05\n"); - return init_netlink(); -} - -void cleanup_module(void) -{ - unregister_chrdev(NET_MAJOR,"netlink"); -} - -#endif diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c index f7b617dcc..380ec8ecc 100644 --- a/net/netrom/nr_dev.c +++ b/net/netrom/nr_dev.c @@ -249,14 +249,6 @@ int nr_init(struct device *dev) /* New-style flags. 
*/ dev->flags = 0; - dev->family = AF_INET; - -#ifdef CONFIG_INET - dev->pa_addr = in_aton("192.168.0.1"); - dev->pa_brdaddr = in_aton("192.168.0.255"); - dev->pa_mask = in_aton("255.255.255.0"); - dev->pa_alen = 4; -#endif if ((dev->priv = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL)) == NULL) return -ENOMEM; diff --git a/net/netsyms.c b/net/netsyms.c index 089d6ebc1..dfc3c9db1 100644 --- a/net/netsyms.c +++ b/net/netsyms.c @@ -19,6 +19,7 @@ #ifdef CONFIG_INET #include <linux/ip.h> #include <linux/etherdevice.h> +#include <linux/fddidevice.h> #include <net/protocol.h> #include <net/arp.h> #include <net/ip.h> @@ -28,8 +29,8 @@ #include <net/route.h> #include <net/scm.h> #include <net/inet_common.h> +#include <net/pkt_sched.h> #include <linux/inet.h> -#include <linux/net_alias.h> #include <linux/mroute.h> extern struct net_proto_family inet_family_ops; @@ -43,13 +44,7 @@ extern struct net_proto_family inet_family_ops; #endif -#ifdef CONFIG_NETLINK -#include <net/netlink.h> -#endif - -#ifdef CONFIG_NET_ALIAS -#include <linux/net_alias.h> -#endif +#include <linux/rtnetlink.h> #include <net/scm.h> @@ -121,6 +116,7 @@ EXPORT_SYMBOL(skb_copy_datagram_iovec); EXPORT_SYMBOL(skb_realloc_headroom); EXPORT_SYMBOL(datagram_poll); EXPORT_SYMBOL(put_cmsg); +EXPORT_SYMBOL(net_families); EXPORT_SYMBOL(neigh_table_init); /* Declared in <net/neighbour.h> but not defined? 
@@ -144,6 +140,13 @@ EXPORT_SYMBOL(dst_total); EXPORT_SYMBOL(__scm_destroy); EXPORT_SYMBOL(__scm_send); +/* Needed by unix.o */ +EXPORT_SYMBOL(scm_fp_dup); +EXPORT_SYMBOL(max_files); +EXPORT_SYMBOL(do_mknod); +EXPORT_SYMBOL(memcpy_toiovec); +EXPORT_SYMBOL(csum_partial); + #ifdef CONFIG_IPX_MODULE EXPORT_SYMBOL(make_8023_client); EXPORT_SYMBOL(destroy_8023_client); @@ -153,6 +156,9 @@ EXPORT_SYMBOL(destroy_EII_client); #ifdef CONFIG_ATALK_MODULE EXPORT_SYMBOL(sklist_destroy_socket); +#endif + +#if defined(CONFIG_ATALK_MODULE) || defined(CONFIG_PACKET_MODULE) EXPORT_SYMBOL(sklist_insert_socket); #endif @@ -169,15 +175,14 @@ EXPORT_SYMBOL(init_etherdev); EXPORT_SYMBOL(ip_route_output); EXPORT_SYMBOL(icmp_send); EXPORT_SYMBOL(ip_options_compile); -EXPORT_SYMBOL(ip_rt_put); EXPORT_SYMBOL(arp_send); EXPORT_SYMBOL(ip_id_count); EXPORT_SYMBOL(ip_send_check); EXPORT_SYMBOL(ip_fragment); -EXPORT_SYMBOL(ip_dev_find_tunnel); EXPORT_SYMBOL(inet_family_ops); EXPORT_SYMBOL(in_aton); EXPORT_SYMBOL(in_ntoa); +EXPORT_SYMBOL(net_ratelimit); #ifdef CONFIG_IPV6_MODULE /* inet functions common to v4 and v6 */ @@ -205,8 +210,6 @@ EXPORT_SYMBOL(udp_hash); EXPORT_SYMBOL(destroy_sock); EXPORT_SYMBOL(ip_queue_xmit); -EXPORT_SYMBOL(csum_partial); -EXPORT_SYMBOL(dev_lockct); EXPORT_SYMBOL(memcpy_fromiovecend); EXPORT_SYMBOL(csum_partial_copy_fromiovecend); EXPORT_SYMBOL(__release_sock); @@ -231,7 +234,6 @@ EXPORT_SYMBOL(tcp_getsockopt); EXPORT_SYMBOL(tcp_recvmsg); EXPORT_SYMBOL(tcp_send_synack); EXPORT_SYMBOL(tcp_check_req); -EXPORT_SYMBOL(sock_wmalloc); EXPORT_SYMBOL(tcp_reset_xmit_timer); EXPORT_SYMBOL(tcp_parse_options); EXPORT_SYMBOL(tcp_rcv_established); @@ -249,13 +251,34 @@ EXPORT_SYMBOL(tcp_v4_conn_request); EXPORT_SYMBOL(tcp_v4_syn_recv_sock); EXPORT_SYMBOL(tcp_v4_do_rcv); EXPORT_SYMBOL(tcp_v4_connect); -EXPORT_SYMBOL(__ip_chk_addr); +EXPORT_SYMBOL(inet_addr_type); EXPORT_SYMBOL(net_reset_timer); EXPORT_SYMBOL(net_delete_timer); EXPORT_SYMBOL(udp_prot); EXPORT_SYMBOL(tcp_prot); 
EXPORT_SYMBOL(tcp_openreq_cachep); EXPORT_SYMBOL(ipv4_specific); +EXPORT_SYMBOL(tcp_simple_retransmit); + +EXPORT_SYMBOL(xrlim_allow); +#endif + +#ifdef CONFIG_PACKET_MODULE +EXPORT_SYMBOL(dev_set_allmulti); +EXPORT_SYMBOL(dev_set_promiscuity); +EXPORT_SYMBOL(dev_mc_delete); +EXPORT_SYMBOL(sklist_remove_socket); +EXPORT_SYMBOL(rtnl_wait); +EXPORT_SYMBOL(rtnl_rlockct); +#ifdef CONFIG_RTNETLINK +EXPORT_SYMBOL(rtnl); +EXPORT_SYMBOL(rtnl_wlockct); +#endif +#endif + +#if defined(CONFIG_IPV6_MODULE) || defined(CONFIG_PACKET_MODULE) +EXPORT_SYMBOL(dev_lockct); +EXPORT_SYMBOL(sock_wmalloc); #endif #if defined(CONFIG_ULTRA) || defined(CONFIG_WD80x3) || \ @@ -282,15 +305,9 @@ EXPORT_SYMBOL(tr_freedev); EXPORT_SYMBOL(tr_reformat); #endif -#ifdef CONFIG_NET_ALIAS -#include <linux/net_alias.h> -#endif - /* Used by at least ipip.c. */ EXPORT_SYMBOL(ipv4_config); -#ifdef CONFIG_IP_MROUTE -EXPORT_SYMBOL(ip_mr_find_tunnel); -#endif +EXPORT_SYMBOL(dev_open); #endif /* CONFIG_INET */ @@ -298,19 +315,19 @@ EXPORT_SYMBOL(ip_mr_find_tunnel); EXPORT_SYMBOL(register_netdevice_notifier); EXPORT_SYMBOL(unregister_netdevice_notifier); -#ifdef CONFIG_NET_ALIAS -EXPORT_SYMBOL(register_net_alias_type); -EXPORT_SYMBOL(unregister_net_alias_type); -#endif - /* support for loadable net drivers */ #ifdef CONFIG_NET +EXPORT_SYMBOL(register_netdevice); +EXPORT_SYMBOL(unregister_netdevice); EXPORT_SYMBOL(register_netdev); EXPORT_SYMBOL(unregister_netdev); EXPORT_SYMBOL(ether_setup); EXPORT_SYMBOL(dev_new_index); EXPORT_SYMBOL(dev_get_by_index); EXPORT_SYMBOL(eth_type_trans); +#ifdef CONFIG_FDDI +EXPORT_SYMBOL(fddi_type_trans); +#endif /* CONFIG_FDDI */ EXPORT_SYMBOL(eth_copy_and_sum); EXPORT_SYMBOL(alloc_skb); EXPORT_SYMBOL(__kfree_skb); @@ -318,7 +335,6 @@ EXPORT_SYMBOL(skb_clone); EXPORT_SYMBOL(skb_copy); EXPORT_SYMBOL(dev_alloc_skb); EXPORT_SYMBOL(netif_rx); -EXPORT_SYMBOL(dev_tint); EXPORT_SYMBOL(dev_add_pack); EXPORT_SYMBOL(dev_remove_pack); EXPORT_SYMBOL(dev_get); @@ -340,6 +356,9 @@ 
EXPORT_SYMBOL(kill_fasync); EXPORT_SYMBOL(ip_rcv); EXPORT_SYMBOL(arp_rcv); +EXPORT_SYMBOL(rtnl_lock); +EXPORT_SYMBOL(rtnl_unlock); + EXPORT_SYMBOL(if_port_text); #if defined(CONFIG_ATALK) || defined(CONFIG_ATALK_MODULE) @@ -352,11 +371,13 @@ extern int (*dlci_ioctl_hook)(unsigned int, void *); EXPORT_SYMBOL(dlci_ioctl_hook); #endif -#endif /* CONFIG_NET */ +/* Packet scheduler modules want these. */ +EXPORT_SYMBOL(qdisc_destroy); +EXPORT_SYMBOL(qdisc_reset); +EXPORT_SYMBOL(qdisc_restart); +EXPORT_SYMBOL(qdisc_head); +EXPORT_SYMBOL(register_qdisc); +EXPORT_SYMBOL(unregister_qdisc); +EXPORT_SYMBOL(noop_qdisc); -#ifdef CONFIG_NETLINK -EXPORT_SYMBOL(netlink_attach); -EXPORT_SYMBOL(netlink_detach); -EXPORT_SYMBOL(netlink_donothing); -EXPORT_SYMBOL(netlink_post); -#endif /* CONFIG_NETLINK */ +#endif /* CONFIG_NET */ diff --git a/net/protocols.c b/net/protocols.c index a0bb0a6b8..cba2a3ac4 100644 --- a/net/protocols.c +++ b/net/protocols.c @@ -10,8 +10,6 @@ #include <linux/net.h> #include <linux/fs.h> -#define CONFIG_UNIX /* always present... 
*/ - #ifdef CONFIG_UNIX #include <linux/un.h> #include <net/af_unix.h> @@ -24,6 +22,14 @@ extern void inet6_proto_init(struct net_proto *pro); #endif #endif /* INET */ +#ifdef CONFIG_NETLINK +extern void netlink_proto_init(struct net_proto *pro); +#endif + +#ifdef CONFIG_PACKET +extern void packet_proto_init(struct net_proto *pro); +#endif + #if defined(CONFIG_IPX) || defined(CONFIG_IPX_MODULE) #define NEED_802 #include <net/ipxcall.h> @@ -61,6 +67,10 @@ extern void inet6_proto_init(struct net_proto *pro); #include <net/netbeuicall.h> #endif +#if defined(CONFIG_LLC) +#define NEED_LLC +#endif + #include <net/psnapcall.h> #ifdef CONFIG_TR @@ -84,6 +94,14 @@ extern void rif_init(struct net_proto *); */ struct net_proto protocols[] = { +#ifdef CONFIG_NETLINK + { "NETLINK", netlink_proto_init }, +#endif + +#ifdef CONFIG_PACKET + { "PACKET", packet_proto_init }, +#endif + #ifdef CONFIG_UNIX { "UNIX", unix_proto_init }, /* Unix domain socket family */ #endif diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 69b77a9f2..5ae64334d 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -548,6 +548,8 @@ static int rose_create(struct socket *sock, int protocol) sock_init_data(sock, sk); + skb_queue_head_init(&rose->ack_queue); + sock->ops = &rose_proto_ops; sk->protocol = protocol; sk->mtu = ROSE_MTU; /* 253 */ @@ -555,8 +557,6 @@ static int rose_create(struct socket *sock, int protocol) init_timer(&rose->timer); init_timer(&rose->idletimer); - skb_queue_head_init(&rose->frag_queue); - rose->t1 = sysctl_rose_call_request_timeout; rose->t2 = sysctl_rose_reset_request_timeout; rose->t3 = sysctl_rose_clear_request_timeout; @@ -583,6 +583,8 @@ static struct sock *rose_make_new(struct sock *osk) sock_init_data(NULL, sk); + skb_queue_head_init(&rose->ack_queue); + sk->type = osk->type; sk->socket = osk->socket; sk->priority = osk->priority; @@ -598,8 +600,6 @@ static struct sock *rose_make_new(struct sock *osk) init_timer(&rose->timer); init_timer(&rose->idletimer); - 
skb_queue_head_init(&rose->frag_queue); - rose->t1 = osk->protinfo.rose->t1; rose->t2 = osk->protinfo.rose->t2; rose->t3 = osk->protinfo.rose->t3; @@ -1068,7 +1068,9 @@ static int rose_sendmsg(struct socket *sock, struct msghdr *msg, int len, return -ENOTCONN; } - rose_output(sk, skb); /* Shove it onto the queue */ + skb_queue_tail(&sk->write_queue, skb); /* Shove it onto the queue */ + + rose_kick(sk); return len; } @@ -1210,7 +1212,7 @@ static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return 0; } - case SIOCRSL2CALL: + case SIOCRSSL2CALL: if (!suser()) return -EPERM; if (ax25cmp(&rose_callsign, &null_ax25_address) != 0) ax25_listen_release(&rose_callsign, NULL); @@ -1220,6 +1222,11 @@ static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) ax25_listen_register(&rose_callsign, NULL); return 0; + case SIOCRSGL2CALL: + if (copy_to_user((void *)arg, &rose_callsign, sizeof(ax25_address))) + return -EFAULT; + return 0; + case SIOCRSACCEPT: if (sk->protinfo.rose->state == ROSE_STATE_5) { rose_write_internal(sk, ROSE_CALL_ACCEPTED); diff --git a/net/rose/rose_dev.c b/net/rose/rose_dev.c index 7861220ee..bc2097cda 100644 --- a/net/rose/rose_dev.c +++ b/net/rose/rose_dev.c @@ -221,14 +221,6 @@ int rose_init(struct device *dev) /* New-style flags. */ dev->flags = 0; - dev->family = AF_INET; - -#ifdef CONFIG_INET - dev->pa_addr = in_aton("192.168.0.1"); - dev->pa_brdaddr = in_aton("192.168.0.255"); - dev->pa_mask = in_aton("255.255.255.0"); - dev->pa_alen = 4; -#endif if ((dev->priv = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL)) == NULL) return -ENOMEM; diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c index 1ac11528d..de412d3c4 100644 --- a/net/rose/rose_in.c +++ b/net/rose/rose_in.c @@ -19,6 +19,7 @@ * ROSE 001 Jonathan(G4KLX) Cloned from nr_in.c * ROSE 002 Jonathan(G4KLX) Return cause and diagnostic codes from Clear Requests. * ROSE 003 Jonathan(G4KLX) New timer architecture. + * Removed M bit processing. 
*/ #include <linux/config.h> @@ -46,43 +47,6 @@ #include <linux/interrupt.h> #include <net/rose.h> -static int rose_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more) -{ - struct sk_buff *skbo, *skbn = skb; - - rose_start_idletimer(sk); - - if (more) { - sk->protinfo.rose->fraglen += skb->len; - skb_queue_tail(&sk->protinfo.rose->frag_queue, skb); - return 0; - } - - if (!more && sk->protinfo.rose->fraglen > 0) { /* End of fragment */ - sk->protinfo.rose->fraglen += skb->len; - skb_queue_tail(&sk->protinfo.rose->frag_queue, skb); - - if ((skbn = alloc_skb(sk->protinfo.rose->fraglen, GFP_ATOMIC)) == NULL) - return 1; - - skbn->h.raw = skbn->data; - - skbo = skb_dequeue(&sk->protinfo.rose->frag_queue); - memcpy(skb_put(skbn, skbo->len), skbo->data, skbo->len); - kfree_skb(skbo, FREE_READ); - - while ((skbo = skb_dequeue(&sk->protinfo.rose->frag_queue)) != NULL) { - skb_pull(skbo, ROSE_MIN_LEN); - memcpy(skb_put(skbn, skbo->len), skbo->data, skbo->len); - kfree_skb(skbo, FREE_READ); - } - - sk->protinfo.rose->fraglen = 0; - } - - return sock_queue_rcv_skb(sk, skbn); -} - /* * State machine for state 1, Awaiting Call Accepted State. * The handling of the timer(s) is in file rose_timer.c. 
@@ -166,6 +130,7 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety sk->protinfo.rose->vr = 0; sk->protinfo.rose->va = 0; sk->protinfo.rose->vl = 0; + rose_requeue_frames(sk); break; case ROSE_CLEAR_REQUEST: @@ -191,11 +156,9 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety rose_start_t2timer(sk); rose_stop_idletimer(sk); } else { - if (sk->protinfo.rose->condition & ROSE_COND_PEER_RX_BUSY) { - sk->protinfo.rose->va = nr; - } else { - rose_check_iframes_acked(sk, nr); - } + rose_frames_acked(sk, nr); + if (frametype == ROSE_RNR) + rose_requeue_frames(sk); } break; @@ -213,15 +176,12 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety rose_stop_idletimer(sk); break; } - if (sk->protinfo.rose->condition & ROSE_COND_PEER_RX_BUSY) { - sk->protinfo.rose->va = nr; - } else { - rose_check_iframes_acked(sk, nr); - } + rose_frames_acked(sk, nr); if (sk->protinfo.rose->condition & ROSE_COND_OWN_RX_BUSY) break; if (ns == sk->protinfo.rose->vr) { - if (rose_queue_rx_frame(sk, skb, m) == 0) { + rose_start_idletimer(sk); + if (sock_queue_rcv_skb(sk, skb) == 0) { sk->protinfo.rose->vr = (sk->protinfo.rose->vr + 1) % ROSE_MODULUS; queued = 1; } else { @@ -270,6 +230,7 @@ static int rose_state4_machine(struct sock *sk, struct sk_buff *skb, int framety sk->protinfo.rose->vs = 0; sk->protinfo.rose->vl = 0; sk->protinfo.rose->state = ROSE_STATE_3; + rose_requeue_frames(sk); break; case ROSE_CLEAR_REQUEST: diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c index b481e485f..8ee27147a 100644 --- a/net/rose/rose_link.c +++ b/net/rose/rose_link.c @@ -113,7 +113,7 @@ static int rose_send_frame(struct sk_buff *skb, struct rose_neigh *neigh) else rose_call = &rose_callsign; - neigh->ax25 = ax25_send_frame(skb, 256, rose_call, &neigh->callsign, neigh->digipeat, neigh->dev); + neigh->ax25 = ax25_send_frame(skb, 0, rose_call, &neigh->callsign, neigh->digipeat, neigh->dev); return 
(neigh->ax25 != NULL); } diff --git a/net/rose/rose_out.c b/net/rose/rose_out.c index 0ed9f7480..aea1d9f68 100644 --- a/net/rose/rose_out.c +++ b/net/rose/rose_out.c @@ -12,6 +12,7 @@ * History * ROSE 001 Jonathan(G4KLX) Cloned from nr_out.c * ROSE 003 Jonathan(G4KLX) New timer architecture. + * Removed M bit processing. */ #include <linux/config.h> @@ -38,52 +39,6 @@ #include <linux/interrupt.h> #include <net/rose.h> -/* - * This is where all ROSE frames pass; - */ -void rose_output(struct sock *sk, struct sk_buff *skb) -{ - struct sk_buff *skbn; - unsigned char header[ROSE_MIN_LEN]; - int err, frontlen, len; - - if (skb->len - ROSE_MIN_LEN > ROSE_MAX_PACKET_SIZE) { - /* Save a copy of the Header */ - memcpy(header, skb->data, ROSE_MIN_LEN); - skb_pull(skb, ROSE_MIN_LEN); - - frontlen = skb_headroom(skb); - - while (skb->len > 0) { - if ((skbn = sock_alloc_send_skb(sk, frontlen + ROSE_MAX_PACKET_SIZE, 0, 0, &err)) == NULL) - return; - - skb_reserve(skbn, frontlen); - - len = (ROSE_MAX_PACKET_SIZE > skb->len) ? skb->len : ROSE_MAX_PACKET_SIZE; - - /* Copy the user data */ - memcpy(skb_put(skbn, len), skb->data, len); - skb_pull(skb, len); - - /* Duplicate the Header */ - skb_push(skbn, ROSE_MIN_LEN); - memcpy(skbn->data, header, ROSE_MIN_LEN); - - if (skb->len > 0) - skbn->data[2] |= ROSE_M_BIT; - - skb_queue_tail(&sk->write_queue, skbn); /* Throw it on the queue */ - } - - kfree_skb(skb, FREE_WRITE); - } else { - skb_queue_tail(&sk->write_queue, skb); /* Throw it on the queue */ - } - - rose_kick(sk); -} - /* * This procedure is passed a buffer descriptor for an iframe. It builds * the rest of the control part of the frame and then writes it out. 
@@ -103,8 +58,8 @@ static void rose_send_iframe(struct sock *sk, struct sk_buff *skb) void rose_kick(struct sock *sk) { - struct sk_buff *skb; - unsigned short end; + struct sk_buff *skb, *skbn; + unsigned short start, end; if (sk->protinfo.rose->state != ROSE_STATE_3) return; @@ -115,11 +70,14 @@ void rose_kick(struct sock *sk) if (skb_peek(&sk->write_queue) == NULL) return; - end = (sk->protinfo.rose->va + sysctl_rose_window_size) % ROSE_MODULUS; + start = (skb_peek(&sk->protinfo.rose->ack_queue) == NULL) ? sk->protinfo.rose->va : sk->protinfo.rose->vs; + end = (sk->protinfo.rose->va + sysctl_rose_window_size) % ROSE_MODULUS; - if (sk->protinfo.rose->vs == end) + if (start == end) return; + sk->protinfo.rose->vs = start; + /* * Transmit data until either we're out of data to send or * the window is full. @@ -128,13 +86,25 @@ void rose_kick(struct sock *sk) skb = skb_dequeue(&sk->write_queue); do { + if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { + skb_queue_head(&sk->write_queue, skb); + break; + } + + skb_set_owner_w(skbn, sk); + /* - * Transmit the frame. + * Transmit the frame copy. */ - rose_send_iframe(sk, skb); + rose_send_iframe(sk, skbn); sk->protinfo.rose->vs = (sk->protinfo.rose->vs + 1) % ROSE_MODULUS; + /* + * Requeue the original data frame. 
+ */ + skb_queue_tail(&sk->protinfo.rose->ack_queue, skb); + } while (sk->protinfo.rose->vs != end && (skb = skb_dequeue(&sk->write_queue)) != NULL); sk->protinfo.rose->vl = sk->protinfo.rose->vr; @@ -161,14 +131,4 @@ void rose_enquiry_response(struct sock *sk) rose_stop_timer(sk); } -void rose_check_iframes_acked(struct sock *sk, unsigned short nr) -{ - if (sk->protinfo.rose->vs == nr) { - sk->protinfo.rose->va = nr; - } else { - if (sk->protinfo.rose->va != nr) - sk->protinfo.rose->va = nr; - } -} - #endif diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 43358644c..d9145cdea 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -63,7 +63,7 @@ static void rose_remove_neigh(struct rose_neigh *); */ static int rose_add_node(struct rose_route_struct *rose_route, struct device *dev) { - struct rose_node *rose_node, *rose_tmpn, *rose_tmpp; + struct rose_node *rose_node; struct rose_neigh *rose_neigh; unsigned long flags; int i; @@ -116,55 +116,18 @@ static int rose_add_node(struct rose_route_struct *rose_route, struct device *de restore_flags(flags); } - /* - * This is a new node to be inserted into the list. Find where it needs - * to be inserted into the list, and insert it. We want to be sure - * to order the list in descending order of mask size to ensure that - * later when we are searching this list the first match will be the - * best match. 
- */ if (rose_node == NULL) { - rose_tmpn = rose_node_list; - rose_tmpp = NULL; - - while (rose_tmpn != NULL) { - if (rose_tmpn->mask > rose_route->mask) { - rose_tmpp = rose_tmpn; - rose_tmpn = rose_tmpn->next; - } else { - break; - } - } - - /* create new node */ if ((rose_node = kmalloc(sizeof(*rose_node), GFP_ATOMIC)) == NULL) return -ENOMEM; - rose_node->address = rose_route->address; - rose_node->mask = rose_route->mask; - rose_node->count = 1; + rose_node->address = rose_route->address; + rose_node->mask = rose_route->mask; + rose_node->count = 1; rose_node->neighbour[0] = rose_neigh; save_flags(flags); cli(); - - if (rose_tmpn == NULL) { - if (rose_tmpp == NULL) { /* Empty list */ - rose_node_list = rose_node; - rose_node->next = NULL; - } else { - rose_tmpp->next = rose_node; - rose_node->next = NULL; - } - } else { - if (rose_tmpp == NULL) { /* 1st node */ - rose_node->next = rose_node_list; - rose_node_list = rose_node; - } else { - rose_tmpp->next = rose_node; - rose_node->next = rose_tmpn; - } - } - + rose_node->next = rose_node_list; + rose_node_list = rose_node; restore_flags(flags); rose_neigh->count++; @@ -487,20 +450,29 @@ struct rose_route *rose_route_free_lci(unsigned int lci, struct rose_neigh *neig struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause, unsigned char *diagnostic) { struct rose_node *node; + struct rose_neigh *neigh; int failed = 0; + int mask = 0; int i; - for (node = rose_node_list; node != NULL; node = node->next) { + for (neigh = NULL, node = rose_node_list; node != NULL; node = node->next) { if (rosecmpm(addr, &node->address, node->mask) == 0) { - for (i = 0; i < node->count; i++) { - if (!rose_ftimer_running(node->neighbour[i])) - return node->neighbour[i]; - else - failed = 1; + if (node->mask > mask) { + mask = node->mask; + + for (i = 0; i < node->count; i++) { + if (!rose_ftimer_running(node->neighbour[i])) + neigh = node->neighbour[i]; + else + failed = 1; + } } } } + if (neigh != NULL) + return 
neigh; + if (failed) { *cause = ROSE_OUT_OF_ORDER; *diagnostic = 0; diff --git a/net/rose/rose_subr.c b/net/rose/rose_subr.c index ee710bd6e..e7709726c 100644 --- a/net/rose/rose_subr.c +++ b/net/rose/rose_subr.c @@ -49,8 +49,47 @@ void rose_clear_queues(struct sock *sk) while ((skb = skb_dequeue(&sk->write_queue)) != NULL) kfree_skb(skb, FREE_WRITE); - while ((skb = skb_dequeue(&sk->protinfo.rose->frag_queue)) != NULL) - kfree_skb(skb, FREE_READ); + while ((skb = skb_dequeue(&sk->protinfo.rose->ack_queue)) != NULL) + kfree_skb(skb, FREE_WRITE); +} + +/* + * This routine purges the input queue of those frames that have been + * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the + * SDL diagram. + */ +void rose_frames_acked(struct sock *sk, unsigned short nr) +{ + struct sk_buff *skb; + + /* + * Remove all the ack-ed frames from the ack queue. + */ + if (sk->protinfo.rose->va != nr) { + while (skb_peek(&sk->protinfo.rose->ack_queue) != NULL && sk->protinfo.rose->va != nr) { + skb = skb_dequeue(&sk->protinfo.rose->ack_queue); + kfree_skb(skb, FREE_WRITE); + sk->protinfo.rose->va = (sk->protinfo.rose->va + 1) % ROSE_MODULUS; + } + } +} + +void rose_requeue_frames(struct sock *sk) +{ + struct sk_buff *skb, *skb_prev = NULL; + + /* + * Requeue all the un-ack-ed frames on the output queue to be picked + * up by rose_kick. This arrangement handles the possibility of an + * empty output queue. 
+ */ + while ((skb = skb_dequeue(&sk->protinfo.rose->ack_queue)) != NULL) { + if (skb_prev == NULL) + skb_queue_head(&sk->write_queue, skb); + else + skb_append(skb_prev, skb); + skb_prev = skb; + } } /* diff --git a/net/socket.c b/net/socket.c index ee19a84f2..697a06cd3 100644 --- a/net/socket.c +++ b/net/socket.c @@ -74,18 +74,16 @@ #include <linux/firewall.h> #include <linux/wanrouter.h> #include <linux/init.h> +#include <linux/poll.h> #if defined(CONFIG_KERNELD) && defined(CONFIG_NET) #include <linux/kerneld.h> #endif -#include <net/netlink.h> - #include <asm/system.h> #include <asm/uaccess.h> #include <linux/inet.h> -#include <linux/netdevice.h> #include <net/ip.h> #include <net/protocol.h> #include <net/rarp.h> @@ -103,7 +101,8 @@ static ssize_t sock_write(struct file *file, const char *buf, size_t size, loff_t *ppos); static int sock_close(struct inode *inode, struct file *file); -static unsigned int sock_poll(struct file *file, poll_table *wait); +static unsigned int sock_poll(struct file *file, + struct poll_table_struct *wait); static int sock_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg); static int sock_fasync(struct file *filp, int on); @@ -1158,8 +1157,11 @@ asmlinkage int sys_sendmsg(int fd, struct msghdr *msg, unsigned flags) * skbuff accounting stops it from going too far. * I hope this is correct. 
*/ - if (msg_sys.msg_controllen > sizeof(ctl) && - msg_sys.msg_controllen <= 256) + if (msg_sys.msg_controllen > 256) { + err = -EINVAL; + goto failed2; + } + if (msg_sys.msg_controllen > sizeof(ctl)) { ctl_buf = kmalloc(msg_sys.msg_controllen, GFP_KERNEL); if (ctl_buf == NULL) @@ -1176,11 +1178,11 @@ asmlinkage int sys_sendmsg(int fd, struct msghdr *msg, unsigned flags) msg_sys.msg_control = ctl_buf; } msg_sys.msg_flags = flags; - if (current->files->fd[fd]->f_flags & O_NONBLOCK) - msg_sys.msg_flags |= MSG_DONTWAIT; if ((sock = sockfd_lookup(fd,&err))!=NULL) { + if (current->files->fd[fd]->f_flags & O_NONBLOCK) + msg_sys.msg_flags |= MSG_DONTWAIT; err = sock_sendmsg(sock, &msg_sys, total_len); sockfd_put(sock); } @@ -1246,11 +1248,10 @@ asmlinkage int sys_recvmsg(int fd, struct msghdr *msg, unsigned int flags) cmsg_ptr = (unsigned long)msg_sys.msg_control; msg_sys.msg_flags = 0; - if (current->files->fd[fd]->f_flags&O_NONBLOCK) - flags |= MSG_DONTWAIT; - if ((sock = sockfd_lookup(fd, &err))!=NULL) { + if (current->files->fd[fd]->f_flags&O_NONBLOCK) + flags |= MSG_DONTWAIT; err=sock_recvmsg(sock, &msg_sys, total_len, flags); if(err>=0) len=err; @@ -1392,9 +1393,10 @@ asmlinkage int sys_socketcall(int call, unsigned long *args) int sock_register(struct net_proto_family *ops) { - if (ops->family < 0 || ops->family >= NPROTO) - return -1; - + if (ops->family >= NPROTO) { + printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family, NPROTO); + return -ENOBUFS; + } net_families[ops->family]=ops; return 0; } @@ -1450,13 +1452,6 @@ __initfunc(void sock_init(void)) sk_init(); - /* - * The netlink device handler may be needed early. - */ - -#ifdef CONFIG_NETLINK - init_netlink(); -#endif /* * Wan router layer. @@ -1479,6 +1474,17 @@ __initfunc(void sock_init(void)) */ proto_init(); + + /* + * The netlink device handler may be needed early. 
+ */ + +#ifdef CONFIG_RTNETLINK + rtnetlink_init(); +#endif +#ifdef CONFIG_NETLINK_DEV + init_netlink(); +#endif } int socket_get_info(char *buffer, char **start, off_t offset, int length) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index ec6f52e3f..7abaa691e 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -60,8 +60,6 @@ static void call_reconnect(struct rpc_task *task); static u32 * call_header(struct rpc_task *task); static u32 * call_verify(struct rpc_task *task); -#define _S(nr) (1 << ((nr) - 1)) - /* * Create an RPC client * FIXME: This should also take a flags argument (as in task->tk_flags). @@ -197,19 +195,24 @@ rpc_do_call(struct rpc_clnt *clnt, u32 proc, void *argp, void *resp, int flags, rpc_action func, void *data) { struct rpc_task my_task, *task = &my_task; - unsigned long oldmask, sigallow = _S(SIGKILL); + unsigned long sigallow = sigmask(SIGKILL); + sigset_t oldset; + unsigned long irqflags; int async, status; /* Turn off various signals */ if (clnt->cl_intr) { - struct sigaction *action = current->sig->action; - if (action[SIGINT-1].sa_handler == SIG_DFL) - sigallow |= _S(SIGINT); - if (action[SIGQUIT-1].sa_handler == SIG_DFL) - sigallow |= _S(SIGQUIT); + struct k_sigaction *action = current->sig->action; + if (action[SIGINT-1].sa.sa_handler == SIG_DFL) + sigallow |= sigmask(SIGINT); + if (action[SIGQUIT-1].sa.sa_handler == SIG_DFL) + sigallow |= sigmask(SIGQUIT); } - oldmask = current->blocked; - current->blocked |= ~sigallow; + spin_lock_irqsave(¤t->sigmask_lock, irqflags); + oldset = current->blocked; + siginitsetinv(¤t->blocked, sigallow & ~oldset.sig[0]); + recalc_sigpending(current); + spin_unlock_irqrestore(¤t->sigmask_lock, irqflags); /* Create/initialize a new RPC task */ if ((async = (flags & RPC_TASK_ASYNC)) != 0) { @@ -238,7 +241,11 @@ rpc_do_call(struct rpc_clnt *clnt, u32 proc, void *argp, void *resp, } out: - current->blocked = oldmask; + spin_lock_irqsave(¤t->sigmask_lock, irqflags); + current->blocked = oldset; + 
recalc_sigpending(current); + spin_unlock_irqrestore(¤t->sigmask_lock, irqflags); + return status; } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 8e2d5868c..6e14bb287 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -23,8 +23,6 @@ static int rpc_task_id = 0; #endif -#define _S(signo) (1 << ((signo)-1)) - /* * We give RPC the same get_free_pages priority as NFS */ @@ -410,9 +408,7 @@ __rpc_execute(struct rpc_task *task) * break the loop here, but go around once more. */ if (0 && !RPC_IS_ASYNC(task) && signalled()) { - dprintk("RPC: %4d got signal (map %08lx)\n", - task->tk_pid, - current->signal & ~current->blocked); + dprintk("RPC: %4d got signal\n", task->tk_pid); rpc_exit(task, -ERESTARTSYS); } } @@ -746,7 +742,7 @@ rpciod(void *ptr) exit_files(current); exit_mm(current); - current->blocked |= ~_S(SIGKILL); + siginitsetinv(¤t->blocked, sigmask(SIGKILL)); current->session = 1; current->pgrp = 1; sprintf(current->comm, "rpciod"); @@ -754,13 +750,13 @@ rpciod(void *ptr) dprintk("RPC: rpciod starting (pid %d)\n", rpciod_pid); while (rpciod_users) { if (signalled()) { - if (current->signal & _S(SIGKILL)) { + if (sigismember(¤t->signal, SIGKILL)) { rpciod_killall(); } else { printk("rpciod: ignoring signal (%d users)\n", rpciod_users); } - current->signal &= current->blocked; + flush_signals(current); } __rpc_schedule(); @@ -795,17 +791,32 @@ rpciod(void *ptr) static void rpciod_killall(void) { - while (all_tasks) { - unsigned long oldsig = current->signal; + unsigned long flags; + sigset_t old_set; + + /* FIXME: What had been going on before was saving and restoring + current->signal. This as opposed to blocking signals? Do we + still need them to wake up out of schedule? In any case it + isn't playing nice and a better way should be found. 
*/ - current->signal = 0; + spin_lock_irqsave(¤t->sigmask_lock, flags); + old_set = current->blocked; + sigfillset(¤t->blocked); + recalc_sigpending(current); + spin_unlock_irqrestore(¤t->sigmask_lock, flags); + + while (all_tasks) { rpc_killall_tasks(NULL); __rpc_schedule(); current->timeout = jiffies + HZ / 100; need_resched = 1; schedule(); - current->signal = oldsig; } + + spin_lock_irqsave(¤t->sigmask_lock, flags); + current->blocked = old_set; + recalc_sigpending(current); + spin_unlock_irqrestore(¤t->sigmask_lock, flags); } /* @@ -846,7 +857,7 @@ out: void rpciod_down(void) { - unsigned long oldflags; + unsigned long flags; MOD_INC_USE_COUNT; down(&rpciod_sema); @@ -867,8 +878,7 @@ rpciod_down(void) * Usually rpciod will exit very quickly, so we * wait briefly before checking the process id. */ - oldflags = current->signal; - current->signal = 0; + current->sigpending = 0; current->state = TASK_INTERRUPTIBLE; current->timeout = jiffies + 1; schedule(); @@ -884,7 +894,9 @@ rpciod_down(void) } interruptible_sleep_on(&rpciod_killer); } - current->signal = oldflags; + spin_lock_irqsave(¤t->sigmask_lock, flags); + recalc_sigpending(current); + spin_unlock_irqrestore(¤t->sigmask_lock, flags); out: up(&rpciod_sema); MOD_DEC_USE_COUNT; diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index b5495df93..73f805f40 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -73,10 +73,10 @@ EXPORT_SYMBOL(svc_wake_up); /* RPC statistics */ #ifdef CONFIG_PROC_FS +EXPORT_SYMBOL(rpc_proc_init); EXPORT_SYMBOL(rpc_proc_register); EXPORT_SYMBOL(rpc_register_sysctl); EXPORT_SYMBOL(rpc_proc_unregister); -EXPORT_SYMBOL(rpc_proc_init); EXPORT_SYMBOL(rpc_proc_read); EXPORT_SYMBOL(svc_proc_register); EXPORT_SYMBOL(svc_proc_unregister); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 17376ef76..731072fe5 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -169,7 +169,8 @@ int svc_register(struct svc_serv *serv, int proto, unsigned short port) { 
struct svc_program *progp; - unsigned long oldsigs = 0; + unsigned long flags; + sigset_t old_set; int i, error = 0, dummy; progp = serv->sv_program; @@ -177,9 +178,17 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port) dprintk("RPC: svc_register(%s, %s, %d)\n", progp->pg_name, proto == IPPROTO_UDP? "udp" : "tcp", port); + /* FIXME: What had been going on before was saving and restoring + current->signal. This as opposed to blocking signals? Do we + still need them to wake up out of schedule? In any case it + isn't playing nice and a better way should be found. */ + if (!port) { - oldsigs = current->signal; - current->signal = 0; + spin_lock_irqsave(&current->sigmask_lock, flags); + old_set = current->blocked; + sigfillset(&current->blocked); + recalc_sigpending(current); + spin_unlock_irqrestore(&current->sigmask_lock, flags); } for (i = 0; i < progp->pg_nvers; i++) { @@ -193,7 +202,14 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port) break; } } - current->signal |= oldsigs; + + if (!port) { + spin_lock_irqsave(&current->sigmask_lock, flags); + current->blocked = old_set; + recalc_sigpending(current); + spin_unlock_irqrestore(&current->sigmask_lock, flags); + } + return error; } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index fb6b81db2..2701a8398 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -226,7 +226,7 @@ svc_wake_up(struct svc_serv *serv) static int svc_sendto(struct svc_rqst *rqstp, struct iovec *iov, int nr) { - unsigned long oldfs; + mm_segment_t oldfs; struct socket *sock = rqstp->rq_sock->sk_sock; struct msghdr msg; int i, buflen, len; @@ -268,7 +268,7 @@ svc_sendto(struct svc_rqst *rqstp, struct iovec *iov, int nr) static int svc_recv_available(struct svc_sock *svsk) { - unsigned long oldfs; + mm_segment_t oldfs; struct socket *sock = svsk->sk_sock; int avail, err; @@ -285,7 +285,7 @@ svc_recv_available(struct svc_sock *svsk) static int svc_recvfrom(struct svc_rqst *rqstp, struct iovec *iov, int nr, int buflen) 
{ - unsigned long oldfs; + mm_segment_t oldfs; struct msghdr msg; struct socket *sock; int len; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 15703111d..c76566399 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -141,7 +141,7 @@ xprt_sendmsg(struct rpc_xprt *xprt) { struct socket *sock = xprt->sock; struct msghdr msg; - unsigned long oldfs; + mm_segment_t oldfs; int result; xprt_pktdump("packet data:", @@ -205,7 +205,7 @@ xprt_recvmsg(struct rpc_xprt *xprt, struct iovec *iov, int nr, int len) struct socket *sock = xprt->sock; struct sockaddr_in sin; struct msghdr msg; - unsigned long oldfs; + mm_segment_t oldfs; int result; #if LINUX_VERSION_CODE >= 0x020100 @@ -528,7 +528,7 @@ udp_data_ready(struct sock *sk, int len) struct rpc_rqst *rovr; struct sk_buff *skb; struct iovec iov[MAX_IOVEC]; - unsigned long oldfs; + mm_segment_t oldfs; int err, repsize, copied; dprintk("RPC: udp_data_ready...\n"); diff --git a/net/sysctl_net.c b/net/sysctl_net.c index 1acd01749..5f5e8593e 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -24,7 +24,11 @@ extern ctl_table ipv4_table[]; extern ctl_table ipx_table[]; #endif -extern ctl_table core_table[], unix_table[]; +extern ctl_table core_table[]; + +#ifdef CONFIG_UNIX +extern ctl_table unix_table[]; +#endif #ifdef CONFIG_NET extern ctl_table ether_table[], e802_table[]; @@ -44,7 +48,9 @@ extern ctl_table tr_table[]; ctl_table net_table[] = { {NET_CORE, "core", NULL, 0, 0555, core_table}, +#ifdef CONFIG_UNIX {NET_UNIX, "unix", NULL, 0, 0555, unix_table}, +#endif #ifdef CONFIG_NET {NET_802, "802", NULL, 0, 0555, e802_table}, {NET_ETHER, "ethernet", NULL, 0, 0555, ether_table}, diff --git a/net/unix/Makefile b/net/unix/Makefile index afce06790..f0bebfae3 100644 --- a/net/unix/Makefile +++ b/net/unix/Makefile @@ -9,6 +9,7 @@ O_TARGET := unix.o O_OBJS := af_unix.o garbage.o +M_OBJS := $(O_TARGET) ifeq ($(CONFIG_SYSCTL),y) O_OBJS += sysctl_net_unix.o diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 
936d61220..1a9baa549 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -26,6 +26,7 @@ * Alan Cox : Started POSIXisms * Andreas Schwab : Replace inode by dentry for proper * reference counting + * Kirk Petersen : Made this a module * * Known differences from reference BSD that was tested: * @@ -57,6 +58,7 @@ * with BSD names. */ +#include <linux/module.h> #include <linux/config.h> #include <linux/kernel.h> #include <linux/major.h> @@ -310,6 +312,9 @@ static void unix_destroy_socket(unix_socket *sk) sk->dead=1; unix_delayed_delete(sk); /* Try every so often until buffers are all freed */ } + + /* socket destroyed, decrement count */ + MOD_DEC_USE_COUNT; } static int unix_listen(struct socket *sock, int backlog) @@ -373,6 +378,10 @@ static int unix_create(struct socket *sock, int protocol) sk->mtu=4096; sk->protinfo.af_unix.list=&unix_sockets_unbound; unix_insert_socket(sk); + + /* socket created, increment count */ + MOD_INC_USE_COUNT; + return 0; } @@ -1465,7 +1474,16 @@ struct net_proto_family unix_family_ops = { unix_create }; +#ifdef MODULE +#ifdef CONFIG_SYSCTL +extern void unix_sysctl_register(void); +extern void unix_sysctl_unregister(void); +#endif + +int init_module(void) +#else __initfunc(void unix_proto_init(struct net_proto *pro)) +#endif { struct sk_buff *dummy_skb; struct proc_dir_entry *ent; @@ -1474,14 +1492,37 @@ __initfunc(void unix_proto_init(struct net_proto *pro)) if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)) { printk(KERN_CRIT "unix_proto_init: panic\n"); +#ifdef MODULE + return -1; +#else return; +#endif } sock_register(&unix_family_ops); #ifdef CONFIG_PROC_FS ent = create_proc_entry("net/unix", 0, 0); ent->read_proc = unix_read_proc; #endif + +#ifdef MODULE +#ifdef CONFIG_SYSCTL + unix_sysctl_register(); +#endif + + return 0; +#endif } + +#ifdef MODULE +void cleanup_module(void) +{ + sock_unregister(AF_UNIX); +#ifdef CONFIG_SYSCTL + unix_sysctl_unregister(); +#endif +} +#endif + /* * Local variables: * compile-command: 
"gcc -g -D__KERNEL__ -Wall -O6 -I/usr/src/linux/include -c af_unix.c" diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c index f487ae95a..d492e8e2b 100644 --- a/net/unix/sysctl_net_unix.c +++ b/net/unix/sysctl_net_unix.c @@ -29,4 +29,31 @@ ctl_table unix_table[] = { &proc_dointvec_jiffies}, {0} }; -#endif + +#ifdef MODULE +static struct ctl_table_header * unix_sysctl_header; +static struct ctl_table unix_root_table[]; +static struct ctl_table unix_net_table[]; + +ctl_table unix_root_table[] = { + {CTL_NET, "net", NULL, 0, 0555, unix_net_table}, + {0} +}; + +ctl_table unix_net_table[] = { + {NET_UNIX, "unix", NULL, 0, 0555, unix_table}, + {0} +}; + +void unix_sysctl_register(void) +{ + unix_sysctl_header = register_sysctl_table(unix_root_table, 0); +} + +void unix_sysctl_unregister(void) +{ + unregister_sysctl_table(unix_sysctl_header); +} +#endif /* MODULE */ + +#endif /* CONFIG_SYSCTL */ diff --git a/net/wanrouter/wanproc.c b/net/wanrouter/wanproc.c index 9c3fe9b2a..937c50076 100644 --- a/net/wanrouter/wanproc.c +++ b/net/wanrouter/wanproc.c @@ -57,8 +57,7 @@ typedef struct wan_stat_entry /* Proc filesystem interface */ static int router_proc_perms (struct inode*, int); -static long router_proc_read(struct inode* inode, struct file* file, char* buf, - unsigned long count); +static ssize_t router_proc_read(struct file* file, char* buf, size_t count, loff_t *ppos); /* Methods for preparing data for reading proc entries */ @@ -337,9 +336,10 @@ static int router_proc_perms (struct inode* inode, int op) * <0 error */ -static long router_proc_read(struct inode* inode, struct file* file, - char* buf, unsigned long count) +static ssize_t router_proc_read(struct file* file, char* buf, size_t count, + loff_t *ppos) { + struct inode *inode; struct proc_dir_entry* dent; char* page; int pos, offs, len; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 2970a82b9..bc473e317 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -440,6 +440,7 @@ static 
struct sock *x25_alloc_socket(void) sock_init_data(NULL, sk); + skb_queue_head_init(&x25->ack_queue); skb_queue_head_init(&x25->fragment_queue); skb_queue_head_init(&x25->interrupt_in_queue); skb_queue_head_init(&x25->interrupt_out_queue); diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c index e4cd99ae7..c8ffb33ef 100644 --- a/net/x25/x25_dev.c +++ b/net/x25/x25_dev.c @@ -177,7 +177,6 @@ void x25_establish_link(struct x25_neigh *neigh) } skb->protocol = htons(ETH_P_X25); - skb->priority = SOPRI_NORMAL; skb->dev = neigh->dev; skb->arp = 1; @@ -208,7 +207,6 @@ void x25_terminate_link(struct x25_neigh *neigh) } skb->protocol = htons(ETH_P_X25); - skb->priority = SOPRI_NORMAL; skb->dev = neigh->dev; skb->arp = 1; @@ -236,7 +234,6 @@ void x25_send_frame(struct sk_buff *skb, struct x25_neigh *neigh) } skb->protocol = htons(ETH_P_X25); - skb->priority = SOPRI_NORMAL; skb->dev = neigh->dev; skb->arp = 1; diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c index 96b459a4e..1c4cb3bc7 100644 --- a/net/x25/x25_in.c +++ b/net/x25/x25_in.c @@ -174,6 +174,7 @@ static int x25_state3_machine(struct sock *sk, struct sk_buff *skb, int frametyp sk->protinfo.x25->vr = 0; sk->protinfo.x25->va = 0; sk->protinfo.x25->vl = 0; + x25_requeue_frames(sk); break; case X25_CLEAR_REQUEST: @@ -199,11 +200,9 @@ static int x25_state3_machine(struct sock *sk, struct sk_buff *skb, int frametyp sk->protinfo.x25->vl = 0; sk->protinfo.x25->state = X25_STATE_4; } else { - if (sk->protinfo.x25->condition & X25_COND_PEER_RX_BUSY) { - sk->protinfo.x25->va = nr; - } else { - x25_check_iframes_acked(sk, nr); - } + x25_frames_acked(sk, nr); + if (frametype == X25_RNR) + x25_requeue_frames(sk); } break; @@ -221,11 +220,7 @@ static int x25_state3_machine(struct sock *sk, struct sk_buff *skb, int frametyp sk->protinfo.x25->state = X25_STATE_4; break; } - if (sk->protinfo.x25->condition & X25_COND_PEER_RX_BUSY) { - sk->protinfo.x25->va = nr; - } else { - x25_check_iframes_acked(sk, nr); - } + x25_frames_acked(sk, 
nr); if (sk->protinfo.x25->condition & X25_COND_OWN_RX_BUSY) break; if (ns == sk->protinfo.x25->vr) { @@ -298,6 +293,7 @@ static int x25_state4_machine(struct sock *sk, struct sk_buff *skb, int frametyp sk->protinfo.x25->vs = 0; sk->protinfo.x25->vl = 0; sk->protinfo.x25->state = X25_STATE_3; + x25_requeue_frames(sk); break; case X25_CLEAR_REQUEST: diff --git a/net/x25/x25_out.c b/net/x25/x25_out.c index aa8fc2c1b..5283092a1 100644 --- a/net/x25/x25_out.c +++ b/net/x25/x25_out.c @@ -126,8 +126,8 @@ static void x25_send_iframe(struct sock *sk, struct sk_buff *skb) void x25_kick(struct sock *sk) { - struct sk_buff *skb; - unsigned short end; + struct sk_buff *skb, *skbn; + unsigned short start, end; int modulus; if (sk->protinfo.x25->state != X25_STATE_3) @@ -149,11 +149,15 @@ void x25_kick(struct sock *sk) return; modulus = (sk->protinfo.x25->neighbour->extended) ? X25_EMODULUS : X25_SMODULUS; + + start = (skb_peek(&sk->protinfo.x25->ack_queue) == NULL) ? sk->protinfo.x25->va : sk->protinfo.x25->vs; end = (sk->protinfo.x25->va + sk->protinfo.x25->facilities.winsize_out) % modulus; - if (sk->protinfo.x25->vs == end) + if (start == end) return; + sk->protinfo.x25->vs = start; + /* * Transmit data until either we're out of data to send or * the window is full. @@ -162,13 +166,25 @@ void x25_kick(struct sock *sk) skb = skb_dequeue(&sk->write_queue); do { + if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { + skb_queue_head(&sk->write_queue, skb); + break; + } + + skb_set_owner_w(skbn, sk); + /* - * Transmit the frame. + * Transmit the frame copy. */ - x25_send_iframe(sk, skb); + x25_send_iframe(sk, skbn); sk->protinfo.x25->vs = (sk->protinfo.x25->vs + 1) % modulus; + /* + * Requeue the original data frame. 
+ */ + skb_queue_tail(&sk->protinfo.x25->ack_queue, skb); + } while (sk->protinfo.x25->vs != end && (skb = skb_dequeue(&sk->write_queue)) != NULL); sk->protinfo.x25->vl = sk->protinfo.x25->vr; @@ -195,15 +211,4 @@ void x25_enquiry_response(struct sock *sk) x25_stop_timer(sk); } -void x25_check_iframes_acked(struct sock *sk, unsigned short nr) -{ - if (sk->protinfo.x25->vs == nr) { - sk->protinfo.x25->va = nr; - } else { - if (sk->protinfo.x25->va != nr) { - sk->protinfo.x25->va = nr; - } - } -} - #endif diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c index f2aff6d12..52e5be0cb 100644 --- a/net/x25/x25_subr.c +++ b/net/x25/x25_subr.c @@ -50,6 +50,9 @@ void x25_clear_queues(struct sock *sk) while ((skb = skb_dequeue(&sk->write_queue)) != NULL) kfree_skb(skb, FREE_WRITE); + while ((skb = skb_dequeue(&sk->protinfo.x25->ack_queue)) != NULL) + kfree_skb(skb, FREE_WRITE); + while ((skb = skb_dequeue(&sk->protinfo.x25->interrupt_in_queue)) != NULL) kfree_skb(skb, FREE_READ); @@ -60,6 +63,49 @@ void x25_clear_queues(struct sock *sk) kfree_skb(skb, FREE_READ); } + +/* + * This routine purges the input queue of those frames that have been + * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the + * SDL diagram. +*/ +void x25_frames_acked(struct sock *sk, unsigned short nr) +{ + struct sk_buff *skb; + int modulus; + + modulus = (sk->protinfo.x25->neighbour->extended) ? X25_EMODULUS : X25_SMODULUS; + + /* + * Remove all the ack-ed frames from the ack queue. + */ + if (sk->protinfo.x25->va != nr) { + while (skb_peek(&sk->protinfo.x25->ack_queue) != NULL && sk->protinfo.x25->va != nr) { + skb = skb_dequeue(&sk->protinfo.x25->ack_queue); + kfree_skb(skb, FREE_WRITE); + sk->protinfo.x25->va = (sk->protinfo.x25->va + 1) % modulus; + } + } +} + +void x25_requeue_frames(struct sock *sk) +{ + struct sk_buff *skb, *skb_prev = NULL; + + /* + * Requeue all the un-ack-ed frames on the output queue to be picked + * up by x25_kick. 
This arrangement handles the possibility of an empty + * output queue. + */ + while ((skb = skb_dequeue(&sk->protinfo.x25->ack_queue)) != NULL) { + if (skb_prev == NULL) + skb_queue_head(&sk->write_queue, skb); + else + skb_append(skb_prev, skb); + skb_prev = skb; + } +} + /* * Validate that the value of nr is between va and vs. Return true or * false for testing. |