diff options
author | Ralf Baechle <ralf@linux-mips.org> | 1999-01-04 16:03:48 +0000 |
---|---|---|
committer | Ralf Baechle <ralf@linux-mips.org> | 1999-01-04 16:03:48 +0000 |
commit | 78c388aed2b7184182c08428db1de6c872d815f5 (patch) | |
tree | 4b2003b1b4ceb241a17faa995da8dd1004bb8e45 /net | |
parent | eb7a5bf93aaa4be1d7c6181100ab7639e74d67f7 (diff) |
Merge with Linux 2.1.131 and more MIPS goodies.
(Did I mention that CVS is buggy ...)
Diffstat (limited to 'net')
91 files changed, 4351 insertions, 1791 deletions
diff --git a/net/802/fddi.c b/net/802/fddi.c index 1c9f7e765..afe14c384 100644 --- a/net/802/fddi.c +++ b/net/802/fddi.c @@ -25,6 +25,7 @@ * Alan Cox : New arp/rebuild header */ +#include <linux/config.h> #include <asm/segment.h> #include <asm/system.h> #include <linux/types.h> @@ -98,18 +99,20 @@ int fddi_rebuild_header(struct sk_buff *skb) { struct fddihdr *fddi = (struct fddihdr *)skb->data; - if (fddi->hdr.llc_snap.ethertype != __constant_htons(ETH_P_IP)) +#ifdef CONFIG_INET + if (fddi->hdr.llc_snap.ethertype == __constant_htons(ETH_P_IP)) + /* Try to get ARP to resolve the header and fill destination address */ + return arp_find(fddi->daddr, skb); + else +#endif { printk("%s: Don't know how to resolve type %02X addresses.\n", skb->dev->name, htons(fddi->hdr.llc_snap.ethertype)); return(0); } - - /* Try to get ARP to resolve the header and fill destination address */ - - return arp_find(fddi->daddr, skb); } + /* * Determine the packet's protocol ID and fill in skb fields. * This routine is called before an incoming packet is passed diff --git a/net/802/hippi.c b/net/802/hippi.c index b8890647e..f1dd1dd17 100644 --- a/net/802/hippi.c +++ b/net/802/hippi.c @@ -34,6 +34,7 @@ #include <linux/errno.h> #include <net/arp.h> #include <net/sock.h> +#include <asm/uaccess.h> #include <asm/checksum.h> #include <asm/segment.h> #include <asm/system.h> @@ -80,16 +81,8 @@ int hippi_header(struct sk_buff *skb, struct device *dev, hip->le.dest_addr_type = 2; /* 12 bit SC address */ hip->le.src_addr_type = 2; /* 12 bit SC address */ -#if 1 - if (saddr) - { - printk("existing saddr - this should not happen, configure ARP please!\n"); - memcpy(hip->le.src_switch_addr, saddr + 3, 3); - }else - memcpy(hip->le.src_switch_addr, dev->dev_addr + 3, 3); - + memcpy(hip->le.src_switch_addr, dev->dev_addr + 3, 3); memset(&hip->le.reserved, 0, 16); -#endif hip->snap.dsap = HIPPI_EXTENDED_SAP; hip->snap.ssap = HIPPI_EXTENDED_SAP; @@ -103,7 +96,7 @@ int hippi_header(struct sk_buff *skb, struct device *dev, { memcpy(hip->le.dest_switch_addr, daddr + 3, 3); memcpy(&skb->private.ifield, daddr + 2, 4); - return(HIPPI_HLEN); + return HIPPI_HLEN; } return -HIPPI_HLEN; } diff --git a/net/Config.in b/net/Config.in index b64570308..43e0e1705 100644 --- a/net/Config.in +++ b/net/Config.in @@ -18,8 +18,8 @@ if [ "$CONFIG_INET" = "y" ]; then source net/ipv4/Config.in if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then # Sorry, but IPv6 as module is still invalid. -# tristate 'The IPv6 protocol (EXPERIMENTAL)' CONFIG_IPV6 - bool 'The IPv6 protocol (EXPERIMENTAL)' CONFIG_IPV6 + tristate 'The IPv6 protocol (EXPERIMENTAL)' CONFIG_IPV6 +# bool 'The IPv6 protocol (EXPERIMENTAL)' CONFIG_IPV6 if [ "$CONFIG_IPV6" != "n" ]; then source net/ipv6/Config.in fi diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index 89ce0b56d..74540951f 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -23,6 +23,8 @@ * Inside AppleTalk (2nd Ed). * Fixes: * Jaume Grau - flush caches on AARP_PROBE + * Rob Newberry - Added proxy AARP and AARP proc fs, + * moved probing from DDP module. * */ @@ -53,6 +55,7 @@ #include <net/psnap.h> #include <linux/atalk.h> #include <linux/init.h> +#include <linux/proc_fs.h> int sysctl_aarp_expiry_time = AARP_EXPIRY_TIME; @@ -69,6 +72,7 @@ struct aarp_entry /* These first two are only used for unresolved entries */ unsigned long last_sent; /* Last time we xmitted the aarp request */ struct sk_buff_head packet_queue; /* Queue of frames wait for resolution */ + int status; /* Used for proxy AARP */ unsigned long expires_at; /* Entry expiry time */ struct at_addr target_addr; /* DDP Address */ struct device *dev; /* Device to use */ @@ -77,12 +81,11 @@ struct aarp_entry struct aarp_entry *next; /* Next entry in chain */ }; - /* - * Hashed list of resolved and unresolved entries + * Hashed list of resolved, unresolved and proxy entries */ -static struct aarp_entry *resolved[AARP_HASH_SIZE], *unresolved[AARP_HASH_SIZE]; +static struct aarp_entry *resolved[AARP_HASH_SIZE], *unresolved[AARP_HASH_SIZE], *proxies[AARP_HASH_SIZE]; static int unresolved_count=0; /* @@ -224,7 +227,7 @@ static void aarp_send_reply(struct device *dev, struct at_addr *us, struct at_ad } /* - * Send probe frames. Called from atif_probe_device. + * Send probe frames. Called from aarp_probe_network and aarp_proxy_probe_network. */ void aarp_send_probe(struct device *dev, struct at_addr *us) @@ -360,6 +363,7 @@ static void aarp_expire_timeout(unsigned long unused) aarp_expire_timer(&resolved[ct]); aarp_kick(&unresolved[ct]); aarp_expire_timer(&unresolved[ct]); + aarp_expire_timer(&proxies[ct]); } del_timer(&aarp_timer); if(unresolved_count==0) @@ -382,6 +386,7 @@ static int aarp_device_event(struct notifier_block *this, unsigned long event, v { aarp_expire_device(&resolved[ct],ptr); aarp_expire_device(&unresolved[ct],ptr); + aarp_expire_device(&proxies[ct],ptr); } } return NOTIFY_DONE; @@ -420,6 +425,156 @@ static struct aarp_entry *aarp_find_entry(struct aarp_entry *list, struct device return list; } +void aarp_proxy_remove(struct device *dev, struct at_addr *sa) +{ + struct aarp_entry *a; + int hash; + + hash = sa->s_node % (AARP_HASH_SIZE-1); + a = aarp_find_entry(proxies[hash], dev, sa); + if (a) + { + a->expires_at = 0; + + } +} + +struct at_addr* aarp_proxy_find(struct device *dev, struct at_addr *sa) +{ + struct aarp_entry *a; + int hash; + + hash = sa->s_node % (AARP_HASH_SIZE-1); + a = aarp_find_entry(proxies[hash], dev, sa); + if (a != NULL) + return sa; + + return NULL; +} + + +/* + * Probe a Phase 1 device or a device that requires its Net:Node to + * be set via an ioctl. + */ +void aarp_send_probe_phase1(struct atalk_iface *iface) +{ + struct ifreq atreq; + struct sockaddr_at *sa = (struct sockaddr_at *)&atreq.ifr_addr; + + sa->sat_addr.s_node = iface->address.s_node; + sa->sat_addr.s_net = ntohs(iface->address.s_net); + + /* We pass the Net:Node to the drivers/cards by a Device ioctl. */ + if(!(iface->dev->do_ioctl(iface->dev, &atreq, SIOCSIFADDR))) + { + (void)iface->dev->do_ioctl(iface->dev, &atreq, SIOCGIFADDR); + if((iface->address.s_net != htons(sa->sat_addr.s_net)) + || (iface->address.s_node != sa->sat_addr.s_node)) + iface->status |= ATIF_PROBE_FAIL; + + iface->address.s_net = htons(sa->sat_addr.s_net); + iface->address.s_node = sa->sat_addr.s_node; + } + + return; +} + + +void aarp_probe_network(struct atalk_iface *atif) +{ + if(atif->dev->type == ARPHRD_LOCALTLK || atif->dev->type == ARPHRD_PPP) + aarp_send_probe_phase1(atif); + else + { + unsigned int count; + for (count = 0; count < AARP_RETRANSMIT_LIMIT; count++) + { + aarp_send_probe(atif->dev, &atif->address); + + /* + * Defer 1/10th + */ + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ/10); + + if (atif->status & ATIF_PROBE_FAIL) + break; + } + } +} + +int aarp_proxy_probe_network(struct atalk_iface *atif, struct at_addr *sa) +{ + struct aarp_entry *entry; + unsigned int count; + int hash; + + /* + * we don't currently support LocalTalk or PPP for proxy AARP; + * if someone wants to try and add it, have fun + */ + if (atif->dev->type == ARPHRD_LOCALTLK) + return (-EPROTONOSUPPORT); + + if (atif->dev->type == ARPHRD_PPP) + return (-EPROTONOSUPPORT); + + /* + * create a new AARP entry with the flags set to be published -- + * we need this one to hang around even if it's in use + */ + entry = aarp_alloc(); + if (entry == NULL) + return (-ENOMEM); + + entry->expires_at = -1; + entry->status = ATIF_PROBE; + entry->target_addr.s_node = sa->s_node; + entry->target_addr.s_net = sa->s_net; + entry->dev = atif->dev; + + hash = sa->s_node % (AARP_HASH_SIZE-1); + entry->next = proxies[hash]; + proxies[hash] = entry; + + for(count = 0; count < AARP_RETRANSMIT_LIMIT; count++) + { + aarp_send_probe(atif->dev, sa); + + /* + * Defer 1/10th + */ + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ/10); + + if (entry->status & ATIF_PROBE_FAIL) + break; + } + + /* + * FIX ME: I think we need exclusive access to the status flags, + * in case some one fails the probe while we're removing + * the probe flag. + */ + if (entry->status & ATIF_PROBE_FAIL) + { + /* free the entry */ + entry->expires_at = 0; + + /* return network full */ + return (-EADDRINUSE); + } + else + { + /* clear the probing flag */ + entry->status &= ~ATIF_PROBE; + } + + return 1; +} + + /* * Send a DDP frame */ @@ -654,7 +809,7 @@ static int aarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type { struct elapaarp *ea=(struct elapaarp *)skb->h.raw; struct aarp_entry *a; - struct at_addr sa, *ma; + struct at_addr sa, *ma, da; unsigned long flags; int hash; struct atalk_iface *ifa; @@ -736,8 +891,34 @@ static int aarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type kfree_skb(skb); return 1; } - } + } + /* + * Check for replies of proxy AARP entries + */ + + /* + * FIX ME: do we need a cli() here? + * aarp_find_entry does one on its own, between saving and restoring flags, so + * I don't think it is necessary, but I could be wrong -- it's happened before + */ + da.s_node = ea->pa_dst_node; + da.s_net = ea->pa_dst_net; + a = aarp_find_entry(proxies[hash], dev, &da); + if (a != NULL) + if (a->status & ATIF_PROBE) + { + a->status |= ATIF_PROBE_FAIL; + + /* + * we do not respond to probe or request packets for + * this address while we are probing this address + */ + restore_flags(flags); + kfree_skb(skb); + return 1; + } + switch(ea->function) { case AARP_REPLY: @@ -747,7 +928,7 @@ static int aarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type * Find the entry */ - cli(); + cli(); /* FIX ME: is this cli() necessary? aarp_find_entry does one on its own... */ if((a=aarp_find_entry(unresolved[hash],dev,&sa))==NULL || dev != a->dev) break; /* @@ -770,12 +951,32 @@ static int aarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type * If it is my address set ma to my address and reply. We can treat probe and * request the same. Probe simply means we shouldn't cache the querying host, * as in a probe they are proposing an address not using one. + * + * Support for proxy-AARP added. We check if the address is one + * of our proxies before we toss the packet out. */ - ma=&ifa->address; sa.s_node=ea->pa_dst_node; sa.s_net=ea->pa_dst_net; - + + /* + * see if we have a matching proxy + */ + ma = aarp_proxy_find(dev, &sa); + if (!ma) + { + ma=&ifa->address; + } + else + { + /* + * we need to make a copy of the entry + */ + da.s_node = sa.s_node; + da.s_net = da.s_net; + ma = &da; + } + if(ea->function==AARP_PROBE) { /* A probe implies someone trying to get an @@ -845,11 +1046,104 @@ void aarp_device_down(struct device *dev) { aarp_expire_device(&resolved[ct], dev); aarp_expire_device(&unresolved[ct], dev); + aarp_expire_device(&proxies[ct], dev); } return; } +/* + * Called from proc fs + */ +int aarp_get_info(char *buffer, char **start, off_t offset, int length, int dummy) +{ + /* we should dump all our AARP entries */ + struct aarp_entry *entry; + int len, ct; + + len = sprintf(buffer, + "%-10.10s ""%-10.10s""%-18.18s""%12.12s""%12.12s"" xmit_count status\n", + "address","device","hw addr","last_sent", "expires"); + for (ct = 0; ct < AARP_HASH_SIZE; ct++) + { + for (entry = resolved[ct]; entry; entry = entry->next) + { + len+= sprintf(buffer+len,"%6u:%-3u ", + (unsigned int)ntohs(entry->target_addr.s_net), + (unsigned int)(entry->target_addr.s_node)); + len+= sprintf(buffer+len,"%-10.10s", + entry->dev->name); + len+= sprintf(buffer+len,"%2.2X:%2.2X:%2.2X:%2.2X:%2.2X:%2.2X", + (int)(entry->hwaddr[0] & 0x000000FF), + (int)(entry->hwaddr[1] & 0x000000FF), + (int)(entry->hwaddr[2] & 0x000000FF), + (int)(entry->hwaddr[3] & 0x000000FF), + (int)(entry->hwaddr[4] & 0x000000FF), + (int)(entry->hwaddr[5] & 0x000000FF)); + len+= sprintf(buffer+len,"%12lu ""%12lu ", + (unsigned long)entry->last_sent, + (unsigned long)entry->expires_at); + len+=sprintf(buffer+len,"%10u", + (unsigned int)entry->xmit_count); + + len+=sprintf(buffer+len," resolved\n"); + } + } + + for (ct = 0; ct < AARP_HASH_SIZE; ct++) + { + for (entry = unresolved[ct]; entry; entry = entry->next) + { + len+= sprintf(buffer+len,"%6u:%-3u ", + (unsigned int)ntohs(entry->target_addr.s_net), + (unsigned int)(entry->target_addr.s_node)); + len+= sprintf(buffer+len,"%-10.10s", + entry->dev->name); + len+= sprintf(buffer+len,"%2.2X:%2.2X:%2.2X:%2.2X:%2.2X:%2.2X", + (int)(entry->hwaddr[0] & 0x000000FF), + (int)(entry->hwaddr[1] & 0x000000FF), + (int)(entry->hwaddr[2] & 0x000000FF), + (int)(entry->hwaddr[3] & 0x000000FF), + (int)(entry->hwaddr[4] & 0x000000FF), + (int)(entry->hwaddr[5] & 0x000000FF)); + len+= sprintf(buffer+len,"%12lu ""%12lu ", + (unsigned long)entry->last_sent, + (unsigned long)entry->expires_at); + len+=sprintf(buffer+len,"%10u", + (unsigned int)entry->xmit_count); + len+=sprintf(buffer+len," unresolved\n"); + } + } + + for (ct = 0; ct < AARP_HASH_SIZE; ct++) + { + for (entry = proxies[ct]; entry; entry = entry->next) + { + len+= sprintf(buffer+len,"%6u:%-3u ", + (unsigned int)ntohs(entry->target_addr.s_net), + (unsigned int)(entry->target_addr.s_node)); + len+= sprintf(buffer+len,"%-10.10s", + entry->dev->name); + len+= sprintf(buffer+len,"%2.2X:%2.2X:%2.2X:%2.2X:%2.2X:%2.2X", + (int)(entry->hwaddr[0] & 0x000000FF), + (int)(entry->hwaddr[1] & 0x000000FF), + (int)(entry->hwaddr[2] & 0x000000FF), + (int)(entry->hwaddr[3] & 0x000000FF), + (int)(entry->hwaddr[4] & 0x000000FF), + (int)(entry->hwaddr[5] & 0x000000FF)); + len+= sprintf(buffer+len,"%12lu ""%12lu ", + (unsigned long)entry->last_sent, + (unsigned long)entry->expires_at); + len+=sprintf(buffer+len,"%10u", + (unsigned int)entry->xmit_count); + len+=sprintf(buffer+len," proxy\n"); + } + } + + + return len; +} + #ifdef MODULE /* * General module cleanup. Called from cleanup_module() in ddp.c. @@ -862,4 +1156,27 @@ void aarp_cleanup_module(void) } #endif /* MODULE */ + +#ifdef CONFIG_PROC_FS + +static struct proc_dir_entry proc_aarp_entries= +{ + PROC_NET_AT_AARP, 4, "aarp", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + aarp_get_info +}; + +void aarp_register_proc_fs(void) +{ + proc_net_register(&proc_aarp_entries); +} + +void aarp_unregister_proc_fs(void) +{ + proc_net_unregister(PROC_NET_AT_AARP); +} + +#endif + #endif /* CONFIG_ATALK || CONFIG_ATALK_MODULE */ diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index bcfe9e4de..c79fc6874 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -29,6 +29,8 @@ * driver file. (ipddp.c & ipddp.h) * Jay Schulist : Made work as module with * AppleTalk drivers, cleaned it. + * Rob Newberry : Added proxy AARP and AARP proc fs, + * moved probing to AARP module. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -76,6 +78,16 @@ #include <linux/init.h> +#ifdef CONFIG_PROC_FS +extern void aarp_register_proc_fs(void); +extern void aarp_unregister_proc_fs(void); +#endif + +extern void aarp_probe_network(struct atalk_iface *atif); +extern int aarp_proxy_probe_network(struct atalk_iface *atif, struct at_addr *sa); +extern void aarp_proxy_remove(struct device *dev, struct at_addr *sa); + + #undef APPLETALK_DEBUG #ifdef APPLETALK_DEBUG @@ -301,32 +313,6 @@ static struct atalk_iface *atif_add_device(struct device *dev, struct at_addr *s return (iface); } -/* - * Probe a Phase 1 device or a device that requires its Net:Node to - * be set via an ioctl. - */ -void atif_send_probe_phase1(struct atalk_iface *iface) -{ - struct ifreq atreq; - struct sockaddr_at *sa = (struct sockaddr_at *)&atreq.ifr_addr; - - sa->sat_addr.s_node = iface->address.s_node; - sa->sat_addr.s_net = ntohs(iface->address.s_net); - - /* We pass the Net:Node to the drivers/cards by a Device ioctl. */ - if(!(iface->dev->do_ioctl(iface->dev, &atreq, SIOCSIFADDR))) - { - (void)iface->dev->do_ioctl(iface->dev, &atreq, SIOCGIFADDR); - if((iface->address.s_net != htons(sa->sat_addr.s_net)) - || (iface->address.s_node != sa->sat_addr.s_node)) - iface->status |= ATIF_PROBE_FAIL; - - iface->address.s_net = htons(sa->sat_addr.s_net); - iface->address.s_node = sa->sat_addr.s_node; - } - - return; -} /* * Perform phase 2 AARP probing on our tentative address. @@ -336,7 +322,7 @@ static int atif_probe_device(struct atalk_iface *atif) int netrange=ntohs(atif->nets.nr_lastnet)-ntohs(atif->nets.nr_firstnet)+1; int probe_net=ntohs(atif->address.s_net); int probe_node=atif->address.s_node; - int ct, netct, nodect; + int netct, nodect; /* * Offset the network we start probing with. @@ -372,24 +358,8 @@ static int atif_probe_device(struct atalk_iface *atif) /* * Probe a proposed address. */ + aarp_probe_network(atif); - if(atif->dev->type == ARPHRD_LOCALTLK || atif->dev->type == ARPHRD_PPP) - atif_send_probe_phase1(atif); - else - { - for(ct = 0; ct < AARP_RETRANSMIT_LIMIT; ct++) - { - aarp_send_probe(atif->dev, &atif->address); - /* - * Defer 1/10th - */ - current->timeout = jiffies + (HZ/10); - current->state = TASK_INTERRUPTIBLE; - schedule(); - if(atif->status & ATIF_PROBE_FAIL) - break; - } - } if(!(atif->status & ATIF_PROBE_FAIL)) return (0); } @@ -403,6 +373,69 @@ static int atif_probe_device(struct atalk_iface *atif) return (-EADDRINUSE); /* Network is full... */ } + +/* + * Perform AARP probing for a proxy address + */ +static int atif_proxy_probe_device(struct atalk_iface *atif, struct at_addr* proxy_addr) +{ + int netrange=ntohs(atif->nets.nr_lastnet)-ntohs(atif->nets.nr_firstnet)+1; + int probe_net=ntohs(atif->address.s_net); // we probe the interface's network + int probe_node=ATADDR_ANYNODE; // we'll take anything + int netct, nodect; + + /* + * Offset the network we start probing with. + */ + + if(probe_net == ATADDR_ANYNET) + { + if(!netrange) + probe_net = ntohs(atif->nets.nr_firstnet); + else + probe_net = ntohs(atif->nets.nr_firstnet) + (jiffies%netrange); + } + + if(probe_node == ATADDR_ANYNODE) + probe_node = jiffies&0xFF; + + /* + * Scan the networks. + */ + + for(netct = 0; netct <= netrange; netct++) + { + /* + * Sweep the available nodes from a given start. + */ + + proxy_addr->s_net = htons(probe_net); + for(nodect = 0; nodect < 256; nodect++) + { + proxy_addr->s_node = ((nodect+probe_node) & 0xFF); + if((proxy_addr->s_node>0) && (proxy_addr->s_node<254)) + { + /* + * Tell AARP to probe a proposed address. + */ + int probe_result = aarp_proxy_probe_network(atif, proxy_addr); + + if (probe_result == 0) + return 0; + + if (probe_result != -EADDRINUSE) + return probe_result; + } + } + probe_net++; + if(probe_net > ntohs(atif->nets.nr_lastnet)) + probe_net = ntohs(atif->nets.nr_firstnet); + } + + return (-EADDRINUSE); /* Network is full... */ +} + + struct at_addr *atalk_find_dev_addr(struct device *dev) { struct atalk_iface *iface=dev->atalk_ptr; @@ -482,19 +515,46 @@ static struct atalk_iface *atalk_find_interface(int net, int node) */ static struct atalk_route *atrtr_find(struct at_addr *target) { + /* + * we must search through all routes unless we find a + * host route, because some host routes might overlap + * network routes + */ struct atalk_route *r; - + struct atalk_route *net_route = NULL; + for(r=atalk_router_list; r != NULL; r=r->next) { if(!(r->flags & RTF_UP)) continue; if(r->target.s_net == target->s_net) { - if(!(r->flags&RTF_HOST) - || r->target.s_node == target->s_node) - return (r); + if (r->flags & RTF_HOST) + { + /* + * if this host route is for the target, + * the we're done + */ + if (r->target.s_node == target->s_node) + return (r); + } + else + { + /* + * this route will work if there isn't a + * direct host route, so cache it + */ + net_route = r; + } } } + + /* + * if we found a network route but not a direct host + * route, then return it + */ + if (net_route != NULL) + return (net_route); if(atrtr_default.dev) return (&atrtr_default); @@ -706,6 +766,7 @@ int atif_ioctl(int cmd, void *arg) int ct; int limit; struct rtentry rtdef; + int add_route; if(copy_from_user(&atreq,arg,sizeof(atreq))) return (-EFAULT); @@ -731,6 +792,18 @@ int atif_ioctl(int cmd, void *arg) nr=(struct netrange *)&sa->sat_zero[0]; + add_route = 1; + + /* + * if this is a point-to-point iface, and we already have an + * iface for this AppleTalk address, then we should not add a route + */ + if (dev->flags & IFF_POINTOPOINT && atalk_find_interface(sa->sat_addr.s_net, sa->sat_addr.s_node)) + { + printk(KERN_DEBUG "AppleTalk: point-to-point interface added with existing address\n"); + add_route = 0; + } + /* * Phase 1 is fine on LocalTalk but we don't do * EtherTalk phase 1. Anyone wanting to add it go ahead. @@ -765,7 +838,7 @@ int atif_ioctl(int cmd, void *arg) * error and atalkd will try another. */ - if(!(dev->flags & IFF_LOOPBACK) && atif_probe_device(atif) < 0) + if(!(dev->flags & IFF_LOOPBACK) && !(dev->flags & IFF_POINTOPOINT) && atif_probe_device(atif) < 0) { atif_drop_device(dev); return (-EADDRINUSE); @@ -783,7 +856,7 @@ int atif_ioctl(int cmd, void *arg) rtdef.rt_flags = RTF_UP; sa->sat_family = AF_APPLETALK; sa->sat_addr.s_node = ATADDR_ANYNODE; - if(dev->flags & IFF_LOOPBACK) + if((dev->flags & IFF_LOOPBACK) || (dev->flags & IFF_POINTOPOINT)) rtdef.rt_flags |= RTF_HOST; /* @@ -804,11 +877,12 @@ int atif_ioctl(int cmd, void *arg) printk(KERN_WARNING "Too many routes/iface.\n"); return (-EINVAL); } - for(ct=ntohs(nr->nr_firstnet);ct<=limit;ct++) - { - sa->sat_addr.s_net = htons(ct); - atrtr_create(&rtdef, dev); - } + if (add_route) + for(ct=ntohs(nr->nr_firstnet);ct<=limit;ct++) + { + sa->sat_addr.s_net = htons(ct); + atrtr_create(&rtdef, dev); + } } dev_mc_add(dev, aarp_mcast, 6, 1); return (0); @@ -836,6 +910,68 @@ int atif_ioctl(int cmd, void *arg) return (-EINVAL); atalk_dev_down(dev); break; + + case SIOCSARP: + if(!suser()) + return (-EPERM); + if(sa->sat_family != AF_APPLETALK) + return (-EINVAL); + + /* + * for now, we only support proxy AARP on ELAP; + * we should be able to do it for LocalTalk, too. + */ + if(dev->type != ARPHRD_ETHER) + return (-EPROTONOSUPPORT); + + /* + * atif points to the current interface on this network; + * we aren't concerned about its current status (at least for now), + * but it has all the settings about the network we're going + * to probe. consequently, it must exist. + */ + if (!atif) + return (-EADDRNOTAVAIL); + + nr=(struct netrange *)&(atif->nets); + /* + * Phase 1 is fine on Localtalk but we don't do + * Ethertalk phase 1. Anyone wanting to add it go ahead. + */ + if(dev->type == ARPHRD_ETHER && nr->nr_phase != 2) + return (-EPROTONOSUPPORT); + + if(sa->sat_addr.s_node == ATADDR_BCAST + || sa->sat_addr.s_node == 254) + return (-EINVAL); + + /* + * Check if the chosen address is used. If so we + * error and ATCP will try another. + */ + if (atif_proxy_probe_device(atif, &(sa->sat_addr)) < 0) + return (-EADDRINUSE); + + /* + * We now have an address on the local network, and the AARP + * code will defend it for us until we take it down. + * We don't set up any routes right now, because ATCP will + * install them manually via SIOCADDRT. + */ + break; + + case SIOCDARP: + if(!suser()) + return (-EPERM); + if(sa->sat_family != AF_APPLETALK) + return (-EINVAL); + + /* + * give to aarp module to remove proxy entry + */ + aarp_proxy_remove(atif->dev, &(sa->sat_addr)); + + return (0); } if(copy_to_user(arg, &atreq, sizeof(atreq))) @@ -850,6 +986,7 @@ int atif_ioctl(int cmd, void *arg) static int atrtr_ioctl(unsigned int cmd, void *arg) { struct rtentry rt; + struct device *dev = NULL; if(copy_from_user(&rt, arg, sizeof(rt))) return (-EFAULT); @@ -862,7 +999,12 @@ static int atrtr_ioctl(unsigned int cmd, void *arg) return (atrtr_delete(&((struct sockaddr_at *)&rt.rt_dst)->sat_addr)); case SIOCADDRT: - return (atrtr_create(&rt, NULL)); + /* FIX ME: the name of the device is still in user space, isn't it? */ + if (rt.rt_dev != NULL) + if ((dev = dev_get(rt.rt_dev)) == NULL) + return -(ENODEV); + + return (atrtr_create(&rt, dev)); default: return (-EINVAL); @@ -1005,7 +1147,15 @@ static int atalk_create(struct socket *sock, int protocol) case SOCK_DGRAM: sock->ops = &atalk_dgram_ops; break; - + + case SOCK_STREAM: + /* + * TO DO: if you want to implement ADSP, here's the place to start + */ + /* + sock->ops = &atalk_stream_ops; + break; + */ default: sk_free((void *)sk); return (-ESOCKTNOSUPPORT); @@ -1331,6 +1481,13 @@ static int atalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type */ if (skb->pkt_type != PACKET_HOST || ddp->deh_dnet == 0) { + /* + * FIX ME: + * Can it ever happen that a packet is from a PPP iface and needs to be broadcast onto the default network? + */ + if (dev->type == ARPHRD_PPP) + printk(KERN_DEBUG "AppleTalk: didn't forward broadcast packet received from PPP iface\n"); + kfree_skb(skb); return (0); } @@ -1822,6 +1979,8 @@ static int atalk_ioctl(struct socket *sock,unsigned int cmd, unsigned long arg) case SIOCGIFBRDADDR: case SIOCATALKDIFADDR: case SIOCDIFADDR: + case SIOCSARP: /* proxy AARP */ + case SIOCDARP: /* proxy AARP */ return (atif_ioctl(cmd,(void *)arg)); /* @@ -1967,6 +2126,8 @@ __initfunc(void atalk_proto_init(struct net_proto *pro)) proc_net_register(&proc_appletalk); proc_net_register(&proc_atalk_route); proc_net_register(&proc_atalk_iface); + + aarp_register_proc_fs(); #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SYSCTL @@ -2007,6 +2168,8 @@ void cleanup_module(void) proc_net_unregister(PROC_NET_ATALK); proc_net_unregister(PROC_NET_AT_ROUTE); proc_net_unregister(PROC_NET_ATIF); + + aarp_unregister_proc_fs(); #endif /* CONFIG_PROC_FS */ aarp_cleanup_module(); /* General aarp clean-up. */ diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index cd84989a6..77cb218d3 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1290,10 +1290,12 @@ static int ax25_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_ fsa.fsa_ax25.sax25_call = sk->protinfo.ax25->dest_addr; fsa.fsa_ax25.sax25_ndigis = 0; - ndigi = sk->protinfo.ax25->digipeat->ndigi; - fsa.fsa_ax25.sax25_ndigis = ndigi; - for (i = 0; i < ndigi; i++) - fsa.fsa_digipeater[i] = sk->protinfo.ax25->digipeat->calls[i]; + if (sk->protinfo.ax25->digipeat != NULL) { + ndigi = sk->protinfo.ax25->digipeat->ndigi; + fsa.fsa_ax25.sax25_ndigis = ndigi; + for (i = 0; i < ndigi; i++) + fsa.fsa_digipeater[i] = sk->protinfo.ax25->digipeat->calls[i]; + } } else { fsa.fsa_ax25.sax25_family = AF_AX25; fsa.fsa_ax25.sax25_call = sk->protinfo.ax25->source_addr; diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c index b5d5f071e..1382132ed 100644 --- a/net/ax25/ax25_ip.c +++ b/net/ax25/ax25_ip.c @@ -195,6 +195,18 @@ int ax25_rebuild_header(struct sk_buff *skb) return 1; } +#else /* INET */ + +int ax25_encapsulate(struct sk_buff *skb, struct device *dev, unsigned short type, void *daddr, void *saddr, unsigned len) +{ + return -AX25_HEADER_LEN; +} + +int ax25_rebuild_header(struct sk_buff *skb) +{ + return 1; +} + #endif #endif diff --git a/net/core/datagram.c b/net/core/datagram.c index f064370d4..da09973cd 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -103,6 +103,11 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, int error; struct sk_buff *skb; + /* Caller is allowed not to check sk->err before skb_recv_datagram() */ + error = sock_error(sk); + if (error) + goto no_packet; + restart: while(skb_queue_empty(&sk->receive_queue)) /* No data */ { @@ -216,11 +221,11 @@ unsigned int datagram_poll(struct file * file, struct socket *sock, poll_table * mask = 0; /* exceptional events? */ - if (sk->err) + if (sk->err || !skb_queue_empty(&sk->error_queue)) mask |= POLLERR; if (sk->shutdown & RCV_SHUTDOWN) mask |= POLLHUP; - + /* readable? */ if (!skb_queue_empty(&sk->receive_queue)) mask |= POLLIN | POLLRDNORM; @@ -237,6 +242,8 @@ unsigned int datagram_poll(struct file * file, struct socket *sock, poll_table * /* writable? */ if (sock_writeable(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + else + sk->socket->flags |= SO_NOSPACE; return mask; } diff --git a/net/core/dev.c b/net/core/dev.c index 045fd0f92..b4e8d140b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1375,7 +1375,7 @@ int dev_change_flags(struct device *dev, unsigned flags) */ dev->flags = (flags & (IFF_DEBUG|IFF_NOTRAILERS|IFF_RUNNING|IFF_NOARP| - IFF_NODYNARP|IFF_SLAVE|IFF_MASTER| + IFF_SLAVE|IFF_MASTER|IFF_DYNAMIC| IFF_MULTICAST|IFF_PORTSEL|IFF_AUTOMEDIA)) | (dev->flags & (IFF_UP|IFF_VOLATILE|IFF_PROMISC|IFF_ALLMULTI)); @@ -1730,7 +1730,6 @@ static int dev_boot_phase = 1; int register_netdevice(struct device *dev) { struct device *d, **dp; -printk("register_netdevice #1\n"); if (dev_boot_phase) { /* This is NOT bug, but I am not sure, that all the @@ -1755,32 +1754,27 @@ printk("register_netdevice #1\n"); *dp = dev; return 0; } -printk("register_netdevice #2\n"); dev->iflink = -1; /* Init, if this function is available */ if (dev->init && dev->init(dev) != 0) return -EIO; -printk("register_netdevice #3\n"); /* Check for existence, and append to tail of chain */ for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) { if (d == dev || strcmp(d->name, dev->name) == 0) return -EEXIST; } -printk("register_netdevice #4\n"); dev->next = NULL; dev_init_scheduler(dev); dev->ifindex = dev_new_index(); if (dev->iflink == -1) dev->iflink = dev->ifindex; *dp = dev; -printk("register_netdevice #5\n"); /* Notify protocols, that a new device appeared. */ notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev); -printk("register_netdevice #6\n"); return 0; } diff --git a/net/core/iovec.c b/net/core/iovec.c index b8960ecf7..8919fc5c1 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -95,6 +95,30 @@ out: } /* + * In kernel copy to iovec. Returns -EFAULT on error. + * + * Note: this modifies the original iovec. + */ + +void memcpy_tokerneliovec(struct iovec *iov, unsigned char *kdata, int len) +{ + while(len>0) + { + if(iov->iov_len) + { + int copy = min(iov->iov_len, len); + memcpy(iov->iov_base, kdata, copy); + kdata+=copy; + len-=copy; + iov->iov_len-=copy; + iov->iov_base+=copy; + } + iov++; + } +} + + +/* * Copy iovec to kernel. Returns -EFAULT on error. * * Note: this modifies the original iovec. diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ead3b77ff..637322f65 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -592,7 +592,7 @@ static __inline__ void neigh_update_hhs(struct neighbour *neigh) -- lladdr is new lladdr or NULL, if it is not supplied. -- new is new state. -- override==1 allows to override existing lladdr, if it is different. - -- arp==0 means that that the change is administrative. + -- arp==0 means that the change is administrative. */ int neigh_update(struct neighbour *neigh, u8 *lladdr, u8 new, int override, int arp) diff --git a/net/core/scm.c b/net/core/scm.c index e16c4a45f..c28da7ebb 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -103,7 +103,6 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) void __scm_destroy(struct scm_cookie *scm) { struct scm_fp_list *fpl = scm->fp; - struct file *file; int i; if (fpl) { @@ -112,34 +111,18 @@ void __scm_destroy(struct scm_cookie *scm) fput(fpl->fp[i]); kfree(fpl); } - - file = scm->file; - if (file) { - scm->sock = NULL; - scm->file = NULL; - fput(file); - } } - - -extern __inline__ int not_one_bit(unsigned val) -{ - return (val-1) & val; -} - - int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) { struct cmsghdr *cmsg; - struct file *file; - int acc_fd, err; - unsigned int scm_flags=0; + int err; for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { err = -EINVAL; + /* Verify that cmsg_len is at least sizeof(struct cmsghdr) */ if ((unsigned long)(((char*)cmsg - (char*)msg->msg_control) + cmsg->cmsg_len) > msg->msg_controllen) goto error; @@ -162,30 +145,6 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) if (err) goto error; break; - case SCM_CONNECT: - if (scm_flags) - goto error; - if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) - goto error; - memcpy(&acc_fd, CMSG_DATA(cmsg), sizeof(int)); - p->sock = NULL; - if (acc_fd != -1) { - err = -EBADF; - file = fget(acc_fd); - if (!file) - goto error; - p->file = file; - err = -ENOTSOCK; - if (!file->f_dentry->d_inode || - !file->f_dentry->d_inode->i_sock) - goto error; - p->sock = &file->f_dentry->d_inode->u.socket_i; - err = -EINVAL; - if (p->sock->state != SS_UNCONNECTED) - goto error; - } - scm_flags |= MSG_SYN; - break; default: goto error; } @@ -196,16 +155,13 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) kfree(p->fp); p->fp = NULL; } - - err = -EINVAL; - msg->msg_flags |= scm_flags; - scm_flags = msg->msg_flags&MSG_CTLFLAGS; - if (not_one_bit(scm_flags)) - goto error; - if (!(scm_flags && p->fp)) - return 0; + err = -EINVAL; + if (msg->msg_flags & MSG_CTLFLAGS) + goto error; + return 0; + error: scm_destroy(p); return err; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fb13b5e16..a03d284e7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4,7 +4,7 @@ * Authors: Alan Cox <iiitac@pyr.swan.ac.uk> * Florian La Roche <rzsfl@rz.uni-sb.de> * - * Version: $Id: skbuff.c,v 1.53 1998/08/19 13:32:44 freitag Exp $ + * Version: $Id: skbuff.c,v 1.54 1998/09/15 02:11:09 davem Exp $ * * Fixes: * Alan Cox : Fixed the worst of the load balancer bugs. @@ -192,7 +192,7 @@ static inline void skb_headerinit(void *p, kmem_cache_t *cache, skb->ip_summed = 0; skb->security = 0; /* By default packets are insecure */ skb->dst = NULL; -#ifdef CONFIG_IP_FIREWALL_CHAINS +#ifdef CONFIG_IP_FIREWALL skb->fwmark = 0; #endif memset(skb->cb, 0, sizeof(skb->cb)); diff --git a/net/core/sock.c b/net/core/sock.c index e9e293ec9..caaaa21e6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -7,7 +7,7 @@ * handler for protocols to use and generic option handler. * * - * Version: $Id: sock.c,v 1.70 1998/08/26 12:03:07 davem Exp $ + * Version: $Id: sock.c,v 1.75 1998/11/07 10:54:38 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -138,8 +138,7 @@ __u32 sysctl_rmem_max = SK_RMEM_MAX; __u32 sysctl_wmem_default = SK_WMEM_MAX; __u32 sysctl_rmem_default = SK_RMEM_MAX; -int sysctl_core_destroy_delay = SOCK_DESTROY_TIME; -/* Maximal space eaten by iovec (still not made (2.1.88)!) plus some space */ +/* Maximal space eaten by iovec or ancilliary data plus some space */ int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512); /* @@ -155,7 +154,6 @@ int sock_setsockopt(struct socket *sock, int level, int optname, int valbool; int err; struct linger ling; - struct ifreq req; int ret = 0; #ifdef CONFIG_FILTER @@ -293,31 +291,41 @@ int sock_setsockopt(struct socket *sock, int level, int optname, #ifdef CONFIG_NETDEVICES case SO_BINDTODEVICE: + { + char devname[IFNAMSIZ]; + + /* Sorry... */ + if (!capable(CAP_NET_RAW)) + return -EPERM; + /* Bind this socket to a particular device like "eth0", - * as specified in an ifreq structure. If the device - * is "", socket is NOT bound to a device. - */ + * as specified in the passed interface name. If the + * name is "" or the option length is zero the socket + * is not bound. + */ if (!valbool) { sk->bound_dev_if = 0; - } - else { - if (copy_from_user(&req, optval, sizeof(req))) - return -EFAULT; - + } else { + if (optlen > IFNAMSIZ) + optlen = IFNAMSIZ; + if (copy_from_user(devname, optval, optlen)) + return -EFAULT; + /* Remove any cached route for this socket. */ dst_release(xchg(&sk->dst_cache, NULL)); - if (req.ifr_ifrn.ifrn_name[0] == '\0') { + if (devname[0] == '\0') { sk->bound_dev_if = 0; } else { - struct device *dev = dev_get(req.ifr_ifrn.ifrn_name); + struct device *dev = dev_get(devname); if (!dev) return -EINVAL; sk->bound_dev_if = dev->ifindex; } + return 0; } - return 0; + } #endif @@ -483,7 +491,8 @@ struct sock *sk_alloc(int family, int priority, int zero_it) struct sock *sk = kmem_cache_alloc(sk_cachep, priority); if(sk) { - if (zero_it) memset(sk, 0, sizeof(struct sock)); + if (zero_it) + memset(sk, 0, sizeof(struct sock)); sk->family = family; } @@ -498,7 +507,7 @@ void sk_free(struct sock *sk) kmem_cache_free(sk_cachep, sk); } -__initfunc(void sk_init(void)) +void __init sk_init(void) { sk_cachep = kmem_cache_create("sock", sizeof(struct sock), 0, SLAB_HWCACHE_ALIGN, 0, 0); @@ -508,35 +517,34 @@ __initfunc(void sk_init(void)) /* * Simple resource managers for sockets. */ - + + +/* + * Write buffer destructor automatically called from kfree_skb. + */ void sock_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; -#if 1 - if (!sk) { - printk(KERN_DEBUG "sock_wfree: sk==NULL\n"); - return; - } -#endif + /* In case it might be waiting for more memory. */ atomic_sub(skb->truesize, &sk->wmem_alloc); sk->write_space(sk); } - +/* + * Read buffer destructor automatically called from kfree_skb. + */ void sock_rfree(struct sk_buff *skb) { struct sock *sk = skb->sk; -#if 1 - if (!sk) { - printk(KERN_DEBUG "sock_rfree: sk==NULL\n"); - return; - } -#endif + atomic_sub(skb->truesize, &sk->rmem_alloc); } +/* + * Allocate a skb from the socket's send buffer. + */ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority) { if (force || atomic_read(&sk->wmem_alloc) < sk->sndbuf) { @@ -551,6 +559,9 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int return NULL; } +/* + * Allocate a skb from the socket's receive buffer. + */ struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority) { if (force || atomic_read(&sk->rmem_alloc) < sk->rcvbuf) { @@ -565,6 +576,9 @@ struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int return NULL; } +/* + * Allocate a memory block from the socket's option memory buffer. + */ void *sock_kmalloc(struct sock *sk, int size, int priority) { if (atomic_read(&sk->omem_alloc)+size < sysctl_optmem_max) { @@ -581,6 +595,9 @@ void *sock_kmalloc(struct sock *sk, int size, int priority) return NULL; } +/* + * Free an option memory block. + */ void sock_kfree_s(struct sock *sk, void *mem, int size) { kfree_s(mem, size); @@ -813,7 +830,7 @@ void sklist_destroy_socket(struct sock **list,struct sock *sk) * Someone is using our buffers still.. defer */ init_timer(&sk->timer); - sk->timer.expires=jiffies+sysctl_core_destroy_delay; + sk->timer.expires=jiffies+SOCK_DESTROY_TIME; sk->timer.function=sklist_destroy_timer; sk->timer.data = (unsigned long)sk; add_timer(&sk->timer); @@ -944,16 +961,23 @@ int sock_no_recvmsg(struct socket *sock, struct msghdr *m, int flags, * Default Socket Callbacks */ -void sock_def_callback1(struct sock *sk) +void sock_def_wakeup(struct sock *sk) { if(!sk->dead) wake_up_interruptible(sk->sleep); } -void sock_def_callback2(struct sock *sk, int len) +void sock_def_error_report(struct sock *sk) { - if(!sk->dead) - { + if (!sk->dead) { + wake_up_interruptible(sk->sleep); + sock_wake_async(sk->socket,0); + } +} + +void sock_def_readable(struct sock *sk, int len) +{ + if(!sk->dead) { wake_up_interruptible(sk->sleep); sock_wake_async(sk->socket,1); } @@ -961,11 +985,14 @@ void sock_def_callback2(struct sock *sk, int len) void sock_def_write_space(struct sock *sk) { - if(!sk->dead) - { + /* Do not wake up a writer until he can make "significant" + * progress. --DaveM + */ + if(!sk->dead && + ((atomic_read(&sk->wmem_alloc) << 1) <= sk->sndbuf)) { wake_up_interruptible(sk->sleep); - /* Should agree with poll, otherwise some programs break */ + /* Should agree with poll, otherwise some programs break */ if (sock_writeable(sk)) sock_wake_async(sk->socket, 2); } @@ -1000,10 +1027,10 @@ void sock_init_data(struct socket *sock, struct sock *sk) sock->sk = sk; } - sk->state_change = sock_def_callback1; - sk->data_ready = sock_def_callback2; + sk->state_change = sock_def_wakeup; + sk->data_ready = sock_def_readable; sk->write_space = sock_def_write_space; - sk->error_report = sock_def_callback1; + sk->error_report = sock_def_error_report; sk->destruct = sock_def_destruct; sk->peercred.pid = 0; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 47c85d006..446ca1458 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -25,6 +25,7 @@ extern int sysctl_core_destroy_delay; extern int sysctl_optmem_max; ctl_table core_table[] = { +#ifdef CONFIG_NET {NET_CORE_WMEM_MAX, "wmem_max", &sysctl_wmem_max, sizeof(int), 0644, NULL, &proc_dointvec}, @@ -37,9 +38,6 @@ ctl_table core_table[] = { {NET_CORE_RMEM_DEFAULT, "rmem_default", &sysctl_rmem_default, sizeof(int), 0644, NULL, &proc_dointvec}, - {NET_CORE_DESTROY_DELAY, "destroy_delay", - &sysctl_core_destroy_delay, sizeof(int), 0644, NULL, - &proc_dointvec_jiffies}, {NET_CORE_MAX_BACKLOG, "netdev_max_backlog", &netdev_max_backlog, sizeof(int), 0644, NULL, &proc_dointvec}, @@ -57,6 +55,7 @@ ctl_table core_table[] = { {NET_CORE_OPTMEM_MAX, "optmem_max", &sysctl_optmem_max, sizeof(int), 0644, NULL, &proc_dointvec}, +#endif /* CONFIG_NET */ { 0 } }; #endif diff --git a/net/econet/econet.c b/net/econet/econet.c index 92bdc4c97..8a3a72ae7 100644 --- a/net/econet/econet.c +++ b/net/econet/econet.c @@ -330,8 +330,6 @@ static int econet_sendmsg(struct socket *sock, struct msghdr *msg, int len, { /* Real hardware Econet. We're not worthy etc. */ #ifdef CONFIG_ECONET_NATIVE - unsigned char *p; - dev_lock_list(); skb = sock_alloc_send_skb(sk, len+dev->hard_header_len+15, 0, @@ -708,22 +706,13 @@ static int econet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg case SIOCSIFHWBROADCAST: return(dev_ioctl(cmd,(void *) arg)); - case SIOCSIFADDR: case SIOCGIFADDR: return ec_dev_ioctl(sock, cmd, (void *)arg); break; default: - if ((cmd >= SIOCDEVPRIVATE) && - (cmd <= (SIOCDEVPRIVATE + 15))) - return(dev_ioctl(cmd,(void *) arg)); - -#ifdef CONFIG_NET_RADIO - if((cmd >= SIOCIWFIRST) && (cmd <= SIOCIWLAST)) - return(dev_ioctl(cmd,(void *) arg)); -#endif - return -EOPNOTSUPP; + return(dev_ioctl(cmd,(void *) arg)); } /*NOTREACHED*/ return 0; diff --git a/net/ipv4/Config.in b/net/ipv4/Config.in index 4b83152f0..e0379e69b 100644 --- a/net/ipv4/Config.in +++ b/net/ipv4/Config.in @@ -31,19 +31,24 @@ if [ "$CONFIG_FIREWALL" = "y" ]; then define_bool CONFIG_NETLINK_DEV y fi fi - bool 'IP: transparent proxy support' CONFIG_IP_TRANSPARENT_PROXY - bool 'IP: always defragment' CONFIG_IP_ALWAYS_DEFRAG + bool 'IP: always defragment (required for masquerading)' CONFIG_IP_ALWAYS_DEFRAG fi fi if [ "$CONFIG_IP_FIREWALL" = "y" ]; then - bool 'IP: masquerading' CONFIG_IP_MASQUERADE - if [ "$CONFIG_IP_MASQUERADE" != "n" ]; then - comment 'Protocol-specific masquerading support will be built as modules.' - bool 'IP: ICMP masquerading' CONFIG_IP_MASQUERADE_ICMP - comment 'Protocol-specific masquerading support will be built as modules.' - if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then - tristate 'IP: ipautofw masq support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_IPAUTOFW - tristate 'IP: ipportfw masq support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_IPPORTFW + if [ "$CONFIG_IP_ALWAYS_DEFRAG" != "n" ]; then + bool 'IP: transparent proxy support' CONFIG_IP_TRANSPARENT_PROXY + bool 'IP: masquerading' CONFIG_IP_MASQUERADE + if [ "$CONFIG_IP_MASQUERADE" != "n" ]; then + comment 'Protocol-specific masquerading support will be built as modules.' + bool 'IP: ICMP masquerading' CONFIG_IP_MASQUERADE_ICMP + comment 'Protocol-specific masquerading support will be built as modules.' + if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + bool 'IP: masquerading special modules support' CONFIG_IP_MASQUERADE_MOD + if [ "$CONFIG_IP_MASQUERADE_MOD" = "y" ]; then + tristate 'IP: ipautofw masq support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_IPAUTOFW + tristate 'IP: ipportfw masq support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_IPPORTFW + fi + fi fi fi fi @@ -74,3 +79,7 @@ tristate 'IP: Reverse ARP' CONFIG_INET_RARP #bool 'IP: Disable NAGLE algorithm (normally enabled)' CONFIG_TCP_NAGLE_OFF bool 'IP: Drop source routed frames' CONFIG_IP_NOSR bool 'IP: Allow large windows (not recommended if <16Mb of memory)' CONFIG_SKB_LARGE +#if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then +#bool 'IP: support experimental checksum copy to user for UDP' CONFIG_UDP_DELAY_CSUM +#fi + diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 2a519f346..ad2a0a650 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -60,24 +60,30 @@ else endif ifeq ($(CONFIG_IP_MASQUERADE),y) -IPV4X_OBJS += ip_masq.o ip_masq_mod.o ip_masq_app.o - -ifeq ($(CONFIG_IP_MASQUERADE_IPAUTOFW),y) -IPV4_OBJS += ip_masq_autofw.o -else - ifeq ($(CONFIG_IP_MASQUERADE_IPAUTOFW),m) - M_OBJS += ip_masq_autofw.o +IPV4X_OBJS += ip_masq.o ip_masq_app.o + +ifeq ($(CONFIG_IP_MASQUERADE_MOD),y) + IPV4X_OBJS += ip_masq_mod.o + + ifeq ($(CONFIG_IP_MASQUERADE_IPAUTOFW),y) + IPV4_OBJS += ip_masq_autofw.o + else + ifeq ($(CONFIG_IP_MASQUERADE_IPAUTOFW),m) + M_OBJS += ip_masq_autofw.o + endif endif -endif - -ifeq ($(CONFIG_IP_MASQUERADE_IPPORTFW),y) -IPV4_OBJS += ip_masq_portfw.o -else - ifeq ($(CONFIG_IP_MASQUERADE_IPPORTFW),m) - M_OBJS += ip_masq_portfw.o + + ifeq ($(CONFIG_IP_MASQUERADE_IPPORTFW),y) + IPV4_OBJS += ip_masq_portfw.o + else + ifeq ($(CONFIG_IP_MASQUERADE_IPPORTFW),m) + M_OBJS += ip_masq_portfw.o + endif endif + endif +M_OBJS += ip_masq_user.o M_OBJS += ip_masq_ftp.o ip_masq_irc.o ip_masq_raudio.o ip_masq_quake.o M_OBJS += ip_masq_vdolive.o ip_masq_cuseeme.o endif diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8282333dc..54a4578ca 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -5,7 +5,7 @@ * * PF_INET protocol family socket handler. * - * Version: $Id: af_inet.c,v 1.75 1998/08/26 12:03:15 davem Exp $ + * Version: $Id: af_inet.c,v 1.80 1998/11/08 11:17:03 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -119,8 +119,6 @@ struct linux_mib net_statistics; -extern int sysctl_core_destroy_delay; - extern int raw_get_info(char *, char **, off_t, int, int); extern int snmp_get_info(char *, char **, off_t, int, int); extern int netstat_get_info(char *, char **, off_t, int, int); @@ -198,7 +196,7 @@ static __inline__ void kill_sk_later(struct sock *sk) sk->destroy = 1; sk->ack_backlog = 0; release_sock(sk); - net_reset_timer(sk, TIME_DESTROY, sysctl_core_destroy_delay); + net_reset_timer(sk, TIME_DESTROY, SOCK_DESTROY_TIME); } void destroy_sock(struct sock *sk) @@ -466,7 +464,7 @@ int inet_release(struct socket *sock, struct socket *peersock) struct sock *sk = sock->sk; if (sk) { - unsigned long timeout; + long timeout; /* Begin closedown and wake up sleepers. */ if (sock->state != SS_UNCONNECTED) @@ -485,11 +483,11 @@ int inet_release(struct socket *sock, struct socket *peersock) */ timeout = 0; if (sk->linger && !(current->flags & PF_EXITING)) { - timeout = ~0UL; + timeout = MAX_SCHEDULE_TIMEOUT; /* XXX This makes no sense whatsoever... -DaveM */ if (!sk->lingertime) - timeout = jiffies + HZ*sk->lingertime; + timeout = HZ*sk->lingertime; } sock->sk = NULL; sk->socket = NULL; @@ -643,33 +641,20 @@ int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr, if (sk->state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) return (-EINPROGRESS); -#if 1 if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) { inet_wait_for_connect(sk); if (signal_pending(current)) return -ERESTARTSYS; } -#else - cli(); - while(sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) { - interruptible_sleep_on(sk->sleep); - if (signal_pending(current)) { - sti(); - return(-ERESTARTSYS); - } - /* This fixes a nasty in the tcp/ip code. There is a hideous hassle with - icmp error packets wanting to close a tcp or udp socket. */ - if (sk->err && sk->protocol == IPPROTO_TCP) { - sock->state = SS_UNCONNECTED; - sti(); - return sock_error(sk); /* set by tcp_err() */ - } - } - sti(); -#endif sock->state = SS_CONNECTED; if ((sk->state != TCP_ESTABLISHED) && sk->err) { + /* This is ugly but needed to fix a race in the ICMP error handler */ + if (sk->protocol == IPPROTO_TCP && sk->zapped) { + lock_sock(sk); + tcp_set_state(sk, TCP_CLOSE); + release_sock(sk); + } sock->state = SS_UNCONNECTED; return sock_error(sk); } @@ -716,13 +701,6 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags) if (flags & O_NONBLOCK) goto do_half_success; - cli(); - while (sk2->state == TCP_SYN_RECV) { - interruptible_sleep_on(sk2->sleep); - if (signal_pending(current)) - goto do_interrupted; - } - sti(); if(sk2->state == TCP_ESTABLISHED) goto do_full_success; if(sk2->err > 0) @@ -749,18 +727,9 @@ do_bad_connection: newsk->socket = newsock; return err; -do_interrupted: - sti(); - sk1->pair = sk2; - sk2->sleep = NULL; - sk2->socket = NULL; - newsock->sk = newsk; - newsk->socket = newsock; - err = -ERESTARTSYS; -do_err: - return err; do_sk1_err: err = sock_error(sk1); +do_err: return err; } @@ -805,8 +774,6 @@ int inet_recvmsg(struct socket *sock, struct msghdr *msg, int size, return(-EINVAL); if (sk->prot->recvmsg == NULL) return(-EOPNOTSUPP); - if (sk->err) - return sock_error(sk); /* We may need to bind the socket. */ if (inet_autobind(sk) != 0) return(-EAGAIN); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 1ce69028f..efb16cc47 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1,6 +1,6 @@ /* linux/net/inet/arp.c * - * Version: $Id: arp.c,v 1.70 1998/08/26 12:03:18 davem Exp $ + * Version: $Id: arp.c,v 1.75 1998/11/16 04:51:56 davem Exp $ * * Copyright (C) 1994 by Florian La Roche * @@ -163,8 +163,6 @@ static struct neigh_ops arp_direct_ops = dev_queue_xmit }; -#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) || \ - defined(CONFIG_SHAPER) || defined(CONFIG_SHAPER_MODULE) struct neigh_ops arp_broken_ops = { AF_INET, @@ -176,7 +174,6 @@ struct neigh_ops arp_broken_ops = dev_queue_xmit, dev_queue_xmit, }; -#endif struct neigh_table arp_tbl = { @@ -547,7 +544,7 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) */ if (in_dev == NULL || arp->ar_hln != dev->addr_len || - dev->flags & IFF_NOARP || + dev->flags & IFF_NOARP || skb->pkt_type == PACKET_OTHERHOST || skb->pkt_type == PACKET_LOOPBACK || arp->ar_pln != 4) @@ -1027,7 +1024,7 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy "%-17s0x%-10x0x%-10x%s", in_ntoa(*(u32*)n->key), hatype, - ATF_PUBL|ATF_PERM, + ATF_PUBL|ATF_PERM, "00:00:00:00:00:00"); size += sprintf(buffer+len+size, " %-17s %s\n", diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 618d247bd..5232c618c 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -5,7 +5,7 @@ * * IPv4 FIB: lookup engine and maintenance routines. * - * Version: $Id: fib_hash.c,v 1.5 1998/08/26 12:03:27 davem Exp $ + * Version: $Id: fib_hash.c,v 1.6 1998/10/03 09:37:06 davem Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * @@ -43,9 +43,9 @@ #include <net/sock.h> #include <net/ip_fib.h> -#define FTprint(a...) +#define FTprint(a...) /* -printk(KERN_DEBUG a) + printk(KERN_DEBUG a) */ /* @@ -140,6 +140,11 @@ extern __inline__ int fn_key_eq(fn_key_t a, fn_key_t b) return a.datum == b.datum; } +extern __inline__ int fn_key_leq(fn_key_t a, fn_key_t b) +{ + return a.datum <= b.datum; +} + #define FZ_MAX_DIVISOR 1024 #ifdef CONFIG_IP_ROUTE_LARGE_TABLES @@ -154,9 +159,11 @@ static __inline__ void fn_rebuild_zone(struct fn_zone *fz, for (i=0; i<old_divisor; i++) { for (f=old_ht[i]; f; f=next) { next = f->fn_next; - f->fn_next = NULL; - for (fp = fz_chain_p(f->fn_key, fz); *fp; fp = &(*fp)->fn_next) + for (fp = fz_chain_p(f->fn_key, fz); + *fp && fn_key_leq((*fp)->fn_key, f->fn_key); + fp = &(*fp)->fn_next) /* NONE */; + f->fn_next = *fp; *fp = f; } } @@ -199,7 +206,6 @@ static void fn_rehash_zone(struct fn_zone *fz) fn_rebuild_zone(fz, old_ht, old_divisor); end_bh_atomic(); kfree(old_ht); -FTprint("REHASHED ZONE: order %d mask %08x hash %d/%08x\n", fz->fz_order, fz->fz_mask, fz->fz_divisor, fz->fz_hashmask); } } #endif /* CONFIG_IP_ROUTE_LARGE_TABLES */ @@ -240,7 +246,6 @@ fn_new_zone(struct fn_hash *table, int z) for (i=z+1; i<=32; i++) if (table->fn_zones[i]) break; - start_bh_atomic(); if (i>32) { /* No more specific masks, we are the first. */ fz->fz_next = table->fn_zone_list; @@ -250,8 +255,6 @@ fn_new_zone(struct fn_hash *table, int z) table->fn_zones[i]->fz_next = fz; } table->fn_zones[z] = fz; - end_bh_atomic(); -FTprint("NEW ZONE: order %d mask %08x hash %d/%08x\n", fz->fz_order, fz->fz_mask, fz->fz_divisor, fz->fz_hashmask); return fz; } @@ -265,19 +268,18 @@ fn_hash_lookup(struct fib_table *tb, const struct rt_key *key, struct fib_result for (fz = t->fn_zone_list; fz; fz = fz->fz_next) { struct fib_node *f; fn_key_t k = fz_key(key->dst, fz); - int matched = 0; for (f = fz_chain(k, fz); f; f = f->fn_next) { - if (!fn_key_eq(k, f->fn_key) -#ifdef CONFIG_IP_ROUTE_TOS - || (f->fn_tos && f->fn_tos != key->tos) -#endif - ) { - if (matched) + if (!fn_key_eq(k, f->fn_key)) { + if (fn_key_leq(k, f->fn_key)) break; - continue; + else + continue; } - matched = 1; +#ifdef CONFIG_IP_ROUTE_TOS + if (f->fn_tos && f->fn_tos != key->tos) + continue; +#endif f->fn_state |= FN_S_ACCESSED; if (f->fn_state&FN_S_ZOMBIE) @@ -306,11 +308,14 @@ for ( ; ((f) = *(fp)) != NULL; (fp) = &(f)->fn_next) #define FIB_SCAN_KEY(f, fp, key) \ for ( ; ((f) = *(fp)) != NULL && fn_key_eq((f)->fn_key, (key)); (fp) = &(f)->fn_next) -#define FIB_CONTINUE(f, fp) \ -{ \ - fp = &f->fn_next; \ - continue; \ -} +#ifndef CONFIG_IP_ROUTE_TOS +#define FIB_SCAN_TOS(f, fp, key, tos) FIB_SCAN_KEY(f, fp, key) +#else +#define FIB_SCAN_TOS(f, fp, key, tos) \ +for ( ; ((f) = *(fp)) != NULL && fn_key_eq((f)->fn_key, (key)) && \ + (f)->fn_tos == (tos) ; (fp) = &(f)->fn_next) +#endif + #ifdef CONFIG_RTNETLINK static void rtmsg_fib(int, struct fib_node*, int, int, @@ -326,7 +331,7 @@ fn_hash_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, struct nlmsghdr *n, struct netlink_skb_parms *req) { struct fn_hash *table = (struct fn_hash*)tb->tb_data; - struct fib_node *new_f, *f, **fp; + struct fib_node *new_f, *f, **fp, **del_fp; struct fn_zone *fz; struct fib_info *fi; @@ -336,7 +341,6 @@ fn_hash_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, u8 tos = r->rtm_tos; #endif fn_key_t key; - unsigned state = 0; int err; FTprint("tb(%d)_insert: %d %08x/%d %d %08x\n", tb->tb_id, r->rtm_type, rta->rta_dst ? @@ -357,10 +361,8 @@ rta->rta_prefsrc ? *(u32*)rta->rta_prefsrc : 0); key = fz_key(dst, fz); } - if ((fi = fib_create_info(r, rta, n, &err)) == NULL) { -FTprint("fib_create_info err=%d\n", err); + if ((fi = fib_create_info(r, rta, n, &err)) == NULL) return err; - } #ifdef CONFIG_IP_ROUTE_LARGE_TABLES if (fz->fz_nent > (fz->fz_divisor<<2) && @@ -375,7 +377,7 @@ FTprint("fib_create_info err=%d\n", err); * Scan list to find the first route with the same destination */ FIB_SCAN(f, fp) { - if (fn_key_eq(f->fn_key,key)) + if (fn_key_leq(key,f->fn_key)) break; } @@ -389,70 +391,75 @@ FTprint("fib_create_info err=%d\n", err); } #endif - if (f && fn_key_eq(f->fn_key, key) + del_fp = NULL; + + if (f && (f->fn_state&FN_S_ZOMBIE) && #ifdef CONFIG_IP_ROUTE_TOS - && f->fn_tos == tos + f->fn_tos == tos && #endif - ) { + fn_key_eq(f->fn_key, key)) { + del_fp = fp; + fp = &f->fn_next; + f = *fp; + goto create; + } + + FIB_SCAN_TOS(f, fp, key, tos) { + if (fi->fib_priority <= FIB_INFO(f)->fib_priority) + break; + } + + /* Now f==*fp points to the first node with the same + keys [prefix,tos,priority], if such key already + exists or to the node, before which we will insert new one. + */ + + if (f && +#ifdef CONFIG_IP_ROUTE_TOS + f->fn_tos == tos && +#endif + fn_key_eq(f->fn_key, key) && + fi->fib_priority == FIB_INFO(f)->fib_priority) { struct fib_node **ins_fp; - state = f->fn_state; - if (n->nlmsg_flags&NLM_F_EXCL && !(state&FN_S_ZOMBIE)) - return -EEXIST; + err = -EEXIST; + if (n->nlmsg_flags&NLM_F_EXCL) + goto out; + if (n->nlmsg_flags&NLM_F_REPLACE) { - struct fib_info *old_fi = FIB_INFO(f); - if (old_fi != fi) { - rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req); - start_bh_atomic(); - FIB_INFO(f) = fi; - f->fn_type = r->rtm_type; - f->fn_scope = r->rtm_scope; - end_bh_atomic(); - rtmsg_fib(RTM_NEWROUTE, f, z, tb->tb_id, n, req); - } - state = f->fn_state; - f->fn_state = 0; - fib_release_info(old_fi); - if (state&FN_S_ACCESSED) - rt_cache_flush(-1); - return 0; + del_fp = fp; + fp = &f->fn_next; + f = *fp; + goto replace; } ins_fp = fp; + err = -EEXIST; - for ( ; (f = *fp) != NULL && fn_key_eq(f->fn_key, key) -#ifdef CONFIG_IP_ROUTE_TOS - && f->fn_tos == tos -#endif - ; fp = &f->fn_next) { - state |= f->fn_state; + FIB_SCAN_TOS(f, fp, key, tos) { + if (fi->fib_priority != FIB_INFO(f)->fib_priority) + break; if (f->fn_type == type && f->fn_scope == r->rtm_scope - && FIB_INFO(f) == fi) { - fib_release_info(fi); - if (f->fn_state&FN_S_ZOMBIE) { - f->fn_state = 0; - rtmsg_fib(RTM_NEWROUTE, f, z, tb->tb_id, n, req); - if (state&FN_S_ACCESSED) - rt_cache_flush(-1); - return 0; - } - return -EEXIST; - } + && FIB_INFO(f) == fi) + goto out; } + if (!(n->nlmsg_flags&NLM_F_APPEND)) { fp = ins_fp; f = *fp; } - } else { - if (!(n->nlmsg_flags&NLM_F_CREATE)) - return -ENOENT; } +create: + err = -ENOENT; + if (!(n->nlmsg_flags&NLM_F_CREATE)) + goto out; + +replace: + err = -ENOBUFS; new_f = (struct fib_node *) kmalloc(sizeof(struct fib_node), GFP_KERNEL); - if (new_f == NULL) { - fib_release_info(fi); - return -ENOBUFS; - } + if (new_f == NULL) + goto out; memset(new_f, 0, sizeof(struct fib_node)); @@ -473,9 +480,25 @@ FTprint("fib_create_info err=%d\n", err); *fp = new_f; fz->fz_nent++; + if (del_fp) { + f = *del_fp; + /* Unlink replaced node */ + *del_fp = f->fn_next; + if (!(f->fn_state&FN_S_ZOMBIE)) + rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req); + if (f->fn_state&FN_S_ACCESSED) + rt_cache_flush(-1); + fn_free_node(f); + fz->fz_nent--; + } else { + rt_cache_flush(-1); + } rtmsg_fib(RTM_NEWROUTE, new_f, z, tb->tb_id, n, req); - rt_cache_flush(-1); return 0; + +out: + fib_release_info(fi); + return err; } @@ -484,10 +507,11 @@ fn_hash_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, struct nlmsghdr *n, struct netlink_skb_parms *req) { struct fn_hash *table = (struct fn_hash*)tb->tb_data; - struct fib_node **fp, *f; + struct fib_node **fp, **del_fp, *f; int z = r->rtm_dst_len; struct fn_zone *fz; fn_key_t key; + int matched; #ifdef CONFIG_IP_ROUTE_TOS u8 tos = r->rtm_tos; #endif @@ -513,6 +537,8 @@ FTprint("tb(%d)_delete: %d %08x/%d %d\n", tb->tb_id, r->rtm_type, rta->rta_dst ? FIB_SCAN(f, fp) { if (fn_key_eq(f->fn_key, key)) break; + if (fn_key_leq(key, f->fn_key)) + return -ESRCH; } #ifdef CONFIG_IP_ROUTE_TOS FIB_SCAN_KEY(f, fp, key) { @@ -521,40 +547,47 @@ FTprint("tb(%d)_delete: %d %08x/%d %d\n", tb->tb_id, r->rtm_type, rta->rta_dst ? } #endif - while ((f = *fp) != NULL && fn_key_eq(f->fn_key, key) -#ifdef CONFIG_IP_ROUTE_TOS - && f->fn_tos == tos -#endif - ) { + matched = 0; + del_fp = NULL; + FIB_SCAN_TOS(f, fp, key, tos) { struct fib_info * fi = FIB_INFO(f); - if ((f->fn_state&FN_S_ZOMBIE) || - (r->rtm_type && f->fn_type != r->rtm_type) || - (r->rtm_scope && f->fn_scope != r->rtm_scope) || - (r->rtm_protocol && fi->fib_protocol != r->rtm_protocol) || - fib_nh_match(r, n, rta, fi)) - FIB_CONTINUE(f, fp); - break; + if (f->fn_state&FN_S_ZOMBIE) + return -ESRCH; + + matched++; + + if (del_fp == NULL && + (!r->rtm_type || f->fn_type == r->rtm_type) && + (r->rtm_scope == RT_SCOPE_NOWHERE || f->fn_scope == r->rtm_scope) && + (!r->rtm_protocol || fi->fib_protocol == r->rtm_protocol) && + fib_nh_match(r, n, rta, fi) == 0) + del_fp = fp; } - if (!f) - return -ESRCH; -#if 0 - *fp = f->fn_next; - rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req); - fn_free_node(f); - fz->fz_nent--; - rt_cache_flush(0); -#else - f->fn_state |= FN_S_ZOMBIE; - rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req); - if (f->fn_state&FN_S_ACCESSED) { - f->fn_state &= ~FN_S_ACCESSED; - rt_cache_flush(-1); + + if (del_fp) { + f = *del_fp; + rtmsg_fib(RTM_DELROUTE, f, z, tb->tb_id, n, req); + + if (matched != 1) { + *del_fp = f->fn_next; + if (f->fn_state&FN_S_ACCESSED) + rt_cache_flush(-1); + fn_free_node(f); + fz->fz_nent--; + } else { + f->fn_state |= FN_S_ZOMBIE; + if (f->fn_state&FN_S_ACCESSED) { + f->fn_state &= ~FN_S_ACCESSED; + rt_cache_flush(-1); + } + if (++fib_hash_zombies > 128) + fib_flush(); + } + + return 0; } - if (++fib_hash_zombies > 128) - fib_flush(); -#endif - return 0; + return -ESRCH; } extern __inline__ int diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 2302f5322..70fa5d843 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: policy rules. * - * Version: $Id: fib_rules.c,v 1.6 1998/08/26 12:03:30 davem Exp $ + * Version: $Id: fib_rules.c,v 1.7 1998/10/03 09:37:09 davem Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * @@ -13,6 +13,9 @@ * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Rani Assaf : local_rule cannot be deleted */ #include <linux/config.h> @@ -89,8 +92,10 @@ int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) (!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) && (!rta[RTA_IIF-1] || strcmp(RTA_DATA(rta[RTA_IIF-1]), r->r_ifname) == 0) && (!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) { + if (r == &local_rule) + return -EPERM; *rp = r->r_next; - if (r != &default_rule && r != &main_rule && r != &local_rule) + if (r != &default_rule && r != &main_rule) kfree(r); return 0; } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 36c801e8c..c77ecc251 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: semantics. * - * Version: $Id: fib_semantics.c,v 1.10 1998/08/26 12:03:32 davem Exp $ + * Version: $Id: fib_semantics.c,v 1.11 1998/10/03 09:37:12 davem Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * @@ -141,6 +141,7 @@ extern __inline__ struct fib_info * fib_find_info(const struct fib_info *nfi) continue; if (nfi->fib_protocol == fi->fib_protocol && nfi->fib_prefsrc == fi->fib_prefsrc && + nfi->fib_priority == fi->fib_priority && nfi->fib_mtu == fi->fib_mtu && nfi->fib_rtt == fi->fib_rtt && nfi->fib_window == fi->fib_window && @@ -231,6 +232,10 @@ int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta, int nhlen; #endif + if (rta->rta_priority && + *rta->rta_priority != fi->fib_priority) + return 1; + if (rta->rta_oif || rta->rta_gw) { if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) && (!rta->rta_gw || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 4) == 0)) @@ -405,6 +410,8 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta, fi->fib_protocol = r->rtm_protocol; fi->fib_nhs = nhs; fi->fib_flags = r->rtm_flags; + if (rta->rta_priority) + fi->fib_priority = *rta->rta_priority; if (rta->rta_mx) { int attrlen = RTA_PAYLOAD(rta->rta_mx); struct rtattr *attr = RTA_DATA(rta->rta_mx); @@ -484,34 +491,20 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta, goto failure; } else { change_nexthops(fi) { - if ((err = fib_check_nh(r, fi, nh)) != 0) { - if (err == -EINVAL) - printk("Einval 2\n"); + if ((err = fib_check_nh(r, fi, nh)) != 0) goto failure; - } } endfor_nexthops(fi) } if (fi->fib_prefsrc) { if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL || memcmp(&fi->fib_prefsrc, rta->rta_dst, 4)) - if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL) { - printk("Einval 3\n"); + if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL) goto err_inval; - } } link_it: if ((ofi = fib_find_info(fi)) != NULL) { - if (fi->fib_nh[0].nh_scope != ofi->fib_nh[0].nh_scope) { - printk("nh %d/%d gw=%08x/%08x dev=%s/%s\n", - fi->fib_nh[0].nh_scope, - ofi->fib_nh[0].nh_scope, - fi->fib_nh[0].nh_gw, - ofi->fib_nh[0].nh_gw, - fi->fib_nh[0].nh_dev->name, - ofi->fib_nh[0].nh_dev->name); - } kfree(fi); ofi->fib_refcnt++; return ofi; @@ -613,6 +606,8 @@ fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, if (rtm->rtm_dst_len) RTA_PUT(skb, RTA_DST, 4, dst); rtm->rtm_protocol = fi->fib_protocol; + if (fi->fib_priority) + RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority); #ifdef CONFIG_NET_CLS_ROUTE if (fi->fib_nh[0].nh_tclassid) RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid); @@ -720,12 +715,16 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm, rtm->rtm_dst_len = plen; rta->rta_dst = ptr; + if (r->rt_metric) { + *(u32*)&r->rt_pad3 = r->rt_metric - 1; + rta->rta_priority = (u32*)&r->rt_pad3; + } if (r->rt_flags&RTF_REJECT) { rtm->rtm_scope = RT_SCOPE_HOST; rtm->rtm_type = RTN_UNREACHABLE; return 0; } - rtm->rtm_scope = RT_SCOPE_LINK; + rtm->rtm_scope = RT_SCOPE_NOWHERE; rtm->rtm_type = RTN_UNICAST; if (r->rt_dev) { @@ -735,7 +734,7 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm, struct device *dev; char devname[IFNAMSIZ]; - if (copy_from_user(devname, r->rt_dev, 15)) + if (copy_from_user(devname, r->rt_dev, IFNAMSIZ-1)) return -EFAULT; devname[IFNAMSIZ-1] = 0; #ifdef CONFIG_IP_ALIAS @@ -777,6 +776,9 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm, if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL) return -EINVAL; + if (rtm->rtm_scope == RT_SCOPE_NOWHERE) + rtm->rtm_scope = RT_SCOPE_LINK; + if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) { struct rtattr *rec; struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL); @@ -974,7 +976,7 @@ void fib_node_get_info(int type, int dead, struct fib_info *fi, u32 prefix, u32 if (fi) { len = sprintf(buffer, "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u", fi->fib_dev ? fi->fib_dev->name : "*", prefix, - fi->fib_nh->nh_gw, flags, 0, 0, 0, + fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority, mask, fi->fib_mtu, fi->fib_window, fi->fib_rtt); } else { len = sprintf(buffer, "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u", diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 9cc7c733b..af1bb4a44 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -3,7 +3,7 @@ * * Alan Cox, <alan@cymru.net> * - * Version: $Id: icmp.c,v 1.45 1998/08/26 12:03:35 davem Exp $ + * Version: $Id: icmp.c,v 1.47 1998/10/21 05:32:24 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -51,6 +51,11 @@ * - IP option length was accounted wrongly * - ICMP header length was not accounted at all. * + * To Fix: + * + * - Should use skb_pull() instead of all the manual checking. + * This would also greatly simply some upper layer error handlers. --AK + * * RFC1122 (Host Requirements -- Comm. Layer) Status: * (boy, are there a lot of rules for ICMP) * 3.2.2 (Generic ICMP stuff) @@ -354,6 +359,11 @@ struct socket *icmp_socket=&icmp_inode.u.socket_i; * This function is generic and could be used for other purposes * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. * + * Note that the same dst_entry fields are modified by functions in + * route.c too, but these work for packet destinations while xrlim_allow + * works for icmp destinations. This means the rate limiting information + * for one "ip object" is shared. + * * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate * SHOULD allow setting of rate limits * @@ -369,6 +379,7 @@ int xrlim_allow(struct dst_entry *dst, int timeout) if (dst->rate_tokens > XRLIM_BURST_FACTOR*timeout) dst->rate_tokens = XRLIM_BURST_FACTOR*timeout; if (dst->rate_tokens >= timeout) { + dst->rate_last = now; dst->rate_tokens -= timeout; return 1; } @@ -708,12 +719,10 @@ static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb, int len) hash = iph->protocol & (MAX_INET_PROTOS - 1); if ((raw_sk = raw_v4_htable[hash]) != NULL) { - raw_sk = raw_v4_lookup(raw_sk, iph->protocol, iph->saddr, iph->daddr, skb->dev->ifindex); - while (raw_sk) - { + while ((raw_sk = raw_v4_lookup(raw_sk, iph->protocol, iph->saddr, + iph->daddr, skb->dev->ifindex)) != NULL) { raw_err(raw_sk, skb); - raw_sk = raw_v4_lookup(raw_sk->next, iph->protocol, - iph->saddr, iph->daddr, skb->dev->ifindex); + raw_sk = raw_sk->next; } } @@ -1072,8 +1081,7 @@ static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1] = { /* TIME EXCEEDED (11) */ { &icmp_statistics.IcmpOutTimeExcds, &icmp_statistics.IcmpInTimeExcds, icmp_unreach, 1, &sysctl_icmp_timeexceed_time }, /* PARAMETER PROBLEM (12) */ -/* FIXME: RFC1122 3.2.2.5 - MUST pass PARAM_PROB messages to transport layer */ - { &icmp_statistics.IcmpOutParmProbs, &icmp_statistics.IcmpInParmProbs, icmp_discard, 1, &sysctl_icmp_paramprob_time }, + { &icmp_statistics.IcmpOutParmProbs, &icmp_statistics.IcmpInParmProbs, icmp_unreach, 1, &sysctl_icmp_paramprob_time }, /* TIMESTAMP (13) */ { &icmp_statistics.IcmpOutTimestamps, &icmp_statistics.IcmpInTimestamps, icmp_timestamp, 0, }, /* TIMESTAMP REPLY (14) */ diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 8cd0d5962..b617bc343 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -5,7 +5,7 @@ * * The IP forwarding functionality. * - * Version: $Id: ip_forward.c,v 1.41 1998/08/26 12:03:42 davem Exp $ + * Version: $Id: ip_forward.c,v 1.42 1998/10/03 09:37:19 davem Exp $ * * Authors: see ip.c * @@ -103,8 +103,8 @@ int ip_forward(struct sk_buff *skb) #endif -#ifdef CONFIG_TRANSPARENT_PROXY - if (ip_chk_sock(skb)) +#ifdef CONFIG_IP_TRANSPARENT_PROXY + if (ip_chksock(skb)) goto local_pkt; #endif @@ -271,7 +271,7 @@ skip_call_fw_firewall: ip_send(skb); return 0; -#ifdef CONFIG_TRANSPARENT_PROXY +#ifdef CONFIG_IP_TRANSPARENT_PROXY local_pkt: return ip_local_deliver(skb); #endif diff --git a/net/ipv4/ip_fw.c b/net/ipv4/ip_fw.c index b45457c72..5044e7b45 100644 --- a/net/ipv4/ip_fw.c +++ b/net/ipv4/ip_fw.c @@ -245,7 +245,7 @@ struct ip_chain #endif /* Lock around ip_fw_chains linked list structure */ -spinlock_t ip_fw_lock = SPIN_LOCK_UNLOCKED; +rwlock_t ip_fw_lock = RW_LOCK_UNLOCKED; /* Head of linked list of fw rules */ static struct ip_chain *ip_fw_chains; @@ -531,18 +531,19 @@ ip_fw_domatch(struct ip_fwkernel *f, #ifdef CONFIG_IP_FIREWALL_NETLINK if (f->ipfw.fw_flg & IP_FW_F_NETLINK) { size_t len = min(f->ipfw.fw_outputsize, ntohs(ip->tot_len)) - + sizeof(skb->fwmark) + IFNAMSIZ; + + sizeof(__u32) + sizeof(skb->fwmark) + IFNAMSIZ; struct sk_buff *outskb=alloc_skb(len, GFP_ATOMIC); duprintf("Sending packet out NETLINK (length = %u).\n", (unsigned int)len); if (outskb) { - /* Prepend mark & interface */ + /* Prepend length, mark & interface */ skb_put(outskb, len); - *((__u32 *)outskb->data) = skb->fwmark; - strcpy(outskb->data+sizeof(__u32), rif); - memcpy(outskb->data+sizeof(__u32)+IFNAMSIZ, ip, - len-(sizeof(__u32)+IFNAMSIZ)); + *((__u32 *)outskb->data) = (__u32)len; + *((__u32 *)(outskb->data+sizeof(__u32))) = skb->fwmark; + strcpy(outskb->data+sizeof(__u32)*2, rif); + memcpy(outskb->data+sizeof(__u32)*2+IFNAMSIZ, ip, + len-(sizeof(__u32)*2+IFNAMSIZ)); netlink_broadcast(ipfwsk, outskb, 0, ~0, GFP_KERNEL); } else duprintf("netlink post failed - alloc_skb failed!\n"); @@ -1324,28 +1325,7 @@ int ip_fw_ctl(int cmd, void *m, int len) case IP_FW_MASQ_TIMEOUTS: { #ifdef CONFIG_IP_MASQUERADE - struct ip_fw_masq *masq; - - if (len != sizeof(struct ip_fw_masq)) { - duprintf("ip_fw_ctl (masq): length %d, expected %d\n", - len, sizeof(struct ip_fw_masq)); - ret = EINVAL; - } - else { - masq = (struct ip_fw_masq *)m; - if (masq->tcp_timeout) - ip_masq_expire->tcp_timeout - = masq->tcp_timeout; - - if (masq->tcp_fin_timeout) - ip_masq_expire->tcp_fin_timeout - = masq->tcp_fin_timeout; - - if (masq->udp_timeout) - ip_masq_expire->udp_timeout - = masq->udp_timeout; - ret = 0; - } + ret = ip_fw_masq_timeouts(m, len); #else ret = EINVAL; #endif diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 6a2e4eca5..6488e9d70 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -826,6 +826,10 @@ ipgre_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd) case SIOCADDTUNNEL: case SIOCCHGTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + err = -EFAULT; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) goto done; @@ -859,6 +863,10 @@ ipgre_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd) break; case SIOCDELTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + if (dev == &ipgre_fb_tunnel_dev) { err = -EFAULT; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index e06ad8206..260d178f1 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) module. * - * Version: $Id: ip_input.c,v 1.33 1998/08/26 12:03:47 davem Exp $ + * Version: $Id: ip_input.c,v 1.34 1998/10/03 09:37:23 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -503,6 +503,7 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) { int fwres; u16 rport; + u8 tos = iph->tos; if ((fwres=call_in_firewall(PF_INET, skb->dev, iph, &rport, &skb))<FW_ACCEPT) { if (fwres==FW_REJECT) @@ -514,6 +515,18 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) if (fwres==FW_REDIRECT && (IPCB(skb)->redirport = rport) != 0) return ip_local_deliver(skb); #endif +#ifdef CONFIG_IP_ROUTE_TOS + /* It is for 2.2 only. Firewalling should make smart + rerouting itself, ideally, but now it is too late + to teach it. --ANK (980905) + */ + if (iph->tos != tos && ((struct rtable*)skb->dst)->rt_type == RTN_UNICAST) { + dst_release(skb->dst); + skb->dst = NULL; + if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev)) + goto drop; + } +#endif } #endif diff --git a/net/ipv4/ip_masq.c b/net/ipv4/ip_masq.c index 2a6093583..7a57caeb0 100644 --- a/net/ipv4/ip_masq.c +++ b/net/ipv4/ip_masq.c @@ -4,7 +4,7 @@ * * Copyright (c) 1994 Pauline Middelink * - * Version: @(#)ip_masq.c 0.12 97/11/30 + * $Id: ip_masq.c,v 1.28 1998/11/21 00:33:30 davem Exp $ * * * See ip_fw.c for original log @@ -32,12 +32,26 @@ * Steven Clarke : IP_MASQ_S_xx state design * Juan Jose Ciarlante : IP_MASQ_S state implementation * Juan Jose Ciarlante : xx_get() clears timer, _put() inserts it + * Juan Jose Ciarlante : create /proc/net/ip_masq/ + * Juan Jose Ciarlante : reworked checksums (save payload csum if possible) + * Juan Jose Ciarlante : added missing ip_fw_masquerade checksum + * Juan Jose Ciarlante : csum savings + * Juan Jose Ciarlante : added user-space tunnel creation/del, etc + * Juan Jose Ciarlante : (last) moved to ip_masq_user runtime module + * Juan Jose Ciarlante : user timeout handling again + * Juan Jose Ciarlante : make new modules support optional + * Juan Jose Ciarlante : u-space context => locks reworked + * Juan Jose Ciarlante : fixed stupid SMP locking bug + * Juan Jose Ciarlante : fixed "tap"ing in demasq path by copy-on-w + * Juan Jose Ciarlante : make masq_proto_doff() robust against fake sized/corrupted packets * - * */ #include <linux/config.h> #include <linux/module.h> +#ifdef CONFIG_KMOD +#include <linux/kmod.h> +#endif #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> @@ -55,17 +69,14 @@ #include <net/udp.h> #include <net/checksum.h> #include <net/ip_masq.h> -#include <net/ip_masq_mod.h> -#include <linux/sysctl.h> -#include <linux/ip_fw.h> -#ifdef CONFIG_IP_MASQUERADE_IPAUTOFW -#include <net/ip_autofw.h> -#endif -#ifdef CONFIG_IP_MASQUERADE_IPPORTFW -#include <net/ip_portfw.h> +#ifdef CONFIG_IP_MASQUERADE_MOD +#include <net/ip_masq_mod.h> #endif +#include <linux/sysctl.h> +#include <linux/ip_fw.h> +#include <linux/ip_masq.h> int sysctl_ip_masq_debug = 0; @@ -77,6 +88,8 @@ int ip_masq_get_debug_level(void) return sysctl_ip_masq_debug; } +struct ip_masq_hook *ip_masq_user_hook = NULL; + /* * Timeout table[state] */ @@ -98,7 +111,7 @@ static struct ip_masq_timeout_table masq_timeout_table = { 5*60*HZ, /* IP_MASQ_S_UDP, */ 1*60*HZ, /* IP_MASQ_S_ICMP, */ 2*HZ,/* IP_MASQ_S_LAST */ - }, + }, /* timeout */ }; #define MASQUERADE_EXPIRE_RETRY masq_timeout_table.timeout[IP_MASQ_S_TIME_WAIT] @@ -134,7 +147,7 @@ struct masq_tcp_states_t { int next_state[IP_MASQ_S_LAST]; /* should be _LAST_TCP */ }; -static const char * masq_state_name(int state) +const char * ip_masq_state_name(int state) { if (state >= IP_MASQ_S_LAST) return "ERR!"; @@ -224,8 +237,8 @@ tcp_state_out: th->rst? 'R' : '.', ntohl(ms->saddr), ntohs(ms->sport), ntohl(ms->daddr), ntohs(ms->dport), - masq_state_name(ms->state), - masq_state_name(new_state)); + ip_masq_state_name(ms->state), + ip_masq_state_name(new_state)); return masq_set_state_timeout(ms, new_state); } @@ -235,20 +248,19 @@ tcp_state_out: */ static int masq_set_state(struct ip_masq *ms, int output, struct iphdr *iph, void *tp) { - struct tcphdr *th = tp; switch (iph->protocol) { case IPPROTO_ICMP: return masq_set_state_timeout(ms, IP_MASQ_S_ICMP); case IPPROTO_UDP: return masq_set_state_timeout(ms, IP_MASQ_S_UDP); case IPPROTO_TCP: - return masq_tcp_state(ms, output, th); + return masq_tcp_state(ms, output, tp); } return -1; } /* - * Moves tunnel to listen state + * Set LISTEN timeout. (ip_masq_put will setup timer) */ int ip_masq_listen(struct ip_masq *ms) { @@ -256,8 +268,6 @@ int ip_masq_listen(struct ip_masq *ms) return ms->timeout; } -#define IP_MASQ_TAB_SIZE 256 /* must be power of 2 */ - /* * Dynamic address rewriting */ @@ -266,9 +276,7 @@ extern int sysctl_ip_dynaddr; /* * Lookup lock */ -static struct wait_queue *masq_wait; -atomic_t __ip_masq_lock = ATOMIC_INIT(0); - +rwlock_t __ip_masq_lock = RW_LOCK_UNLOCKED; /* * Implement IP packet masquerading @@ -305,6 +313,9 @@ static __inline__ const __u8 icmp_type_request(__u8 type) * Will cycle in MASQ_PORT boundaries. */ static __u16 masq_port = PORT_MASQ_BEGIN; +#ifdef __SMP__ +static spinlock_t masq_port_lock = SPIN_LOCK_UNLOCKED; +#endif /* * free ports counters (UDP & TCP) @@ -327,26 +338,35 @@ static __u16 masq_port = PORT_MASQ_BEGIN; #define PORT_MASQ_MUL 10 #endif +/* + * At the moment, hardcore in sync with masq_proto_num + */ atomic_t ip_masq_free_ports[3] = { ATOMIC_INIT((PORT_MASQ_END-PORT_MASQ_BEGIN) * PORT_MASQ_MUL),/* UDP */ ATOMIC_INIT((PORT_MASQ_END-PORT_MASQ_BEGIN) * PORT_MASQ_MUL),/* TCP */ ATOMIC_INIT((PORT_MASQ_END-PORT_MASQ_BEGIN) * PORT_MASQ_MUL),/* ICMP */ }; +/* + * Counts entries that have been requested with specific mport. + * Used for incoming packets to "relax" input rule (port in MASQ range). + */ +atomic_t mport_count = ATOMIC_INIT(0); + EXPORT_SYMBOL(ip_masq_get_debug_level); EXPORT_SYMBOL(ip_masq_new); EXPORT_SYMBOL(ip_masq_listen); -/* -EXPORT_SYMBOL(ip_masq_set_expire); -*/ EXPORT_SYMBOL(ip_masq_free_ports); -EXPORT_SYMBOL(ip_masq_expire); EXPORT_SYMBOL(ip_masq_out_get); EXPORT_SYMBOL(ip_masq_in_get); EXPORT_SYMBOL(ip_masq_put); EXPORT_SYMBOL(ip_masq_control_add); EXPORT_SYMBOL(ip_masq_control_del); EXPORT_SYMBOL(ip_masq_control_get); +EXPORT_SYMBOL(ip_masq_user_hook); +EXPORT_SYMBOL(ip_masq_m_tab); +EXPORT_SYMBOL(ip_masq_state_name); +EXPORT_SYMBOL(ip_masq_select_addr); EXPORT_SYMBOL(__ip_masq_lock); /* @@ -360,13 +380,16 @@ struct ip_masq *ip_masq_s_tab[IP_MASQ_TAB_SIZE]; * timeouts */ +#if 000 /* FIXED timeout handling */ static struct ip_fw_masq ip_masq_dummy = { MASQUERADE_EXPIRE_TCP, MASQUERADE_EXPIRE_TCP_FIN, MASQUERADE_EXPIRE_UDP }; +EXPORT_SYMBOL(ip_masq_expire); struct ip_fw_masq *ip_masq_expire = &ip_masq_dummy; +#endif /* @@ -375,7 +398,7 @@ struct ip_fw_masq *ip_masq_expire = &ip_masq_dummy; * Warning: it does not check/delete previous timer! */ -void __ip_masq_set_expire(struct ip_masq *ms, unsigned long tout) +static void __ip_masq_set_expire(struct ip_masq *ms, unsigned long tout) { if (tout) { ms->timer.expires = jiffies+tout; @@ -398,7 +421,7 @@ ip_masq_hash_key(unsigned proto, __u32 addr, __u16 port) /* * Hashes ip_masq by its proto,addrs,ports. - * should be called with masked interrupts. + * should be called with locked tables. * returns bool success. */ @@ -434,7 +457,7 @@ static int ip_masq_hash(struct ip_masq *ms) /* * UNhashes ip_masq from ip_masq_[ms]_tables. - * should be called with masked interrupts. + * should be called with locked tables. * returns bool success. */ @@ -488,16 +511,18 @@ static int ip_masq_unhash(struct ip_masq *ms) * phoenix and get a reply from any other interface(==dst)! * * [Only for UDP] - AC + * + * Caller must lock tables */ -struct ip_masq * __ip_masq_in_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) +static struct ip_masq * __ip_masq_in_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) { unsigned hash; struct ip_masq *ms = NULL; - ip_masq_lock(&__ip_masq_lock, 0); - hash = ip_masq_hash_key(protocol, d_addr, d_port); + + for(ms = ip_masq_m_tab[hash]; ms ; ms = ms->m_link) { if (protocol==ms->protocol && ((s_addr==ms->daddr || ms->flags & IP_MASQ_F_NO_DADDR)) && @@ -521,7 +546,6 @@ struct ip_masq * __ip_masq_in_get(int protocol, __u32 s_addr, __u16 s_port, __u3 d_port); out: - ip_masq_unlock(&__ip_masq_lock, 0); return ms; } @@ -537,9 +561,11 @@ out: * hash is keyed on source port so if the first lookup fails then try again * with a zero port, this time only looking at entries marked "no source * port". + * + * Caller must lock tables */ -struct ip_masq * __ip_masq_out_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) +static struct ip_masq * __ip_masq_out_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) { unsigned hash; struct ip_masq *ms = NULL; @@ -549,8 +575,6 @@ struct ip_masq * __ip_masq_out_get(int protocol, __u32 s_addr, __u16 s_port, __u */ hash = ip_masq_hash_key(protocol, s_addr, s_port); - ip_masq_lock(&__ip_masq_lock, 0); - for(ms = ip_masq_s_tab[hash]; ms ; ms = ms->s_link) { if (protocol == ms->protocol && s_addr == ms->saddr && s_port == ms->sport && @@ -596,7 +620,6 @@ struct ip_masq * __ip_masq_out_get(int protocol, __u32 s_addr, __u16 s_port, __u d_port); out: - ip_masq_unlock(&__ip_masq_lock, 0); return ms; } @@ -604,6 +627,8 @@ out: /* * Returns ip_masq for given proto,m_addr,m_port. * called by allocation routine to find an unused m_port. + * + * Caller must lock tables */ static struct ip_masq * __ip_masq_getbym(int protocol, __u32 m_addr, __u16 m_port) @@ -613,8 +638,6 @@ static struct ip_masq * __ip_masq_getbym(int protocol, __u32 m_addr, __u16 m_por hash = ip_masq_hash_key(protocol, m_addr, m_port); - ip_masq_lock(&__ip_masq_lock, 0); - for(ms = ip_masq_m_tab[hash]; ms ; ms = ms->m_link) { if ( protocol==ms->protocol && (m_addr==ms->maddr && m_port==ms->mport)) { @@ -624,7 +647,6 @@ static struct ip_masq * __ip_masq_getbym(int protocol, __u32 m_addr, __u16 m_por } out: - ip_masq_unlock(&__ip_masq_lock, 0); return ms; } #endif @@ -632,7 +654,11 @@ out: struct ip_masq * ip_masq_out_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) { struct ip_masq *ms; + + read_lock(&__ip_masq_lock); ms = __ip_masq_out_get(protocol, s_addr, s_port, d_addr, d_port); + read_unlock(&__ip_masq_lock); + if (ms) __ip_masq_set_expire(ms, 0); return ms; @@ -641,7 +667,11 @@ struct ip_masq * ip_masq_out_get(int protocol, __u32 s_addr, __u16 s_port, __u32 struct ip_masq * ip_masq_in_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) { struct ip_masq *ms; + + read_lock(&__ip_masq_lock); ms = __ip_masq_in_get(protocol, s_addr, s_port, d_addr, d_port); + read_unlock(&__ip_masq_lock); + if (ms) __ip_masq_set_expire(ms, 0); return ms; @@ -685,8 +715,9 @@ static void masq_expire(unsigned long data) masq_proto_name(ms->protocol), ntohl(ms->saddr),ntohs(ms->sport)); - ip_masq_lock(&__ip_masq_lock, 1); + write_lock(&__ip_masq_lock); +#if 0000 /* * Already locked, do bounce ... */ @@ -694,6 +725,7 @@ static void masq_expire(unsigned long data) goto masq_expire_later; } +#endif /* * do I control anybody? */ @@ -708,8 +740,11 @@ static void masq_expire(unsigned long data) ip_masq_control_del(ms); if (ip_masq_unhash(ms)) { - if (!(ms->flags&IP_MASQ_F_MPORT)) + if (ms->flags&IP_MASQ_F_MPORT) { + atomic_dec(&mport_count); + } else { atomic_inc(ip_masq_free_ports + masq_proto_num(ms->protocol)); + } ip_masq_unbind_app(ms); } @@ -718,28 +753,45 @@ static void masq_expire(unsigned long data) */ if (atomic_read(&ms->refcnt) == 1) { kfree_s(ms,sizeof(*ms)); + MOD_DEC_USE_COUNT; goto masq_expire_out; } masq_expire_later: - IP_MASQ_DEBUG(0, "masq_expire delayed: %s %08lX:%04X->%08lX:%04X nlocks-1=%d masq.refcnt-1=%d masq.n_control=%d\n", + IP_MASQ_DEBUG(0, "masq_expire delayed: %s %08lX:%04X->%08lX:%04X masq.refcnt-1=%d masq.n_control=%d\n", masq_proto_name(ms->protocol), ntohl(ms->saddr), ntohs(ms->sport), ntohl(ms->daddr), ntohs(ms->dport), - ip_masq_nlocks(&__ip_masq_lock)-1, atomic_read(&ms->refcnt)-1, atomic_read(&ms->n_control)); ip_masq_put(ms); masq_expire_out: - ip_masq_unlock(&__ip_masq_lock, 1); + write_unlock(&__ip_masq_lock); +} + +static __u16 get_next_mport(void) +{ + __u16 mport; + + spin_lock_irq(&masq_port_lock); + /* + * Try the next available port number + */ + mport = htons(masq_port++); + if (masq_port==PORT_MASQ_END) masq_port = PORT_MASQ_BEGIN; + + spin_unlock_irq(&masq_port_lock); + return mport; } /* * Create a new masquerade list entry, also allocate an * unused mport, keeping the portnumber between the * given boundaries MASQ_BEGIN and MASQ_END. + * + * Be careful, it can be called from u-space */ struct ip_masq * ip_masq_new(int proto, __u32 maddr, __u16 mport, __u32 saddr, __u16 sport, __u32 daddr, __u16 dport, unsigned mflags) @@ -748,6 +800,7 @@ struct ip_masq * ip_masq_new(int proto, __u32 maddr, __u16 mport, __u32 saddr, _ int ports_tried; atomic_t *free_ports_p = NULL; static int n_fails = 0; + int prio; if (masq_proto_num(proto)!=-1 && mport == 0) { @@ -760,13 +813,17 @@ struct ip_masq * ip_masq_new(int proto, __u32 maddr, __u16 mport, __u32 saddr, _ return NULL; } } - ms = (struct ip_masq *) kmalloc(sizeof(struct ip_masq), GFP_ATOMIC); + + prio = (mflags&IP_MASQ_F_USER) ? GFP_KERNEL : GFP_ATOMIC; + + ms = (struct ip_masq *) kmalloc(sizeof(struct ip_masq), prio); if (ms == NULL) { if (++n_fails < 5) IP_MASQ_ERR("ip_masq_new(proto=%s): no memory available.\n", masq_proto_name(proto)); return NULL; } + MOD_INC_USE_COUNT; memset(ms, 0, sizeof(*ms)); init_timer(&ms->timer); ms->timer.data = (unsigned long)ms; @@ -805,22 +862,33 @@ struct ip_masq * ip_masq_new(int proto, __u32 maddr, __u16 mport, __u32 saddr, _ /* * Check 5-upla uniqueness */ - ip_masq_lock(&__ip_masq_lock, 1); + if (mflags & IP_MASQ_F_USER) + write_lock_bh(&__ip_masq_lock); + else + write_lock(&__ip_masq_lock); mst = __ip_masq_in_get(proto, daddr, dport, maddr, mport); if (mst==NULL) { ms->flags |= IP_MASQ_F_MPORT; + atomic_inc(&mport_count); ip_masq_hash(ms); - ip_masq_unlock(&__ip_masq_lock, 1); + + if (mflags & IP_MASQ_F_USER) + write_unlock_bh(&__ip_masq_lock); + else + write_unlock(&__ip_masq_lock); ip_masq_bind_app(ms); atomic_inc(&ms->refcnt); masq_set_state_timeout(ms, IP_MASQ_S_NONE); return ms; } + if (mflags & IP_MASQ_F_USER) + write_unlock_bh(&__ip_masq_lock); + else + write_unlock(&__ip_masq_lock); - ip_masq_unlock(&__ip_masq_lock, 1); __ip_masq_put(mst); IP_MASQ_ERR( "Already used connection: %s, %d.%d.%d.%d:%d => %d.%d.%d.%d:%d, called from %p\n", @@ -838,20 +906,15 @@ struct ip_masq * ip_masq_new(int proto, __u32 maddr, __u16 mport, __u32 saddr, _ (atomic_read(free_ports_p) && (ports_tried <= (PORT_MASQ_END - PORT_MASQ_BEGIN))); ports_tried++){ - cli(); - /* - * Try the next available port number - */ - mport = ms->mport = htons(masq_port++); - if (masq_port==PORT_MASQ_END) masq_port = PORT_MASQ_BEGIN; - - sti(); - + mport = ms->mport = get_next_mport(); /* * lookup to find out if this connection is used. */ - ip_masq_lock(&__ip_masq_lock, 1); + if (mflags & IP_MASQ_F_USER) + write_lock_bh(&__ip_masq_lock); + else + write_lock(&__ip_masq_lock); #ifdef CONFIG_IP_MASQUERADE_NREUSE mst = __ip_masq_getbym(proto, maddr, mport); @@ -861,12 +924,20 @@ struct ip_masq * ip_masq_new(int proto, __u32 maddr, __u16 mport, __u32 saddr, _ if (mst == NULL) { if (atomic_read(free_ports_p) == 0) { - ip_masq_unlock(&__ip_masq_lock, 1); + if (mflags & IP_MASQ_F_USER) + write_unlock_bh(&__ip_masq_lock); + else + write_unlock(&__ip_masq_lock); + break; } atomic_dec(free_ports_p); ip_masq_hash(ms); - ip_masq_unlock(&__ip_masq_lock, 1); + + if (mflags & IP_MASQ_F_USER) + write_unlock_bh(&__ip_masq_lock); + else + write_unlock(&__ip_masq_lock); ip_masq_bind_app(ms); n_fails = 0; @@ -874,7 +945,11 @@ struct ip_masq * ip_masq_new(int proto, __u32 maddr, __u16 mport, __u32 saddr, _ masq_set_state_timeout(ms, IP_MASQ_S_NONE); return ms; } - ip_masq_unlock(&__ip_masq_lock, 1); + if (mflags & IP_MASQ_F_USER) + write_unlock_bh(&__ip_masq_lock); + else + write_unlock(&__ip_masq_lock); + __ip_masq_put(mst); } @@ -884,48 +959,133 @@ struct ip_masq * ip_masq_new(int proto, __u32 maddr, __u16 mport, __u32 saddr, _ atomic_read(free_ports_p)); mport_nono: kfree_s(ms, sizeof(*ms)); + + MOD_DEC_USE_COUNT; return NULL; } -static void recalc_check(struct udphdr *uh, __u32 saddr, - __u32 daddr, int len) +/* + * Get transport protocol data offset, check against size + */ +static __inline__ int proto_doff(unsigned proto, char *th, unsigned size) { - uh->check=0; - uh->check=csum_tcpudp_magic(saddr,daddr,len, - IPPROTO_UDP, csum_partial((char *)uh,len,0)); - if(uh->check==0) - uh->check=0xFFFF; + int ret = -1; + switch (proto) { + case IPPROTO_ICMP: + if (size >= sizeof(struct icmphdr)) + ret = sizeof(struct icmphdr); + break; + case IPPROTO_UDP: + if (size >= sizeof(struct udphdr)) + ret = sizeof(struct udphdr); + break; + case IPPROTO_TCP: + /* + * Is this case, this check _also_ avoids + * touching an invalid pointer if + * size is invalid + */ + if (size >= sizeof(struct tcphdr)) { + ret = ((struct tcphdr*)th)->doff << 2; + if (ret > size) { + ret = -1 ; + } + } + + break; + } + if (ret < 0) + IP_MASQ_DEBUG(0, "mess proto_doff for proto=%d, size =%d\n", + proto, size); + return ret; } -int ip_fw_masquerade(struct sk_buff **skb_ptr, __u32 maddr) +int ip_fw_masquerade(struct sk_buff **skb_p, __u32 maddr) { - struct sk_buff *skb=*skb_ptr; + struct sk_buff *skb = *skb_p; struct iphdr *iph = skb->nh.iph; - __u16 *portptr; + union ip_masq_tphdr h; struct ip_masq *ms; int size; + /* + * doff holds transport protocol data offset + * csum holds its checksum + * csum_ok says if csum is valid + */ + int doff = 0; + int csum = 0; + int csum_ok = 0; + /* - * We can only masquerade protocols with ports... - * [TODO] - * We may need to consider masq-ing some ICMP related to masq-ed protocols + * We can only masquerade protocols with ports... and hack some ICMPs */ - if (iph->protocol==IPPROTO_ICMP) - return (ip_fw_masq_icmp(skb_ptr, maddr)); + h.raw = (char*) iph + iph->ihl * 4; + size = ntohs(iph->tot_len) - (iph->ihl * 4); - if (iph->protocol!=IPPROTO_UDP && iph->protocol!=IPPROTO_TCP) + doff = proto_doff(iph->protocol, h.raw, size); + if (doff < 0) { + IP_MASQ_DEBUG(0, "O-pkt invalid packet data size\n"); return -1; + } + switch (iph->protocol) { + case IPPROTO_ICMP: + return(ip_fw_masq_icmp(skb_p, maddr)); + case IPPROTO_UDP: + if (h.uh->check == 0) + /* No UDP checksum */ + break; + case IPPROTO_TCP: + /* Make sure packet is in the masq range */ + IP_MASQ_DEBUG(3, "O-pkt: %s size=%d\n", + masq_proto_name(iph->protocol), + size); +#ifdef CONFIG_IP_MASQ_DEBUG + if (ip_masq_get_debug_level() > 3) { + skb->ip_summed = CHECKSUM_NONE; + } +#endif + /* Check that the checksum is OK */ + switch (skb->ip_summed) + { + case CHECKSUM_NONE: + { + csum = csum_partial(h.raw + doff, size - doff, 0); + IP_MASQ_DEBUG(3, "O-pkt: %s I-datacsum=%d\n", + masq_proto_name(iph->protocol), + csum); + + skb->csum = csum_partial(h.raw , doff, csum); + } + case CHECKSUM_HW: + if (csum_tcpudp_magic(iph->saddr, iph->daddr, + size, iph->protocol, skb->csum)) + { + IP_MASQ_DEBUG(0, "Outgoing failed %s checksum from %d.%d.%d.%d (size=%d)!\n", + masq_proto_name(iph->protocol), + NIPQUAD(iph->saddr), + size); + return -1; + } + default: + /* CHECKSUM_UNNECESSARY */ + } + break; + default: + return -1; + } /* * Now hunt the list to see if we have an old entry */ - portptr = (__u16 *)&(((char *)iph)[iph->ihl*4]); + /* h.raw = (char*) iph + iph->ihl * 4; */ + IP_MASQ_DEBUG(2, "Outgoing %s %08lX:%04X -> %08lX:%04X\n", masq_proto_name(iph->protocol), - ntohl(iph->saddr), ntohs(portptr[0]), - ntohl(iph->daddr), ntohs(portptr[1])); + ntohl(iph->saddr), ntohs(h.portp[0]), + ntohl(iph->daddr), ntohs(h.portp[1])); ms = ip_masq_out_get_iph(iph); if (ms!=NULL) { @@ -942,13 +1102,13 @@ int ip_fw_masquerade(struct sk_buff **skb_ptr, __u32 maddr) NIPQUAD(ms->maddr),NIPQUAD(maddr)); } - ip_masq_lock(&__ip_masq_lock, 1); + write_lock(&__ip_masq_lock); ip_masq_unhash(ms); ms->maddr = maddr; ip_masq_hash(ms); - ip_masq_unlock(&__ip_masq_lock, 1); + write_unlock(&__ip_masq_lock); } /* @@ -960,13 +1120,13 @@ int ip_fw_masquerade(struct sk_buff **skb_ptr, __u32 maddr) if ( ms->flags & IP_MASQ_F_NO_SPORT && ms->protocol == IPPROTO_TCP ) { ms->flags &= ~IP_MASQ_F_NO_SPORT; - ip_masq_lock(&__ip_masq_lock, 1); + write_lock(&__ip_masq_lock); ip_masq_unhash(ms); - ms->sport = portptr[0]; + ms->sport = h.portp[0]; ip_masq_hash(ms); /* hash on new sport */ - ip_masq_unlock(&__ip_masq_lock, 1); + write_unlock(&__ip_masq_lock); IP_MASQ_DEBUG(1, "ip_fw_masquerade(): filled sport=%d\n", ntohs(ms->sport)); @@ -976,68 +1136,113 @@ int ip_fw_masquerade(struct sk_buff **skb_ptr, __u32 maddr) * Nope, not found, create a new entry for it */ - if (!(ms = ip_masq_mod_out_create(iph, portptr, maddr))) +#ifdef CONFIG_IP_MASQUERADE_MOD + if (!(ms = ip_masq_mod_out_create(skb, iph, maddr))) +#endif ms = ip_masq_new(iph->protocol, maddr, 0, - iph->saddr, portptr[0], - iph->daddr, portptr[1], + iph->saddr, h.portp[0], + iph->daddr, h.portp[1], 0); if (ms == NULL) return -1; } - ip_masq_mod_out_update(iph, portptr, ms); + /* + * Call module's output update hook + */ + +#ifdef CONFIG_IP_MASQUERADE_MOD + ip_masq_mod_out_update(skb, iph, ms); +#endif /* * Change the fragments origin */ - size = skb->len - ((unsigned char *)portptr - skb->nh.raw); + size = skb->len - (h.raw - skb->nh.raw); + /* * Set iph addr and port from ip_masq obj. */ iph->saddr = ms->maddr; - portptr[0] = ms->mport; + h.portp[0] = ms->mport; + + /* + * Invalidate csum saving if tunnel has masq helper + */ + + if (ms->app) + csum_ok = 0; /* * Attempt ip_masq_app call. * will fix ip_masq and iph seq stuff */ - if (ip_masq_app_pkt_out(ms, skb_ptr, maddr) != 0) + if (ip_masq_app_pkt_out(ms, skb_p, maddr) != 0) { /* * skb has possibly changed, update pointers. */ - skb = *skb_ptr; + skb = *skb_p; iph = skb->nh.iph; - portptr = (__u16 *)&(((char *)iph)[iph->ihl*4]); - size = skb->len - ((unsigned char *)portptr-skb->nh.raw); + h.raw = (char*) iph + iph->ihl *4; + size = skb->len - (h.raw - skb->nh.raw); + /* doff should have not changed */ } /* * Adjust packet accordingly to protocol */ - if (iph->protocol == IPPROTO_UDP) - { - recalc_check((struct udphdr *)portptr,iph->saddr,iph->daddr,size); - } else { - struct tcphdr *th = (struct tcphdr *)portptr; + /* + * Transport's payload partial csum + */ + if (!csum_ok) { + csum = csum_partial(h.raw + doff, size - doff, 0); + } + skb->csum = csum; - skb->csum = csum_partial((void *)(th + 1), size - sizeof(*th), 0); - th->check = 0; - th->check = tcp_v4_check(th, size, iph->saddr, iph->daddr, - csum_partial((char *)th, sizeof(*th), - skb->csum)); - } + IP_MASQ_DEBUG(3, "O-pkt: %s size=%d O-datacsum=%d\n", + masq_proto_name(iph->protocol), + size, + csum); + /* + * Protocol csum + */ + switch (iph->protocol) { + case IPPROTO_TCP: + h.th->check = 0; + h.th->check=csum_tcpudp_magic(iph->saddr, iph->daddr, + size, iph->protocol, + csum_partial(h.raw , doff, csum)); + IP_MASQ_DEBUG(3, "O-pkt: %s O-csum=%d (+%d)\n", + masq_proto_name(iph->protocol), + h.th->check, + (char*) & (h.th->check) - (char*) h.raw); + + break; + case IPPROTO_UDP: + h.uh->check = 0; + h.uh->check=csum_tcpudp_magic(iph->saddr, iph->daddr, + size, iph->protocol, + csum_partial(h.raw , doff, csum)); + if (h.uh->check == 0) + h.uh->check = 0xFFFF; + IP_MASQ_DEBUG(3, "O-pkt: %s O-csum=%d (+%d)\n", + masq_proto_name(iph->protocol), + h.uh->check, + (char*) &(h.uh->check)- (char*) h.raw); + break; + } ip_send_check(iph); IP_MASQ_DEBUG(2, "O-routed from %08lX:%04X with masq.addr %08lX\n", ntohl(ms->maddr),ntohs(ms->mport),ntohl(maddr)); - masq_set_state(ms, 1, iph, portptr); + masq_set_state(ms, 1, iph, h.portp); ip_masq_put(ms); return 0; @@ -1106,13 +1311,13 @@ int ip_fw_masq_icmp(struct sk_buff **skb_p, __u32 maddr) NIPQUAD(ms->maddr), NIPQUAD(maddr)); } - ip_masq_lock(&__ip_masq_lock, 1); + write_lock(&__ip_masq_lock); ip_masq_unhash(ms); ms->maddr = maddr; ip_masq_hash(ms); - ip_masq_unlock(&__ip_masq_lock, 1); + write_unlock(&__ip_masq_lock); } iph->saddr = ms->maddr; @@ -1166,11 +1371,13 @@ int ip_fw_masq_icmp(struct sk_buff **skb_p, __u32 maddr) ntohs(icmp_id(cicmph)), cicmph->type); + read_lock(&__ip_masq_lock); ms = __ip_masq_out_get(ciph->protocol, ciph->daddr, icmp_id(cicmph), ciph->saddr, icmp_hv_rep(cicmph)); + read_unlock(&__ip_masq_lock); if (ms == NULL) return 0; @@ -1239,11 +1446,13 @@ int ip_fw_masq_icmp(struct sk_buff **skb_p, __u32 maddr) /* This is pretty much what __ip_masq_in_get_iph() does */ ms = __ip_masq_in_get(ciph->protocol, ciph->saddr, pptr[0], ciph->daddr, pptr[1]); #endif + read_lock(&__ip_masq_lock); ms = __ip_masq_out_get(ciph->protocol, ciph->daddr, pptr[1], ciph->saddr, pptr[0]); + read_unlock(&__ip_masq_lock); if (ms == NULL) return 0; @@ -1274,6 +1483,30 @@ int ip_fw_masq_icmp(struct sk_buff **skb_p, __u32 maddr) return 1; } + +/* + * Own skb_cow() beast, tweaked for rewriting commonly + * used pointers in masq code + */ +static struct sk_buff * masq_skb_cow(struct sk_buff **skb_p, + struct iphdr **iph_p, unsigned char **t_p) { + struct sk_buff *skb=(*skb_p); + if (skb_cloned(skb)) { + skb = skb_copy(skb, GFP_ATOMIC); + if (skb) { + /* + * skb changed, update other pointers + */ + struct iphdr *iph = skb->nh.iph; + kfree_skb(*skb_p); + *skb_p = skb; + *iph_p = iph; + *t_p = (char*) iph + iph->ihl * 4; + } + } + return skb; +} + /* * Handle ICMP messages in reverse (demasquerade) direction. * Find any that might be relevant, check against existing connections, @@ -1323,6 +1556,11 @@ int ip_fw_demasq_icmp(struct sk_buff **skb_p) */ ms->flags &= ~IP_MASQ_F_NO_REPLY; + if ((skb=masq_skb_cow(skb_p, &iph, (unsigned char**)&icmph)) == NULL) { + ip_masq_put(ms); + return -1; + } + /* Reset source address */ iph->daddr = ms->saddr; /* Redo IP header checksum */ @@ -1378,15 +1616,23 @@ int ip_fw_demasq_icmp(struct sk_buff **skb_p) ntohs(icmp_id(cicmph)), cicmph->type); + read_lock(&__ip_masq_lock); ms = __ip_masq_in_get(ciph->protocol, ciph->daddr, icmp_hv_req(cicmph), ciph->saddr, icmp_id(cicmph)); + read_unlock(&__ip_masq_lock); if (ms == NULL) return 0; + if ((skb=masq_skb_cow(skb_p, &iph, (unsigned char**)&icmph)) == NULL) { + __ip_masq_put(ms); + return -1; + } + ciph = (struct iphdr *) (icmph + 1); + /* Now we do real damage to this packet...! */ /* First change the dest IP address, and recalc checksum */ iph->daddr = ms->saddr; @@ -1445,15 +1691,23 @@ int ip_fw_demasq_icmp(struct sk_buff **skb_p) /* This is pretty much what __ip_masq_in_get_iph() does, except params are wrong way round */ + read_lock(&__ip_masq_lock); ms = __ip_masq_in_get(ciph->protocol, ciph->daddr, pptr[1], ciph->saddr, pptr[0]); + read_unlock(&__ip_masq_lock); if (ms == NULL) return 0; + if ((skb=masq_skb_cow(skb_p, &iph, (unsigned char**)&icmph)) == NULL) { + __ip_masq_put(ms); + return -1; + } + ciph = (struct iphdr *) (icmph + 1); + /* Now we do real damage to this packet...! */ /* First change the dest IP address, and recalc checksum */ iph->daddr = ms->saddr; @@ -1480,7 +1734,6 @@ int ip_fw_demasq_icmp(struct sk_buff **skb_p) return 1; } - /* * Check if it's an masqueraded port, look it up, * and send it on its way... @@ -1492,44 +1745,86 @@ int ip_fw_demasq_icmp(struct sk_buff **skb_p) int ip_fw_demasquerade(struct sk_buff **skb_p) { - struct sk_buff *skb = *skb_p; - struct iphdr *iph = skb->nh.iph; - __u16 *portptr; - struct ip_masq *ms; - unsigned short len; - + struct sk_buff *skb = *skb_p; + struct iphdr *iph = skb->nh.iph; + union ip_masq_tphdr h; + struct ip_masq *ms; + unsigned short size; + int doff = 0; + int csum = 0; + int csum_ok = 0; __u32 maddr; - maddr = iph->daddr; + /* + * Big tappo: only PACKET_HOST (nor loopback neither mcasts) + * ... don't know why 1st test DOES NOT include 2nd (?) + */ + + if (skb->pkt_type != PACKET_HOST || skb->dev == &loopback_dev) { + IP_MASQ_DEBUG(2, "ip_fw_demasquerade(): packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", + skb->pkt_type, + iph->protocol, + NIPQUAD(iph->daddr)); + return 0; + } + + h.raw = (char*) iph + iph->ihl * 4; + /* + * IP payload size + */ + size = ntohs(iph->tot_len) - (iph->ihl * 4); + + doff = proto_doff(iph->protocol, h.raw, size); + if (doff < 0) { + IP_MASQ_DEBUG(0, "I-pkt invalid packet data size\n"); + return -1; + } + + maddr = iph->daddr; switch (iph->protocol) { case IPPROTO_ICMP: return(ip_fw_demasq_icmp(skb_p)); case IPPROTO_TCP: case IPPROTO_UDP: - /* Make sure packet is in the masq range */ - portptr = (__u16 *)&(((char *)iph)[iph->ihl*4]); - if ((ntohs(portptr[1]) < PORT_MASQ_BEGIN - || ntohs(portptr[1]) > PORT_MASQ_END) - && (ip_masq_mod_in_rule(iph, portptr) != 1)) + /* + * Make sure packet is in the masq range + * ... or some mod-ule relaxes input range + * ... or there is still some `special' mport opened + */ + if ((ntohs(h.portp[1]) < PORT_MASQ_BEGIN + || ntohs(h.portp[1]) > PORT_MASQ_END) +#ifdef CONFIG_IP_MASQUERADE_MOD + && (ip_masq_mod_in_rule(skb, iph) != 1) +#endif + && atomic_read(&mport_count) == 0 ) return 0; /* Check that the checksum is OK */ - len = ntohs(iph->tot_len) - (iph->ihl * 4); - if ((iph->protocol == IPPROTO_UDP) && (portptr[3] == 0)) + if ((iph->protocol == IPPROTO_UDP) && (h.uh->check == 0)) /* No UDP checksum */ break; +#ifdef CONFIG_IP_MASQ_DEBUG + if (ip_masq_get_debug_level() > 3) { + skb->ip_summed = CHECKSUM_NONE; + } +#endif switch (skb->ip_summed) { case CHECKSUM_NONE: - skb->csum = csum_partial((char *)portptr, len, 0); + csum = csum_partial(h.raw + doff, size - doff, 0); + csum_ok++; + skb->csum = csum_partial(h.raw , doff, csum); + case CHECKSUM_HW: - if (csum_tcpudp_magic(iph->saddr, iph->daddr, len, - iph->protocol, skb->csum)) + if (csum_tcpudp_magic(iph->saddr, iph->daddr, + size, iph->protocol, skb->csum)) { - IP_MASQ_DEBUG(2, "failed TCP/UDP checksum from %d.%d.%d.%d!\n", - NIPQUAD(iph->saddr)); + IP_MASQ_DEBUG(0, "Incoming failed %s checksum from %d.%d.%d.%d (size=%d)!\n", + masq_proto_name(iph->protocol), + NIPQUAD(iph->saddr), + size); return -1; } default: @@ -1544,8 +1839,8 @@ int ip_fw_demasquerade(struct sk_buff **skb_p) IP_MASQ_DEBUG(2, "Incoming %s %08lX:%04X -> %08lX:%04X\n", masq_proto_name(iph->protocol), - ntohl(iph->saddr), ntohs(portptr[0]), - ntohl(iph->daddr), ntohs(portptr[1])); + ntohl(iph->saddr), ntohs(h.portp[0]), + ntohl(iph->daddr), ntohs(h.portp[1])); /* * reroute to original host:port if found... @@ -1553,10 +1848,19 @@ int ip_fw_demasquerade(struct sk_buff **skb_p) ms = ip_masq_in_get_iph(iph); + /* + * Give additional modules a chance to create an entry + */ +#ifdef CONFIG_IP_MASQUERADE_MOD if (!ms) - ms = ip_masq_mod_in_create(iph, portptr, maddr); + ms = ip_masq_mod_in_create(skb, iph, maddr); + + /* + * Call module's input update hook + */ + ip_masq_mod_in_update(skb, iph, ms); +#endif - ip_masq_mod_in_update(iph, portptr, ms); if (ms != NULL) { @@ -1572,7 +1876,7 @@ int ip_fw_demasquerade(struct sk_buff **skb_p) if ( ms->flags & IP_MASQ_F_NO_DPORT ) { /* && ms->protocol == IPPROTO_TCP ) { */ ms->flags &= ~IP_MASQ_F_NO_DPORT; - ms->dport = portptr[0]; + ms->dport = h.portp[0]; IP_MASQ_DEBUG(1, "ip_fw_demasquerade(): filled dport=%d\n", ntohs(ms->dport)); @@ -1582,12 +1886,23 @@ int ip_fw_demasquerade(struct sk_buff **skb_p) ms->flags &= ~IP_MASQ_F_NO_DADDR; ms->daddr = iph->saddr; - IP_MASQ_DEBUG(1, "ip_fw_demasquerade(): filled daddr=%X\n", - ntohs(ms->daddr)); + IP_MASQ_DEBUG(1, "ip_fw_demasquerade(): filled daddr=%lX\n", + ntohl(ms->daddr)); } + if ((skb=masq_skb_cow(skb_p, &iph, &h.raw)) == NULL) { + ip_masq_put(ms); + return -1; + } iph->daddr = ms->saddr; - portptr[1] = ms->sport; + h.portp[1] = ms->sport; + + /* + * Invalidate csum saving if tunnel has masq helper + */ + + if (ms->app) + csum_ok = 0; /* * Attempt ip_masq_app call. @@ -1602,34 +1917,47 @@ int ip_fw_demasquerade(struct sk_buff **skb_p) skb = *skb_p; iph = skb->nh.iph; - portptr = (__u16 *)&(((char *)iph)[iph->ihl*4]); - len = ntohs(iph->tot_len) - (iph->ihl * 4); + h.raw = (char*) iph + iph->ihl*4; + size = ntohs(iph->tot_len) - (iph->ihl * 4); } /* - * Yug! adjust UDP/TCP and IP checksums, also update - * timeouts. - * If a TCP RST is seen collapse the tunnel (by using short timeout)! - */ - if (iph->protocol == IPPROTO_UDP) { - recalc_check((struct udphdr *)portptr,iph->saddr,iph->daddr,len); - } else { - struct tcphdr *th = (struct tcphdr *)portptr; - skb->csum = csum_partial((void *)(th + 1), - len - sizeof(struct tcphdr), 0); + * Yug! adjust UDP/TCP checksums + */ - th->check = 0; - th->check = tcp_v4_check(th, len, iph->saddr, iph->daddr, - csum_partial((char *)th, - sizeof(*th), - skb->csum)); + /* + * Transport's payload partial csum + */ - } + if (!csum_ok) { + csum = csum_partial(h.raw + doff, size - doff, 0); + } + skb->csum = csum; + + /* + * Protocol csum + */ + switch (iph->protocol) { + case IPPROTO_TCP: + h.th->check = 0; + h.th->check=csum_tcpudp_magic(iph->saddr, iph->daddr, + size, iph->protocol, + csum_partial(h.raw , doff, csum)); + break; + case IPPROTO_UDP: + h.uh->check = 0; + h.uh->check=csum_tcpudp_magic(iph->saddr, iph->daddr, + size, iph->protocol, + csum_partial(h.raw , doff, csum)); + if (h.uh->check == 0) + h.uh->check = 0xFFFF; + break; + } ip_send_check(iph); - IP_MASQ_DEBUG(2, "I-routed to %08lX:%04X\n",ntohl(iph->daddr),ntohs(portptr[1])); + IP_MASQ_DEBUG(2, "I-routed to %08lX:%04X\n",ntohl(iph->daddr),ntohs(h.portp[1])); - masq_set_state (ms, 0, iph, portptr); + masq_set_state (ms, 0, iph, h.portp); ip_masq_put(ms); return 1; @@ -1683,6 +2011,7 @@ struct ip_masq * ip_masq_control_get(struct ip_masq *ms) return ms->control; } + #ifdef CONFIG_PROC_FS /* * /proc/net entries @@ -1697,7 +2026,6 @@ static int ip_msqhst_procinfo(char *buffer, char **start, off_t offset, int idx = 0; int len=0; - ip_masq_lockz(&__ip_masq_lock, &masq_wait, 0); if (offset < 128) { @@ -1710,12 +2038,21 @@ static int ip_msqhst_procinfo(char *buffer, char **start, off_t offset, } pos = 128; - for(idx = 0; idx < IP_MASQ_TAB_SIZE; idx++) + for(idx = 0; idx < IP_MASQ_TAB_SIZE; idx++) + { + /* + * Lock is actually only need in next loop + * we are called from uspace: must stop bh. + */ + read_lock_bh(&__ip_masq_lock); + for(ms = ip_masq_m_tab[idx]; ms ; ms = ms->m_link) { pos += 128; - if (pos <= offset) + if (pos <= offset) { + len = 0; continue; + } /* * We have locked the tables, no need to del/add timers @@ -1733,12 +2070,17 @@ static int ip_msqhst_procinfo(char *buffer, char **start, off_t offset, ms->timer.expires-jiffies); len += sprintf(buffer+len, "%-127s\n", temp); - if(len >= length) + if(len >= length) { + + read_unlock_bh(&__ip_masq_lock); goto done; + } } + read_unlock_bh(&__ip_masq_lock); + + } done: - ip_masq_unlockz(&__ip_masq_lock, &masq_wait, 0); begin = len - (pos - offset); *start = buffer + begin; @@ -1748,82 +2090,173 @@ done: return len; } -static int ip_masq_procinfo(char *buffer, char **start, off_t offset, - int length, int unused) +#endif + +/* + * Timeouts handling by ipfwadm/ipchains + * From ip_fw.c + */ + +int ip_fw_masq_timeouts(void *m, int len) { - off_t pos=0, begin; - struct ip_masq *ms; - char temp[129]; - int idx = 0; - int len=0; + struct ip_fw_masq *masq; + int ret = EINVAL; - ip_masq_lockz(&__ip_masq_lock, &masq_wait, 0); + if (len != sizeof(struct ip_fw_masq)) { + IP_MASQ_DEBUG(1, "ip_fw_masq_timeouts: length %d, expected %d\n", + len, sizeof(struct ip_fw_masq)); + } else { + masq = (struct ip_fw_masq *)m; + if (masq->tcp_timeout) + masq_timeout_table.timeout[IP_MASQ_S_ESTABLISHED] + = masq->tcp_timeout; + + if (masq->tcp_fin_timeout) + masq_timeout_table.timeout[IP_MASQ_S_FIN_WAIT] + = masq->tcp_fin_timeout; + + if (masq->udp_timeout) + masq_timeout_table.timeout[IP_MASQ_S_UDP] + = masq->udp_timeout; + ret = 0; + } + return ret; +} +/* + * Module autoloading stuff + */ - if (offset < 128) - { - sprintf(temp, - "Prot SrcIP SPrt DstIP DPrt MAddr MPrt State Ref Ctl Expires (free=%d,%d,%d)", - atomic_read(ip_masq_free_ports), - atomic_read(ip_masq_free_ports+1), - atomic_read(ip_masq_free_ports+2)); - len = sprintf(buffer, "%-127s\n", temp); +static int ip_masq_user_check_hook(void) { +#ifdef CONFIG_KMOD + if (ip_masq_user_hook == NULL) { + IP_MASQ_DEBUG(1, "About to request \"ip_masq_user\" module\n"); + request_module("ip_masq_user"); } - pos = 128; +#endif /* CONFIG_KMOD */ + return (ip_masq_user_hook != NULL); +} - for(idx = 0; idx < IP_MASQ_TAB_SIZE; idx++) - for(ms = ip_masq_m_tab[idx]; ms ; ms = ms->m_link) - { - pos += 128; - if (pos <= offset) - continue; +/* + * user module hook- info + */ +static int ip_masq_user_info(char *buffer, char **start, off_t offset, + int len, int *eof, void *data) +{ + int ret = -ENOPKG; + if (ip_masq_user_check_hook()) { + ret = ip_masq_user_hook->info(buffer, start, offset, len, (int) data); + } + return ret; +} - /* - * We have locked the tables, no need to del/add timers - * nor cli() 8) - */ +/* + * user module hook- entry mgmt + */ +static int ip_masq_user_ctl(int optname, void *arg, int arglen) +{ + int ret = -ENOPKG; + if (ip_masq_user_check_hook()) { + ret = ip_masq_user_hook->ctl(optname, arg, arglen); + } + return ret; +} - sprintf(temp,"%-4s %08lX:%04X %08lX:%04X %08lX:%04X %-12s %3d %3d %7lu", - masq_proto_name(ms->protocol), - ntohl(ms->saddr), ntohs(ms->sport), - ntohl(ms->daddr), ntohs(ms->dport), - ntohl(ms->maddr), ntohs(ms->mport), - masq_state_name(ms->state), - atomic_read(&ms->refcnt), - atomic_read(&ms->n_control), - (ms->timer.expires-jiffies)/HZ); - len += sprintf(buffer+len, "%-127s\n", temp); +/* + * Control from ip_sockglue + * MAIN ENTRY point from userspace (apart from /proc *info entries) + * Returns errno + */ +int ip_masq_uctl(int optname, char * optval , int optlen) +{ + struct ip_masq_ctl masq_ctl; + int ret = -EINVAL; - if(len >= length) - goto done; - } -done: + if(optlen>sizeof(masq_ctl)) + return -EINVAL; - ip_masq_unlockz(&__ip_masq_lock, &masq_wait, 0); + if(copy_from_user(&masq_ctl,optval,optlen)) + return -EFAULT; - begin = len - (pos - offset); - *start = buffer + begin; - len -= begin; - if(len>length) - len = length; - return len; + IP_MASQ_DEBUG(1,"ip_masq_ctl(optname=%d, optlen=%d, target=%d, cmd=%d)\n", + optname, optlen, masq_ctl.m_target, masq_ctl.m_cmd); + + switch (masq_ctl.m_target) { + case IP_MASQ_TARGET_USER: + ret = ip_masq_user_ctl(optname, &masq_ctl, optlen); + break; +#ifdef CONFIG_IP_MASQUERADE_MOD + case IP_MASQ_TARGET_MOD: + ret = ip_masq_mod_ctl(optname, &masq_ctl, optlen); + break; +#endif + } + + /* + * If ret>0, copy to user space + */ + + if (ret > 0 && ret <= sizeof (masq_ctl)) { + if (copy_to_user(optval, &masq_ctl, ret) ) + return -EFAULT; + ret = 0; + } + + return ret; } +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry *proc_net_ip_masq = NULL; + +#ifdef MODULE +static void ip_masq_proc_count(struct inode *inode, int fill) +{ + if (fill) + MOD_INC_USE_COUNT; + else + MOD_DEC_USE_COUNT; +} #endif +int ip_masq_proc_register(struct proc_dir_entry *ent) +{ + if (!proc_net_ip_masq) return -1; + IP_MASQ_DEBUG(1, "registering \"/proc/net/ip_masq/%s\" entry\n", + ent->name); + return proc_register(proc_net_ip_masq, ent); +} +void ip_masq_proc_unregister(struct proc_dir_entry *ent) +{ + if (!proc_net_ip_masq) return; + IP_MASQ_DEBUG(1, "unregistering \"/proc/net/ip_masq/%s\" entry\n", + ent->name); + proc_unregister(proc_net_ip_masq, ent->low_ino); +} + /* - * Control from ip_sockglue - * From userspace + * Wrapper over inet_select_addr() */ -int ip_masq_ctl(int optname, void *arg, int arglen) +u32 ip_masq_select_addr(struct device *dev, u32 dst, int scope) { - struct ip_fw_masqctl *mctl = arg; - int ret = EINVAL; - - if (1) /* (mctl->mctl_action == IP_MASQ_MOD_CTL) */ - ret = ip_masq_mod_ctl(optname, mctl, arglen); + return inet_select_addr(dev, dst, scope); +} - return ret; +__initfunc(static void masq_proc_init(void)) +{ + IP_MASQ_DEBUG(1,"registering /proc/net/ip_masq\n"); + if (!proc_net_ip_masq) { + struct proc_dir_entry *ent; + ent = create_proc_entry("net/ip_masq", S_IFDIR, 0); + if (ent) { +#ifdef MODULE + ent->fill_inode = ip_masq_proc_count; +#endif + proc_net_ip_masq = ent; + } else { + IP_MASQ_ERR("Could not create \"/proc/net/ip_masq\" entry\n"); + } + } } +#endif /* CONFIG_PROC_FS */ /* * Initialize ip masquerading @@ -1837,11 +2270,37 @@ __initfunc(int ip_masq_init(void)) 0, &proc_net_inode_operations, ip_msqhst_procinfo }); - proc_net_register(&(struct proc_dir_entry) { - 0, 7, "ip_masq", + masq_proc_init(); + + ip_masq_proc_register(&(struct proc_dir_entry) { + 0, 3, "tcp", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + NULL, /* get_info */ + NULL, /* fill_inode */ + NULL, NULL, NULL, + (char *) IPPROTO_TCP, + ip_masq_user_info + }); + ip_masq_proc_register(&(struct proc_dir_entry) { + 0, 3, "udp", S_IFREG | S_IRUGO, 1, 0, 0, 0, &proc_net_inode_operations, - ip_masq_procinfo + NULL, /* get_info */ + NULL, /* fill_inode */ + NULL, NULL, NULL, + (char *) IPPROTO_UDP, + ip_masq_user_info + }); + ip_masq_proc_register(&(struct proc_dir_entry) { + 0, 4, "icmp", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + NULL, /* get_info */ + NULL, /* fill_inode */ + NULL, NULL, NULL, + (char *) IPPROTO_ICMP, + ip_masq_user_info }); #endif #ifdef CONFIG_IP_MASQUERADE_IPAUTOFW @@ -1850,6 +2309,9 @@ __initfunc(int ip_masq_init(void)) #ifdef CONFIG_IP_MASQUERADE_IPPORTFW ip_portfw_init(); #endif +#ifdef CONFIG_IP_MASQUERADE_IPMARKFW + ip_markfw_init(); +#endif ip_masq_app_init(); return 0; diff --git a/net/ipv4/ip_masq_app.c b/net/ipv4/ip_masq_app.c index b620bc82a..45fd14fa4 100644 --- a/net/ipv4/ip_masq_app.c +++ b/net/ipv4/ip_masq_app.c @@ -2,7 +2,7 @@ * IP_MASQ_APP application masquerading module * * - * Version: @(#)ip_masq_app.c 0.04 96/06/17 + * $Id: ip_masq_app.c,v 1.16 1998/08/29 23:51:14 davem Exp $ * * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> * @@ -474,7 +474,7 @@ done: #ifdef CONFIG_PROC_FS static struct proc_dir_entry proc_net_ip_masq_app = { - PROC_NET_IP_MASQ_APP, 11, "ip_masq_app", + PROC_NET_IP_MASQ_APP, 3, "app", S_IFREG | S_IRUGO, 1, 0, 0, 0, &proc_net_inode_operations, ip_masq_app_getinfo @@ -488,7 +488,7 @@ static struct proc_dir_entry proc_net_ip_masq_app = { __initfunc(int ip_masq_app_init(void)) { #ifdef CONFIG_PROC_FS - proc_net_register(&proc_net_ip_masq_app); + ip_masq_proc_register(&proc_net_ip_masq_app); #endif return 0; } diff --git a/net/ipv4/ip_masq_autofw.c b/net/ipv4/ip_masq_autofw.c index 27b98bb03..d2a1729c5 100644 --- a/net/ipv4/ip_masq_autofw.c +++ b/net/ipv4/ip_masq_autofw.c @@ -2,7 +2,7 @@ * IP_MASQ_AUTOFW auto forwarding module * * - * Version: @(#)ip_masq_autofw.c 0.02 97/10/22 + * $Id: ip_masq_autofw.c,v 1.3 1998/08/29 23:51:10 davem Exp $ * * Author: Richard Lynch * @@ -36,15 +36,34 @@ #include <linux/ip_fw.h> #include <net/ip_masq.h> #include <net/ip_masq_mod.h> -#include <net/ip_autofw.h> +#include <linux/ip_masq.h> + +#define IP_AUTOFW_EXPIRE 15*HZ + +/* WARNING: bitwise equal to ip_autofw_user in linux/ip_masq.h */ +struct ip_autofw { + struct ip_autofw * next; + __u16 type; + __u16 low; + __u16 hidden; + __u16 high; + __u16 visible; + __u16 protocol; + __u32 lastcontact; + __u32 where; + __u16 ctlproto; + __u16 ctlport; + __u16 flags; + struct timer_list timer; +}; /* * Debug level */ +#ifdef CONFIG_IP_MASQ_DEBUG static int debug=0; - -MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i"); MODULE_PARM(debug, "i"); +#endif /* * Auto-forwarding table @@ -156,11 +175,11 @@ static __inline__ void ip_autofw_expire(unsigned long data) -static __inline__ int ip_autofw_add(struct ip_autofw * af) +static __inline__ int ip_autofw_add(struct ip_autofw_user * af) { struct ip_autofw * newaf; - init_timer(&af->timer); newaf = kmalloc( sizeof(struct ip_autofw), GFP_KERNEL ); + init_timer(&newaf->timer); if ( newaf == NULL ) { printk("ip_autofw_add: malloc said no\n"); @@ -169,7 +188,7 @@ static __inline__ int ip_autofw_add(struct ip_autofw * af) MOD_INC_USE_COUNT; - memcpy(newaf, af, sizeof(struct ip_autofw)); + memcpy(newaf, af, sizeof(struct ip_autofw_user)); newaf->timer.data = (unsigned long) newaf; newaf->timer.function = ip_autofw_expire; newaf->timer.expires = 0; @@ -180,7 +199,7 @@ static __inline__ int ip_autofw_add(struct ip_autofw * af) return(0); } -static __inline__ int ip_autofw_del(struct ip_autofw * af) +static __inline__ int ip_autofw_del(struct ip_autofw_user * af) { struct ip_autofw ** af_p, *curr; @@ -229,20 +248,21 @@ static __inline__ int ip_autofw_flush(void) * Methods for registered object */ -static int autofw_ctl(int optname, struct ip_fw_masqctl *mctl, int optlen) +static int autofw_ctl(int optname, struct ip_masq_ctl *mctl, int optlen) { - struct ip_autofw *af = (struct ip_autofw*) mctl->u.mod.data; + struct ip_autofw_user *af = &mctl->u.autofw_user; - switch (optname) { - case IP_FW_MASQ_ADD: + switch (mctl->m_cmd) { + case IP_MASQ_CMD_ADD: + case IP_MASQ_CMD_INSERT: if (optlen<sizeof(*af)) return EINVAL; return ip_autofw_add(af); - case IP_FW_MASQ_DEL: + case IP_MASQ_CMD_DEL: if (optlen<sizeof(*af)) return EINVAL; return ip_autofw_del(af); - case IP_FW_MASQ_FLUSH: + case IP_MASQ_CMD_FLUSH: return ip_autofw_flush(); } @@ -250,8 +270,9 @@ static int autofw_ctl(int optname, struct ip_fw_masqctl *mctl, int optlen) } -static int autofw_out_update(struct iphdr *iph, __u16 *portp, struct ip_masq *ms) +static int autofw_out_update(const struct sk_buff *skb, const struct iphdr *iph, struct ip_masq *ms) { + const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); /* * Update any ipautofw entries ... */ @@ -260,8 +281,9 @@ static int autofw_out_update(struct iphdr *iph, __u16 *portp, struct ip_masq *ms return IP_MASQ_MOD_NOP; } -static struct ip_masq * autofw_out_create(struct iphdr *iph, __u16 * portp, __u32 maddr) +static struct ip_masq * autofw_out_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr) { + const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); /* * If the source port is supposed to match the masq port, then * make it so @@ -278,22 +300,25 @@ static struct ip_masq * autofw_out_create(struct iphdr *iph, __u16 * portp, __u3 } #if 0 -static int autofw_in_update(struct iphdr *iph, __u16 *portp, struct ip_masq *ms) +static int autofw_in_update(const struct sk_buff *skb, const struct iphdr *iph, __u16 *portp, struct ip_masq *ms) { + const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); ip_autofw_update_in(iph->saddr, portp[1], iph->protocol); return IP_MASQ_MOD_NOP; } #endif -static int autofw_in_rule(struct iphdr *iph, __u16 *portp) +static int autofw_in_rule(const struct sk_buff *skb, const struct iphdr *iph) { + const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); return (ip_autofw_check_range(iph->saddr, portp[1], iph->protocol, 0) || ip_autofw_check_direct(portp[1], iph->protocol) || ip_autofw_check_port(portp[1], iph->protocol)); } -static struct ip_masq * autofw_in_create(struct iphdr *iph, __u16 *portp, __u32 maddr) +static struct ip_masq * autofw_in_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr) { + const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); struct ip_autofw *af; if ((af=ip_autofw_check_range(iph->saddr, portp[1], iph->protocol, 0))) { diff --git a/net/ipv4/ip_masq_cuseeme.c b/net/ipv4/ip_masq_cuseeme.c index a306b4f25..9b412bafe 100644 --- a/net/ipv4/ip_masq_cuseeme.c +++ b/net/ipv4/ip_masq_cuseeme.c @@ -2,7 +2,7 @@ * IP_MASQ_FTP CUSeeMe masquerading module * * - * Version: @(#)$Id: ip_masq_cuseeme.c,v 1.2 1997/11/28 15:32:18 alan Exp $ + * Version: @(#)$Id: ip_masq_cuseeme.c,v 1.4 1998/10/06 04:48:57 davem Exp $ * * Author: Richard Lynch * @@ -39,6 +39,7 @@ * */ +#include <linux/config.h> #include <linux/module.h> #include <asm/system.h> #include <linux/types.h> @@ -94,10 +95,12 @@ struct ip_masq_app *masq_incarnations[MAX_MASQ_APP_PORTS]; /* * Debug level */ +#ifdef CONFIG_IP_MASQ_DEBUG static int debug=0; +MODULE_PARM(debug, "i"); +#endif MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i"); -MODULE_PARM(debug, "i"); static int masq_cuseeme_init_1 (struct ip_masq_app *mapp, struct ip_masq *ms) diff --git a/net/ipv4/ip_masq_ftp.c b/net/ipv4/ip_masq_ftp.c index 1d8edb253..35d1f5440 100644 --- a/net/ipv4/ip_masq_ftp.c +++ b/net/ipv4/ip_masq_ftp.c @@ -37,6 +37,7 @@ * */ +#include <linux/config.h> #include <linux/module.h> #include <asm/system.h> #include <linux/types.h> @@ -62,10 +63,12 @@ struct ip_masq_app *masq_incarnations[MAX_MASQ_APP_PORTS]; /* * Debug level */ +#ifdef CONFIG_IP_MASQ_DEBUG static int debug=0; +MODULE_PARM(debug, "i"); +#endif MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i"); -MODULE_PARM(debug, "i"); /* Dummy variable */ static int masq_ftp_pasv; diff --git a/net/ipv4/ip_masq_irc.c b/net/ipv4/ip_masq_irc.c index c13ca6e9a..11c0ca83f 100644 --- a/net/ipv4/ip_masq_irc.c +++ b/net/ipv4/ip_masq_irc.c @@ -40,6 +40,7 @@ * */ +#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> @@ -63,10 +64,12 @@ struct ip_masq_app *masq_incarnations[MAX_MASQ_APP_PORTS]; /* * Debug level */ +#ifdef CONFIG_IP_MASQ_DEBUG static int debug=0; +MODULE_PARM(debug, "i"); +#endif MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i"); -MODULE_PARM(debug, "i"); /* diff --git a/net/ipv4/ip_masq_mod.c b/net/ipv4/ip_masq_mod.c index 7319a2624..005354944 100644 --- a/net/ipv4/ip_masq_mod.c +++ b/net/ipv4/ip_masq_mod.c @@ -4,7 +4,7 @@ * * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> * - * $Id: ip_masq_mod.c,v 1.4 1998/03/27 07:02:45 davem Exp $ + * $Id: ip_masq_mod.c,v 1.5 1998/08/29 23:51:09 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -22,6 +22,8 @@ #include <linux/errno.h> #include <net/ip_masq.h> #include <net/ip_masq_mod.h> + +#include <linux/ip_masq.h> #ifdef CONFIG_KMOD #include <linux/kmod.h> #endif @@ -31,6 +33,10 @@ EXPORT_SYMBOL(unregister_ip_masq_mod); EXPORT_SYMBOL(ip_masq_mod_lkp_link); EXPORT_SYMBOL(ip_masq_mod_lkp_unlink); +#ifdef __SMP__ +static spinlock_t masq_mod_lock = SPIN_LOCK_UNLOCKED; +#endif + /* * Base pointer for registered modules */ @@ -56,7 +62,7 @@ int ip_masq_mod_register_proc(struct ip_masq_mod *mmod) ent->name = mmod->mmod_name; ent->namelen = strlen (mmod->mmod_name); } - ret = proc_net_register(ent); + ret = ip_masq_proc_register(ent); if (ret) mmod->mmod_proc_ent = NULL; return ret; @@ -71,7 +77,7 @@ void ip_masq_mod_unregister_proc(struct ip_masq_mod *mmod) struct proc_dir_entry *ent = mmod->mmod_proc_ent; if (!ent) return; - proc_unregister(proc_net, ent->low_ino); + ip_masq_proc_unregister(ent); #endif } @@ -83,28 +89,28 @@ int ip_masq_mod_lkp_unlink(struct ip_masq_mod *mmod) { struct ip_masq_mod **mmod_p; - start_bh_atomic(); + write_lock_bh(&masq_mod_lock); for (mmod_p = &ip_masq_mod_lkp_base; *mmod_p ; mmod_p = &(*mmod_p)->next) if (mmod == (*mmod_p)) { *mmod_p = mmod->next; mmod->next = NULL; - end_bh_atomic(); + write_unlock_bh(&masq_mod_lock); return 0; } - end_bh_atomic(); + write_unlock_bh(&masq_mod_lock); return -EINVAL; } int ip_masq_mod_lkp_link(struct ip_masq_mod *mmod) { - start_bh_atomic(); + write_lock_bh(&masq_mod_lock); mmod->next = ip_masq_mod_lkp_base; ip_masq_mod_lkp_base=mmod; - end_bh_atomic(); + write_unlock_bh(&masq_mod_lock); return 0; } @@ -164,108 +170,110 @@ int unregister_ip_masq_mod(struct ip_masq_mod *mmod) return -EINVAL; } -int ip_masq_mod_in_rule(struct iphdr *iph, __u16 *portp) +int ip_masq_mod_in_rule(const struct sk_buff *skb, const struct iphdr *iph) { struct ip_masq_mod *mmod; - int ret; + int ret = IP_MASQ_MOD_NOP; for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) { if (!mmod->mmod_in_rule) continue; - switch (ret=mmod->mmod_in_rule(iph, portp)) { + switch (ret=mmod->mmod_in_rule(skb, iph)) { case IP_MASQ_MOD_NOP: continue; case IP_MASQ_MOD_ACCEPT: - return 1; case IP_MASQ_MOD_REJECT: - return -1; + goto out; } } - return 0; +out: + return ret; } -int ip_masq_mod_out_rule(struct iphdr *iph, __u16 *portp) +int ip_masq_mod_out_rule(const struct sk_buff *skb, const struct iphdr *iph) { struct ip_masq_mod *mmod; - int ret; + int ret = IP_MASQ_MOD_NOP; for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) { if (!mmod->mmod_out_rule) continue; - switch (ret=mmod->mmod_out_rule(iph, portp)) { + switch (ret=mmod->mmod_out_rule(skb, iph)) { case IP_MASQ_MOD_NOP: continue; case IP_MASQ_MOD_ACCEPT: - return 1; case IP_MASQ_MOD_REJECT: - return -1; + goto out; } } - return 0; +out: + return ret; } -struct ip_masq * ip_masq_mod_in_create(struct iphdr *iph, __u16 *portp, __u32 maddr) +struct ip_masq * ip_masq_mod_in_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr) { struct ip_masq_mod *mmod; - struct ip_masq *ms; + struct ip_masq *ms = NULL; for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) { if (!mmod->mmod_in_create) continue; - if ((ms=mmod->mmod_in_create(iph, portp, maddr))) { - return ms; + if ((ms=mmod->mmod_in_create(skb, iph, maddr))) { + goto out; } } - return NULL; +out: + return ms; } -struct ip_masq * ip_masq_mod_out_create(struct iphdr *iph, __u16 *portp, __u32 maddr) +struct ip_masq * ip_masq_mod_out_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr) { struct ip_masq_mod *mmod; - struct ip_masq *ms; + struct ip_masq *ms = NULL; for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) { if (!mmod->mmod_out_create) continue; - if ((ms=mmod->mmod_out_create(iph, portp, maddr))) { - return ms; + if ((ms=mmod->mmod_out_create(skb, iph, maddr))) { + goto out; } } - return NULL; +out: + return ms; } -int ip_masq_mod_in_update(struct iphdr *iph, __u16 *portp, struct ip_masq *ms) +int ip_masq_mod_in_update(const struct sk_buff *skb, const struct iphdr *iph, struct ip_masq *ms) { struct ip_masq_mod *mmod; - int ret; + int ret = IP_MASQ_MOD_NOP; for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) { if (!mmod->mmod_in_update) continue; - switch (ret=mmod->mmod_in_update(iph, ms)) { + switch (ret=mmod->mmod_in_update(skb, iph, ms)) { case IP_MASQ_MOD_NOP: continue; case IP_MASQ_MOD_ACCEPT: - return 1; case IP_MASQ_MOD_REJECT: - return -1; + goto out; } } - return 0; +out: + return ret; } -int ip_masq_mod_out_update(struct iphdr *iph, __u16 *portp, struct ip_masq *ms) +int ip_masq_mod_out_update(const struct sk_buff *skb, const struct iphdr *iph, struct ip_masq *ms) { struct ip_masq_mod *mmod; - int ret; + int ret = IP_MASQ_MOD_NOP; for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) { if (!mmod->mmod_out_update) continue; - switch (ret=mmod->mmod_out_update(iph, portp, ms)) { + switch (ret=mmod->mmod_out_update(skb, iph, ms)) { case IP_MASQ_MOD_NOP: continue; case IP_MASQ_MOD_ACCEPT: - return 1; case IP_MASQ_MOD_REJECT: - return -1; + goto out; } } - return 0; +out: + return ret; } struct ip_masq_mod * ip_masq_mod_getbyname(const char *mmod_name) @@ -287,20 +295,20 @@ struct ip_masq_mod * ip_masq_mod_getbyname(const char *mmod_name) /* * Module control entry */ -int ip_masq_mod_ctl(int optname, struct ip_fw_masqctl *mctl, int optlen) +int ip_masq_mod_ctl(int optname, struct ip_masq_ctl *mctl, int optlen) { struct ip_masq_mod * mmod; #ifdef CONFIG_KMOD - char kmod_name[IP_MASQ_MOD_NMAX+8]; + char kmod_name[IP_MASQ_TNAME_MAX+8]; #endif /* tappo */ - mctl->u.mod.name[IP_MASQ_MOD_NMAX-1] = 0; + mctl->m_tname[IP_MASQ_TNAME_MAX-1] = 0; - mmod = ip_masq_mod_getbyname(mctl->u.mod.name); + mmod = ip_masq_mod_getbyname(mctl->m_tname); if (mmod) return mmod->mmod_ctl(optname, mctl, optlen); #ifdef CONFIG_KMOD - sprintf(kmod_name,"ip_masq_%s", mctl->u.mod.name); + sprintf(kmod_name,"ip_masq_%s", mctl->m_tname); IP_MASQ_DEBUG(1, "About to request \"%s\" module\n", kmod_name); @@ -308,7 +316,7 @@ int ip_masq_mod_ctl(int optname, struct ip_fw_masqctl *mctl, int optlen) * Let sleep for a while ... */ request_module(kmod_name); - mmod = ip_masq_mod_getbyname(mctl->u.mod.name); + mmod = ip_masq_mod_getbyname(mctl->m_tname); if (mmod) return mmod->mmod_ctl(optname, mctl, optlen); #endif diff --git a/net/ipv4/ip_masq_portfw.c b/net/ipv4/ip_masq_portfw.c index 862742a21..4384d9cf6 100644 --- a/net/ipv4/ip_masq_portfw.c +++ b/net/ipv4/ip_masq_portfw.c @@ -2,7 +2,7 @@ * IP_MASQ_PORTFW masquerading module * * - * Version: @(#)ip_masq_portfw.c 0.02 97/10/30 + * $Id: ip_masq_portfw.c,v 1.2 1998/08/29 23:51:11 davem Exp $ * * Author: Steven Clarke <steven.clarke@monmouth.demon.co.uk> * @@ -10,9 +10,8 @@ * Juan Jose Ciarlante : created this new file from ip_masq.c and ip_fw.c * Juan Jose Ciarlante : modularized * Juan Jose Ciarlante : use GFP_KERNEL + * Juan Jose Ciarlante : locking * - * FIXME - * - after creating /proc/net/ip_masq/ direct, put portfw underneath */ #include <linux/config.h> @@ -23,19 +22,38 @@ #include <linux/list.h> #include <net/ip.h> #include <linux/ip_fw.h> +#include <linux/ip_masq.h> #include <net/ip_masq.h> #include <net/ip_masq_mod.h> -#include <net/ip_portfw.h> #include <linux/proc_fs.h> #include <linux/init.h> +#define IP_PORTFW_PORT_MIN 1 +#define IP_PORTFW_PORT_MAX 60999 + +struct ip_portfw { + struct list_head list; + __u32 laddr, raddr; + __u16 lport, rport; + atomic_t pref_cnt; /* pref "counter" down to 0 */ + int pref; /* user set pref */ +}; + static struct ip_masq_mod *mmod_self = NULL; +/* + * Debug level + */ +#ifdef CONFIG_IP_MASQ_DEBUG +static int debug=0; +MODULE_PARM(debug, "i"); +#endif /* * Lock */ -static atomic_t portfw_lock = ATOMIC_INIT(0); -static struct wait_queue *portfw_wait; +#ifdef __SMP__ +static spinlock_t portfw_lock = SPIN_LOCK_UNLOCKED; +#endif static struct list_head portfw_list[2]; static __inline__ int portfw_idx(int protocol) @@ -61,7 +79,7 @@ static __inline__ int ip_portfw_del(__u16 protocol, __u16 lport, __u32 laddr, __ nent = atomic_read(&mmod_self->mmod_nent); - ip_masq_lockz(&portfw_lock, &portfw_wait, 1); + write_lock_bh(&portfw_lock); for (entry=list->next;entry != list;entry = entry->next) { n = list_entry(entry, struct ip_portfw, list); @@ -75,7 +93,7 @@ static __inline__ int ip_portfw_del(__u16 protocol, __u16 lport, __u32 laddr, __ MOD_DEC_USE_COUNT; } } - ip_masq_unlockz(&portfw_lock, &portfw_wait, 1); + write_unlock_bh(&portfw_lock); return nent==atomic_read(&mmod_self->mmod_nent)? ESRCH : 0; } @@ -91,7 +109,7 @@ static __inline__ void ip_portfw_flush(void) struct list_head *e; struct ip_portfw *n; - ip_masq_lockz(&portfw_lock, &portfw_wait, 1); + write_lock_bh(&portfw_lock); for (prot = 0; prot < 2;prot++) { l = &portfw_list[prot]; @@ -104,12 +122,12 @@ static __inline__ void ip_portfw_flush(void) } } - ip_masq_unlockz(&portfw_lock, &portfw_wait, 1); + write_unlock_bh(&portfw_lock); } /* * Lookup routine for lport,laddr match - * called from ip_masq module (via registered obj) + * must be called with locked tables */ static __inline__ struct ip_portfw *ip_portfw_lookup(__u16 protocol, __u16 lport, __u32 laddr, __u32 *daddr_p, __u16 *dport_p) { @@ -118,8 +136,6 @@ static __inline__ struct ip_portfw *ip_portfw_lookup(__u16 protocol, __u16 lport struct ip_portfw *n = NULL; struct list_head *l, *e; - ip_masq_lock(&portfw_lock, 0); - l = &portfw_list[prot]; for (e=l->next;e!=l;e=e->next) { @@ -136,7 +152,6 @@ static __inline__ struct ip_portfw *ip_portfw_lookup(__u16 protocol, __u16 lport } n = NULL; out: - ip_masq_unlock(&portfw_lock, 0); return n; } @@ -153,7 +168,7 @@ static __inline__ int ip_portfw_edit(__u16 protocol, __u16 lport, __u32 laddr, _ int count = 0; - ip_masq_lockz(&portfw_lock, &portfw_wait, 0); + read_lock_bh(&portfw_lock); l = &portfw_list[prot]; @@ -169,7 +184,7 @@ static __inline__ int ip_portfw_edit(__u16 protocol, __u16 lport, __u32 laddr, _ } } - ip_masq_unlockz(&portfw_lock, &portfw_wait, 0); + read_unlock_bh(&portfw_lock); return count; } @@ -212,14 +227,14 @@ static __inline__ int ip_portfw_add(__u16 protocol, __u16 lport, __u32 laddr, __ atomic_set(&npf->pref_cnt, npf->pref); INIT_LIST_HEAD(&npf->list); - ip_masq_lockz(&portfw_lock, &portfw_wait, 1); + write_lock_bh(&portfw_lock); /* * Add at head */ list_add(&npf->list, &portfw_list[prot]); - ip_masq_unlockz(&portfw_lock, &portfw_wait, 1); + write_unlock_bh(&portfw_lock); ip_masq_mod_inc_nent(mmod_self); return 0; @@ -227,18 +242,34 @@ static __inline__ int ip_portfw_add(__u16 protocol, __u16 lport, __u32 laddr, __ -static __inline__ int portfw_ctl(int cmd, struct ip_fw_masqctl *mctl, int optlen) +static __inline__ int portfw_ctl(int optname, struct ip_masq_ctl *mctl, int optlen) { - struct ip_portfw_edits *mm = (struct ip_portfw_edits *) mctl->u.mod.data; + struct ip_portfw_user *mm = &mctl->u.portfw_user; int ret = EINVAL; + int arglen = optlen - IP_MASQ_CTL_BSIZE; + int cmd; - /* - * Don't trust the lusers - plenty of error checking! + + IP_MASQ_DEBUG(1-debug, "ip_masq_user_ctl(len=%d/%d|%d/%d)\n", + arglen, + sizeof (*mm), + optlen, + sizeof (*mctl)); + + /* + * Yes, I'm a bad guy ... */ - if (optlen<sizeof(*mm)) + if (arglen != sizeof(*mm) && optlen != sizeof(*mctl)) return EINVAL; - if (cmd != IP_FW_MASQ_FLUSH) { + /* + * Don't trust the lusers - plenty of error checking! + */ + cmd = mctl->m_cmd; + IP_MASQ_DEBUG(1-debug, "ip_masq_portfw_ctl(cmd=%d)\n", cmd); + + + if (cmd != IP_MASQ_CMD_FLUSH) { if (htons(mm->lport) < IP_PORTFW_PORT_MIN || htons(mm->lport) > IP_PORTFW_PORT_MAX) return EINVAL; @@ -249,19 +280,19 @@ static __inline__ int portfw_ctl(int cmd, struct ip_fw_masqctl *mctl, int optlen switch(cmd) { - case IP_FW_MASQ_ADD: + case IP_MASQ_CMD_ADD: ret = ip_portfw_add(mm->protocol, mm->lport, mm->laddr, mm->rport, mm->raddr, mm->pref); break; - case IP_FW_MASQ_DEL: + case IP_MASQ_CMD_DEL: ret = ip_portfw_del(mm->protocol, mm->lport, mm->laddr, mm->rport, mm->raddr); break; - case IP_FW_MASQ_FLUSH: + case IP_MASQ_CMD_FLUSH: ip_portfw_flush(); ret = 0; break; @@ -286,7 +317,6 @@ static int portfw_procinfo(char *buffer, char **start, off_t offset, int ind; int len=0; - ip_masq_lockz(&portfw_lock, &portfw_wait, 0); if (offset < 64) { @@ -295,6 +325,8 @@ static int portfw_procinfo(char *buffer, char **start, off_t offset, } pos = 64; + read_lock_bh(&portfw_lock); + for(ind = 0; ind < 2; ind++) { l = &portfw_list[ind]; @@ -302,8 +334,10 @@ static int portfw_procinfo(char *buffer, char **start, off_t offset, { pf = list_entry(e, struct ip_portfw, list); pos += 64; - if (pos <= offset) + if (pos <= offset) { + len = 0; continue; + } sprintf(temp,"%s %08lX %5u > %08lX %5u %5d %5d", ind ? "TCP" : "UDP", @@ -317,7 +351,7 @@ static int portfw_procinfo(char *buffer, char **start, off_t offset, } } done: - ip_masq_unlockz(&portfw_lock, &portfw_wait, 0); + read_unlock_bh(&portfw_lock); begin = len - (pos - offset); *start = buffer + begin; @@ -329,7 +363,7 @@ done: static struct proc_dir_entry portfw_proc_entry = { /* 0, 0, NULL", */ - 0, 9, "ip_portfw", /* Just for compatibility, for now ... */ + 0, 6, "portfw", /* Just for compatibility, for now ... */ S_IFREG | S_IRUGO, 1, 0, 0, 0, &proc_net_inode_operations, portfw_procinfo @@ -341,13 +375,26 @@ static struct proc_dir_entry portfw_proc_entry = { #define proc_ent NULL #endif -static int portfw_in_rule(struct iphdr *iph, __u16 *portp) +static int portfw_in_rule(const struct sk_buff *skb, const struct iphdr *iph) { + const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); +#ifdef CONFIG_IP_MASQ_DEBUG + struct rtable *rt = (struct rtable *)skb->dst; +#endif + struct ip_portfw *pfw; + + IP_MASQ_DEBUG(2, "portfw_in_rule(): skb:= dev=%s (index=%d), rt_iif=%d, rt_flags=0x%x rt_dev___=%s daddr=%d.%d.%d.%d dport=%d\n", + skb->dev->name, skb->dev->ifindex, rt->rt_iif, rt->rt_flags, + rt->u.dst.dev->name, + NIPQUAD(iph->daddr), ntohs(portp[1])); - return (ip_portfw_lookup(iph->protocol, portp[1], iph->daddr, NULL, NULL)!=0); + read_lock(&portfw_lock); + pfw = ip_portfw_lookup(iph->protocol, portp[1], iph->daddr, NULL, NULL); + read_unlock(&portfw_lock); + return (pfw!=0); } -static struct ip_masq * portfw_in_create(struct iphdr *iph, __u16 *portp, __u32 maddr) +static struct ip_masq * portfw_in_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr) { /* * If no entry exists in the masquerading table @@ -357,13 +404,14 @@ static struct ip_masq * portfw_in_create(struct iphdr *iph, __u16 *portp, __u32 __u32 raddr; __u16 rport; + const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); struct ip_masq *ms = NULL; struct ip_portfw *pf; /* - * Lock for reading only, by now... + * Lock for writing. */ - ip_masq_lock(&portfw_lock, 0); + write_lock(&portfw_lock); if ((pf=ip_portfw_lookup(iph->protocol, portp[1], iph->daddr, @@ -375,8 +423,8 @@ static struct ip_masq * portfw_in_create(struct iphdr *iph, __u16 *portp, __u32 0); ip_masq_listen(ms); - if (!ms || atomic_read(&mmod_self->mmod_nent) <= 1 || - ip_masq_nlocks(&portfw_lock) != 1) + if (!ms || atomic_read(&mmod_self->mmod_nent) <= 1 + /* || ip_masq_nlocks(&portfw_lock) != 1 */ ) /* * Maybe later... */ @@ -390,18 +438,16 @@ static struct ip_masq * portfw_in_create(struct iphdr *iph, __u16 *portp, __u32 */ if (atomic_dec_and_test(&pf->pref_cnt)) { - start_bh_atomic(); atomic_set(&pf->pref_cnt, pf->pref); list_del(&pf->list); list_add(&pf->list, portfw_list[portfw_idx(iph->protocol)].prev); - end_bh_atomic(); } } out: - ip_masq_unlock(&portfw_lock, 0); + write_unlock(&portfw_lock); return ms; } diff --git a/net/ipv4/ip_masq_raudio.c b/net/ipv4/ip_masq_raudio.c index 377b8223e..ee3e276b9 100644 --- a/net/ipv4/ip_masq_raudio.c +++ b/net/ipv4/ip_masq_raudio.c @@ -2,7 +2,7 @@ * IP_MASQ_RAUDIO - Real Audio masquerading module * * - * Version: @(#)$Id: ip_masq_raudio.c,v 1.9 1998/02/23 02:50:19 davem Exp $ + * Version: @(#)$Id: ip_masq_raudio.c,v 1.11 1998/10/06 04:49:04 davem Exp $ * * Author: Nigel Metheringham * Real Time Streaming code by Progressive Networks @@ -62,6 +62,7 @@ * */ +#include <linux/config.h> #include <linux/module.h> #include <asm/system.h> #include <linux/types.h> @@ -110,10 +111,12 @@ struct ip_masq_app *masq_incarnations[MAX_MASQ_APP_PORTS]; /* * Debug level */ +#ifdef CONFIG_IP_MASQ_DEBUG static int debug=0; +MODULE_PARM(debug, "i"); +#endif MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i"); -MODULE_PARM(debug, "i"); static int diff --git a/net/ipv4/ip_masq_user.c b/net/ipv4/ip_masq_user.c new file mode 100644 index 000000000..9264301ae --- /dev/null +++ b/net/ipv4/ip_masq_user.c @@ -0,0 +1,467 @@ +/* + * IP_MASQ_USER user space control module + * + * + * $Id: ip_masq_user.c,v 1.1 1998/08/29 23:51:08 davem Exp $ + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <asm/system.h> +#include <linux/stat.h> +#include <linux/proc_fs.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/inet.h> +#include <linux/init.h> +#include <net/protocol.h> +#include <net/icmp.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/checksum.h> +#include <net/ip_masq.h> +#include <net/ip_masq_mod.h> +#include <linux/sysctl.h> +#include <linux/ip_fw.h> + +#include <linux/ip_masq.h> + +/* + * Debug level + */ +static int debug=0; + +MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i"); +MODULE_PARM(debug, "i"); + +/* +static int check_5uple (struct ip_masq_user *ums) { + return 0; +} +*/ +static void masq_user_k2u(const struct ip_masq *ms, struct ip_masq_user *ums) +{ + ums->protocol = ms->protocol; + ums->daddr = ms->daddr; + ums->dport = ms->dport; + ums->maddr = ms->maddr; + ums->mport = ms->mport; + ums->saddr = ms->saddr; + ums->sport = ms->sport; + ums->timeout = ms->timeout; +} + + +static int ip_masq_user_maddr(struct ip_masq_user *ums) +{ + struct device *dev; + struct rtable *rt; + int ret = -EINVAL; + u32 rt_daddr, rt_saddr; + u32 tos; + + /* + * Did specify masq address. + */ + if (ums->maddr) + return 0; + + /* + * Select address to use for routing query + */ + + rt_daddr = ums->rt_daddr? ums->rt_daddr : ums->daddr; + rt_saddr = ums->rt_saddr? ums->rt_saddr : ums->saddr; + + + /* + * No address for routing, cannot continue + */ + if (rt_daddr == 0) { + IP_MASQ_DEBUG(1-debug, "cannot setup maddr with daddr=%lX, rt_addr=%lX\n", + ntohl(ums->daddr), ntohl(ums->rt_daddr)); + return -EINVAL; + } + + /* + * Find out rt device + */ + + rt_saddr = 0; + tos = RT_TOS(ums->ip_tos) | RTO_CONN; + + if ((ret=ip_route_output(&rt, rt_daddr, rt_saddr, tos, 0 /* dev */))) { + IP_MASQ_DEBUG(0-debug, "could not setup maddr for routing daddr=%lX, saddr=%lX\n", + ntohl(rt_daddr), ntohl(rt_saddr)); + return ret; + } + dev = rt->u.dst.dev; + ums->maddr = ip_masq_select_addr(dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); + + IP_MASQ_DEBUG(1-debug, "did setup maddr=%lX\n", ntohl(ums->maddr)); + ip_rt_put(rt); + return 0; +} + +/* + * Create new entry (from uspace) + */ +static int ip_masq_user_new(struct ip_masq_user *ums) +{ + struct ip_masq *ms = NULL; + unsigned mflags = 0; + int ret; + + if (masq_proto_num (ums->protocol) == -1) { + return EPROTONOSUPPORT; + } + + if (ums->dport == 0) { + ums->flags |= IP_MASQ_USER_F_LISTEN; + } + + if (ums->flags | IP_MASQ_USER_F_LISTEN) { + if ((ums->saddr == 0) || (ums->sport == 0)) { + return EINVAL; + } + mflags |= (IP_MASQ_F_NO_DPORT|IP_MASQ_F_NO_DADDR); + + } + + if ((ret = ip_masq_user_maddr(ums)) < 0) { + return -ret; + } + + mflags |= IP_MASQ_F_USER; + ms = ip_masq_new(ums->protocol, + ums->maddr, ums->mport, + ums->saddr, ums->sport, + ums->daddr, ums->dport, + mflags); + + if (ms == NULL) { + /* + * FIXME: ip_masq_new() should return errno + */ + return EBUSY; + } + + /* + * Setup timeouts for this new entry + */ + + if (ums->timeout) { + ms->timeout = ums->timeout; + } else if (ums->flags | IP_MASQ_USER_F_LISTEN) { + ip_masq_listen(ms); + } + + masq_user_k2u(ms, ums); + ip_masq_put(ms); + return 0; +} + +/* + * Delete existing entry + */ +static int ip_masq_user_del(struct ip_masq_user *ums) +{ + struct ip_masq *ms=NULL; + + if (masq_proto_num (ums->protocol) == -1) { + return EPROTONOSUPPORT; + } + start_bh_atomic(); + if (ums->mport && ums->maddr) { + ms = ip_masq_in_get(ums->protocol, + ums->daddr, ums->dport, + ums->maddr, ums->mport); + end_bh_atomic(); + } else if (ums->sport && ums->saddr) { + ms = ip_masq_out_get(ums->protocol, + ums->saddr, ums->sport, + ums->daddr, ums->dport); + end_bh_atomic(); + } else + return EINVAL; + + if (ms == NULL) { + return ESRCH; + } + + /* + * got (locked) entry, setup almost tiny timeout :) and + * give away + * + * FIXME: should use something better than S_CLOSE + */ + ms->timeout = IP_MASQ_S_CLOSE; + + masq_user_k2u(ms, ums); + ip_masq_put(ms); + return 0; +} + +static struct ip_masq * ip_masq_user_locked_get (struct ip_masq_user *ums, int *err) +{ + struct ip_masq *ms=NULL; + if (masq_proto_num (ums->protocol) == -1) { + *err = EPROTONOSUPPORT; + } + + start_bh_atomic(); + if (ums->mport && ums->maddr) { + ms = ip_masq_in_get(ums->protocol, + ums->daddr, ums->dport, + ums->maddr, ums->mport); + end_bh_atomic(); + } else if (ums->sport && ums->saddr) { + ms = ip_masq_out_get(ums->protocol, + ums->saddr, ums->sport, + ums->daddr, ums->dport); + end_bh_atomic(); + } else + *err = EINVAL; + + if (ms == NULL) *err = ESRCH; + return ms; +} + +/* + * Get existing entry (complete full tunnel info) + */ +static int ip_masq_user_get(struct ip_masq_user *ums) +{ + struct ip_masq *ms=NULL; + int err; + + ms = ip_masq_user_locked_get(ums, &err); + if (ms == NULL) + return err; + + masq_user_k2u(ms, ums); + + ip_masq_put(ms); + return 0; +} + +/* + * Set (some, valid) entry parameters + */ +static int ip_masq_user_set(struct ip_masq_user *ums) +{ + struct ip_masq *ms = NULL; + int err; + + ms = ip_masq_user_locked_get(ums, &err); + if (ms == NULL) + return err; + + /* + * FIXME: must allow selecting what you want to set + */ + ms->timeout = ums->timeout; + + masq_user_k2u(ms, ums); + + ip_masq_put(ms); + return 0; +} + + +/* + * Entry point + * ret value: + * <0 err + * ==0 ok + * >0 ok, copy to user + */ +static int ip_masq_user_ctl(int optname, struct ip_masq_ctl *mctl, int optlen) +{ + struct ip_masq_user *ums = &mctl->u.user; + int ret = EINVAL; + int arglen = optlen - IP_MASQ_CTL_BSIZE; + int cmd; + + IP_MASQ_DEBUG(1-debug, "ip_masq_user_ctl(len=%d/%d|%d/%d)\n", + arglen, + sizeof (*ums), + optlen, + sizeof (*mctl)); + + /* + * Yes, I'm a bad guy ... + */ + if (arglen != sizeof(*ums) && optlen != sizeof(*mctl)) + return EINVAL; + + MOD_INC_USE_COUNT; + + /* + * Don't trust the lusers - plenty of error checking! + */ + cmd = mctl->m_cmd; + IP_MASQ_DEBUG(1-debug, "ip_masq_user_ctl(cmd=%d)\n", cmd); + + switch (mctl->m_cmd) { + case IP_MASQ_CMD_ADD: + case IP_MASQ_CMD_INSERT: + ret = ip_masq_user_new(ums); + break; + case IP_MASQ_CMD_DEL: + ret = ip_masq_user_del(ums); + break; + case IP_MASQ_CMD_SET: + ret = ip_masq_user_set(ums); + break; + case IP_MASQ_CMD_GET: + ret = ip_masq_user_get(ums); + break; + } + + /* + * For all of the above, return masq tunnel info + */ + + ret = -ret; + + if (ret == 0) { + ret = sizeof (*ums) + IP_MASQ_CTL_BSIZE; + IP_MASQ_DEBUG(1-debug, "will return %d bytes to user\n", ret); + } + + MOD_DEC_USE_COUNT; + return ret; +} + + +#ifdef CONFIG_PROC_FS +static int ip_masq_user_info(char *buffer, char **start, off_t offset, + int length, int proto) +{ + off_t pos=0, begin; + struct ip_masq *ms; + char temp[129]; + int idx = 0; + int len=0; + int magic_control; + + MOD_INC_USE_COUNT; + + IP_MASQ_DEBUG(1-debug, "Entered user_info with proto=%d\n", proto); + + if (offset < 128) + { + sprintf(temp, + "Prot SrcIP SPrt DstIP DPrt MAddr MPrt State Flgs Ref Ctl Expires (free=%d,%d,%d)", + atomic_read(ip_masq_free_ports), + atomic_read(ip_masq_free_ports+1), + atomic_read(ip_masq_free_ports+2)); + len = sprintf(buffer, "%-127s\n", temp); + } + pos = 128; + + for(idx = 0; idx < IP_MASQ_TAB_SIZE; idx++) + { + /* + * Lock is actually only need in next loop + * we are called from uspace: must stop bh. + */ + read_lock_bh(&__ip_masq_lock); + for(ms = ip_masq_m_tab[idx]; ms ; ms = ms->m_link) + { + if (ms->protocol != proto) { + continue; + } + + pos += 128; + if (pos <= offset) { + len = 0; + continue; + } + + /* + * We have locked the tables, no need to del/add timers + * nor cli() 8) + */ + + + magic_control = atomic_read(&ms->n_control); + if (!magic_control && ms->control) magic_control = -1; + sprintf(temp,"%-4s %08lX:%04X %08lX:%04X %08lX:%04X %-12s %3X %4d %3d %7lu", + masq_proto_name(ms->protocol), + ntohl(ms->saddr), ntohs(ms->sport), + ntohl(ms->daddr), ntohs(ms->dport), + ntohl(ms->maddr), ntohs(ms->mport), + ip_masq_state_name(ms->state), + ms->flags, + atomic_read(&ms->refcnt), + magic_control, + (ms->timer.expires-jiffies)/HZ); + len += sprintf(buffer+len, "%-127s\n", temp); + + if(len >= length) { + read_unlock_bh(&__ip_masq_lock); + goto done; + } + } + read_unlock_bh(&__ip_masq_lock); + } + +done: + + if (len) { + begin = len - (pos - offset); + *start = buffer + begin; + len -= begin; + } + if(len>length) + len = length; + MOD_DEC_USE_COUNT; + return len; +} +#else +#define ip_masq_user_info NULL +#endif + +static struct ip_masq_hook ip_masq_user = { + ip_masq_user_ctl, + ip_masq_user_info +}; + +int ip_masq_user_init(void) +{ + if (ip_masq_user_hook != NULL) + return -EEXIST; + ip_masq_user_hook = &ip_masq_user; + return 0; +} + +int ip_masq_user_done(void) +{ + if (ip_masq_user_hook == NULL) + return ENOENT; + ip_masq_user_hook = NULL; + return 0; +} + +#ifdef MODULE +EXPORT_NO_SYMBOLS; +int init_module(void) +{ + if (ip_masq_user_init() != 0) + return -EIO; + return 0; +} + +void cleanup_module(void) +{ + if (ip_masq_user_done() != 0) + printk(KERN_INFO "ip_masq_user_done(): can't remove module"); +} + +#endif /* MODULE */ diff --git a/net/ipv4/ip_masq_vdolive.c b/net/ipv4/ip_masq_vdolive.c index 3b74d5f6f..4724e3b93 100644 --- a/net/ipv4/ip_masq_vdolive.c +++ b/net/ipv4/ip_masq_vdolive.c @@ -2,7 +2,7 @@ * IP_MASQ_VDOLIVE - VDO Live masquerading module * * - * Version: @(#)$Id: ip_masq_vdolive.c,v 1.2 1997/11/28 15:32:35 alan Exp $ + * Version: @(#)$Id: ip_masq_vdolive.c,v 1.4 1998/10/06 04:49:07 davem Exp $ * * Author: Nigel Metheringham <Nigel.Metheringham@ThePLAnet.net> * PLAnet Online Ltd @@ -22,6 +22,7 @@ * */ +#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> @@ -52,10 +53,12 @@ struct ip_masq_app *masq_incarnations[MAX_MASQ_APP_PORTS]; /* * Debug level */ +#ifdef CONFIG_IP_MASQ_DEBUG static int debug=0; +MODULE_PARM(debug, "i"); +#endif MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i"); -MODULE_PARM(debug, "i"); static int masq_vdolive_init_1 (struct ip_masq_app *mapp, struct ip_masq *ms) diff --git a/net/ipv4/ip_nat_dumb.c b/net/ipv4/ip_nat_dumb.c index c48ea9263..9f9966b34 100644 --- a/net/ipv4/ip_nat_dumb.c +++ b/net/ipv4/ip_nat_dumb.c @@ -5,7 +5,7 @@ * * Dumb Network Address Translation. * - * Version: $Id: ip_nat_dumb.c,v 1.4 1998/08/26 12:03:49 davem Exp $ + * Version: $Id: ip_nat_dumb.c,v 1.7 1998/10/06 04:49:09 davem Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * @@ -18,11 +18,13 @@ * Rani Assaf : A zero checksum is a special case * only in UDP * Rani Assaf : Added ICMP messages rewriting + * Rani Assaf : Repaired wrong changes, made by ANK. * * * NOTE: It is just working model of real NAT. */ +#include <linux/config.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/sched.h> @@ -40,6 +42,7 @@ #include <net/checksum.h> #include <linux/route.h> #include <net/route.h> +#include <net/ip_fib.h> int @@ -89,7 +92,8 @@ ip_do_nat(struct sk_buff *skb) if ((icmph->type != ICMP_DEST_UNREACH) && (icmph->type != ICMP_TIME_EXCEEDED) && - (icmph->type != ICMP_PARAMETERPROB)) break; + (icmph->type != ICMP_PARAMETERPROB)) + break; ciph = (struct iphdr *) (icmph + 1); @@ -98,8 +102,30 @@ ip_do_nat(struct sk_buff *skb) if (rt->rt_flags&RTCF_DNAT && ciph->saddr == odaddr) ciph->saddr = iph->daddr; - if (rt->rt_flags&RTCF_SNAT && ciph->daddr == osaddr) - ciph->daddr = iph->saddr; + if (rt->rt_flags&RTCF_SNAT) { + if (ciph->daddr != osaddr) { + struct fib_result res; + struct rt_key key; + unsigned flags = 0; + + key.src = ciph->daddr; + key.dst = ciph->saddr; + key.iif = skb->dev->ifindex; + key.oif = 0; +#ifdef CONFIG_IP_ROUTE_TOS + key.tos = RT_TOS(ciph->tos); +#endif + /* Use fib_lookup() until we get our own + * hash table of NATed hosts -- Rani + */ + if (fib_lookup(&key, &res) != 0) + return 0; + if (res.r) + ciph->daddr = fib_rules_policy(ciph->daddr, &res, &flags); + } + else + ciph->daddr = iph->saddr; + } break; } default: diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 153c7a391..92502239c 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -5,7 +5,7 @@ * * The options processing module for ip.c * - * Version: $Id: ip_options.c,v 1.14 1998/08/26 12:03:51 davem Exp $ + * Version: $Id: ip_options.c,v 1.15 1998/10/03 09:37:27 davem Exp $ * * Authors: A.N.Kuznetsov * @@ -89,12 +89,6 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) int optlen; u32 daddr; -#if 111 - if (skb == NULL) { - printk(KERN_DEBUG "no skb in ip_options_echo\n"); - return -EINVAL; - } -#endif memset(dopt, 0, sizeof(struct ip_options)); dopt->is_data = 1; @@ -145,14 +139,19 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) return -EINVAL; dopt->ts_needtime = 1; soffset += 4; - } - if (((struct timestamp*)(dptr+1))->flags == IPOPT_TS_PRESPEC) { - __u32 addr; - memcpy(&addr, sptr+soffset-9, 4); - if (inet_addr_type(addr) == RTN_UNICAST) { - dopt->ts_needtime = 0; - dopt->ts_needaddr = 0; - soffset -= 8; + if ((dptr[3]&0xF) == IPOPT_TS_PRESPEC) { + __u32 addr; + if (soffset + 3 > optlen) + return -EINVAL; + soffset += 4; + if (soffset + 8 <= optlen) { + dopt->ts_needtime = 0; + memcpy(&addr, sptr+soffset-1, 4); + if (inet_addr_type(addr) != RTN_UNICAST) { + dopt->ts_needtime = 1; + soffset += 8; + } + } } } dptr[2] = soffset; @@ -353,55 +352,56 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb) goto error; } if (optptr[2] <= optlen) { - struct timestamp * ts = (struct timestamp*)(optptr+1); __u32 * timeptr = NULL; - if (ts->ptr+3 > ts->len) { + if (optptr[2]+3 > optptr[1]) { pp_ptr = optptr + 2; goto error; } - switch (ts->flags) { + switch (optptr[3]&0xF) { case IPOPT_TS_TSONLY: opt->ts = optptr - iph; if (skb) - timeptr = (__u32*)&optptr[ts->ptr-1]; + timeptr = (__u32*)&optptr[optptr[2]-1]; opt->ts_needtime = 1; - ts->ptr += 4; + optptr[2] += 4; break; case IPOPT_TS_TSANDADDR: - if (ts->ptr+7 > ts->len) { + if (optptr[2]+7 > optptr[1]) { pp_ptr = optptr + 2; goto error; } opt->ts = optptr - iph; if (skb) { - memcpy(&optptr[ts->ptr-1], &rt->rt_spec_dst, 4); - timeptr = (__u32*)&optptr[ts->ptr+3]; + memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); + timeptr = (__u32*)&optptr[optptr[2]+3]; } opt->ts_needaddr = 1; opt->ts_needtime = 1; - ts->ptr += 8; + optptr[2] += 8; break; case IPOPT_TS_PRESPEC: - if (ts->ptr+7 > ts->len) { + if (optptr[2]+7 > optptr[1]) { pp_ptr = optptr + 2; goto error; } opt->ts = optptr - iph; { u32 addr; - memcpy(&addr, &optptr[ts->ptr-1], 4); + memcpy(&addr, &optptr[optptr[2]-1], 4); if (inet_addr_type(addr) == RTN_UNICAST) break; if (skb) - timeptr = (__u32*)&optptr[ts->ptr+3]; + timeptr = (__u32*)&optptr[optptr[2]+3]; } - opt->ts_needaddr = 1; opt->ts_needtime = 1; - ts->ptr += 8; + optptr[2] += 8; break; default: - pp_ptr = optptr + 3; - goto error; + if (!skb && !capable(CAP_NET_RAW)) { + pp_ptr = optptr + 3; + goto error; + } + break; } if (timeptr) { struct timeval tv; @@ -412,14 +412,14 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb) opt->is_changed = 1; } } else { - struct timestamp * ts = (struct timestamp*)(optptr+1); - if (ts->overflow == 15) { + unsigned overflow = optptr[3]>>4; + if (overflow == 15) { pp_ptr = optptr + 3; goto error; } opt->ts = optptr - iph; if (skb) { - ts->overflow++; + optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4); opt->is_changed = 1; } } @@ -435,7 +435,7 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb) case IPOPT_SEC: case IPOPT_SID: default: - if (!skb) { + if (!skb && !capable(CAP_NET_RAW)) { pp_ptr = optptr; goto error; } @@ -480,10 +480,10 @@ void ip_options_undo(struct ip_options * opt) memset(&optptr[optptr[2]-1], 0, 4); optptr[2] -= 4; } - if (opt->ts_needaddr) { + if (opt->ts_needaddr) memset(&optptr[optptr[2]-1], 0, 4); + if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) optptr[2] -= 4; - } } } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 9250051ab..5edfbef93 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) output module. * - * Version: $Id: ip_output.c,v 1.61 1998/08/26 12:03:54 davem Exp $ + * Version: $Id: ip_output.c,v 1.63 1998/10/03 09:37:30 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -111,8 +111,7 @@ void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, iph->ihl = 5; iph->tos = sk->ip_tos; iph->frag_off = 0; - if (sk->ip_pmtudisc == IP_PMTUDISC_WANT && - !(rt->u.dst.mxlock&(1<<RTAX_MTU))) + if (ip_dont_fragment(sk, &rt->u.dst)) iph->frag_off |= htons(IP_DF); iph->ttl = sk->ip_ttl; iph->daddr = rt->rt_dst; @@ -312,7 +311,7 @@ void ip_queue_xmit(struct sk_buff *skb) if (tot_len > rt->u.dst.pmtu) goto fragment; - if (sk->ip_pmtudisc == IP_PMTUDISC_WANT && !(rt->u.dst.mxlock & (1 << RTAX_MTU))) + if (ip_dont_fragment(sk, &rt->u.dst)) iph->frag_off |= __constant_htons(IP_DF); /* Add an IP checksum. */ @@ -323,8 +322,7 @@ void ip_queue_xmit(struct sk_buff *skb) return; fragment: - if (sk->ip_pmtudisc == IP_PMTUDISC_WANT && - !(rt->u.dst.mxlock & (1 << RTAX_MTU)) && + if (ip_dont_fragment(sk, &rt->u.dst) && tot_len > (iph->ihl<<2) + sizeof(struct tcphdr)+16) { /* Reject packet ONLY if TCP might fragment it itself, if were careful enough. @@ -383,23 +381,24 @@ int ip_build_xmit_slow(struct sock *sk, unsigned int fraglen, maxfraglen, fragheaderlen; int err; int offset, mf; + int mtu; unsigned short id; int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15; int nfrags=0; struct ip_options *opt = ipc->opt; - int df = htons(IP_DF); + int df = 0; - if (sk->ip_pmtudisc == IP_PMTUDISC_DONT || - (rt->u.dst.mxlock&(1<<RTAX_MTU))) - df = 0; + mtu = rt->u.dst.pmtu; + if (ip_dont_fragment(sk, &rt->u.dst)) + df = htons(IP_DF); if (!sk->ip_hdrincl) length -= sizeof(struct iphdr); if (opt) { fragheaderlen = sizeof(struct iphdr) + opt->optlen; - maxfraglen = ((rt->u.dst.pmtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen; + maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen; } else { fragheaderlen = sk->ip_hdrincl ? 0 : sizeof(struct iphdr); @@ -408,11 +407,13 @@ int ip_build_xmit_slow(struct sock *sk, * out the size of the frames to send. */ - maxfraglen = ((rt->u.dst.pmtu-sizeof(struct iphdr)) & ~7) + fragheaderlen; + maxfraglen = ((mtu-sizeof(struct iphdr)) & ~7) + fragheaderlen; } - if (length + fragheaderlen > 0xFFFF) + if (length + fragheaderlen > 0xFFFF) { + ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu); return -EMSGSIZE; + } /* * Start at the end of the frame by handling the remainder. @@ -443,6 +444,7 @@ int ip_build_xmit_slow(struct sock *sk, */ if (offset > 0 && df) { + ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu); return(-EMSGSIZE); } @@ -544,7 +546,7 @@ int ip_build_xmit_slow(struct sock *sk, * Account for the fragment. */ - if(!err && offset == 0 && + if(!err && call_out_firewall(PF_INET, rt->u.dst.dev, skb->nh.iph, NULL, &skb) < FW_ACCEPT) err = -EPERM; @@ -612,10 +614,9 @@ int ip_build_xmit(struct sock *sk, /* * Do path mtu discovery if needed. */ - df = htons(IP_DF); - if (sk->ip_pmtudisc == IP_PMTUDISC_DONT || - (rt->u.dst.mxlock&(1<<RTAX_MTU))) - df = 0; + df = 0; + if (ip_dont_fragment(sk, &rt->u.dst)) + df = htons(IP_DF); /* * Fast path for unfragmented frames without options. diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 3d8f4fab6..1391cbd24 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -5,7 +5,7 @@ * * The IP to API glue. * - * Version: $Id: ip_sockglue.c,v 1.37 1998/08/26 12:03:57 davem Exp $ + * Version: $Id: ip_sockglue.c,v 1.39 1998/10/03 09:37:33 davem Exp $ * * Authors: see ip.c * @@ -41,6 +41,11 @@ #include <net/transp_v6.h> #endif +#ifdef CONFIG_IP_MASQUERADE +#include <linux/ip_masq.h> +#endif + +#include <linux/errqueue.h> #include <asm/uaccess.h> #define MAX(a,b) ((a)>(b)?(a):(b)) @@ -74,7 +79,8 @@ static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb) { - put_cmsg(msg, SOL_IP, IP_TTL, 1, &skb->nh.iph->ttl); + int ttl = skb->nh.iph->ttl; + put_cmsg(msg, SOL_IP, IP_TTL, sizeof(int), &ttl); } static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb) @@ -221,6 +227,140 @@ int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct s return 0; } +void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, + u16 port, u32 info, u8 *payload) +{ + struct sock_exterr_skb *serr; + + if (!sk->ip_recverr) + return; + + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) + return; + + serr = SKB_EXT_ERR(skb); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_ICMP; + serr->ee.ee_type = skb->h.icmph->type; + serr->ee.ee_code = skb->h.icmph->code; + serr->ee.ee_pad = 0; + serr->ee.ee_info = info; + serr->ee.ee_data = 0; + serr->addr_offset = (u8*)&(((struct iphdr*)(skb->h.icmph+1))->daddr) - skb->nh.raw; + serr->port = port; + + skb->h.raw = payload; + skb_pull(skb, payload - skb->data); + + if (sock_queue_err_skb(sk, skb)) + kfree_skb(skb); +} + +void ip_local_error(struct sock *sk, int err, u32 daddr, u16 port, u32 info) +{ + struct sock_exterr_skb *serr; + struct iphdr *iph; + struct sk_buff *skb; + + if (!sk->ip_recverr) + return; + + skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC); + if (!skb) + return; + + iph = (struct iphdr*)skb_put(skb, sizeof(struct iphdr)); + skb->nh.iph = iph; + iph->daddr = daddr; + + serr = SKB_EXT_ERR(skb); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; + serr->ee.ee_type = 0; + serr->ee.ee_code = 0; + serr->ee.ee_pad = 0; + serr->ee.ee_info = info; + serr->ee.ee_data = 0; + serr->addr_offset = (u8*)&iph->daddr - skb->nh.raw; + serr->port = port; + + skb->h.raw = skb->tail; + skb_pull(skb, skb->tail - skb->data); + + if (sock_queue_err_skb(sk, skb)) + kfree_skb(skb); +} + +/* + * Handle MSG_ERRQUEUE + */ +int ip_recv_error(struct sock *sk, struct msghdr *msg, int len) +{ + struct sock_exterr_skb *serr; + struct sk_buff *skb, *skb2; + struct sockaddr_in *sin; + struct { + struct sock_extended_err ee; + struct sockaddr_in offender; + } errhdr; + int err; + int copied; + + err = -EAGAIN; + skb = skb_dequeue(&sk->error_queue); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = memcpy_toiovec(msg->msg_iov, skb->data, copied); + if (err) + goto out_free_skb; + + serr = SKB_EXT_ERR(skb); + + sin = (struct sockaddr_in *)msg->msg_name; + if (sin) { + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = *(u32*)(skb->nh.raw + serr->addr_offset); + sin->sin_port = serr->port; + } + + memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); + sin = &errhdr.offender; + sin->sin_family = AF_UNSPEC; + if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) { + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = skb->nh.iph->saddr; + if (sk->ip_cmsg_flags) + ip_cmsg_recv(msg, skb); + } + + put_cmsg(msg, SOL_IP, IP_RECVERR, sizeof(errhdr), &errhdr); + + /* Now we could try to dump offended packet options */ + + msg->msg_flags |= MSG_ERRQUEUE; + err = copied; + + /* Reset and regenerate socket error */ + sk->err = 0; + if ((skb2 = skb_peek(&sk->error_queue)) != NULL) { + sk->err = SKB_EXT_ERR(skb2)->ee.ee_errno; + sk->error_report(sk); + } + +out_free_skb: + kfree_skb(skb); +out: + return err; +} + + /* * Socket option code for IP. This is the end of the line after any TCP,UDP etc options on * an IP socket. @@ -234,10 +374,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt #if defined(CONFIG_IP_FIREWALL) char tmp_fw[MAX(sizeof(struct ip_fwtest),sizeof(struct ip_fwnew))]; #endif -#ifdef CONFIG_IP_MASQUERADE - char masq_ctl[IP_FW_MASQCTL_MAX]; -#endif - if(optlen>=sizeof(int)) { if(get_user(val, (int *) optval)) return -EFAULT; @@ -347,23 +483,15 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt return -ENOPROTOOPT; sk->ip_hdrincl=val?1:0; return 0; - case IP_PMTUDISC: + case IP_MTU_DISCOVER: if (val<0 || val>2) return -EINVAL; sk->ip_pmtudisc = val; return 0; case IP_RECVERR: - if (sk->type==SOCK_STREAM) - return -ENOPROTOOPT; - lock_sock(sk); - if (sk->ip_recverr && !val) { - struct sk_buff *skb; - /* Drain queued errors */ - while((skb=skb_dequeue(&sk->error_queue))!=NULL) - kfree_skb(skb); - } - sk->ip_recverr = val?1:0; - release_sock(sk); + sk->ip_recverr = !!val; + if (!val) + skb_queue_purge(&sk->error_queue); return 0; case IP_MULTICAST_TTL: if (optlen<1) @@ -466,17 +594,13 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt return -err; /* -0 is 0 after all */ #endif /* CONFIG_IP_FIREWALL */ #ifdef CONFIG_IP_MASQUERADE - case IP_FW_MASQ_ADD: - case IP_FW_MASQ_DEL: - case IP_FW_MASQ_FLUSH: + case IP_FW_MASQ_CTL: if(!capable(CAP_NET_ADMIN)) return -EPERM; - if(optlen>sizeof(masq_ctl) || optlen<1) + if(optlen<1) return -EINVAL; - if(copy_from_user(masq_ctl,optval,optlen)) - return -EFAULT; - err=ip_masq_ctl(optname, masq_ctl,optlen); - return -err; /* -0 is 0 after all */ + err=ip_masq_uctl(optname, optval ,optlen); + return err; #endif default: @@ -491,7 +615,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen) { - int val,err; + int val; int len; if(level!=SOL_IP) @@ -554,9 +678,18 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *op case IP_HDRINCL: val=sk->ip_hdrincl; break; - case IP_PMTUDISC: + case IP_MTU_DISCOVER: val=sk->ip_pmtudisc; break; + case IP_MTU: + val = 0; + lock_sock(sk); + if (sk->dst_cache) + val = sk->dst_cache->pmtu; + release_sock(sk); + if (!val) + return -ENOTCONN; + break; case IP_RECVERR: val=sk->ip_recverr; break; @@ -566,7 +699,6 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *op case IP_MULTICAST_LOOP: val=sk->ip_mc_loop; break; -#if 0 case IP_MULTICAST_IF: { struct ip_mreqn mreq; @@ -580,30 +712,6 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *op return -EFAULT; return 0; } -#endif - case IP_MULTICAST_IF: - { - struct device *dev = dev_get_by_index(sk->ip_mc_index); - - printk(KERN_INFO "application %s uses old get IP_MULTICAST_IF. Please, report!\n", current->comm); - - if (dev == NULL) - { - len = 0; - return put_user(len, optlen); - } - dev_lock_list(); - len = min(len,strlen(dev->name)); - err = put_user(len, optlen); - if (!err) - { - err = copy_to_user((void *)optval,dev->name, len); - if(err) - err=-EFAULT; - } - dev_unlock_list(); - return err; - } default: return(-ENOPROTOOPT); } diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 76372b4ab..db1d7fc3f 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1,5 +1,5 @@ /* - * $Id: ipconfig.c,v 1.15 1998/06/19 13:22:33 davem Exp $ + * $Id: ipconfig.c,v 1.16 1998/10/21 22:27:26 davem Exp $ * * Automatic Configuration of IP -- use BOOTP or RARP or user-supplied * information to configure own IP address and routes. @@ -260,12 +260,12 @@ __initfunc(int ic_defaults(void)) root_server_addr = ic_servaddr; if (ic_netmask == INADDR_NONE) { - if (IN_CLASSA(ic_myaddr)) - ic_netmask = IN_CLASSA_NET; - else if (IN_CLASSB(ic_myaddr)) - ic_netmask = IN_CLASSB_NET; - else if (IN_CLASSC(ic_myaddr)) - ic_netmask = IN_CLASSC_NET; + if (IN_CLASSA(ntohl(ic_myaddr))) + ic_netmask = htonl(IN_CLASSA_NET); + else if (IN_CLASSB(ntohl(ic_myaddr))) + ic_netmask = htonl(IN_CLASSB_NET); + else if (IN_CLASSC(ntohl(ic_myaddr))) + ic_netmask = htonl(IN_CLASSC_NET); else { printk(KERN_ERR "IP-Config: Unable to guess netmask for address %08x\n", ic_myaddr); return -1; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 778ac15c1..9175e6fe6 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -1,7 +1,7 @@ /* * Linux NET3: IP/IP protocol decoder. * - * Version: $Id: ipip.c,v 1.23 1998/08/26 12:04:00 davem Exp $ + * Version: $Id: ipip.c,v 1.24 1998/10/03 09:37:35 davem Exp $ * * Authors: * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 @@ -625,6 +625,10 @@ ipip_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd) case SIOCADDTUNNEL: case SIOCCHGTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + err = -EFAULT; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) goto done; @@ -652,6 +656,10 @@ ipip_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd) break; case SIOCDELTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + if (dev == &ipip_fb_tunnel_dev) { err = -EFAULT; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 49cd6daf5..79ecd1102 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -9,7 +9,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Version: $Id: ipmr.c,v 1.36 1998/08/26 12:04:03 davem Exp $ + * Version: $Id: ipmr.c,v 1.37 1998/10/03 09:37:39 davem Exp $ * * Fixes: * Michael Chastain : Incorrect size of copying. @@ -431,7 +431,6 @@ static void ipmr_cache_resolve(struct mfc_cache *cache) ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE; } err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).pid, MSG_DONTWAIT); - if (err < 0) printk(KERN_DEBUG "Err=%d", err); } else #endif ip_mr_forward(skb, cache, 0); @@ -476,9 +475,10 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) msg->im_vif = reg_vif_num; skb->nh.iph->ihl = sizeof(struct iphdr) >> 2; skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr)); - } else { + } else #endif - + { + /* * Copy the IP header */ @@ -500,9 +500,7 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) igmp->code = 0; skb->nh.iph->tot_len=htons(skb->len); /* Fix the length */ skb->h.raw = skb->nh.raw; -#ifdef CONFIG_IP_PIMSM } -#endif /* * Deliver to mrouted @@ -753,7 +751,9 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen) return -EADDRNOTAVAIL; break; default: +#if 0 printk(KERN_DEBUG "ipmr_add_vif: flags %02x\n", vif.vifc_flags); +#endif return -EINVAL; } @@ -1548,7 +1548,6 @@ done: len=length; if (len < 0) { len = 0; - printk(KERN_CRIT "Yep, guys... our template for proc_*_read is crappy :-)\n"); } return len; } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 6f06f4345..f8990903e 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -7,7 +7,7 @@ * PROC file system. It is mainly used for debugging and * statistics. * - * Version: $Id: proc.c,v 1.31 1998/07/29 20:09:25 freitag Exp $ + * Version: $Id: proc.c,v 1.33 1998/10/21 05:44:35 davem Exp $ * * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de> @@ -55,21 +55,22 @@ static inline void get__openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i) { - /* FIXME: I'm not sure if the timer fields are correct. */ sprintf(tmpbuf, "%4d: %08lX:%04X %08lX:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu", + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u", i, (long unsigned int)req->af.v4_req.loc_addr, ntohs(sk->sport), (long unsigned int)req->af.v4_req.rmt_addr, - req->rmt_port, + ntohs(req->rmt_port), TCP_SYN_RECV, - 0,0, /* use sizeof(struct open_request) here? */ - 0, (unsigned long)(req->expires - jiffies), /* ??? */ + 0,0, /* could print option size, but that is af dependent. */ + 1, /* timers active (only the expire timer) */ + (unsigned long)(req->expires - jiffies), req->retrans, sk->socket ? sk->socket->inode->i_uid : 0, - 0, /* ??? */ - sk->socket ? sk->socket->inode->i_ino:0); + 0, /* non standard timer */ + 0 /* open_requests have no inode */ + ); } /* Format a single socket into tmpbuf. */ @@ -157,6 +158,9 @@ static inline void get__sock(struct sock *sp, char *tmpbuf, int i, int format) * KNOWN BUGS * As in get_unix_netinfo, the buffer might be too small. If this * happens, get__netinfo returns only part of the available infos. + * + * Assumes that buffer length is a multiply of 128 - if not it will + * write past the end. */ static int get__netinfo(struct proto *pro, char *buffer, int format, char **start, off_t offset, int length) @@ -172,17 +176,6 @@ get__netinfo(struct proto *pro, char *buffer, int format, char **start, off_t of " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout inode"); pos = 128; -/* - * This was very pretty but didn't work when a socket is destroyed - * at the wrong moment (eg a syn recv socket getting a reset), or - * a memory timer destroy. Instead of playing with timers we just - * concede defeat and do a start_bh_atomic(). - * Why not just use lock_sock()? As far as I can see all timer routines - * check for sock_readers before doing anything. -AK - * [Disabled for now again, because it hard-locked my machine, and there - * is an theoretical situation then, where an user could prevent - * sockets from being destroyed by constantly reading /proc/net/tcp.] - */ SOCKHASH_LOCK(); sp = pro->sklist_next; while(sp != (struct sock *)pro) { @@ -196,8 +189,8 @@ get__netinfo(struct proto *pro, char *buffer, int format, char **start, off_t of continue; get__openreq(sp, req, tmpbuf, i); len += sprintf(buffer+len, "%-127s\n", tmpbuf); - if(len >= length) - break; + if(len >= length) + goto out; } } @@ -215,6 +208,7 @@ get__netinfo(struct proto *pro, char *buffer, int format, char **start, off_t of sp = next; i++; } +out: SOCKHASH_UNLOCK(); begin = len - (pos - offset); @@ -222,6 +216,8 @@ get__netinfo(struct proto *pro, char *buffer, int format, char **start, off_t of len -= begin; if(len>length) len = length; + if (len<0) + len = 0; return len; } @@ -265,6 +261,8 @@ int afinet_get_info(char *buffer, char **start, off_t offset, int length, int du len -= offset; if (len > length) len = length; + if (len < 0) + len = 0; return len; } @@ -343,6 +341,8 @@ int snmp_get_info(char *buffer, char **start, off_t offset, int length, int dumm len -= offset; if (len > length) len = length; + if (len < 0) + len = 0; return len; } @@ -357,15 +357,18 @@ int netstat_get_info(char *buffer, char **start, off_t offset, int length, int d len = sprintf(buffer, "TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed" - " EmbryonicRsts PruneCalled RcvPruned OfoPruned\n" - "TcpExt: %lu %lu %lu %lu %lu %lu %lu\n", + " EmbryonicRsts PruneCalled RcvPruned OfoPruned" + " OutOfWindowIcmps LockDroppedIcmps\n" + "TcpExt: %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", net_statistics.SyncookiesSent, net_statistics.SyncookiesRecv, net_statistics.SyncookiesFailed, net_statistics.EmbryonicRsts, net_statistics.PruneCalled, net_statistics.RcvPruned, - net_statistics.OfoPruned); + net_statistics.OfoPruned, + net_statistics.OutOfWindowIcmps, + net_statistics.LockDroppedIcmps); if (offset >= len) { @@ -376,5 +379,7 @@ int netstat_get_info(char *buffer, char **start, off_t offset, int length, int d len -= offset; if (len > length) len = length; + if (len < 0) + len = 0; return len; } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index e10ddc0dd..fc6b1f2ee 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -5,7 +5,7 @@ * * RAW - implementation of IP "raw" sockets. * - * Version: $Id: raw.c,v 1.37 1998/08/26 12:04:07 davem Exp $ + * Version: $Id: raw.c,v 1.39 1998/11/08 11:17:04 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -143,26 +143,53 @@ struct sock *raw_v4_lookup(struct sock *sk, unsigned short num, return s; } -/* - * Raw_err does not currently get called by the icmp module - FIXME: - */ - void raw_err (struct sock *sk, struct sk_buff *skb) { int type = skb->h.icmph->type; int code = skb->h.icmph->code; - - if (sk->ip_recverr) { - struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); - if (skb2 && sock_queue_err_skb(sk, skb2)) - kfree_skb(skb); + u32 info = 0; + int err = 0; + int harderr = 0; + + /* Report error on raw socket, if: + 1. User requested ip_recverr. + 2. Socket is connected (otherwise the error indication + is useless without ip_recverr and error is hard. + */ + if (!sk->ip_recverr && sk->state != TCP_ESTABLISHED) + return; + + switch (type) { + default: + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + case ICMP_SOURCE_QUENCH: + return; + case ICMP_PARAMETERPROB: + err = EPROTO; + info = ntohl(skb->h.icmph->un.gateway)>>24; + harderr = 1; + break; + case ICMP_DEST_UNREACH: + err = EHOSTUNREACH; + if (code > NR_ICMP_UNREACH) + break; + err = icmp_err_convert[code].errno; + harderr = icmp_err_convert[code].fatal; + if (code == ICMP_FRAG_NEEDED) { + harderr = (sk->ip_pmtudisc != IP_PMTUDISC_DONT); + err = EMSGSIZE; + info = ntohs(skb->h.icmph->un.frag.mtu); + } } - if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { - if (sk->ip_pmtudisc != IP_PMTUDISC_DONT) { - sk->err = EMSGSIZE; - sk->error_report(sk); - } + if (sk->ip_recverr) + ip_icmp_error(sk, skb, err, 0, info, (u8 *)(skb->h.icmph + 1)); + + if (sk->ip_recverr || harderr) { + sk->err = err; + sk->error_report(sk); } } @@ -170,7 +197,7 @@ static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) { /* Charge it to the socket. */ - if (__sock_queue_rcv_skb(sk,skb)<0) + if (sock_queue_rcv_skb(sk,skb)<0) { ip_statistics.IpInDiscards++; kfree_skb(skb); @@ -373,7 +400,7 @@ done: return err<0 ? err : len; } -static void raw_close(struct sock *sk, unsigned long timeout) +static void raw_close(struct sock *sk, long timeout) { /* Observation: when raw_close is called, processes have no access to socket anymore. But net still has. @@ -443,23 +470,12 @@ int raw_recvmsg(struct sock *sk, struct msghdr *msg, int len, if (flags & MSG_OOB) return -EOPNOTSUPP; - - if (sk->shutdown & RCV_SHUTDOWN) - return(0); if (addr_len) *addr_len=sizeof(*sin); - if (sk->ip_recverr && (skb = skb_dequeue(&sk->error_queue)) != NULL) { - err = sock_error(sk); - if (msg->msg_controllen == 0) { - skb_free_datagram(sk, skb); - return err; - } - put_cmsg(msg, SOL_IP, IP_RECVERR, skb->len, skb->data); - skb_free_datagram(sk, skb); - return 0; - } + if (flags & MSG_ERRQUEUE) + return ip_recv_error(sk, msg, len); skb=skb_recv_datagram(sk,flags,noblock,&err); if(skb==NULL) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 5788342c9..a3d002fae 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -5,7 +5,7 @@ * * ROUTE - implementation of the IP router. * - * Version: $Id: route.c,v 1.57 1998/08/26 12:04:09 davem Exp $ + * Version: $Id: route.c,v 1.58 1998/10/03 09:37:50 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -333,7 +333,7 @@ void rt_cache_flush(int delay) otherwise fire it at deadline time. */ - if (user_mode && (long)(rt_deadline-now) < ip_rt_max_delay-ip_rt_min_delay) + if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay) tmo = 0; if (delay > tmo) @@ -432,7 +432,7 @@ static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt) rthp = &rth->u.rt_next; } - /* Try to bind route ro arp only if it is output + /* Try to bind route to arp only if it is output route or unicast forwarding path. */ if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) @@ -569,12 +569,26 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) struct rtable *rt = (struct rtable*)dst; if (rt != NULL) { - if (dst->obsolete || rt->rt_flags&RTCF_REDIRECTED) { + if (dst->obsolete) { + ip_rt_put(rt); + return NULL; + } + if (rt->rt_flags&RTCF_REDIRECTED) { + unsigned hash = rt_hash_code(rt->key.dst, rt->key.src^(rt->key.oif<<5), rt->key.tos); + struct rtable **rthp; #if RT_CACHE_DEBUG >= 1 printk(KERN_DEBUG "ip_rt_advice: redirect to %d.%d.%d.%d/%02x dropped\n", NIPQUAD(rt->rt_dst), rt->key.tos); #endif ip_rt_put(rt); - rt_cache_flush(0); + start_bh_atomic(); + for (rthp = &rt_hash_table[hash]; *rthp; rthp = &(*rthp)->u.rt_next) { + if (*rthp == rt) { + *rthp = rt->u.rt_next; + rt_free(rt); + break; + } + } + end_bh_atomic(); return NULL; } } @@ -654,12 +668,12 @@ static int ip_error(struct sk_buff *skb) } now = jiffies; - if ((rt->u.dst.rate_tokens += now - rt->u.dst.rate_last) > ip_rt_error_burst) + if ((rt->u.dst.rate_tokens += (now - rt->u.dst.rate_last)) > ip_rt_error_burst) rt->u.dst.rate_tokens = ip_rt_error_burst; + rt->u.dst.rate_last = now; if (rt->u.dst.rate_tokens >= ip_rt_error_cost) { rt->u.dst.rate_tokens -= ip_rt_error_cost; icmp_send(skb, ICMP_DEST_UNREACH, code, 0); - rt->u.dst.rate_last = now; } kfree_skb(skb); @@ -1004,8 +1018,8 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, flags |= RTCF_DOREDIRECT; if (skb->protocol != __constant_htons(ETH_P_IP)) { - /* Not IP (i.e. ARP). Do not make route for invalid - * destination AND it is not translated destination. + /* Not IP (i.e. ARP). Do not create route, if it is + * invalid for proxy arp. DNAT routes are always valid. */ if (out_dev == in_dev && !(flags&RTCF_DNAT)) return -EINVAL; @@ -1069,6 +1083,7 @@ brd_input: flags |= RTCF_DIRECTSRC; } flags |= RTCF_BROADCAST; + res.type = RTN_BROADCAST; local_input: rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); @@ -1227,7 +1242,7 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int if address is local --- clear the flag. */ if (dev_out == NULL) { - if (nochecksrc == 0) + if (nochecksrc == 0 || inet_addr_type(saddr) != RTN_UNICAST) return -EINVAL; flags |= RTCF_TPROXY; } @@ -1251,7 +1266,7 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) { /* Special hack: user can direct multicasts and limited broadcast via necessary interface - without fiddling with IP_MULTICAST_IF or IP_TXINFO. + without fiddling with IP_MULTICAST_IF or IP_PKTINFO. This hack is not just for fun, it allows vic,vat and friends to work. They bind socket to loopback, set ttl to zero @@ -1280,11 +1295,9 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); goto make_route; } - if (MULTICAST(daddr)) { + if (MULTICAST(daddr)) key.src = inet_select_addr(dev_out, 0, key.scope); - goto make_route; - } - if (!daddr) + else if (!daddr) key.src = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } @@ -1378,13 +1391,18 @@ make_route: flags |= RTCF_LOCAL; if (res.type == RTN_BROADCAST) { - flags |= RTCF_BROADCAST; - if (dev_out->flags&IFF_BROADCAST) - flags |= RTCF_LOCAL; + flags |= RTCF_BROADCAST|RTCF_LOCAL; + res.fi = NULL; } else if (res.type == RTN_MULTICAST) { flags |= RTCF_MULTICAST|RTCF_LOCAL; if (!ip_check_mc(dev_out, daddr)) flags &= ~RTCF_LOCAL; + /* If multicast route do not exist use + default one, but do not gateway in this case. + Yes, it is hack. + */ + if (res.fi && res.prefixlen < 4) + res.fi = NULL; } rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 65bc5f0fc..c186a8953 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1,7 +1,7 @@ /* * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem. * - * $Id: sysctl_net_ipv4.c,v 1.34 1998/04/11 09:38:26 freitag Exp $ + * $Id: sysctl_net_ipv4.c,v 1.36 1998/10/21 05:26:59 davem Exp $ * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS] @@ -43,8 +43,6 @@ extern int sysctl_ip_dynaddr; /* From ip_masq.c */ extern int sysctl_ip_masq_debug; -extern int sysctl_tcp_cong_avoidance; -extern int sysctl_tcp_hoe_retransmits; extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; @@ -93,9 +91,6 @@ int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp, ctl_table ipv4_table[] = { - {NET_IPV4_TCP_HOE_RETRANSMITS, "tcp_hoe_retransmits", - &sysctl_tcp_hoe_retransmits, sizeof(int), 0644, NULL, - &proc_dointvec}, {NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps", &sysctl_tcp_timestamps, sizeof(int), 0644, NULL, &proc_dointvec}, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 30a0b0dd6..b6f1c7a93 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.119 1998/08/26 12:04:14 davem Exp $ + * Version: $Id: tcp.c,v 1.132 1998/11/08 13:21:14 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -301,7 +301,7 @@ * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is * necessary for 10Mbps networks - and harder than BSD to spoof! - * With syncookies we doesn't) + * With syncookies we don't) * * Simultaneous Open Attempts (4.2.2.10) * MUST support simultaneous open attempts (does) @@ -541,17 +541,8 @@ static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait) /* * Compute minimal free write space needed to queue new packets. */ -static inline int tcp_min_write_space(struct sock *sk, struct tcp_opt *tp) -{ - int space; -#if 1 /* This needs benchmarking and real world tests */ - space = max(tp->mss_cache + 128, MIN_WRITE_SPACE); -#else /* 2.0 way */ - /* More than half of the socket queue free? */ - space = atomic_read(&sk->wmem_alloc) / 2; -#endif - return space; -} +#define tcp_min_write_space(__sk) \ + (atomic_read(&(__sk)->wmem_alloc) / 2) /* * Wait for a TCP event. @@ -598,9 +589,14 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) sk->urginline || !tp->urg_data)) mask |= POLLIN | POLLRDNORM; - /* Always wake the user up when an error occurred */ - if (sock_wspace(sk) >= tcp_min_write_space(sk, tp) || sk->err) - mask |= POLLOUT | POLLWRNORM; + if (!(sk->shutdown & SEND_SHUTDOWN)) { + if (sock_wspace(sk) >= tcp_min_write_space(sk)) { + mask |= POLLOUT | POLLWRNORM; + } else { /* send SIGIO later */ + sk->socket->flags |= SO_NOSPACE; + } + } + if (tp->urg_data & URG_VALID) mask |= POLLPRI; } @@ -618,7 +614,7 @@ void tcp_write_space(struct sock *sk) wake_up_interruptible(sk->sleep); if (sock_wspace(sk) >= - tcp_min_write_space(sk, &(sk->tp_pinfo.af_tcp))) + tcp_min_write_space(sk)) sock_wake_async(sk->socket, 2); } @@ -729,6 +725,9 @@ static void wait_for_tcp_memory(struct sock * sk) lock_sock(sk); } +/* When all user supplied data has been queued set the PSH bit */ +#define PSH_NEEDED (seglen == 0 && iovlen == 0) + /* * This routine copies from a user buffer into a socket, * and starts the transmit system. @@ -742,16 +741,16 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) int mss_now; int err = 0; int copied = 0; - - /* Verify that the socket is locked */ - if (!atomic_read(&sk->sock_readers)) - printk("tcp_do_sendmsg: socket not locked!\n"); + struct sk_buff *skb; /* Wait for a connection to finish. */ if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) if((err = wait_for_tcp_connect(sk, flags)) != 0) return err; + /* This should be in poll */ + sk->socket->flags &= ~SO_NOSPACE; /* clear SIGIO XXX */ + mss_now = tcp_current_mss(sk); /* Ok commence sending. */ @@ -763,10 +762,9 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) while(seglen > 0) { int copy, tmp, queue_it; - struct sk_buff *skb; if (err) - return -EFAULT; + goto do_fault2; /* Stop on errors. */ if (sk->err) @@ -810,12 +808,24 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) from, skb_put(skb, copy), copy, skb->csum, &err); } + /* + * FIXME: the *_user functions should + * return how much data was + * copied before the fault + * occured and then a partial + * packet with this data should + * be sent. Unfortunately + * csum_and_copy_from_user doesn't + * return this information. + * ATM it might send partly zeroed + * data in this case. + */ tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; from += copy; copied += copy; seglen -= copy; - if(!seglen && !iovlen) + if (PSH_NEEDED) TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; continue; } @@ -841,8 +851,8 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) if(copy > seglen) copy = seglen; + /* Determine how large of a buffer to allocate. */ tmp = MAX_HEADER + sk->prot->max_header; - queue_it = 0; if (copy < min(mss_now, tp->max_window >> 1) && !(flags & MSG_OOB)) { tmp += min(mss_now, tp->max_window); @@ -857,6 +867,7 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) queue_it = 1; } else { tmp += copy; + queue_it = 0; } skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL); @@ -884,7 +895,7 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) /* Prepare control bits for TCP header creation engine. */ TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | - ((!seglen && !iovlen) ? + (PSH_NEEDED ? TCPCB_FLAG_PSH : 0)); TCP_SKB_CB(skb)->sacked = 0; if (flags & MSG_OOB) { @@ -901,6 +912,9 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) skb->csum = csum_and_copy_from_user(from, skb_put(skb, copy), copy, 0, &err); + if (err) + goto do_fault; + from += copy; copied += copy; @@ -912,8 +926,6 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) } } sk->err = 0; - if (err) - return -EFAULT; return copied; do_sock_err: @@ -930,8 +942,14 @@ do_interrupted: if(copied) return copied; return err; +do_fault: + kfree_skb(skb); +do_fault2: + return -EFAULT; } +#undef PSH_NEEDED + /* * Send an ack if one is backlogged at this point. Ought to merge * this with tcp_send_ack(). @@ -1046,8 +1064,6 @@ static void cleanup_rbuf(struct sock *sk, int copied) tcp_eat_skb(sk, skb); } - SOCK_DEBUG(sk, "sk->rspace = %lu\n", sock_rspace(sk)); - /* We send an ACK if we can now advertise a non-zero window * which has been raised "significantly". */ @@ -1084,6 +1100,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, int err = 0; int target = 1; /* Read at least this many bytes */ + if (sk->err) + return sock_error(sk); + if (sk->state == TCP_LISTEN) return -ENOTCONN; @@ -1165,6 +1184,14 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, if (copied >= target) break; + /* + These three lines and clause if (sk->state == TCP_CLOSE) + are unlikely to be correct, if target > 1. + I DO NOT FIX IT, because I have no idea, what + POSIX prescribes to make here. Probably, it really + wants to lose data 8), if not all target is received. + --ANK + */ if (sk->err && !(flags&MSG_PEEK)) { copied = sock_error(sk); break; @@ -1417,7 +1444,7 @@ static void tcp_close_pending (struct sock *sk) tcp_synq_init(tp); } -void tcp_close(struct sock *sk, unsigned long timeout) +void tcp_close(struct sock *sk, long timeout) { struct sk_buff *skb; int data_was_unread = 0; @@ -1458,7 +1485,8 @@ void tcp_close(struct sock *sk, unsigned long timeout) * reader process may not have drained the data yet! */ while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) { - data_was_unread++; + u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin; + data_was_unread += len; kfree_skb(skb); } @@ -1486,7 +1514,6 @@ void tcp_close(struct sock *sk, unsigned long timeout) struct task_struct *tsk = current; struct wait_queue wait = { tsk, NULL }; - tsk->timeout = timeout; add_wait_queue(sk->sleep, &wait); release_sock(sk); @@ -1494,12 +1521,11 @@ void tcp_close(struct sock *sk, unsigned long timeout) tsk->state = TASK_INTERRUPTIBLE; if (!closing(sk)) break; - schedule(); - if (signal_pending(tsk) || !tsk->timeout) + timeout = schedule_timeout(timeout); + if (signal_pending(tsk) || !timeout) break; } - tsk->timeout=0; tsk->state = TASK_RUNNING; remove_wait_queue(sk->sleep, &wait); @@ -1511,8 +1537,8 @@ void tcp_close(struct sock *sk, unsigned long timeout) */ tcp_check_fin_timer(sk); - sk->dead = 1; release_sock(sk); + sk->dead = 1; } /* @@ -1583,12 +1609,17 @@ struct sock *tcp_accept(struct sock *sk, int flags) req->class->destructor(req); tcp_openreq_free(req); sk->ack_backlog--; + if(sk->keepopen) + tcp_inc_slow_timer(TCP_SLT_KEEPALIVE); /* * This does not pass any already set errors on the new socket * to the user, but they will be returned on the first socket operation * after the accept. - */ + * + * Once linux gets a multithreaded net_bh or equivalent there will be a race + * here - you'll have to check for sk->zapped as set by the ICMP handler then. + */ error = 0; out: @@ -1618,48 +1649,86 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, return -EFAULT; switch(optname) { - case TCP_MAXSEG: -/* values greater than interface MTU won't take effect. however at - * the point when this call is done we typically don't yet know - * which interface is going to be used - */ - if(val<1||val>MAX_WINDOW) - return -EINVAL; - tp->user_mss=val; - return 0; - case TCP_NODELAY: - sk->nonagle=(val==0)?0:1; - return 0; - default: - return(-ENOPROTOOPT); + case TCP_MAXSEG: + /* values greater than interface MTU won't take effect. however at + * the point when this call is done we typically don't yet know + * which interface is going to be used + */ + if(val < 1 || val > MAX_WINDOW) + return -EINVAL; + tp->user_mss = val; + return 0; + + case TCP_NODELAY: + /* You cannot try to use this and TCP_CORK in + * tandem, so let the user know. + */ + if (sk->nonagle == 2) + return -EINVAL; + sk->nonagle = (val == 0) ? 0 : 1; + return 0; + + case TCP_CORK: + /* When set indicates to always queue non-full frames. + * Later the user clears this option and we transmit + * any pending partial frames in the queue. This is + * meant to be used alongside sendfile() to get properly + * filled frames when the user (for example) must write + * out headers with a write() call first and then use + * sendfile to send out the data parts. + * + * You cannot try to use TCP_NODELAY and this mechanism + * at the same time, so let the user know. + */ + if (sk->nonagle == 1) + return -EINVAL; + if (val != 0) { + sk->nonagle = 2; + } else { + sk->nonagle = 0; + + if (tp->send_head) { + lock_sock(sk); + if (tp->send_head && + tcp_snd_test (sk, tp->send_head)) + tcp_write_xmit(sk); + release_sock(sk); + } + } + return 0; + + default: + return -ENOPROTOOPT; }; } -int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, +int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int val; - int len; + int val, len; if(level != SOL_TCP) return tp->af_specific->getsockopt(sk, level, optname, optval, optlen); - + if(get_user(len,optlen)) return -EFAULT; - - len = min(len,sizeof(int)); + + len = min(len, sizeof(int)); switch(optname) { - case TCP_MAXSEG: - val=tp->user_mss; - break; - case TCP_NODELAY: - val=sk->nonagle; - break; - default: - return(-ENOPROTOOPT); + case TCP_MAXSEG: + val = tp->user_mss; + break; + case TCP_NODELAY: + val = (sk->nonagle == 1); + break; + case TCP_CORK: + val = (sk->nonagle == 2); + break; + default: + return -ENOPROTOOPT; }; if(put_user(len, optlen)) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6a3ae17bf..59ae01f88 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.127 1998/08/26 12:04:20 davem Exp $ + * Version: $Id: tcp_input.c,v 1.141 1998/11/18 02:12:07 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -53,6 +53,8 @@ * Andi Kleen: Add tcp_measure_rcv_mss to make * connections with MSS<min(MTU,ann. MSS) * work without delayed acks. + * Andi Kleen: Process packets with PSH set in the + * fast path. */ #include <linux/config.h> @@ -75,9 +77,7 @@ extern int sysctl_tcp_fin_timeout; int sysctl_tcp_timestamps = 1; int sysctl_tcp_window_scaling = 1; int sysctl_tcp_sack = 1; -int sysctl_tcp_hoe_retransmits = 1; -int sysctl_tcp_cong_avoidance; int sysctl_tcp_syncookies = SYNC_INIT; int sysctl_tcp_stdurg; int sysctl_tcp_rfc1337; @@ -120,6 +120,18 @@ static void tcp_delack_estimator(struct tcp_opt *tp) } } +/* + * Remember to send an ACK later. + */ +static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th, + struct sk_buff *skb) +{ + tp->delayed_acks++; + /* Tiny-grams with PSH set make us ACK quickly. */ + if(th->psh && (skb->len < (tp->mss_cache >> 1))) + tp->ato = HZ/50; +} + /* Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge @@ -166,7 +178,7 @@ static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) static __inline__ void tcp_set_rto(struct tcp_opt *tp) { tp->rto = (tp->srtt >> 3) + tp->mdev; - tp->rto += (tp->rto >> 2) + (tp->rto >> ((tp->snd_cwnd>>TCP_CWND_SHIFT)-1)); + tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1)); } @@ -231,16 +243,13 @@ static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) { u32 end_window = tp->rcv_wup + tp->rcv_wnd; - if (tp->rcv_wnd) { - if (!before(seq, tp->rcv_nxt) && before(seq, end_window)) - return 1; - - if ((end_seq - seq) && after(end_seq, tp->rcv_nxt) && - !after(end_seq, end_window)) - return 1; - } - - return 0; + if (tp->rcv_wnd && + after(end_seq, tp->rcv_nxt) && + before(seq, end_window)) + return 1; + if (seq != end_window) + return 0; + return (seq == end_seq); } /* This functions checks to see if the tcp header is actually acceptable. */ @@ -253,7 +262,7 @@ extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) } /* When we get a reset we do this. */ -static void tcp_reset(struct sock *sk, struct sk_buff *skb) +static void tcp_reset(struct sock *sk) { sk->zapped = 1; @@ -268,7 +277,7 @@ static void tcp_reset(struct sock *sk, struct sk_buff *skb) default: sk->err = ECONNRESET; }; - tcp_set_state(sk,TCP_CLOSE); + tcp_set_state(sk, TCP_CLOSE); sk->shutdown = SHUTDOWN_MASK; if (!sk->dead) sk->state_change(sk); @@ -292,7 +301,7 @@ static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, /* The retransmission queue is always in order, so * we can short-circuit the walk early. */ - if(after(TCP_SKB_CB(skb)->end_seq, end_seq)) + if(!before(start_seq, TCP_SKB_CB(skb)->end_seq)) break; /* We play conservative, we don't allow SACKS to partially @@ -442,7 +451,7 @@ static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, static __inline__ void clear_fast_retransmit(struct tcp_opt *tp) { if (tp->dup_acks > 3) - tp->snd_cwnd = (tp->snd_ssthresh << TCP_CWND_SHIFT); + tp->snd_cwnd = (tp->snd_ssthresh); tp->dup_acks = 0; } @@ -471,36 +480,39 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) * to one half the current congestion window, but no less * than two segments. Retransmit the missing segment. */ - tp->dup_acks++; if (tp->high_seq == 0 || after(ack, tp->high_seq)) { + tp->dup_acks++; if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) { - tp->snd_ssthresh = max(tp->snd_cwnd >> (TCP_CWND_SHIFT + 1), 2); - tp->snd_cwnd = (tp->snd_ssthresh + 3) << TCP_CWND_SHIFT; + tp->snd_ssthresh = + max(min(tp->snd_wnd, tp->snd_cwnd) >> 1, 2); + tp->snd_cwnd = (tp->snd_ssthresh + 3); tp->high_seq = tp->snd_nxt; if(!tp->fackets_out) - tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); + tcp_retransmit_skb(sk, + skb_peek(&sk->write_queue)); else tcp_fack_retransmit(sk); tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); } - } - - /* 2. Each time another duplicate ACK arrives, increment - * cwnd by the segment size. [...] Transmit a packet... - * - * Packet transmission will be done on normal flow processing - * since we're not in "retransmit mode". We do not use duplicate - * ACKs to artificially inflate the congestion window when - * doing FACK. - */ - if (tp->dup_acks > 3) { + } else if (++tp->dup_acks > 3) { + /* 2. Each time another duplicate ACK arrives, increment + * cwnd by the segment size. [...] Transmit a packet... + * + * Packet transmission will be done on normal flow processing + * since we're not in "retransmit mode". We do not use + * duplicate ACKs to artificially inflate the congestion + * window when doing FACK. + */ if(!tp->fackets_out) { - tp->snd_cwnd += (1 << TCP_CWND_SHIFT); + tp->snd_cwnd++; } else { - /* Fill any further holes which may have appeared. - * We may want to change this to run every further - * multiple-of-3 dup ack increments, to be more robust - * against out-of-order packet delivery. -DaveM + /* Fill any further holes which may have + * appeared. + * + * We may want to change this to run every + * further multiple-of-3 dup ack increments, + * to be more robust against out-of-order + * packet delivery. -DaveM */ tcp_fack_retransmit(sk); } @@ -543,7 +555,8 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) * from snd_una is if this was a window update. */ if (ack != tp->snd_una && before(ack, tp->high_seq)) { - tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); + tcp_retransmit_skb(sk, + skb_peek(&sk->write_queue)); tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); } } else { @@ -558,23 +571,21 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. - * - * FIXME: What happens when the congestion window gets larger - * than the maximum receiver window by some large factor - * Suppose the pipeline never looses packets for a long - * period of time, then traffic increases causing packet loss. - * The congestion window should be reduced, but what it should - * be reduced to is not clear, since 1/2 the old window may - * still be larger than the maximum sending rate we ever achieved. */ -static void tcp_cong_avoid(struct tcp_opt *tp, u32 seq, u32 ack, u32 seq_rtt) +static __inline__ void tcp_cong_avoid(struct tcp_opt *tp) { - if ((tp->snd_cwnd>>TCP_CWND_SHIFT) <= tp->snd_ssthresh) { + if (tp->snd_cwnd <= tp->snd_ssthresh) { /* In "safe" area, increase. */ - tp->snd_cwnd += (1 << TCP_CWND_SHIFT); + tp->snd_cwnd++; } else { - /* In dangerous area, increase slowly. */ - tp->snd_cwnd += 1; + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd + */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + tp->snd_cwnd++; + tp->snd_cwnd_cnt=0; + } else + tp->snd_cwnd_cnt++; } } @@ -649,6 +660,33 @@ static void tcp_ack_probe(struct sock *sk, __u32 ack) } } +/* Should we open up the congestion window? */ +static __inline__ int should_advance_cwnd(struct tcp_opt *tp, int flag) +{ + /* Data must have been acked. */ + if ((flag & FLAG_DATA_ACKED) == 0) + return 0; + + /* Some of the data acked was retransmitted somehow? */ + if ((flag & FLAG_RETRANS_DATA_ACKED) != 0) { + /* We advance in all cases except during + * non-FACK fast retransmit/recovery. + */ + if (tp->fackets_out != 0 || + tp->retransmits != 0) + return 1; + + /* Non-FACK fast retransmit does it's own + * congestion window management, don't get + * in the way. + */ + return 0; + } + + /* New non-retransmitted data acked, always advance. */ + return 1; +} + /* Read draft-ietf-tcplw-high-performance before mucking * with this code. (Superceeds RFC1323) */ @@ -684,13 +722,15 @@ static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp, } } else { tcp_set_rto(tp); - tcp_cong_avoid(tp, seq, ack, seq_rtt); } + if (should_advance_cwnd(tp, flag)) + tcp_cong_avoid(tp); + /* NOTE: safe here so long as cong_ctl doesn't use rto */ tcp_bound_rto(tp); } -static void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) +static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) { struct sk_buff *skb = skb_peek(&sk->write_queue); long when = tp->rto - (jiffies - TCP_SKB_CB(skb)->when); @@ -803,9 +843,10 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, tcp_set_rto(tp); tcp_bound_rto(tp); } - tcp_cong_avoid(tp, seq, ack, seq_rtt); } } + if (should_advance_cwnd(tp, flag)) + tcp_cong_avoid(tp); } if (tp->packets_out) { @@ -1125,7 +1166,7 @@ coalesce: /* Zap SWALK, by moving every further SACK up by one slot. * Decrease num_sacks. */ - for(this_sack += 1; this_sack < num_sacks; this_sack++, swalk++) { + for(this_sack += 1; this_sack < num_sacks-1; this_sack++, swalk++) { struct tcp_sack_block *next = (swalk + 1); swalk->start_seq = next->start_seq; swalk->end_seq = next->end_seq; @@ -1150,6 +1191,10 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct tcp_sack_block *sp = &tp->selective_acks[0]; + int cur_sacks = tp->num_sacks; + + if (!cur_sacks) + goto new_sack; /* Optimize for the common case, new ofo frames arrive * "in order". ;-) This also satisfies the requirements @@ -1165,34 +1210,36 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb) sp->start_seq = TCP_SKB_CB(skb)->seq; tcp_sack_maybe_coalesce(tp, sp); } else { - int cur_sacks = tp->num_sacks; - int max_sacks = (tp->tstamp_ok ? 3 : 4); + struct tcp_sack_block *swap = sp + 1; + int this_sack, max_sacks = (tp->tstamp_ok ? 3 : 4); /* Oh well, we have to move things around. * Try to find a SACK we can tack this onto. */ - if(cur_sacks > 1) { - struct tcp_sack_block *swap = sp + 1; - int this_sack; - - for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) { - if((swap->end_seq == TCP_SKB_CB(skb)->seq) || - (swap->start_seq == TCP_SKB_CB(skb)->end_seq)) { - if(swap->end_seq == TCP_SKB_CB(skb)->seq) - swap->end_seq = TCP_SKB_CB(skb)->end_seq; - else - swap->start_seq = TCP_SKB_CB(skb)->seq; - tcp_sack_swap(sp, swap); - tcp_sack_maybe_coalesce(tp, sp); - return; - } + + for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) { + if((swap->end_seq == TCP_SKB_CB(skb)->seq) || + (swap->start_seq == TCP_SKB_CB(skb)->end_seq)) { + if(swap->end_seq == TCP_SKB_CB(skb)->seq) + swap->end_seq = TCP_SKB_CB(skb)->end_seq; + else + swap->start_seq = TCP_SKB_CB(skb)->seq; + tcp_sack_swap(sp, swap); + tcp_sack_maybe_coalesce(tp, sp); + return; } } /* Could not find an adjacent existing SACK, build a new one, * put it at the front, and shift everyone else down. We * always know there is at least one SACK present already here. + * + * If the sack array is full, forget about the last one. */ + if (cur_sacks >= max_sacks) { + cur_sacks--; + tp->num_sacks--; + } while(cur_sacks >= 1) { struct tcp_sack_block *this = &tp->selective_acks[cur_sacks]; struct tcp_sack_block *prev = (this - 1); @@ -1201,11 +1248,11 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, struct sk_buff *skb) cur_sacks--; } - /* Build head SACK, and we're done. */ + new_sack: + /* Build the new head SACK, and we're done. */ sp->start_seq = TCP_SKB_CB(skb)->seq; sp->end_seq = TCP_SKB_CB(skb)->end_seq; - if(tp->num_sacks < max_sacks) - tp->num_sacks++; + tp->num_sacks++; } } @@ -1310,16 +1357,14 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if(skb->h.th->fin) { tcp_fin(skb, sk, skb->h.th); } else { - tp->delayed_acks++; - - /* Tiny-grams with PSH set make us ACK quickly. */ - if(skb->h.th->psh && (skb->len < (tp->mss_cache >> 1))) - tp->ato = HZ/50; + tcp_remember_ack(tp, skb->h.th, skb); } /* This may have eaten into a SACK block. */ if(tp->sack_ok && tp->num_sacks) tcp_sack_remove_skb(tp, skb); tcp_ofo_queue(sk); + + /* Turn on fast path. */ if (skb_queue_len(&tp->out_of_order_queue) == 0) tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) | (0x10 << 16) | @@ -1450,23 +1495,28 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) return(1); } -static void tcp_data_snd_check(struct sock *sk) +static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - struct sk_buff *skb; - if ((skb = tp->send_head)) { - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) && - tcp_packets_in_flight(tp) < (tp->snd_cwnd >> TCP_CWND_SHIFT)) { - /* Put more data onto the wire. */ - tcp_write_xmit(sk); - } else if (tp->packets_out == 0 && !tp->pending) { - /* Start probing the receivers window. */ - tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); - } + if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) && + tcp_packets_in_flight(tp) < tp->snd_cwnd) { + /* Put more data onto the wire. */ + tcp_write_xmit(sk); + } else if (tp->packets_out == 0 && !tp->pending) { + /* Start probing the receivers window. */ + tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); } } +static __inline__ void tcp_data_snd_check(struct sock *sk) +{ + struct sk_buff *skb = sk->tp_pinfo.af_tcp.send_head; + + if (skb != NULL) + __tcp_data_snd_check(sk, skb); +} + /* * Adapt the MSS value used to make delayed ack decision to the * real world. @@ -1501,7 +1551,7 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk) * - delay time <= 0.5 HZ * - we don't have a window update to send * - must send at least every 2 full sized packets - * - must send an ACK if we have any SACKs + * - must send an ACK if we have any out of order data * * With an extra heuristic to handle loss of packet * situations and also helping the sender leave slow @@ -1514,8 +1564,8 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk) tcp_raise_window(sk) || /* We entered "quick ACK" mode or... */ tcp_in_quickack_mode(tp) || - /* We have pending SACKs */ - (tp->sack_ok && tp->num_sacks)) { + /* We have out of order data */ + (skb_peek(&tp->out_of_order_queue) != NULL)) { /* Then ack it now */ tcp_send_ack(sk); } else { @@ -1631,8 +1681,11 @@ static int prune_queue(struct sock *sk) return 0; } - /* Now continue with the receive queue if it wasn't enough */ - while ((skb = skb_peek_tail(&sk->receive_queue))) { + /* Now continue with the receive queue if it wasn't enough. + * But only do this if we are really being abused. + */ + while ((atomic_read(&sk->rmem_alloc) >= (sk->rcvbuf * 2)) && + (skb = skb_peek_tail(&sk->receive_queue))) { /* Never toss anything when we've seen the FIN. * It's just too complex to recover from it. */ @@ -1655,17 +1708,37 @@ static int prune_queue(struct sock *sk) TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, tp->copied_seq); kfree_skb(skb); - if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) - break; } return 0; } +/* + * TCP receive function for the ESTABLISHED state. + * + * It is split into a fast path and a slow path. The fast path is + * disabled when: + * - A zero window was announced from us - zero window probing + * is only handled properly in the slow path. + * - Out of order segments arrived. + * - Urgent data is expected. + * - There is no buffer space left + * - Unexpected TCP flags/window values/header lengths are received + * (detected by checking the TCP header against pred_flags) + * - Data is sent in both directions. Fast path only supports pure senders + * or pure receivers (this means either the sequence number or the ack + * value must stay constant) + * + * When these conditions are not satisfied it drops into a standard + * receive procedure patterned after RFC793 to handle all cases. + * The first three cases are guaranteed by proper pred_flags setting, + * the rest is checked inline. Fast processing is turned on in + * tcp_data_queue when everything is OK. + */ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - int queued = 0; + int queued; u32 flg; /* @@ -1700,21 +1773,18 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, } } - flg = *(((u32 *)th) + 3); - + flg = *(((u32 *)th) + 3) & ~htonl(0x8 << 16); + /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_predition is to be made * 'S' will always be tp->tcp_header_len >> 2 * '?' will be 0 else it will be !0 * (when there are holes in the receive * space for instance) - */ + * PSH flag is ignored. + */ if (flg == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { - if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { - tcp_send_ack(sk); - goto discard; - } if (len <= th->doff*4) { /* Bulk data transfer: sender */ if (len == th->doff*4) { @@ -1727,11 +1797,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_statistics.TcpInErrs++; goto discard; } - } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una) { + } else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una && + atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) { /* Bulk data transfer: receiver */ - if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) - goto discard; - __skb_pull(skb,th->doff*4); tcp_measure_rcv_mss(sk, skb); @@ -1748,16 +1816,17 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, sk->data_ready(sk, 0); tcp_delack_estimator(tp); - /* Tiny-grams with PSH set make us ACK quickly. */ - if(th->psh && (skb->len < (tp->mss_cache >> 1))) - tp->ato = HZ/50; + tcp_remember_ack(tp, th, skb); - tp->delayed_acks++; __tcp_ack_snd_check(sk); return 0; } } + /* + * Standard slow path. + */ + if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { /* RFC793, page 37: "In all states except SYN-SENT, all reset * (RST) segments are validated by checking their SEQ-fields." @@ -1779,12 +1848,12 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) { SOCK_DEBUG(sk, "syn in established state\n"); tcp_statistics.TcpInErrs++; - tcp_reset(sk, skb); + tcp_reset(sk); return 1; } if(th->rst) { - tcp_reset(sk,skb); + tcp_reset(sk); goto discard; } @@ -1831,7 +1900,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, } /* - * Process an incoming SYN or SYN-ACK. + * Process an incoming SYN or SYN-ACK for SYN_RECV sockets represented + * as an open_request. */ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, @@ -1896,7 +1966,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, } /* - * This function implements the receiving procedure of RFC 793. + * This function implements the receiving procedure of RFC 793 for + * all states except ESTABLISHED and TIME_WAIT. * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be * address independent. */ @@ -1907,8 +1978,27 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int queued = 0; - /* state == CLOSED, hash lookup always fails, so no worries. -DaveM */ switch (sk->state) { + case TCP_CLOSE: + /* When state == CLOSED, hash lookup always fails. + * + * But, there is a back door, the backlog queue. + * If we have a sequence of packets in the backlog + * during __release_sock() which have a sequence such + * that: + * packet X causes entry to TCP_CLOSE state + * ... + * packet X + N has FIN bit set + * + * We report a (luckily) harmless error in this case. + * The issue is that backlog queue processing bypasses + * any hash lookups (we know which socket packets are for). + * The correct behavior here is what 2.0.x did, since + * a TCP_CLOSE socket does not exist. Drop the frame + * and send a RST back to the other end. + */ + return 1; + case TCP_LISTEN: /* These use the socket TOS.. * might want to be the received TOS @@ -1961,7 +2051,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, } if(th->rst) { - tcp_reset(sk,skb); + tcp_reset(sk); goto discard; } @@ -2090,7 +2180,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* step 2: check RST bit */ if(th->rst) { - tcp_reset(sk,skb); + tcp_reset(sk); goto discard; } @@ -2113,7 +2203,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) { - tcp_reset(sk, skb); + tcp_reset(sk); return 1; } @@ -2193,7 +2283,7 @@ step6: */ if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) { if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { - tcp_reset(sk, skb); + tcp_reset(sk); return 1; } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bf3fb243b..f486852d1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.157 1998/08/28 00:27:47 davem Exp $ + * Version: $Id: tcp_ipv4.c,v 1.162 1998/11/07 11:50:26 davem Exp $ * * IPv4 specific functions * @@ -265,7 +265,7 @@ unsigned short tcp_good_socknum(void) struct tcp_bind_bucket *tb; int low = sysctl_local_port_range[0]; int high = sysctl_local_port_range[1]; - int remaining = high - low; + int remaining = high - low + 1; int rover; SOCKHASH_LOCK(); @@ -594,7 +594,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) } tmp = ip_route_connect(&rt, nexthop, sk->saddr, - RT_TOS(sk->ip_tos)|sk->localroute, sk->bound_dev_if); + RT_TOS(sk->ip_tos)|RTO_CONN|sk->localroute, sk->bound_dev_if); if (tmp < 0) return tmp; @@ -642,9 +642,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) /* Reset mss clamp */ tp->mss_clamp = ~0; - if ((sk->ip_pmtudisc == IP_PMTUDISC_DONT || - (sk->ip_pmtudisc == IP_PMTUDISC_WANT && - (rt->u.dst.mxlock&(1<<RTAX_MTU)))) && + if (!ip_dont_fragment(sk, &rt->u.dst) && rt->u.dst.pmtu > 576 && rt->rt_dst != rt->rt_gateway) { /* Clamp mss at maximum of 536 and user_mss. Probably, user ordered to override tiny segment size @@ -716,7 +714,11 @@ static struct open_request *tcp_v4_search_req(struct tcp_opt *tp, for (req = prev->dl_next; req; req = req->dl_next) { if (req->af.v4_req.rmt_addr == iph->saddr && req->af.v4_req.loc_addr == iph->daddr && - req->rmt_port == rport) { + req->rmt_port == rport +#ifdef CONFIG_IP_TRANSPARENT_PROXY + && req->lcl_port == th->dest +#endif + ) { *prevp = prev; return req; } @@ -776,6 +778,8 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip) * and for some paths there is no check at all. * A more general error queue to queue errors for later handling * is probably better. + * + * sk->err and sk->err_soft should be atomic_t. */ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) @@ -786,8 +790,8 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) int type = skb->h.icmph->type; int code = skb->h.icmph->code; struct sock *sk; - int opening; __u32 seq; + int err; if (len < (iph->ihl << 2) + ICMP_MIN_LENGTH) { icmp_statistics.IcmpInErrors++; @@ -804,43 +808,41 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) tp = &sk->tp_pinfo.af_tcp; seq = ntohl(th->seq); - if (sk->state != TCP_LISTEN && - !between(seq, tp->snd_una-16384, max(tp->snd_una+32768,tp->snd_nxt))) { - if (net_ratelimit()) - printk(KERN_WARNING - "icmp packet outside the tcp window:" - " state:%d seq:%u win:%u,%u\n", - (int)sk->state, seq, tp->snd_una, tp->snd_nxt); + if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) { + net_statistics.OutOfWindowIcmps++; return; } switch (type) { case ICMP_SOURCE_QUENCH: #ifndef OLD_SOURCE_QUENCH /* This is deprecated */ - tp->snd_ssthresh = max(tp->snd_cwnd >> (1 + TCP_CWND_SHIFT), 2); - tp->snd_cwnd = (tp->snd_ssthresh << TCP_CWND_SHIFT); + tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2); + tp->snd_cwnd = tp->snd_ssthresh; + tp->snd_cwnd_cnt = 0; tp->high_seq = tp->snd_nxt; #endif return; case ICMP_PARAMETERPROB: - sk->err=EPROTO; - sk->error_report(sk); /* This isn't serialized on SMP! */ + err = EPROTO; break; case ICMP_DEST_UNREACH: + if (code > NR_ICMP_UNREACH) + return; + if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ do_pmtu_discovery(sk, iph); - return; + return; } - break; - } - /* If we've already connected we will keep trying - * until we time out, or the user gives up. - */ - if (code > NR_ICMP_UNREACH) + err = icmp_err_convert[code].errno; + break; + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + default: return; - - opening = 0; + } + switch (sk->state) { struct open_request *req, *prev; case TCP_LISTEN: @@ -848,10 +850,10 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) * ICMP is unreliable. */ if (atomic_read(&sk->sock_readers)) { - /* XXX: add a counter here to profile this. - * If too many ICMPs get dropped on busy - * servers this needs to be solved differently. - */ + net_statistics.LockDroppedIcmps++; + /* If too many ICMPs get dropped on busy + * servers this needs to be solved differently. + */ return; } @@ -868,10 +870,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) if (!req) return; if (seq != req->snt_isn) { - if (net_ratelimit()) - printk(KERN_DEBUG "icmp packet for openreq " - "with wrong seq number:%d:%d\n", - seq, req->snt_isn); + net_statistics.OutOfWindowIcmps++; return; } if (req->sk) { @@ -898,25 +897,43 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len) } break; case TCP_SYN_SENT: - case TCP_SYN_RECV: + case TCP_SYN_RECV: /* Cannot happen */ if (!th->syn) - return; - opening = 1; - break; + return; + tcp_statistics.TcpAttemptFails++; + sk->err = err; + sk->zapped = 1; + mb(); + sk->error_report(sk); + return; } - - if(icmp_err_convert[code].fatal || opening) { + + /* If we've already connected we will keep trying + * until we time out, or the user gives up. + * + * rfc1122 4.2.3.9 allows to consider as hard errors + * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, + * but it is obsoleted by pmtu discovery). + * + * Note, that in modern internet, where routing is unreliable + * and in each dark corner broken firewalls sit, sending random + * errors ordered by their masters even this two messages finally lose + * their original sense (even Linux sends invalid PORT_UNREACHs) + * + * Now we are in compliance with RFCs. + * --ANK (980905) + */ + + if (sk->ip_recverr) { /* This code isn't serialized with the socket code */ - sk->err = icmp_err_convert[code].errno; - if (opening) { - tcp_statistics.TcpAttemptFails++; - if (sk->state != TCP_LISTEN) - tcp_set_state(sk,TCP_CLOSE); - mb(); - sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ - } + /* ANK (980927) ... which is harmless now, + sk->err's may be safely lost. + */ + sk->err = err; + mb(); + sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ } else { /* Only an error on timeout */ - sk->err_soft = icmp_err_convert[code].errno; + sk->err_soft = err; mb(); } } @@ -951,7 +968,16 @@ static void tcp_v4_send_reset(struct sk_buff *skb) /* Never send a reset in response to a reset. */ if (th->rst) - return; + return; + + if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL) { +#ifdef CONFIG_IP_TRANSPARENT_PROXY + if (((struct rtable*)skb->dst)->rt_type == RTN_UNICAST) + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_PORT_UNREACH, 0); +#endif + return; + } /* Swap the send and the receive. */ memset(&rth, 0, sizeof(struct tcphdr)); @@ -985,6 +1011,33 @@ static void tcp_v4_send_reset(struct sk_buff *skb) } #ifdef CONFIG_IP_TRANSPARENT_PROXY + +/* + Seems, I never wrote nothing more stupid. + I hope Gods will forgive me, but I cannot forgive myself 8) + --ANK (981001) + */ + +static struct sock *tcp_v4_search_proxy_openreq(struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4); + struct sock *sk; + int i; + + for (i=0; i<TCP_LHTABLE_SIZE; i++) { + for(sk = tcp_listening_hash[i]; sk; sk = sk->next) { + struct open_request *dummy; + if (tcp_v4_search_req(&sk->tp_pinfo.af_tcp, iph, + th, &dummy) && + (!sk->bound_dev_if || + sk->bound_dev_if == skb->dev->ifindex)) + return sk; + } + } + return NULL; +} + /* * Check whether a received TCP packet might be for one of our * connections. @@ -996,10 +1049,20 @@ int tcp_chkaddr(struct sk_buff *skb) struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4); struct sock *sk; - sk = tcp_v4_lookup(iph->saddr, th->source, iph->daddr, th->dest, skb->dev->ifindex); + sk = tcp_v4_lookup(iph->saddr, th->source, iph->daddr, + th->dest, skb->dev->ifindex); if (!sk) - return 0; + return tcp_v4_search_proxy_openreq(skb) != NULL; + + if (sk->state == TCP_LISTEN) { + struct open_request *dummy; + if (tcp_v4_search_req(&sk->tp_pinfo.af_tcp, skb->nh.iph, + th, &dummy) && + (!sk->bound_dev_if || + sk->bound_dev_if == skb->dev->ifindex)) + return 1; + } /* 0 means accept all LOCAL addresses here, not all the world... */ @@ -1285,13 +1348,14 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->last_ack_sent = req->rcv_isn + 1; newtp->backoff = 0; newtp->mdev = TCP_TIMEOUT_INIT; - newtp->snd_cwnd = (1 << TCP_CWND_SHIFT); + newtp->snd_cwnd = 1; newtp->rto = TCP_TIMEOUT_INIT; newtp->packets_out = 0; newtp->fackets_out = 0; newtp->retrans_out = 0; newtp->high_seq = 0; newtp->snd_ssthresh = 0x7fffffff; + newtp->snd_cwnd_cnt = 0; newtp->dup_acks = 0; newtp->delayed_acks = 0; init_timer(&newtp->retransmit_timer); @@ -1569,6 +1633,9 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) /* Count it even if it's bad */ tcp_statistics.TcpInSegs++; + if (len < sizeof(struct tcphdr)) + goto bad_packet; + /* Try to use the device checksum if provided. */ switch (skb->ip_summed) { case CHECKSUM_NONE: @@ -1583,6 +1650,7 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) ntohs(th->dest), len, skb->len, ntohs(skb->nh.iph->tot_len)); + bad_packet: tcp_statistics.TcpInErrs++; goto discard_it; } @@ -1595,10 +1663,15 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) sk = tcp_v4_proxy_lookup(th->dest, skb->nh.iph->saddr, th->source, skb->nh.iph->daddr, skb->dev, IPCB(skb)->redirport, skb->dev->ifindex); - else + else { +#endif + sk = __tcp_v4_lookup(th, skb->nh.iph->saddr, th->source, + skb->nh.iph->daddr, th->dest, skb->dev->ifindex); +#ifdef CONFIG_IP_TRANSPARENT_PROXY + if (!sk) + sk = tcp_v4_search_proxy_openreq(skb); + } #endif - sk = __tcp_v4_lookup(th, skb->nh.iph->saddr, th->source, - skb->nh.iph->daddr, th->dest, skb->dev->ifindex); if (!sk) goto no_tcp_socket; if(!ipsec_sk_policy(sk,skb)) @@ -1760,7 +1833,8 @@ static int tcp_v4_init_sock(struct sock *sk) /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. */ - tp->snd_cwnd = (1 << TCP_CWND_SHIFT); + tp->snd_cwnd = 1; + tp->snd_cwnd_cnt = 0; tp->snd_ssthresh = 0x7fffffff; /* Infinity */ sk->state = TCP_CLOSE; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 03696cbe0..25695f05d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_output.c,v 1.93 1998/08/26 12:04:32 davem Exp $ + * Version: $Id: tcp_output.c,v 1.97 1998/11/08 13:21:27 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -390,7 +390,7 @@ void tcp_write_xmit(struct sock *sk) * * Note, we don't "adjust" for TIMESTAMP or SACK option bytes. */ -u32 __tcp_select_window(struct sock *sk, u32 cur_win) +u32 __tcp_select_window(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; unsigned int mss = tp->mss_cache; @@ -414,6 +414,7 @@ u32 __tcp_select_window(struct sock *sk, u32 cur_win) if ((free_space < (sk->rcvbuf/4)) && (free_space < ((int) (mss/2)))) { window = 0; + tp->pred_flags = 0; } else { /* Get the largest window that is a nice multiple of mss. * Window clamp already applied above. @@ -616,7 +617,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) /* Stop retransmitting if we've hit the congestion * window limit. */ - if (tp->retrans_out >= (tp->snd_cwnd >> TCP_CWND_SHIFT)) + if (tp->retrans_out >= tp->snd_cwnd) break; } else { update_retrans_head(sk); @@ -646,7 +647,7 @@ void tcp_fack_retransmit(struct sock *sk) if(tcp_retransmit_skb(sk, skb)) break; - if(tcp_packets_in_flight(tp) >= (tp->snd_cwnd >> TCP_CWND_SHIFT)) + if(tcp_packets_in_flight(tp) >= tp->snd_cwnd) break; next_packet: packet_cnt++; @@ -728,9 +729,9 @@ void tcp_send_active_reset(struct sock *sk) struct sk_buff *skb; /* NOTE: No TCP options attached and we never retransmit this. */ - do { - skb = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_KERNEL); - } while(skb == NULL); + skb = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_KERNEL); + if (!skb) + return; /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_HEADER + sk->prot->max_header); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 94275718b..ea46d3268 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_timer.c,v 1.51 1998/05/02 15:19:26 davem Exp $ + * Version: $Id: tcp_timer.c,v 1.55 1998/11/07 11:55:42 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -183,8 +183,8 @@ void tcp_probe_timer(unsigned long data) return; if (atomic_read(&sk->sock_readers)) { - /* Try again in second. */ - tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ); + /* Try again later. */ + tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ/5); return; } @@ -229,6 +229,9 @@ static __inline__ int tcp_keepopen_proc(struct sock *sk) sk->err = ETIMEDOUT; tcp_set_state(sk, TCP_CLOSE); + sk->shutdown = SHUTDOWN_MASK; + if (!sk->dead) + sk->state_change(sk); } else { tp->probes_out++; tp->pending = TIME_KEEPOPEN; @@ -433,8 +436,8 @@ void tcp_retransmit_timer(unsigned long data) } if (atomic_read(&sk->sock_readers)) { - /* Try again in a second. */ - tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ); + /* Try again later */ + tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ/20); return; } lock_sock(sk); @@ -463,11 +466,16 @@ void tcp_retransmit_timer(unsigned long data) tp->fackets_out = 0; tp->retrans_out = 0; if (tp->retransmits == 0) { - /* remember window where we lost + /* Remember window where we lost: * "one half of the current window but at least 2 segments" + * + * Here "current window" means the effective one, which + * means it must be an accurate representation of our current + * sending rate _and_ the snd_wnd. */ - tp->snd_ssthresh = max(tp->snd_cwnd >> (1 + TCP_CWND_SHIFT), 2); - tp->snd_cwnd = (1 << TCP_CWND_SHIFT); + tp->snd_ssthresh = max(min(tp->snd_wnd, tp->snd_cwnd) >> 1, 2); + tp->snd_cwnd_cnt = 0; + tp->snd_cwnd = 1; } tp->retransmits++; diff --git a/net/ipv4/timer.c b/net/ipv4/timer.c index d5f6d3eb5..df3c9cce5 100644 --- a/net/ipv4/timer.c +++ b/net/ipv4/timer.c @@ -5,7 +5,7 @@ * * TIMER - implementation of software timers for IP. * - * Version: $Id: timer.c,v 1.11 1998/03/19 08:34:06 davem Exp $ + * Version: $Id: timer.c,v 1.14 1998/11/07 11:55:43 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -74,7 +74,8 @@ void net_timer (unsigned long data) /* Only process if socket is not in use. */ if (atomic_read(&sk->sock_readers)) { - sk->timer.expires = jiffies+HZ; + /* Try again later. */ + sk->timer.expires = jiffies+HZ/20; add_timer(&sk->timer); return; } @@ -111,11 +112,10 @@ void net_timer (unsigned long data) case TIME_CLOSE: /* We've waited long enough, close the socket. */ - sk->state = TCP_CLOSE; - net_delete_timer (sk); + tcp_set_state(sk, TCP_CLOSE); + sk->shutdown = SHUTDOWN_MASK; if (!sk->dead) sk->state_change(sk); - sk->shutdown = SHUTDOWN_MASK; net_reset_timer (sk, TIME_DONE, TCP_DONE_TIME); break; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index eab552c36..113b06ef8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -5,7 +5,7 @@ * * The User Datagram Protocol (UDP). * - * Version: $Id: udp.c,v 1.61 1998/08/29 17:11:10 freitag Exp $ + * Version: $Id: udp.c,v 1.64 1998/11/08 11:17:07 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -481,6 +481,9 @@ void udp_err(struct sk_buff *skb, unsigned char *dp, int len) int type = skb->h.icmph->type; int code = skb->h.icmph->code; struct sock *sk; + int harderr; + u32 info; + int err; if (len < (iph->ihl<<2)+sizeof(struct udphdr)) { icmp_statistics.IcmpInErrors++; @@ -493,35 +496,40 @@ void udp_err(struct sk_buff *skb, unsigned char *dp, int len) return; /* No socket for error */ } - if (sk->ip_recverr && !atomic_read(&sk->sock_readers)) { - struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); - if (skb2 && sock_queue_err_skb(sk, skb2)) - kfree_skb(skb2); - } - + err = 0; + info = 0; + harderr = 0; + switch (type) { + default: + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; case ICMP_SOURCE_QUENCH: return; case ICMP_PARAMETERPROB: - sk->err = EPROTO; - sk->error_report(sk); - return; + err = EPROTO; + info = ntohl(skb->h.icmph->un.gateway)>>24; + harderr = 1; + break; case ICMP_DEST_UNREACH: if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ if (sk->ip_pmtudisc != IP_PMTUDISC_DONT) { - /* - * There should be really a way to pass the - * discovered MTU value back to the user (the - * ICMP layer did all the work for us) - */ - sk->err = EMSGSIZE; - sk->error_report(sk); + err = EMSGSIZE; + info = ntohs(skb->h.icmph->un.frag.mtu); + harderr = 1; + break; } return; } + err = EHOSTUNREACH; + if (code <= NR_ICMP_UNREACH) { + harderr = icmp_err_convert[code].fatal; + err = icmp_err_convert[code].errno; + } break; } - + /* * Various people wanted BSD UDP semantics. Well they've come * back out because they slow down response to stuff like dead @@ -530,21 +538,25 @@ void udp_err(struct sk_buff *skb, unsigned char *dp, int len) * client code people. */ - /* RFC1122: OK. Passes ICMP errors back to application, as per */ - /* 4.1.3.3. */ - /* After the comment above, that should be no surprise. */ + /* + * RFC1122: OK. Passes ICMP errors back to application, as per + * 4.1.3.3. After the comment above, that should be no surprise. + */ - if (code < NR_ICMP_UNREACH && icmp_err_convert[code].fatal) - { - /* - * 4.x BSD compatibility item. Break RFC1122 to - * get BSD socket semantics. - */ - if(sk->bsdism && sk->state!=TCP_ESTABLISHED) - return; - sk->err = icmp_err_convert[code].errno; - sk->error_report(sk); - } + if (!harderr && !sk->ip_recverr) + return; + + /* + * 4.x BSD compatibility item. Break RFC1122 to + * get BSD socket semantics. + */ + if(sk->bsdism && sk->state!=TCP_ESTABLISHED) + return; + + if (sk->ip_recverr) + ip_icmp_error(sk, skb, err, uh->dest, info, (u8*)(uh+1)); + sk->err = err; + sk->error_report(sk); } @@ -853,24 +865,17 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len, /* * Check any passed addresses */ - if (addr_len) *addr_len=sizeof(*sin); - if (sk->ip_recverr && (skb = skb_dequeue(&sk->error_queue)) != NULL) { - err = sock_error(sk); - if (msg->msg_controllen != 0) { - put_cmsg(msg, SOL_IP, IP_RECVERR, skb->len, skb->data); - err = 0; - } - goto out_free; - } - + if (flags & MSG_ERRQUEUE) + return ip_recv_error(sk, msg, len); + /* * From here the generic datagram does a lot of the work. Come * the finished NET3, it will do _ALL_ the work! */ - + skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) goto out; @@ -1010,7 +1015,7 @@ int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) } -static void udp_close(struct sock *sk, unsigned long timeout) +static void udp_close(struct sock *sk, long timeout) { /* See for explanation: raw_close in ipv4/raw.c */ sk->state = TCP_CLOSE; @@ -1025,7 +1030,7 @@ static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) * Charge it to the socket, dropping if the queue is full. */ - if (__sock_queue_rcv_skb(sk,skb)<0) { + if (sock_queue_rcv_skb(sk,skb)<0) { udp_statistics.UdpInErrors++; ip_statistics.IpInDiscards++; ip_statistics.IpInDelivers--; @@ -1195,9 +1200,11 @@ csum_error: * RFC1122: OK. Discards the bad packet silently (as far as * the network is concerned, anyway) as per 4.1.3.4 (MUST). */ - NETDEBUG(printk(KERN_DEBUG "UDP: bad checksum. From %08lX:%d to %08lX:%d ulen %d\n", - ntohl(saddr),ntohs(uh->source), - ntohl(daddr),ntohs(uh->dest), + NETDEBUG(printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", + NIPQUAD(saddr), + ntohs(uh->source), + NIPQUAD(daddr), + ntohs(uh->dest), ulen)); udp_statistics.UdpInErrors++; kfree_skb(skb); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index a9ee64925..b40c35d00 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -7,7 +7,7 @@ * * Adapted from linux/net/ipv4/af_inet.c * - * $Id: af_inet6.c,v 1.37 1998/08/26 12:04:45 davem Exp $ + * $Id: af_inet6.c,v 1.39 1998/10/03 09:38:23 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -54,6 +54,17 @@ #include <asm/uaccess.h> #include <asm/system.h> +#ifdef MODULE +static int unloadable = 0; /* XX: Turn to one when all is ok within the + module for allowing unload */ +#endif + +#if defined(MODULE) && LINUX_VERSION_CODE > 0x20115 +MODULE_AUTHOR("Cast of dozens"); +MODULE_DESCRIPTION("IPv6 protocol stack for Linux"); +MODULE_PARM(unloadable, "i"); +#endif + extern struct proto_ops inet6_stream_ops; extern struct proto_ops inet6_dgram_ops; @@ -123,6 +134,7 @@ static int inet6_create(struct socket *sock, int protocol) sk->net_pinfo.af_inet6.hop_limit = -1; sk->net_pinfo.af_inet6.mcast_hops = -1; sk->net_pinfo.af_inet6.mc_loop = 1; + sk->net_pinfo.af_inet6.pmtudisc = IPV6_PMTUDISC_WANT; /* Init the ipv4 part of the socket since we can have sockets * using v6 API for ipv4. @@ -463,6 +475,7 @@ static struct proc_dir_entry proc_net_snmp6 = { #ifdef MODULE int ipv6_unload(void) { + if (!unloadable) return 1; /* We keep internally 3 raw sockets */ return __this_module.usecount - 3; } diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 51960bd26..cd8725ded 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: datagram.c,v 1.15 1998/08/26 12:04:47 davem Exp $ + * $Id: datagram.c,v 1.16 1998/10/03 09:38:25 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -29,6 +29,158 @@ #include <net/addrconf.h> #include <net/transp_v6.h> +#include <linux/errqueue.h> +#include <asm/uaccess.h> + +void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, + u16 port, u32 info, u8 *payload) +{ + struct icmp6hdr *icmph = (struct icmp6hdr *)skb->h.raw; + struct sock_exterr_skb *serr; + + if (!sk->net_pinfo.af_inet6.recverr) + return; + + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) + return; + + serr = SKB_EXT_ERR(skb); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_ICMP6; + serr->ee.ee_type = icmph->icmp6_type; + serr->ee.ee_code = icmph->icmp6_code; + serr->ee.ee_pad = 0; + serr->ee.ee_info = info; + serr->ee.ee_data = 0; + serr->addr_offset = (u8*)&(((struct ipv6hdr*)(icmph+1))->daddr) - skb->nh.raw; + serr->port = port; + + skb->h.raw = payload; + skb_pull(skb, payload - skb->data); + + if (sock_queue_err_skb(sk, skb)) + kfree_skb(skb); +} + +void ipv6_local_error(struct sock *sk, int err, struct flowi *fl, u32 info) +{ + struct sock_exterr_skb *serr; + struct ipv6hdr *iph; + struct sk_buff *skb; + + if (!sk->net_pinfo.af_inet6.recverr) + return; + + skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); + if (!skb) + return; + + iph = (struct ipv6hdr*)skb_put(skb, sizeof(struct ipv6hdr)); + skb->nh.ipv6h = iph; + memcpy(&iph->daddr, fl->fl6_dst, 16); + + serr = SKB_EXT_ERR(skb); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; + serr->ee.ee_type = 0; + serr->ee.ee_code = 0; + serr->ee.ee_pad = 0; + serr->ee.ee_info = info; + serr->ee.ee_data = 0; + serr->addr_offset = (u8*)&iph->daddr - skb->nh.raw; + serr->port = fl->uli_u.ports.dport; + + skb->h.raw = skb->tail; + skb_pull(skb, skb->tail - skb->data); + + if (sock_queue_err_skb(sk, skb)) + kfree_skb(skb); +} + +/* + * Handle MSG_ERRQUEUE + */ +int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len) +{ + struct sock_exterr_skb *serr; + struct sk_buff *skb, *skb2; + struct sockaddr_in6 *sin; + struct { + struct sock_extended_err ee; + struct sockaddr_in6 offender; + } errhdr; + int err; + int copied; + + err = -EAGAIN; + skb = skb_dequeue(&sk->error_queue); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = memcpy_toiovec(msg->msg_iov, skb->data, copied); + if (err) + goto out_free_skb; + + serr = SKB_EXT_ERR(skb); + + sin = (struct sockaddr_in6 *)msg->msg_name; + if (sin) { + sin->sin6_family = AF_INET6; + sin->sin6_port = serr->port; + if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6) + memcpy(&sin->sin6_addr, skb->nh.raw + serr->addr_offset, 16); + else + ipv6_addr_set(&sin->sin6_addr, 0, 0, + __constant_htonl(0xffff), + *(u32*)(skb->nh.raw + serr->addr_offset)); + } + + memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); + sin = &errhdr.offender; + sin->sin6_family = AF_UNSPEC; + if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL) { + sin->sin6_family = AF_INET6; + if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6) { + memcpy(&sin->sin6_addr, &skb->nh.ipv6h->saddr, 16); + if (sk->net_pinfo.af_inet6.rxopt.all) + datagram_recv_ctl(sk, msg, skb); + } else { + ipv6_addr_set(&sin->sin6_addr, 0, 0, + __constant_htonl(0xffff), + skb->nh.iph->saddr); + if (sk->ip_cmsg_flags) + ip_cmsg_recv(msg, skb); + } + } + + put_cmsg(msg, SOL_IPV6, IPV6_RECVERR, sizeof(errhdr), &errhdr); + + /* Now we could try to dump offended packet options */ + + msg->msg_flags |= MSG_ERRQUEUE; + err = copied; + + /* Reset and regenerate socket error */ + sk->err = 0; + if ((skb2 = skb_peek(&sk->error_queue)) != NULL) { + sk->err = SKB_EXT_ERR(skb2)->ee.ee_errno; + sk->error_report(sk); + } + +out_free_skb: + kfree_skb(skb); +out: + return err; +} + + + int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 89d58936d..8a4f85b6c 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -7,7 +7,7 @@ * Andi Kleen <ak@muc.de> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * - * $Id: exthdrs.c,v 1.7 1998/08/26 12:04:49 davem Exp $ + * $Id: exthdrs.c,v 1.8 1998/10/03 09:38:27 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -368,9 +368,7 @@ ipv6_invert_rthdr(struct sock *sk, struct ipv6_rt_hdr *hdr) what it does and calculates authentication data correctly. Certainly, it is possible only for udp and raw sockets, but not for tcp. - BTW I beg pardon, it is not good place for flames, but - I cannot be silent 8) It is very sad, but fools prevail 8) - AUTH header has 4byte granular length, what kills all the idea + AUTH header has 4byte granular length, which kills all the idea behind AUTOMATIC 64bit alignment of IPv6. Now we will loose cpu ticks, checking that sender did not something stupid and opt->hdrlen is even. Shit! --ANK (980730) @@ -383,6 +381,8 @@ static u8 *ipv6_auth_hdr(struct sk_buff **skb_ptr, u8 *nhptr) struct ipv6_opt_hdr *hdr = (struct ipv6_opt_hdr *)skb->h.raw; int len = (hdr->hdrlen+2)<<2; + if (len&7) + return NULL; opt->auth = (u8*)hdr - skb->nh.raw; if (skb->h.raw + len > skb->tail) return NULL; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index d43d1f98d..8f49443e6 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: icmp.c,v 1.19 1998/08/26 12:04:52 davem Exp $ + * $Id: icmp.c,v 1.20 1998/10/03 09:38:31 davem Exp $ * * Based on net/ipv4/icmp.c * @@ -334,7 +334,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, msg.daddr = &hdr->saddr; len = min((skb->tail - ((unsigned char *) hdr)) + sizeof(struct icmp6hdr), - IPV6_MIN_MTU - sizeof(struct icmp6hdr)); + IPV6_MIN_MTU - sizeof(struct ipv6hdr)); if (len < 0) { printk(KERN_DEBUG "icmp: len problem\n"); @@ -396,7 +396,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) } static void icmpv6_notify(struct sk_buff *skb, - int type, int code, unsigned char *buff, int len) + int type, int code, u32 info, unsigned char *buff, int len) { struct in6_addr *saddr = &skb->nh.ipv6h->saddr; struct in6_addr *daddr = &skb->nh.ipv6h->daddr; @@ -404,7 +404,6 @@ static void icmpv6_notify(struct sk_buff *skb, struct inet6_protocol *ipprot; struct sock *sk; u8 *pb; - __u32 info = 0; int hash; u8 nexthdr; @@ -436,11 +435,8 @@ static void icmpv6_notify(struct sk_buff *skb, if (ipprot->err_handler) ipprot->err_handler(skb, hdr, NULL, type, code, pb, info); - return; } - /* delivery to upper layer protocols failed. try raw sockets */ - sk = raw_v6_htable[hash]; if (sk == NULL) @@ -468,6 +464,9 @@ int icmpv6_rcv(struct sk_buff *skb, unsigned long len) icmpv6_statistics.Icmp6InMsgs++; + if (len < sizeof(struct icmp6hdr)) + goto discard_it; + /* Perform checksum. */ switch (skb->ip_summed) { case CHECKSUM_NONE: @@ -538,7 +537,7 @@ int icmpv6_rcv(struct sk_buff *skb, unsigned long len) case ICMPV6_DEST_UNREACH: case ICMPV6_TIME_EXCEED: case ICMPV6_PARAMPROB: - icmpv6_notify(skb, type, hdr->icmp6_code, + icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu, (char *) (hdr + 1), ulen); break; @@ -574,7 +573,7 @@ int icmpv6_rcv(struct sk_buff *skb, unsigned long len) * must pass to upper level */ - icmpv6_notify(skb, type, hdr->icmp6_code, + icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu, (char *) (hdr + 1), ulen); }; kfree_skb(skb); @@ -586,7 +585,7 @@ discard_it: return 0; } -__initfunc(int icmpv6_init(struct net_proto_family *ops)) +int __init icmpv6_init(struct net_proto_family *ops) { struct sock *sk; int err; @@ -632,7 +631,7 @@ static struct icmp6_err { } tab_unreach[] = { { ENETUNREACH, 0}, /* NOROUTE */ { EACCES, 1}, /* ADM_PROHIBITED */ - { 0, 0}, /* Was NOT_NEIGHBOUR, now reserved */ + { EHOSTUNREACH, 0}, /* Was NOT_NEIGHBOUR, now reserved */ { EHOSTUNREACH, 0}, /* ADDR_UNREACH */ { ECONNREFUSED, 1}, /* PORT_UNREACH */ }; @@ -641,10 +640,11 @@ int icmpv6_err_convert(int type, int code, int *err) { int fatal = 0; - *err = 0; + *err = EPROTO; switch (type) { case ICMPV6_DEST_UNREACH: + fatal = 1; if (code <= ICMPV6_PORT_UNREACH) { *err = tab_unreach[code].err; fatal = tab_unreach[code].fatal; @@ -659,6 +659,10 @@ int icmpv6_err_convert(int type, int code, int *err) *err = EPROTO; fatal = 1; break; + + case ICMPV6_TIME_EXCEED: + *err = EHOSTUNREACH; + break; }; return fatal; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 0555c1a24..a9dfa97ba 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: ip6_output.c,v 1.14 1998/08/26 12:05:01 davem Exp $ + * $Id: ip6_output.c,v 1.15 1998/10/03 09:38:34 davem Exp $ * * Based on linux/net/ipv4/ip_output.c * @@ -291,8 +291,10 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, frag_len = (mtu - unfrag_len) & ~0x7; /* Unfragmentable part exceeds mtu. */ - if (frag_len <= 0) + if (frag_len <= 0) { + ipv6_local_error(sk, EMSGSIZE, fl, mtu); return -EMSGSIZE; + } nfrags = last_len / frag_len; @@ -321,8 +323,10 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, all the exthdrs will fit to the first fragment. */ if (opt) { - if (frag_len < opt->opt_flen) + if (frag_len < opt->opt_flen) { + ipv6_local_error(sk, EMSGSIZE, fl, mtu); return -EMSGSIZE; + } data_off = frag_off - opt->opt_flen; } @@ -520,12 +524,21 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, } mtu = dst->pmtu; + if (np->frag_size < mtu) { + if (np->frag_size) + mtu = np->frag_size; + else if (np->pmtudisc == IPV6_PMTUDISC_DONT) + mtu = IPV6_MIN_MTU; + } /* Critical arithmetic overflow check. FIXME: may gcc optimize it out? --ANK (980726) */ - if (pktlength < length) - return -EMSGSIZE; + if (pktlength < length) { + ipv6_local_error(sk, EMSGSIZE, fl, mtu); + err = -EMSGSIZE; + goto out; + } if (pktlength <= mtu) { struct sk_buff *skb; @@ -573,8 +586,12 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, kfree_skb(skb); } } else { - if (sk->ip_hdrincl || jumbolen) - return -EMSGSIZE; + if (sk->ip_hdrincl || jumbolen || + np->pmtudisc == IPV6_PMTUDISC_DO) { + ipv6_local_error(sk, EMSGSIZE, fl, mtu); + err = -EMSGSIZE; + goto out; + } err = ip6_frag_xmit(sk, getfrag, data, dst, fl, opt, final_dst, hlimit, flags, length, mtu); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index a246b996b..4b8089d4a 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -7,7 +7,7 @@ * * Based on linux/net/ipv4/ip_sockglue.c * - * $Id: ipv6_sockglue.c,v 1.23 1998/08/26 12:05:04 davem Exp $ + * $Id: ipv6_sockglue.c,v 1.24 1998/10/03 09:38:37 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -275,33 +275,17 @@ done: break; case IPV6_MULTICAST_IF: - { - int oif = 0; - struct in6_addr addr; - - if (copy_from_user(&addr, optval, sizeof(struct in6_addr))) - return -EFAULT; - - if (!ipv6_addr_any(&addr)) { - struct inet6_ifaddr *ifp; - - ifp = ipv6_chk_addr(&addr, NULL, 0); - - if (ifp == NULL) { - retv = -EADDRNOTAVAIL; - break; - } - - oif = ifp->idev->dev->ifindex; - } - if (sk->bound_dev_if && sk->bound_dev_if != oif) { + if (sk->bound_dev_if && sk->bound_dev_if != val) { retv = -EINVAL; break; } - np->mcast_oif = oif; + if (dev_get_by_index(val) == NULL) { + retv = -ENODEV; + break; + } + np->mcast_oif = val; retv = 0; break; - } case IPV6_ADD_MEMBERSHIP: case IPV6_DROP_MEMBERSHIP: { @@ -319,6 +303,21 @@ done: case IPV6_ROUTER_ALERT: retv = ip6_ra_control(sk, val, NULL); break; + case IPV6_MTU_DISCOVER: + if (val<0 || val>2) + return -EINVAL; + np->pmtudisc = val; + return 0; + case IPV6_MTU: + if (val && val < IPV6_MIN_MTU) + return -EINVAL; + np->frag_size = val; + return 0; + case IPV6_RECVERR: + np->recverr = !!val; + if (!val) + skb_queue_purge(&sk->error_queue); + return 0; }; out: @@ -330,6 +329,7 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname, char *optval, { struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; int len; + int val; if(level==SOL_IP && sk->type != SOCK_RAW) return udp_prot.getsockopt(sk, level, optname, optval, optlen); @@ -364,9 +364,24 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname, char *optval, len = 0; return put_user(len, optlen); } + case IP_MTU: + val = 0; + lock_sock(sk); + if (sk->dst_cache) + val = sk->dst_cache->pmtu; + release_sock(sk); + if (!val) + return -ENOTCONN; + break; default: + return -EINVAL; } - return -EINVAL; + len=min(sizeof(int),len); + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval,&val,len)) + return -EFAULT; + return 0; } #if defined(MODULE) && defined(CONFIG_SYSCTL) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 76339ff58..3b02e06d9 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -7,7 +7,7 @@ * * Adapted from linux/net/ipv4/raw.c * - * $Id: raw.c,v 1.21 1998/08/26 12:05:13 davem Exp $ + * $Id: raw.c,v 1.23 1998/11/08 11:17:09 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -185,8 +185,31 @@ void rawv6_err(struct sock *sk, struct sk_buff *skb, struct ipv6hdr *hdr, struct inet6_skb_parm *opt, int type, int code, unsigned char *buff, u32 info) { - if (sk == NULL) + int err; + int harderr; + + if (buff > skb->tail) return; + + /* Report error on raw socket, if: + 1. User requested recverr. + 2. Socket is connected (otherwise the error indication + is useless without recverr and error is hard. + */ + if (!sk->net_pinfo.af_inet6.recverr && sk->state != TCP_ESTABLISHED) + return; + + harderr = icmpv6_err_convert(type, code, &err); + if (type == ICMPV6_PKT_TOOBIG) + harderr = (sk->net_pinfo.af_inet6.pmtudisc == IPV6_PMTUDISC_DO); + + if (sk->net_pinfo.af_inet6.recverr) + ipv6_icmp_error(sk, skb, err, 0, ntohl(info), buff); + + if (sk->net_pinfo.af_inet6.recverr || harderr) { + sk->err = err; + sk->error_report(sk); + } } static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) @@ -234,16 +257,16 @@ int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, if (flags & MSG_OOB) return -EOPNOTSUPP; - if (sk->shutdown & RCV_SHUTDOWN) - return(0); - if (addr_len) *addr_len=sizeof(*sin6); + if (flags & MSG_ERRQUEUE) + return ipv6_recv_error(sk, msg, len); + skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) goto out; - + copied = skb->tail - skb->h.raw; if (copied > len) { copied = len; @@ -574,7 +597,7 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname, } -static void rawv6_close(struct sock *sk, unsigned long timeout) +static void rawv6_close(struct sock *sk, long timeout) { /* See for explanation: raw_close in ipv4/raw.c */ sk->state = TCP_CLOSE; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8d1f59632..9ae8f63d7 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: route.c,v 1.33 1998/08/26 12:05:18 davem Exp $ + * $Id: route.c,v 1.34 1998/10/03 09:38:43 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -1025,6 +1025,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, */ if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) { nrt = rt6_cow(rt, daddr, saddr); + nrt->u.dst.pmtu = pmtu; nrt->rt6i_flags |= RTF_DYNAMIC; dst_release(&nrt->u.dst); } else { @@ -1035,6 +1036,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, nrt->rt6i_dst.plen = 128; nrt->rt6i_nexthop = neigh_clone(rt->rt6i_nexthop); nrt->rt6i_flags |= (RTF_DYNAMIC | RTF_CACHE); + nrt->u.dst.pmtu = pmtu; rt6_ins(nrt); } @@ -1063,10 +1065,10 @@ static struct rt6_info * ip6_rt_copy(struct rt6_info *ort) rt->u.dst.dev = ort->u.dst.dev; rt->u.dst.lastuse = jiffies; rt->rt6i_hoplimit = ort->rt6i_hoplimit; - rt->rt6i_expires = ort->rt6i_expires; + rt->rt6i_expires = 0; ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway); - rt->rt6i_flags = ort->rt6i_flags; + rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES; rt->rt6i_metric = ort->rt6i_metric; memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 0d6efd515..850553d9d 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -6,7 +6,7 @@ * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * - * $Id: sit.c,v 1.28 1998/08/26 12:05:22 davem Exp $ + * $Id: sit.c,v 1.29 1998/10/03 09:38:47 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -434,21 +434,21 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct device *dev) ip_rt_put(rt); goto tx_error; } - if (mtu >= IPV6_MIN_MTU) { - if (skb->dst && mtu < skb->dst->pmtu) { - struct rt6_info *rt6 = (struct rt6_info*)skb->dst; - if (mtu < rt6->u.dst.pmtu) { - if (tunnel->parms.iph.daddr || rt6->rt6i_dst.plen == 128) { - rt6->rt6i_flags |= RTF_MODIFIED; - rt6->u.dst.pmtu = mtu; - } + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + if (skb->dst && mtu < skb->dst->pmtu) { + struct rt6_info *rt6 = (struct rt6_info*)skb->dst; + if (mtu < rt6->u.dst.pmtu) { + if (tunnel->parms.iph.daddr || rt6->rt6i_dst.plen == 128) { + rt6->rt6i_flags |= RTF_MODIFIED; + rt6->u.dst.pmtu = mtu; } } - if (skb->len > mtu) { - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); - ip_rt_put(rt); - goto tx_error; - } + } + if (skb->len > mtu) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); + ip_rt_put(rt); + goto tx_error; } if (tunnel->err_count > 0) { @@ -554,6 +554,10 @@ ipip6_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd) case SIOCADDTUNNEL: case SIOCCHGTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + err = -EFAULT; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) goto done; @@ -580,6 +584,10 @@ ipip6_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd) break; case SIOCDELTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + if (dev == &ipip6_fb_tunnel_dev) { err = -EFAULT; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index c997999db..a95698db5 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: tcp_ipv6.c,v 1.89 1998/08/28 00:27:54 davem Exp $ + * $Id: tcp_ipv6.c,v 1.94 1998/11/07 11:50:33 davem Exp $ * * Based on: * linux/net/ipv4/tcp.c @@ -53,6 +53,7 @@ static void tcp_v6_xmit(struct sk_buff *skb); static struct open_request *tcp_v6_search_req(struct tcp_opt *tp, struct ipv6hdr *ip6h, struct tcphdr *th, + int iif, struct open_request **prevp); static struct tcp_func ipv6_mapped; @@ -363,6 +364,12 @@ static int tcp_v6_unique_address(struct sock *sk) return retval; } +static __inline__ int tcp_v6_iif(struct sk_buff *skb) +{ + struct inet6_skb_parm *opt = (struct inet6_skb_parm *) skb->cb; + return opt->iif; +} + static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -580,7 +587,6 @@ void tcp_v6_err(struct sk_buff *skb, struct ipv6hdr *hdr, struct ipv6_pinfo *np; struct sock *sk; int err; - int opening; struct tcp_opt *tp; __u32 seq; @@ -597,18 +603,18 @@ void tcp_v6_err(struct sk_buff *skb, struct ipv6hdr *hdr, tp = &sk->tp_pinfo.af_tcp; seq = ntohl(th->seq); if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) { - if (net_ratelimit()) - printk(KERN_DEBUG "icmp packet outside the tcp window:" - " s:%d %u,%u,%u\n", - (int)sk->state, seq, tp->snd_una, tp->snd_nxt); + net_statistics.OutOfWindowIcmps++; return; } np = &sk->net_pinfo.af_inet6; - if (type == ICMPV6_PKT_TOOBIG && sk->state != TCP_LISTEN) { + if (type == ICMPV6_PKT_TOOBIG) { struct dst_entry *dst = NULL; /* icmp should have updated the destination cache entry */ + if (sk->state == TCP_LISTEN) + return; + if (sk->dst_cache) dst = dst_check(&sk->dst_cache, np->dst_cookie); @@ -632,7 +638,7 @@ void tcp_v6_err(struct sk_buff *skb, struct ipv6hdr *hdr, dst = dst_clone(dst); if (dst->error) { - sk->err_soft = dst->error; + sk->err_soft = -dst->error; } else if (tp->pmtu_cookie > dst->pmtu && !atomic_read(&sk->sock_readers)) { lock_sock(sk); @@ -644,26 +650,29 @@ void tcp_v6_err(struct sk_buff *skb, struct ipv6hdr *hdr, return; } - opening = 0; + icmpv6_err_convert(type, code, &err); + /* Might be for an open_request */ switch (sk->state) { struct open_request *req, *prev; struct ipv6hdr hd; case TCP_LISTEN: - if (atomic_read(&sk->sock_readers)) - return; + if (atomic_read(&sk->sock_readers)) { + net_statistics.LockDroppedIcmps++; + /* If too many ICMPs get dropped on busy + * servers this needs to be solved differently. + */ + return; + } /* Grrrr - fix this later. */ ipv6_addr_copy(&hd.saddr, saddr); ipv6_addr_copy(&hd.daddr, daddr); - req = tcp_v6_search_req(tp, &hd,th, &prev); + req = tcp_v6_search_req(tp, &hd, th, tcp_v6_iif(skb), &prev); if (!req) return; if (seq != req->snt_isn) { - if (net_ratelimit()) - printk(KERN_DEBUG "icmp packet for openreq " - "with wrong seq number:%d:%d\n", - seq, req->snt_isn); + net_statistics.OutOfWindowIcmps++; return; } if (req->sk) { @@ -676,21 +685,26 @@ void tcp_v6_err(struct sk_buff *skb, struct ipv6hdr *hdr, } /* FALL THROUGH */ case TCP_SYN_SENT: - case TCP_SYN_RECV: - opening = 1; - break; + case TCP_SYN_RECV: /* Cannot happen */ + tcp_statistics.TcpAttemptFails++; + sk->err = err; + sk->zapped = 1; + mb(); + sk->error_report(sk); + return; } - if (icmpv6_err_convert(type, code, &err) || opening) { + if (np->recverr) { + /* This code isn't serialized with the socket code */ + /* ANK (980927) ... which is harmless now, + sk->err's may be safely lost. + */ sk->err = err; - - if (opening) { - tcp_statistics.TcpAttemptFails++; - tcp_set_state(sk,TCP_CLOSE); - sk->error_report(sk); - } + mb(); + sk->error_report(sk); } else { sk->err_soft = err; + mb(); } } @@ -853,7 +867,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, __u32 isn) /* So that link locals have meaning */ if (!sk->bound_dev_if && ipv6_addr_type(&req->af.v6_req.rmt_addr)&IPV6_ADDR_LINKLOCAL) - req->af.v6_req.iif = skb->dev->ifindex; + req->af.v6_req.iif = tcp_v6_iif(skb); req->class = &or_ipv6; req->retrans = 0; @@ -1035,6 +1049,9 @@ static void tcp_v6_send_reset(struct sk_buff *skb) if (th->rst) return; + if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) + return; + /* * We need to grab some memory, and put together an RST, * and then put it into the queue to be sent. @@ -1076,7 +1093,7 @@ static void tcp_v6_send_reset(struct sk_buff *skb) buff->csum); fl.proto = IPPROTO_TCP; - fl.oif = skb->dev->ifindex; + fl.oif = tcp_v6_iif(skb); fl.uli_u.ports.dport = t1->dest; fl.uli_u.ports.sport = t1->source; @@ -1096,6 +1113,7 @@ static void tcp_v6_send_reset(struct sk_buff *skb) static struct open_request *tcp_v6_search_req(struct tcp_opt *tp, struct ipv6hdr *ip6h, struct tcphdr *th, + int iif, struct open_request **prevp) { struct open_request *req, *prev; @@ -1109,9 +1127,10 @@ static struct open_request *tcp_v6_search_req(struct tcp_opt *tp, for (req = prev->dl_next; req; req = req->dl_next) { if (!ipv6_addr_cmp(&req->af.v6_req.rmt_addr, &ip6h->saddr) && !ipv6_addr_cmp(&req->af.v6_req.loc_addr, &ip6h->daddr) && - req->rmt_port == rport) { - *prevp = prev; - return req; + req->rmt_port == rport && + (!req->af.v6_req.iif || req->af.v6_req.iif == iif)) { + *prevp = prev; + return req; } prev = req; } @@ -1123,7 +1142,7 @@ static void tcp_v6_rst_req(struct sock *sk, struct sk_buff *skb) struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; struct open_request *req, *prev; - req = tcp_v6_search_req(tp,skb->nh.ipv6h,skb->h.th,&prev); + req = tcp_v6_search_req(tp,skb->nh.ipv6h,skb->h.th,tcp_v6_iif(skb),&prev); if (!req) return; /* Sequence number check required by RFC793 */ @@ -1156,7 +1175,7 @@ static inline struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) struct open_request *req, *dummy; struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); - req = tcp_v6_search_req(tp, skb->nh.ipv6h,th, &dummy); + req = tcp_v6_search_req(tp, skb->nh.ipv6h, th, tcp_v6_iif(skb), &dummy); if (req) { sk = tcp_check_req(sk, skb, req); } @@ -1292,7 +1311,6 @@ int tcp_v6_rcv(struct sk_buff *skb, unsigned long len) { struct tcphdr *th; struct sock *sk; - struct device *dev = skb->dev; struct in6_addr *saddr = &skb->nh.ipv6h->saddr; struct in6_addr *daddr = &skb->nh.ipv6h->daddr; @@ -1313,6 +1331,9 @@ int tcp_v6_rcv(struct sk_buff *skb, unsigned long len) tcp_statistics.TcpInSegs++; + if (len < sizeof(struct tcphdr)) + goto bad_packet; + /* * Try to use the device checksum if provided. */ @@ -1323,6 +1344,7 @@ int tcp_v6_rcv(struct sk_buff *skb, unsigned long len) case CHECKSUM_HW: if (tcp_v6_check(th,len,saddr,daddr,skb->csum)) { printk(KERN_DEBUG "tcp csum failed\n"); + bad_packet: tcp_statistics.TcpInErrs++; goto discard_it; } @@ -1330,7 +1352,7 @@ int tcp_v6_rcv(struct sk_buff *skb, unsigned long len) /* CHECKSUM_UNNECESSARY */ }; - sk = __tcp_v6_lookup(th, saddr, th->source, daddr, th->dest, dev->ifindex); + sk = __tcp_v6_lookup(th, saddr, th->source, daddr, th->dest, tcp_v6_iif(skb)); if (!sk) goto no_tcp_socket; @@ -1412,7 +1434,7 @@ static struct sock * tcp_v6_get_sock(struct sk_buff *skb, struct tcphdr *th) saddr = &skb->nh.ipv6h->saddr; daddr = &skb->nh.ipv6h->daddr; - return tcp_v6_lookup(saddr, th->source, daddr, th->dest, skb->dev->ifindex); + return tcp_v6_lookup(saddr, th->source, daddr, th->dest, tcp_v6_iif(skb)); } static void tcp_v6_xmit(struct sk_buff *skb) @@ -1441,7 +1463,7 @@ static void tcp_v6_xmit(struct sk_buff *skb) dst = ip6_route_output(sk, &fl); if (dst->error) { - sk->err_soft = dst->error; + sk->err_soft = -dst->error; dst_release(dst); return; } @@ -1518,7 +1540,8 @@ static int tcp_v6_init_sock(struct sock *sk) /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. */ - tp->snd_cwnd = (1 << TCP_CWND_SHIFT); + tp->snd_cwnd = 1; + tp->snd_cwnd_cnt = 0; tp->snd_ssthresh = 0x7fffffff; sk->state = TCP_CLOSE; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index bfa701c97..0670e8758 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -7,7 +7,7 @@ * * Based on linux/ipv4/udp.c * - * $Id: udp.c,v 1.33 1998/08/27 16:55:20 davem Exp $ + * $Id: udp.c,v 1.37 1998/11/08 11:17:10 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -308,7 +308,7 @@ ipv4_connected: return(0); } -static void udpv6_close(struct sock *sk, unsigned long timeout) +static void udpv6_close(struct sock *sk, long timeout) { /* See for explanation: raw_close in ipv4/raw.c */ sk->state = TCP_CLOSE; @@ -317,43 +317,10 @@ static void udpv6_close(struct sock *sk, unsigned long timeout) destroy_sock(sk); } -#ifdef CONFIG_FILTER +#if defined(CONFIG_FILTER) || !defined(HAVE_CSUM_COPY_USER) #undef CONFIG_UDP_DELAY_CSUM #endif -#ifdef CONFIG_UDP_DELAY_CSUM - -/* Please, read comments in net/checksum.h, asm/checksum.h - - I commented out csum_partial_copy_to_user there because it did not - verify_area. Now I am even wondered, how clever was I that time 8)8) - If I did not it, I would step into this hole again. --ANK - */ - -#ifndef _HAVE_ARCH_COPY_AND_CSUM_TO_USER -#if defined(__i386__) -static __inline__ -unsigned int csum_and_copy_to_user (const char *src, char *dst, - int len, int sum, int *err_ptr) -{ - int *src_err_ptr=NULL; - - if (verify_area(VERIFY_WRITE, dst, len) == 0) - return csum_partial_copy_generic(src, dst, len, sum, src_err_ptr, err_ptr); - - if (len) - *err_ptr = -EFAULT; - - return sum; -} -#elif defined(__sparc__) -#define csum_and_copy_to_user csum_partial_copy_to_user -#else -#undef CONFIG_UDP_DELAY_CSUM -#endif -#endif -#endif - /* * This should be easy, if there is something there we * return it, otherwise we block. @@ -365,32 +332,22 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, struct sk_buff *skb; int copied, err; - /* - * Check any passed addresses - */ - - if (addr_len) + if (addr_len) *addr_len=sizeof(struct sockaddr_in6); - /* - * From here the generic datagram does a lot of the work. Come - * the finished NET3, it will do _ALL_ the work! - */ + if (flags & MSG_ERRQUEUE) + return ipv6_recv_error(sk, msg, len); skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) goto out; - + copied = skb->len - sizeof(struct udphdr); if (copied > len) { copied = len; msg->msg_flags |= MSG_TRUNC; } - /* - * FIXME : should use udp header size info value - */ - #ifndef CONFIG_UDP_DELAY_CSUM err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied); @@ -428,7 +385,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, #endif if (err) goto out_free; - + sk->stamp=skb->stamp; /* Copy the address. */ @@ -478,26 +435,25 @@ void udpv6_err(struct sk_buff *skb, struct ipv6hdr *hdr, sk = udp_v6_lookup(daddr, uh->dest, saddr, uh->source, dev->ifindex); - if (sk == NULL) { - if (net_ratelimit()) - printk(KERN_DEBUG "icmp for unknown sock\n"); + if (sk == NULL) return; - } - if (icmpv6_err_convert(type, code, &err)) { - if(sk->bsdism && sk->state!=TCP_ESTABLISHED) - return; - - sk->err = err; - sk->error_report(sk); - } else { - sk->err_soft = err; - } + if (!icmpv6_err_convert(type, code, &err) && + !sk->net_pinfo.af_inet6.recverr) + return; + + if (sk->bsdism && sk->state!=TCP_ESTABLISHED) + return; + + if (sk->net_pinfo.af_inet6.recverr) + ipv6_icmp_error(sk, skb, err, uh->dest, ntohl(info), (u8 *)(uh+1)); + + sk->err = err; + sk->error_report(sk); } static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) { - if (sock_queue_rcv_skb(sk,skb)<0) { udp_stats_in6.UdpInErrors++; ipv6_statistics.Ip6InDiscards++; @@ -801,6 +757,11 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen) udh.uh.dest = sin6->sin6_port; daddr = &sin6->sin6_addr; + + /* Otherwise it will be difficult to maintain sk->dst_cache. */ + if (sk->state == TCP_ESTABLISHED && + !ipv6_addr_cmp(daddr, &sk->net_pinfo.af_inet6.daddr)) + daddr = &sk->net_pinfo.af_inet6.daddr; } else { if (sk->state != TCP_ESTABLISHED) return(-ENOTCONN); @@ -818,6 +779,7 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen) sin.sin_addr.s_addr = daddr->s6_addr32[3]; sin.sin_port = udh.uh.dest; msg->msg_name = (struct sockaddr *)(&sin); + msg->msg_namelen = sizeof(sin); return udp_sendmsg(sk, msg, ulen); } @@ -839,7 +801,7 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen) udh.daddr = daddr; udh.uh.source = sk->sport; - udh.uh.len = len < 0x1000 ? htons(len) : 0; + udh.uh.len = len < 0x10000 ? htons(len) : 0; udh.uh.check = 0; udh.iov = msg->msg_iov; udh.wcheck = 0; @@ -905,7 +867,7 @@ struct proto udpv6_prot = { 0 /* highestinuse */ }; -__initfunc(void udpv6_init(void)) +void __init udpv6_init(void) { inet6_add_protocol(&udpv6_protocol); } diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 0db8e06ef..5990b69a3 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -2324,6 +2324,7 @@ extern struct datalink_proto *make_8023_client(void); extern void destroy_EII_client(struct datalink_proto *); extern void destroy_8023_client(struct datalink_proto *); +#ifdef CONFIG_PROC_FS struct proc_dir_entry ipx_procinfo = { PROC_NET_IPX, 3, "ipx", S_IFREG | S_IRUGO, 1, 0, 0, 0, &proc_net_inode_operations, ipx_get_info @@ -2338,6 +2339,7 @@ struct proc_dir_entry ipx_rt_procinfo = { PROC_NET_IPX_ROUTE, 9, "ipx_route", S_IFREG | S_IRUGO, 1, 0, 0, 0, &proc_net_inode_operations, ipx_rt_get_info }; +#endif static unsigned char ipx_8022_type = 0xE0; static unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 }; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index de104813e..a281c966b 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -427,7 +427,7 @@ void netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, continue; if (failure) { - sk->err = -ENOBUFS; + sk->err = ENOBUFS; sk->state_change(sk); continue; } @@ -442,12 +442,12 @@ void netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, } } if (skb2 == NULL) { - sk->err = -ENOBUFS; + sk->err = ENOBUFS; sk->state_change(sk); /* Clone failed. Notify ALL listeners. */ failure = 1; } else if (netlink_broadcast_deliver(sk, skb2)) { - sk->err = -ENOBUFS; + sk->err = ENOBUFS; sk->state_change(sk); } else skb2 = NULL; @@ -551,10 +551,6 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, int len, if (flags&(MSG_OOB|MSG_PEEK)) return -EOPNOTSUPP; - err = -sock_error(sk); - if (err) - return err; - skb = skb_recv_datagram(sk,flags,noblock,&err); if (skb==NULL) return err; diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 66b49db8a..7813f3072 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -238,7 +238,7 @@ static struct sock *nr_find_socket(unsigned char index, unsigned char id) /* * Find a connected NET/ROM socket given their circuit IDs. */ -static struct sock *nr_find_peer(unsigned char index, unsigned char id) +static struct sock *nr_find_peer(unsigned char index, unsigned char id, ax25_address *dest) { struct sock *s; unsigned long flags; @@ -247,7 +247,7 @@ static struct sock *nr_find_peer(unsigned char index, unsigned char id) cli(); for (s = nr_list; s != NULL; s = s->next) { - if (s->protinfo.nr->your_index == index && s->protinfo.nr->your_id == id) { + if (s->protinfo.nr->your_index == index && s->protinfo.nr->your_id == id && ax25cmp(&s->protinfo.nr->dest_addr, dest) == 0) { restore_flags(flags); return s; } @@ -575,14 +575,15 @@ static int nr_release(struct socket *sock, struct socket *peer) sk->state_change(sk); sk->dead = 1; sk->destroy = 1; + sk->socket = NULL; break; default: + sk->socket = NULL; break; } sock->sk = NULL; - sk->socket = NULL; /* Not used, but we should do this */ return 0; } @@ -597,7 +598,11 @@ static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (sk->zapped == 0) return -EINVAL; - if (addr_len != sizeof(struct sockaddr_ax25) && addr_len != sizeof(struct full_sockaddr_ax25)) + if (addr_len < sizeof(struct sockaddr_ax25) || addr_len > sizeof(struct +full_sockaddr_ax25)) + return -EINVAL; + + if (addr_len < (addr->fsa_ax25.sax25_ndigis * sizeof(ax25_address) + sizeof(struct sockaddr_ax25))) return -EINVAL; if (addr->fsa_ax25.sax25_family != AF_NETROM) @@ -863,10 +868,10 @@ int nr_rx_frame(struct sk_buff *skb, struct device *dev) if (circuit_index == 0 && circuit_id == 0) { if (frametype == NR_CONNACK && flags == NR_CHOKE_FLAG) - sk = nr_find_peer(peer_circuit_index, peer_circuit_id); + sk = nr_find_peer(peer_circuit_index, peer_circuit_id, src); } else { if (frametype == NR_CONNREQ) - sk = nr_find_peer(circuit_index, circuit_id); + sk = nr_find_peer(circuit_index, circuit_id, src); else sk = nr_find_socket(circuit_index, circuit_id); } diff --git a/net/netrom/nr_loopback.c b/net/netrom/nr_loopback.c index ba9644cbe..007cb8738 100644 --- a/net/netrom/nr_loopback.c +++ b/net/netrom/nr_loopback.c @@ -77,16 +77,16 @@ static void nr_loopback_timer(unsigned long param) ax25_address *nr_dest; struct device *dev; - while ((skb = skb_dequeue(&loopback_queue)) != NULL) { + if ((skb = skb_dequeue(&loopback_queue)) != NULL) { nr_dest = (ax25_address *)(skb->data + 7); - if ((dev = nr_dev_get(nr_dest)) == NULL) { - kfree_skb(skb); - continue; - } + dev = nr_dev_get(nr_dest); - if (nr_rx_frame(skb, dev) == 0) + if (dev == NULL || nr_rx_frame(skb, dev) == 0) kfree_skb(skb); + + if (!skb_queue_empty(&loopback_queue) && !nr_loopback_running()) + nr_set_loopback_timer(); } } diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index 26f5ac8dd..d46e45eb6 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -81,6 +81,22 @@ static int nr_add_node(ax25_address *nr, const char *mnemonic, ax25_address *ax2 if (ax25cmp(ax25, &nr_neigh->callsign) == 0 && nr_neigh->dev == dev) break; + /* + * The L2 link to a neighbour has failed in the past + * and now a frame comes from this neighbour. We assume + * it was a temporary trouble with the link and reset the + * routes now (and not wait for a node broadcast). + */ + if (nr_neigh != NULL && nr_neigh->failed != 0 && quality == 0) { + struct nr_node *node; + + for (node = nr_node_list; node != NULL; node = node->next) + for (i = 0; i < node->count; i++) + if (node->routes[i].neighbour == nr_neigh) + if (i < node->which) + node->which = i; + } + if (nr_neigh != NULL) nr_neigh->failed = 0; diff --git a/net/netsyms.c b/net/netsyms.c index f987d9425..d3e96333d 100644 --- a/net/netsyms.c +++ b/net/netsyms.c @@ -11,7 +11,13 @@ #include <linux/types.h> #include <linux/net.h> #include <linux/in.h> +#include <net/sock.h> +#include <net/dst.h> +#include <net/checksum.h> +#include <net/pkt_sched.h> #include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/fddidevice.h> #include <linux/trdevice.h> #include <linux/ioport.h> #include <net/neighbour.h> @@ -23,8 +29,6 @@ #ifdef CONFIG_INET #include <linux/ip.h> -#include <linux/etherdevice.h> -#include <linux/fddidevice.h> #include <net/protocol.h> #include <net/arp.h> #include <net/ip.h> @@ -34,15 +38,23 @@ #include <net/route.h> #include <net/scm.h> #include <net/inet_common.h> -#include <net/pkt_sched.h> #include <linux/inet.h> #include <linux/mroute.h> #include <linux/igmp.h> extern struct net_proto_family inet_family_ops; +#ifdef CONFIG_DLCI_MODULE +extern int (*dlci_ioctl_hook)(unsigned int, void *); +EXPORT_SYMBOL(dlci_ioctl_hook); +#endif + +#endif + #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) #include <linux/in6.h> +#include <linux/icmpv6.h> +#include <net/ipv6.h> #include <net/ndisc.h> #include <net/dst.h> #include <net/transp_v6.h> @@ -50,7 +62,6 @@ extern struct net_proto_family inet_family_ops; extern int tcp_tw_death_row_slot; #endif -#endif #include <linux/rtnetlink.h> @@ -60,7 +71,8 @@ extern int tcp_tw_death_row_slot; defined(CONFIG_EL2) || defined(CONFIG_NE2000) || \ defined(CONFIG_E2100) || defined(CONFIG_HPLAN_PLUS) || \ defined(CONFIG_HPLAN) || defined(CONFIG_AC3200) || \ - defined(CONFIG_ES3210) + defined(CONFIG_ES3210) || defined(CONFIG_ULTRA32) || \ + defined(CONFIG_LNE390) || defined(CONFIG_NE3210) #include "../drivers/net/8390.h" #endif @@ -93,6 +105,7 @@ EXPORT_SYMBOL(sock_unregister); /* Socket layer support routines */ EXPORT_SYMBOL(memcpy_fromiovec); +EXPORT_SYMBOL(memcpy_tokerneliovec); EXPORT_SYMBOL(sock_create); EXPORT_SYMBOL(sock_alloc); EXPORT_SYMBOL(sock_release); @@ -209,9 +222,6 @@ EXPORT_SYMBOL(ip_route_output); EXPORT_SYMBOL(icmp_send); EXPORT_SYMBOL(ip_options_compile); EXPORT_SYMBOL(arp_send); -#ifdef CONFIG_SHAPER_MODULE -EXPORT_SYMBOL(arp_broken_ops); -#endif EXPORT_SYMBOL(ip_id_count); EXPORT_SYMBOL(ip_send_check); EXPORT_SYMBOL(ip_fragment); @@ -223,10 +233,17 @@ EXPORT_SYMBOL(__ip_finish_output); EXPORT_SYMBOL(inet_dgram_ops); EXPORT_SYMBOL(ip_cmsg_recv); EXPORT_SYMBOL(__release_sock); +EXPORT_SYMBOL(arp_find); +EXPORT_SYMBOL(ip_rcv); +EXPORT_SYMBOL(arp_rcv); /* needed for ip_gre -cw */ EXPORT_SYMBOL(ip_statistics); +#ifdef CONFIG_IPV6 +EXPORT_SYMBOL(ipv6_addr_type); +EXPORT_SYMBOL(icmpv6_send); +#endif #ifdef CONFIG_IPV6_MODULE /* inet functions common to v4 and v6 */ EXPORT_SYMBOL(inet_stream_ops); @@ -305,6 +322,7 @@ EXPORT_SYMBOL(tcp_transmit_skb); EXPORT_SYMBOL(tcp_connect); EXPORT_SYMBOL(tcp_make_synack); EXPORT_SYMBOL(tcp_tw_death_row_slot); +EXPORT_SYMBOL(tcp_sync_mss); EXPORT_SYMBOL(net_statistics); EXPORT_SYMBOL(xrlim_allow); @@ -359,7 +377,8 @@ EXPORT_SYMBOL(sock_rmalloc); defined(CONFIG_EL2) || defined(CONFIG_NE2000) || \ defined(CONFIG_E2100) || defined(CONFIG_HPLAN_PLUS) || \ defined(CONFIG_HPLAN) || defined(CONFIG_AC3200) || \ - defined(CONFIG_ES3210) + defined(CONFIG_ES3210) || defined(CONFIG_ULTRA32) || \ + defined(CONFIG_LNE390) || defined(CONFIG_NE3210) /* If 8390 NIC support is built in, we will need these. */ EXPORT_SYMBOL(ei_open); EXPORT_SYMBOL(ei_close); @@ -426,12 +445,9 @@ EXPORT_SYMBOL(netdev_fc_xoff); EXPORT_SYMBOL(dev_base); EXPORT_SYMBOL(dev_close); EXPORT_SYMBOL(dev_mc_add); -EXPORT_SYMBOL(arp_find); EXPORT_SYMBOL(n_tty_ioctl); EXPORT_SYMBOL(tty_register_ldisc); EXPORT_SYMBOL(kill_fasync); -EXPORT_SYMBOL(ip_rcv); -EXPORT_SYMBOL(arp_rcv); EXPORT_SYMBOL(dev_mc_delete); EXPORT_SYMBOL(if_port_text); @@ -441,10 +457,6 @@ EXPORT_SYMBOL(if_port_text); EXPORT_SYMBOL(ltalk_setup); #endif -#ifdef CONFIG_DLCI_MODULE -extern int (*dlci_ioctl_hook)(unsigned int, void *); -EXPORT_SYMBOL(dlci_ioctl_hook); -#endif /* Packet scheduler modules want these. */ EXPORT_SYMBOL(qdisc_destroy); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 1e5a509d4..c7e7a6733 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -5,10 +5,7 @@ * * PACKET - implements raw packet sockets. * - * Doesn't belong in IP but it's currently too hooked into ip - * to separate. - * - * Version: @(#)packet.c 1.0.6 05/25/93 + * Version: $Id: af_packet.c,v 1.18 1998/10/03 15:55:24 freitag Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -1245,7 +1242,7 @@ void cleanup_module(void) int init_module(void) #else -__initfunc(void packet_proto_init(struct net_proto *pro)) +void __init packet_proto_init(struct net_proto *pro) #endif { sock_register(&packet_family_ops); diff --git a/net/protocols.c b/net/protocols.c index 2e2362b82..6ec830cca 100644 --- a/net/protocols.c +++ b/net/protocols.c @@ -22,6 +22,10 @@ extern void inet6_proto_init(struct net_proto *pro); #endif #endif /* INET */ +#ifdef CONFIG_ECONET +extern void econet_proto_init(struct net_proto *pro); +#endif + #ifdef CONFIG_NETLINK extern void netlink_proto_init(struct net_proto *pro); #endif @@ -153,5 +157,9 @@ struct net_proto protocols[] = { { "X.25", x25_proto_init }, /* CCITT X.25 Packet Layer */ #endif +#ifdef CONFIG_ECONET + { "Econet", econet_proto_init }, /* Acorn Econet */ +#endif + { NULL, NULL } /* End marker */ }; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 8a681b8fb..1c27a4724 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -57,7 +57,7 @@ #include <linux/if_arp.h> #include <linux/init.h> -int rose_ndevs = 6; +int rose_ndevs = 10; int sysctl_rose_restart_request_timeout = ROSE_DEFAULT_T0; int sysctl_rose_call_request_timeout = ROSE_DEFAULT_T1; @@ -524,7 +524,7 @@ static int rose_listen(struct socket *sock, int backlog) sk->protinfo.rose->dest_ndigis = 0; memset(&sk->protinfo.rose->dest_addr, '\0', ROSE_ADDR_LEN); memset(&sk->protinfo.rose->dest_call, '\0', AX25_ADDR_LEN); - memset(&sk->protinfo.rose->dest_digi, '\0', AX25_ADDR_LEN); + memset(sk->protinfo.rose->dest_digis, '\0', AX25_ADDR_LEN*ROSE_MAX_DIGIS); sk->max_ack_backlog = backlog; sk->state = TCP_LISTEN; return 0; @@ -549,6 +549,10 @@ static int rose_create(struct socket *sock, int protocol) sock_init_data(sock, sk); skb_queue_head_init(&rose->ack_queue); +#ifdef M_BIT + skb_queue_head_init(&rose->frag_queue); + rose->fraglen = 0; +#endif sock->ops = &rose_proto_ops; sk->protocol = protocol; @@ -583,6 +587,10 @@ static struct sock *rose_make_new(struct sock *osk) sock_init_data(NULL, sk); skb_queue_head_init(&rose->ack_queue); +#ifdef M_BIT + skb_queue_head_init(&rose->frag_queue); + rose->fraglen = 0; +#endif sk->type = osk->type; sk->socket = osk->socket; @@ -662,16 +670,23 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr; struct device *dev; ax25_address *user, *source; + int n; if (sk->zapped == 0) return -EINVAL; - if (addr_len != sizeof(struct sockaddr_rose)) + if (addr_len != sizeof(struct sockaddr_rose) && addr_len != sizeof(struct full_sockaddr_rose)) return -EINVAL; if (addr->srose_family != AF_ROSE) return -EINVAL; + if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1) + return -EINVAL; + + if (addr->srose_ndigis > ROSE_MAX_DIGIS) + return -EINVAL; + if ((dev = rose_dev_get(&addr->srose_addr)) == NULL) { SOCK_DEBUG(sk, "ROSE: bind failed: invalid address\n"); return -EADDRNOTAVAIL; @@ -685,13 +700,19 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) user = source; } - sk->protinfo.rose->source_addr = addr->srose_addr; - sk->protinfo.rose->source_call = *user; - sk->protinfo.rose->device = dev; + sk->protinfo.rose->source_addr = addr->srose_addr; + sk->protinfo.rose->source_call = *user; + sk->protinfo.rose->device = dev; + sk->protinfo.rose->source_ndigis = addr->srose_ndigis; - if (addr->srose_ndigis == 1) { - sk->protinfo.rose->source_ndigis = 1; - sk->protinfo.rose->source_digi = addr->srose_digi; + if (addr_len == sizeof(struct full_sockaddr_rose)) { + struct full_sockaddr_rose *full_addr = (struct full_sockaddr_rose *)uaddr; + for (n = 0 ; n < addr->srose_ndigis ; n++) + sk->protinfo.rose->source_digis[n] = full_addr->srose_digis[n]; + } else { + if (sk->protinfo.rose->source_ndigis == 1) { + sk->protinfo.rose->source_digis[0] = addr->srose_digi; + } } rose_insert_socket(sk); @@ -708,6 +729,7 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le unsigned char cause, diagnostic; ax25_address *user; struct device *dev; + int n; if (sk->state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { sock->state = SS_CONNECTED; @@ -725,12 +747,22 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le sk->state = TCP_CLOSE; sock->state = SS_UNCONNECTED; - if (addr_len != sizeof(struct sockaddr_rose)) + if (addr_len != sizeof(struct sockaddr_rose) && addr_len != sizeof(struct full_sockaddr_rose)) return -EINVAL; if (addr->srose_family != AF_ROSE) return -EINVAL; + if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1) + return -EINVAL; + + if (addr->srose_ndigis > ROSE_MAX_DIGIS) + return -EINVAL; + + /* Source + Destination digis should not exceed ROSE_MAX_DIGIS */ + if ((sk->protinfo.rose->source_ndigis + addr->srose_ndigis) > ROSE_MAX_DIGIS) + return -EINVAL; + if ((sk->protinfo.rose->neighbour = rose_get_neigh(&addr->srose_addr, &cause, &diagnostic)) == NULL) return -ENETUNREACH; @@ -753,13 +785,19 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le rose_insert_socket(sk); /* Finish the bind */ } - sk->protinfo.rose->dest_addr = addr->srose_addr; - sk->protinfo.rose->dest_call = addr->srose_call; - sk->protinfo.rose->rand = ((int)sk->protinfo.rose & 0xFFFF) + sk->protinfo.rose->lci; + sk->protinfo.rose->dest_addr = addr->srose_addr; + sk->protinfo.rose->dest_call = addr->srose_call; + sk->protinfo.rose->rand = ((int)sk->protinfo.rose & 0xFFFF) + sk->protinfo.rose->lci; + sk->protinfo.rose->dest_ndigis = addr->srose_ndigis; - if (addr->srose_ndigis == 1) { - sk->protinfo.rose->dest_ndigis = 1; - sk->protinfo.rose->dest_digi = addr->srose_digi; + if (addr_len == sizeof(struct full_sockaddr_rose)) { + struct full_sockaddr_rose *full_addr = (struct full_sockaddr_rose *)uaddr; + for (n = 0 ; n < addr->srose_ndigis ; n++) + sk->protinfo.rose->dest_digis[n] = full_addr->srose_digis[n]; + } else { + if (sk->protinfo.rose->dest_ndigis == 1) { + sk->protinfo.rose->dest_digis[0] = addr->srose_digi; + } } /* Move to connecting socket, start sending Connect Requests */ @@ -863,6 +901,7 @@ static int rose_getname(struct socket *sock, struct sockaddr *uaddr, { struct sockaddr_rose *srose = (struct sockaddr_rose *)uaddr; struct sock *sk = sock->sk; + int n; if (peer != 0) { if (sk->state != TCP_ESTABLISHED) @@ -871,21 +910,37 @@ static int rose_getname(struct socket *sock, struct sockaddr *uaddr, srose->srose_ndigis = 0; srose->srose_addr = sk->protinfo.rose->dest_addr; srose->srose_call = sk->protinfo.rose->dest_call; - if (sk->protinfo.rose->dest_ndigis == 1) { - srose->srose_ndigis = 1; - srose->srose_digi = sk->protinfo.rose->dest_digi; + srose->srose_ndigis = sk->protinfo.rose->dest_ndigis; + if (*uaddr_len >= sizeof(struct full_sockaddr_rose)) { + struct full_sockaddr_rose *full_srose = (struct full_sockaddr_rose *)uaddr; + for (n = 0 ; n < sk->protinfo.rose->dest_ndigis ; n++) + full_srose->srose_digis[n] = sk->protinfo.rose->dest_digis[n]; + *uaddr_len = sizeof(struct full_sockaddr_rose); + } else { + if (sk->protinfo.rose->dest_ndigis >= 1) { + srose->srose_ndigis = 1; + srose->srose_digi = sk->protinfo.rose->dest_digis[0]; + } + *uaddr_len = sizeof(struct sockaddr_rose); } - *uaddr_len = sizeof(struct sockaddr_rose); } else { srose->srose_family = AF_ROSE; srose->srose_ndigis = 0; srose->srose_addr = sk->protinfo.rose->source_addr; srose->srose_call = sk->protinfo.rose->source_call; - if (sk->protinfo.rose->source_ndigis == 1) { - srose->srose_ndigis = 1; - srose->srose_digi = sk->protinfo.rose->source_digi; + srose->srose_ndigis = sk->protinfo.rose->source_ndigis; + if (*uaddr_len >= sizeof(struct full_sockaddr_rose)) { + struct full_sockaddr_rose *full_srose = (struct full_sockaddr_rose *)uaddr; + for (n = 0 ; n < sk->protinfo.rose->source_ndigis ; n++) + full_srose->srose_digis[n] = sk->protinfo.rose->source_digis[n]; + *uaddr_len = sizeof(struct full_sockaddr_rose); + } else { + if (sk->protinfo.rose->source_ndigis >= 1) { + srose->srose_ndigis = 1; + srose->srose_digi = sk->protinfo.rose->source_digis[sk->protinfo.rose->source_ndigis-1]; + } + *uaddr_len = sizeof(struct sockaddr_rose); } - *uaddr_len = sizeof(struct sockaddr_rose); } return 0; @@ -895,14 +950,19 @@ int rose_rx_call_request(struct sk_buff *skb, struct device *dev, struct rose_ne { struct sock *sk; struct sock *make; - struct rose_facilities facilities; + struct rose_facilities_struct facilities; + int n, len; skb->sk = NULL; /* Initially we don't know who it's for */ /* * skb->data points to the rose frame start */ - if (!rose_parse_facilities(skb, &facilities)) { + memset(&facilities, 0x00, sizeof(struct rose_facilities_struct)); + + len = (((skb->data[3] >> 4) & 0x0F) + 1) / 2; + len += (((skb->data[3] >> 0) & 0x0F) + 1) / 2; + if (!rose_parse_facilities(skb->data + len + 4, &facilities)) { rose_transmit_clear_request(neigh, lci, ROSE_INVALID_FACILITY, 76); return 0; } @@ -924,13 +984,16 @@ int rose_rx_call_request(struct sk_buff *skb, struct device *dev, struct rose_ne make->protinfo.rose->dest_addr = facilities.dest_addr; make->protinfo.rose->dest_call = facilities.dest_call; make->protinfo.rose->dest_ndigis = facilities.dest_ndigis; - make->protinfo.rose->dest_digi = facilities.dest_digi; + for (n = 0 ; n < facilities.dest_ndigis ; n++) + make->protinfo.rose->dest_digis[n] = facilities.dest_digis[n]; make->protinfo.rose->source_addr = facilities.source_addr; make->protinfo.rose->source_call = facilities.source_call; make->protinfo.rose->source_ndigis = facilities.source_ndigis; - make->protinfo.rose->source_digi = facilities.source_digi; + for (n = 0 ; n < facilities.source_ndigis ; n++) + make->protinfo.rose->source_digis[n]= facilities.source_digis[n]; make->protinfo.rose->neighbour = neigh; make->protinfo.rose->device = dev; + make->protinfo.rose->facilities = facilities; make->protinfo.rose->neighbour->use++; @@ -968,10 +1031,10 @@ static int rose_sendmsg(struct socket *sock, struct msghdr *msg, int len, struct sock *sk = sock->sk; struct sockaddr_rose *usrose = (struct sockaddr_rose *)msg->msg_name; int err; - struct sockaddr_rose srose; + struct full_sockaddr_rose srose; struct sk_buff *skb; unsigned char *asmptr; - int size, qbit = 0; + int n, size, qbit = 0; if (msg->msg_flags & ~MSG_DONTWAIT) return -EINVAL; @@ -988,15 +1051,19 @@ static int rose_sendmsg(struct socket *sock, struct msghdr *msg, int len, return -ENETUNREACH; if (usrose != NULL) { - if (msg->msg_namelen < sizeof(srose)) + if (msg->msg_namelen != sizeof(struct sockaddr_rose) && msg->msg_namelen != sizeof(struct full_sockaddr_rose)) return -EINVAL; - srose = *usrose; + memset(&srose, 0, sizeof(struct full_sockaddr_rose)); + memcpy(&srose, usrose, msg->msg_namelen); if (rosecmp(&sk->protinfo.rose->dest_addr, &srose.srose_addr) != 0 || ax25cmp(&sk->protinfo.rose->dest_call, &srose.srose_call) != 0) return -EISCONN; - if (srose.srose_ndigis == 1 && sk->protinfo.rose->dest_ndigis == 1) { - if (ax25cmp(&sk->protinfo.rose->dest_digi, &srose.srose_digi) != 0) - return -EISCONN; + if (srose.srose_ndigis != sk->protinfo.rose->dest_ndigis) + return -EISCONN; + if (srose.srose_ndigis == sk->protinfo.rose->dest_ndigis) { + for (n = 0 ; n < srose.srose_ndigis ; n++) + if (ax25cmp(&sk->protinfo.rose->dest_digis[n], &srose.srose_digis[n]) != 0) + return -EISCONN; } if (srose.srose_family != AF_ROSE) return -EINVAL; @@ -1007,12 +1074,9 @@ static int rose_sendmsg(struct socket *sock, struct msghdr *msg, int len, srose.srose_family = AF_ROSE; srose.srose_addr = sk->protinfo.rose->dest_addr; srose.srose_call = sk->protinfo.rose->dest_call; - srose.srose_ndigis = 0; - - if (sk->protinfo.rose->dest_ndigis == 1) { - srose.srose_ndigis = 1; - srose.srose_digi = sk->protinfo.rose->dest_digi; - } + srose.srose_ndigis = sk->protinfo.rose->dest_ndigis; + for (n = 0 ; n < sk->protinfo.rose->dest_ndigis ; n++) + srose.srose_digis[n] = sk->protinfo.rose->dest_digis[n]; } SOCK_DEBUG(sk, "ROSE: sendto: Addresses built.\n"); @@ -1068,7 +1132,54 @@ static int rose_sendmsg(struct socket *sock, struct msghdr *msg, int len, return -ENOTCONN; } +#ifdef M_BIT +#define ROSE_PACLEN (256-ROSE_MIN_LEN) + if (skb->len - ROSE_MIN_LEN > ROSE_PACLEN) { + unsigned char header[ROSE_MIN_LEN]; + struct sk_buff *skbn; + int frontlen; + int lg; + + /* Save a copy of the Header */ + memcpy(header, skb->data, ROSE_MIN_LEN); + skb_pull(skb, ROSE_MIN_LEN); + + frontlen = skb_headroom(skb); + + while (skb->len > 0) { + if ((skbn = sock_alloc_send_skb(sk, frontlen + ROSE_PACLEN, 0, 0, &err)) == NULL) + return err; + + skbn->sk = sk; + skbn->free = 1; + skbn->arp = 1; + + skb_reserve(skbn, frontlen); + + lg = (ROSE_PACLEN > skb->len) ? skb->len : ROSE_PACLEN; + + /* Copy the user data */ + memcpy(skb_put(skbn, lg), skb->data, lg); + skb_pull(skb, lg); + + /* Duplicate the Header */ + skb_push(skbn, ROSE_MIN_LEN); + memcpy(skbn->data, header, ROSE_MIN_LEN); + + if (skb->len > 0) + skbn->data[2] |= M_BIT; + + skb_queue_tail(&sk->write_queue, skbn); /* Throw it on the queue */ + } + + skb->free = 1; + kfree_skb(skb, FREE_WRITE); + } else { + skb_queue_tail(&sk->write_queue, skb); /* Throw it on the queue */ + } +#else skb_queue_tail(&sk->write_queue, skb); /* Shove it onto the queue */ +#endif rose_kick(sk); @@ -1084,7 +1195,7 @@ static int rose_recvmsg(struct socket *sock, struct msghdr *msg, int size, int copied, qbit; unsigned char *asmptr; struct sk_buff *skb; - int er; + int n, er; /* * This works for seqpacket too. The receiver has ordered the queue for @@ -1120,16 +1231,21 @@ static int rose_recvmsg(struct socket *sock, struct msghdr *msg, int size, srose->srose_family = AF_ROSE; srose->srose_addr = sk->protinfo.rose->dest_addr; srose->srose_call = sk->protinfo.rose->dest_call; - srose->srose_ndigis = 0; - - if (sk->protinfo.rose->dest_ndigis == 1) { - srose->srose_ndigis = 1; - srose->srose_digi = sk->protinfo.rose->dest_digi; + srose->srose_ndigis = sk->protinfo.rose->dest_ndigis; + if (msg->msg_namelen >= sizeof(struct full_sockaddr_rose)) { + struct full_sockaddr_rose *full_srose = (struct full_sockaddr_rose *)msg->msg_name; + for (n = 0 ; n < sk->protinfo.rose->dest_ndigis ; n++) + full_srose->srose_digis[n] = sk->protinfo.rose->dest_digis[n]; + msg->msg_namelen = sizeof(struct full_sockaddr_rose); + } else { + if (sk->protinfo.rose->dest_ndigis >= 1) { + srose->srose_ndigis = 1; + srose->srose_digi = sk->protinfo.rose->dest_digis[0]; + } + msg->msg_namelen = sizeof(struct sockaddr_rose); } } - msg->msg_namelen = sizeof(struct sockaddr_rose); - skb_free_datagram(sk, skb); return copied; @@ -1259,7 +1375,7 @@ static int rose_get_info(char *buffer, char **start, off_t offset, int length, i cli(); - len += sprintf(buffer, "dest_addr dest_call src_addr src_call dev lci st vs vr va t t1 t2 t3 hb idle Snd-Q Rcv-Q inode\n"); + len += sprintf(buffer, "dest_addr dest_call src_addr src_call dev lci neigh st vs vr va t t1 t2 t3 hb idle Snd-Q Rcv-Q inode\n"); for (s = rose_list; s != NULL; s = s->next) { if ((dev = s->protinfo.rose->device) == NULL) @@ -1276,11 +1392,12 @@ static int rose_get_info(char *buffer, char **start, off_t offset, int length, i else callsign = ax2asc(&s->protinfo.rose->source_call); - len += sprintf(buffer + len, "%-10s %-9s %-5s %3.3X %d %d %d %d %3lu %3lu %3lu %3lu %3lu %3lu/%03lu %5d %5d %ld\n", + len += sprintf(buffer + len, "%-10s %-9s %-5s %3.3X %05d %d %d %d %d %3lu %3lu %3lu %3lu %3lu %3lu/%03lu %5d %5d %ld\n", rose2asc(&s->protinfo.rose->source_addr), callsign, devname, s->protinfo.rose->lci & 0x0FFF, + (s->protinfo.rose->neighbour) ? s->protinfo.rose->neighbour->number : 0, s->protinfo.rose->state, s->protinfo.rose->vs, s->protinfo.rose->vr, @@ -1399,7 +1516,7 @@ __initfunc(void rose_proto_init(struct net_proto *pro)) sock_register(&rose_family_ops); register_netdevice_notifier(&rose_dev_notifier); - printk(KERN_INFO "G4KLX ROSE for Linux. Version 0.3 for AX25.037 Linux 2.1\n"); + printk(KERN_INFO "F6FBB/G4KLX ROSE for Linux. Version 0.62 for AX25.037 Linux 2.1\n"); ax25_protocol_register(AX25_P_ROSE, rose_route_frame); ax25_linkfail_register(rose_link_failed); diff --git a/net/rose/rose_loopback.c b/net/rose/rose_loopback.c index ce66a9911..586bf9b85 100644 --- a/net/rose/rose_loopback.c +++ b/net/rose/rose_loopback.c @@ -84,7 +84,7 @@ static void rose_loopback_timer(unsigned long param) lci_i = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF); frametype = skb->data[2]; dest = (rose_address *)(skb->data + 4); - lci_o = sysctl_rose_maximum_vcs - lci_i + 1; + lci_o = 0xFFF - lci_i; skb->h.raw = skb->data; diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 2d6d23230..1fad6b7cc 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -65,7 +65,7 @@ static void rose_remove_neigh(struct rose_neigh *); */ static int rose_add_node(struct rose_route_struct *rose_route, struct device *dev) { - struct rose_node *rose_node; + struct rose_node *rose_node, *rose_tmpn, *rose_tmpp; struct rose_neigh *rose_neigh; unsigned long flags; int i; @@ -122,7 +122,27 @@ static int rose_add_node(struct rose_route_struct *rose_route, struct device *de restore_flags(flags); } + /* + * This is a new node to be inserted into the list. Find where it needs + * to be inserted into the list, and insert it. We want to be sure + * to order the list in descending order of mask size to ensure that + * later when we are searching this list the first match will be the + * best match. + */ if (rose_node == NULL) { + rose_tmpn = rose_node_list; + rose_tmpp = NULL; + + while (rose_tmpn != NULL) { + if (rose_tmpn->mask > rose_route->mask) { + rose_tmpp = rose_tmpn; + rose_tmpn = rose_tmpn->next; + } else { + break; + } + } + + /* create new node */ if ((rose_node = kmalloc(sizeof(*rose_node), GFP_ATOMIC)) == NULL) return -ENOMEM; @@ -133,8 +153,25 @@ static int rose_add_node(struct rose_route_struct *rose_route, struct device *de rose_node->neighbour[0] = rose_neigh; save_flags(flags); cli(); - rose_node->next = rose_node_list; - rose_node_list = rose_node; + + if (rose_tmpn == NULL) { + if (rose_tmpp == NULL) { /* Empty list */ + rose_node_list = rose_node; + rose_node->next = NULL; + } else { + rose_tmpp->next = rose_node; + rose_node->next = NULL; + } + } else { + if (rose_tmpp == NULL) { /* 1st node */ + rose_node->next = rose_node_list; + rose_node_list = rose_node; + } else { + rose_tmpp->next = rose_node; + rose_node->next = rose_tmpn; + } + } + restore_flags(flags); rose_neigh->count++; @@ -328,6 +365,11 @@ int rose_add_loopback_neigh(void) rose_loopback_neigh->number = rose_neigh_no++; rose_loopback_neigh->restarted = 1; + skb_queue_head_init(&rose_loopback_neigh->queue); + + init_timer(&rose_loopback_neigh->ftimer); + init_timer(&rose_loopback_neigh->t0timer); + save_flags(flags); cli(); rose_loopback_neigh->next = rose_neigh_list; rose_neigh_list = rose_loopback_neigh; @@ -349,7 +391,7 @@ int rose_add_loopback_node(rose_address *address) break; if (rose_node != NULL) return 0; - + if ((rose_node = kmalloc(sizeof(*rose_node), GFP_ATOMIC)) == NULL) return -ENOMEM; @@ -359,6 +401,7 @@ int rose_add_loopback_node(rose_address *address) rose_node->loopback = 1; rose_node->neighbour[0] = rose_loopback_neigh; + /* Insert at the head of list. Address is always mask=10 */ save_flags(flags); cli(); rose_node->next = rose_node_list; rose_node_list = rose_node; @@ -450,6 +493,7 @@ void rose_route_device_down(struct device *dev) /* * Clear all nodes and neighbours out, except for neighbours with * active connections going through them. + * Do not clear loopback neighbour and nodes. */ static int rose_clear_routes(void) { @@ -459,18 +503,18 @@ static int rose_clear_routes(void) while (rose_node != NULL) { t = rose_node; rose_node = rose_node->next; - - rose_remove_node(t); + if (!t->loopback) + rose_remove_node(t); } while (rose_neigh != NULL) { s = rose_neigh; rose_neigh = rose_neigh->next; - s->count = 0; - - if (s->use == 0) + if (s->use == 0 && !s->loopback) { + s->count = 0; rose_remove_neigh(s); + } } return 0; @@ -539,29 +583,21 @@ struct rose_route *rose_route_free_lci(unsigned int lci, struct rose_neigh *neig struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause, unsigned char *diagnostic) { struct rose_node *node; - struct rose_neigh *neigh; int failed = 0; - int mask = 0; int i; - for (neigh = NULL, node = rose_node_list; node != NULL; node = node->next) { + for (node = rose_node_list; node != NULL; node = node->next) { if (rosecmpm(addr, &node->address, node->mask) == 0) { - if (node->mask > mask) { - mask = node->mask; - - for (i = 0; i < node->count; i++) { - if (!rose_ftimer_running(node->neighbour[i])) - neigh = node->neighbour[i]; - else - failed = 1; - } + for (i = 0; i < node->count; i++) { + if (!rose_ftimer_running(node->neighbour[i])) { + return node->neighbour[i]; } + else + failed = 1; } + break; } } - if (neigh != NULL) - return neigh; - if (failed) { *cause = ROSE_OUT_OF_ORDER; *diagnostic = 0; @@ -697,7 +733,7 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) { struct rose_neigh *rose_neigh, *new_neigh; struct rose_route *rose_route; - struct rose_facilities facilities; + struct rose_facilities_struct facilities; rose_address *src_addr, *dest_addr; struct sock *sk; unsigned short frametype; @@ -705,6 +741,7 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) unsigned char cause, diagnostic; struct device *dev; unsigned long flags; + int len; if (call_in_firewall(PF_ROSE, skb->dev, skb->data, NULL, &skb) != FW_ACCEPT) return 0; @@ -718,8 +755,10 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) if (ax25cmp(&ax25->dest_addr, &rose_neigh->callsign) == 0 && ax25->ax25_dev->dev == rose_neigh->dev) break; - if (rose_neigh == NULL) + if (rose_neigh == NULL) { + printk("rose_route : unknown neighbour or device %s\n", ax2asc(&ax25->dest_addr)); return 0; + } /* * Obviously the link is working, halt the ftimer. @@ -739,8 +778,26 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) * Find an existing socket. */ if ((sk = rose_find_socket(lci, rose_neigh)) != NULL) { - skb->h.raw = skb->data; - return rose_process_rx_frame(sk, skb); + if (frametype == ROSE_CALL_REQUEST) { + /* Remove an existing unused socket */ + rose_clear_queues(sk); + sk->protinfo.rose->cause = ROSE_NETWORK_CONGESTION; + sk->protinfo.rose->diagnostic = 0; + sk->protinfo.rose->neighbour->use--; + sk->protinfo.rose->neighbour = NULL; + sk->protinfo.rose->lci = 0; + sk->protinfo.rose->state = ROSE_STATE_0; + sk->state = TCP_CLOSE; + sk->err = 0; + sk->shutdown |= SEND_SHUTDOWN; + if (!sk->dead) + sk->state_change(sk); + sk->dead = 1; + } + else { + skb->h.raw = skb->data; + return rose_process_rx_frame(sk, skb); + } } /* @@ -760,7 +817,11 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) */ for (rose_route = rose_route_list; rose_route != NULL; rose_route = rose_route->next) { if (rose_route->lci1 == lci && rose_route->neigh1 == rose_neigh) { - if (rose_route->neigh2 != NULL) { + if (frametype == ROSE_CALL_REQUEST) { + /* F6FBB - Remove an existing unused route */ + rose_remove_route(rose_route); + break; + } else if (rose_route->neigh2 != NULL) { skb->data[0] &= 0xF0; skb->data[0] |= (rose_route->lci2 >> 8) & 0x0F; skb->data[1] = (rose_route->lci2 >> 0) & 0xFF; @@ -775,7 +836,11 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) } } if (rose_route->lci2 == lci && rose_route->neigh2 == rose_neigh) { - if (rose_route->neigh1 != NULL) { + if (frametype == ROSE_CALL_REQUEST) { + /* F6FBB - Remove an existing unused route */ + rose_remove_route(rose_route); + break; + } else if (rose_route->neigh1 != NULL) { skb->data[0] &= 0xF0; skb->data[0] |= (rose_route->lci1 >> 8) & 0x0F; skb->data[1] = (rose_route->lci1 >> 0) & 0xFF; @@ -799,7 +864,12 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) if (frametype != ROSE_CALL_REQUEST) /* XXX */ return 0; - if (!rose_parse_facilities(skb, &facilities)) { + len = (((skb->data[3] >> 4) & 0x0F) + 1) / 2; + len += (((skb->data[3] >> 0) & 0x0F) + 1) / 2; + + memset(&facilities, 0x00, sizeof(struct rose_facilities_struct)); + + if (!rose_parse_facilities(skb->data + len + 4, &facilities)) { rose_transmit_clear_request(rose_neigh, lci, ROSE_INVALID_FACILITY, 76); return 0; } @@ -873,11 +943,11 @@ int rose_nodes_get_info(char *buffer, char **start, off_t offset, len += sprintf(buffer, "address mask n neigh neigh neigh\n"); for (rose_node = rose_node_list; rose_node != NULL; rose_node = rose_node->next) { - if (rose_node->loopback) { + /* if (rose_node->loopback) { len += sprintf(buffer + len, "%-10s %04d 1 loopback\n", rose2asc(&rose_node->address), rose_node->mask); - } else { + } else { */ len += sprintf(buffer + len, "%-10s %04d %d", rose2asc(&rose_node->address), rose_node->mask, @@ -888,7 +958,7 @@ int rose_nodes_get_info(char *buffer, char **start, off_t offset, rose_node->neighbour[i]->number); len += sprintf(buffer + len, "\n"); - } + /* } */ pos = begin + len; @@ -925,10 +995,10 @@ int rose_neigh_get_info(char *buffer, char **start, off_t offset, len += sprintf(buffer, "addr callsign dev count use mode restart t0 tf digipeaters\n"); for (rose_neigh = rose_neigh_list; rose_neigh != NULL; rose_neigh = rose_neigh->next) { - if (!rose_neigh->loopback) { + /* if (!rose_neigh->loopback) { */ len += sprintf(buffer + len, "%05d %-9s %-4s %3d %3d %3s %3s %3lu %3lu", rose_neigh->number, - ax2asc(&rose_neigh->callsign), + (rose_neigh->loopback) ? "RSLOOP-0" : ax2asc(&rose_neigh->callsign), rose_neigh->dev ? rose_neigh->dev->name : "???", rose_neigh->count, rose_neigh->use, @@ -953,7 +1023,7 @@ int rose_neigh_get_info(char *buffer, char **start, off_t offset, if (pos > offset + length) break; - } + /* } */ } sti(); diff --git a/net/rose/rose_subr.c b/net/rose/rose_subr.c index d80212261..dc172ac3b 100644 --- a/net/rose/rose_subr.c +++ b/net/rose/rose_subr.c @@ -251,9 +251,11 @@ int rose_decode(struct sk_buff *skb, int *ns, int *nr, int *q, int *d, int *m) return ROSE_ILLEGAL; } -static int rose_parse_national(unsigned char *p, struct rose_facilities *facilities, int len) +static int rose_parse_national(unsigned char *p, struct rose_facilities_struct *facilities, int len) { - unsigned char l, n = 0; + unsigned char *pt; + unsigned char l, lg, n = 0; + int fac_national_digis_received = 0; do { switch (*p & 0xC0) { @@ -280,12 +282,33 @@ static int rose_parse_national(unsigned char *p, struct rose_facilities *facilit case 0xC0: l = p[1]; if (*p == FAC_NATIONAL_DEST_DIGI) { - memcpy(&facilities->source_digi, p + 2, AX25_ADDR_LEN); - facilities->source_ndigis = 1; + if (!fac_national_digis_received) { + memcpy(&facilities->source_digis[0], p + 2, AX25_ADDR_LEN); + facilities->source_ndigis = 1; + } + } + else if (*p == FAC_NATIONAL_SRC_DIGI) { + if (!fac_national_digis_received) { + memcpy(&facilities->dest_digis[0], p + 2, AX25_ADDR_LEN); + facilities->dest_ndigis = 1; + } + } + else if (*p == FAC_NATIONAL_FAIL_CALL) { + memcpy(&facilities->fail_call, p + 2, AX25_ADDR_LEN); + } + else if (*p == FAC_NATIONAL_FAIL_ADD) { + memcpy(&facilities->fail_addr, p + 3, ROSE_ADDR_LEN); } - if (*p == FAC_NATIONAL_SRC_DIGI) { - memcpy(&facilities->dest_digi, p + 2, AX25_ADDR_LEN); - facilities->dest_ndigis = 1; + else if (*p == FAC_NATIONAL_DIGIS) { + fac_national_digis_received = 1; + facilities->source_ndigis = 0; + facilities->dest_ndigis = 0; + for (pt = p + 2, lg = 0 ; lg < l ; pt += AX25_ADDR_LEN, lg += AX25_ADDR_LEN) { + if (pt[6] & AX25_HBIT) + memcpy(&facilities->dest_digis[facilities->dest_ndigis++], pt, AX25_ADDR_LEN); + else + memcpy(&facilities->source_digis[facilities->source_ndigis++], pt, AX25_ADDR_LEN); + } } p += l + 2; n += l + 2; @@ -297,7 +320,7 @@ static int rose_parse_national(unsigned char *p, struct rose_facilities *facilit return n; } -static int rose_parse_ccitt(unsigned char *p, struct rose_facilities *facilities, int len) +static int rose_parse_ccitt(unsigned char *p, struct rose_facilities_struct *facilities, int len) { unsigned char l, n = 0; char callsign[11]; @@ -346,17 +369,9 @@ static int rose_parse_ccitt(unsigned char *p, struct rose_facilities *facilities return n; } -int rose_parse_facilities(struct sk_buff *skb, struct rose_facilities *facilities) +int rose_parse_facilities(unsigned char *p, struct rose_facilities_struct *facilities) { int facilities_len, len; - unsigned char *p; - - memset(facilities, 0x00, sizeof(struct rose_facilities)); - - len = (((skb->data[3] >> 4) & 0x0F) + 1) / 2; - len += (((skb->data[3] >> 0) & 0x0F) + 1) / 2; - - p = skb->data + len + 4; facilities_len = *p++; @@ -388,6 +403,7 @@ int rose_parse_facilities(struct sk_buff *skb, struct rose_facilities *facilitie break; } } + else break; /* Error in facilities format */ } return 1; @@ -397,7 +413,7 @@ int rose_create_facilities(unsigned char *buffer, rose_cb *rose) { unsigned char *p = buffer + 1; char *callsign; - int len; + int len, nb; /* National Facilities */ if (rose->rand != 0 || rose->source_ndigis == 1 || rose->dest_ndigis == 1) { @@ -410,17 +426,40 @@ int rose_create_facilities(unsigned char *buffer, rose_cb *rose) *p++ = (rose->rand >> 0) & 0xFF; } - if (rose->source_ndigis == 1) { + /* Sent before older facilities */ + if ((rose->source_ndigis > 0) || (rose->dest_ndigis > 0)) { + int maxdigi = 0; + *p++ = FAC_NATIONAL_DIGIS; + *p++ = AX25_ADDR_LEN * (rose->source_ndigis + rose->dest_ndigis); + for (nb = 0 ; nb < rose->source_ndigis ; nb++) { + if (++maxdigi >= ROSE_MAX_DIGIS) + break; + memcpy(p, &rose->source_digis[nb], AX25_ADDR_LEN); + p[6] |= AX25_HBIT; + p += AX25_ADDR_LEN; + } + for (nb = 0 ; nb < rose->dest_ndigis ; nb++) { + if (++maxdigi >= ROSE_MAX_DIGIS) + break; + memcpy(p, &rose->dest_digis[nb], AX25_ADDR_LEN); + p[6] &= ~AX25_HBIT; + p += AX25_ADDR_LEN; + } + } + + /* For compatibility */ + if (rose->source_ndigis > 0) { *p++ = FAC_NATIONAL_SRC_DIGI; *p++ = AX25_ADDR_LEN; - memcpy(p, &rose->source_digi, AX25_ADDR_LEN); + memcpy(p, &rose->source_digis[0], AX25_ADDR_LEN); p += AX25_ADDR_LEN; } - if (rose->dest_ndigis == 1) { + /* For compatibility */ + if (rose->dest_ndigis > 0) { *p++ = FAC_NATIONAL_DEST_DIGI; *p++ = AX25_ADDR_LEN; - memcpy(p, &rose->dest_digi, AX25_ADDR_LEN); + memcpy(p, &rose->dest_digis[0], AX25_ADDR_LEN); p += AX25_ADDR_LEN; } } diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 52512e879..9ae14c243 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -54,7 +54,7 @@ ----------------------------------------------------------------------- - Algorithm skeleton was taken from from NS simulator cbq.cc. + Algorithm skeleton was taken from NS simulator cbq.cc. If someone wants to check this code against the LBL version, he should take into account that ONLY the skeleton was borrowed, the implementation is different. Particularly: diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 80bc0a96f..eac678b83 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -7,6 +7,9 @@ * 2 of the License, or (at your option) any later version. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * Changes: + * J Hadi Salim <hadi@nortel.com> 980914: computation fixes */ #include <linux/config.h> @@ -156,9 +159,9 @@ red_enqueue(struct sk_buff *skb, struct Qdisc* sch) if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { long us_idle; - PSCHED_SET_PASTPERFECT(q->qidlestart); PSCHED_GET_TIME(now); us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max, 0); + PSCHED_SET_PASTPERFECT(q->qidlestart); /* The problem: ideally, average length queue recalcultion should @@ -177,10 +180,18 @@ red_enqueue(struct sk_buff *skb, struct Qdisc* sch) but it is field for experiments. */ q->qave >>= q->Stab[(us_idle>>q->Scell_log)&0xFF]; + } else { + q->qave += sch->stats.backlog - (q->qave >> q->Wlog); + /* NOTE: + q->qave is fixed point number with point at Wlog. + The formulae above is equvalent to floating point + version: + + qave = qave*(1-W) + sch->stats.backlog*W; + --ANK (980924) + */ } - q->qave += sch->stats.backlog - (q->qave >> q->Wlog); - if (q->qave < q->qth_min) { enqueue: q->qcount = -1; @@ -202,6 +213,22 @@ drop: goto drop; } if (++q->qcount) { + /* The formula used below causes questions. + + OK. qR is random number in the interval 0..Rmask + i.e. 0..(2^Plog). If we used floating point + arithmetics, it would be: (2^Plog)*rnd_num, + where rnd_num is less 1. + + Taking into account, that qave have fixed + point at Wlog, and Plog is related to max_P by + max_P = (qth_max-qth_min)/2^Plog; two lines + below have the following floating point equivalent: + + max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount + + Any questions? --ANK (980924) + */ if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR) goto enqueue; q->qcount = 0; @@ -289,7 +316,7 @@ static int red_init(struct Qdisc *sch, struct rtattr *opt) q->Plog = ctl->Plog; q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; q->Scell_log = ctl->Scell_log; - q->Scell_max = (256<<q->Scell_log)-1; + q->Scell_max = (255<<q->Scell_log); q->qth_min = ctl->qth_min<<ctl->Wlog; q->qth_max = ctl->qth_max<<ctl->Wlog; q->limit = ctl->limit; diff --git a/net/socket.c b/net/socket.c index 118a7276b..e53d74252 100644 --- a/net/socket.c +++ b/net/socket.c @@ -55,17 +55,9 @@ */ #include <linux/config.h> -#include <linux/signal.h> -#include <linux/errno.h> -#include <linux/sched.h> #include <linux/mm.h> -#include <linux/smp.h> #include <linux/smp_lock.h> -#include <linux/kernel.h> -#include <linux/major.h> -#include <linux/stat.h> #include <linux/socket.h> -#include <linux/fcntl.h> #include <linux/file.h> #include <linux/net.h> #include <linux/interrupt.h> @@ -80,20 +72,17 @@ #include <linux/kmod.h> #endif -#include <asm/system.h> #include <asm/uaccess.h> #include <linux/inet.h> #include <net/ip.h> -#include <net/protocol.h> +#include <net/sock.h> #include <net/rarp.h> #include <net/tcp.h> #include <net/udp.h> -#include <linux/skbuff.h> -#include <net/sock.h> #include <net/scm.h> - +static int sock_no_open(struct inode *irrelevant, struct file *dontcare); static long long sock_lseek(struct file *file, long long offset, int whence); static ssize_t sock_read(struct file *file, char *buf, size_t size, loff_t *ppos); @@ -121,7 +110,7 @@ static struct file_operations socket_file_ops = { sock_poll, sock_ioctl, NULL, /* mmap */ - NULL, /* no special open code... */ + sock_no_open, /* special open code to disallow open via /proc */ NULL, /* flush */ sock_close, NULL, /* no fsync */ @@ -305,6 +294,17 @@ struct socket *sock_alloc(void) return sock; } +/* + * In theory you can't get an open on this inode, but /proc provides + * a back door. Remember to keep it shut otherwise you'll let the + * creepy crawlies in. + */ + +static int sock_no_open(struct inode *irrelevant, struct file *dontcare) +{ + return -ENXIO; +} + void sock_release(struct socket *sock) { if (sock->state != SS_UNCONNECTED) diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index e3025334d..2e22c6461 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -10,7 +10,6 @@ #include <linux/malloc.h> #include <linux/socket.h> #include <linux/in.h> -#include <linux/utsname.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/auth.h> @@ -28,13 +27,7 @@ struct unx_cred { #define UNX_CRED_EXPIRE (60 * HZ) -#ifndef DONT_FILLIN_HOSTNAME -/* # define UNX_MAXNODENAME (sizeof(system_utsname.nodename)-1) */ -# define UNX_MAXNODENAME 32 -# define UNX_WRITESLACK (21 + (UNX_MAXNODENAME >> 2)) -#else -# define UNX_WRITESLACK 20 -#endif +#define UNX_WRITESLACK (21 + (UNX_MAXNODENAME >> 2)) #ifdef RPC_DEBUG # define RPCDBG_FACILITY RPCDBG_AUTH @@ -170,6 +163,7 @@ unx_match(struct rpc_task * task, struct rpc_cred *rcred) static u32 * unx_marshal(struct rpc_task *task, u32 *p, int ruid) { + struct rpc_clnt *clnt = task->tk_client; struct unx_cred *cred = (struct unx_cred *) task->tk_cred; u32 *base, *hold; int i, n; @@ -177,20 +171,15 @@ unx_marshal(struct rpc_task *task, u32 *p, int ruid) *p++ = htonl(RPC_AUTH_UNIX); base = p++; *p++ = htonl(jiffies/HZ); -#ifndef DONT_FILLIN_HOSTNAME + /* - * Problem: The UTS name could change under us. We can't lock - * here to handle this. On the other hand we can't really - * go building a bad RPC! + * Copy the UTS nodename captured when the client was created. */ - if ((n = strlen((char *) system_utsname.nodename)) > UNX_MAXNODENAME) - n = UNX_MAXNODENAME; + n = clnt->cl_nodelen; *p++ = htonl(n); - memcpy(p, system_utsname.nodename, n); + memcpy(p, clnt->cl_nodename, n); p += (n + 3) >> 2; -#else - *p++ = 0; -#endif + if (ruid) { *p++ = htonl((u32) cred->uc_uid); *p++ = htonl((u32) cred->uc_gid); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 9380ff4a4..dc06be6b0 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -28,6 +28,7 @@ #include <linux/mm.h> #include <linux/malloc.h> #include <linux/in.h> +#include <linux/utsname.h> #include <linux/sunrpc/clnt.h> @@ -57,6 +58,7 @@ static void call_reconnect(struct rpc_task *task); static u32 * call_header(struct rpc_task *task); static u32 * call_verify(struct rpc_task *task); + /* * Create an RPC client * FIXME: This should also take a flags argument (as in task->tk_flags). @@ -101,6 +103,12 @@ rpc_create_client(struct rpc_xprt *xprt, char *servname, if (!rpcauth_create(flavor, clnt)) goto out_no_auth; + + /* save the nodename */ + clnt->cl_nodelen = strlen(system_utsname.nodename); + if (clnt->cl_nodelen > UNX_MAXNODENAME) + clnt->cl_nodelen = UNX_MAXNODENAME; + memcpy(clnt->cl_nodename, system_utsname.nodename, clnt->cl_nodelen); out: return clnt; @@ -375,7 +383,7 @@ call_reserveresult(struct rpc_task *task) xprt_reserve(task); goto out; } else if (task->tk_status == -ETIMEDOUT) { - printk("RPC: task timed out\n"); + dprintk("RPC: task timed out\n"); task->tk_action = call_timeout; goto out; } else { @@ -493,13 +501,12 @@ static void call_receive(struct rpc_task *task) { dprintk("RPC: %4d call_receive (status %d)\n", - task->tk_pid, task->tk_status); + task->tk_pid, task->tk_status); + task->tk_action = call_status; /* In case of error, evaluate status */ - if (task->tk_status < 0) { - task->tk_action = call_status; + if (task->tk_status < 0) return; - } /* If we have no decode function, this means we're performing * a void call (a la lockd message passing). */ @@ -509,7 +516,6 @@ call_receive(struct rpc_task *task) return; } - task->tk_action = call_status; xprt_receive(task); } @@ -572,7 +578,8 @@ call_timeout(struct rpc_task *task) task->tk_pid); goto minor_timeout; } - if ((to->to_initval <<= 1) > to->to_maxval) + to->to_initval <<= 1; + if (to->to_initval > to->to_maxval) to->to_initval = to->to_maxval; } @@ -585,9 +592,13 @@ call_timeout(struct rpc_task *task) return; } if (clnt->cl_chatty && !(task->tk_flags & RPC_CALL_MAJORSEEN)) { - printk("%s: server %s not responding, still trying\n", - clnt->cl_protname, clnt->cl_server); task->tk_flags |= RPC_CALL_MAJORSEEN; + if (req) + printk("%s: server %s not responding, still trying\n", + clnt->cl_protname, clnt->cl_server); + else + printk("%s: task %d can't get a request slot\n", + clnt->cl_protname, task->tk_pid); } if (clnt->cl_autobind) clnt->cl_port = 0; diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 817a10127..26f1efc07 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -127,7 +127,7 @@ rpc_add_timer(struct rpc_task *task, rpc_action timer) task->tk_pid, task->tk_timeout * 1000 / HZ); if (!timer) timer = __rpc_default_timer; - if (expires < jiffies) { + if (time_before(expires, jiffies)) { printk(KERN_ERR "RPC: bad timeout value %ld - setting to 10 sec!\n", task->tk_timeout); expires = jiffies + 10 * HZ; @@ -413,7 +413,6 @@ __rpc_execute(struct rpc_task *task) task->tk_pid); if (current->pid == rpciod_pid) printk(KERN_ERR "RPC: rpciod waiting on sync task!\n"); - current->timeout = 0; sleep_on(&task->tk_wait); /* @@ -552,9 +551,8 @@ rpc_allocate(unsigned int flags, unsigned int size) } if (flags & RPC_TASK_ASYNC) return NULL; - current->timeout = jiffies + (HZ >> 4); current->state = TASK_INTERRUPTIBLE; - schedule(); + schedule_timeout(HZ>>4); } while (!signalled()); return NULL; @@ -846,9 +844,7 @@ rpciod_killall(void) if (all_tasks) { dprintk("rpciod_killall: waiting for tasks to exit\n"); current->state = TASK_INTERRUPTIBLE; - current->timeout = jiffies + 1; - schedule(); - current->timeout = 0; + schedule_timeout(1); } } @@ -919,9 +915,7 @@ rpciod_down(void) */ current->sigpending = 0; current->state = TASK_INTERRUPTIBLE; - current->timeout = jiffies + 1; - schedule(); - current->timeout = 0; + schedule_timeout(1); /* * Display a message if we're going to wait longer. */ diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index e97d339b3..4e0acee23 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -76,43 +76,52 @@ svc_serv_dequeue(struct svc_serv *serv, struct svc_rqst *rqstp) static inline void svc_release_skb(struct svc_rqst *rqstp) { - if (!rqstp->rq_skbuff) - return; + struct sk_buff *skb = rqstp->rq_skbuff; - dprintk("svc: releasing skb %p\n", rqstp->rq_skbuff); - skb_free_datagram(rqstp->rq_sock->sk_sk, rqstp->rq_skbuff); + if (!skb) + return; rqstp->rq_skbuff = NULL; + + dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); + skb_free_datagram(rqstp->rq_sock->sk_sk, skb); } /* * Queue up a socket with data pending. If there are idle nfsd - * processes, wake'em up. + * processes, wake 'em up. * When calling this function, you should make sure it can't be interrupted * by the network bottom half. */ -static inline void +static void svc_sock_enqueue(struct svc_sock *svsk) { + struct svc_serv *serv = svsk->sk_server; struct svc_rqst *rqstp; - struct svc_serv *serv; + + if (serv->sv_threads && serv->sv_sockets) + printk(KERN_ERR + "svc_sock_enqueue: threads and sockets both waiting??\n"); if (svsk->sk_busy) { /* Don't enqueue socket while daemon is receiving */ - dprintk("svc: socket %p not enqueued: busy\n", svsk->sk_sk); + dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk); return; } /* Mark socket as busy. It will remain in this state until the * server has processed all pending data and put the socket back - * on the idle list + * on the idle list. */ svsk->sk_busy = 1; - serv = svsk->sk_server; if ((rqstp = serv->sv_threads) != NULL) { dprintk("svc: socket %p served by daemon %p\n", - svsk->sk_sk, rqstp); + svsk->sk_sk, rqstp); svc_serv_dequeue(serv, rqstp); + if (rqstp->rq_sock) + printk(KERN_ERR + "svc_sock_enqueue: server %p, rq_sock=%p!\n", + rqstp, rqstp->rq_sock); rqstp->rq_sock = svsk; svsk->sk_inuse++; wake_up(&rqstp->rq_wait); @@ -137,7 +146,8 @@ svc_sock_dequeue(struct svc_serv *serv) end_bh_atomic(); if (svsk) { - dprintk("svc: socket %p dequeued\n", svsk->sk_sk); + dprintk("svc: socket %p dequeued, inuse=%d\n", + svsk->sk_sk, svsk->sk_inuse); svsk->sk_qued = 0; } @@ -325,13 +335,12 @@ svc_recvfrom(struct svc_rqst *rqstp, struct iovec *iov, int nr, int buflen) static void svc_udp_data_ready(struct sock *sk, int count) { - struct svc_sock *svsk; + struct svc_sock *svsk = (struct svc_sock *)(sk->user_data); - dprintk("svc: socket %p data ready (inet %p)\n", sk->user_data, sk); - - svsk = (struct svc_sock *)(sk->user_data); if (!svsk) return; + dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", + svsk, sk, count, svsk->sk_busy); svsk->sk_data = 1; svc_sock_enqueue(svsk); } @@ -677,7 +686,7 @@ error: /* * Send out data on TCP socket. * FIXME: Make the sendto call non-blocking in order not to hang - * a daemon on a a dead client. Requires write queue maintenance. + * a daemon on a dead client. Requires write queue maintenance. */ static int svc_tcp_sendto(struct svc_rqst *rqstp) @@ -722,14 +731,23 @@ svc_tcp_init(struct svc_sock *svsk) * Receive the next request on any socket. */ int -svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp) +svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) { - struct wait_queue wait = { current, NULL }; struct svc_sock *svsk; int len; + struct wait_queue wait = { current, NULL }; dprintk("svc: server %p waiting for data (to = %ld)\n", - rqstp, current->timeout); + rqstp, timeout); + + if (rqstp->rq_sock) + printk(KERN_ERR + "svc_recv: service %p, socket not NULL!\n", + rqstp); + if (waitqueue_active(&rqstp->rq_wait)) + printk(KERN_ERR + "svc_recv: service %p, wait queue active!\n", + rqstp); again: /* Initialize the buffers */ @@ -741,13 +759,10 @@ again: start_bh_atomic(); if ((svsk = svc_sock_dequeue(serv)) != NULL) { - end_bh_atomic(); rqstp->rq_sock = svsk; - svsk->sk_inuse++; /* N.B. where is this decremented? */ + svsk->sk_inuse++; } else { /* No data pending. Go to sleep */ - rqstp->rq_sock = NULL; - rqstp->rq_wait = NULL; svc_serv_enqueue(serv, rqstp); /* @@ -757,17 +772,24 @@ again: current->state = TASK_INTERRUPTIBLE; add_wait_queue(&rqstp->rq_wait, &wait); end_bh_atomic(); - schedule(); + schedule_timeout(timeout); + remove_wait_queue(&rqstp->rq_wait, &wait); + + start_bh_atomic(); if (!(svsk = rqstp->rq_sock)) { svc_serv_dequeue(serv, rqstp); - if (!(svsk = rqstp->rq_sock)) - return signalled()? -EINTR : -EAGAIN; + end_bh_atomic(); + dprintk("svc: server %p, no data yet\n", rqstp); + return signalled()? -EINTR : -EAGAIN; } } + end_bh_atomic(); - dprintk("svc: server %p servicing socket %p\n", rqstp, svsk); + dprintk("svc: server %p, socket %p, inuse=%d\n", + rqstp, svsk, svsk->sk_inuse); len = svsk->sk_recvfrom(rqstp); + dprintk("svc: got len=%d\n", len); /* No data, incomplete (TCP) read, or accept() */ if (len == 0 || len == -EAGAIN) { diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 4566ce5d2..851f4d952 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -310,7 +310,7 @@ xprt_adjust_cwnd(struct rpc_xprt *xprt, int result) if ((cwnd >>= 1) < RPC_CWNDSCALE) cwnd = RPC_CWNDSCALE; xprt->congtime = jiffies + ((cwnd * HZ) << 3) / RPC_CWNDSCALE; - dprintk("RPC: cong %08lx, cwnd was %08lx, now %08lx, " + dprintk("RPC: cong %ld, cwnd was %ld, now %ld, " "time %ld ms\n", xprt->cong, xprt->cwnd, cwnd, (xprt->congtime-jiffies)*1000/HZ); pprintk("RPC: %lu %ld cwnd\n", jiffies, cwnd); @@ -573,7 +573,6 @@ udp_data_ready(struct sock *sk, int len) struct rpc_rqst *rovr; struct sk_buff *skb; struct iovec iov[MAX_IOVEC]; - mm_segment_t oldfs; int err, repsize, copied; dprintk("RPC: udp_data_ready...\n"); @@ -603,9 +602,8 @@ udp_data_ready(struct sock *sk, int len) /* Okay, we have it. Copy datagram... */ memcpy(iov, rovr->rq_rvec, rovr->rq_rnr * sizeof(iov[0])); - oldfs = get_fs(); set_fs(get_ds()); - skb_copy_datagram_iovec(skb, 8, iov, copied); - set_fs(oldfs); + /* This needs to stay tied with the usermode skb_copy_dagram... */ + memcpy_tokerneliovec(iov, skb->data+8, copied); xprt_complete_rqst(xprt, rovr, copied); @@ -886,11 +884,14 @@ tcp_write_space(struct sock *sk) static void xprt_timer(struct rpc_task *task) { - if (task->tk_rqstp) + struct rpc_rqst *req = task->tk_rqstp; + + if (req) { xprt_adjust_cwnd(task->tk_xprt, -ETIMEDOUT); + } - dprintk("RPC: %4d xprt_timer (%s request)\n", task->tk_pid, - task->tk_rqstp? "pending" : "backlogged"); + dprintk("RPC: %4d xprt_timer (%s request)\n", + task->tk_pid, req ? "pending" : "backlogged"); task->tk_status = -ETIMEDOUT; task->tk_timeout = 0; @@ -1157,12 +1158,13 @@ xprt_reserve_status(struct rpc_task *task) return; bad_list: - printk("RPC: %4d inconsistent free list (cong %ld cwnd %ld)\n", + printk(KERN_ERR + "RPC: %4d inconsistent free list (cong %ld cwnd %ld)\n", task->tk_pid, xprt->cong, xprt->cwnd); rpc_debug = ~0; goto bummer; bad_used: - printk("RPC: used rqst slot %p on free list!\n", req); + printk(KERN_ERR "RPC: used rqst slot %p on free list!\n", req); bummer: task->tk_status = -EIO; xprt->free = NULL; @@ -1218,12 +1220,16 @@ xprt_release(struct rpc_task *task) } end_bh_atomic(); - /* Decrease congestion value. If congestion threshold is not yet - * reached, pass on the request slot. + /* Decrease congestion value. */ + xprt->cong -= RPC_CWNDSCALE; + +#if 0 + /* If congestion threshold is not yet reached, pass on the request slot. * This looks kind of kludgy, but it guarantees backlogged requests * are served in order. + * N.B. This doesn't look completely safe, as the task is still + * on the backlog list after wake-up. */ - xprt->cong -= RPC_CWNDSCALE; if (!RPCXPRT_CONGESTED(xprt)) { struct rpc_task *next = rpc_wake_up_next(&xprt->backlog); @@ -1234,9 +1240,14 @@ xprt_release(struct rpc_task *task) return; } } +#endif req->rq_next = xprt->free; xprt->free = req; + + /* If not congested, wake up the next backlogged process */ + if (!RPCXPRT_CONGESTED(xprt)) + rpc_wake_up_next(&xprt->backlog); } /* diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 8e0110b18..bdd15f744 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -8,7 +8,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Version: $Id: af_unix.c,v 1.68 1998/08/26 13:18:35 davem Exp $ + * Version: $Id: af_unix.c,v 1.71 1998/10/03 09:39:05 davem Exp $ * * Fixes: * Linus Torvalds : Assorted bug cures. @@ -463,7 +463,7 @@ static int unix_autobind(struct socket *sock) addr = kmalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL); if (!addr) - return -ENOBUFS; + return -ENOMEM; if (sk->protinfo.af_unix.addr || sk->protinfo.af_unix.dentry) { kfree(addr); @@ -548,7 +548,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); if (!addr) - return -ENOBUFS; + return -ENOMEM; /* We slept; recheck ... */ @@ -786,7 +786,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags) struct sk_buff *skb; if (sock->state != SS_UNCONNECTED) - return(-EINVAL); + return(-EINVAL); if (!(sock->flags & SO_ACCEPTCON)) return(-EINVAL); @@ -1332,7 +1332,7 @@ static int unix_shutdown(struct socket *sock, int mode) peer_mode |= SEND_SHUTDOWN; if (mode&SEND_SHUTDOWN) peer_mode |= RCV_SHUTDOWN; - other->shutdown |= mode; + other->shutdown |= peer_mode; other->state_change(other); } } diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 4f85caa73..3dcc2cada 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -31,6 +31,25 @@ * Fixes: * Alan Cox 07 Sept 1997 Vmalloc internal stack as needed. * Cope with changing max_files. + * Al Viro 11 Oct 1998 + * Graph may have cycles. That is, we can send the descriptor + * of foo to bar and vice versa. Current code chokes on that. + * Fix: move SCM_RIGHTS ones into the separate list and then + * skb_free() them all instead of doing explicit fput's. + * Another problem: since fput() may block somebody may + * create a new unix_socket when we are in the middle of sweep + * phase. Fix: revert the logic wrt MARKED. Mark everything + * upon the beginning and unmark non-junk ones. + * + * [12 Oct 1998] AAARGH! New code purges all SCM_RIGHTS + * sent to connect()'ed but still not accept()'ed sockets. + * Fixed. Old code had slightly different problem here: + * extra fput() in situation when we passed the descriptor via + * such socket and closed it (descriptor). That would happen on + * each unix_gc() until the accept(). Since the struct file in + * question would go to the free list and might be reused... + * That might be the reason of random oopses on close_fp() in + * unrelated processes. * */ @@ -123,11 +142,11 @@ extern inline int empty_stack(void) return in_stack == 0; } -extern inline void maybe_mark_and_push(unix_socket *x) +extern inline void maybe_unmark_and_push(unix_socket *x) { - if (x->protinfo.af_unix.marksweep&MARKED) + if (!(x->protinfo.af_unix.marksweep&MARKED)) return; - x->protinfo.af_unix.marksweep|=MARKED; + x->protinfo.af_unix.marksweep&=~MARKED; push_stack(x); } @@ -139,7 +158,8 @@ void unix_gc(void) static int in_unix_gc=0; int i; unix_socket *s; - unix_socket *next; + struct sk_buff_head hitlist; + struct sk_buff *skb; /* * Avoid a recursive GC. @@ -163,17 +183,21 @@ void unix_gc(void) max_stack=max_files; } + forall_unix_sockets(i, s) + { + s->protinfo.af_unix.marksweep|=MARKED; + } /* - * Assume everything is now unmarked + * Everything is now marked */ /* Invariant to be maintained: - - everything marked is either: + - everything unmarked is either: -- (a) on the stack, or - -- (b) has all of its children marked - - everything on the stack is always marked + -- (b) has all of its children unmarked + - everything on the stack is always unmarked - nothing is ever pushed onto the stack twice, because: - -- nothing previously marked is ever pushed on the stack + -- nothing previously unmarked is ever pushed on the stack */ /* @@ -186,8 +210,9 @@ void unix_gc(void) * If all instances of the descriptor are not * in flight we are in use. */ - if(s->socket && s->socket->file && s->socket->file->f_count > s->protinfo.af_unix.inflight) - maybe_mark_and_push(s); + if(s->socket && s->socket->file && + s->socket->file->f_count > s->protinfo.af_unix.inflight) + maybe_unmark_and_push(s); } /* @@ -198,7 +223,6 @@ void unix_gc(void) { unix_socket *x = pop_stack(); unix_socket *f=NULL,*sk; - struct sk_buff *skb; tail: skb=skb_peek(&x->receive_queue); @@ -227,16 +251,23 @@ tail: if((sk=unix_get_socket(*fp++))!=NULL) { /* - * Remember the first, mark the - * rest. + * Remember the first, + * unmark the rest. */ if(f==NULL) f=sk; else - maybe_mark_and_push(sk); + maybe_unmark_and_push(sk); } } } + /* We have to scan not-yet-accepted ones too */ + if (UNIXCB(skb).attr & MSG_SYN) { + if (f==NULL) + f=skb->sk; + else + maybe_unmark_and_push(skb->sk); + } skb=skb->next; } /* @@ -245,9 +276,9 @@ tail: if (f) { - if (!(f->protinfo.af_unix.marksweep&MARKED)) + if ((f->protinfo.af_unix.marksweep&MARKED)) { - f->protinfo.af_unix.marksweep|=MARKED; + f->protinfo.af_unix.marksweep&=~MARKED; x=f; f=NULL; goto tail; @@ -255,35 +286,37 @@ tail: } } - /* - * Sweep phase. NOTE: this part dominates the time complexity - */ + skb_queue_head_init(&hitlist); forall_unix_sockets(i, s) { - next=s->next; - if (!(s->protinfo.af_unix.marksweep&MARKED)) + if (s->protinfo.af_unix.marksweep&MARKED) { - /* - * We exist only in the passing tree of sockets - * that is no longer connected to active descriptors - * Time to die.. - * - * Subtle item: We will correctly sweep out the - * socket that has just been closed by the user. - * We must not close this as we are in the middle - * of its close at this moment. Skip that file - * using f_count==0 to spot it. - */ - - if(s->socket && s->socket->file && s->socket->file->f_count) - fput(s->socket->file); + struct sk_buff *nextsk; + skb=skb_peek(&s->receive_queue); + while(skb && skb != (struct sk_buff *)&s->receive_queue) + { + nextsk=skb->next; + /* + * Do we have file descriptors ? + */ + if(UNIXCB(skb).fp) + { + skb_unlink(skb); + skb_queue_tail(&hitlist,skb); + } + skb=nextsk; + } } - else - s->protinfo.af_unix.marksweep&=~MARKED; /* unmark everything for next collection */ } - + + /* + * Here we are. Hitlist is filled. Die. + */ + + while ((skb=skb_dequeue(&hitlist))!=NULL) { + kfree_skb(skb); + } + in_unix_gc=0; - - free_page((long)stack); } diff --git a/net/wanrouter/wanproc.c b/net/wanrouter/wanproc.c index f92f87e1f..9aa0c3485 100644 --- a/net/wanrouter/wanproc.c +++ b/net/wanrouter/wanproc.c @@ -19,6 +19,7 @@ * Dec 13, 1996 Gene Kozin Initial version (based on Sangoma's WANPIPE) *****************************************************************************/ +#include <linux/config.h> #include <linux/stddef.h> /* offsetof(), etc. */ #include <linux/errno.h> /* return codes */ #include <linux/kernel.h> @@ -56,6 +57,8 @@ typedef struct wan_stat_entry /****** Function Prototypes *************************************************/ +#ifdef CONFIG_PROC_FS + /* Proc filesystem interface */ static int router_proc_perms(struct inode *, int); static ssize_t router_proc_read(struct file* file, char* buf, size_t count, loff_t *ppos); @@ -176,7 +179,7 @@ static struct inode_operations wandev_inode = /* * /proc/net/router */ - + static struct proc_dir_entry proc_router = { 0, /* .low_ino */ @@ -528,3 +531,30 @@ static int wandev_get_info(char* buf, char** start, off_t offs, int len, * End */ +#else + +/* + * No /proc - output stubs + */ + +__initfunc(int wanrouter_proc_init(void)) +{ + return 0; +} + +void wanrouter_proc_cleanup(void) +{ + return; +} + +int wanrouter_proc_add(wan_device_t *wandev) +{ + return 0; +} + +int wanrouter_proc_delete(wan_device_t *wandev) +{ + return 0; +} + +#endif |