diff options
author | Ralf Baechle <ralf@linux-mips.org> | 1997-01-07 02:33:00 +0000 |
---|---|---|
committer | <ralf@linux-mips.org> | 1997-01-07 02:33:00 +0000 |
commit | beb116954b9b7f3bb56412b2494b562f02b864b1 (patch) | |
tree | 120e997879884e1b9d93b265221b939d2ef1ade1 /net | |
parent | 908d4681a1dc3792ecafbe64265783a86c4cccb6 (diff) |
Import of Linux/MIPS 2.1.14
Diffstat (limited to 'net')
139 files changed, 53010 insertions, 15983 deletions
diff --git a/net/802/Makefile b/net/802/Makefile index a81249c91..cb8d33a61 100644 --- a/net/802/Makefile +++ b/net/802/Makefile @@ -7,49 +7,28 @@ # # Note 2! The CFLAGS definition is now in the main makefile... -.c.o: - $(CC) $(CFLAGS) -c $< -.s.o: - $(AS) -o $*.o $< -.c.s: - $(CC) $(CFLAGS) -S $< - - -OBJS := p8023.o +O_TARGET := 802.o +O_OBJS = p8023.o sysctl_net_802.o ifdef CONFIG_TR +O_OBJS += tr.o +endif -OBJS := $(OBJS) tr.o - +ifdef CONFIG_FDDI +O_OBJS += fddi.o endif ifdef CONFIG_IPX - -OBJS := $(OBJS) p8022.o psnap.o - +OX_OBJS += p8022.o psnap.o p8022tr.o endif ifdef CONFIG_ATALK ifndef CONFIG_IPX - -OBJS := $(OBJS) p8022.o psnap.o - +OX_OBJS += p8022.o psnap.o p8022tr.o endif endif -802.o: $(OBJS) - $(LD) -r -o 802.o $(OBJS) - - -dep: - $(CPP) -M *.c > .depend +include $(TOPDIR)/Rules.make tar: tar -cvf /dev/f1 . - -# -# include a dependency file if one exists -# -ifeq (.depend,$(wildcard .depend)) -include .depend -endif diff --git a/net/802/fddi.c b/net/802/fddi.c new file mode 100644 index 000000000..24fcff127 --- /dev/null +++ b/net/802/fddi.c @@ -0,0 +1,162 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * FDDI-type device handling. + * + * Version: @(#)fddi.c 1.0.0 08/12/96 + * + * Authors: Lawrence V. Stefani, <stefani@lkg.dec.com> + * + * fddi.c is based on previous eth.c and tr.c work by + * Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Mark Evans, <evansmp@uhura.aston.ac.uk> + * Florian La Roche, <rzsfl@rz.uni-sb.de> + * Alan Cox, <gw4pts@gw4pts.ampr.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include <asm/segment.h> +#include <asm/system.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/fddidevice.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <net/arp.h> +#include <net/sock.h> + +/* + * Create the FDDI MAC header for an arbitrary protocol layer + * + * saddr=NULL means use device source address + * daddr=NULL means leave destination address (eg unresolved arp) + */ + +int fddi_header( + struct sk_buff *skb, + struct device *dev, + unsigned short type, + void *daddr, + void *saddr, + unsigned len + ) + + { + struct fddihdr *fddi = (struct fddihdr *)skb_push(skb, FDDI_K_SNAP_HLEN); + + /* Fill in frame header - assume 802.2 SNAP frames for now */ + + fddi->fc = FDDI_FC_K_ASYNC_LLC_DEF; + fddi->hdr.llc_snap.dsap = FDDI_EXTENDED_SAP; + fddi->hdr.llc_snap.ssap = FDDI_EXTENDED_SAP; + fddi->hdr.llc_snap.ctrl = FDDI_UI_CMD; + fddi->hdr.llc_snap.oui[0] = 0x00; + fddi->hdr.llc_snap.oui[1] = 0x00; + fddi->hdr.llc_snap.oui[2] = 0x00; + fddi->hdr.llc_snap.ethertype = htons(type); + + /* Set the source and destination hardware addresses */ + + if (saddr != NULL) + memcpy(fddi->saddr, saddr, dev->addr_len); + else + memcpy(fddi->saddr, dev->dev_addr, dev->addr_len); + + if (daddr != NULL) + { + memcpy(fddi->daddr, daddr, dev->addr_len); + return(FDDI_K_SNAP_HLEN); + } + return(-FDDI_K_SNAP_HLEN); + } + + +/* + * Rebuild the FDDI MAC header. This is called after an ARP + * (or in future other address resolution) has completed on + * this sk_buff. We now let ARP fill in the other fields. 
+ */ + +int fddi_rebuild_header( + void *buff, + struct device *dev, + unsigned long dest, + struct sk_buff *skb + ) + + { + struct fddihdr *fddi = (struct fddihdr *)buff; + + /* Only ARP/IP is currently supported */ + + if (fddi->hdr.llc_snap.ethertype != htons(ETH_P_IP)) + { + printk("fddi_rebuild_header: Don't know how to resolve type %04X addresses?\n", (unsigned int)htons(fddi->hdr.llc_snap.ethertype)); + return(0); + } + + /* Try to get ARP to resolve the header and fill destination address */ + + if (arp_find(fddi->daddr, dest, dev, dev->pa_addr, skb)) + return(1); + else + return(0); + } + + +/* + * Determine the packet's protocol ID and fill in skb fields. + * This routine is called before an incoming packet is passed + * up. It's used to fill in specific skb fields and to set + * the proper pointer to the start of packet data (skb->data). + */ + +unsigned short fddi_type_trans( + struct sk_buff *skb, + struct device *dev + ) + + { + struct fddihdr *fddi = (struct fddihdr *)skb->data; + + /* + * Set mac.raw field to point to FC byte, set data field to point + * to start of packet data. Assume 802.2 SNAP frames for now. 
+ */ + + skb->mac.raw = skb->data; /* point to frame control (FC) */ + skb_pull(skb, FDDI_K_SNAP_HLEN); /* adjust for 21 byte header */ + + /* Set packet type based on destination address and flag settings */ + + if (*fddi->daddr & 0x01) + { + if (memcmp(fddi->daddr, dev->broadcast, FDDI_K_ALEN) == 0) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + } + + else if (dev->flags & IFF_PROMISC) + { + if (memcmp(fddi->daddr, dev->dev_addr, FDDI_K_ALEN)) + skb->pkt_type = PACKET_OTHERHOST; + } + + /* Assume 802.2 SNAP frames, for now */ + + return(fddi->hdr.llc_snap.ethertype); + } diff --git a/net/802/p8022.c b/net/802/p8022.c index dd1510774..f8754e5c0 100644 --- a/net/802/p8022.c +++ b/net/802/p8022.c @@ -1,11 +1,20 @@ +#include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/datalink.h> #include <linux/mm.h> #include <linux/in.h> +#include <net/p8022.h> static struct datalink_proto *p8022_list = NULL; +/* + * We don't handle the loopback SAP stuff, the extended + * 802.2 command set, multicast SAP identifiers and non UI + * frames. We have the absolute minimum needed for IPX, + * IP and Appletalk phase 2. 
+ */ + static struct datalink_proto * find_8022_client(unsigned char type) { @@ -27,7 +36,7 @@ p8022_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) proto = find_8022_client(*(skb->h.raw)); if (proto != NULL) { skb->h.raw += 3; - skb->len -= 3; + skb_pull(skb,3); return proto->rcvfunc(skb, dev, pt); } @@ -41,36 +50,37 @@ p8022_datalink_header(struct datalink_proto *dl, struct sk_buff *skb, unsigned char *dest_node) { struct device *dev = skb->dev; - unsigned long len = skb->len; - unsigned long hard_len = dev->hard_header_len; unsigned char *rawp; - dev->hard_header(skb->data, dev, len - hard_len, - dest_node, NULL, len - hard_len, skb); - rawp = skb->data + hard_len; - *rawp = dl->type[0]; - rawp++; - *rawp = dl->type[0]; - rawp++; + rawp = skb_push(skb,3); + *rawp++ = dl->type[0]; + *rawp++ = dl->type[0]; *rawp = 0x03; /* UI */ - rawp++; - skb->h.raw = rawp; + dev->hard_header(skb, dev, ETH_P_802_3, dest_node, NULL, skb->len); } static struct packet_type p8022_packet_type = { - 0, /* MUTTER ntohs(ETH_P_IPX),*/ + 0, /* MUTTER ntohs(ETH_P_8022),*/ NULL, /* All devices */ p8022_rcv, NULL, NULL, }; + +static struct symbol_table p8022_proto_syms = { +#include <linux/symtab_begin.h> + X(register_8022_client), + X(unregister_8022_client), +#include <linux/symtab_end.h> +}; void p8022_proto_init(struct net_proto *pro) { p8022_packet_type.type=htons(ETH_P_802_2); dev_add_pack(&p8022_packet_type); + register_symtab(&p8022_proto_syms); } struct datalink_proto * @@ -96,3 +106,24 @@ register_8022_client(unsigned char type, int (*rcvfunc)(struct sk_buff *, struct return proto; } +void unregister_8022_client(unsigned char type) +{ + struct datalink_proto *tmp, **clients = &p8022_list; + unsigned long flags; + + save_flags(flags); + cli(); + + while ((tmp = *clients) != NULL) + { + if (tmp->type[0] == type) { + *clients = tmp->next; + kfree_s(tmp, sizeof(struct datalink_proto)); + break; + } else { + clients = &tmp->next; + } + } + + restore_flags(flags); +} 
diff --git a/net/802/p8022tr.c b/net/802/p8022tr.c new file mode 100644 index 000000000..d1fcc5c46 --- /dev/null +++ b/net/802/p8022tr.c @@ -0,0 +1,137 @@ +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <net/datalink.h> +#include <linux/mm.h> +#include <linux/in.h> +#include <net/p8022tr.h> + +#define SNAP_HEADER_LEN 8 + +static struct datalink_proto *p8022tr_list = NULL; + +/* + * We don't handle the loopback SAP stuff, the extended + * 802.2 command set, multicast SAP identifiers and non UI + * frames. We have the absolute minimum needed for IPX, + * IP and Appletalk phase 2. + */ + +static struct datalink_proto * +find_8022tr_client(unsigned char type) +{ + struct datalink_proto *proto; + + for (proto = p8022tr_list; + ((proto != NULL) && (*(proto->type) != type)); + proto = proto->next) + ; + + return proto; +} + +int +p8022tr_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) +{ + struct datalink_proto *proto; + + proto = find_8022tr_client(*(skb->h.raw)); + if (proto != NULL) { + skb->h.raw += 3; + skb_pull(skb,3); + return proto->rcvfunc(skb, dev, pt); + } + + skb->sk = NULL; + kfree_skb(skb, FREE_READ); + return 0; +} + +static void +p8022tr_datalink_header(struct datalink_proto *dl, + struct sk_buff *skb, unsigned char *dest_node) +{ + struct device *dev = skb->dev; + unsigned char *rawp; + unsigned char *olddata; + unsigned char *newdata; + + rawp = skb_push(skb,3); + *rawp++ = dl->type[0]; + *rawp++ = dl->type[0]; + *rawp = 0x03; /* UI */ + dev->hard_header(skb, dev, ETH_P_802_3, dest_node, NULL, skb->len); + olddata = skb->data; + newdata = skb_pull(skb, SNAP_HEADER_LEN); + memmove(newdata, olddata, dev->hard_header_len - SNAP_HEADER_LEN); +} + +static struct packet_type p8022tr_packet_type = +{ + 0, + NULL, /* All devices */ + p8022tr_rcv, + NULL, + NULL, +}; + + +static struct symbol_table p8022tr_proto_syms = { +#include <linux/symtab_begin.h> + X(register_8022tr_client), + 
X(unregister_8022tr_client), +#include <linux/symtab_end.h> +}; + +void p8022tr_proto_init(struct net_proto *pro) +{ + p8022tr_packet_type.type=htons(ETH_P_TR_802_2); + dev_add_pack(&p8022tr_packet_type); + register_symtab(&p8022tr_proto_syms); +} + +struct datalink_proto * +register_8022tr_client(unsigned char type, int (*rcvfunc)(struct sk_buff *, struct device *, struct packet_type *)) +{ + struct datalink_proto *proto; + + if (find_8022tr_client(type) != NULL) + return NULL; + + proto = (struct datalink_proto *) kmalloc(sizeof(*proto), GFP_ATOMIC); + if (proto != NULL) { + proto->type[0] = type; + proto->type_len = 1; + proto->rcvfunc = rcvfunc; + proto->header_length = 3; + proto->datalink_header = p8022tr_datalink_header; + proto->string_name = "802.2TR"; + proto->next = p8022tr_list; + p8022tr_list = proto; + } + + return proto; +} + +void unregister_8022tr_client(unsigned char type) +{ + struct datalink_proto *tmp, **clients = &p8022tr_list; + unsigned long flags; + + save_flags(flags); + cli(); + + while ((tmp = *clients) != NULL) + { + if (tmp->type[0] == type) { + *clients = tmp->next; + kfree_s(tmp, sizeof(struct datalink_proto)); + break; + } else { + clients = &tmp->next; + } + } + + restore_flags(flags); +} + diff --git a/net/802/p8023.c b/net/802/p8023.c index 4b1f5e0bf..57bd6a74a 100644 --- a/net/802/p8023.c +++ b/net/802/p8023.c @@ -9,12 +9,8 @@ p8023_datalink_header(struct datalink_proto *dl, struct sk_buff *skb, unsigned char *dest_node) { struct device *dev = skb->dev; - unsigned long len = skb->len; - unsigned long hard_len = dev->hard_header_len; - - dev->hard_header(skb->data, dev, len - hard_len, - dest_node, NULL, len - hard_len, skb); - skb->h.raw = skb->data + hard_len; + + dev->hard_header(skb, dev, ETH_P_802_3, dest_node, NULL, skb->len); } struct datalink_proto * @@ -33,3 +29,9 @@ make_8023_client(void) return proto; } +void destroy_8023_client(struct datalink_proto *dl) +{ + if (dl) + kfree_s(dl,sizeof(struct datalink_proto)); +} + 
diff --git a/net/802/psnap.c b/net/802/psnap.c index d0186c54e..4f17352ab 100644 --- a/net/802/psnap.c +++ b/net/802/psnap.c @@ -10,6 +10,7 @@ * 2 of the License, or (at your option) any later version. */ +#include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/datalink.h> @@ -58,7 +59,7 @@ int snap_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) */ skb->h.raw += 5; - skb->len -= 5; + skb_pull(skb,5); if (psnap_packet_type.type == 0) psnap_packet_type.type=htons(ETH_P_SNAP); return proto->rcvfunc(skb, dev, &psnap_packet_type); @@ -74,24 +75,27 @@ int snap_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) static void snap_datalink_header(struct datalink_proto *dl, struct sk_buff *skb, unsigned char *dest_node) { - struct device *dev = skb->dev; - unsigned char *rawp; - - rawp = skb->data + snap_dl->header_length+dev->hard_header_len; - memcpy(rawp,dl->type,5); - skb->h.raw = rawp+5; + memcpy(skb_push(skb,5),dl->type,5); snap_dl->datalink_header(snap_dl, skb, dest_node); } /* * Set up the SNAP layer */ + +static struct symbol_table snap_proto_syms = { +#include <linux/symtab_begin.h> + X(register_snap_client), + X(unregister_snap_client), +#include <linux/symtab_end.h> +}; void snap_proto_init(struct net_proto *pro) { snap_dl=register_8022_client(0xAA, snap_rcv); if(snap_dl==NULL) printk("SNAP - unable to register with 802.2\n"); + register_symtab(&snap_proto_syms); } /* @@ -121,3 +125,31 @@ struct datalink_proto *register_snap_client(unsigned char *desc, int (*rcvfunc)( return proto; } +/* + * Unregister SNAP clients. Protocols no longer want to play with us ... 
+ */ + +void unregister_snap_client(unsigned char *desc) +{ + struct datalink_proto **clients = &snap_list; + struct datalink_proto *tmp; + unsigned long flags; + + save_flags(flags); + cli(); + + while ((tmp = *clients) != NULL) + { + if (memcmp(tmp->type,desc,5) == 0) + { + *clients = tmp->next; + kfree_s(tmp, sizeof(struct datalink_proto)); + break; + } + else + clients = &tmp->next; + } + + restore_flags(flags); +} + diff --git a/net/802/sysctl_net_802.c b/net/802/sysctl_net_802.c new file mode 100644 index 000000000..96f51588c --- /dev/null +++ b/net/802/sysctl_net_802.c @@ -0,0 +1,13 @@ +/* -*- linux-c -*- + * sysctl_net_802.c: sysctl interface to net 802 subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/802 directory entry (empty =) ). [MS] + */ + +#include <linux/mm.h> +#include <linux/sysctl.h> + +ctl_table e802_table[] = { + {0} +}; diff --git a/net/802/tr.c b/net/802/tr.c index 643cf64c5..c12a66d83 100644 --- a/net/802/tr.c +++ b/net/802/tr.c @@ -1,4 +1,4 @@ -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <linux/types.h> #include <linux/kernel.h> @@ -38,12 +38,13 @@ rif_cache rif_table[RIF_TABLE_SIZE]={ NULL, }; #define RIF_CHECK_INTERVAL 60*HZ static struct timer_list rif_timer={ NULL,NULL,RIF_CHECK_INTERVAL,0L,rif_check_expire }; -int tr_header(unsigned char *buff, struct device *dev, unsigned short type, - void *daddr, void *saddr, unsigned len, struct sk_buff *skb) { +int tr_header(struct sk_buff *skb, struct device *dev, unsigned short type, + void *daddr, void *saddr, unsigned len) +{ + + struct trh_hdr *trh=(struct trh_hdr *)skb_push(skb,dev->hard_header_len); + struct trllc *trllc=(struct trllc *)(trh+1); - struct trh_hdr *trh=(struct trh_hdr *)buff; - struct trllc *trllc=(struct trllc *)(buff+sizeof(struct trh_hdr)); - trh->ac=AC; trh->fc=LLC_FRAME; @@ -92,6 +93,10 @@ unsigned short tr_type_trans(struct sk_buff *skb, struct device *dev) { struct trh_hdr *trh=(struct trh_hdr 
*)skb->data; struct trllc *trllc=(struct trllc *)(skb->data+sizeof(struct trh_hdr)); + skb->mac.raw = skb->data; + + skb_pull(skb,dev->hard_header_len); + if(trh->saddr[0] & TR_RII) tr_add_rif_info(trh); @@ -231,7 +236,7 @@ static void rif_check_expire(unsigned long dummy) { restore_flags(flags); del_timer(&rif_timer); - rif_timer.expires=RIF_CHECK_INTERVAL; + rif_timer.expires=jiffies+RIF_CHECK_INTERVAL; add_timer(&rif_timer); } diff --git a/net/Changes b/net/Changes index b316f85fd..30898803a 100644 --- a/net/Changes +++ b/net/Changes @@ -7,7 +7,7 @@ Initial patches to catch up with things we want to add. o Merged in the Jorge Cwik fast checksum. [TESTED] o Added Arnt Gulbrandsen's fast UDP build. [TESTED] -o Pauline Middelinks masquerade patch [IN/COMPILES] +o Pauline Middelinks masquerade patch [TESTED] 0.1 @@ -21,16 +21,16 @@ o Cache last socket for TCP [TESTED] o Routing cache (only in ip_build_header so far) [TESTED] ------->>>>> ALPHA 001 <<<<<---------- o eql load balancing driver. [TESTED] -o Token ring drivers. [COMPILE - CANT TEST] +o Token ring drivers. [TESTED] o IPIP and tunnels [TESTED] -o Fix ethernet/token ring promisc broadcast error [QUICK TEST] +o Fix ethernet/token ring promisc broadcast error [TESTED] (pkt_type set to OTHERHOST in error). o Fixed bug in the routing caches [TESTED] o Protocol header cache support [TESTED] o Fixed ip_build_xmit loopback bugs [TESTED] o Fixes for SIOCGSTAMP on SOCK_PACKET [TESTED] o Perfect hash on net_bh(). [TESTED] -o Sonix ISDN driver. [NOT INCLUDED YET] +o Sonix ISDN driver. 
[SEPARATED/SENT] o Use ip_build_xmit for raw sockets [TESTED] o 3c501 fixed for speed [TESTED] ------->>>>> ALPHA 002 <<<<<-------- @@ -40,96 +40,370 @@ o Merged in G4KLX AX.25 state machine, with KA9Q o Massive loopback device bug fixed [TESTED] ------->>>>> ALPHA 003 <<<<<---------- o Revised code layout [TESTED] -o More bug fixes (tracroute etc) [TESTED] +o More bug fixes (traceroute etc) [TESTED] ------->>>>> ALPHA 004 <<<<<---------- o IP build xmit fragment fixes [TESTED] o TCP SWS probe fix [TESTED] o Appletalk DDP [TESTED] o IP firewall bug fixed [TESTED] -o IP masquerade ftp port spoof [IN] +o IP masquerade ftp port spoof [TESTED] o gcc 2.6.3 -O3 fix for checksum assembler [TESTED] o /proc support shows both timers [TESTED] o TCP irtt support [TESTED] o RTF_REJECT routing support [TESTED] o Fixed 0 length fragment bug [TESTED] o Fixed overlapping reasm bug [TESTED] -o Newest AX.25 code from John Naylor [IN] -o NetROM from John Naylor [IN] +o Newest AX.25 code from John Naylor [TESTED] +o NetROM from John Naylor [TESTED] o Routerless DDP fixes from Wesley [TESTED] ------->>>>> ALPHA 005 <<<<<---------- -o Several compile and bugfixes from Jakko [IN] +o Several compile and bugfixes from Jakko [TESTED] o Connect fix from Matt Day (+ fix to fix) [TESTED] o RTT, memory leak and other netrom/ax.25 cures - -- John Naylor [IN] -o IP source route via broadcast now illegal [IN] + -- John Naylor [TESTED] +o IP source route via broadcast now illegal [TESTED] ------->>>>> ALPHA 006 <<<<<---------- -o Yet more NetROM/AX.25 improvements [IN] +o Yet more NetROM/AX.25 improvements [TESTED] -- John Naylor o Fixed a _stupid_ appletalk bug [TESTED] -o Missing include [IN] +o Missing include [TESTED] -- Lots of people -o Can drop all source routes [IN] -o Printing fixes for ip_fw [IN] -o UDP checksum fix (Gerhard) [IN] +o Can drop all source routes [TESTED] +o Printing fixes for ip_fw [TESTED] +o UDP checksum fix (Gerhard) [TESTED] o Newer 3c505 driver from Juha Laiho [IN] -o 
Security fix to axassociate [IN] +o Security fix to axassociate [TESTED] o Loopback driver debugged (fixes named) [TESTED] -o SCC driver from Joerg Reuter [IN] -o IP Firewall accounting zero bug [IN] - -////////////////////////////1.3.0/////////////////////////// ?? - -o Finish merging the bridge code -o Device locking -o Faster ip_csum -o SIOCSLEEPRT patch -o Options support in ip_build_xmit [PENDING] -o Fast checksum/copy on outgoing TCP -o New buffers. Used totally non-optimally -o Long word align ethernet IP headers (64byte align for pentium) -o Explode/implode headers for alpha,mips etc. -o Fast dev_grab_next() transmit reload function - and dev_push_failed() ?? -o Faster ip_forward -o Faster loopback frame forwarding. -o Forwarding queue control (+ fairness algorithms ??) -o Merge loadable firewall code. -o IP forward flow control. -o Infinite PPP/SLIP devices. +o SCC driver from Joerg Reuter [TESTED] +o IP Firewall accounting zero bug [TESTED] + +////////////////////////////1.3.0/////////////////////////// + + +o Merged loadable firewall code [TESTED] +o New buffers used totally non optimally [TESTED] +o Fast ip_forwarding (needs changing) [NOW INCLUDED IN 1.3.15] +o Fixed connection hang bug in new SWS code [TESTED] +o Buffer management hack putting skbuff control + after data in the frame because kmalloc is + totally cache non-optimal [TESTED] +o Faster checksum [Tom May] [TESTED] +o Appletalk router fixes [Michael Callahan] [TESTED] +o TCP state error fixes [Mark Tamsky] [TESTED] +o Verify area fixes [Heiko Eissfeldt] [TESTED] +o Routes use metric field [John Naylor] [TESTED] +o Major AX.25/NetROM fixes [John Nalor] [TESTED] + +------->>>>> NET3 030 <<<<<---------- + +o Long word align ethernet IP headers (64byte align for pentium) [TESTED] + (less helpful than I'd have liked) +o Fixed variable length header support to really work [TESTED] +o Mend appletalk/ipx partially [TESTED] +o Start playing with input checksum & copy [TESTED] +o Fixed PPP and other 
oddments [TESTED] +o Mended IPIP [TESTED] + +------->>>>> 1.3.7 <<<<<---------- + +o Checksum bug fixed [TESTED] +o Lance driver panic cured [TESTED] +o DEC ALPHA stuff (Linus) [ASK HIM NOT ME] +o Always try to keep output packet order + (eg for vat and BSD fast path tcp) [TESTED] +o Copy the mac pointer in skb_clone [TESTED] +o Fix tcpdump panic [TESTED] +o Fix dev_alloc_skb NULL deref bug [TESTED] +o Fix Security error in SIGURG stuff [TESTED] +o Missing 15 byte slack on ip_loopback [TESTED] + +------->>>>> 1.3.8 <<<<<---------- + +o UDP snmp count fixed [TESTED] +o IP snmp out count fixed [TESTED] +o First bit of Dave Bonn's fast forwarding [TESTED/NOW WORKS] +o Fix leaks and double free in firewalling [TESTED] +o Fix memory scribble in ip_build_xmit [TESTED] +o Do fast cases of ip_build_xmit first + slows fragmented I/O down, speeds up smaller + packets. UDP send ttcp can now touch 7.5Mbyte/sec + with nothing else going on. UDP recv is slower 8( [TESTED] +o Fixed and enabled ethernet header caches [TESTED] +o Removed junk from igmp [TESTED] +o Obscure UDP/copy&sum bug fix [TESTED] +o Fixed multicast [TESTED] +o TCP does rerouting for most cases [TESTED] + +------->>>>> 1.3.14 <<<<<---------- + +o IPX works [TESTED] +o NetROM works [TESTED] +o AX.25 works [TESTED] +o Most modules need recompiling even though they + load OK [BLAME LINUS] +o Appletalk works nicely [CHECKED] +o Fast IP forwarding part 1 works [CHECKED] + +------->>>>> 1.3.15 <<<<<--------- +o Mike Shaver has started RFC1122 verification [DONE] +o Minor bug fixes [TESTED] + +------->>>> 1.3.16 <<<-------- + +o Missing patches for device change in TCP [TESTED] +o Device locking [TESTED] +o Infinite slip devices [TESTED] +o New AF_UNIX sockets [TESTED] +o Sendmsg/recvmsg (for some stuff only) [TESTED] +o Device unload loopholes fixed [TESTED] +o Extra firewall abilities [TESTED] +o Appletalk node probe bug fix [TESTED] + +------->>>> 1.3.18 <<<<--------- + +o AF_UNIX debugging [TESTED] +o TCP explode 
on SWS bug fix [TESTED] +o John Naylor's ARP hwtype fix [TESTED] +o Datagram select matches BSD semantics [TESTED] + +-------->>>>> 1.3.21 <<<<<--------- + +o AF_UNIX fixes and 4K limiter [TESTED] +o Sendmsg/recvmsg for AX.25/Appletalk [TESTED] +o Datagram generic iovec support [TESTED] +o Misc minor bug fixes [TESTED] + +-------->>>>> 1.3.22 <<<<<------- + +o Device lock against page fault [TESTED] +o IP_HDRINCL [TESTED] +o IP firewalling spoofing protection [TESTED] +o IGMP bug fixes and workarounds [TESTED] +o IFF_ALLMULTI protocol layer support [TESTED] +o First parts of IP multicast routing code [TESTED] +o Generate BSD ENETDOWN errors [TESTED] +o Clean device unload bug<Walter Wolfgang> [TESTED] + +-------->>>>> 1.3.24 <<<<<------- + +o Missing IGMP includes fixes [TESTED] +o Smarter buffer use options for sockets [TESTED] +o AF_UNIX smarter buffer driving [TESTED] +o AF_UNIX full BSD semantics on STREAM writes [TESTED] +o IOVEC's support repeated calls to copy more [TESTED] +o Zero fragment 'solaris nfs' bug fixed <Werner> [TESTED] +o NetROM supports sendmsg/recvmsg [TESTED] +o Sendmsg verify_iovec bugfix [TESTED] +o ARP PERM is really permanent now <Craig> [TESTED] +o IPIP tunnels work fully we hope [UMM...] 
+o Closing socket change (Marc Tamsky) [TESTED] +o RFC1122 verification of tcp.c <Mike Shaver> [DONE] + +-------->>>>> 1.3.26 <<<<<------- + +o Rewrote ICMP completely [TESTED] +o Moved most IP addresses to __u32 [TESTED] +o Cleaned up ICMP reporting [TESTED] +o Tidied remove_sock [TESTED] +o Added memory allocation type to ip_build_xmit [TESTED] +o Cleaned up af_inet to use inet_error [TESTED] +o Named firewall returns [TESTED] +o Added firewall output checks to ip_build_xmit [TESTED] +o Multicast router downs VIF's when their + physical interface is dropped [TESTED] +o Reformatted ipv4/protocol.c, dropped frag field [TESTED] +o Fixed MSS for TCP [TESTED] +o Dropped sock_awaitconn [TESTED] +o Added ip_forward to ksyms for IPIP etc [TESTED] +o Appletalk TIOCINQ/TIOCOUTQ bug fix [TESTED] +o Rewrote the IFF_UP/IFF_DOWN handling code [TESTED] + +-------->>>>> 1.3.29 <<<<<------- + +o Major AX.25/NetROM fixes [John Naylor] [TESTED] +o Error in ip_mr ioctls fixed [Michael Chastain] [TESTED] +o TCP cache zap bugs hopefully fixed [CLOSE BUT NO COOKIE] +o Length checks in udp/raw sending [Craig Metz] [TESTED] + +-------->>>>> 1.3.31 <<<<<<------- + +o IP_OPTIONS [A.N.Kuznetsov] [TESTED] +o TCP cache zap more fixes [TESTED] +o Most of the IP multicast routing cache added [TESTED - WORK NEEDED] +o Kernel/user communication module (not used yet) [TESTED] + +-------->>>>> 1.3.31 <<<<<<------- + +o IFF_ALLMULTI support for 3c501,3c509,8390 and + tulip(SMC etherpower) boards [TESTED] + +-------->>>>> 1.3.33 <<<<<<-------- + +o IFF_ALLMULTI causes an address check on ether [TESTED] +o Added multicast ability readme file [TESTED] +o Assorted driver/multicast fixes [TESTED] +o IP routing change errors resemble BSD more [TESTED/MORE TO COME] +o IP port masquerading fixes [TESTED] + +-------->>>>> 1.3.35 <<<<<<-------- + +o Appletalk data now in the protinfo union [TESTED] +o BSD style bind to broadcast address supported [TESTED] +o Standard loadable firewall chains [TESTED] +o IPFW 
uses the firewall chains for firewall but + not yet acct/masquerade [TESTED] +o Firewall chain hooks in all other protocols [TESTED] +o Sendmsg/recvmsg for IPX. [TESTED] +o IPX uses sock_alloc_send_skb [TESTED] +o Recvmsg for all IP, sendmsg for TCP [TESTED] + (nearly ready to go all *msg()) + +-------->>>>> 1.3.42 <<<<<<-------- + +o ip udp/raw nonblock bug fixed [TESTED] +o ICMP lockup fix [TESTED] +o Fundamental operations now only sendmsg/recvmsg [TESTED] +o bind() for SOCK_PACKET [IN] +o set_mac_addr fixed up [TESTED] +o BSD SIOCSIFADDR, AF_UNSPEC behaviour [TESTED] +o Updated this list [OK] +o Massive ARP/cache/routing rewrite [ANK] [TESTED] +o AX.25 connect return fixed in using sock_error [TESTED] +o Proper netlink device major(36) [TESTED] +o First parts of the SKIP support [IN, not useful] +o TCP ICMP (SOSS should work again) [TESTED] +o IPFW support for TOS changing (Al Longyear) [TESTED] +o DECNET PPP test code [Steve] [IN] +o NFS root [Miguel/Gero] [TESTED] +o Path MTU discovery [ANK] [TESTED] + +-------->>>>> 1.3.44 <<<<<<-------- + +o NFS root/ FPU clash fixed [TESTED] +o ARP lock bug fixed [TESTED] +o SO_BSDCOMPAT option(libbsd/ibcs2 ought to set) [SEMIDONE] +o Changed to new set_multicast_list() [TESTED] +o ARP ioctl() call fixes [Bernd] [TESTED] +o Fixes to the name set functions (maybe fixes + netrom) [Steve] [TESTED] +o Packet protocol labelling (not IPX yet) [TESTED] +o Faster buffer copy/clone [Linus] [TESTED] + +-------->>>>> 1.3.46 <<<<<<-------- + +o AX.25/NetROM fixes/changes [John Naylor] [TESTED] +o Further attempts to fix the IPX memory bug [NOW FIXED] +o ARP fixes (Assorted) [TESTED] +o Driver fixes for multicast lists [TESTED] + +-------->>>>> 1.3.48 <<<<<<-------- + +o IPalias [TESTED] + +-------->>>>> 1.3.50 <<<<<<-------- + +o TCP soft error support [TESTED] +o Further 3c501 tweaking [TESTED] +o Still trying to make IPX work right [TESTED] +o Trap faulty boxes sending IGMP using 0.0.0.0 [TESTED] +o Only allow SMBFS selection with 
IP configured [TESTED] +o Packetwin driver [Craig] [IN] +o Net alias changes [Juan] [TESTED] + +-------->>>>> 1.3.53 <<<<<<-------- + +o Missing htons() in ip_build_xmit [Mike Kilburn] [TESTED] +o Missing protocol type set in appletalk [TESTED] +o Net alias changes/documentation [Juan Ciarlante][TESTED] +o Set protocol type on IPX sends [Various] [TESTED] +o Lance driver packet length sanity check [TESTED] + +-------->>>>> 1.3.60 <<<<<<-------- + +o Fixed NFS notice message [IN] +o Extra ETH_P_xx types [IN] +o Added skb_copy [IN] +o AX.25 unlock bug fix [Joerg] [IN] +o Disabled buggy kerneld support [IN] +o Removed dev_rint [IN] +o Fixed bind checking [IN] +o ARP changes [Bernd] [IN] +o IPX memory leak fix [Martin] [IN] +o Net builds without /proc [Paul] [IN] +o IP multicast races removed [IN] +o Device queue length limits and packet discarder [IN] + +---------- Things I thought Linus had for a while and not merged ---------------- + + +---------- Things pending from other people ------------- + +o Improved IPX support for lwared. +o Decnet pre pre pre pre pre Alpha 0.0. +o Chase Donald for new drivers, get people to sort out what net + drivers should cease to be 'Alpha'. +o IPX PPP support +o IP multicast bug fixes + +---------- Things pending for me to merge -------------- + +o AF_UNIX garbage collect code +o Faster closedown option for heavy use sites (me) +o Tom May's insw_and_checksum() +o SPARC patches [Dave] [partly in] + +--------------- Things That Need Doing Before 1.4 ------------------ + +o Clean up RAW AX.25 sockets. [Sorted by skb_clone change] +o Finish IPIP bug fixes [Done hopefully] +o Multicast routing [Nearly right] +o PPP/IPX +o IPX for Lwared +o SKIP [Available in user mode] +o AX.25/NetROM locking changes +o insw_and_csum +o AF_UNIX fd passing + +-------------------------- Bugs to fix ------------------------------ + +o signal interrupting a unix domain connect can occasionally hang the + machine ?? 
+o TCP socket cache gets things wrong very very occasionally under high + load. [TRYING THINGS] +o AX.25/NetROM needs more locking. +o NFS flow control is needed with the new multirequest NFS support. +o Need to be able to turn off the intelligent arp refreshing as it's not so + hot over AX.25 and upsets some people with very dumb ISDN bridges. +o Matti Arnio's TCP problem. +o Should unix domain connect never block ? +o Sort out kerneld getting things right. 0.2 --- -o New UNIX sockets include Pedro Roque's shutdown. -o New icmp.c. -o Better TCP window handling [Pedro Roque] -o IP option support. +o Fast checksum/copy on outgoing TCP o Add tty support to sonix driver. o PPP for Sonix ISDN. -o Loadable firewall extensions. o Screend loadable firewall module -o LZ SLIP +o AXIP [AVAILABLE IN USER MODE] +o Finish merging the bridge code [LEAVE POST 1.4] +o Finish 802.2 Class I code to be compliant to the oddities of 802.2 +o Tidy BPQ support to use a bpqip tunnel device +o Kill off old ip_queue_xmit/ip_send stuff. +o Remove kernel RARP and replace with user mode daemon. +o Throw out existing firewall ioctl()'s and use a single table load. +o SPARC merge 0.3 --- -o Merge the layered protocol support. -o IP firewalling performance - caching and radix trees. -o Zebedee o 802.2 Class 2 services (eg netbios). -o Multidrop KISS -o Multicast routing -o IPX/Atalk/Netrom firewalling Possible projects for victim^H^H^H^H^Holunteers -1. Verifying the correctness of implementation against RFC1122 and -making a list of violations (BSD is sufficiently screwed up you can't -implement all of RFC1122 and talk to it usefully). - 2. Verifying all the error returns match the BSD ones (grotty job I wouldn't wish on anyone). @@ -140,61 +414,55 @@ because a single frame in the data stream has been lost). Given a mathematician with some queue theory you can show this allows you to lose one frame per window full of data without measurable speed loss. -4. RFC1323 and RFC1191. 
These are the extensions for very fast nets -and for 'path MTU discovery' - a way of finding the best packetsize to use. +4. RFC1323. These are the extensions for very fast nets. RFC1323 will be useful for Linux talking to systems over 100Mb/sec ethernet and over ATM as it allows large windows and protects from some potential high speed TCP problems. -5. Fixing the IP fragment handling so that the total space allocated to -fragments is limited and old fragments are deleted to make room for new ones -when space is exhausted. Fixing the fragment handling to work at a decent -speed wouldn't be bad either. - 6. Delayed ack. This is mostly supported but not actually set up and used yet. Basically ack frames are held back 1/10th of a second in the hope that two acks can be merged into one or for interactive use the ack can piggyback on the next character typed (great improvement on 2400 baud modems). Johannes Stille did some work on this about 0.99.13 but it never -got merged in. +got merged in. [Pedro Roque] 7. One on my tempting project list. Add an extra (unofficial - but so is SLIP6) SLIP mode that does packet data compression [maybe use the code from term]. -8. Making SLIP/PPP dynamically allocate devices so you never run out -of channels. [Taken/Done pending inclusion] - 9. Implementing streams. Not as a blind slow SYS5.4 style copy but actually working out how to do it so it runs like greased lightning. Quite a big -problem. +problem. [See the LiS project] 10. Frame Relay/WAN/ISDN drivers [I'm working on the sonix EuroISDN board -driver but thats for an internal project and its general release is still -a maybe (so is finishing it ;))]. - +driver but that's for an internal project and its general release is still +a maybe (so is finishing it ;))][Jim Freeman is working on Frame Relay as is +Mike McLagan][Fritz Elfert is doing the isdn4linux kit]. + 11. IP over SCSI. -12. Debugging and making the appletalk alpha test code useful. - [Done and in] - -13. 
Mrouted Multicast routing. Or possibly MOSPF and others - as they become available - [Some interest: see/join linux-multicast@www.linux.org.uk - if you wish to join in] - 14. Bidirectional PLIP. Also PLIP for the newer style parallel ports. 15. 802.2LLC and thus Netbeui sockets. Becoming less important since the -rumour is microsoft are phasing out netbeui for netbios/IP. +rumour is microsoft are phasing out netbeui for netbios/IP. Microsoft have +gone for netbios/funny-ipx-variant it seems in Win95, but TCP is selectable. 16. X.25. This is one for a real head case with far too much time on their hands. [Provisionally taken] 17. PPP multilink. Another nasty job. +[In progress] 18. Implement swIPe under Linux. -[In progress] +[Reportedly in progress] + +19. IPv4 IP-AH and IP-ESP. +[Taken] + +20. SKIP IP security using ENskip-0.10 - started +[Abandoned] + +21. T/TCP support. BTW: Don't let the magic words 'kernel programming' worry you. Its like DOS - you make a mistake you have to reboot. You do at least get dumps and a @@ -202,5 +470,3 @@ kernel logger that is reliable. There is now a loadable module allowing use of gdb on the kernel (no breakpoints though!). No magic involved. 
Alan - - diff --git a/net/Config.in b/net/Config.in new file mode 100644 index 000000000..834001fdc --- /dev/null +++ b/net/Config.in @@ -0,0 +1,36 @@ +# +# Network configuration +# +mainmenu_option next_comment +comment 'Networking options' +bool 'Kernel/User network link driver' CONFIG_NETLINK +if [ "$CONFIG_NETLINK" = "y" ]; then + bool 'Routing messages' CONFIG_RTNETLINK +fi +bool 'Network firewalls' CONFIG_FIREWALL +bool 'Network aliasing' CONFIG_NET_ALIAS +bool 'TCP/IP networking' CONFIG_INET +if [ "$CONFIG_INET" = "y" ]; then + source net/ipv4/Config.in + + if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + tristate 'The IPv6 protocol' CONFIG_IPV6 + fi +fi + +comment ' ' +tristate 'The IPX protocol' CONFIG_IPX +if [ "$CONFIG_IPX" != "n" ]; then + bool 'Full internal IPX network' CONFIG_IPX_INTERN +fi +tristate 'Appletalk DDP' CONFIG_ATALK +tristate 'Amateur Radio AX.25 Level 2' CONFIG_AX25 +if [ "$CONFIG_AX25" != "n" ]; then + dep_tristate 'Amateur Radio NET/ROM' CONFIG_NETROM $CONFIG_AX25 + dep_tristate 'Amateur Radio X.25 PLP (Rose)' CONFIG_ROSE $CONFIG_AX25 +fi +if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then +# tristate 'CCITT X.25 Packet Layer' CONFIG_X25 + bool 'Bridging (EXPERIMENTAL)' CONFIG_BRIDGE +fi +endmenu diff --git a/net/Makefile b/net/Makefile index 9797a97a7..f86e704e8 100644 --- a/net/Makefile +++ b/net/Makefile @@ -7,48 +7,107 @@ # # Note 2! The CFLAGS definition is now in the main makefile... 
-SUBDIRS := 802 ax25 core ethernet ipv4 ipx unix appletalk netrom +MOD_SUB_DIRS := ipv4 +ALL_SUB_DIRS := 802 ax25 bridge core ethernet ipv4 ipv6 ipx unix appletalk \ + netrom rose #decnet +SUB_DIRS := core ethernet unix +MOD_LIST_NAME := NET_MISC_MODULES -SUBOBJS := $(foreach f,$(SUBDIRS),$f/$f.o) +ifeq ($(CONFIG_NET),y) +SUB_DIRS += 802 +endif + +ifeq ($(CONFIG_INET),y) +SUB_DIRS += ipv4 +endif + +ifeq ($(CONFIG_IPV6),y) +SUB_DIRS += ipv6 +else + ifeq ($(CONFIG_IPV6),m) + MOD_SUB_DIRS += ipv6 + endif +endif + +ifeq ($(CONFIG_BRIDGE),y) +SUB_DIRS += bridge +endif -.c.o: - $(CC) $(CFLAGS) -c $< -.s.o: - $(AS) -o $*.o $< -.c.s: - $(CC) $(CFLAGS) -S $< +ifeq ($(CONFIG_IPX),y) +SUB_DIRS += ipx +else + ifeq ($(CONFIG_IPX),m) + MOD_SUB_DIRS += ipx + endif +endif -OBJS = socket.o protocols.o +ifeq ($(CONFIG_ATALK),y) +SUB_DIRS += appletalk +else + ifeq ($(CONFIG_ATALK),m) + MOD_SUB_DIRS += appletalk + endif +endif + +ifeq ($(CONFIG_NETROM),y) +SUB_DIRS += netrom +else + ifeq ($(CONFIG_NETROM),m) + MOD_SUB_DIRS += netrom + endif +endif -all: net.o +ifeq ($(CONFIG_ROSE),y) +SUB_DIRS += rose +else + ifeq ($(CONFIG_ROSE),m) + MOD_SUB_DIRS += rose + endif +endif -net.o: $(OBJS) network.a -ifeq ($(ARCH),mips) - $(LD) -u eth_setup -r -o net.o $(OBJS) network.a +ifeq ($(CONFIG_AX25),y) +SUB_DIRS += ax25 else - $(LD) -u _eth_setup -r -o net.o $(OBJS) network.a + ifeq ($(CONFIG_AX25),m) + MOD_SUB_DIRS += ax25 + endif endif -network.a: subdirs - rm -f $@ - $(AR) rc $@ $(SUBOBJS) - $(RANLIB) $@ +L_TARGET := network.a +L_OBJS := socket.o protocols.o sysctl_net.o $(join $(SUB_DIRS),$(SUB_DIRS:%=/%.o)) +ifeq ($(CONFIG_NET),y) +ifeq ($(CONFIG_MODULES),y) +LX_OBJS = netsyms.o +endif +endif + +M_OBJS := + +CONFIG_NETLINK_BUILTIN := +CONFIG_NETLINK_MODULE := -subdirs: dummy - set -e; for i in $(SUBDIRS); do $(MAKE) -C $$i; done +ifeq ($(CONFIG_NETLINK), y) + CONFIG_NETLINK_BUILTIN = y +endif -dep: - $(CPP) -M *.c > .depend - set -e; for i in $(SUBDIRS); do $(MAKE) -C $$i dep; done +ifeq 
($(CONFIG_IPV6), y) + CONFIG_NETLINK_BUILTIN = y +endif -modules: +ifeq ($(CONFIG_NETLINK), m) + CONFIG_NETLINK_MODULE = y +endif -dummy: +ifeq ($(CONFIG_IPV6), m) + CONFIG_NETLINK_MODULE = y +endif -# -# include a dependency file if one exists -# -ifeq (.depend,$(wildcard .depend)) -include .depend +ifdef CONFIG_NETLINK_BUILTIN +L_OBJS += netlink.o +else + ifdef CONFIG_NETLINK_MODULE + M_OBJS += netlink.o + endif endif +include $(TOPDIR)/Rules.make diff --git a/net/README b/net/README index 33ffd8a41..8685b38f8 100644 --- a/net/README +++ b/net/README @@ -1,41 +1,37 @@ -Upgrade Notes from 1.0 -[Alan Cox - Alan.Cox@linux.org] -Upgrading to 1.2.0 from a 1.0 kernel networking set. If you are using -a complete 1.2 distribution you can ignore this. +Maintainers and developers for networking code sections -This doesn't attempt to list the changes. That would be too large. Instead -just what you need and can change +Code Section Bug Report Contact +-------------------+------------------------------------------- +802 [other ] alan@lxorguk.ukuu.org.uk + [token ring ] needs a maintainer/debugger +appletalk alan@lxorguk.ukuu.org.uk and netatalk@umich.edu +ax25 jsn@cs.nott.ac.uk +core alan@lxorguk.ukuu.org.uk +ethernet alan@lxorguk.ukuu.org.uk +ipv4 alan@lxorguk.ukuu.org.uk +ipx alan@lxorguk.ukuu.org.uk,greg@caldera.com +netrom jsn@cs.nott.ac.uk +rose jsn@cs.nott.ac.uk +unix alan@lxorguk.ukuu.org.uk +x25 jsn@cs.nott.ac.uk -arp,ifconfig, etc. Get net-tools-1.1.95 (or 1.2.0 if its out) from -ftp.linux.org.uk:/pub/Linux/Networking/PROGRAMS/NetTools, and install -these. You will also acquire a couple of new tools "plipconfig" for tuning -plip links and "ipfw" for ip firewall management. -bootpd: The original bootpd has a bug that the 1.2 kernel spots. You will -need to upgrade this to the version in -ftp.linux.org.uk:/pub/Linux/Networking/PROGRAMS/Upgrades + If in doubt contact me <alan@lxorguk.ukuu.org.uk> first. 
- -Standard programs that you ought to update are - -named 4.7.x to 4.9.x Stops named dying occasionally -pop3d 1.001 to 1.004 Fixes a bug that can lose mail - -A complete current networking set for Linux can be obtained by getting -the NetKit[A,B...] series archives from ftp.funet.fi. Funet also carries -binaries for Linux mbone applications if you now wish to make use of -these facilities. +--------------------------------------------------------------------------- For commercial UK custom Linux networking projects, drivers and development (but not free support!) I can be contacted via - I^2IT Ltd, The Innovation Centre, University Of Wales + CymruNET Ltd, The Innovation Centre, University Of Wales Swansea SA2 8PP. Fax: +44 1792 295811 Tel: +44 1792 295213 -Please don't send commercial queries to my email address as I have that + Email: alan@cymru.net + +Please don't send commercial queries to my .ac.uk email address as I have that in an academic and _not_ commercial capacity. On the other hand feel free to send bug reports, queries and enhancements that way. diff --git a/net/TUNABLE b/net/TUNABLE new file mode 100644 index 000000000..bd6066126 --- /dev/null +++ b/net/TUNABLE @@ -0,0 +1,65 @@ +The following parameters should be tunable but aren't, until we get sysctl +or similar schemes. For now you'll have to dig around. Various CONFIG_xxx +items that should be configurable using sysctl omitted. + +This is far from complete + +Item Description +---------------------------------------------------------------------------- +MAX_SOCKETS Tunable on boot, maximum sockets we will allocate +NUM_PROTO Maximum loadable address family, will need recompile +MAX_LINKS Maximum number of netlink minor devices. (1-32) +MAX_QBYTES Size of a netlink device queue (tunable) +RIF_TABLE_SIZE Token ring RIF cache size (tunable) +AARP_HASH_SIZE Size of appletalk hash table (tunable) +AX25_DEF_T1 AX.25 parameters. 
These are all tunable via +AX25_DEF_T2 SIOCAX25SETPARMS +AX25_DEF_T3 T1-T3,N2 have the meanings in the specification +AX25_DEF_N2 +AX25_DEF_AXDEFMODE 8 = normal 128 is PE1CHL extended +AX25_DEF_IPDEFMODE 'D' - datagram 'V' - virtual connection +AX25_DEF_BACKOFF 'E'xponential 'L'inear +AX25_DEF_NETROM Allow netrom 1=Y +AX25_DF_TEXT Allow PID=Text 1=Y +AX25_DEF_WINDOW Window for normal mode +AX25_DEF_EWINDOW Window for PE1CHL mode +AX25_DEF_DIGI 1 for inband 2 for cross band 3 for both +AX25_DEF_CONMODE Allow connected modes 1=Yes +AX25_ROUTE_MAX AX.25 route cache size - no currently tunable +Unnamed (16) Number of protocol hash slots (tunable) +DEV_NUMBUFFS Number of priority levels (not easily tunable) +Unnamed (300) Maximum packet backlog queue (tunable) +MAX_IOVEC Maximum number of iovecs in a message (tunable) +MIN_WINDOW Offered minimum window (tunable) +MAX_WINDOW Offered maximum window (tunable) +MAX_HEADER Largest physical header (tunable) +MAX_ADDR_LEN Largest physical address (tunable) +SOCK_ARRAY_SIZE IP socket array hash size (tunable) +ARP_RES_TIME Time we try to resolve (tunable) +ARP_DEAD_RES_TIME Time the entry stays dead (tunable) +ARP_MAX_TRIES Maximum tries (tunable) +ARP_TIMEOUT Timeout on an ARP (tunable) +ARP_CHECK_INTERVAL Check interval to refresh an arp (tunable) +ARP_CONFIRM_INTERVAL Confirm poll time (tunable) +ARP_TABLE_SIZE Hash table size for ARP (tunable) +IP_MAX_MEMBERSHIPS Largest number of groups per socket (BSD style) +16 Hard coded constant for amount of room allowed for + cache align and faster forwarding (tunable) +IPFRAG_HIGH_THRESH Limit on fragments, we free fragments until we reach +IPFRAG_LOW_THRESH which provides some breathing space. (tunable) +IP_FRAG_TIME Time we hold a fragment for. 
(tunable) +PORT_MASQ_BEGIN First port reserved for masquerade (tunable) +PORT_MASQ_END Last port used for masquerade (tunable) +MASQUERADE_EXPIRE_TCP_FIN Time we keep a masquerade for after a FIN +MASQUERADE_EXPIRE_UDP Time we keep a UDP masquerade for (tunable) +MAXVIFS Maximum mrouted vifs (1-32) +MFC_LINES Lines in the multicast router cache (tunable) +SK_RMEM_MAX Max memory a socket owns for receive (tunable) +SK_WMEM_MAX Max memory a socket owns for send (tunable) + +NetROM parameters are tunable via an ioctl passing a struct + +4000 Size a Unix domain socket malloc falls back to + (tunable) should be 8K - a bit for 8K machines like + the ALPHA + diff --git a/net/appletalk/Makefile b/net/appletalk/Makefile index a14da6dd9..bfe567264 100644 --- a/net/appletalk/Makefile +++ b/net/appletalk/Makefile @@ -7,29 +7,11 @@ # # Note 2! The CFLAGS definition is now in the main makefile... -.c.o: - $(CC) $(CFLAGS) -c $< -.s.o: - $(AS) -o $*.o $< -.c.s: - $(CC) $(CFLAGS) -S $< +O_TARGET := appletalk.o +O_OBJS := aarp.o ddp.o sysctl_net_atalk.o +M_OBJS := $(O_TARGET) - -OBJS := aarp.o ddp.o - - -appletalk.o: $(OBJS) - $(LD) -r -o appletalk.o $(OBJS) - -dep: - $(CPP) -M *.c > .depend +include $(TOPDIR)/Rules.make tar: tar -cvf /dev/f1 . - -# -# include a dependency file if one exists -# -ifeq (.depend,$(wildcard .depend)) -include .depend -endif diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index 52a46347a..4b8d4fb7e 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -3,7 +3,7 @@ * ethernet 'ELAP'. * * Alan Cox <Alan.Cox@linux.org> - * <iialan@www.linux.org.uk> + * <alan@cymru.net> * * This doesn't fit cleanly with the IP arp. This isn't a problem as * the IP arp wants extracting from the device layer in 1.3.x anyway. @@ -11,7 +11,7 @@ * * FIXME: * We ought to handle the retransmits with a single list and a - * seperate fast timer for when it is needed. + * separate fast timer for when it is needed. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -23,10 +23,9 @@ * Inside Appletalk (2nd Ed). */ -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <asm/bitops.h> -#include <linux/config.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -42,13 +41,13 @@ #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> +#include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/datalink.h> #include <net/psnap.h> -#include <net/atalk.h> +#include <linux/atalk.h> -#ifdef CONFIG_ATALK /* * Lists of aarp entries */ @@ -103,7 +102,7 @@ static void aarp_send_query(struct aarp_entry *a) struct device *dev=a->dev; int len=dev->hard_header_len+sizeof(struct elapaarp)+aarp_dl->header_length; struct sk_buff *skb=alloc_skb(len, GFP_ATOMIC); - struct elapaarp *eah=(struct elapaarp *)(skb->data+dev->hard_header_len+aarp_dl->header_length); + struct elapaarp *eah; struct at_addr *sat=atalk_find_dev_addr(dev); if(skb==NULL || sat==NULL) @@ -112,10 +111,11 @@ static void aarp_send_query(struct aarp_entry *a) /* * Set up the buffer. */ - + + skb_reserve(skb,dev->hard_header_len+aarp_dl->header_length); + eah = (struct elapaarp *)skb_put(skb,sizeof(struct elapaarp)); skb->arp = 1; skb->free = 1; - skb->len = len; skb->dev = a->dev; /* @@ -150,7 +150,6 @@ static void aarp_send_query(struct aarp_entry *a) * Send it. 
*/ - dev_queue_xmit(skb, dev, SOPRI_NORMAL); /* @@ -164,7 +163,7 @@ static void aarp_send_reply(struct device *dev, struct at_addr *us, struct at_ad { int len=dev->hard_header_len+sizeof(struct elapaarp)+aarp_dl->header_length; struct sk_buff *skb=alloc_skb(len, GFP_ATOMIC); - struct elapaarp *eah=(struct elapaarp *)(skb->data+dev->hard_header_len+aarp_dl->header_length); + struct elapaarp *eah; if(skb==NULL) return; @@ -172,10 +171,11 @@ static void aarp_send_reply(struct device *dev, struct at_addr *us, struct at_ad /* * Set up the buffer. */ - + + skb_reserve(skb,dev->hard_header_len+aarp_dl->header_length); + eah = (struct elapaarp *)skb_put(skb,sizeof(struct elapaarp)); skb->arp = 1; skb->free = 1; - skb->len = len; skb->dev = dev; /* @@ -225,7 +225,7 @@ void aarp_send_probe(struct device *dev, struct at_addr *us) { int len=dev->hard_header_len+sizeof(struct elapaarp)+aarp_dl->header_length; struct sk_buff *skb=alloc_skb(len, GFP_ATOMIC); - struct elapaarp *eah=(struct elapaarp *)(skb->data+dev->hard_header_len+aarp_dl->header_length); + struct elapaarp *eah; static char aarp_eth_multicast[ETH_ALEN]={ 0x09, 0x00, 0x07, 0xFF, 0xFF, 0xFF }; if(skb==NULL) @@ -234,10 +234,12 @@ void aarp_send_probe(struct device *dev, struct at_addr *us) /* * Set up the buffer. */ - + + skb_reserve(skb,dev->hard_header_len+aarp_dl->header_length); + eah = (struct elapaarp *)skb_put(skb,sizeof(struct elapaarp)); + skb->arp = 1; skb->free = 1; - skb->len = len; skb->dev = dev; /* @@ -358,9 +360,9 @@ static void aarp_expire_timeout(unsigned long unused) } del_timer(&aarp_timer); if(unresolved_count==0) - aarp_timer.expires=AARP_EXPIRY_TIME; + aarp_timer.expires=jiffies+AARP_EXPIRY_TIME; else - aarp_timer.expires=AARP_TICK_TIME; + aarp_timer.expires=jiffies+AARP_TICK_TIME; add_timer(&aarp_timer); } @@ -368,7 +370,7 @@ static void aarp_expire_timeout(unsigned long unused) * Network device notifier chain handler. 
*/ -static int aarp_device_event(unsigned long event, void *ptr) +static int aarp_device_event(struct notifier_block *this, unsigned long event, void *ptr) { int ct=0; if(event==NETDEV_DOWN) @@ -428,14 +430,60 @@ int aarp_send_ddp(struct device *dev,struct sk_buff *skb, struct at_addr *sa, vo unsigned long flags; /* + * Check for localtalk first + */ + + if(dev->type==ARPHRD_LOCALTLK) + { + struct at_addr *at=atalk_find_dev_addr(dev); + struct ddpehdr *ddp=(struct ddpehdr *)skb->data; + int ft=2; + + /* + * Compressible ? + * + * IFF: src_net==dest_net==device_net + */ + + if(at->s_net==sa->s_net && sa->s_net==ddp->deh_snet) + { + skb_pull(skb,sizeof(struct ddpehdr)-4); + /* + * The upper two remaining bytes are the port + * numbers we just happen to need. Now put the + * length in the lower two. + */ + *((__u16 *)skb->data)=htons(skb->len); + ft=1; + } + /* + * Nice and easy. No AARP type protocols occur here + * so we can just shovel it out with a 3 byte LLAP header + */ + + skb_push(skb,3); + skb->data[0]=sa->s_node; + skb->data[1]=at->s_node; + skb->data[2]=ft; + + if(skb->sk==NULL) + dev_queue_xmit(skb, skb->dev, SOPRI_NORMAL); + else + dev_queue_xmit(skb, skb->dev, skb->sk->priority); + return 1; + } + + /* * Non ELAP we cannot do. 
*/ + if(dev->type!=ARPHRD_ETHER) { return -1; } skb->dev = dev; + skb->protocol = htons(ETH_P_ATALK); hash=sa->s_node%(AARP_HASH_SIZE-1); save_flags(flags); @@ -461,6 +509,7 @@ int aarp_send_ddp(struct device *dev,struct sk_buff *skb, struct at_addr *sa, vo /* * Return 1 and fill in the address */ + a->expires_at=jiffies+AARP_EXPIRY_TIME*10; ddp_dl->datalink_header(ddp_dl, skb, a->hwaddr); if(skb->sk==NULL) @@ -470,22 +519,27 @@ int aarp_send_ddp(struct device *dev,struct sk_buff *skb, struct at_addr *sa, vo restore_flags(flags); return 1; } + /* * Do we have an unresolved entry: This is the less common path */ + a=aarp_find_entry(unresolved[hash],dev,sa); if(a!=NULL) { /* * Queue onto the unresolved queue */ + skb_queue_tail(&a->packet_queue, skb); restore_flags(flags); return 0; } + /* * Allocate a new entry */ + a=aarp_alloc(); if(a==NULL) { @@ -496,9 +550,11 @@ int aarp_send_ddp(struct device *dev,struct sk_buff *skb, struct at_addr *sa, vo restore_flags(flags); return -1; } + /* * Set up the queue */ + skb_queue_tail(&a->packet_queue, skb); a->expires_at=jiffies+AARP_RESOLVE_TIME; a->dev=dev; @@ -508,26 +564,37 @@ int aarp_send_ddp(struct device *dev,struct sk_buff *skb, struct at_addr *sa, vo unresolved[hash]=a; unresolved_count++; restore_flags(flags); + /* * Send an initial request for the address */ + aarp_send_query(a); + /* * Switch to fast timer if needed (That is if this is the * first unresolved entry to get added) */ + if(unresolved_count==1) { del_timer(&aarp_timer); - aarp_timer.expires=AARP_TICK_TIME; + aarp_timer.expires=jiffies+AARP_TICK_TIME; add_timer(&aarp_timer); } + /* * Tell the ddp layer we have taken over for this frame. */ + return 0; } +/* + * An entry in the aarp unresolved queue has become resolved. Send + * all the frames queued under it. 
+ */ + static void aarp_resolved(struct aarp_entry **list, struct aarp_entry *a, int hash) { struct sk_buff *skb; @@ -537,10 +604,18 @@ static void aarp_resolved(struct aarp_entry **list, struct aarp_entry *a, int ha { unresolved_count--; *list=a->next; - /* Move into the resolved list */ + + /* + * Move into the resolved list + */ + a->next=resolved[hash]; resolved[hash]=a; - /* Kick frames off */ + + /* + * Kick frames off + */ + while((skb=skb_dequeue(&a->packet_queue))!=NULL) { a->expires_at=jiffies+AARP_EXPIRY_TIME*10; @@ -556,6 +631,11 @@ static void aarp_resolved(struct aarp_entry **list, struct aarp_entry *a, int ha } } +/* + * This is called by the SNAP driver whenever we see an AARP SNAP + * frame. We currently only support ethernet. + */ + static int aarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) { struct elapaarp *ea=(struct elapaarp *)skb->h.raw; @@ -580,7 +660,7 @@ static int aarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type * Frame size ok ? */ - if(skb->len<sizeof(*ea)) + if(!skb_pull(skb,sizeof(*ea))) { kfree_skb(skb, FREE_READ); return 0; @@ -636,6 +716,7 @@ static int aarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type /* * Fail the probe (in use) */ + ifa->status|=ATIF_PROBE_FAIL; restore_flags(flags); kfree_skb(skb, FREE_READ); @@ -658,12 +739,13 @@ static int aarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type /* * We can fill one in - this is good */ + memcpy(a->hwaddr,ea->hw_src,ETH_ALEN); aarp_resolved(&unresolved[hash],a,hash); if(unresolved_count==0) { del_timer(&aarp_timer); - aarp_timer.expires=AARP_EXPIRY_TIME; + aarp_timer.expires=jiffies+AARP_EXPIRY_TIME; add_timer(&aarp_timer); } break; @@ -691,6 +773,7 @@ static int aarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type /* * aarp_my_address has found the address to use for us. 
*/ + aarp_send_reply(dev,ma,&sa,ea->hw_src); break; } @@ -705,17 +788,57 @@ static struct notifier_block aarp_notifier={ 0 }; +static char aarp_snap_id[]={0x00,0x00,0x00,0x80,0xF3}; + void aarp_proto_init(void) { - static char aarp_snap_id[]={0x00,0x00,0x00,0x80,0xF3}; if((aarp_dl=register_snap_client(aarp_snap_id, aarp_rcv))==NULL) - printk("Unable to register AARP with SNAP.\n"); + printk(KERN_CRIT "Unable to register AARP with SNAP.\n"); init_timer(&aarp_timer); aarp_timer.function=aarp_expire_timeout; aarp_timer.data=0; - aarp_timer.expires=AARP_EXPIRY_TIME; + aarp_timer.expires=jiffies+AARP_EXPIRY_TIME; add_timer(&aarp_timer); register_netdevice_notifier(&aarp_notifier); } -#endif + + +#ifdef MODULE + +/* Free all the entries in an aarp list. Caller should turn off interrupts. */ +static void free_entry_list(struct aarp_entry *list) +{ + struct aarp_entry *tmp; + + while (list != NULL) + { + tmp = list->next; + aarp_expire(list); + list = tmp; + } +} + +/* General module cleanup. Called from cleanup_module() in ddp.c. */ +void aarp_cleanup_module(void) +{ + unsigned long flags; + int i; + + save_flags(flags); + cli(); + + del_timer(&aarp_timer); + unregister_netdevice_notifier(&aarp_notifier); + unregister_snap_client(aarp_snap_id); + + for (i = 0; i < AARP_HASH_SIZE; i++) + { + free_entry_list(resolved[i]); + free_entry_list(unresolved[i]); + } + + restore_flags(flags); +} + +#endif /* MODULE */ diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 67ad1bf22..211842144 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -9,6 +9,22 @@ * * Wesley Craig <netatalk@umich.edu> * + * Fixes: + * Michael Callahan : Made routing work + * Wesley Craig : Fix probing to listen to a + * passed node id. + * Alan Cox : Added send/recvmsg support + * Alan Cox : Moved at. to protinfo in + * socket. + * Alan Cox : Added firewall hooks. + * Alan Cox : Supports new ARPHRD_LOOPBACK + * Christer Weinigel : Routing and /proc fixes. + * Bradford Johnson : Localtalk. 
+ * Tom Dyas : Module support. + * Alan Cox : Hooks for PPP (based on the + * localtalk hook). + * Alan Cox : Posix bits + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -16,13 +32,13 @@ * * TODO * ASYNC I/O - * Testing. */ -#include <asm/segment.h> +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <asm/bitops.h> -#include <linux/config.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -34,21 +50,25 @@ #include <linux/errno.h> #include <linux/interrupt.h> #include <linux/if_ether.h> +#include <linux/route.h> #include <linux/inet.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> +#include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/termios.h> /* For TIOCOUTQ/INQ */ #include <net/datalink.h> #include <net/p8022.h> #include <net/psnap.h> #include <net/sock.h> -#include <net/atalk.h> +#include <linux/atalk.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/firewall.h> -#ifdef CONFIG_ATALK -#define APPLETALK_DEBUG +#undef APPLETALK_DEBUG #ifdef APPLETALK_DEBUG @@ -117,23 +137,28 @@ static atalk_socket *atalk_search_socket(struct sockaddr_at *to, struct atalk_if { atalk_socket *s; - for( s = atalk_socket_list; s != NULL; s = s->next ) { - if ( to->sat_port != s->at.src_port ) { - continue; - } + for( s = atalk_socket_list; s != NULL; s = s->next ) + { + if ( to->sat_port != s->protinfo.af_at.src_port ) + { + continue; + } - if ( to->sat_addr.s_net == 0 && + if ( to->sat_addr.s_net == 0 && to->sat_addr.s_node == ATADDR_BCAST && - s->at.src_net == atif->address.s_net ) { - break; - } + s->protinfo.af_at.src_net == atif->address.s_net ) + { + break; + } - if ( to->sat_addr.s_net == s->at.src_net && - to->sat_addr.s_node == s->at.src_node ) { - break; - } + if 
( to->sat_addr.s_net == s->protinfo.af_at.src_net && + (to->sat_addr.s_node == s->protinfo.af_at.src_node + ||to->sat_addr.s_node == ATADDR_BCAST )) + { + break; + } - /* XXXX.0 */ + /* XXXX.0 */ } return( s ); } @@ -146,17 +171,21 @@ static atalk_socket *atalk_find_socket(struct sockaddr_at *sat) { atalk_socket *s; - for ( s = atalk_socket_list; s != NULL; s = s->next ) { - if ( s->at.src_net != sat->sat_addr.s_net ) { - continue; - } - if ( s->at.src_node != sat->sat_addr.s_node ) { - continue; - } - if ( s->at.src_port != sat->sat_port ) { - continue; - } - break; + for ( s = atalk_socket_list; s != NULL; s = s->next ) + { + if ( s->protinfo.af_at.src_net != sat->sat_addr.s_net ) + { + continue; + } + if ( s->protinfo.af_at.src_node != sat->sat_addr.s_node ) + { + continue; + } + if ( s->protinfo.af_at.src_port != sat->sat_port ) + { + continue; + } + break; } return( s ); } @@ -190,14 +219,17 @@ static void atalk_destroy_socket(atalk_socket *sk) } if(sk->wmem_alloc == 0 && sk->rmem_alloc == 0 && sk->dead) - kfree_s(sk,sizeof(*sk)); + { + sk_free(sk); + MOD_DEC_USE_COUNT; + } else { /* * Someone is using our buffers still.. defer */ init_timer(&sk->timer); - sk->timer.expires=10*HZ; + sk->timer.expires=jiffies+10*HZ; sk->timer.function=atalk_destroy_timer; sk->timer.data = (unsigned long)sk; add_timer(&sk->timer); @@ -205,8 +237,11 @@ static void atalk_destroy_socket(atalk_socket *sk) } -/* Called from proc fs */ -int atalk_get_info(char *buffer, char **start, off_t offset, int length) +/* + * Called from proc fs + */ + +int atalk_get_info(char *buffer, char **start, off_t offset, int length, int dummy) { atalk_socket *s; int len=0; @@ -214,20 +249,22 @@ int atalk_get_info(char *buffer, char **start, off_t offset, int length) off_t begin=0; /* - * Fill this in to print out the appletalk info you want + * Output the appletalk data for the /proc virtual fs. */ - /* Theory.. 
Keep printing in the same place until we pass offset */ - len += sprintf (buffer,"Type local_addr remote_addr tx_queue rx_queue st uid\n"); for (s = atalk_socket_list; s != NULL; s = s->next) { len += sprintf (buffer+len,"%02X ", s->type); len += sprintf (buffer+len,"%04X:%02X:%02X ", - s->at.src_net,s->at.src_node,s->at.src_port); + ntohs(s->protinfo.af_at.src_net), + s->protinfo.af_at.src_node, + s->protinfo.af_at.src_port); len += sprintf (buffer+len,"%04X:%02X:%02X ", - s->at.dest_net,s->at.dest_node,s->at.dest_port); - len += sprintf (buffer+len,"%08lX:%08lX ", s->wmem_alloc, s->rmem_alloc); + ntohs(s->protinfo.af_at.dest_net), + s->protinfo.af_at.dest_node, + s->protinfo.af_at.dest_port); + len += sprintf (buffer+len,"%08X:%08X ", s->wmem_alloc, s->rmem_alloc); len += sprintf (buffer+len,"%02X %d\n", s->state, SOCK_INODE(s->socket)->i_uid); /* Are we still dumping unwanted data then discard the record */ @@ -316,10 +353,42 @@ static int atif_probe_device(struct atalk_iface *atif) int ct; int netrange=ntohs(atif->nets.nr_lastnet)-ntohs(atif->nets.nr_firstnet)+1; int probe_net=ntohs(atif->address.s_net); + int probe_node=atif->address.s_node; int netct; int nodect; - + struct ifreq atreq; + struct sockaddr_at *sa; + int err; + +/* + * THIS IS A HACK: Farallon cards want to do their own picking of + * addresses. This needs tidying up post 1.4, but we need it in + * now for the 1.4 release as is. 
+ * + */ + if((atif->dev->type == ARPHRD_LOCALTLK || atif->dev->type == ARPHRD_PPP) + && atif->dev->do_ioctl) + { + /* fake up the request and pass it down */ + sa = (struct sockaddr_at*)&atreq.ifr_addr; + sa->sat_addr.s_node = probe_node; + sa->sat_addr.s_net = probe_net; + if (!(err=atif->dev->do_ioctl(atif->dev,&atreq,SIOCSIFADDR))) + { + (void)atif->dev->do_ioctl(atif->dev,&atreq,SIOCGIFADDR); + atif->address.s_net=htons(sa->sat_addr.s_net); + atif->address.s_node=sa->sat_addr.s_node; + return 0; + } + /* + * If it didn't like our faked request then fail: + * This should check against -ENOIOCTLCMD and fall + * through. That needs us to fix all the devices up + * properly. We can then also dump the localtalk test. + */ + return err; + } /* * Offset the network we start probing with. */ @@ -332,6 +401,9 @@ static int atif_probe_device(struct atalk_iface *atif) probe_net=ntohs(atif->nets.nr_firstnet) + (jiffies%netrange); } + if(probe_node == ATADDR_ANYNODE) + probe_node = jiffies&0xFF; + /* * Scan the networks. @@ -340,14 +412,13 @@ static int atif_probe_device(struct atalk_iface *atif) for(netct=0;netct<=netrange;netct++) { /* - * Sweep the available nodes from a random start. + * Sweep the available nodes from a given start. 
*/ - int nodeoff=jiffies&255; - + atif->address.s_net=htons(probe_net); for(nodect=0;nodect<256;nodect++) { - atif->address.s_node=((nodect+nodeoff)&0xFF); + atif->address.s_node=((nodect+probe_node)&0xFF); if(atif->address.s_node>0&&atif->address.s_node<254) { /* @@ -420,13 +491,12 @@ struct atalk_iface *atalk_find_dev(struct device *dev) static struct atalk_iface *atalk_find_anynet(int node, struct device *dev) { struct atalk_iface *iface; - for(iface=atalk_iface_list;iface!=NULL;iface=iface->next) { - if ( iface->dev != dev || ( iface->status & ATIF_PROBE )) { + for(iface=atalk_iface_list;iface!=NULL;iface=iface->next) + { + if ( iface->dev != dev || ( iface->status & ATIF_PROBE )) continue; - } - if ( node == ATADDR_BCAST || iface->address.s_node == node ) { + if ( node == ATADDR_BCAST || iface->address.s_node == node ) return iface; - } } return NULL; } @@ -537,29 +607,30 @@ static int atrtr_create(struct rtentry *r, struct device *devhint) if(r->rt_flags != rt->flags) continue; - if(ta->sat_addr.s_net == rt->target.s_net) { - if(!(rt->flags&RTF_HOST)) - break; - if(ta->sat_addr.s_node == rt->target.s_node) - break; + if(ta->sat_addr.s_net == rt->target.s_net) + { + if(!(rt->flags&RTF_HOST)) + break; + if(ta->sat_addr.s_node == rt->target.s_node) + break; } } - if ( devhint == NULL ) { - for ( riface = NULL, iface = atalk_iface_list; iface; - iface = iface->next ) { - if ( riface == NULL && ntohs( ga->sat_addr.s_net ) >= - ntohs( iface->nets.nr_firstnet ) && - ntohs( ga->sat_addr.s_net ) <= - ntohs( iface->nets.nr_lastnet )) - riface = iface; - if ( ga->sat_addr.s_net == iface->address.s_net && - ga->sat_addr.s_node == iface->address.s_node ) - riface = iface; - } - if ( riface == NULL ) - return -ENETUNREACH; - devhint = riface->dev; + if ( devhint == NULL ) + { + for ( riface = NULL, iface = atalk_iface_list; iface; iface = iface->next ) + { + if ( riface == NULL && ntohs( ga->sat_addr.s_net ) >= ntohs( iface->nets.nr_firstnet ) && + ntohs( 
ga->sat_addr.s_net ) <= ntohs( iface->nets.nr_lastnet )) + { + riface = iface; + } + if ( ga->sat_addr.s_net == iface->address.s_net && ga->sat_addr.s_node == iface->address.s_node ) + riface = iface; + } + if ( riface == NULL ) + return -ENETUNREACH; + devhint = riface->dev; } if(rt==NULL) @@ -594,10 +665,12 @@ static int atrtr_delete( struct at_addr *addr ) struct atalk_route **r = &atalk_router_list; struct atalk_route *tmp; - while ((tmp = *r) != NULL) { + while ((tmp = *r) != NULL) + { if (tmp->target.s_net == addr->s_net && (!(tmp->flags&RTF_GATEWAY) || - tmp->target.s_node == addr->s_node )) { + tmp->target.s_node == addr->s_node )) + { *r = tmp->next; kfree_s(tmp, sizeof(struct atalk_route)); return 0; @@ -617,8 +690,10 @@ void atrtr_device_down(struct device *dev) struct atalk_route **r = &atalk_router_list; struct atalk_route *tmp; - while ((tmp = *r) != NULL) { - if (tmp->dev == dev) { + while ((tmp = *r) != NULL) + { + if (tmp->dev == dev) + { *r = tmp->next; kfree_s(tmp, sizeof(struct atalk_route)); } @@ -630,11 +705,11 @@ void atrtr_device_down(struct device *dev) } /* - * A device event has occured. Watch for devices going down and + * A device event has occurred. Watch for devices going down and * delete our use of them (iface and route). 
*/ -static int ddp_device_event(unsigned long event, void *ptr) +static int ddp_device_event(struct notifier_block *this, unsigned long event, void *ptr) { if(event==NETDEV_DOWN) { @@ -661,17 +736,15 @@ int atif_ioctl(int cmd, void *arg) struct sockaddr_at *sa; struct device *dev; struct atalk_iface *atif; - int ro=(cmd==SIOCSIFADDR); - int err=verify_area(ro?VERIFY_READ:VERIFY_WRITE, arg,sizeof(atreq)); + int err; int ct; int limit; struct rtentry rtdef; - if(err) - return err; - - memcpy_fromfs(&atreq,arg,sizeof(atreq)); - + err = copy_from_user(&atreq,arg,sizeof(atreq)); + if (err) + return -EFAULT; + if((dev=dev_get(atreq.ifr_name))==NULL) return -ENODEV; @@ -685,10 +758,16 @@ int atif_ioctl(int cmd, void *arg) return -EPERM; if(sa->sat_family!=AF_APPLETALK) return -EINVAL; - if(dev->type!=ARPHRD_ETHER) + if(dev->type!=ARPHRD_ETHER&&dev->type!=ARPHRD_LOOPBACK + &&dev->type!=ARPHRD_LOCALTLK && dev->type!=ARPHRD_PPP) return -EPROTONOSUPPORT; nr=(struct netrange *)&sa->sat_zero[0]; - if(nr->nr_phase!=2) + /* + * Phase 1 is fine on localtalk but we don't + * do Ethertalk phase 1. Anyone wanting to add + * it go ahead. + */ + if(dev->type==ARPHRD_ETHER && nr->nr_phase!=2) return -EPROTONOSUPPORT; if(sa->sat_addr.s_node==ATADDR_BCAST || sa->sat_addr.s_node == 254) return -EINVAL; @@ -739,15 +818,18 @@ int atif_ioctl(int cmd, void *arg) /* * Routerless initial state. 
*/ - if(nr->nr_firstnet==htons(0) && nr->nr_lastnet==htons(0xFFFE)) { + if(nr->nr_firstnet==htons(0) && nr->nr_lastnet==htons(0xFFFE)) + { sa->sat_addr.s_net=atif->address.s_net; atrtr_create(&rtdef, dev); atrtr_set_default(dev); - } else { + } + else + { limit=ntohs(nr->nr_lastnet); if(limit-ntohs(nr->nr_firstnet) > 256) { - printk("Too many routes/iface.\n"); + printk(KERN_WARNING "Too many routes/iface.\n"); return -EINVAL; } for(ct=ntohs(nr->nr_firstnet);ct<=limit;ct++) @@ -772,8 +854,13 @@ int atif_ioctl(int cmd, void *arg) ((struct sockaddr_at *)(&atreq.ifr_addr))->sat_addr.s_node=ATADDR_BCAST; break; } - memcpy_tofs(arg,&atreq,sizeof(atreq)); - return 0; + err = copy_to_user(arg,&atreq,sizeof(atreq)); + + if (err) + { + err = -EFAULT; + } + return err; } /* @@ -785,11 +872,10 @@ static int atrtr_ioctl(unsigned int cmd, void *arg) int err; struct rtentry rt; - err=verify_area(VERIFY_READ, arg, sizeof(rt)); - if(err) - return err; - memcpy_fromfs(&rt,arg,sizeof(rt)); - + err = copy_from_user(&rt,arg,sizeof(rt)); + if (err) + return -EFAULT; + switch(cmd) { case SIOCDELRT: @@ -805,7 +891,7 @@ static int atrtr_ioctl(unsigned int cmd, void *arg) /* Called from proc fs - just make it print the ifaces neatly */ -int atalk_if_get_info(char *buffer, char **start, off_t offset, int length) +int atalk_if_get_info(char *buffer, char **start, off_t offset, int length, int dummy) { struct atalk_iface *iface; int len=0; @@ -838,7 +924,7 @@ int atalk_if_get_info(char *buffer, char **start, off_t offset, int length) /* Called from proc fs - just make it print the routes neatly */ -int atalk_rt_get_info(char *buffer, char **start, off_t offset, int length) +int atalk_rt_get_info(char *buffer, char **start, off_t offset, int length, int dummy) { struct atalk_route *rt; int len=0; @@ -849,13 +935,13 @@ int atalk_rt_get_info(char *buffer, char **start, off_t offset, int length) if(atrtr_default.dev) { rt=&atrtr_default; - len += sprintf (buffer+len,"Default %5d:%-3d %-4d %s\n", 
+ len += sprintf (buffer+len,"Default %04X:%02X %-4d %s\n", ntohs(rt->gateway.s_net), rt->gateway.s_node, rt->flags, rt->dev->name); } for (rt = atalk_router_list; rt != NULL; rt = rt->next) { - len += sprintf (buffer+len,"%04X:%02X %5d:%-3d %-4d %s\n", + len += sprintf (buffer+len,"%04X:%02X %04X:%02X %-4d %s\n", ntohs(rt->target.s_net),rt->target.s_node, ntohs(rt->gateway.s_net), rt->gateway.s_node, rt->flags, rt->dev->name); @@ -913,7 +999,7 @@ unsigned short atalk_checksum(struct ddpehdr *ddp, int len) /* * Generic fcntl calls are already dealt with. If we don't need funny ones - * this is the all you need. Async I/O is also seperate. + * this is the all you need. Async I/O is also separate. */ static int atalk_fcntl(struct socket *sock, unsigned int cmd, unsigned long arg) @@ -941,10 +1027,9 @@ static int atalk_setsockopt(struct socket *sock, int level, int optname, char *o if(optval==NULL) return(-EINVAL); - err=verify_area(VERIFY_READ,optval,sizeof(int)); - if(err) + err = get_user(opt, (int *)optval); + if (err) return err; - opt=get_fs_long((unsigned long *)optval); switch(level) { @@ -956,7 +1041,7 @@ static int atalk_setsockopt(struct socket *sock, int level, int optname, char *o } break; - case SOL_SOCKET: + case SOL_SOCKET: return sock_setsockopt(sk,level,optname,optval,optlen); default: @@ -995,13 +1080,10 @@ static int atalk_getsockopt(struct socket *sock, int level, int optname, default: return -EOPNOTSUPP; } - err=verify_area(VERIFY_WRITE,optlen,sizeof(int)); - if(err) - return err; - put_fs_long(sizeof(int),(unsigned long *)optlen); - err=verify_area(VERIFY_WRITE,optval,sizeof(int)); - put_fs_long(val,(unsigned long *)optval); - return(0); + err = put_user(sizeof(int),optlen); + if (!err) + err = put_user(val, (int *) optval); + return err; } /* @@ -1040,7 +1122,7 @@ static void def_callback2(struct sock *sk, int len) static int atalk_create(struct socket *sock, int protocol) { atalk_socket *sk; - sk=(atalk_socket 
*)kmalloc(sizeof(*sk),GFP_KERNEL); + sk=(atalk_socket *)sk_alloc(GFP_KERNEL); if(sk==NULL) return(-ENOMEM); switch(sock->type) @@ -1052,42 +1134,25 @@ static int atalk_create(struct socket *sock, int protocol) case SOCK_DGRAM: break; default: - kfree_s((void *)sk,sizeof(*sk)); + sk_free((void *)sk); return(-ESOCKTNOSUPPORT); } - sk->dead=0; - sk->next=NULL; - sk->broadcast=0; + + MOD_INC_USE_COUNT; + sk->no_check=0; /* Checksums on by default */ + sk->allocation=GFP_KERNEL; sk->rcvbuf=SK_RMEM_MAX; sk->sndbuf=SK_WMEM_MAX; sk->pair=NULL; - sk->wmem_alloc=0; - sk->rmem_alloc=0; - sk->inuse=0; - sk->proc=0; sk->priority=1; - sk->shutdown=0; - sk->prot=NULL; /* So we use default free mechanisms */ - sk->broadcast=0; - sk->err=0; skb_queue_head_init(&sk->receive_queue); skb_queue_head_init(&sk->write_queue); - sk->send_head=NULL; skb_queue_head_init(&sk->back_log); sk->state=TCP_CLOSE; sk->socket=sock; sk->type=sock->type; - sk->debug=0; - - sk->at.src_net=0; - sk->at.src_node=0; - sk->at.src_port=0; - sk->at.dest_net=0; - sk->at.dest_node=0; - sk->at.dest_port=0; - sk->mtu=DDP_MAXSZ; if(sock!=NULL) @@ -1138,10 +1203,11 @@ static int atalk_release(struct socket *sock, struct socket *peer) static int atalk_pick_port(struct sockaddr_at *sat) { - for ( sat->sat_port = ATPORT_RESERVED; sat->sat_port < ATPORT_LAST; - sat->sat_port++ ) - if ( atalk_find_socket( sat ) == NULL ) - return sat->sat_port; + for ( sat->sat_port = ATPORT_RESERVED; sat->sat_port < ATPORT_LAST; sat->sat_port++ ) + { + if ( atalk_find_socket( sat ) == NULL ) + return sat->sat_port; + } return -EBUSY; } @@ -1152,13 +1218,13 @@ static int atalk_autobind(atalk_socket *sk) int n; if ( ap == NULL || ap->s_net == htons( ATADDR_ANYNET )) - return -EADDRNOTAVAIL; - sk->at.src_net = sat.sat_addr.s_net = ap->s_net; - sk->at.src_node = sat.sat_addr.s_node = ap->s_node; + return -EADDRNOTAVAIL; + sk->protinfo.af_at.src_net = sat.sat_addr.s_net = ap->s_net; + sk->protinfo.af_at.src_node = sat.sat_addr.s_node = 
ap->s_node; if (( n = atalk_pick_port( &sat )) < 0 ) - return( n ); - sk->at.src_port=n; + return( n ); + sk->protinfo.af_at.src_port=n; atalk_insert_socket(sk); sk->zapped=0; return 0; @@ -1168,7 +1234,7 @@ static int atalk_autobind(atalk_socket *sk) * Set the address 'our end' of the connection. */ -static int atalk_bind(struct socket *sock, struct sockaddr *uaddr,int addr_len) +static int atalk_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { atalk_socket *sk; struct sockaddr_at *addr=(struct sockaddr_at *)uaddr; @@ -1176,7 +1242,7 @@ static int atalk_bind(struct socket *sock, struct sockaddr *uaddr,int addr_len) sk=(atalk_socket *)sock->data; if(sk->zapped==0) - return(-EIO); + return(-EINVAL); if(addr_len!=sizeof(struct sockaddr_at)) return -EINVAL; @@ -1189,16 +1255,15 @@ static int atalk_bind(struct socket *sock, struct sockaddr *uaddr,int addr_len) struct at_addr *ap=atalk_find_primary(); if(ap==NULL) return -EADDRNOTAVAIL; - sk->at.src_net=addr->sat_addr.s_net=ap->s_net; - sk->at.src_node=addr->sat_addr.s_node=ap->s_node; + sk->protinfo.af_at.src_net=addr->sat_addr.s_net=ap->s_net; + sk->protinfo.af_at.src_node=addr->sat_addr.s_node=ap->s_node; } else { - if ( atalk_find_interface( addr->sat_addr.s_net, - addr->sat_addr.s_node ) == NULL ) - return -EADDRNOTAVAIL; - sk->at.src_net=addr->sat_addr.s_net; - sk->at.src_node=addr->sat_addr.s_node; + if ( atalk_find_interface( addr->sat_addr.s_net, addr->sat_addr.s_node ) == NULL ) + return -EADDRNOTAVAIL; + sk->protinfo.af_at.src_net=addr->sat_addr.s_net; + sk->protinfo.af_at.src_node=addr->sat_addr.s_node; } if(addr->sat_port == ATADDR_ANYPORT) @@ -1206,10 +1271,10 @@ static int atalk_bind(struct socket *sock, struct sockaddr *uaddr,int addr_len) int n = atalk_pick_port(addr); if(n < 0) return n; - sk->at.src_port=addr->sat_port=n; + sk->protinfo.af_at.src_port=addr->sat_port=n; } else - sk->at.src_port=addr->sat_port; + sk->protinfo.af_at.src_port=addr->sat_port; 
if(atalk_find_socket(addr)!=NULL) return -EADDRINUSE; @@ -1238,9 +1303,9 @@ static int atalk_connect(struct socket *sock, struct sockaddr *uaddr, if(addr->sat_family!=AF_APPLETALK) return -EAFNOSUPPORT; -#if 0 /* Netatalk doesnt check this */ +#if 0 /* Netatalk doesn't check this - fix netatalk first!*/ if(addr->sat_addr.s_node==ATADDR_BCAST && !sk->broadcast) - return -EPERM; + return -EACCES; #endif if(sk->zapped) { @@ -1251,9 +1316,9 @@ static int atalk_connect(struct socket *sock, struct sockaddr *uaddr, if(atrtr_get_dev(&addr->sat_addr)==NULL) return -ENETUNREACH; - sk->at.dest_port=addr->sat_port; - sk->at.dest_net=addr->sat_addr.s_net; - sk->at.dest_node=addr->sat_addr.s_node; + sk->protinfo.af_at.dest_port=addr->sat_port; + sk->protinfo.af_at.dest_net=addr->sat_addr.s_net; + sk->protinfo.af_at.dest_node=addr->sat_addr.s_node; sock->state = SS_CONNECTED; sk->state=TCP_ESTABLISHED; return(0); @@ -1275,7 +1340,7 @@ static int atalk_socketpair(struct socket *sock1, struct socket *sock2) static int atalk_accept(struct socket *sock, struct socket *newsock, int flags) { if(newsock->data) - kfree_s(newsock->data,sizeof(atalk_socket)); + sk_free(newsock->data); return -EOPNOTSUPP; } @@ -1294,7 +1359,7 @@ static int atalk_getname(struct socket *sock, struct sockaddr *uaddr, if(sk->zapped) { if(atalk_autobind(sk)<0) - return -EBUSY; + return -ENOBUFS; } *uaddr_len = sizeof(struct sockaddr_at); @@ -1303,15 +1368,15 @@ static int atalk_getname(struct socket *sock, struct sockaddr *uaddr, { if(sk->state!=TCP_ESTABLISHED) return -ENOTCONN; - sat.sat_addr.s_net=sk->at.dest_net; - sat.sat_addr.s_node=sk->at.dest_node; - sat.sat_port=sk->at.dest_port; + sat.sat_addr.s_net=sk->protinfo.af_at.dest_net; + sat.sat_addr.s_node=sk->protinfo.af_at.dest_node; + sat.sat_port=sk->protinfo.af_at.dest_port; } else { - sat.sat_addr.s_net=sk->at.src_net; - sat.sat_addr.s_node=sk->at.src_node; - sat.sat_port=sk->at.src_port; + sat.sat_addr.s_net=sk->protinfo.af_at.src_net; + 
sat.sat_addr.s_node=sk->protinfo.af_at.src_node; + sat.sat_port=sk->protinfo.af_at.src_port; } sat.sat_family = AF_APPLETALK; memcpy(uaddr,&sat,sizeof(sat)); @@ -1321,15 +1386,17 @@ static int atalk_getname(struct socket *sock, struct sockaddr *uaddr, /* * Receive a packet (in skb) from device dev. This has come from the SNAP decoder, and on entry * skb->h.raw is the DDP header, skb->len is the DDP length. The physical headers have been - * extracted. + * extracted. PPP should probably pass frames marked as for this layer + * [ie ARPHRD_ETHERTALK] */ -int atalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) +static int atalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) { atalk_socket *sock; struct ddpehdr *ddp=(void *)skb->h.raw; struct atalk_iface *atif; struct sockaddr_at tosat; + int origlen; /* Size check */ if(skb->len<sizeof(*ddp)) @@ -1342,7 +1409,11 @@ int atalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) /* * Fix up the length field [Ok this is horrible but otherwise * I end up with unions of bit fields and messy bit field order - * compiler/endian dependancies..] + * compiler/endian dependencies..] + * + * FIXME: This is a write to a shared object. Granted it + * happens to be safe BUT.. (Its safe as user space will not + * run until we put it back) */ *((__u16 *)ddp)=ntohs(*((__u16 *)ddp)); @@ -1351,12 +1422,14 @@ int atalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) * Trim buffer in case of stray trailing data */ - skb->len=min(skb->len,ddp->deh_len); + origlen = skb->len; + + skb_trim(skb,min(skb->len,ddp->deh_len)); /* * Size check to see if ddp->deh_len was crap * (Otherwise we'll detonate most spectacularly - * in the middle of recvfrom()). + * in the middle of recvmsg()). 
*/ if(skb->len<sizeof(*ddp)) @@ -1376,6 +1449,16 @@ int atalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) kfree_skb(skb,FREE_READ); return(0); } + +#ifdef CONFIG_FIREWALL + + if(call_in_firewall(AF_APPLETALK, skb->dev, ddp, NULL)!=FW_ACCEPT) + { + kfree_skb(skb, FREE_READ); + return 0; + } + +#endif /* Check the packet is aimed at us */ @@ -1389,8 +1472,29 @@ int atalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) { struct atalk_route *rt; struct at_addr ta; + + /* Don't route multicast, etc., packets, or packets + sent to "this network" */ + if (skb->pkt_type != PACKET_HOST || ddp->deh_dnet == 0) + { + kfree_skb(skb, FREE_READ); + return(0); + } + +#ifdef CONFIG_FIREWALL + /* + * Check firewall allows this routing + */ + + if(call_fw_firewall(AF_APPLETALK, skb->dev, ddp, NULL)!=FW_ACCEPT) + { + kfree_skb(skb, FREE_READ); + return(0); + } +#endif ta.s_net=ddp->deh_dnet; ta.s_node=ddp->deh_dnode; + /* Route the packet */ rt=atrtr_find(&ta); if(rt==NULL || ddp->deh_hops==15) @@ -1399,12 +1503,34 @@ int atalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) return(0); } ddp->deh_hops++; + + /* + * Route goes through another gateway, so + * set the target to the gateway instead. 
+ */ + + if(rt->flags&RTF_GATEWAY) + { + ta.s_net = rt->gateway.s_net; + ta.s_node = rt->gateway.s_node; + } + + /* Fix up skb->len field */ + skb_trim(skb,min(origlen, rt->dev->hard_header_len + + ddp_dl->header_length + ddp->deh_len)); + *((__u16 *)ddp)=ntohs(*((__u16 *)ddp)); /* Mend the byte order */ /* * Send the buffer onwards */ - if(aarp_send_ddp(dev,skb, &ta, NULL)==-1) - kfree_skb(skb, FREE_READ); + + skb=skb_unshare(skb, GFP_ATOMIC, FREE_READ); + if(skb) + { + skb->arp = 1; /* Resolved */ + if(aarp_send_ddp(rt->dev, skb, &ta, NULL)==-1) + kfree_skb(skb, FREE_READ); + } return 0; } @@ -1437,11 +1563,74 @@ int atalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) return(0); } -static int atalk_sendto(struct socket *sock, void *ubuf, int len, int noblock, - unsigned flags, struct sockaddr *sat, int addr_len) +/* + * Receive a localtalk frame. We make some demands on the caller here. + * Caller must provide enough headroom on the packet to pull the short + * header and append a long one. + */ + + +static int ltalk_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) +{ + struct ddpehdr *ddp; + struct at_addr *ap; + /* + * Expand any short form frames. + */ + + if(skb->mac.raw[2]==1) + { + /* + * Find our address. + */ + + ap=atalk_find_dev_addr(dev); + if(ap==NULL || skb->len<sizeof(struct ddpshdr)) + { + kfree_skb(skb, FREE_READ); + return 0; + } + + /* + * The push leaves us with a ddephdr not an shdr, and + * handily the port bytes in the right place preset. + */ + + skb_push(skb, sizeof(*ddp)-4); + ddp=(struct ddpehdr *)skb->data; + + /* + * Now fill in the long header. + */ + + /* + * These two first. The mac overlays the new source/dest + * network information so we MUST copy these before + * we write the network numbers ! 
+ */ + + ddp->deh_dnode=skb->mac.raw[0]; /* From physical header */ + ddp->deh_snode=skb->mac.raw[1]; /* From physical header */ + + ddp->deh_dnet=ap->s_net; /* Network number */ + ddp->deh_snet=ap->s_net; + ddp->deh_sum=0; /* No checksum */ + /* + * Not sure about this bit... + */ + ddp->deh_len=skb->len; + ddp->deh_hops=15; /* Non routable, so force a drop + if we slip up later */ + *((__u16 *)ddp)=htons(*((__u16 *)ddp)); /* Mend the byte order */ + } + skb->h.raw = skb->data; + return atalk_rcv(skb,dev,pt); +} + +static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, int len, int nonblock, int flags) { atalk_socket *sk=(atalk_socket *)sock->data; - struct sockaddr_at *usat=(struct sockaddr_at *)sat; + struct sockaddr_at *usat=(struct sockaddr_at *)msg->msg_name; struct sockaddr_at local_satalk, gsat; struct sk_buff *skb; struct device *dev; @@ -1460,17 +1649,16 @@ static int atalk_sendto(struct socket *sock, void *ubuf, int len, int noblock, if(usat) { if(sk->zapped) - /* put the autobinding in */ { if(atalk_autobind(sk)<0) return -EBUSY; } - if(addr_len <sizeof(*usat)) + if(msg->msg_namelen <sizeof(*usat)) return(-EINVAL); if(usat->sat_family != AF_APPLETALK) return -EINVAL; -#if 0 /* netatalk doesnt implement this check */ +#if 0 /* netatalk doesn't implement this check */ if(usat->sat_addr.s_node==ATADDR_BCAST && !sk->broadcast) return -EPERM; #endif @@ -1481,9 +1669,9 @@ static int atalk_sendto(struct socket *sock, void *ubuf, int len, int noblock, return -ENOTCONN; usat=&local_satalk; usat->sat_family=AF_APPLETALK; - usat->sat_port=sk->at.dest_port; - usat->sat_addr.s_node=sk->at.dest_node; - usat->sat_addr.s_net=sk->at.dest_net; + usat->sat_port=sk->protinfo.af_at.dest_port; + usat->sat_addr.s_node=sk->protinfo.af_at.dest_node; + usat->sat_addr.s_net=sk->protinfo.af_at.dest_net; } /* Build a packet */ @@ -1504,7 +1692,7 @@ static int atalk_sendto(struct socket *sock, void *ubuf, int len, int noblock, { struct at_addr at_hint; at_hint.s_node=0; - 
at_hint.s_net=sk->at.src_net; + at_hint.s_net=sk->protinfo.af_at.src_net; rt=atrtr_find(&at_hint); if(rt==NULL) return -ENETUNREACH; @@ -1516,49 +1704,63 @@ static int atalk_sendto(struct socket *sock, void *ubuf, int len, int noblock, size += dev->hard_header_len; - skb = sock_alloc_send_skb(sk, size, 0 , &err); + skb = sock_alloc_send_skb(sk, size, 0, 0 , &err); if(skb==NULL) return err; skb->sk=sk; skb->free=1; skb->arp=1; - skb->len=size; + skb_reserve(skb,ddp_dl->header_length); + skb_reserve(skb,dev->hard_header_len); skb->dev=dev; if(sk->debug) printk("SK %p: Begin build.\n", sk); - skb->h.raw=skb->data+ddp_dl->header_length+dev->hard_header_len; - - ddp=(struct ddpehdr *)skb->h.raw; + ddp=(struct ddpehdr *)skb_put(skb,sizeof(struct ddpehdr)); ddp->deh_pad=0; ddp->deh_hops=0; ddp->deh_len=len+sizeof(*ddp); /* * Fix up the length field [Ok this is horrible but otherwise * I end up with unions of bit fields and messy bit field order - * compiler/endian dependancies.. + * compiler/endian dependencies.. 
*/ *((__u16 *)ddp)=ntohs(*((__u16 *)ddp)); ddp->deh_dnet=usat->sat_addr.s_net; - ddp->deh_snet=sk->at.src_net; + ddp->deh_snet=sk->protinfo.af_at.src_net; ddp->deh_dnode=usat->sat_addr.s_node; - ddp->deh_snode=sk->at.src_node; + ddp->deh_snode=sk->protinfo.af_at.src_node; ddp->deh_dport=usat->sat_port; - ddp->deh_sport=sk->at.src_port; + ddp->deh_sport=sk->protinfo.af_at.src_port; if(sk->debug) printk("SK %p: Copy user data (%d bytes).\n", sk, len); - memcpy_fromfs((char *)(ddp+1),ubuf,len); - + err = memcpy_fromiovec(skb_put(skb,len),msg->msg_iov,len); + if (err) + { + kfree_skb(skb, FREE_WRITE); + return -EFAULT; + } + if(sk->no_check==1) ddp->deh_sum=0; else ddp->deh_sum=atalk_checksum(ddp, len+sizeof(*ddp)); + +#ifdef CONFIG_FIREWALL + + if(call_out_firewall(AF_APPLETALK, skb->dev, ddp, NULL)!=FW_ACCEPT) + { + kfree_skb(skb, FREE_WRITE); + return -EPERM; + } + +#endif /* * Loopback broadcast packets to non gateway targets (ie routes @@ -1587,12 +1789,13 @@ static int atalk_sendto(struct socket *sock, void *ubuf, int len, int noblock, if(sk->debug) printk("SK %p: Loop back.\n", sk); /* loop back */ - sk->wmem_alloc-=skb->mem_len; + atomic_sub(skb->truesize, &sk->wmem_alloc); ddp_dl->datalink_header(ddp_dl, skb, dev->dev_addr); skb->sk = NULL; + skb->mac.raw=skb->data; skb->h.raw = skb->data + ddp_dl->header_length + dev->hard_header_len; - skb->len -= ddp_dl->header_length ; - skb->len -= dev->hard_header_len ; + skb_pull(skb,dev->hard_header_len); + skb_pull(skb,ddp_dl->header_length); atalk_rcv(skb,dev,NULL); } else @@ -1614,27 +1817,15 @@ static int atalk_sendto(struct socket *sock, void *ubuf, int len, int noblock, return len; } -static int atalk_send(struct socket *sock, void *ubuf, int size, int noblock, unsigned flags) -{ - return atalk_sendto(sock,ubuf,size,noblock,flags,NULL,0); -} -static int atalk_recvfrom(struct socket *sock, void *ubuf, int size, int noblock, - unsigned flags, struct sockaddr *sip, int *addr_len) +static int atalk_recvmsg(struct 
socket *sock, struct msghdr *msg, int size, int noblock, int flags, int *addr_len) { atalk_socket *sk=(atalk_socket *)sock->data; - struct sockaddr_at *sat=(struct sockaddr_at *)sip; + struct sockaddr_at *sat=(struct sockaddr_at *)msg->msg_name; struct ddpehdr *ddp = NULL; int copied = 0; struct sk_buff *skb; - int er; - - if(sk->err) - { - er= -sk->err; - sk->err=0; - return er; - } + int er = 0; if(addr_len) *addr_len=sizeof(*sat); @@ -1648,15 +1839,25 @@ static int atalk_recvfrom(struct socket *sock, void *ubuf, int size, int noblock { copied=ddp->deh_len; if(copied > size) + { copied=size; - skb_copy_datagram(skb,0,ubuf,copied); + msg->msg_flags|=MSG_TRUNC; + } + er = skb_copy_datagram_iovec(skb,0,msg->msg_iov,copied); + if (er) + goto out; } else { copied=ddp->deh_len - sizeof(*ddp); if (copied > size) + { copied = size; - skb_copy_datagram(skb,sizeof(*ddp),ubuf,copied); + msg->msg_flags|=MSG_TRUNC; + } + er = skb_copy_datagram_iovec(skb,sizeof(*ddp),msg->msg_iov,copied); + if (er) + goto out; } if(sat) { @@ -1665,32 +1866,12 @@ static int atalk_recvfrom(struct socket *sock, void *ubuf, int size, int noblock sat->sat_addr.s_node=ddp->deh_snode; sat->sat_addr.s_net=ddp->deh_snet; } - skb_free_datagram(skb); - return(copied); +out: + skb_free_datagram(sk, skb); + return er ? 
er : (copied); } -static int atalk_write(struct socket *sock, char *ubuf, int size, int noblock) -{ - return atalk_send(sock,ubuf,size,noblock,0); -} - - -static int atalk_recv(struct socket *sock, void *ubuf, int size , int noblock, - unsigned flags) -{ - atalk_socket *sk=(atalk_socket *)sock->data; - if(sk->zapped) - return -ENOTCONN; - return atalk_recvfrom(sock,ubuf,size,noblock,flags,NULL, NULL); -} - -static int atalk_read(struct socket *sock, char *ubuf, int size, int noblock) -{ - return atalk_recv(sock,ubuf,size,noblock,0); -} - - static int atalk_shutdown(struct socket *sk,int how) { return -EOPNOTSUPP; @@ -1709,10 +1890,8 @@ static int atalk_select(struct socket *sock , int sel_type, select_table *wait) static int atalk_ioctl(struct socket *sock,unsigned int cmd, unsigned long arg) { - int err; long amount=0; atalk_socket *sk=(atalk_socket *)sock->data; - int v; switch(cmd) { @@ -1720,16 +1899,16 @@ static int atalk_ioctl(struct socket *sock,unsigned int cmd, unsigned long arg) * Protocol layer */ case TIOCOUTQ: - v=sk->sndbuf-sk->wmem_alloc; - if(v<0) - v=0; + amount=sk->sndbuf-sk->wmem_alloc; + if(amount<0) + amount=0; break; case TIOCINQ: { struct sk_buff *skb; /* These two are safe on a single CPU system as only user tasks fiddle here */ if((skb=skb_peek(&sk->receive_queue))!=NULL) - v=skb->len-sizeof(struct ddpehdr); + amount=skb->len-sizeof(struct ddpehdr); break; } case SIOCGSTAMP: @@ -1737,11 +1916,7 @@ static int atalk_ioctl(struct socket *sock,unsigned int cmd, unsigned long arg) { if(sk->stamp.tv_sec==0) return -ENOENT; - err=verify_area(VERIFY_WRITE,(void *)arg,sizeof(struct timeval)); - if(err) - return err; - memcpy_tofs((void *)arg,&sk->stamp,sizeof(struct timeval)); - return 0; + return copy_to_user((void *)arg,&sk->stamp,sizeof(struct timeval)) ? 
-EFAULT : 0; } return -EINVAL; /* @@ -1765,7 +1940,6 @@ static int atalk_ioctl(struct socket *sock,unsigned int cmd, unsigned long arg) case SIOCSIFLINK: case SIOCGIFHWADDR: case SIOCSIFHWADDR: - case OLD_SIOCGIFHWADDR: case SIOCGIFFLAGS: case SIOCSIFFLAGS: case SIOCGIFMTU: @@ -1788,11 +1962,7 @@ static int atalk_ioctl(struct socket *sock,unsigned int cmd, unsigned long arg) default: return -EINVAL; } - err=verify_area(VERIFY_WRITE,(void *)arg,sizeof(unsigned long)); - if(err) - return err; - put_fs_long(amount,(unsigned long *)arg); - return(0); + return put_user(amount, (int *)arg); } static struct proto_ops atalk_proto_ops = { @@ -1806,19 +1976,15 @@ static struct proto_ops atalk_proto_ops = { atalk_socketpair, atalk_accept, atalk_getname, - atalk_read, - atalk_write, atalk_select, atalk_ioctl, atalk_listen, - atalk_send, - atalk_recv, - atalk_sendto, - atalk_recvfrom, atalk_shutdown, atalk_setsockopt, atalk_getsockopt, atalk_fcntl, + atalk_sendmsg, + atalk_recvmsg }; static struct notifier_block ddp_notifier={ @@ -1827,17 +1993,137 @@ static struct notifier_block ddp_notifier={ 0 }; +struct packet_type ltalk_packet_type= +{ + 0, + NULL, + ltalk_rcv, + NULL, + NULL +}; + +struct packet_type ppptalk_packet_type= +{ + 0, + NULL, + atalk_rcv, + NULL, + NULL +}; + +static char ddp_snap_id[]={0x08,0x00,0x07,0x80,0x9B}; + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry proc_appletalk = { + PROC_NET_ATALK, 9, "appletalk", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + atalk_get_info +}; +static struct proc_dir_entry proc_atalk_route = { + PROC_NET_AT_ROUTE, 11,"atalk_route", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + atalk_rt_get_info +}; +static struct proc_dir_entry proc_atalk_iface = { + PROC_NET_ATIF, 11,"atalk_iface", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + atalk_if_get_info +}; +#endif + /* Called by proto.c on kernel start up */ void atalk_proto_init(struct net_proto *pro) { - static char 
ddp_snap_id[]={0x08,0x00,0x07,0x80,0x9B}; (void) sock_register(atalk_proto_ops.family, &atalk_proto_ops); - if((ddp_dl=register_snap_client(ddp_snap_id, atalk_rcv))==NULL) - printk("Unable to register DDP with SNAP.\n"); + if ((ddp_dl = register_snap_client(ddp_snap_id, atalk_rcv)) == NULL) + printk(KERN_CRIT "Unable to register DDP with SNAP.\n"); + + ltalk_packet_type.type=htons(ETH_P_LOCALTALK); + dev_add_pack(<alk_packet_type); + + ppptalk_packet_type.type=htons(ETH_P_PPPTALK); + dev_add_pack(&ppptalk_packet_type); + register_netdevice_notifier(&ddp_notifier); aarp_proto_init(); - printk("Appletalk ALPHA 0.08 for Linux NET3.029\n"); - + +#ifdef CONFIG_PROC_FS + proc_net_register(&proc_appletalk); + proc_net_register(&proc_atalk_route); + proc_net_register(&proc_atalk_iface); +#endif + + printk(KERN_INFO "Appletalk 0.18 for Linux NET3.037\n"); } -#endif + +#ifdef MODULE + +int init_module(void) +{ + atalk_proto_init(NULL); + register_symtab(0); + return 0; +} + +/* + * FIX THIS: If there are any routes/devices configured + * for appletalk we must not be unloaded. + */ + +/* Remove all route entries. Interrupts must be off. */ +extern inline void free_route_list(void) +{ + struct atalk_route *list = atalk_router_list, *tmp; + + while (list != NULL) + { + tmp = list->next; + kfree_s(list, sizeof(struct atalk_route)); + list = tmp; + } +} + +/* Remove all interface entries. Interrupts must be off. 
*/ +extern inline void free_interface_list(void) +{ + struct atalk_iface *list = atalk_iface_list, *tmp; + + while (list != NULL) + { + tmp = list->next; + kfree_s(list, sizeof(struct atalk_iface)); + list = tmp; + } +} + +void cleanup_module(void) +{ + unsigned long flags; + + save_flags(flags); + cli(); + + aarp_cleanup_module(); + +#ifdef CONFIG_PROC_FS + proc_net_unregister(PROC_NET_ATALK); + proc_net_unregister(PROC_NET_AT_ROUTE); + proc_net_unregister(PROC_NET_ATIF); +#endif + unregister_netdevice_notifier(&ddp_notifier); + dev_remove_pack(<alk_packet_type); + dev_remove_pack(&ppptalk_packet_type); + unregister_snap_client(ddp_snap_id); + sock_unregister(atalk_proto_ops.family); + + free_route_list(); + free_interface_list(); + + restore_flags(flags); +} + +#endif /* MODULE */ diff --git a/net/appletalk/sysctl_net_atalk.c b/net/appletalk/sysctl_net_atalk.c new file mode 100644 index 000000000..307278992 --- /dev/null +++ b/net/appletalk/sysctl_net_atalk.c @@ -0,0 +1,13 @@ +/* -*- linux-c -*- + * sysctl_net_atalk.c: sysctl interface to net Appletalk subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/atalk directory entry (empty =) ). [MS] + */ + +#include <linux/mm.h> +#include <linux/sysctl.h> + +ctl_table atalk_table[] = { + {0} +}; diff --git a/net/ax25/Makefile b/net/ax25/Makefile index 77301561c..172d89eed 100644 --- a/net/ax25/Makefile +++ b/net/ax25/Makefile @@ -1,5 +1,5 @@ # -# Makefile for the Linux TCP/IP (INET) layer. +# Makefile for the Linux AX.25 layer. # # Note! Dependencies are done automagically by 'make dep', which also # removes any old dependencies. DON'T put your own dependencies here @@ -7,34 +7,14 @@ # # Note 2! The CFLAGS definition is now in the main makefile... 
-.c.o: - $(CC) $(CFLAGS) -c $< -.s.o: - $(AS) -o $*.o $< -.c.s: - $(CC) $(CFLAGS) -S $< +O_TARGET := ax25.o +O_OBJS := sysctl_net_ax25.o ax25_in.o ax25_out.o ax25_route.o ax25_subr.o ax25_timer.o +M_OBJS := $(O_TARGET) -OBJS := af_ax25.o +OX_OBJS += af_ax25.o -ifdef CONFIG_AX25 - -OBJS := $(OBJS) ax25_in.o ax25_out.o ax25_route.o ax25_subr.o ax25_timer.o - -endif - -ax25.o: $(OBJS) - $(LD) -r -o ax25.o $(OBJS) - -dep: - $(CPP) -M *.c > .depend +include $(TOPDIR)/Rules.make tar: tar -cvf /dev/f1 . - -# -# include a dependency file if one exists -# -ifeq (.depend,$(wildcard .depend)) -include .depend -endif diff --git a/net/ax25/README.AX25 b/net/ax25/README.AX25 deleted file mode 100644 index 90b6a037a..000000000 --- a/net/ax25/README.AX25 +++ /dev/null @@ -1,20 +0,0 @@ -This is a working version of the new state machine code for AX25 under -Linux. It is closely based on the SDL diagrams published in the ARRL 7th -Computer Networking Conference papers, and they should be referred to when -reading the code, notably the stuff in ax25_in.c. The next stage is to -separate the ax25 control block from the socket and then add NET/ROM and -connected mode IP. I would also like to add the extended AX25 designed by a -Dutch station which allows for window sizes up to 127. - -This code will work the same as the old code, although the display in -/proc/net/ax25 is a little different, but should be understandable. Please -give this code a work out and report any bugs to me either at -jsn@cs.nott.ac.uk or at GB7DAD.GBR.EU. - -This code has taught me a lot about the internals of the networking side of -Linux especially skbuff handling and I now feel happy about implementing the -higher level protocols. - -73's - -Jonathan diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 725939698..7d08fed10 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1,10 +1,10 @@ /* - * AX.25 release 029 + * AX.25 release 034 * * This is ALPHA test software. 
This code may break your machine, randomly fail to work with new * releases, misbehave and/or generally screw up. It might even work. * - * This code REQUIRES 1.2.1 or higher/ NET3.029 + * This code REQUIRES 2.1.10 or higher/ NET3.029 * * This module: * This module is free software; you can redistribute it and/or @@ -13,7 +13,7 @@ * 2 of the License, or (at your option) any later version. * * History - * AX.25 006 Alan(GW4PTS) Nearly died of shock - its working 8-) + * AX.25 006 Alan(GW4PTS) Nearly died of shock - it's working 8-) * AX.25 007 Alan(GW4PTS) Removed the silliest bugs * AX.25 008 Alan(GW4PTS) Cleaned up, fixed a few state machine problems, added callbacks * AX.25 009 Alan(GW4PTS) Emergency patch kit to fix memory corruption @@ -25,7 +25,7 @@ * Correct receive on SOCK_DGRAM. * AX.25 013 Alan(GW4PTS) Send DM to all unknown frames, missing initialiser fixed * Leave spare SSID bits set (DAMA etc) - thanks for bug report, - * removed device registration (its not used or needed). Clean up for + * removed device registration (it's not used or needed). Clean up for * gcc 2.5.8. PID to AX25_P_ * AX.25 014 Alan(GW4PTS) Cleanup and NET3 merge * AX.25 015 Alan(GW4PTS) Internal test version. @@ -56,17 +56,46 @@ * Jonathan(G4KLX) and removed all the old Berkeley, added IP mode registration. * Darryl(G7LED) stuff. Cross-port digipeating. Minor fixes and enhancements. * Alan(GW4PTS) Missed suser() on axassociate checks + * AX.25 030 Alan(GW4PTS) Added variable length headers. + * Jonathan(G4KLX) Added BPQ Ethernet interface. + * Steven(GW7RRM) Added digi-peating control ioctl. + * Added extended AX.25 support. + * Added AX.25 frame segmentation. + * Darryl(G7LED) Changed connect(), recvfrom(), sendto() sockaddr/addrlen to + * fall inline with bind() and new policy. + * Moved digipeating ctl to new ax25_dev structs. + * Fixed ax25_release(), set TCP_CLOSE, wakeup app + * context, THEN make the sock dead. + * Alan(GW4PTS) Cleaned up for single recvmsg methods. 
+ * Alan(GW4PTS) Fixed not clearing error on connect failure. + * AX.25 031 Jonathan(G4KLX) Added binding to any device. + * Joerg(DL1BKE) Added DAMA support, fixed (?) digipeating, fixed buffer locking + * for "virtual connect" mode... Result: Probably the + * "Most Buggiest Code You've Ever Seen" (TM) + * HaJo(DD8NE) Implementation of a T5 (idle) timer + * Joerg(DL1BKE) Renamed T5 to IDLE and changed behaviour: + * the timer gets reloaded on every received or transmitted + * I frame for IP or NETROM. The idle timer is not active + * on "vanilla AX.25" connections. Furthermore added PACLEN + * to provide AX.25-layer based fragmentation (like WAMPES) + * AX.25 032 Joerg(DL1BKE) Fixed DAMA timeout error. + * ax25_send_frame() limits the number of enqueued + * datagrams per socket. + * AX.25 033 Jonathan(G4KLX) Removed auto-router. + * Hans(PE1AYX) Converted to Module. + * Joerg(DL1BKE) Moved BPQ Ethernet to seperate driver. + * AX.25 034 Jonathan(G4KLX) 2.1 changes + * Alan(GW4PTS) Small POSIXisations * * To do: - * Support use as digipeater, including an on/off ioctl * Restructure the ax25_rcv code to be cleaner/faster and * copy only when needed. - * Consider better arbitary protocol support. - * Fix non-blocking connect failure. + * Consider better arbitrary protocol support. 
*/ #include <linux/config.h> -#ifdef CONFIG_AX25 +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) +#include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> @@ -80,28 +109,30 @@ #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> +#include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <linux/fcntl.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/notifier.h> - +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/firewall.h> +#include <linux/sysctl.h> #include <net/ip.h> #include <net/arp.h> -#define CONFIG_AX25_XDIGI /* Cross port (band) digi stuff */ - -/**********************************************************************************************************************\ -* * -* Handlers for the socket list. * -* * -\**********************************************************************************************************************/ +/* + * The null address is defined as a callsign of all spaces with an + * SSID of zero. 
+ */ +ax25_address null_ax25_address = {{0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00}}; -static ax25_cb *volatile ax25_list = NULL; +ax25_cb *volatile ax25_list = NULL; /* * ax25 -> ascii conversion @@ -112,8 +143,7 @@ char *ax2asc(ax25_address *a) char c, *s; int n; - for (n = 0, s = buf; n < 6; n++) - { + for (n = 0, s = buf; n < 6; n++) { c = (a->ax25_call[n] >> 1) & 0x7F; if (c != ' ') *s++ = c; @@ -121,8 +151,7 @@ char *ax2asc(ax25_address *a) *s++ = '-'; - if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) - { + if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) { *s++ = '1'; n -= 10; } @@ -130,11 +159,50 @@ char *ax2asc(ax25_address *a) *s++ = n + '0'; *s++ = '\0'; - return(buf); + if (*buf == '\0' || *buf == '-') + return "*"; + + return buf; } /* + * ascii -> ax25 conversion + */ +ax25_address *asc2ax(char *callsign) +{ + static ax25_address addr; + char *s; + int n; + + for (s = callsign, n = 0; n < 6; n++) { + if (*s != '\0' && *s != '-') + addr.ax25_call[n] = *s++; + else + addr.ax25_call[n] = ' '; + addr.ax25_call[n] <<= 1; + addr.ax25_call[n] &= 0xFE; + } + + if (*s++ == '\0') { + addr.ax25_call[6] = 0x00; + return &addr; + } + + addr.ax25_call[6] = *s++ - '0'; + + if (*s != '\0') { + addr.ax25_call[6] *= 10; + addr.ax25_call[6] += *s++ - '0'; + } + + addr.ax25_call[6] <<= 1; + addr.ax25_call[6] &= 0x1E; + + return &addr; +} + +/* * Compare two ax.25 addresses */ int ax25cmp(ax25_address *a, ax25_address *b) @@ -154,6 +222,22 @@ int ax25cmp(ax25_address *a, ax25_address *b) } /* + * Free an allocated ax25 control block. This is done to centralise + * the MOD count code. + */ +static void ax25_free_cb(ax25_cb *ax25) +{ + if (ax25->digipeat != NULL) { + kfree_s(ax25->digipeat, sizeof(ax25_digi)); + ax25->digipeat = NULL; + } + + kfree_s(ax25, sizeof(ax25_cb)); + + MOD_DEC_USE_COUNT; +} + +/* * Socket removal during an interrupt is now safe. 
*/ static void ax25_remove_socket(ax25_cb *ax25) @@ -192,30 +276,44 @@ static void ax25_kill_by_device(struct device *dev) for (s = ax25_list; s != NULL; s = s->next) { if (s->device == dev) { + s->state = AX25_STATE_0; s->device = NULL; if (s->sk != NULL) { - s->sk->state = TCP_CLOSE; - s->sk->err = ENETUNREACH; + s->sk->state = TCP_CLOSE; + s->sk->err = ENETUNREACH; + s->sk->shutdown |= SEND_SHUTDOWN; if (!s->sk->dead) s->sk->state_change(s->sk); s->sk->dead = 1; } } } - - ax25_rt_device_down(dev); } /* * Handle device status changes. */ -static int ax25_device_event(unsigned long event, void *ptr) +static int ax25_device_event(struct notifier_block *this,unsigned long event, void *ptr) { - if (event != NETDEV_DOWN) + struct device *dev = (struct device *)ptr; + + /* Reject non AX.25 devices */ + if (dev->type != ARPHRD_AX25) return NOTIFY_DONE; - - ax25_kill_by_device(ptr); - + + switch (event) { + case NETDEV_UP: + ax25_dev_device_up(dev); + break; + case NETDEV_DOWN: + ax25_kill_by_device(dev); + ax25_rt_device_down(dev); + ax25_dev_device_down(dev); + break; + default: + break; + } + return NOTIFY_DONE; } @@ -236,7 +334,7 @@ static void ax25_insert_socket(ax25_cb *ax25) } /* - * Find a socket that wants to accept the SABM we just + * Find a socket that wants to accept the SABM we have just * received. 
*/ static struct sock *ax25_find_listener(ax25_address *addr, struct device *dev, int type) @@ -311,7 +409,7 @@ static ax25_cb *ax25_find_cb(ax25_address *my_addr, ax25_address *dest_addr, str } /* - * Look for any matching address - RAW sockets can bind to arbitary names + * Look for any matching address - RAW sockets can bind to arbitrary names */ static struct sock *ax25_addr_match(ax25_address *addr) { @@ -343,10 +441,10 @@ static void ax25_send_to_raw(struct sock *sk, struct sk_buff *skb, int proto) return; copy->sk = sk; - sk->rmem_alloc += copy->mem_len; + atomic_add(copy->truesize, &sk->rmem_alloc); skb_queue_tail(&sk->receive_queue, copy); if (!sk->dead) - sk->data_ready(sk, skb->len - 2); + sk->data_ready(sk, skb->len); } sk = sk->next; @@ -356,7 +454,7 @@ static void ax25_send_to_raw(struct sock *sk, struct sk_buff *skb, int proto) /* * Deferred destroy. */ -void ax25_destory_socket(ax25_cb *); +void ax25_destroy_socket(ax25_cb *); /* * Handler for deferred kills. @@ -372,7 +470,7 @@ static void ax25_destroy_timer(unsigned long data) * Once it is removed from the queue no interrupt or bottom half will * touch it and we are (fairly 8-) ) safe. 
*/ -void ax25_destroy_socket(ax25_cb *ax25) /* Not static as its used by the timer */ +void ax25_destroy_socket(ax25_cb *ax25) /* Not static as it's used by the timer */ { struct sk_buff *skb; unsigned long flags; @@ -383,38 +481,33 @@ void ax25_destroy_socket(ax25_cb *ax25) /* Not static as its used by the timer * del_timer(&ax25->timer); ax25_remove_socket(ax25); - ax25_clear_tx_queue(ax25); /* Flush the send queue */ + ax25_clear_queues(ax25); /* Flush the queues */ if (ax25->sk != NULL) { while ((skb = skb_dequeue(&ax25->sk->receive_queue)) != NULL) { if (skb->sk != ax25->sk) { /* A pending connection */ skb->sk->dead = 1; /* Queue the unaccepted socket for death */ - ax25_set_timer(skb->sk->ax25); - skb->sk->ax25->state = AX25_STATE_0; + ax25_set_timer(skb->sk->protinfo.ax25); + skb->sk->protinfo.ax25->state = AX25_STATE_0; } kfree_skb(skb, FREE_READ); } } - if (ax25->digipeat != NULL) { - kfree_s(ax25->digipeat, sizeof(ax25_digi)); - ax25->digipeat = NULL; - } - if (ax25->sk != NULL) { if (ax25->sk->wmem_alloc || ax25->sk->rmem_alloc) { /* Defer: outstanding buffers */ init_timer(&ax25->timer); - ax25->timer.expires = 10 * HZ; + ax25->timer.expires = jiffies + 10 * HZ; ax25->timer.function = ax25_destroy_timer; ax25->timer.data = (unsigned long)ax25; add_timer(&ax25->timer); } else { - kfree_s(ax25->sk, sizeof(*ax25->sk)); - kfree_s(ax25, sizeof(*ax25)); + sk_free(ax25->sk); + ax25_free_cb(ax25); } } else { - kfree_s(ax25, sizeof(*ax25)); + ax25_free_cb(ax25); } restore_flags(flags); @@ -451,22 +544,27 @@ static int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax) return a->uid; } return -ENOENT; + case SIOCAX25ADDUID: - if(!suser()) + if (!suser()) return -EPERM; if (ax25_findbyuid(sax->sax25_uid)) return -EEXIST; + if (sax->sax25_uid == 0) + return -EINVAL; a = (ax25_uid_assoc *)kmalloc(sizeof(*a), GFP_KERNEL); + if (a == NULL) + return -ENOMEM; a->uid = sax->sax25_uid; a->call = sax->sax25_call; a->next = ax25_uid_list; ax25_uid_list = a; return 0; - 
case SIOCAX25DELUID: - { + + case SIOCAX25DELUID: { ax25_uid_assoc **l; - if(!suser()) + if (!suser()) return -EPERM; l = &ax25_uid_list; while ((*l) != NULL) { @@ -481,10 +579,138 @@ static int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax) } return -ENOENT; } + + default: + return -EINVAL; } return -EINVAL; /*NOTREACHED */ -} +} + +/* + * dl1bke 960311: set parameters for existing AX.25 connections, + * includes a KILL command to abort any connection. + * VERY useful for debugging ;-) + */ +static int ax25_ctl_ioctl(const unsigned int cmd, void *arg) +{ + struct ax25_ctl_struct ax25_ctl; + struct device *dev; + ax25_cb *ax25; + unsigned long flags; + int err; + + if ((err = verify_area(VERIFY_READ, arg, sizeof(ax25_ctl))) != 0) + return err; + + copy_from_user(&ax25_ctl, arg, sizeof(ax25_ctl)); + + if ((dev = ax25rtr_get_dev(&ax25_ctl.port_addr)) == NULL) + return -ENODEV; + + if ((ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, dev)) == NULL) + return -ENOTCONN; + + switch (ax25_ctl.cmd) { + case AX25_KILL: + ax25_clear_queues(ax25); + ax25_send_control(ax25, DISC, POLLON, C_COMMAND); + + ax25->state = AX25_STATE_0; + if (ax25->sk != NULL) { + ax25->sk->state = TCP_CLOSE; + ax25->sk->err = ENETRESET; + ax25->sk->shutdown |= SEND_SHUTDOWN; + if (!ax25->sk->dead) + ax25->sk->state_change(ax25->sk); + ax25->sk->dead = 1; + } + + ax25_dama_off(ax25); + ax25_set_timer(ax25); + break; + + case AX25_WINDOW: + if (ax25->modulus == MODULUS) { + if (ax25_ctl.arg < 1 || ax25_ctl.arg > 7) + return -EINVAL; + } else { + if (ax25_ctl.arg < 1 || ax25_ctl.arg > 63) + return -EINVAL; + } + ax25->window = ax25_ctl.arg; + break; + + case AX25_T1: + if (ax25_ctl.arg < 1) + return -EINVAL; + ax25->rtt = (ax25_ctl.arg * PR_SLOWHZ) / 2; + ax25->t1 = ax25_ctl.arg * PR_SLOWHZ; + save_flags(flags); cli(); + if (ax25->t1timer > ax25->t1) + ax25->t1timer = ax25->t1; + restore_flags(flags); + break; + + case AX25_T2: + if (ax25_ctl.arg < 1) + return -EINVAL; + 
save_flags(flags); cli(); + ax25->t2 = ax25_ctl.arg * PR_SLOWHZ; + if (ax25->t2timer > ax25->t2) + ax25->t2timer = ax25->t2; + restore_flags(flags); + break; + + case AX25_N2: + if (ax25_ctl.arg < 1 || ax25_ctl.arg > 31) + return -EINVAL; + ax25->n2count = 0; + ax25->n2 = ax25_ctl.arg; + break; + + case AX25_T3: + if (ax25_ctl.arg < 0) + return -EINVAL; + save_flags(flags); cli(); + ax25->t3 = ax25_ctl.arg * PR_SLOWHZ; + if (ax25->t3timer != 0) + ax25->t3timer = ax25->t3; + restore_flags(flags); + break; + + case AX25_IDLE: + if (ax25_ctl.arg < 0) + return -EINVAL; + save_flags(flags); cli(); + ax25->idle = ax25_ctl.arg * PR_SLOWHZ * 60; + if (ax25->idletimer != 0) + ax25->idletimer = ax25->idle; + restore_flags(flags); + break; + + case AX25_PACLEN: + if (ax25_ctl.arg < 16 || ax25_ctl.arg > 65535) + return -EINVAL; +#if 0 + if (ax25_ctl.arg > 256) /* we probably want this */ + printk(KERN_WARNING "ax25_ctl_ioctl: Warning --- huge paclen %d\n", (int)ax25_ctl.arg); +#endif + ax25->paclen = ax25_ctl.arg; + break; + + case AX25_MAXQUEUE: + if (ax25_ctl.arg < 1) + return -EINVAL; + ax25->maxqueue = ax25_ctl.arg; + break; + + default: + return -EINVAL; + } + + return 0; +} /* * Create an empty AX.25 control block. 
@@ -496,49 +722,120 @@ static ax25_cb *ax25_create_cb(void) if ((ax25 = (ax25_cb *)kmalloc(sizeof(*ax25), GFP_ATOMIC)) == NULL) return NULL; + MOD_INC_USE_COUNT; + skb_queue_head_init(&ax25->write_queue); + skb_queue_head_init(&ax25->frag_queue); skb_queue_head_init(&ax25->ack_queue); + skb_queue_head_init(&ax25->reseq_queue); init_timer(&ax25->timer); - ax25->rtt = DEFAULT_T1; - ax25->t1 = DEFAULT_T1; - ax25->t2 = DEFAULT_T2; - ax25->n2 = DEFAULT_N2; - ax25->t3 = DEFAULT_T3; + ax25->dama_slave = 0; + + ax25->rtt = AX25_DEF_T1 / 2; + ax25->t1 = AX25_DEF_T1; + ax25->t2 = AX25_DEF_T2; + ax25->t3 = AX25_DEF_T3; + ax25->n2 = AX25_DEF_N2; + ax25->paclen = AX25_DEF_PACLEN; + ax25->maxqueue= AX25_DEF_MAXQUEUE; + ax25->idle = AX25_DEF_IDLE; + + if (AX25_DEF_AXDEFMODE) { + ax25->modulus = EMODULUS; + ax25->window = AX25_DEF_EWINDOW; + } else { + ax25->modulus = MODULUS; + ax25->window = AX25_DEF_WINDOW; + } + ax25->fragno = 0; + ax25->fraglen = 0; + ax25->hdrincl = 0; + ax25->backoff = AX25_DEF_BACKOFF; ax25->condition = 0x00; ax25->t1timer = 0; ax25->t2timer = 0; ax25->t3timer = 0; ax25->n2count = 0; + ax25->idletimer = 0; ax25->va = 0; ax25->vr = 0; ax25->vs = 0; - ax25->window = DEFAULT_WINDOW; ax25->device = NULL; ax25->digipeat = NULL; ax25->sk = NULL; ax25->state = AX25_STATE_0; - memset(&ax25->dest_addr, '\0', sizeof(ax25_address)); - memset(&ax25->source_addr, '\0', sizeof(ax25_address)); + memset(&ax25->dest_addr, '\0', AX25_ADDR_LEN); + memset(&ax25->source_addr, '\0', AX25_ADDR_LEN); return ax25; } -int ax25_send_frame(struct sk_buff *skb, ax25_address *src, ax25_address *dest, struct device *dev) +/* + * Find out if we are a DAMA slave for this device and count the + * number of connections. 
+ * + * dl1bke 951121 + */ +int ax25_dev_is_dama_slave(struct device *dev) +{ + ax25_cb *ax25; + int count = 0; + + for (ax25 = ax25_list; ax25 != NULL; ax25 = ax25->next) { + if (ax25->device == dev && ax25->dama_slave) { + count++; + break; + } + } + + return count; +} + +/* + * Fill in a created AX.25 created control block with the default + * values for a particular device. + */ +static void ax25_fillin_cb(ax25_cb *ax25, struct device *dev) +{ + ax25->device = dev; + + ax25->rtt = ax25_dev_get_value(dev, AX25_VALUES_T1); + ax25->t1 = ax25_dev_get_value(dev, AX25_VALUES_T1); + ax25->t2 = ax25_dev_get_value(dev, AX25_VALUES_T2); + ax25->t3 = ax25_dev_get_value(dev, AX25_VALUES_T3); + ax25->n2 = ax25_dev_get_value(dev, AX25_VALUES_N2); + ax25->paclen = ax25_dev_get_value(dev, AX25_VALUES_PACLEN); + ax25->maxqueue = ax25_dev_get_value(dev, AX25_VALUES_MAXQUEUE); + ax25->idle = ax25_dev_get_value(dev, AX25_VALUES_IDLE); + + ax25->dama_slave = 0; + + if (ax25_dev_get_value(dev, AX25_VALUES_AXDEFMODE)) { + ax25->modulus = EMODULUS; + ax25->window = ax25_dev_get_value(dev, AX25_VALUES_EWINDOW); + } else { + ax25->modulus = MODULUS; + ax25->window = ax25_dev_get_value(dev, AX25_VALUES_WINDOW); + } + + ax25->backoff = ax25_dev_get_value(dev, AX25_VALUES_BACKOFF); +} + +int ax25_send_frame(struct sk_buff *skb, ax25_address *src, ax25_address *dest, + ax25_digi *digi, struct device *dev) { ax25_cb *ax25; if (skb == NULL) return 0; - skb->h.raw = skb->data + 15; - /* * Look for an existing connection. 
*/ @@ -547,7 +844,12 @@ int ax25_send_frame(struct sk_buff *skb, ax25_address *src, ax25_address *dest, continue; if (ax25cmp(&ax25->source_addr, src) == 0 && ax25cmp(&ax25->dest_addr, dest) == 0 && ax25->device == dev) { - ax25_output(ax25, skb); + if (ax25_queue_length(ax25, skb) > ax25->maxqueue * ax25->window) { + kfree_skb(skb, FREE_WRITE); + } else { + ax25_output(ax25, skb); + } + ax25->idletimer = ax25->idle; return 1; /* It already existed */ } } @@ -555,12 +857,30 @@ int ax25_send_frame(struct sk_buff *skb, ax25_address *src, ax25_address *dest, if ((ax25 = ax25_create_cb()) == NULL) return 0; - ax25->device = dev; + ax25_fillin_cb(ax25, dev); - memcpy(&ax25->source_addr, src, sizeof(ax25_address)); - memcpy(&ax25->dest_addr, dest, sizeof(ax25_address)); + ax25->source_addr = *src; + ax25->dest_addr = *dest; - ax25_establish_data_link(ax25); + if (digi != NULL) { + if ((ax25->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { + ax25_free_cb(ax25); + return 0; + } + *ax25->digipeat = *digi; + } else { + ax25_rt_build_path(ax25, dest, dev); + } + + if (ax25_dev_is_dama_slave(ax25->device)) + dama_establish_data_link(ax25); + else + ax25_establish_data_link(ax25); + + /* idle timeouts only for mode vc connections */ + + ax25->idletimer = ax25->idle; + ax25_insert_socket(ax25); ax25->state = AX25_STATE_1; @@ -572,39 +892,47 @@ int ax25_send_frame(struct sk_buff *skb, ax25_address *src, ax25_address *dest, return 1; /* We had to create it */ } -/*******************************************************************************************************************\ -* * -* Routing rules for AX.25: Basically iterate over the active interfaces * -* * -\*******************************************************************************************************************/ +/* + * Return the state of an AX.25 link given source, destination, and + * device. 
+ */ +int ax25_link_up(ax25_address *src, ax25_address *dest, struct device *dev) +{ + ax25_cb *ax25; + + for (ax25 = ax25_list; ax25 != NULL; ax25 = ax25->next) { + if (ax25->sk != NULL && ax25->sk->type != SOCK_SEQPACKET) + continue; + + if (ax25cmp(&ax25->source_addr, src) == 0 && ax25cmp(&ax25->dest_addr, dest) == 0 && ax25->device == dev) + return 1; + } + + return 0; +} +/* + * Find the AX.25 device that matches the hardware address supplied. + */ struct device *ax25rtr_get_dev(ax25_address *addr) { struct device *dev; - for (dev = dev_base; dev != NULL; dev = dev->next) { - if ((dev->flags & IFF_UP) && dev->type == ARPHRD_AX25) { /* Active kiss ax25 mode */ - if (ax25cmp(addr, (ax25_address *)dev->dev_addr) == 0) - return dev; - } - } + for (dev = dev_base; dev != NULL; dev = dev->next) + if ((dev->flags & IFF_UP) && dev->type == ARPHRD_AX25 && + ax25cmp(addr, (ax25_address*) dev->dev_addr) == 0) + return dev; return NULL; } -/*******************************************************************************************************************\ -* * -* Handling for system calls applied via the various interfaces to an AX25 socket object * -* * -\*******************************************************************************************************************/ - +/* + * Handling for system calls applied via the various interfaces to an + * AX25 socket object + */ static int ax25_fcntl(struct socket *sock, unsigned int cmd, unsigned long arg) { - switch(cmd) - { - default: - return(-EINVAL); - } + return -EINVAL; } static int ax25_setsockopt(struct socket *sock, int level, int optname, @@ -627,39 +955,68 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, if ((err = verify_area(VERIFY_READ, optval, sizeof(int))) != 0) return err; - opt = get_fs_long((unsigned long *)optval); - + get_user(opt, (int *)optval); + switch (optname) { case AX25_WINDOW: - if (opt < 1 || opt > 7) - return -EINVAL; - sk->ax25->window = opt; + if 
(sk->protinfo.ax25->modulus == MODULUS) { + if (opt < 1 || opt > 7) + return -EINVAL; + } else { + if (opt < 1 || opt > 63) + return -EINVAL; + } + sk->protinfo.ax25->window = opt; return 0; - + case AX25_T1: if (opt < 1) return -EINVAL; - sk->ax25->t1 = opt * PR_SLOWHZ; + sk->protinfo.ax25->rtt = (opt * PR_SLOWHZ) / 2; return 0; case AX25_T2: if (opt < 1) return -EINVAL; - sk->ax25->t2 = opt * PR_SLOWHZ; + sk->protinfo.ax25->t2 = opt * PR_SLOWHZ; return 0; - + case AX25_N2: if (opt < 1 || opt > 31) return -EINVAL; - sk->ax25->n2 = opt; + sk->protinfo.ax25->n2 = opt; return 0; - + case AX25_T3: if (opt < 1) return -EINVAL; - sk->ax25->t3 = opt * PR_SLOWHZ; + sk->protinfo.ax25->t3 = opt * PR_SLOWHZ; return 0; - + + case AX25_IDLE: + if (opt < 0) + return -EINVAL; + sk->protinfo.ax25->idle = opt * PR_SLOWHZ * 60; + return 0; + + case AX25_BACKOFF: + sk->protinfo.ax25->backoff = opt ? 1 : 0; + return 0; + + case AX25_EXTSEQ: + sk->protinfo.ax25->modulus = opt ? EMODULUS : MODULUS; + return 0; + + case AX25_HDRINCL: + sk->protinfo.ax25->hdrincl = opt ? 
1 : 0; + return 0; + + case AX25_PACLEN: + if (opt < 16 || opt > 65535) + return -EINVAL; + sk->protinfo.ax25->paclen = opt; + return 0; + default: return -ENOPROTOOPT; } @@ -679,28 +1036,48 @@ static int ax25_getsockopt(struct socket *sock, int level, int optname, if (level != SOL_AX25) return -EOPNOTSUPP; - + switch (optname) { case AX25_WINDOW: - val = sk->ax25->window; + val = sk->protinfo.ax25->window; break; case AX25_T1: - val = sk->ax25->t1 / PR_SLOWHZ; + val = (sk->protinfo.ax25->t1 * 2) / PR_SLOWHZ; break; - + case AX25_T2: - val = sk->ax25->t2 / PR_SLOWHZ; + val = sk->protinfo.ax25->t2 / PR_SLOWHZ; break; - + case AX25_N2: - val = sk->ax25->n2; + val = sk->protinfo.ax25->n2; break; - + case AX25_T3: - val = sk->ax25->t3 / PR_SLOWHZ; + val = sk->protinfo.ax25->t3 / PR_SLOWHZ; break; - + + case AX25_IDLE: + val = sk->protinfo.ax25->idle / (PR_SLOWHZ * 60); + break; + + case AX25_BACKOFF: + val = sk->protinfo.ax25->backoff; + break; + + case AX25_EXTSEQ: + val = (sk->protinfo.ax25->modulus == EMODULUS); + break; + + case AX25_HDRINCL: + val = sk->protinfo.ax25->hdrincl; + break; + + case AX25_PACLEN: + val = sk->protinfo.ax25->paclen; + break; + default: return -ENOPROTOOPT; } @@ -708,12 +1085,12 @@ static int ax25_getsockopt(struct socket *sock, int level, int optname, if ((err = verify_area(VERIFY_WRITE, optlen, sizeof(int))) != 0) return err; - put_fs_long(sizeof(int), (unsigned long *)optlen); + put_user(sizeof(int), optlen); if ((err = verify_area(VERIFY_WRITE, optval, sizeof(int))) != 0) return err; - put_fs_long(val, (unsigned long *)optval); + put_user(val, (int *) optval); return 0; } @@ -748,53 +1125,60 @@ static int ax25_create(struct socket *sock, int protocol) struct sock *sk; ax25_cb *ax25; - if ((sk = (struct sock *)kmalloc(sizeof(*sk), GFP_ATOMIC)) == NULL) - return -ENOMEM; - - if ((ax25 = ax25_create_cb()) == NULL) { - kfree_s(sk, sizeof(*sk)); - return -ENOMEM; - } - - sk->type = sock->type; - switch (sock->type) { case SOCK_DGRAM: - case 
SOCK_SEQPACKET: - if (protocol == 0) + if (protocol == 0 || protocol == AF_AX25) protocol = AX25_P_TEXT; break; + case SOCK_SEQPACKET: + switch (protocol) { + case 0: + case AF_AX25: /* For CLX */ + protocol = AX25_P_TEXT; + break; + case AX25_P_SEGMENT: +#ifdef CONFIG_INET + case AX25_P_ARP: + case AX25_P_IP: +#endif +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) + case AX25_P_NETROM: +#endif +#if defined(CONFIG_ROSE) || defined(CONFIG_ROSE_MODULE) + case AX25_P_ROSE: +#endif + return -ESOCKTNOSUPPORT; + default: + break; + } + break; case SOCK_RAW: break; default: - kfree_s((void *)sk, sizeof(*sk)); - kfree_s((void *)ax25, sizeof(*ax25)); return -ESOCKTNOSUPPORT; } + if ((sk = sk_alloc(GFP_ATOMIC)) == NULL) + return -ENOMEM; + + if ((ax25 = ax25_create_cb()) == NULL) { + sk_free(sk); + return -ENOMEM; + } + skb_queue_head_init(&sk->receive_queue); skb_queue_head_init(&sk->write_queue); skb_queue_head_init(&sk->back_log); sk->socket = sock; + sk->type = sock->type; sk->protocol = protocol; - sk->dead = 0; sk->next = NULL; - sk->broadcast = 0; + sk->allocation = GFP_KERNEL; sk->rcvbuf = SK_RMEM_MAX; sk->sndbuf = SK_WMEM_MAX; - sk->wmem_alloc = 0; - sk->rmem_alloc = 0; - sk->inuse = 0; - sk->debug = 0; - sk->prot = NULL; /* So we use default free mechanisms */ - sk->err = 0; - sk->localroute = 0; - sk->send_head = NULL; sk->state = TCP_CLOSE; - sk->shutdown = 0; sk->priority = SOPRI_NORMAL; - sk->ack_backlog = 0; sk->mtu = AX25_MTU; /* 256 */ sk->zapped = 1; @@ -808,8 +1192,8 @@ static int ax25_create(struct socket *sock, int protocol) sk->sleep = sock->wait; } - ax25->sk = sk; - sk->ax25 = ax25; + ax25->sk = sk; + sk->protinfo.ax25 = ax25; return 0; } @@ -819,26 +1203,27 @@ static struct sock *ax25_make_new(struct sock *osk, struct device *dev) struct sock *sk; ax25_cb *ax25; - if ((sk = (struct sock *)kmalloc(sizeof(*sk), GFP_ATOMIC)) == NULL) + if ((sk = sk_alloc(GFP_ATOMIC)) == NULL) return NULL; if ((ax25 = ax25_create_cb()) == NULL) { - 
kfree_s(sk, sizeof(*sk)); + sk_free(sk); return NULL; } + ax25_fillin_cb(ax25, dev); + sk->type = osk->type; sk->socket = osk->socket; - switch(osk->type) - { + switch (osk->type) { case SOCK_DGRAM: break; case SOCK_SEQPACKET: break; default: - kfree_s((void *)sk, sizeof(*sk)); - kfree_s((void *)ax25, sizeof(*ax25)); + sk_free(sk); + ax25_free_cb(ax25); return NULL; } @@ -846,25 +1231,14 @@ static struct sock *ax25_make_new(struct sock *osk, struct device *dev) skb_queue_head_init(&sk->write_queue); skb_queue_head_init(&sk->back_log); - sk->dead = 0; sk->next = NULL; sk->priority = osk->priority; - sk->broadcast = 0; sk->protocol = osk->protocol; sk->rcvbuf = osk->rcvbuf; sk->sndbuf = osk->sndbuf; - sk->wmem_alloc = 0; - sk->rmem_alloc = 0; - sk->inuse = 0; - sk->ack_backlog = 0; - sk->prot = NULL; /* So we use default free mechanisms */ - sk->err = 0; - sk->localroute = 0; - sk->send_head = NULL; sk->debug = osk->debug; sk->state = TCP_ESTABLISHED; sk->window = osk->window; - sk->shutdown = 0; sk->mtu = osk->mtu; sk->sleep = osk->sleep; sk->zapped = osk->zapped; @@ -874,27 +1248,34 @@ static struct sock *ax25_make_new(struct sock *osk, struct device *dev) sk->write_space = def_callback1; sk->error_report = def_callback1; - ax25->rtt = osk->ax25->rtt; - ax25->t1 = osk->ax25->t1; - ax25->t2 = osk->ax25->t2; - ax25->t3 = osk->ax25->t3; - ax25->n2 = osk->ax25->n2; - - ax25->window = osk->ax25->window; - ax25->device = dev; - - memcpy(&ax25->source_addr, &osk->ax25->source_addr, sizeof(ax25_address)); + ax25->modulus = osk->protinfo.ax25->modulus; + ax25->backoff = osk->protinfo.ax25->backoff; + ax25->hdrincl = osk->protinfo.ax25->hdrincl; + ax25->rtt = osk->protinfo.ax25->rtt; + ax25->t1 = osk->protinfo.ax25->t1; + ax25->t2 = osk->protinfo.ax25->t2; + ax25->t3 = osk->protinfo.ax25->t3; + ax25->n2 = osk->protinfo.ax25->n2; + ax25->idle = osk->protinfo.ax25->idle; + ax25->paclen = osk->protinfo.ax25->paclen; + + ax25->window = osk->protinfo.ax25->window; + 
ax25->maxqueue = osk->protinfo.ax25->maxqueue; + + ax25->source_addr = osk->protinfo.ax25->source_addr; - if (osk->ax25->digipeat != NULL) { + if (osk->protinfo.ax25->digipeat != NULL) { if ((ax25->digipeat = (ax25_digi *)kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { - kfree_s(sk, sizeof(*sk)); - kfree_s(ax25, sizeof(*ax25)); + sk_free(sk); + ax25_free_cb(ax25); return NULL; } + + *ax25->digipeat = *osk->protinfo.ax25->digipeat; } - sk->ax25 = ax25; - ax25->sk = sk; + sk->protinfo.ax25 = ax25; + ax25->sk = sk; return sk; } @@ -913,51 +1294,70 @@ static int ax25_release(struct socket *sock, struct socket *peer) if (sk == NULL) return 0; if (sk->type == SOCK_SEQPACKET) { - switch (sk->ax25->state) { + switch (sk->protinfo.ax25->state) { case AX25_STATE_0: - sk->dead = 1; + sk->state = TCP_CLOSE; + sk->shutdown |= SEND_SHUTDOWN; sk->state_change(sk); - ax25_destroy_socket(sk->ax25); + sk->dead = 1; + ax25_destroy_socket(sk->protinfo.ax25); break; case AX25_STATE_1: - ax25_send_control(sk->ax25, DISC | PF, C_RESPONSE); - sk->ax25->state = AX25_STATE_0; - sk->dead = 1; + ax25_send_control(sk->protinfo.ax25, DISC, POLLON, C_COMMAND); + sk->protinfo.ax25->state = AX25_STATE_0; + sk->state = TCP_CLOSE; + sk->shutdown |= SEND_SHUTDOWN; sk->state_change(sk); - ax25_destroy_socket(sk->ax25); + sk->dead = 1; + ax25_destroy_socket(sk->protinfo.ax25); break; case AX25_STATE_2: - ax25_send_control(sk->ax25, DM | PF, C_RESPONSE); - sk->ax25->state = AX25_STATE_0; - sk->dead = 1; + if (sk->protinfo.ax25->dama_slave) + ax25_send_control(sk->protinfo.ax25, DISC, POLLON, C_COMMAND); + else + ax25_send_control(sk->protinfo.ax25, DM, POLLON, C_RESPONSE); + sk->protinfo.ax25->state = AX25_STATE_0; + sk->state = TCP_CLOSE; + sk->shutdown |= SEND_SHUTDOWN; sk->state_change(sk); - ax25_destroy_socket(sk->ax25); + sk->dead = 1; + ax25_destroy_socket(sk->protinfo.ax25); break; case AX25_STATE_3: case AX25_STATE_4: - ax25_clear_tx_queue(sk->ax25); - sk->ax25->n2count = 0; - 
ax25_send_control(sk->ax25, DISC | PF, C_COMMAND); - sk->ax25->t3timer = 0; - sk->ax25->t1timer = sk->ax25->t1 = ax25_calculate_t1(sk->ax25); - sk->ax25->state = AX25_STATE_2; + ax25_clear_queues(sk->protinfo.ax25); + sk->protinfo.ax25->n2count = 0; + if (!sk->protinfo.ax25->dama_slave) { + ax25_send_control(sk->protinfo.ax25, DISC, POLLON, C_COMMAND); + sk->protinfo.ax25->t3timer = 0; + } else { + sk->protinfo.ax25->t3timer = sk->protinfo.ax25->t3; /* DAMA slave timeout */ + } + sk->protinfo.ax25->t1timer = sk->protinfo.ax25->t1 = ax25_calculate_t1(sk->protinfo.ax25); + sk->protinfo.ax25->state = AX25_STATE_2; + sk->state = TCP_CLOSE; + sk->shutdown |= SEND_SHUTDOWN; sk->state_change(sk); - sk->dead = 1; + sk->dead = 1; + sk->destroy = 1; break; default: break; } } else { - sk->dead = 1; + sk->state = TCP_CLOSE; + sk->shutdown |= SEND_SHUTDOWN; sk->state_change(sk); - ax25_destroy_socket(sk->ax25); + sk->dead = 1; + ax25_destroy_socket(sk->protinfo.ax25); } sock->data = NULL; + sk->socket = NULL; /* Not used, but we should do this. **/ return 0; } @@ -968,7 +1368,7 @@ static int ax25_release(struct socket *sock, struct socket *peer) * BSD 4.4 ADDIFADDR type support. 
It is however small and trivially backward * compatible 8) */ -static int ax25_bind(struct socket *sock, struct sockaddr *uaddr,int addr_len) +static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk; struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr; @@ -978,44 +1378,49 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr,int addr_len) sk = (struct sock *)sock->data; if (sk->zapped == 0) - return -EIO; + return -EINVAL; if (addr_len != sizeof(struct sockaddr_ax25) && addr_len != sizeof(struct full_sockaddr_ax25)) return -EINVAL; -#ifdef DONTDO - if (ax25_find_socket(&addr->fsa_ax25.sax25_call, sk->type) != NULL) { - if (sk->debug) - printk("AX25: bind failed: in use\n"); - return -EADDRINUSE; - } -#endif - call = ax25_findbyuid(current->euid); if (call == NULL && ax25_uid_policy && !suser()) - return -EPERM; + return -EACCES; if (call == NULL) - memcpy(&sk->ax25->source_addr, &addr->fsa_ax25.sax25_call, sizeof(ax25_address)); + sk->protinfo.ax25->source_addr = addr->fsa_ax25.sax25_call; else - memcpy(&sk->ax25->source_addr, call, sizeof(ax25_address)); + sk->protinfo.ax25->source_addr = *call; + + if (sk->debug) + printk("AX25: source address set to %s\n", ax2asc(&sk->protinfo.ax25->source_addr)); if (addr_len == sizeof(struct full_sockaddr_ax25) && addr->fsa_ax25.sax25_ndigis == 1) { - if (!suser()) - return -EPERM; - call = &addr->fsa_digipeater[0]; + if (ax25cmp(&addr->fsa_digipeater[0], &null_ax25_address) == 0) { + dev = NULL; + if (sk->debug) + printk("AX25: bound to any device\n"); + } else { + if ((dev = ax25rtr_get_dev(&addr->fsa_digipeater[0])) == NULL) { + if (sk->debug) + printk("AX25: bind failed - no device\n"); + return -EADDRNOTAVAIL; + } + if (sk->debug) + printk("AX25: bound to device %s\n", dev->name); + } } else { - call = &addr->fsa_ax25.sax25_call; - } - - if ((dev = ax25rtr_get_dev(call)) == NULL) { + if ((dev = ax25rtr_get_dev(&addr->fsa_ax25.sax25_call)) == NULL) { 
+ if (sk->debug) + printk("AX25: bind failed - no device\n"); + return -EADDRNOTAVAIL; + } if (sk->debug) - printk("AX25 bind failed: no device\n"); - return -EADDRNOTAVAIL; + printk("AX25: bound to device %s\n", dev->name); } - sk->ax25->device = dev; - ax25_insert_socket(sk->ax25); + ax25_fillin_cb(sk->protinfo.ax25, dev); + ax25_insert_socket(sk->protinfo.ax25); sk->zapped = 0; @@ -1048,51 +1453,55 @@ static int ax25_connect(struct socket *sock, struct sockaddr *uaddr, sk->state = TCP_CLOSE; sock->state = SS_UNCONNECTED; - if (addr_len > sizeof(*addr)) { - int ct = 0; - int ndigi = addr_len - sizeof(*addr); - ax25_address *ap = (ax25_address *)(((char *)addr) + sizeof(*addr)); - - /* Size is an exact number of digipeaters ? */ - if (ndigi % sizeof(ax25_address)) - return -EINVAL; + if (addr_len != sizeof(struct sockaddr_ax25) && addr_len != sizeof(struct full_sockaddr_ax25)) + return -EINVAL; - ndigi /= sizeof(ax25_address); + /* + * Handle digi-peaters to be used. + */ + if (addr_len == sizeof(struct full_sockaddr_ax25) && addr->sax25_ndigis != 0) { + int ct = 0; + struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)addr; /* Valid number of digipeaters ? 
*/ - if (ndigi < 1 || ndigi > 6) + if (addr->sax25_ndigis < 1 || addr->sax25_ndigis > AX25_MAX_DIGIS) return -EINVAL; - if (sk->ax25->digipeat == NULL) { - if ((sk->ax25->digipeat = (ax25_digi *)kmalloc(sizeof(ax25_digi), GFP_KERNEL)) == NULL) - return -ENOMEM; + if (sk->protinfo.ax25->digipeat == NULL) { + if ((sk->protinfo.ax25->digipeat = (ax25_digi *)kmalloc(sizeof(ax25_digi), GFP_KERNEL)) == NULL) + return -ENOBUFS; } - sk->ax25->digipeat->ndigi = ndigi; + sk->protinfo.ax25->digipeat->ndigi = addr->sax25_ndigis; - while (ct < ndigi) { - sk->ax25->digipeat->repeated[ct] = 0; - memcpy(&sk->ax25->digipeat->calls[ct], &ap[ct], sizeof(ax25_address)); + while (ct < addr->sax25_ndigis) { + sk->protinfo.ax25->digipeat->repeated[ct] = 0; + sk->protinfo.ax25->digipeat->calls[ct] = fsa->fsa_digipeater[ct]; ct++; } - sk->ax25->digipeat->lastrepeat = 0; - addr_len -= ndigi * sizeof(ax25_address); + sk->protinfo.ax25->digipeat->lastrepeat = 0; } - if (addr_len != sizeof(struct sockaddr_ax25)) - return -EINVAL; - - if (sk->zapped) { /* Must bind first - autobinding in this may or may not work */ - if ((err = ax25_rt_autobind(sk->ax25, &addr->sax25_call)) < 0) + /* + * Must bind first - autobinding in this may or may not work. If + * the socket is already bound, check to see if the device has + * been filled in, error if it hasn't. 
+ */ + if (sk->zapped) { + if ((err = ax25_rt_autobind(sk->protinfo.ax25, &addr->sax25_call)) < 0) return err; - ax25_insert_socket(sk->ax25); /* Finish the bind */ + ax25_fillin_cb(sk->protinfo.ax25, sk->protinfo.ax25->device); + ax25_insert_socket(sk->protinfo.ax25); + } else { + if (sk->protinfo.ax25->device == NULL) + return -EHOSTUNREACH; } - if (sk->type == SOCK_SEQPACKET && ax25_find_cb(&sk->ax25->source_addr, &addr->sax25_call, sk->ax25->device) != NULL) - return -EBUSY; /* Already such a connection */ - - memcpy(&sk->ax25->dest_addr, &addr->sax25_call, sizeof(ax25_address)); + if (sk->type == SOCK_SEQPACKET && ax25_find_cb(&sk->protinfo.ax25->source_addr, &addr->sax25_call, sk->protinfo.ax25->device) != NULL) + return -EADDRINUSE; /* Already such a connection */ + + sk->protinfo.ax25->dest_addr = addr->sax25_call; /* First the easy one */ if (sk->type != SOCK_SEQPACKET) { @@ -1104,9 +1513,14 @@ static int ax25_connect(struct socket *sock, struct sockaddr *uaddr, /* Move to connecting socket, ax.25 lapb WAIT_UA.. 
*/ sock->state = SS_CONNECTING; sk->state = TCP_SYN_SENT; - ax25_establish_data_link(sk->ax25); - sk->ax25->state = AX25_STATE_1; - ax25_set_timer(sk->ax25); /* Start going SABM SABM until a UA or a give up and DM */ + + if (ax25_dev_is_dama_slave(sk->protinfo.ax25->device)) + dama_establish_data_link(sk->protinfo.ax25); + else + ax25_establish_data_link(sk->protinfo.ax25); + + sk->protinfo.ax25->state = AX25_STATE_1; + ax25_set_timer(sk->protinfo.ax25); /* Start going SABM SABM until a UA or a give up and DM */ /* Now the loop */ if (sk->state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) @@ -1123,10 +1537,11 @@ static int ax25_connect(struct socket *sock, struct sockaddr *uaddr, } } - if (sk->state != TCP_ESTABLISHED) { /* Not in ABM, not in WAIT_UA -> failed */ + if (sk->state != TCP_ESTABLISHED) { + /* Not in ABM, not in WAIT_UA -> failed */ sti(); sock->state = SS_UNCONNECTED; - return -sk->err; /* Always set at this point */ + return sock_error(sk); /* Always set at this point */ } sock->state = SS_CONNECTED; @@ -1148,7 +1563,7 @@ static int ax25_accept(struct socket *sock, struct socket *newsock, int flags) struct sk_buff *skb; if (newsock->data) - kfree_s(newsock->data, sizeof(struct sock)); + sk_free(newsock->data); newsock->data = NULL; @@ -1160,8 +1575,10 @@ static int ax25_accept(struct socket *sock, struct socket *newsock, int flags) if (sk->state != TCP_LISTEN) return -EINVAL; - /* The write queue this time is holding sockets ready to use - hooked into the SABM we saved */ + /* + * The write queue this time is holding sockets ready to use + * hooked into the SABM we saved + */ do { cli(); if ((skb = skb_dequeue(&sk->receive_queue)) == NULL) { @@ -1193,7 +1610,6 @@ static int ax25_accept(struct socket *sock, struct socket *newsock, int flags) static int ax25_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer) { - ax25_address *addr; struct full_sockaddr_ax25 *sax = (struct full_sockaddr_ax25 *)uaddr; struct sock *sk; unsigned 
char ndigi, i; @@ -1203,31 +1619,35 @@ static int ax25_getname(struct socket *sock, struct sockaddr *uaddr, if (peer != 0) { if (sk->state != TCP_ESTABLISHED) return -ENOTCONN; - addr = &sk->ax25->dest_addr; + + sax->fsa_ax25.sax25_family = AF_AX25; + sax->fsa_ax25.sax25_call = sk->protinfo.ax25->dest_addr; + sax->fsa_ax25.sax25_ndigis = 0; + *uaddr_len = sizeof(struct full_sockaddr_ax25); + + if (sk->protinfo.ax25->digipeat != NULL) { + ndigi = sk->protinfo.ax25->digipeat->ndigi; + sax->fsa_ax25.sax25_ndigis = ndigi; + for (i = 0; i < ndigi; i++) + sax->fsa_digipeater[i] = sk->protinfo.ax25->digipeat->calls[i]; + } } else { - addr = &sk->ax25->source_addr; - } - - sax->fsa_ax25.sax25_family = AF_AX25; - memcpy(&sax->fsa_ax25.sax25_call, addr, sizeof(ax25_address)); - sax->fsa_ax25.sax25_ndigis = 0; - *uaddr_len = sizeof(struct sockaddr_ax25); + sax->fsa_ax25.sax25_family = AF_AX25; + sax->fsa_ax25.sax25_call = sk->protinfo.ax25->source_addr; + sax->fsa_ax25.sax25_ndigis = 1; + *uaddr_len = sizeof(struct full_sockaddr_ax25); - /* This will supply digipeat path on both getpeername() and getsockname() */ - if (sk->ax25->digipeat != NULL) { - ndigi = sk->ax25->digipeat->ndigi; - sax->fsa_ax25.sax25_ndigis = ndigi; - *uaddr_len += sizeof(ax25_address) * ndigi; - for (i = 0; i < ndigi; i++) - memcpy(&sax->fsa_digipeater[i], &sk->ax25->digipeat->calls[i], sizeof(ax25_address)); + if (sk->protinfo.ax25->device != NULL) + memcpy(&sax->fsa_digipeater[0], sk->protinfo.ax25->device->dev_addr, AX25_ADDR_LEN); + else + sax->fsa_digipeater[0] = null_ax25_address; } - + return 0; } -int ax25_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *ptype) +static int ax25_rcv(struct sk_buff *skb, struct device *dev, ax25_address *dev_addr, struct packet_type *ptype) { - unsigned char *data = skb->data; struct sock *make; struct sock *sk; int type = 0; @@ -1236,52 +1656,76 @@ int ax25_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *ptype) ax25_address src, 
dest; struct sock *raw; int mine = 0; + int dama; - skb->sk = NULL; /* Initially we don't know who its for */ + /* + * Process the AX.25/LAPB frame. + */ + + skb->h.raw = skb->data; - if ((*data & 0x0F) != 0) { - kfree_skb(skb, FREE_READ); /* Not a KISS data frame */ +#ifdef CONFIG_FIREWALL + if (call_in_firewall(PF_AX25, skb->dev, skb->h.raw, NULL) != FW_ACCEPT) { + kfree_skb(skb, FREE_READ); return 0; } +#endif - data++; - /* * Parse the address header. */ - if ((data = ax25_parse_addr(data, skb->len + dev->hard_header_len - 1, &src, &dest, &dp, &type)) == NULL) { + + if (ax25_parse_addr(skb->data, skb->len, &src, &dest, &dp, &type, &dama) == NULL) { kfree_skb(skb, FREE_READ); return 0; } - - /* - * Send the frame to the AX.25 auto-router - */ - ax25_rt_rx_frame(&src, dev); - + /* * Ours perhaps ? */ if (dp.lastrepeat + 1 < dp.ndigi) { /* Not yet digipeated completely */ - if (ax25cmp(&dp.calls[dp.lastrepeat + 1], (ax25_address *)dev->dev_addr) == 0) { + if (ax25cmp(&dp.calls[dp.lastrepeat + 1], dev_addr) == 0) { + struct device *dev_out = dev; + + skb=skb_unshare(skb, GFP_ATOMIC, FREE_READ); + if(skb==NULL) + return 0; + /* We are the digipeater. 
Mark ourselves as repeated and throw the packet back out of the same device */ dp.lastrepeat++; dp.repeated[(int)dp.lastrepeat] = 1; -#ifdef CONFIG_AX25_XDIGI - while (dp.lastrepeat + 1 < dp.ndigi) { - struct device *dev_scan; - if ((dev_scan = ax25rtr_get_dev(&dp.calls[dp.lastrepeat + 1])) == NULL) - break; - dp.lastrepeat++; - dp.repeated[(int)dp.lastrepeat] = 1; - dev = dev_scan; + + if (ax25_dev_get_value(dev, AX25_VALUES_DIGI) & AX25_DIGI_XBAND) { + while (dp.lastrepeat + 1 < dp.ndigi) { + struct device *dev_scan; + if ((dev_scan = ax25rtr_get_dev(&dp.calls[dp.lastrepeat + 1])) == NULL) + break; + dp.lastrepeat++; + dp.repeated[(int)dp.lastrepeat] = 1; + dev_out = dev_scan; + } + if (dev != dev_out && (ax25_dev_get_value(dev_out, AX25_VALUES_DIGI) & AX25_DIGI_XBAND) == 0) { + kfree_skb(skb, FREE_READ); + return 0; + } + } + + if (dev == dev_out && (ax25_dev_get_value(dev, AX25_VALUES_DIGI) & AX25_DIGI_INBAND) == 0) { + kfree_skb(skb, FREE_READ); + return 0; + } + + build_ax25_addr(skb->data, &src, &dest, &dp, type, MODULUS); +#ifdef CONFIG_FIREWALL + if (call_fw_firewall(PF_AX25, skb->dev, skb->data, NULL) != FW_ACCEPT) { + kfree_skb(skb, FREE_READ); + return 0; } #endif - build_ax25_addr(skb->data + 1, &src, &dest, &dp, type); - skb->len += dev->hard_header_len; + skb->arp = 1; - dev_queue_xmit(skb, dev, SOPRI_NORMAL); + ax25_queue_xmit(skb, dev_out, SOPRI_NORMAL); } else { kfree_skb(skb, FREE_READ); } @@ -1290,26 +1734,23 @@ int ax25_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *ptype) } /* - * Adjust the lengths for digipeated input + * Pull of the AX.25 headers leaving the CTRL/PID bytes */ - skb->len -= sizeof(ax25_address) * dp.ndigi; - - /* For our port addreses ? */ - if (ax25cmp(&dest, (ax25_address *)dev->dev_addr) == 0) + skb_pull(skb, size_ax25_addr(&dp)); + + /* For our port addresses ? 
*/ + if (ax25cmp(&dest, dev_addr) == 0) mine = 1; -#ifdef CONFIG_NETROM - /* Also match on any NET/ROM callsign */ - if (!mine && nr_dev_get(&dest) != NULL) + /* Also match on any registered callsign from L3/4 */ + if (!mine && ax25_listen_mine(&dest, dev)) mine = 1; -#endif - if ((*data & ~0x10) == LAPB_UI) { /* UI frame - bypass LAPB processing */ - data++; - skb->h.raw = data + 1; /* skip pid */ + if ((*skb->data & ~0x10) == LAPB_UI) { /* UI frame - bypass LAPB processing */ + skb->h.raw = skb->data + 2; /* skip control and pid */ if ((raw = ax25_addr_match(&dest)) != NULL) - ax25_send_to_raw(raw, skb, (int)*data); + ax25_send_to_raw(raw, skb, skb->data[1]); if (!mine && ax25cmp(&dest, (ax25_address *)dev->broadcast) != 0) { kfree_skb(skb, FREE_READ); @@ -1317,14 +1758,15 @@ int ax25_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *ptype) } /* Now we are pointing at the pid byte */ - switch (*data++) { + switch (skb->data[1]) { #ifdef CONFIG_INET case AX25_P_IP: - ax25_ip_mode_set(&src, dev, 'D'); + skb_pull(skb,2); /* drop PID/CTRL */ ip_rcv(skb, dev, ptype); /* Note ptype here is the wrong one, fix me later */ break; case AX25_P_ARP: + skb_pull(skb,2); arp_rcv(skb, dev, ptype); /* Note ptype here is wrong... */ break; #endif @@ -1334,11 +1776,15 @@ int ax25_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *ptype) if (sk->rmem_alloc >= sk->rcvbuf) { kfree_skb(skb, FREE_READ); } else { + /* + * Remove the control and PID. + */ + skb_pull(skb, 2); skb_queue_tail(&sk->receive_queue, skb); skb->sk = sk; - sk->rmem_alloc += skb->mem_len; + atomic_add(skb->truesize, &sk->rmem_alloc); if (!sk->dead) - sk->data_ready(sk, skb->len - 2); + sk->data_ready(sk, skb->len); } } else { kfree_skb(skb, FREE_READ); @@ -1352,31 +1798,51 @@ int ax25_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *ptype) return 0; } + + /* + * Is connected mode supported on this device ? 
+ * If not, should we DM the incoming frame (except DMs) or + * silently ignore them. For now we stay quiet. + */ + if (!ax25_dev_get_value(dev, AX25_VALUES_CONMODE)) { + kfree_skb(skb, FREE_READ); + return 0; + } /* LAPB */ + + /* AX.25 state 1-4 */ + if ((ax25 = ax25_find_cb(&dest, &src, dev)) != NULL) { - skb->h.raw = data; - /* Process the frame. If it is queued up internally it returns one otherwise we - free it immediately. This routine itself wakes the user context layers so we - do no further work */ - if (ax25_process_rx_frame(ax25, skb, type) == 0) + /* + * Process the frame. If it is queued up internally it returns one otherwise we + * free it immediately. This routine itself wakes the user context layers so we + * do no further work + */ + if (ax25_process_rx_frame(ax25, skb, type, dama) == 0) kfree_skb(skb, FREE_READ); return 0; } - if ((data[0] & 0xEF) != SABM) { + /* AX.25 state 0 (disconnected) */ + + /* a) received not a SABM(E) */ + + if ((*skb->data & ~PF) != SABM && (*skb->data & ~PF) != SABME) { /* * Never reply to a DM. Also ignore any connects for * addresses that are not our interfaces and not a socket. */ - if ((data[0] & 0xEF) != DM && mine) + if ((*skb->data & ~PF) != DM && mine) ax25_return_dm(dev, &src, &dest, &dp); kfree_skb(skb, FREE_READ); return 0; } + /* b) received SABM(E) */ + if ((sk = ax25_find_listener(&dest, dev, SOCK_SEQPACKET)) != NULL) { if (sk->ack_backlog == sk->max_ack_backlog || (make = ax25_make_new(sk, dev)) == NULL) { if (mine) @@ -1386,26 +1852,7 @@ int ax25_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *ptype) return 0; } - ax25 = make->ax25; - - /* - * Sort out any digipeated paths. 
- */ - if (dp.ndigi != 0 && ax25->digipeat == NULL && (ax25->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { - kfree_skb(skb, FREE_READ); - ax25_destroy_socket(ax25); - return 0; - } - - if (dp.ndigi == 0) { - if (ax25->digipeat != NULL) { - kfree_s(ax25->digipeat, sizeof(ax25_digi)); - ax25->digipeat = NULL; - } - } else { - /* Reverse the source SABM's path */ - ax25_digi_invert(&dp, ax25->digipeat); - } + ax25 = make->protinfo.ax25; skb_queue_head(&sk->receive_queue, skb); @@ -1415,39 +1862,58 @@ int ax25_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *ptype) sk->ack_backlog++; } else { -#ifdef CONFIG_NETROM if (!mine) { kfree_skb(skb, FREE_READ); return 0; } - - if (dp.ndigi != 0) { - ax25_return_dm(dev, &src, &dest, &dp); - kfree_skb(skb, FREE_READ); - return 0; - } - + if ((ax25 = ax25_create_cb()) == NULL) { ax25_return_dm(dev, &src, &dest, &dp); kfree_skb(skb, FREE_READ); return 0; } -#else - if (mine) - ax25_return_dm(dev, &src, &dest, &dp); + ax25_fillin_cb(ax25, dev); + ax25->idletimer = ax25->idle; + } + + ax25->source_addr = dest; + ax25->dest_addr = src; + + /* + * Sort out any digipeated paths. 
+ */ + if (dp.ndigi != 0 && ax25->digipeat == NULL && (ax25->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { kfree_skb(skb, FREE_READ); + ax25_destroy_socket(ax25); return 0; -#endif } - memcpy(&ax25->source_addr, &dest, sizeof(ax25_address)); - memcpy(&ax25->dest_addr, &src, sizeof(ax25_address)); + if (dp.ndigi == 0) { + if (ax25->digipeat != NULL) { + kfree_s(ax25->digipeat, sizeof(ax25_digi)); + ax25->digipeat = NULL; + } + } else { + /* Reverse the source SABM's path */ + ax25_digi_invert(&dp, ax25->digipeat); + } + + if ((*skb->data & ~PF) == SABME) { + ax25->modulus = EMODULUS; + ax25->window = ax25_dev_get_value(dev, AX25_VALUES_EWINDOW); + } else { + ax25->modulus = MODULUS; + ax25->window = ax25_dev_get_value(dev, AX25_VALUES_WINDOW); + } ax25->device = dev; - ax25_send_control(ax25, UA | PF, C_RESPONSE); + ax25_send_control(ax25, UA, POLLON, C_RESPONSE); + if (dama) ax25_dama_on(ax25); /* bke 951121 */ + + ax25->dama_slave = dama; ax25->t3timer = ax25->t3; ax25->state = AX25_STATE_3; @@ -1457,7 +1923,7 @@ int ax25_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *ptype) if (sk != NULL) { if (!sk->dead) - sk->data_ready(sk, skb->len - 2); + sk->data_ready(sk, skb->len); } else { kfree_skb(skb, FREE_READ); } @@ -1465,12 +1931,28 @@ int ax25_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *ptype) return 0; } -static int ax25_sendto(struct socket *sock, void *ubuf, int len, int noblock, - unsigned flags, struct sockaddr *usip, int addr_len) +/* + * Receive an AX.25 frame via a SLIP interface. 
+ */ +static int kiss_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *ptype) +{ + skb->sk = NULL; /* Initially we don't know who it's for */ + + if ((*skb->data & 0x0F) != 0) { + kfree_skb(skb, FREE_READ); /* Not a KISS data frame */ + return 0; + } + + skb_pull(skb, AX25_KISS_HEADER_LEN); /* Remove the KISS byte */ + + return ax25_rcv(skb, dev, (ax25_address *)dev->dev_addr, ptype); +} + + +static int ax25_sendmsg(struct socket *sock, struct msghdr *msg, int len, int noblock, int flags) { struct sock *sk = (struct sock *)sock->data; - struct sockaddr_ax25 *usax = (struct sockaddr_ax25 *)usip; - unsigned char *uaddr = (unsigned char *)usip; + struct sockaddr_ax25 *usax = (struct sockaddr_ax25 *)msg->msg_name; int err; struct sockaddr_ax25 sax; struct sk_buff *skb; @@ -1479,68 +1961,67 @@ static int ax25_sendto(struct socket *sock, void *ubuf, int len, int noblock, ax25_digi *dp; ax25_digi dtmp; int lv; + int addr_len = msg->msg_namelen; - if (sk->err) { - err = sk->err; - sk->err = 0; - return -err; - } + if (sk->err) + return sock_error(sk); - if (flags) + if (flags || msg->msg_control) return -EINVAL; if (sk->zapped) return -EADDRNOTAVAIL; - if (sk->ax25->device == NULL) + if (sk->shutdown & SEND_SHUTDOWN) { + send_sig(SIGPIPE, current, 0); + return -EPIPE; + } + + if (sk->protinfo.ax25->device == NULL) return -ENETUNREACH; if (usax) { - int ndigi = addr_len - sizeof(sax); - if (addr_len < sizeof(sax)) + if (addr_len != sizeof(struct sockaddr_ax25) && addr_len != sizeof(struct full_sockaddr_ax25)) return -EINVAL; - - /* Trailing digipeaters on address ?? */ - if (addr_len > sizeof(sax)) { - int ct = 0; - - ax25_address *ap = (ax25_address *)(((char *)uaddr) + sizeof(sax)); - /* Size is an exact number of digipeaters ? 
*/ - if (ndigi % sizeof(ax25_address)) - return -EINVAL; - ndigi /= sizeof(ax25_address); + if (usax->sax25_family != AF_AX25) + return -EINVAL; + if (addr_len == sizeof(struct full_sockaddr_ax25) && usax->sax25_ndigis != 0) { + int ct = 0; + struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)usax; /* Valid number of digipeaters ? */ - if (ndigi < 1 || ndigi > 6) + if (usax->sax25_ndigis < 1 || usax->sax25_ndigis > AX25_MAX_DIGIS) return -EINVAL; - /* Copy data into digipeat structure */ - while (ct < ndigi) { + dtmp.ndigi = usax->sax25_ndigis; + + while (ct < usax->sax25_ndigis) { dtmp.repeated[ct] = 0; - memcpy(&dtmp.calls[ct], &ap[ct], sizeof(ax25_address)); + dtmp.calls[ct] = fsa->fsa_digipeater[ct]; ct++; } dtmp.lastrepeat = 0; - dtmp.ndigi = ndigi; - addr_len -= ndigi * sizeof(ax25_address); } - memcpy(&sax, usax, sizeof(sax)); - if (sk->type == SOCK_SEQPACKET && memcmp(&sk->ax25->dest_addr, &sax.sax25_call, sizeof(ax25_address)) != 0) + sax = *usax; + if (sk->type == SOCK_SEQPACKET && ax25cmp(&sk->protinfo.ax25->dest_addr, &sax.sax25_call) != 0) return -EISCONN; - if (sax.sax25_family != AF_AX25) - return -EINVAL; - if (ndigi != 0) - dp = &dtmp; - else + if (usax->sax25_ndigis == 0) dp = NULL; + else + dp = &dtmp; } else { + /* + * FIXME: 1003.1g - if the socket is like this because + * it has become closed (not started closed) and is VC + * we ought to SIGPIPE, EPIPE + */ if (sk->state != TCP_ESTABLISHED) return -ENOTCONN; sax.sax25_family = AF_AX25; - memcpy(&sax.sax25_call, &sk->ax25->dest_addr, sizeof(ax25_address)); - dp = sk->ax25->digipeat; + sax.sax25_call = sk->protinfo.ax25->dest_addr; + dp = sk->protinfo.ax25->digipeat; } if (sk->debug) @@ -1550,88 +2031,87 @@ static int ax25_sendto(struct socket *sock, void *ubuf, int len, int noblock, if (sk->debug) printk("AX.25: sendto: building packet.\n"); - size = 2 + len + 1 + size_ax25_addr(dp); - /* 2 bytes for PID and (U)I frame byte: 15+ for KISS data & calls */ + /* Assume the worst case */ 
+ size = len + 3 + size_ax25_addr(dp) + AX25_BPQ_HEADER_LEN; - if ((skb = sock_alloc_send_skb(sk, size, 0, &err)) == NULL) + if ((skb = sock_alloc_send_skb(sk, size, 0, 0, &err)) == NULL) return err; skb->sk = sk; skb->free = 1; skb->arp = 1; - skb->len = size; - - asmptr = skb->data; - if (sk->debug) { - printk("Building AX.25 Header (dp=%p).\n", dp); - if (dp != 0) - printk("Num digipeaters=%d\n", dp->ndigi); - } - /* Build an AX.25 header */ - *asmptr++ = 0; /* KISS data */ - asmptr += (lv = build_ax25_addr(asmptr, &sk->ax25->source_addr, &sax.sax25_call, dp, C_COMMAND)); - if (sk->debug) - printk("Built header (%d bytes)\n",lv); - skb->h.raw = asmptr; - - if (sk->debug) - printk("base=%p pos=%p\n", skb->data, asmptr); - *asmptr++ = LAPB_UI; /* Datagram - will get replaced for I frames */ - *asmptr++ = sk->protocol; /* AX.25 TEXT by default */ - + skb_reserve(skb, size - len); + if (sk->debug) printk("AX.25: Appending user data\n"); /* User data follows immediately after the AX.25 data */ - memcpy_fromfs(asmptr, ubuf, len); + memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); + + /* Add the PID, usually AX25_TEXT */ + asmptr = skb_push(skb, 1); + *asmptr = sk->protocol; + if (sk->debug) printk("AX.25: Transmitting buffer\n"); + if (sk->type == SOCK_SEQPACKET) { /* Connected mode sockets go via the LAPB machine */ if (sk->state != TCP_ESTABLISHED) { kfree_skb(skb, FREE_WRITE); return -ENOTCONN; } - ax25_output(sk->ax25, skb); /* Shove it onto the queue and kick */ + + ax25_output(sk->protinfo.ax25, skb); /* Shove it onto the queue and kick */ + return len; } else { + asmptr = skb_push(skb, 1 + size_ax25_addr(dp)); + + if (sk->debug) { + printk("Building AX.25 Header (dp=%p).\n", dp); + if (dp != 0) + printk("Num digipeaters=%d\n", dp->ndigi); + } + + /* Build an AX.25 header */ + asmptr += (lv = build_ax25_addr(asmptr, &sk->protinfo.ax25->source_addr, &sax.sax25_call, dp, C_COMMAND, MODULUS)); + + if (sk->debug) + printk("Built header (%d bytes)\n",lv); + + 
skb->h.raw = asmptr; + + if (sk->debug) + printk("base=%p pos=%p\n", skb->data, asmptr); + + *asmptr = LAPB_UI; + /* Datagram frames go straight out of the door as UI */ - dev_queue_xmit(skb, sk->ax25->device, SOPRI_NORMAL); + ax25_queue_xmit(skb, sk->protinfo.ax25->device, SOPRI_NORMAL); + return len; } + } -static int ax25_send(struct socket *sock, void *ubuf, int size, int noblock, unsigned flags) -{ - return ax25_sendto(sock, ubuf, size, noblock, flags, NULL, 0); -} - -static int ax25_write(struct socket *sock, char *ubuf, int size, int noblock) -{ - return ax25_send(sock, ubuf, size, noblock, 0); -} - -static int ax25_recvfrom(struct socket *sock, void *ubuf, int size, int noblock, - unsigned flags, struct sockaddr *sip, int *addr_len) +static int ax25_recvmsg(struct socket *sock, struct msghdr *msg, int size, int noblock, int flags, int *addr_len) { struct sock *sk = (struct sock *)sock->data; - struct sockaddr_ax25 *sax = (struct sockaddr_ax25 *)sip; - char *addrptr = (char *)sip; - int copied = 0; + struct sockaddr_ax25 *sax = (struct sockaddr_ax25 *)msg->msg_name; + int copied, length; struct sk_buff *skb; int er; + int dama; - if (sk->err) { - er = -sk->err; - sk->err = 0; - return er; - } - if (addr_len != NULL) *addr_len = sizeof(*sax); - /* This works for seqpacket too. The receiver has ordered the queue for us! We do one quick check first though */ + /* + * This works for seqpacket too. The receiver has ordered the + * queue for us! We do one quick check first though + */ if (sk->type == SOCK_SEQPACKET && sk->state != TCP_ESTABLISHED) return -ENOTCONN; @@ -1639,51 +2119,61 @@ static int ax25_recvfrom(struct socket *sock, void *ubuf, int size, int noblock, if ((skb = skb_recv_datagram(sk, flags, noblock, &er)) == NULL) return er; - copied= (size < skb->len) ? size : skb->len; - skb_copy_datagram(skb, sk->type == SOCK_SEQPACKET ? 
2 : 0, ubuf, copied); + if (sk->protinfo.ax25->hdrincl) { + length = skb->len + (skb->data - skb->h.raw); + } else { + if (sk->type == SOCK_SEQPACKET) + skb_pull(skb, 1); /* Remove PID */ + length = skb->len; + skb->h.raw = skb->data; + } + + copied = length; + + if (copied > size) { + copied = size; + msg->msg_flags |= MSG_TRUNC; + } + + skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); if (sax) { - struct sockaddr_ax25 addr; ax25_digi digi; ax25_address dest; - unsigned char *dp = skb->data; - int ct = 0; - - ax25_parse_addr(dp, skb->len, NULL, &dest, &digi, NULL); - addr.sax25_family = AF_AX25; - memcpy(&addr.sax25_call, &dest, sizeof(ax25_address)); - memcpy(sax,&addr, sizeof(*sax)); - addrptr += sizeof(*sax); - - while (ct < digi.ndigi) { - memcpy(addrptr, &digi. calls[ct], 7); - addrptr += 7; - ct++; - } - if (addr_len) - *addr_len = sizeof(*sax) + 7 * digi.ndigi; - } - skb_free_datagram(skb); + if (addr_len == (int *)0) + return -EINVAL; + if (*addr_len != sizeof(struct sockaddr_ax25) && *addr_len != sizeof(struct full_sockaddr_ax25)) + return -EINVAL; - return copied; -} + ax25_parse_addr(skb->data, skb->len, NULL, &dest, &digi, NULL, &dama); -static int ax25_recv(struct socket *sock, void *ubuf, int size , int noblock, - unsigned flags) -{ - struct sock *sk = (struct sock *)sock->data; + sax->sax25_family = AF_AX25; + /* We set this correctly, even though we may not let the + application know the digi calls further down (because it + did NOT ask to know them). This could get political... 
**/ + sax->sax25_ndigis = digi.ndigi; + sax->sax25_call = dest; - if (sk->zapped) - return -ENOTCONN; + *addr_len = sizeof(struct sockaddr_ax25); - return ax25_recvfrom(sock, ubuf, size, noblock, flags, NULL, NULL); -} + if (*addr_len == sizeof(struct full_sockaddr_ax25) && sax->sax25_ndigis != 0) { + int ct = 0; + struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)sax; -static int ax25_read(struct socket *sock, char *ubuf, int size, int noblock) -{ - return ax25_recv(sock, ubuf, size, noblock, 0); -} + while (ct < digi.ndigi) { + fsa->fsa_digipeater[ct] = digi.calls[ct]; + ct++; + } + + *addr_len = sizeof(struct full_sockaddr_ax25); + } + } + + skb_free_datagram(sk, skb); + + return copied; +} static int ax25_shutdown(struct socket *sk, int how) { @@ -1706,23 +2196,22 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) switch (cmd) { case TIOCOUTQ: - if ((err = verify_area(VERIFY_WRITE, (void *)arg, sizeof(unsigned long))) != 0) + if ((err = verify_area(VERIFY_WRITE, (void *)arg, sizeof(int))) != 0) return err; amount = sk->sndbuf - sk->wmem_alloc; if (amount < 0) amount = 0; - put_fs_long(amount, (unsigned long *)arg); + put_user(amount, (int *)arg); return 0; - case TIOCINQ: - { + case TIOCINQ: { struct sk_buff *skb; /* These two are safe on a single CPU system as only user tasks fiddle here */ if ((skb = skb_peek(&sk->receive_queue)) != NULL) amount = skb->len; - if ((err = verify_area(VERIFY_WRITE, (void *)arg, sizeof(unsigned long))) != 0) + if ((err = verify_area(VERIFY_WRITE, (void *)arg, sizeof(int))) != 0) return err; - put_fs_long(amount, (unsigned long *)arg); + put_user(amount, (int *)arg); return 0; } @@ -1732,33 +2221,44 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return -ENOENT; if ((err = verify_area(VERIFY_WRITE,(void *)arg,sizeof(struct timeval))) != 0) return err; - memcpy_tofs((void *)arg, &sk->stamp, sizeof(struct timeval)); + copy_to_user((void *)arg, &sk->stamp, 
sizeof(struct timeval)); return 0; } return -EINVAL; case SIOCAX25ADDUID: /* Add a uid to the uid/call map table */ case SIOCAX25DELUID: /* Delete a uid from the uid/call map table */ - case SIOCAX25GETUID: - { + case SIOCAX25GETUID: { struct sockaddr_ax25 sax25; if ((err = verify_area(VERIFY_READ, (void *)arg, sizeof(struct sockaddr_ax25))) != 0) return err; - memcpy_fromfs(&sax25, (void *)arg, sizeof(sax25)); + copy_from_user(&sax25, (void *)arg, sizeof(sax25)); return ax25_uid_ioctl(cmd, &sax25); } case SIOCAX25NOUID: /* Set the default policy (default/bar) */ if ((err = verify_area(VERIFY_READ, (void *)arg, sizeof(unsigned long))) != 0) return err; - if(!suser()) + if (!suser()) return -EPERM; - amount = get_fs_long((void *)arg); + get_user(amount, (long *)arg); if (amount > AX25_NOUID_BLOCK) return -EINVAL; ax25_uid_policy = amount; return 0; + case SIOCADDRT: + case SIOCDELRT: + case SIOCAX25OPTRT: + if (!suser()) + return -EPERM; + return ax25_rt_ioctl(cmd, (void *)arg); + + case SIOCAX25CTLCON: + if (!suser()) + return -EPERM; + return ax25_ctl_ioctl(cmd, (void *)arg); + case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: @@ -1772,25 +2272,26 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return -EINVAL; default: - return(dev_ioctl(cmd, (void *)arg)); + return dev_ioctl(cmd, (void *)arg); } /*NOTREACHED*/ - return(0); + return 0; } -int ax25_get_info(char *buffer, char **start, off_t offset, int length) + +static int ax25_get_info(char *buffer, char **start, off_t offset, int length, int dummy) { ax25_cb *ax25; struct device *dev; - char *devname; + const char *devname; int len = 0; off_t pos = 0; off_t begin = 0; - + cli(); - len += sprintf(buffer, "dest_addr src_addr dev st vs vr va t1 t2 t3 n2 rtt wnd Snd-Q Rcv-Q\n"); + len += sprintf(buffer, "dest_addr src_addr dev st vs vr va t1 t2 t3 idle n2 rtt wnd paclen dama Snd-Q Rcv-Q\n"); for (ax25 = ax25_list; ax25 != NULL; ax25 = ax25->next) { if ((dev = ax25->device) 
== NULL) @@ -1800,7 +2301,7 @@ int ax25_get_info(char *buffer, char **start, off_t offset, int length) len += sprintf(buffer + len, "%-9s ", ax2asc(&ax25->dest_addr)); - len += sprintf(buffer + len, "%-9s %-3s %2d %2d %2d %2d %3d/%03d %2d/%02d %3d/%03d %2d/%02d %3d %3d", + len += sprintf(buffer + len, "%-9s %-4s %2d %3d %3d %3d %3d/%03d %2d/%02d %3d/%03d %3d/%03d %2d/%02d %3d %3d %5d", ax2asc(&ax25->source_addr), devname, ax25->state, ax25->vs, ax25->vr, ax25->va, @@ -1810,12 +2311,17 @@ int ax25_get_info(char *buffer, char **start, off_t offset, int length) ax25->t2 / PR_SLOWHZ, ax25->t3timer / PR_SLOWHZ, ax25->t3 / PR_SLOWHZ, + ax25->idletimer / (PR_SLOWHZ * 60), + ax25->idle / (PR_SLOWHZ * 60), ax25->n2count, ax25->n2, ax25->rtt / PR_SLOWHZ, - ax25->window); + ax25->window, + ax25->paclen); + + len += sprintf(buffer + len, " %s", ax25->dama_slave ? " slave" : " no"); if (ax25->sk != NULL) { - len += sprintf(buffer + len, " %5ld %5ld\n", + len += sprintf(buffer + len, " %5d %5d\n", ax25->sk->wmem_alloc, ax25->sk->rmem_alloc); } else { @@ -1854,28 +2360,25 @@ static struct proto_ops ax25_proto_ops = { ax25_socketpair, ax25_accept, ax25_getname, - ax25_read, - ax25_write, ax25_select, ax25_ioctl, ax25_listen, - ax25_send, - ax25_recv, - ax25_sendto, - ax25_recvfrom, ax25_shutdown, ax25_setsockopt, ax25_getsockopt, ax25_fcntl, + ax25_sendmsg, + ax25_recvmsg }; -/* Called by socket.c on kernel start up */ - +/* + * Called by socket.c on kernel start up + */ static struct packet_type ax25_packet_type = { 0, /* MUTTER ntohs(ETH_P_AX25),*/ 0, /* copy */ - ax25_rcv, + kiss_rcv, NULL, NULL, }; @@ -1885,21 +2388,92 @@ static struct notifier_block ax25_dev_notifier = { 0 }; +static struct symbol_table ax25_syms = { +#include <linux/symtab_begin.h> + X(ax25_encapsulate), + X(ax25_rebuild_header), +#if defined(CONFIG_NETROM_MODULE) || defined(CONFIG_ROSE_MODULE) + X(ax25_findbyuid), + X(ax25_link_up), + X(ax25_linkfail_register), + X(ax25_linkfail_release), + 
X(ax25_listen_register), + X(ax25_listen_release), + X(ax25_protocol_register), + X(ax25_protocol_release), + X(ax25_send_frame), + X(ax25_uid_policy), + X(ax25cmp), + X(ax2asc), + X(asc2ax), + X(null_ax25_address), +#endif +#include <linux/symtab_end.h> +}; + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry proc_ax25_route = { + PROC_NET_AX25_ROUTE, 10, "ax25_route", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + ax25_rt_get_info +}; +static struct proc_dir_entry proc_ax25 = { + PROC_NET_AX25, 4, "ax25", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + ax25_get_info +}; +static struct proc_dir_entry proc_ax25_calls = { + PROC_NET_AX25_CALLS, 10, "ax25_calls", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + ax25_cs_get_info +}; +#endif + void ax25_proto_init(struct net_proto *pro) { sock_register(ax25_proto_ops.family, &ax25_proto_ops); ax25_packet_type.type = htons(ETH_P_AX25); dev_add_pack(&ax25_packet_type); register_netdevice_notifier(&ax25_dev_notifier); - printk("GW4PTS/G4KLX AX.25 for Linux. Version 0.29 ALPHA for Linux NET3.029 (Linux 1.3.0)\n"); + register_symtab(&ax25_syms); + ax25_register_sysctl(); + +#ifdef CONFIG_PROC_FS + proc_net_register(&proc_ax25_route); + proc_net_register(&proc_ax25); + proc_net_register(&proc_ax25_calls); +#endif + + printk(KERN_INFO "G4KLX/GW4PTS AX.25 for Linux. Version 0.34 for Linux NET3.037 (Linux 2.1)\n"); } -/*******************************************************************************************************************\ -* * -* Driver encapsulation support: Moved out of SLIP because a) it should be here * -* b) for HDLC cards * -* * -\*******************************************************************************************************************/ +/* + * A small shim to dev_queue_xmit to add the KISS control byte. 
+ */ +void ax25_queue_xmit(struct sk_buff *skb, struct device *dev, int pri) +{ + unsigned char *ptr; + +#ifdef CONFIG_FIREWALL + if (call_out_firewall(PF_AX25, skb->dev, skb->data, NULL) != FW_ACCEPT) { + dev_kfree_skb(skb, FREE_WRITE); + return; + } +#endif + + skb->protocol = htons(ETH_P_AX25); + + ptr = skb_push(skb, 1); + *ptr++ = 0; /* KISS */ + dev_queue_xmit(skb, dev, pri); +} + +/* + * IP over AX.25 encapsulation. + */ /* * Shove an AX.25 UI header on an IP packet and handle ARP @@ -1907,18 +2481,21 @@ void ax25_proto_init(struct net_proto *pro) #ifdef CONFIG_INET -int ax25_encapsulate(unsigned char *buff, struct device *dev, unsigned short type, void *daddr, - void *saddr, unsigned len, struct sk_buff *skb) +int ax25_encapsulate(struct sk_buff *skb, struct device *dev, unsigned short type, void *daddr, + void *saddr, unsigned len) { /* header is an AX.25 UI frame from us to them */ + unsigned char *buff = skb_push(skb, AX25_HEADER_LEN); + *buff++ = 0; /* KISS DATA */ if (daddr != NULL) memcpy(buff, daddr, dev->addr_len); /* Address specified */ + buff[6] &= ~LAPB_C; buff[6] &= ~LAPB_E; - buff[6] |= SSID_SPARE; - buff += 7; + buff[6] |= SSSID_SPARE; + buff += AX25_ADDR_LEN; if (saddr != NULL) memcpy(buff, saddr, dev->addr_len); @@ -1927,8 +2504,9 @@ int ax25_encapsulate(unsigned char *buff, struct device *dev, unsigned short typ buff[6] &= ~LAPB_C; buff[6] |= LAPB_E; - buff[6] |= SSID_SPARE; - buff += 7; + buff[6] |= SSSID_SPARE; + buff += AX25_ADDR_LEN; + *buff++ = LAPB_UI; /* UI */ /* Append a suitable AX.25 PID */ @@ -1940,33 +2518,109 @@ int ax25_encapsulate(unsigned char *buff, struct device *dev, unsigned short typ case ETH_P_ARP: *buff++ = AX25_P_ARP; break; - default: + printk(KERN_ERR "AX.25 wrong protocol type 0x%x2.2\n", type); *buff++ = 0; break; } - if (daddr != NULL) - return 17; + if (daddr != NULL) + return AX25_HEADER_LEN; - return -17; /* Unfinished header */ + return -AX25_HEADER_LEN; /* Unfinished header */ } int 
ax25_rebuild_header(unsigned char *bp, struct device *dev, unsigned long dest, struct sk_buff *skb) { + struct sk_buff *ourskb; + int mode; + if (arp_find(bp + 1, dest, dev, dev->pa_addr, skb)) return 1; + if (bp[16] == AX25_P_IP) { + mode = ax25_ip_mode_get((ax25_address *)(bp + 1), dev); + if (mode == 'V' || (mode == ' ' && ax25_dev_get_value(dev, AX25_VALUES_IPDEFMODE))) { + /* + * This is a workaround to try to keep the device locking + * straight until skb->free=0 is abolished post 1.4. + * + * We clone the buffer and release the original thereby + * keeping it straight + * + * Note: we report 1 back so the caller will + * not feed the frame direct to the physical device + * We don't want that to happen. (It won't be upset + * as we have pulled the frame from the queue by + * freeing it). + */ + if ((ourskb = skb_clone(skb, GFP_ATOMIC)) == NULL) { + dev_kfree_skb(skb, FREE_WRITE); + return 1; + } + + ourskb->sk = skb->sk; + + if (ourskb->sk != NULL) + atomic_add(ourskb->truesize, &ourskb->sk->wmem_alloc); + + dev_kfree_skb(skb, FREE_WRITE); + + skb_pull(ourskb, AX25_HEADER_LEN - 1); /* Keep PID */ + + ax25_send_frame(ourskb, (ax25_address *)(bp + 8), (ax25_address *)(bp + 1), NULL, dev); + + return 1; + } + } + bp[7] &= ~LAPB_C; bp[7] &= ~LAPB_E; - bp[7] |= SSID_SPARE; + bp[7] |= SSSID_SPARE; + bp[14] &= ~LAPB_C; bp[14] |= LAPB_E; - bp[14] |= SSID_SPARE; - - return 0; + bp[14] |= SSSID_SPARE; + + /* + * dl1bke 960317: we use ax25_queue_xmit here to allow mode datagram + * over ethernet. I don't know if this is valid, though. 
+ */ + ax25_dg_build_path(skb, (ax25_address *)(bp + 1), dev); + ax25_queue_xmit(skb, dev, SOPRI_NORMAL); + + return 1; } #endif +#ifdef MODULE +int init_module(void) +{ + ax25_proto_init(NULL); + + return 0; +} + +void cleanup_module(void) +{ +#ifdef CONFIG_PROC_FS + proc_net_unregister(PROC_NET_AX25_ROUTE); + proc_net_unregister(PROC_NET_AX25); + proc_net_unregister(PROC_NET_AX25_CALLS); + proc_net_unregister(PROC_NET_AX25_ROUTE); +#endif + ax25_rt_free(); + + ax25_unregister_sysctl(); + + unregister_netdevice_notifier(&ax25_dev_notifier); + + ax25_packet_type.type = htons(ETH_P_AX25); + dev_remove_pack(&ax25_packet_type); + + sock_unregister(ax25_proto_ops.family); +} +#endif + #endif diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index ab22a8f6d..3ef1c3fdf 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -1,5 +1,5 @@ /* - * AX.25 release 029 + * AX.25 release 033 * * This is ALPHA test software. This code may break your machine, randomly fail to work with new * releases, misbehave and/or generally screw up. It might even work. @@ -24,10 +24,21 @@ * the sock structure. * AX.25 029 Alan(GW4PTS) Switched to KA9Q constant names. * Jonathan(G4KLX) Added IP mode registration. + * AX.25 030 Jonathan(G4KLX) Added AX.25 fragment reception. + * Upgraded state machine for SABME. + * Added arbitrary protocol id support. + * AX.25 031 Joerg(DL1BKE) Added DAMA support + * HaJo(DD8NE) Added Idle Disc Timer T5 + * Joerg(DL1BKE) Renamed it to "IDLE" with a slightly + * different behaviour. Fixed defrag + * routine (I hope) + * AX.25 032 Darryl(G7LED) AX.25 segmentation fixed. + * AX.25 033 Jonathan(G4KLX) Remove auto-router. + * Modularisation changes. 
*/ #include <linux/config.h> -#ifdef CONFIG_AX25 +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> @@ -44,52 +55,135 @@ #include <linux/skbuff.h> #include <net/sock.h> #include <net/ip.h> /* For ip_rcv */ -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> -#ifdef CONFIG_NETROM -#include <net/netrom.h> -#endif + +static int ax25_rx_iframe(ax25_cb *, struct sk_buff *); + +/* + * Given a fragment, queue it on the fragment queue and if the fragment + * is complete, send it back to ax25_rx_iframe. + */ +static int ax25_rx_fragment(ax25_cb *ax25, struct sk_buff *skb) +{ + struct sk_buff *skbn, *skbo; + int hdrlen, nhdrlen; + + if (ax25->fragno != 0) { + if (!(*skb->data & SEG_FIRST)) { + if ((ax25->fragno - 1) == (*skb->data & SEG_REM)) { + /* Enqueue fragment */ + ax25->fragno = *skb->data & SEG_REM; + skb_pull(skb, 1); /* skip fragno */ + ax25->fraglen += skb->len; + skb_queue_tail(&ax25->frag_queue, skb); + + /* Last fragment received ? 
*/ + if (ax25->fragno == 0) { + if ((skbn = alloc_skb(AX25_MAX_HEADER_LEN + ax25->fraglen, GFP_ATOMIC)) == NULL) { + while ((skbo = skb_dequeue(&ax25->frag_queue)) != NULL) + kfree_skb(skbo, FREE_READ); + return 1; + } + + skbn->free = 1; + skbn->arp = 1; + skbn->dev = ax25->device; + + if (ax25->sk != NULL) { + skbn->sk = ax25->sk; + atomic_add(skbn->truesize, &ax25->sk->rmem_alloc); + } + + skb_reserve(skbn, AX25_MAX_HEADER_LEN); + + /* Get first fragment from queue */ + skbo = skb_dequeue(&ax25->frag_queue); + hdrlen = skbo->data - skbo->h.raw; + nhdrlen = hdrlen - 2; + + skb_push(skbo, hdrlen); + skb_push(skbn, nhdrlen); + skbn->h.raw = skbn->data; + + /* Copy AX.25 headers */ + memcpy(skbn->data, skbo->data, nhdrlen); + skb_pull(skbn, nhdrlen); + skb_pull(skbo, hdrlen); + + /* Copy data from the fragments */ + do { + memcpy(skb_put(skbn, skbo->len), skbo->data, skbo->len); + kfree_skb(skbo, FREE_READ); + } while ((skbo = skb_dequeue(&ax25->frag_queue)) != NULL); + + ax25->fraglen = 0; + + if (ax25_rx_iframe(ax25, skbn) == 0) + kfree_skb(skbn, FREE_READ); + } + + return 1; + } + } + } else { + /* First fragment received */ + if (*skb->data & SEG_FIRST) { + while ((skbo = skb_dequeue(&ax25->frag_queue)) != NULL) + kfree_skb(skbo, FREE_READ); + ax25->fragno = *skb->data & SEG_REM; + skb_pull(skb, 1); /* skip fragno */ + ax25->fraglen = skb->len; + skb_queue_tail(&ax25->frag_queue, skb); + return 1; + } + } + + return 0; +} /* * This is where all valid I frames are sent to, to be dispatched to * whichever protocol requires them. 
*/ -static int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb, unsigned char *iframe) +static int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb) { - int queued = 0; + int (*func)(struct sk_buff *, ax25_cb *); + volatile int queued = 0; + unsigned char pid; + + if (skb == NULL) return 0; + + ax25->idletimer = ax25->idle; + + pid = *skb->data; - switch (iframe[1]) { -#ifdef CONFIG_NETROM - case AX25_P_NETROM: - /* We can't handle digipeated NET/ROM frames */ - if (ax25->digipeat == NULL) - queued = nr_route_frame(skb, ax25->device); - break; -#endif #ifdef CONFIG_INET - case AX25_P_IP: - ax25_ip_mode_set(&ax25->dest_addr, ax25->device, 'V'); - skb->h.raw = ((char *)(iframe)) + 2; - skb->len -= 2; - ip_rcv(skb, skb->dev, NULL); /* Wrong ptype */ - queued = 1; - break; + if (pid == AX25_P_IP) { + skb_pull(skb, 1); /* Remove PID */ + skb->h.raw = skb->data; + ip_rcv(skb, ax25->device, NULL); /* Wrong ptype */ + return 1; + } #endif - case AX25_P_TEXT: - if (ax25->sk != NULL) { - if (sock_queue_rcv_skb(ax25->sk, skb) == 0) { - queued = 1; - } else { - ax25->condition |= OWN_RX_BUSY_CONDITION; - } - } - break; - - default: - break; + if (pid == AX25_P_SEGMENT) { + skb_pull(skb, 1); /* Remove PID */ + return ax25_rx_fragment(ax25, skb); + } + + if ((func = ax25_protocol_function(pid)) != NULL) { + skb_pull(skb, 1); /* Remove PID */ + return (*func)(skb, ax25); + } + + if (ax25->sk != NULL && ax25_dev_get_value(ax25->device, AX25_VALUES_TEXT) && ax25->sk->protocol == pid) { + if (sock_queue_rcv_skb(ax25->sk, skb) == 0) + queued = 1; + else + ax25->condition |= OWN_RX_BUSY_CONDITION; } return queued; @@ -100,29 +194,40 @@ static int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb, unsigned char *ifr * The handling of the timer(s) is in file ax25_timer.c. * Handling of state 0 and connection release is in ax25.c. 
*/ -static int ax25_state1_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char *frame, int frametype, int type) +static int ax25_state1_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int pf, int type, int dama) { - int pf = frame[0] & PF; - switch (frametype) { case SABM: - ax25_send_control(ax25, UA | pf, C_RESPONSE); + ax25->modulus = MODULUS; + ax25->window = ax25_dev_get_value(ax25->device, AX25_VALUES_WINDOW); + ax25_send_control(ax25, UA, pf, C_RESPONSE); + break; + + case SABME: + ax25->modulus = EMODULUS; + ax25->window = ax25_dev_get_value(ax25->device, AX25_VALUES_EWINDOW); + ax25_send_control(ax25, UA, pf, C_RESPONSE); break; case DISC: - ax25_send_control(ax25, DM | pf, C_RESPONSE); + ax25_send_control(ax25, DM, pf, C_RESPONSE); break; case UA: - if (pf) { + if (pf || dama) { + if (dama) ax25_dama_on(ax25); /* bke */ + ax25_calculate_rtt(ax25); ax25->t1timer = 0; ax25->t3timer = ax25->t3; + ax25->idletimer = ax25->idle; ax25->vs = 0; ax25->va = 0; ax25->vr = 0; ax25->state = AX25_STATE_3; ax25->n2count = 0; + ax25->dama_slave = dama; /* bke */ + if (ax25->sk != NULL) { ax25->sk->state = TCP_ESTABLISHED; /* For WAIT_SABM connections we will produce an accept ready socket here */ @@ -134,19 +239,27 @@ static int ax25_state1_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char case DM: if (pf) { - ax25_clear_tx_queue(ax25); - ax25->state = AX25_STATE_0; - if (ax25->sk != NULL) { - ax25->sk->state = TCP_CLOSE; - ax25->sk->err = ECONNREFUSED; - if (!ax25->sk->dead) - ax25->sk->state_change(ax25->sk); - ax25->sk->dead = 1; + if (ax25->modulus == MODULUS) { + ax25_clear_queues(ax25); + ax25->state = AX25_STATE_0; + if (ax25->sk != NULL) { + ax25->sk->state = TCP_CLOSE; + ax25->sk->err = ECONNREFUSED; + ax25->sk->shutdown |= SEND_SHUTDOWN; + if (!ax25->sk->dead) + ax25->sk->state_change(ax25->sk); + ax25->sk->dead = 1; + } + } else { + ax25->modulus = MODULUS; + ax25->window = ax25_dev_get_value(ax25->device, AX25_VALUES_WINDOW); } } 
break; default: + if (dama && pf) + ax25_send_control(ax25, SABM, POLLON, C_COMMAND); break; } @@ -158,28 +271,43 @@ static int ax25_state1_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char * The handling of the timer(s) is in file ax25_timer.c * Handling of state 0 and connection release is in ax25.c. */ -static int ax25_state2_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char *frame, int frametype, int type) +static int ax25_state2_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int pf, int type) { - int pf = frame[0] & PF; - switch (frametype) { case SABM: - ax25_send_control(ax25, DM | pf, C_RESPONSE); + case SABME: + ax25_send_control(ax25, DM, pf, C_RESPONSE); + if (ax25->dama_slave) + ax25_send_control(ax25, DISC, POLLON, C_COMMAND); break; case DISC: - ax25_send_control(ax25, UA | pf, C_RESPONSE); + ax25_send_control(ax25, UA, pf, C_RESPONSE); + if (ax25->dama_slave) { + ax25->state = AX25_STATE_0; + ax25_dama_off(ax25); + if (ax25->sk != NULL) { + ax25->sk->state = TCP_CLOSE; + ax25->sk->err = 0; + ax25->sk->shutdown |= SEND_SHUTDOWN; + if (!ax25->sk->dead) + ax25->sk->state_change(ax25->sk); + ax25->sk->dead = 1; + } + } break; case UA: if (pf) { ax25->state = AX25_STATE_0; + ax25_dama_off(ax25); if (ax25->sk != NULL) { - ax25->sk->state = TCP_CLOSE; - ax25->sk->err = 0; + ax25->sk->state = TCP_CLOSE; + ax25->sk->err = 0; + ax25->sk->shutdown |= SEND_SHUTDOWN; if (!ax25->sk->dead) ax25->sk->state_change(ax25->sk); - ax25->sk->dead = 1; + ax25->sk->dead = 1; } } break; @@ -187,12 +315,14 @@ static int ax25_state2_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char case DM: if (pf) { ax25->state = AX25_STATE_0; + ax25_dama_off(ax25); if (ax25->sk != NULL) { - ax25->sk->state = TCP_CLOSE; - ax25->sk->err = 0; + ax25->sk->state = TCP_CLOSE; + ax25->sk->err = 0; + ax25->sk->shutdown |= SEND_SHUTDOWN; if (!ax25->sk->dead) ax25->sk->state_change(ax25->sk); - ax25->sk->dead = 1; + ax25->sk->dead = 1; } } break; @@ -201,8 +331,12 
@@ static int ax25_state2_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char case REJ: case RNR: case RR: - if (pf) - ax25_send_control(ax25, DM | PF, C_RESPONSE); + if (pf) { + if (ax25->dama_slave) + ax25_send_control(ax25, DISC, POLLON, C_COMMAND); + else + ax25_send_control(ax25, DM, POLLON, C_RESPONSE); + } break; default: @@ -217,53 +351,69 @@ static int ax25_state2_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char * The handling of the timer(s) is in file ax25_timer.c * Handling of state 0 and connection release is in ax25.c. */ -static int ax25_state3_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char *frame, int frametype, int type) +static int ax25_state3_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int ns, int nr, int pf, int type, int dama) { - unsigned short nr = (frame[0] >> 5) & 7; - unsigned short ns = (frame[0] >> 1) & 7; - int pf = frame[0] & PF; int queued = 0; switch (frametype) { case SABM: - ax25_send_control(ax25, UA | pf, C_RESPONSE); + if (dama) ax25_dama_on(ax25); + ax25->modulus = MODULUS; + ax25->window = ax25_dev_get_value(ax25->device, AX25_VALUES_WINDOW); + ax25_send_control(ax25, UA, pf, C_RESPONSE); + ax25->condition = 0x00; + ax25->t1timer = 0; + ax25->t3timer = ax25->t3; + ax25->idletimer = ax25->idle; + ax25->vs = 0; + ax25->va = 0; + ax25->vr = 0; + ax25->dama_slave = dama; + break; + + case SABME: + if (dama) ax25_dama_on(ax25); + ax25->modulus = EMODULUS; + ax25->window = ax25_dev_get_value(ax25->device, AX25_VALUES_EWINDOW); + ax25_send_control(ax25, UA, pf, C_RESPONSE); ax25->condition = 0x00; ax25->t1timer = 0; ax25->t3timer = ax25->t3; + ax25->idletimer = ax25->idle; ax25->vs = 0; ax25->va = 0; ax25->vr = 0; + ax25->dama_slave = dama; break; case DISC: - ax25_clear_tx_queue(ax25); - ax25_send_control(ax25, UA | pf, C_RESPONSE); + ax25_clear_queues(ax25); + ax25_send_control(ax25, UA, pf, C_RESPONSE); ax25->t3timer = 0; ax25->state = AX25_STATE_0; + ax25_dama_off(ax25); if (ax25->sk != 
NULL) { - ax25->sk->state = TCP_CLOSE; - ax25->sk->err = 0; + ax25->sk->state = TCP_CLOSE; + ax25->sk->err = 0; + ax25->sk->shutdown |= SEND_SHUTDOWN; if (!ax25->sk->dead) ax25->sk->state_change(ax25->sk); - ax25->sk->dead = 1; + ax25->sk->dead = 1; } break; - case UA: - ax25_establish_data_link(ax25); - ax25->state = AX25_STATE_1; - break; - case DM: - ax25_clear_tx_queue(ax25); + ax25_clear_queues(ax25); ax25->t3timer = 0; ax25->state = AX25_STATE_0; - if (ax25->sk) { - ax25->sk->state = TCP_CLOSE; - ax25->sk->err = ECONNRESET; + ax25_dama_off(ax25); + if (ax25->sk != NULL) { + ax25->sk->state = TCP_CLOSE; + ax25->sk->err = ECONNRESET; + ax25->sk->shutdown |= SEND_SHUTDOWN; if (!ax25->sk->dead) ax25->sk->state_change(ax25->sk); - ax25->sk->dead = 1; + ax25->sk->dead = 1; } break; @@ -272,6 +422,7 @@ static int ax25_state3_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char ax25_check_need_response(ax25, type, pf); if (ax25_validate_nr(ax25, nr)) { ax25_check_iframes_acked(ax25, nr); + dama_check_need_response(ax25, type, pf); } else { ax25_nr_error_recovery(ax25); ax25->state = AX25_STATE_1; @@ -283,6 +434,7 @@ static int ax25_state3_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char ax25_check_need_response(ax25, type, pf); if (ax25_validate_nr(ax25, nr)) { ax25_check_iframes_acked(ax25, nr); + dama_check_need_response(ax25, type, pf); } else { ax25_nr_error_recovery(ax25); ax25->state = AX25_STATE_1; @@ -297,6 +449,8 @@ static int ax25_state3_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char ax25_calculate_rtt(ax25); ax25->t1timer = 0; ax25->t3timer = ax25->t3; + ax25_requeue_frames(ax25); + dama_check_need_response(ax25, type, pf); } else { ax25_nr_error_recovery(ax25); ax25->state = AX25_STATE_1; @@ -304,8 +458,10 @@ static int ax25_state3_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char break; case I: +#ifndef AX25_BROKEN_NETMAC if (type != C_COMMAND) break; +#endif if (!ax25_validate_nr(ax25, nr)) { 
ax25_nr_error_recovery(ax25); ax25->state = AX25_STATE_1; @@ -317,19 +473,33 @@ static int ax25_state3_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char ax25_check_iframes_acked(ax25, nr); } if (ax25->condition & OWN_RX_BUSY_CONDITION) { - if (pf) ax25_enquiry_response(ax25); + if (pf) { + if (ax25->dama_slave) + dama_enquiry_response(ax25); + else + ax25_enquiry_response(ax25); + } break; } if (ns == ax25->vr) { - queued = ax25_rx_iframe(ax25, skb, frame); + ax25->vr = (ax25->vr + 1) % ax25->modulus; + queued = ax25_rx_iframe(ax25, skb); if (ax25->condition & OWN_RX_BUSY_CONDITION) { - if (pf) ax25_enquiry_response(ax25); + ax25->vr = ns; /* ax25->vr - 1 */ + if (pf) { + if (ax25->dama_slave) + dama_enquiry_response(ax25); + else + ax25_enquiry_response(ax25); + } break; } - ax25->vr = (ax25->vr + 1) % MODULUS; ax25->condition &= ~REJECT_CONDITION; if (pf) { - ax25_enquiry_response(ax25); + if (ax25->dama_slave) + dama_enquiry_response(ax25); + else + ax25_enquiry_response(ax25); } else { if (!(ax25->condition & ACK_PENDING_CONDITION)) { ax25->t2timer = ax25->t2; @@ -338,10 +508,18 @@ static int ax25_state3_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char } } else { if (ax25->condition & REJECT_CONDITION) { - if (pf) ax25_enquiry_response(ax25); + if (pf) { + if (ax25->dama_slave) + dama_enquiry_response(ax25); + else + ax25_enquiry_response(ax25); + } } else { ax25->condition |= REJECT_CONDITION; - ax25_send_control(ax25, REJ | pf, C_RESPONSE); + if (ax25->dama_slave) + dama_enquiry_response(ax25); + else + ax25_send_control(ax25, REJ, pf, C_RESPONSE); ax25->condition &= ~ACK_PENDING_CONDITION; } } @@ -365,19 +543,38 @@ static int ax25_state3_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char * The handling of the timer(s) is in file ax25_timer.c * Handling of state 0 and connection release is in ax25.c. 
*/ -static int ax25_state4_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char *frame, int frametype, int type) +static int ax25_state4_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int ns, int nr, int pf, int type, int dama) { - unsigned short nr = (frame[0] >> 5) & 7; - unsigned short ns = (frame[0] >> 1) & 7; - int pf = frame[0] & PF; int queued = 0; switch (frametype) { case SABM: - ax25_send_control(ax25, UA | pf, C_RESPONSE); + if (dama) ax25_dama_on(ax25); + ax25->dama_slave = dama; + ax25->modulus = MODULUS; + ax25->window = ax25_dev_get_value(ax25->device, AX25_VALUES_WINDOW); + ax25_send_control(ax25, UA, pf, C_RESPONSE); + ax25->condition = 0x00; + ax25->t1timer = 0; + ax25->t3timer = ax25->t3; + ax25->idletimer = ax25->idle; + ax25->vs = 0; + ax25->va = 0; + ax25->vr = 0; + ax25->state = AX25_STATE_3; + ax25->n2count = 0; + break; + + case SABME: + if (dama) ax25_dama_on(ax25); + ax25->dama_slave = dama; + ax25->modulus = EMODULUS; + ax25->window = ax25_dev_get_value(ax25->device, AX25_VALUES_EWINDOW); + ax25_send_control(ax25, UA, pf, C_RESPONSE); ax25->condition = 0x00; ax25->t1timer = 0; ax25->t3timer = ax25->t3; + ax25->idletimer = ax25->idle; ax25->vs = 0; ax25->va = 0; ax25->vr = 0; @@ -386,34 +583,33 @@ static int ax25_state4_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char break; case DISC: - ax25_clear_tx_queue(ax25); - ax25_send_control(ax25, UA | pf, C_RESPONSE); + ax25_clear_queues(ax25); + ax25_send_control(ax25, UA, pf, C_RESPONSE); ax25->t3timer = 0; ax25->state = AX25_STATE_0; + ax25_dama_off(ax25); if (ax25->sk != NULL) { - ax25->sk->state = TCP_CLOSE; - ax25->sk->err = 0; + ax25->sk->state = TCP_CLOSE; + ax25->sk->err = 0; + ax25->sk->shutdown |= SEND_SHUTDOWN; if (!ax25->sk->dead) ax25->sk->state_change(ax25->sk); - ax25->sk->dead = 1; + ax25->sk->dead = 1; } break; - case UA: - ax25_establish_data_link(ax25); - ax25->state = AX25_STATE_1; - break; - case DM: - ax25_clear_tx_queue(ax25); + 
ax25_clear_queues(ax25); ax25->t3timer = 0; ax25->state = AX25_STATE_0; + ax25_dama_off(ax25); if (ax25->sk != NULL) { - ax25->sk->state = TCP_CLOSE; - ax25->sk->err = ECONNRESET; + ax25->sk->state = TCP_CLOSE; + ax25->sk->err = ECONNRESET; + ax25->sk->shutdown |= SEND_SHUTDOWN; if (!ax25->sk->dead) ax25->sk->state_change(ax25->sk); - ax25->sk->dead = 1; + ax25->sk->dead = 1; } break; @@ -434,10 +630,11 @@ static int ax25_state4_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char } break; } - if (type == C_COMMAND && pf) - ax25_enquiry_response(ax25); + + ax25_check_need_response(ax25, type, pf); if (ax25_validate_nr(ax25, nr)) { ax25_frames_acked(ax25, nr); + dama_check_need_response(ax25, type, pf); } else { ax25_nr_error_recovery(ax25); ax25->state = AX25_STATE_1; @@ -446,7 +643,7 @@ static int ax25_state4_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char case RR: ax25->condition &= ~PEER_RX_BUSY_CONDITION; - if (type == C_RESPONSE && pf) { + if (pf && (type == C_RESPONSE || (ax25->dama_slave && type == C_COMMAND))) { ax25->t1timer = 0; if (ax25_validate_nr(ax25, nr)) { ax25_frames_acked(ax25, nr); @@ -454,17 +651,21 @@ static int ax25_state4_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char ax25->t3timer = ax25->t3; ax25->n2count = 0; ax25->state = AX25_STATE_3; + } else { + ax25_requeue_frames(ax25); } + dama_check_need_response(ax25, type, pf); } else { ax25_nr_error_recovery(ax25); ax25->state = AX25_STATE_1; } break; } - if (type == C_COMMAND && pf) - ax25_enquiry_response(ax25); + + ax25_check_need_response(ax25, type, pf); if (ax25_validate_nr(ax25, nr)) { ax25_frames_acked(ax25, nr); + dama_check_need_response(ax25, type, pf); } else { ax25_nr_error_recovery(ax25); ax25->state = AX25_STATE_1; @@ -473,7 +674,7 @@ static int ax25_state4_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char case REJ: ax25->condition &= ~PEER_RX_BUSY_CONDITION; - if (type == C_RESPONSE && pf) { + if (pf && (type == C_RESPONSE || (ax25->dama_slave && 
type == C_COMMAND))) { ax25->t1timer = 0; if (ax25_validate_nr(ax25, nr)) { ax25_frames_acked(ax25, nr); @@ -481,17 +682,24 @@ static int ax25_state4_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char ax25->t3timer = ax25->t3; ax25->n2count = 0; ax25->state = AX25_STATE_3; + } else { + ax25_requeue_frames(ax25); } + dama_check_need_response(ax25, type, pf); } else { ax25_nr_error_recovery(ax25); ax25->state = AX25_STATE_1; } break; } - if (type == C_COMMAND && pf) - ax25_enquiry_response(ax25); + + ax25_check_need_response(ax25, type, pf); if (ax25_validate_nr(ax25, nr)) { ax25_frames_acked(ax25, nr); + if(ax25->vs != ax25->va) { + ax25_requeue_frames(ax25); + } + dama_check_need_response(ax25, type, pf); } else { ax25_nr_error_recovery(ax25); ax25->state = AX25_STATE_1; @@ -499,8 +707,10 @@ static int ax25_state4_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char break; case I: +#ifndef AX25_BROKEN_NETMAC if (type != C_COMMAND) break; +#endif if (!ax25_validate_nr(ax25, nr)) { ax25_nr_error_recovery(ax25); ax25->state = AX25_STATE_1; @@ -508,19 +718,33 @@ static int ax25_state4_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char } ax25_frames_acked(ax25, nr); if (ax25->condition & OWN_RX_BUSY_CONDITION) { - if (pf) ax25_enquiry_response(ax25); + if (pf) { + if (ax25->dama_slave) + ax25_enquiry_response(ax25); + else + dama_enquiry_response(ax25); + } break; } if (ns == ax25->vr) { - queued = ax25_rx_iframe(ax25, skb, frame); + ax25->vr = (ax25->vr + 1) % ax25->modulus; + queued = ax25_rx_iframe(ax25, skb); if (ax25->condition & OWN_RX_BUSY_CONDITION) { - if (pf) ax25_enquiry_response(ax25); + ax25->vr = ns; /* ax25->vr - 1 */ + if (pf) { + if (ax25->dama_slave) + dama_enquiry_response(ax25); + else + ax25_enquiry_response(ax25); + } break; } - ax25->vr = (ax25->vr + 1) % MODULUS; ax25->condition &= ~REJECT_CONDITION; if (pf) { - ax25_enquiry_response(ax25); + if (ax25->dama_slave) + dama_enquiry_response(ax25); + else + 
ax25_enquiry_response(ax25); } else { if (!(ax25->condition & ACK_PENDING_CONDITION)) { ax25->t2timer = ax25->t2; @@ -529,10 +753,18 @@ static int ax25_state4_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char } } else { if (ax25->condition & REJECT_CONDITION) { - if (pf) ax25_enquiry_response(ax25); + if (pf) { + if (ax25->dama_slave) + dama_enquiry_response(ax25); + else + ax25_enquiry_response(ax25); + } } else { ax25->condition |= REJECT_CONDITION; - ax25_send_control(ax25, REJ | pf, C_RESPONSE); + if (ax25->dama_slave) + dama_enquiry_response(ax25); + else + ax25_send_control(ax25, REJ, pf, C_RESPONSE); ax25->condition &= ~ACK_PENDING_CONDITION; } } @@ -554,38 +786,35 @@ static int ax25_state4_machine(ax25_cb *ax25, struct sk_buff *skb, unsigned char /* * Higher level upcall for a LAPB frame */ -int ax25_process_rx_frame(ax25_cb *ax25, struct sk_buff *skb, int type) +int ax25_process_rx_frame(ax25_cb *ax25, struct sk_buff *skb, int type, int dama) { - int queued = 0, frametype; - unsigned char *frame; + int queued = 0, frametype, ns, nr, pf; + + if (ax25->state == AX25_STATE_0) + return 0; del_timer(&ax25->timer); - frame = skb->h.raw; - - frametype = ax25_decode(frame); + frametype = ax25_decode(ax25, skb, &ns, &nr, &pf); switch (ax25->state) { case AX25_STATE_1: - queued = ax25_state1_machine(ax25, skb, frame, frametype, type); + queued = ax25_state1_machine(ax25, skb, frametype, pf, type, dama); break; case AX25_STATE_2: - queued = ax25_state2_machine(ax25, skb, frame, frametype, type); + queued = ax25_state2_machine(ax25, skb, frametype, pf, type); break; case AX25_STATE_3: - queued = ax25_state3_machine(ax25, skb, frame, frametype, type); + queued = ax25_state3_machine(ax25, skb, frametype, ns, nr, pf, type, dama); break; case AX25_STATE_4: - queued = ax25_state4_machine(ax25, skb, frame, frametype, type); - break; - default: - printk("ax25_process_rx_frame: frame received - state = %d\n", ax25->state); + queued = ax25_state4_machine(ax25, skb, 
frametype, ns, nr, pf, type, dama); break; } ax25_set_timer(ax25); - return(queued); + return queued; } #endif diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c index 73cd056c7..be265b344 100644 --- a/net/ax25/ax25_out.c +++ b/net/ax25/ax25_out.c @@ -1,5 +1,5 @@ /* - * AX.25 release 029 + * AX.25 release 033 * * This is ALPHA test software. This code may break your machine, randomly fail to work with new * releases, misbehave and/or generally screw up. It might even work. @@ -22,10 +22,17 @@ * AX.25 028a Jonathan(G4KLX) New state machine based on SDL diagrams. * AX.25 029 Alan(GW4PTS) Switched to KA9Q constant names. * Jonathan(G4KLX) Only poll when window is full. + * AX.25 030 Jonathan(G4KLX) Added fragmentation to ax25_output. + * Added support for extended AX.25. + * AX.25 031 Joerg(DL1BKE) Added DAMA support + * Joerg(DL1BKE) Modified fragmenter to fragment vanilla + * AX.25 I-Frames. Added PACLEN parameter. + * Joerg(DL1BKE) Fixed a problem with buffer allocation + * for fragments. */ #include <linux/config.h> -#ifdef CONFIG_AX25 +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> @@ -41,20 +48,108 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> -int ax25_output(ax25_cb *ax25, struct sk_buff *skb) +/* + * All outgoing AX.25 I frames pass via this routine. Therefore this is + * where the fragmentation of frames takes place. + */ +void ax25_output(ax25_cb *ax25, struct sk_buff *skb) { - skb_queue_tail(&ax25->write_queue, skb); /* Throw it on the queue */ + struct sk_buff *skbn; + unsigned char *p; + int frontlen, mtu, len, fragno, ka9qfrag, first = 1; + long flags; + + /* + * dl1bke 960301: We use the new PACLEN parameter as MTU of the AX.25 layer. 
+ * This will (hopefully) allow user programs to write() data + * w/o having to think of the maximal amount of data we can + * send with one call. It's called PACLEN to (1) avoid confusion + * with (IP) MTU and (2) TAPR calls this PACLEN, too ;-) + */ + + mtu = ax25->paclen; + + if ((skb->len - 1) > mtu) { + if (*skb->data == AX25_P_TEXT) { + skb_pull(skb, 1); /* skip PID */ + ka9qfrag = 0; + } else { + mtu -= 2; /* Allow for fragment control info */ + ka9qfrag = 1; + } + + fragno = skb->len / mtu; + if (skb->len % mtu == 0) fragno--; + + frontlen = skb_headroom(skb); /* Address space + CTRL */ + + while (skb->len > 0) { + save_flags(flags); + cli(); + /* + * do _not_ use sock_alloc_send_skb, our socket may have + * sk->shutdown set... + */ + if ((skbn = alloc_skb(mtu + 2 + frontlen, GFP_ATOMIC)) == NULL) { + restore_flags(flags); + printk(KERN_DEBUG "ax25_output: alloc_skb returned NULL\n"); + if (skb_device_locked(skb)) + skb_device_unlock(skb); + return; + } - if (ax25->state == AX25_STATE_3 || ax25->state == AX25_STATE_4) - ax25_kick(ax25); + skbn->sk = skb->sk; + + if (skbn->sk) + atomic_add(skbn->truesize, &skbn->sk->wmem_alloc); + + restore_flags(flags); + + skbn->free = 1; + skbn->arp = 1; + + len = (mtu > skb->len) ? 
skb->len : mtu; + + if (ka9qfrag == 1) { + skb_reserve(skbn, frontlen + 2); + + memcpy(skb_put(skbn, len), skb->data, len); + p = skb_push(skbn, 2); + + *p++ = AX25_P_SEGMENT; + + *p = fragno--; + if (first) { + *p |= SEG_FIRST; + first = 0; + } + } else { + skb_reserve(skbn, frontlen + 1); + memcpy(skb_put(skbn, len), skb->data, len); + p = skb_push(skbn, 1); + *p = AX25_P_TEXT; + } - return 0; + skb_pull(skb, len); + skb_queue_tail(&ax25->write_queue, skbn); /* Throw it on the queue */ + } + + skb->free = 1; + kfree_skb(skb, FREE_WRITE); + } else { + skb_queue_tail(&ax25->write_queue, skb); /* Throw it on the queue */ + } + + if (ax25->state == AX25_STATE_3 || ax25->state == AX25_STATE_4) { + if (!ax25->dama_slave) /* bke 960114: we aren't allowed to transmit */ + ax25_kick(ax25); /* in DAMA mode unless we received a Poll */ + } } /* @@ -67,13 +162,22 @@ static void ax25_send_iframe(ax25_cb *ax25, struct sk_buff *skb, int poll_bit) if (skb == NULL) return; - - frame = skb->h.raw; /* KISS + header */ - *frame = I; - *frame |= poll_bit; - *frame |= (ax25->vr << 5); - *frame |= (ax25->vs << 1); + if (ax25->modulus == MODULUS) { + frame = skb_push(skb, 1); + + *frame = I; + *frame |= (poll_bit) ? PF : 0; + *frame |= (ax25->vr << 5); + *frame |= (ax25->vs << 1); + } else { + frame = skb_push(skb, 2); + + frame[0] = I; + frame[0] |= (ax25->vs << 1); + frame[1] = (poll_bit) ? EPF : 0; + frame[1] |= (ax25->vr << 1); + } ax25_transmit_buffer(ax25, skb, C_COMMAND); } @@ -87,7 +191,7 @@ void ax25_kick(ax25_cb *ax25) del_timer(&ax25->timer); start = (skb_peek(&ax25->ack_queue) == NULL) ? ax25->va : ax25->vs; - end = (ax25->va + ax25->window) % MODULUS; + end = (ax25->va + ax25->window) % ax25->modulus; if (!(ax25->condition & PEER_RX_BUSY_CONDITION) && start != end && @@ -100,18 +204,19 @@ void ax25_kick(ax25_cb *ax25) * the window is full. Send a poll on the final I frame if * the window is filled. */ - do { - /* - * Dequeue the frame and copy it. 
- */ - skb = skb_dequeue(&ax25->write_queue); + /* + * Dequeue the frame and copy it. + */ + skb = skb_dequeue(&ax25->write_queue); + + do { if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { skb_queue_head(&ax25->write_queue, skb); - return; + break; } - next = (ax25->vs + 1) % MODULUS; + next = (ax25->vs + 1) % ax25->modulus; #ifdef notdef last = (next == end) || skb_peek(&ax25->write_queue) == NULL; #else @@ -119,8 +224,10 @@ void ax25_kick(ax25_cb *ax25) #endif /* * Transmit the frame copy. + * bke 960114: do not set the Poll bit on the last frame + * in DAMA mode. */ - ax25_send_iframe(ax25, skbn, (last) ? PF : 0); + ax25_send_iframe(ax25, skbn, (last && !ax25->dama_slave) ? POLLON : POLLOFF); ax25->vs = next; @@ -131,7 +238,7 @@ void ax25_kick(ax25_cb *ax25) #ifdef notdef } while (!last); #else - } while (!last && skb_peek(&ax25->write_queue) != NULL); + } while (!last && (skb = skb_dequeue(&ax25->write_queue)) != NULL); #endif ax25->condition &= ~ACK_PENDING_CONDITION; @@ -146,25 +253,33 @@ void ax25_kick(ax25_cb *ax25) void ax25_transmit_buffer(ax25_cb *ax25, struct sk_buff *skb, int type) { - unsigned char *ptr = skb->data; + unsigned char *ptr; if (ax25->device == NULL) { if (ax25->sk != NULL) { - ax25->sk->state = TCP_CLOSE; - ax25->sk->err = ENETUNREACH; + ax25->sk->state = TCP_CLOSE; + ax25->sk->err = ENETUNREACH; + ax25->sk->shutdown |= SEND_SHUTDOWN; if (!ax25->sk->dead) ax25->sk->state_change(ax25->sk); - ax25->sk->dead = 1; + ax25->sk->dead = 1; } return; } - *ptr++ = 0; /* KISS data */ - ptr += build_ax25_addr(ptr, &ax25->source_addr, &ax25->dest_addr, ax25->digipeat, type); + if (skb_headroom(skb) < size_ax25_addr(ax25->digipeat)) { + printk(KERN_CRIT "ax25_transmit_buffer: not enough room for digi-peaters\n"); + skb->free = 1; + kfree_skb(skb, FREE_WRITE); + return; + } + + ptr = skb_push(skb, size_ax25_addr(ax25->digipeat)); + build_ax25_addr(ptr, &ax25->source_addr, &ax25->dest_addr, ax25->digipeat, type, ax25->modulus); skb->arp = 1; - 
dev_queue_xmit(skb, ax25->device, SOPRI_NORMAL); + ax25_queue_xmit(skb, ax25->device, SOPRI_NORMAL); } /* @@ -182,8 +297,12 @@ void ax25_establish_data_link(ax25_cb *ax25) ax25->condition = 0x00; ax25->n2count = 0; - ax25_send_control(ax25, SABM | PF, C_COMMAND); - + if (ax25->modulus == MODULUS) { + ax25_send_control(ax25, SABM, POLLON, C_COMMAND); + } else { + ax25_send_control(ax25, SABME, POLLON, C_COMMAND); + } + ax25->t3timer = 0; ax25->t2timer = 0; ax25->t1timer = ax25->t1 = ax25_calculate_t1(ax25); @@ -192,9 +311,9 @@ void ax25_establish_data_link(ax25_cb *ax25) void ax25_transmit_enquiry(ax25_cb *ax25) { if (ax25->condition & OWN_RX_BUSY_CONDITION) - ax25_send_control(ax25, RNR | PF, C_COMMAND); + ax25_send_control(ax25, RNR, POLLON, C_COMMAND); else - ax25_send_control(ax25, RR | PF, C_COMMAND); + ax25_send_control(ax25, RR, POLLON, C_COMMAND); ax25->condition &= ~ACK_PENDING_CONDITION; @@ -204,9 +323,19 @@ void ax25_transmit_enquiry(ax25_cb *ax25) void ax25_enquiry_response(ax25_cb *ax25) { if (ax25->condition & OWN_RX_BUSY_CONDITION) - ax25_send_control(ax25, RNR | PF, C_RESPONSE); + ax25_send_control(ax25, RNR, POLLON, C_RESPONSE); else - ax25_send_control(ax25, RR | PF, C_RESPONSE); + ax25_send_control(ax25, RR, POLLON, C_RESPONSE); + + ax25->condition &= ~ACK_PENDING_CONDITION; +} + +void ax25_timeout_response(ax25_cb *ax25) +{ + if (ax25->condition & OWN_RX_BUSY_CONDITION) + ax25_send_control(ax25, RNR, POLLOFF, C_RESPONSE); + else + ax25_send_control(ax25, RR, POLLOFF, C_RESPONSE); ax25->condition &= ~ACK_PENDING_CONDITION; } @@ -226,10 +355,100 @@ void ax25_check_iframes_acked(ax25_cb *ax25, unsigned short nr) } } +/* + * dl1bke 960114: shouldn't ax25/dama_check_need_response reside as + * static inline void ...() in ax25.h, should it? 
;-) + */ void ax25_check_need_response(ax25_cb *ax25, int type, int pf) { - if (type == C_COMMAND && pf) + if (!ax25->dama_slave && type == C_COMMAND && pf) + ax25_enquiry_response(ax25); +} + +/* + * dl1bke 960114: transmit I frames on DAMA poll + */ +void dama_enquiry_response(ax25_cb *ax25) +{ + ax25_cb *ax25o; + + if (!(ax25->condition & PEER_RX_BUSY_CONDITION)) { + ax25_requeue_frames(ax25); + ax25_kick(ax25); + } + + if (ax25->state == AX25_STATE_1 || ax25->state == AX25_STATE_2 || + skb_peek(&ax25->ack_queue) != NULL) { + ax25_t1_timeout(ax25); + } else { + ax25->n2count = 0; + } + + ax25->t3timer = ax25->t3; + + + /* The FLEXNET DAMA master implementation refuses to send us ANY */ + /* I frame for this connection if we send a REJ here, probably */ + /* due to its frame collector scheme? A simple RR or RNR will */ + /* invoke the retransmission, and in fact REJs are superfluous */ + /* in DAMA mode anyway... */ + +#if 0 + if (ax25->condition & REJECT_CONDITION) + ax25_send_control(ax25, REJ, POLLOFF, C_RESPONSE); + else +#endif ax25_enquiry_response(ax25); + + /* Note that above response to the poll could be sent behind the */ + /* transmissions of the other channels as well... This version */ + /* gives better performance on FLEXNET nodes. (Why, Gunter?) 
*/ + + for (ax25o = ax25_list; ax25o != NULL; ax25o = ax25o->next) { + if (ax25o == ax25) + continue; + + if (ax25o->device != ax25->device) + continue; + + if (ax25o->state == AX25_STATE_1 || ax25o->state == AX25_STATE_2) { + ax25_t1_timeout(ax25o); + continue; + } + + if (!ax25o->dama_slave) + continue; + + if ( !(ax25o->condition & PEER_RX_BUSY_CONDITION) && + (ax25o->state == AX25_STATE_3 || + (ax25o->state == AX25_STATE_4 && ax25o->t1timer == 0))) { + ax25_requeue_frames(ax25o); + ax25_kick(ax25o); + } + + if (ax25o->state == AX25_STATE_1 || ax25o->state == AX25_STATE_2 || + skb_peek(&ax25o->ack_queue) != NULL) { + ax25_t1_timeout(ax25o); + } + + ax25o->t3timer = ax25o->t3; + } +} + +void dama_check_need_response(ax25_cb *ax25, int type, int pf) +{ + if (ax25->dama_slave && type == C_COMMAND && pf) + dama_enquiry_response(ax25); +} + +void dama_establish_data_link(ax25_cb *ax25) +{ + ax25->condition = 0x00; + ax25->n2count = 0; + + ax25->t3timer = ax25->t3; + ax25->t2timer = 0; + ax25->t1timer = ax25->t1 = ax25_calculate_t1(ax25); } #endif diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index b0ffcea58..b4606111e 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -1,5 +1,5 @@ /* - * AX.25 release 029 + * AX.25 release 033 * * This is ALPHA test software. This code may break your machine, randomly fail to work with new * releases, misbehave and/or generally screw up. It might even work. @@ -17,17 +17,32 @@ * * History * AX.25 020 Jonathan(G4KLX) First go. - * AX.25 022 Jonathan(G4KLX) Added the actual meat to this - we now have a nice mheard list. + * AX.25 022 Jonathan(G4KLX) Added the actual meat to this - we now have a nice heard list. * AX.25 025 Alan(GW4PTS) First cut at autobinding by route scan. * AX.25 028b Jonathan(G4KLX) Extracted AX25 control block from the * sock structure. Device removal now * removes the heard structure. * AX.25 029 Steven(GW7RRM) Added /proc information for uid/callsign mapping. 
* Jonathan(G4KLX) Handling of IP mode in the routing list and /proc entry. + * AX.25 030 Jonathan(G4KLX) Added digi-peaters to routing table, and + * ioctls to manipulate them. Added port + * configuration. + * AX.25 031 Jonathan(G4KLX) Added concept of default route. + * Joerg(DL1BKE) ax25_rt_build_path() find digipeater list and device by + * destination call. Needed for IP routing via digipeater + * Jonathan(G4KLX) Added routing for IP datagram packets. + * Joerg(DL1BKE) Changed routing for IP datagram and VC to use a default + * route if available. Does not overwrite default routes + * on route-table overflow anymore. + * Joerg(DL1BKE) Fixed AX.25 routing of IP datagram and VC, new ioctl() + * "SIOCAX25OPTRT" to set IP mode and a 'permanent' flag + * on routes. + * AX.25 033 Jonathan(G4KLX) Remove auto-router. + * Joerg(DL1BKE) Moved BPQ Ethernet driver to seperate device. */ #include <linux/config.h> -#ifdef CONFIG_AX25 +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> @@ -41,74 +56,46 @@ #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> +#include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> -#define AX25_ROUTE_MAX 40 - static struct ax25_route { struct ax25_route *next; ax25_address callsign; struct device *dev; - struct timeval stamp; - int n; + ax25_digi *digipeat; char ip_mode; } *ax25_route = NULL; -void ax25_rt_rx_frame(ax25_address *src, struct device *dev) -{ - unsigned long flags; - extern struct timeval xtime; - struct ax25_route *ax25_rt; - struct ax25_route *oldest; - int count; - - count = 0; - oldest = NULL; - - for (ax25_rt = ax25_route; ax25_rt != NULL; ax25_rt = ax25_rt->next) { - if (count == 0 || ax25_rt->stamp.tv_sec < oldest->stamp.tv_sec) - oldest = ax25_rt; - - 
if (ax25cmp(&ax25_rt->callsign, src) == 0 && ax25_rt->dev == dev) { - ax25_rt->stamp = xtime; - ax25_rt->n++; - return; - } - - count++; - } - - if (count > AX25_ROUTE_MAX) { - oldest->callsign = *src; - oldest->dev = dev; - oldest->stamp = xtime; - oldest->n = 1; - oldest->ip_mode = ' '; - return; - } +struct ax25_dev ax25_device[AX25_MAX_DEVICES] = { + {"", NULL}, {"", NULL}, {"", NULL}, {"", NULL}, {"", NULL}, + {"", NULL}, {"", NULL}, {"", NULL}, {"", NULL}, {"", NULL}, + {"", NULL}, {"", NULL}, {"", NULL}, {"", NULL}, {"", NULL}, + {"", NULL}, {"", NULL}, {"", NULL}, {"", NULL}, {"", NULL} +}; - if ((ax25_rt = (struct ax25_route *)kmalloc(sizeof(struct ax25_route), GFP_ATOMIC)) == NULL) - return; /* No space */ +static struct ax25_route *ax25_find_route(ax25_address *, struct device *); - ax25_rt->callsign = *src; - ax25_rt->dev = dev; - ax25_rt->stamp = xtime; - ax25_rt->n = 1; - ax25_rt->ip_mode = ' '; +/* + * small macro to drop non-digipeated digipeaters and reverse path + */ +static inline void ax25_route_invert(ax25_digi *in, ax25_digi *out) +{ + int k; - save_flags(flags); - cli(); + for (k = 0; k < in->ndigi; k++) + if (!in->repeated[k]) + break; - ax25_rt->next = ax25_route; - ax25_route = ax25_rt; + in->ndigi = k; - restore_flags(flags); + ax25_digi_invert(in, out); } void ax25_rt_device_down(struct device *dev) @@ -122,11 +109,15 @@ void ax25_rt_device_down(struct device *dev) if (s->dev == dev) { if (ax25_route == s) { ax25_route = s->next; + if (s->digipeat != NULL) + kfree_s((void *)s->digipeat, sizeof(ax25_digi)); kfree_s((void *)s, (sizeof *s)); } else { for (t = ax25_route; t != NULL; t = t->next) { if (t->next == s) { t->next = s->next; + if (s->digipeat != NULL) + kfree_s((void *)s->digipeat, sizeof(ax25_digi)); kfree_s((void *)s, sizeof(*s)); break; } @@ -136,37 +127,172 @@ void ax25_rt_device_down(struct device *dev) } } -int ax25_rt_get_info(char *buffer, char **start, off_t offset, int length) +int ax25_rt_ioctl(unsigned int cmd, void 
*arg) +{ + unsigned long flags; + struct ax25_route *s, *t, *ax25_rt; + struct ax25_routes_struct route; + struct ax25_route_opt_struct rt_option; + struct device *dev; + int i, err; + + switch (cmd) { + case SIOCADDRT: + if ((err = verify_area(VERIFY_READ, arg, sizeof(route))) != 0) + return err; + copy_from_user(&route, arg, sizeof(route)); + if ((dev = ax25rtr_get_dev(&route.port_addr)) == NULL) + return -EINVAL; + if (route.digi_count > AX25_MAX_DIGIS) + return -EINVAL; + for (ax25_rt = ax25_route; ax25_rt != NULL; ax25_rt = ax25_rt->next) { + if (ax25cmp(&ax25_rt->callsign, &route.dest_addr) == 0 && ax25_rt->dev == dev) { + if (ax25_rt->digipeat != NULL) { + kfree_s(ax25_rt->digipeat, sizeof(ax25_digi)); + ax25_rt->digipeat = NULL; + } + if (route.digi_count != 0) { + if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) + return -ENOMEM; + ax25_rt->digipeat->lastrepeat = 0; + ax25_rt->digipeat->ndigi = route.digi_count; + for (i = 0; i < route.digi_count; i++) { + ax25_rt->digipeat->repeated[i] = 0; + ax25_rt->digipeat->calls[i] = route.digi_addr[i]; + } + } + return 0; + } + } + if ((ax25_rt = (struct ax25_route *)kmalloc(sizeof(struct ax25_route), GFP_ATOMIC)) == NULL) + return -ENOMEM; + ax25_rt->callsign = route.dest_addr; + ax25_rt->dev = dev; + ax25_rt->digipeat = NULL; + ax25_rt->ip_mode = ' '; + if (route.digi_count != 0) { + if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { + kfree_s(ax25_rt, sizeof(struct ax25_route)); + return -ENOMEM; + } + ax25_rt->digipeat->lastrepeat = 0; + ax25_rt->digipeat->ndigi = route.digi_count; + for (i = 0; i < route.digi_count; i++) { + ax25_rt->digipeat->repeated[i] = 0; + ax25_rt->digipeat->calls[i] = route.digi_addr[i]; + } + } + save_flags(flags); + cli(); + ax25_rt->next = ax25_route; + ax25_route = ax25_rt; + restore_flags(flags); + break; + + case SIOCDELRT: + if ((err = verify_area(VERIFY_READ, arg, sizeof(route))) != 0) + return err; + copy_from_user(&route, arg, 
sizeof(route)); + if ((dev = ax25rtr_get_dev(&route.port_addr)) == NULL) + return -EINVAL; + ax25_rt = ax25_route; + while (ax25_rt != NULL) { + s = ax25_rt; + ax25_rt = ax25_rt->next; + if (s->dev == dev && ax25cmp(&route.dest_addr, &s->callsign) == 0) { + if (ax25_route == s) { + ax25_route = s->next; + if (s->digipeat != NULL) + kfree_s((void *)s->digipeat, sizeof(ax25_digi)); + kfree_s((void *)s, (sizeof *s)); + } else { + for (t = ax25_route; t != NULL; t = t->next) { + if (t->next == s) { + t->next = s->next; + if (s->digipeat != NULL) + kfree_s((void *)s->digipeat, sizeof(ax25_digi)); + kfree_s((void *)s, sizeof(*s)); + break; + } + } + } + } + } + break; + + case SIOCAX25OPTRT: + if ((err = verify_area(VERIFY_READ, arg, sizeof(rt_option))) != 0) + return err; + copy_from_user(&rt_option, arg, sizeof(rt_option)); + if ((dev = ax25rtr_get_dev(&rt_option.port_addr)) == NULL) + return -EINVAL; + for (ax25_rt = ax25_route; ax25_rt != NULL; ax25_rt = ax25_rt->next) { + if (ax25_rt->dev == dev && ax25cmp(&rt_option.dest_addr, &ax25_rt->callsign) == 0) { + switch (rt_option.cmd) { + case AX25_SET_RT_IPMODE: + switch (rt_option.arg) { + case ' ': + case 'D': + case 'V': + ax25_rt->ip_mode = rt_option.arg; + break; + default: + return -EINVAL; + } + break; + default: + return -EINVAL; + } + } + } + break; + + default: + return -EINVAL; + } + + return 0; +} + +int ax25_rt_get_info(char *buffer, char **start, off_t offset, int length, int dummy) { struct ax25_route *ax25_rt; int len = 0; off_t pos = 0; off_t begin = 0; + char *callsign; + int i; cli(); - len += sprintf(buffer, "callsign dev count time mode\n"); + len += sprintf(buffer, "callsign dev mode digipeaters\n"); for (ax25_rt = ax25_route; ax25_rt != NULL; ax25_rt = ax25_rt->next) { - len += sprintf(buffer + len, "%-9s %-3s %5d %9ld", - ax2asc(&ax25_rt->callsign), - ax25_rt->dev ? 
ax25_rt->dev->name : "???", - ax25_rt->n, - ax25_rt->stamp.tv_sec); + if (ax25cmp(&ax25_rt->callsign, &null_ax25_address) == 0) + callsign = "default"; + else + callsign = ax2asc(&ax25_rt->callsign); + len += sprintf(buffer + len, "%-9s %-4s", + callsign, + ax25_rt->dev ? ax25_rt->dev->name : "???"); switch (ax25_rt->ip_mode) { case 'V': - case 'v': - len += sprintf(buffer + len, " vc\n"); + len += sprintf(buffer + len, " vc"); break; case 'D': - case 'd': - len += sprintf(buffer + len, " dg\n"); + len += sprintf(buffer + len, " dg"); break; default: - len += sprintf(buffer + len, "\n"); + len += sprintf(buffer + len, " *"); break; } + + if (ax25_rt->digipeat != NULL) + for (i = 0; i < ax25_rt->digipeat->ndigi; i++) + len += sprintf(buffer + len, " %s", ax2asc(&ax25_rt->digipeat->calls[i])); + + len += sprintf(buffer + len, "\n"); pos = begin + len; @@ -189,7 +315,7 @@ int ax25_rt_get_info(char *buffer, char **start, off_t offset, int length) return len; } -int ax25_cs_get_info(char *buffer, char **start, off_t offset, int length) +int ax25_cs_get_info(char *buffer, char **start, off_t offset, int length, int dummy) { ax25_uid_assoc *pt; int len = 0; @@ -225,50 +351,145 @@ int ax25_cs_get_info(char *buffer, char **start, off_t offset, int length) } /* - * Find what interface to use. + * Find AX.25 route */ -int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr) +static struct ax25_route *ax25_find_route(ax25_address *addr, struct device *dev) { + struct ax25_route *ax25_spe_rt = NULL; + struct ax25_route *ax25_def_rt = NULL; struct ax25_route *ax25_rt; - ax25_address *call; + /* + * Bind to the physical interface we heard them on, or the default + * route if none is found; + */ for (ax25_rt = ax25_route; ax25_rt != NULL; ax25_rt = ax25_rt->next) { - if (ax25cmp(&ax25_rt->callsign, addr) == 0) { - /* - * Bind to the physical interface we heard them on. 
- */ - if ((ax25->device = ax25_rt->dev) == NULL) - continue; - if ((call = ax25_findbyuid(current->euid)) == NULL) { - if (ax25_uid_policy && !suser()) - return -EPERM; - call = (ax25_address *)ax25->device->dev_addr; - } - memcpy(&ax25->source_addr, call, sizeof(ax25_address)); - if (ax25->sk != NULL) - ax25->sk->zapped = 0; - - return 0; + if (dev == NULL) { + if (ax25cmp(&ax25_rt->callsign, addr) == 0 && ax25_rt->dev != NULL) + ax25_spe_rt = ax25_rt; + if (ax25cmp(&ax25_rt->callsign, &null_ax25_address) == 0 && ax25_rt->dev != NULL) + ax25_def_rt = ax25_rt; + } else { + if (ax25cmp(&ax25_rt->callsign, addr) == 0 && ax25_rt->dev == dev) + ax25_spe_rt = ax25_rt; + if (ax25cmp(&ax25_rt->callsign, &null_ax25_address) == 0 && ax25_rt->dev == dev) + ax25_def_rt = ax25_rt; } } - return -EINVAL; + if (ax25_spe_rt != NULL) + return ax25_spe_rt; + + return ax25_def_rt; } /* - * Register the mode of an incoming IP frame. It is assumed that an entry - * already exists in the routing table. + * Adjust path: If you specify a default route and want to connect + * a target on the digipeater path but w/o having a special route + * set before, the path has to be truncated from your target on. */ -void ax25_ip_mode_set(ax25_address *callsign, struct device *dev, char ip_mode) +static inline void ax25_adjust_path(ax25_address *addr, ax25_digi *digipeat) +{ + int k; + + for (k = 0; k < digipeat->ndigi; k++) { + if (ax25cmp(addr, &digipeat->calls[k]) == 0) + break; + } + + digipeat->ndigi = k; +} + + +/* + * Find which interface to use. 
+ */ +int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr) { struct ax25_route *ax25_rt; + ax25_address *call; - for (ax25_rt = ax25_route; ax25_rt != NULL; ax25_rt = ax25_rt->next) { - if (ax25cmp(&ax25_rt->callsign, callsign) == 0 && ax25_rt->dev == dev) { - ax25_rt->ip_mode = ip_mode; - return; - } + if ((ax25_rt = ax25_find_route(addr, NULL)) == NULL) + return -EHOSTUNREACH; + + ax25->device = ax25_rt->dev; + + if ((call = ax25_findbyuid(current->euid)) == NULL) { + if (ax25_uid_policy && !suser()) + return -EPERM; + call = (ax25_address *)ax25->device->dev_addr; + } + + ax25->source_addr = *call; + + if (ax25_rt->digipeat != NULL) { + if ((ax25->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) + return -ENOMEM; + *ax25->digipeat = *ax25_rt->digipeat; + ax25_adjust_path(addr, ax25->digipeat); + } + + if (ax25->sk != NULL) + ax25->sk->zapped = 0; + + return 0; +} + +/* + * dl1bke 960117: build digipeater path + * dl1bke 960301: use the default route if it exists + */ +void ax25_rt_build_path(ax25_cb *ax25, ax25_address *addr, struct device *dev) +{ + struct ax25_route *ax25_rt; + + if ((ax25_rt = ax25_find_route(addr, dev)) == NULL) + return; + + if (ax25_rt->digipeat == NULL) + return; + + if ((ax25->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) + return; + + ax25->device = ax25_rt->dev; + *ax25->digipeat = *ax25_rt->digipeat; + ax25_adjust_path(addr, ax25->digipeat); +} + +void ax25_dg_build_path(struct sk_buff *skb, ax25_address *addr, struct device *dev) +{ + struct ax25_route *ax25_rt; + ax25_digi digipeat; + ax25_address src, dest; + unsigned char *bp; + int len; + + skb_pull(skb, 1); /* skip KISS command */ + + if ((ax25_rt = ax25_find_route(addr, dev)) == NULL) + return; + + if (ax25_rt->digipeat == NULL) + return; + + digipeat = *ax25_rt->digipeat; + + ax25_adjust_path(addr, &digipeat); + + len = ax25_rt->digipeat->ndigi * AX25_ADDR_LEN; + + if (skb_headroom(skb) < len) { + printk(KERN_CRIT "ax25_dg_build_path: not enough 
headroom for digis in skb\n"); + return; } + + memcpy(&dest, skb->data , AX25_ADDR_LEN); + memcpy(&src, skb->data + 7, AX25_ADDR_LEN); + + bp = skb_push(skb, len); + + build_ax25_addr(bp, &src, &dest, ax25_rt->digipeat, C_COMMAND, MODULUS); } /* @@ -285,4 +506,102 @@ char ax25_ip_mode_get(ax25_address *callsign, struct device *dev) return ' '; } +/* + * Wow, a bit of data hiding. Is this C++ or what ? + */ +int ax25_dev_get_value(struct device *dev, int valueno) +{ + int i; + + for (i = 0; i < AX25_MAX_DEVICES; i++) + if (ax25_device[i].dev != NULL && ax25_device[i].dev == dev) + return ax25_device[i].values[valueno]; + + printk(KERN_WARNING "ax25_dev_get_value called with invalid device\n"); + + return 0; +} + +/* + * This is called when an interface is brought up. These are + * reasonable defaults. + */ +void ax25_dev_device_up(struct device *dev) +{ + struct ax25_dev *ax25_dev = NULL; + int i; + + for (i = 0; i < AX25_MAX_DEVICES; i++) { + if (ax25_device[i].dev == NULL) { + ax25_dev = ax25_device + i; + break; + } + } + + if (ax25_dev == NULL) { + printk(KERN_ERR "ax25_dev_device_up cannot find free AX.25 device\n"); + return; + } + + ax25_unregister_sysctl(); + + sprintf(ax25_dev->name, "%s.parms", dev->name); + + ax25_dev->dev = dev; + + ax25_dev->values[AX25_VALUES_IPDEFMODE] = AX25_DEF_IPDEFMODE; + ax25_dev->values[AX25_VALUES_AXDEFMODE] = AX25_DEF_AXDEFMODE; + ax25_dev->values[AX25_VALUES_TEXT] = AX25_DEF_TEXT; + ax25_dev->values[AX25_VALUES_BACKOFF] = AX25_DEF_BACKOFF; + ax25_dev->values[AX25_VALUES_CONMODE] = AX25_DEF_CONMODE; + ax25_dev->values[AX25_VALUES_WINDOW] = AX25_DEF_WINDOW; + ax25_dev->values[AX25_VALUES_EWINDOW] = AX25_DEF_EWINDOW; + ax25_dev->values[AX25_VALUES_T1] = AX25_DEF_T1; + ax25_dev->values[AX25_VALUES_T2] = AX25_DEF_T2; + ax25_dev->values[AX25_VALUES_T3] = AX25_DEF_T3; + ax25_dev->values[AX25_VALUES_IDLE] = AX25_DEF_IDLE; + ax25_dev->values[AX25_VALUES_N2] = AX25_DEF_N2; + ax25_dev->values[AX25_VALUES_DIGI] = AX25_DEF_DIGI; + 
ax25_dev->values[AX25_VALUES_PACLEN] = AX25_DEF_PACLEN; + ax25_dev->values[AX25_VALUES_MAXQUEUE] = AX25_DEF_MAXQUEUE; + + ax25_register_sysctl(); +} + +void ax25_dev_device_down(struct device *dev) +{ + int i; + + ax25_unregister_sysctl(); + + for (i = 0; i < AX25_MAX_DEVICES; i++) + if (ax25_device[i].dev != NULL && ax25_device[i].dev == dev) + ax25_device[i].dev = NULL; + + ax25_register_sysctl(); +} + +#ifdef MODULE + +/* + * Free all memory associated with routing and device structures. + */ +void ax25_rt_free(void) +{ + struct ax25_route *s, *ax25_rt = ax25_route; + + while (ax25_rt != NULL) { + s = ax25_rt; + ax25_rt = ax25_rt->next; + + if (s->digipeat != NULL) + kfree_s(s->digipeat, sizeof(ax25_digi)); + + kfree_s(s, sizeof(struct ax25_route)); + } +} + +#endif + #endif + diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c index 2530346e5..071043d1e 100644 --- a/net/ax25/ax25_subr.c +++ b/net/ax25/ax25_subr.c @@ -1,10 +1,10 @@ /* - * AX.25 release 029 + * AX.25 release 033 * * This is ALPHA test software. This code may break your machine, randomly fail to work with new * releases, misbehave and/or generally screw up. It might even work. * - * This code REQUIRES 1.2.1 or higher/ NET3.029 + * This code REQUIRES 1.3.61 or higher/ NET3.029 * * This module: * This module is free software; you can redistribute it and/or @@ -21,10 +21,21 @@ * History * AX.25 029 Alan(GW4PTS) Switched to KA9Q constant names. Removed * old BSD code. + * AX.25 030 Jonathan(G4KLX) Added support for extended AX.25. + * Added fragmentation support. + * Darryl(G7LED) Added function ax25_requeue_frames() to split + * it up from ax25_frames_acked(). + * AX.25 031 Joerg(DL1BKE) DAMA needs KISS Fullduplex ON/OFF. + * Thus we have ax25_kiss_cmd() now... ;-) + * Dave Brown(N2RJT) + * Killed a silly bug in the DAMA code. + * Joerg(DL1BKE) Found the real bug in ax25.h, sri. + * AX.25 032 Joerg(DL1BKE) Added ax25_queue_length to count the number of + * enqueued buffers of a socket.. 
*/ #include <linux/config.h> -#ifdef CONFIG_AX25 +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> @@ -40,16 +51,16 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> /* - * This routine purges the input queue of frames. + * This routine purges all the queues of frames. */ -void ax25_clear_tx_queue(ax25_cb *ax25) +void ax25_clear_queues(ax25_cb *ax25) { struct sk_buff *skb; @@ -62,6 +73,14 @@ void ax25_clear_tx_queue(ax25_cb *ax25) skb->free = 1; kfree_skb(skb, FREE_WRITE); } + + while ((skb = skb_dequeue(&ax25->reseq_queue)) != NULL) { + kfree_skb(skb, FREE_READ); + } + + while ((skb = skb_dequeue(&ax25->frag_queue)) != NULL) { + kfree_skb(skb, FREE_READ); + } } /* @@ -71,7 +90,7 @@ void ax25_clear_tx_queue(ax25_cb *ax25) */ void ax25_frames_acked(ax25_cb *ax25, unsigned short nr) { - struct sk_buff *skb, *skb_prev = NULL; + struct sk_buff *skb; /* * Remove all the ack-ed frames from the ack queue. @@ -81,9 +100,20 @@ void ax25_frames_acked(ax25_cb *ax25, unsigned short nr) skb = skb_dequeue(&ax25->ack_queue); skb->free = 1; kfree_skb(skb, FREE_WRITE); - ax25->va = (ax25->va + 1) % MODULUS; + ax25->va = (ax25->va + 1) % ax25->modulus; + if (ax25->dama_slave) + ax25->n2count = 0; } } +} + +/* Maybe this should be your ax25_invoke_retransmission(), which appears + * to be used but not do anything. ax25_invoke_retransmission() used to + * be in AX 0.29, but has now gone in 0.30. 
+ */ +void ax25_requeue_frames(ax25_cb *ax25) +{ + struct sk_buff *skb, *skb_prev = NULL; /* * Requeue all the un-ack-ed frames on the output queue to be picked @@ -109,7 +139,7 @@ int ax25_validate_nr(ax25_cb *ax25, unsigned short nr) while (vc != ax25->vs) { if (nr == vc) return 1; - vc = (vc + 1) % MODULUS; + vc = (vc + 1) % ax25->modulus; } if (nr == ax25->vs) return 1; @@ -117,16 +147,51 @@ int ax25_validate_nr(ax25_cb *ax25, unsigned short nr) return 0; } -int ax25_decode(unsigned char *frame) +/* + * This routine is the centralised routine for parsing the control + * information for the different frame formats. + */ +int ax25_decode(ax25_cb *ax25, struct sk_buff *skb, int *ns, int *nr, int *pf) { + unsigned char *frame; int frametype = ILLEGAL; - if ((frame[0] & S) == 0) - frametype = I; /* I frame - carries NR/NS/PF */ - else if ((frame[0] & U) == 1) /* S frame - take out PF/NR */ - frametype = frame[0] & 0x0F; - else if ((frame[0] & U) == 3) /* U frame - take out PF */ - frametype = frame[0] & ~PF; + frame = skb->data; + *ns = *nr = *pf = 0; + + if (ax25->modulus == MODULUS) { + if ((frame[0] & S) == 0) { + frametype = I; /* I frame - carries NR/NS/PF */ + *ns = (frame[0] >> 1) & 0x07; + *nr = (frame[0] >> 5) & 0x07; + *pf = frame[0] & PF; + } else if ((frame[0] & U) == 1) { /* S frame - take out PF/NR */ + frametype = frame[0] & 0x0F; + *nr = (frame[0] >> 5) & 0x07; + *pf = frame[0] & PF; + } else if ((frame[0] & U) == 3) { /* U frame - take out PF */ + frametype = frame[0] & ~PF; + *pf = frame[0] & PF; + } + skb_pull(skb, 1); + } else { + if ((frame[0] & S) == 0) { + frametype = I; /* I frame - carries NR/NS/PF */ + *ns = (frame[0] >> 1) & 0x7F; + *nr = (frame[1] >> 1) & 0x7F; + *pf = frame[1] & EPF; + skb_pull(skb, 2); + } else if ((frame[0] & U) == 1) { /* S frame - take out PF/NR */ + frametype = frame[0] & 0x0F; + *nr = (frame[1] >> 1) & 0x7F; + *pf = frame[1] & EPF; + skb_pull(skb, 2); + } else if ((frame[0] & U) == 3) { /* U frame - take out PF */ 
+ frametype = frame[0] & ~PF; + *pf = frame[0] & PF; + skb_pull(skb, 1); + } + } return frametype; } @@ -136,38 +201,46 @@ int ax25_decode(unsigned char *frame) * command or response for the remote machine ( eg. RR, UA etc. ). * Only supervisory or unnumbered frames are processed. */ -void ax25_send_control(ax25_cb *ax25, int frametype, int type) +void ax25_send_control(ax25_cb *ax25, int frametype, int poll_bit, int type) { struct sk_buff *skb; unsigned char *dptr; - int len; struct device *dev; if ((dev = ax25->device) == NULL) return; /* Route died */ - if ((skb = alloc_skb(16 + 1 + size_ax25_addr(ax25->digipeat), GFP_ATOMIC)) == NULL) + if ((skb = alloc_skb(AX25_BPQ_HEADER_LEN + size_ax25_addr(ax25->digipeat) + 2, GFP_ATOMIC)) == NULL) return; + skb_reserve(skb, AX25_BPQ_HEADER_LEN + size_ax25_addr(ax25->digipeat)); + if (ax25->sk != NULL) { skb->sk = ax25->sk; - ax25->sk->wmem_alloc += skb->mem_len; + atomic_add(skb->truesize, &ax25->sk->wmem_alloc); } - dptr = skb->data; - - dptr += 1 + size_ax25_addr(ax25->digipeat); /* KISS byte & 2 calls */ - /* Assume a response - address structure for DTE */ - len = 1; /* Normal size */ - - if ((frametype & U) == S) /* S frames carry NR */ - frametype |= (ax25->vr << 5); - - *dptr = frametype; + if (ax25->modulus == MODULUS) { + dptr = skb_put(skb, 1); + *dptr = frametype; + *dptr |= (poll_bit) ? PF : 0; + if ((frametype & U) == S) /* S frames carry NR */ + *dptr |= (ax25->vr << 5); + } else { + if ((frametype & U) == U) { + dptr = skb_put(skb, 1); + *dptr = frametype; + *dptr |= (poll_bit) ? PF : 0; + } else { + dptr = skb_put(skb, 2); + dptr[0] = frametype; + dptr[1] = (ax25->vr << 1); + dptr[1] |= (poll_bit) ? EPF : 0; + } + } skb->free = 1; - skb->len = len + size_ax25_addr(ax25->digipeat) + 1; ax25_transmit_buffer(ax25, skb, type); } @@ -175,38 +248,40 @@ void ax25_send_control(ax25_cb *ax25, int frametype, int type) /* * Send a 'DM' to an unknown connection attempt, or an invalid caller. 
* - * Note: src here is the sender, thus its the target of the DM + * Note: src here is the sender, thus it's the target of the DM */ void ax25_return_dm(struct device *dev, ax25_address *src, ax25_address *dest, ax25_digi *digi) { struct sk_buff *skb; char *dptr; ax25_digi retdigi; - int len = 2 + size_ax25_addr(digi); - if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) + if (dev == NULL) + return; + + if ((skb = alloc_skb(AX25_BPQ_HEADER_LEN + size_ax25_addr(digi) + 1, GFP_ATOMIC)) == NULL) return; /* Next SABM will get DM'd */ - skb->len = len; + skb_reserve(skb, AX25_BPQ_HEADER_LEN + size_ax25_addr(digi)); ax25_digi_invert(digi, &retdigi); - dptr = skb->data + 1 + size_ax25_addr(digi); + dptr = skb_put(skb, 1); skb->sk = NULL; - *dptr = DM; + *dptr = DM | PF; - if (dev == NULL) - return; + /* + * Do the address ourselves + */ - dptr = skb->data; - *dptr++ = 0; - dptr += build_ax25_addr(dptr, dest, src, &retdigi, C_RESPONSE); + dptr = skb_push(skb, size_ax25_addr(digi)); + dptr += build_ax25_addr(dptr, dest, src, &retdigi, C_RESPONSE, MODULUS); skb->arp = 1; skb->free = 1; - dev_queue_xmit(skb, dev, SOPRI_NORMAL); + ax25_queue_xmit(skb, dev, SOPRI_NORMAL); } /* @@ -214,25 +289,39 @@ void ax25_return_dm(struct device *dev, ax25_address *src, ax25_address *dest, a */ unsigned short ax25_calculate_t1(ax25_cb *ax25) { - int t, n; - - for (t = 2, n = 0; n < ax25->n2count; n++) - t *= 2; - + int n, t = 2; + + if (ax25->backoff) { + for (n = 0; n < ax25->n2count; n++) + t *= 2; + + if (t > 8) t = 8; + } + return t * ax25->rtt; } /* - * Calculate the r Round Trip Time + * Calculate the Round Trip Time */ void ax25_calculate_rtt(ax25_cb *ax25) { - if (ax25->n2count == 0) + if (ax25->t1timer > 0 && ax25->n2count == 0) ax25->rtt = (9 * ax25->rtt + ax25->t1 - ax25->t1timer) / 10; - /* Don't go below one second */ - if (ax25->rtt < 1 * PR_SLOWHZ) - ax25->rtt = 1 * PR_SLOWHZ; +#ifdef AX25_T1CLAMPLO + /* Don't go below one tenth of a second */ + if (ax25->rtt < (AX25_T1CLAMPLO)) 
+ ax25->rtt = (AX25_T1CLAMPLO); +#else /* Failsafe - some people might have sub 1/10th RTTs :-) **/ + if (ax25->rtt == 0) + ax25->rtt = PR_SLOWHZ; +#endif +#ifdef AX25_T1CLAMPHI + /* OR above clamped seconds **/ + if (ax25->rtt > (AX25_T1CLAMPHI)) + ax25->rtt = (AX25_T1CLAMPHI); +#endif } /* @@ -244,8 +333,7 @@ void ax25_calculate_rtt(ax25_cb *ax25) * Given an AX.25 address pull of to, from, digi list, command/response and the start of data * */ - -unsigned char *ax25_parse_addr(unsigned char *buf, int len, ax25_address *src, ax25_address *dest, ax25_digi *digi, int *flags) +unsigned char *ax25_parse_addr(unsigned char *buf, int len, ax25_address *src, ax25_address *dest, ax25_digi *digi, int *flags, int *dama) { int d = 0; @@ -262,21 +350,25 @@ unsigned char *ax25_parse_addr(unsigned char *buf, int len, ax25_address *src, a } } + if (dama != NULL) + *dama = ~buf[13] & DAMA_FLAG; + /* Copy to, from */ - if (dest != NULL) memcpy(dest, buf + 0, 7); - if (src != NULL) memcpy(src, buf + 7, 7); - buf += 14; - len -= 14; + if (dest != NULL) + memcpy(dest, buf + 0, AX25_ADDR_LEN); + if (src != NULL) + memcpy(src, buf + 7, AX25_ADDR_LEN); + buf += 2 * AX25_ADDR_LEN; + len -= 2 * AX25_ADDR_LEN; digi->lastrepeat = -1; digi->ndigi = 0; - while (!(buf[-1] & LAPB_E)) - { - if (d >= 6) return NULL; /* Max of 6 digis */ + while (!(buf[-1] & LAPB_E)) { + if (d >= AX25_MAX_DIGIS) return NULL; /* Max of 6 digis */ if (len < 7) return NULL; /* Short packet */ if (digi != NULL) { - memcpy(&digi->calls[d], buf, 7); + memcpy(&digi->calls[d], buf, AX25_ADDR_LEN); digi->ndigi = d + 1; if (buf[6] & AX25_REPEATED) { digi->repeated[d] = 1; @@ -286,8 +378,8 @@ unsigned char *ax25_parse_addr(unsigned char *buf, int len, ax25_address *src, a } } - buf += 7; - len -= 7; + buf += AX25_ADDR_LEN; + len -= AX25_ADDR_LEN; d++; } @@ -297,50 +389,54 @@ unsigned char *ax25_parse_addr(unsigned char *buf, int len, ax25_address *src, a /* * Assemble an AX.25 header from the bits */ - -int 
build_ax25_addr(unsigned char *buf, ax25_address *src, ax25_address *dest, ax25_digi *d, int flag) +int build_ax25_addr(unsigned char *buf, ax25_address *src, ax25_address *dest, ax25_digi *d, int flag, int modulus) { int len = 0; int ct = 0; - memcpy(buf, dest, 7); - - if (flag != C_COMMAND && flag != C_RESPONSE) - printk("build_ax25_addr: Bogus flag %d\n!", flag); + memcpy(buf, dest, AX25_ADDR_LEN); buf[6] &= ~(LAPB_E | LAPB_C); - buf[6] |= SSID_SPARE; + buf[6] |= SSSID_SPARE; if (flag == C_COMMAND) buf[6] |= LAPB_C; - buf += 7; - len += 7; - memcpy(buf, src, 7); + buf += AX25_ADDR_LEN; + len += AX25_ADDR_LEN; + + memcpy(buf, src, AX25_ADDR_LEN); buf[6] &= ~(LAPB_E | LAPB_C); - buf[6] |= SSID_SPARE; + buf[6] &= ~SSSID_SPARE; + + if (modulus == MODULUS) { + buf[6] |= SSSID_SPARE; + } else { + buf[6] |= ESSID_SPARE; + } if (flag == C_RESPONSE) buf[6] |= LAPB_C; + /* * Fast path the normal digiless path */ if (d == NULL || d->ndigi == 0) { buf[6] |= LAPB_E; - return 14; + return 2 * AX25_ADDR_LEN; } - buf += 7; - len += 7; + buf += AX25_ADDR_LEN; + len += AX25_ADDR_LEN; while (ct < d->ndigi) { - memcpy(buf, &d->calls[ct], 7); + memcpy(buf, &d->calls[ct], AX25_ADDR_LEN); if (d->repeated[ct]) buf[6] |= AX25_REPEATED; else buf[6] &= ~AX25_REPEATED; buf[6] &= ~LAPB_E; - buf[6] |= SSID_SPARE; + buf[6] |= SSSID_SPARE; - buf += 7; - len += 7; + buf += AX25_ADDR_LEN; + len += AX25_ADDR_LEN; ct++; } @@ -352,15 +448,14 @@ int build_ax25_addr(unsigned char *buf, ax25_address *src, ax25_address *dest, a int size_ax25_addr(ax25_digi *dp) { if (dp == NULL) - return 14; + return 2 * AX25_ADDR_LEN; - return 14 + (7 * dp->ndigi); + return AX25_ADDR_LEN * (2 + dp->ndigi); } /* * Reverse Digipeat List. 
May not pass both parameters as same struct */ - void ax25_digi_invert(ax25_digi *in, ax25_digi *out) { int ct = 0; @@ -380,4 +475,100 @@ void ax25_digi_invert(ax25_digi *in, ax25_digi *out) out->lastrepeat = 0; } +/* + * count the number of buffers on a list belonging to the same + * socket as skb + */ + +static int ax25_list_length(struct sk_buff_head *list, struct sk_buff *skb) +{ + int count = 0; + long flags; + struct sk_buff *skbq; + + save_flags(flags); + cli(); + + if (list == NULL) { + restore_flags(flags); + return 0; + } + + for (skbq = list->next; skbq != (struct sk_buff *)list; skbq = skbq->next) + if (skb->sk == skbq->sk) + count++; + + restore_flags(flags); + return count; +} + +/* + * count the number of buffers of one socket on the write/ack-queue + */ + +int ax25_queue_length(ax25_cb *ax25, struct sk_buff *skb) +{ + return ax25_list_length(&ax25->write_queue, skb) + ax25_list_length(&ax25->ack_queue, skb); +} + +/* + * :::FIXME::: + * This is ****NOT**** the right approach. Not all drivers do kiss. We + * need a driver level request to switch duplex mode, that does either + * SCC changing, PI config or KISS as required. + * + * Not to mention this request isn't currently reliable. 
+ */ + +void ax25_kiss_cmd(ax25_cb *ax25, unsigned char cmd, unsigned char param) +{ + struct sk_buff *skb; + unsigned char *p; + + if (ax25->device == NULL) + return; + + if ((skb = alloc_skb(2, GFP_ATOMIC)) == NULL) + return; + + skb->free = 1; + skb->arp = 1; + + if (ax25->sk != NULL) { + skb->sk = ax25->sk; + atomic_add(skb->truesize, &ax25->sk->wmem_alloc); + } + + skb->protocol = htons(ETH_P_AX25); + + p = skb_put(skb, 2); + + *p++=cmd; + *p =param; + + dev_queue_xmit(skb, ax25->device, SOPRI_NORMAL); +} + +void ax25_dama_on(ax25_cb *ax25) +{ + if (ax25_dev_is_dama_slave(ax25->device) == 0) { + if (ax25->sk != NULL && ax25->sk->debug) + printk("ax25_dama_on: DAMA on\n"); + ax25_kiss_cmd(ax25, 5, 1); + } +} + +void ax25_dama_off(ax25_cb *ax25) +{ + if (ax25->dama_slave == 0) + return; + + ax25->dama_slave = 0; + if (ax25_dev_is_dama_slave(ax25->device) == 0) { + if (ax25->sk != NULL && ax25->sk->debug) + printk("ax25_dama_off: DAMA off\n"); + ax25_kiss_cmd(ax25, 5, 0); + } +} + #endif diff --git a/net/ax25/ax25_timer.c b/net/ax25/ax25_timer.c index daa3bd657..f6ce6e00b 100644 --- a/net/ax25/ax25_timer.c +++ b/net/ax25/ax25_timer.c @@ -1,5 +1,5 @@ /* - * AX.25 release 029 + * AX.25 release 033 * * This is ALPHA test software. This code may break your machine, randomly fail to work with new * releases, misbehave and/or generally screw up. It might even work. @@ -17,10 +17,13 @@ * AX.25 028b Jonathan(G4KLX) Extracted AX25 control block from the * sock structure. * AX.25 029 Alan(GW4PTS) Switched to KA9Q constant names. + * AX.25 031 Joerg(DL1BKE) Added DAMA support + * AX.25 032 Joerg(DL1BKE) Fixed DAMA timeout bug + * AX.25 033 Jonathan(G4KLX) Modularisation functions. 
*/ #include <linux/config.h> -#ifdef CONFIG_AX25 +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> @@ -36,14 +39,11 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> -#ifdef CONFIG_NETROM -#include <net/netrom.h> -#endif static void ax25_timer(unsigned long); @@ -52,8 +52,8 @@ static void ax25_timer(unsigned long); */ void ax25_set_timer(ax25_cb *ax25) { - unsigned long flags; - + unsigned long flags; + save_flags(flags); cli(); del_timer(&ax25->timer); @@ -63,7 +63,7 @@ void ax25_set_timer(ax25_cb *ax25) ax25->timer.data = (unsigned long)ax25; ax25->timer.function = &ax25_timer; - ax25->timer.expires = 10; + ax25->timer.expires = jiffies + 10; add_timer(&ax25->timer); } @@ -78,14 +78,14 @@ static void ax25_reset_timer(ax25_cb *ax25) ax25->timer.data = (unsigned long)ax25; ax25->timer.function = &ax25_timer; - ax25->timer.expires = 10; + ax25->timer.expires = jiffies + 10; add_timer(&ax25->timer); } /* * AX.25 TIMER * - * This routine is called every 500ms. Decrement timer by this + * This routine is called every 100ms. Decrement timer by this * amount - if expired then process the event. */ static void ax25_timer(unsigned long param) @@ -95,8 +95,8 @@ static void ax25_timer(unsigned long param) switch (ax25->state) { case AX25_STATE_0: /* Magic here: If we listen() and a new link dies before it - is accepted() it isnt 'dead' so doesnt get removed. */ - if ((ax25->sk != NULL && ax25->sk->dead) || ax25->sk == NULL) { + is accepted() it isn't 'dead' so doesn't get removed. 
*/ + if (ax25->sk == NULL || ax25->sk->destroy || (ax25->sk->state == TCP_LISTEN && ax25->sk->dead)) { del_timer(&ax25->timer); ax25_destroy_socket(ax25); return; @@ -111,7 +111,8 @@ static void ax25_timer(unsigned long param) if (ax25->sk != NULL) { if (ax25->sk->rmem_alloc < (ax25->sk->rcvbuf / 2) && (ax25->condition & OWN_RX_BUSY_CONDITION)) { ax25->condition &= ~OWN_RX_BUSY_CONDITION; - ax25_send_control(ax25, RR, C_RESPONSE); + if (!ax25->dama_slave) + ax25_send_control(ax25, RR, POLLOFF, C_RESPONSE); ax25->condition &= ~ACK_PENDING_CONDITION; break; } @@ -119,7 +120,8 @@ static void ax25_timer(unsigned long param) /* * Check for frames to transmit. */ - ax25_kick(ax25); + if (!ax25->dama_slave) + ax25_kick(ax25); break; default: @@ -130,12 +132,36 @@ static void ax25_timer(unsigned long param) if (ax25->state == AX25_STATE_3 || ax25->state == AX25_STATE_4) { if (ax25->condition & ACK_PENDING_CONDITION) { ax25->condition &= ~ACK_PENDING_CONDITION; - ax25_enquiry_response(ax25); + if (!ax25->dama_slave) + ax25_timeout_response(ax25); } } } if (ax25->t3timer > 0 && --ax25->t3timer == 0) { + /* dl1bke 960114: T3 expires and we are in DAMA mode: */ + /* send a DISC and abort the connection */ + if (ax25->dama_slave) { + ax25_link_failed(&ax25->dest_addr, ax25->device); + ax25_clear_queues(ax25); + ax25_send_control(ax25, DISC, POLLON, C_COMMAND); + + ax25->state = AX25_STATE_0; + if (ax25->sk != NULL) { + if (ax25->sk->debug) + printk(KERN_DEBUG "AX.25 T3 Timeout\n"); + ax25->sk->state = TCP_CLOSE; + ax25->sk->err = ETIMEDOUT; + ax25->sk->shutdown |= SEND_SHUTDOWN; + if (!ax25->sk->dead) + ax25->sk->state_change(ax25->sk); + ax25->sk->dead = 1; + } + + ax25_reset_timer(ax25); + return; + } + if (ax25->state == AX25_STATE_3) { ax25->n2count = 0; ax25_transmit_enquiry(ax25); @@ -143,77 +169,148 @@ static void ax25_timer(unsigned long param) } ax25->t3timer = ax25->t3; } + + if (ax25->idletimer > 0 && --ax25->idletimer == 0) { + /* dl1bke 960228: close the connection 
when IDLE expires */ + /* similar to DAMA T3 timeout but with */ + /* a "clean" disconnect of the connection */ + ax25_clear_queues(ax25); + + ax25->n2count = 0; + if (!ax25->dama_slave) { + ax25->t3timer = 0; + ax25_send_control(ax25, DISC, POLLON, C_COMMAND); + } else { + ax25->t3timer = ax25->t3; + } + + /* state 1 or 2 should not happen, but... */ + + if (ax25->state == AX25_STATE_1 || ax25->state == AX25_STATE_2) + ax25->state = AX25_STATE_0; + else + ax25->state = AX25_STATE_2; + + ax25->t1timer = ax25->t1 = ax25_calculate_t1(ax25); + + if (ax25->sk != NULL) { + ax25->sk->state = TCP_CLOSE; + ax25->sk->err = 0; + ax25->sk->shutdown |= SEND_SHUTDOWN; + if (!ax25->sk->dead) + ax25->sk->state_change(ax25->sk); + ax25->sk->dead = 1; + ax25->sk->destroy = 1; + } + } + + /* dl1bke 960114: DAMA T1 timeouts are handled in ax25_dama_slave_transmit */ + /* nevertheless we have to re-enqueue the timer struct... */ + if (ax25->t1timer == 0 || --ax25->t1timer > 0) { ax25_reset_timer(ax25); return; } + if (!ax25_dev_is_dama_slave(ax25->device)) { + if (ax25->dama_slave) + ax25->dama_slave = 0; + ax25_t1_timeout(ax25); + } +} + + +/* dl1bke 960114: The DAMA protocol requires to send data and SABM/DISC + * within the poll of any connected channel. Remember + * that we are not allowed to send anything unless we + * get polled by the Master. + * + * Thus we'll have to do parts of our T1 handling in + * ax25_enquiry_response(). 
+ */ +void ax25_t1_timeout(ax25_cb * ax25) +{ switch (ax25->state) { case AX25_STATE_1: if (ax25->n2count == ax25->n2) { -#ifdef CONFIG_NETROM - nr_link_failed(&ax25->dest_addr, ax25->device); -#endif - ax25_clear_tx_queue(ax25); - ax25->state = AX25_STATE_0; - if (ax25->sk != NULL) { - ax25->sk->state = TCP_CLOSE; - ax25->sk->err = ETIMEDOUT; - if (!ax25->sk->dead) - ax25->sk->state_change(ax25->sk); - ax25->sk->dead = 1; + if (ax25->modulus == MODULUS) { + ax25_link_failed(&ax25->dest_addr, ax25->device); + ax25_clear_queues(ax25); + ax25->state = AX25_STATE_0; + if (ax25->sk != NULL) { + ax25->sk->state = TCP_CLOSE; + ax25->sk->err = ETIMEDOUT; + ax25->sk->shutdown |= SEND_SHUTDOWN; + if (!ax25->sk->dead) + ax25->sk->state_change(ax25->sk); + ax25->sk->dead = 1; + } + } else { + ax25->modulus = MODULUS; + ax25->window = ax25_dev_get_value(ax25->device, AX25_VALUES_WINDOW); + ax25->n2count = 0; + ax25_send_control(ax25, SABM, ax25_dev_is_dama_slave(ax25->device)? POLLOFF : POLLON, C_COMMAND); } } else { ax25->n2count++; - ax25_send_control(ax25, SABM | PF, C_COMMAND); + if (ax25->modulus == MODULUS) { + ax25_send_control(ax25, SABM, ax25_dev_is_dama_slave(ax25->device)? POLLOFF : POLLON, C_COMMAND); + } else { + ax25_send_control(ax25, SABME, ax25_dev_is_dama_slave(ax25->device)? 
POLLOFF : POLLON, C_COMMAND); + } } break; case AX25_STATE_2: if (ax25->n2count == ax25->n2) { -#ifdef CONFIG_NETROM - nr_link_failed(&ax25->dest_addr, ax25->device); -#endif - ax25_clear_tx_queue(ax25); + ax25_link_failed(&ax25->dest_addr, ax25->device); + ax25_clear_queues(ax25); ax25->state = AX25_STATE_0; + ax25_send_control(ax25, DISC, POLLON, C_COMMAND); + if (ax25->sk != NULL) { - ax25->sk->state = TCP_CLOSE; - ax25->sk->err = ETIMEDOUT; + ax25->sk->state = TCP_CLOSE; + ax25->sk->err = ETIMEDOUT; + ax25->sk->shutdown |= SEND_SHUTDOWN; if (!ax25->sk->dead) ax25->sk->state_change(ax25->sk); - ax25->sk->dead = 1; + ax25->sk->dead = 1; } } else { ax25->n2count++; - ax25_send_control(ax25, DISC | PF, C_COMMAND); + if (!ax25_dev_is_dama_slave(ax25->device)) + ax25_send_control(ax25, DISC, POLLON, C_COMMAND); } break; case AX25_STATE_3: ax25->n2count = 1; - ax25_transmit_enquiry(ax25); + if (!ax25->dama_slave) + ax25_transmit_enquiry(ax25); ax25->state = AX25_STATE_4; break; case AX25_STATE_4: if (ax25->n2count == ax25->n2) { -#ifdef CONFIG_NETROM - nr_link_failed(&ax25->dest_addr, ax25->device); -#endif - ax25_clear_tx_queue(ax25); - ax25_send_control(ax25, DM | PF, C_RESPONSE); + ax25_link_failed(&ax25->dest_addr, ax25->device); + ax25_clear_queues(ax25); + ax25_send_control(ax25, DM, POLLON, C_RESPONSE); ax25->state = AX25_STATE_0; if (ax25->sk != NULL) { - ax25->sk->state = TCP_CLOSE; - ax25->sk->err = ETIMEDOUT; + if (ax25->sk->debug) + printk(KERN_DEBUG "AX.25 link Failure\n"); + ax25->sk->state = TCP_CLOSE; + ax25->sk->err = ETIMEDOUT; + ax25->sk->shutdown |= SEND_SHUTDOWN; if (!ax25->sk->dead) ax25->sk->state_change(ax25->sk); - ax25->sk->dead = 1; + ax25->sk->dead = 1; } } else { ax25->n2count++; - ax25_transmit_enquiry(ax25); + if (!ax25->dama_slave) + ax25_transmit_enquiry(ax25); } break; } @@ -223,4 +320,228 @@ static void ax25_timer(unsigned long param) ax25_set_timer(ax25); } +/************************************************************************/ 
+/* Module support functions follow. */ +/************************************************************************/ + +static struct protocol_struct { + struct protocol_struct *next; + unsigned int pid; + int (*func)(struct sk_buff *, ax25_cb *); +} *protocol_list = NULL; + +static struct linkfail_struct { + struct linkfail_struct *next; + void (*func)(ax25_address *, struct device *); +} *linkfail_list = NULL; + +static struct listen_struct { + struct listen_struct *next; + ax25_address callsign; + struct device *dev; +} *listen_list = NULL; + +int ax25_protocol_register(unsigned int pid, int (*func)(struct sk_buff *, ax25_cb *)) +{ + struct protocol_struct *protocol; + unsigned long flags; + + if (pid == AX25_P_TEXT || pid == AX25_P_SEGMENT) + return 0; +#ifdef CONFIG_INET + if (pid == AX25_P_IP || pid == AX25_P_ARP) + return 0; +#endif + if ((protocol = (struct protocol_struct *)kmalloc(sizeof(*protocol), GFP_ATOMIC)) == NULL) + return 0; + + protocol->pid = pid; + protocol->func = func; + + save_flags(flags); + cli(); + + protocol->next = protocol_list; + protocol_list = protocol; + + restore_flags(flags); + + return 1; +} + +void ax25_protocol_release(unsigned int pid) +{ + struct protocol_struct *s, *protocol = protocol_list; + unsigned long flags; + + if (protocol == NULL) + return; + + save_flags(flags); + cli(); + + if (protocol->pid == pid) { + protocol_list = protocol->next; + restore_flags(flags); + kfree_s(protocol, sizeof(struct protocol_struct)); + return; + } + + while (protocol != NULL && protocol->next != NULL) { + if (protocol->next->pid == pid) { + s = protocol->next; + protocol->next = protocol->next->next; + restore_flags(flags); + kfree_s(s, sizeof(struct protocol_struct)); + return; + } + + protocol = protocol->next; + } + + restore_flags(flags); +} + +int ax25_linkfail_register(void (*func)(ax25_address *, struct device *)) +{ + struct linkfail_struct *linkfail; + unsigned long flags; + + if ((linkfail = (struct linkfail_struct 
*)kmalloc(sizeof(*linkfail), GFP_ATOMIC)) == NULL) + return 0; + + linkfail->func = func; + + save_flags(flags); + cli(); + + linkfail->next = linkfail_list; + linkfail_list = linkfail; + + restore_flags(flags); + + return 1; +} + +void ax25_linkfail_release(void (*func)(ax25_address *, struct device *)) +{ + struct linkfail_struct *s, *linkfail = linkfail_list; + unsigned long flags; + + if (linkfail == NULL) + return; + + save_flags(flags); + cli(); + + if (linkfail->func == func) { + linkfail_list = linkfail->next; + restore_flags(flags); + kfree_s(linkfail, sizeof(struct linkfail_struct)); + return; + } + + while (linkfail != NULL && linkfail->next != NULL) { + if (linkfail->next->func == func) { + s = linkfail->next; + linkfail->next = linkfail->next->next; + restore_flags(flags); + kfree_s(s, sizeof(struct linkfail_struct)); + return; + } + + linkfail = linkfail->next; + } + + restore_flags(flags); +} + +int ax25_listen_register(ax25_address *callsign, struct device *dev) +{ + struct listen_struct *listen; + unsigned long flags; + + if (ax25_listen_mine(callsign, dev)) + return 0; + + if ((listen = (struct listen_struct *)kmalloc(sizeof(*listen), GFP_ATOMIC)) == NULL) + return 0; + + listen->callsign = *callsign; + listen->dev = dev; + + save_flags(flags); + cli(); + + listen->next = listen_list; + listen_list = listen; + + restore_flags(flags); + + return 1; +} + +void ax25_listen_release(ax25_address *callsign, struct device *dev) +{ + struct listen_struct *s, *listen = listen_list; + unsigned long flags; + + if (listen == NULL) + return; + + save_flags(flags); + cli(); + + if (ax25cmp(&listen->callsign, callsign) == 0 && listen->dev == dev) { + listen_list = listen->next; + restore_flags(flags); + kfree_s(listen, sizeof(struct listen_struct)); + return; + } + + while (listen != NULL && listen->next != NULL) { + if (ax25cmp(&listen->next->callsign, callsign) == 0 && listen->next->dev == dev) { + s = listen->next; + listen->next = listen->next->next; + 
restore_flags(flags); + kfree_s(s, sizeof(struct listen_struct)); + return; + } + + listen = listen->next; + } + + restore_flags(flags); +} + +int (*ax25_protocol_function(unsigned int pid))(struct sk_buff *, ax25_cb *) +{ + struct protocol_struct *protocol; + + for (protocol = protocol_list; protocol != NULL; protocol = protocol->next) + if (protocol->pid == pid) + return protocol->func; + + return NULL; +} + +int ax25_listen_mine(ax25_address *callsign, struct device *dev) +{ + struct listen_struct *listen; + + for (listen = listen_list; listen != NULL; listen = listen->next) + if (ax25cmp(&listen->callsign, callsign) == 0 && (listen->dev == dev || listen->dev == NULL)) + return 1; + + return 0; +} + +void ax25_link_failed(ax25_address *callsign, struct device *dev) +{ + struct linkfail_struct *linkfail; + + for (linkfail = linkfail_list; linkfail != NULL; linkfail = linkfail->next) + (linkfail->func)(callsign, dev); +} + #endif diff --git a/net/ax25/sysctl_net_ax25.c b/net/ax25/sysctl_net_ax25.c new file mode 100644 index 000000000..302d210f8 --- /dev/null +++ b/net/ax25/sysctl_net_ax25.c @@ -0,0 +1,60 @@ +/* -*- linux-c -*- + * sysctl_net_ax25.c: sysctl interface to net AX.25 subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/ax25 directory entry (empty =) ). 
[MS] + */ + +#include <linux/mm.h> +#include <linux/sysctl.h> +#include <net/ax25.h> + +static int min_ax25[] = {0, 0, 0, 0, 0, 1, 1, 1, 1, + 0, 0, 1, 1, 1, 0x00}; +static int max_ax25[] = {1, 1, 1, 1, 1, 7, 63, 30 * PR_SLOWHZ, 20 * PR_SLOWHZ, + 3600 * PR_SLOWHZ, 65535 * PR_SLOWHZ, 31, 512, 20, 0x03}; + +static struct ctl_table_header *ax25_table_header; + +static ctl_table ax25_table[AX25_MAX_DEVICES + 1]; + +static ctl_table ax25_dir_table[] = { + {NET_AX25, "ax25", NULL, 0, 0555, ax25_table}, + {0} +}; + +static ctl_table ax25_root_table[] = { + {CTL_NET, "net", NULL, 0, 0555, ax25_dir_table}, + {0} +}; + +void ax25_register_sysctl(void) +{ + int i, n; + + memset(ax25_table, 0x00, (AX25_MAX_DEVICES + 1) * sizeof(ctl_table)); + + for (n = 0, i = 0; i < AX25_MAX_DEVICES; i++) { + if (ax25_device[i].dev != NULL) { + ax25_table[n].ctl_name = n + 1; + ax25_table[n].procname = ax25_device[i].name; + ax25_table[n].data = &ax25_device[i].values; + ax25_table[n].maxlen = AX25_MAX_VALUES * sizeof(int); + ax25_table[n].mode = 0644; + ax25_table[n].child = NULL; + ax25_table[n].proc_handler = &proc_dointvec_minmax; + ax25_table[n].strategy = &sysctl_intvec; + ax25_table[n].de = NULL; + ax25_table[n].extra1 = &min_ax25; + ax25_table[n].extra2 = &max_ax25; + n++; + } + } + + ax25_table_header = register_sysctl_table(ax25_root_table, 1); +} + +void ax25_unregister_sysctl(void) +{ + unregister_sysctl_table(ax25_table_header); +} diff --git a/net/bridge/Makefile b/net/bridge/Makefile new file mode 100644 index 000000000..981c47dcd --- /dev/null +++ b/net/bridge/Makefile @@ -0,0 +1,17 @@ +# +# Makefile for the Linux TCP/IP (INET) layer. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definition is now in the main makefile... 
+ +O_TARGET := bridge.o +O_OBJS := br.o br_tree.o sysctl_net_bridge.o +M_OBJS := $(O_TARGET) + +include $(TOPDIR)/Rules.make + +tar: + tar -cvf /dev/f1 . diff --git a/net/bridge/br.c b/net/bridge/br.c new file mode 100644 index 000000000..70e54dbc3 --- /dev/null +++ b/net/bridge/br.c @@ -0,0 +1,1619 @@ +/* + * Linux NET3 Bridge Support + * + * Originally by John Hayes (Network Plumbing). + * Minor hacks to get it to run with 1.3.x by Alan Cox <Alan.Cox@linux.org> + * More hacks to be able to switch protocols on and off by Christoph Lameter + * <clameter@debian.org> + * Software and more Documentation for the bridge is available from ftp.debian.org + * in the bridge package or at ftp.fuller.edu/Linux/bridge + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * + * Todo: + * Don't bring up devices automatically. Start ports disabled + * and use a netlink notifier so a daemon can maintain the bridge + * port group (could we also do multiple groups ????). + * A nice /proc file interface. + * Put the path costs in the port info and devices. + * Put the bridge port number in the device structure for speed. + * Bridge SNMP stats. 
+ * + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/net.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/string.h> +#include <linux/skbuff.h> +#include <linux/if_arp.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <net/br.h> + +static int br_device_event(struct notifier_block *dnot, unsigned long event, void *ptr); +static void br_tick(unsigned long arg); +int br_forward(struct sk_buff *skb, int port); /* 3.7 */ +int br_port_cost(struct device *dev); /* 4.10.2 */ +void br_bpdu(struct sk_buff *skb); /* consumes skb */ +int br_tx_frame(struct sk_buff *skb); +int br_cmp(unsigned int *a, unsigned int *b); + +unsigned char bridge_ula[ETH_ALEN] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; + +Bridge_data bridge_info; /* (4.5.3) */ +Port_data port_info[All_ports]; /* (4.5.5) */ +Config_bpdu config_bpdu[All_ports]; +Tcn_bpdu tcn_bpdu[All_ports]; +Timer hello_timer; /* (4.5.4.1) */ +Timer tcn_timer; /* (4.5.4.2) */ +Timer topology_change_timer; /* (4.5.4.3) */ +Timer message_age_timer[All_ports]; /* (4.5.6.1) */ +Timer forward_delay_timer[All_ports]; /* (4.5.6.2) */ +Timer hold_timer[All_ports]; /* (4.5.6.3) */ + +/* entries timeout after this many seconds */ +unsigned int fdb_aging_time = FDB_TIMEOUT; + +struct br_stat br_stats; + +static struct timer_list tl; /* for 1 second timer... */ + +/* + * the following structure is required so that we receive + * event notifications when network devices are enabled and + * disabled (ifconfig up and down). + */ +static struct notifier_block br_dev_notifier={ + br_device_event, + NULL, + 0 +}; + +/* + * Implementation of Protocol specific bridging + * + * The protocols to be bridged or not to be bridged are stored in a hashed array. 
This is the old type + * of unlinked hash array where one simply takes the next cell if the one the hash function points to + * is occupied. + */ + +#define BR_PROTOCOL_HASH(x) (x % BR_MAX_PROTOCOLS) + +/* Checks if that protocol type is to be bridged */ +int br_protocol_ok(unsigned short protocol) +{ + unsigned x; + + /* See if protocol statistics are to be kept */ + if (br_stats.flags & BR_PROT_STATS) + { for(x=0;x<BR_MAX_PROT_STATS && + br_stats.prot_id[x]!=protocol && br_stats.prot_id[x];x++) ; + if (x<BR_MAX_PROT_STATS) + { br_stats.prot_id[x]=protocol;br_stats.prot_counter[x]++; + } + } + + for (x=BR_PROTOCOL_HASH(protocol); br_stats.protocols[x]!=0;) { + if (br_stats.protocols[x]==protocol) return !br_stats.policy; + x++; + if (x==BR_MAX_PROTOCOLS) x=0; + } + return br_stats.policy; +} + +/* Add a protocol to be handled opposite to the standard policy of the bridge */ + +int br_add_exempt_protocol(unsigned short p) +{ + unsigned x; + if (p == 0) return -EINVAL; + if (br_stats.exempt_protocols > BR_MAX_PROTOCOLS-2) return -EXFULL; + for (x=BR_PROTOCOL_HASH(p);br_stats.protocols[x]!=0;) { + if (br_stats.protocols[x]==p) return 0; /* Attempt to add the protocol a second time */ + x++; + if (x==BR_MAX_PROTOCOLS) x=0; + } + br_stats.protocols[x]=p; + br_stats.exempt_protocols++; + return 0; +} + +/* Valid Policies are 0=No Protocols bridged 1=Bridge all protocols */ +int br_set_policy(int policy) +{ + if (policy>1) return -EINVAL; + br_stats.policy=policy; + /* Policy change means initializing the exempt table */ + memset(br_stats.protocols,0,sizeof(br_stats.protocols)); + br_stats.exempt_protocols = 0; + return 0; +} + + +/** Elements of Procedure (4.6) **/ + +/* + * this section of code was graciously borrowed from the IEEE 802.1d + * specification section 4.9.1 starting on pg 69. It has been + * modified somewhat to fit within out framework and structure. It + * implements the spanning tree algorithm that is the heart of the + * 802.1d bridging protocol. 
+ */ + +void transmit_config(int port_no) /* (4.6.1) */ +{ + if (hold_timer[port_no].active) { /* (4.6.1.3.1) */ + port_info[port_no].config_pending = TRUE; /* (4.6.1.3.1) */ + } else { /* (4.6.1.3.2) */ + config_bpdu[port_no].type = BPDU_TYPE_CONFIG; + config_bpdu[port_no].root_id = bridge_info.designated_root; + /* (4.6.1.3.2(1)) */ + config_bpdu[port_no].root_path_cost = bridge_info.root_path_cost; + /* (4.6.1.3.2(2)) */ + config_bpdu[port_no].bridge_id = bridge_info.bridge_id; + /* (4.6.1.3.2(3)) */ + config_bpdu[port_no].port_id = port_info[port_no].port_id; + /* + * (4.6.1.3.2(4)) + */ + if (root_bridge()) { + config_bpdu[port_no].message_age = Zero; /* (4.6.1.3.2(5)) */ + } else { + config_bpdu[port_no].message_age + = message_age_timer[bridge_info.root_port].value + + Message_age_increment; /* (4.6.1.3.2(6)) */ + } + + config_bpdu[port_no].max_age = bridge_info.max_age; /* (4.6.1.3.2(7)) */ + config_bpdu[port_no].hello_time = bridge_info.hello_time; + config_bpdu[port_no].forward_delay = bridge_info.forward_delay; + config_bpdu[port_no].flags = 0; + config_bpdu[port_no].flags |= + port_info[port_no].top_change_ack ? TOPOLOGY_CHANGE_ACK : 0; + /* (4.6.1.3.2(8)) */ + port_info[port_no].top_change_ack = 0; + /* (4.6.1.3.2(8)) */ + config_bpdu[port_no].flags |= + bridge_info.top_change ? 
TOPOLOGY_CHANGE : 0; + /* (4.6.1.3.2(9)) */ + + send_config_bpdu(port_no, &config_bpdu[port_no]); + port_info[port_no].config_pending = FALSE; /* (4.6.1.3.2(10)) */ + start_hold_timer(port_no); /* (4.6.1.3.2(11)) */ + } +} + +int root_bridge(void) +{ + return (br_cmp(bridge_info.designated_root.BRIDGE_ID, + bridge_info.bridge_id.BRIDGE_ID)?FALSE:TRUE); +} + +int supersedes_port_info(int port_no, Config_bpdu *config) /* (4.6.2.2) */ +{ + return ( + (br_cmp(config->root_id.BRIDGE_ID, + port_info[port_no].designated_root.BRIDGE_ID) < 0) /* (4.6.2.2.1) */ + || + ((br_cmp(config->root_id.BRIDGE_ID, + port_info[port_no].designated_root.BRIDGE_ID) == 0 + ) + && + ((config->root_path_cost + < port_info[port_no].designated_cost /* (4.6.2.2.2) */ + ) + || + ((config->root_path_cost + == port_info[port_no].designated_cost + ) + && + ((br_cmp(config->bridge_id.BRIDGE_ID, + port_info[port_no].designated_bridge.BRIDGE_ID) < 0 /* (4.6.2.2.3) */ + ) + || + ((br_cmp(config->bridge_id.BRIDGE_ID, + port_info[port_no].designated_bridge.BRIDGE_ID) == 0 + ) /* (4.6.2.2.4) */ + && + ((br_cmp(config->bridge_id.BRIDGE_ID, + bridge_info.bridge_id.BRIDGE_ID) != 0 + ) /* (4.6.2.2.4(1)) */ + || + (config->port_id <= + port_info[port_no].designated_port + ) /* (4.6.2.2.4(2)) */ + )))))) + ); +} + +void record_config_information(int port_no, Config_bpdu *config) /* (4.6.2) */ +{ + port_info[port_no].designated_root = config->root_id; /* (4.6.2.3.1) */ + port_info[port_no].designated_cost = config->root_path_cost; + port_info[port_no].designated_bridge = config->bridge_id; + port_info[port_no].designated_port = config->port_id; + start_message_age_timer(port_no, config->message_age); /* (4.6.2.3.2) */ +} + +void record_config_timeout_values(Config_bpdu *config) /* (4.6.3) */ +{ + bridge_info.max_age = config->max_age; /* (4.6.3.3) */ + bridge_info.hello_time = config->hello_time; + bridge_info.forward_delay = config->forward_delay; + if (config->flags & TOPOLOGY_CHANGE) + bridge_info.top_change = 
1; +} + +void config_bpdu_generation(void) +{ /* (4.6.4) */ + int port_no; + for (port_no = One; port_no <= No_of_ports; port_no++) { /* (4.6.4.3) */ + if (designated_port(port_no) /* (4.6.4.3) */ + && + (port_info[port_no].state != Disabled) + ) { + transmit_config(port_no); /* (4.6.4.3) */ + } /* (4.6.1.2) */ + } +} + +int designated_port(int port_no) +{ + return ((br_cmp(port_info[port_no].designated_bridge.BRIDGE_ID, + bridge_info.bridge_id.BRIDGE_ID) == 0 + ) + && + (port_info[port_no].designated_port + == port_info[port_no].port_id + ) + ); +} + +void reply(int port_no) /* (4.6.5) */ +{ + transmit_config(port_no); /* (4.6.5.3) */ +} + +void transmit_tcn(void) +{ /* (4.6.6) */ + int port_no; + + port_no = bridge_info.root_port; + tcn_bpdu[port_no].type = BPDU_TYPE_TOPO_CHANGE; + send_tcn_bpdu(port_no, &tcn_bpdu[bridge_info.root_port]); /* (4.6.6.3) */ +} + +void configuration_update(void) /* (4.6.7) */ +{ + root_selection(); /* (4.6.7.3.1) */ + /* (4.6.8.2) */ + designated_port_selection(); /* (4.6.7.3.2) */ + /* (4.6.9.2) */ +} + +void root_selection(void) +{ /* (4.6.8) */ + int root_port; + int port_no; + root_port = No_port; + for (port_no = One; port_no <= No_of_ports; port_no++) { /* (4.6.8.3.1) */ + if (((!designated_port(port_no)) + && + (port_info[port_no].state != Disabled) + && + (br_cmp(port_info[port_no].designated_root.BRIDGE_ID, + bridge_info.bridge_id.BRIDGE_ID) < 0) + ) + && + ((root_port == No_port) + || + (br_cmp(port_info[port_no].designated_root.BRIDGE_ID, + port_info[root_port].designated_root.BRIDGE_ID) < 0 + ) + || + ((br_cmp(port_info[port_no].designated_root.BRIDGE_ID, + port_info[root_port].designated_root.BRIDGE_ID) == 0 + ) + && + (((port_info[port_no].designated_cost + + port_info[port_no].path_cost + ) + < + (port_info[root_port].designated_cost + + port_info[root_port].path_cost + ) /* (4.6.8.3.1(2)) */ + ) + || + (((port_info[port_no].designated_cost + + port_info[port_no].path_cost + ) + == + 
(port_info[root_port].designated_cost + + port_info[root_port].path_cost + ) + ) + && + ((br_cmp(port_info[port_no].designated_bridge.BRIDGE_ID, + port_info[root_port].designated_bridge.BRIDGE_ID) < 0 + ) /* (4.6.8.3.1(3)) */ + || + ((br_cmp(port_info[port_no].designated_bridge.BRIDGE_ID, + port_info[root_port].designated_bridge.BRIDGE_ID) == 0 + ) + && + ((port_info[port_no].designated_port + < port_info[root_port].designated_port + ) /* (4.6.8.3.1(4)) */ + || + ((port_info[port_no].designated_port + = port_info[root_port].designated_port + ) + && + (port_info[port_no].port_id + < port_info[root_port].port_id + ) /* (4.6.8.3.1(5)) */ + ))))))))) { + root_port = port_no; + } + } + bridge_info.root_port = root_port; /* (4.6.8.3.1) */ + + if (root_port == No_port) { /* (4.6.8.3.2) */ + bridge_info.designated_root = bridge_info.bridge_id; + /* (4.6.8.3.2(1)) */ + bridge_info.root_path_cost = Zero;/* (4.6.8.3.2(2)) */ + } else { /* (4.6.8.3.3) */ + bridge_info.designated_root = port_info[root_port].designated_root; + /* (4.6.8.3.3(1)) */ + bridge_info.root_path_cost = (port_info[root_port].designated_cost + + port_info[root_port].path_cost + ); /* (4.6.8.3.3(2)) */ + } +} + +void designated_port_selection(void) +{ /* (4.6.9) */ + int port_no; + + for (port_no = One; port_no <= No_of_ports; port_no++) { /* (4.6.9.3) */ + if (designated_port(port_no) /* (4.6.9.3.1) */ + || + ( + br_cmp(port_info[port_no].designated_root.BRIDGE_ID, + bridge_info.designated_root.BRIDGE_ID) != 0 + ) + || + (bridge_info.root_path_cost + < port_info[port_no].designated_cost + ) /* (4.6.9.3.3) */ + || + ((bridge_info.root_path_cost + == port_info[port_no].designated_cost + ) + && + ((br_cmp(bridge_info.bridge_id.BRIDGE_ID, + port_info[port_no].designated_bridge.BRIDGE_ID) < 0 + ) /* (4.6.9.3.4) */ + || + ((br_cmp(bridge_info.bridge_id.BRIDGE_ID, + port_info[port_no].designated_bridge.BRIDGE_ID) == 0 + ) + && + (port_info[port_no].port_id + <= port_info[port_no].designated_port + ) /* 
(4.6.9.3.5) */ + )))) { + become_designated_port(port_no); /* (4.6.10.3.2.2) */ + } + } +} + +void become_designated_port(int port_no) +{ /* (4.6.10) */ + + /* (4.6.10.3.1) */ + port_info[port_no].designated_root = bridge_info.designated_root; + /* (4.6.10.3.2) */ + port_info[port_no].designated_cost = bridge_info.root_path_cost; + /* (4.6.10.3.3) */ + port_info[port_no].designated_bridge = bridge_info.bridge_id; + /* (4.6.10.3.4) */ + port_info[port_no].designated_port = port_info[port_no].port_id; +} + +void port_state_selection(void) +{ /* (4.6.11) */ + int port_no; + for (port_no = One; port_no <= No_of_ports; port_no++) { + if (port_no == bridge_info.root_port) { /* (4.6.11.3.1) */ + port_info[port_no].config_pending = FALSE; /* (4.6.11.3~1(1)) */ + port_info[port_no].top_change_ack = 0; + make_forwarding(port_no); /* (4.6.11.3.1(2)) */ + } else if (designated_port(port_no)) { /* (4.6.11.3.2) */ + stop_message_age_timer(port_no); /* (4.6.11.3.2(1)) */ + make_forwarding(port_no); /* (4.6.11.3.2(2)) */ + } else { /* (4.6.11.3.3) */ + port_info[port_no].config_pending = FALSE; /* (4.6.11.3.3(1)) */ + port_info[port_no].top_change_ack = 0; + make_blocking(port_no); /* (4.6.11.3.3(2)) */ + } + } + +} + +void make_forwarding(int port_no) +{ /* (4.6.12) */ + if (port_info[port_no].state == Blocking) { /* (4.6.12.3) */ + set_port_state(port_no, Listening); /* (4.6.12.3.1) */ + start_forward_delay_timer(port_no); /* (4.6.12.3.2) */ + } +} + +void topology_change_detection(void) +{ /* (4.6.14) */ + if (root_bridge()) { /* (4.6.14.3.1) */ + bridge_info.top_change = 1; + start_topology_change_timer(); /* (4.6.14.3.1(2)) */ + } else if (!(bridge_info.top_change_detected)) { + transmit_tcn(); /* (4.6.14.3.2(1)) */ + start_tcn_timer(); /* (4.6.14.3.2(2)) */ + } + bridge_info.top_change = 1; +} + +void topology_change_acknowledged(void) +{ /* (4.6.15) */ + bridge_info.top_change_detected = 0; + stop_tcn_timer(); /* (4.6.15.3.2) */ +} + +void acknowledge_topology_change(int 
port_no) +{ /* (4.6.16) */ + port_info[port_no].top_change_ack = 1; + transmit_config(port_no); /* (4.6.16.3.2) */ +} + +void make_blocking(int port_no) /* (4.6.13) */ +{ + + if ((port_info[port_no].state != Disabled) + && + (port_info[port_no].state != Blocking) + /* (4.6.13.3) */ + ) { + if ((port_info[port_no].state == Forwarding) + || + (port_info[port_no].state == Learning) + ) { + topology_change_detection(); /* (4.6.13.3.1) */ + /* (4.6.14.2.3) */ + } + set_port_state(port_no, Blocking);/* (4.6.13.3.2) */ + stop_forward_delay_timer(port_no);/* (4.6.13.3.3) */ + } +} + +void set_port_state(int port_no, int state) +{ + port_info[port_no].state = state; +} + +void received_config_bpdu(int port_no, Config_bpdu *config) /* (4.7.1) */ +{ + int root; + + root = root_bridge(); + if (port_info[port_no].state != Disabled) { + if (supersedes_port_info(port_no, config)) { /* (4.7.1.1) *//* (4. + * 6.2.2) */ + record_config_information(port_no, config); /* (4.7.1.1.1) */ + /* (4.6.2.2) */ + configuration_update(); /* (4.7.1.1.2) */ + /* (4.6.7.2.1) */ + port_state_selection(); /* (4.7.1.1.3) */ + /* (4.6.11.2.1) */ + if ((!root_bridge()) && root) { /* (4.7.1.1.4) */ + stop_hello_timer(); + if (bridge_info.top_change_detected) { /* (4.7.1.1.5~ */ + stop_topology_change_timer(); + transmit_tcn(); /* (4.6.6.1) */ + start_tcn_timer(); + } + } + if (port_no == bridge_info.root_port) { + record_config_timeout_values(config); /* (4.7.1.1.6) */ + /* (4.6.3.2) */ + config_bpdu_generation(); /* (4.6.4.2.1) */ + if (config->flags & TOPOLOGY_CHANGE_ACK) { /* (4.7.1.1.7) */ + topology_change_acknowledged(); /* (4.6.15.2) */ + } + } + } else if (designated_port(port_no)) { /* (4.7.1.2) */ + reply(port_no); /* (4.7.1.2.1) */ + /* (4.6.5.2) */ + } + } +} + +void received_tcn_bpdu(int port_no, Tcn_bpdu *tcn) /* (4.7.2) */ +{ + if (port_info[port_no].state != Disabled) { + if (designated_port(port_no)) { + topology_change_detection(); /* (4.7.2.1) */ + /* (4.6.14.2.1) */ + 
acknowledge_topology_change(port_no); /* (4.7.2.2) */ + } /* (4.6.16.2) */ + } +} + +void hello_timer_expiry(void) +{ /* (4.7.3) */ + config_bpdu_generation(); /* (4.6.4.2.2) */ + start_hello_timer(); +} + +void message_age_timer_expiry(int port_no) /* (4.7.4) */ +{ + int root; + root = root_bridge(); + + become_designated_port(port_no); /* (4.7.4.1) */ + /* (4.6.10.2.1) */ + configuration_update(); /* (4.7.4.2) */ + /* (4.6.7.2.2) */ + port_state_selection(); /* (4.7.4.3) */ + /* (4.6.11.2.2) */ + if ((root_bridge()) && (!root)) { /* (4.7.4.4) */ + + bridge_info.max_age = bridge_info.bridge_max_age; /* (4.7.4.4.1) */ + bridge_info.hello_time = bridge_info.bridge_hello_time; + bridge_info.forward_delay = bridge_info.bridge_forward_delay; + topology_change_detection(); /* (4.7.4.4.2) */ + /* (4.6.14.2.4) */ + stop_tcn_timer(); /* (4.7.4.4.3) */ + config_bpdu_generation(); /* (4.7.4.4.4) */ + /* (4.6.4.4.3) */ + start_hello_timer(); + } +} + +void forward_delay_timer_expiry(int port_no) /* (4.7.5) */ +{ + if (port_info[port_no].state == Listening) { /* (4.7.5.1) */ + set_port_state(port_no, Learning); /* (4.7.5.1.1) */ + start_forward_delay_timer(port_no); /* (4.7.5.1.2) */ + } else if (port_info[port_no].state == Learning) { /* (4.7.5.2) */ + set_port_state(port_no, Forwarding); /* (4.7.5.2.1) */ + if (designated_for_some_port()) { /* (4.7.5.2.2) */ + topology_change_detection(); /* (4.6.14.2.2) */ + + } + } +} + +int designated_for_some_port(void) +{ + int port_no; + + + for (port_no = One; port_no <= No_of_ports; port_no++) { + if ((br_cmp(port_info[port_no].designated_bridge.BRIDGE_ID, + bridge_info.bridge_id.BRIDGE_ID) == 0) + ) { + return (TRUE); + } + } + return (FALSE); +} + +void tcn_timer_expiry(void) +{ /* (4.7.6) */ + transmit_tcn(); /* (4.7.6.1) */ + start_tcn_timer(); /* (4.7.6.2) */ +} + +void topology_change_timer_expiry(void) +{ /* (4.7.7) */ + bridge_info.top_change_detected = 0; + bridge_info.top_change = 0; + /* (4.7.7.2) */ +} + +void 
hold_timer_expiry(int port_no) /* (4.7.8) */ +{ + if (port_info[port_no].config_pending) { + transmit_config(port_no); /* (4.7.8.1) */ + } /* (4.6.1.2.3) */ +} + +void br_init(void) +{ /* (4.8.1) */ + int port_no; + + printk(KERN_INFO "Ethernet Bridge 003 for NET3.037 (Linux 2.1)\n"); + bridge_info.designated_root = bridge_info.bridge_id; /* (4.8.1.1) */ + bridge_info.root_path_cost = Zero; + bridge_info.root_port = No_port; + + bridge_info.bridge_max_age = BRIDGE_MAX_AGE; + bridge_info.bridge_hello_time = BRIDGE_HELLO_TIME; + bridge_info.bridge_forward_delay = BRIDGE_FORWARD_DELAY; + bridge_info.hold_time = HOLD_TIME; + + bridge_info.max_age = bridge_info.bridge_max_age; /* (4.8.1.2) */ + bridge_info.hello_time = bridge_info.bridge_hello_time; + bridge_info.forward_delay = bridge_info.bridge_forward_delay; + + bridge_info.top_change_detected = 0; + bridge_info.top_change = 0; + stop_tcn_timer(); + stop_topology_change_timer(); + for (port_no = One; port_no <= No_of_ports; port_no++) { /* (4.8.1.4) */ + br_init_port(port_no); + disable_port(port_no); + } + port_state_selection(); /* (4.8.1.5) */ + config_bpdu_generation(); /* (4.8.1.6) */ + + /* initialize system timer */ + tl.expires = jiffies+HZ; /* 1 second */ + tl.function = br_tick; + add_timer(&tl); + + register_netdevice_notifier(&br_dev_notifier); + br_stats.flags = 0; /*BR_UP | BR_DEBUG*/; /* enable bridge */ + br_stats.policy = BR_ACCEPT; /* Enable bridge to accpet all protocols */ + br_stats.exempt_protocols = 0; + /*start_hello_timer();*/ +} + +void br_init_port(int port_no) +{ + become_designated_port(port_no); /* (4.8.1.4.1) */ + set_port_state(port_no, Blocking); /* (4.8.1.4.2) */ + port_info[port_no].top_change_ack = 0; + port_info[port_no].config_pending = FALSE;/* (4.8.1.4.4) */ + stop_message_age_timer(port_no); /* (4.8.1.4.5) */ + stop_forward_delay_timer(port_no); /* (4.8.1.4.6) */ + stop_hold_timer(port_no); /* (4.8.1.4.7) */ +} + +void enable_port(int port_no) /* (4.8.2) */ +{ + 
br_init_port(port_no); + port_state_selection(); /* (4.8.2.7) */ +} /* */ + +void disable_port(int port_no) /* (4.8.3) */ +{ + int root; + + root = root_bridge(); + become_designated_port(port_no); /* (4.8.3.1) */ + set_port_state(port_no, Disabled); /* (4.8.3.2) */ + port_info[port_no].top_change_ack = 0; + port_info[port_no].config_pending = FALSE;/* (4.8.3.4) */ + stop_message_age_timer(port_no); /* (4.8.3.5) */ + stop_forward_delay_timer(port_no); /* (4.8.3.6) */ + configuration_update(); + port_state_selection(); /* (4.8.3.7) */ + if ((root_bridge()) && (!root)) { /* (4.8.3.8) */ + bridge_info.max_age = bridge_info.bridge_max_age; /* (4.8.3.8.1) */ + bridge_info.hello_time = bridge_info.bridge_hello_time; + bridge_info.forward_delay = bridge_info.bridge_forward_delay; + topology_change_detection(); /* (4.8.3.8.2) */ + stop_tcn_timer(); /* (4.8.3.8.3) */ + config_bpdu_generation(); /* (4.8.3.8.4) */ + start_hello_timer(); + } +} + + +void set_bridge_priority(bridge_id_t *new_bridge_id) /* (4.8.4) */ +{ + + int root; + int port_no; + root = root_bridge(); + for (port_no = One; port_no <= No_of_ports; port_no++) { /* (4.8.4.2) */ + if (designated_port(port_no)) { + port_info[port_no].designated_bridge = *new_bridge_id; + } + } + + bridge_info.bridge_id = *new_bridge_id; /* (4.8.4.3) */ + configuration_update(); /* (4.8.4.4) */ + port_state_selection(); /* (4.8.4.5) */ + if ((root_bridge()) && (!root)) { /* (4.8.4.6) */ + bridge_info.max_age = bridge_info.bridge_max_age; /* (4.8.4.6.1) */ + bridge_info.hello_time = bridge_info.bridge_hello_time; + bridge_info.forward_delay = bridge_info.bridge_forward_delay; + topology_change_detection(); /* (4.8.4.6.2) */ + stop_tcn_timer(); /* (4.8.4.6.3) */ + config_bpdu_generation(), /* (4.8.4.6.4) */ + start_hello_timer(); + } +} + +void set_port_priority(int port_no, unsigned short new_port_id) /* (4.8.5) */ +{ + if (designated_port(port_no)) { /* (4.8.5.2) */ + port_info[port_no].designated_port = new_port_id; + } + 
port_info[port_no].port_id = new_port_id; /* (4.8.5.3) */ + if ((br_cmp(bridge_info.bridge_id.BRIDGE_ID, + port_info[port_no].designated_bridge.BRIDGE_ID) == 0 + ) + && + (port_info[port_no].port_id + < port_info[port_no].designated_port + + ) + ) { + become_designated_port(port_no); /* (4.8.5.4.1) */ + port_state_selection(); /* (4.8.5.4.2) */ + } +} + +void set_path_cost(int port_no, unsigned short path_cost) /* (4.8.6) */ +{ + port_info[port_no].path_cost = path_cost; /* (4.8.6.1) */ + configuration_update(); /* (4.8.6.2) */ + port_state_selection(); /* (4.8.6.3) */ +} + +static void br_tick(unsigned long arg) +{ + int port_no; + + if (hello_timer_expired()) { + hello_timer_expiry(); + } + if (tcn_timer_expired()) { + tcn_timer_expiry(); + } + if (topology_change_timer_expired()) { + topology_change_timer_expiry(); + } + for (port_no = One; port_no <= No_of_ports; port_no++) { + if (forward_delay_timer_expired(port_no)) { + forward_delay_timer_expiry(port_no); + } + if (message_age_timer_expired(port_no)) { + message_age_timer_expiry(port_no); + } + if (hold_timer_expired(port_no)) { + hold_timer_expiry(port_no); + } + } + /* call me again sometime... 
*/ + tl.expires = jiffies+HZ; /* 1 second */ + tl.function = br_tick; + add_timer(&tl); +} + +void start_hello_timer(void) +{ + hello_timer.value = 0; + hello_timer.active = TRUE; +} + +void stop_hello_timer(void) +{ + hello_timer.active = FALSE; +} + +int hello_timer_expired(void) +{ + if (hello_timer.active && (++hello_timer.value >= bridge_info.hello_time)) { + hello_timer.active = FALSE; + return (TRUE); + } + return (FALSE); +} + +void start_tcn_timer(void) +{ + tcn_timer.value = 0; + tcn_timer.active = TRUE; +} + +void stop_tcn_timer(void) +{ + tcn_timer.active = FALSE; +} + +int tcn_timer_expired(void) +{ + if (tcn_timer.active && (++tcn_timer.value >= + bridge_info.bridge_hello_time)) { + tcn_timer.active = FALSE; + return (TRUE); + } + return (FALSE); + +} + +void start_topology_change_timer(void) +{ + topology_change_timer.value = 0; + topology_change_timer.active = TRUE; +} + +void stop_topology_change_timer(void) +{ + topology_change_timer.active = FALSE; +} + +int topology_change_timer_expired(void) +{ + if (topology_change_timer.active + && (++topology_change_timer.value + >= bridge_info.topology_change_time + )) { + topology_change_timer.active = FALSE; + return (TRUE); + } + return (FALSE); +} + +void start_message_age_timer(int port_no, unsigned short message_age) +{ + message_age_timer[port_no].value = message_age; + message_age_timer[port_no].active = TRUE; +} + +void stop_message_age_timer(int port_no) +{ + message_age_timer[port_no].active = FALSE; +} + +int message_age_timer_expired(int port_no) +{ + if (message_age_timer[port_no].active && + (++message_age_timer[port_no].value >= bridge_info.max_age)) { + message_age_timer[port_no].active = FALSE; + return (TRUE); + } + return (FALSE); +} + +void start_forward_delay_timer(int port_no) +{ + forward_delay_timer[port_no].value = 0; + forward_delay_timer[port_no].active = TRUE; +} + +void stop_forward_delay_timer(int port_no) +{ + forward_delay_timer[port_no].active = FALSE; +} + +int 
forward_delay_timer_expired(int port_no) +{ + if (forward_delay_timer[port_no].active && + (++forward_delay_timer[port_no].value >= bridge_info.forward_delay)) { + forward_delay_timer[port_no].active = FALSE; + return (TRUE); + } + return (FALSE); +} + +void start_hold_timer(int port_no) +{ + hold_timer[port_no].value = 0; + hold_timer[port_no].active = TRUE; +} + +void stop_hold_timer(int port_no) +{ + hold_timer[port_no].active = FALSE; +} + + +int hold_timer_expired(int port_no) +{ + if (hold_timer[port_no].active && + (++hold_timer[port_no].value >= bridge_info.hold_time)) { + hold_timer[port_no].active = FALSE; + return (TRUE); + } + return (FALSE); + +} + +int send_config_bpdu(int port_no, Config_bpdu *config_bpdu) +{ +struct sk_buff *skb; +struct device *dev = port_info[port_no].dev; +int size; +unsigned long flags; + + if (port_info[port_no].state == Disabled) { + printk(KERN_DEBUG "send_config_bpdu: port %i not valid\n",port_no); + return(-1); + } + if (br_stats.flags & BR_DEBUG) + printk("send_config_bpdu: "); + /* + * create and send the message + */ + size = sizeof(Config_bpdu) + dev->hard_header_len; + skb = alloc_skb(size, GFP_ATOMIC); + if (skb == NULL) { + printk(KERN_DEBUG "send_config_bpdu: no skb available\n"); + return(-1); + } + skb->dev = dev; + skb->free = 1; + skb->h.eth = (struct ethhdr *)skb_put(skb, size); + memcpy(skb->h.eth->h_dest, bridge_ula, ETH_ALEN); + memcpy(skb->h.eth->h_source, dev->dev_addr, ETH_ALEN); + if (br_stats.flags & BR_DEBUG) + printk("port %i src %02x:%02x:%02x:%02x:%02x:%02x\ + dest %02x:%02x:%02x:%02x:%02x:%02x\n", + port_no, + skb->h.eth->h_source[0], + skb->h.eth->h_source[1], + skb->h.eth->h_source[2], + skb->h.eth->h_source[3], + skb->h.eth->h_source[4], + skb->h.eth->h_source[5], + skb->h.eth->h_dest[0], + skb->h.eth->h_dest[1], + skb->h.eth->h_dest[2], + skb->h.eth->h_dest[3], + skb->h.eth->h_dest[4], + skb->h.eth->h_dest[5]); + skb->h.eth->h_proto = htonl(0x8038); /* XXX verify */ + + skb->h.raw += 
skb->dev->hard_header_len; + memcpy(skb->h.raw, config_bpdu, sizeof(Config_bpdu)); + + /* won't get bridged again... */ + skb->pkt_bridged = IS_BRIDGED; + skb->arp = 1; /* do not resolve... */ + skb->h.raw = skb->data + ETH_HLEN; + save_flags(flags); + cli(); + skb_queue_tail(dev->buffs, skb); + restore_flags(flags); + return(0); +} + +int send_tcn_bpdu(int port_no, Tcn_bpdu *bpdu) +{ +struct sk_buff *skb; +struct device *dev = port_info[port_no].dev; +int size; +unsigned long flags; + + if (port_info[port_no].state == Disabled) { + printk(KERN_DEBUG "send_tcn_bpdu: port %i not valid\n",port_no); + return(-1); + } + if (br_stats.flags & BR_DEBUG) + printk("send_tcn_bpdu: "); + size = sizeof(Tcn_bpdu) + dev->hard_header_len; + skb = alloc_skb(size, GFP_ATOMIC); + if (skb == NULL) { + printk(KERN_DEBUG "send_tcn_bpdu: no skb available\n"); + return(-1); + } + skb->dev = dev; + skb->free = 1; + skb->h.eth = (struct ethhdr *)skb_put(skb,size); + memcpy(skb->h.eth->h_dest, bridge_ula, ETH_ALEN); + memcpy(skb->h.eth->h_source, dev->dev_addr, ETH_ALEN); + if (br_stats.flags & BR_DEBUG) + printk("port %i src %02x:%02x:%02x:%02x:%02x:%02x\ + dest %02x:%02x:%02x:%02x:%02x:%02x\n", + port_no, + skb->h.eth->h_source[0], + skb->h.eth->h_source[1], + skb->h.eth->h_source[2], + skb->h.eth->h_source[3], + skb->h.eth->h_source[4], + skb->h.eth->h_source[5], + skb->h.eth->h_dest[0], + skb->h.eth->h_dest[1], + skb->h.eth->h_dest[2], + skb->h.eth->h_dest[3], + skb->h.eth->h_dest[4], + skb->h.eth->h_dest[5]); + skb->h.eth->h_proto = 0x8038; /* XXX verify */ + + skb->h.raw += skb->dev->hard_header_len; + memcpy(skb->h.raw, bpdu, sizeof(Tcn_bpdu)); + + /* mark that's we've been here... */ + skb->pkt_bridged = IS_BRIDGED; + skb->arp = 1; /* do not resolve... 
*/ + skb->h.raw = skb->data + ETH_HLEN; + save_flags(flags); + cli(); + skb_queue_tail(dev->buffs, skb); + restore_flags(flags); + return(0); +} + +static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr) +{ + struct device *dev = ptr; + int i; + + /* check for loopback devices */ + if (dev->flags & IFF_LOOPBACK) + return(NOTIFY_DONE); + + switch (event) { + case NETDEV_DOWN: + if (br_stats.flags & BR_DEBUG) + printk("br_device_event: NETDEV_DOWN...\n"); + /* find our device and mark it down */ + for (i = One; i <= No_of_ports; i++) { + if (port_info[i].dev == dev) { + disable_port(i); + return NOTIFY_DONE; + break; + } + } + break; + case NETDEV_UP: + if (br_stats.flags & BR_DEBUG) + printk("br_device_event: NETDEV_UP...\n"); + /* Only handle ethernet ports */ + if(dev->type!=ARPHRD_ETHER && dev->type!=ARPHRD_LOOPBACK) + return NOTIFY_DONE; + /* look up an unused device and enable it */ + for (i = One; i <= No_of_ports; i++) { + if ((port_info[i].dev == (struct device *)0) || + (port_info[i].dev == dev)) { + port_info[i].dev = dev; + enable_port(i); + set_path_cost(i, br_port_cost(dev)); + set_port_priority(i, 128); + port_info[i].port_id = i; + /* set bridge addr from 1st device addr */ + if ((bridge_info.bridge_id.BRIDGE_ID[0] == 0) && + (bridge_info.bridge_id.BRIDGE_ID[1] == 0)) { + memcpy(bridge_info.bridge_id.BRIDGE_ID_ULA, dev->dev_addr, 6); + bridge_info.bridge_id.BRIDGE_PRIORITY = port_info[i].port_id; + set_bridge_priority(&bridge_info.bridge_id); + } + make_forwarding(i); + return NOTIFY_DONE; + break; + } + } + break; +#if 0 + default: + printk("br_device_event: unknown event [%x]\n", + (unsigned int)event); +#endif + } + return NOTIFY_DONE; +} + +/* + * following routine is called when a frame is received + * from an interface, it returns 1 when it consumes the + * frame, 0 when it does not + */ + +int br_receive_frame(struct sk_buff *skb) /* 3.5 */ +{ + int port; + + if (br_stats.flags & BR_DEBUG) + 
printk("br_receive_frame: "); + /* sanity */ + if (!skb) { + printk(KERN_CRIT "br_receive_frame: no skb!\n"); + return(1); + } + + skb->pkt_bridged = IS_BRIDGED; + + /* check for loopback */ + if (skb->dev->flags & IFF_LOOPBACK) + return(0); + + port = find_port(skb->dev); + + skb->arp = 1; /* Received frame so it is resolved */ + skb->h.raw = skb->mac.raw; + if (br_stats.flags & BR_DEBUG) + printk("port %i src %02x:%02x:%02x:%02x:%02x:%02x\ + dest %02x:%02x:%02x:%02x:%02x:%02x\n", + port, + skb->h.eth->h_source[0], + skb->h.eth->h_source[1], + skb->h.eth->h_source[2], + skb->h.eth->h_source[3], + skb->h.eth->h_source[4], + skb->h.eth->h_source[5], + skb->h.eth->h_dest[0], + skb->h.eth->h_dest[1], + skb->h.eth->h_dest[2], + skb->h.eth->h_dest[3], + skb->h.eth->h_dest[4], + skb->h.eth->h_dest[5]); + + if (!port) { + if(br_stats.flags&BR_DEBUG) + printk("\nbr_receive_frame: no port!\n"); + return(0); + } + + switch (port_info[port].state) + { + case Learning: + (void) br_learn(skb, port); /* 3.8 */ + /* fall through */ + case Listening: + /* process BPDUs */ + if (memcmp(skb->h.eth->h_dest, bridge_ula, 6) == 0) { + br_bpdu(skb); + return(1); /* br_bpdu consumes skb */ + } + /* fall through */ + case Blocking: + /* fall through */ + case Disabled: + /* should drop frames, but for now, we let + * them get passed up to the next higher layer + return(br_drop(skb)); + */ + return(0); /* pass frame up stack */ + break; + case Forwarding: + (void) br_learn(skb, port); /* 3.8 */ + /* process BPDUs */ + if (memcmp(skb->h.eth->h_dest, bridge_ula, + ETH_ALEN) == 0) + { + /*printk("frame bpdu processor for me!!!\n");*/ + br_bpdu(skb); + return(1); /* br_bpdu consumes skb */ + } + /* is frame for me? */ + if (memcmp(skb->h.eth->h_dest, + port_info[port].dev->dev_addr, + ETH_ALEN) == 0) + { + /* Packet is for us */ + skb->pkt_type = PACKET_HOST; + return(0); /* pass frame up our stack (this will */ + /* happen in net_bh() in dev.c) */ + } + /* ok, forward this frame... 
*/ + return(br_forward(skb, port)); + default: + printk(KERN_DEBUG "br_receive_frame: port [%i] unknown state [%i]\n", + port, port_info[port].state); + return(0); /* pass frame up stack? */ + } +} + +/* + * the following routine is called to transmit frames from the host + * stack. it returns 1 when it consumes the frame and + * 0 when it does not. + */ + +int br_tx_frame(struct sk_buff *skb) /* 3.5 */ +{ + int port; + + /* sanity */ + if (!skb) + { + printk(KERN_CRIT "br_tx_frame: no skb!\n"); + return(0); + } + + if (!skb->dev) + { + printk(KERN_CRIT "br_tx_frame: no dev!\n"); + return(0); + } + + /* check for loopback */ + if (skb->dev->flags & IFF_LOOPBACK) + return(0); + + skb->h.raw = skb->data; + port = 0; /* an impossible port */ + if (br_stats.flags & BR_DEBUG) + printk("br_tx_fr : port %i src %02x:%02x:%02x:%02x:%02x:%02x\ + dest %02x:%02x:%02x:%02x:%02x:%02x\n", + port, + skb->h.eth->h_source[0], + skb->h.eth->h_source[1], + skb->h.eth->h_source[2], + skb->h.eth->h_source[3], + skb->h.eth->h_source[4], + skb->h.eth->h_source[5], + skb->h.eth->h_dest[0], + skb->h.eth->h_dest[1], + skb->h.eth->h_dest[2], + skb->h.eth->h_dest[3], + skb->h.eth->h_dest[4], + skb->h.eth->h_dest[5]); + return(br_forward(skb, port)); +} + +/* + * this routine returns 0 when it learns (or updates) from the + * frame, and -1 if the frame is simply discarded due to port + * state or lack of resources... 
+ */ + +int br_learn(struct sk_buff *skb, int port) /* 3.8 */ +{ + struct fdb *f; + + switch (port_info[port].state) { + case Listening: + case Blocking: + case Disabled: + default: + return(-1); + /* break; */ + case Learning: + case Forwarding: + /* don't keep group addresses in the tree */ + if (skb->h.eth->h_source[0] & 0x01) + return(-1); + + f = (struct fdb *)kmalloc(sizeof(struct fdb), + GFP_ATOMIC); + + if (!f) { + printk(KERN_DEBUG "br_learn: unable to malloc fdb\n"); + return(-1); + } + f->port = port; /* source port */ + memcpy(f->ula, skb->h.eth->h_source, 6); + f->timer = CURRENT_TIME; + f->flags = FDB_ENT_VALID; + /* + * add entity to AVL tree. If entity already + * exists in the tree, update the fields with + * what we have here. + */ + if (br_avl_insert(f) == 0) { /* update */ + kfree(f); + return(0); + } + /* add to head of port chain */ + f->fdb_next = port_info[port].fdb; + port_info[port].fdb = f; + return(0); + /* break */ + } +} + +/* + * this routine always consumes the frame + */ + +int br_drop(struct sk_buff *skb) +{ + kfree_skb(skb, 0); + return(1); +} + +/* + * this routine always consumes the frame + */ + +int br_dev_drop(struct sk_buff *skb) +{ + dev_kfree_skb(skb, 0); + return(1); +} + +/* + * this routine returns 1 if it consumes the frame, 0 + * if not... + */ + +int br_forward(struct sk_buff *skb, int port) /* 3.7 */ +{ + struct fdb *f; + + /* + * flood all ports with frames destined for a group + * address. If frame came from above, drop it, + * otherwise it will be handled in br_receive_frame() + * Multicast frames will also need to be seen + * by our upper layers. + */ + if (skb->h.eth->h_dest[0] & 0x01) + { + /* group address */ + br_flood(skb, port); + /* + * External groups are fed out via the normal source + * This probably should be dropped since the flood will + * have sent it anyway. 
+ */ + if (port == 0) /* locally generated */ + return(br_dev_drop(skb)); + return(0); + } else { + /* locate port to forward to */ + f = br_avl_find_addr(skb->h.eth->h_dest); + /* + * Send flood and drop. + */ + if (!f || !(f->flags & FDB_ENT_VALID)) { + /* not found; flood all ports */ + br_flood(skb, port); + return(br_dev_drop(skb)); + } + /* + * Sending + */ + if (f->port!=port && port_info[f->port].state == Forwarding) { + /* has entry expired? */ + if (f->timer + fdb_aging_time < CURRENT_TIME) { + /* timer expired, invalidate entry */ + f->flags &= ~FDB_ENT_VALID; + if (br_stats.flags & BR_DEBUG) + printk("fdb entry expired...\n"); + /* + * Send flood and drop original + */ + br_flood(skb, port); + return(br_dev_drop(skb)); + } + /* mark that's we've been here... */ + skb->pkt_bridged = IS_BRIDGED; + + /* reset the skb->ip pointer */ + skb->h.raw = skb->data + ETH_HLEN; + + /* + * Send the buffer out. + */ + + skb->dev=port_info[f->port].dev; + + /* + * We send this still locked + */ + dev_queue_xmit(skb, skb->dev,1); + return(1); /* skb has been consumed */ + } else { + /* + * Arrived on the right port, we discard + */ + return(br_dev_drop(skb)); + } + } +} + +/* + * this routine sends a copy of the frame to all forwarding ports + * with the exception of the port given. This routine never + * consumes the original frame. + */ + +int br_flood(struct sk_buff *skb, int port) +{ + int i; + struct sk_buff *nskb; + + for (i = One; i <= No_of_ports; i++) + { + if (i == port) + continue; + if (port_info[i].state == Forwarding) + { + nskb = skb_clone(skb, GFP_ATOMIC); + if(nskb==NULL) + continue; + /* mark that's we've been here... 
*/ + nskb->pkt_bridged = IS_BRIDGED; + /* Send to each port in turn */ + nskb->dev= port_info[i].dev; + /* To get here we must have done ARP already, + or have a received valid MAC header */ + nskb->arp = 1; + +/* printk("Flood to port %d\n",i);*/ + nskb->h.raw = nskb->data + ETH_HLEN; + dev_queue_xmit(nskb,nskb->dev,1); + } + } + return(0); +} + +int find_port(struct device *dev) +{ + int i; + + for (i = One; i <= No_of_ports; i++) + if ((port_info[i].dev == dev) && + (port_info[i].state != Disabled)) + return(i); + return(0); +} + +int br_port_cost(struct device *dev) /* 4.10.2 */ +{ + if (strncmp(dev->name, "eth", 3) == 0) /* ethernet */ + return(100); + if (strncmp(dev->name, "wic", 3) == 0) /* wic */ + return(1600); + if (strncmp(dev->name, "plip",4) == 0) /* plip */ + return (1600); + return(100); /* default */ +} + +/* + * this routine always consumes the skb + */ + +void br_bpdu(struct sk_buff *skb) /* consumes skb */ +{ + Tcn_bpdu *bpdu; + int port; + + port = find_port(skb->dev); + if (port == 0) { /* unknown port */ + br_drop(skb); + return; + } + + bpdu = (Tcn_bpdu *)skb->data + ETH_HLEN; + switch (bpdu->type) { + case BPDU_TYPE_CONFIG: + received_config_bpdu(port, (Config_bpdu *)bpdu); + break; + case BPDU_TYPE_TOPO_CHANGE: + received_tcn_bpdu(port, bpdu); + break; + default: + printk(KERN_DEBUG "br_bpdu: received unknown bpdu, type = %i\n", + bpdu->type); + /* break; */ + } + br_drop(skb); +} + +int br_ioctl(unsigned int cmd, void *arg) +{ + int err; + struct br_cf bcf; + + switch(cmd) + { + case SIOCGIFBR: /* get bridging control blocks */ + memcpy(&br_stats.bridge_data, &bridge_info, sizeof(Bridge_data)); + memcpy(&br_stats.port_data, &port_info, sizeof(Port_data)*No_of_ports); + err = copy_to_user(arg, &br_stats, sizeof(struct br_stat)); + if (err) + { + err = -EFAULT; + } + return err; + case SIOCSIFBR: + if (!suser()) + return -EPERM; + err = copy_from_user(&bcf, arg, sizeof(struct br_cf)); + if (err) + return -EFAULT; + switch (bcf.cmd) { + case 
BRCMD_BRIDGE_ENABLE: + if (br_stats.flags & BR_UP) + return(-EALREADY); + printk(KERN_DEBUG "br: enabling bridging function\n"); + br_stats.flags |= BR_UP; /* enable bridge */ + start_hello_timer(); + break; + case BRCMD_BRIDGE_DISABLE: + if (!(br_stats.flags & BR_UP)) + return(-EALREADY); + printk(KERN_DEBUG "br: disabling bridging function\n"); + br_stats.flags &= ~BR_UP; /* disable bridge */ + stop_hello_timer(); +#if 0 + for (i = One; i <= No_of_ports; i++) + if (port_info[i].state != Disabled) + disable_port(i); +#endif + break; + case BRCMD_PORT_ENABLE: + if (port_info[bcf.arg1].dev == 0) + return(-EINVAL); + if (port_info[bcf.arg1].state != Disabled) + return(-EALREADY); + printk(KERN_DEBUG "br: enabling port %i\n",bcf.arg1); + enable_port(bcf.arg1); + break; + case BRCMD_PORT_DISABLE: + if (port_info[bcf.arg1].dev == 0) + return(-EINVAL); + if (port_info[bcf.arg1].state == Disabled) + return(-EALREADY); + printk(KERN_DEBUG "br: disabling port %i\n",bcf.arg1); + disable_port(bcf.arg1); + break; + case BRCMD_SET_BRIDGE_PRIORITY: + set_bridge_priority((bridge_id_t *)&bcf.arg1); + break; + case BRCMD_SET_PORT_PRIORITY: + if (port_info[bcf.arg1].dev == 0) + return(-EINVAL); + set_port_priority(bcf.arg1, bcf.arg2); + break; + case BRCMD_SET_PATH_COST: + if (port_info[bcf.arg1].dev == 0) + return(-EINVAL); + set_path_cost(bcf.arg1, bcf.arg2); + break; + case BRCMD_ENABLE_DEBUG: + br_stats.flags |= BR_DEBUG; + break; + case BRCMD_DISABLE_DEBUG: + br_stats.flags &= ~BR_DEBUG; + break; + case BRCMD_SET_POLICY: + return br_set_policy(bcf.arg1); + case BRCMD_EXEMPT_PROTOCOL: + return br_add_exempt_protocol(bcf.arg1); + case BRCMD_ENABLE_PROT_STATS: + br_stats.flags |= BR_PROT_STATS; + break; + case BRCMD_DISABLE_PROT_STATS: + br_stats.flags &= ~BR_PROT_STATS; + break; + case BRCMD_ZERO_PROT_STATS: + memset(&br_stats.prot_id,0,sizeof(br_stats.prot_id)); + memset(&br_stats.prot_counter,0,sizeof(br_stats.prot_counter)); + break; + default: + return -EINVAL; + } + 
return(0); + default: + return -EINVAL; + } + /*NOTREACHED*/ + return 0; +} + +int br_cmp(unsigned int *a, unsigned int *b) +{ + int i; + for (i=0; i<2; i++) + { + if (a[i] == b[i]) + continue; + if (a[i] < b[i]) + return(1); + if (a[i] > b[i]) + return(-1); + } + return(0); +} + diff --git a/net/bridge/br_tree.c b/net/bridge/br_tree.c new file mode 100644 index 000000000..a1965d498 --- /dev/null +++ b/net/bridge/br_tree.c @@ -0,0 +1,402 @@ +/* + * this code is derived from the avl functions in mmap.c + */ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/malloc.h> +#include <linux/skbuff.h> + +#include <net/br.h> +#define _DEBUG_AVL + +/* + * Use an AVL (Adelson-Velskii and Landis) tree to speed up this search + * from O(n) to O(log n), where n is the number of ULAs. + * Written by Bruno Haible <haible@ma2s2.mathematik.uni-karlsruhe.de>. + * Taken from mmap.c, extensively modified by John Hayes + * <hayes@netplumbing.com> + */ + +struct fdb fdb_head; +struct fdb *fhp = &fdb_head; +struct fdb **fhpp = &fhp; +static int fdb_inited = 0; + +int addr_cmp(unsigned char *a1, unsigned char *a2); + +/* + * fdb_head is the AVL tree corresponding to fdb + * or, more exactly, its root. + * A fdb has the following fields: + * fdb_avl_left left son of a tree node + * fdb_avl_right right son of a tree node + * fdb_avl_height 1+max(heightof(left),heightof(right)) + * The empty tree is represented as NULL. + */ + +#ifndef avl_br_empty +#define avl_br_empty (struct fdb *) NULL +#endif + +/* Since the trees are balanced, their height will never be large. */ +#define avl_maxheight 127 +#define heightof(tree) ((tree) == avl_br_empty ? 0 : (tree)->fdb_avl_height) +/* + * Consistency and balancing rules: + * 1. tree->fdb_avl_height == 1+max(heightof(tree->fdb_avl_left),heightof(tree->fdb_avl_right)) + * 2. abs( heightof(tree->fdb_avl_left) - heightof(tree->fdb_avl_right) ) <= 1 + * 3. 
foreach node in tree->fdb_avl_left: node->fdb_avl_key <= tree->fdb_avl_key, + * foreach node in tree->fdb_avl_right: node->fdb_avl_key >= tree->fdb_avl_key. + */ + +int +fdb_init(void) +{ + fdb_head.fdb_avl_height = 0; + fdb_head.fdb_avl_left = (struct fdb *)0; + fdb_head.fdb_avl_right = (struct fdb *)0; + fdb_inited = 1; + return(0); +} + +struct fdb * +br_avl_find_addr(unsigned char addr[6]) +{ + struct fdb * result = NULL; + struct fdb * tree; + + if (!fdb_inited) + fdb_init(); +#if (DEBUG_AVL) + printk("searching for ula %02x:%02x:%02x:%02x:%02x:%02x\n", + addr[0], + addr[1], + addr[2], + addr[3], + addr[4], + addr[5]); +#endif /* DEBUG_AVL */ + for (tree = &fdb_head ; ; ) { + if (tree == avl_br_empty) { +#if (DEBUG_AVL) + printk("search failed, returning node 0x%x\n", (unsigned int)result); +#endif /* DEBUG_AVL */ + return result; + } + +#if (DEBUG_AVL) + printk("node 0x%x: checking ula %02x:%02x:%02x:%02x:%02x:%02x\n", + (unsigned int)tree, + tree->ula[0], + tree->ula[1], + tree->ula[2], + tree->ula[3], + tree->ula[4], + tree->ula[5]); +#endif /* DEBUG_AVL */ + if (addr_cmp(addr, tree->ula) == 0) { +#if (DEBUG_AVL) + printk("found node 0x%x\n", (unsigned int)tree); +#endif /* DEBUG_AVL */ + return tree; + } + if (addr_cmp(addr, tree->ula) < 0) { + tree = tree->fdb_avl_left; + } else { + tree = tree->fdb_avl_right; + } + } +} + +/* + * Rebalance a tree. + * After inserting or deleting a node of a tree we have a sequence of subtrees + * nodes[0]..nodes[k-1] such that + * nodes[0] is the root and nodes[i+1] = nodes[i]->{fdb_avl_left|fdb_avl_right}. 
+ */ +static void +br_avl_rebalance (struct fdb *** nodeplaces_ptr, int count) +{ + if (!fdb_inited) + fdb_init(); + for ( ; count > 0 ; count--) { + struct fdb ** nodeplace = *--nodeplaces_ptr; + struct fdb * node = *nodeplace; + struct fdb * nodeleft = node->fdb_avl_left; + struct fdb * noderight = node->fdb_avl_right; + int heightleft = heightof(nodeleft); + int heightright = heightof(noderight); + if (heightright + 1 < heightleft) { + /* */ + /* * */ + /* / \ */ + /* n+2 n */ + /* */ + struct fdb * nodeleftleft = nodeleft->fdb_avl_left; + struct fdb * nodeleftright = nodeleft->fdb_avl_right; + int heightleftright = heightof(nodeleftright); + if (heightof(nodeleftleft) >= heightleftright) { + /* */ + /* * n+2|n+3 */ + /* / \ / \ */ + /* n+2 n --> / n+1|n+2 */ + /* / \ | / \ */ + /* n+1 n|n+1 n+1 n|n+1 n */ + /* */ + node->fdb_avl_left = nodeleftright; + nodeleft->fdb_avl_right = node; + nodeleft->fdb_avl_height = 1 + (node->fdb_avl_height = 1 + heightleftright); + *nodeplace = nodeleft; + } else { + /* */ + /* * n+2 */ + /* / \ / \ */ + /* n+2 n --> n+1 n+1 */ + /* / \ / \ / \ */ + /* n n+1 n L R n */ + /* / \ */ + /* L R */ + /* */ + nodeleft->fdb_avl_right = nodeleftright->fdb_avl_left; + node->fdb_avl_left = nodeleftright->fdb_avl_right; + nodeleftright->fdb_avl_left = nodeleft; + nodeleftright->fdb_avl_right = node; + nodeleft->fdb_avl_height = node->fdb_avl_height = heightleftright; + nodeleftright->fdb_avl_height = heightleft; + *nodeplace = nodeleftright; + } + } else if (heightleft + 1 < heightright) { + /* similar to the above, just interchange 'left' <--> 'right' */ + struct fdb * noderightright = noderight->fdb_avl_right; + struct fdb * noderightleft = noderight->fdb_avl_left; + int heightrightleft = heightof(noderightleft); + if (heightof(noderightright) >= heightrightleft) { + node->fdb_avl_right = noderightleft; + noderight->fdb_avl_left = node; + noderight->fdb_avl_height = 1 + (node->fdb_avl_height = 1 + heightrightleft); + *nodeplace = 
noderight; + } else { + noderight->fdb_avl_left = noderightleft->fdb_avl_right; + node->fdb_avl_right = noderightleft->fdb_avl_left; + noderightleft->fdb_avl_right = noderight; + noderightleft->fdb_avl_left = node; + noderight->fdb_avl_height = node->fdb_avl_height = heightrightleft; + noderightleft->fdb_avl_height = heightright; + *nodeplace = noderightleft; + } + } else { + int height = (heightleft<heightright ? heightright : heightleft) + 1; + if (height == node->fdb_avl_height) + break; + node->fdb_avl_height = height; + } + } +#ifdef DEBUG_AVL + printk_avl(&fdb_head); +#endif /* DEBUG_AVL */ +} + +/* Insert a node into a tree. */ +int +br_avl_insert (struct fdb * new_node) +{ + struct fdb ** nodeplace = fhpp; + struct fdb ** stack[avl_maxheight]; + int stack_count = 0; + struct fdb *** stack_ptr = &stack[0]; /* = &stack[stackcount] */ + if (!fdb_inited) + fdb_init(); + for (;;) { + struct fdb *node; + + node = *nodeplace; + if (node == avl_br_empty) + break; + *stack_ptr++ = nodeplace; stack_count++; + if (addr_cmp(new_node->ula, node->ula) == 0) { /* update */ + node->flags = new_node->flags; + node->timer = new_node->timer; + return(0); + } + if (addr_cmp(new_node->ula, node->ula) < 0) { + nodeplace = &node->fdb_avl_left; + } else { + nodeplace = &node->fdb_avl_right; + } + } +#if (DEBUG_AVL) + printk("node 0x%x: adding ula %02x:%02x:%02x:%02x:%02x:%02x\n", + (unsigned int)new_node, + new_node->ula[0], + new_node->ula[1], + new_node->ula[2], + new_node->ula[3], + new_node->ula[4], + new_node->ula[5]); +#endif /* (DEBUG_AVL) */ + new_node->fdb_avl_left = avl_br_empty; + new_node->fdb_avl_right = avl_br_empty; + new_node->fdb_avl_height = 1; + *nodeplace = new_node; +#if (0) + br_avl_rebalance(stack_ptr,stack_count); +#endif /* (0) */ +#ifdef DEBUG_AVL + printk_avl(&fdb_head); +#endif /* DEBUG_AVL */ + return(1); +} + +/* Removes a node out of a tree. 
*/ +int +br_avl_remove (struct fdb * node_to_delete) +{ + struct fdb ** nodeplace = fhpp; + struct fdb ** stack[avl_maxheight]; + int stack_count = 0; + struct fdb *** stack_ptr = &stack[0]; /* = &stack[stackcount] */ + struct fdb ** nodeplace_to_delete; + if (!fdb_inited) + fdb_init(); + for (;;) { + struct fdb * node = *nodeplace; + if (node == avl_br_empty) { + /* what? node_to_delete not found in tree? */ + printk(KERN_ERR "br: avl_remove: node to delete not found in tree\n"); + return(-1); + } + *stack_ptr++ = nodeplace; stack_count++; + if (addr_cmp(node_to_delete->ula, node->ula) == 0) + break; + if (addr_cmp(node_to_delete->ula, node->ula) < 0) + nodeplace = &node->fdb_avl_left; + else + nodeplace = &node->fdb_avl_right; + } + nodeplace_to_delete = nodeplace; + /* Have to remove node_to_delete = *nodeplace_to_delete. */ + if (node_to_delete->fdb_avl_left == avl_br_empty) { + *nodeplace_to_delete = node_to_delete->fdb_avl_right; + stack_ptr--; stack_count--; + } else { + struct fdb *** stack_ptr_to_delete = stack_ptr; + struct fdb ** nodeplace = &node_to_delete->fdb_avl_left; + struct fdb * node; + for (;;) { + node = *nodeplace; + if (node->fdb_avl_right == avl_br_empty) + break; + *stack_ptr++ = nodeplace; stack_count++; + nodeplace = &node->fdb_avl_right; + } + *nodeplace = node->fdb_avl_left; + /* node replaces node_to_delete */ + node->fdb_avl_left = node_to_delete->fdb_avl_left; + node->fdb_avl_right = node_to_delete->fdb_avl_right; + node->fdb_avl_height = node_to_delete->fdb_avl_height; + *nodeplace_to_delete = node; /* replace node_to_delete */ + *stack_ptr_to_delete = &node->fdb_avl_left; /* replace &node_to_delete->fdb_avl_left */ + } + br_avl_rebalance(stack_ptr,stack_count); + return(0); +} + +#ifdef DEBUG_AVL + +/* print a tree */ +static void printk_avl (struct fdb * tree) +{ + if (tree != avl_br_empty) { + printk("("); + printk("%02x:%02x:%02x:%02x:%02x:%02x", + tree->ula[0], + tree->ula[1], + tree->ula[2], + tree->ula[3], + tree->ula[4], + 
tree->ula[5]); + if (tree->fdb_avl_left != avl_br_empty) { + printk_avl(tree->fdb_avl_left); + printk("<"); + } + if (tree->fdb_avl_right != avl_br_empty) { + printk(">"); + printk_avl(tree->fdb_avl_right); + } + printk(")\n"); + } +} + +#if (0) +static char *avl_check_point = "somewhere"; + +/* check a tree's consistency and balancing */ +static void avl_checkheights (struct fdb * tree) +{ + int h, hl, hr; + + if (tree == avl_br_empty) + return; + avl_checkheights(tree->fdb_avl_left); + avl_checkheights(tree->fdb_avl_right); + h = tree->fdb_avl_height; + hl = heightof(tree->fdb_avl_left); + hr = heightof(tree->fdb_avl_right); + if ((h == hl+1) && (hr <= hl) && (hl <= hr+1)) + return; + if ((h == hr+1) && (hl <= hr) && (hr <= hl+1)) + return; + printk("%s: avl_checkheights: heights inconsistent\n",avl_check_point); +} + +/* check that all values stored in a tree are < key */ +static void avl_checkleft (struct fdb * tree, fdb_avl_key_t key) +{ + if (tree == avl_br_empty) + return; + avl_checkleft(tree->fdb_avl_left,key); + avl_checkleft(tree->fdb_avl_right,key); + if (tree->fdb_avl_key < key) + return; + printk("%s: avl_checkleft: left key %lu >= top key %lu\n",avl_check_point,tree->fdb_avl_key,key); +} + +/* check that all values stored in a tree are > key */ +static void avl_checkright (struct fdb * tree, fdb_avl_key_t key) +{ + if (tree == avl_br_empty) + return; + avl_checkright(tree->fdb_avl_left,key); + avl_checkright(tree->fdb_avl_right,key); + if (tree->fdb_avl_key > key) + return; + printk("%s: avl_checkright: right key %lu <= top key %lu\n",avl_check_point,tree->fdb_avl_key,key); +} + +/* check that all values are properly increasing */ +static void avl_checkorder (struct fdb * tree) +{ + if (tree == avl_br_empty) + return; + avl_checkorder(tree->fdb_avl_left); + avl_checkorder(tree->fdb_avl_right); + avl_checkleft(tree->fdb_avl_left,tree->fdb_avl_key); + avl_checkright(tree->fdb_avl_right,tree->fdb_avl_key); +} + +#endif /* (0) */ +#endif /* DEBUG_AVL */ 
+ +int +addr_cmp(unsigned char a1[], unsigned char a2[]) +{ + int i; + + for (i=0; i<6; i++) { + if (a1[i] > a2[i]) return(1); + if (a1[i] < a2[i]) return(-1); + } + return(0); +} + diff --git a/net/bridge/sysctl_net_bridge.c b/net/bridge/sysctl_net_bridge.c new file mode 100644 index 000000000..6e2f57d65 --- /dev/null +++ b/net/bridge/sysctl_net_bridge.c @@ -0,0 +1,13 @@ +/* -*- linux-c -*- + * sysctl_net_bridge.c: sysctl interface to net bridge subsystem. + * + * Begun June 1, 1996, Mike Shaver. + * Added /proc/sys/net/bridge directory entry (empty =) ). [MS] + */ + +#include <linux/mm.h> +#include <linux/sysctl.h> + +ctl_table bridge_table[] = { + {0} +}; diff --git a/net/core/Makefile b/net/core/Makefile index dee2b16d3..c4216d0e9 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -7,37 +7,25 @@ # # Note 2! The CFLAGS definition is now in the main makefile... -.c.o: - $(CC) $(CFLAGS) -c $< -.s.o: - $(AS) -o $*.o $< -.c.s: - $(CC) $(CFLAGS) -S $< +O_TARGET := core.o - -OBJS := sock.o dev.o dev_mcast.o skbuff.o datagram.o +O_OBJS := sock.o skbuff.o iovec.o datagram.o sysctl_net_core.o ifdef CONFIG_NET -core.o: $(OBJS) - $(LD) -r -o core.o $(OBJS) +O_OBJS += dev.o dev_mcast.o -else +ifdef CONFIG_FIREWALL +OX_OBJS += firewall.o +endif -core.o: - $(AR) rcs core.o +ifdef CONFIG_NET_ALIAS +O_OBJS += net_alias.o +endif endif -dep: - $(CPP) -M *.c > .depend +include $(TOPDIR)/Rules.make tar: tar -cvf /dev/f1 . - -# -# include a dependency file if one exists -# -ifeq (.depend,$(wildcard .depend)) -include .depend -endif diff --git a/net/core/datagram.c b/net/core/datagram.c index ce08e543c..74f10f8a5 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -3,10 +3,10 @@ * * Generic datagram handling routines. These are generic for all protocols. Possibly a generic IP version on top * of these would make sense. Not tonight however 8-). 
- * This is used because UDP, RAW, PACKET and the to be released IPX layer all have identical select code and mostly - * identical recvfrom() code. So we share it here. The select was shared before but buried in udp.c so I moved it. + * This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and NetROM layer all have identical select code and mostly + * identical recvmsg() code. So we share it here. The select was shared before but buried in udp.c so I moved it. * - * Authors: Alan Cox <iiitac@pyr.swan.ac.uk>. (datagram_select() from old udp.c code) + * Authors: Alan Cox <alan@cymru.net>. (datagram_select() from old udp.c code) * * Fixes: * Alan Cox : NULL return from skb_peek_copy() understood @@ -16,15 +16,16 @@ * Alan Cox : Fixed write select of non IP protocol crash. * Florian La Roche: Changed for my new skbuff handling. * Darryl Miles : Fixed non-blocking SOCK_SEQPACKET. + * Linus Torvalds : BSD semantic fixes. + * Alan Cox : Datagram iovec handling + * Darryl Miles : Fixed non-blocking SOCK_STREAM. + * Alan Cox : POSIXisms * - * Note: - * A lot of this will change when the protocol/socket separation - * occurs. Using this will make things reasonably clean. */ #include <linux/types.h> #include <linux/kernel.h> -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <linux/mm.h> #include <linux/interrupt.h> @@ -43,129 +44,159 @@ /* + * Wait for a packet.. + * + * Interrupts off so that no packet arrives before we begin sleeping. + * Otherwise we might miss our wake up + */ + +static inline void wait_for_packet(struct sock * sk) +{ + unsigned long flags; + + release_sock(sk); + save_flags(flags); + cli(); + if (skb_peek(&sk->receive_queue) == NULL) + interruptible_sleep_on(sk->sleep); + restore_flags(flags); + lock_sock(sk); +} + +/* + * Is a socket 'connection oriented' ? 
+ */ + +static inline int connection_based(struct sock *sk) +{ + return (sk->type==SOCK_SEQPACKET || sk->type==SOCK_STREAM); +} + +/* * Get a datagram skbuff, understands the peeking, nonblocking wakeups and possible * races. This replaces identical code in packet,raw and udp, as well as the IPX * AX.25 and Appletalk. It also finally fixes the long standing peek and read * race for datagram sockets. If you alter this routine remember it must be * re-entrant. + * + * This function will lock the socket if a skb is returned, so the caller + * needs to unlock the socket in that case (usually by calling skb_free_datagram) + * + * The order of the tests when we find no data waiting are specified + * quite explicitly by POSIX 1003.1g, don't change them without having + * the standard around please. */ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, int *err) { + int error; struct sk_buff *skb; - unsigned long intflags; - /* Socket is inuse - so the timer doesn't attack it */ - save_flags(intflags); + lock_sock(sk); restart: - sk->inuse = 1; - while(skb_peek(&sk->receive_queue) == NULL) /* No data */ + while(skb_queue_empty(&sk->receive_queue)) /* No data */ { - /* If we are shutdown then no more data is going to appear. We are done */ + /* Socket errors? */ + error = sock_error(sk); + if (error) + goto no_packet; + + /* Socket shut down? */ if (sk->shutdown & RCV_SHUTDOWN) - { - release_sock(sk); - *err=0; - return NULL; - } - - if(sk->err) - { - release_sock(sk); - *err=-sk->err; - sk->err=0; - return NULL; - } + goto no_packet; /* Sequenced packets can come disconnected. 
If so we report the problem */ - if(sk->type==SOCK_SEQPACKET && sk->state!=TCP_ESTABLISHED) - { - release_sock(sk); - *err=-ENOTCONN; - return NULL; - } + error = -ENOTCONN; + if(connection_based(sk) && sk->state!=TCP_ESTABLISHED) + goto no_packet; + + /* handle signals */ + error = -ERESTARTSYS; + if (current->signal & ~current->blocked) + goto no_packet; /* User doesn't want to wait */ + error = -EAGAIN; if (noblock) - { - release_sock(sk); - *err=-EAGAIN; - return NULL; - } - release_sock(sk); - - /* Interrupts off so that no packet arrives before we begin sleeping. - Otherwise we might miss our wake up */ - cli(); - if (skb_peek(&sk->receive_queue) == NULL) - { - interruptible_sleep_on(sk->sleep); - /* Signals may need a restart of the syscall */ - if (current->signal & ~current->blocked) - { - restore_flags(intflags);; - *err=-ERESTARTSYS; - return(NULL); - } - if(sk->err != 0) /* Error while waiting for packet - eg an icmp sent earlier by the - peer has finally turned up now */ - { - *err = -sk->err; - sk->err=0; - restore_flags(intflags); - return NULL; - } - } - sk->inuse = 1; - restore_flags(intflags); - } - /* Again only user level code calls this function, so nothing interrupt level - will suddenly eat the receive_queue */ - if (!(flags & MSG_PEEK)) - { - skb=skb_dequeue(&sk->receive_queue); - if(skb!=NULL) - skb->users++; - else - goto restart; /* Avoid race if someone beats us to the data */ + goto no_packet; + + wait_for_packet(sk); } - else - { + + /* Again only user level code calls this function, so nothing interrupt level + will suddenly eat the receive_queue */ + if (flags & MSG_PEEK) + { + unsigned long flags; + save_flags(flags); cli(); skb=skb_peek(&sk->receive_queue); if(skb!=NULL) skb->users++; - restore_flags(intflags); - if(skb==NULL) /* shouldn't happen but .. */ - *err=-EAGAIN; - } - return skb; + restore_flags(flags); + if(skb==NULL) /* shouldn't happen but .. 
*/ + goto restart; + return skb; + } + skb = skb_dequeue(&sk->receive_queue); + if (!skb) /* Avoid race if someone beats us to the data */ + goto restart; + skb->users++; + return skb; + +no_packet: + release_sock(sk); + *err = error; + return NULL; } -void skb_free_datagram(struct sk_buff *skb) +void skb_free_datagram(struct sock * sk, struct sk_buff *skb) { unsigned long flags; save_flags(flags); cli(); skb->users--; - if(skb->users>0) - { - restore_flags(flags); - return; + if(skb->users <= 0) { + /* See if it needs destroying */ + /* Been dequeued by someone - ie it's read */ + if(!skb->next && !skb->prev) + kfree_skb(skb,FREE_READ); } - /* See if it needs destroying */ - if(!skb->next && !skb->prev) /* Been dequeued by someone - ie it's read */ - kfree_skb(skb,FREE_READ); restore_flags(flags); + release_sock(sk); +} + +/* + * Copy a datagram to a linear buffer. + */ + +int skb_copy_datagram(struct sk_buff *skb, int offset, char *to, int size) +{ + int err; + err = copy_to_user(to, skb->h.raw+offset, size); + if (err) + { + err = -EFAULT; + } + return err; } -void skb_copy_datagram(struct sk_buff *skb, int offset, char *to, int size) + +/* + * Copy a datagram to an iovec. 
+ */ + +int skb_copy_datagram_iovec(struct sk_buff *skb, int offset, struct iovec *to, + int size) { - /* We will know all about the fraglist options to allow >4K receives - but not this release */ - memcpy_tofs(to,skb->h.raw+offset,size); + int err; + err = memcpy_toiovec(to, skb->h.raw+offset, size); + if (err) + { + err = -EFAULT; + } + return err; } /* @@ -175,42 +206,48 @@ void skb_copy_datagram(struct sk_buff *skb, int offset, char *to, int size) int datagram_select(struct sock *sk, int sel_type, select_table *wait) { - select_wait(sk->sleep, wait); + if (sk->err) + return 1; switch(sel_type) { case SEL_IN: - if (sk->type==SOCK_SEQPACKET && sk->state==TCP_CLOSE) + if (sk->shutdown & RCV_SHUTDOWN) + return 1; + if (connection_based(sk) && sk->state==TCP_CLOSE) { /* Connection closed: Wake up */ - return(1); + return 1; } - if (skb_peek(&sk->receive_queue) != NULL || sk->err != 0) + if (!skb_queue_empty(&sk->receive_queue)) { /* This appears to be consistent with other stacks */ - return(1); + return 1; } - return(0); + break; case SEL_OUT: - if (sk->type==SOCK_SEQPACKET && sk->state==TCP_SYN_SENT) + if (sk->shutdown & SEND_SHUTDOWN) + return 1; + if (connection_based(sk) && sk->state==TCP_SYN_SENT) { /* Connection still in progress */ - return(0); + break; } - if (sk->prot && sk->prot->wspace(sk) >= MIN_WRITE_SPACE) + if (sk->prot && sock_wspace(sk) >= MIN_WRITE_SPACE) { - return(1); + return 1; } if (sk->prot==NULL && sk->sndbuf-sk->wmem_alloc >= MIN_WRITE_SPACE) { - return(1); + return 1; } - return(0); + break; case SEL_EX: - if (sk->err) - return(1); /* Socket has gone into error state (eg icmp error) */ - return(0); + break; } - return(0); + + /* select failed.. */ + select_wait(sk->sleep, wait); + return 0; } diff --git a/net/core/dev.c b/net/core/dev.c index d36b6b00c..f6bc01f9d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -34,18 +34,23 @@ * Alan Cox : Network driver sets packet type before calling netif_rx. Saves * a function call a packet. 
* Alan Cox : Hashed net_bh() - * Richard Kooijman : Timestamp fixes. + * Richard Kooijman: Timestamp fixes. * Alan Cox : Wrong field in SIOCGIFDSTADDR + * Alan Cox : Device lock protection. + * Alan Cox : Fixed nasty side effect of device close changes. + * Rudi Cilibrasi : Pass the right thing to set_mac_address() + * Dave Miller : 32bit quantity for the device lock to make it work out + * on a Sparc. + * Bjorn Ekwall : Added KERNELD hack. + * Alan Cox : Cleaned up the backlog initialise. + * Craig Metz : SIOCGIFCONF fix if space for under + * 1 device. + * Thomas Bogendoerfer : Return ENODEV for dev_open, if there + * is no device open function. * - * Cleaned up and recommented by Alan Cox 2nd April 1994. I hope to have - * the rest as well commented in the end. */ -/* - * A lot of these includes will be going walkies very soon - */ - -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <asm/bitops.h> #include <linux/config.h> @@ -69,7 +74,16 @@ #include <linux/skbuff.h> #include <net/sock.h> #include <net/arp.h> - +#include <net/slhc.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <net/br.h> +#ifdef CONFIG_NET_ALIAS +#include <linux/net_alias.h> +#endif +#ifdef CONFIG_KERNELD +#include <linux/kerneld.h> +#endif /* * The list of packet types we will receive (as opposed to discard) @@ -80,6 +94,12 @@ struct packet_type *ptype_base[16]; struct packet_type *ptype_all = NULL; /* Taps */ /* + * Device list lock + */ + +int dev_lockct=0; + +/* * Our notifier list */ @@ -90,13 +110,7 @@ struct notifier_block *netdev_chain=NULL; * queue in the bottom half handler. */ -static struct sk_buff_head backlog = -{ - (struct sk_buff *)&backlog, (struct sk_buff *)&backlog -#ifdef CONFIG_SKB_CHECK - ,SK_HEAD_SKB -#endif -}; +static struct sk_buff_head backlog; /* * We don't overdo the queue or we will thrash memory badly. 
@@ -104,14 +118,6 @@ static struct sk_buff_head backlog = static int backlog_size = 0; -/* - * Return the lesser of the two values. - */ - -static __inline__ unsigned long min(unsigned long a, unsigned long b) -{ - return (a < b)? a : b; -} /****************************************************************************************** @@ -172,6 +178,7 @@ void dev_remove_pack(struct packet_type *pt) return; } } + printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); } /***************************************************************************************** @@ -184,7 +191,7 @@ void dev_remove_pack(struct packet_type *pt) * Find an interface by name. */ -struct device *dev_get(char *name) +struct device *dev_get(const char *name) { struct device *dev; @@ -193,17 +200,37 @@ struct device *dev_get(char *name) if (strcmp(dev->name, name) == 0) return(dev); } - return(NULL); + return NULL; } + +/* + * Find and possibly load an interface. + */ + +#ifdef CONFIG_KERNELD +extern __inline__ void dev_load(const char *name) +{ + if(!dev_get(name)) { +#ifdef CONFIG_NET_ALIAS + const char *sptr; + + for (sptr=name ; *sptr ; sptr++) if(*sptr==':') break; + if (!(*sptr && *(sptr+1))) +#endif + request_module(name); + } +} +#endif + /* * Prepare an interface for use. */ int dev_open(struct device *dev) { - int ret = 0; + int ret = -ENODEV; /* * Call device private open method @@ -221,12 +248,6 @@ int dev_open(struct device *dev) /* * Initialise multicasting status */ -#ifdef CONFIG_IP_MULTICAST - /* - * Join the all host group - */ - ip_mc_allhost(dev); -#endif dev_mc_upload(dev); notifier_call_chain(&netdev_chain, NETDEV_UP, dev); } @@ -240,45 +261,41 @@ int dev_open(struct device *dev) int dev_close(struct device *dev) { + int ct=0; + + /* + * Call the device specific close. This cannot fail. + * Only if device is UP + */ + + if ((dev->flags & IFF_UP) && dev->stop) + dev->stop(dev); + /* - * Only close a device if it is up. + * Device is now down. 
*/ - if (dev->flags != 0) + dev->flags&=~(IFF_UP|IFF_RUNNING); + + /* + * Tell people we are going down + */ + notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev); + /* + * Flush the multicast chain + */ + dev_mc_discard(dev); + + /* + * Purge any queued packets when we down the link + */ + while(ct<DEV_NUMBUFFS) { - int ct=0; - dev->flags = 0; - /* - * Call the device specific close. This cannot fail. - */ - if (dev->stop) - dev->stop(dev); - /* - * Tell people we are going down - */ - notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev); - /* - * Flush the multicast chain - */ - dev_mc_discard(dev); - /* - * Blank the IP addresses - */ - dev->pa_addr = 0; - dev->pa_dstaddr = 0; - dev->pa_brdaddr = 0; - dev->pa_mask = 0; - /* - * Purge any queued packets when we down the link - */ - while(ct<DEV_NUMBUFFS) - { - struct sk_buff *skb; - while((skb=skb_dequeue(&dev->buffs[ct]))!=NULL) - if(skb->free) - kfree_skb(skb,FREE_WRITE); - ct++; - } + struct sk_buff *skb; + while((skb=skb_dequeue(&dev->buffs[ct]))!=NULL) + if(skb->free) + kfree_skb(skb,FREE_WRITE); + ct++; } return(0); } @@ -299,8 +316,6 @@ int unregister_netdevice_notifier(struct notifier_block *nb) return notifier_chain_unregister(&netdev_chain,nb); } - - /* * Send (or queue for sending) a packet. * @@ -309,18 +324,17 @@ int unregister_netdevice_notifier(struct notifier_block *nb) * rest of the magic. 
*/ -void dev_queue_xmit(struct sk_buff *skb, struct device *dev, int pri) +static void do_dev_queue_xmit(struct sk_buff *skb, struct device *dev, int pri) { unsigned long flags; - int nitcount; - struct packet_type *ptype; - int where = 0; /* used to say if the packet should go */ + struct sk_buff_head *list; + int retransmission = 0; /* used to say if the packet should go */ /* at the front or the back of the */ /* queue - front is a retransmit try */ if(pri>=0 && !skb_device_locked(skb)) skb_device_lock(skb); /* Shove a lock on the frame */ -#ifdef CONFIG_SKB_CHECK +#if CONFIG_SKB_CHECK IS_SKB(skb); #endif skb->dev = dev; @@ -334,13 +348,13 @@ void dev_queue_xmit(struct sk_buff *skb, struct device *dev, int pri) if (pri < 0) { pri = -pri-1; - where = 1; + retransmission = 1; } #ifdef CONFIG_NET_DEBUG if (pri >= DEV_NUMBUFFS) { - printk("bad priority in dev_queue_xmit.\n"); + printk(KERN_WARNING "bad priority in dev_queue_xmit.\n"); pri = 1; } #endif @@ -354,51 +368,84 @@ void dev_queue_xmit(struct sk_buff *skb, struct device *dev, int pri) return; } - save_flags(flags); - cli(); - if (dev_nit && !where) + /* + * + * If dev is an alias, switch to its main device. + * "arp" resolution has been made with alias device, so + * arp entries refer to alias, not main. + * + */ + +#ifdef CONFIG_NET_ALIAS + if (net_alias_is(dev)) + skb->dev = dev = net_alias_main_dev(dev); +#endif + + /* + * If we are bridging and this is directly generated output + * pass the frame via the bridge. 
+ */ + +#ifdef CONFIG_BRIDGE + if(skb->pkt_bridged!=IS_BRIDGED && br_stats.flags & BR_UP) { - skb_queue_tail(dev->buffs + pri,skb); - skb_device_unlock(skb); /* Buffer is on the device queue and can be freed safely */ - skb = skb_dequeue(dev->buffs + pri); - skb_device_lock(skb); /* New buffer needs locking down */ + if(br_tx_frame(skb)) + return; } - restore_flags(flags); +#endif - /* copy outgoing packets to any sniffer packet handlers */ - if(!where) - { - skb->stamp=xtime; - for (ptype = ptype_all; ptype!=NULL; ptype = ptype->next) - { - /* Never send packets back to the socket - * they originated from - MvS (miquels@drinkel.ow.org) - */ - if ((ptype->dev == dev || !ptype->dev) && - ((struct sock *)ptype->data != skb->sk)) + list = dev->buffs + pri; + + save_flags(flags); + /* if this isn't a retransmission, use the first packet instead... */ + if (!retransmission) { + if (skb_queue_len(list)) { + /* avoid overrunning the device queue.. */ + if (skb_queue_len(list) > dev->tx_queue_len) { + dev_kfree_skb(skb, FREE_WRITE); + return; + } + } + + /* copy outgoing packets to any sniffer packet handlers */ + if (dev_nit) { + struct packet_type *ptype; + + get_fast_time(&skb->stamp); + + for (ptype = ptype_all; ptype!=NULL; ptype = ptype->next) { - struct sk_buff *skb2; - if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) - break; - /* - * The protocol knows this has (for other paths) been taken off - * and adds it back. 
+ /* Never send packets back to the socket + * they originated from - MvS (miquels@drinkel.ow.org) */ - skb2->len-=skb->dev->hard_header_len; - ptype->func(skb2, skb->dev, ptype); - nitcount--; + if ((ptype->dev == dev || !ptype->dev) && + ((struct sock *)ptype->data != skb->sk)) + { + struct sk_buff *skb2; + if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) + break; + skb2->h.raw = skb2->data + dev->hard_header_len; + skb2->mac.raw = skb2->data; + ptype->func(skb2, skb->dev, ptype); + } } } + + if (skb_queue_len(list)) { + cli(); + skb_device_unlock(skb); /* Buffer is on the device queue and can be freed safely */ + __skb_queue_tail(list, skb); + skb = __skb_dequeue(list); + skb_device_lock(skb); /* New buffer needs locking down */ + restore_flags(flags); + } } - start_bh_atomic(); if (dev->hard_start_xmit(skb, dev) == 0) { /* * Packet is now solely the responsibility of the driver */ - end_bh_atomic(); return; } - end_bh_atomic(); /* * Transmission failed, put skb back into a list. Once on the list it's safe and @@ -406,10 +453,17 @@ void dev_queue_xmit(struct sk_buff *skb, struct device *dev, int pri) */ cli(); skb_device_unlock(skb); - skb_queue_head(dev->buffs + pri,skb); + __skb_queue_head(list,skb); restore_flags(flags); } +void dev_queue_xmit(struct sk_buff *skb, struct device *dev, int pri) +{ + start_bh_atomic(); + do_dev_queue_xmit(skb, dev, pri); + end_bh_atomic(); +} + /* * Receive a packet from a device driver and queue it for the upper * (protocol) levels. It always succeeds. This is the recommended @@ -425,10 +479,11 @@ void netif_rx(struct sk_buff *skb) * when freed. These will be updated later as the frames get * owners. */ + skb->sk = NULL; skb->free = 1; if(skb->stamp.tv_sec==0) - skb->stamp = xtime; + get_fast_time(&skb->stamp); /* * Check that we aren't overdoing things. @@ -448,7 +503,7 @@ void netif_rx(struct sk_buff *skb) /* * Add it to the "backlog" queue. 
*/ -#ifdef CONFIG_SKB_CHECK +#if CONFIG_SKB_CHECK IS_SKB(skb); #endif skb_queue_tail(&backlog,skb); @@ -459,111 +514,19 @@ void netif_rx(struct sk_buff *skb) * hardware interrupt returns. */ -#ifdef CONFIG_NET_RUNONIRQ /* Dont enable yet, needs some driver mods */ - inet_bh(); -#else mark_bh(NET_BH); -#endif return; } - -/* - * The old interface to fetch a packet from a device driver. - * This function is the base level entry point for all drivers that - * want to send a packet to the upper (protocol) levels. It takes - * care of de-multiplexing the packet to the various modules based - * on their protocol ID. - * - * Return values: 1 <- exit I can't do any more - * 0 <- feed me more (i.e. "done", "OK"). - * - * This function is OBSOLETE and should not be used by any new - * device. - */ - -int dev_rint(unsigned char *buff, long len, int flags, struct device *dev) -{ - static int dropping = 0; - struct sk_buff *skb = NULL; - unsigned char *to; - int amount, left; - int len2; - - if (dev == NULL || buff == NULL || len <= 0) - return(1); - - if (flags & IN_SKBUFF) - { - skb = (struct sk_buff *) buff; - } - else - { - if (dropping) - { - if (skb_peek(&backlog) != NULL) - return(1); - printk("INET: dev_rint: no longer dropping packets.\n"); - dropping = 0; - } - - skb = alloc_skb(len, GFP_ATOMIC); - if (skb == NULL) - { - printk("dev_rint: packet dropped on %s (no memory) !\n", - dev->name); - dropping = 1; - return(1); - } - - /* - * First we copy the packet into a buffer, and save it for later. 
We - * in effect handle the incoming data as if it were from a circular buffer - */ - - to = skb->data; - left = len; - - len2 = len; - while (len2 > 0) - { - amount = min(len2, (unsigned long) dev->rmem_end - - (unsigned long) buff); - memcpy(to, buff, amount); - len2 -= amount; - left -= amount; - buff += amount; - to += amount; - if ((unsigned long) buff == dev->rmem_end) - buff = (unsigned char *) dev->rmem_start; - } - } - - /* - * Tag the frame and kick it to the proper receive routine - */ - - skb->len = len; - skb->dev = dev; - skb->free = 1; - - netif_rx(skb); - /* - * OK, all done. - */ - return(0); -} - - /* * This routine causes all interfaces to try to send some data. */ -void dev_transmit(void) +static void dev_transmit(void) { struct device *dev; - for (dev = dev_base; dev != NULL; dev = dev->next) + for (dev = dev_base; dev != NULL; dev = dev->next) { if (dev->flags != 0 && !dev->tbusy) { /* @@ -582,44 +545,24 @@ void dev_transmit(void) ***********************************************************************************/ /* - * This is a single non-reentrant routine which takes the received packet - * queue and throws it at the networking layers in the hope that something - * useful will emerge. - */ - -volatile int in_bh = 0; /* Non-reentrant remember */ - -int in_net_bh() /* Used by timer.c */ -{ - return(in_bh==0?0:1); -} - -/* * When we are called the queue is ready to grab, the interrupts are - * on and hardware can interrupt and queue to the receive queue a we + * on and hardware can interrupt and queue to the receive queue as we * run with no problems. * This is run as a bottom half after an interrupt handler that does * mark_bh(NET_BH); */ -void net_bh(void *tmp) +void net_bh(void) { - struct sk_buff *skb; struct packet_type *ptype; struct packet_type *pt_prev; unsigned short type; /* - * Atomically check and mark our BUSY state. - */ - - if (set_bit(1, (void*)&in_bh)) - return; - - /* * Can we send anything now? 
We want to clear the * decks for any more sends that get done as we - * process the input. + * process the input. This also minimises the + * latency on a transmit interrupt bh. */ dev_transmit(); @@ -630,34 +573,70 @@ void net_bh(void *tmp) * that from the device which does a mark_bh() just after */ - cli(); - /* - * While the queue is not empty + * While the queue is not empty.. + * + * Note that the queue never shrinks due to + * an interrupt, so we can do this test without + * disabling interrupts. */ - - while((skb=skb_dequeue(&backlog))!=NULL) - { + + while (!skb_queue_empty(&backlog)) { + struct sk_buff * skb = backlog.next; + /* * We have a packet. Therefore the queue has shrunk */ + cli(); + __skb_unlink(skb, &backlog); backlog_size--; - sti(); - /* - * Bump the pointer to the next structure. - * This assumes that the basic 'skb' pointer points to - * the MAC header, if any (as indicated by its "length" - * field). Take care now! - */ - - skb->h.raw = skb->data + skb->dev->hard_header_len; - skb->len -= skb->dev->hard_header_len; - - /* - * Fetch the packet protocol ID. - */ + +#ifdef CONFIG_BRIDGE + + /* + * If we are bridging then pass the frame up to the + * bridging code (if this protocol is to be bridged). + * If it is bridged then move on + */ + + if (br_stats.flags & BR_UP && br_protocol_ok(ntohs(skb->protocol))) + { + /* + * We pass the bridge a complete frame. This means + * recovering the MAC header first. + */ + + int offset=skb->data-skb->mac.raw; + cli(); + skb_push(skb,offset); /* Put header back on for bridge */ + if(br_receive_frame(skb)) + { + sti(); + continue; + } + /* + * Pull the MAC header off for the copy going to + * the upper layers. + */ + skb_pull(skb,offset); + sti(); + } +#endif + + /* + * Bump the pointer to the next structure. + * + * On entry to the protocol layer. skb->data and + * skb->h.raw point to the MAC and encapsulated data + */ + + skb->h.raw = skb->data; + + /* + * Fetch the packet protocol ID. 
+ */ type = skb->protocol; @@ -666,6 +645,7 @@ void net_bh(void *tmp) * list. There are two lists. The ptype_all list of taps (normally empty) * and the main protocol list which is hashed perfectly for normal protocols. */ + pt_prev = NULL; for (ptype = ptype_all; ptype!=NULL; ptype=ptype->next) { @@ -680,7 +660,7 @@ void net_bh(void *tmp) for (ptype = ptype_base[ntohs(type)&15]; ptype != NULL; ptype = ptype->next) { - if ((ptype->type == type || ptype->type == htons(ETH_P_ALL)) && (!ptype->dev || ptype->dev==skb->dev)) + if (ptype->type == type && (!ptype->dev || ptype->dev==skb->dev)) { /* * We already have a match queued. Deliver @@ -717,30 +697,27 @@ void net_bh(void *tmp) else kfree_skb(skb, FREE_WRITE); - /* * Again, see if we can transmit anything now. * [Ought to take this out judging by tests it slows * us down not speeds us up] */ -#ifdef CONFIG_XMIT_EVERY +#ifdef XMIT_EVERY dev_transmit(); #endif - cli(); } /* End of queue loop */ /* * We have emptied the queue */ - - in_bh = 0; - sti(); /* * One last output flush. 
*/ - + +#ifdef XMIT_AFTER dev_transmit(); +#endif } @@ -752,24 +729,31 @@ void net_bh(void *tmp) void dev_tint(struct device *dev) { int i; - struct sk_buff *skb; unsigned long flags; + struct sk_buff_head * head; - save_flags(flags); /* - * Work the queues in priority order + * aliases do not transmit (for now :) ) */ - - for(i = 0;i < DEV_NUMBUFFS; i++) + +#ifdef CONFIG_NET_ALIAS + if (net_alias_is(dev)) return; +#endif + head = dev->buffs; + save_flags(flags); + cli(); + + /* + * Work the queues in priority order + */ + for(i = 0;i < DEV_NUMBUFFS; i++,head++) { - /* - * Pull packets from the queue - */ - - cli(); - while((skb=skb_dequeue(&dev->buffs[i]))!=NULL) - { + while (!skb_queue_empty(head)) { + struct sk_buff *skb; + + skb = head->next; + __skb_unlink(skb, head); /* * Stop anyone freeing the buffer while we retransmit it */ @@ -779,7 +763,7 @@ void dev_tint(struct device *dev) * Feed them to the output stage and if it fails * indicate they re-queue at the front. */ - dev_queue_xmit(skb,dev,-i - 1); + do_dev_queue_xmit(skb,dev,-i - 1); /* * If we can take no more then stop here. */ @@ -810,11 +794,10 @@ static int dev_ifconf(char *arg) /* * Fetch the caller's info block. */ - - err=verify_area(VERIFY_WRITE, arg, sizeof(struct ifconf)); - if(err) - return err; - memcpy_fromfs(&ifc, arg, sizeof(struct ifconf)); + + err = copy_from_user(&ifc, arg, sizeof(struct ifconf)); + if (err) + return -EFAULT; len = ifc.ifc_len; pos = ifc.ifc_buf; @@ -822,38 +805,37 @@ static int dev_ifconf(char *arg) * We now walk the device list filling each active device * into the array. */ - - err=verify_area(VERIFY_WRITE,pos,len); - if(err) - return err; - + /* * Loop over the interfaces, and write an info block for each. */ for (dev = dev_base; dev != NULL; dev = dev->next) { - if(!(dev->flags & IFF_UP)) /* Downed devices don't count */ - continue; + if(!(dev->flags & IFF_UP)) /* Downed devices don't count */ + continue; + /* + * Have we run out of space here ? 
+ */ + + if (len < sizeof(struct ifreq)) + break; + memset(&ifr, 0, sizeof(struct ifreq)); strcpy(ifr.ifr_name, dev->name); (*(struct sockaddr_in *) &ifr.ifr_addr).sin_family = dev->family; (*(struct sockaddr_in *) &ifr.ifr_addr).sin_addr.s_addr = dev->pa_addr; + /* * Write this block to the caller's space. */ - memcpy_tofs(pos, &ifr, sizeof(struct ifreq)); + err = copy_to_user(pos, &ifr, sizeof(struct ifreq)); + if (err) + return -EFAULT; pos += sizeof(struct ifreq); - len -= sizeof(struct ifreq); - - /* - * Have we run out of space here ? - */ - - if (len < sizeof(struct ifreq)) - break; + len -= sizeof(struct ifreq); } /* @@ -862,8 +844,10 @@ static int dev_ifconf(char *arg) ifc.ifc_len = (pos - ifc.ifc_buf); ifc.ifc_req = (struct ifreq *) ifc.ifc_buf; - memcpy_tofs(arg, &ifc, sizeof(struct ifconf)); - + err = copy_to_user(arg, &ifc, sizeof(struct ifconf)); + if (err) + return -EFAULT; + /* * Report how much was filled in */ @@ -877,6 +861,7 @@ static int dev_ifconf(char *arg) * in detail. */ +#ifdef CONFIG_PROC_FS static int sprintf_stats(char *buffer, struct device *dev) { struct enet_statistics *stats = (dev->get_stats ? 
dev->get_stats(dev): NULL); @@ -905,7 +890,7 @@ static int sprintf_stats(char *buffer, struct device *dev) * to create /proc/net/dev */ -int dev_get_info(char *buffer, char **start, off_t offset, int length) +int dev_get_info(char *buffer, char **start, off_t offset, int length, int dummy) { int len=0; off_t begin=0; @@ -943,6 +928,7 @@ int dev_get_info(char *buffer, char **start, off_t offset, int length) len=length; /* Ending slop */ return len; } +#endif /* CONFIG_PROC_FS */ /* @@ -970,25 +956,38 @@ static int dev_ifsioc(void *arg, unsigned int getset) { struct ifreq ifr; struct device *dev; - int ret; + int ret, err; /* * Fetch the caller's info block into kernel space */ - - int err=verify_area(VERIFY_WRITE, arg, sizeof(struct ifreq)); - if(err) - return err; - memcpy_fromfs(&ifr, arg, sizeof(struct ifreq)); + err = copy_from_user(&ifr, arg, sizeof(struct ifreq)); + if (err) + return -EFAULT; /* * See which interface the caller is talking about. */ - if ((dev = dev_get(ifr.ifr_name)) == NULL) - return(-ENODEV); + /* + * + * net_alias_dev_get(): dev_get() with added alias naming magic. + * only allow alias creation/deletion if (getset==SIOCSIFADDR) + * + */ + +#ifdef CONFIG_KERNELD + dev_load(ifr.ifr_name); +#endif +#ifdef CONFIG_NET_ALIAS + if ((dev = net_alias_dev_get(ifr.ifr_name, getset == SIOCSIFADDR, &err, NULL, NULL)) == NULL) + return(err); +#else + if ((dev = dev_get(ifr.ifr_name)) == NULL) + return(-ENODEV); +#endif switch(getset) { case SIOCGIFFLAGS: /* Get interface flags */ @@ -998,11 +997,23 @@ static int dev_ifsioc(void *arg, unsigned int getset) case SIOCSIFFLAGS: /* Set interface flags */ { int old_flags = dev->flags; - dev->flags = ifr.ifr_flags & ( - IFF_UP | IFF_BROADCAST | IFF_DEBUG | IFF_LOOPBACK | + + /* + * We are not allowed to potentially close/unload + * a device until we get this lock. + */ + + dev_lock_wait(); + + /* + * Set the flags on our device. 
+ */ + + dev->flags = (ifr.ifr_flags & ( + IFF_BROADCAST | IFF_DEBUG | IFF_LOOPBACK | IFF_POINTOPOINT | IFF_NOTRAILERS | IFF_RUNNING | IFF_NOARP | IFF_PROMISC | IFF_ALLMULTI | IFF_SLAVE | IFF_MASTER - | IFF_MULTICAST); + | IFF_MULTICAST)) | (dev->flags & IFF_UP); /* * Load in the correct multicast list now the flags have changed. */ @@ -1010,51 +1021,101 @@ static int dev_ifsioc(void *arg, unsigned int getset) dev_mc_upload(dev); /* - * Have we downed the interface + * Have we downed the interface. We handle IFF_UP ourselves + * according to user attempts to set it, rather than blindly + * setting it. */ - - if ((old_flags & IFF_UP) && ((dev->flags & IFF_UP) == 0)) - { - ret = dev_close(dev); - } - else - { - /* - * Have we upped the interface - */ - - ret = (! (old_flags & IFF_UP) && (dev->flags & IFF_UP)) - ? dev_open(dev) : 0; - /* - * Check the flags. - */ - if(ret<0) - dev->flags&=~IFF_UP; /* Didn't open so down the if */ + + if ((old_flags^ifr.ifr_flags)&IFF_UP) /* Bit is different ? */ + { + if(old_flags&IFF_UP) /* Gone down */ + ret=dev_close(dev); + else /* Come up */ + { + ret=dev_open(dev); + if(ret<0) + dev->flags&=~IFF_UP; /* Open failed */ + } } - } + else + ret=0; + /* + * Load in the correct multicast list now the flags have changed. 
+ */ + + dev_mc_upload(dev); + } break; case SIOCGIFADDR: /* Get interface address (and family) */ - (*(struct sockaddr_in *) - &ifr.ifr_addr).sin_addr.s_addr = dev->pa_addr; - (*(struct sockaddr_in *) - &ifr.ifr_addr).sin_family = dev->family; - (*(struct sockaddr_in *) - &ifr.ifr_addr).sin_port = 0; + if(ifr.ifr_addr.sa_family==AF_UNSPEC) + { + memcpy(ifr.ifr_hwaddr.sa_data,dev->dev_addr, MAX_ADDR_LEN); + ifr.ifr_hwaddr.sa_family=dev->type; + goto rarok; + } + else + { + (*(struct sockaddr_in *) + &ifr.ifr_addr).sin_addr.s_addr = dev->pa_addr; + (*(struct sockaddr_in *) + &ifr.ifr_addr).sin_family = dev->family; + (*(struct sockaddr_in *) + &ifr.ifr_addr).sin_port = 0; + } goto rarok; case SIOCSIFADDR: /* Set interface address (and family) */ - dev->pa_addr = (*(struct sockaddr_in *) - &ifr.ifr_addr).sin_addr.s_addr; - dev->family = ifr.ifr_addr.sa_family; + + /* + * BSDism. SIOCSIFADDR family=AF_UNSPEC sets the + * physical address. We can cope with this now. + */ + + if(ifr.ifr_addr.sa_family==AF_UNSPEC) + { + if(dev->set_mac_address==NULL) + return -EOPNOTSUPP; + ret=dev->set_mac_address(dev,&ifr.ifr_addr); + } + else + { + u32 new_pa_addr = (*(struct sockaddr_in *) + &ifr.ifr_addr).sin_addr.s_addr; + u16 new_family = ifr.ifr_addr.sa_family; + + if (new_family == dev->family && + new_pa_addr == dev->pa_addr) { + ret =0; + break; + } + if (dev->flags & IFF_UP) + notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev); + + /* + * if dev is an alias, must rehash to update + * address change + */ + +#ifdef CONFIG_NET_ALIAS + if (net_alias_is(dev)) + net_alias_dev_rehash(dev ,&ifr.ifr_addr); +#endif + dev->pa_addr = new_pa_addr; + dev->family = new_family; #ifdef CONFIG_INET - /* This is naughty. When net-032e comes out It wants moving into the net032 - code not the kernel. Till then it can sit here (SIGH) */ - dev->pa_mask = ip_get_mask(dev->pa_addr); + /* This is naughty. When net-032e comes out It wants moving into the net032 + code not the kernel. 
Till then it can sit here (SIGH) */ + if (!dev->pa_mask) + dev->pa_mask = ip_get_mask(dev->pa_addr); #endif - dev->pa_brdaddr = dev->pa_addr | ~dev->pa_mask; - ret = 0; + if (!dev->pa_brdaddr) + dev->pa_brdaddr = dev->pa_addr | ~dev->pa_mask; + if (dev->flags & IFF_UP) + notifier_call_chain(&netdev_chain, NETDEV_UP, dev); + ret = 0; + } break; case SIOCGIFBRDADDR: /* Get the broadcast address */ @@ -1065,9 +1126,6 @@ static int dev_ifsioc(void *arg, unsigned int getset) (*(struct sockaddr_in *) &ifr.ifr_broadaddr).sin_port = 0; goto rarok; - memcpy_tofs(arg, &ifr, sizeof(struct ifreq)); - ret = 0; - break; case SIOCSIFBRDADDR: /* Set the broadcast address */ dev->pa_brdaddr = (*(struct sockaddr_in *) @@ -1082,9 +1140,7 @@ static int dev_ifsioc(void *arg, unsigned int getset) &ifr.ifr_dstaddr).sin_family = dev->family; (*(struct sockaddr_in *) &ifr.ifr_dstaddr).sin_port = 0; - memcpy_tofs(arg, &ifr, sizeof(struct ifreq)); - ret = 0; - break; + goto rarok; case SIOCSIFDSTADDR: /* Set the destination address (for point-to-point links) */ dev->pa_dstaddr = (*(struct sockaddr_in *) @@ -1138,8 +1194,14 @@ static int dev_ifsioc(void *arg, unsigned int getset) if(ifr.ifr_mtu<68) return -EINVAL; - dev->mtu = ifr.ifr_mtu; - ret = 0; + + if (dev->change_mtu) + ret = dev->change_mtu(dev, ifr.ifr_mtu); + else + { + dev->mtu = ifr.ifr_mtu; + ret = 0; + } break; case SIOCGIFMEM: /* Get the per device memory space. We can add this but currently @@ -1151,10 +1213,6 @@ static int dev_ifsioc(void *arg, unsigned int getset) ret = -EINVAL; break; - case OLD_SIOCGIFHWADDR: /* Get the hardware address. 
This will change and SIFHWADDR will be added */ - memcpy(ifr.old_ifr_hwaddr,dev->dev_addr, MAX_ADDR_LEN); - goto rarok; - case SIOCGIFHWADDR: memcpy(ifr.ifr_hwaddr.sa_data,dev->dev_addr, MAX_ADDR_LEN); ifr.ifr_hwaddr.sa_family=dev->type; @@ -1165,7 +1223,7 @@ static int dev_ifsioc(void *arg, unsigned int getset) return -EOPNOTSUPP; if(ifr.ifr_hwaddr.sa_family!=dev->type) return -EINVAL; - ret=dev->set_mac_address(dev,ifr.ifr_hwaddr.sa_data); + ret=dev->set_mac_address(dev,&ifr.ifr_hwaddr); break; case SIOCGIFMAP: @@ -1175,9 +1233,7 @@ static int dev_ifsioc(void *arg, unsigned int getset) ifr.ifr_map.irq=dev->irq; ifr.ifr_map.dma=dev->dma; ifr.ifr_map.port=dev->if_port; - memcpy_tofs(arg,&ifr,sizeof(struct ifreq)); - ret=0; - break; + goto rarok; case SIOCSIFMAP: if(dev->set_config==NULL) @@ -1208,8 +1264,13 @@ static int dev_ifsioc(void *arg, unsigned int getset) (getset <= (SIOCDEVPRIVATE + 15))) { if(dev->do_ioctl==NULL) return -EOPNOTSUPP; - ret=dev->do_ioctl(dev, &ifr, getset); - memcpy_tofs(arg,&ifr,sizeof(struct ifreq)); + ret = dev->do_ioctl(dev, &ifr, getset); + if (!ret) + { + err = copy_to_user(arg,&ifr,sizeof(struct ifreq)); + if (err) + ret = -EFAULT; + } break; } @@ -1220,8 +1281,10 @@ static int dev_ifsioc(void *arg, unsigned int getset) * The load of calls that return an ifreq and ok (saves memory). */ rarok: - memcpy_tofs(arg, &ifr, sizeof(struct ifreq)); - return 0; + err = copy_to_user(arg, &ifr, sizeof(struct ifreq)); + if (err) + err = -EFAULT; + return err; } @@ -1252,7 +1315,6 @@ int dev_ioctl(unsigned int cmd, void *arg) case SIOCGIFMEM: case SIOCGIFHWADDR: case SIOCSIFHWADDR: - case OLD_SIOCGIFHWADDR: case SIOCGIFSLAVE: case SIOCGIFMAP: return dev_ifsioc(arg, cmd); @@ -1300,10 +1362,85 @@ int dev_ioctl(unsigned int cmd, void *arg) * present) and leaves us with a valid list of present and active devices. 
* */ - -void dev_init(void) +extern int lance_init(void); +extern int ni65_init(void); +extern int pi_init(void); +extern int bpq_init(void); +extern int scc_init(void); +extern void sdla_setup(void); +extern void dlci_setup(void); +extern int pt_init(void); +extern int sm_init(void); +extern int baycom_init(void); + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry proc_net_dev = { + PROC_NET_DEV, 3, "dev", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + dev_get_info +}; +#endif + +int net_dev_init(void) { - struct device *dev, *dev2; + struct device *dev, **dp; + + /* + * Initialise the packet receive queue. + */ + + skb_queue_head_init(&backlog); + + /* + * The bridge has to be up before the devices + */ + +#ifdef CONFIG_BRIDGE + br_init(); +#endif + + /* + * This is Very Ugly(tm). + * + * Some devices want to be initialized early.. + */ +#if defined(CONFIG_LANCE) + lance_init(); +#endif +#if defined(CONFIG_PI) + pi_init(); +#endif +#if defined(CONFIG_SCC) + scc_init(); +#endif +#if defined(CONFIG_PT) + pt_init(); +#endif +#if defined(CONFIG_BPQETHER) + bpq_init(); +#endif +#if defined(CONFIG_DLCI) + dlci_setup(); +#endif +#if defined(CONFIG_SDLA) + sdla_setup(); +#endif +#if defined(CONFIG_BAYCOM) + baycom_init(); +#endif +#if defined(CONFIG_SOUNDMODEM) + sm_init(); +#endif + /* + * SLHC if present needs attaching so other people see it + * even if not opened. + */ +#if (defined(CONFIG_SLIP) && defined(CONFIG_SLIP_COMPRESSED)) \ + || defined(CONFIG_PPP) \ + || (defined(CONFIG_ISDN) && defined(CONFIG_ISDN_PPP)) + slhc_install(); +#endif /* * Add the devices. @@ -1311,25 +1448,44 @@ void dev_init(void) * from the chain disconnecting the device until the * next reboot. */ - - dev2 = NULL; - for (dev = dev_base; dev != NULL; dev=dev->next) + + dp = &dev_base; + while ((dev = *dp) != NULL) { + int i; + for (i = 0; i < DEV_NUMBUFFS; i++) { + skb_queue_head_init(dev->buffs + i); + } + if (dev->init && dev->init(dev)) { /* * It failed to come up. 
Unhook it. */ - - if (dev2 == NULL) - dev_base = dev->next; - else - dev2->next = dev->next; + *dp = dev->next; } else { - dev2 = dev; + dp = &dev->next; } } -} +#ifdef CONFIG_PROC_FS + proc_net_register(&proc_net_dev); +#endif + + /* + * Initialise net_alias engine + * + * - register net_alias device notifier + * - register proc entries: /proc/net/alias_types + * /proc/net/aliases + */ + +#ifdef CONFIG_NET_ALIAS + net_alias_init(); +#endif + + init_bh(NET_BH, net_bh); + return 0; +} diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c index 7195d5a52..183d3fc3b 100644 --- a/net/core/dev_mcast.c +++ b/net/core/dev_mcast.c @@ -11,6 +11,8 @@ * Fixes: * Alan Cox : Update the device on a real delete * rather than any time but... + * Alan Cox : IFF_ALLMULTI support. + * Alan Cox : New format set_multicast_list() calls. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -18,7 +20,7 @@ * 2 of the License, or (at your option) any later version. 
*/ -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <asm/bitops.h> #include <linux/types.h> @@ -59,9 +61,6 @@ void dev_mc_upload(struct device *dev) { - struct dev_mc_list *dmi; - char *data, *tmp; - /* Don't do anything till we up the interface [dev_open will call this function so the list will stay sane] */ @@ -69,36 +68,14 @@ void dev_mc_upload(struct device *dev) if(!(dev->flags&IFF_UP)) return; - - /* Devices with no set multicast don't get set */ + /* + * Devices with no set multicast don't get set + */ + if(dev->set_multicast_list==NULL) return; - /* Promiscuous is promiscuous - so no filter needed */ - if(dev->flags&IFF_PROMISC) - { - dev->set_multicast_list(dev, -1, NULL); - return; - } - - if(dev->mc_count==0) - { - dev->set_multicast_list(dev,0,NULL); - return; - } - - data=kmalloc(dev->mc_count*dev->addr_len, GFP_KERNEL); - if(data==NULL) - { - printk("Unable to get memory to set multicast list on %s\n",dev->name); - return; - } - for(tmp = data, dmi=dev->mc_list;dmi!=NULL;dmi=dmi->next) - { - memcpy(tmp,dmi->dmi_addr, dmi->dmi_addrlen); - tmp+=dev->addr_len; - } - dev->set_multicast_list(dev,dev->mc_count,data); - kfree(data); + + dev->set_multicast_list(dev); } /* @@ -166,4 +143,3 @@ void dev_mc_discard(struct device *dev) } dev->mc_count=0; } - diff --git a/net/core/firewall.c b/net/core/firewall.c new file mode 100644 index 000000000..a57f67eaf --- /dev/null +++ b/net/core/firewall.c @@ -0,0 +1,165 @@ +/* + * Generic loadable firewalls. At the moment only IP will actually + * use these, but people can add the others as they are needed. 
+ * + * Authors: Dave Bonn (for IP) + * much hacked by: Alan Cox + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/firewall.h> + +static int firewall_lock=0; +static int firewall_policy[NPROTO]; +static struct firewall_ops *firewall_chain[NPROTO]; + +/* + * Register a firewall + */ + +int register_firewall(int pf, struct firewall_ops *fw) +{ + struct firewall_ops **p; + + if(pf<0||pf>=NPROTO) + return -EINVAL; + + /* + * Don't allow two people to adjust at once. + */ + + while(firewall_lock) + schedule(); + firewall_lock=1; + + p=&firewall_chain[pf]; + + while(*p) + { + if(fw->fw_priority > (*p)->fw_priority) + break; + p=&((*p)->next); + } + + + /* + * We need to use a memory barrier to make sure that this + * works correctly even in SMP with weakly ordered writes. + * + * This is atomic wrt interrupts (and generally walking the + * chain), but not wrt itself (so you can't call this from + * an interrupt. Not that you'd want to). + */ + fw->next=*p; + mb(); + *p = fw; + + /* + * And release the sleep lock + */ + + firewall_lock=0; + return 0; +} + +/* + * Unregister a firewall + */ + +int unregister_firewall(int pf, struct firewall_ops *fw) +{ + struct firewall_ops **nl; + + if(pf<0||pf>=NPROTO) + return -EINVAL; + + /* + * Don't allow two people to adjust at once. 
+ */ + + while(firewall_lock) + schedule(); + firewall_lock=1; + + nl=&firewall_chain[pf]; + + while(*nl!=NULL) + { + if(*nl==fw) + { + struct firewall_ops *f=fw->next; + *nl = f; + firewall_lock=0; + return 0; + } + nl=&((*nl)->next); + } + firewall_lock=0; + return -ENOENT; +} + +int call_fw_firewall(int pf, struct device *dev, void *phdr, void *arg) +{ + struct firewall_ops *fw=firewall_chain[pf]; + + while(fw!=NULL) + { + int rc=fw->fw_forward(fw,pf,dev,phdr,arg); + if(rc!=FW_SKIP) + return rc; + fw=fw->next; + } + return firewall_policy[pf]; +} + +/* + * Actual invocation of the chains + */ + +int call_in_firewall(int pf, struct device *dev, void *phdr, void *arg) +{ + struct firewall_ops *fw=firewall_chain[pf]; + + while(fw!=NULL) + { + int rc=fw->fw_input(fw,pf,dev,phdr,arg); + if(rc!=FW_SKIP) + return rc; + fw=fw->next; + } + return firewall_policy[pf]; +} + +int call_out_firewall(int pf, struct device *dev, void *phdr, void *arg) +{ + struct firewall_ops *fw=firewall_chain[pf]; + + while(fw!=NULL) + { + int rc=fw->fw_output(fw,pf,dev,phdr,arg); + if(rc!=FW_SKIP) + return rc; + fw=fw->next; + } + /* alan, is this right? */ + return firewall_policy[pf]; +} + +static struct symbol_table firewall_syms = { +#include <linux/symtab_begin.h> + X(register_firewall), + X(unregister_firewall), + X(call_in_firewall), + X(call_out_firewall), + X(call_fw_firewall), +#include <linux/symtab_end.h> +}; + +void fwchain_init(void) +{ + int i; + for(i=0;i<NPROTO;i++) + firewall_policy[i]=FW_ACCEPT; + register_symtab(&firewall_syms); +} diff --git a/net/core/iovec.c b/net/core/iovec.c new file mode 100644 index 000000000..6db6ac3e9 --- /dev/null +++ b/net/core/iovec.c @@ -0,0 +1,278 @@ +/* + * iovec manipulation routines. 
+ * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Andrew Lunn : Errors in iovec copying. + * Pedro Roque : Added memcpy_fromiovecend and + * csum_..._fromiovecend. + * Andi Kleen : fixed error handling for 2.1 + */ + + +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <asm/uaccess.h> +#include <asm/byteorder.h> +#include <asm/checksum.h> + +extern inline int min(int x, int y) +{ + return x>y?y:x; +} + + +/* + * Verify iovec + * verify area does a simple check for completly bogus addresses + */ + +int verify_iovec(struct msghdr *m, struct iovec *iov, char *address, int mode) +{ + int err=0; + int len=0; + int ct; + + if(m->msg_name!=NULL) + { + if(mode==VERIFY_READ) + { + err=move_addr_to_kernel(m->msg_name, m->msg_namelen, address); + } + else + { + err=verify_area(mode, m->msg_name, m->msg_namelen); + } + + if(err<0) + return err; + m->msg_name = address; + } + + if(m->msg_control!=NULL) + { + err=verify_area(mode, m->msg_control, m->msg_controllen); + if(err) + return err; + } + + for(ct=0;ct<m->msg_iovlen;ct++) + { + err = copy_from_user(&iov[ct], &m->msg_iov[ct], + sizeof(struct iovec)); + if (err) + return err; + + err = verify_area(mode, iov[ct].iov_base, iov[ct].iov_len); + if(err) + return err; + len+=iov[ct].iov_len; + } + m->msg_iov=&iov[0]; + return len; +} + +/* + * Copy kernel to iovec. 
+ */ + +int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) +{ + int err; + while(len>0) + { + if(iov->iov_len) + { + int copy = min(iov->iov_len,len); + err = copy_to_user(iov->iov_base,kdata,copy); + if (err) + return err; + kdata+=copy; + len-=copy; + iov->iov_len-=copy; + iov->iov_base+=copy; + } + iov++; + } + return 0; +} + +/* + * Copy iovec to kernel. + */ + +int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) +{ + int err; + while(len>0) + { + if(iov->iov_len) + { + int copy=min(len,iov->iov_len); + err = copy_from_user(kdata, iov->iov_base, copy); + if (err) + { + return err; + } + len-=copy; + kdata+=copy; + iov->iov_base+=copy; + iov->iov_len-=copy; + } + iov++; + } + return 0; +} + + +/* + * For use with ip_build_xmit + */ + +int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, + int len) +{ + int err; + while(offset>0) + { + if (offset > iov->iov_len) + { + offset -= iov->iov_len; + + } + else + { + u8 *base; + int copy; + + base = iov->iov_base + offset; + copy = min(len, iov->iov_len - offset); + offset = 0; + + err = copy_from_user(kdata, base, copy); + if (err) + { + return err; + } + len-=copy; + kdata+=copy; + } + iov++; + } + + while (len>0) + { + int copy=min(len, iov->iov_len); + err = copy_from_user(kdata, iov->iov_base, copy); + if (err) + { + return err; + } + len-=copy; + kdata+=copy; + iov++; + } + return 0; +} + +/* + * And now for the all-in-one: copy and checksum from a user iovec + * directly to a datagram + * Calls to csum_partial but the last must be in 32 bit chunks + * + * ip_build_xmit must ensure that when fragmenting only the last + * call to this function will be unaligned also. + * + * FIXME: add an error handling path when a copy/checksum from + * user space failed because of a invalid pointer. 
+ */ + +unsigned int csum_partial_copy_fromiovecend(unsigned char *kdata, + struct iovec *iov, int offset, + int len, int csum) +{ + __u32 partial; + __u32 partial_cnt = 0; + + while(offset>0) + { + if (offset > iov->iov_len) + { + offset -= iov->iov_len; + + } + else + { + u8 *base; + int copy; + + base = iov->iov_base + offset; + copy = min(len, iov->iov_len - offset); + offset = 0; + + partial_cnt = copy % 4; + if (partial_cnt) + { + copy -= partial_cnt; + copy_from_user(&partial, base + copy, + partial_cnt); + } + + /* + * FIXME: add exception handling to the + * csum functions and set *err when an + * exception occurs. + */ + csum = csum_partial_copy_fromuser(base, kdata, + copy, csum); + + len -= copy + partial_cnt; + kdata += copy + partial_cnt; + } + iov++; + } + + while (len>0) + { + u8 *base = iov->iov_base; + int copy=min(len, iov->iov_len); + + if (partial_cnt) + { + int par_len = 4 - partial_cnt; + + copy_from_user(&partial, base + partial_cnt, par_len); + csum = csum_partial((u8*) &partial, 4, csum); + base += par_len; + copy -= par_len; + partial_cnt = 0; + } + + if (len - copy > 0) + { + partial_cnt = copy % 4; + if (partial_cnt) + { + copy -= partial_cnt; + copy_from_user(&partial, base + copy, + partial_cnt); + } + } + + csum = csum_partial_copy_fromuser(base, kdata, copy, csum); + len -= copy + partial_cnt; + kdata += copy + partial_cnt; + iov++; + } + + return csum; +} diff --git a/net/core/net_alias.c b/net/core/net_alias.c new file mode 100644 index 000000000..358303705 --- /dev/null +++ b/net/core/net_alias.c @@ -0,0 +1,1388 @@ +/* + * NET_ALIAS network device aliasing module. 
+ * + * + * Version: @(#)net_alias.c 0.43 12/20/95 + * + * Authors: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> + * Marcelo Fabian Roccasalva, <mfroccas@raiz.uncu.edu.ar> + * + * Features: + * - AF_ independent: net_alias_type objects + * - AF_INET optimized + * - ACTUAL alias devices inserted in dev chain + * - fast hashed alias address lookup + * - net_alias_type objs registration/unreg., module-ables. + * - /proc/net/aliases & /proc/net/alias_types entries + * Fixes: + * JJC : several net_alias_type func. renamed. + * JJC : net_alias_type object methods now pass + * *this. + * JJC : xxx_rcv device selection based on <src,dst> + * addrs + * Andreas Schultz : Kerneld support. + * + * FIXME: + * - User calls sleep/wake_up locking. + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/notifier.h> +#include <linux/if.h> +#include <linux/inet.h> +#include <linux/in.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> + +#ifdef ALIAS_USER_LAND_DEBUG +#include "net_alias.h" +#include "user_stubs.h" +#endif + +#include <linux/net_alias.h> + +#ifdef CONFIG_KERNELD +#include <linux/kerneld.h> +#endif + +/* + * Only allow the following flags to pass from main device to aliases + */ + +#define NET_ALIAS_IFF_MASK (IFF_UP|IFF_BROADCAST|IFF_RUNNING|IFF_NOARP|IFF_LOOPBACK|IFF_POINTOPOINT) + +static struct net_alias_type * nat_getbytype(int type); +static int nat_attach_chg(struct net_alias_type *nat, int delta); +static int nat_bind(struct net_alias_type *nat,struct net_alias *alias, struct sockaddr *sa); +static int nat_unbind(struct net_alias_type *nat, struct net_alias *alias); + + +static int net_alias_devinit(struct device *dev); +static int net_alias_hard_start_xmit(struct sk_buff *skb, struct device *dev); +static int net_alias_devsetup(struct net_alias *alias, struct net_alias_type *nat, struct sockaddr *sa); +static struct net_alias **net_alias_slow_findp(struct net_alias_info *alias_info, struct net_alias *alias); +static struct device *net_alias_dev_create(struct device *main_dev, int slot, int *err, struct sockaddr *sa, void *data); +static struct device *net_alias_dev_delete(struct device *main_dev, int slot, int *err); +static void net_alias_free(struct device *dev); + +/* + * net_alias_type base array, will hold net_alias_type obj hashed list heads. 
+ */ + +struct net_alias_type *nat_base[16]; + + +/* + * get net_alias_type ptr by type + */ + +static __inline__ struct net_alias_type * +nat_getbytype(int type) +{ + struct net_alias_type *nat; + for(nat = nat_base[type & 0x0f]; nat ; nat = nat->next) + { + if (nat->type == type) return nat; + } + return NULL; +} + + +/* + * get addr32 representation (pre-hashing) of address. + * if NULL nat->get_addr32, assume sockaddr_in struct (IP-ish). + */ + +static __inline__ __u32 +nat_addr32(struct net_alias_type *nat, struct sockaddr *sa) +{ + if (nat->get_addr32) + return nat->get_addr32(nat, sa); + else + return (*(struct sockaddr_in *)sa).sin_addr.s_addr; +} + + +/* + * hashing code for alias_info->hash_tab entries + * 4 bytes -> 1/2 byte using xor complemented by af + */ + +static __inline__ unsigned +HASH(__u32 addr, int af) +{ + unsigned tmp = addr ^ (addr>>16); /* 4 -> 2 */ + tmp ^= (tmp>>8); /* 2 -> 1 */ + return (tmp^(tmp>>4)^af) & 0x0f; /* 1 -> 1/2 */ +} + + +/* + * get hash key for supplied net alias type and address + * nat must be !NULL + * the purpose here is to map a net_alias_type and a generic + * address to a hash code. 
+ */ + +static __inline__ int +nat_hash_key(struct net_alias_type *nat, struct sockaddr *sa) +{ + return HASH(nat_addr32(nat,sa), sa->sa_family); +} + + +/* + * change net_alias_type number of attachments (bindings) + */ + +static int +nat_attach_chg(struct net_alias_type *nat, int delta) +{ + unsigned long flags; + int n_at; + if (!nat) return -1; + save_flags(flags); + cli(); + n_at = nat->n_attach + delta; + if (n_at < 0) + { + restore_flags(flags); + printk(KERN_WARNING "net_alias: tried to set n_attach < 0 for (family==%d) nat object.\n", + nat->type); + return -1; + } + nat->n_attach = n_at; + restore_flags(flags); + return 0; +} + + +/* + * bind alias to its type (family) object and call initialization hook + */ + +static __inline__ int +nat_bind(struct net_alias_type *nat,struct net_alias *alias, struct sockaddr *sa) +{ + if (nat->alias_init_1) nat->alias_init_1(nat, alias, sa); + return nat_attach_chg(nat, +1); +} + + +/* + * unbind alias from type object and call alias destructor + */ + +static __inline__ int +nat_unbind(struct net_alias_type *nat, struct net_alias *alias) +{ + if (nat->alias_done_1) nat->alias_done_1(nat, alias); + return nat_attach_chg(nat, -1); +} + + +/* + * compare device address with given. if NULL nat->dev_addr_chk, + * compare dev->pa_addr with (sockaddr_in) 32 bits address (IP-ish) + */ + +static __inline__ int nat_dev_addr_chk_1(struct net_alias_type *nat, + struct device *dev, struct sockaddr *sa) +{ + if (nat->dev_addr_chk) + return nat->dev_addr_chk(nat, dev, sa); + else + return (dev->pa_addr == (*(struct sockaddr_in *)sa).sin_addr.s_addr); +} + + +/* + * alias device init() + * do nothing. + */ + +static int +net_alias_devinit(struct device *dev) +{ +#ifdef ALIAS_USER_LAND_DEBUG + printk("net_alias_devinit(%s) called.\n", dev->name); +#endif + return 0; +} + + +/* + * hard_start_xmit() should not be called. + * ignore ... but shout!. 
+ */ + +static int +net_alias_hard_start_xmit(struct sk_buff *skb, struct device *dev) +{ + printk(KERN_WARNING "net_alias: net_alias_hard_start_xmit() for %s called (ignored)!!\n", dev->name); + dev_kfree_skb(skb, FREE_WRITE); + return 0; +} + + +static int +net_alias_open(struct device * dev) +{ + return 0; +} + +static int +net_alias_close(struct device * dev) +{ + return 0; +} + +/* + * setups a new (alias) device + */ + +static int +net_alias_devsetup(struct net_alias *alias, struct net_alias_type *nat, + struct sockaddr *sa) +{ + struct device *main_dev; + struct device *dev; + int family; + int i; + + /* + * + * generic device setup based on main_dev info + * + * FIXME: is NULL bitwise 0 for all Linux platforms? + */ + + main_dev = alias->main_dev; + dev = &alias->dev; + memset(dev, '\0', sizeof(struct device)); + family = (sa)? sa->sa_family : main_dev->family; + + dev->alias_info = NULL; /* no aliasing recursion */ + dev->my_alias = alias; /* point to alias */ + dev->name = alias->name; + dev->type = main_dev->type; + dev->open = net_alias_open; + dev->stop = net_alias_close; + dev->hard_header_len = main_dev->hard_header_len; + memcpy(dev->broadcast, main_dev->broadcast, MAX_ADDR_LEN); + memcpy(dev->dev_addr, main_dev->dev_addr, MAX_ADDR_LEN); + dev->addr_len = main_dev->addr_len; + dev->init = net_alias_devinit; + dev->hard_start_xmit = net_alias_hard_start_xmit; + dev->flags = main_dev->flags & NET_ALIAS_IFF_MASK & ~IFF_UP; + + /* + * only makes sense if same family + */ + + if (family == main_dev->family) + { + dev->metric = main_dev->metric; + dev->mtu = main_dev->mtu; + dev->pa_alen = main_dev->pa_alen; + dev->hard_header = main_dev->hard_header; + dev->rebuild_header = main_dev->rebuild_header; + } + + /* + * Fill in the generic fields of the device structure. 
+ * not actually used, avoids some dev.c #ifdef's + */ + + for (i = 0; i < DEV_NUMBUFFS; i++) + skb_queue_head_init(&dev->buffs[i]); + + dev->family = family; + return 0; +} + + +/* + * slow alias find (parse the whole hash_tab) + * returns: alias' pointer address + */ + +static struct net_alias ** +net_alias_slow_findp(struct net_alias_info *alias_info, struct net_alias *alias) +{ + unsigned idx, n_aliases; + struct net_alias **aliasp; + + /* + * for each alias_info's hash_tab entry, for every alias ... + */ + + n_aliases = alias_info->n_aliases; + for (idx=0; idx < 16 ; idx++) + for (aliasp = &alias_info->hash_tab[idx];*aliasp;aliasp = &(*aliasp)->next) + if (*aliasp == alias) + return aliasp; + else + if (--n_aliases == 0) break; /* faster give up */ + return NULL; +} + + +/* + * create alias device for main_dev with given slot num. + * if sa==NULL will create a same_family alias device + */ + +static struct device * +net_alias_dev_create(struct device *main_dev, int slot, int *err, struct sockaddr *sa, void *data) +{ + struct net_alias_info *alias_info; + struct net_alias *alias, **aliasp; + struct net_alias_type *nat; + struct device *dev; + unsigned long flags; + int family; + __u32 addr32; + + /* FIXME: lock */ + alias_info = main_dev->alias_info; + + /* + * if NULL address given, take family from main_dev + */ + + family = (sa)? sa->sa_family : main_dev->family; + + /* + * check if wanted family has a net_alias_type object registered + */ + + nat = nat_getbytype(family); + if (!nat) { +#ifdef CONFIG_KERNELD + char modname[20]; + sprintf (modname,"netalias-%d", family); + request_module(modname); + + nat = nat_getbytype(family); + if (!nat) { +#endif + printk(KERN_WARNING "net_alias_dev_create(%s:%d): unregistered family==%d\n", + main_dev->name, slot, family); + /* *err = -EAFNOSUPPORT; */ + *err = -EINVAL; + return NULL; +#ifdef CONFIG_KERNELD + } +#endif + } + + /* + * do not allow creation over downed devices + */ + + *err = -EIO; + + if (! 
(main_dev->flags & IFF_UP) ) + return NULL; + + /* + * if first alias, must also create alias_info + */ + + *err = -ENOMEM; + + if (!alias_info) + { + alias_info = kmalloc(sizeof(struct net_alias_info), GFP_KERNEL); + if (!alias_info) return NULL; /* ENOMEM */ + memset(alias_info, 0, sizeof(struct net_alias_info)); + } + + if (!(alias = kmalloc(sizeof(struct net_alias), GFP_KERNEL))) + return NULL; /* ENOMEM */ + + /* + * FIXME: is NULL bitwise 0 for all Linux platforms? + */ + + memset(alias, 0, sizeof(struct net_alias)); + alias->slot = slot; + alias->main_dev = main_dev; + alias->nat = nat; + alias->next = NULL; + alias->data = data; + sprintf(alias->name, "%s:%d", main_dev->name, slot); + + /* + * initialise alias' device structure + */ + + net_alias_devsetup(alias, nat, sa); + + dev = &alias->dev; + + save_flags(flags); + cli(); + + /* + * bind alias to its object type + * nat_bind calls nat->alias_init_1 + */ + + nat_bind(nat, alias, sa); + + /* + * if no address passed, take from device (could have been + * set by nat->alias_init_1) + */ + + addr32 = (sa)? nat_addr32(nat, sa) : alias->dev.pa_addr; + + /* + * store hash key in alias: will speed-up rehashing and deletion + */ + + alias->hash = HASH(addr32, family); + + /* + * insert alias in hashed linked list + */ + + aliasp = &alias_info->hash_tab[alias->hash]; + alias->next = *aliasp; + *aliasp = alias; + + /* + * if first alias ... 
+ */ + + if (!alias_info->n_aliases++) + { + alias_info->taildev = main_dev; + main_dev->alias_info = alias_info; + } + + /* + * add device at tail (just after last main_dev alias) + */ + + dev->next = alias_info->taildev->next; + alias_info->taildev->next = dev; + alias_info->taildev = dev; + restore_flags(flags); + return dev; +} + + +/* + * delete one main_dev alias (referred by its slot num) + */ + +static struct device * +net_alias_dev_delete(struct device *main_dev, int slot, int *err) +{ + struct net_alias_info *alias_info; + struct net_alias *alias, **aliasp; + struct device *dev; + unsigned n_aliases; + unsigned long flags; + struct net_alias_type *nat; + struct device *prevdev; + + /* FIXME: lock */ + *err = -ENODEV; + + if (main_dev == NULL) return NULL; + + /* + * does main_dev have aliases? + */ + + alias_info = main_dev->alias_info; + if (!alias_info) return NULL; /* ENODEV */ + + n_aliases = alias_info->n_aliases; + + /* + * find device that holds the same slot number (could also + * be strcmp() ala dev_get). + */ + + for (prevdev=main_dev, alias = NULL;prevdev->next && n_aliases; prevdev = prevdev->next) + { + if (!(alias = prevdev->next->my_alias)) + { + printk(KERN_ERR "net_alias_dev_delete(): incorrect non-alias device after maindev\n"); + continue; /* or should give up? 
*/ + } + if (alias->slot == slot) break; + alias = NULL; + n_aliases--; + } + + if (!alias) return NULL; /* ENODEV */ + + dev = &alias->dev; + + /* + * find alias hashed entry + */ + + for(aliasp = &alias_info->hash_tab[alias->hash]; *aliasp; aliasp = &(*aliasp)->next) + if(*aliasp == alias) break; + + /* + * if not found (???), try a full search + */ + + if (*aliasp != alias) + if ((aliasp = net_alias_slow_findp(alias_info, alias))) + printk(KERN_WARNING "net_alias_dev_delete(%s): bad hashing recovered\n", alias->name); + else + { + printk(KERN_ERR "net_alias_dev_delete(%s): unhashed alias!\n",alias->name); + return NULL; /* ENODEV */ + } + + nat = alias->nat; + + save_flags(flags); + cli(); + + /* + * unbind alias from alias_type obj. + */ + + nat_unbind(nat, alias); + + /* + * is alias at tail? + */ + + if ( dev == alias_info->taildev ) + alias_info->taildev = prevdev; + + /* + * unlink and close device + */ + prevdev->next = dev->next; + dev_close(dev); + + /* + * unlink alias + */ + + *aliasp = (*aliasp)->next; + + if (--alias_info->n_aliases == 0) /* last alias */ + main_dev->alias_info = NULL; + restore_flags(flags); + + /* + * now free structures + */ + + kfree_s(alias, sizeof(struct net_alias)); + if (main_dev->alias_info == NULL) + kfree_s(alias_info, sizeof(struct net_alias_info)); + + /* + * deletion ok (*err=0), NULL device returned. + */ + + *err = 0; + return NULL; +} + +/* + * free all main device aliasing stuff + * will be called on dev_close(main_dev) + */ + +static void +net_alias_free(struct device *main_dev) +{ + struct net_alias_info *alias_info; + struct net_alias *alias; + struct net_alias_type *nat; + struct device *dev; + unsigned long flags; + + /* + * do I really have aliases? 
+ */ + + if (!(alias_info = main_dev->alias_info)) return; + + /* + * fast device link "short-circuit": set main_dev->next to + * device after last alias + */ + + save_flags(flags); + cli(); + + dev = main_dev->next; + main_dev->next = alias_info->taildev->next; + main_dev->alias_info = NULL; + alias_info->taildev->next = NULL; + + restore_flags(flags); + + /* + * loop over alias devices, free and dev_close() + */ + + while (dev) + { + if (net_alias_is(dev)) + { + alias = dev->my_alias; + if (alias->main_dev == main_dev) + { + /* + * unbind alias from alias_type object + */ + + nat = alias->nat; + if (nat) + { + nat_unbind(nat, alias); + } /* else error/printk ??? */ + + dev_close(dev); + dev = dev->next; + + kfree_s(alias, sizeof(struct net_alias)); + continue; + } + else + printk(KERN_ERR "net_alias_free(%s): '%s' is not my alias\n", + main_dev->name, alias->name); + } + else + printk(KERN_ERR "net_alias_free(%s): found a non-alias after device!\n", + main_dev->name); + dev = dev->next; + } + + kfree_s(alias_info, sizeof(alias_info)); + return; +} + +/* + * dev_get() with added alias naming magic. + */ + +struct device * +net_alias_dev_get(char *dev_name, int aliasing_ok, int *err, + struct sockaddr *sa, void *data) +{ + struct device *dev; + char *sptr,*eptr; + int slot = 0; + int delete = 0; + + *err = -ENODEV; + if ((dev=dev_get(dev_name))) + return dev; + + /* + * want alias naming magic? 
+ */ + + if (!aliasing_ok) return NULL; + + if (!dev_name || !*dev_name) + return NULL; + + /* + * find the first ':' , must be followed by, at least, 1 char + */ + + for (sptr=dev_name ; *sptr ; sptr++) if(*sptr==':') break; + if (!*sptr || !*(sptr+1)) + return NULL; + + /* + * seems to be an alias name, fetch main device + */ + + *sptr='\0'; + if (!(dev=dev_get(dev_name))) + return NULL; + *sptr++=':'; + + /* + * fetch slot number + */ + + slot = simple_strtoul(sptr,&eptr,10); + if (slot >= NET_ALIAS_MAX_SLOT) + return NULL; + + /* + * if last char is '-', it is a deletion request + */ + + if (eptr[0] == '-' && !eptr[1] ) delete++; + else if (eptr[0]) + return NULL; + + /* + * well... let's work. + */ + + if (delete) + return net_alias_dev_delete(dev, slot, err); + else + return net_alias_dev_create(dev, slot, err, sa, data); +} + + +/* + * rehash alias device with address supplied. + */ + +int +net_alias_dev_rehash(struct device *dev, struct sockaddr *sa) +{ + struct net_alias_info *alias_info; + struct net_alias *alias, **aliasp; + struct device *main_dev; + unsigned long flags; + struct net_alias_type *o_nat, *n_nat; + unsigned n_hash; + + /* + * defensive ... + */ + + if (dev == NULL) return -1; + if ( (alias = dev->my_alias) == NULL ) return -1; + + if (!sa) + { + printk(KERN_ERR "net_alias_rehash(): NULL sockaddr passed\n"); + return -1; + } + + /* + * defensive. should not happen. + */ + + if ( (main_dev = alias->main_dev) == NULL ) + { + printk(KERN_ERR "net_alias_rehash for %s: NULL maindev\n", alias->name); + return -1; + } + + /* + * defensive. should not happen. + */ + + if (!(alias_info=main_dev->alias_info)) + { + printk(KERN_ERR "net_alias_rehash for %s: NULL alias_info\n", alias->name); + return -1; + } + + /* + * will the request also change device family? + */ + + o_nat = alias->nat; + if (!o_nat) + { + printk(KERN_ERR "net_alias_rehash(%s): unbound alias.\n", alias->name); + return -1; + } + + /* + * point to new alias_type obj. 
+ */ + + if (o_nat->type == sa->sa_family) + n_nat = o_nat; + else + { + n_nat = nat_getbytype(sa->sa_family); + if (!n_nat) + { + printk(KERN_ERR "net_alias_rehash(%s): unreg family==%d.\n", alias->name, sa->sa_family); + return -1; + } + } + + /* + * new hash key. if same as old AND same type (family) return; + */ + + n_hash = nat_hash_key(n_nat, sa); + if (n_hash == alias->hash && o_nat == n_nat ) + return 0; + + /* + * find alias in hashed list + */ + + for (aliasp = &alias_info->hash_tab[alias->hash]; *aliasp; aliasp = &(*aliasp)->next) + if (*aliasp == alias) break; + + /* + * not found (???). try a full search + */ + + if(!*aliasp) + if ((aliasp = net_alias_slow_findp(alias_info, alias))) + printk(KERN_WARNING "net_alias_rehash(%s): bad hashing recovered\n", alias->name); + else + { + printk(KERN_ERR "net_alias_rehash(%s): unhashed alias!\n", alias->name); + return -1; + } + + save_flags(flags); + cli(); + + /* + * if type (family) changed, unlink from old type object (o_nat) + * will call o_nat->alias_done_1() + */ + + if (o_nat != n_nat) + nat_unbind(o_nat, alias); + + /* + * if diff hash key, change alias position in hashed list + */ + + if (n_hash != alias->hash) + { + *aliasp = (*aliasp)->next; + alias->hash = n_hash; + aliasp = &alias_info->hash_tab[n_hash]; + alias->next = *aliasp; + *aliasp = alias; + } + + /* + * if type (family) changed link to new type object (n_nat) + * will call n_nat->alias_init_1() + */ + + if (o_nat != n_nat) + nat_bind(n_nat, alias, sa); + + restore_flags(flags); + return 0; +} + + + + +/* + * implements /proc/net/alias_types entry + * shows net_alias_type objects registered. 
+ */ + +int net_alias_types_getinfo(char *buffer, char **start, off_t offset, int length, int dummy) +{ + off_t pos=0, begin=0; + int len=0; + struct net_alias_type *nat; + unsigned idx; + len=sprintf(buffer,"type name n_attach\n"); + for (idx=0 ; idx < 16 ; idx++) + for (nat = nat_base[idx]; nat ; nat = nat->next) + { + len += sprintf(buffer+len, "%-7d %-15s %-7d\n", + nat->type, nat->name,nat->n_attach); + pos=begin+len; + if(pos<offset) + { + len=0; + begin=pos; + } + if(pos>offset+length) + break; + } + *start=buffer+(offset-begin); + len-=(offset-begin); + if(len>length) + len=length; + return len; +} + + +/* + * implements /proc/net/aliases entry, shows alias devices. + * calls alias nat->alias_print_1 if not NULL and formats everything + * to a fixed rec. size without using local (stack) buffers + * + */ + +#define NET_ALIASES_RECSIZ 64 +int net_alias_getinfo(char *buffer, char **start, off_t offset, int length, int dummy) +{ + off_t pos=0, begin=0; + int len=0; + int dlen; + struct net_alias_type *nat; + struct net_alias *alias; + struct device *dev; + + len=sprintf(buffer,"%-*s\n",NET_ALIASES_RECSIZ-1,"device family address"); + for (dev = dev_base; dev ; dev = dev->next) + if (net_alias_is(dev)) + { + alias = dev->my_alias; + nat = alias->nat; + dlen=sprintf(buffer+len, "%-16s %-6d ", alias->name, alias->dev.family); + + /* + * call alias_type specific print function. 
+ */ + + if (nat->alias_print_1) + dlen += nat->alias_print_1(nat, alias, buffer+len+dlen, NET_ALIASES_RECSIZ - dlen); + else + dlen += sprintf(buffer+len+dlen, "-"); + + /* + * fill with spaces if needed + */ + + if (dlen < NET_ALIASES_RECSIZ) memset(buffer+len+dlen, ' ', NET_ALIASES_RECSIZ - dlen); + /* + * truncate to NET_ALIASES_RECSIZ + */ + + len += NET_ALIASES_RECSIZ; + buffer[len-1] = '\n'; + + pos=begin+len; + if(pos<offset) + { + len=0; + begin=pos; + } + if(pos>offset+length) + break; + } + *start=buffer+(offset-begin); + len-=(offset-begin); + if(len>length) + len=length; + return len; +} + + +/* + * notifier for devices events + */ + +int net_alias_device_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct device *dev = ptr; + + if (event == NETDEV_DOWN) + { +#ifdef ALIAS_USER_LAND_DEBUG + printk("net_alias: NETDEV_DOWN for %s received\n", dev->name); +#endif + if (net_alias_has(dev)) + net_alias_free(dev); + } + + if (event == NETDEV_UP) + { +#ifdef ALIAS_USER_LAND_DEBUG + printk("net_alias: NETDEV_UP for %s received\n", dev->name); +#endif + dev->alias_info = 0; + } + + return NOTIFY_DONE; +} + + +/* + * device aliases address comparison workhorse + * no checks for nat and alias_info, must be !NULL + */ + +static __inline__ struct device * +nat_addr_chk(struct net_alias_type *nat, struct net_alias_info *alias_info, struct sockaddr *sa, int flags_on, int flags_off) +{ + struct net_alias *alias; + for(alias = alias_info->hash_tab[nat_hash_key(nat,sa)]; + alias; alias = alias->next) + { + if (alias->dev.family != sa->sa_family) continue; + + /* + * nat_dev_addr_chk_1 will call type specific address cmp function. + */ + + if (alias->dev.flags & flags_on && !(alias->dev.flags & flags_off) && + nat_dev_addr_chk_1(nat,&alias->dev,sa)) + return &alias->dev; + } + return NULL; +} + +/* + * nat_addr_chk enough for protocols whose addr is (fully) stored at pa_addr. + * note that nat pointer is ignored because of static comparison. 
+ */ + +static __inline__ struct device * +nat_addr_chk32(struct net_alias_type *nat, struct net_alias_info *alias_info, int family, __u32 addr32, int flags_on, int flags_off) +{ + struct net_alias *alias; + for (alias=alias_info->hash_tab[HASH(addr32,family)]; + alias; alias=alias->next) + { + if (alias->dev.family != family) continue; + + /* + * "hard" (static) comparison between addr32 and pa_addr. + */ + + if (alias->dev.flags & flags_on && !(alias->dev.flags & flags_off) && + addr32 == alias->dev.pa_addr) + return &alias->dev; + } + return NULL; +} + +/* + * returns alias device with specified address AND flags_on AND flags_off, + * else NULL. + * intended for main devices. + */ + +struct device * +net_alias_dev_chk(struct device *main_dev, struct sockaddr *sa,int flags_on, int flags_off) +{ + struct net_alias_info *alias_info = main_dev->alias_info; + struct net_alias_type *nat; + + /* + * only if main_dev has aliases + */ + + if (!alias_info) return NULL; + + /* + * get alias_type object for sa->sa_family. + */ + + nat = nat_getbytype(sa->sa_family); + if (!nat) + return NULL; + + return nat_addr_chk(nat, alias_info, sa, flags_on, flags_off); +} + +/* + * net_alias_dev_chk enough for protocols whose addr is (fully) stored + * at pa_addr. + */ + +struct device * +net_alias_dev_chk32(struct device *main_dev, int family, __u32 addr32, + int flags_on, int flags_off) +{ + struct net_alias_info *alias_info = main_dev->alias_info; + + /* + * only if main_dev has aliases + */ + + if (!alias_info) return NULL; + + return nat_addr_chk32(NULL, alias_info, family, addr32, flags_on, flags_off); +} + + +/* + * select closest (main or alias) device to <src,dst> addresses given. if no + * further info is available, return main_dev (for easier calling arrangement). 
+ * + * Should be called early at xxx_rcv() time for device selection + */ + +struct device * +net_alias_dev_rcv_sel(struct device *main_dev, struct sockaddr *sa_src, struct sockaddr *sa_dst) +{ + int family; + struct net_alias_type *nat; + struct net_alias_info *alias_info; + struct device *dev; + + if (main_dev == NULL) return NULL; + + /* + * if not aliased, don't bother any more + */ + + if ((alias_info = main_dev->alias_info) == NULL) + return main_dev; + + /* + * find out family + */ + + family = (sa_src)? sa_src->sa_family : ((sa_dst)? sa_dst->sa_family : AF_UNSPEC); + if (family == AF_UNSPEC) return main_dev; + + /* + * get net_alias_type object for this family + */ + + if ( (nat = nat_getbytype(family)) == NULL ) return main_dev; + + /* + * first step: find out if dst addr is main_dev's or one of its aliases' + */ + + if (sa_dst) + { + if (nat_dev_addr_chk_1(nat, main_dev,sa_dst)) + return main_dev; + + dev = nat_addr_chk(nat, alias_info, sa_dst, IFF_UP, 0); + + if (dev != NULL) return dev; + } + + /* + * second step: find the rcv addr 'closest' alias through nat method call + */ + + if ( sa_src == NULL || nat->dev_select == NULL) return main_dev; + dev = nat->dev_select(nat, main_dev, sa_src); + + if (dev == NULL || dev->family != family) return main_dev; + + /* + * dev ok only if it is alias of main_dev + */ + + dev = net_alias_is(dev)? + ( (dev->my_alias->main_dev == main_dev)? dev : NULL) : NULL; + + /* + * do not return NULL. + */ + + return (dev)? dev : main_dev; + +} + +/* + * dev_rcv_sel32: dev_rcv_sel for 'pa_addr' protocols. 
+ */ + +struct device * +net_alias_dev_rcv_sel32(struct device *main_dev, int family, __u32 src, __u32 dst) +{ + struct net_alias_type *nat; + struct net_alias_info *alias_info; + struct sockaddr_in sin_src; + struct device *dev; + + if (main_dev == NULL) return NULL; + + /* + * if not aliased, don't bother any more + */ + + if ((alias_info = main_dev->alias_info) == NULL) + return main_dev; + + /* + * early return if dst is main_dev's address + */ + + if (dst == main_dev->pa_addr) + return main_dev; + + if (family == AF_UNSPEC) return main_dev; + + /* + * get net_alias_type object for this family + */ + + if ( (nat = nat_getbytype(family)) == NULL ) return main_dev; + + /* + * first step: find out if dst address one of main_dev aliases' + */ + + if (dst) + { + dev = nat_addr_chk32(nat, alias_info, family, dst, IFF_UP, 0); + if (dev) return dev; + } + + /* + * second step: find the rcv addr 'closest' alias through nat method call + */ + + if ( src == 0 || nat->dev_select == NULL) return main_dev; + + sin_src.sin_family = family; + sin_src.sin_addr.s_addr = src; + + dev = nat->dev_select(nat, main_dev, (struct sockaddr *)&sin_src); + + if (dev == NULL || dev->family != family) return main_dev; + + /* + * dev ok only if it is alias of main_dev + */ + + dev = net_alias_is(dev)? + ( (dev->my_alias->main_dev == main_dev)? dev : NULL) : NULL; + + /* + * do not return NULL. + */ + + return (dev)? 
dev : main_dev; + +} + + +/* + * device event hook + */ + +static struct notifier_block net_alias_dev_notifier = { + net_alias_device_event, + NULL, + 0 +}; + +#ifndef ALIAS_USER_LAND_DEBUG +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry proc_net_alias_types = { + PROC_NET_ALIAS_TYPES, 11, "alias_types", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + net_alias_types_getinfo +}; +static struct proc_dir_entry proc_net_aliases = { + PROC_NET_ALIASES, 7, "aliases", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + net_alias_getinfo +}; +#endif +#endif + +/* + * net_alias initialisation + * called from net_dev_init(). + */ + +void net_alias_init(void) +{ + + /* + * register dev events notifier + */ + + register_netdevice_notifier(&net_alias_dev_notifier); + + /* + * register /proc/net entries + */ + +#ifndef ALIAS_USER_LAND_DEBUG +#ifdef CONFIG_PROC_FS + proc_net_register(&proc_net_alias_types); + proc_net_register(&proc_net_aliases); +#endif +#endif + +} + +/* + * net_alias type object registering func. + */ +int register_net_alias_type(struct net_alias_type *nat, int type) +{ + unsigned hash; + unsigned long flags; + if (!nat) + { + printk(KERN_ERR "register_net_alias_type(): NULL arg\n"); + return -EINVAL; + } + nat->type = type; + nat->n_attach = 0; + hash = nat->type & 0x0f; + save_flags(flags); + cli(); + nat->next = nat_base[hash]; + nat_base[hash] = nat; + restore_flags(flags); + return 0; +} + +/* + * net_alias type object unreg. + */ +int unregister_net_alias_type(struct net_alias_type *nat) +{ + struct net_alias_type **natp; + unsigned hash; + unsigned long flags; + + if (!nat) + { + printk(KERN_ERR "unregister_net_alias_type(): NULL arg\n"); + return -EINVAL; + } + + /* + * only allow unregistration if it has no attachments + */ + if (nat->n_attach) + { + printk(KERN_ERR "unregister_net_alias_type(): has %d attachments. 
failed\n", + nat->n_attach); + return -EINVAL; + } + hash = nat->type & 0x0f; + save_flags(flags); + cli(); + for (natp = &nat_base[hash]; *natp ; natp = &(*natp)->next) + { + if (nat==(*natp)) + { + *natp = nat->next; + restore_flags(flags); + return 0; + } + } + restore_flags(flags); + printk(KERN_ERR "unregister_net_alias_type(type=%d): not found!\n", nat->type); + return -EINVAL; +} + diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1b49683e6..c90d8d4e2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -6,8 +6,21 @@ * * Fixes: * Alan Cox : Fixed the worst of the load balancer bugs. - * Dave Platt : Interrupt stacking fix + * Dave Platt : Interrupt stacking fix. * Richard Kooijman : Timestamp fixes. + * Alan Cox : Changed buffer format. + * Alan Cox : destructor hook for AF_UNIX etc. + * Linus Torvalds : Better skb_clone. + * Alan Cox : Added skb_copy. + * Alan Cox : Added all the changed routines Linus + * only put in the headers + * Ray VanTassle : Fixed --skb->lock in free + * + * TO FIX: + * The __skb_ routines ought to check interrupts are disabled + * when called, and bitch like crazy if not. Unfortunately I don't think + * we currently have a portable way to check if interrupts are off - + * Linus ??? * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -16,51 +29,62 @@ */ /* - * Note: There are a load of cli()/sti() pairs protecting the net_memory type - * variables. Without them for some reason the ++/-- operators do not come out - * atomic. Also with gcc 2.4.5 these counts can come out wrong anyway - use 2.5.8!! 
+ * The functions in this file will not compile correctly with gcc 2.4.x */ #include <linux/config.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched.h> -#include <asm/segment.h> -#include <asm/system.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/in.h> #include <linux/inet.h> +#include <linux/malloc.h> #include <linux/netdevice.h> +#include <linux/string.h> +#include <linux/skbuff.h> + #include <net/ip.h> +#include <net/ipv6.h> #include <net/protocol.h> -#include <linux/string.h> #include <net/route.h> #include <net/tcp.h> #include <net/udp.h> -#include <linux/skbuff.h> #include <net/sock.h> +#include <asm/uaccess.h> +#include <asm/system.h> /* * Resource tracking variables */ -volatile unsigned long net_memory = 0; -volatile unsigned long net_skbcount = 0; -volatile unsigned long net_locked = 0; -volatile unsigned long net_allocs = 0; -volatile unsigned long net_fails = 0; -volatile unsigned long net_free_locked = 0; +atomic_t net_skbcount = 0; +atomic_t net_locked = 0; +atomic_t net_allocs = 0; +atomic_t net_fails = 0; +atomic_t net_free_locked = 0; + +extern atomic_t ip_frag_mem; + +/* + * Strings we don't want inline's duplicating + */ + +char *skb_push_errstr="skpush:under: %p:%d"; +char *skb_put_errstr ="skput:over: %p:%d"; void show_net_buffers(void) { - printk("Networking buffers in use : %lu\n",net_skbcount); - printk("Memory committed to network buffers: %lu\n",net_memory); - printk("Network buffers locked by drivers : %lu\n",net_locked); - printk("Total network buffer allocations : %lu\n",net_allocs); - printk("Total failed network buffer allocs : %lu\n",net_fails); - printk("Total free while locked events : %lu\n",net_free_locked); + printk(KERN_INFO "Networking buffers in use : %u\n",net_skbcount); + printk(KERN_INFO "Network buffers locked by drivers : %u\n",net_locked); + printk(KERN_INFO "Total network buffer allocations : %u\n",net_allocs); + printk(KERN_INFO "Total failed network buffer allocs : 
%u\n",net_fails); + printk(KERN_INFO "Total free while locked events : %u\n",net_free_locked); +#ifdef CONFIG_INET + printk(KERN_INFO "IP fragment buffer size : %u\n",ip_frag_mem); +#endif } #if CONFIG_SKB_CHECK @@ -127,35 +151,65 @@ int skb_check(struct sk_buff *skb, int head, int line, char *file) { printk("File: %s Line %d, found a freed skb lurking in the undergrowth!\n", file,line); - printk("skb=%p, real size=%ld, claimed size=%ld, free=%d\n", - skb,skb->truesize,skb->mem_len,skb->free); + printk("skb=%p, real size=%d, free=%d\n", + skb,skb->truesize,skb->free); return -1; } if(skb->magic_debug_cookie!=SK_GOOD_SKB) { printk("File: %s Line %d, passed a non skb!\n", file,line); - printk("skb=%p, real size=%ld, claimed size=%ld, free=%d\n", - skb,skb->truesize,skb->mem_len,skb->free); + printk("skb=%p, real size=%d, free=%d\n", + skb,skb->truesize,skb->free); + return -1; + } + if(skb->head>skb->data) + { + printk("File: %s Line %d, head > data !\n", file,line); + printk("skb=%p, head=%p, data=%p\n", + skb,skb->head,skb->data); return -1; } - if(skb->mem_len!=skb->truesize) + if(skb->tail>skb->end) { - printk("File: %s Line %d, Dubious size setting!\n",file,line); - printk("skb=%p, real size=%ld, claimed size=%ld\n", - skb,skb->truesize,skb->mem_len); + printk("File: %s Line %d, tail > end!\n", file,line); + printk("skb=%p, tail=%p, end=%p\n", + skb,skb->tail,skb->end); return -1; } + if(skb->data>skb->tail) + { + printk("File: %s Line %d, data > tail!\n", file,line); + printk("skb=%p, data=%p, tail=%p\n", + skb,skb->data,skb->tail); + return -1; + } + if(skb->tail-skb->data!=skb->len) + { + printk("File: %s Line %d, wrong length\n", file,line); + printk("skb=%p, data=%p, end=%p len=%ld\n", + skb,skb->data,skb->end,skb->len); + return -1; + } + if((unsigned long) skb->end > (unsigned long) skb) + { + printk("File: %s Line %d, control overrun\n", file,line); + printk("skb=%p, end=%p\n", + skb,skb->end); + return -1; + } + /* Guess it might be acceptable then */ 
return 0; } #endif -#ifdef CONFIG_SKB_CHECK +#if CONFIG_SKB_CHECK void skb_queue_head_init(struct sk_buff_head *list) { list->prev = (struct sk_buff *)list; list->next = (struct sk_buff *)list; + list->qlen = 0; list->magic_debug_cookie = SK_HEAD_SKB; } @@ -181,10 +235,32 @@ void skb_queue_head(struct sk_buff_head *list_,struct sk_buff *newsk) newsk->next->prev = newsk; newsk->prev->next = newsk; + newsk->list = list_; + list_->qlen++; restore_flags(flags); } +void __skb_queue_head(struct sk_buff_head *list_,struct sk_buff *newsk) +{ + struct sk_buff *list = (struct sk_buff *)list_; + + + IS_SKB(newsk); + IS_SKB_HEAD(list); + if (newsk->next || newsk->prev) + printk("Suspicious queue head: sk_buff on list!\n"); + + newsk->next = list->next; + newsk->prev = list; + + newsk->next->prev = newsk; + newsk->prev->next = newsk; + newsk->list = list_; + list_->qlen++; + +} + /* * Insert an sk_buff at the end of a list. */ @@ -206,10 +282,32 @@ void skb_queue_tail(struct sk_buff_head *list_, struct sk_buff *newsk) newsk->next->prev = newsk; newsk->prev->next = newsk; + + newsk->list = list_; + list_->qlen++; restore_flags(flags); } +void __skb_queue_tail(struct sk_buff_head *list_, struct sk_buff *newsk) +{ + struct sk_buff *list = (struct sk_buff *)list_; + + if (newsk->next || newsk->prev) + printk("Suspicious queue tail: sk_buff on list!\n"); + IS_SKB(newsk); + IS_SKB_HEAD(list); + + newsk->next = list; + newsk->prev = list->prev; + + newsk->next->prev = newsk; + newsk->prev->next = newsk; + + newsk->list = list_; + list_->qlen++; +} + /* * Remove an sk_buff from a list. This routine is also interrupt safe * so you can grab read and free buffers as another process adds them. 
@@ -217,7 +315,7 @@ void skb_queue_tail(struct sk_buff_head *list_, struct sk_buff *newsk) struct sk_buff *skb_dequeue(struct sk_buff_head *list_) { - long flags; + unsigned long flags; struct sk_buff *result; struct sk_buff *list = (struct sk_buff *)list_; @@ -237,13 +335,39 @@ struct sk_buff *skb_dequeue(struct sk_buff_head *list_) result->next = NULL; result->prev = NULL; - + list_->qlen--; + result->list = NULL; + restore_flags(flags); IS_SKB(result); return result; } +struct sk_buff *__skb_dequeue(struct sk_buff_head *list_) +{ + struct sk_buff *result; + struct sk_buff *list = (struct sk_buff *)list_; + + IS_SKB_HEAD(list); + + result = list->next; + if (result == list) { + return NULL; + } + + result->next->prev = list; + list->next = result->next; + + result->next = NULL; + result->prev = NULL; + list_->qlen--; + result->list = NULL; + + IS_SKB(result); + return result; +} + /* * Insert a packet before another one in a list. */ @@ -265,11 +389,41 @@ void skb_insert(struct sk_buff *old, struct sk_buff *newsk) newsk->prev = old->prev; old->prev = newsk; newsk->prev->next = newsk; + newsk->list = old->list; + newsk->list->qlen++; restore_flags(flags); } /* + * Insert a packet before another one in a list. + */ + +void __skb_insert(struct sk_buff *newsk, + struct sk_buff * prev, struct sk_buff *next, + struct sk_buff_head * list) +{ + IS_SKB(prev); + IS_SKB(newsk); + IS_SKB(next); + + if(!prev->next || !prev->prev) + printk("insert after unlisted item!\n"); + if(!next->next || !next->prev) + printk("insert before unlisted item!\n"); + if(newsk->next || newsk->prev) + printk("inserted item is already on a list.\n"); + + newsk->next = next; + newsk->prev = prev; + next->prev = newsk; + prev->next = newsk; + newsk->list = list; + list->qlen++; + +} + +/* * Place a packet after a given packet in a list. 
*/ void skb_append(struct sk_buff *old, struct sk_buff *newsk) @@ -291,6 +445,8 @@ void skb_append(struct sk_buff *old, struct sk_buff *newsk) newsk->next = old->next; newsk->next->prev = newsk; old->next = newsk; + newsk->list = old->list; + newsk->list->qlen++; restore_flags(flags); } @@ -310,12 +466,14 @@ void skb_unlink(struct sk_buff *skb) IS_SKB(skb); - if(skb->prev && skb->next) + if(skb->list) { + skb->list->qlen--; skb->next->prev = skb->prev; skb->prev->next = skb->next; skb->next = NULL; skb->prev = NULL; + skb->list = NULL; } #ifdef PARANOID_BUGHUNT_MODE /* This is legal but we sometimes want to watch it */ else @@ -324,6 +482,98 @@ void skb_unlink(struct sk_buff *skb) restore_flags(flags); } +void __skb_unlink(struct sk_buff *skb) +{ + IS_SKB(skb); + + if(skb->list) + { + skb->list->qlen--; + skb->next->prev = skb->prev; + skb->prev->next = skb->next; + skb->next = NULL; + skb->prev = NULL; + skb->list = NULL; + } +#ifdef PARANOID_BUGHUNT_MODE /* This is legal but we sometimes want to watch it */ + else + printk("skb_unlink: not a linked element\n"); +#endif +} + +/* + * Add data to an sk_buff + */ + +unsigned char *skb_put(struct sk_buff *skb, unsigned int len) +{ + unsigned char *tmp=skb->tail; + IS_SKB(skb); + skb->tail+=len; + skb->len+=len; + IS_SKB(skb); + if(skb->tail>skb->end) + panic("skput:over: %p:%d", return_address(),len); + return tmp; +} + +unsigned char *skb_push(struct sk_buff *skb, unsigned int len) +{ + IS_SKB(skb); + skb->data-=len; + skb->len+=len; + IS_SKB(skb); + if(skb->data<skb->head) + panic("skpush:under: %p:%d", return_address(),len); + return skb->data; +} + +unsigned char * skb_pull(struct sk_buff *skb, unsigned int len) +{ + IS_SKB(skb); + if(len>skb->len) + return 0; + skb->data+=len; + skb->len-=len; + return skb->data; +} + +int skb_headroom(struct sk_buff *skb) +{ + IS_SKB(skb); + return skb->data-skb->head; +} + +int skb_tailroom(struct sk_buff *skb) +{ + IS_SKB(skb); + return skb->end-skb->tail; +} + +void 
skb_reserve(struct sk_buff *skb, unsigned int len) +{ + IS_SKB(skb); + skb->data+=len; + skb->tail+=len; + if(skb->tail>skb->end) + panic("sk_res: over"); + if(skb->data<skb->head) + panic("sk_res: under"); + IS_SKB(skb); +} + +void skb_trim(struct sk_buff *skb, unsigned int len) +{ + IS_SKB(skb); + if(skb->len>len) + { + skb->len=len; + skb->tail=skb->data+len; + } +} + + + #endif /* @@ -335,11 +585,11 @@ void kfree_skb(struct sk_buff *skb, int rw) { if (skb == NULL) { - printk("kfree_skb: skb = NULL (from %p)\n", - __builtin_return_address(0)); + printk(KERN_CRIT "kfree_skb: skb = NULL (from %p)\n", + return_address()); return; } -#ifdef CONFIG_SKB_CHECK +#if CONFIG_SKB_CHECK IS_SKB(skb); #endif if (skb->lock) @@ -349,39 +599,39 @@ void kfree_skb(struct sk_buff *skb, int rw) return; } if (skb->free == 2) - printk("Warning: kfree_skb passed an skb that nobody set the free flag on! (from %p)\n", - __builtin_return_address(0)); - if (skb->next) - printk("Warning: kfree_skb passed an skb still on a list (from %p).\n", - __builtin_return_address(0)); + printk(KERN_WARNING "Warning: kfree_skb passed an skb that nobody set the free flag on! 
(from %p)\n", + return_address()); + if (skb->list) + printk(KERN_WARNING "Warning: kfree_skb passed an skb still on a list (from %p).\n", + return_address()); + + if(skb->destructor) + skb->destructor(skb); if (skb->sk) { - if(skb->sk->prot!=NULL) + struct sock * sk = skb->sk; + if(sk->prot!=NULL) { if (rw) - skb->sk->prot->rfree(skb->sk, skb, skb->mem_len); + sock_rfree(sk, skb); else - skb->sk->prot->wfree(skb->sk, skb, skb->mem_len); + sock_wfree(sk, skb); } else { - unsigned long flags; - /* Non INET - default wmalloc/rmalloc handler */ - save_flags(flags); - cli(); if (rw) - skb->sk->rmem_alloc-=skb->mem_len; - else - skb->sk->wmem_alloc-=skb->mem_len; - restore_flags(flags); - if(!skb->sk->dead) - skb->sk->write_space(skb->sk); - kfree_skbmem(skb,skb->mem_len); + atomic_sub(skb->truesize, &sk->rmem_alloc); + else { + if(!sk->dead) + sk->write_space(sk); + atomic_sub(skb->truesize, &sk->wmem_alloc); + } + kfree_skbmem(skb); } } else - kfree_skbmem(skb, skb->mem_len); + kfree_skbmem(skb); } /* @@ -391,20 +641,30 @@ void kfree_skb(struct sk_buff *skb, int rw) struct sk_buff *alloc_skb(unsigned int size,int priority) { struct sk_buff *skb; - unsigned long flags; + int len; + unsigned char *bptr; - if (intr_count && priority!=GFP_ATOMIC) { + if (intr_count && priority!=GFP_ATOMIC) + { static int count = 0; if (++count < 5) { - printk("alloc_skb called nonatomically from interrupt %p\n", - __builtin_return_address(0)); + printk(KERN_ERR "alloc_skb called nonatomically from interrupt %p\n", + return_address()); priority = GFP_ATOMIC; } } - size+=sizeof(struct sk_buff); - skb=(struct sk_buff *)kmalloc(size,priority); - if (skb == NULL) + size=(size+15)&~15; /* Allow for alignments. 
Make a multiple of 16 bytes */ + len = size; + + size+=sizeof(struct sk_buff); /* And stick the control itself on the end */ + + /* + * Allocate some space + */ + + bptr=(unsigned char *)kmalloc(size,priority); + if (bptr == NULL) { net_fails++; return NULL; @@ -413,34 +673,46 @@ struct sk_buff *alloc_skb(unsigned int size,int priority) if(skb->magic_debug_cookie == SK_GOOD_SKB) printk("Kernel kmalloc handed us an existing skb (%p)\n",skb); #endif - + /* + * Now we play a little game with the caches. Linux kmalloc is + * a bit cache dumb, in fact its just about maximally non + * optimal for typical kernel buffers. We actually run faster + * by doing the following. Which is to deliberately put the + * skb at the _end_ not the start of the memory block. + */ net_allocs++; + + skb=(struct sk_buff *)(bptr+size)-1; + + skb->count = 1; /* only one reference to this */ + skb->data_skb = NULL; /* and we're our own data skb */ skb->free = 2; /* Invalid so we pick up forgetful users */ skb->lock = 0; skb->pkt_type = PACKET_HOST; /* Default type */ - skb->truesize = size; - skb->mem_len = size; - skb->mem_addr = skb; -#ifdef CONFIG_SLAVE_BALANCING - skb->in_dev_queue = 0; -#endif - skb->fraglist = NULL; - skb->prev = skb->next = NULL; - skb->link3 = NULL; + skb->pkt_bridged = 0; /* Not bridged */ + skb->prev = skb->next = skb->link3 = NULL; + skb->list = NULL; skb->sk = NULL; + skb->truesize=size; skb->localroute=0; skb->stamp.tv_sec=0; /* No idea about time */ skb->localroute = 0; - save_flags(flags); - cli(); - net_memory += size; + skb->ip_summed = 0; + memset(skb->proto_priv, 0, sizeof(skb->proto_priv)); net_skbcount++; - restore_flags(flags); #if CONFIG_SKB_CHECK skb->magic_debug_cookie = SK_GOOD_SKB; #endif skb->users = 0; + /* Load the data pointers */ + skb->head=bptr; + skb->data=bptr; + skb->tail=bptr; + skb->end=bptr+len; + skb->len=0; + skb->destructor=NULL; + skb->inclone = 0; return skb; } @@ -448,42 +720,35 @@ struct sk_buff *alloc_skb(unsigned int size,int 
priority) * Free an skbuff by memory */ -void kfree_skbmem(struct sk_buff *skb,unsigned size) +static inline void __kfree_skbmem(struct sk_buff *skb) { - unsigned long flags; -#ifdef CONFIG_SLAVE_BALANCING - save_flags(flags); - cli(); - if(skb->in_dev_queue && skb->dev!=NULL) - skb->dev->pkt_queue--; - restore_flags(flags); -#endif -#ifdef CONFIG_SKB_CHECK - IS_SKB(skb); - if(size!=skb->truesize) - printk("kfree_skbmem: size mismatch.\n"); + /* don't do anything if somebody still uses us */ + if (atomic_dec_and_test(&skb->count)) { + kfree(skb->head); + atomic_dec(&net_skbcount); + } +} - if(skb->magic_debug_cookie == SK_GOOD_SKB) - { - save_flags(flags); - cli(); - IS_SKB(skb); - skb->magic_debug_cookie = SK_FREED_SKB; - kfree_s((void *)skb,size); - net_skbcount--; - net_memory -= size; - restore_flags(flags); +void kfree_skbmem(struct sk_buff *skb) +{ + void * addr = skb->head; + + /* don't do anything if somebody still uses us */ + if (atomic_dec_and_test(&skb->count)) { + + int free_head; + + free_head = (skb->inclone != SKB_CLONE_INLINE); + + /* free the skb that contains the actual data if we've clone()'d */ + if (skb->data_skb) { + addr = skb; + __kfree_skbmem(skb->data_skb); + } + if (free_head) + kfree(addr); + atomic_dec(&net_skbcount); } - else - printk("kfree_skbmem: bad magic cookie\n"); -#else - save_flags(flags); - cli(); - kfree_s((void *)skb,size); - net_skbcount--; - net_memory -= size; - restore_flags(flags); -#endif } /* @@ -494,28 +759,92 @@ void kfree_skbmem(struct sk_buff *skb,unsigned size) struct sk_buff *skb_clone(struct sk_buff *skb, int priority) { struct sk_buff *n; + int inbuff = 0; + + IS_SKB(skb); + if (skb_tailroom(skb) >= sizeof(struct sk_buff)) + { + n = ((struct sk_buff *) skb->end) - 1; + skb->end -= sizeof(struct sk_buff); + skb->inclone = SKB_CLONE_ORIG; + inbuff = SKB_CLONE_INLINE; + } + else + { + n = kmalloc(sizeof(*n), priority); + if (!n) + return NULL; + } + memcpy(n, skb, sizeof(*n)); + n->count = 1; + if 
(skb->data_skb) + skb = skb->data_skb; + atomic_inc(&skb->count); + atomic_inc(&net_allocs); + atomic_inc(&net_skbcount); + n->data_skb = skb; + n->next = n->prev = n->link3 = NULL; + n->list = NULL; + n->sk = NULL; + n->free = 1; + n->tries = 0; + n->lock = 0; + n->users = 0; + n->inclone = inbuff; + return n; +} + +/* + * This is slower, and copies the whole data area + */ + +struct sk_buff *skb_copy(struct sk_buff *skb, int priority) +{ + struct sk_buff *n; unsigned long offset; - n=alloc_skb(skb->mem_len-sizeof(struct sk_buff),priority); + /* + * Allocate the copy buffer + */ + + IS_SKB(skb); + + n=alloc_skb(skb->end - skb->head, priority); if(n==NULL) return NULL; - offset=((char *)n)-((char *)skb); - - memcpy(n->data,skb->data,skb->mem_len-sizeof(struct sk_buff)); - n->len=skb->len; + /* + * Shift between the two data areas in bytes + */ + + offset=n->head-skb->head; + + /* Set the data pointer */ + skb_reserve(n,skb->data-skb->head); + /* Set the tail pointer and length */ + skb_put(n,skb->len); + /* Copy the bytes */ + memcpy(n->head,skb->head,skb->end-skb->head); n->link3=NULL; + n->list=NULL; n->sk=NULL; n->when=skb->when; n->dev=skb->dev; n->h.raw=skb->h.raw+offset; + n->mac.raw=skb->mac.raw+offset; n->ip_hdr=(struct iphdr *)(((char *)skb->ip_hdr)+offset); - n->fraglen=skb->fraglen; - n->fraglist=skb->fraglist; +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + n->ipv6_hdr=(struct ipv6hdr *)(((char *)skb->ipv6_hdr)+offset); + n->nexthop = skb->nexthop; +#endif n->saddr=skb->saddr; n->daddr=skb->daddr; n->raddr=skb->raddr; + n->seq=skb->seq; + n->end_seq=skb->end_seq; + n->ack_seq=skb->ack_seq; n->acked=skb->acked; + memcpy(n->proto_priv, skb->proto_priv, sizeof(skb->proto_priv)); n->used=skb->used; n->free=1; n->arp=skb->arp; @@ -524,10 +853,11 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int priority) n->users=0; n->pkt_type=skb->pkt_type; n->stamp=skb->stamp; + + IS_SKB(n); return n; } - /* * Skbuff device locking */ @@ -535,7 +865,8 @@ 
struct sk_buff *skb_clone(struct sk_buff *skb, int priority) void skb_device_lock(struct sk_buff *skb) { if(skb->lock) - printk("double lock on device queue!\n"); + printk("double lock on device queue, lock=%d caller=%p\n", + skb->lock, (&skb)[-1]); else net_locked++; skb->lock++; @@ -556,10 +887,12 @@ void dev_kfree_skb(struct sk_buff *skb, int mode) save_flags(flags); cli(); - if(skb->lock==1) + if(skb->lock) + { net_locked--; - - if (!--skb->lock && (skb->free == 1 || skb->free == 3)) + skb->lock--; + } + if (!skb->lock && (skb->free == 1 || skb->free == 3)) { restore_flags(flags); kfree_skb(skb,mode); @@ -568,8 +901,17 @@ void dev_kfree_skb(struct sk_buff *skb, int mode) restore_flags(flags); } +struct sk_buff *dev_alloc_skb(unsigned int length) +{ + struct sk_buff *skb; + + skb = alloc_skb(length+16, GFP_ATOMIC); + if (skb) + skb_reserve(skb,16); + return skb; +} + int skb_device_locked(struct sk_buff *skb) { return skb->lock? 1 : 0; } - diff --git a/net/core/sock.c b/net/core/sock.c index a95586cbc..28c3eb897 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3,7 +3,7 @@ * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * - * Generic socket support routines. Memory allocators, sk->inuse/release + * Generic socket support routines. Memory allocators, socket lock/release * handler for protocols to use and generic option handler. * * @@ -64,6 +64,13 @@ * Alan Cox : Make SO_DEBUG superuser only. * Alan Cox : Allow anyone to clear SO_DEBUG * (compatibility fix) + * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. + * Alan Cox : Allocator for a socket is settable. + * Alan Cox : SO_ERROR includes soft errors. + * Alan Cox : Allow NULL arguments on some SO_ opts + * Alan Cox : Generic socket allocation to make hooks + * easier (suggested by Craig Metz). 
+ * Michael Pall : SO_ERROR returns positive errno again * * To Fix: * @@ -90,7 +97,7 @@ #include <linux/mm.h> #include <linux/interrupt.h> -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <linux/inet.h> @@ -115,87 +122,116 @@ */ int sock_setsockopt(struct sock *sk, int level, int optname, - char *optval, int optlen) + char *optval, int optlen) { int val; int valbool; int err; struct linger ling; + int ret = 0; + /* + * Options without arguments + */ + +#ifdef SO_DONTLINGER /* Compatibility item... */ + switch(optname) + { + case SO_DONTLINGER: + sk->linger=0; + return 0; + } +#endif + if (optval == NULL) return(-EINVAL); - - err=verify_area(VERIFY_READ, optval, sizeof(int)); - if(err) - return err; - val = get_fs_long((unsigned long *)optval); + err = get_user(val, (int *)optval); + if (err) + return err; + valbool = val?1:0; switch(optname) { case SO_DEBUG: if(val && !suser()) - return(-EPERM); - sk->debug=valbool; - return 0; + { + ret = -EPERM; + } + else + sk->debug=valbool; + break; case SO_REUSEADDR: sk->reuse = valbool; - return(0); + break; case SO_TYPE: case SO_ERROR: - return(-ENOPROTOOPT); + ret = -ENOPROTOOPT; + break; case SO_DONTROUTE: sk->localroute=valbool; - return 0; + break; case SO_BROADCAST: sk->broadcast=valbool; - return 0; + break; case SO_SNDBUF: - if(val>32767) - val=32767; - if(val<256) - val=256; - sk->sndbuf=val; - return 0; + if(val > SK_WMEM_MAX*2) + val = SK_WMEM_MAX*2; + if(val < 256) + val = 256; + if(val > 65535) + val = 65535; + sk->sndbuf = val; + break; case SO_RCVBUF: - if(val>32767) - val=32767; - if(val<256) - val=256; - sk->rcvbuf=val; - return(0); + if(val > SK_RMEM_MAX*2) + val = SK_RMEM_MAX*2; + if(val < 256) + val = 256; + if(val > 65535) + val = 65535; + sk->rcvbuf = val; + break; case SO_KEEPALIVE: +#ifdef CONFIG_INET + if (sk->protocol == IPPROTO_TCP) + { + tcp_set_keepalive(sk, valbool); + } +#endif sk->keepopen = valbool; - return(0); + break; case SO_OOBINLINE: sk->urginline = 
valbool; - return(0); + break; case SO_NO_CHECK: sk->no_check = valbool; - return(0); + break; case SO_PRIORITY: if (val >= 0 && val < DEV_NUMBUFFS) { sk->priority = val; } - else + else { return(-EINVAL); } - return(0); + break; case SO_LINGER: - err=verify_area(VERIFY_READ,optval,sizeof(ling)); - if(err) - return err; - memcpy_fromfs(&ling,optval,sizeof(ling)); + err = copy_from_user(&ling,optval,sizeof(ling)); + if (err) + { + ret = -EFAULT; + break; + } if(ling.l_onoff==0) sk->linger=0; else @@ -203,12 +239,18 @@ int sock_setsockopt(struct sock *sk, int level, int optname, sk->lingertime=ling.l_linger; sk->linger=1; } - return 0; - + break; + case SO_BSDCOMPAT: + sk->bsdism = valbool; + break; + + /* We implementation the SO_SNDLOWAT etc to + not be settable (1003.1g 5.3) */ default: return(-ENOPROTOOPT); } + return ret; } @@ -254,8 +296,9 @@ int sock_getsockopt(struct sock *sk, int level, int optname, break; case SO_ERROR: - val = sk->err; - sk->err = 0; + val = -sock_error(sk); + if(val==0) + val=xchg(&sk->err_soft,0); break; case SO_OOBINLINE: @@ -271,80 +314,81 @@ int sock_getsockopt(struct sock *sk, int level, int optname, break; case SO_LINGER: - err=verify_area(VERIFY_WRITE,optval,sizeof(ling)); - if(err) - return err; - err=verify_area(VERIFY_WRITE,optlen,sizeof(int)); - if(err) - return err; - put_fs_long(sizeof(ling),(unsigned long *)optlen); - ling.l_onoff=sk->linger; - ling.l_linger=sk->lingertime; - memcpy_tofs(optval,&ling,sizeof(ling)); - return 0; + err = put_user(sizeof(ling), optlen); + if (!err) { + ling.l_onoff=sk->linger; + ling.l_linger=sk->lingertime; + err = copy_to_user(optval,&ling,sizeof(ling)); + if (err) + err = -EFAULT; + } + return err; - + case SO_BSDCOMPAT: + val = sk->bsdism; + break; + + case SO_RCVTIMEO: + case SO_SNDTIMEO: + { + static struct timeval tm={0,0}; + return copy_to_user(optval,&tm,sizeof(tm)); + } + case SO_RCVLOWAT: + case SO_SNDLOWAT: + val=1; default: return(-ENOPROTOOPT); } - err=verify_area(VERIFY_WRITE, 
optlen, sizeof(int)); - if(err) - return err; - put_fs_long(sizeof(int),(unsigned long *) optlen); + err = put_user(sizeof(int), optlen); + if (!err) + err = put_user(val,(unsigned int *)optval); + + return err; +} - err=verify_area(VERIFY_WRITE, optval, sizeof(int)); - if(err) - return err; - put_fs_long(val,(unsigned long *)optval); +struct sock *sk_alloc(int priority) +{ + struct sock *sk=(struct sock *)kmalloc(sizeof(*sk), priority); + if(!sk) + return NULL; + memset(sk, 0, sizeof(*sk)); + return sk; +} - return(0); +void sk_free(struct sock *sk) +{ + kfree_s(sk,sizeof(*sk)); } struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority) { - if (sk) - { - if (sk->wmem_alloc + size < sk->sndbuf || force) - { - struct sk_buff * c = alloc_skb(size, priority); - if (c) - { - unsigned long flags; - save_flags(flags); - cli(); - sk->wmem_alloc+= c->mem_len; - restore_flags(flags); /* was sti(); */ - } - return c; + if (sk) { + if (force || sk->wmem_alloc < sk->sndbuf) { + struct sk_buff * skb = alloc_skb(size, priority); + if (skb) + atomic_add(skb->truesize, &sk->wmem_alloc); + return skb; } - return(NULL); + return NULL; } - return(alloc_skb(size, priority)); + return alloc_skb(size, priority); } - struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority) { - if (sk) - { - if (sk->rmem_alloc + size < sk->rcvbuf || force) - { - struct sk_buff *c = alloc_skb(size, priority); - if (c) - { - unsigned long flags; - save_flags(flags); - cli(); - sk->rmem_alloc += c->mem_len; - restore_flags(flags); /* was sti(); */ - } - return(c); + if (sk) { + if (force || sk->rmem_alloc < sk->rcvbuf) { + struct sk_buff *skb = alloc_skb(size, priority); + if (skb) + atomic_add(skb->truesize, &sk->rmem_alloc); + return skb; } - return(NULL); + return NULL; } - return(alloc_skb(size, priority)); + return alloc_skb(size, priority); } @@ -373,45 +417,38 @@ unsigned long sock_wspace(struct sock *sk) return(0); if (sk->wmem_alloc 
>= sk->sndbuf) return(0); - return(sk->sndbuf-sk->wmem_alloc ); + return sk->sndbuf - sk->wmem_alloc; } return(0); } -void sock_wfree(struct sock *sk, struct sk_buff *skb, unsigned long size) +void sock_wfree(struct sock *sk, struct sk_buff *skb) { -#ifdef CONFIG_SKB_CHECK + int s=skb->truesize; +#if CONFIG_SKB_CHECK IS_SKB(skb); #endif - kfree_skbmem(skb, size); + kfree_skbmem(skb); if (sk) { - unsigned long flags; - save_flags(flags); - cli(); - sk->wmem_alloc -= size; - restore_flags(flags); /* In case it might be waiting for more memory. */ sk->write_space(sk); - return; + atomic_sub(s, &sk->wmem_alloc); } } -void sock_rfree(struct sock *sk, struct sk_buff *skb, unsigned long size) +void sock_rfree(struct sock *sk, struct sk_buff *skb) { -#ifdef CONFIG_SKB_CHECK + int s=skb->truesize; +#if CONFIG_SKB_CHECK IS_SKB(skb); #endif - kfree_skbmem(skb, size); + kfree_skbmem(skb); if (sk) { - unsigned long flags; - save_flags(flags); - cli(); - sk->rmem_alloc -= size; - restore_flags(flags); + atomic_sub(s, &sk->rmem_alloc); } } @@ -419,13 +456,11 @@ void sock_rfree(struct sock *sk, struct sk_buff *skb, unsigned long size) * Generic send/receive buffer handlers */ -struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode) +struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, unsigned long fallback, int noblock, int *errcode) { struct sk_buff *skb; int err; - sk->inuse=1; - do { if(sk->err!=0) @@ -444,8 +479,21 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int nob return NULL; } - skb = sock_wmalloc(sk, size, 0, GFP_KERNEL); + if(!fallback) + skb = sock_wmalloc(sk, size, 0, sk->allocation); + else + { + /* The buffer get won't block, or use the atomic queue. It does + produce annoying no free page messages still.... 
*/ + skb = sock_wmalloc(sk, size, 0 , GFP_BUFFER); + if(!skb) + skb=sock_wmalloc(sk, fallback, 0, GFP_KERNEL); + } + /* + * This means we have too many buffers for this socket already. + */ + if(skb==NULL) { unsigned long tmp; @@ -470,7 +518,19 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int nob return NULL; } +#if 1 if( tmp <= sk->wmem_alloc) +#else + /* ANK: Line above seems either incorrect + * or useless. sk->wmem_alloc has a tiny chance to change + * between tmp = sk->w... and cli(), + * but it might(?) change earlier. In real life + * it does not (I never seen the message). + * In any case I'd delete this check at all, or + * change it to: + */ + if (sk->wmem_alloc + size >= sk->sndbuf) +#endif { sk->socket->flags &= ~SO_NOSPACE; interruptible_sleep_on(sk->sleep); @@ -490,53 +550,19 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int nob } -void release_sock(struct sock *sk) +void __release_sock(struct sock *sk) { - unsigned long flags; #ifdef CONFIG_INET - struct sk_buff *skb; -#endif - - if (!sk->prot) - return; - /* - * Make the backlog atomic. If we don't do this there is a tiny - * window where a packet may arrive between the sk->blog being - * tested and then set with sk->inuse still 0 causing an extra - * unwanted re-entry into release_sock(). - */ - - save_flags(flags); - cli(); - if (sk->blog) - { - restore_flags(flags); + if (!sk->prot || !sk->backlog_rcv) return; - } - sk->blog=1; - sk->inuse = 1; - restore_flags(flags); -#ifdef CONFIG_INET + /* See if we have any packets built up. */ - while((skb = skb_dequeue(&sk->back_log)) != NULL) - { - sk->blog = 1; - if (sk->prot->rcv) - sk->prot->rcv(skb, skb->dev, sk->opt, - skb->saddr, skb->len, skb->daddr, 1, - /* Only used for/by raw sockets. 
*/ - (struct inet_protocol *)sk->pair); - } -#endif - sk->blog = 0; - sk->inuse = 0; -#ifdef CONFIG_INET - if (sk->dead && sk->state == TCP_CLOSE) - { - /* Should be about 2 rtt's */ - reset_timer(sk, TIME_DONE, min(sk->rtt * 2, TCP_DONE_TIME)); + start_bh_atomic(); + while (!skb_queue_empty(&sk->back_log)) { + struct sk_buff * skb = sk->back_log.next; + __skb_unlink(skb, &sk->back_log); + sk->backlog_rcv(sk, skb); } + end_bh_atomic(); #endif } - - diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c new file mode 100644 index 000000000..8b5848e6b --- /dev/null +++ b/net/core/sysctl_net_core.c @@ -0,0 +1,13 @@ +/* -*- linux-c -*- + * sysctl_net_core.c: sysctl interface to net core subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/core directory entry (empty =) ). [MS] + */ + +#include <linux/mm.h> +#include <linux/sysctl.h> + +ctl_table core_table[] = { + {0} +}; diff --git a/net/decnet/README b/net/decnet/README new file mode 100644 index 000000000..96816c47c --- /dev/null +++ b/net/decnet/README @@ -0,0 +1,6 @@ +Yes.. it's being worked on. + +If you want to get involved email me <Alan.Cox@linux.org> and I'll put you +in touch with the people doing the work. + +Alan diff --git a/net/ethernet/Makefile b/net/ethernet/Makefile index a298cb88a..a1e61cdda 100644 --- a/net/ethernet/Makefile +++ b/net/ethernet/Makefile @@ -7,51 +7,23 @@ # # Note 2! The CFLAGS definition is now in the main makefile... -.c.o: - $(CC) $(CFLAGS) -c $< -.s.o: - $(AS) -o $*.o $< -.c.s: - $(CC) $(CFLAGS) -S $< +O_TARGET := ethernet.o - -OBJS := eth.o +OBJS := eth.o sysctl_net_ether.o ifdef CONFIG_IPX - OBJ2 := pe2.o - endif ifdef CONFIG_ATALK - OBJ2 := pe2.o - endif -OBJS := $(OBJS) $(OBJ2) - ifdef CONFIG_NET - -ethernet.o: $(OBJS) - $(LD) -r -o ethernet.o $(OBJS) - -else - -ethernet.o: - $(AR) rcs ethernet.o - +O_OBJS := $(OBJS) $(OBJ2) endif -dep: - $(CPP) -M *.c > .depend +include $(TOPDIR)/Rules.make tar: - tar -cvf /dev/f1 . 
- -# -# include a dependency file if one exists -# -ifeq (.depend,$(wildcard .depend)) -include .depend -endif + tar -cvf /dev/f1 . diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index cf6ef5328..4872fd2b5 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -24,14 +24,19 @@ * and changes for new arp and skbuff. * Alan Cox : Redid header building to reflect new format. * Alan Cox : ARP only when compiled with CONFIG_INET - * Greg Page : 802.2 and SNAP stuff + * Greg Page : 802.2 and SNAP stuff. + * Alan Cox : MAC layer pointers/new format. + * Paul Gortmaker : eth_copy_and_sum shouldn't csum padding. + * Alan Cox : Protect against forwarding explosions with + * older network drivers and IFF_ALLMULTI. + * Christer Weinigel : Better rebuild header message. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
*/ -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <linux/types.h> #include <linux/kernel.h> @@ -48,6 +53,20 @@ #include <linux/config.h> #include <net/arp.h> #include <net/sock.h> +#include <net/ipv6.h> + +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) +#include <linux/in6.h> +#include <net/ndisc.h> +#endif + +#include <asm/checksum.h> + + +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) +int (*ndisc_eth_hook) (unsigned char *, struct device *, + struct sk_buff *) = NULL; +#endif void eth_setup(char *str, int *ints) { @@ -81,11 +100,10 @@ void eth_setup(char *str, int *ints) * daddr=NULL means leave destination address (eg unresolved arp) */ -int eth_header(unsigned char *buff, struct device *dev, unsigned short type, - void *daddr, void *saddr, unsigned len, - struct sk_buff *skb) +int eth_header(struct sk_buff *skb, struct device *dev, unsigned short type, + void *daddr, void *saddr, unsigned len) { - struct ethhdr *eth = (struct ethhdr *)buff; + struct ethhdr *eth = (struct ethhdr *)skb_push(skb,ETH_HLEN); /* * Set the protocol type. For a packet of type ETH_P_802_3 we put the length @@ -138,24 +156,43 @@ int eth_rebuild_header(void *buff, struct device *dev, unsigned long dst, struct ethhdr *eth = (struct ethhdr *)buff; /* - * Only ARP/IP is currently supported + * Only ARP/IP and NDISC/IPv6 are currently supported */ - - if(eth->h_proto != htons(ETH_P_IP)) + + switch (eth->h_proto) { - printk("eth_rebuild_header: Don't know how to resolve type %d addresses?\n",(int)eth->h_proto); +#ifdef CONFIG_INET + case __constant_htons(ETH_P_IP): + + /* + * Try to get ARP to resolve the header. + */ + + return (arp_find(eth->h_dest, dst, dev, dev->pa_addr, skb) ? 
+ 1 : 0); + break; +#endif + +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + case __constant_htons(ETH_P_IPV6): +#ifdef CONFIG_IPV6 + return (ndisc_eth_resolv(eth->h_dest, dev, skb)); +#else + if (ndisc_eth_hook) + return (ndisc_eth_hook(eth->h_dest, dev, skb)); +#endif +#endif + default: + printk(KERN_DEBUG + "%s: unable to resolve type %X addresses.\n", + dev->name, (int)eth->h_proto); + memcpy(eth->h_source, dev->dev_addr, dev->addr_len); return 0; + break; } - /* - * Try and get ARP to resolve the header. - */ -#ifdef CONFIG_INET - return arp_find(eth->h_dest, dst, dev, dev->pa_addr, skb)? 1 : 0; -#else return 0; -#endif } @@ -167,9 +204,13 @@ int eth_rebuild_header(void *buff, struct device *dev, unsigned long dst, unsigned short eth_type_trans(struct sk_buff *skb, struct device *dev) { - struct ethhdr *eth = (struct ethhdr *) skb->data; + struct ethhdr *eth; unsigned char *rawp; + skb->mac.raw=skb->data; + skb_pull(skb,dev->hard_header_len); + eth= skb->mac.ethernet; + if(*eth->h_dest&1) { if(memcmp(eth->h_dest,dev->broadcast, ETH_ALEN)==0) @@ -178,7 +219,12 @@ unsigned short eth_type_trans(struct sk_buff *skb, struct device *dev) skb->pkt_type=PACKET_MULTICAST; } - else if(dev->flags&IFF_PROMISC) + /* + * This ALLMULTI check should be redundant by 1.4 + * so don't forget to remove it. + */ + + else if(dev->flags&(IFF_PROMISC|IFF_ALLMULTI)) { if(memcmp(eth->h_dest,dev->dev_addr, ETH_ALEN)) skb->pkt_type=PACKET_OTHERHOST; @@ -187,31 +233,100 @@ unsigned short eth_type_trans(struct sk_buff *skb, struct device *dev) if (ntohs(eth->h_proto) >= 1536) return eth->h_proto; - rawp = (unsigned char *)(eth + 1); + rawp = skb->data; + /* + * This is a magic hack to spot IPX packets. Older Novell breaks + * the protocol design and runs IPX over 802.3 without an 802.2 LLC + * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This + * won't work for fault tolerant netware but does for the rest. 
+ */ if (*(unsigned short *)rawp == 0xFFFF) return htons(ETH_P_802_3); + /* + * Real 802.2 LLC + */ return htons(ETH_P_802_2); } /* - * Header caching for ethernet. Try to find and cache a header to avoid arp overhead. + * Upper level calls this function to bind hardware header cache entry. + * If the call is successful, then corresponding Address Resolution Protocol + * (maybe, not ARP) takes responsibility for updating cache content. */ - -void eth_header_cache(struct device *dev, struct sock *sk, unsigned long saddr, unsigned long daddr) + +void eth_header_cache_bind(struct hh_cache ** hhp, struct device *dev, + unsigned short htype, __u32 daddr) { - int v=arp_find_cache(sk->ip_hcache_data, daddr, dev); - if(v!=1) - sk->ip_hcache_state=0; /* Try when arp resolves */ - else + struct hh_cache *hh; + + if (htype != ETH_P_IP) { - memcpy(sk->ip_hcache_data+6, dev->dev_addr, ETH_ALEN); - sk->ip_hcache_data[12]=ETH_P_IP>>8; - sk->ip_hcache_data[13]=ETH_P_IP&0xFF; - sk->ip_hcache_state=1; - sk->ip_hcache_stamp=arp_cache_stamp; - sk->ip_hcache_ver=&arp_cache_stamp; + printk(KERN_DEBUG "eth_header_cache_bind: %04x cache is not implemented\n", htype); + return; + } + if (arp_bind_cache(hhp, dev, htype, daddr)) + return; + if ((hh=*hhp) != NULL) + { + memcpy(hh->hh_data+6, dev->dev_addr, ETH_ALEN); + hh->hh_data[12] = htype>>8; + hh->hh_data[13] = htype&0xFF; + } +} + +/* + * Called by Address Resolution module to notify changes in address. 
+ */ + +void eth_header_cache_update(struct hh_cache *hh, struct device *dev, unsigned char * haddr) +{ + if (hh->hh_type != ETH_P_IP) + { + printk(KERN_DEBUG "eth_header_cache_update: %04x cache is not implemented\n", hh->hh_type); + return; } + memcpy(hh->hh_data, haddr, ETH_ALEN); + hh->hh_uptodate = 1; } +/* + * Copy from an ethernet device memory space to an sk_buff while checksumming if IP + */ + +void eth_copy_and_sum(struct sk_buff *dest, unsigned char *src, int length, int base) +{ +#ifdef CONFIG_IP_ROUTER + memcpy(dest->data,src,length); +#else + struct ethhdr *eth; + struct iphdr *iph; + int ip_length; + + IS_SKB(dest); + eth=(struct ethhdr *)src; + if(eth->h_proto!=htons(ETH_P_IP)) + { + memcpy(dest->data,src,length); + return; + } + /* + * We have to watch for padded packets. The csum doesn't include the + * padding, and there is no point in copying the padding anyway. + * We have to use the smaller of length and ip_length because it + * can happen that ip_length > length. + */ + memcpy(dest->data,src,sizeof(struct iphdr)+ETH_HLEN); /* ethernet is always >= 34 */ + length -= sizeof(struct iphdr) + ETH_HLEN; + iph=(struct iphdr*)(src+ETH_HLEN); + ip_length = ntohs(iph->tot_len) - sizeof(struct iphdr); + + /* Also watch out for bogons - min IP size is 8 (rfc-1042) */ + if ((ip_length <= length) && (ip_length > 7)) + length=ip_length; + + dest->csum=csum_partial_copy(src+sizeof(struct iphdr)+ETH_HLEN,dest->data+sizeof(struct iphdr)+ETH_HLEN,length,base); + dest->ip_summed=1; +#endif +} diff --git a/net/ethernet/pe2.c b/net/ethernet/pe2.c index 7cb40d12f..812d35864 100644 --- a/net/ethernet/pe2.c +++ b/net/ethernet/pe2.c @@ -9,12 +9,9 @@ pEII_datalink_header(struct datalink_proto *dl, struct sk_buff *skb, unsigned char *dest_node) { struct device *dev = skb->dev; - unsigned long len = skb->len; - unsigned long hard_len = dev->hard_header_len; - dev->hard_header(skb->data, dev, ETH_P_IPX, - dest_node, NULL, len - hard_len, skb); - skb->h.raw = skb->data + 
hard_len; + skb->protocol = htons (ETH_P_IPX); + dev->hard_header(skb, dev, ETH_P_IPX, dest_node, NULL, skb->len); } struct datalink_proto * @@ -33,3 +30,8 @@ make_EII_client(void) return proto; } +void destroy_EII_client(struct datalink_proto *dl) +{ + if (dl) + kfree_s(dl, sizeof(struct datalink_proto)); +} diff --git a/net/ethernet/sysctl_net_ether.c b/net/ethernet/sysctl_net_ether.c new file mode 100644 index 000000000..b81a6d532 --- /dev/null +++ b/net/ethernet/sysctl_net_ether.c @@ -0,0 +1,13 @@ +/* -*- linux-c -*- + * sysctl_net_ether.c: sysctl interface to net Ethernet subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/ether directory entry (empty =) ). [MS] + */ + +#include <linux/mm.h> +#include <linux/sysctl.h> + +ctl_table ether_table[] = { + {0} +}; diff --git a/net/ipv4/Config.in b/net/ipv4/Config.in new file mode 100644 index 000000000..25596cc4f --- /dev/null +++ b/net/ipv4/Config.in @@ -0,0 +1,47 @@ +# +# IP configuration +# +bool 'IP: forwarding/gatewaying' CONFIG_IP_FORWARD +bool 'IP: multicasting' CONFIG_IP_MULTICAST +if [ "$CONFIG_FIREWALL" = "y" ]; then + bool 'IP: firewalling' CONFIG_IP_FIREWALL + if [ "$CONFIG_IP_FIREWALL" = "y" ]; then + if [ "$CONFIG_NETLINK" = "y" ]; then + bool 'IP: firewall packet netlink device' CONFIG_IP_FIREWALL_NETLINK + fi + bool 'IP: firewall packet logging' CONFIG_IP_FIREWALL_VERBOSE + if [ "$CONFIG_EXPERIMENTAL" = "y" -a "$CONFIG_IP_FORWARD" = "y" ]; then + bool 'IP: masquerading (EXPERIMENTAL)' CONFIG_IP_MASQUERADE + if [ "$CONFIG_IP_MASQUERADE" != "n" ]; then + comment 'Protocol-specific masquerading support will be built as modules.' 
+ fi + bool 'IP: transparent proxy support (EXPERIMENTAL)' CONFIG_IP_TRANSPARENT_PROXY + bool 'IP: always defragment' CONFIG_IP_ALWAYS_DEFRAG + fi + fi +fi +bool 'IP: accounting' CONFIG_IP_ACCT +if [ "$CONFIG_IP_FORWARD" = "y" ]; then + bool 'IP: optimize as router not host' CONFIG_IP_ROUTER + tristate 'IP: tunneling' CONFIG_NET_IPIP + if [ "$CONFIG_IP_MULTICAST" = "y" ]; then + if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + bool 'IP: multicast routing (EXPERIMENTAL)' CONFIG_IP_MROUTE + fi + fi +fi +if [ "$CONFIG_NET_ALIAS" = "y" ]; then + tristate 'IP: aliasing support' CONFIG_IP_ALIAS +fi +if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + if [ "$CONFIG_NETLINK" = "y" ]; then + bool 'IP: ARP daemon support (EXPERIMENTAL)' CONFIG_ARPD + fi +fi +comment '(it is safe to leave these untouched)' +bool 'IP: PC/TCP compatibility mode' CONFIG_INET_PCTCP +tristate 'IP: Reverse ARP' CONFIG_INET_RARP +bool 'IP: Disable Path MTU Discovery (normally enabled)' CONFIG_NO_PATH_MTU_DISCOVERY +#bool 'IP: Disable NAGLE algorithm (normally enabled)' CONFIG_TCP_NAGLE_OFF +bool 'IP: Drop source routed frames' CONFIG_IP_NOSR +bool 'IP: Allow large windows (not recommended if <16Mb of memory)' CONFIG_SKB_LARGE diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 296c4d114..2ca338c04 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -7,45 +7,55 @@ # # Note 2! The CFLAGS definition is now in the main makefile... 
-.c.o: - $(CC) $(CFLAGS) -c $< -.s.o: - $(AS) -o $*.o $< -.c.s: - $(CC) $(CFLAGS) -S $< - - -OBJS := utils.o route.o proc.o timer.o protocol.o packet.o \ - arp.o ip.o raw.o icmp.o tcp.o udp.o devinet.o af_inet.o \ - igmp.o ip_fw.o checksum.o ipip.o - -ifdef CONFIG_INET_RARP - -OBJS := $(OBJS) rarp.o +O_TARGET := ipv4.o +IPV4_OBJS := utils.o route.o proc.o timer.o protocol.o packet.o \ + ip_input.o ip_fragment.o ip_forward.o ip_options.o \ + ip_output.o ip_sockglue.o \ + tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o\ + raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o ip_fw.o \ + sysctl_net_ipv4.o + +MOD_LIST_NAME := IPV4_MODULES +M_OBJS := + +ifeq ($(CONFIG_IP_MROUTE),y) +IPV4_OBJS += ipmr.o +endif +ifeq ($(CONFIG_INET_RARP),y) +IPV4_OBJS += rarp.o +else + ifeq ($(CONFIG_INET_RARP),m) + M_OBJS += rarp.o + endif endif -ifdef CONFIG_INET +ifeq ($(CONFIG_NET_IPIP),y) +IPV4_OBJS += ipip.o +else + ifeq ($(CONFIG_NET_IPIP),m) + M_OBJS += ipip.o + endif +endif -ipv4.o: $(OBJS) - $(LD) -r -o ipv4.o $(OBJS) +ifeq ($(CONFIG_IP_MASQUERADE),y) +IPV4_OBJS += ip_masq.o ip_masq_app.o +M_OBJS += ip_masq_ftp.o ip_masq_irc.o ip_masq_raudio.o +endif +ifeq ($(CONFIG_IP_ALIAS),y) +IPV4_OBJS += ip_alias.o else + ifeq ($(CONFIG_IP_ALIAS),m) + M_OBJS += ip_alias.o + endif +endif -ipv4.o: - $(AR) rcs ipv4.o - +ifdef CONFIG_INET +O_OBJS := $(IPV4_OBJS) endif -dep: - $(CPP) -M *.c > .depend +include $(TOPDIR)/Rules.make tar: tar -cvf /dev/f1 . - -# -# include a dependency file if one exists -# -ifeq (.depend,$(wildcard .depend)) -include .depend -endif diff --git a/net/ipv4/README.TCP b/net/ipv4/README.TCP deleted file mode 100644 index f18963f88..000000000 --- a/net/ipv4/README.TCP +++ /dev/null @@ -1,39 +0,0 @@ -How the new TCP output machine [nyi] works. - - -Data is kept on a single queue. The skb->users flag tells us if the frame is -one that has been queued already. To add a frame we throw it on the end. Ack -walks down the list from the start. 
- -We keep a set of control flags - - - sk->tcp_pend_event - - TCP_PEND_ACK Ack needed - TCP_ACK_NOW Needed now - TCP_WINDOW Window update check - TCP_WINZERO Zero probing - - - sk->transmit_queue The transmission frame begin - sk->transmit_new First new frame pointer - sk->transmit_end Where to add frames - - sk->tcp_last_tx_ack Last ack seen - sk->tcp_dup_ack Dup ack count for fast retransmit - - -Frames are queued for output by tcp_write. We do our best to send the frames -off immediately if possible, but otherwise queue and compute the body -checksum in the copy. - -When a write is done we try to clear any pending events and piggy back them. -If the window is full we queue full sized frames. On the firs timeout in -zero window we split this. - -On a timer we walk the retransmit list to send any retransmits, update the -backoff timers etc. A change of route table stamp causes a change of header -and recompute. We add any new tcp level headers and refinish the checksum -before sending. - diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index fbfc44bb2..34379849d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -31,11 +31,25 @@ * Alan Cox : bind() shouldn't abort existing but dead * sockets. Stops FTP netin:.. I hope. * Alan Cox : bind() works correctly for RAW sockets. Note - * that FreeBSD at least is broken in this respect + * that FreeBSD at least was broken in this respect * so be careful with compatibility tests... * Alan Cox : routing cache support * Alan Cox : memzero the socket structure for compactness. * Matt Day : nonblock connect error handler + * Alan Cox : Allow large numbers of pending sockets + * (eg for big web sites), but only if + * specifically application requested. + * Alan Cox : New buffering throughout IP. Used dumbly. + * Alan Cox : New buffering now used smartly. + * Alan Cox : BSD rather than common sense interpretation of + * listen. + * Germano Caronni : Assorted small races. 
+ * Alan Cox : sendmsg/recvmsg basic support. + * Alan Cox : Only sendmsg/recvmsg now supported. + * Alan Cox : Locked down bind (see security list). + * Alan Cox : Loosened bind a little. + * Mike McLagan : ADD/DEL DLCI Ioctls + * Willy Konynenberg : Transparent proxying support. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -58,8 +72,10 @@ #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <linux/inet.h> @@ -76,11 +92,42 @@ #include <net/raw.h> #include <net/icmp.h> #include <linux/ip_fw.h> +#ifdef CONFIG_IP_MASQUERADE +#include <net/ip_masq.h> +#endif +#ifdef CONFIG_IP_ALIAS +#include <net/ip_alias.h> +#endif +#ifdef CONFIG_BRIDGE +#include <net/br.h> +#endif +#ifdef CONFIG_KERNELD +#include <linux/kerneld.h> +#endif #define min(a,b) ((a)<(b)?(a):(b)) extern struct proto packet_prot; +extern int raw_get_info(char *, char **, off_t, int, int); +extern int snmp_get_info(char *, char **, off_t, int, int); +extern int afinet_get_info(char *, char **, off_t, int, int); +extern int tcp_get_info(char *, char **, off_t, int, int); +extern int udp_get_info(char *, char **, off_t, int, int); + + +struct sock * tcp_sock_array[SOCK_ARRAY_SIZE]; +struct sock * udp_sock_array[SOCK_ARRAY_SIZE]; +struct sock * raw_sock_array[SOCK_ARRAY_SIZE]; + +#ifdef CONFIG_DLCI +extern int dlci_ioctl(unsigned int, void*); +#endif +#ifdef CONFIG_DLCI_MODULE +int (*dlci_ioctl_hook)(unsigned int, void *) = NULL; +#endif + +int (*rarp_ioctl_hook)(unsigned int,void*) = NULL; /* * See if a socket number is in use. 
@@ -119,13 +166,16 @@ unsigned short get_new_socknum(struct proto *prot, unsigned short base) struct sock *sk; if (base == 0) - base = PROT_SOCK+1+(start % 1024); + base = PROT_SOCK+1+(start & 1023); if (base <= PROT_SOCK) { - base += PROT_SOCK+(start % 1024); + base += PROT_SOCK+(start & 1023); } - /* Now look through the entire array and try to find an empty ptr. */ + /* + * Now look through the entire array and try to find an empty ptr. + */ + for(i=0; i < SOCK_ARRAY_SIZE; i++) { j = 0; @@ -137,7 +187,7 @@ unsigned short get_new_socknum(struct proto *prot, unsigned short base) } if (j == 0) { - start =(i+1+start )%1024; + start =(i+1+start )&1023; return(i+base+1); } if (j < size) @@ -160,10 +210,9 @@ unsigned short get_new_socknum(struct proto *prot, unsigned short base) * Add a socket into the socket tables by number. */ -void put_sock(unsigned short num, struct sock *sk) +void inet_put_sock(unsigned short num, struct sock *sk) { - struct sock *sk1; - struct sock *sk2; + struct sock **skp, *tmp; int mask; unsigned long flags; @@ -174,7 +223,10 @@ void put_sock(unsigned short num, struct sock *sk) sk->next = NULL; num = num &(SOCK_ARRAY_SIZE -1); - /* We can't have an interrupt re-enter here. */ + /* + * We can't have an interrupt re-enter here. + */ + save_flags(flags); cli(); @@ -188,40 +240,30 @@ void put_sock(unsigned short num, struct sock *sk) restore_flags(flags); return; } + restore_flags(flags); for(mask = 0xff000000; mask != 0xffffffff; mask = (mask >> 8) | mask) { - if ((mask & sk->saddr) && - (mask & sk->saddr) != (mask & 0xffffffff)) + if ((mask & sk->rcv_saddr) && + (mask & sk->rcv_saddr) != (mask & 0xffffffff)) { mask = mask << 8; break; } } + + /* + * add the socket to the sock_array[].. 
+ */ + skp = sk->prot->sock_array + num; cli(); - sk1 = sk->prot->sock_array[num]; - for(sk2 = sk1; sk2 != NULL; sk2=sk2->next) - { - if (!(sk2->saddr & mask)) - { - if (sk2 == sk1) - { - sk->next = sk->prot->sock_array[num]; - sk->prot->sock_array[num] = sk; - sti(); - return; - } - sk->next = sk2; - sk1->next= sk; - sti(); - return; - } - sk1 = sk2; + while ((tmp = *skp) != NULL) { + if (!(tmp->rcv_saddr & mask)) + break; + skp = &tmp->next; } - - /* Goes at the end. */ - sk->next = NULL; - sk1->next = sk; + sk->next = tmp; + *skp = sk; sti(); } @@ -229,9 +271,9 @@ void put_sock(unsigned short num, struct sock *sk) * Remove a socket from the socket tables. */ -static void remove_sock(struct sock *sk1) +void inet_remove_sock(struct sock *sk1) { - struct sock *sk2; + struct sock **p; unsigned long flags; if (sk1->type==SOCK_PACKET) @@ -246,26 +288,18 @@ static void remove_sock(struct sock *sk1) /* We can't have this changing out from under us. */ save_flags(flags); cli(); - sk2 = sk1->prot->sock_array[sk1->num &(SOCK_ARRAY_SIZE -1)]; - if (sk2 == sk1) - { - sk1->prot->inuse -= 1; - sk1->prot->sock_array[sk1->num &(SOCK_ARRAY_SIZE -1)] = sk1->next; - restore_flags(flags); - return; - } - - while(sk2 && sk2->next != sk1) - { - sk2 = sk2->next; - } - - if (sk2) + + p=&(sk1->prot->sock_array[sk1->num & (SOCK_ARRAY_SIZE -1)]); + + while(*p!=NULL) { - sk1->prot->inuse -= 1; - sk2->next = sk1->next; - restore_flags(flags); - return; + if(*p==sk1) + { + sk1->prot->inuse--; + *p=sk1->next; + break; + } + p=&((*p)->next); } restore_flags(flags); } @@ -278,89 +312,55 @@ void destroy_sock(struct sock *sk) { struct sk_buff *skb; - sk->inuse = 1; /* just to be safe. */ + lock_sock(sk); /* just to be safe. */ - /* In case it's sleeping somewhere. */ - if (!sk->dead) - sk->write_space(sk); - - remove_sock(sk); - /* Now we can no longer get new packets. 
*/ - delete_timer(sk); - /* Nor send them */ - del_timer(&sk->retransmit_timer); - - while ((skb = tcp_dequeue_partial(sk)) != NULL) { - IS_SKB(skb); - kfree_skb(skb, FREE_WRITE); - } + /* + * Now we can no longer get new packets or once the + * timers are killed, send them. + */ + + net_delete_timer(sk); - /* Cleanup up the write buffer. */ - while((skb = skb_dequeue(&sk->write_queue)) != NULL) { - IS_SKB(skb); - kfree_skb(skb, FREE_WRITE); - } + if (sk->prot->destroy) + sk->prot->destroy(sk); /* - * Don't discard received data until the user side kills its - * half of the socket. + * Clean up the read buffer. */ - if (sk->dead) + while((skb=skb_dequeue(&sk->receive_queue))!=NULL) { - while((skb=skb_dequeue(&sk->receive_queue))!=NULL) - { /* * This will take care of closing sockets that were * listening and didn't accept everything. */ - if (skb->sk != NULL && skb->sk != sk) - { - IS_SKB(skb); - skb->sk->dead = 1; - skb->sk->prot->close(skb->sk, 0); - } - IS_SKB(skb); - kfree_skb(skb, FREE_READ); - } - } - - /* Now we need to clean up the send head. */ - cli(); - for(skb = sk->send_head; skb != NULL; ) - { - struct sk_buff *skb2; - - /* - * We need to remove skb from the transmit queue, - * or maybe the arp queue. - */ - if (skb->next && skb->prev) { -/* printk("destroy_sock: unlinked skb\n");*/ + if (skb->sk != NULL && skb->sk != sk) + { IS_SKB(skb); - skb_unlink(skb); + skb->sk->prot->close(skb->sk, 0); } - skb->dev = NULL; - skb2 = skb->link3; - kfree_skb(skb, FREE_WRITE); - skb = skb2; + IS_SKB(skb); + kfree_skb(skb, FREE_READ); } - sk->send_head = NULL; - sti(); - /* And now the backlog. */ + /* + * Now the backlog. + */ + while((skb=skb_dequeue(&sk->back_log))!=NULL) { - /* this should never happen. */ -/* printk("cleaning back_log\n");*/ + /* this should [almost] never happen. */ + skb->sk = NULL; kfree_skb(skb, FREE_READ); } - /* Now if it has a half accepted/ closed socket. */ + /* + * Now if it has a half accepted/ closed socket. 
+ */ + if (sk->pair) { - sk->pair->dead = 1; sk->pair->prot->close(sk->pair, 0); sk->pair = NULL; } @@ -371,18 +371,37 @@ void destroy_sock(struct sock *sk) * everything is gone. */ - if (sk->dead && sk->rmem_alloc == 0 && sk->wmem_alloc == 0) - { - kfree_s((void *)sk,sizeof(*sk)); - } - else - { + if (sk->rmem_alloc == 0 && sk->wmem_alloc == 0) + { + inet_remove_sock(sk); + + if(sk->opt) + kfree(sk->opt); + ip_rt_put(sk->ip_route_cache); + /* + * This one is pure paranoia. I'll take it out + * later once I know the bug is buried. + */ + tcp_cache_zap(); + sk_free(sk); + } + else + { /* this should never happen. */ /* actually it can if an ack has just been sent. */ + /* + * It's more normal than that... + * It can happen because a skb is still in the device queues + * [PR] + */ + + printk("Socket destroy delayed (r=%d w=%d)\n", + sk->rmem_alloc, sk->wmem_alloc); + sk->destroy = 1; sk->ack_backlog = 0; - sk->inuse = 0; - reset_timer(sk, TIME_DESTROY, SOCK_DESTROY_TIME); + release_sock(sk); + net_reset_timer(sk, TIME_DESTROY, SOCK_DESTROY_TIME); } } @@ -392,7 +411,7 @@ void destroy_sock(struct sock *sk) * the work. */ -static int inet_fcntl(struct socket *sock, unsigned int cmd, unsigned long arg) +int inet_fcntl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk; @@ -421,7 +440,7 @@ static int inet_fcntl(struct socket *sock, unsigned int cmd, unsigned long arg) * Set socket options on an inet socket. */ -static int inet_setsockopt(struct socket *sock, int level, int optname, +int inet_setsockopt(struct socket *sock, int level, int optname, char *optval, int optlen) { struct sock *sk = (struct sock *) sock->data; @@ -435,9 +454,13 @@ static int inet_setsockopt(struct socket *sock, int level, int optname, /* * Get a socket option on an AF_INET socket. + * + * FIX: POSIX 1003.1g is very ambiguous here. It states that + * asynchronous errors should be reported by getsockopt. 
We assume + * this means if you specify SO_ERROR (otherwise whats the point of it). */ -static int inet_getsockopt(struct socket *sock, int level, int optname, +int inet_getsockopt(struct socket *sock, int level, int optname, char *optval, int *optlen) { struct sock *sk = (struct sock *) sock->data; @@ -463,7 +486,7 @@ static int inet_autobind(struct sock *sk) return(-EAGAIN); udp_cache_zap(); tcp_cache_zap(); - put_sock(sk->num, sk); + inet_put_sock(sk->num, sk); sk->dummy_th.source = ntohs(sk->num); } return 0; @@ -473,7 +496,7 @@ static int inet_autobind(struct sock *sk) * Move a socket into listening state. */ -static int inet_listen(struct socket *sock, int backlog) +int inet_listen(struct socket *sock, int backlog) { struct sock *sk = (struct sock *) sock->data; @@ -485,9 +508,12 @@ static int inet_listen(struct socket *sock, int backlog) * note that the backlog is "unsigned char", so truncate it * somewhere. We might as well truncate it to what everybody * else does.. + * Now truncate to 128 not 5. */ - if (backlog > 5) - backlog = 5; + if ((unsigned) backlog == 0) /* BSDism */ + backlog = 1; + if ((unsigned) backlog > SOMAXCONN) + backlog = SOMAXCONN; sk->max_ack_backlog = backlog; if (sk->state != TCP_LISTEN) { @@ -519,7 +545,7 @@ static void def_callback2(struct sock *sk,int len) static void def_callback3(struct sock *sk) { - if(!sk->dead) + if(!sk->dead && sk->wmem_alloc*2 <= sk->sndbuf) { wake_up_interruptible(sk->sleep); sock_wake_async(sk->socket, 2); @@ -539,19 +565,20 @@ static int inet_create(struct socket *sock, int protocol) struct proto *prot; int err; - sk = (struct sock *) kmalloc(sizeof(*sk), GFP_KERNEL); + sk = sk_alloc(GFP_KERNEL); if (sk == NULL) return(-ENOBUFS); memset(sk,0,sizeof(*sk)); /* Efficient way to set most fields to zero */ -/* sk->num = 0; - * sk->reuse = 0;*/ + /* + * Note for tcp that also wiped the dummy_th block for us. 
+ */ switch(sock->type) { case SOCK_STREAM: case SOCK_SEQPACKET: if (protocol && protocol != IPPROTO_TCP) { - kfree_s((void *)sk, sizeof(*sk)); + sk_free(sk); return(-EPROTONOSUPPORT); } protocol = IPPROTO_TCP; @@ -562,7 +589,7 @@ static int inet_create(struct socket *sock, int protocol) case SOCK_DGRAM: if (protocol && protocol != IPPROTO_UDP) { - kfree_s((void *)sk, sizeof(*sk)); + sk_free(sk); return(-EPROTONOSUPPORT); } protocol = IPPROTO_UDP; @@ -573,12 +600,12 @@ static int inet_create(struct socket *sock, int protocol) case SOCK_RAW: if (!suser()) { - kfree_s((void *)sk, sizeof(*sk)); + sk_free(sk); return(-EPERM); } if (!protocol) { - kfree_s((void *)sk, sizeof(*sk)); + sk_free(sk); return(-EPROTONOSUPPORT); } prot = &raw_prot; @@ -589,12 +616,12 @@ static int inet_create(struct socket *sock, int protocol) case SOCK_PACKET: if (!suser()) { - kfree_s((void *)sk, sizeof(*sk)); + sk_free(sk); return(-EPERM); } if (!protocol) { - kfree_s((void *)sk, sizeof(*sk)); + sk_free(sk); return(-EPROTONOSUPPORT); } prot = &packet_prot; @@ -603,117 +630,55 @@ static int inet_create(struct socket *sock, int protocol) break; default: - kfree_s((void *)sk, sizeof(*sk)); + sk_free(sk); return(-ESOCKTNOSUPPORT); } sk->socket = sock; #ifdef CONFIG_TCP_NAGLE_OFF sk->nonagle = 1; -#else -/* sk->nonagle = 0;*/ #endif + sk->family = AF_INET; sk->type = sock->type; sk->protocol = protocol; + sk->allocation = GFP_KERNEL; sk->sndbuf = SK_WMEM_MAX; sk->rcvbuf = SK_RMEM_MAX; - sk->rto = TCP_TIMEOUT_INIT; /*TCP_WRITE_TIME*/ - sk->cong_window = 1; /* start with only sending one packet at a time. 
*/ sk->priority = 1; - sk->state = TCP_CLOSE; -#ifdef WHAT_WE_DO_THE_MEMZERO_INSTEAD_OF - sk->stamp.tv_sec=0; - sk->wmem_alloc = 0; - sk->rmem_alloc = 0; - sk->pair = NULL; - sk->opt = NULL; - sk->write_seq = 0; - sk->acked_seq = 0; - sk->copied_seq = 0; - sk->fin_seq = 0; - sk->urg_seq = 0; - sk->urg_data = 0; - sk->proc = 0; - sk->rtt = 0; /*TCP_WRITE_TIME << 3;*/ - sk->mdev = 0; - sk->backoff = 0; - sk->packets_out = 0; - sk->cong_count = 0; - sk->ssthresh = 0; - sk->max_window = 0; - sk->urginline = 0; - sk->intr = 0; - sk->linger = 0; - sk->destroy = 0; - sk->shutdown = 0; - sk->keepopen = 0; - sk->zapped = 0; - sk->done = 0; - sk->ack_backlog = 0; - sk->window = 0; - sk->bytes_rcv = 0; - sk->dead = 0; - sk->ack_timed = 0; - sk->partial = NULL; - sk->user_mss = 0; - sk->debug = 0; - /* how many packets we should send before forcing an ack. - if this is set to zero it is the same as sk->delay_acks = 0 */ - sk->max_ack_backlog = 0; - sk->inuse = 0; - sk->delay_acks = 0; - sk->daddr = 0; - sk->saddr = 0 /* ip_my_addr() */; - sk->err = 0; - sk->next = NULL; - sk->pair = NULL; - sk->send_tail = NULL; - sk->send_head = NULL; - sk->timeout = 0; - sk->broadcast = 0; - sk->localroute = 0; - sk->blog = 0; - sk->dummy_th.res1=0; - sk->dummy_th.res2=0; - sk->dummy_th.urg_ptr = 0; - sk->dummy_th.fin = 0; - sk->dummy_th.syn = 0; - sk->dummy_th.rst = 0; - sk->dummy_th.psh = 0; - sk->dummy_th.ack = 0; - sk->dummy_th.urg = 0; - sk->dummy_th.dest = 0; - sk->ip_tos=0; - sk->ip_route_cache=NULL; - sk->ip_hcache_ver= 0; - sk->ip_option_len=0; - sk->ip_option_flen=0; - sk->ip_opt_next_hop=0; - sk->ip_opt_ptr[0]=NULL; - sk->ip_opt_ptr[1]=NULL; -#endif - /* this is how many unacked bytes we will accept for this socket. */ - sk->max_unacked = 2048; /* needs to be at most 2 full packets. 
*/ + sk->prot = prot; + sk->backlog_rcv = prot->backlog_rcv; + + sk->sleep = sock->wait; + sock->data =(void *) sk; + + sk->state = TCP_CLOSE; skb_queue_head_init(&sk->write_queue); skb_queue_head_init(&sk->receive_queue); - sk->mtu = 576; - sk->prot = prot; - sk->sleep = sock->wait; - init_timer(&sk->timer); - init_timer(&sk->retransmit_timer); + skb_queue_head_init(&sk->back_log); + + sk->timer.data = (unsigned long)sk; sk->timer.function = &net_timer; - skb_queue_head_init(&sk->back_log); + sock->data =(void *) sk; - sk->dummy_th.doff = sizeof(sk->dummy_th)/4; - sk->ip_ttl=64; + sk->ip_ttl=ip_statistics.IpDefaultTTL; + + if(sk->type==SOCK_RAW && protocol==IPPROTO_RAW) + sk->ip_hdrincl=1; + else + sk->ip_hdrincl=0; + #ifdef CONFIG_IP_MULTICAST sk->ip_mc_loop=1; sk->ip_mc_ttl=1; *sk->ip_mc_name=0; sk->ip_mc_list=NULL; #endif + /* + * Speed up by setting some standard state for the dummy_th + * if TCP uses it (maybe move to tcp_init later) + */ sk->state_change = def_callback1; sk->data_ready = def_callback2; @@ -728,7 +693,7 @@ static int inet_create(struct socket *sock, int protocol) * creation time automatically * shares. */ - put_sock(sk->num, sk); + inet_put_sock(sk->num, sk); sk->dummy_th.source = ntohs(sk->num); } @@ -755,29 +720,16 @@ static int inet_dup(struct socket *newsock, struct socket *oldsock) } /* - * Return 1 if we still have things to send in our buffers. - */ -static inline int closing(struct sock * sk) -{ - switch (sk->state) { - case TCP_FIN_WAIT1: - case TCP_CLOSING: - case TCP_LAST_ACK: - return 1; - } - return 0; -} - - -/* * The peer socket should always be NULL (or else). When we call this * function we are destroying the object and from then on nobody * should refer to it. 
*/ -static int inet_release(struct socket *sock, struct socket *peer) +int inet_release(struct socket *sock, struct socket *peer) { + unsigned long timeout; struct sock *sk = (struct sock *) sock->data; + if (sk == NULL) return(0); @@ -797,42 +749,19 @@ static int inet_release(struct socket *sock, struct socket *peer) * If the close is due to the process exiting, we never * linger.. */ - - if (sk->linger == 0 || (current->flags & PF_EXITING)) - { - sk->prot->close(sk,0); - sk->dead = 1; - } - else - { - sk->prot->close(sk, 0); - cli(); - if (sk->lingertime) - current->timeout = jiffies + HZ*sk->lingertime; - while(closing(sk) && current->timeout>0) - { - interruptible_sleep_on(sk->sleep); - if (current->signal & ~current->blocked) - { - break; -#if 0 - /* not working now - closes can't be restarted */ - sti(); - current->timeout=0; - return(-ERESTARTSYS); -#endif - } - } - current->timeout=0; - sti(); - sk->dead = 1; + timeout = 0; + if (sk->linger) { + timeout = ~0UL; + if (!sk->lingertime) + timeout = jiffies + HZ*sk->lingertime; } - sk->inuse = 1; + if (current->flags & PF_EXITING) + timeout = 0; - /* This will destroy it. */ sock->data = NULL; - release_sock(sk); sk->socket = NULL; + + sk->prot->close(sk, timeout); return(0); } @@ -845,9 +774,16 @@ static int inet_bind(struct socket *sock, struct sockaddr *uaddr, unsigned short snum = 0 /* Stoopid compiler.. this IS ok */; int chk_addr_ret; + /* + * If the socket has its own bind function then use it. + */ + + if(sk->prot->bind) + return sk->prot->bind(sk,uaddr, addr_len); + /* check this error. 
*/ if (sk->state != TCP_CLOSE) - return(-EIO); + return(-EINVAL); if(addr_len<sizeof(struct sockaddr_in)) return -EINVAL; @@ -873,12 +809,41 @@ static int inet_bind(struct socket *sock, struct sockaddr *uaddr, } chk_addr_ret = ip_chk_addr(addr->sin_addr.s_addr); - if (addr->sin_addr.s_addr != 0 && chk_addr_ret != IS_MYADDR && chk_addr_ret != IS_MULTICAST) +#ifdef CONFIG_IP_TRANSPARENT_PROXY + /* + * Superuser may bind to any address to allow transparent proxying. + */ + if (addr->sin_addr.s_addr != 0 && chk_addr_ret != IS_MYADDR && chk_addr_ret != IS_MULTICAST && chk_addr_ret != IS_BROADCAST && !suser()) +#else + if (addr->sin_addr.s_addr != 0 && chk_addr_ret != IS_MYADDR && chk_addr_ret != IS_MULTICAST && chk_addr_ret != IS_BROADCAST) +#endif return(-EADDRNOTAVAIL); /* Source address MUST be ours! */ - + +#ifndef CONFIG_IP_TRANSPARENT_PROXY + /* + * Am I just thick or is this test really always true after the one + * above? Just taking the test out appears to be the easiest way to + * make binds to remote addresses for transparent proxying work. + */ if (chk_addr_ret || addr->sin_addr.s_addr == 0) - sk->saddr = addr->sin_addr.s_addr; - + { +#endif + /* + * We keep a pair of addresses. rcv_saddr is the one + * used by get_sock_*(), and saddr is used for transmit. + * + * In the BSD API these are the same except where it + * would be illegal to use them (multicast/broadcast) in + * which case the sending device address is used. + */ + sk->rcv_saddr = addr->sin_addr.s_addr; + if(chk_addr_ret==IS_MULTICAST||chk_addr_ret==IS_BROADCAST) + sk->saddr = 0; /* Use device */ + else + sk->saddr = addr->sin_addr.s_addr; +#ifndef CONFIG_IP_TRANSPARENT_PROXY + } +#endif if(sock->type != SOCK_RAW) { /* Make sure we are allowed to bind here. */ @@ -886,20 +851,50 @@ static int inet_bind(struct socket *sock, struct sockaddr *uaddr, for(sk2 = sk->prot->sock_array[snum & (SOCK_ARRAY_SIZE -1)]; sk2 != NULL; sk2 = sk2->next) { - /* should be below! 
*/ + /* + * Hash collision or real match ? + */ + if (sk2->num != snum) continue; + + /* + * Either bind on the port is wildcard means + * they will overlap and thus be in error + */ + + if (!sk2->rcv_saddr || !sk->rcv_saddr) + { + /* + * Allow only if both are setting reuse. + */ + if(sk2->reuse && sk->reuse && sk2->state!=TCP_LISTEN) + continue; + sti(); + return(-EADDRINUSE); + } + + /* + * Two binds match ? + */ + + if (sk2->rcv_saddr != sk->rcv_saddr) + continue; + /* + * Reusable port ? + */ + if (!sk->reuse) { sti(); return(-EADDRINUSE); } - if (sk2->num != snum) - continue; /* more than one */ - if (sk2->saddr != sk->saddr) - continue; /* socket per slot ! -FB */ - if (!sk2->reuse || sk2->state==TCP_LISTEN) + /* + * Reuse ? + */ + + if (!sk2->reuse || sk2->state==TCP_LISTEN) { sti(); return(-EADDRINUSE); @@ -907,43 +902,28 @@ static int inet_bind(struct socket *sock, struct sockaddr *uaddr, } sti(); - remove_sock(sk); + inet_remove_sock(sk); if(sock->type==SOCK_DGRAM) udp_cache_zap(); if(sock->type==SOCK_STREAM) tcp_cache_zap(); - put_sock(snum, sk); + inet_put_sock(snum, sk); sk->dummy_th.source = ntohs(sk->num); sk->daddr = 0; sk->dummy_th.dest = 0; } + ip_rt_put(sk->ip_route_cache); sk->ip_route_cache=NULL; return(0); } /* - * Handle sk->err properly. The cli/sti matter. - */ - -static int inet_error(struct sock *sk) -{ - unsigned long flags; - int err; - save_flags(flags); - cli(); - err=sk->err; - sk->err=0; - restore_flags(flags); - return -err; -} - -/* * Connect to a remote host. There is regrettably still a little * TCP 'magic' in here. 
*/ -static int inet_connect(struct socket *sock, struct sockaddr * uaddr, - int addr_len, int flags) +int inet_connect(struct socket *sock, struct sockaddr * uaddr, + int addr_len, int flags) { struct sock *sk=(struct sock *)sock->data; int err; @@ -959,11 +939,7 @@ static int inet_connect(struct socket *sock, struct sockaddr * uaddr, if (sock->state == SS_CONNECTING && sk->protocol == IPPROTO_TCP && (flags & O_NONBLOCK)) { if(sk->err!=0) - { - err=sk->err; - sk->err=0; - return -err; - } + return sock_error(sk); return -EALREADY; /* Connecting is currently in progress */ } if (sock->state != SS_CONNECTING) @@ -973,7 +949,7 @@ static int inet_connect(struct socket *sock, struct sockaddr * uaddr, return(-EAGAIN); if (sk->prot->connect == NULL) return(-EOPNOTSUPP); - err = sk->prot->connect(sk, (struct sockaddr_in *)uaddr, addr_len); + err = sk->prot->connect(sk, uaddr, addr_len); if (err < 0) return(err); sock->state = SS_CONNECTING; @@ -982,11 +958,7 @@ static int inet_connect(struct socket *sock, struct sockaddr * uaddr, if (sk->state > TCP_FIN_WAIT2 && sock->state==SS_CONNECTING) { sock->state=SS_UNCONNECTED; - cli(); - err=sk->err; - sk->err=0; - sti(); - return -err; + return sock_error(sk); } if (sk->state != TCP_ESTABLISHED &&(flags & O_NONBLOCK)) @@ -1005,11 +977,9 @@ static int inet_connect(struct socket *sock, struct sockaddr * uaddr, icmp error packets wanting to close a tcp or udp socket. 
*/ if(sk->err && sk->protocol == IPPROTO_TCP) { - sti(); sock->state = SS_UNCONNECTED; - err = -sk->err; - sk->err=0; - return err; /* set by tcp_err() */ + sti(); + return sock_error(sk); /* set by tcp_err() */ } } sti(); @@ -1018,9 +988,7 @@ static int inet_connect(struct socket *sock, struct sockaddr * uaddr, if (sk->state != TCP_ESTABLISHED && sk->err) { sock->state = SS_UNCONNECTED; - err=sk->err; - sk->err=0; - return(-err); + return sock_error(sk); } return(0); } @@ -1036,7 +1004,7 @@ static int inet_socketpair(struct socket *sock1, struct socket *sock2) * Accept a pending connection. The TCP layer now gives BSD semantics. */ -static int inet_accept(struct socket *sock, struct socket *newsock, int flags) +int inet_accept(struct socket *sock, struct socket *newsock, int flags) { struct sock *sk1, *sk2; int err; @@ -1044,22 +1012,25 @@ static int inet_accept(struct socket *sock, struct socket *newsock, int flags) sk1 = (struct sock *) sock->data; /* - * We've been passed an extra socket. - * We need to free it up because the tcp module creates - * its own when it accepts one. + * We've been passed an extra socket. + * We need to free it up because the tcp module creates + * its own when it accepts one. */ + if (newsock->data) { struct sock *sk=(struct sock *)newsock->data; newsock->data=NULL; - sk->dead = 1; destroy_sock(sk); } if (sk1->prot->accept == NULL) return(-EOPNOTSUPP); - /* Restore the state if we have been interrupted, and then returned. */ + /* + * Restore the state if we have been interrupted, and then returned. 
+ */ + if (sk1->pair != NULL ) { sk2 = sk1->pair; @@ -1070,9 +1041,7 @@ static int inet_accept(struct socket *sock, struct socket *newsock, int flags) sk2 = sk1->prot->accept(sk1,flags); if (sk2 == NULL) { - err=sk1->err; - sk1->err=0; - return(-err); + return sock_error(sk1); } } newsock->data = (void *)sk2; @@ -1100,12 +1069,16 @@ static int inet_accept(struct socket *sock, struct socket *newsock, int flags) if (sk2->state != TCP_ESTABLISHED && sk2->err > 0) { - err = -sk2->err; - sk2->err=0; - sk2->dead=1; /* ANK */ + err = sock_error(sk2); destroy_sock(sk2); newsock->data = NULL; - return(err); + return err; + } + if (sk2->state == TCP_CLOSE) + { + destroy_sock(sk2); + newsock->data=NULL; + return -ECONNABORTED; } newsock->state = SS_CONNECTED; return(0); @@ -1133,81 +1106,39 @@ static int inet_getname(struct socket *sock, struct sockaddr *uaddr, } else { + __u32 addr = sk->rcv_saddr; + if (!addr) { + addr = sk->saddr; + if (!addr) + addr = ip_my_addr(); + } sin->sin_port = sk->dummy_th.source; - if (sk->saddr == 0) - sin->sin_addr.s_addr = ip_my_addr(); - else - sin->sin_addr.s_addr = sk->saddr; + sin->sin_addr.s_addr = addr; } *uaddr_len = sizeof(*sin); return(0); } -/* - * The assorted BSD I/O operations - */ -static int inet_recvfrom(struct socket *sock, void *ubuf, int size, int noblock, - unsigned flags, struct sockaddr *sin, int *addr_len ) +int inet_recvmsg(struct socket *sock, struct msghdr *ubuf, int size, + int noblock, int flags, int *addr_len) { struct sock *sk = (struct sock *) sock->data; - if (sk->prot->recvfrom == NULL) + if (sk->prot->recvmsg == NULL) return(-EOPNOTSUPP); if(sk->err) - return inet_error(sk); - /* We may need to bind the socket. 
*/ - if(inet_autobind(sk)!=0) - return(-EAGAIN); - return(sk->prot->recvfrom(sk, (unsigned char *) ubuf, size, noblock, flags, - (struct sockaddr_in*)sin, addr_len)); -} - - -static int inet_recv(struct socket *sock, void *ubuf, int size, int noblock, - unsigned flags) -{ - /* BSD explicitly states these are the same - so we do it this way to be sure */ - return inet_recvfrom(sock,ubuf,size,noblock,flags,NULL,NULL); -} - -static int inet_read(struct socket *sock, char *ubuf, int size, int noblock) -{ - struct sock *sk = (struct sock *) sock->data; - - if(sk->err) - return inet_error(sk); - /* We may need to bind the socket. */ - if(inet_autobind(sk)) - return(-EAGAIN); - return(sk->prot->read(sk, (unsigned char *) ubuf, size, noblock, 0)); -} - -static int inet_send(struct socket *sock, void *ubuf, int size, int noblock, - unsigned flags) -{ - struct sock *sk = (struct sock *) sock->data; - if (sk->shutdown & SEND_SHUTDOWN) - { - send_sig(SIGPIPE, current, 1); - return(-EPIPE); - } - if(sk->err) - return inet_error(sk); + return sock_error(sk); /* We may need to bind the socket. 
*/ if(inet_autobind(sk)!=0) return(-EAGAIN); - return(sk->prot->write(sk, (unsigned char *) ubuf, size, noblock, flags)); + return(sk->prot->recvmsg(sk, ubuf, size, noblock, flags,addr_len)); } -static int inet_write(struct socket *sock, char *ubuf, int size, int noblock) -{ - return inet_send(sock,ubuf,size,noblock,0); -} -static int inet_sendto(struct socket *sock, void *ubuf, int size, int noblock, - unsigned flags, struct sockaddr *sin, int addr_len) +int inet_sendmsg(struct socket *sock, struct msghdr *msg, int size, + int noblock, int flags) { struct sock *sk = (struct sock *) sock->data; if (sk->shutdown & SEND_SHUTDOWN) @@ -1215,19 +1146,19 @@ static int inet_sendto(struct socket *sock, void *ubuf, int size, int noblock, send_sig(SIGPIPE, current, 1); return(-EPIPE); } - if (sk->prot->sendto == NULL) + if (sk->prot->sendmsg == NULL) return(-EOPNOTSUPP); if(sk->err) - return inet_error(sk); + return sock_error(sk); /* We may need to bind the socket. */ if(inet_autobind(sk)!=0) return -EAGAIN; - return(sk->prot->sendto(sk, (unsigned char *) ubuf, size, noblock, flags, - (struct sockaddr_in *)sin, addr_len)); + return(sk->prot->sendmsg(sk, msg, size, noblock, flags)); + } -static int inet_shutdown(struct socket *sock, int how) +int inet_shutdown(struct socket *sock, int how) { struct sock *sk=(struct sock*)sock->data; @@ -1242,7 +1173,7 @@ static int inet_shutdown(struct socket *sock, int how) return(-EINVAL); if (sock->state == SS_CONNECTING && sk->state == TCP_ESTABLISHED) sock->state = SS_CONNECTED; - if (!tcp_connected(sk->state)) + if (!sk || !tcp_connected(sk->state)) return(-ENOTCONN); sk->shutdown |= how; if (sk->prot->shutdown) @@ -1251,7 +1182,7 @@ static int inet_shutdown(struct socket *sock, int how) } -static int inet_select(struct socket *sock, int sel_type, select_table *wait ) +int inet_select(struct socket *sock, int sel_type, select_table *wait ) { struct sock *sk=(struct sock *) sock->data; if (sk->prot->select == NULL) @@ -1275,44 +1206,51 
@@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk=(struct sock *)sock->data; int err; + int pid; switch(cmd) { case FIOSETOWN: case SIOCSPGRP: - err=verify_area(VERIFY_READ,(int *)arg,sizeof(long)); - if(err) - return err; - sk->proc = get_fs_long((int *) arg); + err = get_user(pid, (int *) arg); + if (err) + return err; + /* see inet_fcntl */ + if (current->pid != pid && current->pgrp != -pid && !suser()) + return -EPERM; + sk->proc = pid; return(0); case FIOGETOWN: case SIOCGPGRP: - err=verify_area(VERIFY_WRITE,(void *) arg, sizeof(long)); - if(err) - return err; - put_fs_long(sk->proc,(int *)arg); - return(0); + return put_user(sk->proc, (int *)arg); case SIOCGSTAMP: if(sk->stamp.tv_sec==0) return -ENOENT; - err=verify_area(VERIFY_WRITE,(void *)arg,sizeof(struct timeval)); - if(err) - return err; - memcpy_tofs((void *)arg,&sk->stamp,sizeof(struct timeval)); - return 0; + err = copy_to_user((void *)arg,&sk->stamp,sizeof(struct timeval)); + if (err) + { + err = -EFAULT; + } + return err; case SIOCADDRT: case SIOCDELRT: return(ip_rt_ioctl(cmd,(void *) arg)); case SIOCDARP: case SIOCGARP: case SIOCSARP: + case OLD_SIOCDARP: + case OLD_SIOCGARP: + case OLD_SIOCSARP: return(arp_ioctl(cmd,(void *) arg)); -#ifdef CONFIG_INET_RARP case SIOCDRARP: case SIOCGRARP: case SIOCSRARP: - return(rarp_ioctl(cmd,(void *) arg)); +#ifdef CONFIG_KERNELD + if (rarp_ioctl_hook == NULL) + request_module("rarp"); #endif + if (rarp_ioctl_hook != NULL) + return(rarp_ioctl_hook(cmd,(void *) arg)); case SIOCGIFCONF: case SIOCGIFFLAGS: case SIOCSIFFLAGS: @@ -1335,13 +1273,37 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCSIFLINK: case SIOCGIFHWADDR: case SIOCSIFHWADDR: - case OLD_SIOCGIFHWADDR: case SIOCSIFMAP: case SIOCGIFMAP: case SIOCSIFSLAVE: case SIOCGIFSLAVE: return(dev_ioctl(cmd,(void *) arg)); + case SIOCGIFBR: + case SIOCSIFBR: +#ifdef CONFIG_BRIDGE + return(br_ioctl(cmd,(void *) arg)); 
+#else + return -ENOPKG; +#endif + case SIOCADDDLCI: + case SIOCDELDLCI: +#ifdef CONFIG_DLCI + return(dlci_ioctl(cmd, (void *) arg)); +#endif + +#ifdef CONFIG_DLCI_MODULE + +#ifdef CONFIG_KERNELD + if (dlci_ioctl_hook == NULL) + request_module("dlci"); +#endif + + if (dlci_ioctl_hook) + return((*dlci_ioctl_hook)(cmd, (void *) arg)); +#endif + return -ENOPKG; + default: if ((cmd >= SIOCDEVPRIVATE) && (cmd <= (SIOCDEVPRIVATE + 15))) @@ -1355,6 +1317,36 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return(0); } +#ifdef CONFIG_IP_TRANSPARENT_PROXY +/* + * Some routines for the for loop in get_sock which sometimes needs to walk + * two linked lists in sequence. Could use macros as well. + * Does anyone know a nicer way to code this? + */ +static __inline__ struct sock *secondlist(unsigned short hpnum, struct sock *s, + int *pfirstpass, struct proto *prot) +{ + if (hpnum && s == NULL && (*pfirstpass)-- ) + return prot->sock_array[hpnum & (SOCK_ARRAY_SIZE - 1)]; + else + return s; +} +static __inline__ struct sock *get_sock_loop_init(unsigned short hnum, + unsigned short hpnum, struct sock *s, + int *pfirstpass, struct proto *prot) +{ + s = prot->sock_array[hnum & (SOCK_ARRAY_SIZE - 1)]; + return secondlist(hpnum, s, pfirstpass, prot); +} +static __inline__ struct sock *get_sock_loop_next(unsigned short hnum, + unsigned short hpnum, struct sock *s, + int *pfirstpass, struct proto *prot) +{ + s = s->next; + return secondlist(hpnum, s, pfirstpass, prot); +} +#endif + /* * This routine must find a socket given a TCP or UDP header. * Everything is assumed to be in net order. 
@@ -1365,15 +1357,23 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) */ struct sock *get_sock(struct proto *prot, unsigned short num, - unsigned long raddr, - unsigned short rnum, unsigned long laddr) + unsigned long raddr, unsigned short rnum, + unsigned long laddr, unsigned long paddr, + unsigned short pnum) { - struct sock *s; + struct sock *s = 0; struct sock *result = NULL; int badness = -1; unsigned short hnum; +#ifdef CONFIG_IP_TRANSPARENT_PROXY + unsigned short hpnum; + int firstpass = 1; +#endif hnum = ntohs(num); +#ifdef CONFIG_IP_TRANSPARENT_PROXY + hpnum = ntohs(pnum); +#endif /* * SOCK_ARRAY_SIZE must be a power of two. This will work better @@ -1384,19 +1384,43 @@ struct sock *get_sock(struct proto *prot, unsigned short num, * socket number when we choose an arbitrary one. */ +#ifdef CONFIG_IP_TRANSPARENT_PROXY + for(s = get_sock_loop_init(hnum, hpnum, s, &firstpass, prot); + s != NULL; + s = get_sock_loop_next(hnum, hpnum, s, &firstpass, prot)) +#else for(s = prot->sock_array[hnum & (SOCK_ARRAY_SIZE - 1)]; s != NULL; s = s->next) +#endif { int score = 0; +#ifdef CONFIG_IP_TRANSPARENT_PROXY + /* accept the addressed port or the redirect (proxy) port */ + if (s->num != hnum && (hpnum == 0 || s->num != hpnum)) +#else if (s->num != hnum) +#endif continue; if(s->dead && (s->state == TCP_CLOSE)) continue; /* local address matches? */ - if (s->saddr) { - if (s->saddr != laddr) + if (s->rcv_saddr) { +#ifdef CONFIG_IP_TRANSPARENT_PROXY + /* + * If this is redirected traffic, it must either + * match on the redirected port/ip-address or on + * the actual destination, not on a mixture. + * There must be a simpler way to express this... + */ + if (hpnum + ? 
((s->num != hpnum || s->rcv_saddr != paddr) + && (s->num != hnum || s->rcv_saddr != laddr)) + : (s->rcv_saddr != laddr)) +#else + if (s->rcv_saddr != laddr) +#endif continue; score++; } @@ -1413,11 +1437,23 @@ struct sock *get_sock(struct proto *prot, unsigned short num, score++; } /* perfect match? */ +#ifdef CONFIG_IP_TRANSPARENT_PROXY + if (score == 3 && s->num == hnum) +#else if (score == 3) +#endif return s; /* no, check if this is the best so far.. */ if (score <= badness) continue; +#ifdef CONFIG_IP_TRANSPARENT_PROXY + /* don't accept near matches on the actual destination + * port with IN_ADDR_ANY for redirected traffic, but do + * allow explicit remote address listens. (disputable) + */ + if (hpnum && s->num != hpnum && !s->rcv_saddr) + continue; +#endif result = s; badness = score; } @@ -1445,7 +1481,7 @@ struct sock *get_sock_raw(struct sock *sk, continue; if(s->daddr && s->daddr!=raddr) continue; - if(s->saddr && s->saddr!=laddr) + if(s->rcv_saddr && s->rcv_saddr != laddr) continue; return(s); } @@ -1488,7 +1524,7 @@ struct sock *get_sock_mcast(struct sock *sk, continue; if (s->dummy_th.dest != rnum && s->dummy_th.dest != 0) continue; - if(s->saddr && s->saddr!=laddr) + if(s->rcv_saddr && s->rcv_saddr != laddr) continue; return(s); } @@ -1497,7 +1533,7 @@ struct sock *get_sock_mcast(struct sock *sk, #endif -static struct proto_ops inet_proto_ops = { +struct proto_ops inet_proto_ops = { AF_INET, inet_create, @@ -1508,23 +1544,72 @@ static struct proto_ops inet_proto_ops = { inet_socketpair, inet_accept, inet_getname, - inet_read, - inet_write, inet_select, inet_ioctl, inet_listen, - inet_send, - inet_recv, - inet_sendto, - inet_recvfrom, inet_shutdown, inet_setsockopt, inet_getsockopt, inet_fcntl, + inet_sendmsg, + inet_recvmsg }; extern unsigned long seq_offset; +#ifdef CONFIG_PROC_FS +#ifdef CONFIG_INET_RARP +static struct proc_dir_entry proc_net_rarp = { + PROC_NET_RARP, 4, "rarp", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + 
rarp_get_info +}; +#endif /* RARP */ +static struct proc_dir_entry proc_net_raw = { + PROC_NET_RAW, 3, "raw", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + raw_get_info +}; +static struct proc_dir_entry proc_net_snmp = { + PROC_NET_SNMP, 4, "snmp", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + snmp_get_info +}; +static struct proc_dir_entry proc_net_sockstat = { + PROC_NET_SOCKSTAT, 8, "sockstat", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + afinet_get_info +}; +static struct proc_dir_entry proc_net_tcp = { + PROC_NET_TCP, 3, "tcp", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + tcp_get_info +}; +static struct proc_dir_entry proc_net_udp = { + PROC_NET_UDP, 3, "udp", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + udp_get_info +}; +static struct proc_dir_entry proc_net_route = { + PROC_NET_ROUTE, 5, "route", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + rt_get_info +}; +static struct proc_dir_entry proc_net_rtcache = { + PROC_NET_RTCACHE, 8, "rt_cache", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + rt_cache_get_info +}; +#endif /* CONFIG_PROC_FS */ + /* * Called by socket.c on kernel startup. */ @@ -1535,7 +1620,7 @@ void inet_proto_init(struct net_proto *pro) int i; - printk("Swansea University Computer Society TCP/IP for NET3.029 (Snapshot #6)\n"); + printk("Swansea University Computer Society TCP/IP for NET3.037\n"); /* * Tell SOCKET that we are alive... 
@@ -1551,16 +1636,20 @@ void inet_proto_init(struct net_proto *pro) for(i = 0; i < SOCK_ARRAY_SIZE; i++) { - tcp_prot.sock_array[i] = NULL; - udp_prot.sock_array[i] = NULL; - raw_prot.sock_array[i] = NULL; + tcp_sock_array[i] = NULL; + udp_sock_array[i] = NULL; + raw_sock_array[i] = NULL; } + tcp_prot.inuse = 0; tcp_prot.highestinuse = 0; + tcp_prot.sock_array = tcp_sock_array; udp_prot.inuse = 0; udp_prot.highestinuse = 0; + udp_prot.sock_array = udp_sock_array; raw_prot.inuse = 0; raw_prot.highestinuse = 0; + raw_prot.sock_array = raw_sock_array; printk("IP Protocols: "); for(p = inet_protocol_base; p != NULL;) @@ -1571,13 +1660,62 @@ void inet_proto_init(struct net_proto *pro) p = tmp; } + /* * Set the ARP module up */ arp_init(); + /* * Set the IP module up */ ip_init(); -} + /* + * Set the ICMP layer up + */ + icmp_init(&inet_proto_ops); + /* + * Set the firewalling up + */ +#if defined(CONFIG_IP_ACCT)||defined(CONFIG_IP_FIREWALL)|| \ + defined(CONFIG_IP_MASQUERADE) + ip_fw_init(); +#endif + /* + * Initialise the multicast router + */ +#if defined(CONFIG_IP_MROUTE) + ip_mr_init(); +#endif + + /* + * Initialise AF_INET alias type (register net_alias_type) + */ + +#if defined(CONFIG_IP_ALIAS) + ip_alias_init(); +#endif + +#ifdef CONFIG_INET_RARP + rarp_ioctl_hook = rarp_ioctl; +#endif + /* + * Create all the /proc entries. 
+ */ + +#ifdef CONFIG_PROC_FS + +#ifdef CONFIG_INET_RARP + proc_net_register(&proc_net_rarp); +#endif /* RARP */ + + proc_net_register(&proc_net_raw); + proc_net_register(&proc_net_snmp); + proc_net_register(&proc_net_sockstat); + proc_net_register(&proc_net_tcp); + proc_net_register(&proc_net_udp); + proc_net_register(&proc_net_route); + proc_net_register(&proc_net_rtcache); +#endif /* CONFIG_PROC_FS */ +} diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 64bc060b9..090808b68 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -4,28 +4,29 @@ * * This module implements the Address Resolution Protocol ARP (RFC 826), * which is used to convert IP addresses (or in the future maybe other - * high-level addresses into a low-level hardware address (like an Ethernet + * high-level addresses) into a low-level hardware address (like an Ethernet * address). * * FIXME: * Experiment with better retransmit timers * Clean up the timer deletions - * If you create a proxy entry set your interface address to the address - * and then delete it, proxies may get out of sync with reality - check this + * If you create a proxy entry, set your interface address to the address + * and then delete it, proxies may get out of sync with reality - + * check this. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
* - * * Fixes: - * Alan Cox : Removed the ethernet assumptions in Florian's code - * Alan Cox : Fixed some small errors in the ARP logic + * Alan Cox : Removed the ethernet assumptions in + * Florian's code + * Alan Cox : Fixed some small errors in the ARP + * logic * Alan Cox : Allow >4K in /proc * Alan Cox : Make ARP add its own protocol entry - * - * Ross Martin : Rewrote arp_rcv() and arp_get_info() + * Ross Martin : Rewrote arp_rcv() and arp_get_info() * Stephen Henson : Add AX25 support to arp_get_info() * Alan Cox : Drop data when a device is downed. * Alan Cox : Use init_timer(). @@ -33,14 +34,45 @@ * Martin Seine : Move the arphdr structure * to if_arp.h for compatibility. * with BSD based programs. - * Andrew Tridgell : Added ARP netmask code and - * re-arranged proxy handling. + * Andrew Tridgell : Added ARP netmask code and + * re-arranged proxy handling. * Alan Cox : Changed to use notifiers. * Niibe Yutaka : Reply for this device or proxies only. * Alan Cox : Don't proxy across hardware types! * Jonathan Naylor : Added support for NET/ROM. + * Mike Shaver : RFC1122 checks. + * Jonathan Naylor : Only lookup the hardware address for + * the correct hardware type. + * Germano Caronni : Assorted subtle races. + * Craig Schlenter : Don't modify permanent entry + * during arp_rcv. + * Russ Nelson : Tidied up a few bits. + * Alexey Kuznetsov: Major changes to caching and behaviour, + * eg intelligent arp probing and + * generation + * of host down events. + * Alan Cox : Missing unlock in device events. + * Eckes : ARP ioctl control errors. + * Alexey Kuznetsov: Arp free fix. + * Manuel Rodriguez: Gratuitous ARP. 
+ * Jonathan Layes : Added arpd support through kerneld + * message queue (960314) + * Mike Shaver : /proc/sys/net/ipv4/arp_* support + * Stuart Cheshire : Metricom and grat arp fixes + * *** FOR 2.1 clean this up *** */ +/* RFC1122 Status: + 2.3.2.1 (ARP Cache Validation): + MUST provide mechanism to flush stale cache entries (OK) + SHOULD be able to configure cache timeout (OK) + MUST throttle ARP retransmits (OK) + 2.3.2.2 (ARP Packet Queue): + SHOULD save at least one packet from each "conversation" with an + unresolved IP address. (OK) + 950727 -- MS +*/ + #include <linux/types.h> #include <linux/string.h> #include <linux/kernel.h> @@ -49,49 +81,168 @@ #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> -#include <linux/if_arp.h> #include <linux/in.h> #include <linux/mm.h> -#include <asm/system.h> -#include <asm/segment.h> -#include <stdarg.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> +#include <linux/if_arp.h> #include <linux/trdevice.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> + #include <net/ip.h> +#include <net/icmp.h> #include <net/route.h> #include <net/protocol.h> #include <net/tcp.h> -#include <linux/skbuff.h> #include <net/sock.h> #include <net/arp.h> -#ifdef CONFIG_AX25 +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) #include <net/ax25.h> -#ifdef CONFIG_NETROM +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) #include <net/netrom.h> #endif #endif +#ifdef CONFIG_NET_ALIAS +#include <linux/net_alias.h> +#endif +#ifdef CONFIG_ARPD +#include <net/netlink.h> +#endif + +#include <asm/system.h> +#include <asm/uaccess.h> + +#include <stdarg.h> + +/* + * Configurable Parameters + */ + +/* + * After that time, an unused entry is deleted from the arp table. + * RFC1122 recommends set it to 60*HZ, if your site uses proxy arp + * and dynamic routing. 
+ */ + +#define ARP_TIMEOUT (60*HZ) +int sysctl_arp_timeout = ARP_TIMEOUT; /* - * This structure defines the ARP mapping cache. As long as we make changes - * in this structure, we keep interrupts of. But normally we can copy the - * hardware address and the device pointer in a local variable and then make - * any "long calls" to send a packet out. + * How often is ARP cache checked for expire. + * It is useless to set ARP_CHECK_INTERVAL > ARP_TIMEOUT + */ + +#define ARP_CHECK_INTERVAL (60*HZ) + +int sysctl_arp_check_interval = ARP_CHECK_INTERVAL; + +/* + * Soft limit on ARP cache size. + * Note that this number should be greater than + * number of simultaneously opened sockets, or else + * hardware header cache will not be efficient. + */ + +#if RT_CACHE_DEBUG >= 2 +#define ARP_MAXSIZE 4 +#else +#ifdef CONFIG_ARPD +#define ARP_MAXSIZE 64 +#else +#define ARP_MAXSIZE 256 +#endif /* CONFIG_ARPD */ +#endif + +/* + * If an arp request is send, ARP_RES_TIME is the timeout value until the + * next request is send. + * RFC1122: OK. Throttles ARPing, as per 2.3.2.1. (MUST) + * The recommended minimum timeout is 1 second per destination. + * + */ + +#define ARP_RES_TIME (5*HZ) + +int sysctl_arp_res_time = ARP_RES_TIME; + +/* + * The number of times an broadcast arp request is send, until + * the host is considered temporarily unreachable. + */ + +#define ARP_MAX_TRIES 3 + +int sysctl_arp_max_tries = ARP_MAX_TRIES; + +/* + * The entry is reconfirmed by sending point-to-point ARP + * request after ARP_CONFIRM_INTERVAL. + * RFC1122 recommends 60*HZ. + * + * Warning: there exist nodes, that answer only broadcast + * ARP requests (Cisco-4000 in hot standby mode?) + * Now arp code should work with such nodes, but + * it still will generate redundant broadcast requests, so that + * this interval should be enough long. 
+ */ + +#define ARP_CONFIRM_INTERVAL (300*HZ) + +int sysctl_arp_confirm_interval = ARP_CONFIRM_INTERVAL; + +/* + * We wait for answer to unicast request for ARP_CONFIRM_TIMEOUT. + */ + +#define ARP_CONFIRM_TIMEOUT ARP_RES_TIME + +int sysctl_arp_confirm_timeout = ARP_CONFIRM_TIMEOUT; + +/* + * The number of times an unicast arp request is retried, until + * the cache entry is considered suspicious. + * Value 0 means that no unicast pings will be sent. + * RFC1122 recommends 2. + */ + +#define ARP_MAX_PINGS 1 + +int sysctl_arp_max_pings = ARP_MAX_PINGS; + +/* + * When a host is dead, but someone tries to connect it, + * we do not remove corresponding cache entry (it would + * be useless, it will be created again immediately) + * Instead we prolongate interval between broadcasts + * to ARP_DEAD_RES_TIME. + * This interval should be not very long. + * (When the host will be up again, we will notice it only + * when ARP_DEAD_RES_TIME expires, or when the host will arp us. + */ + +#define ARP_DEAD_RES_TIME (60*HZ) + +int sysctl_arp_dead_res_time = ARP_DEAD_RES_TIME; + +/* + * This structure defines the ARP mapping cache. */ struct arp_table { struct arp_table *next; /* Linked entry list */ unsigned long last_used; /* For expiry */ + unsigned long last_updated; /* For expiry */ unsigned int flags; /* Control status */ - unsigned long ip; /* ip address of entry */ - unsigned long mask; /* netmask - used for generalised proxy arps (tridge) */ + u32 ip; /* ip address of entry */ + u32 mask; /* netmask - used for generalised proxy arps (tridge) */ unsigned char ha[MAX_ADDR_LEN]; /* Hardware address */ - unsigned char hlen; /* Length of hardware address */ - unsigned short htype; /* Type of hardware in use */ struct device *dev; /* Device the entry is tied to */ + struct hh_cache *hh; /* Hardware headers chain */ /* * The following entries are only used for unresolved hw addresses. 
@@ -103,48 +254,32 @@ struct arp_table }; -/* - * Configurable Parameters (don't touch unless you know what you are doing - */ - -/* - * If an arp request is send, ARP_RES_TIME is the timeout value until the - * next request is send. - */ +static atomic_t arp_size = 0; -#define ARP_RES_TIME (250*(HZ/10)) +#ifdef CONFIG_ARPD +static int arpd_not_running; +static int arpd_stamp; +#endif -/* - * The number of times an arp request is send, until the host is - * considered unreachable. - */ +static unsigned int arp_bh_mask; -#define ARP_MAX_TRIES 3 +#define ARP_BH_BACKLOG 1 /* - * After that time, an unused entry is deleted from the arp table. + * Backlog for ARP updates. */ - -#define ARP_TIMEOUT (600*HZ) +static struct arp_table *arp_backlog; /* - * How often is the function 'arp_check_retries' called. - * An entry is invalidated in the time between ARP_TIMEOUT and - * (ARP_TIMEOUT+ARP_CHECK_INTERVAL). + * Backlog for incomplete entries. */ +static struct arp_table *arp_req_backlog; -#define ARP_CHECK_INTERVAL (60 * HZ) - -enum proxy { - PROXY_EXACT=0, - PROXY_ANY, - PROXY_NONE, -}; -/* Forward declarations. */ +static void arp_run_bh(void); static void arp_check_expire (unsigned long); -static struct arp_table *arp_lookup(unsigned long paddr, enum proxy proxy); - +static int arp_update (u32 sip, char *sha, struct device * dev, + unsigned long updated, struct arp_table *ientry, int grat); static struct timer_list arp_timer = { NULL, NULL, ARP_CHECK_INTERVAL, 0L, &arp_check_expire }; @@ -152,96 +287,126 @@ static struct timer_list arp_timer = /* * The default arp netmask is just 255.255.255.255 which means it's * a single machine entry. Only proxy entries can have other netmasks - * -*/ + */ #define DEF_ARP_NETMASK (~0) - /* * The size of the hash table. Must be a power of two. - * Maybe we should remove hashing in the future for arp and concentrate - * on Patrick Schaaf's Host-Cache-Lookup... 
*/ - -#define ARP_TABLE_SIZE 16 - -/* The ugly +1 here is to cater for proxy entries. They are put in their - own list for efficiency of lookup. If you don't want to find a proxy - entry then don't look in the last entry, otherwise do -*/ - -#define FULL_ARP_TABLE_SIZE (ARP_TABLE_SIZE+1) +#define ARP_TABLE_SIZE 16 +#define FULL_ARP_TABLE_SIZE (ARP_TABLE_SIZE+1) struct arp_table *arp_tables[FULL_ARP_TABLE_SIZE] = { NULL, }; -unsigned long arp_cache_stamp; - +#define arp_proxy_list arp_tables[ARP_TABLE_SIZE] /* * The last bits in the IP address are used for the cache lookup. - * A special entry is used for proxy arp entries + * A special entry is used for proxy arp entries */ #define HASH(paddr) (htonl(paddr) & (ARP_TABLE_SIZE - 1)) -#define PROXY_HASH ARP_TABLE_SIZE /* - * Check if there are too old entries and remove them. If the ATF_PERM - * flag is set, they are always left in the arp cache (permanent entry). - * Note: Only fully resolved entries, which don't have any packets in - * the queue, can be deleted, since ARP_TIMEOUT is much greater than - * ARP_MAX_TRIES*ARP_RES_TIME. + * ARP cache semaphore. + * + * Every time when someone wants to traverse arp table, + * he MUST call arp_fast_lock. + * It will guarantee that arp cache list will not change + * by interrupts and the entry that you found will not + * disappear unexpectedly. + * + * If you want to modify arp cache lists, you MUST + * call arp_fast_lock, and check that you are the only + * owner of semaphore (arp_lock == 1). If it is not the case + * you can defer your operation or forgot it, + * but DO NOT TOUCH lists. + * + * However, you are allowed to change arp entry contents. + * + * Assumptions: + * -- interrupt code MUST have lock/unlock balanced, + * you cannot lock cache on interrupt and defer unlocking + * to callback. + * In particular, it means that lock/unlock are allowed + * to be non-atomic. They are made atomic, but it was not + * necessary. 
+ * -- nobody is allowed to sleep while + * it keeps arp locked. (route cache has similar locking + * scheme, but allows sleeping) + * */ -static void arp_check_expire(unsigned long dummy) +static atomic_t arp_lock; + +#define ARP_LOCKED() (arp_lock != 1) + +static __inline__ void arp_fast_lock(void) +{ + atomic_inc(&arp_lock); +} + +static __inline__ void arp_unlock(void) +{ + if (atomic_dec_and_test(&arp_lock) && arp_bh_mask) + arp_run_bh(); +} + +/* + * Enqueue to FIFO list. + */ + +static void arp_enqueue(struct arp_table **q, struct arp_table *entry) { - int i; - unsigned long now = jiffies; unsigned long flags; + struct arp_table * tail; + save_flags(flags); cli(); - - for (i = 0; i < FULL_ARP_TABLE_SIZE; i++) + tail = *q; + if (!tail) + entry->next = entry; + else { - struct arp_table *entry; - struct arp_table **pentry = &arp_tables[i]; - - while ((entry = *pentry) != NULL) - { - if ((now - entry->last_used) > ARP_TIMEOUT - && !(entry->flags & ATF_PERM)) - { - *pentry = entry->next; /* remove from list */ - arp_cache_stamp++; - del_timer(&entry->timer); /* Paranoia */ - kfree_s(entry, sizeof(struct arp_table)); - } - else - pentry = &entry->next; /* go to next entry */ - } + entry->next = tail->next; + tail->next = entry; } + *q = entry; restore_flags(flags); + return; +} - /* - * Set the timer again. - */ +/* + * Dequeue from FIFO list, + * caller should mask interrupts. + */ - del_timer(&arp_timer); - arp_timer.expires = ARP_CHECK_INTERVAL; - add_timer(&arp_timer); -} +static struct arp_table * arp_dequeue(struct arp_table **q) +{ + struct arp_table * entry; + if (*q) + { + entry = (*q)->next; + (*q)->next = entry->next; + if (entry->next == entry) + *q = NULL; + entry->next = NULL; + return entry; + } + return NULL; +} /* - * Release all linked skb's and the memory for this entry. + * Purge all linked skb's of the entry. 
*/ -static void arp_release_entry(struct arp_table *entry) +static void arp_purge_send_q(struct arp_table *entry) { struct sk_buff *skb; unsigned long flags; @@ -254,132 +419,471 @@ static void arp_release_entry(struct arp_table *entry) skb_device_lock(skb); restore_flags(flags); dev_kfree_skb(skb, FREE_WRITE); + cli(); } restore_flags(flags); + return; +} + +/* + * Release the entry and all resources linked to it: skb's, hh's, timer + * and certainly memory. + * The entry should be already removed from lists. + */ + +static void arp_free_entry(struct arp_table *entry) +{ + unsigned long flags; + struct hh_cache *hh, *next; + del_timer(&entry->timer); + arp_purge_send_q(entry); + + save_flags(flags); + cli(); + hh = entry->hh; + entry->hh = NULL; + restore_flags(flags); + + for ( ; hh; hh = next) + { + next = hh->hh_next; + hh->hh_uptodate = 0; + hh->hh_next = NULL; + hh->hh_arp = NULL; + if (atomic_dec_and_test(&hh->hh_refcnt)) + kfree_s(hh, sizeof(struct(struct hh_cache))); + } + kfree_s(entry, sizeof(struct arp_table)); + atomic_dec(&arp_size); return; } /* - * Purge a device from the ARP queue + * Hardware header cache. + * + * BEWARE! Hardware header cache has no locking, so that + * it requires especially careful handling. + * It is the only part of arp+route, where a list + * should be traversed with masked interrupts. + * Luckily, this list contains one element 8), as rule. */ - -int arp_device_event(unsigned long event, void *ptr) + +/* + * How many users has this entry? + * The answer is reliable only when interrupts are masked. + */ + +static __inline__ int arp_count_hhs(struct arp_table * entry) +{ + struct hh_cache *hh; + int count = 0; + + for (hh = entry->hh; hh; hh = hh->hh_next) + count += hh->hh_refcnt-1; + + return count; +} + +/* + * Signal to device layer, that hardware address may be changed. 
+ */ + +static __inline__ void arp_update_hhs(struct arp_table * entry) +{ + struct hh_cache *hh; + + for (hh=entry->hh; hh; hh=hh->hh_next) + entry->dev->header_cache_update(hh, entry->dev, entry->ha); +} + +/* + * Invalidate all hh's, so that higher level will not try to use it. + */ + +static __inline__ void arp_invalidate_hhs(struct arp_table * entry) +{ + struct hh_cache *hh; + + for (hh=entry->hh; hh; hh=hh->hh_next) + hh->hh_uptodate = 0; +} + +/* + * Atomic attaching new hh entry. + * Return 1, if entry has been freed, rather than attached. + */ + +static int arp_set_hh(struct hh_cache **hhp, struct hh_cache *hh) { - struct device *dev=ptr; - int i; unsigned long flags; - - if(event!=NETDEV_DOWN) - return NOTIFY_DONE; - /* - * This is a bit OTT - maybe we need some arp semaphores instead. - */ - + struct hh_cache *hh1; + struct arp_table *entry; + + atomic_inc(&hh->hh_refcnt); + save_flags(flags); cli(); - for (i = 0; i < FULL_ARP_TABLE_SIZE; i++) + if ((hh1 = *hhp) == NULL) { - struct arp_table *entry; - struct arp_table **pentry = &arp_tables[i]; + *hhp = hh; + restore_flags(flags); + return 0; + } - while ((entry = *pentry) != NULL) - { - if(entry->dev==dev) - { - *pentry = entry->next; /* remove from list */ - del_timer(&entry->timer); /* Paranoia */ - kfree_s(entry, sizeof(struct arp_table)); - } - else - pentry = &entry->next; /* go to next entry */ - } + entry = (struct arp_table*)hh->hh_arp; + + /* + * An hh1 entry is already attached to this point. + * Is it not linked to arp entry? Link it! + */ + if (!hh1->hh_arp && entry) + { + atomic_inc(&hh1->hh_refcnt); + hh1->hh_next = entry->hh; + entry->hh = hh1; + hh1->hh_arp = (void*)entry; + restore_flags(flags); + + if (entry->flags & ATF_COM) + entry->dev->header_cache_update(hh1, entry->dev, entry->ha); +#if RT_CACHE_DEBUG >= 1 + printk("arp_set_hh: %08x is reattached. 
Good!\n", entry->ip); +#endif } - arp_cache_stamp++; +#if RT_CACHE_DEBUG >= 1 + else if (entry) + printk("arp_set_hh: %08x rr1 ok!\n", entry->ip); +#endif restore_flags(flags); - return NOTIFY_DONE; + if (atomic_dec_and_test(&hh->hh_refcnt)) + kfree_s(hh, sizeof(struct hh_cache)); + return 1; } +static __inline__ struct hh_cache * arp_alloc_hh(int htype) +{ + struct hh_cache *hh; + hh = kmalloc(sizeof(struct hh_cache), GFP_ATOMIC); + if (hh) + { + memset(hh, 0, sizeof(struct hh_cache)); + hh->hh_type = htype; + } + return hh; +} /* - * Create and send an arp packet. If (dest_hw == NULL), we create a broadcast - * message. + * Test if a hardware address is all zero */ -void arp_send(int type, int ptype, unsigned long dest_ip, - struct device *dev, unsigned long src_ip, - unsigned char *dest_hw, unsigned char *src_hw) +static __inline__ int empty(unsigned char * addr, int len) +{ + while (len > 0) + { + if (*addr) + return 0; + len--; + addr++; + } + return 1; +} + + +#ifdef CONFIG_ARPD + +/* + * Send ARPD message. + */ +static void arpd_send(int req, u32 addr, struct device * dev, char *ha, + unsigned long updated) { + int retval; struct sk_buff *skb; - struct arphdr *arp; - unsigned char *arp_ptr; + struct arpd_request *arpreq; - /* - * No arp on this interface. 
- */ - - if(dev->flags&IFF_NOARP) + if (arpd_not_running) return; - /* - * Allocate a buffer - */ - - skb = alloc_skb(sizeof(struct arphdr)+ 2*(dev->addr_len+4) - + dev->hard_header_len, GFP_ATOMIC); + skb = alloc_skb(sizeof(struct arpd_request), GFP_ATOMIC); if (skb == NULL) + return; + + skb->free=1; + arpreq=(struct arpd_request *)skb_put(skb, sizeof(struct arpd_request)); + arpreq->req = req; + arpreq->ip = addr; + arpreq->dev = (unsigned long)dev; + arpreq->stamp = arpd_stamp; + arpreq->updated = updated; + if (ha) + memcpy(arpreq->ha, ha, sizeof(arpreq->ha)); + + retval = netlink_post(NETLINK_ARPD, skb); + if (retval) { - printk("ARP: no memory to send an arp packet\n"); + kfree_skb(skb, FREE_WRITE); + if (retval == -EUNATCH) + arpd_not_running = 1; + } +} + +/* + * Send ARPD update message. + */ + +static __inline__ void arpd_update(struct arp_table * entry) +{ + if (arpd_not_running) + return; + arpd_send(ARPD_UPDATE, entry->ip, entry->dev, entry->ha, + entry->last_updated); +} + +/* + * Send ARPD lookup request. + */ + +static __inline__ void arpd_lookup(u32 addr, struct device * dev) +{ + if (arpd_not_running) return; + arpd_send(ARPD_LOOKUP, addr, dev, NULL, 0); +} + +/* + * Send ARPD flush message. 
+ */ + +static __inline__ void arpd_flush(struct device * dev) +{ + if (arpd_not_running) + return; + arpd_send(ARPD_FLUSH, 0, dev, NULL, 0); +} + + +static int arpd_callback(struct sk_buff *skb) +{ + struct device * dev; + struct arpd_request *retreq; + + arpd_not_running = 0; + + if (skb->len != sizeof(struct arpd_request)) + { + kfree_skb(skb, FREE_READ); + return -EINVAL; } - skb->len = sizeof(struct arphdr) + dev->hard_header_len + 2*(dev->addr_len+4); - skb->arp = 1; - skb->dev = dev; - skb->free = 1; - /* - * Fill the device header for the ARP frame - */ + retreq = (struct arpd_request *)skb->data; + dev = (struct device*)retreq->dev; - dev->hard_header(skb->data,dev,ptype,dest_hw?dest_hw:dev->broadcast,src_hw?src_hw:NULL,skb->len,skb); + if (retreq->stamp != arpd_stamp || !dev) + { + kfree_skb(skb, FREE_READ); + return -EINVAL; + } + + if (!retreq->updated || empty(retreq->ha, sizeof(retreq->ha))) + { +/* + * Invalid mapping: drop it and send ARP broadcast. + */ + arp_send(ARPOP_REQUEST, ETH_P_ARP, retreq->ip, dev, dev->pa_addr, NULL, + dev->dev_addr, NULL); + } + else + { + arp_fast_lock(); + arp_update(retreq->ip, retreq->ha, dev, retreq->updated, NULL, 0); + arp_unlock(); + } + + kfree_skb(skb, FREE_READ); + return sizeof(struct arpd_request); +} - /* Fill out the arp protocol part. */ - arp = (struct arphdr *) (skb->data + dev->hard_header_len); - arp->ar_hrd = htons(dev->type); -#ifdef CONFIG_AX25 -#ifdef CONFIG_NETROM - arp->ar_pro = (dev->type == ARPHRD_AX25 || dev->type == ARPHRD_NETROM) ? htons(AX25_P_IP) : htons(ETH_P_IP); #else - arp->ar_pro = (dev->type != ARPHRD_AX25)? htons(ETH_P_IP) : htons(AX25_P_IP); + +static __inline__ void arpd_update(struct arp_table * entry) +{ + return; +} + +#endif /* CONFIG_ARPD */ + + + + +/* + * ARP expiration routines. + */ + +/* + * Force the expiry of an entry in the internal cache so the memory + * can be used for a new request. 
+ */ + +static int arp_force_expire(void) +{ + int i; + struct arp_table *entry, **pentry; + struct arp_table **oldest_entry = NULL; + unsigned long oldest_used = ~0; + unsigned long flags; + unsigned long now = jiffies; + int result = 0; + + static last_index; + + if (ARP_LOCKED()) + return 0; + + save_flags(flags); + + if (last_index >= ARP_TABLE_SIZE) + last_index = 0; + + for (i = 0; i < ARP_TABLE_SIZE; i++, last_index++) + { + pentry = &arp_tables[last_index & (ARP_TABLE_SIZE-1)]; + + while ((entry = *pentry) != NULL) + { + if (!(entry->flags & ATF_PERM)) + { + int users; + cli(); + users = arp_count_hhs(entry); + + if (!users && now - entry->last_used > sysctl_arp_timeout) + { + *pentry = entry->next; + restore_flags(flags); +#if RT_CACHE_DEBUG >= 2 + printk("arp_force_expire: %08x expired\n", entry->ip); #endif -#else - arp->ar_pro = htons(ETH_P_IP); + arp_free_entry(entry); + result++; + if (arp_size < ARP_MAXSIZE) + goto done; + continue; + } + restore_flags(flags); + if (!users && entry->last_used < oldest_used) + { + oldest_entry = pentry; + oldest_used = entry->last_used; + } + } + pentry = &entry->next; + } + } + +done: + if (result || !oldest_entry) + return result; + + entry = *oldest_entry; + *oldest_entry = entry->next; +#if RT_CACHE_DEBUG >= 2 + printk("arp_force_expire: expiring %08x\n", entry->ip); #endif - arp->ar_hln = dev->addr_len; - arp->ar_pln = 4; - arp->ar_op = htons(type); + arp_free_entry(entry); + return 1; +} - arp_ptr=(unsigned char *)(arp+1); +/* + * Check if there are entries that are too old and remove them. If the + * ATF_PERM flag is set, they are always left in the arp cache (permanent + * entries). If an entry was not confirmed for ARP_CONFIRM_INTERVAL, + * send point-to-point ARP request. + * If it will not be confirmed for ARP_CONFIRM_TIMEOUT, + * give it to shred by arp_expire_entry. 
+ */ - memcpy(arp_ptr, src_hw, dev->addr_len); - arp_ptr+=dev->addr_len; - memcpy(arp_ptr, &src_ip,4); - arp_ptr+=4; - if (dest_hw != NULL) - memcpy(arp_ptr, dest_hw, dev->addr_len); - else - memset(arp_ptr, 0, dev->addr_len); - arp_ptr+=dev->addr_len; - memcpy(arp_ptr, &dest_ip, 4); +static void arp_check_expire(unsigned long dummy) +{ + int i; + unsigned long now = jiffies; - dev_queue_xmit(skb, dev, 0); -} + del_timer(&arp_timer); + +#ifdef CONFIG_ARPD + arpd_not_running = 0; +#endif + + ip_rt_check_expire(); + arp_fast_lock(); + + if (!ARP_LOCKED()) + { + + for (i = 0; i < ARP_TABLE_SIZE; i++) + { + struct arp_table *entry, **pentry; + + pentry = &arp_tables[i]; + + while ((entry = *pentry) != NULL) + { + if (entry->flags & ATF_PERM) + { + pentry = &entry->next; + continue; + } + + cli(); + if (now - entry->last_used > sysctl_arp_timeout + && !arp_count_hhs(entry)) + { + *pentry = entry->next; + sti(); +#if RT_CACHE_DEBUG >= 2 + printk("arp_expire: %08x expired\n", entry->ip); +#endif + arp_free_entry(entry); + continue; + } + sti(); + if (entry->last_updated + && now - entry->last_updated > sysctl_arp_confirm_interval + && !(entry->flags & ATF_PERM)) + { + struct device * dev = entry->dev; + entry->retries = sysctl_arp_max_tries+sysctl_arp_max_pings; + del_timer(&entry->timer); + entry->timer.expires = jiffies + ARP_CONFIRM_TIMEOUT; + add_timer(&entry->timer); + arp_send(ARPOP_REQUEST, ETH_P_ARP, entry->ip, + dev, dev->pa_addr, entry->ha, + dev->dev_addr, NULL); +#if RT_CACHE_DEBUG >= 2 + printk("arp_expire: %08x requires confirmation\n", entry->ip); +#endif + } + pentry = &entry->next; /* go to next entry */ + } + } + } + + arp_unlock(); + + /* + * Set the timer again. + */ + + arp_timer.expires = jiffies + sysctl_arp_check_interval; + add_timer(&arp_timer); +} /* * This function is called, if an entry is not resolved in ARP_RES_TIME. - * Either resend a request, or give it up and free the entry. 
+ * When more than MAX_ARP_TRIES retries was done, release queued skb's, + * but not discard entry itself if it is in use. */ static void arp_expire_request (unsigned long arg) @@ -389,78 +893,215 @@ static void arp_expire_request (unsigned long arg) unsigned long hash; unsigned long flags; + arp_fast_lock(); + save_flags(flags); cli(); + del_timer(&entry->timer); + + /* + * If arp table is locked, defer expire processing. + */ + if (ARP_LOCKED()) + { +#if RT_CACHE_DEBUG >= 1 + printk(KERN_DEBUG "arp_expire_request: %08x deferred\n", entry->ip); +#endif + entry->timer.expires = jiffies + HZ/10; + add_timer(&entry->timer); + restore_flags(flags); + arp_unlock(); + return; + } /* * Since all timeouts are handled with interrupts enabled, there is a * small chance, that this entry has just been resolved by an incoming * packet. This is the only race condition, but it is handled... + * + * One exception: if entry is COMPLETE but old, + * it means that point-to-point ARP ping has been failed + * (It really occurs with Cisco 4000 routers) + * We should reconfirm it. */ - if (entry->flags & ATF_COM) + if ((entry->flags & ATF_COM) && entry->last_updated + && jiffies - entry->last_updated <= sysctl_arp_confirm_interval) { restore_flags(flags); + arp_unlock(); return; } - if (--entry->retries > 0) + restore_flags(flags); + + if (entry->last_updated && --entry->retries > 0) { - unsigned long ip = entry->ip; struct device *dev = entry->dev; +#if RT_CACHE_DEBUG >= 2 + printk("arp_expire_request: %08x timed out\n", entry->ip); +#endif /* Set new timer. */ - del_timer(&entry->timer); - entry->timer.expires = ARP_RES_TIME; + entry->timer.expires = jiffies + sysctl_arp_res_time; add_timer(&entry->timer); - restore_flags(flags); - arp_send(ARPOP_REQUEST, ETH_P_ARP, ip, dev, dev->pa_addr, - NULL, dev->dev_addr); + arp_send(ARPOP_REQUEST, ETH_P_ARP, entry->ip, dev, dev->pa_addr, + entry->retries > sysctl_arp_max_tries ? 
entry->ha : NULL, + dev->dev_addr, NULL); + arp_unlock(); return; } /* - * Arp request timed out. Delete entry and all waiting packets. - * If we give each entry a pointer to itself, we don't have to - * loop through everything again. Maybe hash is good enough, but - * I will look at it later. + * The host is really dead. */ + arp_purge_send_q(entry); + + cli(); + if (arp_count_hhs(entry)) + { + /* + * The host is dead, but someone refers to it. + * It is useless to drop this entry just now, + * it will be born again, so that + * we keep it, but slow down retransmitting + * to ARP_DEAD_RES_TIME. + */ + + struct device *dev = entry->dev; +#if RT_CACHE_DEBUG >= 2 + printk("arp_expire_request: %08x is dead\n", entry->ip); +#endif + entry->retries = sysctl_arp_max_tries; + entry->flags &= ~ATF_COM; + arp_invalidate_hhs(entry); + restore_flags(flags); + + /* + * Declare the entry dead. + */ + entry->last_updated = 0; + arpd_update(entry); + + entry->timer.expires = jiffies + sysctl_arp_dead_res_time; + add_timer(&entry->timer); + arp_send(ARPOP_REQUEST, ETH_P_ARP, entry->ip, dev, dev->pa_addr, + NULL, dev->dev_addr, NULL); + arp_unlock(); + return; + } + restore_flags(flags); + + entry->last_updated = 0; + arpd_update(entry); + hash = HASH(entry->ip); - /* proxy entries shouldn't really time out so this is really - only here for completeness - */ - if (entry->flags & ATF_PUBL) - pentry = &arp_tables[PROXY_HASH]; - else - pentry = &arp_tables[hash]; + pentry = &arp_tables[hash]; + while (*pentry != NULL) { - if (*pentry == entry) + if (*pentry != entry) { - *pentry = entry->next; /* delete from linked list */ - del_timer(&entry->timer); - restore_flags(flags); - arp_release_entry(entry); - arp_cache_stamp++; - return; + pentry = &(*pentry)->next; + continue; } - pentry = &(*pentry)->next; + *pentry = entry->next; +#if RT_CACHE_DEBUG >= 2 + printk("arp_expire_request: %08x is killed\n", entry->ip); +#endif + arp_free_entry(entry); } - restore_flags(flags); - 
printk("Possible ARP queue corruption.\n"); - /* - * We should never arrive here. - */ + arp_unlock(); +} + + +/* + * Allocate memory for a new entry. If we are at the maximum limit + * of the internal ARP cache, arp_force_expire() an entry. NOTE: + * arp_force_expire() needs the cache to be locked, so therefore + * arp_alloc_entry() should only be called with the cache locked too! + */ + +static struct arp_table * arp_alloc_entry(void) +{ + struct arp_table * entry; + + + if (arp_size >= ARP_MAXSIZE) + arp_force_expire(); + + entry = (struct arp_table *) + kmalloc(sizeof(struct arp_table),GFP_ATOMIC); + + if (entry != NULL) + { + atomic_inc(&arp_size); + memset(entry, 0, sizeof(struct arp_table)); + + entry->mask = DEF_ARP_NETMASK; + init_timer(&entry->timer); + entry->timer.function = arp_expire_request; + entry->timer.data = (unsigned long)entry; + entry->last_updated = entry->last_used = jiffies; + skb_queue_head_init(&entry->skb); + } + return entry; } + +/* + * Purge a device from the ARP queue + */ + +int arp_device_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct device *dev=ptr; + int i; + + if (event != NETDEV_DOWN) + return NOTIFY_DONE; + +#ifdef CONFIG_ARPD + arpd_flush(dev); + arpd_stamp++; +#endif + + arp_fast_lock(); +#if RT_CACHE_DEBUG >= 1 + if (ARP_LOCKED()) + printk("arp_device_event: impossible\n"); +#endif + + for (i = 0; i < FULL_ARP_TABLE_SIZE; i++) + { + struct arp_table *entry; + struct arp_table **pentry = &arp_tables[i]; + + while ((entry = *pentry) != NULL) + { + if (entry->dev == dev) + { + *pentry = entry->next; /* remove from list */ + arp_free_entry(entry); + } + else + pentry = &entry->next; /* go to next entry */ + } + } + arp_unlock(); + return NOTIFY_DONE; +} + + + /* * This will try to retransmit everything on the queue. 
*/ -static void arp_send_q(struct arp_table *entry, unsigned char *hw_dest) +static void arp_send_q(struct arp_table *entry) { struct sk_buff *skb; @@ -472,8 +1113,12 @@ static void arp_send_q(struct arp_table *entry, unsigned char *hw_dest) if(!(entry->flags&ATF_COM)) { - printk("arp_send_q: incomplete entry for %s\n", + printk(KERN_ERR "arp_send_q: incomplete entry for %s\n", in_ntoa(entry->ip)); + /* Can't flush the skb, because RFC1122 says to hang on to */ + /* at least one from any unresolved entry. --MS */ + /* What's happened is that someone has 'unresolved' the entry + as we got to use it - this 'can't happen' -- AC */ return; } @@ -493,69 +1138,607 @@ static void arp_send_q(struct arp_table *entry, unsigned char *hw_dest) else dev_queue_xmit(skb,skb->dev,skb->sk->priority); } + cli(); + } + restore_flags(flags); +} + + +static int +arp_update (u32 sip, char *sha, struct device * dev, + unsigned long updated, struct arp_table *ientry, int grat) +{ + struct arp_table * entry; + unsigned long hash; + int do_arpd = 0; + + if (updated == 0) + { + updated = jiffies; + do_arpd = 1; + } + + hash = HASH(sip); + + for (entry=arp_tables[hash]; entry; entry = entry->next) + if (entry->ip == sip && entry->dev == dev) + break; + + if (entry) + { +/* + * Entry found; update it only if it is not a permanent entry. + */ + if (!(entry->flags & ATF_PERM)) + { + del_timer(&entry->timer); + entry->last_updated = updated; + if (memcmp(entry->ha, sha, dev->addr_len)!=0) + { + memcpy(entry->ha, sha, dev->addr_len); + if (entry->flags & ATF_COM) + arp_update_hhs(entry); + } + if (do_arpd) + arpd_update(entry); + } + + if (!(entry->flags & ATF_COM)) + { +/* + * This entry was incomplete. Delete the retransmit timer + * and switch to complete status. + */ + entry->flags |= ATF_COM; + arp_update_hhs(entry); +/* + * Send out waiting packets. We might have problems, if someone is + * manually removing entries right now -- entry might become invalid + * underneath us. 
+ */ + arp_send_q(entry); + } + return 1; + } + +/* + * No entry found. Need to add a new entry to the arp table. + */ + entry = ientry; + + if (grat && !entry) + return 0; + + if (!entry) + { + entry = arp_alloc_entry(); + if (!entry) + return 0; + + entry->ip = sip; + entry->flags = ATF_COM; + memcpy(entry->ha, sha, dev->addr_len); + entry->dev = dev; + } + + entry->last_updated = updated; + entry->last_used = jiffies; + if (do_arpd) + arpd_update(entry); + + if (!ARP_LOCKED()) + { + entry->next = arp_tables[hash]; + arp_tables[hash] = entry; + return 0; + } +#if RT_CACHE_DEBUG >= 2 + printk("arp_update: %08x backlogged\n", entry->ip); +#endif + arp_enqueue(&arp_backlog, entry); + arp_bh_mask |= ARP_BH_BACKLOG; + return 0; +} + + + +static __inline__ struct arp_table *arp_lookup(u32 paddr, struct device * dev) +{ + struct arp_table *entry; + + for (entry = arp_tables[HASH(paddr)]; entry != NULL; entry = entry->next) + if (entry->ip == paddr && (!dev || entry->dev == dev)) + return entry; + return NULL; +} + +/* + * Find an arp mapping in the cache. If not found, return false. 
+ */ + +int arp_query(unsigned char *haddr, u32 paddr, struct device * dev) +{ + struct arp_table *entry; + + arp_fast_lock(); + + entry = arp_lookup(paddr, dev); + + if (entry != NULL) + { + entry->last_used = jiffies; + if (entry->flags & ATF_COM) + { + memcpy(haddr, entry->ha, dev->addr_len); + arp_unlock(); + return 1; + } + } + arp_unlock(); + return 0; +} + + +static int arp_set_predefined(int addr_hint, unsigned char * haddr, u32 paddr, struct device * dev) +{ + switch (addr_hint) + { + case IS_MYADDR: + printk(KERN_DEBUG "ARP: arp called for own IP address\n"); + memcpy(haddr, dev->dev_addr, dev->addr_len); + return 1; +#ifdef CONFIG_IP_MULTICAST + case IS_MULTICAST: + if(dev->type==ARPHRD_ETHER || dev->type==ARPHRD_IEEE802) + { + u32 taddr; + haddr[0]=0x01; + haddr[1]=0x00; + haddr[2]=0x5e; + taddr=ntohl(paddr); + haddr[5]=taddr&0xff; + taddr=taddr>>8; + haddr[4]=taddr&0xff; + taddr=taddr>>8; + haddr[3]=taddr&0x7f; + return 1; + } + /* + * If a device does not support multicast broadcast the stuff (eg AX.25 for now) + */ +#endif + + case IS_BROADCAST: + memcpy(haddr, dev->broadcast, dev->addr_len); + return 1; + } + return 0; +} + +/* + * Create a new unresolved entry. 
+ */ + +struct arp_table * arp_new_entry(u32 paddr, struct device *dev, struct hh_cache *hh, struct sk_buff *skb) +{ + struct arp_table *entry; + + entry = arp_alloc_entry(); + + if (entry != NULL) + { + entry->ip = paddr; + entry->dev = dev; + if (hh) + { + entry->hh = hh; + atomic_inc(&hh->hh_refcnt); + hh->hh_arp = (void*)entry; + } + entry->timer.expires = jiffies + sysctl_arp_res_time; + + if (skb != NULL) + { + skb_queue_tail(&entry->skb, skb); + skb_device_unlock(skb); + } + + if (!ARP_LOCKED()) + { + unsigned long hash = HASH(paddr); + entry->next = arp_tables[hash]; + arp_tables[hash] = entry; + add_timer(&entry->timer); + entry->retries = sysctl_arp_max_tries; +#ifdef CONFIG_ARPD + if (!arpd_not_running) + arpd_lookup(paddr, dev); + else +#endif + arp_send(ARPOP_REQUEST, ETH_P_ARP, paddr, dev, dev->pa_addr, NULL, + dev->dev_addr, NULL); + } else { - /* This routine is only ever called when 'entry' is - complete. Thus this can't fail. */ - printk("arp_send_q: The impossible occurred. Please notify Alan.\n"); - printk("arp_send_q: active entity %s\n",in_ntoa(entry->ip)); - printk("arp_send_q: failed to find %s\n",in_ntoa(skb->raddr)); +#if RT_CACHE_DEBUG >= 2 + printk("arp_new_entry: %08x backlogged\n", entry->ip); +#endif + arp_enqueue(&arp_req_backlog, entry); + arp_bh_mask |= ARP_BH_BACKLOG; } } - restore_flags(flags); + return entry; } /* - * Delete an ARP mapping entry in the cache. + * Find an arp mapping in the cache. If not found, post a request. 
*/ -void arp_destroy(unsigned long ip_addr, int force) +int arp_find(unsigned char *haddr, u32 paddr, struct device *dev, + u32 saddr, struct sk_buff *skb) { - int checked_proxies = 0; struct arp_table *entry; - struct arp_table **pentry; - unsigned long hash = HASH(ip_addr); + unsigned long hash; + + if (arp_set_predefined(ip_chk_addr(paddr), haddr, paddr, dev)) + { + if (skb) + skb->arp = 1; + return 0; + } + + hash = HASH(paddr); + arp_fast_lock(); + + /* + * Find an entry + */ + entry = arp_lookup(paddr, dev); + + if (entry != NULL) /* It exists */ + { + if (entry->flags & ATF_COM) + { + entry->last_used = jiffies; + memcpy(haddr, entry->ha, dev->addr_len); + if (skb) + skb->arp = 1; + arp_unlock(); + return 0; + } + + /* + * A request was already sent, but no reply yet. Thus + * queue the packet with the previous attempt + */ + + if (skb != NULL) + { + if (entry->last_updated) + { + skb_queue_tail(&entry->skb, skb); + skb_device_unlock(skb); + } + /* + * If last_updated==0 host is dead, so + * drop skb's and set socket error. + */ + else + { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, dev); + dev_kfree_skb(skb, FREE_WRITE); + } + } + arp_unlock(); + return 1; + } -ugly: + entry = arp_new_entry(paddr, dev, NULL, skb); + + if (skb != NULL && !entry) + dev_kfree_skb(skb, FREE_WRITE); + + arp_unlock(); + return 1; +} + +/* + * Binding hardware header cache entry. + * It is the only really complicated part of arp code. + * We have no locking for hh records, so that + * all possible race conditions should be resolved by + * cli()/sti() pairs. + * + * Important note: hhs never disappear from lists, if ARP_LOCKED, + * this fact allows to scan hh lists with enabled interrupts, + * but results in generating duplicate hh entries. + * It is harmless. (and I've never seen such event) + * + * Returns 0, if hh has been just created, so that + * caller should fill it. 
+ */ + +int arp_bind_cache(struct hh_cache ** hhp, struct device *dev, unsigned short htype, u32 paddr) +{ + struct arp_table *entry; + struct hh_cache *hh; + int addr_hint; + unsigned long flags; + + save_flags(flags); + + if ((addr_hint = ip_chk_addr(paddr)) != 0) + { + unsigned char haddr[MAX_ADDR_LEN]; + if (*hhp) + return 1; + hh = arp_alloc_hh(htype); + if (!hh) + return 1; + arp_set_predefined(addr_hint, haddr, paddr, dev); + dev->header_cache_update(hh, dev, haddr); + return arp_set_hh(hhp, hh); + } + + arp_fast_lock(); + + entry = arp_lookup(paddr, dev); + + if (entry) + { + for (hh = entry->hh; hh; hh=hh->hh_next) + if (hh->hh_type == htype) + break; + + if (hh) + { + arp_set_hh(hhp, hh); + arp_unlock(); + return 1; + } + } + + hh = arp_alloc_hh(htype); + if (!hh) + { + arp_unlock(); + return 1; + } + + if (entry) + { + + cli(); + hh->hh_arp = (void*)entry; + hh->hh_next = entry->hh; + entry->hh = hh; + atomic_inc(&hh->hh_refcnt); + restore_flags(flags); + + if (entry->flags & ATF_COM) + dev->header_cache_update(hh, dev, entry->ha); + + if (arp_set_hh(hhp, hh)) + { + arp_unlock(); + return 0; + } + + entry->last_used = jiffies; + arp_unlock(); + return 0; + } + + entry = arp_new_entry(paddr, dev, hh, NULL); + if (entry == NULL) + { + kfree_s(hh, sizeof(struct hh_cache)); + arp_unlock(); + return 1; + } + + if (!arp_set_hh(hhp, hh)) + { + arp_unlock(); + return 0; + } + arp_unlock(); + return 1; +} + +static void arp_run_bh() +{ + unsigned long flags; + struct arp_table *entry, *entry1; + struct device * dev; + unsigned long hash; + struct hh_cache *hh; + u32 sip; + + save_flags(flags); cli(); - pentry = &arp_tables[hash]; - if (! 
*pentry) /* also check proxy entries */ - pentry = &arp_tables[PROXY_HASH]; + arp_fast_lock(); - while ((entry = *pentry) != NULL) + while (arp_bh_mask) { - if (entry->ip == ip_addr) + arp_bh_mask &= ~ARP_BH_BACKLOG; + + while ((entry = arp_dequeue(&arp_backlog)) != NULL) { - if ((entry->flags & ATF_PERM) && !force) - return; - *pentry = entry->next; - del_timer(&entry->timer); - sti(); - arp_release_entry(entry); - /* this would have to be cleaned up */ - goto ugly; - /* perhaps like this ? + restore_flags(flags); + if (arp_update(entry->ip, entry->ha, entry->dev, 0, entry, 0)) + arp_free_entry(entry); cli(); - entry = *pentry; - */ } - pentry = &entry->next; - if (!checked_proxies && ! *pentry) - { /* ugly. we have to make sure we check proxy - entries as well */ - checked_proxies = 1; - pentry = &arp_tables[PROXY_HASH]; - } + + cli(); + while ((entry = arp_dequeue(&arp_req_backlog)) != NULL) + { + restore_flags(flags); + + dev = entry->dev; + sip = entry->ip; + hash = HASH(sip); + + for (entry1 = arp_tables[hash]; entry1; entry1 = entry1->next) + if (entry1->ip == sip && entry1->dev == dev) + break; + + if (!entry1) + { + cli(); + entry->next = arp_tables[hash]; + arp_tables[hash] = entry; + restore_flags(flags); + entry->timer.expires = jiffies + sysctl_arp_res_time; + entry->retries = sysctl_arp_max_tries; + entry->last_used = jiffies; + if (!(entry->flags & ATF_COM)) + { + add_timer(&entry->timer); +#ifdef CONFIG_ARPD + if (!arpd_not_running) + arpd_lookup(sip, dev); + else +#endif + arp_send(ARPOP_REQUEST, ETH_P_ARP, sip, dev, dev->pa_addr, NULL, dev->dev_addr, NULL); + } +#if RT_CACHE_DEBUG >= 1 + printk(KERN_DEBUG "arp_run_bh: %08x reinstalled\n", sip); +#endif + } + else + { + struct sk_buff * skb; + struct hh_cache * next; + + /* Discard entry, but preserve its hh's and + * skb's. 
+ */ + cli(); + for (hh=entry->hh; hh; hh=next) + { + next = hh->hh_next; + hh->hh_next = entry1->hh; + entry1->hh = hh; + hh->hh_arp = (void*)entry1; + } + entry->hh = NULL; + + /* Prune skb list from entry + * and graft it to entry1. + */ + while ((skb = skb_dequeue(&entry->skb)) != NULL) + { + skb_device_lock(skb); + restore_flags(flags); + skb_queue_tail(&entry1->skb, skb); + skb_device_unlock(skb); + cli(); + } + restore_flags(flags); + + arp_free_entry(entry); + + if (entry1->flags & ATF_COM) + { + arp_update_hhs(entry1); + arp_send_q(entry1); + } + } + cli(); + } + cli(); } - sti(); + arp_unlock(); + restore_flags(flags); } /* - * Receive an arp request by the device layer. Maybe I rewrite it, to - * use the incoming packet for the reply. The time for the current - * "overhead" isn't that high... + * Interface to link layer: send routine and receive handler. + */ + +/* + * Create and send an arp packet. If (dest_hw == NULL), we create a broadcast + * message. + */ + +void arp_send(int type, int ptype, u32 dest_ip, + struct device *dev, u32 src_ip, + unsigned char *dest_hw, unsigned char *src_hw, + unsigned char *target_hw) +{ + struct sk_buff *skb; + struct arphdr *arp; + unsigned char *arp_ptr; + + /* + * No arp on this interface. + */ + + if (dev->flags&IFF_NOARP) + return; + + /* + * Allocate a buffer + */ + + skb = alloc_skb(sizeof(struct arphdr)+ 2*(dev->addr_len+4) + + dev->hard_header_len, GFP_ATOMIC); + if (skb == NULL) + { + printk(KERN_DEBUG "ARP: no memory to send an arp packet\n"); + return; + } + skb_reserve(skb, dev->hard_header_len); + arp = (struct arphdr *) skb_put(skb,sizeof(struct arphdr) + 2*(dev->addr_len+4)); + skb->arp = 1; + skb->dev = dev; + skb->free = 1; + skb->protocol = htons (ETH_P_IP); + + /* + * Fill the device header for the ARP frame + */ + + dev->hard_header(skb,dev,ptype,dest_hw?dest_hw:dev->broadcast,src_hw?src_hw:NULL,skb->len); + + /* Fill out the arp protocol part. 
*/ + arp->ar_hrd = htons(dev->type); +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) + arp->ar_pro = (dev->type == ARPHRD_AX25 || dev->type == ARPHRD_NETROM) ? htons(AX25_P_IP) : htons(ETH_P_IP); +#else + arp->ar_pro = (dev->type != ARPHRD_AX25) ? htons(ETH_P_IP) : htons(AX25_P_IP); +#endif +#else + arp->ar_pro = htons(ETH_P_IP); +#endif + arp->ar_hln = dev->addr_len; + arp->ar_pln = 4; + arp->ar_op = htons(type); + + arp_ptr=(unsigned char *)(arp+1); + + memcpy(arp_ptr, src_hw, dev->addr_len); + arp_ptr+=dev->addr_len; + memcpy(arp_ptr, &src_ip,4); + arp_ptr+=4; + if (target_hw != NULL) + memcpy(arp_ptr, target_hw, dev->addr_len); + else + memset(arp_ptr, 0, dev->addr_len); + arp_ptr+=dev->addr_len; + memcpy(arp_ptr, &dest_ip, 4); + + dev_queue_xmit(skb, dev, 0); +} + + +/* + * Receive an arp request by the device layer. */ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) @@ -566,14 +1749,9 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) struct arphdr *arp = (struct arphdr *)skb->h.raw; unsigned char *arp_ptr= (unsigned char *)(arp+1); - struct arp_table *entry; - struct arp_table *proxy_entry; - int addr_hint,hlen,htype; - unsigned long hash; - unsigned char ha[MAX_ADDR_LEN]; /* So we can enable ints again. */ - long sip,tip; unsigned char *sha,*tha; - + u32 sip,tip; + /* * The hardware length of the packet should match the hardware length * of the device. Similarly, the hardware types should match. The @@ -588,6 +1766,8 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) { kfree_skb(skb, FREE_READ); return 0; + /* Should this be an error/printk? Seems like something */ + /* you'd want to know about. Unless it's just !IFF_NOARP. -- MS */ } /* @@ -596,9 +1776,11 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) * match the protocol the device speaks. 
If it doesn't, there is a * problem, so toss the packet. */ - switch(dev->type) +/* Again, should this be an error/printk? -- MS */ + + switch (dev->type) { -#ifdef CONFIG_AX25 +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) case ARPHRD_AX25: if(arp->ar_pro != htons(AX25_P_IP)) { @@ -607,7 +1789,7 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) } break; #endif -#ifdef CONFIG_NETROM +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) case ARPHRD_NETROM: if(arp->ar_pro != htons(AX25_P_IP)) { @@ -618,6 +1800,7 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) #endif case ARPHRD_ETHER: case ARPHRD_ARCNET: + case ARPHRD_METRICOM: if(arp->ar_pro != htons(ETH_P_IP)) { kfree_skb(skb, FREE_READ); @@ -634,7 +1817,7 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) break; default: - printk("ARP: dev->type mangled!\n"); + printk(KERN_ERR "ARP: dev->type mangled!\n"); kfree_skb(skb, FREE_READ); return 0; } @@ -643,21 +1826,19 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) * Extract fields */ - hlen = dev->addr_len; - htype = dev->type; - sha=arp_ptr; - arp_ptr+=hlen; - memcpy(&sip,arp_ptr,4); - arp_ptr+=4; + arp_ptr += dev->addr_len; + memcpy(&sip, arp_ptr, 4); + arp_ptr += 4; tha=arp_ptr; - arp_ptr+=hlen; - memcpy(&tip,arp_ptr,4); + arp_ptr += dev->addr_len; + memcpy(&tip, arp_ptr, 4); /* - * Check for bad requests for 127.0.0.1. If this is one such, delete it. + * Check for bad requests for 127.x.x.x and requests for multicast + * addresses. If this is one such, delete it. */ - if(tip == INADDR_LOOPBACK) + if (LOOPBACK(tip) || MULTICAST(tip)) { kfree_skb(skb, FREE_READ); return 0; @@ -680,291 +1861,420 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) * cache. */ - addr_hint = ip_chk_addr(tip); +/* + * try to switch to alias device whose addr is tip or closest to sip. 
+ */ - if(arp->ar_op == htons(ARPOP_REPLY)) +#ifdef CONFIG_NET_ALIAS + if (tip != dev->pa_addr && net_alias_has(skb->dev)) { - if(addr_hint!=IS_MYADDR) + /* + * net_alias_dev_rcv_sel32 returns main dev if it fails to found other. + */ + dev = net_alias_dev_rcv_sel32(dev, AF_INET, sip, tip); + + if (dev->type != ntohs(arp->ar_hrd) || dev->flags & IFF_NOARP) { -/* - * Replies to other machines get tossed. - */ kfree_skb(skb, FREE_READ); return 0; } -/* - * Fall through to code below that adds sender to cache. - */ } - else +#endif + + if (arp->ar_op == htons(ARPOP_REQUEST)) { -/* - * It is now an arp request - */ + /* * Only reply for the real device address or when it's in our proxy tables */ - if(tip!=dev->pa_addr) + if (tip != dev->pa_addr) { + struct arp_table *proxy_entry; + /* * To get in here, it is a request for someone else. We need to * check if that someone else is one of our proxies. If it isn't, * we can toss it. + * + * Make "longest match" lookup, a la routing. */ - cli(); - for(proxy_entry=arp_tables[PROXY_HASH]; - proxy_entry; - proxy_entry = proxy_entry->next) - { - /* we will respond to a proxy arp request - if the masked arp table ip matches the masked - tip. This allows a single proxy arp table - entry to be used on a gateway machine to handle - all requests for a whole network, rather than - having to use a huge number of proxy arp entries - and having to keep them uptodate. 
- */ - if (proxy_entry->dev != dev && proxy_entry->htype == htype && - !((proxy_entry->ip^tip)&proxy_entry->mask)) - break; - } - if (proxy_entry) + arp_fast_lock(); + + for (proxy_entry = arp_proxy_list; proxy_entry; + proxy_entry = proxy_entry->next) { - memcpy(ha, proxy_entry->ha, hlen); - sti(); - arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,ha); - kfree_skb(skb, FREE_READ); - return 0; + if (proxy_entry->dev == dev && + !((proxy_entry->ip^tip)&proxy_entry->mask)) + break; } - else + + if (proxy_entry && (proxy_entry->mask || ((dev->pa_addr^tip)&dev->pa_mask))) { - sti(); - kfree_skb(skb, FREE_READ); - return 0; + char ha[MAX_ADDR_LEN]; + struct rtable * rt; + + /* Unlock arp tables to make life for + * ip_rt_route easy. Note, that we are obliged + * to make local copy of hardware address. + */ + + memcpy(ha, proxy_entry->ha, dev->addr_len); + arp_unlock(); + + rt = ip_rt_route(tip, 0); + if (rt && rt->rt_dev != dev) + arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,ha,sha); + ip_rt_put(rt); + } + else + arp_unlock(); } else - { -/* - * To get here, it must be an arp request for us. We need to reply. - */ - arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr); - } + arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); + } + arp_fast_lock(); + arp_update(sip, sha, dev, 0, NULL, ip_chk_addr(tip) != IS_MYADDR && dev->type != ARPHRD_METRICOM); + arp_unlock(); + kfree_skb(skb, FREE_READ); + return 0; +} -/* - * Now all replies are handled. Next, anything that falls through to here - * needs to be added to the arp cache, or have its entry updated if it is - * there. - */ - hash = HASH(sip); - cli(); - for(entry=arp_tables[hash];entry;entry=entry->next) - if(entry->ip==sip && entry->htype==htype) - break; - if(entry) - { /* - * Entry found; update it. 
+ * User level interface (ioctl, /proc) */ - memcpy(entry->ha, sha, hlen); - entry->hlen = hlen; - entry->last_used = jiffies; - if (!(entry->flags & ATF_COM)) - { + /* - * This entry was incomplete. Delete the retransmit timer - * and switch to complete status. - */ - del_timer(&entry->timer); - entry->flags |= ATF_COM; - sti(); -/* - * Send out waiting packets. We might have problems, if someone is - * manually removing entries right now -- entry might become invalid - * underneath us. + * Set (create) an ARP cache entry. */ - arp_send_q(entry, sha); + +static int arp_req_set(struct arpreq *r, struct device * dev) +{ + struct arp_table *entry, **entryp; + struct sockaddr_in *si; + unsigned char *ha; + u32 ip; + u32 mask = DEF_ARP_NETMASK; + unsigned long flags; + + /* + * Extract netmask (if supplied). + */ + + if (r->arp_flags&ATF_NETMASK) + { + si = (struct sockaddr_in *) &r->arp_netmask; + mask = si->sin_addr.s_addr; + } + + /* + * Extract destination. + */ + + si = (struct sockaddr_in *) &r->arp_pa; + ip = si->sin_addr.s_addr; + + + if (r->arp_flags&ATF_PUBL) + { + if (!mask && ip) + return -EINVAL; + if (!dev) { + dev = dev_getbytype(r->arp_ha.sa_family); + if (!dev) + return -ENODEV; } - else + } + else + { + if (!dev) { - sti(); + struct rtable * rt; + rt = ip_rt_route(ip, 0); + if (!rt) + return -ENETUNREACH; + dev = rt->rt_dev; + ip_rt_put(rt); + if (!dev) + return -ENODEV; } + if (dev->type != ARPHRD_METRICOM && ip_chk_addr(ip)) + return -EINVAL; } + if (dev->flags & (IFF_LOOPBACK | IFF_NOARP)) + return -ENODEV; + + if (r->arp_ha.sa_family != dev->type) + return -EINVAL; + + arp_fast_lock(); +#if RT_CACHE_DEBUG >= 1 + if (ARP_LOCKED()) + printk("arp_req_set: bug\n"); +#endif + + if (!(r->arp_flags & ATF_PUBL)) + entryp = &arp_tables[HASH(ip)]; else + entryp = &arp_proxy_list; + + while ((entry = *entryp) != NULL) { -/* - * No entry found. Need to add a new entry to the arp table. 
- */ - entry = (struct arp_table *)kmalloc(sizeof(struct arp_table),GFP_ATOMIC); - if(entry == NULL) - { - sti(); - printk("ARP: no memory for new arp entry\n"); + /* User supplied arp entries are definitive - RHP 960603 */ - kfree_skb(skb, FREE_READ); - return 0; + if (entry->ip == ip && entry->mask == mask && entry->dev == dev) { + *entryp=entry->next; + arp_free_entry(entry); + continue; } + if ((entry->mask & mask) != mask) + break; + entryp = &entry->next; + } - entry->mask = DEF_ARP_NETMASK; - entry->ip = sip; - entry->hlen = hlen; - entry->htype = htype; - entry->flags = ATF_COM; - init_timer(&entry->timer); - memcpy(entry->ha, sha, hlen); - entry->last_used = jiffies; - entry->dev = skb->dev; - skb_queue_head_init(&entry->skb); - entry->next = arp_tables[hash]; - arp_tables[hash] = entry; - sti(); + entry = arp_alloc_entry(); + if (entry == NULL) + { + arp_unlock(); + return -ENOMEM; } + entry->ip = ip; + entry->dev = dev; + entry->mask = mask; + entry->flags = r->arp_flags; -/* - * Replies have been sent, and entries have been added. All done. - */ - kfree_skb(skb, FREE_READ); + entry->next = *entryp; + *entryp = entry; + + ha = r->arp_ha.sa_data; + if (empty(ha, dev->addr_len)) + ha = dev->dev_addr; + + save_flags(flags); + cli(); + memcpy(entry->ha, ha, dev->addr_len); + entry->last_updated = entry->last_used = jiffies; + entry->flags |= ATF_COM; + restore_flags(flags); + arpd_update(entry); + arp_update_hhs(entry); + arp_unlock(); return 0; } + /* - * Find an arp mapping in the cache. If not found, post a request. + * Get an ARP cache entry. 
*/ -int arp_find(unsigned char *haddr, unsigned long paddr, struct device *dev, - unsigned long saddr, struct sk_buff *skb) +static int arp_req_get(struct arpreq *r, struct device *dev) { struct arp_table *entry; - unsigned long hash; -#ifdef CONFIG_IP_MULTICAST - unsigned long taddr; -#endif + struct sockaddr_in *si; + u32 mask = DEF_ARP_NETMASK; - switch (ip_chk_addr(paddr)) + if (r->arp_flags&ATF_NETMASK) { - case IS_MYADDR: - printk("ARP: arp called for own IP address\n"); - memcpy(haddr, dev->dev_addr, dev->addr_len); - skb->arp = 1; - return 0; -#ifdef CONFIG_IP_MULTICAST - case IS_MULTICAST: - if(dev->type==ARPHRD_ETHER || dev->type==ARPHRD_IEEE802) - { - haddr[0]=0x01; - haddr[1]=0x00; - haddr[2]=0x5e; - taddr=ntohl(paddr); - haddr[5]=taddr&0xff; - taddr=taddr>>8; - haddr[4]=taddr&0xff; - taddr=taddr>>8; - haddr[3]=taddr&0x7f; - return 0; - } - /* - * If a device does not support multicast broadcast the stuff (eg AX.25 for now) - */ -#endif - - case IS_BROADCAST: - memcpy(haddr, dev->broadcast, dev->addr_len); - skb->arp = 1; - return 0; + si = (struct sockaddr_in *) &r->arp_netmask; + mask = si->sin_addr.s_addr; } - hash = HASH(paddr); - cli(); + si = (struct sockaddr_in *) &r->arp_pa; - /* - * Find an entry - */ - entry = arp_lookup(paddr, PROXY_NONE); + arp_fast_lock(); +#if RT_CACHE_DEBUG >= 1 + if (ARP_LOCKED()) + printk("arp_req_set: impossible\n"); +#endif - if (entry != NULL) /* It exists */ + if (!(r->arp_flags & ATF_PUBL)) + entry = arp_tables[HASH(si->sin_addr.s_addr)]; + else + entry = arp_proxy_list; + + for ( ; entry ;entry = entry->next) { - if (!(entry->flags & ATF_COM)) - { - /* - * A request was already send, but no reply yet. 
Thus - * queue the packet with the previous attempt - */ - - if (skb != NULL) - { - skb_queue_tail(&entry->skb, skb); - skb_device_unlock(skb); - } - sti(); - return 1; + if (entry->ip == si->sin_addr.s_addr + && (!dev || entry->dev == dev) + && (!(r->arp_flags&ATF_NETMASK) || entry->mask == mask)) + { + memcpy(r->arp_ha.sa_data, entry->ha, entry->dev->addr_len); + r->arp_ha.sa_family = entry->dev->type; + r->arp_flags = entry->flags; + strncpy(r->arp_dev, entry->dev->name, sizeof(r->arp_dev)); + arp_unlock(); + return 0; } + } - /* - * Update the record - */ - - entry->last_used = jiffies; - memcpy(haddr, entry->ha, dev->addr_len); - if (skb) - skb->arp = 1; - sti(); - return 0; + arp_unlock(); + return -ENXIO; +} + +static int arp_req_delete(struct arpreq *r, struct device * dev) +{ + struct sockaddr_in *si; + struct arp_table *entry, **entryp; + int retval = -ENXIO; + u32 mask = DEF_ARP_NETMASK; + + if (r->arp_flags&ATF_NETMASK) + { + si = (struct sockaddr_in *) &r->arp_netmask; + mask = si->sin_addr.s_addr; } - /* - * Create a new unresolved entry. 
- */ - - entry = (struct arp_table *) kmalloc(sizeof(struct arp_table), - GFP_ATOMIC); - if (entry != NULL) + si = (struct sockaddr_in *) &r->arp_pa; + + arp_fast_lock(); +#if RT_CACHE_DEBUG >= 1 + if (ARP_LOCKED()) + printk("arp_req_delete: impossible\n"); +#endif + + if (!(r->arp_flags & ATF_PUBL)) + entryp = &arp_tables[HASH(si->sin_addr.s_addr)]; + else + entryp = &arp_proxy_list; + + while ((entry = *entryp) != NULL) { - entry->next = arp_tables[hash]; - entry->last_used = jiffies; - entry->flags = 0; - entry->ip = paddr; - entry->mask = DEF_ARP_NETMASK; - memset(entry->ha, 0, dev->addr_len); - entry->hlen = dev->addr_len; - entry->htype = dev->type; - entry->dev = dev; - init_timer(&entry->timer); - entry->timer.function = arp_expire_request; - entry->timer.data = (unsigned long)entry; - entry->timer.expires = ARP_RES_TIME; - arp_tables[hash] = entry; - add_timer(&entry->timer); - entry->retries = ARP_MAX_TRIES; - skb_queue_head_init(&entry->skb); - if (skb != NULL) + if (entry->ip == si->sin_addr.s_addr + && (!dev || entry->dev == dev) + && (!(r->arp_flags&ATF_NETMASK) || entry->mask == mask)) { - skb_queue_tail(&entry->skb, skb); - skb_device_unlock(skb); + *entryp = entry->next; + arp_free_entry(entry); + retval = 0; + continue; } + entryp = &entry->next; } - else + + arp_unlock(); + return retval; +} + +/* + * Handle an ARP layer I/O control request. 
+ */ + +int arp_ioctl(unsigned int cmd, void *arg) +{ + int err; + struct arpreq r; + + struct device * dev = NULL; + + switch(cmd) { - if (skb != NULL && skb->free) - kfree_skb(skb, FREE_WRITE); - } - sti(); + case SIOCDARP: + case SIOCSARP: + if (!suser()) + return -EPERM; + case SIOCGARP: + err = copy_from_user(&r, arg, sizeof(struct arpreq)); + if (err) + return -EFAULT; + break; + case OLD_SIOCDARP: + case OLD_SIOCSARP: + if (!suser()) + return -EPERM; + case OLD_SIOCGARP: + err = copy_from_user(&r, arg, sizeof(struct arpreq_old)); + if (err) + return -EFAULT; + memset(&r.arp_dev, 0, sizeof(r.arp_dev)); + break; + default: + return -EINVAL; + } - /* - * If we didn't find an entry, we will try to send an ARP packet. - */ - - arp_send(ARPOP_REQUEST, ETH_P_ARP, paddr, dev, saddr, NULL, - dev->dev_addr); + if (r.arp_pa.sa_family != AF_INET) + return -EPFNOSUPPORT; - return 1; -} + if (!(r.arp_flags & ATF_PUBL)) + r.arp_flags &= ~ATF_NETMASK; + if (!(r.arp_flags & ATF_NETMASK)) + ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr=DEF_ARP_NETMASK; + + if (r.arp_dev[0]) + { + if ((dev = dev_get(r.arp_dev)) == NULL) + return -ENODEV; + if (!r.arp_ha.sa_family) + r.arp_ha.sa_family = dev->type; + else if (r.arp_ha.sa_family != dev->type) + return -EINVAL; + } + + switch(cmd) + { + case SIOCDARP: + return arp_req_delete(&r, dev); + case SIOCSARP: + return arp_req_set(&r, dev); + case OLD_SIOCDARP: + /* old SIOCDARP destroys both + * normal and proxy mappings + */ + r.arp_flags &= ~ATF_PUBL; + err = arp_req_delete(&r, dev); + r.arp_flags |= ATF_PUBL; + if (!err) + arp_req_delete(&r, dev); + else + err = arp_req_delete(&r, dev); + return err; + case OLD_SIOCSARP: + err = arp_req_set(&r, dev); + /* old SIOCSARP works so funny, + * that its behaviour can be emulated + * only approximately 8). + * It should work. 
--ANK + */ + if (r.arp_flags & ATF_PUBL) + { + r.arp_flags &= ~ATF_PUBL; + arp_req_delete(&r, dev); + } + return err; + case SIOCGARP: + err = arp_req_get(&r, dev); + if (!err) + { + err = copy_to_user(arg, &r, sizeof(r)); + if (err) + err = -EFAULT; + } + return err; + case OLD_SIOCGARP: + r.arp_flags &= ~ATF_PUBL; + err = arp_req_get(&r, dev); + if (err < 0) + { + r.arp_flags |= ATF_PUBL; + err = arp_req_get(&r, dev); + } + if (!err) + { + err = copy_to_user(arg, &r, sizeof(struct arpreq_old)); + if (err) + err = -EFAULT; + } + return err; + } + /*NOTREACHED*/ + return 0; +} /* * Write the contents of the ARP cache to a PROCfs file. @@ -972,10 +2282,9 @@ int arp_find(unsigned char *haddr, unsigned long paddr, struct device *dev, #define HBUFFERLEN 30 -int arp_get_info(char *buffer, char **start, off_t offset, int length) +int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy) { int len=0; - off_t begin=0; off_t pos=0; int size; struct arp_table *entry; @@ -983,12 +2292,13 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length) int i,j,k; const char hexbuf[] = "0123456789ABCDEF"; - size = sprintf(buffer,"IP address HW type Flags HW address Mask\n"); + size = sprintf(buffer,"IP address HW type Flags HW address Mask Device\n"); pos+=size; len+=size; - - cli(); + + arp_fast_lock(); + for(i=0; i<FULL_ARP_TABLE_SIZE; i++) { for(entry=arp_tables[i]; entry!=NULL; entry=entry->next) @@ -996,19 +2306,19 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length) /* * Convert hardware address to XX:XX:XX:XX ... form. 
*/ -#ifdef CONFIG_AX25 -#ifdef CONFIG_NETROM - if (entry->htype == ARPHRD_AX25 || entry->htype == ARPHRD_NETROM) +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) + if (entry->dev->type == ARPHRD_AX25 || entry->dev->type == ARPHRD_NETROM) strcpy(hbuffer,ax2asc((ax25_address *)entry->ha)); else { #else - if(entry->htype==ARPHRD_AX25) + if(entry->dev->type==ARPHRD_AX25) strcpy(hbuffer,ax2asc((ax25_address *)entry->ha)); else { #endif #endif - for(k=0,j=0;k<HBUFFERLEN-3 && j<entry->hlen;j++) + for(k=0,j=0;k<HBUFFERLEN-3 && j<entry->dev->addr_len;j++) { hbuffer[k++]=hexbuf[ (entry->ha[j]>>4)&15 ]; hbuffer[k++]=hexbuf[ entry->ha[j]&15 ]; @@ -1016,349 +2326,49 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length) } hbuffer[--k]=0; -#ifdef CONFIG_AX25 +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) } #endif size = sprintf(buffer+len, "%-17s0x%-10x0x%-10x%s", in_ntoa(entry->ip), - (unsigned int)entry->htype, + (unsigned int)entry->dev->type, entry->flags, hbuffer); +#if RT_CACHE_DEBUG < 2 + size += sprintf(buffer+len+size, + " %-17s %s\n", + entry->mask==DEF_ARP_NETMASK ? + "*" : in_ntoa(entry->mask), entry->dev->name); +#else size += sprintf(buffer+len+size, - " %-17s\n", - entry->mask==DEF_ARP_NETMASK? - "*":in_ntoa(entry->mask)); + " %-17s %s\t%d\t%1d\n", + entry->mask==DEF_ARP_NETMASK ? + "*" : in_ntoa(entry->mask), entry->dev->name, + entry->hh ? entry->hh->hh_refcnt : -1, + entry->hh ? 
entry->hh->hh_uptodate : 0); +#endif - len+=size; - pos=begin+len; + len += size; + pos += size; - if(pos<offset) - { + if (pos <= offset) len=0; - begin=pos; - } - if(pos>offset+length) - break; + if (pos >= offset+length) + goto done; } } - sti(); +done: + arp_unlock(); - *start=buffer+(offset-begin); /* Start of wanted data */ - len-=(offset-begin); /* Start slop */ - if(len>length) - len=length; /* Ending slop */ + *start = buffer+len-(pos-offset); /* Start of wanted data */ + len = pos-offset; /* Start slop */ + if (len>length) + len = length; /* Ending slop */ return len; } -/* - * This will find an entry in the ARP table by looking at the IP address. - * If proxy is PROXY_EXACT then only exact IP matches will be allowed - * for proxy entries, otherwise the netmask will be used - */ - -static struct arp_table *arp_lookup(unsigned long paddr, enum proxy proxy) -{ - struct arp_table *entry; - unsigned long hash = HASH(paddr); - - for (entry = arp_tables[hash]; entry != NULL; entry = entry->next) - if (entry->ip == paddr) break; - - /* it's possibly a proxy entry (with a netmask) */ - if (!entry && proxy != PROXY_NONE) - for (entry=arp_tables[PROXY_HASH]; entry != NULL; entry = entry->next) - if ((proxy==PROXY_EXACT) ? (entry->ip==paddr) - : !((entry->ip^paddr)&entry->mask)) - break; - - return entry; -} - - -int arp_find_cache(unsigned char *dp, unsigned long daddr, struct device *dev) -{ - /* - * We need the broadcast/multicast awareness here and the find routine split up. 
- */ - struct arp_table *entry; -#ifdef CONFIG_IP_MULTICAST - unsigned long taddr; -#endif - - switch (ip_chk_addr(daddr)) - { - case IS_MYADDR: - printk("ARP: arp called for own IP address\n"); - memcpy(dp, dev->dev_addr, dev->addr_len); - return 1; -#ifdef CONFIG_IP_MULTICAST - case IS_MULTICAST: - if(dev->type==ARPHRD_ETHER || dev->type==ARPHRD_IEEE802) - { - dp[0]=0x01; - dp[1]=0x00; - dp[2]=0x5e; - taddr=ntohl(daddr); - dp[5]=taddr&0xff; - taddr=taddr>>8; - dp[4]=taddr&0xff; - taddr=taddr>>8; - dp[3]=taddr&0x7f; - return 1; - } - /* - * If a device does not support multicast broadcast the stuff (eg AX.25 for now) - */ -#endif - - case IS_BROADCAST: - memcpy(dp, dev->broadcast, dev->addr_len); - return 1; - - default: - entry=arp_lookup(daddr, PROXY_NONE); - if(entry) - { - memcpy(dp,entry->ha, ETH_ALEN); - return 1; - } - } - return 0; -} - -/* - * Set (create) an ARP cache entry. - */ - -static int arp_req_set(struct arpreq *req) -{ - struct arpreq r; - struct arp_table *entry; - struct sockaddr_in *si; - int htype, hlen; - unsigned long ip; - struct rtable *rt; - - memcpy_fromfs(&r, req, sizeof(r)); - - /* We only understand about IP addresses... */ - if (r.arp_pa.sa_family != AF_INET) - return -EPFNOSUPPORT; - - /* - * Find out about the hardware type. - * We have to be compatible with BSD UNIX, so we have to - * assume that a "not set" value (i.e. 0) means Ethernet. 
- */ - - switch (r.arp_ha.sa_family) { - case ARPHRD_ETHER: - htype = ARPHRD_ETHER; - hlen = ETH_ALEN; - break; - - case ARPHRD_ARCNET: - htype = ARPHRD_ARCNET; - hlen = 1; /* length of arcnet addresses */ - break; - -#ifdef CONFIG_AX25 - case ARPHRD_AX25: - htype = ARPHRD_AX25; - hlen = 7; - break; -#endif -#ifdef CONFIG_NETROM - case ARPHRD_NETROM: - htype = ARPHRD_NETROM; - hlen = 7; - break; -#endif - case ARPHRD_IEEE802: - htype = ARPHRD_IEEE802; - hlen = TR_ALEN; - break; - default: - return -EPFNOSUPPORT; - } - - si = (struct sockaddr_in *) &r.arp_pa; - ip = si->sin_addr.s_addr; - if (ip == 0) - { - printk("ARP: SETARP: requested PA is 0.0.0.0 !\n"); - return -EINVAL; - } - - /* - * Is it reachable directly ? - */ - - rt = ip_rt_route(ip, NULL, NULL); - if (rt == NULL) - return -ENETUNREACH; - - /* - * Is there an existing entry for this address? - */ - - cli(); - - /* - * Find the entry - */ - entry = arp_lookup(ip, PROXY_EXACT); - if (entry && (entry->flags & ATF_PUBL) != (r.arp_flags & ATF_PUBL)) - { - sti(); - arp_destroy(ip,1); - cli(); - entry = NULL; - } - - /* - * Do we need to create a new entry - */ - - if (entry == NULL) - { - unsigned long hash = HASH(ip); - if (r.arp_flags & ATF_PUBL) - hash = PROXY_HASH; - - entry = (struct arp_table *) kmalloc(sizeof(struct arp_table), - GFP_ATOMIC); - if (entry == NULL) - { - sti(); - return -ENOMEM; - } - entry->ip = ip; - entry->hlen = hlen; - entry->htype = htype; - init_timer(&entry->timer); - entry->next = arp_tables[hash]; - arp_tables[hash] = entry; - skb_queue_head_init(&entry->skb); - } - /* - * We now have a pointer to an ARP entry. Update it! 
- */ - - memcpy(&entry->ha, &r.arp_ha.sa_data, hlen); - entry->last_used = jiffies; - entry->flags = r.arp_flags | ATF_COM; - if ((entry->flags & ATF_PUBL) && (entry->flags & ATF_NETMASK)) - { - si = (struct sockaddr_in *) &r.arp_netmask; - entry->mask = si->sin_addr.s_addr; - } - else - entry->mask = DEF_ARP_NETMASK; - entry->dev = rt->rt_dev; - arp_cache_stamp++; - sti(); - - return 0; -} - - -/* - * Get an ARP cache entry. - */ - -static int arp_req_get(struct arpreq *req) -{ - struct arpreq r; - struct arp_table *entry; - struct sockaddr_in *si; - - /* - * We only understand about IP addresses... - */ - - memcpy_fromfs(&r, req, sizeof(r)); - - if (r.arp_pa.sa_family != AF_INET) - return -EPFNOSUPPORT; - - /* - * Is there an existing entry for this address? - */ - - si = (struct sockaddr_in *) &r.arp_pa; - cli(); - entry = arp_lookup(si->sin_addr.s_addr,PROXY_ANY); - - if (entry == NULL) - { - sti(); - return -ENXIO; - } - - /* - * We found it; copy into structure. - */ - - memcpy(r.arp_ha.sa_data, &entry->ha, entry->hlen); - r.arp_ha.sa_family = entry->htype; - r.arp_flags = entry->flags; - sti(); - - /* - * Copy the information back - */ - - memcpy_tofs(req, &r, sizeof(r)); - return 0; -} - - -/* - * Handle an ARP layer I/O control request. 
- */ - -int arp_ioctl(unsigned int cmd, void *arg) -{ - struct arpreq r; - struct sockaddr_in *si; - int err; - - switch(cmd) - { - case SIOCDARP: - if (!suser()) - return -EPERM; - err = verify_area(VERIFY_READ, arg, sizeof(struct arpreq)); - if(err) - return err; - memcpy_fromfs(&r, arg, sizeof(r)); - if (r.arp_pa.sa_family != AF_INET) - return -EPFNOSUPPORT; - si = (struct sockaddr_in *) &r.arp_pa; - arp_destroy(si->sin_addr.s_addr, 1); - return 0; - case SIOCGARP: - err = verify_area(VERIFY_WRITE, arg, sizeof(struct arpreq)); - if(err) - return err; - return arp_req_get((struct arpreq *)arg); - case SIOCSARP: - if (!suser()) - return -EPERM; - err = verify_area(VERIFY_READ, arg, sizeof(struct arpreq)); - if(err) - return err; - return arp_req_set((struct arpreq *)arg); - default: - return -EINVAL; - } - /*NOTREACHED*/ - return 0; -} - /* * Called once on startup. @@ -1379,6 +2389,15 @@ static struct notifier_block arp_dev_notifier={ 0 }; +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry proc_net_arp = { + PROC_NET_ARP, 3, "arp", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + arp_get_info +}; +#endif + void arp_init (void) { /* Register the packet type */ @@ -1388,5 +2407,48 @@ void arp_init (void) add_timer(&arp_timer); /* Register for device down reports */ register_netdevice_notifier(&arp_dev_notifier); + +#ifdef CONFIG_PROC_FS + proc_net_register(&proc_net_arp); +#endif + +#ifdef CONFIG_ARPD + netlink_attach(NETLINK_ARPD, arpd_callback); +#endif +} + +#ifdef CONFIG_AX25_MODULE + +/* + * ax25 -> ascii conversion + */ +char *ax2asc(ax25_address *a) +{ + static char buf[11]; + char c, *s; + int n; + + for (n = 0, s = buf; n < 6; n++) { + c = (a->ax25_call[n] >> 1) & 0x7F; + + if (c != ' ') *s++ = c; + } + + *s++ = '-'; + + if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) { + *s++ = '1'; + n -= 10; + } + + *s++ = n + '0'; + *s++ = '\0'; + + if (*buf == '\0' || *buf == '-') + return "*"; + + return buf; + } +#endif diff --git 
a/net/ipv4/checksum.c b/net/ipv4/checksum.c deleted file mode 100644 index 59355e967..000000000 --- a/net/ipv4/checksum.c +++ /dev/null @@ -1,276 +0,0 @@ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * IP/TCP/UDP checksumming routines - * - * Authors: Jorge Cwik, <jorge@laser.satlink.net> - * Arnt Gulbrandsen, <agulbra@nvg.unit.no> - * Lots of code moved from tcp.c and ip.c; see those files - * for more names. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <net/checksum.h> -#ifdef __mips__ -#include <asm/string.h> -#endif - -/* - * computes a partial checksum, e.g. for TCP/UDP fragments - */ - -unsigned int csum_partial(unsigned char * buff, int len, unsigned int sum) { -#ifdef __i386__ - __asm__(" - movl %%ecx, %%edx - cld - shrl $5, %%ecx - jz 2f - orl %%ecx, %%ecx -1: movl (%%esi), %%eax - adcl %%eax, %%ebx - movl 4(%%esi), %%eax - adcl %%eax, %%ebx - movl 8(%%esi), %%eax - adcl %%eax, %%ebx - movl 12(%%esi), %%eax - adcl %%eax, %%ebx - movl 16(%%esi), %%eax - adcl %%eax, %%ebx - movl 20(%%esi), %%eax - adcl %%eax, %%ebx - movl 24(%%esi), %%eax - adcl %%eax, %%ebx - movl 28(%%esi), %%eax - adcl %%eax, %%ebx - lea 32(%%esi), %%esi - dec %%ecx - jne 1b - adcl $0, %%ebx -2: movl %%edx, %%ecx - andl $28, %%ecx - je 4f - shrl $2, %%ecx - orl %%ecx, %%ecx -3: adcl (%%esi), %%ebx - lea 4(%%esi), %%esi - dec %%ecx - jne 3b - adcl $0, %%ebx -4: movl $0, %%eax - testw $2, %%dx - je 5f - lodsw - addl %%eax, %%ebx - adcl $0, %%ebx - movw $0, %%ax -5: test $1, %%edx - je 6f - lodsb - addl %%eax, %%ebx - adcl $0, %%ebx -6: " - : "=b"(sum) - : "0"(sum), "c"(len), "S"(buff) - : "ax", "bx", 
"cx", "dx", "si" ); -#elif defined (__mips__) - unsigned long scratch1; - unsigned long scratch2; - - __asm__(" - .set noreorder - .set noat - move %1,%4 - srl %1,%1,5 - beqz %1,2f - sll %1,%1,5 # delay slot - - addu %1,%5 -1: lw %2,0(%5) - addu %5,32 - addu %0,%2 - sltu $1,%0,%2 - - lw %2,-28(%5) - addu %0,$1 - addu %0,%2 - sltu $1,%0,%2 - - lw %2,-24(%5) - addu %0,$1 - addu %0,%2 - sltu $1,%0,%2 - - lw %2,-20(%5) - addu %0,$1 - addu %0,%2 - sltu $1,%0,%2 - - lw %2,-16(%5) - addu %0,$1 - addu %0,%2 - sltu $1,%0,%2 - - lw %2,-12(%5) - addu %0,$1 - addu %0,%2 - sltu $1,%0,%2 - - lw %2,-8(%5) - addu %0,$1 - addu %0,%2 - sltu $1,%0,%2 - - lw %2,-4(%5) - addu %0,$1 - addu %0,%2 - sltu $1,%0,%2 - - bne %5,%1,1b - addu %0,$1 # delay slot - -2: srl %1,%4,2 - bnez %1,4f - addu %1,%5 # delay slot -3: lw %2,0(%5) - addu %5,4 - addu %0,%2 - sltu $1,%0,%2 - bne %5,%1,3b - addu %0,$1 # delay slot - -4: andi $1,%4,2 - beqz %4,5f - lhu %2,0(%5) # delay slot - addu %5,2 - addu %0,%2 - sltu $1,%0,%2 - addu %0,$1 # delay slot - -5: andi $1,%4,1 - beqz %4,6f - lbu %2,0(%5) # delay slot - addu %0,%2 - sltu $1,%0,%2 - addu %0,$1 # delay slot -6: .set at - .set reorder" - : "=r"(sum), "=r" (scratch1), "=r" (scratch2) - : "0"(sum), "r"(len), "r"(buff) - : "$1"); -#else -#error Not implemented for this CPU -#endif - return(sum); -} - - - -/* - * copy from fs while checksumming, otherwise like csum_partial - */ - -unsigned int csum_partial_copyffs( char *src, char *dst, - int len, int sum) { -#ifdef __i386__ - __asm__(" - push %%ds - push %%es - movw %%ds, %%dx - movw %%dx, %%es - movw %%fs, %%dx - movw %%dx, %%ds - cld - cmpl $32, %%ecx - jb 2f - pushl %%ecx - shrl $5, %%ecx - orl %%ecx, %%ecx -1: movl (%%esi), %%eax - movl 4(%%esi), %%edx - adcl %%eax, %%ebx - movl %%eax, %%es:(%%edi) - adcl %%edx, %%ebx - movl %%edx, %%es:4(%%edi) - - movl 8(%%esi), %%eax - movl 12(%%esi), %%edx - adcl %%eax, %%ebx - movl %%eax, %%es:8(%%edi) - adcl %%edx, %%ebx - movl %%edx, %%es:12(%%edi) - - movl 
16(%%esi), %%eax - movl 20(%%esi), %%edx - adcl %%eax, %%ebx - movl %%eax, %%es:16(%%edi) - adcl %%edx, %%ebx - movl %%edx, %%es:20(%%edi) - - movl 24(%%esi), %%eax - movl 28(%%esi), %%edx - adcl %%eax, %%ebx - movl %%eax, %%es:24(%%edi) - adcl %%edx, %%ebx - movl %%edx, %%es:28(%%edi) - - lea 32(%%esi), %%esi - lea 32(%%edi), %%edi - dec %%ecx - jne 1b - adcl $0, %%ebx - popl %%ecx -2: movl %%ecx, %%edx - andl $28, %%ecx - je 4f - shrl $2, %%ecx - orl %%ecx, %%ecx -3: movl (%%esi), %%eax - adcl %%eax, %%ebx - movl %%eax, %%es:(%%edi) - lea 4(%%esi), %%esi - lea 4(%%edi), %%edi - dec %%ecx - jne 3b - adcl $0, %%ebx -4: movl $0, %%eax - testl $2, %%edx - je 5f - lodsw - stosw - addl %%eax, %%ebx - movw $0, %%ax - adcl %%eax, %%ebx -5: test $1, %%edx - je 6f - lodsb - stosb - addl %%eax, %%ebx - adcl $0, %%ebx -6: pop %%es - pop %%ds - " - : "=b"(sum) - : "0"(sum), "c"(len), "S"(src), "D"(dst) - : "ax", "bx", "cx", "dx", "si", "di" ); -#elif defined (__mips__) - /* - * It's 2:30 am and I don't feel like doing it real ... - * This is lots slower than the real thing (tm) - */ - sum = csum_partial(src, len, sum); - memcpy(dst, src, len); -#else -#error Not implemented for this CPU -#endif - return(sum); -} - - - diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 794a7e897..0c2d70cae 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -14,8 +14,10 @@ * Additional Authors: * Alan Cox, <gw4pts@gw4pts.ampr.org> */ + +#include <linux/config.h> /* For CONFIG_IP_CLASSLESS */ -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <asm/bitops.h> #include <linux/types.h> @@ -76,7 +78,9 @@ unsigned long ip_get_mask(unsigned long addr) int ip_chk_addr(unsigned long addr) { struct device *dev; +#ifndef CONFIG_IP_CLASSLESS unsigned long mask; +#endif /* * Accept both `all ones' and `all zeros' as BROADCAST. 
@@ -90,6 +94,7 @@ int ip_chk_addr(unsigned long addr) addr == htonl(0x7FFFFFFFL)) return IS_BROADCAST; +#ifndef CONFIG_IP_CLASSLESS mask = ip_get_mask(addr); /* @@ -98,6 +103,10 @@ int ip_chk_addr(unsigned long addr) if ((addr & mask) == htonl(0x7F000000L)) return IS_MYADDR; +#else + if ((addr & htonl(0x7F000000L)) == htonl(0x7F000000L)) + return IS_MYADDR; +#endif /* * OK, now check the interface addresses. We could @@ -106,14 +115,14 @@ int ip_chk_addr(unsigned long addr) for (dev = dev_base; dev != NULL; dev = dev->next) { - if (!(dev->flags & IFF_UP)) + if ((!(dev->flags & IFF_UP)) || dev->family!=AF_INET) continue; /* * If the protocol address of the device is 0 this is special * and means we are address hunting (eg bootp). */ - if ((dev->pa_addr == 0)/* || (dev->flags&IFF_PROMISC)*/) + if (dev->pa_addr == 0) return IS_MYADDR; /* * Is it the exact IP address? @@ -139,6 +148,7 @@ int ip_chk_addr(unsigned long addr) return IS_BROADCAST; } +#ifndef CONFIG_IP_CLASSLESS /* * Nope. Check for Network broadcast. */ @@ -150,6 +160,7 @@ int ip_chk_addr(unsigned long addr) if ((addr & ~mask) == ~mask) return IS_BROADCAST; } +#endif } if(IN_MULTICAST(ntohl(addr))) return IS_MULTICAST; @@ -181,35 +192,60 @@ unsigned long ip_my_addr(void) /* * Find an interface that can handle addresses for a certain address. - * - * This needs optimising, since it's relatively trivial to collapse - * the two loops into one. 
*/ - -struct device * ip_dev_check(unsigned long addr) + +struct device * ip_dev_bynet(unsigned long addr, unsigned long mask) { struct device *dev; + struct device *best_dev = NULL; + __u32 best_mask = mask; for (dev = dev_base; dev; dev = dev->next) { if (!(dev->flags & IFF_UP)) continue; - if (!(dev->flags & IFF_POINTOPOINT)) - continue; - if (addr != dev->pa_dstaddr) - continue; - return dev; - } - for (dev = dev_base; dev; dev = dev->next) - { - if (!(dev->flags & IFF_UP)) - continue; if (dev->flags & IFF_POINTOPOINT) + { + if (addr == dev->pa_dstaddr) + return dev; continue; + } if (dev->pa_mask & (addr ^ dev->pa_addr)) continue; - return dev; + if (mask == dev->pa_mask) + return dev; + if (best_dev && (best_mask & dev->pa_mask) != best_mask) + continue; + best_dev = dev; + best_mask = dev->pa_mask; + } + return best_dev; +} + +/* + * Find the first device with a given source address. + */ + +struct device *ip_dev_find(unsigned long addr) +{ + struct device *dev; + for(dev = dev_base; dev; dev=dev->next) + { + if((dev->flags&IFF_UP) && dev->pa_addr==addr) + return dev; } return NULL; } +struct device *dev_getbytype(unsigned short type) +{ + struct device *dev; + + for (dev = dev_base; dev != NULL; dev = dev->next) + { + if (dev->type == type && !(dev->flags&(IFF_LOOPBACK|IFF_NOARP))) + return(dev); + } + return(NULL); +} + diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 7c1eea15d..787f69d7f 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1,45 +1,239 @@ /* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. + * NET3: Implementation of the ICMP protocol layer. 
+ * + * Alan Cox, <alan@cymru.net> * - * Internet Control Message Protocol (ICMP) + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. * - * Version: @(#)icmp.c 1.0.11 06/02/93 + * Some of the function names and the icmp unreach table for this + * module were derived from [icmp.c 1.0.11 06/02/93] by + * Ross Biro, Fred N. van Kempen, Mark Evans, Alan Cox, Gerhard Koerting. + * Other than that this module is a complete rewrite. * - * Authors: Ross Biro, <bir7@leland.Stanford.Edu> - * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> - * Mark Evans, <evansmp@uhura.aston.ac.uk> - * Alan Cox, <gw4pts@gw4pts.ampr.org> - * Stefan Becker, <stefanb@yello.ping.de> + * Fixes: + * Mike Shaver : RFC1122 checks. + * Alan Cox : Multicast ping reply as self. + * Alan Cox : Fix atomicity lockup in ip_build_xmit + * call. + * Alan Cox : Added 216,128 byte paths to the MTU + * code. + * Martin Mares : RFC1812 checks. + * Martin Mares : Can be configured to follow redirects + * if acting as a router _without_ a + * routing protocol (RFC 1812). + * Martin Mares : Echo requests may be configured to + * be ignored (RFC 1812). + * Martin Mares : Limitation of ICMP error message + * transmit rate (RFC 1812). + * Martin Mares : TOS and Precedence set correctly + * (RFC 1812). + * Martin Mares : Now copying as much data from the + * original packet as we can without + * exceeding 576 bytes (RFC 1812). + * Willy Konynenberg : Transparent proxying support. + * Keith Owens : RFC1191 correction for 4.2BSD based + * path MTU bug. + * Thomas Quinot : ICMP Dest Unreach codes up to 15 are + * valid (RFC 1812). * - * Fixes: - * Alan Cox : Generic queue usage. 
- * Gerhard Koerting: ICMP addressing corrected - * Alan Cox : Use tos/ttl settings - * Alan Cox : Protocol violations - * Alan Cox : SNMP Statistics - * Alan Cox : Routing errors - * Alan Cox : Changes for newer routing code - * Alan Cox : Removed old debugging junk - * Alan Cox : Fixed the ICMP error status of net/host unreachable - * Gerhard Koerting : Fixed broadcast ping properly - * Ulrich Kunitz : Fixed ICMP timestamp reply - * A.N.Kuznetsov : Multihoming fixes. - * Laco Rusnak : Multihoming fixes. - * Alan Cox : Tightened up icmp_send(). - * Alan Cox : Multicasts. - * Stefan Becker : ICMP redirects in icmp_send(). - * Peter Belding : Tightened up ICMP redirect handling - * Alan Cox : Tightened even more. * - * + * RFC1122 (Host Requirements -- Comm. Layer) Status: + * (boy, are there a lot of rules for ICMP) + * 3.2.2 (Generic ICMP stuff) + * MUST discard messages of unknown type. (OK) + * MUST copy at least the first 8 bytes from the offending packet + * when sending ICMP errors. (OBSOLETE -- see RFC1812) + * MUST pass received ICMP errors up to protocol level. (OK) + * SHOULD send ICMP errors with TOS == 0. (OBSOLETE -- see RFC1812) + * MUST NOT send ICMP errors in reply to: + * ICMP errors (OK) + * Broadcast/multicast datagrams (OK) + * MAC broadcasts (OK) + * Non-initial fragments (OK) + * Datagram with a source address that isn't a single host. (OK) + * 3.2.2.1 (Destination Unreachable) + * All the rules govern the IP layer, and are dealt with in ip.c, not here. + * 3.2.2.2 (Redirect) + * Host SHOULD NOT send ICMP_REDIRECTs. (OK) + * MUST update routing table in response to host or network redirects. + * (host OK, network OBSOLETE) + * SHOULD drop redirects if they're not from directly connected gateway + * (OK -- we drop it if it's not from our old gateway, which is close + * enough) + * 3.2.2.3 (Source Quench) + * MUST pass incoming SOURCE_QUENCHs to transport layer (OK) + * Other requirements are dealt with at the transport layer. 
+ * 3.2.2.4 (Time Exceeded) + * MUST pass TIME_EXCEEDED to transport layer (OK) + * Other requirements dealt with at IP (generating TIME_EXCEEDED). + * 3.2.2.5 (Parameter Problem) + * SHOULD generate these (OK) + * MUST pass received PARAMPROBLEM to transport layer (NOT YET) + * [Solaris 2.X seems to assert EPROTO when this occurs] -- AC + * 3.2.2.6 (Echo Request/Reply) + * MUST reply to ECHO_REQUEST, and give app to do ECHO stuff (OK, OK) + * MAY discard broadcast ECHO_REQUESTs. (We don't, but that's OK.) + * MUST reply using same source address as the request was sent to. + * We're OK for unicast ECHOs, and it doesn't say anything about + * how to handle broadcast ones, since it's optional. + * MUST copy data from REQUEST to REPLY (OK) + * unless it would require illegal fragmentation (OK) + * MUST pass REPLYs to transport/user layer (OK) + * MUST use any provided source route (reversed) for REPLY. (NOT YET) + * 3.2.2.7 (Information Request/Reply) + * MUST NOT implement this. (I guess that means silently discard...?) (OK) + * 3.2.2.8 (Timestamp Request/Reply) + * MAY implement (OK) + * SHOULD be in-kernel for "minimum variability" (OK) + * MAY discard broadcast REQUESTs. (OK, but see source for inconsistency) + * MUST reply using same source address as the request was sent to. (OK) + * MUST reverse source route, as per ECHO (NOT YET) + * MUST pass REPLYs to transport/user layer (requires RAW, just like + * ECHO) (OK) + * MUST update clock for timestamp at least 15 times/sec (OK) + * MUST be "correct within a few minutes" (OK) + * 3.2.2.9 (Address Mask Request/Reply) + * MAY implement (OK) + * MUST send a broadcast REQUEST if using this system to set netmask + * (OK... we don't use it) + * MUST discard received REPLYs if not using this system (OK) + * MUST NOT send replies unless specifically made agent for this sort + * of thing. 
(OK) * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. + * + * RFC 1812 (IPv4 Router Requirements) Status (even longer): + * 4.3.2.1 (Unknown Message Types) + * MUST pass messages of unknown type to ICMP user iface or silently discard + * them (OK) + * 4.3.2.2 (ICMP Message TTL) + * MUST initialize TTL when originating an ICMP message (OK) + * 4.3.2.3 (Original Message Header) + * SHOULD copy as much data from the offending packet as possible without + * the length of the ICMP datagram exceeding 576 bytes (OK) + * MUST leave original IP header of the offending packet, but we're not + * required to undo modifications made (OK) + * 4.3.2.4 (Original Message Source Address) + * MUST use one of addresses for the interface the orig. packet arrived as + * source address (OK) + * 4.3.2.5 (TOS and Precedence) + * SHOULD leave TOS set to the same value unless the packet would be + * discarded for that reason (OK) + * MUST use TOS=0 if not possible to leave original value (OK) + * MUST leave IP Precedence for Source Quench messages (OK -- not sent + * at all) + * SHOULD use IP Precedence = 6 (Internetwork Control) or 7 (Network Control) + * for all other error messages (OK, we use 6) + * MAY allow configuration of IP Precedence (OK -- not done) + * MUST leave IP Precedence and TOS for reply messages (OK) + * 4.3.2.6 (Source Route) + * SHOULD use reverse source route UNLESS sending Parameter Problem on source + * routing and UNLESS the packet would be immediately discarded (NOT YET) + * 4.3.2.7 (When Not to Send ICMP Errors) + * MUST NOT send ICMP errors in reply to: + * ICMP errors (OK) + * Packets failing IP header validation tests unless otherwise noted (OK) + * Broadcast/multicast datagrams (OK) + * MAC broadcasts (OK) + * Non-initial fragments (OK) + * Datagram with 
a source address that isn't a single host. (OK) + * 4.3.2.8 (Rate Limiting) + * SHOULD be able to limit error message rate (OK) + * SHOULD allow setting of rate limits (OK, in the source) + * 4.3.3.1 (Destination Unreachable) + * All the rules govern the IP layer, and are dealt with in ip.c, not here. + * 4.3.3.2 (Redirect) + * MAY ignore ICMP Redirects if running a routing protocol or if forwarding + * is enabled on the interface (OK -- ignores) + * 4.3.3.3 (Source Quench) + * SHOULD NOT originate SQ messages (OK) + * MUST be able to limit SQ rate if originates them (OK as we don't + * send them) + * MAY ignore SQ messages it receives (OK -- we don't) + * 4.3.3.4 (Time Exceeded) + * Requirements dealt with at IP (generating TIME_EXCEEDED). + * 4.3.3.5 (Parameter Problem) + * MUST generate these for all errors not covered by other messages (OK) + * MUST include original value of the value pointed by (OK) + * 4.3.3.6 (Echo Request) + * MUST implement echo server function (OK) + * MUST process at ER of at least max(576, MTU) (OK) + * MAY reject broadcast/multicast ER's (We don't, but that's OK) + * SHOULD have a config option for silently ignoring ER's (OK) + * MUST have a default value for the above switch = NO (OK) + * MUST have application layer interface for Echo Request/Reply (OK) + * MUST reply using same source address as the request was sent to. + * We're OK for unicast ECHOs, and it doesn't say anything about + * how to handle broadcast ones, since it's optional. + * MUST copy data from Request to Reply (OK) + * SHOULD update Record Route / Timestamp options (??) + * MUST use reversed Source Route for Reply if possible (NOT YET) + * 4.3.3.7 (Information Request/Reply) + * SHOULD NOT originate or respond to these (OK) + * 4.3.3.8 (Timestamp / Timestamp Reply) + * MAY implement (OK) + * MUST reply to every Timestamp message received (OK) + * MAY discard broadcast REQUESTs. 
(OK, but see source for inconsistency) + * MUST reply using same source address as the request was sent to. (OK) + * MUST use reversed Source Route if possible (NOT YET) + * SHOULD update Record Route / Timestamp options (??) + * MUST pass REPLYs to transport/user layer (requires RAW, just like + * ECHO) (OK) + * MUST update clock for timestamp at least 16 times/sec (OK) + * MUST be "correct within a few minutes" (OK) + * 4.3.3.9 (Address Mask Request/Reply) + * MUST have support for receiving AMRq and responding with AMRe (OK, + * but only as a compile-time option) + * SHOULD have option for each interface for AMRe's, MUST default to + * NO (NOT YET) + * MUST NOT reply to AMRq before knows the correct AM (OK) + * MUST NOT respond to AMRq with source address 0.0.0.0 on physical + * interfaces having multiple logical i-faces with different masks + * (NOT YET) + * SHOULD examine all AMRe's it receives and check them (NOT YET) + * SHOULD log invalid AMRe's (AM+sender) (NOT YET) + * MUST NOT use contents of AMRe to determine correct AM (OK) + * MAY broadcast AMRe's after having configured address masks (OK -- doesn't) + * MUST NOT do broadcast AMRe's if not set by extra option (OK, no option) + * MUST use the { <NetPrefix>, -1 } form of broadcast addresses (OK) + * 4.3.3.10 (Router Advertisement and Solicitations) + * MUST support router part of Router Discovery Protocol on all networks we + * support broadcast or multicast addressing. 
(OK -- done by gated) + * MUST have all config parameters with the respective defaults (OK) + * 5.2.7.1 (Destination Unreachable) + * MUST generate DU's (OK) + * SHOULD choose a best-match response code (OK) + * SHOULD NOT generate Host Isolated codes (OK) + * SHOULD use Communication Administratively Prohibited when administratively + * filtering packets (NOT YET -- bug-to-bug compatibility) + * MAY include config option for not generating the above and silently + * discard the packets instead (OK) + * MAY include config option for not generating Precedence Violation and + * Precedence Cutoff messages (OK as we don't generate them at all) + * MUST use Host Unreachable or Dest. Host Unknown codes whenever other hosts + * on the same network might be reachable (OK -- no net unreach's at all) + * MUST use new form of Fragmentation Needed and DF Set messages (OK) + * 5.2.7.2 (Redirect) + * MUST NOT generate network redirects (OK) + * MUST be able to generate host redirects (OK) + * SHOULD be able to generate Host+TOS redirects (NO as we don't use TOS) + * MUST have an option to use Host redirects instead of Host+TOS ones (OK as + * no Host+TOS Redirects are used) + * MUST NOT generate redirects unless forwarding to the same i-face and the + * dest. address is on the same subnet as the src. address and no source + * routing is in use. 
(OK) + * MUST NOT follow redirects when using a routing protocol (OK) + * MAY use redirects if not using a routing protocol (OK, compile-time option) + * MUST comply to Host Requirements when not acting as a router (OK) + * 5.2.7.3 (Time Exceeded) + * MUST generate Time Exceeded Code 0 when discarding packet due to TTL=0 (OK) + * MAY have a per-interface option to disable origination of TE messages, but + * it MUST default to "originate" (OK -- we don't support it) */ + +#include <linux/config.h> #include <linux/types.h> #include <linux/sched.h> #include <linux/kernel.h> @@ -55,27 +249,27 @@ #include <net/protocol.h> #include <net/icmp.h> #include <net/tcp.h> +#include <net/udp.h> #include <net/snmp.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/errno.h> #include <linux/timer.h> #include <asm/system.h> -#include <asm/segment.h> +#include <asm/uaccess.h> #include <net/checksum.h> - #define min(a,b) ((a)<(b)?(a):(b)) - /* * Statistics */ -struct icmp_mib icmp_statistics={0,}; - +struct icmp_mib icmp_statistics; /* An array of errno for error messages from dest unreach. */ +/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOS_UNREACH and SR_FAIELD MUST be considered 'transient errs'. */ + struct icmp_err icmp_err_convert[] = { { ENETUNREACH, 0 }, /* ICMP_NET_UNREACH */ { EHOSTUNREACH, 0 }, /* ICMP_HOST_UNREACH */ @@ -89,227 +283,478 @@ struct icmp_err icmp_err_convert[] = { { ENETUNREACH, 1 }, /* ICMP_NET_ANO */ { EHOSTUNREACH, 1 }, /* ICMP_HOST_ANO */ { EOPNOTSUPP, 0 }, /* ICMP_NET_UNR_TOS */ - { EOPNOTSUPP, 0 } /* ICMP_HOST_UNR_TOS */ + { EOPNOTSUPP, 0 }, /* ICMP_HOST_UNR_TOS */ + { EOPNOTSUPP, 1 }, /* ICMP_PKT_FILTERED */ + { EOPNOTSUPP, 1 }, /* ICMP_PREC_VIOLATION */ + { EOPNOTSUPP, 1 } /* ICMP_PREC_CUTOFF */ }; +/* + * A spare long used to speed up statistics updating + */ + +unsigned long dummy; /* - * Send an ICMP message in response to a situation + * ICMP transmit rate limit control structures. 
We use a relatively simple + * approach to the problem: For each type of ICMP message with rate limit + * we count the number of messages sent during some time quantum. If this + * count exceeds given maximal value, we ignore all messages not separated + * from the last message sent at least by specified time. + */ + +#define XRLIM_CACHE_SIZE 16 /* How many destination hosts do we cache */ + +struct icmp_xrl_cache /* One entry of the ICMP rate cache */ +{ + __u32 daddr; /* Destination address */ + unsigned long counter; /* Message counter */ + unsigned long next_reset; /* Time of next reset of the counter */ + unsigned long last_access; /* Time of last access to this entry (LRU) */ + unsigned int restricted; /* Set if we're in restricted mode */ + unsigned long next_packet; /* When we'll allow a next packet if restricted */ +}; + +struct icmp_xrlim +{ + unsigned long timeout; /* Time quantum for rate measuring */ + unsigned long limit; /* Maximal number of messages per time quantum allowed */ + unsigned long delay; /* How long we wait between packets when restricting */ + struct icmp_xrl_cache cache[XRLIM_CACHE_SIZE]; /* Rate cache */ +}; + +/* + * ICMP control array. This specifies what to do with each ICMP. */ +struct icmp_control +{ + unsigned long *output; /* Address to increment on output */ + unsigned long *input; /* Address to increment on input */ + void (*handler)(struct icmphdr *icmph, struct sk_buff *skb, struct device *dev, __u32 saddr, __u32 daddr, int len); + unsigned long error; /* This ICMP is classed as an error message */ + struct icmp_xrlim *xrlim; /* Transmit rate limit control structure or NULL for no limits */ +}; + +static struct icmp_control icmp_pointers[19]; + +/* + * Build xmit assembly blocks + */ + +struct icmp_bxm +{ + void *data_ptr; + int data_len; + struct icmphdr icmph; + unsigned long csum; + struct options replyopts; + unsigned char optbuf[40]; +}; + +/* + * The ICMP socket. 
This is the most convenient way to flow control + * our ICMP output as well as maintain a clean interface throughout + * all layers. All Socketless IP sends will soon be gone. + */ + +struct socket icmp_socket; + +/* + * Send an ICMP frame. + */ + + +/* + * Initialize the transmit rate limitation mechanism. + */ + +#ifndef CONFIG_NO_ICMP_LIMIT + +static void xrlim_init(void) +{ + int type, entry; + struct icmp_xrlim *xr; + + for (type=0; type<=18; type++) { + xr = icmp_pointers[type].xrlim; + if (xr) { + for (entry=0; entry<XRLIM_CACHE_SIZE; entry++) + xr->cache[entry].daddr = INADDR_NONE; + } + } +} + +/* + * Check transmit rate limitation for given message. + * + * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate + * SHOULD allow setting of rate limits (we allow + * in the source) + */ + +static int xrlim_allow(int type, __u32 addr) +{ + struct icmp_xrlim *r; + struct icmp_xrl_cache *c; + unsigned long now; + + if (type > 18) /* No time limit present */ + return 1; + r = icmp_pointers[type].xrlim; + if (!r) + return 1; + + for (c = r->cache; c < &r->cache[XRLIM_CACHE_SIZE]; c++) + /* Cache lookup */ + if (c->daddr == addr) + break; + + now = jiffies; /* Cache current time (saves accesses to volatile variable) */ + + if (c == &r->cache[XRLIM_CACHE_SIZE]) { /* Cache miss */ + unsigned long oldest = now; /* Find the oldest entry to replace */ + struct icmp_xrl_cache *d; + c = r->cache; + for (d = r->cache; d < &r->cache[XRLIM_CACHE_SIZE]; d++) + if (!d->daddr) { /* Unused entry */ + c = d; + break; + } else if (d->last_access < oldest) { + oldest = d->last_access; + c = d; + } + c->last_access = now; /* Fill the entry with new data */ + c->daddr = addr; + c->counter = 1; + c->next_reset = now + r->timeout; + c->restricted = 0; + return 1; + } + + c->last_access = now; + if (c->next_reset > now) { /* Let's increment the counter */ + c->counter++; + if (c->counter == r->limit) { /* Limit exceeded, start restrictions */ + c->restricted = 1; + c->next_packet 
= now + r->delay; + return 0; + } + if (c->restricted) { /* Any restrictions pending? */ + if (c->next_packet > now) + return 0; + c->next_packet = now + r->delay; + return 1; + } + } else { /* Reset the counter */ + if (c->counter < r->limit) /* Switch off all restrictions */ + c->restricted = 0; + c->next_reset = now + r->timeout; + c->counter = 0; + } + + return 1; /* Send the packet */ +} + +#endif /* CONFIG_NO_ICMP_LIMIT */ + +/* + * Maintain the counters used in the SNMP statistics for outgoing ICMP + */ + +static void icmp_out_count(int type) +{ + if(type>18) + return; + (*icmp_pointers[type].output)++; + icmp_statistics.IcmpOutMsgs++; +} + +/* + * Checksum each fragment, and on the first include the headers and final checksum. + */ + +static int icmp_glue_bits(const void *p, __u32 saddr, char *to, unsigned int offset, unsigned int fraglen) +{ + struct icmp_bxm *icmp_param = (struct icmp_bxm *)p; + struct icmphdr *icmph; + unsigned long csum; + + if (offset) + { + icmp_param->csum=csum_partial_copy(icmp_param->data_ptr+offset-sizeof(struct icmphdr), + to, fraglen,icmp_param->csum); + return 0; + } + + /* + * First fragment includes header. Note that we've done + * the other fragments first, so that we get the checksum + * for the whole packet here. + */ + csum = csum_partial_copy((void *)&icmp_param->icmph, + to, sizeof(struct icmphdr), + icmp_param->csum); + csum = csum_partial_copy(icmp_param->data_ptr, + to+sizeof(struct icmphdr), + fraglen-sizeof(struct icmphdr), csum); + icmph=(struct icmphdr *)to; + icmph->checksum = csum_fold(csum); + + return 0; +} + +/* + * Driving logic for building and sending ICMP messages. 
+ */ + +static void icmp_build_xmit(struct icmp_bxm *icmp_param, __u32 saddr, __u32 daddr, __u8 tos) +{ + struct sock *sk=icmp_socket.data; + icmp_param->icmph.checksum=0; + icmp_param->csum=0; + icmp_out_count(icmp_param->icmph.type); + sk->ip_tos = tos; + ip_build_xmit(sk, icmp_glue_bits, icmp_param, + icmp_param->data_len+sizeof(struct icmphdr), + daddr, saddr, &icmp_param->replyopts, 0, IPPROTO_ICMP, 1); +} + + +/* + * Send an ICMP message in response to a situation + * + * RFC 1122: 3.2.2 MUST send at least the IP header and 8 bytes of header. MAY send more (we do). + * MUST NOT change this header information. + * MUST NOT reply to a multicast/broadcast IP address. + * MUST NOT reply to a multicast/broadcast MAC address. + * MUST reply to only the first fragment. + */ + void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info, struct device *dev) { - struct sk_buff *skb; struct iphdr *iph; - int offset; struct icmphdr *icmph; - int len; - struct device *ndev=NULL; /* Make this =dev to force replies on the same interface */ - unsigned long our_addr; - int atype; + int atype, room; + struct icmp_bxm icmp_param; + __u32 saddr; /* - * Find the original IP header. + * Find the original header */ - iph = (struct iphdr *) (skb_in->data + dev->hard_header_len); + iph = skb_in->ip_hdr; /* - * No replies to MAC multicast + * No replies to physical multicast/broadcast */ if(skb_in->pkt_type!=PACKET_HOST) return; /* - * No replies to IP multicasting + * Now check at the protocol level */ atype=ip_chk_addr(iph->daddr); - if(atype==IS_BROADCAST || IN_MULTICAST(iph->daddr)) + if(atype==IS_BROADCAST||atype==IS_MULTICAST) return; - + /* - * Only reply to first fragment. + * Only reply to fragment 0. We byte re-order the constant + * mask for efficiency. 
*/ - if(ntohs(iph->frag_off)&IP_OFFSET) + if(iph->frag_off&htons(IP_OFFSET)) return; - - /* - * We must NEVER NEVER send an ICMP error to an ICMP error message + + /* + * If we send an ICMP error to an ICMP error a mess would result.. */ - if(type==ICMP_DEST_UNREACH||type==ICMP_REDIRECT||type==ICMP_SOURCE_QUENCH||type==ICMP_TIME_EXCEEDED) + if(icmp_pointers[type].error) { - /* - * Is the original packet an ICMP packet? + * We are an error, check if we are replying to an ICMP error */ - + if(iph->protocol==IPPROTO_ICMP) { - icmph = (struct icmphdr *) ((char *) iph + - 4 * iph->ihl); + icmph = (struct icmphdr *)((char *)iph + (iph->ihl<<2)); /* - * Check for ICMP error packets (Must never reply to - * an ICMP error). + * Assume any unknown ICMP type is an error. This isn't + * specified by the RFC, but think about it.. */ - - if (icmph->type == ICMP_DEST_UNREACH || - icmph->type == ICMP_SOURCE_QUENCH || - icmph->type == ICMP_REDIRECT || - icmph->type == ICMP_TIME_EXCEEDED || - icmph->type == ICMP_PARAMETERPROB) + if(icmph->type>18 || icmp_pointers[icmph->type].error) return; } } - icmp_statistics.IcmpOutMsgs++; - - /* - * This needs a tidy. 
- */ - - switch(type) - { - case ICMP_DEST_UNREACH: - icmp_statistics.IcmpOutDestUnreachs++; - break; - case ICMP_SOURCE_QUENCH: - icmp_statistics.IcmpOutSrcQuenchs++; - break; - case ICMP_REDIRECT: - icmp_statistics.IcmpOutRedirects++; - break; - case ICMP_ECHO: - icmp_statistics.IcmpOutEchos++; - break; - case ICMP_ECHOREPLY: - icmp_statistics.IcmpOutEchoReps++; - break; - case ICMP_TIME_EXCEEDED: - icmp_statistics.IcmpOutTimeExcds++; - break; - case ICMP_PARAMETERPROB: - icmp_statistics.IcmpOutParmProbs++; - break; - case ICMP_TIMESTAMP: - icmp_statistics.IcmpOutTimestamps++; - break; - case ICMP_TIMESTAMPREPLY: - icmp_statistics.IcmpOutTimestampReps++; - break; - case ICMP_ADDRESS: - icmp_statistics.IcmpOutAddrMasks++; - break; - case ICMP_ADDRESSREPLY: - icmp_statistics.IcmpOutAddrMaskReps++; - break; - } + /* - * Get some memory for the reply. + * Check the rate limit */ - - len = dev->hard_header_len + sizeof(struct iphdr) + sizeof(struct icmphdr) + - sizeof(struct iphdr) + 32; /* amount of header to return */ - - skb = (struct sk_buff *) alloc_skb(len, GFP_ATOMIC); - if (skb == NULL) - { - icmp_statistics.IcmpOutErrors++; + +#ifndef CONFIG_NO_ICMP_LIMIT + if (!xrlim_allow(type, iph->saddr)) return; - } - skb->free = 1; +#endif /* - * Build Layer 2-3 headers for message back to source. + * Construct source address and options. */ - - our_addr = dev->pa_addr; - if (iph->daddr != our_addr && ip_chk_addr(iph->daddr) == IS_MYADDR) - our_addr = iph->daddr; - offset = ip_build_header(skb, our_addr, iph->saddr, - &ndev, IPPROTO_ICMP, NULL, len, - skb_in->ip_hdr->tos,255); - if (offset < 0) - { - icmp_statistics.IcmpOutErrors++; - skb->sk = NULL; - kfree_skb(skb, FREE_READ); + + saddr=iph->daddr; + if(saddr!=dev->pa_addr && ip_chk_addr(saddr)!=IS_MYADDR) + saddr=dev->pa_addr; + if(ip_options_echo(&icmp_param.replyopts, NULL, saddr, iph->saddr, skb_in)) return; - } - /* - * Re-adjust length according to actual IP header size. + /* + * Prepare data for ICMP header. 
*/ - skb->len = offset + sizeof(struct icmphdr) + sizeof(struct iphdr) + 8; + icmp_param.icmph.type=type; + icmp_param.icmph.code=code; + icmp_param.icmph.un.gateway = info; + icmp_param.data_ptr=iph; + room = 576 - sizeof(struct iphdr) - icmp_param.replyopts.optlen; + icmp_param.data_len=(iph->ihl<<2)+skb_in->len; /* RFC says return as much as we can without exceeding 576 bytes */ + if (icmp_param.data_len > room) + icmp_param.data_len = room; /* - * Fill in the frame + * Build and send the packet. */ - - icmph = (struct icmphdr *) (skb->data + offset); - icmph->type = type; - icmph->code = code; - icmph->checksum = 0; - icmph->un.gateway = info; /* This might not be meant for - this form of the union but it will - be right anyway */ - memcpy(icmph + 1, iph, sizeof(struct iphdr) + 8); - - icmph->checksum = ip_compute_csum((unsigned char *)icmph, - sizeof(struct icmphdr) + sizeof(struct iphdr) + 8); - /* - * Send it and free it once sent. - */ - ip_queue_xmit(NULL, ndev, skb, 1); + icmp_build_xmit(&icmp_param, saddr, iph->saddr, + icmp_pointers[type].error ? + (iph->tos & 0x1E) | 0xC0 : iph->tos); } /* - * Handle ICMP_UNREACH and ICMP_QUENCH. + * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH. 
*/ -static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb) +static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb, struct device *dev, __u32 saddr, __u32 daddr, int len) { - struct inet_protocol *ipprot; struct iphdr *iph; - unsigned char hash; - int err; + int hash; + struct inet_protocol *ipprot; + unsigned char *dp; + __u32 info = 0; + + if(len<sizeof(struct iphdr)) + goto flush_it; - err = (icmph->type << 8) | icmph->code; iph = (struct iphdr *) (icmph + 1); - switch(icmph->code & 7) + len-=iph->ihl<<2; + if(len<0) + goto flush_it; + + dp= ((unsigned char *)iph)+(iph->ihl<<2); + + if(icmph->type==ICMP_DEST_UNREACH) { - case ICMP_NET_UNREACH: - break; - case ICMP_HOST_UNREACH: - break; - case ICMP_PROT_UNREACH: -#ifdef CONFIG_NET_DEBUG - printk("ICMP: %s:%d: protocol unreachable.\n", - in_ntoa(iph->daddr), ntohs(iph->protocol)); -#endif - break; - case ICMP_PORT_UNREACH: - break; - case ICMP_FRAG_NEEDED: -#ifdef CONFIG_NET_DEBUG - printk("ICMP: %s: fragmentation needed and DF set.\n", + switch(icmph->code & 15) + { + case ICMP_NET_UNREACH: + break; + case ICMP_HOST_UNREACH: + break; + case ICMP_PROT_UNREACH: +/* printk(KERN_INFO "ICMP: %s:%d: protocol unreachable.\n", + in_ntoa(iph->daddr), (int)iph->protocol);*/ + break; + case ICMP_PORT_UNREACH: + break; + case ICMP_FRAG_NEEDED: +#ifdef CONFIG_NO_PATH_MTU_DISCOVERY + printk(KERN_INFO "ICMP: %s: fragmentation needed and DF set.\n", in_ntoa(iph->daddr)); + break; +#else + { + unsigned short old_mtu = ntohs(iph->tot_len); + unsigned short new_mtu = ntohs(icmph->un.echo.sequence); + + /* + * RFC1191 5. 4.2BSD based router can return incorrect + * Total Length. If current mtu is unknown or old_mtu + * is not less than current mtu, reduce old_mtu by 4 times + * the header length. + */ + + if (skb->sk == NULL /* can this happen? 
*/ + || skb->sk->ip_route_cache == NULL + || skb->sk->ip_route_cache->rt_mtu <= old_mtu) + { + NETDEBUG(printk(KERN_INFO "4.2BSD based fragmenting router between here and %s, mtu corrected from %d", in_ntoa(iph->daddr), old_mtu)); + old_mtu -= 4 * iph->ihl; + NETDEBUG(printk(" to %d\n", old_mtu)); + } + + if (new_mtu < 68 || new_mtu >= old_mtu) + { + /* + * It is either dumb router, which does not + * understand Path MTU Disc. protocol + * or broken (f.e. Linux<=1.3.37 8) router. + * Try to guess... + * The table is taken from RFC-1191. + */ + if (old_mtu > 32000) + new_mtu = 32000; + else if (old_mtu > 17914) + new_mtu = 17914; + else if (old_mtu > 8166) + new_mtu = 8166; + else if (old_mtu > 4352) + new_mtu = 4352; + else if (old_mtu > 2002) + new_mtu = 2002; + else if (old_mtu > 1492) + new_mtu = 1492; + else if (old_mtu > 576) + new_mtu = 576; + else if (old_mtu > 296) + new_mtu = 296; + /* + * These two are not from the RFC but + * are needed for AMPRnet AX.25 paths. + */ + else if (old_mtu > 216) + new_mtu = 216; + else if (old_mtu > 128) + new_mtu = 128; + else + /* + * Despair.. + */ + new_mtu = 68; + } + info = new_mtu; + break; + } #endif - break; - case ICMP_SR_FAILED: -#ifdef CONFIG_NET_DEBUG - printk("ICMP: %s: Source Route Failed.\n", in_ntoa(iph->daddr)); -#endif - break; - default: - break; + case ICMP_SR_FAILED: + printk(KERN_INFO "ICMP: %s: Source Route Failed.\n", in_ntoa(iph->daddr)); + break; + default: + break; + } + if(icmph->code>NR_ICMP_UNREACH) /* Invalid type */ + return; } + + /* + * Throw it at our lower layers + * + * RFC 1122: 3.2.2 MUST extract the protocol ID from the passed header. + * RFC 1122: 3.2.2.1 MUST pass ICMP unreach messages to the transport layer. + * RFC 1122: 3.2.2.2 MUST pass ICMP time expired messages to transport layer. + */ /* * Get the protocol(s). @@ -319,6 +764,8 @@ static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb) /* * This can't change while we are doing it. 
+ * + * FIXME: Deliver to appropriate raw sockets too. */ ipprot = (struct inet_protocol *) inet_protos[hash]; @@ -331,14 +778,19 @@ static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb) /* * Pass it off to everyone who wants it. */ + + /* RFC1122: OK. Passes appropriate ICMP errors to the */ + /* appropriate protocol layer (MUST), as per 3.2.2. */ + if (iph->protocol == ipprot->protocol && ipprot->err_handler) { - ipprot->err_handler(err, (unsigned char *)(icmph + 1), - iph->daddr, iph->saddr, ipprot); + ipprot->err_handler(icmph->type, icmph->code, dp, info, + iph->daddr, iph->saddr, ipprot, len); } ipprot = nextip; } +flush_it: kfree_skb(skb, FREE_READ); } @@ -347,25 +799,30 @@ static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb) * Handle ICMP_REDIRECT. */ -static void icmp_redirect(struct icmphdr *icmph, struct sk_buff *skb, - struct device *dev, unsigned long source) +static void icmp_redirect(struct icmphdr *icmph, struct sk_buff *skb, struct device *dev, __u32 source, __u32 daddr, int len) { - struct rtable *rt; struct iphdr *iph; unsigned long ip; /* * Get the copied header of the packet that caused the redirect */ + + if(len<=sizeof(struct iphdr)) + goto flush_it; iph = (struct iphdr *) (icmph + 1); ip = iph->daddr; -#ifdef CONFIG_IP_FORWARD /* - * We are a router. Routers should not respond to ICMP_REDIRECT messages. + * If we are a router and we run a routing protocol, we MUST NOT follow redirects. + * When using no routing protocol, we MAY follow redirects. (RFC 1812, 5.2.7.2) */ - printk("icmp: ICMP redirect from %s on %s ignored.\n", in_ntoa(source), dev->name); + +#if defined(CONFIG_IP_FORWARD) && !defined(CONFIG_IP_DUMB_ROUTER) + NETDEBUG(printk(KERN_INFO "icmp: ICMP redirect ignored. 
dest = %lX, " + "orig gw = %lX, \"new\" gw = %lX, device = %s.\n", ntohl(ip), + ntohl(source), ntohl(icmph->un.gateway), dev->name)); #else switch(icmph->code & 7) { @@ -373,13 +830,18 @@ static void icmp_redirect(struct icmphdr *icmph, struct sk_buff *skb, /* * This causes a problem with subnetted networks. What we should do * is use ICMP_ADDRESS to get the subnet mask of the problem route - * and set both. But we don't.. + * and set both. But we don't.. [RFC1812 says routers MUST NOT + * generate Network Redirects] */ #ifdef not_a_good_idea ip_rt_add((RTF_DYNAMIC | RTF_MODIFIED | RTF_GATEWAY), - ip, 0, icmph->un.gateway, dev,0, 0); - break; + ip, 0, icmph->un.gateway, dev,0, 0, 0); #endif + /* + * As per RFC recommendations now handle it as + * a host redirect. + */ + case ICMP_REDIR_HOST: /* * Add better route to host. @@ -389,20 +851,12 @@ static void icmp_redirect(struct icmphdr *icmph, struct sk_buff *skb, * (not some confused thing sending our * address) */ - rt = ip_rt_route(ip, NULL, NULL); - if (!rt) - break; - if (rt->rt_gateway != source || - ((icmph->un.gateway^dev->pa_addr)&dev->pa_mask) || - ip_chk_addr(icmph->un.gateway)) - break; - printk("ICMP redirect from %s\n", in_ntoa(source)); - ip_rt_add((RTF_DYNAMIC | RTF_MODIFIED | RTF_HOST | RTF_GATEWAY), - ip, 0, icmph->un.gateway, dev,0, 0, 0); + printk(KERN_INFO "ICMP redirect from %s\n", in_ntoa(source)); + ip_rt_redirect(source, ip, icmph->un.gateway, dev); break; case ICMP_REDIR_NETTOS: case ICMP_REDIR_HOSTTOS: - printk("ICMP: cannot handle TOS redirects yet!\n"); + printk(KERN_INFO "ICMP: cannot handle TOS redirects yet!\n"); break; default: break; @@ -411,377 +865,311 @@ static void icmp_redirect(struct icmphdr *icmph, struct sk_buff *skb, /* * Discard the original packet */ - +flush_it: kfree_skb(skb, FREE_READ); } - /* * Handle ICMP_ECHO ("ping") requests. + * + * RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo requests. 
+ * RFC 1122: 3.2.2.6 Data received in the ICMP_ECHO request MUST be included in the reply. + * RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring echo requests, MUST have default=NOT. + * See also WRT handling of options once they are done and working. */ -static void icmp_echo(struct icmphdr *icmph, struct sk_buff *skb, struct device *dev, - unsigned long saddr, unsigned long daddr, int len, - struct options *opt) +static void icmp_echo(struct icmphdr *icmph, struct sk_buff *skb, struct device *dev, __u32 saddr, __u32 daddr, int len) { - struct icmphdr *icmphr; - struct sk_buff *skb2; - struct device *ndev=NULL; - int size, offset; - - icmp_statistics.IcmpOutEchoReps++; - icmp_statistics.IcmpOutMsgs++; - - size = dev->hard_header_len + 64 + len; - skb2 = alloc_skb(size, GFP_ATOMIC); - - if (skb2 == NULL) - { - icmp_statistics.IcmpOutErrors++; - kfree_skb(skb, FREE_READ); - return; - } - skb2->free = 1; - - /* Build Layer 2-3 headers for message back to source */ - offset = ip_build_header(skb2, daddr, saddr, &ndev, - IPPROTO_ICMP, opt, len, skb->ip_hdr->tos,255); - if (offset < 0) - { - icmp_statistics.IcmpOutErrors++; - printk("ICMP: Could not build IP Header for ICMP ECHO Response\n"); - kfree_skb(skb2,FREE_WRITE); - kfree_skb(skb, FREE_READ); - return; - } - - /* - * Re-adjust length according to actual IP header size. - */ - - skb2->len = offset + len; - - /* - * Build ICMP_ECHO Response message. 
- */ - icmphr = (struct icmphdr *) (skb2->data + offset); - memcpy((char *) icmphr, (char *) icmph, len); - icmphr->type = ICMP_ECHOREPLY; - icmphr->code = 0; - icmphr->checksum = 0; - icmphr->checksum = ip_compute_csum((unsigned char *)icmphr, len); - - /* - * Ship it out - free it when done - */ - ip_queue_xmit((struct sock *)NULL, ndev, skb2, 1); - - /* - * Free the received frame - */ - +#ifndef CONFIG_IP_IGNORE_ECHO_REQUESTS + struct icmp_bxm icmp_param; + icmp_param.icmph=*icmph; + icmp_param.icmph.type=ICMP_ECHOREPLY; + icmp_param.data_ptr=(icmph+1); + icmp_param.data_len=len; + if (ip_options_echo(&icmp_param.replyopts, NULL, daddr, saddr, skb)==0) + icmp_build_xmit(&icmp_param, daddr, saddr, skb->ip_hdr->tos); +#endif kfree_skb(skb, FREE_READ); } /* * Handle ICMP Timestamp requests. + * RFC 1122: 3.2.2.8 MAY implement ICMP timestamp requests. + * SHOULD be in the kernel for minimum random latency. + * MUST be accurate to a few minutes. + * MUST be updated at least at 15Hz. */ -static void icmp_timestamp(struct icmphdr *icmph, struct sk_buff *skb, struct device *dev, - unsigned long saddr, unsigned long daddr, int len, - struct options *opt) +static void icmp_timestamp(struct icmphdr *icmph, struct sk_buff *skb, struct device *dev, __u32 saddr, __u32 daddr, int len) { - struct icmphdr *icmphr; - struct sk_buff *skb2; - int size, offset; - unsigned long *timeptr, midtime; - struct device *ndev=NULL; - - if (len != 20) - { - printk( - "ICMP: Size (%d) of ICMP_TIMESTAMP request should be 20!\n", - len); - icmp_statistics.IcmpInErrors++; - /* correct answers are possible for everything >= 12 */ - if (len < 12) - return; - } - - size = dev->hard_header_len + 84; - - if (! 
(skb2 = alloc_skb(size, GFP_ATOMIC))) - { - skb->sk = NULL; - kfree_skb(skb, FREE_READ); - icmp_statistics.IcmpOutErrors++; - return; - } - skb2->free = 1; - -/* - * Build Layer 2-3 headers for message back to source - */ - - offset = ip_build_header(skb2, daddr, saddr, &ndev, IPPROTO_ICMP, opt, len, - skb->ip_hdr->tos, 255); - if (offset < 0) + __u32 times[3]; /* So the new timestamp works on ALPHA's.. */ + struct icmp_bxm icmp_param; + + /* + * Too short. + */ + + if(len<12) { - printk("ICMP: Could not build IP Header for ICMP TIMESTAMP Response\n"); - kfree_skb(skb2, FREE_WRITE); + icmp_statistics.IcmpInErrors++; kfree_skb(skb, FREE_READ); - icmp_statistics.IcmpOutErrors++; return; } - - /* - * Re-adjust length according to actual IP header size. - */ - skb2->len = offset + 20; - - /* - * Build ICMP_TIMESTAMP Response message. - */ - - icmphr = (struct icmphdr *) ((char *) (skb2 + 1) + offset); - memcpy((char *) icmphr, (char *) icmph, 12); - icmphr->type = ICMP_TIMESTAMPREPLY; - icmphr->code = icmphr->checksum = 0; - - /* fill in the current time as ms since midnight UT: */ - midtime = (xtime.tv_sec % 86400) * 1000 + xtime.tv_usec / 1000; - timeptr = (unsigned long *) (icmphr + 1); - /* - * the originate timestamp (timeptr [0]) is still in the copy: - */ - timeptr [1] = timeptr [2] = htonl(midtime); - - icmphr->checksum = ip_compute_csum((unsigned char *) icmphr, 20); - + /* - * Ship it out - free it when done + * Fill in the current time as ms since midnight UT: */ - - ip_queue_xmit((struct sock *) NULL, ndev, skb2, 1); - icmp_statistics.IcmpOutTimestampReps++; - kfree_skb(skb, FREE_READ); + + { + struct timeval tv; + do_gettimeofday(&tv); + times[1] = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000); + } + times[2] = times[1]; + memcpy((void *)×[0], icmph+1, 4); /* Incoming stamp */ + icmp_param.icmph=*icmph; + icmp_param.icmph.type=ICMP_TIMESTAMPREPLY; + icmp_param.icmph.code=0; + icmp_param.data_ptr=× + icmp_param.data_len=12; + if 
(ip_options_echo(&icmp_param.replyopts, NULL, daddr, saddr, skb)==0) + icmp_build_xmit(&icmp_param, daddr, saddr, skb->ip_hdr->tos); + kfree_skb(skb,FREE_READ); } - - -/* - * Handle the ICMP INFORMATION REQUEST. +/* + * Handle ICMP_ADDRESS_MASK requests. (RFC950) + * + * RFC1122 (3.2.2.9). A host MUST only send replies to + * ADDRESS_MASK requests if it's been configured as an address mask + * agent. Receiving a request doesn't constitute implicit permission to + * act as one. Of course, implementing this correctly requires (SHOULD) + * a way to turn the functionality on and off. Another one for sysctl(), + * I guess. -- MS + * Botched with a CONFIG option for now - Linus add scts sysctl please.. */ -static void icmp_info(struct icmphdr *icmph, struct sk_buff *skb, struct device *dev, - unsigned long saddr, unsigned long daddr, int len, - struct options *opt) +static void icmp_address(struct icmphdr *icmph, struct sk_buff *skb, struct device *dev, __u32 saddr, __u32 daddr, int len) { - /* Obsolete */ - kfree_skb(skb, FREE_READ); +#ifdef CONFIG_IP_ADDR_AGENT /* Don't use, broken */ + struct icmp_bxm icmp_param; + icmp_param.icmph.type=ICMP_ADDRESSREPLY; + icmp_param.icmph.code=0; + icmp_param.icmph.un.echo.id = icmph->un.echo.id; + icmp_param.icmph.un.echo.sequence = icmph->un.echo.sequence; + icmp_param.data_ptr=&dev->pa_mask; + icmp_param.data_len=4; + if (ip_options_echo(&icmp_param.replyopts, NULL, daddr, saddr, skb)==0) + icmp_build_xmit(&icmp_param, daddr, saddr, skb->iph->tos); +#endif + kfree_skb(skb, FREE_READ); } - -/* - * Handle ICMP_ADDRESS_MASK requests. 
- */ - -static void icmp_address(struct icmphdr *icmph, struct sk_buff *skb, struct device *dev, - unsigned long saddr, unsigned long daddr, int len, - struct options *opt) +static void icmp_discard(struct icmphdr *icmph, struct sk_buff *skb, struct device *dev, __u32 saddr, __u32 daddr, int len) { - struct icmphdr *icmphr; - struct sk_buff *skb2; - int size, offset; - struct device *ndev=NULL; + kfree_skb(skb, FREE_READ); +} - icmp_statistics.IcmpOutMsgs++; - icmp_statistics.IcmpOutAddrMaskReps++; - - size = dev->hard_header_len + 64 + len; - skb2 = alloc_skb(size, GFP_ATOMIC); - if (skb2 == NULL) - { - icmp_statistics.IcmpOutErrors++; - kfree_skb(skb, FREE_READ); - return; - } - skb2->free = 1; - - /* - * Build Layer 2-3 headers for message back to source - */ +#ifdef CONFIG_IP_TRANSPARENT_PROXY +/* + * Check incoming icmp packets not addressed locally, to check whether + * they relate to a (proxying) socket on our system. + * Needed for transparent proxying. + * + * This code is presently ugly and needs cleanup. + * Probably should add a chkaddr entry to ipprot to call a chk routine + * in udp.c or tcp.c... 
+ */ - offset = ip_build_header(skb2, daddr, saddr, &ndev, - IPPROTO_ICMP, opt, len, skb->ip_hdr->tos,255); - if (offset < 0) - { - icmp_statistics.IcmpOutErrors++; - printk("ICMP: Could not build IP Header for ICMP ADDRESS Response\n"); - kfree_skb(skb2,FREE_WRITE); - kfree_skb(skb, FREE_READ); - return; +int icmp_chkaddr(struct sk_buff *skb) +{ + struct icmphdr *icmph=(struct icmphdr *)(skb->h.raw + skb->h.iph->ihl*4); + struct iphdr *iph = (struct iphdr *) (icmph + 1); + void (*handler)(struct icmphdr *icmph, struct sk_buff *skb, struct device *dev, __u32 saddr, __u32 daddr, int len) = icmp_pointers[icmph->type].handler; + + if (handler == icmp_unreach || handler == icmp_redirect) { + struct sock *sk; + + switch (iph->protocol) { + case IPPROTO_TCP: + { + struct tcphdr *th = (struct tcphdr *)(((unsigned char *)iph)+(iph->ihl<<2)); + + sk = get_sock(&tcp_prot, th->source, iph->daddr, + th->dest, iph->saddr, 0, 0); + if (!sk) return 0; + if (sk->saddr != iph->saddr) return 0; + if (sk->daddr != iph->daddr) return 0; + if (sk->dummy_th.dest != th->dest) return 0; + /* + * This packet came from us. + */ + return 1; + } + case IPPROTO_UDP: + { + struct udphdr *uh = (struct udphdr *)(((unsigned char *)iph)+(iph->ihl<<2)); + + sk = get_sock(&udp_prot, uh->source, iph->daddr, + uh->dest, iph->saddr, 0, 0); + if (!sk) return 0; + if (sk->saddr != iph->saddr && ip_chk_addr(iph->saddr) != IS_MYADDR) + return 0; + /* + * This packet may have come from us. + * Assume it did. + */ + return 1; + } + } } - - /* - * Re-adjust length according to actual IP header size. - */ - - skb2->len = offset + len; - - /* - * Build ICMP ADDRESS MASK Response message. 
- */ - - icmphr = (struct icmphdr *) (skb2->data + offset); - icmphr->type = ICMP_ADDRESSREPLY; - icmphr->code = 0; - icmphr->checksum = 0; - icmphr->un.echo.id = icmph->un.echo.id; - icmphr->un.echo.sequence = icmph->un.echo.sequence; - memcpy((char *) (icmphr + 1), (char *) &dev->pa_mask, sizeof(dev->pa_mask)); - - icmphr->checksum = ip_compute_csum((unsigned char *)icmphr, len); - - /* Ship it out - free it when done */ - ip_queue_xmit((struct sock *)NULL, ndev, skb2, 1); - - skb->sk = NULL; - kfree_skb(skb, FREE_READ); + return 0; } - +#endif /* * Deal with incoming ICMP packets. */ -int icmp_rcv(struct sk_buff *skb1, struct device *dev, struct options *opt, - unsigned long daddr, unsigned short len, - unsigned long saddr, int redo, struct inet_protocol *protocol) +int icmp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt, + __u32 daddr, unsigned short len, + __u32 saddr, int redo, struct inet_protocol *protocol) { - struct icmphdr *icmph; - unsigned char *buff; - - /* - * Drop broadcast packets. IP has done a broadcast check and ought one day - * to pass on that information. - */ - + struct icmphdr *icmph=(void *)skb->h.raw; +#ifdef CONFIG_IP_TRANSPARENT_PROXY + int r; +#endif icmp_statistics.IcmpInMsgs++; - - + /* - * Grab the packet as an icmp object + * Validate the packet */ - - buff = skb1->h.raw; - icmph = (struct icmphdr *) buff; - - /* - * Validate the packet first - */ - + if (ip_compute_csum((unsigned char *) icmph, len)) { /* Failed checksum! */ icmp_statistics.IcmpInErrors++; - printk("ICMP: failed checksum from %s!\n", in_ntoa(saddr)); - kfree_skb(skb1, FREE_READ); + printk(KERN_INFO "ICMP: failed checksum from %s!\n", in_ntoa(saddr)); + kfree_skb(skb, FREE_READ); return(0); } - + + /* + * 18 is the highest 'known' ICMP type. Anything else is a mystery + * + * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently discarded. 
+ */ + + if(icmph->type > 18) + { + icmp_statistics.IcmpInErrors++; /* Is this right - or do we ignore ? */ + kfree_skb(skb,FREE_READ); + return(0); + } + /* * Parse the ICMP message */ - if (ip_chk_addr(daddr) != IS_MYADDR) +#ifdef CONFIG_IP_TRANSPARENT_PROXY + /* + * We may get non-local addresses and still want to handle them + * locally, due to transparent proxying. + * Thus, narrow down the test to what is really meant. + */ + if (daddr!=dev->pa_addr && ((r = ip_chk_addr(daddr)) == IS_BROADCAST || r == IS_MULTICAST)) +#else + if (daddr!=dev->pa_addr && ip_chk_addr(daddr) != IS_MYADDR) +#endif { + /* + * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be silently ignored (we don't as it is used + * by some network mapping tools). + * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently discarded if to broadcast/multicast. + */ if (icmph->type != ICMP_ECHO) { icmp_statistics.IcmpInErrors++; - kfree_skb(skb1, FREE_READ); + kfree_skb(skb, FREE_READ); return(0); } + /* + * Reply the multicast/broadcast using a legal + * interface - in this case the device we got + * it from. 
+ */ daddr=dev->pa_addr; } - - switch(icmph->type) - { - case ICMP_TIME_EXCEEDED: - icmp_statistics.IcmpInTimeExcds++; - icmp_unreach(icmph, skb1); - return 0; - case ICMP_DEST_UNREACH: - icmp_statistics.IcmpInDestUnreachs++; - icmp_unreach(icmph, skb1); - return 0; - case ICMP_SOURCE_QUENCH: - icmp_statistics.IcmpInSrcQuenchs++; - icmp_unreach(icmph, skb1); - return(0); - case ICMP_REDIRECT: - icmp_statistics.IcmpInRedirects++; - icmp_redirect(icmph, skb1, dev, saddr); - return(0); - case ICMP_ECHO: - icmp_statistics.IcmpInEchos++; - icmp_echo(icmph, skb1, dev, saddr, daddr, len, opt); - return 0; - case ICMP_ECHOREPLY: - icmp_statistics.IcmpInEchoReps++; - kfree_skb(skb1, FREE_READ); - return(0); - case ICMP_TIMESTAMP: - icmp_statistics.IcmpInTimestamps++; - icmp_timestamp(icmph, skb1, dev, saddr, daddr, len, opt); - return 0; - case ICMP_TIMESTAMPREPLY: - icmp_statistics.IcmpInTimestampReps++; - kfree_skb(skb1,FREE_READ); - return 0; - /* INFO is obsolete and doesn't even feature in the SNMP stats */ - case ICMP_INFO_REQUEST: - icmp_info(icmph, skb1, dev, saddr, daddr, len, opt); - return 0; - case ICMP_INFO_REPLY: - skb1->sk = NULL; - kfree_skb(skb1, FREE_READ); - return(0); - case ICMP_ADDRESS: - icmp_statistics.IcmpInAddrMasks++; - icmp_address(icmph, skb1, dev, saddr, daddr, len, opt); - return 0; - case ICMP_ADDRESSREPLY: - /* - * We ought to set our netmask on receiving this, but - * experience shows it's a waste of effort. - */ - icmp_statistics.IcmpInAddrMaskReps++; - kfree_skb(skb1, FREE_READ); - return(0); - default: - icmp_statistics.IcmpInErrors++; - kfree_skb(skb1, FREE_READ); - return(0); - } - /*NOTREACHED*/ - kfree_skb(skb1, FREE_READ); - return(-1); + + len-=sizeof(struct icmphdr); + (*icmp_pointers[icmph->type].input)++; + (icmp_pointers[icmph->type].handler)(icmph,skb,skb->dev,saddr,daddr,len); + return 0; } +/* + * This table defined limits of ICMP sending rate for various ICMP messages. 
+ */ + +static struct icmp_xrlim + xrl_unreach = { 4*HZ, 80, HZ/4 }, /* Host Unreachable */ + xrl_redirect = { 2*HZ, 10, HZ/2 }, /* Redirect */ + xrl_generic = { 3*HZ, 30, HZ/4 }; /* All other errors */ /* - * Perform any ICMP-related I/O control requests. - * [to vanish soon] + * This table is the definition of how we handle ICMP. */ -int icmp_ioctl(struct sock *sk, int cmd, unsigned long arg) +static struct icmp_control icmp_pointers[19] = { +/* ECHO REPLY (0) */ + { &icmp_statistics.IcmpOutEchoReps, &icmp_statistics.IcmpInEchoReps, icmp_discard, 0, NULL }, + { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, NULL }, + { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, NULL }, +/* DEST UNREACH (3) */ + { &icmp_statistics.IcmpOutDestUnreachs, &icmp_statistics.IcmpInDestUnreachs, icmp_unreach, 1, &xrl_unreach }, +/* SOURCE QUENCH (4) */ + { &icmp_statistics.IcmpOutSrcQuenchs, &icmp_statistics.IcmpInSrcQuenchs, icmp_unreach, 1, NULL }, +/* REDIRECT (5) */ + { &icmp_statistics.IcmpOutRedirects, &icmp_statistics.IcmpInRedirects, icmp_redirect, 1, &xrl_redirect }, + { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, NULL }, + { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, NULL }, +/* ECHO (8) */ + { &icmp_statistics.IcmpOutEchos, &icmp_statistics.IcmpInEchos, icmp_echo, 0, NULL }, + { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, NULL }, + { &dummy, &icmp_statistics.IcmpInErrors, icmp_discard, 1, NULL }, +/* TIME EXCEEDED (11) */ + { &icmp_statistics.IcmpOutTimeExcds, &icmp_statistics.IcmpInTimeExcds, icmp_unreach, 1, &xrl_generic }, +/* PARAMETER PROBLEM (12) */ +/* FIXME: RFC1122 3.2.2.5 - MUST pass PARAM_PROB messages to transport layer */ + { &icmp_statistics.IcmpOutParmProbs, &icmp_statistics.IcmpInParmProbs, icmp_discard, 1, &xrl_generic }, +/* TIMESTAMP (13) */ + { &icmp_statistics.IcmpOutTimestamps, &icmp_statistics.IcmpInTimestamps, icmp_timestamp, 0, NULL }, +/* TIMESTAMP REPLY (14) */ + { 
&icmp_statistics.IcmpOutTimestampReps, &icmp_statistics.IcmpInTimestampReps, icmp_discard, 0, NULL }, +/* INFO (15) */ + { &dummy, &dummy, icmp_discard, 0, NULL }, +/* INFO REPLY (16) */ + { &dummy, &dummy, icmp_discard, 0, NULL }, +/* ADDR MASK (17) */ + { &icmp_statistics.IcmpOutAddrMasks, &icmp_statistics.IcmpInAddrMasks, icmp_address, 0, NULL }, +/* ADDR MASK REPLY (18) */ + { &icmp_statistics.IcmpOutAddrMaskReps, &icmp_statistics.IcmpInAddrMaskReps, icmp_discard, 0, NULL } +}; + +void icmp_init(struct proto_ops *ops) { - switch(cmd) - { - default: - return(-EINVAL); - } - return(0); + struct sock *sk; + int err; + icmp_socket.type=SOCK_RAW; + icmp_socket.ops=ops; + if((err=ops->create(&icmp_socket, IPPROTO_ICMP))<0) + panic("Failed to create the ICMP control socket.\n"); + sk=icmp_socket.data; + sk->allocation=GFP_ATOMIC; + sk->num = 256; /* Don't receive any data */ +#ifndef CONFIG_NO_ICMP_LIMIT + xrlim_init(); +#endif } + diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index ec182d8e3..a5b60f12b 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1,8 +1,15 @@ /* - * Linux NET3: Internet Gateway Management Protocol [IGMP] + * Linux NET3: Internet Group Management Protocol [IGMP] + * + * This code implements the IGMP protocol as defined in RFC1112. There has + * been a further revision of this protocol since which is now supported. + * + * If you have trouble with this module be careful what gcc you have used, + * the older version didn't come out right using gcc 2.5.8, the newer one + * seems to fall out with gcc 2.6.2. * * Authors: - * Alan Cox <Alan.Cox@linux.org> + * Alan Cox <Alan.Cox@linux.org> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -10,14 +17,57 @@ * 2 of the License, or (at your option) any later version. * * Fixes: - * + * * Alan Cox : Added lots of __inline__ to optimise * the memory usage of all the tiny little * functions. 
+ * Alan Cox : Dumped the header building experiment. + * Alan Cox : Minor tweaks ready for multicast routing + * and extended IGMP protocol. + * Alan Cox : Removed a load of inline directives. Gcc 2.5.8 + * writes utterly bogus code otherwise (sigh) + * fixed IGMP loopback to behave in the manner + * desired by mrouted, fixed the fact it has been + * broken since 1.3.6 and cleaned up a few minor + * points. + * + * Chih-Jen Chang : Tried to revise IGMP to Version 2 + * Tsu-Sheng Tsao E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu + * The enhancements are mainly based on Steve Deering's + * ipmulti-3.5 source code. + * Chih-Jen Chang : Added the igmp_get_mrouter_info and + * Tsu-Sheng Tsao igmp_set_mrouter_info to keep track of + * the mrouted version on that device. + * Chih-Jen Chang : Added the max_resp_time parameter to + * Tsu-Sheng Tsao igmp_heard_query(). Using this parameter + * to identify the multicast router version + * and do what the IGMP version 2 specified. + * Chih-Jen Chang : Added a timer to revert to IGMP V2 router + * Tsu-Sheng Tsao if the specified time expired. + * Alan Cox : Stop IGMP from 0.0.0.0 being accepted. + * Alan Cox : Use GFP_ATOMIC in the right places. + * Christian Daudt : igmp timer wasn't set for local group + * memberships but was being deleted, + * which caused a "del_timer() called + * from %p with timer not initialized\n" + * message (960131). + * Christian Daudt : removed del_timer from + * igmp_timer_expire function (960205). + * Christian Daudt : igmp_heard_report now only calls + * igmp_timer_expire if tm->running is + * true (960216). + * Malcolm Beattie : ttl comparison wrong in igmp_rcv made + * igmp_heard_query never trigger. Expiry + * miscalculation fixed in igmp_heard_query + * and random() made to return unsigned to + * prevent negative expiry times. + * Alexey Kuznetsov: Wrong group leaving behaviour, backport + * fix from pending 2.1.x patches. + * Alan Cox: Forget to enable FDDI support earlier. 
*/ - - -#include <asm/segment.h> + + +#include <asm/uaccess.h> #include <asm/system.h> #include <linux/types.h> #include <linux/kernel.h> @@ -29,6 +79,7 @@ #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> +#include <linux/if_arp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> @@ -36,23 +87,136 @@ #include <net/sock.h> #include <linux/igmp.h> #include <net/checksum.h> -#include <net/head_explode.h> #ifdef CONFIG_IP_MULTICAST /* + * If time expired, change the router type to IGMP_NEW_ROUTER. + */ + +static void ip_router_timer_expire(unsigned long data) +{ + struct ip_router_info *i=(struct ip_router_info *)data; + + del_timer(&i->timer); + i->type=IGMP_NEW_ROUTER; /* Revert to new multicast router */ + i->time=0; +} + +/* + * Multicast router info manager + */ + +struct ip_router_info *ip_router_info_head=(struct ip_router_info *)0; + +/* + * Get the multicast router info on that device + */ + +static struct ip_router_info *igmp_get_mrouter_info(struct device *dev) +{ + register struct ip_router_info *i; + + for(i=ip_router_info_head;i!=NULL;i=i->next) + { + if (i->dev == dev) + { + return i; + } + } + + /* + * Not found. Create a new entry. 
The default is IGMP V2 router + */ + + i=(struct ip_router_info *)kmalloc(sizeof(*i), GFP_ATOMIC); + if(i==NULL) + return NULL; + i->dev = dev; + i->type = IGMP_NEW_ROUTER; + i->time = IGMP_AGE_THRESHOLD; + i->next = ip_router_info_head; + ip_router_info_head = i; + + init_timer(&i->timer); + i->timer.data=(unsigned long)i; + i->timer.function=&ip_router_timer_expire; + + return i; +} + +/* + * Set the multicast router info on that device + */ + +static struct ip_router_info *igmp_set_mrouter_info(struct device *dev,int type,int time) +{ + register struct ip_router_info *i; + + for(i=ip_router_info_head;i!=NULL;i=i->next) + { + if (i->dev == dev) + { + if(i->type==IGMP_OLD_ROUTER) + { + del_timer(&i->timer); + } + + i->type = type; + i->time = time; + + if(i->type==IGMP_OLD_ROUTER) + { + i->timer.expires=jiffies+i->time*HZ; + add_timer(&i->timer); + } + return i; + } + } + + /* + * Not found. Create a new entry. + */ + i=(struct ip_router_info *)kmalloc(sizeof(*i), GFP_ATOMIC); + if(i==NULL) + return NULL; + i->dev = dev; + i->type = type; + i->time = time; + i->next = ip_router_info_head; + ip_router_info_head = i; + + init_timer(&i->timer); + i->timer.data=(unsigned long)i; + i->timer.function=&ip_router_timer_expire; + if(i->type==IGMP_OLD_ROUTER) + { + i->timer.expires=jiffies+i->time*HZ; + add_timer(&i->timer); + } + + return i; +} + + +/* * Timer management */ - - -extern __inline__ void igmp_stop_timer(struct ip_mc_list *im) + +static void igmp_stop_timer(struct ip_mc_list *im) { - del_timer(&im->timer); - im->tm_running=0; + if (im->tm_running) { + del_timer(&im->timer); + im->tm_running=0; + } + else { + printk(KERN_ERR "igmp_stop_timer() called with timer not running by %p\n", + return_address()); + } } -extern __inline__ int random(void) +extern __inline__ unsigned int random(void) { static unsigned long seed=152L; seed=seed*69069L+1; @@ -60,20 +224,20 @@ extern __inline__ int random(void) } /* - * Inlined as its only called once. 
+ * Inlined as it's only called once. */ -extern __inline__ void igmp_start_timer(struct ip_mc_list *im) +static void igmp_start_timer(struct ip_mc_list *im,unsigned char max_resp_time) { int tv; if(im->tm_running) return; - tv=random()%(10*HZ); /* Pick a number any number 8) */ - im->timer.expires=tv; + tv=random()%(max_resp_time*HZ/IGMP_TIMER_SCALE); /* Pick a number any number 8) */ + im->timer.expires=jiffies+tv; im->tm_running=1; add_timer(&im->timer); } - + /* * Send an IGMP report. */ @@ -84,26 +248,23 @@ static void igmp_send_report(struct device *dev, unsigned long address, int type { struct sk_buff *skb=alloc_skb(MAX_IGMP_SIZE, GFP_ATOMIC); int tmp; - unsigned char *dp; - + struct igmphdr *ih; + if(skb==NULL) return; tmp=ip_build_header(skb, INADDR_ANY, address, &dev, IPPROTO_IGMP, NULL, - skb->mem_len, 0, 1); + 28 , 0, 1, NULL); if(tmp<0) { kfree_skb(skb, FREE_WRITE); return; } - dp=skb->data+tmp; - skb->len=tmp+sizeof(struct igmphdr); - - *dp++=type; - *dp++=0; - skb->h.raw=dp; - dp=imp_putu16(dp,0); /* checksum */ - dp=imp_putn32(dp,address); /* Address (already in net order) */ - imp_putn16(skb->h.raw,ip_compute_csum(skb->data+tmp,sizeof(struct igmphdr))); /* Checksum fill */ + ih=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr)); + ih->type=type; + ih->code=0; + ih->csum=0; + ih->group=address; + ih->csum=ip_compute_csum((void *)ih,sizeof(struct igmphdr)); /* Checksum fill */ ip_queue_xmit(NULL,dev,skb,1); } @@ -111,39 +272,117 @@ static void igmp_send_report(struct device *dev, unsigned long address, int type static void igmp_timer_expire(unsigned long data) { struct ip_mc_list *im=(struct ip_mc_list *)data; - igmp_stop_timer(im); - igmp_send_report(im->interface, im->multiaddr, IGMP_HOST_MEMBERSHIP_REPORT); + struct ip_router_info *r; + + im->tm_running=0; + r=igmp_get_mrouter_info(im->interface); + if(r==NULL) + return; + if(r->type==IGMP_NEW_ROUTER) + igmp_send_report(im->interface, im->multiaddr, IGMP_HOST_NEW_MEMBERSHIP_REPORT); + else + 
igmp_send_report(im->interface, im->multiaddr, IGMP_HOST_MEMBERSHIP_REPORT); + im->reporter=1; } -extern __inline__ void igmp_init_timer(struct ip_mc_list *im) +static void igmp_init_timer(struct ip_mc_list *im) { im->tm_running=0; init_timer(&im->timer); im->timer.data=(unsigned long)im; im->timer.function=&igmp_timer_expire; } - -extern __inline__ void igmp_heard_report(struct device *dev, unsigned long address) + +static void igmp_heard_report(struct device *dev, __u32 address, __u32 src) { struct ip_mc_list *im; - for(im=dev->ip_mc_list;im!=NULL;im=im->next) - if(im->multiaddr==address) - igmp_stop_timer(im); + + if ((address & IGMP_LOCAL_GROUP_MASK) != IGMP_LOCAL_GROUP) + { + /* Timers are only set for non-local groups */ + for(im=dev->ip_mc_list;im!=NULL;im=im->next) + { + if(im->multiaddr==address) + { + if(im->tm_running) + igmp_stop_timer(im); + if(src!=dev->pa_addr) + im->reporter=0; + return; + } + } + } } -extern __inline__ void igmp_heard_query(struct device *dev) +static void igmp_heard_query(struct device *dev,unsigned char max_resp_time) { struct ip_mc_list *im; - for(im=dev->ip_mc_list;im!=NULL;im=im->next) - if(!im->tm_running && im->multiaddr!=IGMP_ALL_HOSTS) - igmp_start_timer(im); + int mrouter_type; + + /* + * The max_resp_time is in units of 1/10 second. + */ + if(max_resp_time>0) + { + mrouter_type=IGMP_NEW_ROUTER; + + if(igmp_set_mrouter_info(dev,mrouter_type,0)==NULL) + return; + /* + * - Start the timers in all of our membership records + * that the query applies to for the interface on + * which the query arrived excl. those that belong + * to a "local" group (224.0.0.X) + * - For timers already running check if they need to + * be reset. 
+ * - Use the igmp->igmp_code field as the maximum + * delay possible + */ + for(im=dev->ip_mc_list;im!=NULL;im=im->next) + { + if(im->tm_running) + { + if(im->timer.expires>jiffies+max_resp_time*HZ/IGMP_TIMER_SCALE) + { + igmp_stop_timer(im); + igmp_start_timer(im,max_resp_time); + } + } + else + { + if((im->multiaddr & IGMP_LOCAL_GROUP_MASK)!=IGMP_LOCAL_GROUP) + igmp_start_timer(im,max_resp_time); + } + } + } + else + { + mrouter_type=IGMP_OLD_ROUTER; + max_resp_time=IGMP_MAX_HOST_REPORT_DELAY*IGMP_TIMER_SCALE; + + if(igmp_set_mrouter_info(dev,mrouter_type,IGMP_AGE_THRESHOLD)==NULL) + return; + + /* + * Start the timers in all of our membership records for + * the interface on which the query arrived, except those + * that are already running and those that belong to a + * "local" group (224.0.0.X). + */ + + for(im=dev->ip_mc_list;im!=NULL;im=im->next) + { + if(!im->tm_running && (im->multiaddr & IGMP_LOCAL_GROUP_MASK)!=IGMP_LOCAL_GROUP) + igmp_start_timer(im,max_resp_time); + } + } } /* * Map a multicast IP onto multicast MAC for type ethernet. 
*/ - + extern __inline__ void ip_mc_map(unsigned long addr, char *buf) { addr=ntohl(addr); @@ -160,26 +399,26 @@ extern __inline__ void ip_mc_map(unsigned long addr, char *buf) /* * Add a filter to a device */ - + void ip_mc_filter_add(struct device *dev, unsigned long addr) { char buf[6]; - if(dev->type!=ARPHRD_ETHER) - return; /* Only do ethernet now */ - ip_mc_map(addr,buf); + if(dev->type!=ARPHRD_ETHER && dev->type!=ARPHRD_FDDI) + return; /* Only do ethernet or FDDI for now */ + ip_mc_map(addr,buf); dev_mc_add(dev,buf,ETH_ALEN,0); } /* * Remove a filter from a device */ - + void ip_mc_filter_del(struct device *dev, unsigned long addr) { char buf[6]; - if(dev->type!=ARPHRD_ETHER) - return; /* Only do ethernet now */ - ip_mc_map(addr,buf); + if(dev->type!=ARPHRD_ETHER && dev->type!=ARPHRD_FDDI) + return; /* Only do ethernet or FDDI for now */ + ip_mc_map(addr,buf); dev_mc_delete(dev,buf,ETH_ALEN,0); } @@ -188,37 +427,67 @@ extern __inline__ void igmp_group_dropped(struct ip_mc_list *im) del_timer(&im->timer); igmp_send_report(im->interface, im->multiaddr, IGMP_HOST_LEAVE_MESSAGE); ip_mc_filter_del(im->interface, im->multiaddr); -/* printk("Left group %lX\n",im->multiaddr);*/ } extern __inline__ void igmp_group_added(struct ip_mc_list *im) { + struct ip_router_info *r; igmp_init_timer(im); - igmp_send_report(im->interface, im->multiaddr, IGMP_HOST_MEMBERSHIP_REPORT); ip_mc_filter_add(im->interface, im->multiaddr); -/* printk("Joined group %lX\n",im->multiaddr);*/ + r=igmp_get_mrouter_info(im->interface); + if(r==NULL) + return; + if(r->type==IGMP_NEW_ROUTER) + igmp_send_report(im->interface, im->multiaddr, IGMP_HOST_NEW_MEMBERSHIP_REPORT); + else + igmp_send_report(im->interface, im->multiaddr, IGMP_HOST_MEMBERSHIP_REPORT); } int igmp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt, - unsigned long daddr, unsigned short len, unsigned long saddr, int redo, + __u32 daddr, unsigned short len, __u32 saddr, int redo, struct inet_protocol *protocol) { 
/* This basically follows the spec line by line -- see RFC1112 */ - struct igmp_header igh; - - /* Pull the IGMP header */ - igmp_explode(skb->h.raw,&igh); - - if(skb->len <sizeof(struct igmphdr) || skb->ip_hdr->ttl!=1 || ip_compute_csum((void *)skb->h.raw,sizeof(struct igmphdr))) + struct igmphdr *ih; + + /* + * Mrouted needs to able to query local interfaces. So + * report for the device this was sent at. (Which can + * be the loopback this time) + */ + + if(dev->flags&IFF_LOOPBACK) + { + dev=ip_dev_find(saddr); + if(dev==NULL) + dev=&loopback_dev; + } + ih=(struct igmphdr *)skb->h.raw; + + if(skb->len <sizeof(struct igmphdr) || skb->ip_hdr->ttl<1 || ip_compute_csum((void *)skb->h.raw,sizeof(struct igmphdr))) { kfree_skb(skb, FREE_READ); return 0; } - if(igh.type==IGMP_HOST_MEMBERSHIP_QUERY && daddr==IGMP_ALL_HOSTS) - igmp_heard_query(dev); - if(igh.type==IGMP_HOST_MEMBERSHIP_REPORT && daddr==igh.group) - igmp_heard_report(dev,igh.group); + /* + * I have a report that someone does this! + */ + + if(saddr==0) + { + printk(KERN_INFO "Broken multicast host using 0.0.0.0 heard on %s\n", + dev->name); + kfree_skb(skb, FREE_READ); + return 0; + } + + if(ih->type==IGMP_HOST_MEMBERSHIP_QUERY && daddr==IGMP_ALL_HOSTS) + igmp_heard_query(dev,ih->code); + if(ih->type==IGMP_HOST_MEMBERSHIP_REPORT && daddr==ih->group) + igmp_heard_report(dev,ih->group, saddr); + if(ih->type==IGMP_HOST_NEW_MEMBERSHIP_REPORT && daddr==ih->group) + igmp_heard_report(dev,ih->group, saddr); kfree_skb(skb, FREE_READ); return 0; } @@ -226,12 +495,12 @@ int igmp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt, /* * Multicast list managers */ - - + + /* * A socket has joined a multicast group on device dev. 
*/ - + static void ip_mc_inc_group(struct device *dev, unsigned long addr) { struct ip_mc_list *i; @@ -257,7 +526,7 @@ static void ip_mc_inc_group(struct device *dev, unsigned long addr) /* * A socket has left a multicast group on device dev */ - + static void ip_mc_dec_group(struct device *dev, unsigned long addr) { struct ip_mc_list **i; @@ -265,15 +534,14 @@ static void ip_mc_dec_group(struct device *dev, unsigned long addr) { if((*i)->multiaddr==addr) { - if(--((*i)->users)) - return; - else + if(--((*i)->users) == 0) { struct ip_mc_list *tmp= *i; igmp_group_dropped(tmp); *i=(*i)->next; kfree_s(tmp,sizeof(*tmp)); } + return; } } } @@ -281,7 +549,7 @@ static void ip_mc_dec_group(struct device *dev, unsigned long addr) /* * Device going down: Clean up. */ - + void ip_mc_drop_device(struct device *dev) { struct ip_mc_list *i; @@ -297,7 +565,7 @@ void ip_mc_drop_device(struct device *dev) /* * Device going up. Make sure it is in all hosts */ - + void ip_mc_allhost(struct device *dev) { struct ip_mc_list *i; @@ -310,16 +578,17 @@ void ip_mc_allhost(struct device *dev) i->users=1; i->interface=dev; i->multiaddr=IGMP_ALL_HOSTS; + i->tm_running=0; i->next=dev->ip_mc_list; dev->ip_mc_list=i; ip_mc_filter_add(i->interface, i->multiaddr); -} - +} + /* * Join a socket to a group */ - + int ip_mc_join_group(struct sock *sk , struct device *dev, unsigned long addr) { int unused= -1; @@ -341,7 +610,7 @@ int ip_mc_join_group(struct sock *sk , struct device *dev, unsigned long addr) if(sk->ip_mc_list->multidev[i]==NULL) unused=i; } - + if(unused==-1) return -ENOBUFS; sk->ip_mc_list->multiaddr[unused]=addr; @@ -353,7 +622,7 @@ int ip_mc_join_group(struct sock *sk , struct device *dev, unsigned long addr) /* * Ask a socket to leave a group. 
*/ - + int ip_mc_leave_group(struct sock *sk, struct device *dev, unsigned long addr) { int i; @@ -363,7 +632,7 @@ int ip_mc_leave_group(struct sock *sk, struct device *dev, unsigned long addr) return -EADDRNOTAVAIL; if(sk->ip_mc_list==NULL) return -EADDRNOTAVAIL; - + for(i=0;i<IP_MAX_MEMBERSHIPS;i++) { if(sk->ip_mc_list->multiaddr[i]==addr && sk->ip_mc_list->multidev[i]==dev) @@ -379,14 +648,14 @@ int ip_mc_leave_group(struct sock *sk, struct device *dev, unsigned long addr) /* * A socket is closing. */ - + void ip_mc_drop_socket(struct sock *sk) { int i; - + if(sk->ip_mc_list==NULL) return; - + for(i=0;i<IP_MAX_MEMBERSHIPS;i++) { if(sk->ip_mc_list->multidev[i]) diff --git a/net/ipv4/ip.c b/net/ipv4/ip.c deleted file mode 100644 index 62d2ad38e..000000000 --- a/net/ipv4/ip.c +++ /dev/null @@ -1,2702 +0,0 @@ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * The Internet Protocol (IP) module. - * - * Version: @(#)ip.c 1.0.16b 9/1/93 - * - * Authors: Ross Biro, <bir7@leland.Stanford.Edu> - * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> - * Donald Becker, <becker@super.org> - * Alan Cox, <gw4pts@gw4pts.ampr.org> - * Richard Underwood - * Stefan Becker, <stefanb@yello.ping.de> - * Jorge Cwik, <jorge@laser.satlink.net> - * Arnt Gulbrandsen, <agulbra@nvg.unit.no> - * - * - * Fixes: - * Alan Cox : Commented a couple of minor bits of surplus code - * Alan Cox : Undefining IP_FORWARD doesn't include the code - * (just stops a compiler warning). - * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes - * are junked rather than corrupting things. - * Alan Cox : Frames to bad broadcast subnets are dumped - * We used to process them non broadcast and - * boy could that cause havoc. - * Alan Cox : ip_forward sets the free flag on the - * new frame it queues. 
Still crap because - * it copies the frame but at least it - * doesn't eat memory too. - * Alan Cox : Generic queue code and memory fixes. - * Fred Van Kempen : IP fragment support (borrowed from NET2E) - * Gerhard Koerting: Forward fragmented frames correctly. - * Gerhard Koerting: Fixes to my fix of the above 8-). - * Gerhard Koerting: IP interface addressing fix. - * Linus Torvalds : More robustness checks - * Alan Cox : Even more checks: Still not as robust as it ought to be - * Alan Cox : Save IP header pointer for later - * Alan Cox : ip option setting - * Alan Cox : Use ip_tos/ip_ttl settings - * Alan Cox : Fragmentation bogosity removed - * (Thanks to Mark.Bush@prg.ox.ac.uk) - * Dmitry Gorodchanin : Send of a raw packet crash fix. - * Alan Cox : Silly ip bug when an overlength - * fragment turns up. Now frees the - * queue. - * Linus Torvalds/ : Memory leakage on fragmentation - * Alan Cox : handling. - * Gerhard Koerting: Forwarding uses IP priority hints - * Teemu Rantanen : Fragment problems. - * Alan Cox : General cleanup, comments and reformat - * Alan Cox : SNMP statistics - * Alan Cox : BSD address rule semantics. Also see - * UDP as there is a nasty checksum issue - * if you do things the wrong way. - * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file - * Alan Cox : IP options adjust sk->priority. - * Pedro Roque : Fix mtu/length error in ip_forward. - * Alan Cox : Avoid ip_chk_addr when possible. - * Richard Underwood : IP multicasting. - * Alan Cox : Cleaned up multicast handlers. - * Alan Cox : RAW sockets demultiplex in the BSD style. - * Gunther Mayer : Fix the SNMP reporting typo - * Alan Cox : Always in group 224.0.0.1 - * Pauline Middelink : Fast ip_checksum update when forwarding - * Masquerading support. - * Alan Cox : Multicast loopback error for 224.0.0.1 - * Alan Cox : IP_MULTICAST_LOOP option. - * Alan Cox : Use notifiers. 
- * Bjorn Ekwall : Removed ip_csum (from slhc.c too) - * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!) - * Stefan Becker : Send out ICMP HOST REDIRECT - * Arnt Gulbrandsen : ip_build_xmit - * Alan Cox : Per socket routing cache - * Alan Cox : Fixed routing cache, added header cache. - * Alan Cox : Loopback didnt work right in original ip_build_xmit - fixed it. - * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net. - * Alan Cox : Incoming IP option handling. - * Alan Cox : Set saddr on raw output frames as per BSD. - * Alan Cox : Stopped broadcast source route explosions. - * Alan Cox : Can disable source routing - * - * - * - * To Fix: - * IP option processing is mostly not needed. ip_forward needs to know about routing rules - * and time stamp but that's about all. Use the route mtu field here too - * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient - * and could be made very efficient with the addition of some virtual memory hacks to permit - * the allocation of a buffer that can then be 'grown' by twiddling page tables. - * Output fragmentation wants updating along with the buffer management to use a single - * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet - * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause - * fragmentation anyway. - * - * FIXME: copy frag 0 iph to qp->iph - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ - -#include <asm/segment.h> -#include <asm/system.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/config.h> - -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> - -#include <net/snmp.h> -#include <net/ip.h> -#include <net/protocol.h> -#include <net/route.h> -#include <net/tcp.h> -#include <net/udp.h> -#include <linux/skbuff.h> -#include <net/sock.h> -#include <net/arp.h> -#include <net/icmp.h> -#include <net/raw.h> -#include <net/checksum.h> -#include <linux/igmp.h> -#include <linux/ip_fw.h> - -#define CONFIG_IP_DEFRAG - -extern int last_retran; -extern void sort_send(struct sock *sk); - -#define min(a,b) ((a)<(b)?(a):(b)) -#define LOOPBACK(x) (((x) & htonl(0xff000000)) == htonl(0x7f000000)) - -/* - * SNMP management statistics - */ - -#ifdef CONFIG_IP_FORWARD -struct ip_mib ip_statistics={1,64,}; /* Forwarding=Yes, Default TTL=64 */ -#else -struct ip_mib ip_statistics={0,64,}; /* Forwarding=No, Default TTL=64 */ -#endif - -/* - * Handle the issuing of an ioctl() request - * for the ip device. This is scheduled to - * disappear - */ - -int ip_ioctl(struct sock *sk, int cmd, unsigned long arg) -{ - switch(cmd) - { - default: - return(-EINVAL); - } -} - - -/* - * Take an skb, and fill in the MAC header. - */ - -static int ip_send(struct sk_buff *skb, unsigned long daddr, int len, struct device *dev, unsigned long saddr) -{ - int mac = 0; - - skb->dev = dev; - skb->arp = 1; - if (dev->hard_header) - { - /* - * Build a hardware header. 
Source address is our mac, destination unknown - * (rebuild header will sort this out) - */ - mac = dev->hard_header(skb->data, dev, ETH_P_IP, NULL, NULL, len, skb); - if (mac < 0) - { - mac = -mac; - skb->arp = 0; - skb->raddr = daddr; /* next routing address */ - } - } - return mac; -} - -int ip_id_count = 0; - -/* - * This routine builds the appropriate hardware/IP headers for - * the routine. It assumes that if *dev != NULL then the - * protocol knows what it's doing, otherwise it uses the - * routing/ARP tables to select a device struct. - */ -int ip_build_header(struct sk_buff *skb, unsigned long saddr, unsigned long daddr, - struct device **dev, int type, struct options *opt, int len, int tos, int ttl) -{ - struct rtable *rt; - unsigned char *buff; - unsigned long raddr; - int tmp; - unsigned long src; - struct iphdr *iph; - - buff = skb->data; - - /* - * See if we need to look up the device. - */ - -#ifdef CONFIG_INET_MULTICAST - if(MULTICAST(daddr) && *dev==NULL && skb->sk && *skb->sk->ip_mc_name) - *dev=dev_get(skb->sk->ip_mc_name); -#endif - if (*dev == NULL) - { - if(skb->localroute) - rt = ip_rt_local(daddr, NULL, &src); - else - rt = ip_rt_route(daddr, NULL, &src); - if (rt == NULL) - { - ip_statistics.IpOutNoRoutes++; - return(-ENETUNREACH); - } - - *dev = rt->rt_dev; - /* - * If the frame is from us and going off machine it MUST MUST MUST - * have the output device ip address and never the loopback - */ - if (LOOPBACK(saddr) && !LOOPBACK(daddr)) - saddr = src;/*rt->rt_dev->pa_addr;*/ - raddr = rt->rt_gateway; - - } - else - { - /* - * We still need the address of the first hop. - */ - if(skb->localroute) - rt = ip_rt_local(daddr, NULL, &src); - else - rt = ip_rt_route(daddr, NULL, &src); - /* - * If the frame is from us and going off machine it MUST MUST MUST - * have the output device ip address and never the loopback - */ - if (LOOPBACK(saddr) && !LOOPBACK(daddr)) - saddr = src;/*rt->rt_dev->pa_addr;*/ - - raddr = (rt == NULL) ? 
0 : rt->rt_gateway; - } - - /* - * No source addr so make it our addr - */ - if (saddr == 0) - saddr = src; - - /* - * No gateway so aim at the real destination - */ - if (raddr == 0) - raddr = daddr; - - /* - * Now build the MAC header. - */ - - tmp = ip_send(skb, raddr, len, *dev, saddr); - buff += tmp; - len -= tmp; - - /* - * Book keeping - */ - - skb->dev = *dev; - skb->saddr = saddr; - if (skb->sk) - skb->sk->saddr = saddr; - - /* - * Now build the IP header. - */ - - /* - * If we are using IPPROTO_RAW, then we don't need an IP header, since - * one is being supplied to us by the user - */ - - if(type == IPPROTO_RAW) - return (tmp); - - /* - * Build the IP addresses - */ - - iph=(struct iphdr *)buff; - - iph->version = 4; - iph->tos = tos; - iph->frag_off = 0; - iph->ttl = ttl; - iph->daddr = daddr; - iph->saddr = saddr; - iph->protocol = type; - iph->ihl = 5; - skb->ip_hdr = iph; - - return(20 + tmp); /* IP header plus MAC header size */ -} - - -/* - * Generate a checksum for an outgoing IP datagram. - */ - -void ip_send_check(struct iphdr *iph) -{ - iph->check = 0; - iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); -} - -/************************ Fragment Handlers From NET2E **********************************/ - - -/* - * This fragment handler is a bit of a heap. On the other hand it works quite - * happily and handles things quite well. - */ - -static struct ipq *ipqueue = NULL; /* IP fragment queue */ - -/* - * Create a new fragment entry. - */ - -static struct ipfrag *ip_frag_create(int offset, int end, struct sk_buff *skb, unsigned char *ptr) -{ - struct ipfrag *fp; - - fp = (struct ipfrag *) kmalloc(sizeof(struct ipfrag), GFP_ATOMIC); - if (fp == NULL) - { - NETDEBUG(printk("IP: frag_create: no memory left !\n")); - return(NULL); - } - memset(fp, 0, sizeof(struct ipfrag)); - - /* Fill in the structure. 
*/ - fp->offset = offset; - fp->end = end; - fp->len = end - offset; - fp->skb = skb; - fp->ptr = ptr; - - return(fp); -} - - -/* - * Find the correct entry in the "incomplete datagrams" queue for - * this IP datagram, and return the queue entry address if found. - */ - -static struct ipq *ip_find(struct iphdr *iph) -{ - struct ipq *qp; - struct ipq *qplast; - - cli(); - qplast = NULL; - for(qp = ipqueue; qp != NULL; qplast = qp, qp = qp->next) - { - if (iph->id== qp->iph->id && iph->saddr == qp->iph->saddr && - iph->daddr == qp->iph->daddr && iph->protocol == qp->iph->protocol) - { - del_timer(&qp->timer); /* So it doesn't vanish on us. The timer will be reset anyway */ - sti(); - return(qp); - } - } - sti(); - return(NULL); -} - - -/* - * Remove an entry from the "incomplete datagrams" queue, either - * because we completed, reassembled and processed it, or because - * it timed out. - */ - -static void ip_free(struct ipq *qp) -{ - struct ipfrag *fp; - struct ipfrag *xp; - - /* - * Stop the timer for this entry. - */ - - del_timer(&qp->timer); - - /* Remove this entry from the "incomplete datagrams" queue. */ - cli(); - if (qp->prev == NULL) - { - ipqueue = qp->next; - if (ipqueue != NULL) - ipqueue->prev = NULL; - } - else - { - qp->prev->next = qp->next; - if (qp->next != NULL) - qp->next->prev = qp->prev; - } - - /* Release all fragment data. */ - - fp = qp->fragments; - while (fp != NULL) - { - xp = fp->next; - IS_SKB(fp->skb); - kfree_skb(fp->skb,FREE_READ); - kfree_s(fp, sizeof(struct ipfrag)); - fp = xp; - } - - /* Release the MAC header. */ - kfree_s(qp->mac, qp->maclen); - - /* Release the IP header. */ - kfree_s(qp->iph, 64 + 8); - - /* Finally, release the queue descriptor itself. */ - kfree_s(qp, sizeof(struct ipq)); - sti(); -} - - -/* - * Oops- a fragment queue timed out. Kill it and send an ICMP reply. 
- */ - -static void ip_expire(unsigned long arg) -{ - struct ipq *qp; - - qp = (struct ipq *)arg; - - /* - * Send an ICMP "Fragment Reassembly Timeout" message. - */ - - ip_statistics.IpReasmTimeout++; - ip_statistics.IpReasmFails++; - /* This if is always true... shrug */ - if(qp->fragments!=NULL) - icmp_send(qp->fragments->skb,ICMP_TIME_EXCEEDED, - ICMP_EXC_FRAGTIME, 0, qp->dev); - - /* - * Nuke the fragment queue. - */ - ip_free(qp); -} - - -/* - * Add an entry to the 'ipq' queue for a newly received IP datagram. - * We will (hopefully :-) receive all other fragments of this datagram - * in time, so we just create a queue for this datagram, in which we - * will insert the received fragments at their respective positions. - */ - -static struct ipq *ip_create(struct sk_buff *skb, struct iphdr *iph, struct device *dev) -{ - struct ipq *qp; - int maclen; - int ihlen; - - qp = (struct ipq *) kmalloc(sizeof(struct ipq), GFP_ATOMIC); - if (qp == NULL) - { - NETDEBUG(printk("IP: create: no memory left !\n")); - return(NULL); - skb->dev = qp->dev; - } - memset(qp, 0, sizeof(struct ipq)); - - /* - * Allocate memory for the MAC header. - * - * FIXME: We have a maximum MAC address size limit and define - * elsewhere. We should use it here and avoid the 3 kmalloc() calls - */ - - maclen = ((unsigned long) iph) - ((unsigned long) skb->data); - qp->mac = (unsigned char *) kmalloc(maclen, GFP_ATOMIC); - if (qp->mac == NULL) - { - NETDEBUG(printk("IP: create: no memory left !\n")); - kfree_s(qp, sizeof(struct ipq)); - return(NULL); - } - - /* - * Allocate memory for the IP header (plus 8 octets for ICMP). - */ - - ihlen = (iph->ihl * sizeof(unsigned long)); - qp->iph = (struct iphdr *) kmalloc(64 + 8, GFP_ATOMIC); - if (qp->iph == NULL) - { - NETDEBUG(printk("IP: create: no memory left !\n")); - kfree_s(qp->mac, maclen); - kfree_s(qp, sizeof(struct ipq)); - return(NULL); - } - - /* Fill in the structure. 
*/ - memcpy(qp->mac, skb->data, maclen); - memcpy(qp->iph, iph, ihlen + 8); - qp->len = 0; - qp->ihlen = ihlen; - qp->maclen = maclen; - qp->fragments = NULL; - qp->dev = dev; - - /* Start a timer for this entry. */ - qp->timer.expires = IP_FRAG_TIME; /* about 30 seconds */ - qp->timer.data = (unsigned long) qp; /* pointer to queue */ - qp->timer.function = ip_expire; /* expire function */ - add_timer(&qp->timer); - - /* Add this entry to the queue. */ - qp->prev = NULL; - cli(); - qp->next = ipqueue; - if (qp->next != NULL) - qp->next->prev = qp; - ipqueue = qp; - sti(); - return(qp); -} - - -/* - * See if a fragment queue is complete. - */ - -static int ip_done(struct ipq *qp) -{ - struct ipfrag *fp; - int offset; - - /* Only possible if we received the final fragment. */ - if (qp->len == 0) - return(0); - - /* Check all fragment offsets to see if they connect. */ - fp = qp->fragments; - offset = 0; - while (fp != NULL) - { - if (fp->offset > offset) - return(0); /* fragment(s) missing */ - offset = fp->end; - fp = fp->next; - } - - /* All fragments are present. */ - return(1); -} - - -/* - * Build a new IP datagram from all its fragments. - * - * FIXME: We copy here because we lack an effective way of handling lists - * of bits on input. Until the new skb data handling is in I'm not going - * to touch this with a bargepole. This also causes a 4Kish limit on - * packet sizes. - */ - -static struct sk_buff *ip_glue(struct ipq *qp) -{ - struct sk_buff *skb; - struct iphdr *iph; - struct ipfrag *fp; - unsigned char *ptr; - int count, len; - - /* - * Allocate a new buffer for the datagram. - */ - - len = qp->maclen + qp->ihlen + qp->len; - - if ((skb = alloc_skb(len,GFP_ATOMIC)) == NULL) - { - ip_statistics.IpReasmFails++; - NETDEBUG(printk("IP: queue_glue: no memory for gluing queue 0x%X\n", (int) qp)); - ip_free(qp); - return(NULL); - } - - /* Fill in the basic details. 
*/ - skb->len = (len - qp->maclen); - skb->h.raw = skb->data; - skb->free = 1; - - /* Copy the original MAC and IP headers into the new buffer. */ - ptr = (unsigned char *) skb->h.raw; - memcpy(ptr, ((unsigned char *) qp->mac), qp->maclen); - ptr += qp->maclen; - memcpy(ptr, ((unsigned char *) qp->iph), qp->ihlen); - ptr += qp->ihlen; - skb->h.raw += qp->maclen; - - count = 0; - - /* Copy the data portions of all fragments into the new buffer. */ - fp = qp->fragments; - while(fp != NULL) - { - if(count+fp->len > skb->len) - { - NETDEBUG(printk("Invalid fragment list: Fragment over size.\n")); - ip_free(qp); - kfree_skb(skb,FREE_WRITE); - ip_statistics.IpReasmFails++; - return NULL; - } - memcpy((ptr + fp->offset), fp->ptr, fp->len); - count += fp->len; - fp = fp->next; - } - - /* We glued together all fragments, so remove the queue entry. */ - ip_free(qp); - - /* Done with all fragments. Fixup the new IP header. */ - iph = skb->h.iph; - iph->frag_off = 0; - iph->tot_len = htons((iph->ihl * sizeof(unsigned long)) + count); - skb->ip_hdr = iph; - - ip_statistics.IpReasmOKs++; - return(skb); -} - - -/* - * Process an incoming IP datagram fragment. - */ - -static struct sk_buff *ip_defrag(struct iphdr *iph, struct sk_buff *skb, struct device *dev) -{ - struct ipfrag *prev, *next, *tmp; - struct ipfrag *tfp; - struct ipq *qp; - struct sk_buff *skb2; - unsigned char *ptr; - int flags, offset; - int i, ihl, end; - - ip_statistics.IpReasmReqds++; - - /* Find the entry of this IP datagram in the "incomplete datagrams" queue. */ - qp = ip_find(iph); - - /* Is this a non-fragmented datagram? */ - offset = ntohs(iph->frag_off); - flags = offset & ~IP_OFFSET; - offset &= IP_OFFSET; - if (((flags & IP_MF) == 0) && (offset == 0)) - { - if (qp != NULL) - ip_free(qp); /* Huh? How could this exist?? 
*/ - return(skb); - } - - offset <<= 3; /* offset is in 8-byte chunks */ - - /* - * If the queue already existed, keep restarting its timer as long - * as we still are receiving fragments. Otherwise, create a fresh - * queue entry. - */ - - if (qp != NULL) - { - del_timer(&qp->timer); - qp->timer.expires = IP_FRAG_TIME; /* about 30 seconds */ - qp->timer.data = (unsigned long) qp; /* pointer to queue */ - qp->timer.function = ip_expire; /* expire function */ - add_timer(&qp->timer); - } - else - { - /* - * If we failed to create it, then discard the frame - */ - if ((qp = ip_create(skb, iph, dev)) == NULL) - { - skb->sk = NULL; - kfree_skb(skb, FREE_READ); - ip_statistics.IpReasmFails++; - return NULL; - } - } - - /* - * Determine the position of this fragment. - */ - - ihl = (iph->ihl * sizeof(unsigned long)); - end = offset + ntohs(iph->tot_len) - ihl; - - /* - * Point into the IP datagram 'data' part. - */ - - ptr = skb->data + dev->hard_header_len + ihl; - - /* - * Is this the final fragment? - */ - - if ((flags & IP_MF) == 0) - qp->len = end; - - /* - * Find out which fragments are in front and at the back of us - * in the chain of fragments so far. We must know where to put - * this fragment, right? - */ - - prev = NULL; - for(next = qp->fragments; next != NULL; next = next->next) - { - if (next->offset > offset) - break; /* bingo! */ - prev = next; - } - - /* - * We found where to put this one. - * Check for overlap with preceding fragment, and, if needed, - * align things so that any overlaps are eliminated. - */ - if (prev != NULL && offset < prev->end) - { - i = prev->end - offset; - offset += i; /* ptr into datagram */ - ptr += i; /* ptr into fragment data */ - } - - /* - * Look for overlap with succeeding segments. - * If we can merge fragments, do it. 
- */ - - for(tmp=next; tmp != NULL; tmp = tfp) - { - tfp = tmp->next; - if (tmp->offset >= end) - break; /* no overlaps at all */ - - i = end - next->offset; /* overlap is 'i' bytes */ - tmp->len -= i; /* so reduce size of */ - tmp->offset += i; /* next fragment */ - tmp->ptr += i; - /* - * If we get a frag size of <= 0, remove it and the packet - * that it goes with. - */ - if (tmp->len <= 0) - { - if (tmp->prev != NULL) - tmp->prev->next = tmp->next; - else - qp->fragments = tmp->next; - - if (tfp->next != NULL) - tmp->next->prev = tmp->prev; - - next=tfp; /* We have killed the original next frame */ - - kfree_skb(tmp->skb,FREE_READ); - kfree_s(tmp, sizeof(struct ipfrag)); - } - } - - /* - * Insert this fragment in the chain of fragments. - */ - - tfp = NULL; - tfp = ip_frag_create(offset, end, skb, ptr); - - /* - * No memory to save the fragment - so throw the lot - */ - - if (!tfp) - { - skb->sk = NULL; - kfree_skb(skb, FREE_READ); - return NULL; - } - tfp->prev = prev; - tfp->next = next; - if (prev != NULL) - prev->next = tfp; - else - qp->fragments = tfp; - - if (next != NULL) - next->prev = tfp; - - /* - * OK, so we inserted this new fragment into the chain. - * Check if we now have a full IP datagram which we can - * bump up to the IP layer... - */ - - if (ip_done(qp)) - { - skb2 = ip_glue(qp); /* glue together the fragments */ - return(skb2); - } - return(NULL); -} - - -/* - * This IP datagram is too large to be sent in one piece. Break it up into - * smaller pieces (each of size equal to the MAC header plus IP header plus - * a block of the data of the original IP data part) that will yet fit in a - * single device frame, and queue such a frame for sending by calling the - * ip_queue_xmit(). Note that this is recursion, and bad things will happen - * if this function causes a loop... - * - * Yes this is inefficient, feel free to submit a quicker one. - * - * **Protocol Violation** - * We copy all the options to each fragment. !FIXME! 
- */ -void ip_fragment(struct sock *sk, struct sk_buff *skb, struct device *dev, int is_frag) -{ - struct iphdr *iph; - unsigned char *raw; - unsigned char *ptr; - struct sk_buff *skb2; - int left, mtu, hlen, len; - int offset; - unsigned long flags; - - /* - * Point into the IP datagram header. - */ - - raw = skb->data; - iph = (struct iphdr *) (raw + dev->hard_header_len); - - skb->ip_hdr = iph; - - /* - * Setup starting values. - */ - - hlen = (iph->ihl * sizeof(unsigned long)); - left = ntohs(iph->tot_len) - hlen; /* Space per frame */ - hlen += dev->hard_header_len; /* Total header size */ - mtu = (dev->mtu - hlen); /* Size of data space */ - ptr = (raw + hlen); /* Where to start from */ - - /* - * Check for any "DF" flag. [DF means do not fragment] - */ - - if (ntohs(iph->frag_off) & IP_DF) - { - /* - * Reply giving the MTU of the failed hop. - */ - ip_statistics.IpFragFails++; - icmp_send(skb,ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, dev->mtu, dev); - return; - } - - /* - * The protocol doesn't seem to say what to do in the case that the - * frame + options doesn't fit the mtu. As it used to fall down dead - * in this case we were fortunate it didn't happen - */ - - if(mtu<8) - { - /* It's wrong but it's better than nothing */ - icmp_send(skb,ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED,dev->mtu, dev); - ip_statistics.IpFragFails++; - return; - } - - /* - * Fragment the datagram. - */ - - /* - * The initial offset is 0 for a complete frame. When - * fragmenting fragments it's wherever this one starts. - */ - - if (is_frag & 2) - offset = (ntohs(iph->frag_off) & 0x1fff) << 3; - else - offset = 0; - - - /* - * Keep copying data until we run out. - */ - - while(left > 0) - { - len = left; - /* IF: it doesn't fit, use 'mtu' - the data space left */ - if (len > mtu) - len = mtu; - /* IF: we are not sending upto and including the packet end - then align the next start on an eight byte boundary */ - if (len < left) - { - len/=8; - len*=8; - } - /* - * Allocate buffer. 
- */ - - if ((skb2 = alloc_skb(len + hlen,GFP_ATOMIC)) == NULL) - { - NETDEBUG(printk("IP: frag: no memory for new fragment!\n")); - ip_statistics.IpFragFails++; - return; - } - - /* - * Set up data on packet - */ - - skb2->arp = skb->arp; - if(skb->free==0) - printk("IP fragmenter: BUG free!=1 in fragmenter\n"); - skb2->free = 1; - skb2->len = len + hlen; - skb2->h.raw=(char *) skb2->data; - /* - * Charge the memory for the fragment to any owner - * it might possess - */ - - save_flags(flags); - if (sk) - { - cli(); - sk->wmem_alloc += skb2->mem_len; - skb2->sk=sk; - } - restore_flags(flags); - skb2->raddr = skb->raddr; /* For rebuild_header - must be here */ - - /* - * Copy the packet header into the new buffer. - */ - - memcpy(skb2->h.raw, raw, hlen); - - /* - * Copy a block of the IP datagram. - */ - memcpy(skb2->h.raw + hlen, ptr, len); - left -= len; - - skb2->h.raw+=dev->hard_header_len; - - /* - * Fill in the new header fields. - */ - iph = (struct iphdr *)(skb2->h.raw/*+dev->hard_header_len*/); - iph->frag_off = htons((offset >> 3)); - /* - * Added AC : If we are fragmenting a fragment thats not the - * last fragment then keep MF on each bit - */ - if (left > 0 || (is_frag & 1)) - iph->frag_off |= htons(IP_MF); - ptr += len; - offset += len; - - /* - * Put this fragment into the sending queue. - */ - - ip_statistics.IpFragCreates++; - - ip_queue_xmit(sk, dev, skb2, 2); - } - ip_statistics.IpFragOKs++; -} - - - -#ifdef CONFIG_IP_FORWARD - -/* - * Forward an IP datagram to its next destination. 
- */ - -void ip_forward(struct sk_buff *skb, struct device *dev, int is_frag, unsigned long target_addr, int target_strict) -{ - struct device *dev2; /* Output device */ - struct iphdr *iph; /* Our header */ - struct sk_buff *skb2; /* Output packet */ - struct rtable *rt; /* Route we use */ - unsigned char *ptr; /* Data pointer */ - unsigned long raddr; /* Router IP address */ -#ifdef CONFIG_IP_FIREWALL - int fw_res = 0; /* Forwarding result */ - - /* - * See if we are allowed to forward this. - * Note: demasqueraded fragments are always 'back'warded. - */ - - - if(!(is_frag&4) && (fw_res=ip_fw_chk(skb->h.iph, dev, ip_fw_fwd_chain, ip_fw_fwd_policy, 0))!=1) - { - if(fw_res==-1) - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, dev); - return; - } -#endif - /* - * According to the RFC, we must first decrease the TTL field. If - * that reaches zero, we must reply an ICMP control message telling - * that the packet's lifetime expired. - * - * Exception: - * We may not generate an ICMP for an ICMP. icmp_send does the - * enforcement of this so we can forget it here. It is however - * sometimes VERY important. - */ - - iph = skb->h.iph; - iph->ttl--; - - /* - * Re-compute the IP header checksum. - * This is inefficient. We know what has happened to the header - * and could thus adjust the checksum as Phil Karn does in KA9Q - */ - - iph->check = ntohs(iph->check) + 0x0100; - if ((iph->check & 0xFF00) == 0) - iph->check++; /* carry overflow */ - iph->check = htons(iph->check); - - if (iph->ttl <= 0) - { - /* Tell the sender its packet died... */ - icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0, dev); - return; - } - - /* - * OK, the packet is still valid. Fetch its destination address, - * and give it to the IP sender for further processing. - */ - - rt = ip_rt_route(target_addr, NULL, NULL); - if (rt == NULL) - { - /* - * Tell the sender its packet cannot be delivered. Again - * ICMP is screened later. 
- */ - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_UNREACH, 0, dev); - return; - } - - - /* - * Gosh. Not only is the packet valid; we even know how to - * forward it onto its final destination. Can we say this - * is being plain lucky? - * If the router told us that there is no GW, use the dest. - * IP address itself- we seem to be connected directly... - */ - - raddr = rt->rt_gateway; - - if (raddr != 0) - { - /* - * Strict routing permits no gatewaying - */ - - if(target_strict) - { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0, dev); - kfree_skb(skb, FREE_READ); - return; - } - - /* - * There is a gateway so find the correct route for it. - * Gateways cannot in turn be gatewayed. - */ - - rt = ip_rt_route(raddr, NULL, NULL); - if (rt == NULL) - { - /* - * Tell the sender its packet cannot be delivered... - */ - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, dev); - return; - } - if (rt->rt_gateway != 0) - raddr = rt->rt_gateway; - } - else - raddr = target_addr; - - /* - * Having picked a route we can now send the frame out. - */ - - dev2 = rt->rt_dev; - - /* - * In IP you never have to forward a frame on the interface that it - * arrived upon. We now generate an ICMP HOST REDIRECT giving the route - * we calculated. - */ -#ifndef CONFIG_IP_NO_ICMP_REDIRECT - if (dev == dev2 && !((iph->saddr^iph->daddr)&dev->pa_mask) && rt->rt_flags&RTF_MODIFIED) - icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, raddr, dev); -#endif - - /* - * We now allocate a new buffer, and copy the datagram into it. - * If the indicated interface is up and running, kick it. - */ - - if (dev2->flags & IFF_UP) - { -#ifdef CONFIG_IP_MASQUERADE - /* - * If this fragment needs masquerading, make it so... - * (Dont masquerade de-masqueraded fragments) - */ - if (!(is_frag&4) && fw_res==2) - ip_fw_masquerade(&skb, dev2); -#endif - - /* - * Current design decrees we copy the packet. For identical header - * lengths we could avoid it. 
The new skb code will let us push - * data so the problem goes away then. - */ - - skb2 = alloc_skb(dev2->hard_header_len + skb->len, GFP_ATOMIC); - /* - * This is rare and since IP is tolerant of network failures - * quite harmless. - */ - if (skb2 == NULL) - { - NETDEBUG(printk("\nIP: No memory available for IP forward\n")); - return; - } - ptr = skb2->data; - skb2->free = 1; - skb2->len = skb->len + dev2->hard_header_len; - skb2->h.raw = ptr; - - /* - * Copy the packet data into the new buffer. - */ - memcpy(ptr + dev2->hard_header_len, skb->h.raw, skb->len); - - /* Now build the MAC header. */ - (void) ip_send(skb2, raddr, skb->len, dev2, dev2->pa_addr); - - ip_statistics.IpForwDatagrams++; - - /* - * See if it needs fragmenting. Note in ip_rcv we tagged - * the fragment type. This must be right so that - * the fragmenter does the right thing. - */ - - if(skb2->len > dev2->mtu + dev2->hard_header_len) - { - ip_fragment(NULL,skb2,dev2, is_frag); - kfree_skb(skb2,FREE_WRITE); - } - else - { -#ifdef CONFIG_IP_ACCT - /* - * Count mapping we shortcut - */ - - ip_fw_chk(iph,dev,ip_acct_chain,IP_FW_F_ACCEPT,1); -#endif - - /* - * Map service types to priority. We lie about - * throughput being low priority, but it's a good - * choice to help improve general usage. - */ - if(iph->tos & IPTOS_LOWDELAY) - dev_queue_xmit(skb2, dev2, SOPRI_INTERACTIVE); - else if(iph->tos & IPTOS_THROUGHPUT) - dev_queue_xmit(skb2, dev2, SOPRI_BACKGROUND); - else - dev_queue_xmit(skb2, dev2, SOPRI_NORMAL); - } - } -} - - -#endif - -/* - * This function receives all incoming IP datagrams. 
- */ - -int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) -{ - struct iphdr *iph = skb->h.iph; - struct sock *raw_sk=NULL; - unsigned char hash; - unsigned char flag = 0; - struct inet_protocol *ipprot; - int brd=IS_MYADDR; - unsigned long target_addr; - int target_strict=0; - int is_frag=0; -#ifdef CONFIG_IP_FIREWALL - int err; -#endif - - ip_statistics.IpInReceives++; - - /* - * Tag the ip header of this packet so we can find it - */ - - skb->ip_hdr = iph; - - /* - * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum. - * RFC1122: 3.1.2.3 MUST discard a frame with invalid source address [NEEDS FIXING]. - * - * Is the datagram acceptable? - * - * 1. Length at least the size of an ip header - * 2. Version of 4 - * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums] - * 4. Doesn't have a bogus length - * (5. We ought to check for IP multicast addresses and undefined types.. does this matter ?) - */ - - if (skb->len<sizeof(struct iphdr) || iph->ihl<5 || iph->version != 4 || ip_fast_csum((unsigned char *)iph, iph->ihl) !=0 - || skb->len < ntohs(iph->tot_len)) - { - ip_statistics.IpInHdrErrors++; - kfree_skb(skb, FREE_WRITE); - return(0); - } - - /* - * Our transport medium may have padded the buffer out. Now we know it - * is IP we can trim to the true length of the frame. - */ - - skb->len=ntohs(iph->tot_len); - - /* - * See if the firewall wants to dispose of the packet. - */ - -#ifdef CONFIG_IP_FIREWALL - - if ((err=ip_fw_chk(iph,dev,ip_fw_blk_chain,ip_fw_blk_policy, 0))<1) - { - if(err==-1) - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0, dev); - kfree_skb(skb, FREE_WRITE); - return 0; - } - -#endif - - - /* - * Next analyse the packet for options. Studies show under one packet in - * a thousand have options.... - */ - - target_addr = iph->daddr; - - if (iph->ihl != 5) - { - /* Humph.. options. Lots of annoying fiddly bits */ - - /* - * This is straight from the RFC. 
It might even be right ;) - * - * RFC 1122: 3.2.1.8 STREAMID option is obsolete and MUST be ignored. - * RFC 1122: 3.2.1.8 MUST NOT crash on a zero length option. - * RFC 1122: 3.2.1.8 MUST support acting as final destination of a source route. - */ - - int opt_space=4*(iph->ihl-5); - int opt_size; - unsigned char *opt_ptr=skb->h.raw+sizeof(struct iphdr); - - while(opt_space>0) - { - if(*opt_ptr==IPOPT_NOOP) - { - opt_ptr++; - opt_space--; - continue; - } - if(*opt_ptr==IPOPT_END) - break; /* Done */ - if(opt_space<2 || (opt_size=opt_ptr[1])<2 || opt_ptr[1]>opt_space) - { - /* - * RFC 1122: 3.2.2.5 SHOULD send parameter problem reports. - */ - icmp_send(skb, ICMP_PARAMETERPROB, 0, 0, skb->dev); - kfree_skb(skb, FREE_READ); - return -EINVAL; - } - switch(opt_ptr[0]) - { - case IPOPT_SEC: - /* Should we drop this ?? */ - break; - case IPOPT_SSRR: /* These work almost the same way */ - target_strict=1; - /* Fall through */ - case IPOPT_LSRR: -#ifdef CONFIG_IP_NOSR - kfree_skb(skb, FREE_READ); - return -EINVAL; -#endif - case IPOPT_RR: - /* - * RFC 1122: 3.2.1.8 Support for RR is OPTIONAL. - */ - if (iph->daddr!=skb->dev->pa_addr && (brd = ip_chk_addr(iph->daddr)) == 0) - break; - if((opt_size<3) || ( opt_ptr[0]==IPOPT_RR && opt_ptr[2] > opt_size-4 )) - { - if(ip_chk_addr(iph->daddr)) - icmp_send(skb, ICMP_PARAMETERPROB, 0, 0, skb->dev); - kfree_skb(skb, FREE_READ); - return -EINVAL; - } - if(opt_ptr[2] > opt_size-4 ) - break; - /* Bytes are [IPOPT_xxRR][Length][EntryPointer][Entry0][Entry1].... 
*/ - /* This isn't going to be too portable - FIXME */ - if(opt_ptr[0]!=IPOPT_RR) - { - int t; - target_addr=*(long *)(&opt_ptr[opt_ptr[2]]); /* Get hop */ - t=ip_chk_addr(target_addr); - if(t==IS_MULTICAST||t==IS_BROADCAST) - { - if(ip_chk_addr(iph->daddr)) - icmp_send(skb, ICMP_PARAMETERPROB, 0, 0, skb->dev); - kfree_skb(skb,FREE_READ); - return -EINVAL; - } - } - *(long *)(&opt_ptr[opt_ptr[2]])=skb->dev->pa_addr; /* Record hop */ - break; - case IPOPT_TIMESTAMP: - /* - * RFC 1122: 3.2.1.8 The timestamp option is OPTIONAL but if implemented - * MUST meet various rules (read the spec). - */ - NETDEBUG(printk("ICMP: Someone finish the timestamp routine ;)\n")); - break; - default: - break; - } - opt_ptr+=opt_size; - opt_space-=opt_size; - } - - } - - - /* - * Remember if the frame is fragmented. - */ - - if(iph->frag_off) - { - if (iph->frag_off & 0x0020) - is_frag|=1; - /* - * Last fragment ? - */ - - if (ntohs(iph->frag_off) & 0x1fff) - is_frag|=2; - } - - /* - * Do any IP forwarding required. chk_addr() is expensive -- avoid it someday. - * - * This is inefficient. While finding out if it is for us we could also compute - * the routing table entry. This is where the great unified cache theory comes - * in as and when someone implements it - * - * For most hosts over 99% of packets match the first conditional - * and don't go via ip_chk_addr. Note: brd is set to IS_MYADDR at - * function entry. - */ - - if ( iph->daddr == skb->dev->pa_addr || (brd = ip_chk_addr(iph->daddr)) != 0) - { -#ifdef CONFIG_IP_MULTICAST - - if(brd==IS_MULTICAST && iph->daddr!=IGMP_ALL_HOSTS && !(dev->flags&IFF_LOOPBACK)) - { - /* - * Check it is for one of our groups - */ - struct ip_mc_list *ip_mc=dev->ip_mc_list; - do - { - if(ip_mc==NULL) - { - kfree_skb(skb, FREE_WRITE); - return 0; - } - if(ip_mc->multiaddr==iph->daddr) - break; - ip_mc=ip_mc->next; - } - while(1); - } -#endif - -#ifdef CONFIG_IP_MASQUERADE - /* - * Do we need to de-masquerade this fragment? 
- */ - if (ip_fw_demasquerade(skb)) - { - struct iphdr *iph=skb->h.iph; - ip_forward(skb, dev, is_frag|4, iph->daddr, 0); - kfree_skb(skb, FREE_WRITE); - return(0); - } -#endif - - /* - * Account for the packet - */ - -#ifdef CONFIG_IP_ACCT - ip_fw_chk(iph,dev,ip_acct_chain,IP_FW_F_ACCEPT,1); -#endif - - /* - * Reassemble IP fragments. - */ - - if(is_frag) - { - /* Defragment. Obtain the complete packet if there is one */ - skb=ip_defrag(iph,skb,dev); - if(skb==NULL) - return 0; - skb->dev = dev; - iph=skb->h.iph; - } - - /* - * Point into the IP datagram, just past the header. - */ - - skb->ip_hdr = iph; - skb->h.raw += iph->ihl*4; - - /* - * Deliver to raw sockets. This is fun as to avoid copies we want to make no surplus copies. - * - * RFC 1122: SHOULD pass TOS value up to the transport layer. - */ - - hash = iph->protocol & (SOCK_ARRAY_SIZE-1); - - /* - * If there maybe a raw socket we must check - if not we don't care less - */ - - if((raw_sk=raw_prot.sock_array[hash])!=NULL) - { - struct sock *sknext=NULL; - struct sk_buff *skb1; - raw_sk=get_sock_raw(raw_sk, hash, iph->saddr, iph->daddr); - if(raw_sk) /* Any raw sockets */ - { - do - { - /* Find the next */ - sknext=get_sock_raw(raw_sk->next, hash, iph->saddr, iph->daddr); - if(sknext) - skb1=skb_clone(skb, GFP_ATOMIC); - else - break; /* One pending raw socket left */ - if(skb1) - raw_rcv(raw_sk, skb1, dev, iph->saddr,iph->daddr); - raw_sk=sknext; - } - while(raw_sk!=NULL); - - /* - * Here either raw_sk is the last raw socket, or NULL if none - */ - - /* - * We deliver to the last raw socket AFTER the protocol checks as it avoids a surplus copy - */ - } - } - - /* - * skb->h.raw now points at the protocol beyond the IP header. 
- */ - - hash = iph->protocol & (MAX_INET_PROTOS -1); - for (ipprot = (struct inet_protocol *)inet_protos[hash];ipprot != NULL;ipprot=(struct inet_protocol *)ipprot->next) - { - struct sk_buff *skb2; - - if (ipprot->protocol != iph->protocol) - continue; - /* - * See if we need to make a copy of it. This will - * only be set if more than one protocol wants it. - * and then not for the last one. If there is a pending - * raw delivery wait for that - */ - - if (ipprot->copy || raw_sk) - { - skb2 = skb_clone(skb, GFP_ATOMIC); - if(skb2==NULL) - continue; - } - else - { - skb2 = skb; - } - flag = 1; - - /* - * Pass on the datagram to each protocol that wants it, - * based on the datagram protocol. We should really - * check the protocol handler's return values here... - */ - - ipprot->handler(skb2, dev, NULL, iph->daddr, - (ntohs(iph->tot_len) - (iph->ihl * 4)), - iph->saddr, 0, ipprot); - - } - - /* - * All protocols checked. - * If this packet was a broadcast, we may *not* reply to it, since that - * causes (proven, grin) ARP storms and a leakage of memory (i.e. all - * ICMP reply messages get queued up for transmission...) - */ - - if(raw_sk!=NULL) /* Shift to last raw user */ - raw_rcv(raw_sk, skb, dev, iph->saddr, iph->daddr); - else if (!flag) /* Free and report errors */ - { - if (brd != IS_BROADCAST && brd!=IS_MULTICAST) - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0, dev); - kfree_skb(skb, FREE_WRITE); - } - - return(0); - } - - /* - * Do any IP forwarding required. chk_addr() is expensive -- avoid it someday. - * - * This is inefficient. While finding out if it is for us we could also compute - * the routing table entry. This is where the great unified cache theory comes - * in as and when someone implements it - * - * For most hosts over 99% of packets match the first conditional - * and don't go via ip_chk_addr. Note: brd is set to IS_MYADDR at - * function entry. - */ - - /* - * Don't forward multicast or broadcast frames. 
- */ - - if(skb->pkt_type!=PACKET_HOST || brd==IS_BROADCAST) - { - kfree_skb(skb,FREE_WRITE); - return 0; - } - - /* - * The packet is for another target. Forward the frame - */ - -#ifdef CONFIG_IP_FORWARD - ip_forward(skb, dev, is_frag, target_addr, target_strict); -#else -/* printk("Machine %lx tried to use us as a forwarder to %lx but we have forwarding disabled!\n", - iph->saddr,iph->daddr);*/ - ip_statistics.IpInAddrErrors++; -#endif - /* - * The forwarder is inefficient and copies the packet. We - * free the original now. - */ - - kfree_skb(skb, FREE_WRITE); - return(0); -} - - -/* - * Loop a packet back to the sender. - */ - -static void ip_loopback(struct device *old_dev, struct sk_buff *skb) -{ - extern struct device loopback_dev; - struct device *dev=&loopback_dev; - int len=skb->len-old_dev->hard_header_len; - struct sk_buff *newskb=alloc_skb(len+dev->hard_header_len, GFP_ATOMIC); - - if(newskb==NULL) - return; - - newskb->link3=NULL; - newskb->sk=NULL; - newskb->dev=dev; - newskb->saddr=skb->saddr; - newskb->daddr=skb->daddr; - newskb->raddr=skb->raddr; - newskb->free=1; - newskb->lock=0; - newskb->users=0; - newskb->pkt_type=skb->pkt_type; - newskb->len=len+dev->hard_header_len; - - - newskb->ip_hdr=(struct iphdr *)(newskb->data+ip_send(newskb, skb->ip_hdr->daddr, len, dev, skb->ip_hdr->saddr)); - memcpy(newskb->ip_hdr,skb->ip_hdr,len); - - /* Recurse. The device check against IFF_LOOPBACK will stop infinite recursion */ - - /*printk("Loopback output queued [%lX to %lX].\n", newskb->ip_hdr->saddr,newskb->ip_hdr->daddr);*/ - ip_queue_xmit(NULL, dev, newskb, 1); -} - - -/* - * Queues a packet to be sent, and starts the transmitter - * if necessary. if free = 1 then we free the block after - * transmit, otherwise we don't. If free==2 we not only - * free the block but also don't assign a new ip seq number. 
- * This routine also needs to put in the total length, - * and compute the checksum - */ - -void ip_queue_xmit(struct sock *sk, struct device *dev, - struct sk_buff *skb, int free) -{ - struct iphdr *iph; - unsigned char *ptr; - - /* Sanity check */ - if (dev == NULL) - { - NETDEBUG(printk("IP: ip_queue_xmit dev = NULL\n")); - return; - } - - IS_SKB(skb); - - /* - * Do some book-keeping in the packet for later - */ - - - skb->dev = dev; - skb->when = jiffies; - - /* - * Find the IP header and set the length. This is bad - * but once we get the skb data handling code in the - * hardware will push its header sensibly and we will - * set skb->ip_hdr to avoid this mess and the fixed - * header length problem - */ - - ptr = skb->data; - ptr += dev->hard_header_len; - iph = (struct iphdr *)ptr; - skb->ip_hdr = iph; - iph->tot_len = ntohs(skb->len-dev->hard_header_len); - -#ifdef CONFIG_IP_FIREWALL - if(ip_fw_chk(iph, dev, ip_fw_blk_chain, ip_fw_blk_policy, 0) != 1) - /* just don't send this packet */ - return; -#endif - - /* - * No reassigning numbers to fragments... - */ - - if(free!=2) - iph->id = htons(ip_id_count++); - else - free=1; - - /* All buffers without an owner socket get freed */ - if (sk == NULL) - free = 1; - - skb->free = free; - - /* - * Do we need to fragment. Again this is inefficient. - * We need to somehow lock the original buffer and use - * bits of it. - */ - - if(skb->len > dev->mtu + dev->hard_header_len) - { - ip_fragment(sk,skb,dev,0); - IS_SKB(skb); - kfree_skb(skb,FREE_WRITE); - return; - } - - /* - * Add an IP checksum - */ - - ip_send_check(iph); - - /* - * Print the frame when debugging - */ - - /* - * More debugging. You cannot queue a packet already on a list - * Spot this and moan loudly. - */ - if (skb->next != NULL) - { - NETDEBUG(printk("ip_queue_xmit: next != NULL\n")); - skb_unlink(skb); - } - - /* - * If a sender wishes the packet to remain unfreed - * we add it to his send queue. 
This arguably belongs - * in the TCP level since nobody else uses it. BUT - * remember IPng might change all the rules. - */ - - if (!free) - { - unsigned long flags; - /* The socket now has more outstanding blocks */ - - sk->packets_out++; - - /* Protect the list for a moment */ - save_flags(flags); - cli(); - - if (skb->link3 != NULL) - { - NETDEBUG(printk("ip.c: link3 != NULL\n")); - skb->link3 = NULL; - } - if (sk->send_head == NULL) - { - sk->send_tail = skb; - sk->send_head = skb; - } - else - { - sk->send_tail->link3 = skb; - sk->send_tail = skb; - } - /* skb->link3 is NULL */ - - /* Interrupt restore */ - restore_flags(flags); - } - else - /* Remember who owns the buffer */ - skb->sk = sk; - - /* - * If the indicated interface is up and running, send the packet. - */ - - ip_statistics.IpOutRequests++; -#ifdef CONFIG_IP_ACCT - ip_fw_chk(iph,dev,ip_acct_chain,IP_FW_F_ACCEPT,1); -#endif - -#ifdef CONFIG_IP_MULTICAST - - /* - * Multicasts are looped back for other local users - */ - - if (MULTICAST(iph->daddr) && !(dev->flags&IFF_LOOPBACK)) - { - if(sk==NULL || sk->ip_mc_loop) - { - if(iph->daddr==IGMP_ALL_HOSTS) - ip_loopback(dev,skb); - else - { - struct ip_mc_list *imc=dev->ip_mc_list; - while(imc!=NULL) - { - if(imc->multiaddr==iph->daddr) - { - ip_loopback(dev,skb); - break; - } - imc=imc->next; - } - } - } - /* Multicasts with ttl 0 must not go beyond the host */ - - if(skb->ip_hdr->ttl==0) - { - kfree_skb(skb, FREE_READ); - return; - } - } -#endif - if((dev->flags&IFF_BROADCAST) && iph->daddr==dev->pa_brdaddr && !(dev->flags&IFF_LOOPBACK)) - ip_loopback(dev,skb); - - if (dev->flags & IFF_UP) - { - /* - * If we have an owner use its priority setting, - * otherwise use NORMAL - */ - - if (sk != NULL) - { - dev_queue_xmit(skb, dev, sk->priority); - } - else - { - dev_queue_xmit(skb, dev, SOPRI_NORMAL); - } - } - else - { - ip_statistics.IpOutDiscards++; - if (free) - kfree_skb(skb, FREE_WRITE); - } -} - - - -#ifdef CONFIG_IP_MULTICAST - -/* - * Write an 
multicast group list table for the IGMP daemon to - * read. - */ - -int ip_mc_procinfo(char *buffer, char **start, off_t offset, int length) -{ - off_t pos=0, begin=0; - struct ip_mc_list *im; - unsigned long flags; - int len=0; - struct device *dev; - - len=sprintf(buffer,"Device : Count\tGroup Users Timer\n"); - save_flags(flags); - cli(); - - for(dev = dev_base; dev; dev = dev->next) - { - if((dev->flags&IFF_UP)&&(dev->flags&IFF_MULTICAST)) - { - len+=sprintf(buffer+len,"%-10s: %5d\n", - dev->name, dev->mc_count); - for(im = dev->ip_mc_list; im; im = im->next) - { - len+=sprintf(buffer+len, - "\t\t\t%08lX %5d %d:%08lX\n", - im->multiaddr, im->users, - im->tm_running, im->timer.expires); - pos=begin+len; - if(pos<offset) - { - len=0; - begin=pos; - } - if(pos>offset+length) - break; - } - } - } - restore_flags(flags); - *start=buffer+(offset-begin); - len-=(offset-begin); - if(len>length) - len=length; - return len; -} - - -#endif -/* - * Socket option code for IP. This is the end of the line after any TCP,UDP etc options on - * an IP socket. - * - * We implement IP_TOS (type of service), IP_TTL (time to live). - * - * Next release we will sort out IP_OPTIONS since for some people are kind of important. 
- */ - -static struct device *ip_mc_find_devfor(unsigned long addr) -{ - struct device *dev; - for(dev = dev_base; dev; dev = dev->next) - { - if((dev->flags&IFF_UP)&&(dev->flags&IFF_MULTICAST)&& - (dev->pa_addr==addr)) - return dev; - } - - return NULL; -} - -int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) -{ - int val,err; - unsigned char ucval; -#if defined(CONFIG_IP_FIREWALL) || defined(CONFIG_IP_ACCT) - struct ip_fw tmp_fw; -#endif - if (optval == NULL) - return(-EINVAL); - - err=verify_area(VERIFY_READ, optval, sizeof(int)); - if(err) - return err; - - val = get_fs_long((unsigned long *)optval); - ucval=get_fs_byte((unsigned char *)optval); - - if(level!=SOL_IP) - return -EOPNOTSUPP; - - switch(optname) - { - case IP_TOS: - if(val<0||val>255) - return -EINVAL; - sk->ip_tos=val; - if(val==IPTOS_LOWDELAY) - sk->priority=SOPRI_INTERACTIVE; - if(val==IPTOS_THROUGHPUT) - sk->priority=SOPRI_BACKGROUND; - return 0; - case IP_TTL: - if(val<1||val>255) - return -EINVAL; - sk->ip_ttl=val; - return 0; -#ifdef CONFIG_IP_MULTICAST - case IP_MULTICAST_TTL: - { - sk->ip_mc_ttl=(int)ucval; - return 0; - } - case IP_MULTICAST_LOOP: - { - if(ucval!=0 && ucval!=1) - return -EINVAL; - sk->ip_mc_loop=(int)ucval; - return 0; - } - case IP_MULTICAST_IF: - { - struct in_addr addr; - struct device *dev=NULL; - - /* - * Check the arguments are allowable - */ - - err=verify_area(VERIFY_READ, optval, sizeof(addr)); - if(err) - return err; - - memcpy_fromfs(&addr,optval,sizeof(addr)); - - - /* - * What address has been requested - */ - - if(addr.s_addr==INADDR_ANY) /* Default */ - { - sk->ip_mc_name[0]=0; - return 0; - } - - /* - * Find the device - */ - - dev=ip_mc_find_devfor(addr.s_addr); - - /* - * Did we find one - */ - - if(dev) - { - strcpy(sk->ip_mc_name,dev->name); - return 0; - } - return -EADDRNOTAVAIL; - } - - case IP_ADD_MEMBERSHIP: - { - -/* - * FIXME: Add/Del membership should have a semaphore protecting them from re-entry - */ - struct 
ip_mreq mreq; - unsigned long route_src; - struct rtable *rt; - struct device *dev=NULL; - - /* - * Check the arguments. - */ - - err=verify_area(VERIFY_READ, optval, sizeof(mreq)); - if(err) - return err; - - memcpy_fromfs(&mreq,optval,sizeof(mreq)); - - /* - * Get device for use later - */ - - if(mreq.imr_interface.s_addr==INADDR_ANY) - { - /* - * Not set so scan. - */ - if((rt=ip_rt_route(mreq.imr_multiaddr.s_addr,NULL, &route_src))!=NULL) - { - dev=rt->rt_dev; - rt->rt_use--; - } - } - else - { - /* - * Find a suitable device. - */ - - dev=ip_mc_find_devfor(mreq.imr_interface.s_addr); - } - - /* - * No device, no cookies. - */ - - if(!dev) - return -ENODEV; - - /* - * Join group. - */ - - return ip_mc_join_group(sk,dev,mreq.imr_multiaddr.s_addr); - } - - case IP_DROP_MEMBERSHIP: - { - struct ip_mreq mreq; - struct rtable *rt; - unsigned long route_src; - struct device *dev=NULL; - - /* - * Check the arguments - */ - - err=verify_area(VERIFY_READ, optval, sizeof(mreq)); - if(err) - return err; - - memcpy_fromfs(&mreq,optval,sizeof(mreq)); - - /* - * Get device for use later - */ - - if(mreq.imr_interface.s_addr==INADDR_ANY) - { - if((rt=ip_rt_route(mreq.imr_multiaddr.s_addr,NULL, &route_src))!=NULL) - { - dev=rt->rt_dev; - rt->rt_use--; - } - } - else - { - - dev=ip_mc_find_devfor(mreq.imr_interface.s_addr); - } - - /* - * Did we find a suitable device. 
- */ - - if(!dev) - return -ENODEV; - - /* - * Leave group - */ - - return ip_mc_leave_group(sk,dev,mreq.imr_multiaddr.s_addr); - } -#endif -#ifdef CONFIG_IP_FIREWALL - case IP_FW_ADD_BLK: - case IP_FW_DEL_BLK: - case IP_FW_ADD_FWD: - case IP_FW_DEL_FWD: - case IP_FW_CHK_BLK: - case IP_FW_CHK_FWD: - case IP_FW_FLUSH_BLK: - case IP_FW_FLUSH_FWD: - case IP_FW_ZERO_BLK: - case IP_FW_ZERO_FWD: - case IP_FW_POLICY_BLK: - case IP_FW_POLICY_FWD: - if(!suser()) - return -EPERM; - if(optlen>sizeof(tmp_fw) || optlen<1) - return -EINVAL; - err=verify_area(VERIFY_READ,optval,optlen); - if(err) - return err; - memcpy_fromfs(&tmp_fw,optval,optlen); - err=ip_fw_ctl(optname, &tmp_fw,optlen); - return -err; /* -0 is 0 after all */ - -#endif -#ifdef CONFIG_IP_ACCT - case IP_ACCT_DEL: - case IP_ACCT_ADD: - case IP_ACCT_FLUSH: - case IP_ACCT_ZERO: - if(!suser()) - return -EPERM; - if(optlen>sizeof(tmp_fw) || optlen<1) - return -EINVAL; - err=verify_area(VERIFY_READ,optval,optlen); - if(err) - return err; - memcpy_fromfs(&tmp_fw, optval,optlen); - err=ip_acct_ctl(optname, &tmp_fw,optlen); - return -err; /* -0 is 0 after all */ -#endif - /* IP_OPTIONS and friends go here eventually */ - default: - return(-ENOPROTOOPT); - } -} - -/* - * Get the options. Note for future reference. The GET of IP options gets the - * _received_ ones. The set sets the _sent_ ones. 
- */ - -int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen) -{ - int val,err; -#ifdef CONFIG_IP_MULTICAST - int len; -#endif - - if(level!=SOL_IP) - return -EOPNOTSUPP; - - switch(optname) - { - case IP_TOS: - val=sk->ip_tos; - break; - case IP_TTL: - val=sk->ip_ttl; - break; -#ifdef CONFIG_IP_MULTICAST - case IP_MULTICAST_TTL: - val=sk->ip_mc_ttl; - break; - case IP_MULTICAST_LOOP: - val=sk->ip_mc_loop; - break; - case IP_MULTICAST_IF: - err=verify_area(VERIFY_WRITE, optlen, sizeof(int)); - if(err) - return err; - len=strlen(sk->ip_mc_name); - err=verify_area(VERIFY_WRITE, optval, len); - if(err) - return err; - put_fs_long(len,(unsigned long *) optlen); - memcpy_tofs((void *)optval,sk->ip_mc_name, len); - return 0; -#endif - default: - return(-ENOPROTOOPT); - } - err=verify_area(VERIFY_WRITE, optlen, sizeof(int)); - if(err) - return err; - put_fs_long(sizeof(int),(unsigned long *) optlen); - - err=verify_area(VERIFY_WRITE, optval, sizeof(int)); - if(err) - return err; - put_fs_long(val,(unsigned long *)optval); - - return(0); -} - -/* - * Build and send a packet, with as little as one copy - * - * Doesn't care much about ip options... option length can be - * different for fragment at 0 and other fragments. - * - * Note that the fragment at the highest offset is sent first, - * so the getfrag routine can fill in the TCP/UDP checksum header - * field in the last fragment it sends... actually it also helps - * the reassemblers, they can put most packets in at the head of - * the fragment queue, and they know the total size in advance. This - * last feature will measurable improve the Linux fragment handler. - * - * The callback has five args, an arbitrary pointer (copy of frag), - * the source IP address (may depend on the routing table), the - * destination adddress (char *), the offset to copy from, and the - * length to be copied. 
- * - */ - -int ip_build_xmit(struct sock *sk, - void getfrag (void *, - int, - char *, - unsigned int, - unsigned int), - void *frag, - unsigned short int length, - int daddr, - int flags, - int type) -{ - struct rtable *rt; - unsigned int fraglen, maxfraglen, fragheaderlen; - int offset, mf; - unsigned long saddr; - unsigned short id; - struct iphdr *iph; - int local=0; - struct device *dev; - - -#ifdef CONFIG_INET_MULTICAST - if(sk && MULTICAST(daddr) && *sk->ip_mc_name) - { - dev=dev_get(skb->ip_mc_name); - if(!dev) - return -ENODEV; - rt=NULL; - } - else - { -#endif - /* - * Perform the IP routing decisions - */ - - if(sk->localroute || flags&MSG_DONTROUTE) - local=1; - - rt = sk->ip_route_cache; - - /* - * See if the routing cache is outdated. We need to clean this up once we are happy it is reliable - * by doing the invalidation actively in the route change and header change. - */ - - saddr=sk->ip_route_saddr; - if(!rt || sk->ip_route_stamp != rt_stamp || daddr!=sk->ip_route_daddr || sk->ip_route_local!=local || sk->saddr!=sk->ip_route_saddr) - { - if(local) - rt = ip_rt_local(daddr, NULL, &saddr); - else - rt = ip_rt_route(daddr, NULL, &saddr); - sk->ip_route_local=local; - sk->ip_route_daddr=daddr; - sk->ip_route_saddr=saddr; - sk->ip_route_stamp=rt_stamp; - sk->ip_route_cache=rt; - sk->ip_hcache_ver=NULL; - sk->ip_hcache_state= 0; - } - else if(rt) - { - /* - * Attempt header caches only if the cached route is being reused. Header cache - * is not ultra cheap to set up. This means we only set it up on the second packet, - * so one shot communications are not slowed. We assume (seems reasonable) that 2 is - * probably going to be a stream of data. - */ - if(rt->rt_dev->header_cache && sk->ip_hcache_state!= -1) - { - if(sk->ip_hcache_ver==NULL || sk->ip_hcache_stamp!=*sk->ip_hcache_ver) - rt->rt_dev->header_cache(rt->rt_dev,sk,saddr,daddr); - else - /* Can't cache. 
Remember this */ - sk->ip_hcache_state= -1; - } - } - - if (rt == NULL) - { - ip_statistics.IpOutNoRoutes++; - return(-ENETUNREACH); - } - - if (sk->saddr && (!LOOPBACK(sk->saddr) || LOOPBACK(daddr))) - saddr = sk->saddr; - - dev=rt->rt_dev; -#ifdef CONFIG_INET_MULTICAST - } -#endif - - /* - * Now compute the buffer space we require - */ - - fragheaderlen = dev->hard_header_len; - if(type != IPPROTO_RAW) - fragheaderlen += 20; - - /* - * Fragheaderlen is the size of 'overhead' on each buffer. Now work - * out the size of the frames to send. - */ - - maxfraglen = ((dev->mtu-20) & ~7) + fragheaderlen; - - /* - * Start at the end of the frame by handling the remainder. - */ - - offset = length - (length % (maxfraglen - fragheaderlen)); - - /* - * Amount of memory to allocate for final fragment. - */ - - fraglen = length - offset + fragheaderlen; - - if(fraglen==0) - { - fraglen = maxfraglen; - offset -= maxfraglen-fragheaderlen; - } - - - /* - * The last fragment will not have MF (more fragments) set. - */ - - mf = 0; - - /* - * Can't fragment raw packets - */ - - if (type == IPPROTO_RAW && offset > 0) - return(-EMSGSIZE); - - /* - * Get an identifier - */ - - id = htons(ip_id_count++); - - /* - * Being outputting the bytes. - */ - - do - { - struct sk_buff * skb; - int error; - char *data; - - /* - * Get the memory we require. - */ - - skb = sock_alloc_send_skb(sk, fraglen, 0, &error); - if (skb == NULL) - return(error); - - /* - * Fill in the control structures - */ - - skb->next = skb->prev = NULL; - skb->dev = dev; - skb->when = jiffies; - skb->free = 1; /* dubious, this one */ - skb->sk = sk; - skb->arp = 0; - skb->saddr = saddr; - skb->raddr = (rt&&rt->rt_gateway) ? rt->rt_gateway : daddr; - skb->len = fraglen; - - /* - * Save us ARP and stuff. In the optimal case we do no route lookup (route cache ok) - * no ARP lookup (arp cache ok) and output. The cache checks are still too slow but - * this can be fixed later. For gateway routes we ought to have a rt->.. 
header cache - * pointer to speed header cache builds for identical targets. - */ - - if(sk->ip_hcache_state>0) - { - memcpy(skb->data,sk->ip_hcache_data, dev->hard_header_len); - skb->arp=1; - } - else if (dev->hard_header) - { - if(dev->hard_header(skb->data, dev, ETH_P_IP, - NULL, NULL, 0, NULL)>0) - skb->arp=1; - } - - /* - * Find where to start putting bytes. - */ - - data = (char *)skb->data + dev->hard_header_len; - iph = (struct iphdr *)data; - - /* - * Only write IP header onto non-raw packets - */ - - if(type != IPPROTO_RAW) - { - - iph->version = 4; - iph->ihl = 5; /* ugh */ - iph->tos = sk->ip_tos; - iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4); - iph->id = id; - iph->frag_off = htons(offset>>3); - iph->frag_off |= mf; -#ifdef CONFIG_IP_MULTICAST - if (MULTICAST(daddr)) - iph->ttl = sk->ip_mc_ttl; - else -#endif - iph->ttl = sk->ip_ttl; - iph->protocol = type; - iph->check = 0; - iph->saddr = saddr; - iph->daddr = daddr; - iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); - data += iph->ihl*4; - - /* - * Any further fragments will have MF set. - */ - - mf = htons(IP_MF); - } - - /* - * User data callback - */ - - getfrag(frag, saddr, data, offset, fraglen-fragheaderlen); - - /* - * Account for the fragment. - */ - -#ifdef CONFIG_IP_ACCT - if(!offset) - ip_fw_chk(iph, dev, ip_acct_chain, IP_FW_F_ACCEPT, 1); -#endif - offset -= (maxfraglen-fragheaderlen); - fraglen = maxfraglen; - -#ifdef CONFIG_IP_MULTICAST - - /* - * Multicasts are looped back for other local users - */ - - if (MULTICAST(daddr) && !(dev->flags&IFF_LOOPBACK)) - { - /* - * Loop back any frames. The check for IGMP_ALL_HOSTS is because - * you are always magically a member of this group. 
- */ - - if(sk==NULL || sk->ip_mc_loop) - { - if(skb->daddr==IGMP_ALL_HOSTS) - ip_loopback(rt->rt_dev,skb); - else - { - struct ip_mc_list *imc=rt->rt_dev->ip_mc_list; - while(imc!=NULL) - { - if(imc->multiaddr==daddr) - { - ip_loopback(rt->rt_dev,skb); - break; - } - imc=imc->next; - } - } - } - - /* - * Multicasts with ttl 0 must not go beyond the host. Fixme: avoid the - * extra clone. - */ - - if(skb->ip_hdr->ttl==0) - kfree_skb(skb, FREE_READ); - } -#endif - /* - * Now queue the bytes into the device. - */ - - if (dev->flags & IFF_UP) - { - dev_queue_xmit(skb, dev, sk->priority); - } - else - { - /* - * Whoops... - * - * FIXME: There is a small nasty here. During the ip_build_xmit we could - * page fault between the route lookup and device send, the device might be - * removed and unloaded.... We need to add device locks on this. - */ - - ip_statistics.IpOutDiscards++; - kfree_skb(skb, FREE_WRITE); - return(0); /* lose rest of fragments */ - } - } - while (offset >= 0); - - return(0); -} - - -/* - * IP protocol layer initialiser - */ - -static struct packet_type ip_packet_type = -{ - 0, /* MUTTER ntohs(ETH_P_IP),*/ - NULL, /* All devices */ - ip_rcv, - NULL, - NULL, -}; - -/* - * Device notifier - */ - -static int ip_rt_event(unsigned long event, void *ptr) -{ - if(event==NETDEV_DOWN) - ip_rt_flush(ptr); - return NOTIFY_DONE; -} - -struct notifier_block ip_rt_notifier={ - ip_rt_event, - NULL, - 0 -}; - -/* - * IP registers the packet type and then calls the subprotocol initialisers - */ - -void ip_init(void) -{ - ip_packet_type.type=htons(ETH_P_IP); - dev_add_pack(&ip_packet_type); - - /* So we flush routes when a device is downed */ - register_netdevice_notifier(&ip_rt_notifier); -/* ip_raw_init(); - ip_packet_init(); - ip_tcp_init(); - ip_udp_init();*/ -} - diff --git a/net/ipv4/ip_alias.c b/net/ipv4/ip_alias.c new file mode 100644 index 000000000..488de23d4 --- /dev/null +++ b/net/ipv4/ip_alias.c @@ -0,0 +1,165 @@ +/* + * IP_ALIAS (AF_INET) aliasing module. 
+ * + * + * Version: @(#)ip_alias.c 0.43 12/20/95 + * + * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> + * + * Fixes: + * JJC : ip_alias_dev_select method. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/module.h> + +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/if.h> +#include <linux/inet.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/route.h> +#include <net/route.h> + +#ifdef ALIAS_USER_LAND_DEBUG +#include "net_alias.h" +#include "ip_alias.h" +#include "user_stubs.h" +#endif + +#include <linux/net_alias.h> +#include <net/ip_alias.h> + +/* + * AF_INET alias init + */ + +static int ip_alias_init_1(struct net_alias_type *this, struct net_alias *alias, struct sockaddr *sa) +{ +#ifdef ALIAS_USER_LAND_DEBUG + printk("alias_init(%s) called.\n", alias->name); +#endif + MOD_INC_USE_COUNT; + return 0; +} + +/* + * AF_INET alias done + */ + +static int ip_alias_done_1(struct net_alias_type *this, struct net_alias *alias) +{ +#ifdef ALIAS_USER_LAND_DEBUG + printk("alias_done(%s) called.\n", alias->name); +#endif + MOD_DEC_USE_COUNT; + return 0; +} + +/* + * Print alias address info + */ + +int ip_alias_print_1(struct net_alias_type *this, struct net_alias *alias, char *buf, int len) +{ + char *p; + + p = (char *) &alias->dev.pa_addr; + return sprintf(buf, "%d.%d.%d.%d", + (p[0] & 255), (p[1] & 255), (p[2] & 255), (p[3] & 255)); +} + +struct device *ip_alias_dev_select(struct net_alias_type *this, struct device *main_dev, struct sockaddr *sa) +{ + __u32 addr; + struct rtable *rt; + struct device *dev=NULL; + + /* + * Defensive... + */ + + if (main_dev == NULL) + return NULL; + + /* + * Get u32 address. + */ + + addr = (sa)? 
(*(struct sockaddr_in *)sa).sin_addr.s_addr : 0; + if (addr == 0) + return NULL; + + /* + * Find 'closest' device to address given. any other suggestions? ... + * net_alias module will check if returned device is main_dev's alias + */ + + rt = ip_rt_route(addr, 0); + if(rt) + { + dev=rt->rt_dev; + ip_rt_put(rt); + } + return dev; +} + +/* + * net_alias AF_INET type defn. + */ + +struct net_alias_type ip_alias_type = +{ + AF_INET, /* type */ + 0, /* n_attach */ + "ip", /* name */ + NULL, /* get_addr32() */ + NULL, /* dev_addr_chk() */ + ip_alias_dev_select, /* dev_select() */ + ip_alias_init_1, /* alias_init_1() */ + ip_alias_done_1, /* alias_done_1() */ + ip_alias_print_1, /* alias_print_1() */ + NULL /* next */ +}; + +/* + * ip_alias module initialization + */ + +int ip_alias_init(void) +{ + return register_net_alias_type(&ip_alias_type, AF_INET); +} + +/* + * ip_alias module done + */ + +int ip_alias_done(void) +{ + return unregister_net_alias_type(&ip_alias_type); +} + +#ifdef MODULE + +int init_module(void) +{ + if (ip_alias_init() != 0) + return -EIO; + return 0; +} + +void cleanup_module(void) +{ + if (ip_alias_done() != 0) + printk(KERN_INFO "ip_alias: can't remove module"); +} + +#endif /* MODULE */ diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c new file mode 100644 index 000000000..81d90f5de --- /dev/null +++ b/net/ipv4/ip_forward.c @@ -0,0 +1,574 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The IP forwarding functionality. + * + * Authors: see ip.c + * + * Fixes: + * Many : Split from ip.c , see ip_input.c for + * history. + * Dave Gregorich : NULL ip_rt_put fix for multicast + * routing. + * Jos Vos : Add call_out_firewall before sending, + * use output device for accounting. + * Jos Vos : Call forward firewall after routing + * (always use output device). 
+ * Alan Cox : Unshare buffer on forward. + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/icmp.h> +#include <linux/netdevice.h> +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/firewall.h> +#include <linux/ip_fw.h> +#ifdef CONFIG_IP_MASQUERADE +#include <net/ip_masq.h> +#endif +#include <net/checksum.h> +#include <linux/route.h> +#include <net/route.h> + +#ifdef CONFIG_IP_FORWARD +#ifdef CONFIG_IP_MROUTE + +/* + * Encapsulate a packet by attaching a valid IPIP header to it. + * This avoids tunnel drivers and other mess and gives us the speed so + * important for multicast video. + */ + +static void ip_encap(struct sk_buff *skb, int len, struct device *out, __u32 daddr) +{ + /* + * There is space for the IPIP header and MAC left. + * + * Firstly push down and install the IPIP header. + */ + struct iphdr *iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr)); + + if(len>65515) + len=65515; + + + iph->version = 4; + iph->tos = skb->ip_hdr->tos; + iph->ttl = skb->ip_hdr->ttl; + iph->frag_off = 0; + iph->daddr = daddr; + iph->saddr = out->pa_addr; + iph->protocol = IPPROTO_IPIP; + iph->ihl = 5; + iph->tot_len = htons(skb->len + len); /* Anand, ernet */ + iph->id = htons(ip_id_count++); + ip_send_check(iph); + + skb->dev = out; + skb->arp = 1; + skb->raddr=daddr; /* Router address is not destination address. The + * correct value is given eventually. I have not + * removed this statement. But could have. + * Anand, ernet. + */ + /* + * Now add the physical header (driver will push it down). + */ + + /* The last parameter of out->hard_header() needed skb->len + len. + * Anand, ernet. + */ + if (out->hard_header && out->hard_header(skb, out, ETH_P_IP, NULL, NULL, + skb->len + len)<0) + skb->arp=0; + /* + * Read to queue for transmission. 
+ */ +} + +#endif + +/* + * Forward an IP datagram to its next destination. + */ + +int ip_forward(struct sk_buff *skb, struct device *dev, int is_frag, + __u32 target_addr) +{ + struct device *dev2; /* Output device */ + struct iphdr *iph; /* Our header */ + struct sk_buff *skb2; /* Output packet */ + struct rtable *rt; /* Route we use */ + unsigned char *ptr; /* Data pointer */ + unsigned long raddr; /* Router IP address */ + struct options * opt = (struct options*)skb->proto_priv; + struct hh_cache *hh = NULL; + int encap = 0; /* Encap length */ +#ifdef CONFIG_FIREWALL + int fw_res = 0; /* Forwarding result */ +#ifdef CONFIG_IP_MASQUERADE + struct sk_buff *skb_in = skb; /* So we can remember if the masquerader did some swaps */ +#endif /* CONFIG_IP_MASQUERADE */ +#endif /* CONFIG_FIREWALL */ + + /* + * We may be sharing the buffer with a snooper. That won't do + */ + + if((skb=skb_unshare(skb, GFP_ATOMIC,FREE_READ))==NULL) + return -1; + + /* + * According to the RFC, we must first decrease the TTL field. If + * that reaches zero, we must reply an ICMP control message telling + * that the packet's lifetime expired. + * + * Exception: + * We may not generate an ICMP for an ICMP. icmp_send does the + * enforcement of this so we can forget it here. It is however + * sometimes VERY important. + */ + + iph = skb->h.iph; + if (!(is_frag&IPFWD_NOTTLDEC)) + { + unsigned long checksum = iph->check; + iph->ttl--; + + /* + * Re-compute the IP header checksum. + * This is efficient. We know what has happened to the header + * and can thus adjust the checksum as Phil Karn does in KA9Q + * except we do this in "network byte order". + */ + checksum += htons(0x0100); + /* carry overflow? */ + checksum += checksum >> 16; + iph->check = checksum; + } + + if (iph->ttl <= 0) + { + /* Tell the sender its packet died... 
*/ + icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0, dev); + return -1; + } + + /* If IPFWD_MULTITUNNEL flag is set, then we have to perform routing + * decision so as to reach the other end of the tunnel. This condition + * also means that we are dealing with a unicast IP packet "in a way". + * Anand, ernet. + */ + +#ifdef CONFIG_IP_MROUTE + if(!(is_frag&IPFWD_MULTICASTING) || (is_frag&IPFWD_MULTITUNNEL)) + { +#endif + /* + * OK, the packet is still valid. Fetch its destination address, + * and give it to the IP sender for further processing. + */ + + rt = ip_rt_route(target_addr, 0); + + if (rt == NULL) + { + /* + * Tell the sender its packet cannot be delivered. Again + * ICMP is screened later. + */ + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_UNREACH, 0, dev); + return -1; + } + + + /* + * Gosh. Not only is the packet valid; we even know how to + * forward it onto its final destination. Can we say this + * is being plain lucky? + * If the router told us that there is no GW, use the dest. + * IP address itself- we seem to be connected directly... + */ + + raddr = rt->rt_gateway; + + if (opt->is_strictroute && (rt->rt_flags & RTF_GATEWAY)) { + /* + * Strict routing permits no gatewaying + */ + + ip_rt_put(rt); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0, dev); + return -1; + } + + /* + * Having picked a route we can now send the frame out + * after asking the firewall permission to do so. + */ + + dev2 = rt->rt_dev; + hh = rt->rt_hh; + /* + * In IP you never have to forward a frame on the interface that it + * arrived upon. We now generate an ICMP HOST REDIRECT giving the route + * we calculated. + */ +#ifndef CONFIG_IP_NO_ICMP_REDIRECT + if (dev == dev2 && + !((iph->saddr^dev->pa_addr)&dev->pa_mask) && + /* The daddr!=raddr test isn't obvious - what it's doing + is avoiding sending a frame the receiver will not + believe anyway.. 
*/ + iph->daddr != raddr/*ANK*/ && !opt->srr) + icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, raddr, dev); +#endif +#ifdef CONFIG_IP_MROUTE + + /* This is for ip encap. Anand, ernet.*/ + + if (is_frag&IPFWD_MULTITUNNEL) { + encap=20; + } + } + else + { + /* + * Multicast route forward. Routing is already done + */ + dev2=skb->dev; + raddr=skb->raddr; + if(is_frag&IPFWD_MULTITUNNEL) /* VIFF_TUNNEL mode */ + encap=20; + rt=NULL; + } +#endif + + /* + * See if we are allowed to forward this. + * Note: demasqueraded fragments are always 'back'warded. + */ + +#ifdef CONFIG_FIREWALL + if(!(is_frag&IPFWD_MASQUERADED)) + { +#ifdef CONFIG_IP_MASQUERADE + /* + * Check that any ICMP packets are not for a + * masqueraded connection. If so rewrite them + * and skip the firewall checks + */ + if (iph->protocol == IPPROTO_ICMP) + { + if ((fw_res = ip_fw_masq_icmp(&skb, dev2)) < 0) + { + if (rt) + ip_rt_put(rt); + /* Problem - ie bad checksum */ + return -1; + } + + if (fw_res) + /* ICMP matched - skip firewall */ + goto skip_call_fw_firewall; + } +#endif + fw_res=call_fw_firewall(PF_INET, dev2, iph, NULL); + switch (fw_res) { + case FW_ACCEPT: + case FW_MASQUERADE: + break; + case FW_REJECT: + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, dev); + /* fall thru */ + default: + return -1; + } + +#ifdef CONFIG_IP_MASQUERADE + skip_call_fw_firewall: +#endif + } +#endif + + /* + * We now may allocate a new buffer, and copy the datagram into it. + * If the indicated interface is up and running, kick it. + */ + + if (dev2->flags & IFF_UP) + { +#ifdef CONFIG_IP_MASQUERADE + /* + * If this fragment needs masquerading, make it so... + * (Don't masquerade de-masqueraded fragments) + */ + if (!(is_frag&IPFWD_MASQUERADED) && fw_res==FW_MASQUERADE) + if (ip_fw_masquerade(&skb, dev2) < 0) + { + /* + * Masquerading failed; silently discard this packet. 
+ */ + if (rt) + ip_rt_put(rt); + return -1; + } +#endif + IS_SKB(skb); + + if (skb->len+encap > dev2->mtu && (iph->frag_off & htons(IP_DF))) + { + ip_statistics.IpFragFails++; + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(dev2->mtu), dev); + if(rt) + ip_rt_put(rt); + return -1; + } + +#ifdef CONFIG_IP_MROUTE + if(skb_headroom(skb)-encap<dev2->hard_header_len) + { + skb2 = alloc_skb(dev2->hard_header_len + skb->len + encap + 15, GFP_ATOMIC); +#else + if(skb_headroom(skb)<dev2->hard_header_len) + { + skb2 = alloc_skb(dev2->hard_header_len + skb->len + 15, GFP_ATOMIC); +#endif + /* + * This is rare and since IP is tolerant of network failures + * quite harmless. + */ + + if (skb2 == NULL) + { + NETDEBUG(printk("\nIP: No memory available for IP forward\n")); + if(rt) + ip_rt_put(rt); + return -1; + } + + IS_SKB(skb2); + /* + * Add the physical headers. + */ + skb2->protocol=htons(ETH_P_IP); +#ifdef CONFIG_IP_MROUTE + if(is_frag&IPFWD_MULTITUNNEL) + { + skb_reserve(skb2,(encap+dev2->hard_header_len+15)&~15); /* 16 byte aligned IP headers are good */ + +/* We need to pass on IP information of the incoming packet to ip_encap() + * to fillin ttl, and tos fields.The destination should be target_addr. + * Anand, ernet. + */ + + skb2->ip_hdr = skb->ip_hdr; + + ip_encap(skb2,skb->len, dev2, target_addr); + +/* The router address is got earlier that to take us to the remote tunnel + * Anand, ernet. + */ + skb2->raddr = rt->rt_gateway; + } + else +#endif + ip_send(rt,skb2,raddr,skb->len,dev2,dev2->pa_addr); + + /* + * We have to copy the bytes over as the new header wouldn't fit + * the old buffer. This should be very rare. + */ + + ptr = skb_put(skb2,skb->len); + skb2->free = 1; + skb2->h.raw = ptr; + /* + * Copy the packet data into the new buffer. + */ + memcpy(ptr, skb->h.raw, skb->len); + memcpy(skb2->proto_priv, skb->proto_priv, sizeof(skb->proto_priv)); + iph = skb2->ip_hdr = skb2->h.iph; + } + else + { + /* + * Build a new MAC header. 
+ */ + + skb2 = skb; + skb2->dev=dev2; +#ifdef CONFIG_IP_MROUTE + if(is_frag&IPFWD_MULTITUNNEL) + ip_encap(skb,skb->len, dev2, raddr); + else + { +#endif + skb->arp=1; + skb->raddr=raddr; + if (hh) + { + memcpy(skb_push(skb, dev2->hard_header_len), hh->hh_data, dev2->hard_header_len); + if (!hh->hh_uptodate) + { +#if RT_CACHE_DEBUG >= 2 + printk("ip_forward: hh miss %08x via %08x\n", target_addr, rt->rt_gateway); +#endif + skb->arp = 0; + } + } + else if (dev2->hard_header) + { + if(dev2->hard_header(skb, dev2, ETH_P_IP, NULL, NULL, skb->len)<0) + skb->arp=0; + } +#ifdef CONFIG_IP_MROUTE + } +#endif + } +#ifdef CONFIG_FIREWALL + if((fw_res = call_out_firewall(PF_INET, skb2->dev, iph, NULL)) < FW_ACCEPT) + { + /* FW_ACCEPT and FW_MASQUERADE are treated equal: + masquerading is only supported via forward rules */ + if (fw_res == FW_REJECT) + icmp_send(skb2, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, dev); + if (skb != skb2) + kfree_skb(skb2,FREE_WRITE); + return -1; + } +#endif + ip_statistics.IpForwDatagrams++; + + if (opt->optlen) + { + unsigned char * optptr; + if (opt->rr_needaddr) + { + optptr = (unsigned char *)iph + opt->rr; + memcpy(&optptr[optptr[2]-5], &dev2->pa_addr, 4); + opt->is_changed = 1; + } + if (opt->srr_is_hit) + { + int srrptr, srrspace; + + optptr = (unsigned char *)iph + opt->srr; + + for ( srrptr=optptr[2], srrspace = optptr[1]; + srrptr <= srrspace; + srrptr += 4 + ) + { + if (srrptr + 3 > srrspace) + break; + if (memcmp(&target_addr, &optptr[srrptr-1], 4) == 0) + break; + } + if (srrptr + 3 <= srrspace) + { + opt->is_changed = 1; + memcpy(&optptr[srrptr-1], &dev2->pa_addr, 4); + iph->daddr = target_addr; + optptr[2] = srrptr+4; + } + else + printk(KERN_CRIT "ip_forward(): Argh! 
Destination lost!\n"); + } + if (opt->ts_needaddr) + { + optptr = (unsigned char *)iph + opt->ts; + memcpy(&optptr[optptr[2]-9], &dev2->pa_addr, 4); + opt->is_changed = 1; + } + if (opt->is_changed) + { + opt->is_changed = 0; + ip_send_check(iph); + } + } +/* + * ANK: this is point of "no return", we cannot send an ICMP, + * because we changed SRR option. + */ + + /* + * See if it needs fragmenting. Note in ip_rcv we tagged + * the fragment type. This must be right so that + * the fragmenter does the right thing. + */ + + if(skb2->len > dev2->mtu + dev2->hard_header_len) + { + ip_fragment(NULL,skb2,dev2, is_frag); + kfree_skb(skb2,FREE_WRITE); + } + else + { +#ifdef CONFIG_IP_ACCT + /* + * Count mapping we shortcut + */ + + ip_fw_chk(iph,dev2,NULL,ip_acct_chain,0,IP_FW_MODE_ACCT_OUT); +#endif + + /* + * Map service types to priority. We lie about + * throughput being low priority, but it's a good + * choice to help improve general usage. + */ + if(iph->tos & IPTOS_LOWDELAY) + dev_queue_xmit(skb2, dev2, SOPRI_INTERACTIVE); + else if(iph->tos & IPTOS_THROUGHPUT) + dev_queue_xmit(skb2, dev2, SOPRI_BACKGROUND); + else + dev_queue_xmit(skb2, dev2, SOPRI_NORMAL); + } + } + else + { + if(rt) + ip_rt_put(rt); + return -1; + } + if(rt) + ip_rt_put(rt); + + /* + * Tell the caller if their buffer is free. + */ + + if(skb==skb2) + return 0; + +#ifdef CONFIG_IP_MASQUERADE + /* + * The original is free. Free our copy and + * tell the caller not to free. + */ + if(skb!=skb_in) + { + kfree_skb(skb_in, FREE_WRITE); + return 0; + } +#endif + return 1; +} + + +#endif + + + diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c new file mode 100644 index 000000000..c6c33e03b --- /dev/null +++ b/net/ipv4/ip_fragment.c @@ -0,0 +1,799 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The IP fragmentation functionality. 
+ * + * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG> + * Alan Cox <Alan.Cox@linux.org> + * + * Fixes: + * Alan Cox : Split from ip.c , see ip_input.c for history. + */ + +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/icmp.h> +#include <linux/netdevice.h> +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/firewall.h> +#include <linux/ip_fw.h> +#include <net/checksum.h> + +/* + * Fragment cache limits. We will commit 256K at one time. Should we + * cross that limit we will prune down to 192K. This should cope with + * even the most extreme cases without allowing an attacker to measurably + * harm machine performance. + */ + +#define IPFRAG_HIGH_THRESH (256*1024) +#define IPFRAG_LOW_THRESH (192*1024) + +/* + * This fragment handler is a bit of a heap. On the other hand it works quite + * happily and handles things quite well. + */ + +static struct ipq *ipqueue = NULL; /* IP fragment queue */ + +atomic_t ip_frag_mem = 0; /* Memory used for fragments */ + +/* + * Memory Tracking Functions + */ + +extern __inline__ void frag_kfree_skb(struct sk_buff *skb, int type) +{ + atomic_sub(skb->truesize, &ip_frag_mem); + kfree_skb(skb,type); +} + +extern __inline__ void frag_kfree_s(void *ptr, int len) +{ + atomic_sub(len, &ip_frag_mem); + kfree_s(ptr,len); +} + +extern __inline__ void *frag_kmalloc(int size, int pri) +{ + void *vp=kmalloc(size,pri); + if(!vp) + return NULL; + atomic_add(size, &ip_frag_mem); + return vp; +} + +/* + * Create a new fragment entry. 
+ */ + +static struct ipfrag *ip_frag_create(int offset, int end, struct sk_buff *skb, unsigned char *ptr) +{ + struct ipfrag *fp; + unsigned long flags; + + fp = (struct ipfrag *) frag_kmalloc(sizeof(struct ipfrag), GFP_ATOMIC); + if (fp == NULL) + { + NETDEBUG(printk("IP: frag_create: no memory left !\n")); + return(NULL); + } + memset(fp, 0, sizeof(struct ipfrag)); + + /* Fill in the structure. */ + fp->offset = offset; + fp->end = end; + fp->len = end - offset; + fp->skb = skb; + fp->ptr = ptr; + + /* + * Charge for the SKB as well. + */ + + save_flags(flags); + cli(); + ip_frag_mem+=skb->truesize; + restore_flags(flags); + + return(fp); +} + + +/* + * Find the correct entry in the "incomplete datagrams" queue for + * this IP datagram, and return the queue entry address if found. + */ + +static struct ipq *ip_find(struct iphdr *iph) +{ + struct ipq *qp; + struct ipq *qplast; + + cli(); + qplast = NULL; + for(qp = ipqueue; qp != NULL; qplast = qp, qp = qp->next) + { + if (iph->id== qp->iph->id && iph->saddr == qp->iph->saddr && + iph->daddr == qp->iph->daddr && iph->protocol == qp->iph->protocol) + { + del_timer(&qp->timer); /* So it doesn't vanish on us. The timer will be reset anyway */ + sti(); + return(qp); + } + } + sti(); + return(NULL); +} + + +/* + * Remove an entry from the "incomplete datagrams" queue, either + * because we completed, reassembled and processed it, or because + * it timed out. + */ + +static void ip_free(struct ipq *qp) +{ + struct ipfrag *fp; + struct ipfrag *xp; + + /* + * Stop the timer for this entry. + */ + + del_timer(&qp->timer); + + /* Remove this entry from the "incomplete datagrams" queue. */ + cli(); + if (qp->prev == NULL) + { + ipqueue = qp->next; + if (ipqueue != NULL) + ipqueue->prev = NULL; + } + else + { + qp->prev->next = qp->next; + if (qp->next != NULL) + qp->next->prev = qp->prev; + } + + /* Release all fragment data. 
*/ + + fp = qp->fragments; + while (fp != NULL) + { + xp = fp->next; + IS_SKB(fp->skb); + frag_kfree_skb(fp->skb,FREE_READ); + frag_kfree_s(fp, sizeof(struct ipfrag)); + fp = xp; + } + + /* Release the IP header. */ + frag_kfree_s(qp->iph, 64 + 8); + + /* Finally, release the queue descriptor itself. */ + frag_kfree_s(qp, sizeof(struct ipq)); + sti(); +} + + +/* + * Oops- a fragment queue timed out. Kill it and send an ICMP reply. + */ + +static void ip_expire(unsigned long arg) +{ + struct ipq *qp; + + qp = (struct ipq *)arg; + + /* + * Send an ICMP "Fragment Reassembly Timeout" message. + */ + + ip_statistics.IpReasmTimeout++; + ip_statistics.IpReasmFails++; + /* This if is always true... shrug */ + if(qp->fragments!=NULL) + icmp_send(qp->fragments->skb,ICMP_TIME_EXCEEDED, + ICMP_EXC_FRAGTIME, 0, qp->dev); + + /* + * Nuke the fragment queue. + */ + ip_free(qp); +} + +/* + * Memory limiting on fragments. Evictor trashes the oldest + * fragment queue until we are back under the low threshold + */ + +static void ip_evictor(void) +{ + while(ip_frag_mem>IPFRAG_LOW_THRESH) + { + if(!ipqueue) + panic("ip_evictor: memcount"); + ip_free(ipqueue); + } +} + +/* + * Add an entry to the 'ipq' queue for a newly received IP datagram. + * We will (hopefully :-) receive all other fragments of this datagram + * in time, so we just create a queue for this datagram, in which we + * will insert the received fragments at their respective positions. + */ + +static struct ipq *ip_create(struct sk_buff *skb, struct iphdr *iph, struct device *dev) +{ + struct ipq *qp; + int ihlen; + + qp = (struct ipq *) frag_kmalloc(sizeof(struct ipq), GFP_ATOMIC); + if (qp == NULL) + { + NETDEBUG(printk("IP: create: no memory left !\n")); + return(NULL); + } + memset(qp, 0, sizeof(struct ipq)); + + /* + * Allocate memory for the IP header (plus 8 octets for ICMP). 
+ */ + + ihlen = iph->ihl * 4; + qp->iph = (struct iphdr *) frag_kmalloc(64 + 8, GFP_ATOMIC); + if (qp->iph == NULL) + { + NETDEBUG(printk("IP: create: no memory left !\n")); + frag_kfree_s(qp, sizeof(struct ipq)); + return(NULL); + } + + memcpy(qp->iph, iph, ihlen + 8); + qp->len = 0; + qp->ihlen = ihlen; + qp->fragments = NULL; + qp->dev = dev; + + /* Start a timer for this entry. */ + qp->timer.expires = jiffies + IP_FRAG_TIME; /* about 30 seconds */ + qp->timer.data = (unsigned long) qp; /* pointer to queue */ + qp->timer.function = ip_expire; /* expire function */ + add_timer(&qp->timer); + + /* Add this entry to the queue. */ + qp->prev = NULL; + cli(); + qp->next = ipqueue; + if (qp->next != NULL) + qp->next->prev = qp; + ipqueue = qp; + sti(); + return(qp); +} + + +/* + * See if a fragment queue is complete. + */ + +static int ip_done(struct ipq *qp) +{ + struct ipfrag *fp; + int offset; + + /* Only possible if we received the final fragment. */ + if (qp->len == 0) + return(0); + + /* Check all fragment offsets to see if they connect. */ + fp = qp->fragments; + offset = 0; + while (fp != NULL) + { + if (fp->offset > offset) + return(0); /* fragment(s) missing */ + offset = fp->end; + fp = fp->next; + } + + /* All fragments are present. */ + return(1); +} + + +/* + * Build a new IP datagram from all its fragments. + * + * FIXME: We copy here because we lack an effective way of handling lists + * of bits on input. Until the new skb data handling is in I'm not going + * to touch this with a bargepole. + */ + +static struct sk_buff *ip_glue(struct ipq *qp) +{ + struct sk_buff *skb; + struct iphdr *iph; + struct ipfrag *fp; + unsigned char *ptr; + int count, len; + + /* + * Allocate a new buffer for the datagram. 
+ */ + len = qp->ihlen + qp->len; + + if ((skb = dev_alloc_skb(len)) == NULL) + { + ip_statistics.IpReasmFails++; + NETDEBUG(printk("IP: queue_glue: no memory for gluing queue %p\n", qp)); + ip_free(qp); + return(NULL); + } + + /* Fill in the basic details. */ + skb_put(skb,len); + skb->h.raw = skb->data; + skb->free = 1; + + /* Copy the original IP headers into the new buffer. */ + ptr = (unsigned char *) skb->h.raw; + memcpy(ptr, ((unsigned char *) qp->iph), qp->ihlen); + ptr += qp->ihlen; + + count = 0; + + /* Copy the data portions of all fragments into the new buffer. */ + fp = qp->fragments; + while(fp != NULL) + { + if(count+fp->len > skb->len) + { + NETDEBUG(printk("Invalid fragment list: Fragment over size.\n")); + ip_free(qp); + kfree_skb(skb,FREE_WRITE); + ip_statistics.IpReasmFails++; + return NULL; + } + memcpy((ptr + fp->offset), fp->ptr, fp->len); + count += fp->len; + fp = fp->next; + } + + /* We glued together all fragments, so remove the queue entry. */ + ip_free(qp); + + /* Done with all fragments. Fixup the new IP header. */ + iph = skb->h.iph; + iph->frag_off = 0; + iph->tot_len = htons((iph->ihl * 4) + count); + skb->ip_hdr = iph; + + ip_statistics.IpReasmOKs++; + return(skb); +} + + +/* + * Process an incoming IP datagram fragment. + */ + +struct sk_buff *ip_defrag(struct iphdr *iph, struct sk_buff *skb, struct device *dev) +{ + struct ipfrag *prev, *next, *tmp; + struct ipfrag *tfp; + struct ipq *qp; + struct sk_buff *skb2; + unsigned char *ptr; + int flags, offset; + int i, ihl, end; + + ip_statistics.IpReasmReqds++; + + /* + * Start by cleaning up the memory + */ + + if(ip_frag_mem>IPFRAG_HIGH_THRESH) + ip_evictor(); + /* + * Find the entry of this IP datagram in the "incomplete datagrams" queue. + */ + + qp = ip_find(iph); + + /* Is this a non-fragmented datagram? 
*/ + offset = ntohs(iph->frag_off); + flags = offset & ~IP_OFFSET; + offset &= IP_OFFSET; + if (((flags & IP_MF) == 0) && (offset == 0)) + { + if (qp != NULL) + ip_free(qp); /* Huh? How could this exist?? */ + return(skb); + } + + offset <<= 3; /* offset is in 8-byte chunks */ + ihl = iph->ihl * 4; + + /* + * If the queue already existed, keep restarting its timer as long + * as we still are receiving fragments. Otherwise, create a fresh + * queue entry. + */ + + if (qp != NULL) + { + /* ANK. If the first fragment is received, + * we should remember the correct IP header (with options) + */ + if (offset == 0) + { + qp->ihlen = ihl; + memcpy(qp->iph, iph, ihl+8); + } + del_timer(&qp->timer); + qp->timer.expires = jiffies + IP_FRAG_TIME; /* about 30 seconds */ + qp->timer.data = (unsigned long) qp; /* pointer to queue */ + qp->timer.function = ip_expire; /* expire function */ + add_timer(&qp->timer); + } + else + { + /* + * If we failed to create it, then discard the frame + */ + if ((qp = ip_create(skb, iph, dev)) == NULL) + { + skb->sk = NULL; + frag_kfree_skb(skb, FREE_READ); + ip_statistics.IpReasmFails++; + return NULL; + } + } + + /* + * Attempt to construct an oversize packet. + */ + + if(ntohs(iph->tot_len)+(int)offset>65535) + { + skb->sk = NULL; + frag_kfree_skb(skb, FREE_READ); + ip_statistics.IpReasmFails++; + return NULL; + } + + /* + * Determine the position of this fragment. + */ + + end = offset + ntohs(iph->tot_len) - ihl; + + /* + * Point into the IP datagram 'data' part. + */ + + ptr = skb->data + ihl; + + /* + * Is this the final fragment? + */ + + if ((flags & IP_MF) == 0) + qp->len = end; + + /* + * Find out which fragments are in front and at the back of us + * in the chain of fragments so far. We must know where to put + * this fragment, right? + */ + + prev = NULL; + for(next = qp->fragments; next != NULL; next = next->next) + { + if (next->offset > offset) + break; /* bingo! */ + prev = next; + } + + /* + * We found where to put this one. 
+ * Check for overlap with preceding fragment, and, if needed, + * align things so that any overlaps are eliminated. + */ + if (prev != NULL && offset < prev->end) + { + i = prev->end - offset; + offset += i; /* ptr into datagram */ + ptr += i; /* ptr into fragment data */ + } + + /* + * Look for overlap with succeeding segments. + * If we can merge fragments, do it. + */ + + for(tmp=next; tmp != NULL; tmp = tfp) + { + tfp = tmp->next; + if (tmp->offset >= end) + break; /* no overlaps at all */ + + i = end - next->offset; /* overlap is 'i' bytes */ + tmp->len -= i; /* so reduce size of */ + tmp->offset += i; /* next fragment */ + tmp->ptr += i; + /* + * If we get a frag size of <= 0, remove it and the packet + * that it goes with. + */ + if (tmp->len <= 0) + { + if (tmp->prev != NULL) + tmp->prev->next = tmp->next; + else + qp->fragments = tmp->next; + + if (tfp->next != NULL) + tmp->next->prev = tmp->prev; + + next=tfp; /* We have killed the original next frame */ + + frag_kfree_skb(tmp->skb,FREE_READ); + frag_kfree_s(tmp, sizeof(struct ipfrag)); + } + } + + /* + * Insert this fragment in the chain of fragments. + */ + + tfp = NULL; + tfp = ip_frag_create(offset, end, skb, ptr); + + /* + * No memory to save the fragment - so throw the lot + */ + + if (!tfp) + { + skb->sk = NULL; + frag_kfree_skb(skb, FREE_READ); + return NULL; + } + tfp->prev = prev; + tfp->next = next; + if (prev != NULL) + prev->next = tfp; + else + qp->fragments = tfp; + + if (next != NULL) + next->prev = tfp; + + /* + * OK, so we inserted this new fragment into the chain. + * Check if we now have a full IP datagram which we can + * bump up to the IP layer... + */ + + if (ip_done(qp)) + { + skb2 = ip_glue(qp); /* glue together the fragments */ + return(skb2); + } + return(NULL); +} + + +/* + * This IP datagram is too large to be sent in one piece. 
Break it up into + * smaller pieces (each of size equal to the MAC header plus IP header plus + * a block of the data of the original IP data part) that will yet fit in a + * single device frame, and queue such a frame for sending by calling the + * ip_queue_xmit(). Note that this is recursion, and bad things will happen + * if this function causes a loop... + * + * Yes this is inefficient, feel free to submit a quicker one. + * + */ + +void ip_fragment(struct sock *sk, struct sk_buff *skb, struct device *dev, int is_frag) +{ + struct iphdr *iph; + unsigned char *raw; + unsigned char *ptr; + struct sk_buff *skb2; + int left, mtu, hlen, len; + int offset; + + unsigned short true_hard_header_len; + + /* + * Point into the IP datagram header. + */ + + raw = skb->data; +#if 0 + iph = (struct iphdr *) (raw + dev->hard_header_len); + skb->ip_hdr = iph; +#else + iph = skb->ip_hdr; +#endif + + /* + * Calculate the length of the link-layer header appended to + * the IP-packet. + */ + true_hard_header_len = ((unsigned char *)iph) - raw; + + /* + * Setup starting values. + */ + + hlen = iph->ihl * 4; + left = ntohs(iph->tot_len) - hlen; /* Space per frame */ + hlen += true_hard_header_len; + mtu = (dev->mtu - hlen); /* Size of data space */ + ptr = (raw + hlen); /* Where to start from */ + + /* + * Check for any "DF" flag. [DF means do not fragment] + */ + + if (iph->frag_off & htons(IP_DF)) + { + ip_statistics.IpFragFails++; + NETDEBUG(printk("ip_queue_xmit: frag needed\n")); + return; + } + + /* + * The protocol doesn't seem to say what to do in the case that the + * frame + options doesn't fit the mtu. As it used to fall down dead + * in this case we were fortunate it didn't happen + */ + + if(mtu<8) + { + /* It's wrong but it's better than nothing */ + icmp_send(skb,ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED,dev->mtu, dev); + ip_statistics.IpFragFails++; + return; + } + + /* + * Fragment the datagram. + */ + + /* + * The initial offset is 0 for a complete frame. 
When + * fragmenting fragments it's wherever this one starts. + */ + + if (is_frag & 2) + offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; + else + offset = 0; + + + /* + * Keep copying data until we run out. + */ + + while(left > 0) + { + len = left; + /* IF: it doesn't fit, use 'mtu' - the data space left */ + if (len > mtu) + len = mtu; + /* IF: we are not sending upto and including the packet end + then align the next start on an eight byte boundary */ + if (len < left) + { + len/=8; + len*=8; + } + /* + * Allocate buffer. + */ + + if ((skb2 = alloc_skb(len + hlen+15,GFP_ATOMIC)) == NULL) + { + NETDEBUG(printk("IP: frag: no memory for new fragment!\n")); + ip_statistics.IpFragFails++; + return; + } + + /* + * Set up data on packet + */ + + skb2->arp = skb->arp; + skb2->protocol = htons(ETH_P_IP); /* Atleast PPP needs this */ +#if 0 + if(skb->free==0) + printk(KERN_ERR "IP fragmenter: BUG free!=1 in fragmenter\n"); +#endif + skb2->free = 1; + skb_put(skb2,len + hlen); + skb2->h.raw=(char *) skb2->data; + /* + * Charge the memory for the fragment to any owner + * it might possess + */ + + if (sk) + { + atomic_add(skb2->truesize, &sk->wmem_alloc); + skb2->sk=sk; + } + skb2->raddr = skb->raddr; /* For rebuild_header - must be here */ + + /* + * Copy the packet header into the new buffer. + */ + + memcpy(skb2->h.raw, raw, hlen); + + /* + * Copy a block of the IP datagram. + */ + memcpy(skb2->h.raw + hlen, ptr, len); + left -= len; + + skb2->h.raw+=true_hard_header_len; + + /* + * Fill in the new header fields. + */ + iph = (struct iphdr *)(skb2->h.raw/*+dev->hard_header_len*/); + iph->frag_off = htons((offset >> 3)); + skb2->ip_hdr = iph; + + /* ANK: dirty, but effective trick. Upgrade options only if + * the segment to be fragmented was THE FIRST (otherwise, + * options are already fixed) and make it ONCE + * on the initial skb, so that all the following fragments + * will inherit fixed options. 
+ */ + if (offset == 0) + ip_options_fragment(skb); + + /* + * Added AC : If we are fragmenting a fragment that's not the + * last fragment then keep MF on each bit + */ + if (left > 0 || (is_frag & 1)) + iph->frag_off |= htons(IP_MF); + ptr += len; + offset += len; + + /* + * Put this fragment into the sending queue. + */ + + ip_statistics.IpFragCreates++; + + ip_queue_xmit(sk, dev, skb2, 2); + } + ip_statistics.IpFragOKs++; +} + + diff --git a/net/ipv4/ip_fw.c b/net/ipv4/ip_fw.c index b2e901926..f0ae86e36 100644 --- a/net/ipv4/ip_fw.c +++ b/net/ipv4/ip_fw.c @@ -19,12 +19,33 @@ * Porting bidirectional entries from BSD, fixing accounting issues, * adding struct ip_fwpkt for checking packets with interface address * Jos Vos 5/Mar/1995. + * Established connections (ACK check), ACK check on bidirectional rules, + * ICMP type check. + * Wilfred Mollenvanger 7/7/1995. + * TCP attack protection. + * Alan Cox 25/8/95, based on information from bugtraq. + * ICMP type printk, IP_FW_F_APPEND + * Bernd Eckenfels 1996-01-31 + * Split blocking chain into input and output chains, add new "insert" and + * "append" commands to replace semi-intelligent "add" command, let "delete". + * only delete the first matching entry, use 0xFFFF (0xFF) as ports (ICMP + * types) when counting packets being 2nd and further fragments. + * Jos Vos <jos@xos.nl> 8/2/1996. + * Add support for matching on device names. + * Jos Vos <jos@xos.nl> 15/2/1996. + * Transparent proxying support. + * Willy Konynenberg <willy@xos.nl> 10/5/96. + * Make separate accounting on incoming and outgoing packets possible. + * Jos Vos <jos@xos.nl> 18/5/1996. + * Added trap out of bad frames. 
+ * Alan Cox <alan@cymru.net> 17/11/1996 + * * * Masquerading functionality * * Copyright (c) 1994 Pauline Middelink * - * The pieces which added masquerading functionality are totaly + * The pieces which added masquerading functionality are totally * my responsibility and have nothing to with the original authors * copyright or doing. * @@ -33,6 +54,12 @@ * Fixes: * Pauline Middelink : Added masquerading. * Alan Cox : Fixed an error in the merge. + * Thomas Quinot : Fixed port spoofing. + * Alan Cox : Cleaned up retransmits in spoofing. + * Alan Cox : Cleaned up length setting. + * Wouter Gadeyne : Fixed masquerading support of ftp PORT commands + * + * Juan Jose Ciarlante : Masquerading code moved to ip_masq.c * * All the real work was done by ..... * @@ -54,7 +81,7 @@ */ #include <linux/config.h> -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <linux/types.h> #include <linux/kernel.h> @@ -75,17 +102,25 @@ #include <net/route.h> #include <net/tcp.h> #include <net/udp.h> -#include <linux/skbuff.h> #include <net/sock.h> #include <net/icmp.h> +#include <net/netlink.h> +#include <linux/firewall.h> #include <linux/ip_fw.h> + +#ifdef CONFIG_IP_MASQUERADE +#include <net/ip_masq.h> +#endif + #include <net/checksum.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> /* * Implement IP packet firewall */ -#ifdef CONFIG_IPFIREWALL_DEBUG +#ifdef DEBUG_IP_FIREWALL #define dprintf1(a) printk(a) #define dprintf2(a1,a2) printk(a1,a2) #define dprintf3(a1,a2,a3) printk(a1,a2,a3) @@ -102,34 +137,30 @@ (ntohl(a)>>8)&0xFF,\ (ntohl(a))&0xFF); -#ifdef IPFIREWALL_DEBUG +#ifdef DEBUG_IP_FIREWALL #define dprint_ip(a) print_ip(a) #else #define dprint_ip(a) #endif -#ifdef CONFIG_IP_FIREWALL +#if defined(CONFIG_IP_ACCT) || defined(CONFIG_IP_FIREWALL) + struct ip_fw *ip_fw_fwd_chain; -struct ip_fw *ip_fw_blk_chain; -int ip_fw_blk_policy=IP_FW_F_ACCEPT; -int ip_fw_fwd_policy=IP_FW_F_ACCEPT; -#endif -#ifdef CONFIG_IP_ACCT +struct ip_fw *ip_fw_in_chain; 
+struct ip_fw *ip_fw_out_chain; struct ip_fw *ip_acct_chain; -#endif -#define IP_INFO_BLK 0 -#define IP_INFO_FWD 1 -#define IP_INFO_ACCT 2 +static struct ip_fw **chains[] = + {&ip_fw_fwd_chain, &ip_fw_in_chain, &ip_fw_out_chain, &ip_acct_chain}; +#endif /* CONFIG_IP_ACCT || CONFIG_IP_FIREWALL */ -#ifdef CONFIG_IP_MASQUERADE -/* - * Implement IP packet masquerading - */ +#ifdef CONFIG_IP_FIREWALL +int ip_fw_fwd_policy=IP_FW_F_ACCEPT; +int ip_fw_in_policy=IP_FW_F_ACCEPT; +int ip_fw_out_policy=IP_FW_F_ACCEPT; -static unsigned short masq_port = PORT_MASQ_BEGIN; -static char *strProt[] = {"UDP","TCP"}; -struct ip_masq *ip_msq_hosts; +static int *policies[] = + {&ip_fw_fwd_policy, &ip_fw_in_policy, &ip_fw_out_policy}; #endif @@ -164,26 +195,30 @@ extern inline int port_match(unsigned short *portptr,int nports,unsigned short p /* - * Returns 0 if packet should be dropped, 1 if it should be accepted, - * and -1 if an ICMP host unreachable packet should be sent. + * Returns one of the generic firewall policies, like FW_ACCEPT. * Also does accounting so you can feed it the accounting chain. - * If opt is set to 1, it means that we do this for accounting - * purposes (searches all entries and handles fragments different). - * If opt is set to 2, it doesn't count a matching packet, which - * is used when calling this for checking purposes (IP_FW_CHK_*). + * + * The modes is either IP_FW_MODE_FW (normal firewall mode), + * IP_FW_MODE_ACCT_IN or IP_FW_MODE_ACCT_OUT (accounting mode, + * steps through the entire chain and handles fragments + * differently), or IP_FW_MODE_CHK (handles user-level check, + * counters are not updated). 
*/ -int ip_fw_chk(struct iphdr *ip, struct device *rif, struct ip_fw *chain, int policy, int opt) +int ip_fw_chk(struct iphdr *ip, struct device *rif, __u16 *redirport, struct ip_fw *chain, int policy, int mode) { struct ip_fw *f; struct tcphdr *tcp=(struct tcphdr *)((unsigned long *)ip+ip->ihl); struct udphdr *udp=(struct udphdr *)((unsigned long *)ip+ip->ihl); + struct icmphdr *icmp=(struct icmphdr *)((unsigned long *)ip+ip->ihl); __u32 src, dst; - __u16 src_port=0, dst_port=0; + __u16 src_port=0xFFFF, dst_port=0xFFFF, icmp_type=0xFF; unsigned short f_prt=0, prt; - char notcpsyn=1, frag1, match; - unsigned short f_flag; + char notcpsyn=0, notcpack=0, match; + unsigned short offset; + int answer; + unsigned char tosand, tosxor; /* * If the chain is empty follow policy. The BSD one @@ -208,11 +243,39 @@ int ip_fw_chk(struct iphdr *ip, struct device *rif, struct ip_fw *chain, int pol * of system. */ - frag1 = ((ntohs(ip->frag_off) & IP_OFFSET) == 0); - if (!frag1 && (opt != 1) && (ip->protocol == IPPROTO_TCP || - ip->protocol == IPPROTO_UDP)) - return(1); - + offset = ntohs(ip->frag_off) & IP_OFFSET; + + /* + * Don't allow a fragment of TCP 8 bytes in. Nobody + * normal causes this. Its a cracker trying to break + * in by doing a flag overwrite to pass the direction + * checks. + */ + + if (offset == 1 && ip->protocol == IPPROTO_TCP) + return FW_BLOCK; + + if (offset!=0 && !(mode & (IP_FW_MODE_ACCT_IN|IP_FW_MODE_ACCT_OUT)) && + (ip->protocol == IPPROTO_TCP || ip->protocol == IPPROTO_UDP || + ip->protocol == IPPROTO_ICMP)) + return FW_ACCEPT; + + /* + * Header fragment for TCP is too small to check the bits. + */ + + if(ip->protocol==IPPROTO_TCP && (ip->ihl<<2)+16 > ntohs(ip->tot_len)) + return FW_BLOCK; + + /* + * Too short. + * + * But only too short for a packet with ports... 
+ */ + + else if((ntohs(ip->tot_len)<8+(ip->ihl<<2))&&(ip->protocol==IPPROTO_TCP || ip->protocol==IPPROTO_UDP)) + return FW_BLOCK; + src = ip->saddr; dst = ip->daddr; @@ -229,27 +292,33 @@ int ip_fw_chk(struct iphdr *ip, struct device *rif, struct ip_fw *chain, int pol { case IPPROTO_TCP: dprintf1("TCP "); - /* ports stay 0 if it is not the first fragment */ - if (frag1) { + /* ports stay 0xFFFF if it is not the first fragment */ + if (!offset) { src_port=ntohs(tcp->source); dst_port=ntohs(tcp->dest); - if(tcp->syn && !tcp->ack) - /* We *DO* have SYN, value FALSE */ - notcpsyn=0; + if(!tcp->ack && !tcp->rst) + /* We do NOT have ACK, value TRUE */ + notcpack=1; + if(!tcp->syn || !notcpack) + /* We do NOT have SYN, value TRUE */ + notcpsyn=1; } prt=IP_FW_F_TCP; break; case IPPROTO_UDP: dprintf1("UDP "); - /* ports stay 0 if it is not the first fragment */ - if (frag1) { + /* ports stay 0xFFFF if it is not the first fragment */ + if (!offset) { src_port=ntohs(udp->source); dst_port=ntohs(udp->dest); } prt=IP_FW_F_UDP; break; case IPPROTO_ICMP: - dprintf2("ICMP:%d ",((char *)portptr)[0]&0xff); + /* icmp_type stays 255 if it is not the first fragment */ + if (!offset) + icmp_type=(__u16)(icmp->type); + dprintf2("ICMP:%d ",icmp_type); prt=IP_FW_F_ICMP; break; default: @@ -257,15 +326,15 @@ int ip_fw_chk(struct iphdr *ip, struct device *rif, struct ip_fw *chain, int pol prt=IP_FW_F_ALL; break; } -#ifdef CONFIG_IP_FIREWALL_DEBUG +#ifdef DEBUG_IP_FIREWALL dprint_ip(ip->saddr); if (ip->protocol==IPPROTO_TCP || ip->protocol==IPPROTO_UDP) - /* This will print 0 when it is not the first fragment! */ + /* This will print 65535 when it is not the first fragment! */ dprintf2(":%d ", src_port); dprint_ip(ip->daddr); if (ip->protocol==IPPROTO_TCP || ip->protocol==IPPROTO_UDP) - /* This will print 0 when it is not the first fragment! */ + /* This will print 65535 when it is not the first fragment! 
*/ dprintf2(":%d ",dst_port); dprintf1("\n"); #endif @@ -302,40 +371,60 @@ int ip_fw_chk(struct iphdr *ip, struct device *rif, struct ip_fw *chain, int pol /* reverse direction */ match |= 0x02; - if (match) + if (!match) + continue; + + /* + * Look for a VIA address match + */ + if(f->fw_via.s_addr && rif) { - /* - * Look for a VIA match - */ - if(f->fw_via.s_addr && rif) - { - if(rif->pa_addr!=f->fw_via.s_addr) - continue; /* Mismatch */ - } - /* - * Drop through - this is a match - */ + if(rif->pa_addr!=f->fw_via.s_addr) + continue; /* Mismatch */ + } + + /* + * Look for a VIA device match + */ + if(f->fw_viadev) + { + if(rif!=f->fw_viadev) + continue; /* Mismatch */ } - else - continue; /* * Ok the chain addresses match. */ +#ifdef CONFIG_IP_ACCT + /* + * See if we're in accounting mode and only want to + * count incoming or outgoing packets. + */ + + if (mode & (IP_FW_MODE_ACCT_IN|IP_FW_MODE_ACCT_OUT) && + ((mode == IP_FW_MODE_ACCT_IN && f->fw_flg&IP_FW_F_ACCTOUT) || + (mode == IP_FW_MODE_ACCT_OUT && f->fw_flg&IP_FW_F_ACCTIN))) + continue; + +#endif + /* + * For all non-TCP packets and/or non-first fragments, + * notcpsyn and notcpack will always be FALSE, + * so the IP_FW_F_TCPSYN and IP_FW_F_TCPACK flags + * are actually ignored for these packets. + */ + + if((f->fw_flg&IP_FW_F_TCPSYN) && notcpsyn) + continue; + + if((f->fw_flg&IP_FW_F_TCPACK) && notcpack) + continue; + f_prt=f->fw_flg&IP_FW_F_KIND; if (f_prt!=IP_FW_F_ALL) { /* - * This is actually buggy as if you set SYN flag - * on UDP or ICMP firewall it will never work,but - * actually it is a concern of software which sets - * firewall entries. - */ - - if((f->fw_flg&IP_FW_F_TCPSYN) && notcpsyn) - continue; - /* * Specific firewall - packet's protocol * must match firewall's. */ @@ -343,7 +432,10 @@ int ip_fw_chk(struct iphdr *ip, struct device *rif, struct ip_fw *chain, int pol if(prt!=f_prt) continue; - if(!(prt==IP_FW_F_ICMP || ((match & 0x01) && + if((prt==IP_FW_F_ICMP && + ! 
port_match(&f->fw_pts[0], f->fw_nsp, + icmp_type,f->fw_flg&IP_FW_F_SRNG)) || + !(prt==IP_FW_F_ICMP || ((match & 0x01) && port_match(&f->fw_pts[0], f->fw_nsp, src_port, f->fw_flg&IP_FW_F_SRNG) && port_match(&f->fw_pts[f->fw_nsp], f->fw_ndp, dst_port, @@ -356,492 +448,147 @@ int ip_fw_chk(struct iphdr *ip, struct device *rif, struct ip_fw *chain, int pol continue; } } + #ifdef CONFIG_IP_FIREWALL_VERBOSE /* * VERY ugly piece of code which actually - * makes kernel printf for denied packets... + * makes kernel printf for matching packets... */ if (f->fw_flg & IP_FW_F_PRN) { - if(opt != 1) { - if(f->fw_flg&IP_FW_F_ACCEPT) - printk("Accept "); - else if(f->fw_flg&IP_FW_F_ICMPRPL) - printk("Reject "); + __u32 *opt = (__u32 *) (ip + 1); + int opti; + + if(mode == IP_FW_MODE_ACCT_IN) + printk(KERN_INFO "IP acct in "); + else if(mode == IP_FW_MODE_ACCT_OUT) + printk(KERN_INFO "IP acct out "); + else { + if(chain == ip_fw_fwd_chain) + printk(KERN_INFO "IP fw-fwd "); + else if(chain == ip_fw_in_chain) + printk(KERN_INFO "IP fw-in "); + else + printk(KERN_INFO "IP fw-out "); + if(f->fw_flg&IP_FW_F_ACCEPT) { + if(f->fw_flg&IP_FW_F_REDIR) + printk("acc/r%d ", f->fw_pts[f->fw_nsp+f->fw_ndp]); + else if(f->fw_flg&IP_FW_F_MASQ) + printk("acc/masq "); + else + printk("acc "); + } else if(f->fw_flg&IP_FW_F_ICMPRPL) + printk("rej "); else - printk("Deny "); + printk("deny "); } + printk(rif ? 
rif->name : "-"); switch(ip->protocol) { case IPPROTO_TCP: - printk("TCP "); + printk(" TCP "); break; case IPPROTO_UDP: - printk("UDP "); + printk(" UDP "); + break; case IPPROTO_ICMP: - printk("ICMP "); + printk(" ICMP/%d ", icmp_type); break; default: - printk("p=%d ",ip->protocol); + printk(" PROTO=%d ", ip->protocol); break; } print_ip(ip->saddr); if(ip->protocol == IPPROTO_TCP || ip->protocol == IPPROTO_UDP) - printk(":%d", src_port); + printk(":%hu", src_port); printk(" "); print_ip(ip->daddr); if(ip->protocol == IPPROTO_TCP || ip->protocol == IPPROTO_UDP) - printk(":%d",dst_port); + printk(":%hu", dst_port); + printk(" L=%hu S=0x%2.2hX I=%hu F=0x%4.4hX T=%hu", + ntohs(ip->tot_len), ip->tos, ntohs(ip->id), + ip->frag_off, ip->ttl); + for (opti = 0; opti < (ip->ihl - sizeof(struct iphdr) / 4); opti++) + printk(" O=0x%8.8X", *opt++); printk("\n"); } #endif - if (opt != 2) { + if (mode != IP_FW_MODE_CHK) { f->fw_bcnt+=ntohs(ip->tot_len); f->fw_pcnt++; } - if (opt != 1) + if (!(mode & (IP_FW_MODE_ACCT_IN|IP_FW_MODE_ACCT_OUT))) break; } /* Loop */ - if(opt == 1) - return 0; - - /* - * We rely on policy defined in the rejecting entry or, if no match - * was found, we rely on the general policy variable for this type - * of firewall. 
- */ - - if(f!=NULL) /* A match was found */ - f_flag=f->fw_flg; - else - f_flag=policy; - if(f_flag&IP_FW_F_ACCEPT) - return ((f_flag&IP_FW_F_MASQ)?2:1); - if(f_flag&IP_FW_F_ICMPRPL) - return -1; - return 0; -} - -#ifdef CONFIG_IP_MASQUERADE - -static void masq_expire(unsigned long data) -{ - struct ip_masq *ms = (struct ip_masq *)data; - struct ip_masq *old,*cur; - unsigned long flags; - -#ifdef DEBUG_MASQ - printk("Masqueraded %s %lX:%X expired\n", - strProt[ms->protocol==IPPROTO_TCP], - ntohl(ms->src),ntohs(ms->sport)); -#endif - - save_flags(flags); - cli(); - - /* delete from list of hosts */ - old = NULL; - cur = ip_msq_hosts; - while (cur!=NULL) { - if (cur==ms) { - if (old==NULL) ip_msq_hosts = ms->next; - else old->next = ms->next; - kfree_s(ms,sizeof(*ms)); - break; - } - old = cur; - cur=cur->next; - } - restore_flags(flags); -} - -/* - * Create a new masquerade list entry, also allocate an - * unused mport, keeping the portnumber between the - * given boundaries MASQ_BEGIN and MASQ_END. - * - * FIXME: possible deadlock if all free ports are exhausted! - */ -static struct ip_masq *alloc_masq_entry(void) -{ - struct ip_masq *ms, *mst; - unsigned long flags; - - ms = (struct ip_masq *) kmalloc(sizeof(struct ip_masq), GFP_ATOMIC); - if (ms==NULL) - return NULL; - - memset(ms,0,sizeof(*ms)); - init_timer(&ms->timer); - ms->timer.data = (unsigned long)ms; - ms->timer.function = masq_expire; - - save_flags(flags); - cli(); - do - { - /* Try the next available port number */ - ms->mport = htons(masq_port++); - if (masq_port==PORT_MASQ_END) - masq_port = PORT_MASQ_BEGIN; - - /* Now hunt through the used ports to see if - * this port is in use... 
*/ - mst = ip_msq_hosts; - while (mst && mst->mport!=ms->mport) - mst = mst->next; - } - while (mst!=NULL); - - /* add new entry in front of list to minimize lookup-time */ - ms->next = ip_msq_hosts; - ip_msq_hosts = ms; - restore_flags(flags); - - return ms; -} - -/* - * When passing an FTP 'PORT' command, try to replace the IP - * address with an newly assigned (masquereded) port on this - * host, so the ftp-data connect FROM the site will succeed... - * - * Also, when the size of the packet changes, create an delta - * offset, which will be added to every th->seq (and subtracted for - * (th->acqseq) whose seq > init_seq. - * - * Not for the faint of heart! - */ - -static struct sk_buff *revamp(struct sk_buff *skb, struct device *dev, struct ip_masq *ftp) -{ - struct iphdr *iph = skb->h.iph; - struct tcphdr *th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); - struct sk_buff *skb2; - char *p, *data = (char *)&th[1]; - unsigned char p1,p2,p3,p4,p5,p6; - unsigned long from; - unsigned short port; - struct ip_masq *ms; - char buf[20]; /* xxx.xxx.xxx.xxx\r\n */ - - /* - * Adjust seq and ack_seq with delta-offset for - * the packets AFTER this one... 
- */ - if (ftp->delta && after(ftp->init_seq,th->seq)) - { - th->seq += ftp->delta; -/* th->ack_seq += ftp->delta;*/ - } - - while (skb->len - ((unsigned char *)data - skb->h.raw) > 18) - { - if (memcmp(data,"PORT ",5)!=0 && memcmp(data,"port ",5)!=0) - { - data += 5; - continue; - } - p = data+5; - p1 = simple_strtoul(data+5,&data,10); - if (*data!=',') - continue; - p2 = simple_strtoul(data+1,&data,10); - if (*data!=',') - continue; - p3 = simple_strtoul(data+1,&data,10); - if (*data!=',') - continue; - p4 = simple_strtoul(data+1,&data,10); - if (*data!=',') - continue; - p5 = simple_strtoul(data+1,&data,10); - if (*data!=',') - continue; - p6 = simple_strtoul(data+1,&data,10); - if (*data!='\r' && *data!='\n') - continue; - - from = (p1<<24) | (p2<<16) | (p3<<8) | p4; - port = (p5<<8) | p6; - printk("PORT %lX:%X detected\n",from,port); - - /* - * Now create an masquerade entry for it - */ - ms = alloc_masq_entry(); - if (ms==NULL) - return skb; - ms->protocol = IPPROTO_TCP; - ms->src = htonl(from); /* derived from PORT cmd */ - ms->sport = htons(port); /* derived from PORT cmd */ - ms->dst = iph->daddr; - ms->dport = htons(20); /* ftp-data */ - ms->timer.expires = MASQUERADE_EXPIRE_TCP_FIN; - add_timer(&ms->timer); - - /* - * Replace the old PORT with the new one - */ - from = ntohl(dev->pa_addr); - port = ntohs(ms->mport); - sprintf(buf,"%ld,%ld,%ld,%ld,%d,%d", - from>>24&255,from>>16&255,from>>8&255,from&255, - port>>8&255,port&255); + if (!(mode & (IP_FW_MODE_ACCT_IN|IP_FW_MODE_ACCT_OUT))) { /* - * Calculate required delta-offset to keep TCP happy + * We rely on policy defined in the rejecting entry or, if no match + * was found, we rely on the general policy variable for this type + * of firewall. 
*/ - ftp->delta += strlen(buf) - (data-p); - if (ftp->delta==0) - { - /* - * simple case, just replace the old PORT cmd - */ - ftp->init_seq = 0; - memcpy(p,buf,strlen(buf)); - return skb; - } - - /* - * Sizes differ, make a copy - */ - printk("MASQUERADE: resizing needed for %d bytes (%ld)\n",ftp->delta, skb->len); - if (!ftp->init_seq) - ftp->init_seq = th->seq; - - skb2 = alloc_skb(skb->mem_len-sizeof(struct sk_buff)+ftp->delta, GFP_ATOMIC); - if (skb2 == NULL) { - printk("MASQUERADE: No memory available\n"); - return skb; - } - skb2->free = skb->free; - skb2->len = skb->len + ftp->delta; - skb2->h.raw = &skb2->data[skb->h.raw - skb->data]; - - /* - * Copy the packet data into the new buffer. - * Thereby replacing the PORT cmd. - */ - memcpy(skb2->data, skb->data, (p - (char *)skb->data)); - memcpy(&skb2->data[(p - (char *)skb->data)], buf, strlen(buf)); - memcpy(&skb2->data[(p - (char *)skb->data) + strlen(buf)], data, - skb->mem_len - sizeof(struct sk_buff) - ((char *)skb->h.raw - data)); - - /* - * Problem, how to replace the new skb with old one, - * preferably inplace, so all the pointers in the - * calling tree keep ok :( - */ - kfree_skb(skb, FREE_WRITE); - return skb2; - } - return skb; -} -static void recalc_check(struct udphdr *uh, unsigned long saddr, - unsigned long daddr, int len) -{ - uh->check=0; - uh->check=csum_tcpudp_magic(saddr,daddr,len, - IPPROTO_UDP, csum_partial((char *)uh,len,0)); - if(uh->check==0) - uh->check=-0xFFFF; -} - -void ip_fw_masquerade(struct sk_buff **skb_ptr, struct device *dev) -{ - struct sk_buff *skb=*skb_ptr; - struct iphdr *iph = skb->h.iph; - unsigned short *portptr; - struct ip_masq *ms; - int size; - - /* - * We can only masquerade protocols with ports... 
- */ - - if (iph->protocol!=IPPROTO_UDP && iph->protocol!=IPPROTO_TCP) - return; - - /* - * Now hunt the list to see if we have an old entry - */ - - portptr = (unsigned short *)&(((char *)iph)[iph->ihl*4]); - ms = ip_msq_hosts; + if (f!=NULL) { + policy=f->fw_flg; + tosand=f->fw_tosand; + tosxor=f->fw_tosxor; + } else { + tosand=0xFF; + tosxor=0x00; + } -#ifdef DEBUG_MASQ - printk("Outgoing %s %lX:%X -> %lX:%X\n", - strProt[iph->protocol==IPPROTO_TCP], - ntohl(iph->saddr), ntohs(portptr[0]), - ntohl(iph->daddr), ntohs(portptr[1])); + if (policy&IP_FW_F_ACCEPT) { + /* Adjust priority and recompute checksum */ + __u8 old_tos = ip->tos; + ip->tos = (old_tos & tosand) ^ tosxor; + if (ip->tos != old_tos) + ip_send_check(ip); +#ifdef CONFIG_IP_TRANSPARENT_PROXY + if (policy&IP_FW_F_REDIR) { + if (redirport) + if ((*redirport = htons(f->fw_pts[f->fw_nsp+f->fw_ndp])) == 0) { + /* Wildcard redirection. + * Note that redirport will become + * 0xFFFF for non-TCP/UDP packets. + */ + *redirport = htons(dst_port); + } + answer = FW_REDIRECT; + } else #endif - while (ms!=NULL) - { - if (iph->protocol == ms->protocol && - iph->saddr == ms->src && iph->daddr == ms->dst && - portptr[0] == ms->sport && portptr[1] == ms->dport) - { - del_timer(&ms->timer); - break; - } - ms = ms->next; - } +#ifdef CONFIG_IP_MASQUERADE + if (policy&IP_FW_F_MASQ) + answer = FW_MASQUERADE; + else +#endif + answer = FW_ACCEPT; + + } else if(policy&IP_FW_F_ICMPRPL) + answer = FW_REJECT; + else + answer = FW_BLOCK; - /* - * Nope, not found, create a new entry for it - */ - - if (ms==NULL) - { - ms = alloc_masq_entry(); - if (ms==NULL) +#ifdef CONFIG_IP_FIREWALL_NETLINK + if(answer == FW_REJECT || answer == FW_BLOCK) { - printk("MASQUERADE: no memory left !\n"); - return; + struct sk_buff *skb=alloc_skb(128, GFP_ATOMIC); + if(skb) + { + int len=min(128,ntohs(ip->tot_len)); + skb_put(skb,len); + memcpy(skb->data,ip,len); + if(netlink_post(NETLINK_FIREWALL, skb)) + kfree_skb(skb, FREE_WRITE); + } } - 
ms->protocol = iph->protocol; - ms->src = iph->saddr; - ms->dst = iph->daddr; - ms->sport = portptr[0]; - ms->dport = portptr[1]; - } - - /* - * Change the fragments origin - */ - - size = skb->len - ((unsigned char *)portptr - skb->h.raw); - iph->saddr = dev->pa_addr; /* my own address */ - portptr[0] = ms->mport; - - /* - * Adjust packet accordingly to protocol - */ - - if (iph->protocol==IPPROTO_UDP) - { - ms->timer.expires = MASQUERADE_EXPIRE_UDP; - recalc_check((struct udphdr *)portptr,iph->saddr,iph->daddr,size); - } - else - { - struct tcphdr *th; - if (portptr[1]==htons(21)) - { - skb = revamp(*skb_ptr, dev, ms); - skb = *skb_ptr; - iph = skb->h.iph; - portptr = (unsigned short *)&(((char *)iph)[iph->ihl*4]); - } - th = (struct tcphdr *)portptr; - - /* - * Timeout depends if FIN packet was seen - */ - if (ms->sawfin || th->fin) - { - ms->timer.expires = MASQUERADE_EXPIRE_TCP_FIN; - ms->sawfin = 1; - } - else ms->timer.expires = MASQUERADE_EXPIRE_TCP; - - tcp_send_check(th,iph->saddr,iph->daddr,size,skb->sk); - } - add_timer(&ms->timer); - ip_send_check(iph); - - #ifdef DEBUG_MASQ - printk("O-routed from %lX:%X over %s\n",ntohl(dev->pa_addr),ntohs(ms->mport),dev->name); - #endif - } - - /* - * Check if it's an masqueraded port, look it up, - * and send it on it's way... - * - * Better not have many hosts using the designated portrange - * as 'normal' ports, or you'll be spending lots of time in - * this function. 
- */ - -int ip_fw_demasquerade(struct sk_buff *skb_ptr) -{ - struct iphdr *iph = skb_ptr->h.iph; - unsigned short *portptr; - struct ip_masq *ms; - struct tcphdr *th = (struct tcphdr *)(skb_ptr->h.raw+(iph->ihl<<2)); - - if (iph->protocol!=IPPROTO_UDP && iph->protocol!=IPPROTO_TCP) - return 0; - - portptr = (unsigned short *)&(((char *)iph)[iph->ihl*4]); - if (ntohs(portptr[1]) < PORT_MASQ_BEGIN || - ntohs(portptr[1]) > PORT_MASQ_END) - return 0; - -#ifdef DEBUG_MASQ - printk("Incoming %s %lX:%X -> %lX:%X\n", - strProt[iph->protocol==IPPROTO_TCP], - ntohl(iph->saddr), ntohs(portptr[0]), - ntohl(iph->daddr), ntohs(portptr[1])); -#endif - /* - * reroute to original host:port if found... - * - * NB. Cannot check destination address, just for the incoming port. - * reason: archie.doc.ac.uk has 6 interfaces, you send to - * phoenix and get a reply from any other interface(==dst)! - * - * [Only for UDP] - AC - */ - ms = ip_msq_hosts; - while (ms!=NULL) - { - if (iph->protocol==ms->protocol && - (iph->saddr==ms->dst || iph->protocol==IPPROTO_UDP) && - portptr[0]==ms->dport && - portptr[1]==ms->mport) - { - int size = skb_ptr->len - ((unsigned char *)portptr - skb_ptr->h.raw); - iph->daddr = ms->src; - portptr[1] = ms->sport; - - /* - * Yug! adjust UDP/TCP and IP checksums - */ - if (iph->protocol==IPPROTO_UDP) - recalc_check((struct udphdr *)portptr,iph->saddr,iph->daddr,size); - else - { - /* - * Adjust seq and ack_seq with delta-offset for - * the packets AFTER this one... 
- */ - if (ms->delta && after(ms->init_seq,th->ack_seq)) - { -/* th->seq += ms->delta;*/ - th->ack_seq -= ms->delta; - } - tcp_send_check((struct tcphdr *)portptr,iph->saddr,iph->daddr,size,skb_ptr->sk); - } - ip_send_check(iph); -#ifdef DEBUG_MASQ - printk("I-routed to %lX:%X\n",ntohl(iph->daddr),ntohs(portptr[1])); -#endif - return 1; - } - ms = ms->next; - } - - /* sorry, all this trouble for a no-hit :) */ - return 0; +#endif + return answer; + } else + /* we're doing accounting, always ok */ + return 0; } -#endif - static void zero_fw_chain(struct ip_fw *chainptr) @@ -872,182 +619,87 @@ static void free_fw_chain(struct ip_fw *volatile* chainptr) /* Volatiles to keep some of the compiler versions amused */ -static int add_to_chain(struct ip_fw *volatile* chainptr, struct ip_fw *frwl) +static int insert_in_chain(struct ip_fw *volatile* chainptr, struct ip_fw *frwl,int len) { struct ip_fw *ftmp; - struct ip_fw *chtmp=NULL; - struct ip_fw *volatile chtmp_prev=NULL; unsigned long flags; - unsigned long m_src_mask,m_dst_mask; - unsigned long n_sa,n_da,o_sa,o_da,o_sm,o_dm,n_sm,n_dm; - unsigned short n_sr,n_dr,o_sr,o_dr; - unsigned short oldkind,newkind; - int addb4=0; - int n_o,n_n; save_flags(flags); ftmp = kmalloc( sizeof(struct ip_fw), GFP_ATOMIC ); if ( ftmp == NULL ) { -#ifdef DEBUG_CONFIG_IP_FIREWALL +#ifdef DEBUG_IP_FIREWALL printk("ip_fw_ctl: malloc said no\n"); #endif return( ENOMEM ); } - memcpy(ftmp, frwl, sizeof( struct ip_fw ) ); - + memcpy(ftmp, frwl, len); + /* + * Allow the more recent "minimise cost" flag to be + * set. 
[Rob van Nieuwkerk] + */ + ftmp->fw_tosand |= 0x01; + ftmp->fw_tosxor &= 0xFE; ftmp->fw_pcnt=0L; ftmp->fw_bcnt=0L; - ftmp->fw_next = NULL; - cli(); - if (*chainptr==NULL) - { - *chainptr=ftmp; - } - else - { - chtmp_prev=NULL; - for (chtmp=*chainptr;chtmp!=NULL;chtmp=chtmp->fw_next) - { - addb4=0; - newkind=ftmp->fw_flg & IP_FW_F_KIND; - oldkind=chtmp->fw_flg & IP_FW_F_KIND; - - if (newkind!=IP_FW_F_ALL - && oldkind!=IP_FW_F_ALL - && oldkind!=newkind) - { - chtmp_prev=chtmp; - continue; - } + if ((ftmp->fw_vianame)[0]) { + if (!(ftmp->fw_viadev = dev_get(ftmp->fw_vianame))) + ftmp->fw_viadev = (struct device *) -1; + } else + ftmp->fw_viadev = NULL; - /* - * Very very *UGLY* code... - * Sorry,but i had to do this.... - */ + ftmp->fw_next = *chainptr; + *chainptr=ftmp; + restore_flags(flags); + return(0); +} - n_sa=ntohl(ftmp->fw_src.s_addr); - n_da=ntohl(ftmp->fw_dst.s_addr); - n_sm=ntohl(ftmp->fw_smsk.s_addr); - n_dm=ntohl(ftmp->fw_dmsk.s_addr); +static int append_to_chain(struct ip_fw *volatile* chainptr, struct ip_fw *frwl,int len) +{ + struct ip_fw *ftmp; + struct ip_fw *chtmp=NULL; + struct ip_fw *volatile chtmp_prev=NULL; + unsigned long flags; - o_sa=ntohl(chtmp->fw_src.s_addr); - o_da=ntohl(chtmp->fw_dst.s_addr); - o_sm=ntohl(chtmp->fw_smsk.s_addr); - o_dm=ntohl(chtmp->fw_dmsk.s_addr); + save_flags(flags); - m_src_mask = o_sm & n_sm; - m_dst_mask = o_dm & n_dm; + ftmp = kmalloc( sizeof(struct ip_fw), GFP_ATOMIC ); + if ( ftmp == NULL ) + { +#ifdef DEBUG_IP_FIREWALL + printk("ip_fw_ctl: malloc said no\n"); +#endif + return( ENOMEM ); + } - if ((o_sa & m_src_mask) == (n_sa & m_src_mask)) - { - if (n_sm > o_sm) - addb4++; - if (n_sm < o_sm) - addb4--; - } + memcpy(ftmp, frwl, len); + /* + * Allow the more recent "minimise cost" flag to be + * set. 
[Rob van Nieuwkerk] + */ + ftmp->fw_tosand |= 0x01; + ftmp->fw_tosxor &= 0xFE; + ftmp->fw_pcnt=0L; + ftmp->fw_bcnt=0L; - if ((o_da & m_dst_mask) == (n_da & m_dst_mask)) - { - if (n_dm > o_dm) - addb4++; - if (n_dm < o_dm) - addb4--; - } + ftmp->fw_next = NULL; - if (((o_da & o_dm) == (n_da & n_dm)) - &&((o_sa & o_sm) == (n_sa & n_sm))) - { - if (newkind!=IP_FW_F_ALL && - oldkind==IP_FW_F_ALL) - addb4++; - if (newkind==oldkind && (oldkind==IP_FW_F_TCP - || oldkind==IP_FW_F_UDP)) - { - - /* - * Here the main idea is to check the size - * of port range which the frwl covers - * We actually don't check their values but - * just the wideness of range they have - * so that less wide ranges or single ports - * go first and wide ranges go later. No ports - * at all treated as a range of maximum number - * of ports. - */ - - if (ftmp->fw_flg & IP_FW_F_SRNG) - n_sr=ftmp->fw_pts[1]-ftmp->fw_pts[0]; - else - n_sr=(ftmp->fw_nsp)? - ftmp->fw_nsp : 0xFFFF; - - if (chtmp->fw_flg & IP_FW_F_SRNG) - o_sr=chtmp->fw_pts[1]-chtmp->fw_pts[0]; - else - o_sr=(chtmp->fw_nsp)?chtmp->fw_nsp : 0xFFFF; - - if (n_sr<o_sr) - addb4++; - if (n_sr>o_sr) - addb4--; - - n_n=ftmp->fw_nsp; - n_o=chtmp->fw_nsp; - - /* - * Actually this cannot happen as the frwl control - * procedure checks for number of ports in source and - * destination range but we will try to be more safe. - */ - - if ((n_n>(IP_FW_MAX_PORTS-2)) || - (n_o>(IP_FW_MAX_PORTS-2))) - goto skip_check; - - if (ftmp->fw_flg & IP_FW_F_DRNG) - n_dr=ftmp->fw_pts[n_n+1]-ftmp->fw_pts[n_n]; - else - n_dr=(ftmp->fw_ndp)? ftmp->fw_ndp : 0xFFFF; - - if (chtmp->fw_flg & IP_FW_F_DRNG) - o_dr=chtmp->fw_pts[n_o+1]-chtmp->fw_pts[n_o]; - else - o_dr=(chtmp->fw_ndp)? 
chtmp->fw_ndp : 0xFFFF; - if (n_dr<o_dr) - addb4++; - if (n_dr>o_dr) - addb4--; -skip_check: - } - /* finally look at the interface address */ - if ((addb4 == 0) && ftmp->fw_via.s_addr && - !(chtmp->fw_via.s_addr)) - addb4++; - } - if (addb4>0) - { - if (chtmp_prev) - { - chtmp_prev->fw_next=ftmp; - ftmp->fw_next=chtmp; - } - else - { - *chainptr=ftmp; - ftmp->fw_next=chtmp; - } - restore_flags(flags); - return 0; - } - chtmp_prev=chtmp; - } - } + cli(); + + if ((ftmp->fw_vianame)[0]) { + if (!(ftmp->fw_viadev = dev_get(ftmp->fw_vianame))) + ftmp->fw_viadev = (struct device *) -1; + } else + ftmp->fw_viadev = NULL; + + chtmp_prev=NULL; + for (chtmp=*chainptr;chtmp!=NULL;chtmp=chtmp->fw_next) + chtmp_prev=chtmp; if (chtmp_prev) chtmp_prev->fw_next=ftmp; @@ -1071,7 +723,7 @@ static int del_from_chain(struct ip_fw *volatile*chainptr, struct ip_fw *frwl) if ( ftmp == NULL ) { -#ifdef DEBUG_CONFIG_IP_FIREWALL +#ifdef DEBUG_IP_FIREWALL printk("ip_fw_ctl: chain is empty\n"); #endif restore_flags(flags); @@ -1081,10 +733,10 @@ static int del_from_chain(struct ip_fw *volatile*chainptr, struct ip_fw *frwl) ltmp=NULL; was_found=0; - while( ftmp != NULL ) + while( !was_found && ftmp != NULL ) { matches=1; - if (ftmp->fw_src.s_addr!=frwl->fw_src.s_addr + if (ftmp->fw_src.s_addr!=frwl->fw_src.s_addr || ftmp->fw_dst.s_addr!=frwl->fw_dst.s_addr || ftmp->fw_smsk.s_addr!=frwl->fw_smsk.s_addr || ftmp->fw_dmsk.s_addr!=frwl->fw_dmsk.s_addr @@ -1102,6 +754,8 @@ static int del_from_chain(struct ip_fw *volatile*chainptr, struct ip_fw *frwl) if (ftmp->fw_pts[tmpnum]!=frwl->fw_pts[tmpnum]) matches=0; } + if (strncmp(ftmp->fw_vianame, frwl->fw_vianame, IFNAMSIZ)) + matches=0; if(matches) { was_found=1; @@ -1138,7 +792,7 @@ struct ip_fw *check_ipfw_struct(struct ip_fw *frwl, int len) if ( len != sizeof(struct ip_fw) ) { -#ifdef DEBUG_CONFIG_IP_FIREWALL +#ifdef DEBUG_IP_FIREWALL printk("ip_fw_ctl: len=%d, want %d\n",len, sizeof(struct ip_fw)); #endif return(NULL); @@ -1146,16 +800,34 @@ struct 
ip_fw *check_ipfw_struct(struct ip_fw *frwl, int len) if ( (frwl->fw_flg & ~IP_FW_F_MASK) != 0 ) { -#ifdef DEBUG_CONFIG_IP_FIREWALL +#ifdef DEBUG_IP_FIREWALL printk("ip_fw_ctl: undefined flag bits set (flags=%x)\n", frwl->fw_flg); #endif return(NULL); } +#ifndef CONFIG_IP_TRANSPARENT_PROXY + if (frwl->fw_flg & IP_FW_F_REDIR) { +#ifdef DEBUG_IP_FIREWALL + printk("ip_fw_ctl: unsupported flag IP_FW_F_REDIR\n"); +#endif + return(NULL); + } +#endif + +#ifndef CONFIG_IP_MASQUERADE + if (frwl->fw_flg & IP_FW_F_MASQ) { +#ifdef DEBUG_IP_FIREWALL + printk("ip_fw_ctl: unsupported flag IP_FW_F_MASQ\n"); +#endif + return(NULL); + } +#endif + if ( (frwl->fw_flg & IP_FW_F_SRNG) && frwl->fw_nsp < 2 ) { -#ifdef DEBUG_CONFIG_IP_FIREWALL +#ifdef DEBUG_IP_FIREWALL printk("ip_fw_ctl: src range set but fw_nsp=%d\n", frwl->fw_nsp); #endif @@ -1164,16 +836,16 @@ struct ip_fw *check_ipfw_struct(struct ip_fw *frwl, int len) if ( (frwl->fw_flg & IP_FW_F_DRNG) && frwl->fw_ndp < 2 ) { -#ifdef DEBUG_CONFIG_IP_FIREWALL +#ifdef DEBUG_IP_FIREWALL printk("ip_fw_ctl: dst range set but fw_ndp=%d\n", frwl->fw_ndp); #endif return(NULL); } - if ( frwl->fw_nsp + frwl->fw_ndp > IP_FW_MAX_PORTS ) + if ( frwl->fw_nsp + frwl->fw_ndp > (frwl->fw_flg & IP_FW_F_REDIR ? 
IP_FW_MAX_PORTS - 1 : IP_FW_MAX_PORTS) ) { -#ifdef DEBUG_CONFIG_IP_FIREWALL +#ifdef DEBUG_IP_FIREWALL printk("ip_fw_ctl: too many ports (%d+%d)\n", frwl->fw_nsp,frwl->fw_ndp); #endif @@ -1188,14 +860,6 @@ struct ip_fw *check_ipfw_struct(struct ip_fw *frwl, int len) #ifdef CONFIG_IP_ACCT -#if 0 -void ip_acct_cnt(struct iphdr *iph, struct device *dev, struct ip_fw *f) -{ - (void) ip_fw_chk(iph, dev, f, 0, 1); - return; -} -#endif - int ip_acct_ctl(int stage, void *m, int len) { if ( stage == IP_ACCT_FLUSH ) @@ -1208,9 +872,8 @@ int ip_acct_ctl(int stage, void *m, int len) zero_fw_chain(ip_acct_chain); return(0); } - if ( stage == IP_ACCT_ADD - || stage == IP_ACCT_DEL - ) + if ( stage == IP_ACCT_INSERT || stage == IP_ACCT_APPEND || + stage == IP_ACCT_DELETE ) { struct ip_fw *frwl; @@ -1219,21 +882,23 @@ int ip_acct_ctl(int stage, void *m, int len) switch (stage) { - case IP_ACCT_ADD: - return( add_to_chain(&ip_acct_chain,frwl)); - case IP_ACCT_DEL: + case IP_ACCT_INSERT: + return( insert_in_chain(&ip_acct_chain,frwl,len)); + case IP_ACCT_APPEND: + return( append_to_chain(&ip_acct_chain,frwl,len)); + case IP_ACCT_DELETE: return( del_from_chain(&ip_acct_chain,frwl)); default: /* * Should be panic but... (Why ??? 
- AC) */ -#ifdef DEBUG_CONFIG_IP_FIREWALL +#ifdef DEBUG_IP_FIREWALL printk("ip_acct_ctl: unknown request %d\n",stage); #endif return(EINVAL); } } -#ifdef DEBUG_CONFIG_IP_FIREWALL +#ifdef DEBUG_IP_FIREWALL printk("ip_acct_ctl: unknown request %d\n",stage); #endif return(EINVAL); @@ -1243,53 +908,41 @@ int ip_acct_ctl(int stage, void *m, int len) #ifdef CONFIG_IP_FIREWALL int ip_fw_ctl(int stage, void *m, int len) { - int ret; + int cmd, fwtype; - if ( stage == IP_FW_FLUSH_BLK ) - { - free_fw_chain(&ip_fw_blk_chain); - return(0); - } + cmd = stage & IP_FW_COMMAND; + fwtype = (stage & IP_FW_TYPE) >> IP_FW_SHIFT; - if ( stage == IP_FW_FLUSH_FWD ) + if ( cmd == IP_FW_FLUSH ) { - free_fw_chain(&ip_fw_fwd_chain); + free_fw_chain(chains[fwtype]); return(0); } - if ( stage == IP_FW_ZERO_BLK ) + if ( cmd == IP_FW_ZERO ) { - zero_fw_chain(ip_fw_blk_chain); + zero_fw_chain(*chains[fwtype]); return(0); } - if ( stage == IP_FW_ZERO_FWD ) - { - zero_fw_chain(ip_fw_fwd_chain); - return(0); - } - - if ( stage == IP_FW_POLICY_BLK || stage == IP_FW_POLICY_FWD ) + if ( cmd == IP_FW_POLICY ) { int *tmp_policy_ptr; tmp_policy_ptr=(int *)m; - if ( stage == IP_FW_POLICY_BLK ) - ip_fw_blk_policy=*tmp_policy_ptr; - else - ip_fw_fwd_policy=*tmp_policy_ptr; + *policies[fwtype] = *tmp_policy_ptr; return 0; } - if ( stage == IP_FW_CHK_BLK || stage == IP_FW_CHK_FWD ) + if ( cmd == IP_FW_CHECK ) { - struct device viadev; + struct device *viadev; struct ip_fwpkt *ipfwp; struct iphdr *ip; - if ( len < sizeof(struct ip_fwpkt) ) + if ( len != sizeof(struct ip_fwpkt) ) { -#ifdef DEBUG_CONFIG_IP_FIREWALL - printf("ip_fw_ctl: length=%d, expected %d\n", +#ifdef DEBUG_IP_FIREWALL + printk("ip_fw_ctl: length=%d, expected %d\n", len, sizeof(struct ip_fwpkt)); #endif return( EINVAL ); @@ -1298,28 +951,77 @@ int ip_fw_ctl(int stage, void *m, int len) ipfwp = (struct ip_fwpkt *)m; ip = &(ipfwp->fwp_iph); - if ( ip->ihl != sizeof(struct iphdr) / sizeof(int)) - { -#ifdef DEBUG_CONFIG_IP_FIREWALL + if ( !(viadev 
= dev_get(ipfwp->fwp_vianame)) ) { +#ifdef DEBUG_IP_FIREWALL + printk("ip_fw_ctl: invalid device \"%s\"\n", ipfwp->fwp_vianame); +#endif + return(EINVAL); + } else if ( viadev->pa_addr != ipfwp->fwp_via.s_addr ) { +#ifdef DEBUG_IP_FIREWALL + printk("ip_fw_ctl: device \"%s\" has another IP address\n", + ipfwp->fwp_vianame); +#endif + return(EINVAL); + } else if ( ip->ihl != sizeof(struct iphdr) / sizeof(int)) { +#ifdef DEBUG_IP_FIREWALL printk("ip_fw_ctl: ip->ihl=%d, want %d\n",ip->ihl, sizeof(struct iphdr)/sizeof(int)); #endif return(EINVAL); } - viadev.pa_addr = ipfwp->fwp_via.s_addr; - - if ((ret = ip_fw_chk(ip, &viadev, - stage == IP_FW_CHK_BLK ? - ip_fw_blk_chain : ip_fw_fwd_chain, - stage == IP_FW_CHK_BLK ? - ip_fw_blk_policy : ip_fw_fwd_policy, 2 )) > 0 - ) - return(0); - else if (ret == -1) - return(ECONNREFUSED); - else - return(ETIMEDOUT); + switch (ip_fw_chk(ip, viadev, NULL, *chains[fwtype], + *policies[fwtype], IP_FW_MODE_CHK)) + { + case FW_ACCEPT: + return(0); + case FW_REDIRECT: + return(ECONNABORTED); + case FW_MASQUERADE: + return(ECONNRESET); + case FW_REJECT: + return(ECONNREFUSED); + default: /* FW_BLOCK */ + return(ETIMEDOUT); + } + } + + if ( cmd == IP_FW_MASQ_TIMEOUTS ) + { +#ifdef CONFIG_IP_MASQUERADE + struct ip_fw_masq *masq; + + if ( len != sizeof(struct ip_fw_masq) ) + { +#ifdef DEBUG_IP_FIREWALL + printk("ip_fw_ctl (masq): length %d, expected %d\n", + len, sizeof(struct ip_fw_masq)); + +#endif + return( EINVAL ); + } + + masq = (struct ip_fw_masq *) m; + + if (masq->tcp_timeout) + { + ip_masq_expire->tcp_timeout = masq->tcp_timeout; + } + + if (masq->tcp_fin_timeout) + { + ip_masq_expire->tcp_fin_timeout = masq->tcp_fin_timeout; + } + + if (masq->udp_timeout) + { + ip_masq_expire->udp_timeout = masq->udp_timeout; + } + + return 0; +#else + return( EINVAL ); +#endif } /* @@ -1327,37 +1029,36 @@ int ip_fw_ctl(int stage, void *m, int len) * to blocking/forwarding chains or deleting 'em */ - if ( stage == IP_FW_ADD_BLK || stage == 
IP_FW_ADD_FWD - || stage == IP_FW_DEL_BLK || stage == IP_FW_DEL_FWD - ) + if ( cmd == IP_FW_INSERT || cmd == IP_FW_APPEND || cmd == IP_FW_DELETE ) { struct ip_fw *frwl; + int fwtype; + frwl=check_ipfw_struct(m,len); if (frwl==NULL) return (EINVAL); + fwtype = (stage & IP_FW_TYPE) >> IP_FW_SHIFT; - switch (stage) + switch (cmd) { - case IP_FW_ADD_BLK: - return(add_to_chain(&ip_fw_blk_chain,frwl)); - case IP_FW_ADD_FWD: - return(add_to_chain(&ip_fw_fwd_chain,frwl)); - case IP_FW_DEL_BLK: - return(del_from_chain(&ip_fw_blk_chain,frwl)); - case IP_FW_DEL_FWD: - return(del_from_chain(&ip_fw_fwd_chain,frwl)); + case IP_FW_INSERT: + return(insert_in_chain(chains[fwtype],frwl,len)); + case IP_FW_APPEND: + return(append_to_chain(chains[fwtype],frwl,len)); + case IP_FW_DELETE: + return(del_from_chain(chains[fwtype],frwl)); default: /* * Should be panic but... (Why are BSD people panic obsessed ??) */ -#ifdef DEBUG_CONFIG_IP_FIREWALL +#ifdef DEBUG_IP_FIREWALL printk("ip_fw_ctl: unknown request %d\n",stage); #endif return(EINVAL); } } -#ifdef DEBUG_CONFIG_IP_FIREWALL +#ifdef DEBUG_IP_FIREWALL printk("ip_fw_ctl: unknown request %d\n",stage); #endif return(EINVAL); @@ -1367,30 +1068,36 @@ int ip_fw_ctl(int stage, void *m, int len) #if defined(CONFIG_IP_FIREWALL) || defined(CONFIG_IP_ACCT) static int ip_chain_procinfo(int stage, char *buffer, char **start, - off_t offset, int length, int reset) + off_t offset, int length, int reset) { off_t pos=0, begin=0; struct ip_fw *i; unsigned long flags; int len, p; + int last_len = 0; switch(stage) { #ifdef CONFIG_IP_FIREWALL - case IP_INFO_BLK: - i = ip_fw_blk_chain; - len=sprintf(buffer, "IP firewall block rules, default %d\n", - ip_fw_blk_policy); + case IP_FW_IN: + i = ip_fw_in_chain; + len=sprintf(buffer, "IP firewall input rules, default %d\n", + ip_fw_in_policy); break; - case IP_INFO_FWD: + case IP_FW_OUT: + i = ip_fw_out_chain; + len=sprintf(buffer, "IP firewall output rules, default %d\n", + ip_fw_out_policy); + break; + case 
IP_FW_FWD: i = ip_fw_fwd_chain; len=sprintf(buffer, "IP firewall forward rules, default %d\n", ip_fw_fwd_policy); break; #endif #ifdef CONFIG_IP_ACCT - case IP_INFO_ACCT: + case IP_FW_ACCT: i = ip_acct_chain; len=sprintf(buffer,"IP accounting rules\n"); break; @@ -1407,14 +1114,16 @@ static int ip_chain_procinfo(int stage, char *buffer, char **start, while(i!=NULL) { - len+=sprintf(buffer+len,"%08lX/%08lX->%08lX/%08lX %08lX %X ", + len+=sprintf(buffer+len,"%08lX/%08lX->%08lX/%08lX %.16s %08lX %X ", ntohl(i->fw_src.s_addr),ntohl(i->fw_smsk.s_addr), ntohl(i->fw_dst.s_addr),ntohl(i->fw_dmsk.s_addr), + (i->fw_vianame)[0] ? i->fw_vianame : "-", ntohl(i->fw_via.s_addr),i->fw_flg); len+=sprintf(buffer+len,"%u %u %-9lu %-9lu", i->fw_nsp,i->fw_ndp, i->fw_pcnt,i->fw_bcnt); for (p = 0; p < IP_FW_MAX_PORTS; p++) len+=sprintf(buffer+len, " %u", i->fw_pts[p]); + len+=sprintf(buffer+len, " A%02X X%02X", i->fw_tosand, i->fw_tosxor); buffer[len++]='\n'; buffer[len]='\0'; pos=begin+len; @@ -1423,14 +1132,18 @@ static int ip_chain_procinfo(int stage, char *buffer, char **start, len=0; begin=pos; } + else if(pos>offset+length) + { + len = last_len; + break; + } else if(reset) { /* This needs to be done at this specific place! 
*/ i->fw_pcnt=0L; i->fw_bcnt=0L; } - if(pos>offset+length) - break; + last_len = len; i=i->fw_next; } restore_flags(flags); @@ -1444,71 +1157,175 @@ static int ip_chain_procinfo(int stage, char *buffer, char **start, #ifdef CONFIG_IP_ACCT -int ip_acct_procinfo(char *buffer, char **start, off_t offset, int length, int reset) +static int ip_acct_procinfo(char *buffer, char **start, off_t offset, + int length, int reset) { - return ip_chain_procinfo(IP_INFO_ACCT, buffer,start,offset,length,reset); + return ip_chain_procinfo(IP_FW_ACCT, buffer,start, offset,length, + reset); } #endif #ifdef CONFIG_IP_FIREWALL -int ip_fw_blk_procinfo(char *buffer, char **start, off_t offset, int length, int reset) +static int ip_fw_in_procinfo(char *buffer, char **start, off_t offset, + int length, int reset) +{ + return ip_chain_procinfo(IP_FW_IN, buffer,start,offset,length, + reset); +} + +static int ip_fw_out_procinfo(char *buffer, char **start, off_t offset, + int length, int reset) { - return ip_chain_procinfo(IP_INFO_BLK, buffer,start,offset,length,reset); + return ip_chain_procinfo(IP_FW_OUT, buffer,start,offset,length, + reset); } -int ip_fw_fwd_procinfo(char *buffer, char **start, off_t offset, int length, int reset) +static int ip_fw_fwd_procinfo(char *buffer, char **start, off_t offset, + int length, int reset) { - return ip_chain_procinfo(IP_INFO_FWD, buffer,start,offset,length,reset); + return ip_chain_procinfo(IP_FW_FWD, buffer,start,offset,length, + reset); } #endif -#ifdef CONFIG_IP_MASQUERADE -int ip_msqhst_procinfo(char *buffer, char **start, off_t offset, int length) +#ifdef CONFIG_IP_FIREWALL +/* + * Interface to the generic firewall chains. 
+ */ + +int ipfw_input_check(struct firewall_ops *this, int pf, struct device *dev, void *phdr, void *arg) { - off_t pos=0, begin=0; - struct ip_masq *ms; + return ip_fw_chk(phdr, dev, arg, ip_fw_in_chain, ip_fw_in_policy, IP_FW_MODE_FW); +} + +int ipfw_output_check(struct firewall_ops *this, int pf, struct device *dev, void *phdr, void *arg) +{ + return ip_fw_chk(phdr, dev, arg, ip_fw_out_chain, ip_fw_out_policy, IP_FW_MODE_FW); +} + +int ipfw_forward_check(struct firewall_ops *this, int pf, struct device *dev, void *phdr, void *arg) +{ + return ip_fw_chk(phdr, dev, arg, ip_fw_fwd_chain, ip_fw_fwd_policy, IP_FW_MODE_FW); +} + +struct firewall_ops ipfw_ops= +{ + NULL, + ipfw_forward_check, + ipfw_input_check, + ipfw_output_check, + PF_INET, + 0 /* We don't even allow a fall through so we are last */ +}; + +#endif + +#if defined(CONFIG_IP_ACCT) || defined(CONFIG_IP_FIREWALL) + +int ipfw_device_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct device *dev=ptr; + char *devname = dev->name; unsigned long flags; - int len=0; - - len=sprintf(buffer,"Prc FromIP FPrt ToIP TPrt Masq Init-seq Delta Expires\n"); + struct ip_fw *fw; + int chn; + save_flags(flags); cli(); - ms=ip_msq_hosts; - while (ms!=NULL) - { - int timer_active = del_timer(&ms->timer); - if (!timer_active) - ms->timer.expires = 0; - len+=sprintf(buffer+len,"%s %08lX:%04X %08lX:%04X %04X %08lX %5d %lu\n", - strProt[ms->protocol==IPPROTO_TCP], - ntohl(ms->src),ntohs(ms->sport), - ntohl(ms->dst),ntohs(ms->dport), - ntohs(ms->mport), - ms->init_seq,ms->delta,ms->timer.expires); - if (timer_active) - add_timer(&ms->timer); - - pos=begin+len; - if(pos<offset) - { - len=0; - begin=pos; - } - if(pos>offset+length) - break; - ms=ms->next; + if (event == NETDEV_UP) { + for (chn = 0; chn < IP_FW_CHAINS; chn++) + for (fw = *chains[chn]; fw; fw = fw->fw_next) + if ((fw->fw_vianame)[0] && !strncmp(devname, + fw->fw_vianame, IFNAMSIZ)) + fw->fw_viadev = dev; + } else if (event == NETDEV_DOWN) 
{ + for (chn = 0; chn < IP_FW_CHAINS; chn++) + for (fw = *chains[chn]; fw; fw = fw->fw_next) + /* we could compare just the pointers ... */ + if ((fw->fw_vianame)[0] && !strncmp(devname, + fw->fw_vianame, IFNAMSIZ)) + fw->fw_viadev = (struct device *) -1; } + restore_flags(flags); - *start=buffer+(offset-begin); - len-=(offset-begin); - if(len>length) - len=length; - return len; + return NOTIFY_DONE; } - + +static struct notifier_block ipfw_dev_notifier={ + ipfw_device_event, + NULL, + 0 +}; + +#endif + +#ifdef CONFIG_PROC_FS +#ifdef CONFIG_IP_ACCT +static struct proc_dir_entry proc_net_ipacct = { + PROC_NET_IPACCT, 7, "ip_acct", + S_IFREG | S_IRUGO | S_IWUSR, 1, 0, 0, + 0, &proc_net_inode_operations, + ip_acct_procinfo +}; +#endif +#endif + +#ifdef CONFIG_IP_FIREWALL +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry proc_net_ipfwin = { + PROC_NET_IPFWIN, 8, "ip_input", + S_IFREG | S_IRUGO | S_IWUSR, 1, 0, 0, + 0, &proc_net_inode_operations, + ip_fw_in_procinfo +}; +static struct proc_dir_entry proc_net_ipfwout = { + PROC_NET_IPFWOUT, 9, "ip_output", + S_IFREG | S_IRUGO | S_IWUSR, 1, 0, 0, + 0, &proc_net_inode_operations, + ip_fw_out_procinfo +}; +static struct proc_dir_entry proc_net_ipfwfwd = { + PROC_NET_IPFWFWD, 10, "ip_forward", + S_IFREG | S_IRUGO | S_IWUSR, 1, 0, 0, + 0, &proc_net_inode_operations, + ip_fw_fwd_procinfo +}; +#endif #endif +void ip_fw_init(void) +{ +#ifdef CONFIG_PROC_FS +#ifdef CONFIG_IP_ACCT + proc_net_register(&proc_net_ipacct); +#endif +#endif +#ifdef CONFIG_IP_FIREWALL + + if(register_firewall(PF_INET,&ipfw_ops)<0) + panic("Unable to register IP firewall.\n"); + +#ifdef CONFIG_PROC_FS + proc_net_register(&proc_net_ipfwin); + proc_net_register(&proc_net_ipfwout); + proc_net_register(&proc_net_ipfwfwd); +#endif +#endif +#ifdef CONFIG_IP_MASQUERADE + + /* + * Initialize masquerading. 
+ */ + + ip_masq_init(); +#endif + +#if defined(CONFIG_IP_ACCT) || defined(CONFIG_IP_FIREWALL) + /* Register for device up/down reports */ + register_netdevice_notifier(&ipfw_dev_notifier); +#endif +} diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c new file mode 100644 index 000000000..3e286c4d7 --- /dev/null +++ b/net/ipv4/ip_input.c @@ -0,0 +1,737 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The Internet Protocol (IP) module. + * + * Version: @(#)ip.c 1.0.16b 9/1/93 + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Donald Becker, <becker@super.org> + * Alan Cox, <Alan.Cox@linux.org> + * Richard Underwood + * Stefan Becker, <stefanb@yello.ping.de> + * Jorge Cwik, <jorge@laser.satlink.net> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * + * + * Fixes: + * Alan Cox : Commented a couple of minor bits of surplus code + * Alan Cox : Undefining IP_FORWARD doesn't include the code + * (just stops a compiler warning). + * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes + * are junked rather than corrupting things. + * Alan Cox : Frames to bad broadcast subnets are dumped + * We used to process them non broadcast and + * boy could that cause havoc. + * Alan Cox : ip_forward sets the free flag on the + * new frame it queues. Still crap because + * it copies the frame but at least it + * doesn't eat memory too. + * Alan Cox : Generic queue code and memory fixes. + * Fred Van Kempen : IP fragment support (borrowed from NET2E) + * Gerhard Koerting: Forward fragmented frames correctly. + * Gerhard Koerting: Fixes to my fix of the above 8-). + * Gerhard Koerting: IP interface addressing fix. 
+ * Linus Torvalds : More robustness checks + * Alan Cox : Even more checks: Still not as robust as it ought to be + * Alan Cox : Save IP header pointer for later + * Alan Cox : ip option setting + * Alan Cox : Use ip_tos/ip_ttl settings + * Alan Cox : Fragmentation bogosity removed + * (Thanks to Mark.Bush@prg.ox.ac.uk) + * Dmitry Gorodchanin : Send of a raw packet crash fix. + * Alan Cox : Silly ip bug when an overlength + * fragment turns up. Now frees the + * queue. + * Linus Torvalds/ : Memory leakage on fragmentation + * Alan Cox : handling. + * Gerhard Koerting: Forwarding uses IP priority hints + * Teemu Rantanen : Fragment problems. + * Alan Cox : General cleanup, comments and reformat + * Alan Cox : SNMP statistics + * Alan Cox : BSD address rule semantics. Also see + * UDP as there is a nasty checksum issue + * if you do things the wrong way. + * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file + * Alan Cox : IP options adjust sk->priority. + * Pedro Roque : Fix mtu/length error in ip_forward. + * Alan Cox : Avoid ip_chk_addr when possible. + * Richard Underwood : IP multicasting. + * Alan Cox : Cleaned up multicast handlers. + * Alan Cox : RAW sockets demultiplex in the BSD style. + * Gunther Mayer : Fix the SNMP reporting typo + * Alan Cox : Always in group 224.0.0.1 + * Pauline Middelink : Fast ip_checksum update when forwarding + * Masquerading support. + * Alan Cox : Multicast loopback error for 224.0.0.1 + * Alan Cox : IP_MULTICAST_LOOP option. + * Alan Cox : Use notifiers. + * Bjorn Ekwall : Removed ip_csum (from slhc.c too) + * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!) + * Stefan Becker : Send out ICMP HOST REDIRECT + * Arnt Gulbrandsen : ip_build_xmit + * Alan Cox : Per socket routing cache + * Alan Cox : Fixed routing cache, added header cache. + * Alan Cox : Loopback didn't work right in original ip_build_xmit - fixed it. + * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net. 
+ * Alan Cox : Incoming IP option handling. + * Alan Cox : Set saddr on raw output frames as per BSD. + * Alan Cox : Stopped broadcast source route explosions. + * Alan Cox : Can disable source routing + * Takeshi Sone : Masquerading didn't work. + * Dave Bonn,Alan Cox : Faster IP forwarding whenever possible. + * Alan Cox : Memory leaks, tramples, misc debugging. + * Alan Cox : Fixed multicast (by popular demand 8)) + * Alan Cox : Fixed forwarding (by even more popular demand 8)) + * Alan Cox : Fixed SNMP statistics [I think] + * Gerhard Koerting : IP fragmentation forwarding fix + * Alan Cox : Device lock against page fault. + * Alan Cox : IP_HDRINCL facility. + * Werner Almesberger : Zero fragment bug + * Alan Cox : RAW IP frame length bug + * Alan Cox : Outgoing firewall on build_xmit + * A.N.Kuznetsov : IP_OPTIONS support throughout the kernel + * Alan Cox : Multicast routing hooks + * Jos Vos : Do accounting *before* call_in_firewall + * Willy Konynenberg : Transparent proxying support + * + * + * + * To Fix: + * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient + * and could be made very efficient with the addition of some virtual memory hacks to permit + * the allocation of a buffer that can then be 'grown' by twiddling page tables. + * Output fragmentation wants updating along with the buffer management to use a single + * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet + * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause + * fragmentation anyway. + * + * FIXME: copy frag 0 iph to qp->iph + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/config.h> + +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> + +#include <net/snmp.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/route.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/arp.h> +#include <net/icmp.h> +#include <net/raw.h> +#include <net/checksum.h> +#include <linux/igmp.h> +#include <linux/ip_fw.h> +#ifdef CONFIG_IP_MASQUERADE +#include <net/ip_masq.h> +#endif +#include <linux/firewall.h> +#include <linux/mroute.h> +#include <net/netlink.h> +#ifdef CONFIG_NET_ALIAS +#include <linux/net_alias.h> +#endif + +extern int last_retran; +extern void sort_send(struct sock *sk); + +#define min(a,b) ((a)<(b)?(a):(b)) + +/* + * SNMP management statistics + */ + +#ifdef CONFIG_IP_FORWARD +struct ip_mib ip_statistics={1,64,}; /* Forwarding=Yes, Default TTL=64 */ +#else +struct ip_mib ip_statistics={2,64,}; /* Forwarding=No, Default TTL=64 */ +#endif + +/* + * Handle the issuing of an ioctl() request + * for the ip device. This is scheduled to + * disappear + */ + +int ip_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + switch(cmd) + { + default: + return(-EINVAL); + } +} + +#ifdef CONFIG_IP_TRANSPARENT_PROXY +/* + * Check the packet against our socket administration to see + * if it is related to a connection on our system. + * Needed for transparent proxying. 
+ */ + +int ip_chksock(struct sk_buff *skb) +{ + switch (skb->h.iph->protocol) { + case IPPROTO_ICMP: + return icmp_chkaddr(skb); + case IPPROTO_TCP: + return tcp_chkaddr(skb); + case IPPROTO_UDP: + return udp_chkaddr(skb); + default: + return 0; + } +} +#endif + + +/* + * This function receives all incoming IP datagrams. + * + * On entry skb->data points to the start of the IP header and + * the MAC header has been removed. + */ + +int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) +{ + struct iphdr *iph = skb->h.iph; + struct sock *raw_sk=NULL; + unsigned char hash; + unsigned char flag = 0; + struct inet_protocol *ipprot; + int brd=IS_MYADDR; + struct options * opt = NULL; + int is_frag=0; + __u32 daddr; + +#ifdef CONFIG_FIREWALL + int fwres; + __u16 rport; +#endif +#ifdef CONFIG_IP_MROUTE + int mroute_pkt=0; +#endif + +#ifdef CONFIG_NET_IPV6 + /* + * Intercept IPv6 frames. We dump ST-II and invalid types just below.. + */ + + if(iph->version == 6) + return ipv6_rcv(skb,dev,pt); +#endif + + ip_statistics.IpInReceives++; + + /* + * Account for the packet (even if the packet is + * not accepted by the firewall!). + */ + +#ifdef CONFIG_IP_ACCT + ip_fw_chk(iph,dev,NULL,ip_acct_chain,0,IP_FW_MODE_ACCT_IN); +#endif + + /* + * Tag the ip header of this packet so we can find it + */ + + skb->ip_hdr = iph; + + /* + * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum. + * RFC1122: 3.1.2.3 MUST discard a frame with invalid source address [NEEDS FIXING]. + * + * Is the datagram acceptable? + * + * 1. Length at least the size of an ip header + * 2. Version of 4 + * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums] + * 4. Doesn't have a bogus length + * (5. We ought to check for IP multicast addresses and undefined types.. does this matter ?) 
+ */ + + if (skb->len<sizeof(struct iphdr) || iph->ihl<5 || iph->version != 4 || ip_fast_csum((unsigned char *)iph, iph->ihl) !=0 + || skb->len < ntohs(iph->tot_len)) + { + ip_statistics.IpInHdrErrors++; + kfree_skb(skb, FREE_WRITE); + return(0); + } + + /* + * Our transport medium may have padded the buffer out. Now we know it + * is IP we can trim to the true length of the frame. + * Note this now means skb->len holds ntohs(iph->tot_len). + */ + + skb_trim(skb,ntohs(iph->tot_len)); + + /* + * Try to select closest <src,dst> alias device, if any. + * net_alias_dev_rcv_sel32 returns main device if it + * fails to found other. + */ + +#ifdef CONFIG_NET_ALIAS + if (iph->daddr != skb->dev->pa_addr && net_alias_has(skb->dev)) + skb->dev = dev = net_alias_dev_rcv_sel32(skb->dev, AF_INET, iph->saddr, iph->daddr); +#endif + + if (iph->ihl > 5) + { + skb->ip_summed = 0; + if (ip_options_compile(NULL, skb)) + return(0); + opt = (struct options*)skb->proto_priv; +#ifdef CONFIG_IP_NOSR + if (opt->srr) + { + kfree_skb(skb, FREE_READ); + return -EINVAL; + } +#endif + } + +#if defined(CONFIG_IP_TRANSPARENT_PROXY) && !defined(CONFIG_IP_ALWAYS_DEFRAG) +#define CONFIG_IP_ALWAYS_DEFRAG 1 +#endif +#ifdef CONFIG_IP_ALWAYS_DEFRAG + /* + * Defragment all incoming traffic before even looking at it. + * If you have forwarding enabled, this makes the system a + * defragmenting router. Not a common thing. + * You probably DON'T want to enable this unless you have to. + * You NEED to use this if you want to use transparent proxying, + * otherwise, we can't vouch for your sanity. + */ + + /* + * See if the frame is fragmented. + */ + + if(iph->frag_off) + { + if (iph->frag_off & htons(IP_MF)) + is_frag|=IPFWD_FRAGMENT; + /* + * Last fragment ? + */ + + if (iph->frag_off & htons(IP_OFFSET)) + is_frag|=IPFWD_LASTFRAG; + + /* + * Reassemble IP fragments. + */ + + if(is_frag) + { + /* Defragment. 
Obtain the complete packet if there is one */ + skb=ip_defrag(iph,skb,dev); + if(skb==NULL) + return 0; + skb->dev = dev; + iph=skb->h.iph; + is_frag = 0; + /* + * When the reassembled packet gets forwarded, the ip + * header checksum should be correct. + * For better performance, this should actually only + * be done in that particular case, i.e. set a flag + * here and calculate the checksum in ip_forward. + */ + ip_send_check(iph); + } + } + +#endif + /* + * See if the firewall wants to dispose of the packet. + */ + +#ifdef CONFIG_FIREWALL + + if ((fwres=call_in_firewall(PF_INET, skb->dev, iph, &rport))<FW_ACCEPT) + { + if(fwres==FW_REJECT) + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0, dev); + kfree_skb(skb, FREE_WRITE); + return 0; + } + +#ifdef CONFIG_IP_TRANSPARENT_PROXY + if (fwres==FW_REDIRECT) + skb->redirport = rport; + else +#endif + skb->redirport = 0; +#endif + +#ifndef CONFIG_IP_ALWAYS_DEFRAG + /* + * Remember if the frame is fragmented. + */ + + if(iph->frag_off) + { + if (iph->frag_off & htons(IP_MF)) + is_frag|=IPFWD_FRAGMENT; + /* + * Last fragment ? + */ + + if (iph->frag_off & htons(IP_OFFSET)) + is_frag|=IPFWD_LASTFRAG; + } + +#endif + /* + * Do any IP forwarding required. chk_addr() is expensive -- avoid it someday. + * + * This is inefficient. While finding out if it is for us we could also compute + * the routing table entry. This is where the great unified cache theory comes + * in as and when someone implements it + * + * For most hosts over 99% of packets match the first conditional + * and don't go via ip_chk_addr. Note: brd is set to IS_MYADDR at + * function entry. + */ + daddr = iph->daddr; +#ifdef CONFIG_IP_TRANSPARENT_PROXY + /* + * ip_chksock adds still more overhead for forwarded traffic... 
+ */ + if ( iph->daddr == skb->dev->pa_addr || skb->redirport || (brd = ip_chk_addr(iph->daddr)) != 0 || ip_chksock(skb)) +#else + if ( iph->daddr == skb->dev->pa_addr || (brd = ip_chk_addr(iph->daddr)) != 0) +#endif + { + if (opt && opt->srr) + { + int srrspace, srrptr; + __u32 nexthop; + unsigned char * optptr = ((unsigned char *)iph) + opt->srr; + + if (brd != IS_MYADDR || skb->pkt_type != PACKET_HOST) + { + kfree_skb(skb, FREE_WRITE); + return 0; + } + + for ( srrptr=optptr[2], srrspace = optptr[1]; + srrptr <= srrspace; + srrptr += 4 + ) + { + int brd2; + if (srrptr + 3 > srrspace) + { + icmp_send(skb, ICMP_PARAMETERPROB, 0, opt->srr+2, + skb->dev); + kfree_skb(skb, FREE_WRITE); + return 0; + } + memcpy(&nexthop, &optptr[srrptr-1], 4); + if ((brd2 = ip_chk_addr(nexthop)) == 0) + break; + if (brd2 != IS_MYADDR) + { + + /* + * ANK: should we implement weak tunneling of multicasts? + * Are they obsolete? DVMRP specs (RFC-1075) is old enough... + * [They are obsolete] + */ + kfree_skb(skb, FREE_WRITE); + return -EINVAL; + } + memcpy(&daddr, &optptr[srrptr-1], 4); + } + if (srrptr <= srrspace) + { + opt->srr_is_hit = 1; + opt->is_changed = 1; +#ifdef CONFIG_IP_FORWARD + if (ip_forward(skb, dev, is_frag, nexthop)) + kfree_skb(skb, FREE_WRITE); +#else + ip_statistics.IpInAddrErrors++; + kfree_skb(skb, FREE_WRITE); +#endif + return 0; + } + } + +#ifdef CONFIG_IP_MULTICAST + if(!(dev->flags&IFF_ALLMULTI) && brd==IS_MULTICAST && iph->daddr!=IGMP_ALL_HOSTS && !(dev->flags&IFF_LOOPBACK)) + { + /* + * Check it is for one of our groups + */ + struct ip_mc_list *ip_mc=dev->ip_mc_list; + do + { + if(ip_mc==NULL) + { + kfree_skb(skb, FREE_WRITE); + return 0; + } + if(ip_mc->multiaddr==iph->daddr) + break; + ip_mc=ip_mc->next; + } + while(1); + } +#endif + +#ifndef CONFIG_IP_ALWAYS_DEFRAG + /* + * Reassemble IP fragments. + */ + + if(is_frag) + { + /* Defragment. 
Obtain the complete packet if there is one */ + skb=ip_defrag(iph,skb,dev); + if(skb==NULL) + return 0; + skb->dev = dev; + iph=skb->h.iph; + } + +#endif + +#ifdef CONFIG_IP_MASQUERADE + /* + * Do we need to de-masquerade this packet? + */ + { + int ret = ip_fw_demasquerade(&skb,dev); + if (ret < 0) { + kfree_skb(skb, FREE_WRITE); + return 0; + } + + if (ret) + { + struct iphdr *iph=skb->h.iph; + if (ip_forward(skb, dev, IPFWD_MASQUERADED, iph->daddr)) + kfree_skb(skb, FREE_WRITE); + return 0; + } + } +#endif + + /* + * Point into the IP datagram, just past the header. + */ + + skb->ip_hdr = iph; + skb->h.raw += iph->ihl*4; + +#ifdef CONFIG_IP_MROUTE + /* + * Check the state on multicast routing (multicast and not 224.0.0.z) + */ + + if(brd==IS_MULTICAST && (iph->daddr&htonl(0xFFFFFF00))!=htonl(0xE0000000)) + mroute_pkt=1; + +#endif + /* + * Deliver to raw sockets. This is fun as to avoid copies we want to make no surplus copies. + * + * RFC 1122: SHOULD pass TOS value up to the transport layer. + */ + + hash = iph->protocol & (SOCK_ARRAY_SIZE-1); + + /* + * If there maybe a raw socket we must check - if not we don't care less + */ + + if((raw_sk=raw_prot.sock_array[hash])!=NULL) + { + struct sock *sknext=NULL; + struct sk_buff *skb1; + raw_sk=get_sock_raw(raw_sk, iph->protocol, iph->saddr, iph->daddr); + if(raw_sk) /* Any raw sockets */ + { + do + { + /* Find the next */ + sknext=get_sock_raw(raw_sk->next, iph->protocol, iph->saddr, iph->daddr); + if(sknext) + skb1=skb_clone(skb, GFP_ATOMIC); + else + break; /* One pending raw socket left */ + if(skb1) + raw_rcv(raw_sk, skb1, dev, iph->saddr,daddr); + raw_sk=sknext; + } + while(raw_sk!=NULL); + + /* + * Here either raw_sk is the last raw socket, or NULL if none + */ + + /* + * We deliver to the last raw socket AFTER the protocol checks as it avoids a surplus copy + */ + } + } + + /* + * skb->h.raw now points at the protocol beyond the IP header. 
+ */ + + hash = iph->protocol & (MAX_INET_PROTOS -1); + for (ipprot = (struct inet_protocol *)inet_protos[hash];ipprot != NULL;ipprot=(struct inet_protocol *)ipprot->next) + { + struct sk_buff *skb2; + + if (ipprot->protocol != iph->protocol) + continue; + /* + * See if we need to make a copy of it. This will + * only be set if more than one protocol wants it. + * and then not for the last one. If there is a pending + * raw delivery wait for that + */ + +#ifdef CONFIG_IP_MROUTE + if (ipprot->copy || raw_sk || mroute_pkt) +#else + if (ipprot->copy || raw_sk) +#endif + { + skb2 = skb_clone(skb, GFP_ATOMIC); + if(skb2==NULL) + continue; + } + else + { + skb2 = skb; + } + flag = 1; + + /* + * Pass on the datagram to each protocol that wants it, + * based on the datagram protocol. We should really + * check the protocol handler's return values here... + */ + + ipprot->handler(skb2, dev, opt, daddr, + (ntohs(iph->tot_len) - (iph->ihl * 4)), + iph->saddr, 0, ipprot); + } + + /* + * All protocols checked. + * If this packet was a broadcast, we may *not* reply to it, since that + * causes (proven, grin) ARP storms and a leakage of memory (i.e. all + * ICMP reply messages get queued up for transmission...) + */ + +#ifdef CONFIG_IP_MROUTE + /* + * Forward the last copy to the multicast router. If + * there is a pending raw delivery however make a copy + * and forward that. + */ + + if(mroute_pkt) + { + flag=1; + if(raw_sk==NULL) + ipmr_forward(skb, is_frag); + else + { + struct sk_buff *skb2=skb_clone(skb, GFP_ATOMIC); + if(skb2) + { + skb2->free=1; + ipmr_forward(skb2, is_frag); + } + } + } +#endif + + if(raw_sk!=NULL) /* Shift to last raw user */ + raw_rcv(raw_sk, skb, dev, iph->saddr, daddr); + else if (!flag) /* Free and report errors */ + { + if (brd != IS_BROADCAST && brd!=IS_MULTICAST) + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0, dev); + kfree_skb(skb, FREE_WRITE); + } + + return(0); + } + + /* + * Do any unicast IP forwarding required. 
+ */ + + /* + * Don't forward multicast or broadcast frames. + */ + + if(skb->pkt_type!=PACKET_HOST || brd==IS_BROADCAST) + { + kfree_skb(skb,FREE_WRITE); + return 0; + } + + /* + * The packet is for another target. Forward the frame + */ + +#ifdef CONFIG_IP_FORWARD + if (opt && opt->is_strictroute) + { + icmp_send(skb, ICMP_PARAMETERPROB, 0, 16, skb->dev); + kfree_skb(skb, FREE_WRITE); + return -1; + } + if (ip_forward(skb, dev, is_frag, iph->daddr)) + kfree_skb(skb, FREE_WRITE); +#else +/* printk("Machine %lx tried to use us as a forwarder to %lx but we have forwarding disabled!\n", + iph->saddr,iph->daddr);*/ + ip_statistics.IpInAddrErrors++; + kfree_skb(skb, FREE_WRITE); +#endif + return(0); +} + + diff --git a/net/ipv4/ip_masq.c b/net/ipv4/ip_masq.c new file mode 100644 index 000000000..67e10979f --- /dev/null +++ b/net/ipv4/ip_masq.c @@ -0,0 +1,1023 @@ +/* + * + * Masquerading functionality + * + * Copyright (c) 1994 Pauline Middelink + * + * See ip_fw.c for original log + * + * Fixes: + * Juan Jose Ciarlante : Modularized application masquerading (see ip_masq_app.c) + * Juan Jose Ciarlante : New struct ip_masq_seq that holds output/input delta seq. + * Juan Jose Ciarlante : Added hashed lookup by proto,maddr,mport and proto,saddr,sport + * Juan Jose Ciarlante : Fixed deadlock if free ports get exhausted + * Juan Jose Ciarlante : Added NO_ADDR status flag. 
+ * Nigel Metheringham : Added ICMP handling for demasquerade + * Nigel Metheringham : Checksum checking of masqueraded data + * Nigel Metheringham : Better handling of timeouts of TCP conns + * + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <asm/system.h> +#include <linux/stat.h> +#include <linux/proc_fs.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/inet.h> +#include <net/protocol.h> +#include <net/icmp.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/checksum.h> +#include <net/ip_masq.h> + +#define IP_MASQ_TAB_SIZE 256 /* must be power of 2 */ + +/* + * Implement IP packet masquerading + */ + +static const char *strProt[] = {"UDP","TCP"}; + +static __inline__ const char * masq_proto_name(unsigned proto) +{ + return strProt[proto==IPPROTO_TCP]; +} + +/* + * Last masq_port number in use. + * Will cycle in MASQ_PORT boundaries. + */ +static __u16 masq_port = PORT_MASQ_BEGIN; + +/* + * free ports counters (UDP & TCP) + * + * Their value is _less_ or _equal_ to actual free ports: + * same masq port, diff masq addr (firewall iface address) allocated + * entries are accounted but their actually don't eat a more than 1 port. + * + * Greater values could lower MASQ_EXPIRATION setting as a way to + * manage 'masq_entries resource'. + * + */ + +int ip_masq_free_ports[2] = { + PORT_MASQ_END - PORT_MASQ_BEGIN, /* UDP */ + PORT_MASQ_END - PORT_MASQ_BEGIN /* TCP */ +}; + +static struct symbol_table ip_masq_syms = { +#include <linux/symtab_begin.h> + X(ip_masq_new), + X(ip_masq_set_expire), + X(ip_masq_free_ports), + X(ip_masq_expire), + X(ip_masq_out_get_2), +#include <linux/symtab_end.h> +}; + +/* + * 2 ip_masq hash tables: for input and output pkts lookups. 
+ */ + +struct ip_masq *ip_masq_m_tab[IP_MASQ_TAB_SIZE]; +struct ip_masq *ip_masq_s_tab[IP_MASQ_TAB_SIZE]; + +/* + * timeouts + */ + +static struct ip_fw_masq ip_masq_dummy = { + MASQUERADE_EXPIRE_TCP, + MASQUERADE_EXPIRE_TCP_FIN, + MASQUERADE_EXPIRE_UDP +}; + +struct ip_fw_masq *ip_masq_expire = &ip_masq_dummy; + +/* + * Returns hash value + */ + +static __inline__ unsigned + +ip_masq_hash_key(unsigned proto, __u32 addr, __u16 port) +{ + return (proto^ntohl(addr)^ntohs(port)) & (IP_MASQ_TAB_SIZE-1); +} + +/* + * Hashes ip_masq by its proto,addrs,ports. + * should be called with masked interrupts. + * returns bool success. + */ + +static __inline__ int +ip_masq_hash(struct ip_masq *ms) +{ + unsigned hash; + + if (ms->flags & IP_MASQ_F_HASHED) { + printk("ip_masq_hash(): request for already hashed\n"); + return 0; + } + /* + * Hash by proto,m{addr,port} + */ + hash = ip_masq_hash_key(ms->protocol, ms->maddr, ms->mport); + ms->m_link = ip_masq_m_tab[hash]; + ip_masq_m_tab[hash] = ms; + + /* + * Hash by proto,s{addr,port} + */ + hash = ip_masq_hash_key(ms->protocol, ms->saddr, ms->sport); + ms->s_link = ip_masq_s_tab[hash]; + ip_masq_s_tab[hash] = ms; + + + ms->flags |= IP_MASQ_F_HASHED; + return 1; +} + +/* + * UNhashes ip_masq from ip_masq_[ms]_tables. + * should be called with masked interrupts. + * returns bool success. 
+ */ + +static __inline__ int ip_masq_unhash(struct ip_masq *ms) +{ + unsigned hash; + struct ip_masq ** ms_p; + if (!(ms->flags & IP_MASQ_F_HASHED)) { + printk("ip_masq_unhash(): request for unhash flagged\n"); + return 0; + } + /* + * UNhash by m{addr,port} + */ + hash = ip_masq_hash_key(ms->protocol, ms->maddr, ms->mport); + for (ms_p = &ip_masq_m_tab[hash]; *ms_p ; ms_p = &(*ms_p)->m_link) + if (ms == (*ms_p)) { + *ms_p = ms->m_link; + break; + } + /* + * UNhash by s{addr,port} + */ + hash = ip_masq_hash_key(ms->protocol, ms->saddr, ms->sport); + for (ms_p = &ip_masq_s_tab[hash]; *ms_p ; ms_p = &(*ms_p)->s_link) + if (ms == (*ms_p)) { + *ms_p = ms->s_link; + break; + } + + ms->flags &= ~IP_MASQ_F_HASHED; + return 1; +} + +/* + * Returns ip_masq associated with addresses found in iph. + * called for pkts coming from outside-to-INside the firewall + * + * NB. Cannot check destination address, just for the incoming port. + * reason: archie.doc.ac.uk has 6 interfaces, you send to + * phoenix and get a reply from any other interface(==dst)! + * + * [Only for UDP] - AC + */ + +struct ip_masq * +ip_masq_in_get(struct iphdr *iph) +{ + __u16 *portptr; + int protocol; + __u32 s_addr, d_addr; + __u16 s_port, d_port; + + portptr = (__u16 *)&(((char *)iph)[iph->ihl*4]); + protocol = iph->protocol; + s_addr = iph->saddr; + s_port = portptr[0]; + d_addr = iph->daddr; + d_port = portptr[1]; + + return ip_masq_in_get_2(protocol, s_addr, s_port, d_addr, d_port); +} + +/* + * Returns ip_masq associated with supplied parameters, either + * broken out of the ip/tcp headers or directly supplied for those + * pathological protocols with address/port in the data stream + * (ftp, irc). addresses and ports are in network order. + * called for pkts coming from INside-to-outside the firewall. + * + * NB. Cannot check destination address, just for the incoming port. + * reason: archie.doc.ac.uk has 6 interfaces, you send to + * phoenix and get a reply from any other interface(==dst)! 
+ * + * [Only for UDP] - AC + */ + +struct ip_masq * +ip_masq_in_get_2(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) +{ + unsigned hash; + struct ip_masq *ms; + + hash = ip_masq_hash_key(protocol, d_addr, d_port); + for(ms = ip_masq_m_tab[hash]; ms ; ms = ms->m_link) { + if ( protocol==ms->protocol && + (s_addr==ms->daddr || ms->flags & IP_MASQ_F_NO_DADDR) && + (s_port==ms->dport || ms->flags & IP_MASQ_F_NO_DPORT) && + (d_addr==ms->maddr && d_port==ms->mport)) + return ms; + } + return NULL; +} + +/* + * Returns ip_masq associated with addresses found in iph. + * called for pkts coming from inside-to-OUTside the firewall. + */ + +struct ip_masq * +ip_masq_out_get(struct iphdr *iph) +{ + __u16 *portptr; + int protocol; + __u32 s_addr, d_addr; + __u16 s_port, d_port; + + portptr = (__u16 *)&(((char *)iph)[iph->ihl*4]); + protocol = iph->protocol; + s_addr = iph->saddr; + s_port = portptr[0]; + d_addr = iph->daddr; + d_port = portptr[1]; + + return ip_masq_out_get_2(protocol, s_addr, s_port, d_addr, d_port); +} + +/* + * Returns ip_masq associated with supplied parameters, either + * broken out of the ip/tcp headers or directly supplied for those + * pathological protocols with address/port in the data stream + * (ftp, irc). addresses and ports are in network order. + * called for pkts coming from inside-to-OUTside the firewall. + */ + +struct ip_masq * +ip_masq_out_get_2(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) +{ + unsigned hash; + struct ip_masq *ms; + + hash = ip_masq_hash_key(protocol, s_addr, s_port); + for(ms = ip_masq_s_tab[hash]; ms ; ms = ms->s_link) { + if (protocol == ms->protocol && + s_addr == ms->saddr && s_port == ms->sport && + d_addr == ms->daddr && d_port == ms->dport ) + return ms; + } + + return NULL; +} + +/* + * Returns ip_masq for given proto,m_addr,m_port. + * called by allocation routine to find an unused m_port. 
+ */ + +struct ip_masq * +ip_masq_getbym(int protocol, __u32 m_addr, __u16 m_port) +{ + unsigned hash; + struct ip_masq *ms; + + hash = ip_masq_hash_key(protocol, m_addr, m_port); + for(ms = ip_masq_m_tab[hash]; ms ; ms = ms->m_link) { + if ( protocol==ms->protocol && + (m_addr==ms->maddr && m_port==ms->mport)) + return ms; + } + return NULL; +} + +static void masq_expire(unsigned long data) +{ + struct ip_masq *ms = (struct ip_masq *)data; + unsigned long flags; + +#ifdef DEBUG_CONFIG_IP_MASQUERADE + printk("Masqueraded %s %lX:%X expired\n", + masq_proto_name(ms->protocol), + ntohl(ms->saddr),ntohs(ms->sport)); +#endif + + save_flags(flags); + cli(); + + if (ip_masq_unhash(ms)) { + ip_masq_free_ports[ms->protocol==IPPROTO_TCP]++; + ip_masq_unbind_app(ms); + kfree_s(ms,sizeof(*ms)); + } + + restore_flags(flags); +} + +/* + * Create a new masquerade list entry, also allocate an + * unused mport, keeping the portnumber between the + * given boundaries MASQ_BEGIN and MASQ_END. + */ + +struct ip_masq * ip_masq_new(struct device *dev, int proto, __u32 saddr, __u16 sport, __u32 daddr, __u16 dport, unsigned mflags) +{ + struct ip_masq *ms, *mst; + int ports_tried, *free_ports_p; + unsigned long flags; + static int n_fails = 0; + + free_ports_p = &ip_masq_free_ports[proto==IPPROTO_TCP]; + + if (*free_ports_p == 0) { + if (++n_fails < 5) + printk("ip_masq_new(proto=%s): no free ports.\n", + masq_proto_name(proto)); + return NULL; + } + ms = (struct ip_masq *) kmalloc(sizeof(struct ip_masq), GFP_ATOMIC); + if (ms == NULL) { + if (++n_fails < 5) + printk("ip_masq_new(proto=%s): no memory available.\n", + masq_proto_name(proto)); + return NULL; + } + memset(ms, 0, sizeof(*ms)); + init_timer(&ms->timer); + ms->timer.data = (unsigned long)ms; + ms->timer.function = masq_expire; + ms->protocol = proto; + ms->saddr = saddr; + ms->sport = sport; + ms->daddr = daddr; + ms->dport = dport; + ms->flags = mflags; + ms->app_data = NULL; + + if (proto == IPPROTO_UDP) + ms->flags |= 
IP_MASQ_F_NO_DADDR; + + /* get masq address from rif */ + ms->maddr = dev->pa_addr; + + for (ports_tried = 0; ports_tried < *free_ports_p; ports_tried++){ + save_flags(flags); + cli(); + + /* + * Try the next available port number + */ + + ms->mport = htons(masq_port++); + if (masq_port==PORT_MASQ_END) masq_port = PORT_MASQ_BEGIN; + + restore_flags(flags); + + /* + * lookup to find out if this port is used. + */ + + mst = ip_masq_getbym(proto, ms->maddr, ms->mport); + if (mst == NULL) { + save_flags(flags); + cli(); + + if (*free_ports_p == 0) { + restore_flags(flags); + break; + } + (*free_ports_p)--; + ip_masq_hash(ms); + + restore_flags(flags); + + ip_masq_bind_app(ms); + n_fails = 0; + return ms; + } + } + + if (++n_fails < 5) + printk("ip_masq_new(proto=%s): could not get free masq entry (free=%d).\n", + masq_proto_name(ms->protocol), *free_ports_p); + kfree_s(ms, sizeof(*ms)); + return NULL; +} + +/* + * Set masq expiration (deletion) and adds timer, + * if timeout==0 cancel expiration. + * Warning: it does not check/delete previous timer! + */ + +void ip_masq_set_expire(struct ip_masq *ms, unsigned long tout) +{ + if (tout) { + ms->timer.expires = jiffies+tout; + add_timer(&ms->timer); + } else { + del_timer(&ms->timer); + } +} + +static void recalc_check(struct udphdr *uh, __u32 saddr, + __u32 daddr, int len) +{ + uh->check=0; + uh->check=csum_tcpudp_magic(saddr,daddr,len, + IPPROTO_UDP, csum_partial((char *)uh,len,0)); + if(uh->check==0) + uh->check=0xFFFF; +} + +int ip_fw_masquerade(struct sk_buff **skb_ptr, struct device *dev) +{ + struct sk_buff *skb=*skb_ptr; + struct iphdr *iph = skb->h.iph; + __u16 *portptr; + struct ip_masq *ms; + int size; + unsigned long timeout; + + /* + * We can only masquerade protocols with ports... 
+ * [TODO] + * We may need to consider masq-ing some ICMP related to masq-ed protocols + */ + + if (iph->protocol!=IPPROTO_UDP && iph->protocol!=IPPROTO_TCP) + return -1; + + /* + * Now hunt the list to see if we have an old entry + */ + + portptr = (__u16 *)&(((char *)iph)[iph->ihl*4]); +#ifdef DEBUG_CONFIG_IP_MASQUERADE + printk("Outgoing %s %lX:%X -> %lX:%X\n", + masq_proto_name(iph->protocol), + ntohl(iph->saddr), ntohs(portptr[0]), + ntohl(iph->daddr), ntohs(portptr[1])); +#endif + + ms = ip_masq_out_get(iph); + if (ms!=NULL) + ip_masq_set_expire(ms,0); + + /* + * Nope, not found, create a new entry for it + */ + + if (ms==NULL) + { + ms = ip_masq_new(dev, iph->protocol, + iph->saddr, portptr[0], + iph->daddr, portptr[1], + 0); + if (ms == NULL) + return -1; + } + + /* + * Change the fragments origin + */ + + size = skb->len - ((unsigned char *)portptr - skb->h.raw); + /* + * Set iph addr and port from ip_masq obj. + */ + iph->saddr = ms->maddr; + portptr[0] = ms->mport; + + /* + * Attempt ip_masq_app call. + * will fix ip_masq and iph seq stuff + */ + if (ip_masq_app_pkt_out(ms, skb_ptr, dev) != 0) + { + /* + * skb has possibly changed, update pointers. + */ + skb = *skb_ptr; + iph = skb->h.iph; + portptr = (__u16 *)&(((char *)iph)[iph->ihl*4]); + size = skb->len - ((unsigned char *)portptr-skb->h.raw); + } + + /* + * Adjust packet accordingly to protocol + */ + + if (iph->protocol==IPPROTO_UDP) + { + timeout = ip_masq_expire->udp_timeout; + recalc_check((struct udphdr *)portptr,iph->saddr,iph->daddr,size); + } + else + { + struct tcphdr *th; + th = (struct tcphdr *)portptr; + + /* Set the flags up correctly... */ + if (th->fin) + { + ms->flags |= IP_MASQ_F_SAW_FIN_OUT; + } + + if (th->rst) + { + ms->flags |= IP_MASQ_F_SAW_RST; + } + + /* + * Timeout depends if FIN packet has been seen + * Very short timeout if RST packet seen. 
+ */ + if (ms->flags & IP_MASQ_F_SAW_RST) + { + timeout = 1; + } + else if ((ms->flags & IP_MASQ_F_SAW_FIN) == IP_MASQ_F_SAW_FIN) + { + timeout = ip_masq_expire->tcp_fin_timeout; + } + else timeout = ip_masq_expire->tcp_timeout; + + skb->csum = csum_partial((void *)(th + 1), size - sizeof(*th), 0); + tcp_v4_check(th, size, iph->saddr, iph->daddr, + skb->csum); + } + ip_masq_set_expire(ms, timeout); + ip_send_check(iph); + + #ifdef DEBUG_CONFIG_IP_MASQUERADE + printk("O-routed from %lX:%X over %s\n",ntohl(ms->maddr),ntohs(ms->mport),dev->name); + #endif + + return 0; + } + + +/* + * Handle ICMP messages in forward direction. + * Find any that might be relevant, check against existing connections, + * forward to masqueraded host if relevant. + * Currently handles error types - unreachable, quench, ttl exceeded + */ + +int ip_fw_masq_icmp(struct sk_buff **skb_p, struct device *dev) +{ + struct sk_buff *skb = *skb_p; + struct iphdr *iph = skb->h.iph; + struct icmphdr *icmph = (struct icmphdr *)((char *)iph + (iph->ihl<<2)); + struct iphdr *ciph; /* The ip header contained within the ICMP */ + __u16 *pptr; /* port numbers from TCP/UDP contained header */ + struct ip_masq *ms; + unsigned short len = ntohs(iph->tot_len) - (iph->ihl * 4); + +#ifdef DEBUG_CONFIG_IP_MASQUERADE + printk("Incoming forward ICMP (%d) %lX -> %lX\n", + icmph->type, + ntohl(iph->saddr), ntohl(iph->daddr)); +#endif + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that + * means easy things are checked first to speed up + * processing.... 
however this means that some + * packets will manage to get a long way down this + * stack and then be rejected, but thats life + */ + if ((icmph->type != ICMP_DEST_UNREACH) && + (icmph->type != ICMP_SOURCE_QUENCH) && + (icmph->type != ICMP_TIME_EXCEEDED)) + return 0; + + /* Now find the contained IP header */ + ciph = (struct iphdr *) (icmph + 1); + + /* We are only interested ICMPs generated from TCP or UDP packets */ + if ((ciph->protocol != IPPROTO_UDP) && (ciph->protocol != IPPROTO_TCP)) + return 0; + + /* + * Find the ports involved - this packet was + * incoming so the ports are right way round + * (but reversed relative to outer IP header!) + */ + pptr = (__u16 *)&(((char *)ciph)[ciph->ihl*4]); + if (ntohs(pptr[1]) < PORT_MASQ_BEGIN || + ntohs(pptr[1]) > PORT_MASQ_END) + return 0; + + /* Ensure the checksum is correct */ + if (ip_compute_csum((unsigned char *) icmph, len)) + { + /* Failed checksum! */ + printk(KERN_INFO "MASQ: forward ICMP: failed checksum from %s!\n", + in_ntoa(iph->saddr)); + return(-1); + } + +#ifdef DEBUG_CONFIG_IP_MASQUERADE + printk("Handling forward ICMP for %lX:%X -> %lX:%X\n", + ntohl(ciph->saddr), ntohs(pptr[0]), + ntohl(ciph->daddr), ntohs(pptr[1])); +#endif + + /* This is pretty much what ip_masq_in_get() does */ + ms = ip_masq_in_get_2(ciph->protocol, ciph->saddr, pptr[0], ciph->daddr, pptr[1]); + + if (ms == NULL) + return 0; + + /* Now we do real damage to this packet...! 
*/ + /* First change the source IP address, and recalc checksum */ + iph->saddr = ms->maddr; + ip_send_check(iph); + + /* Now change the *dest* address in the contained IP */ + ciph->daddr = ms->maddr; + ip_send_check(ciph); + + /* the TCP/UDP dest port - cannot redo check */ + pptr[1] = ms->mport; + + /* And finally the ICMP checksum */ + icmph->checksum = 0; + icmph->checksum = ip_compute_csum((unsigned char *) icmph, len); + +#ifdef DEBUG_CONFIG_IP_MASQUERADE + printk("Rewrote forward ICMP to %lX:%X -> %lX:%X\n", + ntohl(ciph->saddr), ntohs(pptr[0]), + ntohl(ciph->daddr), ntohs(pptr[1])); +#endif + + return 1; +} + +/* + * Handle ICMP messages in reverse (demasquerade) direction. + * Find any that might be relevant, check against existing connections, + * forward to masqueraded host if relevant. + * Currently handles error types - unreachable, quench, ttl exceeded + */ + +int ip_fw_demasq_icmp(struct sk_buff **skb_p, struct device *dev) +{ + struct sk_buff *skb = *skb_p; + struct iphdr *iph = skb->h.iph; + struct icmphdr *icmph = (struct icmphdr *)((char *)iph + (iph->ihl<<2)); + struct iphdr *ciph; /* The ip header contained within the ICMP */ + __u16 *pptr; /* port numbers from TCP/UDP contained header */ + struct ip_masq *ms; + unsigned short len = ntohs(iph->tot_len) - (iph->ihl * 4); + +#ifdef DEBUG_CONFIG_IP_MASQUERADE + printk("Incoming reverse ICMP (%d) %lX -> %lX\n", + icmph->type, + ntohl(iph->saddr), ntohl(iph->daddr)); +#endif + + if ((icmph->type != ICMP_DEST_UNREACH) && + (icmph->type != ICMP_SOURCE_QUENCH) && + (icmph->type != ICMP_TIME_EXCEEDED)) + return 0; + + /* Now find the contained IP header */ + ciph = (struct iphdr *) (icmph + 1); + + /* We are only interested ICMPs generated from TCP or UDP packets */ + if ((ciph->protocol != IPPROTO_UDP) && (ciph->protocol != IPPROTO_TCP)) + return 0; + + /* + * Find the ports involved - remember this packet was + * *outgoing* so the ports are reversed (and addresses) + */ + pptr = (__u16 *)&(((char 
*)ciph)[ciph->ihl*4]); + if (ntohs(pptr[0]) < PORT_MASQ_BEGIN || + ntohs(pptr[0]) > PORT_MASQ_END) + return 0; + + /* Ensure the checksum is correct */ + if (ip_compute_csum((unsigned char *) icmph, len)) + { + /* Failed checksum! */ + printk(KERN_INFO "MASQ: reverse ICMP: failed checksum from %s!\n", + in_ntoa(iph->saddr)); + return(-1); + } + +#ifdef DEBUG_CONFIG_IP_MASQUERADE + printk("Handling reverse ICMP for %lX:%X -> %lX:%X\n", + ntohl(ciph->saddr), ntohs(pptr[0]), + ntohl(ciph->daddr), ntohs(pptr[1])); +#endif + + /* This is pretty much what ip_masq_in_get() does, except params are wrong way round */ + ms = ip_masq_in_get_2(ciph->protocol, ciph->daddr, pptr[1], ciph->saddr, pptr[0]); + + if (ms == NULL) + return 0; + + /* Now we do real damage to this packet...! */ + /* First change the dest IP address, and recalc checksum */ + iph->daddr = ms->saddr; + ip_send_check(iph); + + /* Now change the *source* address in the contained IP */ + ciph->saddr = ms->saddr; + ip_send_check(ciph); + + /* the TCP/UDP source port - cannot redo check */ + pptr[0] = ms->sport; + + /* And finally the ICMP checksum */ + icmph->checksum = 0; + icmph->checksum = ip_compute_csum((unsigned char *) icmph, len); + +#ifdef DEBUG_CONFIG_IP_MASQUERADE + printk("Rewrote reverse ICMP to %lX:%X -> %lX:%X\n", + ntohl(ciph->saddr), ntohs(pptr[0]), + ntohl(ciph->daddr), ntohs(pptr[1])); +#endif + + return 1; +} + + + /* + * Check if it's an masqueraded port, look it up, + * and send it on its way... + * + * Better not have many hosts using the designated portrange + * as 'normal' ports, or you'll be spending many time in + * this function. 
+ */ + +int ip_fw_demasquerade(struct sk_buff **skb_p, struct device *dev) +{ + struct sk_buff *skb = *skb_p; + struct iphdr *iph = skb->h.iph; + __u16 *portptr; + struct ip_masq *ms; + unsigned short len; + unsigned long timeout; + + switch (iph->protocol) { + case IPPROTO_ICMP: + return(ip_fw_demasq_icmp(skb_p, dev)); + case IPPROTO_TCP: + case IPPROTO_UDP: + /* Make sure packet is in the masq range */ + portptr = (__u16 *)&(((char *)iph)[iph->ihl*4]); + if (ntohs(portptr[1]) < PORT_MASQ_BEGIN || + ntohs(portptr[1]) > PORT_MASQ_END) + return 0; + /* Check that the checksum is OK */ + len = ntohs(iph->tot_len) - (iph->ihl * 4); + if ((iph->protocol == IPPROTO_UDP) && (portptr[3] == 0)) + /* No UDP checksum */ + break; + + switch (skb->ip_summed) + { + case CHECKSUM_NONE: + skb->csum = csum_partial((char *)portptr, len, 0); + case CHECKSUM_HW: + if (csum_tcpudp_magic(iph->saddr, iph->daddr, len, + iph->protocol, skb->csum)) + { + printk(KERN_INFO "MASQ: failed TCP/UDP checksum from %s!\n", + in_ntoa(iph->saddr)); + return -1; + } + default: + /* CHECKSUM_UNNECESSARY */ + } + break; + default: + return 0; + } + + +#ifdef DEBUG_CONFIG_IP_MASQUERADE + printk("Incoming %s %lX:%X -> %lX:%X\n", + masq_proto_name(iph->protocol), + ntohl(iph->saddr), ntohs(portptr[0]), + ntohl(iph->daddr), ntohs(portptr[1])); +#endif + /* + * reroute to original host:port if found... + */ + + ms = ip_masq_in_get(iph); + + if (ms != NULL) + { + /* Stop the timer ticking.... */ + ip_masq_set_expire(ms,0); + + /* + * Set dport if not defined yet. 
+ */ + + if ( ms->flags & IP_MASQ_F_NO_DPORT && ms->protocol == IPPROTO_TCP ) { + ms->flags &= ~IP_MASQ_F_NO_DPORT; + ms->dport = portptr[0]; +#if DEBUG_CONFIG_IP_MASQUERADE + printk("ip_fw_demasquerade(): filled dport=%d\n", + ntohs(ms->dport)); +#endif + } + if (ms->flags & IP_MASQ_F_NO_DADDR && ms->protocol == IPPROTO_TCP) { + ms->flags &= ~IP_MASQ_F_NO_DADDR; + ms->daddr = iph->saddr; +#if DEBUG_CONFIG_IP_MASQUERADE + printk("ip_fw_demasquerade(): filled daddr=%X\n", + ntohs(ms->daddr)); +#endif + } + iph->daddr = ms->saddr; + portptr[1] = ms->sport; + + /* + * Attempt ip_masq_app call. + * will fix ip_masq and iph ack_seq stuff + */ + + if (ip_masq_app_pkt_in(ms, skb_p, dev) != 0) + { + /* + * skb has changed, update pointers. + */ + + skb = *skb_p; + iph = skb->h.iph; + portptr = (__u16 *)&(((char *)iph)[iph->ihl*4]); + len = ntohs(iph->tot_len) - (iph->ihl * 4); + } + + /* + * Yug! adjust UDP/TCP and IP checksums, also update + * timeouts. + * If a TCP RST is seen collapse the tunnel (by using short timeout)! 
+ */ + if (iph->protocol==IPPROTO_UDP) + { + recalc_check((struct udphdr *)portptr,iph->saddr,iph->daddr,len); + timeout = ip_masq_expire->udp_timeout; + } + else + { + struct tcphdr *th; + skb->csum = csum_partial((void *)(((struct tcphdr *)portptr) + 1), + len - sizeof(struct tcphdr), 0); + th = (struct tcphdr *) portptr; + th->check = 0; + + tcp_v4_check(th, len, iph->saddr, iph->daddr, + skb->csum); + + /* Check if TCP FIN or RST */ + + if (th->fin) + { + ms->flags |= IP_MASQ_F_SAW_FIN_IN; + } + if (th->rst) + { + ms->flags |= IP_MASQ_F_SAW_RST; + } + + /* Now set the timeouts */ + if (ms->flags & IP_MASQ_F_SAW_RST) + { + timeout = 1; + } + else if ((ms->flags & IP_MASQ_F_SAW_FIN) == IP_MASQ_F_SAW_FIN) + { + timeout = ip_masq_expire->tcp_fin_timeout; + } + else timeout = ip_masq_expire->tcp_timeout; + } + ip_masq_set_expire(ms, timeout); + ip_send_check(iph); +#ifdef DEBUG_CONFIG_IP_MASQUERADE + printk("I-routed to %lX:%X\n",ntohl(iph->daddr),ntohs(portptr[1])); +#endif + return 1; + } + + /* sorry, all this trouble for a no-hit :) */ + return 0; +} + +/* + * /proc/net entry + */ + +static int ip_msqhst_procinfo(char *buffer, char **start, off_t offset, + int length, int unused) +{ + off_t pos=0, begin; + struct ip_masq *ms; + unsigned long flags; + char temp[129]; + int idx = 0; + int len=0; + + if (offset < 128) + { + sprintf(temp, + "Prc FromIP FPrt ToIP TPrt Masq Init-seq Delta PDelta Expires (free=%d,%d)", + ip_masq_free_ports[0], ip_masq_free_ports[1]); + len = sprintf(buffer, "%-127s\n", temp); + } + pos = 128; + save_flags(flags); + cli(); + + for(idx = 0; idx < IP_MASQ_TAB_SIZE; idx++) + for(ms = ip_masq_m_tab[idx]; ms ; ms = ms->m_link) + { + int timer_active; + pos += 128; + if (pos <= offset) + continue; + + timer_active = del_timer(&ms->timer); + if (!timer_active) + ms->timer.expires = jiffies; + sprintf(temp,"%s %08lX:%04X %08lX:%04X %04X %08X %6d %6d %7lu", + masq_proto_name(ms->protocol), + ntohl(ms->saddr), ntohs(ms->sport), + 
ntohl(ms->daddr), ntohs(ms->dport), + ntohs(ms->mport), + ms->out_seq.init_seq, + ms->out_seq.delta, + ms->out_seq.previous_delta, + ms->timer.expires-jiffies); + if (timer_active) + add_timer(&ms->timer); + len += sprintf(buffer+len, "%-127s\n", temp); + + if(len >= length) + goto done; + } +done: + restore_flags(flags); + begin = len - (pos - offset); + *start = buffer + begin; + len -= begin; + if(len>length) + len = length; + return len; +} + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry proc_net_ipmsqhst = { + PROC_NET_IPMSQHST, 13, "ip_masquerade", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + ip_msqhst_procinfo +}; +#endif + +/* + * Initialize ip masquerading + */ +int ip_masq_init(void) +{ + register_symtab (&ip_masq_syms); +#ifdef CONFIG_PROC_FS + proc_net_register(&proc_net_ipmsqhst); +#endif + ip_masq_app_init(); + + return 0; +} diff --git a/net/ipv4/ip_masq_app.c b/net/ipv4/ip_masq_app.c new file mode 100644 index 000000000..abe65987e --- /dev/null +++ b/net/ipv4/ip_masq_app.c @@ -0,0 +1,609 @@ +/* + * IP_MASQ_APP application masquerading module + * + * + * Version: @(#)ip_masq_app.c 0.04 96/06/17 + * + * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * JJC : Implemented also input pkt hook + * Miquel van Smoorenburg : Copy more stuff when resizing skb + * + * + * FIXME: + * - ip_masq_skb_replace(): use same skb if space available. 
+ * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <asm/system.h> +#include <linux/stat.h> +#include <linux/proc_fs.h> +#include <net/ip_masq.h> + +static const char *strProt[] = {"UDP","TCP"}; + +static __inline__ const char * masq_proto_name(unsigned proto) +{ + return strProt[proto==IPPROTO_TCP]; +} + +#define IP_MASQ_APP_TAB_SIZE 16 /* must be power of 2 */ + +#define IP_MASQ_APP_HASH(proto, port) ((port^proto) & (IP_MASQ_APP_TAB_SIZE-1)) +#define IP_MASQ_APP_TYPE(proto, port) ( proto<<16 | port ) +#define IP_MASQ_APP_PORT(type) ( type & 0xffff ) +#define IP_MASQ_APP_PROTO(type) ( (type>>16) & 0x00ff ) + + +static struct symbol_table ip_masq_app_syms = { +#include <linux/symtab_begin.h> + X(register_ip_masq_app), + X(unregister_ip_masq_app), + X(ip_masq_skb_replace), +#include <linux/symtab_end.h> +}; + +/* + * will hold masq app. hashed list heads + */ + +struct ip_masq_app *ip_masq_app_base[IP_MASQ_APP_TAB_SIZE]; + +/* + * ip_masq_app registration routine + * port: host byte order. + */ + +int register_ip_masq_app(struct ip_masq_app *mapp, unsigned short proto, __u16 port) +{ + unsigned long flags; + unsigned hash; + if (!mapp) { + printk("register_ip_masq_app(): NULL arg\n"); + return -EINVAL; + } + mapp->type = IP_MASQ_APP_TYPE(proto, port); + mapp->n_attach = 0; + hash = IP_MASQ_APP_HASH(proto, port); + + save_flags(flags); + cli(); + mapp->next = ip_masq_app_base[hash]; + ip_masq_app_base[hash] = mapp; + restore_flags(flags); + + return 0; +} + +/* + * ip_masq_app unreg. routine. 
+ */ + +int unregister_ip_masq_app(struct ip_masq_app *mapp) +{ + struct ip_masq_app **mapp_p; + unsigned hash; + unsigned long flags; + if (!mapp) { + printk("unregister_ip_masq_app(): NULL arg\n"); + return -EINVAL; + } + /* + * only allow unregistration if it has no attachments + */ + if (mapp->n_attach) { + printk("unregister_ip_masq_app(): has %d attachments. failed\n", + mapp->n_attach); + return -EINVAL; + } + hash = IP_MASQ_APP_HASH(IP_MASQ_APP_PROTO(mapp->type), IP_MASQ_APP_PORT(mapp->type)); + + save_flags(flags); + cli(); + for (mapp_p = &ip_masq_app_base[hash]; *mapp_p ; mapp_p = &(*mapp_p)->next) + if (mapp == (*mapp_p)) { + *mapp_p = mapp->next; + restore_flags(flags); + return 0; + } + + restore_flags(flags); + printk("unregister_ip_masq_app(proto=%s,port=%u): not hashed!\n", + masq_proto_name(IP_MASQ_APP_PROTO(mapp->type)), IP_MASQ_APP_PORT(mapp->type)); + return -EINVAL; +} + +/* + * get ip_masq_app object by its proto and port (net byte order). + */ + +struct ip_masq_app * ip_masq_app_get(unsigned short proto, __u16 port) +{ + struct ip_masq_app *mapp; + unsigned hash; + unsigned type; + + port = ntohs(port); + type = IP_MASQ_APP_TYPE(proto,port); + hash = IP_MASQ_APP_HASH(proto,port); + for(mapp = ip_masq_app_base[hash]; mapp ; mapp = mapp->next) { + if (type == mapp->type) return mapp; + } + return NULL; +} + +/* + * ip_masq_app object binding related funcs. 
+ */ + +/* + * change ip_masq_app object's number of bindings + */ + +static __inline__ int ip_masq_app_bind_chg(struct ip_masq_app *mapp, int delta) +{ + unsigned long flags; + int n_at; + if (!mapp) return -1; + save_flags(flags); + cli(); + n_at = mapp->n_attach + delta; + if (n_at < 0) { + restore_flags(flags); + printk("ip_masq_app: tried to set n_attach < 0 for (proto=%s,port==%d) ip_masq_app object.\n", + masq_proto_name(IP_MASQ_APP_PROTO(mapp->type)), + IP_MASQ_APP_PORT(mapp->type)); + return -1; + } + mapp->n_attach = n_at; + restore_flags(flags); + return 0; +} + +/* + * Bind ip_masq to its ip_masq_app based on proto and dport ALREADY + * set in ip_masq struct. Also calls constructor. + */ + +struct ip_masq_app * ip_masq_bind_app(struct ip_masq *ms) +{ + struct ip_masq_app * mapp; + mapp = ip_masq_app_get(ms->protocol, ms->dport); + if (mapp != NULL) { + /* + * don't allow binding if already bound + */ + + if (ms->app != NULL) { + printk("ip_masq_bind_app() called for already bound object.\n"); + return ms->app; + } + + ms->app = mapp; + if (mapp->masq_init_1) mapp->masq_init_1(mapp, ms); + ip_masq_app_bind_chg(mapp, +1); + } + return mapp; +} + +/* + * Unbind ms from type object and call ms destructor (does not kfree()). + */ + +int ip_masq_unbind_app(struct ip_masq *ms) +{ + struct ip_masq_app * mapp; + mapp = ms->app; + if (mapp != NULL) { + if (mapp->masq_done_1) mapp->masq_done_1(mapp, ms); + ms->app = NULL; + ip_masq_app_bind_chg(mapp, -1); + } + return (mapp != NULL); +} + +/* + * Fixes th->seq based on ip_masq_seq info. + */ + +static __inline__ void masq_fix_seq(const struct ip_masq_seq *ms_seq, struct tcphdr *th) +{ + __u32 seq; + + seq = ntohl(th->seq); + + /* + * Adjust seq with delta-offset for all packets after + * the most recent resized pkt seq and with previous_delta offset + * for all packets before most recent resized pkt seq. 
+ */ + + if (ms_seq->delta || ms_seq->previous_delta) { + if(after(seq,ms_seq->init_seq) ) { + th->seq = htonl(seq + ms_seq->delta); +#if DEBUG_CONFIG_IP_MASQ_APP + printk("masq_fix_seq() : added delta (%d) to seq\n",ms_seq->delta); +#endif + } else { + th->seq = htonl(seq + ms_seq->previous_delta); +#if DEBUG_CONFIG_IP_MASQ_APP + printk("masq_fix_seq() : added previous_delta (%d) to seq\n",ms_seq->previous_delta); +#endif + } + } + + +} + +/* + * Fixes th->ack_seq based on ip_masq_seq info. + */ + +static __inline__ void masq_fix_ack_seq(const struct ip_masq_seq *ms_seq, struct tcphdr *th) +{ + __u32 ack_seq; + + ack_seq=ntohl(th->ack_seq); + + /* + * Adjust ack_seq with delta-offset for + * the packets AFTER most recent resized pkt has caused a shift + * for packets before most recent resized pkt, use previous_delta + */ + + if (ms_seq->delta || ms_seq->previous_delta) { + if(after(ack_seq,ms_seq->init_seq)) { + th->ack_seq = htonl(ack_seq-ms_seq->delta); +#if DEBUG_CONFIG_IP_MASQ_APP + printk("masq_fix_ack_seq() : subtracted delta (%d) from ack_seq\n",ms_seq->delta); +#endif + } else { + th->ack_seq = htonl(ack_seq-ms_seq->previous_delta); +#if DEBUG_CONFIG_IP_MASQ_APP + printk("masq_fix_ack_seq() : subtracted previous_delta (%d) from ack_seq\n",ms_seq->previous_delta); +#endif + } + } + +} + +/* + * Updates ip_masq_seq if pkt has been resized + * Assumes already checked proto==IPPROTO_TCP and diff!=0. + */ + +static __inline__ void masq_seq_update(struct ip_masq *ms, struct ip_masq_seq *ms_seq, unsigned mflag, __u32 seq, int diff) +{ + /* if (diff == 0) return; */ + + if ( !(ms->flags & mflag) || after(seq, ms_seq->init_seq)) + { + ms_seq->previous_delta=ms_seq->delta; + ms_seq->delta+=diff; + ms_seq->init_seq=seq; + ms->flags |= mflag; + } +} + +/* + * Output pkt hook. Will call bound ip_masq_app specific function + * called by ip_fw_masquerade(), assumes previously checked ms!=NULL + * returns (new - old) skb->len diff. 
+ */ + +int ip_masq_app_pkt_out(struct ip_masq *ms, struct sk_buff **skb_p, struct device *dev) +{ + struct ip_masq_app * mapp; + struct iphdr *iph; + struct tcphdr *th; + int diff; + __u32 seq; + + /* + * check if application masquerading is bound to + * this ip_masq. + * assumes that once an ip_masq is bound, + * it will not be unbound during its life. + */ + + if ( (mapp = ms->app) == NULL) + return 0; + + iph = (*skb_p)->h.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + + /* + * Remember seq number in case this pkt gets resized + */ + + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + + if (ms->protocol == IPPROTO_TCP) { + if (ms->flags & IP_MASQ_F_OUT_SEQ) + masq_fix_seq(&ms->out_seq, th); + if (ms->flags & IP_MASQ_F_IN_SEQ) + masq_fix_ack_seq(&ms->in_seq, th); + } + + /* + * Call private output hook function + */ + + if ( mapp->pkt_out == NULL ) + return 0; + + diff = mapp->pkt_out(mapp, ms, skb_p, dev); + + /* + * Update ip_masq seq stuff if len has changed. + */ + + if (diff != 0 && ms->protocol == IPPROTO_TCP) + masq_seq_update(ms, &ms->out_seq, IP_MASQ_F_OUT_SEQ, seq, diff); + + return diff; +} + +/* + * Input pkt hook. Will call bound ip_masq_app specific function + * called by ip_fw_demasquerade(), assumes previously checked ms!=NULL. + * returns (new - old) skb->len diff. + */ + +int ip_masq_app_pkt_in(struct ip_masq *ms, struct sk_buff **skb_p, struct device *dev) +{ + struct ip_masq_app * mapp; + struct iphdr *iph; + struct tcphdr *th; + int diff; + __u32 seq; + + /* + * check if application masquerading is bound to + * this ip_masq. + * assumes that once an ip_masq is bound, + * it will not be unbound during its life. + */ + + if ( (mapp = ms->app) == NULL) + return 0; + + iph = (*skb_p)->h.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + + /* + * Remember seq number in case this pkt gets resized + */ + + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. 
+ */ + + if (ms->protocol == IPPROTO_TCP) { + if (ms->flags & IP_MASQ_F_IN_SEQ) + masq_fix_seq(&ms->in_seq, th); + if (ms->flags & IP_MASQ_F_OUT_SEQ) + masq_fix_ack_seq(&ms->out_seq, th); + } + + /* + * Call private input hook function + */ + + if ( mapp->pkt_in == NULL ) + return 0; + + diff = mapp->pkt_in(mapp, ms, skb_p, dev); + + /* + * Update ip_masq seq stuff if len has changed. + */ + + if (diff != 0 && ms->protocol == IPPROTO_TCP) + masq_seq_update(ms, &ms->in_seq, IP_MASQ_F_IN_SEQ, seq, diff); + + return diff; +} + +/* + * /proc/ip_masq_app entry function + */ + +int ip_masq_app_getinfo(char *buffer, char **start, off_t offset, int length, int dummy) +{ + off_t pos=0, begin=0; + int len=0; + struct ip_masq_app * mapp; + unsigned idx; + + if (offset < 40) + len=sprintf(buffer,"%-39s\n", "prot port n_attach name"); + pos = 40; + + for (idx=0 ; idx < IP_MASQ_APP_TAB_SIZE; idx++) + for (mapp = ip_masq_app_base[idx]; mapp ; mapp = mapp->next) { + /* + * If you change the length of this sprintf, then all + * the length calculations need fixing too! + * Line length = 40 (3 + 2 + 7 + 1 + 7 + 1 + 2 + 17) + */ + pos += 40; + if (pos < offset) + continue; + + len += sprintf(buffer+len, "%-3s %-7u %-7d %-17s\n", + masq_proto_name(IP_MASQ_APP_PROTO(mapp->type)), + IP_MASQ_APP_PORT(mapp->type), mapp->n_attach, + mapp->name); + + if(len >= length) + goto done; + } +done: + begin = len - (pos - offset); + *start = buffer + begin; + len -= begin; + if (len > length) + len = length; + return len; +} + + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry proc_net_ip_masq_app = { + PROC_NET_IP_MASQ_APP, 11, "ip_masq_app", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + ip_masq_app_getinfo +}; +#endif + +/* + * Initialization routine + */ + +int ip_masq_app_init(void) +{ + + register_symtab (&ip_masq_app_syms); +#ifdef CONFIG_PROC_FS + proc_net_register(&proc_net_ip_masq_app); +#endif + return 0; +} + +/* + * Replace a segment (of skb->data) with a new one. 
+ * FIXME: Should re-use same skb if space available, this could + * be done if n_len < o_len, unless some extra space + * were already allocated at driver level :P . + */ + +static struct sk_buff * skb_replace(struct sk_buff *skb, int pri, char *o_buf, int o_len, char *n_buf, int n_len) +{ + int maxsize, diff, o_offset; + struct sk_buff *n_skb; + int offset; + + maxsize = skb->truesize - sizeof(struct sk_buff); + + diff = n_len - o_len; + o_offset = o_buf - (char*) skb->data; + + if (maxsize <= n_len) { + if (diff != 0) { + memcpy(skb->data + o_offset + n_len,o_buf + o_len, + skb->len - (o_offset + o_len)); + } + + memcpy(skb->data + o_offset, n_buf, n_len); + + n_skb = skb; + skb->len = n_len; + skb->end = skb->head+n_len; + } else { + /* + * Sizes differ, make a copy. + * + * FIXME: move this to core/sbuff.c:skb_grow() + */ + + n_skb = alloc_skb(MAX_HEADER + skb->len + diff, pri); + if (n_skb == NULL) { + printk("skb_replace(): no room left (from %p)\n", + return_address()); + return skb; + + } + n_skb->free = skb->free; + skb_reserve(n_skb, MAX_HEADER); + skb_put(n_skb, skb->len + diff); + + /* + * Copy as much data from the old skb as possible. Even + * though we're only forwarding packets, we need stuff + * like skb->protocol (PPP driver wants it). 
+ */ + offset = n_skb->data - skb->data; + n_skb->h.raw = skb->h.raw + offset; + n_skb->when = skb->when; + n_skb->dev = skb->dev; + n_skb->mac.raw = skb->mac.raw + offset; + n_skb->ip_hdr = (struct iphdr *)(((char *)skb->ip_hdr)+offset); + n_skb->pkt_type = skb->pkt_type; + n_skb->protocol = skb->protocol; + n_skb->ip_summed = skb->ip_summed; + + /* + * Copy pkt in new buffer + */ + + memcpy(n_skb->data, skb->data, o_offset); + memcpy(n_skb->data + o_offset, n_buf, n_len); + memcpy(n_skb->data + o_offset + n_len, o_buf + o_len, + skb->len - (o_offset + o_len) ); + + /* + * Problem, how to replace the new skb with old one, + * preferably inplace + */ + + kfree_skb(skb, FREE_WRITE); + } + return n_skb; +} + +/* + * calls skb_replace() and update ip header if new skb was allocated + */ + +struct sk_buff * ip_masq_skb_replace(struct sk_buff *skb, int pri, char *o_buf, int o_len, char *n_buf, int n_len) +{ + int diff; + struct sk_buff *n_skb; + unsigned skb_len; + + diff = n_len - o_len; + n_skb = skb_replace(skb, pri, o_buf, o_len, n_buf, n_len); + skb_len = skb->len; + + if (diff) + { + struct iphdr *iph; +#if DEBUG_CONFIG_IP_MASQ_APP + printk("masq_skb_replace(): pkt resized for %d bytes (len=%ld)\n", diff, skb->len); +#endif + /* + * update ip header + */ + iph = n_skb->h.iph; + iph->check = 0; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + iph->tot_len = htons(skb_len + diff); + } + return n_skb; +} diff --git a/net/ipv4/ip_masq_ftp.c b/net/ipv4/ip_masq_ftp.c new file mode 100644 index 000000000..75fbb01a7 --- /dev/null +++ b/net/ipv4/ip_masq_ftp.c @@ -0,0 +1,219 @@ +/* + * IP_MASQ_FTP ftp masquerading module + * + * + * Version: @(#)ip_masq_ftp.c 0.01 02/05/96 + * + * Author: Wouter Gadeyne + * + * + * Fixes: + * Wouter Gadeyne : Fixed masquerading support of ftp PORT commands + * Juan Jose Ciarlante : Code moved and adapted from ip_fw.c + * + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of 
the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/module.h> +#include <asm/system.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <net/ip_masq.h> + +#define DEBUG_CONFIG_IP_MASQ_FTP 0 + +static int +masq_ftp_init_1 (struct ip_masq_app *mapp, struct ip_masq *ms) +{ + MOD_INC_USE_COUNT; + return 0; +} + +static int +masq_ftp_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms) +{ + MOD_DEC_USE_COUNT; + return 0; +} + +int +masq_ftp_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, struct device *dev) +{ + struct sk_buff *skb; + struct iphdr *iph; + struct tcphdr *th; + char *p, *data, *data_limit; + unsigned char p1,p2,p3,p4,p5,p6; + __u32 from; + __u16 port; + struct ip_masq *n_ms; + char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ + unsigned buf_len; + int diff; + + skb = *skb_p; + iph = skb->h.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + data = (char *)&th[1]; + + data_limit = skb->h.raw + skb->len - 18; + + while (data < data_limit) + { + if (memcmp(data,"PORT ",5) && memcmp(data,"port ",5)) + { + data ++; + continue; + } + p = data+5; + p1 = simple_strtoul(data+5,&data,10); + if (*data!=',') + continue; + p2 = simple_strtoul(data+1,&data,10); + if (*data!=',') + continue; + p3 = simple_strtoul(data+1,&data,10); + if (*data!=',') + continue; + p4 = simple_strtoul(data+1,&data,10); + if (*data!=',') + continue; + p5 = simple_strtoul(data+1,&data,10); + if (*data!=',') + continue; + p6 = simple_strtoul(data+1,&data,10); + if (*data!='\r' && *data!='\n') + continue; + + from = (p1<<24) | (p2<<16) | (p3<<8) | p4; + port = (p5<<8) | p6; +#if DEBUG_CONFIG_IP_MASQ_FTP + printk("PORT %X:%X detected\n",from,port); +#endif + /* + * Now update or create an 
masquerade entry for it + */ +#if DEBUG_CONFIG_IP_MASQ_FTP + printk("protocol %d %lX:%X %X:%X\n", iph->protocol, htonl(from), htons(port), iph->daddr, 0); + +#endif + n_ms = ip_masq_out_get_2(iph->protocol, + htonl(from), htons(port), + iph->daddr, 0); + if (n_ms) { + /* existing masquerade, clear timer */ + ip_masq_set_expire(n_ms,0); + } + else { + n_ms = ip_masq_new(dev, IPPROTO_TCP, + htonl(from), htons(port), + iph->daddr, 0, + IP_MASQ_F_NO_DPORT); + + if (n_ms==NULL) + return 0; + } + + /* + * keep for a bit longer than tcp_fin, caller may not reissue + * PORT before tcp_fin_timeout. + */ + ip_masq_set_expire(n_ms, ip_masq_expire->tcp_fin_timeout*3); + + /* + * Replace the old PORT with the new one + */ + from = ntohl(n_ms->maddr); + port = ntohs(n_ms->mport); + sprintf(buf,"%d,%d,%d,%d,%d,%d", + from>>24&255,from>>16&255,from>>8&255,from&255, + port>>8&255,port&255); + buf_len = strlen(buf); +#if DEBUG_CONFIG_IP_MASQ_FTP + printk("new PORT %X:%X\n",from,port); +#endif + + /* + * Calculate required delta-offset to keep TCP happy + */ + + diff = buf_len - (data-p); + + /* + * No shift. + */ + + if (diff==0) + { + /* + * simple case, just replace the old PORT cmd + */ + memcpy(p,buf,buf_len); + return 0; + } + + *skb_p = ip_masq_skb_replace(skb, GFP_ATOMIC, p, data-p, buf, buf_len); + return diff; + + } + return 0; + +} + +struct ip_masq_app ip_masq_ftp = { + NULL, /* next */ + "ftp", /* name */ + 0, /* type */ + 0, /* n_attach */ + masq_ftp_init_1, /* ip_masq_init_1 */ + masq_ftp_done_1, /* ip_masq_done_1 */ + masq_ftp_out, /* pkt_out */ + NULL /* pkt_in */ +}; + +/* + * ip_masq_ftp initialization + */ + +int ip_masq_ftp_init(void) +{ + return register_ip_masq_app(&ip_masq_ftp, IPPROTO_TCP, 21); +} + +/* + * ip_masq_ftp fin. 
+ */ + +int ip_masq_ftp_done(void) +{ + return unregister_ip_masq_app(&ip_masq_ftp); +} + +#ifdef MODULE + +int init_module(void) +{ + if (ip_masq_ftp_init() != 0) + return -EIO; + register_symtab(0); + return 0; +} + +void cleanup_module(void) +{ + if (ip_masq_ftp_done() != 0) + printk("ip_masq_ftp: can't remove module"); +} + +#endif /* MODULE */ diff --git a/net/ipv4/ip_masq_irc.c b/net/ipv4/ip_masq_irc.c new file mode 100644 index 000000000..4bb93e5d1 --- /dev/null +++ b/net/ipv4/ip_masq_irc.c @@ -0,0 +1,270 @@ +/* + * IP_MASQ_IRC irc masquerading module + * + * + * Version: @(#)ip_masq_irc.c 0.01 03/20/96 + * + * Author: Juan Jose Ciarlante + * + * + * Fixes: + * - set NO_DADDR flag in ip_masq_new(). + * + * FIXME: + * - detect also previous "PRIVMSG" string ?. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <linux/module.h> + +#include <linux/types.h> +#include <linux/kernel.h> +#include <asm/system.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <net/ip_masq.h> + +#define DEBUG_CONFIG_IP_MASQ_IRC 0 + +static int +masq_irc_init_1 (struct ip_masq_app *mapp, struct ip_masq *ms) +{ + MOD_INC_USE_COUNT; + return 0; +} + +static int +masq_irc_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms) +{ + MOD_DEC_USE_COUNT; + return 0; +} + +int +masq_irc_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, struct device *dev) +{ + struct sk_buff *skb; + struct iphdr *iph; + struct tcphdr *th; + char *data, *data_limit; + __u32 s_addr; + __u16 s_port; + struct ip_masq *n_ms; + char buf[20]; /* "m_addr m_port" (dec base)*/ + unsigned buf_len; + int diff; + int xtra_args = 0; /* extra int args wanted after addr */ + char *dcc_p, *addr_beg_p, *addr_end_p; + + skb = *skb_p; + iph = skb->h.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + data = (char *)&th[1]; + + /* + * Hunt irc DCC string, the _shortest_: + * + * strlen("DCC CHAT chat AAAAAAAA P\x01\n")=26 + * strlen("DCC SEND F AAAAAAAA P S\x01\n")=25 + * AAAAAAAAA: bound addr (1.0.0.0==16777216, min 8 digits) + * P: bound port (min 1 d ) + * F: filename (min 1 d ) + * S: size (min 1 d ) + * 0x01, \n: terminators + */ + + data_limit = skb->h.raw + skb->len; + + while (data < (data_limit - 25) ) + { + if (memcmp(data,"DCC ",4)) { + data ++; + continue; + } + + dcc_p = data; + data += 4; /* point to DCC cmd */ + + if (memcmp(data, "CHAT ", 5) == 0 || + memcmp(data, "SEND ", 5) == 0) + { + /* + * extra arg (file_size) req. for "SEND" + */ + + if (*data == 'S') xtra_args++; + data += 5; + } + else + continue; + + /* + * skip next string. + */ + + while( *data++ != ' ') + + /* + * must still parse, at least, "AAAAAAAA P\x01\n", + * 12 bytes left. 
+ */ + if (data > (data_limit-12)) return 0; + + + addr_beg_p = data; + + /* + * client bound address in dec base + */ + + s_addr = simple_strtoul(data,&data,10); + if (*data++ !=' ') + continue; + + /* + * client bound port in dec base + */ + + s_port = simple_strtoul(data,&data,10); + addr_end_p = data; + + /* + * should check args consistency? + */ + + while(xtra_args) { + if (*data != ' ') + break; + data++; + simple_strtoul(data,&data,10); + xtra_args--; + } + + if (xtra_args != 0) continue; + + /* + * terminators. + */ + + if (data[0] != 0x01) + continue; + if (data[1]!='\r' && data[1]!='\n') + continue; + + /* + * Now create an masquerade entry for it + * must set NO_DPORT and NO_DADDR because + * connection is requested by another client. + */ + + n_ms = ip_masq_new(dev, IPPROTO_TCP, + htonl(s_addr),htons(s_port), + 0, 0, + IP_MASQ_F_NO_DPORT|IP_MASQ_F_NO_DADDR + ); + if (n_ms==NULL) + return 0; + + ip_masq_set_expire(n_ms, ip_masq_expire->tcp_fin_timeout); + + /* + * Replace the old "address port" with the new one + */ + + buf_len = sprintf(buf,"%lu %u", + ntohl(n_ms->maddr),ntohs(n_ms->mport)); + + /* + * Calculate required delta-offset to keep TCP happy + */ + + diff = buf_len - (addr_end_p-addr_beg_p); + +#if DEBUG_CONFIG_IP_MASQ_IRC + *addr_beg_p = '\0'; + printk("masq_irc_out(): '%s' %X:%X detected (diff=%d)\n", dcc_p, s_addr,s_port, diff); +#endif + /* + * No shift. + */ + + if (diff==0) + { + /* + * simple case, just copy. + */ + memcpy(addr_beg_p,buf,buf_len); + return 0; + } + + *skb_p = ip_masq_skb_replace(skb, GFP_ATOMIC, + addr_beg_p, addr_end_p-addr_beg_p, + buf, buf_len); + return diff; + } + return 0; + +} + +/* + * Main irc object + * You need 1 object per port in case you need + * to offer also other used irc ports (6665,6666,etc), + * they will share methods but they need own space for + * data. 
+ */ + +struct ip_masq_app ip_masq_irc = { + NULL, /* next */ + "irc", /* name */ + 0, /* type */ + 0, /* n_attach */ + masq_irc_init_1, /* init_1 */ + masq_irc_done_1, /* done_1 */ + masq_irc_out, /* pkt_out */ + NULL /* pkt_in */ +}; + +/* + * ip_masq_irc initialization + */ + +int ip_masq_irc_init(void) +{ + return register_ip_masq_app(&ip_masq_irc, IPPROTO_TCP, 6667); +} + +/* + * ip_masq_irc fin. + */ + +int ip_masq_irc_done(void) +{ + return unregister_ip_masq_app(&ip_masq_irc); +} + +#ifdef MODULE + +int init_module(void) +{ + if (ip_masq_irc_init() != 0) + return -EIO; + register_symtab(NULL); + return 0; +} + +void cleanup_module(void) +{ + if (ip_masq_irc_done() != 0) + printk("ip_masq_irc: can't remove module"); +} + +#endif /* MODULE */ diff --git a/net/ipv4/ip_masq_raudio.c b/net/ipv4/ip_masq_raudio.c new file mode 100644 index 000000000..9fe4e8f15 --- /dev/null +++ b/net/ipv4/ip_masq_raudio.c @@ -0,0 +1,232 @@ +/* + * IP_MASQ_RAUDIO - Real Audio masquerading module + * + * + * Version: @(#)$Id: ip_masq_raudio.c,v 1.3 1996/05/20 13:24:26 nigel Exp $ + * + * Author: Nigel Metheringham + * [strongly based on ftp module by Juan Jose Ciarlante & Wouter Gadeyne] + * [Real Audio information taken from Progressive Networks firewall docs] + * + * + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * + * Limitations + * The IP Masquerading proxies at present do not have access to a processed + * data stream. Hence for a protocol like the Real Audio control protocol, + * which depends on knowing where you are in the data stream, you either + * to keep a *lot* of state in your proxy, or you cheat and simplify the + * problem [needless to say I did the latter]. + * + * This proxy only handles data in the first packet. 
Everything else is + * passed transparently. This means it should work under all normal + * circumstances, but it could be fooled by new data formats or a + * malicious application! + * + * At present the "first packet" is defined as a packet starting with + * the protocol ID string - "PNA". + * When the link is up there appears to be enough control data + * crossing the control link to keep it open even if a long audio + * piece is playing. + * + */ + +#include <linux/module.h> +#include <asm/system.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <net/ip_masq.h> + +#ifndef DEBUG_CONFIG_IP_MASQ_RAUDIO +#define DEBUG_CONFIG_IP_MASQ_RAUDIO 0 +#endif + +struct raudio_priv_data { + /* Associated data connection - setup but not used at present */ + struct ip_masq *data_conn; + /* Have we seen and performed setup */ + short seen_start; +}; + +static int +masq_raudio_init_1 (struct ip_masq_app *mapp, struct ip_masq *ms) +{ + MOD_INC_USE_COUNT; + if ((ms->app_data = kmalloc(sizeof(struct raudio_priv_data), + GFP_ATOMIC)) == NULL) + printk(KERN_INFO "RealAudio: No memory for application data\n"); + else + { + struct raudio_priv_data *priv = + (struct raudio_priv_data *)ms->app_data; + priv->seen_start = 0; + priv->data_conn = NULL; + } + return 0; +} + +static int +masq_raudio_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms) +{ + MOD_DEC_USE_COUNT; + if (ms->app_data) + kfree_s(ms->app_data, sizeof(struct raudio_priv_data)); + return 0; +} + +int +masq_raudio_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, struct device *dev) +{ + struct sk_buff *skb; + struct iphdr *iph; + struct tcphdr *th; + char *p, *data, *data_limit; + struct ip_masq *n_ms; + unsigned short version, msg_id, msg_len, udp_port; + struct raudio_priv_data *priv = + (struct raudio_priv_data *)ms->app_data; + + /* Everything running 
correctly already */ + if (priv && priv->seen_start) + return 0; + + skb = *skb_p; + iph = skb->h.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + data = (char *)&th[1]; + + data_limit = skb->h.raw + skb->len - 18; + + /* Check to see if this is the first packet with protocol ID */ + if (memcmp(data, "PNA", 3)) { +#if DEBUG_CONFIG_IP_MASQ_RAUDIO + printk("RealAudio: not initial protocol packet - ignored\n"); +#endif + return(0); + } + data += 3; + memcpy(&version, data, 2); + +#if DEBUG_CONFIG_IP_MASQ_RAUDIO + printk("RealAudio: initial seen - protocol version %d\n", + ntohs(version)); +#endif + if (priv) + priv->seen_start = 1; + + if (ntohs(version) >= 256) + { + printk(KERN_INFO "RealAudio: version (%d) not supported\n", + ntohs(version)); + return 0; + } + + data += 2; + while (data < data_limit) { + memcpy(&msg_id, data, 2); + data += 2; + memcpy(&msg_len, data, 2); + data += 2; +#if DEBUG_CONFIG_IP_MASQ_RAUDIO + printk("RealAudio: msg %d - %d byte\n", + ntohs(msg_id), ntohs(msg_len)); +#endif + p = data; + data += ntohs(msg_len); + if (data > data_limit) + { + printk(KERN_INFO "RealAudio: Packet too short for data\n"); + return 0; + } + if (ntohs(msg_id) == 1) { + /* This is a message detailing the UDP port to be used */ + memcpy(&udp_port, p, 2); + n_ms = ip_masq_new(dev, IPPROTO_UDP, + ms->saddr, udp_port, + ms->daddr, 0, + IP_MASQ_F_NO_DPORT); + + if (n_ms==NULL) + return 0; + + memcpy(p, &(n_ms->mport), 2); +#if DEBUG_CONFIG_IP_MASQ_RAUDIO + printk("RealAudio: rewrote UDP port %d -> %d\n", + ntohs(udp_port), ntohs(n_ms->mport)); +#endif + ip_masq_set_expire(n_ms, ip_masq_expire->udp_timeout); + + /* Make ref in application data to data connection */ + if (priv) + priv->data_conn = n_ms; + + /* + * There is nothing else useful we can do + * Maybe a development could do more, but for now + * we exit gracefully! 
+ */ + return 0; + + } else if (ntohs(msg_id) == 0) + return 0; + } + return 0; +} + +struct ip_masq_app ip_masq_raudio = { + NULL, /* next */ + "RealAudio", /* name */ + 0, /* type */ + 0, /* n_attach */ + masq_raudio_init_1, /* ip_masq_init_1 */ + masq_raudio_done_1, /* ip_masq_done_1 */ + masq_raudio_out, /* pkt_out */ + NULL /* pkt_in */ +}; + +/* + * ip_masq_raudio initialization + */ + +int ip_masq_raudio_init(void) +{ + return register_ip_masq_app(&ip_masq_raudio, IPPROTO_TCP, 7070); +} + +/* + * ip_masq_raudio fin. + */ + +int ip_masq_raudio_done(void) +{ + return unregister_ip_masq_app(&ip_masq_raudio); +} + +#ifdef MODULE + +int init_module(void) +{ + if (ip_masq_raudio_init() != 0) + return -EIO; + register_symtab(0); + return 0; +} + +void cleanup_module(void) +{ + if (ip_masq_raudio_done() != 0) + printk("ip_masq_raudio: can't remove module"); +} + +#endif /* MODULE */ diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c new file mode 100644 index 000000000..517c0e219 --- /dev/null +++ b/net/ipv4/ip_options.c @@ -0,0 +1,474 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The options processing module for ip.c + * + * Authors: A.N.Kuznetsov + * + */ + +#include <linux/types.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/icmp.h> +#include <linux/netdevice.h> +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> + +/* + * Write options to IP header, record destination address to + * source route option, address of outgoing interface + * (we should already know it, so that this function is allowed be + * called only after routing decision) and timestamp, + * if we originate this datagram. 
+ */ + +void ip_options_build(struct sk_buff * skb, struct options * opt, + __u32 daddr, __u32 saddr, + int is_frag) +{ + unsigned char * iph = (unsigned char*)skb->ip_hdr; + + memcpy(skb->proto_priv, opt, sizeof(struct options)); + memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen); + opt = (struct options*)skb->proto_priv; + opt->is_data = 0; + + if (opt->srr) + memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4); + + if (!is_frag) + { + if (opt->rr_needaddr) + memcpy(iph+opt->rr+iph[opt->rr+2]-5, &saddr, 4); + if (opt->ts_needaddr) + memcpy(iph+opt->ts+iph[opt->ts+2]-9, &saddr, 4); + if (opt->ts_needtime) + { + struct timeval tv; + __u32 midtime; + do_gettimeofday(&tv); + midtime = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000); + memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4); + } + return; + } + if (opt->rr) + { + memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]); + opt->rr = 0; + opt->rr_needaddr = 0; + } + if (opt->ts) + { + memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]); + opt->ts = 0; + opt->ts_needaddr = opt->ts_needtime = 0; + } +} + +int ip_options_echo(struct options * dopt, struct options * sopt, + __u32 daddr, __u32 saddr, + struct sk_buff * skb) +{ + unsigned char *sptr, *dptr; + int soffset, doffset; + int optlen; + + memset(dopt, 0, sizeof(struct options)); + + dopt->is_data = 1; + + if (!sopt) + sopt = (struct options*)skb->proto_priv; + + if (sopt->optlen == 0) + { + dopt->optlen = 0; + return 0; + } + + sptr = (sopt->is_data ? 
sopt->__data - sizeof(struct iphdr) : + (unsigned char *)skb->ip_hdr); + dptr = dopt->__data; + + if (sopt->rr) + { + optlen = sptr[sopt->rr+1]; + soffset = sptr[sopt->rr+2]; + dopt->rr = dopt->optlen + sizeof(struct iphdr); + memcpy(dptr, sptr+sopt->rr, optlen); + if (sopt->rr_needaddr && soffset <= optlen) { + if (soffset + 3 > optlen) + return -EINVAL; + dptr[2] = soffset + 4; + dopt->rr_needaddr = 1; + } + dptr += optlen; + dopt->optlen += optlen; + } + if (sopt->ts) + { + optlen = sptr[sopt->ts+1]; + soffset = sptr[sopt->ts+2]; + dopt->ts = dopt->optlen + sizeof(struct iphdr); + memcpy(dptr, sptr+sopt->ts, optlen); + if (soffset <= optlen) + { + if (sopt->ts_needaddr) + { + if (soffset + 3 > optlen) + return -EINVAL; + dopt->ts_needaddr = 1; + soffset += 4; + } + if (sopt->ts_needtime) + { + if (soffset + 3 > optlen) + return -EINVAL; + dopt->ts_needtime = 1; + soffset += 4; + } + if (((struct timestamp*)(dptr+1))->flags == IPOPT_TS_PRESPEC) + { + __u32 addr; + memcpy(&addr, sptr+soffset-9, 4); + if (ip_chk_addr(addr) == 0) + { + dopt->ts_needtime = 0; + dopt->ts_needaddr = 0; + soffset -= 8; + } + } + dptr[2] = soffset; + } + dptr += optlen; + dopt->optlen += optlen; + } + if (sopt->srr) + { + unsigned char * start = sptr+sopt->srr; + __u32 faddr; + + optlen = start[1]; + soffset = start[2]; + doffset = 0; + if (soffset > optlen) + soffset = optlen + 1; + soffset -= 4; + if (soffset > 3) + { + memcpy(&faddr, &start[soffset-1], 4); + for (soffset-=4, doffset=4; soffset > 3; soffset-=4, doffset+=4) + memcpy(&dptr[doffset-1], &start[soffset-1], 4); + /* + * RFC1812 requires to fix illegal source routes. 
+ */ + if (memcmp(&saddr, &start[soffset+3], 4) == 0) + doffset -= 4; + } + if (doffset > 3) + { + memcpy(&start[doffset-1], &daddr, 4); + dopt->faddr = faddr; + dptr[0] = start[0]; + dptr[1] = doffset+3; + dptr[2] = 4; + dptr += doffset+3; + dopt->srr = dopt->optlen + sizeof(struct iphdr); + dopt->optlen += doffset+3; + dopt->is_strictroute = sopt->is_strictroute; + } + } + while (dopt->optlen & 3) + { + *dptr++ = IPOPT_END; + dopt->optlen++; + } + return 0; +} + +void ip_options_fragment(struct sk_buff * skb) +{ + unsigned char * optptr = (unsigned char*)skb->ip_hdr; + struct options * opt = (struct options*)skb->proto_priv; + int l = opt->optlen; + int optlen; + + while (l > 0) + { + switch (*optptr) + { + case IPOPT_END: + return; + case IPOPT_NOOP: + l--; + optptr++; + continue; + } + optlen = optptr[1]; + if (optlen<2 || optlen>l) + return; + if (!(*optptr & 0x80)) + memset(optptr, IPOPT_NOOP, optlen); + l -= optlen; + optptr += optlen; + } + opt->ts = 0; + opt->rr = 0; + opt->rr_needaddr = 0; + opt->ts_needaddr = 0; + opt->ts_needtime = 0; + return; +} + +/* + * Verify options and fill pointers in struct options. + * Caller should clear *opt, and set opt->data. + * If opt == NULL, then skb->data should point to IP header. + */ + +int ip_options_compile(struct options * opt, struct sk_buff * skb) +{ + int l; + unsigned char * iph; + unsigned char * optptr; + int optlen; + unsigned char * pp_ptr = NULL; + + if (!opt) + { + opt = (struct options*)skb->proto_priv; + memset(opt, 0, sizeof(struct options)); + iph = (unsigned char*)skb->ip_hdr; + opt->optlen = ((struct iphdr *)iph)->ihl*4 - sizeof(struct iphdr); + optptr = iph + sizeof(struct iphdr); + opt->is_data = 0; + } + else + { + optptr = opt->is_data ? 
opt->__data : (unsigned char*)&skb->ip_hdr[1]; + iph = optptr - sizeof(struct iphdr); + } + + for (l = opt->optlen; l > 0; ) + { + switch (*optptr) + { + case IPOPT_END: + for (optptr++, l--; l>0; l--) + { + if (*optptr != IPOPT_END) + { + *optptr = IPOPT_END; + opt->is_changed = 1; + } + } + goto eol; + case IPOPT_NOOP: + l--; + optptr++; + continue; + } + optlen = optptr[1]; + if (optlen<2 || optlen>l) + { + pp_ptr = optptr; + goto error; + } + switch (*optptr) + { + case IPOPT_SSRR: + case IPOPT_LSRR: + if (optlen < 3) + { + pp_ptr = optptr + 1; + goto error; + } + if (optptr[2] < 4) + { + pp_ptr = optptr + 2; + goto error; + } + /* NB: cf RFC-1812 5.2.4.1 */ + if (opt->srr) + { + pp_ptr = optptr; + goto error; + } + if (!skb) + { + if (optptr[2] != 4 || optlen < 7 || ((optlen-3) & 3)) + { + pp_ptr = optptr + 1; + goto error; + } + memcpy(&opt->faddr, &optptr[3], 4); + if (optlen > 7) + memmove(&optptr[3], &optptr[7], optlen-7); + } + opt->is_strictroute = (optptr[0] == IPOPT_SSRR); + opt->srr = optptr - iph; + break; + case IPOPT_RR: + if (opt->rr) + { + pp_ptr = optptr; + goto error; + } + if (optlen < 3) + { + pp_ptr = optptr + 1; + goto error; + } + if (optptr[2] < 4) + { + pp_ptr = optptr + 2; + goto error; + } + if (optptr[2] <= optlen) + { + if (optptr[2]+3 > optlen) + { + pp_ptr = optptr + 2; + goto error; + } + if (skb) + { + memcpy(&optptr[optptr[2]-1], &skb->dev->pa_addr, 4); + opt->is_changed = 1; + } + optptr[2] += 4; + opt->rr_needaddr = 1; + } + opt->rr = optptr - iph; + break; + case IPOPT_TIMESTAMP: + if (opt->ts) + { + pp_ptr = optptr; + goto error; + } + if (optlen < 4) + { + pp_ptr = optptr + 1; + goto error; + } + if (optptr[2] < 5) + { + pp_ptr = optptr + 2; + goto error; + } + if (optptr[2] <= optlen) + { + struct timestamp * ts = (struct timestamp*)(optptr+1); + __u32 * timeptr = NULL; + if (ts->ptr+3 > ts->len) + { + pp_ptr = optptr + 2; + goto error; + } + switch (ts->flags) + { + case IPOPT_TS_TSONLY: + opt->ts = optptr - iph; + if 
(skb) + timeptr = (__u32*)&optptr[ts->ptr-1]; + opt->ts_needtime = 1; + ts->ptr += 4; + break; + case IPOPT_TS_TSANDADDR: + if (ts->ptr+7 > ts->len) + { + pp_ptr = optptr + 2; + goto error; + } + opt->ts = optptr - iph; + if (skb) + { + memcpy(&optptr[ts->ptr-1], &skb->dev->pa_addr, 4); + timeptr = (__u32*)&optptr[ts->ptr+3]; + } + opt->ts_needaddr = 1; + opt->ts_needtime = 1; + ts->ptr += 8; + break; + case IPOPT_TS_PRESPEC: + if (ts->ptr+7 > ts->len) + { + pp_ptr = optptr + 2; + goto error; + } + opt->ts = optptr - iph; + { + __u32 addr; + memcpy(&addr, &optptr[ts->ptr-1], 4); + if (ip_chk_addr(addr) == 0) + break; + if (skb) + timeptr = (__u32*)&optptr[ts->ptr+3]; + } + opt->ts_needaddr = 1; + opt->ts_needtime = 1; + ts->ptr += 8; + break; + default: + pp_ptr = optptr + 3; + goto error; + } + if (timeptr) + { + struct timeval tv; + __u32 midtime; + do_gettimeofday(&tv); + midtime = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000); + memcpy(timeptr, &midtime, sizeof(__u32)); + opt->is_changed = 1; + } + } + else + { + struct timestamp * ts = (struct timestamp*)(optptr+1); + if (ts->overflow == 15) + { + pp_ptr = optptr + 3; + goto error; + } + opt->ts = optptr - iph; + if (skb) + { + ts->overflow++; + opt->is_changed = 1; + } + } + break; + case IPOPT_SEC: + case IPOPT_SID: + default: + if (!skb) + { + pp_ptr = optptr; + goto error; + } + break; + } + l -= optlen; + optptr += optlen; + } + +eol: + if (!pp_ptr) + return 0; + +error: + if (skb) + { + icmp_send(skb, ICMP_PARAMETERPROB, 0, pp_ptr-iph, skb->dev); + kfree_skb(skb, FREE_READ); + } + return -EINVAL; +} + diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c new file mode 100644 index 000000000..b8c7891e8 --- /dev/null +++ b/net/ipv4/ip_output.c @@ -0,0 +1,1117 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. 
+ * + * The Internet Protocol (IP) output module. + * + * Version: @(#)ip.c 1.0.16b 9/1/93 + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Donald Becker, <becker@super.org> + * Alan Cox, <Alan.Cox@linux.org> + * Richard Underwood + * Stefan Becker, <stefanb@yello.ping.de> + * Jorge Cwik, <jorge@laser.satlink.net> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * + * See ip_input.c for original log + * + * Fixes: + * Alan Cox : Missing nonblock feature in ip_build_xmit. + * Mike Kilburn : htons() missing in ip_build_xmit. + * Bradford Johnson: Fix faulty handling of some frames when + * no route is found. + * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit + * (in case if packet not accepted by + * output firewall rules) + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/config.h> + +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> + +#include <net/snmp.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/route.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/arp.h> +#include <net/icmp.h> +#include <net/raw.h> +#include <net/checksum.h> +#include <linux/igmp.h> +#include <linux/ip_fw.h> +#include <linux/firewall.h> +#include <linux/mroute.h> +#include <net/netlink.h> + +/* + * Loop a packet back to the sender. 
+ */ + +static void ip_loopback(struct device *old_dev, struct sk_buff *skb) +{ + struct device *dev=&loopback_dev; + int len=ntohs(skb->ip_hdr->tot_len); + struct sk_buff *newskb=dev_alloc_skb(len+dev->hard_header_len+15); + + if(newskb==NULL) + return; + + newskb->link3=NULL; + newskb->sk=NULL; + newskb->dev=dev; + newskb->saddr=skb->saddr; + newskb->daddr=skb->daddr; + newskb->raddr=skb->raddr; + newskb->free=1; + newskb->lock=0; + newskb->users=0; + newskb->pkt_type=skb->pkt_type; + + /* + * Put a MAC header on the packet + */ + ip_send(NULL,newskb, skb->ip_hdr->daddr, len, dev, skb->ip_hdr->saddr); + /* + * Add the rest of the data space. + */ + newskb->ip_hdr=(struct iphdr *)skb_put(newskb, len); + memcpy(newskb->proto_priv, skb->proto_priv, sizeof(skb->proto_priv)); + + /* + * Copy the data + */ + memcpy(newskb->ip_hdr,skb->ip_hdr,len); + + /* Recurse. The device check against IFF_LOOPBACK will stop infinite recursion */ + + /*printk("Loopback output queued [%lX to %lX].\n", newskb->ip_hdr->saddr,newskb->ip_hdr->daddr);*/ + ip_queue_xmit(NULL, dev, newskb, 2); +} + + + +/* + * Take an skb, and fill in the MAC header. + */ + +int ip_send(struct rtable * rt, struct sk_buff *skb, __u32 daddr, int len, struct device *dev, __u32 saddr) +{ + int mac = 0; + + skb->dev = dev; + skb->arp = 1; + skb->protocol = htons(ETH_P_IP); + if (dev->hard_header) + { + /* + * Build a hardware header. 
Source address is our mac, destination unknown + * (rebuild header will sort this out) + */ + skb_reserve(skb,(dev->hard_header_len+15)&~15); /* 16 byte aligned IP headers are good */ + if (rt && dev == rt->rt_dev && rt->rt_hh) + { + memcpy(skb_push(skb,dev->hard_header_len),rt->rt_hh->hh_data,dev->hard_header_len); + if (rt->rt_hh->hh_uptodate) + return dev->hard_header_len; +#if RT_CACHE_DEBUG >= 2 + printk("ip_send: hh miss %08x via %08x\n", daddr, rt->rt_gateway); +#endif + skb->arp = 0; + skb->raddr = daddr; + return dev->hard_header_len; + } + mac = dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, len); + if (mac < 0) + { + mac = -mac; + skb->arp = 0; + skb->raddr = daddr; /* next routing address */ + } + } + return mac; +} + +static int ip_send_room(struct rtable * rt, struct sk_buff *skb, __u32 daddr, int len, struct device *dev, __u32 saddr) +{ + int mac = 0; + + skb->dev = dev; + skb->arp = 1; + skb->protocol = htons(ETH_P_IP); + skb_reserve(skb,MAX_HEADER); + if (dev->hard_header) + { + if (rt && dev == rt->rt_dev && rt->rt_hh) + { + memcpy(skb_push(skb,dev->hard_header_len),rt->rt_hh->hh_data,dev->hard_header_len); + if (rt->rt_hh->hh_uptodate) + return dev->hard_header_len; +#if RT_CACHE_DEBUG >= 2 + printk("ip_send_room: hh miss %08x via %08x\n", daddr, rt->rt_gateway); +#endif + skb->arp = 0; + skb->raddr = daddr; + return dev->hard_header_len; + } + mac = dev->hard_header(skb, dev, ETH_P_IP, NULL, NULL, len); + if (mac < 0) + { + mac = -mac; + skb->arp = 0; + skb->raddr = daddr; /* next routing address */ + } + } + return mac; +} + +int ip_id_count = 0; + +/* + * This routine builds the appropriate hardware/IP headers for + * the routine. It assumes that if *dev != NULL then the + * protocol knows what it's doing, otherwise it uses the + * routing/ARP tables to select a device struct. 
+ */ +int ip_build_header(struct sk_buff *skb, __u32 saddr, __u32 daddr, + struct device **dev, int type, struct options *opt, + int len, int tos, int ttl, struct rtable ** rp) +{ + struct rtable *rt; + __u32 raddr; + int tmp; + struct iphdr *iph; + __u32 final_daddr = daddr; + + + if (opt && opt->srr) + daddr = opt->faddr; + + /* + * See if we need to look up the device. + */ + +#ifdef CONFIG_IP_MULTICAST + if(MULTICAST(daddr) && *dev==NULL && skb->sk && *skb->sk->ip_mc_name) + *dev=dev_get(skb->sk->ip_mc_name); +#endif + if (rp) + { + rt = ip_check_route(rp, daddr, skb->localroute); + /* + * If rp != NULL rt_put following below should not + * release route, so that... + */ + if (rt) + atomic_inc(&rt->rt_refcnt); + } + else + rt = ip_rt_route(daddr, skb->localroute); + + + if (*dev == NULL) + { + if (rt == NULL) + { + ip_statistics.IpOutNoRoutes++; + return(-ENETUNREACH); + } + + *dev = rt->rt_dev; + } + + if ((LOOPBACK(saddr) && !LOOPBACK(daddr)) || !saddr) + saddr = rt ? rt->rt_src : (*dev)->pa_addr; + + raddr = rt ? rt->rt_gateway : daddr; + + if (opt && opt->is_strictroute && rt && (rt->rt_flags & RTF_GATEWAY)) + { + ip_rt_put(rt); + ip_statistics.IpOutNoRoutes++; + return -ENETUNREACH; + } + + /* + * Now build the MAC header. + */ + + if (type==IPPROTO_TCP) + tmp = ip_send_room(rt, skb, raddr, len, *dev, saddr); + else + tmp = ip_send(rt, skb, raddr, len, *dev, saddr); + + ip_rt_put(rt); + + /* + * Book keeping + */ + + skb->dev = *dev; + skb->saddr = saddr; + + /* + * Now build the IP header. 
+ */ + + /* + * If we are using IPPROTO_RAW, then we don't need an IP header, since + * one is being supplied to us by the user + */ + + if(type == IPPROTO_RAW) + return (tmp); + + /* + * Build the IP addresses + */ + + if (opt) + iph=(struct iphdr *)skb_put(skb,sizeof(struct iphdr) + opt->optlen); + else + iph=(struct iphdr *)skb_put(skb,sizeof(struct iphdr)); + + iph->version = 4; + iph->ihl = 5; + iph->tos = tos; + iph->frag_off = 0; + iph->ttl = ttl; + iph->daddr = daddr; + iph->saddr = saddr; + iph->protocol = type; + skb->ip_hdr = iph; + + if (!opt || !opt->optlen) + return sizeof(struct iphdr) + tmp; + iph->ihl += opt->optlen>>2; + ip_options_build(skb, opt, final_daddr, (*dev)->pa_addr, 0); + return iph->ihl*4 + tmp; +} + + +/* + * Generate a checksum for an outgoing IP datagram. + */ + +void ip_send_check(struct iphdr *iph) +{ + iph->check = 0; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); +} + + + +/* + * Queues a packet to be sent, and starts the transmitter + * if necessary. if free = 1 then we free the block after + * transmit, otherwise we don't. If free==2 we not only + * free the block but also don't assign a new ip seq number. + * This routine also needs to put in the total length, + * and compute the checksum + */ + +void ip_queue_xmit(struct sock *sk, struct device *dev, + struct sk_buff *skb, int free) +{ + unsigned int tot_len; + struct iphdr *iph; + + IS_SKB(skb); + + /* + * Do some book-keeping in the packet for later + */ + + skb->sk = sk; + skb->dev = dev; + skb->when = jiffies; + + /* + * Find the IP header and set the length. This is bad + * but once we get the skb data handling code in the + * hardware will push its header sensibly and we will + * set skb->ip_hdr to avoid this mess and the fixed + * header length problem + */ + + iph = skb->ip_hdr; + tot_len = skb->len - (((unsigned char *)iph) - skb->data); + iph->tot_len = htons(tot_len); + + switch (free) { + /* No reassigning numbers to fragments... 
*/ + case 2: + free = 1; + break; + default: + free = 1; + iph->id = htons(ip_id_count++); + } + + skb->free = free; + + /* Sanity check */ + if (dev == NULL) + goto no_device; + +#ifdef CONFIG_FIREWALL + if (call_out_firewall(PF_INET, skb->dev, iph, NULL) < FW_ACCEPT) + goto out; +#endif + + /* + * Do we need to fragment. Again this is inefficient. + * We need to somehow lock the original buffer and use + * bits of it. + */ + + if (tot_len > dev->mtu) + { + goto fragment; + } + + /* + * Add an IP checksum + */ + + ip_send_check(iph); + + /* + * More debugging. You cannot queue a packet already on a list + * Spot this and moan loudly. + */ + if (skb->next != NULL) + { + NETDEBUG(printk("ip_queue_xmit: next != NULL\n")); + skb_unlink(skb); + } + + /* + * If the indicated interface is up and running, send the packet. + */ + + ip_statistics.IpOutRequests++; +#ifdef CONFIG_IP_ACCT + ip_fw_chk(iph,dev,NULL,ip_acct_chain,0,IP_FW_MODE_ACCT_OUT); +#endif + +#ifdef CONFIG_IP_MULTICAST + + /* + * Multicasts are looped back for other local users + */ + + if (MULTICAST(iph->daddr) && !(dev->flags&IFF_LOOPBACK)) + { + if(sk==NULL || sk->ip_mc_loop) + { + if(iph->daddr==IGMP_ALL_HOSTS || (dev->flags&IFF_ALLMULTI)) + { + ip_loopback(dev,skb); + } + else + { + struct ip_mc_list *imc=dev->ip_mc_list; + while(imc!=NULL) + { + if(imc->multiaddr==iph->daddr) + { + ip_loopback(dev,skb); + break; + } + imc=imc->next; + } + } + } + /* Multicasts with ttl 0 must not go beyond the host */ + + if (iph->ttl==0) + goto out; + } +#endif + if ((dev->flags & IFF_BROADCAST) && !(dev->flags & IFF_LOOPBACK) + && (iph->daddr==dev->pa_brdaddr || iph->daddr==0xFFFFFFFF)) + ip_loopback(dev,skb); + + if (dev->flags & IFF_UP) + { + /* + * If we have an owner use its priority setting, + * otherwise use NORMAL + */ + int priority = SOPRI_NORMAL; + if (sk) + priority = sk->priority; + + dev_queue_xmit(skb, dev, priority); + return; + } + if(sk) + sk->err = ENETDOWN; + ip_statistics.IpOutDiscards++; +out: + 
if (free) + kfree_skb(skb, FREE_WRITE); + return; + +no_device: + NETDEBUG(printk("IP: ip_queue_xmit dev = NULL\n")); + goto out; + +fragment: + if ((iph->frag_off & htons(IP_DF))) + { + printk(KERN_DEBUG "sending pkt_too_big to self\n"); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(dev->mtu), dev); + goto out; + } + ip_fragment(sk,skb,dev,0); + goto out; +} + + +/* + * Build and send a packet, with as little as one copy + * + * Doesn't care much about ip options... option length can be + * different for fragment at 0 and other fragments. + * + * Note that the fragment at the highest offset is sent first, + * so the getfrag routine can fill in the TCP/UDP checksum header + * field in the last fragment it sends... actually it also helps + * the reassemblers, they can put most packets in at the head of + * the fragment queue, and they know the total size in advance. This + * last feature will measurable improve the Linux fragment handler. + * + * The callback has five args, an arbitrary pointer (copy of frag), + * the source IP address (may depend on the routing table), the + * destination address (char *), the offset to copy from, and the + * length to be copied. 
+ * + */ + +int ip_build_xmit(struct sock *sk, + int getfrag (const void *, + __u32, + char *, + unsigned int, + unsigned int), + const void *frag, + unsigned short int length, + __u32 daddr, + __u32 user_saddr, + struct options * opt, + int flags, + int type, + int noblock) +{ + struct rtable *rt; + unsigned int fraglen, maxfraglen, fragheaderlen; + int offset, mf; + __u32 saddr; + unsigned short id; + struct iphdr *iph; + __u32 raddr; + struct device *dev = NULL; + struct hh_cache * hh=NULL; + int nfrags=0; + __u32 true_daddr = daddr; + int err; + + if (opt && opt->srr && !sk->ip_hdrincl) + daddr = opt->faddr; + + ip_statistics.IpOutRequests++; + +#ifdef CONFIG_IP_MULTICAST + if(MULTICAST(daddr) && *sk->ip_mc_name) + { + dev=dev_get(sk->ip_mc_name); + if(!dev) + return -ENODEV; + rt=NULL; + if (sk->saddr && (!LOOPBACK(sk->saddr) || LOOPBACK(daddr))) + saddr = sk->saddr; + else + saddr = dev->pa_addr; + } + else + { +#endif + rt = ip_check_route(&sk->ip_route_cache, daddr, + sk->localroute || (flags&MSG_DONTROUTE) || + (opt && opt->is_strictroute)); + if (rt == NULL) + { + ip_statistics.IpOutNoRoutes++; + return(-ENETUNREACH); + } + saddr = rt->rt_src; + + hh = rt->rt_hh; + + if (sk->saddr && (!LOOPBACK(sk->saddr) || LOOPBACK(daddr))) + saddr = sk->saddr; + + dev=rt->rt_dev; +#ifdef CONFIG_IP_MULTICAST + } + if (rt && !dev) + dev = rt->rt_dev; +#endif + if (user_saddr) + saddr = user_saddr; + + raddr = rt ? rt->rt_gateway : daddr; + /* + * Now compute the buffer space we require + */ + + /* + * Try the simple case first. 
This leaves broadcast, multicast, fragmented frames, and by + * choice RAW frames within 20 bytes of maximum size(rare) to the long path + */ + + if (!sk->ip_hdrincl) { + length += sizeof(struct iphdr); + if(opt) length += opt->optlen; + } + + if(length <= dev->mtu && !MULTICAST(daddr) && daddr!=0xFFFFFFFF && daddr!=dev->pa_brdaddr) + { + int error; + struct sk_buff *skb=sock_alloc_send_skb(sk, length+15+dev->hard_header_len,0, noblock, &error); + if(skb==NULL) + { + ip_statistics.IpOutDiscards++; + return error; + } + skb->dev=dev; + skb->protocol = htons(ETH_P_IP); + skb->free=1; + skb->when=jiffies; + skb->sk=sk; + skb->arp=0; + skb->saddr=saddr; + skb->raddr = raddr; + skb_reserve(skb,(dev->hard_header_len+15)&~15); + if (hh) + { + skb->arp=1; + memcpy(skb_push(skb,dev->hard_header_len),hh->hh_data,dev->hard_header_len); + if (!hh->hh_uptodate) + { + skb->arp = 0; +#if RT_CACHE_DEBUG >= 2 + printk("ip_build_xmit: hh miss %08x via %08x\n", rt->rt_dst, rt->rt_gateway); +#endif + } + } + else if(dev->hard_header) + { + if(dev->hard_header(skb,dev,ETH_P_IP,NULL,NULL,0)>0) + skb->arp=1; + } + else + skb->arp=1; + skb->ip_hdr=iph=(struct iphdr *)skb_put(skb,length); + dev_lock_list(); + if(!sk->ip_hdrincl) + { + iph->version=4; + iph->ihl=5; + iph->tos=sk->ip_tos; + iph->tot_len = htons(length); + iph->id=htons(ip_id_count++); + iph->frag_off = 0; + iph->ttl=sk->ip_ttl; + iph->protocol=type; + iph->saddr=saddr; + iph->daddr=daddr; + if (opt) + { + iph->ihl += opt->optlen>>2; + ip_options_build(skb, opt, + true_daddr, dev->pa_addr, 0); + } + iph->check=0; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + err = getfrag(frag,saddr,((char *)iph)+iph->ihl*4,0, length-iph->ihl*4); + } + else + err = getfrag(frag, saddr, (void *)iph, 0, length); + + dev_unlock_list(); + + if (err) + { + err = -EFAULT; + } + +#ifdef CONFIG_FIREWALL + if(!err && call_out_firewall(PF_INET, skb->dev, iph, NULL)< FW_ACCEPT) + { + err = -EPERM; + } +#endif + + if (err) + { + 
kfree_skb(skb, FREE_WRITE); + return err; + } + +#ifdef CONFIG_IP_ACCT + ip_fw_chk(iph,dev,NULL,ip_acct_chain,0,IP_FW_MODE_ACCT_OUT); +#endif + if(dev->flags&IFF_UP) + dev_queue_xmit(skb,dev,sk->priority); + else + { + ip_statistics.IpOutDiscards++; + kfree_skb(skb, FREE_WRITE); + } + return 0; + } + if (!sk->ip_hdrincl) + length -= sizeof(struct iphdr); + + if(opt) + { + length -= opt->optlen; + fragheaderlen = dev->hard_header_len + sizeof(struct iphdr) + opt->optlen; + maxfraglen = ((dev->mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen; + } + else + { + fragheaderlen = dev->hard_header_len; + if(!sk->ip_hdrincl) + fragheaderlen += 20; + + /* + * Fragheaderlen is the size of 'overhead' on each buffer. + * Now work out the size of the frames to send. + */ + + maxfraglen = ((dev->mtu-20) & ~7) + fragheaderlen; + } + + /* + * Start at the end of the frame by handling the remainder. + */ + + offset = length - (length % (maxfraglen - fragheaderlen)); + + /* + * Amount of memory to allocate for final fragment. + */ + + fraglen = length - offset + fragheaderlen; + + if(length-offset==0) + { + fraglen = maxfraglen; + offset -= maxfraglen-fragheaderlen; + } + + + /* + * The last fragment will not have MF (more fragments) set. + */ + + mf = 0; + + /* + * Can't fragment raw packets + */ + + if (sk->ip_hdrincl && offset > 0) + return(-EMSGSIZE); + + /* + * Lock the device lists. + */ + + dev_lock_list(); + + /* + * Get an identifier + */ + + id = htons(ip_id_count++); + + /* + * Being outputting the bytes. + */ + + do + { + struct sk_buff * skb; + int error; + char *data; + + /* + * Get the memory we require with some space left for alignment. 
+ */ + + skb = sock_alloc_send_skb(sk, fraglen+15, 0, noblock, &error); + if (skb == NULL) + { + ip_statistics.IpOutDiscards++; + if(nfrags>1) + ip_statistics.IpFragCreates++; + dev_unlock_list(); + return(error); + } + + /* + * Fill in the control structures + */ + + skb->dev = dev; + skb->protocol = htons(ETH_P_IP); + skb->when = jiffies; + skb->free = 1; /* dubious, this one */ + skb->sk = sk; + skb->arp = 0; + skb->saddr = saddr; + skb->daddr = daddr; + skb->raddr = raddr; + skb_reserve(skb,(dev->hard_header_len+15)&~15); + data = skb_put(skb, fraglen-dev->hard_header_len); + + /* + * Save us ARP and stuff. In the optimal case we do no route lookup (route cache ok) + * no ARP lookup (arp cache ok) and output. The cache checks are still too slow but + * this can be fixed later. For gateway routes we ought to have a rt->.. header cache + * pointer to speed header cache builds for identical targets. + */ + + if (hh) + { + skb->arp=1; + memcpy(skb_push(skb,dev->hard_header_len),hh->hh_data,dev->hard_header_len); + if (!hh->hh_uptodate) + { + skb->arp = 0; +#if RT_CACHE_DEBUG >= 2 + printk("ip_build_xmit: hh miss %08x via %08x\n", rt->rt_dst, rt->rt_gateway); +#endif + } + } + else if (dev->hard_header) + { + if(dev->hard_header(skb, dev, ETH_P_IP, + NULL, NULL, 0)>0) + skb->arp=1; + } + else + skb->arp = 1; + + /* + * Find where to start putting bytes. 
+ */ + + skb->ip_hdr = iph = (struct iphdr *)data; + + /* + * Only write IP header onto non-raw packets + */ + + if(!sk->ip_hdrincl) + { + + iph->version = 4; + iph->ihl = 5; /* ugh */ + if (opt) { + iph->ihl += opt->optlen>>2; + ip_options_build(skb, opt, + true_daddr, dev->pa_addr, offset); + } + iph->tos = sk->ip_tos; + iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4); + iph->id = id; + iph->frag_off = htons(offset>>3); + iph->frag_off |= mf; +#ifdef CONFIG_IP_MULTICAST + if (MULTICAST(daddr)) + iph->ttl = sk->ip_mc_ttl; + else +#endif + iph->ttl = sk->ip_ttl; + iph->protocol = type; + iph->check = 0; + iph->saddr = saddr; + iph->daddr = daddr; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + data += iph->ihl*4; + + /* + * Any further fragments will have MF set. + */ + + mf = htons(IP_MF); + } + + /* + * User data callback + */ + + err = getfrag(frag, saddr, data, offset, fraglen-fragheaderlen); + if (err) + { + err = -EFAULT; + } + + /* + * Account for the fragment. + */ + +#ifdef CONFIG_FIREWALL + if(!err && !offset && call_out_firewall(PF_INET, skb->dev, iph, NULL) < FW_ACCEPT) + { + err = -EPERM; + } +#endif + if (err) + { + kfree_skb(skb, FREE_WRITE); + dev_unlock_list(); + return err; + } + +#ifdef CONFIG_IP_ACCT + if(!offset) + ip_fw_chk(iph, dev, NULL, ip_acct_chain, 0, IP_FW_MODE_ACCT_OUT); +#endif + offset -= (maxfraglen-fragheaderlen); + fraglen = maxfraglen; + +#ifdef CONFIG_IP_MULTICAST + + /* + * Multicasts are looped back for other local users + */ + + if (MULTICAST(daddr) && !(dev->flags&IFF_LOOPBACK)) + { + /* + * Loop back any frames. The check for IGMP_ALL_HOSTS is because + * you are always magically a member of this group. + * + * Always loop back all host messages when running as a multicast router. 
+ */ + + if(sk==NULL || sk->ip_mc_loop) + { + if(daddr==IGMP_ALL_HOSTS || (dev->flags&IFF_ALLMULTI)) + ip_loopback(dev,skb); + else + { + struct ip_mc_list *imc=dev->ip_mc_list; + while(imc!=NULL) + { + if(imc->multiaddr==daddr) + { + ip_loopback(dev,skb); + break; + } + imc=imc->next; + } + } + } + + /* + * Multicasts with ttl 0 must not go beyond the host. Fixme: avoid the + * extra clone. + */ + + if(skb->ip_hdr->ttl==0) + { + kfree_skb(skb, FREE_WRITE); + nfrags++; + continue; + } + } +#endif + + nfrags++; + + /* + * BSD loops broadcasts + */ + + if((dev->flags&IFF_BROADCAST) && (daddr==0xFFFFFFFF || daddr==dev->pa_brdaddr) && !(dev->flags&IFF_LOOPBACK)) + ip_loopback(dev,skb); + + /* + * Now queue the bytes into the device. + */ + + if (dev->flags & IFF_UP) + { + dev_queue_xmit(skb, dev, sk->priority); + } + else + { + /* + * Whoops... + */ + + ip_statistics.IpOutDiscards++; + if(nfrags>1) + ip_statistics.IpFragCreates+=nfrags; + kfree_skb(skb, FREE_WRITE); + dev_unlock_list(); + /* + * BSD behaviour. 
+ */ + if(sk!=NULL) + sk->err=ENETDOWN; + return(0); /* lose rest of fragments */ + } + } + while (offset >= 0); + if(nfrags>1) + ip_statistics.IpFragCreates+=nfrags; + dev_unlock_list(); + return(0); +} + + +/* + * IP protocol layer initialiser + */ + +static struct packet_type ip_packet_type = +{ + 0, /* MUTTER ntohs(ETH_P_IP),*/ + NULL, /* All devices */ + ip_rcv, + NULL, + NULL, +}; + +#ifdef CONFIG_RTNETLINK + +/* + * Netlink hooks for IP + */ + +void ip_netlink_msg(unsigned long msg, __u32 daddr, __u32 gw, __u32 mask, short flags, short metric, char *name) +{ + struct sk_buff *skb=alloc_skb(sizeof(struct netlink_rtinfo), GFP_ATOMIC); + struct netlink_rtinfo *nrt; + struct sockaddr_in *s; + if(skb==NULL) + return; + skb->free=1; + nrt=(struct netlink_rtinfo *)skb_put(skb, sizeof(struct netlink_rtinfo)); + nrt->rtmsg_type=msg; + s=(struct sockaddr_in *)&nrt->rtmsg_dst; + s->sin_family=AF_INET; + s->sin_addr.s_addr=daddr; + s=(struct sockaddr_in *)&nrt->rtmsg_gateway; + s->sin_family=AF_INET; + s->sin_addr.s_addr=gw; + s=(struct sockaddr_in *)&nrt->rtmsg_genmask; + s->sin_family=AF_INET; + s->sin_addr.s_addr=mask; + nrt->rtmsg_flags=flags; + nrt->rtmsg_metric=metric; + strcpy(nrt->rtmsg_device,name); + if (netlink_post(NETLINK_ROUTE, skb)) + kfree_skb(skb, FREE_WRITE); +} + +#endif + +/* + * Device notifier + */ + +static int ip_rt_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct device *dev=ptr; + if(event==NETDEV_DOWN) + { + ip_netlink_msg(RTMSG_DELDEVICE, 0,0,0,0,0,dev->name); + ip_rt_flush(dev); + } +/* + * Join the initial group if multicast. 
+ */ + if(event==NETDEV_UP) + { +#ifdef CONFIG_IP_MULTICAST + ip_mc_allhost(dev); +#endif + ip_netlink_msg(RTMSG_NEWDEVICE, 0,0,0,0,0,dev->name); + ip_rt_update(NETDEV_UP, dev); + } + return NOTIFY_DONE; +} + +struct notifier_block ip_rt_notifier={ + ip_rt_event, + NULL, + 0 +}; + +#ifdef CONFIG_IP_MULTICAST +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry proc_net_igmp = { + PROC_NET_IGMP, 4, "igmp", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + ip_mc_procinfo +}; +#endif +#endif + +/* + * IP registers the packet type and then calls the subprotocol initialisers + */ + +void ip_init(void) +{ + ip_packet_type.type=htons(ETH_P_IP); + dev_add_pack(&ip_packet_type); + + /* So we flush routes when a device is downed */ + register_netdevice_notifier(&ip_rt_notifier); + +/* ip_raw_init(); + ip_packet_init(); + ip_tcp_init(); + ip_udp_init();*/ + +#ifdef CONFIG_IP_MULTICAST +#ifdef CONFIG_PROC_FS + proc_net_register(&proc_net_igmp); +#endif +#endif +} + diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c new file mode 100644 index 000000000..7337fc08c --- /dev/null +++ b/net/ipv4/ip_sockglue.c @@ -0,0 +1,540 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The IP to API glue. + * + * Authors: see ip.c + * + * Fixes: + * Many : Split from ip.c , see ip.c for history. + * Martin Mares : TOS setting fixed. + * Alan Cox : Fixed a couple of oopses in Martin's + * TOS tweaks. 
+ */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/icmp.h> +#include <linux/netdevice.h> +#include <net/sock.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/firewall.h> +#include <linux/ip_fw.h> +#include <net/checksum.h> +#include <linux/route.h> +#include <linux/mroute.h> +#include <net/route.h> + +#include <asm/uaccess.h> + +#ifdef CONFIG_IP_MULTICAST + +/* + * Write an multicast group list table for the IGMP daemon to + * read. + */ + +int ip_mc_procinfo(char *buffer, char **start, off_t offset, int length, int dummy) +{ + off_t pos=0, begin=0; + struct ip_mc_list *im; + unsigned long flags; + int len=0; + struct device *dev; + + len=sprintf(buffer,"Device : Count\tGroup Users Timer\n"); + save_flags(flags); + cli(); + + for(dev = dev_base; dev; dev = dev->next) + { + if((dev->flags&IFF_UP)&&(dev->flags&IFF_MULTICAST)) + { + len+=sprintf(buffer+len,"%-10s: %5d\n", + dev->name, dev->mc_count); + for(im = dev->ip_mc_list; im; im = im->next) + { + len+=sprintf(buffer+len, + "\t\t\t%08lX %5d %d:%08lX\n", + im->multiaddr, im->users, + im->tm_running, im->timer.expires-jiffies); + pos=begin+len; + if(pos<offset) + { + len=0; + begin=pos; + } + if(pos>offset+length) + break; + } + } + } + restore_flags(flags); + *start=buffer+(offset-begin); + len-=(offset-begin); + if(len>length) + len=length; + return len; +} + + +/* + * Socket option code for IP. This is the end of the line after any TCP,UDP etc options on + * an IP socket. + * + * We implement IP_TOS (type of service), IP_TTL (time to live). 
+ */ + +static struct device *ip_mc_find_devfor(unsigned long addr) +{ + struct device *dev; + for(dev = dev_base; dev; dev = dev->next) + { + if((dev->flags&IFF_UP)&&(dev->flags&IFF_MULTICAST)&& + (dev->pa_addr==addr)) + return dev; + } + + return NULL; +} + +#endif + +int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) +{ + int val,err; + unsigned char ucval; +#if defined(CONFIG_IP_FIREWALL) || defined(CONFIG_IP_ACCT) + struct ip_fw tmp_fw; +#endif + if (optval == NULL) + { + val=0; + ucval=0; + } + else + { + err = get_user(val, (int *) optval); + if (!err) + err = get_user(ucval, (unsigned char *) optval); + if (err) + return err; + } + + if(level!=SOL_IP) + return -EOPNOTSUPP; +#ifdef CONFIG_IP_MROUTE + if(optname>=MRT_BASE && optname <=MRT_BASE+10) + { + return ip_mroute_setsockopt(sk,optname,optval,optlen); + } +#endif + + switch(optname) + { + case IP_OPTIONS: + { + struct options * opt = NULL; + struct options * old_opt; + if (optlen > 40 || optlen < 0) + return -EINVAL; + opt = kmalloc(sizeof(struct options)+((optlen+3)&~3), GFP_KERNEL); + if (!opt) + return -ENOMEM; + memset(opt, 0, sizeof(struct options)); + if (optlen) + { + err = copy_from_user(opt->__data, optval, optlen); + if (err) + { + kfree_s(opt, sizeof(struct options) + ((optlen+3)&~3)); + return -EFAULT; + } + } + + while (optlen & 3) + opt->__data[optlen++] = IPOPT_END; + opt->optlen = optlen; + opt->is_data = 1; + opt->is_setbyuser = 1; + if (optlen && ip_options_compile(opt, NULL)) + { + kfree_s(opt, sizeof(struct options) + optlen); + return -EINVAL; + } + /* + * ANK: I'm afraid that receive handler may change + * options from under us. 
+ */ + cli(); + old_opt = sk->opt; + sk->opt = opt; + sti(); + if (old_opt) + kfree_s(old_opt, sizeof(struct optlen) + old_opt->optlen); + return 0; + } + case IP_TOS: /* This sets both TOS and Precedence */ + if (val & ~0xfe) /* Reject setting of unused bits */ + return -EINVAL; + if ((val>>5) > 4 && !suser()) /* Only root can set Prec>4 */ + return -EPERM; + sk->ip_tos=val; + switch (val & 0x1E) { + case IPTOS_LOWDELAY: + sk->priority=SOPRI_INTERACTIVE; + break; + case IPTOS_THROUGHPUT: + case IPTOS_MINCOST: + sk->priority=SOPRI_BACKGROUND; + break; + default: + sk->priority=SOPRI_NORMAL; + break; + } + return 0; + case IP_TTL: + if(val<1||val>255) + return -EINVAL; + sk->ip_ttl=val; + return 0; + case IP_HDRINCL: + if(sk->type!=SOCK_RAW) + return -ENOPROTOOPT; + sk->ip_hdrincl=val?1:0; + return 0; +#ifdef CONFIG_IP_MULTICAST + case IP_MULTICAST_TTL: + { + sk->ip_mc_ttl=(int)ucval; + return 0; + } + case IP_MULTICAST_LOOP: + { + if(ucval!=0 && ucval!=1) + return -EINVAL; + sk->ip_mc_loop=(int)ucval; + return 0; + } + case IP_MULTICAST_IF: + { + struct in_addr addr; + struct device *dev=NULL; + + /* + * Check the arguments are allowable + */ + + err = copy_from_user(&addr,optval,sizeof(addr)); + if (err) + return -EFAULT; + + + /* + * What address has been requested + */ + + if(addr.s_addr==INADDR_ANY) /* Default */ + { + sk->ip_mc_name[0]=0; + return 0; + } + + /* + * Find the device + */ + + dev=ip_mc_find_devfor(addr.s_addr); + + /* + * Did we find one + */ + + if(dev) + { + strcpy(sk->ip_mc_name,dev->name); + return 0; + } + return -EADDRNOTAVAIL; + } + + case IP_ADD_MEMBERSHIP: + { + +/* + * FIXME: Add/Del membership should have a semaphore protecting them from re-entry + */ + struct ip_mreq mreq; + struct rtable *rt; + struct device *dev=NULL; + + /* + * Check the arguments. 
+ */ + + err = copy_from_user(&mreq,optval,sizeof(mreq)); + if (err) + return -EFAULT; + + /* + * Get device for use later + */ + + if(mreq.imr_interface.s_addr==INADDR_ANY) + { + /* + * Not set so scan. + */ + if((rt=ip_rt_route(mreq.imr_multiaddr.s_addr,0))!=NULL) + { + dev=rt->rt_dev; + atomic_dec(&rt->rt_use); + ip_rt_put(rt); + } + } + else + { + /* + * Find a suitable device. + */ + + dev=ip_mc_find_devfor(mreq.imr_interface.s_addr); + } + + /* + * No device, no cookies. + */ + + if(!dev) + return -ENODEV; + + /* + * Join group. + */ + + return ip_mc_join_group(sk,dev,mreq.imr_multiaddr.s_addr); + } + + case IP_DROP_MEMBERSHIP: + { + struct ip_mreq mreq; + struct rtable *rt; + struct device *dev=NULL; + + /* + * Check the arguments + */ + + err = copy_from_user(&mreq,optval,sizeof(mreq)); + if (err) + return -EFAULT; + + /* + * Get device for use later + */ + + if(mreq.imr_interface.s_addr==INADDR_ANY) + { + if((rt=ip_rt_route(mreq.imr_multiaddr.s_addr,0))!=NULL) + { + dev=rt->rt_dev; + atomic_dec(&rt->rt_use); + ip_rt_put(rt); + } + } + else + { + + dev=ip_mc_find_devfor(mreq.imr_interface.s_addr); + } + + /* + * Did we find a suitable device. 
+ */ + + if(!dev) + return -ENODEV; + + /* + * Leave group + */ + + return ip_mc_leave_group(sk,dev,mreq.imr_multiaddr.s_addr); + } +#endif +#ifdef CONFIG_IP_FIREWALL + case IP_FW_INSERT_IN: + case IP_FW_INSERT_OUT: + case IP_FW_INSERT_FWD: + case IP_FW_APPEND_IN: + case IP_FW_APPEND_OUT: + case IP_FW_APPEND_FWD: + case IP_FW_DELETE_IN: + case IP_FW_DELETE_OUT: + case IP_FW_DELETE_FWD: + case IP_FW_CHECK_IN: + case IP_FW_CHECK_OUT: + case IP_FW_CHECK_FWD: + case IP_FW_FLUSH_IN: + case IP_FW_FLUSH_OUT: + case IP_FW_FLUSH_FWD: + case IP_FW_ZERO_IN: + case IP_FW_ZERO_OUT: + case IP_FW_ZERO_FWD: + case IP_FW_POLICY_IN: + case IP_FW_POLICY_OUT: + case IP_FW_POLICY_FWD: + case IP_FW_MASQ_TIMEOUTS: + if(!suser()) + return -EPERM; + if(optlen>sizeof(tmp_fw) || optlen<1) + return -EINVAL; + err = copy_from_user(&tmp_fw,optval,optlen); + if (err) + return -EFAULT; + err=ip_fw_ctl(optname, &tmp_fw,optlen); + return -err; /* -0 is 0 after all */ + +#endif +#ifdef CONFIG_IP_ACCT + case IP_ACCT_INSERT: + case IP_ACCT_APPEND: + case IP_ACCT_DELETE: + case IP_ACCT_FLUSH: + case IP_ACCT_ZERO: + if(!suser()) + return -EPERM; + if(optlen>sizeof(tmp_fw) || optlen<1) + return -EINVAL; + err = copy_from_user(&tmp_fw, optval,optlen); + if (err) + return -EFAULT; + err=ip_acct_ctl(optname, &tmp_fw,optlen); + return -err; /* -0 is 0 after all */ +#endif + /* IP_OPTIONS and friends go here eventually */ + default: + return(-ENOPROTOOPT); + } +} + +/* + * Get the options. Note for future reference. The GET of IP options gets the + * _received_ ones. The set sets the _sent_ ones. 
+ */ + +int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen) +{ + int val,err; +#ifdef CONFIG_IP_MULTICAST + int len; +#endif + + if(level!=SOL_IP) + return -EOPNOTSUPP; + +#ifdef CONFIG_IP_MROUTE + if(optname>=MRT_BASE && optname <=MRT_BASE+10) + { + return ip_mroute_getsockopt(sk,optname,optval,optlen); + } +#endif + + switch(optname) + { + case IP_OPTIONS: + { + unsigned char optbuf[sizeof(struct options)+40]; + struct options * opt = (struct options*)optbuf; + + cli(); + opt->optlen = 0; + if (sk->opt) + memcpy(optbuf, sk->opt, sizeof(struct options)+sk->opt->optlen); + sti(); + if (opt->optlen == 0) + { + return put_user(0, optlen); + } +/* + * Now we should undo all the changes done by ip_options_compile(). + */ + if (opt->srr) + { + unsigned char * optptr = opt->__data+opt->srr-sizeof(struct iphdr); + memmove(optptr+7, optptr+3, optptr[1]-7); + memcpy(optptr+3, &opt->faddr, 4); + } + if (opt->rr_needaddr) + { + unsigned char * optptr = opt->__data+opt->rr-sizeof(struct iphdr); + memset(&optptr[optptr[2]-1], 0, 4); + optptr[2] -= 4; + } + if (opt->ts) + { + unsigned char * optptr = opt->__data+opt->ts-sizeof(struct iphdr); + if (opt->ts_needtime) + { + memset(&optptr[optptr[2]-1], 0, 4); + optptr[2] -= 4; + } + if (opt->ts_needaddr) + { + memset(&optptr[optptr[2]-1], 0, 4); + optptr[2] -= 4; + } + } + err = put_user(opt->optlen, optlen); + if (!err) + { + if(copy_to_user(optval, opt->__data, opt->optlen)) + err = -EFAULT; + } + return err; + } + return 0; + case IP_TOS: + val=sk->ip_tos; + break; + case IP_TTL: + val=sk->ip_ttl; + break; + case IP_HDRINCL: + val=sk->ip_hdrincl; + break; +#ifdef CONFIG_IP_MULTICAST + case IP_MULTICAST_TTL: + val=sk->ip_mc_ttl; + break; + case IP_MULTICAST_LOOP: + val=sk->ip_mc_loop; + break; + case IP_MULTICAST_IF: + len=strlen(sk->ip_mc_name); + err = put_user(len, optlen); + if (!err) + { + err = copy_to_user((void *)optval,sk->ip_mc_name, len); + if (err) + err = -EFAULT; + } + return err; 
+#endif + default: + return(-ENOPROTOOPT); + } + err = put_user(sizeof(int), optlen); + if (err) + return err; + return put_user(val,(int *) optval); +} diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 5227d9474..372417cb1 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -7,6 +7,10 @@ * Fixes: * Alan Cox : Merged and made usable non modular (its so tiny its silly as * a module taking up 2 pages). + * Alan Cox : Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph) + * to keep ip_forward happy. + * Alan Cox : More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8). + * Kai Schulte : Fixed #defines for IP_FIREWALL->FIREWALL * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -15,40 +19,36 @@ * */ +#include <linux/module.h> + #include <linux/types.h> +#include <linux/sched.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/netdevice.h> -#include <netinet/in.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/firewall.h> + #include <net/datalink.h> #include <net/sock.h> #include <net/ip.h> +#include <net/icmp.h> #include <net/protocol.h> #include <net/ipip.h> /* - * NB. we must include the kernel idenfication string in to install the module. - */ - -#if ( defined(CONFIG_NET_IPIP) && defined(CONFIG_IP_FORWARD)) || defined(MODULE) -#ifdef MODULE -#include <linux/module.h> -#include <linux/version.h> - -static char kernel_version[] = UTS_RELEASE; - -#else -#define MOD_INC_USE_COUNT -#define MOD_DEC_USE_COUNT -#endif - - -/* - * The driver. + * The IPIP protocol driver. + * + * On entry here + * skb->data is the original IP header + * skb->ip_hdr points to the initial IP header. + * skb->h.raw points at the new header. 
*/ int ipip_rcv(struct sk_buff *skb, struct device *dev, struct options *opt, - unsigned long daddr, unsigned short len, unsigned long saddr, + __u32 daddr, unsigned short len, __u32 saddr, int redo, struct inet_protocol *protocol) { /* Don't unlink in the middle of a turnaround */ @@ -56,16 +56,39 @@ int ipip_rcv(struct sk_buff *skb, struct device *dev, struct options *opt, #ifdef TUNNEL_DEBUG printk("ipip_rcv: got a packet!\n"); #endif - ip_forward(skb, dev, 0, daddr, 0); - kfree_skb(skb, FREE_READ); + /* + * Discard the original IP header + */ + + skb_pull(skb, ((struct iphdr *)skb->data)->ihl<<2); + + /* + * Adjust pointers + */ + + skb->h.iph=(struct iphdr *)skb->data; + skb->ip_hdr=(struct iphdr *)skb->data; + memset(skb->proto_priv, 0, sizeof(struct options)); + + /* + * If you want to add LZ compressed IP or things like that here, + * and in drivers/net/tunnel.c are the places to add. + */ + + skb->protocol = htons(ETH_P_IP); + skb->ip_summed = 0; + netif_rx(skb); MOD_DEC_USE_COUNT; return(0); } #ifdef MODULE + static struct inet_protocol ipip_protocol = { ipip_rcv, /* IPIP handler */ +#if 0 NULL, /* Will be UDP fraglist handler */ +#endif NULL, /* TUNNEL error control */ 0, /* next */ IPPROTO_IPIP, /* protocol ID */ @@ -88,8 +111,7 @@ int init_module( void) void cleanup_module( void) { if ( inet_del_protocol(&ipip_protocol) < 0 ) - printk("ipip close: can't remove protocol\n"); + printk(KERN_INFO "ipip close: can't remove protocol\n"); } #endif -#endif diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c new file mode 100644 index 000000000..79736745c --- /dev/null +++ b/net/ipv4/ipmr.c @@ -0,0 +1,950 @@ +/* + * IP multicast routing support for mrouted 3.6/3.8 + * + * (c) 1995 Alan Cox, <alan@cymru.net> + * Linux Consultancy and Custom Driver Development + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the 
License, or (at your option) any later version. + * + * + * Fixes: + * Michael Chastain : Incorrect size of copying. + * Alan Cox : Added the cache manager code. + * Alan Cox : Fixed the clone/copy bug and device race. + * Malcolm Beattie : Buffer handling fixes. + * Alexey Kuznetsov : Double buffer free and other fixes. + * SVR Anand : Fixed several multicast bugs and problems. + * + * Status: + * Cache manager under test. Forwarding in vague test mode + * Todo: + * Flow control + * Finish Tunnels + * Debug cache ttl handling properly + * Resolve IFF_ALLMULTI for rest of cards + */ + +#include <linux/config.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/kernel.h> +#include <linux/fcntl.h> +#include <linux/stat.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/proc_fs.h> +#include <linux/mroute.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <linux/notifier.h> +#include <net/checksum.h> + +/* + * Multicast router control variables + */ + +static struct vif_device vif_table[MAXVIFS]; /* Devices */ +static unsigned long vifc_map; /* Active device map */ +int mroute_do_pim = 0; /* Set in PIM assert */ +static struct mfc_cache *mfc_cache_array[MFC_LINES]; /* Forwarding cache */ +static struct mfc_cache *cache_resolve_queue; /* Unresolved cache */ +int cache_resolve_queue_len = 0; /* Size of unresolved */ + +/* + * Delete a VIF entry + */ + +static void vif_delete(struct vif_device *v) +{ + if(!(v->flags&VIFF_TUNNEL)) + { + v->dev->flags&=~IFF_ALLMULTI; + dev_mc_upload(v->dev); + } + v->dev=NULL; +} + +/* + * Find a vif + */ + +static int ipmr_vifi_find(struct device *dev) +{ + struct vif_device *v=&vif_table[0]; + int ct; + 
for(ct=0;ct<MAXVIFS;ct++,v++) + { + if(v->dev==dev) + return ct; + } + return -1; +} + +/* + * Delete a multicast route cache entry + */ + +static void ipmr_cache_delete(struct mfc_cache *cache) +{ + struct sk_buff *skb; + int line; + struct mfc_cache **cp; + + /* + * Find the right cache line + */ + + if(cache->mfc_flags&MFC_QUEUED) + { + cp=&cache_resolve_queue; + del_timer(&cache->mfc_timer); + } + else + { + line=MFC_HASH(cache->mfc_mcastgrp,cache->mfc_origin); + cp=&(mfc_cache_array[line]); + } + + /* + * Unlink the buffer + */ + + while(*cp!=NULL) + { + if(*cp==cache) + { + *cp=cache->next; + break; + } + cp=&((*cp)->next); + } + + /* + * Free the buffer. If it is a pending resolution + * clean up the other resources. + */ + + if(cache->mfc_flags&MFC_QUEUED) + { + cache_resolve_queue_len--; + while((skb=skb_dequeue(&cache->mfc_unresolved))) + kfree_skb(skb, FREE_WRITE); + } + kfree_s(cache,sizeof(cache)); +} + +/* + * Cache expiry timer + */ + +static void ipmr_cache_timer(unsigned long data) +{ + struct mfc_cache *cache=(struct mfc_cache *)data; + ipmr_cache_delete(cache); +} + +/* + * Insert a multicast cache entry + */ + +static void ipmr_cache_insert(struct mfc_cache *c) +{ + int line=MFC_HASH(c->mfc_mcastgrp,c->mfc_origin); + c->next=mfc_cache_array[line]; + mfc_cache_array[line]=c; +} + +/* + * Find a multicast cache entry + */ + +struct mfc_cache *ipmr_cache_find(__u32 origin, __u32 mcastgrp) +{ + int line=MFC_HASH(mcastgrp,origin); + struct mfc_cache *cache; + cache=mfc_cache_array[line]; + while(cache!=NULL) + { + if(cache->mfc_origin==origin && cache->mfc_mcastgrp==mcastgrp) + return cache; + cache=cache->next; + } + cache=cache_resolve_queue; + while(cache!=NULL) + { + if(cache->mfc_origin==origin && cache->mfc_mcastgrp==mcastgrp) + return cache; + cache=cache->next; + } + return NULL; +} + +/* + * Allocate a multicast cache entry + */ + +static struct mfc_cache *ipmr_cache_alloc(int priority) +{ + struct mfc_cache *c=(struct mfc_cache 
*)kmalloc(sizeof(struct mfc_cache), priority); + if(c==NULL) + return NULL; + c->mfc_queuelen=0; + skb_queue_head_init(&c->mfc_unresolved); + init_timer(&c->mfc_timer); + c->mfc_timer.data=(long)c; + c->mfc_timer.function=ipmr_cache_timer; + return c; +} + +/* + * A cache entry has gone into a resolved state from queued + */ + +static void ipmr_cache_resolve(struct mfc_cache *cache) +{ + struct mfc_cache **p; + struct sk_buff *skb; + /* + * Kill the queue entry timer. + */ + del_timer(&cache->mfc_timer); + cache->mfc_flags&=~MFC_QUEUED; + /* + * Remove from the resolve queue + */ + p=&cache_resolve_queue; + while((*p)!=NULL) + { + if((*p)==cache) + { + *p=cache->next; + break; + } + p=&((*p)->next); + } + cache_resolve_queue_len--; + sti(); + /* + * Insert into the main cache + */ + ipmr_cache_insert(cache); + /* + * Play the pending entries through our router + */ + while((skb=skb_dequeue(&cache->mfc_unresolved))) + ipmr_forward(skb, skb->protocol); +} + +/* + * Bounce a cache query up to mrouted. We could use netlink for this but mrouted + * expects the following bizarre scheme.. 
+ */ + +static void ipmr_cache_report(struct sk_buff *pkt) +{ + struct sk_buff *skb=alloc_skb(128, GFP_ATOMIC); + int ihl=pkt->ip_hdr->ihl<<2; + struct igmphdr *igmp; + if(!skb) + return; + + skb->free=1; + + /* + * Copy the IP header + */ + + skb->ip_hdr=(struct iphdr *)skb_put(skb,ihl); + skb->h.iph=skb->ip_hdr; + memcpy(skb->data,pkt->data,ihl); + skb->ip_hdr->protocol = 0; /* Flag to the kernel this is a route add */ + + /* + * Add our header + */ + + igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr)); + igmp->type = IGMPMSG_NOCACHE; /* non IGMP dummy message */ + igmp->code = 0; + skb->ip_hdr->tot_len=htons(skb->len); /* Fix the length */ + + /* + * Deliver to mrouted + */ + if(sock_queue_rcv_skb(mroute_socket,skb)<0) + { + skb->sk=NULL; + kfree_skb(skb, FREE_READ); + } +} + + +/* + * Queue a packet for resolution + */ + +static void ipmr_cache_unresolved(struct mfc_cache *cache, vifi_t vifi, struct sk_buff *skb, int is_frag) +{ + if(cache==NULL) + { + /* + * Create a new entry if allowable + */ + if(cache_resolve_queue_len>=10 || (cache=ipmr_cache_alloc(GFP_ATOMIC))==NULL) + { + kfree_skb(skb, FREE_WRITE); + return; + } + /* + * Fill in the new cache entry + */ + cache->mfc_parent=vifi; + cache->mfc_origin=skb->ip_hdr->saddr; + cache->mfc_mcastgrp=skb->ip_hdr->daddr; + cache->mfc_flags=MFC_QUEUED; + /* + * Link to the unresolved list + */ + cache->next=cache_resolve_queue; + cache_resolve_queue=cache; + cache_resolve_queue_len++; + /* + * Fire off the expiry timer + */ + cache->mfc_timer.expires=jiffies+10*HZ; + add_timer(&cache->mfc_timer); + /* + * Reflect first query at mrouted. + */ + if(mroute_socket) + ipmr_cache_report(skb); + } + /* + * See if we can append the packet + */ + if(cache->mfc_queuelen>3) + { + kfree_skb(skb, FREE_WRITE); + return; + } + /* + * Add to our 'pending' list. Cache the is_frag data + * in skb->protocol now it is spare. 
+ */ + cache->mfc_queuelen++; + skb->protocol=is_frag; + skb_queue_tail(&cache->mfc_unresolved,skb); +} + +/* + * MFC cache manipulation by user space mroute daemon + */ + +int ipmr_mfc_modify(int action, struct mfcctl *mfc) +{ + struct mfc_cache *cache; + if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr)) + return -EINVAL; + /* + * Find the cache line + */ + + cli(); + + cache=ipmr_cache_find(mfc->mfcc_origin.s_addr,mfc->mfcc_mcastgrp.s_addr); + + /* + * Delete an entry + */ + if(action==MRT_DEL_MFC) + { + if(cache) + { + ipmr_cache_delete(cache); + sti(); + return 0; + } + sti(); + return -ENOENT; + } + if(cache) + { + /* + * Update the cache, see if it frees a pending queue + */ + + cache->mfc_flags|=MFC_RESOLVED; + memcpy(cache->mfc_ttls, mfc->mfcc_ttls,sizeof(cache->mfc_ttls)); + + /* + * Check to see if we resolved a queued list. If so we + * need to send on the frames and tidy up. + */ + + if(cache->mfc_flags&MFC_QUEUED) + ipmr_cache_resolve(cache); /* Unhook & send the frames */ + sti(); + return 0; + } + /* + * Unsolicited update - that's ok, add anyway. + */ + + + cache=ipmr_cache_alloc(GFP_ATOMIC); + if(cache==NULL) + { + sti(); + return -ENOMEM; + } + cache->mfc_flags=MFC_RESOLVED; + cache->mfc_origin=mfc->mfcc_origin.s_addr; + cache->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr; + cache->mfc_parent=mfc->mfcc_parent; + memcpy(cache->mfc_ttls, mfc->mfcc_ttls,sizeof(cache->mfc_ttls)); + ipmr_cache_insert(cache); + sti(); + return 0; +} + +/* + * Socket options and virtual interface manipulation. The whole + * virtual interface system is a complete heap, but unfortunately + * that's how BSD mrouted happens to think. Maybe one day with a proper + * MOSPF/PIM router set up we can clean this up. 
+ */ + +int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen) +{ + int err; + struct vifctl vif; + struct mfcctl mfc; + + if(optname!=MRT_INIT) + { + if(sk!=mroute_socket) + return -EACCES; + } + + switch(optname) + { + case MRT_INIT: + if(sk->type!=SOCK_RAW || sk->num!=IPPROTO_IGMP) + return -EOPNOTSUPP; + if(optlen!=sizeof(int)) + return -ENOPROTOOPT; + { + int opt; + err = get_user(opt,(int *)optval); + if (err) + return err; + if (opt != 1) + return -ENOPROTOOPT; + } + if(mroute_socket) + return -EADDRINUSE; + mroute_socket=sk; + /* Initialise state */ + return 0; + case MRT_DONE: + mroute_close(sk); + mroute_socket=NULL; + return 0; + case MRT_ADD_VIF: + case MRT_DEL_VIF: + if(optlen!=sizeof(vif)) + return -EINVAL; + err = copy_from_user(&vif,optval,sizeof(vif)); + if (err) + return -EFAULT; + if(vif.vifc_vifi > MAXVIFS) + return -ENFILE; + if(optname==MRT_ADD_VIF) + { + struct vif_device *v=&vif_table[vif.vifc_vifi]; + struct device *dev; + /* Empty vif ? */ + if(vifc_map&(1<<vif.vifc_vifi)) + return -EADDRINUSE; + /* Find the interface */ + dev=ip_dev_find(vif.vifc_lcl_addr.s_addr); + if(!dev) + return -EADDRNOTAVAIL; + /* Must be tunnelled or multicastable */ + if(vif.vifc_flags&VIFF_TUNNEL) + { + if(vif.vifc_flags&VIFF_SRCRT) + return -EOPNOTSUPP; + /* IPIP will do all the work */ + } + else + { + if(dev->flags&IFF_MULTICAST) + { + /* Most ethernet cards don't know + how to do this yet.. */ + dev->flags|=IFF_ALLMULTI; + dev_mc_upload(dev); + } + else + { + /* We are stuck.. 
*/ + return -EOPNOTSUPP; + } + } + /* + * Fill in the VIF structures + */ + cli(); + v->rate_limit=vif.vifc_rate_limit; + v->local=vif.vifc_lcl_addr.s_addr; + v->remote=vif.vifc_rmt_addr.s_addr; + v->flags=vif.vifc_flags; + v->threshold=vif.vifc_threshold; + v->dev=dev; + v->bytes_in = 0; + v->bytes_out = 0; + v->pkt_in = 0; + v->pkt_out = 0; + vifc_map|=(1<<vif.vifc_vifi); + sti(); + return 0; + } + else + /* + * VIF deletion + */ + { + struct vif_device *v=&vif_table[vif.vifc_vifi]; + if(vifc_map&(1<<vif.vifc_vifi)) + { + vif_delete(v); + vifc_map&=~(1<<vif.vifc_vifi); + return 0; + } + else + return -EADDRNOTAVAIL; + } + /* + * Manipulate the forwarding caches. These live + * in a sort of kernel/user symbiosis. + */ + case MRT_ADD_MFC: + case MRT_DEL_MFC: + err = copy_from_user(&mfc,optval, sizeof(mfc)); + return err ? -EFAULT : ipmr_mfc_modify(optname, &mfc); + /* + * Control PIM assert. + */ + case MRT_ASSERT: + { + int v; + if(optlen!=sizeof(int)) + return -EINVAL; + + if(get_user(v,(int *)optval)) + return -EFAULT; + mroute_do_pim=(v)?1:0; + return 0; + } + /* + * Spurious command, or MRT_VERSION which you cannot + * set. + */ + default: + return -EOPNOTSUPP; + } +} + +/* + * Getsock opt support for the multicast routing system. + */ + +int ip_mroute_getsockopt(struct sock *sk,int optname,char *optval,int *optlen) +{ + int olr; + int err; + + if(sk!=mroute_socket) + return -EACCES; + if(optname!=MRT_VERSION && optname!=MRT_ASSERT) + return -EOPNOTSUPP; + + err = get_user(olr, optlen); + if (err) + return err; + if(olr!=sizeof(int)) + return -EINVAL; + err = put_user(sizeof(int),optlen); + if (err) + return err; + if(optname==MRT_VERSION) + err = put_user(0x0305,(int *)optval); + else + err = put_user(mroute_do_pim,(int *)optval); + return err; +} + +/* + * The IP multicast ioctl support routines. 
+ */ + +int ipmr_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + int err; + struct sioc_sg_req sr; + struct sioc_vif_req vr; + struct vif_device *vif; + + switch(cmd) + { + case SIOCGETVIFCNT: + err = copy_from_user(&vr,(void *)arg,sizeof(vr)); + if (err) + return -EFAULT; + if(vr.vifi>=MAXVIFS) + return -EINVAL; + vif=&vif_table[vr.vifi]; + if(vifc_map&(1<<vr.vifi)) + { + vr.icount=vif->pkt_in; + vr.ocount=vif->pkt_out; + vr.ibytes=vif->bytes_in; + vr.obytes=vif->bytes_out; + err = copy_to_user((void *)arg,&vr,sizeof(vr)); + if (err) + err = -EFAULT; + return err; + } + return -EADDRNOTAVAIL; + case SIOCGETSGCNT: + err = copy_from_user(&sr,(void *)arg,sizeof(sr)); + if (!err) + err = copy_to_user((void *)arg,&sr,sizeof(sr)); + if (err) + err = -EFAULT; + return err; + default: + return -EINVAL; + } +} + +/* + * Close the multicast socket, and clear the vif tables etc + */ + +void mroute_close(struct sock *sk) +{ + int i; + struct vif_device *v=&vif_table[0]; + + /* + * Shut down all active vif entries + */ + + for(i=0;i<MAXVIFS;i++) + { + if(vifc_map&(1<<i)) + { + if(!(v->flags&VIFF_TUNNEL)) + { + v->dev->flags&=~IFF_ALLMULTI; + dev_mc_upload(v->dev); + } + } + v++; + } + vifc_map=0; + /* + * Wipe the cache + */ + for(i=0;i<MFC_LINES;i++) + { + while(mfc_cache_array[i]!=NULL) + ipmr_cache_delete(mfc_cache_array[i]); + } + /* The timer will clear any 'pending' stuff */ +} + +static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct vif_device *v; + int ct; + if(event!=NETDEV_DOWN) + return NOTIFY_DONE; + v=&vif_table[0]; + for(ct=0;ct<MAXVIFS;ct++) + { + if((vifc_map&(1<<ct)) && v->dev==ptr) + { + vif_delete(v); + vifc_map&=~(1<<ct); + } + v++; + } + return NOTIFY_DONE; +} + + +static struct notifier_block ip_mr_notifier={ + ipmr_device_event, + NULL, + 0 +}; + +/* + * Processing handlers for ipmr_forward + */ + +static void ipmr_queue_xmit(struct sk_buff *skb, struct vif_device *vif, struct device *in_dev, int 
frag) +{ + int tunnel=0; + __u32 raddr=skb->raddr; + if(vif->flags&VIFF_TUNNEL) + { + tunnel=IPFWD_MULTITUNNEL; + raddr=vif->remote; + } + vif->pkt_out++; + vif->bytes_out+=skb->len; + skb->dev=vif->dev; + skb->raddr=skb->h.iph->daddr; + /* + * If the vif went down as we were forwarding.. just throw the + * frame. + */ + if(vif->dev==NULL || ip_forward(skb, in_dev, frag|IPFWD_MULTICASTING|tunnel, raddr)==-1) + kfree_skb(skb, FREE_WRITE); +} + +/* + * Multicast packets for forwarding arrive here + */ + +void ipmr_forward(struct sk_buff *skb, int is_frag) +{ + struct mfc_cache *cache; + struct sk_buff *skb2; + int psend = -1; + int vif=ipmr_vifi_find(skb->dev); + if(vif==-1) + { + kfree_skb(skb, FREE_WRITE); + return; + } + + /* + * Without the following addition, skb->h.iph points to something + * different that is not the ip header. + */ + + skb->h.iph = skb->ip_hdr; /* Anand, ernet. */ + + vif_table[vif].pkt_in++; + vif_table[vif].bytes_in+=skb->len; + + cache=ipmr_cache_find(skb->ip_hdr->saddr,skb->ip_hdr->daddr); + + /* + * No usable cache entry + */ + + if(cache==NULL || (cache->mfc_flags&MFC_QUEUED)) + ipmr_cache_unresolved(cache,vif,skb, is_frag); + else + { + /* + * Forward the frame + */ + int ct=0; + while(ct<MAXVIFS) + { + /* + * 0 means don't do it. Silly idea, 255 as don't do it would be cleaner! + */ + if(skb->ip_hdr->ttl > cache->mfc_ttls[ct] && cache->mfc_ttls[ct]>0) + { + if(psend!=-1) + { + /* + * May get variant mac headers + * so must copy -- boo hoo. 
+ */ + skb2=skb_copy(skb, GFP_ATOMIC); + if(skb2) + { + skb2->free=1; + ipmr_queue_xmit(skb2, &vif_table[psend], skb->dev, is_frag); + } + } + psend=ct; + } + ct++; + } + if(psend==-1) + kfree_skb(skb, FREE_WRITE); + else + { + ipmr_queue_xmit(skb, &vif_table[psend], skb->dev, is_frag); + } + /* + * Adjust the stats + */ + } +} + +/* + * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif + */ + +int ipmr_vif_info(char *buffer, char **start, off_t offset, int length, int dummy) +{ + struct vif_device *vif; + int len=0; + off_t pos=0; + off_t begin=0; + int size; + int ct; + + len += sprintf(buffer, + "Interface Bytes In Pkts In Bytes Out Pkts Out Flags Local Remote\n"); + pos=len; + + for (ct=0;ct<MAXVIFS;ct++) + { + vif=&vif_table[ct]; + if(!(vifc_map&(1<<ct))) + continue; + if(vif->dev==NULL) + continue; + size = sprintf(buffer+len, "%-10s %8ld %7ld %8ld %7ld %05X %08lX %08lX\n", + vif->dev->name,vif->bytes_in, vif->pkt_in, vif->bytes_out,vif->pkt_out, + vif->flags, vif->local, vif->remote); + len+=size; + pos+=size; + if(pos<offset) + { + len=0; + begin=pos; + } + if(pos>offset+length) + break; + } + + *start=buffer+(offset-begin); + len-=(offset-begin); + if(len>length) + len=length; + return len; +} + +int ipmr_mfc_info(char *buffer, char **start, off_t offset, int length, int dummy) +{ + struct mfc_cache *mfc; + int len=0; + off_t pos=0; + off_t begin=0; + int size; + int ct; + + len += sprintf(buffer, + "Group Origin SrcIface \n"); + pos=len; + + for (ct=0;ct<MFC_LINES;ct++) + { + cli(); + mfc=mfc_cache_array[ct]; + while(mfc!=NULL) + { + char *name="none"; + char vifmap[MAXVIFS+1]; + int n; + /* + * Device name + */ + if(vifc_map&(1<<mfc->mfc_parent)) + name=vif_table[mfc->mfc_parent].dev->name; + /* + * Interface forwarding map + */ + for(n=0;n<MAXVIFS;n++) + if(vifc_map&(1<<n) && mfc->mfc_ttls[ct]) + vifmap[n]='X'; + else + vifmap[n]='-'; + vifmap[n]=0; + /* + * Now print it out + */ + size = sprintf(buffer+len, "%08lX %08lX %-8s 
%s\n", + (unsigned long)mfc->mfc_mcastgrp, + (unsigned long)mfc->mfc_origin, + name, + vifmap); + len+=size; + pos+=size; + if(pos<offset) + { + len=0; + begin=pos; + } + if(pos>offset+length) + { + sti(); + goto done; + } + mfc=mfc->next; + } + sti(); + } +done: + *start=buffer+(offset-begin); + len-=(offset-begin); + if(len>length) + len=length; + return len; +} + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry proc_net_ipmr_vif = { + PROC_NET_IPMR_VIF, 9 ,"ip_mr_vif", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + ipmr_vif_info +}; +static struct proc_dir_entry proc_net_ipmr_mfc = { + PROC_NET_IPMR_MFC, 11 ,"ip_mr_cache", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + ipmr_mfc_info +}; +#endif + +/* + * Setup for IP multicast routing + */ + +void ip_mr_init(void) +{ + printk(KERN_INFO "Linux IP multicast router 0.06.\n"); + register_netdevice_notifier(&ip_mr_notifier); +#ifdef CONFIG_PROC_FS + proc_net_register(&proc_net_ipmr_vif); + proc_net_register(&proc_net_ipmr_mfc); +#endif +} diff --git a/net/ipv4/packet.c b/net/ipv4/packet.c index fbc4dd5ca..89dd6549e 100644 --- a/net/ipv4/packet.c +++ b/net/ipv4/packet.c @@ -5,6 +5,9 @@ * * PACKET - implements raw packet sockets. * + * Doesn't belong in IP but it's currently too hooked into ip + * to separate. + * * Version: @(#)packet.c 1.0.6 05/25/93 * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> @@ -28,6 +31,9 @@ * dubious gcc output. Can you read * compiler: it said _VOLATILE_ * Richard Kooijman : Timestamp fixes. + * Alan Cox : New buffers. Use sk->mac.raw. + * Alan Cox : sendmsg/recvmsg support. 
+ * Alan Cox : Protocol setting support * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -44,6 +50,7 @@ #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> +#include <linux/if_packet.h> #include <net/ip.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -51,18 +58,7 @@ #include <linux/errno.h> #include <linux/timer.h> #include <asm/system.h> -#include <asm/segment.h> - -/* - * We really ought to have a single public _inline_ min function! - */ - -static unsigned long min(unsigned long a, unsigned long b) -{ - if (a < b) - return(a); - return(b); -} +#include <asm/uaccess.h> /* @@ -72,7 +68,6 @@ static unsigned long min(unsigned long a, unsigned long b) int packet_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) { struct sock *sk; - unsigned long flags; /* * When we registered the protocol we saved the socket in the data @@ -80,50 +75,31 @@ int packet_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) */ sk = (struct sock *) pt->data; + + /* + * Yank back the headers [hope the device set this + * right or kerboom...] + */ + + skb_push(skb,skb->data-skb->mac.raw); /* - * The SOCK_PACKET socket receives _all_ frames, and as such - * therefore needs to put the header back onto the buffer. - * (it was removed by inet_bh()). + * The SOCK_PACKET socket receives _all_ frames. */ skb->dev = dev; - skb->len += dev->hard_header_len; /* * Charge the memory to the socket. This is done specifically * to prevent sockets using all the memory up. 
*/ - if (sk->rmem_alloc & 0xFF000000) { - printk("packet_rcv: sk->rmem_alloc = %ld\n", sk->rmem_alloc); - sk->rmem_alloc = 0; - } - - if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf) + if(sock_queue_rcv_skb(sk,skb)<0) { -/* printk("packet_rcv: drop, %d+%d>%d\n", sk->rmem_alloc, skb->mem_len, sk->rcvbuf); */ skb->sk = NULL; kfree_skb(skb, FREE_READ); - return(0); + return 0; } - - save_flags(flags); - cli(); - - skb->sk = sk; - sk->rmem_alloc += skb->mem_len; - - /* - * Queue the packet up, and wake anyone waiting for it. - */ - - skb_queue_tail(&sk->receive_queue,skb); - if(!sk->dead) - sk->data_ready(sk,skb->len); - - restore_flags(flags); - /* * Processing complete. */ @@ -137,14 +113,15 @@ int packet_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) * protocol layers and you must therefore supply it with a complete frame */ -static int packet_sendto(struct sock *sk, unsigned char *from, int len, - int noblock, unsigned flags, struct sockaddr_in *usin, - int addr_len) +static int packet_sendmsg(struct sock *sk, struct msghdr *msg, int len, + int noblock, int flags) { struct sk_buff *skb; struct device *dev; - struct sockaddr *saddr=(struct sockaddr *)usin; - + struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name; + unsigned short proto=0; + int err; + /* * Check the flags. */ @@ -156,23 +133,25 @@ static int packet_sendto(struct sock *sk, unsigned char *from, int len, * Get and verify the address. 
*/ - if (usin) + if (saddr) { - if (addr_len < sizeof(*saddr)) + if (msg->msg_namelen < sizeof(struct sockaddr)) return(-EINVAL); + if (msg->msg_namelen==sizeof(struct sockaddr_pkt)) + proto=saddr->spkt_protocol; } else - return(-EINVAL); /* SOCK_PACKET must be sent giving an address */ + return(-ENOTCONN); /* SOCK_PACKET must be sent giving an address */ /* * Find the device first to size check it */ - saddr->sa_data[13] = 0; - dev = dev_get(saddr->sa_data); + saddr->spkt_device[13] = 0; + dev = dev_get(saddr->spkt_device); if (dev == NULL) { - return(-ENXIO); + return(-ENODEV); } /* @@ -183,11 +162,12 @@ static int packet_sendto(struct sock *sk, unsigned char *from, int len, if(len>dev->mtu+dev->hard_header_len) return -EMSGSIZE; - skb = sk->prot->wmalloc(sk, len, 0, GFP_KERNEL); + skb = sock_wmalloc(sk, len, 0, GFP_KERNEL); /* * If the write buffer is full, then tough. At this level the user gets to - * deal with the problem - do your own algorithmic backoffs. + * deal with the problem - do your own algorithmic backoffs. That's far + * more flexible. */ if (skb == NULL) @@ -201,76 +181,217 @@ static int packet_sendto(struct sock *sk, unsigned char *from, int len, skb->sk = sk; skb->free = 1; - memcpy_fromfs(skb->data, from, len); - skb->len = len; - skb->arp = 1; /* No ARP needs doing on this (complete) frame */ + err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); + skb->arp = 1; /* No ARP needs doing on this (complete) frame */ + skb->protocol = proto; /* * Now send it */ - if (dev->flags & IFF_UP) - dev_queue_xmit(skb, dev, sk->priority); + if (err) + { + err = -EFAULT; + } else + { + if (!(dev->flags & IFF_UP)) + { + err = -ENODEV; + } + } + + if (err) + { kfree_skb(skb, FREE_WRITE); + return err; + } + + dev_queue_xmit(skb, dev, sk->priority); return(len); } /* - * A write to a SOCK_PACKET can't actually do anything useful and will - * always fail but we include it for completeness and future expansion. 
- */ - -static int packet_write(struct sock *sk, unsigned char *buff, - int len, int noblock, unsigned flags) -{ - return(packet_sendto(sk, buff, len, noblock, flags, NULL, 0)); -} - -/* * Close a SOCK_PACKET socket. This is fairly simple. We immediately go * to 'closed' state and remove our protocol entry in the device list. * The release_sock() will destroy the socket if a user has closed the * file side of the object. */ -static void packet_close(struct sock *sk, int timeout) +static void packet_close(struct sock *sk, unsigned long timeout) { - sk->inuse = 1; + /* + * Stop more data and kill the socket off. + */ + + lock_sock(sk); sk->state = TCP_CLOSE; - dev_remove_pack((struct packet_type *)sk->pair); - kfree_s((void *)sk->pair, sizeof(struct packet_type)); - sk->pair = NULL; + + /* + * Unhook the notifier + */ + + unregister_netdevice_notifier(&sk->protinfo.af_packet.notifier); + + if(sk->protinfo.af_packet.prot_hook) + { + /* + * Remove the protocol hook + */ + + dev_remove_pack((struct packet_type *)sk->protinfo.af_packet.prot_hook); + + /* + * Dispose of litter carefully. + */ + + kfree_s((void *)sk->protinfo.af_packet.prot_hook, sizeof(struct packet_type)); + sk->protinfo.af_packet.prot_hook = NULL; + } + release_sock(sk); + destroy_sock(sk); } /* - * Create a packet of type SOCK_PACKET. We do one slightly irregular - * thing here that wants tidying up. We borrow the 'pair' pointer in - * the socket object so we can find the packet_type entry in the - * device list. The reverse is easy as we use the data field of the - * packet type to point to our socket. + * Attach a packet hook to a device. 
*/ -static int packet_init(struct sock *sk) +int packet_attach(struct sock *sk, struct device *dev) { - struct packet_type *p; - - p = (struct packet_type *) kmalloc(sizeof(*p), GFP_KERNEL); + struct packet_type *p = (struct packet_type *) kmalloc(sizeof(*p), GFP_KERNEL); if (p == NULL) return(-ENOMEM); p->func = packet_rcv; p->type = sk->num; p->data = (void *)sk; - p->dev = NULL; + p->dev = dev; dev_add_pack(p); /* * We need to remember this somewhere. */ - sk->pair = (struct sock *)p; + sk->protinfo.af_packet.prot_hook = p; + sk->protinfo.af_packet.bound_dev = dev; + return 0; +} + +/* + * Bind a packet socket to a device + */ + +static int packet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + char name[15]; + struct device *dev; + + /* + * Check legality + */ + + if(addr_len!=sizeof(struct sockaddr)) + return -EINVAL; + strncpy(name,uaddr->sa_data,14); + name[14]=0; + + /* + * Lock the device chain while we sanity check + * the bind request. + */ + + dev_lock_list(); + dev=dev_get(name); + if(dev==NULL) + { + dev_unlock_list(); + return -ENODEV; + } + + if(!(dev->flags&IFF_UP)) + { + dev_unlock_list(); + return -ENETDOWN; + } + + /* + * Perform the request. + */ + + memcpy(sk->protinfo.af_packet.device_name,name,15); + + /* + * Rewrite an existing hook if present. + */ + + if(sk->protinfo.af_packet.prot_hook) + { + dev_remove_pack(sk->protinfo.af_packet.prot_hook); + sk->protinfo.af_packet.prot_hook->dev=dev; + sk->protinfo.af_packet.bound_dev=dev; + dev_add_pack(sk->protinfo.af_packet.prot_hook); + } + else + { + int err=packet_attach(sk, dev); + if(err) + { + dev_unlock_list(); + return err; + } + } + /* + * Now the notifier is set up right this lot is safe. + */ + dev_unlock_list(); + return 0; +} + +/* + * This hook is called when a device goes up or down so that + * SOCK_PACKET sockets can come unbound properly. 
+ */ + +static int packet_unbind(struct notifier_block *this, unsigned long msg, void *data) +{ + struct inet_packet_opt *ipo=(struct inet_packet_opt *)this; + if(msg==NETDEV_DOWN && data==ipo->bound_dev) + { + /* + * Our device has gone down. + */ + ipo->bound_dev=NULL; + dev_remove_pack(ipo->prot_hook); + kfree(ipo->prot_hook); + ipo->prot_hook=NULL; + } + return NOTIFY_DONE; +} + + +/* + * Create a packet of type SOCK_PACKET. + */ + +static int packet_init(struct sock *sk) +{ + /* + * Attach a protocol block + */ + + int err=packet_attach(sk, NULL); + if(err) + return err; + + /* + * Set up the per socket notifier. + */ + + sk->protinfo.af_packet.notifier.notifier_call=packet_unbind; + sk->protinfo.af_packet.notifier.priority=0; + + register_netdevice_notifier(&sk->protinfo.af_packet.notifier); return(0); } @@ -281,20 +402,20 @@ static int packet_init(struct sock *sk) * If necessary we block. */ -int packet_recvfrom(struct sock *sk, unsigned char *to, int len, - int noblock, unsigned flags, struct sockaddr_in *sin, - int *addr_len) +int packet_recvmsg(struct sock *sk, struct msghdr *msg, int len, + int noblock, int flags,int *addr_len) { int copied=0; struct sk_buff *skb; - struct sockaddr *saddr; + struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name; int err; - int truesize; - saddr = (struct sockaddr *)sin; - - if (sk->shutdown & RCV_SHUTDOWN) - return(0); + /* + * If there is no protocol hook then the device is down. + */ + + if(sk->protinfo.af_packet.prot_hook==NULL) + return -ENETDOWN; /* * If the address length field is there to be filled in, we fill @@ -326,10 +447,20 @@ int packet_recvfrom(struct sock *sk, unsigned char *to, int len, * user program they can ask the device for its MTU anyway. 
*/ - truesize = skb->len; - copied = min(len, truesize); + copied = skb->len; + if(copied>len) + { + copied=len; + msg->msg_flags|=MSG_TRUNC; + } + + /* We can't use skb_copy_datagram here */ + err = memcpy_toiovec(msg->msg_iov, skb->data, copied); + if (err) + { + return -EFAULT; + } - memcpy_tofs(to, skb->data, copied); /* We can't use skb_copy_datagram here */ sk->stamp=skb->stamp; /* @@ -338,8 +469,9 @@ int packet_recvfrom(struct sock *sk, unsigned char *to, int len, if (saddr) { - saddr->sa_family = skb->dev->type; - memcpy(saddr->sa_data,skb->dev->name, 14); + saddr->spkt_family = skb->dev->type; + strncpy(saddr->spkt_device,skb->dev->name, 15); + saddr->spkt_protocol = skb->protocol; } /* @@ -347,29 +479,11 @@ int packet_recvfrom(struct sock *sk, unsigned char *to, int len, * races and re-entrancy issues from us. */ - skb_free_datagram(skb); + skb_free_datagram(sk, skb); - /* - * We are done. - */ - - release_sock(sk); - return(truesize); + return(copied); } - -/* - * A packet read can succeed and is just the same as a recvfrom but without the - * addresses being recorded. - */ - -int packet_read(struct sock *sk, unsigned char *buff, - int len, int noblock, unsigned flags) -{ - return(packet_recvfrom(sk, buff, len, noblock, flags, NULL, NULL)); -} - - /* * This structure declares to the lower layer socket subsystem currently * incorrectly embedded in the IP code how to behave. 
This interface needs @@ -378,33 +492,27 @@ int packet_read(struct sock *sk, unsigned char *buff, struct proto packet_prot = { - sock_wmalloc, - sock_rmalloc, - sock_wfree, - sock_rfree, - sock_rspace, - sock_wspace, packet_close, - packet_read, - packet_write, - packet_sendto, - packet_recvfrom, - ip_build_header, /* Not actually used */ - NULL, NULL, - ip_queue_xmit, /* These two are not actually used */ + NULL, /* accept */ NULL, NULL, NULL, - NULL, datagram_select, - NULL, + NULL, /* No ioctl */ packet_init, NULL, + NULL, NULL, /* No set/get socket options */ NULL, + packet_sendmsg, /* Sendmsg */ + packet_recvmsg, /* Recvmsg */ + packet_bind, /* Bind */ + NULL, /* Backlog_rcv */ 128, 0, "PACKET", 0, 0 }; + + diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index e7124a42d..4494270cc 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -22,6 +22,7 @@ * Erik Schoenfelder : /proc/net/snmp * Alan Cox : Handle dead sockets properly. * Gerhard Koerting : Show both timers + * Alan Cox : Allow inode to be NULL (kernel socket) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -29,7 +30,6 @@ * 2 of the License, or (at your option) any later version. 
*/ #include <asm/system.h> -#include <linux/autoconf.h> #include <linux/sched.h> #include <linux/socket.h> #include <linux/net.h> @@ -59,6 +59,7 @@ get__netinfo(struct proto *pro, char *buffer, int format, char **start, off_t of { struct sock **s_array; struct sock *sp; + struct tcp_opt *tp; int i; int timer_active; int timer_active1; @@ -68,21 +69,37 @@ get__netinfo(struct proto *pro, char *buffer, int format, char **start, off_t of unsigned short destp, srcp; int len=0; off_t pos=0; - off_t begin=0; + off_t begin; + char tmpbuf[129]; s_array = pro->sock_array; - len+=sprintf(buffer, "sl local_address rem_address st tx_queue rx_queue tr tm->when uid\n"); + if (offset < 128) + len += sprintf(buffer, "%-127s\n", + " sl local_address rem_address st tx_queue " + "rx_queue tr tm->when retrnsmt uid timeout inode"); + pos = 128; /* - * This was very pretty but didn't work when a socket is destroyed at the wrong moment - * (eg a syn recv socket getting a reset), or a memory timer destroy. Instead of playing - * with timers we just concede defeat and cli(). + * This was very pretty but didn't work when a socket is destroyed + * at the wrong moment (eg a syn recv socket getting a reset), or + * a memory timer destroy. Instead of playing with timers we just + * concede defeat and cli(). 
*/ for(i = 0; i < SOCK_ARRAY_SIZE; i++) { cli(); sp = s_array[i]; + while(sp != NULL) { + pos += 128; + if (pos < offset) + { + sp = sp->next; + continue; + } + + tp = &(sp->tp_pinfo.af_tcp); + dest = sp->daddr; src = sp->saddr; destp = sp->dummy_th.dest; @@ -107,56 +124,58 @@ get__netinfo(struct proto *pro, char *buffer, int format, char **start, off_t of timer_active=timer_active2; timer_expires=sp->timer.expires; } - len+=sprintf(buffer+len, "%2d: %08lX:%04X %08lX:%04X %02X %08lX:%08lX %02X:%08lX %08X %d %d\n", + sprintf(tmpbuf, "%4d: %08lX:%04X %08lX:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld", i, src, srcp, dest, destp, sp->state, - format==0?sp->write_seq-sp->rcv_ack_seq:sp->rmem_alloc, - format==0?sp->acked_seq-sp->copied_seq:sp->wmem_alloc, - timer_active, timer_expires, (unsigned) sp->retransmits, - sp->socket?SOCK_INODE(sp->socket)->i_uid:0, - timer_active?sp->timeout:0); + format==0?sp->write_seq-tp->snd_una:sp->wmem_alloc, + format==0?tp->rcv_nxt-sp->copied_seq:sp->rmem_alloc, + timer_active, timer_expires-jiffies, (unsigned) sp->retransmits, + (sp->socket&&SOCK_INODE(sp->socket))?SOCK_INODE(sp->socket)->i_uid:0, + timer_active?sp->timeout:0, + sp->socket && SOCK_INODE(sp->socket) ? + SOCK_INODE(sp->socket)->i_ino : 0); + if (timer_active1) add_timer(&sp->retransmit_timer); if (timer_active2) add_timer(&sp->timer); + len += sprintf(buffer+len, "%-127s\n", tmpbuf); /* * All sockets with (port mod SOCK_ARRAY_SIZE) = i * are kept in sock_array[i], so we must follow the * 'next' link to get them all. 
*/ - sp = sp->next; - pos=begin+len; - if(pos<offset) - { - len=0; - begin=pos; - } - if(pos>offset+length) + if(len >= length) break; + sp = sp->next; } - sti(); /* We only turn interrupts back on for a moment, but because the interrupt queues anything built up - before this will clear before we jump back and cli, so it's not as bad as it looks */ - if(pos>offset+length) + sti(); /* We only turn interrupts back on for a moment, + but because the interrupt queues anything built + up before this will clear before we jump back + and cli(), so it's not as bad as it looks */ + if(len>= length) break; } - *start=buffer+(offset-begin); - len-=(offset-begin); + begin = len - (pos - offset); + *start = buffer + begin; + len -= begin; if(len>length) - len=length; + len = length; return len; } -int tcp_get_info(char *buffer, char **start, off_t offset, int length) +int tcp_get_info(char *buffer, char **start, off_t offset, int length, int dummy) { return get__netinfo(&tcp_prot, buffer,0, start, offset, length); } -int udp_get_info(char *buffer, char **start, off_t offset, int length) +int udp_get_info(char *buffer, char **start, off_t offset, int length, int dummy) { return get__netinfo(&udp_prot, buffer,1, start, offset, length); } -int raw_get_info(char *buffer, char **start, off_t offset, int length) +int raw_get_info(char *buffer, char **start, off_t offset, int length, int dummy) { return get__netinfo(&raw_prot, buffer,1, start, offset, length); } @@ -165,7 +184,7 @@ int raw_get_info(char *buffer, char **start, off_t offset, int length) /* * Report socket allocation statistics [mea@utu.fi] */ -int afinet_get_info(char *buffer, char **start, off_t offset, int length) +int afinet_get_info(char *buffer, char **start, off_t offset, int length, int dummy) { /* From net/socket.c */ extern int socket_get_info(char *, char **, off_t, int); @@ -194,7 +213,7 @@ int afinet_get_info(char *buffer, char **start, off_t offset, int length) * Called from the PROCfs module. 
This outputs /proc/net/snmp. */ -int snmp_get_info(char *buffer, char **start, off_t offset, int length) +int snmp_get_info(char *buffer, char **start, off_t offset, int length, int dummy) { extern struct tcp_mib tcp_statistics; extern struct udp_mib udp_statistics; @@ -265,4 +284,3 @@ int snmp_get_info(char *buffer, char **start, off_t offset, int length) len = length; return len; } - diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index f10cc8254..bb9ff5fbb 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -15,13 +15,15 @@ * udp_err is never called! * Alan Cox : Added new fields for init and ready for * proper fragmentation (_NO_ 4K limits!) + * Richard Colella : Hang on hash collision * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ -#include <asm/segment.h> + +#include <asm/uaccess.h> #include <asm/system.h> #include <linux/types.h> #include <linux/kernel.h> @@ -47,154 +49,172 @@ #ifdef CONFIG_IP_FORWARD #ifdef CONFIG_NET_IPIP -static struct inet_protocol ipip_protocol = { - ipip_rcv, /* IPIP handler */ - NULL, /* Will be UDP fraglist handler */ - NULL, /* TUNNEL error control */ - 0, /* next */ - IPPROTO_IPIP, /* protocol ID */ - 0, /* copy */ - NULL, /* data */ - "IPIP" /* name */ +static struct inet_protocol ipip_protocol = +{ + ipip_rcv, /* IPIP handler */ + NULL, /* TUNNEL error control */ + 0, /* next */ + IPPROTO_IPIP, /* protocol ID */ + 0, /* copy */ + NULL, /* data */ + "IPIP" /* name */ }; #endif #endif -static struct inet_protocol tcp_protocol = { - tcp_rcv, /* TCP handler */ - NULL, /* No fragment handler (and won't be for a long time) */ - tcp_err, /* TCP error control */ +static struct inet_protocol tcp_protocol = +{ + tcp_v4_rcv, /* TCP handler */ + tcp_v4_err, /* TCP error control */ #if defined(CONFIG_NET_IPIP) && 
defined(CONFIG_IP_FORWARD) - &ipip_protocol, + &ipip_protocol, #else - NULL, /* next */ + NULL, /* next */ #endif - IPPROTO_TCP, /* protocol ID */ - 0, /* copy */ - NULL, /* data */ - "TCP" /* name */ + IPPROTO_TCP, /* protocol ID */ + 0, /* copy */ + NULL, /* data */ + "TCP" /* name */ }; -static struct inet_protocol udp_protocol = { - udp_rcv, /* UDP handler */ - NULL, /* Will be UDP fraglist handler */ - udp_err, /* UDP error control */ - &tcp_protocol, /* next */ - IPPROTO_UDP, /* protocol ID */ - 0, /* copy */ - NULL, /* data */ - "UDP" /* name */ +static struct inet_protocol udp_protocol = +{ + udp_rcv, /* UDP handler */ + udp_err, /* UDP error control */ + &tcp_protocol, /* next */ + IPPROTO_UDP, /* protocol ID */ + 0, /* copy */ + NULL, /* data */ + "UDP" /* name */ }; -static struct inet_protocol icmp_protocol = { - icmp_rcv, /* ICMP handler */ - NULL, /* ICMP never fragments anyway */ - NULL, /* ICMP error control */ - &udp_protocol, /* next */ - IPPROTO_ICMP, /* protocol ID */ - 0, /* copy */ - NULL, /* data */ - "ICMP" /* name */ +static struct inet_protocol icmp_protocol = +{ + icmp_rcv, /* ICMP handler */ + NULL, /* ICMP error control */ + &udp_protocol, /* next */ + IPPROTO_ICMP, /* protocol ID */ + 0, /* copy */ + NULL, /* data */ + "ICMP" /* name */ }; #ifndef CONFIG_IP_MULTICAST struct inet_protocol *inet_protocol_base = &icmp_protocol; #else -static struct inet_protocol igmp_protocol = { - igmp_rcv, /* IGMP handler */ - NULL, /* IGMP never fragments anyway */ - NULL, /* IGMP error control */ - &icmp_protocol, /* next */ - IPPROTO_IGMP, /* protocol ID */ - 0, /* copy */ - NULL, /* data */ - "IGMP" /* name */ +static struct inet_protocol igmp_protocol = +{ + igmp_rcv, /* IGMP handler */ + NULL, /* IGMP error control */ + &icmp_protocol, /* next */ + IPPROTO_IGMP, /* protocol ID */ + 0, /* copy */ + NULL, /* data */ + "IGMP" /* name */ }; struct inet_protocol *inet_protocol_base = &igmp_protocol; #endif -struct inet_protocol 
*inet_protos[MAX_INET_PROTOS] = { - NULL +struct inet_protocol *inet_protos[MAX_INET_PROTOS] = +{ + NULL }; -struct inet_protocol * -inet_get_protocol(unsigned char prot) -{ - unsigned char hash; - struct inet_protocol *p; - - hash = prot & (MAX_INET_PROTOS - 1); - for (p = inet_protos[hash] ; p != NULL; p=p->next) { - if (p->protocol == prot) return((struct inet_protocol *) p); - } - return(NULL); -} - +/* + * Find a protocol in the protocol tables given its + * IP type. + */ -void -inet_add_protocol(struct inet_protocol *prot) +struct inet_protocol *inet_get_protocol(unsigned char prot) { - unsigned char hash; - struct inet_protocol *p2; - - hash = prot->protocol & (MAX_INET_PROTOS - 1); - prot ->next = inet_protos[hash]; - inet_protos[hash] = prot; - prot->copy = 0; - - /* Set the copy bit if we need to. */ - p2 = (struct inet_protocol *) prot->next; - while(p2 != NULL) { - if (p2->protocol == prot->protocol) { - prot->copy = 1; - break; + unsigned char hash; + struct inet_protocol *p; + + hash = prot & (MAX_INET_PROTOS - 1); + for (p = inet_protos[hash] ; p != NULL; p=p->next) + { + if (p->protocol == prot) + return((struct inet_protocol *) p); } - p2 = (struct inet_protocol *) prot->next; - } + return(NULL); } +/* + * Add a protocol handler to the hash tables + */ -int -inet_del_protocol(struct inet_protocol *prot) +void inet_add_protocol(struct inet_protocol *prot) { - struct inet_protocol *p; - struct inet_protocol *lp = NULL; - unsigned char hash; - - hash = prot->protocol & (MAX_INET_PROTOS - 1); - if (prot == inet_protos[hash]) { - inet_protos[hash] = (struct inet_protocol *) inet_protos[hash]->next; - return(0); - } - - p = (struct inet_protocol *) inet_protos[hash]; - while(p != NULL) { + unsigned char hash; + struct inet_protocol *p2; + + hash = prot->protocol & (MAX_INET_PROTOS - 1); + prot ->next = inet_protos[hash]; + inet_protos[hash] = prot; + prot->copy = 0; + /* - * We have to worry if the protocol being deleted is - * the last one on the list, 
then we may need to reset - * someone's copied bit. + * Set the copy bit if we need to. */ - if (p->next != NULL && p->next == prot) { - /* - * if we are the last one with this protocol and - * there is a previous one, reset its copy bit. - */ - if (p->copy == 0 && lp != NULL) lp->copy = 0; - p->next = prot->next; - return(0); + + p2 = (struct inet_protocol *) prot->next; + while(p2 != NULL) + { + if (p2->protocol == prot->protocol) + { + prot->copy = 1; + break; + } + p2 = (struct inet_protocol *) p2->next; } +} - if (p->next != NULL && p->next->protocol == prot->protocol) { - lp = p; +/* + * Remove a protocol from the hash tables. + */ + +int inet_del_protocol(struct inet_protocol *prot) +{ + struct inet_protocol *p; + struct inet_protocol *lp = NULL; + unsigned char hash; + + hash = prot->protocol & (MAX_INET_PROTOS - 1); + if (prot == inet_protos[hash]) + { + inet_protos[hash] = (struct inet_protocol *) inet_protos[hash]->next; + return(0); } - p = (struct inet_protocol *) p->next; - } - return(-1); + p = (struct inet_protocol *) inet_protos[hash]; + while(p != NULL) + { + /* + * We have to worry if the protocol being deleted is + * the last one on the list, then we may need to reset + * someone's copied bit. + */ + if (p->next != NULL && p->next == prot) + { + /* + * if we are the last one with this protocol and + * there is a previous one, reset its copy bit. + */ + if (p->copy == 0 && lp != NULL) + lp->copy = 0; + p->next = prot->next; + return(0); + } + if (p->next != NULL && p->next->protocol == prot->protocol) + lp = p; + + p = (struct inet_protocol *) p->next; + } + return(-1); } diff --git a/net/ipv4/rarp.c b/net/ipv4/rarp.c index a7b3719ed..28e2f4087 100644 --- a/net/ipv4/rarp.c +++ b/net/ipv4/rarp.c @@ -25,24 +25,31 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * + * Fixes + * Alan Cox : Rarp delete on device down needed as + * reported by Walter Wolfgang. 
+ * */ +#include <linux/module.h> + #include <linux/types.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/mm.h> -#include <linux/config.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> +#include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/in.h> +#include <linux/config.h> + #include <asm/system.h> -#include <asm/segment.h> +#include <asm/uaccess.h> #include <stdarg.h> #include <linux/inet.h> -#include <linux/netdevice.h> #include <linux/etherdevice.h> #include <net/ip.h> #include <net/route.h> @@ -52,11 +59,13 @@ #include <net/sock.h> #include <net/arp.h> #include <net/rarp.h> -#ifdef CONFIG_AX25 +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) #include <net/ax25.h> #endif +#include <linux/proc_fs.h> +#include <linux/stat.h> -#ifdef CONFIG_INET_RARP +extern int (*rarp_ioctl_hook)(unsigned int,void*); /* * This structure defines the RARP mapping cache. As long as we make @@ -75,6 +84,7 @@ struct rarp_table struct rarp_table *rarp_tables = NULL; +static int rarp_rcv(struct sk_buff *, struct device *, struct packet_type *); static struct packet_type rarp_packet_type = { @@ -87,16 +97,6 @@ static struct packet_type rarp_packet_type = static initflag = 1; -/* - * Called once when data first added to rarp cache with ioctl. - */ - -static void rarp_init (void) -{ - /* Register the packet type */ - rarp_packet_type.type=htons(ETH_P_RARP); - dev_add_pack(&rarp_packet_type); -} /* * Release the memory for this entry. @@ -105,6 +105,7 @@ static void rarp_init (void) static inline void rarp_release_entry(struct rarp_table *entry) { kfree_s(entry, sizeof(struct rarp_table)); + MOD_DEC_USE_COUNT; return; } @@ -133,6 +134,55 @@ static void rarp_destroy(unsigned long ip_addr) sti(); } +/* + * Flush a device. 
+ */ + +static void rarp_destroy_dev(struct device *dev) +{ + struct rarp_table *entry; + struct rarp_table **pentry; + + cli(); + pentry = &rarp_tables; + while ((entry = *pentry) != NULL) + { + if (entry->dev == dev) + { + *pentry = entry->next; + rarp_release_entry(entry); + } + else + pentry = &entry->next; + } + sti(); +} + +static int rarp_device_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + if(event!=NETDEV_DOWN) + return NOTIFY_DONE; + rarp_destroy_dev((struct device *)ptr); + return NOTIFY_DONE; +} + +/* + * Called once when data first added to rarp cache with ioctl. + */ + +static struct notifier_block rarp_dev_notifier={ + rarp_device_event, + NULL, + 0 +}; + +static void rarp_init_pkt (void) +{ + /* Register the packet type */ + rarp_packet_type.type=htons(ETH_P_RARP); + dev_add_pack(&rarp_packet_type); + register_netdevice_notifier(&rarp_dev_notifier); +} /* * Receive an arp request by the device layer. Maybe it should be @@ -140,17 +190,17 @@ static void rarp_destroy(unsigned long ip_addr) * "overhead" time isn't that high... */ -int rarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) +static int rarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) { /* * We shouldn't use this type conversion. Check later. 
*/ - struct arphdr *rarp = (struct arphdr *)skb->h.raw; - unsigned char *rarp_ptr = (unsigned char *)(rarp+1); + struct arphdr *rarp = (struct arphdr *) skb->data; + unsigned char *rarp_ptr = skb_pull(skb,sizeof(struct arphdr)); struct rarp_table *entry; long sip,tip; unsigned char *sha,*tha; /* s for "source", t for "target" */ - + /* * If this test doesn't pass, it's not IP, or we should ignore it anyway */ @@ -176,18 +226,18 @@ int rarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) */ if ( -#ifdef CONFIG_AX25 +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) (rarp->ar_pro != htons(AX25_P_IP) && dev->type == ARPHRD_AX25) || #endif (rarp->ar_pro != htons(ETH_P_IP) && dev->type != ARPHRD_AX25) || rarp->ar_pln != 4) { - /* - * This packet is not for us. Remove it. - */ - kfree_skb(skb, FREE_READ); - return 0; -} + /* + * This packet is not for us. Remove it. + */ + kfree_skb(skb, FREE_READ); + return 0; + } /* * Extract variable width fields @@ -216,7 +266,7 @@ int rarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) sti(); arp_send(ARPOP_RREPLY, ETH_P_RARP, sip, dev, dev->pa_addr, sha, - dev->dev_addr); + dev->dev_addr, sha); } else sti(); @@ -238,9 +288,13 @@ static int rarp_req_set(struct arpreq *req) int htype, hlen; unsigned long ip; struct rtable *rt; + struct device * dev; + int err; - memcpy_fromfs(&r, req, sizeof(r)); - + err = copy_from_user(&r, req, sizeof(r)); + if (err) + return -EFAULT; + /* * We only understand about IP addresses... 
*/ @@ -254,7 +308,7 @@ static int rarp_req_set(struct arpreq *req) htype = ARPHRD_ETHER; hlen = ETH_ALEN; break; -#ifdef CONFIG_AX25 +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) case ARPHRD_AX25: htype = ARPHRD_AX25; hlen = 7; @@ -268,7 +322,7 @@ static int rarp_req_set(struct arpreq *req) ip = si->sin_addr.s_addr; if (ip == 0) { - printk("RARP: SETRARP: requested PA is 0.0.0.0 !\n"); + printk(KERN_DEBUG "RARP: SETRARP: requested PA is 0.0.0.0 !\n"); return -EINVAL; } @@ -276,9 +330,11 @@ static int rarp_req_set(struct arpreq *req) * Is it reachable directly ? */ - rt = ip_rt_route(ip, NULL, NULL); + rt = ip_rt_route(ip, 0); if (rt == NULL) return -ENETUNREACH; + dev = rt->rt_dev; + ip_rt_put(rt); /* * Is there an existing entry for this address? Find out... @@ -302,9 +358,9 @@ static int rarp_req_set(struct arpreq *req) sti(); return -ENOMEM; } - if(initflag) + if (initflag) { - rarp_init(); + rarp_init_pkt(); initflag=0; } @@ -316,7 +372,10 @@ static int rarp_req_set(struct arpreq *req) entry->hlen = hlen; entry->htype = htype; memcpy(&entry->ha, &r.arp_ha.sa_data, hlen); - entry->dev = rt->rt_dev; + entry->dev = dev; + + /* Don't unlink if we have entries to serve. */ + MOD_INC_USE_COUNT; sti(); @@ -334,13 +393,16 @@ static int rarp_req_get(struct arpreq *req) struct rarp_table *entry; struct sockaddr_in *si; unsigned long ip; - + int err; + /* * We only understand about IP addresses... */ - memcpy_fromfs(&r, req, sizeof(r)); - + err = copy_from_user(&r, req, sizeof(r)); + if (err) + return -EFAULT; + if (r.arp_pa.sa_family != AF_INET) return -EPFNOSUPPORT; @@ -374,8 +436,7 @@ static int rarp_req_get(struct arpreq *req) * Copy the information back */ - memcpy_tofs(req, &r, sizeof(r)); - return 0; + return copy_to_user(req, &r, sizeof(r)) ? 
-EFAULT : 0; } @@ -394,10 +455,9 @@ int rarp_ioctl(unsigned int cmd, void *arg) case SIOCDRARP: if (!suser()) return -EPERM; - err = verify_area(VERIFY_READ, arg, sizeof(struct arpreq)); - if(err) - return err; - memcpy_fromfs(&r, arg, sizeof(r)); + err = copy_from_user(&r, arg, sizeof(r)); + if (err) + return -EFAULT; if (r.arp_pa.sa_family != AF_INET) return -EPFNOSUPPORT; si = (struct sockaddr_in *) &r.arp_pa; @@ -405,16 +465,11 @@ int rarp_ioctl(unsigned int cmd, void *arg) return 0; case SIOCGRARP: - err = verify_area(VERIFY_WRITE, arg, sizeof(struct arpreq)); - if(err) - return err; + return rarp_req_get((struct arpreq *)arg); case SIOCSRARP: if (!suser()) return -EPERM; - err = verify_area(VERIFY_READ, arg, sizeof(struct arpreq)); - if(err) - return err; return rarp_req_set((struct arpreq *)arg); default: return -EINVAL; @@ -424,7 +479,7 @@ int rarp_ioctl(unsigned int cmd, void *arg) return 0; } -int rarp_get_info(char *buffer, char **start, off_t offset, int length) +int rarp_get_info(char *buffer, char **start, off_t offset, int length, int dummy) { int len=0; off_t begin=0; @@ -433,7 +488,7 @@ int rarp_get_info(char *buffer, char **start, off_t offset, int length) struct rarp_table *entry; char ipbuffer[20]; unsigned long netip; - if(initflag) + if (initflag) { size = sprintf(buffer,"RARP disabled until entries added to cache.\n"); pos+=size; @@ -481,11 +536,49 @@ int rarp_get_info(char *buffer, char **start, off_t offset, int length) sti(); } - *start=buffer+(offset-begin); /* Start of wanted data */ - len-=(offset-begin); /* Start slop */ - if(len>length) - len=length; /* Ending slop */ + *start = buffer+(offset-begin); /* Start of wanted data */ + len -= (offset-begin); /* Start slop */ + if (len>length) + len = length; /* Ending slop */ return len; } +struct proc_dir_entry proc_net_rarp = { + PROC_NET_RARP, 4, "rarp", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + rarp_get_info +}; + +void +rarp_init(void) +{ + 
proc_net_register(&proc_net_rarp); + rarp_ioctl_hook = rarp_ioctl; +} + +#ifdef MODULE + +int init_module(void) +{ + rarp_init(); + return 0; +} + +void cleanup_module(void) +{ + struct rarp_table *rt, *rt_next; + proc_net_unregister(PROC_NET_RARP); + rarp_ioctl_hook = NULL; + cli(); + /* Destroy the RARP-table */ + rt = rarp_tables; + rarp_tables = NULL; + sti(); + /* ... and free it. */ + for ( ; rt != NULL; rt = rt_next) { + rt_next = rt->next; + rarp_release_entry(rt); + } +} #endif diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index f01489b4a..89e03aed6 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -27,14 +27,19 @@ * Alan Cox : Use new kernel side addresses * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets. * Alan Cox : BSD style RAW socket demultiplexing. + * Alan Cox : Beginnings of mrouted support. + * Alan Cox : Added IP_HDRINCL option. + * Alan Cox : Skip broadcast check if BSDism set. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ + +#include <linux/config.h> #include <asm/system.h> -#include <asm/segment.h> +#include <asm/uaccess.h> #include <linux/types.h> #include <linux/sched.h> #include <linux/errno.h> @@ -46,6 +51,7 @@ #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> +#include <linux/mroute.h> #include <net/ip.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -54,17 +60,17 @@ #include <net/udp.h> #include <net/checksum.h> -static inline unsigned long min(unsigned long a, unsigned long b) -{ - if (a < b) - return(a); - return(b); -} +#ifdef CONFIG_IP_MROUTE +struct sock *mroute_socket=NULL; +#endif -/* raw_err gets called by the icmp module. 
*/ -void raw_err (int err, unsigned char *header, unsigned long daddr, - unsigned long saddr, struct inet_protocol *protocol) +/* + * Raw_err does not currently get called by the icmp module - FIXME: + */ + +void raw_err (int type, int code, unsigned char *header, __u32 daddr, + __u32 saddr, struct inet_protocol *protocol) { struct sock *sk; @@ -75,18 +81,42 @@ void raw_err (int err, unsigned char *header, unsigned long daddr, return; /* This is meaningless in raw sockets. */ - if (err & 0xff00 == (ICMP_SOURCE_QUENCH << 8)) + if (type == ICMP_SOURCE_QUENCH) { if (sk->cong_window > 1) sk->cong_window = sk->cong_window/2; return; } + + if(type == ICMP_PARAMETERPROB) + { + sk->err = EPROTO; + sk->error_report(sk); + } - sk->err = icmp_err_convert[err & 0xff].errno; - sk->error_report(sk); - + if(code<=NR_ICMP_UNREACH) + { + sk->err = icmp_err_convert[code & 0xff].errno; + sk->error_report(sk); + } + return; } +static inline int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) +{ + /* Charge it to the socket. */ + + if (__sock_queue_rcv_skb(sk,skb)<0) + { + ip_statistics.IpInDiscards++; + skb->sk=NULL; + kfree_skb(skb, FREE_READ); + return 0; + } + + ip_statistics.IpInDelivers++; + return 0; +} /* * This should be the easiest of all, all we do is @@ -94,29 +124,33 @@ void raw_err (int err, unsigned char *header, unsigned long daddr, * in ip.c */ -int raw_rcv(struct sock *sk, struct sk_buff *skb, struct device *dev, long saddr, long daddr) +int raw_rcv(struct sock *sk, struct sk_buff *skb, struct device *dev, __u32 saddr, __u32 daddr) { /* Now we need to copy this into memory. */ skb->sk = sk; - skb->len = ntohs(skb->ip_hdr->tot_len); + skb_trim(skb,ntohs(skb->ip_hdr->tot_len)); + skb->h.raw = (unsigned char *) skb->ip_hdr; skb->dev = dev; skb->saddr = daddr; skb->daddr = saddr; - /* Charge it to the socket. */ +#if 0 + /* + * For no adequately explained reasons BSD likes to mess up the header of + * the received frame. 
+ */ + + if(sk->bsdism) + skb->ip_hdr->tot_len=ntohs(skb->ip_hdr->tot_len-4*skb->ip_hdr->ihl); +#endif - if(sock_queue_rcv_skb(sk,skb)<0) - { - ip_statistics.IpInDiscards++; - skb->sk=NULL; - kfree_skb(skb, FREE_READ); - return(0); + if (sk->users) { + __skb_queue_tail(&sk->back_log, skb); + return 0; } - - ip_statistics.IpInDelivers++; - release_sock(sk); - return(0); + raw_rcv_skb(sk, skb); + return 0; } /* @@ -127,28 +161,42 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb, struct device *dev, long saddr * Callback support is trivial for SOCK_RAW */ -static void raw_getfrag(void *p, int saddr, char *to, unsigned int offset, unsigned int fraglen) +static int raw_getfrag(const void *p, __u32 saddr, char *to, + unsigned int offset, unsigned int fraglen) { - memcpy_fromfs(to, (unsigned char *)p+offset, fraglen); + return copy_from_user(to, (const unsigned char *)p+offset, fraglen); } /* * IPPROTO_RAW needs extra work. */ -static void raw_getrawfrag(void *p, int saddr, char *to, unsigned int offset, unsigned int fraglen) +static int raw_getrawfrag(const void *p, __u32 saddr, char *to, unsigned int offset, unsigned int fraglen) { - memcpy_fromfs(to, (unsigned char *)p+offset, fraglen); + int err; + err = copy_from_user(to, (const unsigned char *)p+offset, fraglen); + if (err) + return err; if(offset==0) { struct iphdr *iph=(struct iphdr *)to; - iph->saddr=saddr; + if(!iph->saddr) + iph->saddr=saddr; iph->check=0; + iph->tot_len=htons(fraglen); /* This is right as you can't frag + RAW packets */ + /* + * Deliberate breach of modularity to keep + * ip_build_xmit clean (well less messy). 
+ */ + if (!iph->id) + iph->id = htons(ip_id_count++); iph->check=ip_fast_csum((unsigned char *)iph, iph->ihl); } + return 0; } -static int raw_sendto(struct sock *sk, unsigned char *from, +static int raw_sendto(struct sock *sk, const unsigned char *from, int len, int noblock, unsigned flags, struct sockaddr_in *usin, int addr_len) { int err; @@ -174,42 +222,99 @@ static int raw_sendto(struct sock *sk, unsigned char *from, memcpy(&sin, usin, sizeof(sin)); if (sin.sin_family && sin.sin_family != AF_INET) return(-EINVAL); + /* + * Protocol type is host ordered byte. + */ + sin.sin_port=ntohs(sin.sin_port); } else { if (sk->state != TCP_ESTABLISHED) return(-EINVAL); sin.sin_family = AF_INET; - sin.sin_port = sk->protocol; + sin.sin_port = sk->num; sin.sin_addr.s_addr = sk->daddr; } if (sin.sin_port == 0) - sin.sin_port = sk->protocol; + sin.sin_port = sk->num; if (sin.sin_addr.s_addr == INADDR_ANY) sin.sin_addr.s_addr = ip_my_addr(); - if (sk->broadcast == 0 && ip_chk_addr(sin.sin_addr.s_addr)==IS_BROADCAST) + /* + * BSD raw sockets forget to check SO_BROADCAST .... 
+ */ + + if (!sk->bsdism && sk->broadcast == 0 && ip_chk_addr(sin.sin_addr.s_addr)==IS_BROADCAST) return -EACCES; - if(sk->num==IPPROTO_RAW) - err=ip_build_xmit(sk, raw_getrawfrag, from, len, sin.sin_addr.s_addr, flags, sin.sin_port); + if(sk->ip_hdrincl) + { + if(len>65535) + return -EMSGSIZE; + err=ip_build_xmit(sk, raw_getrawfrag, from, len, sin.sin_addr.s_addr, 0, sk->opt, flags, sin.sin_port, noblock); + } else - err=ip_build_xmit(sk, raw_getfrag, from, len, sin.sin_addr.s_addr, flags, sin.sin_port); + { + if(len>65535-sizeof(struct iphdr)) + return -EMSGSIZE; + err=ip_build_xmit(sk, raw_getfrag, from, len, sin.sin_addr.s_addr, 0, sk->opt, flags, sin.sin_port, noblock); + } return err<0?err:len; } - -static int raw_write(struct sock *sk, unsigned char *buff, int len, int noblock, - unsigned flags) +/* + * Temporary + */ + +static int raw_sendmsg(struct sock *sk, struct msghdr *msg, int len, int noblock, + int flags) { - return(raw_sendto(sk, buff, len, noblock, flags, NULL, 0)); + if(msg->msg_iovlen==1) + return raw_sendto(sk,msg->msg_iov[0].iov_base,len, noblock, flags, msg->msg_name, msg->msg_namelen); + else + { + /* + * For awkward cases we linearise the buffer first. In theory this is only frames + * whose iovec's don't split on 4 byte boundaries, and soon encrypted stuff (to keep + * skip happy). We are a bit more general about it. 
+ */ + + unsigned char *buf; + int fs; + int err; + if(len>65515) + return -EMSGSIZE; + buf=kmalloc(len, GFP_KERNEL); + if(buf==NULL) + return -ENOBUFS; + err = memcpy_fromiovec(buf, msg->msg_iov, len); + if (!err) + { + fs=get_fs(); + set_fs(get_ds()); + err=raw_sendto(sk,buf,len, noblock, flags, msg->msg_name, msg->msg_namelen); + set_fs(fs); + } + else + err = -EFAULT; + + kfree_s(buf,len); + return err; + } } - -static void raw_close(struct sock *sk, int timeout) +static void raw_close(struct sock *sk, unsigned long timeout) { sk->state = TCP_CLOSE; +#ifdef CONFIG_IP_MROUTE + if(sk==mroute_socket) + { + mroute_close(sk); + mroute_socket=NULL; + } +#endif + destroy_sock(sk); } @@ -224,14 +329,13 @@ static int raw_init(struct sock *sk) * we return it, otherwise we block. */ -int raw_recvfrom(struct sock *sk, unsigned char *to, int len, - int noblock, unsigned flags, struct sockaddr_in *sin, - int *addr_len) +int raw_recvmsg(struct sock *sk, struct msghdr *msg, int len, + int noblock, int flags,int *addr_len) { int copied=0; struct sk_buff *skb; int err; - int truesize; + struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name; if (flags & MSG_OOB) return -EOPNOTSUPP; @@ -246,10 +350,14 @@ int raw_recvfrom(struct sock *sk, unsigned char *to, int len, if(skb==NULL) return err; - truesize=skb->len; - copied = min(len, truesize); - - skb_copy_datagram(skb, 0, to, copied); + copied=skb->len; + if(copied>len) + { + copied=len; + msg->msg_flags|=MSG_TRUNC; + } + + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); sk->stamp=skb->stamp; /* Copy the address. */ @@ -258,47 +366,36 @@ int raw_recvfrom(struct sock *sk, unsigned char *to, int len, sin->sin_family = AF_INET; sin->sin_addr.s_addr = skb->daddr; } - skb_free_datagram(skb); - release_sock(sk); - return (truesize); /* len not copied. BSD returns the true size of the message so you know a bit fell off! 
*/ -} - - -int raw_read (struct sock *sk, unsigned char *buff, int len, int noblock,unsigned flags) -{ - return(raw_recvfrom(sk, buff, len, noblock, flags, NULL, NULL)); + skb_free_datagram(sk, skb); + return err ? err : (copied); } struct proto raw_prot = { - sock_wmalloc, - sock_rmalloc, - sock_wfree, - sock_rfree, - sock_rspace, - sock_wspace, raw_close, - raw_read, - raw_write, - raw_sendto, - raw_recvfrom, - ip_build_header, udp_connect, NULL, - ip_queue_xmit, - NULL, NULL, NULL, NULL, datagram_select, +#ifdef CONFIG_IP_MROUTE + ipmr_ioctl, +#else NULL, +#endif raw_init, NULL, + NULL, ip_setsockopt, ip_getsockopt, + raw_sendmsg, + raw_recvmsg, + NULL, /* No special bind */ + raw_rcv_skb, 128, 0, "RAW", 0, 0, - {NULL,} + NULL }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d2186a45d..c9161b3c0 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -28,6 +28,20 @@ * Alan Cox : Removed compatibility cruft. * Alan Cox : RTF_REJECT support. * Alan Cox : TCP irtt support. + * Jonathan Naylor : Added Metric support. + * Miquel van Smoorenburg : BSD API fixes. + * Miquel van Smoorenburg : Metrics. + * Alan Cox : Use __u32 properly + * Alan Cox : Aligned routing errors more closely with BSD + * our system is still very different. + * Alan Cox : Faster /proc handling + * Alexey Kuznetsov : Massive rework to support tree based routing, + * routing caches and better behaviour. + * + * Olaf Erb : irtt wasn't being copied right. + * Bjorn Ekwall : Kerneld route support. + * Alan Cox : Multicast fixed (I hope) + * Pavel Krauz : Limited broadcast fixed * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -35,8 +49,10 @@ * 2 of the License, or (at your option) any later version. 
*/ -#include <asm/segment.h> +#include <linux/config.h> +#include <asm/uaccess.h> #include <asm/system.h> +#include <asm/bitops.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -48,6 +64,7 @@ #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> +#include <linux/if_arp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> @@ -55,334 +72,1478 @@ #include <linux/skbuff.h> #include <net/sock.h> #include <net/icmp.h> +#include <net/netlink.h> +#ifdef CONFIG_KERNELD +#include <linux/kerneld.h> +#endif /* - * The routing table list + * Forwarding Information Base definitions. */ -static struct rtable *rt_base = NULL; -unsigned long rt_stamp = 1; /* Routing table version stamp for caches ( 0 is 'unset' ) */ +struct fib_node +{ + struct fib_node *fib_next; + __u32 fib_dst; + unsigned long fib_use; + struct fib_info *fib_info; + short fib_metric; + unsigned char fib_tos; +}; /* - * Pointer to the loopback route + * This structure contains data shared by many of routes. + */ + +struct fib_info +{ + struct fib_info *fib_next; + struct fib_info *fib_prev; + __u32 fib_gateway; + struct device *fib_dev; + int fib_refcnt; + unsigned long fib_window; + unsigned short fib_flags; + unsigned short fib_mtu; + unsigned short fib_irtt; +}; + +struct fib_zone +{ + struct fib_zone *fz_next; + struct fib_node **fz_hash_table; + struct fib_node *fz_list; + int fz_nent; + int fz_logmask; + __u32 fz_mask; +}; + +static struct fib_zone *fib_zones[33]; +static struct fib_zone *fib_zone_list; +static struct fib_node *fib_loopback = NULL; +static struct fib_info *fib_info_list; + +/* + * Backlogging. 
*/ - -static struct rtable *rt_loopback = NULL; + +#define RT_BH_REDIRECT 0 +#define RT_BH_GARBAGE_COLLECT 1 +#define RT_BH_FREE 2 + +struct rt_req +{ + struct rt_req * rtr_next; + struct device *dev; + __u32 dst; + __u32 gw; + unsigned char tos; +}; + +int ip_rt_lock; +unsigned ip_rt_bh_mask; +static struct rt_req *rt_backlog; /* - * Remove a routing table entry. + * Route cache. */ -static void rt_del(unsigned long dst, char *devname) +struct rtable *ip_rt_hash_table[RT_HASH_DIVISOR]; +static int rt_cache_size; +static struct rtable *rt_free_queue; +struct wait_queue *rt_wait; + +static void rt_kick_backlog(void); +static void rt_cache_add(unsigned hash, struct rtable * rth); +static void rt_cache_flush(void); +static void rt_garbage_collect_1(void); + +/* + * Evaluate mask length. + */ + +static __inline__ int rt_logmask(__u32 mask) { - struct rtable *r, **rp; - unsigned long flags; + if (!(mask = ntohl(mask))) + return 32; + return ffz(~mask); +} - rp = &rt_base; - - /* - * This must be done with interrupts off because we could take - * an ICMP_REDIRECT. - */ - - save_flags(flags); - cli(); - while((r = *rp) != NULL) - { - /* Make sure both the destination and the device match */ - if ( r->rt_dst != dst || - (devname != NULL && strcmp((r->rt_dev)->name,devname) != 0) ) - { - rp = &r->rt_next; - continue; - } - *rp = r->rt_next; - - /* - * If we delete the loopback route update its pointer. - */ - - if (rt_loopback == r) - rt_loopback = NULL; - kfree_s(r, sizeof(struct rtable)); - } - rt_stamp++; /* New table revision */ - - restore_flags(flags); +/* + * Create mask from length. + */ + +static __inline__ __u32 rt_mask(int logmask) +{ + if (logmask >= 32) + return 0; + return htonl(~((1<<logmask)-1)); +} + +static __inline__ unsigned fz_hash_code(__u32 dst, int logmask) +{ + return ip_rt_hash_code(ntohl(dst)>>logmask); } +/* + * Free FIB node. 
+ */ + +static void fib_free_node(struct fib_node * f) +{ + struct fib_info * fi = f->fib_info; + if (!--fi->fib_refcnt) + { +#if RT_CACHE_DEBUG >= 2 + printk("fib_free_node: fi %08x/%s is free\n", fi->fib_gateway, fi->fib_dev->name); +#endif + if (fi->fib_next) + fi->fib_next->fib_prev = fi->fib_prev; + if (fi->fib_prev) + fi->fib_prev->fib_next = fi->fib_next; + if (fi == fib_info_list) + fib_info_list = fi->fib_next; + } + kfree_s(f, sizeof(struct fib_node)); +} /* - * Remove all routing table entries for a device. This is called when - * a device is downed. + * Find gateway route by address. */ - -void ip_rt_flush(struct device *dev) + +static struct fib_node * fib_lookup_gateway(__u32 dst) { - struct rtable *r; - struct rtable **rp; - unsigned long flags; + struct fib_zone * fz; + struct fib_node * f; - rp = &rt_base; - save_flags(flags); - cli(); - while ((r = *rp) != NULL) { - if (r->rt_dev != dev) { - rp = &r->rt_next; - continue; + for (fz = fib_zone_list; fz; fz = fz->fz_next) + { + if (fz->fz_hash_table) + f = fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)]; + else + f = fz->fz_list; + + for ( ; f; f = f->fib_next) + { + if ((dst ^ f->fib_dst) & fz->fz_mask) + continue; + if (f->fib_info->fib_flags & RTF_GATEWAY) + return NULL; + return f; } - *rp = r->rt_next; - if (rt_loopback == r) - rt_loopback = NULL; - kfree_s(r, sizeof(struct rtable)); - } - rt_stamp++; /* New table revision */ - restore_flags(flags); + } + return NULL; } /* - * Used by 'rt_add()' when we can't get the netmask any other way.. + * Find local route by address. + * FIXME: I use "longest match" principle. If destination + * has some non-local route, I'll not search shorter matches. 
+ * It's possible, I'm wrong, but I wanted to prevent following + * situation: + * route add 193.233.7.128 netmask 255.255.255.192 gw xxxxxx + * route add 193.233.7.0 netmask 255.255.255.0 eth1 + * (Two ethernets connected by serial line, one is small and other is large) + * Host 193.233.7.129 is locally unreachable, + * but old (<=1.3.37) code will send packets destined for it to eth1. * - * If the lower byte or two are zero, we guess the mask based on the - * number of zero 8-bit net numbers, otherwise we use the "default" - * masks judging by the destination address and our device netmask. */ - -static inline unsigned long default_mask(unsigned long dst) + +static struct fib_node * fib_lookup_local(__u32 dst) { - dst = ntohl(dst); - if (IN_CLASSA(dst)) - return htonl(IN_CLASSA_NET); - if (IN_CLASSB(dst)) - return htonl(IN_CLASSB_NET); - return htonl(IN_CLASSC_NET); -} + struct fib_zone * fz; + struct fib_node * f; + for (fz = fib_zone_list; fz; fz = fz->fz_next) + { + int longest_match_found = 0; + + if (fz->fz_hash_table) + f = fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)]; + else + f = fz->fz_list; + + for ( ; f; f = f->fib_next) + { + if ((dst ^ f->fib_dst) & fz->fz_mask) + continue; + if (!(f->fib_info->fib_flags & RTF_GATEWAY)) + return f; + longest_match_found = 1; + } + if (longest_match_found) + return NULL; + } + return NULL; +} /* - * If no mask is specified then generate a default entry. + * Main lookup routine. + * IMPORTANT NOTE: this algorithm has small difference from <=1.3.37 visible + * by user. It doesn't route non-CIDR broadcasts by default. + * + * F.e. 
+ * ifconfig eth0 193.233.7.65 netmask 255.255.255.192 broadcast 193.233.7.255 + * is valid, but if you really are not able (not allowed, do not want) to + * use CIDR compliant broadcast 193.233.7.127, you should add host route: + * route add -host 193.233.7.255 eth0 */ -static unsigned long guess_mask(unsigned long dst, struct device * dev) +static struct fib_node * fib_lookup(__u32 dst) { - unsigned long mask; + struct fib_zone * fz; + struct fib_node * f; - if (!dst) - return 0; - mask = default_mask(dst); - if ((dst ^ dev->pa_addr) & mask) - return mask; - return dev->pa_mask; + for (fz = fib_zone_list; fz; fz = fz->fz_next) + { + if (fz->fz_hash_table) + f = fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)]; + else + f = fz->fz_list; + + for ( ; f; f = f->fib_next) + { + if ((dst ^ f->fib_dst) & fz->fz_mask) + continue; + return f; + } + } + return NULL; } +static __inline__ struct device * get_gw_dev(__u32 gw) +{ + struct fib_node * f; + f = fib_lookup_gateway(gw); + if (f) + return f->fib_info->fib_dev; + return NULL; +} /* - * Find the route entry through which our gateway will be reached + * Check if a mask is acceptable. */ -static inline struct device * get_gw_dev(unsigned long gw) +static inline int bad_mask(__u32 mask, __u32 addr) { - struct rtable * rt; + if (addr & (mask = ~mask)) + return 1; + mask = ntohl(mask); + if (mask & (mask+1)) + return 1; + return 0; +} + - for (rt = rt_base ; ; rt = rt->rt_next) +static int fib_del_list(struct fib_node **fp, __u32 dst, + struct device * dev, __u32 gtw, short flags, short metric, __u32 mask) +{ + struct fib_node *f; + int found=0; + + while((f = *fp) != NULL) { - if (!rt) - return NULL; - if ((gw ^ rt->rt_dst) & rt->rt_mask) + struct fib_info * fi = f->fib_info; + + /* + * Make sure the destination and netmask match. + * metric, gateway and device are also checked + * if they were specified. 
+ */ + if (f->fib_dst != dst || + (gtw && fi->fib_gateway != gtw) || + (metric >= 0 && f->fib_metric != metric) || + (dev && fi->fib_dev != dev) ) + { + fp = &f->fib_next; continue; - /* - * Gateways behind gateways are a no-no + } + cli(); + *fp = f->fib_next; + if (fib_loopback == f) + fib_loopback = NULL; + sti(); + ip_netlink_msg(RTMSG_DELROUTE, dst, gtw, mask, flags, metric, fi->fib_dev->name); + fib_free_node(f); + found++; + } + return found; +} + +static __inline__ int fib_del_1(__u32 dst, __u32 mask, + struct device * dev, __u32 gtw, short flags, short metric) +{ + struct fib_node **fp; + struct fib_zone *fz; + int found=0; + + if (!mask) + { + for (fz=fib_zone_list; fz; fz = fz->fz_next) + { + int tmp; + if (fz->fz_hash_table) + fp = &fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)]; + else + fp = &fz->fz_list; + + tmp = fib_del_list(fp, dst, dev, gtw, flags, metric, mask); + fz->fz_nent -= tmp; + found += tmp; + } + } + else + { + if ((fz = fib_zones[rt_logmask(mask)]) != NULL) + { + if (fz->fz_hash_table) + fp = &fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)]; + else + fp = &fz->fz_list; + + found = fib_del_list(fp, dst, dev, gtw, flags, metric, mask); + fz->fz_nent -= found; + } + } + + if (found) + { + rt_cache_flush(); + return 0; + } + return -ESRCH; +} + + +static struct fib_info * fib_create_info(__u32 gw, struct device * dev, + unsigned short flags, unsigned short mss, + unsigned long window, unsigned short irtt) +{ + struct fib_info * fi; + + if (!(flags & RTF_MSS)) + { + mss = dev->mtu; +#ifdef CONFIG_NO_PATH_MTU_DISCOVERY + /* + * If MTU was not specified, use default. + * If you want to increase MTU for some net (local subnet) + * use "route add .... mss xxx". + * + * The MTU isn't currently always used and computed as it + * should be as far as I can tell. 
[Still verifying this is right] */ - - if (rt->rt_flags & RTF_GATEWAY) - return NULL; - return rt->rt_dev; + if ((flags & RTF_GATEWAY) && mss > 576) + mss = 576; +#endif + } + if (!(flags & RTF_WINDOW)) + window = 0; + if (!(flags & RTF_IRTT)) + irtt = 0; + + for (fi=fib_info_list; fi; fi = fi->fib_next) + { + if (fi->fib_gateway != gw || + fi->fib_dev != dev || + fi->fib_flags != flags || + fi->fib_mtu != mss || + fi->fib_window != window || + fi->fib_irtt != irtt) + continue; + fi->fib_refcnt++; +#if RT_CACHE_DEBUG >= 2 + printk("fib_create_info: fi %08x/%s is duplicate\n", fi->fib_gateway, fi->fib_dev->name); +#endif + return fi; } + fi = (struct fib_info*)kmalloc(sizeof(struct fib_info), GFP_KERNEL); + if (!fi) + return NULL; + memset(fi, 0, sizeof(struct fib_info)); + fi->fib_flags = flags; + fi->fib_dev = dev; + fi->fib_gateway = gw; + fi->fib_mtu = mss; + fi->fib_window = window; + fi->fib_refcnt++; + fi->fib_next = fib_info_list; + fi->fib_prev = NULL; + fi->fib_irtt = irtt; + if (fib_info_list) + fib_info_list->fib_prev = fi; + fib_info_list = fi; +#if RT_CACHE_DEBUG >= 2 + printk("fib_create_info: fi %08x/%s is created\n", fi->fib_gateway, fi->fib_dev->name); +#endif + return fi; } -/* - * Rewrote rt_add(), as the old one was weird - Linus - * - * This routine is used to update the IP routing table, either - * from the kernel (ICMP_REDIRECT) or via an ioctl call issued - * by the superuser. 
- */ - -void ip_rt_add(short flags, unsigned long dst, unsigned long mask, - unsigned long gw, struct device *dev, unsigned short mtu, unsigned long window, unsigned short irtt) + +static __inline__ void fib_add_1(short flags, __u32 dst, __u32 mask, + __u32 gw, struct device *dev, unsigned short mss, + unsigned long window, unsigned short irtt, short metric) { - struct rtable *r, *rt; - struct rtable **rp; - unsigned long cpuflags; + struct fib_node *f, *f1; + struct fib_node **fp; + struct fib_node **dup_fp = NULL; + struct fib_zone * fz; + struct fib_info * fi; + int logmask; /* - * A host is a unique machine and has no network bits. + * Allocate an entry and fill it in. */ - if (flags & RTF_HOST) + f = (struct fib_node *) kmalloc(sizeof(struct fib_node), GFP_KERNEL); + if (f == NULL) + return; + + memset(f, 0, sizeof(struct fib_node)); + f->fib_dst = dst; + f->fib_metric = metric; + f->fib_tos = 0; + + if ((fi = fib_create_info(gw, dev, flags, mss, window, irtt)) == NULL) { - mask = 0xffffffff; - } - + kfree_s(f, sizeof(struct fib_node)); + return; + } + f->fib_info = fi; + + logmask = rt_logmask(mask); + fz = fib_zones[logmask]; + + + if (!fz) + { + int i; + fz = kmalloc(sizeof(struct fib_zone), GFP_KERNEL); + if (!fz) + { + fib_free_node(f); + return; + } + memset(fz, 0, sizeof(struct fib_zone)); + fz->fz_logmask = logmask; + fz->fz_mask = mask; + for (i=logmask-1; i>=0; i--) + if (fib_zones[i]) + break; + cli(); + if (i<0) + { + fz->fz_next = fib_zone_list; + fib_zone_list = fz; + } + else + { + fz->fz_next = fib_zones[i]->fz_next; + fib_zones[i]->fz_next = fz; + } + fib_zones[logmask] = fz; + sti(); + } + /* - * Calculate the network mask + * If zone overgrows RTZ_HASHING_LIMIT, create hash table. 
*/ - - else if (!mask) + + if (fz->fz_nent >= RTZ_HASHING_LIMIT && !fz->fz_hash_table && logmask<32) { - if (!((dst ^ dev->pa_addr) & dev->pa_mask)) + struct fib_node ** ht; +#if RT_CACHE_DEBUG >= 2 + printk("fib_add_1: hashing for zone %d started\n", logmask); +#endif + ht = kmalloc(RTZ_HASH_DIVISOR*sizeof(struct rtable*), GFP_KERNEL); + + if (ht) { - mask = dev->pa_mask; - flags &= ~RTF_GATEWAY; - if (flags & RTF_DYNAMIC) + memset(ht, 0, RTZ_HASH_DIVISOR*sizeof(struct fib_node*)); + cli(); + f1 = fz->fz_list; + while (f1) { - /*printk("Dynamic route to my own net rejected\n");*/ - return; + struct fib_node * next; + unsigned hash = fz_hash_code(f1->fib_dst, logmask); + next = f1->fib_next; + f1->fib_next = ht[hash]; + ht[hash] = f1; + f1 = next; } - } - else - mask = guess_mask(dst, dev); - dst &= mask; + fz->fz_list = NULL; + fz->fz_hash_table = ht; + sti(); + } } - + + if (fz->fz_hash_table) + fp = &fz->fz_hash_table[fz_hash_code(dst, logmask)]; + else + fp = &fz->fz_list; + /* - * A gateway must be reachable and not a local address + * Scan list to find the first route with the same destination */ - - if (gw == dev->pa_addr) - flags &= ~RTF_GATEWAY; - - if (flags & RTF_GATEWAY) + while ((f1 = *fp) != NULL) { + if (f1->fib_dst == dst) + break; + fp = &f1->fib_next; + } + + /* + * Find route with the same destination and less (or equal) metric. + */ + while ((f1 = *fp) != NULL && f1->fib_dst == dst) + { + if (f1->fib_metric >= metric) + break; /* - * Don't try to add a gateway we can't reach.. + * Record route with the same destination and gateway, + * but less metric. We'll delete it + * after instantiation of new route. */ - - if (dev != get_gw_dev(gw)) - return; - - flags |= RTF_GATEWAY; - } - else - gw = 0; - + if (f1->fib_info->fib_gateway == gw && + (gw || f1->fib_info->fib_dev == dev)) + dup_fp = fp; + fp = &f1->fib_next; + } + /* - * Allocate an entry and fill it in. + * Is it already present? 
*/ - - rt = (struct rtable *) kmalloc(sizeof(struct rtable), GFP_ATOMIC); - if (rt == NULL) + + if (f1 && f1->fib_metric == metric && f1->fib_info == fi) { + fib_free_node(f); return; } + + /* + * Insert new entry to the list. + */ + + cli(); + f->fib_next = f1; + *fp = f; + if (!fib_loopback && (fi->fib_dev->flags & IFF_LOOPBACK)) + fib_loopback = f; + sti(); + fz->fz_nent++; + ip_netlink_msg(RTMSG_NEWROUTE, dst, gw, mask, flags, metric, fi->fib_dev->name); + + /* + * Delete route with the same destination and gateway. + * Note that we should have at most one such route. + */ + if (dup_fp) + fp = dup_fp; + else + fp = &f->fib_next; + + while ((f1 = *fp) != NULL && f1->fib_dst == dst) + { + if (f1->fib_info->fib_gateway == gw && + (gw || f1->fib_info->fib_dev == dev)) + { + cli(); + *fp = f1->fib_next; + if (fib_loopback == f1) + fib_loopback = NULL; + sti(); + ip_netlink_msg(RTMSG_DELROUTE, dst, gw, mask, flags, metric, f1->fib_info->fib_dev->name); + fib_free_node(f1); + fz->fz_nent--; + break; + } + fp = &f1->fib_next; + } + rt_cache_flush(); + return; +} + +static int rt_flush_list(struct fib_node ** fp, struct device *dev) +{ + int found = 0; + struct fib_node *f; + + while ((f = *fp) != NULL) { +/* + * "Magic" device route is allowed to point to loopback, + * discard it too. 
+ */ + if (f->fib_info->fib_dev != dev && + (f->fib_info->fib_dev != &loopback_dev || f->fib_dst != dev->pa_addr)) { + fp = &f->fib_next; + continue; + } + cli(); + *fp = f->fib_next; + if (fib_loopback == f) + fib_loopback = NULL; + sti(); + fib_free_node(f); + found++; + } + return found; +} + +static __inline__ void fib_flush_1(struct device *dev) +{ + struct fib_zone *fz; + int found = 0; + + for (fz = fib_zone_list; fz; fz = fz->fz_next) + { + if (fz->fz_hash_table) + { + int i; + int tmp = 0; + for (i=0; i<RTZ_HASH_DIVISOR; i++) + tmp += rt_flush_list(&fz->fz_hash_table[i], dev); + fz->fz_nent -= tmp; + found += tmp; + } + else + { + int tmp; + tmp = rt_flush_list(&fz->fz_list, dev); + fz->fz_nent -= tmp; + found += tmp; + } + } + + if (found) + rt_cache_flush(); +} + + +/* + * Called from the PROCfs module. This outputs /proc/net/route. + * + * We preserve the old format but pad the buffers out. This means that + * we can spin over the other entries as we read them. Remember the + * gated BGP4 code could need to read 60,000+ routes on occasion (that's + * about 7Mb of data). To do that ok we will need to also cache the + * last route we got to (reads will generally be following on from + * one another without gaps). 
+ */ + +int rt_get_info(char *buffer, char **start, off_t offset, int length, int dummy) +{ + struct fib_zone *fz; + struct fib_node *f; + int len=0; + off_t pos=0; + char temp[129]; + int i; + + pos = 128; + + if (offset<128) + { + sprintf(buffer,"%-127s\n","Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT"); + len = 128; + } + + while (ip_rt_lock) + sleep_on(&rt_wait); + ip_rt_fast_lock(); + + for (fz=fib_zone_list; fz; fz = fz->fz_next) + { + int maxslot; + struct fib_node ** fp; + + if (fz->fz_nent == 0) + continue; + + if (pos + 128*fz->fz_nent <= offset) + { + pos += 128*fz->fz_nent; + len = 0; + continue; + } + + if (fz->fz_hash_table) + { + maxslot = RTZ_HASH_DIVISOR; + fp = fz->fz_hash_table; + } + else + { + maxslot = 1; + fp = &fz->fz_list; + } + + for (i=0; i < maxslot; i++, fp++) + { + + for (f = *fp; f; f = f->fib_next) + { + struct fib_info * fi; + /* + * Spin through entries until we are ready + */ + pos += 128; + + if (pos <= offset) + { + len=0; + continue; + } + + fi = f->fib_info; + sprintf(temp, "%s\t%08lX\t%08lX\t%02X\t%d\t%lu\t%d\t%08lX\t%d\t%lu\t%u", + fi->fib_dev->name, (unsigned long)f->fib_dst, (unsigned long)fi->fib_gateway, + fi->fib_flags, 0, f->fib_use, f->fib_metric, + (unsigned long)fz->fz_mask, (int)fi->fib_mtu, fi->fib_window, (int)fi->fib_irtt); + sprintf(buffer+len,"%-127s\n",temp); + + len += 128; + if (pos >= offset+length) + goto done; + } + } + } + +done: + ip_rt_unlock(); + wake_up(&rt_wait); + + *start = buffer+len-(pos-offset); + len = pos - offset; + if (len>length) + len = length; + return len; +} + +int rt_cache_get_info(char *buffer, char **start, off_t offset, int length, int dummy) +{ + int len=0; + off_t pos=0; + char temp[129]; + struct rtable *r; + int i; + + pos = 128; + + if (offset<128) + { + sprintf(buffer,"%-127s\n","Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tHH\tARP"); + len = 128; + } + + + while (ip_rt_lock) + 
sleep_on(&rt_wait); + ip_rt_fast_lock(); + + for (i = 0; i<RT_HASH_DIVISOR; i++) + { + for (r = ip_rt_hash_table[i]; r; r = r->rt_next) + { + /* + * Spin through entries until we are ready + */ + pos += 128; + + if (pos <= offset) + { + len = 0; + continue; + } + + sprintf(temp, "%s\t%08lX\t%08lX\t%02X\t%d\t%u\t%d\t%08lX\t%d\t%lu\t%u\t%d\t%1d", + r->rt_dev->name, (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway, + r->rt_flags, r->rt_refcnt, r->rt_use, 0, + (unsigned long)r->rt_src, (int)r->rt_mtu, r->rt_window, (int)r->rt_irtt, r->rt_hh ? r->rt_hh->hh_refcnt : -1, r->rt_hh ? r->rt_hh->hh_uptodate : 0); + sprintf(buffer+len,"%-127s\n",temp); + len += 128; + if (pos >= offset+length) + goto done; + } + } + +done: + ip_rt_unlock(); + wake_up(&rt_wait); + + *start = buffer+len-(pos-offset); + len = pos-offset; + if (len>length) + len = length; + return len; +} + + +static void rt_free(struct rtable * rt) +{ + unsigned long flags; + + save_flags(flags); + cli(); + if (!rt->rt_refcnt) + { + struct hh_cache * hh = rt->rt_hh; + rt->rt_hh = NULL; + restore_flags(flags); + if (hh && atomic_dec_and_test(&hh->hh_refcnt)) + kfree_s(hh, sizeof(struct hh_cache)); + kfree_s(rt, sizeof(struct rt_table)); + return; + } + rt->rt_next = rt_free_queue; + rt->rt_flags &= ~RTF_UP; + rt_free_queue = rt; + ip_rt_bh_mask |= RT_BH_FREE; +#if RT_CACHE_DEBUG >= 2 + printk("rt_free: %08x\n", rt->rt_dst); +#endif + restore_flags(flags); +} + +/* + * RT "bottom half" handlers. Called with masked interrupts. 
+ */ + +static __inline__ void rt_kick_free_queue(void) +{ + struct rtable *rt, **rtp; + + rtp = &rt_free_queue; + + while ((rt = *rtp) != NULL) + { + if (!rt->rt_refcnt) + { + struct hh_cache * hh = rt->rt_hh; +#if RT_CACHE_DEBUG >= 2 + __u32 daddr = rt->rt_dst; +#endif + *rtp = rt->rt_next; + rt->rt_hh = NULL; + sti(); + if (hh && atomic_dec_and_test(&hh->hh_refcnt)) + kfree_s(hh, sizeof(struct hh_cache)); + kfree_s(rt, sizeof(struct rt_table)); +#if RT_CACHE_DEBUG >= 2 + printk("rt_kick_free_queue: %08x is free\n", daddr); +#endif + cli(); + continue; + } + rtp = &rt->rt_next; + } +} + +void ip_rt_run_bh() +{ + unsigned long flags; + save_flags(flags); + cli(); + if (ip_rt_bh_mask && !ip_rt_lock) + { + if (ip_rt_bh_mask & RT_BH_REDIRECT) + rt_kick_backlog(); + + if (ip_rt_bh_mask & RT_BH_GARBAGE_COLLECT) + { + ip_rt_fast_lock(); + ip_rt_bh_mask &= ~RT_BH_GARBAGE_COLLECT; + sti(); + rt_garbage_collect_1(); + cli(); + ip_rt_fast_unlock(); + } + + if (ip_rt_bh_mask & RT_BH_FREE) + rt_kick_free_queue(); + } + restore_flags(flags); +} + + +void ip_rt_check_expire() +{ + ip_rt_fast_lock(); + if (ip_rt_lock == 1) + { + int i; + struct rtable *rth, **rthp; + unsigned long flags; + unsigned long now = jiffies; + + save_flags(flags); + for (i=0; i<RT_HASH_DIVISOR; i++) + { + rthp = &ip_rt_hash_table[i]; + + while ((rth = *rthp) != NULL) + { + struct rtable * rth_next = rth->rt_next; + + /* + * Cleanup aged off entries. + */ + + cli(); + if (!rth->rt_refcnt && rth->rt_lastuse + RT_CACHE_TIMEOUT < now) + { + *rthp = rth_next; + sti(); + rt_cache_size--; +#if RT_CACHE_DEBUG >= 2 + printk("rt_check_expire clean %02x@%08x\n", i, rth->rt_dst); +#endif + rt_free(rth); + continue; + } + sti(); + + if (!rth_next) + break; + + /* + * LRU ordering. 
+ */ + + if (rth->rt_lastuse + RT_CACHE_BUBBLE_THRESHOLD < rth_next->rt_lastuse || + (rth->rt_lastuse < rth_next->rt_lastuse && + rth->rt_use < rth_next->rt_use)) + { +#if RT_CACHE_DEBUG >= 2 + printk("rt_check_expire bubbled %02x@%08x<->%08x\n", i, rth->rt_dst, rth_next->rt_dst); +#endif + cli(); + *rthp = rth_next; + rth->rt_next = rth_next->rt_next; + rth_next->rt_next = rth; + sti(); + rthp = &rth_next->rt_next; + continue; + } + rthp = &rth->rt_next; + } + } + restore_flags(flags); + rt_kick_free_queue(); + } + ip_rt_unlock(); +} + +static void rt_redirect_1(__u32 dst, __u32 gw, struct device *dev) +{ + struct rtable *rt; + unsigned long hash = ip_rt_hash_code(dst); + + if (gw == dev->pa_addr) + return; + if (dev != get_gw_dev(gw)) + return; + rt = (struct rtable *) kmalloc(sizeof(struct rtable), GFP_ATOMIC); + if (rt == NULL) + return; memset(rt, 0, sizeof(struct rtable)); - rt->rt_flags = flags | RTF_UP; + rt->rt_flags = RTF_DYNAMIC | RTF_MODIFIED | RTF_HOST | RTF_GATEWAY | RTF_UP; rt->rt_dst = dst; rt->rt_dev = dev; rt->rt_gateway = gw; - rt->rt_mask = mask; - rt->rt_mss = dev->mtu - HEADER_SIZE; - rt->rt_window = 0; /* Default is no clamping */ + rt->rt_src = dev->pa_addr; + rt->rt_mtu = dev->mtu; +#ifdef CONFIG_NO_PATH_MTU_DISCOVERY + if (dev->mtu > 576) + rt->rt_mtu = 576; +#endif + rt->rt_lastuse = jiffies; + rt->rt_refcnt = 1; + rt_cache_add(hash, rt); + ip_rt_put(rt); + return; +} - /* Are the MSS/Window valid ? */ +static void rt_cache_flush(void) +{ + int i; + struct rtable * rth, * next; - if(rt->rt_flags & RTF_MSS) - rt->rt_mss = mtu; - - if(rt->rt_flags & RTF_WINDOW) - rt->rt_window = window; - if(rt->rt_flags & RTF_IRTT) - rt->rt_irtt = irtt; + for (i=0; i<RT_HASH_DIVISOR; i++) + { + int nr=0; - /* - * What we have to do is loop though this until we have - * found the first address which has a higher generality than - * the one in rt. Then we can put rt in right before it. - * The interrupts must be off for this process. 
- */ + cli(); + if (!(rth = ip_rt_hash_table[i])) + { + sti(); + continue; + } + + ip_rt_hash_table[i] = NULL; + sti(); + + for (; rth; rth=next) + { + next = rth->rt_next; + rt_cache_size--; + nr++; + rth->rt_next = NULL; + rt_free(rth); + } +#if RT_CACHE_DEBUG >= 2 + if (nr > 0) + printk("rt_cache_flush: %d@%02x\n", nr, i); +#endif + } +#if RT_CACHE_DEBUG >= 1 + if (rt_cache_size) + { + printk("rt_cache_flush: bug rt_cache_size=%d\n", rt_cache_size); + rt_cache_size = 0; + } +#endif +} + +static void rt_garbage_collect_1(void) +{ + int i; + unsigned expire = RT_CACHE_TIMEOUT>>1; + struct rtable * rth, **rthp; + unsigned long now = jiffies; + + for (;;) + { + for (i=0; i<RT_HASH_DIVISOR; i++) + { + if (!ip_rt_hash_table[i]) + continue; + for (rthp=&ip_rt_hash_table[i]; (rth=*rthp); rthp=&rth->rt_next) + { + if (rth->rt_lastuse + expire*(rth->rt_refcnt+1) > now) + continue; + rt_cache_size--; + cli(); + *rthp=rth->rt_next; + rth->rt_next = NULL; + sti(); + rt_free(rth); + break; + } + } + if (rt_cache_size < RT_CACHE_SIZE_MAX) + return; + expire >>= 1; + } +} + +static __inline__ void rt_req_enqueue(struct rt_req **q, struct rt_req *rtr) +{ + unsigned long flags; + struct rt_req * tail; + + save_flags(flags); + cli(); + tail = *q; + if (!tail) + rtr->rtr_next = rtr; + else + { + rtr->rtr_next = tail->rtr_next; + tail->rtr_next = rtr; + } + *q = rtr; + restore_flags(flags); + return; +} + +/* + * Caller should mask interrupts. 
+ */ + +static __inline__ struct rt_req * rt_req_dequeue(struct rt_req **q) +{ + struct rt_req * rtr; + + if (*q) + { + rtr = (*q)->rtr_next; + (*q)->rtr_next = rtr->rtr_next; + if (rtr->rtr_next == rtr) + *q = NULL; + rtr->rtr_next = NULL; + return rtr; + } + return NULL; +} + +/* + Called with masked interrupts + */ + +static void rt_kick_backlog() +{ + if (!ip_rt_lock) + { + struct rt_req * rtr; + + ip_rt_fast_lock(); + + while ((rtr = rt_req_dequeue(&rt_backlog)) != NULL) + { + sti(); + rt_redirect_1(rtr->dst, rtr->gw, rtr->dev); + kfree_s(rtr, sizeof(struct rt_req)); + cli(); + } + + ip_rt_bh_mask &= ~RT_BH_REDIRECT; + + ip_rt_fast_unlock(); + } +} + +/* + * rt_{del|add|flush} called only from USER process. Waiting is OK. + */ + +static int rt_del(__u32 dst, __u32 mask, + struct device * dev, __u32 gtw, short rt_flags, short metric) +{ + int retval; + + while (ip_rt_lock) + sleep_on(&rt_wait); + ip_rt_fast_lock(); + retval = fib_del_1(dst, mask, dev, gtw, rt_flags, metric); + ip_rt_unlock(); + wake_up(&rt_wait); + return retval; +} + +static void rt_add(short flags, __u32 dst, __u32 mask, + __u32 gw, struct device *dev, unsigned short mss, + unsigned long window, unsigned short irtt, short metric) +{ + while (ip_rt_lock) + sleep_on(&rt_wait); + ip_rt_fast_lock(); + fib_add_1(flags, dst, mask, gw, dev, mss, window, irtt, metric); + ip_rt_unlock(); + wake_up(&rt_wait); +} + +void ip_rt_flush(struct device *dev) +{ + while (ip_rt_lock) + sleep_on(&rt_wait); + ip_rt_fast_lock(); + fib_flush_1(dev); + ip_rt_unlock(); + wake_up(&rt_wait); +} + +/* + Called by ICMP module. 
+ */ + +void ip_rt_redirect(__u32 src, __u32 dst, __u32 gw, struct device *dev) +{ + struct rt_req * rtr; + struct rtable * rt; + + rt = ip_rt_route(dst, 0); + if (!rt) + return; + + if (rt->rt_gateway != src || + rt->rt_dev != dev || + ((gw^dev->pa_addr)&dev->pa_mask) || + ip_chk_addr(gw)) + { + ip_rt_put(rt); + return; + } + ip_rt_put(rt); + + ip_rt_fast_lock(); + if (ip_rt_lock == 1) + { + rt_redirect_1(dst, gw, dev); + ip_rt_unlock(); + return; + } + + rtr = kmalloc(sizeof(struct rt_req), GFP_ATOMIC); + if (rtr) + { + rtr->dst = dst; + rtr->gw = gw; + rtr->dev = dev; + rt_req_enqueue(&rt_backlog, rtr); + ip_rt_bh_mask |= RT_BH_REDIRECT; + } + ip_rt_unlock(); +} + + +static __inline__ void rt_garbage_collect(void) +{ + if (ip_rt_lock == 1) + { + rt_garbage_collect_1(); + return; + } + ip_rt_bh_mask |= RT_BH_GARBAGE_COLLECT; +} + +static void rt_cache_add(unsigned hash, struct rtable * rth) +{ + unsigned long flags; + struct rtable **rthp; + __u32 daddr = rth->rt_dst; + unsigned long now = jiffies; + +#if RT_CACHE_DEBUG >= 2 + if (ip_rt_lock != 1) + { + printk("rt_cache_add: ip_rt_lock==%d\n", ip_rt_lock); + return; + } +#endif + + save_flags(flags); + + if (rth->rt_dev->header_cache_bind) + { + struct rtable * rtg = rth; + + if (rth->rt_gateway != daddr) + { + ip_rt_fast_unlock(); + rtg = ip_rt_route(rth->rt_gateway, 0); + ip_rt_fast_lock(); + } + + if (rtg) + { + if (rtg == rth) + rtg->rt_dev->header_cache_bind(&rtg->rt_hh, rtg->rt_dev, ETH_P_IP, rtg->rt_dst); + else + { + if (rtg->rt_hh) + atomic_inc(&rtg->rt_hh->hh_refcnt); + rth->rt_hh = rtg->rt_hh; + ip_rt_put(rtg); + } + } + } + + if (rt_cache_size >= RT_CACHE_SIZE_MAX) + rt_garbage_collect(); - save_flags(cpuflags); cli(); + rth->rt_next = ip_rt_hash_table[hash]; +#if RT_CACHE_DEBUG >= 2 + if (rth->rt_next) + { + struct rtable * trth; + printk("rt_cache @%02x: %08x", hash, daddr); + for (trth=rth->rt_next; trth; trth=trth->rt_next) + printk(" . 
%08x", trth->rt_dst); + printk("\n"); + } +#endif + ip_rt_hash_table[hash] = rth; + rthp = &rth->rt_next; + sti(); + rt_cache_size++; /* - * Remove old route if we are getting a duplicate. + * Cleanup duplicate (and aged off) entries. */ - - rp = &rt_base; - while ((r = *rp) != NULL) + + while ((rth = *rthp) != NULL) { - if (r->rt_dst != dst || - r->rt_mask != mask) + + cli(); + if ((!rth->rt_refcnt && rth->rt_lastuse + RT_CACHE_TIMEOUT < now) + || rth->rt_dst == daddr) { - rp = &r->rt_next; + *rthp = rth->rt_next; + rt_cache_size--; + sti(); +#if RT_CACHE_DEBUG >= 2 + printk("rt_cache clean %02x@%08x\n", hash, rth->rt_dst); +#endif + rt_free(rth); continue; } - *rp = r->rt_next; - if (rt_loopback == r) - rt_loopback = NULL; - kfree_s(r, sizeof(struct rtable)); + sti(); + rthp = &rth->rt_next; } - - /* - * Add the new route - */ - - rp = &rt_base; - while ((r = *rp) != NULL) { - if ((r->rt_mask & mask) != mask) - break; - rp = &r->rt_next; + restore_flags(flags); +} + +/* + RT should be already locked. + + We could improve this by keeping a chain of say 32 struct rtable's + last freed for fast recycling. 
+ + */ + +struct rtable * ip_rt_slow_route (__u32 daddr, int local) +{ + unsigned hash = ip_rt_hash_code(daddr)^local; + struct rtable * rth; + struct fib_node * f; + struct fib_info * fi; + __u32 saddr; + +#if RT_CACHE_DEBUG >= 2 + printk("rt_cache miss @%08x\n", daddr); +#endif + + rth = kmalloc(sizeof(struct rtable), GFP_ATOMIC); + if (!rth) + { + ip_rt_unlock(); + return NULL; + } + + if (local) + f = fib_lookup_local(daddr); + else + f = fib_lookup (daddr); + + if (f) + { + fi = f->fib_info; + f->fib_use++; + } + + if (!f || (fi->fib_flags & RTF_REJECT)) + { +#ifdef CONFIG_KERNELD + char wanted_route[20]; +#endif +#if RT_CACHE_DEBUG >= 2 + printk("rt_route failed @%08x\n", daddr); +#endif + ip_rt_unlock(); + kfree_s(rth, sizeof(struct rtable)); +#ifdef CONFIG_KERNELD + daddr=ntohl(daddr); + sprintf(wanted_route, "%d.%d.%d.%d", + (int)(daddr >> 24) & 0xff, (int)(daddr >> 16) & 0xff, + (int)(daddr >> 8) & 0xff, (int)daddr & 0xff); + kerneld_route(wanted_route); /* Dynamic route request */ +#endif + return NULL; + } + + saddr = fi->fib_dev->pa_addr; + + if (daddr == fi->fib_dev->pa_addr) + { + f->fib_use--; + if ((f = fib_loopback) != NULL) + { + f->fib_use++; + fi = f->fib_info; + } } - rt->rt_next = r; - *rp = rt; - /* - * Update the loopback route - */ - - if ((rt->rt_dev->flags & IFF_LOOPBACK) && !rt_loopback) - rt_loopback = rt; + if (!f) + { + ip_rt_unlock(); + kfree_s(rth, sizeof(struct rtable)); + return NULL; + } - rt_stamp++; /* New table revision */ - + rth->rt_dst = daddr; + rth->rt_src = saddr; + rth->rt_lastuse = jiffies; + rth->rt_refcnt = 1; + rth->rt_use = 1; + rth->rt_next = NULL; + rth->rt_hh = NULL; + rth->rt_gateway = fi->fib_gateway; + rth->rt_dev = fi->fib_dev; + rth->rt_mtu = fi->fib_mtu; + rth->rt_window = fi->fib_window; + rth->rt_irtt = fi->fib_irtt; + rth->rt_tos = f->fib_tos; + rth->rt_flags = fi->fib_flags | RTF_HOST; + if (local) + rth->rt_flags |= RTF_LOCAL; + + if (!(rth->rt_flags & RTF_GATEWAY)) + rth->rt_gateway = rth->rt_dst; 
/* - * Restore the interrupts and return + * Multicast or limited broadcast is never gatewayed. */ - - restore_flags(cpuflags); - return; + if (MULTICAST(daddr) || daddr == 0xFFFFFFFF) + rth->rt_gateway = rth->rt_dst; + + if (ip_rt_lock == 1) + rt_cache_add(hash, rth); + else + { + rt_free(rth); +#if RT_CACHE_DEBUG >= 1 + printk(KERN_DEBUG "rt_cache: route to %08x was born dead\n", daddr); +#endif + } + + ip_rt_unlock(); + return rth; } +void ip_rt_put(struct rtable * rt) +{ + if (rt) + atomic_dec(&rt->rt_refcnt); +} -/* - * Check if a mask is acceptable. - */ - -static inline int bad_mask(unsigned long mask, unsigned long addr) +struct rtable * ip_rt_route(__u32 daddr, int local) { - if (addr & (mask = ~mask)) - return 1; - mask = ntohl(mask); - if (mask & (mask+1)) - return 1; - return 0; + struct rtable * rth; + + ip_rt_fast_lock(); + + for (rth=ip_rt_hash_table[ip_rt_hash_code(daddr)^local]; rth; rth=rth->rt_next) + { + if (rth->rt_dst == daddr) + { + rth->rt_lastuse = jiffies; + atomic_inc(&rth->rt_use); + atomic_inc(&rth->rt_refcnt); + ip_rt_unlock(); + return rth; + } + } + return ip_rt_slow_route (daddr, local); } /* - * Process a route add request from the user + * Process a route add request from the user, or from a kernel + * task. */ -static int rt_new(struct rtentry *r) +int ip_rt_new(struct rtentry *r) { int err; char * devname; struct device * dev = NULL; - unsigned long flags, daddr, mask, gw; + unsigned long flags; + __u32 daddr, mask, gw; + short metric; /* * If a device is specified find it. */ - + if ((devname = r->rt_dev) != NULL) { err = getname(devname, &devname); @@ -391,7 +1552,7 @@ static int rt_new(struct rtentry *r) dev = dev_get(devname); putname(devname); if (!dev) - return -EINVAL; + return -ENODEV; } /* @@ -403,18 +1564,19 @@ static int rt_new(struct rtentry *r) /* * Make local copies of the important bits + * We decrement the metric by one for BSD compatibility. 
*/ flags = r->rt_flags; - daddr = ((struct sockaddr_in *) &r->rt_dst)->sin_addr.s_addr; - mask = ((struct sockaddr_in *) &r->rt_genmask)->sin_addr.s_addr; - gw = ((struct sockaddr_in *) &r->rt_gateway)->sin_addr.s_addr; - + daddr = (__u32) ((struct sockaddr_in *) &r->rt_dst)->sin_addr.s_addr; + mask = (__u32) ((struct sockaddr_in *) &r->rt_genmask)->sin_addr.s_addr; + gw = (__u32) ((struct sockaddr_in *) &r->rt_gateway)->sin_addr.s_addr; + metric = r->rt_metric > 0 ? r->rt_metric - 1 : 0; /* * BSD emulation: Permits route add someroute gw one-of-my-addresses * to indicate which iface. Not as clean as the nice Linux dev technique - * but people keep using it... + * but people keep using it... (and gated likes it ;)) */ if (!dev && (flags & RTF_GATEWAY)) @@ -431,48 +1593,55 @@ static int rt_new(struct rtentry *r) } } - /* - * Ignore faulty masks - */ - - if (bad_mask(mask, daddr)) - mask = 0; - - /* - * Set the mask to nothing for host routes. - */ - - if (flags & RTF_HOST) + if (flags & RTF_HOST) mask = 0xffffffff; else if (mask && r->rt_genmask.sa_family != AF_INET) return -EAFNOSUPPORT; - /* - * You can only gateway IP via IP.. - */ - if (flags & RTF_GATEWAY) { if (r->rt_gateway.sa_family != AF_INET) return -EAFNOSUPPORT; + + /* + * Don't try to add a gateway we can't reach.. + * Tunnel devices are exempt from this rule. + */ + if (!dev) dev = get_gw_dev(gw); + else if (dev != get_gw_dev(gw) && dev->type != ARPHRD_TUNNEL) + return -EINVAL; + if (!dev) + return -ENETUNREACH; } - else if (!dev) - dev = ip_dev_check(daddr); + else + { + gw = 0; + if (!dev) + dev = ip_dev_bynet(daddr, mask); + if (!dev) + return -ENETUNREACH; + if (!mask) + { + if (((daddr ^ dev->pa_addr) & dev->pa_mask) == 0) + mask = dev->pa_mask; + } + } - /* - * Unknown device. 
- */ - - if (dev == NULL) - return -ENETUNREACH; +#ifndef CONFIG_IP_CLASSLESS + if (!mask) + mask = ip_get_mask(daddr); +#endif + + if (bad_mask(mask, daddr)) + return -EINVAL; /* * Add the route */ - - ip_rt_add(flags, daddr, mask, gw, dev, r->rt_mss, r->rt_window, r->rt_irtt); + + rt_add(flags, daddr, mask, gw, dev, r->rt_mss, r->rt_window, r->rt_irtt, metric); return 0; } @@ -481,151 +1650,35 @@ static int rt_new(struct rtentry *r) * Remove a route, as requested by the user. */ -static int rt_kill(struct rtentry *r) +int ip_rt_kill(struct rtentry *r) { struct sockaddr_in *trg; + struct sockaddr_in *msk; + struct sockaddr_in *gtw; char *devname; int err; + struct device * dev = NULL; trg = (struct sockaddr_in *) &r->rt_dst; + msk = (struct sockaddr_in *) &r->rt_genmask; + gtw = (struct sockaddr_in *) &r->rt_gateway; if ((devname = r->rt_dev) != NULL) { err = getname(devname, &devname); if (err) return err; - } - rt_del(trg->sin_addr.s_addr, devname); - if ( devname != NULL ) + dev = dev_get(devname); putname(devname); - return 0; -} - - -/* - * Called from the PROCfs module. This outputs /proc/net/route. - */ - -int rt_get_info(char *buffer, char **start, off_t offset, int length) -{ - struct rtable *r; - int len=0; - off_t pos=0; - off_t begin=0; - int size; - - len += sprintf(buffer, - "Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT\n"); - pos=len; - + if (!dev) + return -ENODEV; + } /* - * This isn't quite right -- r->rt_dst is a struct! + * metric can become negative here if it wasn't filled in + * but that's a fortunate accident; we really use that in rt_del. 
*/ - - for (r = rt_base; r != NULL; r = r->rt_next) - { - size = sprintf(buffer+len, "%s\t%08lX\t%08lX\t%02X\t%d\t%lu\t%d\t%08lX\t%d\t%lu\t%u\n", - r->rt_dev->name, r->rt_dst, r->rt_gateway, - r->rt_flags, r->rt_refcnt, r->rt_use, r->rt_metric, - r->rt_mask, (int)r->rt_mss, r->rt_window, (int)r->rt_irtt); - len+=size; - pos+=size; - if(pos<offset) - { - len=0; - begin=pos; - } - if(pos>offset+length) - break; - } - - *start=buffer+(offset-begin); - len-=(offset-begin); - if(len>length) - len=length; - return len; -} - -/* - * This is hackish, but results in better code. Use "-S" to see why. - */ - -#define early_out ({ goto no_route; 1; }) - -/* - * Route a packet. This needs to be fairly quick. Florian & Co. - * suggested a unified ARP and IP routing cache. Done right its - * probably a brilliant idea. I'd actually suggest a unified - * ARP/IP routing/Socket pointer cache. Volunteers welcome - */ - -struct rtable * ip_rt_route(unsigned long daddr, struct options *opt, unsigned long *src_addr) -{ - struct rtable *rt; - - for (rt = rt_base; rt != NULL || early_out ; rt = rt->rt_next) - { - if (!((rt->rt_dst ^ daddr) & rt->rt_mask)) - break; - /* - * broadcast addresses can be special cases.. - */ - if (rt->rt_flags & RTF_GATEWAY) - continue; - if ((rt->rt_dev->flags & IFF_BROADCAST) && - (rt->rt_dev->pa_brdaddr == daddr)) - break; - } - - if(rt->rt_flags&RTF_REJECT) - return NULL; - - if(src_addr!=NULL) - *src_addr= rt->rt_dev->pa_addr; - - if (daddr == rt->rt_dev->pa_addr) { - if ((rt = rt_loopback) == NULL) - goto no_route; - } - rt->rt_use++; - return rt; -no_route: - return NULL; -} - -struct rtable * ip_rt_local(unsigned long daddr, struct options *opt, unsigned long *src_addr) -{ - struct rtable *rt; - - for (rt = rt_base; rt != NULL || early_out ; rt = rt->rt_next) - { - /* - * No routed addressing. - */ - if (rt->rt_flags&RTF_GATEWAY) - continue; - - if (!((rt->rt_dst ^ daddr) & rt->rt_mask)) - break; - /* - * broadcast addresses can be special cases.. 
- */ - - if ((rt->rt_dev->flags & IFF_BROADCAST) && - rt->rt_dev->pa_brdaddr == daddr) - break; - } - - if(src_addr!=NULL) - *src_addr= rt->rt_dev->pa_addr; - - if (daddr == rt->rt_dev->pa_addr) { - if ((rt = rt_loopback) == NULL) - goto no_route; - } - rt->rt_use++; - return rt; -no_route: - return NULL; + err=rt_del((__u32)trg->sin_addr.s_addr, (__u32)msk->sin_addr.s_addr, dev, + (__u32)gtw->sin_addr.s_addr, r->rt_flags, r->rt_metric - 1); + return err; } /* @@ -643,12 +1696,30 @@ int ip_rt_ioctl(unsigned int cmd, void *arg) case SIOCDELRT: /* Delete a route */ if (!suser()) return -EPERM; - err=verify_area(VERIFY_READ, arg, sizeof(struct rtentry)); + err = copy_from_user(&rt, arg, sizeof(struct rtentry)); if (err) - return err; - memcpy_fromfs(&rt, arg, sizeof(struct rtentry)); - return (cmd == SIOCDELRT) ? rt_kill(&rt) : rt_new(&rt); + return -EFAULT; + return (cmd == SIOCDELRT) ? ip_rt_kill(&rt) : ip_rt_new(&rt); } return -EINVAL; } + +void ip_rt_advice(struct rtable **rp, int advice) +{ + /* Thanks! */ + return; +} + +void ip_rt_update(int event, struct device *dev) +{ +/* + * This causes too much grief to do now. + */ +#ifdef COMING_IN_2_1 + if (event == NETDEV_UP) + rt_add(RTF_HOST|RTF_UP, dev->pa_addr, ~0, 0, dev, 0, 0, 0, 0); + else if (event == NETDEV_DOWN) + rt_del(dev->pa_addr, ~0, dev, 0, RTF_HOST|RTF_UP, 0); +#endif +} diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c new file mode 100644 index 000000000..5f6328d68 --- /dev/null +++ b/net/ipv4/sysctl_net_ipv4.c @@ -0,0 +1,66 @@ +/* -*- linux-c -*- + * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/ipv4 directory entry (empty =) ). 
[MS] + */ + +#include <linux/mm.h> +#include <linux/sysctl.h> +#include <net/tcp.h> + +/* + * TCP configuration parameters + */ + +#define TCP_PMTU_DISC 0x00000001 /* perform PMTU discovery */ +#define TCP_CONG_AVOID 0x00000002 /* congestion avoidance algorithm */ +#define TCP_DELAY_ACKS 0x00000003 /* delayed ack stategy */ + +#if 0 +static int boolean_min = 0; +static int boolean_max = 1; +#endif + +/* From arp.c */ +extern int sysctl_arp_res_time; +extern int sysctl_arp_dead_res_time; +extern int sysctl_arp_max_tries; +extern int sysctl_arp_timeout; +extern int sysctl_arp_check_interval; +extern int sysctl_arp_confirm_interval; +extern int sysctl_arp_confirm_timeout; + +extern int sysctl_tcp_cong_avoidance; +extern int tcp_sysctl_congavoid(ctl_table *ctl, int write, struct file * filp, + void *buffer, size_t *lenp); + +ctl_table ipv4_table[] = { + {NET_IPV4_ARP_RES_TIME, "arp_res_time", + &sysctl_arp_res_time, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_ARP_DEAD_RES_TIME, "arp_dead_res_time", + &sysctl_arp_dead_res_time, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_ARP_MAX_TRIES, "arp_max_tries", + &sysctl_arp_max_tries, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_ARP_TIMEOUT, "arp_timeout", + &sysctl_arp_timeout, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_ARP_CHECK_INTERVAL, "arp_check_interval", + &sysctl_arp_check_interval, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_ARP_CONFIRM_INTERVAL, "arp_confirm_interval", + &sysctl_arp_confirm_interval, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_ARP_CONFIRM_TIMEOUT, "arp_confirm_timeout", + &sysctl_arp_confirm_timeout, sizeof(int), 0644, NULL, + &proc_dointvec}, +#if 0 + {TCP_PMTU_DISC, "tcp_pmtu_discovery", + &ipv4_pmtu_discovery, sizeof(int), 644, + NULL, &proc_dointvec, &sysctl_intvec_minmax, + &boolean_min, &boolean_max}, +#endif + + {NET_IPV4_TCP_VEGAS_CONG_AVOID, "tcp_vegas_cong_avoid", + &sysctl_tcp_cong_avoidance, sizeof(int), 0644, + NULL, 
&tcp_sysctl_congavoid }, + {0} +}; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 352e1a101..ac6e2ea53 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -19,50 +19,64 @@ * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> * - * Fixes: + * Fixes: * Alan Cox : Numerous verify_area() calls * Alan Cox : Set the ACK bit on a reset - * Alan Cox : Stopped it crashing if it closed while sk->inuse=1 - * and was trying to connect (tcp_err()). + * Alan Cox : Stopped it crashing if it closed while + * sk->inuse=1 and was trying to connect + * (tcp_err()). * Alan Cox : All icmp error handling was broken * pointers passed where wrong and the * socket was looked up backwards. Nobody * tested any icmp error code obviously. - * Alan Cox : tcp_err() now handled properly. It wakes people - * on errors. select behaves and the icmp error race + * Alan Cox : tcp_err() now handled properly. It + * wakes people on errors. select + * behaves and the icmp error race * has gone by moving it into sock.c - * Alan Cox : tcp_reset() fixed to work for everything not just - * packets for unknown sockets. + * Alan Cox : tcp_send_reset() fixed to work for + * everything not just packets for + * unknown sockets. * Alan Cox : tcp option processing. - * Alan Cox : Reset tweaked (still not 100%) [Had syn rule wrong] + * Alan Cox : Reset tweaked (still not 100%) [Had + * syn rule wrong] * Herp Rosmanith : More reset fixes - * Alan Cox : No longer acks invalid rst frames. Acking - * any kind of RST is right out. - * Alan Cox : Sets an ignore me flag on an rst receive - * otherwise odd bits of prattle escape still - * Alan Cox : Fixed another acking RST frame bug. Should stop - * LAN workplace lockups. - * Alan Cox : Some tidyups using the new skb list facilities + * Alan Cox : No longer acks invalid rst frames. + * Acking any kind of RST is right out. 
+ * Alan Cox : Sets an ignore me flag on an rst + * receive otherwise odd bits of prattle + * escape still + * Alan Cox : Fixed another acking RST frame bug. + * Should stop LAN workplace lockups. + * Alan Cox : Some tidyups using the new skb list + * facilities * Alan Cox : sk->keepopen now seems to work * Alan Cox : Pulls options out correctly on accepts * Alan Cox : Fixed assorted sk->rqueue->next errors - * Alan Cox : PSH doesn't end a TCP read. Switched a bit to skb ops. - * Alan Cox : Tidied tcp_data to avoid a potential nasty. - * Alan Cox : Added some better commenting, as the tcp is hard to follow + * Alan Cox : PSH doesn't end a TCP read. Switched a + * bit to skb ops. + * Alan Cox : Tidied tcp_data to avoid a potential + * nasty. + * Alan Cox : Added some better commenting, as the + * tcp is hard to follow * Alan Cox : Removed incorrect check for 20 * psh * Michael O'Reilly : ack < copied bug fix. * Johannes Stille : Misc tcp fixes (not all in yet). * Alan Cox : FIN with no memory -> CRASH - * Alan Cox : Added socket option proto entries. Also added awareness of them to accept. + * Alan Cox : Added socket option proto entries. + * Also added awareness of them to accept. * Alan Cox : Added TCP options (SOL_TCP) - * Alan Cox : Switched wakeup calls to callbacks, so the kernel can layer network sockets. + * Alan Cox : Switched wakeup calls to callbacks, + * so the kernel can layer network + * sockets. * Alan Cox : Use ip_tos/ip_ttl settings. * Alan Cox : Handle FIN (more) properly (we hope). - * Alan Cox : RST frames sent on unsynchronised state ack error/ + * Alan Cox : RST frames sent on unsynchronised + * state ack error. * Alan Cox : Put in missing check for SYN bit. - * Alan Cox : Added tcp_select_window() aka NET2E + * Alan Cox : Added tcp_select_window() aka NET2E * window non shrink trick. 
- * Alan Cox : Added a couple of small NET2E timer fixes + * Alan Cox : Added a couple of small NET2E timer + * fixes * Charles Hedrick : TCP fixes * Toomas Tamm : TCP window fixes * Alan Cox : Small URG fix to rlogin ^C ack fight @@ -75,22 +89,29 @@ * Adam Caldwell : Assorted timer/timing errors * Matthew Dillon : Fixed another RST bug * Alan Cox : Move to kernel side addressing changes. - * Alan Cox : Beginning work on TCP fastpathing (not yet usable) + * Alan Cox : Beginning work on TCP fastpathing + * (not yet usable) * Arnt Gulbrandsen: Turbocharged tcp_check() routine. * Alan Cox : TCP fast path debugging * Alan Cox : Window clamping * Michael Riepe : Bug in tcp_check() * Matt Dillon : More TCP improvements and RST bug fixes - * Matt Dillon : Yet more small nasties remove from the TCP code - * (Be very nice to this man if tcp finally works 100%) 8) - * Alan Cox : BSD accept semantics. + * Matt Dillon : Yet more small nasties remove from the + * TCP code (Be very nice to this man if + * tcp finally works 100%) 8) + * Alan Cox : BSD accept semantics. * Alan Cox : Reset on closedown bug. * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). - * Michael Pall : Handle select() after URG properly in all cases. - * Michael Pall : Undo the last fix in tcp_read_urg() (multi URG PUSH broke rlogin). - * Michael Pall : Fix the multi URG PUSH problem in tcp_readable(), select() after URG works now. - * Michael Pall : recv(...,MSG_OOB) never blocks in the BSD api. - * Alan Cox : Changed the semantics of sk->socket to + * Michael Pall : Handle select() after URG properly in + * all cases. + * Michael Pall : Undo the last fix in tcp_read_urg() + * (multi URG PUSH broke rlogin). + * Michael Pall : Fix the multi URG PUSH problem in + * tcp_readable(), select() after URG + * works now. + * Michael Pall : recv(...,MSG_OOB) never blocks in the + * BSD api. 
+ * Alan Cox : Changed the semantics of sk->socket to * fix a race and a signal problem with * accept() and async I/O. * Alan Cox : Relaxed the rules on tcp_sendto(). @@ -107,21 +128,23 @@ * Alan Cox : Reset tracing code. * Alan Cox : Spurious resets on shutdown. * Alan Cox : Giant 15 minute/60 second timer error - * Alan Cox : Small whoops in selecting before an accept. - * Alan Cox : Kept the state trace facility since it's - * handy for debugging. + * Alan Cox : Small whoops in selecting before an + * accept. + * Alan Cox : Kept the state trace facility since + * it's handy for debugging. * Alan Cox : More reset handler fixes. - * Alan Cox : Started rewriting the code based on the RFC's - * for other useful protocol references see: - * Comer, KA9Q NOS, and for a reference on the - * difference between specifications and how BSD + * Alan Cox : Started rewriting the code based on + * the RFC's for other useful protocol + * references see: Comer, KA9Q NOS, and + * for a reference on the difference + * between specifications and how BSD * works see the 4.4lite source. - * A.N.Kuznetsov : Don't time wait on completion of tidy + * A.N.Kuznetsov : Don't time wait on completion of tidy * close. * Linus Torvalds : Fin/Shutdown & copied_seq changes. * Linus Torvalds : Fixed BSD port reuse to work first syn - * Alan Cox : Reimplemented timers as per the RFC and using multiple - * timers for sanity. + * Alan Cox : Reimplemented timers as per the RFC + * and using multiple timers for sanity. * Alan Cox : Small bug fixes, and a lot of new * comments. * Alan Cox : Fixed dual reader crash by locking @@ -140,23 +163,49 @@ * Alan Cox : Cache last socket. * Alan Cox : Per route irtt. * Matt Day : Select() match BSD precisely on error - * - * + * Alan Cox : New buffers + * Marc Tamsky : Various sk->prot->retransmits and + * sk->retransmits misupdating fixed. + * Fixed tcp_write_timeout: stuck close, + * and TCP syn retries gets used now. 
+ * Mark Yarvis : In tcp_read_wakeup(), don't send an + * ack if stat is TCP_CLOSED. + * Alan Cox : Look up device on a retransmit - routes may + * change. Doesn't yet cope with MSS shrink right + * but its a start! + * Marc Tamsky : Closing in closing fixes. + * Mike Shaver : RFC1122 verifications. + * Alan Cox : rcv_saddr errors. + * Alan Cox : Block double connect(). + * Alan Cox : Small hooks for enSKIP. + * Alexey Kuznetsov: Path MTU discovery. + * Alan Cox : Support soft errors. + * Alan Cox : Fix MTU discovery pathological case + * when the remote claims no mtu! + * Marc Tamsky : TCP_CLOSE fix. + * Colin (G3TNE) : Send a reset on syn ack replies in + * window but wrong (fixes NT lpd problems) + * Pedro Roque : Better TCP window handling, delayed ack. + * Joerg Reuter : No modification of locked buffers in + * tcp_do_retransmit() + * Eric Schenk : Changed receiver side silly window + * avoidance algorithm to BSD style + * algorithm. This doubles throughput + * against machines running Solaris, + * and seems to result in general + * improvement. + * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD + * Willy Konynenberg : Transparent proxying support. + * * To Fix: * Fast path the code. Two things here - fix the window calculation * so it doesn't iterate over the queue, also spot packets with no funny * options arriving in order and process directly. * - * Implement RFC 1191 [Path MTU discovery] - * Look at the effect of implementing RFC 1337 suggestions and their impact. - * Rewrite output state machine to use a single queue and do low window - * situations as per the spec (RFC 1122) + * Rewrite output state machine to use a single queue. * Speed up input assembly algorithm. - * RFC1323 - PAWS and window scaling. PAWS is required for IPv6 so we - * could do with it working on IPv4 + * RFC1323 - PAWS and window scaling. * User settable/learned rtt/max window/mtu - * Cope with MTU/device switches when retransmitting in tcp. 
- * Fix the window handling to use PR's new code. * * Change the fundamental structure to a single send queue maintained * by TCP (removing the bogus ip stuff [thus fixing mtu drops on @@ -167,7 +216,7 @@ * tcp_data/tcp_read as well as the window shrink crud. * Separate out duplicated code - tcp_alloc_skb, tcp_build_ack * tcp_queue_skb seem obvious routines to extract. - * + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -204,206 +253,224 @@ * TCP_LAST_ACK out side has shutdown after remote has * shutdown. There may still be data in our * buffer that we have to finish sending - * + * * TCP_CLOSE socket is finished */ +/* + * RFC1122 status: + * NOTE: I'm not going to be doing comments in the code for this one except + * for violations and the like. tcp.c is just too big... If I say something + * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out + * with Alan. -- MS 950903 + * + * Use of PSH (4.2.2.2) + * MAY aggregate data sent without the PSH flag. (does) + * MAY queue data received without the PSH flag. (does) + * SHOULD collapse successive PSH flags when it packetizes data. (doesn't) + * MAY implement PSH on send calls. (doesn't, thus:) + * MUST NOT buffer data indefinitely (doesn't [1 second]) + * MUST set PSH on last segment (does) + * MAY pass received PSH to application layer (doesn't) + * SHOULD send maximum-sized segment whenever possible. (almost always does) + * + * Window Size (4.2.2.3, 4.2.2.16) + * MUST treat window size as an unsigned number (does) + * SHOULD treat window size as a 32-bit number (does not) + * MUST NOT shrink window once it is offered (does not normally) + * + * Urgent Pointer (4.2.2.4) + * **MUST point urgent pointer to last byte of urgent data (not right + * after). 
(doesn't, to be like BSD) + * MUST inform application layer asynchronously of incoming urgent + * data. (does) + * MUST provide application with means of determining the amount of + * urgent data pending. (does) + * **MUST support urgent data sequence of arbitrary length. (doesn't, but + * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity) + * [Follows BSD 1 byte of urgent data] + * + * TCP Options (4.2.2.5) + * MUST be able to receive TCP options in any segment. (does) + * MUST ignore unsupported options (does) + * + * Maximum Segment Size Option (4.2.2.6) + * MUST implement both sending and receiving MSS. (does) + * SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send + * it always). (does, even when MSS == 536, which is legal) + * MUST assume MSS == 536 if no MSS received at connection setup (does) + * MUST calculate "effective send MSS" correctly: + * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts) + * (does - but allows operator override) + * + * TCP Checksum (4.2.2.7) + * MUST generate and check TCP checksum. (does) + * + * Initial Sequence Number Selection (4.2.2.8) + * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's + * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is + * necessary for 10Mbps networks - and harder than BSD to spoof!) + * + * Simultaneous Open Attempts (4.2.2.10) + * MUST support simultaneous open attempts (does) + * + * Recovery from Old Duplicate SYN (4.2.2.11) + * MUST keep track of active vs. passive open (does) + * + * RST segment (4.2.2.12) + * SHOULD allow an RST segment to contain data (does, but doesn't do + * anything with it, which is standard) + * + * Closing a Connection (4.2.2.13) + * MUST inform application of whether connection was closed by RST or + * normal close. (does) + * MAY allow "half-duplex" close (treat connection as closed for the + * local app, even before handshake is done). 
(does) + * MUST linger in TIME_WAIT for 2 * MSL (does) + * + * Retransmission Timeout (4.2.2.15) + * MUST implement Jacobson's slow start and congestion avoidance + * stuff. (does) + * + * Probing Zero Windows (4.2.2.17) + * MUST support probing of zero windows. (does) + * MAY keep offered window closed indefinitely. (does) + * MUST allow remote window to stay closed indefinitely. (does) + * + * Passive Open Calls (4.2.2.18) + * MUST NOT let new passive open affect other connections. (doesn't) + * MUST support passive opens (LISTENs) concurrently. (does) + * + * Time to Live (4.2.2.19) + * MUST make TCP TTL configurable. (does - IP_TTL option) + * + * Event Processing (4.2.2.20) + * SHOULD queue out-of-order segments. (does) + * MUST aggregate ACK segments whenever possible. (does but badly) + * + * Retransmission Timeout Calculation (4.2.3.1) + * MUST implement Karn's algorithm and Jacobson's algorithm for RTO + * calculation. (does, or at least explains them in the comments 8*b) + * SHOULD initialize RTO to 0 and RTT to 3. (does) + * + * When to Send an ACK Segment (4.2.3.2) + * SHOULD implement delayed ACK. (does) + * MUST keep ACK delay < 0.5 sec. (does) + * + * When to Send a Window Update (4.2.3.3) + * MUST implement receiver-side SWS. (does) + * + * When to Send Data (4.2.3.4) + * MUST implement sender-side SWS. (does) + * SHOULD implement Nagle algorithm. (does) + * + * TCP Connection Failures (4.2.3.5) + * MUST handle excessive retransmissions "properly" (see the RFC). (does) + * SHOULD inform application layer of soft errors. (does) + * + * TCP Keep-Alives (4.2.3.6) + * MAY provide keep-alives. (does) + * MUST make keep-alives configurable on a per-connection basis. (does) + * MUST default to no keep-alives. (does) + * **MUST make keep-alive interval configurable. (doesn't) + * **MUST make default keep-alive interval > 2 hours. (doesn't) + * MUST NOT interpret failure to ACK keep-alive packet as dead + * connection. 
(doesn't) + * SHOULD send keep-alive with no data. (does) + * + * TCP Multihoming (4.2.3.7) + * MUST get source address from IP layer before sending first + * SYN. (does) + * MUST use same local address for all segments of a connection. (does) + * + * IP Options (4.2.3.8) + * MUST ignore unsupported IP options. (does) + * MAY support Time Stamp and Record Route. (does) + * MUST allow application to specify a source route. (does) + * MUST allow received Source Route option to set route for all future + * segments on this connection. (does not (security issues)) + * + * ICMP messages (4.2.3.9) + * MUST act on ICMP errors. (does) + * MUST slow transmission upon receipt of a Source Quench. (does) + * MUST NOT abort connection upon receipt of soft Destination + * Unreachables (0, 1, 5), Time Exceededs and Parameter + * Problems. (doesn't) + * SHOULD report soft Destination Unreachables etc. to the + * application. (does) + * SHOULD abort connection upon receipt of hard Destination Unreachable + * messages (2, 3, 4). (does) + * + * Remote Address Validation (4.2.3.10) + * MUST reject as an error OPEN for invalid remote IP address. (does) + * MUST ignore SYN with invalid source address. (does) + * MUST silently discard incoming SYN for broadcast/multicast + * address. (does) + * + * Asynchronous Reports (4.2.4.1) + * MUST provide mechanism for reporting soft errors to application + * layer. (does) + * + * Type of Service (4.2.4.2) + * MUST allow application layer to set Type of Service. (does IP_TOS) + * + * (Whew. 
-- MS 950903) + **/ + #include <linux/types.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/time.h> -#include <linux/string.h> -#include <linux/config.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/termios.h> -#include <linux/in.h> #include <linux/fcntl.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <net/snmp.h> -#include <net/ip.h> -#include <net/protocol.h> + #include <net/icmp.h> #include <net/tcp.h> -#include <net/arp.h> -#include <linux/skbuff.h> -#include <net/sock.h> -#include <net/route.h> -#include <linux/errno.h> -#include <linux/timer.h> -#include <asm/system.h> -#include <asm/segment.h> -#include <linux/mm.h> -#include <net/checksum.h> -/* - * The MSL timer is the 'normal' timer. - */ - -#define reset_msl_timer(x,y,z) reset_timer(x,y,z) +#include <asm/uaccess.h> -#define SEQ_TICK 3 unsigned long seq_offset; struct tcp_mib tcp_statistics; -/* - * Cached last hit socket - */ - -volatile unsigned long th_cache_saddr,th_cache_daddr; -volatile unsigned short th_cache_dport, th_cache_sport; -volatile struct sock *th_cache_sk; - -void tcp_cache_zap(void) -{ - unsigned long flags; - save_flags(flags); - cli(); - th_cache_saddr=0; - th_cache_daddr=0; - th_cache_dport=0; - th_cache_sport=0; - th_cache_sk=NULL; - restore_flags(flags); -} - -static void tcp_close(struct sock *sk, int timeout); /* - * The less said about this the better, but it works and will do for 1.2 + * Find someone to 'accept'. 
Must be called with + * the socket locked or with interrupts disabled */ -static struct wait_queue *master_select_wakeup; - -static __inline__ int min(unsigned int a, unsigned int b) +static struct open_request *tcp_find_established(struct tcp_opt *tp) { - if (a < b) - return(a); - return(b); -} - -#undef STATE_TRACE - -#ifdef STATE_TRACE -static char *statename[]={ - "Unused","Established","Syn Sent","Syn Recv", - "Fin Wait 1","Fin Wait 2","Time Wait", "Close", - "Close Wait","Last ACK","Listen","Closing" -}; -#endif + struct open_request *req; -static __inline__ void tcp_set_state(struct sock *sk, int state) -{ - if(sk->state==TCP_ESTABLISHED) - tcp_statistics.TcpCurrEstab--; -#ifdef STATE_TRACE - if(sk->debug) - printk("TCP sk=%p, State %s -> %s\n",sk, statename[sk->state],statename[state]); -#endif - /* This is a hack but it doesn't occur often and it's going to - be a real to fix nicely */ - - if(state==TCP_ESTABLISHED && sk->state==TCP_SYN_RECV) - { - wake_up_interruptible(&master_select_wakeup); - } - sk->state=state; - if(state==TCP_ESTABLISHED) - tcp_statistics.TcpCurrEstab++; -} + req = tp->syn_wait_queue; -/* - * This routine picks a TCP windows for a socket based on - * the following constraints - * - * 1. The window can never be shrunk once it is offered (RFC 793) - * 2. We limit memory per socket - * - * For now we use NET2E3's heuristic of offering half the memory - * we have handy. All is not as bad as this seems however because - * of two things. Firstly we will bin packets even within the window - * in order to get the data we are waiting for into the memory limit. - * Secondly we bin common duplicate forms at receive time - * Better heuristics welcome - */ - -int tcp_select_window(struct sock *sk) -{ - int new_window = sk->prot->rspace(sk); + if (!req) + return NULL; - if(sk->window_clamp) - new_window=min(sk->window_clamp,new_window); - /* - * Two things are going on here. 
First, we don't ever offer a - * window less than min(sk->mss, MAX_WINDOW/2). This is the - * receiver side of SWS as specified in RFC1122. - * Second, we always give them at least the window they - * had before, in order to avoid retracting window. This - * is technically allowed, but RFC1122 advises against it and - * in practice it causes trouble. - * - * Fixme: This doesn't correctly handle the case where - * new_window > sk->window but not by enough to allow for the - * shift in sequence space. - */ - if (new_window < min(sk->mss, MAX_WINDOW/2) || new_window < sk->window) - return(sk->window); - return(new_window); -} + do { + if (req->sk && + (req->sk->state == TCP_ESTABLISHED || + req->sk->state >= TCP_FIN_WAIT1)) + { + return req; + } -/* - * Find someone to 'accept'. Must be called with - * sk->inuse=1 or cli() - */ + req = req->dl_next; -static struct sk_buff *tcp_find_established(struct sock *s) -{ - struct sk_buff *p=skb_peek(&s->receive_queue); - if(p==NULL) - return NULL; - do - { - if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1) - return p; - p=p->next; - } - while(p!=(struct sk_buff *)&s->receive_queue); + } while (req != tp->syn_wait_queue); + return NULL; } /* - * Remove a completed connection and return it. This is used by - * tcp_accept() to get connections from the queue. - */ - -static struct sk_buff *tcp_dequeue_established(struct sock *s) -{ - struct sk_buff *skb; - unsigned long flags; - save_flags(flags); - cli(); - skb=tcp_find_established(s); - if(skb!=NULL) - skb_unlink(skb); /* Take it off the queue */ - restore_flags(flags); - return skb; -} - -/* * This routine closes sockets which have been at least partially * opened, but not yet accepted. Currently it is only called by - * tcp_close, and timeout mirrors the value there. + * tcp_close, and timeout mirrors the value there. 
*/ -static void tcp_close_pending (struct sock *sk) +static void tcp_close_pending (struct sock *sk) { struct sk_buff *skb; - while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) + while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) { - skb->sk->dead=1; tcp_close(skb->sk, 0); kfree_skb(skb, FREE_READ); } @@ -411,432 +478,23 @@ static void tcp_close_pending (struct sock *sk) } /* - * Enter the time wait state. + * Enter the time wait state. */ -static void tcp_time_wait(struct sock *sk) +void tcp_time_wait(struct sock *sk) { tcp_set_state(sk,TCP_TIME_WAIT); sk->shutdown = SHUTDOWN_MASK; if (!sk->dead) sk->state_change(sk); - reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); -} - -/* - * A socket has timed out on its send queue and wants to do a - * little retransmitting. Currently this means TCP. - */ - -void tcp_do_retransmit(struct sock *sk, int all) -{ - struct sk_buff * skb; - struct proto *prot; - struct device *dev; - int ct=0; - - prot = sk->prot; - skb = sk->send_head; - - while (skb != NULL) - { - struct tcphdr *th; - struct iphdr *iph; - int size; - - dev = skb->dev; - IS_SKB(skb); - skb->when = jiffies; - - /* - * In general it's OK just to use the old packet. However we - * need to use the current ack and window fields. Urg and - * urg_ptr could possibly stand to be updated as well, but we - * don't keep the necessary data. That shouldn't be a problem, - * if the other end is doing the right thing. Since we're - * changing the packet, we have to issue a new IP identifier. - */ - - iph = (struct iphdr *)(skb->data + dev->hard_header_len); - th = (struct tcphdr *)(((char *)iph) + (iph->ihl << 2)); - size = skb->len - (((unsigned char *) th) - skb->data); - - /* - * Note: We ought to check for window limits here but - * currently this is done (less efficiently) elsewhere. - * We do need to check for a route change but can't handle - * that until we have the new 1.3.x buffers in. 
- * - */ - - iph->id = htons(ip_id_count++); - ip_send_check(iph); - - /* - * This is not the right way to handle this. We have to - * issue an up to date window and ack report with this - * retransmit to keep the odd buggy tcp that relies on - * the fact BSD does this happy. - * We don't however need to recalculate the entire - * checksum, so someone wanting a small problem to play - * with might like to implement RFC1141/RFC1624 and speed - * this up by avoiding a full checksum. - */ - - th->ack_seq = ntohl(sk->acked_seq); - th->window = ntohs(tcp_select_window(sk)); - tcp_send_check(th, sk->saddr, sk->daddr, size, sk); - - /* - * If the interface is (still) up and running, kick it. - */ - - if (dev->flags & IFF_UP) - { - /* - * If the packet is still being sent by the device/protocol - * below then don't retransmit. This is both needed, and good - - * especially with connected mode AX.25 where it stops resends - * occurring of an as yet unsent anyway frame! - * We still add up the counts as the round trip time wants - * adjusting. - */ - if (sk && !skb_device_locked(skb)) - { - /* Remove it from any existing driver queue first! */ - skb_unlink(skb); - /* Now queue it */ - ip_statistics.IpOutRequests++; - dev_queue_xmit(skb, dev, sk->priority); - } - } - - /* - * Count retransmissions - */ - - ct++; - sk->prot->retransmits ++; - - /* - * Only one retransmit requested. - */ - - if (!all) - break; - - /* - * This should cut it off before we send too many packets. - */ - - if (ct >= sk->cong_window) - break; - skb = skb->link3; - } -} - -/* - * Reset the retransmission timer - */ - -static void reset_xmit_timer(struct sock *sk, int why, unsigned long when) -{ - del_timer(&sk->retransmit_timer); - sk->ip_xmit_timeout = why; - if((int)when < 0) - { - when=3; - printk("Error: Negative timer in xmit_timer\n"); - } - sk->retransmit_timer.expires=when; - add_timer(&sk->retransmit_timer); -} - -/* - * This is the normal code called for timeouts. 
It does the retransmission - * and then does backoff. tcp_do_retransmit is separated out because - * tcp_ack needs to send stuff from the retransmit queue without - * initiating a backoff. - */ - - -void tcp_retransmit_time(struct sock *sk, int all) -{ - tcp_do_retransmit(sk, all); - - /* - * Increase the timeout each time we retransmit. Note that - * we do not increase the rtt estimate. rto is initialized - * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests - * that doubling rto each time is the least we can get away with. - * In KA9Q, Karn uses this for the first few times, and then - * goes to quadratic. netBSD doubles, but only goes up to *64, - * and clamps at 1 to 64 sec afterwards. Note that 120 sec is - * defined in the protocol as the maximum possible RTT. I guess - * we'll have to use something other than TCP to talk to the - * University of Mars. - * - * PAWS allows us longer timeouts and large windows, so once - * implemented ftp to mars will work nicely. We will have to fix - * the 120 second clamps though! - */ - - sk->retransmits++; - sk->backoff++; - sk->rto = min(sk->rto << 1, 120*HZ); - reset_xmit_timer(sk, TIME_WRITE, sk->rto); -} - - -/* - * A timer event has trigger a tcp retransmit timeout. The - * socket xmit queue is ready and set up to send. Because - * the ack receive code keeps the queue straight we do - * nothing clever here. - */ - -static void tcp_retransmit(struct sock *sk, int all) -{ - if (all) - { - tcp_retransmit_time(sk, all); - return; - } - - sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */ - /* sk->ssthresh in theory can be zero. I guess that's OK */ - sk->cong_count = 0; - - sk->cong_window = 1; - - /* Do the actual retransmit. */ - tcp_retransmit_time(sk, all); -} - -/* - * A write timeout has occurred. Process the after effects. - */ - -static int tcp_write_timeout(struct sock *sk) -{ - /* - * Look for a 'soft' timeout. 
- */ - if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7)) - || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1)) - { - /* - * Attempt to recover if arp has changed (unlikely!) or - * a route has shifted (not supported prior to 1.3). - */ - arp_destroy (sk->daddr, 0); - /*ip_route_check (sk->daddr);*/ - } - /* - * Has it gone just too far ? - */ - if (sk->retransmits > TCP_RETR2) - { - sk->err = ETIMEDOUT; - sk->error_report(sk); - del_timer(&sk->retransmit_timer); - /* - * Time wait the socket - */ - if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING ) - { - tcp_set_state(sk,TCP_TIME_WAIT); - reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); - } - else - { - /* - * Clean up time. - */ - tcp_set_state(sk, TCP_CLOSE); - return 0; - } - } - return 1; -} - -/* - * The TCP retransmit timer. This lacks a few small details. - * - * 1. An initial rtt timeout on the probe0 should cause what we can - * of the first write queue buffer to be split and sent. - * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report - * ETIMEDOUT if we know an additional 'soft' error caused this. - * tcp_err should save a 'soft error' for us. - */ - -static void retransmit_timer(unsigned long data) -{ - struct sock *sk = (struct sock*)data; - int why = sk->ip_xmit_timeout; - - /* - * only process if socket is not in use - */ - - cli(); - if (sk->inuse || in_bh) - { - /* Try again in 1 second */ - sk->retransmit_timer.expires = HZ; - add_timer(&sk->retransmit_timer); - sti(); - return; - } - - sk->inuse = 1; - sti(); - - /* Always see if we need to send an ack. */ - - if (sk->ack_backlog && !sk->zapped) - { - sk->prot->read_wakeup (sk); - if (! sk->dead) - sk->data_ready(sk,0); - } - - /* Now we need to figure out why the socket was on the timer. 
*/ - - switch (why) - { - /* Window probing */ - case TIME_PROBE0: - tcp_send_probe0(sk); - tcp_write_timeout(sk); - break; - /* Retransmitting */ - case TIME_WRITE: - /* It could be we got here because we needed to send an ack. - * So we need to check for that. - */ - { - struct sk_buff *skb; - unsigned long flags; - - save_flags(flags); - cli(); - skb = sk->send_head; - if (!skb) - { - restore_flags(flags); - } - else - { - /* - * Kicked by a delayed ack. Reset timer - * correctly now - */ - if (jiffies < skb->when + sk->rto) - { - reset_xmit_timer (sk, TIME_WRITE, skb->when + sk->rto - jiffies); - restore_flags(flags); - break; - } - restore_flags(flags); - /* - * Retransmission - */ - sk->prot->retransmit (sk, 0); - tcp_write_timeout(sk); - } - break; - } - /* Sending Keepalives */ - case TIME_KEEPOPEN: - /* - * this reset_timer() call is a hack, this is not - * how KEEPOPEN is supposed to work. - */ - reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN); - - /* Send something to keep the connection open. */ - if (sk->prot->write_wakeup) - sk->prot->write_wakeup (sk); - sk->retransmits++; - tcp_write_timeout(sk); - break; - default: - printk ("rexmit_timer: timer expired - reason unknown\n"); - break; - } - release_sock(sk); -} - -/* - * This routine is called by the ICMP module when it gets some - * sort of error condition. If err < 0 then the socket should - * be closed and the error returned to the user. If err > 0 - * it's just the icmp type << 8 | icmp code. After adjustment - * header points to the first 8 bytes of the tcp header. We need - * to find the appropriate port. 
- */ - -void tcp_err(int err, unsigned char *header, unsigned long daddr, - unsigned long saddr, struct inet_protocol *protocol) -{ - struct tcphdr *th; - struct sock *sk; - struct iphdr *iph=(struct iphdr *)header; - - header+=4*iph->ihl; - - - th =(struct tcphdr *)header; - sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr); - - if (sk == NULL) - return; - - if(err<0) - { - sk->err = -err; - sk->error_report(sk); - return; - } - - if ((err & 0xff00) == (ICMP_SOURCE_QUENCH << 8)) - { - /* - * FIXME: - * For now we will just trigger a linear backoff. - * The slow start code should cause a real backoff here. - */ - if (sk->cong_window > 4) - sk->cong_window--; - return; - } - -/* sk->err = icmp_err_convert[err & 0xff].errno; -- moved as TCP should hide non fatals internally (and does) */ - - /* - * If we've already connected we will keep trying - * until we time out, or the user gives up. - */ - - if (icmp_err_convert[err & 0xff].fatal || sk->state == TCP_SYN_SENT) - { - if (sk->state == TCP_SYN_SENT) - { - tcp_statistics.TcpAttemptFails++; - tcp_set_state(sk,TCP_CLOSE); - sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ - } - sk->err = icmp_err_convert[err & 0xff].errno; - } - return; + tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); } /* - * Walk down the receive queue counting readable data until we hit the end or we find a gap - * in the received data queue (ie a frame missing that needs sending to us). Not - * sorting using two queues as data arrives makes life so much harder. + * Walk down the receive queue counting readable data until we hit the + * end or we find a gap in the received data queue (ie a frame missing + * that needs sending to us). 
*/ static int tcp_readable(struct sock *sk) @@ -855,29 +513,35 @@ static int tcp_readable(struct sock *sk) if (sk == NULL || (skb = skb_peek(&sk->receive_queue)) == NULL) { restore_flags(flags); - if(sk && sk->debug) + if(sk && sk->debug) printk("empty\n"); return(0); } - + counted = sk->copied_seq; /* Where we are at the moment */ amount = 0; - - /* - * Do until a push or until we are out of data. + + /* + * Do until a push or until we are out of data. */ - - do + + do { - if (before(counted, skb->h.th->seq)) /* Found a hole so stops here */ + /* Found a hole so stops here */ + if (before(counted, skb->seq)) break; - sum = skb->len -(counted - skb->h.th->seq); /* Length - header but start from where we are up to (avoid overlaps) */ + /* + * Length - header but start from where we are up to + * avoid overlaps + */ + sum = skb->len - (counted - skb->seq); if (skb->h.th->syn) sum++; - if (sum > 0) - { /* Add it up, move on */ + if (sum > 0) + { + /* Add it up, move on */ amount += sum; - if (skb->h.th->syn) + if (skb->h.th->syn) amount--; counted += sum; } @@ -897,9 +561,13 @@ static int tcp_readable(struct sock *sk) * and a blocking read(). And the queue scan in tcp_read() * was correct. 
Mike <pall@rz.uni-karlsruhe.de> */ + + /* don't count urg data */ if (skb->h.th->urg) - amount--; /* don't count urg data */ + amount--; +#if 0 if (amount && skb->h.th->psh) break; +#endif skb = skb->next; } while(skb != (struct sk_buff *)&sk->receive_queue); @@ -916,28 +584,30 @@ static int tcp_readable(struct sock *sk) static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait) { if (sel_type == SEL_IN) { - int retval; + struct open_request *req; - sk->inuse = 1; - retval = (tcp_find_established(sk) != NULL); + lock_sock(sk); + req = tcp_find_established(&sk->tp_pinfo.af_tcp); release_sock(sk); - if (!retval) - select_wait(&master_select_wakeup,wait); - return retval; + if (req) + return 1; + select_wait(sk->sleep,wait); + return 0; } return 0; } - /* * Wait for a TCP event. * - * Note that we don't need to set "sk->inuse", as the upper select layers + * Note that we don't need to lock the socket, as the upper select layers * take care of normal races (between the test and the event) and we don't * go look at any of the socket buffers directly. 
*/ -static int tcp_select(struct sock *sk, int sel_type, select_table *wait) +int tcp_select(struct sock *sk, int sel_type, select_table *wait) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + if (sk->state == TCP_LISTEN) return tcp_listen_select(sk, sel_type, wait); @@ -951,11 +621,11 @@ static int tcp_select(struct sock *sk, int sel_type, select_table *wait) if (sk->shutdown & RCV_SHUTDOWN) return 1; - if (sk->acked_seq == sk->copied_seq) + if (tp->rcv_nxt == sk->copied_seq) break; if (sk->urg_seq != sk->copied_seq || - sk->acked_seq != sk->copied_seq+1 || + tp->rcv_nxt != sk->copied_seq+1 || sk->urginline || !sk->urg_data) return 1; break; @@ -963,7 +633,7 @@ static int tcp_select(struct sock *sk, int sel_type, select_table *wait) case SEL_OUT: if (sk->err) return 1; - if (sk->shutdown & SEND_SHUTDOWN) + if (sk->shutdown & SEND_SHUTDOWN) return 0; if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) break; @@ -972,7 +642,7 @@ static int tcp_select(struct sock *sk, int sel_type, select_table *wait) * by Matt Dillon. 
*/ - if (sk->prot->wspace(sk) < sk->mtu+128+sk->prot->max_header) + if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header) break; return 1; @@ -987,8 +657,7 @@ static int tcp_select(struct sock *sk, int sel_type, select_table *wait) int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) { - int err; - switch(cmd) + switch(cmd) { case TIOCINQ: @@ -998,42 +667,26 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) { unsigned long amount; - if (sk->state == TCP_LISTEN) + if (sk->state == TCP_LISTEN) return(-EINVAL); - sk->inuse = 1; + lock_sock(sk); amount = tcp_readable(sk); release_sock(sk); - err=verify_area(VERIFY_WRITE,(void *)arg, - sizeof(unsigned long)); - if(err) - return err; - put_fs_long(amount,(unsigned long *)arg); - return(0); + return put_user(amount, (int *)arg); } case SIOCATMARK: { int answ = sk->urg_data && sk->urg_seq == sk->copied_seq; - - err = verify_area(VERIFY_WRITE,(void *) arg, - sizeof(unsigned long)); - if (err) - return err; - put_fs_long(answ,(int *) arg); - return(0); + return put_user(answ,(int *) arg); } case TIOCOUTQ: { unsigned long amount; if (sk->state == TCP_LISTEN) return(-EINVAL); - amount = sk->prot->wspace(sk); - err=verify_area(VERIFY_WRITE,(void *)arg, - sizeof(unsigned long)); - if(err) - return err; - put_fs_long(amount,(unsigned long *)arg); - return(0); + amount = sock_wspace(sk); + return put_user(amount, (int *)arg); } default: return(-EINVAL); @@ -1041,353 +694,106 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) } -/* - * This routine computes a TCP checksum. 
- * - * Modified January 1995 from a go-faster DOS routine by - * Jorge Cwik <jorge@laser.satlink.net> - */ - -unsigned short tcp_check(struct tcphdr *th, int len, - unsigned long saddr, unsigned long daddr) -{ - return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP, - csum_partial((char *)th,len,0)); -} - - - -void tcp_send_check(struct tcphdr *th, unsigned long saddr, - unsigned long daddr, int len, struct sock *sk) -{ - th->check = 0; - th->check = tcp_check(th, len, saddr, daddr); - return; -} - -/* - * This is the main buffer sending routine. We queue the buffer - * having checked it is sane seeming. +/* + * This routine builds a generic TCP header. */ -static void tcp_send_skb(struct sock *sk, struct sk_buff *skb) +extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push) { - int size; - struct tcphdr * th = skb->h.th; - - /* - * length of packet (not counting length of pre-tcp headers) - */ - - size = skb->len - ((unsigned char *) th - skb->data); - - /* - * Sanity check it.. - */ - - if (size < sizeof(struct tcphdr) || size > skb->len) - { - printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %lu)\n", - skb, skb->data, th, skb->len); - kfree_skb(skb, FREE_WRITE); - return; - } - - /* - * If we have queued a header size packet.. (these crash a few - * tcp stacks if ack is not set) - */ - - if (size == sizeof(struct tcphdr)) - { - /* If it's got a syn or fin it's notionally included in the size..*/ - if(!th->syn && !th->fin) - { - printk("tcp_send_skb: attempt to queue a bogon.\n"); - kfree_skb(skb,FREE_WRITE); - return; - } - } - - /* - * Actual processing. 
- */ - - tcp_statistics.TcpOutSegs++; - skb->h.seq = ntohl(th->seq) + size - 4*th->doff; - - /* - * We must queue if - * - * a) The right edge of this frame exceeds the window - * b) We are retransmitting (Nagle's rule) - * c) We have too many packets 'in flight' - */ - - if (after(skb->h.seq, sk->window_seq) || - (sk->retransmits && sk->ip_xmit_timeout == TIME_WRITE) || - sk->packets_out >= sk->cong_window) - { - /* checksum will be supplied by tcp_write_xmit. So - * we shouldn't need to set it at all. I'm being paranoid */ - th->check = 0; - if (skb->next != NULL) - { - printk("tcp_send_partial: next != NULL\n"); - skb_unlink(skb); - } - skb_queue_tail(&sk->write_queue, skb); - - /* - * If we don't fit we have to start the zero window - * probes. This is broken - we really need to do a partial - * send _first_ (This is what causes the Cisco and PC/TCP - * grief). - */ - - if (before(sk->window_seq, sk->write_queue.next->h.seq) && - sk->send_head == NULL && sk->ack_backlog == 0) - reset_xmit_timer(sk, TIME_PROBE0, sk->rto); - } - else - { - /* - * This is going straight out - */ - - th->ack_seq = ntohl(sk->acked_seq); - th->window = ntohs(tcp_select_window(sk)); + struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + memcpy(th,(void *) &(sk->dummy_th), sizeof(*th)); + th->seq = htonl(sk->write_seq); - tcp_send_check(th, sk->saddr, sk->daddr, size, sk); + th->psh =(push == 0) ? 1 : 0; - sk->sent_seq = sk->write_seq; - - /* - * This is mad. The tcp retransmit queue is put together - * by the ip layer. This causes half the problems with - * unroutable FIN's and other things. - */ - - sk->prot->queue_xmit(sk, skb->dev, skb, 0); - - /* - * Set for next retransmit based on expected ACK time. - * FIXME: We set this every time which means our - * retransmits are really about a window behind. 
- */ + sk->bytes_rcv = 0; + sk->ack_timed = 0; + th->ack_seq = htonl(tp->rcv_nxt); + th->window = htons(tcp_select_window(sk)); - reset_xmit_timer(sk, TIME_WRITE, sk->rto); - } + return(sizeof(*th)); } /* - * Locking problems lead us to a messy situation where we can have - * multiple partially complete buffers queued up. This is really bad - * as we don't want to be sending partial buffers. Fix this with - * a semaphore or similar to lock tcp_write per socket. - * - * These routines are pretty self descriptive. + * Wait for a socket to get into the connected state */ - -struct sk_buff * tcp_dequeue_partial(struct sock * sk) +static void wait_for_tcp_connect(struct sock * sk) { - struct sk_buff * skb; - unsigned long flags; - - save_flags(flags); + release_sock(sk); cli(); - skb = sk->partial; - if (skb) { - sk->partial = NULL; - del_timer(&sk->partial_timer); + if (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT && sk->err == 0) + { + interruptible_sleep_on(sk->sleep); } - restore_flags(flags); - return skb; + sti(); + lock_sock(sk); } -/* - * Empty the partial queue - */ - -static void tcp_send_partial(struct sock *sk) +static inline int tcp_memory_free(struct sock *sk) { - struct sk_buff *skb; - - if (sk == NULL) - return; - while ((skb = tcp_dequeue_partial(sk)) != NULL) - tcp_send_skb(sk, skb); + return sk->wmem_alloc < sk->sndbuf; } /* - * Queue a partial frame + * Wait for more memory for a socket */ - -void tcp_enqueue_partial(struct sk_buff * skb, struct sock * sk) +static void wait_for_tcp_memory(struct sock * sk) { - struct sk_buff * tmp; - unsigned long flags; + release_sock(sk); + if (!tcp_memory_free(sk)) { + struct wait_queue wait = { current, NULL }; - save_flags(flags); - cli(); - tmp = sk->partial; - if (tmp) - del_timer(&sk->partial_timer); - sk->partial = skb; - init_timer(&sk->partial_timer); - /* - * Wait up to 1 second for the buffer to fill. 
- */ - sk->partial_timer.expires = HZ; - sk->partial_timer.function = (void (*)(unsigned long)) tcp_send_partial; - sk->partial_timer.data = (unsigned long) sk; - add_timer(&sk->partial_timer); - restore_flags(flags); - if (tmp) - tcp_send_skb(sk, tmp); + sk->socket->flags &= ~SO_NOSPACE; + add_wait_queue(sk->sleep, &wait); + for (;;) { + if (current->signal & ~current->blocked) + break; + current->state = TASK_INTERRUPTIBLE; + if (tcp_memory_free(sk)) + break; + if (sk->shutdown & SEND_SHUTDOWN) + break; + if (sk->err) + break; + schedule(); + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); + } + lock_sock(sk); } -/* - * This routine sends an ack and also updates the window. - */ - -static void tcp_send_ack(unsigned long sequence, unsigned long ack, - struct sock *sk, - struct tcphdr *th, unsigned long daddr) +static int tcp_append_tail(struct sock *sk, struct sk_buff *skb, u8 *from, + int tcp_size, int seglen) { - struct sk_buff *buff; - struct tcphdr *t1; - struct device *dev = NULL; - int tmp; - - if(sk->zapped) - return; /* We have been reset, we may not send again */ - - /* - * We need to grab some memory, and put together an ack, - * and then put it into the queue to be sent. - */ - - buff = sk->prot->wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC); - if (buff == NULL) - { - /* - * Force it to send an ack. We don't have to do this - * (ACK is unreliable) but it's much better use of - * bandwidth on slow links to send a spare ack than - * resend packets. - */ - - sk->ack_backlog++; - if (sk->ip_xmit_timeout != TIME_WRITE && tcp_connected(sk->state)) - { - reset_xmit_timer(sk, TIME_WRITE, HZ); - } - return; - } - - /* - * Assemble a suitable TCP frame - */ - - buff->len = sizeof(struct tcphdr); - buff->sk = sk; - buff->localroute = sk->localroute; - t1 =(struct tcphdr *) buff->data; + int fault; + int copy; /* - * Put in the IP header and routing stuff. 
+ * Add more stuff to the end + * of the skb */ - - tmp = sk->prot->build_header(buff, sk->saddr, daddr, &dev, - IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl); - if (tmp < 0) - { - buff->free = 1; - sk->prot->wfree(sk, buff->mem_addr, buff->mem_len); - return; - } - buff->len += tmp; - t1 =(struct tcphdr *)((char *)t1 +tmp); - memcpy(t1, th, sizeof(*t1)); - - /* - * Swap the send and the receive. - */ - - t1->dest = th->source; - t1->source = th->dest; - t1->seq = ntohl(sequence); - t1->ack = 1; - sk->window = tcp_select_window(sk); - t1->window = ntohs(sk->window); - t1->res1 = 0; - t1->res2 = 0; - t1->rst = 0; - t1->urg = 0; - t1->syn = 0; - t1->psh = 0; - t1->fin = 0; + copy = min(sk->mss - tcp_size, skb_tailroom(skb)); + copy = min(copy, seglen); - /* - * If we have nothing queued for transmit and the transmit timer - * is on we are just doing an ACK timeout and need to switch - * to a keepalive. - */ - - if (ack == sk->acked_seq) + tcp_size += copy; + + fault = copy_from_user(skb->tail, from, copy); + + if (fault) { - sk->ack_backlog = 0; - sk->bytes_rcv = 0; - sk->ack_timed = 0; - if (sk->send_head == NULL && skb_peek(&sk->write_queue) == NULL - && sk->ip_xmit_timeout == TIME_WRITE) - { - if(sk->keepopen) { - reset_xmit_timer(sk,TIME_KEEPOPEN,TCP_TIMEOUT_LEN); - } else { - delete_timer(sk); - } - } - } - - /* - * Fill in the packet and send it - */ - - t1->ack_seq = ntohl(ack); - t1->doff = sizeof(*t1)/4; - tcp_send_check(t1, sk->saddr, daddr, sizeof(*t1), sk); - if (sk->debug) - printk("\rtcp_ack: seq %lx ack %lx\n", sequence, ack); - tcp_statistics.TcpOutSegs++; - sk->prot->queue_xmit(sk, dev, buff, 1); -} - + return -1; + } -/* - * This routine builds a generic TCP header. 
- */ - -extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push) -{ + skb_put(skb, copy); + skb->csum = csum_partial(skb->tail - tcp_size, tcp_size, 0); - memcpy(th,(void *) &(sk->dummy_th), sizeof(*th)); - th->seq = htonl(sk->write_seq); - th->psh =(push == 0) ? 1 : 0; - th->doff = sizeof(*th)/4; - th->ack = 1; - th->fin = 0; - sk->ack_backlog = 0; - sk->bytes_rcv = 0; - sk->ack_timed = 0; - th->ack_seq = htonl(sk->acked_seq); - sk->window = tcp_select_window(sk); - th->window = htons(sk->window); + sk->write_seq += copy; + skb->end_seq += copy; - return(sizeof(*th)); + return copy; } /* @@ -1395,567 +801,352 @@ extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int pus * and starts the transmit system. */ -static int tcp_write(struct sock *sk, unsigned char *from, - int len, int nonblock, unsigned flags) +int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, + int len, int nonblock, int flags) { - int copied = 0; - int copy; - int tmp; - struct sk_buff *skb; - struct sk_buff *send_tmp; - unsigned char *buff; - struct proto *prot; - struct device *dev = NULL; - - sk->inuse=1; - prot = sk->prot; - while(len > 0) + int copied = 0; + struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + + /* + * Wait for a connection to finish. + */ + while (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) { + + if (copied) + return copied; + if (sk->err) - { /* Stop on an error */ - release_sock(sk); - if (copied) - return(copied); - tmp = -sk->err; - sk->err = 0; - return(tmp); - } - - /* - * First thing we do is make sure that we are established. - */ - - if (sk->shutdown & SEND_SHUTDOWN) + return sock_error(sk); + + if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) { - release_sock(sk); - sk->err = EPIPE; - if (copied) - return(copied); - sk->err = 0; - return(-EPIPE); + if (sk->keepopen) + send_sig(SIGPIPE, current, 0); + return -EPIPE; } - - /* - * Wait for a connection to finish. 
- */ + + if (nonblock) + return -EAGAIN; + + if (current->signal & ~current->blocked) + return -ERESTARTSYS; + + wait_for_tcp_connect(sk); + } - while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) + + /* + * Ok commence sending + */ + + while(--iovlen >= 0) + { + int seglen=iov->iov_len; + unsigned char * from=iov->iov_base; + u32 actual_win; + + iov++; + + while(seglen > 0) { + int copy; + int tmp; + struct sk_buff *skb; + + /* + * Stop on errors + */ if (sk->err) { - release_sock(sk); if (copied) - return(copied); - tmp = -sk->err; - sk->err = 0; - return(tmp); + return copied; + return sock_error(sk); } - if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) + /* + * Make sure that we are established. + */ + if (sk->shutdown & SEND_SHUTDOWN) { - release_sock(sk); - if (copied) - return(copied); + if (copied) + return copied; + send_sig(SIGPIPE,current,0); + return -EPIPE; + } + + /* + *Now we need to check if we have a half built packet. + */ - if (sk->err) - { - tmp = -sk->err; - sk->err = 0; - return(tmp); - } + /* if we have queued packets */ + if (tp->send_head && !(flags & MSG_OOB) ) + { + int tcp_size; + + /* Tail */ + + skb = sk->write_queue.prev; + tcp_size = skb->tail - + (unsigned char *)(skb->h.th + 1); + + /* + * This window_seq test is somewhat dangerous + * If the remote does SWS avoidance we should + * queue the best we can + * if not we should in fact send multiple + * packets... + * a method for detecting this would be most + * welcome + */ - if (sk->keepopen) + if (skb->end > skb->tail && + sk->mss - tcp_size > 0 && + skb->end_seq < tp->snd_una + tp->snd_wnd) { - send_sig(SIGPIPE, current, 0); + int tcopy; + + tcopy = tcp_append_tail(sk, skb, from, + tcp_size, + seglen); + if (tcopy == -1) + { + return -EFAULT; + } + + from += tcopy; + copied += tcopy; + len -= tcopy; + seglen -= tcopy; + + /* + * FIXME: if we're nagling we + * should send here. 
+ */ + continue; } - return(-EPIPE); } - if (nonblock || copied) + + /* + * We also need to worry about the window. + * If window < 1/2 the maximum window we've seen from this + * host, don't use it. This is sender side + * silly window prevention, as specified in RFC1122. + * (Note that this is different than earlier versions of + * SWS prevention, e.g. RFC813.). What we actually do is + * use the whole MSS. Since the results in the right + * edge of the packet being outside the window, it will + * be queued for later rather than sent. + */ + + copy = min(seglen, sk->mss); + + actual_win = tp->snd_wnd - (tp->snd_nxt - tp->snd_una); + + if (copy > actual_win && + (((long) actual_win) >= (sk->max_window >> 1))) { - release_sock(sk); - if (copied) - return(copied); - return(-EAGAIN); + copy = actual_win; } - release_sock(sk); - cli(); - - if (sk->state != TCP_ESTABLISHED && - sk->state != TCP_CLOSE_WAIT && sk->err == 0) - { - interruptible_sleep_on(sk->sleep); - if (current->signal & ~current->blocked) - { - sti(); - if (copied) - return(copied); - return(-ERESTARTSYS); - } + if (copy <= 0) + { + printk(KERN_DEBUG "sendmsg: copy < 0\n"); + return -EIO; } - sk->inuse = 1; - sti(); - } - /* - * The following code can result in copy <= if sk->mss is ever - * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window). - * sk->mtu is constant once SYN processing is finished. I.e. we - * had better not get here until we've seen his SYN and at least one - * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.) - * But ESTABLISHED should guarantee that. sk->max_window is by definition - * non-decreasing. Note that any ioctl to set user_mss must be done - * before the exchange of SYN's. If the initial ack from the other - * end has a window of 0, max_window and thus mss will both be 0. - */ - - /* - * Now we need to check if we have a half built packet. 
- */ + /* + * If sk->packets_out > 0 segment will be nagled + * else we kick it right away + */ - if ((skb = tcp_dequeue_partial(sk)) != NULL) - { - int hdrlen; + tmp = MAX_HEADER + sk->prot->max_header + + sizeof(struct sk_buff) + 15; + if (copy < min(sk->mss, sk->max_window >> 1) && + !(flags & MSG_OOB) && sk->packets_out) + { + tmp += min(sk->mss, sk->max_window); + } + else + { + tmp += copy; + } - /* IP header + TCP header */ - hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data) - + sizeof(struct tcphdr); + skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL); + + /* + * If we didn't get any memory, we need to sleep. + */ - /* Add more stuff to the end of skb->len */ - if (!(flags & MSG_OOB)) + if (skb == NULL) { - copy = min(sk->mss - (skb->len - hdrlen), len); - /* FIXME: this is really a bug. */ - if (copy <= 0) + sk->socket->flags |= SO_NOSPACE; + if (nonblock) { - printk("TCP: **bug**: \"copy\" <= 0!!\n"); - copy = 0; + if (copied) + return copied; + return -EAGAIN; } - - memcpy_fromfs(skb->data + skb->len, from, copy); - skb->len += copy; - from += copy; - copied += copy; - len -= copy; - sk->write_seq += copy; - } - if ((skb->len - hdrlen) >= sk->mss || - (flags & MSG_OOB) || !sk->packets_out) - tcp_send_skb(sk, skb); - else - tcp_enqueue_partial(skb, sk); - continue; - } - /* - * We also need to worry about the window. - * If window < 1/2 the maximum window we've seen from this - * host, don't use it. This is sender side - * silly window prevention, as specified in RFC1122. - * (Note that this is different than earlier versions of - * SWS prevention, e.g. RFC813.). What we actually do is - * use the whole MSS. Since the results in the right - * edge of the packet being outside the window, it will - * be queued for later rather than sent. 
- */ + if (current->signal & ~current->blocked) + { + if (copied) + return copied; + return -ERESTARTSYS; + } - copy = sk->window_seq - sk->write_seq; - if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss) - copy = sk->mss; - if (copy > len) - copy = len; + wait_for_tcp_memory(sk); + continue; + } - /* - * We should really check the window here also. - */ - - send_tmp = NULL; - if (copy < sk->mss && !(flags & MSG_OOB)) - { - /* - * We will release the socket in case we sleep here. - */ - release_sock(sk); - /* - * NB: following must be mtu, because mss can be increased. - * mss is always <= mtu - */ - skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL); - sk->inuse = 1; - send_tmp = skb; - } - else - { + skb->sk = sk; + skb->free = 0; + skb->localroute = sk->localroute|(flags&MSG_DONTROUTE); + /* - * We will release the socket in case we sleep here. + * FIXME: we need to optimize this. + * Perhaps some hints here would be good. */ - release_sock(sk); - skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL); - sk->inuse = 1; - } - /* - * If we didn't get any memory, we need to sleep. - */ + tmp = tp->af_specific->build_net_header(sk, skb); - if (skb == NULL) - { - sk->socket->flags |= SO_NOSPACE; - if (nonblock) + if (tmp < 0) { - release_sock(sk); - if (copied) + sock_wfree(sk, skb); + if (copied) return(copied); - return(-EAGAIN); + return(tmp); } - /* - * FIXME: here is another race condition. - */ + skb->h.th =(struct tcphdr *) + skb_put(skb,sizeof(struct tcphdr)); - tmp = sk->wmem_alloc; - release_sock(sk); - cli(); - /* - * Again we will try to avoid it. 
- */ - if (tmp <= sk->wmem_alloc && - (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT) - && sk->err == 0) + seglen -= copy; + tmp = tcp_build_header(skb->h.th, sk, seglen || iovlen); + + if (tmp < 0) + { + sock_wfree(sk, skb); + if (copied) + return(copied); + return(tmp); + } + + if (flags & MSG_OOB) { - sk->socket->flags &= ~SO_NOSPACE; - interruptible_sleep_on(sk->sleep); - if (current->signal & ~current->blocked) - { - sti(); - if (copied) - return(copied); - return(-ERESTARTSYS); - } + skb->h.th->urg = 1; + skb->h.th->urg_ptr = ntohs(copy); } - sk->inuse = 1; - sti(); - continue; - } - skb->len = 0; - skb->sk = sk; - skb->free = 0; - skb->localroute = sk->localroute|(flags&MSG_DONTROUTE); - - buff = skb->data; - - /* - * FIXME: we need to optimize this. - * Perhaps some hints here would be good. - */ + skb->csum = csum_partial_copy_fromuser(from, + skb_put(skb, copy), copy, 0); - tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev, - IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl); - if (tmp < 0 ) - { - prot->wfree(sk, skb->mem_addr, skb->mem_len); - release_sock(sk); - if (copied) - return(copied); - return(tmp); - } - skb->len += tmp; - skb->dev = dev; - buff += tmp; - skb->h.th =(struct tcphdr *) buff; - tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy); - if (tmp < 0) - { - prot->wfree(sk, skb->mem_addr, skb->mem_len); - release_sock(sk); - if (copied) - return(copied); - return(tmp); - } + from += copy; + copied += copy; + len -= copy; + skb->free = 0; + sk->write_seq += copy; + + tcp_send_skb(sk, skb); - if (flags & MSG_OOB) - { - ((struct tcphdr *)buff)->urg = 1; - ((struct tcphdr *)buff)->urg_ptr = ntohs(copy); - } - skb->len += tmp; - memcpy_fromfs(buff+tmp, from, copy); - - from += copy; - copied += copy; - len -= copy; - skb->len += copy; - skb->free = 0; - sk->write_seq += copy; - - if (send_tmp != NULL && sk->packets_out) - { - tcp_enqueue_partial(send_tmp, sk); - continue; + release_sock(sk); + lock_sock(sk); } 
- tcp_send_skb(sk, skb); } - sk->err = 0; - -/* - * Nagle's rule. Turn Nagle off with TCP_NODELAY for highly - * interactive fast network servers. It's meant to be on and - * it really improves the throughput though not the echo time - * on my slow slip link - Alan - */ -/* - * Avoid possible race on send_tmp - c/o Johannes Stille - */ - - if(sk->partial && ((!sk->packets_out) - /* If not nagling we can send on the before case too.. */ - || (sk->nonagle && before(sk->write_seq , sk->window_seq)) - )) - tcp_send_partial(sk); + sk->err = 0; - release_sock(sk); - return(copied); + return copied; } -/* - * This is just a wrapper. - */ - -static int tcp_sendto(struct sock *sk, unsigned char *from, - int len, int nonblock, unsigned flags, - struct sockaddr_in *addr, int addr_len) -{ - if (flags & ~(MSG_OOB|MSG_DONTROUTE)) - return -EINVAL; - if (sk->state == TCP_CLOSE) - return -ENOTCONN; - if (addr_len < sizeof(*addr)) - return -EINVAL; - if (addr->sin_family && addr->sin_family != AF_INET) - return -EINVAL; - if (addr->sin_port != sk->dummy_th.dest) - return -EISCONN; - if (addr->sin_addr.s_addr != sk->daddr) - return -EISCONN; - return tcp_write(sk, from, len, nonblock, flags); -} + /* * Send an ack if one is backlogged at this point. Ought to merge * this with tcp_send_ack(). + * This is called for delayed acks also. */ -static void tcp_read_wakeup(struct sock *sk) +void tcp_read_wakeup(struct sock *sk) { - int tmp; - struct device *dev = NULL; - struct tcphdr *t1; - struct sk_buff *buff; - - if (!sk->ack_backlog) - return; - - /* - * FIXME: we need to put code here to prevent this routine from - * being called. Being called once in a while is ok, so only check - * if this is the second time in a row. - */ - /* - * We need to grab some memory, and put together an ack, - * and then put it into the queue to be sent. + * If we're closed, don't send an ack, or we'll get a RST + * from the closed destination. 
*/ - buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC); - if (buff == NULL) - { - /* Try again real soon. */ - reset_xmit_timer(sk, TIME_WRITE, HZ); + if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT)) return; - } - buff->len = sizeof(struct tcphdr); - buff->sk = sk; - buff->localroute = sk->localroute; - - /* - * Put in the IP header and routing stuff. - */ - - tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev, - IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl); - if (tmp < 0) - { - buff->free = 1; - sk->prot->wfree(sk, buff->mem_addr, buff->mem_len); - return; - } - - buff->len += tmp; - t1 =(struct tcphdr *)(buff->data +tmp); - - memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1)); - t1->seq = htonl(sk->sent_seq); - t1->ack = 1; - t1->res1 = 0; - t1->res2 = 0; - t1->rst = 0; - t1->urg = 0; - t1->syn = 0; - t1->psh = 0; - sk->ack_backlog = 0; - sk->bytes_rcv = 0; - sk->window = tcp_select_window(sk); - t1->window = ntohs(sk->window); - t1->ack_seq = ntohl(sk->acked_seq); - t1->doff = sizeof(*t1)/4; - tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk); - sk->prot->queue_xmit(sk, dev, buff, 1); - tcp_statistics.TcpOutSegs++; + tcp_send_ack(sk); } /* - * FIXME: - * This routine frees used buffers. - * It should consider sending an ACK to let the - * other end know we now have a bigger window. - */ - -static void cleanup_rbuf(struct sock *sk) -{ - unsigned long flags; - unsigned long left; - struct sk_buff *skb; - unsigned long rspace; - - if(sk->debug) - printk("cleaning rbuf for sk=%p\n", sk); - - save_flags(flags); - cli(); - - left = sk->prot->rspace(sk); - - /* - * We have to loop through all the buffer headers, - * and try to free up all the space we can. 
- */ - - while((skb=skb_peek(&sk->receive_queue)) != NULL) - { - if (!skb->used || skb->users) - break; - skb_unlink(skb); - skb->sk = sk; - kfree_skb(skb, FREE_READ); - } - - restore_flags(flags); - - /* - * FIXME: - * At this point we should send an ack if the difference - * in the window, and the amount of space is bigger than - * TCP_WINDOW_DIFF. - */ - - if(sk->debug) - printk("sk->rspace = %lu, was %lu\n", sk->prot->rspace(sk), - left); - if ((rspace=sk->prot->rspace(sk)) != left) - { - /* - * This area has caused the most trouble. The current strategy - * is to simply do nothing if the other end has room to send at - * least 3 full packets, because the ack from those will auto- - * matically update the window. If the other end doesn't think - * we have much space left, but we have room for at least 1 more - * complete packet than it thinks we do, we will send an ack - * immediately. Otherwise we will wait up to .5 seconds in case - * the user reads some more. - */ - sk->ack_backlog++; - /* - * It's unclear whether to use sk->mtu or sk->mss here. They differ only - * if the other end is offering a window smaller than the agreed on MSS - * (called sk->mtu here). In theory there's no connection between send - * and receive, and so no reason to think that they're going to send - * small packets. For the moment I'm using the hack of reducing the mss - * only on the send side, so I'm putting mtu here. - */ - - if (rspace > (sk->window - sk->bytes_rcv + sk->mtu)) - { - /* Send an ack right now. */ - tcp_read_wakeup(sk); - } - else - { - /* Force it to send an ack soon. */ - int was_active = del_timer(&sk->retransmit_timer); - if (!was_active || TCP_ACK_TIME < sk->timer.expires) - { - reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME); - } - else - add_timer(&sk->retransmit_timer); - } - } -} - - -/* * Handle reading urgent data. 
BSD has very simple semantics for * this, no blocking and very strange errors 8) */ - -static int tcp_read_urg(struct sock * sk, int nonblock, - unsigned char *to, int len, unsigned flags) + +static int tcp_recv_urg(struct sock * sk, int nonblock, + struct msghdr *msg, int len, int flags, + int *addr_len) { + int err=0; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + /* * No URG data to read */ if (sk->urginline || !sk->urg_data || sk->urg_data == URG_READ) return -EINVAL; /* Yes this is right ! */ - - if (sk->err) - { - int tmp = -sk->err; - sk->err = 0; - return tmp; - } - if (sk->state == TCP_CLOSE || sk->done) + if (sk->err) + return sock_error(sk); + + if (sk->state == TCP_CLOSE || sk->done) { - if (!sk->done) { + if (!sk->done) + { sk->done = 1; return 0; } return -ENOTCONN; } - if (sk->shutdown & RCV_SHUTDOWN) + if (sk->shutdown & RCV_SHUTDOWN) { sk->done = 1; return 0; } - sk->inuse = 1; - if (sk->urg_data & URG_VALID) + lock_sock(sk); + if (sk->urg_data & URG_VALID) { char c = sk->urg_data; if (!(flags & MSG_PEEK)) sk->urg_data = URG_READ; - put_fs_byte(c, to); + + if(len>0) + err = memcpy_toiovec(msg->msg_iov, &c, 1); + else + msg->msg_flags|=MSG_TRUNC; + + if(msg->msg_name) + { + tp->af_specific->addr2sockaddr(sk, (struct sockaddr *) + msg->msg_name); + } + if(addr_len) + *addr_len= tp->af_specific->sockaddr_len; + /* + * Read urgent data + */ + msg->msg_flags|=MSG_OOB; release_sock(sk); - return 1; + return err ? -EFAULT : 1; } release_sock(sk); - + /* * Fixed the recv(..., MSG_OOB) behaviour. BSD docs and * the available implementations agree in this case: @@ -1966,73 +1157,148 @@ static int tcp_read_urg(struct sock * sk, int nonblock, return -EAGAIN; } +/* + * Release a skb if it is no longer needed. This routine + * must be called with interrupts disabled or with the + * socket locked so that the sk_buff queue operation is ok. 
+ */ + +static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb) +{ + sk->ack_backlog++; + + skb->sk = sk; + __skb_unlink(skb, &sk->receive_queue); + kfree_skb(skb, FREE_READ); +} + + +static void cleanup_rbuf(struct sock *sk) +{ + struct sk_buff *skb; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* + * NOTE! The socket must be locked, so that we don't get + * a messed-up receive queue. + */ + + while ((skb=skb_peek(&sk->receive_queue)) != NULL) { + if (!skb->used || skb->users) + break; + tcp_eat_skb(sk, skb); + } + + if(sk->debug) + printk("sk->rspace = %lu\n", sock_rspace(sk)); + + /* + * We send a ACK if the sender is blocked + * else let tcp_data deal with the acking policy. + */ + + if (sock_rspace(sk) > tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup) && + (tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup) < sk->mss)) + { + /* Send an ack right now. */ + sk->delayed_acks++; + tcp_read_wakeup(sk); + } + +} + /* * This routine copies from a sock struct into the user buffer. */ -static int tcp_read(struct sock *sk, unsigned char *to, - int len, int nonblock, unsigned flags) +int tcp_recvmsg(struct sock *sk, struct msghdr *msg, + int len, int nonblock, int flags, int *addr_len) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct wait_queue wait = { current, NULL }; int copied = 0; - unsigned long peek_seq; - volatile unsigned long *seq; /* So gcc doesn't overoptimise */ + u32 peek_seq; + volatile u32 *seq; /* So gcc doesn't overoptimise */ unsigned long used; + int err = 0; + int target = 1; /* Read at least this may bytes */ - /* - * This error should be checked. - */ - if (sk->state == TCP_LISTEN) return -ENOTCONN; /* - * Urgent data needs to be handled specially. + * Urgent data needs to be handled specially. */ - + if (flags & MSG_OOB) - return tcp_read_urg(sk, nonblock, to, len, flags); + return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len); /* * Copying sequence to update. 
This is volatile to handle - * the multi-reader case neatly (memcpy_to/fromfs might be + * the multi-reader case neatly (memcpy_to/fromfs might be * inline and thus not flush cached variables otherwise). */ - + peek_seq = sk->copied_seq; seq = &sk->copied_seq; if (flags & MSG_PEEK) seq = &peek_seq; + + /* + * Handle the POSIX bogosity MSG_WAITALL + */ + + if (flags & MSG_WAITALL) + target=len; add_wait_queue(sk->sleep, &wait); - sk->inuse = 1; - while (len > 0) + lock_sock(sk); + while (len > 0) { struct sk_buff * skb; - unsigned long offset; - + u32 offset; + /* * Are we at urgent data? Stop if we have read anything. */ - + if (copied && sk->urg_data && sk->urg_seq == *seq) break; /* + * We need to check signals first, to get correct SIGURG + * handling. FIXME: Need to check this doesnt impact 1003.1g + * and move it down to the bottom of the loop + */ + if (current->signal & ~current->blocked) { + if (copied) + break; + copied = -ERESTARTSYS; + break; + } + + /* * Next get a buffer. */ - + current->state = TASK_INTERRUPTIBLE; skb = skb_peek(&sk->receive_queue); - do + do { if (!skb) break; - if (before(*seq, skb->h.th->seq)) + /* + * now that we have two receive queues this + * shouldn't happen + */ + if (before(*seq, skb->seq)) { + printk("recvmsg bug: copied %X seq %X\n", + *seq, skb->seq); break; - offset = *seq - skb->h.th->seq; + } + offset = *seq - skb->seq; if (skb->h.th->syn) offset--; if (offset < skb->len) @@ -2045,19 +1311,18 @@ static int tcp_read(struct sock *sk, unsigned char *to, } while (skb != (struct sk_buff *)&sk->receive_queue); - if (copied) + if (copied >= target) break; - if (sk->err) + if (sk->err && !(flags&MSG_PEEK)) { - copied = -sk->err; - sk->err = 0; + copied = sock_error(sk); break; } - if (sk->state == TCP_CLOSE) + if (sk->state == TCP_CLOSE) { - if (!sk->done) + if (!sk->done) { sk->done = 1; break; @@ -2066,13 +1331,13 @@ static int tcp_read(struct sock *sk, unsigned char *to, break; } - if (sk->shutdown & RCV_SHUTDOWN) + if 
(sk->shutdown & RCV_SHUTDOWN) { sk->done = 1; break; } - - if (nonblock) + + if (nonblock) { copied = -EAGAIN; break; @@ -2083,44 +1348,38 @@ static int tcp_read(struct sock *sk, unsigned char *to, sk->socket->flags |= SO_WAITDATA; schedule(); sk->socket->flags &= ~SO_WAITDATA; - sk->inuse = 1; - - if (current->signal & ~current->blocked) - { - copied = -ERESTARTSYS; - break; - } + lock_sock(sk); continue; found_ok_skb: /* * Lock the buffer. We can be fairly relaxed as - * an interrupt will never steal a buffer we are + * an interrupt will never steal a buffer we are * using unless I've missed something serious in * tcp_data. */ - + skb->users++; - + /* - * Ok so how much can we use ? + * Ok so how much can we use ? */ - + used = skb->len - offset; if (len < used) used = len; /* - * Do we have urgent data here? + * Do we have urgent data here? */ - - if (sk->urg_data) + + if (sk->urg_data) { - unsigned long urg_offset = sk->urg_seq - *seq; - if (urg_offset < used) + u32 urg_offset = sk->urg_seq - *seq; + if (urg_offset < used) { - if (!urg_offset) + if (!urg_offset) { - if (!sk->urginline) + if (!sk->urginline) { ++*seq; offset++; @@ -2131,41 +1390,51 @@ static int tcp_read(struct sock *sk, unsigned char *to, used = urg_offset; } } - + /* * Copy it - We _MUST_ update *seq first so that we * don't ever double read when we have dual readers */ - + *seq += used; /* - * This memcpy_tofs can sleep. If it sleeps and we + * This memcpy_toiovec can sleep. If it sleeps and we * do a second read it relies on the skb->users to avoid * a crash when cleanup_rbuf() gets called. */ - - memcpy_tofs(to,((unsigned char *)skb->h.th) + - skb->h.th->doff*4 + offset, used); + + err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used); + + if (err) + { + /* + * exception. bailout! 
+ */ + *seq -= err; + skb->users--; + return -EFAULT; + } + copied += used; len -= used; - to += used; - + /* * We now will not sleep again until we are finished * with skb. Sorry if you are doing the SMP port * but you'll just have to fix it neatly ;) */ - - skb->users --; - + + skb->users--; + if (after(sk->copied_seq,sk->urg_seq)) sk->urg_data = 0; if (used + offset < skb->len) continue; - + /* - * Process the FIN. + * Process the FIN. We may also need to handle PSH + * here and make it break out of MSG_WAITALL */ if (skb->h.th->fin) @@ -2173,22 +1442,33 @@ static int tcp_read(struct sock *sk, unsigned char *to, if (flags & MSG_PEEK) continue; skb->used = 1; + if (!skb->users) + tcp_eat_skb(sk, skb); continue; found_fin_ok: ++*seq; if (flags & MSG_PEEK) break; - + /* * All is done */ - + skb->used = 1; sk->shutdown |= RCV_SHUTDOWN; break; } + + if(copied > 0 && msg->msg_name) + { + tp->af_specific->addr2sockaddr(sk, (struct sockaddr *) + msg->msg_name); + } + if(addr_len) + *addr_len= tp->af_specific->sockaddr_len; + remove_wait_queue(sk->sleep, &wait); current->state = TASK_RUNNING; @@ -2198,13 +1478,15 @@ static int tcp_read(struct sock *sk, unsigned char *to, return copied; } + + /* * State processing on a close. This implements the state shift for - * sending our FIN frame. Note that we only send a FIN for some + * sending our FIN frame. Note that we only send a FIN for some * states. A shutdown() may have already sent the FIN, or we may be * closed. */ - + static int tcp_close_state(struct sock *sk, int dead) { int ns=TCP_CLOSE; @@ -2231,9 +1513,9 @@ static int tcp_close_state(struct sock *sk, int dead) ns=TCP_LAST_ACK; send_fin=1; } - + tcp_set_state(sk,ns); - + /* * This is a (useful) BSD violating of the RFC. 
There is a * problem with TCP as specified in that the other end could @@ -2249,118 +1531,15 @@ static int tcp_close_state(struct sock *sk, int dead) if(timer_active) add_timer(&sk->timer); else - reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT); + tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT); } - - return send_fin; -} - -/* - * Send a fin. - */ -static void tcp_send_fin(struct sock *sk) -{ - struct proto *prot =(struct proto *)sk->prot; - struct tcphdr *th =(struct tcphdr *)&sk->dummy_th; - struct tcphdr *t1; - struct sk_buff *buff; - struct device *dev=NULL; - int tmp; - - release_sock(sk); /* in case the malloc sleeps. */ - - buff = prot->wmalloc(sk, MAX_RESET_SIZE,1 , GFP_KERNEL); - sk->inuse = 1; - - if (buff == NULL) - { - /* This is a disaster if it occurs */ - printk("tcp_send_fin: Impossible malloc failure"); - return; - } - - /* - * Administrivia - */ - - buff->sk = sk; - buff->len = sizeof(*t1); - buff->localroute = sk->localroute; - t1 =(struct tcphdr *) buff->data; - - /* - * Put in the IP header and routing stuff. - */ - - tmp = prot->build_header(buff,sk->saddr, sk->daddr, &dev, - IPPROTO_TCP, sk->opt, - sizeof(struct tcphdr),sk->ip_tos,sk->ip_ttl); - if (tmp < 0) - { - int t; - /* - * Finish anyway, treat this as a send that got lost. - * (Not good). - */ - - buff->free = 1; - prot->wfree(sk,buff->mem_addr, buff->mem_len); - sk->write_seq++; - t=del_timer(&sk->timer); - if(t) - add_timer(&sk->timer); - else - reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); - return; - } - - /* - * We ought to check if the end of the queue is a buffer and - * if so simply add the fin to that buffer, not send it ahead. 
- */ - - t1 =(struct tcphdr *)((char *)t1 +tmp); - buff->len += tmp; - buff->dev = dev; - memcpy(t1, th, sizeof(*t1)); - t1->seq = ntohl(sk->write_seq); - sk->write_seq++; - buff->h.seq = sk->write_seq; - t1->ack = 1; - t1->ack_seq = ntohl(sk->acked_seq); - t1->window = ntohs(sk->window=tcp_select_window(sk)); - t1->fin = 1; - t1->rst = 0; - t1->doff = sizeof(*t1)/4; - tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk); - - /* - * If there is data in the write queue, the fin must be appended to - * the write queue. - */ - - if (skb_peek(&sk->write_queue) != NULL) - { - buff->free = 0; - if (buff->next != NULL) - { - printk("tcp_send_fin: next != NULL\n"); - skb_unlink(buff); - } - skb_queue_tail(&sk->write_queue, buff); - } - else - { - sk->sent_seq = sk->write_seq; - sk->prot->queue_xmit(sk, dev, buff, 0); - reset_xmit_timer(sk, TIME_WRITE, sk->rto); - } + return send_fin; } /* * Shutdown the sending side of a connection. Much like close except - * that we don't receive shut down or set sk->dead=1. + * that we don't receive shut down or set sk->dead. */ void tcp_shutdown(struct sock *sk, int how) @@ -2371,25 +1550,25 @@ void tcp_shutdown(struct sock *sk, int how) * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92. */ - if (!(how & SEND_SHUTDOWN)) + if (!(how & SEND_SHUTDOWN)) return; - + /* * If we've already sent a FIN, or it's a closed state */ - + if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING || sk->state == TCP_LAST_ACK || - sk->state == TCP_TIME_WAIT || + sk->state == TCP_TIME_WAIT || sk->state == TCP_CLOSE || sk->state == TCP_LISTEN ) { return; } - sk->inuse = 1; + lock_sock(sk); /* * flag that the sender has shutdown @@ -2401,554 +1580,69 @@ void tcp_shutdown(struct sock *sk, int how) * Clear out any half completed packets. 
*/ - if (sk->partial) - tcp_send_partial(sk); - /* * FIN if needed */ - if(tcp_close_state(sk,0)) + if (tcp_close_state(sk,0)) tcp_send_fin(sk); release_sock(sk); } -static int -tcp_recvfrom(struct sock *sk, unsigned char *to, - int to_len, int nonblock, unsigned flags, - struct sockaddr_in *addr, int *addr_len) -{ - int result; - - /* - * Have to check these first unlike the old code. If - * we check them after we lose data on an error - * which is wrong - */ - - if(addr_len) - *addr_len = sizeof(*addr); - result=tcp_read(sk, to, to_len, nonblock, flags); - - if (result < 0) - return(result); - - if(addr) - { - addr->sin_family = AF_INET; - addr->sin_port = sk->dummy_th.dest; - addr->sin_addr.s_addr = sk->daddr; - } - return(result); -} - - /* - * This routine will send an RST to the other tcp. + * Return 1 if we still have things to send in our buffers. */ - -static void tcp_reset(unsigned long saddr, unsigned long daddr, struct tcphdr *th, - struct proto *prot, struct options *opt, struct device *dev, int tos, int ttl) -{ - struct sk_buff *buff; - struct tcphdr *t1; - int tmp; - struct device *ndev=NULL; - /* - * Cannot reset a reset (Think about it). - */ - - if(th->rst) - return; - - /* - * We need to grab some memory, and put together an RST, - * and then put it into the queue to be sent. - */ - - buff = prot->wmalloc(NULL, MAX_RESET_SIZE, 1, GFP_ATOMIC); - if (buff == NULL) - return; - - buff->len = sizeof(*t1); - buff->sk = NULL; - buff->dev = dev; - buff->localroute = 0; - - t1 =(struct tcphdr *) buff->data; - - /* - * Put in the IP header and routing stuff. - */ - - tmp = prot->build_header(buff, saddr, daddr, &ndev, IPPROTO_TCP, opt, - sizeof(struct tcphdr),tos,ttl); - if (tmp < 0) - { - buff->free = 1; - prot->wfree(NULL, buff->mem_addr, buff->mem_len); - return; - } - - t1 =(struct tcphdr *)((char *)t1 +tmp); - buff->len += tmp; - memcpy(t1, th, sizeof(*t1)); - - /* - * Swap the send and the receive. 
- */ - - t1->dest = th->source; - t1->source = th->dest; - t1->rst = 1; - t1->window = 0; - - if(th->ack) - { - t1->ack = 0; - t1->seq = th->ack_seq; - t1->ack_seq = 0; - } - else - { - t1->ack = 1; - if(!th->syn) - t1->ack_seq=htonl(th->seq); - else - t1->ack_seq=htonl(th->seq+1); - t1->seq=0; - } - - t1->syn = 0; - t1->urg = 0; - t1->fin = 0; - t1->psh = 0; - t1->doff = sizeof(*t1)/4; - tcp_send_check(t1, saddr, daddr, sizeof(*t1), NULL); - prot->queue_xmit(NULL, ndev, buff, 1); - tcp_statistics.TcpOutSegs++; -} - - -/* - * Look for tcp options. Parses everything but only knows about MSS. - * This routine is always called with the packet containing the SYN. - * However it may also be called with the ack to the SYN. So you - * can't assume this is always the SYN. It's always called after - * we have set up sk->mtu to our own MTU. - * - * We need at minimum to add PAWS support here. Possibly large windows - * as Linux gets deployed on 100Mb/sec networks. - */ - -static void tcp_options(struct sock *sk, struct tcphdr *th) +static inline int closing(struct sock * sk) { - unsigned char *ptr; - int length=(th->doff*4)-sizeof(struct tcphdr); - int mss_seen = 0; - - ptr = (unsigned char *)(th + 1); - - while(length>0) - { - int opcode=*ptr++; - int opsize=*ptr++; - switch(opcode) - { - case TCPOPT_EOL: - return; - case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ - length--; - ptr--; /* the opsize=*ptr++ above was a mistake */ - continue; - - default: - if(opsize<=2) /* Avoid silly options looping forever */ - return; - switch(opcode) - { - case TCPOPT_MSS: - if(opsize==4 && th->syn) - { - sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr)); - mss_seen = 1; - } - break; - /* Add other options here as people feel the urge to implement stuff like large windows */ - } - ptr+=opsize-2; - length-=opsize; - } - } - if (th->syn) - { - if (! 
mss_seen) - sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */ + switch (sk->state) { + case TCP_FIN_WAIT1: + case TCP_CLOSING: + case TCP_LAST_ACK: + return 1; } -#ifdef CONFIG_INET_PCTCP - sk->mss = min(sk->max_window >> 1, sk->mtu); -#else - sk->mss = min(sk->max_window, sk->mtu); -#endif + return 0; } -static inline unsigned long default_mask(unsigned long dst) -{ - dst = ntohl(dst); - if (IN_CLASSA(dst)) - return htonl(IN_CLASSA_NET); - if (IN_CLASSB(dst)) - return htonl(IN_CLASSB_NET); - return htonl(IN_CLASSC_NET); -} -/* - * Default sequence number picking algorithm. - * As close as possible to RFC 793, which - * suggests using a 250kHz clock. - * Further reading shows this assumes 2MB/s networks. - * For 10MB/s ethernet, a 1MHz clock is appropriate. - * That's funny, Linux has one built in! Use it! - */ - -extern inline unsigned long tcp_init_seq(void) -{ - struct timeval tv; - do_gettimeofday(&tv); - return tv.tv_usec+tv.tv_sec*1000000; -} - -/* - * This routine handles a connection request. - * It should make sure we haven't already responded. - * Because of the way BSD works, we have to send a syn/ack now. - * This also means it will be harder to close a socket which is - * listening. - */ - -static void tcp_conn_request(struct sock *sk, struct sk_buff *skb, - unsigned long daddr, unsigned long saddr, - struct options *opt, struct device *dev, unsigned long seq) +void tcp_close(struct sock *sk, unsigned long timeout) { - struct sk_buff *buff; - struct tcphdr *t1; - unsigned char *ptr; - struct sock *newsk; - struct tcphdr *th; - struct device *ndev=NULL; - int tmp; - struct rtable *rt; - - th = skb->h.th; - - /* If the socket is dead, don't accept the connection. 
*/ - if (!sk->dead) - { - sk->data_ready(sk,0); - } - else - { - if(sk->debug) - printk("Reset on %p: Connect on dead socket.\n",sk); - tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl); - tcp_statistics.TcpAttemptFails++; - kfree_skb(skb, FREE_READ); - return; - } - - /* - * Make sure we can accept more. This will prevent a - * flurry of syns from eating up all our memory. - */ - - if (sk->ack_backlog >= sk->max_ack_backlog) - { - tcp_statistics.TcpAttemptFails++; - kfree_skb(skb, FREE_READ); - return; - } - - /* - * We need to build a new sock struct. - * It is sort of bad to have a socket without an inode attached - * to it, but the wake_up's will just wake up the listening socket, - * and if the listening socket is destroyed before this is taken - * off of the queue, this will take care of it. - */ - - newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC); - if (newsk == NULL) - { - /* just ignore the syn. It will get retransmitted. */ - tcp_statistics.TcpAttemptFails++; - kfree_skb(skb, FREE_READ); - return; - } - - memcpy(newsk, sk, sizeof(*newsk)); - skb_queue_head_init(&newsk->write_queue); - skb_queue_head_init(&newsk->receive_queue); - newsk->send_head = NULL; - newsk->send_tail = NULL; - skb_queue_head_init(&newsk->back_log); - newsk->rtt = 0; /*TCP_CONNECT_TIME<<3*/ - newsk->rto = TCP_TIMEOUT_INIT; - newsk->mdev = 0; - newsk->max_window = 0; - newsk->cong_window = 1; - newsk->cong_count = 0; - newsk->ssthresh = 0; - newsk->backoff = 0; - newsk->blog = 0; - newsk->intr = 0; - newsk->proc = 0; - newsk->done = 0; - newsk->partial = NULL; - newsk->pair = NULL; - newsk->wmem_alloc = 0; - newsk->rmem_alloc = 0; - newsk->localroute = sk->localroute; - - newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF; - - newsk->err = 0; - newsk->shutdown = 0; - newsk->ack_backlog = 0; - newsk->acked_seq = skb->h.th->seq+1; - newsk->copied_seq = skb->h.th->seq+1; - newsk->fin_seq = skb->h.th->seq; - newsk->state = TCP_SYN_RECV; - 
newsk->timeout = 0; - newsk->ip_xmit_timeout = 0; - newsk->write_seq = seq; - newsk->window_seq = newsk->write_seq; - newsk->rcv_ack_seq = newsk->write_seq; - newsk->urg_data = 0; - newsk->retransmits = 0; - newsk->linger=0; - newsk->destroy = 0; - init_timer(&newsk->timer); - newsk->timer.data = (unsigned long)newsk; - newsk->timer.function = &net_timer; - init_timer(&newsk->retransmit_timer); - newsk->retransmit_timer.data = (unsigned long)newsk; - newsk->retransmit_timer.function=&retransmit_timer; - newsk->dummy_th.source = skb->h.th->dest; - newsk->dummy_th.dest = skb->h.th->source; - - /* - * Swap these two, they are from our point of view. - */ - - newsk->daddr = saddr; - newsk->saddr = daddr; - - put_sock(newsk->num,newsk); - newsk->dummy_th.res1 = 0; - newsk->dummy_th.doff = 6; - newsk->dummy_th.fin = 0; - newsk->dummy_th.syn = 0; - newsk->dummy_th.rst = 0; - newsk->dummy_th.psh = 0; - newsk->dummy_th.ack = 0; - newsk->dummy_th.urg = 0; - newsk->dummy_th.res2 = 0; - newsk->acked_seq = skb->h.th->seq + 1; - newsk->copied_seq = skb->h.th->seq + 1; - newsk->socket = NULL; - - /* - * Grab the ttl and tos values and use them - */ - - newsk->ip_ttl=sk->ip_ttl; - newsk->ip_tos=skb->ip_hdr->tos; - - /* - * Use 512 or whatever user asked for - */ - - /* - * Note use of sk->user_mss, since user has no direct access to newsk - */ - - rt=ip_rt_route(saddr, NULL,NULL); - - if(rt!=NULL && (rt->rt_flags&RTF_WINDOW)) - newsk->window_clamp = rt->rt_window; - else - newsk->window_clamp = 0; - - if (sk->user_mss) - newsk->mtu = sk->user_mss; - else if(rt!=NULL && (rt->rt_flags&RTF_MSS)) - newsk->mtu = rt->rt_mss - HEADER_SIZE; - else - { -#ifdef CONFIG_INET_SNARL /* Sub Nets Are Local */ - if ((saddr ^ daddr) & default_mask(saddr)) -#else - if ((saddr ^ daddr) & dev->pa_mask) -#endif - newsk->mtu = 576 - HEADER_SIZE; - else - newsk->mtu = MAX_WINDOW; - } + struct sk_buff *skb; /* - * But not bigger than device MTU + * We need to grab some memory, and put together a FIN, + * 
and then put it into the queue to be sent. */ - newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE); - - /* - * This will min with what arrived in the packet - */ + lock_sock(sk); - tcp_options(newsk,skb->h.th); - tcp_cache_zap(); - - buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC); - if (buff == NULL) - { - sk->err = ENOMEM; - newsk->dead = 1; - newsk->state = TCP_CLOSE; - /* And this will destroy it */ - release_sock(newsk); - kfree_skb(skb, FREE_READ); - tcp_statistics.TcpAttemptFails++; - return; - } - - buff->len = sizeof(struct tcphdr)+4; - buff->sk = newsk; - buff->localroute = newsk->localroute; - - t1 =(struct tcphdr *) buff->data; - - /* - * Put in the IP header and routing stuff. - */ - - tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev, - IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl); - - /* - * Something went wrong. - */ - - if (tmp < 0) - { - sk->err = tmp; - buff->free = 1; - kfree_skb(buff,FREE_WRITE); - newsk->dead = 1; - newsk->state = TCP_CLOSE; - release_sock(newsk); - skb->sk = sk; - kfree_skb(skb, FREE_READ); - tcp_statistics.TcpAttemptFails++; - return; - } - - buff->len += tmp; - t1 =(struct tcphdr *)((char *)t1 +tmp); - - memcpy(t1, skb->h.th, sizeof(*t1)); - buff->h.seq = newsk->write_seq; - /* - * Swap the send and the receive. 
- */ - t1->dest = skb->h.th->source; - t1->source = newsk->dummy_th.source; - t1->seq = ntohl(newsk->write_seq++); - t1->ack = 1; - newsk->window = tcp_select_window(newsk); - newsk->sent_seq = newsk->write_seq; - t1->window = ntohs(newsk->window); - t1->res1 = 0; - t1->res2 = 0; - t1->rst = 0; - t1->urg = 0; - t1->psh = 0; - t1->syn = 1; - t1->ack_seq = ntohl(skb->h.th->seq+1); - t1->doff = sizeof(*t1)/4+1; - ptr =(unsigned char *)(t1+1); - ptr[0] = 2; - ptr[1] = 4; - ptr[2] = ((newsk->mtu) >> 8) & 0xff; - ptr[3] =(newsk->mtu) & 0xff; - - tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk); - newsk->prot->queue_xmit(newsk, ndev, buff, 0); - reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT); - skb->sk = newsk; - - /* - * Charge the sock_buff to newsk. - */ - - sk->rmem_alloc -= skb->mem_len; - newsk->rmem_alloc += skb->mem_len; - - skb_queue_tail(&sk->receive_queue,skb); - sk->ack_backlog++; - release_sock(newsk); - tcp_statistics.TcpOutSegs++; -} - - -static void tcp_close(struct sock *sk, int timeout) -{ - /* - * We need to grab some memory, and put together a FIN, - * and then put it into the queue to be sent. - */ - - sk->inuse = 1; - - if(th_cache_sk==sk) - tcp_cache_zap(); if(sk->state == TCP_LISTEN) { /* Special case */ tcp_set_state(sk, TCP_CLOSE); tcp_close_pending(sk); release_sock(sk); + sk->dead = 1; return; } - + sk->keepopen = 1; sk->shutdown = SHUTDOWN_MASK; - if (!sk->dead) + if (!sk->dead) sk->state_change(sk); - if (timeout == 0) - { - struct sk_buff *skb; - - /* - * We need to flush the recv. buffs. We do this only on the - * descriptor close, not protocol-sourced closes, because the - * reader process may not have drained the data yet! - */ + /* + * We need to flush the recv. buffs. We do this only on the + * descriptor close, not protocol-sourced closes, because the + * reader process may not have drained the data yet! 
+ */ - while((skb=skb_dequeue(&sk->receive_queue))!=NULL) - kfree_skb(skb, FREE_READ); - /* - * Get rid off any half-completed packets. - */ - - if (sk->partial) - tcp_send_partial(sk); - } + while((skb=skb_dequeue(&sk->receive_queue))!=NULL) + kfree_skb(skb, FREE_READ); /* @@ -2956,2132 +1650,148 @@ static void tcp_close(struct sock *sk, int timeout) * to send both the same way (sigh). */ - if(timeout) - { - tcp_set_state(sk, TCP_CLOSE); /* Dead */ - } - else + if (tcp_close_state(sk,1)==1) { - if(tcp_close_state(sk,1)==1) - { - tcp_send_fin(sk); - } - } - release_sock(sk); -} - - -/* - * This routine takes stuff off of the write queue, - * and puts it in the xmit queue. This happens as incoming acks - * open up the remote window for us. - */ - -static void tcp_write_xmit(struct sock *sk) -{ - struct sk_buff *skb; - - /* - * The bytes will have to remain here. In time closedown will - * empty the write queue and all will be happy - */ - - if(sk->zapped) - return; - - /* - * Anything on the transmit queue that fits the window can - * be added providing we are not - * - * a) retransmitting (Nagle's rule) - * b) exceeding our congestion window. - */ - - while((skb = skb_peek(&sk->write_queue)) != NULL && - before(skb->h.seq, sk->window_seq + 1) && - (sk->retransmits == 0 || - sk->ip_xmit_timeout != TIME_WRITE || - before(skb->h.seq, sk->rcv_ack_seq + 1)) - && sk->packets_out < sk->cong_window) - { - IS_SKB(skb); - skb_unlink(skb); - - /* - * See if we really need to send the packet. - */ - - if (before(skb->h.seq, sk->rcv_ack_seq +1)) - { - /* - * This is acked data. We can discard it. This - * cannot currently occur. - */ - - sk->retransmits = 0; - kfree_skb(skb, FREE_WRITE); - if (!sk->dead) - sk->write_space(sk); - } - else - { - struct tcphdr *th; - struct iphdr *iph; - int size; -/* - * put in the ack seq and window at this point rather than earlier, - * in order to keep them monotonic. We really want to avoid taking - * back window allocations. 
That's legal, but RFC1122 says it's frowned on. - * Ack and window will in general have changed since this packet was put - * on the write queue. - */ - iph = (struct iphdr *)(skb->data + - skb->dev->hard_header_len); - th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2)); - size = skb->len - (((unsigned char *) th) - skb->data); - - th->ack_seq = ntohl(sk->acked_seq); - th->window = ntohs(tcp_select_window(sk)); - - tcp_send_check(th, sk->saddr, sk->daddr, size, sk); - - sk->sent_seq = skb->h.seq; - - /* - * IP manages our queue for some crazy reason - */ - - sk->prot->queue_xmit(sk, skb->dev, skb, skb->free); - - /* - * Again we slide the timer wrongly - */ - - reset_xmit_timer(sk, TIME_WRITE, sk->rto); - } - } -} - - -/* - * This routine deals with incoming acks, but not outgoing ones. - */ - -extern __inline__ int tcp_ack(struct sock *sk, struct tcphdr *th, unsigned long saddr, int len) -{ - unsigned long ack; - int flag = 0; - - /* - * 1 - there was data in packet as well as ack or new data is sent or - * in shutdown state - * 2 - data from retransmit queue was acked and removed - * 4 - window shrunk or data from retransmit queue was acked and removed - */ - - if(sk->zapped) - return(1); /* Dead, cant ack any more so why bother */ - - /* - * Have we discovered a larger window - */ - - ack = ntohl(th->ack_seq); - - if (ntohs(th->window) > sk->max_window) - { - sk->max_window = ntohs(th->window); -#ifdef CONFIG_INET_PCTCP - /* Hack because we don't send partial packets to non SWS - handling hosts */ - sk->mss = min(sk->max_window>>1, sk->mtu); -#else - sk->mss = min(sk->max_window, sk->mtu); -#endif - } - - /* - * We have dropped back to keepalive timeouts. Thus we have - * no retransmits pending. - */ - - if (sk->retransmits && sk->ip_xmit_timeout == TIME_KEEPOPEN) - sk->retransmits = 0; - - /* - * If the ack is newer than sent or older than previous acks - * then we can probably ignore it. 
- */ - - if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) - { - if(sk->debug) - printk("Ack ignored %lu %lu\n",ack,sk->sent_seq); - - /* - * Keepalive processing. - */ - - if (after(ack, sk->sent_seq)) - { - return(0); - } - - /* - * Restart the keepalive timer. - */ - - if (sk->keepopen) - { - if(sk->ip_xmit_timeout==TIME_KEEPOPEN) - reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN); - } - return(1); + tcp_send_fin(sk); } - /* - * If there is data set flag 1 - */ - - if (len != th->doff*4) - flag |= 1; - - /* - * See if our window has been shrunk. - */ - - if (after(sk->window_seq, ack+ntohs(th->window))) - { - /* - * We may need to move packets from the send queue - * to the write queue, if the window has been shrunk on us. - * The RFC says you are not allowed to shrink your window - * like this, but if the other end does, you must be able - * to deal with it. - */ - struct sk_buff *skb; - struct sk_buff *skb2; - struct sk_buff *wskb = NULL; - - skb2 = sk->send_head; - sk->send_head = NULL; - sk->send_tail = NULL; - - /* - * This is an artifact of a flawed concept. We want one - * queue and a smarter send routine when we send all. - */ - - flag |= 4; /* Window changed */ - - sk->window_seq = ack + ntohs(th->window); + if (timeout) { cli(); - while (skb2 != NULL) + release_sock(sk); + current->timeout = timeout; + while(closing(sk) && current->timeout) { - skb = skb2; - skb2 = skb->link3; - skb->link3 = NULL; - if (after(skb->h.seq, sk->window_seq)) + interruptible_sleep_on(sk->sleep); + if (current->signal & ~current->blocked) { - if (sk->packets_out > 0) - sk->packets_out--; - /* We may need to remove this from the dev send list. */ - if (skb->next != NULL) - { - skb_unlink(skb); - } - /* Now add it to the write_queue. 
*/ - if (wskb == NULL) - skb_queue_head(&sk->write_queue,skb); - else - skb_append(wskb,skb); - wskb = skb; - } - else - { - if (sk->send_head == NULL) - { - sk->send_head = skb; - sk->send_tail = skb; - } - else - { - sk->send_tail->link3 = skb; - sk->send_tail = skb; - } - skb->link3 = NULL; + break; } } + current->timeout=0; + lock_sock(sk); sti(); } /* - * Pipe has emptied - */ - - if (sk->send_tail == NULL || sk->send_head == NULL) - { - sk->send_head = NULL; - sk->send_tail = NULL; - sk->packets_out= 0; - } - - /* - * Update the right hand window edge of the host - */ - - sk->window_seq = ack + ntohs(th->window); - - /* - * We don't want too many packets out there. - */ - - if (sk->ip_xmit_timeout == TIME_WRITE && - sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) - { - /* - * This is Jacobson's slow start and congestion avoidance. - * SIGCOMM '88, p. 328. Because we keep cong_window in integral - * mss's, we can't do cwnd += 1 / cwnd. Instead, maintain a - * counter and increment it once every cwnd times. It's possible - * that this should be done only if sk->retransmits == 0. I'm - * interpreting "new data is acked" as including data that has - * been retransmitted but is just now being acked. - */ - if (sk->cong_window < sk->ssthresh) - /* - * In "safe" area, increase - */ - sk->cong_window++; - else - { - /* - * In dangerous area, increase slowly. In theory this is - * sk->cong_window += 1 / sk->cong_window - */ - if (sk->cong_count >= sk->cong_window) - { - sk->cong_window++; - sk->cong_count = 0; - } - else - sk->cong_count++; - } - } - - /* - * Remember the highest ack received. - */ - - sk->rcv_ack_seq = ack; - - /* - * If this ack opens up a zero window, clear backoff. It was - * being used to time the probes, and is probably far higher than - * it needs to be for normal retransmission. - */ - - if (sk->ip_xmit_timeout == TIME_PROBE0) - { - sk->retransmits = 0; /* Our probe was answered */ - - /* - * Was it a usable window open ? 
- */ - - if (skb_peek(&sk->write_queue) != NULL && /* should always be non-null */ - ! before (sk->window_seq, sk->write_queue.next->h.seq)) - { - sk->backoff = 0; - - /* - * Recompute rto from rtt. this eliminates any backoff. - */ - - sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1; - if (sk->rto > 120*HZ) - sk->rto = 120*HZ; - if (sk->rto < 20) /* Was 1*HZ, then 1 - turns out we must allow about - .2 of a second because of BSD delayed acks - on a 100Mb/sec link - .2 of a second is going to need huge windows (SIGH) */ - sk->rto = 20; - } - } - - /* - * See if we can take anything off of the retransmit queue. - */ - - while(sk->send_head != NULL) - { - /* Check for a bug. */ - if (sk->send_head->link3 && - after(sk->send_head->h.seq, sk->send_head->link3->h.seq)) - printk("INET: tcp.c: *** bug send_list out of order.\n"); - - /* - * If our packet is before the ack sequence we can - * discard it as it's confirmed to have arrived the other end. - */ - - if (before(sk->send_head->h.seq, ack+1)) - { - struct sk_buff *oskb; - if (sk->retransmits) - { - /* - * We were retransmitting. don't count this in RTT est - */ - flag |= 2; - - /* - * even though we've gotten an ack, we're still - * retransmitting as long as we're sending from - * the retransmit queue. Keeping retransmits non-zero - * prevents us from getting new data interspersed with - * retransmissions. - */ - - if (sk->send_head->link3) /* Any more queued retransmits? */ - sk->retransmits = 1; - else - sk->retransmits = 0; - } - /* - * Note that we only reset backoff and rto in the - * rtt recomputation code. And that doesn't happen - * if there were retransmissions in effect. So the - * first new packet after the retransmissions is - * sent with the backoff still in effect. Not until - * we get an ack from a non-retransmitted packet do - * we reset the backoff and rto. This allows us to deal - * with a situation where the network delay has increased - * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.) 
- */ - - /* - * We have one less packet out there. - */ - - if (sk->packets_out > 0) - sk->packets_out --; - /* - * Wake up the process, it can probably write more. - */ - if (!sk->dead) - sk->write_space(sk); - oskb = sk->send_head; - - if (!(flag&2)) /* Not retransmitting */ - { - long m; - - /* - * The following amusing code comes from Jacobson's - * article in SIGCOMM '88. Note that rtt and mdev - * are scaled versions of rtt and mean deviation. - * This is designed to be as fast as possible - * m stands for "measurement". - */ - - m = jiffies - oskb->when; /* RTT */ - if(m<=0) - m=1; /* IS THIS RIGHT FOR <0 ??? */ - m -= (sk->rtt >> 3); /* m is now error in rtt est */ - sk->rtt += m; /* rtt = 7/8 rtt + 1/8 new */ - if (m < 0) - m = -m; /* m is now abs(error) */ - m -= (sk->mdev >> 2); /* similar update on mdev */ - sk->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ - - /* - * Now update timeout. Note that this removes any backoff. - */ - - sk->rto = ((sk->rtt >> 2) + sk->mdev) >> 1; - if (sk->rto > 120*HZ) - sk->rto = 120*HZ; - if (sk->rto < 20) /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */ - sk->rto = 20; - sk->backoff = 0; - } - flag |= (2|4); /* 2 is really more like 'don't adjust the rtt - In this case as we just set it up */ - cli(); - oskb = sk->send_head; - IS_SKB(oskb); - sk->send_head = oskb->link3; - if (sk->send_head == NULL) - { - sk->send_tail = NULL; - } - - /* - * We may need to remove this from the dev send list. - */ - - if (oskb->next) - skb_unlink(oskb); - sti(); - kfree_skb(oskb, FREE_WRITE); /* write. */ - if (!sk->dead) - sk->write_space(sk); - } - else - { - break; - } - } - - /* - * XXX someone ought to look at this too.. at the moment, if skb_peek() - * returns non-NULL, we complete ignore the timer stuff in the else - * clause. We ought to organize the code so that else clause can - * (should) be executed regardless, possibly moving the PROBE timer - * reset over. 
The skb_peek() thing should only move stuff to the - * write queue, NOT also manage the timer functions. - */ - - /* - * Maybe we can take some stuff off of the write queue, - * and put it onto the xmit queue. - */ - if (skb_peek(&sk->write_queue) != NULL) - { - if (after (sk->window_seq+1, sk->write_queue.next->h.seq) && - (sk->retransmits == 0 || - sk->ip_xmit_timeout != TIME_WRITE || - before(sk->write_queue.next->h.seq, sk->rcv_ack_seq + 1)) - && sk->packets_out < sk->cong_window) - { - /* - * Add more data to the send queue. - */ - flag |= 1; - tcp_write_xmit(sk); - } - else if (before(sk->window_seq, sk->write_queue.next->h.seq) && - sk->send_head == NULL && - sk->ack_backlog == 0 && - sk->state != TCP_TIME_WAIT) - { - /* - * Data to queue but no room. - */ - reset_xmit_timer(sk, TIME_PROBE0, sk->rto); - } - } - else - { - /* - * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets - * from TCP_CLOSE we don't do anything - * - * from anything else, if there is write data (or fin) pending, - * we use a TIME_WRITE timeout, else if keepalive we reset to - * a KEEPALIVE timeout, else we delete the timer. - * - * We do not set flag for nominal write data, otherwise we may - * force a state where we start to write itsy bitsy tidbits - * of data. - */ - - switch(sk->state) { - case TCP_TIME_WAIT: - /* - * keep us in TIME_WAIT until we stop getting packets, - * reset the timeout. - */ - reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); - break; - case TCP_CLOSE: - /* - * don't touch the timer. - */ - break; - default: - /* - * Must check send_head, write_queue, and ack_backlog - * to determine which timeout to use. 
- */ - if (sk->send_head || skb_peek(&sk->write_queue) != NULL || sk->ack_backlog) { - reset_xmit_timer(sk, TIME_WRITE, sk->rto); - } else if (sk->keepopen) { - reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN); - } else { - del_timer(&sk->retransmit_timer); - sk->ip_xmit_timeout = 0; - } - break; - } - } - - /* - * We have nothing queued but space to send. Send any partial - * packets immediately (end of Nagle rule application). - */ - - if (sk->packets_out == 0 && sk->partial != NULL && - skb_peek(&sk->write_queue) == NULL && sk->send_head == NULL) - { - flag |= 1; - tcp_send_partial(sk); - } - - /* - * In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and - * we are now waiting for an acknowledge to our FIN. The other end is - * already in TIME_WAIT. - * - * Move to TCP_CLOSE on success. - */ - - if (sk->state == TCP_LAST_ACK) - { - if (!sk->dead) - sk->state_change(sk); - if(sk->debug) - printk("rcv_ack_seq: %lX==%lX, acked_seq: %lX==%lX\n", - sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq); - if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) - { - flag |= 1; - tcp_set_state(sk,TCP_CLOSE); - sk->shutdown = SHUTDOWN_MASK; - } - } - - /* - * Incoming ACK to a FIN we sent in the case of our initiating the close. - * - * Move to FIN_WAIT2 to await a FIN from the other end. Set - * SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in. - */ - - if (sk->state == TCP_FIN_WAIT1) - { - - if (!sk->dead) - sk->state_change(sk); - if (sk->rcv_ack_seq == sk->write_seq) - { - flag |= 1; - sk->shutdown |= SEND_SHUTDOWN; - tcp_set_state(sk, TCP_FIN_WAIT2); - } - } - - /* - * Incoming ACK to a FIN we sent in the case of a simultaneous close. - * - * Move to TIME_WAIT + * This will destroy it. The timers will take care of actually + * free'ing up the memory. */ + tcp_cache_zap(); /* Kill the cache again. 
*/ - if (sk->state == TCP_CLOSING) + /* Now that the socket is dead, if we are in the FIN_WAIT2 state + * we may need to set up a timer. + */ + if (sk->state==TCP_FIN_WAIT2) { - - if (!sk->dead) - sk->state_change(sk); - if (sk->rcv_ack_seq == sk->write_seq) - { - flag |= 1; - tcp_time_wait(sk); - } - } - - /* - * Final ack of a three way shake - */ - - if(sk->state==TCP_SYN_RECV) - { - tcp_set_state(sk, TCP_ESTABLISHED); - tcp_options(sk,th); - sk->dummy_th.dest=th->source; - sk->copied_seq = sk->acked_seq; - if(!sk->dead) - sk->state_change(sk); - if(sk->max_window==0) - { - sk->max_window=32; /* Sanity check */ - sk->mss=min(sk->max_window,sk->mtu); - } - } - - /* - * I make no guarantees about the first clause in the following - * test, i.e. "(!flag) || (flag&4)". I'm not entirely sure under - * what conditions "!flag" would be true. However I think the rest - * of the conditions would prevent that from causing any - * unnecessary retransmission. - * Clearly if the first packet has expired it should be - * retransmitted. The other alternative, "flag&2 && retransmits", is - * harder to explain: You have to look carefully at how and when the - * timer is set and with what timeout. The most recent transmission always - * sets the timer. So in general if the most recent thing has timed - * out, everything before it has as well. So we want to go ahead and - * retransmit some more. If we didn't explicitly test for this - * condition with "flag&2 && retransmits", chances are "when + rto < jiffies" - * would not be true. If you look at the pattern of timing, you can - * show that rto is increased fast enough that the next packet would - * almost never be retransmitted immediately. Then you'd end up - * waiting for a timeout to send each packet on the retransmission - * queue. With my implementation of the Karn sampling algorithm, - * the timeout would double each time. 
The net result is that it would - * take a hideous amount of time to recover from a single dropped packet. - * It's possible that there should also be a test for TIME_WRITE, but - * I think as long as "send_head != NULL" and "retransmit" is on, we've - * got to be in real retransmission mode. - * Note that tcp_do_retransmit is called with all==1. Setting cong_window - * back to 1 at the timeout will cause us to send 1, then 2, etc. packets. - * As long as no further losses occur, this seems reasonable. - */ - - if (((!flag) || (flag&4)) && sk->send_head != NULL && - (((flag&2) && sk->retransmits) || - (sk->send_head->when + sk->rto < jiffies))) - { - if(sk->send_head->when + sk->rto < jiffies) - tcp_retransmit(sk,0); + int timer_active=del_timer(&sk->timer); + if(timer_active) + add_timer(&sk->timer); else - { - tcp_do_retransmit(sk, 1); - reset_xmit_timer(sk, TIME_WRITE, sk->rto); - } + tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT); } - return(1); + release_sock(sk); + sk->dead = 1; } /* - * Process the FIN bit. This now behaves as it is supposed to work - * and the FIN takes effect when it is validly part of sequence - * space. Not before when we get holes. - * - * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT - * (and thence onto LAST-ACK and finally, CLOSE, we never enter - * TIME-WAIT) - * - * If we are in FINWAIT-1, a received FIN indicates simultaneous - * close and we go into CLOSING (and later onto TIME-WAIT) - * - * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. - * + * Wait for an incoming connection, avoid race + * conditions. This must be called with the socket locked. 
*/ - -static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) +static struct open_request * wait_for_connect(struct sock * sk) { - sk->fin_seq = th->seq + skb->len + th->syn + th->fin; - - if (!sk->dead) - { - sk->state_change(sk); - sock_wake_async(sk->socket, 1); - } - - switch(sk->state) - { - case TCP_SYN_RECV: - case TCP_SYN_SENT: - case TCP_ESTABLISHED: - /* - * move to CLOSE_WAIT, tcp_data() already handled - * sending the ack. - */ - tcp_set_state(sk,TCP_CLOSE_WAIT); - if (th->rst) - sk->shutdown = SHUTDOWN_MASK; - break; - - case TCP_CLOSE_WAIT: - case TCP_CLOSING: - /* - * received a retransmission of the FIN, do - * nothing. - */ - break; - case TCP_TIME_WAIT: - /* - * received a retransmission of the FIN, - * restart the TIME_WAIT timer. - */ - reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); - return(0); - case TCP_FIN_WAIT1: - /* - * This case occurs when a simultaneous close - * happens, we must ack the received FIN and - * enter the CLOSING state. - * - * This causes a WRITE timeout, which will either - * move on to TIME_WAIT when we timeout, or resend - * the FIN properly (maybe we get rid of that annoying - * FIN lost hang). The TIME_WRITE code is already correct - * for handling this timeout. 
- */ + struct wait_queue wait = { current, NULL }; + struct open_request *req = NULL; - if(sk->ip_xmit_timeout != TIME_WRITE) - reset_xmit_timer(sk, TIME_WRITE, sk->rto); - tcp_set_state(sk,TCP_CLOSING); + add_wait_queue(sk->sleep, &wait); + for (;;) { + current->state = TASK_INTERRUPTIBLE; + release_sock(sk); + schedule(); + lock_sock(sk); + req = tcp_find_established(&(sk->tp_pinfo.af_tcp)); + if (req) break; - case TCP_FIN_WAIT2: - /* - * received a FIN -- send ACK and enter TIME_WAIT - */ - reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); - sk->shutdown|=SHUTDOWN_MASK; - tcp_set_state(sk,TCP_TIME_WAIT); + if (current->signal & ~current->blocked) break; - case TCP_CLOSE: - /* - * already in CLOSE - */ - break; - default: - tcp_set_state(sk,TCP_LAST_ACK); - - /* Start the timers. */ - reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); - return(0); } - - return(0); -} - - - -/* - * This routine handles the data. If there is room in the buffer, - * it will be have already been moved into it. If there is no - * room, then we will just have to discard the packet. - */ - -extern __inline__ int tcp_data(struct sk_buff *skb, struct sock *sk, - unsigned long saddr, unsigned short len) -{ - struct sk_buff *skb1, *skb2; - struct tcphdr *th; - int dup_dumped=0; - unsigned long new_seq; - unsigned long shut_seq; - - th = skb->h.th; - skb->len = len -(th->doff*4); - - /* - * The bytes in the receive read/assembly queue has increased. Needed for the - * low memory discard algorithm - */ - - sk->bytes_rcv += skb->len; - - if (skb->len == 0 && !th->fin) - { - /* - * Don't want to keep passing ack's back and forth. - * (someone sent us dataless, boring frame) - */ - if (!th->ack) - tcp_send_ack(sk->sent_seq, sk->acked_seq,sk, th, saddr); - kfree_skb(skb, FREE_READ); - return(0); - } - - /* - * We no longer have anyone receiving data on this connection. 
- */ - -#ifndef TCP_DONT_RST_SHUTDOWN - - if(sk->shutdown & RCV_SHUTDOWN) - { - /* - * FIXME: BSD has some magic to avoid sending resets to - * broken 4.2 BSD keepalives. Much to my surprise a few non - * BSD stacks still have broken keepalives so we want to - * cope with it. - */ - - if(skb->len) /* We don't care if it's just an ack or - a keepalive/window probe */ - { - new_seq= th->seq + skb->len + th->syn; /* Right edge of _data_ part of frame */ - - /* Do this the way 4.4BSD treats it. Not what I'd - regard as the meaning of the spec but it's what BSD - does and clearly they know everything 8) */ - - /* - * This is valid because of two things - * - * a) The way tcp_data behaves at the bottom. - * b) A fin takes effect when read not when received. - */ - - shut_seq=sk->acked_seq+1; /* Last byte */ - - if(after(new_seq,shut_seq)) - { - if(sk->debug) - printk("Data arrived on %p after close [Data right edge %lX, Socket shut on %lX] %d\n", - sk, new_seq, shut_seq, sk->blog); - if(sk->dead) - { - sk->acked_seq = new_seq + th->fin; - tcp_reset(sk->saddr, sk->daddr, skb->h.th, - sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl); - tcp_statistics.TcpEstabResets++; - tcp_set_state(sk,TCP_CLOSE); - sk->err = EPIPE; - sk->shutdown = SHUTDOWN_MASK; - kfree_skb(skb, FREE_READ); - return 0; - } - } - } - } - -#endif - - /* - * Now we have to walk the chain, and figure out where this one - * goes into it. This is set up so that the last packet we received - * will be the first one we look at, that way if everything comes - * in order, there will be no performance loss, and if they come - * out of order we will be able to fit things in nicely. - * - * [AC: This is wrong. We should assume in order first and then walk - * forwards from the first hole based upon real traffic patterns.] 
- * - */ - - if (skb_peek(&sk->receive_queue) == NULL) /* Empty queue is easy case */ - { - skb_queue_head(&sk->receive_queue,skb); - skb1= NULL; - } - else - { - for(skb1=sk->receive_queue.prev; ; skb1 = skb1->prev) - { - if(sk->debug) - { - printk("skb1=%p :", skb1); - printk("skb1->h.th->seq = %ld: ", skb1->h.th->seq); - printk("skb->h.th->seq = %ld\n",skb->h.th->seq); - printk("copied_seq = %ld acked_seq = %ld\n", sk->copied_seq, - sk->acked_seq); - } - - /* - * Optimisation: Duplicate frame or extension of previous frame from - * same sequence point (lost ack case). - * The frame contains duplicate data or replaces a previous frame - * discard the previous frame (safe as sk->inuse is set) and put - * the new one in its place. - */ - - if (th->seq==skb1->h.th->seq && skb->len>= skb1->len) - { - skb_append(skb1,skb); - skb_unlink(skb1); - kfree_skb(skb1,FREE_READ); - dup_dumped=1; - skb1=NULL; - break; - } - - /* - * Found where it fits - */ - - if (after(th->seq+1, skb1->h.th->seq)) - { - skb_append(skb1,skb); - break; - } - - /* - * See if we've hit the start. If so insert. - */ - if (skb1 == skb_peek(&sk->receive_queue)) - { - skb_queue_head(&sk->receive_queue, skb); - break; - } - } - } - - /* - * Figure out what the ack value for this frame is - */ - - th->ack_seq = th->seq + skb->len; - if (th->syn) - th->ack_seq++; - if (th->fin) - th->ack_seq++; - - if (before(sk->acked_seq, sk->copied_seq)) - { - printk("*** tcp.c:tcp_data bug acked < copied\n"); - sk->acked_seq = sk->copied_seq; - } - - /* - * Now figure out if we can ack anything. This is very messy because we really want two - * receive queues, a completed and an assembly queue. We also want only one transmit - * queue. 
- */ - - if ((!dup_dumped && (skb1 == NULL || skb1->acked)) || before(th->seq, sk->acked_seq+1)) - { - if (before(th->seq, sk->acked_seq+1)) - { - int newwindow; - - if (after(th->ack_seq, sk->acked_seq)) - { - newwindow = sk->window-(th->ack_seq - sk->acked_seq); - if (newwindow < 0) - newwindow = 0; - sk->window = newwindow; - sk->acked_seq = th->ack_seq; - } - skb->acked = 1; - - /* - * When we ack the fin, we do the FIN - * processing. - */ - - if (skb->h.th->fin) - { - tcp_fin(skb,sk,skb->h.th); - } - - for(skb2 = skb->next; - skb2 != (struct sk_buff *)&sk->receive_queue; - skb2 = skb2->next) - { - if (before(skb2->h.th->seq, sk->acked_seq+1)) - { - if (after(skb2->h.th->ack_seq, sk->acked_seq)) - { - newwindow = sk->window - - (skb2->h.th->ack_seq - sk->acked_seq); - if (newwindow < 0) - newwindow = 0; - sk->window = newwindow; - sk->acked_seq = skb2->h.th->ack_seq; - } - skb2->acked = 1; - /* - * When we ack the fin, we do - * the fin handling. - */ - if (skb2->h.th->fin) - { - tcp_fin(skb,sk,skb->h.th); - } - - /* - * Force an immediate ack. - */ - - sk->ack_backlog = sk->max_ack_backlog; - } - else - { - break; - } - } - - /* - * This also takes care of updating the window. - * This if statement needs to be simplified. - */ - if (!sk->delay_acks || - sk->ack_backlog >= sk->max_ack_backlog || - sk->bytes_rcv > sk->max_unacked || th->fin) { - /* tcp_send_ack(sk->sent_seq, sk->acked_seq,sk,th, saddr); */ - } - else - { - sk->ack_backlog++; - if(sk->debug) - printk("Ack queued.\n"); - reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME); - } - } - } - - /* - * If we've missed a packet, send an ack. - * Also start a timer to send another. - */ - - if (!skb->acked) - { - - /* - * This is important. If we don't have much room left, - * we need to throw out a few packets so we have a good - * window. Note that mtu is used, not mss, because mss is really - * for the send side. He could be sending us stuff as large as mtu. 
- */ - - while (sk->prot->rspace(sk) < sk->mtu) - { - skb1 = skb_peek(&sk->receive_queue); - if (skb1 == NULL) - { - printk("INET: tcp.c:tcp_data memory leak detected.\n"); - break; - } - - /* - * Don't throw out something that has been acked. - */ - - if (skb1->acked) - { - break; - } - - skb_unlink(skb1); - kfree_skb(skb1, FREE_READ); - } - tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr); - sk->ack_backlog++; - reset_xmit_timer(sk, TIME_WRITE, TCP_ACK_TIME); - } - else - { - tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr); - } - - /* - * Now tell the user we may have some data. - */ - - if (!sk->dead) - { - if(sk->debug) - printk("Data wakeup.\n"); - sk->data_ready(sk,0); - } - return(0); + remove_wait_queue(sk->sleep, &wait); + return req; } /* - * This routine is only called when we have urgent data - * signalled. Its the 'slow' part of tcp_urg. It could be - * moved inline now as tcp_urg is only called from one - * place. We handle URGent data wrong. We have to - as - * BSD still doesn't use the correction from RFC961. + * This will accept the next outstanding connection. + * + * Be careful about race conditions here - this is subtle. */ - -static void tcp_check_urg(struct sock * sk, struct tcphdr * th) -{ - unsigned long ptr = ntohs(th->urg_ptr); - - if (ptr) - ptr--; - ptr += th->seq; - - /* ignore urgent data that we've already seen and read */ - if (after(sk->copied_seq, ptr)) - return; - - /* do we already have a newer (or duplicate) urgent pointer? */ - if (sk->urg_data && !after(ptr, sk->urg_seq)) - return; - - /* tell the world about our new urgent pointer */ - if (sk->proc != 0) { - if (sk->proc > 0) { - kill_proc(sk->proc, SIGURG, 1); - } else { - kill_pg(-sk->proc, SIGURG, 1); - } - } - sk->urg_data = URG_NOTYET; - sk->urg_seq = ptr; -} -/* - * This is the 'fast' part of urgent handling. 
- */ - -extern __inline__ int tcp_urg(struct sock *sk, struct tcphdr *th, - unsigned long saddr, unsigned long len) +struct sock *tcp_accept(struct sock *sk, int flags) { - unsigned long ptr; - - /* - * Check if we get a new urgent pointer - normally not - */ - - if (th->urg) - tcp_check_urg(sk,th); - - /* - * Do we wait for any urgent data? - normally not - */ - - if (sk->urg_data != URG_NOTYET) - return 0; + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct open_request *req; + struct sock *newsk = NULL; + int error; - /* - * Is the urgent pointer pointing into this packet? - */ - - ptr = sk->urg_seq - th->seq + th->doff*4; - if (ptr >= len) - return 0; - - /* - * Ok, got the correct packet, update info - */ - - sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th); - if (!sk->dead) - sk->data_ready(sk,0); - return 0; -} - -/* - * This will accept the next outstanding connection. - */ - -static struct sock *tcp_accept(struct sock *sk, int flags) -{ - struct sock *newsk; - struct sk_buff *skb; - /* * We need to make sure that this socket is listening, * and that it has something pending. */ - if (sk->state != TCP_LISTEN) - { - sk->err = EINVAL; - return(NULL); - } - - /* Avoid the race. */ - cli(); - sk->inuse = 1; - - while((skb = tcp_dequeue_established(sk)) == NULL) - { - if (flags & O_NONBLOCK) - { - sti(); - release_sock(sk); - sk->err = EAGAIN; - return(NULL); - } - + error = EINVAL; + if (sk->state != TCP_LISTEN) + goto no_listen; + + lock_sock(sk); + + req = tcp_find_established(tp); + if (req) { +got_new_connect: + tcp_synq_unlink(tp, req); + newsk = req->sk; + kfree(req); + sk->ack_backlog--; + error = 0; +out: release_sock(sk); - interruptible_sleep_on(sk->sleep); - if (current->signal & ~current->blocked) - { - sti(); - sk->err = ERESTARTSYS; - return(NULL); - } - sk->inuse = 1; - } - sti(); - - /* - * Now all we need to do is return skb->sk. 
- */ - - newsk = skb->sk; - - kfree_skb(skb, FREE_READ); - sk->ack_backlog--; - release_sock(sk); - return(newsk); +no_listen: + sk->err = error; + return newsk; + } + + error = EAGAIN; + if (flags & O_NONBLOCK) + goto out; + req = wait_for_connect(sk); + if (req) + goto got_new_connect; + error = ERESTARTSYS; + goto out; } /* - * This will initiate an outgoing connection. + * Socket option code for TCP. */ - -static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len) -{ - struct sk_buff *buff; - struct device *dev=NULL; - unsigned char *ptr; - int tmp; - int atype; - struct tcphdr *t1; - struct rtable *rt; - - if (sk->state != TCP_CLOSE) - { - return(-EISCONN); - } - - if (addr_len < 8) - return(-EINVAL); - - if (usin->sin_family && usin->sin_family != AF_INET) - return(-EAFNOSUPPORT); - - /* - * connect() to INADDR_ANY means loopback (BSD'ism). - */ - - if(usin->sin_addr.s_addr==INADDR_ANY) - usin->sin_addr.s_addr=ip_my_addr(); - - /* - * Don't want a TCP connection going to a broadcast address - */ - - if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST) - return -ENETUNREACH; - sk->inuse = 1; - sk->daddr = usin->sin_addr.s_addr; - sk->write_seq = tcp_init_seq(); - sk->window_seq = sk->write_seq; - sk->rcv_ack_seq = sk->write_seq -1; - sk->err = 0; - sk->dummy_th.dest = usin->sin_port; - release_sock(sk); - - buff = sk->prot->wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL); - if (buff == NULL) - { - return(-ENOMEM); - } - sk->inuse = 1; - buff->len = 24; - buff->sk = sk; - buff->free = 0; - buff->localroute = sk->localroute; - - t1 = (struct tcphdr *) buff->data; - - /* - * Put in the IP header and routing stuff. - */ - - rt=ip_rt_route(sk->daddr, NULL, NULL); - - - /* - * We need to build the routing stuff from the things saved in skb. 
- */ - - tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev, - IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl); - if (tmp < 0) - { - sk->prot->wfree(sk, buff->mem_addr, buff->mem_len); - release_sock(sk); - return(-ENETUNREACH); - } - - buff->len += tmp; - t1 = (struct tcphdr *)((char *)t1 +tmp); - - memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1)); - t1->seq = ntohl(sk->write_seq++); - sk->sent_seq = sk->write_seq; - buff->h.seq = sk->write_seq; - t1->ack = 0; - t1->window = 2; - t1->res1=0; - t1->res2=0; - t1->rst = 0; - t1->urg = 0; - t1->psh = 0; - t1->syn = 1; - t1->urg_ptr = 0; - t1->doff = 6; - /* use 512 or whatever user asked for */ - - if(rt!=NULL && (rt->rt_flags&RTF_WINDOW)) - sk->window_clamp=rt->rt_window; - else - sk->window_clamp=0; - - if (sk->user_mss) - sk->mtu = sk->user_mss; - else if(rt!=NULL && (rt->rt_flags&RTF_MTU)) - sk->mtu = rt->rt_mss; - else - { -#ifdef CONFIG_INET_SNARL - if ((sk->saddr ^ sk->daddr) & default_mask(sk->saddr)) -#else - if ((sk->saddr ^ sk->daddr) & dev->pa_mask) -#endif - sk->mtu = 576 - HEADER_SIZE; - else - sk->mtu = MAX_WINDOW; - } - /* - * but not bigger than device MTU - */ - - if(sk->mtu <32) - sk->mtu = 32; /* Sanity limit */ - - sk->mtu = min(sk->mtu, dev->mtu - HEADER_SIZE); - - /* - * Put in the TCP options to say MTU. - */ - - ptr = (unsigned char *)(t1+1); - ptr[0] = 2; - ptr[1] = 4; - ptr[2] = (sk->mtu) >> 8; - ptr[3] = (sk->mtu) & 0xff; - tcp_send_check(t1, sk->saddr, sk->daddr, - sizeof(struct tcphdr) + 4, sk); - - /* - * This must go first otherwise a really quick response will get reset. 
- */ - - tcp_cache_zap(); - tcp_set_state(sk,TCP_SYN_SENT); - if(rt&&rt->rt_flags&RTF_IRTT) - sk->rto = rt->rt_irtt; - else - sk->rto = TCP_TIMEOUT_INIT; - sk->retransmit_timer.function=&retransmit_timer; - sk->retransmit_timer.data = (unsigned long)sk; - reset_xmit_timer(sk, TIME_WRITE, sk->rto); /* Timer for repeating the SYN until an answer */ - sk->retransmits = TCP_SYN_RETRIES; - - sk->prot->queue_xmit(sk, dev, buff, 0); - reset_xmit_timer(sk, TIME_WRITE, sk->rto); - tcp_statistics.TcpActiveOpens++; - tcp_statistics.TcpOutSegs++; - - release_sock(sk); - return(0); -} - - -/* This functions checks to see if the tcp header is actually acceptable. */ -extern __inline__ int tcp_sequence(struct sock *sk, struct tcphdr *th, short len, - struct options *opt, unsigned long saddr, struct device *dev) +int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, + int optlen) { - unsigned long next_seq; - - next_seq = len - 4*th->doff; - if (th->fin) - next_seq++; - /* if we have a zero window, we can't have any data in the packet.. */ - if (next_seq && !sk->window) - goto ignore_it; - next_seq += th->seq; - - /* - * This isn't quite right. sk->acked_seq could be more recent - * than sk->window. This is however close enough. We will accept - * slightly more packets than we should, but it should not cause - * problems unless someone is trying to forge packets. - */ - - /* have we already seen all of this packet? */ - if (!after(next_seq+1, sk->acked_seq)) - goto ignore_it; - /* or does it start beyond the window? */ - if (!before(th->seq, sk->acked_seq + sk->window + 1)) - goto ignore_it; - - /* ok, at least part of this packet would seem interesting.. */ - return 1; - -ignore_it: - if (th->rst) - return 0; - - /* - * Send a reset if we get something not ours and we are - * unsynchronized. Note: We don't do anything to our end. We - * are just killing the bogus remote connection then we will - * connect again and it will work (with luck). 
- */ - - if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) - { - tcp_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl); - return 1; - } - - /* Try to resync things. */ - tcp_send_ack(sk->sent_seq, sk->acked_seq, sk, th, saddr); - return 0; -} - -/* - * When we get a reset we do this. - */ - -static int tcp_std_reset(struct sock *sk, struct sk_buff *skb) -{ - sk->zapped = 1; - sk->err = ECONNRESET; - if (sk->state == TCP_SYN_SENT) - sk->err = ECONNREFUSED; - if (sk->state == TCP_CLOSE_WAIT) - sk->err = EPIPE; -#ifdef TCP_DO_RFC1337 - /* - * Time wait assassination protection [RFC1337] - */ - if(sk->state!=TCP_TIME_WAIT) - { - tcp_set_state(sk,TCP_CLOSE); - sk->shutdown = SHUTDOWN_MASK; - } -#else - tcp_set_state(sk,TCP_CLOSE); - sk->shutdown = SHUTDOWN_MASK; -#endif - if (!sk->dead) - sk->state_change(sk); - kfree_skb(skb, FREE_READ); - release_sock(sk); - return(0); -} + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int val; -/* - * A TCP packet has arrived. - */ - -int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt, - unsigned long daddr, unsigned short len, - unsigned long saddr, int redo, struct inet_protocol * protocol) -{ - struct tcphdr *th; - struct sock *sk; - int syn_ok=0; - - tcp_statistics.TcpInSegs++; - - if(skb->pkt_type!=PACKET_HOST) + if (level != SOL_TCP) { - kfree_skb(skb,FREE_READ); - return(0); + return tp->af_specific->setsockopt(sk, level, optname, + optval, optlen); } - - th = skb->h.th; - /* - * Find the socket, using the last hit cache if applicable. - */ - - if(saddr==th_cache_saddr && daddr==th_cache_daddr && th->dest==th_cache_dport && th->source==th_cache_sport) - sk=(struct sock *)th_cache_sk; - else - { - sk = get_sock(&tcp_prot, th->dest, saddr, th->source, daddr); - th_cache_saddr=saddr; - th_cache_daddr=daddr; - th_cache_dport=th->dest; - th_cache_sport=th->source; - th_cache_sk=sk; - } - - /* - * If this socket has got a reset it's to all intents and purposes - * really dead. 
Count closed sockets as dead. - * - * Note: BSD appears to have a bug here. A 'closed' TCP in BSD - * simply drops data. This seems incorrect as a 'closed' TCP doesn't - * exist so should cause resets as if the port was unreachable. - */ - - if (sk!=NULL && (sk->zapped || sk->state==TCP_CLOSE)) - sk=NULL; - - if (!redo) - { - if (tcp_check(th, len, saddr, daddr )) - { - skb->sk = NULL; - kfree_skb(skb,FREE_READ); - /* - * We don't release the socket because it was - * never marked in use. - */ - return(0); - } - th->seq = ntohl(th->seq); - - /* See if we know about the socket. */ - if (sk == NULL) - { - /* - * No such TCB. If th->rst is 0 send a reset (checked in tcp_reset) - */ - tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255); - skb->sk = NULL; - /* - * Discard frame - */ - kfree_skb(skb, FREE_READ); - return(0); - } - - skb->len = len; - skb->acked = 0; - skb->used = 0; - skb->free = 0; - skb->saddr = daddr; - skb->daddr = saddr; - - /* We may need to add it to the backlog here. */ - cli(); - if (sk->inuse) - { - skb_queue_tail(&sk->back_log, skb); - sti(); - return(0); - } - sk->inuse = 1; - sti(); - } - else - { - if (sk==NULL) - { - tcp_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255); - skb->sk = NULL; - kfree_skb(skb, FREE_READ); - return(0); - } - } - - - if (!sk->prot) - { - printk("IMPOSSIBLE 3\n"); - return(0); - } - - - /* - * Charge the memory to the socket. - */ - - if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf) - { - kfree_skb(skb, FREE_READ); - release_sock(sk); - return(0); - } - - skb->sk=sk; - sk->rmem_alloc += skb->mem_len; - - /* - * This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We - * don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug - * compatibility. We also set up variables more thoroughly [Karn notes in the - * KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths]. 
- */ - - if(sk->state!=TCP_ESTABLISHED) /* Skip this lot for normal flow */ - { - - /* - * Now deal with unusual cases. - */ - - if(sk->state==TCP_LISTEN) - { - if(th->ack) /* These use the socket TOS.. might want to be the received TOS */ - tcp_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl); - - /* - * We don't care for RST, and non SYN are absorbed (old segments) - * Broadcast/multicast SYN isn't allowed. Note - bug if you change the - * netmask on a running connection it can go broadcast. Even Sun's have - * this problem so I'm ignoring it - */ - - if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR) - { - kfree_skb(skb, FREE_READ); - release_sock(sk); - return 0; - } - - /* - * Guess we need to make a new socket up - */ - - tcp_conn_request(sk, skb, daddr, saddr, opt, dev, tcp_init_seq()); - - /* - * Now we have several options: In theory there is nothing else - * in the frame. KA9Q has an option to send data with the syn, - * BSD accepts data with the syn up to the [to be] advertised window - * and Solaris 2.1 gives you a protocol error. For now we just ignore - * it, that fits the spec precisely and avoids incompatibilities. It - * would be nice in future to drop through and process the data. - */ - - release_sock(sk); - return 0; - } - - /* retransmitted SYN? 
*/ - if (sk->state == TCP_SYN_RECV && th->syn && th->seq+1 == sk->acked_seq) - { - kfree_skb(skb, FREE_READ); - release_sock(sk); - return 0; - } - - /* - * SYN sent means we have to look for a suitable ack and either reset - * for bad matches or go to connected - */ - - if(sk->state==TCP_SYN_SENT) - { - /* Crossed SYN or previous junk segment */ - if(th->ack) - { - /* We got an ack, but it's not a good ack */ - if(!tcp_ack(sk,th,saddr,len)) - { - /* Reset the ack - its an ack from a - different connection [ th->rst is checked in tcp_reset()] */ - tcp_statistics.TcpAttemptFails++; - tcp_reset(daddr, saddr, th, - sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl); - kfree_skb(skb, FREE_READ); - release_sock(sk); - return(0); - } - if(th->rst) - return tcp_std_reset(sk,skb); - if(!th->syn) - { - /* A valid ack from a different connection - start. Shouldn't happen but cover it */ - kfree_skb(skb, FREE_READ); - release_sock(sk); - return 0; - } - /* - * Ok.. it's good. Set up sequence numbers and - * move to established. - */ - syn_ok=1; /* Don't reset this connection for the syn */ - sk->acked_seq=th->seq+1; - sk->fin_seq=th->seq; - tcp_send_ack(sk->sent_seq,sk->acked_seq,sk,th,sk->daddr); - tcp_set_state(sk, TCP_ESTABLISHED); - tcp_options(sk,th); - sk->dummy_th.dest=th->source; - sk->copied_seq = sk->acked_seq; - if(!sk->dead) - { - sk->state_change(sk); - sock_wake_async(sk->socket, 0); - } - if(sk->max_window==0) - { - sk->max_window = 32; - sk->mss = min(sk->max_window, sk->mtu); - } - } - else - { - /* See if SYN's cross. Drop if boring */ - if(th->syn && !th->rst) - { - /* Crossed SYN's are fine - but talking to - yourself is right out... 
*/ - if(sk->saddr==saddr && sk->daddr==daddr && - sk->dummy_th.source==th->source && - sk->dummy_th.dest==th->dest) - { - tcp_statistics.TcpAttemptFails++; - return tcp_std_reset(sk,skb); - } - tcp_set_state(sk,TCP_SYN_RECV); - - /* - * FIXME: - * Must send SYN|ACK here - */ - } - /* Discard junk segment */ - kfree_skb(skb, FREE_READ); - release_sock(sk); - return 0; - } - /* - * SYN_RECV with data maybe.. drop through - */ - goto rfc_step6; - } - - /* - * BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is - * a more complex suggestion for fixing these reuse issues in RFC1644 - * but not yet ready for general use. Also see RFC1379. - */ - -#define BSD_TIME_WAIT -#ifdef BSD_TIME_WAIT - if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && - after(th->seq, sk->acked_seq) && !th->rst) - { - long seq=sk->write_seq; - if(sk->debug) - printk("Doing a BSD time wait\n"); - tcp_statistics.TcpEstabResets++; - sk->rmem_alloc -= skb->mem_len; - skb->sk = NULL; - sk->err=ECONNRESET; - tcp_set_state(sk, TCP_CLOSE); - sk->shutdown = SHUTDOWN_MASK; - release_sock(sk); - sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr); - if (sk && sk->state==TCP_LISTEN) - { - sk->inuse=1; - skb->sk = sk; - sk->rmem_alloc += skb->mem_len; - tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000); - release_sock(sk); - return 0; - } - kfree_skb(skb, FREE_READ); - return 0; - } -#endif - } - - /* - * We are now in normal data flow (see the step list in the RFC) - * Note most of these are inline now. I'll inline the lot when - * I have time to test it hard and look at what gcc outputs - */ - - if(!tcp_sequence(sk,th,len,opt,saddr,dev)) - { - kfree_skb(skb, FREE_READ); - release_sock(sk); - return 0; - } - - if(th->rst) - return tcp_std_reset(sk,skb); - - /* - * !syn_ok is effectively the state test in RFC793. 
- */ - - if(th->syn && !syn_ok) - { - tcp_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255); - return tcp_std_reset(sk,skb); - } - - /* - * Process the ACK - */ - - - if(th->ack && !tcp_ack(sk,th,saddr,len)) - { - /* - * Our three way handshake failed. - */ - - if(sk->state==TCP_SYN_RECV) - { - tcp_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl); - } - kfree_skb(skb, FREE_READ); - release_sock(sk); - return 0; - } - -rfc_step6: /* I'll clean this up later */ - - /* - * Process urgent data - */ - - if(tcp_urg(sk, th, saddr, len)) - { - kfree_skb(skb, FREE_READ); - release_sock(sk); - return 0; - } - - - /* - * Process the encapsulated data - */ - - if(tcp_data(skb,sk, saddr, len)) - { - kfree_skb(skb, FREE_READ); - release_sock(sk); - return 0; - } - - /* - * And done - */ - - release_sock(sk); - return 0; -} - -/* - * This routine sends a packet with an out of date sequence - * number. It assumes the other end will try to ack it. - */ - -static void tcp_write_wakeup(struct sock *sk) -{ - struct sk_buff *buff,*skb; - struct tcphdr *t1; - struct device *dev=NULL; - int tmp; - - if (sk->zapped) - return; /* After a valid reset we can send no more */ - - /* - * Write data can still be transmitted/retransmitted in the - * following states. If any other state is encountered, return. 
- * [listen/close will never occur here anyway] - */ - - if (sk->state != TCP_ESTABLISHED && - sk->state != TCP_CLOSE_WAIT && - sk->state != TCP_FIN_WAIT1 && - sk->state != TCP_LAST_ACK && - sk->state != TCP_CLOSING - ) - { - return; - } - - if (before(sk->sent_seq, sk->window_seq) && - (skb=skb_peek(&sk->write_queue))) - { - /* - * We are probing the opening of a window - * but the window size is != 0 - * must have been a result SWS advoidance ( sender ) - */ - - struct iphdr *iph; - struct tcphdr *th; - struct tcphdr *nth; - unsigned long win_size, ow_size; - void * tcp_data_start; - - win_size = sk->window_seq - sk->sent_seq; - - iph = (struct iphdr *)(skb->data + skb->dev->hard_header_len); - th = (struct tcphdr *)(((char *)iph) +(iph->ihl << 2)); - - buff = sk->prot->wmalloc(sk, win_size + th->doff * 4 + - (iph->ihl << 2) + - skb->dev->hard_header_len, - 1, GFP_ATOMIC); - if ( buff == NULL ) - return; - - buff->len = 0; - - /* - * If we strip the packet on the write queue we must - * be ready to retransmit this one - */ - - buff->free = 0; - - buff->sk = sk; - buff->localroute = sk->localroute; - - tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev, - IPPROTO_TCP, sk->opt, buff->mem_len, - sk->ip_tos,sk->ip_ttl); - if (tmp < 0) - { - sk->prot->wfree(sk, buff->mem_addr, buff->mem_len); - return; - } - - buff->len += tmp; - buff->dev = dev; - - nth = (struct tcphdr *) (buff->data + buff->len); - buff->len += th->doff * 4; - - memcpy(nth, th, th->doff * 4); - - nth->ack = 1; - nth->ack_seq = ntohl(sk->acked_seq); - nth->window = ntohs(tcp_select_window(sk)); - nth->check = 0; - - tcp_data_start = skb->data + skb->dev->hard_header_len + - (iph->ihl << 2) + th->doff * 4; - - memcpy(buff->data + buff->len, tcp_data_start, win_size); - buff->len += win_size; - buff->h.seq = sk->sent_seq + win_size; - - /* - * now: shrink the queue head segment - */ - - th->check = 0; - ow_size = skb->len - win_size - - ((unsigned long) (tcp_data_start - (void *) 
skb->data)); - - memmove(tcp_data_start, tcp_data_start + win_size, ow_size); - skb->len -= win_size; - sk->sent_seq += win_size; - th->seq = htonl(sk->sent_seq); - - if (th->urg) - { - unsigned short urg_ptr; - - urg_ptr = ntohs(th->urg_ptr); - if (urg_ptr <= win_size) - th->urg = 0; - else - { - urg_ptr -= win_size; - th->urg_ptr = htons(urg_ptr); - nth->urg_ptr = htons(win_size); - } - } - - tcp_send_check(nth, sk->saddr, sk->daddr, - nth->doff * 4 + win_size , sk); - } - else - { - buff = sk->prot->wmalloc(sk,MAX_ACK_SIZE,1, GFP_ATOMIC); - if (buff == NULL) - return; - - buff->len = sizeof(struct tcphdr); - buff->free = 1; - buff->sk = sk; - buff->localroute = sk->localroute; - - t1 = (struct tcphdr *) buff->data; - - /* - * Put in the IP header and routing stuff. - */ - - tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev, - IPPROTO_TCP, sk->opt, MAX_ACK_SIZE,sk->ip_tos,sk->ip_ttl); - if (tmp < 0) - { - sk->prot->wfree(sk, buff->mem_addr, buff->mem_len); - return; - } - - buff->len += tmp; - t1 = (struct tcphdr *)((char *)t1 +tmp); - - memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1)); - - /* - * Use a previous sequence. - * This should cause the other end to send an ack. - */ - - t1->seq = htonl(sk->sent_seq-1); - t1->ack = 1; - t1->res1= 0; - t1->res2= 0; - t1->rst = 0; - t1->urg = 0; - t1->psh = 0; - t1->fin = 0; /* We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */ - t1->syn = 0; - t1->ack_seq = ntohl(sk->acked_seq); - t1->window = ntohs(tcp_select_window(sk)); - t1->doff = sizeof(*t1)/4; - tcp_send_check(t1, sk->saddr, sk->daddr, sizeof(*t1), sk); - - } - - /* - * Send it. - */ - - sk->prot->queue_xmit(sk, dev, buff, 1); - tcp_statistics.TcpOutSegs++; -} - -/* - * A window probe timeout has occurred. 
- */ - -void tcp_send_probe0(struct sock *sk) -{ - if (sk->zapped) - return; /* After a valid reset we can send no more */ - - tcp_write_wakeup(sk); - - sk->backoff++; - sk->rto = min(sk->rto << 1, 120*HZ); - reset_xmit_timer (sk, TIME_PROBE0, sk->rto); - sk->retransmits++; - sk->prot->retransmits ++; -} - -/* - * Socket option code for TCP. - */ - -int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) -{ - int val,err; - - if(level!=SOL_TCP) - return ip_setsockopt(sk,level,optname,optval,optlen); - - if (optval == NULL) + if (optval == NULL) return(-EINVAL); - err=verify_area(VERIFY_READ, optval, sizeof(int)); - if(err) - return err; - - val = get_fs_long((unsigned long *)optval); + if (get_user(val, (int *)optval)) + return -EFAULT; switch(optname) { @@ -5103,13 +1813,18 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int op } } -int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen) +int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, + int *optlen) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); int val,err; - if(level!=SOL_TCP) - return ip_getsockopt(sk,level,optname,optval,optlen); - + if(level != SOL_TCP) + { + return tp->af_specific->getsockopt(sk, level, optname, + optval, optlen); + } + switch(optname) { case TCP_MAXSEG: @@ -5121,49 +1836,29 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *o default: return(-ENOPROTOOPT); } - err=verify_area(VERIFY_WRITE, optlen, sizeof(int)); - if(err) - return err; - put_fs_long(sizeof(int),(unsigned long *) optlen); - - err=verify_area(VERIFY_WRITE, optval, sizeof(int)); - if(err) - return err; - put_fs_long(val,(unsigned long *)optval); - - return(0); -} - - -struct proto tcp_prot = { - sock_wmalloc, - sock_rmalloc, - sock_wfree, - sock_rfree, - sock_rspace, - sock_wspace, - tcp_close, - tcp_read, - tcp_write, - tcp_sendto, - tcp_recvfrom, - ip_build_header, - 
tcp_connect, - tcp_accept, - ip_queue_xmit, - tcp_retransmit, - tcp_write_wakeup, - tcp_read_wakeup, - tcp_rcv, - tcp_select, - tcp_ioctl, - NULL, - tcp_shutdown, - tcp_setsockopt, - tcp_getsockopt, - 128, - 0, - "TCP", - 0, 0, - {NULL,} -}; + + err = put_user(sizeof(int),(int *) optlen); + if (!err) + err = put_user(val,(int *)optval); + + return err; +} + +void tcp_set_keepalive(struct sock *sk, int val) +{ + if (!sk->keepopen && val) + { + tcp_inc_slow_timer(TCP_SLT_KEEPALIVE); + } + else if (sk->keepopen && !val) + { + tcp_dec_slow_timer(TCP_SLT_KEEPALIVE); + } +} + +/* + * Local variables: + * compile-command: "gcc -D__KERNEL__ -I/usr/src/linux/include -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer -fno-strength-reduce -pipe -m486 -DCPU=486 -c -o tcp.o tcp.c" + * c-file-style: "Linux" + * End: + */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c new file mode 100644 index 000000000..076568961 --- /dev/null +++ b/net/ipv4/tcp_input.c @@ -0,0 +1,1876 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: @(#)tcp_input.c 1.0.16 05/25/93 + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Mark Evans, <evansmp@uhura.aston.ac.uk> + * Corey Minyard <wf-rch!minyard@relay.EU.net> + * Florian La Roche, <flla@stud.uni-sb.de> + * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> + * Linus Torvalds, <torvalds@cs.helsinki.fi> + * Alan Cox, <gw4pts@gw4pts.ampr.org> + * Matthew Dillon, <dillon@apollo.west.oic.com> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Jorge Cwik, <jorge@laser.satlink.net> + */ + +/* + * TODO + * - A better sock cache + * + */ + +/* + * Changes: + * Pedro Roque : Fast Retransmit/Recovery. + * Two receive queues. + * Retransmit queue handled by TCP. 
+ * Better retransmit timer handling. + * New congestion avoidance. + * Header prediction. + * Variable renaming. + * + * Eric : Fast Retransmit. + * Randy Scott : MSS option defines. + * Eric Schenk : Fixes to slow start algorithm. + * Eric Schenk : Yet another double ACK bug. + * Eric Schenk : Delayed ACK bug fixes. + * Eric Schenk : Floyd style fast retrans war avoidance. + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/sysctl.h> +#include <net/tcp.h> + + + +typedef void (*tcp_sys_cong_ctl_t)(struct sock *sk, + u32 seq, u32 ack, + u32 seq_rtt); + +static void tcp_cong_avoid_vanj(struct sock *sk, u32 seq, u32 ack, + u32 seq_rtt); +static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack, + u32 seq_rtt); + +int sysctl_tcp_cong_avoidance = 0; + +static tcp_sys_cong_ctl_t tcp_sys_cong_ctl_f = &tcp_cong_avoid_vanj; + +/* + * Called each time to estimate the delayed ack timeout. This is + * how it should be done so a fast link isnt impacted by ack delay. + * + * I think we need a medium deviation here also... + * The estimated value is changing to fast + */ + +static void tcp_delack_estimator(struct tcp_opt *tp) +{ + int m; + + /* + * Delayed ACK time estimator. + */ + + m = jiffies - tp->lrcvtime; + + tp->lrcvtime = jiffies; + + if (m < 0) + return; + + /* + * if the mesured value is bigger than + * twice the round trip time ignore it. + */ + if ((m << 2) <= tp->srtt) + { + m -= (tp->iat >> 3); + tp->iat += m; + + if (m <0) + m = -m; + + m -= (tp->iat_mdev >> 2); + tp->iat_mdev += m; + + tp->ato = (tp->iat >> 3) + (tp->iat_mdev >> 2); + + if (tp->ato < HZ/50) + tp->ato = HZ/50; + } + else + tp->ato = 0; +} + +/* + * Called on frames that were known _not_ to have been + * retransmitted [see Karn/Partridge Proceedings SIGCOMM 87]. + * The algorithm is from the SIGCOMM 88 piece by Van Jacobson. 
+ */ + +extern __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) +{ + long m; + /* + * The following amusing code comes from Jacobson's + * article in SIGCOMM '88. Note that rtt and mdev + * are scaled versions of rtt and mean deviation. + * This is designed to be as fast as possible + * m stands for "measurement". + */ + /* + * On a 1990 paper the rto value is changed to: + * RTO = rtt + 4 * mdev + */ + + m = mrtt; /* RTT */ + + if (tp->srtt != 0) { + if(m<=0) + m=1; /* IS THIS RIGHT FOR <0 ??? */ + m -= (tp->srtt >> 3); /* m is now error in rtt est */ + tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */ + if (m < 0) + m = -m; /* m is now abs(error) */ + m -= (tp->mdev >> 2); /* similar update on mdev */ + tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ + } else { + /* no previous measure. */ + tp->srtt = m<<3; /* take the measured time to be rtt */ + tp->mdev = m<<2; /* make sure rto = 3*rtt */ + } + + + /* + * Now update timeout. Note that this removes any backoff. + */ + + tp->rto = (tp->srtt >> 3) + tp->mdev; + + if (tp->rto > 120*HZ) + tp->rto = 120*HZ; + + /* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */ + if (tp->rto < HZ/5) + tp->rto = HZ/5; + + tp->backoff = 0; +} + + +/* + * This functions checks to see if the tcp header is actually acceptable. + */ + +extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 seg_nxt) +{ + u32 end_window = tp->rcv_wup + tp->rcv_wnd; + u32 end_seq = seg_nxt; + + /* + * When the window is open (most common case) + * we want to accept segments if they have yet unseen data + * or in the case of a dataless segment if seg.seq == rcv.nxt + * this means: + * + * if (seq == end_seq) + * end_seq >= rcv.nxt + * else + * end_seq > rcv.nxt + */ + + if (seq == end_seq) + end_seq++; + + return ((before(seq, end_window) && after(end_seq, tp->rcv_nxt)) || + (seq == end_window && seq == end_seq)); +} + +/* + * When we get a reset we do this. This probably is a tcp_output routine + * really. 
+ */ + +static int tcp_reset(struct sock *sk, struct sk_buff *skb) +{ + sk->zapped = 1; + /* + * We want the right error as BSD sees it (and indeed as we do). + */ + switch (sk->state) { + case TCP_TIME_WAIT: + break; + case TCP_SYN_SENT: + sk->err = ECONNREFUSED; + break; + case TCP_CLOSE_WAIT: + sk->err = EPIPE; + break; + default: + sk->err = ECONNRESET; + } +#ifdef CONFIG_TCP_RFC1337 + /* + * Time wait assassination protection [RFC1337] + * + * This is a good idea, but causes more sockets to take time to close. + * + * Ian Heavens has since shown this is an inadequate fix for the protocol + * bug in question. + */ + if(sk->state!=TCP_TIME_WAIT) + { + tcp_set_state(sk,TCP_CLOSE); + sk->shutdown = SHUTDOWN_MASK; + } +#else + tcp_set_state(sk,TCP_CLOSE); + sk->shutdown = SHUTDOWN_MASK; +#endif + if (!sk->dead) + sk->state_change(sk); + + return(0); +} + + +/* + * Look for tcp options. Parses everything but only knows about MSS. + * This routine is always called with the packet containing the SYN. + * However it may also be called with the ack to the SYN. So you + * can't assume this is always the SYN. It's always called after + * we have set up sk->mtu to our own MTU. + * + * We need at minimum to add PAWS support here. Possibly large windows + * as Linux gets deployed on 100Mb/sec networks. 
+ */ + +int tcp_parse_options(struct tcphdr *th) +{ + unsigned char *ptr; + int length=(th->doff*4)-sizeof(struct tcphdr); + int mss = 0; + + ptr = (unsigned char *)(th + 1); + + while(length>0) + { + int opcode=*ptr++; + int opsize=*ptr++; + switch(opcode) + { + case TCPOPT_EOL: + return 0; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + ptr--; /* the opsize=*ptr++ above was a mistake */ + continue; + + default: + if(opsize<=2) /* Avoid silly options looping forever */ + return 0; + switch(opcode) + { + case TCPOPT_MSS: + if(opsize==TCPOLEN_MSS && th->syn) + { + mss = ntohs(*(unsigned short *)ptr); + } + break; + /* Add other options here as people feel the urge to implement stuff like large windows */ + } + ptr+=opsize-2; + length-=opsize; + } + } + + return mss; +} + + +/* + * See draft-stevens-tcpca-spec-01 for documentation. + */ + +static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup) +{ + struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + + /* + * An ACK is a duplicate if: + * (1) it has the same sequence number as the largest number we've + * seen, + * (2) it has the same window as the last ACK, + * (3) we have outstanding data that has not been ACKed + * (4) The packet was not carrying any data. + * (5) [From Floyds paper on fast retransmit wars] + * The packet acked data after high_seq; + */ + + if (ack == tp->snd_una && sk->packets_out && (not_dup == 0) && + after(ack, tp->high_seq)) + { + + sk->dup_acks++; + + + /* + * 1. When the third duplicate ack is received, set ssthresh + * to one half the current congestion window, but no less + * than two segments. Retransmit the missing segment. + */ + + if (sk->dup_acks == 3) + { + sk->ssthresh = max(sk->cong_window >> 1, 2); + sk->cong_window = sk->ssthresh + 3; + tcp_do_retransmit(sk, 0); + } + + /* + * 2. Each time another duplicate ACK arrives, increment + * cwnd by the segment size. [...] Transmit a packet... 
+ * + * Packet transmission will be done on normal flow processing + * since we're not in "retransmit mode" + */ + + if (sk->dup_acks > 3) + { + sk->cong_window++; + } + } + else + { + /* + * 3. When the next ACK arrives that acknowledges new data, + * set cwnd to ssthresh + */ + + if (sk->dup_acks >= 3) + { + sk->tp_pinfo.af_tcp.retrans_head = NULL; + sk->cong_window = sk->ssthresh; + sk->retransmits = 0; + } + sk->dup_acks = 0; + } + +} + +/* + * TCP slow start and congestion avoidance in two flavors: + * RFC 1122 and TCP Vegas. + * + * This is a /proc/sys configurable option. + */ + +#define SHIFT_FACTOR 16 + +static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack, + u32 seq_rtt) +{ + /* + * From: + * TCP Vegas: New Techniques for Congestion + * Detection and Avoidance. + * + * + * Warning: This code is a scratch implementation taken + * from the paper only. The code they distribute seams + * to have improved several things over the initial spec. + */ + + struct tcp_opt * tp; + unsigned int Actual, Expected; + unsigned int inv_rtt, inv_basertt; + u32 snt_bytes; + + + tp = &(sk->tp_pinfo.af_tcp); + + if (!seq_rtt) + seq_rtt = 1; + + if (tp->basertt) + tp->basertt = min(seq_rtt, tp->basertt); + else + tp->basertt = seq_rtt; + + /* + * + * Actual = throughput for this segment. 
+ * Expected = number_of_bytes in transit / BaseRTT + * + */ + + snt_bytes = (ack - seq) << SHIFT_FACTOR; + inv_rtt = (1 << SHIFT_FACTOR) / seq_rtt; + + Actual = snt_bytes * inv_rtt; + + inv_basertt = (1 << SHIFT_FACTOR) / tp->basertt; + Expected = ((tp->snd_nxt - tp->snd_una) << SHIFT_FACTOR) * inv_basertt; + + /* + * Slow Start + */ + + if (sk->cong_window < sk->ssthresh && + (seq == tp->snd_nxt || + (((Expected - Actual) <= + ((TCP_VEGAS_GAMMA << SHIFT_FACTOR) * sk->mss * inv_basertt)) + ) + )) + { + /* + * "Vegas allows exponential growth only every other + * RTT" + */ + + if (!(sk->cong_count++)) + { + sk->cong_window++; + sk->cong_count = 0; + } + } + else + { + /* + * Congestion Avoidance + */ + + if (Expected - Actual <= + ((TCP_VEGAS_ALPHA << SHIFT_FACTOR) * sk->mss * inv_basertt)) + { + /* Increase Linearly */ + + if (sk->cong_count++ >= sk->cong_window) + { + sk->cong_window++; + sk->cong_count = 0; + } + } + + if (Expected - Actual >= + ((TCP_VEGAS_BETA << SHIFT_FACTOR) * sk->mss * inv_basertt)) + { + /* Decrease Linearly */ + + if (sk->cong_count++ >= sk->cong_window) + { + sk->cong_window--; + sk->cong_count = 0; + } + + /* Never less than 2 segments */ + if (sk->cong_window < 2) + sk->cong_window = 2; + } + } +} + +static void tcp_cong_avoid_vanj(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt) +{ + + /* + * This is Jacobson's slow start and congestion avoidance. + * SIGCOMM '88, p. 328. Because we keep cong_window in + * integral mss's, we can't do cwnd += 1 / cwnd. + * Instead, maintain a counter and increment it once every + * cwnd times. + */ + + if (sk->cong_window <= sk->ssthresh) + { + /* + * In "safe" area, increase + */ + + sk->cong_window++; + } + else + { + /* + * In dangerous area, increase slowly. 
+ * In theory this is + * sk->cong_window += 1 / sk->cong_window + */ + + if (sk->cong_count >= sk->cong_window) { + + sk->cong_window++; + sk->cong_count = 0; + } + else + sk->cong_count++; + } +} + + +#define FLAG_DATA 0x01 +#define FLAG_WIN_UPDATE 0x02 +#define FLAG_DATA_ACKED 0x04 + +static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __u32 *seq, + __u32 *seq_rtt) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb; + unsigned long now = jiffies; + int acked = 0; + + while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) + { + +#ifdef TCP_DEBUG + /* Check for a bug. */ + + if (skb->next != (struct sk_buff*) &sk->write_queue && + after(skb->end_seq, skb->next->seq)) + printk("INET: tcp_input.c: *** " + "bug send_list out of order.\n"); +#endif + /* + * If our packet is before the ack sequence we can + * discard it as it's confirmed to have arrived the + * other end. + */ + + if (after(skb->end_seq, ack)) + break; + + if (sk->debug) + { + printk(KERN_DEBUG "removing seg %x-%x from " + "retransmit queue\n", skb->seq, skb->end_seq); + } + + acked = FLAG_DATA_ACKED; + + atomic_dec(&sk->packets_out); + + *seq = skb->seq; + *seq_rtt = now - skb->when; + + skb_unlink(skb); + skb->free = 1; + + kfree_skb(skb, FREE_WRITE); + } + + if (acked && !sk->dead) + { + tp->retrans_head = NULL; + sk->write_space(sk); + } + + return acked; +} + +static void tcp_ack_probe(struct sock *sk, __u32 ack) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* + * Our probe was answered + */ + tp->probes_out = 0; + + /* + * Was it a usable window open ? + */ + + /* should always be non-null */ + if (tp->send_head != NULL && + !before (ack + tp->snd_wnd, tp->send_head->end_seq)) + { + tp->backoff = 0; + tp->pending = 0; + + tcp_clear_xmit_timer(sk, TIME_PROBE0); + + } + else + { + tcp_reset_xmit_timer(sk, TIME_PROBE0, + min(tp->rto << tp->backoff, 120*HZ)); + } +} + +/* + * This routine deals with incoming acks, but not outgoing ones. 
+ */ + +static int tcp_ack(struct sock *sk, struct tcphdr *th, + u32 ack_seq, u32 ack, int len) +{ + int flag = 0; + u32 seq = 0; + u32 seq_rtt = 0; + struct sk_buff *skb; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + + if(sk->zapped) + return(1); /* Dead, can't ack any more so why bother */ + + + if (tp->pending == TIME_KEEPOPEN) + { + tp->probes_out = 0; + } + + tp->rcv_tstamp = jiffies; + + /* + * If the ack is newer than sent or older than previous acks + * then we can probably ignore it. + */ + + if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una)) + goto uninteresting_ack; + + /* + * If there is data set flag 1 + */ + + if (len != th->doff*4) + { + flag |= FLAG_DATA; + tcp_delack_estimator(tp); + } + + /* + * Update our send window + */ + + /* + * This is the window update code as per RFC 793 + * snd_wl{1,2} are used to prevent unordered + * segments from shrinking the window + */ + + if ((tp->snd_wl1 == 0) || before(tp->snd_wl1, ack_seq) || + (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack))) + { + tp->snd_wnd = ntohs(th->window); + tp->snd_wl1 = ack_seq; + tp->snd_wl2 = ack; + + flag |= FLAG_WIN_UPDATE; + + if (tp->snd_wnd > sk->max_window) + { + sk->max_window = tp->snd_wnd; + } + } + + + /* + * We passed data and got it acked, remove any soft error + * log. Something worked... + */ + + sk->err_soft = 0; + + /* + * If this ack opens up a zero window, clear backoff. It was + * being used to time the probes, and is probably far higher than + * it needs to be for normal retransmission. + */ + + if (tp->pending == TIME_PROBE0) + { + tcp_ack_probe(sk, ack); + } + + /* + * See if we can take anything off of the retransmit queue. 
+ */ + + if (tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt)) + flag |= FLAG_DATA_ACKED; + + + /* + * if we where retransmiting don't count rtt estimate + */ + + if (sk->retransmits) + { + if (sk->packets_out == 0) + sk->retransmits = 0; + } + else + { + /* + * Note that we only reset backoff and rto in the + * rtt recomputation code. And that doesn't happen + * if there were retransmissions in effect. So the + * first new packet after the retransmissions is + * sent with the backoff still in effect. Not until + * we get an ack from a non-retransmitted packet do + * we reset the backoff and rto. This allows us to deal + * with a situation where the network delay has increased + * suddenly. I.e. Karn's algorithm. (SIGCOMM '87, p5.) + */ + + if (flag & FLAG_DATA_ACKED) + { + tcp_rtt_estimator(tp, seq_rtt); + + (*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt); + } + } + + +#ifdef TCP_DEBUG + + /* Sanity check out packets_out counter */ + if (skb_queue_len(&sk->write_queue) == 0 || + ack == tp->snd_nxt ) + { + if (sk->packets_out) + { + printk(KERN_DEBUG "tcp_ack: packets_out %d\n", + sk->packets_out); + sk->packets_out = 0; + } + } +#endif + + if (sk->packets_out) + { + if (flag & FLAG_DATA_ACKED) + { + long when; + + skb = skb_peek(&sk->write_queue); + + when = tp->rto - (jiffies - skb->when); + + if (when <= 0) + { + tp->retrans_head = NULL; + /* + * This is tricky. We are retransmiting a + * segment of a window when congestion occured. + */ + tcp_do_retransmit(sk, 0); + tcp_reset_xmit_timer(sk, TIME_RETRANS, + tp->rto); + } + else + tcp_reset_xmit_timer(sk, TIME_RETRANS, when); + } + } + else + tcp_clear_xmit_timer(sk, TIME_RETRANS); + + + /* + * Remember the highest ack received. + */ + + tp->snd_una = ack; + + tcp_fast_retrans(sk, ack, (flag & (FLAG_DATA|FLAG_WIN_UPDATE))); + + + return 1; + +uninteresting_ack: + + tcp_fast_retrans(sk, ack, 0); + + if(sk->debug) + printk("Ack ignored %u %u\n",ack,tp->snd_nxt); + + return 0; +} + + +/* + * Process the FIN bit. 
This now behaves as it is supposed to work + * and the FIN takes effect when it is validly part of sequence + * space. Not before when we get holes. + * + * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT + * (and thence onto LAST-ACK and finally, CLOSE, we never enter + * TIME-WAIT) + * + * If we are in FINWAIT-1, a received FIN indicates simultaneous + * close and we go into CLOSING (and later onto TIME-WAIT) + * + * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. + * + */ + +static int tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) +{ + sk->fin_seq = skb->end_seq; + + tcp_send_ack(sk); + + if (!sk->dead) + { + sk->state_change(sk); + sock_wake_async(sk->socket, 1); + } + + switch(sk->state) + { + case TCP_SYN_RECV: + case TCP_SYN_SENT: + case TCP_ESTABLISHED: + /* + * move to CLOSE_WAIT + */ + + tcp_set_state(sk, TCP_CLOSE_WAIT); + + if (th->rst) + sk->shutdown = SHUTDOWN_MASK; + break; + + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + /* + * received a retransmission of the FIN, do + * nothing. + */ + break; + case TCP_TIME_WAIT: + /* + * received a retransmission of the FIN, + * restart the TIME_WAIT timer. + */ + tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); + return(0); + case TCP_FIN_WAIT1: + /* + * This case occurs when a simultaneous close + * happens, we must ack the received FIN and + * enter the CLOSING state. + * + * This causes a WRITE timeout, which will either + * move on to TIME_WAIT when we timeout, or resend + * the FIN properly (maybe we get rid of that annoying + * FIN lost hang). The TIME_WRITE code is already + * correct for handling this timeout. 
+ */ + + tcp_set_state(sk, TCP_CLOSING); + break; + case TCP_FIN_WAIT2: + /* + * received a FIN -- send ACK and enter TIME_WAIT + */ + tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); + sk->shutdown|=SHUTDOWN_MASK; + tcp_set_state(sk,TCP_TIME_WAIT); + break; + case TCP_CLOSE: + /* + * already in CLOSE + */ + break; + default: + tcp_set_state(sk,TCP_LAST_ACK); + + /* Start the timers. */ + tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); + return(0); + } + + return(0); +} + + + + /* + * This one checks to see if we can put data from the + * out_of_order queue into the receive_queue + */ + +static void tcp_ofo_queue(struct sock *sk) +{ + struct sk_buff * skb; + struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + + while ((skb = skb_peek(&sk->out_of_order_queue))) { + + if (after(skb->seq, tp->rcv_nxt)) + break; + + if (!after(skb->end_seq, tp->rcv_nxt)) { + + if (sk->debug) + printk("ofo packet was allready received \n"); + + skb_unlink(skb); + kfree_skb(skb, FREE_READ); + + continue; + } + + if (sk->debug) + printk("ofo requeuing : rcv_next %X seq %X - %X\n", + tp->rcv_nxt, skb->seq, skb->end_seq); + + skb_unlink(skb); + + + skb_queue_tail(&sk->receive_queue, skb); + + + tp->rcv_nxt = skb->end_seq; + } +} + +static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff * skb1; + struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + + /* + * Queue data for delivery to the user + * Packets in sequence go to the receive queue + * Out of sequence packets to out_of_order_queue + */ + + + if (skb->seq == tp->rcv_nxt) { + + /* + * Ok. In sequence. + */ + + + skb_queue_tail(&sk->receive_queue, skb); + + + tp->rcv_nxt = skb->end_seq; + + tcp_ofo_queue(sk); + + if (skb_queue_len(&sk->out_of_order_queue) == 0) + tp->pred_flags = htonl((0x5010 << 16) | tp->snd_wnd); + + return; + } + + /* + * Not in sequence + * either a retransmit or some packet got lost + */ + + if (!after(skb->end_seq, tp->rcv_nxt)) { + + /* + * A retransmit. + * 2nd most common case. 
+ * force an imediate ack + */ + + if (sk->debug) + printk("retransmit received: seq %X\n", skb->seq); + + sk->delayed_acks = MAX_DELAY_ACK; + kfree_skb(skb, FREE_READ); + + return; + } + + + if (before(skb->seq, tp->rcv_nxt)) { + + /* + * Partial packet + * seq < rcv_next < end_seq + */ + + if (sk->debug) + printk("partial packet: rcv_next %X seq %X - %X\n", + tp->rcv_nxt, skb->seq, skb->end_seq); + + skb_queue_tail(&sk->receive_queue, skb); + + + tp->rcv_nxt = skb->end_seq; + + tcp_ofo_queue(sk); + + if (skb_queue_len(&sk->out_of_order_queue) == 0) + tp->pred_flags = htonl((0x5010 << 16) | tp->snd_wnd); + + return; + } + + /* + * Ok. This is an out_of_order segment + */ + + /* Force an ack */ + + sk->delayed_acks = MAX_DELAY_ACK; + + /* + * disable header predition + */ + + tp->pred_flags = 0; + + if (sk->debug) + printk("out of order segment: rcv_next %X seq %X - %X\n", + tp->rcv_nxt, skb->seq, skb->end_seq); + + if (skb_peek(&sk->out_of_order_queue) == NULL) { + skb_queue_head(&sk->out_of_order_queue,skb); + } + else + for(skb1=sk->out_of_order_queue.prev; ; skb1 = skb1->prev) { + + /* allready there */ + if (skb->seq==skb1->seq && skb->len>=skb1->len) + { + skb_append(skb1,skb); + skb_unlink(skb1); + kfree_skb(skb1,FREE_READ); + break; + } + + if (after(skb->seq, skb1->seq)) + { + skb_append(skb1,skb); + break; + } + + /* + * See if we've hit the start. If so insert. + */ + if (skb1 == skb_peek(&sk->out_of_order_queue)) { + skb_queue_head(&sk->out_of_order_queue,skb); + break; + } + } + +} + + +/* + * This routine handles the data. If there is room in the buffer, + * it will be have already been moved into it. If there is no + * room, then we will just have to discard the packet. 
+ */ + +static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len) +{ + struct tcphdr *th; + struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + + th = skb->h.th; + skb_pull(skb,th->doff*4); + skb_trim(skb,len-(th->doff*4)); + + if (skb->len == 0 && !th->fin) + { + return(0); + } + + /* + * FIXME: don't accept data after the receved fin + */ + + /* + * The bytes in the receive read/assembly queue has increased. + * Needed for the low memory discard algorithm + */ + + sk->bytes_rcv += skb->len; + + /* + * We no longer have anyone receiving data on this connection. + */ + + tcp_data_queue(sk, skb); + + if (before(tp->rcv_nxt, sk->copied_seq)) + { + printk("*** tcp.c:tcp_data bug acked < copied\n"); + tp->rcv_nxt = sk->copied_seq; + } + + sk->delayed_acks++; + + + /* + * Now tell the user we may have some data. + */ + + if (!sk->dead) + { + if(sk->debug) + printk("Data wakeup.\n"); + sk->data_ready(sk,0); + } + return(1); +} + +static void tcp_data_snd_check(struct sock *sk) +{ + struct sk_buff *skb; + struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + + if ((skb = tp->send_head)) + { + if (!after(skb->end_seq, tp->snd_una + tp->snd_wnd) && + sk->packets_out < sk->cong_window ) + { + /* + * Add more data to the send queue. + */ + + tcp_write_xmit(sk); + wake_up_interruptible(sk->sleep); + } + else if (sk->packets_out == 0 && !tp->pending) + { + /* + * Data to queue but no room. + */ + tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); + } + } +} + +static __inline__ void tcp_ack_snd_check(struct sock *sk) +{ + /* + * This also takes care of updating the window. + * This if statement needs to be simplified. 
+ * + * rules for delaying an ack: + * - delay time <= 0.5 HZ + * - we don't have a window update to send + * - must send at least every 2 full sized packets + */ + + if (sk->delayed_acks == 0) + return; + + if (sk->delayed_acks >= MAX_DELAY_ACK || tcp_raise_window(sk)) + { + tcp_send_ack(sk); + } + else + { + tcp_send_delayed_ack(sk, HZ/2); + } +} + +/* + * This routine is only called when we have urgent data + * signalled. Its the 'slow' part of tcp_urg. It could be + * moved inline now as tcp_urg is only called from one + * place. We handle URGent data wrong. We have to - as + * BSD still doesn't use the correction from RFC961. + * For 1003.1g we should support a new option TCP_STDURG to permit + * either form. + */ + +static void tcp_check_urg(struct sock * sk, struct tcphdr * th) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + u32 ptr = ntohs(th->urg_ptr); + + if (ptr) + ptr--; + ptr += ntohl(th->seq); + + /* ignore urgent data that we've already seen and read */ + if (after(sk->copied_seq, ptr)) + return; + + /* do we already have a newer (or duplicate) urgent pointer? */ + if (sk->urg_data && !after(ptr, sk->urg_seq)) + return; + + /* tell the world about our new urgent pointer */ + if (sk->proc != 0) { + if (sk->proc > 0) { + kill_proc(sk->proc, SIGURG, 1); + } else { + kill_pg(-sk->proc, SIGURG, 1); + } + } + /* + * We may be adding urgent data when the last byte read was + * urgent. To do this requires some care. We cannot just ignore + * sk->copied_seq since we would read the last urgent byte again + * as data, nor can we alter copied_seq until this data arrives + * or we break the sematics of SIOCATMARK (and thus sockatmark()) + */ + if (sk->urg_seq == sk->copied_seq) + sk->copied_seq++; /* Move the copied sequence on correctly */ + sk->urg_data = URG_NOTYET; + sk->urg_seq = ptr; + + /* disable header prediction */ + tp->pred_flags = 0; +} + +/* + * This is the 'fast' part of urgent handling. 
+ */ + +static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len) +{ + /* + * Check if we get a new urgent pointer - normally not + */ + + if (th->urg) + tcp_check_urg(sk,th); + + /* + * Do we wait for any urgent data? - normally not + */ + + if (sk->urg_data == URG_NOTYET) { + u32 ptr; + + /* + * Is the urgent pointer pointing into this packet? + */ + ptr = sk->urg_seq - ntohl(th->seq) + th->doff*4; + if (ptr < len) { + sk->urg_data = URG_VALID | *(ptr + (unsigned char *) th); + if (!sk->dead) + sk->data_ready(sk,0); + } + } +} + + +static void prune_queue(struct sock *sk) +{ + struct sk_buff * skb; + + /* + * clean the out_of_order queue + */ + + while ((skb = skb_dequeue(&sk->out_of_order_queue))) + { + kfree_skb(skb, FREE_READ); + } +} + + +void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, __u16 len) +{ + struct tcp_opt *tp; + int queued = 0; + u32 flg; + + /* + * Header prediction. + * The code follows the one in the famous + * "30 instruction TCP receive" Van Jacobson mail. + * + * Van's trick is to deposit buffers into socket queue + * on a device interrupt, to call tcp_recv function + * on the receive process context and checksum and copy + * the buffer to user space. smart... + * + * Our current scheme is not silly either but we take the + * extra cost of the net_bh soft interrupt processing... + * We do checksum and copy also but from device to kernel. + */ + + tp = &(sk->tp_pinfo.af_tcp); + flg = *(((u32 *)th) + 3); + + /* + * pred_flags is 0x5?10 << 16 + snd_wnd + * if header_predition is to be made + * ? 
will be 0 else it will be !0 + * (when there are holes in the receive + * space for instance) + */ + + if (flg == tp->pred_flags && skb->seq == tp->rcv_nxt) + { + if (len <= sizeof(struct tcphdr)) + { + if (len == sizeof(struct tcphdr)) + { + tcp_ack(sk, th, skb->seq, skb->ack_seq, len); + } + + tcp_data_snd_check(sk); + + kfree_skb(skb, FREE_READ); + return; + + } + else if (skb->ack_seq == tp->snd_una) + { + /* + * Bulk data transfer: receiver + */ + + skb_pull(skb,sizeof(struct tcphdr)); + + skb_queue_tail(&sk->receive_queue, skb); + tp->rcv_nxt = skb->end_seq; + sk->bytes_rcv += len - sizeof(struct tcphdr); + + sk->data_ready(sk, 0); + tcp_delack_estimator(tp); + + if (sk->delayed_acks++) + { + tcp_send_delayed_ack(sk, HZ/2); + } + else + tcp_send_ack(sk); + + return; + } + } + + if (!tcp_sequence(tp, skb->seq, skb->end_seq)) + { + if (!th->rst) + { + if (after(skb->seq, tp->rcv_nxt)) + { + printk(KERN_DEBUG "->seq:%d end:%d " + "wup:%d wnd:%d\n", + skb->seq, skb->end_seq, + tp->rcv_wup, tp->rcv_wnd); + } + tcp_send_ack(sk); + kfree_skb(skb, FREE_READ); + return; + } + } + + if(th->syn && skb->seq != sk->syn_seq) + { + printk(KERN_DEBUG "syn in established state\n"); + tcp_reset(sk, skb); + kfree_skb(skb, FREE_READ); + return; + } + + if(th->rst) + { + tcp_reset(sk,skb); + kfree_skb(skb, FREE_READ); + return; + } + + if(th->ack) + { + tcp_ack(sk, th, skb->seq, skb->ack_seq, len); + } + + + /* + * Process urgent data + */ + + tcp_urg(sk, th, len); + + /* + * step 7: process the segment text + */ + + + queued = tcp_data(skb, sk, len); + + /* + * step 8: check the FIN bit + */ + + if (th->fin) + { + tcp_fin(skb, sk, th); + } + + tcp_data_snd_check(sk); + tcp_ack_snd_check(sk); + + /* + * If our receive queue has grown past its limits, + * try to prune away duplicates etc.. 
+ */ + if (sk->rmem_alloc > sk->rcvbuf) + prune_queue(sk); + + /* + * And done + */ + + if (queued) + return; + + kfree_skb(skb, FREE_READ); +} + + +/* + * This function implements the receiving procedure of RFC 793. + * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be + * address independent. + */ + +int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, void *opt, __u16 len) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int queued = 0; + int rcv_mss; + + /* + * state == CLOSED + * tested in tcp_v{4,6}_rcv + */ + + switch (sk->state) { + + + case TCP_LISTEN: + + if (th->rst) + goto discard; + + /* + * These use the socket TOS.. + * might want to be the received TOS + */ + + if(th->ack) + { + /* + * send reset + */ + + return 1; + } + + + if(th->syn) + { + int err; + __u32 isn; + + isn = tp->af_specific->init_sequence(sk, skb); + err = tp->af_specific->conn_request(sk, skb, opt, isn); + + if (err < 0) + return 1; + + /* + * Now we have several options: In theory there is + * nothing else in the frame. KA9Q has an option to + * send data with the syn, BSD accepts data with the + * syn up to the [to be] advertised window and + * Solaris 2.1 gives you a protocol error. For now + * we just ignore it, that fits the spec precisely + * and avoids incompatibilities. It would be nice in + * future to drop through and process the data. + * + * Now that TTCP is starting to be used we ought to + * queue this data. + */ + + return 0; + } + + goto discard; + break; + + case TCP_SYN_SENT: + + /* + * SYN sent means we have to look for a suitable ack and + * either reset for bad matches or go to connected. + * The SYN_SENT case is unusual and should + * not be in line code. 
[AC] + */ + + if(th->ack) + { + /* We got an ack, but it's not a good ack */ + if(!tcp_ack(sk,th, skb->seq, skb->ack_seq, len)) + { + tcp_statistics.TcpAttemptFails++; + return 1; + } + + if(th->rst) + { + tcp_reset(sk,skb); + goto discard; + } + + if(!th->syn) + { + /* + * A valid ack from a different connection + * start. Shouldn't happen but cover it + */ + tcp_statistics.TcpAttemptFails++; + return 1; + } + + /* + * Ok.. it's good. Set up sequence + * numbers and + * move to established. + */ + + tp->rcv_nxt = skb->seq+1; + tp->rcv_wnd = 0; + tp->rcv_wup = skb->seq+1; + + tp->snd_wnd = htons(th->window); + tp->snd_wl1 = skb->seq; + tp->snd_wl2 = skb->ack_seq; + + sk->fin_seq = skb->seq; + tcp_send_ack(sk); + + tcp_set_state(sk, TCP_ESTABLISHED); + rcv_mss = tcp_parse_options(th); + + if (rcv_mss == 0) + { + rcv_mss = 536; + } + + sk->mss = min(sk->mss, rcv_mss); + + sk->dummy_th.dest = th->source; + sk->copied_seq = tp->rcv_nxt; + + if(!sk->dead) + { + sk->state_change(sk); + sock_wake_async(sk->socket, 0); + } + + /* Drop through step 6 */ + goto step6; + } + else + { + if(th->syn && !th->rst) + { + /* + * the previous version of the code + * checked for "connecting to self" + * here. that check is done now in + * tcp_connect + */ + + tcp_set_state(sk, TCP_SYN_RECV); + + tp->rcv_nxt = skb->seq + 1; + tp->rcv_wup = skb->seq + 1; + + tp->snd_wnd = htons(th->window); + tp->snd_wl1 = skb->seq; + + tcp_send_synack(sk); + goto discard; + } + + } + break; + + case TCP_TIME_WAIT: + /* + * RFC 1122: + * "When a connection is [...] on TIME-WAIT state [...] + * [a TCP] MAY accept a new SYN from the remote TCP to + * reopen the connection directly, if it: + * + * (1) assigns its initial sequence number for the new + * connection to be larger than the largest sequence + * number it used on the previous connection incarnation, + * and + * + * (2) returns to TIME-WAIT state if the SYN turns out + * to be an old duplicate". 
+ */ + + if (th->syn && !th->rst && after(skb->seq, tp->rcv_nxt)) + { + __u32 isn; + int err; + + atomic_sub(skb->truesize, &sk->rmem_alloc); + skb->sk = NULL; + sk->err = ECONNRESET; + tcp_set_state(sk, TCP_CLOSE); + sk->shutdown = SHUTDOWN_MASK; + + isn = tp->rcv_nxt + 128000; + + sk = tp->af_specific->get_sock(skb, th); + + if (sk == NULL) + goto discard; + + skb->sk = sk; + tp = &sk->tp_pinfo.af_tcp; + atomic_add(skb->truesize, &sk->rmem_alloc); + + err = tp->af_specific->conn_request(sk, skb, opt, isn); + + if (err < 0) + return 1; + + return 0; + } + + break; + + } + + /* + * step 1: check sequence number + */ + + if (!tcp_sequence(tp, skb->seq, skb->end_seq)) + { + if (!th->rst) + { + tcp_send_ack(sk); + goto discard; + } + } + + + /* + * step 2: check RST bit + */ + + if(th->rst) + { + tcp_reset(sk,skb); + goto discard; + } + + /* + * step 3: check security and precedence + * [ignored] + */ + + /* + * step 4: + * + * Check for a SYN, and ensure it matches the SYN we were + * first sent. We have to handle the rather unusual (but valid) + * sequence that KA9Q derived products may generate of + * + * SYN + * SYN|ACK Data + * ACK (lost) + * SYN|ACK Data + More Data + * .. we must ACK not RST... + * + * We keep syn_seq as the sequence space occupied by the + * original syn. 
+ */ + + if (th->syn && skb->seq!=sk->syn_seq) + { + tcp_reset(sk, skb); + return 1; + } + + /* + * step 5: check the ACK field + */ + + if (th->ack) + { + int acceptable = tcp_ack(sk,th,skb->seq, skb->ack_seq,len); + + switch(sk->state) { + case TCP_SYN_RECV: + if (acceptable) + { + tcp_set_state(sk, TCP_ESTABLISHED); + sk->dummy_th.dest=th->source; + sk->copied_seq = tp->rcv_nxt; + + if(!sk->dead) + sk->state_change(sk); + + tp->snd_una = skb->ack_seq; + tp->snd_wnd = htons(th->window); + tp->snd_wl1 = skb->seq; + tp->snd_wl2 = skb->ack_seq; + + } + else + return 1; + break; + + case TCP_FIN_WAIT1: + + if (tp->snd_una == sk->write_seq) + { + sk->shutdown |= SEND_SHUTDOWN; + tcp_set_state(sk, TCP_FIN_WAIT2); + if (!sk->dead) + sk->state_change(sk); + } + break; + + case TCP_CLOSING: + + if (tp->snd_una == sk->write_seq) + { + tcp_time_wait(sk); + if (!sk->dead) + sk->state_change(sk); + } + break; + + case TCP_LAST_ACK: + + if (tp->snd_una == sk->write_seq) + { + sk->shutdown = SHUTDOWN_MASK; + tcp_set_state(sk,TCP_CLOSE); + if (!sk->dead) + sk->state_change(sk); + goto discard; + } + break; + + case TCP_TIME_WAIT: + /* + * keep us in TIME_WAIT until we stop getting + * packets, reset the timeout. + */ + tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); + break; + + } + } + else + goto discard; + + step6: + + /* + * step 6: check the URG bit + */ + + tcp_urg(sk, th, len); + + /* + * step 7: process the segment text + */ + + switch (sk->state) { + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + if (!before(skb->seq, sk->fin_seq)) + break; + + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + + /* + * RFC 793 says to queue data in this states, + * RFC 1122 says we MUST send a reset. + * BSD 4.4 also does reset. 
/*
 * sysctl handler for net.ipv4 congestion-avoidance selection.
 *
 * Lets proc_dointvec() read/write sysctl_tcp_cong_avoidance, then on a
 * write maps the new integer value onto the matching congestion-avoidance
 * routine: 0 -> tcp_cong_avoid_vanj, 1 -> tcp_cong_avoid_vegas.
 * Any other value is rejected: the previous value is restored and
 * -EINVAL is returned to the writer.
 */
int tcp_sysctl_congavoid(ctl_table *ctl, int write, struct file * filp,
			 void *buffer, size_t *lenp)
{
	/* Remember the old value so an invalid write can be undone. */
	int val = sysctl_tcp_cong_avoidance;
	int retv;

	retv = proc_dointvec(ctl, write, filp, buffer, lenp);

	if (write)
	{
		switch (sysctl_tcp_cong_avoidance) {
		case 0:
			tcp_sys_cong_ctl_f = &tcp_cong_avoid_vanj;
			break;
		case 1:
			tcp_sys_cong_ctl_f = &tcp_cong_avoid_vegas;
			break;
		default:
			/* Unknown policy: roll back and report the error. */
			retv = -EINVAL;
			sysctl_tcp_cong_avoidance = val;
		}
	}

	return retv;
}
/*
 * Invalidate the one-entry "last hit" TCP socket lookup cache.
 *
 * Clearing the cached socket pointer is sufficient: get_tcp_sock()
 * checks th_cache_sk first and falls back to a full get_sock() lookup
 * when it is NULL, repopulating the address/port fields on a hit.
 */
void tcp_cache_zap(void)
{
	th_cache_sk=NULL;
}
/*
 * Find the TCP socket for a (saddr:sport, daddr:dport) pair, using the
 * one-entry last-hit cache when the addresses and ports all match.
 * On a cache miss, do the full get_sock() lookup and (if it succeeds)
 * refill the cache.
 *
 * NOTE(review): per the original author, "the cache is not quite
 * right" — the cached fields are plain volatiles with no locking, so
 * a concurrent tcp_cache_zap() can race with the refill here.
 */
static inline struct sock * get_tcp_sock(u32 saddr, u16 sport,
					 u32 daddr, u16 dport,
					 u32 paddr, u16 pport)
{
	struct sock * sk;

	sk = (struct sock *) th_cache_sk;
	if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
	    sport != th_cache_sport || dport != th_cache_dport) {
		/* Cache miss: full demultiplex, then remember the result. */
		sk = get_sock(&tcp_prot, dport, saddr, sport, daddr,
			      paddr, pport);
		if (sk) {
			th_cache_saddr=saddr;
			th_cache_daddr=daddr;
			th_cache_dport=dport;
			th_cache_sport=sport;
			th_cache_sk=sk;
		}
	}
	return sk;
}
+ */ + + if(sk->daddr) + return -EINVAL; + + if (addr_len < sizeof(struct sockaddr_in)) + return(-EINVAL); + + if (usin->sin_family && usin->sin_family != AF_INET) + return(-EAFNOSUPPORT); + + /* + * connect() to INADDR_ANY means loopback (BSD'ism). + */ + + if (usin->sin_addr.s_addr==INADDR_ANY) + usin->sin_addr.s_addr=ip_my_addr(); + + /* + * Don't want a TCP connection going to a broadcast address + */ + + if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST + || atype==IS_MULTICAST) + { + return -ENETUNREACH; + } + + if (!tcp_unique_address(sk->saddr, sk->num, usin->sin_addr.s_addr, + usin->sin_port)) + { + return -EADDRNOTAVAIL; + } + + lock_sock(sk); + sk->daddr = usin->sin_addr.s_addr; + sk->dummy_th.dest = usin->sin_port; + sk->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr, + sk->dummy_th.source, + usin->sin_port); + + tp->snd_wnd = 0; + tp->snd_wl1 = 0; + tp->snd_wl2 = sk->write_seq; + tp->snd_una = sk->write_seq; + + tp->rcv_nxt = 0; + + sk->err = 0; + + buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL); + if (buff == NULL) + { + release_sock(sk); + return(-ENOBUFS); + } + + buff->sk = sk; + buff->free = 0; + buff->localroute = sk->localroute; + + /* + * Put in the IP header and routing stuff. 
+ */ + + tmp = ip_build_header(buff, sk->saddr, sk->daddr, &dev, + IPPROTO_TCP, NULL, MAX_SYN_SIZE, sk->ip_tos, + sk->ip_ttl,&sk->ip_route_cache); + + if (tmp < 0) + { + sock_wfree(sk, buff); + release_sock(sk); + return(-ENETUNREACH); + } + if ((rt = sk->ip_route_cache) != NULL && !sk->saddr) + sk->saddr = rt->rt_src; + sk->rcv_saddr = sk->saddr; + + t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr)); + buff->h.th = t1; + + memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1)); + buff->seq = sk->write_seq++; + t1->seq = htonl(buff->seq); + tp->snd_nxt = sk->write_seq; + buff->end_seq = sk->write_seq; + t1->ack = 0; + t1->window = htons(512); + t1->syn = 1; + t1->doff = 6; + + /* use 512 or whatever user asked for */ + + if(rt!=NULL && (rt->rt_flags&RTF_WINDOW)) + sk->window_clamp=rt->rt_window; + else + sk->window_clamp=0; + + + if (rt) + sk->mtu = rt->rt_mtu; + else + sk->mtu = dev->mtu; + + if(sk->mtu < 64) + sk->mtu = 64; /* Sanity limit */ + + if (sk->user_mss) + sk->mss = sk->user_mss; + else + sk->mss = (sk->mtu - sizeof(struct iphdr) - + sizeof(struct tcphdr)); + + /* + * Put in the TCP options to say MSS. + */ + + ptr = skb_put(buff,4); + ptr[0] = TCPOPT_MSS; + ptr[1] = TCPOLEN_MSS; + ptr[2] = (sk->mss) >> 8; + ptr[3] = (sk->mss) & 0xff; + buff->csum = csum_partial(ptr, 4, 0); + tcp_v4_send_check(sk, t1, sizeof(struct tcphdr) + 4, buff); + + /* + * This must go first otherwise a really quick response + * will get reset. 
/*
 * IPv4 sendmsg entry point for TCP.
 *
 * Validates the flags and (optional) destination address, then hands
 * the iovec to the protocol-independent tcp_do_sendmsg() under the
 * socket lock.  Because TCP is connection-oriented, a supplied address
 * must match the established peer exactly:
 *   -EINVAL    bad flags / short or non-AF_INET address
 *   -ENOTCONN  address given but socket is closed
 *   -EISCONN   address given that differs from the connected peer
 */
static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg,
			  int len, int nonblock, int flags)
{
	int retval = -EINVAL;

	/* Only MSG_OOB and MSG_DONTROUTE are meaningful for TCP. */
	if (flags & ~(MSG_OOB|MSG_DONTROUTE))
		goto out;
	if (msg->msg_name) {
		struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;

		if (msg->msg_namelen < sizeof(*addr))
			goto out;
		if (addr->sin_family && addr->sin_family != AF_INET)
			goto out;
		retval = -ENOTCONN;
		if(sk->state == TCP_CLOSE)
			goto out;
		/* Address must agree with the connected peer. */
		retval = -EISCONN;
		if (addr->sin_port != sk->dummy_th.dest)
			goto out;
		if (addr->sin_addr.s_addr != sk->daddr)
			goto out;
	}

	lock_sock(sk);
	retval = tcp_do_sendmsg(sk, msg->msg_iovlen, msg->msg_iov,
				len, nonblock, flags);

	release_sock(sk);

out:
	return retval;
}
/*
 * ICMP error handler for TCP/IPv4, called from the ICMP module.
 *
 * 'header' points at the first 8 bytes of the offending TCP header
 * (enough for the port numbers); 'type'/'code' are the ICMP type and
 * code.  Looks up the affected socket and reacts:
 *   - SOURCE_QUENCH:  collapse the congestion window to 1
 *   - PARAMETERPROB:  report EPROTO to the socket
 *   - FRAG_NEEDED:    shrink the cached route MTU and the socket MSS
 *                     (Path MTU discovery, unless compiled out)
 *   - other UNREACH:  fatal errors (or any error while connecting)
 *                     kill the connection; otherwise record err_soft
 */
void tcp_v4_err(int type, int code, unsigned char *header, __u32 info,
	__u32 daddr, __u32 saddr, struct inet_protocol *protocol, int len)
{
	struct tcphdr *th = (struct tcphdr *)header;
	struct tcp_opt *tp;
	struct sock *sk;

	if(len<8)	/* We use the first 8 bytes only */
		return;

	th =(struct tcphdr *)header;
	sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr, 0, 0);

	if (sk == NULL)
		return;

	if (type == ICMP_SOURCE_QUENCH)
	{
		/*
		 * FIXME:
		 * Follow BSD for now and just reduce cong_window to 1 again.
		 * It is possible that we just want to reduce the
		 * window by 1/2, or that we want to reduce ssthresh by 1/2
		 * here as well.
		 */

		tp = &sk->tp_pinfo.af_tcp;

		sk->cong_window = 1;
		tp->high_seq = tp->snd_nxt;

		return;
	}

	if (type == ICMP_PARAMETERPROB)
	{
		sk->err=EPROTO;
		sk->error_report(sk);
	}

#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
	{
		struct rtable * rt;

		/* 'info' carries the next-hop MTU for FRAG_NEEDED. */
		unsigned short new_mtu = info;

		if ((rt = sk->ip_route_cache) != NULL)
			if (rt->rt_mtu > new_mtu)
				rt->rt_mtu = new_mtu;

		/* Only shrink the MSS; never below the bare header size. */
		if ((sk->mtu > new_mtu) &&
		    (new_mtu > sizeof(struct iphdr)+sizeof(struct tcphdr)))
		{
			sk->mss = (new_mtu - sizeof(struct iphdr)
				   - sizeof(struct tcphdr));
		}

		return;
	}
#endif

	/*
	 * If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 */

	if (code <= NR_ICMP_UNREACH)
	{
		if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
		{
			sk->err = icmp_err_convert[code].errno;
			if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
			{
				/* Connection attempt died: tear it down. */
				tcp_statistics.TcpAttemptFails++;
				tcp_set_state(sk,TCP_CLOSE);
				sk->error_report(sk);	/* Wake people up to see the error (see connect in sock.c) */
			}
		}
		else	/* Only an error on timeout */
			sk->err_soft = icmp_err_convert[code].errno;
	}
}
/*
 * Compute and install the TCP checksum of an outgoing segment.
 *
 * Assumes skb->csum already holds the partial checksum of the segment
 * payload (everything past the TCP header): the header is checksummed
 * here with csum_partial() and folded together with skb->csum by
 * tcp_v4_check(), which adds the IPv4 pseudo-header.
 *
 * With DEBUG_TCP_CHECK defined, the incremental result is verified
 * against a full recomputation over the whole segment.
 *
 * Modified January 1995 from a go-faster DOS routine by
 * Jorge Cwik <jorge@laser.satlink.net>
 */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb)
{
	__u32 saddr = sk->saddr;
	__u32 daddr = sk->daddr;
#ifdef DEBUG_TCP_CHECK
	u16 check;
#endif
	/* Checksum field must be zero while the sum is computed. */
	th->check = 0;
	th->check = tcp_v4_check(th, len, saddr, daddr,
				 csum_partial((char *)th, sizeof(*th),
					      skb->csum));

#ifdef DEBUG_TCP_CHECK
	check = th->check;
	th->check = 0;
	th->check = tcp_v4_check(th, len, saddr, daddr,
				 csum_partial((char *)th,len,0));
	if (check != th->check) {
		static int count = 0;
		if (++count < 10) {
			printk("Checksum %x (%x) from %p\n", th->check, check,
			       __builtin_return_address(0));
			printk("TCP=<off:%d a:%d s:%d f:%d> len=%d\n", th->doff*4, th->ack, th->syn, th->fin, len);
		}
	}
#endif
}
#ifdef CONFIG_IP_TRANSPARENT_PROXY
/*
 * Check whether a received TCP packet might be for one of our
 * connections (transparent proxy support).
 *
 * Returns 1 only when a matching socket exists AND it is bound to a
 * specific local address; a socket bound to INADDR_ANY (rcv_saddr == 0)
 * accepts all LOCAL addresses, not all the world, so 0 is returned.
 */
int tcp_chkaddr(struct sk_buff *skb)
{
	struct iphdr *iph = skb->h.iph;
	/* TCP header follows the (variable-length) IP header. */
	struct tcphdr *th = (struct tcphdr *)(skb->h.raw + iph->ihl*4);
	struct sock *sk;

	sk = get_sock(&tcp_prot, th->dest, iph->saddr, th->source, iph->daddr,
		      0, 0);

	if (!sk)
		return 0;

	/* 0 means accept all LOCAL addresses here, not all the world... */

	if (sk->rcv_saddr == 0)
		return 0;

	return 1;
}
#endif
+ + th->syn = 1; + th->ack = 1; + + th->source = sk->dummy_th.source; + th->dest = req->rmt_port; + + skb->seq = req->snt_isn; + skb->end_seq = skb->seq + 1; + + th->seq = ntohl(skb->seq); + th->ack_seq = htonl(req->rcv_isn + 1); + th->doff = sizeof(*th)/4 + 1; + + th->window = ntohs(tp->rcv_wnd); + + ptr = skb_put(skb, TCPOLEN_MSS); + ptr[0] = TCPOPT_MSS; + ptr[1] = TCPOLEN_MSS; + ptr[2] = (mss >> 8) & 0xff; + ptr[3] = mss & 0xff; + skb->csum = csum_partial(ptr, TCPOLEN_MSS, 0); + + th->check = tcp_v4_check(th, sizeof(*th) + TCPOLEN_MSS, af_req->loc_addr, + af_req->rmt_addr, + csum_partial((char *)th, sizeof(*th), skb->csum)); + + ip_queue_xmit(sk, dev, skb, 1); + tcp_statistics.TcpOutSegs++; + +} + +static void tcp_v4_or_free(struct open_request *req) +{ + struct tcp_v4_open_req *af_req = (struct tcp_v4_open_req *) req; + + if (af_req->req.sk) + return; + + if (af_req->opt) + { + kfree_s(af_req->opt, sizeof(struct options) + af_req->opt->optlen); + } +} + +static struct or_calltable or_ipv4 = { + tcp_v4_send_synack, + tcp_v4_or_free +}; + +static int tcp_v4_syn_filter(struct sock *sk, struct sk_buff *skb, __u32 saddr) +{ + return 0; +} + +int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, __u32 isn) +{ + struct options *opt = (struct options *) ptr; + struct tcp_v4_open_req *af_req; + struct open_request *req; + struct tcphdr *th = skb->h.th; + __u32 saddr = skb->saddr; + __u32 daddr = skb->daddr; + + /* If the socket is dead, don't accept the connection. 
*/ + if (sk->dead) + { + if(sk->debug) + { + printk("Reset on %p: Connect on dead socket.\n",sk); + } + tcp_statistics.TcpAttemptFails++; + return -ENOTCONN; + } + + if (sk->ack_backlog >= sk->max_ack_backlog || + tcp_v4_syn_filter(sk, skb, saddr)) + { + printk(KERN_DEBUG "droping syn ack:%d max:%d\n", + sk->ack_backlog, sk->max_ack_backlog); +#ifdef CONFIG_IP_TCPSF + tcp_v4_random_drop(sk); +#endif + tcp_statistics.TcpAttemptFails++; + goto exit; + } + + + af_req = kmalloc(sizeof(struct tcp_v4_open_req), GFP_ATOMIC); + + if (af_req == NULL) + { + tcp_statistics.TcpAttemptFails++; + goto exit; + } + + sk->ack_backlog++; + req = (struct open_request *) af_req; + + memset(af_req, 0, sizeof(struct tcp_v4_open_req)); + + req->rcv_isn = skb->seq; + req->snt_isn = isn; + + /* mss */ + req->mss = tcp_parse_options(th); + + if (!req->mss) + { + req->mss = 536; + } + + req->rmt_port = th->source; + + af_req->loc_addr = daddr; + af_req->rmt_addr = saddr; + + /* + * options + */ + + if (opt && opt->optlen) + { + af_req->opt = (struct options*) kmalloc(sizeof(struct options) + + opt->optlen, GFP_ATOMIC); + if (af_req->opt) + { + if (ip_options_echo(af_req->opt, opt, skb->daddr, + skb->saddr, skb)) + { + kfree_s(af_req->opt, sizeof(struct options) + + opt->optlen); + af_req->opt = NULL; + } + } + } + + req->class = &or_ipv4; + + tcp_v4_send_synack(sk, req); + + req->expires = jiffies + TCP_TIMEOUT_INIT; + tcp_inc_slow_timer(TCP_SLT_SYNACK); + tcp_synq_queue(&sk->tp_pinfo.af_tcp, req); + + sk->data_ready(sk, 0); + + exit: + kfree_skb(skb, FREE_READ); + return 0; +} + +struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, + struct open_request *req) +{ + struct tcp_v4_open_req *af_req = (struct tcp_v4_open_req *) req; + struct tcp_opt *newtp; + struct sock *newsk; + struct rtable *rt; + int snd_mss; + + newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC); + if (newsk == NULL) + { + return NULL; + } + + memcpy(newsk, sk, sizeof(*newsk)); + 
newsk->opt = NULL; + newsk->ip_route_cache = NULL; + skb_queue_head_init(&newsk->write_queue); + skb_queue_head_init(&newsk->receive_queue); + skb_queue_head_init(&newsk->out_of_order_queue); + + /* + * Unused + */ + + newsk->send_head = NULL; + newsk->send_tail = NULL; + + newtp = &(newsk->tp_pinfo.af_tcp); + newtp->send_head = NULL; + newtp->retrans_head = NULL; + + newtp->pending = 0; + + skb_queue_head_init(&newsk->back_log); + + newsk->prot->init(newsk); + + newsk->cong_count = 0; + newsk->ssthresh = 0; + newtp->backoff = 0; + newsk->blog = 0; + newsk->intr = 0; + newsk->proc = 0; + newsk->done = 0; + newsk->partial = NULL; + newsk->pair = NULL; + newsk->wmem_alloc = 0; + newsk->rmem_alloc = 0; + newsk->localroute = sk->localroute; + + newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF; + + newsk->err = 0; + newsk->shutdown = 0; + newsk->ack_backlog = 0; + + newsk->fin_seq = req->rcv_isn; + newsk->syn_seq = req->rcv_isn; + newsk->state = TCP_SYN_RECV; + newsk->timeout = 0; + newsk->ip_xmit_timeout = 0; + + newsk->write_seq = req->snt_isn; + + newtp->snd_wnd = ntohs(skb->h.th->window); + newsk->max_window = newtp->snd_wnd; + newtp->snd_wl1 = req->rcv_isn; + newtp->snd_wl2 = newsk->write_seq; + newtp->snd_una = newsk->write_seq++; + newtp->snd_nxt = newsk->write_seq; + + newsk->urg_data = 0; + newsk->packets_out = 0; + newsk->retransmits = 0; + newsk->linger=0; + newsk->destroy = 0; + init_timer(&newsk->timer); + newsk->timer.data = (unsigned long) newsk; + newsk->timer.function = &net_timer; + + tcp_init_xmit_timers(newsk); + + newsk->dummy_th.source = sk->dummy_th.source; + newsk->dummy_th.dest = req->rmt_port; + + newtp->rcv_nxt = req->rcv_isn + 1; + newtp->rcv_wup = req->rcv_isn + 1; + newsk->copied_seq = req->rcv_isn + 1; + + newsk->socket = NULL; + + newsk->daddr = af_req->rmt_addr; + newsk->saddr = af_req->loc_addr; + newsk->rcv_saddr = af_req->loc_addr; + + /* + * options / mss / route_cache + */ + newsk->opt = af_req->opt; + rt = ip_rt_route(newsk->opt 
/*
 * On a listening socket, match an incoming segment against the pending
 * SYN queue.
 *
 * Returns:
 *   - the listening socket itself when no queued request matches,
 *   - NULL when the segment is a retransmitted SYN for an existing
 *     request (it is ignored) or when creating the child socket failed,
 *   - otherwise the freshly created child socket (the skb's charge is
 *     moved from the listener's to the child's rmem_alloc and the
 *     request is marked complete).
 */
struct sock *tcp_v4_check_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct open_request *req;


	/*
	 * Assumption: the socket is not in use.
	 * We checked the user count in tcp_rcv and we're
	 * running from a soft interrupt.
	 */

	req = tp->syn_wait_queue;


	if (!req)
	{
		return sk;
	}

	/* The queue is a circular doubly-linked list via dl_next. */
	do {
		struct tcp_v4_open_req *af_req;

		af_req = (struct tcp_v4_open_req *) req;

		if (af_req->rmt_addr == skb->saddr &&
		    af_req->loc_addr == skb->daddr &&
		    req->rmt_port == skb->h.th->source)
		{
			u32 flg;

			if (req->sk)
			{
				printk(KERN_DEBUG "BUG: syn_recv:"
				       "socket exists\n");
				break;
			}

			/* match */

			/*
			 * Check for SYN retransmission: extract the flag
			 * byte (4th 32-bit word of the TCP header) and
			 * test for SYN set with ACK/RST/FIN/URG clear.
			 */
			flg = *(((u32 *)skb->h.th) + 3);
			flg &= __constant_htonl(0x002f0000);

			if ((flg == __constant_htonl(0x00020000)) &&
			    (!after(skb->seq, req->rcv_isn)))
			{
				/*
				 * retransmitted syn
				 * FIXME: must send an ack
				 */
				return NULL;
			}

			/* Move the skb's memory charge to the new socket. */
			atomic_sub(skb->truesize, &sk->rmem_alloc);
			sk = tp->af_specific->syn_recv_sock(sk, skb, req);

			tcp_dec_slow_timer(TCP_SLT_SYNACK);

			if (sk == NULL)
			{
				return NULL;
			}

			atomic_add(skb->truesize, &sk->rmem_alloc);
			req->expires = 0UL;
			req->sk = sk;
			skb->sk = sk;
			break;
		}

		req = req->dl_next;
	} while (req != tp->syn_wait_queue);


	return sk;
}
/*
 * Main receive entry point for TCP over IPv4.
 *
 * "redo" is 1 if we have already seen this skb but couldn't use it at
 * that time (the socket was locked), so the parsing/lookup work below
 * is skipped.  Verifies the checksum (unless the device already did),
 * demultiplexes to a socket, and dispatches to the fast established
 * path, the listen/SYN-queue path, or the generic state machine.
 * Segments for which no socket exists are answered with a reset.
 */
int tcp_v4_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
	       __u32 daddr, unsigned short len,
	       __u32 saddr, int redo, struct inet_protocol * protocol)
{
	struct tcphdr *th;
	struct sock *sk;

	th = skb->h.th;

	sk = skb->sk;

	if (!redo)
	{

		if (skb->pkt_type!=PACKET_HOST)
			goto discard_it;

		/*
		 * Pull up the IP header.
		 */

		skb_pull(skb, skb->h.raw-skb->data);

		/*
		 * Try to use the device checksum if provided.
		 * NOTE: the CHECKSUM_NONE case intentionally falls
		 * through into CHECKSUM_HW to verify the sum it just
		 * computed.
		 */

		switch (skb->ip_summed)
		{
			case CHECKSUM_NONE:
				skb->csum = csum_partial((char *)th, len, 0);
				/* fall through */
			case CHECKSUM_HW:
				if (tcp_v4_check(th,len,saddr,daddr,skb->csum))
					goto discard_it;
			default:
				/* CHECKSUM_UNNECESSARY */
		}

		sk = get_tcp_sock(saddr, th->source, daddr, th->dest,
				  dev->pa_addr, skb->redirport);

		if (!sk)
			goto no_tcp_socket;

		/* Pre-compute sequence numbers in host order; end_seq
		 * counts SYN and FIN as one unit of sequence space each. */
		skb->sk = sk;
		skb->seq = ntohl(th->seq);
		skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
		skb->ack_seq = ntohl(th->ack_seq);

		skb->acked = 0;
		skb->used = 0;
		skb->free = 1;
		skb->saddr = saddr;
		skb->daddr = daddr;
	}

	/*
	 * Socket busy: park the segment on the backlog for replay later.
	 */

	if (sk->users)
	{
		__skb_queue_tail(&sk->back_log, skb);
		return(0);
	}

	if (!sk->prot)
	{
		printk(KERN_DEBUG "tcp_rcv: sk->prot == NULL\n");
		return(0);
	}

	atomic_add(skb->truesize, &sk->rmem_alloc);

	if (sk->state == TCP_ESTABLISHED)
	{
		tcp_rcv_established(sk, skb, th, len);
		return 0;
	}

	if (sk->state == TCP_LISTEN)
	{
		/*
		 * find possible connection requests
		 */
		sk = tcp_v4_check_req(sk, skb);

		if (sk == NULL)
		{
			goto discard_it;
		}
	}

	if (tcp_rcv_state_process(sk, skb, th, opt, len) == 0)
		return 0;

no_tcp_socket:

	/*
	 * No such TCB.  If th->rst is 0 send a reset
	 * (checked in tcp_send_reset)
	 */

	tcp_v4_send_reset(daddr, saddr, th, &tcp_prot, opt, dev,
			  skb->ip_hdr->tos, 255);

discard_it:

	/*
	 * Discard frame
	 */

	kfree_skb(skb, FREE_READ);
	return 0;
}
/*
 * Build the IPv4 header for an outgoing TCP skb via ip_build_header(),
 * using the socket's addresses, options, TOS/TTL and route cache.
 *
 * Returns ip_build_header()'s result (header length on success,
 * negative on failure).  On success, sets the Don't-Fragment bit for
 * Path MTU discovery unless that is compiled out.
 */
int tcp_v4_build_header(struct sock *sk, struct sk_buff *skb)
{
	struct device *dev = NULL;
	int tmp;

	tmp = ip_build_header(skb, sk->saddr, sk->daddr, &dev,
			      IPPROTO_TCP, sk->opt, skb->truesize,
			      sk->ip_tos, sk->ip_ttl,
			      &sk->ip_route_cache);
	skb->dev = dev;

#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
	if (tmp > 0)
	{
		skb->ip_hdr->frag_off |= htons(IP_DF);
	}
#endif

	return tmp;
}
/*
 * Final per-socket teardown for TCP/IPv4.
 *
 * Stops all transmit timers (and the keepalive slow timer if armed),
 * then frees every skb still queued for transmission and every
 * out-of-order segment.  Always returns 0.
 */
static int tcp_v4_destroy_sock(struct sock *sk)
{
	struct sk_buff *skb;

	tcp_clear_xmit_timers(sk);

	if (sk->keepopen)
	{
		tcp_dec_slow_timer(TCP_SLT_KEEPALIVE);
	}

	/*
	 * Cleanup up the write buffer.
	 */

	while((skb = skb_dequeue(&sk->write_queue)) != NULL) {
		IS_SKB(skb);
		skb->free = 1;
		kfree_skb(skb, FREE_WRITE);
	}

	/*
	 * Cleans up our, hopefully empty, out_of_order_queue
	 */

	while((skb = skb_dequeue(&sk->out_of_order_queue)) != NULL) {
		IS_SKB(skb);
		kfree_skb(skb, FREE_READ);
	}

	return 0;
}
/*
 * Advance tp->send_head to the next unsent skb on the write queue.
 *
 * The write queue is circular: when the walk reaches the queue head
 * itself (the struct sk_buff_head cast to an skb), there is nothing
 * left to send and send_head is set to NULL.
 */
static __inline__ void update_send_head(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

	tp->send_head = tp->send_head->next;

	if (tp->send_head == (struct sk_buff *) &sk->write_queue)
	{
		tp->send_head = NULL;
	}

}
!after(skb->end_seq, tp->snd_una + tp->snd_wnd) && + sk->retransmits == 0); +} + +/* + * This is the main buffer sending routine. We queue the buffer + * having checked it is sane seeming. + */ + +int tcp_send_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcphdr * th = skb->h.th; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int size; + + /* + * length of packet (not counting length of pre-tcp headers) + */ + + size = skb->len - ((unsigned char *) th - skb->data); + + /* + * Sanity check it.. + */ + + if (size < sizeof(struct tcphdr) || size > skb->len) + { + printk("tcp_send_skb: bad skb (skb = %p, data = %p, th = %p, len = %u)\n", + skb, skb->data, th, skb->len); + kfree_skb(skb, FREE_WRITE); + return 0; + } + + /* + * If we have queued a header size packet.. (these crash a few + * tcp stacks if ack is not set) + */ + + if (size == sizeof(struct tcphdr)) + { + /* + * If it's got a syn or fin discard + */ + if(!th->syn && !th->fin) + { + printk("tcp_send_skb: attempt to queue a bogon.\n"); + kfree_skb(skb,FREE_WRITE); + return 0; + } + } + + + /* + * Actual processing. 
+ */ + + tcp_statistics.TcpOutSegs++; + skb->seq = ntohl(th->seq); + skb->end_seq = skb->seq + size - 4*th->doff; + + + if (tp->send_head || !tcp_snd_test(sk, skb)) + { + /* + * Remember where we must start sending + */ + + if (tp->send_head == NULL) + tp->send_head = skb; + + skb_queue_tail(&sk->write_queue, skb); + + if (sk->packets_out == 0 && !tp->pending) + { + tp->pending = TIME_PROBE0; + tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto); + } + + } + else + { + struct sk_buff * buff; + + /* + * This is going straight out + */ + + skb_queue_tail(&sk->write_queue, skb); + + clear_delayed_acks(sk); + + th->ack_seq = htonl(tp->rcv_nxt); + th->window = htons(tcp_select_window(sk)); + + tp->af_specific->send_check(sk, th, size, skb); + + tp->snd_nxt = skb->end_seq; + + atomic_inc(&sk->packets_out); + + skb->when = jiffies; + + buff = skb_clone(skb, GFP_ATOMIC); + atomic_add(buff->truesize, &sk->wmem_alloc); + + tp->af_specific->queue_xmit(sk, skb->dev, buff, 1); + + if (!tcp_timer_is_set(sk, TIME_RETRANS)) + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + } + + return 0; +} + +/* + * Function to create two new tcp segments. + * Shrinks the given segment to the specified size and appends a new + * segment with the rest of the packet to the list. + * This won't be called frenquently, I hope... + */ + +static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *buff; + struct tcphdr *th, *nth; + int nsize; + int tmp; + + th = skb->h.th; + + /* size of new segment */ + nsize = skb->tail - ((unsigned char *) (th + 1)) - len; + + if (nsize <= 0) + { + printk(KERN_DEBUG "tcp_fragment: bug size <= 0\n"); + return -1; + } + + /* + * Get a new skb... 
force flag on + */ + buff = sock_wmalloc(sk, nsize + 128 + sk->prot->max_header + 15, 1, + GFP_ATOMIC); + + if (buff == NULL) + return -1; + + buff->sk = sk; + buff->localroute = sk->localroute; + + /* + * Put headers on the new packet + */ + + tmp = tp->af_specific->build_net_header(sk, buff); + + if (tmp < 0) + { + sock_wfree(sk, buff); + return -1; + } + + /* + * Move the TCP header over + */ + + nth = (struct tcphdr *) skb_put(buff, sizeof(*th)); + + buff->h.th = nth; + + memcpy(nth, th, sizeof(*th)); + + /* + * Correct the new header + */ + + buff->seq = skb->seq + len; + buff->end_seq = skb->end_seq; + nth->seq = htonl(buff->seq); + nth->check = 0; + nth->doff = 5; + + /* urg data is always an headache */ + if (th->urg) + { + if (th->urg_ptr > len) + { + th->urg = 0; + nth->urg_ptr -= len; + } + else + { + nth->urg = 0; + } + } + + /* + * Copy TCP options and data start to our new buffer + */ + + buff->csum = csum_partial_copy(((u8 *)(th + 1)) + len, + skb_put(buff, nsize), + nsize, 0); + + + skb->end_seq -= nsize; + + skb_trim(skb, skb->len - nsize); + + /* remember to checksum this packet afterwards */ + th->check = 0; + skb->csum = csum_partial((u8*) (th + 1), skb->tail - ((u8 *) (th + 1)), + 0); + + skb_append(skb, buff); + + return 0; +} + +static void tcp_wrxmit_prob(struct sock *sk, struct sk_buff *skb) +{ + /* + * This is acked data. We can discard it. This + * cannot currently occur. + */ + + sk->retransmits = 0; + + printk(KERN_DEBUG "tcp_write_xmit: bug skb in write queue\n"); + + update_send_head(sk); + + skb_unlink(skb); + skb->sk = NULL; + skb->free = 1; + kfree_skb(skb, FREE_WRITE); + + if (!sk->dead) + sk->write_space(sk); +} + +static int tcp_wrxmit_frag(struct sock *sk, struct sk_buff *skb, int size) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + printk(KERN_DEBUG "tcp_write_xmit: frag needed size=%d mss=%d\n", + size, sk->mss); + + if (tcp_fragment(sk, skb, sk->mss)) + { + /* !tcp_frament Failed! 
*/ + tp->send_head = skb; + atomic_dec(&sk->packets_out); + return -1; + } + else + { + /* + * If tcp_fragment succeded then + * the send head is the resulting + * fragment + */ + tp->send_head = skb->next; + } + return 0; +} + +/* + * This routine writes packets to the network. + * It advances the send_head. + * This happens as incoming acks open up the remote window for us. + */ + +void tcp_write_xmit(struct sock *sk) +{ + struct sk_buff *skb; + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + u16 rcv_wnd; + int sent_pkts = 0; + + /* + * The bytes will have to remain here. In time closedown will + * empty the write queue and all will be happy + */ + + if(sk->zapped) + return; + + /* + * Anything on the transmit queue that fits the window can + * be added providing we are: + * + * a) following SWS avoidance [and Nagle algorithm] + * b) not exceeding our congestion window. + * c) not retransmiting [Nagle] + */ + + start_bh_atomic(); + + rcv_wnd = htons(tcp_select_window(sk)); + + while((skb = tp->send_head) && tcp_snd_test(sk, skb)) + { + struct tcphdr *th; + struct sk_buff *buff; + int size; + + IS_SKB(skb); + + /* + * See if we really need to send the packet. + */ + + if (!after(skb->end_seq, tp->snd_una)) + { + tcp_wrxmit_prob(sk, skb); + continue; + } + + + /* + * Advance the send_head + * This one is going out. + */ + + update_send_head(sk); + + atomic_inc(&sk->packets_out); + + +/* + * put in the ack seq and window at this point rather than earlier, + * in order to keep them monotonic. We really want to avoid taking + * back window allocations. That's legal, but RFC1122 says it's frowned on. + * Ack and window will in general have changed since this packet was put + * on the write queue. 
+ */ + + th = skb->h.th; + size = skb->len - (((unsigned char *) th) - skb->data); + + if (size - (th->doff << 2) > sk->mss) + { + if (tcp_wrxmit_frag(sk, skb, size)) + break; + } + + th->ack_seq = htonl(tp->rcv_nxt); + th->window = rcv_wnd; + + tp->af_specific->send_check(sk, th, size, skb); + +#ifdef TCP_DEBUG + if (before(skb->end_seq, tp->snd_nxt)) + printk(KERN_DEBUG "tcp_write_xmit:" + " sending already sent seq\n"); +#endif + + tp->snd_nxt = skb->end_seq; + + skb->when = jiffies; + clear_delayed_acks(sk); + + buff = skb_clone(skb, GFP_ATOMIC); + atomic_add(buff->truesize, &sk->wmem_alloc); + + sent_pkts = 1; + tp->af_specific->queue_xmit(sk, skb->dev, buff, 1); + + } + + if (sent_pkts && !tcp_timer_is_set(sk, TIME_RETRANS)) + { + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + } + + end_bh_atomic(); +} + + + +/* + * This function returns the amount that we can raise the + * usable window based on the following constraints + * + * 1. The window can never be shrunk once it is offered (RFC 793) + * 2. We limit memory per socket + */ + + +unsigned short tcp_select_window(struct sock *sk) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + int mss = sk->mss; + long free_space = sock_rspace(sk); + long window; + long cur_win; + long usable; + + + if (sk->window_clamp) + { + free_space = min(sk->window_clamp, free_space); + mss = min(sk->window_clamp, mss); + } + + /* + * compute the actual window i.e. + * old_window - received_bytes_on_that_win + */ + + cur_win = tp->rcv_wup - (tp->rcv_nxt - tp->rcv_wnd); + window = tp->rcv_wnd; + + if ( cur_win < 0 ) + { + cur_win = 0; + printk(KERN_DEBUG "TSW: win < 0 w=%d 1=%u 2=%u\n", + tp->rcv_wnd, tp->rcv_nxt, tp->rcv_wup); + } + + /* + * RFC 1122: + * "the suggested [SWS] avoidance algoritm for the receiver is to keep + * RECV.NEXT + RCV.WIN fixed until: + * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)" + * + * i.e. 
don't raise the right edge of the window until you can't raise + * it MSS bytes + */ + + /* + * It would be a good idea if it didn't break header prediction. + * and BSD made the header predition standard... + * It expects the same value in the header i.e. th->window to be + * constant + */ + + usable = free_space - cur_win; + if (usable < 0) + { + usable = 0; + } + + if ( window < usable ) + { + /* + * Window is not blocking the sender + * and we have enought free space for it + */ + + if (cur_win > (sk->mss << 1)) + goto out; + } + + + if (window >= usable) + { + /* + * We are offering too much, cut it down... + * but don't shrink the window + */ + + window = max(usable, cur_win); + } + else + { + if ((usable - window) >= mss) + { + window += mss; + } + } + + out: + tp->rcv_wnd = window; + tp->rcv_wup = tp->rcv_nxt; + return window; +} + +static int tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb) +{ + struct tcphdr *th1, *th2; + int size1, size2, avail; + struct sk_buff *buff = skb->next; + + th1 = skb->h.th; + + if (th1->urg) + return -1; + + avail = skb_tailroom(skb); + + /* + * size of tcp payload + */ + + size1 = skb->tail - (u8 *) (th1 + 1); + + th2 = buff->h.th; + + size2 = buff->tail - (u8 *) (th2 + 1); + + if (size2 > avail || size1 + size2 > sk->mss ) + return -1; + + /* + * ok. we will be able to collapse the packet + */ + + skb_unlink(buff); + + memcpy(skb_put(skb, size2), ((char *) th2) + (th2->doff << 2), size2); + + /* + * update sizes on original skb. both TCP and IP + */ + + skb->end_seq += size2; + + if (th2->urg) + { + th1->urg = 1; + th1->urg_ptr = th2->urg_ptr + size1; + } + + /* + * ... and off you go. 
+ */ + + buff->free = 1; + kfree_skb(buff, FREE_WRITE); + atomic_dec(&sk->packets_out); + + /* + * Header checksum will be set by the retransmit procedure + * after calling rebuild header + */ + + th1->check = 0; + skb->csum = csum_partial((u8*) (th1+1), size1 + size2, 0); + + return 0; +} + + +/* + * A socket has timed out on its send queue and wants to do a + * little retransmitting. + * retransmit_head can be different from the head of the write_queue + * if we are doing fast retransmit. + */ + +void tcp_do_retransmit(struct sock *sk, int all) +{ + struct sk_buff * skb; + int ct=0; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + start_bh_atomic(); + + if (tp->retrans_head == NULL) + tp->retrans_head = skb_peek(&sk->write_queue); + + if (tp->retrans_head == tp->send_head) + tp->retrans_head = NULL; + + while ((skb = tp->retrans_head) != NULL) + { + struct sk_buff *buff; + struct tcphdr *th; + int tcp_size; + int size; + + IS_SKB(skb); + + /* + * In general it's OK just to use the old packet. However we + * need to use the current ack and window fields. Urg and + * urg_ptr could possibly stand to be updated as well, but we + * don't keep the necessary data. That shouldn't be a problem, + * if the other end is doing the right thing. Since we're + * changing the packet, we have to issue a new IP identifier. 
+ */ + + th = skb->h.th; + + tcp_size = skb->tail - ((unsigned char *) (th + 1)); + + if (tcp_size > sk->mss) + { + if (tcp_fragment(sk, skb, sk->mss)) + { + printk(KERN_DEBUG "tcp_fragment failed\n"); + return; + } + atomic_inc(&sk->packets_out); + } + + if (!th->syn && + tcp_size < (sk->mss >> 1) && + skb->next != tp->send_head && + skb->next != (struct sk_buff *)&sk->write_queue) + { + tcp_retrans_try_collapse(sk, skb); + } + + if (tp->af_specific->rebuild_header(sk, skb)) + { +#ifdef TCP_DEBUG + printk(KERN_DEBUG "tcp_do_rebuild_header failed\n"); +#endif + break; + } + + if (sk->debug) + printk("retransmit sending\n"); + + /* + * update ack and window + */ + + th->ack_seq = htonl(tp->rcv_nxt); + th->window = ntohs(tcp_select_window(sk)); + + size = skb->tail - (unsigned char *) th; + tp->af_specific->send_check(sk, th, size, skb); + + skb->when = jiffies; + buff = skb_clone(skb, GFP_ATOMIC); + atomic_add(buff->truesize, &sk->wmem_alloc); + + clear_delayed_acks(sk); + + tp->af_specific->queue_xmit(sk, skb->dev, buff, 1); + + /* + * Count retransmissions + */ + + ct++; + sk->prot->retransmits ++; + tcp_statistics.TcpRetransSegs++; + + /* + * Record the high sequence number to help avoid doing + * to much fast retransmission. + */ + + if (sk->retransmits) + tp->high_seq = tp->snd_nxt; + + /* + * Only one retransmit requested. + */ + + if (!all) + break; + + /* + * This should cut it off before we send too many packets. + */ + + if (ct >= sk->cong_window) + break; + + /* + * Advance the pointer + */ + + tp->retrans_head = skb->next; + if ((tp->retrans_head == tp->send_head) || + (tp->retrans_head == (struct sk_buff *) &sk->write_queue)) + { + tp->retrans_head = NULL; + } + } + + end_bh_atomic(); +} + +/* + * Send a fin. 
+ */ + +void tcp_send_fin(struct sock *sk) +{ + struct tcphdr *th =(struct tcphdr *)&sk->dummy_th; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct tcphdr *t1; + struct sk_buff *buff; + int tmp; + + + buff = sock_wmalloc(sk, MAX_RESET_SIZE, 1, GFP_KERNEL); + + if (buff == NULL) + { + /* This is a disaster if it occurs */ + printk("tcp_send_fin: Impossible malloc failure"); + return; + } + + /* + * Administrivia + */ + + buff->sk = sk; + buff->localroute = sk->localroute; + buff->csum = 0; + + /* + * Put in the IP header and routing stuff. + */ + + tmp = tp->af_specific->build_net_header(sk, buff); + + if (tmp < 0) + { + int t; + /* + * Finish anyway, treat this as a send that got lost. + * (Not good). + */ + + buff->free = 1; + sock_wfree(sk,buff); + sk->write_seq++; + t=del_timer(&sk->timer); + if(t) + add_timer(&sk->timer); + else + tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); + return; + } + + /* + * We ought to check if the end of the queue is a buffer and + * if so simply add the fin to that buffer, not send it ahead. + */ + + t1 =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr)); + buff->h.th = t1; + + memcpy(t1, th, sizeof(*t1)); + buff->seq = sk->write_seq; + sk->write_seq++; + buff->end_seq = sk->write_seq; + t1->seq = htonl(buff->seq); + t1->ack_seq = htonl(tp->rcv_nxt); + t1->window = htons(tcp_select_window(sk)); + t1->fin = 1; + + tp->af_specific->send_check(sk, t1, sizeof(*t1), buff); + + /* + * The fin can only be transmited after the data. 
+ */ + + skb_queue_tail(&sk->write_queue, buff); + + if (tp->send_head == NULL) + { + struct sk_buff *skb1; + + atomic_inc(&sk->packets_out); + tp->snd_nxt = sk->write_seq; + buff->when = jiffies; + + skb1 = skb_clone(buff, GFP_KERNEL); + atomic_add(skb1->truesize, &sk->wmem_alloc); + + tp->af_specific->queue_xmit(sk, skb1->dev, skb1, 1); + + if (!tcp_timer_is_set(sk, TIME_RETRANS)) + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + } +} + +int tcp_send_synack(struct sock *sk) +{ + struct tcp_opt * tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff * skb; + struct sk_buff * buff; + struct tcphdr *th; + unsigned char *ptr; + int tmp; + + skb = sock_wmalloc(sk, MAX_SYN_SIZE, 1, GFP_ATOMIC); + + if (skb == NULL) + { + return -ENOMEM; + } + + skb->sk = sk; + skb->localroute = sk->localroute; + + tmp = tp->af_specific->build_net_header(sk, skb); + + if (tmp < 0) + { + skb->free = 1; + kfree_skb(skb, FREE_WRITE); + return tmp; + } + + th =(struct tcphdr *) skb_put(skb, sizeof(struct tcphdr)); + skb->h.th = th; + memset(th, 0, sizeof(struct tcphdr)); + + th->syn = 1; + th->ack = 1; + + th->source = sk->dummy_th.source; + th->dest = sk->dummy_th.dest; + + skb->seq = tp->snd_una; + skb->end_seq = skb->seq + 1 /* th->syn */ ; + th->seq = ntohl(skb->seq); + + th->window = ntohs(tp->rcv_wnd); + + th->ack_seq = htonl(tp->rcv_nxt); + th->doff = sizeof(*th)/4 + 1; + + ptr = skb_put(skb, TCPOLEN_MSS); + ptr[0] = TCPOPT_MSS; + ptr[1] = TCPOLEN_MSS; + ptr[2] = ((sk->mss) >> 8) & 0xff; + ptr[3] = (sk->mss) & 0xff; + skb->csum = csum_partial(ptr, TCPOLEN_MSS, 0); + + tp->af_specific->send_check(sk, th, sizeof(*th)+4, skb); + + skb_queue_tail(&sk->write_queue, skb); + + atomic_inc(&sk->packets_out); + + skb->when = jiffies; + buff = skb_clone(skb, GFP_ATOMIC); + + atomic_add(skb->truesize, &sk->wmem_alloc); + + tp->af_specific->queue_xmit(sk, skb->dev, buff, 1); + + tcp_reset_xmit_timer(sk, TIME_RETRANS, TCP_TIMEOUT_INIT); + + tcp_statistics.TcpOutSegs++; + + return 0; +} + +/* + * Set up 
the timers for sending a delayed ack.. + * + * rules for delaying an ack: + * - delay time <= 0.5 HZ + * - must send at least every 2 full sized packets + * - we don't have a window update to send + */ + +void tcp_send_delayed_ack(struct sock * sk, int max_timeout) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + unsigned long timeout, now; + + /* Calculate new timeout */ + now = jiffies; + timeout = tp->ato; + + if (timeout > max_timeout || sk->bytes_rcv > (sk->mss << 2)) + { + timeout = now; + } + else + timeout += now; + + /* Use new timeout only if there wasn't a older one earlier */ + if (!del_timer(&tp->delack_timer) || timeout < tp->delack_timer.expires) + { + tp->delack_timer.expires = timeout; + } + + add_timer(&tp->delack_timer); +} + + + +/* + * This routine sends an ack and also updates the window. + */ + +void tcp_send_ack(struct sock *sk) +{ + struct sk_buff *buff; + struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + struct tcphdr *th; + int tmp; + + + if(sk->zapped) + { + /* We have been reset, we may not send again */ + return; + } + + /* + * We need to grab some memory, and put together an ack, + * and then put it into the queue to be sent. + */ + + buff = sock_wmalloc(sk, MAX_ACK_SIZE, 1, GFP_ATOMIC); + if (buff == NULL) + { + /* + * Force it to send an ack. We don't have to do this + * (ACK is unreliable) but it's much better use of + * bandwidth on slow links to send a spare ack than + * resend packets. + */ + + tcp_send_delayed_ack(sk, HZ/2); + return; + } + + clear_delayed_acks(sk); + + /* + * Assemble a suitable TCP frame + */ + + buff->sk = sk; + buff->localroute = sk->localroute; + buff->csum = 0; + + /* + * Put in the IP header and routing stuff. + */ + + tmp = tp->af_specific->build_net_header(sk, buff); + + if (tmp < 0) + { + buff->free = 1; + sock_wfree(sk, buff); + return; + } + + th =(struct tcphdr *)skb_put(buff,sizeof(struct tcphdr)); + + memcpy(th, &sk->dummy_th, sizeof(struct tcphdr)); + + /* + * Swap the send and the receive. 
+ */ + + th->window = ntohs(tcp_select_window(sk)); + th->seq = ntohl(tp->snd_nxt); + th->ack_seq = ntohl(tp->rcv_nxt); + + /* + * Fill in the packet and send it + */ + + tp->af_specific->send_check(sk, th, sizeof(struct tcphdr), buff); + + if (sk->debug) + printk("\rtcp_send_ack: seq %x ack %x\n", + tp->snd_nxt, tp->rcv_nxt); + + tp->af_specific->queue_xmit(sk, buff->dev, buff, 1); + + tcp_statistics.TcpOutSegs++; +} + +/* + * This routine sends a packet with an out of date sequence + * number. It assumes the other end will try to ack it. + */ + +void tcp_write_wakeup(struct sock *sk) +{ + struct sk_buff *buff, *skb; + struct tcphdr *t1; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int tmp; + + if (sk->zapped) + return; /* After a valid reset we can send no more */ + + /* + * Write data can still be transmitted/retransmitted in the + * following states. If any other state is encountered, return. + * [listen/close will never occur here anyway] + */ + + if (sk->state != TCP_ESTABLISHED && + sk->state != TCP_CLOSE_WAIT && + sk->state != TCP_FIN_WAIT1 && + sk->state != TCP_LAST_ACK && + sk->state != TCP_CLOSING + ) + { + return; + } + + if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) && + (skb=tp->send_head)) + { + /* + * We are probing the opening of a window + * but the window size is != 0 + * must have been a result SWS avoidance ( sender ) + */ + + struct tcphdr *th; + unsigned long win_size; + + win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una); + + if (win_size < skb->end_seq - skb->seq) + { + if (tcp_fragment(sk, skb, win_size)) + { + printk(KERN_DEBUG "tcp_write_wakeup: " + "fragment failed\n"); + return; + } + } + + + th = skb->h.th; + + tp->af_specific->send_check(sk, th, th->doff * 4 + win_size, + skb); + + buff = skb_clone(skb, GFP_ATOMIC); + + atomic_add(buff->truesize, &sk->wmem_alloc); + atomic_inc(&sk->packets_out); + + clear_delayed_acks(sk); + + if (!tcp_timer_is_set(sk, TIME_RETRANS)) + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + + 
skb->when = jiffies; + + update_send_head(sk); + + tp->snd_nxt = skb->end_seq; + } + else + { + buff = sock_wmalloc(sk,MAX_ACK_SIZE, 1, GFP_ATOMIC); + if (buff == NULL) + return; + + buff->free = 1; + buff->sk = sk; + buff->localroute = sk->localroute; + buff->csum = 0; + + /* + * Put in the IP header and routing stuff. + */ + + tmp = tp->af_specific->build_net_header(sk, buff); + + if (tmp < 0) + { + sock_wfree(sk, buff); + return; + } + + t1 = (struct tcphdr *) skb_put(buff, sizeof(struct tcphdr)); + memcpy(t1,(void *) &sk->dummy_th, sizeof(*t1)); + + /* + * Use a previous sequence. + * This should cause the other end to send an ack. + */ + + t1->seq = htonl(tp->snd_nxt-1); +/* t1->fin = 0; -- We are sending a 'previous' sequence, and 0 bytes of data - thus no FIN bit */ + t1->ack_seq = htonl(tp->rcv_nxt); + t1->window = htons(tcp_select_window(sk)); + + tp->af_specific->send_check(sk, t1, sizeof(*t1), buff); + } + + /* + * Send it. + */ + + tp->af_specific->queue_xmit(sk, buff->dev, buff, 1); + tcp_statistics.TcpOutSegs++; +} + +/* + * A window probe timeout has occurred. + * If window is not closed send a partial packet + * else a zero probe. + */ + +void tcp_send_probe0(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + if (sk->zapped) + return; /* After a valid reset we can send no more */ + + + tcp_write_wakeup(sk); + + tp->pending = TIME_PROBE0; + + tp->backoff++; + tp->probes_out++; + + tcp_reset_xmit_timer (sk, TIME_PROBE0, + min(tp->rto << tp->backoff, 120*HZ)); +} diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c new file mode 100644 index 000000000..e96089fab --- /dev/null +++ b/net/ipv4/tcp_timer.c @@ -0,0 +1,585 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). 
+ * + * Version: @(#)tcp.c 1.0.16 05/25/93 + * + * Authors: Ross Biro, <bir7@leland.Stanford.Edu> + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Mark Evans, <evansmp@uhura.aston.ac.uk> + * Corey Minyard <wf-rch!minyard@relay.EU.net> + * Florian La Roche, <flla@stud.uni-sb.de> + * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> + * Linus Torvalds, <torvalds@cs.helsinki.fi> + * Alan Cox, <gw4pts@gw4pts.ampr.org> + * Matthew Dillon, <dillon@apollo.west.oic.com> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Jorge Cwik, <jorge@laser.satlink.net> + */ + +#include <net/tcp.h> + +static void tcp_sltimer_handler(unsigned long); +static void tcp_syn_recv_timer(unsigned long); +static void tcp_keepalive(unsigned long data); + +struct timer_list tcp_slow_timer = { + NULL, NULL, + 0, 0, + tcp_sltimer_handler, +}; + + +struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = { + {0, TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer}, /* SYNACK */ + {0, TCP_KEEPALIVE_PERIOD, 0, tcp_keepalive} /* KEEPALIVE */ +}; + +/* + * Using different timers for retransmit, delayed acks and probes + * We may wish use just one timer maintaining a list of expire jiffies + * to optimize. 
+ */ + +void tcp_init_xmit_timers(struct sock *sk) +{ + init_timer(&sk->tp_pinfo.af_tcp.retransmit_timer); + sk->tp_pinfo.af_tcp.retransmit_timer.function=&tcp_retransmit_timer; + sk->tp_pinfo.af_tcp.retransmit_timer.data = (unsigned long) sk; + + init_timer(&sk->tp_pinfo.af_tcp.delack_timer); + sk->tp_pinfo.af_tcp.delack_timer.function=&tcp_delack_timer; + sk->tp_pinfo.af_tcp.delack_timer.data = (unsigned long) sk; + + init_timer(&sk->tp_pinfo.af_tcp.probe_timer); + sk->tp_pinfo.af_tcp.probe_timer.function=&tcp_probe_timer; + sk->tp_pinfo.af_tcp.probe_timer.data = (unsigned long) sk; +} + +/* + * Reset the retransmission timer + */ + +void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + if((long)when <= 0) + { + printk("xmit_timer <= 0 - timer:%d when:%lx\n", what, when); + when=HZ/50; + } + + switch (what) { + case TIME_RETRANS: + /* + * When seting the transmit timer the probe timer + * should not be set. + * The delayed ack timer can be set if we are changing the + * retransmit timer when removing acked frames. 
+ */ + del_timer(&tp->probe_timer); + del_timer(&tp->retransmit_timer); + tp->retransmit_timer.expires=jiffies+when; + add_timer(&tp->retransmit_timer); + break; + + case TIME_DACK: + del_timer(&tp->delack_timer); + tp->delack_timer.expires=jiffies+when; + add_timer(&tp->delack_timer); + break; + + case TIME_PROBE0: + del_timer(&tp->probe_timer); + tp->probe_timer.expires=jiffies+when; + add_timer(&tp->probe_timer); + break; + + case TIME_WRITE: + printk("bug: tcp_reset_xmit_timer TIME_WRITE\n"); + break; + + default: + printk("bug: unknown timer value\n"); + } +} + +void tcp_clear_xmit_timer(struct sock *sk, int what) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + switch (what) { + case TIME_RETRANS: + del_timer(&tp->retransmit_timer); + break; + case TIME_DACK: + del_timer(&tp->delack_timer); + break; + case TIME_PROBE0: + del_timer(&tp->probe_timer); + break; + default: + printk("bug: unknown timer value\n"); + } +} + +int tcp_timer_is_set(struct sock *sk, int what) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + switch (what) { + case TIME_RETRANS: + return tp->retransmit_timer.next != NULL; + break; + case TIME_DACK: + return tp->delack_timer.next != NULL; + break; + case TIME_PROBE0: + return tp->probe_timer.next != NULL; + break; + default: + printk("bug: unknown timer value\n"); + } + return 0; +} + +void tcp_clear_xmit_timers(struct sock *sk) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + del_timer(&tp->retransmit_timer); + del_timer(&tp->delack_timer); + del_timer(&tp->probe_timer); +} + +/* + * A write timeout has occurred. Process the after effects. BROKEN (badly) + */ + +static int tcp_write_timeout(struct sock *sk) +{ + /* + * Look for a 'soft' timeout. + */ + if ((sk->state == TCP_ESTABLISHED && sk->retransmits && !(sk->retransmits & 7)) + || (sk->state != TCP_ESTABLISHED && sk->retransmits > TCP_RETR1)) + { + /* + * Attempt to recover if arp has changed (unlikely!) or + * a route has shifted (not supported prior to 1.3). 
+ */ + ip_rt_advice(&sk->ip_route_cache, 0); + } + + /* + * Have we tried to SYN too many times (repent repent 8)) + */ + + if(sk->retransmits > TCP_SYN_RETRIES && sk->state==TCP_SYN_SENT) + { + if(sk->err_soft) + sk->err=sk->err_soft; + else + sk->err=ETIMEDOUT; +#ifdef TCP_DEBUG + printk(KERN_DEBUG "syn timeout\n"); +#endif + + sk->error_report(sk); + tcp_clear_xmit_timers(sk); + tcp_statistics.TcpAttemptFails++; /* Is this right ??? - FIXME - */ + tcp_set_state(sk,TCP_CLOSE); + /* Don't FIN, we got nothing back */ + return 0; + } + /* + * Has it gone just too far ? + */ + if (sk->retransmits > TCP_RETR2) + { + if(sk->err_soft) + sk->err = sk->err_soft; + else + sk->err = ETIMEDOUT; + sk->error_report(sk); + + tcp_clear_xmit_timers(sk); + + /* + * Time wait the socket + */ + if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 || sk->state == TCP_CLOSING ) + { + tcp_set_state(sk,TCP_TIME_WAIT); + tcp_reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); + } + else + { + /* + * Clean up time. + */ + tcp_set_state(sk, TCP_CLOSE); + return 0; + } + } + return 1; +} + + +void tcp_delack_timer(unsigned long data) { + + struct sock *sk = (struct sock*)data; + + if(sk->zapped) + { + return; + } + + if (sk->delayed_acks) + { + tcp_read_wakeup(sk); + } +} + +void tcp_probe_timer(unsigned long data) { + + struct sock *sk = (struct sock*)data; + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + if(sk->zapped) + { + return; + } + + if (sk->users) + { + /* + * Try again in second + */ + + tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ); + return; + } + + /* + * *WARNING* RFC 1122 forbids this + * FIXME: We ought not to do it, Solaris 2.5 actually has fixing + * this behaviour in Solaris down as a bug fix. 
[AC] + */ + if (tp->probes_out > TCP_RETR2) + { + if(sk->err_soft) + sk->err = sk->err_soft; + else + sk->err = ETIMEDOUT; + sk->error_report(sk); + + /* + * Time wait the socket + */ + if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 + || sk->state == TCP_CLOSING ) + { + tcp_set_state(sk, TCP_TIME_WAIT); + tcp_reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN); + } + else + { + /* + * Clean up time. + */ + tcp_set_state(sk, TCP_CLOSE); + } + } + + tcp_send_probe0(sk); +} + +static __inline__ int tcp_keepopen_proc(struct sock *sk) +{ + int res = 0; + + if (sk->state == TCP_ESTABLISHED || sk->state == TCP_CLOSE_WAIT) + { + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + __u32 elapsed = jiffies - tp->rcv_tstamp; + + if (elapsed >= TCP_KEEPALIVE_TIME) + { + if (tp->probes_out > TCP_KEEPALIVE_PROBES) + { + if(sk->err_soft) + sk->err = sk->err_soft; + else + sk->err = ETIMEDOUT; + + tcp_set_state(sk, TCP_CLOSE); + } + else + { + tp->probes_out++; + tp->pending = TIME_KEEPOPEN; + tcp_write_wakeup(sk); + res = 1; + } + } + } + return res; +} + +/* + * Check all sockets for keepalive timer + * Called every 75 seconds + * This timer is started by af_inet init routine and is constantly + * running. + * + * It might be better to maintain a count of sockets that need it using + * setsockopt/tcp_destroy_sk and only set the timer when needed. + */ + +/* + * don't send over 5 keepopens at a time to avoid burstiness + * on big servers [AC] + */ +#define MAX_KA_PROBES 5 + +static void tcp_keepalive(unsigned long data) +{ + struct sock *sk; + int count = 0; + int i; + + for(i=0; i < SOCK_ARRAY_SIZE; i++) + { + sk = tcp_prot.sock_array[i]; + while (sk) + { + if (sk->keepopen) + { + count += tcp_keepopen_proc(sk); + } + + if (count == MAX_KA_PROBES) + return; + + sk = sk->next; + } + } +} + +/* + * The TCP retransmit timer. This lacks a few small details. + * + * 1. 
An initial rtt timeout on the probe0 should cause what we can + * of the first write queue buffer to be split and sent. + * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report + * ETIMEDOUT if we know an additional 'soft' error caused this. + * tcp_err should save a 'soft error' for us. + * [Unless someone has broken it then it does, except for one 2.0 + * broken case of a send when the route/device is directly unreachable, + * and we error but should retry! - FIXME] [AC] + */ + +void tcp_retransmit_timer(unsigned long data) +{ + struct sock *sk = (struct sock*)data; + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + /* + * We are reset. We will send no more retransmits. + */ + + if(sk->zapped) + { + tcp_clear_xmit_timer(sk, TIME_RETRANS); + return; + } + + /* + * Clear delay ack timer + */ + + tcp_clear_xmit_timer(sk, TIME_DACK); + + /* + * Retransmission + */ + + tp->retrans_head = NULL; + + + if (sk->retransmits == 0) + { + /* + * remember window where we lost + * "one half of the current window but at least 2 segments" + */ + + sk->ssthresh = max(sk->cong_window >> 1, 2); + sk->cong_count = 0; + sk->cong_window = 1; + } + + atomic_inc(&sk->retransmits); + + tcp_do_retransmit(sk, 0); + + /* + * Increase the timeout each time we retransmit. Note that + * we do not increase the rtt estimate. rto is initialized + * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests + * that doubling rto each time is the least we can get away with. + * In KA9Q, Karn uses this for the first few times, and then + * goes to quadratic. netBSD doubles, but only goes up to *64, + * and clamps at 1 to 64 sec afterwards. Note that 120 sec is + * defined in the protocol as the maximum possible RTT. I guess + * we'll have to use something other than TCP to talk to the + * University of Mars. + * + * PAWS allows us longer timeouts and large windows, so once + * implemented ftp to mars will work nicely. We will have to fix + * the 120 second clamps though! 
+ */ + + tp->backoff++; + tp->rto = min(tp->rto << 1, 120*HZ); + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + + tcp_write_timeout(sk); +} + +/* + * Slow timer for SYN-RECV sockets + */ + +static void tcp_syn_recv_timer(unsigned long data) +{ + struct sock *sk; + unsigned long now = jiffies; + int i; + + for(i=0; i < SOCK_ARRAY_SIZE; i++) + { + sk = tcp_prot.sock_array[i]; + while (sk) + { + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + if (sk->state == TCP_LISTEN && !sk->users && + tp->syn_wait_queue) + { + struct open_request *req; + + req = tp->syn_wait_queue; + + do { + struct open_request *conn; + + conn = req; + req = req->dl_next; + + if (conn->sk) + { + continue; + } + + if ((long)(now - conn->expires) <= 0) + break; + + tcp_synq_unlink(tp, conn); + + if (conn->retrans >= TCP_RETR1) + { +#ifdef TCP_DEBUG + printk(KERN_DEBUG "syn_recv: " + "too many retransmits\n"); +#endif + (*conn->class->destructor)(conn); + tcp_dec_slow_timer(TCP_SLT_SYNACK); + sk->ack_backlog--; + kfree(conn); + + if (!tp->syn_wait_queue) + break; + } + else + { + __u32 timeo; + + (*conn->class->rtx_syn_ack)(sk, conn); + + conn->retrans++; +#ifdef TCP_DEBUG + printk(KERN_DEBUG "syn_ack rtx %d\n", conn->retrans); +#endif + timeo = min((TCP_TIMEOUT_INIT + << conn->retrans), + 120*HZ); + conn->expires = now + timeo; + tcp_synq_queue(tp, conn); + } + } while (req != tp->syn_wait_queue); + } + + sk = sk->next; + } + } +} + +void tcp_sltimer_handler(unsigned long data) +{ + struct tcp_sl_timer *slt = tcp_slt_array; + unsigned long next = ~0UL; + unsigned long now = jiffies; + int i; + + for (i=0; i < TCP_SLT_MAX; i++, slt++) + { + if (slt->count) + { + long trigger; + + trigger = slt->period - ((long)(now - slt->last)); + + if (trigger <= 0) + { + (*slt->handler)((unsigned long) slt); + slt->last = now; + trigger = slt->period; + } + next = min(next, trigger); + } + } + + if (next != ~0UL) + { + tcp_slow_timer.expires = now + next; + add_timer(&tcp_slow_timer); + } +} + +void 
__tcp_inc_slow_timer(struct tcp_sl_timer *slt) +{ + unsigned long now = jiffies; + unsigned long next = 0; + unsigned long when; + + slt->last = now; + + when = now + slt->period; + if (del_timer(&tcp_slow_timer)) + { + next = tcp_slow_timer.expires; + } + if (next && ((long)(next - when) < 0)) + { + when = next; + } + + tcp_slow_timer.expires = when; + add_timer(&tcp_slow_timer); +} diff --git a/net/ipv4/timer.c b/net/ipv4/timer.c index ebaa00d70..664d81167 100644 --- a/net/ipv4/timer.c +++ b/net/ipv4/timer.c @@ -50,7 +50,7 @@ #include <net/sock.h> #include <net/arp.h> -void delete_timer (struct sock *t) +void net_delete_timer (struct sock *t) { unsigned long flags; @@ -63,16 +63,16 @@ void delete_timer (struct sock *t) restore_flags (flags); } -void reset_timer (struct sock *t, int timeout, unsigned long len) +void net_reset_timer (struct sock *t, int timeout, unsigned long len) { - delete_timer (t); + net_delete_timer (t); t->timeout = timeout; #if 1 /* FIXME: ??? */ if ((int) len < 0) /* prevent close to infinite timers. THEY _DO_ */ len = 3; /* happen (negative values ?) - don't ask me why ! -FB */ #endif - t->timer.expires = len; + t->timer.expires = jiffies+len; add_timer (&t->timer); } @@ -92,18 +92,14 @@ void net_timer (unsigned long data) * only process if socket is not in use */ - cli(); - if (sk->inuse || in_bh) + if (sk->users) { - sk->timer.expires = 10; + sk->timer.expires = jiffies+HZ; add_timer(&sk->timer); sti(); return; } - sk->inuse = 1; - sti(); - /* Always see if we need to send an ack. */ if (sk->ack_backlog && !sk->zapped) @@ -118,10 +114,15 @@ void net_timer (unsigned long data) switch (why) { case TIME_DONE: - if (! 
sk->dead || sk->state != TCP_CLOSE) + /* If the socket hasn't been closed off, re-try a bit later */ + if (!sk->dead) { + net_reset_timer(sk, TIME_DONE, TCP_DONE_TIME); + break; + } + + if (sk->state != TCP_CLOSE) { - printk ("non dead socket in time_done\n"); - release_sock (sk); + printk ("non CLOSE socket in time_done\n"); break; } destroy_sock (sk); @@ -132,31 +133,22 @@ void net_timer (unsigned long data) * We've waited for a while for all the memory associated with * the socket to be freed. */ - if(sk->wmem_alloc!=0 || sk->rmem_alloc!=0) - { - sk->wmem_alloc++; /* So it DOESN'T go away */ - destroy_sock (sk); - sk->wmem_alloc--; /* Might now have hit 0 - fall through and do it again if so */ - sk->inuse = 0; /* This will be ok, the destroy won't totally work */ - } - if(sk->wmem_alloc==0 && sk->rmem_alloc==0) - destroy_sock(sk); /* Socket gone, DON'T update sk->inuse! */ - break; + + destroy_sock(sk); + break; + case TIME_CLOSE: /* We've waited long enough, close the socket. */ sk->state = TCP_CLOSE; - delete_timer (sk); - /* Kill the ARP entry in case the hardware has changed. */ - arp_destroy (sk->daddr, 0); + net_delete_timer (sk); if (!sk->dead) sk->state_change(sk); sk->shutdown = SHUTDOWN_MASK; - reset_timer (sk, TIME_DESTROY, TCP_DONE_TIME); - release_sock (sk); + net_reset_timer (sk, TIME_DONE, TCP_DONE_TIME); break; + default: printk ("net_timer: timer expired - reason %d is unknown\n", why); - release_sock (sk); break; } } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9976e2be2..29e44e88a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -45,6 +45,10 @@ * Arnt Gulbrandsen : New udp_send and stuff * Alan Cox : Cache last socket * Alan Cox : Route cache + * Jon Peatfield : Minor efficiency fix to sendto(). + * Mike Shaver : RFC1122 checks. + * Alan Cox : Nonblocking error fix. + * Willy Konynenberg : Transparent proxying support. 
* * * This program is free software; you can redistribute it and/or @@ -53,8 +57,34 @@ * 2 of the License, or (at your option) any later version. */ +/* RFC1122 Status: + 4.1.3.1 (Ports): + SHOULD send ICMP_PORT_UNREACHABLE in response to datagrams to + an un-listened port. (OK) + 4.1.3.2 (IP Options) + MUST pass IP options from IP -> application (OK) + MUST allow application to specify IP options (OK) + 4.1.3.3 (ICMP Messages) + MUST pass ICMP error messages to application (OK) + 4.1.3.4 (UDP Checksums) + MUST provide facility for checksumming (OK) + MAY allow application to control checksumming (OK) + MUST default to checksumming on (OK) + MUST discard silently datagrams with bad csums (OK) + 4.1.3.5 (UDP Multihoming) + MUST allow application to specify source address (OK) + SHOULD be able to communicate the chosen src addr up to application + when application doesn't choose (NOT YET - doesn't seem to be in the BSD API) + [Does opening a SOCK_PACKET and snooping your output count 8)] + 4.1.3.6 (Invalid Addresses) + MUST discard invalid source addresses (NOT YET -- will be implemented + in IP, so UDP will eventually be OK. Right now it's a violation.) + MUST only send datagrams with one of our addresses (NOT YET - ought to be OK ) + 950728 -- MS +*/ + #include <asm/system.h> -#include <asm/segment.h> +#include <asm/uaccess.h> #include <linux/types.h> #include <linux/sched.h> #include <linux/fcntl.h> @@ -80,7 +110,7 @@ #include <net/checksum.h> /* - * SNMP MIB for the UDP layer + * Snmp MIB for the UDP layer */ struct udp_mib udp_statistics; @@ -106,8 +136,6 @@ void udp_cache_zap(void) restore_flags(flags); } -static int udp_deliver(struct sock *sk, struct udphdr *uh, struct sk_buff *skb, struct device *dev, long saddr, long daddr, int len); - #define min(a,b) ((a)<(b)?(a):(b)) @@ -122,33 +150,40 @@ static int udp_deliver(struct sock *sk, struct udphdr *uh, struct sk_buff *skb, * to find the appropriate port. 
*/ -void udp_err(int err, unsigned char *header, unsigned long daddr, - unsigned long saddr, struct inet_protocol *protocol) +void udp_err(int type, int code, unsigned char *header, __u32 info, + __u32 daddr, __u32 saddr, struct inet_protocol *protocol, int len) { - struct udphdr *th; + struct udphdr *uh; struct sock *sk; - struct iphdr *ip=(struct iphdr *)header; - - header += 4*ip->ihl; /* * Find the 8 bytes of post IP header ICMP included for us */ + + if(len<sizeof(struct udphdr)) + return; - th = (struct udphdr *)header; + uh = (struct udphdr *)header; - sk = get_sock(&udp_prot, th->source, daddr, th->dest, saddr); + sk = get_sock(&udp_prot, uh->source, daddr, uh->dest, saddr, 0, 0); if (sk == NULL) return; /* No socket for error */ - if (err & 0xff00 ==(ICMP_SOURCE_QUENCH << 8)) + if (type == ICMP_SOURCE_QUENCH) { /* Slow down! */ if (sk->cong_window > 1) sk->cong_window = sk->cong_window/2; return; } + if (type == ICMP_PARAMETERPROB) + { + sk->err = EPROTO; + sk->error_report(sk); + return; + } + /* * Various people wanted BSD UDP semantics. Well they've come * back out because they slow down response to stuff like dead @@ -157,27 +192,36 @@ void udp_err(int err, unsigned char *header, unsigned long daddr, * client code people. */ - if (icmp_err_convert[err & 0xff].fatal) + /* RFC1122: OK. Passes ICMP errors back to application, as per */ + /* 4.1.3.3. */ + /* After the comment above, that should be no surprise. */ + + if(code<=NR_ICMP_UNREACH && icmp_err_convert[code].fatal) { - sk->err = icmp_err_convert[err & 0xff].errno; + /* + * 4.x BSD compatibility item. Break RFC1122 to + * get BSD socket semantics. 
+ */ + if(sk->bsdism && sk->state!=TCP_ESTABLISHED) + return; + sk->err = icmp_err_convert[code].errno; sk->error_report(sk); } } -static unsigned short udp_check(struct udphdr *uh, int len, unsigned long saddr, unsigned long daddr) +static unsigned short udp_check(struct udphdr *uh, int len, unsigned long saddr, unsigned long daddr, unsigned long base) { - return(csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, - csum_partial((char*)uh, len, 0))); + return(csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base)); } struct udpfakehdr { struct udphdr uh; - int daddr; - int other; - char *from; - int wcheck; + __u32 daddr; + __u32 other; + const char *from; + __u32 wcheck; }; /* @@ -186,13 +230,14 @@ struct udpfakehdr * for direct user->board I/O transfers. That one will be fun. */ -static void udp_getfrag(void *p, int saddr, char * to, unsigned int offset, unsigned int fraglen) +static int udp_getfrag(const void *p, __u32 saddr, char * to, unsigned int offset, unsigned int fraglen) { struct udpfakehdr *ufh = (struct udpfakehdr *)p; - char *src, *dst; + const char *src; + char *dst; unsigned int len; - if (offset) + if (offset) { len = fraglen; src = ufh->from+(offset-sizeof(struct udphdr)); @@ -204,7 +249,7 @@ static void udp_getfrag(void *p, int saddr, char * to, unsigned int offset, unsi src = ufh->from; dst = to+sizeof(struct udphdr); } - ufh->wcheck = csum_partial_copyffs(src, dst, len, ufh->wcheck); + ufh->wcheck = csum_partial_copy_fromuser(src, dst, len, ufh->wcheck); if (offset == 0) { ufh->wcheck = csum_partial((char *)ufh, sizeof(struct udphdr), @@ -216,20 +261,23 @@ static void udp_getfrag(void *p, int saddr, char * to, unsigned int offset, unsi ufh->uh.check = -1; memcpy(to, ufh, sizeof(struct udphdr)); } + return 0; } /* - * Uncheckummed UDP is sufficiently criticial to stuff like ATM video conferencing + * Unchecksummed UDP is sufficiently critical to stuff like ATM video conferencing * that we use two routines for this for speed. 
Probably we ought to have a CONFIG_FAST_NET * set for >10Mb/second boards to activate this sort of coding. Timing needed to verify if * this is a valid decision. */ -static void udp_getfrag_nosum(void *p, int saddr, char * to, unsigned int offset, unsigned int fraglen) +static int udp_getfrag_nosum(const void *p, __u32 saddr, char * to, unsigned int offset, unsigned int fraglen) { struct udpfakehdr *ufh = (struct udpfakehdr *)p; - char *src, *dst; + const char *src; + char *dst; unsigned int len; + int err; if (offset) { @@ -243,22 +291,27 @@ static void udp_getfrag_nosum(void *p, int saddr, char * to, unsigned int offset src = ufh->from; dst = to+sizeof(struct udphdr); } - memcpy_fromfs(src,dst,len); + err = copy_from_user(dst,src,len); if (offset == 0) memcpy(to, ufh, sizeof(struct udphdr)); + return err; } /* * Send UDP frames. */ - + static int udp_send(struct sock *sk, struct sockaddr_in *sin, - unsigned char *from, int len, int rt) + const unsigned char *from, int len, int rt, + __u32 saddr, int noblock) { int ulen = len + sizeof(struct udphdr); int a; struct udpfakehdr ufh; + + if(ulen>65535-sizeof(struct iphdr)) + return -EMSGSIZE; ufh.uh.source = sk->dummy_th.source; ufh.uh.dest = sin->sin_port; @@ -268,26 +321,64 @@ static int udp_send(struct sock *sk, struct sockaddr_in *sin, ufh.other = (htons(ulen) << 16) + IPPROTO_UDP*256; ufh.from = from; ufh.wcheck = 0; + +#ifdef CONFIG_IP_TRANSPARENT_PROXY + if (rt&MSG_PROXY) + { + /* + * We map the first 8 bytes of a second sockaddr_in + * into the last 8 (unused) bytes of a sockaddr_in. + * This _is_ ugly, but it's the only way to do it + * easily, without adding system calls. 
+ */ + struct sockaddr_in *sinfrom = + (struct sockaddr_in *) sin->sin_zero; + + if (!suser()) + return(-EPERM); + if (sinfrom->sin_family && sinfrom->sin_family != AF_INET) + return(-EINVAL); + if (sinfrom->sin_port == 0) + return(-EINVAL); + saddr = sinfrom->sin_addr.s_addr; + ufh.uh.source = sinfrom->sin_port; + } +#endif + + /* RFC1122: OK. Provides the checksumming facility (MUST) as per */ + /* 4.1.3.4. It's configurable by the application via setsockopt() */ + /* (MAY) and it defaults to on (MUST). Almost makes up for the */ + /* violation above. -- MS */ + if(sk->no_check) a = ip_build_xmit(sk, udp_getfrag_nosum, &ufh, ulen, - sin->sin_addr.s_addr, rt, IPPROTO_UDP); + sin->sin_addr.s_addr, saddr, sk->opt, rt, IPPROTO_UDP, noblock); else a = ip_build_xmit(sk, udp_getfrag, &ufh, ulen, - sin->sin_addr.s_addr, rt, IPPROTO_UDP); - return(a<0 ? a : len); + sin->sin_addr.s_addr, saddr, sk->opt, rt, IPPROTO_UDP, noblock); + if(a<0) + return a; + udp_statistics.UdpOutDatagrams++; + return len; } -static int udp_sendto(struct sock *sk, unsigned char *from, int len, int noblock, +static int udp_sendto(struct sock *sk, const unsigned char *from, int len, int noblock, unsigned flags, struct sockaddr_in *usin, int addr_len) { struct sockaddr_in sin; int tmp; + __u32 saddr=0; /* * Check the flags. We support no flags for UDP sending */ + +#ifdef CONFIG_IP_TRANSPARENT_PROXY + if (flags&~(MSG_DONTROUTE|MSG_PROXY)) +#else if (flags&~MSG_DONTROUTE) +#endif return(-EINVAL); /* * Get and verify the address. @@ -297,19 +388,24 @@ static int udp_sendto(struct sock *sk, unsigned char *from, int len, int noblock { if (addr_len < sizeof(sin)) return(-EINVAL); - memcpy(&sin,usin,sizeof(sin)); - if (sin.sin_family && sin.sin_family != AF_INET) + if (usin->sin_family && usin->sin_family != AF_INET) return(-EINVAL); - if (sin.sin_port == 0) + if (usin->sin_port == 0) return(-EINVAL); } else { +#ifdef CONFIG_IP_TRANSPARENT_PROXY + /* We need to provide a sockaddr_in when using MSG_PROXY. 
*/ + if (flags&MSG_PROXY) + return(-EINVAL); +#endif if (sk->state != TCP_ESTABLISHED) return(-EINVAL); sin.sin_family = AF_INET; sin.sin_port = sk->dummy_th.dest; sin.sin_addr.s_addr = sk->daddr; + usin = &sin; } /* @@ -317,16 +413,26 @@ static int udp_sendto(struct sock *sk, unsigned char *from, int len, int noblock * broadcasting of data. */ - if(sin.sin_addr.s_addr==INADDR_ANY) - sin.sin_addr.s_addr=ip_my_addr(); + /* RFC1122: OK. Allows the application to select the specific */ + /* source address for an outgoing packet (MUST) as per 4.1.3.5. */ + /* Optional addition: a mechanism for telling the application what */ + /* address was used. (4.1.3.5, MAY) -- MS */ + + /* RFC1122: MUST ensure that all outgoing packets have one */ + /* of this host's addresses as a source addr.(4.1.3.6) - bind in */ + /* af_inet.c checks these. It does need work to allow BSD style */ + /* bind to multicast as is done by xntpd */ + + if(usin->sin_addr.s_addr==INADDR_ANY) + usin->sin_addr.s_addr=ip_my_addr(); - if(!sk->broadcast && ip_chk_addr(sin.sin_addr.s_addr)==IS_BROADCAST) + if(!sk->broadcast && ip_chk_addr(usin->sin_addr.s_addr)==IS_BROADCAST) return -EACCES; /* Must turn broadcast on first */ - sk->inuse = 1; + lock_sock(sk); /* Send the packet. */ - tmp = udp_send(sk, &sin, from, len, flags); + tmp = udp_send(sk, usin, from, len, flags, saddr, noblock); /* The datagram has been sent off. Release the socket. */ release_sock(sk); @@ -334,23 +440,51 @@ static int udp_sendto(struct sock *sk, unsigned char *from, int len, int noblock } /* - * In BSD SOCK_DGRAM a write is just like a send. 
+ * Temporary */ - -static int udp_write(struct sock *sk, unsigned char *buff, int len, int noblock, - unsigned flags) + +int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len, int noblock, + int flags) { - return(udp_sendto(sk, buff, len, noblock, flags, NULL, 0)); + if(msg->msg_iovlen==1) + return udp_sendto(sk,msg->msg_iov[0].iov_base,len, noblock, flags, msg->msg_name, msg->msg_namelen); + else + { + /* + * For awkward cases we linearise the buffer first. In theory this is only frames + * whose iovec's don't split on 4 byte boundaries, and soon encrypted stuff (to keep + * skip happy). We are a bit more general about it. + */ + + unsigned char *buf; + int fs; + int err; + if(len>65515) + return -EMSGSIZE; + buf=kmalloc(len, GFP_KERNEL); + if(buf==NULL) + return -ENOBUFS; + err = memcpy_fromiovec(buf, msg->msg_iov, len); + if (err) + err = -EFAULT; + if (!err) + { + fs=get_fs(); + set_fs(get_ds()); + err=udp_sendto(sk,buf,len, noblock, flags, msg->msg_name, msg->msg_namelen); + set_fs(fs); + } + kfree_s(buf,len); + return err; + } } - /* * IOCTL requests applicable to the UDP protocol */ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) { - int err; switch(cmd) { case TIOCOUTQ: @@ -358,13 +492,8 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) unsigned long amount; if (sk->state == TCP_LISTEN) return(-EINVAL); - amount = sk->prot->wspace(sk)/*/2*/; - err=verify_area(VERIFY_WRITE,(void *)arg, - sizeof(unsigned long)); - if(err) - return(err); - put_fs_long(amount,(unsigned long *)arg); - return(0); + amount = sock_wspace(sk); + return put_user(amount, (int *)arg); } case TIOCINQ: @@ -381,14 +510,9 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) * of this packet since that is all * that will be read. 
*/ - amount = skb->len; + amount = skb->len-sizeof(struct udphdr); } - err=verify_area(VERIFY_WRITE,(void *)arg, - sizeof(unsigned long)); - if(err) - return(err); - put_fs_long(amount,(unsigned long *)arg); - return(0); + return put_user(amount, (int *)arg); } default: @@ -399,18 +523,18 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) /* - * This should be easy, if there is something there we\ + * This should be easy, if there is something there we * return it, otherwise we block. */ -int udp_recvfrom(struct sock *sk, unsigned char *to, int len, - int noblock, unsigned flags, struct sockaddr_in *sin, - int *addr_len) +int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len, + int noblock, int flags,int *addr_len) { int copied = 0; int truesize; struct sk_buff *skb; int er; + struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name; /* * Check any passed addresses @@ -428,14 +552,22 @@ int udp_recvfrom(struct sock *sk, unsigned char *to, int len, if(skb==NULL) return er; - truesize = skb->len; - copied = min(len, truesize); + truesize = skb->len - sizeof(struct udphdr); + copied = truesize; + + if(len<truesize) + { + copied=len; + msg->msg_flags|=MSG_TRUNC; + } /* * FIXME : should use udp header size info value */ - skb_copy_datagram(skb,sizeof(struct udphdr),to,copied); + er = skb_copy_datagram_iovec(skb,sizeof(struct udphdr),msg->msg_iov,copied); + if (er) + return er; sk->stamp=skb->stamp; /* Copy the address. */ @@ -444,28 +576,48 @@ int udp_recvfrom(struct sock *sk, unsigned char *to, int len, sin->sin_family = AF_INET; sin->sin_port = skb->h.uh->source; sin->sin_addr.s_addr = skb->daddr; +#ifdef CONFIG_IP_TRANSPARENT_PROXY + if (flags&MSG_PROXY) + { + /* + * We map the first 8 bytes of a second sockaddr_in + * into the last 8 (unused) bytes of a sockaddr_in. + * This _is_ ugly, but it's the only way to do it + * easily, without adding system calls. 
+ */ + struct sockaddr_in *sinto = + (struct sockaddr_in *) sin->sin_zero; + + sinto->sin_family = AF_INET; + sinto->sin_port = skb->h.uh->dest; + sinto->sin_addr.s_addr = skb->saddr; + } +#endif } - skb_free_datagram(skb); - release_sock(sk); - return(truesize); + skb_free_datagram(sk, skb); + return(copied); } -/* - * Read has the same semantics as recv in SOCK_DGRAM - */ - -int udp_read(struct sock *sk, unsigned char *buff, int len, int noblock, - unsigned flags) -{ - return(udp_recvfrom(sk, buff, len, noblock, flags, NULL, NULL)); -} - - -int udp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len) +int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { + struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; struct rtable *rt; - unsigned long sa; + + /* + * 1003.1g - break association. + */ + + if (usin->sin_family==AF_UNSPEC) + { + sk->saddr=INADDR_ANY; + sk->rcv_saddr=INADDR_ANY; + sk->daddr=INADDR_ANY; + sk->state = TCP_CLOSE; + udp_cache_zap(); + return 0; + } + if (addr_len < sizeof(*usin)) return(-EINVAL); @@ -477,46 +629,106 @@ int udp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len) if(!sk->broadcast && ip_chk_addr(usin->sin_addr.s_addr)==IS_BROADCAST) return -EACCES; /* Must turn broadcast on first */ - rt=(sk->localroute?ip_rt_local:ip_rt_route)(usin->sin_addr.s_addr, NULL, &sa); - if(rt==NULL) + rt=ip_rt_route((__u32)usin->sin_addr.s_addr, sk->localroute); + if (rt==NULL) return -ENETUNREACH; - sk->saddr = sa; /* Update source address */ + if(!sk->saddr) + sk->saddr = rt->rt_src; /* Update source address */ + if(!sk->rcv_saddr) + sk->rcv_saddr = rt->rt_src; sk->daddr = usin->sin_addr.s_addr; sk->dummy_th.dest = usin->sin_port; sk->state = TCP_ESTABLISHED; udp_cache_zap(); sk->ip_route_cache = rt; - sk->ip_route_stamp = rt_stamp; return(0); } -static void udp_close(struct sock *sk, int timeout) +static void udp_close(struct sock *sk, unsigned long timeout) { - sk->inuse = 1; + lock_sock(sk); sk->state = 
TCP_CLOSE; if(uh_cache_sk==sk) udp_cache_zap(); - if (sk->dead) - destroy_sock(sk); - else - release_sock(sk); + release_sock(sk); + sk->dead = 1; + destroy_sock(sk); } +static inline int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) +{ + /* + * Charge it to the socket, dropping if the queue is full. + */ + + /* I assume this includes the IP options, as per RFC1122 (4.1.3.2). */ + /* If not, please let me know. -- MS */ + + if (__sock_queue_rcv_skb(sk,skb)<0) { + udp_statistics.UdpInErrors++; + ip_statistics.IpInDiscards++; + ip_statistics.IpInDelivers--; + skb->sk = NULL; + kfree_skb(skb, FREE_WRITE); + return 0; + } + udp_statistics.UdpInDatagrams++; + return 0; +} + + +static inline void udp_deliver(struct sock *sk, struct sk_buff *skb) +{ + skb->sk = sk; + + if (sk->users) { + __skb_queue_tail(&sk->back_log, skb); + return; + } + udp_queue_rcv_skb(sk, skb); +} + +#ifdef CONFIG_IP_TRANSPARENT_PROXY +/* + * Check whether a received UDP packet might be for one of our + * sockets. + */ + +int udp_chkaddr(struct sk_buff *skb) +{ + struct iphdr *iph = skb->h.iph; + struct udphdr *uh = (struct udphdr *)(skb->h.raw + iph->ihl*4); + struct sock *sk; + + sk = get_sock(&udp_prot, uh->dest, iph->saddr, uh->source, iph->daddr, 0, 0); + + if (!sk) return 0; + /* 0 means accept all LOCAL addresses here, not all the world... */ + if (sk->rcv_saddr == 0) return 0; + return 1; +} +#endif /* * All we need to do is get the socket, and then do a checksum. */ int udp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt, - unsigned long daddr, unsigned short len, - unsigned long saddr, int redo, struct inet_protocol *protocol) + __u32 daddr, unsigned short len, + __u32 saddr, int redo, struct inet_protocol *protocol) { struct sock *sk; struct udphdr *uh; unsigned short ulen; - int addr_type = IS_MYADDR; - + int addr_type; + + /* + * First time through the loop.. 
Do all the setup stuff + * (including finding out the socket we go to etc) + */ + + addr_type = IS_MYADDR; if(!dev || dev->pa_addr!=daddr) addr_type=ip_chk_addr(daddr); @@ -533,7 +745,7 @@ int udp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt, */ ulen = ntohs(uh->len); - + if (ulen > len || len < sizeof(*uh) || ulen < sizeof(*uh)) { NETDEBUG(printk("UDP: short packet: %d/%d\n", ulen, len)); @@ -542,10 +754,26 @@ int udp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt, return(0); } - if (uh->check && udp_check(uh, len, saddr, daddr)) + /* RFC1122 warning: According to 4.1.3.6, we MUST discard any */ + /* datagram which has an invalid source address, either here or */ + /* in IP. */ + /* Right now, IP isn't doing it, and neither is UDP. It's on the */ + /* FIXME list for IP, though, so I wouldn't worry about it. */ + /* (That's the Right Place to do it, IMHO.) -- MS */ + + if (uh->check && ( + ( (skb->ip_summed == CHECKSUM_HW) && udp_check(uh, len, saddr, daddr, skb->csum ) ) || + ( (skb->ip_summed == CHECKSUM_NONE) && udp_check(uh, len, saddr, daddr,csum_partial((char*)uh, len, 0))) + /* skip if CHECKSUM_UNNECESSARY */ + ) + ) { /* <mea@utu.fi> wants to know, who sent it, to go and stomp on the garbage sender... */ + + /* RFC1122: OK. Discards the bad packet silently (as far as */ + /* the network is concerned, anyway) as per 4.1.3.4 (MUST). */ + NETDEBUG(printk("UDP: bad checksum. From %08lX:%d to %08lX:%d ulen %d\n", ntohl(saddr),ntohs(uh->source), ntohl(daddr),ntohs(uh->dest), @@ -555,11 +783,20 @@ int udp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt, return(0); } + /* + * These are supposed to be switched. + */ + + skb->daddr = saddr; + skb->saddr = daddr; len=ulen; + skb->dev = dev; + skb_trim(skb,len); + #ifdef CONFIG_IP_MULTICAST - if (addr_type!=IS_MYADDR) + if (addr_type==IS_BROADCAST || addr_type==IS_MULTICAST) { /* * Multicasts and broadcasts go to each listener. 
@@ -579,7 +816,7 @@ int udp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt, else skb1=skb; if(skb1) - udp_deliver(sk, uh, skb1, dev,saddr,daddr,len); + udp_deliver(sk, skb1); sk=sknext; } while(sknext!=NULL); @@ -593,7 +830,7 @@ int udp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt, sk=(struct sock *)uh_cache_sk; else { - sk = get_sock(&udp_prot, uh->dest, saddr, uh->source, daddr); + sk = get_sock(&udp_prot, uh->dest, saddr, uh->source, daddr, dev->pa_addr, skb->redirport); uh_cache_saddr=saddr; uh_cache_daddr=daddr; uh_cache_dport=uh->dest; @@ -604,7 +841,7 @@ int udp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt, if (sk == NULL) { udp_statistics.UdpNoPorts++; - if (addr_type == IS_MYADDR) + if (addr_type != IS_BROADCAST && addr_type != IS_MULTICAST) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0, dev); } @@ -616,75 +853,32 @@ int udp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt, kfree_skb(skb, FREE_WRITE); return(0); } - return udp_deliver(sk,uh,skb,dev, saddr, daddr, len); -} - -static int udp_deliver(struct sock *sk, struct udphdr *uh, struct sk_buff *skb, struct device *dev, long saddr, long daddr, int len) -{ - skb->sk = sk; - skb->dev = dev; - skb->len = len; - - /* - * These are supposed to be switched. - */ - - skb->daddr = saddr; - skb->saddr = daddr; - - - /* - * Charge it to the socket, dropping if the queue is full. 
- */ - - skb->len = len - sizeof(*uh); - - if (sock_queue_rcv_skb(sk,skb)<0) - { - udp_statistics.UdpInErrors++; - ip_statistics.IpInDiscards++; - ip_statistics.IpInDelivers--; - skb->sk = NULL; - kfree_skb(skb, FREE_WRITE); - release_sock(sk); - return(0); - } - udp_statistics.UdpInDatagrams++; - release_sock(sk); - return(0); + udp_deliver(sk, skb); + return 0; } - struct proto udp_prot = { - sock_wmalloc, - sock_rmalloc, - sock_wfree, - sock_rfree, - sock_rspace, - sock_wspace, udp_close, - udp_read, - udp_write, - udp_sendto, - udp_recvfrom, - ip_build_header, udp_connect, NULL, - ip_queue_xmit, NULL, NULL, NULL, - udp_rcv, datagram_select, udp_ioctl, NULL, NULL, + NULL, ip_setsockopt, ip_getsockopt, + udp_sendmsg, + udp_recvmsg, + NULL, /* No special bind function */ + udp_queue_rcv_skb, 128, 0, "UDP", 0, 0, - {NULL,} + NULL }; diff --git a/net/ipv4/utils.c b/net/ipv4/utils.c index 21ce570f5..cbce01b68 100644 --- a/net/ipv4/utils.c +++ b/net/ipv4/utils.c @@ -21,7 +21,7 @@ * 2 of the License, or (at your option) any later version. */ -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <linux/types.h> #include <linux/kernel.h> @@ -62,7 +62,7 @@ char *in_ntoa(unsigned long in) * Convert an ASCII string to binary IP. */ -unsigned long in_aton(char *str) +unsigned long in_aton(const char *str) { unsigned long l; unsigned int val; diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile new file mode 100644 index 000000000..e11c5eee7 --- /dev/null +++ b/net/ipv6/Makefile @@ -0,0 +1,19 @@ +# +# Makefile for the Linux TCP/IP (INET6) layer. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). 
+# + + +O_TARGET := ipv6.o +O_OBJS := af_inet6.o ipv6_output.o ipv6_input.o addrconf.o sit.o \ + ipv6_route.o ipv6_sockglue.o ndisc.o udp.o raw.o \ + protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ + exthdrs.o sysctl_net_ipv6.o datagram.o + +MOD_LIST_NAME := IPV6_MODULES +M_OBJS := $(O_TARGET) + +include $(TOPDIR)/Rules.make diff --git a/net/ipv6/README b/net/ipv6/README new file mode 100644 index 000000000..ca82fe438 --- /dev/null +++ b/net/ipv6/README @@ -0,0 +1,8 @@ +To join in the work on Linux IPv6 send mail to: + + majordomo@nuclecu.unam.mx + +and in the body of the message include: + +subscribe netdev + diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c new file mode 100644 index 000000000..6b7ec0ad4 --- /dev/null +++ b/net/ipv6/addrconf.c @@ -0,0 +1,1423 @@ +/* + * IPv6 Address [auto]configuration + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +/* + * Changes: + * + * Janos Farkas : delete timer on ifdown + * <chexum@bankinf.banki.hu> + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> + +#include <linux/proc_fs.h> +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/ndisc.h> +#include <net/ipv6_route.h> +#include <net/addrconf.h> +#include <net/sit.h> + +#include <asm/uaccess.h> + +#define HASH_SIZE 16 +/* + * Configured unicast address list + */ +struct inet6_ifaddr *inet6_addr_lst[HASH_SIZE]; + +/* + * Hash list of configured multicast addresses + */ +struct ipv6_mc_list *inet6_mcast_lst[HASH_SIZE]; + +/* + * AF_INET6 device list + */ +struct inet6_dev *inet6_dev_lst; +int in6_ifnum = 0; + +atomic_t addr_list_lock = 0; + +void addrconf_verify(unsigned long); + +static struct timer_list addr_chk_timer = { + NULL, NULL, + 0, 0, addrconf_verify +}; + + +int DupAddrDetectTransmits = 1; + +/* + * /proc/sys switch for autoconf (enabled by default) + */ +int addrconf_sys_autoconf = 1; + +static void addrconf_dad_start(struct inet6_ifaddr *ifp); +static void addrconf_rs_timer(unsigned long data); + +int ipv6_addr_type(struct in6_addr *addr) +{ + u32 st; + + st = addr->s6_addr32[0]; + + /* + * UCast Provider Based Address + * 0x4/3 + */ + + if ((st & __constant_htonl(0xE0000000)) == + __constant_htonl(0x40000000)) + { + return IPV6_ADDR_UNICAST; + } + + if ((st & __constant_htonl(0xFF000000)) == + __constant_htonl(0xFF000000)) + { + int type = IPV6_ADDR_MULTICAST; + + switch((st >> 16) & 0x0f) + { + case 0x01: + type |= IPV6_ADDR_LOOPBACK; + break; + case 0x02: + type |= IPV6_ADDR_LINKLOCAL; + break; + case 0x05: + type |= IPV6_ADDR_SITELOCAL; + break; + } + return type; + } + + if ((st & __constant_htonl(0xFFC00000)) == + __constant_htonl(0xFE800000)) + { 
+ return (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST); + } + + if ((st & __constant_htonl(0xFFC00000)) == + __constant_htonl(0xFEC00000)) + { + return (IPV6_ADDR_SITELOCAL | IPV6_ADDR_UNICAST); + } + + if ((addr->s6_addr32[0] | addr->s6_addr32[1]) == 0) + { + if (addr->s6_addr32[2] == 0) + { + if (addr->in6_u.u6_addr32[3] == 0) + { + return IPV6_ADDR_ANY; + } + + if (addr->s6_addr32[3] == __constant_htonl(0x00000001)) + { + return (IPV6_ADDR_LOOPBACK | + IPV6_ADDR_UNICAST); + } + + return (IPV6_ADDR_COMPATv4 | IPV6_ADDR_UNICAST); + } + + if (addr->s6_addr32[2] == __constant_htonl(0x0000ffff)) + return IPV6_ADDR_MAPPED; + } + + return IPV6_ADDR_RESERVED; +} + +struct inet6_dev * ipv6_add_dev(struct device *dev) +{ + struct inet6_dev *dev6; + + /* + * called by netdev notifier from a syscall + */ + dev6 = (struct inet6_dev *) kmalloc(sizeof(struct inet6_dev), + GFP_ATOMIC); + + if (dev6 == NULL) + return NULL; + + memset(dev6, 0, sizeof(struct inet6_dev)); + dev6->dev = dev; + dev6->if_index = ++in6_ifnum; + + /* + * insert at head. 
+ */ + + dev6->next = inet6_dev_lst; + inet6_dev_lst = dev6; + + return dev6; +} + +struct inet6_dev * ipv6_dev_by_index(int index) +{ + struct inet6_dev *in6_dev; + + for (in6_dev = inet6_dev_lst; in6_dev; in6_dev = in6_dev->next) + { + if (in6_dev->if_index == index) + return in6_dev; + } + + return NULL; +} + +void addrconf_forwarding_on(void) +{ + struct inet6_dev *in6_dev; + struct in6_addr maddr; + + for (in6_dev = inet6_dev_lst; in6_dev; in6_dev = in6_dev->next) + { + printk(KERN_DEBUG "dev %s\n", in6_dev->dev->name); + + if (in6_dev->dev->type == ARPHRD_ETHER) + { + printk(KERN_DEBUG "joining all-routers\n"); + in6_dev->router = 1; + ipv6_addr_all_routers(&maddr); + ipv6_dev_mc_inc(in6_dev->dev, &maddr); + } + } + + if (last_resort_rt && (last_resort_rt->rt_flags & RTI_ALLONLINK)) + { + rt_release(last_resort_rt); + last_resort_rt = NULL; + } +} + +struct inet6_dev * ipv6_get_idev(struct device *dev) +{ + struct inet6_dev *in6_dev; + + for (in6_dev = inet6_dev_lst; in6_dev; in6_dev = in6_dev->next) + { + if (in6_dev->dev == dev) + { + return in6_dev; + } + } + return NULL; +} + +struct inet6_ifaddr * ipv6_add_addr(struct inet6_dev *idev, + struct in6_addr *addr, int scope) +{ + struct inet6_ifaddr * ifaddr; + int hash; + unsigned long flags; + + save_flags(flags); + cli(); + + ifaddr = (struct inet6_ifaddr *) kmalloc(sizeof(struct inet6_ifaddr), + GFP_ATOMIC); + + if (ifaddr == NULL) + { + printk(KERN_DEBUG "ipv6_add_addr: malloc failed\n"); + restore_flags(flags); + return NULL; + } + + memset(ifaddr, 0, sizeof(struct inet6_ifaddr)); + memcpy(&ifaddr->addr, addr, sizeof(struct in6_addr)); + + ifaddr->scope = scope; + ifaddr->idev = idev; + + + /* add to list */ + + hash = ipv6_addr_hash(addr); + + ifaddr->lst_next = inet6_addr_lst[hash]; + inet6_addr_lst[hash] = ifaddr; + + + /* add to inet6_dev unicast addr list */ + ifaddr->if_next = idev->addr_list; + idev->addr_list = ifaddr; + + restore_flags(flags); + return ifaddr; + +} + +void ipv6_del_addr(struct 
inet6_ifaddr *ifp) +{ + struct inet6_ifaddr *iter, **back; + int hash; + + if (addr_list_lock) + { + ifp->flags |= ADDR_INVALID; + return; + } + + hash = ipv6_addr_hash(&ifp->addr); + + iter = inet6_addr_lst[hash]; + back = &inet6_addr_lst[hash]; + + for (; iter; iter = iter->lst_next) + { + if (iter == ifp) + { + *back = ifp->lst_next; + ifp->lst_next = NULL; + break; + } + back = &(iter->lst_next); + } + + iter = ifp->idev->addr_list; + back = &ifp->idev->addr_list; + + for (; iter; iter = iter->if_next) + { + if (iter == ifp) + { + *back = ifp->if_next; + ifp->if_next = NULL; + break; + } + back = &(iter->if_next); + } + + kfree(ifp); +} + +/* + * Choose an apropriate source address + * should do: + * i) get an address with an apropriate scope + * ii) see if there is a specific route for the destination and use + * an address of the attached interface + * iii) don't use deprecated addresses + * + * at the moment i believe only iii) is missing. + */ +struct inet6_ifaddr * ipv6_get_saddr(struct rt6_info *rt, struct in6_addr *daddr) +{ + int scope; + struct inet6_ifaddr * ifp = NULL; + struct inet6_dev * i6dev; + struct inet6_ifaddr * match = NULL; + struct device *dev = NULL; + int i; + + if (rt) + { + dev = rt->rt_dev; + } + + atomic_inc(&addr_list_lock); + + scope = ipv6_addr_type(daddr); + + scope &= IPV6_ADDR_SCOPE_MASK; + + if (rt && (rt->rt_flags & RTI_ALLONLINK)) + { + /* + * route for the "all destinations on link" rule + * when no routers are present + */ + scope = IFA_LINK; + } + + /* + * known dev + * search dev and walk through dev addresses + */ + + if (dev) + { + if (dev->flags & IFF_LOOPBACK) + { + scope = IFA_HOST; + } + + for (i6dev = inet6_dev_lst; i6dev; i6dev=i6dev->next) + { + if (i6dev->dev == dev) + { + for (ifp=i6dev->addr_list; ifp; + ifp=ifp->if_next) + { + if (ifp->scope == scope) + { + if (!(ifp->flags & ADDR_STATUS)) + { + goto out; + } + if (!(ifp->flags & ADDR_INVALID)) + { + match = ifp; + } + } + } + break; + } + } + } + + if 
(scope == IFA_LINK) + { + goto out; + } + + /* + * dev == NULL or search failed for specified dev + */ + + for (i=0; i < HASH_SIZE; i++) + { + for (ifp=inet6_addr_lst[i]; ifp; ifp=ifp->lst_next) + { + if (ifp->scope == scope) + { + if (!(ifp->flags & ADDR_STATUS)) + { + goto out; + } + if (!(ifp->flags & ADDR_INVALID)) + { + match = ifp; + } + } + } + } + + out: + if (ifp == NULL && match) + { + ifp = match; + } + atomic_dec(&addr_list_lock); + return ifp; +} + +struct inet6_ifaddr * ipv6_get_lladdr(struct device *dev) +{ + struct inet6_ifaddr *ifp; + struct inet6_dev *i6dev; + + for (i6dev = inet6_dev_lst; i6dev; i6dev=i6dev->next) + { + if (i6dev->dev == dev) + { + for (ifp=i6dev->addr_list; ifp; ifp=ifp->if_next) + { + if (ifp->scope == IFA_LINK) + return ifp; + } + break; + } + } + return NULL; +} + +/* + * Retrieve the ifaddr struct from an v6 address + * Called from ipv6_rcv to check if the address belongs + * to the host. + */ + +struct inet6_ifaddr * ipv6_chk_addr(struct in6_addr *addr) +{ + struct inet6_ifaddr * ifp; + u8 hash; + + atomic_inc(&addr_list_lock); + + hash = ipv6_addr_hash(addr); + + for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) + { + if (ipv6_addr_cmp(&ifp->addr, addr) == 0) + { + break; + } + } + + atomic_dec(&addr_list_lock); + return ifp; +} + +static void sit_route_add(struct inet6_dev *idev) +{ + struct in6_rtmsg rtmsg; + struct device *dev = idev->dev; + int err; + + rtmsg.rtmsg_type = RTMSG_NEWROUTE; + + memset(&rtmsg.rtmsg_dst, 0, sizeof(struct in6_addr)); + memset(&rtmsg.rtmsg_gateway, 0, sizeof(struct in6_addr)); + + if (dev->pa_dstaddr == 0) + { + /* prefix length - 96 bytes "::d.d.d.d" */ + rtmsg.rtmsg_prefixlen = 96; + rtmsg.rtmsg_metric = 1; + rtmsg.rtmsg_flags = RTF_NEXTHOP|RTF_UP; + } + else + { + rtmsg.rtmsg_prefixlen = 10; + rtmsg.rtmsg_dst.s6_addr32[0] = __constant_htonl(0xfe800000); + rtmsg.rtmsg_dst.s6_addr32[3] = dev->pa_dstaddr; + rtmsg.rtmsg_metric = 1; + rtmsg.rtmsg_flags = RTF_NEXTHOP|RTF_UP; + } + + 
rtmsg.rtmsg_ifindex = idev->if_index; + + err = ipv6_route_add(&rtmsg); + + if (err) + { + printk(KERN_DEBUG "sit_route_add: error in route_add\n"); + } +} + +static void init_loopback(struct device *dev) +{ + struct in6_addr addr; + struct inet6_dev *idev; + struct inet6_ifaddr * ifp; + struct in6_rtmsg rtmsg; + int err; + + /* ::1 */ + + memset(&addr, 0, sizeof(struct in6_addr)); + addr.s6_addr[15] = 1; + + idev = ipv6_add_dev(dev); + + if (idev == NULL) + { + printk(KERN_DEBUG "init loopback: add_dev failed\n"); + return; + } + + ifp = ipv6_add_addr(idev, &addr, IFA_HOST); + + if (ifp == NULL) + { + printk(KERN_DEBUG "init_loopback: add_addr failed\n"); + return; + } + + ifp->flags |= ADDR_PERMANENT; + + memcpy(&rtmsg.rtmsg_dst, &addr, sizeof(struct in6_addr)); + memset(&rtmsg.rtmsg_gateway, 0, sizeof(struct in6_addr)); + + rtmsg.rtmsg_prefixlen = 128; + rtmsg.rtmsg_metric = 1; + rtmsg.rtmsg_ifindex = idev->if_index; + + rtmsg.rtmsg_flags = RTF_NEXTHOP|RTF_HOST|RTF_UP; + + err = ipv6_route_add(&rtmsg); + + if (err) + { + printk(KERN_DEBUG "init_loopback: error in route_add\n"); + } + + /* add route for ::127.0.0.1 */ +} + +static void addrconf_eth_config(struct device *dev) +{ + struct in6_addr addr; + struct in6_addr maddr; + struct inet6_ifaddr * ifp; + struct inet6_dev * idev; + + memset(&addr, 0, sizeof(struct in6_addr)); + + /* generate link local address*/ + addr.s6_addr[0] = 0xFE; + addr.s6_addr[1] = 0x80; + + memcpy(addr.s6_addr + (sizeof(struct in6_addr) - dev->addr_len), + dev->dev_addr, dev->addr_len); + + idev = ipv6_add_dev(dev); + + if (idev == NULL) + return; + + ifp = ipv6_add_addr(idev, &addr, IFA_LINK); + + if (ifp == NULL) + return; + + ifp->flags |= (DAD_INCOMPLETE | ADDR_PERMANENT); + ifp->prefix_len = 10; + + /* join to all nodes multicast group */ + ipv6_addr_all_nodes(&maddr); + ipv6_dev_mc_inc(dev, &maddr); + + if (ipv6_forwarding) + { + idev->router = 1; + ipv6_addr_all_routers(&maddr); + ipv6_dev_mc_inc(dev, &maddr); + } + + /* join to 
solicited addr multicast group */ + addrconf_addr_solict_mult(&addr, &maddr); + ipv6_dev_mc_inc(dev, &maddr); + + /* start dad */ + addrconf_dad_start(ifp); +} + +void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len) +{ + struct prefix_info *pinfo; + struct rt6_info *rt; + __u32 valid_lft; + __u32 prefered_lft; + int addr_type; + unsigned long rt_expires; + + pinfo = (struct prefix_info *) opt; + + if (len < sizeof(struct prefix_info)) + { + printk(KERN_DEBUG "addrconf: prefix option too short\n"); + return; + } + + /* + * Validation checks ([ADDRCONF], page 19) + */ + + addr_type = ipv6_addr_type(&pinfo->prefix); + + if (addr_type & IPV6_ADDR_LINKLOCAL) + { + return; + } + + valid_lft = ntohl(pinfo->valid); + prefered_lft = ntohl(pinfo->prefered); + + if (prefered_lft > valid_lft) + { + printk(KERN_WARNING + "addrconf: prefix option has invalid lifetime\n"); + return; + } + + /* + * If we where using an "all destinations on link" route + * delete it + */ + + if (last_resort_rt && (last_resort_rt->rt_flags & RTI_ALLONLINK)) + { + rt_release(last_resort_rt); + last_resort_rt = NULL; + } + + /* + * Two things going on here: + * 1) Add routes for on-link prefixes + * 2) Configure prefixes with the auto flag set + */ + + rt_expires = jiffies + valid_lft * HZ; + if (rt_expires < jiffies) + { + rt_expires = ~0; + } + + rt = fibv6_lookup(&pinfo->prefix, dev, RTI_DYNAMIC|RTI_GATEWAY); + + if (rt) + { + if (pinfo->onlink == 0 || valid_lft == 0) + { + /* + * delete route + */ + fib6_del_rt(rt); + rt = NULL; + } + else + { + rt->rt_expires = rt_expires; + } + } + else if (pinfo->onlink && valid_lft) + { + struct in6_rtmsg rtmsg; + struct inet6_dev *idev; + + printk(KERN_DEBUG "adding on link route\n"); + ipv6_addr_copy(&rtmsg.rtmsg_dst, &pinfo->prefix); + memset(&rtmsg.rtmsg_gateway, 0, sizeof(struct in6_addr)); + + rtmsg.rtmsg_prefixlen = pinfo->prefix_len; + rtmsg.rtmsg_metric = 1; + + if ((idev = ipv6_get_idev(dev))) + { + rtmsg.rtmsg_ifindex = idev->if_index; + } 
+ rtmsg.rtmsg_flags = RTF_UP | RTF_ADDRCONF; + rtmsg.rtmsg_info = rt_expires; + + ipv6_route_add(&rtmsg); + } + + if (pinfo->autoconf && addrconf_sys_autoconf) + { + struct inet6_ifaddr * ifp; + struct in6_addr addr; + int plen; + + plen = pinfo->prefix_len >> 3; + + if (plen + dev->addr_len == sizeof(struct in6_addr)) + { + memcpy(&addr, &pinfo->prefix, plen); + memcpy(addr.s6_addr + plen, dev->dev_addr, + dev->addr_len); + } + else + { + printk(KERN_DEBUG + "addrconf: prefix_len invalid\n"); + return; + } + + ifp = ipv6_chk_addr(&addr); + + if (ifp == NULL && valid_lft) + { + /* create */ + + struct inet6_dev *in6_dev; + + in6_dev = ipv6_get_idev(dev); + + if (in6_dev == NULL) + { + printk(KERN_DEBUG + "addrconf: device not configured\n"); + } + + ifp = ipv6_add_addr(in6_dev, &addr, + addr_type & IPV6_ADDR_SCOPE_MASK); + + if (dev->flags & IFF_MULTICAST) + { + struct in6_addr maddr; + + /* join to solicited addr multicast group */ + addrconf_addr_solict_mult(&addr, &maddr); + ipv6_dev_mc_inc(dev, &maddr); + } + + ifp->flags |= DAD_INCOMPLETE; + ifp->prefix_len = pinfo->prefix_len; + + addrconf_dad_start(ifp); + + } + + if (ifp && valid_lft == 0) + { + ipv6_del_addr(ifp); + ifp = NULL; + } + + if (ifp) + { + ifp->valid_lft = valid_lft; + ifp->prefered_lft = prefered_lft; + ifp->tstamp = jiffies; + } + } + +} + +static int addrconf_ifdown(struct device *dev) +{ + struct inet6_dev *idev, **bidev; + struct inet6_ifaddr *ifa, **bifa; + int i; + + start_bh_atomic(); + + bidev = &inet6_dev_lst; + + for (idev = inet6_dev_lst; idev; idev = idev->next) + { + if (idev->dev == dev) + { + *bidev = idev->next; + break; + } + bidev = &idev; + } + + if (idev == NULL) + { + printk(KERN_DEBUG "addrconf_ifdown: device not found\n"); + end_bh_atomic(); + return -ENODEV; + } + + /* + * FIXME: clear multicast group membership + */ + + /* + * clean addr_list + */ + + for (i=0; i<16; i++) + { + bifa = &inet6_addr_lst[i]; + + for (ifa=inet6_addr_lst[i]; ifa; ) + { + if (ifa->idev == 
idev) + { + *bifa = ifa->lst_next; + del_timer(&ifa->timer); + kfree(ifa); + ifa = *bifa; + continue; + } + bifa = &ifa; + ifa = ifa->lst_next; + } + } + + kfree(idev); + end_bh_atomic(); + return 0; +} + +/* + * Set destination address. + * Special case for SIT interfaces where we create a new "virtual" + * device. + */ +int addrconf_set_dstaddr(void *arg) +{ + struct in6_ifreq ireq; + struct device *dev; + int err; + + err = copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)); + + if (err) + return -EFAULT; + + dev = dev_get(ireq.devname); + + if (dev->type == ARPHRD_SIT) + { + struct device *dev; + + if (!(ipv6_addr_type(&ireq.addr) & IPV6_ADDR_COMPATv4)) + { + return -EADDRNOTAVAIL; + } + + dev = sit_add_tunnel(ireq.addr.s6_addr32[3]); + + if (dev == NULL) + return -ENODEV; + + return 0; + } + + return -EINVAL; +} + +/* + * Obtain if_index from device name + */ +int addrconf_get_ifindex(void *arg) +{ + struct ifreq ifr; + int res = -ENODEV; + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + { + res = -EFAULT; + } + else + { + struct inet6_dev *idev; + + for (idev = inet6_dev_lst; idev; idev=idev->next) + { + if (!strncmp(ifr.ifr_name, idev->dev->name, IFNAMSIZ)) + { + res = 0; + ifr.ifr_ifindex = idev->if_index; + if (copy_to_user(arg, &ifr, sizeof(ifr))) + { + res = -EFAULT; + } + break; + } + } + } + + return res; +} + +/* + * Manual configuration of address on an interface + */ +int addrconf_add_ifaddr(void *arg) +{ + struct inet6_dev *in6_dev; + struct in6_ifreq ireq; + struct inet6_ifaddr *ifp; + struct device *dev; + int addr_type; + int err; + + if (!suser()) + return -EPERM; + + err = copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)); + if (err) + return -EFAULT; + + dev = dev_get(ireq.devname); + + if (dev == NULL) + return -EINVAL; + + in6_dev = ipv6_get_idev(dev); + + if (in6_dev == NULL) + return -EINVAL; + + addr_type = ipv6_addr_type(&ireq.addr); + addr_type &= IPV6_ADDR_SCOPE_MASK; + + ifp = ipv6_add_addr(in6_dev, &ireq.addr, 
addr_type); + + if (ifp == NULL) + return -ENOMEM; + + ifp->prefix_len = 128; + + if (dev->flags & IFF_MULTICAST) + { + struct in6_addr maddr; + + /* join to solicited addr multicast group */ + addrconf_addr_solict_mult(&ireq.addr, &maddr); + ipv6_dev_mc_inc(dev, &maddr); + } + + + ifp->prefix_len = ireq.prefix_len; + ifp->flags |= ADDR_PERMANENT; + + if (!(dev->flags & (IFF_NOARP|IFF_LOOPBACK))) + { + ifp->flags |= DAD_INCOMPLETE; + addrconf_dad_start(ifp); + } + return 0; +} + +static void sit_add_v4_addrs(struct inet6_dev *idev) +{ + struct inet6_ifaddr * ifp; + struct in6_addr addr; + struct device *dev; + int scope; + + memset(&addr, 0, sizeof(struct in6_addr)); + + if (idev->dev->pa_dstaddr) + { + addr.s6_addr32[0] = __constant_htonl(0xfe800000); + scope = IFA_LINK; + } + else + { + scope = IPV6_ADDR_COMPATv4; + } + + for (dev = dev_base; dev != NULL; dev = dev->next) + { + if (dev->family == AF_INET && (dev->flags & IFF_UP)) + { + int flag = scope; + + addr.s6_addr32[3] = dev->pa_addr; + + if (dev->flags & IFF_LOOPBACK) + { + if (idev->dev->pa_dstaddr) + continue; + + flag |= IFA_HOST; + } + + ifp = ipv6_add_addr(idev, &addr, flag); + + if (ifp == NULL) + continue; + + ifp->flags |= ADDR_PERMANENT; + } + } +} + +int addrconf_notify(struct notifier_block *this, unsigned long event, + void * data) +{ + struct device *dev; + struct inet6_dev * idev; + + dev = (struct device *) data; + + switch(event) { + case NETDEV_UP: + switch(dev->type) { + case ARPHRD_SIT: + + printk(KERN_DEBUG "sit device up: %s\n", dev->name); + + /* + * Configure the tunnel with one of our IPv4 + * addresses... we should configure all of + * our v4 addrs in the tunnel + */ + + idev = ipv6_add_dev(dev); + + sit_add_v4_addrs(idev); + + /* + * we do an hack for now to configure the tunnel + * route. 
+ */ + + sit_route_add(idev); + break; + + case ARPHRD_LOOPBACK: + init_loopback(dev); + break; + + case ARPHRD_ETHER: + + printk(KERN_DEBUG "Configuring eth interface\n"); + addrconf_eth_config(dev); + break; + } + rt6_sndmsg(RTMSG_NEWDEVICE, NULL, NULL, 0, dev, 0, 0); + break; + + case NETDEV_DOWN: + /* + * Remove all addresses from this interface + * and take the interface out of the list. + */ + if (addrconf_ifdown(dev) == 0) + { + rt6_ifdown(dev); + rt6_sndmsg(RTMSG_DELDEVICE, NULL, NULL, 0, dev, 0, 0); + } + + break; + } + + return NOTIFY_OK; +} + +static void addrconf_dad_completed(struct inet6_ifaddr *ifp) +{ + struct in6_rtmsg rtmsg; + struct device *dev; + int err; + + + if (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) + { + struct in6_addr all_routers; + + /* + * 1) configure a link route for this interface + * 2) send a (delayed) router solicitation + */ + + memcpy(&rtmsg.rtmsg_dst, &ifp->addr, sizeof(struct in6_addr)); + memset(&rtmsg.rtmsg_gateway, 0, sizeof(struct in6_addr)); + + dev = ifp->idev->dev; + + rtmsg.rtmsg_prefixlen = ifp->prefix_len; + rtmsg.rtmsg_metric = 1; + rtmsg.rtmsg_ifindex = ifp->idev->if_index; + + rtmsg.rtmsg_flags = RTF_UP; + + err = ipv6_route_add(&rtmsg); + + if (err) + { + printk(KERN_DEBUG "dad_complete: error in route_add\n"); + } + + if (ipv6_forwarding == 0) + { + ipv6_addr_set(&all_routers, + __constant_htonl(0xff020000U), 0, 0, + __constant_htonl(0x2U)); + + /* + * If a host as already performed a random delay + * [...] as part of DAD [...] 
there is no need + * to delay again before sending the first RS + */ + ndisc_send_rs(ifp->idev->dev, &ifp->addr, + &all_routers); + + ifp->probes = 1; + ifp->timer.function = addrconf_rs_timer; + ifp->timer.expires = (jiffies + + RTR_SOLICITATION_INTERVAL); + ifp->idev->if_flags |= IF_RS_SENT; + add_timer(&ifp->timer); + } + } + +} + +static void addrconf_dad_timer(unsigned long data) +{ + struct inet6_ifaddr *ifp; + struct in6_addr unspec; + struct in6_addr mcaddr; + + ifp = (struct inet6_ifaddr *) data; + + if (ifp->probes-- == 0) + { + /* + * DAD was successful + */ + + ifp->flags &= ~DAD_INCOMPLETE; + addrconf_dad_completed(ifp); + return; + } + + /* send a neighbour solicitation for our addr */ + memset(&unspec, 0, sizeof(unspec)); + addrconf_addr_solict_mult(&ifp->addr, &mcaddr); + + ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &unspec); + + ifp->timer.expires = jiffies + RETRANS_TIMER; + add_timer(&ifp->timer); +} + +static void addrconf_rs_timer(unsigned long data) +{ + struct inet6_ifaddr *ifp; + + ifp = (struct inet6_ifaddr *) data; + + if (ipv6_forwarding) + return; + + if (ifp->idev->if_flags & IF_RA_RCVD) + { + /* + * Announcement received after solicitation + * was sent + */ + return; + } + + if (ifp->probes++ <= MAX_RTR_SOLICITATIONS) + { + struct in6_addr all_routers; + + ipv6_addr_set(&all_routers, + __constant_htonl(0xff020000U), 0, 0, + __constant_htonl(0x2U)); + + ndisc_send_rs(ifp->idev->dev, &ifp->addr, + &all_routers); + + + ifp->timer.function = addrconf_rs_timer; + ifp->timer.expires = jiffies + RTR_SOLICITATION_INTERVAL; + add_timer(&ifp->timer); + } + else + { + printk(KERN_DEBUG "%s: no IPv6 routers present\n", + ifp->idev->dev->name); + + if (!default_rt_list && !last_resort_rt) + { + struct rt6_info *rt; + + /* + * create a last resort route with all + * destinations on link + */ + rt = kmalloc(sizeof(struct rt6_info), GFP_ATOMIC); + + if (rt) + { + memset(rt, 0, sizeof(struct rt6_info)); + rt->rt_dev = ifp->idev->dev; + 
rt->rt_ref = 1; + rt->rt_flags = (RTI_ALLONLINK | RTF_UP); + last_resort_rt = rt; + } + } + } +} + +static void addrconf_dad_start(struct inet6_ifaddr *ifp) +{ + static int rand_seed = 1; + int rand_num; + + if (rand_seed) + { + rand_seed = 0; + nd_rand_seed = ifp->addr.s6_addr32[3]; + } + + init_timer(&ifp->timer); + ifp->probes = DupAddrDetectTransmits; + + rand_num = ipv6_random() % MAX_RTR_SOLICITATION_DELAY; + + ifp->timer.function = addrconf_dad_timer; + ifp->timer.data = (unsigned long) ifp; + ifp->timer.expires = jiffies + rand_num; + + add_timer(&ifp->timer); +} + +static int iface_proc_info(char *buffer, char **start, off_t offset, + int length, int dummy) +{ + struct inet6_ifaddr *ifp; + int i; + int len = 0; + + for (i=0; i < HASH_SIZE; i++) + for (ifp=inet6_addr_lst[i]; ifp; ifp=ifp->lst_next) + { + int j; + + for (j=0; j<16; j++) + { + sprintf(buffer + len, "%02x", + ifp->addr.s6_addr[j]); + len += 2; + } + + len += sprintf(buffer + len, + " %02x %02x %02x %02x %8s\n", + ifp->idev->if_index, + ifp->prefix_len, + ifp->scope, + ifp->flags, + ifp->idev->dev->name); + } + + *start = buffer + offset; + + len -= offset; + + if (len > length) + len = length; + return len; +} + +struct proc_dir_entry iface_proc_entry = +{ + 0, 8, "if_inet6", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, NULL, + &iface_proc_info +}; + + +/* + * Periodic address status verification + */ + +void addrconf_verify(unsigned long foo) +{ + struct inet6_ifaddr *ifp; + unsigned long now = jiffies; + int i; + + for (i=0; i < HASH_SIZE; i++) + { + for (ifp=inet6_addr_lst[i]; ifp;) + { + if (!(ifp->flags & ADDR_PERMANENT)) + { + struct inet6_ifaddr *bp; + unsigned long age; + + age = (now - ifp->tstamp) / HZ; + + if (age > ifp->prefered_lft) + { + ifp->flags |= ADDR_DEPRECATED; + } + + bp = ifp; + ifp=ifp->lst_next; + + if (age > bp->valid_lft) + { + ipv6_del_addr(bp); + } + continue; + } + ifp=ifp->lst_next; + } + } + + addr_chk_timer.expires = jiffies + ADDR_CHECK_FREQUENCY; + 
add_timer(&addr_chk_timer); +} + +void addrconf_init() +{ + struct device *dev; + + /* init addr hash list */ + memset(inet6_addr_lst, 0, 16 * sizeof(struct inet6_ifaddr *)); + + memset(inet6_mcast_lst, 0, 16 * sizeof(struct ipv6_mc_list *)); + + inet6_dev_lst = NULL; + + /* + * Init loopback device + */ + + dev = dev_get("lo"); + + if (dev && (dev->flags & IFF_UP)) + init_loopback(dev); + + /* + * and maybe: + * search availiable AF_INET devs and try to configure them + */ + + dev = dev_get("eth0"); + + if (dev && (dev->flags & IFF_UP)) + addrconf_eth_config(dev); + + proc_register_dynamic(&proc_net, &iface_proc_entry); + + addr_chk_timer.expires = jiffies + ADDR_CHECK_FREQUENCY; + add_timer(&addr_chk_timer); +} + +void addrconf_cleanup(void) +{ + struct inet6_dev *idev, *bidev; + struct inet6_ifaddr *ifa, *bifa; + int i; + + del_timer(&addr_chk_timer); + + /* + * clean dev list. + */ + + for (idev = inet6_dev_lst; idev; ) + { + bidev = idev; + idev = idev->next; + kfree(bidev); + } + + /* + * clean addr_list + */ + + for (i=0; i<16; i++) + { + for (ifa=inet6_addr_lst[i]; ifa; ) + { + bifa = ifa; + ifa = ifa->lst_next; + kfree(bifa); + } + } + + proc_unregister(&proc_net, iface_proc_entry.low_ino); +} + +/* + * Local variables: + * c-file-style: "Linux" + * End: + */ diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c new file mode 100644 index 000000000..2609e5294 --- /dev/null +++ b/net/ipv6/af_inet6.c @@ -0,0 +1,872 @@ +/* + * AF_INET6 socket family + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * Adapted from linux/net/ipv4/af_inet.c + * + * $Id: af_inet6.c,v 1.13 1996/10/31 19:47:17 roque Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + + +#include <linux/module.h> +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> + +#include <asm/uaccess.h> +#include <asm/system.h> + +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/arp.h> +#include <net/rarp.h> +#include <net/route.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/raw.h> +#include <net/icmp.h> +#include <linux/icmpv6.h> +#include <net/inet_common.h> +#include <net/transp_v6.h> +#include <net/ndisc.h> +#include <net/ipv6_route.h> +#include <net/sit.h> +#include <linux/ip_fw.h> +#include <net/addrconf.h> + +/* + * Default callbacks for user INET sockets. These just wake up + * the user owning the socket. 
+ */ + +static void def_callback1(struct sock *sk) +{ + if(!sk->dead) + wake_up_interruptible(sk->sleep); +} + +static void def_callback2(struct sock *sk,int len) +{ + if(!sk->dead) + { + wake_up_interruptible(sk->sleep); + sock_wake_async(sk->socket, 1); + } +} + +static void def_callback3(struct sock *sk) +{ + long wmem; + + wmem = (long) sk->wmem_alloc; + + if (wmem < 0) { + printk(KERN_DEBUG "bug wmem_alloc < 0\n"); + sk->wmem_alloc = 0; + } + + if(!sk->dead && sk->wmem_alloc*2 <= sk->sndbuf) + { + wake_up_interruptible(sk->sleep); + sock_wake_async(sk->socket, 2); + } +} + +struct sock * rawv6_sock_array[SOCK_ARRAY_SIZE]; + +static int inet6_create(struct socket *sock, int protocol) +{ + struct sock *sk; + struct proto *prot; + int err; + + sk = sk_alloc(GFP_KERNEL); + if (sk == NULL) + return(-ENOBUFS); + + /* Efficient way to set most fields to zero */ + memset(sk,0,sizeof(*sk)); + + /* + * Note for tcp that also wiped the dummy_th block for us. + */ + + switch(sock->type) + { + case SOCK_STREAM: + case SOCK_SEQPACKET: + if (protocol && protocol != IPPROTO_TCP) + { + kfree_s((void *)sk, sizeof(*sk)); + return(-EPROTONOSUPPORT); + } + protocol = IPPROTO_TCP; + sk->no_check = TCP_NO_CHECK; + prot = &tcpv6_prot; + break; + + case SOCK_DGRAM: + if (protocol && protocol != IPPROTO_UDP) + { + kfree_s((void *)sk, sizeof(*sk)); + return(-EPROTONOSUPPORT); + } + protocol = IPPROTO_UDP; + sk->no_check = UDP_NO_CHECK; + prot=&udpv6_prot; + break; + + case SOCK_RAW: + if (!suser()) + { + kfree_s((void *)sk, sizeof(*sk)); + return(-EPERM); + } + if (!protocol) + { + kfree_s((void *)sk, sizeof(*sk)); + return(-EPROTONOSUPPORT); + } + prot = &rawv6_prot; + sk->reuse = 1; + sk->num = protocol; + break; + default: + kfree_s((void *)sk, sizeof(*sk)); + return(-ESOCKTNOSUPPORT); + } + + sk->socket = sock; + + sk->family = AF_INET6; + sk->type = sock->type; + sk->protocol = protocol; + sk->allocation = GFP_KERNEL; + sk->sndbuf = SK_WMEM_MAX; + sk->rcvbuf = SK_RMEM_MAX; + 
sk->priority = 1; + + sk->prot = prot; + sk->backlog_rcv = prot->backlog_rcv; + + sk->sleep = sock->wait; + sock->data =(void *) sk; + + sk->state = TCP_CLOSE; + + skb_queue_head_init(&sk->write_queue); + skb_queue_head_init(&sk->receive_queue); + skb_queue_head_init(&sk->back_log); + + sk->timer.data = (unsigned long)sk; + sk->timer.function = &net_timer; + init_timer(&sk->timer); + + sk->state_change = def_callback1; + sk->data_ready = def_callback2; + sk->write_space = def_callback3; + sk->error_report = def_callback1; + + sk->net_pinfo.af_inet6.hop_limit = ipv6_hop_limit; + sk->net_pinfo.af_inet6.mcast_hops = IPV6_DEFAULT_MCASTHOPS; + sk->net_pinfo.af_inet6.mc_loop = 1; + + /* + * init the ipv4 part of the socket since + * we can have sockets using v6 API for ipv4 + */ + + sk->ip_ttl=64; + +#ifdef CONFIG_IP_MULTICAST + sk->ip_mc_loop=1; + sk->ip_mc_ttl=1; + *sk->ip_mc_name=0; + sk->ip_mc_list=NULL; +#endif + + + if (sk->type==SOCK_RAW && protocol==IPPROTO_RAW) + sk->ip_hdrincl=1; + + if (sk->num) + { + /* + * It assumes that any protocol which allows + * the user to assign a number at socket + * creation time automatically + * shares. + */ + + inet_put_sock(sk->num, sk); + sk->dummy_th.source = ntohs(sk->num); + } + + if (sk->prot->init) + { + err = sk->prot->init(sk); + if (err != 0) + { + destroy_sock(sk); + return(err); + } + } + MOD_INC_USE_COUNT; + return(0); +} + +static int inet6_dup(struct socket *newsock, struct socket *oldsock) +{ + return(inet6_create(newsock, + ((struct sock *)(oldsock->data))->protocol)); +} + + +/* + * bind for INET6 API + */ + +static int inet6_bind(struct socket *sock, struct sockaddr *uaddr, + int addr_len) +{ + struct sockaddr_in6 *addr=(struct sockaddr_in6 *)uaddr; + struct sock *sk=(struct sock *)sock->data, *sk2; + __u32 v4addr = 0; + unsigned short snum = 0; + int addr_type = 0; + + /* + * If the socket has its own bind function then use it. 
+ */ + + if(sk->prot->bind) + return sk->prot->bind(sk, uaddr, addr_len); + + /* check this error. */ + if (sk->state != TCP_CLOSE) + return(-EINVAL); + + if(addr_len < sizeof(struct sockaddr_in6)) + return -EINVAL; + + if(sock->type != SOCK_RAW) + { + if (sk->num != 0) + return(-EINVAL); + + snum = ntohs(addr->sin6_port); + + if (snum == 0) + snum = get_new_socknum(sk->prot, 0); + + if (snum < PROT_SOCK && !suser()) + return(-EACCES); + } + + addr_type = ipv6_addr_type(&addr->sin6_addr); + + if ((addr_type & IPV6_ADDR_MULTICAST) && sock->type == SOCK_STREAM) + { + return(-EINVAL); + } + + /* + * check if the address belongs to the host + */ + + if (addr_type == IPV6_ADDR_MAPPED) + { + v4addr = addr->sin6_addr.s6_addr32[3]; + + if (ip_chk_addr(v4addr) != IS_MYADDR) + return(-EADDRNOTAVAIL); + } + else + { + if (addr_type != IPV6_ADDR_ANY) + { + /* + * ipv4 addr of the socket is invalid. + * only the unpecified and mapped address + * have a v4 equivalent. + */ + + v4addr = LOOPBACK4_IPV6; + + if (!(addr_type & IPV6_ADDR_MULTICAST)) + { + if (ipv6_chk_addr(&addr->sin6_addr) == NULL) + return(-EADDRNOTAVAIL); + } + } + } + + sk->rcv_saddr = v4addr; + sk->saddr = v4addr; + + memcpy(&sk->net_pinfo.af_inet6.rcv_saddr, &addr->sin6_addr, + sizeof(struct in6_addr)); + + if (!(addr_type & IPV6_ADDR_MULTICAST)) + memcpy(&sk->net_pinfo.af_inet6.saddr, &addr->sin6_addr, + sizeof(struct in6_addr)); + + if(sock->type != SOCK_RAW) + { + /* Make sure we are allowed to bind here. */ + cli(); + for(sk2 = sk->prot->sock_array[snum & (SOCK_ARRAY_SIZE -1)]; + sk2 != NULL; sk2 = sk2->next) + { + /* + * Hash collision or real match ? + */ + + if (sk2->num != snum) + continue; + + /* + * Either bind on the port is wildcard means + * they will overlap and thus be in error. + * We use the sk2 v4 address to test the + * other socket since addr_any in av4 implies + * addr_any in v6 + */ + + if (addr_type == IPV6_ADDR_ANY || (!sk2->rcv_saddr)) + { + /* + * Allow only if both are setting reuse. 
+ */ + if(sk2->reuse && sk->reuse && sk2->state!=TCP_LISTEN) + continue; + sti(); + return(-EADDRINUSE); + } + + /* + * Two binds match ? + */ + + if (ipv6_addr_cmp(&sk->net_pinfo.af_inet6.rcv_saddr, + &sk2->net_pinfo.af_inet6.rcv_saddr)) + + continue; + /* + * Reusable port ? + */ + + if (!sk->reuse) + { + sti(); + return(-EADDRINUSE); + } + + /* + * Reuse ? + */ + + if (!sk2->reuse || sk2->state==TCP_LISTEN) + { + sti(); + return(-EADDRINUSE); + } + } + sti(); + + inet_remove_sock(sk); + + /* + if(sock->type==SOCK_DGRAM) + udp_cache_zap(); + if(sock->type==SOCK_STREAM) + tcp_cache_zap(); + */ + inet_put_sock(snum, sk); + sk->dummy_th.source = ntohs(sk->num); + sk->dummy_th.dest = 0; + sk->daddr = 0; + } + + return(0); +} + +static int inet6_release(struct socket *sock, struct socket *peer) +{ + MOD_DEC_USE_COUNT; + return inet_release(sock, peer); +} + +static int inet6_socketpair(struct socket *sock1, struct socket *sock2) +{ + return(-EOPNOTSUPP); +} + +/* + * This does both peername and sockname. 
+ */ + +static int inet6_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sockaddr_in6 *sin=(struct sockaddr_in6 *)uaddr; + struct sock *sk; + + sin->sin6_family = AF_INET6; + sk = (struct sock *) sock->data; + if (peer) + { + if (!tcp_connected(sk->state)) + return(-ENOTCONN); + sin->sin6_port = sk->dummy_th.dest; + memcpy(&sin->sin6_addr, &sk->net_pinfo.af_inet6.daddr, + sizeof(struct in6_addr)); + } + else + { + if (ipv6_addr_type(&sk->net_pinfo.af_inet6.rcv_saddr) == + IPV6_ADDR_ANY) + memcpy(&sin->sin6_addr, + &sk->net_pinfo.af_inet6.saddr, + sizeof(struct in6_addr)); + + else + memcpy(&sin->sin6_addr, + &sk->net_pinfo.af_inet6.rcv_saddr, + sizeof(struct in6_addr)); + + sin->sin6_port = sk->dummy_th.source; + + } + + *uaddr_len = sizeof(*sin); + return(0); +} + +static int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk=(struct sock *)sock->data; + int err; + int pid; + + switch(cmd) + { + case FIOSETOWN: + case SIOCSPGRP: + err = get_user(pid, (int *) arg); + if(err) + return err; + + /* see inet_fcntl */ + if (current->pid != pid && current->pgrp != -pid && !suser()) + return -EPERM; + sk->proc = pid; + return(0); + case FIOGETOWN: + case SIOCGPGRP: + err = put_user(sk->proc,(int *)arg); + if(err) + return err; + return(0); + case SIOCGSTAMP: + if(sk->stamp.tv_sec==0) + return -ENOENT; + err = copy_to_user((void *)arg, &sk->stamp, + sizeof(struct timeval)); + if (err) + return -EFAULT; + return 0; + + case SIOCADDRT: + case SIOCDELRT: + + return(ipv6_route_ioctl(cmd,(void *)arg)); + + case SIOCGIFCONF: + case SIOCGIFFLAGS: + case SIOCSIFFLAGS: + case SIOCADDMULTI: + case SIOCDELMULTI: +/* + + this ioctls deal with addresses + must process the addr info before + calling dev_ioctl to perform dev specific functions + + case SIOCGIFADDR: + case SIOCSIFADDR: + + + case SIOCGIFDSTADDR: + + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: 
+ */ + + case SIOCGIFMETRIC: + case SIOCSIFMETRIC: + case SIOCGIFMEM: + case SIOCSIFMEM: + case SIOCGIFMTU: + case SIOCSIFMTU: + case SIOCSIFLINK: + case SIOCGIFHWADDR: + case SIOCSIFHWADDR: + case SIOCSIFMAP: + case SIOCGIFMAP: + case SIOCSIFSLAVE: + case SIOCGIFSLAVE: + + return(dev_ioctl(cmd,(void *) arg)); + + return -EINVAL; + + case SIOGIFINDEX: + /* + * This one will be moved to the generic device + * layer in the near future + */ + return addrconf_get_ifindex((void *) arg); + + case SIOCSIFADDR: + return addrconf_add_ifaddr((void *) arg); + case SIOCSIFDSTADDR: + return addrconf_set_dstaddr((void *) arg); + default: + if ((cmd >= SIOCDEVPRIVATE) && + (cmd <= (SIOCDEVPRIVATE + 15))) + return(dev_ioctl(cmd,(void *) arg)); + + if (sk->prot->ioctl==NULL) + return(-EINVAL); + return(sk->prot->ioctl(sk, cmd, arg)); + } + /*NOTREACHED*/ + return(0); +} + +/* + * This routine must find a socket given a TCP or UDP header. + * Everything is assumed to be in net order. + * + * We give priority to more closely bound ports: if some socket + * is bound to a particular foreign address, it will get the packet + * rather than somebody listening to any address.. + */ + +struct sock *inet6_get_sock(struct proto *prot, + struct in6_addr *loc_addr, + struct in6_addr *rmt_addr, + unsigned short loc_port, + unsigned short rmt_port) +{ + struct sock *s; + struct sock *result = NULL; + int badness = -1; + unsigned short hnum; + struct ipv6_pinfo *np; + hnum = ntohs(loc_port); + + /* + * SOCK_ARRAY_SIZE must be a power of two. This will work better + * than a prime unless 3 or more sockets end up using the same + * array entry. This should not be a problem because most + * well known sockets don't overlap that much, and for + * the other ones, we can just be careful about picking our + * socket number when we choose an arbitrary one. 
+ */ + + for(s = prot->sock_array[hnum & (SOCK_ARRAY_SIZE - 1)]; + s != NULL; s = s->next) + { + int score = 0; + + if ((s->num != hnum) || s->family != AF_INET6) + continue; + + if(s->dead && (s->state == TCP_CLOSE)) + { + printk(KERN_DEBUG "dead or closed socket\n"); + continue; + } + + np = &s->net_pinfo.af_inet6; + + /* remote port matches? */ + + if (s->dummy_th.dest) { + if (s->dummy_th.dest != rmt_port) + { + continue; + } + score++; + } + + /* local address matches? */ + + if (!ipv6_addr_any(&np->rcv_saddr)) + { + if (ipv6_addr_cmp(&np->rcv_saddr, loc_addr)) + { + continue; + } + score++; + } + + /* remote address matches? */ + if (!ipv6_addr_any(&np->daddr)) + { + if (ipv6_addr_cmp(&np->daddr, rmt_addr)) + { + continue; + } + score++; + } + + /* perfect match? */ + if (score == 3) + return s; + /* no, check if this is the best so far.. */ + if (score <= badness) + continue; + result = s; + badness = score; + } + return result; +} + +static int __inline__ inet6_mc_check(struct sock *sk, struct in6_addr *addr) +{ + struct ipv6_mc_socklist *mc; + + for (mc = sk->net_pinfo.af_inet6.ipv6_mc_list; mc; mc=mc->next) + { + if (ipv6_addr_cmp(&mc->addr, addr) == 0) + return 1; + } + + return 0; +} + +/* + * Deliver a datagram to raw sockets. 
+ */ + +struct sock *inet6_get_sock_raw(struct sock *sk, unsigned short num, + struct in6_addr *loc_addr, + struct in6_addr *rmt_addr) + +{ + struct sock *s; + struct ipv6_pinfo *np; + int addr_type = 0; + + s=sk; + + addr_type = ipv6_addr_type(loc_addr); + + for(; s != NULL; s = s->next) + { + if (s->num != num) + continue; + + if(s->dead && (s->state == TCP_CLOSE)) + continue; + + np = &s->net_pinfo.af_inet6; + + if (!ipv6_addr_any(&np->daddr) && + ipv6_addr_cmp(&np->daddr, rmt_addr)) + { + continue; + } + + if (!ipv6_addr_any(&np->rcv_saddr)) + { + if (ipv6_addr_cmp(&np->rcv_saddr, loc_addr) == 0) + return(s); + + if ((addr_type & IPV6_ADDR_MULTICAST) && + inet6_mc_check(s, loc_addr)) + return (s); + + continue; + } + + return(s); + } + return(NULL); +} + +/* + * inet6_get_sock_mcast for UDP sockets. + */ + +struct sock *inet6_get_sock_mcast(struct sock *sk, + unsigned short num, unsigned short rmt_port, + struct in6_addr *loc_addr, + struct in6_addr *rmt_addr) +{ + struct sock *s; + struct ipv6_pinfo *np; + + s=sk; + + for(; s != NULL; s = s->next) + { + if (s->num != num) + continue; + + if(s->dead && (s->state == TCP_CLOSE)) + continue; + + np = &s->net_pinfo.af_inet6; + + if (s->dummy_th.dest) { + if (s->dummy_th.dest != rmt_port) + { + continue; + } + } + + if (!ipv6_addr_any(&np->daddr) && + ipv6_addr_cmp(&np->daddr, rmt_addr)) + { + continue; + } + + + if (!ipv6_addr_any(&np->rcv_saddr)) + { + if (ipv6_addr_cmp(&np->rcv_saddr, loc_addr) == 0) + return(s); + } + + if (!inet6_mc_check(s, loc_addr)) + { + continue; + } + + return(s); + } + return(NULL); +} + + +static struct proto_ops inet6_proto_ops = { + AF_INET6, + + inet6_create, + inet6_dup, + inet6_release, + inet6_bind, + inet_connect, /* ok */ + inet6_socketpair, /* a do nothing */ + inet_accept, /* ok */ + inet6_getname, + inet_select, /* ok */ + inet6_ioctl, /* must change */ + inet_listen, /* ok */ + inet_shutdown, /* ok */ + inet_setsockopt, /* ok */ + inet_getsockopt, /* ok */ + inet_fcntl, /* 
ok */ + inet_sendmsg, /* ok */ + inet_recvmsg /* ok */ +}; + +#ifdef MODULE +int init_module(void) +#else +void inet6_proto_init(struct net_proto *pro) +#endif +{ + int i; + + printk(KERN_INFO "IPv6 v0.1 for NET3.037\n"); + + sock_register(inet6_proto_ops.family, &inet6_proto_ops); + + for(i = 0; i < SOCK_ARRAY_SIZE; i++) + { + rawv6_sock_array[i] = NULL; + } + + /* + * ipngwg API draft makes clear that the correct semantics + * for TCP and UDP is to consider one TCP and UDP instance + * in a host availiable by both INET and INET6 APIs and + * hable to communicate via both network protocols. + */ + + tcpv6_prot.inuse = 0; + tcpv6_prot.highestinuse = 0; + tcpv6_prot.sock_array = tcp_sock_array; + + udpv6_prot.inuse = 0; + udpv6_prot.highestinuse = 0; + udpv6_prot.sock_array = udp_sock_array; + + rawv6_prot.inuse = 0; + rawv6_prot.highestinuse = 0; + rawv6_prot.sock_array = rawv6_sock_array; + + ipv6_init(); + + icmpv6_init(&inet6_proto_ops); + ndisc_init(&inet6_proto_ops); + + addrconf_init(); + + sit_init(); + + /* init v6 transport protocols */ + + udpv6_init(); + /* add /proc entries here */ + + tcpv6_init(); + +#ifdef MODULE + return 0; +#endif +} + +#ifdef MODULE +void cleanup_module(void) +{ + sit_cleanup(); + ipv6_cleanup(); + sock_unregister(AF_INET6); +} +#endif + diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c new file mode 100644 index 000000000..03a58e843 --- /dev/null +++ b/net/ipv6/datagram.c @@ -0,0 +1,196 @@ +/* + * common UDP/RAW code + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * $Id: datagram.c,v 1.3 1996/10/11 16:03:05 roque Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in6.h> +#include <linux/ipv6.h> + +#include <net/ipv6.h> +#include <net/ndisc.h> +#include <net/ipv6_route.h> +#include <net/addrconf.h> +#include <net/transp_v6.h> + + +int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; + struct ipv6_options *opt = (struct ipv6_options *) skb->proto_priv; + struct cmsghdr *cmsg = msg->msg_control; + int len = msg->msg_controllen; + + msg->msg_controllen = 0; + + if (np->rxinfo && (len >= sizeof(struct cmsghdr) + + sizeof(struct in6_pktinfo))) + { + struct in6_pktinfo *src_info; + struct inet6_dev *in6_dev; + + cmsg->cmsg_len = (sizeof(struct cmsghdr) + + sizeof(struct in6_pktinfo)); + cmsg->cmsg_level = SOL_IPV6; + cmsg->cmsg_type = IPV6_RXINFO; + + src_info = (struct in6_pktinfo *) cmsg->cmsg_data; + in6_dev = ipv6_get_idev(skb->dev); + + if (in6_dev == NULL) + { + printk(KERN_DEBUG "recv_ctl: unknown device\n"); + return -ENODEV; + } + + src_info->ipi6_ifindex = in6_dev->if_index; + ipv6_addr_copy(&src_info->ipi6_addr, + &skb->ipv6_hdr->daddr); + + len -= cmsg->cmsg_len; + msg->msg_controllen += cmsg->cmsg_len; + cmsg = (struct cmsghdr *)((u8*) cmsg + cmsg->cmsg_len); + } + + if (opt->srcrt) + { + int hdrlen = sizeof(struct rt0_hdr) + (opt->srcrt->hdrlen << 3); + + if (len >= sizeof(struct cmsghdr) + hdrlen) + { + struct rt0_hdr *rt0; + + cmsg->cmsg_len = sizeof(struct cmsghdr) + hdrlen; + cmsg->cmsg_level = SOL_IPV6; + cmsg->cmsg_type = IPV6_RXINFO; + + rt0 = (struct rt0_hdr *) cmsg->cmsg_data; + memcpy(rt0, opt->srcrt, hdrlen); + + len -= cmsg->cmsg_len; + msg->msg_controllen += cmsg->cmsg_len; + cmsg = (struct cmsghdr *)((u8*) cmsg + cmsg->cmsg_len); + } + } + return 0; +} + + +int datagram_send_ctl(struct msghdr *msg, struct device **src_dev, + struct in6_addr **src_addr, struct ipv6_options *opt) +{ + struct 
inet6_dev *in6_dev = NULL; + struct in6_pktinfo *src_info; + struct cmsghdr *cmsg; + struct ipv6_rt_hdr *rthdr; + int len; + int err = -EINVAL; + + for (cmsg = msg->msg_control; cmsg; cmsg = cmsg_nxthdr(msg, cmsg)) + { + if (cmsg->cmsg_level != SOL_IPV6) + { + printk(KERN_DEBUG "cmsg_level %d\n", cmsg->cmsg_level); + continue; + } + + switch (cmsg->cmsg_type) { + + case IPV6_TXINFO: + if (cmsg->cmsg_len < (sizeof(struct cmsghdr) + + sizeof(struct in6_pktinfo))) + { + goto exit_f; + } + + src_info = (struct in6_pktinfo *) cmsg->cmsg_data; + + if (src_info->ipi6_ifindex) + { + in6_dev = ipv6_dev_by_index(src_info->ipi6_ifindex); + if (in6_dev == NULL) + { + goto exit_f; + } + + *src_dev = in6_dev->dev; + } + + if (!ipv6_addr_any(&src_info->ipi6_addr)) + { + struct inet6_ifaddr *ifp; + + ifp = ipv6_chk_addr(&src_info->ipi6_addr); + + if ( ifp == NULL) + { + goto exit_f; + } + + *src_addr = &src_info->ipi6_addr; + err = 0; + } + + break; + + case SCM_SRCRT: + + len = cmsg->cmsg_len; + + len -= sizeof(struct cmsghdr); + + /* validate option length */ + if (len < sizeof(struct ipv6_rt_hdr)) + { + goto exit_f; + } + + rthdr = (struct ipv6_rt_hdr *) cmsg->cmsg_data; + + /* + * TYPE 0 + */ + if (rthdr->type) + { + goto exit_f; + } + + if (((rthdr->hdrlen + 1) << 3) < len) + { + goto exit_f; + } + + /* segments left must also match */ + if ((rthdr->hdrlen >> 1) != rthdr->segments_left) + { + goto exit_f; + } + + opt->opt_nflen += ((rthdr->hdrlen + 1) << 3); + opt->srcrt = rthdr; + err = 0; + + break; + default: + printk(KERN_DEBUG "invalid cmsg type: %d\n", + cmsg->cmsg_type); + break; + } + } + + exit_f: + return err; +} diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c new file mode 100644 index 000000000..6c5c8ab7e --- /dev/null +++ b/net/ipv6/exthdrs.c @@ -0,0 +1,173 @@ +/* + * Extension Header handling for IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * $Id: exthdrs.c,v 1.7 1996/09/12 18:44:18 roque Exp $ + * + * This 
program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/in6.h> +#include <linux/icmpv6.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/rawv6.h> +#include <net/ndisc.h> +#include <net/ipv6_route.h> +#include <net/addrconf.h> + +/* + * inbound + */ + +int ipv6_routing_header(struct sk_buff **skb_ptr, struct device *dev, + __u8 *nhptr, struct ipv6_options *opt) +{ + struct sk_buff *skb = *skb_ptr; + struct in6_addr *addr; + struct in6_addr daddr; + int addr_type = 0; + int strict = 0; + __u32 bit_map; + int pos; + int n, i; + + struct ipv6_rt_hdr *hdr = (struct ipv6_rt_hdr *) skb->h.raw; + struct rt0_hdr *rthdr; + + if (hdr->segments_left == 0) + { + struct ipv6_options *opt; + + opt = (struct ipv6_options *) skb->proto_priv; + opt->srcrt = hdr; + + skb->h.raw += (hdr->hdrlen + 1) << 3; + return hdr->nexthdr; + } + + if (hdr->type != IPV6_SRCRT_TYPE_0 || hdr->hdrlen & 0x01 || + hdr->hdrlen > 46) + { + /* + * Discard + */ + + pos = (__u8 *) hdr - (__u8 *) skb->ipv6_hdr + 2; + + if (hdr->type) + pos += 2; + else + pos += 1; + + icmpv6_send(skb, ICMPV6_PARAMETER_PROB, 0, pos, dev); + kfree_skb(skb, FREE_READ); + return 0; + } + + /* + * This is the routing header forwarding algorithm from + * RFC 1883, page 17. 
+ */ + + n = hdr->hdrlen >> 1; + + if (hdr->segments_left > n) + { + pos = (__u8 *) hdr - (__u8 *) skb->ipv6_hdr + 2; + + pos += 3; + + icmpv6_send(skb, ICMPV6_PARAMETER_PROB, 0, pos, dev); + kfree_skb(skb, FREE_READ); + return 0; + } + + i = n - --hdr->segments_left; + + rthdr = (struct rt0_hdr *) hdr; + addr = rthdr->addr; + addr += i - 1; + + addr_type = ipv6_addr_type(addr); + + if (addr_type == IPV6_ADDR_MULTICAST) + { + kfree_skb(skb, FREE_READ); + return 0; + } + + ipv6_addr_copy(&daddr, addr); + ipv6_addr_copy(addr, &skb->ipv6_hdr->daddr); + ipv6_addr_copy(&skb->ipv6_hdr->daddr, &daddr); + + /* + * Check Strick Source Route + */ + + bit_map = ntohl(rthdr->bitmap); + + if ((bit_map & (1 << i)) == IPV6_SRCRT_STRICT) + { + strict = 1; + } + + ipv6_forward(skb, dev, (strict ? IP6_FW_STRICT : 0) | IP6_FW_SRCRT); + + return 0; +} + + +/* + * outbound + */ + +int ipv6opt_bld_rthdr(struct sk_buff *skb, struct ipv6_options *opt, + struct in6_addr *addr, int proto) +{ + struct rt0_hdr *phdr, *ihdr; + int hops; + + ihdr = (struct rt0_hdr *) opt->srcrt; + + phdr = (struct rt0_hdr *) skb_put(skb, (ihdr->rt_hdr.hdrlen + 1) << 3); + memcpy(phdr, ihdr, sizeof(struct ipv6_rt_hdr)); + + hops = ihdr->rt_hdr.hdrlen >> 1; + + if (hops > 1) + { + memcpy(phdr->addr, ihdr->addr + 1, + (hops - 1) * sizeof(struct in6_addr)); + } + + ipv6_addr_copy(phdr->addr + (hops - 1), addr); + + phdr->rt_hdr.nexthdr = proto; + + return NEXTHDR_ROUTING; +} + +/* + * Local variables: + * c-file-style: "Linux" + * End: + */ diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c new file mode 100644 index 000000000..f959189c6 --- /dev/null +++ b/net/ipv6/icmp.c @@ -0,0 +1,560 @@ +/* + * Internet Control Message Protocol (ICMPv6) + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * Based on net/ipv4/icmp.c + * + * RFC 1885 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as 
published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * + * Andi Kleen : exception handling + */ + +#define __NO_VERSION__ +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/skbuff.h> + +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/icmpv6.h> + +#include <net/ip.h> +#include <net/sock.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/route.h> +#include <net/ndisc.h> +#include <net/raw.h> +#include <net/inet_common.h> +#include <net/transp_v6.h> +#include <net/ipv6_route.h> +#include <net/addrconf.h> +#include <net/rawv6.h> + +#include <asm/uaccess.h> +#include <asm/system.h> + +/* + * ICMP socket for flow control. 
+ */ + +static struct socket icmpv6_socket; + +int icmpv6_rcv(struct sk_buff *skb, struct device *dev, + struct in6_addr *saddr, struct in6_addr *daddr, + struct ipv6_options *opt, unsigned short len, + int redo, struct inet6_protocol *protocol); + +static struct inet6_protocol icmpv6_protocol = +{ + icmpv6_rcv, /* handler */ + NULL, /* error control */ + NULL, /* next */ + IPPROTO_ICMPV6, /* protocol ID */ + 0, /* copy */ + NULL, /* data */ + "ICMPv6" /* name */ +}; + + + +struct icmpv6_msg { + struct icmpv6hdr icmph; + __u8 *data; + struct in6_addr *daddr; + int len; + __u32 csum; +}; + + + +/* + * getfrag callback + * not static because it's needed in ndisc.c + */ + +static int icmpv6_getfrag(const void *data, struct in6_addr *saddr, + char *buff, unsigned int offset, unsigned int len) +{ + struct icmpv6_msg *msg = (struct icmpv6_msg *) data; + struct icmpv6hdr *icmph; + __u32 csum; + + /* + * in theory offset must be 0 since we never send more + * than 576 bytes on an error or more than the path mtu + * on an echo reply. 
(those are the rules on RFC 1883) + */ + + if (offset) + { + csum = csum_partial_copy((void *) msg->data + + offset - sizeof(struct icmpv6hdr), + buff, len, msg->csum); + msg->csum = csum; + return 0; + } + + csum = csum_partial_copy((void *) &msg->icmph, buff, + sizeof(struct icmpv6hdr), msg->csum); + + csum = csum_partial_copy((void *) msg->data, + buff + sizeof(struct icmpv6hdr), + len - sizeof(struct icmpv6hdr), csum); + + icmph = (struct icmpv6hdr *) buff; + + icmph->checksum = csum_ipv6_magic(saddr, msg->daddr, msg->len, + IPPROTO_ICMPV6, csum); + return 0; +} + +/* + * an inline helper for the "simple" if statement bellow + * checks if parameter problem report is caused by an + * unrecognized IPv6 option that has the Option Type + * highest-order two bits set to 10 + */ +static __inline__ int opt_unrec(struct sk_buff *skb, __u32 offset) +{ + char *buff = (char *) skb->ipv6_hdr; + + return ( ( *(buff + offset) & 0xC0 ) == 0x80 ); +} + +/* + * Send an ICMP message in response to a packet in error + */ + +void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, + struct device *dev) +{ + struct ipv6hdr *hdr = skb->ipv6_hdr; + struct sock *sk = (struct sock *) icmpv6_socket.data; + struct in6_addr *saddr = NULL; + struct device *src_dev = NULL; + struct icmpv6_msg msg; + int addr_type = 0; + int optlen; + int len; + + /* + * sanity check pointer in case of parameter problem + */ + + if (type == ICMPV6_PARAMETER_PROB && + (info > (skb->tail - ((unsigned char *) hdr)))) + { + printk(KERN_DEBUG "icmpv6_send: bug! pointer > skb\n"); + return; + } + + /* + * Make sure we respect the rules + * i.e. RFC 1885 2.4(e) + * Rule (e.1) is enforced by not using icmpv6_send + * in any code that processes icmp errors. 
+ */ + + addr_type = ipv6_addr_type(&hdr->daddr); + + if (ipv6_chk_addr(&hdr->daddr)) + { + saddr = &hdr->daddr; + } + + /* + * Dest addr check + */ + + if ((addr_type & IPV6_ADDR_MULTICAST || skb->pkt_type != PACKET_HOST)) + { + if (type != ICMPV6_PKT_TOOBIG && + !(type == ICMPV6_PARAMETER_PROB && + code == ICMPV6_UNK_OPTION && + (opt_unrec(skb, info)))) + { + return; + } + + saddr = NULL; + } + + addr_type = ipv6_addr_type(&hdr->saddr); + + /* + * Source addr check + */ + + if (addr_type & IPV6_ADDR_LINKLOCAL) + { + src_dev = skb->dev; + } + + /* + * Must not send if we know that source is Anycast also. + * for now we don't know that. + */ + if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) + { + printk(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n"); + return; + } + + /* + * ok. kick it. checksum will be provided by the + * getfrag_t callback. + */ + + msg.icmph.type = type; + msg.icmph.code = code; + msg.icmph.checksum = 0; + msg.icmph.icmp6_pointer = htonl(info); + + msg.data = (__u8 *) skb->ipv6_hdr; + msg.csum = 0; + msg.daddr = &hdr->saddr; + /* + if (skb->opt) + optlen = skb->opt->optlen; + else + */ + + optlen = 0; + + len = min(skb->tail - ((unsigned char *) hdr), + 576 - sizeof(struct ipv6hdr) - sizeof(struct icmpv6hdr) + - optlen); + + if (len < 0) + { + printk(KERN_DEBUG "icmp: len problem\n"); + return; + } + + len += sizeof(struct icmpv6hdr); + + msg.len = len; + + + ipv6_build_xmit(sk, icmpv6_getfrag, &msg, &hdr->saddr, len, + saddr, src_dev, NULL, IPPROTO_ICMPV6, 1); +} + +static void icmpv6_echo_reply(struct sk_buff *skb) +{ + struct sock *sk = (struct sock *) icmpv6_socket.data; + struct ipv6hdr *hdr = skb->ipv6_hdr; + struct icmpv6hdr *icmph = (struct icmpv6hdr *) skb->h.raw; + struct in6_addr *saddr; + struct icmpv6_msg msg; + unsigned char *data; + int len; + + data = (char *) (icmph + 1); + + saddr = &hdr->daddr; + + if (ipv6_addr_type(saddr) & IPV6_ADDR_MULTICAST) + saddr = NULL; + + len = skb->tail - data; + len += 
sizeof(struct icmpv6hdr); + + msg.icmph.type = ICMPV6_ECHO_REPLY; + msg.icmph.code = 0; + msg.icmph.checksum = 0; + msg.icmph.icmp6_identifier = icmph->icmp6_identifier; + msg.icmph.icmp6_sequence = icmph->icmp6_sequence; + + msg.data = data; + msg.csum = 0; + msg.len = len; + msg.daddr = &hdr->saddr; + + ipv6_build_xmit(sk, icmpv6_getfrag, &msg, &hdr->saddr, len, saddr, + skb->dev, NULL, IPPROTO_ICMPV6, 1); +} + +static __inline__ int ipv6_ext_hdr(u8 nexthdr) +{ + /* + * find out if nexthdr is an extension header or a protocol + */ + return ( (nexthdr == NEXTHDR_HOP) || + (nexthdr == NEXTHDR_ROUTING) || + (nexthdr == NEXTHDR_FRAGMENT) || + (nexthdr == NEXTHDR_ESP) || + (nexthdr == NEXTHDR_AUTH) || + (nexthdr == NEXTHDR_NONE) || + (nexthdr == NEXTHDR_DEST) ); + +} + +static void icmpv6_notify(int type, int code, unsigned char *buff, int len, + struct in6_addr *saddr, struct in6_addr *daddr, + struct inet6_protocol *protocol) +{ + struct ipv6hdr *hdr = (struct ipv6hdr *) buff; + struct inet6_protocol *ipprot; + struct sock *sk; + char * pbuff; + __u32 info = 0; + int hash; + u8 nexthdr; + + /* now skip over extension headers */ + + nexthdr = hdr->nexthdr; + + pbuff = (char *) (hdr + 1); + len -= sizeof(struct ipv6hdr); + + while (ipv6_ext_hdr(nexthdr)) + { + int hdrlen; + + if (nexthdr == NEXTHDR_NONE) + return; + + nexthdr = *pbuff; + hdrlen = *(pbuff+1); + + if (((hdrlen + 1) << 3) > len) + return; + + pbuff += hdrlen; + len -= hdrlen; + } + + hash = nexthdr & (MAX_INET_PROTOS -1); + + for (ipprot = (struct inet6_protocol *) inet6_protos[hash]; + ipprot != NULL; + ipprot=(struct inet6_protocol *)ipprot->next) + { + if (ipprot->protocol != nexthdr) + continue; + + if (ipprot->err_handler) + { + ipprot->err_handler(type, code, pbuff, info, + saddr, daddr, ipprot); + } + return; + } + + /* delivery to upper layer protocols failed. 
try raw sockets */ + + sk = rawv6_prot.sock_array[hash]; + + if (sk == NULL) + { + return; + } + + while ((sk = inet6_get_sock_raw(sk, nexthdr, daddr, saddr))) + { + rawv6_err(sk, type, code, pbuff, saddr, daddr); + sk = sk->next; + } + + return; +} + +/* + * Handle icmp messages + */ + +int icmpv6_rcv(struct sk_buff *skb, struct device *dev, + struct in6_addr *saddr, struct in6_addr *daddr, + struct ipv6_options *opt, unsigned short len, + int redo, struct inet6_protocol *protocol) +{ + struct ipv6hdr *orig_hdr; + struct icmpv6hdr *hdr = (struct icmpv6hdr *) skb->h.raw; + int ulen; + + /* perform checksum */ + + + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = csum_partial((char *)hdr, len, 0); + case CHECKSUM_HW: + if (csum_ipv6_magic(saddr, daddr, len, IPPROTO_ICMPV6, + skb->csum)) + { + printk(KERN_DEBUG "icmpv6 checksum failed\n"); + goto discard_it; + } + default: + /* CHECKSUM_UNNECESSARY */ + } + + /* + * length of original packet carried in skb + */ + ulen = skb->tail - (unsigned char *) (hdr + 1); + + switch (hdr->type) { + + case ICMPV6_ECHO_REQUEST: + icmpv6_echo_reply(skb); + break; + + case ICMPV6_ECHO_REPLY: + /* we coulnd't care less */ + break; + + case ICMPV6_PKT_TOOBIG: + orig_hdr = (struct ipv6hdr *) (hdr + 1); + if (ulen >= sizeof(struct ipv6hdr)) + { + rt6_handle_pmtu(&orig_hdr->daddr, + ntohl(hdr->icmp6_mtu)); + } + + /* + * Drop through to notify + */ + + case ICMPV6_DEST_UNREACH: + case ICMPV6_TIME_EXCEEDED: + case ICMPV6_PARAMETER_PROB: + + icmpv6_notify(hdr->type, hdr->code, (char *) (hdr + 1), ulen, + saddr, daddr, protocol); + break; + + case NDISC_ROUTER_SOLICITATION: + case NDISC_ROUTER_ADVERTISEMENT: + case NDISC_NEIGHBOUR_SOLICITATION: + case NDISC_NEIGHBOUR_ADVERTISEMENT: + case NDISC_REDIRECT: + ndisc_rcv(skb, dev, saddr, daddr, opt, len); + break; + + case ICMPV6_MEMBERSHIP_QUERY: + case ICMPV6_MEMBERSHIP_REPORT: + case ICMPV6_MEMBERSHIP_REDUCTION: + /* forward the packet to the igmp module */ + break; + + default: 
+ printk(KERN_DEBUG "icmpv6: msg of unkown type\n"); + + /* informational */ + if (hdr->type & 0x80) + { + goto discard_it; + } + + /* + * error of unkown type. + * must pass to upper level + */ + + icmpv6_notify(hdr->type, hdr->code, (char *) (hdr + 1), ulen, + saddr, daddr, protocol); + } + + discard_it: + + kfree_skb(skb, FREE_READ); + return 0; +} + +void icmpv6_init(struct proto_ops *ops) +{ + struct sock *sk; + int err; + + icmpv6_socket.type=SOCK_RAW; + icmpv6_socket.ops=ops; + + if((err=ops->create(&icmpv6_socket, IPPROTO_ICMPV6))<0) + printk(KERN_DEBUG + "Failed to create the ICMP control socket.\n"); + + MOD_DEC_USE_COUNT; + + sk = icmpv6_socket.data; + sk->allocation = GFP_ATOMIC; + sk->num = 256; /* Don't receive any data */ + + inet6_add_protocol(&icmpv6_protocol); +} + +static struct icmp6_err { + int err; + int fatal; +} tab_unreach[] = { + { ENETUNREACH, 0}, /* NOROUTE */ + { EACCES, 1}, /* ADM_PROHIBITED */ + { EOPNOTSUPP, 1}, /* NOT_NEIGHBOUR */ + { EHOSTUNREACH, 0}, /* ADDR_UNREACH */ + { ECONNREFUSED, 1}, /* PORT_UNREACH */ +}; + +int icmpv6_err_convert(int type, int code, int *err) +{ + int fatal = 0; + + *err = 0; + + switch (type) { + case ICMPV6_DEST_UNREACH: + if (code <= ICMPV6_PORT_UNREACH) + { + *err = tab_unreach[code].err; + fatal = tab_unreach[code].fatal; + } + break; + + case ICMPV6_PKT_TOOBIG: + *err = EMSGSIZE; + break; + + case ICMPV6_PARAMETER_PROB: + *err = EPROTO; + fatal = 1; + break; + }; + + return fatal; +} + +/* + * Local variables: + * compile-command: "gcc -D__KERNEL__ -I/usr/src/linux/include -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer -fno-strength-reduce -pipe -m486 -DCPU=486 -DMODULE -DMODVERSIONS -include /usr/src/linux/include/linux/modversions.h -c -o icmp.o icmp.c" + * End: + */ diff --git a/net/ipv6/ipv6_input.c b/net/ipv6/ipv6_input.c new file mode 100644 index 000000000..64a9d79f0 --- /dev/null +++ b/net/ipv6/ipv6_input.c @@ -0,0 +1,437 @@ +/* + * IPv6 input + * Linux INET6 implementation + * + * 
Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * Ian P. Morris <I.P.Morris@soton.ac.uk> + * + * Based in linux/net/ipv4/ip_input.c + * + * $Id: ipv6_input.c,v 1.13 1996/10/11 16:03:06 roque Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/in6.h> +#include <linux/icmpv6.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/rawv6.h> +#include <net/ndisc.h> +#include <net/ipv6_route.h> +#include <net/addrconf.h> + +/* + * Header processing function list + * We process headers in order (as per RFC) + * If the processing function returns 0 the packet is considered + * delivered else it returns the value of the nexthdr. + * The ptr field of the function points to the previous nexthdr field. + * This is allows the processing function to change it if it's sematics + * is: return a new packet without this header (like fragmentation). + * When a next_header value is not within the list + * the inet protocol list is searched (i.e. 
to deliver to + * TCP for instance) + */ + +static int ipv6_dest_opt(struct sk_buff **skb_ptr, struct device *dev, __u8 *nhptr, + struct ipv6_options *opt); + + +struct hdrtype_proc { + u8 type; + int (*func) (struct sk_buff **, struct device *dev, __u8 *ptr, + struct ipv6_options *opt); +} hdrproc_lst[] = { + /* + TODO + + {NEXTHDR_HOP, ipv6_hop_by_hop} + */ + {NEXTHDR_ROUTING, ipv6_routing_header}, + {NEXTHDR_FRAGMENT, ipv6_reassembly}, + + {NEXTHDR_DEST, ipv6_dest_opt}, + /* + {NEXTHDR_AUTH, ipv6_auth_hdr}, + {NEXTHDR_ESP, ipv6_esp_hdr}, + */ + {NEXTHDR_MAX, NULL} +}; + +/* New header structures */ + + +struct ipv6_tlvtype { + u8 type; + u8 len; +}; + +struct ipv6_destopt_hdr { + u8 nexthdr; + u8 hdrlen; +}; + + +struct tlvtype_proc { + u8 type; + int (*func) (struct sk_buff *, struct device *dev, __u8 *ptr, + struct ipv6_options *opt); + + /* these functions do NOT update skb->h.raw */ + +} tlvprocdestopt_lst[] = { + {255, NULL} +}; + + +static int parse_tlv(struct tlvtype_proc *procs, struct sk_buff *skb, + struct device *dev, __u8 *nhptr, struct ipv6_options *opt, + void *lastopt) +{ + struct ipv6_tlvtype *hdr; + struct tlvtype_proc *curr; + int pos; + + while ((hdr=(struct ipv6_tlvtype *)skb->h.raw) != lastopt) + switch (hdr->type & 0x3F) + { + case 0: /* TLV encoded Pad1 */ + skb->h.raw++; + break; + + case 1: /* TLV encoded PadN */ + skb->h.raw += hdr->len+2; + break; + + default: /* Other TLV code so scan list */ + for (curr=procs; curr->type != 255; curr++) + if (curr->type == (hdr->type & 0x3F)) + { + curr->func(skb, dev, nhptr, opt); + skb->h.raw += hdr->len+2; + break; + } + + if (curr->type==255) + { + /* unkown type */ + pos= (__u8 *) skb->h.raw - (__u8 *) skb->ipv6_hdr; + /* I think this is correct please check - IPM */ + + switch ((hdr->type & 0xC0) >> 6) { + case 0: /* ignore */ + skb->h.raw += hdr->len+2; + break; + + case 1: /* drop packet */ + kfree_skb(skb, FREE_READ); + return 0; + + case 2: /* send ICMP PARM PROB regardless and + drop 
packet */ + icmpv6_send(skb, ICMPV6_PARAMETER_PROB, + 2, pos, dev); + kfree_skb(skb, FREE_READ); + return 0; + + case 3: /* Send ICMP if not a multicast address + and drop packet */ + if (!(ipv6_addr_type(&(skb->ipv6_hdr->daddr)) & IPV6_ADDR_MULTICAST) ) + icmpv6_send(skb, ICMPV6_PARAMETER_PROB, 2, pos, dev); + kfree_skb(skb, FREE_READ); + return 0; + } + } + break; + } + + return 1; +} + + + +static int ipv6_dest_opt(struct sk_buff **skb_ptr, struct device *dev, __u8 *nhptr, + struct ipv6_options *opt) +{ + struct sk_buff *skb=*skb_ptr; + struct ipv6_destopt_hdr *hdr = (struct ipv6_destopt_hdr *) skb->h.raw; + + if (parse_tlv(tlvprocdestopt_lst, skb, dev, nhptr, opt,skb->h.raw+hdr->hdrlen)) + return hdr->nexthdr; + else + return 0; +} + + + +/* + * 0 - deliver + * 1 - block + */ +static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb) +{ + struct icmpv6hdr *icmph; + struct raw6_opt *opt; + + opt = &sk->tp_pinfo.tp_raw; + icmph = (struct icmpv6hdr *) (skb->ipv6_hdr + 1); + return test_bit(icmph->type, &opt->filter); +} + +/* + * demultiplex raw sockets. + * (should consider queueing the skb in the sock receive_queue + * without calling rawv6.c) + */ +static struct sock * ipv6_raw_deliver(struct sk_buff *skb, + struct device *dev, + struct ipv6_options *opt, + __u16 nexthdr, + __u16 len, + struct in6_addr *saddr, + struct in6_addr *daddr) +{ + struct sock *sk, *sk2; + __u8 hash; + + hash = nexthdr & (SOCK_ARRAY_SIZE-1); + + sk = rawv6_prot.sock_array[hash]; + + + /* + * The first socket found will be delivered after + * delivery to transport protocols. 
+ */ + + if (sk == NULL) + return NULL; + + sk = inet6_get_sock_raw(sk, nexthdr, daddr, saddr); + + if (sk) + { + sk2 = sk; + + while ((sk2 = inet6_get_sock_raw(sk2->next, nexthdr, + daddr, saddr))) + { + struct sk_buff *buff; + + if (nexthdr == IPPROTO_ICMPV6 && + icmpv6_filter(sk2, skb)) + { + continue; + } + buff = skb_clone(skb, GFP_ATOMIC); + buff->sk = sk2; + rawv6_rcv(buff, dev, saddr, daddr, opt, len); + } + } + + if (sk && nexthdr == IPPROTO_ICMPV6 && icmpv6_filter(sk, skb)) + { + sk = NULL; + } + + return sk; +} + +int ipv6_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) +{ + struct inet6_ifaddr *ifp; + struct ipv6_options *opt = (struct ipv6_options *) skb->proto_priv; + struct ipv6hdr *hdr; + u8 hash; + u8 addr_type; + struct inet6_protocol *ipprot; + struct sock *raw_sk; + int found = 0; + int nexthdr = 0; + __u8 *nhptr; + int pkt_len; + + hdr = skb->ipv6_hdr = (struct ipv6hdr *) skb->h.raw; + + if (skb->len < sizeof(struct ipv6hdr) || hdr->version != 6) + { + ipv6_statistics.Ip6InHdrErrors++; + printk(KERN_DEBUG "ipv6_rcv: broken header\n"); + kfree_skb(skb, FREE_READ); + return 0; + } + + pkt_len = ntohs(hdr->payload_len); + + if (pkt_len + sizeof(struct ipv6hdr) > skb->len) + { + printk(KERN_DEBUG "ipv6_rcv: invalid payload length\n"); + kfree_skb(skb, FREE_READ); + return 0; + } + + skb_trim(skb, pkt_len + sizeof(struct ipv6hdr)); + + /* check daddr */ + + /* Accounting & Firewall check */ + + addr_type = ipv6_addr_type(&hdr->daddr); + + if (addr_type & IPV6_ADDR_MULTICAST) + { + /* + * if mcast address is not for one of our groups + * either pass it to mcast router or discard it + */ + + if (ipv6_chk_mcast_addr(dev, &hdr->daddr) == 0) + { + /* something like: + if (acting_as_router) + ipv6_mcast_route(skb, ...) 
+ else + */ + kfree_skb(skb, FREE_READ); + return 0; + } + } + + if (addr_type & IPV6_ADDR_MULTICAST || + (ifp = ipv6_chk_addr(&hdr->daddr))) + { + + /* loop in a cicle parsing nexthdrs */ + + skb->h.raw += sizeof(struct ipv6hdr); + + /* extension header processing must update skb->h.raw */ + + nexthdr = hdr->nexthdr; + nhptr = &hdr->nexthdr; + + + while(1) + { + struct hdrtype_proc *hdrt; + + /* check for extension header */ + + for (hdrt=hdrproc_lst; hdrt->type != NEXTHDR_MAX; hdrt++) + { + if (hdrt->type == nexthdr) + { + if ((nexthdr = hdrt->func(&skb, dev, nhptr, opt))) + { + nhptr = skb->h.raw; + hdr = skb->ipv6_hdr; + continue; + } + return 0; + } + } + break; + + } + + /* + * deliver to raw sockets + * should we deliver raw after or before parsing + * extension headers ? + * delivering after means we do reassembly of datagrams + * in ip. + */ + + pkt_len = skb->tail - skb->h.raw; + + raw_sk = ipv6_raw_deliver(skb, dev, opt, nexthdr, pkt_len, + &hdr->saddr, &hdr->daddr); + + /* check inet6_protocol list */ + + hash = nexthdr & (MAX_INET_PROTOS -1); + for (ipprot = (struct inet6_protocol *) inet6_protos[hash]; + ipprot != NULL; + ipprot = (struct inet6_protocol *) ipprot->next) + { + struct sk_buff *buff = skb; + + if (ipprot->protocol != nexthdr) + continue; + + if (ipprot->copy || raw_sk) + buff = skb_clone(skb, GFP_ATOMIC); + + + ipprot->handler(buff, dev, + &hdr->saddr, &hdr->daddr, + opt, pkt_len, + 0, ipprot); + found = 1; + } + + if (raw_sk) + { + skb->sk = raw_sk; + rawv6_rcv(skb, dev, &hdr->saddr, &hdr->daddr, opt, + htons(hdr->payload_len)); + found = 1; + } + + /* not found: send ICMP parameter problem back */ + + if (!found) + { + printk(KERN_DEBUG "proto not found %d\n", nexthdr); + skb->sk = NULL; + kfree_skb(skb, FREE_READ); + } + + } + else + { + if (ipv6_forwarding) + { + if (addr_type & IPV6_ADDR_LINKLOCAL) + { + printk(KERN_DEBUG + "link local pkt to forward\n"); + kfree_skb(skb, FREE_READ); + return 0; + } + ipv6_forward(skb, dev, 0); + } 
+ else + { + printk(KERN_WARNING "IPV6: packet to forward -" + "host not configured as router\n"); + kfree_skb(skb, FREE_READ); + } + } + + return 0; +} + +/* + * Local variables: + * c-file-style: "Linux" + * End: + */ diff --git a/net/ipv6/ipv6_output.c b/net/ipv6/ipv6_output.c new file mode 100644 index 000000000..7f82dba03 --- /dev/null +++ b/net/ipv6/ipv6_output.c @@ -0,0 +1,1003 @@ +/* + * IPv6 output functions + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * Based on linux/net/ipv4/ip_output.c + * + * $Id: ipv6_output.c,v 1.19 1996/10/16 18:34:16 roque Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * + * Andi Kleen : exception handling + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/in6.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/ndisc.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/ipv6_route.h> +#include <net/addrconf.h> + +static u32 ipv6_fragmentation_id = 1; +int ipv6_forwarding = 0; /* default: host */ + +static int __inline__ ipv6_build_mac_header(struct sk_buff *skb, + struct device *dev, + struct neighbour *neigh, + int len) +{ + int mac; + int hdrlen = 0; + + skb->arp = 1; + skb->nexthop = neigh; + + + if (dev->hard_header_len) + { + skb_reserve(skb, (dev->hard_header_len + 15) & ~15); + + if (neigh && (neigh->flags & NCF_HHVALID)) + { + /* + * Cached hardware header + */ + + memcpy(skb_push(skb, dev->hard_header_len), + neigh->hh_data, dev->hard_header_len); + + return dev->hard_header_len; + } + + if 
(dev->hard_header) + { + mac = dev->hard_header(skb, dev, ETH_P_IPV6, + NULL, NULL, len); + + if (mac < 0) + { + hdrlen = -mac; + skb->arp = 0; + } + else + { + hdrlen = mac; + } + } + else + hdrlen = dev->hard_header_len; + } + + return hdrlen; +} + +void ipv6_redo_mac_hdr(struct sk_buff *skb, struct neighbour *neigh, int len) +{ + struct device *dev = neigh->dev; + int mac; + + skb->dev = dev; + skb->nexthop = neigh; + skb->arp = 1; + + skb_pull(skb, (unsigned char *) skb->ipv6_hdr - skb->data); + + /* + * neighbour cache should have the ether address + * cached... use it + */ + + if (dev->hard_header) + { + if (neigh && (neigh->flags & NCF_HHVALID)) + { + /* + * Cached hardware header + */ + + memcpy(skb_push(skb, dev->hard_header_len), + neigh->hh_data, dev->hard_header_len); + return; + } + + mac = dev->hard_header(skb, dev, ETH_P_IPV6, + NULL, NULL, len); + + if (mac < 0) + { + skb->arp = 0; + } + + } +} + +void default_output_method(struct sk_buff *skb, struct rt6_info *rt) +{ + struct sock *sk = skb->sk; + struct device *dev = skb->dev; + + if (dev->flags & IFF_UP) + { + /* + * If we have an owner use its priority setting, + * otherwise use NORMAL + */ + + if (sk != NULL) + { + dev_queue_xmit(skb, dev, sk->priority); + } + else + { + dev_queue_xmit(skb, dev, SOPRI_NORMAL); + } + } + else + { + if(sk) + sk->err = ENETDOWN; + + ipv6_statistics.Ip6OutDiscards++; + + kfree_skb(skb, FREE_WRITE); + } +} + +/* + * xmit an sk_buff (used by TCP) + * sk can be NULL (for sending RESETs) + */ +int ipv6_xmit(struct sock *sk, struct sk_buff *skb, struct in6_addr *saddr, + struct in6_addr *daddr, struct ipv6_options *opt, int proto) +{ + struct ipv6hdr *hdr; + struct dest_entry *dc; + struct ipv6_pinfo *np = NULL; + struct device *dev = skb->dev; + int seg_len; + int addr_type; + int rt_flags = 0; + + + addr_type = ipv6_addr_type(daddr); + + if (addr_type & (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_SITELOCAL)) + { + /* + * force device match on route lookup + */ + + rt_flags |= 
RTI_DEVRT; + } + + if (skb->localroute) + { + rt_flags |= RTI_GATEWAY; + } + + hdr = skb->ipv6_hdr; + + + if (sk) + { + np = &sk->net_pinfo.af_inet6; + } + + if (np && np->dest) + { + dc = ipv6_dst_check(np->dest, daddr, np->dc_sernum, rt_flags); + } + else + { + dc = ipv6_dst_route(daddr, dev, rt_flags); + } + + if (dc == NULL) + { + ipv6_statistics.Ip6OutNoRoutes++; + return(-ENETUNREACH); + } + + dev = dc->rt.rt_dev; + + if (saddr == NULL) + { + struct inet6_ifaddr *ifa; + + ifa = ipv6_get_saddr((struct rt6_info *) dc, daddr); + + if (ifa == NULL) + { + printk(KERN_DEBUG + "ipv6_xmit: get_saddr failed\n"); + return -ENETUNREACH; + } + + saddr = &ifa->addr; + + if (np) + { + ipv6_addr_copy(&np->saddr, saddr); + } + } + + seg_len = skb->tail - ((unsigned char *) hdr); + + /* + * Link Layer headers + */ + + skb->sk = sk; + skb->protocol = __constant_htons(ETH_P_IPV6); + skb->free = 1; + skb->dev = dev; + + ipv6_redo_mac_hdr(skb, dc->dc_nexthop, seg_len); + + /* + * Fill in the IPv6 header + */ + + hdr->version = 6; + hdr->priority = np ? np->priority : 0; + + if (np) + memcpy(hdr->flow_lbl, (void *) &np->flow_lbl, 3); + else + memset(hdr->flow_lbl, 0, 3); + + hdr->payload_len = htons(seg_len - sizeof(struct ipv6hdr)); + hdr->nexthdr = proto; + hdr->hop_limit = np ? 
np->hop_limit : ipv6_hop_limit; + + memcpy(&hdr->saddr, saddr, sizeof(struct in6_addr)); + memcpy(&hdr->daddr, daddr, sizeof(struct in6_addr)); + + + /* + * Options + */ + + + /* + * Output the packet + */ + + ipv6_statistics.Ip6OutRequests++; + + if (dc->rt.rt_output_method) + { + (*dc->rt.rt_output_method)(skb, (struct rt6_info *) dc); + } + else + default_output_method(skb, (struct rt6_info *) dc); + + /* + * Update serial number of cached dest_entry or + * release destination cache entry + */ + + if (np) + { + np->dest = dc; + if (dc->rt.fib_node) + { + np->dc_sernum = dc->rt.fib_node->fn_sernum; + } + } + else + { + ipv6_dst_unlock(dc); + } + + return 0; +} + +/* + * To avoid extra problems ND packets are send through this + * routine. It's code duplication but i really want to avoid + * extra checks since ipv6_build_header is used by TCP (which + * is for us performace critical) + */ + +int ipv6_bld_hdr_2(struct sock *sk, struct sk_buff *skb, struct device *dev, + struct neighbour *neigh, + struct in6_addr *saddr, struct in6_addr *daddr, + int proto, int len) +{ + struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; + struct ipv6hdr *hdr; + int hdrlen = 0; + + skb->dev = dev; + + /* build MAC header */ + hdrlen += ipv6_build_mac_header(skb, dev, neigh, len); + + /* build fixed IPv6 header */ + + if (proto == IPPROTO_RAW) + return hdrlen; + + + hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr)); + skb->ipv6_hdr = hdr; + + hdr->version = 6; + hdr->priority = np->priority & 0x0f; + + memset(hdr->flow_lbl, 0, 3); + + hdr->hop_limit = np->hop_limit; + + if (saddr == NULL) + { + printk(KERN_DEBUG "bug: bld_hdr called with no saddr\n"); + return -ENETUNREACH; + } + + memcpy(&hdr->saddr, saddr, sizeof(struct in6_addr)); + memcpy(&hdr->daddr, daddr, sizeof(struct in6_addr)); + + hdrlen += sizeof(struct ipv6hdr); + + hdr->nexthdr = proto; + + return hdrlen; +} + +void ipv6_queue_xmit(struct sock *sk, struct device *dev, struct sk_buff *skb, + int free) +{ + 
struct ipv6hdr *hdr; + u32 seg_len; + + hdr = skb->ipv6_hdr; + skb->sk = sk; + skb->protocol = __constant_htons(ETH_P_IPV6); + skb->free=1; + + seg_len = skb->tail - ((unsigned char *) hdr); + + hdr->payload_len = htons(seg_len - sizeof(struct ipv6hdr)); + + if (dev == NULL) + { + printk(KERN_DEBUG "ipv6_queue_xmit: unknown device\n"); + return; + } + + skb->dev = dev; + + ipv6_statistics.Ip6OutRequests++; + + + /* + * Multicast loopback + */ + + if (dev->flags & IFF_UP) + { + /* + * If we have an owner use its priority setting, + * otherwise use NORMAL + */ + + if (sk != NULL) + { + dev_queue_xmit(skb, dev, sk->priority); + } + else + { + dev_queue_xmit(skb, dev, SOPRI_NORMAL); + } + } + else + { + if(sk) + sk->err = ENETDOWN; + + ipv6_statistics.Ip6OutDiscards++; + + kfree_skb(skb, FREE_WRITE); + } + +} + + +int ipv6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, + struct in6_addr *dest, unsigned short int length, + struct in6_addr *saddr, struct device *dev, + struct ipv6_options *opt, int proto, + int noblock) +{ + rt6_output_method_t output_method = default_output_method; + int hlimit; + struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; + struct dest_entry *dc = NULL; + struct in6_addr *daddr = dest; + struct ipv6hdr *hdr; + struct neighbour *neigh; + int addr_type; + int pktlength; + int pmtu = 0; + int rt_flags = 0; + int error; + + if (opt && opt->srcrt) + { + struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; + daddr = rt0->addr; + } + + addr_type = ipv6_addr_type(daddr); + if (addr_type & IPV6_ADDR_MULTICAST) + { + hlimit = np->mcast_hops; + if (dev == NULL) + { + dev = np->mc_if; + } + } + else + hlimit = np->hop_limit; + + if (addr_type & (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_SITELOCAL | + IPV6_ADDR_MULTICAST)) + { + /* + * force device match on route lookup + */ + + rt_flags |= RTI_DEVRT; + } + + if (sk->localroute) + { + rt_flags |= RTI_GATEWAY; + } + + if (np->dest) + { + np->dest = ipv6_dst_check(np->dest, daddr, np->dc_sernum, + 
rt_flags); + + dc = np->dest; + + if (dc && dc->rt.fib_node) + { + np->dc_sernum = dc->rt.fib_node->fn_sernum; + } + else + { + printk(KERN_WARNING "dc entry not in table\n"); + } + } + else + { + dc = ipv6_dst_route(daddr, dev, rt_flags); + } + + if (dc == NULL) + { + if ((addr_type & IPV6_ADDR_MULTICAST) && dev) + { + neigh = NULL; + pmtu = dev->mtu; + } + else + { + ipv6_statistics.Ip6OutNoRoutes++; + return(-ENETUNREACH); + } + } + else + { + neigh = dc->dc_nexthop; + dev = neigh->dev; + + if (dc->rt.rt_output_method) + { + output_method = dc->rt.rt_output_method; + } + + if (dc->dc_flags & DCF_PMTU) + pmtu = dc->dc_pmtu; + else + pmtu = dev->mtu; + } + + + if (saddr == NULL) + { + struct inet6_ifaddr *ifa; + + ifa = ipv6_get_saddr((struct rt6_info *) dc, daddr); + + if (ifa == NULL) + { + printk(KERN_DEBUG + "ipv6_build_xmit: get_saddr failed\n"); + return -ENETUNREACH; + } + + saddr = &ifa->addr; + } + + if (dc && np->dest == NULL) + { + ipv6_dst_unlock(dc); + } + + pktlength = length; + + if (!sk->ip_hdrincl) + { + pktlength += sizeof(struct ipv6hdr); + if (opt) + { + pktlength += opt->opt_flen + opt->opt_nflen; + } + } + + + dev_lock_list(); + + /* + * reminder: don't allow fragmentation for IPPROTO_RAW + */ + + + if (pktlength <= pmtu) + { + struct sk_buff *skb = + sock_alloc_send_skb(sk, pktlength+15+ + dev->hard_header_len, + 0, noblock, &error); + + if (skb == NULL) + { + ipv6_statistics.Ip6OutDiscards++; + dev_unlock_list(); + return error; + + } + + skb->dev=dev; + skb->protocol = htons(ETH_P_IPV6); + skb->free=1; + skb->when=jiffies; + skb->sk=sk; + skb->arp=0; + + /* build the mac header... 
*/ + ipv6_build_mac_header(skb, dev, neigh, pktlength); + + hdr = (struct ipv6hdr *) skb->tail; + + if (!sk->ip_hdrincl) + { + skb_put(skb, sizeof(struct ipv6hdr)); + skb->ipv6_hdr = hdr; + + hdr->version = 6; + hdr->priority = np->priority; + + memcpy(hdr->flow_lbl, &np->flow_lbl, 3); + + hdr->payload_len = htons(pktlength - + sizeof(struct ipv6hdr)); + + hdr->hop_limit = hlimit; + + memcpy(&hdr->saddr, saddr, sizeof(struct in6_addr)); + memcpy(&hdr->daddr, daddr, sizeof(struct in6_addr)); + + if (opt && opt->srcrt) + { + hdr->nexthdr = ipv6opt_bld_rthdr(skb, opt, + dest, proto); + + } + else + hdr->nexthdr = proto; + } + + skb_put(skb, length); + error = getfrag(data, &hdr->saddr, + ((char *) hdr) + (pktlength - length), + 0, length); + + if (!error) + { + ipv6_statistics.Ip6OutRequests++; + (*output_method)(skb, (struct rt6_info *) dc); + } else + { + error = -EFAULT; + kfree_skb(skb, FREE_WRITE); + } + + dev_unlock_list(); + return error; + } + else + { + /* + * Fragmentation + */ + + /* + * Extension header order: + * Hop-by-hop -> Routing -> Fragment -> rest (...) + * + * We must build the non-fragmented part that + * will be in every packet... this also means + * that other extension headers (Dest, Auth, etc) + * must be considered in the data to be fragmented + */ + + struct sk_buff *last_skb; + struct frag_hdr *fhdr; + int unfrag_len; + int payl_len; + int frag_len; + int last_len; + int nfrags; + int err; + int fhdr_dist; + __u32 id; + + if (sk->ip_hdrincl) + { + return -EMSGSIZE; + } + + id = ipv6_fragmentation_id++; + + unfrag_len = sizeof(struct ipv6hdr) + sizeof(struct frag_hdr); + payl_len = length; + + if (opt) + { + unfrag_len += opt->opt_nflen; + payl_len += opt->opt_flen; + } + + nfrags = payl_len / ((pmtu - unfrag_len) & ~0x7); + + /* + * Length of fragmented part on every packet but + * the last must be an: + * "integer multiple of 8 octects". 
+ */ + + frag_len = (pmtu - unfrag_len) & ~0x7; + + /* + * We must send from end to start because of + * UDP/ICMP checksums. We do a funny trick: + * fill the last skb first with the fixed + * header (and its data) and then use it + * to create the following segments and send it + * in the end. If the peer is checking the M_flag + * to trigger the reassembly code then this + * might be a good idea. + */ + + last_len = payl_len - (nfrags * frag_len); + + if (last_len == 0) + { + last_len = frag_len; + nfrags--; + } + + last_skb = sock_alloc_send_skb(sk, unfrag_len + frag_len + + dev->hard_header_len + 15, + 0, noblock, &err); + + if (last_skb == NULL) + { + dev_unlock_list(); + return err; + } + + last_skb->dev=dev; + last_skb->protocol = htons(ETH_P_IPV6); + last_skb->free=1; + last_skb->when=jiffies; + last_skb->sk=sk; + last_skb->arp=0; + + /* + * build the mac header... + */ + ipv6_build_mac_header(last_skb, dev, neigh, + unfrag_len + frag_len); + + hdr = (struct ipv6hdr *) skb_put(last_skb, + sizeof(struct ipv6hdr)); + last_skb->ipv6_hdr = hdr; + + hdr->version = 6; + hdr->priority = np->priority; + + memcpy(hdr->flow_lbl, &np->flow_lbl, 3); + hdr->payload_len = htons(unfrag_len + frag_len - + sizeof(struct ipv6hdr)); + + hdr->hop_limit = hlimit; + + hdr->nexthdr = NEXTHDR_FRAGMENT; + + memcpy(&hdr->saddr, saddr, sizeof(struct in6_addr)); + memcpy(&hdr->daddr, daddr, sizeof(struct in6_addr)); + + if (opt && opt->srcrt) + { + hdr->nexthdr = ipv6opt_bld_rthdr(last_skb, opt, dest, + NEXTHDR_FRAGMENT); + } + + fhdr = (struct frag_hdr *) + skb_put(last_skb, sizeof(struct frag_hdr)); + + memset(fhdr, 0, sizeof(struct frag_hdr)); + + fhdr->nexthdr = proto; + fhdr->frag_off = ntohs(nfrags * frag_len); + fhdr->identification = id; + + fhdr_dist = (unsigned char *) fhdr - last_skb->data; + + error = getfrag(data, &hdr->saddr, last_skb->tail, + nfrags * frag_len, last_len); + + if (!error) + { + while (nfrags--) + { + struct sk_buff *skb; + + struct frag_hdr *fhdr2; + + 
printk(KERN_DEBUG "sending frag %d\n", nfrags); + skb = skb_copy(last_skb, sk->allocation); + + fhdr2 = (struct frag_hdr *) + (skb->data + fhdr_dist); + + /* more flag on */ + fhdr2->frag_off = ntohs(nfrags * frag_len + 1); + + /* + * FIXME: + * if (nfrags == 0) + * put rest of headers + */ + + error = getfrag(data, &hdr->saddr, + skb_put(skb, frag_len), + nfrags * frag_len, frag_len); + + if (error) + { + kfree_skb(skb, FREE_WRITE); + break; + } + + ipv6_statistics.Ip6OutRequests++; + (*output_method)(skb, (struct rt6_info *) dc); + } + } + + if (error) + { + kfree_skb(last_skb, FREE_WRITE); + dev_unlock_list(); + return -EFAULT; + } + + printk(KERN_DEBUG "sending last frag \n"); + + hdr->payload_len = htons(unfrag_len + last_len - + sizeof(struct ipv6hdr)); + + /* + * update last_skb to reflect the getfrag we did + * on start. + */ + last_skb->tail += last_len; + last_skb->len += last_len; + + /* + * toss the mac header out and rebuild it. + * needed because of the different frame length. + * ie: not needed for an ethernet. 
+ */ + + if (dev->type != ARPHRD_ETHER && last_len != frag_len) + { + ipv6_redo_mac_hdr(last_skb, neigh, + unfrag_len + last_len); + } + + ipv6_statistics.Ip6OutRequests++; + (*output_method)(last_skb, (struct rt6_info *) dc); + + dev_unlock_list(); + return 0; + } + return -1; +} + +static int pri_values[4] = +{ + SOPRI_BACKGROUND, + SOPRI_NORMAL, + SOPRI_NORMAL, + SOPRI_INTERACTIVE +}; + +void ipv6_forward(struct sk_buff *skb, struct device *dev, int flags) +{ + struct neighbour *neigh; + struct dest_entry *dest; + int priority; + int rt_flags; + int size; + int pmtu; + + if (skb->ipv6_hdr->hop_limit <= 1) + { + icmpv6_send(skb, ICMPV6_TIME_EXCEEDED, ICMPV6_EXC_HOPLIMIT, + 0, dev); + + kfree_skb(skb, FREE_READ); + return; + } + + skb->ipv6_hdr->hop_limit--; + + if (ipv6_addr_type(&skb->ipv6_hdr->saddr) & IPV6_ADDR_LINKLOCAL) + { + printk(KERN_DEBUG "ipv6_forward: link local source addr\n"); + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOT_NEIGHBOUR, + 0, dev); + kfree_skb(skb, FREE_READ); + return; + } + + rt_flags = RTF_MODIFIED; + + if ((flags & IP6_FW_STRICT)) + { + rt_flags |= RTF_GATEWAY; + } + + dest = ipv6_dst_route(&skb->ipv6_hdr->daddr, NULL, rt_flags); + + if (dest == NULL) + { + int code; + + if (flags & IP6_FW_STRICT) + code = ICMPV6_NOT_NEIGHBOUR; + else + code = ICMPV6_NOROUTE; + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, dev); + + kfree_skb(skb, FREE_READ); + return; + } + + neigh = dest->dc_nexthop; + + if (neigh->dev == dev && (dev->flags & IFF_MULTICAST) && + !(flags & IP6_FW_SRCRT)) + { + struct in6_addr *target = NULL; + + /* + * outgoing device equal to incoming device + * send a redirect + */ + + if ((dest->dc_flags & RTF_GATEWAY)) + { + target = &neigh->addr; + } + else + { + target = &skb->ipv6_hdr->daddr; + } + + ndisc_send_redirect(skb, neigh, target); + } + + pmtu = neigh->dev->mtu; + + size = sizeof(struct ipv6hdr) + ntohs(skb->ipv6_hdr->payload_len); + + if (size > pmtu) + { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, pmtu, dev); 
+ kfree_skb(skb, FREE_READ); + return; + } + + ipv6_dst_unlock(dest); + + if (skb_headroom(skb) < neigh->dev->hard_header_len) + { + struct sk_buff *buff; + + buff = alloc_skb(neigh->dev->hard_header_len + skb->len + 15, + GFP_ATOMIC); + + if (buff == NULL) + { + return; + } + + skb_reserve(buff, (neigh->dev->hard_header_len + 15) & ~15); + + buff->protocol = __constant_htons(ETH_P_IPV6); + buff->free = 1; + buff->h.raw = skb_put(buff, size); + + memcpy(buff->h.raw, skb->ipv6_hdr, size); + buff->ipv6_hdr = (struct ipv6hdr *) buff->h.raw; + kfree_skb(skb, FREE_READ); + skb = buff; + } + + ipv6_redo_mac_hdr(skb, neigh, size); + + priority = skb->ipv6_hdr->priority; + + priority = (priority & 0x7) >> 1; + priority = pri_values[priority]; + + if (dev->flags & IFF_UP) + { + dev_queue_xmit(skb, neigh->dev, priority); + } + else + { + ipv6_statistics.Ip6OutDiscards++; + kfree_skb(skb, FREE_READ); + } +} + + +/* + * Local variables: + * c-file-style: "Linux" + * End: + */ diff --git a/net/ipv6/ipv6_route.c b/net/ipv6/ipv6_route.c new file mode 100644 index 000000000..e68990a0f --- /dev/null +++ b/net/ipv6/ipv6_route.c @@ -0,0 +1,2056 @@ +/* + * IPv6 routing table + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +/* + * Changes: + * + * Masaki Hirabaru : Fix for /proc info > pagesize + * <masaki@merit.edu> + */ + +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/route.h> +#include <linux/netdevice.h> +#include <linux/in6.h> + +#ifdef CONFIG_PROC_FS +#include <linux/proc_fs.h> +#endif + +#include <net/tcp.h> +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/ndisc.h> +#include <net/protocol.h> +#include <net/ipv6_route.h> +#include <net/addrconf.h> + +#include <net/netlink.h> + +#include <asm/uaccess.h> + +/* + * Routing Table + * + * simplified version of a radix tree + * + * - every node shares it's acestors prefix + * - the tree is ordered from less to most specific mask + * - default routes are handled apart + * + * this facilitates recursion a lot + */ + +static struct rt6_info null_entry = { + NULL, NULL, + {{{0}}}, + 0, 1, + NULL, NULL, + 0, 0, RTF_REJECT +}; + +struct fib6_node routing_table = { + NULL, NULL, NULL, &null_entry, + 0, RTN_ROOT, 0 +}; + +struct rt6_info *default_rt_list = NULL; +struct rt6_info *loopback_rt = NULL; + +/* + * last_resort_rt - no routers present. + * Assume all destinations on link. + */ +struct rt6_info *last_resort_rt = NULL; + +static struct rt6_req request_queue = { + 0, NULL, &request_queue, &request_queue +}; + + +/* + * A routing update causes an increase of the serial number on the + * afected subtree. This allows for cached routes to be asynchronously + * tested when modifications are made to the destination cache as a + * result of redirects, path MTU changes, etc. 
+ */ + +static __u32 rt_sernum = 0; + +static atomic_t rt6_lock = 0; +static int rt6_bh_mask = 0; + +#define RT_BH_REQUEST 1 +#define RT_BH_GC 2 + +static void __rt6_run_bh(void); + +typedef void (*f_pnode)(struct fib6_node *fn, void *); + +static void rt6_walk_tree(f_pnode func, void * arg, int filter); +static void rt6_rt_timeout(struct fib6_node *fn, void *arg); +static int rt6_msgrcv(int unit, struct sk_buff *skb); + +struct rt6_statistics rt6_stats = { + 1, 0, 1, 1, 0 +}; + +static atomic_t rt_clients = 0; + +void rt6_timer_handler(unsigned long data); + +struct timer_list rt6_gc_timer = { + NULL, + NULL, + 0, + 0, + rt6_timer_handler +}; + +static __inline__ void rt6_run_bh(void) +{ + unsigned long flags; + + save_flags(flags); + cli(); + + if (rt6_lock == 0 && rt6_bh_mask) + { + __rt6_run_bh(); + } + restore_flags(flags); +} + +/* + * request queue operations + * FIFO queue/dequeue + */ +static __inline__ void rtreq_queue(struct rt6_req * req) +{ + unsigned long flags; + struct rt6_req *next = &request_queue; + + save_flags(flags); + cli(); + + req->prev = next->prev; + req->prev->next = req; + next->prev = req; + req->next = next; + restore_flags(flags); +} + +static __inline__ struct rt6_req * rtreq_dequeue(void) +{ + struct rt6_req *next = &request_queue; + struct rt6_req *head; + + head = next->next; + + if (head == next) + { + return NULL; + } + + head->next->prev = head->prev; + next->next = head->next; + + head->next = NULL; + head->prev = NULL; + + return head; +} + +/* + * compare "prefix length" bits of an address + */ +static __inline__ int addr_match(struct in6_addr *a1, struct in6_addr *a2, + int prefixlen) +{ + int pdw; + int pbi; + + pdw = prefixlen >> 0x05; /* num of whole __u32 in prefix */ + pbi = prefixlen & 0x1f; /* num of bits in incomplete u32 in prefix */ + + if (pdw) + { + if (memcmp(a1, a2, pdw << 2)) + return 0; + } + + if (pbi) + { + __u32 w1, w2; + __u32 mask; + + w1 = a1->s6_addr32[pdw]; + w2 = a2->s6_addr32[pdw]; + + mask = 
htonl((0xffffffff) << (0x20 - pbi)); + + if ((w1 ^ w2) & mask) + return 0; + } + + return 1; +} + +/* + * test bit. range [0-127] + */ + +static __inline__ int addr_bit_set(struct in6_addr *addr, int fn_bit) +{ + int dw; + __u32 b1; + __u32 mask; + int bit = fn_bit; + + dw = bit >> 0x05; + + b1 = addr->s6_addr32[dw]; + + bit = ~bit; + bit &= 0x1f; + mask = htonl(1 << bit); + return (b1 & mask); +} + +static __inline__ int addr_bit_equal(struct in6_addr *a1, struct in6_addr *a2, + int fn_bit) +{ + int dw; + __u32 b1, b2; + __u32 mask; + int bit = fn_bit; + + dw = bit >> 0x05; + + b1 = a1->s6_addr32[dw]; + b2 = a2->s6_addr32[dw]; + + bit = ~bit; + bit &= 0x1f; + mask = htonl(1 << bit); + return !((b1 ^ b2) & mask); +} + +/* + * find the first different bit between two addresses + */ +static __inline__ int addr_diff(struct in6_addr *a1, struct in6_addr *a2) +{ + int i; + + for (i = 0; i<4; i++) + { + __u32 b1, b2; + __u32 xb; + + b1 = a1->s6_addr32[i]; + b2 = a2->s6_addr32[i]; + + xb = b1 ^ b2; + + if (xb) + { + int res = 0; + int j=31; + + xb = ntohl(xb); + + while (test_bit(j, &xb) == 0) + { + res++; + j--; + } + + return (i * 32 + res); + } + } + + /* + * bit values are in range [0-127] + * 128 is an ilegal value as we should *never* get to + * this point since that would mean the addrs are equal + */ + return 128; +} + +/* + * add a rt to a node that may already contain routes + * sort routes in ascending metric order so that fib lookup + * returns the smallest metric by default + */ + +static __inline__ void fib6_add_rt2node(struct fib6_node *fn, + struct rt6_info *rt) +{ + struct rt6_info *iter, **back; + + rt->fib_node = fn; + back = &fn->leaf; + + for (iter = fn->leaf; iter; iter=iter->next) + { + if (iter->rt_metric > rt->rt_metric) + { + break; + } + + back = &iter->next; + } + + rt->next = iter; + *back = rt; +} + +/* + * Routing Table + */ + +static int fib6_add_1(struct rt6_info *rt) +{ + struct fib6_node *fn; + struct fib6_node *pn = NULL; + struct 
fib6_node *in; + struct fib6_node *ln; + struct in6_addr *addr; + __u32 bit; + __u32 dir = 0; + __u32 sernum = ++rt_sernum; + int pbit = rt->rt_prefixlen - 1; + + addr = &rt->rt_dst; + + /* insert node in tree */ + + fn = &routing_table; + + for (;;) + { + if (fn == NULL) + { + ln = kmalloc(sizeof(struct fib6_node), GFP_ATOMIC); + + if (ln == NULL) + return (-ENOMEM); + + memset(ln, 0, sizeof(struct fib6_node)); + ln->fn_bit = pbit; + ln->fn_flags = RTN_BACKTRACK; + + ln->parent = pn; + ln->leaf = rt; + ln->fn_sernum = sernum; + rt->fib_node = ln; + + atomic_inc(&rt->rt_ref); + + if (dir) + pn->right = ln; + else + pn->left = ln; + + rt6_stats.fib_nodes++; + rt6_stats.fib_route_nodes++; + rt6_stats.fib_rt_entries++; + + return(0); + } + + if (addr_match(&fn->leaf->rt_dst, addr, fn->fn_bit)) + { + if (pbit == fn->fn_bit && pbit && + addr_bit_equal(addr, &fn->leaf->rt_dst, + rt->rt_prefixlen)) + { + /* clean up an intermediate node */ + if ((fn->fn_flags & RTN_BACKTRACK) == 0) + { + rt_release(fn->leaf); + fn->leaf = NULL; + fn->fn_flags |= RTN_BACKTRACK; + } + + fib6_add_rt2node(fn, rt); + fn->fn_sernum = sernum; + atomic_inc(&rt->rt_ref); + + rt6_stats.fib_route_nodes++; + rt6_stats.fib_rt_entries++; + + return 0; + } + + if (pbit > fn->fn_bit || pbit == 0) + { + /* walk down on tree */ + + fn->fn_sernum = sernum; + + dir = addr_bit_set(addr, fn->fn_bit); + pn = fn; + fn = dir ? fn->right: fn->left; + + continue; + } + } + + /* + * split since we don't have a common prefix anymore or + * we have a less significant route. 
+ * we've to insert an intermediate node on the list + * this new node will point to the one we need to create + * and the current + */ + + pn = fn->parent; + + /* find 1st bit in difference between the 2 addrs */ + bit = addr_diff(addr, &fn->leaf->rt_dst); + + + /* + * (intermediate) + * / \ + * (new leaf node) (old node) + */ + if (rt->rt_prefixlen > bit) + { + in = kmalloc(sizeof(struct fib6_node), GFP_ATOMIC); + + if (in == NULL) + return (-ENOMEM); + + memset(in, 0, sizeof(struct fib6_node)); + + /* + * new intermediate node. + * RTN_BACKTRACK will + * be off since that an address that chooses one of + * the branches would not match less specific routes + * int the other branch + */ + + in->fn_bit = bit; + in->parent = pn; + in->leaf = rt; + in->fn_sernum = sernum; + atomic_inc(&rt->rt_ref); + + /* leaf node */ + ln = kmalloc(sizeof(struct fib6_node), GFP_ATOMIC); + + if (ln == NULL) + { + kfree(in); + return (-ENOMEM); + } + + /* update parent pointer */ + if (dir) + pn->right = in; + else + pn->left = in; + + memset(ln, 0, sizeof(struct fib6_node)); + ln->fn_bit = pbit; + ln->fn_flags = RTN_BACKTRACK; + + ln->parent = in; + fn->parent = in; + + ln->leaf = rt; + ln->fn_sernum = sernum; + atomic_inc(&rt->rt_ref); + + rt->fib_node = ln; + + if (addr_bit_set(addr, bit)) + { + in->right = ln; + in->left = fn; + } + else + { + in->left = ln; + in->right = fn; + } + + rt6_stats.fib_nodes += 2; + rt6_stats.fib_route_nodes++; + rt6_stats.fib_rt_entries++; + + return 0; + } + + /* + * (new leaf node) + * / \ + * (old node) NULL + */ + + ln = kmalloc(sizeof(struct fib6_node), GFP_ATOMIC); + + if (ln == NULL) + return (-ENOMEM); + + memset(ln, 0, sizeof(struct fib6_node)); + ln->fn_bit = pbit; + ln->fn_flags = RTN_BACKTRACK; + + + ln->parent = pn; + ln->leaf = rt; + ln->fn_sernum = sernum; + atomic_inc(&rt->rt_ref); + + rt->fib_node = ln; + + if (dir) + pn->right = ln; + else + pn->left = ln; + + + if (addr_bit_set(&fn->leaf->rt_dst, pbit)) + ln->right = fn; + else + 
ln->left = fn; + + fn->parent = ln; + + rt6_stats.fib_nodes++; + rt6_stats.fib_route_nodes++; + rt6_stats.fib_rt_entries++; + + return(0); + } + + return (-1); +} + +static struct rt6_info * fib6_lookup_1(struct in6_addr *addr, int flags) +{ + struct fib6_node *fn, *next; + int dir; + + fn = &routing_table; + + for (;;) + { + dir = addr_bit_set(addr, fn->fn_bit); + + next = dir ? fn->right: fn->left; + + if (next) + { + fn = next; + continue; + } + + break; + } + + + while ((fn->fn_flags & RTN_ROOT) == 0) + { + if (fn->fn_flags & RTN_BACKTRACK) + { + if (addr_match(&fn->leaf->rt_dst, addr, + fn->leaf->rt_prefixlen)) + { + struct rt6_info *rt; + + for (rt = fn->leaf; rt; rt = rt->next) + { + if ((rt->rt_flags & flags) == 0) + return rt; + } + } + } + + fn = fn->parent; + } + + return NULL; +} + + + +/* + * called to trim the tree of intermediate nodes when possible + */ + +static void fib6_del_3(struct fib6_node *fn) +{ + int children = 0; + int dir = 0; + int bit; + + /* + * 0 or one children: + * delete the node + * + * 2 children: + * move the bit down + */ + + if (fn->left) + { + children++; + dir = 0; + } + + if (fn->right) + { + children++; + dir = 1; + } + + if (children < 2) + { + struct fib6_node *child; + struct fib6_node *pn; + + child = dir ? 
fn->right : fn->left; + + if (fn->parent->left == fn) + { + fn->parent->left = child; + } + else + { + fn->parent->right = child; + } + + if (child) + { + child->parent = fn->parent; + } + + /* + * try to collapse on top + */ + pn = fn->parent; + fn->parent = NULL; + + if ((pn->fn_flags & (RTN_BACKTRACK | RTN_ROOT)) == 0) + { + if (pn->leaf) + { + rt_release(pn->leaf); + pn->leaf = NULL; + } + fib6_del_3(pn); + } + + if (fn->fn_flags & RTN_BACKTRACK) + { + rt6_stats.fib_route_nodes--; + } + rt6_stats.fib_nodes--; + kfree(fn); + return; + } + + bit = addr_diff(&fn->left->leaf->rt_dst, &fn->right->leaf->rt_dst); + + fn->fn_bit = bit; + fn->fn_flags &= ~RTN_BACKTRACK; + + fn->leaf = fn->left->leaf; + atomic_inc(&fn->leaf->rt_ref); + + rt6_stats.fib_route_nodes--; +} + +static struct fib6_node * fib6_del_2(struct in6_addr *addr, __u32 prefixlen, + struct in6_addr *gw, struct device *dev) +{ + struct fib6_node *fn; + + for (fn = &routing_table; fn;) + { + int dir; + + if ((fn->fn_flags & RTN_BACKTRACK) && + prefixlen == fn->leaf->rt_prefixlen && + addr_match(&fn->leaf->rt_dst, addr, fn->leaf->rt_prefixlen) + ) + { + break; + } + + dir = addr_bit_set(addr, fn->fn_bit); + + fn = dir ? 
fn->right: fn->left; + } + + /* + * if route tree node found + * search among it's entries + */ + + if (fn) + { + struct rt6_info *back = NULL; + struct rt6_info *lf; + + for(lf = fn->leaf; lf; lf=lf->next) + { + if ((gw && (ipv6_addr_cmp(addr, &lf->rt_dst) == 0)) || + (dev && dev == lf->rt_dev)) + { + /* delete this entry */ + if (back == NULL) + fn->leaf = lf->next; + else + back->next = lf->next; + + lf->fib_node = NULL; + rt_release(lf); + return fn; + } + back = lf; + } + } + + return NULL; +} + +static struct fib6_node * fib6_del_rt_2(struct rt6_info *rt) +{ + struct fib6_node *fn; + struct in6_addr *addr = &rt->rt_dst; + int prefixlen = rt->rt_prefixlen; + + for (fn = &routing_table; fn;) + { + int dir; + + if ((fn->fn_flags & RTN_BACKTRACK) && + prefixlen == fn->leaf->rt_prefixlen && + addr_match(&fn->leaf->rt_dst, addr, fn->leaf->rt_prefixlen) + ) + { + break; + } + + dir = addr_bit_set(addr, fn->fn_bit); + + fn = dir ? fn->right: fn->left; + } + + /* + * if route tree node found + * search among its entries + */ + + if (fn) + { + struct rt6_info **back; + struct rt6_info *lf; + + back = &fn->leaf; + + for(lf = fn->leaf; lf; lf=lf->next) + { + if (rt == lf) + { + /* + * delete this entry + */ + + *back = lf->next; + rt_release(lf); + return fn; + } + back = &lf->next; + } + } + + return NULL; +} + +int fib6_del_1(struct in6_addr *addr, __u32 prefixlen, struct in6_addr *gw, + struct device *dev) +{ + struct fib6_node *fn; + + fn = fib6_del_2(addr, prefixlen, gw, dev); + + if (fn == NULL) + return -ENOENT; + + if (fn->leaf == NULL) + { + fib6_del_3(fn); + } + + return 0; +} + +int fib6_del_rt(struct rt6_info *rt) +{ + struct fib6_node *fn; + + fn = fib6_del_rt_2(rt); + + if (fn == NULL) + return -ENOENT; + + if (fn->leaf == NULL) + { + fib6_del_3(fn); + } + + return 0; +} + +static void fib6_flush_1(struct fib6_node *fn, void *p_arg) +{ + struct rt6_info *rt; + + for (rt = fn->leaf; rt;) + { + struct rt6_info *itr; + + itr = rt; + rt = rt->next; + 
itr->fib_node = NULL; + rt_release(itr); + } + + if (fn->fn_flags & RTN_BACKTRACK) + { + rt6_stats.fib_route_nodes--; + } + rt6_stats.fib_nodes--; + kfree(fn); +} + +void fib6_flush(void) +{ + rt6_walk_tree(fib6_flush_1, NULL, RT6_FILTER_NONE); +} + +int ipv6_route_add(struct in6_rtmsg *rtmsg) +{ + struct rt6_info *rt; + struct device * dev = NULL; + struct inet6_dev *idev; + struct rt6_req *request; + int flags = rtmsg->rtmsg_flags; + + idev = ipv6_dev_by_index(rtmsg->rtmsg_ifindex); + if (idev) + { + dev = idev->dev; + } + + rt = (struct rt6_info *) kmalloc(sizeof(struct rt6_info), + GFP_ATOMIC); + + rt6_stats.fib_rt_alloc++; + + memset(rt, 0, sizeof(struct rt6_info)); + + memcpy(&rt->rt_dst, &rtmsg->rtmsg_dst, sizeof(struct in6_addr)); + rt->rt_prefixlen = rtmsg->rtmsg_prefixlen; + + if (rt->rt_prefixlen == 0) + { + printk(KERN_DEBUG "ip6_fib: zero length route not allowed\n"); + return -EINVAL; + } + + if (flags & (RTF_GATEWAY | RTF_NEXTHOP)) + { + /* check to see if its an acceptable gateway */ + if (flags & RTF_GATEWAY) + { + struct rt6_info *gw_rt; + + gw_rt = fibv6_lookup(&rtmsg->rtmsg_gateway, dev, + RTI_GATEWAY); + + if (gw_rt == NULL) + { + return -EHOSTUNREACH; + } + + dev = gw_rt->rt_dev; + } + + rt->rt_nexthop = ndisc_get_neigh(dev, &rtmsg->rtmsg_gateway); + + if (rt->rt_nexthop == NULL) + { + printk(KERN_DEBUG "ipv6_route_add: no nexthop\n"); + kfree(rt); + return -EINVAL; + } + + rt->rt_dev = dev; + + if (loopback_rt == NULL && (dev->flags & IFF_LOOPBACK)) + { + loopback_rt = rt; + } + + } + else + { + if (dev == NULL) + { + printk(KERN_DEBUG "ipv6_route_add: NULL dev\n"); + kfree(rt); + return -EINVAL; + } + + rt->rt_dev = dev; + rt->rt_nexthop = NULL; + } + + rt->rt_metric = rtmsg->rtmsg_metric; + rt->rt_flags = rtmsg->rtmsg_flags; + + if (rt->rt_flags & RTF_ADDRCONF) + { + rt->rt_expires = rtmsg->rtmsg_info; + } + + request = kmalloc(sizeof(struct rt6_req), GFP_ATOMIC); + if (request == NULL) + { + printk(KERN_WARNING "ipv6_route_add: kmalloc 
failed\n"); + return -ENOMEM; + } + + request->operation = RT_OPER_ADD; + request->ptr = rt; + request->next = request->prev = NULL; + rtreq_queue(request); + rt6_bh_mask |= RT_BH_REQUEST; + + rt6_run_bh(); + + return 0; +} + +int ipv6_route_del(struct in6_rtmsg *rtmsg) +{ + struct rt6_info * rt; + int res = -ENOENT; + + atomic_inc(&rt6_lock); + + rt = fib6_lookup_1(&rtmsg->rtmsg_dst, 0); + + if (rt && (rt->rt_prefixlen == rtmsg->rtmsg_prefixlen)) + { + int test; + + start_bh_atomic(); + + test = (rt6_lock == 1); + + if (test) + { + res = fib6_del_rt(rt); + } + end_bh_atomic(); + + if (!test) + { + struct rt6_req *request; + + request = kmalloc(sizeof(struct rt6_req), GFP_KERNEL); + + if (!request) + { + res = -ENOMEM; + goto out; + } + request->operation = RT_OPER_DEL; + request->ptr = rt; + request->next = request->prev = NULL; + rtreq_queue(request); + rt6_bh_mask |= RT_BH_REQUEST; + res = 0; + } + } + out: + atomic_dec(&rt6_lock); + rt6_run_bh(); + return res; +} + +/* + * search the routing table + * the flags parameter restricts the search to entries where + * the flag is *not* set + */ +struct rt6_info * fibv6_lookup(struct in6_addr *addr, struct device *src_dev, + int flags) +{ + struct rt6_info *rt; + + atomic_inc(&rt6_lock); + + if ((rt = fib6_lookup_1(addr, flags))) + { + if (src_dev) + { + struct rt6_info *sprt; + + for (sprt=rt; sprt; sprt=sprt->next) + { + if (sprt->rt_dev == src_dev) + { + rt = sprt; + goto out; + } + } + + if (flags & RTI_DEVRT) + { + rt = NULL; + } + } + + goto out; + } + + if (!(flags & RTI_GATEWAY)) + { + if ((rt = dflt_rt_lookup())) + { + goto out; + } + + rt = last_resort_rt; + } + out: + atomic_dec(&rt6_lock); + return rt; +} + +/* + * Destination Cache + */ + +struct dest_entry * ipv6_dst_route(struct in6_addr * daddr, + struct device *src_dev, + int flags) +{ + struct dest_entry * dc = NULL; + struct rt6_info * rt; + + atomic_inc(&rt6_lock); + + rt = fibv6_lookup(daddr, src_dev, flags); + + if (rt == NULL) + { + goto exit; + 
} + + if (rt->rt_nexthop) + { + /* + * We can use the generic route + * (warning: the pmtu value maybe invalid) + */ + + dc = (struct dest_entry *) rt; + atomic_inc(&rt->rt_use); + } + else + { + struct rt6_req *request; + + if (ipv6_chk_addr(daddr) && !(rt->rt_dev->flags & IFF_LOOPBACK)) + { + rt = loopback_rt; + + if (rt == NULL) + { + goto exit; + } + } + + /* + * dynamicly allocate a new route + */ + + dc = (struct dest_entry *) kmalloc(sizeof(struct dest_entry), + GFP_ATOMIC); + + if (dc == NULL) + { + printk(KERN_WARNING "dst_route: kmalloc failed\n"); + goto exit; + } + + rt6_stats.fib_rt_alloc++; + rt6_stats.fib_dc_alloc++; + + memset(dc, 0, sizeof(struct dest_entry)); + + memcpy(&dc->dc_addr, daddr, sizeof(struct in6_addr)); + dc->rt.rt_prefixlen = 128; + dc->dc_usecnt = 1; + dc->rt.rt_metric = rt->rt_metric; + + dc->dc_flags = (rt->rt_flags | RTF_HOST | RTI_DYNAMIC | + RTI_DCACHE | DCF_PMTU); + + dc->dc_pmtu = rt->rt_dev->mtu; + dc->rt.rt_dev = rt->rt_dev; + dc->rt.rt_output_method = rt->rt_output_method; + dc->dc_tstamp = jiffies; + /* add it to the request queue */ + + request = kmalloc(sizeof(struct rt6_req), GFP_ATOMIC); + + if (request == NULL) + { + printk(KERN_WARNING "dst_route: kmalloc failed\n"); + dc = NULL; + goto exit; + } + + dc->dc_nexthop = ndisc_get_neigh(rt->rt_dev, daddr); + + rt6_bh_mask |= RT_BH_REQUEST; + + request->operation = RT_OPER_ADD; + request->ptr = (struct rt6_info *) dc; + request->next = request->prev = NULL; + rtreq_queue(request); + } + + atomic_inc(&rt_clients); + + exit: + + atomic_dec(&rt6_lock); + rt6_run_bh(); + + return dc; +} + +/* + * check cache entry for vality... 
+ * this needs to be done as a inline func that calls + * ipv6_slow_dst_check if entry is invalid + */ + +struct dest_entry * ipv6_dst_check(struct dest_entry *dc, + struct in6_addr *daddr, + __u32 sernum, int flags) +{ + int uptodate = 0; + + /* + * destination cache becomes invalid when routing + * changes or a more specific dynamic entry is + * created. + * if route is removed from table fib_node will + * become NULL + */ + + if (dc->rt.fib_node && (dc->rt.fib_node->fn_sernum == sernum)) + uptodate = 1; + + if (uptodate && ((dc->dc_flags & DCF_INVALID) == 0)) + { + if (dc->dc_nexthop && !(dc->dc_nexthop->flags & NCF_NOARP)) + { + ndisc_event_send(dc->dc_nexthop, NULL); + } + return dc; + } + + /* route for destination may have changed */ + + ipv6_dst_unlock(dc); + + return ipv6_dst_route(daddr, NULL, flags); +} + +void ipv6_dst_unlock(struct dest_entry *dc) +{ + /* + * decrement counter and mark entry for deletion + * if counter reaches 0. we delay deletions in hope + * we can reuse cache entries. + */ + + atomic_dec(&dc->dc_usecnt); + + if (dc->dc_usecnt == 0) + { + + if (dc->dc_flags & RTI_DCACHE) + { + /* + * update last usage tstamp + */ + + dc->dc_tstamp = jiffies; + rt6_bh_mask |= RT_BH_GC; + } + + if (dc->rt.rt_ref == 0) + { + /* + * entry out of the routing table + * pending to be released on last deref + */ + + if (dc->dc_nexthop) + { + ndisc_dec_neigh(dc->dc_nexthop); + } + + if (dc->dc_flags & RTI_DCACHE) + { + rt6_stats.fib_dc_alloc--; + } + + rt6_stats.fib_rt_alloc--; + kfree(dc); + } + + } + + atomic_dec(&rt_clients); +} + +/* + * Received a packet too big icmp that lowers the mtu for this + * address. If the route for the destination is genric we create + * a new route with the apropriate MTU info. The route_add + * procedure will update the serial number on the generic routes + * belonging to the afected tree forcing clients to request a route + * lookup. 
+ */ +void rt6_handle_pmtu(struct in6_addr *addr, int pmtu) +{ + struct rt6_info *rt; + struct rt6_req *req; + struct dest_entry *dc; + + printk(KERN_DEBUG "rt6_handle_pmtu\n"); + + if (pmtu < 0 || pmtu > 65536) + { + printk(KERN_DEBUG "invalid MTU value\n"); + return; + } + + rt = fibv6_lookup(addr, NULL, 0); + + if (rt == NULL) + { + printk(KERN_DEBUG "rt6_handle_pmtu: route not found\n"); + return; + } + + if (rt->rt_flags & RTI_DCACHE) + { + /* + * we do have a destination cache entry for this + * address. + */ + + dc = (struct dest_entry *) rt; + + /* + * fixme: some sanity checks are likely to be needed + * here + */ + + dc->dc_pmtu = pmtu; + dc->dc_flags |= DCF_PMTU; + return; + } + + req = (struct rt6_req *) kmalloc(sizeof(struct rt6_req), GFP_ATOMIC); + + /* now add the new destination cache entry */ + + dc = (struct dest_entry *) kmalloc(sizeof(struct dest_entry), + GFP_ATOMIC); + + rt6_stats.fib_rt_alloc++; + rt6_stats.fib_dc_alloc++; + + memset(dc, 0, sizeof(struct dest_entry)); + + memcpy(&dc->dc_addr, addr, sizeof(struct in6_addr)); + dc->rt.rt_prefixlen = 128; + dc->rt.rt_metric = rt->rt_metric; + + dc->dc_flags = (rt->rt_flags | RTI_DYNAMIC | RTI_DCACHE | DCF_PMTU | + RTF_HOST); + + dc->dc_pmtu = pmtu; + dc->dc_tstamp = jiffies; + + dc->dc_nexthop = rt->rt_nexthop; + atomic_inc(&dc->dc_nexthop->refcnt); + + dc->rt.rt_dev = rt->rt_dev; + dc->rt.rt_output_method = rt->rt_output_method; + + req->operation = RT_OPER_ADD; + req->ptr = (struct rt6_info *) dc; + req->next = req->prev = NULL; + + rtreq_queue(req); + + rt6_bh_mask |= RT_BH_REQUEST; + + rt6_run_bh(); +} + +/* + * Redirect received: target is nexthop for dest + */ +struct rt6_info * ipv6_rt_redirect(struct device *dev, struct in6_addr *dest, + struct in6_addr *target, int on_link) + +{ + struct rt6_info *rt; + struct rt6_req *req; + int metric; + + rt = fibv6_lookup(dest, dev, 0); + + if (rt == NULL) + { + printk(KERN_WARNING "rt_redirect: unable to locate route\n"); + return NULL; + } + + 
metric = rt->rt_metric; + + if ((rt->rt_flags & RTF_HOST) == 0) + { + /* Need to create an host route for this address */ + + rt = (struct rt6_info *) kmalloc(sizeof(struct rt6_info), + GFP_ATOMIC); + memset(rt, 0, sizeof(struct rt6_info)); + ipv6_addr_copy(&rt->rt_dst, dest); + rt->rt_prefixlen = 128; + rt->rt_flags = RTF_HOST | RTF_UP; + rt->rt_dev = dev; + + /* + * clone rt->rt_output_method ? + */ + + rt->rt_metric = metric; + + rt6_stats.fib_rt_alloc++; + + req = (struct rt6_req *) kmalloc(sizeof(struct rt6_req), + GFP_ATOMIC); + req->operation = RT_OPER_ADD; + req->ptr = rt; + req->next = req->prev = NULL; + + rtreq_queue(req); + rt6_bh_mask |= RT_BH_REQUEST; + } + else + { + rt->rt_flags |= RTF_MODIFIED; + } + + rt->rt_flags |= RTF_DYNAMIC; + + if (on_link) + { + rt->rt_flags &= ~RTF_GATEWAY; + } + else + { + rt->rt_flags |= RTF_GATEWAY; + } + + if (rt->rt_nexthop) + { + if (ipv6_addr_cmp(&rt->rt_nexthop->addr, target) == 0) + { + atomic_inc(&rt->rt_nexthop->refcnt); + goto exit; + } + else + { + ndisc_dec_neigh(rt->rt_nexthop); + } + } + + rt->rt_nexthop = ndisc_get_neigh(dev, target); + + exit: + rt6_run_bh(); + return rt; +} + +static int dcache_gc_node(struct fib6_node *fn, int timeout) +{ + struct rt6_info *rt, *back; + int more = 0; + unsigned long now = jiffies; + + back = NULL; + + for (rt = fn->leaf; rt;) + { + if ((rt->rt_flags & RTI_DCACHE) && rt->rt_use == 0) + { + struct dest_entry *dc; + + dc = (struct dest_entry *) rt; + + if (now - dc->dc_tstamp > timeout) + { + struct rt6_info *old; + + old = rt; + + rt = rt->next; + + if (back == NULL) + { + fn->leaf = rt; + } + else + { + back->next = rt; + } + + old->fib_node = NULL; + rt_release(old); + rt6_stats.fib_rt_entries--; + continue; + } + else + { + more++; + } + } + + back = rt; + rt = rt->next; + } + + if (fn->leaf == NULL) + { + return -1; + } + return more; +} + +struct dc_gc_args { + unsigned long timeout; + int more; +}; + +static void dc_garbage_collect(struct fib6_node *fn, void *p_arg) 
+{ + struct dc_gc_args * args = (struct dc_gc_args *) p_arg; + + if (fn->fn_flags & RTN_BACKTRACK) + { + if (fn->fn_bit == 127) + { + int more; + + more = dcache_gc_node(fn, args->timeout); + + if (more == -1) + { + if (fn->parent->left == fn) + fn->parent->left = NULL; + else + fn->parent->right = NULL; + + kfree(fn); + + rt6_stats.fib_nodes--; + rt6_stats.fib_route_nodes--; + + return; + } + args->more += more; + } + } + else if (!(fn->fn_flags & RTN_ROOT)) + { + int children = 0; + struct fib6_node *chld = NULL; + + if (fn->left) + { + children++; + chld = fn->left; + } + + if (fn->right) + { + children++; + chld = fn->right; + } + + if (children <= 1) + { + struct fib6_node *pn = fn->parent; + + if (pn->left == fn) + { + pn->left = chld; + } + else + { + pn->right = chld; + } + + if (chld) + { + chld->parent = pn; + } + + rt_release(fn->leaf); + + rt6_stats.fib_nodes--; + kfree(fn); + } + } +} + +/* + * called with ints off + */ + +static void __rt6_run_bh(void) +{ + static last_gc_run = 0; + + if (rt6_bh_mask & RT_BH_REQUEST) + { + struct rt6_req *request; + + while ((request = rtreq_dequeue())) + { + struct rt6_info *rt; + + rt = request->ptr; + + switch (request->operation) { + case RT_OPER_ADD: + fib6_add_1(rt); + break; + + case RT_OPER_DEL: + fib6_del_rt(rt); + break; + + default: + printk(KERN_WARNING + "rt6_run_bh: bad request in queue\n"); + } + + kfree(request); + } + + rt6_bh_mask &= ~RT_BH_REQUEST; + } + + if (rt6_bh_mask & RT_BH_GC) + { + if (jiffies - last_gc_run > DC_TIME_RUN) + { + struct dc_gc_args args; + + if (rt6_stats.fib_dc_alloc >= DC_WATER_MARK) + args.timeout = DC_SHORT_TIMEOUT; + else + args.timeout = DC_LONG_TIMEOUT; + + args.more = 0; + rt6_walk_tree(dc_garbage_collect, &args, RT6_FILTER_NONE); + + last_gc_run = jiffies; + + if (!args.more) + { + rt6_bh_mask &= ~RT_BH_GC; + } + } + } +} + +/* + * Timer for expiring routes learned via addrconf and stale DC + * entries when there is no network actuvity + */ + +void 
rt6_timer_handler(unsigned long data) +{ + unsigned long flags; + + save_flags(flags); + cli(); + + if (rt6_lock == 0) + { + if (rt_clients == 0 && rt6_bh_mask) + { + __rt6_run_bh(); + } + + /* + * route expiry + */ + + rt6_walk_tree(rt6_rt_timeout, NULL, RT6_FILTER_RTNODES); + } + + restore_flags(flags); + + rt6_gc_timer.expires = jiffies + 4 * DC_LONG_TIMEOUT; + add_timer(&rt6_gc_timer); +} + +/* + * Check if routes should be timed out. + * Called from rt6_walk_tree for every node. + */ + +static void rt6_rt_timeout(struct fib6_node *fn, void *arg) +{ + struct rt6_info *rt; + unsigned long now = jiffies; + + for (rt = fn->leaf; rt; rt = rt->next) + { + if ((rt->rt_flags & RTF_ADDRCONF) && now > rt->rt_expires) + { + struct rt6_req *req; + + /* + * request route deletion. routes will only + * be deleted after walk_tree completes + */ + + req = (struct rt6_req *) kmalloc(sizeof(struct rt6_req), + GFP_ATOMIC); + req->operation = RT_OPER_DEL; + req->ptr = rt; + req->next = req->prev = NULL; + } + } +} + +static void rt6_sndrtmsg(struct in6_rtmsg *rtmsg) +{ + struct sk_buff *skb; + + skb = alloc_skb(sizeof(struct in6_rtmsg), GFP_ATOMIC); + if (skb == NULL) + return; + + skb->free = 1; + + memcpy(skb_put(skb, sizeof(struct in6_rtmsg)), &rtmsg, + sizeof(struct in6_rtmsg)); + + if (netlink_post(NETLINK_ROUTE6, skb)) + { + kfree_skb(skb, FREE_WRITE); + } +} + +int ipv6_route_ioctl(unsigned int cmd, void *arg) +{ + struct in6_rtmsg rtmsg; + int err; + + switch(cmd) + { + case SIOCADDRT: /* Add a route */ + case SIOCDELRT: /* Delete a route */ + if (!suser()) + return -EPERM; + err = copy_from_user(&rtmsg, arg, + sizeof(struct in6_rtmsg)); + if (err) + return -EFAULT; + + err = (cmd == SIOCDELRT) ? 
ipv6_route_del(&rtmsg) : + ipv6_route_add(&rtmsg); + + if (err == 0) + { + rt6_sndrtmsg(&rtmsg); + } + return err; + } + + return -EINVAL; +} + +static void rt6_ifdown_scan(struct fib6_node *fn, void *arg) +{ + struct rt6_info *rt; + struct device *dev = (struct device *) arg; + + for (rt = fn->leaf; rt; rt=rt->next) + { + if (((rt->rt_flags & RTI_DCACHE) == 0) && rt->rt_dev == dev) + { + struct rt6_req *req; + + req = kmalloc(sizeof(struct rt6_req), GFP_ATOMIC); + req->operation = RT_OPER_DEL; + req->ptr = rt; + req->next = req->prev = NULL; + rt6_bh_mask |= RT_BH_REQUEST; + } + } +} + +void rt6_ifdown(struct device *dev) +{ + rt6_walk_tree(rt6_ifdown_scan, (void *) dev, RT6_FILTER_RTNODES); +} + +static void rt6_walk_tree(f_pnode func, void * arg, int filter) +{ + struct fib6_node *fn; + /* + * adquire lock + * this warranties that the operation will be atomic with + * respect to the garbage collect routine that also does + * a tree transversal and tags nodes with the RTN_TAG flag + */ + atomic_inc(&rt6_lock); + + fn = &routing_table; + + do { + if (!(fn->fn_flags & RTN_TAG)) + { + fn->fn_flags |= RTN_TAG; + + if (fn->left) + { + fn = fn->left; + continue; + } + } + + fn->fn_flags &= ~RTN_TAG; + + if (fn->right) + { + fn = fn->right; + continue; + } + + do { + struct fib6_node *node; + + if (fn->fn_flags & RTN_ROOT) + break; + node = fn; + fn = fn->parent; + + if (!(node->fn_flags & RTN_TAG) && + (!filter || (node->fn_flags & RTN_BACKTRACK))) + { + (*func)(node, arg); + } + + } while (!(fn->fn_flags & RTN_TAG)); + + } while (!(fn->fn_flags & RTN_ROOT) || (fn->fn_flags & RTN_TAG)); + + atomic_dec(&rt6_lock); +} + +#ifdef CONFIG_PROC_FS +#define RT6_INFO_LEN (32 + 2 + 32 + 2 + 2 + 2 + 4 + 8 + 7 + 1) + +struct rt6_proc_arg { + char *buffer; + int offset; + int length; + int skip; + int len; +}; + +static void rt6_info_node(struct fib6_node *fn, void *p_arg) +{ + struct rt6_info *rt; + struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg; + + for (rt = fn->leaf; 
rt; rt = rt->next) + { + int i; + + if (arg->skip < arg->offset / RT6_INFO_LEN) + { + arg->skip++; + continue; + } + + if (arg->len >= arg->length) + return; + + for (i=0; i<16; i++) + { + sprintf(arg->buffer + arg->len, "%02x", + rt->rt_dst.s6_addr[i]); + arg->len += 2; + } + arg->len += sprintf(arg->buffer + arg->len, " %02x ", + rt->rt_prefixlen); + if (rt->rt_nexthop) + { + for (i=0; i<16; i++) + { + sprintf(arg->buffer + arg->len, "%02x", + rt->rt_nexthop->addr.s6_addr[i]); + arg->len += 2; + } + } + else + { + sprintf(arg->buffer + arg->len, + "00000000000000000000000000000000"); + arg->len += 32; + } + arg->len += sprintf(arg->buffer + arg->len, + " %02x %02x %02x %04x %8s\n", + rt->rt_metric, rt->rt_use, + rt->rt_ref, rt->rt_flags, + rt->rt_dev ? rt->rt_dev->name : ""); + } +} + +static int rt6_proc_info(char *buffer, char **start, off_t offset, int length, + int dummy) +{ + struct rt6_proc_arg arg; + struct fib6_node sfn; + arg.buffer = buffer; + arg.offset = offset; + arg.length = length; + arg.skip = 0; + arg.len = 0; + + rt6_walk_tree(rt6_info_node, &arg, RT6_FILTER_RTNODES); + + sfn.leaf = default_rt_list; + rt6_info_node(&sfn, &arg); + + sfn.leaf = last_resort_rt; + rt6_info_node(&sfn, &arg); + + *start = buffer; + + if (offset) + *start += offset % RT6_INFO_LEN; + + arg.len -= offset % RT6_INFO_LEN; + + if (arg.len > length) + arg.len = length; + + return arg.len; +} + + +static int rt6_proc_stats(char *buffer, char **start, off_t offset, int length, + int dummy) +{ + int len; + + len = sprintf(buffer, "%04x %04x %04x %04x %04x\n", + rt6_stats.fib_nodes, rt6_stats.fib_route_nodes, + rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries, + rt6_stats.fib_dc_alloc); + + len -= offset; + + if (len > length) + len = length; + + *start = buffer + offset; + + return len; +} + +#endif /* CONFIG_PROC_FS */ + +/* + * init/cleanup code + * + */ + +void ipv6_route_init(void) +{ +#ifdef CONFIG_PROC_FS + proc_net_register(&(struct proc_dir_entry) { + PROC_NET_RT6, 10, 
"ipv6_route", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + rt6_proc_info + }); + proc_net_register(&(struct proc_dir_entry) { + PROC_NET_RT6_STATS, 9, "rt6_stats", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + rt6_proc_stats + }); + +#endif + rt6_gc_timer.expires = jiffies + 4 * DC_LONG_TIMEOUT; + add_timer(&rt6_gc_timer); + netlink_attach(NETLINK_ROUTE6, rt6_msgrcv); +} + +#ifdef MODULE +void ipv6_route_cleanup(void) +{ + proc_net_unregister(PROC_NET_RT6); + proc_net_unregister(PROC_NET_RT6_STATS); + netlink_detach(NETLINK_ROUTE6); + del_timer(&rt6_gc_timer); + fib6_flush(); +} +#endif + +/* + * NETLINK interface + * routing socket moral equivalent + */ + +static int rt6_msgrcv(int unit, struct sk_buff *skb) +{ + int count = 0; + struct in6_rtmsg *rtmsg; + + while (skb->len) + { + if (skb->len < sizeof(struct in6_rtmsg)) + { + count = -EINVAL; + goto out; + } + + rtmsg = (struct in6_rtmsg *) skb->data; + skb_pull(skb, sizeof(struct in6_rtmsg)); + count += sizeof(struct in6_rtmsg); + + switch (rtmsg->rtmsg_type) { + case RTMSG_NEWROUTE: + ipv6_route_add(rtmsg); + break; + case RTMSG_DELROUTE: + ipv6_route_del(rtmsg); + break; + default: + count = -EINVAL; + goto out; + } + } + + out: + kfree_skb(skb, FREE_READ); + return count; +} + +void rt6_sndmsg(__u32 type, struct in6_addr *dst, struct in6_addr *gw, + __u16 plen, struct device *dev, __u16 metric, __u16 flags) +{ + struct sk_buff *skb; + struct in6_rtmsg *msg; + int ifindex = 0; + + skb = alloc_skb(sizeof(struct in6_rtmsg), GFP_ATOMIC); + if (skb == NULL) + return; + + skb->free = 1; + + msg = (struct in6_rtmsg *) skb_put(skb, sizeof(struct in6_rtmsg)); + + msg->rtmsg_type = type; + + if (dst) + { + ipv6_addr_copy(&msg->rtmsg_dst, dst); + } + else + memset(&msg->rtmsg_dst, 0, sizeof(struct in6_addr)); + + if (gw) + { + ipv6_addr_copy(&msg->rtmsg_gateway, gw); + } + else + memset(&msg->rtmsg_gateway, 0, sizeof(struct in6_addr)); + + msg->rtmsg_prefixlen = plen; + 
msg->rtmsg_metric = metric; + + if (dev) + { + struct inet6_dev *idev; + + idev = ipv6_get_idev(dev); + if (idev) + { + ifindex = idev->if_index; + } + } + + msg->rtmsg_ifindex = ifindex; + + msg->rtmsg_flags = flags; + + if (netlink_post(NETLINK_ROUTE6, skb)) + { + kfree_skb(skb, FREE_WRITE); + } +} diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c new file mode 100644 index 000000000..7ae830876 --- /dev/null +++ b/net/ipv6/ipv6_sockglue.c @@ -0,0 +1,295 @@ +/* + * IPv6 BSD socket options interface + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * Based on linux/net/ipv4/ip_sockglue.c + * + * $Id: ipv6_sockglue.c,v 1.12 1996/10/29 22:45:53 roque Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> + +#include <linux/sysctl.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/ndisc.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/ipv6_route.h> +#include <net/addrconf.h> +#include <net/inet_common.h> +#include <net/sit.h> +#include <net/tcp.h> +#include <net/udp.h> + +#include <asm/uaccess.h> + +struct ipv6_mib ipv6_statistics={0, }; +struct packet_type ipv6_packet_type = +{ + 0, + NULL, /* All devices */ + ipv6_rcv, + NULL, + NULL +}; + +/* + * addrconf module should be notifyed of a device going up + */ +static struct notifier_block ipv6_dev_notf = { + addrconf_notify, + NULL, + 0 +}; + +int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, + int optlen) +{ + struct ipv6_pinfo 
*np = &sk->net_pinfo.af_inet6; + int val, err; + int retv = -EOPNOTSUPP; + + if(level!=SOL_IPV6) + goto out; + + if (optval == NULL) + { + val=0; + } + else + { + err = get_user(val, (int *) optval); + if(err) + return err; + } + + + switch (optname) { + + case IPV6_ADDRFORM: + if (val == PF_INET) + { + if (sk->protocol != IPPROTO_UDP && + sk->protocol != IPPROTO_TCP) + { + goto out; + } + + if (sk->state != TCP_ESTABLISHED) + { + retv = ENOTCONN; + goto out; + } + + if (!(ipv6_addr_type(&np->daddr) & IPV6_ADDR_MAPPED)) + { + retv = -EADDRNOTAVAIL; + goto out; + } + + if (sk->protocol == IPPROTO_TCP) + { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + sk->prot = &tcp_prot; + tp->af_specific = &ipv4_specific; + } + else + { + sk->prot = &udp_prot; + } + sk->socket->ops = &inet_proto_ops; + retv = 0; + } + else + { + retv = -EINVAL; + } + break; + + case IPV6_RXINFO: + np->rxinfo = val; + retv = 0; + break; + + case IPV6_UNICAST_HOPS: + if (val > 255) + { + retv = -EINVAL; + } + else + { + np->hop_limit = val; + retv = 0; + } + break; + + case IPV6_MULTICAST_HOPS: + if (val > 255) + { + retv = -EINVAL; + } + else + { + np->mcast_hops = val; + retv = 0; + } + break; + + case IPV6_MULTICAST_LOOP: + np->mc_loop = val; + break; + + case IPV6_MULTICAST_IF: + { + struct in6_addr addr; + + err = copy_from_user(&addr, optval, sizeof(struct in6_addr)); + if(err) + return -EFAULT; + + if (ipv6_addr_any(&addr)) + { + np->mc_if = NULL; + } + else + { + struct inet6_ifaddr *ifp; + + ifp = ipv6_chk_addr(&addr); + + if (ifp == NULL) + { + retv = -EADDRNOTAVAIL; + break; + } + + np->mc_if = ifp->idev->dev; + } + retv = 0; + break; + } + case IPV6_ADD_MEMBERSHIP: + case IPV6_DROP_MEMBERSHIP: + { + struct ipv6_mreq mreq; + struct device *dev = NULL; + int err; + + err = copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq)); + if(err) + return -EFAULT; + + if (mreq.ipv6mr_ifindex == 0) + { + struct in6_addr mcast; + struct dest_entry *dc; + + ipv6_addr_set(&mcast, 
__constant_htonl(0xff000000), + 0, 0, 0); + dc = ipv6_dst_route(&mcast, NULL, 0); + + if (dc) + { + dev = dc->rt.rt_dev; + ipv6_dst_unlock(dc); + } + } + else + { + struct inet6_dev *idev; + + if ((idev = ipv6_dev_by_index(mreq.ipv6mr_ifindex))) + { + dev = idev->dev; + } + } + + if (dev == NULL) + { + return -ENODEV; + } + + if (optname == IPV6_ADD_MEMBERSHIP) + { + retv = ipv6_sock_mc_join(sk, dev, &mreq.ipv6mr_multiaddr); + } + else + { + retv = ipv6_sock_mc_drop(sk, dev, &mreq.ipv6mr_multiaddr); + } + } + } + + out: + return retv; +} + +int ipv6_getsockopt(struct sock *sk, int level, int optname, char *optval, + int *optlen) +{ + return 0; +} + +#ifdef MODULE + +/* + * sysctl registration functions defined in sysctl_net_ipv6.c + */ + +extern void ipv6_sysctl_register(void); +extern void ipv6_sysctl_unregister(void); +#endif + +void ipv6_init(void) +{ + ipv6_packet_type.type = ntohs(ETH_P_IPV6); + + dev_add_pack(&ipv6_packet_type); + +#ifdef MODULE + ipv6_sysctl_register(); +#endif + + register_netdevice_notifier(&ipv6_dev_notf); + + ipv6_route_init(); +} + +#ifdef MODULE +void ipv6_cleanup(void) +{ + unregister_netdevice_notifier(&ipv6_dev_notf); + dev_remove_pack(&ipv6_packet_type); + ipv6_sysctl_unregister(); + ipv6_route_cleanup(); + ndisc_cleanup(); + addrconf_cleanup(); +} +#endif + +/* + * Local variables: + * compile-command: "gcc -D__KERNEL__ -I/usr/src/linux/include -Wall -Wstrict-prototypes -O6 -m486 -c ipv6_sockglue.c" + * End: + */ diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c new file mode 100644 index 000000000..14ba9ef5f --- /dev/null +++ b/net/ipv6/mcast.c @@ -0,0 +1,220 @@ +/* + * Multicast support for IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either 
version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/if_inet6.h> +#include <net/ndisc.h> +#include <net/ipv6_route.h> +#include <net/addrconf.h> + + +/* + * socket join on multicast group + */ +int ipv6_sock_mc_join(struct sock *sk, struct device *dev, + struct in6_addr *addr) +{ + struct ipv6_mc_socklist *mc_lst; + struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; + int err; + + if (!(ipv6_addr_type(addr) & IPV6_ADDR_MULTICAST)) + return -EINVAL; + + if(!(dev->flags & IFF_MULTICAST)) + return -EADDRNOTAVAIL; + + mc_lst = (struct ipv6_mc_socklist *) + kmalloc(sizeof(struct ipv6_mc_socklist), GFP_KERNEL); + + if (mc_lst == NULL) + return -ENOMEM; + + mc_lst->next = NULL; + memcpy(&mc_lst->addr, addr, sizeof(struct in6_addr)); + mc_lst->dev = dev; + + /* + * now add/increase the group membership on the device + */ + + err = ipv6_dev_mc_inc(dev, addr); + + if (err) + { + kfree(mc_lst); + return err; + } + + mc_lst->next = np->ipv6_mc_list; + np->ipv6_mc_list = mc_lst; + + return 0; +} + +/* + * socket leave on multicast group + */ +int ipv6_sock_mc_drop(struct sock *sk, struct device *dev, + struct in6_addr *addr) +{ + return 0; +} + +void ipv6_sock_mc_close(struct sock *sk) +{ + struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; + struct ipv6_mc_socklist *mc_lst; + + for (mc_lst = np->ipv6_mc_list; mc_lst; ) + { + struct ipv6_mc_socklist *back; + + /* + * leave group + */ + + back = mc_lst; + mc_lst = mc_lst->next; + kfree(back); + } +} + +/* + * device multicast group inc (add if not found) + */ +int ipv6_dev_mc_inc(struct device *dev, struct in6_addr *addr) +{ + struct ipv6_mc_list *mc; + struct 
inet6_dev *i6dev; + char buf[6]; + u8 hash; + + for (i6dev = inet6_dev_lst; i6dev; i6dev=i6dev->next) + if (i6dev->dev == dev) + break; + + if (i6dev == NULL) + { + printk(KERN_DEBUG "ipv6_dev_mc_inc: device not found\n"); + return -EINVAL; + } + + for (mc = i6dev->mc_list; mc; mc = mc->if_next) + if (ipv6_addr_cmp(&mc->addr, addr) == 0) + { + atomic_inc(&mc->users); + return 0; + } + + /* + * not found: create a new one. + */ + + mc = (struct ipv6_mc_list *) kmalloc(sizeof(struct ipv6_mc_list), + GFP_ATOMIC); + + if (mc == NULL) + { + return -ENOMEM; + } + + memset(mc, 0, sizeof(struct ipv6_mc_list)); + + memcpy(&mc->addr, addr, sizeof(struct in6_addr)); + mc->dev = dev; + mc->users = 1; + + hash = ipv6_addr_hash(addr); + + mc->next = inet6_mcast_lst[hash]; + inet6_mcast_lst[hash] = mc; + + mc->if_next = i6dev->mc_list; + i6dev->mc_list = mc; + + /* + * multicast mapping is defined in IPv6-over-foo documents + */ + + switch (dev->type) { + case ARPHRD_ETHER: + ipv6_mc_map(addr, buf); + dev_mc_add(dev, buf, ETH_ALEN, 0); + break; + + default: + printk(KERN_DEBUG "dev_mc_inc: unkown device type\n"); + } + + + /* + * FIXME: ICMP report handling + */ + + return 0; +} + +/* + * device multicast group del + */ +int ipv6_dev_mc_dec(struct device *dev, struct in6_addr *addr) +{ + return 0; +} + +/* + * check if the interface/address pair is valid + */ +int ipv6_chk_mcast_addr(struct device *dev, struct in6_addr *addr) +{ + struct ipv6_mc_list *mc; + u8 hash; + + hash = ipv6_addr_hash(addr); + + for (mc = inet6_mcast_lst[hash]; mc; mc=mc->next) + if ((mc->dev == dev) && + ipv6_addr_cmp(&mc->addr, addr) == 0) + { + return 1; + } + + return 0; +} + +/* + * IGMP handling (alias multicast ICMPv6 messages) + */ + +/* + * Local variables: + * compile-command: "gcc -D__KERNEL__ -I/usr/src/linux/include -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer -fno-strength-reduce -pipe -m486 -DCPU=486 -DMODULE -DMODVERSIONS -include /usr/src/linux/include/linux/modversions.h -c -o 
mcast.o mcast.c" + * End: + */ diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c new file mode 100644 index 000000000..31d50a5b5 --- /dev/null +++ b/net/ipv6/ndisc.c @@ -0,0 +1,1927 @@ +/* + * Neighbour Discovery for IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * Mike Shaver <shaver@ingenia.com> + * + * $Id: ndisc.c,v 1.28 1996/10/11 16:03:06 roque Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * + * Lars Fenneberg : fixed MTU setting on receipt + * of an RA. + * + * Janos Farkas : kmalloc failure checks + */ + +/* + * Interface: + * + * ndisc_lookup will be called from eth.c on dev->(re)build_header + * + * ndisc_rcv + * ndisc_validate is called by higher layers when they know a neighbour + * is reachable. 
+ * + * Manages neighbour cache + * + */ + +#define __NO_VERSION__ +#include <linux/module.h> +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/ipv6.h> +#include <linux/icmpv6.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/ndisc.h> +#include <net/ipv6_route.h> +#include <net/addrconf.h> + + +#include <net/checksum.h> +#include <linux/proc_fs.h> + +#define NCACHE_NUM_BUCKETS 32 + +static struct socket ndisc_socket; + +unsigned long nd_rand_seed = 152L; + +struct ndisc_statistics nd_stats; + +static struct neighbour *neighbours[NCACHE_NUM_BUCKETS]; +static struct timer_list ndisc_timer; +static struct timer_list ndisc_gc_timer; + +static atomic_t ndisc_lock = 0; + +/* + * Protocol variables + */ + +int nd_max_multicast_solicit = 3; +int nd_max_unicast_solicit = 3; +int nd_retrans_timer = RETRANS_TIMER; +int nd_reachable_time = RECHABLE_TIME; +int nd_base_reachable_time = RECHABLE_TIME; +int nd_delay_first_probe = 5 * HZ; +int nd_gc_interval = 5 * HZ; + +/* + * garbage collection timeout must be greater than reachable time + * since tstamp is updated by reachable confirmations only. + * gc_staletime actually means the time after last confirmation + * *NOT* after the last time the entry was used. 
+ */ + +int nd_gc_staletime = 3 * RECHABLE_TIME; + +static struct neighbour ndisc_insert_queue = { + {{{0,}}}, 0, 0, NULL, 0, + {0,}, NULL, {0,}, 0, 0, 0, 0, 0, + &ndisc_insert_queue, + &ndisc_insert_queue +}; + +static int ndisc_ins_queue_len = 0; + +int ndisc_event_timer(struct neighbour *neigh); + +static void ndisc_bh_insert(void); + +int ipv6_random(void) +{ + nd_rand_seed=nd_rand_seed*69069L+1; + return nd_rand_seed^jiffies; +} + +static __inline__ unsigned long rand_reach_time(void) +{ + unsigned long val; + + val = ipv6_random() % (MAX_RANDOM_FACTOR * nd_base_reachable_time); + if (val < (MIN_RANDOM_FACTOR * nd_base_reachable_time)) + { + val += (MIN_RANDOM_FACTOR * nd_base_reachable_time); + } + + return val; +} + +void ndisc_verify_reachability(struct neighbour * neigh); + +/* + * (inline) support functions + */ + +static __inline__ __u32 ndisc_hash(struct in6_addr *addr) +{ + + __u32 hash_val; + + hash_val = addr->s6_addr32[2] ^ addr->s6_addr32[3]; + + hash_val ^= hash_val >> 16; + + return (hash_val & (NCACHE_NUM_BUCKETS - 1)); +} + + +static __inline__ void ndisc_neigh_queue(struct neighbour *neigh) +{ + struct neighbour *next = &ndisc_insert_queue; + + ndisc_ins_queue_len++; + + neigh->prev = next->prev; + neigh->prev->next = neigh; + next->prev = neigh; + neigh->next = next; +} + +static __inline__ struct neighbour * ndisc_dequeue(void) +{ + struct neighbour *next = &ndisc_insert_queue; + struct neighbour *head; + + ndisc_ins_queue_len--; + + head = next->next; + + if (head == next) + { + return NULL; + } + + head->next->prev = head->prev; + next->next = head->next; + + head->next = NULL; + head->prev = NULL; + + return head; +} + +static __inline__ void ndisc_release_lock(void) +{ + unsigned long flags; + + save_flags(flags); + cli(); + + ndisc_lock--; + + if (ndisc_lock == 0 && ndisc_ins_queue_len) + { + ndisc_bh_insert(); + } + + restore_flags(flags); +} + +static void ndisc_insert_neigh(struct neighbour *neigh) +{ + + struct neighbour * bucket; + 
__u32 hash_val = ndisc_hash(&neigh->addr); + + bucket = neighbours[hash_val]; + + if (!bucket) + { + neighbours[hash_val] = neigh; + return; + } + + for (; bucket->next; bucket = bucket->next) + ; + + bucket->next = neigh; + neigh->prev = bucket; +} + +static __inline__ struct neighbour * +ndisc_retrieve_neigh(struct device *dev, struct in6_addr *addr) +{ + + struct neighbour * iter; + iter = neighbours[ndisc_hash(addr)]; + + for (; iter; iter = iter->next) + { + if (dev == iter->dev && ipv6_addr_cmp(addr, &iter->addr) == 0) + return iter; + } + return NULL; +} + +static void ndisc_unlink_neigh(struct neighbour * neigh) +{ + if (neigh->prev) + neigh->prev->next = neigh->next; + else + { + int hash = ndisc_hash(&neigh->addr); + neighbours[hash] = neigh->next; + } + + if (neigh->next) + neigh->next->prev = neigh->prev; +} + +static void ndisc_release_neigh(struct neighbour * neigh) +{ + struct sk_buff *skb; + + while((skb=skb_dequeue(&neigh->arp_queue))) + { + dev_kfree_skb(skb, FREE_WRITE); + } + + if (neigh->refcnt == 0) + { + ndisc_unlink_neigh(neigh); + kfree(neigh); + } +} + +static void ndisc_bh_insert(void) +{ + struct neighbour *neigh; + + while((neigh = ndisc_dequeue())) + { + ndisc_insert_neigh(neigh); + } +} + + +static void ndisc_garbage_collect(unsigned long arg) +{ + struct neighbour * neigh; + static unsigned long last_rand = 0; + unsigned long now = jiffies; + unsigned long flags; + int i = 0; + + + /* + * periodicly compute ReachableTime from random function + */ + if (now - last_rand > REACH_RANDOM_INTERVAL) + { + last_rand = now; + nd_reachable_time = rand_reach_time(); + } + + save_flags(flags); + cli(); + + if (ndisc_lock) + { + restore_flags(flags); + ndisc_gc_timer.expires = now + HZ; + add_timer(&ndisc_gc_timer); + return; + } + + for (; i < NCACHE_NUM_BUCKETS; i++) + for (neigh = neighbours[i]; neigh;) + { + /* + * Release unused entries + */ + if (neigh->refcnt == 0 && + ((neigh->nud_state == NUD_FAILED) || + ((neigh->nud_state == 
NUD_REACHABLE) && + (neigh->tstamp <= (now - nd_gc_staletime)) + ) + ) + ) + { + struct neighbour *prev; + + prev = neigh; + neigh = neigh->next; + ndisc_release_neigh(prev); + continue; + } + neigh = neigh->next; + } + + restore_flags(flags); + + ndisc_gc_timer.expires = now + nd_gc_interval; + add_timer(&ndisc_gc_timer); +} + +static __inline__ void ndisc_add_timer(struct neighbour *neigh, int timer) +{ + unsigned long now = jiffies; + unsigned long tval; + + neigh->expires = now + timer; + tval = del_timer(&ndisc_timer); + + if (tval) + { + tval = min(tval, neigh->expires); + } + else + tval = neigh->expires; + + ndisc_timer.expires = tval; + add_timer(&ndisc_timer); +} + +static void ndisc_del_timer(struct neighbour *neigh) +{ + unsigned long tval; + + if (!(neigh->nud_state & NUD_IN_TIMER)) + return; + + tval = del_timer(&ndisc_timer); + + if (tval == neigh->expires) + { + int i; + + tval = ~0UL; + + /* need to search the entire neighbour cache */ + for (i=0; i < NCACHE_NUM_BUCKETS; i++) + { + for (neigh = neighbours[i]; neigh; neigh=neigh->next) + if (neigh->nud_state & NUD_IN_TIMER) + { + tval = min(tval, neigh->expires); + } + } + + } + + if (tval == ~(0UL)) + return; + + ndisc_timer.expires = tval; + add_timer(&ndisc_timer); +} + +static struct neighbour * ndisc_new_neigh(struct device *dev, + struct in6_addr *addr) +{ + struct neighbour *neigh; + unsigned long flags; + + neigh = (struct neighbour *) kmalloc(sizeof(struct neighbour), + GFP_ATOMIC); + + if (neigh == NULL) + { + printk(KERN_DEBUG "ndisc: kmalloc failure\n"); + return NULL; + } + + nd_stats.allocs++; + + memset(neigh, 0, sizeof (struct neighbour)); + skb_queue_head_init(&neigh->arp_queue); + + ipv6_addr_copy(&neigh->addr, addr); + neigh->len = 128; + neigh->type = ipv6_addr_type(addr); + neigh->dev = dev; + neigh->tstamp = jiffies; + + if (dev->type == ARPHRD_LOOPBACK || dev->type == ARPHRD_SIT) + { + neigh->flags |= NCF_NOARP; + } + + save_flags(flags); + cli(); + + if (ndisc_lock == 0) + { 
+ /* Add to the cache. */ + ndisc_insert_neigh(neigh); + } + else + { + ndisc_neigh_queue(neigh); + } + + restore_flags(flags); + + return neigh; +} + +/* + * Called when creating a new dest_cache entry for a given destination + * is likely that an entry for the refered gateway exists in cache + * + */ + +struct neighbour * ndisc_get_neigh(struct device *dev, struct in6_addr *addr) +{ + struct neighbour *neigh; + + /* + * neighbour cache: + * cached information about nexthop and addr resolution + */ + + if (dev == NULL) + { + printk(KERN_DEBUG "ncache_get_neigh: NULL device\n"); + return NULL; + } + + atomic_inc(&ndisc_lock); + + neigh = ndisc_retrieve_neigh(dev, addr); + + ndisc_release_lock(); + + if (neigh == NULL) + { + neigh = ndisc_new_neigh(dev, addr); + } + + atomic_inc(&neigh->refcnt); + + return neigh; +} + +/* + * return values + * 0 - Address Resolution succeded, send packet + * 1 - Address Resolution unfinished / packet queued + */ + +int ndisc_eth_resolv(unsigned char *h_dest, struct device *dev, + struct sk_buff *skb) +{ + struct neighbour *neigh; + + neigh = skb->nexthop; + + if (neigh == NULL) + { + int addr_type; + + addr_type = ipv6_addr_type(&skb->ipv6_hdr->daddr); + + if (addr_type & IPV6_ADDR_MULTICAST) + { + ipv6_mc_map(&skb->ipv6_hdr->daddr, h_dest); + return 0; + } + + printk(KERN_DEBUG "ndisc_eth_resolv: nexthop is NULL\n"); + goto discard; + } + + if (skb->pkt_type == PACKET_NDISC) + goto ndisc_pkt; + + switch (neigh->nud_state) { + case NUD_FAILED: + case NUD_NONE: + ndisc_event_send(neigh, skb); + + case NUD_INCOMPLETE: + if (skb_queue_len(&neigh->arp_queue) >= NDISC_QUEUE_LEN) + { + struct sk_buff *buff; + + buff = neigh->arp_queue.prev; + skb_unlink(buff); + dev_kfree_skb(buff, FREE_WRITE); + } + skb_queue_head(&neigh->arp_queue, skb); + return 1; + default: + ndisc_event_send(neigh, skb); + } + + ndisc_pkt: + + if (neigh->h_dest == NULL) + { + printk(KERN_DEBUG "neigh->h_dest is NULL\n"); + goto discard; + } + + memcpy(h_dest, 
neigh->h_dest, dev->addr_len); + + if ((neigh->flags & NCF_HHVALID) == 0) + { + /* + * copy header to hh_data and move h_dest pointer + * this is strictly media dependent. + */ + } + return 0; + + discard: + + dev_kfree_skb(skb, FREE_WRITE); + return 1; +} + + +/* Send the actual Neighbour Advertisement */ + +void ndisc_send_na(struct device *dev, struct neighbour *neigh, + struct in6_addr *daddr, + struct in6_addr *solicited_addr, + int router, int solicited, int override, int inc_opt) +{ + struct sock *sk = (struct sock *)ndisc_socket.data; + struct nd_msg *msg; + int len, opt_len; + struct sk_buff *skb; + int err; + + opt_len = ((dev->addr_len + 1) >> 3) + 1; + len = sizeof(struct icmpv6hdr) + sizeof(struct in6_addr); + + if (inc_opt) + { + len += opt_len << 3; + } + + skb = sock_alloc_send_skb(sk, MAX_HEADER + len, 0, 0, &err); + + if (skb == NULL) + { + printk(KERN_DEBUG "send_na: alloc skb failed\n"); + return; + } + + skb->free=1; + + if (ipv6_bld_hdr_2(sk, skb, dev, neigh, solicited_addr, daddr, + IPPROTO_ICMPV6, len) < 0) + { + kfree_skb(skb, FREE_WRITE); + printk(KERN_DEBUG + "ndisc_send_na: ipv6_build_header returned < 0\n"); + return; + } + + skb->pkt_type = PACKET_NDISC; + + msg = (struct nd_msg *) skb_put(skb, len); + + msg->icmph.type = NDISC_NEIGHBOUR_ADVERTISEMENT; + msg->icmph.code = 0; + msg->icmph.checksum = 0; + + msg->icmph.icmp6_unused = 0; + msg->icmph.icmp6_router = router; + msg->icmph.icmp6_solicited = solicited; + msg->icmph.icmp6_override = override; + + /* Set the target address. */ + ipv6_addr_copy(&msg->target, solicited_addr); + + if (inc_opt) + { + /* Set the source link-layer address option. 
*/ + msg->opt.opt_type = ND_OPT_TARGET_LL_ADDR; + msg->opt.opt_len = opt_len; + memcpy(msg->opt.link_addr, dev->dev_addr, dev->addr_len); + + if ((opt_len << 3) - (2 + dev->addr_len)) + { + memset(msg->opt.link_addr + dev->addr_len, 0, + (opt_len << 3) - (2 + dev->addr_len)); + } + } + + /* checksum */ + msg->icmph.checksum = csum_ipv6_magic(solicited_addr, daddr, len, + IPPROTO_ICMPV6, + csum_partial((__u8 *) msg, + len, 0)); + + ipv6_queue_xmit(sk, skb->dev, skb, 1); +} + +void ndisc_send_ns(struct device *dev, struct neighbour *neigh, + struct in6_addr *solicit, + struct in6_addr *daddr, struct in6_addr *saddr) +{ + struct sock *sk = (struct sock *) ndisc_socket.data; + struct sk_buff *skb; + struct nd_msg *msg; + int len, opt_len; + int err; + + /* length of addr in 8 octet groups.*/ + opt_len = ((dev->addr_len + 1) >> 3) + 1; + len = sizeof(struct icmpv6hdr) + sizeof(struct in6_addr) + + (opt_len << 3); + + skb = sock_alloc_send_skb(sk, MAX_HEADER + len, 0, 0, &err); + if (skb == NULL) + { + printk(KERN_DEBUG "send_ns: alloc skb failed\n"); + return; + } + + skb->free=1; + skb->pkt_type = PACKET_NDISC; + + if (saddr == NULL) + { + struct inet6_ifaddr *ifa; + + /* use link local address */ + ifa = ipv6_get_lladdr(dev); + + if (ifa) + { + saddr = &ifa->addr; + } + } + + if(ipv6_addr_type(daddr) == IPV6_ADDR_MULTICAST) + { + nd_stats.snt_probes_mcast++; + } + else + { + nd_stats.snt_probes_ucast++; + } + + if (ipv6_bld_hdr_2(sk, skb, dev, neigh, saddr, daddr, IPPROTO_ICMPV6, + len) < 0 ) + { + kfree_skb(skb, FREE_WRITE); + printk(KERN_DEBUG + "ndisc_send_ns: ipv6_build_header returned < 0\n"); + return; + } + + msg = (struct nd_msg *)skb_put(skb, len); + msg->icmph.type = NDISC_NEIGHBOUR_SOLICITATION; + msg->icmph.code = 0; + msg->icmph.checksum = 0; + msg->icmph.icmp6_unused = 0; + + /* Set the target address. */ + ipv6_addr_copy(&msg->target, solicit); + + /* Set the source link-layer address option. 
*/ + msg->opt.opt_type = ND_OPT_SOURCE_LL_ADDR; + msg->opt.opt_len = opt_len; + + memcpy(msg->opt.link_addr, dev->dev_addr, dev->addr_len); + + if ((opt_len << 3) - (2 + dev->addr_len)) + { + memset(msg->opt.link_addr + dev->addr_len, 0, + (opt_len << 3) - (2 + dev->addr_len)); + } + + /* checksum */ + msg->icmph.checksum = csum_ipv6_magic(&skb->ipv6_hdr->saddr, + daddr, len, + IPPROTO_ICMPV6, + csum_partial((__u8 *) msg, + len, 0)); + /* send it! */ + ipv6_queue_xmit(sk, skb->dev, skb, 1); +} + +void ndisc_send_rs(struct device *dev, struct in6_addr *saddr, + struct in6_addr *daddr) +{ + struct sock *sk = (struct sock *) ndisc_socket.data; + struct sk_buff *skb; + struct icmpv6hdr *hdr; + __u8 * opt; + int len, opt_len; + int err; + + /* length of addr in 8 octet groups.*/ + opt_len = ((dev->addr_len + 1) >> 3) + 1; + len = sizeof(struct icmpv6hdr) + (opt_len << 3); + + skb = sock_alloc_send_skb(sk, MAX_HEADER + len, 0, 0, &err); + if (skb == NULL) + { + printk(KERN_DEBUG "send_ns: alloc skb failed\n"); + return; + } + + skb->free=1; + + if (ipv6_bld_hdr_2(sk, skb, dev, NULL, saddr, daddr, IPPROTO_ICMPV6, + len) < 0 ) + { + kfree_skb(skb, FREE_WRITE); + printk(KERN_DEBUG + "ndisc_send_ns: ipv6_build_header returned < 0\n"); + return; + } + + hdr = (struct icmpv6hdr *) skb_put(skb, len); + hdr->type = NDISC_ROUTER_SOLICITATION; + hdr->code = 0; + hdr->checksum = 0; + hdr->icmp6_unused = 0; + + opt = (u8*) (hdr + 1); + + /* Set the source link-layer address option. */ + opt[0] = ND_OPT_SOURCE_LL_ADDR; + opt[1] = opt_len; + + memcpy(opt + 2, dev->dev_addr, dev->addr_len); + + if ((opt_len << 3) - (2 + dev->addr_len)) + { + memset(opt + 2 + dev->addr_len, 0, + (opt_len << 3) - (2 + dev->addr_len)); + } + + /* checksum */ + hdr->checksum = csum_ipv6_magic(&skb->ipv6_hdr->saddr, daddr, len, + IPPROTO_ICMPV6, + csum_partial((__u8 *) hdr, len, 0)); + + /* send it! 
*/ + ipv6_queue_xmit(sk, skb->dev, skb, 1); +} + + +static int ndisc_store_hwaddr(struct device *dev, __u8 *opt, int opt_len, + __u8 *h_addr, int option) +{ + while (*opt != option && opt_len) + { + int len; + + len = opt[1] << 3; + + if (len == 0) + { + printk(KERN_WARNING "nd: option has 0 len\n"); + return -EINVAL; + } + + opt += len; + opt_len -= len; + } + + if (*opt == option) + { + memcpy(h_addr, opt + 2, dev->addr_len); + return 0; + } + + return -EINVAL; +} + +/* Called when a timer expires for a neighbour entry. */ + +static void ndisc_timer_handler(unsigned long arg) +{ + unsigned long now = jiffies; + struct neighbour * neigh; + unsigned long ntimer = ~0UL; + int i; + + atomic_inc(&ndisc_lock); + + for (i=0; i < NCACHE_NUM_BUCKETS; i++) + { + for (neigh = neighbours[i]; neigh;) + { + if (neigh->nud_state & NUD_IN_TIMER) + { + int time; + + if (neigh->expires <= now) + { + time = ndisc_event_timer(neigh); + } + else + time = neigh->expires - now; + + if (time == 0) + { + unsigned long flags; + + save_flags(flags); + cli(); + + if (ndisc_lock == 1) + { + struct neighbour *old = neigh; + + neigh = neigh->next; + ndisc_release_neigh(old); + restore_flags(flags); + continue; + } + + restore_flags(flags); + } + + ntimer = min(ntimer, time); + } + neigh = neigh->next; + } + } + + if (ntimer != (~0UL)) + { + ndisc_timer.expires = jiffies + ntimer; + add_timer(&ndisc_timer); + } + ndisc_release_lock(); +} + + +int ndisc_event_timer(struct neighbour *neigh) +{ + struct in6_addr *daddr; + struct in6_addr *target; + struct in6_addr mcaddr; + struct device *dev; + int max_probes; + + if (neigh->nud_state == NUD_DELAY) + { + neigh->nud_state = NUD_PROBE; + } + + max_probes = (neigh->nud_state == NUD_PROBE ? 
nd_max_unicast_solicit: + nd_max_multicast_solicit); + + if (neigh->probes == max_probes) + { + struct sk_buff *skb; + + neigh->nud_state = NUD_FAILED; + neigh->flags |= NCF_INVALID; + nd_stats.res_failed++; + + while((skb=skb_dequeue(&neigh->arp_queue))) + { + /* + * "The sender MUST return an ICMP + * destination unreachable" + */ + icmpv6_send(skb, ICMPV6_DEST_UNREACH, + ICMPV6_ADDR_UNREACH, 0, neigh->dev); + + dev_kfree_skb(skb, FREE_WRITE); + } + return 0; + } + + neigh->probes++; + + dev = neigh->dev; + target = &neigh->addr; + + if (neigh->nud_state == NUD_INCOMPLETE) + { + addrconf_addr_solict_mult(&neigh->addr, &mcaddr); + daddr = &mcaddr; + neigh = NULL; + } + else + { + daddr = &neigh->addr; + } + + ndisc_send_ns(dev, neigh, target, daddr, NULL); + + return nd_retrans_timer; +} + +void ndisc_event_send(struct neighbour *neigh, struct sk_buff *skb) +{ + unsigned long now = jiffies; + struct in6_addr daddr; + struct in6_addr *saddr = NULL; + + switch (neigh->nud_state) { + case NUD_FAILED: + neigh->probes = 0; + case NUD_NONE: + + if (skb && !skb->stamp.tv_sec) + { + /* + * skb->stamp allows us to know if we are + * originating the skb or forwarding it. 
+ * (it is set on netif_rx) + */ + saddr = &skb->ipv6_hdr->saddr; + } + + neigh->nud_state = NUD_INCOMPLETE; + addrconf_addr_solict_mult(&neigh->addr, &daddr); + ndisc_send_ns(neigh->dev, NULL, &neigh->addr, &daddr, saddr); + ndisc_add_timer(neigh, nd_retrans_timer); + + break; + + case NUD_REACHABLE: + if (now - neigh->tstamp < nd_reachable_time) + break; + + case NUD_STALE: + neigh->nud_state = NUD_DELAY; + ndisc_add_timer(neigh, nd_delay_first_probe); + } +} + +/* + * Received a neighbour announce + */ +void ndisc_event_na(struct neighbour *neigh, unsigned char * opt, int opt_len, + int solicited, int override) +{ + struct sk_buff *skb; + + if (neigh->nud_state == NUD_NONE) + { + neigh->nud_state = NUD_INCOMPLETE; + } + + if (neigh->nud_state == NUD_INCOMPLETE || override) + { + + if (opt_len == 0) + { + printk(KERN_DEBUG "no opt on NA\n"); + } + else + { + /* record hardware address */ + + neigh->h_dest = neigh->hh_data; + neigh->flags &= ~NCF_HHVALID; + + if (ndisc_store_hwaddr(neigh->dev, opt, opt_len, + neigh->h_dest, + ND_OPT_TARGET_LL_ADDR)) + { + printk(KERN_DEBUG + "event_na: invalid TARGET_LL_ADDR\n"); + neigh->h_dest = NULL; + neigh->nud_state = NUD_NONE; + return; + } + } + } + + + if (solicited || override || neigh->nud_state == NUD_INCOMPLETE) + { + + neigh->probes = 0; + neigh->tstamp = jiffies; + + if (neigh->nud_state & NUD_IN_TIMER) + { + ndisc_del_timer(neigh); + } + + if (solicited) + { + neigh->nud_state = NUD_REACHABLE; + } + else + { + neigh->nud_state = NUD_STALE; + } + } + + while ((skb=skb_dequeue(&neigh->arp_queue))) + { + int priority = SOPRI_NORMAL; + + if (skb->sk) + priority = skb->sk->priority; + + dev_queue_xmit(skb, neigh->dev, priority); + } +} + +static void ndisc_event_ns(struct in6_addr *saddr, struct sk_buff *skb) +{ + struct neighbour *neigh; + u8 *opt; + int len; + + opt = skb->h.raw; + opt += sizeof(struct icmpv6hdr) + sizeof(struct in6_addr); + + len = skb->tail - opt; + + neigh = ndisc_retrieve_neigh(skb->dev, saddr); + 
+ if (neigh == NULL) + { + neigh = ndisc_new_neigh(skb->dev, saddr); + } + + switch(neigh->nud_state) { + case NUD_REACHABLE: + case NUD_STALE: + case NUD_DELAY: + if (*opt != ND_OPT_SOURCE_LL_ADDR || + len != neigh->dev->addr_len || + memcmp(neigh->h_dest, opt + 2, len)) + { + break; + } + + if (neigh->nud_state & NUD_IN_TIMER) + { + ndisc_del_timer(neigh); + } + default: + neigh->flags &= ~NCF_HHVALID; + neigh->h_dest = neigh->hh_data; + + if (ndisc_store_hwaddr(neigh->dev, opt, len, + neigh->h_dest, + ND_OPT_SOURCE_LL_ADDR)) + { + printk(KERN_DEBUG + "event_ns: invalid SOURCE_LL_ADDR\n"); + neigh->h_dest = NULL; + neigh->nud_state = NUD_NONE; + return; + } + + neigh->nud_state = NUD_STALE; + neigh->tstamp = jiffies; + neigh->probes = 0; + } + +} + +static struct rt6_info *ndisc_get_dflt_router(struct device *dev, + struct in6_addr *addr) +{ + struct rt6_info *iter; + + for (iter = default_rt_list; iter; iter=iter->next) + { + if (dev == iter->rt_dev && + ipv6_addr_cmp(&iter->rt_dst, addr) == 0) + { + return iter; + } + } + return NULL; +} + +static void ndisc_add_dflt_router(struct rt6_info *rt) +{ + struct rt6_info *iter; + + rt->rt_ref++; + rt->fib_node = &routing_table; + rt6_stats.fib_rt_alloc++; + + if (default_rt_list == NULL) + { + default_rt_list = rt; + return; + } + + for (iter = default_rt_list; iter->next; iter=iter->next) + ; + + iter->next = rt; +} + +static void ndisc_del_dflt_router(struct rt6_info *rt) +{ + struct rt6_info *iter, *back; + + if (rt == default_rt_list) + { + default_rt_list = rt->next; + } + else + { + back = NULL; + for (iter = default_rt_list; iter; iter=iter->next) + { + if (iter == rt) + { + back->next = rt->next; + break; + } + back = iter; + } + } + + rt->fib_node = NULL; + rt_release(rt); +} + +static void ndisc_purge_dflt_routers(void) +{ + struct rt6_info *iter, *rt; + + for (iter = default_rt_list; iter; ) + { + rt = iter; + iter=iter->next; + rt_release(rt); + } + default_rt_list = NULL; +} + +static void 
ndisc_ll_addr_update(struct neighbour *neigh, u8* opt, int len, + int type) +{ + switch(neigh->nud_state) { + case NUD_REACHABLE: + case NUD_STALE: + case NUD_DELAY: + if (len == neigh->dev->addr_len && + memcmp(neigh->h_dest, opt + 2, len) == 0) + { + break; + } + + if (neigh->nud_state & NUD_IN_TIMER) + { + ndisc_del_timer(neigh); + } + default: + neigh->flags &= ~NCF_HHVALID; + neigh->h_dest = neigh->hh_data; + + if (ndisc_store_hwaddr(neigh->dev, opt, len, neigh->h_dest, + type)) + { + printk(KERN_DEBUG "NDISC: invalid LL_ADDR\n"); + neigh->h_dest = NULL; + neigh->nud_state = NUD_NONE; + break; + } + + neigh->nud_state = NUD_STALE; + neigh->tstamp = jiffies; + neigh->probes = 0; + } + +} + +struct rt6_info * dflt_rt_lookup(void) +{ + struct rt6_info *match = NULL; + struct rt6_info *rt; + int score = -1; + unsigned long now = jiffies; + + for (rt = default_rt_list; rt; rt=rt->next) + { + struct neighbour *neigh = rt->rt_nexthop; + + if (score < 0) + { + score = 0; + match = rt; + } + + if (neigh->nud_state == NUD_REACHABLE) + { + if (score < 1) + { + score = 1; + match = rt; + } + + if (now - neigh->tstamp < nd_reachable_time) + { + return rt; + } + } + + } + + return match; +} + +static void ndisc_router_discovery(struct sk_buff *skb) +{ + struct ra_msg *ra_msg = (struct ra_msg *) skb->h.raw; + struct neighbour *neigh; + struct inet6_dev *in6_dev; + struct rt6_info *rt; + int lifetime; + int optlen; + + __u8 * opt = (__u8 *)(ra_msg + 1); + + optlen = (skb->tail - skb->h.raw) - sizeof(struct ra_msg); + + if (skb->ipv6_hdr->hop_limit != 255) + { + printk(KERN_WARNING + "NDISC: fake router advertisment received\n"); + return; + } + + /* + * set the RA_RECV flag in the interface + */ + + in6_dev = ipv6_get_idev(skb->dev); + if (in6_dev == NULL) + { + printk(KERN_DEBUG "RA: can't find in6 device\n"); + return; + } + + if (in6_dev->if_flags & IF_RS_SENT) + { + /* + * flag that an RA was received after an RS was sent + * out on this interface. 
+ */ + in6_dev->if_flags |= IF_RA_RCVD; + } + + lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); + + rt = ndisc_get_dflt_router(skb->dev, &skb->ipv6_hdr->saddr); + + if (rt && lifetime == 0) + { + ndisc_del_dflt_router(rt); + rt = NULL; + } + + if (rt == NULL && lifetime) + { + printk(KERN_DEBUG "ndisc_rdisc: new default router\n"); + + rt = (struct rt6_info *) kmalloc(sizeof(struct rt6_info), + GFP_ATOMIC); + if (rt) + { + neigh = ndisc_retrieve_neigh(skb->dev, + &skb->ipv6_hdr->saddr); + + if (neigh == NULL) + { + neigh = ndisc_new_neigh(skb->dev, + &skb->ipv6_hdr->saddr); + } + + if (neigh) + { + atomic_inc(&neigh->refcnt); + neigh->flags |= NCF_ROUTER; + + memset(rt, 0, sizeof(struct rt6_info)); + + ipv6_addr_copy(&rt->rt_dst, + &skb->ipv6_hdr->saddr); + rt->rt_metric = 1; + rt->rt_flags = RTF_GATEWAY | RTF_DYNAMIC; + rt->rt_dev = skb->dev; + rt->rt_nexthop = neigh; + + ndisc_add_dflt_router(rt); + } + else + { + kfree(rt); + } + } + } + + if (rt) + { + rt->rt_expires = jiffies + (HZ * lifetime); + } + + if (ra_msg->icmph.icmp6_hop_limit) + { + ipv6_hop_limit = ra_msg->icmph.icmp6_hop_limit; + } + + /* + * Update Reachable Time and Retrans Timer + */ + + if (ra_msg->retrans_timer) + { + nd_retrans_timer = ntohl(ra_msg->retrans_timer); + } + + if (ra_msg->reachable_time) + { + __u32 rtime = ntohl(ra_msg->reachable_time); + + if (rtime != nd_base_reachable_time) + { + nd_base_reachable_time = rtime; + nd_gc_staletime = 3 * nd_base_reachable_time; + nd_reachable_time = rand_reach_time(); + } + + } + + /* + * Process options. 
+ */ + + while(optlen > 0) { + int len; + + len = (opt[1] << 3); + + if (len == 0) + { + printk(KERN_DEBUG "RA: opt has 0 len\n"); + break; + } + + switch(*opt) { + case ND_OPT_SOURCE_LL_ADDR: + + if (rt == NULL) + break; + + neigh = rt->rt_nexthop; + + ndisc_ll_addr_update(neigh, opt, len, + ND_OPT_SOURCE_LL_ADDR); + break; + + case ND_OPT_PREFIX_INFO: + addrconf_prefix_rcv(skb->dev, opt, len); + break; + + case ND_OPT_MTU: + + if (rt) + { + int mtu; + struct device *dev; + + mtu = htonl(*(__u32 *)(opt+4)); + dev = rt->rt_nexthop->dev; + + if (mtu < 576) + { + printk(KERN_DEBUG "NDISC: router " + "announcement with mtu = %d\n", + mtu); + break; + } + + if (dev->change_mtu) + { + dev->change_mtu(dev, mtu); + } + else + { + dev->mtu = mtu; + } + } + break; + + case ND_OPT_TARGET_LL_ADDR: + case ND_OPT_REDIRECT_HDR: + printk(KERN_DEBUG "got illegal option with RA"); + break; + default: + printk(KERN_DEBUG "unkown option in RA\n"); + } + optlen -= len; + opt += len; + } + +} + +void ndisc_forwarding_on(void) +{ + /* + * forwarding was turned on + */ + + ndisc_purge_dflt_routers(); +} + +void ndisc_forwarding_off(void) +{ + /* + * forwarding was turned off + */ +} + +static void ndisc_redirect_rcv(struct sk_buff *skb) +{ + struct icmpv6hdr *icmph; + struct in6_addr *dest; + struct in6_addr *target; /* new first hop to destination */ + struct neighbour *neigh; + struct rt6_info *rt; + int on_link = 0; + int optlen; + u8 * opt; + + if (skb->ipv6_hdr->hop_limit != 255) + { + printk(KERN_WARNING + "NDISC: fake ICMP redirect received\n"); + return; + } + + if (!(ipv6_addr_type(&skb->ipv6_hdr->saddr) & IPV6_ADDR_LINKLOCAL)) + { + printk(KERN_WARNING + "ICMP redirect: source address is not linklocal\n"); + return; + } + + optlen = skb->tail - skb->h.raw; + optlen -= sizeof(struct icmpv6hdr) + 2 * sizeof(struct in6_addr); + + if (optlen < 0) + { + printk(KERN_WARNING "ICMP redirect: packet too small\n"); + return; + } + + icmph = (struct icmpv6hdr *) skb->h.raw; + target = 
(struct in6_addr *) (icmph + 1); + dest = target + 1; + + if (ipv6_addr_type(dest) & IPV6_ADDR_MULTICAST) + { + printk(KERN_WARNING "ICMP redirect for multicast addr\n"); + return; + } + + if (ipv6_addr_cmp(dest, target) == 0) + { + on_link = 1; + } + else if (!(ipv6_addr_type(target) & IPV6_ADDR_LINKLOCAL)) + { + printk(KERN_WARNING + "ICMP redirect: target address is not linklocal\n"); + return; + } + + /* passed validation tests */ + + rt = ipv6_rt_redirect(skb->dev, dest, target, on_link); + + if (rt == NULL) + { + printk(KERN_WARNING "ICMP redirect: no route to host\n"); + return; + } + + neigh = rt->rt_nexthop; + + opt = (u8 *) (dest + 1); + + while (optlen > 0) + { + int len; + + len = (opt[1] << 3); + + if (*opt == ND_OPT_TARGET_LL_ADDR) + { + ndisc_ll_addr_update(neigh, opt, len, + ND_OPT_TARGET_LL_ADDR); + } + + opt += len; + optlen -= len; + } +} + +void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, + struct in6_addr *target) +{ + struct sock *sk = (struct sock *) ndisc_socket.data; + int len = sizeof(struct icmpv6hdr) + 2 * sizeof(struct in6_addr); + struct sk_buff *buff; + struct inet6_ifaddr *ifp; + struct icmpv6hdr *icmph; + struct in6_addr *addrp; + struct rt6_info *rt; + int ta_len = 0; + u8 *opt; + int rd_len; + int err; + int hlen; + + rt = fibv6_lookup(&skb->ipv6_hdr->saddr, skb->dev, 0); + + if (rt->rt_flags & RTF_GATEWAY) + { + printk(KERN_DEBUG "ndisc_send_redirect: not a neighbour\n"); + return; + } + + if (neigh->nud_state == NUD_REACHABLE) + { + ta_len = ((neigh->dev->addr_len + 1) >> 3) + 1; + len += (ta_len << 3); + } + + rd_len = min(536 - len, ntohs(skb->ipv6_hdr->payload_len) + 8); + rd_len &= ~0x7; + len += rd_len; + + ifp = ipv6_get_lladdr(skb->dev); + + if (ifp == NULL) + { + printk(KERN_DEBUG "redirect: no link_local addr for dev\n"); + return; + } + + buff = sock_alloc_send_skb(sk, MAX_HEADER + len, 0, 0, &err); + + if (buff == NULL) + { + printk(KERN_DEBUG "ndisc_send_redirect: alloc_skb failed\n"); + return; 
+ } + + + hlen = 0; + if (skb->dev->hard_header_len) + { + hlen = (skb->dev->hard_header_len + 15) & ~15; + } + + skb_reserve(buff, hlen + sizeof(struct ipv6hdr)); + + icmph = (struct icmpv6hdr *) skb_put(buff, len); + + memset(icmph, 0, sizeof(struct icmpv6hdr)); + icmph->type = NDISC_REDIRECT; + + /* + * copy target and destination addresses + */ + + addrp = (struct in6_addr *)(icmph + 1); + ipv6_addr_copy(addrp, target); + addrp++; + ipv6_addr_copy(addrp, &skb->ipv6_hdr->daddr); + + opt = (u8*) (addrp + 1); + + /* + * include target_address option + */ + + if (ta_len) + { + int zb; + + *(opt++) = ND_OPT_TARGET_LL_ADDR; + *(opt++) = ta_len; + + memcpy(opt, neigh->h_dest, neigh->dev->addr_len); + opt += neigh->dev->addr_len; + + /* + * if link layer address doesn't end on a 8 byte + * boundary memset(0) the remider + */ + + zb = (neigh->dev->addr_len + 2) & 0x7; + if (zb) + { + int comp; + + comp = 8 - zb; + memset(opt, 0, comp); + opt += comp; + } + } + + /* + * build redirect option and copy skb over to the new packet. + */ + + memset(opt, 0, 8); + *(opt++) = ND_OPT_REDIRECT_HDR; + *(opt++) = (rd_len >> 3); + opt += 6; + + memcpy(opt, &skb->ipv6_hdr, rd_len - 8); + + icmph->checksum = csum_ipv6_magic(&ifp->addr, &skb->ipv6_hdr->saddr, + len, IPPROTO_ICMPV6, + csum_partial((u8 *) icmph, len, 0)); + + ipv6_xmit(sk, buff, &ifp->addr, &skb->ipv6_hdr->saddr, NULL, IPPROTO_ICMPV6); +} + +/* Called by upper layers to validate neighbour cache entries. 
*/ + +void ndisc_validate(struct neighbour *neigh) +{ + if (neigh->nud_state == NUD_INCOMPLETE) + return; + + if (neigh->nud_state == NUD_DELAY) + { + ndisc_del_timer(neigh); + } + + nd_stats.rcv_upper_conf++; + neigh->nud_state = NUD_REACHABLE; + neigh->tstamp = jiffies; +} + +int ndisc_rcv(struct sk_buff *skb, struct device *dev, + struct in6_addr *saddr, struct in6_addr *daddr, + struct ipv6_options *opt, unsigned short len) +{ + struct nd_msg *msg = (struct nd_msg *) skb->h.raw; + struct neighbour *neigh; + struct inet6_ifaddr *ifp; + + switch (msg->icmph.type) { + case NDISC_NEIGHBOUR_SOLICITATION: + if ((ifp = ipv6_chk_addr(&msg->target))) + { + int addr_type; + + if (ifp->flags & DAD_INCOMPLETE) + { + /* + * DAD failed + */ + + printk(KERN_DEBUG "duplicate address\n"); + del_timer(&ifp->timer); + return 0; + } + + addr_type = ipv6_addr_type(saddr); + if (addr_type & IPV6_ADDR_UNICAST) + { + int inc; + + /* + * update / create cache entry + * for the source adddress + */ + + nd_stats.rcv_probes_ucast++; + ndisc_event_ns(saddr, skb); + + /* answer solicitation */ + neigh = ndisc_retrieve_neigh(dev, saddr); + + inc = ipv6_addr_type(daddr); + inc &= IPV6_ADDR_MULTICAST; + + ndisc_send_na(dev, neigh, saddr, &ifp->addr, + ifp->idev->router, 1, inc, inc); + } + else + { + /* FIXME */ + printk(KERN_DEBUG "ns: non unicast saddr\n"); + } + } + break; + + case NDISC_NEIGHBOUR_ADVERTISEMENT: + + neigh = ndisc_retrieve_neigh(skb->dev, &msg->target); + if (neigh) + { + if (neigh->flags & NCF_ROUTER) + { + if (msg->icmph.icmp6_router == 0) + { + /* + * Change: router to host + */ + + struct rt6_info *rt; + rt = ndisc_get_dflt_router(skb->dev, + saddr); + if (rt) + { + ndisc_del_dflt_router(rt); + } + } + } + else + { + if (msg->icmph.icmp6_router) + { + neigh->flags |= NCF_ROUTER; + } + } + ndisc_event_na(neigh, (unsigned char *) &msg->opt, + skb->tail - (u8 *)&msg->opt /*opt_len*/, + msg->icmph.icmp6_solicited, + msg->icmph.icmp6_override); + } + break; + + } + + if 
(ipv6_forwarding == 0) + { + switch (msg->icmph.type) { + case NDISC_ROUTER_ADVERTISEMENT: + ndisc_router_discovery(skb); + break; + + case NDISC_REDIRECT: + ndisc_redirect_rcv(skb); + break; + } + } + + return 0; +} + +int ndisc_get_info(char *buffer, char **start, off_t offset, int length, + int dummy) +{ + struct neighbour *neigh; + unsigned long now = jiffies; + int len = 0; + int i; + + atomic_inc(&ndisc_lock); + + for (i = 0; i < NCACHE_NUM_BUCKETS; i++) + { + for(neigh = neighbours[i]; neigh; neigh=neigh->next) + { + int j; + + for (j=0; j<16; j++) + { + sprintf(buffer + len, "%02x", + neigh->addr.s6_addr[j]); + len += 2; + } + + len += sprintf(buffer + len, + " %02x %02x %08lx %08lx %04x %04x ", + i, + neigh->nud_state, + neigh->expires - now, + now - neigh->tstamp, + neigh->refcnt, + neigh->flags); + + if (neigh->h_dest) + { + for (j=0; j< neigh->dev->addr_len; j++) + { + sprintf(buffer + len, "%02x", + neigh->h_dest[j]); + len += 2; + } + } + else + len += sprintf(buffer + len, "000000000000"); + len += sprintf(buffer + len, "\n"); + + } + } + + ndisc_release_lock(); + + *start = buffer + offset; + + len -= offset; + + if (len > length) + len = length; + return len; +} + +struct proc_dir_entry ndisc_proc_entry = +{ + 0, 11, "ndisc_cache", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, NULL, + &ndisc_get_info +}; + +void ndisc_init(struct proto_ops *ops) +{ + struct sock *sk; + int i = 0; + int err; + + /* + * Init ndisc_socket + */ + ndisc_socket.type=SOCK_RAW; + ndisc_socket.ops=ops; + + if((err=ops->create(&ndisc_socket, IPPROTO_ICMPV6))<0) + printk(KERN_DEBUG + "Failed to create the NDISC control socket.\n"); + + MOD_DEC_USE_COUNT; + + sk = ndisc_socket.data; + sk->allocation = GFP_ATOMIC; + sk->net_pinfo.af_inet6.hop_limit = 255; + sk->net_pinfo.af_inet6.priority = 15; + sk->num = 256; /* Don't receive any data */ + + /* + * Initialize the neighbours hash buckets. 
+ */ + + for (; i < NCACHE_NUM_BUCKETS; i++) + neighbours[i] = NULL; + + /* General ND state machine timer. */ + init_timer(&ndisc_timer); + ndisc_timer.function = ndisc_timer_handler; + ndisc_timer.data = 0L; + ndisc_timer.expires = 0L; + + /* ND GC timer */ + init_timer(&ndisc_gc_timer); + ndisc_gc_timer.function = ndisc_garbage_collect; + ndisc_gc_timer.data = 0L; + ndisc_gc_timer.expires = jiffies + nd_gc_interval; + + add_timer(&ndisc_gc_timer); + +#ifdef CONFIG_IPV6_MODULE + ndisc_eth_hook = ndisc_eth_resolv; + proc_register_dynamic(&proc_net, &ndisc_proc_entry); +#endif +} + +#ifdef CONFIG_IPV6_MODULE +void ndisc_cleanup(void) +{ + ndisc_eth_hook = NULL; + proc_unregister(&proc_net, ndisc_proc_entry.low_ino); + del_timer(&ndisc_gc_timer); + del_timer(&ndisc_timer); +} +#endif + +/* + * Local variables: + * compile-command: "gcc -D__KERNEL__ -I/usr/src/linux/include -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer -fno-strength-reduce -pipe -m486 -DCPU=486 -DMODULE -DMODVERSIONS -include /usr/src/linux/include/linux/modversions.h -c -o ndisc.o ndisc.c" + * c-file-style: "Linux" + * End: + */ diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c new file mode 100644 index 000000000..7ba6f5be1 --- /dev/null +++ b/net/ipv6/protocol.c @@ -0,0 +1,112 @@ +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> + +struct inet6_protocol *inet6_protocol_base = NULL; +struct inet6_protocol *inet6_protos[MAX_INET_PROTOS] = +{ + NULL +}; + + +struct inet6_protocol *inet6_get_protocol(unsigned char prot) +{ + unsigned char hash; + struct inet6_protocol *p; + + hash = prot & (MAX_INET_PROTOS - 1); + for (p = inet6_protos[hash] ; p != NULL; p=p->next) + { + if (p->protocol == prot) + 
return((struct inet6_protocol *) p); + } + return(NULL); +} + +void inet6_add_protocol(struct inet6_protocol *prot) +{ + unsigned char hash; + struct inet6_protocol *p2; + + hash = prot->protocol & (MAX_INET_PROTOS - 1); + prot ->next = inet6_protos[hash]; + inet6_protos[hash] = prot; + prot->copy = 0; + + /* + * Set the copy bit if we need to. + */ + + p2 = (struct inet6_protocol *) prot->next; + while(p2 != NULL) + { + if (p2->protocol == prot->protocol) + { + prot->copy = 1; + break; + } + p2 = (struct inet6_protocol *) p2->next; + } +} + +/* + * Remove a protocol from the hash tables. + */ + +int inet6_del_protocol(struct inet6_protocol *prot) +{ + struct inet6_protocol *p; + struct inet6_protocol *lp = NULL; + unsigned char hash; + + hash = prot->protocol & (MAX_INET_PROTOS - 1); + if (prot == inet6_protos[hash]) + { + inet6_protos[hash] = (struct inet6_protocol *) inet6_protos[hash]->next; + return(0); + } + + p = (struct inet6_protocol *) inet6_protos[hash]; + while(p != NULL) + { + /* + * We have to worry if the protocol being deleted is + * the last one on the list, then we may need to reset + * someone's copied bit. + */ + if (p->next != NULL && p->next == prot) + { + /* + * if we are the last one with this protocol and + * there is a previous one, reset its copy bit. 
+ */ + if (p->copy == 0 && lp != NULL) + lp->copy = 0; + p->next = prot->next; + return(0); + } + if (p->next != NULL && p->next->protocol == prot->protocol) + lp = p; + + p = (struct inet6_protocol *) p->next; + } + return(-1); +} + +/* + * Local variables: + * compile-command: "gcc -D__KERNEL__ -I/usr/src/linux/include -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer -fno-strength-reduce -pipe -m486 -DCPU=486 -DMODULE -DMODVERSIONS -include /usr/src/linux/include/linux/modversions.h -c -o protocol.o protocol.c" + * End: + */ diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c new file mode 100644 index 000000000..bab6514d6 --- /dev/null +++ b/net/ipv6/raw.c @@ -0,0 +1,474 @@ +/* + * RAW sockets for IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * Adapted from linux/net/ipv4/raw.c + * + * $Id: raw.c,v 1.5 1996/10/29 22:45:53 roque Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/icmpv6.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ip.h> +#include <net/udp.h> + +#include <net/ipv6.h> +#include <net/ndisc.h> +#include <net/protocol.h> +#include <net/ipv6_route.h> +#include <net/addrconf.h> +#include <net/transp_v6.h> + +#include <asm/uaccess.h> + +void rawv6_err(struct sock *sk, int type, int code, unsigned char *buff, + struct in6_addr *saddr, struct in6_addr *daddr) +{ + if (sk == NULL) + return; + +} + +static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) +{ + /* Charge it to the socket. 
*/ + + if (sock_queue_rcv_skb(sk,skb)<0) + { + /* ip_statistics.IpInDiscards++; */ + skb->sk=NULL; + kfree_skb(skb, FREE_READ); + return 0; + } + + /* ip_statistics.IpInDelivers++; */ + return 0; +} + +/* + * This is next to useless... + * if we demultiplex in network layer we don't need the extra call + * just to queue the skb... + * maybe we could have the network decide uppon an hint if it + * should call raw_rcv for demultiplexing + */ +int rawv6_rcv(struct sk_buff *skb, struct device *dev, + struct in6_addr *saddr, struct in6_addr *daddr, + struct ipv6_options *opt, unsigned short len) +{ + struct sock *sk; + + sk = skb->sk; + + if (sk->ip_hdrincl) + { + skb->h.raw = (unsigned char *) skb->ipv6_hdr; + } + + if (sk->users) { + __skb_queue_tail(&sk->back_log, skb); + return 0; + } + + rawv6_rcv_skb(sk, skb); + return 0; +} + + +/* + * This should be easy, if there is something there + * we return it, otherwise we block. + */ + +int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, + int noblock, int flags,int *addr_len) +{ + struct sockaddr_in6 *sin6=(struct sockaddr_in6 *)msg->msg_name; + struct sk_buff *skb; + int copied=0; + int err; + + + if (flags & MSG_OOB) + return -EOPNOTSUPP; + + if (sk->shutdown & RCV_SHUTDOWN) + return(0); + + if (addr_len) + *addr_len=sizeof(*sin6); + + skb=skb_recv_datagram(sk, flags, noblock, &err); + if(skb==NULL) + return err; + + copied = min(len, skb->tail - skb->h.raw); + + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + sk->stamp=skb->stamp; + + if (err) + return err; + + /* Copy the address. */ + if (sin6) + { + sin6->sin6_family = AF_INET6; + memcpy(&sin6->sin6_addr, &skb->ipv6_hdr->saddr, + sizeof(struct in6_addr)); + + *addr_len = sizeof(struct sockaddr_in6); + } + + if (msg->msg_control) + { + int err; + + err = datagram_recv_ctl(sk, msg, skb); + + if (err < 0) + { + copied = err; + } + } + + skb_free_datagram(sk, skb); + return (copied); +} + +/* + * Sending... 
+ */ + +struct rawv6_fakehdr { + struct iovec *iov; + struct sock *sk; + __u32 len; + __u32 cksum; + __u32 proto; + struct in6_addr *daddr; +}; + +static int rawv6_getfrag(const void *data, struct in6_addr *saddr, + char *buff, unsigned int offset, unsigned int len) +{ + struct iovec *iov = (struct iovec *) data; + + return memcpy_fromiovecend(buff, iov, offset, len); +} + +static int rawv6_frag_cksum(const void *data, struct in6_addr *addr, + char *buff, unsigned int offset, + unsigned int len) +{ + struct rawv6_fakehdr *hdr = (struct rawv6_fakehdr *) data; + + hdr->cksum = csum_partial_copy_fromiovecend(buff, hdr->iov, offset, + len, hdr->cksum); + + if (offset == 0) + { + struct sock *sk; + struct raw6_opt *opt; + struct in6_addr *daddr; + + sk = hdr->sk; + opt = &sk->tp_pinfo.tp_raw; + + if (hdr->daddr) + { + daddr = hdr->daddr; + } + else + { + daddr = addr + 1; + } + + hdr->cksum = csum_ipv6_magic(addr, daddr, hdr->len, + hdr->proto, hdr->cksum); + + if (opt->offset < len) + { + __u16 *csum; + + csum = (__u16 *) (buff + opt->offset); + *csum = hdr->cksum; + } + else + { + /* + * FIXME + * signal an error to user via sk->err + */ + printk(KERN_DEBUG "icmp: cksum offset too big\n"); + } + } + return 0; +} + + +static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len, + int noblock, int flags) +{ + struct ipv6_options opt_space; + struct sockaddr_in6 * sin6 = (struct sockaddr_in6 *) msg->msg_name; + struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; + struct ipv6_options *opt = NULL; + struct device *dev = NULL; + struct in6_addr *saddr = NULL; + int addr_len = msg->msg_namelen; + struct in6_addr *daddr; + struct raw6_opt *raw_opt; + u16 proto; + int err; + + + /* Mirror BSD error message compatibility */ + if (flags & MSG_OOB) + return -EOPNOTSUPP; + + if (flags & ~MSG_DONTROUTE) + return(-EINVAL); + /* + * Get and verify the address. 
+ */ + + if (sin6) + { + if (addr_len < sizeof(struct sockaddr_in6)) + return(-EINVAL); + + if (sin6->sin6_family && sin6->sin6_family != AF_INET6) + return(-EINVAL); + + /* port is the proto value [0..255] carried in nexthdr */ + proto = ntohs(sin6->sin6_port); + + if (!proto) + proto = sk->num; + + if (proto > 255) + return(-EINVAL); + + daddr = &sin6->sin6_addr; + + if (np->dest && ipv6_addr_cmp(daddr, &np->daddr)) + { + ipv6_dst_unlock(np->dest); + np->dest = NULL; + } + } + else + { + if (sk->state != TCP_ESTABLISHED) + return(-EINVAL); + + proto = sk->num; + daddr = &(sk->net_pinfo.af_inet6.daddr); + } + + if (ipv6_addr_any(daddr)) + { + /* + * unspecfied destination address + * treated as error... is this correct ? + */ + return(-EINVAL); + } + + /* + * We don't allow > 64K sends yet. + */ + if (len + (sk->ip_hdrincl ? 0 : sizeof(struct ipv6hdr)) > 65535) + return -EMSGSIZE; + + if (msg->msg_control) + { + opt = &opt_space; + memset(opt, 0, sizeof(struct ipv6_options)); + + err = datagram_send_ctl(msg, &dev, &saddr, opt); + if (err < 0) + { + printk(KERN_DEBUG "invalid msg_control\n"); + return err; + } + } + + raw_opt = &sk->tp_pinfo.tp_raw; + + + if (raw_opt->checksum) + { + struct rawv6_fakehdr hdr; + + hdr.iov = msg->msg_iov; + hdr.sk = sk; + hdr.len = len; + hdr.cksum = 0; + hdr.proto = proto; + + if (opt && opt->srcrt) + { + hdr.daddr = daddr; + } + else + { + hdr.daddr = NULL; + } + + err = ipv6_build_xmit(sk, rawv6_frag_cksum, &hdr, daddr, len, + saddr, dev, opt, proto, noblock); + } + else + { + err = ipv6_build_xmit(sk, rawv6_getfrag, msg->msg_iov, daddr, + len, saddr, dev, opt, proto, + noblock); + } + + return err<0?err:len; +} + +static int rawv6_seticmpfilter(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + struct raw6_opt *opt = &sk->tp_pinfo.tp_raw; + int err = 0; + + switch (optname) { + case ICMPV6_FILTER: + err = copy_from_user(&opt->filter, optval, + sizeof(struct icmp6_filter)); + if (err) + err = -EFAULT; + 
break; + default: + err = -ENOPROTOOPT; + }; + + return err; +} + +static int rawv6_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + struct raw6_opt *opt = &sk->tp_pinfo.tp_raw; + int val, err; + + switch(level) + { + case SOL_RAW: + break; + + case SOL_ICMPV6: + if (sk->num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + return rawv6_seticmpfilter(sk, level, optname, optval, + optlen); + case SOL_IPV6: + if (optname == IPV6_CHECKSUM) + break; + default: + return ipv6_setsockopt(sk, level, optname, optval, + optlen); + } + + if (optval == NULL) + return(-EINVAL); + + err = get_user(val, (int *)optval); + if(err) + return err; + + switch (optname) + { + case IPV6_CHECKSUM: + if (val < 0) + { + opt->checksum = 0; + } + else + { + opt->checksum = 1; + opt->offset = val; + } + + return 0; + break; + + default: + return(-ENOPROTOOPT); + } +} + +static void rawv6_close(struct sock *sk, unsigned long timeout) +{ + struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; + + sk->state = TCP_CLOSE; + + if (np->dest) + { + ipv6_dst_unlock(np->dest); + } + + destroy_sock(sk); +} + +static int rawv6_init_sk(struct sock *sk) +{ + return(0); +} + +struct proto rawv6_prot = { + rawv6_close, + udpv6_connect, + NULL, + NULL, + NULL, + NULL, + datagram_select, + NULL, + rawv6_init_sk, + NULL, + NULL, + rawv6_setsockopt, + ipv6_getsockopt, /* FIXME */ + rawv6_sendmsg, + rawv6_recvmsg, + NULL, /* No special bind */ + rawv6_rcv_skb, + 128, + 0, + "RAW", + 0, 0, + NULL +}; + +/* + * Local variables: + * compile-command: "gcc -D__KERNEL__ -I/usr/src/linux/include -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer -fno-strength-reduce -pipe -m486 -DCPU=486 -DMODULE -DMODVERSIONS -include /usr/src/linux/include/linux/modversions.h -c -o rawv6.o rawv6.c" + * c-file-style: "Linux" + * End: + */ diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c new file mode 100644 index 000000000..e76dcc17c --- /dev/null +++ b/net/ipv6/reassembly.c @@ -0,0 +1,354 @@ +/* + * 
IPv6 fragment reassembly + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * Based on: net/ipv4/ip_fragment.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/in6.h> +#include <linux/ipv6.h> +#include <linux/icmpv6.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/rawv6.h> +#include <net/ndisc.h> +#include <net/ipv6_route.h> +#include <net/addrconf.h> + + +static struct frag_queue ipv6_frag_queue = { + &ipv6_frag_queue, &ipv6_frag_queue, + 0, {0}, NULL, NULL, + 0 +}; + +static void create_frag_entry(struct sk_buff *skb, + struct device *dev, + __u8 *nhptr, + struct frag_hdr *fhdr); +static int reasm_frag_1(struct frag_queue *fq, + struct sk_buff **skb_in); + +static void reasm_queue(struct frag_queue *fq, + struct sk_buff *skb, + struct frag_hdr *fhdr); + +static int reasm_frag(struct frag_queue *fq, struct sk_buff **skb, + __u8 *nhptr, + struct frag_hdr *fhdr) +{ + __u32 expires; + int nh; + + expires = del_timer(&fq->timer); + + /* + * We queue the packet even if it's the last. + * It's a trade off. This allows the reassembly + * code to be simpler (=faster) and of the + * steps we do for queueing the only unnecessary + * one it's the kmalloc for a struct ipv6_frag. + * Feel free to try other alternatives... 
+ */ + reasm_queue(fq, *skb, fhdr); + + if ((fhdr->frag_off & __constant_htons(0x0001)) == 0) + { + fq->last_in = 1; + fq->nhptr = nhptr; + } + + if (fq->last_in) + { + if ((nh = reasm_frag_1(fq, skb))) + return nh; + } + + fq->timer.expires = expires; + add_timer(&fq->timer); + + return 0; +} + +int ipv6_reassembly(struct sk_buff **skb, struct device *dev, __u8 *nhptr, + struct ipv6_options *opt) +{ + struct frag_hdr *fhdr = (struct frag_hdr *) ((*skb)->h.raw); + struct frag_queue *fq; + + for (fq = ipv6_frag_queue.next; fq != &ipv6_frag_queue; fq = fq->next) + { + if (fq->id == fhdr->identification) + { + return reasm_frag(fq, skb, nhptr,fhdr); + } + } + + create_frag_entry(*skb, dev, nhptr, fhdr); + + + return 0; +} + + +static void fq_free(struct frag_queue *fq) +{ + struct ipv6_frag *fp, *back; + + for(fp = fq->fragments; fp; ) + { + kfree_skb(fp->skb, FREE_READ); + back = fp; + fp=fp->next; + kfree(back); + } + + fq->prev->next = fq->next; + fq->next->prev = fq->prev; + + fq->prev = fq->next = NULL; + + kfree(fq); + +} + +static void frag_expire(unsigned long data) +{ + struct frag_queue *fq; + struct ipv6_frag *frag; + + fq = (struct frag_queue *) data; + + del_timer(&fq->timer); + + frag = fq->fragments; + + if (frag == NULL) + { + printk(KERN_DEBUG "invalid fragment queue\n"); + return; + } + + icmpv6_send(frag->skb, ICMPV6_TIME_EXCEEDED, ICMPV6_EXC_FRAGTIME, 0, + frag->skb->dev); + + fq_free(fq); +} + + +static void create_frag_entry(struct sk_buff *skb, struct device *dev, + __u8 *nhptr, + struct frag_hdr *fhdr) +{ + struct frag_queue *fq; + + fq = (struct frag_queue *) kmalloc(sizeof(struct frag_queue), + GFP_ATOMIC); + + if (fq == NULL) + { + kfree_skb(skb, FREE_READ); + return; + } + + memset(fq, 0, sizeof(struct frag_queue)); + + fq->id = fhdr->identification; + + fq->dev = dev; + + /* init_timer has been done by the memset */ + fq->timer.function = frag_expire; + fq->timer.data = (long) fq; + fq->timer.expires = jiffies + IPV6_FRAG_TIMEOUT; + + 
fq->nexthdr = fhdr->nexthdr; + + + if ((fhdr->frag_off & __constant_htons(0x0001)) == 0) + { + fq->last_in = 1; + fq->nhptr = nhptr; + } + reasm_queue(fq, skb, fhdr); + + fq->prev = ipv6_frag_queue.prev; + fq->next = &ipv6_frag_queue; + fq->prev->next = fq; + ipv6_frag_queue.prev = fq; + + add_timer(&fq->timer); +} + + +static void reasm_queue(struct frag_queue *fq, struct sk_buff *skb, + struct frag_hdr *fhdr) +{ + struct ipv6_frag *nfp, *fp, **bptr; + + nfp = (struct ipv6_frag *) kmalloc(sizeof(struct ipv6_frag), + GFP_ATOMIC); + + if (nfp == NULL) + { + kfree_skb(skb, FREE_READ); + return; + } + + + nfp->offset = ntohs(fhdr->frag_off) & ~0x7; + nfp->len = (ntohs(skb->ipv6_hdr->payload_len) - + ((u8 *) (fhdr + 1) - (u8 *) (skb->ipv6_hdr + 1))); + + + nfp->skb = skb; + nfp->fhdr = fhdr; + + nfp->next = NULL; + + bptr = &fq->fragments; + + + for (fp = fq->fragments; fp; fp=fp->next) + { + if (nfp->offset <= fp->offset) + break; + bptr = &fp->next; + } + + if (fp && fp->offset == nfp->offset) + { + if (fp->len != nfp->len) + { + /* this cannot happen */ + printk(KERN_DEBUG "reasm_queue: dup with wrong len\n"); + } + + /* duplicate. discard it. */ + kfree_skb(skb, FREE_READ); + kfree(nfp); + return; + } + + + *bptr = nfp; + nfp->next = fp; +} + +/* + * check if this fragment completes the packet + * returns true on success + */ +static int reasm_frag_1(struct frag_queue *fq, struct sk_buff **skb_in) +{ + struct ipv6_frag *fp; + struct ipv6_frag *tail = NULL; + struct sk_buff *skb; + __u32 offset = 0; + __u32 payload_len; + __u16 unfrag_len; + __u16 copy; + int nh; + + + for(fp = fq->fragments; fp; fp=fp->next) + { + if (offset != fp->offset) + return 0; + + offset += fp->len; + tail = fp; + } + + /* + * we know the m_flag arrived and we have a queue, + * starting from 0, without gaps. + * this means we have all fragments. 
+ */ + + unfrag_len = (u8 *) (tail->fhdr) - (u8 *) (tail->skb->ipv6_hdr + 1); + + payload_len = (unfrag_len + tail->offset + + (tail->skb->tail - (__u8 *) (tail->fhdr + 1))); + + printk(KERN_DEBUG "reasm: payload len = %d\n", payload_len); + + if ((skb = dev_alloc_skb(sizeof(struct ipv6hdr) + payload_len))==NULL) + { + printk(KERN_DEBUG "reasm_frag: no memory for reassembly\n"); + fq_free(fq); + return 1; + } + + copy = unfrag_len + sizeof(struct ipv6hdr); + + skb->ipv6_hdr = (struct ipv6hdr *) skb->data; + + skb->free = 1; + skb->dev = fq->dev; + + + nh = fq->nexthdr; + + *(fq->nhptr) = nh; + memcpy(skb_put(skb, copy), tail->skb->ipv6_hdr, copy); + + skb->h.raw = skb->tail; + + skb->ipv6_hdr->payload_len = ntohs(payload_len); + + *skb_in = skb; + + /* + * FIXME: If we don't have a checksum we ought to be able + * to defragment and checksum in this pass. [AC] + */ + for(fp = fq->fragments; fp; ) + { + struct ipv6_frag *back; + + memcpy(skb_put(skb, fp->len), (__u8*)(fp->fhdr + 1), fp->len); + kfree_skb(fp->skb, FREE_READ); + back = fp; + fp=fp->next; + kfree(back); + } + + fq->prev->next = fq->next; + fq->next->prev = fq->prev; + + fq->prev = fq->next = NULL; + + kfree(fq); + + return nh; +} + + + +/* + * Local variables: + * compile-command: "gcc -D__KERNEL__ -I/usr/src/linux/include -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer -fno-strength-reduce -pipe -m486 -DCPU=486 -DMODULE -DMODVERSIONS -include /usr/src/linux/include/linux/modversions.h -c -o reassembly.o reassembly.c" + * c-file-style: "Linux" + * End: + */ diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c new file mode 100644 index 000000000..f96b62229 --- /dev/null +++ b/net/ipv6/sit.c @@ -0,0 +1,599 @@ +/* + * IPv6 over IPv4 tunnel device - Simple Internet Transition (SIT) + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as 
published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/icmp.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/ndisc.h> +#include <net/ipv6_route.h> +#include <net/addrconf.h> +#include <net/ip.h> +#include <net/udp.h> +#include <net/sit.h> + + +static int sit_init_dev(struct device *dev); + +static struct device sit_device = { + "sit0", + 0, 0, 0, 0, + 0x0, 0, + 0, 0, 0, NULL, sit_init_dev +}; + +static unsigned long sit_gc_last_run; +static void sit_mtu_cache_gc(void); + +static int sit_xmit(struct sk_buff *skb, + struct device *dev); +static int sit_rcv(struct sk_buff *skb, + struct device *dev, + struct options *opt, + __u32 daddr, unsigned short len, + __u32 saddr, int redo, + struct inet_protocol * protocol); + +static int sit_open(struct device *dev); +static int sit_close(struct device *dev); + +static struct enet_statistics * sit_get_stats(struct device *dev); + +static void sit_err(int type, int code, + unsigned char *buff, __u32 info, + __u32 daddr, __u32 saddr, + struct inet_protocol *protocol, + int len); + +static struct inet_protocol sit_protocol = { + sit_rcv, + sit_err, + 0, + IPPROTO_IPV6, + 0, + NULL, + "IPv6" +}; + +#define SIT_NUM_BUCKETS 16 + +struct sit_mtu_info *sit_mtu_cache[SIT_NUM_BUCKETS]; + +static int vif_num = 0; +static struct sit_vif *vif_list = NULL; + +static __inline__ __u32 sit_addr_hash(__u32 addr) +{ + + __u32 hash_val; + + hash_val = addr; + + hash_val ^= hash_val >> 16; + hash_val ^= hash_val >> 8; + + return (hash_val & (SIT_NUM_BUCKETS - 1)); +} + +static void sit_cache_insert(__u32 addr, int mtu) +{ + struct 
sit_mtu_info *minfo; + int hash; + + minfo = kmalloc(sizeof(struct sit_mtu_info), GFP_ATOMIC); + + if (minfo == NULL) + return; + + minfo->addr = addr; + minfo->tstamp = jiffies; + minfo->mtu = mtu; + + hash = sit_addr_hash(addr); + + minfo->next = sit_mtu_cache[hash]; + sit_mtu_cache[hash] = minfo; +} + +static struct sit_mtu_info * sit_mtu_lookup(__u32 addr) +{ + struct sit_mtu_info *iter; + int hash; + + hash = sit_addr_hash(addr); + + for(iter = sit_mtu_cache[hash]; iter; iter=iter->next) + { + if (iter->addr == addr) + { + iter->tstamp = jiffies; + break; + } + } + + /* + * run garbage collector + */ + + if (jiffies - sit_gc_last_run > SIT_GC_FREQUENCY) + { + sit_mtu_cache_gc(); + sit_gc_last_run = jiffies; + } + + return iter; +} + +static void sit_mtu_cache_gc(void) +{ + struct sit_mtu_info *iter, *back; + unsigned long now = jiffies; + int i; + + for (i=0; i < SIT_NUM_BUCKETS; i++) + { + back = NULL; + for (iter = sit_mtu_cache[i]; iter;) + { + if (now - iter->tstamp > SIT_GC_TIMEOUT) + { + struct sit_mtu_info *old; + + old = iter; + iter = iter->next; + + if (back) + { + back->next = iter; + } + else + { + sit_mtu_cache[i] = iter; + } + + kfree(old); + continue; + } + back = iter; + iter = iter->next; + } + } +} + +static int sit_init_dev(struct device *dev) +{ + int i; + + dev->open = sit_open; + dev->stop = sit_close; + + dev->hard_start_xmit = sit_xmit; + dev->get_stats = sit_get_stats; + + dev->priv = kmalloc(sizeof(struct enet_statistics), GFP_KERNEL); + + if (dev->priv == NULL) + return -ENOMEM; + + memset(dev->priv, 0, sizeof(struct enet_statistics)); + + + for (i = 0; i < DEV_NUMBUFFS; i++) + skb_queue_head_init(&dev->buffs[i]); + + dev->hard_header = NULL; + dev->rebuild_header = NULL; + dev->set_mac_address = NULL; + dev->header_cache_bind = NULL; + dev->header_cache_update= NULL; + + dev->type = ARPHRD_SIT; + + dev->hard_header_len = MAX_HEADER; + dev->mtu = 1500 - sizeof(struct iphdr); + dev->addr_len = 0; + dev->tx_queue_len = 2; + + 
memset(dev->broadcast, 0, MAX_ADDR_LEN); + memset(dev->dev_addr, 0, MAX_ADDR_LEN); + + dev->flags = IFF_NOARP; + + dev->family = AF_INET6; + dev->pa_addr = 0; + dev->pa_brdaddr = 0; + dev->pa_dstaddr = 0; + dev->pa_mask = 0; + dev->pa_alen = 4; + + return 0; +} + +static int sit_init_vif(struct device *dev) +{ + int i; + + dev->flags = IFF_NOARP|IFF_POINTOPOINT|IFF_MULTICAST; + dev->priv = kmalloc(sizeof(struct enet_statistics), GFP_KERNEL); + + if (dev->priv == NULL) + return -ENOMEM; + + memset(dev->priv, 0, sizeof(struct enet_statistics)); + + for (i = 0; i < DEV_NUMBUFFS; i++) + skb_queue_head_init(&dev->buffs[i]); + + return 0; +} + +static int sit_open(struct device *dev) +{ + return 0; +} + +static int sit_close(struct device *dev) +{ + return 0; +} + + +int sit_init(void) +{ + int i; + + /* register device */ + + if (register_netdev(&sit_device) != 0) + { + return -EIO; + } + + inet_add_protocol(&sit_protocol); + + for (i=0; i < SIT_NUM_BUCKETS; i++) + sit_mtu_cache[i] = NULL; + + sit_gc_last_run = jiffies; + + return 0; +} + +struct device *sit_add_tunnel(__u32 dstaddr) +{ + struct sit_vif *vif; + struct device *dev; + + if ((sit_device.flags & IFF_UP) == 0) + return NULL; + + vif = kmalloc(sizeof(struct sit_vif), GFP_KERNEL); + if (vif == NULL) + return NULL; + + /* + * Create PtoP configured tunnel + */ + + dev = kmalloc(sizeof(struct device), GFP_KERNEL); + if (dev == NULL) + return NULL; + + memcpy(dev, &sit_device, sizeof(struct device)); + dev->init = sit_init_vif; + dev->pa_dstaddr = dstaddr; + + dev->name = vif->name; + sprintf(vif->name, "sit%d", ++vif_num); + + register_netdev(dev); + + vif->dev = dev; + vif->next = vif_list; + vif_list = vif; + + return dev; +} + +void sit_cleanup(void) +{ + struct sit_vif *vif; + + for (vif = vif_list; vif;) + { + struct device *dev = vif->dev; + struct sit_vif *cur; + + unregister_netdev(dev); + kfree(dev->priv); + kfree(dev); + + cur = vif; + vif = vif->next; + } + + vif_list = NULL; + + 
unregister_netdev(&sit_device); + inet_del_protocol(&sit_protocol); + +} + + + +/* + * receive IPv4 ICMP messages + */ + +static void sit_err(int type, int code, unsigned char *buff, __u32 info, + __u32 daddr, __u32 saddr, struct inet_protocol *protocol, + int len) + +{ + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) + { + struct sit_mtu_info *minfo; + + info -= sizeof(struct iphdr); + + minfo = sit_mtu_lookup(daddr); + + printk(KERN_DEBUG "sit: %08lx pmtu = %ul\n", ntohl(saddr), + info); + if (minfo == NULL) + { + minfo = kmalloc(sizeof(struct sit_mtu_info), + GFP_ATOMIC); + + if (minfo == NULL) + return; + + start_bh_atomic(); + sit_cache_insert(daddr, info); + end_bh_atomic(); + } + else + { + minfo->mtu = info; + } + } +} + +static int sit_rcv(struct sk_buff *skb, struct device *idev, + struct options *opt, + __u32 daddr, unsigned short len, + __u32 saddr, int redo, struct inet_protocol * protocol) +{ + struct enet_statistics *stats; + struct device *dev = NULL; + struct sit_vif *vif; + + skb->h.raw = skb_pull(skb, skb->h.raw - skb->data); + skb->protocol = __constant_htons(ETH_P_IPV6); + + for (vif = vif_list; vif; vif = vif->next) + { + if (saddr == vif->dev->pa_dstaddr) + { + dev = vif->dev; + break; + } + } + + if (dev == NULL) + { + dev = &sit_device; + } + + skb->dev = dev; + skb->ip_summed = CHECKSUM_NONE; + + stats = (struct enet_statistics *)dev->priv; + stats->rx_packets++; + + ipv6_rcv(skb, dev, NULL); + return 0; +} + +static int sit_xmit(struct sk_buff *skb, struct device *dev) +{ + struct enet_statistics *stats; + struct sit_mtu_info *minfo; + struct in6_addr *addr6; + unsigned long flags; + struct rtable *rt; + struct iphdr *iph; + __u32 saddr; + __u32 daddr; + __u32 raddr; + int addr_type; + int mtu; + int len; + + /* + * Make sure we are not busy (check lock variable) + */ + + stats = (struct enet_statistics *)dev->priv; + save_flags(flags); + cli(); + if (dev->tbusy != 0) + { + restore_flags(flags); + printk(KERN_DEBUG "sit_xmit: 
busy\n"); + return(1); + } + dev->tbusy = 1; + restore_flags(flags); + + daddr = dev->pa_dstaddr; + if (daddr == 0) + { + struct neighbour *neigh; + + neigh = skb->nexthop; + if (neigh == NULL) + { + printk(KERN_DEBUG "sit: nexthop == NULL\n"); + goto on_error; + } + + addr6 = &neigh->addr; + addr_type = ipv6_addr_type(addr6); + + if (addr_type == IPV6_ADDR_ANY) + { + addr6 = &skb->ipv6_hdr->daddr; + addr_type = ipv6_addr_type(addr6); + } + + if ((addr_type & IPV6_ADDR_COMPATv4) == 0) + { + printk(KERN_DEBUG "sit_xmit: non v4 address\n"); + goto on_error; + } + daddr = addr6->s6_addr32[3]; + } + + len = skb->tail - (skb->data + sizeof(struct ipv6hdr)); + + if (skb->sk) + { + atomic_sub(skb->truesize, &skb->sk->wmem_alloc); + } + + skb->sk = NULL; + + iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr)); + + skb->protocol = htons(ETH_P_IP); + + /* get route */ + + rt = ip_rt_route(daddr, skb->localroute); + + if (rt == NULL) + { + printk(KERN_DEBUG "sit: no route to host\n"); + goto on_error; + } + + minfo = sit_mtu_lookup(daddr); + + if (minfo) + mtu = minfo->mtu; + else + mtu = rt->rt_dev->mtu; + + if (mtu > 576 && len > mtu) + { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); + goto on_error; + } + + saddr = rt->rt_src; + skb->dev = rt->rt_dev; + raddr = rt->rt_gateway; + + if (raddr == 0) + raddr = daddr; + + /* now for the device header */ + + skb->arp = 1; + + if (skb->dev->hard_header_len) + { + int mac; + + if (skb->data - skb->head < skb->dev->hard_header_len) + { + printk(KERN_DEBUG "sit: space at head < dev header\n"); + goto on_error; + } + + if (skb->dev->hard_header) + { + mac = skb->dev->hard_header(skb, skb->dev, ETH_P_IP, + NULL, NULL, len); + + if (mac < 0) + skb->arp = 0; + + skb->raddr = raddr; + } + + } + + ip_rt_put(rt); + + + iph->version = 4; + iph->ihl = 5; + iph->tos = 0; /* tos set to 0... 
*/ + + if (mtu > 576) + { + iph->frag_off = htons(IP_DF); + } + else + iph->frag_off = 0; + + iph->ttl = 64; + iph->saddr = saddr; + iph->daddr = daddr; + iph->protocol = IPPROTO_IPV6; + skb->ip_hdr = iph; + + ip_send_check(iph); + + ip_queue_xmit(NULL, skb->dev, skb, 1); + + stats->tx_packets++; + dev->tbusy=0; + + return 0; + + on_error: + kfree_skb(skb, FREE_WRITE); + dev->tbusy=0; + stats->tx_errors++; + return 0; +} + +static struct enet_statistics *sit_get_stats(struct device *dev) +{ + return((struct enet_statistics*) dev->priv); +} + + +/* + * Local variables: + * compile-command: "gcc -D__KERNEL__ -I/usr/src/linux/include -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer -fno-strength-reduce -pipe -m486 -DCPU=486 -DMODULE -DMODVERSIONS -include /usr/src/linux/include/linux/modversions.h -c -o sit.o sit.c" + * c-file-style: "Linux" + * End: + */ diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c new file mode 100644 index 000000000..ce7bb4681 --- /dev/null +++ b/net/ipv6/sysctl_net_ipv6.c @@ -0,0 +1,78 @@ +/* + * sysctl_net_ipv6.c: sysctl interface to net IPV6 subsystem. 
+ */ + +#include <linux/mm.h> +#include <linux/sysctl.h> +#include <linux/in6.h> +#include <linux/ipv6.h> +#include <net/ndisc.h> +#include <net/ipv6.h> +#include <net/addrconf.h> + + +int ipv6_hop_limit = IPV6_DEFAULT_HOPLIMIT; + +int ipv6_sysctl_forwarding(ctl_table *ctl, int write, struct file * filp, + void *buffer, size_t *lenp) +{ + int val = ipv6_forwarding; + int retv; + + retv = proc_dointvec(ctl, write, filp, buffer, lenp); + + if (write) + { + if (ipv6_forwarding && val == 0) { + printk(KERN_DEBUG "sysctl: IPv6 forwarding enabled\n"); + ndisc_forwarding_on(); + addrconf_forwarding_on(); + } + + if (ipv6_forwarding == 0 && val) { + ndisc_forwarding_off(); + } + } + return retv; +} + +ctl_table ipv6_table[] = { + {NET_IPV6_FORWARDING, "ipv6_forwarding", + &ipv6_forwarding, sizeof(int), 0644, NULL, + &ipv6_sysctl_forwarding}, + + {NET_IPV6_HOPLIMIT, "ipv6_hop_limit", + &ipv6_hop_limit, sizeof(int), 0644, NULL, + &proc_dointvec}, + + {0} +}; + +#ifdef MODULE +static struct ctl_table_header *ipv6_sysctl_header; +static struct ctl_table ipv6_root_table[]; +static struct ctl_table ipv6_net_table[]; + + +ctl_table ipv6_root_table[] = { + {CTL_NET, "net", NULL, 0, 0555, ipv6_net_table}, + {0} +}; + +ctl_table ipv6_net_table[] = { + {NET_IPV6, "ipv6", NULL, 0, 0555, ipv6_table}, + {0} +}; + +void ipv6_sysctl_register(void) +{ + ipv6_sysctl_header = register_sysctl_table(ipv6_root_table, 0); +} + +void ipv6_sysctl_unregister(void) +{ + unregister_sysctl_table(ipv6_sysctl_header); +} + +#endif + diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c new file mode 100644 index 000000000..bb03b34dd --- /dev/null +++ b/net/ipv6/tcp_ipv6.c @@ -0,0 +1,1280 @@ +/* + * TCP over IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * $Id: tcp_ipv6.c,v 1.15 1996/10/29 22:45:53 roque Exp $ + * + * Based on: + * linux/net/ipv4/tcp.c + * linux/net/ipv4/tcp_input.c + * linux/net/ipv4/tcp_output.c + * + * This program is free software; you 
can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/sched.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/netdevice.h> + +#include <linux/ipv6.h> +#include <linux/icmpv6.h> +#include <linux/random.h> + +#include <net/tcp.h> +#include <net/ndisc.h> +#include <net/ipv6.h> +#include <net/transp_v6.h> +#include <net/addrconf.h> +#include <net/ipv6_route.h> + +#include <asm/uaccess.h> + +static void tcp_v6_send_reset(struct in6_addr *saddr, struct in6_addr *daddr, + struct tcphdr *th, struct proto *prot, + struct ipv6_options *opt, + struct device *dev, int pri, int hop_limit); + +static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, + struct sk_buff *skb); + +static int tcp_v6_backlog_rcv(struct sock *sk, struct sk_buff *skb); +static int tcp_v6_build_header(struct sock *sk, struct sk_buff *skb); + +static struct tcp_func ipv6_mapped; +static struct tcp_func ipv6_specific; + +static __inline__ u16 tcp_v6_check(struct tcphdr *th, int len, + struct in6_addr *saddr, + struct in6_addr *daddr, + unsigned long base) +{ + return csum_ipv6_magic(saddr, daddr, len, IPPROTO_TCP, base); +} + +static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb) +{ + __u32 si; + __u32 di; + + if (skb->protocol == __constant_htons(ETH_P_IPV6)) + { + si = skb->ipv6_hdr->saddr.s6_addr32[3]; + di = skb->ipv6_hdr->daddr.s6_addr32[3]; + } + else + { + si = skb->saddr; + di = skb->daddr; + } + + return secure_tcp_sequence_number(di, si, + skb->h.th->dest, + skb->h.th->source); +} + +static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; + 
struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct dest_entry *dc; + struct inet6_ifaddr *ifa; + struct tcphdr *th; + __u8 *ptr; + struct sk_buff *buff; + struct sk_buff *skb1; + int addr_type; + int tmp; + + if (sk->state != TCP_CLOSE) + return(-EISCONN); + + /* + * Don't allow a double connect. + */ + + if(!ipv6_addr_any(&np->daddr)) + return -EINVAL; + + if (addr_len < sizeof(struct sockaddr_in6)) + return(-EINVAL); + + if (usin->sin6_family && usin->sin6_family != AF_INET6) + return(-EAFNOSUPPORT); + + /* + * connect() to INADDR_ANY means loopback (BSD'ism). + */ + + if(ipv6_addr_any(&usin->sin6_addr)) + usin->sin6_addr.s6_addr[15] = 0x1; + + addr_type = ipv6_addr_type(&usin->sin6_addr); + + if(addr_type & IPV6_ADDR_MULTICAST) + { + return -ENETUNREACH; + } + + /* + * connect to self not allowed + */ + + if (ipv6_addr_cmp(&usin->sin6_addr, &np->saddr) == 0 && + usin->sin6_port == sk->dummy_th.source) + { + return (-EINVAL); + } + + memcpy(&np->daddr, &usin->sin6_addr, sizeof(struct in6_addr)); + + /* + * TCP over IPv4 + */ + + if (addr_type == IPV6_ADDR_MAPPED) + { + struct sockaddr_in sin; + int err; + + printk(KERN_DEBUG "connect: ipv4 mapped\n"); + + sin.sin_family = AF_INET; + sin.sin_port = usin->sin6_port; + sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; + + sk->tp_pinfo.af_tcp.af_specific = &ipv6_mapped; + sk->backlog_rcv = tcp_v4_backlog_rcv; + + err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); + + if (err) + { + sk->tp_pinfo.af_tcp.af_specific = &ipv6_specific; + sk->backlog_rcv = tcp_v6_backlog_rcv; + } + + return err; + } + + dc = ipv6_dst_route(&np->daddr, NULL, (sk->localroute ? RTI_GATEWAY : 0)); + + if (dc == NULL) + { + return -ENETUNREACH; + } + + np->dest = dc; + np->dc_sernum = (dc->rt.fib_node ? 
dc->rt.fib_node->fn_sernum : 0); + + ifa = ipv6_get_saddr((struct rt6_info *)dc, &np->daddr); + + if (ifa == NULL) + { + return -ENETUNREACH; + } + + + /* + * Init variables + */ + + lock_sock(sk); + + sk->dummy_th.dest = usin->sin6_port; + sk->write_seq = secure_tcp_sequence_number(np->saddr.s6_addr32[3], + np->daddr.s6_addr32[3], + sk->dummy_th.source, + sk->dummy_th.dest); + + tp->snd_wnd = 0; + tp->snd_wl1 = 0; + tp->snd_wl2 = sk->write_seq; + tp->snd_una = sk->write_seq; + + tp->rcv_nxt = 0; + + sk->err = 0; + + release_sock(sk); + + buff = sock_wmalloc(sk, MAX_SYN_SIZE, 0, GFP_KERNEL); + + if (buff == NULL) + { + return(-ENOMEM); + } + lock_sock(sk); + buff->sk = sk; + buff->free = 0; + buff->localroute = sk->localroute; + + tmp = tcp_v6_build_header(sk, buff); + + /* set the source address */ + + memcpy(&np->saddr, &ifa->addr, sizeof(struct in6_addr)); + memcpy(&np->rcv_saddr, &ifa->addr, sizeof(struct in6_addr)); + + /* build the tcp header */ + th = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr)); + buff->h.th = th; + + memcpy(th, (void *) &(sk->dummy_th), sizeof(*th)); + buff->seq = sk->write_seq++; + th->seq = htonl(buff->seq); + tp->snd_nxt = sk->write_seq; + buff->end_seq = sk->write_seq; + th->ack = 0; + th->window = 2; + th->syn = 1; + th->doff = 6; + + sk->window_clamp=0; + + if ((dc->dc_flags & DCF_PMTU)) + sk->mtu = dc->dc_pmtu; + else + sk->mtu = dc->rt.rt_dev->mtu; + + sk->mss = sk->mtu - sizeof(struct ipv6hdr) - sizeof(struct tcphdr); + + /* + * Put in the TCP options to say MTU. 
+ */ + + ptr = skb_put(buff,4); + ptr[0] = 2; + ptr[1] = 4; + ptr[2] = (sk->mss) >> 8; + ptr[3] = (sk->mss) & 0xff; + buff->csum = csum_partial(ptr, 4, 0); + + tcp_v6_send_check(sk, th, sizeof(struct tcphdr) + 4, buff); + + tcp_set_state(sk, TCP_SYN_SENT); + + /* FIXME: should use dcache->rtt if availiable */ + tp->rto = TCP_TIMEOUT_INIT; + + tcp_init_xmit_timers(sk); + + sk->retransmits = 0; + + skb_queue_tail(&sk->write_queue, buff); + sk->packets_out++; + buff->when = jiffies; + skb1 = skb_clone(buff, GFP_KERNEL); + sk->wmem_alloc += skb1->truesize; + + tmp = ipv6_xmit(sk, skb1, &np->saddr, &np->daddr, NULL, IPPROTO_TCP); + + /* Timer for repeating the SYN until an answer */ + + tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); + tcp_statistics.TcpActiveOpens++; + tcp_statistics.TcpOutSegs++; + + release_sock(sk); + + return(tmp); +} + +static int tcp_v6_sendmsg(struct sock *sk, struct msghdr *msg, + int len, int nonblock, int flags) +{ + struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; + int retval = -EINVAL; + + /* + * Do sanity checking for sendmsg/sendto/send + */ + + if (flags & ~(MSG_OOB|MSG_DONTROUTE)) + goto out; + if (msg->msg_name) { + struct sockaddr_in6 *addr=(struct sockaddr_in6 *)msg->msg_name; + + if (msg->msg_namelen < sizeof(*addr)) + goto out; + + if (addr->sin6_family && addr->sin6_family != AF_INET6) + goto out; + retval = -ENOTCONN; + + if(sk->state == TCP_CLOSE) + goto out; + retval = -EISCONN; + if (addr->sin6_port != sk->dummy_th.dest) + goto out; + if (ipv6_addr_cmp(&addr->sin6_addr, &np->daddr)) + goto out; + } + + lock_sock(sk); + retval = tcp_do_sendmsg(sk, msg->msg_iovlen, msg->msg_iov, + len, nonblock, flags); + + release_sock(sk); + +out: + return retval; +} + +void tcp_v6_err(int type, int code, unsigned char *header, __u32 info, + struct in6_addr *saddr, struct in6_addr *daddr, + struct inet6_protocol *protocol) +{ + struct tcphdr *th = (struct tcphdr *)header; + struct ipv6_pinfo *np; + struct sock *sk; + int err; + int opening; 
+ + sk = inet6_get_sock(&tcpv6_prot, daddr, saddr, th->source, th->dest); + + if (sk == NULL) + { + return; + } + + np = &sk->net_pinfo.af_inet6; + + if (type == ICMPV6_PKT_TOOBIG) + { + /* icmp should have updated the destination cache entry */ + + np->dest = ipv6_dst_check(np->dest, &np->daddr, np->dc_sernum, + 0); + + np->dc_sernum = (np->dest->rt.fib_node ? + np->dest->rt.fib_node->fn_sernum : 0); + + if (np->dest->dc_flags & DCF_PMTU) + sk->mtu = np->dest->dc_pmtu; + + sk->mtu = (sk->mtu - sizeof(struct ipv6hdr) - + sizeof(struct tcphdr)); + + return; + } + + opening = (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV); + + if (icmpv6_err_convert(type, code, &err) || opening) + { + sk->err = err; + if (opening) + { + tcp_statistics.TcpAttemptFails++; + tcp_set_state(sk,TCP_CLOSE); + sk->error_report(sk); + } + } + else + sk->err_soft = err; +} + + +static void tcp_v6_send_synack(struct sock *sk, struct open_request *req) +{ + struct tcp_v6_open_req *af_req = (struct tcp_v6_open_req *) req; + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct sk_buff * skb; + struct tcphdr *th; + unsigned char *ptr; + struct dest_entry *dc; + int mss; + + skb = sock_wmalloc(sk, MAX_SYN_SIZE, 1, GFP_ATOMIC); + + if (skb == NULL) + { + return; + } + + skb_reserve(skb, (MAX_HEADER + 15) & ~15); + skb->ipv6_hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr)); + + dc = ipv6_dst_route(&af_req->rmt_addr, af_req->dev, 0); + + skb->dev = af_req->dev; + + if (dc) + { + if (dc->dc_flags & DCF_PMTU) + mss = dc->dc_pmtu; + else + mss = dc->dc_nexthop->dev->mtu; + mss -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); + + ipv6_dst_unlock(dc); + } + else + mss = 516; + + th =(struct tcphdr *) skb_put(skb, sizeof(struct tcphdr)); + skb->h.th = th; + memset(th, 0, sizeof(struct tcphdr)); + + th->syn = 1; + th->ack = 1; + + th->source = sk->dummy_th.source; + th->dest = req->rmt_port; + + skb->seq = req->snt_isn; + skb->end_seq = skb->seq + 1; + + th->seq = ntohl(skb->seq); + 
th->ack_seq = htonl(req->rcv_isn + 1); + th->doff = sizeof(*th)/4 + 1; + + th->window = ntohs(tp->rcv_wnd); + + ptr = skb_put(skb, TCPOLEN_MSS); + ptr[0] = TCPOPT_MSS; + ptr[1] = TCPOLEN_MSS; + ptr[2] = (mss >> 8) & 0xff; + ptr[3] = mss & 0xff; + skb->csum = csum_partial(ptr, TCPOLEN_MSS, 0); + + th->check = tcp_v6_check(th, sizeof(*th) + TCPOLEN_MSS, &af_req->loc_addr, + &af_req->rmt_addr, + csum_partial((char *)th, sizeof(*th), skb->csum)); + + ipv6_xmit(sk, skb, &af_req->loc_addr, &af_req->rmt_addr, af_req->opt, + IPPROTO_TCP); + + tcp_statistics.TcpOutSegs++; + +} + +static void tcp_v6_or_free(struct open_request *req) +{ +} + +static struct or_calltable or_ipv6 = { + tcp_v6_send_synack, + tcp_v6_or_free +}; + +static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, + __u32 isn) +{ + struct tcp_v6_open_req *af_req; + struct open_request *req; + + /* If the socket is dead, don't accept the connection. */ + if (sk->dead) + { + if(sk->debug) + { + printk("Reset on %p: Connect on dead socket.\n",sk); + } + tcp_statistics.TcpAttemptFails++; + return -ENOTCONN; + } + + if (skb->protocol == __constant_htons(ETH_P_IP)) + { + return tcp_v4_conn_request(sk, skb, ptr, isn); + } + + /* + * There are no SYN attacks on IPv6, yet... 
+ */ + if (sk->ack_backlog >= sk->max_ack_backlog) + { + printk(KERN_DEBUG "droping syn ack:%d max:%d\n", + sk->ack_backlog, sk->max_ack_backlog); + tcp_statistics.TcpAttemptFails++; + goto exit; + } + + af_req = kmalloc(sizeof(struct tcp_v6_open_req), GFP_ATOMIC); + + if (af_req == NULL) + { + tcp_statistics.TcpAttemptFails++; + goto exit; + } + + sk->ack_backlog++; + req = (struct open_request *) af_req; + + memset(af_req, 0, sizeof(struct tcp_v6_open_req)); + + req->rcv_isn = skb->seq; + req->snt_isn = isn; + + /* mss */ + req->mss = tcp_parse_options(skb->h.th); + + if (!req->mss) + { + req->mss = 536; + } + + req->rmt_port = skb->h.th->source; + + ipv6_addr_copy(&af_req->rmt_addr, &skb->ipv6_hdr->saddr); + ipv6_addr_copy(&af_req->loc_addr, &skb->ipv6_hdr->daddr); + + /* FIXME: options */ + + /* keep incoming device so that link locals have meaning */ + af_req->dev = skb->dev; + + req->class = &or_ipv6; + + tcp_v6_send_synack(sk, req); + + req->expires = jiffies + TCP_TIMEOUT_INIT; + tcp_inc_slow_timer(TCP_SLT_SYNACK); + tcp_synq_queue(&sk->tp_pinfo.af_tcp, req); + + sk->data_ready(sk, 0); + + exit: + kfree_skb(skb, FREE_READ); + return 0; +} + +static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, + struct sk_buff *skb) +{ + struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; + th->check = 0; + + th->check = csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, + csum_partial((char *)th, sizeof(*th), + skb->csum)); +} + +static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, + struct open_request *req) +{ + struct tcp_v6_open_req *af_req = (struct tcp_v6_open_req *) req; + struct ipv6_pinfo *np; + struct dest_entry *dc; + struct tcp_opt *newtp; + struct sock *newsk; + + + if (skb->protocol == __constant_htons(ETH_P_IP)) + { + /* + * v6 mapped + */ + + newsk = tcp_v4_syn_recv_sock(sk, skb, req); + + if (newsk == NULL) + return NULL; + + np = &newsk->net_pinfo.af_inet6; + + ipv6_addr_set(&np->daddr, 0, 0, 
__constant_htonl(0x0000FFFF), + newsk->daddr); + + ipv6_addr_set(&np->saddr, 0, 0, __constant_htonl(0x0000FFFF), + newsk->saddr); + + ipv6_addr_copy(&np->rcv_saddr, &np->saddr); + + newsk->tp_pinfo.af_tcp.af_specific = &ipv6_mapped; + newsk->backlog_rcv = tcp_v4_backlog_rcv; + + return newsk; + } + + newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC); + if (newsk == NULL) + { + return NULL; + } + + memcpy(newsk, sk, sizeof(*newsk)); + newsk->opt = NULL; + newsk->ip_route_cache = NULL; + skb_queue_head_init(&newsk->write_queue); + skb_queue_head_init(&newsk->receive_queue); + skb_queue_head_init(&newsk->out_of_order_queue); + + /* + * Unused + */ + + newsk->send_head = NULL; + newsk->send_tail = NULL; + + newtp = &(newsk->tp_pinfo.af_tcp); + np = &newsk->net_pinfo.af_inet6; + + newtp->send_head = NULL; + newtp->retrans_head = NULL; + + newtp->pending = 0; + + skb_queue_head_init(&newsk->back_log); + + newsk->prot->init(newsk); + + newsk->cong_count = 0; + newsk->ssthresh = 0; + newtp->backoff = 0; + newsk->blog = 0; + newsk->intr = 0; + newsk->proc = 0; + newsk->done = 0; + newsk->partial = NULL; + newsk->pair = NULL; + newsk->wmem_alloc = 0; + newsk->rmem_alloc = 0; + newsk->localroute = sk->localroute; + + newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF; + + newsk->err = 0; + newsk->shutdown = 0; + newsk->ack_backlog = 0; + + newsk->fin_seq = req->rcv_isn; + newsk->syn_seq = req->rcv_isn; + newsk->state = TCP_SYN_RECV; + newsk->timeout = 0; + newsk->ip_xmit_timeout = 0; + + newsk->write_seq = req->snt_isn; + + newtp->snd_wnd = ntohs(skb->h.th->window); + newsk->max_window = newtp->snd_wnd; + newtp->snd_wl1 = req->rcv_isn; + newtp->snd_wl2 = newsk->write_seq; + newtp->snd_una = newsk->write_seq++; + newtp->snd_nxt = newsk->write_seq; + + newsk->urg_data = 0; + newsk->packets_out = 0; + newsk->retransmits = 0; + newsk->linger=0; + newsk->destroy = 0; + init_timer(&newsk->timer); + newsk->timer.data = (unsigned long) newsk; + newsk->timer.function = 
&net_timer; + + tcp_init_xmit_timers(newsk); + + newsk->dummy_th.source = sk->dummy_th.source; + newsk->dummy_th.dest = req->rmt_port; + + newtp->rcv_nxt = req->rcv_isn + 1; + newtp->rcv_wup = req->rcv_isn + 1; + newsk->copied_seq = req->rcv_isn + 1; + + newsk->socket = NULL; + + ipv6_addr_copy(&np->daddr, &af_req->rmt_addr); + ipv6_addr_copy(&np->saddr, &af_req->loc_addr); + ipv6_addr_copy(&np->rcv_saddr, &af_req->loc_addr); + + /* + * options / mss + */ + + dc = ipv6_dst_route(&af_req->rmt_addr, af_req->dev, 0); + np->dest = dc; + + if (np->dest && (np->dest->dc_flags & DCF_PMTU)) + newsk->mtu = np->dest->dc_pmtu; + else + newsk->mtu = af_req->dev->mtu; + + newsk->mss = min(req->mss, (newsk->mtu - sizeof(struct ipv6hdr) - + sizeof(struct tcphdr))); + + newsk->daddr = LOOPBACK4_IPV6; + newsk->saddr = LOOPBACK4_IPV6; + newsk->rcv_saddr= LOOPBACK4_IPV6; + + inet_put_sock(newsk->num, newsk); + + return newsk; + +} + +static void tcp_v6_send_reset(struct in6_addr *saddr, struct in6_addr *daddr, + struct tcphdr *th, struct proto *prot, + struct ipv6_options *opt, + struct device *dev, int pri, int hop_limit) +{ + struct sk_buff *buff; + struct tcphdr *t1; + + if(th->rst) + return; + + /* + * We need to grab some memory, and put together an RST, + * and then put it into the queue to be sent. + */ + + buff = alloc_skb(MAX_RESET_SIZE, GFP_ATOMIC); + if (buff == NULL) + return; + + buff->sk = NULL; + buff->dev = dev; + buff->localroute = 0; + + tcp_v6_build_header(NULL, buff); + + t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr)); + memset(t1, 0, sizeof(*t1)); + + /* + * Swap the send and the receive. 
+ */ + + t1->dest = th->source; + t1->source = th->dest; + t1->doff = sizeof(*t1)/4; + t1->rst = 1; + + if(th->ack) + { + t1->seq = th->ack_seq; + } + else + { + t1->ack = 1; + if(!th->syn) + t1->ack_seq = th->seq; + else + t1->ack_seq = htonl(ntohl(th->seq)+1); + } + + buff->csum = csum_partial((char *)t1, sizeof(*t1), 0); + + t1->check = csum_ipv6_magic(saddr, daddr, sizeof(*t1), IPPROTO_TCP, + buff->csum); + + + ipv6_xmit(NULL, buff, saddr, daddr, NULL, IPPROTO_TCP); + + tcp_statistics.TcpOutSegs++; +} + +struct sock *tcp_v6_check_req(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct open_request *req; + + + /* + * assumption: the socket is not in use. + * as we checked the user count on tcp_rcv and we're + * running from a soft interrupt. + */ + + req = tp->syn_wait_queue; + + + if (!req) + { + return sk; + } + + do { + struct tcp_v6_open_req *af_req; + + af_req = (struct tcp_v6_open_req *) req; + + if (!ipv6_addr_cmp(&af_req->rmt_addr, &skb->ipv6_hdr->saddr) && + !ipv6_addr_cmp(&af_req->loc_addr, &skb->ipv6_hdr->daddr) && + req->rmt_port == skb->h.th->source) + { + u32 flg; + + if (req->sk) + { + printk(KERN_DEBUG "BUG: syn_recv:" + "socket exists\n"); + break; + } + + /* match */ + + /* + * Check for syn retransmission + */ + flg = *(((u32 *)skb->h.th) + 3); + flg &= __constant_htonl(0x002f0000); + + if ((flg == __constant_htonl(0x00020000)) && + (!after(skb->seq, req->rcv_isn))) + { + /* + * retransmited syn + * FIXME: must send an ack + */ + return NULL; + } + + atomic_sub(skb->truesize, &sk->rmem_alloc); + sk = tp->af_specific->syn_recv_sock(sk, skb, req); + + tcp_dec_slow_timer(TCP_SLT_SYNACK); + + if (sk == NULL) + { + return NULL; + } + + atomic_add(skb->truesize, &sk->rmem_alloc); + req->expires = 0UL; + req->sk = sk; + skb->sk = sk; + break; + } + + req = req->dl_next; + } while (req != tp->syn_wait_queue); + + + return sk; + +} + +int tcp_v6_rcv(struct sk_buff *skb, struct device *dev, + struct in6_addr 
*saddr, struct in6_addr *daddr, + struct ipv6_options *opt, unsigned short len, + int redo, struct inet6_protocol *protocol) +{ + struct tcphdr *th; + struct sock *sk; + + /* + * "redo" is 1 if we have already seen this skb but couldn't + * use it at that time (the socket was locked). In that case + * we have already done a lot of the work (looked up the socket + * etc). + */ + + th = skb->h.th; + + sk = skb->sk; + + if (!redo) + { + + if (skb->pkt_type != PACKET_HOST) + goto discard_it; + + /* + * Pull up the IP header. + */ + + skb_pull(skb, skb->h.raw - skb->data); + + /* + * Try to use the device checksum if provided. + */ + + switch (skb->ip_summed) + { + case CHECKSUM_NONE: + skb->csum = csum_partial((char *)th, len, 0); + case CHECKSUM_HW: + if (tcp_v6_check(th,len,saddr,daddr,skb->csum)) + { + printk(KERN_DEBUG "tcp csum failed\n"); + goto discard_it; + } + default: + /* CHECKSUM_UNNECESSARY */ + } + + sk = inet6_get_sock(&tcpv6_prot, daddr, saddr, + th->dest, th->source); + + if (!sk) + { + printk(KERN_DEBUG "socket not found\n"); + goto no_tcp_socket; + } + + skb->sk = sk; + skb->seq = ntohl(th->seq); + skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4; + skb->ack_seq = ntohl(th->ack_seq); + + skb->acked = 0; + skb->used = 0; + skb->free = 1; + } + + /* + * We may need to add it to the backlog here. 
+ */ + + if (sk->users) + { + __skb_queue_tail(&sk->back_log, skb); + return(0); + } + + /* + * Signal NDISC that the connection is making + * "forward progress" + */ + if (sk->state != TCP_LISTEN) + { + struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; + struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp); + + if (after(skb->seq, tp->rcv_nxt) || + after(skb->ack_seq, tp->snd_una)) + { + if (np->dest) + ndisc_validate(np->dest->dc_nexthop); + } + } + + if (!sk->prot) + { + printk(KERN_DEBUG "tcp_rcv: sk->prot == NULL\n"); + return(0); + } + + atomic_add(skb->truesize, &sk->rmem_alloc); + + if (sk->state == TCP_ESTABLISHED) + { + tcp_rcv_established(sk, skb, th, len); + return 0; + } + + if (sk->state == TCP_LISTEN) + { + /* + * find possible connection requests + */ + sk = tcp_v6_check_req(sk, skb); + + if (sk == NULL) + { + goto discard_it; + } + + } + + if (tcp_rcv_state_process(sk, skb, th, opt, len) == 0) + return 0; + +no_tcp_socket: + + /* + * No such TCB. If th->rst is 0 send a reset + * (checked in tcp_send_reset) + */ + + tcp_v6_send_reset(daddr, saddr, th, &tcpv6_prot, opt, dev, + skb->ipv6_hdr->priority, 255); + +discard_it: + + /* + * Discard frame + */ + + kfree_skb(skb, FREE_READ); + return 0; + +} + +static int tcp_v6_rebuild_header(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; + + if (np->dest) + { + np->dest = ipv6_dst_check(np->dest, &np->daddr, + np->dc_sernum, 0); + + } + else + { + np->dest = ipv6_dst_route(&np->daddr, NULL, 0); + } + + if (!np->dest) + { + /* + * lost route to destination + */ + return -1; + } + + np->dc_sernum = (np->dest->rt.fib_node ? 
+ np->dest->rt.fib_node->fn_sernum : 0); + + ipv6_redo_mac_hdr(skb, np->dest->dc_nexthop, + skb->tail - (u8*) skb->ipv6_hdr); + return 0; +} + +static int tcp_v6_backlog_rcv(struct sock *sk, struct sk_buff *skb) +{ + int res; + + res = tcp_v6_rcv(skb, skb->dev, + &skb->ipv6_hdr->saddr, &skb->ipv6_hdr->daddr, + (struct ipv6_options *) skb->proto_priv, + skb->len, 1, + (struct inet6_protocol *) sk->pair); + return res; +} + +static struct sock * tcp_v6_get_sock(struct sk_buff *skb, struct tcphdr *th) +{ + struct in6_addr *saddr; + struct in6_addr *daddr; + struct sock *sk; + + saddr = &skb->ipv6_hdr->saddr; + daddr = &skb->ipv6_hdr->daddr; + + sk = inet6_get_sock(&tcpv6_prot, daddr, saddr, th->source, th->dest); + + return sk; +} + +static int tcp_v6_build_header(struct sock *sk, struct sk_buff *skb) +{ + skb_reserve(skb, (MAX_HEADER + 15) & ~15); + skb->ipv6_hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr)); + + /* + * FIXME: reserve space for option headers + * length member of np->opt + */ + + return 0; +} + +static void tcp_v6_xmit(struct sock *sk, struct device *dev, struct sk_buff *skb, + int free) +{ + struct ipv6_pinfo * np = &sk->net_pinfo.af_inet6; + int err; + + err = ipv6_xmit(sk, skb, &np->saddr, &np->daddr, NULL, IPPROTO_TCP); + + /* + * FIXME: check error handling. 
+ */ + + sk->err_soft = err; +} + + + +static void v6_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) +{ + struct ipv6_pinfo * np = &sk->net_pinfo.af_inet6; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; + + sin6->sin6_family = AF_INET6; + memcpy(&sin6->sin6_addr, &np->daddr, sizeof(struct in6_addr)); + sin6->sin6_port = sk->dummy_th.dest; + +} + +static struct tcp_func ipv6_specific = { + tcp_v6_build_header, + tcp_v6_xmit, + tcp_v6_send_check, + tcp_v6_rebuild_header, + tcp_v6_conn_request, + tcp_v6_syn_recv_sock, + tcp_v6_init_sequence, + tcp_v6_get_sock, + ipv6_setsockopt, + ipv6_getsockopt, + v6_addr2sockaddr, + sizeof(struct sockaddr_in6) +}; + +/* + * TCP over IPv4 via INET6 API + */ + +static struct tcp_func ipv6_mapped = { + tcp_v4_build_header, + ip_queue_xmit, + tcp_v4_send_check, + tcp_v4_rebuild_header, + tcp_v6_conn_request, + tcp_v6_syn_recv_sock, + tcp_v6_init_sequence, + tcp_v6_get_sock, + ipv6_setsockopt, + ipv6_getsockopt, + v6_addr2sockaddr, + sizeof(struct sockaddr_in6) +}; + +static int tcp_v6_init_sock(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + skb_queue_head_init(&sk->out_of_order_queue); + tcp_init_xmit_timers(sk); + + tp->srtt = 0; + tp->rto = TCP_TIMEOUT_INIT; /*TCP_WRITE_TIME*/ + tp->mdev = TCP_TIMEOUT_INIT; + + tp->ato = 0; + tp->iat = (HZ/5) << 3; + + tp->rcv_wnd = 8192; + + /* start with only sending one packet at a time. */ + sk->cong_window = 1; + sk->ssthresh = 0x7fffffff; + + sk->priority = 1; + sk->state = TCP_CLOSE; + + /* this is how many unacked bytes we will accept for this socket. */ + sk->max_unacked = 2048; /* needs to be at most 2 full packets. 
*/ + sk->max_ack_backlog = SOMAXCONN; + + sk->mtu = 576; + sk->mss = 516; + + sk->dummy_th.doff = sizeof(sk->dummy_th)/4; + + + /* + * Speed up by setting some standard state for the dummy_th + * if TCP uses it (maybe move to tcp_init later) + */ + + sk->dummy_th.ack=1; + sk->dummy_th.doff=sizeof(struct tcphdr)>>2; + + sk->tp_pinfo.af_tcp.af_specific = &ipv6_specific; + + return 0; +} + +static int tcp_v6_destroy_sock(struct sock *sk) +{ + struct ipv6_pinfo * np = &sk->net_pinfo.af_inet6; + struct sk_buff *skb; + + tcp_clear_xmit_timers(sk); + + if (sk->keepopen) + { + tcp_dec_slow_timer(TCP_SLT_KEEPALIVE); + } + + /* + * Cleanup up the write buffer. + */ + + while((skb = skb_dequeue(&sk->write_queue)) != NULL) { + IS_SKB(skb); + skb->free = 1; + kfree_skb(skb, FREE_WRITE); + } + + /* + * Cleans up our, hopefuly empty, out_of_order_queue + */ + + while((skb = skb_dequeue(&sk->out_of_order_queue)) != NULL) { + IS_SKB(skb); + kfree_skb(skb, FREE_READ); + } + + /* + * Release destination entry + */ + + if (np->dest) + { + ipv6_dst_unlock(np->dest); + } + + return 0; +} + + +struct proto tcpv6_prot = { + tcp_close, + tcp_v6_connect, + tcp_accept, + NULL, + tcp_write_wakeup, + tcp_read_wakeup, + tcp_select, + tcp_ioctl, + tcp_v6_init_sock, + tcp_v6_destroy_sock, + tcp_shutdown, + tcp_setsockopt, + tcp_getsockopt, + tcp_v6_sendmsg, + tcp_recvmsg, + NULL, /* No special bind() */ + tcp_v6_backlog_rcv, + 128, + 0, + "TCPv6", + 0, 0, + NULL +}; + +static struct inet6_protocol tcpv6_protocol = +{ + tcp_v6_rcv, /* TCP handler */ + tcp_v6_err, /* TCP error control */ + NULL, /* next */ + IPPROTO_TCP, /* protocol ID */ + 0, /* copy */ + NULL, /* data */ + "TCPv6" /* name */ +}; + + +void tcpv6_init(void) +{ + /* register inet6 protocol */ + inet6_add_protocol(&tcpv6_protocol); +} + +/* + * Local variables: + * compile-command: "gcc -D__KERNEL__ -I/usr/src/linux/include -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer -fno-strength-reduce -pipe -m486 -DCPU=486 -DMODULE 
-DMODVERSIONS -include /usr/src/linux/include/linux/modversions.h -c -o tcp_ipv6.o tcp_ipv6.c" + * c-file-style: "Linux" + * End: + */ diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c new file mode 100644 index 000000000..380122210 --- /dev/null +++ b/net/ipv6/udp.c @@ -0,0 +1,634 @@ +/* + * UDP over IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * Based on linux/ipv4/udp.c + * + * $Id: udp.c,v 1.6 1996/10/16 18:34:16 roque Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/ipv6.h> +#include <linux/icmpv6.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/ndisc.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/ipv6_route.h> +#include <net/addrconf.h> +#include <net/ip.h> +#include <net/udp.h> + +#include <net/checksum.h> + +struct udp_mib udp_stats_in6; + +/* + * + */ + +int udpv6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; + struct in6_addr *daddr; + struct dest_entry *dest; + struct ipv6_pinfo *np; + struct inet6_ifaddr *ifa; + int addr_type; + + if (addr_len < sizeof(*usin)) + return(-EINVAL); + + if (usin->sin6_family && usin->sin6_family != AF_INET6) + return(-EAFNOSUPPORT); + + addr_type = ipv6_addr_type(&usin->sin6_addr); + np = &sk->net_pinfo.af_inet6; + + if (addr_type == IPV6_ADDR_ANY) + { + /* + * connect to self + */ + usin->sin6_addr.s6_addr[15] = 0x01; + } + + daddr = &usin->sin6_addr; + + if 
(addr_type == IPV6_ADDR_MAPPED) + { + struct sockaddr_in sin; + int err; + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = daddr->s6_addr32[3]; + + err = udp_connect(sk, (struct sockaddr*) &sin, sizeof(sin)); + + if (err < 0) + { + return err; + } + + ipv6_addr_copy(&np->daddr, daddr); + + if(ipv6_addr_any(&np->saddr)) + { + ipv6_addr_set(&np->saddr, 0, 0, + __constant_htonl(0x0000ffff), + sk->saddr); + + } + + if(ipv6_addr_any(&np->rcv_saddr)) + { + ipv6_addr_set(&np->rcv_saddr, 0, 0, + __constant_htonl(0x0000ffff), + sk->rcv_saddr); + } + + } + + ipv6_addr_copy(&np->daddr, daddr); + + /* + * Check for a route to destination an obtain the + * destination cache for it. + */ + + dest = ipv6_dst_route(daddr, NULL, sk->localroute ? RTI_GATEWAY : 0); + + np->dest = dest; + + if (dest == NULL) + return -ENETUNREACH; + + /* get the source adddress used in the apropriate device */ + + ifa = ipv6_get_saddr((struct rt6_info *) dest, daddr); + + if(ipv6_addr_any(&np->saddr)) + { + ipv6_addr_copy(&np->saddr, &ifa->addr); + } + + if(ipv6_addr_any(&np->rcv_saddr)) + { + ipv6_addr_copy(&np->rcv_saddr, &ifa->addr); + sk->rcv_saddr = 0xffffffff; + } + + sk->dummy_th.dest = usin->sin6_port; + + sk->state = TCP_ESTABLISHED; + + return(0); +} + +static void udpv6_close(struct sock *sk, unsigned long timeout) +{ + struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; + + lock_sock(sk); + sk->state = TCP_CLOSE; + + if (np->dest) + { + ipv6_dst_unlock(np->dest); + } + + release_sock(sk); + destroy_sock(sk); +} + +/* + * This should be easy, if there is something there we + * return it, otherwise we block. + */ + +int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, + int noblock, int flags, int *addr_len) +{ + int copied = 0; + int truesize; + struct sk_buff *skb; + int err; + + + /* + * Check any passed addresses + */ + + if (addr_len) + *addr_len=sizeof(struct sockaddr_in6); + + /* + * From here the generic datagram does a lot of the work. 
Come + * the finished NET3, it will do _ALL_ the work! + */ + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if(skb==NULL) + return err; + + truesize = skb->tail - skb->h.raw - sizeof(struct udphdr); + + copied=truesize; + if(copied>len) + { + copied=len; + msg->msg_flags|=MSG_TRUNC; + } + + /* + * FIXME : should use udp header size info value + */ + + err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), + msg->msg_iov, copied); + if (err) + return err; + + sk->stamp=skb->stamp; + + /* Copy the address. */ + if (msg->msg_name) + { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *) msg->msg_name; + + sin6->sin6_family = AF_INET6; + sin6->sin6_port = skb->h.uh->source; + + if (skb->protocol == __constant_htons(ETH_P_IP)) + { + ipv6_addr_set(&sin6->sin6_addr, 0, 0, + __constant_htonl(0xffff), skb->daddr); + } + else + { + memcpy(&sin6->sin6_addr, &skb->ipv6_hdr->saddr, + sizeof(struct in6_addr)); + + if (msg->msg_control) + { + int err; + + err = datagram_recv_ctl(sk, msg, skb); + + if (err < 0) + { + copied = err; + } + } + } + } + + skb_free_datagram(sk, skb); + return(copied); +} + +void udpv6_err(int type, int code, unsigned char *buff, __u32 info, + struct in6_addr *saddr, struct in6_addr *daddr, + struct inet6_protocol *protocol) +{ + struct sock *sk; + struct udphdr *uh; + int err; + + uh = (struct udphdr *) buff; + + sk = inet6_get_sock(&udpv6_prot, daddr, saddr, uh->source, uh->dest); + + if (sk == NULL) + { + printk(KERN_DEBUG "icmp for unkown sock\n"); + return; + } + + if (icmpv6_err_convert(type, code, &err)) + { + if(sk->bsdism && sk->state!=TCP_ESTABLISHED) + return; + + sk->err = err; + sk->error_report(sk); + } + else + sk->err_soft = err; +} + +static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) +{ + + if (sock_queue_rcv_skb(sk,skb)<0) { + udp_stats_in6.UdpInErrors++; + ipv6_statistics.Ip6InDiscards++; + ipv6_statistics.Ip6InDelivers--; + skb->sk = NULL; + kfree_skb(skb, FREE_WRITE); + return 0; + } 
+ udp_stats_in6.UdpInDatagrams++; + return 0; +} + +int udpv6_rcv(struct sk_buff *skb, struct device *dev, + struct in6_addr *saddr, struct in6_addr *daddr, + struct ipv6_options *opt, unsigned short len, + int redo, struct inet6_protocol *protocol) +{ + struct sock *sk; + struct udphdr *uh; + int ulen; + + /* + * check if the address is ours... + * I believe that this is being done in IP layer + */ + + uh = (struct udphdr *) skb->h.uh; + + ipv6_statistics.Ip6InDelivers++; + + ulen = ntohs(uh->len); + + if (ulen > len || len < sizeof(*uh)) + { + printk(KERN_DEBUG "UDP: short packet: %d/%d\n", ulen, len); + udp_stats_in6.UdpInErrors++; + kfree_skb(skb, FREE_READ); + return(0); + } + + if (uh->check == 0) + { + printk(KERN_DEBUG "IPv6: udp checksum is 0\n"); + goto discard; + } + + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = csum_partial((char*)uh, len, 0); + case CHECKSUM_HW: + if (csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, skb->csum)) + { + printk(KERN_DEBUG "IPv6: udp checksum error\n"); + goto discard; + } + } + + len = ulen; + + /* + * Multicast receive code + */ + if (ipv6_addr_type(daddr) & IPV6_ADDR_MULTICAST) + { + struct sock *sk2; + int lport; + + lport = ntohs(uh->dest); + sk = udpv6_prot.sock_array[lport & (SOCK_ARRAY_SIZE-1)]; + + sk = inet6_get_sock_mcast(sk, lport, uh->source, + daddr, saddr); + + if (sk) + { + sk2 = sk; + + while ((sk2 = inet6_get_sock_mcast(sk2->next, lport, + uh->source, + daddr, saddr))) + { + struct sk_buff *buff; + + buff = skb_clone(skb, GFP_ATOMIC); + + if (sock_queue_rcv_skb(sk, buff) < 0) + { + buff->sk = NULL; + kfree_skb(buff, FREE_READ); + } + } + } + if (!sk || sock_queue_rcv_skb(sk, skb) < 0) + { + skb->sk = NULL; + kfree_skb(skb, FREE_READ); + } + return 0; + } + + /* Unicast */ + + /* + * check socket cache ... must talk to Alan about his plans + * for sock caches... i'll skip this for now. 
+ */ + + sk = inet6_get_sock(&udpv6_prot, daddr, saddr, uh->dest, uh->source); + + if (sk == NULL) + { + udp_stats_in6.UdpNoPorts++; + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, + 0, dev); + + kfree_skb(skb, FREE_READ); + return(0); + } + + /* deliver */ + + if (sk->users) + { + __skb_queue_tail(&sk->back_log, skb); + } + else + { + udpv6_queue_rcv_skb(sk, skb); + } + + return(0); + + discard: + udp_stats_in6.UdpInErrors++; + kfree_skb(skb, FREE_READ); + return(0); +} + +/* + * Sending + */ + +struct udpv6fakehdr +{ + struct udphdr uh; + struct iovec *iov; + __u32 wcheck; + __u32 pl_len; + struct in6_addr *daddr; +}; + +/* + * with checksum + */ + +static int udpv6_getfrag(const void *data, struct in6_addr *addr, + char *buff, unsigned int offset, unsigned int len) +{ + struct udpv6fakehdr *udh = (struct udpv6fakehdr *) data; + char *dst; + int final = 0; + int clen = len; + + dst = buff; + + if (offset) + { + offset -= sizeof(struct udphdr); + } + else + { + dst += sizeof(struct udphdr); + final = 1; + clen -= sizeof(struct udphdr); + } + + udh->wcheck = csum_partial_copy_fromiovecend(dst, udh->iov, offset, + clen, udh->wcheck); + + if (final) + { + struct in6_addr *daddr; + + udh->wcheck = csum_partial((char *)udh, sizeof(struct udphdr), + udh->wcheck); + + if (udh->daddr) + { + daddr = udh->daddr; + } + else + { + /* + * use packet destination address + * this should improve cache locality + */ + daddr = addr + 1; + } + udh->uh.check = csum_ipv6_magic(addr, daddr, + udh->pl_len, IPPROTO_UDP, + udh->wcheck); + if (udh->uh.check == 0) + udh->uh.check = -1; + + memcpy(buff, udh, sizeof(struct udphdr)); + } + return 0; +} + +static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen, + int noblock, int flags) +{ + + struct ipv6_options opt_space; + struct udpv6fakehdr udh; + struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) msg->msg_name; + struct ipv6_options *opt = NULL; + struct 
device *dev = NULL; + int addr_len = msg->msg_namelen; + struct in6_addr *daddr; + struct in6_addr *saddr = NULL; + int len = ulen + sizeof(struct udphdr); + int addr_type; + int err; + + + if (flags & ~MSG_DONTROUTE) + return(-EINVAL); + + if (sin6) + { + if (addr_len < sizeof(*sin6)) + return(-EINVAL); + + if (sin6->sin6_family && sin6->sin6_family != AF_INET6) + return(-EINVAL); + + if (sin6->sin6_port == 0) + return(-EINVAL); + + udh.uh.dest = sin6->sin6_port; + daddr = &sin6->sin6_addr; + + if (np->dest && ipv6_addr_cmp(daddr, &np->daddr)) + { + ipv6_dst_unlock(np->dest); + np->dest = NULL; + } + } + else + { + if (sk->state != TCP_ESTABLISHED) + return(-EINVAL); + + udh.uh.dest = sk->dummy_th.dest; + daddr = &sk->net_pinfo.af_inet6.daddr; + } + + addr_type = ipv6_addr_type(daddr); + + if (addr_type == IPV6_ADDR_MAPPED) + { + struct sockaddr_in sin; + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = daddr->s6_addr32[3]; + + return udp_sendmsg(sk, msg, len, noblock, flags); + } + + udh.daddr = NULL; + + if (msg->msg_control) + { + opt = &opt_space; + memset(opt, 0, sizeof(struct ipv6_options)); + + err = datagram_send_ctl(msg, &dev, &saddr, opt); + if (err < 0) + { + printk(KERN_DEBUG "invalid msg_control\n"); + return err; + } + + if (opt->srcrt) + { + udh.daddr = daddr; + } + } + + udh.uh.source = sk->dummy_th.source; + udh.uh.len = htons(len); + udh.uh.check = 0; + udh.iov = msg->msg_iov; + udh.wcheck = 0; + udh.pl_len = len; + + err = ipv6_build_xmit(sk, udpv6_getfrag, &udh, daddr, len, + saddr, dev, opt, IPPROTO_UDP, noblock); + + if (err < 0) + return err; + + udp_stats_in6.UdpOutDatagrams++; + return ulen; +} + +static struct inet6_protocol udpv6_protocol = +{ + udpv6_rcv, /* UDP handler */ + udpv6_err, /* UDP error control */ + NULL, /* next */ + IPPROTO_UDP, /* protocol ID */ + 0, /* copy */ + NULL, /* data */ + "UDPv6" /* name */ +}; + + +struct proto udpv6_prot = { + udpv6_close, + udpv6_connect, + NULL, + NULL, + NULL, + NULL, + datagram_select, 
+ udp_ioctl, + NULL, + NULL, + NULL, + ipv6_setsockopt, + ipv6_getsockopt, + udpv6_sendmsg, + udpv6_recvmsg, + NULL, /* No special bind function */ + udpv6_queue_rcv_skb, + 128, + 0, + "UDP", + 0, 0, + NULL +}; + +void udpv6_init(void) +{ + inet6_add_protocol(&udpv6_protocol); +} diff --git a/net/ipx/Makefile b/net/ipx/Makefile index 8d38f1686..c54f9436f 100644 --- a/net/ipx/Makefile +++ b/net/ipx/Makefile @@ -7,29 +7,11 @@ # # Note 2! The CFLAGS definition is now in the main makefile... -.c.o: - $(CC) $(CFLAGS) -c $< -.s.o: - $(AS) -o $*.o $< -.c.s: - $(CC) $(CFLAGS) -S $< +O_TARGET := ipx.o +O_OBJS := af_ipx.o sysctl_net_ipx.o +M_OBJS := $(O_TARGET) - -OBJS := af_ipx.o - - -ipx.o: $(OBJS) - $(LD) -r -o ipx.o $(OBJS) - -dep: - $(CPP) -M *.c > .depend +include $(TOPDIR)/Rules.make tar: tar -cvf /dev/f1 . - -# -# include a dependency file if one exists -# -ifeq (.depend,$(wildcard .depend)) -include .depend -endif diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 941be7224..82a85f685 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -33,18 +33,36 @@ * Asynchronous I/O support. * Changed to use notifiers and the newer packet_type stuff. * Assorted major fixes <Alejandro Liu> - * Revision 0.30: Moved to net/ipx/... + * Revision 0.30: Moved to net/ipx/... <Alan Cox> * Don't set address length on recvfrom that errors. * Incorrect verify_area. + * Revision 0.31: New sk_buffs. This still needs a lot of testing. <Alan Cox> + * Revision 0.32: Using sock_alloc_send_skb, firewall hooks. <Alan Cox> + * Supports sendmsg/recvmsg + * Revision 0.33: Internal network support, routing changes, uses a + * protocol private area for ipx data. + * Revision 0.34: Module support. <Jim Freeman> + * Revision 0.35: Checksum support. <Neil Turton>, hooked in by <Alan Cox> + * Handles WIN95 discovery packets <Volker Lendecke> + * Revision 0.36: Internal bump up for 2.1 + * Revision 0.37: Began adding POSIXisms. 
+ * + * Protect the module by a MOD_INC_USE_COUNT/MOD_DEC_USE_COUNT + * pair. Also, now usage count is managed this way + * -Count one if the auto_interface mode is on + * -Count one per configured interface + * + * Jacques Gelinas (jacques@solucorp.qc.ca) * - * TODO: use sock_alloc_send_skb to allocate sending buffers. Check with Caldera first * * Portions Copyright (c) 1995 Caldera, Inc. <greg@caldera.com> * Neither Greg Page nor Caldera, Inc. admit liability nor provide * warranty for any of this software. This material is provided * "AS-IS" and at no charge. */ - + +#include <linux/module.h> + #include <linux/config.h> #include <linux/errno.h> #include <linux/types.h> @@ -56,41 +74,56 @@ #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> -#include <linux/ipx.h> -#include <linux/inet.h> #include <linux/netdevice.h> -#include <linux/skbuff.h> +#include <net/ipx.h> +#include <linux/inet.h> +#include <linux/route.h> #include <net/sock.h> -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/termios.h> /* For TIOCOUTQ/INQ */ #include <linux/interrupt.h> #include <net/p8022.h> +#include <net/p8022tr.h> #include <net/psnap.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/firewall.h> + +#ifdef MODULE +static void ipx_proto_finito(void); +#endif /* def MODULE */ -#ifdef CONFIG_IPX /* Configuration Variables */ static unsigned char ipxcfg_max_hops = 16; -static char ipxcfg_auto_select_primary = 0; -static char ipxcfg_auto_create_interfaces = 0; +static char ipxcfg_auto_select_primary = 0; +static char ipxcfg_auto_create_interfaces = 0; /* Global Variables */ static struct datalink_proto *p8022_datalink = NULL; +static struct datalink_proto *p8022tr_datalink = NULL; static struct datalink_proto *pEII_datalink = NULL; static struct datalink_proto *p8023_datalink = NULL; static struct datalink_proto *pSNAP_datalink = NULL; -static 
ipx_interface *ipx_interfaces = NULL; static ipx_route *ipx_routes = NULL; -static ipx_interface *ipx_internal_net = NULL; +static ipx_interface *ipx_interfaces = NULL; static ipx_interface *ipx_primary_net = NULL; +static ipx_interface *ipx_internal_net = NULL; static int ipxcfg_set_auto_create(char val) { - ipxcfg_auto_create_interfaces = val; + if (ipxcfg_auto_create_interfaces != val){ + if (val){ + MOD_INC_USE_COUNT; + }else{ + MOD_DEC_USE_COUNT; + } + ipxcfg_auto_create_interfaces = val; + } return 0; } @@ -110,8 +143,7 @@ ipxcfg_get_config_data(ipx_config_data *arg) vals.ipxcfg_auto_create_interfaces = ipxcfg_auto_create_interfaces; vals.ipxcfg_auto_select_primary = ipxcfg_auto_select_primary; - memcpy_tofs(arg, &vals, sizeof(vals)); - return 0; + return copy_to_user(arg, &vals, sizeof(vals)) ? -EFAULT : 0; } @@ -138,7 +170,7 @@ ipx_remove_socket(ipx_socket *sk) cli(); /* Determine interface with which socket is associated */ - intrfc = sk->ipx_intrfc; + intrfc = sk->protinfo.af_ipx.intrfc; if (intrfc == NULL) { restore_flags(flags); return; @@ -179,7 +211,8 @@ ipx_destroy_socket(ipx_socket *sk) kfree_skb(skb,FREE_READ); } - kfree_s(sk,sizeof(*sk)); + sk_free(sk); + MOD_DEC_USE_COUNT; } /* The following code is used to support IPX Interfaces (IPXITF). 
An @@ -187,7 +220,7 @@ ipx_destroy_socket(ipx_socket *sk) */ static ipx_route * ipxrtr_lookup(unsigned long); - + static void ipxitf_clear_primary_net(void) { @@ -229,7 +262,7 @@ ipxitf_insert_socket(ipx_interface *intrfc, ipx_socket *sk) { ipx_socket *s; - sk->ipx_intrfc = intrfc; + sk->protinfo.af_ipx.intrfc = intrfc; sk->next = NULL; if (intrfc->if_sklist == NULL) { intrfc->if_sklist = sk; @@ -246,13 +279,34 @@ ipxitf_find_socket(ipx_interface *intrfc, unsigned short port) ipx_socket *s; for (s=intrfc->if_sklist; - (s != NULL) && (s->ipx_port != port); + (s != NULL) && (s->protinfo.af_ipx.port != port); s=s->next) ; return s; } +#ifdef CONFIG_IPX_INTERN + +static ipx_socket * +ipxitf_find_internal_socket(ipx_interface *intrfc, + unsigned char *node, unsigned short port) +{ + ipx_socket *s = intrfc->if_sklist; + + while (s != NULL) + { + if ( (s->protinfo.af_ipx.port == port) + && (memcmp(node, s->protinfo.af_ipx.node, IPX_NODE_LEN) == 0)) + { + break; + } + s = s->next; + } + return s; +} +#endif + static void ipxrtr_del_routes(ipx_interface *); static void @@ -268,8 +322,8 @@ ipxitf_down(ipx_interface *intrfc) for (s = intrfc->if_sklist; s != NULL; ) { s->err = ENOLINK; s->error_report(s); - s->ipx_intrfc = NULL; - s->ipx_port = 0; + s->protinfo.af_ipx.intrfc = NULL; + s->protinfo.af_ipx.port = 0; s->zapped=1; /* Indicates it is no longer bound */ t = s; s = s->next; @@ -296,10 +350,15 @@ ipxitf_down(ipx_interface *intrfc) ipx_internal_net = NULL; kfree_s(intrfc, sizeof(*intrfc)); + /* sockets still dangling + * - must be closed from user space + */ + MOD_DEC_USE_COUNT; + return; } static int -ipxitf_device_event(unsigned long event, void *ptr) +ipxitf_device_event(struct notifier_block *notifier, unsigned long event, void *ptr) { struct device *dev = ptr; ipx_interface *i, *tmp; @@ -319,32 +378,96 @@ ipxitf_device_event(unsigned long event, void *ptr) return NOTIFY_DONE; } -static int -ipxitf_def_skb_handler(struct sock *sock, struct sk_buff *skb) +static int 
ipxitf_def_skb_handler(struct sock *sock, struct sk_buff *skb) { int retval; - if((retval = sock_queue_rcv_skb(sock, skb))<0) { + if((retval = sock_queue_rcv_skb(sock, skb))<0) + { /* - * We do a FREE_WRITE here because this indicates how - * to treat the socket with which the packet is - * associated. If this packet is associated with a - * socket at all, it must be the originator of the - * packet. Incoming packets will have no socket - * associated with them at this point. + * skb->sk is NULL here, so FREE_WRITE does not hurt + * the sending socket. */ kfree_skb(skb,FREE_WRITE); } return retval; } +/* + * On input skb->sk is NULL. Nobody is charged for the memory. + */ + +#ifdef CONFIG_IPX_INTERN +static int +ipxitf_demux_socket(ipx_interface *intrfc, struct sk_buff *skb, int copy) +{ + ipx_packet *ipx = (ipx_packet *)(skb->h.raw); + ipx_socket *s; + + int is_broadcast = (memcmp(ipx->ipx_dest.node, ipx_broadcast_node, + IPX_NODE_LEN) == 0); + + s = intrfc->if_sklist; + + while (s != NULL) + { + if ( (s->protinfo.af_ipx.port == ipx->ipx_dest.sock) + && ( is_broadcast + || (memcmp(ipx->ipx_dest.node, s->protinfo.af_ipx.node, + IPX_NODE_LEN) == 0))) + { + /* We found a socket to which to send */ + struct sk_buff *skb1; + + if (copy != 0) + { + skb1 = skb_clone(skb, GFP_ATOMIC); + if (skb1 != NULL) + { + skb1->arp = skb1->free = 1; + } + else + { + return -ENOMEM; + } + } + else + { + skb1 = skb; + copy = 1; /* skb may only be used once */ + } + ipxitf_def_skb_handler(s, skb1); + + if (intrfc != ipx_internal_net) + { + /* on an external interface, at most + * one socket can listen. + */ + break; + } + } + s = s->next; + } + + if (copy == 0) + { + /* skb was solely for us, and we did not make a copy, + * so free it. FREE_WRITE does not hurt, because + * skb->sk is NULL here. 
+ */ + kfree_skb(skb, FREE_WRITE); + } + return 0; +} + +#else + static int ipxitf_demux_socket(ipx_interface *intrfc, struct sk_buff *skb, int copy) { ipx_packet *ipx = (ipx_packet *)(skb->h.raw); ipx_socket *sock1 = NULL, *sock2 = NULL; struct sk_buff *skb1 = NULL, *skb2 = NULL; - int ipx_offset; sock1 = ipxitf_find_socket(intrfc, ipx->ipx_dest.sock); @@ -355,98 +478,98 @@ ipxitf_demux_socket(ipx_interface *intrfc, struct sk_buff *skb, int copy) * The *SPECIAL* socket list contains: 0x452(SAP), 0x453(RIP) and * 0x456(Diagnostic). */ - if (ipx_primary_net && (intrfc != ipx_primary_net)) { - switch (ntohs(ipx->ipx_dest.sock)) { - case 0x452: - case 0x453: - case 0x456: - /* - * The appropriate thing to do here is to - * dup the packet and route to the primary net - * interface via ipxitf_send; however, we'll cheat - * and just demux it here. - */ - sock2 = ipxitf_find_socket(ipx_primary_net, + + if (ipx_primary_net && (intrfc != ipx_primary_net)) + { + switch (ntohs(ipx->ipx_dest.sock)) + { + case 0x452: + case 0x453: + case 0x456: + /* + * The appropriate thing to do here is to + * dup the packet and route to the primary net + * interface via ipxitf_send; however, we'll cheat + * and just demux it here. + */ + sock2 = ipxitf_find_socket(ipx_primary_net, ipx->ipx_dest.sock); - break; - default: - break; + break; + default: + break; } } - /* if there is nothing to do, return */ - if ((sock1 == NULL) && (sock2 == NULL)) { + /* + * if there is nothing to do, return. The kfree will + * cancel any charging. + */ + + if (sock1 == NULL && sock2 == NULL) + { if (!copy) kfree_skb(skb,FREE_WRITE); return 0; } - ipx_offset = (char *)(skb->h.raw) - (char *)(skb->data); - - /* This next segment of code is a little awkward, but it sets it up + /* + * This next segment of code is a little awkward, but it sets it up * so that the appropriate number of copies of the SKB are made and * that skb1 and skb2 point to it (them) so that it (they) can be * demuxed to sock1 and/or sock2. 
If we are unable to make enough * copies, we do as much as is possible. */ - if (copy) { + + if (copy) + { skb1 = skb_clone(skb, GFP_ATOMIC); - if (skb1 != NULL) { - skb1->h.raw = (unsigned char *)&(skb1->data[ipx_offset]); + if (skb1 != NULL) skb1->arp = skb1->free = 1; - } - } else { + } + else + { skb1 = skb; } - if (skb1 == NULL) return -ENOMEM; + if (skb1 == NULL) + return -ENOMEM; - /* Do we need 2 SKBs? */ - if (sock1 && sock2) { + /* + * Do we need 2 SKBs? + */ + + if (sock1 && sock2) + { skb2 = skb_clone(skb1, GFP_ATOMIC); - if (skb2 != NULL) { - skb2->h.raw = (unsigned char *)&(skb2->data[ipx_offset]); + if (skb2 != NULL) skb2->arp = skb2->free = 1; - } - } else { - skb2 = skb1; } + else + skb2 = skb1; - if (sock1) { + if (sock1) (void) ipxitf_def_skb_handler(sock1, skb1); - } - if (skb2 == NULL) return -ENOMEM; + if (skb2 == NULL) + return -ENOMEM; - if (sock2) { + if (sock2) (void) ipxitf_def_skb_handler(sock2, skb2); - } return 0; } +#endif static struct sk_buff * ipxitf_adjust_skbuff(ipx_interface *intrfc, struct sk_buff *skb) { struct sk_buff *skb2; - int in_offset = skb->h.raw - skb->data; + int in_offset = skb->h.raw - skb->head; int out_offset = intrfc->if_ipx_offset; - char *oldraw; int len; /* Hopefully, most cases */ - if (in_offset == out_offset) { - skb->len += out_offset; - skb->arp = skb->free = 1; - return skb; - } - - /* Existing SKB will work, just need to move things around a little */ - if (in_offset > out_offset) { - oldraw = skb->h.raw; - skb->h.raw = &(skb->data[out_offset]); - memmove(skb->h.raw, oldraw, skb->len); - skb->len += out_offset; + if (in_offset >= out_offset) { skb->arp = skb->free = 1; return skb; } @@ -455,8 +578,8 @@ ipxitf_adjust_skbuff(ipx_interface *intrfc, struct sk_buff *skb) len = skb->len + out_offset; skb2 = alloc_skb(len, GFP_ATOMIC); if (skb2 != NULL) { - skb2->h.raw = &(skb2->data[out_offset]); - skb2->len = len; + skb_reserve(skb2,out_offset); + skb2->h.raw=skb_put(skb2,skb->len); skb2->free=1; 
skb2->arp=1; memcpy(skb2->h.raw, skb->h.raw, skb->len); @@ -465,8 +588,7 @@ ipxitf_adjust_skbuff(ipx_interface *intrfc, struct sk_buff *skb) return skb2; } -static int -ipxitf_send(ipx_interface *intrfc, struct sk_buff *skb, char *node) +static int ipxitf_send(ipx_interface *intrfc, struct sk_buff *skb, char *node) { ipx_packet *ipx = (ipx_packet *)(skb->h.raw); struct device *dev = intrfc->if_dev; @@ -475,29 +597,71 @@ ipxitf_send(ipx_interface *intrfc, struct sk_buff *skb, char *node) int send_to_wire = 1; int addr_len; - /* We need to know how many skbuffs it will take to send out this - * packet to avoid unnecessary copies. + /* + * We need to know how many skbuffs it will take to send out this + * packet to avoid unnecessary copies. */ + if ((dl == NULL) || (dev == NULL) || (dev->flags & IFF_LOOPBACK)) - send_to_wire = 0; + send_to_wire = 0; /* No non looped */ - /* See if this should be demuxed to sockets on this interface */ - if (ipx->ipx_dest.net == intrfc->if_netnum) { + /* + * See if this should be demuxed to sockets on this interface + * + * We want to ensure the original was eaten or that we only use + * up clones. + */ + + if (ipx->ipx_dest.net == intrfc->if_netnum) + { + /* + * To our own node, loop and free the original. + */ if (memcmp(intrfc->if_node, node, IPX_NODE_LEN) == 0) + { + /* + * Don't charge sender + */ + if(skb->sk) + { + atomic_sub(skb->truesize, &skb->sk->wmem_alloc); + skb->sk=NULL; + } + /* + * Will charge receiver + */ return ipxitf_demux_socket(intrfc, skb, 0); - if (memcmp(ipx_broadcast_node, node, IPX_NODE_LEN) == 0) { + } + /* + * Broadcast, loop and possibly keep to send on. 
+ */ + if (memcmp(ipx_broadcast_node, node, IPX_NODE_LEN) == 0) + { + if (!send_to_wire && skb->sk) + { + atomic_sub(skb->truesize, &skb->sk->wmem_alloc); + skb->sk=NULL; + } ipxitf_demux_socket(intrfc, skb, send_to_wire); - if (!send_to_wire) return 0; + if (!send_to_wire) + return 0; } } - /* if the originating net is not equal to our net; this is routed */ - if (ipx->ipx_source.net != intrfc->if_netnum) { + /* + * If the originating net is not equal to our net; this is routed + * We are still charging the sender. Which is right - the driver + * free will handle this fairly. + */ + + if (ipx->ipx_source.net != intrfc->if_netnum) + { if (++(ipx->ipx_tctrl) > ipxcfg_max_hops) send_to_wire = 0; } - if (!send_to_wire) { + if (!send_to_wire) + { /* * We do a FREE_WRITE here because this indicates how * to treat the socket with which the packet is @@ -510,66 +674,91 @@ ipxitf_send(ipx_interface *intrfc, struct sk_buff *skb, char *node) return 0; } - /* determine the appropriate hardware address */ + /* + * Determine the appropriate hardware address + */ + addr_len = dev->addr_len; - if (memcmp(ipx_broadcast_node, node, IPX_NODE_LEN) == 0) { + if (memcmp(ipx_broadcast_node, node, IPX_NODE_LEN) == 0) memcpy(dest_node, dev->broadcast, addr_len); - } else { + else memcpy(dest_node, &(node[IPX_NODE_LEN-addr_len]), addr_len); - } - /* make any compensation for differing physical/data link size */ + /* + * Make any compensation for differing physical/data link size + */ + skb = ipxitf_adjust_skbuff(intrfc, skb); - if (skb == NULL) return 0; + if (skb == NULL) + return 0; /* set up data link and physical headers */ skb->dev = dev; + skb->protocol = htons(ETH_P_IPX); dl->datalink_header(dl, skb, dest_node); +#if 0 + /* + * Now log the packet just before transmission + */ + + dump_pkt("IPX snd:", (ipx_packet *)skb->h.raw); + dump_data("ETH hdr:", skb->data, skb->h.raw - skb->data); +#endif - if (skb->sk != NULL) { - /* This is an outbound packet from this host. 
We need to - * increment the write count. - */ - skb->sk->wmem_alloc += skb->mem_len; - } - - /* Send it out */ + /* + * Send it out + */ + dev_queue_xmit(skb, dev, SOPRI_NORMAL); return 0; } -static int -ipxrtr_add_route(unsigned long, ipx_interface *, unsigned char *); +static int ipxrtr_add_route(unsigned long, ipx_interface *, unsigned char *); -static int -ipxitf_add_local_route(ipx_interface *intrfc) +static int ipxitf_add_local_route(ipx_interface *intrfc) { return ipxrtr_add_route(intrfc->if_netnum, intrfc, NULL); } -static char * ipx_frame_name(unsigned short); -static char * ipx_device_name(ipx_interface *); +static const char * ipx_frame_name(unsigned short); +static const char * ipx_device_name(ipx_interface *); static int ipxrtr_route_skb(struct sk_buff *); -static int -ipxitf_rcv(ipx_interface *intrfc, struct sk_buff *skb) +static int ipxitf_rcv(ipx_interface *intrfc, struct sk_buff *skb) { ipx_packet *ipx = (ipx_packet *) (skb->h.raw); ipx_interface *i; +#ifdef CONFIG_FIREWALL + /* + * We firewall first, ask questions later. + */ + + if (call_in_firewall(PF_IPX, skb->dev, ipx, NULL)!=FW_ACCEPT) + { + kfree_skb(skb, FREE_READ); + return 0; + } + +#endif + /* See if we should update our network number */ if ((intrfc->if_netnum == 0L) && (ipx->ipx_source.net == ipx->ipx_dest.net) && - (ipx->ipx_source.net != 0L)) { + (ipx->ipx_source.net != 0L)) + { /* NB: NetWare servers lie about their hop count so we * dropped the test based on it. This is the best way * to determine this is a 0 hop count packet. 
*/ - if ((i=ipxitf_find_using_net(ipx->ipx_source.net))==NULL) { + if ((i=ipxitf_find_using_net(ipx->ipx_source.net))==NULL) + { intrfc->if_netnum = ipx->ipx_source.net; (void) ipxitf_add_local_route(intrfc); - } else { - printk("IPX: Network number collision %lx\n\t%s %s and %s %s\n", + } + else + { + printk(KERN_WARNING "IPX: Network number collision %lx\n %s %s and %s %s\n", htonl(ipx->ipx_source.net), ipx_device_name(i), ipx_frame_name(i->if_dlink_type), @@ -583,19 +772,36 @@ ipxitf_rcv(ipx_interface *intrfc, struct sk_buff *skb) if (ipx->ipx_source.net == 0L) ipx->ipx_source.net = intrfc->if_netnum; - if (intrfc->if_netnum != ipx->ipx_dest.net) { + if (intrfc->if_netnum != ipx->ipx_dest.net) + { +#ifdef CONFIG_FIREWALL + /* + * See if we are allowed to firewall forward + */ + if (call_fw_firewall(PF_IPX, skb->dev, ipx, NULL)!=FW_ACCEPT) + { + kfree_skb(skb, FREE_READ); + return 0; + } +#endif /* We only route point-to-point packets. */ if ((skb->pkt_type != PACKET_BROADCAST) && (skb->pkt_type != PACKET_MULTICAST)) - return ipxrtr_route_skb(skb); - + { + skb=skb_unshare(skb, GFP_ATOMIC, FREE_READ); + if(skb) + return ipxrtr_route_skb(skb); + else + return 0; + } kfree_skb(skb,FREE_READ); return 0; } /* see if we should keep it */ if ((memcmp(ipx_broadcast_node, ipx->ipx_dest.node, IPX_NODE_LEN) == 0) - || (memcmp(intrfc->if_node, ipx->ipx_dest.node, IPX_NODE_LEN) == 0)) { + || (memcmp(intrfc->if_node, ipx->ipx_dest.node, IPX_NODE_LEN) == 0)) + { return ipxitf_demux_socket(intrfc, skb, 0); } @@ -620,6 +826,8 @@ ipxitf_insert(ipx_interface *intrfc) if (ipxcfg_auto_select_primary && (ipx_primary_net == NULL)) ipx_primary_net = intrfc; + MOD_INC_USE_COUNT; + return; } static int @@ -659,6 +867,7 @@ ipx_map_frame_type(unsigned char type) switch (type) { case IPX_FRAME_ETHERII: return htons(ETH_P_IPX); case IPX_FRAME_8022: return htons(ETH_P_802_2); + case IPX_FRAME_TR_8022: return htons(ETH_P_TR_802_2); case IPX_FRAME_SNAP: return htons(ETH_P_SNAP); case 
IPX_FRAME_8023: return htons(ETH_P_802_3); } @@ -688,6 +897,10 @@ ipxitf_create(ipx_interface_definition *idef) dlink_type = htons(ETH_P_IPX); datalink = pEII_datalink; break; + case IPX_FRAME_TR_8022: + dlink_type = htons(ETH_P_TR_802_2); + datalink = p8022tr_datalink; + break; case IPX_FRAME_8022: dlink_type = htons(ETH_P_802_2); datalink = p8022_datalink; @@ -719,8 +932,8 @@ ipxitf_create(ipx_interface_definition *idef) if(dev->addr_len>IPX_NODE_LEN) return -EINVAL; - if ((intrfc = ipxitf_find_using_phys(dev, dlink_type)) == NULL) { - + if ((intrfc = ipxitf_find_using_phys(dev, dlink_type)) == NULL) + { /* Ok now create */ intrfc=(ipx_interface *)kmalloc(sizeof(ipx_interface),GFP_ATOMIC); if (intrfc==NULL) @@ -736,9 +949,14 @@ ipxitf_create(ipx_interface_definition *idef) ipx_primary_net = intrfc; intrfc->if_internal = 0; intrfc->if_ipx_offset = dev->hard_header_len + datalink->header_length; - memset(intrfc->if_node, 0, IPX_NODE_LEN); - memcpy((char *)&(intrfc->if_node[IPX_NODE_LEN-dev->addr_len]), dev->dev_addr, dev->addr_len); - + if(memcmp(idef->ipx_node, "\000\000\000\000\000\000", IPX_NODE_LEN)==0) + { + memset(intrfc->if_node, 0, IPX_NODE_LEN); + memcpy((char *)&(intrfc->if_node[IPX_NODE_LEN-dev->addr_len]), + dev->dev_addr, dev->addr_len); + } + else + memcpy(intrfc->if_node, idef->ipx_node, IPX_NODE_LEN); ipxitf_insert(intrfc); } @@ -788,6 +1006,7 @@ ipxitf_auto_create(struct device *dev, unsigned short dlink_type) switch (htons(dlink_type)) { case ETH_P_IPX: datalink = pEII_datalink; break; case ETH_P_802_2: datalink = p8022_datalink; break; + case ETH_P_TR_802_2: datalink = p8022tr_datalink; break; case ETH_P_SNAP: datalink = pSNAP_datalink; break; case ETH_P_802_3: datalink = p8023_datalink; break; default: return NULL; @@ -820,20 +1039,17 @@ ipxitf_auto_create(struct device *dev, unsigned short dlink_type) } static int -ipxitf_ioctl(unsigned int cmd, void *arg) +ipxitf_ioctl_real(unsigned int cmd, void *arg) { - int err; switch(cmd) { - case 
SIOCSIFADDR: - { + case SIOCSIFADDR: { struct ifreq ifr; struct sockaddr_ipx *sipx; ipx_interface_definition f; - err=verify_area(VERIFY_READ,arg,sizeof(ifr)); - if(err) - return err; - memcpy_fromfs(&ifr,arg,sizeof(ifr)); + + if (copy_from_user(&ifr,arg,sizeof(ifr))) + return -EFAULT; sipx=(struct sockaddr_ipx *)&ifr.ifr_addr; if(sipx->sipx_family!=AF_IPX) return -EINVAL; @@ -847,16 +1063,15 @@ ipxitf_ioctl(unsigned int cmd, void *arg) else return ipxitf_create(&f); } - case SIOCGIFADDR: - { + case SIOCGIFADDR: { struct ifreq ifr; struct sockaddr_ipx *sipx; ipx_interface *ipxif; struct device *dev; - err=verify_area(VERIFY_WRITE,arg,sizeof(ifr)); - if(err) - return err; - memcpy_fromfs(&ifr,arg,sizeof(ifr)); + int err; + + if (copy_from_user(&ifr,arg,sizeof(ifr))) + return -EFAULT; sipx=(struct sockaddr_ipx *)&ifr.ifr_addr; dev=dev_get(ifr.ifr_name); if(!dev) @@ -864,26 +1079,42 @@ ipxitf_ioctl(unsigned int cmd, void *arg) ipxif=ipxitf_find_using_phys(dev, ipx_map_frame_type(sipx->sipx_type)); if(ipxif==NULL) return -EADDRNOTAVAIL; + sipx->sipx_family=AF_IPX; sipx->sipx_network=ipxif->if_netnum; memcpy(sipx->sipx_node, ipxif->if_node, sizeof(sipx->sipx_node)); - memcpy_tofs(arg,&ifr,sizeof(ifr)); - return 0; + err = copy_to_user(arg,&ifr,sizeof(ifr)); + if (err) + return -EFAULT; + return err; } - case SIOCAIPXITFCRT: - err=verify_area(VERIFY_READ,arg,sizeof(char)); - if(err) + case SIOCAIPXITFCRT: { + int err, val; + err = get_user(val, (unsigned char *) arg); + if (err) return err; - return ipxcfg_set_auto_create(get_fs_byte(arg)); - case SIOCAIPXPRISLT: - err=verify_area(VERIFY_READ,arg,sizeof(char)); - if(err) + return ipxcfg_set_auto_create(val); + } + case SIOCAIPXPRISLT: { + int err, val; + err = get_user(val, (unsigned char *) arg); + if (err) return err; - return ipxcfg_set_auto_select(get_fs_byte(arg)); + return ipxcfg_set_auto_select(val); + } default: return -EINVAL; } } +static int +ipxitf_ioctl(unsigned int cmd, void *arg) +{ + int ret; + 
MOD_INC_USE_COUNT; + ret = ipxitf_ioctl_real (cmd,arg); + MOD_DEC_USE_COUNT; + return ret; +} /*******************************************************************************************************************\ * * * Routing tables for the IPX socket layer * @@ -915,6 +1146,8 @@ ipxrtr_add_route(unsigned long network, ipx_interface *intrfc, unsigned char *no rt->ir_next=ipx_routes; ipx_routes=rt; } + else if (intrfc == ipx_internal_net) + return(-EEXIST); rt->ir_net = network; rt->ir_intrfc = intrfc; @@ -979,8 +1212,70 @@ ipxrtr_delete(long net) return -ENOENT; } -static int -ipxrtr_route_packet(ipx_socket *sk, struct sockaddr_ipx *usipx, void *ubuf, int len) +/* + * Checksum routine for IPX + */ + +/* Note: We assume ipx_tctrl==0 and htons(length)==ipx_pktsize */ + +static __u16 ipx_set_checksum(ipx_packet *packet,int length) +{ + /* + * NOTE: sum is a net byte order quantity, which optimizes the + * loop. This only works on big and little endian machines. (I + * don't know of a machine that isn't.) + */ + + __u32 sum=0; + + /* + * Pointer to second word - We skip the checksum field + */ + + __u16 *p=(__u16 *)&packet->ipx_pktsize; + + /* + * Number of complete words + */ + + __u32 i=length>>1; + + /* + * Loop through all complete words except the checksum field + */ + + while(--i) + sum+=*p++; + + /* + * Add on the last part word if it exists + */ + + if(packet->ipx_pktsize&htons(1)) + sum+=ntohs(0xff00)&*p; + + /* + * Do final fixup + */ + + sum=(sum&0xffff)+(sum>>16); + + /* + * It's a pity there's no concept of carry in C + */ + + if(sum>=0x10000) + sum++; + + return ~sum; +}; + + +/* + * Route an outgoing frame from a socket. 
+ */ + +static int ipxrtr_route_packet(ipx_socket *sk, struct sockaddr_ipx *usipx, struct iovec *iov, int len, int noblock) { struct sk_buff *skb; ipx_interface *intrfc; @@ -988,12 +1283,16 @@ ipxrtr_route_packet(ipx_socket *sk, struct sockaddr_ipx *usipx, void *ubuf, int int size; int ipx_offset; ipx_route *rt = NULL; - + int err; + /* Find the appropriate interface on which to send packet */ - if ((usipx->sipx_network == 0L) && (ipx_primary_net != NULL)) { + if ((usipx->sipx_network == 0L) && (ipx_primary_net != NULL)) + { usipx->sipx_network = ipx_primary_net->if_netnum; intrfc = ipx_primary_net; - } else { + } + else + { rt = ipxrtr_lookup(usipx->sipx_network); if (rt==NULL) { return -ENETUNREACH; @@ -1005,32 +1304,67 @@ ipxrtr_route_packet(ipx_socket *sk, struct sockaddr_ipx *usipx, void *ubuf, int size=sizeof(ipx_packet)+len; size += ipx_offset; - if(size+sk->wmem_alloc>sk->sndbuf) return -EAGAIN; - - skb=alloc_skb(size,GFP_KERNEL); - if(skb==NULL) return -ENOMEM; - - skb->sk=sk; - skb->len=size; + skb=sock_alloc_send_skb(sk, size, 0, noblock, &err); + if(skb==NULL) + return err; + + skb_reserve(skb,ipx_offset); skb->free=1; skb->arp=1; + skb->sk=sk; /* Fill in IPX header */ - ipx=(ipx_packet *)&(skb->data[ipx_offset]); - ipx->ipx_checksum=0xFFFF; + ipx=(ipx_packet *)skb_put(skb,sizeof(ipx_packet)); ipx->ipx_pktsize=htons(len+sizeof(ipx_packet)); ipx->ipx_tctrl=0; ipx->ipx_type=usipx->sipx_type; skb->h.raw = (unsigned char *)ipx; - ipx->ipx_source.net = sk->ipx_intrfc->if_netnum; - memcpy(ipx->ipx_source.node, sk->ipx_intrfc->if_node, IPX_NODE_LEN); - ipx->ipx_source.sock = sk->ipx_port; + ipx->ipx_source.net = sk->protinfo.af_ipx.intrfc->if_netnum; +#ifdef CONFIG_IPX_INTERN + memcpy(ipx->ipx_source.node, sk->protinfo.af_ipx.node, IPX_NODE_LEN); +#else + if ((err = ntohs(sk->protinfo.af_ipx.port)) == 0x453 || err == 0x452) + { + /* RIP/SAP special handling for mars_nwe */ + ipx->ipx_source.net = intrfc->if_netnum; + memcpy(ipx->ipx_source.node, 
intrfc->if_node, IPX_NODE_LEN); + } + else + { + ipx->ipx_source.net = sk->protinfo.af_ipx.intrfc->if_netnum; + memcpy(ipx->ipx_source.node, sk->protinfo.af_ipx.intrfc->if_node, IPX_NODE_LEN); + } +#endif + ipx->ipx_source.sock = sk->protinfo.af_ipx.port; ipx->ipx_dest.net=usipx->sipx_network; memcpy(ipx->ipx_dest.node,usipx->sipx_node,IPX_NODE_LEN); ipx->ipx_dest.sock=usipx->sipx_port; - memcpy_fromfs((char *)(ipx+1),ubuf,len); + err = memcpy_fromiovec(skb_put(skb,len),iov,len); + if (err) + { + kfree_skb(skb, FREE_WRITE); + return -EFAULT; + } + + /* + * Apply checksum. Not allowed on 802.3 links. + */ + + if(sk->no_check || intrfc->if_dlink_type==IPX_FRAME_8023) + ipx->ipx_checksum=0xFFFF; + else + ipx->ipx_checksum=ipx_set_checksum(ipx, len+sizeof(ipx_packet)); + +#ifdef CONFIG_FIREWALL + if(call_out_firewall(PF_IPX, skb->dev, ipx, NULL)!=FW_ACCEPT) + { + kfree_skb(skb, FREE_WRITE); + return -EPERM; + } +#endif + return ipxitf_send(intrfc, skb, (rt && rt->ir_routed) ? rt->ir_router_node : ipx->ipx_dest.node); } @@ -1057,19 +1391,17 @@ ipxrtr_route_skb(struct sk_buff *skb) /* * We use a normal struct rtentry for route handling */ - + static int ipxrtr_ioctl(unsigned int cmd, void *arg) { int err; struct rtentry rt; /* Use these to behave like 'other' stacks */ struct sockaddr_ipx *sg,*st; - - err=verify_area(VERIFY_READ,arg,sizeof(rt)); - if(err) - return err; - memcpy_fromfs(&rt,arg,sizeof(rt)); - + err = copy_from_user(&rt,arg,sizeof(rt)); + if (err) + return -EFAULT; + sg=(struct sockaddr_ipx *)&rt.rt_gateway; st=(struct sockaddr_ipx *)&rt.rt_dst; @@ -1097,7 +1429,7 @@ static int ipxrtr_ioctl(unsigned int cmd, void *arg) } } -static char * +static const char * ipx_frame_name(unsigned short frame) { switch (ntohs(frame)) { @@ -1105,11 +1437,12 @@ ipx_frame_name(unsigned short frame) case ETH_P_802_2: return "802.2"; case ETH_P_SNAP: return "SNAP"; case ETH_P_802_3: return "802.3"; + case ETH_P_TR_802_2: return "802.2TR"; default: return "None"; } } -static 
char * +static const char * ipx_device_name(ipx_interface *intrfc) { return (intrfc->if_internal ? "Internal" : @@ -1117,8 +1450,8 @@ ipx_device_name(ipx_interface *intrfc) } /* Called from proc fs */ -int -ipx_get_interface_info(char *buffer, char **start, off_t offset, int length) +static int ipx_interface_get_info(char *buffer, char **start, off_t offset, + int length, int dummy) { ipx_interface *i; int len=0; @@ -1126,7 +1459,7 @@ ipx_get_interface_info(char *buffer, char **start, off_t offset, int length) off_t begin=0; /* Theory.. Keep printing in the same place until we pass offset */ - + len += sprintf (buffer,"%-11s%-15s%-9s%-11s%s\n", "Network", "Node_Address", "Primary", "Device", "Frame_Type"); for (i = ipx_interfaces; i != NULL; i = i->if_next) { @@ -1160,8 +1493,8 @@ ipx_get_interface_info(char *buffer, char **start, off_t offset, int length) return len; } -int -ipx_get_info(char *buffer, char **start, off_t offset, int length) +static int ipx_get_info(char *buffer, char **start, off_t offset, + int length, int dummy) { ipx_socket *s; ipx_interface *i; @@ -1170,27 +1503,47 @@ ipx_get_info(char *buffer, char **start, off_t offset, int length) off_t begin=0; /* Theory.. 
Keep printing in the same place until we pass offset */ - + +#ifdef CONFIG_IPX_INTERN + len += sprintf (buffer,"%-28s%-28s%-10s%-10s%-7s%s\n", "Local_Address", +#else len += sprintf (buffer,"%-15s%-28s%-10s%-10s%-7s%s\n", "Local_Address", +#endif "Remote_Address", "Tx_Queue", "Rx_Queue", "State", "Uid"); for (i = ipx_interfaces; i != NULL; i = i->if_next) { for (s = i->if_sklist; s != NULL; s = s->next) { - len += sprintf (buffer+len,"%08lX:%04X ", - htonl(i->if_netnum), - htons(s->ipx_port)); +#ifdef CONFIG_IPX_INTERN + len += sprintf(buffer+len, + "%08lX:%02X%02X%02X%02X%02X%02X:%04X ", + htonl(s->protinfo.af_ipx.intrfc->if_netnum), + s->protinfo.af_ipx.node[0], + s->protinfo.af_ipx.node[1], + s->protinfo.af_ipx.node[2], + s->protinfo.af_ipx.node[3], + s->protinfo.af_ipx.node[4], + s->protinfo.af_ipx.node[5], + htons(s->protinfo.af_ipx.port)); +#else + len += sprintf(buffer+len,"%08lX:%04X ", + htonl(i->if_netnum), + htons(s->protinfo.af_ipx.port)); +#endif if (s->state!=TCP_ESTABLISHED) { len += sprintf(buffer+len, "%-28s", "Not_Connected"); } else { len += sprintf (buffer+len, "%08lX:%02X%02X%02X%02X%02X%02X:%04X ", - htonl(s->ipx_dest_addr.net), - s->ipx_dest_addr.node[0], s->ipx_dest_addr.node[1], - s->ipx_dest_addr.node[2], s->ipx_dest_addr.node[3], - s->ipx_dest_addr.node[4], s->ipx_dest_addr.node[5], - htons(s->ipx_dest_addr.sock)); + htonl(s->protinfo.af_ipx.dest_addr.net), + s->protinfo.af_ipx.dest_addr.node[0], + s->protinfo.af_ipx.dest_addr.node[1], + s->protinfo.af_ipx.dest_addr.node[2], + s->protinfo.af_ipx.dest_addr.node[3], + s->protinfo.af_ipx.dest_addr.node[4], + s->protinfo.af_ipx.dest_addr.node[5], + htons(s->protinfo.af_ipx.dest_addr.sock)); } - len += sprintf (buffer+len,"%08lX %08lX ", + len += sprintf (buffer+len,"%08X %08X ", s->wmem_alloc, s->rmem_alloc); len += sprintf (buffer+len,"%02X %03d\n", s->state, SOCK_INODE(s->socket)->i_uid); @@ -1217,7 +1570,8 @@ ipx_get_info(char *buffer, char **start, off_t offset, int length) return len; } 
-int ipx_rt_get_info(char *buffer, char **start, off_t offset, int length) +static int ipx_rt_get_info(char *buffer, char **start, off_t offset, + int length, int dummy) { ipx_route *rt; int len=0; @@ -1260,7 +1614,7 @@ int ipx_rt_get_info(char *buffer, char **start, off_t offset, int length) * Handling for system calls applied via the various interfaces to an IPX socket object * * * \*******************************************************************************************************************/ - + static int ipx_fcntl(struct socket *sock, unsigned int cmd, unsigned long arg) { switch(cmd) @@ -1277,13 +1631,12 @@ static int ipx_setsockopt(struct socket *sock, int level, int optname, char *opt sk=(ipx_socket *)sock->data; - if(optval==NULL) + if (optval==NULL) return(-EINVAL); - err=verify_area(VERIFY_READ,optval,sizeof(int)); - if(err) + err = get_user(opt, (unsigned int *)optval); + if (err) return err; - opt=get_fs_long((unsigned long *)optval); switch(level) { @@ -1291,14 +1644,14 @@ static int ipx_setsockopt(struct socket *sock, int level, int optname, char *opt switch(optname) { case IPX_TYPE: - sk->ipx_type=opt; + sk->protinfo.af_ipx.type=opt; return 0; default: return -EOPNOTSUPP; } break; - case SOL_SOCKET: + case SOL_SOCKET: return sock_setsockopt(sk,level,optname,optval,optlen); default: @@ -1322,7 +1675,7 @@ static int ipx_getsockopt(struct socket *sock, int level, int optname, switch(optname) { case IPX_TYPE: - val=sk->ipx_type; + val=sk->protinfo.af_ipx.type; break; default: return -ENOPROTOOPT; @@ -1335,13 +1688,10 @@ static int ipx_getsockopt(struct socket *sock, int level, int optname, default: return -EOPNOTSUPP; } - err=verify_area(VERIFY_WRITE,optlen,sizeof(int)); - if(err) - return err; - put_fs_long(sizeof(int),(unsigned long *)optlen); - err=verify_area(VERIFY_WRITE,optval,sizeof(int)); - put_fs_long(val,(unsigned long *)optval); - return(0); + err = put_user(sizeof(int), optlen); + if (!err) + err = put_user(val, (int *)optval); + return 
err; } static int ipx_listen(struct socket *sock, int backlog) @@ -1364,11 +1714,10 @@ static void def_callback2(struct sock *sk, int len) } } -static int -ipx_create(struct socket *sock, int protocol) +static int ipx_create(struct socket *sock, int protocol) { ipx_socket *sk; - sk=(ipx_socket *)kmalloc(sizeof(*sk),GFP_KERNEL); + sk=(ipx_socket *)sk_alloc(GFP_KERNEL); if(sk==NULL) return(-ENOMEM); switch(sock->type) @@ -1379,17 +1728,9 @@ ipx_create(struct socket *sock, int protocol) kfree_s((void *)sk,sizeof(*sk)); return(-ESOCKTNOSUPPORT); } - sk->dead=0; - sk->next=NULL; - sk->broadcast=0; sk->rcvbuf=SK_RMEM_MAX; sk->sndbuf=SK_WMEM_MAX; - sk->wmem_alloc=0; - sk->rmem_alloc=0; - sk->inuse=0; - sk->shutdown=0; sk->prot=NULL; /* So we use default free mechanisms */ - sk->err=0; skb_queue_head_init(&sk->receive_queue); skb_queue_head_init(&sk->write_queue); sk->send_head=NULL; @@ -1397,13 +1738,8 @@ ipx_create(struct socket *sock, int protocol) sk->state=TCP_CLOSE; sk->socket=sock; sk->type=sock->type; - sk->ipx_type=0; /* General user level IPX */ - sk->debug=0; - sk->ipx_intrfc = NULL; - memset(&sk->ipx_dest_addr,'\0',sizeof(sk->ipx_dest_addr)); - sk->ipx_port = 0; sk->mtu=IPX_MTU; - + sk->no_check = 1; /* Checksum off by default */ if(sock!=NULL) { sock->data=(void *)sk; @@ -1416,6 +1752,7 @@ ipx_create(struct socket *sock, int protocol) sk->error_report=def_callback1; sk->zapped=1; + MOD_INC_USE_COUNT; return 0; } @@ -1455,7 +1792,7 @@ ipx_first_free_socketnum(ipx_interface *intrfc) return ntohs(socketNum); } -static int ipx_bind(struct socket *sock, struct sockaddr *uaddr,int addr_len) +static int ipx_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { ipx_socket *sk; ipx_interface *intrfc; @@ -1464,7 +1801,7 @@ static int ipx_bind(struct socket *sock, struct sockaddr *uaddr,int addr_len) sk=(ipx_socket *)sock->data; if(sk->zapped==0) - return -EIO; + return -EINVAL; if(addr_len!=sizeof(struct sockaddr_ipx)) return -EINVAL; @@ -1480,8 +1817,62 @@ 
static int ipx_bind(struct socket *sock, struct sockaddr *uaddr,int addr_len) } if(ntohs(addr->sipx_port)<IPX_MIN_EPHEMERAL_SOCKET && !suser()) - return -EPERM; /* protect IPX system stuff like routing/sap */ - + return -EACCES; /* protect IPX system stuff like routing/sap */ + + sk->protinfo.af_ipx.port=addr->sipx_port; + +#ifdef CONFIG_IPX_INTERN + if (intrfc == ipx_internal_net) + { + /* The source address is to be set explicitly if the + * socket is to be bound on the internal network. If a + * node number 0 was specified, the default is used. + */ + + if (memcmp(addr->sipx_node, ipx_broadcast_node, + IPX_NODE_LEN) == 0) + { + return -EINVAL; + } + if (memcmp(addr->sipx_node, ipx_this_node, IPX_NODE_LEN) == 0) + { + memcpy(sk->protinfo.af_ipx.node, intrfc->if_node, + IPX_NODE_LEN); + } + else + { + memcpy(sk->protinfo.af_ipx.node, addr->sipx_node, IPX_NODE_LEN); + } + if (ipxitf_find_internal_socket(intrfc, + sk->protinfo.af_ipx.node, + sk->protinfo.af_ipx.port) != NULL) + { + if(sk->debug) + printk("IPX: bind failed because port %X in" + " use.\n", (int)addr->sipx_port); + return -EADDRINUSE; + } + } + else + { + /* Source addresses are easy. It must be our + * network:node pair for an interface routed to IPX + * with the ipx routing ioctl() + */ + + memcpy(sk->protinfo.af_ipx.node, intrfc->if_node, + IPX_NODE_LEN); + + if(ipxitf_find_socket(intrfc, addr->sipx_port)!=NULL) { + if(sk->debug) + printk("IPX: bind failed because port %X in" + " use.\n", (int)addr->sipx_port); + return -EADDRINUSE; + } + } + +#else + /* Source addresses are easy. 
It must be our network:node pair for an interface routed to IPX with the ipx routing ioctl() */ @@ -1492,7 +1883,8 @@ static int ipx_bind(struct socket *sock, struct sockaddr *uaddr,int addr_len) return -EADDRINUSE; } - sk->ipx_port=addr->sipx_port; +#endif + ipxitf_insert_socket(intrfc, sk); sk->zapped=0; if(sk->debug) @@ -1513,24 +1905,30 @@ static int ipx_connect(struct socket *sock, struct sockaddr *uaddr, return(-EINVAL); addr=(struct sockaddr_ipx *)uaddr; - if(sk->ipx_port==0) + if(sk->protinfo.af_ipx.port==0) /* put the autobinding in */ { struct sockaddr_ipx uaddr; int ret; uaddr.sipx_port = 0; - uaddr.sipx_network = 0L; - ret = ipx_bind (sock, (struct sockaddr *)&uaddr, sizeof(struct sockaddr_ipx)); + uaddr.sipx_network = 0L; +#ifdef CONFIG_IPX_INTERN + memcpy(uaddr.sipx_node, sk->protinfo.af_ipx.intrfc->if_node, + IPX_NODE_LEN); +#endif + ret = ipx_bind (sock, (struct sockaddr *)&uaddr, + sizeof(struct sockaddr_ipx)); if (ret != 0) return (ret); } if(ipxrtr_lookup(addr->sipx_network)==NULL) return -ENETUNREACH; - sk->ipx_dest_addr.net=addr->sipx_network; - sk->ipx_dest_addr.sock=addr->sipx_port; - memcpy(sk->ipx_dest_addr.node,addr->sipx_node,IPX_NODE_LEN); - sk->ipx_type=addr->sipx_type; + sk->protinfo.af_ipx.dest_addr.net=addr->sipx_network; + sk->protinfo.af_ipx.dest_addr.sock=addr->sipx_port; + memcpy(sk->protinfo.af_ipx.dest_addr.node, + addr->sipx_node,IPX_NODE_LEN); + sk->protinfo.af_ipx.type=addr->sipx_type; sock->state = SS_CONNECTED; sk->state=TCP_ESTABLISHED; return 0; @@ -1543,8 +1941,10 @@ static int ipx_socketpair(struct socket *sock1, struct socket *sock2) static int ipx_accept(struct socket *sock, struct socket *newsock, int flags) { - if(newsock->data) + if(newsock->data) { kfree_s(newsock->data,sizeof(ipx_socket)); + MOD_DEC_USE_COUNT; + } return -EOPNOTSUPP; } @@ -1562,24 +1962,29 @@ static int ipx_getname(struct socket *sock, struct sockaddr *uaddr, if(peer) { if(sk->state!=TCP_ESTABLISHED) return -ENOTCONN; - addr=&sk->ipx_dest_addr; 
+ addr=&sk->protinfo.af_ipx.dest_addr; sipx.sipx_network = addr->net; memcpy(sipx.sipx_node,addr->node,IPX_NODE_LEN); sipx.sipx_port = addr->sock; } else { - if (sk->ipx_intrfc != NULL) { - sipx.sipx_network = sk->ipx_intrfc->if_netnum; - memcpy(sipx.sipx_node, sk->ipx_intrfc->if_node, - IPX_NODE_LEN); + if (sk->protinfo.af_ipx.intrfc != NULL) { + sipx.sipx_network = sk->protinfo.af_ipx.intrfc->if_netnum; +#ifdef CONFIG_IPX_INTERN + memcpy(sipx.sipx_node, sk->protinfo.af_ipx.node, IPX_NODE_LEN); +#else + memcpy(sipx.sipx_node, + sk->protinfo.af_ipx.intrfc->if_node, IPX_NODE_LEN); +#endif + } else { sipx.sipx_network = 0L; memset(sipx.sipx_node, '\0', IPX_NODE_LEN); } - sipx.sipx_port = sk->ipx_port; + sipx.sipx_port = sk->protinfo.af_ipx.port; } sipx.sipx_family = AF_IPX; - sipx.sipx_type = sk->ipx_type; + sipx.sipx_type = sk->protinfo.af_ipx.type; memcpy(uaddr,&sipx,sizeof(sipx)); return 0; } @@ -1588,20 +1993,31 @@ static int ipx_getname(struct socket *sock, struct sockaddr *uaddr, /* * User to dump IPX packets (debugging) */ -void dump_data(char *str,unsigned char *d) { +void dump_data(char *str,unsigned char *d, int len) { static char h2c[] = "0123456789ABCDEF"; int l,i; char *p, b[64]; - for (l=0;l<16;l++) { + for (l=0;len > 0 && l<16;l++) { p = b; - for (i=0; i < 8 ; i++) { - *(p++) = h2c[d[i] & 0x0f]; - *(p++) = h2c[(d[i] >> 4) & 0x0f]; + for (i=0; i < 8 ; i++, --len) { + if (len > 0) { + *(p++) = h2c[(d[i] >> 4) & 0x0f]; + *(p++) = h2c[d[i] & 0x0f]; + } + else { + *(p++) = ' '; + *(p++) = ' '; + } *(p++) = ' '; } *(p++) = '-'; *(p++) = ' '; - for (i=0; i < 8 ; i++) *(p++) = ' '<= d[i] && d[i]<'\177' ? d[i] : '.'; + len += 8; + for (i=0; i < 8 ; i++, --len) + if (len > 0) + *(p++) = ' '<= d[i] && d[i]<'\177' ? 
d[i] : '.'; + else + *(p++) = ' '; *p = '\000'; d += i; printk("%s-%04X: %s\n",str,l*8,b); @@ -1623,8 +2039,10 @@ void dump_hdr(char *str,ipx_packet *p) { } void dump_pkt(char *str,ipx_packet *p) { + int len = ntohs(p->ipx_pktsize); dump_hdr(str,p); - dump_data(str,(unsigned char *)p); + if (len > 30) + dump_data(str,(unsigned char *)p + 30, len - 30); } #endif @@ -1634,26 +2052,32 @@ int ipx_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) ipx_interface *intrfc; ipx_packet *ipx; + ipx=(ipx_packet *)skb->h.raw; - if(ipx->ipx_checksum!=IPX_NO_CHECKSUM) { - /* We don't do checksum options. We can't really. Novell don't seem to have documented them. - If you need them try the XNS checksum since IPX is basically XNS in disguise. It might be - the same... */ + /* Too small */ + + if(ntohs(ipx->ipx_pktsize)<sizeof(ipx_packet)) { kfree_skb(skb,FREE_READ); return 0; } - /* Too small */ - if(htons(ipx->ipx_pktsize)<sizeof(ipx_packet)) { - kfree_skb(skb,FREE_READ); - return 0; + if(ipx->ipx_checksum!=IPX_NO_CHECKSUM) + { + if(ipx_set_checksum(ipx, ntohs(ipx->ipx_pktsize))!=ipx->ipx_checksum) + { + kfree_skb(skb,FREE_READ); + return 0; + } } /* Determine what local ipx endpoint this is */ intrfc = ipxitf_find_using_phys(dev, pt->type); - if (intrfc == NULL) { - if (ipxcfg_auto_create_interfaces) { + if (intrfc == NULL) + { + if (ipxcfg_auto_create_interfaces && + ntohl(ipx->ipx_dest.net)!=0L) + { intrfc = ipxitf_auto_create(dev, pt->type); } @@ -1667,87 +2091,99 @@ int ipx_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) return ipxitf_rcv(intrfc, skb); } -static int ipx_sendto(struct socket *sock, void *ubuf, int len, int noblock, - unsigned flags, struct sockaddr *usip, int addr_len) +static int ipx_sendmsg(struct socket *sock, struct msghdr *msg, int len, int noblock, + int flags) { ipx_socket *sk=(ipx_socket *)sock->data; - struct sockaddr_ipx *usipx=(struct sockaddr_ipx *)usip; + struct sockaddr_ipx *usipx=(struct sockaddr_ipx 
*)msg->msg_name; struct sockaddr_ipx local_sipx; int retval; - if (sk->zapped) return -EIO; /* Socket not bound */ - if(flags) return -EINVAL; + if (sk->zapped) + return -EIO; /* Socket not bound */ + if(flags) + return -EINVAL; - if(usipx) { - if(sk->ipx_port == 0) { + if(usipx) + { + if(sk->protinfo.af_ipx.port == 0) + { struct sockaddr_ipx uaddr; int ret; uaddr.sipx_port = 0; uaddr.sipx_network = 0L; - ret = ipx_bind (sock, (struct sockaddr *)&uaddr, sizeof(struct sockaddr_ipx)); +#ifdef CONFIG_IPX_INTERN + memcpy(uaddr.sipx_node, sk->protinfo.af_ipx.intrfc + ->if_node, IPX_NODE_LEN); +#endif + ret = ipx_bind (sock, (struct sockaddr *)&uaddr, + sizeof(struct sockaddr_ipx)); if (ret != 0) return ret; } - if(addr_len <sizeof(*usipx)) + if(msg->msg_namelen <sizeof(*usipx)) return -EINVAL; if(usipx->sipx_family != AF_IPX) return -EINVAL; - } else { + } + else + { if(sk->state!=TCP_ESTABLISHED) return -ENOTCONN; usipx=&local_sipx; usipx->sipx_family=AF_IPX; - usipx->sipx_type=sk->ipx_type; - usipx->sipx_port=sk->ipx_dest_addr.sock; - usipx->sipx_network=sk->ipx_dest_addr.net; - memcpy(usipx->sipx_node,sk->ipx_dest_addr.node,IPX_NODE_LEN); + usipx->sipx_type=sk->protinfo.af_ipx.type; + usipx->sipx_port=sk->protinfo.af_ipx.dest_addr.sock; + usipx->sipx_network=sk->protinfo.af_ipx.dest_addr.net; + memcpy(usipx->sipx_node,sk->protinfo.af_ipx.dest_addr.node,IPX_NODE_LEN); } - retval = ipxrtr_route_packet(sk, usipx, ubuf, len); - if (retval < 0) return retval; + retval = ipxrtr_route_packet(sk, usipx, msg->msg_iov, len, noblock); + if (retval < 0) + return retval; return len; } -static int ipx_send(struct socket *sock, void *ubuf, int size, int noblock, unsigned flags) -{ - return ipx_sendto(sock,ubuf,size,noblock,flags,NULL,0); -} -static int ipx_recvfrom(struct socket *sock, void *ubuf, int size, int noblock, - unsigned flags, struct sockaddr *sip, int *addr_len) +static int ipx_recvmsg(struct socket *sock, struct msghdr *msg, int size, int noblock, + int flags, int 
*addr_len) { ipx_socket *sk=(ipx_socket *)sock->data; - struct sockaddr_ipx *sipx=(struct sockaddr_ipx *)sip; + struct sockaddr_ipx *sipx=(struct sockaddr_ipx *)msg->msg_name; struct ipx_packet *ipx = NULL; int copied = 0; int truesize; struct sk_buff *skb; - int er; - - if(sk->err) - { - er= -sk->err; - sk->err=0; - return er; - } + int err; if (sk->zapped) - return -EIO; + return -ENOTCONN; - skb=skb_recv_datagram(sk,flags,noblock,&er); + skb=skb_recv_datagram(sk,flags,noblock,&err); if(skb==NULL) - return er; + return err; + if(addr_len) *addr_len=sizeof(*sipx); ipx = (ipx_packet *)(skb->h.raw); truesize=ntohs(ipx->ipx_pktsize) - sizeof(ipx_packet); - copied = (truesize > size) ? size : truesize; - skb_copy_datagram(skb,sizeof(struct ipx_packet),ubuf,copied); + copied = truesize; + if(copied > size) + { + copied=size; + msg->msg_flags|=MSG_TRUNC; + } + + err = skb_copy_datagram_iovec(skb,sizeof(struct ipx_packet),msg->msg_iov,copied); + + if (err) + return err; + if(sipx) { sipx->sipx_family=AF_IPX; @@ -1756,31 +2192,10 @@ static int ipx_recvfrom(struct socket *sock, void *ubuf, int size, int noblock, sipx->sipx_network=ipx->ipx_source.net; sipx->sipx_type = ipx->ipx_type; } - skb_free_datagram(skb); - return(truesize); + skb_free_datagram(sk, skb); + return(copied); } -static int ipx_write(struct socket *sock, char *ubuf, int size, int noblock) -{ - return ipx_send(sock,ubuf,size,noblock,0); -} - - -static int ipx_recv(struct socket *sock, void *ubuf, int size , int noblock, - unsigned flags) -{ - ipx_socket *sk=(ipx_socket *)sock->data; - if(sk->zapped) - return -ENOTCONN; - return ipx_recvfrom(sock,ubuf,size,noblock,flags,NULL, NULL); -} - -static int ipx_read(struct socket *sock, char *ubuf, int size, int noblock) -{ - return ipx_recv(sock,ubuf,size,noblock,0); -} - - static int ipx_shutdown(struct socket *sk,int how) { return -EOPNOTSUPP; @@ -1795,32 +2210,23 @@ static int ipx_select(struct socket *sock , int sel_type, select_table *wait) static int 
ipx_ioctl(struct socket *sock,unsigned int cmd, unsigned long arg) { - int err; long amount=0; ipx_socket *sk=(ipx_socket *)sock->data; switch(cmd) { case TIOCOUTQ: - err=verify_area(VERIFY_WRITE,(void *)arg,sizeof(unsigned long)); - if(err) - return err; amount=sk->sndbuf-sk->wmem_alloc; if(amount<0) amount=0; - put_fs_long(amount,(unsigned long *)arg); - return 0; + return put_user(amount, (int *)arg); case TIOCINQ: { struct sk_buff *skb; /* These two are safe on a single CPU system as only user tasks fiddle here */ if((skb=skb_peek(&sk->receive_queue))!=NULL) - amount=skb->len; - err=verify_area(VERIFY_WRITE,(void *)arg,sizeof(unsigned long)); - if(err) - return err; - put_fs_long(amount,(unsigned long *)arg); - return 0; + amount=skb->len-sizeof(struct ipx_packet); + return put_user(amount, (int *)arg); } case SIOCADDRT: case SIOCDELRT: @@ -1828,31 +2234,29 @@ static int ipx_ioctl(struct socket *sock,unsigned int cmd, unsigned long arg) return -EPERM; return(ipxrtr_ioctl(cmd,(void *)arg)); case SIOCSIFADDR: - case SIOCGIFADDR: case SIOCAIPXITFCRT: case SIOCAIPXPRISLT: if(!suser()) return -EPERM; + case SIOCGIFADDR: return(ipxitf_ioctl(cmd,(void *)arg)); case SIOCIPXCFGDATA: { - err=verify_area(VERIFY_WRITE,(void *)arg, - sizeof(ipx_config_data)); - if(err) return err; return(ipxcfg_get_config_data((void *)arg)); } case SIOCGSTAMP: + { + int ret = -EINVAL; if (sk) { if(sk->stamp.tv_sec==0) return -ENOENT; - err=verify_area(VERIFY_WRITE,(void *)arg,sizeof(struct timeval)); - if(err) - return err; - memcpy_tofs((void *)arg,&sk->stamp,sizeof(struct timeval)); - return 0; + ret = copy_to_user((void *)arg,&sk->stamp,sizeof(struct timeval)); + if (ret) + ret = -EFAULT; } - return -EINVAL; + return 0; + } case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: @@ -1878,22 +2282,18 @@ static struct proto_ops ipx_proto_ops = { ipx_socketpair, ipx_accept, ipx_getname, - ipx_read, - ipx_write, ipx_select, ipx_ioctl, ipx_listen, - ipx_send, - ipx_recv, - ipx_sendto, 
- ipx_recvfrom, ipx_shutdown, ipx_setsockopt, ipx_getsockopt, ipx_fcntl, + ipx_sendmsg, + ipx_recvmsg }; -/* Called by ddi.c on kernel start up */ +/* Called by protocol.c on kernel start up */ static struct packet_type ipx_8023_packet_type = @@ -1904,7 +2304,7 @@ static struct packet_type ipx_8023_packet_type = NULL, NULL, }; - + static struct packet_type ipx_dix_packet_type = { 0, /* MUTTER ntohs(ETH_P_IPX),*/ @@ -1913,7 +2313,7 @@ static struct packet_type ipx_dix_packet_type = NULL, NULL, }; - + static struct notifier_block ipx_dev_notifier={ ipxitf_device_event, NULL, @@ -1923,12 +2323,30 @@ static struct notifier_block ipx_dev_notifier={ extern struct datalink_proto *make_EII_client(void); extern struct datalink_proto *make_8023_client(void); +extern void destroy_EII_client(struct datalink_proto *); +extern void destroy_8023_client(struct datalink_proto *); -void ipx_proto_init(struct net_proto *pro) -{ - unsigned char val = 0xE0; - unsigned char snapval[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 }; +struct proc_dir_entry ipx_procinfo = { + PROC_NET_IPX, 3, "ipx", S_IFREG | S_IRUGO, + 1, 0, 0, 0, &proc_net_inode_operations, ipx_get_info +}; +struct proc_dir_entry ipx_if_procinfo = { + PROC_NET_IPX_INTERFACE, 13, "ipx_interface", S_IFREG | S_IRUGO, + 1, 0, 0, 0, &proc_net_inode_operations, ipx_interface_get_info +}; + +struct proc_dir_entry ipx_rt_procinfo = { + PROC_NET_IPX_ROUTE, 9, "ipx_route", S_IFREG | S_IRUGO, + 1, 0, 0, 0, &proc_net_inode_operations, ipx_rt_get_info +}; + +static unsigned char ipx_8022_type = 0xE0; +static unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 }; + +void +ipx_proto_init(struct net_proto *pro) +{ (void) sock_register(ipx_proto_ops.family, &ipx_proto_ops); pEII_datalink = make_EII_client(); @@ -1939,15 +2357,91 @@ void ipx_proto_init(struct net_proto *pro) ipx_8023_packet_type.type=htons(ETH_P_802_3); dev_add_pack(&ipx_8023_packet_type); - if ((p8022_datalink = register_8022_client(val, ipx_rcv)) == NULL) - printk("IPX: Unable 
to register with 802.2\n"); + if ((p8022_datalink = register_8022_client(ipx_8022_type, ipx_rcv)) == NULL) + printk(KERN_CRIT "IPX: Unable to register with 802.2\n"); - if ((pSNAP_datalink = register_snap_client(snapval, ipx_rcv)) == NULL) - printk("IPX: Unable to register with SNAP\n"); + if ((p8022tr_datalink = register_8022tr_client(ipx_8022_type, ipx_rcv)) == NULL) + printk(KERN_CRIT "IPX: Unable to register with 802.2TR\n"); + + if ((pSNAP_datalink = register_snap_client(ipx_snap_id, ipx_rcv)) == NULL) + printk(KERN_CRIT "IPX: Unable to register with SNAP\n"); register_netdevice_notifier(&ipx_dev_notifier); +#ifdef CONFIG_PROC_FS + proc_net_register(&ipx_procinfo); + proc_net_register(&ipx_if_procinfo); + proc_net_register(&ipx_rt_procinfo); +#endif - printk("Swansea University Computer Society IPX 0.30 for NET3.029\n"); - printk("IPX Portions Copyright (c) 1995 Caldera, Inc.\n"); + printk(KERN_INFO "Swansea University Computer Society IPX 0.35 for NET3.037\n"); + printk(KERN_INFO "IPX Portions Copyright (c) 1995 Caldera, Inc.\n"); } -#endif + +#ifdef MODULE +/* Note on MOD_{INC,DEC}_USE_COUNT: + * + * Use counts are incremented/decremented when + * sockets are created/deleted. + * + * Routes are always associated with an interface, and + * allocs/frees will remain properly accounted for by + * their associated interfaces. + * + * Ergo, before the ipx module can be removed, all IPX + * sockets be closed from user space. 
+ */ + +static void +ipx_proto_finito(void) +{ ipx_interface *ifc; + + while (ipx_interfaces) { + ifc = ipx_interfaces; + ipx_interfaces = ifc->if_next; + ifc->if_next = NULL; + ipxitf_down(ifc); + } + +#ifdef CONFIG_PROC_FS + proc_net_unregister(PROC_NET_IPX_ROUTE); + proc_net_unregister(PROC_NET_IPX_INTERFACE); + proc_net_unregister(PROC_NET_IPX); +#endif + + unregister_netdevice_notifier(&ipx_dev_notifier); + + unregister_snap_client(ipx_snap_id); + pSNAP_datalink = NULL; + + unregister_8022tr_client(ipx_8022_type); + p8022tr_datalink = NULL; + + unregister_8022_client(ipx_8022_type); + p8022_datalink = NULL; + + dev_remove_pack(&ipx_8023_packet_type); + destroy_8023_client(p8023_datalink); + p8023_datalink = NULL; + + dev_remove_pack(&ipx_dix_packet_type); + destroy_EII_client(pEII_datalink); + pEII_datalink = NULL; + + (void) sock_unregister(ipx_proto_ops.family); + + return; +} + +int init_module(void) +{ + ipx_proto_init(NULL); + register_symtab(0); + return 0; +} + +void cleanup_module(void) +{ + ipx_proto_finito(); + return; +} +#endif /* def MODULE */ diff --git a/net/ipx/sysctl_net_ipx.c b/net/ipx/sysctl_net_ipx.c new file mode 100644 index 000000000..c699d6ff0 --- /dev/null +++ b/net/ipx/sysctl_net_ipx.c @@ -0,0 +1,13 @@ +/* -*- linux-c -*- + * sysctl_net_ipx.c: sysctl interface to net IPX subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/ipx directory entry (empty =) ). [MS] + */ + +#include <linux/mm.h> +#include <linux/sysctl.h> + +ctl_table ipx_table[] = { + {0} +}; diff --git a/net/netlink.c b/net/netlink.c new file mode 100644 index 000000000..355b35f79 --- /dev/null +++ b/net/netlink.c @@ -0,0 +1,264 @@ +/* + * SKIPLINK An implementation of a loadable kernel mode driver providing + * multiple kernel/user space bidirectional communications links. 
+ * + * Author: Alan Cox <alan@cymru.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/module.h> + +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/sched.h> +#include <linux/lp.h> +#include <linux/malloc.h> +#include <linux/ioport.h> +#include <linux/fcntl.h> +#include <linux/delay.h> +#include <linux/skbuff.h> + +#include <net/netlink.h> + +#include <asm/io.h> +#include <asm/uaccess.h> +#include <asm/system.h> + +static int (*netlink_handler[MAX_LINKS])(int minor, struct sk_buff *skb); +static struct sk_buff_head skb_queue_rd[MAX_LINKS]; +static int rdq_size[MAX_LINKS]; +static struct wait_queue *read_space_wait[MAX_LINKS]; + +static int active_map = 0; +static int open_map = 0; + +/* + * Device operations + */ + +/* + * Default write handler. + */ + +static int netlink_err(int minor, struct sk_buff *skb) +{ + kfree_skb(skb, FREE_READ); + return -EUNATCH; +} + +/* + * Exported do nothing receiver for one way + * interfaces. 
+ */ + +int netlink_donothing(int minor, struct sk_buff *skb) +{ + kfree_skb(skb, FREE_READ); + return -EINVAL; +} + +static int netlink_select(struct inode *inode, struct file *file, int sel_type, select_table * wait) +{ + unsigned int minor = MINOR(inode->i_rdev); + switch (sel_type) { + case SEL_IN: + if (skb_peek(&skb_queue_rd[minor])!=NULL) + return 1; + select_wait(&read_space_wait[minor], wait); + break; + case SEL_OUT: + return 1; + } + return 0; +} + +/* + * Write a message to the kernel side of a communication link + */ + +static long netlink_write(struct inode * inode, struct file * file, + const char * buf, unsigned long count) +{ + int err; + unsigned int minor = MINOR(inode->i_rdev); + struct sk_buff *skb; + skb=alloc_skb(count, GFP_KERNEL); + skb->free=1; + err = copy_from_user(skb_put(skb,count),buf, count); + return err ? -EFAULT : (netlink_handler[minor])(minor,skb); +} + +/* + * Read a message from the kernel side of the communication link + */ + +static long netlink_read(struct inode * inode, struct file * file, char * buf, + unsigned long count) +{ + int err; + unsigned int minor = MINOR(inode->i_rdev); + struct sk_buff *skb; + cli(); + while((skb=skb_dequeue(&skb_queue_rd[minor]))==NULL) + { + if(file->f_flags&O_NONBLOCK) + { + sti(); + return -EAGAIN; + } + interruptible_sleep_on(&read_space_wait[minor]); + if(current->signal & ~current->blocked) + { + sti(); + return -ERESTARTSYS; + } + } + rdq_size[minor]-=skb->len; + sti(); + if(skb->len<count) + count=skb->len; + err = copy_to_user(buf,skb->data,count); + kfree_skb(skb, FREE_READ); + return err ? 
-EFAULT : count; +} + +static loff_t netlink_lseek(struct inode * inode, struct file * file, + loff_t offset, int origin) +{ + return -ESPIPE; +} + +static int netlink_open(struct inode * inode, struct file * file) +{ + unsigned int minor = MINOR(inode->i_rdev); + + if(minor>=MAX_LINKS) + return -ENODEV; + if(open_map&(1<<minor)) + return -EBUSY; + if(active_map&(1<<minor)) + { + open_map|=(1<<minor); + MOD_INC_USE_COUNT; + return 0; + } + return -EUNATCH; +} + +static void netlink_release(struct inode * inode, struct file * file) +{ + unsigned int minor = MINOR(inode->i_rdev); + open_map&=~(1<<minor); + MOD_DEC_USE_COUNT; +} + + +static int netlink_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + unsigned int minor = MINOR(inode->i_rdev); + int retval = 0; + + if (minor >= MAX_LINKS) + return -ENODEV; + switch ( cmd ) { + default: + retval = -EINVAL; + } + return retval; +} + + +static struct file_operations netlink_fops = { + netlink_lseek, + netlink_read, + netlink_write, + NULL, /* netlink_readdir */ + netlink_select, + netlink_ioctl, + NULL, /* netlink_mmap */ + netlink_open, + netlink_release +}; + +/* + * We export these functions to other modules. They provide a + * complete set of kernel non-blocking support for message + * queueing. 
+ */ + +int netlink_attach(int unit, int (*function)(int minor, struct sk_buff *skb)) +{ + if(unit>=MAX_LINKS) + return -ENODEV; + if(active_map&(1<<unit)) + return -EBUSY; + active_map|=(1<<unit); + netlink_handler[unit]=function; + return 0; +} + +void netlink_detach(int unit) +{ + active_map&=~(1<<unit); + netlink_handler[unit]=netlink_err; +} + +int netlink_post(int unit, struct sk_buff *skb) +{ + unsigned long flags; + int ret=-EUNATCH; + if(open_map&(1<<unit)) + { + save_flags(flags); + cli(); + if(rdq_size[unit]+skb->len>MAX_QBYTES) + ret=-EAGAIN; + else + { + skb_queue_tail(&skb_queue_rd[unit], skb); + rdq_size[unit]+=skb->len; + ret=0; + wake_up_interruptible(&read_space_wait[unit]); + } + restore_flags(flags); + } + return ret; +} + +int init_netlink(void) +{ + int ct; + + if(register_chrdev(NETLINK_MAJOR,"netlink", &netlink_fops)) { + printk(KERN_ERR "netlink: unable to get major %d\n", NETLINK_MAJOR); + return -EIO; + } + for(ct=0;ct<MAX_LINKS;ct++) + { + skb_queue_head_init(&skb_queue_rd[ct]); + netlink_handler[ct]=netlink_err; + } + return 0; +} + +#ifdef MODULE + +int init_module(void) +{ + printk(KERN_INFO "Network Kernel/User communications module 0.04\n"); + return init_netlink(); +} + +void cleanup_module(void) +{ + unregister_chrdev(NET_MAJOR,"netlink"); +} + +#endif diff --git a/net/netrom/Makefile b/net/netrom/Makefile index d838c4da5..da5a1f429 100644 --- a/net/netrom/Makefile +++ b/net/netrom/Makefile @@ -1,5 +1,5 @@ # -# Makefile for the Linux TCP/IP (INET) layer. +# Makefile for the Linux NET/ROM layer. # # Note! Dependencies are done automagically by 'make dep', which also # removes any old dependencies. DON'T put your own dependencies here @@ -7,34 +7,11 @@ # # Note 2! The CFLAGS definition is now in the main makefile... 
-.c.o: - $(CC) $(CFLAGS) -c $< -.s.o: - $(AS) -o $*.o $< -.c.s: - $(CC) $(CFLAGS) -S $< +O_TARGET := netrom.o +O_OBJS := af_netrom.o sysctl_net_netrom.o nr_dev.o nr_in.o nr_out.o nr_route.o nr_subr.o nr_timer.o +M_OBJS := $(O_TARGET) - -OBJS := af_netrom.o - -ifdef CONFIG_AX25 - -OBJS := $(OBJS) nr_dev.o nr_in.o nr_out.o nr_route.o nr_subr.o nr_timer.o - -endif - -netrom.o: $(OBJS) - $(LD) -r -o netrom.o $(OBJS) - -dep: - $(CPP) -M *.c > .depend +include $(TOPDIR)/Rules.make tar: tar -cvf /dev/f1 . - -# -# include a dependency file if one exists -# -ifeq (.depend,$(wildcard .depend)) -include .depend -endif diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index ca1199be0..6eac7e8a8 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1,10 +1,10 @@ /* - * NET/ROM release 003 + * NET/ROM release 005 * * This is ALPHA test software. This code may break your machine, randomly fail to work with new * releases, misbehave and/or generally screw up. It might even work. * - * This code REQUIRES 1.3.0 or higher/ NET3.029 + * This code REQUIRES 2.1.0 or higher/ NET3.037 * * This module: * This module is free software; you can redistribute it and/or @@ -17,14 +17,22 @@ * NET/ROM 002 Darryl(G7LED) Fixes and address enhancement. * Jonathan(G4KLX) Complete bind re-think. * Alan(GW4PTS) Trivial tweaks into new format. - * - * To do: - * Fix non-blocking connect failure. - * Make it use normal SIOCADDRT/DELRT not funny node ioctl() calls. + * NET/ROM 003 Jonathan(G4KLX) Added G8BPQ extensions. + * Added NET/ROM routing ioctl. + * Darryl(G7LED) Fix autobinding (on connect). + * Fixed nr_release(), set TCP_CLOSE, wakeup app + * context, THEN make the sock dead. + * Circuit ID check before allocating it on + * a connection. + * Alan(GW4PTS) sendmsg/recvmsg only. Fixed connect clear bug + * inherited from AX.25 + * NET/ROM 004 Jonathan(G4KLX) Converted to module. 
+ * NET/ROM 005 Jonathan(G4KLX) Linux 2.1 + * Alan(GW4PTS) Started POSIXisms */ #include <linux/config.h> -#ifdef CONFIG_NETROM +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> @@ -35,13 +43,14 @@ #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> +#include <linux/stat.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <linux/fcntl.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ @@ -49,17 +58,22 @@ #include <linux/interrupt.h> #include <linux/notifier.h> #include <net/netrom.h> - +#include <linux/proc_fs.h> #include <net/ip.h> #include <net/arp.h> +#include <linux/if_arp.h> -/************************************************************************\ -* * -* Handlers for the socket list * -* * -\************************************************************************/ - -struct nr_parms_struct nr_default; +int sysctl_netrom_default_path_quality = NR_DEFAULT_QUAL; +int sysctl_netrom_obsolescence_count_initialiser = NR_DEFAULT_OBS; +int sysctl_netrom_network_ttl_initialiser = NR_DEFAULT_TTL; +int sysctl_netrom_transport_timeout = NR_DEFAULT_T1; +int sysctl_netrom_transport_maximum_tries = NR_DEFAULT_N2; +int sysctl_netrom_transport_acknowledge_delay = NR_DEFAULT_T2; +int sysctl_netrom_transport_busy_delay = NR_DEFAULT_T4; +int sysctl_netrom_transport_requested_window_size = NR_DEFAULT_WINDOW; +int sysctl_netrom_transport_no_activity_timeout = NR_DEFAULT_IDLE; +int sysctl_netrom_transport_packet_length = NR_DEFAULT_PACLEN; +int sysctl_netrom_routing_control = 1; static unsigned short circuit = 0x101; @@ -96,14 +110,37 @@ static void nr_remove_socket(struct sock *sk) } /* + * Kill all bound sockets on a dropped device. 
+ */ +static void nr_kill_by_device(struct device *dev) +{ + struct sock *s; + + for (s = nr_list; s != NULL; s = s->next) { + if (s->protinfo.nr->device == dev) { + s->protinfo.nr->state = NR_STATE_0; + s->protinfo.nr->device = NULL; + s->state = TCP_CLOSE; + s->err = ENETUNREACH; + s->shutdown |= SEND_SHUTDOWN; + s->state_change(s); + s->dead = 1; + } + } +} + +/* * Handle device status changes. */ -static int nr_device_event(unsigned long event, void *ptr) +static int nr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { + struct device *dev = (struct device *)ptr; + if (event != NETDEV_DOWN) return NOTIFY_DONE; - nr_rt_device_down(ptr); + nr_kill_by_device(dev); + nr_rt_device_down(dev); return NOTIFY_DONE; } @@ -128,7 +165,7 @@ static void nr_insert_socket(struct sock *sk) * Find a socket that wants to accept the Connect Request we just * received. */ -static struct sock *nr_find_listener(ax25_address *addr, int type) +static struct sock *nr_find_listener(ax25_address *addr) { unsigned long flags; struct sock *s; @@ -137,7 +174,7 @@ static struct sock *nr_find_listener(ax25_address *addr, int type) cli(); for (s = nr_list; s != NULL; s = s->next) { - if (ax25cmp(&s->nr->source_addr, addr) == 0 && s->type == type && s->state == TCP_LISTEN) { + if (ax25cmp(&s->protinfo.nr->source_addr, addr) == 0 && s->state == TCP_LISTEN) { restore_flags(flags); return s; } @@ -150,7 +187,7 @@ static struct sock *nr_find_listener(ax25_address *addr, int type) /* * Find a connected NET/ROM socket given my circuit IDs. 
*/ -static struct sock *nr_find_socket(unsigned char index, unsigned char id, int type) +static struct sock *nr_find_socket(unsigned char index, unsigned char id) { struct sock *s; unsigned long flags; @@ -159,7 +196,7 @@ static struct sock *nr_find_socket(unsigned char index, unsigned char id, int ty cli(); for (s = nr_list; s != NULL; s = s->next) { - if (s->nr->my_index == index && s->nr->my_id == id && s->type == type) { + if (s->protinfo.nr->my_index == index && s->protinfo.nr->my_id == id) { restore_flags(flags); return s; } @@ -173,7 +210,7 @@ static struct sock *nr_find_socket(unsigned char index, unsigned char id, int ty /* * Find a connected NET/ROM socket given their circuit IDs. */ -static struct sock *nr_find_peer(unsigned char index, unsigned char id, int type) +static struct sock *nr_find_peer(unsigned char index, unsigned char id) { struct sock *s; unsigned long flags; @@ -182,7 +219,7 @@ static struct sock *nr_find_peer(unsigned char index, unsigned char id, int type cli(); for (s = nr_list; s != NULL; s = s->next) { - if (s->nr->your_index == index && s->nr->your_id == id && s->type == type) { + if (s->protinfo.nr->your_index == index && s->protinfo.nr->your_id == id) { restore_flags(flags); return s; } @@ -196,7 +233,7 @@ static struct sock *nr_find_peer(unsigned char index, unsigned char id, int type /* * Deferred destroy. */ -void nr_destory_socket(struct sock *); +void nr_destroy_socket(struct sock *); /* * Handler for deferred kills. @@ -212,7 +249,7 @@ static void nr_destroy_timer(unsigned long data) * Once it is removed from the queue no interrupt or bottom half will * touch it and we are (fairly 8-) ) safe. 
*/ -void nr_destroy_socket(struct sock *sk) /* Not static as its used by the timer */ +void nr_destroy_socket(struct sock *sk) /* Not static as it's used by the timer */ { struct sk_buff *skb; unsigned long flags; @@ -223,13 +260,13 @@ void nr_destroy_socket(struct sock *sk) /* Not static as its used by the timer * del_timer(&sk->timer); nr_remove_socket(sk); - nr_clear_tx_queue(sk); /* Flush the send queue */ + nr_clear_queues(sk); /* Flush the queues */ while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) { if (skb->sk != sk) { /* A pending connection */ skb->sk->dead = 1; /* Queue the unaccepted socket for death */ nr_set_timer(skb->sk); - skb->sk->nr->state = NR_STATE_0; + skb->sk->protinfo.nr->state = NR_STATE_0; } kfree_skb(skb, FREE_READ); @@ -237,31 +274,123 @@ void nr_destroy_socket(struct sock *sk) /* Not static as its used by the timer * if (sk->wmem_alloc || sk->rmem_alloc) { /* Defer: outstanding buffers */ init_timer(&sk->timer); - sk->timer.expires = 10 * HZ; + sk->timer.expires = jiffies + 10 * HZ; sk->timer.function = nr_destroy_timer; sk->timer.data = (unsigned long)sk; add_timer(&sk->timer); } else { - kfree_s(sk->nr, sizeof(*sk->nr)); - kfree_s(sk, sizeof(*sk)); + kfree_s(sk->protinfo.nr, sizeof(*sk->protinfo.nr)); + sk_free(sk); } restore_flags(flags); } -/*******************************************************************************************************************\ -* * -* Handling for system calls applied via the various interfaces to a NET/ROM socket object * -* * -\*******************************************************************************************************************/ +/* + * Handling for system calls applied via the various interfaces to a + * NET/ROM socket object. 
+ */ static int nr_fcntl(struct socket *sock, unsigned int cmd, unsigned long arg) { - switch(cmd) - { - default: - return(-EINVAL); - } + return -EINVAL; +} + +/* + * dl1bke 960311: set parameters for existing NET/ROM connections, + * includes a KILL command to abort any connection. + * VERY useful for debugging ;-) + */ +static int nr_ctl_ioctl(const unsigned int cmd, void *arg) +{ + struct nr_ctl_struct nr_ctl; + struct sock *sk; + unsigned long flags; + int err; + + if ((err = verify_area(VERIFY_READ, arg, sizeof(nr_ctl))) != 0) + return err; + + copy_from_user(&nr_ctl, arg, sizeof(nr_ctl)); + + if ((sk = nr_find_socket(nr_ctl.index, nr_ctl.id)) == NULL) + return -ENOTCONN; + + switch (nr_ctl.cmd) { + case NETROM_KILL: + nr_clear_queues(sk); + nr_write_internal(sk, NR_DISCREQ); + sk->protinfo.nr->state = NR_STATE_0; + sk->state = TCP_CLOSE; + sk->err = ENETRESET; + sk->shutdown |= SEND_SHUTDOWN; + if (!sk->dead) + sk->state_change(sk); + sk->dead = 1; + nr_set_timer(sk); + break; + + case NETROM_T1: + if (nr_ctl.arg < 1) + return -EINVAL; + sk->protinfo.nr->rtt = (nr_ctl.arg * PR_SLOWHZ) / 2; + sk->protinfo.nr->t1 = nr_ctl.arg * PR_SLOWHZ; + save_flags(flags); cli(); + if (sk->protinfo.nr->t1timer > sk->protinfo.nr->t1) + sk->protinfo.nr->t1timer = sk->protinfo.nr->t1; + restore_flags(flags); + break; + + case NETROM_T2: + if (nr_ctl.arg < 1) + return -EINVAL; + save_flags(flags); cli(); + sk->protinfo.nr->t2 = nr_ctl.arg * PR_SLOWHZ; + if (sk->protinfo.nr->t2timer > sk->protinfo.nr->t2) + sk->protinfo.nr->t2timer = sk->protinfo.nr->t2; + restore_flags(flags); + break; + + case NETROM_N2: + if (nr_ctl.arg < 1 || nr_ctl.arg > 10) + return -EINVAL; + sk->protinfo.nr->n2count = 0; + sk->protinfo.nr->n2 = nr_ctl.arg; + break; + + case NETROM_T4: + if (nr_ctl.arg < 1) + return -EINVAL; + save_flags(flags); cli(); + sk->protinfo.nr->t4 = nr_ctl.arg * PR_SLOWHZ; + if (sk->protinfo.nr->t4timer > sk->protinfo.nr->t4) + sk->protinfo.nr->t4timer = sk->protinfo.nr->t4; + 
restore_flags(flags); + break; + + case NETROM_IDLE: + if (nr_ctl.arg < 1) + return -EINVAL; + save_flags(flags); cli(); + sk->protinfo.nr->idle = nr_ctl.arg * 60 * PR_SLOWHZ; + if (sk->protinfo.nr->idletimer > sk->protinfo.nr->idle) + sk->protinfo.nr->idletimer = sk->protinfo.nr->idle; + restore_flags(flags); + break; + + case NETROM_PACLEN: + if (nr_ctl.arg < 16 || nr_ctl.arg > 65535) + return -EINVAL; + if (nr_ctl.arg > 236) /* we probably want this */ + printk(KERN_WARNING "nr_ctl_ioctl: Warning --- huge paclen %d\n", (int)nr_ctl.arg); + sk->protinfo.nr->paclen = nr_ctl.arg; + break; + + default: + return -EINVAL; + } + + return 0; } static int nr_setsockopt(struct socket *sock, int level, int optname, @@ -284,25 +413,47 @@ static int nr_setsockopt(struct socket *sock, int level, int optname, if ((err = verify_area(VERIFY_READ, optval, sizeof(int))) != 0) return err; - opt = get_fs_long((unsigned long *)optval); + get_user(opt, (int *)optval); switch (optname) { case NETROM_T1: if (opt < 1) return -EINVAL; - sk->nr->t1 = opt * PR_SLOWHZ; + sk->protinfo.nr->rtt = (opt * PR_SLOWHZ) / 2; return 0; case NETROM_T2: if (opt < 1) return -EINVAL; - sk->nr->t2 = opt * PR_SLOWHZ; + sk->protinfo.nr->t2 = opt * PR_SLOWHZ; return 0; case NETROM_N2: if (opt < 1 || opt > 31) return -EINVAL; - sk->nr->n2 = opt; + sk->protinfo.nr->n2 = opt; + return 0; + + case NETROM_T4: + if (opt < 1) + return -EINVAL; + sk->protinfo.nr->t4 = opt * PR_SLOWHZ; + return 0; + + case NETROM_IDLE: + if (opt < 1) + return -EINVAL; + sk->protinfo.nr->idle = opt * 60 * PR_SLOWHZ; + return 0; + + case NETROM_HDRINCL: + sk->protinfo.nr->hdrincl = opt ? 
1 : 0; + return 0; + + case NETROM_PACLEN: + if (opt < 1 || opt > 65536) + return -EINVAL; + sk->protinfo.nr->paclen = opt; return 0; default: @@ -327,17 +478,33 @@ static int nr_getsockopt(struct socket *sock, int level, int optname, switch (optname) { case NETROM_T1: - val = sk->nr->t1 / PR_SLOWHZ; + val = (sk->protinfo.nr->t1 * 2) / PR_SLOWHZ; break; case NETROM_T2: - val = sk->nr->t2 / PR_SLOWHZ; + val = sk->protinfo.nr->t2 / PR_SLOWHZ; break; case NETROM_N2: - val = sk->nr->n2; + val = sk->protinfo.nr->n2; break; + case NETROM_T4: + val = sk->protinfo.nr->t4 / PR_SLOWHZ; + break; + + case NETROM_IDLE: + val = sk->protinfo.nr->idle / (PR_SLOWHZ * 60); + break; + + case NETROM_HDRINCL: + val = sk->protinfo.nr->hdrincl; + break; + + case NETROM_PACLEN: + val = sk->protinfo.nr->paclen; + break; + default: return -ENOPROTOOPT; } @@ -345,12 +512,12 @@ static int nr_getsockopt(struct socket *sock, int level, int optname, if ((err = verify_area(VERIFY_WRITE, optlen, sizeof(int))) != 0) return err; - put_fs_long(sizeof(int), (unsigned long *)optlen); + put_user(sizeof(int), optlen); if ((err = verify_area(VERIFY_WRITE, optval, sizeof(int))) != 0) return err; - put_fs_long(val, (unsigned long *)optval); + put_user(val, (int *)optval); return 0; } @@ -359,8 +526,8 @@ static int nr_listen(struct socket *sock, int backlog) { struct sock *sk = (struct sock *)sock->data; - if (sk->type == SOCK_SEQPACKET && sk->state != TCP_LISTEN) { - memset(&sk->nr->user_addr, '\0', sizeof(ax25_address)); + if (sk->state != TCP_LISTEN) { + memset(&sk->protinfo.nr->user_addr, '\0', AX25_ADDR_LEN); sk->max_ack_backlog = backlog; sk->state = TCP_LISTEN; return 0; @@ -386,25 +553,17 @@ static int nr_create(struct socket *sock, int protocol) struct sock *sk; nr_cb *nr; - if ((sk = (struct sock *)kmalloc(sizeof(*sk), GFP_ATOMIC)) == NULL) + if (sock->type != SOCK_SEQPACKET || protocol != 0) + return -ESOCKTNOSUPPORT; + + if ((sk = sk_alloc(GFP_ATOMIC)) == NULL) return -ENOMEM; if ((nr = (nr_cb 
*)kmalloc(sizeof(*nr), GFP_ATOMIC)) == NULL) { - kfree_s(sk, sizeof(*sk)); + sk_free(sk); return -ENOMEM; } - sk->type = sock->type; - - switch (sock->type) { - case SOCK_SEQPACKET: - break; - default: - kfree_s((void *)sk, sizeof(*sk)); - kfree_s((void *)nr, sizeof(*nr)); - return -ESOCKTNOSUPPORT; - } - skb_queue_head_init(&sk->receive_queue); skb_queue_head_init(&sk->write_queue); skb_queue_head_init(&sk->back_log); @@ -412,27 +571,16 @@ static int nr_create(struct socket *sock, int protocol) init_timer(&sk->timer); sk->socket = sock; + sk->type = sock->type; sk->protocol = protocol; - sk->dead = 0; - sk->next = NULL; - sk->broadcast = 0; + sk->allocation = GFP_KERNEL; sk->rcvbuf = SK_RMEM_MAX; sk->sndbuf = SK_WMEM_MAX; - sk->wmem_alloc = 0; - sk->rmem_alloc = 0; - sk->inuse = 0; - sk->debug = 0; - sk->prot = NULL; /* So we use default free mechanisms */ - sk->err = 0; - sk->localroute = 0; - sk->send_head = NULL; sk->state = TCP_CLOSE; - sk->shutdown = 0; sk->priority = SOPRI_NORMAL; - sk->ack_backlog = 0; sk->mtu = NETROM_MTU; /* 236 */ sk->zapped = 1; - sk->window = nr_default.window; + sk->window = sysctl_netrom_transport_requested_window_size; sk->state_change = def_callback1; sk->data_ready = def_callback2; @@ -446,18 +594,23 @@ static int nr_create(struct socket *sock, int protocol) skb_queue_head_init(&nr->ack_queue); skb_queue_head_init(&nr->reseq_queue); + skb_queue_head_init(&nr->frag_queue); nr->my_index = 0; nr->my_id = 0; - nr->rtt = nr_default.timeout; - nr->t1 = nr_default.timeout; - nr->t2 = nr_default.ack_delay; - nr->n2 = nr_default.tries; - - nr->t1timer = 0; - nr->t2timer = 0; - nr->t4timer = 0; - nr->n2count = 0; + nr->rtt = sysctl_netrom_transport_timeout / 2; + nr->t1 = sysctl_netrom_transport_timeout; + nr->t2 = sysctl_netrom_transport_acknowledge_delay; + nr->n2 = sysctl_netrom_transport_maximum_tries; + nr->t4 = sysctl_netrom_transport_busy_delay; + nr->idle = sysctl_netrom_transport_no_activity_timeout; + nr->paclen = 
sysctl_netrom_transport_packet_length; + + nr->t1timer = 0; + nr->t2timer = 0; + nr->t4timer = 0; + nr->idletimer = 0; + nr->n2count = 0; nr->va = 0; nr->vr = 0; @@ -470,14 +623,18 @@ static int nr_create(struct socket *sock, int protocol) nr->my_index = 0; nr->my_id = 0; + nr->bpqext = 1; + nr->fraglen = 0; + nr->hdrincl = 0; nr->state = NR_STATE_0; + nr->device = NULL; - memset(&nr->source_addr, '\0', sizeof(ax25_address)); - memset(&nr->user_addr, '\0', sizeof(ax25_address)); - memset(&nr->dest_addr, '\0', sizeof(ax25_address)); + memset(&nr->source_addr, '\0', AX25_ADDR_LEN); + memset(&nr->user_addr, '\0', AX25_ADDR_LEN); + memset(&nr->dest_addr, '\0', AX25_ADDR_LEN); - nr->sk = sk; - sk->nr = nr; + nr->sk = sk; + sk->protinfo.nr = nr; return 0; } @@ -487,24 +644,15 @@ static struct sock *nr_make_new(struct sock *osk) struct sock *sk; nr_cb *nr; - if ((sk = (struct sock *)kmalloc(sizeof(*sk), GFP_ATOMIC)) == NULL) + if (osk->type != SOCK_SEQPACKET) return NULL; - if ((nr = (nr_cb *)kmalloc(sizeof(*nr), GFP_ATOMIC)) == NULL) { - kfree_s(sk, sizeof(*sk)); + if ((sk = (struct sock *)sk_alloc(GFP_ATOMIC)) == NULL) return NULL; - } - sk->type = osk->type; - sk->socket = osk->socket; - - switch (osk->type) { - case SOCK_SEQPACKET: - break; - default: - kfree_s((void *)sk, sizeof(*sk)); - kfree_s((void *)nr, sizeof(*nr)); - return NULL; + if ((nr = (nr_cb *)kmalloc(sizeof(*nr), GFP_ATOMIC)) == NULL) { + sk_free(sk); + return NULL; } skb_queue_head_init(&sk->receive_queue); @@ -513,26 +661,15 @@ static struct sock *nr_make_new(struct sock *osk) init_timer(&sk->timer); - sk->rmem_alloc = 0; - sk->dead = 0; - sk->next = NULL; + sk->type = osk->type; + sk->socket = osk->socket; sk->priority = osk->priority; - sk->broadcast = 0; sk->protocol = osk->protocol; sk->rcvbuf = osk->rcvbuf; sk->sndbuf = osk->sndbuf; - sk->wmem_alloc = 0; - sk->rmem_alloc = 0; - sk->inuse = 0; - sk->ack_backlog = 0; - sk->prot = NULL; /* So we use default free mechanisms */ - sk->err = 0; - 
sk->localroute = 0; - sk->send_head = NULL; sk->debug = osk->debug; sk->state = TCP_ESTABLISHED; sk->window = osk->window; - sk->shutdown = 0; sk->mtu = osk->mtu; sk->sleep = osk->sleep; sk->zapped = osk->zapped; @@ -544,24 +681,34 @@ static struct sock *nr_make_new(struct sock *osk) skb_queue_head_init(&nr->ack_queue); skb_queue_head_init(&nr->reseq_queue); - - nr->rtt = osk->nr->rtt; - nr->t1 = osk->nr->t1; - nr->t2 = osk->nr->t2; - nr->n2 = osk->nr->n2; - - nr->t1timer = 0; - nr->t2timer = 0; - nr->t4timer = 0; - nr->n2count = 0; + skb_queue_head_init(&nr->frag_queue); + + nr->rtt = osk->protinfo.nr->rtt; + nr->t1 = osk->protinfo.nr->t1; + nr->t2 = osk->protinfo.nr->t2; + nr->n2 = osk->protinfo.nr->n2; + nr->t4 = osk->protinfo.nr->t4; + nr->idle = osk->protinfo.nr->idle; + nr->paclen = osk->protinfo.nr->paclen; + + nr->device = osk->protinfo.nr->device; + nr->bpqext = osk->protinfo.nr->bpqext; + nr->hdrincl = osk->protinfo.nr->hdrincl; + nr->fraglen = 0; + + nr->t1timer = 0; + nr->t2timer = 0; + nr->t4timer = 0; + nr->idletimer = 0; + nr->n2count = 0; nr->va = 0; nr->vr = 0; nr->vs = 0; nr->vl = 0; - sk->nr = nr; - nr->sk = sk; + sk->protinfo.nr = nr; + nr->sk = sk; return sk; } @@ -579,51 +726,56 @@ static int nr_release(struct socket *sock, struct socket *peer) if (sk == NULL) return 0; - if (sk->type == SOCK_SEQPACKET) { - switch (sk->nr->state) { - case NR_STATE_0: - sk->dead = 1; - sk->state_change(sk); - nr_destroy_socket(sk); - break; + switch (sk->protinfo.nr->state) { - case NR_STATE_1: - sk->nr->state = NR_STATE_0; - sk->dead = 1; - sk->state_change(sk); - nr_destroy_socket(sk); - break; + case NR_STATE_0: + sk->state = TCP_CLOSE; + sk->shutdown |= SEND_SHUTDOWN; + sk->state_change(sk); + sk->dead = 1; + nr_destroy_socket(sk); + break; - case NR_STATE_2: - nr_write_internal(sk, NR_DISCACK); - sk->nr->state = NR_STATE_0; - sk->dead = 1; - sk->state_change(sk); - nr_destroy_socket(sk); - break; - - case NR_STATE_3: - nr_clear_tx_queue(sk); - 
sk->nr->n2count = 0; - nr_write_internal(sk, NR_DISCREQ); - sk->nr->t1timer = sk->nr->t1 = nr_calculate_t1(sk); - sk->nr->t2timer = 0; - sk->nr->t4timer = 0; - sk->nr->state = NR_STATE_2; - sk->state_change(sk); - sk->dead = 1; - break; + case NR_STATE_1: + sk->protinfo.nr->state = NR_STATE_0; + sk->state = TCP_CLOSE; + sk->shutdown |= SEND_SHUTDOWN; + sk->state_change(sk); + sk->dead = 1; + nr_destroy_socket(sk); + break; - default: - break; - } - } else { - sk->dead = 1; - sk->state_change(sk); - nr_destroy_socket(sk); + case NR_STATE_2: + nr_write_internal(sk, NR_DISCACK); + sk->protinfo.nr->state = NR_STATE_0; + sk->state = TCP_CLOSE; + sk->shutdown = SEND_SHUTDOWN; + sk->state_change(sk); + sk->dead = 1; + nr_destroy_socket(sk); + break; + + case NR_STATE_3: + nr_clear_queues(sk); + sk->protinfo.nr->n2count = 0; + nr_write_internal(sk, NR_DISCREQ); + sk->protinfo.nr->t1timer = sk->protinfo.nr->t1 = nr_calculate_t1(sk); + sk->protinfo.nr->t2timer = 0; + sk->protinfo.nr->t4timer = 0; + sk->protinfo.nr->state = NR_STATE_2; + sk->state = TCP_CLOSE; + sk->shutdown |= SEND_SHUTDOWN; + sk->state_change(sk); + sk->dead = 1; + sk->destroy = 1; + break; + + default: + break; } sock->data = NULL; + sk->socket = NULL; /* Not used, but we should do this. 
**/ return 0; } @@ -632,25 +784,18 @@ static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk; struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr; + struct device *dev; ax25_address *user, *source; sk = (struct sock *)sock->data; if (sk->zapped == 0) - return -EIO; + return -EINVAL; if (addr_len != sizeof(struct sockaddr_ax25) && addr_len != sizeof(struct full_sockaddr_ax25)) return -EINVAL; -#ifdef DONTDO - if (nr_find_listener(&addr->fsa_ax25.sax25_call, sk->type) != NULL) { - if (sk->debug) - printk("NET/ROM: bind failed: in use\n"); - return -EADDRINUSE; - } -#endif - - if (nr_dev_get(&addr->fsa_ax25.sax25_call) == NULL) { + if ((dev = nr_dev_get(&addr->fsa_ax25.sax25_call)) == NULL) { if (sk->debug) printk("NET/ROM: bind failed: invalid node callsign\n"); return -EADDRNOTAVAIL; @@ -661,9 +806,9 @@ static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) */ if (addr->fsa_ax25.sax25_ndigis == 1) { if (!suser()) - return -EPERM; - memcpy(&sk->nr->user_addr, &addr->fsa_digipeater[0], sizeof(ax25_address)); - memcpy(&sk->nr->source_addr, &addr->fsa_ax25.sax25_call, sizeof(ax25_address)); + return -EACCES; + sk->protinfo.nr->user_addr = addr->fsa_digipeater[0]; + sk->protinfo.nr->source_addr = addr->fsa_ax25.sax25_call; } else { source = &addr->fsa_ax25.sax25_call; @@ -673,10 +818,11 @@ static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) user = source; } - memcpy(&sk->nr->user_addr, user, sizeof(ax25_address)); - memcpy(&sk->nr->source_addr, source, sizeof(ax25_address)); + sk->protinfo.nr->user_addr = *user; + sk->protinfo.nr->source_addr = *source; } + sk->protinfo.nr->device = dev; nr_insert_socket(sk); sk->zapped = 0; @@ -705,21 +851,21 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr, return -ECONNREFUSED; } - if (sk->state == TCP_ESTABLISHED && sk->type == SOCK_SEQPACKET) + if (sk->state == TCP_ESTABLISHED) return -EISCONN; /* No 
reconnect on a seqpacket socket */ sk->state = TCP_CLOSE; sock->state = SS_UNCONNECTED; - if (addr_len != sizeof(struct sockaddr_ax25)) + if (addr_len != sizeof(struct sockaddr_ax25) && addr_len != sizeof(struct full_sockaddr_ax25)) return -EINVAL; - if ((dev = nr_dev_first()) == NULL) - return -ENETUNREACH; - if (sk->zapped) { /* Must bind first - autobinding in this may or may not work */ sk->zapped = 0; + if ((dev = nr_dev_first()) == NULL) + return -ENETUNREACH; + source = (ax25_address *)dev->dev_addr; if ((user = ax25_findbyuid(current->euid)) == NULL) { @@ -728,24 +874,28 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr, user = source; } - memcpy(&sk->nr->user_addr, user, sizeof(ax25_address)); - memcpy(&sk->nr->source_addr, source, sizeof(ax25_address)); + sk->protinfo.nr->user_addr = *user; + sk->protinfo.nr->source_addr = *source; + sk->protinfo.nr->device = dev; nr_insert_socket(sk); /* Finish the bind */ } - - memcpy(&sk->nr->dest_addr, &addr->sax25_call, sizeof(ax25_address)); - sk->nr->my_index = circuit / 256; - sk->nr->my_id = circuit % 256; + sk->protinfo.nr->dest_addr = addr->sax25_call; + + while (nr_find_socket((unsigned char)circuit / 256, (unsigned char)circuit % 256) != NULL) + circuit++; + + sk->protinfo.nr->my_index = circuit / 256; + sk->protinfo.nr->my_id = circuit % 256; circuit++; /* Move to connecting socket, start sending Connect Requests */ - sock->state = SS_CONNECTING; - sk->state = TCP_SYN_SENT; + sock->state = SS_CONNECTING; + sk->state = TCP_SYN_SENT; nr_establish_data_link(sk); - sk->nr->state = NR_STATE_1; + sk->protinfo.nr->state = NR_STATE_1; nr_set_timer(sk); /* Now the loop */ @@ -768,7 +918,7 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr, if (sk->state != TCP_ESTABLISHED) { sti(); sock->state = SS_UNCONNECTED; - return -sk->err; /* Always set at this point */ + return sock_error(sk); /* Always set at this point */ } sock->state = SS_CONNECTED; @@ -790,7 +940,7 @@ static int 
nr_accept(struct socket *sock, struct socket *newsock, int flags) struct sk_buff *skb; if (newsock->data) - kfree_s(newsock->data, sizeof(struct sock)); + sk_free(newsock->data); newsock->data = NULL; @@ -802,8 +952,10 @@ static int nr_accept(struct socket *sock, struct socket *newsock, int flags) if (sk->state != TCP_LISTEN) return -EINVAL; - /* The write queue this time is holding sockets ready to use - hooked into the SABM we saved */ + /* + * The write queue this time is holding sockets ready to use + * hooked into the SABM we saved + */ do { cli(); if ((skb = skb_dequeue(&sk->receive_queue)) == NULL) { @@ -845,13 +997,13 @@ static int nr_getname(struct socket *sock, struct sockaddr *uaddr, return -ENOTCONN; sax->fsa_ax25.sax25_family = AF_NETROM; sax->fsa_ax25.sax25_ndigis = 1; - memcpy(&sax->fsa_ax25.sax25_call, &sk->nr->user_addr, sizeof(ax25_address)); - memcpy(&sax->fsa_digipeater[0], &sk->nr->dest_addr, sizeof(ax25_address)); - *uaddr_len = sizeof(struct sockaddr_ax25) + sizeof(ax25_address); + sax->fsa_ax25.sax25_call = sk->protinfo.nr->user_addr; + sax->fsa_digipeater[0] = sk->protinfo.nr->dest_addr; + *uaddr_len = sizeof(struct full_sockaddr_ax25); } else { sax->fsa_ax25.sax25_family = AF_NETROM; sax->fsa_ax25.sax25_ndigis = 0; - memcpy(&sax->fsa_ax25.sax25_call, &sk->nr->source_addr, sizeof(ax25_address)); + sax->fsa_ax25.sax25_call = sk->protinfo.nr->source_addr; *uaddr_len = sizeof(struct sockaddr_ax25); } @@ -864,36 +1016,45 @@ int nr_rx_frame(struct sk_buff *skb, struct device *dev) struct sock *make; ax25_address *src, *dest, *user; unsigned short circuit_index, circuit_id; - unsigned short frametype, window; + unsigned short frametype, window, timeout; - skb->sk = NULL; /* Initially we don't know who its for */ - - src = (ax25_address *)(skb->data + 17); - dest = (ax25_address *)(skb->data + 24); + skb->sk = NULL; /* Initially we don't know who it's for */ + + /* + * skb->data points to the netrom frame start + */ + + src = (ax25_address 
*)(skb->data + 0); + dest = (ax25_address *)(skb->data + 7); - circuit_index = skb->data[32]; - circuit_id = skb->data[33]; - frametype = skb->data[36]; + circuit_index = skb->data[15]; + circuit_id = skb->data[16]; + frametype = skb->data[19]; #ifdef CONFIG_INET /* * Check for an incoming IP over NET/ROM frame. */ if ((frametype & 0x0F) == NR_PROTOEXT && circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) { - skb->h.raw = skb->data + 37; + skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN); + skb->h.raw = skb->data; return nr_rx_ip(skb, dev); } #endif /* - * Find an existing socket connection, based on circuit ID, if its + * Find an existing socket connection, based on circuit ID, if it's * a Connect Request base it on their circuit ID. */ - if (((frametype & 0x0F) != NR_CONNREQ && (sk = nr_find_socket(circuit_index, circuit_id, SOCK_SEQPACKET)) != NULL) || - ((frametype & 0x0F) == NR_CONNREQ && (sk = nr_find_peer(circuit_index, circuit_id, SOCK_SEQPACKET)) != NULL)) { - skb->h.raw = skb->data + 37; - skb->len -= 20; + if (((frametype & 0x0F) != NR_CONNREQ && (sk = nr_find_socket(circuit_index, circuit_id)) != NULL) || + ((frametype & 0x0F) == NR_CONNREQ && (sk = nr_find_peer(circuit_index, circuit_id)) != NULL)) { + skb->h.raw = skb->data; + + if ((frametype & 0x0F) == NR_CONNACK && skb->len == 22) + sk->protinfo.nr->bpqext = 1; + else + sk->protinfo.nr->bpqext = 0; return nr_process_rx_frame(sk, skb); } @@ -901,29 +1062,30 @@ int nr_rx_frame(struct sk_buff *skb, struct device *dev) if ((frametype & 0x0F) != NR_CONNREQ) return 0; - sk = nr_find_listener(dest, SOCK_SEQPACKET); + sk = nr_find_listener(dest); + + user = (ax25_address *)(skb->data + 21); if (sk == NULL || sk->ack_backlog == sk->max_ack_backlog || (make = nr_make_new(sk)) == NULL) { nr_transmit_dm(skb); return 0; } - user = (ax25_address *)(skb->data + 38); - window = skb->data[37]; + window = skb->data[20]; skb->sk = make; make->state = TCP_ESTABLISHED; /* Fill in his circuit details */ - 
memcpy(&make->nr->source_addr, dest, sizeof(ax25_address)); - memcpy(&make->nr->dest_addr, src, sizeof(ax25_address)); - memcpy(&make->nr->user_addr, user, sizeof(ax25_address)); - - make->nr->your_index = circuit_index; - make->nr->your_id = circuit_id; + make->protinfo.nr->source_addr = *dest; + make->protinfo.nr->dest_addr = *src; + make->protinfo.nr->user_addr = *user; + + make->protinfo.nr->your_index = circuit_index; + make->protinfo.nr->your_id = circuit_id; - make->nr->my_index = circuit / 256; - make->nr->my_id = circuit % 256; + make->protinfo.nr->my_index = circuit / 256; + make->protinfo.nr->my_id = circuit % 256; circuit++; @@ -931,14 +1093,24 @@ int nr_rx_frame(struct sk_buff *skb, struct device *dev) if (window < make->window) make->window = window; + /* L4 timeout negotiation */ + if (skb->len == 37) { + timeout = skb->data[36] * 256 + skb->data[35]; + if (timeout * PR_SLOWHZ < make->protinfo.nr->rtt * 2) + make->protinfo.nr->rtt = (timeout * PR_SLOWHZ) / 2; + make->protinfo.nr->bpqext = 1; + } else { + make->protinfo.nr->bpqext = 0; + } + nr_write_internal(make, NR_CONNACK); - make->nr->condition = 0x00; - make->nr->vs = 0; - make->nr->va = 0; - make->nr->vr = 0; - make->nr->vl = 0; - make->nr->state = NR_STATE_3; + make->protinfo.nr->condition = 0x00; + make->protinfo.nr->vs = 0; + make->protinfo.nr->va = 0; + make->protinfo.nr->vr = 0; + make->protinfo.nr->vl = 0; + make->protinfo.nr->state = NR_STATE_3; sk->ack_backlog++; make->pair = sk; @@ -954,34 +1126,38 @@ int nr_rx_frame(struct sk_buff *skb, struct device *dev) return 1; } -static int nr_sendto(struct socket *sock, void *ubuf, int len, int noblock, - unsigned flags, struct sockaddr *usip, int addr_len) +static int nr_sendmsg(struct socket *sock, struct msghdr *msg, int len, int noblock, int flags) { struct sock *sk = (struct sock *)sock->data; - struct sockaddr_ax25 *usax = (struct sockaddr_ax25 *)usip; + struct sockaddr_ax25 *usax = (struct sockaddr_ax25 *)msg->msg_name; int err; struct 
sockaddr_ax25 sax; struct sk_buff *skb; unsigned char *asmptr; int size; - if (sk->err) { - err = sk->err; - sk->err = 0; - return -err; - } + if (sk->err) + return sock_error(sk); if (flags) return -EINVAL; if (sk->zapped) return -EADDRNOTAVAIL; + + if (sk->shutdown & SEND_SHUTDOWN) { + send_sig(SIGPIPE, current, 0); + return -EPIPE; + } + + if (sk->protinfo.nr->device == NULL) + return -ENETUNREACH; if (usax) { - if (addr_len < sizeof(sax)) + if (msg->msg_namelen < sizeof(sax)) return -EINVAL; - memcpy(&sax, usax, sizeof(sax)); - if (sk->type == SOCK_SEQPACKET && memcmp(&sk->nr->dest_addr, &sax.sax25_call, sizeof(ax25_address)) != 0) + sax = *usax; + if (ax25cmp(&sk->protinfo.nr->dest_addr, &sax.sax25_call) != 0) return -EISCONN; if (sax.sax25_family != AF_NETROM) return -EINVAL; @@ -989,7 +1165,7 @@ static int nr_sendto(struct socket *sock, void *ubuf, int len, int noblock, if (sk->state != TCP_ESTABLISHED) return -ENOTCONN; sax.sax25_family = AF_NETROM; - memcpy(&sax.sax25_call, &sk->nr->dest_addr, sizeof(ax25_address)); + sax.sax25_call = sk->protinfo.nr->dest_addr; } if (sk->debug) @@ -999,43 +1175,30 @@ static int nr_sendto(struct socket *sock, void *ubuf, int len, int noblock, if (sk->debug) printk("NET/ROM: sendto: building packet.\n"); - size = len + 37; + size = len + AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + NR_NETWORK_LEN + NR_TRANSPORT_LEN; - if ((skb = sock_alloc_send_skb(sk, size, 0, &err)) == NULL) + if ((skb = sock_alloc_send_skb(sk, size, 0, 0, &err)) == NULL) return err; skb->sk = sk; skb->free = 1; skb->arp = 1; - skb->len = size; + + skb_reserve(skb, size - len); - asmptr = skb->data + 16; + /* + * Push down the NET/ROM header + */ + + asmptr = skb_push(skb, NR_TRANSPORT_LEN); if (sk->debug) printk("Building NET/ROM Header.\n"); - /* Build a NET/ROM Network header */ - - *asmptr++ = AX25_P_NETROM; - - memcpy(asmptr, &sk->nr->source_addr, sizeof(ax25_address)); - asmptr[6] &= ~LAPB_C; - asmptr[6] &= ~LAPB_E; - asmptr[6] |= SSID_SPARE; - 
asmptr += 7; - - memcpy(asmptr, &sax.sax25_call, sizeof(ax25_address)); - asmptr[6] &= ~LAPB_C; - asmptr[6] |= LAPB_E; - asmptr[6] |= SSID_SPARE; - asmptr += 7; - - *asmptr++ = nr_default.ttl; - /* Build a NET/ROM Transport header */ - *asmptr++ = sk->nr->your_index; - *asmptr++ = sk->nr->your_id; + *asmptr++ = sk->protinfo.nr->your_index; + *asmptr++ = sk->protinfo.nr->your_id; *asmptr++ = 0; /* To be filled in later */ *asmptr++ = 0; /* Ditto */ *asmptr++ = NR_INFO; @@ -1043,13 +1206,19 @@ static int nr_sendto(struct socket *sock, void *ubuf, int len, int noblock, if (sk->debug) printk("Built header.\n"); - skb->h.raw = asmptr; + /* + * Put the data on the end + */ + + skb->h.raw = skb_put(skb, len); + + asmptr = skb->h.raw; if (sk->debug) printk("NET/ROM: Appending user data\n"); /* User data follows immediately after the NET/ROM transport header */ - memcpy_fromfs(asmptr, ubuf, len); + memcpy_fromiovec(asmptr, msg->msg_iov, len); if (sk->debug) printk("NET/ROM: Transmitting buffer\n"); @@ -1064,76 +1233,59 @@ static int nr_sendto(struct socket *sock, void *ubuf, int len, int noblock, return len; } -static int nr_send(struct socket *sock, void *ubuf, int size, int noblock, unsigned flags) -{ - return nr_sendto(sock, ubuf, size, noblock, flags, NULL, 0); -} -static int nr_write(struct socket *sock, char *ubuf, int size, int noblock) -{ - return nr_send(sock, ubuf, size, noblock, 0); -} - -static int nr_recvfrom(struct socket *sock, void *ubuf, int size, int noblock, - unsigned flags, struct sockaddr *sip, int *addr_len) +static int nr_recvmsg(struct socket *sock, struct msghdr *msg, int size, int noblock, + int flags, int *addr_len) { struct sock *sk = (struct sock *)sock->data; - struct sockaddr_ax25 *sax = (struct sockaddr_ax25 *)sip; - int copied = 0; + struct sockaddr_ax25 *sax = (struct sockaddr_ax25 *)msg->msg_name; + int copied; struct sk_buff *skb; int er; - if (sk->err) { - er = -sk->err; - sk->err = 0; - return er; - } - if (addr_len != NULL) *addr_len 
= sizeof(*sax); - /* This works for seqpacket too. The receiver has ordered the queue for us! We do one quick check first though */ - if (sk->type == SOCK_SEQPACKET && sk->state != TCP_ESTABLISHED) + /* + * This works for seqpacket too. The receiver has ordered the queue for + * us! We do one quick check first though + */ + + if (sk->state != TCP_ESTABLISHED) return -ENOTCONN; /* Now we can treat all alike */ if ((skb = skb_recv_datagram(sk, flags, noblock, &er)) == NULL) return er; - copied = (size < skb->len) ? size : skb->len; + if (!sk->protinfo.nr->hdrincl) { + skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN); + skb->h.raw = skb->data; + } + + copied = skb->len; + + if (copied > size) { + copied = size; + msg->msg_flags |= MSG_TRUNC; + } - skb_copy_datagram(skb, 0, ubuf, copied); + skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); if (sax != NULL) { struct sockaddr_ax25 addr; addr.sax25_family = AF_NETROM; - memcpy(&addr.sax25_call, skb->data + 24, sizeof(ax25_address)); + memcpy(&addr.sax25_call, skb->data + 7, AX25_ADDR_LEN); - memcpy(sax, &addr, sizeof(*sax)); + *sax = addr; *addr_len = sizeof(*sax); } - skb_free_datagram(skb); + skb_free_datagram(sk, skb); return copied; -} - -static int nr_recv(struct socket *sock, void *ubuf, int size , int noblock, - unsigned flags) -{ - struct sock *sk = (struct sock *)sock->data; - - if (sk->zapped) - return -ENOTCONN; - - return nr_recvfrom(sock, ubuf, size, noblock, flags, NULL, NULL); -} - -static int nr_read(struct socket *sock, char *ubuf, int size, int noblock) -{ - return nr_recv(sock, ubuf, size, noblock, 0); } static int nr_shutdown(struct socket *sk, int how) @@ -1156,23 +1308,22 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) switch (cmd) { case TIOCOUTQ: - if ((err = verify_area(VERIFY_WRITE, (void *)arg, sizeof(unsigned long))) != 0) + if ((err = verify_area(VERIFY_WRITE, (void *)arg, sizeof(int))) != 0) return err; amount = sk->sndbuf - sk->wmem_alloc; if (amount < 
0) amount = 0; - put_fs_long(amount, (unsigned long *)arg); + put_user(amount, (int *)arg); return 0; - case TIOCINQ: - { + case TIOCINQ: { struct sk_buff *skb; /* These two are safe on a single CPU system as only user tasks fiddle here */ if ((skb = skb_peek(&sk->receive_queue)) != NULL) - amount = skb->len; - if ((err = verify_area(VERIFY_WRITE, (void *)arg, sizeof(unsigned long))) != 0) + amount = skb->len - 20; + if ((err = verify_area(VERIFY_WRITE, (void *)arg, sizeof(int))) != 0) return err; - put_fs_long(amount, (unsigned long *)arg); + put_user(amount, (int *)arg); return 0; } @@ -1182,7 +1333,7 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return -ENOENT; if ((err = verify_area(VERIFY_WRITE,(void *)arg,sizeof(struct timeval))) != 0) return err; - memcpy_tofs((void *)arg, &sk->stamp, sizeof(struct timeval)); + copy_to_user((void *)arg, &sk->stamp, sizeof(struct timeval)); return 0; } return -EINVAL; @@ -1199,73 +1350,60 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCSIFMETRIC: return -EINVAL; - case SIOCNRADDNODE: - case SIOCNRDELNODE: - case SIOCNRADDNEIGH: - case SIOCNRDELNEIGH: + case SIOCADDRT: + case SIOCDELRT: case SIOCNRDECOBS: if (!suser()) return -EPERM; return nr_rt_ioctl(cmd, (void *)arg); - case SIOCNRGETPARMS: - { - struct nr_parms_struct nr_parms; - if ((err = verify_area(VERIFY_WRITE, (void *)arg, sizeof(struct nr_parms_struct))) != 0) - return err; - memcpy_fromfs(&nr_parms, (void *)arg, sizeof(struct nr_parms_struct)); - nr_parms = nr_default; - memcpy_tofs((void *)arg, &nr_parms, sizeof(struct nr_parms_struct)); - return 0; - } - - case SIOCNRSETPARMS: - { - struct nr_parms_struct nr_parms; - if (!suser()) return -EPERM; - if ((err = verify_area(VERIFY_READ, (void *)arg, sizeof(struct nr_parms_struct))) != 0) - return err; - memcpy_fromfs(&nr_parms, (void *)arg, sizeof(struct nr_parms_struct)); - nr_default = nr_parms; - return 0; - } - - default: + case 
SIOCNRCTLCON: + if (!suser()) return -EPERM; + return nr_ctl_ioctl(cmd, (void *)arg); + + default: return dev_ioctl(cmd, (void *)arg); } /*NOTREACHED*/ - return(0); + return 0; } -int nr_get_info(char *buffer, char **start, off_t offset, int length) +static int nr_get_info(char *buffer, char **start, off_t offset, int length, int dummy) { struct sock *s; + struct device *dev; + const char *devname; int len = 0; off_t pos = 0; off_t begin = 0; cli(); - len += sprintf(buffer, "user_addr dest_node src_node my your st vs vr va t1 t2 n2 rtt wnd Snd-Q Rcv-Q\n"); + len += sprintf(buffer, "user_addr dest_node src_node dev my your st vs vr va t1 t2 n2 rtt wnd paclen Snd-Q Rcv-Q\n"); for (s = nr_list; s != NULL; s = s->next) { + if ((dev = s->protinfo.nr->device) == NULL) + devname = "???"; + else + devname = dev->name; + len += sprintf(buffer + len, "%-9s ", - ax2asc(&s->nr->user_addr)); + ax2asc(&s->protinfo.nr->user_addr)); len += sprintf(buffer + len, "%-9s ", - ax2asc(&s->nr->dest_addr)); - len += sprintf(buffer + len, "%-9s %02X/%02X %02X/%02X %2d %2d %2d %2d %3d/%03d %2d/%02d %2d/%02d %3d %3d %5ld %5ld\n", - ax2asc(&s->nr->source_addr), - s->nr->my_index, s->nr->my_id, - s->nr->your_index, s->nr->your_id, - s->nr->state, - s->nr->vs, s->nr->vr, s->nr->va, - s->nr->t1timer / PR_SLOWHZ, - s->nr->t1 / PR_SLOWHZ, - s->nr->t2timer / PR_SLOWHZ, - s->nr->t2 / PR_SLOWHZ, - s->nr->n2count, s->nr->n2, - s->nr->rtt / PR_SLOWHZ, - s->window, + ax2asc(&s->protinfo.nr->dest_addr)); + len += sprintf(buffer + len, "%-9s %-3s %02X/%02X %02X/%02X %2d %3d %3d %3d %3d/%03d %2d/%02d %2d/%02d %3d %3d %6d %5d %5d\n", + ax2asc(&s->protinfo.nr->source_addr), + devname, s->protinfo.nr->my_index, s->protinfo.nr->my_id, + s->protinfo.nr->your_index, s->protinfo.nr->your_id, + s->protinfo.nr->state, + s->protinfo.nr->vs, s->protinfo.nr->vr, s->protinfo.nr->va, + s->protinfo.nr->t1timer / PR_SLOWHZ, + s->protinfo.nr->t1 / PR_SLOWHZ, + s->protinfo.nr->t2timer / PR_SLOWHZ, + s->protinfo.nr->t2 / 
PR_SLOWHZ, + s->protinfo.nr->n2count, s->protinfo.nr->n2, + s->protinfo.nr->rtt / PR_SLOWHZ, + s->window, s->protinfo.nr->paclen, s->wmem_alloc, s->rmem_alloc); pos = begin + len; @@ -1289,7 +1427,7 @@ int nr_get_info(char *buffer, char **start, off_t offset, int length) return(len); } -static struct proto_ops nr_proto_ops = { +struct proto_ops nr_proto_ops = { AF_NETROM, nr_create, @@ -1300,40 +1438,61 @@ static struct proto_ops nr_proto_ops = { nr_socketpair, nr_accept, nr_getname, - nr_read, - nr_write, nr_select, nr_ioctl, nr_listen, - nr_send, - nr_recv, - nr_sendto, - nr_recvfrom, nr_shutdown, nr_setsockopt, nr_getsockopt, nr_fcntl, + nr_sendmsg, + nr_recvmsg }; -static struct notifier_block nr_dev_notifier = { +struct notifier_block nr_dev_notifier = { nr_device_event, 0 }; +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry proc_net_nr = { + PROC_NET_NR, 2, "nr", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + nr_get_info +}; +static struct proc_dir_entry proc_net_nr_neigh = { + PROC_NET_NR_NEIGH, 8, "nr_neigh", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + nr_neigh_get_info +}; +static struct proc_dir_entry proc_net_nr_nodes = { + PROC_NET_NR_NODES, 8, "nr_nodes", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + nr_nodes_get_info +}; +#endif + void nr_proto_init(struct net_proto *pro) { sock_register(nr_proto_ops.family, &nr_proto_ops); register_netdevice_notifier(&nr_dev_notifier); - printk("G4KLX NET/ROM for Linux. Version 0.2 ALPHA for AX.25 029 for Linux 1.3.0\n"); - - nr_default.quality = NR_DEFAULT_QUAL; - nr_default.obs_count = NR_DEFAULT_OBS; - nr_default.ttl = NR_DEFAULT_TTL; - nr_default.timeout = NR_DEFAULT_T1; - nr_default.ack_delay = NR_DEFAULT_T2; - nr_default.busy_delay = NR_DEFAULT_T4; - nr_default.tries = NR_DEFAULT_N2; - nr_default.window = NR_DEFAULT_WINDOW; + printk(KERN_INFO "G4KLX NET/ROM for Linux. 
Version 0.5 for AX25.034 Linux 2.1\n"); + + if (!ax25_protocol_register(AX25_P_NETROM, nr_route_frame)) + printk(KERN_ERR "NET/ROM unable to register protocol with AX.25\n"); + if (!ax25_linkfail_register(nr_link_failed)) + printk(KERN_ERR "NET/ROM unable to register linkfail handler with AX.25\n"); + + nr_register_sysctl(); + +#ifdef CONFIG_PROC_FS + proc_net_register(&proc_net_nr); + proc_net_register(&proc_net_nr_neigh); + proc_net_register(&proc_net_nr_nodes); +#endif } #endif diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c index f9fd83f73..fbbd913e9 100644 --- a/net/netrom/nr_dev.c +++ b/net/netrom/nr_dev.c @@ -1,5 +1,5 @@ /* - * NET/ROM release 002 + * NET/ROM release 004 * * This is ALPHA test software. This code may break your machine, randomly fail to work with new * releases, misbehave and/or generally screw up. It might even work. @@ -14,15 +14,22 @@ * * History * NET/ROM 001 Jonathan(G4KLX) Cloned from loopback.c + * NET/ROM 002 Steve Whitehouse(GW7RRM) fixed the set_mac_address + * NET/ROM 003 Jonathan(G4KLX) Put nr_rebuild_header into line with + * ax25_rebuild_header + * NET/ROM 004 Jonathan(G4KLX) Callsign registration with AX.25. */ #include <linux/config.h> -#ifdef CONFIG_NETROM +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) +#include <linux/module.h> +#include <linux/proc_fs.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/interrupt.h> #include <linux/fs.h> #include <linux/types.h> +#include <linux/sysctl.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/errno.h> @@ -31,12 +38,13 @@ #include <linux/if_ether.h> /* For the statistics structure. 
*/ #include <asm/system.h> -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/io.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> +#include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/ip.h> @@ -46,7 +54,7 @@ #include <net/netrom.h> /* - * Only allow IP over NET/ROM frames through if the netrom device is up. + * Only allow IP over NET/ROM frames through if the netrom device is up. */ int nr_rx_ip(struct sk_buff *skb, struct device *dev) @@ -59,47 +67,36 @@ int nr_rx_ip(struct sk_buff *skb, struct device *dev) } stats->rx_packets++; - skb->protocol=htons(ETH_P_IP); + skb->protocol = htons(ETH_P_IP); + /* Spoof incoming device */ - skb->dev=dev; + skb->dev = dev; - ip_rcv(skb, dev, NULL); + skb->h.raw = skb->data; + ip_rcv(skb, skb->dev, NULL); return 1; } -/* - * We can't handle ARP so put some identification characters into the ARP - * packet so that the transmit routine can identify it, and throw it away. - */ - -static int nr_header(unsigned char *buff, struct device *dev, unsigned short type, - void *daddr, void *saddr, unsigned len, struct sk_buff *skb) +static int nr_header(struct sk_buff *skb, struct device *dev, unsigned short type, + void *daddr, void *saddr, unsigned len) { - if (type == ETH_P_ARP) { - *buff++ = 0xFF; /* Mark it */ - *buff++ = 0xFE; - return 37; - } + unsigned char *buff = skb_push(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN); - buff += 16; - - *buff++ = AX25_P_NETROM; - memcpy(buff, (saddr != NULL) ? 
saddr : dev->dev_addr, dev->addr_len); buff[6] &= ~LAPB_C; buff[6] &= ~LAPB_E; - buff[6] |= SSID_SPARE; - buff += dev->addr_len; + buff[6] |= SSSID_SPARE; + buff += AX25_ADDR_LEN; if (daddr != NULL) memcpy(buff, daddr, dev->addr_len); buff[6] &= ~LAPB_C; buff[6] |= LAPB_E; - buff[6] |= SSID_SPARE; - buff += dev->addr_len; + buff[6] |= SSSID_SPARE; + buff += AX25_ADDR_LEN; - *buff++ = nr_default.ttl; + *buff++ = sysctl_netrom_network_ttl_initialiser; *buff++ = NR_PROTO_IP; *buff++ = NR_PROTO_IP; @@ -116,25 +113,55 @@ static int nr_header(unsigned char *buff, struct device *dev, unsigned short typ static int nr_rebuild_header(void *buff, struct device *dev, unsigned long raddr, struct sk_buff *skb) { + struct enet_statistics *stats = (struct enet_statistics *)dev->priv; unsigned char *bp = (unsigned char *)buff; + struct sk_buff *skbn; - if (arp_find(bp + 24, raddr, dev, dev->pa_addr, skb)) + if (!arp_query(bp + 7, raddr, dev)) { + dev_kfree_skb(skb, FREE_WRITE); return 1; + } - bp[23] &= ~LAPB_C; - bp[23] &= ~LAPB_E; - bp[23] |= SSID_SPARE; + bp[6] &= ~LAPB_C; + bp[6] &= ~LAPB_E; + bp[6] |= SSSID_SPARE; + bp += AX25_ADDR_LEN; - bp[30] &= ~LAPB_C; - bp[30] |= LAPB_E; - bp[30] |= SSID_SPARE; + bp[6] &= ~LAPB_C; + bp[6] |= LAPB_E; + bp[6] |= SSSID_SPARE; - return 0; + if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { + dev_kfree_skb(skb, FREE_WRITE); + return 1; + } + + skbn->sk = skb->sk; + + if (skbn->sk != NULL) + atomic_add(skbn->truesize, &skbn->sk->wmem_alloc); + + dev_kfree_skb(skb, FREE_WRITE); + + if (!nr_route_frame(skbn, NULL)) { + dev_kfree_skb(skbn, FREE_WRITE); + stats->tx_errors++; + } + + stats->tx_packets++; + + return 1; } static int nr_set_mac_address(struct device *dev, void *addr) { - memcpy(dev->dev_addr, addr, dev->addr_len); + struct sockaddr *sa = addr; + + ax25_listen_release((ax25_address *)dev->dev_addr, NULL); + + memcpy(dev->dev_addr, sa->sa_data, dev->addr_len); + + ax25_listen_register((ax25_address *)dev->dev_addr, NULL); return 0; } 
@@ -144,6 +171,10 @@ static int nr_open(struct device *dev) dev->tbusy = 0; dev->start = 1; + MOD_INC_USE_COUNT; + + ax25_listen_register((ax25_address *)dev->dev_addr, NULL); + return 0; } @@ -152,19 +183,22 @@ static int nr_close(struct device *dev) dev->tbusy = 1; dev->start = 0; + ax25_listen_release((ax25_address *)dev->dev_addr, NULL); + + MOD_DEC_USE_COUNT; + return 0; } static int nr_xmit(struct sk_buff *skb, struct device *dev) { struct enet_statistics *stats = (struct enet_statistics *)dev->priv; - struct sk_buff *skbn; if (skb == NULL || dev == NULL) return 0; if (!dev->start) { - printk("netrom: xmit call when iface is down\n"); + printk(KERN_ERR "netrom: xmit call when iface is down\n"); return 1; } @@ -180,25 +214,9 @@ static int nr_xmit(struct sk_buff *skb, struct device *dev) sti(); - if (skb->data[0] != 0xFF && skb->data[1] != 0xFE) { - if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { - dev->tbusy = 0; - stats->tx_errors++; - return 1; - } - - if (!nr_route_frame(skbn, NULL)) { - skbn->free = 1; - kfree_skb(skbn, FREE_WRITE); - dev->tbusy = 0; - stats->tx_errors++; - return 1; - } - } - dev_kfree_skb(skb, FREE_WRITE); - stats->tx_packets++; + stats->tx_errors++; dev->tbusy = 0; @@ -223,8 +241,8 @@ int nr_init(struct device *dev) dev->stop = nr_close; dev->hard_header = nr_header; - dev->hard_header_len = 37; - dev->addr_len = 7; + dev->hard_header_len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + NR_NETWORK_LEN + NR_TRANSPORT_LEN; + dev->addr_len = AX25_ADDR_LEN; dev->type = ARPHRD_NETROM; dev->rebuild_header = nr_rebuild_header; dev->set_mac_address = nr_set_mac_address; @@ -238,7 +256,8 @@ int nr_init(struct device *dev) dev->pa_mask = 0; dev->pa_alen = sizeof(unsigned long); - dev->priv = kmalloc(sizeof(struct enet_statistics), GFP_KERNEL); + if ((dev->priv = kmalloc(sizeof(struct enet_statistics), GFP_KERNEL)) == NULL) + return -ENOMEM; memset(dev->priv, 0, sizeof(struct enet_statistics)); @@ -251,4 +270,60 @@ int nr_init(struct device *dev) 
return 0; }; +#ifdef MODULE +extern struct proto_ops nr_proto_ops; +extern struct notifier_block nr_dev_notifier; + +static struct device dev_nr[] = { + {"nr0", 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, nr_init}, + {"nr1", 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, nr_init}, + {"nr2", 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, nr_init}, + {"nr3", 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, nr_init} +}; + +int init_module(void) +{ + int i; + + for (i = 0; i < 4; i++) + register_netdev(&dev_nr[i]); + + register_symtab(NULL); + + nr_proto_init(NULL); + + return 0; +} + +void cleanup_module(void) +{ + int i; + +#ifdef CONFIG_PROC_FS + proc_net_unregister(PROC_NET_NR); + proc_net_unregister(PROC_NET_NR_NEIGH); + proc_net_unregister(PROC_NET_NR_NODES); +#endif + nr_rt_free(); + + ax25_protocol_release(AX25_P_NETROM); + ax25_linkfail_release(nr_link_failed); + + unregister_netdevice_notifier(&nr_dev_notifier); + + nr_unregister_sysctl(); + + sock_unregister(nr_proto_ops.family); + + for (i = 0; i < 4; i++) { + if (dev_nr[i].priv != NULL) { + kfree(dev_nr[i].priv); + dev_nr[i].priv = NULL; + unregister_netdev(&dev_nr[i]); + } + } +} + +#endif + #endif diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c index 7f0513732..85d28a114 100644 --- a/net/netrom/nr_in.c +++ b/net/netrom/nr_in.c @@ -1,5 +1,5 @@ /* - * NET/ROM release 002 + * NET/ROM release 004 * * This is ALPHA test software. This code may break your machine, randomly fail to work with new * releases, misbehave and/or generally screw up. It might even work. @@ -20,10 +20,13 @@ * * History * NET/ROM 001 Jonathan(G4KLX) Cloned from ax25_in.c + * NET/ROM 003 Jonathan(G4KLX) Added NET/ROM fragment reception. + * Darryl(G7LED) Added missing INFO with NAK case, optimized + * INFOACK handling, removed reconnect on error. 
*/ #include <linux/config.h> -#ifdef CONFIG_NETROM +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> @@ -40,13 +43,52 @@ #include <linux/skbuff.h> #include <net/sock.h> #include <net/ip.h> /* For ip_rcv */ -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <net/netrom.h> +static int nr_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more) +{ + struct sk_buff *skbo, *skbn = skb; + + if (more) { + sk->protinfo.nr->fraglen += skb->len; + skb_queue_tail(&sk->protinfo.nr->frag_queue, skb); + return 0; + } + + if (!more && sk->protinfo.nr->fraglen > 0) { /* End of fragment */ + sk->protinfo.nr->fraglen += skb->len; + skb_queue_tail(&sk->protinfo.nr->frag_queue, skb); + + if ((skbn = alloc_skb(sk->protinfo.nr->fraglen, GFP_ATOMIC)) == NULL) + return 1; + + skbn->free = 1; + skbn->arp = 1; + skbn->sk = sk; + sk->rmem_alloc += skbn->truesize; + skbn->h.raw = skbn->data; + + skbo = skb_dequeue(&sk->protinfo.nr->frag_queue); + memcpy(skb_put(skbn, skbo->len), skbo->data, skbo->len); + kfree_skb(skbo, FREE_READ); + + while ((skbo = skb_dequeue(&sk->protinfo.nr->frag_queue)) != NULL) { + skb_pull(skbo, NR_NETWORK_LEN + NR_TRANSPORT_LEN); + memcpy(skb_put(skbn, skbo->len), skbo->data, skbo->len); + kfree_skb(skbo, FREE_READ); + } + + sk->protinfo.nr->fraglen = 0; + } + + return sock_queue_rcv_skb(sk, skbn); +} + /* * State machine for state 1, Awaiting Connection State. * The handling of the timer(s) is in file nr_timer.c. 
@@ -58,32 +100,33 @@ static int nr_state1_machine(struct sock *sk, struct sk_buff *skb, int frametype case NR_CONNACK: nr_calculate_rtt(sk); - sk->window = skb->data[37]; - sk->nr->your_index = skb->data[34]; - sk->nr->your_id = skb->data[35]; - sk->nr->t1timer = 0; - sk->nr->t2timer = 0; - sk->nr->t4timer = 0; - sk->nr->vs = 0; - sk->nr->va = 0; - sk->nr->vr = 0; - sk->nr->vl = 0; - sk->nr->state = NR_STATE_3; - sk->state = TCP_ESTABLISHED; - sk->nr->n2count = 0; + sk->protinfo.nr->your_index = skb->data[17]; + sk->protinfo.nr->your_id = skb->data[18]; + sk->protinfo.nr->t1timer = 0; + sk->protinfo.nr->t2timer = 0; + sk->protinfo.nr->t4timer = 0; + sk->protinfo.nr->vs = 0; + sk->protinfo.nr->va = 0; + sk->protinfo.nr->vr = 0; + sk->protinfo.nr->vl = 0; + sk->protinfo.nr->state = NR_STATE_3; + sk->protinfo.nr->n2count = 0; + sk->window = skb->data[20]; + sk->state = TCP_ESTABLISHED; /* For WAIT_SABM connections we will produce an accept ready socket here */ if (!sk->dead) sk->state_change(sk); break; - case NR_CONNACK + NR_CHOKE_FLAG: - nr_clear_tx_queue(sk); - sk->nr->state = NR_STATE_0; - sk->state = TCP_CLOSE; - sk->err = ECONNREFUSED; + case NR_CONNACK | NR_CHOKE_FLAG: + nr_clear_queues(sk); + sk->protinfo.nr->state = NR_STATE_0; + sk->state = TCP_CLOSE; + sk->err = ECONNREFUSED; + sk->shutdown |= SEND_SHUTDOWN; if (!sk->dead) sk->state_change(sk); - sk->dead = 1; + sk->dead = 1; break; default: @@ -104,15 +147,15 @@ static int nr_state2_machine(struct sock *sk, struct sk_buff *skb, int frametype case NR_DISCREQ: nr_write_internal(sk, NR_DISCACK); - break; case NR_DISCACK: - sk->nr->state = NR_STATE_0; - sk->state = TCP_CLOSE; - sk->err = 0; + sk->protinfo.nr->state = NR_STATE_0; + sk->state = TCP_CLOSE; + sk->err = 0; + sk->shutdown |= SEND_SHUTDOWN; if (!sk->dead) sk->state_change(sk); - sk->dead = 1; + sk->dead = 1; break; default: @@ -135,118 +178,105 @@ static int nr_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype unsigned short nr, ns; 
int queued = 0; - nr = skb->data[35]; - ns = skb->data[34]; + nr = skb->data[18]; + ns = skb->data[17]; switch (frametype) { case NR_CONNREQ: nr_write_internal(sk, NR_CONNACK); - sk->nr->condition = 0x00; - sk->nr->t1timer = 0; - sk->nr->t2timer = 0; - sk->nr->t4timer = 0; - sk->nr->vs = 0; - sk->nr->va = 0; - sk->nr->vr = 0; - sk->nr->vl = 0; break; case NR_DISCREQ: - nr_clear_tx_queue(sk); + nr_clear_queues(sk); nr_write_internal(sk, NR_DISCACK); - sk->nr->state = NR_STATE_0; - sk->state = TCP_CLOSE; - sk->err = 0; + sk->protinfo.nr->state = NR_STATE_0; + sk->state = TCP_CLOSE; + sk->err = 0; + sk->shutdown |= SEND_SHUTDOWN; if (!sk->dead) sk->state_change(sk); - sk->dead = 1; + sk->dead = 1; break; case NR_DISCACK: - nr_clear_tx_queue(sk); - sk->nr->state = NR_STATE_0; - sk->state = TCP_CLOSE; - sk->err = ECONNRESET; + nr_clear_queues(sk); + sk->protinfo.nr->state = NR_STATE_0; + sk->state = TCP_CLOSE; + sk->err = ECONNRESET; + sk->shutdown |= SEND_SHUTDOWN; if (!sk->dead) sk->state_change(sk); - sk->dead = 1; + sk->dead = 1; break; case NR_INFOACK: - case NR_INFOACK + NR_CHOKE_FLAG: + case NR_INFOACK | NR_CHOKE_FLAG: + case NR_INFOACK | NR_NAK_FLAG: + case NR_INFOACK | NR_NAK_FLAG | NR_CHOKE_FLAG: if (frametype & NR_CHOKE_FLAG) { - sk->nr->condition |= PEER_RX_BUSY_CONDITION; - sk->nr->t4timer = nr_default.busy_delay; + sk->protinfo.nr->condition |= PEER_RX_BUSY_CONDITION; + sk->protinfo.nr->t4timer = sk->protinfo.nr->t4; } else { - sk->nr->condition &= ~PEER_RX_BUSY_CONDITION; - sk->nr->t4timer = 0; + sk->protinfo.nr->condition &= ~PEER_RX_BUSY_CONDITION; + sk->protinfo.nr->t4timer = 0; } if (!nr_validate_nr(sk, nr)) { - nr_nr_error_recovery(sk); - sk->nr->state = NR_STATE_1; break; } - if (sk->nr->condition & PEER_RX_BUSY_CONDITION) { - nr_frames_acked(sk, nr); - } else { - nr_check_iframes_acked(sk, nr); - } - break; - - case NR_INFOACK + NR_NAK_FLAG: - case NR_INFOACK + NR_NAK_FLAG + NR_CHOKE_FLAG: - if (frametype & NR_CHOKE_FLAG) { - sk->nr->condition |= 
PEER_RX_BUSY_CONDITION; - sk->nr->t4timer = nr_default.busy_delay; - } else { - sk->nr->condition &= ~PEER_RX_BUSY_CONDITION; - sk->nr->t4timer = 0; - } - if (nr_validate_nr(sk, nr)) { + if (frametype & NR_NAK_FLAG) { nr_frames_acked(sk, nr); nr_send_nak_frame(sk); } else { - nr_nr_error_recovery(sk); - sk->nr->state = NR_STATE_1; + if (sk->protinfo.nr->condition & PEER_RX_BUSY_CONDITION) { + nr_frames_acked(sk, nr); + } else { + nr_check_iframes_acked(sk, nr); + } } break; case NR_INFO: - case NR_INFO + NR_CHOKE_FLAG: - case NR_INFO + NR_MORE_FLAG: - case NR_INFO + NR_CHOKE_FLAG + NR_MORE_FLAG: + case NR_INFO | NR_NAK_FLAG: + case NR_INFO | NR_CHOKE_FLAG: + case NR_INFO | NR_MORE_FLAG: + case NR_INFO | NR_NAK_FLAG | NR_CHOKE_FLAG: + case NR_INFO | NR_CHOKE_FLAG | NR_MORE_FLAG: + case NR_INFO | NR_NAK_FLAG | NR_MORE_FLAG: + case NR_INFO | NR_NAK_FLAG | NR_CHOKE_FLAG | NR_MORE_FLAG: if (frametype & NR_CHOKE_FLAG) { - sk->nr->condition |= PEER_RX_BUSY_CONDITION; - sk->nr->t4timer = nr_default.busy_delay; + sk->protinfo.nr->condition |= PEER_RX_BUSY_CONDITION; + sk->protinfo.nr->t4timer = sk->protinfo.nr->t4; } else { - sk->nr->condition &= ~PEER_RX_BUSY_CONDITION; - sk->nr->t4timer = 0; + sk->protinfo.nr->condition &= ~PEER_RX_BUSY_CONDITION; + sk->protinfo.nr->t4timer = 0; } - if (!nr_validate_nr(sk, nr)) { - nr_nr_error_recovery(sk); - sk->nr->state = NR_STATE_1; - break; - } - if (sk->nr->condition & PEER_RX_BUSY_CONDITION) { - nr_frames_acked(sk, nr); - } else { - nr_check_iframes_acked(sk, nr); + if (nr_validate_nr(sk, nr)) { + if (frametype & NR_NAK_FLAG) { + nr_frames_acked(sk, nr); + nr_send_nak_frame(sk); + } else { + if (sk->protinfo.nr->condition & PEER_RX_BUSY_CONDITION) { + nr_frames_acked(sk, nr); + } else { + nr_check_iframes_acked(sk, nr); + } + } } queued = 1; - skb_queue_head(&sk->nr->reseq_queue, skb); - if (sk->nr->condition & OWN_RX_BUSY_CONDITION) + skb_queue_head(&sk->protinfo.nr->reseq_queue, skb); + if (sk->protinfo.nr->condition & 
OWN_RX_BUSY_CONDITION) break; skb_queue_head_init(&temp_queue); do { - save_vr = sk->nr->vr; - while ((skbn = skb_dequeue(&sk->nr->reseq_queue)) != NULL) { - ns = skbn->data[34]; - if (ns == sk->nr->vr) { - if (sock_queue_rcv_skb(sk, skbn) == 0) { - sk->nr->vr = (sk->nr->vr + 1) % NR_MODULUS; + save_vr = sk->protinfo.nr->vr; + while ((skbn = skb_dequeue(&sk->protinfo.nr->reseq_queue)) != NULL) { + ns = skbn->data[17]; + if (ns == sk->protinfo.nr->vr) { + if (nr_queue_rx_frame(sk, skbn, frametype & NR_MORE_FLAG) == 0) { + sk->protinfo.nr->vr = (sk->protinfo.nr->vr + 1) % NR_MODULUS; } else { - sk->nr->condition |= OWN_RX_BUSY_CONDITION; + sk->protinfo.nr->condition |= OWN_RX_BUSY_CONDITION; skb_queue_tail(&temp_queue, skbn); } } else if (nr_in_rx_window(sk, ns)) { @@ -257,18 +287,18 @@ static int nr_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype } } while ((skbn = skb_dequeue(&temp_queue)) != NULL) { - skb_queue_tail(&sk->nr->reseq_queue, skbn); + skb_queue_tail(&sk->protinfo.nr->reseq_queue, skbn); } - } while (save_vr != sk->nr->vr); + } while (save_vr != sk->protinfo.nr->vr); /* * Window is full, ack it immediately. 
*/ - if (((sk->nr->vl + sk->window) % NR_MODULUS) == sk->nr->vr) { + if (((sk->protinfo.nr->vl + sk->window) % NR_MODULUS) == sk->protinfo.nr->vr) { nr_enquiry_response(sk); } else { - if (!(sk->nr->condition & ACK_PENDING_CONDITION)) { - sk->nr->t2timer = sk->nr->t2; - sk->nr->condition |= ACK_PENDING_CONDITION; + if (!(sk->protinfo.nr->condition & ACK_PENDING_CONDITION)) { + sk->protinfo.nr->t2timer = sk->protinfo.nr->t2; + sk->protinfo.nr->condition |= ACK_PENDING_CONDITION; } } break; @@ -284,12 +314,15 @@ static int nr_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype int nr_process_rx_frame(struct sock *sk, struct sk_buff *skb) { int queued = 0, frametype; + + if (sk->protinfo.nr->state == NR_STATE_0) + return 0; del_timer(&sk->timer); - frametype = skb->data[36]; + frametype = skb->data[19]; - switch (sk->nr->state) + switch (sk->protinfo.nr->state) { case NR_STATE_1: queued = nr_state1_machine(sk, skb, frametype); @@ -300,14 +333,11 @@ int nr_process_rx_frame(struct sock *sk, struct sk_buff *skb) case NR_STATE_3: queued = nr_state3_machine(sk, skb, frametype); break; - default: - printk("nr_process_rx_frame: frame received - state: %d\n", sk->nr->state); - break; } nr_set_timer(sk); - return(queued); + return queued; } #endif diff --git a/net/netrom/nr_out.c b/net/netrom/nr_out.c index 2ebdd743d..61935f30c 100644 --- a/net/netrom/nr_out.c +++ b/net/netrom/nr_out.c @@ -1,5 +1,5 @@ /* - * NET/ROM release 002 + * NET/ROM release 004 * * This is ALPHA test software. This code may break your machine, randomly fail to work with new * releases, misbehave and/or generally screw up. It might even work. @@ -14,10 +14,12 @@ * * History * NET/ROM 001 Jonathan(G4KLX) Cloned from ax25_out.c + * NET/ROM 003 Jonathan(G4KLX) Added NET/ROM fragmentation. + * Darryl(G7LED) Fixed NAK, to give out correct reponse. 
*/ #include <linux/config.h> -#ifdef CONFIG_NETROM +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> @@ -33,38 +35,82 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <net/netrom.h> -int nr_output(struct sock *sk, struct sk_buff *skb) +/* + * This is where all NET/ROM frames pass, except for IP-over-NET/ROM which + * cannot be fragmented in this manner. + */ +void nr_output(struct sock *sk, struct sk_buff *skb) { - skb_queue_tail(&sk->write_queue, skb); /* Throw it on the queue */ + struct sk_buff *skbn; + unsigned char transport[NR_TRANSPORT_LEN]; + int err, frontlen, len, mtu; - if (sk->nr->state == NR_STATE_3) - nr_kick(sk); + mtu = sk->protinfo.nr->paclen; + + if (skb->len - NR_TRANSPORT_LEN > mtu) { + /* Save a copy of the Transport Header */ + memcpy(transport, skb->data, NR_TRANSPORT_LEN); + skb_pull(skb, NR_TRANSPORT_LEN); + + frontlen = skb_headroom(skb); + + while (skb->len > 0) { + if ((skbn = sock_alloc_send_skb(sk, frontlen + mtu, 0, 0, &err)) == NULL) + return; + + skbn->sk = sk; + skbn->free = 1; + skbn->arp = 1; + + skb_reserve(skbn, frontlen); + + len = (mtu > skb->len) ? 
skb->len : mtu; + + /* Copy the user data */ + memcpy(skb_put(skbn, len), skb->data, len); + skb_pull(skb, len); + + /* Duplicate the Transport Header */ + skb_push(skbn, NR_TRANSPORT_LEN); + memcpy(skbn->data, transport, NR_TRANSPORT_LEN); + + if (skb->len > 0) + skbn->data[4] |= NR_MORE_FLAG; + + skb_queue_tail(&sk->write_queue, skbn); /* Throw it on the queue */ + } + + skb->free = 1; + kfree_skb(skb, FREE_WRITE); + } else { + skb_queue_tail(&sk->write_queue, skb); /* Throw it on the queue */ + } - return 0; + if (sk->protinfo.nr->state == NR_STATE_3) + nr_kick(sk); } /* - * This procedure is passed a buffer descriptor for an iframe. It builds - * the rest of the control part of the frame and then writes it out. + * This procedure is passed a buffer descriptor for an iframe. It builds + * the rest of the control part of the frame and then writes it out. */ static void nr_send_iframe(struct sock *sk, struct sk_buff *skb) { - unsigned char *dptr; - if (skb == NULL) return; - dptr = skb->data + 34; - - *dptr++ = sk->nr->vs; - *dptr++ = sk->nr->vr; + skb->data[2] = sk->protinfo.nr->vs; + skb->data[3] = sk->protinfo.nr->vr; + + if (sk->protinfo.nr->condition & OWN_RX_BUSY_CONDITION) + skb->data[4] |= NR_CHOKE_FLAG; nr_transmit_buffer(sk, skb); } @@ -73,17 +119,23 @@ void nr_send_nak_frame(struct sock *sk) { struct sk_buff *skb, *skbn; - if ((skb = skb_peek(&sk->nr->ack_queue)) == NULL) + if ((skb = skb_peek(&sk->protinfo.nr->ack_queue)) == NULL) return; if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) return; - nr_send_iframe(sk, skbn); + skbn->data[2] = sk->protinfo.nr->va; + skbn->data[3] = sk->protinfo.nr->vr; - sk->nr->condition &= ~ACK_PENDING_CONDITION; - sk->nr->vl = sk->nr->vr; - sk->nr->t1timer = 0; + if (sk->protinfo.nr->condition & OWN_RX_BUSY_CONDITION) + skbn->data[4] |= NR_CHOKE_FLAG; + + nr_transmit_buffer(sk, skbn); + + sk->protinfo.nr->condition &= ~ACK_PENDING_CONDITION; + sk->protinfo.nr->vl = sk->protinfo.nr->vr; + sk->protinfo.nr->t1timer = 0; } 
void nr_kick(struct sock *sk) @@ -94,31 +146,32 @@ void nr_kick(struct sock *sk) del_timer(&sk->timer); - start = (skb_peek(&sk->nr->ack_queue) == NULL) ? sk->nr->va : sk->nr->vs; - end = (sk->nr->va + sk->window) % NR_MODULUS; + start = (skb_peek(&sk->protinfo.nr->ack_queue) == NULL) ? sk->protinfo.nr->va : sk->protinfo.nr->vs; + end = (sk->protinfo.nr->va + sk->window) % NR_MODULUS; - if (!(sk->nr->condition & PEER_RX_BUSY_CONDITION) && - start != end && + if (!(sk->protinfo.nr->condition & PEER_RX_BUSY_CONDITION) && + start != end && skb_peek(&sk->write_queue) != NULL) { - sk->nr->vs = start; + sk->protinfo.nr->vs = start; /* * Transmit data until either we're out of data to send or * the window is full. */ - do { - /* - * Dequeue the frame and copy it. - */ - skb = skb_dequeue(&sk->write_queue); + /* + * Dequeue the frame and copy it. + */ + skb = skb_dequeue(&sk->write_queue); + + do { if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { skb_queue_head(&sk->write_queue, skb); - return; + break; } - next = (sk->nr->vs + 1) % NR_MODULUS; + next = (sk->protinfo.nr->vs + 1) % NR_MODULUS; last = (next == end); /* @@ -126,20 +179,20 @@ void nr_kick(struct sock *sk) */ nr_send_iframe(sk, skbn); - sk->nr->vs = next; + sk->protinfo.nr->vs = next; /* * Requeue the original data frame. 
*/ - skb_queue_tail(&sk->nr->ack_queue, skb); + skb_queue_tail(&sk->protinfo.nr->ack_queue, skb); - } while (!last && skb_peek(&sk->write_queue) != NULL); + } while (!last && (skb = skb_dequeue(&sk->write_queue)) != NULL); - sk->nr->vl = sk->nr->vr; - sk->nr->condition &= ~ACK_PENDING_CONDITION; + sk->protinfo.nr->vl = sk->protinfo.nr->vr; + sk->protinfo.nr->condition &= ~ACK_PENDING_CONDITION; - if (sk->nr->t1timer == 0) { - sk->nr->t1timer = sk->nr->t1 = nr_calculate_t1(sk); + if (sk->protinfo.nr->t1timer == 0) { + sk->protinfo.nr->t1timer = sk->protinfo.nr->t1 = nr_calculate_t1(sk); } } @@ -150,34 +203,36 @@ void nr_transmit_buffer(struct sock *sk, struct sk_buff *skb) { unsigned char *dptr; - dptr = skb->data + 16; + /* + * Add the protocol byte and network header. + */ + dptr = skb_push(skb, NR_NETWORK_LEN); - *dptr++ = AX25_P_NETROM; - - memcpy(dptr, &sk->nr->source_addr, sizeof(ax25_address)); + memcpy(dptr, &sk->protinfo.nr->source_addr, AX25_ADDR_LEN); dptr[6] &= ~LAPB_C; dptr[6] &= ~LAPB_E; - dptr[6] |= SSID_SPARE; - dptr += 7; + dptr[6] |= SSSID_SPARE; + dptr += AX25_ADDR_LEN; - memcpy(dptr, &sk->nr->dest_addr, sizeof(ax25_address)); + memcpy(dptr, &sk->protinfo.nr->dest_addr, AX25_ADDR_LEN); dptr[6] &= ~LAPB_C; dptr[6] |= LAPB_E; - dptr[6] |= SSID_SPARE; - dptr += 7; + dptr[6] |= SSSID_SPARE; + dptr += AX25_ADDR_LEN; - *dptr++ = nr_default.ttl; + *dptr++ = sysctl_netrom_network_ttl_initialiser; skb->arp = 1; if (!nr_route_frame(skb, NULL)) { kfree_skb(skb, FREE_WRITE); - sk->state = TCP_CLOSE; - sk->err = ENETUNREACH; + sk->state = TCP_CLOSE; + sk->err = ENETUNREACH; + sk->shutdown |= SEND_SHUTDOWN; if (!sk->dead) sk->state_change(sk); - sk->dead = 1; + sk->dead = 1; } } @@ -186,20 +241,15 @@ void nr_transmit_buffer(struct sock *sk, struct sk_buff *skb) * Networking Conference paper, as is the whole state machine. 
*/ -void nr_nr_error_recovery(struct sock *sk) -{ - nr_establish_data_link(sk); -} - void nr_establish_data_link(struct sock *sk) { - sk->nr->condition = 0x00; - sk->nr->n2count = 0; + sk->protinfo.nr->condition = 0x00; + sk->protinfo.nr->n2count = 0; nr_write_internal(sk, NR_CONNREQ); - sk->nr->t2timer = 0; - sk->nr->t1timer = sk->nr->t1 = nr_calculate_t1(sk); + sk->protinfo.nr->t2timer = 0; + sk->protinfo.nr->t1timer = sk->protinfo.nr->t1 = nr_calculate_t1(sk); } /* @@ -209,33 +259,31 @@ void nr_enquiry_response(struct sock *sk) { int frametype = NR_INFOACK; - if (sk->nr->condition & OWN_RX_BUSY_CONDITION) { - frametype += NR_CHOKE_FLAG; + if (sk->protinfo.nr->condition & OWN_RX_BUSY_CONDITION) { + frametype |= NR_CHOKE_FLAG; } else { - if (skb_peek(&sk->nr->reseq_queue) != NULL) { - frametype += NR_NAK_FLAG; + if (skb_peek(&sk->protinfo.nr->reseq_queue) != NULL) { + frametype |= NR_NAK_FLAG; } } nr_write_internal(sk, frametype); - sk->nr->vl = sk->nr->vr; - sk->nr->condition &= ~ACK_PENDING_CONDITION; + sk->protinfo.nr->vl = sk->protinfo.nr->vr; + sk->protinfo.nr->condition &= ~ACK_PENDING_CONDITION; } void nr_check_iframes_acked(struct sock *sk, unsigned short nr) { - if (sk->nr->vs == nr) { + if (sk->protinfo.nr->vs == nr) { nr_frames_acked(sk, nr); - nr_requeue_frames(sk); nr_calculate_rtt(sk); - sk->nr->t1timer = 0; - sk->nr->n2count = 0; + sk->protinfo.nr->t1timer = 0; + sk->protinfo.nr->n2count = 0; } else { - if (sk->nr->va != nr) { + if (sk->protinfo.nr->va != nr) { nr_frames_acked(sk, nr); - nr_requeue_frames(sk); - sk->nr->t1timer = sk->nr->t1 = nr_calculate_t1(sk); + sk->protinfo.nr->t1timer = sk->protinfo.nr->t1 = nr_calculate_t1(sk); } } } diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index 356d3c0f6..73f5f0ba4 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -1,5 +1,5 @@ /* - * NET/ROM release 002 + * NET/ROM release 004 * * This is ALPHA test software. 
This code may break your machine, randomly fail to work with new * releases, misbehave and/or generally screw up. It might even work. @@ -14,16 +14,16 @@ * * History * NET/ROM 001 Jonathan(G4KLX) First attempt. - * - * TO DO - * Sort out the which pointer when shuffling entries in the routes - * section. Also reset the which pointer when a route becomes "good" - * again, ie when a NODES broadcast is processed via calls to - * nr_add_node(). + * NET/ROM 003 Jonathan(G4KLX) Use SIOCADDRT/SIOCDELRT ioctl values + * for NET/ROM routes. + * Use '*' for a blank mnemonic in /proc/net/nr_nodes. + * Change default quality for new neighbour when same + * as node callsign. + * Alan Cox(GW4PTS) Added the firewall hooks. */ #include <linux/config.h> -#ifdef CONFIG_NETROM +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> @@ -37,36 +37,43 @@ #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> +#include <net/arp.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <linux/fcntl.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/notifier.h> +#include <linux/firewall.h> #include <net/netrom.h> -static int nr_neigh_no = 1; +static unsigned int nr_neigh_no = 1; static struct nr_node *nr_node_list = NULL; static struct nr_neigh *nr_neigh_list = NULL; +static void nr_remove_neigh(struct nr_neigh *); + /* * Add a new route to a node, and in the process add the node and the * neighbour if it is new. 
*/ -static int nr_add_node(ax25_address *nr, char *mnemonic, ax25_address *ax25, - struct device *dev, int quality, int obs_count) +static int nr_add_node(ax25_address *nr, const char *mnemonic, ax25_address *ax25, + ax25_digi *ax25_digi, struct device *dev, int quality, int obs_count) { struct nr_node *nr_node; struct nr_neigh *nr_neigh; struct nr_route nr_route; unsigned long flags; int i, found; - + + if (nr_dev_get(nr) != NULL) /* Can't add routes to ourself */ + return -EINVAL; + for (nr_node = nr_node_list; nr_node != NULL; nr_node = nr_node->next) if (ax25cmp(nr, &nr_node->callsign) == 0) break; @@ -82,13 +89,24 @@ static int nr_add_node(ax25_address *nr, char *mnemonic, ax25_address *ax25, if ((nr_neigh = (struct nr_neigh *)kmalloc(sizeof(*nr_neigh), GFP_ATOMIC)) == NULL) return -ENOMEM; - memcpy(&nr_neigh->callsign, ax25, sizeof(ax25_address)); - - nr_neigh->dev = dev; - nr_neigh->quality = nr_default.quality; - nr_neigh->locked = 0; - nr_neigh->count = 0; - nr_neigh->number = nr_neigh_no++; + nr_neigh->callsign = *ax25; + nr_neigh->digipeat = NULL; + nr_neigh->dev = dev; + if (ax25cmp(nr, ax25) == 0) + nr_neigh->quality = quality; + else + nr_neigh->quality = sysctl_netrom_default_path_quality; + nr_neigh->locked = 0; + nr_neigh->count = 0; + nr_neigh->number = nr_neigh_no++; + + if (ax25_digi != NULL) { + if ((nr_neigh->digipeat = kmalloc(sizeof(*ax25_digi), GFP_KERNEL)) == NULL) { + kfree_s(nr_neigh, sizeof(*nr_neigh)); + return -ENOMEM; + } + *nr_neigh->digipeat = *ax25_digi; + } save_flags(flags); cli(); @@ -103,15 +121,15 @@ static int nr_add_node(ax25_address *nr, char *mnemonic, ax25_address *ax25, if ((nr_node = (struct nr_node *)kmalloc(sizeof(*nr_node), GFP_ATOMIC)) == NULL) return -ENOMEM; - memcpy(&nr_node->callsign, nr, sizeof(ax25_address)); - memcpy(&nr_node->mnemonic, mnemonic, sizeof(nr_node->mnemonic)); + nr_node->callsign = *nr; + strcpy(nr_node->mnemonic, mnemonic); nr_node->which = 0; nr_node->count = 1; nr_node->routes[0].quality = 
quality; nr_node->routes[0].obs_count = obs_count; - nr_node->routes[0].neighbour = nr_neigh->number; + nr_node->routes[0].neighbour = nr_neigh; save_flags(flags); cli(); @@ -124,10 +142,13 @@ static int nr_add_node(ax25_address *nr, char *mnemonic, ax25_address *ax25, nr_neigh->count++; return 0; + } else { + if (nr_node->mnemonic[0] == '\0') + strcpy(nr_node->mnemonic, mnemonic); } for (found = 0, i = 0; i < nr_node->count; i++) { - if (nr_node->routes[i].neighbour == nr_neigh->number) { + if (nr_node->routes[i].neighbour == nr_neigh) { nr_node->routes[i].quality = quality; nr_node->routes[i].obs_count = obs_count; found = 1; @@ -143,16 +164,22 @@ static int nr_add_node(ax25_address *nr, char *mnemonic, ax25_address *ax25, nr_node->routes[0].quality = quality; nr_node->routes[0].obs_count = obs_count; - nr_node->routes[0].neighbour = nr_neigh->number; + nr_node->routes[0].neighbour = nr_neigh; + nr_node->which++; nr_node->count++; nr_neigh->count++; } else { /* It must be better than the worst */ if (quality > nr_node->routes[2].quality) { + nr_node->routes[2].neighbour->count--; + + if (nr_node->routes[2].neighbour->count == 0 && !nr_node->routes[2].neighbour->locked) + nr_remove_neigh(nr_node->routes[2].neighbour); + nr_node->routes[2].quality = quality; nr_node->routes[2].obs_count = obs_count; - nr_node->routes[2].neighbour = nr_neigh->number; + nr_node->routes[2].neighbour = nr_neigh; nr_neigh->count++; } @@ -198,7 +225,7 @@ static int nr_add_node(ax25_address *nr, char *mnemonic, ax25_address *ax25, } for (i = 0; i < nr_node->count; i++) { - if (nr_node->routes[i].neighbour == nr_neigh->number) { + if (nr_node->routes[i].neighbour == nr_neigh) { if (i < nr_node->which) nr_node->which = i; break; @@ -248,6 +275,8 @@ static void nr_remove_neigh(struct nr_neigh *nr_neigh) if ((s = nr_neigh_list) == nr_neigh) { nr_neigh_list = nr_neigh->next; restore_flags(flags); + if (nr_neigh->digipeat != NULL) + kfree_s(nr_neigh->digipeat, sizeof(ax25_digi)); 
kfree_s(nr_neigh, sizeof(struct nr_neigh)); return; } @@ -256,6 +285,8 @@ static void nr_remove_neigh(struct nr_neigh *nr_neigh) if (s->next == nr_neigh) { s->next = nr_neigh->next; restore_flags(flags); + if (nr_neigh->digipeat != NULL) + kfree_s(nr_neigh->digipeat, sizeof(ax25_digi)); kfree_s(nr_neigh, sizeof(struct nr_neigh)); return; } @@ -289,7 +320,7 @@ static int nr_del_node(ax25_address *callsign, ax25_address *neighbour, struct d if (nr_neigh == NULL) return -EINVAL; for (i = 0; i < nr_node->count; i++) { - if (nr_node->routes[i].neighbour == nr_neigh->number) { + if (nr_node->routes[i].neighbour == nr_neigh) { nr_neigh->count--; if (nr_neigh->count == 0 && !nr_neigh->locked) @@ -336,13 +367,13 @@ static int nr_add_neigh(ax25_address *callsign, struct device *dev, unsigned int if ((nr_neigh = (struct nr_neigh *)kmalloc(sizeof(*nr_neigh), GFP_ATOMIC)) == NULL) return -ENOMEM; - memcpy(&nr_neigh->callsign, callsign, sizeof(ax25_address)); - - nr_neigh->dev = dev; - nr_neigh->quality = quality; - nr_neigh->locked = 1; - nr_neigh->count = 0; - nr_neigh->number = nr_neigh_no++; + nr_neigh->callsign = *callsign; + nr_neigh->digipeat = NULL; + nr_neigh->dev = dev; + nr_neigh->quality = quality; + nr_neigh->locked = 1; + nr_neigh->count = 0; + nr_neigh->number = nr_neigh_no++; save_flags(flags); cli(); @@ -385,7 +416,7 @@ static int nr_del_neigh(ax25_address *callsign, struct device *dev, unsigned int */ static int nr_dec_obs(void) { - struct nr_neigh *t, *nr_neigh; + struct nr_neigh *nr_neigh; struct nr_node *s, *nr_node; int i; @@ -402,21 +433,12 @@ static int nr_dec_obs(void) break; case 1: /* From 1 -> 0 */ - nr_neigh = nr_neigh_list; - - while (nr_neigh != NULL) { - t = nr_neigh; - nr_neigh = nr_neigh->next; - - if (t->number == s->routes[i].neighbour) { - t->count--; + nr_neigh = s->routes[i].neighbour; + + nr_neigh->count--; - if (t->count == 0 && !t->locked) - nr_remove_neigh(t); - - break; - } - } + if (nr_neigh->count == 0 && !nr_neigh->locked) + 
nr_remove_neigh(nr_neigh); s->count--; @@ -465,7 +487,7 @@ void nr_rt_device_down(struct device *dev) nr_node = nr_node->next; for (i = 0; i < t->count; i++) { - if (t->routes[i].neighbour == s->number) { + if (t->routes[i].neighbour == s) { t->count--; switch (i) { @@ -490,6 +512,7 @@ void nr_rt_device_down(struct device *dev) /* * Check that the device given is a valid AX.25 interface that is "up". + * Or a valid ethernet interface with an AX.25 callsign binding. */ static struct device *nr_ax25_dev_get(char *devname) { @@ -500,7 +523,7 @@ static struct device *nr_ax25_dev_get(char *devname) if ((dev->flags & IFF_UP) && dev->type == ARPHRD_AX25) return dev; - + return NULL; } @@ -538,48 +561,54 @@ struct device *nr_dev_get(ax25_address *addr) */ int nr_rt_ioctl(unsigned int cmd, void *arg) { - struct nr_node_struct nr_node; - struct nr_neigh_struct nr_neigh; + struct nr_route_struct nr_route; struct device *dev; int err; switch (cmd) { - case SIOCNRADDNODE: - if ((err = verify_area(VERIFY_READ, arg, sizeof(struct nr_node_struct))) != 0) + case SIOCADDRT: + if ((err = verify_area(VERIFY_READ, arg, sizeof(struct nr_route_struct))) != 0) return err; - memcpy_fromfs(&nr_node, arg, sizeof(struct nr_node_struct)); - if ((dev = nr_ax25_dev_get(nr_node.device)) == NULL) + copy_from_user(&nr_route, arg, sizeof(struct nr_route_struct)); + if ((dev = nr_ax25_dev_get(nr_route.device)) == NULL) return -EINVAL; - return nr_add_node(&nr_node.callsign, nr_node.mnemonic, - &nr_node.neighbour, dev, nr_node.quality, nr_node.obs_count); - - case SIOCNRDELNODE: - if ((err = verify_area(VERIFY_READ, arg, sizeof(struct nr_node_struct))) != 0) - return err; - memcpy_fromfs(&nr_node, arg, sizeof(struct nr_node_struct)); - if ((dev = nr_ax25_dev_get(nr_node.device)) == NULL) - return -EINVAL; - return nr_del_node(&nr_node.callsign, &nr_node.neighbour, dev); - - case SIOCNRADDNEIGH: - if ((err = verify_area(VERIFY_READ, arg, sizeof(struct nr_neigh_struct))) != 0) - return err; - 
memcpy_fromfs(&nr_neigh, arg, sizeof(struct nr_neigh_struct)); - if ((dev = nr_ax25_dev_get(nr_neigh.device)) == NULL) - return -EINVAL; - return nr_add_neigh(&nr_neigh.callsign, dev, nr_neigh.quality); + switch (nr_route.type) { + case NETROM_NODE: + return nr_add_node(&nr_route.callsign, + nr_route.mnemonic, + &nr_route.neighbour, + NULL, dev, nr_route.quality, + nr_route.obs_count); + case NETROM_NEIGH: + return nr_add_neigh(&nr_route.callsign, + dev, nr_route.quality); + default: + return -EINVAL; + } - case SIOCNRDELNEIGH: - if ((err = verify_area(VERIFY_READ, arg, sizeof(struct nr_neigh_struct))) != 0) + case SIOCDELRT: + if ((err = verify_area(VERIFY_READ, arg, sizeof(struct nr_route_struct))) != 0) return err; - memcpy_fromfs(&nr_neigh, arg, sizeof(struct nr_neigh_struct)); - if ((dev = nr_ax25_dev_get(nr_neigh.device)) == NULL) + copy_from_user(&nr_route, arg, sizeof(struct nr_route_struct)); + if ((dev = nr_ax25_dev_get(nr_route.device)) == NULL) return -EINVAL; - return nr_del_neigh(&nr_neigh.callsign, dev, nr_neigh.quality); + switch (nr_route.type) { + case NETROM_NODE: + return nr_del_node(&nr_route.callsign, + &nr_route.neighbour, dev); + case NETROM_NEIGH: + return nr_del_neigh(&nr_route.callsign, + dev, nr_route.quality); + default: + return -EINVAL; + } case SIOCNRDECOBS: return nr_dec_obs(); + + default: + return -EINVAL; } return 0; @@ -601,35 +630,44 @@ void nr_link_failed(ax25_address *callsign, struct device *dev) if (nr_neigh == NULL) return; for (nr_node = nr_node_list; nr_node != NULL; nr_node = nr_node->next) - if (nr_node->which >= nr_node->count && nr_node->routes[nr_node->which].neighbour == nr_neigh->number) + if (nr_node->which < nr_node->count && nr_node->routes[nr_node->which].neighbour == nr_neigh) nr_node->which++; } /* - * Route a frame to an appropriate AX.25 connection. A NULL dev means - * that the frame was generated internally. + * Route a frame to an appropriate AX.25 connection. 
A NULL ax25_cb + * indicates an internally generated frame. */ -int nr_route_frame(struct sk_buff *skb, struct device *device) +int nr_route_frame(struct sk_buff *skb, ax25_cb *ax25) { - ax25_address *ax25_src, *ax25_dest; - ax25_address *nr_src, *nr_dest; + ax25_address *nr_src, *nr_dest; struct nr_neigh *nr_neigh; struct nr_node *nr_node; struct device *dev; + unsigned char *dptr; + +#ifdef CONFIG_FIREWALL + if (ax25 != NULL && call_in_firewall(PF_NETROM, skb->dev, skb->data, NULL) != FW_ACCEPT) + return 0; + if (ax25 == NULL && call_out_firewall(PF_NETROM, skb->dev, skb->data, NULL) != FW_ACCEPT) + return 0; +#endif - ax25_dest = (ax25_address *)(skb->data + 1); - ax25_src = (ax25_address *)(skb->data + 8); - nr_src = (ax25_address *)(skb->data + 17); - nr_dest = (ax25_address *)(skb->data + 24); + nr_src = (ax25_address *)(skb->data + 0); + nr_dest = (ax25_address *)(skb->data + 7); - if (device != NULL) - nr_add_node(nr_src, "", ax25_src, device, 0, nr_default.obs_count); + if (ax25 != NULL) + nr_add_node(nr_src, "", &ax25->dest_addr, ax25->digipeat, + ax25->device, 0, sysctl_netrom_network_ttl_initialiser); if ((dev = nr_dev_get(nr_dest)) != NULL) /* Its for me */ return nr_rx_frame(skb, dev); + if (!sysctl_netrom_routing_control && ax25 != NULL) + return 0; + /* Its Time-To-Live has expired */ - if (--skb->data[31] == 0) + if (--skb->data[14] == 0) return 0; for (nr_node = nr_node_list; nr_node != NULL; nr_node = nr_node->next) @@ -639,48 +677,47 @@ int nr_route_frame(struct sk_buff *skb, struct device *device) if (nr_node == NULL || nr_node->which >= nr_node->count) return 0; - for (nr_neigh = nr_neigh_list; nr_neigh != NULL; nr_neigh = nr_neigh->next) - if (nr_neigh->number == nr_node->routes[nr_node->which].neighbour) - break; - - if (nr_neigh == NULL) - return 0; + nr_neigh = nr_node->routes[nr_node->which].neighbour; if ((dev = nr_dev_first()) == NULL) return 0; - if (device != NULL) - skb->len += dev->hard_header_len; +#ifdef CONFIG_FIREWALL + if (ax25 
!= NULL && call_fw_firewall(PF_NETROM, skb->dev, skb->data, NULL) != FW_ACCEPT) + return 0; +#endif - ax25_send_frame(skb, (ax25_address *)dev->dev_addr, &nr_neigh->callsign, nr_neigh->dev); + dptr = skb_push(skb, 1); + *dptr = AX25_P_NETROM; - return 1; + return ax25_send_frame(skb, (ax25_address *)dev->dev_addr, &nr_neigh->callsign, nr_neigh->digipeat, nr_neigh->dev); } -int nr_nodes_get_info(char *buffer, char **start, off_t offset, int length) +int nr_nodes_get_info(char *buffer, char **start, off_t offset, + int length, int dummy) { struct nr_node *nr_node; int len = 0; off_t pos = 0; off_t begin = 0; int i; - + cli(); len += sprintf(buffer, "callsign mnemonic w n qual obs neigh qual obs neigh qual obs neigh\n"); for (nr_node = nr_node_list; nr_node != NULL; nr_node = nr_node->next) { - len += sprintf(buffer + len, "%-9s %-7s %d %d ", + len += sprintf(buffer + len, "%-9s %-7s %d %d", ax2asc(&nr_node->callsign), - nr_node->mnemonic, + (nr_node->mnemonic[0] == '\0') ? "*" : nr_node->mnemonic, nr_node->which + 1, nr_node->count); for (i = 0; i < nr_node->count; i++) { - len += sprintf(buffer + len, " %3d %d %05d", + len += sprintf(buffer + len, " %3d %d %05d", nr_node->routes[i].quality, nr_node->routes[i].obs_count, - nr_node->routes[i].neighbour); + nr_node->routes[i].neighbour->number); } len += sprintf(buffer + len, "\n"); @@ -691,7 +728,7 @@ int nr_nodes_get_info(char *buffer, char **start, off_t offset, int length) len = 0; begin = pos; } - + if (pos > offset + length) break; } @@ -703,22 +740,23 @@ int nr_nodes_get_info(char *buffer, char **start, off_t offset, int length) if (len > length) len = length; - return(len); + return len; } -int nr_neigh_get_info(char *buffer, char **start, off_t offset, int length) +int nr_neigh_get_info(char *buffer, char **start, off_t offset, + int length, int dummy) { struct nr_neigh *nr_neigh; int len = 0; off_t pos = 0; off_t begin = 0; - + cli(); - len += sprintf(buffer, "addr callsign dev qual lock count\n"); + len += 
sprintf(buffer, "addr callsign dev qual lock count\n"); for (nr_neigh = nr_neigh_list; nr_neigh != NULL; nr_neigh = nr_neigh->next) { - len += sprintf(buffer + len, "%05d %-9s %-3s %3d %d %3d\n", + len += sprintf(buffer + len, "%05d %-9s %-4s %3d %d %3d\n", nr_neigh->number, ax2asc(&nr_neigh->callsign), nr_neigh->dev ? nr_neigh->dev->name : "???", @@ -744,7 +782,34 @@ int nr_neigh_get_info(char *buffer, char **start, off_t offset, int length) if (len > length) len = length; - return(len); + return len; } +#ifdef MODULE + +/* + * Free all memory associated with the nodes and routes lists. + */ +void nr_rt_free(void) +{ + struct nr_neigh *s, *nr_neigh = nr_neigh_list; + struct nr_node *t, *nr_node = nr_node_list; + + while (nr_node != NULL) { + t = nr_node; + nr_node = nr_node->next; + + nr_remove_node(t); + } + + while (nr_neigh != NULL) { + s = nr_neigh; + nr_neigh = nr_neigh->next; + + nr_remove_neigh(s); + } +} + +#endif + #endif diff --git a/net/netrom/nr_subr.c b/net/netrom/nr_subr.c index 3d5c2fc12..ab788f6af 100644 --- a/net/netrom/nr_subr.c +++ b/net/netrom/nr_subr.c @@ -1,5 +1,5 @@ /* - * NET/ROM release 002 + * NET/ROM release 004 * * This is ALPHA test software. This code may break your machine, randomly fail to work with new * releases, misbehave and/or generally screw up. It might even work. @@ -14,10 +14,11 @@ * * History * NET/ROM 001 Jonathan(G4KLX) Cloned from ax25_subr.c + * NET/ROM 003 Jonathan(G4KLX) Added G8BPQ NET/ROM extensions. */ #include <linux/config.h> -#ifdef CONFIG_NETROM +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> @@ -33,7 +34,7 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <linux/fcntl.h> #include <linux/mm.h> @@ -41,9 +42,9 @@ #include <net/netrom.h> /* - * This routine purges the input queue of frames. 
+ * This routine purges all of the queues of frames. */ -void nr_clear_tx_queue(struct sock *sk) +void nr_clear_queues(struct sock *sk) { struct sk_buff *skb; @@ -53,14 +54,17 @@ void nr_clear_tx_queue(struct sock *sk) kfree_skb(skb, FREE_WRITE); } - while ((skb = skb_dequeue(&sk->nr->ack_queue)) != NULL) { + while ((skb = skb_dequeue(&sk->protinfo.nr->ack_queue)) != NULL) { skb->sk = sk; skb->free = 1; kfree_skb(skb, FREE_WRITE); } - while ((skb = skb_dequeue(&sk->nr->reseq_queue)) != NULL) { - skb->free = 1; + while ((skb = skb_dequeue(&sk->protinfo.nr->reseq_queue)) != NULL) { + kfree_skb(skb, FREE_READ); + } + + while ((skb = skb_dequeue(&sk->protinfo.nr->frag_queue)) != NULL) { kfree_skb(skb, FREE_READ); } } @@ -77,13 +81,13 @@ void nr_frames_acked(struct sock *sk, unsigned short nr) /* * Remove all the ack-ed frames from the ack queue. */ - if (sk->nr->va != nr) { - while (skb_peek(&sk->nr->ack_queue) != NULL && sk->nr->va != nr) { - skb = skb_dequeue(&sk->nr->ack_queue); + if (sk->protinfo.nr->va != nr) { + while (skb_peek(&sk->protinfo.nr->ack_queue) != NULL && sk->protinfo.nr->va != nr) { + skb = skb_dequeue(&sk->protinfo.nr->ack_queue); skb->sk = sk; skb->free = 1; kfree_skb(skb, FREE_WRITE); - sk->nr->va = (sk->nr->va + 1) % NR_MODULUS; + sk->protinfo.nr->va = (sk->protinfo.nr->va + 1) % NR_MODULUS; } } } @@ -97,7 +101,7 @@ void nr_requeue_frames(struct sock *sk) { struct sk_buff *skb, *skb_prev = NULL; - while ((skb = skb_dequeue(&sk->nr->ack_queue)) != NULL) { + while ((skb = skb_dequeue(&sk->protinfo.nr->ack_queue)) != NULL) { if (skb_prev == NULL) skb_queue_head(&sk->write_queue, skb); else @@ -112,14 +116,14 @@ void nr_requeue_frames(struct sock *sk) */ int nr_validate_nr(struct sock *sk, unsigned short nr) { - unsigned short vc = sk->nr->va; + unsigned short vc = sk->protinfo.nr->va; - while (vc != sk->nr->vs) { + while (vc != sk->protinfo.nr->vs) { if (nr == vc) return 1; vc = (vc + 1) % NR_MODULUS; } - if (nr == sk->nr->vs) return 1; + if (nr == 
sk->protinfo.nr->vs) return 1; return 0; } @@ -129,15 +133,13 @@ int nr_validate_nr(struct sock *sk, unsigned short nr) */ int nr_in_rx_window(struct sock *sk, unsigned short ns) { - unsigned short vc = sk->nr->vl; - unsigned short vt = (sk->nr->vl + sk->window) % NR_MODULUS; + unsigned short vc = sk->protinfo.nr->vr; + unsigned short vt = (sk->protinfo.nr->vl + sk->window) % NR_MODULUS; while (vc != vt) { if (ns == vc) return 1; vc = (vc + 1) % NR_MODULUS; } - - if (ns == vt) return 1; return 0; } @@ -150,73 +152,89 @@ void nr_write_internal(struct sock *sk, int frametype) { struct sk_buff *skb; unsigned char *dptr; - int len; + int len, timeout; + + len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + NR_NETWORK_LEN + NR_TRANSPORT_LEN; switch (frametype & 0x0F) { - case NR_CONNREQ: len = 52; break; - case NR_CONNACK: len = 38; break; - case NR_DISCREQ: len = 37; break; - case NR_DISCACK: len = 37; break; - case NR_INFOACK: len = 37; break; + case NR_CONNREQ: + len += 17; + break; + case NR_CONNACK: + len += (sk->protinfo.nr->bpqext) ? 
2 : 1; + break; + case NR_DISCREQ: + case NR_DISCACK: + case NR_INFOACK: + break; default: - printk("nr_write_internal: invalid frame type %d\n", frametype); + printk(KERN_ERR "nr_write_internal: invalid frame type %d\n", frametype); return; } if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) return; - dptr = skb->data + 32; + /* + * Space for AX.25 and NET/ROM network header + */ + skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + NR_NETWORK_LEN); + + dptr = skb_put(skb, skb_tailroom(skb)); switch (frametype & 0x0F) { case NR_CONNREQ: - *dptr++ = sk->nr->my_index; - *dptr++ = sk->nr->my_id; - *dptr++ = 0; - *dptr++ = 0; - *dptr++ = frametype; - *dptr++ = sk->window; - memcpy(dptr, &sk->nr->user_addr, sizeof(ax25_address)); + timeout = (sk->protinfo.nr->rtt / PR_SLOWHZ) * 2; + *dptr++ = sk->protinfo.nr->my_index; + *dptr++ = sk->protinfo.nr->my_id; + *dptr++ = 0; + *dptr++ = 0; + *dptr++ = frametype; + *dptr++ = sk->window; + memcpy(dptr, &sk->protinfo.nr->user_addr, AX25_ADDR_LEN); dptr[6] &= ~LAPB_C; dptr[6] &= ~LAPB_E; - dptr[6] |= SSID_SPARE; - dptr += 7; - memcpy(dptr, &sk->nr->source_addr, sizeof(ax25_address)); + dptr[6] |= SSSID_SPARE; + dptr += AX25_ADDR_LEN; + memcpy(dptr, &sk->protinfo.nr->source_addr, AX25_ADDR_LEN); dptr[6] &= ~LAPB_C; dptr[6] &= ~LAPB_E; - dptr[6] |= SSID_SPARE; + dptr[6] |= SSSID_SPARE; + dptr += AX25_ADDR_LEN; + *dptr++ = timeout % 256; + *dptr++ = timeout / 256; break; case NR_CONNACK: - *dptr++ = sk->nr->your_index; - *dptr++ = sk->nr->your_id; - *dptr++ = sk->nr->my_index; - *dptr++ = sk->nr->my_id; + *dptr++ = sk->protinfo.nr->your_index; + *dptr++ = sk->protinfo.nr->your_id; + *dptr++ = sk->protinfo.nr->my_index; + *dptr++ = sk->protinfo.nr->my_id; *dptr++ = frametype; *dptr++ = sk->window; + if (sk->protinfo.nr->bpqext) *dptr++ = sysctl_netrom_network_ttl_initialiser; break; case NR_DISCREQ: case NR_DISCACK: - *dptr++ = sk->nr->your_index; - *dptr++ = sk->nr->your_id; + *dptr++ = sk->protinfo.nr->your_index; + *dptr++ 
= sk->protinfo.nr->your_id; *dptr++ = 0; *dptr++ = 0; *dptr++ = frametype; break; case NR_INFOACK: - *dptr++ = sk->nr->your_index; - *dptr++ = sk->nr->your_id; + *dptr++ = sk->protinfo.nr->your_index; + *dptr++ = sk->protinfo.nr->your_id; *dptr++ = 0; - *dptr++ = sk->nr->vr; + *dptr++ = sk->protinfo.nr->vr; *dptr++ = frametype; break; } skb->free = 1; - skb->len = len; nr_transmit_buffer(sk, skb); } @@ -229,37 +247,39 @@ void nr_transmit_dm(struct sk_buff *skb) { struct sk_buff *skbn; unsigned char *dptr; + int len; - if ((skbn = alloc_skb(38, GFP_ATOMIC)) == NULL) + len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + NR_NETWORK_LEN + NR_TRANSPORT_LEN + 1; + + if ((skbn = alloc_skb(len, GFP_ATOMIC)) == NULL) return; - dptr = skbn->data + 16; + skb_reserve(skbn, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN); - *dptr++ = AX25_P_NETROM; - - memcpy(dptr, skb->data + 24, 7); + dptr = skb_put(skbn, NR_NETWORK_LEN + NR_TRANSPORT_LEN); + + memcpy(dptr, skb->data + 7, AX25_ADDR_LEN); dptr[6] &= ~LAPB_C; dptr[6] &= ~LAPB_E; - dptr[6] |= SSID_SPARE; - dptr += 7; + dptr[6] |= SSSID_SPARE; + dptr += AX25_ADDR_LEN; - memcpy(dptr, skb->data + 17, 7); + memcpy(dptr, skb->data + 0, AX25_ADDR_LEN); dptr[6] &= ~LAPB_C; dptr[6] |= LAPB_E; - dptr[6] |= SSID_SPARE; - dptr += 7; + dptr[6] |= SSSID_SPARE; + dptr += AX25_ADDR_LEN; - *dptr++ = nr_default.ttl; + *dptr++ = sysctl_netrom_network_ttl_initialiser; - *dptr++ = skb->data[32]; - *dptr++ = skb->data[33]; + *dptr++ = skb->data[15]; + *dptr++ = skb->data[16]; *dptr++ = 0; *dptr++ = 0; - *dptr++ = NR_CONNACK + NR_CHOKE_FLAG; + *dptr++ = NR_CONNACK | NR_CHOKE_FLAG; *dptr++ = 0; skbn->free = 1; - skbn->len = 38; skbn->sk = NULL; if (!nr_route_frame(skbn, NULL)) @@ -273,10 +293,12 @@ unsigned short nr_calculate_t1(struct sock *sk) { int n, t; - for (t = 2, n = 0; n < sk->nr->n2count; n++) + for (t = 2, n = 0; n < sk->protinfo.nr->n2count; n++) t *= 2; - return t * sk->nr->rtt; + if (t > 8) t = 8; + + return t * sk->protinfo.nr->rtt; } /* @@ 
-284,12 +306,22 @@ unsigned short nr_calculate_t1(struct sock *sk) */ void nr_calculate_rtt(struct sock *sk) { - if (sk->nr->n2count == 0) - sk->nr->rtt = (9 * sk->nr->rtt + sk->nr->t1 - sk->nr->t1timer) / 10; - - /* Don't go below one second */ - if (sk->nr->rtt < 1 * PR_SLOWHZ) - sk->nr->rtt = 1 * PR_SLOWHZ; + if (sk->protinfo.nr->t1timer > 0 && sk->protinfo.nr->n2count == 0) + sk->protinfo.nr->rtt = (9 * sk->protinfo.nr->rtt + sk->protinfo.nr->t1 - sk->protinfo.nr->t1timer) / 10; + +#ifdef NR_T1CLAMPLO + /* Don't go below one tenth of a second */ + if (sk->protinfo.nr->rtt < (NR_T1CLAMPLO)) + sk->protinfo.nr->rtt = (NR_T1CLAMPLO); +#else /* Failsafe - some people might have sub 1/10th RTTs :-) **/ + if (sk->protinfo.nr->rtt == 0) + sk->protinfo.nr->rtt = PR_SLOWHZ; +#endif +#ifdef NR_T1CLAMPHI + /* OR above clamped seconds **/ + if (sk->protinfo.nr->rtt > (NR_T1CLAMPHI)) + sk->protinfo.nr->rtt = (NR_T1CLAMPHI); +#endif } #endif diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c index 2e9269f13..0149851fd 100644 --- a/net/netrom/nr_timer.c +++ b/net/netrom/nr_timer.c @@ -1,5 +1,5 @@ /* - * NET/ROM release 002 + * NET/ROM release 004 * * This is ALPHA test software. This code may break your machine, randomly fail to work with new * releases, misbehave and/or generally screw up. It might even work. 
@@ -17,7 +17,7 @@ */ #include <linux/config.h> -#ifdef CONFIG_NETROM +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> @@ -33,7 +33,7 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> -#include <asm/segment.h> +#include <asm/uaccess.h> #include <asm/system.h> #include <linux/fcntl.h> #include <linux/mm.h> @@ -58,7 +58,7 @@ void nr_set_timer(struct sock *sk) sk->timer.data = (unsigned long)sk; sk->timer.function = &nr_timer; - sk->timer.expires = 10; + sk->timer.expires = jiffies+10; add_timer(&sk->timer); } @@ -73,25 +73,25 @@ static void nr_reset_timer(struct sock *sk) sk->timer.data = (unsigned long)sk; sk->timer.function = &nr_timer; - sk->timer.expires = 10; + sk->timer.expires = jiffies+10; add_timer(&sk->timer); } /* * NET/ROM TIMER * - * This routine is called every 500ms. Decrement timer by this + * This routine is called every 100ms. Decrement timer by this * amount - if expired then process the event. */ static void nr_timer(unsigned long param) { struct sock *sk = (struct sock *)param; - switch (sk->nr->state) { + switch (sk->protinfo.nr->state) { case NR_STATE_0: /* Magic here: If we listen() and a new link dies before it - is accepted() it isnt 'dead' so doesnt get removed. */ - if (sk->dead) { + is accepted() it isn't 'dead' so doesn't get removed. */ + if (sk->destroy || (sk->state == TCP_LISTEN && sk->dead)) { del_timer(&sk->timer); nr_destroy_socket(sk); return; @@ -102,11 +102,11 @@ static void nr_timer(unsigned long param) /* * Check for the state of the receive buffer. 
*/ - if (sk->rmem_alloc < (sk->rcvbuf / 2) && (sk->nr->condition & OWN_RX_BUSY_CONDITION)) { - sk->nr->condition &= ~OWN_RX_BUSY_CONDITION; + if (sk->rmem_alloc < (sk->rcvbuf / 2) && (sk->protinfo.nr->condition & OWN_RX_BUSY_CONDITION)) { + sk->protinfo.nr->condition &= ~OWN_RX_BUSY_CONDITION; nr_write_internal(sk, NR_INFOACK); - sk->nr->condition &= ~ACK_PENDING_CONDITION; - sk->nr->vl = sk->nr->vr; + sk->protinfo.nr->condition &= ~ACK_PENDING_CONDITION; + sk->protinfo.nr->vl = sk->protinfo.nr->vr; break; } /* @@ -119,72 +119,75 @@ static void nr_timer(unsigned long param) break; } - if (sk->nr->t2timer > 0 && --sk->nr->t2timer == 0) { - if (sk->nr->state == NR_STATE_3) { - if (sk->nr->condition & ACK_PENDING_CONDITION) { - sk->nr->condition &= ~ACK_PENDING_CONDITION; + if (sk->protinfo.nr->t2timer > 0 && --sk->protinfo.nr->t2timer == 0) { + if (sk->protinfo.nr->state == NR_STATE_3) { + if (sk->protinfo.nr->condition & ACK_PENDING_CONDITION) { + sk->protinfo.nr->condition &= ~ACK_PENDING_CONDITION; nr_enquiry_response(sk); } } } - if (sk->nr->t4timer > 0 && --sk->nr->t4timer == 0) { - sk->nr->condition &= ~PEER_RX_BUSY_CONDITION; + if (sk->protinfo.nr->t4timer > 0 && --sk->protinfo.nr->t4timer == 0) { + sk->protinfo.nr->condition &= ~PEER_RX_BUSY_CONDITION; } - if (sk->nr->t1timer == 0 || --sk->nr->t1timer > 0) { + if (sk->protinfo.nr->t1timer == 0 || --sk->protinfo.nr->t1timer > 0) { nr_reset_timer(sk); return; } - switch (sk->nr->state) { + switch (sk->protinfo.nr->state) { case NR_STATE_1: - if (sk->nr->n2count == sk->nr->n2) { - nr_clear_tx_queue(sk); - sk->nr->state = NR_STATE_0; - sk->state = TCP_CLOSE; - sk->err = ETIMEDOUT; + if (sk->protinfo.nr->n2count == sk->protinfo.nr->n2) { + nr_clear_queues(sk); + sk->protinfo.nr->state = NR_STATE_0; + sk->state = TCP_CLOSE; + sk->err = ETIMEDOUT; + sk->shutdown |= SEND_SHUTDOWN; if (!sk->dead) sk->state_change(sk); - sk->dead = 1; + sk->dead = 1; } else { - sk->nr->n2count++; + sk->protinfo.nr->n2count++; 
nr_write_internal(sk, NR_CONNREQ); } break; case NR_STATE_2: - if (sk->nr->n2count == sk->nr->n2) { - nr_clear_tx_queue(sk); - sk->nr->state = NR_STATE_0; - sk->state = TCP_CLOSE; - sk->err = ETIMEDOUT; + if (sk->protinfo.nr->n2count == sk->protinfo.nr->n2) { + nr_clear_queues(sk); + sk->protinfo.nr->state = NR_STATE_0; + sk->state = TCP_CLOSE; + sk->err = ETIMEDOUT; + sk->shutdown |= SEND_SHUTDOWN; if (!sk->dead) sk->state_change(sk); - sk->dead = 1; + sk->dead = 1; } else { - sk->nr->n2count++; + sk->protinfo.nr->n2count++; nr_write_internal(sk, NR_DISCREQ); } break; case NR_STATE_3: - if (sk->nr->n2count == sk->nr->n2) { - nr_clear_tx_queue(sk); - sk->nr->state = NR_STATE_0; - sk->state = TCP_CLOSE; - sk->err = ETIMEDOUT; + if (sk->protinfo.nr->n2count == sk->protinfo.nr->n2) { + nr_clear_queues(sk); + sk->protinfo.nr->state = NR_STATE_0; + sk->state = TCP_CLOSE; + sk->err = ETIMEDOUT; + sk->shutdown |= SEND_SHUTDOWN; if (!sk->dead) sk->state_change(sk); - sk->dead = 1; + sk->dead = 1; } else { - sk->nr->n2count++; + sk->protinfo.nr->n2count++; nr_requeue_frames(sk); } break; } - sk->nr->t1timer = sk->nr->t1 = nr_calculate_t1(sk); + sk->protinfo.nr->t1timer = sk->protinfo.nr->t1 = nr_calculate_t1(sk); nr_set_timer(sk); } diff --git a/net/netrom/sysctl_net_netrom.c b/net/netrom/sysctl_net_netrom.c new file mode 100644 index 000000000..3cbc0b761 --- /dev/null +++ b/net/netrom/sysctl_net_netrom.c @@ -0,0 +1,89 @@ +/* -*- linux-c -*- + * sysctl_net_netrom.c: sysctl interface to net NET/ROM subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/netrom directory entry (empty =) ). [MS] + */ + +#include <linux/mm.h> +#include <linux/sysctl.h> +#include <net/ax25.h> +#include <net/netrom.h> + +/* + * Values taken from NET/ROM documentation. 
+ */ +static int min_quality[] = {0}, max_quality[] = {255}; +static int min_obs[] = {0}, max_obs[] = {255}; +static int min_ttl[] = {0}, max_ttl[] = {255}; +static int min_t1[] = {5 * PR_SLOWHZ}; +static int max_t1[] = {600 * PR_SLOWHZ}; +static int min_n2[] = {2}, max_n2[] = {127}; +static int min_t2[] = {1 * PR_SLOWHZ}; +static int max_t2[] = {60 * PR_SLOWHZ}; +static int min_t4[] = {1 * PR_SLOWHZ}; +static int max_t4[] = {1000 * PR_SLOWHZ}; +static int min_window[] = {1}, max_window[] = {127}; +static int min_idle[] = {0 * PR_SLOWHZ}; +static int max_idle[] = {65535 * PR_SLOWHZ}; +static int min_n1[] = {1}, max_n1[] = {236}; +static int min_route[] = {0}, max_route[] = {1}; + +static struct ctl_table_header *nr_table_header; + +static ctl_table nr_table[] = { + {NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality", + &sysctl_netrom_default_path_quality, sizeof(int), 0644, NULL, + &proc_dointvec_minmax, &sysctl_intvec, NULL, &min_quality, &max_quality}, + {NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser", + &sysctl_netrom_obsolescence_count_initialiser, sizeof(int), 0644, NULL, + &proc_dointvec_minmax, &sysctl_intvec, NULL, &min_obs, &max_obs}, + {NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser", + &sysctl_netrom_network_ttl_initialiser, sizeof(int), 0644, NULL, + &proc_dointvec_minmax, &sysctl_intvec, NULL, &min_ttl, &max_ttl}, + {NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout", + &sysctl_netrom_transport_timeout, sizeof(int), 0644, NULL, + &proc_dointvec_minmax, &sysctl_intvec, NULL, &min_t1, &max_t1}, + {NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries", + &sysctl_netrom_transport_maximum_tries, sizeof(int), 0644, NULL, + &proc_dointvec_minmax, &sysctl_intvec, NULL, &min_n2, &max_n2}, + {NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay", + &sysctl_netrom_transport_acknowledge_delay, sizeof(int), 0644, NULL, + &proc_dointvec_minmax, &sysctl_intvec, NULL, &min_t2, &max_t2}, + 
{NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay", + &sysctl_netrom_transport_busy_delay, sizeof(int), 0644, NULL, + &proc_dointvec_minmax, &sysctl_intvec, NULL, &min_t4, &max_t4}, + {NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size", + &sysctl_netrom_transport_requested_window_size, sizeof(int), 0644, NULL, + &proc_dointvec_minmax, &sysctl_intvec, NULL, &min_window, &max_window}, + {NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout", + &sysctl_netrom_transport_no_activity_timeout, sizeof(int), 0644, NULL, + &proc_dointvec_minmax, &sysctl_intvec, NULL, &min_idle, &max_idle}, + {NET_NETROM_TRANSPORT_PACKET_LENGTH, "transport_packet_length", + &sysctl_netrom_transport_packet_length, sizeof(int), 0644, NULL, + &proc_dointvec_minmax, &sysctl_intvec, NULL, &min_n1, &max_n1}, + {NET_NETROM_ROUTING_CONTROL, "routing_control", + &sysctl_netrom_routing_control, sizeof(int), 0644, NULL, + &proc_dointvec_minmax, &sysctl_intvec, NULL, &min_route, &max_route}, + {0} +}; + +static ctl_table nr_dir_table[] = { + {NET_NETROM, "netrom", NULL, 0, 0555, nr_table}, + {0} +}; + +static ctl_table nr_root_table[] = { + {CTL_NET, "net", NULL, 0, 0555, nr_dir_table}, + {0} +}; + +void nr_register_sysctl(void) +{ + nr_table_header = register_sysctl_table(nr_root_table, 1); +} + +void nr_unregister_sysctl(void) +{ + unregister_sysctl_table(nr_table_header); +} diff --git a/net/netsyms.c b/net/netsyms.c new file mode 100644 index 000000000..22f253d63 --- /dev/null +++ b/net/netsyms.c @@ -0,0 +1,270 @@ +/* + * linux/net/netsyms.c + * + * Symbol table for the linux networking subsystem. Moved here to + * make life simpler in ksyms.c. 
+ */ + +#include <linux/config.h> +#include <linux/module.h> + +#include <linux/types.h> +#include <linux/net.h> +#include <linux/in.h> +#include <linux/netdevice.h> +#include <linux/trdevice.h> +#include <linux/ioport.h> + +#ifdef CONFIG_INET +#include <linux/ip.h> +#include <linux/etherdevice.h> +#include <net/protocol.h> +#include <net/arp.h> +#include <net/ip.h> +#include <net/udp.h> +#include <net/tcp.h> +#include <net/icmp.h> +#include <net/route.h> +#include <net/inet_common.h> +#include <linux/net_alias.h> + +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) +#include <linux/in6.h> +#include <net/ndisc.h> +#include <net/transp_v6.h> +#endif + +#endif + +#ifdef CONFIG_NETLINK +#include <net/netlink.h> +#endif + +#ifdef CONFIG_NET_ALIAS +#include <linux/net_alias.h> +#endif + +#if defined(CONFIG_ULTRA) || defined(CONFIG_WD80x3) || \ + defined(CONFIG_EL2) || defined(CONFIG_NE2000) || \ + defined(CONFIG_E2100) || defined(CONFIG_HPLAN_PLUS) || \ + defined(CONFIG_HPLAN) || defined(CONFIG_AC3200) +#include "../drivers/net/8390.h" +#endif + +extern int (*rarp_ioctl_hook)(int,void*); + +#ifdef CONFIG_IPX_MODULE +extern struct datalink_proto *make_EII_client(void); +extern struct datalink_proto *make_8023_client(void); +extern void destroy_EII_client(struct datalink_proto *); +extern void destroy_8023_client(struct datalink_proto *); +#endif + +extern char *skb_push_errstr; +extern char *skb_put_errstr; + +static struct symbol_table net_syms = { +#include <linux/symtab_begin.h> + + /* Skbuff symbols. 
*/ + X(skb_push_errstr), + X(skb_put_errstr), + + /* Socket layer registration */ + X(sock_register), + X(sock_unregister), + + /* Socket layer support routines */ + X(memcpy_fromiovec), + X(sock_setsockopt), + X(sock_getsockopt), + X(sk_alloc), + X(sk_free), + X(sock_wake_async), + X(sock_alloc_send_skb), + X(skb_recv_datagram), + X(skb_free_datagram), + X(skb_copy_datagram), + X(skb_copy_datagram_iovec), + X(datagram_select), + +#ifdef CONFIG_IPX_MODULE + X(make_8023_client), + X(destroy_8023_client), + X(make_EII_client), + X(destroy_EII_client), +#endif + +#ifdef CONFIG_INET + /* Internet layer registration */ + X(get_new_socknum), + X(inet_add_protocol), + X(inet_del_protocol), + X(rarp_ioctl_hook), + X(init_etherdev), + X(ip_rt_route), + X(icmp_send), + X(ip_options_compile), + X(ip_rt_put), + X(arp_send), + X(ip_id_count), + X(ip_send_check), +#ifdef CONFIG_IP_FORWARD + X(ip_forward), +#endif + +#ifdef CONFIG_IPV6_MODULE + /* inet functions common to v4 and v6 */ + X(inet_proto_ops), + X(inet_remove_sock), + X(inet_release), + X(inet_connect), + X(inet_accept), + X(inet_select), + X(inet_listen), + X(inet_shutdown), + X(inet_setsockopt), + X(inet_getsockopt), + X(inet_fcntl), + X(inet_sendmsg), + X(inet_recvmsg), + X(tcp_sock_array), + X(udp_sock_array), + X(destroy_sock), + X(ip_queue_xmit), + X(csum_partial), + X(ip_my_addr), + X(skb_copy), + X(dev_lockct), + X(ndisc_eth_hook), + X(memcpy_fromiovecend), + X(csum_partial_copy), + X(csum_partial_copy_fromiovecend), + X(__release_sock), + X(net_timer), + X(inet_put_sock), + /* UDP/TCP exported functions for TCPv6 */ + X(udp_ioctl), + X(udp_connect), + X(udp_sendmsg), + X(tcp_cache_zap), + X(tcp_close), + X(tcp_accept), + X(tcp_write_wakeup), + X(tcp_read_wakeup), + X(tcp_select), + X(tcp_ioctl), + X(tcp_shutdown), + X(tcp_setsockopt), + X(tcp_getsockopt), + X(tcp_recvmsg), + X(tcp_send_synack), + X(sock_wfree), + X(sock_wmalloc), + X(tcp_reset_xmit_timer), + X(tcp_parse_options), + X(tcp_rcv_established), + 
X(tcp_init_xmit_timers), + X(tcp_clear_xmit_timers), + X(tcp_slt_array), + X(__tcp_inc_slow_timer), + X(tcp_statistics), + X(tcp_rcv_state_process), + X(tcp_do_sendmsg), + X(tcp_v4_build_header), + X(tcp_v4_rebuild_header), + X(tcp_v4_send_check), + X(tcp_v4_conn_request), + X(tcp_v4_syn_recv_sock), + X(tcp_v4_backlog_rcv), + X(tcp_v4_connect), + X(ip_chk_addr), + X(net_reset_timer), + X(net_delete_timer), + X(udp_prot), + X(tcp_prot), + X(ipv4_specific), +#endif + +#if defined(CONFIG_ULTRA) || defined(CONFIG_WD80x3) || \ + defined(CONFIG_EL2) || defined(CONFIG_NE2000) || \ + defined(CONFIG_E2100) || defined(CONFIG_HPLAN_PLUS) || \ + defined(CONFIG_HPLAN) || defined(CONFIG_AC3200) + /* If 8390 NIC support is built in, we will need these. */ + X(ei_open), + X(ei_close), + X(ei_debug), + X(ei_interrupt), + X(ethdev_init), + X(NS8390_init), +#endif + +#ifdef CONFIG_TR + X(tr_setup), + X(tr_type_trans), +#endif + +#ifdef CONFIG_NET_ALIAS +#include <linux/net_alias.h> +#endif + +#endif /* CONFIG_INET */ + + /* Device callback registration */ + X(register_netdevice_notifier), + X(unregister_netdevice_notifier), + +#ifdef CONFIG_NET_ALIAS + X(register_net_alias_type), + X(unregister_net_alias_type), +#endif + + /* support for loadable net drivers */ +#ifdef CONFIG_INET + X(register_netdev), + X(unregister_netdev), + X(ether_setup), + X(eth_type_trans), + X(eth_copy_and_sum), + X(arp_query), + X(alloc_skb), + X(kfree_skb), + X(skb_clone), + X(dev_alloc_skb), + X(dev_kfree_skb), + X(skb_device_unlock), + X(skb_device_locked), + X(netif_rx), + X(dev_tint), + X(irq2dev_map), + X(dev_add_pack), + X(dev_remove_pack), + X(dev_get), + X(dev_ioctl), + X(dev_queue_xmit), + X(dev_base), + X(dev_close), + X(dev_mc_add), + X(arp_find), + X(n_tty_ioctl), + X(tty_register_ldisc), + X(kill_fasync), + X(arp_query), + X(ip_rcv), + X(arp_rcv), +#endif /* CONFIG_INET */ + +#ifdef CONFIG_NETLINK + X(netlink_attach), + X(netlink_detach), + X(netlink_donothing), + X(netlink_post), +#endif /* 
CONFIG_NETLINK */ + +#include <linux/symtab_end.h> +}; + +void export_net_symbols(void) +{ + register_symtab(&net_syms); +} diff --git a/net/protocols.c b/net/protocols.c index 76def9857..d5090bc47 100644 --- a/net/protocols.c +++ b/net/protocols.c @@ -8,29 +8,42 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/net.h> - +#include <linux/fs.h> #define CONFIG_UNIX /* always present... */ #ifdef CONFIG_UNIX -#include <net/unix.h> +#include <net/af_unix.h> #endif + #ifdef CONFIG_INET #include <linux/inet.h> +#ifdef CONFIG_IPV6 +extern void inet6_proto_init(struct net_proto *pro); #endif -#ifdef CONFIG_IPX +#endif /* INET */ + +#if defined(CONFIG_IPX) || defined(CONFIG_IPX_MODULE) #include <net/ipxcall.h> #include <net/p8022call.h> +#include <net/p8022trcall.h> +#endif +#ifdef CONFIG_X25 +#include <net/x25call.h> #endif #ifdef CONFIG_AX25 #include <net/ax25call.h> #ifdef CONFIG_NETROM #include <net/nrcall.h> #endif +#ifdef CONFIG_ROSE +#include <net/rosecall.h> #endif -#ifdef CONFIG_ATALK -#ifndef CONFIG_IPX +#endif +#if defined(CONFIG_ATALK) || defined(CONFIG_ATALK_MODULE) +#if ! 
( defined(CONFIG_IPX) || defined(CONFIG_IPX_MODULE) ) #include <net/p8022call.h> +#include <net/p8022trcall.h> #endif #include <net/atalkcall.h> #endif @@ -48,21 +61,29 @@ struct net_proto protocols[] = { #ifdef CONFIG_UNIX { "UNIX", unix_proto_init }, /* Unix domain socket family */ #endif -#if defined(CONFIG_IPX)||defined(CONFIG_ATALK) +#if defined(CONFIG_IPX) || defined(CONFIG_IPX_MODULE) || \ + defined(CONFIG_ATALK) || defined(CONFIG_ATALK_MODULE) { "802.2", p8022_proto_init }, /* 802.2 demultiplexor */ + { "802.2TR", p8022tr_proto_init }, /* 802.2 demultiplexor */ { "SNAP", snap_proto_init }, /* SNAP demultiplexor */ #endif #ifdef CONFIG_TR { "RIF", rif_init }, /* RIF for Token ring */ #endif #ifdef CONFIG_AX25 - { "AX.25", ax25_proto_init }, + { "AX.25", ax25_proto_init }, /* Amateur Radio AX.25 */ #ifdef CONFIG_NETROM - { "NET/ROM", nr_proto_init }, + { "NET/ROM", nr_proto_init }, /* Amateur Radio NET/ROM */ +#endif +#ifdef CONFIG_ROSE + { "Rose", rose_proto_init }, /* Amateur Radio X.25 PLP */ #endif #endif #ifdef CONFIG_INET { "INET", inet_proto_init }, /* TCP/IP */ +#ifdef CONFIG_IPV6 + { "INET6", inet6_proto_init}, /* IPv6 */ +#endif #endif #ifdef CONFIG_IPX { "IPX", ipx_proto_init }, /* IPX */ @@ -70,7 +91,8 @@ struct net_proto protocols[] = { #ifdef CONFIG_ATALK { "DDP", atalk_proto_init }, /* Netatalk Appletalk driver */ #endif +#ifdef CONFIG_X25 + { "X.25", x25_proto_init }, /* CCITT X.25 Packet Layer */ +#endif { NULL, NULL } /* End marker */ }; - - diff --git a/net/rose/Makefile b/net/rose/Makefile new file mode 100644 index 000000000..0d71de9cf --- /dev/null +++ b/net/rose/Makefile @@ -0,0 +1,17 @@ +# +# Makefile for the Linux Rose (X.25 PLP) layer. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definition is now in the main makefile... 
+ +O_TARGET := rose.o +O_OBJS := af_rose.o sysctl_net_rose.o rose_dev.o rose_in.o rose_link.o rose_out.o rose_route.o rose_subr.o rose_timer.o +M_OBJS := $(O_TARGET) + +include $(TOPDIR)/Rules.make + +tar: + tar -cvf /dev/f1 . diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c new file mode 100644 index 000000000..4b8acd3f8 --- /dev/null +++ b/net/rose/af_rose.c @@ -0,0 +1,1459 @@ +/* + * Rose release 001 + * + * This is ALPHA test software. This code may break your machine, randomly fail to work with new + * releases, misbehave and/or generally screw up. It might even work. + * + * This code REQUIRES 2.1.0 or higher/ NET3.029 + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * Rose 001 Jonathan(G4KLX) Cloned from af_netrom.c. + */ + +#include <linux/config.h> +#if defined(CONFIG_ROSE) || defined(CONFIG_ROSE_MODULE) +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/stat.h> +#include <net/ax25.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <asm/segment.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <linux/fcntl.h> +#include <linux/termios.h> /* For TIOCINQ/OUTQ */ +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/notifier.h> +#include <net/rose.h> +#include <linux/proc_fs.h> +#include <net/ip.h> +#include <net/arp.h> +#include <linux/if_arp.h> + +int sysctl_rose_restart_request_timeout = ROSE_DEFAULT_T0; +int sysctl_rose_call_request_timeout = ROSE_DEFAULT_T1; +int 
sysctl_rose_reset_request_timeout = ROSE_DEFAULT_T2; +int sysctl_rose_clear_request_timeout = ROSE_DEFAULT_T3; +int sysctl_rose_no_activity_timeout = ROSE_DEFAULT_IDLE; +int sysctl_rose_routing_control = 1; + +static unsigned int lci = 1; + +static struct sock *volatile rose_list = NULL; + +/* + * Convert a Rose address into text. + */ +char *rose2asc(rose_address *addr) +{ + static char buffer[11]; + + if (addr->rose_addr[0] == 0x00 && addr->rose_addr[1] == 0x00 && + addr->rose_addr[2] == 0x00 && addr->rose_addr[3] == 0x00 && + addr->rose_addr[4] == 0x00) { + strcpy(buffer, "*"); + } else { + sprintf(buffer, "%02X%02X%02X%02X%02X", addr->rose_addr[0] & 0xFF, + addr->rose_addr[1] & 0xFF, + addr->rose_addr[2] & 0xFF, + addr->rose_addr[3] & 0xFF, + addr->rose_addr[4] & 0xFF); + } + + return buffer; +} + +/* + * Compare two Rose addresses, 0 == equal. + */ +int rosecmp(rose_address *addr1, rose_address *addr2) +{ + int i; + + for (i = 0; i < 5; i++) + if (addr1->rose_addr[i] != addr2->rose_addr[i]) + return 1; + + return 0; +} + +/* + * Socket removal during an interrupt is now safe. + */ +static void rose_remove_socket(struct sock *sk) +{ + struct sock *s; + unsigned long flags; + + save_flags(flags); + cli(); + + if ((s = rose_list) == sk) { + rose_list = s->next; + restore_flags(flags); + return; + } + + while (s != NULL && s->next != NULL) { + if (s->next == sk) { + s->next = sk->next; + restore_flags(flags); + return; + } + + s = s->next; + } + + restore_flags(flags); +} + +/* + * Kill all bound sockets on a dropped device. + */ +static void rose_kill_by_device(struct device *dev) +{ + struct sock *s; + + for (s = rose_list; s != NULL; s = s->next) { + if (s->protinfo.rose->device == dev) { + s->protinfo.rose->state = ROSE_STATE_0; + s->protinfo.rose->device = NULL; + s->state = TCP_CLOSE; + s->err = ENETUNREACH; + s->shutdown |= SEND_SHUTDOWN; + s->state_change(s); + s->dead = 1; + } + } +} + +/* + * Handle device status changes. 
+ */ +static int rose_device_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct device *dev = (struct device *)ptr; + + if (event != NETDEV_DOWN) + return NOTIFY_DONE; + + rose_kill_by_device(dev); + rose_rt_device_down(dev); + rose_link_device_down(dev); + + return NOTIFY_DONE; +} + +/* + * Add a socket to the bound sockets list. + */ +static void rose_insert_socket(struct sock *sk) +{ + unsigned long flags; + + save_flags(flags); + cli(); + + sk->next = rose_list; + rose_list = sk; + + restore_flags(flags); +} + +/* + * Find a socket that wants to accept the Call Request we just + * received. + */ +static struct sock *rose_find_listener(ax25_address *call) +{ + unsigned long flags; + struct sock *s; + + save_flags(flags); + cli(); + + for (s = rose_list; s != NULL; s = s->next) { + if (ax25cmp(&s->protinfo.rose->source_call, call) == 0 && s->protinfo.rose->source_ndigis == 0 && s->state == TCP_LISTEN) { + restore_flags(flags); + return s; + } + } + + for (s = rose_list; s != NULL; s = s->next) { + if (ax25cmp(&s->protinfo.rose->source_call, &null_ax25_address) == 0 && s->state == TCP_LISTEN) { + restore_flags(flags); + return s; + } + } + + restore_flags(flags); + return NULL; +} + +/* + * Find a connected Rose socket given my LCI and device. + */ +struct sock *rose_find_socket(unsigned int lci, struct device *dev) +{ + struct sock *s; + unsigned long flags; + + save_flags(flags); + cli(); + + for (s = rose_list; s != NULL; s = s->next) { + if (s->protinfo.rose->lci == lci && s->protinfo.rose->neighbour->dev == dev) { + restore_flags(flags); + return s; + } + } + + restore_flags(flags); + + return NULL; +} + +/* + * Find a unique LCI for a given device. + */ +unsigned int rose_new_lci(struct device *dev) +{ + lci++; + if (lci > 4095) lci = 1; + + while (rose_find_socket(lci, dev) != NULL) { + lci++; + if (lci > 4095) lci = 1; + } + + return lci; +} + +/* + * Deferred destroy. 
+ */ +void rose_destroy_socket(struct sock *); + +/* + * Handler for deferred kills. + */ +static void rose_destroy_timer(unsigned long data) +{ + rose_destroy_socket((struct sock *)data); +} + +/* + * This is called from user mode and the timers. Thus it protects itself against + * interrupt users but doesn't worry about being called during work. + * Once it is removed from the queue no interrupt or bottom half will + * touch it and we are (fairly 8-) ) safe. + */ +void rose_destroy_socket(struct sock *sk) /* Not static as it's used by the timer */ +{ + struct sk_buff *skb; + unsigned long flags; + + save_flags(flags); + cli(); + + del_timer(&sk->timer); + + rose_remove_socket(sk); + rose_clear_queues(sk); /* Flush the queues */ + + while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) { + if (skb->sk != sk) { /* A pending connection */ + skb->sk->dead = 1; /* Queue the unaccepted socket for death */ + rose_set_timer(skb->sk); + skb->sk->protinfo.rose->state = ROSE_STATE_0; + } + + kfree_skb(skb, FREE_READ); + } + + if (sk->wmem_alloc || sk->rmem_alloc) { /* Defer: outstanding buffers */ + init_timer(&sk->timer); + sk->timer.expires = jiffies + 10 * HZ; + sk->timer.function = rose_destroy_timer; + sk->timer.data = (unsigned long)sk; + add_timer(&sk->timer); + } else { + kfree_s(sk->protinfo.rose, sizeof(*sk->protinfo.rose)); + sk_free(sk); + } + + restore_flags(flags); +} + +/* + * Handling for system calls applied via the various interfaces to a + * Rose socket object. + */ + +static int rose_fcntl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + return -EINVAL; +} + +/* + * dl1bke 960311: set parameters for existing Rose connections, + * includes a KILL command to abort any connection. 
+ * VERY useful for debugging ;-) + */ +static int rose_ctl_ioctl(const unsigned int cmd, void *arg) +{ + struct rose_ctl_struct rose_ctl; + struct sock *sk; + unsigned long flags; + struct device *dev; + int err; + + if ((err = verify_area(VERIFY_READ, arg, sizeof(rose_ctl))) != 0) + return err; + + copy_from_user(&rose_ctl, arg, sizeof(rose_ctl)); + + if ((dev = rose_ax25_dev_get(rose_ctl.dev)) == NULL) + return -EINVAL; + + if ((sk = rose_find_socket(rose_ctl.lci, dev)) == NULL) + return -ENOTCONN; + + switch (rose_ctl.cmd) { + case ROSE_KILL: + rose_clear_queues(sk); + rose_write_internal(sk, ROSE_CLEAR_REQUEST); + sk->protinfo.rose->state = ROSE_STATE_0; + sk->state = TCP_CLOSE; + sk->err = ENETRESET; + sk->shutdown |= SEND_SHUTDOWN; + if (!sk->dead) + sk->state_change(sk); + sk->dead = 1; + rose_set_timer(sk); + break; + + case ROSE_T0: + if (rose_ctl.arg < 1) + return -EINVAL; + if (sk->protinfo.rose->neighbour != NULL) { + save_flags(flags); cli(); + sk->protinfo.rose->neighbour->t0 = rose_ctl.arg * PR_SLOWHZ; + restore_flags(flags); + } + break; + + case ROSE_T1: + if (rose_ctl.arg < 1) + return -EINVAL; + save_flags(flags); cli(); + sk->protinfo.rose->t1 = rose_ctl.arg * PR_SLOWHZ; + restore_flags(flags); + break; + + case ROSE_T2: + if (rose_ctl.arg < 1) + return -EINVAL; + save_flags(flags); cli(); + sk->protinfo.rose->t2 = rose_ctl.arg * PR_SLOWHZ; + restore_flags(flags); + break; + + case ROSE_T3: + if (rose_ctl.arg < 1) + return -EINVAL; + save_flags(flags); cli(); + sk->protinfo.rose->t3 = rose_ctl.arg * PR_SLOWHZ; + restore_flags(flags); + break; + + case ROSE_IDLE: + if (rose_ctl.arg < 1) + return -EINVAL; + save_flags(flags); cli(); + sk->protinfo.rose->idle = rose_ctl.arg * 60 * PR_SLOWHZ; + restore_flags(flags); + break; + + default: + return -EINVAL; + } + + return 0; +} + +static int rose_setsockopt(struct socket *sock, int level, int optname, + char *optval, int optlen) +{ + struct sock *sk; + int err, opt; + + sk = (struct sock 
*)sock->data; + + if (level == SOL_SOCKET) + return sock_setsockopt(sk, level, optname, optval, optlen); + + if (level != SOL_ROSE) + return -EOPNOTSUPP; + + if (optval == NULL) + return -EINVAL; + + if ((err = verify_area(VERIFY_READ, optval, sizeof(int))) != 0) + return err; + + get_user(opt, (int *)optval); + + switch (optname) { + case ROSE_T0: + if (opt < 1) + return -EINVAL; + if (sk->protinfo.rose->neighbour != NULL) + sk->protinfo.rose->neighbour->t0 = opt * PR_SLOWHZ; + return 0; + + case ROSE_T1: + if (opt < 1) + return -EINVAL; + sk->protinfo.rose->t1 = opt * PR_SLOWHZ; + return 0; + + case ROSE_T2: + if (opt < 1) + return -EINVAL; + sk->protinfo.rose->t2 = opt * PR_SLOWHZ; + return 0; + + case ROSE_T3: + if (opt < 1) + return -EINVAL; + sk->protinfo.rose->t3 = opt * PR_SLOWHZ; + return 0; + + case ROSE_IDLE: + if (opt < 1) + return -EINVAL; + sk->protinfo.rose->idle = opt * 60 * PR_SLOWHZ; + return 0; + + case ROSE_HDRINCL: + sk->protinfo.rose->hdrincl = opt ? 1 : 0; + return 0; + + default: + return -ENOPROTOOPT; + } +} + +static int rose_getsockopt(struct socket *sock, int level, int optname, + char *optval, int *optlen) +{ + struct sock *sk; + int val = 0; + int err; + + sk = (struct sock *)sock->data; + + if (level == SOL_SOCKET) + return sock_getsockopt(sk, level, optname, optval, optlen); + + if (level != SOL_ROSE) + return -EOPNOTSUPP; + + switch (optname) { + case ROSE_T0: + if (sk->protinfo.rose->neighbour != NULL) + val = sk->protinfo.rose->neighbour->t0 / PR_SLOWHZ; + else + val = sysctl_rose_restart_request_timeout / PR_SLOWHZ; + break; + + case ROSE_T1: + val = sk->protinfo.rose->t1 / PR_SLOWHZ; + break; + + case ROSE_T2: + val = sk->protinfo.rose->t2 / PR_SLOWHZ; + break; + + case ROSE_T3: + val = sk->protinfo.rose->t3 / PR_SLOWHZ; + break; + + case ROSE_IDLE: + val = sk->protinfo.rose->idle / (PR_SLOWHZ * 60); + break; + + case ROSE_HDRINCL: + val = sk->protinfo.rose->hdrincl; + break; + + default: + return -ENOPROTOOPT; + } + + if ((err 
= verify_area(VERIFY_WRITE, optlen, sizeof(int))) != 0) + return err; + + put_user(sizeof(int), (unsigned long *)optlen); + + if ((err = verify_area(VERIFY_WRITE, optval, sizeof(int))) != 0) + return err; + + put_user(val, (unsigned long *)optval); + + return 0; +} + +static int rose_listen(struct socket *sock, int backlog) +{ + struct sock *sk = (struct sock *)sock->data; + + if (sk->state != TCP_LISTEN) { + sk->protinfo.rose->dest_ndigis = 0; + memset(&sk->protinfo.rose->dest_addr, '\0', ROSE_ADDR_LEN); + memset(&sk->protinfo.rose->dest_call, '\0', AX25_ADDR_LEN); + memset(&sk->protinfo.rose->dest_digi, '\0', AX25_ADDR_LEN); + sk->max_ack_backlog = backlog; + sk->state = TCP_LISTEN; + return 0; + } + + return -EOPNOTSUPP; +} + +static void def_callback1(struct sock *sk) +{ + if (!sk->dead) + wake_up_interruptible(sk->sleep); +} + +static void def_callback2(struct sock *sk, int len) +{ + if (!sk->dead) + wake_up_interruptible(sk->sleep); +} + +static int rose_create(struct socket *sock, int protocol) +{ + struct sock *sk; + rose_cb *rose; + + if (sock->type != SOCK_SEQPACKET || protocol != 0) + return -ESOCKTNOSUPPORT; + + if ((sk = sk_alloc(GFP_ATOMIC)) == NULL) + return -ENOMEM; + + if ((rose = (rose_cb *)kmalloc(sizeof(*rose), GFP_ATOMIC)) == NULL) { + sk_free(sk); + return -ENOMEM; + } + + skb_queue_head_init(&sk->receive_queue); + skb_queue_head_init(&sk->write_queue); + skb_queue_head_init(&sk->back_log); + + init_timer(&sk->timer); + + sk->socket = sock; + sk->type = sock->type; + sk->protocol = protocol; + sk->allocation = GFP_KERNEL; + sk->rcvbuf = SK_RMEM_MAX; + sk->sndbuf = SK_WMEM_MAX; + sk->state = TCP_CLOSE; + sk->priority = SOPRI_NORMAL; + sk->mtu = ROSE_MTU; /* 128 */ + sk->zapped = 1; + sk->window = ROSE_DEFAULT_WINDOW; + + sk->state_change = def_callback1; + sk->data_ready = def_callback2; + sk->write_space = def_callback1; + sk->error_report = def_callback1; + + if (sock != NULL) { + sock->data = (void *)sk; + sk->sleep = sock->wait; + } + + 
skb_queue_head_init(&rose->ack_queue); + skb_queue_head_init(&rose->frag_queue); + + rose->lci = 0; + + rose->t1 = sysctl_rose_call_request_timeout; + rose->t2 = sysctl_rose_reset_request_timeout; + rose->t3 = sysctl_rose_clear_request_timeout; + rose->idle = sysctl_rose_no_activity_timeout; + + rose->timer = 0; + + rose->va = 0; + rose->vr = 0; + rose->vs = 0; + rose->vl = 0; + + rose->fraglen = 0; + rose->hdrincl = 0; + rose->state = ROSE_STATE_0; + rose->neighbour = NULL; + rose->device = NULL; + + rose->source_ndigis = 0; + rose->dest_ndigis = 0; + + memset(&rose->source_addr, '\0', ROSE_ADDR_LEN); + memset(&rose->dest_addr, '\0', ROSE_ADDR_LEN); + memset(&rose->source_call, '\0', AX25_ADDR_LEN); + memset(&rose->dest_call, '\0', AX25_ADDR_LEN); + memset(&rose->source_digi, '\0', AX25_ADDR_LEN); + memset(&rose->dest_digi, '\0', AX25_ADDR_LEN); + + rose->sk = sk; + sk->protinfo.rose = rose; + + return 0; +} + +static struct sock *rose_make_new(struct sock *osk) +{ + struct sock *sk; + rose_cb *rose; + + if (osk->type != SOCK_SEQPACKET) + return NULL; + + if ((sk = (struct sock *)sk_alloc(GFP_ATOMIC)) == NULL) + return NULL; + + if ((rose = (rose_cb *)kmalloc(sizeof(*rose), GFP_ATOMIC)) == NULL) { + sk_free(sk); + return NULL; + } + + skb_queue_head_init(&sk->receive_queue); + skb_queue_head_init(&sk->write_queue); + skb_queue_head_init(&sk->back_log); + + init_timer(&sk->timer); + + sk->type = osk->type; + sk->socket = osk->socket; + sk->priority = osk->priority; + sk->protocol = osk->protocol; + sk->rcvbuf = osk->rcvbuf; + sk->sndbuf = osk->sndbuf; + sk->debug = osk->debug; + sk->state = TCP_ESTABLISHED; + sk->window = osk->window; + sk->mtu = osk->mtu; + sk->sleep = osk->sleep; + sk->zapped = osk->zapped; + + sk->state_change = def_callback1; + sk->data_ready = def_callback2; + sk->write_space = def_callback1; + sk->error_report = def_callback1; + + skb_queue_head_init(&rose->ack_queue); + skb_queue_head_init(&rose->frag_queue); + + rose->t1 = 
osk->protinfo.rose->t1; + rose->t2 = osk->protinfo.rose->t2; + rose->t3 = osk->protinfo.rose->t3; + rose->idle = osk->protinfo.rose->idle; + + rose->device = osk->protinfo.rose->device; + rose->hdrincl = osk->protinfo.rose->hdrincl; + rose->fraglen = 0; + + rose->timer = 0; + + rose->va = 0; + rose->vr = 0; + rose->vs = 0; + rose->vl = 0; + + sk->protinfo.rose = rose; + rose->sk = sk; + + return sk; +} + +static int rose_dup(struct socket *newsock, struct socket *oldsock) +{ + struct sock *sk = (struct sock *)oldsock->data; + + return rose_create(newsock, sk->protocol); +} + +static int rose_release(struct socket *sock, struct socket *peer) +{ + struct sock *sk = (struct sock *)sock->data; + + if (sk == NULL) return 0; + + switch (sk->protinfo.rose->state) { + + case ROSE_STATE_0: + sk->state = TCP_CLOSE; + sk->shutdown |= SEND_SHUTDOWN; + sk->state_change(sk); + sk->dead = 1; + rose_destroy_socket(sk); + break; + + case ROSE_STATE_1: + sk->protinfo.rose->state = ROSE_STATE_0; + sk->state = TCP_CLOSE; + sk->shutdown |= SEND_SHUTDOWN; + sk->state_change(sk); + sk->dead = 1; + rose_destroy_socket(sk); + break; + + case ROSE_STATE_2: + sk->protinfo.rose->state = ROSE_STATE_0; + sk->state = TCP_CLOSE; + sk->shutdown |= SEND_SHUTDOWN; + sk->state_change(sk); + sk->dead = 1; + rose_destroy_socket(sk); + break; + + case ROSE_STATE_3: + case ROSE_STATE_4: + rose_clear_queues(sk); + rose_write_internal(sk, ROSE_CLEAR_REQUEST); + sk->protinfo.rose->timer = sk->protinfo.rose->t3; + sk->protinfo.rose->state = ROSE_STATE_2; + sk->state = TCP_CLOSE; + sk->shutdown |= SEND_SHUTDOWN; + sk->state_change(sk); + sk->dead = 1; + sk->destroy = 1; + break; + + default: + break; + } + + sock->data = NULL; + sk->socket = NULL; /* Not used, but we should do this. 
**/ + + return 0; +} + +static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk; + struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr; + struct device *dev; + ax25_address *user, *source; + + sk = (struct sock *)sock->data; + + if (sk->zapped == 0) + return -EINVAL; + + if (addr_len != sizeof(struct sockaddr_rose)) + return -EINVAL; + + if ((dev = rose_dev_get(&addr->srose_addr)) == NULL) { + if (sk->debug) + printk("Rose: bind failed: invalid address\n"); + return -EADDRNOTAVAIL; + } + + source = &addr->srose_call; + + if ((user = ax25_findbyuid(current->euid)) == NULL) { + if (ax25_uid_policy && !suser()) + return -EACCES; + user = source; + } + + sk->protinfo.rose->source_addr = addr->srose_addr; + sk->protinfo.rose->source_call = *user; + sk->protinfo.rose->device = dev; + + if (addr->srose_ndigis == 1) { + sk->protinfo.rose->source_ndigis = 1; + sk->protinfo.rose->source_digi = addr->srose_digi; + } + + rose_insert_socket(sk); + + sk->zapped = 0; + + if (sk->debug) + printk("Rose: socket is bound\n"); + + return 0; +} + +static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) +{ + struct sock *sk = (struct sock *)sock->data; + struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr; + ax25_address *user; + struct device *dev; + + if (sk->state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { + sock->state = SS_CONNECTED; + return 0; /* Connect completed during a ERESTARTSYS event */ + } + + if (sk->state == TCP_CLOSE && sock->state == SS_CONNECTING) { + sock->state = SS_UNCONNECTED; + return -ECONNREFUSED; + } + + if (sk->state == TCP_ESTABLISHED) + return -EISCONN; /* No reconnect on a seqpacket socket */ + + sk->state = TCP_CLOSE; + sock->state = SS_UNCONNECTED; + + if (addr_len != sizeof(struct sockaddr_rose)) + return -EINVAL; + + if ((sk->protinfo.rose->neighbour = rose_get_neigh(&addr->srose_addr)) == NULL) + return -ENETUNREACH; + + if (sk->zapped) { /* 
Must bind first - autobinding in this may or may not work */ + sk->zapped = 0; + + if ((dev = rose_dev_first()) == NULL) + return -ENETUNREACH; + + if ((user = ax25_findbyuid(current->euid)) == NULL) + return -EINVAL; + + memcpy(&sk->protinfo.rose->source_addr, dev->dev_addr, ROSE_ADDR_LEN); + sk->protinfo.rose->source_call = *user; + sk->protinfo.rose->device = dev; + + rose_insert_socket(sk); /* Finish the bind */ + } + + sk->protinfo.rose->dest_addr = addr->srose_addr; + sk->protinfo.rose->dest_call = addr->srose_call; + if (addr->srose_ndigis == 1) { + sk->protinfo.rose->dest_ndigis = 1; + sk->protinfo.rose->dest_digi = addr->srose_digi; + } + sk->protinfo.rose->lci = rose_new_lci(sk->protinfo.rose->neighbour->dev); + + /* Move to connecting socket, start sending Connect Requests */ + sock->state = SS_CONNECTING; + sk->state = TCP_SYN_SENT; + + sk->protinfo.rose->state = ROSE_STATE_1; + sk->protinfo.rose->timer = sk->protinfo.rose->t1; + rose_write_internal(sk, ROSE_CALL_REQUEST); + + rose_set_timer(sk); + + /* Now the loop */ + if (sk->state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) + return -EINPROGRESS; + + cli(); /* To avoid races on the sleep */ + + /* + * A Connect Ack with Choke or timeout or failed routing will go to closed. 
+ */ + while (sk->state == TCP_SYN_SENT) { + interruptible_sleep_on(sk->sleep); + if (current->signal & ~current->blocked) { + sti(); + return -ERESTARTSYS; + } + } + + if (sk->state != TCP_ESTABLISHED) { + sti(); + sock->state = SS_UNCONNECTED; + return sock_error(sk); /* Always set at this point */ + } + + sock->state = SS_CONNECTED; + + sti(); + + return 0; +} + +static int rose_socketpair(struct socket *sock1, struct socket *sock2) +{ + return -EOPNOTSUPP; +} + +static int rose_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct sock *sk; + struct sock *newsk; + struct sk_buff *skb; + + if (newsock->data) + sk_free(newsock->data); + + newsock->data = NULL; + + sk = (struct sock *)sock->data; + + if (sk->type != SOCK_SEQPACKET) + return -EOPNOTSUPP; + + if (sk->state != TCP_LISTEN) + return -EINVAL; + + /* + * The write queue this time is holding sockets ready to use + * hooked into the SABM we saved + */ + do { + cli(); + if ((skb = skb_dequeue(&sk->receive_queue)) == NULL) { + if (flags & O_NONBLOCK) { + sti(); + return 0; + } + interruptible_sleep_on(sk->sleep); + if (current->signal & ~current->blocked) { + sti(); + return -ERESTARTSYS; + } + } + } while (skb == NULL); + + newsk = skb->sk; + newsk->pair = NULL; + sti(); + + /* Now attach up the new socket */ + skb->sk = NULL; + kfree_skb(skb, FREE_READ); + sk->ack_backlog--; + newsock->data = newsk; + + return 0; +} + +static int rose_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sockaddr_rose *srose = (struct sockaddr_rose *)uaddr; + struct sock *sk; + + sk = (struct sock *)sock->data; + + if (peer != 0) { + if (sk->state != TCP_ESTABLISHED) + return -ENOTCONN; + srose->srose_family = AF_ROSE; + srose->srose_ndigis = 0; + srose->srose_addr = sk->protinfo.rose->dest_addr; + srose->srose_call = sk->protinfo.rose->dest_call; + if (sk->protinfo.rose->dest_ndigis == 1) { + srose->srose_ndigis = 1; + srose->srose_digi = 
sk->protinfo.rose->dest_digi; + } + *uaddr_len = sizeof(struct sockaddr_rose); + } else { + srose->srose_family = AF_ROSE; + srose->srose_ndigis = 0; + srose->srose_addr = sk->protinfo.rose->source_addr; + srose->srose_call = sk->protinfo.rose->source_call; + if (sk->protinfo.rose->source_ndigis == 1) { + srose->srose_ndigis = 1; + srose->srose_digi = sk->protinfo.rose->source_digi; + } + *uaddr_len = sizeof(struct sockaddr_rose); + } + + return 0; +} + +int rose_rx_call_request(struct sk_buff *skb, struct device *dev, struct rose_neigh *neigh, unsigned int lci) +{ + struct sock *sk; + struct sock *make; + rose_cb rose; + + skb->sk = NULL; /* Initially we don't know who it's for */ + + /* + * skb->data points to the rose frame start + */ + + /* + * XXX This is an error. + */ + if (!rose_parse_facilities(skb, &rose)) { + return 0; + } + + sk = rose_find_listener(&rose.source_call); + + /* + * We can't accept the Call Request. + */ + if (sk == NULL || sk->ack_backlog == sk->max_ack_backlog || (make = rose_make_new(sk)) == NULL) { + rose_transmit_clear_request(neigh, lci, 0x01); + return 0; + } + + skb->sk = make; + make->state = TCP_ESTABLISHED; + + make->protinfo.rose->lci = lci; + make->protinfo.rose->dest_addr = rose.dest_addr; + make->protinfo.rose->dest_call = rose.dest_call; + make->protinfo.rose->dest_ndigis = rose.dest_ndigis; + make->protinfo.rose->dest_digi = rose.dest_digi; + make->protinfo.rose->source_addr = rose.source_addr; + make->protinfo.rose->source_call = rose.source_call; + make->protinfo.rose->source_ndigis = rose.source_ndigis; + make->protinfo.rose->source_digi = rose.source_digi; + make->protinfo.rose->neighbour = neigh; + make->protinfo.rose->device = dev; + + rose_write_internal(make, ROSE_CALL_ACCEPTED); + + make->protinfo.rose->condition = 0x00; + make->protinfo.rose->vs = 0; + make->protinfo.rose->va = 0; + make->protinfo.rose->vr = 0; + make->protinfo.rose->vl = 0; + make->protinfo.rose->state = ROSE_STATE_3; + sk->ack_backlog++; + 
make->pair = sk; + + rose_insert_socket(make); + + skb_queue_head(&sk->receive_queue, skb); + + rose_set_timer(make); + + if (!sk->dead) + sk->data_ready(sk, skb->len); + + return 1; +} + +static int rose_sendmsg(struct socket *sock, struct msghdr *msg, int len, int noblock, int flags) +{ + struct sock *sk = (struct sock *)sock->data; + struct sockaddr_rose *usrose = (struct sockaddr_rose *)msg->msg_name; + int err; + struct sockaddr_rose srose; + struct sk_buff *skb; + unsigned char *asmptr; + int size; + + if (sk->err) + return sock_error(sk); + + if (flags) + return -EINVAL; + + if (sk->zapped) + return -EADDRNOTAVAIL; + + if (sk->shutdown & SEND_SHUTDOWN) { + send_sig(SIGPIPE, current, 0); + return -EPIPE; + } + + if (sk->protinfo.rose->device == NULL) + return -ENETUNREACH; + + if (usrose) { + if (msg->msg_namelen < sizeof(srose)) + return -EINVAL; + srose = *usrose; + if (rosecmp(&sk->protinfo.rose->dest_addr, &srose.srose_addr) != 0 || + ax25cmp(&sk->protinfo.rose->dest_call, &srose.srose_call) != 0) + return -EISCONN; + if (srose.srose_ndigis == 1 && sk->protinfo.rose->dest_ndigis == 1) { + if (ax25cmp(&sk->protinfo.rose->dest_digi, &srose.srose_digi) != 0) + return -EISCONN; + } + if (srose.srose_family != AF_ROSE) + return -EINVAL; + } else { + if (sk->state != TCP_ESTABLISHED) + return -ENOTCONN; + + srose.srose_family = AF_ROSE; + srose.srose_addr = sk->protinfo.rose->dest_addr; + srose.srose_call = sk->protinfo.rose->dest_call; + srose.srose_ndigis = 0; + + if (sk->protinfo.rose->dest_ndigis == 1) { + srose.srose_ndigis = 1; + srose.srose_digi = sk->protinfo.rose->dest_digi; + } + } + + if (sk->debug) + printk("Rose: sendto: Addresses built.\n"); + + /* Build a packet */ + if (sk->debug) + printk("Rose: sendto: building packet.\n"); + + size = len + AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN; + + if ((skb = sock_alloc_send_skb(sk, size, 0, 0, &err)) == NULL) + return err; + + skb->sk = sk; + skb->free = 1; + skb->arp = 1; + + 
skb_reserve(skb, size - len); + + /* + * Push down the Rose header + */ + + asmptr = skb_push(skb, ROSE_MIN_LEN); + + if (sk->debug) + printk("Building Rose Header.\n"); + + /* Build a Rose Transport header */ + + *asmptr++ = ((sk->protinfo.rose->lci >> 8) & 0x0F) | GFI; + *asmptr++ = (sk->protinfo.rose->lci >> 0) & 0xFF; + *asmptr++ = ROSE_DATA; + + if (sk->debug) + printk("Built header.\n"); + + /* + * Put the data on the end + */ + + skb->h.raw = skb_put(skb, len); + + asmptr = skb->h.raw; + + if (sk->debug) + printk("Rose: Appending user data\n"); + + /* User data follows immediately after the Rose transport header */ + memcpy_fromiovec(asmptr, msg->msg_iov, len); + + if (sk->debug) + printk("Rose: Transmitting buffer\n"); + + if (sk->state != TCP_ESTABLISHED) { + kfree_skb(skb, FREE_WRITE); + return -ENOTCONN; + } + + rose_output(sk, skb); /* Shove it onto the queue */ + + return len; +} + + +static int rose_recvmsg(struct socket *sock, struct msghdr *msg, int size, int noblock, + int flags, int *addr_len) +{ + struct sock *sk = (struct sock *)sock->data; + struct sockaddr_rose *srose = (struct sockaddr_rose *)msg->msg_name; + int copied; + struct sk_buff *skb; + int er; + + if (addr_len != NULL) + *addr_len = sizeof(*srose); + + /* + * This works for seqpacket too. The receiver has ordered the queue for + * us! 
We do one quick check first though + */ + if (sk->state != TCP_ESTABLISHED) + return -ENOTCONN; + + /* Now we can treat all alike */ + if ((skb = skb_recv_datagram(sk, flags, noblock, &er)) == NULL) + return er; + + if (!sk->protinfo.rose->hdrincl) { + skb_pull(skb, ROSE_MIN_LEN); + skb->h.raw = skb->data; + } + + copied = skb->len; + + if (copied > size) { + copied = size; + msg->msg_flags |= MSG_TRUNC; + } + + skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + + if (srose != NULL) { + struct sockaddr_rose addr; + + addr.srose_family = AF_ROSE; + addr.srose_addr = sk->protinfo.rose->dest_addr; + addr.srose_call = sk->protinfo.rose->dest_call; + addr.srose_ndigis = 0; + + if (sk->protinfo.rose->dest_ndigis == 1) { + addr.srose_ndigis = 1; + addr.srose_digi = sk->protinfo.rose->dest_digi; + } + + *srose = addr; + + *addr_len = sizeof(*srose); + } + + skb_free_datagram(sk, skb); + + return copied; +} + +static int rose_shutdown(struct socket *sk, int how) +{ + return -EOPNOTSUPP; +} + +static int rose_select(struct socket *sock , int sel_type, select_table *wait) +{ + struct sock *sk = (struct sock *)sock->data; + + return datagram_select(sk, sel_type, wait); +} + +static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = (struct sock *)sock->data; + int err; + long amount = 0; + + switch (cmd) { + case TIOCOUTQ: + if ((err = verify_area(VERIFY_WRITE, (void *)arg, sizeof(unsigned int))) != 0) + return err; + amount = sk->sndbuf - sk->wmem_alloc; + if (amount < 0) + amount = 0; + put_user(amount, (unsigned int *)arg); + return 0; + + case TIOCINQ: { + struct sk_buff *skb; + /* These two are safe on a single CPU system as only user tasks fiddle here */ + if ((skb = skb_peek(&sk->receive_queue)) != NULL) + amount = skb->len - 20; + if ((err = verify_area(VERIFY_WRITE, (void *)arg, sizeof(unsigned int))) != 0) + return err; + put_user(amount, (unsigned int *)arg); + return 0; + } + + case SIOCGSTAMP: + if (sk != NULL) { 
+ if (sk->stamp.tv_sec==0) + return -ENOENT; + if ((err = verify_area(VERIFY_WRITE,(void *)arg,sizeof(struct timeval))) != 0) + return err; + copy_to_user((void *)arg, &sk->stamp, sizeof(struct timeval)); + return 0; + } + return -EINVAL; + + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFMETRIC: + case SIOCSIFMETRIC: + return -EINVAL; + + case SIOCADDRT: + case SIOCDELRT: + if (!suser()) return -EPERM; + return rose_rt_ioctl(cmd, (void *)arg); + + case SIOCRSCTLCON: + if (!suser()) return -EPERM; + return rose_ctl_ioctl(cmd, (void *)arg); + + default: + return dev_ioctl(cmd, (void *)arg); + } + + /*NOTREACHED*/ + return 0; +} + +static int rose_get_info(char *buffer, char **start, off_t offset, int length, int dummy) +{ + struct sock *s; + struct device *dev; + const char *devname, *callsign; + int len = 0; + off_t pos = 0; + off_t begin = 0; + + cli(); + + len += sprintf(buffer, "dest_addr dest_call dest_digi src_addr src_call src_digi dev lci st vs vr va t t1 t2 t3 Snd-Q Rcv-Q\n"); + + for (s = rose_list; s != NULL; s = s->next) { + if ((dev = s->protinfo.rose->device) == NULL) + devname = "???"; + else + devname = dev->name; + + len += sprintf(buffer + len, "%-10s %-9s ", + rose2asc(&s->protinfo.rose->dest_addr), + ax2asc(&s->protinfo.rose->dest_call)); + len += sprintf(buffer + len, "%-9s ", + ax2asc(&s->protinfo.rose->dest_digi)); + + if (ax25cmp(&s->protinfo.rose->source_call, &null_ax25_address) == 0) + callsign = "??????-?"; + else + callsign = ax2asc(&s->protinfo.rose->source_call); + + len += sprintf(buffer + len, "%-10s %-9s ", + rose2asc(&s->protinfo.rose->source_addr), + callsign); + len += sprintf(buffer + len, "%-9s %-5s %3.3X %d %d %d %d %3d %3d %3d %3d %5d %5d\n", + ax2asc(&s->protinfo.rose->source_digi), + devname, s->protinfo.rose->lci & 0x0FFF, + s->protinfo.rose->state, + s->protinfo.rose->vs, 
s->protinfo.rose->vr, s->protinfo.rose->va, + s->protinfo.rose->timer / PR_SLOWHZ, + s->protinfo.rose->t1 / PR_SLOWHZ, + s->protinfo.rose->t2 / PR_SLOWHZ, + s->protinfo.rose->t3 / PR_SLOWHZ, + s->wmem_alloc, s->rmem_alloc); + + pos = begin + len; + + if (pos < offset) { + len = 0; + begin = pos; + } + + if (pos > offset + length) + break; + } + + sti(); + + *start = buffer + (offset - begin); + len -= (offset - begin); + + if (len > length) len = length; + + return(len); +} + +struct proto_ops rose_proto_ops = { + AF_ROSE, + + rose_create, + rose_dup, + rose_release, + rose_bind, + rose_connect, + rose_socketpair, + rose_accept, + rose_getname, + rose_select, + rose_ioctl, + rose_listen, + rose_shutdown, + rose_setsockopt, + rose_getsockopt, + rose_fcntl, + rose_sendmsg, + rose_recvmsg +}; + +struct notifier_block rose_dev_notifier = { + rose_device_event, + 0 +}; + +void rose_proto_init(struct net_proto *pro) +{ + sock_register(rose_proto_ops.family, &rose_proto_ops); + register_netdevice_notifier(&rose_dev_notifier); + printk(KERN_INFO "G4KLX Rose for Linux. 
Version 0.1 for AX25.034 Linux 2.1\n"); + + if (!ax25_protocol_register(AX25_P_ROSE, rose_route_frame)) + printk(KERN_ERR "Rose unable to register protocol with AX.25\n"); + if (!ax25_linkfail_register(rose_link_failed)) + printk(KERN_ERR "Rose unable to register linkfail handler with AX.25\n"); + + rose_register_sysctl(); + +#ifdef CONFIG_PROC_FS + proc_net_register(&(struct proc_dir_entry) { + PROC_NET_RS, 4, "rose", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + rose_get_info + }); + proc_net_register(&(struct proc_dir_entry) { + PROC_NET_RS_NEIGH, 10, "rose_neigh", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + rose_neigh_get_info + }); + proc_net_register(&(struct proc_dir_entry) { + PROC_NET_RS_NODES, 10, "rose_nodes", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + rose_nodes_get_info + }); + + proc_net_register(&(struct proc_dir_entry) { + PROC_NET_RS_ROUTES, 11, "rose_routes", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + rose_routes_get_info + }); +#endif +} + +#endif diff --git a/net/rose/rose_dev.c b/net/rose/rose_dev.c new file mode 100644 index 000000000..7fe6a00e6 --- /dev/null +++ b/net/rose/rose_dev.c @@ -0,0 +1,298 @@ +/* + * Rose release 001 + * + * This is ALPHA test software. This code may break your machine, randomly fail to work with new + * releases, misbehave and/or generally screw up. It might even work. + * + * This code REQUIRES 2.1.0 or higher/ NET3.029 + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * Rose 001 Jonathan(G4KLX) Cloned from nr_dev.c. 
+ */ + +#include <linux/config.h> +#if defined(CONFIG_ROSE) || defined(CONFIG_ROSE_MODULE) +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/interrupt.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/sysctl.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/errno.h> +#include <linux/fcntl.h> +#include <linux/in.h> +#include <linux/if_ether.h> /* For the statistics structure. */ + +#include <asm/system.h> +#include <asm/segment.h> +#include <asm/io.h> + +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/if_arp.h> +#include <linux/skbuff.h> + +#include <net/ip.h> +#include <net/arp.h> + +#include <net/ax25.h> +#include <net/rose.h> + +/* + * Only allow IP over Rose frames through if the netrom device is up. + */ + +int rose_rx_ip(struct sk_buff *skb, struct device *dev) +{ + struct enet_statistics *stats = (struct enet_statistics *)dev->priv; + + if (!dev->start) { + stats->rx_errors++; + return 0; + } + + stats->rx_packets++; + skb->protocol = htons(ETH_P_IP); + + /* Spoof incoming device */ + skb->dev = dev; + + skb->h.raw = skb->data; + ip_rcv(skb, skb->dev, NULL); + + return 1; +} + +static int rose_header(struct sk_buff *skb, struct device *dev, unsigned short type, + void *daddr, void *saddr, unsigned len) +{ + unsigned char *buff = skb_push(skb, ROSE_MIN_LEN + 2); + + *buff++ = GFI | Q_BIT; + *buff++ = 0x00; + *buff++ = ROSE_DATA; + *buff++ = 0x7F; + *buff++ = AX25_P_IP; + + if (daddr != NULL) + return 37; + + return -37; +} + +static int rose_rebuild_header(void *buff, struct device *dev, + unsigned long raddr, struct sk_buff *skb) +{ + struct enet_statistics *stats = (struct enet_statistics *)dev->priv; + unsigned char *bp = (unsigned char *)buff; + struct sk_buff *skbn; + + if (!arp_query(bp + 7, raddr, dev)) { + dev_kfree_skb(skb, FREE_WRITE); + return 1; + } + + if ((skbn = 
skb_clone(skb, GFP_ATOMIC)) == NULL) { + dev_kfree_skb(skb, FREE_WRITE); + return 1; + } + + skbn->sk = skb->sk; + + if (skbn->sk != NULL) + atomic_add(skbn->truesize, &skbn->sk->wmem_alloc); + + dev_kfree_skb(skb, FREE_WRITE); + + if (!rose_route_frame(skbn, NULL)) { + dev_kfree_skb(skbn, FREE_WRITE); + stats->tx_errors++; + } + + stats->tx_packets++; + + return 1; +} + +static int rose_set_mac_address(struct device *dev, void *addr) +{ + struct sockaddr *sa = addr; + + ax25_listen_release((ax25_address *)dev->dev_addr, NULL); + + memcpy(dev->dev_addr, sa->sa_data, dev->addr_len); + + ax25_listen_register((ax25_address *)dev->dev_addr, NULL); + + return 0; +} + +static int rose_open(struct device *dev) +{ + dev->tbusy = 0; + dev->start = 1; + + MOD_INC_USE_COUNT; + + ax25_listen_register((ax25_address *)dev->dev_addr, NULL); + + return 0; +} + +static int rose_close(struct device *dev) +{ + dev->tbusy = 1; + dev->start = 0; + + ax25_listen_release((ax25_address *)dev->dev_addr, NULL); + + MOD_DEC_USE_COUNT; + + return 0; +} + +static int rose_xmit(struct sk_buff *skb, struct device *dev) +{ + struct enet_statistics *stats = (struct enet_statistics *)dev->priv; + + if (skb == NULL || dev == NULL) + return 0; + + if (!dev->start) { + printk(KERN_ERR "rose: xmit call when iface is down\n"); + return 1; + } + + cli(); + + if (dev->tbusy != 0) { + sti(); + stats->tx_errors++; + return 1; + } + + dev->tbusy = 1; + + sti(); + + dev_kfree_skb(skb, FREE_WRITE); + + stats->tx_errors++; + + dev->tbusy = 0; + + mark_bh(NET_BH); + + return 0; +} + +static struct enet_statistics *rose_get_stats(struct device *dev) +{ + return (struct enet_statistics *)dev->priv; +} + +int rose_init(struct device *dev) +{ + int i; + + dev->mtu = ROSE_PACLEN - 2; + dev->tbusy = 0; + dev->hard_start_xmit = rose_xmit; + dev->open = rose_open; + dev->stop = rose_close; + + dev->hard_header = rose_header; + dev->hard_header_len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN; + 
dev->addr_len = ROSE_ADDR_LEN; + dev->type = ARPHRD_ROSE; + dev->rebuild_header = rose_rebuild_header; + dev->set_mac_address = rose_set_mac_address; + + /* New-style flags. */ + dev->flags = 0; + dev->family = AF_INET; + + dev->pa_addr = 0; + dev->pa_brdaddr = 0; + dev->pa_mask = 0; + dev->pa_alen = sizeof(unsigned long); + + if ((dev->priv = kmalloc(sizeof(struct enet_statistics), GFP_KERNEL)) == NULL) + return -ENOMEM; + + memset(dev->priv, 0, sizeof(struct enet_statistics)); + + dev->get_stats = rose_get_stats; + + /* Fill in the generic fields of the device structure. */ + for (i = 0; i < DEV_NUMBUFFS; i++) + skb_queue_head_init(&dev->buffs[i]); + + return 0; +}; + +#ifdef MODULE +extern struct proto_ops rose_proto_ops; +extern struct notifier_block rose_dev_notifier; + +static struct device dev_rose[] = { + {"rose0", 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, rose_init}, + {"rose1", 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, rose_init} +}; + +int init_module(void) +{ + register_netdev(&dev_rose[0]); + register_netdev(&dev_rose[1]); + + register_symtab(NULL); + + rose_proto_init(NULL); + + return 0; +} + +void cleanup_module(void) +{ + int i; + +#ifdef CONFIG_PROC_FS + proc_net_unregister(PROC_NET_RS); + proc_net_unregister(PROC_NET_RS_NEIGH); + proc_net_unregister(PROC_NET_RS_NODES); + proc_net_unregister(PROC_NET_RS_ROUTES); +#endif + rose_rt_free(); + + ax25_protocol_release(AX25_P_ROSE); + ax25_linkfail_release(rose_link_failed); + + rose_unregister_sysctl(); + + unregister_netdevice_notifier(&rose_dev_notifier); + + sock_unregister(rose_proto_ops.family); + + for (i = 0; i < 2; i++) { + if (dev_rose[i].priv != NULL) { + kfree(dev_rose[i].priv); + dev_rose[i].priv = NULL; + unregister_netdev(&dev_rose[i]); + } + } +} + +#endif + +#endif diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c new file mode 100644 index 000000000..20374dbb1 --- /dev/null +++ b/net/rose/rose_in.c @@ -0,0 +1,333 @@ +/* + * Rose release 001 + * + * This is ALPHA test software. 
This code may break your machine, randomly fail to work with new + * releases, misbehave and/or generally screw up. It might even work. + * + * This code REQUIRES 2.1.0 or higher/ NET3.029 + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Most of this code is based on the SDL diagrams published in the 7th + * ARRL Computer Networking Conference papers. The diagrams have mistakes + * in them, but are mostly correct. Before you modify the code could you + * read the SDL diagrams as the code is not obvious and probably very + * easy to break; + * + * History + * Rose 001 Jonathan(G4KLX) Cloned from nr_in.c + */ + +#include <linux/config.h> +#if defined(CONFIG_ROSE) || defined(CONFIG_ROSE_MODULE) +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <net/ax25.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/ip.h> /* For ip_rcv */ +#include <asm/segment.h> +#include <asm/system.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <net/rose.h> + +static int rose_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more) +{ + struct sk_buff *skbo, *skbn = skb; + + if (more) { + sk->protinfo.rose->fraglen += skb->len; + skb_queue_tail(&sk->protinfo.rose->frag_queue, skb); + return 0; + } + + if (!more && sk->protinfo.rose->fraglen > 0) { /* End of fragment */ + sk->protinfo.rose->fraglen += skb->len; + skb_queue_tail(&sk->protinfo.rose->frag_queue, skb); + + if ((skbn = alloc_skb(sk->protinfo.rose->fraglen, 
GFP_ATOMIC)) == NULL) + return 1; + + skbn->free = 1; + skbn->arp = 1; + skbn->sk = sk; + sk->rmem_alloc += skbn->truesize; + skbn->h.raw = skbn->data; + + skbo = skb_dequeue(&sk->protinfo.rose->frag_queue); + memcpy(skb_put(skbn, skbo->len), skbo->data, skbo->len); + kfree_skb(skbo, FREE_READ); + + while ((skbo = skb_dequeue(&sk->protinfo.rose->frag_queue)) != NULL) { + skb_pull(skbo, ROSE_MIN_LEN); + memcpy(skb_put(skbn, skbo->len), skbo->data, skbo->len); + kfree_skb(skbo, FREE_READ); + } + + sk->protinfo.rose->fraglen = 0; + } + + return sock_queue_rcv_skb(sk, skbn); +} + +/* + * State machine for state 1, Awaiting Call Accepted State. + * The handling of the timer(s) is in file rose_timer.c. + * Handling of state 0 and connection release is in af_rose.c. + */ +static int rose_state1_machine(struct sock *sk, struct sk_buff *skb, int frametype) +{ + switch (frametype) { + + case ROSE_CALL_ACCEPTED: + sk->protinfo.rose->timer = 0; + sk->protinfo.rose->vs = 0; + sk->protinfo.rose->va = 0; + sk->protinfo.rose->vr = 0; + sk->protinfo.rose->vl = 0; + sk->protinfo.rose->state = ROSE_STATE_3; + sk->state = TCP_ESTABLISHED; + if (!sk->dead) + sk->state_change(sk); + break; + + case ROSE_CLEAR_REQUEST: + rose_clear_queues(sk); + sk->protinfo.rose->state = ROSE_STATE_0; + sk->state = TCP_CLOSE; + sk->err = ECONNREFUSED; + sk->shutdown |= SEND_SHUTDOWN; + if (!sk->dead) + sk->state_change(sk); + sk->dead = 1; + break; + + default: /* XXX */ + printk(KERN_WARNING "rose: unknown %02X in state 1\n", frametype); + break; + } + + return 0; +} + +/* + * State machine for state 2, Awaiting Clear Confirmation State. + * The handling of the timer(s) is in file rose_timer.c + * Handling of state 0 and connection release is in af_rose.c. 
+ */ +static int rose_state2_machine(struct sock *sk, struct sk_buff *skb, int frametype) +{ + switch (frametype) { + + case ROSE_CLEAR_REQUEST: + case ROSE_CLEAR_CONFIRMATION: + sk->protinfo.rose->state = ROSE_STATE_0; + sk->state = TCP_CLOSE; + sk->err = 0; + sk->shutdown |= SEND_SHUTDOWN; + if (!sk->dead) + sk->state_change(sk); + sk->dead = 1; + break; + + default: /* XXX */ + printk(KERN_WARNING "rose: unknown %02X in state 2\n", frametype); + break; + } + + return 0; +} + +/* + * State machine for state 3, Connected State. + * The handling of the timer(s) is in file rose_timer.c + * Handling of state 0 and connection release is in af_rose.c. + */ +static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype, int ns, int nr, int q, int d, int m) +{ + int queued = 0; + + switch (frametype) { + + case ROSE_RESET_REQUEST: + rose_clear_queues(sk); + rose_write_internal(sk, ROSE_RESET_CONFIRMATION); + sk->protinfo.rose->condition = 0x00; + sk->protinfo.rose->vs = 0; + sk->protinfo.rose->vr = 0; + sk->protinfo.rose->va = 0; + sk->protinfo.rose->vl = 0; + break; + + case ROSE_CLEAR_REQUEST: + rose_clear_queues(sk); + rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); + sk->protinfo.rose->state = ROSE_STATE_0; + sk->state = TCP_CLOSE; + sk->err = 0; + sk->shutdown |= SEND_SHUTDOWN; + if (!sk->dead) + sk->state_change(sk); + sk->dead = 1; + break; + + case ROSE_RR: + case ROSE_RNR: + if (frametype == ROSE_RNR) { + sk->protinfo.rose->condition |= PEER_RX_BUSY_CONDITION; + } else { + sk->protinfo.rose->condition &= ~PEER_RX_BUSY_CONDITION; + } + if (!rose_validate_nr(sk, nr)) { + rose_clear_queues(sk); + rose_write_internal(sk, ROSE_RESET_REQUEST); + sk->protinfo.rose->condition = 0x00; + sk->protinfo.rose->vs = 0; + sk->protinfo.rose->vr = 0; + sk->protinfo.rose->va = 0; + sk->protinfo.rose->vl = 0; + sk->protinfo.rose->state = ROSE_STATE_4; + sk->protinfo.rose->timer = sk->protinfo.rose->t2; + } else { + if (sk->protinfo.rose->condition & 
PEER_RX_BUSY_CONDITION) { + rose_frames_acked(sk, nr); + } else { + rose_check_iframes_acked(sk, nr); + } + } + break; + + case ROSE_DATA: /* XXX */ + sk->protinfo.rose->condition &= ~PEER_RX_BUSY_CONDITION; + if (!rose_validate_nr(sk, nr)) { + rose_clear_queues(sk); + rose_write_internal(sk, ROSE_RESET_REQUEST); + sk->protinfo.rose->condition = 0x00; + sk->protinfo.rose->vs = 0; + sk->protinfo.rose->vr = 0; + sk->protinfo.rose->va = 0; + sk->protinfo.rose->vl = 0; + sk->protinfo.rose->state = ROSE_STATE_4; + sk->protinfo.rose->timer = sk->protinfo.rose->t2; + break; + } + if (sk->protinfo.rose->condition & PEER_RX_BUSY_CONDITION) { + rose_frames_acked(sk, nr); + } else { + rose_check_iframes_acked(sk, nr); + } + if (sk->protinfo.rose->condition & OWN_RX_BUSY_CONDITION) + break; + if (ns == sk->protinfo.rose->vr) { + if (rose_queue_rx_frame(sk, skb, m) == 0) { + sk->protinfo.rose->vr = (sk->protinfo.rose->vr + 1) % ROSE_MODULUS; + queued = 1; + } else { + sk->protinfo.rose->condition |= OWN_RX_BUSY_CONDITION; + } + } + /* + * If the window is full, ack the frame. + */ + if (((sk->protinfo.rose->vl + sk->window) % ROSE_MODULUS) == sk->protinfo.rose->vr) + rose_enquiry_response(sk); + break; + + default: + printk(KERN_WARNING "rose: unknown %02X in state 3\n", frametype); + break; + } + + return queued; +} + +/* + * State machine for state 4, Awaiting Reset Confirmation State. + * The handling of the timer(s) is in file rose_timer.c + * Handling of state 0 and connection release is in af_rose.c. 
+ */ +static int rose_state4_machine(struct sock *sk, struct sk_buff *skb, int frametype) +{ + switch (frametype) { + + case ROSE_RESET_CONFIRMATION: + case ROSE_RESET_REQUEST: + sk->protinfo.rose->timer = 0; + sk->protinfo.rose->condition = 0x00; + sk->protinfo.rose->va = 0; + sk->protinfo.rose->vr = 0; + sk->protinfo.rose->vs = 0; + sk->protinfo.rose->vl = 0; + sk->protinfo.rose->state = ROSE_STATE_3; + break; + + case ROSE_CLEAR_REQUEST: + rose_clear_queues(sk); + rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); + sk->protinfo.rose->timer = 0; + sk->protinfo.rose->state = ROSE_STATE_0; + sk->state = TCP_CLOSE; + sk->err = 0; + sk->shutdown |= SEND_SHUTDOWN; + if (!sk->dead) + sk->state_change(sk); + sk->dead = 1; + break; + + default: /* XXX */ + printk(KERN_WARNING "rose: unknown %02X in state 4\n", frametype); + break; + } + + return 0; +} + +/* Higher level upcall for a LAPB frame */ +int rose_process_rx_frame(struct sock *sk, struct sk_buff *skb) +{ + int queued = 0, frametype, ns, nr, q, d, m; + + if (sk->protinfo.rose->state == ROSE_STATE_0) + return 0; + + del_timer(&sk->timer); + + frametype = rose_decode(skb, &ns, &nr, &q, &d, &m); + + switch (sk->protinfo.rose->state) { + case ROSE_STATE_1: + queued = rose_state1_machine(sk, skb, frametype); + break; + case ROSE_STATE_2: + queued = rose_state2_machine(sk, skb, frametype); + break; + case ROSE_STATE_3: + queued = rose_state3_machine(sk, skb, frametype, ns, nr, q, d, m); + break; + case ROSE_STATE_4: + queued = rose_state4_machine(sk, skb, frametype); + break; + } + + rose_set_timer(sk); + + return queued; +} + +#endif diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c new file mode 100644 index 000000000..d0bf308f0 --- /dev/null +++ b/net/rose/rose_link.c @@ -0,0 +1,301 @@ +/* + * Rose release 001 + * + * This is ALPHA test software. This code may break your machine, randomly fail to work with new + * releases, misbehave and/or generally screw up. It might even work. 
+ * + * This code REQUIRES 2.1.0 or higher/ NET3.029 + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * Rose 001 Jonathan(G4KLX) Cloned from rose_timer.c + */ + +#include <linux/config.h> +#if defined(CONFIG_ROSE) || defined(CONFIG_ROSE_MODULE) +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <net/ax25.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <asm/segment.h> +#include <asm/system.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/firewall.h> +#include <net/rose.h> + +static void rose_link_timer(unsigned long); + +/* + * Linux set/reset timer routines + */ +static void rose_link_set_timer(struct rose_neigh *neigh) +{ + unsigned long flags; + + save_flags(flags); + cli(); + del_timer(&neigh->timer); + restore_flags(flags); + + neigh->timer.next = neigh->timer.prev = NULL; + neigh->timer.data = (unsigned long)neigh; + neigh->timer.function = &rose_link_timer; + + neigh->timer.expires = jiffies + 10; + add_timer(&neigh->timer); +} + +static void rose_link_reset_timer(struct rose_neigh *neigh) +{ + unsigned long flags; + + save_flags(flags); + cli(); + del_timer(&neigh->timer); + restore_flags(flags); + + neigh->timer.data = (unsigned long)neigh; + neigh->timer.function = &rose_link_timer; + neigh->timer.expires = jiffies + 10; + add_timer(&neigh->timer); +} + +/* + * Rose Link Timer + * + * This routine is called every 100ms. 
Decrement timer by this + * amount - if expired then process the event. + */ +static void rose_link_timer(unsigned long param) +{ + struct rose_neigh *neigh = (struct rose_neigh *)param; + + if (neigh->t0timer == 0 || --neigh->t0timer > 0) { + rose_link_reset_timer(neigh); + return; + } + + /* + * T0 for a link has expired. + */ + rose_transmit_restart_request(neigh); + + neigh->t0timer = neigh->t0; + + rose_link_set_timer(neigh); +} + +/* + * This handles all restart and diagnostic frames. + */ +void rose_link_rx_restart(struct sk_buff *skb, struct rose_neigh *neigh, unsigned short frametype) +{ + struct sk_buff *skbn; + + switch (frametype) { + case ROSE_RESTART_REQUEST: + neigh->t0timer = 0; + neigh->restarted = 1; + del_timer(&neigh->timer); + rose_transmit_restart_confirmation(neigh); + break; + + case ROSE_RESTART_CONFIRMATION: + neigh->t0timer = 0; + neigh->restarted = 1; + del_timer(&neigh->timer); + break; + + case ROSE_DIAGNOSTIC: + printk(KERN_WARNING "rose: diagnostic #%d\n", skb->data[3]); + break; + + default: + printk(KERN_WARNING "rose: received unknown %02X with LCI 000\n", frametype); + break; + } + + if (neigh->restarted) { + while ((skbn = skb_dequeue(&neigh->queue)) != NULL) + if (!ax25_send_frame(skbn, (ax25_address *)neigh->dev->dev_addr, &neigh->callsign, neigh->digipeat, neigh->dev)) + kfree_skb(skbn, FREE_WRITE); + } +} + +/* + * This routine is called when a Restart Request is needed + */ +void rose_transmit_restart_request(struct rose_neigh *neigh) +{ + struct sk_buff *skb; + unsigned char *dptr; + int len; + + len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN + 3; + + if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) + return; + + skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN); + + dptr = skb_put(skb, ROSE_MIN_LEN + 3); + + *dptr++ = AX25_P_ROSE; + *dptr++ = GFI; + *dptr++ = 0x00; + *dptr++ = ROSE_RESTART_REQUEST; + *dptr++ = 0x00; + *dptr++ = 0; + + skb->free = 1; + skb->sk = NULL; + + if (!ax25_send_frame(skb, 
(ax25_address *)neigh->dev->dev_addr, &neigh->callsign, neigh->digipeat, neigh->dev)) + kfree_skb(skb, FREE_WRITE); +} + +/* + * This routine is called when a Restart Confirmation is needed + */ +void rose_transmit_restart_confirmation(struct rose_neigh *neigh) +{ + struct sk_buff *skb; + unsigned char *dptr; + int len; + + len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN + 1; + + if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) + return; + + skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN); + + dptr = skb_put(skb, ROSE_MIN_LEN + 1); + + *dptr++ = AX25_P_ROSE; + *dptr++ = GFI; + *dptr++ = 0x00; + *dptr++ = ROSE_RESTART_CONFIRMATION; + + skb->free = 1; + skb->sk = NULL; + + if (!ax25_send_frame(skb, (ax25_address *)neigh->dev->dev_addr, &neigh->callsign, neigh->digipeat, neigh->dev)) + kfree_skb(skb, FREE_WRITE); +} + +/* + * This routine is called when a Diagnostic is required. + */ +void rose_transmit_diagnostic(struct rose_neigh *neigh, unsigned char diag) +{ + struct sk_buff *skb; + unsigned char *dptr; + int len; + + len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN + 2; + + if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) + return; + + skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN); + + dptr = skb_put(skb, ROSE_MIN_LEN + 2); + + *dptr++ = AX25_P_ROSE; + *dptr++ = GFI; + *dptr++ = 0x00; + *dptr++ = ROSE_DIAGNOSTIC; + *dptr++ = diag; + + skb->free = 1; + skb->sk = NULL; + + if (!ax25_send_frame(skb, (ax25_address *)neigh->dev->dev_addr, &neigh->callsign, neigh->digipeat, neigh->dev)) + kfree_skb(skb, FREE_WRITE); +} + +/* + * This routine is called when a Clear Request is needed outside of the context + * of a connected socket. 
+ */ +void rose_transmit_clear_request(struct rose_neigh *neigh, unsigned int lci, unsigned char cause) +{ + struct sk_buff *skb; + unsigned char *dptr; + int len; + + len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN + 3; + + if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) + return; + + skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN); + + dptr = skb_put(skb, ROSE_MIN_LEN + 3); + + *dptr++ = AX25_P_ROSE; + *dptr++ = ((lci >> 8) & 0x0F) | GFI; + *dptr++ = ((lci >> 0) & 0xFF); + *dptr++ = ROSE_CLEAR_REQUEST; + *dptr++ = cause; + *dptr++ = 0x00; + + skb->free = 1; + skb->sk = NULL; + + if (!ax25_send_frame(skb, (ax25_address *)neigh->dev->dev_addr, &neigh->callsign, neigh->digipeat, neigh->dev)) + kfree_skb(skb, FREE_WRITE); +} + +void rose_transmit_link(struct sk_buff *skb, struct rose_neigh *neigh) +{ + unsigned char *dptr; + +#ifdef CONFIG_FIREWALL + if (call_fw_firewall(PF_ROSE, skb->dev, skb->data, NULL) != FW_ACCEPT) + return; +#endif + + if (!ax25_link_up((ax25_address *)neigh->dev->dev_addr, &neigh->callsign, neigh->dev)) + neigh->restarted = 0; + + dptr = skb_push(skb, 1); + *dptr++ = AX25_P_ROSE; + + skb->arp = 1; + skb->free = 1; + + if (neigh->restarted) { + if (!ax25_send_frame(skb, (ax25_address *)neigh->dev->dev_addr, &neigh->callsign, neigh->digipeat, neigh->dev)) + kfree_skb(skb, FREE_WRITE); + } else { + skb_queue_tail(&neigh->queue, skb); + + if (neigh->t0timer == 0) { + rose_transmit_restart_request(neigh); + neigh->t0timer = neigh->t0; + rose_link_set_timer(neigh); + } + } +} + +#endif diff --git a/net/rose/rose_out b/net/rose/rose_out new file mode 100644 index 000000000..745cb5a2b --- /dev/null +++ b/net/rose/rose_out @@ -0,0 +1,254 @@ +/* + * Rose release 001 + * + * This is ALPHA test software. This code may break your machine, randomly fail to work with new + * releases, misbehave and/or generally screw up. It might even work. 
+ * + * This code REQUIRES 2.1.0 or higher/ NET3.029 + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * Rose 001 Jonathan(G4KLX) Cloned from nr_out.c + */ + +#include <linux/config.h> +#if defined(CONFIG_ROSE) || defined(CONFIG_ROSE_MODULE) +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <net/ax25.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <asm/segment.h> +#include <asm/system.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <net/rose.h> + +/* + * This is where all Rose frames pass; + */ +void rose_output(struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff *skbn; + unsigned char header[ROSE_MIN_LEN]; + int err, frontlen, len; + + if (skb->len - ROSE_MIN_LEN > ROSE_PACLEN) { + /* Save a copy of the Header */ + memcpy(header, skb->data, ROSE_MIN_LEN); + skb_pull(skb, ROSE_MIN_LEN); + + frontlen = skb_headroom(skb); + + while (skb->len > 0) { + if ((skbn = sock_alloc_send_skb(sk, frontlen + ROSE_PACLEN, 0, 0, &err)) == NULL) + return; + + skbn->sk = sk; + skbn->free = 1; + skbn->arp = 1; + + skb_reserve(skbn, frontlen); + + len = (ROSE_PACLEN > skb->len) ? 
skb->len : ROSE_PACLEN; + + /* Copy the user data */ + memcpy(skb_put(skbn, len), skb->data, len); + skb_pull(skb, len); + + /* Duplicate the Header */ + skb_push(skbn, ROSE_MIN_LEN); + memcpy(skbn->data, header, ROSE_MIN_LEN); + + if (skb->len > 0) + skbn->data[2] |= M_BIT; + + skb_queue_tail(&sk->write_queue, skbn); /* Throw it on the queue */ + } + + skb->free = 1; + kfree_skb(skb, FREE_WRITE); + } else { + skb_queue_tail(&sk->write_queue, skb); /* Throw it on the queue */ + } + + if (sk->protinfo.rose->state == ROSE_STATE_3) + rose_kick(sk); +} + +/* + * This procedure is passed a buffer descriptor for an iframe. It builds + * the rest of the control part of the frame and then writes it out. + */ +static void rose_send_iframe(struct sock *sk, struct sk_buff *skb, int last) +{ + if (skb == NULL) + return; + + if (last) + skb->data[0] |= D_BIT; + + skb->data[2] |= (sk->protinfo.rose->vr << 5) & 0xE0; + skb->data[2] |= (sk->protinfo.rose->vs << 1) & 0x0E; + + rose_transmit_buffer(sk, skb); +} + +void rose_send_nak_frame(struct sock *sk) +{ + struct sk_buff *skb, *skbn; + + if ((skb = skb_peek(&sk->protinfo.rose->ack_queue)) == NULL) + return; + + if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) + return; + + skbn->data[2] = sk->protinfo.rose->va; + skbn->data[3] = sk->protinfo.rose->vr; + + if (sk->protinfo.rose->condition & OWN_RX_BUSY_CONDITION) + skbn->data[4] |= NR_CHOKE_FLAG; + + rose_transmit_buffer(sk, skbn); + + sk->protinfo.rose->condition &= ~ACK_PENDING_CONDITION; + sk->protinfo.rose->vl = sk->protinfo.rose->vr; + sk->protinfo.rose->t1timer = 0; +} + +void rose_kick(struct sock *sk) +{ + struct sk_buff *skb, *skbn; + int last = 1; + unsigned short start, end, next; + + del_timer(&sk->timer); + + start = (skb_peek(&sk->protinfo.rose->ack_queue) == NULL) ? 
sk->protinfo.rose->va : sk->protinfo.rose->vs; + end = (sk->protinfo.rose->va + sk->window) % ROSE_MODULUS; + + if (!(sk->protinfo.rose->condition & PEER_RX_BUSY_CONDITION) && + start != end && + skb_peek(&sk->write_queue) != NULL) { + + sk->protinfo.rose->vs = start; + + /* + * Transmit data until either we're out of data to send or + * the window is full. + */ + + /* + * Dequeue the frame and copy it. + */ + skb = skb_dequeue(&sk->write_queue); + + do { + if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { + skb_queue_head(&sk->write_queue, skb); + break; + } + + next = (sk->protinfo.rose->vs + 1) % ROSE_MODULUS; + last = (next == end); + + /* + * Transmit the frame copy. + */ + rose_send_iframe(sk, skbn, last); + + sk->protinfo.rose->vs = next; + + /* + * Requeue the original data frame. + */ + skb_queue_tail(&sk->protinfo.rose->ack_queue, skb); + + } while (!last && (skb = skb_dequeue(&sk->write_queue)) != NULL); + + sk->protinfo.rose->vl = sk->protinfo.rose->vr; + sk->protinfo.rose->condition &= ~ACK_PENDING_CONDITION; + } + + rose_set_timer(sk); +} + +void rose_transmit_buffer(struct sock *sk, struct sk_buff *skb) +{ + unsigned char *dptr; + + dptr = skb_push(skb, 1); + *dptr = AX25_P_ROSE; + + skb->arp = 1; + + if (!ax25_send_frame(skb, (ax25_address *)sk->protinfo.rose->neighbour->dev->dev_addr, &sk->protinfo.rose->neighbour->callsign, sk->protinfo.rose->neighbour->digipeat, sk->protinfo.rose->neighbour->dev)) { + kfree_skb(skb, FREE_WRITE); + + sk->state = TCP_CLOSE; + sk->err = ENETUNREACH; + if (!sk->dead) + sk->state_change(sk); + sk->dead = 1; + } +} + +/* + * The following routines are taken from page 170 of the 7th ARRL Computer + * Networking Conference paper, as is the whole state machine. + */ + +void rose_establish_data_link(struct sock *sk) +{ + sk->protinfo.rose->condition = 0x00; + + rose_write_internal(sk, ROSE_CALL_REQUEST); + + sk->protinfo.rose->t1timer = sk->protinfo.rose->t1; +} + +/* + * Never send a NAK when we are CHOKEd. 
+ */ +void rose_enquiry_response(struct sock *sk) +{ + int frametype = NR_INFOACK; + + if (sk->protinfo.rose->condition & OWN_RX_BUSY_CONDITION) + frametype |= NR_CHOKE_FLAG; + + rose_write_internal(sk, frametype); + + sk->protinfo.rose->vl = sk->protinfo.rose->vr; + sk->protinfo.rose->condition &= ~ACK_PENDING_CONDITION; +} + +void rose_check_iframes_acked(struct sock *sk, unsigned short nr) +{ + if (sk->protinfo.rose->vs == nr) { + rose_frames_acked(sk, nr); + } else { + if (sk->protinfo.rose->va != nr) { + rose_frames_acked(sk, nr); + } + } +} + +#endif diff --git a/net/rose/rose_out.c b/net/rose/rose_out.c new file mode 100644 index 000000000..50b2587f8 --- /dev/null +++ b/net/rose/rose_out.c @@ -0,0 +1,193 @@ +/* + * Rose release 001 + * + * This is ALPHA test software. This code may break your machine, randomly fail to work with new + * releases, misbehave and/or generally screw up. It might even work. + * + * This code REQUIRES 2.1.0 or higher/ NET3.029 + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * History + * Rose 001 Jonathan(G4KLX) Cloned from nr_out.c + */ + +#include <linux/config.h> +#if defined(CONFIG_ROSE) || defined(CONFIG_ROSE_MODULE) +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <net/ax25.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <asm/segment.h> +#include <asm/system.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <net/rose.h> + +/* + * This is where all Rose frames pass; + */ +void rose_output(struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff *skbn; + unsigned char header[ROSE_MIN_LEN]; + int err, frontlen, len; + + if (skb->len - ROSE_MIN_LEN > ROSE_PACLEN) { + /* Save a copy of the Header */ + memcpy(header, skb->data, ROSE_MIN_LEN); + skb_pull(skb, ROSE_MIN_LEN); + + frontlen = skb_headroom(skb); + + while (skb->len > 0) { + if ((skbn = sock_alloc_send_skb(sk, frontlen + ROSE_PACLEN, 0, 0, &err)) == NULL) + return; + + skbn->sk = sk; + skbn->free = 1; + skbn->arp = 1; + + skb_reserve(skbn, frontlen); + + len = (ROSE_PACLEN > skb->len) ? skb->len : ROSE_PACLEN; + + /* Copy the user data */ + memcpy(skb_put(skbn, len), skb->data, len); + skb_pull(skb, len); + + /* Duplicate the Header */ + skb_push(skbn, ROSE_MIN_LEN); + memcpy(skbn->data, header, ROSE_MIN_LEN); + + if (skb->len > 0) + skbn->data[2] |= M_BIT; + + skb_queue_tail(&sk->write_queue, skbn); /* Throw it on the queue */ + } + + skb->free = 1; + kfree_skb(skb, FREE_WRITE); + } else { + skb_queue_tail(&sk->write_queue, skb); /* Throw it on the queue */ + } + + if (sk->protinfo.rose->state == ROSE_STATE_3) + rose_kick(sk); +} + +/* + * This procedure is passed a buffer descriptor for an iframe. 
It builds + * the rest of the control part of the frame and then writes it out. + */ +static void rose_send_iframe(struct sock *sk, struct sk_buff *skb) +{ + if (skb == NULL) + return; + + skb->data[2] |= (sk->protinfo.rose->vr << 5) & 0xE0; + skb->data[2] |= (sk->protinfo.rose->vs << 1) & 0x0E; + + rose_transmit_link(skb, sk->protinfo.rose->neighbour); +} + +void rose_kick(struct sock *sk) +{ + struct sk_buff *skb, *skbn; + int last = 1; + unsigned short start, end, next; + + del_timer(&sk->timer); + + start = (skb_peek(&sk->protinfo.rose->ack_queue) == NULL) ? sk->protinfo.rose->va : sk->protinfo.rose->vs; + end = (sk->protinfo.rose->va + sk->window) % ROSE_MODULUS; + + if (!(sk->protinfo.rose->condition & PEER_RX_BUSY_CONDITION) && + start != end && + skb_peek(&sk->write_queue) != NULL) { + + sk->protinfo.rose->vs = start; + + /* + * Transmit data until either we're out of data to send or + * the window is full. + */ + + /* + * Dequeue the frame and copy it. + */ + skb = skb_dequeue(&sk->write_queue); + + do { + if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { + skb_queue_head(&sk->write_queue, skb); + break; + } + + next = (sk->protinfo.rose->vs + 1) % ROSE_MODULUS; + last = (next == end); + + /* + * Transmit the frame copy. + */ + rose_send_iframe(sk, skbn); + + sk->protinfo.rose->vs = next; + + /* + * Requeue the original data frame. + */ + skb_queue_tail(&sk->protinfo.rose->ack_queue, skb); + + } while (!last && (skb = skb_dequeue(&sk->write_queue)) != NULL); + + sk->protinfo.rose->vl = sk->protinfo.rose->vr; + } + + rose_set_timer(sk); +} + +/* + * The following routines are taken from page 170 of the 7th ARRL Computer + * Networking Conference paper, as is the whole state machine. 
+ */ + +void rose_enquiry_response(struct sock *sk) +{ + if (sk->protinfo.rose->condition & OWN_RX_BUSY_CONDITION) { + rose_write_internal(sk, ROSE_RNR); + } else { + rose_write_internal(sk, ROSE_RR); + } + + sk->protinfo.rose->vl = sk->protinfo.rose->vr; +} + +void rose_check_iframes_acked(struct sock *sk, unsigned short nr) +{ + if (sk->protinfo.rose->vs == nr) { + rose_frames_acked(sk, nr); + } else { + if (sk->protinfo.rose->va != nr) { + rose_frames_acked(sk, nr); + } + } +} + +#endif diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c new file mode 100644 index 000000000..9396831b3 --- /dev/null +++ b/net/rose/rose_route.c @@ -0,0 +1,810 @@ +/* + * Rose release 001 + * + * This is ALPHA test software. This code may break your machine, randomly fail to work with new + * releases, misbehave and/or generally screw up. It might even work. + * + * This code REQUIRES 2.1.0 or higher/ NET3.029 + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * Rose 001 Jonathan(G4KLX) Cloned from nr_route.c. 
+ */ + +#include <linux/config.h> +#if defined(CONFIG_ROSE) || defined(CONFIG_ROSE_MODULE) +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <net/ax25.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <net/arp.h> +#include <linux/if_arp.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <asm/segment.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <linux/fcntl.h> +#include <linux/termios.h> /* For TIOCINQ/OUTQ */ +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/notifier.h> +#include <linux/firewall.h> +#include <net/rose.h> + +static unsigned int rose_neigh_no = 1; + +static struct rose_node *rose_node_list = NULL; +static struct rose_neigh *rose_neigh_list = NULL; +static struct rose_route *rose_route_list = NULL; + +static void rose_remove_neigh(struct rose_neigh *); + +/* + * Add a new route to a node, and in the process add the node and the + * neighbour if it is new. 
+ */ +static int rose_add_node(struct rose_route_struct *rose_route, struct device *dev) +{ + struct rose_node *rose_node; + struct rose_neigh *rose_neigh; + unsigned long flags; + int i; + + for (rose_node = rose_node_list; rose_node != NULL; rose_node = rose_node->next) + if (rosecmp(&rose_route->address, &rose_node->address) == 0) + break; + + for (rose_neigh = rose_neigh_list; rose_neigh != NULL; rose_neigh = rose_neigh->next) + if (ax25cmp(&rose_route->neighbour, &rose_neigh->callsign) == 0 && rose_neigh->dev == dev) + break; + + if (rose_neigh == NULL) { + if ((rose_neigh = (struct rose_neigh *)kmalloc(sizeof(*rose_neigh), GFP_ATOMIC)) == NULL) + return -ENOMEM; + + rose_neigh->callsign = rose_route->neighbour; + rose_neigh->digipeat = NULL; + rose_neigh->dev = dev; + rose_neigh->count = 0; + rose_neigh->number = rose_neigh_no++; + rose_neigh->restarted = 0; + skb_queue_head_init(&rose_neigh->queue); + rose_neigh->t0 = sysctl_rose_restart_request_timeout; + rose_neigh->t0timer = 0; + init_timer(&rose_neigh->timer); + + if (rose_route->ndigis != 0) { + if ((rose_neigh->digipeat = kmalloc(sizeof(ax25_digi), GFP_KERNEL)) == NULL) { + kfree_s(rose_neigh, sizeof(*rose_neigh)); + return -ENOMEM; + } + rose_neigh->digipeat->ndigi = rose_route->ndigis; + for (i = 0; i < rose_route->ndigis; i++) + rose_neigh->digipeat->calls[i] = rose_route->digipeaters[i]; + } + + save_flags(flags); cli(); + rose_neigh->next = rose_neigh_list; + rose_neigh_list = rose_neigh; + restore_flags(flags); + } + + if (rose_node == NULL) { + if ((rose_node = (struct rose_node *)kmalloc(sizeof(*rose_node), GFP_ATOMIC)) == NULL) + return -ENOMEM; + + rose_node->address = rose_route->address; + rose_node->which = 0; + rose_node->count = 1; + + rose_node->neighbour[0] = rose_neigh; + + save_flags(flags); cli(); + rose_node->next = rose_node_list; + rose_node_list = rose_node; + restore_flags(flags); + + rose_neigh->count++; + + return 0; + } + + /* We have space at the bottom, slot it in */ + if 
(rose_node->count < 3) { + rose_node->neighbour[2] = rose_node->neighbour[1]; + rose_node->neighbour[1] = rose_node->neighbour[0]; + + rose_node->neighbour[0] = rose_neigh; + + rose_node->count++; + rose_neigh->count++; + } + + return 0; +} + +static void rose_remove_node(struct rose_node *rose_node) +{ + struct rose_node *s; + unsigned long flags; + + save_flags(flags); + cli(); + + if ((s = rose_node_list) == rose_node) { + rose_node_list = rose_node->next; + restore_flags(flags); + kfree_s(rose_node, sizeof(struct rose_node)); + return; + } + + while (s != NULL && s->next != NULL) { + if (s->next == rose_node) { + s->next = rose_node->next; + restore_flags(flags); + kfree_s(rose_node, sizeof(struct rose_node)); + return; + } + + s = s->next; + } + + restore_flags(flags); +} + +static void rose_remove_neigh(struct rose_neigh *rose_neigh) +{ + struct rose_neigh *s; + unsigned long flags; + struct sk_buff *skb; + + del_timer(&rose_neigh->timer); + + while ((skb = skb_dequeue(&rose_neigh->queue)) != NULL) + kfree_skb(skb, FREE_WRITE); + + save_flags(flags); + cli(); + + if ((s = rose_neigh_list) == rose_neigh) { + rose_neigh_list = rose_neigh->next; + restore_flags(flags); + if (rose_neigh->digipeat != NULL) + kfree_s(rose_neigh->digipeat, sizeof(ax25_digi)); + kfree_s(rose_neigh, sizeof(struct rose_neigh)); + return; + } + + while (s != NULL && s->next != NULL) { + if (s->next == rose_neigh) { + s->next = rose_neigh->next; + restore_flags(flags); + if (rose_neigh->digipeat != NULL) + kfree_s(rose_neigh->digipeat, sizeof(ax25_digi)); + kfree_s(rose_neigh, sizeof(struct rose_neigh)); + return; + } + + s = s->next; + } + + restore_flags(flags); +} + +static void rose_remove_route(struct rose_route *rose_route) +{ + struct rose_route *s; + unsigned long flags; + + save_flags(flags); + cli(); + + if ((s = rose_route_list) == rose_route) { + rose_route_list = rose_route->next; + restore_flags(flags); + kfree_s(rose_route, sizeof(struct rose_route)); + return; + } + + 
while (s != NULL && s->next != NULL) { + if (s->next == rose_route) { + s->next = rose_route->next; + restore_flags(flags); + kfree_s(rose_route, sizeof(struct rose_route)); + return; + } + + s = s->next; + } + + restore_flags(flags); +} + +/* + * "Delete" a node. Strictly speaking remove a route to a node. The node + * is only deleted if no routes are left to it. + */ +static int rose_del_node(struct rose_route_struct *rose_route, struct device *dev) +{ + struct rose_node *rose_node; + struct rose_neigh *rose_neigh; + int i; + + for (rose_node = rose_node_list; rose_node != NULL; rose_node = rose_node->next) + if (rosecmp(&rose_route->address, &rose_node->address) == 0) + break; + + if (rose_node == NULL) return -EINVAL; + + for (rose_neigh = rose_neigh_list; rose_neigh != NULL; rose_neigh = rose_neigh->next) + if (ax25cmp(&rose_route->neighbour, &rose_neigh->callsign) == 0 && rose_neigh->dev == dev) + break; + + if (rose_neigh == NULL) return -EINVAL; + + for (i = 0; i < rose_node->count; i++) { + if (rose_node->neighbour[i] == rose_neigh) { + rose_neigh->count--; + + if (rose_neigh->count == 0) + rose_remove_neigh(rose_neigh); + + rose_node->count--; + + if (rose_node->count == 0) { + rose_remove_node(rose_node); + } else { + switch (i) { + case 0: + rose_node->neighbour[0] = rose_node->neighbour[1]; + case 1: + rose_node->neighbour[1] = rose_node->neighbour[2]; + case 2: + break; + } + } + + return 0; + } + } + + return -EINVAL; +} + +/* + * A device has been removed. Remove its routes and neighbours. 
+ */ +void rose_rt_device_down(struct device *dev) +{ + struct rose_neigh *s, *rose_neigh = rose_neigh_list; + struct rose_node *t, *rose_node; + int i; + + while (rose_neigh != NULL) { + s = rose_neigh; + rose_neigh = rose_neigh->next; + + if (s->dev == dev) { + rose_node = rose_node_list; + + while (rose_node != NULL) { + t = rose_node; + rose_node = rose_node->next; + + for (i = 0; i < t->count; i++) { + if (t->neighbour[i] == s) { + t->count--; + + switch (i) { + case 0: + t->neighbour[0] = t->neighbour[1]; + case 1: + t->neighbour[1] = t->neighbour[2]; + case 2: + break; + } + } + } + + if (t->count <= 0) + rose_remove_node(t); + } + + rose_remove_neigh(s); + } + } +} + +/* + * A device has been removed. Remove its links. + */ +void rose_route_device_down(struct device *dev) +{ + struct rose_route *s, *rose_route = rose_route_list; + + while (rose_route != NULL) { + s = rose_route; + rose_route = rose_route->next; + + if (s->neigh1->dev == dev || s->neigh2->dev == dev) + rose_remove_route(s); + } +} + +/* + * Check that the device given is a valid AX.25 interface that is "up". + */ +struct device *rose_ax25_dev_get(char *devname) +{ + struct device *dev; + + if ((dev = dev_get(devname)) == NULL) + return NULL; + + if ((dev->flags & IFF_UP) && dev->type == ARPHRD_AX25) + return dev; + + return NULL; +} + +/* + * Find the first active Rose device, usually "rose0". + */ +struct device *rose_dev_first(void) +{ + struct device *dev, *first = NULL; + + for (dev = dev_base; dev != NULL; dev = dev->next) + if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE) + if (first == NULL || strncmp(dev->name, first->name, 3) < 0) + first = dev; + + return first; +} + +/* + * Find the Rose device for the given address. 
+ */ +struct device *rose_dev_get(rose_address *addr) +{ + struct device *dev; + + for (dev = dev_base; dev != NULL; dev = dev->next) + if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE && rosecmp(addr, (rose_address *)dev->dev_addr) == 0) + return dev; + + return NULL; +} + +/* + * Find a neighbour given a Rose address. + */ +struct rose_neigh *rose_get_neigh(rose_address *addr) +{ + struct rose_node *node; + + for (node = rose_node_list; node != NULL; node = node->next) + if (rosecmp(&node->address, addr) == 0) + break; + + if (node == NULL) return NULL; + + if (node->which >= node->count) return NULL; + + return node->neighbour[node->which]; +} + +/* + * Handle the ioctls that control the routing functions. + */ +int rose_rt_ioctl(unsigned int cmd, void *arg) +{ + struct rose_route_struct rose_route; + struct device *dev; + int err; + + switch (cmd) { + + case SIOCADDRT: + if ((err = verify_area(VERIFY_READ, arg, sizeof(struct rose_route_struct))) != 0) + return err; + copy_from_user(&rose_route, arg, sizeof(struct rose_route_struct)); + if ((dev = rose_ax25_dev_get(rose_route.device)) == NULL) + return -EINVAL; + if (rose_dev_get(&rose_route.address) != NULL) /* Can't add routes to ourself */ + return -EINVAL; + return rose_add_node(&rose_route, dev); + + case SIOCDELRT: + if ((err = verify_area(VERIFY_READ, arg, sizeof(struct rose_route_struct))) != 0) + return err; + copy_from_user(&rose_route, arg, sizeof(struct rose_route_struct)); + if ((dev = rose_ax25_dev_get(rose_route.device)) == NULL) + return -EINVAL; + return rose_del_node(&rose_route, dev); + + default: + return -EINVAL; + } + + return 0; +} + +/* + * A level 2 link has timed out, therefore it appears to be a poor link, + * then don't use that neighbour until it is reset. XXX others. 
+ */ +void rose_link_failed(ax25_address *callsign, struct device *dev) +{ + struct rose_neigh *rose_neigh; + struct rose_node *rose_node; + struct sk_buff *skb; + + for (rose_neigh = rose_neigh_list; rose_neigh != NULL; rose_neigh = rose_neigh->next) + if (ax25cmp(&rose_neigh->callsign, callsign) == 0 && rose_neigh->dev == dev) + break; + + if (rose_neigh == NULL) return; + + rose_neigh->restarted = 0; + rose_neigh->t0timer = 0; + del_timer(&rose_neigh->timer); + + while ((skb = skb_dequeue(&rose_neigh->queue)) != NULL) + kfree_skb(skb, FREE_WRITE); + + for (rose_node = rose_node_list; rose_node != NULL; rose_node = rose_node->next) + if (rose_node->which < rose_node->count && rose_node->neighbour[rose_node->which] == rose_neigh) + rose_node->which++; +} + +/* + * A device has been "downed" remove its link status. XXX others. + */ +void rose_link_device_down(struct device *dev) +{ + struct rose_neigh *rose_neigh; + struct rose_node *rose_node; + struct sk_buff *skb; + + for (rose_neigh = rose_neigh_list; rose_neigh != NULL; rose_neigh = rose_neigh->next) { + if (rose_neigh->dev == dev) { + rose_neigh->restarted = 0; + rose_neigh->t0timer = 0; + del_timer(&rose_neigh->timer); + + while ((skb = skb_dequeue(&rose_neigh->queue)) != NULL) + kfree_skb(skb, FREE_WRITE); + + for (rose_node = rose_node_list; rose_node != NULL; rose_node = rose_node->next) + if (rose_node->which < rose_node->count && rose_node->neighbour[rose_node->which] == rose_neigh) + rose_node->which++; + } + } +} + +/* + * Route a frame to an appropriate AX.25 connection. A NULL ax25_cb + * indicates an internally generated frame. 
+ */ +int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) +{ + struct rose_neigh *rose_neigh, *new_neigh; + struct rose_node *rose_node; + struct rose_route *rose_route; + rose_address *dest_addr; + struct sock *sk; + unsigned short frametype; + unsigned int lci; + struct device *dev; + unsigned long flags; + +#ifdef CONFIG_FIREWALL + if (call_in_firewall(PF_ROSE, skb->dev, skb->data, NULL) != FW_ACCEPT) + return 0; +#endif + + frametype = skb->data[2]; + lci = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF); + + for (rose_neigh = rose_neigh_list; rose_neigh != NULL; rose_neigh = rose_neigh->next) + if (ax25cmp(&ax25->dest_addr, &rose_neigh->callsign) == 0 && ax25->device == rose_neigh->dev) + break; + + if (rose_neigh == NULL) + return 0; + + /* + * LCI of zero is always for us, and its always a restart + * frame. + */ + if (lci == 0) { + rose_link_rx_restart(skb, rose_neigh, frametype); + return 0; + } + + /* + * Find an existing socket. + */ + if ((sk = rose_find_socket(lci, rose_neigh->dev)) != NULL) { + skb->h.raw = skb->data; + return rose_process_rx_frame(sk, skb); + } + + /* + * Is is a Call Request and is it for us ? + */ + if (frametype == ROSE_CALL_REQUEST) { + dest_addr = (rose_address *)(skb->data + 4); + + if ((dev = rose_dev_get(dest_addr)) != NULL) + return rose_rx_call_request(skb, dev, rose_neigh, lci); + } + + if (!sysctl_rose_routing_control) { + rose_transmit_clear_request(rose_neigh, lci, 0x0D); + return 0; + } + + /* + * Route it to the next in line if we have an entry for it. + */ + + /* + * We should check for the random number in the facilities + * here. XXX. 
+ */ + for (rose_route = rose_route_list; rose_route != NULL; rose_route = rose_route->next) { + if (rose_route->lci1 == lci && rose_route->neigh1 == rose_neigh) { + skb->data[0] &= 0xF0; + skb->data[0] |= (rose_route->lci2 >> 8) & 0x0F; + skb->data[1] = (rose_route->lci2 >> 0) & 0xFF; + rose_transmit_link(skb, rose_route->neigh2); + if (frametype == ROSE_CLEAR_CONFIRMATION) + rose_remove_route(rose_route); + return 1; + } + if (rose_route->lci2 == lci && rose_route->neigh2 == rose_neigh) { + skb->data[0] &= 0xF0; + skb->data[0] |= (rose_route->lci1 >> 8) & 0x0F; + skb->data[1] = (rose_route->lci1 >> 0) & 0xFF; + rose_transmit_link(skb, rose_route->neigh1); + if (frametype == ROSE_CLEAR_CONFIRMATION) + rose_remove_route(rose_route); + return 1; + } + } + + /* + * We know that: + * 1. The frame isn't for us, + * 2. It isn't "owned" by any existing route. + */ + if (frametype != ROSE_CALL_REQUEST) /* XXX */ + return 0; + + dest_addr = (rose_address *)(skb->data + 4); + + /* + * Create a new route entry, if we can. + */ + for (rose_node = rose_node_list; rose_node != NULL; rose_node = rose_node->next) + if (rosecmp(&rose_node->address, dest_addr) == 0) + break; + /* + * Its an unknown node, or is unreachable. 
+ */ + if (rose_node == NULL || rose_node->which >= rose_node->count) { + rose_transmit_clear_request(rose_neigh, lci, 0x0D); + return 0; + } + + if ((rose_route = (struct rose_route *)kmalloc(sizeof(*rose_route), GFP_ATOMIC)) == NULL) { + rose_transmit_clear_request(rose_neigh, lci, 0x0D); + return 0; + } + + new_neigh = rose_node->neighbour[rose_node->which]; + + rose_route->lci1 = lci; + rose_route->neigh1 = rose_neigh; + rose_route->lci2 = rose_new_lci(new_neigh->dev); + rose_route->neigh2 = new_neigh; + + save_flags(flags); cli(); + rose_route->next = rose_route_list; + rose_route_list = rose_route; + restore_flags(flags); + + skb->data[0] &= 0xF0; + skb->data[0] |= (rose_route->lci2 >> 8) & 0x0F; + skb->data[1] = (rose_route->lci2 >> 0) & 0xFF; + + rose_transmit_link(skb, rose_route->neigh2); + + return 1; +} + +int rose_nodes_get_info(char *buffer, char **start, off_t offset, + int length, int dummy) +{ + struct rose_node *rose_node; + int len = 0; + off_t pos = 0; + off_t begin = 0; + int i; + + cli(); + + len += sprintf(buffer, "address w n neigh neigh neigh\n"); + + for (rose_node = rose_node_list; rose_node != NULL; rose_node = rose_node->next) { + len += sprintf(buffer + len, "%-10s %d %d", + rose2asc(&rose_node->address), + rose_node->which + 1, + rose_node->count); + + for (i = 0; i < rose_node->count; i++) + len += sprintf(buffer + len, " %05d", + rose_node->neighbour[i]->number); + + len += sprintf(buffer + len, "\n"); + + pos = begin + len; + + if (pos < offset) { + len = 0; + begin = pos; + } + + if (pos > offset + length) + break; + } + + sti(); + + *start = buffer + (offset - begin); + len -= (offset - begin); + + if (len > length) len = length; + + return len; +} + +int rose_neigh_get_info(char *buffer, char **start, off_t offset, + int length, int dummy) +{ + struct rose_neigh *rose_neigh; + int len = 0; + off_t pos = 0; + off_t begin = 0; + + cli(); + + len += sprintf(buffer, "addr callsign dev count restart t0\n"); + + for (rose_neigh = 
rose_neigh_list; rose_neigh != NULL; rose_neigh = rose_neigh->next) { + len += sprintf(buffer + len, "%05d %-9s %-4s %3d %3s %3d/%03d\n", + rose_neigh->number, + ax2asc(&rose_neigh->callsign), + rose_neigh->dev ? rose_neigh->dev->name : "???", + rose_neigh->count, + (rose_neigh->restarted) ? "yes" : "no", + rose_neigh->t0timer / PR_SLOWHZ, + rose_neigh->t0 / PR_SLOWHZ); + + pos = begin + len; + + if (pos < offset) { + len = 0; + begin = pos; + } + + if (pos > offset + length) + break; + } + + sti(); + + *start = buffer + (offset - begin); + len -= (offset - begin); + + if (len > length) len = length; + + return len; +} + +int rose_routes_get_info(char *buffer, char **start, off_t offset, + int length, int dummy) +{ + struct rose_route *rose_route; + int len = 0; + off_t pos = 0; + off_t begin = 0; + + cli(); + + len += sprintf(buffer, "lci callsign dev <-> lci callsign dev\n"); + + for (rose_route = rose_route_list; rose_route != NULL; rose_route = rose_route->next) { + len += sprintf(buffer + len, "%3.3X %-9s %-4s ", + rose_route->lci1, + ax2asc(&rose_route->neigh1->callsign), + rose_route->neigh1->dev ? rose_route->neigh1->dev->name : "???"); + len += sprintf(buffer + len, "%3.3X %-9s %-4s\n", + rose_route->lci2, + ax2asc(&rose_route->neigh2->callsign), + rose_route->neigh2->dev ? rose_route->neigh2->dev->name : "???"); + + pos = begin + len; + + if (pos < offset) { + len = 0; + begin = pos; + } + + if (pos > offset + length) + break; + } + + sti(); + + *start = buffer + (offset - begin); + len -= (offset - begin); + + if (len > length) len = length; + + return len; +} + +#ifdef MODULE + +/* + * Release all memory associated with Rose routing structures. 
+ */ +void rose_rt_free(void) +{ + struct rose_neigh *s, *rose_neigh = rose_neigh_list; + struct rose_node *t, *rose_node = rose_node_list; + struct rose_route *u, *rose_route = rose_route_list; + + while (rose_neigh != NULL) { + s = rose_neigh; + rose_neigh = rose_neigh->next; + + rose_remove_neigh(s); + } + + while (rose_node != NULL) { + t = rose_node; + rose_node = rose_node->next; + + rose_remove_node(t); + } + + while (rose_route != NULL) { + u = rose_route; + rose_route = rose_route->next; + + rose_remove_route(u); + } +} + +#endif + +#endif diff --git a/net/rose/rose_subr.c b/net/rose/rose_subr.c new file mode 100644 index 000000000..0c1c83fa8 --- /dev/null +++ b/net/rose/rose_subr.c @@ -0,0 +1,494 @@ +/* + * Rose release 001 + * + * This is ALPHA test software. This code may break your machine, randomly fail to work with new + * releases, misbehave and/or generally screw up. It might even work. + * + * This code REQUIRES 2.1.0 or higher/ NET3.029 + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * History + * Rose 001 Jonathan(G4KLX) Cloned from nr_subr.c + */ + +#include <linux/config.h> +#if defined(CONFIG_ROSE) || defined(CONFIG_ROSE_MODULE) +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <net/ax25.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <asm/segment.h> +#include <asm/system.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <net/rose.h> + +/* + * This routine purges all of the queues of frames. + */ +void rose_clear_queues(struct sock *sk) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(&sk->write_queue)) != NULL) { + skb->sk = sk; + skb->free = 1; + kfree_skb(skb, FREE_WRITE); + } + + while ((skb = skb_dequeue(&sk->protinfo.rose->ack_queue)) != NULL) { + skb->sk = sk; + skb->free = 1; + kfree_skb(skb, FREE_WRITE); + } + + while ((skb = skb_dequeue(&sk->protinfo.rose->frag_queue)) != NULL) { + kfree_skb(skb, FREE_READ); + } +} + +/* + * This routine purges the input queue of those frames that have been + * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the + * SDL diagram. + */ +void rose_frames_acked(struct sock *sk, unsigned short nr) +{ + struct sk_buff *skb; + + /* + * Remove all the ack-ed frames from the ack queue. + */ + if (sk->protinfo.rose->va != nr) { + while (skb_peek(&sk->protinfo.rose->ack_queue) != NULL && sk->protinfo.rose->va != nr) { + skb = skb_dequeue(&sk->protinfo.rose->ack_queue); + skb->sk = sk; + skb->free = 1; + kfree_skb(skb, FREE_WRITE); + sk->protinfo.rose->va = (sk->protinfo.rose->va + 1) % ROSE_MODULUS; + } + } +} + +/* + * Requeue all the un-ack-ed frames on the output queue to be picked + * up by rose_kick called from the timer. 
This arrangement handles the + * possibility of an empty output queue. + */ +void rose_requeue_frames(struct sock *sk) +{ + struct sk_buff *skb, *skb_prev = NULL; + + while ((skb = skb_dequeue(&sk->protinfo.rose->ack_queue)) != NULL) { + if (skb_prev == NULL) + skb_queue_head(&sk->write_queue, skb); + else + skb_append(skb_prev, skb); + skb_prev = skb; + } +} + +/* + * Validate that the value of nr is between va and vs. Return true or + * false for testing. + */ +int rose_validate_nr(struct sock *sk, unsigned short nr) +{ + unsigned short vc = sk->protinfo.rose->va; + + while (vc != sk->protinfo.rose->vs) { + if (nr == vc) return 1; + vc = (vc + 1) % ROSE_MODULUS; + } + + if (nr == sk->protinfo.rose->vs) return 1; + + return 0; +} + +/* + * This routine is called when the packet layer internally generates a + * control frame. + */ +void rose_write_internal(struct sock *sk, int frametype) +{ + struct sk_buff *skb; + unsigned char *dptr; + unsigned char lci1, lci2; + char buffer[100]; + int len, faclen = 0; + + len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN + 1; + + switch (frametype) { + case ROSE_CALL_REQUEST: + len += 1 + ROSE_ADDR_LEN + ROSE_ADDR_LEN; + faclen = rose_create_facilities(buffer, sk->protinfo.rose); + len += faclen; + break; + case ROSE_CALL_ACCEPTED: + case ROSE_CLEAR_REQUEST: + case ROSE_RESET_REQUEST: + case ROSE_DIAGNOSTIC: + len += 2; + break; + case ROSE_INTERRUPT: + len += 1; + break; + } + + if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) + return; + + /* + * Space for AX.25 header and PID. 
+ */ + skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + 1); + + dptr = skb_put(skb, skb_tailroom(skb)); + + lci1 = (sk->protinfo.rose->lci >> 8) & 0x0F; + lci2 = (sk->protinfo.rose->lci >> 0) & 0xFF; + + switch (frametype) { + + case ROSE_CALL_REQUEST: + *dptr++ = GFI | lci1; + *dptr++ = lci2; + *dptr++ = frametype; + *dptr++ = 0xAA; + memcpy(dptr, &sk->protinfo.rose->dest_addr, ROSE_ADDR_LEN); + dptr += ROSE_ADDR_LEN; + memcpy(dptr, &sk->protinfo.rose->source_addr, ROSE_ADDR_LEN); + dptr += ROSE_ADDR_LEN; + memcpy(dptr, buffer, faclen); + dptr += faclen; + break; + + case ROSE_CALL_ACCEPTED: + *dptr++ = GFI | lci1; + *dptr++ = lci2; + *dptr++ = frametype; + *dptr++ = 0x00; /* Address length */ + *dptr++ = 0; /* Facilities length */ + break; + + case ROSE_CLEAR_REQUEST: + case ROSE_RESET_REQUEST: + *dptr++ = GFI | lci1; + *dptr++ = lci2; + *dptr++ = frametype; + *dptr++ = 0x00; /* XXX */ + *dptr++ = 0x00; /* XXX */ + break; + + case ROSE_INTERRUPT: + *dptr++ = GFI | lci1; + *dptr++ = lci2; + *dptr++ = frametype; + *dptr++ = 0x00; /* XXX */ + break; + + case ROSE_RR: + case ROSE_RNR: + case ROSE_REJ: + *dptr++ = GFI | lci1; + *dptr++ = lci2; + *dptr = frametype; + *dptr++ |= (sk->protinfo.rose->vr << 5) & 0xE0; + break; + + case ROSE_CLEAR_CONFIRMATION: + case ROSE_INTERRUPT_CONFIRMATION: + case ROSE_RESET_CONFIRMATION: + *dptr++ = GFI | lci1; + *dptr++ = lci2; + *dptr++ = frametype; + break; + + default: + printk(KERN_ERR "rose_write_internal: invalid frametype %02X\n", frametype); + kfree_skb(skb, FREE_WRITE); + return; + } + + rose_transmit_link(skb, sk->protinfo.rose->neighbour); +} + +int rose_decode(struct sk_buff *skb, int *ns, int *nr, int *q, int *d, int *m) +{ + unsigned char *frame; + + frame = skb->data; + + *ns = *nr = *q = *d = *m = 0; + + switch (frame[2]) { + case ROSE_CALL_REQUEST: + case ROSE_CALL_ACCEPTED: + case ROSE_CLEAR_REQUEST: + case ROSE_CLEAR_CONFIRMATION: + case ROSE_INTERRUPT: + case ROSE_INTERRUPT_CONFIRMATION: + case 
ROSE_RESET_REQUEST: + case ROSE_RESET_CONFIRMATION: + case ROSE_RESTART_REQUEST: + case ROSE_RESTART_CONFIRMATION: + case ROSE_REGISTRATION_REQUEST: + case ROSE_REGISTRATION_CONFIRMATION: + case ROSE_DIAGNOSTIC: + return frame[2]; + default: + break; + } + + if ((frame[2] & 0x1F) == ROSE_RR || + (frame[2] & 0x1F) == ROSE_RNR || + (frame[2] & 0x1F) == ROSE_REJ) { + *nr = (frame[2] >> 5) & 0x07; + return frame[2] & 0x1F; + } + + if ((frame[2] & 0x01) == ROSE_DATA) { + *q = (frame[0] & Q_BIT) == Q_BIT; + *d = (frame[0] & D_BIT) == D_BIT; + *m = (frame[2] & M_BIT) == M_BIT; + *nr = (frame[2] >> 5) & 0x07; + *ns = (frame[2] >> 1) & 0x07; + return ROSE_DATA; + } + + return ROSE_ILLEGAL; +} + +static int rose_parse_national(unsigned char *p, rose_cb *rose, int len) +{ + unsigned char l, n = 0; + + do { + switch (*p & 0xC0) { + case 0x00: + p += 2; + n += 2; + len -= 2; + break; + + case 0x40: + if (*p == FAC_NATIONAL_RAND) + rose->rand = ((p[1] << 8) & 0xFF00) + ((p[2] << 0) & 0x00FF); + p += 3; + n += 3; + len -= 3; + break; + + case 0x80: + p += 4; + n += 4; + len -= 4; + break; + + case 0xC0: + l = p[1]; + if (*p == FAC_NATIONAL_DEST_DIGI) { + memcpy(&rose->source_digi, p + 2, AX25_ADDR_LEN); + rose->source_ndigis = 1; + } + if (*p == FAC_NATIONAL_SRC_DIGI) { + memcpy(&rose->dest_digi, p + 2, AX25_ADDR_LEN); + rose->dest_ndigis = 1; + } + p += l + 2; + n += l + 2; + len -= l + 2; + break; + } + } while (*p != 0x00 && len > 0); + + return n; +} + +static int rose_parse_ccitt(unsigned char *p, rose_cb *rose, int len) +{ + unsigned char l, n = 0; + char callsign[11]; + + do { + switch (*p & 0xC0) { + case 0x00: + p += 2; + n += 2; + len -= 2; + break; + + case 0x40: + p += 3; + n += 3; + len -= 3; + break; + + case 0x80: + p += 4; + n += 4; + len -= 4; + break; + + case 0xC0: + l = p[1]; + if (*p == FAC_CCITT_DEST_NSAP) { + memcpy(&rose->source_addr, p + 7, ROSE_ADDR_LEN); + memcpy(callsign, p + 12, l - 10); + callsign[l - 10] = '\0'; + rose->source_call = 
*asc2ax(callsign); + } + if (*p == FAC_CCITT_SRC_NSAP) { + memcpy(&rose->dest_addr, p + 7, ROSE_ADDR_LEN); + memcpy(callsign, p + 12, l - 10); + callsign[l - 10] = '\0'; + rose->dest_call = *asc2ax(callsign); + } + p += l + 2; + n += l + 2; + len -= l + 2; + break; + } + } while (*p != 0x00 && len > 0); + + return n; +} + +int rose_parse_facilities(struct sk_buff *skb, rose_cb *rose) +{ + int facilities_len, len; + unsigned char *p; + + memset(rose, 0x00, sizeof(rose_cb)); + + len = (((skb->data[3] >> 4) & 0x0F) + 1) / 2; + len += (((skb->data[3] >> 0) & 0x0F) + 1) / 2; + + p = skb->data + len + 4; + + facilities_len = *p++; + + if (facilities_len == 0) + return 0; + + while (facilities_len > 0) { + if (*p == 0x00) { + facilities_len--; + p++; + + switch (*p) { + case FAC_NATIONAL: /* National */ + len = rose_parse_national(p + 1, rose, facilities_len - 1); + facilities_len -= len + 1; + p += len + 1; + break; + + case FAC_CCITT: /* CCITT */ + len = rose_parse_ccitt(p + 1, rose, facilities_len - 1); + facilities_len -= len + 1; + p += len + 1; + break; + + default: + printk(KERN_DEBUG "rose_parse_facilities: unknown facilities family %02X\n", *p); + facilities_len--; + p++; + break; + } + } + } + + return 1; +} + +int rose_create_facilities(unsigned char *buffer, rose_cb *rose) +{ + unsigned char *p = buffer + 1; + char *callsign; + int len; + + /* National Facilities */ + if (rose->rand != 0 || rose->source_ndigis == 1 || rose->dest_ndigis == 1) { + *p++ = 0x00; + *p++ = FAC_NATIONAL; + + if (rose->rand != 0) { + *p++ = FAC_NATIONAL_RAND; + *p++ = (rose->rand >> 8) & 0xFF; + *p++ = (rose->rand >> 0) & 0xFF; + } + + if (rose->source_ndigis == 1) { + *p++ = FAC_NATIONAL_SRC_DIGI; + *p++ = AX25_ADDR_LEN; + memcpy(p, &rose->source_digi, AX25_ADDR_LEN); + p += AX25_ADDR_LEN; + } + + if (rose->dest_ndigis == 1) { + *p++ = FAC_NATIONAL_DEST_DIGI; + *p++ = AX25_ADDR_LEN; + memcpy(p, &rose->dest_digi, AX25_ADDR_LEN); + p += AX25_ADDR_LEN; + } + } + + *p++ = 0x00; + *p++ = 
FAC_CCITT; + + *p++ = FAC_CCITT_DEST_NSAP; + + callsign = ax2asc(&rose->dest_call); + + *p++ = strlen(callsign) + 10; + *p++ = (strlen(callsign) + 9) * 2; /* ??? */ + + *p++ = 0x47; *p++ = 0x00; *p++ = 0x11; + *p++ = ROSE_ADDR_LEN * 2; + memcpy(p, &rose->dest_addr, ROSE_ADDR_LEN); + p += ROSE_ADDR_LEN; + + memcpy(p, callsign, strlen(callsign)); + p += strlen(callsign); + + *p++ = FAC_CCITT_SRC_NSAP; + + callsign = ax2asc(&rose->source_call); + + *p++ = strlen(callsign) + 10; + *p++ = (strlen(callsign) + 9) * 2; /* ??? */ + + *p++ = 0x47; *p++ = 0x00; *p++ = 0x11; + *p++ = ROSE_ADDR_LEN * 2; + memcpy(p, &rose->source_addr, ROSE_ADDR_LEN); + p += ROSE_ADDR_LEN; + + memcpy(p, callsign, strlen(callsign)); + p += strlen(callsign); + + len = p - buffer; + buffer[0] = len - 1; + + return len; +} + +#endif diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c new file mode 100644 index 000000000..313756847 --- /dev/null +++ b/net/rose/rose_timer.c @@ -0,0 +1,153 @@ +/* + * Rose release 001 + * + * This is ALPHA test software. This code may break your machine, randomly fail to work with new + * releases, misbehave and/or generally screw up. It might even work. + * + * This code REQUIRES 2.1.0 or higher/ NET3.029 + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * History + * Rose 001 Jonathan(G4KLX) Cloned from nr_timer.c + */ + +#include <linux/config.h> +#if defined(CONFIG_ROSE) || defined(CONFIG_ROSE_MODULE) +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <net/ax25.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <asm/segment.h> +#include <asm/system.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <net/rose.h> + +static void rose_timer(unsigned long); + +/* + * Linux set/reset timer routines + */ +void rose_set_timer(struct sock *sk) +{ + unsigned long flags; + + save_flags(flags); + cli(); + del_timer(&sk->timer); + restore_flags(flags); + + sk->timer.next = sk->timer.prev = NULL; + sk->timer.data = (unsigned long)sk; + sk->timer.function = &rose_timer; + + sk->timer.expires = jiffies + 10; + add_timer(&sk->timer); +} + +static void rose_reset_timer(struct sock *sk) +{ + unsigned long flags; + + save_flags(flags); + cli(); + del_timer(&sk->timer); + restore_flags(flags); + + sk->timer.data = (unsigned long)sk; + sk->timer.function = &rose_timer; + sk->timer.expires = jiffies + 10; + add_timer(&sk->timer); +} + +/* + * Rose Timer + * + * This routine is called every 100ms. Decrement timer by this + * amount - if expired then process the event. + */ +static void rose_timer(unsigned long param) +{ + struct sock *sk = (struct sock *)param; + + switch (sk->protinfo.rose->state) { + case ROSE_STATE_0: + /* Magic here: If we listen() and a new link dies before it + is accepted() it isn't 'dead' so doesn't get removed. 
*/ + if (sk->destroy || (sk->state == TCP_LISTEN && sk->dead)) { + del_timer(&sk->timer); + rose_destroy_socket(sk); + return; + } + break; + + case ROSE_STATE_3: + /* + * Check for the state of the receive buffer. + */ + if (sk->rmem_alloc < (sk->rcvbuf / 2) && (sk->protinfo.rose->condition & OWN_RX_BUSY_CONDITION)) { + sk->protinfo.rose->condition &= ~OWN_RX_BUSY_CONDITION; + sk->protinfo.rose->vl = sk->protinfo.rose->vr; + rose_write_internal(sk, ROSE_RR); + break; + } + /* + * Check for frames to transmit. + */ + rose_kick(sk); + break; + + default: + break; + } + + if (sk->protinfo.rose->timer == 0 || --sk->protinfo.rose->timer > 0) { + rose_reset_timer(sk); + return; + } + + /* + * Timer has expired, it may have been T1, T2, or T3. We can tell + * by the socket state. + */ + switch (sk->protinfo.rose->state) { + case ROSE_STATE_1: /* T1 */ + case ROSE_STATE_4: /* T2 */ + rose_write_internal(sk, ROSE_CLEAR_REQUEST); + sk->protinfo.rose->state = ROSE_STATE_2; + sk->protinfo.rose->timer = sk->protinfo.rose->t3; + break; + + case ROSE_STATE_2: /* T3 */ + rose_clear_queues(sk); + sk->protinfo.rose->state = ROSE_STATE_0; + sk->state = TCP_CLOSE; + sk->err = ETIMEDOUT; + sk->shutdown |= SEND_SHUTDOWN; + if (!sk->dead) + sk->state_change(sk); + sk->dead = 1; + break; + } + + rose_set_timer(sk); +} + +#endif diff --git a/net/rose/sysctl_net_rose.c b/net/rose/sysctl_net_rose.c new file mode 100644 index 000000000..558702dbd --- /dev/null +++ b/net/rose/sysctl_net_rose.c @@ -0,0 +1,62 @@ +/* -*- linux-c -*- + * sysctl_net_rose.c: sysctl interface to net Rose subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/rose directory entry (empty =) ). 
[MS] + */ + +#include <linux/mm.h> +#include <linux/sysctl.h> +#include <net/ax25.h> +#include <net/rose.h> + +static int min_timer[] = {1 * PR_SLOWHZ}; +static int max_timer[] = {300 * PR_SLOWHZ}; +static int min_idle[] = {0 * PR_SLOWHZ}; +static int max_idle[] = {65535 * PR_SLOWHZ}; +static int min_route[] = {0}; +static int max_route[] = {0}; + +static struct ctl_table_header *rose_table_header; + +static ctl_table rose_table[] = { + {NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout", + &sysctl_rose_restart_request_timeout, sizeof(int), 0644, NULL, + &proc_dointvec_minmax, &sysctl_intvec, NULL, &min_timer, &max_timer}, + {NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout", + &sysctl_rose_call_request_timeout, sizeof(int), 0644, NULL, + &proc_dointvec_minmax, &sysctl_intvec, NULL, &min_timer, &max_timer}, + {NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout", + &sysctl_rose_reset_request_timeout, sizeof(int), 0644, NULL, + &proc_dointvec_minmax, &sysctl_intvec, NULL, &min_timer, &max_timer}, + {NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout", + &sysctl_rose_clear_request_timeout, sizeof(int), 0644, NULL, + &proc_dointvec_minmax, &sysctl_intvec, NULL, &min_timer, &max_timer}, + {NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout", + &sysctl_rose_no_activity_timeout, sizeof(int), 0644, NULL, + &proc_dointvec_minmax, &sysctl_intvec, NULL, &min_idle, &max_idle}, + {NET_ROSE_ROUTING_CONTROL, "routing_control", + &sysctl_rose_routing_control, sizeof(int), 0644, NULL, + &proc_dointvec_minmax, &sysctl_intvec, NULL, &min_route, &max_route}, + {0} +}; + +static ctl_table rose_dir_table[] = { + {NET_ROSE, "rose", NULL, 0, 0555, rose_table}, + {0} +}; + +static ctl_table rose_root_table[] = { + {CTL_NET, "net", NULL, 0, 0555, rose_dir_table}, + {0} +}; + +void rose_register_sysctl(void) +{ + rose_table_header = register_sysctl_table(rose_root_table, 1); +} + +void rose_unregister_sysctl(void) +{ + unregister_sysctl_table(rose_table_header); +} diff 
--git a/net/socket.c b/net/socket.c index c036b3dca..e96ec9d05 100644 --- a/net/socket.c +++ b/net/socket.c @@ -32,6 +32,9 @@ * Alan Cox : Made sock_alloc()/sock_release() public * for NetROM and future kernel nfsd type * stuff. + * Alan Cox : sendmsg/recvmsg basics. + * Tom Dyas : Export net symbols. + * Marcin Dalecki : Fixed problems with CONFIG_NET="n". * * * This program is free software; you can redistribute it and/or @@ -61,23 +64,34 @@ #include <linux/net.h> #include <linux/interrupt.h> #include <linux/netdevice.h> +#include <linux/proc_fs.h> +#include <linux/firewall.h> + +#if defined(CONFIG_KERNELD) && defined(CONFIG_NET) +#include <linux/kerneld.h> +#endif + +#include <net/netlink.h> #include <asm/system.h> -#include <asm/segment.h> +#include <asm/uaccess.h> + +#if defined(CONFIG_MODULES) && defined(CONFIG_NET) +extern void export_net_symbols(void); +#endif -static int sock_lseek(struct inode *inode, struct file *file, off_t offset, - int whence); -static int sock_read(struct inode *inode, struct file *file, char *buf, - int size); -static int sock_write(struct inode *inode, struct file *file, char *buf, - int size); +static long long sock_lseek(struct inode *inode, struct file *file, + long long offset, int whence); +static long sock_read(struct inode *inode, struct file *file, + char *buf, unsigned long size); +static long sock_write(struct inode *inode, struct file *file, + const char *buf, unsigned long size); static void sock_close(struct inode *inode, struct file *file); static int sock_select(struct inode *inode, struct file *file, int which, select_table *seltable); static int sock_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg); static int sock_fasync(struct inode *inode, struct file *filp, int on); - /* @@ -113,42 +127,43 @@ static int sockets_in_use = 0; * divide and look after the messy bits. 
*/ -#define MAX_SOCK_ADDR 128 /* 108 for Unix domain - 16 for IP, 16 for IPX, about 80 for AX.25 */ +#define MAX_SOCK_ADDR 128 /* 108 for Unix domain - + 16 for IP, 16 for IPX, + 24 for IPv6, + about 80 for AX.25 */ -static int move_addr_to_kernel(void *uaddr, int ulen, void *kaddr) +int move_addr_to_kernel(void *uaddr, int ulen, void *kaddr) { - int err; if(ulen<0||ulen>MAX_SOCK_ADDR) return -EINVAL; if(ulen==0) return 0; - if((err=verify_area(VERIFY_READ,uaddr,ulen))<0) - return err; - memcpy_fromfs(kaddr,uaddr,ulen); + if(copy_from_user(kaddr,uaddr,ulen)) + return -EFAULT; return 0; } -static int move_addr_to_user(void *kaddr, int klen, void *uaddr, int *ulen) +int move_addr_to_user(void *kaddr, int klen, void *uaddr, int *ulen) { int err; int len; - - if((err=verify_area(VERIFY_WRITE,ulen,sizeof(*ulen)))<0) + if((err=get_user(len, ulen))) return err; - len=get_fs_long(ulen); if(len>klen) len=klen; if(len<0 || len> MAX_SOCK_ADDR) return -EINVAL; if(len) { - if((err=verify_area(VERIFY_WRITE,uaddr,len))<0) - return err; - memcpy_tofs(uaddr,kaddr,len); + if(copy_to_user(uaddr,kaddr,len)) + return -EFAULT; } - put_fs_long(len,ulen); - return 0; + /* + * "fromlen shall refer to the value before truncation.." + * 1003.1g + */ + return put_user(klen, ulen); } /* @@ -158,36 +173,31 @@ static int move_addr_to_user(void *kaddr, int klen, void *uaddr, int *ulen) static int get_fd(struct inode *inode) { int fd; - struct file *file; /* * Find a file descriptor suitable for return to the user. 
*/ - file = get_empty_filp(); - if (!file) - return(-1); + fd = get_unused_fd(); + if (fd >= 0) { + struct file *file = get_empty_filp(); - for (fd = 0; fd < NR_OPEN; ++fd) - if (!current->files->fd[fd]) - break; - if (fd == NR_OPEN) - { - file->f_count = 0; - return(-1); - } + if (!file) { + put_unused_fd(fd); + return -ENFILE; + } - FD_CLR(fd, ¤t->files->close_on_exec); current->files->fd[fd] = file; - file->f_op = &socket_file_ops; - file->f_mode = 3; - file->f_flags = O_RDWR; - file->f_count = 1; - file->f_inode = inode; - if (inode) - inode->i_count++; - file->f_pos = 0; - return(fd); + file->f_op = &socket_file_ops; + file->f_mode = 3; + file->f_flags = O_RDWR; + file->f_count = 1; + file->f_inode = inode; + if (inode) + inode->i_count++; + file->f_pos = 0; + } + return fd; } @@ -197,7 +207,8 @@ static int get_fd(struct inode *inode) * The original socket implementation wasn't very clever, which is * why this exists at all.. */ -inline struct socket *socki_lookup(struct inode *inode) + +__inline struct socket *socki_lookup(struct inode *inode) { return &inode->u.socket_i; } @@ -206,7 +217,7 @@ inline struct socket *socki_lookup(struct inode *inode) * Go from a file number to its socket slot. */ -static inline struct socket *sockfd_lookup(int fd, struct file **pfile) +extern __inline struct socket *sockfd_lookup(int fd, struct file **pfile) { struct file *file; struct inode *inode; @@ -250,6 +261,7 @@ struct socket *sock_alloc(void) sock->conn = NULL; sock->iconn = NULL; sock->next = NULL; + sock->file = NULL; sock->wait = &inode->i_wait; sock->inode = inode; /* "backlink": we could use pointer arithmetic instead */ sock->fasync_list = NULL; @@ -297,6 +309,7 @@ void sock_release(struct socket *sock) if (peersock) sock_release_peer(peersock); --sockets_in_use; /* Bookkeeping.. */ + sock->file=NULL; iput(SOCK_INODE(sock)); } @@ -304,9 +317,10 @@ void sock_release(struct socket *sock) * Sockets are not seekable. 
*/ -static int sock_lseek(struct inode *inode, struct file *file, off_t offset, int whence) +static long long sock_lseek(struct inode *inode, struct file *file, + long long offset, int whence) { - return(-ESPIPE); + return -ESPIPE; } /* @@ -314,10 +328,13 @@ static int sock_lseek(struct inode *inode, struct file *file, off_t offset, int * area ubuf...ubuf+size-1 is writable before asking the protocol. */ -static int sock_read(struct inode *inode, struct file *file, char *ubuf, int size) +static long sock_read(struct inode *inode, struct file *file, + char *ubuf, unsigned long size) { struct socket *sock; int err; + struct iovec iov; + struct msghdr msg; sock = socki_lookup(inode); if (sock->flags & SO_ACCEPTCON) @@ -329,7 +346,14 @@ static int sock_read(struct inode *inode, struct file *file, char *ubuf, int siz return 0; if ((err=verify_area(VERIFY_WRITE,ubuf,size))<0) return err; - return(sock->ops->read(sock, ubuf, size, (file->f_flags & O_NONBLOCK))); + msg.msg_name=NULL; + msg.msg_iov=&iov; + msg.msg_iovlen=1; + msg.msg_control=NULL; + iov.iov_base=ubuf; + iov.iov_len=size; + + return(sock->ops->recvmsg(sock, &msg, size,(file->f_flags & O_NONBLOCK), 0,&msg.msg_namelen)); } /* @@ -337,10 +361,13 @@ static int sock_read(struct inode *inode, struct file *file, char *ubuf, int siz * readable by the user process. 
*/ -static int sock_write(struct inode *inode, struct file *file, char *ubuf, int size) +static long sock_write(struct inode *inode, struct file *file, + const char *ubuf, unsigned long size) { struct socket *sock; int err; + struct msghdr msg; + struct iovec iov; sock = socki_lookup(inode); @@ -351,15 +378,23 @@ static int sock_write(struct inode *inode, struct file *file, char *ubuf, int si return -EINVAL; if(size==0) /* Match SYS5 behaviour */ return 0; - + if ((err=verify_area(VERIFY_READ,ubuf,size))<0) return err; - return(sock->ops->write(sock, ubuf, size,(file->f_flags & O_NONBLOCK))); + + msg.msg_name=NULL; + msg.msg_iov=&iov; + msg.msg_iovlen=1; + msg.msg_control=NULL; + iov.iov_base=(void *)ubuf; + iov.iov_len=size; + + return(sock->ops->sendmsg(sock, &msg, size,(file->f_flags & O_NONBLOCK),0)); } /* * With an ioctl arg may well be a user mode pointer, but we don't know what to do - * with it - thats up to the protocol still. + * with it - that's up to the protocol still. */ int sock_ioctl(struct inode *inode, struct file *file, unsigned int cmd, @@ -476,104 +511,51 @@ int sock_wake_async(struct socket *sock, int how) return 0; } - + /* - * Wait for a connection. + * Perform the socket system call. we locate the appropriate + * family, then create a fresh socket. */ -int sock_awaitconn(struct socket *mysock, struct socket *servsock, int flags) +static int find_protocol_family(int family) { - struct socket *last; - - /* - * We must be listening - */ - if (!(servsock->flags & SO_ACCEPTCON)) - { - return(-EINVAL); - } - - /* - * Put ourselves on the server's incomplete connection queue. - */ - - mysock->next = NULL; - cli(); - if (!(last = servsock->iconn)) - servsock->iconn = mysock; - else + register int i; + for (i = 0; i < NPROTO; i++) { - while (last->next) - last = last->next; - last->next = mysock; - } - mysock->state = SS_CONNECTING; - mysock->conn = servsock; - sti(); - - /* - * Wake up server, then await connection. 
server will set state to - * SS_CONNECTED if we're connected. - */ - wake_up_interruptible(servsock->wait); - sock_wake_async(servsock, 0); - - if (mysock->state != SS_CONNECTED) - { - if (flags & O_NONBLOCK) - return -EINPROGRESS; - - interruptible_sleep_on(mysock->wait); - if (mysock->state != SS_CONNECTED && - mysock->state != SS_DISCONNECTING) - { - /* - * if we're not connected we could have been - * 1) interrupted, so we need to remove ourselves - * from the server list - * 2) rejected (mysock->conn == NULL), and have - * already been removed from the list - */ - if (mysock->conn == servsock) - { - cli(); - if ((last = servsock->iconn) == mysock) - servsock->iconn = mysock->next; - else - { - while (last->next != mysock) - last = last->next; - last->next = mysock->next; - } - sti(); - } - return(mysock->conn ? -EINTR : -EACCES); - } + if (pops[i] == NULL) + continue; + if (pops[i]->family == family) + return i; } - return(0); + return -1; } - -/* - * Perform the socket system call. we locate the appropriate - * family, then create a fresh socket. - */ - -static int sock_socket(int family, int type, int protocol) +asmlinkage int sys_socket(int family, int type, int protocol) { int i, fd; struct socket *sock; struct proto_ops *ops; /* Locate the correct protocol family. */ - for (i = 0; i < NPROTO; ++i) + i = find_protocol_family(family); + +#if defined(CONFIG_KERNELD) && defined(CONFIG_NET) + /* Attempt to load a protocol module if the find failed. + * + * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user + * requested real, full-featured networking support upon configuration. + * Otherwise module support will break! 
+ */ + if (i < 0) { - if (pops[i] == NULL) continue; - if (pops[i]->family == family) - break; + char module_name[30]; + sprintf(module_name,"net-pf-%d",family); + request_module(module_name); + i = find_protocol_family(family); } +#endif - if (i == NPROTO) + if (i < 0) { return -EINVAL; } @@ -599,7 +581,7 @@ static int sock_socket(int family, int type, int protocol) if (!(sock = sock_alloc())) { - printk("NET: sock_socket: no more sockets\n"); + printk(KERN_WARNING "socket: no more sockets\n"); return(-ENOSR); /* Was: EAGAIN, but we are out of system resources! */ } @@ -618,6 +600,8 @@ static int sock_socket(int family, int type, int protocol) return(-EINVAL); } + sock->file=current->files->fd[fd]; + return(fd); } @@ -625,7 +609,7 @@ static int sock_socket(int family, int type, int protocol) * Create a pair of connected sockets. */ -static int sock_socketpair(int family, int type, int protocol, unsigned long usockvec[2]) +asmlinkage int sys_socketpair(int family, int type, int protocol, int usockvec[2]) { int fd1, fd2, i; struct socket *sock1, *sock2; @@ -636,7 +620,7 @@ static int sock_socketpair(int family, int type, int protocol, unsigned long uso * supports the socketpair call. */ - if ((fd1 = sock_socket(family, type, protocol)) < 0) + if ((fd1 = sys_socket(family, type, protocol)) < 0) return(fd1); sock1 = sockfd_lookup(fd1, NULL); if (!sock1->ops->socketpair) @@ -649,7 +633,7 @@ static int sock_socketpair(int family, int type, int protocol, unsigned long uso * Now grab another socket and try to connect the two together. 
*/ - if ((fd2 = sock_socket(family, type, protocol)) < 0) + if ((fd2 = sys_socket(family, type, protocol)) < 0) { sys_close(fd1); return(-EINVAL); @@ -668,17 +652,14 @@ static int sock_socketpair(int family, int type, int protocol, unsigned long uso sock1->state = SS_CONNECTED; sock2->state = SS_CONNECTED; - er=verify_area(VERIFY_WRITE, usockvec, 2 * sizeof(int)); - if(er) - { + er = put_user(fd1, &usockvec[0]); + if (!er) + er = put_user(fd2, &usockvec[1]); + if (er) { sys_close(fd1); sys_close(fd2); - return er; } - put_fs_long(fd1, &usockvec[0]); - put_fs_long(fd2, &usockvec[1]); - - return(0); + return er; } @@ -690,7 +671,7 @@ static int sock_socketpair(int family, int type, int protocol, unsigned long uso * the protocol layer (having also checked the address is ok). */ -static int sock_bind(int fd, struct sockaddr *umyaddr, int addrlen) +asmlinkage int sys_bind(int fd, struct sockaddr *umyaddr, int addrlen) { struct socket *sock; int i; @@ -720,24 +701,26 @@ static int sock_bind(int fd, struct sockaddr *umyaddr, int addrlen) * ready for listening. */ -static int sock_listen(int fd, int backlog) +asmlinkage int sys_listen(int fd, int backlog) { struct socket *sock; - + int err=-EOPNOTSUPP; + if (fd < 0 || fd >= NR_OPEN || current->files->fd[fd] == NULL) return(-EBADF); if (!(sock = sockfd_lookup(fd, NULL))) return(-ENOTSOCK); if (sock->state != SS_UNCONNECTED) - { return(-EINVAL); - } if (sock->ops && sock->ops->listen) - sock->ops->listen(sock, backlog); - sock->flags |= SO_ACCEPTCON; - return(0); + { + err=sock->ops->listen(sock, backlog); + if(!err) + sock->flags |= SO_ACCEPTCON; + } + return(err); } @@ -745,11 +728,15 @@ static int sock_listen(int fd, int backlog) * For accept, we attempt to create a new socket, set up the link * with the client, wake up the client, then return the new * connected fd. We collect the address of the connector in kernel - * space and move it to user at the very end. 
This is buggy because + * space and move it to user at the very end. This is unclean because * we open the socket then return an error. + * + * 1003.1g addcs the ability to recvmsg() to query connection pending + * status to recvmsg. We need to add that support in a way thats + * clean when we restucture accept also. */ -static int sock_accept(int fd, struct sockaddr *upeer_sockaddr, int *upeer_addrlen) +asmlinkage int sys_accept(int fd, struct sockaddr *upeer_sockaddr, int *upeer_addrlen) { struct file *file; struct socket *sock, *newsock; @@ -772,7 +759,7 @@ static int sock_accept(int fd, struct sockaddr *upeer_sockaddr, int *upeer_addrl if (!(newsock = sock_alloc())) { - printk("NET: sock_accept: no more sockets\n"); + printk(KERN_WARNING "accept: no more sockets\n"); return(-ENOSR); /* Was: EAGAIN, but we are out of system resources! */ } @@ -796,7 +783,8 @@ static int sock_accept(int fd, struct sockaddr *upeer_sockaddr, int *upeer_addrl sock_release(newsock); return(-EINVAL); } - + newsock->file=current->files->fd[fd]; + if (upeer_sockaddr) { newsock->ops->getname(newsock, (struct sockaddr *)address, &len, 1); @@ -809,9 +797,16 @@ static int sock_accept(int fd, struct sockaddr *upeer_sockaddr, int *upeer_addrl /* * Attempt to connect to a socket with the server address. The address * is in user space so we verify it is OK and move it to kernel space. + * + * For 1003.1g we need to add clean support for a bind to AF_UNSPEC to + * break bindings + * + * NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and + * other SEQPACKET protocols that take time to connect() as it doesn't + * include the -EINPROGRESS status for such sockets. */ -static int sock_connect(int fd, struct sockaddr *uservaddr, int addrlen) +asmlinkage int sys_connect(int fd, struct sockaddr *uservaddr, int addrlen) { struct socket *sock; struct file *file; @@ -862,7 +857,7 @@ static int sock_connect(int fd, struct sockaddr *uservaddr, int addrlen) * name to user space. 
*/ -static int sock_getsockname(int fd, struct sockaddr *usockaddr, int *usockaddr_len) +asmlinkage int sys_getsockname(int fd, struct sockaddr *usockaddr, int *usockaddr_len) { struct socket *sock; char address[MAX_SOCK_ADDR]; @@ -887,7 +882,7 @@ static int sock_getsockname(int fd, struct sockaddr *usockaddr, int *usockaddr_l * name to user space. */ -static int sock_getpeername(int fd, struct sockaddr *usockaddr, int *usockaddr_len) +asmlinkage int sys_getpeername(int fd, struct sockaddr *usockaddr, int *usockaddr_len) { struct socket *sock; char address[MAX_SOCK_ADDR]; @@ -912,11 +907,13 @@ static int sock_getpeername(int fd, struct sockaddr *usockaddr, int *usockaddr_l * in user space. We check it can be read. */ -static int sock_send(int fd, void * buff, int len, unsigned flags) +asmlinkage int sys_send(int fd, void * buff, size_t len, unsigned flags) { struct socket *sock; struct file *file; int err; + struct msghdr msg; + struct iovec iov; if (fd < 0 || fd >= NR_OPEN || ((file = current->files->fd[fd]) == NULL)) return(-EBADF); @@ -928,7 +925,14 @@ static int sock_send(int fd, void * buff, int len, unsigned flags) err=verify_area(VERIFY_READ, buff, len); if(err) return err; - return(sock->ops->send(sock, buff, len, (file->f_flags & O_NONBLOCK), flags)); + + iov.iov_base=buff; + iov.iov_len=len; + msg.msg_name=NULL; + msg.msg_iov=&iov; + msg.msg_iovlen=1; + msg.msg_control=NULL; + return(sock->ops->sendmsg(sock, &msg, len, (file->f_flags & O_NONBLOCK), flags)); } /* @@ -937,13 +941,15 @@ static int sock_send(int fd, void * buff, int len, unsigned flags) * the protocol. 
*/ -static int sock_sendto(int fd, void * buff, int len, unsigned flags, +asmlinkage int sys_sendto(int fd, void * buff, size_t len, unsigned flags, struct sockaddr *addr, int addr_len) { struct socket *sock; struct file *file; char address[MAX_SOCK_ADDR]; int err; + struct msghdr msg; + struct iovec iov; if (fd < 0 || fd >= NR_OPEN || ((file = current->files->fd[fd]) == NULL)) return(-EBADF); @@ -955,25 +961,35 @@ static int sock_sendto(int fd, void * buff, int len, unsigned flags, err=verify_area(VERIFY_READ,buff,len); if(err) return err; - - if((err=move_addr_to_kernel(addr,addr_len,address))<0) - return err; - return(sock->ops->sendto(sock, buff, len, (file->f_flags & O_NONBLOCK), - flags, (struct sockaddr *)address, addr_len)); + iov.iov_base=buff; + iov.iov_len=len; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_iov=&iov; + msg.msg_iovlen=1; + msg.msg_control=NULL; + if (addr && addr_len) { + err=move_addr_to_kernel(addr,addr_len,address); + if (err < 0) + return err; + msg.msg_name=address; + msg.msg_namelen=addr_len; + } + + return(sock->ops->sendmsg(sock, &msg, len, (file->f_flags & O_NONBLOCK), + flags)); } /* - * Receive a datagram from a socket. This isn't really right. The BSD manual - * pages explicitly state that recv is recvfrom with a NULL to argument. The - * Linux stack gets the right results for the wrong reason and this need to - * be tidied in the inet layer and removed from here. - * We check the buffer is writable and valid. + * Receive a datagram from a socket. 
Call the protocol recvmsg method */ -static int sock_recv(int fd, void * buff, int len, unsigned flags) +asmlinkage int sys_recv(int fd, void * ubuf, size_t size, unsigned flags) { + struct iovec iov; + struct msghdr msg; struct socket *sock; struct file *file; int err; @@ -984,15 +1000,22 @@ static int sock_recv(int fd, void * buff, int len, unsigned flags) if (!(sock = sockfd_lookup(fd, NULL))) return(-ENOTSOCK); - if(len<0) + if(size<0) return -EINVAL; - if(len==0) + if(size==0) return 0; - err=verify_area(VERIFY_WRITE, buff, len); + err=verify_area(VERIFY_WRITE, ubuf, size); if(err) return err; - - return(sock->ops->recv(sock, buff, len,(file->f_flags & O_NONBLOCK), flags)); + + msg.msg_name=NULL; + msg.msg_iov=&iov; + msg.msg_iovlen=1; + msg.msg_control=NULL; + iov.iov_base=ubuf; + iov.iov_len=size; + + return(sock->ops->recvmsg(sock, &msg, size,(file->f_flags & O_NONBLOCK), flags,&msg.msg_namelen)); } /* @@ -1001,11 +1024,13 @@ static int sock_recv(int fd, void * buff, int len, unsigned flags) * sender address from kernel to user space. 
*/ -static int sock_recvfrom(int fd, void * buff, int len, unsigned flags, +asmlinkage int sys_recvfrom(int fd, void * ubuf, size_t size, unsigned flags, struct sockaddr *addr, int *addr_len) { struct socket *sock; struct file *file; + struct iovec iov; + struct msghdr msg; char address[MAX_SOCK_ADDR]; int err; int alen; @@ -1013,24 +1038,31 @@ static int sock_recvfrom(int fd, void * buff, int len, unsigned flags, return(-EBADF); if (!(sock = sockfd_lookup(fd, NULL))) return(-ENOTSOCK); - if(len<0) + if(size<0) return -EINVAL; - if(len==0) + if(size==0) return 0; - err=verify_area(VERIFY_WRITE,buff,len); + err=verify_area(VERIFY_WRITE,ubuf,size); if(err) return err; - len=sock->ops->recvfrom(sock, buff, len, (file->f_flags & O_NONBLOCK), - flags, (struct sockaddr *)address, &alen); + msg.msg_control=NULL; + msg.msg_iovlen=1; + msg.msg_iov=&iov; + iov.iov_len=size; + iov.iov_base=ubuf; + msg.msg_name=address; + msg.msg_namelen=MAX_SOCK_ADDR; + size=sock->ops->recvmsg(sock, &msg, size, (file->f_flags & O_NONBLOCK), + flags, &alen); - if(len<0) - return len; + if(size<0) + return size; if(addr!=NULL && (err=move_addr_to_user(address,alen, addr, addr_len))<0) return err; - return len; + return size; } /* @@ -1038,7 +1070,7 @@ static int sock_recvfrom(int fd, void * buff, int len, unsigned flags, * to pass the user mode parameter for the protocols to sort out. */ -static int sock_setsockopt(int fd, int level, int optname, char *optval, int optlen) +asmlinkage int sys_setsockopt(int fd, int level, int optname, char *optval, int optlen) { struct socket *sock; struct file *file; @@ -1056,7 +1088,7 @@ static int sock_setsockopt(int fd, int level, int optname, char *optval, int opt * to pass a user mode parameter for the protocols to sort out. 
*/ -static int sock_getsockopt(int fd, int level, int optname, char *optval, int *optlen) +asmlinkage int sys_getsockopt(int fd, int level, int optname, char *optval, int *optlen) { struct socket *sock; struct file *file; @@ -1076,7 +1108,7 @@ static int sock_getsockopt(int fd, int level, int optname, char *optval, int *op * Shutdown a socket. */ -static int sock_shutdown(int fd, int how) +asmlinkage int sys_shutdown(int fd, int how) { struct socket *sock; struct file *file; @@ -1089,6 +1121,149 @@ static int sock_shutdown(int fd, int how) return(sock->ops->shutdown(sock, how)); } +/* + * BSD sendmsg interface + */ + +asmlinkage int sys_sendmsg(int fd, struct msghdr *msg, unsigned int flags) +{ + struct socket *sock; + struct file *file; + char address[MAX_SOCK_ADDR]; + struct iovec iov[UIO_MAXIOV]; + struct msghdr msg_sys; + void * krn_msg_ctl = NULL; + int err; + int total_len; + + if (fd < 0 || fd >= NR_OPEN || ((file = current->files->fd[fd]) == NULL)) + return(-EBADF); + if (!(sock = sockfd_lookup(fd, NULL))) + return(-ENOTSOCK); + + if(sock->ops->sendmsg==NULL) + return -EOPNOTSUPP; + + if (copy_from_user(&msg_sys,msg,sizeof(struct msghdr))) + return -EFAULT; + + /* do not move before msg_sys is valid */ + if(msg_sys.msg_iovlen>UIO_MAXIOV) + return -EINVAL; + + /* This will also move the address data into kernel space */ + err = verify_iovec(&msg_sys, iov, address, VERIFY_READ); + if (err < 0) + return err; + total_len=err; + + if (msg_sys.msg_control) + { + krn_msg_ctl = kmalloc(msg_sys.msg_controllen, GFP_KERNEL); + err = copy_from_user(krn_msg_ctl, msg_sys.msg_control, + msg_sys.msg_controllen); + if (err) + return -EFAULT; + msg_sys.msg_control = krn_msg_ctl; + } + + err = sock->ops->sendmsg(sock, &msg_sys, total_len, + (file->f_flags&O_NONBLOCK), flags); + + if (msg_sys.msg_control) + { + kfree(krn_msg_ctl); + } + + return err; +} + +/* + * BSD recvmsg interface + */ + +asmlinkage int sys_recvmsg(int fd, struct msghdr *msg, unsigned int flags) +{ + 
struct socket *sock; + struct file *file; + struct iovec iov[UIO_MAXIOV]; + struct msghdr msg_sys; + void *usr_msg_ctl = NULL; + void *krn_msg_ctl = NULL; + int err; + int total_len; + int len; + + /* kernel mode address */ + char addr[MAX_SOCK_ADDR]; + int addr_len; + + /* user mode address pointers */ + struct sockaddr *uaddr; + int *uaddr_len; + + if (fd < 0 || fd >= NR_OPEN || ((file = current->files->fd[fd]) == NULL)) + return(-EBADF); + if (!(sock = sockfd_lookup(fd, NULL))) + return(-ENOTSOCK); + + if (copy_from_user(&msg_sys,msg,sizeof(struct msghdr))) + return -EFAULT; + + if(msg_sys.msg_iovlen>UIO_MAXIOV) + return -EINVAL; + + /* + * save the user-mode address (verify_iovec will change the + * kernel msghdr to use the kernel address space) + */ + uaddr = msg_sys.msg_name; + uaddr_len = &msg->msg_namelen; + err=verify_iovec(&msg_sys,iov,addr, VERIFY_WRITE); + if(err<0) + return err; + + total_len=err; + + + + if (msg_sys.msg_control) + { + usr_msg_ctl = msg_sys.msg_control; + krn_msg_ctl = kmalloc(msg_sys.msg_controllen, GFP_KERNEL); + err = copy_from_user(krn_msg_ctl, usr_msg_ctl, + msg_sys.msg_controllen); + if (err) + return -EFAULT; + msg_sys.msg_control = krn_msg_ctl; + } + + if(sock->ops->recvmsg==NULL) + return -EOPNOTSUPP; + len=sock->ops->recvmsg(sock, &msg_sys, total_len, (file->f_flags&O_NONBLOCK), flags, &addr_len); + if(len<0) + return len; + + if (uaddr != NULL) + { + err = move_addr_to_user(addr, addr_len, uaddr, uaddr_len); + } + + if (msg_sys.msg_control) + { + if (!err) + { + err = copy_to_user(usr_msg_ctl, krn_msg_ctl, + msg_sys.msg_controllen); + if (err) + err = -EFAULT; + } + kfree(krn_msg_ctl); + } + + return err ? err : len; +} + /* * Perform a file control on a socket file descriptor. 
@@ -1119,85 +1294,91 @@ int sock_fcntl(struct file *filp, unsigned int cmd, unsigned long arg) asmlinkage int sys_socketcall(int call, unsigned long *args) { - int er; - unsigned char nargs[16]={0,3,3,3,2,3,3,3, - 4,4,4,6,6,2,5,5}; - + unsigned char nargs[18]={0,3,3,3,2,3,3,3, + 4,4,4,6,6,2,5,5,3,3}; + unsigned long a[6]; unsigned long a0,a1; - if(call<1||call>SYS_GETSOCKOPT) + if(call<1||call>SYS_RECVMSG) return -EINVAL; - er=verify_area(VERIFY_READ, args, nargs[call] * sizeof(unsigned long)); - if(er) - return er; + if ((copy_from_user(a, args, nargs[call] * sizeof(unsigned long)))) + return -EFAULT; - a0=get_fs_long(args); - a1=get_fs_long(args+1); + a0=a[0]; + a1=a[1]; switch(call) { case SYS_SOCKET: - return(sock_socket(a0,a1,get_fs_long(args+2))); + return(sys_socket(a0,a1,a[2])); case SYS_BIND: - return(sock_bind(a0,(struct sockaddr *)a1, - get_fs_long(args+2))); + return(sys_bind(a0,(struct sockaddr *)a1, + a[2])); case SYS_CONNECT: - return(sock_connect(a0, (struct sockaddr *)a1, - get_fs_long(args+2))); + return(sys_connect(a0, (struct sockaddr *)a1, + a[2])); case SYS_LISTEN: - return(sock_listen(a0,a1)); + return(sys_listen(a0,a1)); case SYS_ACCEPT: - return(sock_accept(a0,(struct sockaddr *)a1, - (int *)get_fs_long(args+2))); + return(sys_accept(a0,(struct sockaddr *)a1, + (int *)a[2])); case SYS_GETSOCKNAME: - return(sock_getsockname(a0,(struct sockaddr *)a1, - (int *)get_fs_long(args+2))); + return(sys_getsockname(a0,(struct sockaddr *)a1, + (int *)a[2])); case SYS_GETPEERNAME: - return(sock_getpeername(a0, (struct sockaddr *)a1, - (int *)get_fs_long(args+2))); + return(sys_getpeername(a0, (struct sockaddr *)a1, + (int *)a[2])); case SYS_SOCKETPAIR: - return(sock_socketpair(a0,a1, - get_fs_long(args+2), - (unsigned long *)get_fs_long(args+3))); + return(sys_socketpair(a0,a1, + a[2], + (int *)a[3])); case SYS_SEND: - return(sock_send(a0, + return(sys_send(a0, (void *)a1, - get_fs_long(args+2), - get_fs_long(args+3))); + a[2], + a[3])); case 
SYS_SENDTO: - return(sock_sendto(a0,(void *)a1, - get_fs_long(args+2), - get_fs_long(args+3), - (struct sockaddr *)get_fs_long(args+4), - get_fs_long(args+5))); + return(sys_sendto(a0,(void *)a1, + a[2], + a[3], + (struct sockaddr *)a[4], + a[5])); case SYS_RECV: - return(sock_recv(a0, + return(sys_recv(a0, (void *)a1, - get_fs_long(args+2), - get_fs_long(args+3))); + a[2], + a[3])); case SYS_RECVFROM: - return(sock_recvfrom(a0, + return(sys_recvfrom(a0, (void *)a1, - get_fs_long(args+2), - get_fs_long(args+3), - (struct sockaddr *)get_fs_long(args+4), - (int *)get_fs_long(args+5))); + a[2], + a[3], + (struct sockaddr *)a[4], + (int *)a[5])); case SYS_SHUTDOWN: - return(sock_shutdown(a0,a1)); + return(sys_shutdown(a0,a1)); case SYS_SETSOCKOPT: - return(sock_setsockopt(a0, + return(sys_setsockopt(a0, a1, - get_fs_long(args+2), - (char *)get_fs_long(args+3), - get_fs_long(args+4))); + a[2], + (char *)a[3], + a[4])); case SYS_GETSOCKOPT: - return(sock_getsockopt(a0, + return(sys_getsockopt(a0, a1, - get_fs_long(args+2), - (char *)get_fs_long(args+3), - (int *)get_fs_long(args+4))); + a[2], + (char *)a[3], + (int *)a[4])); + case SYS_SENDMSG: + return sys_sendmsg(a0, + (struct msghdr *) a1, + a[2]); + case SYS_RECVMSG: + return sys_recvmsg(a0, + (struct msghdr *) a1, + a[2]); } return -EINVAL; /* to keep gcc happy */ } @@ -1241,7 +1422,7 @@ int sock_unregister(int family) { if (pops[i] == NULL) continue; - if(pops[i]->family == family) + if (pops[i]->family == family) { pops[i]=NULL; sti(); @@ -1272,34 +1453,50 @@ void sock_init(void) { int i; - printk("Swansea University Computer Society NET3.029 Snap #6 for Linux 1.3.0\n"); + printk(KERN_INFO "Swansea University Computer Society NET3.037 for Linux 2.1\n"); /* * Initialize all address (protocol) families. */ for (i = 0; i < NPROTO; ++i) pops[i] = NULL; + + /* + * The netlink device handler may be needed early. + */ +#ifdef CONFIG_NETLINK + init_netlink(); +#endif /* - * Initialize the protocols module. 
+ * Attach the routing/device information port. */ - proto_init(); +#if defined(CONFIG_RTNETLINK) + netlink_attach(NETLINK_ROUTE, netlink_donothing); +#endif -#ifdef CONFIG_NET - /* - * Initialize the DEV module. + /* + * Attach the firewall module if configured */ + +#ifdef CONFIG_FIREWALL + fwchain_init(); +#endif + + /* + * Initialize the protocols module. + */ + + proto_init(); - dev_init(); - /* - * And the bottom half handler + * Export networking symbols to the world. */ - bh_base[NET_BH].routine= net_bh; - enable_bh(NET_BH); -#endif +#if defined(CONFIG_MODULES) && defined(CONFIG_NET) + export_net_symbols(); +#endif } int socket_get_info(char *buffer, char **start, off_t offset, int length) diff --git a/net/sysctl_net.c b/net/sysctl_net.c new file mode 100644 index 000000000..8bdb4f224 --- /dev/null +++ b/net/sysctl_net.c @@ -0,0 +1,68 @@ +/* -*- linux-c -*- + * sysctl_net.c: sysctl interface to net subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net directories for each protocol family. [MS] + * + * $Log: sysctl_net.c,v $ + * Revision 1.2 1996/05/08 20:24:40 shaver + * Added bits for NET_BRIDGE and the NET_IPV4_ARP stuff and + * NET_IPV4_IP_FORWARD. 
+ * + * + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/sysctl.h> + +#ifdef CONFIG_INET +extern ctl_table ipv4_table[]; +#endif + +#ifdef CONFIG_IPX +extern ctl_table ipx_table[]; +#endif + +#ifdef CONFIG_ATALK +extern ctl_table atalk_table[]; +#endif + +extern ctl_table core_table[], unix_table[]; + +#ifdef CONFIG_NET +extern ctl_table ether_table[], e802_table[]; +#endif + +#ifdef CONFIG_BRIDGE +extern ctl_table bridge_table[]; +#endif + +#ifdef CONFIG_IPV6 +extern ctl_table ipv6_table[]; +#endif + +ctl_table net_table[] = { + {NET_CORE, "core", NULL, 0, 0555, core_table}, + {NET_UNIX, "unix", NULL, 0, 0555, unix_table}, +#ifdef CONFIG_NET + {NET_802, "802", NULL, 0, 0555, e802_table}, + {NET_ETHER, "ethernet", NULL, 0, 0555, ether_table}, +#endif +#ifdef CONFIG_INET + {NET_IPV4, "ipv4", NULL, 0, 0555, ipv4_table}, +#endif +#ifdef CONFIG_IPX + {NET_IPX, "ipx", NULL, 0, 0555, ipx_table}, +#endif +#ifdef CONFIG_ATALK + {NET_ATALK, "appletalk", NULL, 0, 0555, atalk_table}, +#endif +#ifdef CONFIG_BRIDGE + {NET_BRIDGE, "bridge", NULL, 0, 0555, bridge_table}, +#endif +#ifdef CONFIG_IPV6 + {NET_IPV6, "ipv6", NULL, 0, 0555, ipv6_table}, +#endif + {0} +}; diff --git a/net/unix/Makefile b/net/unix/Makefile index e4fb629bd..9116cc054 100644 --- a/net/unix/Makefile +++ b/net/unix/Makefile @@ -1,5 +1,5 @@ # -# Makefile for the UNIX Protocol Family. +# Makefile for the Linux TCP/IP (INET) layer. # # Note! Dependencies are done automagically by 'make dep', which also # removes any old dependencies. DON'T put your own dependencies here @@ -7,27 +7,10 @@ # # Note 2! The CFLAGS definition is now in the main makefile... -.c.o: - $(CC) $(CFLAGS) -c $< -.s.o: - $(AS) -o $*.o $< -.c.s: - $(CC) $(CFLAGS) -S $< +O_TARGET := unix.o +O_OBJS := af_unix.o garbage.o sysctl_net_unix.o -OBJS = sock.o proc.o - -unix.o: $(OBJS) - $(LD) -r -o unix.o $(OBJS) - -dep: - $(CPP) -M *.c > .depend +include $(TOPDIR)/Rules.make tar: - tar -cvf /dev/f1 . 
- -# -# include a dependency file if one exists -# -ifeq (.depend,$(wildcard .depend)) -include .depend -endif + tar -cvf /dev/f1 . diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c new file mode 100644 index 000000000..aeb752d96 --- /dev/null +++ b/net/unix/af_unix.c @@ -0,0 +1,1331 @@ +/* + * NET3: Implementation of BSD Unix domain sockets. + * + * Authors: Alan Cox, <alan@cymru.net> + * + * Currently this contains all but the file descriptor passing code. + * Before that goes in the odd bugs in the iovec handlers need + * fixing, and this bit testing. BSD fd passing is not a trivial part + * of the exercise it turns out. Anyone like writing garbage collectors. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Linus Torvalds : Assorted bug cures. + * Niibe Yutaka : async I/O support. + * Carsten Paeth : PF_UNIX check, address fixes. + * Alan Cox : Limit size of allocated blocks. + * Alan Cox : Fixed the stupid socketpair bug. + * Alan Cox : BSD compatibility fine tuning. + * Alan Cox : Fixed a bug in connect when interrupted. + * Alan Cox : Sorted out a proper draft version of + * file descriptor passing hacked up from + * Mike Shaver's work. + * Marty Leisner : Fixes to fd passing + * Nick Nevin : recvmsg bugfix. + * Alan Cox : Started proper garbage collector + * Heiko EiBfeldt : Missing verify_area check + * Alan Cox : Started POSIXisms + * + * Known differences from reference BSD that was tested: + * + * [TO FIX] + * ECONNREFUSED is not returned from one end of a connected() socket to the + * other the moment one end closes. + * fstat() doesn't return st_dev=NODEV, and give the blksize as high water mark + * and a fake inode identifier (nor the BSD first socket fstat twice bug). 
+ * [NOT TO FIX] + * accept() returns a path name even if the connecting socket has closed + * in the meantime (BSD loses the path and gives up). + * accept() returns 0 length path for an unbound connector. BSD returns 16 + * and a null first byte in the path (but not for gethost/peername - BSD bug ??) + * socketpair(...SOCK_RAW..) doesn't panic the kernel. + * BSD af_unix apparently has connect forgetting to block properly. + * (need to check this with the POSIX spec in detail) + */ + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/socket.h> +#include <linux/un.h> +#include <linux/fcntl.h> +#include <linux/termios.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/in.h> +#include <linux/fs.h> +#include <linux/malloc.h> +#include <asm/uaccess.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <net/af_unix.h> +#include <linux/proc_fs.h> + +unix_socket *unix_socket_list=NULL; + +#define min(a,b) (((a)<(b))?(a):(b)) + +/* + * Make sure the unix name is null-terminated. + */ + +static inline void unix_mkname(struct sockaddr_un * sunaddr, unsigned long len) +{ + if (len >= sizeof(*sunaddr)) + len = sizeof(*sunaddr)-1; + ((char *)sunaddr)[len]=0; +} + +/* + * Note: Sockets may not be removed _during_ an interrupt or net_bh + * handler using this technique. They can be added although we do not + * use this facility. 
+ */ + +static void unix_remove_socket(unix_socket *sk) +{ + unix_socket **s; + + cli(); + s=&unix_socket_list; + + while(*s!=NULL) + { + if(*s==sk) + { + *s=sk->next; + sti(); + return; + } + s=&((*s)->next); + } + sti(); +} + +static void unix_insert_socket(unix_socket *sk) +{ + cli(); + sk->next=unix_socket_list; + unix_socket_list=sk; + sti(); +} + +static unix_socket *unix_find_socket(struct inode *i) +{ + unix_socket *s; + cli(); + s=unix_socket_list; + while(s) + { + if(s->protinfo.af_unix.inode==i) + { + sti(); + return(s); + } + s=s->next; + } + sti(); + return(NULL); +} + +/* + * Delete a unix socket. We have to allow for deferring this on a timer. + */ + +static void unix_destroy_timer(unsigned long data) +{ + unix_socket *sk=(unix_socket *)data; + if(sk->protinfo.af_unix.locks==0 && sk->wmem_alloc==0) + { + if(sk->protinfo.af_unix.name) + kfree(sk->protinfo.af_unix.name); + sk_free(sk); + return; + } + + /* + * Retry; + */ + + sk->timer.expires=jiffies+10*HZ; /* No real hurry try it every 10 seconds or so */ + add_timer(&sk->timer); +} + + +static void unix_delayed_delete(unix_socket *sk) +{ + sk->timer.data=(unsigned long)sk; + sk->timer.expires=jiffies+HZ; /* Normally 1 second after will clean up. 
After that we try every 10 */ + sk->timer.function=unix_destroy_timer; + add_timer(&sk->timer); +} + +static void unix_destroy_socket(unix_socket *sk) +{ + struct sk_buff *skb; + + unix_remove_socket(sk); + + while((skb=skb_dequeue(&sk->receive_queue))!=NULL) + { + if(sk->state==TCP_LISTEN) + { + unix_socket *osk=skb->sk; + osk->state=TCP_CLOSE; + kfree_skb(skb, FREE_WRITE); /* Now surplus - free the skb first before the socket */ + osk->state_change(osk); /* So the connect wakes and cleans up (if any) */ + /* osk will be destroyed when it gets to close or the timer fires */ + } + else + { + /* passed fds are erased in the kfree_skb hook */ + kfree_skb(skb,FREE_WRITE); + } + } + + if(sk->protinfo.af_unix.inode!=NULL) + { + iput(sk->protinfo.af_unix.inode); + sk->protinfo.af_unix.inode=NULL; + } + + if(--sk->protinfo.af_unix.locks==0 && sk->wmem_alloc==0) + { + if(sk->protinfo.af_unix.name) + kfree(sk->protinfo.af_unix.name); + sk_free(sk); + } + else + { + sk->dead=1; + unix_delayed_delete(sk); /* Try every so often until buffers are all freed */ + } +} + +/* + * Fixme: We need async I/O on AF_UNIX doing next. + */ + +static int unix_fcntl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + return -EINVAL; +} + +/* + * Yes socket options work with the new unix domain socketry!!!!!!! 
+ */ + +static int unix_setsockopt(struct socket *sock, int level, int optname, char *optval, int optlen) +{ + unix_socket *sk=sock->data; + if(level!=SOL_SOCKET) + return -EOPNOTSUPP; + return sock_setsockopt(sk,level,optname,optval,optlen); +} + +static int unix_getsockopt(struct socket *sock, int level, int optname, char *optval, int *optlen) +{ + unix_socket *sk=sock->data; + if(level!=SOL_SOCKET) + return -EOPNOTSUPP; + return sock_getsockopt(sk,level,optname,optval,optlen); +} + +static int unix_listen(struct socket *sock, int backlog) +{ + unix_socket *sk=sock->data; + if(sk->type!=SOCK_STREAM) + return -EOPNOTSUPP; /* Only stream sockets accept */ + if(sk->protinfo.af_unix.name==NULL) + return -EINVAL; /* No listens on an unbound socket */ + sk->max_ack_backlog=backlog; + sk->state=TCP_LISTEN; + return 0; +} + +static void def_callback1(struct sock *sk) +{ + if(!sk->dead) + wake_up_interruptible(sk->sleep); +} + +static void def_callback2(struct sock *sk, int len) +{ + if(!sk->dead) + { + wake_up_interruptible(sk->sleep); + sock_wake_async(sk->socket, 1); + } +} + +static void def_callback3(struct sock *sk) +{ + if(!sk->dead) + { + wake_up_interruptible(sk->sleep); + sock_wake_async(sk->socket, 2); + } +} + +static int unix_create(struct socket *sock, int protocol) +{ + unix_socket *sk; + if(protocol && protocol != PF_UNIX) + return -EPROTONOSUPPORT; + sk=(unix_socket *)sk_alloc(GFP_KERNEL); + if(sk==NULL) + return -ENOMEM; + switch(sock->type) + { + case SOCK_STREAM: + break; + /* + * Believe it or not BSD has AF_UNIX, SOCK_RAW though + * nothing uses it. 
+ */ + case SOCK_RAW: + sock->type=SOCK_DGRAM; + case SOCK_DGRAM: + break; + default: + sk_free(sk); + return -ESOCKTNOSUPPORT; + } + sk->type=sock->type; + init_timer(&sk->timer); + skb_queue_head_init(&sk->write_queue); + skb_queue_head_init(&sk->receive_queue); + skb_queue_head_init(&sk->back_log); + sk->protinfo.af_unix.family=AF_UNIX; + sk->protinfo.af_unix.inode=NULL; + sk->protinfo.af_unix.locks=1; /* Us */ + sk->protinfo.af_unix.readsem=MUTEX; /* single task reading lock */ + sk->rcvbuf=SK_RMEM_MAX; + sk->sndbuf=SK_WMEM_MAX; + sk->allocation=GFP_KERNEL; + sk->state=TCP_CLOSE; + sk->priority=SOPRI_NORMAL; + sk->state_change=def_callback1; + sk->data_ready=def_callback2; + sk->write_space=def_callback3; + sk->error_report=def_callback1; + sk->mtu=4096; + sk->socket=sock; + sock->data=(void *)sk; + sk->sleep=sock->wait; + unix_insert_socket(sk); + return 0; +} + +static int unix_dup(struct socket *newsock, struct socket *oldsock) +{ + return unix_create(newsock,0); +} + +static int unix_release(struct socket *sock, struct socket *peer) +{ + unix_socket *sk=sock->data; + unix_socket *skpair; + + /* May not have data attached */ + + if(sk==NULL) + return 0; + + sk->state_change(sk); + sk->dead=1; + skpair=(unix_socket *)sk->protinfo.af_unix.other; /* Person we send to (default) */ + if(sk->type==SOCK_STREAM && skpair!=NULL && skpair->state!=TCP_LISTEN) + { + skpair->shutdown=SHUTDOWN_MASK; /* No more writes */ + skpair->state_change(skpair); /* Wake any blocked writes */ + } + if(skpair!=NULL) + skpair->protinfo.af_unix.locks--; /* It may now die */ + sk->protinfo.af_unix.other=NULL; /* No pair */ + unix_destroy_socket(sk); /* Try to flush out this socket. Throw out buffers at least */ + unix_gc(); /* Garbage collect fds */ + + /* + * FIXME: BSD difference: In BSD all sockets connected to use get ECONNRESET and we die on the spot. In + * Linux we behave like files and pipes do and wait for the last dereference. 
+ */ + + sock->data = NULL; + sk->socket = NULL; + + return 0; +} + + +static unix_socket *unix_find_other(char *path, int *error) +{ + int old_fs; + int err; + struct inode *inode; + unix_socket *u; + + old_fs=get_fs(); + set_fs(get_ds()); + err = open_namei(path, 2, S_IFSOCK, &inode, NULL); + set_fs(old_fs); + if(err<0) + { + *error=err; + return NULL; + } + u=unix_find_socket(inode); + iput(inode); + if(u==NULL) + { + *error=-ECONNREFUSED; + return NULL; + } + return u; +} + + +static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr; + unix_socket *sk=sock->data; + int old_fs; + int err; + + if(sk->protinfo.af_unix.name) + return -EINVAL; /* Already bound */ + + if(addr_len>sizeof(struct sockaddr_un) || addr_len<3 || sunaddr->sun_family!=AF_UNIX) + return -EINVAL; + unix_mkname(sunaddr, addr_len); + /* + * Put ourselves in the filesystem + */ + if(sk->protinfo.af_unix.inode!=NULL) + return -EINVAL; + + sk->protinfo.af_unix.name=kmalloc(addr_len+1, GFP_KERNEL); + if(sk->protinfo.af_unix.name==NULL) + return -ENOBUFS; + memcpy(sk->protinfo.af_unix.name, sunaddr->sun_path, addr_len+1); + + old_fs=get_fs(); + set_fs(get_ds()); + + err=do_mknod(sk->protinfo.af_unix.name,S_IFSOCK|S_IRWXUGO,0); + if(err==0) + err=open_namei(sk->protinfo.af_unix.name, 2, S_IFSOCK, &sk->protinfo.af_unix.inode, NULL); + + set_fs(old_fs); + + if(err<0) + { + kfree_s(sk->protinfo.af_unix.name,addr_len+1); + sk->protinfo.af_unix.name=NULL; + if(err==-EEXIST) + return -EADDRINUSE; + else + return err; + } + + return 0; + +} + +static int unix_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) +{ + unix_socket *sk=sock->data; + struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr; + unix_socket *other; + struct sk_buff *skb; + int err; + + /* + * 1003.1g breaking connected state with AF_UNSPEC + */ + + if(sunaddr->sun_family==AF_UNSPEC) + { + if(sk->protinfo.af_unix.other) + { + 
sk->protinfo.af_unix.other->protinfo.af_unix.locks--; + sk->protinfo.af_unix.other=NULL; + sock->state=SS_UNCONNECTED; + } + return 0; + } + + if(sk->type==SOCK_STREAM && sk->protinfo.af_unix.other) + { + if(sock->state==SS_CONNECTING && sk->state==TCP_ESTABLISHED) + { + sock->state=SS_CONNECTED; + return 0; + } + if(sock->state==SS_CONNECTING && sk->state == TCP_CLOSE) + { + sock->state=SS_UNCONNECTED; + return -ECONNREFUSED; + } + if(sock->state!=SS_CONNECTING) + return -EISCONN; + if(flags&O_NONBLOCK) + return -EALREADY; + /* + * Drop through the connect up logic to the wait. + */ + } + + if(addr_len < sizeof(sunaddr->sun_family)+1 || sunaddr->sun_family!=AF_UNIX) + return -EINVAL; + + unix_mkname(sunaddr, addr_len); + + if(sk->type==SOCK_DGRAM) + { + if(sk->protinfo.af_unix.other) + { + sk->protinfo.af_unix.other->protinfo.af_unix.locks--; + sk->protinfo.af_unix.other=NULL; + sock->state=SS_UNCONNECTED; + } + other=unix_find_other(sunaddr->sun_path, &err); + if(other==NULL) + return err; + if(other->type!=sk->type) + return -EPROTOTYPE; + other->protinfo.af_unix.locks++; + sk->protinfo.af_unix.other=other; + sock->state=SS_CONNECTED; + sk->state=TCP_ESTABLISHED; + return 0; /* Done */ + } + + + if(sock->state==SS_UNCONNECTED) + { + /* + * Now ready to connect + */ + + skb=sock_alloc_send_skb(sk, 0, 0, 0, &err); /* Marker object */ + if(skb==NULL) + return err; + skb->sk=sk; /* So they know it is us */ + skb->free=1; + skb->h.filp=NULL; + sk->state=TCP_CLOSE; + unix_mkname(sunaddr, addr_len); + other=unix_find_other(sunaddr->sun_path, &err); + if(other==NULL) + { + kfree_skb(skb, FREE_WRITE); + return err; + } + if(other->type!=sk->type) + { + kfree_skb(skb, FREE_WRITE); + return -EPROTOTYPE; + } + other->protinfo.af_unix.locks++; /* Lock the other socket so it doesn't run off for a moment */ + other->ack_backlog++; + sk->protinfo.af_unix.other=other; + skb_queue_tail(&other->receive_queue,skb); + sk->state=TCP_SYN_SENT; + sock->state=SS_CONNECTING; + sti(); + 
other->data_ready(other,0); /* Wake up ! */ + } + + + /* Wait for an accept */ + + cli(); + while(sk->state==TCP_SYN_SENT) + { + if(flags&O_NONBLOCK) + { + sti(); + return -EINPROGRESS; + } + interruptible_sleep_on(sk->sleep); + if(current->signal & ~current->blocked) + { + sti(); + return -ERESTARTSYS; + } + } + + /* + * Has the other end closed on us ? + */ + + if(sk->state==TCP_CLOSE) + { + sk->protinfo.af_unix.other->protinfo.af_unix.locks--; + sk->protinfo.af_unix.other=NULL; + sock->state=SS_UNCONNECTED; + sti(); + return -ECONNREFUSED; + } + + /* + * Amazingly it has worked + */ + + sock->state=SS_CONNECTED; + sti(); + return 0; + +} + +static int unix_socketpair(struct socket *a, struct socket *b) +{ + unix_socket *ska,*skb; + + ska=a->data; + skb=b->data; + + /* Join our sockets back to back */ + ska->protinfo.af_unix.locks++; + skb->protinfo.af_unix.locks++; + ska->protinfo.af_unix.other=skb; + skb->protinfo.af_unix.other=ska; + ska->state=TCP_ESTABLISHED; + skb->state=TCP_ESTABLISHED; + return 0; +} + +static int unix_accept(struct socket *sock, struct socket *newsock, int flags) +{ + unix_socket *sk=sock->data; + unix_socket *newsk, *tsk; + struct sk_buff *skb; + + if(sk->type!=SOCK_STREAM) + { + return -EOPNOTSUPP; + } + if(sk->state!=TCP_LISTEN) + { + return -EINVAL; + } + + newsk=newsock->data; + if(sk->protinfo.af_unix.name!=NULL) + { + newsk->protinfo.af_unix.name=kmalloc(strlen(sk->protinfo.af_unix.name)+1, GFP_KERNEL); + if(newsk->protinfo.af_unix.name==NULL) + return -ENOMEM; + strcpy(newsk->protinfo.af_unix.name, sk->protinfo.af_unix.name); + } + + do + { + cli(); + skb=skb_dequeue(&sk->receive_queue); + if(skb==NULL) + { + if(flags&O_NONBLOCK) + { + sti(); + return -EAGAIN; + } + interruptible_sleep_on(sk->sleep); + if(current->signal & ~current->blocked) + { + sti(); + return -ERESTARTSYS; + } + sti(); + } + } + while(skb==NULL); + tsk=skb->sk; + kfree_skb(skb, FREE_WRITE); /* The buffer is just used as a tag */ + sk->ack_backlog--; + 
newsk->protinfo.af_unix.other=tsk; + tsk->protinfo.af_unix.other=newsk; + tsk->state=TCP_ESTABLISHED; + newsk->state=TCP_ESTABLISHED; + newsk->protinfo.af_unix.locks++; /* Swap lock over */ + sk->protinfo.af_unix.locks--; /* Locked to child socket not master */ + tsk->protinfo.af_unix.locks++; /* Back lock */ + sti(); + tsk->state_change(tsk); /* Wake up any sleeping connect */ + sock_wake_async(tsk->socket, 0); + return 0; +} + +static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer) +{ + unix_socket *sk=sock->data; + struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr; + + if(peer) + { + if(sk->protinfo.af_unix.other==NULL) + return -ENOTCONN; + sk=sk->protinfo.af_unix.other; + } + sunaddr->sun_family=AF_UNIX; + if(sk->protinfo.af_unix.name==NULL) + { + *sunaddr->sun_path=0; + *uaddr_len=sizeof(sunaddr->sun_family)+1; + return 0; /* Not bound */ + } + *uaddr_len=sizeof(sunaddr->sun_family)+strlen(sk->protinfo.af_unix.name)+1; + strcpy(sunaddr->sun_path,sk->protinfo.af_unix.name); /* 108 byte limited */ + return 0; +} + +/* + * Copy file descriptors into system space. + * Return number copied or negative error code + */ + +static int unix_fd_copy(struct sock *sk, struct cmsghdr *cmsg, struct file **fp) +{ + int num=cmsg->cmsg_len-sizeof(struct cmsghdr); + int i; + int *fdp=(int *)cmsg->cmsg_data; + + num /= sizeof(int); /* Odd bytes are forgotten in BSD not errored */ + if (num >= UNIX_MAX_FD) + return -EINVAL; + + /* + * Verify the descriptors. 
+ */ + + for(i=0; i< num; i++) + { + int fd; + + fd = fdp[i]; + if (fd < 0 || fd >= NR_OPEN) + return -EBADF; + if (current->files->fd[fd]==NULL) + return -EBADF; + } + + /* add another reference to these files */ + for(i=0; i< num; i++) + { + fp[i]=current->files->fd[fdp[i]]; + fp[i]->f_count++; + unix_inflight(fp[i]); + } + + return num; +} + +/* + * Free the descriptors in the array + */ + +static void unix_fd_free(struct sock *sk, struct file **fp, int num) +{ + int i; + for(i=0;i<num;i++) + { + close_fp(fp[i]); + unix_notinflight(fp[i]); + } +} + + +/* + * Perform the AF_UNIX file descriptor pass out functionality. This + * is nasty and messy as is the whole design of BSD file passing. + */ + +static void unix_detach_fds(struct sk_buff *skb, struct cmsghdr *cmsg) +{ + int i; + /* count of space in parent for fds */ + int cmnum; + struct file **fp; + int *cmfptr; + int fdnum; + + cmfptr = NULL; + cmnum = 0; + if (cmsg) + { + cmnum = (cmsg->cmsg_len-sizeof(struct cmsghdr)) / sizeof(int); + cmfptr = (int *)&cmsg->cmsg_data; + } + + fdnum = *(int *)skb->h.filp; + fp = (struct file **)(skb->h.filp+sizeof(long)); + + if (cmnum > fdnum) + cmnum = fdnum; + + /* + * Copy those that fit + */ + for (i = 0 ; i < cmnum ; i++) + { + int new_fd = get_unused_fd(); + if (new_fd < 0) + break; + current->files->fd[new_fd]=fp[i]; + *cmfptr++ = new_fd; + unix_notinflight(fp[i]); + } + /* + * Dump those that don't + */ + for( ; i < fdnum ; i++) + { + close_fp(fp[i]); + unix_notinflight(fp[i]); + } + kfree(skb->h.filp); + skb->h.filp=NULL; + + /* no need to use destructor */ + skb->destructor = NULL; +} + +static void unix_destruct_fds(struct sk_buff *skb) +{ + unix_detach_fds(skb,NULL); +} + +/* + * Attach the file descriptor array to an sk_buff + */ +static void unix_attach_fds(int fpnum,struct file **fp,struct sk_buff *skb) +{ + + skb->h.filp = kmalloc(sizeof(long)+fpnum*sizeof(struct file *), + GFP_KERNEL); + /* number of descriptors starts block */ + *(int *)skb->h.filp = 
fpnum; + /* actual descriptors */ + memcpy(skb->h.filp+sizeof(long),fp,fpnum*sizeof(struct file *)); + skb->destructor = unix_destruct_fds; +} + +/* + * Send AF_UNIX data. + */ + +static int unix_sendmsg(struct socket *sock, struct msghdr *msg, int len, int nonblock, int flags) +{ + unix_socket *sk=sock->data; + unix_socket *other; + struct sockaddr_un *sunaddr=msg->msg_name; + int err,size; + struct sk_buff *skb; + int limit=0; + int sent=0; + struct file *fp[UNIX_MAX_FD]; + /* number of fds waiting to be passed, 0 means either + * no fds to pass or they've already been passed + */ + int fpnum=0; + + if(sk->err) + return sock_error(sk); + + if(flags&MSG_OOB) + return -EOPNOTSUPP; + + if(flags) /* For now */ { + return -EINVAL; + } + + if(sk->shutdown&SEND_SHUTDOWN) + { + send_sig(SIGPIPE,current,0); + return -EPIPE; + } + + if(sunaddr!=NULL) + { + if(sock->type==SOCK_STREAM) + { + if(sk->state==TCP_ESTABLISHED) + return -EISCONN; + else + return -EOPNOTSUPP; + } + } + + if(sunaddr==NULL) + { + if(sk->protinfo.af_unix.other==NULL) + return -ENOTCONN; + } + + /* + * A control message has been attached. + */ + if(msg->msg_control) + { + struct cmsghdr *cm = msg->msg_control; + + if(cm==NULL || msg->msg_controllen<sizeof(struct cmsghdr) || + cm->cmsg_type!=SCM_RIGHTS || + cm->cmsg_level!=SOL_SOCKET || + msg->msg_controllen!=cm->cmsg_len) + { + return -EINVAL; + } + + fpnum = unix_fd_copy(sk, cm, fp); + + if(fpnum<0) { + return fpnum; + } + } + + while(sent < len) + { + /* + * Optimisation for the fact that under 0.01% of X messages typically + * need breaking up. + */ + + size=len-sent; + + if(size>(sk->sndbuf-sizeof(struct sk_buff))/2) /* Keep two messages in the pipe so it schedules better */ + { + if(sock->type==SOCK_DGRAM) + { + unix_fd_free(sk,fp,fpnum); + return -EMSGSIZE; + } + size=(sk->sndbuf-sizeof(struct sk_buff))/2; + } + /* + * Keep to page sized kmalloc()'s as various people + * have suggested. Big mallocs stress the vm too + * much. 
+ */ +#define MAX_ALLOC (PAGE_SIZE*7/8) + if(size > MAX_ALLOC && sock->type!=SOCK_DGRAM) + limit = MAX_ALLOC; /* Fall back to 4K if we can't grab a big buffer this instant */ + else + limit = 0; /* Otherwise just grab and wait */ + + /* + * Grab a buffer + */ + + skb=sock_alloc_send_skb(sk,size,limit,nonblock, &err); + + if(skb==NULL) + { + unix_fd_free(sk,fp,fpnum); + if(sent) + { + sk->err=-err; + return sent; + } + return err; + } + + /* + * If you pass two values to the sock_alloc_send_skb + * it tries to grab the large buffer with GFP_BUFFER + * (which can fail easily), and if it fails grab the + * fallback size buffer which is under a page and will + * succeed. [Alan] + */ + size = min(size, skb_tailroom(skb)); + + skb->sk=sk; + skb->free=1; + + if(fpnum) + { + unix_attach_fds(fpnum,fp,skb); + fpnum=0; + } + else + skb->h.filp=NULL; + + memcpy_fromiovec(skb_put(skb,size),msg->msg_iov, size); + + cli(); + if(sunaddr==NULL) + { + other=sk->protinfo.af_unix.other; + if(sock->type==SOCK_DGRAM && other->dead) + { + other->protinfo.af_unix.locks--; + sk->protinfo.af_unix.other=NULL; + sock->state=SS_UNCONNECTED; + sti(); + kfree_skb(skb, FREE_WRITE); + /* + * Check with 1003.1g - what should + * datagram error + */ + if (!sent) + sent = -ECONNRESET; + return sent; + } + /* + * Stream sockets SIGPIPE + */ + if(sock->type==SOCK_STREAM && other->dead) + { + kfree_skb(skb, FREE_WRITE); + sti(); + if(!sent) + { + send_sig(SIGPIPE,current,0); + sent = -EPIPE; + } + return sent; + } + } + else + { + unix_mkname(sunaddr, msg->msg_namelen); + other=unix_find_other(sunaddr->sun_path, &err); + if(other==NULL) + { + sti(); + kfree_skb(skb, FREE_WRITE); + if(sent) + return sent; + else + return err; + } + } + skb_queue_tail(&other->receive_queue, skb); + sti(); + /* if we sent an fd, only do it once */ + other->data_ready(other,size); + sent+=size; + } + return sent; +} + +/* + * Sleep until data has arrive. But check for races.. 
+ */ + +static void unix_data_wait(unix_socket * sk) +{ + /* + * AF_UNIX sockets get no messages during interrupts, so this + * is safe without cli/sti. + */ + if (!skb_peek(&sk->receive_queue)) { + sk->socket->flags |= SO_WAITDATA; + interruptible_sleep_on(sk->sleep); + sk->socket->flags &= ~SO_WAITDATA; + } +} + +static int unix_recvmsg(struct socket *sock, struct msghdr *msg, int size, int noblock, int flags, int *addr_len) +{ + unix_socket *sk=sock->data; + struct sockaddr_un *sunaddr=msg->msg_name; + struct sk_buff *skb; + int copied=0; + unsigned char *sp; + int len; + int num; + struct iovec *iov=msg->msg_iov; + struct cmsghdr *cm=NULL; + int ct=msg->msg_iovlen; + int err = 0; + int target = 1; + + if(flags&MSG_OOB) + return -EOPNOTSUPP; + if(flags&MSG_WAITALL) + target = size; + + + if(addr_len) + *addr_len=0; + + if(msg->msg_control) + { + cm=msg->msg_control; + + if(msg->msg_controllen<sizeof(struct cmsghdr) +#if 0 +/* investigate this further -- Stevens example doesn't seem to care */ + || + cm->cmsg_type!=SCM_RIGHTS || + cm->cmsg_level!=SOL_SOCKET || + msg->msg_controllen!=cm->cmsg_len +#endif + ) + { + printk(KERN_DEBUG "unix_recvmsg: Bad msg_control\n"); + return -EINVAL; + } + } + + down(&sk->protinfo.af_unix.readsem); /* Lock the socket */ + while(ct--) + { + int done=0; + sp=iov->iov_base; + len=iov->iov_len; + iov++; + + while(done<len) + { + if (copied && (flags & MSG_PEEK)) + goto out; + if (copied == size) + goto out; + skb=skb_dequeue(&sk->receive_queue); + if(skb==NULL) + { + up(&sk->protinfo.af_unix.readsem); + + if(copied >= target) + return copied; + + /* + * POSIX checking order... 
+ */ + + if(sk->err) + return sock_error(sk); + if(sk->shutdown & RCV_SHUTDOWN) + return copied; + + if(current->signal & ~current->blocked) + return -ERESTARTSYS; + if(noblock) + return -EAGAIN; + + unix_data_wait(sk); + down(&sk->protinfo.af_unix.readsem); + continue; + } + if(msg->msg_name!=NULL) + { + sunaddr->sun_family=AF_UNIX; + if(skb->sk->protinfo.af_unix.name) + { + memcpy(sunaddr->sun_path, skb->sk->protinfo.af_unix.name, 108); + if(addr_len) + *addr_len=strlen(sunaddr->sun_path)+sizeof(short); + } + else + if(addr_len) + *addr_len=sizeof(short); + } + + num=skb->len; + if(num>len-done) + { + num=len-done; + msg->msg_flags|=MSG_TRUNC; + } + err = copy_to_user(sp, skb->data, num); + + if (err) + { + goto out; + } + + if (skb->h.filp!=NULL) + unix_detach_fds(skb,cm); + + copied+=num; + done+=num; + sp+=num; + if (!(flags & MSG_PEEK)) + skb_pull(skb, num); + /* put the skb back if we didn't use it up.. */ + if (skb->len) { + skb_queue_head(&sk->receive_queue, skb); + continue; + } + kfree_skb(skb, FREE_WRITE); + if(sock->type==SOCK_DGRAM || cm) + goto out; + } + } +out: + up(&sk->protinfo.af_unix.readsem); + + return err ? 
-EFAULT : copied; +} + +static int unix_shutdown(struct socket *sock, int mode) +{ + unix_socket *sk=(unix_socket *)sock->data; + unix_socket *other=sk->protinfo.af_unix.other; + if(mode&SEND_SHUTDOWN) + { + sk->shutdown|=SEND_SHUTDOWN; + sk->state_change(sk); + if(other) + { + other->shutdown|=RCV_SHUTDOWN; + other->state_change(other); + } + } + other=sk->protinfo.af_unix.other; + if(mode&RCV_SHUTDOWN) + { + sk->shutdown|=RCV_SHUTDOWN; + sk->state_change(sk); + if(other) + { + other->shutdown|=SEND_SHUTDOWN; + other->state_change(other); + } + } + return 0; +} + + +static int unix_select(struct socket *sock, int sel_type, select_table *wait) +{ + return datagram_select(sock->data,sel_type,wait); +} + +static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + unix_socket *sk=sock->data; + long amount=0; + + switch(cmd) + { + + case TIOCOUTQ: + amount=sk->sndbuf-sk->wmem_alloc; + if(amount<0) + amount=0; + return put_user(amount, (int *)arg); + case TIOCINQ: + { + struct sk_buff *skb; + if(sk->state==TCP_LISTEN) + return -EINVAL; + /* + * These two are safe on a single CPU system as + * only user tasks fiddle here + */ + if((skb=skb_peek(&sk->receive_queue))!=NULL) + amount=skb->len; + return put_user(amount, (int *)arg); + } + + default: + return -EINVAL; + } + /*NOTREACHED*/ + return(0); +} + +#ifdef CONFIG_PROC_FS +static int unix_get_info(char *buffer, char **start, off_t offset, int length, int dummy) +{ + off_t pos=0; + off_t begin=0; + int len=0; + unix_socket *s=unix_socket_list; + + len+= sprintf(buffer,"Num RefCount Protocol Flags Type St " + "Inode Path\n"); + + while(s!=NULL) + { + len+=sprintf(buffer+len,"%p: %08X %08X %08lX %04X %02X %5ld", + s, + s->protinfo.af_unix.locks, + 0, + s->socket->flags, + s->socket->type, + s->socket->state, + s->socket->inode ? 
s->socket->inode->i_ino : 0); + if(s->protinfo.af_unix.name!=NULL) + len+=sprintf(buffer+len, " %s\n", s->protinfo.af_unix.name); + else + buffer[len++]='\n'; + + pos=begin+len; + if(pos<offset) + { + len=0; + begin=pos; + } + if(pos>offset+length) + break; + s=s->next; + } + *start=buffer+(offset-begin); + len-=(offset-begin); + if(len>length) + len=length; + return len; +} +#endif + +struct proto_ops unix_proto_ops = { + AF_UNIX, + + unix_create, + unix_dup, + unix_release, + unix_bind, + unix_connect, + unix_socketpair, + unix_accept, + unix_getname, + unix_select, + unix_ioctl, + unix_listen, + unix_shutdown, + unix_setsockopt, + unix_getsockopt, + unix_fcntl, + unix_sendmsg, + unix_recvmsg +}; + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry proc_net_unix = { + PROC_NET_UNIX, 4, "unix", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + unix_get_info +}; +#endif + +void unix_proto_init(struct net_proto *pro) +{ + printk(KERN_INFO "NET3: Unix domain sockets 0.14 for Linux NET3.037.\n"); + sock_register(unix_proto_ops.family, &unix_proto_ops); +#ifdef CONFIG_PROC_FS + proc_net_register(&proc_net_unix); +#endif +} +/* + * Local variables: + * compile-command: "gcc -g -D__KERNEL__ -Wall -O6 -I/usr/src/linux/include -c af_unix.c" + * End: + */ diff --git a/net/unix/garbage.c b/net/unix/garbage.c new file mode 100644 index 000000000..c53c4d4e6 --- /dev/null +++ b/net/unix/garbage.c @@ -0,0 +1,280 @@ +/* + * NET3: Garbage Collector For AF_UNIX sockets + * + * Garbage Collector: + * Copyright (C) Barak A. Pearlmutter. + * Released under the GPL version 2 or later. + * + * Chopped about by Alan Cox 22/3/96 to make it fit the AF_UNIX socket problem. + * If it doesn't work blame me, it worked when Barak sent it. 
+ * + * Assumptions: + * + * - object w/ a bit + * - free list + * + * Current optimizations: + * + * - explicit stack instead of recursion + * - tail recurse on first born instead of immediate push/pop + * + * Future optimizations: + * + * - don't just push entire root set; process in place + * - use linked list for internal stack + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * + */ + +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/socket.h> +#include <linux/un.h> +#include <linux/fcntl.h> +#include <linux/termios.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/in.h> +#include <linux/fs.h> +#include <linux/malloc.h> +#include <asm/uaccess.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <net/af_unix.h> +#include <linux/proc_fs.h> + +/* Internal data structures and random procedures: */ + +#define MAX_STACK 1000 /* Maximum depth of tree (about 1 page) */ +static unix_socket **stack; /* stack of objects to mark */ +static int in_stack = 0; /* first free entry in stack */ + + +extern inline unix_socket *unix_get_socket(struct file *filp) +{ + unix_socket * u_sock = NULL; + struct inode *inode = filp->f_inode; + + /* + * Socket ? + */ + if (inode && inode->i_sock) { + struct socket * s = &inode->u.socket_i; + + /* + * AF_UNIX ? + */ + if (s->ops == &unix_proto_ops) + u_sock = s->data; + } + return u_sock; +} + +/* + * Keep the number of times in flight count for the file + * descriptor if it is for an AF_UNIX socket. 
+ */ + +void unix_inflight(struct file *fp) +{ + unix_socket *s=unix_get_socket(fp); + if(s) + s->protinfo.af_unix.inflight++; +} + +void unix_notinflight(struct file *fp) +{ + unix_socket *s=unix_get_socket(fp); + if(s) + s->protinfo.af_unix.inflight--; +} + + +/* + * Garbage Collector Support Functions + */ + +extern inline void push_stack(unix_socket *x) +{ + if (in_stack == MAX_STACK) + panic("can't push onto full stack"); + stack[in_stack++] = x; +} + +extern inline unix_socket *pop_stack(void) +{ + if (in_stack == 0) + panic("can't pop empty gc stack"); + return stack[--in_stack]; +} + +extern inline int empty_stack(void) +{ + return in_stack == 0; +} + +extern inline void maybe_mark_and_push(unix_socket *x) +{ + if (x->protinfo.af_unix.marksweep&MARKED) + return; + x->protinfo.af_unix.marksweep|=MARKED; + push_stack(x); +} + + +/* The external entry point: unix_gc() */ + +void unix_gc(void) +{ + static int in_unix_gc=0; + unix_socket *s; + unix_socket *next; + + /* + * Avoid a recursive GC. + */ + + if(in_unix_gc) + return; + in_unix_gc=1; + + stack=(unix_socket **)get_free_page(GFP_KERNEL); + + /* + * Assume everything is now unmarked + */ + + /* Invariant to be maintained: + - everything marked is either: + -- (a) on the stack, or + -- (b) has all of its children marked + - everything on the stack is always marked + - nothing is ever pushed onto the stack twice, because: + -- nothing previously marked is ever pushed on the stack + */ + + /* + * Push root set + */ + + for(s=unix_socket_list;s!=NULL;s=s->next) + { + /* + * If all instances of the descriptor are not + * in flight we are in use. 
+ */ + if(s->socket && s->socket->file && s->socket->file->f_count > s->protinfo.af_unix.inflight) + maybe_mark_and_push(s); + } + + /* + * Mark phase + */ + + while (!empty_stack()) + { + unix_socket *x = pop_stack(); + unix_socket *f=NULL,*sk; + struct sk_buff *skb; +tail: + skb=skb_peek(&x->receive_queue); + + /* + * Loop through all but first born + */ + + while(skb && skb != (struct sk_buff *)&x->receive_queue) + { + /* + * Do we have file descriptors ? + */ + if(skb->h.filp) + { + /* + * Process the descriptors of this socket + */ + int nfd=*(int *)skb->h.filp; + struct file **fp=(struct file **)(skb->h.filp+sizeof(int)); + while(nfd--) + { + /* + * Get the socket the fd matches if + * it indeed does so + */ + if((sk=unix_get_socket(*fp++))!=NULL) + { + /* + * Remember the first, mark the + * rest. + */ + if(f==NULL) + f=sk; + else + maybe_mark_and_push(sk); + } + } + } + skb=skb->next; + } + /* + * Handle first born specially + */ + + if (f) + { + if (!(f->protinfo.af_unix.marksweep&MARKED)) + { + f->protinfo.af_unix.marksweep|=MARKED; + x=f; + f=NULL; + goto tail; + } + } + } + + /* + * Sweep phase. NOTE: this part dominates the time complexity + */ + + for(s=unix_socket_list;s!=NULL;s=next) + { + next=s->next; + if (!(s->protinfo.af_unix.marksweep&MARKED)) + { + /* + * We exist only in the passing tree of sockets + * that is no longer connected to active descriptors + * Time to die.. + * + * Subtle item: We will correctly sweep out the + * socket that has just been closed by the user. + * We must not close this as we are in the middle + * of its close at this moment. Skip that file + * using f_count==0 to spot it. 
+ */ + + if(s->socket && s->socket->file && s->socket->file->f_count) + close_fp(s->socket->file); + } + else + s->protinfo.af_unix.marksweep&=~MARKED; /* unmark everything for next collection */ + } + + in_unix_gc=0; + + free_page((long)stack); +} diff --git a/net/unix/proc.c b/net/unix/proc.c deleted file mode 100644 index 64a777330..000000000 --- a/net/unix/proc.c +++ /dev/null @@ -1,100 +0,0 @@ -/* - * UNIX An implementation of the AF_UNIX network domain for the - * LINUX operating system. UNIX is implemented using the - * BSD Socket interface as the means of communication with - * the user level. - * - * The functions in this file provide an interface between - * the PROC file system and the "unix" family of networking - * protocols. It is mainly used for debugging and statistics. - * - * Version: @(#)proc.c 1.0.4 05/23/93 - * - * Authors: Ross Biro, <bir7@leland.Stanford.Edu> - * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> - * Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de> - * Fred Baumgarten, <dc6iq@insu1.etec.uni-kalrsruhe.de> - * - * Fixes: - * Dmitry Gorodchanin : /proc locking fix - * Mathijs Maassen : unbound /proc fix. - * Alan Cox : Fix sock=NULL race - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include <linux/autoconf.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/socket.h> -#include <linux/net.h> -#include <linux/un.h> -#include <linux/param.h> -#include <net/unix.h> - - -/* Called from PROCfs. 
*/ -int unix_get_info(char *buffer, char **start, off_t offset, int length) -{ - off_t pos=0; - off_t begin=0; - int len=0; - int i; - unsigned long flags; - socket_state s_state; - short s_type; - long s_flags; - - len += sprintf(buffer, "Num RefCount Protocol Flags Type St Path\n"); - - for(i = 0; i < NSOCKETS_UNIX; i++) - { - save_flags(flags); - cli(); - if (unix_datas[i].refcnt>0 && unix_datas[i].socket!=NULL) - { - /* sprintf is slow... lock only for the variable reads */ - s_type=unix_datas[i].socket->type; - s_flags=unix_datas[i].socket->flags; - s_state=unix_datas[i].socket->state; - restore_flags(flags); - len += sprintf(buffer+len, "%2d: %08X %08X %08lX %04X %02X", i, - unix_datas[i].refcnt, - unix_datas[i].protocol, - s_flags, - s_type, - s_state - ); - - /* If socket is bound to a filename, we'll print it. */ - if(unix_datas[i].sockaddr_len>0) - { - len += sprintf(buffer+len, " %s\n", - unix_datas[i].sockaddr_un.sun_path); - } - else - { /* just add a newline */ - buffer[len++]='\n'; - } - - pos=begin+len; - if(pos<offset) - { - len=0; - begin=pos; - } - if(pos>offset+length) - break; - } - else - restore_flags(flags); - } - - *start=buffer+(offset-begin); - len-=(offset-begin); - if(len>length) - len=length; - return len; -} diff --git a/net/unix/sock.c b/net/unix/sock.c deleted file mode 100644 index 9066658a7..000000000 --- a/net/unix/sock.c +++ /dev/null @@ -1,912 +0,0 @@ -/* - * UNIX An implementation of the AF_UNIX network domain for the - * LINUX operating system. UNIX is implemented using the - * BSD Socket interface as the means of communication with - * the user level. - * - * Version: @(#)sock.c 1.0.5 05/25/93 - * - * Authors: Orest Zborowski, <obz@Kodak.COM> - * Ross Biro, <bir7@leland.Stanford.Edu> - * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> - * - * Fixes: - * Alan Cox : Verify Area - * NET2E Team : Page fault locks - * Dmitry Gorodchanin : /proc locking - * - * To Do: - * Some nice person is looking into Unix sockets done properly. NET3 - * will replace all of this and include datagram sockets and socket - * options - so please stop asking me for them 8-) - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or(at your option) any later version. - */ - -#include <linux/kernel.h> -#include <linux/major.h> -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/stat.h> -#include <linux/socket.h> -#include <linux/un.h> -#include <linux/fcntl.h> -#include <linux/termios.h> -#include <linux/sockios.h> -#include <linux/net.h> -#include <linux/fs.h> -#include <linux/malloc.h> - -#include <asm/system.h> -#include <asm/segment.h> - -#include <stdarg.h> - -#include <net/unix.h> - -/* - * Because these have the address in them they casually waste an extra 8K of kernel data - * space that need not be wasted. 
- */ - -struct unix_proto_data unix_datas[NSOCKETS_UNIX]; - -static int unix_proto_create(struct socket *sock, int protocol); -static int unix_proto_dup(struct socket *newsock, struct socket *oldsock); -static int unix_proto_release(struct socket *sock, struct socket *peer); -static int unix_proto_bind(struct socket *sock, struct sockaddr *umyaddr, - int sockaddr_len); -static int unix_proto_connect(struct socket *sock, struct sockaddr *uservaddr, - int sockaddr_len, int flags); -static int unix_proto_socketpair(struct socket *sock1, struct socket *sock2); -static int unix_proto_accept(struct socket *sock, struct socket *newsock, - int flags); -static int unix_proto_getname(struct socket *sock, struct sockaddr *usockaddr, - int *usockaddr_len, int peer); -static int unix_proto_read(struct socket *sock, char *ubuf, int size, - int nonblock); -static int unix_proto_write(struct socket *sock, char *ubuf, int size, - int nonblock); -static int unix_proto_select(struct socket *sock, int sel_type, select_table * wait); -static int unix_proto_ioctl(struct socket *sock, unsigned int cmd, - unsigned long arg); -static int unix_proto_listen(struct socket *sock, int backlog); -static int unix_proto_send(struct socket *sock, void *buff, int len, - int nonblock, unsigned flags); -static int unix_proto_recv(struct socket *sock, void *buff, int len, - int nonblock, unsigned flags); -static int unix_proto_sendto(struct socket *sock, void *buff, int len, - int nonblock, unsigned flags, - struct sockaddr *addr, int addr_len); -static int unix_proto_recvfrom(struct socket *sock, void *buff, int len, - int nonblock, unsigned flags, - struct sockaddr *addr, int *addr_len); - -static int unix_proto_shutdown(struct socket *sock, int how); - -static int unix_proto_setsockopt(struct socket *sock, int level, int optname, - char *optval, int optlen); -static int unix_proto_getsockopt(struct socket *sock, int level, int optname, - char *optval, int *optlen); - - -static inline int min(int a, 
int b) -{ - if (a < b) - return(a); - return(b); -} - - - -/* Support routines doing anti page fault locking - * FvK & Matt Dillon (borrowed From NET2E3) - */ - -/* - * Locking for unix-domain sockets. We don't use the socket structure's - * wait queue because it is allowed to 'go away' outside of our control, - * whereas unix_proto_data structures stick around. - */ - -static void unix_lock(struct unix_proto_data *upd) -{ - while (upd->lock_flag) - sleep_on(&upd->wait); - upd->lock_flag = 1; -} - - -static void unix_unlock(struct unix_proto_data *upd) -{ - upd->lock_flag = 0; - wake_up(&upd->wait); -} - -/* - * We don't have to do anything. - */ - -static int unix_proto_listen(struct socket *sock, int backlog) -{ - return(0); -} - -/* - * Until the new NET3 Unix code is done we have no options. - */ - -static int unix_proto_setsockopt(struct socket *sock, int level, int optname, - char *optval, int optlen) -{ - return(-EOPNOTSUPP); -} - - -static int unix_proto_getsockopt(struct socket *sock, int level, int optname, - char *optval, int *optlen) -{ - return(-EOPNOTSUPP); -} - - -/* - * SendTo() doesn't matter as we also have no Datagram support! - */ - -static int unix_proto_sendto(struct socket *sock, void *buff, int len, int nonblock, - unsigned flags, struct sockaddr *addr, int addr_len) -{ - return(-EOPNOTSUPP); -} - -static int unix_proto_recvfrom(struct socket *sock, void *buff, int len, int nonblock, - unsigned flags, struct sockaddr *addr, int *addr_len) -{ - return(-EOPNOTSUPP); -} - -/* - * You can't shutdown a unix domain socket. - */ - -static int unix_proto_shutdown(struct socket *sock, int how) -{ - return(-EOPNOTSUPP); -} - - -/* - * Send data to a unix socket. - */ - -static int unix_proto_send(struct socket *sock, void *buff, int len, int nonblock, - unsigned flags) -{ - if (flags != 0) - return(-EINVAL); - return(unix_proto_write(sock, (char *) buff, len, nonblock)); -} - - -/* - * Receive data. 
This version of AF_UNIX also lacks MSG_PEEK 8( - */ - -static int unix_proto_recv(struct socket *sock, void *buff, int len, int nonblock, - unsigned flags) -{ - if (flags != 0) - return(-EINVAL); - return(unix_proto_read(sock, (char *) buff, len, nonblock)); -} - -/* - * Given an address and an inode go find a unix control structure - */ - -static struct unix_proto_data * -unix_data_lookup(struct sockaddr_un *sockun, int sockaddr_len, - struct inode *inode) -{ - struct unix_proto_data *upd; - - for(upd = unix_datas; upd <= last_unix_data; ++upd) - { - if (upd->refcnt > 0 && upd->socket && - upd->socket->state == SS_UNCONNECTED && - upd->sockaddr_un.sun_family == sockun->sun_family && - upd->inode == inode) - - return(upd); - } - return(NULL); -} - -/* - * We allocate a page of data for the socket. This is woefully inadequate and helps cause vast - * amounts of excess task switching and blocking when transferring stuff like bitmaps via X. - * It doesn't help this problem that the Linux scheduler is desperately in need of a major - * rewrite. Somewhere near 16K would be better maybe 32. - */ - -static struct unix_proto_data * -unix_data_alloc(void) -{ - struct unix_proto_data *upd; - - cli(); - for(upd = unix_datas; upd <= last_unix_data; ++upd) - { - if (!upd->refcnt) - { - upd->refcnt = -1; /* unix domain socket not yet initialised - bgm */ - sti(); - upd->socket = NULL; - upd->sockaddr_len = 0; - upd->sockaddr_un.sun_family = 0; - upd->buf = NULL; - upd->bp_head = upd->bp_tail = 0; - upd->inode = NULL; - upd->peerupd = NULL; - return(upd); - } - } - sti(); - return(NULL); -} - -/* - * The data area is owned by all its users. Thus we need to track owners - * carefully and not free data at the wrong moment. These look like they need - * interrupt protection but they don't because no interrupt ever fiddles with - * these counts. With an SMP Linux you'll need to protect these! 
- */ - -static inline void unix_data_ref(struct unix_proto_data *upd) -{ - if (!upd) - { - return; - } - ++upd->refcnt; -} - - -static void unix_data_deref(struct unix_proto_data *upd) -{ - if (!upd) - { - return; - } - if (upd->refcnt == 1) - { - if (upd->buf) - { - free_page((unsigned long)upd->buf); - upd->buf = NULL; - upd->bp_head = upd->bp_tail = 0; - } - } - --upd->refcnt; -} - - -/* - * Upon a create, we allocate an empty protocol data, - * and grab a page to buffer writes. - */ - -static int unix_proto_create(struct socket *sock, int protocol) -{ - struct unix_proto_data *upd; - - /* - * No funny SOCK_RAW stuff - */ - - if (protocol != 0) - { - return(-EINVAL); - } - - if (!(upd = unix_data_alloc())) - { - printk("UNIX: create: can't allocate buffer\n"); - return(-ENOMEM); - } - if (!(upd->buf = (char*) get_free_page(GFP_USER))) - { - printk("UNIX: create: can't get page!\n"); - unix_data_deref(upd); - return(-ENOMEM); - } - upd->protocol = protocol; - upd->socket = sock; - UN_DATA(sock) = upd; - upd->refcnt = 1; /* Now it's complete - bgm */ - return(0); -} - -/* - * Duplicate a socket. - */ - -static int unix_proto_dup(struct socket *newsock, struct socket *oldsock) -{ - struct unix_proto_data *upd = UN_DATA(oldsock); - return(unix_proto_create(newsock, upd->protocol)); -} - - -/* - * Release a Unix domain socket. - */ - -static int unix_proto_release(struct socket *sock, struct socket *peer) -{ - struct unix_proto_data *upd = UN_DATA(sock); - - if (!upd) - return(0); - - if (upd->socket != sock) - { - printk("UNIX: release: socket link mismatch!\n"); - return(-EINVAL); - } - - if (upd->inode) - { - iput(upd->inode); - upd->inode = NULL; - } - - UN_DATA(sock) = NULL; - upd->socket = NULL; - - if (upd->peerupd) - unix_data_deref(upd->peerupd); - unix_data_deref(upd); - return(0); -} - - -/* - * Bind a name to a socket. - * This is where much of the work is done: we allocate a fresh page for - * the buffer, grab the appropriate inode and set things up. 
- * - * FIXME: what should we do if an address is already bound? - * Here we return EINVAL, but it may be necessary to re-bind. - * I think thats what BSD does in the case of datagram sockets... - */ - -static int unix_proto_bind(struct socket *sock, struct sockaddr *umyaddr, - int sockaddr_len) -{ - char fname[UNIX_PATH_MAX + 1]; - struct unix_proto_data *upd = UN_DATA(sock); - unsigned long old_fs; - int i; - - if (sockaddr_len <= UN_PATH_OFFSET || - sockaddr_len > sizeof(struct sockaddr_un)) - { - return(-EINVAL); - } - if (upd->sockaddr_len || upd->inode) - { - /*printk("UNIX: bind: already bound!\n");*/ - return(-EINVAL); - } - memcpy(&upd->sockaddr_un, umyaddr, sockaddr_len); - upd->sockaddr_un.sun_path[sockaddr_len-UN_PATH_OFFSET] = '\0'; - if (upd->sockaddr_un.sun_family != AF_UNIX) - { - return(-EINVAL); - } - - memcpy(fname, upd->sockaddr_un.sun_path, sockaddr_len-UN_PATH_OFFSET); - fname[sockaddr_len-UN_PATH_OFFSET] = '\0'; - old_fs = get_fs(); - set_fs(get_ds()); - - i = do_mknod(fname, S_IFSOCK | S_IRWXUGO, 0); - - if (i == 0) - i = open_namei(fname, 2, S_IFSOCK, &upd->inode, NULL); - set_fs(old_fs); - if (i < 0) - { -/* printk("UNIX: bind: can't open socket %s\n", fname);*/ - if(i==-EEXIST) - i=-EADDRINUSE; - return(i); - } - upd->sockaddr_len = sockaddr_len; /* now it's legal */ - - return(0); -} - - -/* - * Perform a connection. we can only connect to unix sockets - * (I can't for the life of me find an application where that - * wouldn't be the case!) 
- */ - -static int unix_proto_connect(struct socket *sock, struct sockaddr *uservaddr, - int sockaddr_len, int flags) -{ - char fname[sizeof(((struct sockaddr_un *)0)->sun_path) + 1]; - struct sockaddr_un sockun; - struct unix_proto_data *serv_upd; - struct inode *inode; - unsigned long old_fs; - int i; - - if (sockaddr_len <= UN_PATH_OFFSET || - sockaddr_len > sizeof(struct sockaddr_un)) - { - return(-EINVAL); - } - - if (sock->state == SS_CONNECTING) - return(-EINPROGRESS); - if (sock->state == SS_CONNECTED) - return(-EISCONN); - - memcpy(&sockun, uservaddr, sockaddr_len); - sockun.sun_path[sockaddr_len-UN_PATH_OFFSET] = '\0'; - if (sockun.sun_family != AF_UNIX) - { - return(-EINVAL); - } - -/* - * Try to open the name in the filesystem - this is how we - * identify ourselves and our server. Note that we don't - * hold onto the inode that long, just enough to find our - * server. When we're connected, we mooch off the server. - */ - - memcpy(fname, sockun.sun_path, sockaddr_len-UN_PATH_OFFSET); - fname[sockaddr_len-UN_PATH_OFFSET] = '\0'; - old_fs = get_fs(); - set_fs(get_ds()); - i = open_namei(fname, 2, S_IFSOCK, &inode, NULL); - set_fs(old_fs); - if (i < 0) - { - return(i); - } - - serv_upd = unix_data_lookup(&sockun, sockaddr_len, inode); - iput(inode); - if (!serv_upd) - { - return(-EINVAL); - } - - if ((i = sock_awaitconn(sock, serv_upd->socket, flags)) < 0) - { - return(i); - } - - if (sock->conn) - { - unix_data_ref(UN_DATA(sock->conn)); - UN_DATA(sock)->peerupd = UN_DATA(sock->conn); /* ref server */ - } - return(0); -} - - -/* - * To do a socketpair, we just connect the two datas, easy! - * Since we always wait on the socket inode, they're no contention - * for a wait area, and deadlock prevention in the case of a process - * writing to itself is, ignored, in true unix fashion! 
- */ - -static int unix_proto_socketpair(struct socket *sock1, struct socket *sock2) -{ - struct unix_proto_data *upd1 = UN_DATA(sock1), *upd2 = UN_DATA(sock2); - - unix_data_ref(upd1); - unix_data_ref(upd2); - upd1->peerupd = upd2; - upd2->peerupd = upd1; - return(0); -} - - -/* - * On accept, we ref the peer's data for safe writes. - */ - -static int unix_proto_accept(struct socket *sock, struct socket *newsock, int flags) -{ - struct socket *clientsock; - -/* - * If there aren't any sockets awaiting connection, - * then wait for one, unless nonblocking. - */ - - while(!(clientsock = sock->iconn)) - { - if (flags & O_NONBLOCK) - return(-EAGAIN); - sock->flags |= SO_WAITDATA; - interruptible_sleep_on(sock->wait); - sock->flags &= ~SO_WAITDATA; - if (current->signal & ~current->blocked) - { - return(-ERESTARTSYS); - } - } -/* - * Great. Finish the connection relative to server and client, - * wake up the client and return the new fd to the server. - */ - - sock->iconn = clientsock->next; - clientsock->next = NULL; - newsock->conn = clientsock; - clientsock->conn = newsock; - clientsock->state = SS_CONNECTED; - newsock->state = SS_CONNECTED; - unix_data_ref(UN_DATA(clientsock)); - UN_DATA(newsock)->peerupd = UN_DATA(clientsock); - UN_DATA(newsock)->sockaddr_un = UN_DATA(sock)->sockaddr_un; - UN_DATA(newsock)->sockaddr_len = UN_DATA(sock)->sockaddr_len; - wake_up_interruptible(clientsock->wait); - sock_wake_async(clientsock, 0); - return(0); -} - - -/* - * Gets the current name or the name of the connected socket. - */ - -static int unix_proto_getname(struct socket *sock, struct sockaddr *usockaddr, - int *usockaddr_len, int peer) -{ - struct unix_proto_data *upd; - int len; - - if (peer) - { - if (sock->state != SS_CONNECTED) - { - return(-EINVAL); - } - upd = UN_DATA(sock->conn); - } - else - upd = UN_DATA(sock); - - len = upd->sockaddr_len; - memcpy(usockaddr, &upd->sockaddr_un, len); - *usockaddr_len=len; - return(0); -} - - -/* - * We read from our own buf. 
- */ - -static int unix_proto_read(struct socket *sock, char *ubuf, int size, int nonblock) -{ - struct unix_proto_data *upd; - int todo, avail; - - if ((todo = size) <= 0) - return(0); - - upd = UN_DATA(sock); - while(!(avail = UN_BUF_AVAIL(upd))) - { - if (sock->state != SS_CONNECTED) - { - return((sock->state == SS_DISCONNECTING) ? 0 : -EINVAL); - } - if (nonblock) - return(-EAGAIN); - sock->flags |= SO_WAITDATA; - interruptible_sleep_on(sock->wait); - sock->flags &= ~SO_WAITDATA; - if (current->signal & ~current->blocked) - { - return(-ERESTARTSYS); - } - } - -/* - * Copy from the read buffer into the user's buffer, - * watching for wraparound. Then we wake up the writer. - */ - - unix_lock(upd); - do - { - int part, cando; - - if (avail <= 0) - { - printk("UNIX: read: AVAIL IS NEGATIVE!!!\n"); - send_sig(SIGKILL, current, 1); - return(-EPIPE); - } - - if ((cando = todo) > avail) - cando = avail; - if (cando >(part = BUF_SIZE - upd->bp_tail)) - cando = part; - memcpy_tofs(ubuf, upd->buf + upd->bp_tail, cando); - upd->bp_tail =(upd->bp_tail + cando) &(BUF_SIZE-1); - ubuf += cando; - todo -= cando; - if (sock->state == SS_CONNECTED) - { - wake_up_interruptible(sock->conn->wait); - sock_wake_async(sock->conn, 2); - } - avail = UN_BUF_AVAIL(upd); - } - while(todo && avail); - unix_unlock(upd); - return(size - todo); -} - - -/* - * We write to our peer's buf. When we connected we ref'd this - * peer so we are safe that the buffer remains, even after the - * peer has disconnected, which we check other ways. 
- */ - -static int unix_proto_write(struct socket *sock, char *ubuf, int size, int nonblock) -{ - struct unix_proto_data *pupd; - int todo, space; - - if ((todo = size) <= 0) - return(0); - if (sock->state != SS_CONNECTED) - { - if (sock->state == SS_DISCONNECTING) - { - send_sig(SIGPIPE, current, 1); - return(-EPIPE); - } - return(-EINVAL); - } - pupd = UN_DATA(sock)->peerupd; /* safer than sock->conn */ - - while(!(space = UN_BUF_SPACE(pupd))) - { - sock->flags |= SO_NOSPACE; - if (nonblock) - return(-EAGAIN); - sock->flags &= ~SO_NOSPACE; - interruptible_sleep_on(sock->wait); - if (current->signal & ~current->blocked) - { - return(-ERESTARTSYS); - } - if (sock->state == SS_DISCONNECTING) - { - send_sig(SIGPIPE, current, 1); - return(-EPIPE); - } - } - -/* - * Copy from the user's buffer to the write buffer, - * watching for wraparound. Then we wake up the reader. - */ - - unix_lock(pupd); - - do - { - int part, cando; - - if (space <= 0) - { - printk("UNIX: write: SPACE IS NEGATIVE!!!\n"); - send_sig(SIGKILL, current, 1); - return(-EPIPE); - } - - /* - * We may become disconnected inside this loop, so watch - * for it (peerupd is safe until we close). - */ - - if (sock->state == SS_DISCONNECTING) - { - send_sig(SIGPIPE, current, 1); - unix_unlock(pupd); - return(-EPIPE); - } - - if ((cando = todo) > space) - cando = space; - - if (cando >(part = BUF_SIZE - pupd->bp_head)) - cando = part; - - memcpy_fromfs(pupd->buf + pupd->bp_head, ubuf, cando); - pupd->bp_head =(pupd->bp_head + cando) &(BUF_SIZE-1); - ubuf += cando; - todo -= cando; - if (sock->state == SS_CONNECTED) - { - wake_up_interruptible(sock->conn->wait); - sock_wake_async(sock->conn, 1); - } - space = UN_BUF_SPACE(pupd); - } - while(todo && space); - - unix_unlock(pupd); - return(size - todo); -} - -/* - * Select on a unix domain socket. 
- */ - -static int unix_proto_select(struct socket *sock, int sel_type, select_table * wait) -{ - struct unix_proto_data *upd, *peerupd; - - /* - * Handle server sockets specially. - */ - if (sock->flags & SO_ACCEPTCON) - { - if (sel_type == SEL_IN) - { - if (sock->iconn) - return(1); - select_wait(sock->wait, wait); - return(sock->iconn ? 1 : 0); - } - select_wait(sock->wait, wait); - return(0); - } - - if (sel_type == SEL_IN) - { - upd = UN_DATA(sock); - if (UN_BUF_AVAIL(upd)) /* even if disconnected */ - return(1); - else if (sock->state != SS_CONNECTED) - { - return(1); - } - select_wait(sock->wait,wait); - return(0); - } - - if (sel_type == SEL_OUT) - { - if (sock->state != SS_CONNECTED) - { - return(1); - } - peerupd = UN_DATA(sock->conn); - if (UN_BUF_SPACE(peerupd) > 0) - return(1); - select_wait(sock->wait,wait); - return(0); - } - - /* - * Exceptions - SEL_EX - */ - - return(0); -} - - -/* - * ioctl() calls sent to an AF_UNIX socket - */ - -static int unix_proto_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) -{ - struct unix_proto_data *upd, *peerupd; - int er; - - upd = UN_DATA(sock); - peerupd = (sock->state == SS_CONNECTED) ? 
UN_DATA(sock->conn) : NULL; - - switch(cmd) - { - case TIOCINQ: - if (sock->flags & SO_ACCEPTCON) - return(-EINVAL); - er=verify_area(VERIFY_WRITE,(void *)arg, sizeof(unsigned long)); - if(er) - return er; - if (UN_BUF_AVAIL(upd) || peerupd) - put_fs_long(UN_BUF_AVAIL(upd),(unsigned long *)arg); - else - put_fs_long(0,(unsigned long *)arg); - break; - case TIOCOUTQ: - if (sock->flags & SO_ACCEPTCON) - return(-EINVAL); - er=verify_area(VERIFY_WRITE,(void *)arg, sizeof(unsigned long)); - if(er) - return er; - if (peerupd) - put_fs_long(UN_BUF_SPACE(peerupd),(unsigned long *)arg); - else - put_fs_long(0,(unsigned long *)arg); - break; - default: - return(-EINVAL); - } - return(0); -} - - -static struct proto_ops unix_proto_ops = { - AF_UNIX, - unix_proto_create, - unix_proto_dup, - unix_proto_release, - unix_proto_bind, - unix_proto_connect, - unix_proto_socketpair, - unix_proto_accept, - unix_proto_getname, - unix_proto_read, - unix_proto_write, - unix_proto_select, - unix_proto_ioctl, - unix_proto_listen, - unix_proto_send, - unix_proto_recv, - unix_proto_sendto, - unix_proto_recvfrom, - unix_proto_shutdown, - unix_proto_setsockopt, - unix_proto_getsockopt, - NULL /* unix_proto_fcntl */ -}; - -/* - * Initialise the Unix domain protocol. - */ - -void unix_proto_init(struct net_proto *pro) -{ - struct unix_proto_data *upd; - - /* - * Tell SOCKET that we are alive... - */ - - (void) sock_register(unix_proto_ops.family, &unix_proto_ops); - - for(upd = unix_datas; upd <= last_unix_data; ++upd) - { - upd->refcnt = 0; - } -} diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c new file mode 100644 index 000000000..b436aabb3 --- /dev/null +++ b/net/unix/sysctl_net_unix.c @@ -0,0 +1,13 @@ +/* -*- linux-c -*- + * sysctl_net_unix.c: sysctl interface to net af_unix subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/unix directory entry (empty =) ). 
[MS] + */ + +#include <linux/mm.h> +#include <linux/sysctl.h> + +ctl_table unix_table[] = { + {0} +}; |