diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/Config.in | 3 | ||||
-rw-r--r-- | net/ipv4/Makefile | 8 | ||||
-rw-r--r-- | net/ipv4/af_inet.c | 9 | ||||
-rw-r--r-- | net/ipv4/devinet.c | 35 | ||||
-rw-r--r-- | net/ipv4/fib_frontend.c | 2 | ||||
-rw-r--r-- | net/ipv4/fib_semantics.c | 2 | ||||
-rw-r--r-- | net/ipv4/icmp.c | 26 | ||||
-rw-r--r-- | net/ipv4/igmp.c | 5 | ||||
-rw-r--r-- | net/ipv4/ip_fw.c | 39 | ||||
-rw-r--r-- | net/ipv4/ip_input.c | 4 | ||||
-rw-r--r-- | net/ipv4/ip_masq.c | 123 | ||||
-rw-r--r-- | net/ipv4/ip_masq_mfw.c | 775 | ||||
-rw-r--r-- | net/ipv4/ip_masq_portfw.c | 21 | ||||
-rw-r--r-- | net/ipv4/ip_output.c | 47 | ||||
-rw-r--r-- | net/ipv4/ipconfig.c | 646 | ||||
-rw-r--r-- | net/ipv4/ipmr.c | 3 | ||||
-rw-r--r-- | net/ipv4/route.c | 4 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 13 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 20 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 163 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 21 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 47 | ||||
-rw-r--r-- | net/ipv4/tcp_timer.c | 12 |
23 files changed, 1380 insertions, 648 deletions
diff --git a/net/ipv4/Config.in b/net/ipv4/Config.in index e0379e69b..8e4b3945e 100644 --- a/net/ipv4/Config.in +++ b/net/ipv4/Config.in @@ -47,6 +47,7 @@ if [ "$CONFIG_IP_FIREWALL" = "y" ]; then if [ "$CONFIG_IP_MASQUERADE_MOD" = "y" ]; then tristate 'IP: ipautofw masq support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_IPAUTOFW tristate 'IP: ipportfw masq support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_IPPORTFW + tristate 'IP: ip fwmark masq-forwarding support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_MFW fi fi fi @@ -71,7 +72,7 @@ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then bool 'IP: ARP daemon support (EXPERIMENTAL)' CONFIG_ARPD fi fi -bool 'IP: TCP syncookie support (not enabled per default) ' CONFIG_SYN_COOKIES +bool 'IP: TCP syncookie support (not enabled per default)' CONFIG_SYN_COOKIES comment '(it is safe to leave these untouched)' #bool 'IP: PC/TCP compatibility mode' CONFIG_INET_PCTCP tristate 'IP: Reverse ARP' CONFIG_INET_RARP diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index ad2a0a650..8ab280deb 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -81,6 +81,14 @@ ifeq ($(CONFIG_IP_MASQUERADE_MOD),y) endif endif + ifeq ($(CONFIG_IP_MASQUERADE_MFW),y) + IPV4_OBJS += ip_masq_mfw.o + else + ifeq ($(CONFIG_IP_MASQUERADE_MFW),m) + M_OBJS += ip_masq_mfw.o + endif + endif + endif M_OBJS += ip_masq_user.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 54a4578ca..3520b0c52 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -5,7 +5,7 @@ * * PF_INET protocol family socket handler. * - * Version: $Id: af_inet.c,v 1.80 1998/11/08 11:17:03 davem Exp $ + * Version: $Id: af_inet.c,v 1.82 1999/01/04 20:36:44 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -190,8 +190,9 @@ static __inline__ void kill_sk_later(struct sock *sk) * [PR] */ - printk(KERN_DEBUG "Socket destroy delayed (r=%d w=%d)\n", - atomic_read(&sk->rmem_alloc), atomic_read(&sk->wmem_alloc)); + NETDEBUG(printk(KERN_DEBUG "Socket destroy delayed (r=%d w=%d)\n", + atomic_read(&sk->rmem_alloc), + atomic_read(&sk->wmem_alloc))); sk->destroy = 1; sk->ack_backlog = 0; @@ -1059,7 +1060,7 @@ __initfunc(void inet_proto_init(struct net_proto *pro)) struct sk_buff *dummy_skb; struct inet_protocol *p; - printk(KERN_INFO "Swansea University Computer Society TCP/IP for NET3.037\n"); + printk(KERN_INFO "NET4: Linux TCP/IP 1.0 for NET4.0\n"); if (sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)) { diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index ac7c04432..b1aa1a04e 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1,7 +1,7 @@ /* * NET3 IP device support routines. * - * Version: $Id: devinet.c,v 1.23 1998/08/26 12:03:21 davem Exp $ + * Version: $Id: devinet.c,v 1.25 1999/01/04 20:14:33 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -990,39 +990,6 @@ static void devinet_sysctl_unregister(struct ipv4_devconf *p) } #endif -#ifdef CONFIG_IP_PNP_BOOTP - -/* - * Addition and deletion of fake interface addresses - * for sending of BOOTP packets. In this case, we must - * set the local address to zero which is not permitted - * otherwise. - */ - -__initfunc(int inet_add_bootp_addr(struct device *dev)) -{ - struct in_device *in_dev = dev->ip_ptr; - struct in_ifaddr *ifa; - - if (!in_dev && !(in_dev = inetdev_init(dev))) - return -ENOBUFS; - if (!(ifa = inet_alloc_ifa())) - return -ENOBUFS; - ifa->ifa_dev = in_dev; - in_dev->ifa_list = ifa; - rtmsg_ifa(RTM_NEWADDR, ifa); - notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); - return 0; -} - -__initfunc(void inet_del_bootp_addr(struct device *dev)) -{ - if (dev->ip_ptr) - inetdev_destroy(dev->ip_ptr); -} - -#endif - __initfunc(void devinet_init(void)) { register_gifconf(PF_INET, inet_gifconf); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 013a4ba9a..a3585cc0c 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -5,7 +5,7 @@ * * IPv4 Forwarding Information Base: FIB frontend. * - * Version: $Id: fib_frontend.c,v 1.12 1998/08/26 12:03:24 davem Exp $ + * Version: $Id: fib_frontend.c,v 1.14 1999/01/04 20:13:55 davem Exp $ * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c77ecc251..7bff36095 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -984,7 +984,7 @@ void fib_node_get_info(int type, int dead, struct fib_info *fi, u32 prefix, u32 flags, 0, 0, 0, mask, 0, 0, 0); } - memset(buffer+len, 0, 127-len); + memset(buffer+len, ' ', 127-len); buffer[127] = '\n'; } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index af1bb4a44..5ac2d9a53 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -3,7 +3,7 @@ * * Alan Cox, <alan@cymru.net> * - * Version: $Id: icmp.c,v 1.47 1998/10/21 05:32:24 davem Exp $ + * Version: $Id: icmp.c,v 1.48 1999/01/02 16:51:41 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -50,6 +50,8 @@ * Yu Tianli : Fixed two ugly bugs in icmp_send * - IP option length was accounted wrongly * - ICMP header length was not accounted at all. + * Tristan Greaves : Added sysctl option to ignore bogus broadcast + * responses from broken routers. * * To Fix: * @@ -311,6 +313,9 @@ struct icmp_err icmp_err_convert[] = { int sysctl_icmp_echo_ignore_all = 0; int sysctl_icmp_echo_ignore_broadcasts = 0; +/* Control parameter - ignore bogus broadcast responses? */ +int sysctl_icmp_ignore_bogus_error_responses =0; + /* * ICMP control array. This specifies what to do with each ICMP. */ @@ -701,16 +706,19 @@ static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb, int len) * first check your netmask matches at both ends, if it does then * get the other vendor to fix their kit. */ - - if (inet_addr_type(iph->daddr) == RTN_BROADCAST) + + if (!sysctl_icmp_ignore_bogus_error_responses) { - if (net_ratelimit()) - printk(KERN_WARNING "%s sent an invalid ICMP error to a broadcast.\n", - in_ntoa(skb->nh.iph->saddr)); - return; + + if (inet_addr_type(iph->daddr) == RTN_BROADCAST) + { + if (net_ratelimit()) + printk(KERN_WARNING "%s sent an invalid ICMP error to a broadcast.\n", + in_ntoa(skb->nh.iph->saddr)); + return; + } } - /* * Deliver ICMP message to raw sockets. Pretty useless feature? */ @@ -886,8 +894,10 @@ static void icmp_timestamp(struct icmphdr *icmph, struct sk_buff *skb, int len) static void icmp_address(struct icmphdr *icmph, struct sk_buff *skb, int len) { +#if 0 if (net_ratelimit()) printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n"); +#endif } /* diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index af49104b3..b0e7b6d01 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -8,7 +8,7 @@ * the older version didn't come out right using gcc 2.5.8, the newer one * seems to fall out with gcc 2.6.2. * - * Version: $Id: igmp.c,v 1.27 1998/08/26 12:03:39 davem Exp $ + * Version: $Id: igmp.c,v 1.28 1998/11/30 15:53:13 davem Exp $ * * Authors: * Alan Cox <Alan.Cox@linux.org> @@ -538,6 +538,7 @@ static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr) /* * Join a socket to a group */ +int sysctl_igmp_max_memberships = IP_MAX_MEMBERSHIPS; int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) { @@ -578,7 +579,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) count++; } err = -ENOBUFS; - if (iml == NULL || count >= IP_MAX_MEMBERSHIPS) + if (iml == NULL || count >= sysctl_igmp_max_memberships) goto done; memcpy(&iml->multi, imr, sizeof(*imr)); iml->next = sk->ip_mc_list; diff --git a/net/ipv4/ip_fw.c b/net/ipv4/ip_fw.c index 5044e7b45..cf2731df1 100644 --- a/net/ipv4/ip_fw.c +++ b/net/ipv4/ip_fw.c @@ -29,6 +29,9 @@ * 1-May-1998: Remove caching of device pointer. * 12-May-1998: Allow tiny fragment case for TCP/UDP. * 15-May-1998: Treat short packets as fragments, don't just block. + * 3-Jan-1999: Fixed serious procfs security hole -- users should never + * be allowed to view the chains! + * Marc Santoro <ultima@snicker.emoti.com> */ /* @@ -60,7 +63,6 @@ #include <linux/sched.h> #include <linux/string.h> #include <linux/errno.h> -#include <linux/config.h> #include <linux/socket.h> #include <linux/sockios.h> @@ -115,8 +117,8 @@ * UP. * * For backchains and counters, we use an array, indexed by - * [smp_processor_id()*2 + !in_interrupt()]; the array is of size - * [smp_num_cpus*2]. For v2.0, smp_num_cpus is effectively 1. So, + * [cpu_number_map[smp_processor_id()]*2 + !in_interrupt()]; the array is of + * size [smp_num_cpus*2]. For v2.0, smp_num_cpus is effectively 1. So, * confident of uniqueness, we modify counters even though we only * have a read lock (to read the counters, you need a write lock, * though). */ @@ -140,7 +142,11 @@ static struct sock *ipfwsk; #endif -#define SLOT_NUMBER() (smp_processor_id()*2 + !in_interrupt()) +#ifdef __SMP__ +#define SLOT_NUMBER() (cpu_number_map[smp_processor_id()]*2 + !in_interrupt()) +#else +#define SLOT_NUMBER() (!in_interrupt()) +#endif #define NUM_SLOTS (smp_num_cpus*2) #define SIZEOF_STRUCT_IP_CHAIN (sizeof(struct ip_chain) \ @@ -505,7 +511,7 @@ static void cleanup(struct ip_chain *chain, printk("%s\n",chain->label); } -static inline void +static inline int ip_fw_domatch(struct ip_fwkernel *f, struct iphdr *ip, const char *rif, @@ -546,9 +552,15 @@ ip_fw_domatch(struct ip_fwkernel *f, len-(sizeof(__u32)*2+IFNAMSIZ)); netlink_broadcast(ipfwsk, outskb, 0, ~0, GFP_KERNEL); } - else duprintf("netlink post failed - alloc_skb failed!\n"); + else { + if (net_ratelimit()) + printk(KERN_WARNING "ip_fw: packet drop due to " + "netlink failure\n"); + return 0; + } } #endif + return 1; } /* @@ -691,9 +703,13 @@ ip_fw_check(struct iphdr *ip, for (; f; f = f->next) { if (ip_rule_match(f,rif,ip, tcpsyn,src_port,dst_port,offset)) { - if (!testing) - ip_fw_domatch(f, ip, rif, chain->label, skb, - slot, src_port,dst_port); + if (!testing + && !ip_fw_domatch(f, ip, rif, chain->label, + skb, slot, + src_port, dst_port)) { + ret = FW_BLOCK; + goto out; + } break; } } @@ -755,6 +771,7 @@ ip_fw_check(struct iphdr *ip, } } while (ret == FW_SKIP+2); + out: if (!testing) FWC_READ_UNLOCK(&ip_fw_lock); /* Recalculate checksum if not going to reject, and TOS changed. */ @@ -1667,13 +1684,13 @@ struct firewall_ops ipfw_ops= #ifdef CONFIG_PROC_FS static struct proc_dir_entry proc_net_ipfwchains_chain = { PROC_NET_IPFW_CHAINS, sizeof(IP_FW_PROC_CHAINS)-1, - IP_FW_PROC_CHAINS, S_IFREG | S_IRUGO | S_IWUSR, 1, 0, 0, + IP_FW_PROC_CHAINS, S_IFREG | S_IRUSR | S_IWUSR, 1, 0, 0, 0, &proc_net_inode_operations, ip_chain_procinfo }; static struct proc_dir_entry proc_net_ipfwchains_chainnames = { PROC_NET_IPFW_CHAIN_NAMES, sizeof(IP_FW_PROC_CHAIN_NAMES)-1, - IP_FW_PROC_CHAIN_NAMES, S_IFREG | S_IRUGO | S_IWUSR, 1, 0, 0, + IP_FW_PROC_CHAIN_NAMES, S_IFREG | S_IRUSR | S_IWUSR, 1, 0, 0, 0, &proc_net_inode_operations, ip_chain_name_procinfo }; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 260d178f1..fbbfbbfc6 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) module. * - * Version: $Id: ip_input.c,v 1.34 1998/10/03 09:37:23 davem Exp $ + * Version: $Id: ip_input.c,v 1.35 1999/01/12 14:32:48 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -503,7 +503,9 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) { int fwres; u16 rport; +#ifdef CONFIG_IP_ROUTE_TOS u8 tos = iph->tos; +#endif if ((fwres=call_in_firewall(PF_INET, skb->dev, iph, &rport, &skb))<FW_ACCEPT) { if (fwres==FW_REJECT) diff --git a/net/ipv4/ip_masq.c b/net/ipv4/ip_masq.c index 7a57caeb0..154e70686 100644 --- a/net/ipv4/ip_masq.c +++ b/net/ipv4/ip_masq.c @@ -4,7 +4,7 @@ * * Copyright (c) 1994 Pauline Middelink * - * $Id: ip_masq.c,v 1.28 1998/11/21 00:33:30 davem Exp $ + * $Id: ip_masq.c,v 1.33 1999/01/15 06:45:17 davem Exp $ * * * See ip_fw.c for original log @@ -44,6 +44,8 @@ * Juan Jose Ciarlante : fixed stupid SMP locking bug * Juan Jose Ciarlante : fixed "tap"ing in demasq path by copy-on-w * Juan Jose Ciarlante : make masq_proto_doff() robust against fake sized/corrupted packets + * Kai Bankett : do not toss other IP protos in proto_doff() + * Dan Kegel : pointed correct NAT behavior for UDP streams * */ @@ -391,6 +393,20 @@ EXPORT_SYMBOL(ip_masq_expire); struct ip_fw_masq *ip_masq_expire = &ip_masq_dummy; #endif +/* + * These flags enable non-strict d{addr,port} checks + * Given that both (in/out) lookup tables are hashed + * by m{addr,port} and s{addr,port} this is quite easy + */ + +#define MASQ_DADDR_PASS (IP_MASQ_F_NO_DADDR|IP_MASQ_F_DLOOSE) +#define MASQ_DPORT_PASS (IP_MASQ_F_NO_DPORT|IP_MASQ_F_DLOOSE) + +/* + * By default enable dest loose semantics + */ +#define CONFIG_IP_MASQ_LOOSE_DEFAULT 1 + /* * Set masq expiration (deletion) and adds timer, @@ -522,12 +538,12 @@ static struct ip_masq * __ip_masq_in_get(int protocol, __u32 s_addr, __u16 s_por hash = ip_masq_hash_key(protocol, d_addr, d_port); - for(ms = ip_masq_m_tab[hash]; ms ; ms = ms->m_link) { - if (protocol==ms->protocol && - ((s_addr==ms->daddr || ms->flags & IP_MASQ_F_NO_DADDR)) && - (s_port==ms->dport || ms->flags & IP_MASQ_F_NO_DPORT) && - (d_addr==ms->maddr && d_port==ms->mport)) { + if (protocol==ms->protocol && + (d_addr==ms->maddr && d_port==ms->mport) && + (s_addr==ms->daddr || ms->flags & MASQ_DADDR_PASS) && + (s_port==ms->dport || ms->flags & MASQ_DPORT_PASS) + ) { IP_MASQ_DEBUG(2, "look/in %d %08X:%04hX->%08X:%04hX OK\n", protocol, s_addr, @@ -578,7 +594,9 @@ static struct ip_masq * __ip_masq_out_get(int protocol, __u32 s_addr, __u16 s_po for(ms = ip_masq_s_tab[hash]; ms ; ms = ms->s_link) { if (protocol == ms->protocol && s_addr == ms->saddr && s_port == ms->sport && - d_addr == ms->daddr && d_port == ms->dport ) { + (d_addr==ms->daddr || ms->flags & MASQ_DADDR_PASS) && + (d_port==ms->dport || ms->flags & MASQ_DPORT_PASS) + ) { IP_MASQ_DEBUG(2, "lk/out1 %d %08X:%04hX->%08X:%04hX OK\n", protocol, s_addr, @@ -600,7 +618,9 @@ static struct ip_masq * __ip_masq_out_get(int protocol, __u32 s_addr, __u16 s_po if (ms->flags & IP_MASQ_F_NO_SPORT && protocol == ms->protocol && s_addr == ms->saddr && - d_addr == ms->daddr && d_port == ms->dport ) { + (d_addr==ms->daddr || ms->flags & MASQ_DADDR_PASS) && + (d_port==ms->dport || ms->flags & MASQ_DPORT_PASS) + ) { IP_MASQ_DEBUG(2, "lk/out2 %d %08X:%04hX->%08X:%04hX OK\n", protocol, s_addr, @@ -623,7 +643,7 @@ out: return ms; } -#ifdef CONFIG_IP_MASQUERADE_NREUSE +#ifdef CONFIG_IP_MASQ_NREUSE /* * Returns ip_masq for given proto,m_addr,m_port. * called by allocation routine to find an unused m_port. @@ -841,7 +861,15 @@ struct ip_masq * ip_masq_new(int proto, __u32 maddr, __u16 mport, __u32 saddr, _ atomic_set(&ms->refcnt,0); if (proto == IPPROTO_UDP && !mport) +#ifdef CONFIG_IP_MASQ_LOOSE_DEFAULT + /* + * Flag this tunnel as "dest loose" + * + */ + ms->flags |= IP_MASQ_F_DLOOSE; +#else ms->flags |= IP_MASQ_F_NO_DADDR; +#endif /* get masq address from rif */ @@ -916,7 +944,7 @@ struct ip_masq * ip_masq_new(int proto, __u32 maddr, __u16 mport, __u32 saddr, _ else write_lock(&__ip_masq_lock); -#ifdef CONFIG_IP_MASQUERADE_NREUSE +#ifdef CONFIG_IP_MASQ_NREUSE mst = __ip_masq_getbym(proto, maddr, mport); #else mst = __ip_masq_in_get(proto, daddr, dport, maddr, mport); @@ -966,6 +994,9 @@ mport_nono: /* * Get transport protocol data offset, check against size + * return: + * 0 if other IP proto + * -1 if error */ static __inline__ int proto_doff(unsigned proto, char *th, unsigned size) { @@ -993,6 +1024,9 @@ static __inline__ int proto_doff(unsigned proto, char *th, unsigned size) } break; + default: + /* Other proto: nothing to say, by now :) */ + ret = 0; } if (ret < 0) IP_MASQ_DEBUG(0, "mess proto_doff for proto=%d, size =%d\n", @@ -1024,11 +1058,16 @@ int ip_fw_masquerade(struct sk_buff **skb_p, __u32 maddr) h.raw = (char*) iph + iph->ihl * 4; size = ntohs(iph->tot_len) - (iph->ihl * 4); + doff = proto_doff(iph->protocol, h.raw, size); - if (doff < 0) { - IP_MASQ_DEBUG(0, "O-pkt invalid packet data size\n"); + if (doff <= 0) { + /* + * Output path: do not pass other IP protos nor + * invalid packets. + */ return -1; } + switch (iph->protocol) { case IPPROTO_ICMP: return(ip_fw_masq_icmp(skb_p, maddr)); @@ -1131,6 +1170,13 @@ int ip_fw_masquerade(struct sk_buff **skb_p, __u32 maddr) IP_MASQ_DEBUG(1, "ip_fw_masquerade(): filled sport=%d\n", ntohs(ms->sport)); } + if (ms->flags & IP_MASQ_F_DLOOSE) { + /* + * update dest loose values + */ + ms->dport = h.portp[1]; + ms->daddr = iph->daddr; + } } else { /* * Nope, not found, create a new entry for it @@ -1431,8 +1477,8 @@ int ip_fw_masq_icmp(struct sk_buff **skb_p, __u32 maddr) if (ip_compute_csum((unsigned char *) icmph, len)) { /* Failed checksum! */ - IP_MASQ_WARNING( "forward ICMP: failed checksum from %d.%d.%d.%d!\n", - NIPQUAD(iph->saddr)); + IP_MASQ_DEBUG(0, "forward ICMP: failed checksum from %d.%d.%d.%d!\n", + NIPQUAD(iph->saddr)); return(-1); } @@ -1632,7 +1678,8 @@ int ip_fw_demasq_icmp(struct sk_buff **skb_p) return -1; } ciph = (struct iphdr *) (icmph + 1); - + cicmph = (struct icmphdr *)((char *)ciph + + (ciph->ihl<<2)); /* Now we do real damage to this packet...! */ /* First change the dest IP address, and recalc checksum */ iph->daddr = ms->saddr; @@ -1707,6 +1754,7 @@ int ip_fw_demasq_icmp(struct sk_buff **skb_p) return -1; } ciph = (struct iphdr *) (icmph + 1); + pptr = (__u16 *)&(((char *)ciph)[ciph->ihl*4]); /* Now we do real damage to this packet...! */ /* First change the dest IP address, and recalc checksum */ @@ -1776,9 +1824,17 @@ int ip_fw_demasquerade(struct sk_buff **skb_p) size = ntohs(iph->tot_len) - (iph->ihl * 4); doff = proto_doff(iph->protocol, h.raw, size); - if (doff < 0) { - IP_MASQ_DEBUG(0, "I-pkt invalid packet data size\n"); - return -1; + + switch (doff) { + case 0: + /* + * Input path: other IP protos Ok, will + * reach local sockets path. + */ + return 0; + case -1: + IP_MASQ_DEBUG(0, "I-pkt invalid packet data size\n"); + return -1; } maddr = iph->daddr; @@ -1870,10 +1926,18 @@ int ip_fw_demasquerade(struct sk_buff **skb_p) */ ms->flags &= ~IP_MASQ_F_NO_REPLY; - /* - * Set dport if not defined yet. + /* + * Set daddr,dport if not defined yet + * and tunnel is not setup as "dest loose" */ + if (ms->flags & IP_MASQ_F_DLOOSE) { + /* + * update dest loose values + */ + ms->dport = h.portp[0]; + ms->daddr = iph->saddr; + } else { if ( ms->flags & IP_MASQ_F_NO_DPORT ) { /* && ms->protocol == IPPROTO_TCP ) { */ ms->flags &= ~IP_MASQ_F_NO_DPORT; ms->dport = h.portp[0]; @@ -1890,6 +1954,7 @@ int ip_fw_demasquerade(struct sk_buff **skb_p) ntohl(ms->daddr)); } + } if ((skb=masq_skb_cow(skb_p, &iph, &h.raw)) == NULL) { ip_masq_put(ms); return -1; @@ -2232,13 +2297,6 @@ void ip_masq_proc_unregister(struct proc_dir_entry *ent) proc_unregister(proc_net_ip_masq, ent->low_ino); } -/* - * Wrapper over inet_select_addr() - */ -u32 ip_masq_select_addr(struct device *dev, u32 dst, int scope) -{ - return inet_select_addr(dev, dst, scope); -} __initfunc(static void masq_proc_init(void)) { @@ -2257,6 +2315,13 @@ __initfunc(static void masq_proc_init(void)) } } #endif /* CONFIG_PROC_FS */ +/* + * Wrapper over inet_select_addr() + */ +u32 ip_masq_select_addr(struct device *dev, u32 dst, int scope) +{ + return inet_select_addr(dev, dst, scope); +} /* * Initialize ip masquerading @@ -2309,8 +2374,8 @@ __initfunc(int ip_masq_init(void)) #ifdef CONFIG_IP_MASQUERADE_IPPORTFW ip_portfw_init(); #endif -#ifdef CONFIG_IP_MASQUERADE_IPMARKFW - ip_markfw_init(); +#ifdef CONFIG_IP_MASQUERADE_MFW + ip_mfw_init(); #endif ip_masq_app_init(); diff --git a/net/ipv4/ip_masq_mfw.c b/net/ipv4/ip_masq_mfw.c new file mode 100644 index 000000000..e3903c0cb --- /dev/null +++ b/net/ipv4/ip_masq_mfw.c @@ -0,0 +1,775 @@ +/* + * IP_MASQ_MARKFW masquerading module + * + * Does (reverse-masq) forwarding based on skb->fwmark value + * + * $Id: ip_masq_mfw.c,v 1.2 1998/12/12 02:40:42 davem Exp $ + * + * Author: Juan Jose Ciarlante <jjciarla@raiz.uncu.edu.ar> + * based on Steven Clarke's portfw + * + * Fixes: + * JuanJo Ciarlante: added u-space sched support + * JuanJo Ciarlante: if rport==0, use packet dest port *grin* + * JuanJo Ciarlante: fixed tcp syn&&!ack creation + * + * + */ +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/list.h> +#include <net/ip.h> +#include <linux/ip_fw.h> +#include <linux/ip_masq.h> +#include <net/ip_masq.h> +#include <net/ip_masq_mod.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <asm/softirq.h> +#include <asm/spinlock.h> +#include <asm/atomic.h> + +static struct ip_masq_mod *mmod_self = NULL; +#ifdef CONFIG_IP_MASQ_DEBUG +static int debug=0; +MODULE_PARM(debug, "i"); +#endif + +/* + * Lists structure: + * There is a "main" linked list with entries hashed + * by fwmark value (struct ip_masq_mfw, the "m-entries"). + * + * Each of this m-entry holds a double linked list + * of "forward-to" hosts (struct ip_masq_mfw_host, the "m.host"), + * the round-robin scheduling takes place by rotating m.host entries + * "inside" its m-entry. + */ + +/* + * Each forwarded host (addr:port) is stored here + */ +struct ip_masq_mfw_host { + struct list_head list; + __u32 addr; + __u16 port; + __u16 pad0; + __u32 fwmark; + int pref; + atomic_t pref_cnt; +}; + +#define IP_MASQ_MFW_HSIZE 16 +/* + * This entries are indexed by fwmark, + * they hold a list of forwarded addr:port + */ + +struct ip_masq_mfw { + struct ip_masq_mfw *next; /* linked list */ + __u32 fwmark; /* key: firewall mark */ + struct list_head hosts; /* list of forward-to hosts */ + atomic_t nhosts; /* number of "" */ +#ifdef __SMP__ + rwlock_t lock; +#endif +}; + + +static struct semaphore mfw_sema = MUTEX; +#ifdef __SMP__ +static rwlock_t mfw_lock = RW_LOCK_UNLOCKED; +#endif + +static struct ip_masq_mfw *ip_masq_mfw_table[IP_MASQ_MFW_HSIZE]; + +static __inline__ int mfw_hash_val(int fwmark) +{ + return fwmark & 0x0f; +} + +/* + * Get m-entry by "fwmark" + * Caller must lock tables. + */ + +static struct ip_masq_mfw *__mfw_get(int fwmark) +{ + struct ip_masq_mfw* mfw; + int hash = mfw_hash_val(fwmark); + + for (mfw=ip_masq_mfw_table[hash];mfw;mfw=mfw->next) { + if (mfw->fwmark==fwmark) { + goto out; + } + } +out: + return mfw; +} + +/* + * Links m-entry. + * Caller should have checked if already present for same fwmark + * + * Caller must lock tables. + */ +static int __mfw_add(struct ip_masq_mfw *mfw) +{ + int fwmark = mfw->fwmark; + int hash = mfw_hash_val(fwmark); + + mfw->next = ip_masq_mfw_table[hash]; + ip_masq_mfw_table[hash] = mfw; + ip_masq_mod_inc_nent(mmod_self); + + return 0; +} + +/* + * Creates a m-entry (doesn't link it) + */ + +static struct ip_masq_mfw * mfw_new(int fwmark) +{ + struct ip_masq_mfw *mfw; + + mfw = kmalloc(sizeof(*mfw), GFP_KERNEL); + if (mfw == NULL) + goto out; + + MOD_INC_USE_COUNT; + memset(mfw, 0, sizeof(*mfw)); + mfw->fwmark = fwmark; +#ifdef __SMP__ + mfw->lock = (rwlock_t) RW_LOCK_UNLOCKED; +#endif + + INIT_LIST_HEAD(&mfw->hosts); +out: + return mfw; +} + +static void mfw_host_to_user(struct ip_masq_mfw_host *h, struct ip_mfw_user *mu) +{ + mu->raddr = h->addr; + mu->rport = h->port; + mu->fwmark = h->fwmark; + mu->pref = h->pref; +} + +/* + * Creates a m.host (doesn't link it in a m-entry) + */ +static struct ip_masq_mfw_host * mfw_host_new(struct ip_mfw_user *mu) +{ + struct ip_masq_mfw_host * mfw_host; + mfw_host = kmalloc(sizeof (*mfw_host), GFP_KERNEL); + if (!mfw_host) + return NULL; + + MOD_INC_USE_COUNT; + memset(mfw_host, 0, sizeof(*mfw_host)); + mfw_host->addr = mu->raddr; + mfw_host->port = mu->rport; + mfw_host->fwmark = mu->fwmark; + mfw_host->pref = mu->pref; + atomic_set(&mfw_host->pref_cnt, mu->pref); + + return mfw_host; +} + +/* + * Create AND link m.host to m-entry. + * It locks m.lock. + */ +static int mfw_addhost(struct ip_masq_mfw *mfw, struct ip_mfw_user *mu, int attail) +{ + struct ip_masq_mfw_host *mfw_host; + + mfw_host = mfw_host_new(mu); + if (!mfw_host) + return -ENOMEM; + + write_lock_bh(&mfw->lock); + list_add(&mfw_host->list, attail? mfw->hosts.prev : &mfw->hosts); + atomic_inc(&mfw->nhosts); + write_unlock_bh(&mfw->lock); + + return 0; +} + +/* + * Unlink AND destroy m.host(s) from m-entry. + * Wildcard (nul host or addr) ok. + * It uses m.lock. + */ +static int mfw_delhost(struct ip_masq_mfw *mfw, struct ip_mfw_user *mu) +{ + + struct list_head *l,*e; + struct ip_masq_mfw_host *h; + int n_del = 0; + l = &mfw->hosts; + + write_lock_bh(&mfw->lock); + for (e=l->next; e!=l; e=e->next) + { + h = list_entry(e, struct ip_masq_mfw_host, list); + if ((!mu->raddr || h->addr == mu->raddr) && + (!mu->rport || h->port == mu->rport)) { + /* HIT */ + atomic_dec(&mfw->nhosts); + list_del(&h->list); + kfree_s(h, sizeof(*h)); + MOD_DEC_USE_COUNT; + n_del++; + } + + } + write_unlock_bh(&mfw->lock); + return n_del? 0 : -ESRCH; +} + +/* + * Changes m.host parameters + * Wildcards ok + * + * Caller must lock tables. + */ +static int __mfw_edithost(struct ip_masq_mfw *mfw, struct ip_mfw_user *mu) +{ + + struct list_head *l,*e; + struct ip_masq_mfw_host *h; + int n_edit = 0; + l = &mfw->hosts; + + for (e=l->next; e!=l; e=e->next) + { + h = list_entry(e, struct ip_masq_mfw_host, list); + if ((!mu->raddr || h->addr == mu->raddr) && + (!mu->rport || h->port == mu->rport)) { + /* HIT */ + h->pref = mu->pref; + atomic_set(&h->pref_cnt, mu->pref); + n_edit++; + } + + } + return n_edit? 0 : -ESRCH; +} + +/* + * Destroys m-entry. + * Caller must have checked that it doesn't hold any m.host(s) + */ +static void mfw_destroy(struct ip_masq_mfw *mfw) +{ + kfree_s(mfw, sizeof(*mfw)); + MOD_DEC_USE_COUNT; +} + +/* + * Unlink m-entry. + * + * Caller must lock tables. + */ +static int __mfw_del(struct ip_masq_mfw *mfw) +{ + struct ip_masq_mfw **mfw_p; + int ret = -EINVAL; + + + for(mfw_p=&ip_masq_mfw_table[mfw_hash_val(mfw->fwmark)]; + *mfw_p; + mfw_p = &((*mfw_p)->next)) + { + if (mfw==(*mfw_p)) { + *mfw_p = mfw->next; + ip_masq_mod_dec_nent(mmod_self); + ret = 0; + goto out; + } + } +out: + return ret; +} + +/* + * Crude m.host scheduler + * This interface could be exported to allow playing with + * other sched policies. + * + * Caller must lock m-entry. + */ +static struct ip_masq_mfw_host * __mfw_sched(struct ip_masq_mfw *mfw, int force) +{ + struct ip_masq_mfw_host *h = NULL; + + if (atomic_read(&mfw->nhosts) == 0) + goto out; + + /* + * Here resides actual sched policy: + * When pref_cnt touches 0, entry gets shifted to tail and + * its pref_cnt reloaded from h->pref (actual value + * passed from u-space). + * + * Exception is pref==0: avoid scheduling. + */ + + h = list_entry(mfw->hosts.next, struct ip_masq_mfw_host, list); + + if (atomic_read(&mfw->nhosts) <= 1) + goto out; + + if ((h->pref && atomic_dec_and_test(&h->pref_cnt)) || force) { + atomic_set(&h->pref_cnt, h->pref); + list_del(&h->list); + list_add(&h->list, mfw->hosts.prev); + } +out: + return h; +} + +/* + * Main lookup routine. + * HITs fwmark and schedules m.host entries if required + */ +static struct ip_masq_mfw_host * mfw_lookup(int fwmark) +{ + struct ip_masq_mfw *mfw; + struct ip_masq_mfw_host *h = NULL; + + read_lock(&mfw_lock); + mfw = __mfw_get(fwmark); + + if (mfw) { + write_lock(&mfw->lock); + h = __mfw_sched(mfw, 0); + write_unlock(&mfw->lock); + } + + read_unlock(&mfw_lock); + return h; +} + +#ifdef CONFIG_PROC_FS +static int mfw_procinfo(char *buffer, char **start, off_t offset, + int length, int dummy) +{ + struct ip_masq_mfw *mfw; + struct ip_masq_mfw_host *h; + struct list_head *l,*e; + off_t pos=0, begin; + char temp[129]; + int idx = 0; + int len=0; + + MOD_INC_USE_COUNT; + + IP_MASQ_DEBUG(1-debug, "Entered mfw_info\n"); + + if (offset < 64) + { + sprintf(temp, "FwMark > RAddr RPort PrCnt Pref"); + len = sprintf(buffer, "%-63s\n", temp); + } + pos = 64; + + for(idx = 0; idx < IP_MASQ_MFW_HSIZE; idx++) + { + read_lock(&mfw_lock); + for(mfw = ip_masq_mfw_table[idx]; mfw ; mfw = mfw->next) + { + read_lock_bh(&mfw->lock); + l=&mfw->hosts; + + for(e=l->next;l!=e;e=e->next) { + h = list_entry(e, struct ip_masq_mfw_host, list); + pos += 64; + if (pos <= offset) { + len = 0; + continue; + } + + sprintf(temp,"0x%x > %08lX %5u %5d %5d", + h->fwmark, + ntohl(h->addr), ntohs(h->port), + atomic_read(&h->pref_cnt), h->pref); + len += sprintf(buffer+len, "%-63s\n", temp); + + if(len >= length) { + read_unlock_bh(&mfw->lock); + read_unlock(&mfw_lock); + goto done; + } + } + read_unlock_bh(&mfw->lock); + } + read_unlock(&mfw_lock); + } + +done: + + if (len) { + begin = len - (pos - offset); + *start = buffer + begin; + len -= begin; + } + if(len>length) + len = length; + MOD_DEC_USE_COUNT; + return len; +} +static struct proc_dir_entry mfw_proc_entry = { +/* 0, 0, NULL", */ + 0, 3, "mfw", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + mfw_procinfo +}; + +#define proc_ent &mfw_proc_entry +#else /* !CONFIG_PROC_FS */ + +#define proc_ent NULL +#endif + + +static void mfw_flush(void) +{ + struct ip_masq_mfw *mfw, *local_table[IP_MASQ_MFW_HSIZE]; + struct ip_masq_mfw_host *h; + struct ip_masq_mfw *mfw_next; + int idx; + struct list_head *l,*e; + + write_lock_bh(&mfw_lock); + memcpy(local_table, ip_masq_mfw_table, sizeof ip_masq_mfw_table); + memset(ip_masq_mfw_table, 0, sizeof ip_masq_mfw_table); + write_unlock_bh(&mfw_lock); + + /* + * For every hash table row ... + */ + for(idx=0;idx<IP_MASQ_MFW_HSIZE;idx++) { + + /* + * For every m-entry in row ... + */ + for(mfw=local_table[idx];mfw;mfw=mfw_next) { + /* + * For every m.host in m-entry ... + */ + l=&mfw->hosts; + while((e=l->next) != l) { + h = list_entry(e, struct ip_masq_mfw_host, list); + atomic_dec(&mfw->nhosts); + list_del(&h->list); + kfree_s(h, sizeof(*h)); + MOD_DEC_USE_COUNT; + } + + if (atomic_read(&mfw->nhosts)) { + IP_MASQ_ERR("mfw_flush(): after flushing row nhosts=%d\n", + atomic_read(&mfw->nhosts)); + } + mfw_next = mfw->next; + kfree_s(mfw, sizeof(*mfw)); + MOD_DEC_USE_COUNT; + ip_masq_mod_dec_nent(mmod_self); + } + } +} + +/* + * User space control entry point + */ +static int mfw_ctl(int optname, struct ip_masq_ctl *mctl, int optlen) +{ + struct ip_mfw_user *mu = &mctl->u.mfw_user; + struct ip_masq_mfw *mfw; + int ret = EINVAL; + int arglen = optlen - IP_MASQ_CTL_BSIZE; + int cmd; + + + IP_MASQ_DEBUG(1-debug, "ip_masq_user_ctl(len=%d/%d|%d/%d)\n", + arglen, + sizeof (*mu), + optlen, + sizeof (*mctl)); + + /* + * checks ... + */ + if (arglen != sizeof(*mu) && optlen != sizeof(*mctl)) + return -EINVAL; + + /* + * Don't trust the lusers - plenty of error checking! + */ + cmd = mctl->m_cmd; + IP_MASQ_DEBUG(1-debug, "ip_masq_mfw_ctl(cmd=%d, fwmark=%d)\n", + cmd, mu->fwmark); + + + switch(cmd) { + case IP_MASQ_CMD_NONE: + return 0; + case IP_MASQ_CMD_FLUSH: + break; + case IP_MASQ_CMD_ADD: + case IP_MASQ_CMD_INSERT: + case IP_MASQ_CMD_SET: + if (mu->fwmark == 0) { + IP_MASQ_DEBUG(1-debug, "invalid fwmark==0\n"); + return -EINVAL; + } + if (mu->pref < 0) { + IP_MASQ_DEBUG(1-debug, "invalid pref==%d\n", + mu->pref); + return -EINVAL; + } + break; + } + + + ret = -EINVAL; + + switch(cmd) { + case IP_MASQ_CMD_ADD: + case IP_MASQ_CMD_INSERT: + if (!mu->raddr) { + IP_MASQ_DEBUG(0-debug, "ip_masq_mfw_ctl(ADD): invalid redirect 0x%x:%d\n", + mu->raddr, mu->rport); + goto out; + } + + /* + * Cannot just use mfw_lock because below + * are allocations that can sleep; so + * to assure "new entry" atomic creation + * I use a semaphore. + * + */ + down(&mfw_sema); + + read_lock(&mfw_lock); + mfw = __mfw_get(mu->fwmark); + read_unlock(&mfw_lock); + + /* + * If first host, create m-entry + */ + if (mfw == NULL) { + mfw = mfw_new(mu->fwmark); + if (mfw == NULL) + ret = -ENOMEM; + } + + if (mfw) { + /* + * Put m.host in m-entry. + */ + ret = mfw_addhost(mfw, mu, cmd == IP_MASQ_CMD_ADD); + + /* + * If first host, link m-entry to hash table. + * Already protected by global lock. + */ + if (ret == 0 && atomic_read(&mfw->nhosts) == 1) { + write_lock_bh(&mfw_lock); + __mfw_add(mfw); + write_unlock_bh(&mfw_lock); + } + if (atomic_read(&mfw->nhosts) == 0) { + mfw_destroy(mfw); + } + } + + up(&mfw_sema); + + break; + + case IP_MASQ_CMD_DEL: + down(&mfw_sema); + + read_lock(&mfw_lock); + mfw = __mfw_get(mu->fwmark); + read_unlock(&mfw_lock); + + if (mfw) { + ret = mfw_delhost(mfw, mu); + + /* + * Last lease will free + * XXX check logic XXX + */ + if (atomic_read(&mfw->nhosts) == 0) { + write_lock_bh(&mfw_lock); + __mfw_del(mfw); + write_unlock_bh(&mfw_lock); + mfw_destroy(mfw); + } + } else + ret = -ESRCH; + + up(&mfw_sema); + break; + case IP_MASQ_CMD_FLUSH: + + down(&mfw_sema); + mfw_flush(); + up(&mfw_sema); + ret = 0; + break; + case IP_MASQ_CMD_SET: + /* + * No need to semaphorize here, main list is not + * modified. + */ + read_lock(&mfw_lock); + + mfw = __mfw_get(mu->fwmark); + if (mfw) { + write_lock_bh(&mfw->lock); + + if (mu->flags & IP_MASQ_MFW_SCHED) { + struct ip_masq_mfw_host *h; + if ((h=__mfw_sched(mfw, 1))) { + mfw_host_to_user(h, mu); + ret = 0; + } + } else { + ret = __mfw_edithost(mfw, mu); + } + + write_unlock_bh(&mfw->lock); + } + + read_unlock(&mfw_lock); + break; + } +out: + + return ret; +} + +/* + * Module stubs called from ip_masq core module + */ + +/* + * Input rule stub, called very early for each incoming packet, + * to see if this module has "interest" in packet. + */ +static int mfw_in_rule(const struct sk_buff *skb, const struct iphdr *iph) +{ + int val; + read_lock(&mfw_lock); + val = ( __mfw_get(skb->fwmark) != 0); + read_unlock(&mfw_lock); + return val; +} + +/* + * Input-create stub, called to allow "custom" masq creation + */ +static struct ip_masq * mfw_in_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr) +{ + union ip_masq_tphdr tph; + struct ip_masq *ms = NULL; + struct ip_masq_mfw_host *h = NULL; + + tph.raw = (char*) iph + iph->ihl * 4; + + switch (iph->protocol) { + case IPPROTO_TCP: + /* + * Only open TCP tunnel if SYN+!ACK packet + */ + if (!tph.th->syn && tph.th->ack) + return NULL; + case IPPROTO_UDP: + break; + default: + return NULL; + } + + /* + * If no entry exists in the masquerading table + * and the port is involved + * in port forwarding, create a new masq entry + */ + + if ((h=mfw_lookup(skb->fwmark))) { + ms = ip_masq_new(iph->protocol, + iph->daddr, tph.portp[1], + /* if no redir-port, use packet dest port */ + h->addr, h->port? h->port : tph.portp[1], + iph->saddr, tph.portp[0], + 0); + + if (ms != NULL) + ip_masq_listen(ms); + } + return ms; +} + + +#define mfw_in_update NULL +#define mfw_out_rule NULL +#define mfw_out_create NULL +#define mfw_out_update NULL + +static struct ip_masq_mod mfw_mod = { + NULL, /* next */ + NULL, /* next_reg */ + "mfw", /* name */ + ATOMIC_INIT(0), /* nent */ + ATOMIC_INIT(0), /* refcnt */ + proc_ent, + mfw_ctl, + NULL, /* masq_mod_init */ + NULL, /* masq_mod_done */ + mfw_in_rule, + mfw_in_update, + mfw_in_create, + mfw_out_rule, + mfw_out_update, + mfw_out_create, +}; + + +__initfunc(int ip_mfw_init(void)) +{ + return register_ip_masq_mod ((mmod_self=&mfw_mod)); +} + +int ip_mfw_done(void) +{ + return unregister_ip_masq_mod(&mfw_mod); +} + +#ifdef MODULE +EXPORT_NO_SYMBOLS; + +int init_module(void) +{ + if (ip_mfw_init() != 0) + return -EIO; + return 0; +} + +void cleanup_module(void) +{ + if (ip_mfw_done() != 0) + printk(KERN_INFO "can't remove module"); +} + +#endif /* MODULE */ diff --git a/net/ipv4/ip_masq_portfw.c b/net/ipv4/ip_masq_portfw.c index 4384d9cf6..ad2667401 100644 --- a/net/ipv4/ip_masq_portfw.c +++ b/net/ipv4/ip_masq_portfw.c @@ -2,7 +2,7 @@ * IP_MASQ_PORTFW masquerading module * * - * $Id: ip_masq_portfw.c,v 1.2 1998/08/29 23:51:11 davem Exp $ + * $Id: ip_masq_portfw.c,v 1.3 1998/12/08 05:42:12 davem Exp $ * * Author: Steven Clarke <steven.clarke@monmouth.demon.co.uk> * @@ -269,15 +269,18 @@ static __inline__ int portfw_ctl(int optname, struct ip_masq_ctl *mctl, int optl IP_MASQ_DEBUG(1-debug, "ip_masq_portfw_ctl(cmd=%d)\n", cmd); - if (cmd != IP_MASQ_CMD_FLUSH) { - if (htons(mm->lport) < IP_PORTFW_PORT_MIN - || htons(mm->lport) > IP_PORTFW_PORT_MAX) - return EINVAL; - - if (mm->protocol!=IPPROTO_TCP && mm->protocol!=IPPROTO_UDP) - return EINVAL; - } + switch (cmd) { + case IP_MASQ_CMD_NONE: + return 0; + case IP_MASQ_CMD_FLUSH: + break; + default: + if (htons(mm->lport) < IP_PORTFW_PORT_MIN || htons(mm->lport) > IP_PORTFW_PORT_MAX) + return EINVAL; + if (mm->protocol!=IPPROTO_TCP && mm->protocol!=IPPROTO_UDP) + return EINVAL; + } switch(cmd) { case IP_MASQ_CMD_ADD: diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 5edfbef93..ce027c374 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) output module. * - * Version: $Id: ip_output.c,v 1.63 1998/10/03 09:37:30 davem Exp $ + * Version: $Id: ip_output.c,v 1.64 1999/01/04 20:05:33 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -35,6 +35,9 @@ * Andi Kleen : Split fast and slow ip_build_xmit path * for decreased register pressure on x86 * and more readibility. + * Marc Boucher : When call_out_firewall returns FW_QUEUE, + * silently abort send instead of failing + * with -EPERM. */ #include <asm/uaccess.h> @@ -128,8 +131,10 @@ void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, dev = rt->u.dst.dev; +#ifdef CONFIG_FIREWALL if (call_out_firewall(PF_INET, dev, iph, NULL, &skb) < FW_ACCEPT) goto drop; +#endif ip_send_check(iph); @@ -137,8 +142,10 @@ void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, skb->dst->output(skb); return; +#ifdef CONFIG_FIREWALL drop: kfree_skb(skb); +#endif } int __ip_finish_output(struct sk_buff *skb) @@ -284,8 +291,10 @@ void ip_queue_xmit(struct sk_buff *skb) dev = rt->u.dst.dev; +#ifdef CONFIG_FIREWALL if (call_out_firewall(PF_INET, dev, iph, NULL, &skb) < FW_ACCEPT) goto drop; +#endif /* This can happen when the transport layer has segments queued * with a cached route, and by the time we get here things are @@ -461,7 +470,7 @@ int ip_build_xmit_slow(struct sock *sk, id = htons(ip_id_count++); /* - * Being outputting the bytes. + * Begin outputting the bytes. */ do { @@ -546,9 +555,19 @@ int ip_build_xmit_slow(struct sock *sk, * Account for the fragment. */ - if(!err && - call_out_firewall(PF_INET, rt->u.dst.dev, skb->nh.iph, NULL, &skb) < FW_ACCEPT) - err = -EPERM; +#ifdef CONFIG_FIREWALL + if(!err) { + int fw_res; + + fw_res = call_out_firewall(PF_INET, rt->u.dst.dev, skb->nh.iph, NULL, &skb); + if(fw_res == FW_QUEUE) { + kfree_skb(skb); + skb = NULL; + } else if(fw_res < FW_ACCEPT) { + err = -EPERM; + } + } +#endif if (err) { ip_statistics.IpOutDiscards++; @@ -564,7 +583,7 @@ int ip_build_xmit_slow(struct sock *sk, nfrags++; err = 0; - if (rt->u.dst.output(skb)) { + if (skb && rt->u.dst.output(skb)) { err = -ENETDOWN; ip_statistics.IpOutDiscards++; break; @@ -663,8 +682,20 @@ int ip_build_xmit(struct sock *sk, if (err) err = -EFAULT; - if(!err && call_out_firewall(PF_INET, rt->u.dst.dev, iph, NULL, &skb) < FW_ACCEPT) - err = -EPERM; +#ifdef CONFIG_FIREWALL + if(!err) { + int fw_res; + + fw_res = call_out_firewall(PF_INET, rt->u.dst.dev, iph, NULL, &skb); + if(fw_res == FW_QUEUE) { + /* re-queued elsewhere; silently abort this send */ + kfree_skb(skb); + return 0; + } + if(fw_res < FW_ACCEPT) + err = -EPERM; + } +#endif if (err) { kfree_skb(skb); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index db1d7fc3f..94e64eec6 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1,13 +1,17 @@ /* - * $Id: ipconfig.c,v 1.16 1998/10/21 22:27:26 davem Exp $ + * $Id: ipconfig.c,v 1.19 1999/01/15 06:54:00 davem Exp $ * * Automatic Configuration of IP -- use BOOTP or RARP or user-supplied * information to configure own IP address and routes. * - * Copyright (C) 1996, 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz> + * Copyright (C) 1996--1998 Martin Mares <mj@atrey.karlin.mff.cuni.cz> * * Derived from network configuration code in fs/nfs/nfsroot.c, * originally Copyright (C) 1995, 1996 Gero Kuhlmann and me. + * + * BOOTP rewritten to construct and analyse packets itself instead + * of misusing the IP layer. num_bugs_causing_wrong_arp_replies--; + * -- MJ, December 1998 */ #include <linux/config.h> @@ -21,22 +25,20 @@ #include <linux/in.h> #include <linux/if.h> #include <linux/inet.h> -#include <linux/net.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/socket.h> -#include <linux/inetdevice.h> #include <linux/route.h> -#include <net/route.h> -#include <net/sock.h> +#include <linux/udp.h> #include <net/arp.h> -#include <net/ip_fib.h> +#include <net/ip.h> #include <net/ipconfig.h> #include <asm/segment.h> #include <asm/uaccess.h> +#include <asm/checksum.h> /* Define this to allow debugging output */ #undef IPCONFIG_DEBUG @@ -60,8 +62,6 @@ u32 ic_myaddr __initdata = INADDR_NONE; /* My IP address */ u32 ic_servaddr __initdata = INADDR_NONE; /* Server IP address */ u32 ic_gateway __initdata = INADDR_NONE; /* Gateway IP address */ u32 ic_netmask __initdata = INADDR_NONE; /* Netmask for local subnet */ -int ic_bootp_flag __initdata = 1; /* Use BOOTP */ -int ic_rarp_flag __initdata = 1; /* Use RARP */ int ic_enable __initdata = 1; /* Automatic IP configuration enabled */ int ic_host_name_set __initdata = 0; /* Host name configured manually */ int ic_set_manually __initdata = 0; /* IPconfig parameters set manually */ @@ -73,13 +73,24 @@ u8 root_server_path[256] __initdata = { 0, }; /* Path to mount as root */ #define CONFIG_IP_PNP_DYNAMIC -static int ic_got_reply __initdata = 0; +static int ic_proto_enabled __initdata = 0 /* Protocols enabled */ +#ifdef CONFIG_IP_PNP_BOOTP + | IC_BOOTP +#endif +#ifdef CONFIG_IP_PNP_RARP + | IC_RARP +#endif + ; +static int ic_got_reply __initdata = 0; /* Protocol(s) we got reply from */ + +#else -#define IC_GOT_BOOTP 1 -#define IC_GOT_RARP 2 +static int ic_proto_enabled __initdata = 0; #endif +static int ic_proto_have_if __initdata = 0; + /* * Network devices */ @@ -88,14 +99,13 @@ struct ic_device { struct ic_device *next; struct device *dev; unsigned short flags; + int able; }; static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */ static struct device *ic_dev __initdata = NULL; /* Selected device */ -static int bootp_dev_count __initdata = 0; /* BOOTP capable devices */ -static int rarp_dev_count __initdata = 0; /* RARP capable devices */ -__initfunc(int ic_open_devs(void)) +static int __init ic_open_devs(void) { struct ic_device *d, **last; struct device *dev; @@ -103,10 +113,20 @@ __initfunc(int ic_open_devs(void)) last = &ic_first_dev; for (dev = dev_base; dev; dev = dev->next) - if (dev->type < ARPHRD_SLIP && - !(dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) && - strncmp(dev->name, "dummy", 5) && - (!user_dev_name[0] || !strcmp(dev->name, user_dev_name))) { + if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : + (!(dev->flags & IFF_LOOPBACK) && + (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) && + strncmp(dev->name, "dummy", 5))) { + int able = 0; + if (dev->mtu >= 364) + able |= IC_BOOTP; + else + printk(KERN_WARNING "BOOTP: Ignoring device %s, MTU %d too small", dev->name, dev->mtu); + if (!(dev->flags & IFF_NOARP)) + able |= IC_RARP; + able &= ic_proto_enabled; + if (ic_proto_enabled && !able) + continue; oflags = dev->flags; if (dev_change_flags(dev, oflags | IFF_UP) < 0) { printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name); @@ -118,14 +138,13 @@ __initfunc(int ic_open_devs(void)) *last = d; last = &d->next; d->flags = oflags; - bootp_dev_count++; - if (!(dev->flags & IFF_NOARP)) - rarp_dev_count++; - DBG(("IP-Config: Opened %s\n", dev->name)); + d->able = able; + ic_proto_have_if |= able; + DBG(("IP-Config: Opened %s (able=%d)\n", dev->name, able)); } *last = NULL; - if (!bootp_dev_count) { + if (!ic_first_dev) { if (user_dev_name[0]) printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name); else @@ -135,7 +154,7 @@ __initfunc(int ic_open_devs(void)) return 0; } -__initfunc(void ic_close_devs(void)) +static void __init ic_close_devs(void) { struct ic_device *d, *next; struct device *dev; @@ -164,7 +183,7 @@ set_sockaddr(struct sockaddr_in *sin, u32 addr, u16 port) sin->sin_port = port; } -__initfunc(static int ic_dev_ioctl(unsigned int cmd, struct ifreq *arg)) +static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg) { int res; @@ -175,7 +194,7 @@ __initfunc(static int ic_dev_ioctl(unsigned int cmd, struct ifreq *arg)) return res; } -__initfunc(static int ic_route_ioctl(unsigned int cmd, struct rtentry *arg)) +static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg) { int res; @@ -190,7 +209,7 @@ __initfunc(static int ic_route_ioctl(unsigned int cmd, struct rtentry *arg)) * Set up interface addresses and routes. */ -__initfunc(static int ic_setup_if(void)) +static int __init ic_setup_if(void) { struct ifreq ir; struct sockaddr_in *sin = (void *) &ir.ifr_ifru.ifru_addr; @@ -216,7 +235,7 @@ __initfunc(static int ic_setup_if(void)) return 0; } -__initfunc(int ic_setup_routes(void)) +static int __init ic_setup_routes(void) { /* No need to setup device routes, only the default route... */ @@ -246,7 +265,7 @@ __initfunc(int ic_setup_routes(void)) * Fill in default values for all missing parameters. */ -__initfunc(int ic_defaults(void)) +static int __init ic_defaults(void) { /* * At this point we have no userspace running so need not @@ -270,6 +289,7 @@ __initfunc(int ic_defaults(void)) printk(KERN_ERR "IP-Config: Unable to guess netmask for address %08x\n", ic_myaddr); return -1; } + printk("IP-Config: Guessing netmask %s\n", in_ntoa(ic_netmask)); } return 0; @@ -281,25 +301,22 @@ __initfunc(int ic_defaults(void)) #ifdef CONFIG_IP_PNP_RARP -static int ic_rarp_recv(struct sk_buff *skb, struct device *dev, - struct packet_type *pt); +static int ic_rarp_recv(struct sk_buff *skb, struct device *dev, struct packet_type *pt); static struct packet_type rarp_packet_type __initdata = { - 0, /* Should be: __constant_htons(ETH_P_RARP) - * - but this _doesn't_ come out constant! */ + __constant_htons(ETH_P_RARP), NULL, /* Listen to all devices */ ic_rarp_recv, NULL, NULL }; -__initfunc(static void ic_rarp_init(void)) +static inline void ic_rarp_init(void) { - rarp_packet_type.type = htons(ETH_P_RARP); dev_add_pack(&rarp_packet_type); } -__initfunc(static void ic_rarp_cleanup(void)) +static inline void ic_rarp_cleanup(void) { dev_remove_pack(&rarp_packet_type); } @@ -307,14 +324,18 @@ __initfunc(static void ic_rarp_cleanup(void)) /* * Process received RARP packet. */ -__initfunc(static int -ic_rarp_recv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)) +static int __init +ic_rarp_recv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) { struct arphdr *rarp = (struct arphdr *)skb->h.raw; unsigned char *rarp_ptr = (unsigned char *) (rarp + 1); unsigned long sip, tip; unsigned char *sha, *tha; /* s for "source", t for "target" */ + /* If we already have a reply, just drop the packet */ + if (ic_got_reply) + goto drop; + /* If this test doesn't pass, it's not IP, or we should ignore it anyway */ if (rarp->ar_hln != dev->addr_len || dev->type != ntohs(rarp->ar_hrd)) goto drop; @@ -346,7 +367,7 @@ ic_rarp_recv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)) /* Victory! The packet is what we were looking for! */ if (!ic_got_reply) { - ic_got_reply = IC_GOT_RARP; + ic_got_reply = IC_RARP; ic_dev = dev; if (ic_myaddr == INADDR_NONE) ic_myaddr = tip; @@ -363,16 +384,16 @@ drop: /* * Send RARP request packet over all devices which allow RARP. */ -__initfunc(static void ic_rarp_send(void)) +static void __init ic_rarp_send(void) { struct ic_device *d; - for (d=ic_first_dev; d; d=d->next) { - struct device *dev = d->dev; - if (!(dev->flags & IFF_NOARP)) + for (d=ic_first_dev; d; d=d->next) + if (d->able & IC_RARP) { + struct device *dev = d->dev; arp_send(ARPOP_RREQUEST, ETH_P_RARP, 0, dev, 0, NULL, dev->dev_addr, dev->dev_addr); - } + } } #endif @@ -383,10 +404,9 @@ __initfunc(static void ic_rarp_send(void)) #ifdef CONFIG_IP_PNP_BOOTP -static struct socket *ic_bootp_xmit_sock __initdata = NULL; /* BOOTP send socket */ -static struct socket *ic_bootp_recv_sock __initdata = NULL; /* BOOTP receive socket */ - struct bootp_pkt { /* BOOTP packet format */ + struct iphdr iph; /* IP header */ + struct udphdr udph; /* UDP header */ u8 op; /* 1=request, 2=reply */ u8 htype; /* HW address type */ u8 hlen; /* HW address length */ @@ -407,240 +427,23 @@ struct bootp_pkt { /* BOOTP packet format */ #define BOOTP_REQUEST 1 #define BOOTP_REPLY 2 -static struct bootp_pkt *ic_xmit_bootp __initdata = NULL; /* Packet being transmitted */ -static struct bootp_pkt *ic_recv_bootp __initdata = NULL; /* Packet being received */ - -/* - * Dirty tricks for BOOTP packet routing. We replace the standard lookup function - * for the local fib by our version which does fake lookups and returns our private - * fib entries. Ugly, but it seems to be the simplest way to do the job. - */ - -static void *ic_old_local_lookup __initdata = NULL; /* Old local routing table lookup function */ -static struct fib_info *ic_bootp_tx_fib __initdata = NULL; /* Our fake fib entries */ -static struct fib_info *ic_bootp_rx_fib __initdata = NULL; - -__initfunc(static int ic_bootp_route_lookup(struct fib_table *tb, const struct rt_key *key, - struct fib_result *res)) -{ - static u32 ic_brl_zero = 0; - - DBG(("BOOTP: Route lookup: %d:%08x -> %d:%08x: ", key->iif, key->src, key->oif, key->dst)); - res->scope = RT_SCOPE_UNIVERSE; - res->prefix = &ic_brl_zero; - res->prefixlen = 0; - res->nh_sel = 0; - if (key->src == 0 && key->dst == 0xffffffff && key->iif == loopback_dev.ifindex) { /* Packet output */ - DBG(("Output\n")); - res->type = RTN_UNICAST; - res->fi = ic_bootp_tx_fib; - } else if (key->iif && key->iif != loopback_dev.ifindex && key->oif == 0) { /* Packet input */ - DBG(("Input\n")); - res->type = RTN_LOCAL; - res->fi = ic_bootp_rx_fib; - } else if (!key->iif && !key->oif && !key->src) { /* Address check by inet_addr_type() */ - DBG(("Check\n")); - res->type = RTN_UNICAST; - res->fi = ic_bootp_tx_fib; - } else { - DBG(("Drop\n")); - return -EINVAL; - } - return 0; -} - -__initfunc(static int ic_set_bootp_route(struct ic_device *d)) -{ - struct fib_info *f = ic_bootp_tx_fib; - struct fib_nh *n = &f->fib_nh[0]; - - n->nh_dev = d->dev; - n->nh_oif = n->nh_dev->ifindex; - rt_cache_flush(0); - return 0; -} - -__initfunc(static int ic_bootp_route_init(void)) -{ - int size = sizeof(struct fib_info) + sizeof(struct fib_nh); - struct fib_info *rf, *tf; - struct fib_nh *nh; - - if (!(rf = ic_bootp_rx_fib = kmalloc(size, GFP_KERNEL)) || - !(tf = ic_bootp_tx_fib = kmalloc(size, GFP_KERNEL))) - return -1; - - memset(rf, 0, size); - rf->fib_nhs = 1; - nh = &rf->fib_nh[0]; - nh->nh_scope = RT_SCOPE_UNIVERSE; - - memset(tf, 0, size); - rf->fib_nhs = 1; - nh = &rf->fib_nh[0]; - nh->nh_dev = ic_first_dev->dev; - nh->nh_scope = RT_SCOPE_UNIVERSE; - nh->nh_oif = nh->nh_dev->ifindex; - - /* Dirty trick: replace standard routing table lookup by our function */ - ic_old_local_lookup = local_table->tb_lookup; - local_table->tb_lookup = ic_bootp_route_lookup; - - return 0; -} - -__initfunc(static void ic_bootp_route_cleanup(void)) -{ - if (ic_old_local_lookup) - local_table->tb_lookup = ic_old_local_lookup; - if (ic_bootp_rx_fib) - kfree_s(ic_bootp_rx_fib, sizeof(struct fib_info) + sizeof(struct fib_nh)); - if (ic_bootp_tx_fib) - kfree_s(ic_bootp_tx_fib, sizeof(struct fib_info) + sizeof(struct fib_nh)); -} - - -/* - * Allocation and freeing of BOOTP packet buffers. - */ -__initfunc(static int ic_bootp_alloc(void)) -{ - if (!(ic_xmit_bootp = kmalloc(sizeof(struct bootp_pkt), GFP_KERNEL)) || - !(ic_recv_bootp = kmalloc(sizeof(struct bootp_pkt), GFP_KERNEL))) { - printk(KERN_ERR "BOOTP: Out of memory!\n"); - return -1; - } - return 0; -} - -__initfunc(static void ic_bootp_free(void)) -{ - if (ic_xmit_bootp) { - kfree_s(ic_xmit_bootp, sizeof(struct bootp_pkt)); - ic_xmit_bootp = NULL; - } - if (ic_recv_bootp) { - kfree_s(ic_recv_bootp, sizeof(struct bootp_pkt)); - ic_recv_bootp = NULL; - } -} - - -/* - * Add / Remove fake interface addresses for BOOTP packet sending. - */ -__initfunc(static int ic_bootp_addrs_add(void)) -{ - struct ic_device *d; - int err; - - for(d=ic_first_dev; d; d=d->next) - if ((err = inet_add_bootp_addr(d->dev)) < 0) { - printk(KERN_ERR "BOOTP: Unable to set interface address\n"); - return -1; - } - return 0; -} - -__initfunc(static void ic_bootp_addrs_del(void)) -{ - struct ic_device *d; - - for(d=ic_first_dev; d; d=d->next) - inet_del_bootp_addr(d->dev); -} - -/* - * UDP socket operations. - */ -__initfunc(static int ic_udp_open(struct socket **sock)) -{ - int err; - - if ((err = sock_create(PF_INET, SOCK_DGRAM, IPPROTO_UDP, sock)) < 0) - printk(KERN_ERR "BOOTP: Cannot open UDP socket!\n"); - return err; -} - -static inline void ic_udp_close(struct socket *sock) -{ - if (sock) - sock_release(sock); -} - -__initfunc(static int ic_udp_connect(struct socket *sock, u32 addr, u16 port)) -{ - struct sockaddr_in sa; - int err; - - set_sockaddr(&sa, htonl(addr), htons(port)); - err = sock->ops->connect(sock, (struct sockaddr *) &sa, sizeof(sa), 0); - if (err < 0) { - printk(KERN_ERR "BOOTP: connect() failed (%d)\n", err); - return -1; - } - return 0; -} - -__initfunc(static int ic_udp_bind(struct socket *sock, u32 addr, u16 port)) -{ - struct sockaddr_in sa; - int err; - - set_sockaddr(&sa, htonl(addr), htons(port)); - err = sock->ops->bind(sock, (struct sockaddr *) &sa, sizeof(sa)); - if (err < 0) { - printk(KERN_ERR "BOOTP: bind() failed (%d)\n", err); - return -1; - } - return 0; -} - -__initfunc(static int ic_udp_send(struct socket *sock, void *buf, int size)) -{ - mm_segment_t oldfs; - int result; - struct msghdr msg; - struct iovec iov; - - oldfs = get_fs(); - set_fs(get_ds()); - iov.iov_base = buf; - iov.iov_len = size; - memset(&msg, 0, sizeof(msg)); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - result = sock_sendmsg(sock, &msg, size); - set_fs(oldfs); - - return (result != size); -} +static u32 ic_bootp_xid; -__initfunc(static int ic_udp_recv(struct socket *sock, void *buf, int size)) -{ - mm_segment_t oldfs; - int result; - struct msghdr msg; - struct iovec iov; +static int ic_bootp_recv(struct sk_buff *skb, struct device *dev, struct packet_type *pt); - oldfs = get_fs(); - set_fs(get_ds()); - iov.iov_base = buf; - iov.iov_len = size; - memset(&msg, 0, sizeof(msg)); - msg.msg_flags = MSG_DONTWAIT; - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - result = sock_recvmsg(sock, &msg, size, MSG_DONTWAIT); - set_fs(oldfs); - return result; -} +static struct packet_type bootp_packet_type __initdata = { + __constant_htons(ETH_P_IP), + NULL, /* Listen to all devices */ + ic_bootp_recv, + NULL, + NULL +}; /* * Initialize BOOTP extension fields in the request. */ -__initfunc(static void ic_bootp_init_ext(u8 *e)) +static void __init ic_bootp_init_ext(u8 *e) { *e++ = 99; /* RFC1048 Magic Cookie */ *e++ = 130; @@ -668,96 +471,95 @@ __initfunc(static void ic_bootp_init_ext(u8 *e)) /* * Initialize the BOOTP mechanism. */ -__initfunc(static int ic_bootp_init(void)) +static inline void ic_bootp_init(void) { - /* Allocate memory for BOOTP packets */ - if (ic_bootp_alloc() < 0) - return -1; - - /* Add fake zero addresses to all interfaces */ - if (ic_bootp_addrs_add() < 0) - return -1; - - /* Initialize BOOTP routing */ - if (ic_bootp_route_init() < 0) - return -1; - - /* Initialize common portion of BOOTP request */ - memset(ic_xmit_bootp, 0, sizeof(struct bootp_pkt)); - ic_xmit_bootp->op = BOOTP_REQUEST; - get_random_bytes(&ic_xmit_bootp->xid, sizeof(ic_xmit_bootp->xid)); - ic_bootp_init_ext(ic_xmit_bootp->vendor_area); - - DBG(("BOOTP: XID=%08x\n", ic_xmit_bootp->xid)); - - /* Open the sockets */ - if (ic_udp_open(&ic_bootp_xmit_sock) || - ic_udp_open(&ic_bootp_recv_sock)) - return -1; - - /* Bind/connect the sockets */ - ic_bootp_xmit_sock->sk->broadcast = 1; - ic_bootp_xmit_sock->sk->reuse = 1; - ic_bootp_recv_sock->sk->reuse = 1; - ic_set_bootp_route(ic_first_dev); - if (ic_udp_bind(ic_bootp_recv_sock, INADDR_ANY, 68) || - ic_udp_bind(ic_bootp_xmit_sock, INADDR_ANY, 68) || - ic_udp_connect(ic_bootp_xmit_sock, INADDR_BROADCAST, 67)) - return -1; - - return 0; + get_random_bytes(&ic_bootp_xid, sizeof(u32)); + DBG(("BOOTP: XID=%08x\n", ic_bootp_xid)); + dev_add_pack(&bootp_packet_type); } /* * BOOTP cleanup. */ -__initfunc(static void ic_bootp_cleanup(void)) +static inline void ic_bootp_cleanup(void) { - ic_udp_close(ic_bootp_xmit_sock); - ic_udp_close(ic_bootp_recv_sock); - ic_bootp_addrs_del(); - ic_bootp_free(); - ic_bootp_route_cleanup(); + dev_remove_pack(&bootp_packet_type); } /* * Send BOOTP request to single interface. */ -__initfunc(static int ic_bootp_send_if(struct ic_device *d, u32 jiffies)) +static void __init ic_bootp_send_if(struct ic_device *d, u32 jiffies) { struct device *dev = d->dev; - struct bootp_pkt *b = ic_xmit_bootp; - + struct sk_buff *skb; + struct bootp_pkt *b; + int hh_len = (dev->hard_header_len + 15) & ~15; + struct iphdr *h; + + /* Allocate packet */ + skb = alloc_skb(sizeof(struct bootp_pkt) + hh_len + 15, GFP_KERNEL); + if (!skb) + return; + skb_reserve(skb, hh_len); + b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt)); + memset(b, 0, sizeof(struct bootp_pkt)); + + /* Construct IP header */ + skb->nh.iph = h = &b->iph; + h->version = 4; + h->ihl = 5; + h->tot_len = htons(sizeof(struct bootp_pkt)); + h->frag_off = htons(IP_DF); + h->ttl = 64; + h->protocol = IPPROTO_UDP; + h->daddr = INADDR_BROADCAST; + h->check = ip_fast_csum((unsigned char *) h, h->ihl); + + /* Construct UDP header */ + b->udph.source = htons(68); + b->udph.dest = htons(67); + b->udph.len = htons(sizeof(struct bootp_pkt) - sizeof(struct iphdr)); + /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */ + + /* Construct BOOTP header */ + b->op = BOOTP_REQUEST; b->htype = dev->type; b->hlen = dev->addr_len; - memset(b->hw_addr, 0, sizeof(b->hw_addr)); memcpy(b->hw_addr, dev->dev_addr, dev->addr_len); b->secs = htons(jiffies / HZ); - ic_set_bootp_route(d); - return ic_udp_send(ic_bootp_xmit_sock, b, sizeof(struct bootp_pkt)); + b->xid = ic_bootp_xid; + ic_bootp_init_ext(b->vendor_area); + + /* Chain packet down the line... */ + skb->dev = dev; + skb->protocol = __constant_htons(ETH_P_IP); + if ((dev->hard_header && + dev->hard_header(skb, dev, ntohs(skb->protocol), dev->broadcast, dev->dev_addr, skb->len) < 0) || + dev_queue_xmit(skb) < 0) + printk("E"); } /* * Send BOOTP requests to all interfaces. */ -__initfunc(static int ic_bootp_send(u32 jiffies)) +static void __init ic_bootp_send(u32 jiffies) { struct ic_device *d; for(d=ic_first_dev; d; d=d->next) - if (ic_bootp_send_if(d, jiffies) < 0) - return -1; - return 0; + if (d->able & IC_BOOTP) + ic_bootp_send_if(d, jiffies); } /* * Copy BOOTP-supplied string if not already set. */ -__initfunc(static int ic_bootp_string(char *dest, char *src, int len, int max)) +static int __init ic_bootp_string(char *dest, char *src, int len, int max) { if (!len) return 0; @@ -772,7 +574,7 @@ __initfunc(static int ic_bootp_string(char *dest, char *src, int len, int max)) /* * Process BOOTP extension. */ -__initfunc(static void ic_do_bootp_ext(u8 *ext)) +static void __init ic_do_bootp_ext(u8 *ext) { #ifdef IPCONFIG_DEBUG u8 *c; @@ -808,65 +610,64 @@ __initfunc(static void ic_do_bootp_ext(u8 *ext)) /* - * Receive BOOTP request. + * Receive BOOTP reply. */ -__initfunc(static void ic_bootp_recv(void)) +static int __init ic_bootp_recv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) { + struct bootp_pkt *b = (struct bootp_pkt *) skb->nh.iph; + struct iphdr *h = &b->iph; int len; - u8 *ext, *end, *opt; - struct ic_device *d; - struct bootp_pkt *b = ic_recv_bootp; - if ((len = ic_udp_recv(ic_bootp_recv_sock, b, sizeof(struct bootp_pkt))) < 0) - return; + /* If we already have a reply, just drop the packet */ + if (ic_got_reply) + goto drop; - /* Check consistency of incoming packet */ - if (len < 300 || /* See RFC 1542:2.1 */ - b->op != BOOTP_REPLY || - b->xid != ic_xmit_bootp->xid) { - printk("?"); - return; - } + /* Check whether it's a BOOTP packet */ + if (skb->pkt_type == PACKET_OTHERHOST || + skb->len < sizeof(struct udphdr) + sizeof(struct iphdr) || + h->ihl != 5 || + h->version != 4 || + ip_fast_csum((char *) h, h->ihl) != 0 || + skb->len < ntohs(h->tot_len) || + h->protocol != IPPROTO_UDP || + b->udph.source != htons(67) || + b->udph.dest != htons(68) || + ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr)) + goto drop; - /* Find interface this arrived from */ - for(d=ic_first_dev; d; d=d->next) { - struct device *dev = d->dev; - if (b->htype == dev->type || - b->hlen == dev->addr_len || - !memcmp(b->hw_addr, dev->dev_addr, dev->addr_len)) - break; - } - if (!d) { /* Unknown device */ - printk("!"); - return; + /* Fragments are not supported */ + if (h->frag_off & htons(IP_OFFSET|IP_MF)) { + printk(KERN_ERR "BOOTP: Ignoring fragmented reply.\n"); + goto drop; } - /* Record BOOTP packet arrival */ - cli(); - if (ic_got_reply) { - sti(); - return; + /* Is it a reply to our BOOTP request? */ + len = ntohs(b->udph.len) - sizeof(struct udphdr); + if (len < 300 || /* See RFC 951:2.1 */ + b->op != BOOTP_REPLY || + b->xid != ic_bootp_xid) { + printk("?"); + goto drop; } - ic_got_reply = IC_GOT_BOOTP; - sti(); - ic_dev = d->dev; /* Extract basic fields */ ic_myaddr = b->your_ip; ic_servaddr = b->server_ip; + ic_got_reply = IC_BOOTP; + ic_dev = dev; /* Parse extensions */ if (b->vendor_area[0] == 99 && /* Check magic cookie */ b->vendor_area[1] == 130 && b->vendor_area[2] == 83 && b->vendor_area[3] == 99) { - ext = &b->vendor_area[4]; - end = (u8 *) b + len; + u8 *ext = &b->vendor_area[4]; + u8 *end = (u8 *) b + len; while (ext < end && *ext != 0xff) { if (*ext == 0) /* Padding */ ext++; else { - opt = ext; + u8 *opt = ext; ext += ext[1] + 2; if (ext <= end) ic_do_bootp_ext(opt); @@ -876,7 +677,12 @@ __initfunc(static void ic_bootp_recv(void)) if (ic_gateway == INADDR_NONE && b->relay_ip) ic_gateway = b->relay_ip; -} + +drop: + kfree_skb(skb); + return 0; +} + #endif @@ -887,11 +693,13 @@ __initfunc(static void ic_bootp_recv(void)) #ifdef CONFIG_IP_PNP_DYNAMIC -__initfunc(int ic_dynamic(void)) +static int __init ic_dynamic(void) { int retries; unsigned long timeout, jiff; unsigned long start_jiffies; + int do_rarp = ic_proto_have_if & IC_RARP; + int do_bootp = ic_proto_have_if & IC_BOOTP; /* * If neither BOOTP nor RARP was selected, return with an error. This @@ -899,30 +707,22 @@ __initfunc(int ic_dynamic(void)) * sing, and without BOOTP and RARP we are not able to get that in- * formation. */ - if (!ic_bootp_flag && !ic_rarp_flag) { + if (!ic_proto_enabled) { printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n"); return -1; } #ifdef CONFIG_IP_PNP_BOOTP - if (ic_bootp_flag && !bootp_dev_count) { + if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP) printk(KERN_ERR "BOOTP: No suitable device found.\n"); - ic_bootp_flag = 0; - } -#else - ic_bootp_flag = 0; #endif #ifdef CONFIG_IP_PNP_RARP - if (ic_rarp_flag && !rarp_dev_count) { + if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP) printk(KERN_ERR "RARP: No suitable device found.\n"); - ic_rarp_flag = 0; - } -#else - ic_rarp_flag = 0; #endif - if (!ic_bootp_flag && !ic_rarp_flag) + if (!ic_proto_have_if) /* Error message already printed */ return -1; @@ -930,14 +730,12 @@ __initfunc(int ic_dynamic(void)) * Setup RARP and BOOTP protocols */ #ifdef CONFIG_IP_PNP_RARP - if (ic_rarp_flag) + if (do_rarp) ic_rarp_init(); #endif #ifdef CONFIG_IP_PNP_BOOTP - if (ic_bootp_flag && ic_bootp_init() < 0) { - ic_bootp_cleanup(); - return -1; - } + if (do_bootp) + ic_bootp_init(); #endif /* @@ -949,36 +747,26 @@ __initfunc(int ic_dynamic(void)) * applies.. - AC] */ printk(KERN_NOTICE "Sending %s%s%s requests...", - ic_bootp_flag ? "BOOTP" : "", - ic_bootp_flag && ic_rarp_flag ? " and " : "", - ic_rarp_flag ? "RARP" : ""); + do_bootp ? "BOOTP" : "", + do_bootp && do_rarp ? " and " : "", + do_rarp ? "RARP" : ""); start_jiffies = jiffies; retries = CONF_RETRIES; get_random_bytes(&timeout, sizeof(timeout)); timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM); for(;;) { #ifdef CONFIG_IP_PNP_BOOTP - if (ic_bootp_flag && ic_bootp_send(jiffies - start_jiffies) < 0) { - printk(" BOOTP failed!\n"); - ic_bootp_cleanup(); - ic_bootp_flag = 0; - if (!ic_rarp_flag) - break; - } + if (do_bootp) + ic_bootp_send(jiffies - start_jiffies); #endif #ifdef CONFIG_IP_PNP_RARP - if (ic_rarp_flag) + if (do_rarp) ic_rarp_send(); #endif printk("."); jiff = jiffies + timeout; while (jiffies < jiff && !ic_got_reply) -#ifdef CONFIG_IP_PNP_BOOTP - if (ic_bootp_flag) - ic_bootp_recv(); -#else ; -#endif if (ic_got_reply) { printk(" OK\n"); break; @@ -993,11 +781,11 @@ __initfunc(int ic_dynamic(void)) } #ifdef CONFIG_IP_PNP_RARP - if (ic_rarp_flag) + if (do_rarp) ic_rarp_cleanup(); #endif #ifdef CONFIG_IP_PNP_BOOTP - if (ic_bootp_flag) + if (do_bootp) ic_bootp_cleanup(); #endif @@ -1005,7 +793,7 @@ __initfunc(int ic_dynamic(void)) return -1; printk("IP-Config: Got %s answer from %s, ", - (ic_got_reply == IC_GOT_BOOTP) ? "BOOTP" : "RARP", + (ic_got_reply & IC_BOOTP) ? "BOOTP" : "RARP", in_ntoa(ic_servaddr)); printk("my address is %s\n", in_ntoa(ic_myaddr)); @@ -1018,7 +806,7 @@ __initfunc(int ic_dynamic(void)) * IP Autoconfig dispatcher. */ -__initfunc(int ip_auto_config(void)) +int __init ip_auto_config(void) { if (!ic_enable) return 0; @@ -1094,25 +882,44 @@ __initfunc(int ip_auto_config(void)) * <device> - use all available devices * <bootp|rarp|both|off> - use both protocols to determine my own address */ -__initfunc(void ip_auto_config_setup(char *addrs, int *ints)) +static int __init ic_proto_name(char *name) +{ + if (!strcmp(name, "off")) { + ic_proto_enabled = 0; + return 1; + } +#ifdef CONFIG_IP_PNP_BOOTP + else if (!strcmp(name, "bootp")) { + ic_proto_enabled &= ~IC_RARP; + return 1; + } +#endif +#ifdef CONFIG_IP_PNP_RARP + else if (!strcmp(name, "rarp")) { + ic_proto_enabled &= ~IC_BOOTP; + return 1; + } +#endif +#ifdef CONFIG_IP_PNP_DYNAMIC + else if (!strcmp(name, "both")) { + return 1; + } +#endif + return 0; +} + +void __init ip_auto_config_setup(char *addrs, int *ints) { char *cp, *ip, *dp; int num = 0; ic_set_manually = 1; - - if (!strcmp(addrs, "bootp")) { - ic_rarp_flag = 0; - return; - } else if (!strcmp(addrs, "rarp")) { - ic_bootp_flag = 0; - return; - } else if (!strcmp(addrs, "both")) { - return; - } else if (!strcmp(addrs, "off")) { + if (!strcmp(addrs, "off")) { ic_enable = 0; return; } + if (ic_proto_name(addrs)) + return; /* Parse the whole string */ ip = addrs; @@ -1153,12 +960,7 @@ __initfunc(void ip_auto_config_setup(char *addrs, int *ints)) user_dev_name[IFNAMSIZ-1] = '\0'; break; case 6: - if (!strcmp(ip, "rarp")) - ic_bootp_flag = 0; - else if (!strcmp(ip, "bootp")) - ic_rarp_flag = 0; - else if (strcmp(ip, "both")) - ic_bootp_flag = ic_rarp_flag = 0; + ic_proto_name(ip); break; } } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 79ecd1102..99cda3ea0 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -9,7 +9,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Version: $Id: ipmr.c,v 1.37 1998/10/03 09:37:39 davem Exp $ + * Version: $Id: ipmr.c,v 1.38 1999/01/12 14:34:40 davem Exp $ * * Fixes: * Michael Chastain : Incorrect size of copying. @@ -267,7 +267,6 @@ static void ipmr_update_threshoulds(struct mfc_cache *cache, unsigned char *ttls cache->mfc_minvif = vifi; if (cache->mfc_maxvif <= vifi) cache->mfc_maxvif = vifi + 1; - vifi++; } } end_bh_atomic(); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a3d002fae..0079ed04d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -5,7 +5,7 @@ * * ROUTE - implementation of the IP router. * - * Version: $Id: route.c,v 1.58 1998/10/03 09:37:50 davem Exp $ + * Version: $Id: route.c,v 1.61 1999/01/12 14:34:43 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -1307,6 +1307,7 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int key.dst = key.src = htonl(INADDR_LOOPBACK); dev_out = &loopback_dev; key.oif = loopback_dev.ifindex; + res.type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; } @@ -1334,6 +1335,7 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int if (key.src == 0) key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); + res.type = RTN_UNICAST; goto make_route; } return -ENETUNREACH; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index c186a8953..10f5e9324 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1,7 +1,7 @@ /* * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem. * - * $Id: sysctl_net_ipv4.c,v 1.36 1998/10/21 05:26:59 davem Exp $ + * $Id: sysctl_net_ipv4.c,v 1.38 1999/01/02 16:51:48 davem Exp $ * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS] @@ -31,6 +31,7 @@ static int boolean_max = 1; /* From icmp.c */ extern int sysctl_icmp_echo_ignore_all; extern int sysctl_icmp_echo_ignore_broadcasts; +extern int sysctl_icmp_ignore_bogus_error_responses; /* From ip_fragment.c */ extern int sysctl_ipfrag_low_thresh; @@ -66,6 +67,9 @@ extern int sysctl_icmp_timeexceed_time; extern int sysctl_icmp_paramprob_time; extern int sysctl_icmp_echoreply_time; +/* From igmp.c */ +extern int sysctl_igmp_max_memberships; + int tcp_retr1_max = 255; struct ipv4_config ipv4_config; @@ -164,6 +168,9 @@ ctl_table ipv4_table[] = { {NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts", &sysctl_icmp_echo_ignore_broadcasts, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses", + &sysctl_icmp_ignore_bogus_error_responses, sizeof(int), 0644, NULL, + &proc_dointvec}, {NET_IPV4_ICMP_DESTUNREACH_RATE, "icmp_destunreach_rate", &sysctl_icmp_destunreach_time, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ICMP_TIMEEXCEED_RATE, "icmp_timeexceed_rate", @@ -173,6 +180,10 @@ ctl_table ipv4_table[] = { {NET_IPV4_ICMP_ECHOREPLY_RATE, "icmp_echoreply_rate", &sysctl_icmp_echoreply_time, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_ROUTE, "route", NULL, 0, 0555, ipv4_route_table}, +#ifdef CONFIG_IP_MULTICAST + {NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships", + &sysctl_igmp_max_memberships, sizeof(int), 0644, NULL, &proc_dointvec}, +#endif {0} }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b6f1c7a93..67e482e86 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.132 1998/11/08 13:21:14 davem Exp $ + * Version: $Id: tcp.c,v 1.134 1999/01/09 08:50:09 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -812,7 +812,7 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags) * FIXME: the *_user functions should * return how much data was * copied before the fault - * occured and then a partial + * occurred and then a partial * packet with this data should * be sent. Unfortunately * csum_and_copy_from_user doesn't @@ -1612,19 +1612,15 @@ struct sock *tcp_accept(struct sock *sk, int flags) if(sk->keepopen) tcp_inc_slow_timer(TCP_SLT_KEEPALIVE); - /* - * This does not pass any already set errors on the new socket - * to the user, but they will be returned on the first socket operation - * after the accept. - * - * Once linux gets a multithreaded net_bh or equivalent there will be a race - * here - you'll have to check for sk->zapped as set by the ICMP handler then. - */ + release_sock(sk); + return newsk; - error = 0; out: + /* sk should be in LISTEN state, thus accept can use sk->err for + * internal purposes without stomping one anyone's feed. + */ + sk->err = error; release_sock(sk); - sk->err = error; return newsk; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 59ae01f88..aca7026b9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.141 1998/11/18 02:12:07 davem Exp $ + * Version: $Id: tcp_input.c,v 1.153 1999/01/20 07:20:03 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -100,8 +100,10 @@ static void tcp_delack_estimator(struct tcp_opt *tp) tp->lrcvtime = jiffies; /* Help sender leave slow start quickly, - * this sets our initial ato value. + * and also makes sure we do not take this + * branch ever again for this connection. */ + tp->ato = 1; tcp_enter_quickack_mode(tp); } else { int m = jiffies - tp->lrcvtime; @@ -111,12 +113,12 @@ static void tcp_delack_estimator(struct tcp_opt *tp) m = 1; if(m > tp->rto) tp->ato = tp->rto; - else - tp->ato = (tp->ato >> 1) + m; - - /* We are not in "quick ack" mode. */ - if(tp->ato <= (HZ/100)) - tp->ato = ((HZ/100)*2); + else { + /* This funny shift makes sure we + * clear the "quick ack mode" bit. + */ + tp->ato = ((tp->ato << 1) >> 2) + m; + } } } @@ -127,7 +129,10 @@ static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th, struct sk_buff *skb) { tp->delayed_acks++; - /* Tiny-grams with PSH set make us ACK quickly. */ + + /* Tiny-grams with PSH set make us ACK quickly. + * Note: This also clears the "quick ack mode" bit. + */ if(th->psh && (skb->len < (tp->mss_cache >> 1))) tp->ato = HZ/50; } @@ -301,7 +306,7 @@ static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, /* The retransmission queue is always in order, so * we can short-circuit the walk early. */ - if(!before(start_seq, TCP_SKB_CB(skb)->end_seq)) + if(after(TCP_SKB_CB(skb)->seq, end_seq)) break; /* We play conservative, we don't allow SACKS to partially @@ -311,7 +316,8 @@ static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, if(!after(start_seq, TCP_SKB_CB(skb)->seq) && !before(end_seq, TCP_SKB_CB(skb)->end_seq)) { /* If this was a retransmitted frame, account for it. */ - if(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) + if((TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) && + tp->retrans_out) tp->retrans_out--; TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; @@ -598,6 +604,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, unsigned long now = jiffies; int acked = 0; + /* If we are retransmitting, and this ACK clears up to + * the retransmit head, or further, then clear our state. + */ + if (tp->retrans_head != NULL && + !before(ack, TCP_SKB_CB(tp->retrans_head)->end_seq)) + tp->retrans_head = NULL; + while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); __u8 sacked = scb->sacked; @@ -625,6 +638,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, if(tp->fackets_out) tp->fackets_out--; } else { + /* This is pure paranoia. */ tp->retrans_head = NULL; } tp->packets_out--; @@ -633,9 +647,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, __skb_unlink(skb, skb->list); kfree_skb(skb); } - - if (acked) - tp->retrans_head = NULL; return acked; } @@ -723,10 +734,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp, } else { tcp_set_rto(tp); } - if (should_advance_cwnd(tp, flag)) - tcp_cong_avoid(tp); - /* NOTE: safe here so long as cong_ctl doesn't use rto */ tcp_bound_rto(tp); } @@ -740,7 +748,6 @@ static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) * congestion window is handled properly by that code. */ if (tp->retransmits) { - tp->retrans_head = NULL; tcp_xmit_retransmit_queue(sk); tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto); } else { @@ -816,6 +823,12 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt); + /* We must do this here, before code below clears out important + * state contained in tp->fackets_out and tp->retransmits. -DaveM + */ + if (should_advance_cwnd(tp, flag)) + tcp_cong_avoid(tp); + /* If we have a timestamp, we always do rtt estimates. */ if (tp->saw_tstamp) { tcp_ack_saw_tstamp(sk, tp, seq, ack, flag); @@ -845,8 +858,6 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, } } } - if (should_advance_cwnd(tp, flag)) - tcp_cong_avoid(tp); } if (tp->packets_out) { @@ -1166,7 +1177,7 @@ coalesce: /* Zap SWALK, by moving every further SACK up by one slot. * Decrease num_sacks. */ - for(this_sack += 1; this_sack < num_sacks-1; this_sack++, swalk++) { + for(; this_sack < num_sacks-1; this_sack++, swalk++) { struct tcp_sack_block *next = (swalk + 1); swalk->start_seq = next->start_seq; swalk->end_seq = next->end_seq; @@ -1298,7 +1309,7 @@ static void tcp_sack_extend(struct tcp_opt *tp, struct sk_buff *old_skb, struct int num_sacks = tp->num_sacks; int this_sack; - for(this_sack = 0; this_sack < num_sacks; this_sack++, tp++) { + for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) { if(sp->end_seq == TCP_SKB_CB(old_skb)->end_seq) break; } @@ -1346,7 +1357,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) /* Queue data for delivery to the user. * Packets in sequence go to the receive queue. - * Out of sequence packets to out_of_order_queue. + * Out of sequence packets to the out_of_order_queue. */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { /* Ok. In sequence. */ @@ -1394,7 +1405,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) tp->delayed_acks++; tcp_enter_quickack_mode(tp); - /* Disable header predition. */ + /* Disable header prediction. */ tp->pred_flags = 0; SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", @@ -1657,9 +1668,12 @@ static inline void tcp_urg(struct sock *sk, struct tcphdr *th, unsigned long len } } -/* - * Clean first the out_of_order queue, then the receive queue until - * the socket is in its memory limits again. +/* Clean the out_of_order queue if we can, trying to get + * the socket within its memory limits again. + * + * Return less than zero if we should start dropping frames + * until the socket owning process reads some of the data + * to stabilize the situation. */ static int prune_queue(struct sock *sk) { @@ -1670,46 +1684,50 @@ static int prune_queue(struct sock *sk) net_statistics.PruneCalled++; - /* First Clean the out_of_order queue. */ - /* Start with the end because there are probably the least - * useful packets (crossing fingers). - */ - while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue))) { - net_statistics.OfoPruned += skb->len; - kfree_skb(skb); - if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) - return 0; + /* First, purge the out_of_order queue. */ + skb = __skb_dequeue_tail(&tp->out_of_order_queue); + if(skb != NULL) { + /* Free it all. */ + do { net_statistics.OfoPruned += skb->len; + kfree_skb(skb); + skb = __skb_dequeue_tail(&tp->out_of_order_queue); + } while(skb != NULL); + + /* Reset SACK state. A conforming SACK implementation will + * do the same at a timeout based retransmit. When a connection + * is in a sad state like this, we care only about integrity + * of the connection not performance. + */ + if(tp->sack_ok) + tp->num_sacks = 0; } - /* Now continue with the receive queue if it wasn't enough. - * But only do this if we are really being abused. + /* If we are really being abused, tell the caller to silently + * drop receive data on the floor. It will get retransmitted + * and hopefully then we'll have sufficient space. + * + * We used to try to purge the in-order packets too, but that + * turns out to be deadly and fraught with races. Consider: + * + * 1) If we acked the data, we absolutely cannot drop the + * packet. This data would then never be retransmitted. + * 2) It is possible, with a proper sequence of events involving + * delayed acks and backlog queue handling, to have the user + * read the data before it gets acked. The previous code + * here got this wrong, and it lead to data corruption. + * 3) Too much state changes happen when the FIN arrives, so once + * we've seen that we can't remove any in-order data safely. + * + * The net result is that removing in-order receive data is too + * complex for anyones sanity. So we don't do it anymore. But + * if we are really having our buffer space abused we stop accepting + * new receive data. */ - while ((atomic_read(&sk->rmem_alloc) >= (sk->rcvbuf * 2)) && - (skb = skb_peek_tail(&sk->receive_queue))) { - /* Never toss anything when we've seen the FIN. - * It's just too complex to recover from it. - */ - if(skb->h.th->fin) - break; - - /* Never remove packets that have been already acked */ - if (before(TCP_SKB_CB(skb)->end_seq, tp->last_ack_sent+1)) { - SOCK_DEBUG(sk, "prune_queue: hit acked data c=%x,%x,%x\n", - tp->copied_seq, TCP_SKB_CB(skb)->end_seq, - tp->last_ack_sent); - return -1; - } - - net_statistics.RcvPruned += skb->len; + if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1)) + return 0; - __skb_unlink(skb, skb->list); - tp->rcv_nxt = TCP_SKB_CB(skb)->seq; - SOCK_DEBUG(sk, "prune_queue: removing %x-%x (c=%x)\n", - TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tp->copied_seq); - kfree_skb(skb); - } - return 0; + /* Massive buffer overcommit. */ + return -1; } /* @@ -1762,6 +1780,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (tcp_fast_parse_options(sk, th, tp)) { if (tp->saw_tstamp) { if (tcp_paws_discard(tp, th, len)) { + tcp_statistics.TcpInErrs++; if (!th->rst) { tcp_send_ack(sk); goto discard; @@ -2043,27 +2062,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* We got an ack, but it's not a good ack. */ if(!tcp_ack(sk,th, TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->ack_seq, len)) { - sk->err = ECONNRESET; - sk->state_change(sk); - tcp_statistics.TcpAttemptFails++; + TCP_SKB_CB(skb)->ack_seq, len)) return 1; - } if(th->rst) { tcp_reset(sk); goto discard; } - if(!th->syn) { - /* A valid ack from a different connection - * start. Shouldn't happen but cover it. - */ - sk->err = ECONNRESET; - sk->state_change(sk); - tcp_statistics.TcpAttemptFails++; - return 1; - } + if(!th->syn) + goto discard; /* Ok.. it's good. Set up sequence numbers and * move to established. @@ -2159,6 +2167,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (tp->saw_tstamp) { if (tcp_paws_discard(tp, th, len)) { + tcp_statistics.TcpInErrs++; if (!th->rst) { tcp_send_ack(sk); goto discard; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f486852d1..660e64c44 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.162 1998/11/07 11:50:26 davem Exp $ + * Version: $Id: tcp_ipv4.c,v 1.164 1999/01/04 20:36:55 davem Exp $ * * IPv4 specific functions * @@ -265,7 +265,7 @@ unsigned short tcp_good_socknum(void) struct tcp_bind_bucket *tb; int low = sysctl_local_port_range[0]; int high = sysctl_local_port_range[1]; - int remaining = high - low + 1; + int remaining = (high - low) + 1; int rover; SOCKHASH_LOCK(); @@ -1642,14 +1642,15 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len) skb->csum = csum_partial((char *)th, len, 0); case CHECKSUM_HW: if (tcp_v4_check(th,len,skb->nh.iph->saddr,skb->nh.iph->daddr,skb->csum)) { - printk(KERN_DEBUG "TCPv4 bad checksum from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, " - "len=%d/%d/%d\n", - NIPQUAD(skb->nh.iph->saddr), - ntohs(th->source), - NIPQUAD(skb->nh.iph->daddr), - ntohs(th->dest), - len, skb->len, - ntohs(skb->nh.iph->tot_len)); + NETDEBUG(printk(KERN_DEBUG "TCPv4 bad checksum " + "from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, " + "len=%d/%d/%d\n", + NIPQUAD(skb->nh.iph->saddr), + ntohs(th->source), + NIPQUAD(skb->nh.iph->daddr), + ntohs(th->dest), + len, skb->len, + ntohs(skb->nh.iph->tot_len))); bad_packet: tcp_statistics.TcpInErrs++; goto discard_it; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 25695f05d..3e99d80db 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_output.c,v 1.97 1998/11/08 13:21:27 davem Exp $ + * Version: $Id: tcp_output.c,v 1.101 1999/01/20 07:20:14 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -49,7 +49,7 @@ static __inline__ void clear_delayed_acks(struct sock * sk) tp->delayed_acks = 0; if(tcp_in_quickack_mode(tp)) - tp->ato = ((HZ/100)*2); + tcp_exit_quickack_mode(tp); tcp_clear_xmit_timer(sk, TIME_DACK); } @@ -80,15 +80,28 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); int tcp_header_size = tp->tcp_header_len; struct tcphdr *th; + int sysctl_flags; +#define SYSCTL_FLAG_TSTAMPS 0x1 +#define SYSCTL_FLAG_WSCALE 0x2 +#define SYSCTL_FLAG_SACK 0x4 + + sysctl_flags = 0; if(tcb->flags & TCPCB_FLAG_SYN) { tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; - if(sysctl_tcp_timestamps) + if(sysctl_tcp_timestamps) { tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; - if(sysctl_tcp_window_scaling) + sysctl_flags |= SYSCTL_FLAG_TSTAMPS; + } + if(sysctl_tcp_window_scaling) { tcp_header_size += TCPOLEN_WSCALE_ALIGNED; - if(sysctl_tcp_sack && !sysctl_tcp_timestamps) - tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; + sysctl_flags |= SYSCTL_FLAG_WSCALE; + } + if(sysctl_tcp_sack) { + sysctl_flags |= SYSCTL_FLAG_SACK; + if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) + tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; + } } else if(tp->sack_ok && tp->num_sacks) { /* A SACK is 2 pad bytes, a 2 byte header, plus * 2 32-bit sequence numbers for each SACK block. @@ -118,9 +131,9 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) */ th->window = htons(tp->rcv_wnd); tcp_syn_build_options((__u32 *)(th + 1), tp->mss_clamp, - sysctl_tcp_timestamps, - sysctl_tcp_sack, - sysctl_tcp_window_scaling, + (sysctl_flags & SYSCTL_FLAG_TSTAMPS), + (sysctl_flags & SYSCTL_FLAG_SACK), + (sysctl_flags & SYSCTL_FLAG_WSCALE), tp->rcv_wscale, TCP_SKB_CB(skb)->when); } else { @@ -134,6 +147,9 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) tcp_statistics.TcpOutSegs++; tp->af_specific->queue_xmit(skb); } +#undef SYSCTL_FLAG_TSTAMPS +#undef SYSCTL_FLAG_WSCALE +#undef SYSCTL_FLAG_SACK } /* This is the main buffer sending routine. We queue the buffer @@ -528,8 +544,10 @@ static __inline__ void update_retrans_head(struct sock *sk) tp->retrans_head = tp->retrans_head->next; if((tp->retrans_head == tp->send_head) || - (tp->retrans_head == (struct sk_buff *) &sk->write_queue)) + (tp->retrans_head == (struct sk_buff *) &sk->write_queue)) { tp->retrans_head = NULL; + tp->rexmt_done = 1; + } } /* This retransmits one SKB. Policy decisions and retransmit queue @@ -594,7 +612,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; - if (tp->retrans_head == NULL) + if (tp->retrans_head == NULL && + tp->rexmt_done == 0) tp->retrans_head = skb_peek(&sk->write_queue); if (tp->retrans_head == tp->send_head) tp->retrans_head = NULL; @@ -981,7 +1000,13 @@ void tcp_send_ack(struct sock *sk) * (ACK is unreliable) but it's much better use of * bandwidth on slow links to send a spare ack than * resend packets. + * + * This is the one possible way that we can delay an + * ACK and have tp->ato indicate that we are in + * quick ack mode, so clear it. */ + if(tcp_in_quickack_mode(tp)) + tcp_exit_quickack_mode(tp); tcp_send_delayed_ack(tp, HZ/2); return; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index ea46d3268..41e54309c 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_timer.c,v 1.55 1998/11/07 11:55:42 davem Exp $ + * Version: $Id: tcp_timer.c,v 1.57 1999/01/20 07:20:21 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -170,8 +170,13 @@ void tcp_delack_timer(unsigned long data) if(!sk->zapped && sk->tp_pinfo.af_tcp.delayed_acks && - sk->state != TCP_CLOSE) - tcp_send_ack(sk); + sk->state != TCP_CLOSE) { + /* If socket is currently locked, defer the ACK. */ + if (!atomic_read(&sk->sock_readers)) + tcp_send_ack(sk); + else + tcp_send_delayed_ack(&(sk->tp_pinfo.af_tcp), HZ/10); + } } void tcp_probe_timer(unsigned long data) @@ -463,6 +468,7 @@ void tcp_retransmit_timer(unsigned long data) /* Retransmission. */ tp->retrans_head = NULL; + tp->rexmt_done = 0; tp->fackets_out = 0; tp->retrans_out = 0; if (tp->retransmits == 0) { |