diff options
author | Ralf Baechle <ralf@linux-mips.org> | 2000-03-23 02:25:38 +0000 |
---|---|---|
committer | Ralf Baechle <ralf@linux-mips.org> | 2000-03-23 02:25:38 +0000 |
commit | 16b5d462f73eb29d1f67fa01cc1ea66afdc72569 (patch) | |
tree | 5407bd573f4840e473ea27cbe61e5c7a07131fcd /net | |
parent | ce8a076e11e7e5ee36007f9a3eee5bb3744cb8f6 (diff) |
Merge with Linux 2.3.99-pre2.
Diffstat (limited to 'net')
61 files changed, 13633 insertions, 77 deletions
diff --git a/net/Config.in b/net/Config.in index 624885478..ce5b6faa9 100644 --- a/net/Config.in +++ b/net/Config.in @@ -13,9 +13,9 @@ if [ "$CONFIG_NETLINK" = "y" ]; then tristate ' Netlink device emulation' CONFIG_NETLINK_DEV fi bool 'Network packet filtering (replaces ipchains)' CONFIG_NETFILTER -if [ "$CONFIG_NETFILTER" = "y" ]; then - bool ' Network packet filtering debugging' CONFIG_NETFILTER_DEBUG -fi +#if [ "$CONFIG_NETFILTER" = "y" ]; then +# bool ' Network packet filtering debugging' CONFIG_NETFILTER_DEBUG +#fi bool 'Socket Filtering' CONFIG_FILTER tristate 'Unix domain sockets' CONFIG_UNIX bool 'TCP/IP networking' CONFIG_INET diff --git a/net/Makefile b/net/Makefile index bf234eae1..44b34d799 100644 --- a/net/Makefile +++ b/net/Makefile @@ -20,6 +20,10 @@ endif ifeq ($(CONFIG_INET),y) SUB_DIRS += ipv4 +ifeq ($(CONFIG_NETFILTER),y) +SUB_DIRS += ipv4/netfilter +MOD_SUB_DIRS += ipv4/netfilter +endif endif ifeq ($(CONFIG_UNIX),y) @@ -198,7 +202,7 @@ endif endif L_TARGET := network.a -L_OBJS := $(SOCK) protocols.o $(join $(SUB_DIRS),$(SUB_DIRS:%=/%.o)) +L_OBJS := $(SOCK) protocols.o $(join $(SUB_DIRS), $(patsubst %,/%.o,$(notdir $(SUB_DIRS)))) M_OBJS := diff --git a/net/core/netfilter.c b/net/core/netfilter.c index 18f697755..02c3bc989 100644 --- a/net/core/netfilter.c +++ b/net/core/netfilter.c @@ -4,9 +4,10 @@ * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any * way. * - * Rusty Russell (C)1998 -- This code is GPL. + * Rusty Russell (C)2000 -- This code is GPL. * * February 2000: Modified by James Morris to have 1 queue per protocol. + * 15-Mar-2000: Added NF_REPEAT --RR. 
*/ #include <linux/config.h> #include <linux/netfilter.h> @@ -56,8 +57,6 @@ int nf_register_hook(struct nf_hook_ops *reg) { struct list_head *i; - NFDEBUG("nf_register_hook: pf=%i hook=%u.\n", reg->pf, reg->hooknum); - br_write_lock_bh(BR_NETPROTO_LOCK); for (i = nf_hooks[reg->pf][reg->hooknum].next; i != &nf_hooks[reg->pf][reg->hooknum]; @@ -119,7 +118,16 @@ out: void nf_unregister_sockopt(struct nf_sockopt_ops *reg) { /* No point being interruptible: we're probably in cleanup_module() */ + restart: down(&nf_sockopt_mutex); + if (reg->use != 0) { + /* To be woken by nf_sockopt call... */ + reg->cleanup_task = current; + up(&nf_sockopt_mutex); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + goto restart; + } list_del(®->list); up(&nf_sockopt_mutex); } @@ -178,7 +186,7 @@ void nf_dump_skb(int pf, struct sk_buff *skb) dst_port = ntohs(tcp->dest); } - printk("PROTO=%d %ld.%ld.%ld.%ld:%hu %ld.%ld.%ld.%ld:%hu" + printk("PROTO=%d %d.%d.%d.%d:%hu %d.%d.%d.%d:%hu" " L=%hu S=0x%2.2hX I=%hu F=0x%4.4hX T=%hu", ip->protocol, (ntohl(ip->saddr)>>24)&0xFF, @@ -261,9 +269,16 @@ void nf_debug_ip_finish_output2(struct sk_buff *skb) if (skb->nf_debug != ((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_FORWARD) | (1 << NF_IP_POST_ROUTING))) { - printk("ip_finish_output: bad unowned skb = %p: ",skb); - debug_print_hooks_ip(skb->nf_debug); - nf_dump_skb(PF_INET, skb); + /* Fragments will have no owners, but still + may be local */ + if (!(skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) + || skb->nf_debug != ((1 << NF_IP_LOCAL_OUT) + | (1 << NF_IP_POST_ROUTING))){ + printk("ip_finish_output:" + " bad unowned skb = %p: ",skb); + debug_print_hooks_ip(skb->nf_debug); + nf_dump_skb(PF_INET, skb); + } } } } @@ -274,31 +289,42 @@ static int nf_sockopt(struct sock *sk, int pf, int val, char *opt, int *len, int get) { struct list_head *i; + struct nf_sockopt_ops *ops; int ret; if (down_interruptible(&nf_sockopt_mutex) != 0) return -EINTR; for (i = nf_sockopts.next; i != &nf_sockopts; i = 
i->next) { - struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i; + ops = (struct nf_sockopt_ops *)i; if (ops->pf == pf) { if (get) { if (val >= ops->get_optmin && val < ops->get_optmax) { + ops->use++; + up(&nf_sockopt_mutex); ret = ops->get(sk, val, opt, len); goto out; } } else { if (val >= ops->set_optmin && val < ops->set_optmax) { + ops->use++; + up(&nf_sockopt_mutex); ret = ops->set(sk, val, opt, *len); goto out; } } } } - ret = -ENOPROTOOPT; + up(&nf_sockopt_mutex); + return -ENOPROTOOPT; + out: + down(&nf_sockopt_mutex); + ops->use--; + if (ops->cleanup_task) + wake_up_process(ops->cleanup_task); up(&nf_sockopt_mutex); return ret; } @@ -334,6 +360,10 @@ static unsigned int nf_iterate(struct list_head *head, case NF_DROP: return NF_DROP; + case NF_REPEAT: + *i = (*i)->prev; + break; + #ifdef CONFIG_NETFILTER_DEBUG case NF_ACCEPT: break; @@ -367,7 +397,6 @@ int nf_register_queue_handler(int pf, nf_queue_outfn_t outfn, void *data) /* The caller must flush their queue before this */ int nf_unregister_queue_handler(int pf) { - NFDEBUG("Unregistering Netfilter queue handler for pf=%d\n", pf); br_write_lock_bh(BR_NETPROTO_LOCK); queue_handler[pf].outfn = NULL; queue_handler[pf].data = NULL; @@ -390,7 +419,6 @@ static void nf_queue(struct sk_buff *skb, struct nf_info *info; if (!queue_handler[pf].outfn) { - NFDEBUG("nf_queue: noone wants the packet, dropping it.\n"); kfree_skb(skb); return; } @@ -432,6 +460,14 @@ int nf_hook_slow(int pf, unsigned int hook, struct sk_buff *skb, unsigned int verdict; int ret = 0; +#ifdef CONFIG_NETFILTER_DEBUG + if (skb->nf_debug & (1 << hook)) { + printk("nf_hook: hook %i already set.\n", hook); + nf_dump_skb(pf, skb); + } + skb->nf_debug |= (1 << hook); +#endif + elem = &nf_hooks[pf][hook]; verdict = nf_iterate(&nf_hooks[pf][hook], &skb, hook, indev, outdev, &elem, okfn); @@ -473,6 +509,11 @@ void nf_reinject(struct sk_buff *skb, struct nf_info *info, } /* Continue traversal iff userspace said ok... 
*/ + if (verdict == NF_REPEAT) { + elem = elem->prev; + verdict = NF_ACCEPT; + } + if (verdict == NF_ACCEPT) { verdict = nf_iterate(&nf_hooks[info->pf][info->hook], &skb, info->hook, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ecda47d7a..dad1f3925 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4,7 +4,7 @@ * Authors: Alan Cox <iiitac@pyr.swan.ac.uk> * Florian La Roche <rzsfl@rz.uni-sb.de> * - * Version: $Id: skbuff.c,v 1.69 2000/03/06 03:47:58 davem Exp $ + * Version: $Id: skbuff.c,v 1.70 2000/03/17 14:41:39 davem Exp $ * * Fixes: * Alan Cox : Fixed the worst of the load balancer bugs. @@ -203,7 +203,7 @@ static inline void skb_headerinit(void *p, kmem_cache_t *cache, skb->dst = NULL; skb->rx_dev = NULL; #ifdef CONFIG_NETFILTER - skb->nfmark = skb->nfreason = skb->nfcache = 0; + skb->nfmark = skb->nfcache = 0; skb->nfct = NULL; #ifdef CONFIG_NETFILTER_DEBUG skb->nf_debug = 0; @@ -319,7 +319,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->security=old->security; #ifdef CONFIG_NETFILTER new->nfmark=old->nfmark; - new->nfreason=old->nfreason; new->nfcache=old->nfcache; new->nfct=old->nfct; nf_conntrack_get(new->nfct); diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 5e54a6fa8..2ba5f2f6c 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -805,10 +805,7 @@ non_local_input: key.scope = RT_SCOPE_UNIVERSE; #ifdef CONFIG_DECNET_ROUTE_FWMASK - if (skb->nfreason == NF_REASON_FOR_ROUTING) - key.fwmark = skb->fwmark; - else - key.fwmark = 0; + key.fwmark = skb->fwmark; #else key.fwmark = 0; #endif @@ -886,9 +883,7 @@ int dn_route_input(struct sk_buff *skb) (rt->key.daddr == cb->dst) && (rt->key.oif == 0) && #ifdef CONFIG_DECNET_ROUTE_FWMASK - (rt->key.fwmark == (skb->nfreason == - NF_REASON_FOR_ROUTING - ? 
skb->nfmark : 0)) && + (rt->key.fwmark == skb->nfmark) && #endif (rt->key.iif == cb->iif)) { rt->u.dst.lastuse = jiffies; diff --git a/net/ipv4/Config.in b/net/ipv4/Config.in index 32e2aca16..68fea0272 100644 --- a/net/ipv4/Config.in +++ b/net/ipv4/Config.in @@ -9,7 +9,7 @@ if [ "$CONFIG_IP_ADVANCED_ROUTER" = "y" ]; then bool ' IP: policy routing' CONFIG_IP_MULTIPLE_TABLES if [ "$CONFIG_IP_MULTIPLE_TABLES" = "y" ]; then if [ "$CONFIG_NETFILTER" = "y" ]; then - bool ' IP: use FWMARK value as routing key' CONFIG_IP_ROUTE_FWMARK + bool ' IP: use netfilter MARK value as routing key' CONFIG_IP_ROUTE_FWMARK fi bool ' IP: fast network address translation' CONFIG_IP_ROUTE_NAT fi @@ -53,3 +53,6 @@ bool ' IP: Allow large windows (not recommended if <16Mb of memory)' CONFIG_SKB #if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then # bool ' IP: support checksum copy to user for UDP (EXPERIMENTAL)' CONFIG_UDP_DELAY_CSUM #fi +if [ "$CONFIG_NETFILTER" != "n" ]; then + source net/ipv4/netfilter/Config.in +fi diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index d7da63f4e..7561e190b 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -3,7 +3,7 @@ * * Alan Cox, <alan@redhat.com> * - * Version: $Id: icmp.c,v 1.65 2000/02/22 23:54:25 davem Exp $ + * Version: $Id: icmp.c,v 1.66 2000/03/17 14:41:50 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -332,20 +332,6 @@ struct icmp_control static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; /* - * Build xmit assembly blocks - */ - -struct icmp_bxm -{ - void *data_ptr; - int data_len; - struct icmphdr icmph; - unsigned long csum; - struct ip_options replyopts; - unsigned char optbuf[40]; -}; - -/* * The ICMP socket. This is the most convenient way to flow control * our ICMP output as well as maintain a clean interface throughout * all layers. All Socketless IP sends will soon be gone. 
@@ -508,7 +494,7 @@ static int icmp_glue_bits(const void *p, char *to, unsigned int offset, unsigned * Driving logic for building and sending ICMP messages. */ -static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) +void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) { struct sock *sk=icmp_socket->sk; struct ipcm_cookie ipc; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 8d651b042..01a39b6e4 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -599,6 +599,10 @@ int ipgre_rcv(struct sk_buff *skb, unsigned short len) skb->dev = tunnel->dev; dst_release(skb->dst); skb->dst = NULL; +#ifdef CONFIG_NETFILTER + nf_conntrack_put(skb->nfct); + skb->nfct = NULL; +#endif netif_rx(skb); read_unlock(&ipgre_lock); return(0); @@ -818,6 +822,11 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) ip_select_ident(iph, &rt->u.dst); ip_send_check(iph); +#ifdef CONFIG_NETFILTER + nf_conntrack_put(skb->nfct); + skb->nfct = NULL; +#endif + stats->tx_bytes += skb->len; stats->tx_packets++; ip_send(skb); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 26d025d32..f3013ca57 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) output module. * - * Version: $Id: ip_output.c,v 1.81 2000/03/06 03:48:01 davem Exp $ + * Version: $Id: ip_output.c,v 1.82 2000/03/17 14:41:50 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -894,6 +894,9 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) /* Connection association is same as pre-frag packet */ skb2->nfct = skb->nfct; nf_conntrack_get(skb2->nfct); +#ifdef CONFIG_NETFILTER_DEBUG + skb2->nf_debug = skb->nf_debug; +#endif #endif /* diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 5518ec1cb..2823c2c7e 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -1,7 +1,7 @@ /* * Linux NET3: IP/IP protocol decoder. 
* - * Version: $Id: ipip.c,v 1.30 2000/01/06 00:41:55 davem Exp $ + * Version: $Id: ipip.c,v 1.31 2000/03/17 14:41:51 davem Exp $ * * Authors: * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 @@ -93,6 +93,7 @@ */ +#include <linux/config.h> #include <linux/module.h> #include <linux/types.h> #include <linux/sched.h> @@ -483,6 +484,10 @@ int ipip_rcv(struct sk_buff *skb, unsigned short len) skb->dev = tunnel->dev; dst_release(skb->dst); skb->dst = NULL; +#ifdef CONFIG_NETFILTER + nf_conntrack_put(skb->nfct); + skb->nfct = NULL; +#endif netif_rx(skb); read_unlock(&ipip_lock); return 0; @@ -619,6 +624,11 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) ip_select_ident(iph, &rt->u.dst); ip_send_check(iph); +#ifdef CONFIG_NETFILTER + nf_conntrack_put(skb->nfct); + skb->nfct = NULL; +#endif + stats->tx_bytes += skb->len; stats->tx_packets++; ip_send(skb); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index fce5a43f8..1e33ec4ca 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -9,7 +9,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Version: $Id: ipmr.c,v 1.50 2000/01/09 02:19:32 davem Exp $ + * Version: $Id: ipmr.c,v 1.51 2000/03/17 14:41:52 davem Exp $ * * Fixes: * Michael Chastain : Incorrect size of copying. 
@@ -1100,6 +1100,10 @@ static void ip_encap(struct sk_buff *skb, u32 saddr, u32 daddr) skb->h.ipiph = skb->nh.iph; skb->nh.iph = iph; +#ifdef CONFIG_NETFILTER + nf_conntrack_put(skb->nfct); + skb->nfct = NULL; +#endif } static inline int ipmr_forward_finish(struct sk_buff *skb) @@ -1433,6 +1437,10 @@ int pim_rcv_v1(struct sk_buff * skb, unsigned short len) skb->dst = NULL; ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len; ((struct net_device_stats*)reg_dev->priv)->rx_packets++; +#ifdef CONFIG_NETFILTER + nf_conntrack_put(skb->nfct); + skb->nfct = NULL; +#endif netif_rx(skb); dev_put(reg_dev); return 0; @@ -1488,6 +1496,10 @@ int pim_rcv(struct sk_buff * skb, unsigned short len) ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len; ((struct net_device_stats*)reg_dev->priv)->rx_packets++; skb->dst = NULL; +#ifdef CONFIG_NETFILTER + nf_conntrack_put(skb->nfct); + skb->nfct = NULL; +#endif netif_rx(skb); dev_put(reg_dev); return 0; diff --git a/net/ipv4/netfilter/Config.in b/net/ipv4/netfilter/Config.in new file mode 100644 index 000000000..bf2a28269 --- /dev/null +++ b/net/ipv4/netfilter/Config.in @@ -0,0 +1,64 @@ +# +# IP netfilter configuration +# +mainmenu_option next_comment +comment ' IP: Netfilter Configuration' + +tristate 'Connection tracking (required for masq/NAT)' CONFIG_IP_NF_CONNTRACK +if [ "$CONFIG_IP_NF_CONNTRACK" != "n" ]; then + dep_tristate ' FTP protocol support' CONFIG_IP_NF_FTP $CONFIG_IP_NF_CONNTRACK +fi + +if [ "$CONFIG_EXPERIMENTAL" = "y" -a "$CONFIG_NETLINK" = "y" ]; then + tristate 'Userspace queueing via NETLINK (EXPERIMENTAL)' CONFIG_IP_NF_QUEUE +fi +tristate 'IP tables support (required for filtering/masq/NAT)' CONFIG_IP_NF_IPTABLES +if [ "$CONFIG_IP_NF_IPTABLES" != "n" ]; then +# The simple matches. 
+ dep_tristate ' limit match support' CONFIG_IP_NF_MATCH_LIMIT $CONFIG_IP_NF_IPTABLES + dep_tristate ' MAC address match support' CONFIG_IP_NF_MATCH_MAC $CONFIG_IP_NF_IPTABLES + dep_tristate ' netfilter MARK match support' CONFIG_IP_NF_MATCH_MARK $CONFIG_IP_NF_IPTABLES + dep_tristate ' Multiple port match support' CONFIG_IP_NF_MATCH_MULTIPORT $CONFIG_IP_NF_IPTABLES + dep_tristate ' TOS match support' CONFIG_IP_NF_MATCH_TOS $CONFIG_IP_NF_IPTABLES + if [ "$CONFIG_IP_NF_CONNTRACK" != "n" ]; then + dep_tristate ' Connection state match support' CONFIG_IP_NF_MATCH_STATE $CONFIG_IP_NF_CONNTRACK $CONFIG_IP_NF_IPTABLES + fi + if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + dep_tristate ' Unclean match support (EXPERIMENTAL)' CONFIG_IP_NF_MATCH_UNCLEAN $CONFIG_IP_NF_IPTABLES + dep_tristate ' Owner match support (EXPERIMENTAL)' CONFIG_IP_NF_MATCH_OWNER $CONFIG_IP_NF_IPTABLES + fi +# The targets + dep_tristate ' Packet filtering' CONFIG_IP_NF_FILTER $CONFIG_IP_NF_IPTABLES + if [ "$CONFIG_IP_NF_FILTER" != "n" ]; then + dep_tristate ' REJECT target support' CONFIG_IP_NF_TARGET_REJECT $CONFIG_IP_NF_FILTER + if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + dep_tristate ' MIRROR target support (EXPERIMENTAL)' CONFIG_IP_NF_TARGET_MIRROR $CONFIG_IP_NF_FILTER + fi + fi + + if [ "$CONFIG_IP_NF_CONNTRACK" != "n" ]; then + dep_tristate ' Full NAT' CONFIG_IP_NF_NAT $CONFIG_IP_NF_IPTABLES + if [ "$CONFIG_IP_NF_NAT" != "n" ]; then + dep_tristate ' MASQUERADE target support' CONFIG_IP_NF_TARGET_MASQUERADE $CONFIG_IP_NF_NAT + dep_tristate ' REDIRECT target support' CONFIG_IP_NF_TARGET_REDIRECT $CONFIG_IP_NF_NAT + fi + fi + + dep_tristate ' Packet mangling' CONFIG_IP_NF_MANGLE $CONFIG_IP_NF_IPTABLES + if [ "$CONFIG_IP_NF_MANGLE" != "n" ]; then + dep_tristate ' TOS target support' CONFIG_IP_NF_TARGET_TOS $CONFIG_IP_NF_MANGLE + dep_tristate ' MARK target support' CONFIG_IP_NF_TARGET_MARK $CONFIG_IP_NF_MANGLE + fi + dep_tristate ' LOG target support' CONFIG_IP_NF_TARGET_LOG $CONFIG_IP_NF_IPTABLES +fi + 
+# Backwards compatibility modules: only if you don't build in the others. +if [ "$CONFIG_IP_NF_CONNTRACK" != "y" ]; then + if [ "$CONFIG_IP_NF_IPTABLES" != "y" ]; then + tristate 'ipchains (2.2-style) support' CONFIG_IP_NF_COMPAT_IPCHAINS + if [ "$CONFIG_IP_NF_COMPAT_IPCHAINS" != "y" ]; then + tristate 'ipfwadm (2.0-style) support' CONFIG_IP_NF_COMPAT_IPFWADM + fi + fi +fi +endmenu diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile new file mode 100644 index 000000000..41a61e010 --- /dev/null +++ b/net/ipv4/netfilter/Makefile @@ -0,0 +1,234 @@ +# +# Makefile for the netfilter modules on top of IPv4. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definition is now in the main makefile... + +O_TARGET := netfilter.o +MOD_LIST_NAME := IPV4_MODULES +M_OBJS := + +IP_NF_CONNTRACK_OBJ:=ip_conntrack_core.o ip_conntrack_proto_generic.o ip_conntrack_proto_tcp.o ip_conntrack_proto_udp.o ip_conntrack_proto_icmp.o + +IP_NF_NAT_OBJ:=ip_nat_core.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o + +# Link order matters here. 
+ifeq ($(CONFIG_IP_NF_CONNTRACK),y) +OX_OBJS += ip_conntrack_standalone.o +O_OBJS += $(IP_NF_CONNTRACK_OBJ) +else + ifeq ($(CONFIG_IP_NF_CONNTRACK),m) + M_OBJS += ip_conntrack.o + endif +endif + +ifeq ($(CONFIG_IP_NF_QUEUE),y) +O_OBJS += ip_queue.o +else + ifeq ($(CONFIG_IP_NF_QUEUE),m) + M_OBJS += ip_queue.o + endif +endif + +ifeq ($(CONFIG_IP_NF_FTP),y) +OX_OBJS += ip_conntrack_ftp.o +else + ifeq ($(CONFIG_IP_NF_FTP),m) + M_OBJS += ip_conntrack_ftp.o + endif +endif + +ifeq ($(CONFIG_IP_NF_IPTABLES),y) +O_OBJS += ip_tables.o +else + ifeq ($(CONFIG_IP_NF_IPTABLES),m) + M_OBJS += ip_tables.o + endif +endif + +ifeq ($(CONFIG_IP_NF_MATCH_LIMIT),y) +O_OBJS += ipt_limit.o +else + ifeq ($(CONFIG_IP_NF_MATCH_LIMIT),m) + M_OBJS += ipt_limit.o + endif +endif + +ifeq ($(CONFIG_IP_NF_MATCH_MARK),y) +O_OBJS += ipt_mark.o +else + ifeq ($(CONFIG_IP_NF_MATCH_MARK),m) + M_OBJS += ipt_mark.o + endif +endif + +ifeq ($(CONFIG_IP_NF_MATCH_MAC),y) +O_OBJS += ipt_mac.o +else + ifeq ($(CONFIG_IP_NF_MATCH_MAC),m) + M_OBJS += ipt_mac.o + endif +endif + +ifeq ($(CONFIG_IP_NF_MATCH_MULTIPORT),y) +O_OBJS += ipt_multiport.o +else + ifeq ($(CONFIG_IP_NF_MATCH_MULTIPORT),m) + M_OBJS += ipt_multiport.o + endif +endif + +ifeq ($(CONFIG_IP_NF_MATCH_OWNER),y) +O_OBJS += ipt_owner.o +else + ifeq ($(CONFIG_IP_NF_MATCH_OWNER),m) + M_OBJS += ipt_owner.o + endif +endif + +ifeq ($(CONFIG_IP_NF_MATCH_TOS),y) +O_OBJS += ipt_tos.o +else + ifeq ($(CONFIG_IP_NF_MATCH_TOS),m) + M_OBJS += ipt_tos.o + endif +endif + +ifeq ($(CONFIG_IP_NF_MATCH_STATE),y) +O_OBJS += ipt_state.o +else + ifeq ($(CONFIG_IP_NF_MATCH_STATE),m) + M_OBJS += ipt_state.o + endif +endif + +ifeq ($(CONFIG_IP_NF_MATCH_UNCLEAN),y) +O_OBJS += ipt_unclean.o +else + ifeq ($(CONFIG_IP_NF_MATCH_UNCLEAN),m) + M_OBJS += ipt_unclean.o + endif +endif + +ifeq ($(CONFIG_IP_NF_FILTER),y) +O_OBJS += iptable_filter.o +else + ifeq ($(CONFIG_IP_NF_FILTER),m) + M_OBJS += iptable_filter.o + endif +endif + +ifeq ($(CONFIG_IP_NF_NAT),y) +OX_OBJS += 
ip_nat_standalone.o +O_OBJS += ip_nat_rule.o $(IP_NF_NAT_OBJ) + ifeq ($(CONFIG_IP_NF_FTP),y) + O_OBJS += ip_nat_ftp.o + endif +else + ifeq ($(CONFIG_IP_NF_NAT),m) + M_OBJS += iptable_nat.o + ifeq ($(CONFIG_IP_NF_FTP),m) + M_OBJS += ip_nat_ftp.o + endif + endif +endif + +ifeq ($(CONFIG_IP_NF_MANGLE),y) +O_OBJS += iptable_mangle.o +else + ifeq ($(CONFIG_IP_NF_MANGLE),m) + M_OBJS += iptable_mangle.o + endif +endif + +ifeq ($(CONFIG_IP_NF_TARGET_REJECT),y) +O_OBJS += ipt_REJECT.o +else + ifeq ($(CONFIG_IP_NF_TARGET_REJECT),m) + M_OBJS += ipt_REJECT.o + endif +endif + +ifeq ($(CONFIG_IP_NF_TARGET_MIRROR),y) +O_OBJS += ipt_MIRROR.o +else + ifeq ($(CONFIG_IP_NF_TARGET_MIRROR),m) + M_OBJS += ipt_MIRROR.o + endif +endif + +ifeq ($(CONFIG_IP_NF_TARGET_TOS),y) +O_OBJS += ipt_TOS.o +else + ifeq ($(CONFIG_IP_NF_TARGET_TOS),m) + M_OBJS += ipt_TOS.o + endif +endif + +ifeq ($(CONFIG_IP_NF_TARGET_MARK),y) +O_OBJS += ipt_MARK.o +else + ifeq ($(CONFIG_IP_NF_TARGET_MARK),m) + M_OBJS += ipt_MARK.o + endif +endif + +ifeq ($(CONFIG_IP_NF_TARGET_MASQUERADE),y) +O_OBJS += ipt_MASQUERADE.o +else + ifeq ($(CONFIG_IP_NF_TARGET_MASQUERADE),m) + M_OBJS += ipt_MASQUERADE.o + endif +endif + +ifeq ($(CONFIG_IP_NF_TARGET_REDIRECT),y) +O_OBJS += ipt_REDIRECT.o +else + ifeq ($(CONFIG_IP_NF_TARGET_REDIRECT),m) + M_OBJS += ipt_REDIRECT.o + endif +endif + +ifeq ($(CONFIG_IP_NF_TARGET_LOG),y) +O_OBJS += ipt_LOG.o +else + ifeq ($(CONFIG_IP_NF_TARGET_LOG),m) + M_OBJS += ipt_LOG.o + endif +endif + +ifeq ($(CONFIG_IP_NF_COMPAT_IPCHAINS),y) +O_OBJS += ipchains.o +else + ifeq ($(CONFIG_IP_NF_COMPAT_IPCHAINS),m) + M_OBJS += ipchains.o + endif +endif + +ifeq ($(CONFIG_IP_NF_COMPAT_IPFWADM),y) +O_OBJS += ipfwadm.o +else + ifeq ($(CONFIG_IP_NF_COMPAT_IPFWADM),m) + M_OBJS += ipfwadm.o + endif +endif + +include $(TOPDIR)/Rules.make + +ip_conntrack.o: ip_conntrack_standalone.o $(IP_NF_CONNTRACK_OBJ) + $(LD) -r -o $@ $(IP_NF_CONNTRACK_OBJ) ip_conntrack_standalone.o + +iptable_nat.o: ip_nat_standalone.o ip_nat_rule.o 
$(IP_NF_NAT_OBJ) + $(LD) -r -o $@ ip_nat_standalone.o ip_nat_rule.o $(IP_NF_NAT_OBJ) + +# All the parts of conntrack and NAT required for compatibility layer. +IP_NF_COMPAT_LAYER:=ip_fw_compat.o ip_fw_compat_redir.o ip_fw_compat_masq.o $(IP_NF_CONNTRACK_OBJ) $(IP_NF_NAT_OBJ) + +ipfwadm.o: ipfwadm_core.o $(IP_NF_COMPAT_LAYER) + $(LD) -r -o $@ ipfwadm_core.o $(IP_NF_COMPAT_LAYER) + +ipchains.o: ipchains_core.o $(IP_NF_COMPAT_LAYER) + $(LD) -r -o $@ ipchains_core.o $(IP_NF_COMPAT_LAYER) diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c new file mode 100644 index 000000000..9007cdc89 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -0,0 +1,891 @@ +/* Connection state tracking for netfilter. This is separated from, + but required by, the NAT layer; it can also be used by an iptables + extension. */ + +/* (c) 1999 Paul `Rusty' Russell. Licenced under the GNU General + Public Licence. */ + +#ifdef MODULE +#define __NO_VERSION__ +#endif +#include <linux/version.h> +#include <linux/config.h> +#include <linux/types.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <linux/vmalloc.h> +#include <linux/brlock.h> +#include <net/checksum.h> +#include <linux/stddef.h> +#include <linux/sysctl.h> + +/* This rwlock protects the main hash table, protocol/helper/expected + registrations, conntrack timers*/ +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock) +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock) + +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/ip_conntrack_core.h> +#include <linux/netfilter_ipv4/listhelp.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) 
+#endif + +DECLARE_RWLOCK(ip_conntrack_lock); + +void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL; +static LIST_HEAD(expect_list); +static LIST_HEAD(protocol_list); +static LIST_HEAD(helpers); +unsigned int ip_conntrack_htable_size = 0; +static int ip_conntrack_max = 0; +static atomic_t ip_conntrack_count = ATOMIC_INIT(0); +struct list_head *ip_conntrack_hash; + +extern struct ip_conntrack_protocol ip_conntrack_generic_protocol; + +static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr, + u_int8_t protocol) +{ + return protocol == curr->proto; +} + +struct ip_conntrack_protocol *__find_proto(u_int8_t protocol) +{ + struct ip_conntrack_protocol *p; + + MUST_BE_READ_LOCKED(&ip_conntrack_lock); + p = LIST_FIND(&protocol_list, proto_cmpfn, + struct ip_conntrack_protocol *, protocol); + if (!p) + p = &ip_conntrack_generic_protocol; + + return p; +} + +struct ip_conntrack_protocol *find_proto(u_int8_t protocol) +{ + struct ip_conntrack_protocol *p; + + READ_LOCK(&ip_conntrack_lock); + p = __find_proto(protocol); + READ_UNLOCK(&ip_conntrack_lock); + return p; +} + +static inline void ip_conntrack_put(struct ip_conntrack *ct) +{ + IP_NF_ASSERT(ct); + IP_NF_ASSERT(ct->infos[0].master); + /* nf_conntrack_put wants to go via an info struct, so feed it + one at random. */ + nf_conntrack_put(&ct->infos[0]); +} + +static inline u_int32_t +hash_conntrack(const struct ip_conntrack_tuple *tuple) +{ +#if 0 + dump_tuple(tuple); +#endif +#ifdef CONFIG_NETFILTER_DEBUG + if (tuple->src.pad) + DEBUGP("Tuple %p has non-zero padding.\n", tuple); +#endif + /* ntohl because more differences in low bits. */ + /* To ensure that halves of the same connection don't hash + clash, we add the source per-proto again. 
*/ + return (ntohl(tuple->src.ip + tuple->dst.ip + + tuple->src.u.all + tuple->dst.u.all + + tuple->dst.protonum) + + ntohs(tuple->src.u.all)) + % ip_conntrack_htable_size; +} + +inline int +get_tuple(const struct iphdr *iph, size_t len, + struct ip_conntrack_tuple *tuple, + struct ip_conntrack_protocol *protocol) +{ + int ret; + + /* Can only happen when extracting tuples from inside ICMP + packets */ + if (iph->frag_off & htons(IP_OFFSET)) { + if (net_ratelimit()) + printk("ip_conntrack_core: Frag of proto %u.\n", + iph->protocol); + return 0; + } + /* Guarantee 8 protocol bytes: if more wanted, use len param */ + else if (iph->ihl * 4 + 8 > len) + return 0; + + tuple->src.ip = iph->saddr; + tuple->src.pad = 0; + tuple->dst.ip = iph->daddr; + tuple->dst.protonum = iph->protocol; + + ret = protocol->pkt_to_tuple((u_int32_t *)iph + iph->ihl, + len - 4*iph->ihl, + tuple); + return ret; +} + +static int +invert_tuple(struct ip_conntrack_tuple *inverse, + const struct ip_conntrack_tuple *orig, + const struct ip_conntrack_protocol *protocol) +{ + inverse->src.ip = orig->dst.ip; + inverse->src.pad = 0; + inverse->dst.ip = orig->src.ip; + inverse->dst.protonum = orig->dst.protonum; + + return protocol->invert_tuple(inverse, orig); +} + +static void +destroy_conntrack(struct nf_conntrack *nfct) +{ + struct ip_conntrack *ct = (struct ip_conntrack *)nfct; + + IP_NF_ASSERT(atomic_read(&nfct->use) == 0); + IP_NF_ASSERT(!timer_pending(&ct->timeout)); + + if (ct->master.master) + nf_conntrack_put(&ct->master); + + if (ip_conntrack_destroyed) + ip_conntrack_destroyed(ct); + kfree(ct); + atomic_dec(&ip_conntrack_count); +} + +static void death_by_timeout(unsigned long ul_conntrack) +{ + struct ip_conntrack *ct = (void *)ul_conntrack; + + WRITE_LOCK(&ip_conntrack_lock); + /* Remove from both hash lists */ + LIST_DELETE(&ip_conntrack_hash + [hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)], + &ct->tuplehash[IP_CT_DIR_ORIGINAL]); + LIST_DELETE(&ip_conntrack_hash + 
[hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple)], + &ct->tuplehash[IP_CT_DIR_REPLY]); + /* If our expected is in the list, take it out. */ + if (ct->expected.expectant) { + IP_NF_ASSERT(list_inlist(&expect_list, &ct->expected)); + IP_NF_ASSERT(ct->expected.expectant == ct); + LIST_DELETE(&expect_list, &ct->expected); + } + WRITE_UNLOCK(&ip_conntrack_lock); + ip_conntrack_put(ct); +} + +static inline int +conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i, + const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack *ignored_conntrack) +{ + MUST_BE_READ_LOCKED(&ip_conntrack_lock); + return i->ctrack != ignored_conntrack + && memcmp(tuple, &i->tuple, sizeof(*tuple)) == 0; +} + +static struct ip_conntrack_tuple_hash * +__ip_conntrack_find(const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack *ignored_conntrack) +{ + struct ip_conntrack_tuple_hash *h; + + MUST_BE_READ_LOCKED(&ip_conntrack_lock); + h = LIST_FIND(&ip_conntrack_hash[hash_conntrack(tuple)], + conntrack_tuple_cmp, + struct ip_conntrack_tuple_hash *, + tuple, ignored_conntrack); + return h; +} + +/* Find a connection corresponding to a tuple. */ +struct ip_conntrack_tuple_hash * +ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack *ignored_conntrack) +{ + struct ip_conntrack_tuple_hash *h; + + READ_LOCK(&ip_conntrack_lock); + h = __ip_conntrack_find(tuple, ignored_conntrack); + if (h) + atomic_inc(&h->ctrack->ct_general.use); + READ_UNLOCK(&ip_conntrack_lock); + + return h; +} + +/* Returns true if a connection correspondings to the tuple (required + for NAT). 
*/ +int +ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack *ignored_conntrack) +{ + struct ip_conntrack_tuple_hash *h; + + READ_LOCK(&ip_conntrack_lock); + h = __ip_conntrack_find(tuple, ignored_conntrack); + READ_UNLOCK(&ip_conntrack_lock); + + return h != NULL; +} + +/* Returns TRUE if it dealt with ICMP, and filled in skb fields */ +int icmp_error_track(struct sk_buff *skb) +{ + const struct iphdr *iph = skb->nh.iph; + struct icmphdr *hdr = (struct icmphdr *)((u_int32_t *)iph + iph->ihl); + struct ip_conntrack_tuple innertuple, origtuple; + struct iphdr *inner = (struct iphdr *)(hdr + 1); + size_t datalen = skb->len - iph->ihl*4 - sizeof(*hdr); + struct ip_conntrack_protocol *innerproto; + struct ip_conntrack_tuple_hash *h; + enum ip_conntrack_info ctinfo; + + if (iph->protocol != IPPROTO_ICMP) + return 0; + + if (skb->len < iph->ihl * 4 + sizeof(struct icmphdr)) { + DEBUGP("icmp_error_track: too short\n"); + return 1; + } + + if (hdr->type != ICMP_DEST_UNREACH + && hdr->type != ICMP_SOURCE_QUENCH + && hdr->type != ICMP_TIME_EXCEEDED + && hdr->type != ICMP_PARAMETERPROB + && hdr->type != ICMP_REDIRECT) + return 0; + + /* Ignore it if the checksum's bogus. */ + if (ip_compute_csum((unsigned char *)hdr, sizeof(*hdr) + datalen)) { + DEBUGP("icmp_error_track: bad csum\n"); + return 1; + } + + innerproto = find_proto(inner->protocol); + /* Are they talking about one of our connections? */ + if (inner->ihl * 4 + 8 > datalen + || !get_tuple(inner, datalen, &origtuple, innerproto)) { + DEBUGP("icmp_error: ! get_tuple p=%u (%u*4+%u dlen=%u)\n", + inner->protocol, inner->ihl, 8, + datalen); + return 1; + } + + /* Ordinarily, we'd expect the inverted tupleproto, but it's + been preserved inside the ICMP. 
*/ + if (!invert_tuple(&innertuple, &origtuple, innerproto)) { + DEBUGP("icmp_error_track: Can't invert tuple\n"); + return 1; + } + h = ip_conntrack_find_get(&innertuple, NULL); + if (!h) { + DEBUGP("icmp_error_track: no match\n"); + return 1; + } + + ctinfo = IP_CT_RELATED; + if (DIRECTION(h) == IP_CT_DIR_REPLY) + ctinfo += IP_CT_IS_REPLY; + + /* Update skb to refer to this connection */ + skb->nfct = &h->ctrack->infos[ctinfo]; + return 1; +} + +static inline int helper_cmp(const struct ip_conntrack_helper *i, + const struct ip_conntrack_tuple *rtuple) +{ + return i->will_help(rtuple); +} + +/* Compare all but src per-proto part. */ +static int expect_cmp(const struct ip_conntrack_expect *i, + const struct ip_conntrack_tuple *tuple) +{ + return (tuple->src.ip == i->tuple.src.ip + && tuple->dst.ip == i->tuple.dst.ip + && tuple->dst.u.all == i->tuple.dst.u.all + && tuple->dst.protonum == i->tuple.dst.protonum); +} + +/* Allocate a new conntrack; we set everything up, then grab write + lock and see if we lost a race. If we lost it we return 0, + indicating the controlling code should look again. 
*/ +static int +init_conntrack(const struct ip_conntrack_tuple *tuple, + struct ip_conntrack_protocol *protocol, + struct sk_buff *skb) +{ + struct ip_conntrack *conntrack; + struct ip_conntrack_tuple repl_tuple; + size_t hash, repl_hash; + struct ip_conntrack_expect *expected; + enum ip_conntrack_info ctinfo; + int i; + + if (!invert_tuple(&repl_tuple, tuple, protocol)) { + DEBUGP("Can't invert tuple.\n"); + return 1; + } + + if(ip_conntrack_max && + (atomic_read(&ip_conntrack_count) >= ip_conntrack_max)) { + if (net_ratelimit()) + printk(KERN_WARNING "ip_conntrack: maximum limit of %d entries exceeded\n", ip_conntrack_max); + return 1; + } + + conntrack = kmalloc(sizeof(struct ip_conntrack), GFP_ATOMIC); + if (!conntrack) { + DEBUGP("Can't allocate conntrack.\n"); + return 1; + } + hash = hash_conntrack(tuple); + repl_hash = hash_conntrack(&repl_tuple); + + memset(conntrack, 0, sizeof(struct ip_conntrack)); + atomic_set(&conntrack->ct_general.use, 1); + conntrack->ct_general.destroy = destroy_conntrack; + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple; + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack; + conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple; + conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack; + for(i=0; i < IP_CT_NUMBER; i++) + conntrack->infos[i].master = &conntrack->ct_general; + + if (!protocol->new(conntrack, skb->nh.iph, skb->len)) { + kfree(conntrack); + return 1; + } + + /* Sew in at head of hash list. */ + WRITE_LOCK(&ip_conntrack_lock); + /* Check noone else beat us in the race... 
*/ + if (__ip_conntrack_find(tuple, NULL)) { + WRITE_UNLOCK(&ip_conntrack_lock); + printk("ip_conntrack: Wow someone raced us!\n"); + kfree(conntrack); + return 0; + } + conntrack->helper = LIST_FIND(&helpers, helper_cmp, + struct ip_conntrack_helper *, + &repl_tuple); + /* Need finding and deleting of expected ONLY if we win race */ + expected = LIST_FIND(&expect_list, expect_cmp, + struct ip_conntrack_expect *, tuple); + if (expected) { + /* Welcome, Mr. Bond. We've been expecting you... */ + conntrack->status = IPS_EXPECTED; + conntrack->master.master = &expected->expectant->ct_general; + IP_NF_ASSERT(conntrack->master.master); + LIST_DELETE(&expect_list, expected); + expected->expectant = NULL; + nf_conntrack_get(&conntrack->master); + ctinfo = IP_CT_RELATED; + } else { + ctinfo = IP_CT_NEW; + } + list_prepend(&ip_conntrack_hash[hash], + &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]); + list_prepend(&ip_conntrack_hash[repl_hash], + &conntrack->tuplehash[IP_CT_DIR_REPLY]); + WRITE_UNLOCK(&ip_conntrack_lock); + + /* Update skb to refer to this connection */ + skb->nfct = &conntrack->infos[ctinfo]; + + atomic_inc(&ip_conntrack_count); + return 1; +} + +static void +resolve_normal_ct(struct sk_buff *skb) +{ + struct ip_conntrack_tuple tuple; + struct ip_conntrack_tuple_hash *h; + struct ip_conntrack_protocol *proto; + enum ip_conntrack_info ctinfo; + + proto = find_proto(skb->nh.iph->protocol); + if (!get_tuple(skb->nh.iph, skb->len, &tuple, proto)) + return; + + /* Loop around search/insert race */ + do { + /* look for tuple match */ + h = ip_conntrack_find_get(&tuple, NULL); + if (!h && init_conntrack(&tuple, proto, skb)) + return; + } while (!h); + + /* It exists; we have (non-exclusive) reference. */ + if (DIRECTION(h) == IP_CT_DIR_REPLY) { + ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY; + h->ctrack->status |= IPS_SEEN_REPLY; + } else { + /* Once we've had two way comms, always ESTABLISHED. 
*/ + if (h->ctrack->status & IPS_SEEN_REPLY) { + DEBUGP("ip_conntrack_in: normal packet for %p\n", + h->ctrack); + ctinfo = IP_CT_ESTABLISHED; + } else if (h->ctrack->status & IPS_EXPECTED) { + DEBUGP("ip_conntrack_in: related packet for %p\n", + h->ctrack); + ctinfo = IP_CT_RELATED; + } else { + DEBUGP("ip_conntrack_in: new packet for %p\n", + h->ctrack); + ctinfo = IP_CT_NEW; + } + } + skb->nfct = &h->ctrack->infos[ctinfo]; +} + +/* Return conntrack and conntrack_info a given skb */ +struct ip_conntrack * +ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo) +{ + if (!skb->nfct) { + /* It may be an icmp error... */ + if (!icmp_error_track(skb)) + resolve_normal_ct(skb); + } + + if (skb->nfct) { + struct ip_conntrack *ct + = (struct ip_conntrack *)skb->nfct->master; + + /* ctinfo is the index of the nfct inside the conntrack */ + *ctinfo = skb->nfct - ct->infos; + IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER); + return ct; + } + return NULL; +} + +/* Netfilter hook itself. */ +unsigned int ip_conntrack_in(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + struct ip_conntrack_protocol *proto; + int ret; + + /* FIXME: Do this right please. --RR */ + (*pskb)->nfcache |= NFC_UNKNOWN; + + /* Previously seen (loopback)? Ignore. Do this before + fragment check. */ + if ((*pskb)->nfct) + return NF_ACCEPT; + + /* Gather fragments. 
*/ + if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + *pskb = ip_ct_gather_frags(*pskb); + if (!*pskb) + return NF_STOLEN; + } + + ct = ip_conntrack_get(*pskb, &ctinfo); + if (!ct) + /* Not valid part of a connection */ + return NF_ACCEPT; + + proto = find_proto((*pskb)->nh.iph->protocol); + /* If this is new, this is first time timer will be set */ + ret = proto->packet(ct, (*pskb)->nh.iph, (*pskb)->len, ctinfo); + + if (ret == -1) { + /* Invalid */ + nf_conntrack_put((*pskb)->nfct); + (*pskb)->nfct = NULL; + return NF_ACCEPT; + } + + if (ret != NF_DROP && ct->helper) { + ret = ct->helper->help((*pskb)->nh.iph, (*pskb)->len, + ct, ctinfo); + if (ret == -1) { + /* Invalid */ + nf_conntrack_put((*pskb)->nfct); + (*pskb)->nfct = NULL; + return NF_ACCEPT; + } + } + + return ret; +} + +int invert_tuplepr(struct ip_conntrack_tuple *inverse, + const struct ip_conntrack_tuple *orig) +{ + return invert_tuple(inverse, orig, find_proto(orig->dst.protonum)); +} + +/* Add a related connection. */ +int ip_conntrack_expect_related(struct ip_conntrack *related_to, + const struct ip_conntrack_tuple *tuple) +{ + WRITE_LOCK(&ip_conntrack_lock); + related_to->expected.tuple = *tuple; + + if (!related_to->expected.expectant) { + list_prepend(&expect_list, &related_to->expected); + related_to->expected.expectant = related_to; + } else { + IP_NF_ASSERT(list_inlist(&expect_list, &related_to->expected)); + IP_NF_ASSERT(related_to->expected.expectant + == related_to); + } + WRITE_UNLOCK(&ip_conntrack_lock); + + return 0; +} + +/* Alter reply tuple (maybe alter helper). If it's already taken, + return 0 and don't do alteration. 
*/ +int ip_conntrack_alter_reply(struct ip_conntrack *conntrack, + const struct ip_conntrack_tuple *newreply) +{ + unsigned int newindex = hash_conntrack(newreply); + + WRITE_LOCK(&ip_conntrack_lock); + if (__ip_conntrack_find(newreply, conntrack)) { + WRITE_UNLOCK(&ip_conntrack_lock); + return 0; + } + DEBUGP("Altering reply tuple of %p to ", conntrack); + DUMP_TUPLE(newreply); + + LIST_DELETE(&ip_conntrack_hash + [hash_conntrack(&conntrack->tuplehash[IP_CT_DIR_REPLY] + .tuple)], + &conntrack->tuplehash[IP_CT_DIR_REPLY]); + conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; + list_prepend(&ip_conntrack_hash[newindex], + &conntrack->tuplehash[IP_CT_DIR_REPLY]); + conntrack->helper = LIST_FIND(&helpers, helper_cmp, + struct ip_conntrack_helper *, + newreply); + WRITE_UNLOCK(&ip_conntrack_lock); + return 1; +} + +int ip_conntrack_helper_register(struct ip_conntrack_helper *me) +{ + MOD_INC_USE_COUNT; + + WRITE_LOCK(&ip_conntrack_lock); + list_prepend(&helpers, me); + WRITE_UNLOCK(&ip_conntrack_lock); + + return 0; +} + +static inline int unhelp(struct ip_conntrack_tuple_hash *i, + const struct ip_conntrack_helper *me) +{ + if (i->ctrack->helper == me) { + i->ctrack->helper = NULL; + /* Get rid of any expected. */ + if (i->ctrack->expected.expectant) { + IP_NF_ASSERT(i->ctrack->expected.expectant + == i->ctrack); + LIST_DELETE(&expect_list, &i->ctrack->expected); + i->ctrack->expected.expectant = NULL; + } + } + return 0; +} + +void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me) +{ + unsigned int i; + + /* Need write lock here, to delete helper. */ + WRITE_LOCK(&ip_conntrack_lock); + LIST_DELETE(&helpers, me); + + /* Get rid of expecteds, set helpers to NULL. */ + for (i = 0; i < ip_conntrack_htable_size; i++) + LIST_FIND_W(&ip_conntrack_hash[i], unhelp, + struct ip_conntrack_tuple_hash *, me); + WRITE_UNLOCK(&ip_conntrack_lock); + + /* Someone could be still looking at the helper in a bh. 
*/ + br_write_lock_bh(BR_NETPROTO_LOCK); + br_write_unlock_bh(BR_NETPROTO_LOCK); + + MOD_DEC_USE_COUNT; +} + +/* Refresh conntrack for this many jiffies: if noone calls this, + conntrack will vanish with current skb. */ +void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies) +{ + WRITE_LOCK(&ip_conntrack_lock); + /* If this hasn't had a timer before, it's still being set up */ + if (ct->timeout.data == 0) { + ct->timeout.data = (unsigned long)ct; + ct->timeout.function = death_by_timeout; + ct->timeout.expires = jiffies + extra_jiffies; + atomic_inc(&ct->ct_general.use); + add_timer(&ct->timeout); + } else { + /* Need del_timer for race avoidance (may already be dying). */ + if (del_timer(&ct->timeout)) { + ct->timeout.expires = jiffies + extra_jiffies; + add_timer(&ct->timeout); + } + } + WRITE_UNLOCK(&ip_conntrack_lock); +} + +/* Returns new sk_buff, or NULL */ +struct sk_buff * +ip_ct_gather_frags(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; +#ifdef CONFIG_NETFILTER_DEBUG + unsigned int olddebug = skb->nf_debug; +#endif + if (sk) sock_hold(sk); + skb = ip_defrag(skb); + if (!skb) { + if (sk) sock_put(sk); + return skb; + } + if (sk) { + skb_set_owner_w(skb, sk); + sock_put(sk); + } + + ip_send_check(skb->nh.iph); + skb->nfcache |= NFC_ALTERED; +#ifdef CONFIG_NETFILTER_DEBUG + /* Packet path as if nothing had happened. */ + skb->nf_debug = olddebug; +#endif + return skb; +} + +static inline int +do_kill(const struct ip_conntrack_tuple_hash *i, + int (*kill)(const struct ip_conntrack *i, void *data), + void *data) +{ + return kill(i->ctrack, data); +} + +/* Bring out ya dead! 
*/ +static struct ip_conntrack_tuple_hash * +get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data), + void *data) +{ + struct ip_conntrack_tuple_hash *h = NULL; + unsigned int i; + + READ_LOCK(&ip_conntrack_lock); + for (i = 0; !h && i < ip_conntrack_htable_size; i++) { + h = LIST_FIND(&ip_conntrack_hash[i], do_kill, + struct ip_conntrack_tuple_hash *, kill, data); + } + if (h) + atomic_inc(&h->ctrack->ct_general.use); + READ_UNLOCK(&ip_conntrack_lock); + + return h; +} + +void +ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data), + void *data) +{ + struct ip_conntrack_tuple_hash *h; + + /* This is order n^2, by the way. */ + while ((h = get_next_corpse(kill, data)) != NULL) { + /* Time to push up daises... */ + if (del_timer(&h->ctrack->timeout)) + death_by_timeout((unsigned long)h->ctrack); + /* ... else the timer will get him soon. */ + + ip_conntrack_put(h->ctrack); + } +} + +/* Fast function for those who don't want to parse /proc (and I don't + blame them). */ +/* Reversing the socket's dst/src point of view gives us the reply + mapping. */ +static int +getorigdst(struct sock *sk, int optval, void *user, int *len) +{ + struct ip_conntrack_tuple_hash *h; + struct ip_conntrack_tuple tuple = { { sk->rcv_saddr, { sk->sport }, + 0 }, + { sk->daddr, { sk->dport }, + IPPROTO_TCP } }; + + /* We only do TCP at the moment: is there a better way? 
*/ + if (strcmp(sk->prot->name, "TCP") != 0) { + DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n"); + return -ENOPROTOOPT; + } + + if (*len != sizeof(struct sockaddr_in)) { + DEBUGP("SO_ORIGINAL_DST: len %u not %u\n", + *len, sizeof(struct sockaddr_in)); + return -EINVAL; + } + + h = ip_conntrack_find_get(&tuple, NULL); + if (h) { + struct sockaddr_in sin; + + sin.sin_family = AF_INET; + sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u.tcp.port; + sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.ip; + + DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n", + IP_PARTS(sin.sin_addr.s_addr), ntohs(sin.sin_port)); + ip_conntrack_put(h->ctrack); + if (copy_to_user(user, &sin, sizeof(sin)) != 0) + return -EFAULT; + else + return 0; + } + DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n", + IP_PARTS(tuple.src.ip), ntohs(tuple.src.u.tcp.port), + IP_PARTS(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port)); + return -ENOENT; +} + +static struct nf_sockopt_ops so_getorigdst += { { NULL, NULL }, PF_INET, + 0, 0, NULL, /* Setsockopts */ + SO_ORIGINAL_DST, SO_ORIGINAL_DST+1, &getorigdst, + 0, NULL }; + +#define NET_IP_CONNTRACK_MAX 2089 +#define NET_IP_CONNTRACK_MAX_NAME "ip_conntrack_max" + +static struct ctl_table_header *ip_conntrack_sysctl_header; + +static ctl_table ip_conntrack_table[] = { + { NET_IP_CONNTRACK_MAX, NET_IP_CONNTRACK_MAX_NAME, &ip_conntrack_max, + sizeof(ip_conntrack_max), 0644, NULL, proc_dointvec }, + { 0 } +}; + +static ctl_table ip_conntrack_dir_table[] = { + {NET_IPV4, "ipv4", NULL, 0, 0555, ip_conntrack_table, 0, 0, 0, 0, 0}, + { 0 } +}; + +static ctl_table ip_conntrack_root_table[] = { + {CTL_NET, "net", NULL, 0, 0555, ip_conntrack_dir_table, 0, 0, 0, 0, 0}, + { 0 } +}; + +static int kill_all(const struct ip_conntrack *i, void *data) +{ + return 1; +} + +/* Mishearing the voices in his head, our hero wonders how he's + supposed to kill the mall. 
*/ +void ip_conntrack_cleanup(void) +{ + unregister_sysctl_table(ip_conntrack_sysctl_header); + ip_ct_selective_cleanup(kill_all, NULL); + vfree(ip_conntrack_hash); + nf_unregister_sockopt(&so_getorigdst); +} + +int __init ip_conntrack_init(void) +{ + unsigned int i; + int ret; + + /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB + * machine has 256 buckets. 1GB machine has 8192 buckets. */ + ip_conntrack_htable_size + = (((num_physpages << PAGE_SHIFT) / 16384) + / sizeof(struct list_head)); + ip_conntrack_max = 8 * ip_conntrack_htable_size; + + printk("ip_conntrack (%u buckets, %d max)\n", + ip_conntrack_htable_size, ip_conntrack_max); + + ret = nf_register_sockopt(&so_getorigdst); + if (ret != 0) + return ret; + + ip_conntrack_hash = vmalloc(sizeof(struct list_head) + * ip_conntrack_htable_size); + if (!ip_conntrack_hash) { + nf_unregister_sockopt(&so_getorigdst); + return -ENOMEM; + } + + /* Don't NEED lock here, but good form anyway. */ + WRITE_LOCK(&ip_conntrack_lock); + /* Sew in builtin protocols. */ + list_append(&protocol_list, &ip_conntrack_protocol_tcp); + list_append(&protocol_list, &ip_conntrack_protocol_udp); + list_append(&protocol_list, &ip_conntrack_protocol_icmp); + WRITE_UNLOCK(&ip_conntrack_lock); + + for (i = 0; i < ip_conntrack_htable_size; i++) + INIT_LIST_HEAD(&ip_conntrack_hash[i]); + +/* This is fucking braindead. There is NO WAY of doing this without + the CONFIG_SYSCTL unless you don't want to detect errors. + Grrr... 
--RR */ +#ifdef CONFIG_SYSCTL + ip_conntrack_sysctl_header + = register_sysctl_table(ip_conntrack_root_table, 0); + if (ip_conntrack_sysctl_header == NULL) { + vfree(ip_conntrack_hash); + nf_unregister_sockopt(&so_getorigdst); + return -ENOMEM; + } +#endif /*CONFIG_SYSCTL*/ + + ret = ip_conntrack_protocol_tcp_init(); + if (ret != 0) { + unregister_sysctl_table(ip_conntrack_sysctl_header); + vfree(ip_conntrack_hash); + nf_unregister_sockopt(&so_getorigdst); + } + + return ret; +} + diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c new file mode 100644 index 000000000..9137d13ea --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -0,0 +1,251 @@ +/* FTP extension for IP connection tracking. */ +#ifdef MODULE +#define EXPORT_SYMTAB +#endif +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/ip.h> +#include <net/checksum.h> +#include <net/tcp.h> + +#include <linux/netfilter_ipv4/lockhelp.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/ip_conntrack_ftp.h> + +DECLARE_LOCK(ip_ftp_lock); + +#define SERVER_STRING "227 Entering Passive Mode (" +#define CLIENT_STRING "PORT " + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +#define IP_PARTS_NATIVE(n) \ +(unsigned int)((n)>>24)&0xFF, \ +(unsigned int)((n)>>16)&0xFF, \ +(unsigned int)((n)>>8)&0xFF, \ +(unsigned int)((n)&0xFF) + +#define IP_PARTS(n) IP_PARTS_NATIVE(ntohl(n)) + +static struct { + const char *pattern; + size_t plen; + char term; +} search[2] = { + [IP_CT_FTP_PORT] { CLIENT_STRING, sizeof(CLIENT_STRING) - 1, '\r' }, + [IP_CT_FTP_PASV] { SERVER_STRING, sizeof(SERVER_STRING) - 1, ')' } +}; + +/* Returns 0, or length of numbers */ +static int try_number(const char *data, size_t dlen, u_int32_t array[6], + char term) +{ + u_int32_t i, len; + + /* Keep data pointing at next char. 
*/ + for (i = 0, len = 0; len < dlen; len++, data++) { + if (*data >= '0' && *data <= '9') { + array[i] = array[i]*10 + *data - '0'; + } + else if (*data == ',') + i++; + else { + /* Unexpected character; true if it's the + terminator and we're finished. */ + if (*data == term && i == 5) + return len; + + DEBUGP("Char %u (got %u nums) `%u' unexpected\n", + len, i, *data); + return 0; + } + } + + return 0; +} + +/* Return 1 for match, 0 for accept, -1 for partial. */ +static int find_pattern(const char *data, size_t dlen, + const char *pattern, size_t plen, + char term, + unsigned int *numoff, + unsigned int *numlen, + u_int32_t array[6]) +{ + if (dlen == 0) + return 0; + + if (dlen < plen) { + /* Short packet: try for partial? */ + if (strnicmp(data, pattern, dlen) == 0) + return -1; + else return 0; + } + + if (strnicmp(data, pattern, plen) != 0) { +#if 0 + size_t i; + + DEBUGP("ftp: string mismatch\n"); + for (i = 0; i < plen; i++) { + DEBUGFTP("ftp:char %u `%c'(%u) vs `%c'(%u)\n", + i, data[i], data[i], + pattern[i], pattern[i]); + } +#endif + return 0; + } + + *numoff = plen; + *numlen = try_number(data + plen, dlen - plen, array, term); + if (!*numlen) + return -1; + + return 1; +} + +/* FIXME: This should be in userspace. Later. 
*/ +static int help(const struct iphdr *iph, size_t len, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo) +{ + /* tcplen not negative guarenteed by ip_conntrack_tcp.c */ + struct tcphdr *tcph = (void *)iph + iph->ihl * 4; + const char *data = (const char *)tcph + tcph->doff * 4; + unsigned int tcplen = len - iph->ihl * 4; + unsigned int datalen = tcplen - tcph->doff * 4; + u_int32_t old_seq_aft_nl; + int old_seq_aft_nl_set; + u_int32_t array[6] = { 0 }; + int dir = CTINFO2DIR(ctinfo); + unsigned int matchlen, matchoff; + struct ip_conntrack_tuple t; + struct ip_ct_ftp *info = &ct->help.ct_ftp_info; + + /* Can't track connections formed before we registered */ + if (!info) + return NF_ACCEPT; + + /* Until there's been traffic both ways, don't look in packets. */ + if (ctinfo != IP_CT_ESTABLISHED + && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) { + DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo); + return NF_ACCEPT; + } + + /* Not whole TCP header? */ + if (tcplen < sizeof(struct tcphdr) || tcplen < tcph->doff*4) { + DEBUGP("ftp: tcplen = %u\n", (unsigned)tcplen); + return NF_ACCEPT; + } + + /* Checksum invalid? Ignore. 
*/ + /* FIXME: Source route IP option packets --RR */ + if (tcp_v4_check(tcph, tcplen, iph->saddr, iph->daddr, + csum_partial((char *)tcph, tcplen, 0))) { + DEBUGP("ftp_help: bad csum: %p %u %u.%u.%u.%u %u.%u.%u.%u\n", + tcph, tcplen, IP_PARTS(iph->saddr), + IP_PARTS(iph->daddr)); + return NF_ACCEPT; + } + + LOCK_BH(&ip_ftp_lock); + old_seq_aft_nl_set = info->seq_aft_nl_set[dir]; + old_seq_aft_nl = info->seq_aft_nl[dir]; + + DEBUGP("conntrack_ftp: datalen %u\n", datalen); + if ((datalen > 0) && (data[datalen-1] == '\n')) { + DEBUGP("conntrack_ftp: datalen %u ends in \\n\n", datalen); + if (!old_seq_aft_nl_set + || after(ntohl(tcph->seq) + datalen, old_seq_aft_nl)) { + DEBUGP("conntrack_ftp: updating nl to %u\n", + ntohl(tcph->seq) + datalen); + info->seq_aft_nl[dir] = ntohl(tcph->seq) + datalen; + info->seq_aft_nl_set[dir] = 1; + } + } + UNLOCK_BH(&ip_ftp_lock); + + if(!old_seq_aft_nl_set || + (ntohl(tcph->seq) != old_seq_aft_nl)) { + DEBUGP("ip_conntrack_ftp_help: wrong seq pos %s(%u)\n", + old_seq_aft_nl_set ? "":"(UNSET) ", old_seq_aft_nl); + return NF_ACCEPT; + } + + switch (find_pattern(data, datalen, + search[dir].pattern, + search[dir].plen, search[dir].term, + &matchoff, &matchlen, + array)) { + case -1: /* partial */ + /* We don't usually drop packets. After all, this is + connection tracking, not packet filtering. + However, it is neccessary for accurate tracking in + this case. 
*/ + DEBUGP("conntrack_ftp: partial `%.*s'\n", + (int)datalen, data); + return NF_DROP; + + case 0: /* no match */ + DEBUGP("ip_conntrack_ftp_help: no match\n"); + return NF_ACCEPT; + } + + DEBUGP("conntrack_ftp: match `%.*s' (%u bytes at %u)\n", + (int)matchlen, data + matchoff, + matchlen, ntohl(tcph->seq) + matchoff); + + /* Update the ftp info */ + LOCK_BH(&ip_ftp_lock); + info->is_ftp = 1; + info->seq = ntohl(tcph->seq) + matchoff; + info->len = matchlen; + info->ftptype = dir; + info->port = array[4] << 8 | array[5]; + + t = ((struct ip_conntrack_tuple) + { { ct->tuplehash[!dir].tuple.src.ip, + { 0 }, 0 }, + { htonl((array[0] << 24) | (array[1] << 16) + | (array[2] << 8) | array[3]), + { htons(array[4] << 8 | array[5]) }, + IPPROTO_TCP }}); + ip_conntrack_expect_related(ct, &t); + UNLOCK_BH(&ip_ftp_lock); + + return NF_ACCEPT; +} + +/* Returns TRUE if it wants to help this connection (tuple is the + tuple of REPLY packets from server). */ +static int ftp_will_help(const struct ip_conntrack_tuple *rtuple) +{ + return (rtuple->dst.protonum == IPPROTO_TCP + && rtuple->src.u.tcp.port == __constant_htons(21)); +} + +static struct ip_conntrack_helper ftp = { { NULL, NULL }, + ftp_will_help, + help }; + +static int __init init(void) +{ + return ip_conntrack_helper_register(&ftp); +} + +static void __exit fini(void) +{ + ip_conntrack_helper_unregister(&ftp); +} + +struct module *ip_conntrack_ftp = THIS_MODULE; +EXPORT_SYMBOL(ip_conntrack_ftp); +EXPORT_SYMBOL(ip_ftp_lock); + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_conntrack_proto_generic.c b/net/ipv4/netfilter/ip_conntrack_proto_generic.c new file mode 100644 index 000000000..77a491e34 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_proto_generic.c @@ -0,0 +1,60 @@ +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> + +#define GENERIC_TIMEOUT (3600*HZ) + +static int 
generic_pkt_to_tuple(const void *datah, size_t datalen, + struct ip_conntrack_tuple *tuple) +{ + tuple->src.u.all = 0; + tuple->dst.u.all = 0; + + return 1; +} + +static int generic_invert_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_tuple *orig) +{ + tuple->src.u.all = 0; + tuple->dst.u.all = 0; + + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static unsigned int generic_print_tuple(char *buffer, + const struct ip_conntrack_tuple *tuple) +{ + return 0; +} + +/* Print out the private part of the conntrack. */ +static unsigned int generic_print_conntrack(char *buffer, + const struct ip_conntrack *state) +{ + return 0; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int established(struct ip_conntrack *conntrack, + struct iphdr *iph, size_t len, + enum ip_conntrack_info conntrackinfo) +{ + ip_ct_refresh(conntrack, GENERIC_TIMEOUT); + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int new(struct ip_conntrack *conntrack, struct iphdr *iph, size_t len) +{ + return 1; +} + +struct ip_conntrack_protocol ip_conntrack_generic_protocol += { { NULL, NULL }, 0, "unknown", + generic_pkt_to_tuple, generic_invert_tuple, generic_print_tuple, + generic_print_conntrack, established, new, NULL }; + diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c new file mode 100644 index 000000000..1d1256be5 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -0,0 +1,111 @@ +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <linux/icmp.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> + +#define ICMP_TIMEOUT (30*HZ) + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) 
+#endif + +static int icmp_pkt_to_tuple(const void *datah, size_t datalen, + struct ip_conntrack_tuple *tuple) +{ + const struct icmphdr *hdr = datah; + + tuple->dst.u.icmp.type = hdr->type; + tuple->src.u.icmp.id = hdr->un.echo.id; + tuple->dst.u.icmp.code = hdr->code; + + return 1; +} + +static int icmp_invert_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_tuple *orig) +{ + /* Add 1; spaces filled with 0. */ + static u_int8_t invmap[] + = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1, + [ICMP_ECHOREPLY] = ICMP_ECHO + 1, + [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, + [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, + [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, + [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, + [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, + [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1}; + + if (orig->dst.u.icmp.type >= sizeof(invmap) + || !invmap[orig->dst.u.icmp.type]) + return 0; + + tuple->src.u.icmp.id = orig->src.u.icmp.id; + tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1; + tuple->dst.u.icmp.code = orig->dst.u.icmp.code; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static unsigned int icmp_print_tuple(char *buffer, + const struct ip_conntrack_tuple *tuple) +{ + return sprintf(buffer, "type=%u code=%u id=%u ", + tuple->dst.u.icmp.type, + tuple->dst.u.icmp.code, + ntohs(tuple->src.u.icmp.id)); +} + +/* Print out the private part of the conntrack. */ +static unsigned int icmp_print_conntrack(char *buffer, + const struct ip_conntrack *conntrack) +{ + return 0; +} + +/* Returns verdict for packet, or -1 for invalid. 
*/ +static int icmp_packet(struct ip_conntrack *ct, + struct iphdr *iph, size_t len, + enum ip_conntrack_info ctinfo) +{ + /* FIXME: Should keep count of orig - reply packets: if == 0, + destroy --RR */ + /* Delete connection immediately on reply: won't actually + vanish as we still have skb */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { + if (del_timer(&ct->timeout)) + ct->timeout.function((unsigned long)ct); + } else + ip_ct_refresh(ct, ICMP_TIMEOUT); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int icmp_new(struct ip_conntrack *conntrack, + struct iphdr *iph, size_t len) +{ + static u_int8_t valid_new[] + = { [ICMP_ECHO] = 1, + [ICMP_TIMESTAMP] = 1, + [ICMP_INFO_REQUEST] = 1, + [ICMP_ADDRESS] = 1 }; + + if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) + || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) { + /* Can't create a new ICMP `conn' with this. */ + DEBUGP("icmp: can't create new conn with type %u\n", + conntrack->tuplehash[0].tuple.dst.u.icmp.type); + DUMP_TUPLE(&conntrack->tuplehash[0].tuple); + return 0; + } + return 1; +} + +struct ip_conntrack_protocol ip_conntrack_protocol_icmp += { { NULL, NULL }, IPPROTO_ICMP, "icmp", + icmp_pkt_to_tuple, icmp_invert_tuple, icmp_print_tuple, + icmp_print_conntrack, icmp_packet, icmp_new, NULL }; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c new file mode 100644 index 000000000..3dd448252 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -0,0 +1,227 @@ +#define __NO_VERSION__ +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> +#include <linux/netfilter_ipv4/lockhelp.h> + +#if 0 +#define DEBUGP printk +#else +#define 
DEBUGP(format, args...) +#endif + +/* Protects conntrack->proto.tcp_state */ +static DECLARE_RWLOCK(tcp_lock); + +/* FIXME: Examine ipfilter's timeouts and conntrack transitions more + closely. They're more complex. --RR */ + +/* Actually, I believe that neither ipmasq (where this code is stolen + from) nor ipfilter do it exactly right. A new conntrack machine taking + into account packet loss (which creates uncertainty as to exactly + the conntrack of the connection) is required. RSN. --RR */ +enum tcp_conntrack { + TCP_CONNTRACK_NONE, + TCP_CONNTRACK_ESTABLISHED, + TCP_CONNTRACK_SYN_SENT, + TCP_CONNTRACK_SYN_RECV, + TCP_CONNTRACK_FIN_WAIT, + TCP_CONNTRACK_TIME_WAIT, + TCP_CONNTRACK_CLOSE, + TCP_CONNTRACK_CLOSE_WAIT, + TCP_CONNTRACK_LAST_ACK, + TCP_CONNTRACK_LISTEN, + TCP_CONNTRACK_MAX +}; + +static const char *tcp_conntrack_names[] = { + "NONE", + "ESTABLISHED", + "SYN_SENT", + "SYN_RECV", + "FIN_WAIT", + "TIME_WAIT", + "CLOSE", + "CLOSE_WAIT", + "LAST_ACK", + "LISTEN" +}; + +#define SECS *HZ +#define MINS * 60 SECS +#define HOURS * 60 MINS +#define DAYS * 24 HOURS + + +static unsigned long tcp_timeouts[] += { 30 MINS, /* TCP_CONNTRACK_NONE, */ + 5 DAYS, /* TCP_CONNTRACK_ESTABLISHED, */ + 2 MINS, /* TCP_CONNTRACK_SYN_SENT, */ + 60 SECS, /* TCP_CONNTRACK_SYN_RECV, */ + 2 MINS, /* TCP_CONNTRACK_FIN_WAIT, */ + 2 MINS, /* TCP_CONNTRACK_TIME_WAIT, */ + 10 SECS, /* TCP_CONNTRACK_CLOSE, */ + 60 SECS, /* TCP_CONNTRACK_CLOSE_WAIT, */ + 30 SECS, /* TCP_CONNTRACK_LAST_ACK, */ + 2 MINS, /* TCP_CONNTRACK_LISTEN, */ +}; + +#define sNO TCP_CONNTRACK_NONE +#define sES TCP_CONNTRACK_ESTABLISHED +#define sSS TCP_CONNTRACK_SYN_SENT +#define sSR TCP_CONNTRACK_SYN_RECV +#define sFW TCP_CONNTRACK_FIN_WAIT +#define sTW TCP_CONNTRACK_TIME_WAIT +#define sCL TCP_CONNTRACK_CLOSE +#define sCW TCP_CONNTRACK_CLOSE_WAIT +#define sLA TCP_CONNTRACK_LAST_ACK +#define sLI TCP_CONNTRACK_LISTEN +#define sIV TCP_CONNTRACK_MAX + +static enum tcp_conntrack tcp_conntracks[2][5][TCP_CONNTRACK_MAX] = { + 
{ +/* ORIGINAL */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI */ +/*syn*/ {sSS, sES, sSS, sES, sSS, sSS, sSS, sSS, sSS, sLI }, +/*fin*/ {sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI }, +/*ack*/ {sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sES }, +/*rst*/ {sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL }, +/*none*/{sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } + }, + { +/* REPLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI */ +/*syn*/ {sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR }, +/*fin*/ {sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI }, +/*ack*/ {sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI }, +/*rst*/ {sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI }, +/*none*/{sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } + } +}; + +static int tcp_pkt_to_tuple(const void *datah, size_t datalen, + struct ip_conntrack_tuple *tuple) +{ + const struct tcphdr *hdr = datah; + + tuple->src.u.tcp.port = hdr->source; + tuple->dst.u.tcp.port = hdr->dest; + + return 1; +} + +static int tcp_invert_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_tuple *orig) +{ + tuple->src.u.tcp.port = orig->dst.u.tcp.port; + tuple->dst.u.tcp.port = orig->src.u.tcp.port; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static unsigned int tcp_print_tuple(char *buffer, + const struct ip_conntrack_tuple *tuple) +{ + return sprintf(buffer, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.tcp.port), + ntohs(tuple->dst.u.tcp.port)); +} + +/* Print out the private part of the conntrack. 
*/ +static unsigned int tcp_print_conntrack(char *buffer, + const struct ip_conntrack *conntrack) +{ + enum tcp_conntrack state; + + READ_LOCK(&tcp_lock); + state = conntrack->proto.tcp_state; + READ_UNLOCK(&tcp_lock); + + return sprintf(buffer, "%s ", tcp_conntrack_names[state]); +} + +static unsigned int get_conntrack_index(const struct tcphdr *tcph) +{ + if (tcph->rst) return 3; + else if (tcph->syn) return 0; + else if (tcph->fin) return 1; + else if (tcph->ack) return 2; + else return 4; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int tcp_packet(struct ip_conntrack *conntrack, + struct iphdr *iph, size_t len, + enum ip_conntrack_info ctinfo) +{ + enum tcp_conntrack newconntrack; + struct tcphdr *tcph = (struct tcphdr *)((u_int32_t *)iph + iph->ihl); + + /* We're guaranteed to have the base header, but maybe not the + options. */ + if (len < (iph->ihl + tcph->doff) * 4) { + DEBUGP("ip_conntrack_tcp: Truncated packet.\n"); + return -1; + } + + WRITE_LOCK(&tcp_lock); + newconntrack + = tcp_conntracks + [CTINFO2DIR(ctinfo)] + [get_conntrack_index(tcph)][conntrack->proto.tcp_state]; + + /* Invalid */ + if (newconntrack == TCP_CONNTRACK_MAX) { + DEBUGP("ip_conntrack_tcp: Invalid dir=%i index=%u conntrack=%u\n", + CTINFO2DIR(ctinfo), get_conntrack_index(tcph), + conntrack->proto.tcp_state); + WRITE_UNLOCK(&tcp_lock); + return -1; + } + + conntrack->proto.tcp_state = newconntrack; + WRITE_UNLOCK(&tcp_lock); + + /* Refresh: need write lock to write to conntrack. */ + ip_ct_refresh(conntrack, tcp_timeouts[conntrack->proto.tcp_state]); + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. 
*/ +static int tcp_new(struct ip_conntrack *conntrack, + struct iphdr *iph, size_t len) +{ + enum tcp_conntrack newconntrack; + struct tcphdr *tcph = (struct tcphdr *)((u_int32_t *)iph + iph->ihl); + + /* Don't need lock here: this conntrack not in circulation yet */ + newconntrack + = tcp_conntracks[0][get_conntrack_index(tcph)] + [TCP_CONNTRACK_NONE]; + + /* Invalid: delete conntrack */ + if (newconntrack == TCP_CONNTRACK_MAX) { + DEBUGP("ip_conntrack_tcp: invalid new deleting.\n"); + return 0; + } else { + conntrack->proto.tcp_state = newconntrack; + ip_ct_refresh(conntrack, tcp_timeouts[conntrack->proto.tcp_state]); + } + return 1; +} + +struct ip_conntrack_protocol ip_conntrack_protocol_tcp += { { NULL, NULL }, IPPROTO_TCP, "tcp", + tcp_pkt_to_tuple, tcp_invert_tuple, tcp_print_tuple, tcp_print_conntrack, + tcp_packet, tcp_new, NULL }; + +int __init ip_conntrack_protocol_tcp_init(void) +{ + return 0; +} diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c new file mode 100644 index 000000000..688ae10fb --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -0,0 +1,65 @@ +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <linux/udp.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> + +#define UDP_TIMEOUT (60*HZ) + +static int udp_pkt_to_tuple(const void *datah, size_t datalen, + struct ip_conntrack_tuple *tuple) +{ + const struct udphdr *hdr = datah; + + tuple->src.u.udp.port = hdr->source; + tuple->dst.u.udp.port = hdr->dest; + + return 1; +} + +static int udp_invert_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_tuple *orig) +{ + tuple->src.u.udp.port = orig->dst.u.udp.port; + tuple->dst.u.udp.port = orig->src.u.udp.port; + return 1; +} + +/* Print out the per-protocol part of the tuple. 
*/ +static unsigned int udp_print_tuple(char *buffer, + const struct ip_conntrack_tuple *tuple) +{ + return sprintf(buffer, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.udp.port), + ntohs(tuple->dst.u.udp.port)); +} + +/* Print out the private part of the conntrack. */ +static unsigned int udp_print_conntrack(char *buffer, + const struct ip_conntrack *conntrack) +{ + return 0; +} + +/* Returns verdict for packet, and may modify conntracktype */ +static int udp_packet(struct ip_conntrack *conntrack, + struct iphdr *iph, size_t len, + enum ip_conntrack_info conntrackinfo) +{ + /* Refresh. */ + ip_ct_refresh(conntrack, UDP_TIMEOUT); + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int udp_new(struct ip_conntrack *conntrack, + struct iphdr *iph, size_t len) +{ + return 1; +} + +struct ip_conntrack_protocol ip_conntrack_protocol_udp += { { NULL, NULL }, IPPROTO_UDP, "udp", + udp_pkt_to_tuple, udp_invert_tuple, udp_print_tuple, udp_print_conntrack, + udp_packet, udp_new, NULL }; diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c new file mode 100644 index 000000000..ce79c3263 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -0,0 +1,297 @@ +/* This file contains all the functions required for the standalone + ip_conntrack module. + + These are not required by the compatibility layer. +*/ + +/* (c) 1999 Paul `Rusty' Russell. Licenced under the GNU General + Public Licence. 
*/ + +#ifdef MODULE +#define EXPORT_SYMTAB +#endif +#include <linux/types.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <linux/version.h> +#include <net/checksum.h> + +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock) +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock) + +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> +#include <linux/netfilter_ipv4/ip_conntrack_core.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/listhelp.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +struct module *ip_conntrack_module = THIS_MODULE; + +static unsigned int +print_tuple(char *buffer, const struct ip_conntrack_tuple *tuple, + struct ip_conntrack_protocol *proto) +{ + int len; + + len = sprintf(buffer, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ", + NIPQUAD(tuple->src.ip), NIPQUAD(tuple->dst.ip)); + + len += proto->print_tuple(buffer + len, tuple); + + return len; +} + +/* FIXME: Don't print source proto part. --RR */ +static unsigned int +print_expect(char *buffer, const struct ip_conntrack_expect *expect) +{ + unsigned int len; + + len = sprintf(buffer, "EXPECTING: proto=%u ", + expect->tuple.dst.protonum); + len += print_tuple(buffer + len, &expect->tuple, + __find_proto(expect->tuple.dst.protonum)); + len += sprintf(buffer + len, "\n"); + return len; +} + +static unsigned int +print_conntrack(char *buffer, const struct ip_conntrack *conntrack) +{ + unsigned int len; + struct ip_conntrack_protocol *proto + = __find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.protonum); + + len = sprintf(buffer, "%-8s %u %lu ", + proto->name, + conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.protonum, + timer_pending(&conntrack->timeout) + ? 
(conntrack->timeout.expires - jiffies)/HZ : 0); + + len += proto->print_conntrack(buffer + len, conntrack); + len += print_tuple(buffer + len, + &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + proto); + if (!(conntrack->status & IPS_SEEN_REPLY)) + len += sprintf(buffer + len, "[UNREPLIED] "); + len += print_tuple(buffer + len, + &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple, + proto); + len += sprintf(buffer + len, "\n"); + + return len; +} + +/* Returns true when finished. */ +static int +conntrack_iterate(const struct ip_conntrack_tuple_hash *hash, + char *buffer, off_t offset, off_t *upto, + unsigned int *len, unsigned int maxlen) +{ + unsigned int newlen; + IP_NF_ASSERT(hash->ctrack); + + MUST_BE_READ_LOCKED(&ip_conntrack_lock); + + /* Only count originals */ + if (DIRECTION(hash)) + return 0; + + if ((*upto)++ < offset) + return 0; + + newlen = print_conntrack(buffer + *len, hash->ctrack); + if (*len + newlen > maxlen) + return 1; + else *len += newlen; + + return 0; +} + +static int +list_conntracks(char *buffer, char **start, off_t offset, int length) +{ + unsigned int i; + unsigned int len = 0; + off_t upto = 0; + struct list_head *e; + + READ_LOCK(&ip_conntrack_lock); + /* Traverse hash; print originals then reply. */ + for (i = 0; i < ip_conntrack_htable_size; i++) { + if (LIST_FIND(&ip_conntrack_hash[i], conntrack_iterate, + struct ip_conntrack_tuple_hash *, + buffer, offset, &upto, &len, length)) + goto finished; + } + + /* Now iterate through expecteds. 
*/ + for (e = expect_list.next; e != &expect_list; e = e->next) { + unsigned int last_len; + struct ip_conntrack_expect *expect + = (struct ip_conntrack_expect *)e; + if (upto++ < offset) continue; + + last_len = len; + len += print_expect(buffer + len, expect); + if (len > length) { + len = last_len; + goto finished; + } + } + + finished: + READ_UNLOCK(&ip_conntrack_lock); + + /* `start' hack - see fs/proc/generic.c line ~165 */ + *start = (char *)((unsigned int)upto - offset); + return len; +} + +static unsigned int ip_refrag(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct rtable *rt = (struct rtable *)(*pskb)->dst; + + /* Local packets are never produced too large for their + interface. We degfragment them at LOCAL_OUT, however, + so we have to refragment them here. */ + if ((*pskb)->len > rt->u.dst.pmtu) { + DEBUGP("ip_conntrack: refragm %p (size %u) to %u (okfn %p)\n", + *pskb, (*pskb)->len, rt->u.dst.pmtu, okfn); + /* No hook can be after us, so this should be OK. */ + ip_fragment(*pskb, okfn); + return NF_STOLEN; + } + return NF_ACCEPT; +} + +/* Connection tracking may drop packets, but never alters them, so + make it the first hook. */ +static struct nf_hook_ops ip_conntrack_in_ops += { { NULL, NULL }, ip_conntrack_in, PF_INET, NF_IP_PRE_ROUTING, + NF_IP_PRI_CONNTRACK }; +static struct nf_hook_ops ip_conntrack_local_out_ops += { { NULL, NULL }, ip_conntrack_in, PF_INET, NF_IP_LOCAL_OUT, + NF_IP_PRI_CONNTRACK }; +/* Refragmenter; last chance. 
*/ +static struct nf_hook_ops ip_conntrack_out_ops += { { NULL, NULL }, ip_refrag, PF_INET, NF_IP_POST_ROUTING, NF_IP_PRI_LAST }; + +static int init_or_cleanup(int init) +{ + int ret = 0; + + if (!init) goto cleanup; + + ret = ip_conntrack_init(); + if (ret < 0) + goto cleanup_nothing; + + proc_net_create("ip_conntrack",0,list_conntracks); + ret = nf_register_hook(&ip_conntrack_in_ops); + if (ret < 0) { + printk("ip_conntrack: can't register in hook.\n"); + goto cleanup_init; + } + ret = nf_register_hook(&ip_conntrack_local_out_ops); + if (ret < 0) { + printk("ip_conntrack: can't register local out hook.\n"); + goto cleanup_inops; + } + ret = nf_register_hook(&ip_conntrack_out_ops); + if (ret < 0) { + printk("ip_conntrack: can't register post-routing hook.\n"); + goto cleanup_inandlocalops; + } + + return ret; + + cleanup: + nf_unregister_hook(&ip_conntrack_out_ops); + cleanup_inandlocalops: + nf_unregister_hook(&ip_conntrack_local_out_ops); + cleanup_inops: + nf_unregister_hook(&ip_conntrack_in_ops); + cleanup_init: + proc_net_remove("ip_conntrack"); + ip_conntrack_cleanup(); + cleanup_nothing: + return ret; +} + +/* FIXME: Allow NULL functions and sub in pointers to generic for + them. 
--RR */ +int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto) +{ + int ret = 0; + struct list_head *i; + + WRITE_LOCK(&ip_conntrack_lock); + for (i = protocol_list.next; i != &protocol_list; i = i->next) { + if (((struct ip_conntrack_protocol *)i)->proto + == proto->proto) { + ret = -EBUSY; + goto out; + } + } + + list_prepend(&protocol_list, proto); + MOD_INC_USE_COUNT; + + out: + WRITE_UNLOCK(&ip_conntrack_lock); + return ret; +} + +/* FIXME: Implement this --RR */ +#if 0 +void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto) +{ +} +#endif + +static int __init init(void) +{ + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +module_init(init); +module_exit(fini); + +EXPORT_SYMBOL(ip_conntrack_protocol_register); +EXPORT_SYMBOL(invert_tuplepr); +EXPORT_SYMBOL(ip_conntrack_alter_reply); +EXPORT_SYMBOL(ip_conntrack_destroyed); +EXPORT_SYMBOL(ip_conntrack_get); +EXPORT_SYMBOL(ip_conntrack_module); +EXPORT_SYMBOL(ip_conntrack_helper_register); +EXPORT_SYMBOL(ip_conntrack_helper_unregister); +EXPORT_SYMBOL(ip_conntrack_lock); +EXPORT_SYMBOL(find_proto); +EXPORT_SYMBOL(get_tuple); +EXPORT_SYMBOL(ip_ct_selective_cleanup); +EXPORT_SYMBOL(ip_ct_refresh); +EXPORT_SYMBOL(ip_conntrack_expect_related); +EXPORT_SYMBOL(ip_conntrack_tuple_taken); +EXPORT_SYMBOL(ip_ct_gather_frags); diff --git a/net/ipv4/netfilter/ip_fw_compat.c b/net/ipv4/netfilter/ip_fw_compat.c new file mode 100644 index 000000000..72dc3d816 --- /dev/null +++ b/net/ipv4/netfilter/ip_fw_compat.c @@ -0,0 +1,238 @@ +/* Compatibility framework for ipchains and ipfwadm support; designed + to look as much like the 2.2 infrastructure as possible. 
*/ +struct notifier_block; + +#include <linux/netfilter_ipv4.h> +#include <linux/ip.h> +#include <net/icmp.h> +#include <linux/if.h> +#include <linux/inetdevice.h> +#include <linux/netdevice.h> +#include <linux/module.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/netfilter_ipv4/compat_firewall.h> +#include <linux/netfilter_ipv4/ip_conntrack.h> + +EXPORT_NO_SYMBOLS; + +static struct firewall_ops *fwops; + +/* From ip_fw_compat_redir.c */ +extern unsigned int +do_redirect(struct sk_buff *skb, + const struct net_device *dev, + u_int16_t redirpt); + +extern void +check_for_redirect(struct sk_buff *skb); + +extern void +check_for_unredirect(struct sk_buff *skb); + +/* From ip_fw_compat_masq.c */ +extern unsigned int +do_masquerade(struct sk_buff **pskb, const struct net_device *dev); + +extern unsigned int +check_for_demasq(struct sk_buff **pskb); + +extern int __init masq_init(void); +extern void masq_cleanup(void); + +/* They call these; we do what they want. */ +int register_firewall(int pf, struct firewall_ops *fw) +{ + if (pf != PF_INET) { + printk("Attempt to register non-IP firewall module.\n"); + return -EINVAL; + } + if (fwops) { + printk("Attempt to register multiple firewall modules.\n"); + return -EBUSY; + } + + fwops = fw; + return 0; +} + +int unregister_firewall(int pf, struct firewall_ops *fw) +{ + fwops = NULL; + return 0; +} + +static unsigned int +fw_in(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + int ret = FW_BLOCK; + u_int16_t redirpt; + + (*pskb)->nfcache |= NFC_UNKNOWN; + (*pskb)->ip_summed = CHECKSUM_NONE; + + switch (hooknum) { + case NF_IP_PRE_ROUTING: + if (fwops->fw_acct_in) + fwops->fw_acct_in(fwops, PF_INET, + (struct net_device *)in, + (*pskb)->nh.raw, &redirpt, pskb); + + if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + *pskb = ip_ct_gather_frags(*pskb); + + if (!*pskb) + return NF_STOLEN; + } + + ret = 
fwops->fw_input(fwops, PF_INET, (struct net_device *)in, + (*pskb)->nh.raw, &redirpt, pskb); + break; + + case NF_IP_FORWARD: + /* Connection will only be set if it was + demasqueraded: if so, skip forward chain. */ + if ((*pskb)->nfct) + ret = FW_ACCEPT; + else ret = fwops->fw_forward(fwops, PF_INET, + (struct net_device *)out, + (*pskb)->nh.raw, &redirpt, pskb); + break; + + case NF_IP_POST_ROUTING: + ret = fwops->fw_output(fwops, PF_INET, + (struct net_device *)out, + (*pskb)->nh.raw, &redirpt, pskb); + if (fwops->fw_acct_out && (ret == FW_ACCEPT || ret == FW_SKIP)) + fwops->fw_acct_out(fwops, PF_INET, + (struct net_device *)in, + (*pskb)->nh.raw, &redirpt, pskb); + break; + } + + switch (ret) { + case FW_REJECT: { + /* Alexey says: + * + * Generally, routing is THE FIRST thing to make, when + * packet enters IP stack. Before packet is routed you + * cannot call any service routines from IP stack. */ + struct iphdr *iph = (*pskb)->nh.iph; + + if ((*pskb)->dst != NULL + || ip_route_input(*pskb, iph->daddr, iph->saddr, iph->tos, + (struct net_device *)in) == 0) + icmp_send(*pskb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, + 0); + return NF_DROP; + } + + case FW_ACCEPT: + case FW_SKIP: + if (hooknum == NF_IP_PRE_ROUTING) { + check_for_demasq(pskb); + check_for_redirect(*pskb); + } else if (hooknum == NF_IP_POST_ROUTING) + check_for_unredirect(*pskb); + + return NF_ACCEPT; + + case FW_MASQUERADE: + if (hooknum == NF_IP_FORWARD) + return do_masquerade(pskb, out); + else return NF_ACCEPT; + + case FW_REDIRECT: + if (hooknum == NF_IP_PRE_ROUTING) + return do_redirect(*pskb, in, redirpt); + else return NF_ACCEPT; + + default: + /* FW_BLOCK */ + return NF_DROP; + } +} + +extern int ip_fw_ctl(int optval, void *user, unsigned int len); + +static int sock_fn(struct sock *sk, int optval, void *user, unsigned int len) +{ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + return -ip_fw_ctl(optval, user, len); +} + +static struct nf_hook_ops preroute_ops += { { NULL, NULL }, fw_in, 
PF_INET, NF_IP_PRE_ROUTING, NF_IP_PRI_FILTER }; + +static struct nf_hook_ops postroute_ops += { { NULL, NULL }, fw_in, PF_INET, NF_IP_POST_ROUTING, NF_IP_PRI_FILTER }; + +static struct nf_hook_ops forward_ops += { { NULL, NULL }, fw_in, PF_INET, NF_IP_FORWARD, NF_IP_PRI_FILTER }; + +static struct nf_sockopt_ops sock_ops += { { NULL, NULL }, PF_INET, 64, 64 + 1024 + 1, &sock_fn, 0, 0, NULL, + 0, NULL }; + +extern int ipfw_init_or_cleanup(int init); + +static int init_or_cleanup(int init) +{ + int ret = 0; + + if (!init) goto cleanup; + + ret = nf_register_sockopt(&sock_ops); + + if (ret < 0) + goto cleanup_nothing; + + ret = ipfw_init_or_cleanup(1); + if (ret < 0) + goto cleanup_sockopt; + + ret = masq_init(); + if (ret < 0) + goto cleanup_ipfw; + + nf_register_hook(&preroute_ops); + nf_register_hook(&postroute_ops); + nf_register_hook(&forward_ops); + + return ret; + + cleanup: + nf_unregister_hook(&preroute_ops); + nf_unregister_hook(&postroute_ops); + nf_unregister_hook(&forward_ops); + + masq_cleanup(); + + cleanup_ipfw: + ipfw_init_or_cleanup(0); + + cleanup_sockopt: + nf_unregister_sockopt(&sock_ops); + + cleanup_nothing: + return ret; +} + +static int __init init(void) +{ + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_fw_compat_masq.c b/net/ipv4/netfilter/ip_fw_compat_masq.c new file mode 100644 index 000000000..e0074c1e2 --- /dev/null +++ b/net/ipv4/netfilter/ip_fw_compat_masq.c @@ -0,0 +1,288 @@ +/* Masquerading compatibility layer. + + Note that there are no restrictions on other programs binding to + ports 61000:65095 (in 2.0 and 2.2 they get EADDRINUSE). Just DONT + DO IT. 
+ */ +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/icmp.h> +#include <linux/udp.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/proc_fs.h> +#include <linux/version.h> +#include <net/route.h> + +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock) +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock) + +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_core.h> +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_core.h> +#include <linux/netfilter_ipv4/listhelp.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +unsigned int +do_masquerade(struct sk_buff **pskb, const struct net_device *dev) +{ + struct iphdr *iph = (*pskb)->nh.iph; + struct ip_nat_info *info; + enum ip_conntrack_info ctinfo; + struct ip_conntrack *ct; + unsigned int ret; + + /* Sorry, only ICMP, TCP and UDP. */ + if (iph->protocol != IPPROTO_ICMP + && iph->protocol != IPPROTO_TCP + && iph->protocol != IPPROTO_UDP) + return NF_DROP; + + /* Feed it to connection tracking; in fact we're in NF_IP_FORWARD, + but connection tracking doesn't expect that */ + ret = ip_conntrack_in(NF_IP_POST_ROUTING, pskb, dev, NULL, NULL); + if (ret != NF_ACCEPT) { + DEBUGP("ip_conntrack_in returned %u.\n", ret); + return ret; + } + + ct = ip_conntrack_get(*pskb, &ctinfo); + + if (!ct) { + DEBUGP("ip_conntrack_in set to invalid conntrack.\n"); + return NF_DROP; + } + + info = &ct->nat.info; + + WRITE_LOCK(&ip_nat_lock); + /* Setup the masquerade, if not already */ + if (!info->initialized) { + u_int32_t newsrc; + struct rtable *rt; + struct ip_nat_multi_range range; + + /* Pass 0 instead of saddr, since it's going to be changed + anyway. 
*/ + if (ip_route_output(&rt, iph->daddr, 0, 0, 0) != 0) { + DEBUGP("ipnat_rule_masquerade: Can't reroute.\n"); + return NF_DROP; + } + newsrc = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, + RT_SCOPE_UNIVERSE); + ip_rt_put(rt); + range = ((struct ip_nat_multi_range) + { 1, + {{IP_NAT_RANGE_MAP_IPS|IP_NAT_RANGE_PROTO_SPECIFIED, + newsrc, newsrc, + { htons(61000) }, { htons(65095) } } } }); + + ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); + place_in_hashes(ct, info); + info->initialized = 1; + } else + DEBUGP("Masquerading already done on this conn.\n"); + WRITE_UNLOCK(&ip_nat_lock); + + return do_bindings(ct, ctinfo, info, NF_IP_POST_ROUTING, pskb); +} + +unsigned int +check_for_demasq(struct sk_buff **pskb) +{ + struct ip_conntrack_tuple tuple; + struct iphdr *iph = (*pskb)->nh.iph; + struct ip_conntrack_protocol *protocol; + struct ip_conntrack_tuple_hash *h; + enum ip_conntrack_info ctinfo; + int ret; + + protocol = find_proto(iph->protocol); + + /* We don't feed packets to conntrack system unless we know + they're part of an connection already established by an + explicit masq command. */ + switch (iph->protocol) { + case IPPROTO_ICMP: + /* ICMP errors. */ + if (icmp_error_track(*pskb)) { + /* If it is valid, tranlsate it */ + if ((*pskb)->nfct) { + struct ip_conntrack *ct + = (struct ip_conntrack *) + (*pskb)->nfct->master; + enum ip_conntrack_dir dir; + + if ((*pskb)->nfct-ct->infos >= IP_CT_IS_REPLY) + dir = IP_CT_DIR_REPLY; + else + dir = IP_CT_DIR_ORIGINAL; + + icmp_reply_translation(*pskb, + ct, + NF_IP_PRE_ROUTING, + dir); + } + return NF_ACCEPT; + } + /* Fall thru... */ + case IPPROTO_TCP: + case IPPROTO_UDP: + if (!get_tuple(iph, (*pskb)->len, &tuple, protocol)) { + printk("ip_fw_compat_masq: Couldn't get tuple\n"); + return NF_ACCEPT; + } + break; + + default: + /* Not ours... */ + return NF_ACCEPT; + } + h = ip_conntrack_find_get(&tuple, NULL); + + /* MUST be found, and MUST be reply. 
*/ + if (h && DIRECTION(h) == 1) { + ret = ip_conntrack_in(NF_IP_PRE_ROUTING, pskb, + NULL, NULL, NULL); + + /* Put back the reference gained from find_get */ + nf_conntrack_put(&h->ctrack->infos[0]); + if (ret == NF_ACCEPT) { + struct ip_conntrack *ct; + ct = ip_conntrack_get(*pskb, &ctinfo); + + if (ct) { + struct ip_nat_info *info = &ct->nat.info; + + do_bindings(ct, ctinfo, info, + NF_IP_PRE_ROUTING, + pskb); + } else + printk("ip_fw_compat_masq: conntrack" + " didn't like\n"); + } + } else { + if (h) + /* Put back the reference gained from find_get */ + nf_conntrack_put(&h->ctrack->infos[0]); + ret = NF_ACCEPT; + } + + return ret; +} + +int ip_fw_masq_timeouts(void *user, int len) +{ + printk("Sorry: masquerading timeouts set 5DAYS/2MINS/60SECS\n"); + return 0; +} + +static const char *masq_proto_name(u_int16_t protonum) +{ + switch (protonum) { + case IPPROTO_TCP: return "TCP"; + case IPPROTO_UDP: return "UDP"; + case IPPROTO_ICMP: return "ICMP"; + default: return "MORE-CAFFIENE-FOR-RUSTY"; + } +} + +static unsigned int +print_masq(char *buffer, const struct ip_conntrack *conntrack) +{ + char temp[129]; + + /* This is for backwards compatibility, but ick!. + We should never export jiffies to userspace. + */ + sprintf(temp,"%s %08X:%04X %08X:%04X %04X %08X %6d %6d %7lu", + masq_proto_name(conntrack->tuplehash[0].tuple.dst.protonum), + ntohl(conntrack->tuplehash[0].tuple.src.ip), + ntohs(conntrack->tuplehash[0].tuple.src.u.all), + ntohl(conntrack->tuplehash[0].tuple.dst.ip), + ntohs(conntrack->tuplehash[0].tuple.dst.u.all), + ntohs(conntrack->tuplehash[1].tuple.dst.u.all), + /* Sorry, no init_seq, delta or previous_delta (yet). */ + 0, 0, 0, + conntrack->timeout.expires - jiffies); + + return sprintf(buffer, "%-127s\n", temp); +} + +/* Returns true when finished. 
*/ +static int +masq_iterate(const struct ip_conntrack_tuple_hash *hash, + char *buffer, off_t offset, off_t *upto, + unsigned int *len, unsigned int maxlen) +{ + unsigned int newlen; + + IP_NF_ASSERT(hash->ctrack); + + /* Only count originals */ + if (DIRECTION(hash)) + return 0; + + if ((*upto)++ < offset) + return 0; + + newlen = print_masq(buffer + *len, hash->ctrack); + if (*len + newlen > maxlen) + return 1; + else *len += newlen; + + return 0; +} + +/* Everything in the hash is masqueraded. */ +static int +masq_procinfo(char *buffer, char **start, off_t offset, int length) +{ + unsigned int i; + int len = 0; + off_t upto = 0; + + READ_LOCK(&ip_conntrack_lock); + /* Traverse hash; print originals then reply. */ + for (i = 0; i < ip_conntrack_htable_size; i++) { + if (LIST_FIND(&ip_conntrack_hash[i], masq_iterate, + struct ip_conntrack_tuple_hash *, + buffer, offset, &upto, &len, length)) + break; + } + READ_UNLOCK(&ip_conntrack_lock); + + /* `start' hack - see fs/proc/generic.c line ~165 */ + *start = (char *)((unsigned int)upto - offset); + return len; +} + +int __init masq_init(void) +{ + int ret; + + ret = ip_conntrack_init(); + if (ret == 0) { + ret = ip_nat_init(); + if (ret == 0) + proc_net_create("ip_masquerade", 0, masq_procinfo); + else + ip_conntrack_cleanup(); + } + + return ret; +} + +void masq_cleanup(void) +{ + ip_nat_cleanup(); + ip_conntrack_cleanup(); + proc_net_remove("ip_masquerade"); +} diff --git a/net/ipv4/netfilter/ip_fw_compat_redir.c b/net/ipv4/netfilter/ip_fw_compat_redir.c new file mode 100644 index 000000000..d4d910e77 --- /dev/null +++ b/net/ipv4/netfilter/ip_fw_compat_redir.c @@ -0,0 +1,284 @@ +/* This is a file to handle the "simple" NAT cases (redirect and + masquerade) required for the compatibility layer. + + `bind to foreign address' and `getpeername' hacks are not + supported. + + FIXME: Timing is overly simplistic. If anyone complains, make it + use conntrack. 
+*/ +#include <linux/config.h> +#include <linux/netfilter.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <linux/tcp.h> +#include <net/checksum.h> +#include <linux/timer.h> +#include <linux/netdevice.h> +#include <linux/if.h> +#include <linux/in.h> + +#include <linux/netfilter_ipv4/lockhelp.h> + +static DECLARE_LOCK(redir_lock); +#define ASSERT_READ_LOCK(x) MUST_BE_LOCKED(&redir_lock) +#define ASSERT_WRITE_LOCK(x) MUST_BE_LOCKED(&redir_lock) + +#include <linux/netfilter_ipv4/listhelp.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +#ifdef CONFIG_NETFILTER_DEBUG +#define IP_NF_ASSERT(x) \ +do { \ + if (!(x)) \ + /* Wooah! I'm tripping my conntrack in a frenzy of \ + netplay... */ \ + printk("ASSERT: %s:%i(%s)\n", \ + __FILE__, __LINE__, __FUNCTION__); \ +} while(0); +#else +#define IP_NF_ASSERT(x) +#endif + +static u_int16_t +cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck) +{ + u_int32_t diffs[] = { oldvalinv, newval }; + return csum_fold(csum_partial((char *)diffs, sizeof(diffs), + oldcheck^0xFFFF)); +} + +struct redir_core { + u_int32_t orig_srcip, orig_dstip; + u_int16_t orig_sport, orig_dport; + + u_int32_t new_dstip; + u_int16_t new_dport; +}; + +struct redir +{ + struct list_head list; + struct redir_core core; + struct timer_list destroyme; +}; + +static LIST_HEAD(redirs); + +static int +redir_cmp(const struct redir *i, + u_int32_t orig_srcip, u_int32_t orig_dstip, + u_int16_t orig_sport, u_int16_t orig_dport) +{ + return (i->core.orig_srcip == orig_srcip + && i->core.orig_dstip == orig_dstip + && i->core.orig_sport == orig_sport + && i->core.orig_dport == orig_dport); +} + +/* Search for an existing redirection of the TCP packet. 
*/ +static struct redir * +find_redir(u_int32_t orig_srcip, u_int32_t orig_dstip, + u_int16_t orig_sport, u_int16_t orig_dport) +{ + return LIST_FIND(&redirs, redir_cmp, struct redir *, + orig_srcip, orig_dstip, orig_sport, orig_dport); +} + +static void do_tcp_redir(struct sk_buff *skb, struct redir *redir) +{ + struct iphdr *iph = skb->nh.iph; + struct tcphdr *tcph = (struct tcphdr *)((u_int32_t *)iph + + iph->ihl); + + tcph->check = cheat_check(~redir->core.orig_dstip, + redir->core.new_dstip, + cheat_check(redir->core.orig_dport ^ 0xFFFF, + redir->core.new_dport, + tcph->check)); + iph->check = cheat_check(~redir->core.orig_dstip, + redir->core.new_dstip, iph->check); + tcph->dest = redir->core.new_dport; + iph->daddr = redir->core.new_dstip; + + skb->nfcache |= NFC_ALTERED; +} + +static int +unredir_cmp(const struct redir *i, + u_int32_t new_dstip, u_int32_t orig_srcip, + u_int16_t new_dport, u_int16_t orig_sport) +{ + return (i->core.orig_srcip == orig_srcip + && i->core.new_dstip == new_dstip + && i->core.orig_sport == orig_sport + && i->core.new_dport == new_dport); +} + +/* Match reply packet against redir */ +static struct redir * +find_unredir(u_int32_t new_dstip, u_int32_t orig_srcip, + u_int16_t new_dport, u_int16_t orig_sport) +{ + return LIST_FIND(&redirs, unredir_cmp, struct redir *, + new_dstip, orig_srcip, new_dport, orig_sport); +} + +/* `unredir' a reply packet. */ +static void do_tcp_unredir(struct sk_buff *skb, struct redir *redir) +{ + struct iphdr *iph = skb->nh.iph; + struct tcphdr *tcph = (struct tcphdr *)((u_int32_t *)iph + + iph->ihl); + + tcph->check = cheat_check(~redir->core.new_dstip, + redir->core.orig_dstip, + cheat_check(redir->core.new_dport ^ 0xFFFF, + redir->core.orig_dport, + tcph->check)); + iph->check = cheat_check(~redir->core.new_dstip, + redir->core.orig_dstip, + iph->check); + tcph->source = redir->core.orig_dport; + iph->saddr = redir->core.orig_dstip; + + skb->nfcache |= NFC_ALTERED; +} + +/* REDIRECT a packet. 
*/ +unsigned int +do_redirect(struct sk_buff *skb, + const struct net_device *dev, + u_int16_t redirpt) +{ + struct iphdr *iph = skb->nh.iph; + u_int32_t newdst; + + /* Figure out address: not loopback. */ + if (!dev) + return NF_DROP; + + /* Grab first address on interface. */ + newdst = ((struct in_device *)dev->ip_ptr)->ifa_list->ifa_local; + + switch (iph->protocol) { + case IPPROTO_UDP: { + /* Simple mangle. */ + struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + + iph->ihl); + + udph->check = cheat_check(~iph->daddr, newdst, + cheat_check(udph->dest ^ 0xFFFF, + redirpt, + udph->check)); + iph->check = cheat_check(~iph->daddr, newdst, iph->check); + udph->dest = redirpt; + iph->daddr = newdst; + + skb->nfcache |= NFC_ALTERED; + return NF_ACCEPT; + } + case IPPROTO_TCP: { + /* Mangle, maybe record. */ + struct tcphdr *tcph = (struct tcphdr *)((u_int32_t *)iph + + iph->ihl); + struct redir *redir; + int ret; + + DEBUGP("Doing tcp redirect. %08X:%u %08X:%u -> %08X:%u\n", + iph->saddr, tcph->source, iph->daddr, tcph->dest, + newdst, redirpt); + LOCK_BH(&redir_lock); + redir = find_redir(iph->saddr, iph->daddr, + tcph->source, tcph->dest); + + if (!redir) { + redir = kmalloc(sizeof(struct redir), GFP_ATOMIC); + if (!redir) { + ret = NF_DROP; + goto out; + } + list_prepend(&redirs, redir); + init_timer(&redir->destroyme); + } + /* In case mangling has changed, rewrite this part. */ + redir->core = ((struct redir_core) + { iph->saddr, iph->daddr, + tcph->source, tcph->dest, + newdst, redirpt }); + do_tcp_redir(skb, redir); + ret = NF_ACCEPT; + + out: + UNLOCK_BH(&redir_lock); + return ret; + } + + default: /* give up if not TCP or UDP. */ + return NF_DROP; + } +} + +static void destroyme(unsigned long me) +{ + LOCK_BH(&redir_lock); + LIST_DELETE(&redirs, (struct redir *)me); + UNLOCK_BH(&redir_lock); +} + +/* Incoming packet: is it a reply to a masqueraded connection, or + part of an already-redirected TCP connection? 
*/ +void +check_for_redirect(struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + struct tcphdr *tcph = (struct tcphdr *)((u_int32_t *)iph + + iph->ihl); + struct redir *redir; + + if (iph->protocol != IPPROTO_TCP) + return; + + LOCK_BH(&redir_lock); + redir = find_redir(iph->saddr, iph->daddr, tcph->source, tcph->dest); + if (redir) { + DEBUGP("Doing tcp redirect again.\n"); + do_tcp_redir(skb, redir); + if (tcph->rst || tcph->fin) { + redir->destroyme.function = destroyme; + redir->destroyme.data = (unsigned long)redir; + mod_timer(&redir->destroyme, 75*HZ); + } + } + UNLOCK_BH(&redir_lock); +} + +void +check_for_unredirect(struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + struct tcphdr *tcph = (struct tcphdr *)((u_int32_t *)iph + + iph->ihl); + struct redir *redir; + + if (iph->protocol != IPPROTO_TCP) + return; + + LOCK_BH(&redir_lock); + redir = find_unredir(iph->saddr, iph->daddr, tcph->source, tcph->dest); + if (redir) { + DEBUGP("Doing tcp unredirect.\n"); + do_tcp_unredir(skb, redir); + if (tcph->rst || tcph->fin) { + redir->destroyme.function = destroyme; + redir->destroyme.data = (unsigned long)redir; + mod_timer(&redir->destroyme, 75*HZ); + } + } + UNLOCK_BH(&redir_lock); +} diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c new file mode 100644 index 000000000..996e5a7ff --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -0,0 +1,855 @@ +/* NAT for netfilter; shared with compatibility layer. */ + +/* (c) 1999 Paul `Rusty' Russell. Licenced under the GNU General + Public Licence. 
*/ +#ifdef MODULE +#define __NO_VERSION__ +#endif +#include <linux/version.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/timer.h> +#include <linux/skbuff.h> +#include <linux/netfilter_ipv4.h> +#include <linux/brlock.h> +#include <net/checksum.h> +#include <net/icmp.h> +#include <net/ip.h> +#include <net/tcp.h> /* For tcp_prot in getorigdst */ + +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) + +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_protocol.h> +#include <linux/netfilter_ipv4/ip_nat_core.h> +#include <linux/netfilter_ipv4/ip_nat_helper.h> +#include <linux/netfilter_ipv4/listhelp.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +DECLARE_RWLOCK(ip_nat_lock); + +#define IP_NAT_HTABLE_SIZE 64 + +static struct list_head bysource[IP_NAT_HTABLE_SIZE]; +static struct list_head byipsproto[IP_NAT_HTABLE_SIZE]; +LIST_HEAD(protos); +static LIST_HEAD(helpers); + +extern struct ip_nat_protocol unknown_nat_protocol; + +/* We keep extra hashes for each conntrack, for fast searching. */ +static inline size_t +hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto) +{ + /* Modified src and dst, to ensure we don't create two + identical streams. */ + return (src + dst + proto) % IP_NAT_HTABLE_SIZE; +} + +static inline size_t +hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto) +{ + /* Original src, to ensure we map it consistently if poss. */ + return (manip->ip + manip->u.all + proto) % IP_NAT_HTABLE_SIZE; +} + +/* Noone using conntrack by the time this called. 
*/ +static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn) +{ + struct ip_nat_info *info = &conn->nat.info; + + if (!info->initialized) + return; + + IP_NF_ASSERT(info->bysource.conntrack); + IP_NF_ASSERT(info->byipsproto.conntrack); + + WRITE_LOCK(&ip_nat_lock); + LIST_DELETE(&bysource[hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.src, + conn->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.protonum)], + &info->bysource); + + LIST_DELETE(&byipsproto + [hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY] + .tuple.src.ip, + conn->tuplehash[IP_CT_DIR_REPLY] + .tuple.dst.ip, + conn->tuplehash[IP_CT_DIR_REPLY] + .tuple.dst.protonum)], + &info->byipsproto); + WRITE_UNLOCK(&ip_nat_lock); +} + +/* We do checksum mangling, so if they were wrong before they're still + * wrong. Also works for incomplete packets (eg. ICMP dest + * unreachables.) */ +u_int16_t +ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck) +{ + u_int32_t diffs[] = { oldvalinv, newval }; + return csum_fold(csum_partial((char *)diffs, sizeof(diffs), + oldcheck^0xFFFF)); +} + +static inline int cmp_proto(const struct ip_nat_protocol *i, int proto) +{ + return i->protonum == proto; +} + +struct ip_nat_protocol * +find_nat_proto(u_int16_t protonum) +{ + struct ip_nat_protocol *i; + + MUST_BE_READ_LOCKED(&ip_nat_lock); + i = LIST_FIND(&protos, cmp_proto, struct ip_nat_protocol *, protonum); + if (!i) + i = &unknown_nat_protocol; + return i; +} + +/* Is this tuple already taken? (not by us) */ +int +ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack *ignored_conntrack) +{ + /* Conntrack tracking doesn't keep track of outgoing tuples; only + incoming ones. NAT means they don't have a fixed mapping, + so we invert the tuple and look for the incoming reply. + + We could keep a separate hash if this proves too slow. 
*/ + struct ip_conntrack_tuple reply; + + invert_tuplepr(&reply, tuple); + return ip_conntrack_tuple_taken(&reply, ignored_conntrack); +} + +/* Does tuple + the source manip come within the range mr */ +static int +in_range(const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_manip *manip, + const struct ip_nat_multi_range *mr) +{ + struct ip_nat_protocol *proto = find_nat_proto(tuple->dst.protonum); + unsigned int i; + struct ip_conntrack_tuple newtuple = { *manip, tuple->dst }; + + for (i = 0; i < mr->rangesize; i++) { + /* If we are allowed to map IPs, then we must be in the + range specified, otherwise we must be unchanged. */ + if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) { + if (ntohl(newtuple.src.ip) < ntohl(mr->range[i].min_ip) + || (ntohl(newtuple.src.ip) + > ntohl(mr->range[i].max_ip))) + continue; + } else { + if (newtuple.src.ip != tuple->src.ip) + continue; + } + + if ((mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED) + && proto->in_range(&newtuple, IP_NAT_MANIP_SRC, + &mr->range[i].min, &mr->range[i].max)) + return 1; + } + return 0; +} + +static inline int +src_cmp(const struct ip_nat_hash *i, + const struct ip_conntrack_tuple *tuple, + const struct ip_nat_multi_range *mr) +{ + return (i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum + == tuple->dst.protonum + && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip + == tuple->src.ip + && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all + == tuple->src.u.all + && in_range(tuple, + &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.src, + mr)); +} + +/* Only called for SRC manip */ +static struct ip_conntrack_manip * +find_appropriate_src(const struct ip_conntrack_tuple *tuple, + const struct ip_nat_multi_range *mr) +{ + unsigned int h = hash_by_src(&tuple->src, tuple->dst.protonum); + struct ip_nat_hash *i; + + MUST_BE_READ_LOCKED(&ip_nat_lock); + i = LIST_FIND(&bysource[h], src_cmp, struct ip_nat_hash *, tuple, mr); + if (i) + return 
&i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src; + else + return NULL; +} + +/* If it's really a local destination manip, it may need to do a + source manip too. */ +static int +do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp) +{ + struct rtable *rt; + + /* FIXME: IPTOS_TOS(iph->tos) --RR */ + if (ip_route_output(&rt, var_ip, 0, 0, 0) != 0) { + DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n", + IP_PARTS(var_ip)); + return 0; + } + + *other_ipp = rt->rt_src; + ip_rt_put(rt); + return 1; +} + +/* Simple way to iterate through all. */ +static inline int fake_cmp(const struct ip_nat_hash *i, + u_int32_t src, u_int32_t dst, u_int16_t protonum, + unsigned int *score, + const struct ip_conntrack *conntrack) +{ + /* Compare backwards: we're dealing with OUTGOING tuples, and + inside the conntrack is the REPLY tuple. Don't count this + conntrack. */ + if (i->conntrack != conntrack + && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == dst + && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip == src + && (i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum + == protonum)) + (*score)++; + return 0; +} + +static inline unsigned int +count_maps(u_int32_t src, u_int32_t dst, u_int16_t protonum, + const struct ip_conntrack *conntrack) +{ + unsigned int score = 0; + + MUST_BE_READ_LOCKED(&ip_nat_lock); + LIST_FIND(&byipsproto[hash_by_ipsproto(src, dst, protonum)], + fake_cmp, struct ip_nat_hash *, src, dst, protonum, &score, + conntrack); + + return score; +} + +/* For [FUTURE] fragmentation handling, we want the least-used + src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus + if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports + 1-65535, we don't do pro-rata allocation based on ports; we choose + the ip with the lowest src-ip/dst-ip/proto usage. + + If an allocation then fails (eg. all 6 ports used in the 1.2.3.4 + range), we eliminate that and try again. 
This is not the most + efficient approach, but if you're worried about that, don't hand us + ranges you don't really have. */ +static struct ip_nat_range * +find_best_ips_proto(struct ip_conntrack_tuple *tuple, + const struct ip_nat_multi_range *mr, + const struct ip_conntrack *conntrack, + unsigned int hooknum) +{ + unsigned int i; + struct { + const struct ip_nat_range *range; + unsigned int score; + struct ip_conntrack_tuple tuple; + } best = { NULL, 0xFFFFFFFF }; + u_int32_t *var_ipp, *other_ipp, saved_ip; + + if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) { + var_ipp = &tuple->src.ip; + saved_ip = tuple->dst.ip; + other_ipp = &tuple->dst.ip; + } else { + var_ipp = &tuple->dst.ip; + saved_ip = tuple->src.ip; + other_ipp = &tuple->src.ip; + } + + IP_NF_ASSERT(mr->rangesize >= 1); + for (i = 0; i < mr->rangesize; i++) { + u_int32_t minip, maxip; + + /* Don't do ranges which are already eliminated. */ + if (mr->range[i].flags & IP_NAT_RANGE_FULL) { + continue; + } + + if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) { + minip = mr->range[i].min_ip; + maxip = mr->range[i].max_ip; + } else + minip = maxip = *var_ipp; + + for (*var_ipp = minip; + ntohl(*var_ipp) <= ntohl(maxip); + *var_ipp = htonl(ntohl(*var_ipp) + 1)) { + unsigned int score; + + /* Reset the other ip in case it was mangled by + * do_extra_mangle last time. */ + *other_ipp = saved_ip; + + if (hooknum == NF_IP_LOCAL_OUT + && !do_extra_mangle(*var_ipp, other_ipp)) { + DEBUGP("Range %u %u.%u.%u.%u rt failed!\n", + i, IP_PARTS(*var_ipp)); + /* Can't route? This whole range part is + * probably screwed, but keep trying + * anyway. */ + continue; + } + + /* Count how many others map onto this. */ + score = count_maps(tuple->src.ip, tuple->dst.ip, + tuple->dst.protonum, conntrack); + if (score < best.score) { + /* Optimization: doesn't get any better than + this. 
*/ + if (score == 0) + return (struct ip_nat_range *) + &mr->range[i]; + + best.score = score; + best.tuple = *tuple; + best.range = &mr->range[i]; + } + } + } + *tuple = best.tuple; + + /* Discard const. */ + return (struct ip_nat_range *)best.range; +} + +static int +get_unique_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_tuple *orig_tuple, + const struct ip_nat_multi_range *mrr, + struct ip_conntrack *conntrack, + unsigned int hooknum) +{ + struct ip_nat_protocol *proto + = find_nat_proto(orig_tuple->dst.protonum); + struct ip_nat_range *rptr; + unsigned int i; + int ret; + + /* We temporarily use flags for marking full parts, but we + always clean up afterwards */ + struct ip_nat_multi_range *mr = (void *)mrr; + + /* 1) If this srcip/proto/src-proto-part is currently mapped, + and that same mapping gives a unique tuple within the given + range, use that. + + This is only required for source (ie. NAT/masq) mappings. + So far, we don't do local source mappings, so multiple + manips not an issue. */ + if (hooknum == NF_IP_POST_ROUTING) { + struct ip_conntrack_manip *manip; + + manip = find_appropriate_src(orig_tuple, mr); + if (manip) { + /* Apply same source manipulation. */ + *tuple = ((struct ip_conntrack_tuple) + { *manip, orig_tuple->dst }); + DEBUGP("get_unique_tuple: Found current src map\n"); + return 1; + } + } + + /* 2) Select the least-used IP/proto combination in the given + range. + */ + *tuple = *orig_tuple; + while ((rptr = find_best_ips_proto(tuple, mr, conntrack, hooknum)) + != NULL) { + DEBUGP("Found best for "); DUMP_TUPLE(tuple); + /* 3) The per-protocol part of the manip is made to + map into the range to make a unique tuple. 
*/ + + /* Only bother mapping if it's not already in range + and unique */ + if ((!(rptr->flags & IP_NAT_RANGE_PROTO_SPECIFIED) + || proto->in_range(tuple, HOOK2MANIP(hooknum), + &rptr->min, &rptr->max)) + && !ip_nat_used_tuple(tuple, conntrack)) { + ret = 1; + goto clear_fulls; + } else { + if (proto->unique_tuple(tuple, rptr, + HOOK2MANIP(hooknum), + conntrack)) { + /* Must be unique. */ + IP_NF_ASSERT(!ip_nat_used_tuple(tuple, + conntrack)); + ret = 1; + goto clear_fulls; + } + DEBUGP("Protocol can't get unique tuple.\n"); + } + + /* Eliminate that from range, and try again. */ + rptr->flags |= IP_NAT_RANGE_FULL; + *tuple = *orig_tuple; + } + + ret = 0; + + clear_fulls: + /* Clear full flags. */ + IP_NF_ASSERT(mr->rangesize >= 1); + for (i = 0; i < mr->rangesize; i++) + mr->range[i].flags &= ~IP_NAT_RANGE_FULL; + + return ret; +} + +static inline int +helper_cmp(const struct ip_nat_helper *helper, + u_int16_t protocol, + u_int16_t protocol_dst) +{ + return (protocol == helper->protocol + && protocol_dst == helper->protocol_dst); +} + +/* Where to manip the reply packets (will be reverse manip). */ +static unsigned int opposite_hook[NF_IP_NUMHOOKS] += { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING, + [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING, + [NF_IP_LOCAL_OUT] = NF_IP_PRE_ROUTING +}; + +unsigned int +ip_nat_setup_info(struct ip_conntrack *conntrack, + const struct ip_nat_multi_range *mr, + unsigned int hooknum) +{ + struct ip_conntrack_tuple new_tuple, inv_tuple, reply; + struct ip_conntrack_tuple orig_tp; + struct ip_nat_info *info = &conntrack->nat.info; + + MUST_BE_WRITE_LOCKED(&ip_nat_lock); + IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING + || hooknum == NF_IP_POST_ROUTING + || hooknum == NF_IP_LOCAL_OUT); + IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS); + + /* What we've got will look like inverse of reply. 
Normally + this is what is in the conntrack, except for prior + manipulations (future optimization: if num_manips == 0, + orig_tp = + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */ + invert_tuplepr(&orig_tp, + &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple); + +#if 0 + { + unsigned int i; + + DEBUGP("Hook %u (%s), ", hooknum, + HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST"); + DUMP_TUPLE(&orig_tp); + DEBUGP("Range %p: ", mr); + for (i = 0; i < mr->rangesize; i++) { + DEBUGP("%u:%s%s%s %u.%u.%u.%u - %u.%u.%u.%u %u - %u\n", + i, + (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) + ? " MAP_IPS" : "", + (mr->range[i].flags + & IP_NAT_RANGE_PROTO_SPECIFIED) + ? " PROTO_SPECIFIED" : "", + (mr->range[i].flags & IP_NAT_RANGE_FULL) + ? " FULL" : "", + IP_PARTS(mr->range[i].min_ip), + IP_PARTS(mr->range[i].max_ip), + mr->range[i].min.all, + mr->range[i].max.all); + } + } +#endif + + do { + if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack, + hooknum)) { + DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n", + conntrack); + return NF_DROP; + } + +#if 0 + DEBUGP("Hook %u (%s) %p\n", hooknum, + HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST", + conntrack); + DEBUGP("Original: "); + DUMP_TUPLE(&orig_tp); + DEBUGP("New: "); + DUMP_TUPLE(&new_tuple); +#endif + + /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT): + the original (A/B/C/D') and the mangled one (E/F/G/H'). + + We're only allowed to work with the SRC per-proto + part, so we create inverses of both to start, then + derive the other fields we need. */ + + /* Reply connection: simply invert the new tuple + (G/H/E/F') */ + invert_tuplepr(&reply, &new_tuple); + + /* Alter conntrack table so it recognizes replies. + If fail this race (reply tuple now used), repeat. 
*/ + } while (!ip_conntrack_alter_reply(conntrack, &reply)); + + /* FIXME: We can simply used existing conntrack reply tuple + here --RR */ + /* Create inverse of original: C/D/A/B' */ + invert_tuplepr(&inv_tuple, &orig_tp); + + /* Has source changed?. */ + if (memcmp(&new_tuple.src, &orig_tp.src, sizeof(new_tuple.src)) + != 0) { + /* In this direction, a source manip. */ + info->manips[info->num_manips++] = + ((struct ip_nat_info_manip) + { IP_CT_DIR_ORIGINAL, hooknum, + IP_NAT_MANIP_SRC, new_tuple.src }); + + IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS); + + /* In the reverse direction, a destination manip. */ + info->manips[info->num_manips++] = + ((struct ip_nat_info_manip) + { IP_CT_DIR_REPLY, opposite_hook[hooknum], + IP_NAT_MANIP_DST, orig_tp.src }); + IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS); + } + + /* Has destination changed? */ + if (memcmp(&new_tuple.dst, &orig_tp.dst, sizeof(new_tuple.dst)) + != 0) { + /* In this direction, a destination manip */ + info->manips[info->num_manips++] = + ((struct ip_nat_info_manip) + { IP_CT_DIR_ORIGINAL, hooknum, + IP_NAT_MANIP_DST, reply.src }); + + IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS); + + /* In the reverse direction, a source manip. */ + info->manips[info->num_manips++] = + ((struct ip_nat_info_manip) + { IP_CT_DIR_REPLY, opposite_hook[hooknum], + IP_NAT_MANIP_SRC, inv_tuple.src }); + IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS); + } + + /* If there's a helper, assign it; based on new tuple. */ + info->helper = LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *, + new_tuple.dst.protonum, + new_tuple.dst.u.all); + + /* It's done. */ + info->initialized |= (1 << HOOK2MANIP(hooknum)); + return NF_ACCEPT; +} + +void replace_in_hashes(struct ip_conntrack *conntrack, + struct ip_nat_info *info) +{ + /* Source has changed, so replace in hashes. 
*/ + unsigned int srchash + = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.src, + conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.protonum); + /* We place packet as seen OUTGOUNG in byips_proto hash + (ie. reverse dst and src of reply packet. */ + unsigned int ipsprotohash + = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY] + .tuple.dst.ip, + conntrack->tuplehash[IP_CT_DIR_REPLY] + .tuple.src.ip, + conntrack->tuplehash[IP_CT_DIR_REPLY] + .tuple.dst.protonum); + + IP_NF_ASSERT(info->bysource.conntrack == conntrack); + MUST_BE_WRITE_LOCKED(&ip_nat_lock); + + list_del(&info->bysource.list); + list_del(&info->byipsproto.list); + + list_prepend(&bysource[srchash], &info->bysource); + list_prepend(&byipsproto[ipsprotohash], &info->byipsproto); +} + +void place_in_hashes(struct ip_conntrack *conntrack, + struct ip_nat_info *info) +{ + unsigned int srchash + = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.src, + conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.protonum); + /* We place packet as seen OUTGOUNG in byips_proto hash + (ie. reverse dst and src of reply packet. 
*/ + unsigned int ipsprotohash + = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY] + .tuple.dst.ip, + conntrack->tuplehash[IP_CT_DIR_REPLY] + .tuple.src.ip, + conntrack->tuplehash[IP_CT_DIR_REPLY] + .tuple.dst.protonum); + + IP_NF_ASSERT(!info->bysource.conntrack); + + MUST_BE_WRITE_LOCKED(&ip_nat_lock); + info->byipsproto.conntrack = conntrack; + info->bysource.conntrack = conntrack; + + list_prepend(&bysource[srchash], &info->bysource); + list_prepend(&byipsproto[ipsprotohash], &info->byipsproto); +} + +static void +manip_pkt(u_int16_t proto, struct iphdr *iph, size_t len, + const struct ip_conntrack_manip *manip, + enum ip_nat_manip_type maniptype) +{ + find_nat_proto(proto)->manip_pkt(iph, len, manip, maniptype); + + if (maniptype == IP_NAT_MANIP_SRC) { + iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip, + iph->check); + iph->saddr = manip->ip; + } else { + iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip, + iph->check); + iph->daddr = manip->ip; + } +#if 0 + if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) + DEBUGP("IP: checksum on packet bad.\n"); + + if (proto == IPPROTO_TCP) { + void *th = (u_int32_t *)iph + iph->ihl; + if (tcp_v4_check(th, len - 4*iph->ihl, iph->saddr, iph->daddr, + csum_partial((char *)th, len-4*iph->ihl, 0))) + DEBUGP("TCP: checksum on packet bad\n"); + } +#endif +} + +/* Do packet manipulations according to binding. */ +unsigned int +do_bindings(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + struct ip_nat_info *info, + unsigned int hooknum, + struct sk_buff **pskb) +{ + unsigned int i; + struct ip_nat_helper *helper; + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + + /* Need nat lock to protect against modification, but neither + conntrack (referenced) and helper (deleted with + synchronize_bh()) can vanish. 
*/ + READ_LOCK(&ip_nat_lock); + for (i = 0; i < info->num_manips; i++) { + if (info->manips[i].direction == dir + && info->manips[i].hooknum == hooknum) { + DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n", + *pskb, + info->manips[i].maniptype == IP_NAT_MANIP_SRC + ? "SRC" : "DST", + IP_PARTS(info->manips[i].manip.ip), + htons(info->manips[i].manip.u.all)); + manip_pkt((*pskb)->nh.iph->protocol, + (*pskb)->nh.iph, + (*pskb)->len, + &info->manips[i].manip, + info->manips[i].maniptype); + } + } + helper = info->helper; + READ_UNLOCK(&ip_nat_lock); + + if (helper) { + /* Always defragged for helpers */ + IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off + & __constant_htons(IP_MF|IP_OFFSET))); + return helper->help(ct, info, ctinfo, hooknum, pskb); + } else return NF_ACCEPT; +} + +void +icmp_reply_translation(struct sk_buff *skb, + struct ip_conntrack *conntrack, + unsigned int hooknum, + int dir) +{ + struct iphdr *iph = skb->nh.iph; + struct icmphdr *hdr = (struct icmphdr *)((u_int32_t *)iph + iph->ihl); + struct iphdr *inner = (struct iphdr *)(hdr + 1); + size_t datalen = skb->len - ((void *)inner - (void *)iph); + unsigned int i; + struct ip_nat_info *info = &conntrack->nat.info; + + IP_NF_ASSERT(skb->len >= iph->ihl*4 + sizeof(struct icmphdr)); + + DEBUGP("icmp_reply_translation: translating error %p hook %u dir %s\n", + skb, hooknum, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY"); + /* Note: May not be from a NAT'd host, but probably safest to + do translation always as if it came from the host itself + (even though a "host unreachable" coming from the host + itself is a bit wierd). + + More explanation: some people use NAT for anonomizing. 
+ Also, CERT recommends dropping all packets from private IP + addresses (although ICMP errors from internal links with + such addresses are not too uncommon, as Alan Cox points + out) */ + + READ_LOCK(&ip_nat_lock); + for (i = 0; i < info->num_manips; i++) { + DEBUGP("icmp_reply: manip %u dir %s hook %u\n", + i, info->manips[i].direction == IP_CT_DIR_ORIGINAL ? + "ORIG" : "REPLY", info->manips[i].hooknum); + /* Mapping the inner packet is just like a normal + packet in the other direction, except it was never + src/dst reversed, so where we would normally apply + a dst manip, we reply a src, and vice versa. */ + if (info->manips[i].direction != dir + && info->manips[i].hooknum == opposite_hook[hooknum]) { + DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n", + info->manips[i].maniptype == IP_NAT_MANIP_SRC + ? "DST" : "SRC", + IP_PARTS(info->manips[i].manip.ip), + ntohs(info->manips[i].manip.u.udp.port)); + manip_pkt(inner->protocol, inner, + skb->len - ((void *)inner - (void *)iph), + &info->manips[i].manip, + !info->manips[i].maniptype); + } + /* Outer packet needs to have IP header NATed like + it's a reply. */ + else if (info->manips[i].direction != dir + && info->manips[i].hooknum == hooknum) { + /* Use mapping to map outer packet: 0 give no + per-proto mapping */ + DEBUGP("icmp_reply: outer %s %u.%u.%u.%u\n", + info->manips[i].maniptype == IP_NAT_MANIP_SRC + ? "SRC" : "DST", + IP_PARTS(info->manips[i].manip.ip)); + manip_pkt(0, iph, skb->len, + &info->manips[i].manip, + info->manips[i].maniptype); + } + } + READ_UNLOCK(&ip_nat_lock); + + /* Since we mangled inside ICMP packet, recalculate its + checksum from scratch. (Hence the handling of incorrect + checksums in conntrack, so we don't accidentally fix one.) 
*/ + hdr->checksum = 0; + hdr->checksum = ip_compute_csum((unsigned char *)hdr, + sizeof(*hdr) + datalen); +} + +int ip_nat_helper_register(struct ip_nat_helper *me) +{ + int ret = 0; + + WRITE_LOCK(&ip_nat_lock); + if (LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *, + me->protocol, me->protocol_dst)) + ret = -EBUSY; + else { + list_prepend(&helpers, me); + MOD_INC_USE_COUNT; + } + WRITE_UNLOCK(&ip_nat_lock); + + return ret; +} + +static int +kill_helper(const struct ip_conntrack *i, void *helper) +{ + int ret; + + READ_LOCK(&ip_nat_lock); + ret = (i->nat.info.helper == helper); + READ_UNLOCK(&ip_nat_lock); + + return ret; +} + +void ip_nat_helper_unregister(struct ip_nat_helper *me) +{ + WRITE_LOCK(&ip_nat_lock); + LIST_DELETE(&helpers, me); + WRITE_UNLOCK(&ip_nat_lock); + + /* Someone could be still looking at the helper in a bh. */ + br_write_lock_bh(BR_NETPROTO_LOCK); + br_write_unlock_bh(BR_NETPROTO_LOCK); + + /* Find anything using it, and umm, kill them. We can't turn + them into normal connections: if we've adjusted SYNs, then + they'll ackstorm. So we just drop it. We used to just + bump module count when a connection existed, but that + forces admins to gen fake RSTs or bounce box, either of + which is just a long-winded way of making things + worse. --RR */ + ip_ct_selective_cleanup(kill_helper, me); + + MOD_DEC_USE_COUNT; +} + +int __init ip_nat_init(void) +{ + size_t i; + + /* Sew in builtin protocols. */ + WRITE_LOCK(&ip_nat_lock); + list_append(&protos, &ip_nat_protocol_tcp); + list_append(&protos, &ip_nat_protocol_udp); + list_append(&protos, &ip_nat_protocol_icmp); + WRITE_UNLOCK(&ip_nat_lock); + + for (i = 0; i < IP_NAT_HTABLE_SIZE; i++) { + INIT_LIST_HEAD(&bysource[i]); + INIT_LIST_HEAD(&byipsproto[i]); + } + + /* FIXME: Man, this is a hack. 
<SIGH> */ + IP_NF_ASSERT(ip_conntrack_destroyed == NULL); + ip_conntrack_destroyed = &ip_nat_cleanup_conntrack; + + return 0; +} + +void ip_nat_cleanup(void) +{ + ip_conntrack_destroyed = NULL; +} diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c new file mode 100644 index 000000000..8252e6d9b --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_ftp.c @@ -0,0 +1,403 @@ +/* FTP extension for TCP NAT alteration. */ +#include <linux/module.h> +#include <linux/netfilter_ipv4.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <net/tcp.h> +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_helper.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_nat_ftp.h> +#include <linux/netfilter_ipv4/ip_conntrack_ftp.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> + +EXPORT_NO_SYMBOLS; + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* FIXME: Time out? --RR */ + +static int +ftp_nat_expected(struct sk_buff **pskb, + unsigned int hooknum, + struct ip_conntrack *ct, + struct ip_nat_info *info, + struct ip_conntrack *master, + struct ip_nat_info *masterinfo, + unsigned int *verdict) +{ + struct ip_nat_multi_range mr; + u_int32_t newdstip, newsrcip, newip; + struct ip_ct_ftp *ftpinfo; + + IP_NF_ASSERT(info); + IP_NF_ASSERT(master); + IP_NF_ASSERT(masterinfo); + + IP_NF_ASSERT(!(info->initialized & (1<<HOOK2MANIP(hooknum)))); + + DEBUGP("nat_expected: We have a connection!\n"); + /* Master must be an ftp connection */ + ftpinfo = &master->help.ct_ftp_info; + + LOCK_BH(&ip_ftp_lock); + if (!ftpinfo->is_ftp) { + UNLOCK_BH(&ip_ftp_lock); + DEBUGP("nat_expected: master not ftp\n"); + return 0; + } + + if (ftpinfo->ftptype == IP_CT_FTP_PORT) { + /* PORT command: make connection go to the client. 
*/ + newdstip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip; + newsrcip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip; + DEBUGP("nat_expected: PORT cmd. %u.%u.%u.%u->%u.%u.%u.%u\n", + IP_PARTS(newsrcip), IP_PARTS(newdstip)); + } else { + /* PASV command: make the connection go to the server */ + newdstip = master->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip; + newsrcip = master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip; + DEBUGP("nat_expected: PASV cmd. %u.%u.%u.%u->%u.%u.%u.%u\n", + IP_PARTS(newsrcip), IP_PARTS(newdstip)); + } + UNLOCK_BH(&ip_ftp_lock); + + if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) + newip = newsrcip; + else + newip = newdstip; + + DEBUGP("nat_expected: IP to %u.%u.%u.%u\n", IP_PARTS(newip)); + + mr.rangesize = 1; + /* We don't want to manip the per-protocol, just the IPs. */ + mr.range[0].flags = IP_NAT_RANGE_MAP_IPS; + mr.range[0].min_ip = mr.range[0].max_ip = newip; + + *verdict = ip_nat_setup_info(ct, &mr, hooknum); + + return 1; +} + +/* This is interesting. We simply use the port given us by the client + or server. In practice it's extremely unlikely to clash; if it + does, the rule won't be able to get a unique tuple and will drop + the packets. */ +static int +mangle_packet(struct sk_buff **pskb, + u_int32_t newip, + u_int16_t port, + unsigned int matchoff, + unsigned int matchlen, + struct ip_nat_ftp_info *this_way, + struct ip_nat_ftp_info *other_way) +{ + struct iphdr *iph = (*pskb)->nh.iph; + struct tcphdr *tcph; + unsigned char *data; + unsigned int tcplen, newlen, newtcplen; + char buffer[sizeof("nnn,nnn,nnn,nnn,nnn,nnn")]; + + MUST_BE_LOCKED(&ip_ftp_lock); + sprintf(buffer, "%u,%u,%u,%u,%u,%u", + IP_PARTS(newip), port>>8, port&0xFF); + + tcplen = (*pskb)->len - iph->ihl * 4; + newtcplen = tcplen - matchlen + strlen(buffer); + newlen = iph->ihl*4 + newtcplen; + + /* So there I am, in the middle of my `netfilter-is-wonderful' + talk in Sydney, and someone asks `What happens if you try + to enlarge a 64k packet here?'. 
I think I said something + eloquent like `fuck'. */ + if (newlen > 65535) { + if (net_ratelimit()) + printk("nat_ftp cheat: %u.%u.%u.%u->%u.%u.%u.%u %u\n", + NIPQUAD((*pskb)->nh.iph->saddr), + NIPQUAD((*pskb)->nh.iph->daddr), + (*pskb)->nh.iph->protocol); + return NF_DROP; + } + + if (newlen > (*pskb)->len + skb_tailroom(*pskb)) { + struct sk_buff *newskb; + newskb = skb_copy_expand(*pskb, skb_headroom(*pskb), newlen, + GFP_ATOMIC); + if (!newskb) { + DEBUGP("ftp: oom\n"); + return 0; + } else { + kfree_skb(*pskb); + *pskb = newskb; + iph = (*pskb)->nh.iph; + } + } + + tcph = (void *)iph + iph->ihl*4; + data = (void *)tcph + tcph->doff*4; + + DEBUGP("Mapping `%.*s' [%u %u %u] to new `%s' [%u]\n", + (int)matchlen, data+matchoff, + data[matchoff], data[matchoff+1], + matchlen, buffer, strlen(buffer)); + + /* SYN adjust. If it's uninitialized, or this is after last + correction, record it: we don't handle more than one + adjustment in the window, but do deal with common case of a + retransmit. */ + if (this_way->syn_offset_before == this_way->syn_offset_after + || before(this_way->syn_correction_pos, ntohl(tcph->seq))) { + this_way->syn_correction_pos = ntohl(tcph->seq); + this_way->syn_offset_before = this_way->syn_offset_after; + this_way->syn_offset_after = (int32_t) + this_way->syn_offset_before + newlen - (*pskb)->len; + } + + /* Move post-replacement */ + memmove(data + matchoff + strlen(buffer), + data + matchoff + matchlen, + (*pskb)->tail - (data + matchoff + matchlen)); + memcpy(data + matchoff, buffer, strlen(buffer)); + + /* Resize packet. 
*/ + if (newlen > (*pskb)->len) { + DEBUGP("ip_nat_ftp: Extending packet by %u to %u bytes\n", + newlen - (*pskb)->len, newlen); + skb_put(*pskb, newlen - (*pskb)->len); + } else { + DEBUGP("ip_nat_ftp: Shrinking packet from %u to %u bytes\n", + (*pskb)->len, newlen); + skb_trim(*pskb, newlen); + } + + /* Fix checksums */ + iph->tot_len = htons(newlen); + (*pskb)->csum = csum_partial((char *)tcph + tcph->doff*4, + newtcplen - tcph->doff*4, 0); + tcph->check = 0; + tcph->check = tcp_v4_check(tcph, newtcplen, iph->saddr, iph->daddr, + csum_partial((char *)tcph, tcph->doff*4, + (*pskb)->csum)); + ip_send_check(iph); + return 1; +} + +/* Grrr... SACK. Fuck me even harder. Don't want to fix it on the + fly, so blow it away. */ +static void +delete_sack(struct sk_buff *skb, struct tcphdr *tcph) +{ + unsigned int i; + u_int8_t *opt = (u_int8_t *)tcph; + + DEBUGP("Seeking SACKPERM in SYN packet (doff = %u).\n", + tcph->doff * 4); + for (i = sizeof(struct tcphdr); i < tcph->doff * 4;) { + DEBUGP("%u ", opt[i]); + switch (opt[i]) { + case TCPOPT_NOP: + case TCPOPT_EOL: + i++; + break; + + case TCPOPT_SACK_PERM: + goto found_opt; + + default: + /* Worst that can happen: it will take us over. */ + i += opt[i+1] ?: 1; + } + } + DEBUGP("\n"); + return; + + found_opt: + DEBUGP("\n"); + DEBUGP("Found SACKPERM at offset %u.\n", i); + + /* Must be within TCP header, and valid SACK perm. */ + if (i + opt[i+1] <= tcph->doff*4 && opt[i+1] == 2) { + /* Replace with NOPs. 
*/ + tcph->check + = ip_nat_cheat_check(*((u_int16_t *)(opt + i))^0xFFFF, + 0, tcph->check); + opt[i] = opt[i+1] = 0; + } + else DEBUGP("Something wrong with SACK_PERM.\n"); +} + +static int ftp_data_fixup(const struct ip_ct_ftp *ct_ftp_info, + struct ip_conntrack *ct, + struct ip_nat_ftp_info *ftp, + unsigned int datalen, + struct sk_buff **pskb) +{ + u_int32_t newip; + struct ip_conntrack_tuple t; + struct iphdr *iph = (*pskb)->nh.iph; + struct tcphdr *tcph = (void *)iph + iph->ihl*4; + + MUST_BE_LOCKED(&ip_ftp_lock); + DEBUGP("FTP_NAT: seq %u + %u in %u + %u\n", + ct_ftp_info->seq, ct_ftp_info->len, + ntohl(tcph->seq), datalen); + + /* Change address inside packet to match way we're mapping + this connection. */ + if (ct_ftp_info->ftptype == IP_CT_FTP_PASV) { + /* PASV response: must be where client thinks server + is */ + newip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip; + } else { + /* PORT command: must be where server thinks client is */ + newip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip; + } + + if (!mangle_packet(pskb, newip, ct_ftp_info->port, + ct_ftp_info->seq - ntohl(tcph->seq), + ct_ftp_info->len, + &ftp[ct_ftp_info->ftptype], + &ftp[!ct_ftp_info->ftptype])) + return 0; + + /* Alter conntrack's expectations. */ + + /* We can read expect here without conntrack lock, since it's + only set in ip_conntrack_ftp, with ip_ftp_lock held + writable */ + t = ct->expected.tuple; + t.dst.ip = newip; + ip_conntrack_expect_related(ct, &t); + + return 1; +} + +static unsigned int help(struct ip_conntrack *ct, + struct ip_nat_info *info, + enum ip_conntrack_info ctinfo, + unsigned int hooknum, + struct sk_buff **pskb) +{ + struct iphdr *iph = (*pskb)->nh.iph; + struct tcphdr *tcph = (void *)iph + iph->ihl*4; + u_int32_t newseq, newack; + unsigned int datalen; + int dir; + int score; + struct ip_ct_ftp *ct_ftp_info + = &ct->help.ct_ftp_info; + struct ip_nat_ftp_info *ftp + = &ct->nat.help.ftp_info[0]; + + /* Delete SACK_OK on initial TCP SYNs. 
*/ + if (tcph->syn && !tcph->ack) + delete_sack(*pskb, tcph); + + /* Only mangle things once: original direction in POST_ROUTING + and reply direction on PRE_ROUTING. */ + dir = CTINFO2DIR(ctinfo); + if (!((hooknum == NF_IP_POST_ROUTING && dir == IP_CT_DIR_ORIGINAL) + || (hooknum == NF_IP_PRE_ROUTING && dir == IP_CT_DIR_REPLY))) { + DEBUGP("nat_ftp: Not touching dir %s at hook %s\n", + dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY", + hooknum == NF_IP_POST_ROUTING ? "POSTROUTING" + : hooknum == NF_IP_PRE_ROUTING ? "PREROUTING" + : hooknum == NF_IP_LOCAL_OUT ? "OUTPUT" : "???"); + return NF_ACCEPT; + } + + datalen = (*pskb)->len - iph->ihl * 4 - tcph->doff * 4; + score = 0; + LOCK_BH(&ip_ftp_lock); + if (ct_ftp_info->len) { + /* If it's in the right range... */ + score += between(ct_ftp_info->seq, ntohl(tcph->seq), + ntohl(tcph->seq) + datalen); + score += between(ct_ftp_info->seq + ct_ftp_info->len, + ntohl(tcph->seq), + ntohl(tcph->seq) + datalen); + if (score == 1) { + /* Half a match? This means a partial retransmisison. + It's a cracker being funky. 
*/ + if (net_ratelimit()) { + printk("FTP_NAT: partial packet %u/%u in %u/%u\n", + ct_ftp_info->seq, ct_ftp_info->len, + ntohl(tcph->seq), + ntohl(tcph->seq) + datalen); + } + UNLOCK_BH(&ip_ftp_lock); + return NF_DROP; + } else if (score == 2) { + if (!ftp_data_fixup(ct_ftp_info, ct, ftp, datalen, + pskb)) { + UNLOCK_BH(&ip_ftp_lock); + return NF_DROP; + } + + /* skb may have been reallocated */ + iph = (*pskb)->nh.iph; + tcph = (void *)iph + iph->ihl*4; + } + } + + /* Sequence adjust */ + if (after(ntohl(tcph->seq), ftp[dir].syn_correction_pos)) + newseq = ntohl(tcph->seq) + ftp[dir].syn_offset_after; + else + newseq = ntohl(tcph->seq) + ftp[dir].syn_offset_before; + newseq = htonl(newseq); + + /* Ack adjust */ + if (after(ntohl(tcph->ack_seq), ftp[!dir].syn_correction_pos)) + newack = ntohl(tcph->ack_seq) - ftp[!dir].syn_offset_after; + else + newack = ntohl(tcph->ack_seq) - ftp[!dir].syn_offset_before; + newack = htonl(newack); + UNLOCK_BH(&ip_ftp_lock); + + tcph->check = ip_nat_cheat_check(~tcph->seq, newseq, + ip_nat_cheat_check(~tcph->ack_seq, + newack, + tcph->check)); + tcph->seq = newseq; + tcph->ack_seq = newack; + + return NF_ACCEPT; +} + +static struct ip_nat_helper ftp += { { NULL, NULL }, IPPROTO_TCP, __constant_htons(21), help, "ftp" }; +static struct ip_nat_expect ftp_expect += { { NULL, NULL }, ftp_nat_expected }; + +extern struct module *ip_conntrack_ftp; + +static int __init init(void) +{ + int ret; + + ret = ip_nat_expect_register(&ftp_expect); + if (ret == 0) { + ret = ip_nat_helper_register(&ftp); + + if (ret == 0) + __MOD_INC_USE_COUNT(ip_conntrack_ftp); + else + ip_nat_expect_unregister(&ftp_expect); + } + return ret; +} + +static void __exit fini(void) +{ + __MOD_DEC_USE_COUNT(ip_conntrack_ftp); + ip_nat_helper_unregister(&ftp); + ip_nat_expect_unregister(&ftp_expect); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c new file mode 100644 index 
000000000..9bc7427ce --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c @@ -0,0 +1,97 @@ +#include <linux/types.h> +#include <linux/init.h> +#include <linux/netfilter.h> +#include <linux/ip.h> +#include <linux/icmp.h> +#include <linux/if.h> + +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_nat_protocol.h> + +static int +icmp_in_range(const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype, + const union ip_conntrack_manip_proto *min, + const union ip_conntrack_manip_proto *max) +{ + return (tuple->src.u.icmp.id >= min->icmp.id + && tuple->src.u.icmp.id <= max->icmp.id); +} + +static int +icmp_unique_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_nat_range *range, + enum ip_nat_manip_type maniptype, + const struct ip_conntrack *conntrack) +{ + static u_int16_t id = 0; + unsigned int range_size + = (unsigned int)range->max.icmp.id - range->min.icmp.id + 1; + unsigned int i; + + /* If no range specified... 
*/ + if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) + range_size = 0xFFFF; + + for (i = 0; i < range_size; i++, id++) { + tuple->src.u.icmp.id = range->min.icmp.id + (id % range_size); + if (!ip_nat_used_tuple(tuple, conntrack)) + return 1; + } + return 0; +} + +static void +icmp_manip_pkt(struct iphdr *iph, size_t len, + const struct ip_conntrack_manip *manip, + enum ip_nat_manip_type maniptype) +{ + struct icmphdr *hdr = (struct icmphdr *)((u_int32_t *)iph + iph->ihl); + + hdr->checksum = ip_nat_cheat_check(hdr->un.echo.id ^ 0xFFFF, + manip->u.icmp.id, + hdr->checksum); + hdr->un.echo.id = manip->u.icmp.id; +} + +static unsigned int +icmp_print(char *buffer, + const struct ip_conntrack_tuple *match, + const struct ip_conntrack_tuple *mask) +{ + unsigned int len = 0; + + if (mask->src.u.icmp.id) + len += sprintf(buffer + len, "id=%u ", + ntohs(match->src.u.icmp.id)); + + if (mask->dst.u.icmp.type) + len += sprintf(buffer + len, "type=%u ", + ntohs(match->dst.u.icmp.type)); + + if (mask->dst.u.icmp.code) + len += sprintf(buffer + len, "code=%u ", + ntohs(match->dst.u.icmp.code)); + + return len; +} + +static unsigned int +icmp_print_range(char *buffer, const struct ip_nat_range *range) +{ + if (range->min.icmp.id != 0 || range->max.icmp.id != 0xFFFF) + return sprintf(buffer, "id %u-%u ", + ntohs(range->min.icmp.id), + ntohs(range->max.icmp.id)); + else return 0; +} + +struct ip_nat_protocol ip_nat_protocol_icmp += { { NULL, NULL }, "ICMP", IPPROTO_ICMP, + icmp_manip_pkt, + icmp_in_range, + icmp_unique_tuple, + icmp_print, + icmp_print_range +}; diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c new file mode 100644 index 000000000..7ff6ccb50 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c @@ -0,0 +1,143 @@ +#include <linux/types.h> +#include <linux/init.h> +#include <linux/netfilter.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/if.h> + +#include <linux/netfilter_ipv4/ip_nat.h> +#include 
<linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_nat_protocol.h> + +static int +tcp_in_range(const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype, + const union ip_conntrack_manip_proto *min, + const union ip_conntrack_manip_proto *max) +{ + u_int16_t port; + + if (maniptype == IP_NAT_MANIP_SRC) + port = tuple->src.u.tcp.port; + else + port = tuple->dst.u.tcp.port; + + return ntohs(port) >= ntohs(min->tcp.port) + && ntohs(port) <= ntohs(max->tcp.port); +} + +static int +tcp_unique_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_nat_range *range, + enum ip_nat_manip_type maniptype, + const struct ip_conntrack *conntrack) +{ + static u_int16_t port = 0, *portptr; + unsigned int range_size, min, i; + + if (maniptype == IP_NAT_MANIP_SRC) + portptr = &tuple->src.u.tcp.port; + else + portptr = &tuple->dst.u.tcp.port; + + /* If no range specified... */ + if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { + /* If it's dst rewrite, can't change port */ + if (maniptype == IP_NAT_MANIP_DST) + return 0; + + /* Map privileged onto privileged. 
*/ + if (ntohs(*portptr) < 1024) { + /* Loose convention: >> 512 is credential passing */ + if (ntohs(*portptr)<512) { + min = 1; + range_size = 511 - min + 1; + } else { + min = 600; + range_size = 1023 - min + 1; + } + } else { + min = 1024; + range_size = 65535 - 1024 + 1; + } + } else { + min = ntohs(range->min.tcp.port); + range_size = ntohs(range->max.tcp.port) - min + 1; + } + + for (i = 0; i < range_size; i++, port++) { + *portptr = htons(min + port % range_size); + if (!ip_nat_used_tuple(tuple, conntrack)) { + return 1; + } + } + return 0; +} + +static void +tcp_manip_pkt(struct iphdr *iph, size_t len, + const struct ip_conntrack_manip *manip, + enum ip_nat_manip_type maniptype) +{ + struct tcphdr *hdr = (struct tcphdr *)((u_int32_t *)iph + iph->ihl); + u_int32_t oldip; + u_int16_t *portptr; + + if (maniptype == IP_NAT_MANIP_SRC) { + /* Get rid of src ip and src pt */ + oldip = iph->saddr; + portptr = &hdr->source; + } else { + /* Get rid of dst ip and dst pt */ + oldip = iph->daddr; + portptr = &hdr->dest; + } + hdr->check = ip_nat_cheat_check(~oldip, manip->ip, + ip_nat_cheat_check(*portptr ^ 0xFFFF, + manip->u.tcp.port, + hdr->check)); + *portptr = manip->u.tcp.port; +} + +static unsigned int +tcp_print(char *buffer, + const struct ip_conntrack_tuple *match, + const struct ip_conntrack_tuple *mask) +{ + unsigned int len = 0; + + if (mask->src.u.tcp.port) + len += sprintf(buffer + len, "srcpt=%u ", + ntohs(match->src.u.tcp.port)); + + + if (mask->dst.u.tcp.port) + len += sprintf(buffer + len, "dstpt=%u ", + ntohs(match->dst.u.tcp.port)); + + return len; +} + +static unsigned int +tcp_print_range(char *buffer, const struct ip_nat_range *range) +{ + if (range->min.tcp.port != 0 || range->max.tcp.port != 0xFFFF) { + if (range->min.tcp.port == range->max.tcp.port) + return sprintf(buffer, "port %u ", + ntohs(range->min.tcp.port)); + else + return sprintf(buffer, "ports %u-%u ", + ntohs(range->min.tcp.port), + ntohs(range->max.tcp.port)); + } + else return 0; 
+} + +struct ip_nat_protocol ip_nat_protocol_tcp += { { NULL, NULL }, "TCP", IPPROTO_TCP, + tcp_manip_pkt, + tcp_in_range, + tcp_unique_tuple, + tcp_print, + tcp_print_range +}; diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c new file mode 100644 index 000000000..e0dc25910 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_proto_udp.c @@ -0,0 +1,141 @@ +#include <linux/types.h> +#include <linux/init.h> +#include <linux/netfilter.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <linux/if.h> + +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_nat_protocol.h> + +static int +udp_in_range(const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype, + const union ip_conntrack_manip_proto *min, + const union ip_conntrack_manip_proto *max) +{ + u_int16_t port; + + if (maniptype == IP_NAT_MANIP_SRC) + port = tuple->src.u.udp.port; + else + port = tuple->dst.u.udp.port; + + return ntohs(port) >= ntohs(min->udp.port) + && ntohs(port) <= ntohs(max->udp.port); +} + +static int +udp_unique_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_nat_range *range, + enum ip_nat_manip_type maniptype, + const struct ip_conntrack *conntrack) +{ + static u_int16_t port = 0, *portptr; + unsigned int range_size, min, i; + + if (maniptype == IP_NAT_MANIP_SRC) + portptr = &tuple->src.u.udp.port; + else + portptr = &tuple->dst.u.udp.port; + + /* If no range specified... 
*/ + if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { + /* If it's dst rewrite, can't change port */ + if (maniptype == IP_NAT_MANIP_DST) + return 0; + + if (ntohs(*portptr) < 1024) { + /* Loose convention: >> 512 is credential passing */ + if (ntohs(*portptr)<512) { + min = 1; + range_size = 511 - min + 1; + } else { + min = 600; + range_size = 1023 - min + 1; + } + } else { + min = 1024; + range_size = 65535 - 1024 + 1; + } + } else { + min = ntohs(range->min.udp.port); + range_size = ntohs(range->max.udp.port) - min + 1; + } + + for (i = 0; i < range_size; i++, port++) { + *portptr = htons(min + port % range_size); + if (!ip_nat_used_tuple(tuple, conntrack)) + return 1; + } + return 0; +} + +static void +udp_manip_pkt(struct iphdr *iph, size_t len, + const struct ip_conntrack_manip *manip, + enum ip_nat_manip_type maniptype) +{ + struct udphdr *hdr = (struct udphdr *)((u_int32_t *)iph + iph->ihl); + u_int32_t oldip; + u_int16_t *portptr; + + if (maniptype == IP_NAT_MANIP_SRC) { + /* Get rid of src ip and src pt */ + oldip = iph->saddr; + portptr = &hdr->source; + } else { + /* Get rid of dst ip and dst pt */ + oldip = iph->daddr; + portptr = &hdr->dest; + } + hdr->check = ip_nat_cheat_check(~oldip, manip->ip, + ip_nat_cheat_check(*portptr ^ 0xFFFF, + manip->u.udp.port, + hdr->check)); + *portptr = manip->u.udp.port; +} + +static unsigned int +udp_print(char *buffer, + const struct ip_conntrack_tuple *match, + const struct ip_conntrack_tuple *mask) +{ + unsigned int len = 0; + + if (mask->src.u.udp.port) + len += sprintf(buffer + len, "srcpt=%u ", + ntohs(match->src.u.udp.port)); + + + if (mask->dst.u.udp.port) + len += sprintf(buffer + len, "dstpt=%u ", + ntohs(match->dst.u.udp.port)); + + return len; +} + +static unsigned int +udp_print_range(char *buffer, const struct ip_nat_range *range) +{ + if (range->min.udp.port != 0 || range->max.udp.port != 0xFFFF) { + if (range->min.udp.port == range->max.udp.port) + return sprintf(buffer, "port %u ", + 
ntohs(range->min.udp.port)); + else + return sprintf(buffer, "ports %u-%u ", + ntohs(range->min.udp.port), + ntohs(range->max.udp.port)); + } + else return 0; +} + +struct ip_nat_protocol ip_nat_protocol_udp += { { NULL, NULL }, "UDP", IPPROTO_UDP, + udp_manip_pkt, + udp_in_range, + udp_unique_tuple, + udp_print, + udp_print_range +}; diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c new file mode 100644 index 000000000..0e3907036 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c @@ -0,0 +1,61 @@ +/* The "unknown" protocol. This is what is used for protocols we + * don't understand. It's returned by find_proto(). + */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/netfilter.h> +#include <linux/if.h> + +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_nat_protocol.h> + +static int unknown_in_range(const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type manip_type, + const union ip_conntrack_manip_proto *min, + const union ip_conntrack_manip_proto *max) +{ + return 1; +} + +static int unknown_unique_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_nat_range *range, + enum ip_nat_manip_type maniptype, + const struct ip_conntrack *conntrack) +{ + /* Sorry: we can't help you; if it's not unique, we can't frob + anything. 
*/ + return 0; +} + +static void +unknown_manip_pkt(struct iphdr *iph, size_t len, + const struct ip_conntrack_manip *manip, + enum ip_nat_manip_type maniptype) +{ + return; +} + +static unsigned int +unknown_print(char *buffer, + const struct ip_conntrack_tuple *match, + const struct ip_conntrack_tuple *mask) +{ + return 0; +} + +static unsigned int +unknown_print_range(char *buffer, const struct ip_nat_range *range) +{ + return 0; +} + +struct ip_nat_protocol unknown_nat_protocol = { + { NULL, NULL }, "unknown", 0, + unknown_manip_pkt, + unknown_in_range, + unknown_unique_tuple, + unknown_print, + unknown_print_range +}; diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c new file mode 100644 index 000000000..74516687b --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_rule.c @@ -0,0 +1,327 @@ +/* Everything about the rules for NAT. */ +#define __NO_VERSION__ +#include <linux/types.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/module.h> +#include <linux/kmod.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <net/checksum.h> +#include <linux/bitops.h> +#include <linux/version.h> + +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_core.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/listhelp.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +#define NAT_VALID_HOOKS ((1<<NF_IP_PRE_ROUTING) | (1<<NF_IP_POST_ROUTING) | (1<<NF_IP_LOCAL_OUT)) + +/* Standard entry. 
*/ +struct ipt_standard +{ + struct ipt_entry entry; + struct ipt_standard_target target; +}; + +struct ipt_error_target +{ + struct ipt_entry_target target; + char errorname[IPT_FUNCTION_MAXNAMELEN]; +}; + +struct ipt_error +{ + struct ipt_entry entry; + struct ipt_error_target target; +}; + +static struct +{ + struct ipt_replace repl; + struct ipt_standard entries[3]; + struct ipt_error term; +} nat_initial_table __initdata += { { "nat", NAT_VALID_HOOKS, 4, + sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), + { [NF_IP_PRE_ROUTING] 0, + [NF_IP_POST_ROUTING] sizeof(struct ipt_standard), + [NF_IP_LOCAL_OUT] sizeof(struct ipt_standard) * 2 }, + { [NF_IP_PRE_ROUTING] 0, + [NF_IP_POST_ROUTING] sizeof(struct ipt_standard), + [NF_IP_LOCAL_OUT] sizeof(struct ipt_standard) * 2 }, + 0, NULL, { } }, + { + /* PRE_ROUTING */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { sizeof(struct ipt_standard_target), { "" }, { } }, + -NF_ACCEPT - 1 } }, + /* POST_ROUTING */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { sizeof(struct ipt_standard_target), { "" }, { } }, + -NF_ACCEPT - 1 } }, + /* LOCAL_OUT */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { sizeof(struct ipt_standard_target), { "" }, { } }, + -NF_ACCEPT - 1 } } + }, + /* ERROR */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_error), + 0, { 0, 0 }, { } }, + { { sizeof(struct ipt_error_target), { IPT_ERROR_TARGET }, + { } }, + "ERROR" + } + } +}; + +static struct ipt_table nat_table += { { NULL, NULL }, "nat", &nat_initial_table.repl, + NAT_VALID_HOOKS, RW_LOCK_UNLOCKED, NULL }; + 
+LIST_HEAD(nat_expect_list); + +/* Source NAT */ +static unsigned int ipt_snat_target(struct sk_buff **pskb, + unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + const void *targinfo, + void *userinfo) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + + IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING); + + ct = ip_conntrack_get(*pskb, &ctinfo); + + /* Connection must be valid and new. */ + IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + IP_NF_ASSERT(out); + + return ip_nat_setup_info(ct, targinfo, hooknum); +} + +static unsigned int ipt_dnat_target(struct sk_buff **pskb, + unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + const void *targinfo, + void *userinfo) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + + IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING + || hooknum == NF_IP_LOCAL_OUT); + + ct = ip_conntrack_get(*pskb, &ctinfo); + + /* Connection must be valid and new. 
*/ + IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + + return ip_nat_setup_info(ct, targinfo, hooknum); +} + +static int ipt_snat_checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ip_nat_multi_range *mr = targinfo; + + /* Must be a valid range */ + if (targinfosize < sizeof(struct ip_nat_multi_range)) { + DEBUGP("SNAT: Target size %u too small\n", targinfosize); + return 0; + } + + if (targinfosize != IPT_ALIGN((sizeof(struct ip_nat_multi_range) + + (sizeof(struct ip_nat_range) + * (mr->rangesize - 1))))) { + DEBUGP("SNAT: Target size %u wrong for %u ranges\n", + targinfosize, mr->rangesize); + return 0; + } + + if (hook_mask & ~(1 << NF_IP_POST_ROUTING)) { + DEBUGP("SNAT: hook mask 0x%x bad\n", hook_mask); + return 0; + } + return 1; +} + +static int ipt_dnat_checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ip_nat_multi_range *mr = targinfo; + + /* Must be a valid range */ + if (targinfosize < sizeof(struct ip_nat_multi_range)) { + DEBUGP("DNAT: Target size %u too small\n", targinfosize); + return 0; + } + + if (targinfosize != IPT_ALIGN((sizeof(struct ip_nat_multi_range) + + (sizeof(struct ip_nat_range) + * (mr->rangesize - 1))))) { + DEBUGP("DNAT: Target size %u wrong for %u ranges\n", + targinfosize, mr->rangesize); + return 0; + } + + if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))) { + DEBUGP("DNAT: hook mask 0x%x bad\n", hook_mask); + return 0; + } + return 1; +} + +static inline unsigned int +alloc_null_binding(struct ip_conntrack *conntrack, + struct ip_nat_info *info, + unsigned int hooknum) +{ + /* Force range to this IP; let proto decide mapping for + per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). + Use reply in case it's already been mangled (eg local packet). 
+ */ + u_int32_t ip + = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC + ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip + : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip); + struct ip_nat_multi_range mr + = { 1, { { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } } } }; + + DEBUGP("Allocating NULL binding for %p (%u.%u.%u.%u)\n", conntrack, + IP_PARTS(ip)); + return ip_nat_setup_info(conntrack, &mr, hooknum); +} + +static inline int call_expect(const struct ip_nat_expect *i, + struct sk_buff **pskb, + unsigned int hooknum, + struct ip_conntrack *ct, + struct ip_nat_info *info, + struct ip_conntrack *master, + struct ip_nat_info *masterinfo, + unsigned int *verdict) +{ + return i->expect(pskb, hooknum, ct, info, master, masterinfo, + verdict); +} + +int ip_nat_rule_find(struct sk_buff **pskb, + unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + struct ip_conntrack *ct, + struct ip_nat_info *info) +{ + int ret; + + /* Master won't vanish while this ctrack still alive */ + if (ct->master.master) { + struct ip_conntrack *master; + + master = (struct ip_conntrack *)ct->master.master; + if (LIST_FIND(&nat_expect_list, + call_expect, + struct ip_nat_expect *, + pskb, hooknum, ct, info, + master, &master->nat.info, &ret)) + return ret; + } + ret = ipt_do_table(pskb, hooknum, in, out, &nat_table, NULL); + if (ret == NF_ACCEPT) { + if (!(info->initialized & (1 << HOOK2MANIP(hooknum)))) + /* NUL mapping */ + ret = alloc_null_binding(ct, info, hooknum); + } + return ret; +} + +int ip_nat_expect_register(struct ip_nat_expect *expect) +{ + WRITE_LOCK(&ip_nat_lock); + list_prepend(&nat_expect_list, expect); + WRITE_UNLOCK(&ip_nat_lock); + + return 0; +} + +void ip_nat_expect_unregister(struct ip_nat_expect *expect) +{ + WRITE_LOCK(&ip_nat_lock); + LIST_DELETE(&nat_expect_list, expect); + WRITE_UNLOCK(&ip_nat_lock); +} + +static struct ipt_target ipt_snat_reg += { { NULL, NULL }, "SNAT", ipt_snat_target, ipt_snat_checkentry, NULL }; +static 
struct ipt_target ipt_dnat_reg += { { NULL, NULL }, "DNAT", ipt_dnat_target, ipt_dnat_checkentry, NULL }; + +int __init ip_nat_rule_init(void) +{ + int ret; + + ret = ipt_register_table(&nat_table); + if (ret != 0) + return ret; + ret = ipt_register_target(&ipt_snat_reg); + if (ret != 0) + goto unregister_table; + + ret = ipt_register_target(&ipt_dnat_reg); + if (ret != 0) + goto unregister_snat; + + return ret; + + unregister_snat: + ipt_unregister_target(&ipt_snat_reg); + unregister_table: + ipt_unregister_table(&nat_table); + + return ret; +} + +void ip_nat_rule_cleanup(void) +{ + ipt_unregister_target(&ipt_dnat_reg); + ipt_unregister_target(&ipt_snat_reg); + ipt_unregister_table(&nat_table); +} diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c new file mode 100644 index 000000000..bf278d6f9 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -0,0 +1,273 @@ +/* This file contains all the functions required for the standalone + ip_nat module. + + These are not required by the compatibility layer. +*/ + +/* (c) 1999 Paul `Rusty' Russell. Licenced under the GNU General + Public Licence. 
*/ + +#ifdef MODULE +#define EXPORT_SYMTAB +#endif +#include <linux/types.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <net/checksum.h> +#include <linux/spinlock.h> +#include <linux/version.h> +#include <linux/brlock.h> + +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) + +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_nat_protocol.h> +#include <linux/netfilter_ipv4/ip_nat_core.h> +#include <linux/netfilter_ipv4/ip_nat_helper.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ip_conntrack_core.h> +#include <linux/netfilter_ipv4/listhelp.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +#define HOOKNAME(hooknum) ((hooknum) == NF_IP_POST_ROUTING ? "POST_ROUTING" \ + : ((hooknum) == NF_IP_PRE_ROUTING ? "PRE_ROUTING" \ + : ((hooknum) == NF_IP_LOCAL_OUT ? "LOCAL_OUT" \ + : "*ERROR*"))) + +static unsigned int +ip_nat_fn(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + struct ip_nat_info *info; + /* maniptype == SRC for postrouting. */ + enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum); + + /* We never see fragments: conntrack defrags on pre-routing + and local-out, and ip_nat_out protects post-routing. */ + IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off + & __constant_htons(IP_MF|IP_OFFSET))); + + /* FIXME: One day, fill in properly. 
--RR */ + (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED; + + /* If we had a hardware checksum before, it's now invalid */ + if ((*pskb)->pkt_type != PACKET_LOOPBACK) + (*pskb)->ip_summed = CHECKSUM_NONE; + + ct = ip_conntrack_get(*pskb, &ctinfo); + /* Can't track? Maybe out of memory: this would make NAT + unreliable. */ + if (!ct) + return NF_DROP; + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED+IP_CT_IS_REPLY: + if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) { + icmp_reply_translation(*pskb, ct, hooknum, + CTINFO2DIR(ctinfo)); + return NF_ACCEPT; + } + /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ + case IP_CT_NEW: + info = &ct->nat.info; + + WRITE_LOCK(&ip_nat_lock); + /* Seen it before? This can happen for loopback, retrans, + or local packets.. */ + if (!(info->initialized & (1 << maniptype))) { + int in_hashes = info->initialized; + unsigned int ret; + + ret = ip_nat_rule_find(pskb, hooknum, in, out, + ct, info); + if (ret != NF_ACCEPT) { + WRITE_UNLOCK(&ip_nat_lock); + return ret; + } + + if (in_hashes) { + IP_NF_ASSERT(info->bysource.conntrack); + replace_in_hashes(ct, info); + } else { + place_in_hashes(ct, info); + } + } else + DEBUGP("Already setup manip %s for ct %p\n", + maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST", + ct); + WRITE_UNLOCK(&ip_nat_lock); + break; + + default: + /* ESTABLISHED */ + IP_NF_ASSERT(ctinfo == IP_CT_ESTABLISHED + || ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY)); + info = &ct->nat.info; + } + + IP_NF_ASSERT(info); + return do_bindings(ct, ctinfo, info, hooknum, pskb); +} + +static unsigned int +ip_nat_out(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* We can hit fragment here; forwarded packets get + defragmented by connection tracking coming in, then + fragmented (grr) by the forward code. 
+ + In future: If we have nfct != NULL, AND we have NAT + initialized, AND there is no helper, then we can do full + NAPT on the head, and IP-address-only NAT on the rest. + + I'm starting to have nightmares about fragments. */ + + if ((*pskb)->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) { + *pskb = ip_ct_gather_frags(*pskb); + + if (!*pskb) + return NF_STOLEN; + } + + return ip_nat_fn(hooknum, pskb, in, out, okfn); +} + +/* We must be after connection tracking and before packet filtering. */ + +/* Before packet filtering, change destination */ +static struct nf_hook_ops ip_nat_in_ops += { { NULL, NULL }, ip_nat_fn, PF_INET, NF_IP_PRE_ROUTING, NF_IP_PRI_NAT_DST }; +/* After packet filtering, change source */ +static struct nf_hook_ops ip_nat_out_ops += { { NULL, NULL }, ip_nat_out, PF_INET, NF_IP_POST_ROUTING, NF_IP_PRI_NAT_SRC}; +/* Before packet filtering, change destination */ +static struct nf_hook_ops ip_nat_local_out_ops += { { NULL, NULL }, ip_nat_fn, PF_INET, NF_IP_LOCAL_OUT, NF_IP_PRI_NAT_DST }; + +/* Protocol registration. */ +int ip_nat_protocol_register(struct ip_nat_protocol *proto) +{ + int ret = 0; + struct list_head *i; + + WRITE_LOCK(&ip_nat_lock); + for (i = protos.next; i != &protos; i = i->next) { + if (((struct ip_nat_protocol *)i)->protonum + == proto->protonum) { + ret = -EBUSY; + goto out; + } + } + + list_prepend(&protos, proto); + MOD_INC_USE_COUNT; + + out: + WRITE_UNLOCK(&ip_nat_lock); + return ret; +} + +/* Noone stores the protocol anywhere; simply delete it. */ +void ip_nat_protocol_unregister(struct ip_nat_protocol *proto) +{ + WRITE_LOCK(&ip_nat_lock); + LIST_DELETE(&protos, proto); + WRITE_UNLOCK(&ip_nat_lock); + + /* Someone could be still looking at the proto in a bh. 
*/ + br_write_lock_bh(BR_NETPROTO_LOCK); + br_write_unlock_bh(BR_NETPROTO_LOCK); + + MOD_DEC_USE_COUNT; +} + +static int init_or_cleanup(int init) +{ + int ret = 0; + + if (!init) goto cleanup; + + ret = ip_nat_rule_init(); + if (ret < 0) { + printk("ip_nat_init: can't setup rules.\n"); + goto cleanup_nothing; + } + ret = ip_nat_init(); + if (ret < 0) { + printk("ip_nat_init: can't setup rules.\n"); + goto cleanup_rule_init; + } + ret = nf_register_hook(&ip_nat_in_ops); + if (ret < 0) { + printk("ip_nat_init: can't register in hook.\n"); + goto cleanup_nat; + } + ret = nf_register_hook(&ip_nat_out_ops); + if (ret < 0) { + printk("ip_nat_init: can't register out hook.\n"); + goto cleanup_inops; + } + ret = nf_register_hook(&ip_nat_local_out_ops); + if (ret < 0) { + printk("ip_nat_init: can't register local out hook.\n"); + goto cleanup_outops; + } + __MOD_INC_USE_COUNT(ip_conntrack_module); + return ret; + + cleanup: + __MOD_DEC_USE_COUNT(ip_conntrack_module); + nf_unregister_hook(&ip_nat_local_out_ops); + cleanup_outops: + nf_unregister_hook(&ip_nat_out_ops); + cleanup_inops: + nf_unregister_hook(&ip_nat_in_ops); + cleanup_nat: + ip_nat_cleanup(); + cleanup_rule_init: + ip_nat_rule_cleanup(); + cleanup_nothing: + MUST_BE_READ_WRITE_UNLOCKED(&ip_nat_lock); + return ret; +} + +static int __init init(void) +{ + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +module_init(init); +module_exit(fini); + +EXPORT_SYMBOL(ip_nat_setup_info); +EXPORT_SYMBOL(ip_nat_helper_register); +EXPORT_SYMBOL(ip_nat_helper_unregister); +EXPORT_SYMBOL(ip_nat_expect_register); +EXPORT_SYMBOL(ip_nat_expect_unregister); +EXPORT_SYMBOL(ip_nat_cheat_check); diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c new file mode 100644 index 000000000..d5ca01aa6 --- /dev/null +++ b/net/ipv4/netfilter/ip_queue.c @@ -0,0 +1,752 @@ +/* + * This is a module which is used for queueing IPv4 packets and + * communicating with userspace via 
netlink. + * + * (C) 2000 James Morris + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/init.h> +#include <linux/ip.h> +#include <linux/notifier.h> +#include <linux/netdevice.h> +#include <linux/netfilter.h> +#include <linux/netlink.h> +#include <linux/spinlock.h> +#include <linux/smp_lock.h> +#include <linux/rtnetlink.h> +#include <linux/sysctl.h> +#include <linux/proc_fs.h> +#include <net/sock.h> + +#include <linux/netfilter_ipv4/ip_queue.h> + +EXPORT_NO_SYMBOLS; + +#define IPQ_THR_NAME "kipq" +#define IPQ_NAME "ip_queue" +#define IPQ_QMAX_DEFAULT 1024 + +#define IPQ_PROC_FS_NAME "ip_queue" + +#define NET_IPQ_QMAX 2088 +#define NET_IPQ_QMAX_NAME "ip_queue_maxlen" + +typedef struct ipq_queue_element { + struct list_head list; /* Links element into queue */ + unsigned char state; /* State of this element */ + int verdict; /* Current verdict */ + struct nf_info *info; /* Extra info from netfilter */ + struct sk_buff *skb; /* Packet inside */ +} ipq_queue_element_t; + +typedef int (*ipq_send_cb_t)(ipq_queue_element_t *e); + +typedef struct ipq_peer { + pid_t pid; /* PID of userland peer */ + unsigned char died; /* We think the peer died */ + unsigned char copy_mode; /* Copy packet as well as metadata? 
*/ + size_t copy_range; /* Range past metadata to copy */ + ipq_send_cb_t send; /* Callback for sending data to peer */ +} ipq_peer_t; + +typedef struct ipq_thread { + pid_t pid; /* PID of kernel thread */ + unsigned char terminate; /* Termination flag */ + unsigned char running; /* Running flag */ + wait_queue_head_t wq; /* I/O wait queue */ + void (*process)(void *data); /* Queue processing function */ +} ipq_thread_t; + +typedef struct ipq_queue { + int len; /* Current queue len */ + int *maxlen; /* Maximum queue len, via sysctl */ + unsigned char state; /* Current queue state */ + struct list_head list; /* Head of packet queue */ + spinlock_t lock; /* Queue spinlock */ + ipq_peer_t peer; /* Userland peer */ + ipq_thread_t thread; /* Thread context */ +} ipq_queue_t; + + +/**************************************************************************** +* +* Kernel thread +* +****************************************************************************/ + +static void ipq_thread_init(char *thread_name) +{ + lock_kernel(); + exit_files(current); + daemonize(); + strcpy(current->comm, thread_name); + unlock_kernel(); + spin_lock_irq(¤t->sigmask_lock); + flush_signals(current); + sigfillset(¤t->blocked); + recalc_sigpending(current); + spin_unlock_irq(¤t->sigmask_lock); +} + +static int ipq_thread_start(void *data) +{ + ipq_queue_t *q = (ipq_queue_t *)data; + + q->thread.running = 1; + ipq_thread_init(IPQ_THR_NAME); + q->thread.pid = current->pid; + while (!q->thread.terminate) { + interruptible_sleep_on(&q->thread.wq); + q->thread.process(q); + } + q->thread.running = 0; + return 0; +} + +static void ipq_thread_stop(ipq_queue_t *q) +{ + if (!(q->thread.pid || q->thread.running)) + return; + q->state = IPQ_QS_FLUSH; + q->thread.terminate = 1; + wake_up_interruptible(&q->thread.wq); + current->state = TASK_INTERRUPTIBLE; + while (q->thread.running) { + schedule_timeout(HZ/10); + current->state = TASK_RUNNING; + } +} + +static int ipq_thread_create(ipq_queue_t *q) +{ + 
int status = kernel_thread(ipq_thread_start, q, 0); + return (status < 0) ? status : 0; +} + + +/**************************************************************************** + * + * Packet queue + * + ****************************************************************************/ + +/* Must be called under spinlock */ +static __inline__ void +ipq_dequeue(ipq_queue_t *q, + ipq_queue_element_t *e) +{ + list_del(&e->list); + nf_reinject(e->skb, e->info, e->verdict); + kfree(e); + q->len--; +} + +/* Must be called under spinlock */ +static __inline__ void +ipq_queue_drop(ipq_queue_t *q, + ipq_queue_element_t *e) +{ + e->verdict = NF_DROP; + ipq_dequeue(q, e); +} + +static int +ipq_notify_peer(ipq_queue_t *q, + ipq_queue_element_t *e) +{ + int status = q->peer.send(e); + + if (status >= 0) { + e->state = IPQ_PS_WAITING; + return status; + } + if (status == -ERESTARTSYS || status == -EAGAIN) + return 0; + printk(KERN_INFO "%s: error notifying peer %d, resetting " + "state and flushing queue\n", IPQ_NAME, q->peer.pid); + q->state = IPQ_QS_FLUSH; + q->peer.died = 1; + q->peer.pid = 0; + q->peer.copy_mode = IPQ_COPY_META; + q->peer.copy_range = 0; + return status; +} + +static void +ipq_queue_process(void *data) +{ + struct list_head *i; + ipq_queue_t *q = (ipq_queue_t *)data; + +restart: + if (q->state == IPQ_QS_HOLD) + return; + spin_lock_bh(&q->lock); + for (i = q->list.prev; i != &q->list; i = i->prev) { + ipq_queue_element_t *e = (ipq_queue_element_t *)i; + + if (q->state == IPQ_QS_FLUSH) { + QDEBUG("flushing packet %p\n", e); + ipq_queue_drop(q, e); + continue; + } + switch (e->state) { + case IPQ_PS_NEW: { + int status = ipq_notify_peer(q, e); + if (status < 0) { + spin_unlock_bh(&q->lock); + goto restart; + } + break; + } + case IPQ_PS_VERDICT: + ipq_dequeue(q, e); + break; + case IPQ_PS_WAITING: + break; + default: + printk(KERN_INFO "%s: dropping stuck packet %p " + "with ps=%d qs=%d\n", IPQ_NAME, + e, e->state, q->state); + ipq_queue_drop(q, e); + } + } + 
spin_unlock_bh(&q->lock); + if (q->state == IPQ_QS_FLUSH) + q->state = IPQ_QS_HOLD; +} + +static ipq_queue_t * +ipq_queue_create(nf_queue_outfn_t outfn, + ipq_send_cb_t send_cb, + int *errp, + int *sysctl_qmax) +{ + int status; + ipq_queue_t *q; + + *errp = 0; + q = kmalloc(sizeof(ipq_queue_t), GFP_KERNEL); + if (q == NULL) { + *errp = -ENOMEM; + return NULL; + } + q->thread.terminate = 0; + q->thread.running = 0; + q->thread.process = ipq_queue_process; + init_waitqueue_head(&q->thread.wq); + q->peer.pid = 0; + q->peer.died = 0; + q->peer.copy_mode = IPQ_COPY_META; + q->peer.copy_range = 0; + q->peer.send = send_cb; + q->len = 0; + q->maxlen = sysctl_qmax; + q->state = IPQ_QS_HOLD; + INIT_LIST_HEAD(&q->list); + spin_lock_init(&q->lock); + status = nf_register_queue_handler(PF_INET, outfn, q); + if (status < 0) { + *errp = -EBUSY; + kfree(q); + return NULL; + } + status = ipq_thread_create(q); + if (status < 0) { + nf_unregister_queue_handler(PF_INET); + *errp = status; + kfree(q); + return NULL; + } + return q; +} + +static int +ipq_enqueue(ipq_queue_t *q, + struct sk_buff *skb, + struct nf_info *info) +{ + ipq_queue_element_t *e = NULL; + + e = kmalloc(sizeof(*e), GFP_ATOMIC); + if (e == NULL) { + printk(KERN_ERR "%s: out of memory in %s\n", + IPQ_NAME, __FUNCTION__); + return -ENOMEM; + } + e->state = IPQ_PS_NEW; + e->verdict = NF_DROP; + e->info = info; + e->skb = skb; + spin_lock_bh(&q->lock); + if (q->len >= *q->maxlen) { + spin_unlock_bh(&q->lock); + printk(KERN_WARNING "%s: queue full at %d entries, " + "dropping packet.\n", IPQ_NAME, q->len); + kfree(e); + nf_reinject(skb, info, NF_DROP); + return 0; + } + list_add(&e->list, &q->list); + q->len++; + spin_unlock_bh(&q->lock); + wake_up_interruptible(&q->thread.wq); + return 0; +} + +/* FIXME: need to find a way to notify user during module unload */ +static void +ipq_queue_destroy(ipq_queue_t *q) +{ + ipq_thread_stop(q); + nf_unregister_queue_handler(PF_INET); + kfree(q); +} + +static int 
+ipq_queue_mangle_ipv4(unsigned char *buf, + ipq_verdict_msg_t *v, + ipq_queue_element_t *e) +{ + struct iphdr *user_iph = (struct iphdr *)buf; + + if (v->data_len < sizeof(*user_iph)) + return 0; + + if (e->skb->nh.iph->check != user_iph->check) { + int diff = v->data_len - e->skb->len; + + if (diff < 0) + skb_trim(e->skb, v->data_len); + else if (diff > 0) { + if (v->data_len > 0xFFFF) { + e->verdict = NF_DROP; + return -EINVAL; + } + if (diff > skb_tailroom(e->skb)) { + struct sk_buff *newskb; + + /* Ack, we waste a memcpy() of data here */ + newskb = skb_copy_expand(e->skb, + skb_headroom(e->skb), + diff, + GFP_ATOMIC); + if (newskb == NULL) { + printk(KERN_WARNING "%s: OOM in %s, " + "dropping packet\n", + IPQ_THR_NAME, __FUNCTION__); + e->verdict = NF_DROP; + return -ENOMEM; + } + kfree_skb(e->skb); + e->skb = newskb; + } + skb_put(e->skb, diff); + } + memcpy(e->skb->data, buf, v->data_len); + e->skb->nfcache |= NFC_ALTERED; + } + return 0; +} + +static int +ipq_queue_set_verdict(ipq_queue_t *q, + ipq_verdict_msg_t *v, + unsigned char *buf, + unsigned int len) +{ + struct list_head *i; + + if (v->value < 0 || v->value > NF_MAX_VERDICT) + return -EINVAL; + spin_lock_bh(&q->lock); + for (i = q->list.next; i != &q->list; i = i->next) { + ipq_queue_element_t *e = (ipq_queue_element_t *)i; + + if (v->id == (unsigned long )e) { + int status = 0; + e->state = IPQ_PS_VERDICT; + e->verdict = v->value; + + if (buf && v->data_len == len) + status = ipq_queue_mangle_ipv4(buf, v, e); + spin_unlock_bh(&q->lock); + return status; + } + } + spin_unlock_bh(&q->lock); + return -ENOENT; +} + +static int +ipq_receive_peer(ipq_queue_t *q, + ipq_peer_msg_t *m, + unsigned char type, + unsigned int len) +{ + if (q->state == IPQ_QS_FLUSH) + return -EBUSY; + + if (len < sizeof(ipq_peer_msg_t)) + return -EINVAL; + + switch (type) { + case IPQM_MODE: + switch (m->msg.mode.value) { + case IPQ_COPY_NONE: + q->peer.copy_mode = IPQ_COPY_NONE; + q->peer.copy_range = 0; + q->state = 
IPQ_QS_FLUSH; + break; + case IPQ_COPY_META: + if (q->state == IPQ_QS_FLUSH) + return -EAGAIN; + q->peer.copy_mode = IPQ_COPY_META; + q->peer.copy_range = 0; + q->state = IPQ_QS_COPY; + break; + case IPQ_COPY_PACKET: + if (q->state == IPQ_QS_FLUSH) + return -EAGAIN; + q->peer.copy_mode = IPQ_COPY_PACKET; + q->peer.copy_range = m->msg.mode.range; + q->state = IPQ_QS_COPY; + break; + default: + return -EINVAL; + } + break; + case IPQM_VERDICT: { + int status; + unsigned char *data = NULL; + + if (m->msg.verdict.value > NF_MAX_VERDICT) + return -EINVAL; + if (m->msg.verdict.data_len) + data = (unsigned char *)m + sizeof(*m); + status = ipq_queue_set_verdict(q, &m->msg.verdict, + data, len - sizeof(*m)); + if (status < 0) + return status; + break; + } + default: + return -EINVAL; + } + wake_up_interruptible(&q->thread.wq); + return 0; +} + + +/**************************************************************************** + * + * Netfilter interface + * + ****************************************************************************/ + +/* + * Packets arrive here from netfilter for queuing to userspace. + * All of them must be fed back via nf_reinject() or Alexey will kill Rusty. + */ +static int +receive_netfilter(struct sk_buff *skb, + struct nf_info *info, + void *data) +{ + ipq_queue_t *q = (ipq_queue_t *)data; + + if (q->state == IPQ_QS_FLUSH) + return -EBUSY; + return ipq_enqueue(q, skb, info); +} + +/**************************************************************************** + * + * Netlink interface. 
+ * + ****************************************************************************/ + +static struct sk_buff * +netlink_build_message(ipq_queue_element_t *e, + int *errp); + +extern __inline__ void +receive_user_skb(struct sk_buff *skb); + +static int +netlink_send_peer(ipq_queue_element_t *e); + +static struct sock *nfnl = NULL; +ipq_queue_t *nlq = NULL; + +static int +netlink_send_peer(ipq_queue_element_t *e) +{ + int status = 0; + struct sk_buff *skb; + + if (!nlq->peer.pid) + return -EINVAL; + skb = netlink_build_message(e, &status); + if (skb == NULL) + return status; + return netlink_unicast(nfnl, skb, nlq->peer.pid, 0); +} + +static struct sk_buff * +netlink_build_message(ipq_queue_element_t *e, + int *errp) +{ + unsigned char *old_tail; + size_t size = 0; + size_t data_len = 0; + struct sk_buff *skb; + ipq_packet_msg_t *pm; + struct nlmsghdr *nlh; + + switch (nlq->peer.copy_mode) { + size_t copy_range; + + case IPQ_COPY_META: + size = NLMSG_SPACE(sizeof(*pm)); + data_len = 0; + break; + case IPQ_COPY_PACKET: + copy_range = nlq->peer.copy_range; + if (copy_range == 0 || copy_range > e->skb->len) + data_len = e->skb->len; + else + data_len = copy_range; + size = NLMSG_SPACE(sizeof(*pm) + data_len); + break; + case IPQ_COPY_NONE: + default: + *errp = -EINVAL; + return NULL; + } + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + goto nlmsg_failure; + old_tail = skb->tail; + nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh)); + pm = NLMSG_DATA(nlh); + memset(pm, 0, sizeof(*pm)); + pm->packet_id = (unsigned long )e; + pm->data_len = data_len; + pm->timestamp_sec = e->skb->stamp.tv_sec; + pm->timestamp_usec = e->skb->stamp.tv_usec; + pm->hook = e->info->hook; + if (e->info->indev) strcpy(pm->indev_name, e->info->indev->name); + else pm->indev_name[0] = '\0'; + if (e->info->outdev) strcpy(pm->outdev_name, e->info->outdev->name); + else pm->outdev_name[0] = '\0'; + if (data_len) + memcpy(++pm, e->skb->data, data_len); + nlh->nlmsg_len = skb->tail - 
old_tail; + NETLINK_CB(skb).dst_groups = 0; + return skb; +nlmsg_failure: + if (skb) + kfree(skb); + *errp = 0; + printk(KERN_ERR "%s: error creating netlink message\n", IPQ_NAME); + return NULL; +} + +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0); +/* + * FIXME: ping old peer if we detect a new peer then resend. + */ +extern __inline__ void +receive_user_skb(struct sk_buff *skb) +{ + int status, type; + struct nlmsghdr *nlh; + + nlh = (struct nlmsghdr *)skb->data; + if (nlh->nlmsg_len < sizeof(*nlh) + || skb->len < nlh->nlmsg_len + || nlh->nlmsg_pid <= 0 + || !(nlh->nlmsg_flags & NLM_F_REQUEST) + || nlh->nlmsg_flags & NLM_F_MULTI) + RCV_SKB_FAIL(-EINVAL); + if (nlh->nlmsg_flags & MSG_TRUNC) + RCV_SKB_FAIL(-ECOMM); + type = nlh->nlmsg_type; + if (type < NLMSG_NOOP || type >= IPQM_MAX) + RCV_SKB_FAIL(-EINVAL); + if (type <= IPQM_BASE) + return; + if(!cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) + RCV_SKB_FAIL(-EPERM); + if (nlq->peer.pid && !nlq->peer.died + && (nlq->peer.pid != nlh->nlmsg_pid)) + printk(KERN_WARNING "%s: peer pid changed from %d to %d\n", + IPQ_NAME, nlq->peer.pid, nlh->nlmsg_pid); + nlq->peer.pid = nlh->nlmsg_pid; + nlq->peer.died = 0; + status = ipq_receive_peer(nlq, NLMSG_DATA(nlh), + type, skb->len - NLMSG_LENGTH(0)); + if (status < 0) + RCV_SKB_FAIL(status); + if (nlh->nlmsg_flags & NLM_F_ACK) + netlink_ack(skb, nlh, 0); + return; +} + +/* Note: we are only dealing with single part messages at the moment. 
 */
/* Drain the netlink socket's receive queue, handing each message skb to
 * receive_user_skb().  Serialised with the rtnl semaphore; if the lock
 * is contended we simply return without processing. */
static void
receive_user_sk(struct sock *sk,
		int len)
{
	do {
		struct sk_buff *skb;

		if (rtnl_shlock_nowait())
			return;
		while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) {
			receive_user_skb(skb);
			kfree_skb(skb);
		}
		up(&rtnl_sem);	/* releases the rtnl_shlock_nowait() above */
	} while (nfnl && nfnl->receive_queue.qlen);
}


/****************************************************************************
 *
 * System events
 *
 ****************************************************************************/

/* Netdevice notifier callback: on NETDEV_UNREGISTER, stop the queue
 * thread, which flushes (drops) all currently queued packets.
 * NOTE(review): this fires for ANY device unregistering, not only ones
 * referenced by queued packets -- presumably a deliberately conservative
 * choice; confirm. */
static int
receive_event(struct notifier_block *this,
	      unsigned long event,
	      void *ptr)
{
	if (event == NETDEV_UNREGISTER)
		if (nlq)
			ipq_thread_stop(nlq);
	return NOTIFY_DONE;
}

struct notifier_block ipq_dev_notifier = {
	receive_event,		/* notifier_call */
	NULL,			/* next */
	0			/* priority */
};


/****************************************************************************
 *
 * Sysctl - queue tuning.
 *
 ****************************************************************************/

/* Maximum queue length, exposed as net.ipv4.ip_queue_maxlen */
static int sysctl_maxlen = IPQ_QMAX_DEFAULT;

static struct ctl_table_header *ipq_sysctl_header;

static ctl_table ipq_table[] = {
	{ NET_IPQ_QMAX, NET_IPQ_QMAX_NAME, &sysctl_maxlen,
	  sizeof(sysctl_maxlen), 0644, NULL, proc_dointvec },
	{ 0 }
};

static ctl_table ipq_dir_table[] = {
	{NET_IPV4, "ipv4", NULL, 0, 0555, ipq_table, 0, 0, 0, 0, 0},
	{ 0 }
};

static ctl_table ipq_root_table[] = {
	{CTL_NET, "net", NULL, 0, 0555, ipq_dir_table, 0, 0, 0, 0, 0},
	{ 0 }
};

/****************************************************************************
 *
 * Procfs - debugging info.
 *
 ****************************************************************************/

/* /proc/net/ip_queue read handler: dump thread, peer and queue state
 * under the queue lock.  Standard proc_net_create() get_info
 * offset/length clipping at the end. */
static int
ipq_get_info(char *buffer, char **start, off_t offset, int length)
{
	int len;

	spin_lock_bh(&nlq->lock);
	len = sprintf(buffer,
	              "Thread pid : %d\n"
	              "Thread terminate : %d\n"
	              "Thread running : %d\n"
	              "Peer pid : %d\n"
	              "Peer died : %d\n"
	              "Peer copy mode : %d\n"
	              "Peer copy range : %d\n"
	              "Queue length : %d\n"
	              "Queue max. length : %d\n"
	              "Queue state : %d\n",
	              nlq->thread.pid,
	              nlq->thread.terminate,
	              nlq->thread.running,
	              nlq->peer.pid,
	              nlq->peer.died,
	              nlq->peer.copy_mode,
	              nlq->peer.copy_range,
	              nlq->len,
	              *nlq->maxlen,
	              nlq->state);
	spin_unlock_bh(&nlq->lock);
	*start = buffer + offset;
	len -= offset;
	if (len > length)
		len = length;
	else if (len < 0)
		len = 0;
	return len;
}

/****************************************************************************
 *
 * Module stuff.
 *
 ****************************************************************************/

/* Module init: create the NETLINK_FIREWALL kernel socket, the queue
 * (which registers the PF_INET queue handler and starts the kernel
 * thread), then the notifier, procfs entry and sysctl.  On queue
 * creation failure the netlink socket is released before returning. */
static int __init init(void)
{
	int status = 0;

	nfnl = netlink_kernel_create(NETLINK_FIREWALL, receive_user_sk);
	if (nfnl == NULL) {
		printk(KERN_ERR "%s: initialisation failed: unable to "
		       "create kernel netlink socket\n", IPQ_NAME);
		return -ENOMEM;
	}
	nlq = ipq_queue_create(receive_netfilter,
	                       netlink_send_peer, &status, &sysctl_maxlen);
	if (nlq == NULL) {
		printk(KERN_ERR "%s: initialisation failed: unable to "
		       "initialise queue\n", IPQ_NAME);
		sock_release(nfnl->socket);
		return status;
	}
	register_netdevice_notifier(&ipq_dev_notifier);
	proc_net_create(IPQ_PROC_FS_NAME, 0, ipq_get_info);
	ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);
	return status;
}

/* Module exit: tear down in reverse order of init(). */
static void __exit fini(void)
{
	unregister_sysctl_table(ipq_sysctl_header);
	proc_net_remove(IPQ_PROC_FS_NAME);
	unregister_netdevice_notifier(&ipq_dev_notifier);
	ipq_queue_destroy(nlq);
	sock_release(nfnl->socket);
}

MODULE_DESCRIPTION("IPv4 packet
queue handler"); +module_init(init); +module_exit(fini); + diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c new file mode 100644 index 000000000..8cc8c24ac --- /dev/null +++ b/net/ipv4/netfilter/ip_tables.c @@ -0,0 +1,1664 @@ +/* + * Packet matching code. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + */ +#include <linux/config.h> +#include <linux/skbuff.h> +#include <linux/kmod.h> +#include <linux/vmalloc.h> +#include <linux/netdevice.h> +#include <linux/module.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/icmp.h> +#include <asm/uaccess.h> +#include <asm/semaphore.h> + +#include <linux/netfilter_ipv4/ip_tables.h> + +#ifndef IP_OFFSET +#define IP_OFFSET 0x1FFF +#endif + +/*#define DEBUG_IP_FIREWALL*/ +/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */ +/*#define DEBUG_IP_FIREWALL_USER*/ + +#ifdef DEBUG_IP_FIREWALL +#define dprintf(format, args...) printk(format , ## args) +#else +#define dprintf(format, args...) +#endif + +#ifdef DEBUG_IP_FIREWALL_USER +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) +#endif + +#ifdef CONFIG_NETFILTER_DEBUG +#define IP_NF_ASSERT(x) \ +do { \ + if (!(x)) \ + printk("IPT_ASSERT: %s:%s:%u\n", \ + __FUNCTION__, __FILE__, __LINE__); \ +} while(0) +#else +#define IP_NF_ASSERT(x) +#endif +#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) + +/* Mutex protects lists (only traversed in user context). */ +static DECLARE_MUTEX(ipt_mutex); + +/* Must have mutex */ +#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0) +#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0) +#include <linux/netfilter_ipv4/lockhelp.h> +#include <linux/netfilter_ipv4/listhelp.h> + +#if 0 +/* All the better to debug you with... 
*/ +#define static +#define inline +#endif + +/* Locking is simple: we assume at worst case there will be one packet + in user context and one from bottom halves (or soft irq if Alexey's + softnet patch was applied). + + We keep a set of rules for each CPU, so we can avoid write-locking + them; doing a readlock_bh() stops packets coming through if we're + in user context. + + To be cache friendly on SMP, we arrange them like so: + [ n-entries ] + ... cache-align padding ... + [ n-entries ] + + Hence the start of any table is given by get_table() below. */ + +/* The table itself */ +struct ipt_table_info +{ + /* Size per table */ + unsigned int size; + /* Number of entries: FIXME. --RR */ + unsigned int number; + + /* Entry points and underflows */ + unsigned int hook_entry[NF_IP_NUMHOOKS]; + unsigned int underflow[NF_IP_NUMHOOKS]; + + char padding[SMP_ALIGN((NF_IP_NUMHOOKS*2+2)*sizeof(unsigned int))]; + + /* ipt_entry tables: one per CPU */ + char entries[0]; +}; + +static LIST_HEAD(ipt_target); +static LIST_HEAD(ipt_match); +static LIST_HEAD(ipt_tables); +#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) + +#ifdef CONFIG_SMP +#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*cpu_number_map(p)) +#else +#define TABLE_OFFSET(t,p) 0 +#endif + +#if 0 +#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) +#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) +#define up(x) do { printk("UP:%u:" #x "\n", __LINE__); up(x); } while(0) +#endif + +/* Returns whether matches rule or not. 
 */
/*
 * Core IPv4 rule match: check a packet's addresses, interfaces, protocol
 * and fragment flag against one struct ipt_ip, honouring the IPT_INV_*
 * inversion flags.  Returns 1 on match, 0 otherwise.
 */
static inline int
ip_packet_match(const struct iphdr *ip,
		const char *indev,
		const char *outdev,
		const struct ipt_ip *ipinfo,
		int isfrag)
{
	size_t i;
	unsigned long ret;

	/* Evaluate a condition, flipping the result when the
	 * corresponding inversion flag is set in the rule. */
#define FWINV(bool,invflg) ((bool) ^ !!(ipinfo->invflags & invflg))

	if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
		  IPT_INV_SRCIP)
	    || FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
		     IPT_INV_DSTIP)) {
		dprintf("Source or dest mismatch.\n");

		dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n",
			NIPQUAD(ip->saddr),
			NIPQUAD(ipinfo->smsk.s_addr),
			NIPQUAD(ipinfo->src.s_addr),
			ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : "");
		dprintf("DST: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n",
			NIPQUAD(ip->daddr),
			NIPQUAD(ipinfo->dmsk.s_addr),
			NIPQUAD(ipinfo->dst.s_addr),
			ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : "");
		return 0;
	}

	/* Look for ifname matches; this should unroll nicely.
	 * (Interface names are compared one unsigned long at a time
	 * under the rule's per-byte mask.) */
	for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
		ret |= (((const unsigned long *)indev)[i]
			^ ((const unsigned long *)ipinfo->iniface)[i])
			& ((const unsigned long *)ipinfo->iniface_mask)[i];
	}

	if (FWINV(ret != 0, IPT_INV_VIA_IN)) {
		dprintf("VIA in mismatch (%s vs %s).%s\n",
			indev, ipinfo->iniface,
			ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":"");
		return 0;
	}

	for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
		ret |= (((const unsigned long *)outdev)[i]
			^ ((const unsigned long *)ipinfo->outiface)[i])
			& ((const unsigned long *)ipinfo->outiface_mask)[i];
	}

	if (FWINV(ret != 0, IPT_INV_VIA_OUT)) {
		dprintf("VIA out mismatch (%s vs %s).%s\n",
			outdev, ipinfo->outiface,
			ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":"");
		return 0;
	}

	/* Check specific protocol */
	if (ipinfo->proto
	    && FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
		dprintf("Packet protocol %hi does not match %hi.%s\n",
			ip->protocol, ipinfo->proto,
			ipinfo->invflags&IPT_INV_PROTO ? " (INV)":"");
		return 0;
	}

	/* If we have a fragment rule but the packet is not a fragment
	 * then we return zero */
	if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) {
		dprintf("Fragment rule but not fragment.%s\n",
			ipinfo->invflags & IPT_INV_FRAG ? " (INV)" : "");
		return 0;
	}

	return 1;
}

/* Sanity-check the ipt_ip portion of a rule from userspace: reject any
 * flag or inversion-flag bits this kernel does not know about. */
static inline int
ip_checkentry(const struct ipt_ip *ip)
{
	if (ip->flags & ~IPT_F_MASK) {
		duprintf("Unknown flag bits set: %08X\n",
			 ip->flags & ~IPT_F_MASK);
		return 0;
	}
	if (ip->invflags & ~IPT_INV_MASK) {
		duprintf("Unknown invflag bits set: %08X\n",
			 ip->invflags & ~IPT_INV_MASK);
		return 0;
	}
	return 1;
}

/* Target function of the built-in ERROR rule: log (ratelimited) the
 * error-name payload and drop the packet. */
static unsigned int
ipt_error(struct sk_buff **pskb,
	  unsigned int hooknum,
	  const struct net_device *in,
	  const struct net_device *out,
	  const void *targinfo,
	  void *userinfo)
{
	if (net_ratelimit())
		printk("ip_tables: error: `%s'\n", (char *)targinfo);

	return NF_DROP;
}

/* IPT_MATCH_ITERATE callback: returns nonzero (aborting the iteration)
 * when a match function reports NO match. */
static inline
int do_match(struct ipt_entry_match *m,
	     const struct sk_buff *skb,
	     const struct net_device *in,
	     const struct net_device *out,
	     int offset,
	     const void *hdr,
	     u_int16_t datalen,
	     int *hotdrop)
{
	/* Stop iteration if it doesn't match */
	if (!m->u.match->match(skb, in, out, m->data,
			       offset, hdr, datalen, hotdrop))
		return 1;
	else
		return 0;
}

/* Translate a byte offset within a table blob into an entry pointer. */
static inline struct ipt_entry *
get_entry(void *base, unsigned int offset)
{
	return (struct ipt_entry *)(base + offset);
}

/* Returns one of the generic firewall policies, like NF_ACCEPT.
 */
/*
 * Main rule-traversal engine: walk the per-CPU copy of the given table
 * starting at the entry point for this hook, evaluating matches and
 * targets until a verdict is reached.  User-defined chains are handled
 * with an in-place "back pointer" stack stored in entry->comefrom
 * (written into the NEXT entry when jumping), so no recursion and no
 * extra memory are needed.  Runs under the table's read lock with
 * bottom halves disabled.
 */
unsigned int
ipt_do_table(struct sk_buff **pskb,
	     unsigned int hook,
	     const struct net_device *in,
	     const struct net_device *out,
	     struct ipt_table *table,
	     void *userdata)
{
	static const char nulldevname[IFNAMSIZ] = { 0 };
	u_int16_t offset;
	struct iphdr *ip;
	void *protohdr;
	u_int16_t datalen;
	int hotdrop = 0;
	/* Initializing verdict to NF_DROP keeps gcc happy. */
	unsigned int verdict = NF_DROP;
	const char *indev, *outdev;
	void *table_base;
	struct ipt_entry *e, *back;

	/* Initialization */
	ip = (*pskb)->nh.iph;
	protohdr = (u_int32_t *)ip + ip->ihl;
	datalen = (*pskb)->len - ip->ihl * 4;
	indev = in ? in->name : nulldevname;
	outdev = out ? out->name : nulldevname;
	/* We handle fragments by dealing with the first fragment as
	 * if it was a normal packet. All other fragments are treated
	 * normally, except that they will NEVER match rules that ask
	 * things we don't know, ie. tcp syn flag or ports). If the
	 * rule is also a fragment-specific rule, non-fragments won't
	 * match it. */
	offset = ntohs(ip->frag_off) & IP_OFFSET;

	read_lock_bh(&table->lock);
	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
	/* Select this CPU's private copy of the rule blob. */
	table_base = (void *)table->private->entries
		+ TABLE_OFFSET(table->private, smp_processor_id());
	e = get_entry(table_base, table->private->hook_entry[hook]);

	/* Check noone else using our table */
	IP_NF_ASSERT(((struct ipt_entry *)table_base)->comefrom == 0xdead57ac);
#ifdef CONFIG_NETFILTER_DEBUG
	((struct ipt_entry *)table_base)->comefrom = 0x57acc001;
#endif

	/* For return from builtin chain */
	back = get_entry(table_base, table->private->underflow[hook]);

	do {
		IP_NF_ASSERT(e);
		IP_NF_ASSERT(back);
		(*pskb)->nfcache |= e->nfcache;
		if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) {
			struct ipt_entry_target *t;

			if (IPT_MATCH_ITERATE(e, do_match,
					      *pskb, in, out,
					      offset, protohdr,
					      datalen, &hotdrop) != 0)
				goto no_match;

			ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);

			t = ipt_get_target(e);
			IP_NF_ASSERT(t->u.target);
			/* Standard target? */
			if (!t->u.target->target) {
				int v;

				v = ((struct ipt_standard_target *)t)->verdict;
				if (v < 0) {
					/* Pop from stack? */
					if (v != IPT_RETURN) {
						/* Absolute verdict, encoded
						 * as -(verdict+1). */
						verdict = (unsigned)(-v) - 1;
						break;
					}
					e = back;
					back = get_entry(table_base,
							 back->comefrom);
					continue;
				}
				if (table_base + v
				    != (void *)e + e->next_offset) {
					/* Save old back ptr in next entry */
					struct ipt_entry *next
						= (void *)e + e->next_offset;
					next->comefrom
						= (void *)back - table_base;
					/* set back pointer to next entry */
					back = next;
				}

				e = get_entry(table_base, v);
			} else {
				verdict = t->u.target->target(pskb, hook,
							      in, out,
							      t->data,
							      userdata);

				/* Target might have changed stuff. */
				ip = (*pskb)->nh.iph;
				protohdr = (u_int32_t *)ip + ip->ihl;
				datalen = (*pskb)->len - ip->ihl * 4;

				if (verdict == IPT_CONTINUE)
					e = (void *)e + e->next_offset;
				else
					/* Verdict */
					break;
			}
		} else {

		no_match:
			e = (void *)e + e->next_offset;
		}
	} while (!hotdrop);

#ifdef CONFIG_NETFILTER_DEBUG
	((struct ipt_entry *)table_base)->comefrom = 0xdead57ac;
#endif
	read_unlock_bh(&table->lock);

#ifdef DEBUG_ALLOW_ALL
	return NF_ACCEPT;
#else
	if (hotdrop)
		return NF_DROP;
	else return verdict;
#endif
}

/* If it succeeds, returns element and locks mutex */
static inline void *
find_inlist_lock_noload(struct list_head *head,
			const char *name,
			int *error,
			struct semaphore *mutex)
{
	void *ret;

#if 0
	duprintf("find_inlist: searching for `%s' in %s.\n",
		 name, head == &ipt_target ? "ipt_target"
		 : head == &ipt_match ? "ipt_match"
		 : head == &ipt_tables ? "ipt_tables" : "UNKNOWN");
#endif

	*error = down_interruptible(mutex);
	if (*error != 0)
		return NULL;

	ret = list_named_find(head, name);
	if (!ret) {
		/* Not found: release the mutex again; caller must not
		 * unlock. */
		*error = -ENOENT;
		up(mutex);
	}
	return ret;
}

#ifndef CONFIG_KMOD
#define find_inlist_lock(h,n,p,e,m) find_inlist_lock_noload((h),(n),(e),(m))
#else
/* As find_inlist_lock_noload(), but on a miss try to load the module
 * "<prefix><name>" via kmod and search once more. */
static void *
find_inlist_lock(struct list_head *head,
		 const char *name,
		 const char *prefix,
		 int *error,
		 struct semaphore *mutex)
{
	void *ret;

	ret = find_inlist_lock_noload(head, name, error, mutex);
	if (!ret) {
		char modulename[IPT_FUNCTION_MAXNAMELEN + strlen(prefix) + 1];
		strcpy(modulename, prefix);
		strcat(modulename, name);
		duprintf("find_inlist: loading `%s'.\n", modulename);
		request_module(modulename);
		ret = find_inlist_lock_noload(head, name, error, mutex);
	}

	return ret;
}
#endif

/* Look up a registered table by name, taking the ipt mutex on success. */
static inline struct ipt_table *
find_table_lock(const char *name, int *error, struct semaphore *mutex)
{
	return find_inlist_lock(&ipt_tables, name, "iptable_", error, mutex);
}

static inline struct ipt_match *
+find_match_lock(const char *name, int *error, struct semaphore *mutex) +{ + return find_inlist_lock(&ipt_match, name, "ipt_", error, mutex); +} + +static inline struct ipt_target * +find_target_lock(const char *name, int *error, struct semaphore *mutex) +{ + return find_inlist_lock(&ipt_target, name, "ipt_", error, mutex); +} + +/* All zeroes == unconditional rule. */ +static inline int +unconditional(const struct ipt_ip *ip) +{ + unsigned int i; + + for (i = 0; i < sizeof(*ip)/sizeof(__u32); i++) + if (((__u32 *)ip)[i]) + return 0; + + return 1; +} + +/* Figures out from what hook each rule can be called: returns 0 if + there are loops. Puts hook bitmask in comefrom. */ +static int +mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) +{ + unsigned int hook; + + /* No recursion; use packet counter to save back ptrs (reset + to 0 as we leave), and comefrom to save source hook bitmask */ + for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) { + unsigned int pos = newinfo->hook_entry[hook]; + struct ipt_entry *e + = (struct ipt_entry *)(newinfo->entries + pos); + + if (!(valid_hooks & (1 << hook))) + continue; + + /* Set initial back pointer. */ + e->counters.pcnt = pos; + + for (;;) { + struct ipt_standard_target *t + = (void *)ipt_get_target(e); + + if (e->comefrom & (1 << NF_IP_NUMHOOKS)) { + printk("iptables: loop hook %u pos %u %08X.\n", + hook, pos, e->comefrom); + return 0; + } + e->comefrom + |= ((1 << hook) | (1 << NF_IP_NUMHOOKS)); + + /* Unconditional return/END. */ + if (e->target_offset == sizeof(struct ipt_entry) + && (strcmp(t->target.u.name, IPT_STANDARD_TARGET) + == 0) + && t->verdict < 0 + && unconditional(&e->ip)) { + unsigned int oldpos, size; + + /* Return: backtrack through the last + big jump. 
*/ + do { + e->comefrom ^= (1<<NF_IP_NUMHOOKS); +#ifdef DEBUG_IP_FIREWALL_USER + if (e->comefrom + & (1 << NF_IP_NUMHOOKS)) { + duprintf("Back unset " + "on hook %u " + "rule %u\n", + hook, pos); + } +#endif + oldpos = pos; + pos = e->counters.pcnt; + e->counters.pcnt = 0; + + /* We're at the start. */ + if (pos == oldpos) + goto next; + + e = (struct ipt_entry *) + (newinfo->entries + pos); + } while (oldpos == pos + e->next_offset); + + /* Move along one */ + size = e->next_offset; + e = (struct ipt_entry *) + (newinfo->entries + pos + size); + e->counters.pcnt = pos; + pos += size; + } else { + int newpos = t->verdict; + + if (strcmp(t->target.u.name, + IPT_STANDARD_TARGET) == 0 + && newpos >= 0) { + /* This a jump; chase it. */ + duprintf("Jump rule %u -> %u\n", + pos, newpos); + } else { + /* ... this is a fallthru */ + newpos = pos + e->next_offset; + } + e = (struct ipt_entry *) + (newinfo->entries + newpos); + e->counters.pcnt = pos; + pos = newpos; + } + } + next: + duprintf("Finished chain %u\n", hook); + } + return 1; +} + +static inline int +cleanup_match(struct ipt_entry_match *m, unsigned int *i) +{ + if (i && (*i)-- == 0) + return 1; + + if (m->u.match->me) + __MOD_DEC_USE_COUNT(m->u.match->me); + + return 0; +} + +static inline int +standard_check(const struct ipt_entry_target *t, + unsigned int max_offset) +{ + struct ipt_standard_target *targ = (void *)t; + + /* Check standard info. 
*/ + if (t->target_size != sizeof(struct ipt_standard_target)) { + duprintf("standard_check: target size %u != %u\n", + t->target_size, sizeof(struct ipt_standard_target)); + return 0; + } + + if (targ->verdict >= 0 + && targ->verdict > max_offset - sizeof(struct ipt_entry)) { + duprintf("ipt_standard_check: bad verdict (%i)\n", + targ->verdict); + return 0; + } + + if (targ->verdict < -NF_MAX_VERDICT - 1) { + duprintf("ipt_standard_check: bad negative verdict (%i)\n", + targ->verdict); + return 0; + } + return 1; +} + +static inline int +check_match(struct ipt_entry_match *m, + const char *name, + const struct ipt_ip *ip, + unsigned int hookmask, + unsigned int *i) +{ + int ret; + struct ipt_match *match; + + match = find_match_lock(m->u.name, &ret, &ipt_mutex); + if (!match) { + duprintf("check_match: `%s' not found\n", m->u.name); + return ret; + } + if (match->me) + __MOD_INC_USE_COUNT(match->me); + m->u.match = match; + up(&ipt_mutex); + + if (m->u.match->checkentry + && !m->u.match->checkentry(name, ip, m->data, + m->match_size - sizeof(*m), + hookmask)) { + if (m->u.match->me) + __MOD_DEC_USE_COUNT(m->u.match->me); + duprintf("ip_tables: check failed for `%s'.\n", + m->u.match->name); + return -EINVAL; + } + + (*i)++; + return 0; +} + +static struct ipt_target ipt_standard_target; + +static inline int +check_entry(struct ipt_entry *e, const char *name, unsigned int size, + unsigned int *i) +{ + struct ipt_entry_target *t; + struct ipt_target *target; + int ret; + unsigned int j; + + if (!ip_checkentry(&e->ip)) { + duprintf("ip_tables: ip check failed %p %s.\n", e, name); + return -EINVAL; + } + + j = 0; + ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, e->comefrom, &j); + if (ret != 0) + goto cleanup_matches; + + t = ipt_get_target(e); + target = find_target_lock(t->u.name, &ret, &ipt_mutex); + if (!target) { + duprintf("check_entry: `%s' not found\n", t->u.name); + up(&ipt_mutex); + return ret; + } + if (target->me) + __MOD_INC_USE_COUNT(target->me); 
+ t->u.target = target; + up(&ipt_mutex); + + if (t->u.target == &ipt_standard_target) { + if (!standard_check(t, size)) { + ret = -EINVAL; + goto cleanup_matches; + } + } else if (t->u.target->checkentry + && !t->u.target->checkentry(name, e, t->data, + t->target_size - sizeof(*t), + e->comefrom)) { + if (t->u.target->me) + __MOD_DEC_USE_COUNT(t->u.target->me); + duprintf("ip_tables: check failed for `%s'.\n", + t->u.target->name); + ret = -EINVAL; + goto cleanup_matches; + } + + (*i)++; + return 0; + + cleanup_matches: + IPT_MATCH_ITERATE(e, cleanup_match, &j); + return ret; +} + +static inline int +check_entry_size_and_hooks(struct ipt_entry *e, + struct ipt_table_info *newinfo, + unsigned char *base, + unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, + unsigned int *i) +{ + unsigned int h; + + if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 + || (unsigned char *)e + sizeof(struct ipt_entry) >= limit) { + duprintf("Bad offset %p\n", e); + return -EINVAL; + } + + if (e->next_offset + < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) { + duprintf("checking: element %p size %u\n", + e, e->next_offset); + return -EINVAL; + } + + /* Check hooks & underflows */ + for (h = 0; h < NF_IP_NUMHOOKS; h++) { + if ((unsigned char *)e - base == hook_entries[h]) + newinfo->hook_entry[h] = hook_entries[h]; + if ((unsigned char *)e - base == underflows[h]) + newinfo->underflow[h] = underflows[h]; + } + + /* FIXME: underflows must be unconditional, standard verdicts + < 0 (not IPT_RETURN). 
--RR */ + + /* Clear counters and comefrom */ + e->counters = ((struct ipt_counters) { 0, 0 }); + e->comefrom = 0; + + (*i)++; + return 0; +} + +static inline int +cleanup_entry(struct ipt_entry *e, unsigned int *i) +{ + struct ipt_entry_target *t; + + if (i && (*i)-- == 0) + return 1; + + /* Cleanup all matches */ + IPT_MATCH_ITERATE(e, cleanup_match, NULL); + t = ipt_get_target(e); + if (t->u.target->me) + __MOD_DEC_USE_COUNT(t->u.target->me); + + return 0; +} + +/* Checks and translates the user-supplied table segment (held in + newinfo) */ +static int +translate_table(const char *name, + unsigned int valid_hooks, + struct ipt_table_info *newinfo, + unsigned int size, + unsigned int number, + const unsigned int *hook_entries, + const unsigned int *underflows) +{ + unsigned int i; + int ret; + + newinfo->size = size; + newinfo->number = number; + + /* Init all hooks to impossible value. */ + for (i = 0; i < NF_IP_NUMHOOKS; i++) { + newinfo->hook_entry[i] = 0xFFFFFFFF; + newinfo->underflow[i] = 0xFFFFFFFF; + } + + duprintf("translate_table: size %u\n", newinfo->size); + i = 0; + /* Walk through entries, checking offsets. 
*/ + ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + check_entry_size_and_hooks, + newinfo, + newinfo->entries, + newinfo->entries + size, + hook_entries, underflows, &i); + if (ret != 0) + return ret; + + if (i != number) { + duprintf("translate_table: %u not %u entries\n", + i, number); + return -EINVAL; + } + + /* Check hooks all assigned */ + for (i = 0; i < NF_IP_NUMHOOKS; i++) { + /* Only hooks which are valid */ + if (!(valid_hooks & (1 << i))) + continue; + if (newinfo->hook_entry[i] == 0xFFFFFFFF) { + duprintf("Invalid hook entry %u %u\n", + i, hook_entries[i]); + return -EINVAL; + } + if (newinfo->underflow[i] == 0xFFFFFFFF) { + duprintf("Invalid underflow %u %u\n", + i, underflows[i]); + return -EINVAL; + } + } + + if (!mark_source_chains(newinfo, valid_hooks)) + return -ELOOP; + + /* Finally, each sanity check must pass */ + i = 0; + ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + check_entry, name, size, &i); + + if (ret != 0) { + IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + cleanup_entry, &i); + return ret; + } + + /* And one copy for every other CPU */ + for (i = 1; i < smp_num_cpus; i++) { + memcpy(newinfo->entries + SMP_ALIGN(newinfo->size*i), + newinfo->entries, + SMP_ALIGN(newinfo->size)); + } + + return ret; +} + +static struct ipt_table_info * +replace_table(struct ipt_table *table, + unsigned int num_counters, + struct ipt_table_info *newinfo, + int *error) +{ + struct ipt_table_info *oldinfo; + +#ifdef CONFIG_NETFILTER_DEBUG + { + struct ipt_entry *table_base; + unsigned int i; + + for (i = 0; i < smp_num_cpus; i++) { + table_base = + (void *)newinfo->entries + + TABLE_OFFSET(newinfo, i); + + table_base->comefrom = 0xdead57ac; + } + } +#endif + + /* Do the substitution. */ + write_lock_bh(&table->lock); + /* Check inside lock: is the old number correct? 
*/ + if (num_counters != table->private->number) { + duprintf("num_counters != table->private->number (%u/%u)\n", + num_counters, table->private->number); + write_unlock_bh(&table->lock); + *error = -EAGAIN; + return NULL; + } + oldinfo = table->private; + table->private = newinfo; + write_unlock_bh(&table->lock); + + return oldinfo; +} + +/* Gets counters. */ +static inline int +add_entry_to_counter(const struct ipt_entry *e, + struct ipt_counters total[], + unsigned int *i) +{ + ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + +static void +get_counters(const struct ipt_table_info *t, + struct ipt_counters counters[]) +{ + unsigned int cpu; + unsigned int i; + + for (cpu = 0; cpu < smp_num_cpus; cpu++) { + i = 0; + IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + t->size, + add_entry_to_counter, + counters, + &i); + } +} + +static int +copy_entries_to_user(unsigned int total_size, + struct ipt_table *table, + void *userptr) +{ + unsigned int off, num, countersize; + struct ipt_entry *e; + struct ipt_counters *counters; + int ret = 0; + + /* We need atomic snapshot of counters: rest doesn't change + (other than comefrom, which userspace doesn't care + about). */ + countersize = sizeof(struct ipt_counters) * table->private->number; + counters = vmalloc(countersize); + + if (counters == NULL) + return -ENOMEM; + + /* First, sum counters... */ + memset(counters, 0, countersize); + write_lock_bh(&table->lock); + get_counters(table->private, counters); + write_unlock_bh(&table->lock); + + /* ... then copy entire thing from CPU 0... */ + if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + ret = -EFAULT; + goto free_counters; + } + + /* FIXME: use iterator macros --RR */ + /* ... 
then go back and fix counters and names */ + for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ + unsigned int i; + struct ipt_entry_match *m; + struct ipt_entry_target *t; + + e = (struct ipt_entry *)(table->private->entries + off); + if (copy_to_user(userptr + off + + offsetof(struct ipt_entry, counters), + &counters[num], + sizeof(counters[num])) != 0) { + ret = -EFAULT; + goto free_counters; + } + + for (i = sizeof(struct ipt_entry); + i < e->target_offset; + i += m->match_size) { + m = (void *)e + i; + + if (copy_to_user(userptr + off + i + + offsetof(struct ipt_entry_match, + u.name), + m->u.match->name, + strlen(m->u.match->name)+1) != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + t = ipt_get_target(e); + if (copy_to_user(userptr + off + e->target_offset + + offsetof(struct ipt_entry_target, + u.name), + t->u.target->name, + strlen(t->u.target->name)+1) != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + free_counters: + vfree(counters); + return ret; +} + +static int +get_entries(const struct ipt_get_entries *entries, + struct ipt_get_entries *uptr) +{ + int ret; + struct ipt_table *t; + + t = find_table_lock(entries->name, &ret, &ipt_mutex); + if (t) { + duprintf("t->private->number = %u\n", + t->private->number); + if (entries->size == t->private->size) + ret = copy_entries_to_user(t->private->size, + t, uptr->entries); + else { + duprintf("get_entries: I've got %u not %u!\n", + t->private->size, + entries->size); + ret = -EINVAL; + } + up(&ipt_mutex); + } else + duprintf("get_entries: Can't find %s!\n", + entries->name); + + return ret; +} + +static int +do_replace(void *user, unsigned int len) +{ + int ret; + struct ipt_replace tmp; + struct ipt_table *t; + struct ipt_table_info *newinfo, *oldinfo; + struct ipt_counters *counters; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + newinfo = vmalloc(sizeof(struct ipt_table_info) + + SMP_ALIGN(tmp.size) * smp_num_cpus); + if (!newinfo) + 
return -ENOMEM; + + if (copy_from_user(newinfo->entries, user + sizeof(tmp), + tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + counters = vmalloc(tmp.num_counters * sizeof(struct ipt_counters)); + if (!counters) { + ret = -ENOMEM; + goto free_newinfo; + } + memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters)); + + ret = translate_table(tmp.name, tmp.valid_hooks, + newinfo, tmp.size, tmp.num_entries, + tmp.hook_entry, tmp.underflow); + if (ret != 0) + goto free_newinfo_counters; + + duprintf("ip_tables: Translated table\n"); + + t = find_table_lock(tmp.name, &ret, &ipt_mutex); + if (!t) + goto free_newinfo_counters_untrans; + + /* You lied! */ + if (tmp.valid_hooks != t->valid_hooks) { + duprintf("Valid hook crap: %08X vs %08X\n", + tmp.valid_hooks, t->valid_hooks); + ret = -EINVAL; + goto free_newinfo_counters_untrans_unlock; + } + + oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret); + if (!oldinfo) + goto free_newinfo_counters_untrans_unlock; + + /* Get the old counters. */ + get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ + IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); + vfree(oldinfo); + /* Silent error: too late now. */ + copy_to_user(tmp.counters, counters, + sizeof(struct ipt_counters) * tmp.num_counters); + + up(&ipt_mutex); + return 0; + + free_newinfo_counters_untrans_unlock: + up(&ipt_mutex); + free_newinfo_counters_untrans: + IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); + free_newinfo_counters: + vfree(counters); + free_newinfo: + vfree(newinfo); + return ret; +} + +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. 
*/ +static inline int +add_counter_to_entry(struct ipt_entry *e, + const struct ipt_counters addme[], + unsigned int *i) +{ +#if 0 + duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n", + *i, + (long unsigned int)e->counters.pcnt, + (long unsigned int)e->counters.bcnt, + (long unsigned int)addme[*i].pcnt, + (long unsigned int)addme[*i].bcnt); +#endif + + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + +static int +do_add_counters(void *user, unsigned int len) +{ + unsigned int i; + struct ipt_counters_info tmp, *paddc; + struct ipt_table *t; + int ret; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters)) + return -EINVAL; + + paddc = vmalloc(len); + if (!paddc) + return -ENOMEM; + + if (copy_from_user(paddc, user, len) != 0) { + ret = -EFAULT; + goto free; + } + + t = find_table_lock(tmp.name, &ret, &ipt_mutex); + if (!t) + goto free; + + write_lock_bh(&t->lock); + if (t->private->number != paddc->num_counters) { + ret = -EINVAL; + goto unlock_up_free; + } + + i = 0; + IPT_ENTRY_ITERATE(t->private->entries, + t->private->size, + add_counter_to_entry, + paddc->counters, + &i); + unlock_up_free: + write_unlock_bh(&t->lock); + up(&ipt_mutex); + free: + vfree(paddc); + + return ret; +} + +static int +do_ipt_set_ctl(struct sock *sk, int cmd, void *user, unsigned int len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IPT_SO_SET_REPLACE: + ret = do_replace(user, len); + break; + + case IPT_SO_SET_ADD_COUNTERS: + ret = do_add_counters(user, len); + break; + + default: + duprintf("do_ipt_set_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +static int +do_ipt_get_ctl(struct sock *sk, int cmd, void *user, int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IPT_SO_GET_INFO: { + char name[IPT_TABLE_MAXNAMELEN]; + struct 
ipt_table *t; + + if (*len != sizeof(struct ipt_getinfo)) { + duprintf("length %u != %u\n", *len, + sizeof(struct ipt_getinfo)); + ret = -EINVAL; + break; + } + + if (copy_from_user(name, user, sizeof(name)) != 0) { + ret = -EFAULT; + break; + } + t = find_table_lock(name, &ret, &ipt_mutex); + if (t) { + struct ipt_getinfo info; + + info.valid_hooks = t->valid_hooks; + memcpy(info.hook_entry, t->private->hook_entry, + sizeof(info.hook_entry)); + memcpy(info.underflow, t->private->underflow, + sizeof(info.underflow)); + info.num_entries = t->private->number; + info.size = t->private->size; + strcpy(info.name, name); + + if (copy_to_user(user, &info, *len) != 0) + ret = -EFAULT; + else + ret = 0; + + up(&ipt_mutex); + } + } + break; + + case IPT_SO_GET_ENTRIES: { + struct ipt_get_entries get; + + if (*len < sizeof(get)) { + duprintf("get_entries: %u < %u\n", *len, sizeof(get)); + ret = -EINVAL; + } else if (copy_from_user(&get, user, sizeof(get)) != 0) { + ret = -EFAULT; + } else if (*len != sizeof(struct ipt_get_entries) + get.size) { + duprintf("get_entries: %u != %u\n", *len, + sizeof(struct ipt_get_entries) + get.size); + ret = -EINVAL; + } else + ret = get_entries(&get, user); + break; + } + + default: + duprintf("do_ipt_get_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +/* Registration hooks for targets. 
*/ +int +ipt_register_target(struct ipt_target *target) +{ + int ret; + + ret = down_interruptible(&ipt_mutex); + if (ret != 0) + return ret; + + if (list_named_insert(&ipt_target, target)) { + MOD_INC_USE_COUNT; + ret = 0; + } else { + duprintf("ipt_register_target: `%s' already in list!\n", + target->name); + ret = -EINVAL; + } + up(&ipt_mutex); + return ret; +} + +void +ipt_unregister_target(struct ipt_target *target) +{ + down(&ipt_mutex); + LIST_DELETE(&ipt_target, target); + up(&ipt_mutex); + MOD_DEC_USE_COUNT; +} + +int +ipt_register_match(struct ipt_match *match) +{ + int ret; + + ret = down_interruptible(&ipt_mutex); + if (ret != 0) + return ret; + + if (list_named_insert(&ipt_match, match)) { + MOD_INC_USE_COUNT; + ret = 0; + } else { + duprintf("ipt_register_match: `%s' already in list!\n", + match->name); + ret = -EINVAL; + } + up(&ipt_mutex); + + return ret; +} + +void +ipt_unregister_match(struct ipt_match *match) +{ + down(&ipt_mutex); + LIST_DELETE(&ipt_match, match); + up(&ipt_mutex); + MOD_DEC_USE_COUNT; +} + +int ipt_register_table(struct ipt_table *table) +{ + int ret; + struct ipt_table_info *newinfo; + static struct ipt_table_info bootstrap + = { 0, 0, { 0 }, { 0 }, { }, { } }; + + newinfo = vmalloc(sizeof(struct ipt_table_info) + + SMP_ALIGN(table->table->size) * smp_num_cpus); + if (!newinfo) { + ret = -ENOMEM; + return ret; + } + memcpy(newinfo->entries, table->table->entries, table->table->size); + + ret = translate_table(table->name, table->valid_hooks, + newinfo, table->table->size, + table->table->num_entries, + table->table->hook_entry, + table->table->underflow); + if (ret != 0) { + vfree(newinfo); + return ret; + } + + ret = down_interruptible(&ipt_mutex); + if (ret != 0) { + vfree(newinfo); + return ret; + } + + /* Don't autoload: we'd eat our tail... */ + if (list_named_find(&ipt_tables, table->name)) { + ret = -EEXIST; + goto free_unlock; + } + + /* Simplifies replace_table code. 
*/ + table->private = &bootstrap; + if (!replace_table(table, 0, newinfo, &ret)) + goto free_unlock; + + duprintf("table->private->number = %u\n", + table->private->number); + + table->lock = RW_LOCK_UNLOCKED; + list_prepend(&ipt_tables, table); + MOD_INC_USE_COUNT; + + unlock: + up(&ipt_mutex); + return ret; + + free_unlock: + vfree(newinfo); + goto unlock; +} + +void ipt_unregister_table(struct ipt_table *table) +{ + down(&ipt_mutex); + LIST_DELETE(&ipt_tables, table); + up(&ipt_mutex); + + /* Decrease module usage counts and free resources */ + IPT_ENTRY_ITERATE(table->private->entries, table->private->size, + cleanup_entry, NULL); + vfree(table->private); + MOD_DEC_USE_COUNT; +} + +/* Returns 1 if the port is matched by the range, 0 otherwise */ +static inline int +port_match(u_int16_t min, u_int16_t max, u_int16_t port, int invert) +{ + int ret; + + ret = (port >= min && port <= max) ^ invert; + return ret; +} + +static int +tcp_find_option(u_int8_t option, + const struct tcphdr *tcp, + u_int16_t datalen, + int invert, + int *hotdrop) +{ + unsigned int i = sizeof(struct tcphdr); + const u_int8_t *opt = (u_int8_t *)tcp; + + duprintf("tcp_match: finding option\n"); + /* If we don't have the whole header, drop packet. */ + if (tcp->doff * 4 > datalen) { + *hotdrop = 1; + return 0; + } + + while (i < tcp->doff * 4) { + if (opt[i] == option) return !invert; + if (opt[i] < 2) i++; + else i += opt[i+1]?:1; + } + + return invert; +} + +static int +tcp_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop) +{ + const struct tcphdr *tcp = hdr; + const struct ipt_tcp *tcpinfo = matchinfo; + + /* To quote Alan: + + Don't allow a fragment of TCP 8 bytes in. Nobody normal + causes this. Its a cracker trying to break in by doing a + flag overwrite to pass the direction checks. 
+ */ + + if (offset == 1) { + duprintf("Dropping evil TCP offset=1 frag.\n"); + *hotdrop = 1; + return 0; + } else if (offset == 0 && datalen < sizeof(struct tcphdr)) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + duprintf("Dropping evil TCP offset=0 tinygram.\n"); + *hotdrop = 1; + return 0; + } + + /* FIXME: Try tcp doff >> packet len against various stacks --RR */ + +#define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg)) + + /* Must not be a fragment. */ + return !offset + && port_match(tcpinfo->spts[0], tcpinfo->spts[1], + ntohs(tcp->source), + !!(tcpinfo->invflags & IPT_TCP_INV_SRCPT)) + && port_match(tcpinfo->dpts[0], tcpinfo->dpts[1], + ntohs(tcp->dest), + !!(tcpinfo->invflags & IPT_TCP_INV_DSTPT)) + && FWINVTCP((((unsigned char *)tcp)[13] + & tcpinfo->flg_mask) + == tcpinfo->flg_cmp, + IPT_TCP_INV_FLAGS) + && (!tcpinfo->option + || tcp_find_option(tcpinfo->option, tcp, datalen, + tcpinfo->invflags + & IPT_TCP_INV_OPTION, + hotdrop)); +} + +/* Called when user tries to insert an entry of this type. */ +static int +tcp_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_tcp *tcpinfo = matchinfo; + + /* Must specify proto == TCP, and no unknown invflags */ + return ip->proto == IPPROTO_TCP + && !(ip->invflags & IPT_INV_PROTO) + && matchsize == IPT_ALIGN(sizeof(struct ipt_tcp)) + && !(tcpinfo->invflags & ~IPT_TCP_INV_MASK); +} + +static int +udp_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop) +{ + const struct udphdr *udp = hdr; + const struct ipt_udp *udpinfo = matchinfo; + + if (offset == 0 && datalen < sizeof(struct udphdr)) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. 
*/ + duprintf("Dropping evil UDP tinygram.\n"); + *hotdrop = 1; + return 0; + } + + /* Must not be a fragment. */ + return !offset + && port_match(udpinfo->spts[0], udpinfo->spts[1], + ntohs(udp->source), + !!(udpinfo->invflags & IPT_UDP_INV_SRCPT)) + && port_match(udpinfo->dpts[0], udpinfo->dpts[1], + ntohs(udp->dest), + !!(udpinfo->invflags & IPT_UDP_INV_DSTPT)); +} + +/* Called when user tries to insert an entry of this type. */ +static int +udp_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ipt_udp *udpinfo = matchinfo; + + /* Must specify proto == UDP, and no unknown invflags */ + if (ip->proto != IPPROTO_UDP || (ip->invflags & IPT_INV_PROTO)) { + duprintf("ipt_udp: Protocol %u != %u\n", ip->proto, + IPPROTO_UDP); + return 0; + } + if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_udp))) { + duprintf("ipt_udp: matchsize %u != %u\n", + matchinfosize, IPT_ALIGN(sizeof(struct ipt_udp))); + return 0; + } + if (udpinfo->invflags & ~IPT_UDP_INV_MASK) { + duprintf("ipt_udp: unknown flags %X\n", + udpinfo->invflags); + return 0; + } + + return 1; +} + +/* Returns 1 if the type and code is matched by the range, 0 otherwise */ +static inline int +icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, + u_int8_t type, u_int8_t code, + int invert) +{ + return (type == test_type && code >= min_code && code <= max_code) + ^ invert; +} + +static int +icmp_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop) +{ + const struct icmphdr *icmp = hdr; + const struct ipt_icmp *icmpinfo = matchinfo; + + if (offset == 0 && datalen < 2) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. 
*/ + duprintf("Dropping evil ICMP tinygram.\n"); + *hotdrop = 1; + return 0; + } + + /* Must not be a fragment. */ + return !offset + && icmp_type_code_match(icmpinfo->type, + icmpinfo->code[0], + icmpinfo->code[1], + icmp->type, icmp->code, + !!(icmpinfo->invflags&IPT_ICMP_INV)); +} + +/* Called when user tries to insert an entry of this type. */ +static int +icmp_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_icmp *icmpinfo = matchinfo; + + /* Must specify proto == ICMP, and no unknown invflags */ + return ip->proto == IPPROTO_ICMP + && !(ip->invflags & IPT_INV_PROTO) + && matchsize == IPT_ALIGN(sizeof(struct ipt_icmp)) + && !(icmpinfo->invflags & ~IPT_ICMP_INV); +} + +/* The built-in targets: standard (NULL) and error. */ +static struct ipt_target ipt_standard_target += { { NULL, NULL }, IPT_STANDARD_TARGET, NULL, NULL, NULL }; +static struct ipt_target ipt_error_target += { { NULL, NULL }, IPT_ERROR_TARGET, ipt_error, NULL, NULL }; + +static struct nf_sockopt_ops ipt_sockopts += { { NULL, NULL }, PF_INET, IPT_BASE_CTL, IPT_SO_SET_MAX+1, do_ipt_set_ctl, + IPT_BASE_CTL, IPT_SO_GET_MAX+1, do_ipt_get_ctl, 0, NULL }; + +static struct ipt_match tcp_matchstruct += { { NULL, NULL }, "tcp", &tcp_match, &tcp_checkentry, NULL }; +static struct ipt_match udp_matchstruct += { { NULL, NULL }, "udp", &udp_match, &udp_checkentry, NULL }; +static struct ipt_match icmp_matchstruct += { { NULL, NULL }, "icmp", &icmp_match, &icmp_checkentry, NULL }; + +static int __init init(void) +{ + int ret; + + /* Noone else will be downing sem now, so we won't sleep */ + down(&ipt_mutex); + list_append(&ipt_target, &ipt_standard_target); + list_append(&ipt_target, &ipt_error_target); + list_append(&ipt_match, &tcp_matchstruct); + list_append(&ipt_match, &udp_matchstruct); + list_append(&ipt_match, &icmp_matchstruct); + up(&ipt_mutex); + + /* Register setsockopt */ + ret = 
nf_register_sockopt(&ipt_sockopts); + if (ret < 0) { + duprintf("Unable to register sockopts.\n"); + return ret; + } + + printk("iptables: (c)2000 Netfilter core team\n"); + return 0; +} + +static void __exit fini(void) +{ + nf_unregister_sockopt(&ipt_sockopts); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipchains_core.c b/net/ipv4/netfilter/ipchains_core.c new file mode 100644 index 000000000..02bd7ad83 --- /dev/null +++ b/net/ipv4/netfilter/ipchains_core.c @@ -0,0 +1,1768 @@ +/* Minor modifications to fit on compatibility framework: + Rusty.Russell@rustcorp.com.au +*/ + +/* + * This code is heavily based on the code on the old ip_fw.c code; see below for + * copyrights and attributions of the old code. This code is basically GPL. + * + * 15-Aug-1997: Major changes to allow graphs for firewall rules. + * Paul Russell <Paul.Russell@rustcorp.com.au> and + * Michael Neuling <Michael.Neuling@rustcorp.com.au> + * 24-Aug-1997: Generalised protocol handling (not just TCP/UDP/ICMP). + * Added explicit RETURN from chains. + * Removed TOS mangling (done in ipchains 1.0.1). + * Fixed read & reset bug by reworking proc handling. + * Paul Russell <Paul.Russell@rustcorp.com.au> + * 28-Sep-1997: Added packet marking for net sched code. + * Removed fw_via comparisons: all done on device name now, + * similar to changes in ip_fw.c in DaveM's CVS970924 tree. + * Paul Russell <Paul.Russell@rustcorp.com.au> + * 2-Nov-1997: Moved types across to __u16, etc. + * Added inverse flags. + * Fixed fragment bug (in args to port_match). + * Changed mark to only one flag (MARKABS). + * 21-Nov-1997: Added ability to test ICMP code. + * 19-Jan-1998: Added wildcard interfaces. + * 6-Feb-1998: Merged 2.0 and 2.1 versions. + * Initialised ip_masq for 2.0.x version. + * Added explicit NETLINK option for 2.1.x version. + * Added packet and byte counters for policy matches. + * 26-Feb-1998: Fixed race conditions, added SMP support. 
+ * 18-Mar-1998: Fix SMP, fix race condition fix. + * 1-May-1998: Remove caching of device pointer. + * 12-May-1998: Allow tiny fragment case for TCP/UDP. + * 15-May-1998: Treat short packets as fragments, don't just block. + * 3-Jan-1999: Fixed serious procfs security hole -- users should never + * be allowed to view the chains! + * Marc Santoro <ultima@snicker.emoti.com> + * 29-Jan-1999: Locally generated bogus IPs dealt with, rather than crash + * during dump_packet. --RR. + * 19-May-1999: Star Wars: The Phantom Menace opened. Rule num + * printed in log (modified from Michael Hasenstein's patch). + * Added SYN in log message. --RR + * 23-Jul-1999: Fixed small fragment security exposure opened on 15-May-1998. + * John McDonald <jm@dataprotect.com> + * Thomas Lopatic <tl@dataprotect.com> + */ + +/* + * + * The origina Linux port was done Alan Cox, with changes/fixes from + * Pauline Middlelink, Jos Vos, Thomas Quinot, Wouter Gadeyne, Juan + * Jose Ciarlante, Bernd Eckenfels, Keith Owens and others. + * + * Copyright from the original FreeBSD version follows: + * + * Copyright (c) 1993 Daniel Boulet + * Copyright (c) 1994 Ugen J.S.Antsilevich + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. 
*/ + +#include <linux/config.h> + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/errno.h> + +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/icmp.h> +#include <linux/udp.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/route.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/sock.h> +#include <net/icmp.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4/compat_firewall.h> +#include <linux/netfilter_ipv4/ipchains_core.h> + +#include <net/checksum.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> + +/* Understanding locking in this code: (thanks to Alan Cox for using + * little words to explain this to me). -- PR + * + * In UP, there can be two packets traversing the chains: + * 1) A packet from the current userspace context + * 2) A packet off the bh handlers (timer or net). + * + * For SMP (kernel v2.1+), multiply this by # CPUs. + * + * [Note that this in not correct for 2.2 - because the socket code always + * uses lock_kernel() to serialize, and bottom halves (timers and net_bhs) + * only run on one CPU at a time. This will probably change for 2.3. + * It is still good to use spinlocks because that avoids the global cli() + * for updating the tables, which is rather costly in SMP kernels -AK] + * + * This means counters and backchains can get corrupted if no precautions + * are taken. + * + * To actually alter a chain on UP, we need only do a cli(), as this will + * stop a bh handler firing, as we are in the current userspace context + * (coming from a setsockopt()). + * + * On SMP, we need a write_lock_irqsave(), which is a simple cli() in + * UP. 
+ * + * For backchains and counters, we use an array, indexed by + * [cpu_number_map[smp_processor_id()]*2 + !in_interrupt()]; the array is of + * size [smp_num_cpus*2]. For v2.0, smp_num_cpus is effectively 1. So, + * confident of uniqueness, we modify counters even though we only + * have a read lock (to read the counters, you need a write lock, + * though). */ + +/* Why I didn't use straight locking... -- PR + * + * The backchains can be separated out of the ip_chains structure, and + * allocated as needed inside ip_fw_check(). + * + * The counters, however, can't. Trying to lock these means blocking + * interrupts every time we want to access them. This would suck HARD + * performance-wise. Not locking them leads to possible corruption, + * made worse on 32-bit machines (counters are 64-bit). */ + +/*#define DEBUG_IP_FIREWALL*/ +/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */ +/*#define DEBUG_IP_FIREWALL_USER*/ +/*#define DEBUG_IP_FIREWALL_LOCKING*/ + +static struct sock *ipfwsk; + +#ifdef CONFIG_SMP +#define SLOT_NUMBER() (cpu_number_map(smp_processor_id())*2 + !in_interrupt()) +#else /* !SMP */ +#define SLOT_NUMBER() (!in_interrupt()) +#endif /* CONFIG_SMP */ +#define NUM_SLOTS (smp_num_cpus*2) + +#define SIZEOF_STRUCT_IP_CHAIN (sizeof(struct ip_chain) \ + + NUM_SLOTS*sizeof(struct ip_reent)) +#define SIZEOF_STRUCT_IP_FW_KERNEL (sizeof(struct ip_fwkernel) \ + + NUM_SLOTS*sizeof(struct ip_counters)) + +#ifdef DEBUG_IP_FIREWALL_LOCKING +static unsigned int fwc_rlocks, fwc_wlocks; +#define FWC_DEBUG_LOCK(d) \ +do { \ + FWC_DONT_HAVE_LOCK(d); \ + d |= (1 << SLOT_NUMBER()); \ +} while (0) + +#define FWC_DEBUG_UNLOCK(d) \ +do { \ + FWC_HAVE_LOCK(d); \ + d &= ~(1 << SLOT_NUMBER()); \ +} while (0) + +#define FWC_DONT_HAVE_LOCK(d) \ +do { \ + if ((d) & (1 << SLOT_NUMBER())) \ + printk("%s:%i: Got lock on %i already!\n", \ + __FILE__, __LINE__, SLOT_NUMBER()); \ +} while(0) + +#define FWC_HAVE_LOCK(d) \ +do { \ + if (!((d) & (1 << SLOT_NUMBER()))) \ + 
printk("%s:%i:No lock on %i!\n", \ + __FILE__, __LINE__, SLOT_NUMBER()); \ +} while (0) + +#else +#define FWC_DEBUG_LOCK(d) do { } while(0) +#define FWC_DEBUG_UNLOCK(d) do { } while(0) +#define FWC_DONT_HAVE_LOCK(d) do { } while(0) +#define FWC_HAVE_LOCK(d) do { } while(0) +#endif /*DEBUG_IP_FIRWALL_LOCKING*/ + +#define FWC_READ_LOCK(l) do { FWC_DEBUG_LOCK(fwc_rlocks); read_lock(l); } while (0) +#define FWC_WRITE_LOCK(l) do { FWC_DEBUG_LOCK(fwc_wlocks); write_lock(l); } while (0) +#define FWC_READ_LOCK_IRQ(l,f) do { FWC_DEBUG_LOCK(fwc_rlocks); read_lock_irqsave(l,f); } while (0) +#define FWC_WRITE_LOCK_IRQ(l,f) do { FWC_DEBUG_LOCK(fwc_wlocks); write_lock_irqsave(l,f); } while (0) +#define FWC_READ_UNLOCK(l) do { FWC_DEBUG_UNLOCK(fwc_rlocks); read_unlock(l); } while (0) +#define FWC_WRITE_UNLOCK(l) do { FWC_DEBUG_UNLOCK(fwc_wlocks); write_unlock(l); } while (0) +#define FWC_READ_UNLOCK_IRQ(l,f) do { FWC_DEBUG_UNLOCK(fwc_rlocks); read_unlock_irqrestore(l,f); } while (0) +#define FWC_WRITE_UNLOCK_IRQ(l,f) do { FWC_DEBUG_UNLOCK(fwc_wlocks); write_unlock_irqrestore(l,f); } while (0) + +struct ip_chain; + +struct ip_counters +{ + __u64 pcnt, bcnt; /* Packet and byte counters */ +}; + +struct ip_fwkernel +{ + struct ip_fw ipfw; + struct ip_fwkernel *next; /* where to go next if current + * rule doesn't match */ + struct ip_chain *branch; /* which branch to jump to if + * current rule matches */ + int simplebranch; /* Use this if branch == NULL */ + struct ip_counters counters[0]; /* Actually several of these */ +}; + +struct ip_reent +{ + struct ip_chain *prevchain; /* Pointer to referencing chain */ + struct ip_fwkernel *prevrule; /* Pointer to referencing rule */ + struct ip_counters counters; +}; + +struct ip_chain +{ + ip_chainlabel label; /* Defines the label for each block */ + struct ip_chain *next; /* Pointer to next block */ + struct ip_fwkernel *chain; /* Pointer to first rule in block */ + __u32 refcount; /* Number of refernces to block */ + int policy; /* 
Default rule for chain. Only * + * used in built in chains */ + struct ip_reent reent[0]; /* Actually several of these */ +}; + +/* + * Implement IP packet firewall + */ + +#ifdef DEBUG_IP_FIREWALL +#define dprintf(format, args...) printk(format , ## args) +#else +#define dprintf(format, args...) +#endif + +#ifdef DEBUG_IP_FIREWALL_USER +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) +#endif + +/* Lock around ip_fw_chains linked list structure */ +rwlock_t ip_fw_lock = RW_LOCK_UNLOCKED; + +/* Head of linked list of fw rules */ +static struct ip_chain *ip_fw_chains; + +#define IP_FW_INPUT_CHAIN ip_fw_chains +#define IP_FW_FORWARD_CHAIN (ip_fw_chains->next) +#define IP_FW_OUTPUT_CHAIN (ip_fw_chains->next->next) + +/* Returns 1 if the port is matched by the range, 0 otherwise */ +extern inline int port_match(__u16 min, __u16 max, __u16 port, + int frag, int invert) +{ + if (frag) /* Fragments fail ANY port test. */ + return (min == 0 && max == 0xFFFF); + else return (port >= min && port <= max) ^ invert; +} + +/* Returns whether matches rule or not. */ +static int ip_rule_match(struct ip_fwkernel *f, + const char *ifname, + struct iphdr *ip, + char tcpsyn, + __u16 src_port, __u16 dst_port, + char isfrag) +{ +#define FWINV(bool,invflg) ((bool) ^ !!(f->ipfw.fw_invflg & invflg)) + /* + * This is a bit simpler as we don't have to walk + * an interface chain as you do in BSD - same logic + * however. + */ + + if (FWINV((ip->saddr&f->ipfw.fw_smsk.s_addr) != f->ipfw.fw_src.s_addr, + IP_FW_INV_SRCIP) + || FWINV((ip->daddr&f->ipfw.fw_dmsk.s_addr)!=f->ipfw.fw_dst.s_addr, + IP_FW_INV_DSTIP)) { + dprintf("Source or dest mismatch.\n"); + + dprintf("SRC: %u. Mask: %u. Target: %u.%s\n", ip->saddr, + f->ipfw.fw_smsk.s_addr, f->ipfw.fw_src.s_addr, + f->ipfw.fw_invflg & IP_FW_INV_SRCIP ? " (INV)" : ""); + dprintf("DST: %u. Mask: %u. 
Target: %u.%s\n", ip->daddr, + f->ipfw.fw_dmsk.s_addr, f->ipfw.fw_dst.s_addr, + f->ipfw.fw_invflg & IP_FW_INV_DSTIP ? " (INV)" : ""); + return 0; + } + + /* + * Look for a VIA device match + */ + if (f->ipfw.fw_flg & IP_FW_F_WILDIF) { + if (FWINV(strncmp(ifname, f->ipfw.fw_vianame, + strlen(f->ipfw.fw_vianame)) != 0, + IP_FW_INV_VIA)) { + dprintf("Wildcard interface mismatch.%s\n", + f->ipfw.fw_invflg & IP_FW_INV_VIA ? " (INV)" : ""); + return 0; /* Mismatch */ + } + } + else if (FWINV(strcmp(ifname, f->ipfw.fw_vianame) != 0, + IP_FW_INV_VIA)) { + dprintf("Interface name does not match.%s\n", + f->ipfw.fw_invflg & IP_FW_INV_VIA + ? " (INV)" : ""); + return 0; /* Mismatch */ + } + + /* + * Ok the chain addresses match. + */ + + /* If we have a fragment rule but the packet is not a fragment + * the we return zero */ + if (FWINV((f->ipfw.fw_flg&IP_FW_F_FRAG) && !isfrag, IP_FW_INV_FRAG)) { + dprintf("Fragment rule but not fragment.%s\n", + f->ipfw.fw_invflg & IP_FW_INV_FRAG ? " (INV)" : ""); + return 0; + } + + /* Fragment NEVER passes a SYN test, even an inverted one. */ + if (FWINV((f->ipfw.fw_flg&IP_FW_F_TCPSYN) && !tcpsyn, IP_FW_INV_SYN) + || (isfrag && (f->ipfw.fw_flg&IP_FW_F_TCPSYN))) { + dprintf("Rule requires SYN and packet has no SYN.%s\n", + f->ipfw.fw_invflg & IP_FW_INV_SYN ? " (INV)" : ""); + return 0; + } + + if (f->ipfw.fw_proto) { + /* + * Specific firewall - packet's protocol + * must match firewall's. + */ + + if (FWINV(ip->protocol!=f->ipfw.fw_proto, IP_FW_INV_PROTO)) { + dprintf("Packet protocol %hi does not match %hi.%s\n", + ip->protocol, f->ipfw.fw_proto, + f->ipfw.fw_invflg&IP_FW_INV_PROTO ? " (INV)":""); + return 0; + } + + /* For non TCP/UDP/ICMP, port range is max anyway. 
*/ + if (!port_match(f->ipfw.fw_spts[0], + f->ipfw.fw_spts[1], + src_port, isfrag, + !!(f->ipfw.fw_invflg&IP_FW_INV_SRCPT)) + || !port_match(f->ipfw.fw_dpts[0], + f->ipfw.fw_dpts[1], + dst_port, isfrag, + !!(f->ipfw.fw_invflg + &IP_FW_INV_DSTPT))) { + dprintf("Port match failed.\n"); + return 0; + } + } + + dprintf("Match succeeded.\n"); + return 1; +} + +static const char *branchname(struct ip_chain *branch,int simplebranch) +{ + if (branch) + return branch->label; + switch (simplebranch) + { + case FW_BLOCK: return IP_FW_LABEL_BLOCK; + case FW_ACCEPT: return IP_FW_LABEL_ACCEPT; + case FW_REJECT: return IP_FW_LABEL_REJECT; + case FW_REDIRECT: return IP_FW_LABEL_REDIRECT; + case FW_MASQUERADE: return IP_FW_LABEL_MASQUERADE; + case FW_SKIP: return "-"; + case FW_SKIP+1: return IP_FW_LABEL_RETURN; + default: + return "UNKNOWN"; + } +} + +/* + * VERY ugly piece of code which actually + * makes kernel printf for matching packets... + */ +static void dump_packet(const struct iphdr *ip, + const char *ifname, + struct ip_fwkernel *f, + const ip_chainlabel chainlabel, + __u16 src_port, + __u16 dst_port, + unsigned int count, + int syn) +{ + __u32 *opt = (__u32 *) (ip + 1); + int opti; + + if (f) { + printk(KERN_INFO "Packet log: %s ",chainlabel); + printk("%s ",branchname(f->branch,f->simplebranch)); + if (f->simplebranch==FW_REDIRECT) + printk("%d ",f->ipfw.fw_redirpt); + } + + printk("%s PROTO=%d %d.%d.%d.%d:%hu %d.%d.%d.%d:%hu" + " L=%hu S=0x%2.2hX I=%hu F=0x%4.4hX T=%hu", + ifname, ip->protocol, + (ntohl(ip->saddr)>>24)&0xFF, + (ntohl(ip->saddr)>>16)&0xFF, + (ntohl(ip->saddr)>>8)&0xFF, + (ntohl(ip->saddr))&0xFF, + src_port, + (ntohl(ip->daddr)>>24)&0xFF, + (ntohl(ip->daddr)>>16)&0xFF, + (ntohl(ip->daddr)>>8)&0xFF, + (ntohl(ip->daddr))&0xFF, + dst_port, + ntohs(ip->tot_len), ip->tos, ntohs(ip->id), + ntohs(ip->frag_off), ip->ttl); + + for (opti = 0; opti < (ip->ihl - sizeof(struct iphdr) / 4); opti++) + printk(" O=0x%8.8X", *opt++); + printk(" %s(#%d)\n", syn ? 
"SYN " : /* "PENANCE" */ "", count); +} + +/* function for checking chain labels for user space. */ +static int check_label(ip_chainlabel label) +{ + unsigned int i; + /* strlen must be < IP_FW_MAX_LABEL_LENGTH. */ + for (i = 0; i < IP_FW_MAX_LABEL_LENGTH + 1; i++) + if (label[i] == '\0') return 1; + + return 0; +} + +/* This function returns a pointer to the first chain with a label + * that matches the one given. */ +static struct ip_chain *find_label(ip_chainlabel label) +{ + struct ip_chain *tmp; + FWC_HAVE_LOCK(fwc_rlocks | fwc_wlocks); + for (tmp = ip_fw_chains; tmp; tmp = tmp->next) + if (strcmp(tmp->label,label) == 0) + break; + return tmp; +} + +/* This function returns a boolean which when true sets answer to one + of the FW_*. */ +static int find_special(ip_chainlabel label, int *answer) +{ + if (label[0] == '\0') { + *answer = FW_SKIP; /* => pass-through rule */ + return 1; + } else if (strcmp(label,IP_FW_LABEL_ACCEPT) == 0) { + *answer = FW_ACCEPT; + return 1; + } else if (strcmp(label,IP_FW_LABEL_BLOCK) == 0) { + *answer = FW_BLOCK; + return 1; + } else if (strcmp(label,IP_FW_LABEL_REJECT) == 0) { + *answer = FW_REJECT; + return 1; + } else if (strcmp(label,IP_FW_LABEL_REDIRECT) == 0) { + *answer = FW_REDIRECT; + return 1; + } else if (strcmp(label,IP_FW_LABEL_MASQUERADE) == 0) { + *answer = FW_MASQUERADE; + return 1; + } else if (strcmp(label, IP_FW_LABEL_RETURN) == 0) { + *answer = FW_SKIP+1; + return 1; + } else { + return 0; + } +} + +/* This function cleans up the prevchain and prevrule. If the verbose + * flag is set then he names of the chains will be printed as it + * cleans up. 
*/ +static void cleanup(struct ip_chain *chain, + const int verbose, + unsigned int slot) +{ + struct ip_chain *tmpchain = chain->reent[slot].prevchain; + if (verbose) + printk(KERN_ERR "Chain backtrace: "); + while (tmpchain) { + if (verbose) + printk("%s<-",chain->label); + chain->reent[slot].prevchain = NULL; + chain = tmpchain; + tmpchain = chain->reent[slot].prevchain; + } + if (verbose) + printk("%s\n",chain->label); +} + +static inline int +ip_fw_domatch(struct ip_fwkernel *f, + struct iphdr *ip, + const char *rif, + const ip_chainlabel label, + struct sk_buff *skb, + unsigned int slot, + __u16 src_port, __u16 dst_port, + unsigned int count, + int tcpsyn) +{ + f->counters[slot].bcnt+=ntohs(ip->tot_len); + f->counters[slot].pcnt++; + if (f->ipfw.fw_flg & IP_FW_F_PRN) { + dump_packet(ip,rif,f,label,src_port,dst_port,count,tcpsyn); + } + ip->tos = (ip->tos & f->ipfw.fw_tosand) ^ f->ipfw.fw_tosxor; + +/* This functionality is useless in stock 2.0.x series, but we don't + * discard the mark thing altogether, to avoid breaking ipchains (and, + * more importantly, the ipfwadm wrapper) --PR */ + if (f->ipfw.fw_flg & IP_FW_F_MARKABS) { + skb->nfmark = f->ipfw.fw_mark; + } else { + skb->nfmark += f->ipfw.fw_mark; + } + if (f->ipfw.fw_flg & IP_FW_F_NETLINK) { +#if defined(CONFIG_NETLINK_DEV) || defined(CONFIG_NETLINK_DEV_MODULE) + size_t len = min(f->ipfw.fw_outputsize, ntohs(ip->tot_len)) + + sizeof(__u32) + sizeof(skb->nfmark) + IFNAMSIZ; + struct sk_buff *outskb=alloc_skb(len, GFP_ATOMIC); + + duprintf("Sending packet out NETLINK (length = %u).\n", + (unsigned int)len); + if (outskb) { + /* Prepend length, mark & interface */ + skb_put(outskb, len); + *((__u32 *)outskb->data) = (__u32)len; + *((__u32 *)(outskb->data+sizeof(__u32))) = skb->nfmark; + strcpy(outskb->data+sizeof(__u32)*2, rif); + memcpy(outskb->data+sizeof(__u32)*2+IFNAMSIZ, ip, + len-(sizeof(__u32)*2+IFNAMSIZ)); + netlink_broadcast(ipfwsk, outskb, 0, ~0, GFP_KERNEL); + } + else { +#endif + if 
(net_ratelimit()) + printk(KERN_WARNING "ip_fw: packet drop due to " + "netlink failure\n"); + return 0; +#if defined(CONFIG_NETLINK_DEV) || defined(CONFIG_NETLINK_DEV_MODULE) + } +#endif + } + return 1; +} + +/* + * Returns one of the generic firewall policies, like FW_ACCEPT. + * + * The testing is either false for normal firewall mode or true for + * user checking mode (counters are not updated, TOS & mark not done). + */ +static int +ip_fw_check(struct iphdr *ip, + const char *rif, + __u16 *redirport, + struct ip_chain *chain, + struct sk_buff *skb, + unsigned int slot, + int testing) +{ + struct tcphdr *tcp=(struct tcphdr *)((__u32 *)ip+ip->ihl); + struct udphdr *udp=(struct udphdr *)((__u32 *)ip+ip->ihl); + struct icmphdr *icmp=(struct icmphdr *)((__u32 *)ip+ip->ihl); + __u32 src, dst; + __u16 src_port = 0xFFFF, dst_port = 0xFFFF; + char tcpsyn=0; + __u16 offset; + unsigned char oldtos; + struct ip_fwkernel *f; + int ret = FW_SKIP+2; + unsigned int count; + + /* We handle fragments by dealing with the first fragment as + * if it was a normal packet. All other fragments are treated + * normally, except that they will NEVER match rules that ask + * things we don't know, ie. tcp syn flag or ports). If the + * rule is also a fragment-specific rule, non-fragments won't + * match it. */ + + offset = ntohs(ip->frag_off) & IP_OFFSET; + + /* + * Don't allow a fragment of TCP 8 bytes in. Nobody + * normal causes this. Its a cracker trying to break + * in by doing a flag overwrite to pass the direction + * checks. + */ + if (offset == 1 && ip->protocol == IPPROTO_TCP) { + if (!testing && net_ratelimit()) { + printk("Suspect TCP fragment.\n"); + dump_packet(ip,rif,NULL,NULL,0,0,0,0); + } + return FW_BLOCK; + } + + /* If we can't investigate ports, treat as fragment. It's + * either a trucated whole packet, or a truncated first + * fragment, or a TCP first fragment of length 8-15, in which + * case the above rule stops reassembly. 
+ */ + if (offset == 0) { + unsigned int size_req; + switch (ip->protocol) { + case IPPROTO_TCP: + /* Don't care about things past flags word */ + size_req = 16; + break; + + case IPPROTO_UDP: + case IPPROTO_ICMP: + size_req = 8; + break; + + default: + size_req = 0; + } + + /* If it is a truncated first fragment then it can be + * used to rewrite port information, and thus should + * be blocked. + */ + if (ntohs(ip->tot_len) < (ip->ihl<<2)+size_req) { + if (!testing && net_ratelimit()) { + printk("Suspect short first fragment.\n"); + dump_packet(ip,rif,NULL,NULL,0,0,0,0); + } + return FW_BLOCK; + } + } + + src = ip->saddr; + dst = ip->daddr; + oldtos = ip->tos; + + /* + * If we got interface from which packet came + * we can use the address directly. Linux 2.1 now uses address + * chains per device too, but unlike BSD we first check if the + * incoming packet matches a device address and the routing + * table before calling the firewall. + */ + + dprintf("Packet "); + switch(ip->protocol) + { + case IPPROTO_TCP: + dprintf("TCP "); + if (!offset) { + src_port=ntohs(tcp->source); + dst_port=ntohs(tcp->dest); + + /* Connection initilisation can only + * be made when the syn bit is set and + * neither of the ack or reset is + * set. 
*/ + if(tcp->syn && !(tcp->ack || tcp->rst)) + tcpsyn=1; + } + break; + case IPPROTO_UDP: + dprintf("UDP "); + if (!offset) { + src_port=ntohs(udp->source); + dst_port=ntohs(udp->dest); + } + break; + case IPPROTO_ICMP: + if (!offset) { + src_port=(__u16)icmp->type; + dst_port=(__u16)icmp->code; + } + dprintf("ICMP "); + break; + default: + dprintf("p=%d ",ip->protocol); + break; + } +#ifdef DEBUG_IP_FIREWALL + print_ip(ip->saddr); + + if (offset) + dprintf(":fragment (%i) ", ((int)offset)<<2); + else if (ip->protocol==IPPROTO_TCP || ip->protocol==IPPROTO_UDP + || ip->protocol==IPPROTO_ICMP) + dprintf(":%hu:%hu", src_port, dst_port); + dprintf("\n"); +#endif + + if (!testing) FWC_READ_LOCK(&ip_fw_lock); + else FWC_HAVE_LOCK(fwc_rlocks); + + f = chain->chain; + do { + count = 0; + for (; f; f = f->next) { + count++; + if (ip_rule_match(f,rif,ip, + tcpsyn,src_port,dst_port,offset)) { + if (!testing + && !ip_fw_domatch(f, ip, rif, chain->label, + skb, slot, + src_port, dst_port, + count, tcpsyn)) { + ret = FW_BLOCK; + goto out; + } + break; + } + } + if (f) { + if (f->branch) { + /* Do sanity check to see if we have + * already set prevchain and if so we + * must be in a loop */ + if (f->branch->reent[slot].prevchain) { + if (!testing) { + printk(KERN_ERR + "IP firewall: " + "Loop detected " + "at `%s'.\n", + f->branch->label); + cleanup(chain, 1, slot); + ret = FW_BLOCK; + } else { + cleanup(chain, 0, slot); + ret = FW_SKIP+1; + } + } + else { + f->branch->reent[slot].prevchain + = chain; + f->branch->reent[slot].prevrule + = f->next; + chain = f->branch; + f = chain->chain; + } + } + else if (f->simplebranch == FW_SKIP) + f = f->next; + else if (f->simplebranch == FW_SKIP+1) { + /* Just like falling off the chain */ + goto fall_off_chain; + } else { + cleanup(chain, 0, slot); + ret = f->simplebranch; + } + } /* f == NULL */ + else { + fall_off_chain: + if (chain->reent[slot].prevchain) { + struct ip_chain *tmp = chain; + f = chain->reent[slot].prevrule; + chain = 
chain->reent[slot].prevchain; + tmp->reent[slot].prevchain = NULL; + } + else { + ret = chain->policy; + if (!testing) { + chain->reent[slot].counters.pcnt++; + chain->reent[slot].counters.bcnt + += ntohs(ip->tot_len); + } + } + } + } while (ret == FW_SKIP+2); + + out: + if (!testing) FWC_READ_UNLOCK(&ip_fw_lock); + + /* Recalculate checksum if not going to reject, and TOS changed. */ + if (ip->tos != oldtos + && ret != FW_REJECT && ret != FW_BLOCK + && !testing) + ip_send_check(ip); + + if (ret == FW_REDIRECT && redirport) { + if ((*redirport = htons(f->ipfw.fw_redirpt)) == 0) { + /* Wildcard redirection. + * Note that redirport will become + * 0xFFFF for non-TCP/UDP packets. + */ + *redirport = htons(dst_port); + } + } + +#ifdef DEBUG_ALLOW_ALL + return (testing ? ret : FW_ACCEPT); +#else + return ret; +#endif +} + +/* Must have write lock & interrupts off for any of these */ + +/* This function sets all the byte counters in a chain to zero. The + * input is a pointer to the chain required for zeroing */ +static int zero_fw_chain(struct ip_chain *chainptr) +{ + struct ip_fwkernel *i; + + FWC_HAVE_LOCK(fwc_wlocks); + for (i = chainptr->chain; i; i = i->next) + memset(i->counters, 0, sizeof(struct ip_counters)*NUM_SLOTS); + return 0; +} + +static int clear_fw_chain(struct ip_chain *chainptr) +{ + struct ip_fwkernel *i= chainptr->chain; + + FWC_HAVE_LOCK(fwc_wlocks); + chainptr->chain=NULL; + + while (i) { + struct ip_fwkernel *tmp = i->next; + if (i->branch) + i->branch->refcount--; + kfree(i); + i = tmp; + } + return 0; +} + +static int replace_in_chain(struct ip_chain *chainptr, + struct ip_fwkernel *frwl, + __u32 position) +{ + struct ip_fwkernel *f = chainptr->chain; + + FWC_HAVE_LOCK(fwc_wlocks); + + while (--position && f != NULL) f = f->next; + if (f == NULL) + return EINVAL; + + if (f->branch) f->branch->refcount--; + if (frwl->branch) frwl->branch->refcount++; + + frwl->next = f->next; + memcpy(f,frwl,sizeof(struct ip_fwkernel)); + kfree(frwl); + return 0; 
+} + +static int append_to_chain(struct ip_chain *chainptr, struct ip_fwkernel *rule) +{ + struct ip_fwkernel *i; + + FWC_HAVE_LOCK(fwc_wlocks); + /* Special case if no rules already present */ + if (chainptr->chain == NULL) { + + /* If pointer writes are atomic then turning off + * interupts is not necessary. */ + chainptr->chain = rule; + if (rule->branch) rule->branch->refcount++; + return 0; + } + + /* Find the rule before the end of the chain */ + for (i = chainptr->chain; i->next; i = i->next); + i->next = rule; + if (rule->branch) rule->branch->refcount++; + return 0; +} + +/* This function inserts a rule at the position of position in the + * chain refenced by chainptr. If position is 1 then this rule will + * become the new rule one. */ +static int insert_in_chain(struct ip_chain *chainptr, + struct ip_fwkernel *frwl, + __u32 position) +{ + struct ip_fwkernel *f = chainptr->chain; + + FWC_HAVE_LOCK(fwc_wlocks); + /* special case if the position is number 1 */ + if (position == 1) { + frwl->next = chainptr->chain; + if (frwl->branch) frwl->branch->refcount++; + chainptr->chain = frwl; + return 0; + } + position--; + while (--position && f != NULL) f = f->next; + if (f == NULL) + return EINVAL; + if (frwl->branch) frwl->branch->refcount++; + frwl->next = f->next; + + f->next = frwl; + return 0; +} + +/* This function deletes the a rule from a given rulenum and chain. + * With rulenum = 1 is the first rule is deleted. 
*/ + +static int del_num_from_chain(struct ip_chain *chainptr, __u32 rulenum) +{ + struct ip_fwkernel *i=chainptr->chain,*tmp; + + FWC_HAVE_LOCK(fwc_wlocks); + + if (!chainptr->chain) + return ENOENT; + + /* Need a special case for the first rule */ + if (rulenum == 1) { + /* store temp to allow for freeing up of memory */ + tmp = chainptr->chain; + if (chainptr->chain->branch) chainptr->chain->branch->refcount--; + chainptr->chain = chainptr->chain->next; + kfree(tmp); /* free memory that is now unused */ + } else { + rulenum--; + while (--rulenum && i->next ) i = i->next; + if (!i->next) + return ENOENT; + tmp = i->next; + if (i->next->branch) + i->next->branch->refcount--; + i->next = i->next->next; + kfree(tmp); + } + return 0; +} + + +/* This function deletes the a rule from a given rule and chain. + * The rule that is deleted is the first occursance of that rule. */ +static int del_rule_from_chain(struct ip_chain *chainptr, + struct ip_fwkernel *frwl) +{ + struct ip_fwkernel *ltmp,*ftmp = chainptr->chain ; + int was_found; + + FWC_HAVE_LOCK(fwc_wlocks); + + /* Sure, we should compare marks, but since the `ipfwadm' + * script uses it for an unholy hack... well, life is easier + * this way. We also mask it out of the flags word. 
--PR */ + for (ltmp=NULL, was_found=0; + !was_found && ftmp != NULL; + ltmp = ftmp,ftmp = ftmp->next) { + if (ftmp->ipfw.fw_src.s_addr!=frwl->ipfw.fw_src.s_addr + || ftmp->ipfw.fw_dst.s_addr!=frwl->ipfw.fw_dst.s_addr + || ftmp->ipfw.fw_smsk.s_addr!=frwl->ipfw.fw_smsk.s_addr + || ftmp->ipfw.fw_dmsk.s_addr!=frwl->ipfw.fw_dmsk.s_addr +#if 0 + || ftmp->ipfw.fw_flg!=frwl->ipfw.fw_flg +#else + || ((ftmp->ipfw.fw_flg & ~IP_FW_F_MARKABS) + != (frwl->ipfw.fw_flg & ~IP_FW_F_MARKABS)) +#endif + || ftmp->ipfw.fw_invflg!=frwl->ipfw.fw_invflg + || ftmp->ipfw.fw_proto!=frwl->ipfw.fw_proto +#if 0 + || ftmp->ipfw.fw_mark!=frwl->ipfw.fw_mark +#endif + || ftmp->ipfw.fw_redirpt!=frwl->ipfw.fw_redirpt + || ftmp->ipfw.fw_spts[0]!=frwl->ipfw.fw_spts[0] + || ftmp->ipfw.fw_spts[1]!=frwl->ipfw.fw_spts[1] + || ftmp->ipfw.fw_dpts[0]!=frwl->ipfw.fw_dpts[0] + || ftmp->ipfw.fw_dpts[1]!=frwl->ipfw.fw_dpts[1] + || ftmp->ipfw.fw_outputsize!=frwl->ipfw.fw_outputsize) { + duprintf("del_rule_from_chain: mismatch:" + "src:%u/%u dst:%u/%u smsk:%u/%u dmsk:%u/%u " + "flg:%hX/%hX invflg:%hX/%hX proto:%u/%u " + "mark:%u/%u " + "ports:%hu-%hu/%hu-%hu %hu-%hu/%hu-%hu " + "outputsize:%hu-%hu\n", + ftmp->ipfw.fw_src.s_addr, + frwl->ipfw.fw_src.s_addr, + ftmp->ipfw.fw_dst.s_addr, + frwl->ipfw.fw_dst.s_addr, + ftmp->ipfw.fw_smsk.s_addr, + frwl->ipfw.fw_smsk.s_addr, + ftmp->ipfw.fw_dmsk.s_addr, + frwl->ipfw.fw_dmsk.s_addr, + ftmp->ipfw.fw_flg, + frwl->ipfw.fw_flg, + ftmp->ipfw.fw_invflg, + frwl->ipfw.fw_invflg, + ftmp->ipfw.fw_proto, + frwl->ipfw.fw_proto, + ftmp->ipfw.fw_mark, + frwl->ipfw.fw_mark, + ftmp->ipfw.fw_spts[0], + frwl->ipfw.fw_spts[0], + ftmp->ipfw.fw_spts[1], + frwl->ipfw.fw_spts[1], + ftmp->ipfw.fw_dpts[0], + frwl->ipfw.fw_dpts[0], + ftmp->ipfw.fw_dpts[1], + frwl->ipfw.fw_dpts[1], + ftmp->ipfw.fw_outputsize, + frwl->ipfw.fw_outputsize); + continue; + } + + if (strncmp(ftmp->ipfw.fw_vianame, + frwl->ipfw.fw_vianame, + IFNAMSIZ)) { + duprintf("del_rule_from_chain: if mismatch: %s/%s\n", + 
ftmp->ipfw.fw_vianame, + frwl->ipfw.fw_vianame); + continue; + } + if (ftmp->branch != frwl->branch) { + duprintf("del_rule_from_chain: branch mismatch: " + "%s/%s\n", + ftmp->branch?ftmp->branch->label:"(null)", + frwl->branch?frwl->branch->label:"(null)"); + continue; + } + if (ftmp->branch == NULL + && ftmp->simplebranch != frwl->simplebranch) { + duprintf("del_rule_from_chain: simplebranch mismatch: " + "%i/%i\n", + ftmp->simplebranch, frwl->simplebranch); + continue; + } + was_found = 1; + if (ftmp->branch) + ftmp->branch->refcount--; + if (ltmp) + ltmp->next = ftmp->next; + else + chainptr->chain = ftmp->next; + kfree(ftmp); + break; + } + + if (was_found) + return 0; + else { + duprintf("del_rule_from_chain: no matching rule found\n"); + return EINVAL; + } +} + +/* This function takes the label of a chain and deletes the first + * chain with that name. No special cases required for the built in + * chains as they have their refcount initilised to 1 so that they are + * never deleted. */ +static int del_chain(ip_chainlabel label) +{ + struct ip_chain *tmp,*tmp2; + + FWC_HAVE_LOCK(fwc_wlocks); + /* Corner case: return EBUSY not ENOENT for first elem ("input") */ + if (strcmp(label, ip_fw_chains->label) == 0) + return EBUSY; + + for (tmp = ip_fw_chains; tmp->next; tmp = tmp->next) + if(strcmp(tmp->next->label,label) == 0) + break; + + tmp2 = tmp->next; + if (!tmp2) + return ENOENT; + + if (tmp2->refcount) + return EBUSY; + + if (tmp2->chain) + return ENOTEMPTY; + + tmp->next = tmp2->next; + kfree(tmp2); + return 0; +} + +/* This is a function to initilise a chain. Built in rules start with + * refcount = 1 so that they cannot be deleted. User defined rules + * start with refcount = 0 so they can be deleted. 
*/ +static struct ip_chain *ip_init_chain(ip_chainlabel name, + __u32 ref, + int policy) +{ + unsigned int i; + struct ip_chain *label + = kmalloc(SIZEOF_STRUCT_IP_CHAIN, GFP_KERNEL); + if (label == NULL) + panic("Can't kmalloc for firewall chains.\n"); + strcpy(label->label,name); + label->next = NULL; + label->chain = NULL; + label->refcount = ref; + label->policy = policy; + for (i = 0; i < smp_num_cpus*2; i++) { + label->reent[i].counters.pcnt = label->reent[i].counters.bcnt + = 0; + label->reent[i].prevchain = NULL; + label->reent[i].prevrule = NULL; + } + + return label; +} + +/* This is a function for reating a new chain. The chains is not + * created if a chain of the same name already exists */ +static int create_chain(ip_chainlabel label) +{ + struct ip_chain *tmp; + + if (!check_label(label)) + return EINVAL; + + FWC_HAVE_LOCK(fwc_wlocks); + for (tmp = ip_fw_chains; tmp->next; tmp = tmp->next) + if (strcmp(tmp->label,label) == 0) + return EEXIST; + + if (strcmp(tmp->label,label) == 0) + return EEXIST; + + tmp->next = ip_init_chain(label, 0, FW_SKIP); /* refcount is + * zero since this is a + * user defined chain * + * and therefore can be + * deleted */ + return 0; +} + +/* This function simply changes the policy on one of the built in + * chains. checking must be done before this is call to ensure that + * chainptr is pointing to one of the three possible chains */ +static int change_policy(struct ip_chain *chainptr, int policy) +{ + FWC_HAVE_LOCK(fwc_wlocks); + chainptr->policy = policy; + return 0; +} + +/* This function takes an ip_fwuser and converts it to a ip_fwkernel. It also + * performs some checks in the structure. 
*/ +static struct ip_fwkernel *convert_ipfw(struct ip_fwuser *fwuser, int *errno) +{ + struct ip_fwkernel *fwkern; + + if ( (fwuser->ipfw.fw_flg & ~IP_FW_F_MASK) != 0 ) { + duprintf("convert_ipfw: undefined flag bits set (flags=%x)\n", + fwuser->ipfw.fw_flg); + *errno = EINVAL; + return NULL; + } + +#ifdef DEBUG_IP_FIREWALL_USER + /* These are sanity checks that don't really matter. + * We can get rid of these once testing is complete. + */ + if ((fwuser->ipfw.fw_flg & IP_FW_F_TCPSYN) + && ((fwuser->ipfw.fw_invflg & IP_FW_INV_PROTO) + || fwuser->ipfw.fw_proto != IPPROTO_TCP)) { + duprintf("convert_ipfw: TCP SYN flag set but proto != TCP!\n"); + *errno = EINVAL; + return NULL; + } + + if (strcmp(fwuser->label, IP_FW_LABEL_REDIRECT) != 0 + && fwuser->ipfw.fw_redirpt != 0) { + duprintf("convert_ipfw: Target not REDIR but redirpt != 0!\n"); + *errno = EINVAL; + return NULL; + } + + if ((!(fwuser->ipfw.fw_flg & IP_FW_F_FRAG) + && (fwuser->ipfw.fw_invflg & IP_FW_INV_FRAG)) + || (!(fwuser->ipfw.fw_flg & IP_FW_F_TCPSYN) + && (fwuser->ipfw.fw_invflg & IP_FW_INV_SYN))) { + duprintf("convert_ipfw: Can't have INV flag if flag unset!\n"); + *errno = EINVAL; + return NULL; + } + + if (((fwuser->ipfw.fw_invflg & IP_FW_INV_SRCPT) + && fwuser->ipfw.fw_spts[0] == 0 + && fwuser->ipfw.fw_spts[1] == 0xFFFF) + || ((fwuser->ipfw.fw_invflg & IP_FW_INV_DSTPT) + && fwuser->ipfw.fw_dpts[0] == 0 + && fwuser->ipfw.fw_dpts[1] == 0xFFFF) + || ((fwuser->ipfw.fw_invflg & IP_FW_INV_VIA) + && (fwuser->ipfw.fw_vianame)[0] == '\0') + || ((fwuser->ipfw.fw_invflg & IP_FW_INV_SRCIP) + && fwuser->ipfw.fw_smsk.s_addr == 0) + || ((fwuser->ipfw.fw_invflg & IP_FW_INV_DSTIP) + && fwuser->ipfw.fw_dmsk.s_addr == 0)) { + duprintf("convert_ipfw: INV flag makes rule unmatchable!\n"); + *errno = EINVAL; + return NULL; + } + + if ((fwuser->ipfw.fw_flg & IP_FW_F_FRAG) + && !(fwuser->ipfw.fw_invflg & IP_FW_INV_FRAG) + && (fwuser->ipfw.fw_spts[0] != 0 + || fwuser->ipfw.fw_spts[1] != 0xFFFF + || fwuser->ipfw.fw_dpts[0] 
!= 0 + || fwuser->ipfw.fw_dpts[1] != 0xFFFF + || (fwuser->ipfw.fw_flg & IP_FW_F_TCPSYN))) { + duprintf("convert_ipfw: Can't test ports or SYN with frag!\n"); + *errno = EINVAL; + return NULL; + } +#endif + + if ((fwuser->ipfw.fw_spts[0] != 0 + || fwuser->ipfw.fw_spts[1] != 0xFFFF + || fwuser->ipfw.fw_dpts[0] != 0 + || fwuser->ipfw.fw_dpts[1] != 0xFFFF) + && ((fwuser->ipfw.fw_invflg & IP_FW_INV_PROTO) + || (fwuser->ipfw.fw_proto != IPPROTO_TCP + && fwuser->ipfw.fw_proto != IPPROTO_UDP + && fwuser->ipfw.fw_proto != IPPROTO_ICMP))) { + duprintf("convert_ipfw: Can only test ports for TCP/UDP/ICMP!\n"); + *errno = EINVAL; + return NULL; + } + + fwkern = kmalloc(SIZEOF_STRUCT_IP_FW_KERNEL, GFP_KERNEL); + if (!fwkern) { + duprintf("convert_ipfw: kmalloc failed!\n"); + *errno = ENOMEM; + return NULL; + } + memcpy(&fwkern->ipfw,&fwuser->ipfw,sizeof(struct ip_fw)); + + if (!find_special(fwuser->label, &fwkern->simplebranch)) { + fwkern->branch = find_label(fwuser->label); + if (!fwkern->branch) { + duprintf("convert_ipfw: chain doesn't exist `%s'.\n", + fwuser->label); + kfree(fwkern); + *errno = ENOENT; + return NULL; + } else if (fwkern->branch == IP_FW_INPUT_CHAIN + || fwkern->branch == IP_FW_FORWARD_CHAIN + || fwkern->branch == IP_FW_OUTPUT_CHAIN) { + duprintf("convert_ipfw: Can't branch to builtin chain `%s'.\n", + fwuser->label); + kfree(fwkern); + *errno = ENOENT; + return NULL; + } + } else + fwkern->branch = NULL; + memset(fwkern->counters, 0, sizeof(struct ip_counters)*NUM_SLOTS); + + /* Handle empty vianame by making it a wildcard */ + if ((fwkern->ipfw.fw_vianame)[0] == '\0') + fwkern->ipfw.fw_flg |= IP_FW_F_WILDIF; + + fwkern->next = NULL; + return fwkern; +} + +int ip_fw_ctl(int cmd, void *m, int len) +{ + int ret; + struct ip_chain *chain; + unsigned long flags; + + FWC_WRITE_LOCK_IRQ(&ip_fw_lock, flags); + + switch (cmd) { + case IP_FW_FLUSH: + if (len != sizeof(ip_chainlabel) || !check_label(m)) + ret = EINVAL; + else if ((chain = find_label(m)) == NULL) + 
ret = ENOENT; + else ret = clear_fw_chain(chain); + break; + + case IP_FW_ZERO: + if (len != sizeof(ip_chainlabel) || !check_label(m)) + ret = EINVAL; + else if ((chain = find_label(m)) == NULL) + ret = ENOENT; + else ret = zero_fw_chain(chain); + break; + + case IP_FW_CHECK: { + struct ip_fwtest *new = m; + struct iphdr *ip; + + /* Don't need write lock. */ + FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags); + + if (len != sizeof(struct ip_fwtest) || !check_label(m)) + return EINVAL; + + /* Need readlock to do find_label */ + FWC_READ_LOCK(&ip_fw_lock); + + if ((chain = find_label(new->fwt_label)) == NULL) + ret = ENOENT; + else { + ip = &(new->fwt_packet.fwp_iph); + + if (ip->ihl != sizeof(struct iphdr) / sizeof(int)) { + duprintf("ip_fw_ctl: ip->ihl=%d, want %d\n", + ip->ihl, + sizeof(struct iphdr) / sizeof(int)); + ret = EINVAL; + } + else { + ret = ip_fw_check(ip, new->fwt_packet.fwp_vianame, + NULL, chain, + NULL, SLOT_NUMBER(), 1); + switch (ret) { + case FW_ACCEPT: + ret = 0; break; + case FW_REDIRECT: + ret = ECONNABORTED; break; + case FW_MASQUERADE: + ret = ECONNRESET; break; + case FW_REJECT: + ret = ECONNREFUSED; break; + /* Hack to help diag; these only get + returned when testing. 
*/ + case FW_SKIP+1: + ret = ELOOP; break; + case FW_SKIP: + ret = ENFILE; break; + default: /* FW_BLOCK */ + ret = ETIMEDOUT; break; + } + } + } + FWC_READ_UNLOCK(&ip_fw_lock); + return ret; + } + + case IP_FW_MASQ_TIMEOUTS: { + ret = ip_fw_masq_timeouts(m, len); + } + break; + + case IP_FW_REPLACE: { + struct ip_fwkernel *ip_fwkern; + struct ip_fwnew *new = m; + + if (len != sizeof(struct ip_fwnew) + || !check_label(new->fwn_label)) + ret = EINVAL; + else if ((chain = find_label(new->fwn_label)) == NULL) + ret = ENOENT; + else if ((ip_fwkern = convert_ipfw(&new->fwn_rule, &ret)) + != NULL) + ret = replace_in_chain(chain, ip_fwkern, + new->fwn_rulenum); + } + break; + + case IP_FW_APPEND: { + struct ip_fwchange *new = m; + struct ip_fwkernel *ip_fwkern; + + if (len != sizeof(struct ip_fwchange) + || !check_label(new->fwc_label)) + ret = EINVAL; + else if ((chain = find_label(new->fwc_label)) == NULL) + ret = ENOENT; + else if ((ip_fwkern = convert_ipfw(&new->fwc_rule, &ret)) + != NULL) + ret = append_to_chain(chain, ip_fwkern); + } + break; + + case IP_FW_INSERT: { + struct ip_fwkernel *ip_fwkern; + struct ip_fwnew *new = m; + + if (len != sizeof(struct ip_fwnew) + || !check_label(new->fwn_label)) + ret = EINVAL; + else if ((chain = find_label(new->fwn_label)) == NULL) + ret = ENOENT; + else if ((ip_fwkern = convert_ipfw(&new->fwn_rule, &ret)) + != NULL) + ret = insert_in_chain(chain, ip_fwkern, + new->fwn_rulenum); + } + break; + + case IP_FW_DELETE: { + struct ip_fwchange *new = m; + struct ip_fwkernel *ip_fwkern; + + if (len != sizeof(struct ip_fwchange) + || !check_label(new->fwc_label)) + ret = EINVAL; + else if ((chain = find_label(new->fwc_label)) == NULL) + ret = ENOENT; + else if ((ip_fwkern = convert_ipfw(&new->fwc_rule, &ret)) + != NULL) { + ret = del_rule_from_chain(chain, ip_fwkern); + kfree(ip_fwkern); + } + } + break; + + case IP_FW_DELETE_NUM: { + struct ip_fwdelnum *new = m; + + if (len != sizeof(struct ip_fwdelnum) + || 
!check_label(new->fwd_label)) + ret = EINVAL; + else if ((chain = find_label(new->fwd_label)) == NULL) + ret = ENOENT; + else ret = del_num_from_chain(chain, new->fwd_rulenum); + } + break; + + case IP_FW_CREATECHAIN: { + if (len != sizeof(ip_chainlabel)) { + duprintf("create_chain: bad size %i\n", len); + ret = EINVAL; + } + else ret = create_chain(m); + } + break; + + case IP_FW_DELETECHAIN: { + if (len != sizeof(ip_chainlabel)) { + duprintf("delete_chain: bad size %i\n", len); + ret = EINVAL; + } + else ret = del_chain(m); + } + break; + + case IP_FW_POLICY: { + struct ip_fwpolicy *new = m; + + if (len != sizeof(struct ip_fwpolicy) + || !check_label(new->fwp_label)) + ret = EINVAL; + else if ((chain = find_label(new->fwp_label)) == NULL) + ret = ENOENT; + else if (chain != IP_FW_INPUT_CHAIN + && chain != IP_FW_FORWARD_CHAIN + && chain != IP_FW_OUTPUT_CHAIN) { + duprintf("change_policy: can't change policy on user" + " defined chain.\n"); + ret = EINVAL; + } + else { + int pol = FW_SKIP; + find_special(new->fwp_policy, &pol); + + switch(pol) { + case FW_MASQUERADE: + if (chain != IP_FW_FORWARD_CHAIN) { + ret = EINVAL; + break; + } + /* Fall thru... 
*/ + case FW_BLOCK: + case FW_ACCEPT: + case FW_REJECT: + ret = change_policy(chain, pol); + break; + default: + duprintf("change_policy: bad policy `%s'\n", + new->fwp_policy); + ret = EINVAL; + } + } + break; + } + default: + duprintf("ip_fw_ctl: unknown request %d\n",cmd); + ret = ENOPROTOOPT; + } + + FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags); + return ret; +} + +/* Returns bytes used - doesn't NUL terminate */ +static int dump_rule(char *buffer, + const char *chainlabel, + const struct ip_fwkernel *rule) +{ + int len; + unsigned int i; + __u64 packets = 0, bytes = 0; + + FWC_HAVE_LOCK(fwc_wlocks); + for (i = 0; i < NUM_SLOTS; i++) { + packets += rule->counters[i].pcnt; + bytes += rule->counters[i].bcnt; + } + + len=sprintf(buffer, + "%9s " /* Chain name */ + "%08X/%08X->%08X/%08X " /* Source & Destination IPs */ + "%.16s " /* Interface */ + "%X %X " /* fw_flg and fw_invflg fields */ + "%u " /* Protocol */ + "%-9u %-9u %-9u %-9u " /* Packet & byte counters */ + "%u-%u %u-%u " /* Source & Dest port ranges */ + "A%02X X%02X " /* TOS and and xor masks */ + "%08X " /* Redirection port */ + "%u " /* fw_mark field */ + "%u " /* output size */ + "%9s\n", /* Target */ + chainlabel, + ntohl(rule->ipfw.fw_src.s_addr), + ntohl(rule->ipfw.fw_smsk.s_addr), + ntohl(rule->ipfw.fw_dst.s_addr), + ntohl(rule->ipfw.fw_dmsk.s_addr), + (rule->ipfw.fw_vianame)[0] ? rule->ipfw.fw_vianame : "-", + rule->ipfw.fw_flg, + rule->ipfw.fw_invflg, + rule->ipfw.fw_proto, + (__u32)(packets >> 32), (__u32)packets, + (__u32)(bytes >> 32), (__u32)bytes, + rule->ipfw.fw_spts[0], rule->ipfw.fw_spts[1], + rule->ipfw.fw_dpts[0], rule->ipfw.fw_dpts[1], + rule->ipfw.fw_tosand, rule->ipfw.fw_tosxor, + rule->ipfw.fw_redirpt, + rule->ipfw.fw_mark, + rule->ipfw.fw_outputsize, + branchname(rule->branch,rule->simplebranch)); + + duprintf("dump_rule: %i bytes done.\n", len); + return len; +} + +/* File offset is actually in records, not bytes. 
*/ +static int ip_chain_procinfo(char *buffer, char **start, + off_t offset, int length +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,29) + , int reset +#endif + ) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,29) + /* FIXME: No more `atomic' read and reset. Wonderful 8-( --RR */ + int reset = 0; +#endif + struct ip_chain *i; + struct ip_fwkernel *j = ip_fw_chains->chain; + unsigned long flags; + int len = 0; + int last_len = 0; + off_t upto = 0; + + duprintf("Offset starts at %lu\n", offset); + duprintf("ip_fw_chains is 0x%0lX\n", (unsigned long int)ip_fw_chains); + + /* Need a write lock to lock out ``readers'' which update counters. */ + FWC_WRITE_LOCK_IRQ(&ip_fw_lock, flags); + + for (i = ip_fw_chains; i; i = i->next) { + for (j = i->chain; j; j = j->next) { + if (upto == offset) break; + duprintf("Skipping rule in chain `%s'\n", + i->label); + upto++; + } + if (upto == offset) break; + } + + /* Don't init j first time, or once i = NULL */ + for (; i; (void)((i = i->next) && (j = i->chain))) { + duprintf("Dumping chain `%s'\n", i->label); + for (; j; j = j->next, upto++, last_len = len) + { + len += dump_rule(buffer+len, i->label, j); + if (len > length) { + duprintf("Dumped to %i (past %i). " + "Moving back to %i.\n", + len, length, last_len); + len = last_len; + goto outside; + } + else if (reset) + memset(j->counters, 0, + sizeof(struct ip_counters)*NUM_SLOTS); + } + } +outside: + FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags); + buffer[len] = '\0'; + + duprintf("ip_chain_procinfo: Length = %i (of %i). Offset = %li.\n", + len, length, upto); + /* `start' hack - see fs/proc/generic.c line ~165 */ + *start=(char *)((unsigned int)upto-offset); + return len; +} + +static int ip_chain_name_procinfo(char *buffer, char **start, + off_t offset, int length) +{ + struct ip_chain *i; + int len = 0,last_len = 0; + off_t pos = 0,begin = 0; + unsigned long flags; + + /* Need a write lock to lock out ``readers'' which update counters. 
*/ + FWC_WRITE_LOCK_IRQ(&ip_fw_lock, flags); + + for (i = ip_fw_chains; i; i = i->next) + { + unsigned int j; + __u32 packetsHi = 0, packetsLo = 0, bytesHi = 0, bytesLo = 0; + + for (j = 0; j < NUM_SLOTS; j++) { + packetsLo += i->reent[j].counters.pcnt & 0xFFFFFFFF; + packetsHi += ((i->reent[j].counters.pcnt >> 32) + & 0xFFFFFFFF); + bytesLo += i->reent[j].counters.bcnt & 0xFFFFFFFF; + bytesHi += ((i->reent[j].counters.bcnt >> 32) + & 0xFFFFFFFF); + } + + /* print the label and the policy */ + len+=sprintf(buffer+len,"%s %s %i %u %u %u %u\n", + i->label,branchname(NULL, i->policy),i->refcount, + packetsHi, packetsLo, bytesHi, bytesLo); + pos=begin+len; + if(pos<offset) { + len=0; + begin=pos; + } + else if(pos>offset+length) { + len = last_len; + break; + } + + last_len = len; + } + FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags); + + *start = buffer+(offset-begin); + len-=(offset-begin); + if(len>length) + len=length; + return len; +} + +/* + * Interface to the generic firewall chains. + */ +int ipfw_input_check(struct firewall_ops *this, int pf, + struct net_device *dev, void *phdr, void *arg, + struct sk_buff **pskb) +{ + return ip_fw_check(phdr, dev->name, + arg, IP_FW_INPUT_CHAIN, *pskb, SLOT_NUMBER(), 0); +} + +int ipfw_output_check(struct firewall_ops *this, int pf, + struct net_device *dev, void *phdr, void *arg, + struct sk_buff **pskb) +{ + /* Locally generated bogus packets by root. <SIGH>. 
*/ + if (((struct iphdr *)phdr)->ihl * 4 < sizeof(struct iphdr) + || (*pskb)->len < sizeof(struct iphdr)) + return FW_ACCEPT; + return ip_fw_check(phdr, dev->name, + arg, IP_FW_OUTPUT_CHAIN, *pskb, SLOT_NUMBER(), 0); +} + +int ipfw_forward_check(struct firewall_ops *this, int pf, + struct net_device *dev, void *phdr, void *arg, + struct sk_buff **pskb) +{ + return ip_fw_check(phdr, dev->name, + arg, IP_FW_FORWARD_CHAIN, *pskb, SLOT_NUMBER(), 0); +} + +struct firewall_ops ipfw_ops= +{ + NULL, + ipfw_forward_check, + ipfw_input_check, + ipfw_output_check, + NULL, + NULL +}; + +int ipfw_init_or_cleanup(int init) +{ + int ret = 0; + unsigned long flags; + + FWC_WRITE_LOCK_IRQ(&ip_fw_lock, flags); + + if (!init) goto cleanup; + +#ifdef DEBUG_IP_FIREWALL_LOCKING + fwc_wlocks = fwc_rlocks = 0; +#endif + +#if defined(CONFIG_NETLINK_DEV) || defined(CONFIG_NETLINK_DEV_MODULE) + ipfwsk = netlink_kernel_create(NETLINK_FIREWALL, NULL); + if (ipfwsk == NULL) + goto cleanup_nothing; +#endif + + ret = register_firewall(PF_INET, &ipfw_ops); + if (ret < 0) + goto cleanup_netlink; + + proc_net_create(IP_FW_PROC_CHAINS, S_IFREG | S_IRUSR | S_IWUSR, ip_chain_procinfo); + proc_net_create(IP_FW_PROC_CHAIN_NAMES, S_IFREG | S_IRUSR | S_IWUSR, ip_chain_name_procinfo); + + IP_FW_INPUT_CHAIN = ip_init_chain(IP_FW_LABEL_INPUT, 1, FW_ACCEPT); + IP_FW_FORWARD_CHAIN = ip_init_chain(IP_FW_LABEL_FORWARD, 1, FW_ACCEPT); + IP_FW_OUTPUT_CHAIN = ip_init_chain(IP_FW_LABEL_OUTPUT, 1, FW_ACCEPT); + + FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags); + return ret; + + cleanup: + while (ip_fw_chains) { + struct ip_chain *next = ip_fw_chains->next; + + clear_fw_chain(ip_fw_chains); + kfree(ip_fw_chains); + ip_fw_chains = next; + } + + proc_net_remove(IP_FW_PROC_CHAINS); + proc_net_remove(IP_FW_PROC_CHAIN_NAMES); + + unregister_firewall(PF_INET, &ipfw_ops); + + cleanup_netlink: +#if defined(CONFIG_NETLINK_DEV) || defined(CONFIG_NETLINK_DEV_MODULE) + sock_release(ipfwsk->socket); + + cleanup_nothing: +#endif + 
FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags); + return ret; +} diff --git a/net/ipv4/netfilter/ipfwadm_core.c b/net/ipv4/netfilter/ipfwadm_core.c new file mode 100644 index 000000000..a1f4e16cf --- /dev/null +++ b/net/ipv4/netfilter/ipfwadm_core.c @@ -0,0 +1,1410 @@ +/* Minor modifications to fit on compatibility framework: + Rusty.Russell@rustcorp.com.au +*/ + +#define CONFIG_IP_FIREWALL +#define CONFIG_IP_FIREWALL_VERBOSE +#define CONFIG_IP_MASQUERADE +#define CONFIG_IP_ACCT +#define CONFIG_IP_TRANSPARENT_PROXY +#define CONFIG_IP_FIREWALL_NETLINK + +/* + * IP firewalling code. This is taken from 4.4BSD. Please note the + * copyright message below. As per the GPL it must be maintained + * and the licenses thus do not conflict. While this port is subject + * to the GPL I also place my modifications under the original + * license in recognition of the original copyright. + * -- Alan Cox. + * + * $Id: ipfwadm_core.c,v 1.1 2000/03/17 14:42:00 davem Exp $ + * + * Ported from BSD to Linux, + * Alan Cox 22/Nov/1994. + * Zeroing /proc and other additions + * Jos Vos 4/Feb/1995. + * Merged and included the FreeBSD-Current changes at Ugen's request + * (but hey it's a lot cleaner now). Ugen would prefer in some ways + * we waited for his final product but since Linux 1.2.0 is about to + * appear it's not practical - Read: It works, it's not clean but please + * don't consider it to be his standard of finished work. + * Alan Cox 12/Feb/1995 + * Porting bidirectional entries from BSD, fixing accounting issues, + * adding struct ip_fwpkt for checking packets with interface address + * Jos Vos 5/Mar/1995. + * Established connections (ACK check), ACK check on bidirectional rules, + * ICMP type check. + * Wilfred Mollenvanger 7/7/1995. + * TCP attack protection. + * Alan Cox 25/8/95, based on information from bugtraq. 
+ * ICMP type printk, IP_FW_F_APPEND + * Bernd Eckenfels 1996-01-31 + * Split blocking chain into input and output chains, add new "insert" and + * "append" commands to replace semi-intelligent "add" command, let "delete". + * only delete the first matching entry, use 0xFFFF (0xFF) as ports (ICMP + * types) when counting packets being 2nd and further fragments. + * Jos Vos <jos@xos.nl> 8/2/1996. + * Add support for matching on device names. + * Jos Vos <jos@xos.nl> 15/2/1996. + * Transparent proxying support. + * Willy Konynenberg <willy@xos.nl> 10/5/96. + * Make separate accounting on incoming and outgoing packets possible. + * Jos Vos <jos@xos.nl> 18/5/1996. + * Added trap out of bad frames. + * Alan Cox <alan@cymru.net> 17/11/1996 + * + * + * Masquerading functionality + * + * Copyright (c) 1994 Pauline Middelink + * + * The pieces which added masquerading functionality are totally + * my responsibility and have nothing to with the original authors + * copyright or doing. + * + * Parts distributed under GPL. + * + * Fixes: + * Pauline Middelink : Added masquerading. + * Alan Cox : Fixed an error in the merge. + * Thomas Quinot : Fixed port spoofing. + * Alan Cox : Cleaned up retransmits in spoofing. + * Alan Cox : Cleaned up length setting. + * Wouter Gadeyne : Fixed masquerading support of ftp PORT commands + * + * Juan Jose Ciarlante : Masquerading code moved to ip_masq.c + * Andi Kleen : Print frag_offsets and the ip flags properly. + * + * All the real work was done by ..... + * + */ + + +/* + * Copyright (c) 1993 Daniel Boulet + * Copyright (c) 1994 Ugen J.S.Antsilevich + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. 
+ * + * This software is provided ``AS IS'' without any warranties of any kind. + */ + +#include <linux/config.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/errno.h> + +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/icmp.h> +#include <linux/udp.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/route.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/sock.h> +#include <net/icmp.h> +#include <linux/netlink.h> +#include <linux/init.h> +#include <linux/netfilter_ipv4/ipfwadm_core.h> +#include <linux/netfilter_ipv4/compat_firewall.h> + +#include <net/checksum.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/version.h> + +/* + * Implement IP packet firewall + */ + +#ifdef DEBUG_IP_FIREWALL +#define dprintf1(a) printk(a) +#define dprintf2(a1,a2) printk(a1,a2) +#define dprintf3(a1,a2,a3) printk(a1,a2,a3) +#define dprintf4(a1,a2,a3,a4) printk(a1,a2,a3,a4) +#else +#define dprintf1(a) +#define dprintf2(a1,a2) +#define dprintf3(a1,a2,a3) +#define dprintf4(a1,a2,a3,a4) +#endif + +#define print_ip(a) printk("%d.%d.%d.%d",(ntohl(a)>>24)&0xFF,\ + (ntohl(a)>>16)&0xFF,\ + (ntohl(a)>>8)&0xFF,\ + (ntohl(a))&0xFF); + +#ifdef DEBUG_IP_FIREWALL +#define dprint_ip(a) print_ip(a) +#else +#define dprint_ip(a) +#endif + +#if defined(CONFIG_IP_ACCT) || defined(CONFIG_IP_FIREWALL) + +struct ip_fw *ip_fw_fwd_chain; +struct ip_fw *ip_fw_in_chain; +struct ip_fw *ip_fw_out_chain; +struct ip_fw *ip_acct_chain; +struct ip_fw *ip_masq_chain; + +static struct ip_fw **chains[] = + {&ip_fw_fwd_chain, &ip_fw_in_chain, &ip_fw_out_chain, &ip_acct_chain, + &ip_masq_chain + }; +#endif /* CONFIG_IP_ACCT || CONFIG_IP_FIREWALL */ + +#ifdef CONFIG_IP_FIREWALL +int ip_fw_fwd_policy=IP_FW_F_ACCEPT; +int ip_fw_in_policy=IP_FW_F_ACCEPT; 
+int ip_fw_out_policy=IP_FW_F_ACCEPT; + +static int *policies[] = + {&ip_fw_fwd_policy, &ip_fw_in_policy, &ip_fw_out_policy}; + +#endif + +#ifdef CONFIG_IP_FIREWALL_NETLINK +struct sock *ipfwsk; +#endif + +/* + * Returns 1 if the port is matched by the vector, 0 otherwise + */ + +extern inline int port_match(unsigned short *portptr,int nports,unsigned short port,int range_flag) +{ + if (!nports) + return 1; + if ( range_flag ) + { + if ( portptr[0] <= port && port <= portptr[1] ) + { + return( 1 ); + } + nports -= 2; + portptr += 2; + } + while ( nports-- > 0 ) + { + if ( *portptr++ == port ) + { + return( 1 ); + } + } + return(0); +} + +#if defined(CONFIG_IP_ACCT) || defined(CONFIG_IP_FIREWALL) + +#ifdef CONFIG_IP_FIREWALL_VERBOSE + +/* + * VERY ugly piece of code which actually makes kernel printf for + * matching packets. + */ + +static char *chain_name(struct ip_fw *chain, int mode) +{ + switch (mode) { + case IP_FW_MODE_ACCT_IN: return "acct in"; + case IP_FW_MODE_ACCT_OUT: return "acct out"; + default: + if (chain == ip_fw_fwd_chain) + return "fw-fwd"; + else if (chain == ip_fw_in_chain) + return "fw-in"; + else + return "fw-out"; + } +} + +static char *rule_name(struct ip_fw *f, int mode, char *buf) +{ + if (mode == IP_FW_MODE_ACCT_IN || mode == IP_FW_MODE_ACCT_OUT) + return ""; + + if(f->fw_flg&IP_FW_F_ACCEPT) { + if(f->fw_flg&IP_FW_F_REDIR) { + sprintf(buf, "acc/r%d ", f->fw_pts[f->fw_nsp+f->fw_ndp]); + return buf; + } else if(f->fw_flg&IP_FW_F_MASQ) + return "acc/masq "; + else + return "acc "; + } else if(f->fw_flg&IP_FW_F_ICMPRPL) { + return "rej "; + } else { + return "deny "; + } +} + +static void print_packet(struct iphdr *ip, + u16 src_port, u16 dst_port, u16 icmp_type, + char *chain, char *rule, char *devname) +{ + __u32 *opt = (__u32 *) (ip + 1); + int opti; + __u16 foff = ntohs(ip->frag_off); + + printk(KERN_INFO "IP %s %s%s", chain, rule, devname); + + switch(ip->protocol) + { + case IPPROTO_TCP: + printk(" TCP "); + break; + case IPPROTO_UDP: + 
printk(" UDP "); + break; + case IPPROTO_ICMP: + printk(" ICMP/%d ", icmp_type); + break; + default: + printk(" PROTO=%d ", ip->protocol); + break; + } + print_ip(ip->saddr); + if(ip->protocol == IPPROTO_TCP || ip->protocol == IPPROTO_UDP) + printk(":%hu", src_port); + printk(" "); + print_ip(ip->daddr); + if(ip->protocol == IPPROTO_TCP || ip->protocol == IPPROTO_UDP) + printk(":%hu", dst_port); + printk(" L=%hu S=0x%2.2hX I=%hu FO=0x%4.4hX T=%hu", + ntohs(ip->tot_len), ip->tos, ntohs(ip->id), + foff & IP_OFFSET, ip->ttl); + if (foff & IP_DF) printk(" DF=1"); + if (foff & IP_MF) printk(" MF=1"); + for (opti = 0; opti < (ip->ihl - sizeof(struct iphdr) / 4); opti++) + printk(" O=0x%8.8X", *opt++); + printk("\n"); +} +#endif + +/* + * Returns one of the generic firewall policies, like FW_ACCEPT. + * Also does accounting so you can feed it the accounting chain. + * + * The modes is either IP_FW_MODE_FW (normal firewall mode), + * IP_FW_MODE_ACCT_IN or IP_FW_MODE_ACCT_OUT (accounting mode, + * steps through the entire chain and handles fragments + * differently), or IP_FW_MODE_CHK (handles user-level check, + * counters are not updated). + */ + + +int ip_fw_chk(struct iphdr *ip, struct net_device *rif, __u16 *redirport, + struct ip_fw *chain, int policy, int mode) +{ + struct ip_fw *f; + struct tcphdr *tcp=(struct tcphdr *)((__u32 *)ip+ip->ihl); + struct udphdr *udp=(struct udphdr *)((__u32 *)ip+ip->ihl); + struct icmphdr *icmp=(struct icmphdr *)((__u32 *)ip+ip->ihl); + __u32 src, dst; + __u16 src_port=0xFFFF, dst_port=0xFFFF, icmp_type=0xFF; + unsigned short f_prt=0, prt; + char notcpsyn=0, notcpack=0, match; + unsigned short offset; + int answer; + unsigned char tosand, tosxor; + + /* + * If the chain is empty follow policy. The BSD one + * accepts anything giving you a time window while + * flushing and rebuilding the tables. + */ + + src = ip->saddr; + dst = ip->daddr; + + /* + * This way we handle fragmented packets. 
+ * we ignore all fragments but the first one + * so the whole packet can't be reassembled. + * This way we relay on the full info which + * stored only in first packet. + * + * Note that this theoretically allows partial packet + * spoofing. Not very dangerous but paranoid people may + * wish to play with this. It also allows the so called + * "fragment bomb" denial of service attack on some types + * of system. + */ + + offset = ntohs(ip->frag_off) & IP_OFFSET; + + /* + * Don't allow a fragment of TCP 8 bytes in. Nobody + * normal causes this. Its a cracker trying to break + * in by doing a flag overwrite to pass the direction + * checks. + */ + + if (offset == 1 && ip->protocol == IPPROTO_TCP) + return FW_BLOCK; + + if (offset!=0 && !(mode & (IP_FW_MODE_ACCT_IN|IP_FW_MODE_ACCT_OUT)) && + (ip->protocol == IPPROTO_TCP || ip->protocol == IPPROTO_UDP || + ip->protocol == IPPROTO_ICMP)) + return FW_ACCEPT; + + /* + * Header fragment for TCP is too small to check the bits. + */ + + if(ip->protocol==IPPROTO_TCP && (ip->ihl<<2)+16 > ntohs(ip->tot_len)) + return FW_BLOCK; + + /* + * Too short. + * + * But only too short for a packet with ports... + */ + + else if((ntohs(ip->tot_len)<8+(ip->ihl<<2))&&(ip->protocol==IPPROTO_TCP || ip->protocol==IPPROTO_UDP)) + return FW_BLOCK; + + src = ip->saddr; + dst = ip->daddr; + + /* + * If we got interface from which packet came + * we can use the address directly. This is unlike + * 4.4BSD derived systems that have an address chain + * per device. We have a device per address with dummy + * devices instead. 
+ */ + + dprintf1("Packet "); + switch(ip->protocol) + { + case IPPROTO_TCP: + dprintf1("TCP "); + /* ports stay 0xFFFF if it is not the first fragment */ + if (!offset) { + src_port=ntohs(tcp->source); + dst_port=ntohs(tcp->dest); + if(!tcp->ack && !tcp->rst) + /* We do NOT have ACK, value TRUE */ + notcpack=1; + if(!tcp->syn || !notcpack) + /* We do NOT have SYN, value TRUE */ + notcpsyn=1; + } + prt=IP_FW_F_TCP; + break; + case IPPROTO_UDP: + dprintf1("UDP "); + /* ports stay 0xFFFF if it is not the first fragment */ + if (!offset) { + src_port=ntohs(udp->source); + dst_port=ntohs(udp->dest); + } + prt=IP_FW_F_UDP; + break; + case IPPROTO_ICMP: + /* icmp_type stays 255 if it is not the first fragment */ + if (!offset) + icmp_type=(__u16)(icmp->type); + dprintf2("ICMP:%d ",icmp_type); + prt=IP_FW_F_ICMP; + break; + default: + dprintf2("p=%d ",ip->protocol); + prt=IP_FW_F_ALL; + break; + } +#ifdef DEBUG_IP_FIREWALL + dprint_ip(ip->saddr); + + if (ip->protocol==IPPROTO_TCP || ip->protocol==IPPROTO_UDP) + /* This will print 65535 when it is not the first fragment! */ + dprintf2(":%d ", src_port); + dprint_ip(ip->daddr); + if (ip->protocol==IPPROTO_TCP || ip->protocol==IPPROTO_UDP) + /* This will print 65535 when it is not the first fragment! */ + dprintf2(":%d ",dst_port); + dprintf1("\n"); +#endif + + for (f=chain;f;f=f->fw_next) + { + /* + * This is a bit simpler as we don't have to walk + * an interface chain as you do in BSD - same logic + * however. + */ + + /* + * Match can become 0x01 (a "normal" match was found), + * 0x02 (a reverse match was found), and 0x03 (the + * IP addresses match in both directions). + * Now we know in which direction(s) we should look + * for a match for the TCP/UDP ports. Both directions + * might match (e.g., when both addresses are on the + * same network for which an address/mask is given), but + * the ports might only match in one direction. + * This was obviously wrong in the original BSD code. 
+ */ + match = 0x00; + + if ((src&f->fw_smsk.s_addr)==f->fw_src.s_addr + && (dst&f->fw_dmsk.s_addr)==f->fw_dst.s_addr) + /* normal direction */ + match |= 0x01; + + if ((f->fw_flg & IP_FW_F_BIDIR) && + (dst&f->fw_smsk.s_addr)==f->fw_src.s_addr + && (src&f->fw_dmsk.s_addr)==f->fw_dst.s_addr) + /* reverse direction */ + match |= 0x02; + + if (!match) + continue; + + /* + * Look for a VIA device match + */ + if(f->fw_viadev) + { + if(rif!=f->fw_viadev) + continue; /* Mismatch */ + } + + /* This looks stupid, because we scan almost static + list, searching for static key. However, this way seems + to be only reasonable way of handling fw_via rules + (btw bsd makes the same thing). + + It will not affect performance if you will follow + the following simple rules: + + - if inteface is aliased, ALWAYS specify fw_viadev, + so that previous check will guarantee, that we will + not waste time when packet arrive on another interface. + + - avoid using fw_via.s_addr if fw_via.s_addr is owned + by an aliased interface. + + --ANK + */ + if (f->fw_via.s_addr && rif) { + struct in_ifaddr *ifa; + + if (rif->ip_ptr == NULL) + continue; /* Mismatch */ + + for (ifa = ((struct in_device*)(rif->ip_ptr))->ifa_list; + ifa; ifa = ifa->ifa_next) { + if (ifa->ifa_local == f->fw_via.s_addr) + goto ifa_ok; + } + continue; /* Mismatch */ + + ifa_ok: + } + + /* + * Ok the chain addresses match. + */ + +#ifdef CONFIG_IP_ACCT + /* + * See if we're in accounting mode and only want to + * count incoming or outgoing packets. + */ + + if (mode & (IP_FW_MODE_ACCT_IN|IP_FW_MODE_ACCT_OUT) && + ((mode == IP_FW_MODE_ACCT_IN && f->fw_flg&IP_FW_F_ACCTOUT) || + (mode == IP_FW_MODE_ACCT_OUT && f->fw_flg&IP_FW_F_ACCTIN))) + continue; + +#endif + /* + * For all non-TCP packets and/or non-first fragments, + * notcpsyn and notcpack will always be FALSE, + * so the IP_FW_F_TCPSYN and IP_FW_F_TCPACK flags + * are actually ignored for these packets. 
+ */ + + if((f->fw_flg&IP_FW_F_TCPSYN) && notcpsyn) + continue; + + if((f->fw_flg&IP_FW_F_TCPACK) && notcpack) + continue; + + f_prt=f->fw_flg&IP_FW_F_KIND; + if (f_prt!=IP_FW_F_ALL) + { + /* + * Specific firewall - packet's protocol + * must match firewall's. + */ + + if(prt!=f_prt) + continue; + + if((prt==IP_FW_F_ICMP && + ! port_match(&f->fw_pts[0], f->fw_nsp, + icmp_type,f->fw_flg&IP_FW_F_SRNG)) || + !(prt==IP_FW_F_ICMP || ((match & 0x01) && + port_match(&f->fw_pts[0], f->fw_nsp, src_port, + f->fw_flg&IP_FW_F_SRNG) && + port_match(&f->fw_pts[f->fw_nsp], f->fw_ndp, dst_port, + f->fw_flg&IP_FW_F_DRNG)) || ((match & 0x02) && + port_match(&f->fw_pts[0], f->fw_nsp, dst_port, + f->fw_flg&IP_FW_F_SRNG) && + port_match(&f->fw_pts[f->fw_nsp], f->fw_ndp, src_port, + f->fw_flg&IP_FW_F_DRNG)))) + { + continue; + } + } + +#ifdef CONFIG_IP_FIREWALL_VERBOSE + if (f->fw_flg & IP_FW_F_PRN) + { + char buf[16]; + + print_packet(ip, src_port, dst_port, icmp_type, + chain_name(chain, mode), + rule_name(f, mode, buf), + rif ? rif->name : "-"); + } +#endif + if (mode != IP_FW_MODE_CHK) { + f->fw_bcnt+=ntohs(ip->tot_len); + f->fw_pcnt++; + } + if (!(mode & (IP_FW_MODE_ACCT_IN|IP_FW_MODE_ACCT_OUT))) + break; + } /* Loop */ + + if (!(mode & (IP_FW_MODE_ACCT_IN|IP_FW_MODE_ACCT_OUT))) { + + /* + * We rely on policy defined in the rejecting entry or, if no match + * was found, we rely on the general policy variable for this type + * of firewall. + */ + + if (f!=NULL) { + policy=f->fw_flg; + tosand=f->fw_tosand; + tosxor=f->fw_tosxor; + } else { + tosand=0xFF; + tosxor=0x00; + } + + if (policy&IP_FW_F_ACCEPT) { + /* Adjust priority and recompute checksum */ + __u8 old_tos = ip->tos; + ip->tos = (old_tos & tosand) ^ tosxor; + if (ip->tos != old_tos) + ip_send_check(ip); +#ifdef CONFIG_IP_TRANSPARENT_PROXY + if (policy&IP_FW_F_REDIR) { + if (redirport) + if ((*redirport = htons(f->fw_pts[f->fw_nsp+f->fw_ndp])) == 0) { + /* Wildcard redirection. 
+ * Note that redirport will become + * 0xFFFF for non-TCP/UDP packets. + */ + *redirport = htons(dst_port); + } + answer = FW_REDIRECT; + } else +#endif +#ifdef CONFIG_IP_MASQUERADE + if (policy&IP_FW_F_MASQ) + answer = FW_MASQUERADE; + else +#endif + answer = FW_ACCEPT; + + } else if(policy&IP_FW_F_ICMPRPL) + answer = FW_REJECT; + else + answer = FW_BLOCK; + +#ifdef CONFIG_IP_FIREWALL_NETLINK + if((policy&IP_FW_F_PRN) && (answer == FW_REJECT || answer == FW_BLOCK)) + { + struct sk_buff *skb=alloc_skb(128, GFP_ATOMIC); + if(skb) + { + int len=min(128,ntohs(ip->tot_len)); + skb_put(skb,len); + memcpy(skb->data,ip,len); + if(netlink_post(NETLINK_FIREWALL, skb)) + kfree_skb(skb); + } + } +#endif + return answer; + } else + /* we're doing accounting, always ok */ + return 0; +} + + +static void zero_fw_chain(struct ip_fw *chainptr) +{ + struct ip_fw *ctmp=chainptr; + while(ctmp) + { + ctmp->fw_pcnt=0L; + ctmp->fw_bcnt=0L; + ctmp=ctmp->fw_next; + } +} + +static void free_fw_chain(struct ip_fw *volatile* chainptr) +{ + unsigned long flags; + save_flags(flags); + cli(); + while ( *chainptr != NULL ) + { + struct ip_fw *ftmp; + ftmp = *chainptr; + *chainptr = ftmp->fw_next; + kfree_s(ftmp,sizeof(*ftmp)); + } + restore_flags(flags); +} + +/* Volatiles to keep some of the compiler versions amused */ + +static int insert_in_chain(struct ip_fw *volatile* chainptr, struct ip_fw *frwl,int len) +{ + struct ip_fw *ftmp; + unsigned long flags; + + save_flags(flags); + + ftmp = kmalloc( sizeof(struct ip_fw), GFP_ATOMIC ); + if ( ftmp == NULL ) + { +#ifdef DEBUG_IP_FIREWALL + printk("ip_fw_ctl: malloc said no\n"); +#endif + return( ENOMEM ); + } + + memcpy(ftmp, frwl, len); + /* + * Allow the more recent "minimise cost" flag to be + * set. 
[Rob van Nieuwkerk] + */ + ftmp->fw_tosand |= 0x01; + ftmp->fw_tosxor &= 0xFE; + ftmp->fw_pcnt=0L; + ftmp->fw_bcnt=0L; + + cli(); + + if ((ftmp->fw_vianame)[0]) { + if (!(ftmp->fw_viadev = dev_get_by_name(ftmp->fw_vianame))) + ftmp->fw_viadev = (struct net_device *) -1; + } else + ftmp->fw_viadev = NULL; + + ftmp->fw_next = *chainptr; + *chainptr=ftmp; + restore_flags(flags); + return(0); +} + +static int append_to_chain(struct ip_fw *volatile* chainptr, struct ip_fw *frwl,int len) +{ + struct ip_fw *ftmp; + struct ip_fw *chtmp=NULL; + struct ip_fw *volatile chtmp_prev=NULL; + unsigned long flags; + + save_flags(flags); + + ftmp = kmalloc( sizeof(struct ip_fw), GFP_ATOMIC ); + if ( ftmp == NULL ) + { +#ifdef DEBUG_IP_FIREWALL + printk("ip_fw_ctl: malloc said no\n"); +#endif + return( ENOMEM ); + } + + memcpy(ftmp, frwl, len); + /* + * Allow the more recent "minimise cost" flag to be + * set. [Rob van Nieuwkerk] + */ + ftmp->fw_tosand |= 0x01; + ftmp->fw_tosxor &= 0xFE; + ftmp->fw_pcnt=0L; + ftmp->fw_bcnt=0L; + + ftmp->fw_next = NULL; + + cli(); + + if ((ftmp->fw_vianame)[0]) { + if (!(ftmp->fw_viadev = dev_get_by_name(ftmp->fw_vianame))) + ftmp->fw_viadev = (struct net_device *) -1; + } else + ftmp->fw_viadev = NULL; + + chtmp_prev=NULL; + for (chtmp=*chainptr;chtmp!=NULL;chtmp=chtmp->fw_next) + chtmp_prev=chtmp; + + if (chtmp_prev) + chtmp_prev->fw_next=ftmp; + else + *chainptr=ftmp; + restore_flags(flags); + return(0); +} + +static int del_from_chain(struct ip_fw *volatile*chainptr, struct ip_fw *frwl) +{ + struct ip_fw *ftmp,*ltmp; + unsigned short tport1,tport2,tmpnum; + char matches,was_found; + unsigned long flags; + + save_flags(flags); + cli(); + + ftmp=*chainptr; + + if ( ftmp == NULL ) + { +#ifdef DEBUG_IP_FIREWALL + printk("ip_fw_ctl: chain is empty\n"); +#endif + restore_flags(flags); + return( EINVAL ); + } + + ltmp=NULL; + was_found=0; + + while( !was_found && ftmp != NULL ) + { + matches=1; + if (ftmp->fw_src.s_addr!=frwl->fw_src.s_addr + || 
ftmp->fw_dst.s_addr!=frwl->fw_dst.s_addr + || ftmp->fw_smsk.s_addr!=frwl->fw_smsk.s_addr + || ftmp->fw_dmsk.s_addr!=frwl->fw_dmsk.s_addr + || ftmp->fw_via.s_addr!=frwl->fw_via.s_addr + || ftmp->fw_flg!=frwl->fw_flg) + matches=0; + + tport1=ftmp->fw_nsp+ftmp->fw_ndp; + tport2=frwl->fw_nsp+frwl->fw_ndp; + if (tport1!=tport2) + matches=0; + else if (tport1!=0) + { + for (tmpnum=0;tmpnum < tport1 && tmpnum < IP_FW_MAX_PORTS;tmpnum++) + if (ftmp->fw_pts[tmpnum]!=frwl->fw_pts[tmpnum]) + matches=0; + } + if (strncmp(ftmp->fw_vianame, frwl->fw_vianame, IFNAMSIZ)) + matches=0; + if(matches) + { + was_found=1; + if (ltmp) + { + ltmp->fw_next=ftmp->fw_next; + kfree_s(ftmp,sizeof(*ftmp)); + ftmp=ltmp->fw_next; + } + else + { + *chainptr=ftmp->fw_next; + kfree_s(ftmp,sizeof(*ftmp)); + ftmp=*chainptr; + } + } + else + { + ltmp = ftmp; + ftmp = ftmp->fw_next; + } + } + restore_flags(flags); + if (was_found) + return 0; + else + return(EINVAL); +} + +#endif /* CONFIG_IP_ACCT || CONFIG_IP_FIREWALL */ + +struct ip_fw *check_ipfw_struct(struct ip_fw *frwl, int len) +{ + + if ( len != sizeof(struct ip_fw) ) + { +#ifdef DEBUG_IP_FIREWALL + printk("ip_fw_ctl: len=%d, want %d\n",len, sizeof(struct ip_fw)); +#endif + return(NULL); + } + + if ( (frwl->fw_flg & ~IP_FW_F_MASK) != 0 ) + { +#ifdef DEBUG_IP_FIREWALL + printk("ip_fw_ctl: undefined flag bits set (flags=%x)\n", + frwl->fw_flg); +#endif + return(NULL); + } + +#ifndef CONFIG_IP_TRANSPARENT_PROXY + if (frwl->fw_flg & IP_FW_F_REDIR) { +#ifdef DEBUG_IP_FIREWALL + printk("ip_fw_ctl: unsupported flag IP_FW_F_REDIR\n"); +#endif + return(NULL); + } +#endif + +#ifndef CONFIG_IP_MASQUERADE + if (frwl->fw_flg & IP_FW_F_MASQ) { +#ifdef DEBUG_IP_FIREWALL + printk("ip_fw_ctl: unsupported flag IP_FW_F_MASQ\n"); +#endif + return(NULL); + } +#endif + + if ( (frwl->fw_flg & IP_FW_F_SRNG) && frwl->fw_nsp < 2 ) + { +#ifdef DEBUG_IP_FIREWALL + printk("ip_fw_ctl: src range set but fw_nsp=%d\n", + frwl->fw_nsp); +#endif + return(NULL); + } + + if ( 
(frwl->fw_flg & IP_FW_F_DRNG) && frwl->fw_ndp < 2 ) + { +#ifdef DEBUG_IP_FIREWALL + printk("ip_fw_ctl: dst range set but fw_ndp=%d\n", + frwl->fw_ndp); +#endif + return(NULL); + } + + if ( frwl->fw_nsp + frwl->fw_ndp > (frwl->fw_flg & IP_FW_F_REDIR ? IP_FW_MAX_PORTS - 1 : IP_FW_MAX_PORTS) ) + { +#ifdef DEBUG_IP_FIREWALL + printk("ip_fw_ctl: too many ports (%d+%d)\n", + frwl->fw_nsp,frwl->fw_ndp); +#endif + return(NULL); + } + + return frwl; +} + + + + +#ifdef CONFIG_IP_ACCT + +int ip_acct_ctl(int stage, void *m, int len) +{ + if ( stage == IP_ACCT_FLUSH ) + { + free_fw_chain(&ip_acct_chain); + return(0); + } + if ( stage == IP_ACCT_ZERO ) + { + zero_fw_chain(ip_acct_chain); + return(0); + } + if ( stage == IP_ACCT_INSERT || stage == IP_ACCT_APPEND || + stage == IP_ACCT_DELETE ) + { + struct ip_fw *frwl; + + if (!(frwl=check_ipfw_struct(m,len))) + return (EINVAL); + + switch (stage) + { + case IP_ACCT_INSERT: + return( insert_in_chain(&ip_acct_chain,frwl,len)); + case IP_ACCT_APPEND: + return( append_to_chain(&ip_acct_chain,frwl,len)); + case IP_ACCT_DELETE: + return( del_from_chain(&ip_acct_chain,frwl)); + default: + /* + * Should be panic but... (Why ??? 
- AC) + */ +#ifdef DEBUG_IP_FIREWALL + printk("ip_acct_ctl: unknown request %d\n",stage); +#endif + return(EINVAL); + } + } +#ifdef DEBUG_IP_FIREWALL + printk("ip_acct_ctl: unknown request %d\n",stage); +#endif + return(EINVAL); +} +#endif + +#ifdef CONFIG_IP_FIREWALL +int ip_fw_ctl(int stage, void *m, int len) +{ + int cmd, fwtype; + + cmd = stage & IP_FW_COMMAND; + fwtype = (stage & IP_FW_TYPE) >> IP_FW_SHIFT; + + if ( cmd == IP_FW_FLUSH ) + { + free_fw_chain(chains[fwtype]); + return(0); + } + + if ( cmd == IP_FW_ZERO ) + { + zero_fw_chain(*chains[fwtype]); + return(0); + } + + if ( cmd == IP_FW_POLICY ) + { + int *tmp_policy_ptr; + tmp_policy_ptr=(int *)m; + *policies[fwtype] = *tmp_policy_ptr; + return 0; + } + + if ( cmd == IP_FW_CHECK ) + { + struct net_device *viadev; + struct ip_fwpkt *ipfwp; + struct iphdr *ip; + + if ( len != sizeof(struct ip_fwpkt) ) + { +#ifdef DEBUG_IP_FIREWALL + printk("ip_fw_ctl: length=%d, expected %d\n", + len, sizeof(struct ip_fwpkt)); +#endif + return( EINVAL ); + } + + ipfwp = (struct ip_fwpkt *)m; + ip = &(ipfwp->fwp_iph); + + if ( !(viadev = dev_get_by_name(ipfwp->fwp_vianame)) ) { +#ifdef DEBUG_IP_FIREWALL + printk("ip_fw_ctl: invalid device \"%s\"\n", ipfwp->fwp_vianame); +#endif + return(EINVAL); + } else if ( ip->ihl != sizeof(struct iphdr) / sizeof(int)) { +#ifdef DEBUG_IP_FIREWALL + printk("ip_fw_ctl: ip->ihl=%d, want %d\n",ip->ihl, + sizeof(struct iphdr)/sizeof(int)); +#endif + return(EINVAL); + } + + switch (ip_fw_chk(ip, viadev, NULL, *chains[fwtype], + *policies[fwtype], IP_FW_MODE_CHK)) + { + case FW_ACCEPT: + return(0); + case FW_REDIRECT: + return(ECONNABORTED); + case FW_MASQUERADE: + return(ECONNRESET); + case FW_REJECT: + return(ECONNREFUSED); + default: /* FW_BLOCK */ + return(ETIMEDOUT); + } + } + + if ( cmd == IP_FW_MASQ_TIMEOUTS ) + return ip_fw_masq_timeouts(m, len); + +/* + * Here we really working hard-adding new elements + * to blocking/forwarding chains or deleting 'em + */ + + if ( cmd == 
IP_FW_INSERT || cmd == IP_FW_APPEND || cmd == IP_FW_DELETE ) + { + struct ip_fw *frwl; + int fwtype; + + frwl=check_ipfw_struct(m,len); + if (frwl==NULL) + return (EINVAL); + fwtype = (stage & IP_FW_TYPE) >> IP_FW_SHIFT; + + switch (cmd) + { + case IP_FW_INSERT: + return(insert_in_chain(chains[fwtype],frwl,len)); + case IP_FW_APPEND: + return(append_to_chain(chains[fwtype],frwl,len)); + case IP_FW_DELETE: + return(del_from_chain(chains[fwtype],frwl)); + default: + /* + * Should be panic but... (Why are BSD people panic obsessed ??) + */ +#ifdef DEBUG_IP_FIREWALL + printk("ip_fw_ctl: unknown request %d\n",stage); +#endif + return(EINVAL); + } + } + +#ifdef DEBUG_IP_FIREWALL + printk("ip_fw_ctl: unknown request %d\n",stage); +#endif + return(ENOPROTOOPT); +} +#endif /* CONFIG_IP_FIREWALL */ + +#ifdef CONFIG_PROC_FS +#if defined(CONFIG_IP_FIREWALL) || defined(CONFIG_IP_ACCT) + +static int ip_chain_procinfo(int stage, char *buffer, char **start, + off_t offset, int length, int reset) +{ + off_t pos=0, begin=0; + struct ip_fw *i; + unsigned long flags; + int len, p; + int last_len = 0; + + + switch(stage) + { +#ifdef CONFIG_IP_FIREWALL + case IP_FW_IN: + i = ip_fw_in_chain; + len=sprintf(buffer, "IP firewall input rules, default %d\n", + ip_fw_in_policy); + break; + case IP_FW_OUT: + i = ip_fw_out_chain; + len=sprintf(buffer, "IP firewall output rules, default %d\n", + ip_fw_out_policy); + break; + case IP_FW_FWD: + i = ip_fw_fwd_chain; + len=sprintf(buffer, "IP firewall forward rules, default %d\n", + ip_fw_fwd_policy); + break; +#endif +#ifdef CONFIG_IP_ACCT + case IP_FW_ACCT: + i = ip_acct_chain; + len=sprintf(buffer,"IP accounting rules\n"); + break; +#endif + default: + /* this should never be reached, but safety first... 
*/ + i = NULL; + len=0; + break; + } + + save_flags(flags); + cli(); + + while(i!=NULL) + { + len+=sprintf(buffer+len,"%08X/%08X->%08X/%08X %.16s %08X %X ", + ntohl(i->fw_src.s_addr),ntohl(i->fw_smsk.s_addr), + ntohl(i->fw_dst.s_addr),ntohl(i->fw_dmsk.s_addr), + (i->fw_vianame)[0] ? i->fw_vianame : "-", + ntohl(i->fw_via.s_addr), i->fw_flg); + /* 10 is enough for a 32 bit box but the counters are 64bit on + the Alpha and Ultrapenguin */ + len+=sprintf(buffer+len,"%u %u %-20lu %-20lu", + i->fw_nsp,i->fw_ndp, i->fw_pcnt,i->fw_bcnt); + for (p = 0; p < IP_FW_MAX_PORTS; p++) + len+=sprintf(buffer+len, " %u", i->fw_pts[p]); + len+=sprintf(buffer+len, " A%02X X%02X", i->fw_tosand, i->fw_tosxor); + buffer[len++]='\n'; + buffer[len]='\0'; + pos=begin+len; + if(pos<offset) + { + len=0; + begin=pos; + } + else if(pos>offset+length) + { + len = last_len; + break; + } + else if(reset) + { + /* This needs to be done at this specific place! */ + i->fw_pcnt=0L; + i->fw_bcnt=0L; + } + last_len = len; + i=i->fw_next; + } + restore_flags(flags); + *start=buffer+(offset-begin); + len-=(offset-begin); + if(len>length) + len=length; + return len; +} +#endif + +#ifdef CONFIG_IP_ACCT + +static int ip_acct_procinfo(char *buffer, char **start, off_t offset, + int length +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,29) + , int reset +#endif + ) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,29) + /* FIXME: No more `atomic' read and reset. Wonderful 8-( --RR */ + int reset = 0; +#endif + return ip_chain_procinfo(IP_FW_ACCT, buffer,start, offset,length, + reset); +} + +#endif + +#ifdef CONFIG_IP_FIREWALL + +static int ip_fw_in_procinfo(char *buffer, char **start, off_t offset, + int length +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,29) + , int reset +#endif + ) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,29) + /* FIXME: No more `atomic' read and reset. 
Wonderful 8-( --RR */ + int reset = 0; +#endif + return ip_chain_procinfo(IP_FW_IN, buffer,start,offset,length, + reset); +} + +static int ip_fw_out_procinfo(char *buffer, char **start, off_t offset, + int length +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,29) + , int reset +#endif + ) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,29) + /* FIXME: No more `atomic' read and reset. Wonderful 8-( --RR */ + int reset = 0; +#endif + return ip_chain_procinfo(IP_FW_OUT, buffer,start,offset,length, + reset); +} + +static int ip_fw_fwd_procinfo(char *buffer, char **start, off_t offset, + int length +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,29) + , int reset +#endif + ) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,29) + /* FIXME: No more `atomic' read and reset. Wonderful 8-( --RR */ + int reset = 0; +#endif + return ip_chain_procinfo(IP_FW_FWD, buffer,start,offset,length, + reset); +} +#endif +#endif + + +#ifdef CONFIG_IP_FIREWALL +/* + * Interface to the generic firewall chains. + */ + +int ipfw_input_check(struct firewall_ops *this, int pf, + struct net_device *dev, void *phdr, void *arg, + struct sk_buff **pskb) +{ + return ip_fw_chk(phdr, dev, arg, ip_fw_in_chain, ip_fw_in_policy, + IP_FW_MODE_FW); +} + +int ipfw_output_check(struct firewall_ops *this, int pf, + struct net_device *dev, void *phdr, void *arg, + struct sk_buff **pskb) +{ + return ip_fw_chk(phdr, dev, arg, ip_fw_out_chain, ip_fw_out_policy, + IP_FW_MODE_FW); +} + +int ipfw_forward_check(struct firewall_ops *this, int pf, + struct net_device *dev, void *phdr, void *arg, + struct sk_buff **pskb) +{ + return ip_fw_chk(phdr, dev, arg, ip_fw_fwd_chain, ip_fw_fwd_policy, + IP_FW_MODE_FW); +} + +#ifdef CONFIG_IP_ACCT +int ipfw_acct_in(struct firewall_ops *this, int pf, struct net_device *dev, + void *phdr, void *arg, struct sk_buff **pskb) +{ + return ip_fw_chk(phdr,dev,NULL,ip_acct_chain,0,IP_FW_MODE_ACCT_IN); +} + +int ipfw_acct_out(struct firewall_ops *this, int pf, struct net_device *dev, + void 
*phdr, void *arg, struct sk_buff **pskb) +{ + return ip_fw_chk(phdr,dev,NULL,ip_acct_chain,0,IP_FW_MODE_ACCT_OUT); +} +#endif + +struct firewall_ops ipfw_ops= +{ + NULL, + ipfw_forward_check, + ipfw_input_check, + ipfw_output_check, +#ifdef CONFIG_IP_ACCT + ipfw_acct_in, + ipfw_acct_out +#else + NULL, + NULL +#endif +}; + +#endif + +#if defined(CONFIG_IP_ACCT) || defined(CONFIG_IP_FIREWALL) + +int ipfw_device_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev=ptr; + char *devname = dev->name; + unsigned long flags; + struct ip_fw *fw; + int chn; + + save_flags(flags); + cli(); + + if (event == NETDEV_UP) { + for (chn = 0; chn < IP_FW_CHAINS; chn++) + for (fw = *chains[chn]; fw; fw = fw->fw_next) + if ((fw->fw_vianame)[0] && !strncmp(devname, + fw->fw_vianame, IFNAMSIZ)) + fw->fw_viadev = dev; + } else if (event == NETDEV_DOWN) { + for (chn = 0; chn < IP_FW_CHAINS; chn++) + for (fw = *chains[chn]; fw; fw = fw->fw_next) + /* we could compare just the pointers ... 
*/ + if ((fw->fw_vianame)[0] && !strncmp(devname, + fw->fw_vianame, IFNAMSIZ)) + fw->fw_viadev = (struct net_device*)-1; + } + + restore_flags(flags); + return NOTIFY_DONE; +} + +static struct notifier_block ipfw_dev_notifier={ + ipfw_device_event, + NULL, + 0 +}; + +#endif + +int ipfw_init_or_cleanup(int init) +{ + int ret = 0; + + if (!init) + goto cleanup; + + ret = register_firewall(PF_INET, &ipfw_ops); + if (ret < 0) + goto cleanup_nothing; + +#ifdef CONFIG_IP_ACCT + proc_net_create("ip_acct", S_IFREG | S_IRUGO | S_IWUSR, ip_acct_procinfo); +#endif + proc_net_create("ip_input", S_IFREG | S_IRUGO | S_IWUSR, ip_fw_in_procinfo); + proc_net_create("ip_output", S_IFREG | S_IRUGO | S_IWUSR, ip_fw_out_procinfo); + proc_net_create("ip_forward", S_IFREG | S_IRUGO | S_IWUSR, ip_fw_fwd_procinfo); + + /* Register for device up/down reports */ + register_netdevice_notifier(&ipfw_dev_notifier); + +#ifdef CONFIG_IP_FIREWALL_NETLINK + ipfwsk = netlink_kernel_create(NETLINK_FIREWALL, NULL); +#endif + return ret; + + cleanup: +#ifdef CONFIG_IP_FIREWALL_NETLINK + sock_release(ipfwsk->socket); +#endif + unregister_netdevice_notifier(&ipfw_dev_notifier); + +#ifdef CONFIG_IP_ACCT + proc_net_remove("ip_acct"); +#endif + proc_net_remove("ip_input"); + proc_net_remove("ip_output"); + proc_net_remove("ip_forward"); + + free_fw_chain(chains[IP_FW_FWD]); + free_fw_chain(chains[IP_FW_IN]); + free_fw_chain(chains[IP_FW_OUT]); + free_fw_chain(chains[IP_FW_ACCT]); + + unregister_firewall(PF_INET, &ipfw_ops); + + cleanup_nothing: + return ret; +} diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c new file mode 100644 index 000000000..6e69d6a90 --- /dev/null +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -0,0 +1,368 @@ +/* + * This is a module which is used for logging packets. 
+ */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/spinlock.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/tcp.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +struct in_device; +#include <net/route.h> +#include <linux/netfilter_ipv4/ipt_LOG.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +struct esphdr { + __u32 spi; +}; /* FIXME evil kludge */ + +/* Make init and cleanup non-static, so gcc doesn't warn about unused, + but don't export the symbols */ +EXPORT_NO_SYMBOLS; + +/* Use lock to serialize, so printks don't overlap */ +static spinlock_t log_lock = SPIN_LOCK_UNLOCKED; + +/* One level of recursion won't kill us */ +static void dump_packet(const struct ipt_log_info *info, + struct iphdr *iph, unsigned int len, int recurse) +{ + void *protoh = (u_int32_t *)iph + iph->ihl; + unsigned int datalen = len - iph->ihl * 4; + + /* Important fields: + * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. 
*/ + /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ + printk("SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ", + (ntohl(iph->saddr)>>24)&0xFF, + (ntohl(iph->saddr)>>16)&0xFF, + (ntohl(iph->saddr)>>8)&0xFF, + (ntohl(iph->saddr))&0xFF, + (ntohl(iph->daddr)>>24)&0xFF, + (ntohl(iph->daddr)>>16)&0xFF, + (ntohl(iph->daddr)>>8)&0xFF, + (ntohl(iph->daddr))&0xFF); + + /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ + printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", + ntohs(iph->tot_len), iph->tos & IPTOS_TOS_MASK, + iph->tos & IPTOS_PREC_MASK, iph->ttl, ntohs(iph->id)); + + /* Max length: 6 "CE DF MF " */ + if (ntohs(iph->frag_off) & IP_CE) + printk("CE "); + if (ntohs(iph->frag_off) & IP_DF) + printk("DF "); + if (ntohs(iph->frag_off) & IP_MF) + printk("MF "); + + /* Max length: 11 "FRAG:65535 " */ + if (ntohs(iph->frag_off) & IP_OFFSET) + printk("FRAG:%u ", ntohs(iph->frag_off) & IP_OFFSET); + + if ((info->logflags & IPT_LOG_IPOPT) + && iph->ihl * 4 != sizeof(struct iphdr)) { + unsigned int i; + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + printk("OPT ("); + for (i = sizeof(struct iphdr); i < iph->ihl * 4; i++) + printk("%02X", ((u_int8_t *)iph)[i]); + printk(") "); + } + + switch (iph->protocol) { + case IPPROTO_TCP: { + struct tcphdr *tcph = protoh; + + /* Max length: 10 "PROTO=TCP " */ + printk("PROTO=TCP "); + + if (ntohs(iph->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + if (datalen < sizeof (*tcph)) { + printk("INCOMPLETE [%u bytes] ", datalen); + break; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + printk("SPT=%u DPT=%u ", + ntohs(tcph->source), ntohs(tcph->dest)); + /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ + if (info->logflags & IPT_LOG_TCPSEQ) + printk("SEQ=%u ACK=%u ", + ntohl(tcph->seq), ntohl(tcph->ack_seq)); + /* Max length: 13 "WINDOW=65535 " */ + printk("WINDOW=%u ", ntohs(tcph->window)); + /* Max length: 9 "RES=0x3F " */ + printk("RES=0x%02x ", 
(u_int8_t)(ntohl(tcp_flag_word(tcph) & TCP_RESERVED_BITS) >> 22)); + /* Max length: 36 "URG ACK PSH RST SYN FIN " */ + if (tcph->urg) + printk("URG "); + if (tcph->ack) + printk("ACK "); + if (tcph->psh) + printk("PSH "); + if (tcph->rst) + printk("RST "); + if (tcph->syn) + printk("SYN "); + if (tcph->fin) + printk("FIN "); + /* Max length: 11 "URGP=65535 " */ + printk("URGP=%u ", ntohs(tcph->urg_ptr)); + + if ((info->logflags & IPT_LOG_TCPOPT) + && tcph->doff * 4 != sizeof(struct tcphdr)) { + unsigned int i; + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + printk("OPT ("); + for (i =sizeof(struct tcphdr); i < tcph->doff * 4; i++) + printk("%02X", ((u_int8_t *)tcph)[i]); + printk(") "); + } + break; + } + case IPPROTO_UDP: { + struct udphdr *udph = protoh; + + /* Max length: 10 "PROTO=UDP " */ + printk("PROTO=UDP "); + + if (ntohs(iph->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + if (datalen < sizeof (*udph)) { + printk("INCOMPLETE [%u bytes] ", datalen); + break; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + printk("SPT=%u DPT=%u LEN=%u ", + ntohs(udph->source), ntohs(udph->dest), + ntohs(udph->len)); + break; + } + case IPPROTO_ICMP: { + struct icmphdr *icmph = protoh; + static size_t required_len[NR_ICMP_TYPES+1] + = { [ICMP_ECHOREPLY] = 4, + [ICMP_DEST_UNREACH] + = 8 + sizeof(struct iphdr) + 8, + [ICMP_SOURCE_QUENCH] + = 8 + sizeof(struct iphdr) + 8, + [ICMP_REDIRECT] + = 8 + sizeof(struct iphdr) + 8, + [ICMP_ECHO] = 4, + [ICMP_TIME_EXCEEDED] + = 8 + sizeof(struct iphdr) + 8, + [ICMP_PARAMETERPROB] + = 8 + sizeof(struct iphdr) + 8, + [ICMP_TIMESTAMP] = 20, + [ICMP_TIMESTAMPREPLY] = 20, + [ICMP_ADDRESS] = 12, + [ICMP_ADDRESSREPLY] = 12 }; + + /* Max length: 11 "PROTO=ICMP " */ + printk("PROTO=ICMP "); + + if (ntohs(iph->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + if (datalen < 4) { + printk("INCOMPLETE [%u bytes] ", datalen); + break; + } + + /* Max length: 18 
"TYPE=255 CODE=255 " */ + printk("TYPE=%u CODE=%u ", icmph->type, icmph->code); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + if (icmph->type <= NR_ICMP_TYPES + && required_len[icmph->type] + && datalen < required_len[icmph->type]) { + printk("INCOMPLETE [%u bytes] ", datalen); + break; + } + + switch (icmph->type) { + case ICMP_ECHOREPLY: + case ICMP_ECHO: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + printk("ID=%u SEQ=%u ", + ntohs(icmph->un.echo.id), + ntohs(icmph->un.echo.sequence)); + break; + + case ICMP_PARAMETERPROB: + /* Max length: 14 "PARAMETER=255 " */ + printk("PARAMETER=%u ", + ntohl(icmph->un.gateway) >> 24); + break; + case ICMP_REDIRECT: + /* Max length: 24 "GATEWAY=255.255.255.255 " */ + printk("GATEWAY=%u.%u.%u.%u ", + (ntohl(icmph->un.gateway)>>24)&0xFF, + (ntohl(icmph->un.gateway)>>16)&0xFF, + (ntohl(icmph->un.gateway)>>8)&0xFF, + (ntohl(icmph->un.gateway))&0xFF); + /* Fall through */ + case ICMP_DEST_UNREACH: + case ICMP_SOURCE_QUENCH: + case ICMP_TIME_EXCEEDED: + /* Max length: 3+maxlen */ + if (recurse) { + printk("["); + dump_packet(info, + (struct iphdr *)(icmph + 1), + datalen-sizeof(struct iphdr), + 0); + printk("] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (icmph->type == ICMP_DEST_UNREACH + && icmph->code == ICMP_FRAG_NEEDED) + printk("MTU=%u ", ntohs(icmph->un.frag.mtu)); + } + break; + } + /* Max Length */ + case IPPROTO_AH: + case IPPROTO_ESP: { + struct esphdr *esph = protoh; + int esp= (iph->protocol==IPPROTO_ESP); + + /* Max length: 10 "PROTO=ESP " */ + printk("PROTO=%s ",esp? 
"ESP" : "AH"); + + if (ntohs(iph->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + if (datalen < sizeof (*esph)) { + printk("INCOMPLETE [%u bytes] ", datalen); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + printk("SPI=0x%x ", ntohl(esph->spi) ); + break; + } + /* Max length: 10 "PROTO 255 " */ + default: + printk("PROTO=%u ", iph->protocol); + } + + /* Proto Max log string length */ + /* IP: 40+46+6+11+127 = 230 */ + /* TCP: 10+max(25,20+30+13+9+36+11+127) = 256 */ + /* UDP: 10+max(25,20) = 35 */ + /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ + /* ESP: 10+max(25)+15 = 50 */ + /* AH: 9+max(25)+15 = 49 */ + /* unknown: 10 */ + + /* (ICMP allows recursion one level deep) */ + /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */ + /* maxlen = 230+ 91 + 230 + 256 = 807 */ +} + +static unsigned int +ipt_log_target(struct sk_buff **pskb, + unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + const void *targinfo, + void *userinfo) +{ + struct iphdr *iph = (*pskb)->nh.iph; + const struct ipt_log_info *loginfo = targinfo; + char level_string[4] = "< >"; + + level_string[1] = '0' + (loginfo->level % 8); + spin_lock_bh(&log_lock); + printk(level_string); + printk("%sIN=%s OUT=%s ", + loginfo->prefix, + in ? in->name : "", + out ? out->name : ""); + if (in && !out) { + /* MAC logging for input chain only. */ + printk("MAC="); + if ((*pskb)->dev && (*pskb)->dev->hard_header_len) { + int i; + unsigned char *p = (*pskb)->mac.raw; + for (i = 0; i < (*pskb)->dev->hard_header_len; i++,p++) + printk("%02x%c", *p, + i==(*pskb)->dev->hard_header_len - 1 + ? 
' ':':'); + } + } + + dump_packet(loginfo, iph, (*pskb)->len, 1); + printk("\n"); + spin_unlock_bh(&log_lock); + + return IPT_CONTINUE; +} + +static int ipt_log_checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ipt_log_info *loginfo = targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_log_info))) { + DEBUGP("LOG: targinfosize %u != %u\n", + targinfosize, IPT_ALIGN(sizeof(struct ipt_log_info))); + return 0; + } + + if (loginfo->level >= 8) { + DEBUGP("LOG: level %u >= 8\n", loginfo->level); + return 0; + } + + if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') { + DEBUGP("LOG: prefix term %i\n", + loginfo->prefix[sizeof(loginfo->prefix)-1]); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_log_reg += { { NULL, NULL }, "LOG", ipt_log_target, ipt_log_checkentry, THIS_MODULE }; + +static int __init init(void) +{ + if (ipt_register_target(&ipt_log_reg)) + return -EINVAL; + + return 0; +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_log_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c new file mode 100644 index 000000000..32906eefe --- /dev/null +++ b/net/ipv4/netfilter/ipt_MARK.c @@ -0,0 +1,68 @@ +/* This is a module which is used for setting the NFMARK field of an skb. 
*/ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <net/checksum.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_MARK.h> + +EXPORT_NO_SYMBOLS; + +static unsigned int +target(struct sk_buff **pskb, + unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + const void *targinfo, + void *userinfo) +{ + const struct ipt_mark_target_info *markinfo = targinfo; + + if((*pskb)->nfmark != markinfo->mark) { + (*pskb)->nfmark = markinfo->mark; + (*pskb)->nfcache |= NFC_ALTERED; + } + return IPT_CONTINUE; +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) { + printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_mark_target_info))); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_mark_reg += { { NULL, NULL }, "MARK", target, checkentry, THIS_MODULE }; + +static int __init init(void) +{ + if (ipt_register_target(&ipt_mark_reg)) + return -EINVAL; + + return 0; +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_mark_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c new file mode 100644 index 000000000..9f94f8f44 --- /dev/null +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -0,0 +1,171 @@ +/* Masquerade. Simple mapping which alters range to a local IP address + (depending on route). 
*/ +#include <linux/types.h> +#include <linux/ip.h> +#include <linux/timer.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <net/protocol.h> +#include <net/checksum.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +EXPORT_NO_SYMBOLS; + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* Lock protects masq region inside conntrack */ +static DECLARE_RWLOCK(masq_lock); + +/* FIXME: Multiple targets. --RR */ +static int +masquerade_check(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ip_nat_multi_range *mr = targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(*mr))) { + DEBUGP("masquerade_check: size %u != %u.\n", + targinfosize, sizeof(*mr)); + return 0; + } + if (hook_mask & ~(1 << NF_IP_POST_ROUTING)) { + DEBUGP("masquerade_check: bad hooks %x.\n", hook_mask); + return 0; + } + if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { + DEBUGP("masquerade_check: bad MAP_IPS.\n"); + return 0; + } + if (mr->rangesize != 1) { + DEBUGP("masquerade_check: bad rangesize %u.\n", mr->rangesize); + return 0; + } + return 1; +} + +static unsigned int +masquerade_target(struct sk_buff **pskb, + unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + const void *targinfo, + void *userinfo) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + const struct ip_nat_range *r; + struct ip_nat_multi_range newrange; + u_int32_t newsrc; + struct rtable *rt; + + IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING); + + /* FIXME: For the moment, don't do local packets, breaks + testsuite for 2.3.49 --RR */ + if ((*pskb)->sk) + return NF_ACCEPT; + + ct = ip_conntrack_get(*pskb, &ctinfo); + IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW + || ctinfo == IP_CT_RELATED)); + + r = targinfo; + + if (ip_route_output(&rt, (*pskb)->nh.iph->daddr, + 0, + 
RT_TOS((*pskb)->nh.iph->tos)|RTO_CONN, + out->ifindex) != 0) { + /* Shouldn't happen */ + printk("MASQUERADE: No route: Rusty's brain broke!\n"); + return NF_DROP; + } + + newsrc = rt->rt_src; + DEBUGP("newsrc = %u.%u.%u.%u\n", IP_PARTS(newsrc)); + ip_rt_put(rt); + + WRITE_LOCK(&masq_lock); + ct->nat.masq_index = out->ifindex; + WRITE_UNLOCK(&masq_lock); + + /* Transfer from original range. */ + newrange = ((struct ip_nat_multi_range) + { 1, { { r->flags | IP_NAT_RANGE_MAP_IPS, + newsrc, newsrc, + r->min, r->max } } }); + + /* Hand modified range to generic setup. */ + return ip_nat_setup_info(ct, &newrange, hooknum); +} + +static inline int +device_cmp(const struct ip_conntrack *i, void *ifindex) +{ + int ret; + + READ_LOCK(&masq_lock); + ret = (i->nat.masq_index == (int)(long)ifindex); + READ_UNLOCK(&masq_lock); + + return ret; +} + +int masq_device_event(struct notifier_block *this, + unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + + if (event == NETDEV_DOWN) { + /* Device was downed. Search entire table for + conntracks which were associated with that device, + and forget them. 
*/ + IP_NF_ASSERT(dev->ifindex != 0); + + ip_ct_selective_cleanup(device_cmp, (void *)(long)dev->ifindex); + } + + return NOTIFY_DONE; +} + +static struct notifier_block masq_dev_notifier = { + masq_device_event, + NULL, + 0 +}; + +static struct ipt_target masquerade += { { NULL, NULL }, "MASQUERADE", masquerade_target, masquerade_check, + THIS_MODULE }; + +static int __init init(void) +{ + int ret; + + ret = ipt_register_target(&masquerade); + + if (ret == 0) { + /* Register for device down reports */ + register_netdevice_notifier(&masq_dev_notifier); + } + + return ret; +} + +static void __exit fini(void) +{ + ipt_unregister_target(&masquerade); + unregister_netdevice_notifier(&masq_dev_notifier); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_MIRROR.c b/net/ipv4/netfilter/ipt_MIRROR.c new file mode 100644 index 000000000..9dec181c1 --- /dev/null +++ b/net/ipv4/netfilter/ipt_MIRROR.c @@ -0,0 +1,131 @@ +/* + This is a module which is used for resending packets with inverted src and dst. + + Based on code from: ip_nat_dumb.c,v 1.9 1999/08/20 + and various sources. + + Copyright (C) 2000 Emmanuel Roger <winfield@freegates.be> + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2 of the License, or (at your + option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <net/ip.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netdevice.h> +#include <linux/route.h> +struct in_device; +#include <net/route.h> +EXPORT_NO_SYMBOLS; + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int route_mirror(struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + struct rtable *rt; + + if (ip_route_output(&rt, iph->daddr, iph->saddr, + RT_TOS(iph->tos) | RTO_CONN, + 0)) { + return -EINVAL; + } + /* check if the interface we are living by is the same as the one we arrived on */ + + if (skb->rx_dev != rt->u.dst.dev) { + /* Drop old route. */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + return 0; + } + else return -EINVAL; +} + +static int +ip_rewrite(struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + u32 odaddr = iph->saddr; + u32 osaddr = iph->daddr; + + skb->nfcache |= NFC_ALTERED; + + /* Rewrite IP header */ + iph->daddr = odaddr; + iph->saddr = osaddr; + + return 0; +} + + +static unsigned int ipt_mirror_target(struct sk_buff **pskb, + unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + const void *targinfo, + void *userinfo) +{ + if ((*pskb)->dst != NULL) { + if (!ip_rewrite(*pskb) && !route_mirror(*pskb)) { + ip_send(*pskb); + return NF_STOLEN; + } + } + return NF_DROP; +} + +static int ipt_mirror_checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + /* Only on INPUT, FORWARD or PRE_ROUTING, otherwise loop danger. 
*/ + if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) + | (1 << NF_IP_FORWARD) + | (1 << NF_IP_LOCAL_IN))) { + DEBUGP("MIRROR: bad hook\n"); + return 0; + } + + if (targinfosize != IPT_ALIGN(0)) { + DEBUGP("MIRROR: targinfosize %u != 0\n", targinfosize); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_mirror_reg += { { NULL, NULL }, "MIRROR", ipt_mirror_target, ipt_mirror_checkentry, + THIS_MODULE }; + +static int __init init(void) +{ + return ipt_register_target(&ipt_mirror_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_mirror_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c new file mode 100644 index 000000000..690d3a8a1 --- /dev/null +++ b/net/ipv4/netfilter/ipt_REDIRECT.c @@ -0,0 +1,104 @@ +/* Redirect. Simple mapping which alters dst to a local IP address. */ +#include <linux/types.h> +#include <linux/ip.h> +#include <linux/timer.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/netdevice.h> +#include <linux/if.h> +#include <linux/inetdevice.h> +#include <net/protocol.h> +#include <net/checksum.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> + +EXPORT_NO_SYMBOLS; + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) 
+#endif + +/* FIXME: Take multiple ranges --RR */ +static int +redirect_check(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ip_nat_multi_range *mr = targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(*mr))) { + DEBUGP("redirect_check: size %u.\n", targinfosize); + return 0; + } + if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))) { + DEBUGP("redirect_check: bad hooks %x.\n", hook_mask); + return 0; + } + if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { + DEBUGP("redirect_check: bad MAP_IPS.\n"); + return 0; + } + if (mr->rangesize != 1) { + DEBUGP("redirect_check: bad rangesize %u.\n", mr->rangesize); + return 0; + } + return 1; +} + +static unsigned int +redirect_target(struct sk_buff **pskb, + unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + const void *targinfo, + void *userinfo) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + u_int32_t newdst; + const struct ip_nat_range *r = targinfo; + struct ip_nat_multi_range newrange; + + IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING + || hooknum == NF_IP_LOCAL_OUT); + + ct = ip_conntrack_get(*pskb, &ctinfo); + IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + + /* Local packets: make them go to loopback */ + if (hooknum == NF_IP_LOCAL_OUT) + newdst = htonl(0x7F000001); + else + /* Grab first address on interface. */ + newdst = (((struct in_device *)(*pskb)->dev->ip_ptr) + ->ifa_list->ifa_local); + + /* Transfer from original range. */ + newrange = ((struct ip_nat_multi_range) + { 1, { { r->flags | IP_NAT_RANGE_MAP_IPS, + newdst, newdst, + r->min, r->max } } }); + + /* Hand modified range to generic setup. 
*/ + return ip_nat_setup_info(ct, &newrange, hooknum); +} + +static struct ipt_target redirect_reg += { { NULL, NULL }, "REDIRECT", redirect_target, redirect_check, THIS_MODULE }; + +static int __init init(void) +{ + return ipt_register_target(&redirect_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&redirect_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c new file mode 100644 index 000000000..b183e822c --- /dev/null +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -0,0 +1,145 @@ +/* + * This is a module which is used for rejecting packets. + * Added support for customized reject packets (Jozsef Kadlecsik). + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <net/icmp.h> +#include <net/tcp.h> +struct in_device; +#include <net/route.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_REJECT.h> +EXPORT_NO_SYMBOLS; + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) 
+#endif + +static unsigned int reject(struct sk_buff **pskb, + unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + const void *targinfo, + void *userinfo) +{ + const struct ipt_reject_info *reject = targinfo; + + switch (reject->with) { + case IPT_ICMP_NET_UNREACHABLE: + icmp_send(*pskb, ICMP_DEST_UNREACH, ICMP_NET_UNREACH, 0); + break; + case IPT_ICMP_HOST_UNREACHABLE: + icmp_send(*pskb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); + break; + case IPT_ICMP_PROT_UNREACHABLE: + icmp_send(*pskb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); + break; + case IPT_ICMP_PORT_UNREACHABLE: + icmp_send(*pskb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + break; + case IPT_ICMP_ECHOREPLY: { + struct icmphdr *icmph = (struct icmphdr *) + ((u_int32_t *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl); + unsigned int datalen = (*pskb)->len - (*pskb)->nh.iph->ihl * 4; + + /* Not non-head frags, or truncated */ + if (((ntohs((*pskb)->nh.iph->frag_off) & IP_OFFSET) == 0) + && datalen >= 4) { + /* Usually I don't like cut & pasting code, + but dammit, my party is starting in 45 + mins! 
--RR */ + struct icmp_bxm icmp_param; + + icmp_param.icmph=*icmph; + icmp_param.icmph.type=ICMP_ECHOREPLY; + icmp_param.data_ptr=(icmph+1); + icmp_param.data_len=datalen; + icmp_reply(&icmp_param, *pskb); + } + } + break; + case IPT_TCP_RESET: + tcp_v4_send_reset(*pskb); + break; + } + + return NF_DROP; +} + +static inline int find_ping_match(const struct ipt_entry_match *m) +{ + const struct ipt_icmp *icmpinfo = (const struct ipt_icmp *)m->data; + + if (strcmp(m->u.match->name, "icmp") == 0 + && icmpinfo->type == ICMP_ECHO + && !(icmpinfo->invflags & IPT_ICMP_INV)) + return 1; + + return 0; +} + +static int check(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ipt_reject_info *rejinfo = targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_icmp))) { + DEBUGP("REJECT: targinfosize %u != 0\n", targinfosize); + return 0; + } + + /* Only allow these for packet filtering. */ + if ((hook_mask & ~((1 << NF_IP_LOCAL_IN) + | (1 << NF_IP_FORWARD) + | (1 << NF_IP_LOCAL_OUT))) != 0) { + DEBUGP("REJECT: bad hook mask %X\n", hook_mask); + return 0; + } + + if (rejinfo->with == IPT_ICMP_ECHOREPLY) { + /* Must specify that it's an ICMP ping packet. */ + if (e->ip.proto != IPPROTO_ICMP + || (e->ip.invflags & IPT_INV_PROTO)) { + DEBUGP("REJECT: ECHOREPLY illegal for non-icmp\n"); + return 0; + } + /* Must contain ICMP match. 
*/ + if (IPT_MATCH_ITERATE(e, find_ping_match) == 0) { + DEBUGP("REJECT: ECHOREPLY illegal for non-ping\n"); + return 0; + } + } else if (rejinfo->with == IPT_TCP_RESET) { + if (e->ip.proto != IPPROTO_TCP + || (e->ip.invflags & IPT_INV_PROTO)) { + DEBUGP("REJECT: TCP_RESET illegal for non-tcp\n"); + return 0; + } + } + + return 1; +} + +static struct ipt_target ipt_reject_reg += { { NULL, NULL }, "REJECT", reject, check, THIS_MODULE }; + +static int __init init(void) +{ + if (ipt_register_target(&ipt_reject_reg)) + return -EINVAL; + return 0; +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_reject_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c new file mode 100644 index 000000000..fbfb4974f --- /dev/null +++ b/net/ipv4/netfilter/ipt_TOS.c @@ -0,0 +1,87 @@ +/* This is a module which is used for setting the TOS field of a packet. */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <net/checksum.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_TOS.h> + +EXPORT_NO_SYMBOLS; + +static unsigned int +target(struct sk_buff **pskb, + unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + const void *targinfo, + void *userinfo) +{ + struct iphdr *iph = (*pskb)->nh.iph; + const struct ipt_tos_target_info *tosinfo = targinfo; + + if ((iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) { + u_int8_t diffs[2]; + + diffs[0] = iph->tos; + iph->tos = (iph->tos & IPTOS_PREC_MASK) | tosinfo->tos; + diffs[1] = iph->tos; + iph->check = csum_fold(csum_partial((char *)diffs, + sizeof(diffs), + iph->check^0xFFFF)); + (*pskb)->nfcache |= NFC_ALTERED; + } + return IPT_CONTINUE; +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const u_int8_t tos = ((struct ipt_tos_target_info *)targinfo)->tos; 
+ + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_tos_target_info))) { + printk(KERN_WARNING "TOS: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_tos_target_info))); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_WARNING "TOS: can only be called from \"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + if (tos != IPTOS_LOWDELAY + && tos != IPTOS_THROUGHPUT + && tos != IPTOS_RELIABILITY + && tos != IPTOS_MINCOST + && tos != IPTOS_NORMALSVC) { + printk(KERN_WARNING "TOS: bad tos value %#x\n", tos); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_tos_reg += { { NULL, NULL }, "TOS", target, checkentry, THIS_MODULE }; + +static int __init init(void) +{ + if (ipt_register_target(&ipt_tos_reg)) + return -EINVAL; + + return 0; +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_tos_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_limit.c b/net/ipv4/netfilter/ipt_limit.c new file mode 100644 index 000000000..3785ba371 --- /dev/null +++ b/net/ipv4/netfilter/ipt_limit.c @@ -0,0 +1,144 @@ +/* Kernel module to control the rate + * + * Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr> + * Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr> + * + * 2 September 1999: Changed from the target RATE to the match + * `limit', removed logging. Did I mention that + * Alexey is a fucking genius? + * Rusty Russell (rusty@rustcorp.com.au). 
*/ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_limit.h> +EXPORT_NO_SYMBOLS; + +#define IP_PARTS_NATIVE(n) \ +(unsigned int)((n)>>24)&0xFF, \ +(unsigned int)((n)>>16)&0xFF, \ +(unsigned int)((n)>>8)&0xFF, \ +(unsigned int)((n)&0xFF) + +#define IP_PARTS(n) IP_PARTS_NATIVE(ntohl(n)) + +/* The algorithm used is the Simple Token Bucket Filter (TBF) + * see net/sched/sch_tbf.c in the linux source tree + */ + +static spinlock_t limit_lock = SPIN_LOCK_UNLOCKED; + +/* Rusty: This is my (non-mathematically-inclined) understanding of + this algorithm. The `average rate' in jiffies becomes your initial + amount of credit `credit' and the most credit you can ever have + `credit_cap'. The `peak rate' becomes the cost of passing the + test, `cost'. + + `prev' tracks the last packet hit: you gain one credit per jiffy. + If you get credit balance more than this, the extra credit is + discarded. Every time the match passes, you lose `cost' credits; + if you don't have that many, the test fails. + + See Alexey's formal explanation in net/sched/sch_tbf.c. + + To avoid underflow, we multiply by 128 (ie. you get 128 credits per + jiffy). Hence a cost of 2^32-1, means one pass per 32768 seconds + at 1024HZ (or one every 9 hours). A cost of 1 means 12800 passes + per second at 100HZ. 
*/ + +#define CREDITS_PER_JIFFY 128 + +static int +ipt_limit_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop) +{ + struct ipt_rateinfo *r = ((struct ipt_rateinfo *)matchinfo)->master; + unsigned long now = jiffies; + + spin_lock_bh(&limit_lock); + r->credit += (now - xchg(&r->prev, now)) * CREDITS_PER_JIFFY; + if (r->credit > r->credit_cap) + r->credit = r->credit_cap; + + if (r->credit >= r->cost) { + /* We're not limited. */ + r->credit -= r->cost; + spin_unlock_bh(&limit_lock); + return 1; + } + + spin_unlock_bh(&limit_lock); + return 0; +} + +/* Precision saver. */ +static u_int32_t +user2credits(u_int32_t user) +{ + /* If multiplying would overflow... */ + if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) + /* Divide first. */ + return (user / IPT_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY; + + return (user * HZ * CREDITS_PER_JIFFY) / IPT_LIMIT_SCALE; +} + +static int +ipt_limit_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + struct ipt_rateinfo *r = matchinfo; + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_rateinfo))) + return 0; + + /* Check for overflow. */ + if (r->burst == 0 + || user2credits(r->avg * r->burst) < user2credits(r->avg)) { + printk("Call rusty: overflow in ipt_limit: %u/%u\n", + r->avg, r->burst); + return 0; + } + + /* User avg in seconds * IPT_LIMIT_SCALE: convert to jiffies * + 128. */ + r->prev = jiffies; + r->credit = user2credits(r->avg * r->burst); /* Credits full. */ + r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */ + r->cost = user2credits(r->avg); + + /* For SMP, we only want to use one set of counters. 
*/ + r->master = r; + + return 1; +} + +static struct ipt_match ipt_limit_reg += { { NULL, NULL }, "limit", ipt_limit_match, ipt_limit_checkentry, + THIS_MODULE }; + +static int __init init(void) +{ + if (ipt_register_match(&ipt_limit_reg)) + return -EINVAL; + return 0; +} + +static void __exit fini(void) +{ + ipt_unregister_match(&ipt_limit_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_mac.c b/net/ipv4/netfilter/ipt_mac.c new file mode 100644 index 000000000..90dbec59d --- /dev/null +++ b/net/ipv4/netfilter/ipt_mac.c @@ -0,0 +1,63 @@ +/* Kernel module to match MAC address parameters. */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/if_ether.h> + +#include <linux/netfilter_ipv4/ipt_mac.h> +#include <linux/netfilter_ipv4/ip_tables.h> +EXPORT_NO_SYMBOLS; + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop) +{ + const struct ipt_mac_info *info = matchinfo; + + /* Is mac pointer valid? */ + return (skb->mac.raw >= skb->head + && skb->mac.raw < skb->head + skb->len - ETH_HLEN + /* If so, compare... 
*/ + && ((memcmp(skb->mac.ethernet->h_source, info->srcaddr, ETH_ALEN) + == 0) ^ info->invert)); +} + +static int +ipt_mac_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (hook_mask + & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN))) { + printk("ipt_mac: only valid for PRE_ROUTING or LOCAL_IN.\n"); + return 0; + } + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_mac_info))) + return 0; + + return 1; +} + +static struct ipt_match mac_match += { { NULL, NULL }, "mac", &match, &ipt_mac_checkentry, THIS_MODULE }; + +static int __init init(void) +{ + return ipt_register_match(&mac_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&mac_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c new file mode 100644 index 000000000..0d828fd20 --- /dev/null +++ b/net/ipv4/netfilter/ipt_mark.c @@ -0,0 +1,52 @@ +/* Kernel module to match NFMARK values. 
*/ +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter_ipv4/ipt_mark.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +EXPORT_NO_SYMBOLS; + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop) +{ + const struct ipt_mark_info *info = matchinfo; + + return ((skb->nfmark & info->mask) == info->mark) ^ info->invert; +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info))) + return 0; + + return 1; +} + +static struct ipt_match mark_match += { { NULL, NULL }, "mark", &match, &checkentry, THIS_MODULE }; + +static int __init init(void) +{ + return ipt_register_match(&mark_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&mark_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_multiport.c b/net/ipv4/netfilter/ipt_multiport.c new file mode 100644 index 000000000..08cc4a968 --- /dev/null +++ b/net/ipv4/netfilter/ipt_multiport.c @@ -0,0 +1,102 @@ +/* Kernel module to match one of a list of TCP/UDP ports: ports are in + the same place so we can treat them as equal. */ +#include <linux/module.h> +#include <linux/types.h> +#include <linux/udp.h> +#include <linux/skbuff.h> + +#include <linux/netfilter_ipv4/ipt_multiport.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +#if 0 +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) +#endif + +EXPORT_NO_SYMBOLS; + +/* Returns 1 if the port is matched by the test, 0 otherwise. 
*/ +static inline int +ports_match(const u_int16_t *portlist, enum ipt_multiport_flags flags, + u_int8_t count, u_int16_t src, u_int16_t dst) +{ + unsigned int i; + for (i=0; i<count; i++) { + if (flags != IPT_MULTIPORT_DESTINATION + && portlist[i] == src) + return 1; + + if (flags != IPT_MULTIPORT_SOURCE + && portlist[i] == dst) + return 1; + } + + return 0; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop) +{ + const struct udphdr *udp = hdr; + const struct ipt_multiport *multiinfo = matchinfo; + + /* Must be big enough to read ports. */ + if (offset == 0 && datalen < sizeof(struct udphdr)) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + duprintf("ipt_multiport:" + " Dropping evil offset=0 tinygram.\n"); + *hotdrop = 1; + return 0; + } + + /* Must not be a fragment. */ + return !offset + && ports_match(multiinfo->ports, + multiinfo->flags, multiinfo->count, + ntohs(udp->source), ntohs(udp->dest)); +} + +/* Called when user tries to insert an entry of this type. 
*/ +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_multiport *multiinfo = matchinfo; + + /* Must specify proto == TCP/UDP, no unknown flags or bad count */ + return (ip->proto == IPPROTO_TCP || ip->proto == IPPROTO_UDP) + && !(ip->flags & IPT_INV_PROTO) + && matchsize == IPT_ALIGN(sizeof(struct ipt_multiport)) + && (multiinfo->flags == IPT_MULTIPORT_SOURCE + || multiinfo->flags == IPT_MULTIPORT_DESTINATION + || multiinfo->flags == IPT_MULTIPORT_EITHER) + && multiinfo->count <= IPT_MULTI_PORTS; +} + +static struct ipt_match multiport_match += { { NULL, NULL }, "multiport", &match, &checkentry, THIS_MODULE }; + +static int __init init(void) +{ + return ipt_register_match(&multiport_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&multiport_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c new file mode 100644 index 000000000..5438571d3 --- /dev/null +++ b/net/ipv4/netfilter/ipt_owner.c @@ -0,0 +1,136 @@ +/* Kernel module to match various things tied to sockets associated with + locally generated outgoing packets. 
+ + (C)2000 Marc Boucher + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/file.h> +#include <net/sock.h> + +#include <linux/netfilter_ipv4/ipt_owner.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +EXPORT_NO_SYMBOLS; + +static int +match_pid(const struct sk_buff *skb, pid_t pid) +{ + struct task_struct *p; + int i; + + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + if(p && p->files) { + for (i=0; i < p->files->max_fds; i++) { + if (fcheck_task(p, i) == skb->sk->socket->file) { + read_unlock(&tasklist_lock); + return 1; + } + } + } + read_unlock(&tasklist_lock); + return 0; +} + +static int +match_sid(const struct sk_buff *skb, pid_t sid) +{ + struct task_struct *p; + int i, found=0; + + read_lock(&tasklist_lock); + for_each_task(p) { + if ((p->session != sid) || !p->files) + continue; + + for (i=0; i < p->files->max_fds; i++) { + if (fcheck_task(p, i) == skb->sk->socket->file) { + found = 1; + break; + } + } + if(found) + break; + } + read_unlock(&tasklist_lock); + + return found; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop) +{ + const struct ipt_owner_info *info = matchinfo; + + if (!skb->sk || !skb->sk->socket || !skb->sk->socket->file) + return 0; + + if(info->match & IPT_OWNER_UID) { + if((skb->sk->socket->file->f_uid != info->uid) ^ + !!(info->invert & IPT_OWNER_UID)) + return 0; + } + + if(info->match & IPT_OWNER_GID) { + if((skb->sk->socket->file->f_gid != info->gid) ^ + !!(info->invert & IPT_OWNER_GID)) + return 0; + } + + if(info->match & IPT_OWNER_PID) { + if (!match_pid(skb, info->pid) ^ + !!(info->invert & IPT_OWNER_PID)) + return 0; + } + + if(info->match & IPT_OWNER_SID) { + if (!match_sid(skb, info->sid) ^ + !!(info->invert & IPT_OWNER_SID)) + return 0; + } + + return 1; +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + 
void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (hook_mask + & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) { + printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n"); + return 0; + } + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_owner_info))) + return 0; + + return 1; +} + +static struct ipt_match owner_match += { { NULL, NULL }, "owner", &match, &checkentry, THIS_MODULE }; + +static int __init init(void) +{ + return ipt_register_match(&owner_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&owner_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_state.c b/net/ipv4/netfilter/ipt_state.c new file mode 100644 index 000000000..1baa54d62 --- /dev/null +++ b/net/ipv4/netfilter/ipt_state.c @@ -0,0 +1,61 @@ +/* Kernel module to match connection tracking information. + * GPL (C) 1999 Rusty Russell (rusty@rustcorp.com.au). + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_state.h> +EXPORT_NO_SYMBOLS; + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop) +{ + const struct ipt_state_info *sinfo = matchinfo; + enum ip_conntrack_info ctinfo; + unsigned int statebit; + + if (!ip_conntrack_get((struct sk_buff *)skb, &ctinfo)) + statebit = IPT_STATE_INVALID; + else + statebit = IPT_STATE_BIT(ctinfo); + + return (sinfo->statemask & statebit); +} + +static int check(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_state_info))) + return 0; + + return 1; +} + +static struct ipt_match state_match += { { NULL, NULL }, "state", &match, &check, THIS_MODULE }; + +static int 
__init init(void) +{ + __MOD_INC_USE_COUNT(ip_conntrack_module); + return ipt_register_match(&state_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&state_match); + __MOD_DEC_USE_COUNT(ip_conntrack_module); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_tos.c b/net/ipv4/netfilter/ipt_tos.c new file mode 100644 index 000000000..6da72b2d8 --- /dev/null +++ b/net/ipv4/netfilter/ipt_tos.c @@ -0,0 +1,53 @@ +/* Kernel module to match TOS values. */ +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter_ipv4/ipt_tos.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +EXPORT_NO_SYMBOLS; + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop) +{ + const struct ipt_tos_info *info = matchinfo; + const struct iphdr *iph = skb->nh.iph; + + return (iph->tos == info->tos) ^ info->invert; +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_tos_info))) + return 0; + + return 1; +} + +static struct ipt_match tos_match += { { NULL, NULL }, "tos", &match, &checkentry, THIS_MODULE }; + +static int __init init(void) +{ + return ipt_register_match(&tos_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&tos_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_unclean.c b/net/ipv4/netfilter/ipt_unclean.c new file mode 100644 index 000000000..056224a87 --- /dev/null +++ b/net/ipv4/netfilter/ipt_unclean.c @@ -0,0 +1,576 @@ +/* Kernel module to match suspect packets. 
*/ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <linux/tcp.h> +#include <linux/icmp.h> +#include <net/checksum.h> + +#include <linux/netfilter_ipv4/ip_tables.h> + +EXPORT_NO_SYMBOLS; + +#define limpk(format, args...) \ +do { \ + if (net_ratelimit()) \ + printk("ipt_unclean: %s" format, \ + embedded ? "(embedded packet) " : "" , ## args); \ +} while(0) + +enum icmp_error_status +{ + ICMP_MAY_BE_ERROR, + ICMP_IS_ERROR, + ICMP_NOT_ERROR +}; + +struct icmp_info +{ + size_t min_len, max_len; + enum icmp_error_status err; + u_int8_t min_code, max_code; +}; + +static int +check_ip(struct iphdr *iph, size_t length, int embedded); + +/* ICMP-specific checks. */ +static int +check_icmp(const struct icmphdr *icmph, + u_int16_t datalen, + unsigned int offset, + int more_frags, + int embedded) +{ + static struct icmp_info info[] + = { [ICMP_ECHOREPLY] + = { 8, 65536, ICMP_NOT_ERROR, 0, 0 }, + [ICMP_DEST_UNREACH] + = { 8 + 28, 65536, ICMP_IS_ERROR, 0, 15 }, + [ICMP_SOURCE_QUENCH] + = { 8 + 28, 65536, ICMP_IS_ERROR, 0, 0 }, + [ICMP_REDIRECT] + = { 8 + 28, 65536, ICMP_IS_ERROR, 0, 3 }, + [ICMP_ECHO] + = { 8, 65536, ICMP_NOT_ERROR, 0, 0 }, + /* Router advertisement. */ + [9] + = { 8, 8 + 255 * 8, ICMP_NOT_ERROR, 0, 0 }, + /* Router solicitation. */ + [10] + = { 8, 8, ICMP_NOT_ERROR, 0, 0 }, + [ICMP_TIME_EXCEEDED] + = { 8 + 28, 65536, ICMP_IS_ERROR, 0, 1 }, + [ICMP_PARAMETERPROB] + = { 8 + 28, 65536, ICMP_IS_ERROR, 0, 1 }, + [ICMP_TIMESTAMP] + = { 20, 20, ICMP_NOT_ERROR, 0, 0 }, + [ICMP_TIMESTAMPREPLY] + = { 20, 20, ICMP_NOT_ERROR, 0, 0 }, + [ICMP_INFO_REQUEST] + = { 8, 65536, ICMP_NOT_ERROR, 0, 0 }, + [ICMP_INFO_REPLY] + = { 8, 65536, ICMP_NOT_ERROR, 0, 0 }, + [ICMP_ADDRESS] + = { 12, 12, ICMP_NOT_ERROR, 0, 0 }, + [ICMP_ADDRESSREPLY] + = { 12, 12, ICMP_NOT_ERROR, 0, 0 } }; + + /* Can't do anything if it's a fragment. */ + if (!offset) + return 1; + + /* Must cover type and code. 
*/ + if (datalen < 2) { + limpk("ICMP len=%u too short\n", datalen); + return 0; + } + + /* If not embedded. */ + if (!embedded) { + /* Bad checksum? Don't print, just drop. */ + if (!more_frags + && ip_compute_csum((unsigned char *) icmph, datalen) != 0) + return 0; + + /* CHECK: Truncated ICMP (even if first fragment). */ + if (icmph->type < sizeof(info)/sizeof(struct icmp_info) + && info[icmph->type].min_len != 0 + && datalen < info[icmph->type].min_len) { + limpk("ICMP type %u len %u too short\n", + icmph->type, datalen); + return 0; + } + + /* CHECK: Check within known error ICMPs. */ + if (icmph->type < sizeof(info)/sizeof(struct icmp_info) + && info[icmph->type].err == ICMP_IS_ERROR) { + /* CHECK: Embedded packet must be at least + length of iph + 8 bytes. */ + struct iphdr *inner = (void *)icmph + 8; + + if (datalen - 8 < sizeof(struct iphdr)) { + limpk("ICMP error internal way too short\n"); + return 0; + } + if (datalen - 8 < inner->ihl*4 + 8) { + limpk("ICMP error internal too short\n"); + return 0; + } + if (!check_ip(inner, datalen - 8, 1)) + return 0; + } + } else { + /* CHECK: Can't embed ICMP unless known non-error. */ + if (icmph->type >= sizeof(info)/sizeof(struct icmp_info) + || info[icmph->type].err != ICMP_NOT_ERROR) { + limpk("ICMP type %u not embeddable\n", + icmph->type); + return 0; + } + } + + /* CHECK: Invalid ICMP codes. */ + if (icmph->type < sizeof(info)/sizeof(struct icmp_info) + && (icmph->code < info[icmph->type].min_code + || icmph->code > info[icmph->type].max_code)) { + limpk("ICMP type=%u code=%u\n", + icmph->type, icmph->code); + return 0; + } + + /* CHECK: Above maximum length. */ + if (icmph->type < sizeof(info)/sizeof(struct icmp_info) + && info[icmph->type].max_len != 0 + && datalen > info[icmph->type].max_len) { + limpk("ICMP type=%u too long: %u bytes\n", + icmph->type, datalen); + return 0; + } + + switch (icmph->type) { + case ICMP_PARAMETERPROB: { + /* CHECK: Problem param must be within error packet's + * IP header. 
*/ + struct iphdr *iph = (void *)icmph + 8; + u_int32_t arg = ntohl(icmph->un.gateway); + + if (icmph->code == 0) { + if ((arg >> 24) >= iph->ihl*4) { + limpk("ICMP PARAMETERPROB ptr = %u\n", + ntohl(icmph->un.gateway) >> 24); + return 0; + } + arg &= 0x00FFFFFF; + } + + /* CHECK: Rest must be zero. */ + if (arg) { + limpk("ICMP PARAMETERPROB nonzero arg = %u\n", + arg); + return 0; + } + break; + } + + case ICMP_TIME_EXCEEDED: + case ICMP_SOURCE_QUENCH: + /* CHECK: Unused must be zero. */ + if (icmph->un.gateway != 0) { + limpk("ICMP type=%u unused = %u\n", + icmph->type, ntohl(icmph->un.gateway)); + return 0; + } + break; + } + + return 1; +} + +/* UDP-specific checks. */ +static int +check_udp(const struct iphdr *iph, + const struct udphdr *udph, + u_int16_t datalen, + unsigned int offset, + int more_frags, + int embedded) +{ + /* Can't do anything if it's a fragment. */ + if (!offset) + return 1; + + /* CHECK: Must cover UDP header. */ + if (datalen < sizeof(struct udphdr)) { + limpk("UDP len=%u too short\n", datalen); + return 0; + } + + /* Bad checksum? Don't print, just drop. */ + /* FIXME: SRC ROUTE packets won't match checksum --RR */ + if (!more_frags && !embedded + && csum_tcpudp_magic(iph->saddr, iph->daddr, datalen, IPPROTO_UDP, + csum_partial((char *)udph, datalen, 0)) != 0) + return 0; + + /* CHECK: Ports can't be zero. */ + if (!udph->source || !udph->dest) { + limpk("UDP zero ports %u/%u\n", + ntohs(udph->source), ntohs(udph->dest)); + return 0; + } + + if (!more_frags) { + if (!embedded) { + /* CHECK: UDP length must match. */ + if (ntohs(udph->len) != datalen) { + limpk("UDP len too short %u vs %u\n", + ntohs(udph->len), datalen); + return 0; + } + } else { + /* CHECK: UDP length be >= this truncated pkt. */ + if (ntohs(udph->len) < datalen) { + limpk("UDP len too long %u vs %u\n", + ntohs(udph->len), datalen); + return 0; + } + } + } else { + /* CHECK: UDP length must be > this frag's length. 
*/ + if (ntohs(udph->len) <= datalen) { + limpk("UDP fragment len too short %u vs %u\n", + ntohs(udph->len), datalen); + return 0; + } + } + + return 1; +} + +#define TH_FIN 0x01 +#define TH_SYN 0x02 +#define TH_RST 0x04 +#define TH_PUSH 0x08 +#define TH_ACK 0x10 +#define TH_URG 0x20 + +/* TCP-specific checks. */ +static int +check_tcp(const struct iphdr *iph, + const struct tcphdr *tcph, + u_int16_t datalen, + unsigned int offset, + int more_frags, + int embedded) +{ + u_int8_t *opt = (u_int8_t *)(tcph + 1); + u_int8_t tcpflags; + int end_of_options = 0; + size_t i; + + /* CHECK: Can't have offset=1: used to override TCP syn-checks. */ + /* In fact, this is caught below (offset < 516). */ + + /* Can't do anything if it's a fragment. */ + if (!offset) + return 1; + + /* CHECK: Smaller than minimal TCP hdr. */ + if (datalen < sizeof(struct tcphdr)) { + if (!embedded) { + limpk("Packet length %u < TCP header.\n", datalen); + return 0; + } + /* Must have ports available (datalen >= 8). */ + /* CHECK: TCP ports inside ICMP error */ + if (!tcph->source || !tcph->dest) { + limpk("Zero TCP ports %u/%u.\n", + htons(tcph->source), htons(tcph->dest)); + return 0; + } + return 1; + } + + /* CHECK: Smaller than actual TCP hdr. */ + if (datalen < tcph->doff * 4) { + if (!embedded) { + limpk("Packet length %u < actual TCP header.\n", + datalen); + return 0; + } else + return 1; + } + + /* Bad checksum? Don't print, just drop. */ + /* FIXME: SRC ROUTE packets won't match checksum --RR */ + if (!more_frags && !embedded + && csum_tcpudp_magic(iph->saddr, iph->daddr, datalen, IPPROTO_TCP, + csum_partial((char *)tcph, datalen, 0)) != 0) + return 0; + + /* CHECK: TCP ports non-zero */ + if (!tcph->source || !tcph->dest) { + limpk("Zero TCP ports %u/%u.\n", + htons(tcph->source), htons(tcph->dest)); + return 0; + } + + /* CHECK: TCP reserved bits zero. */ + if(tcp_flag_word(tcph) & TCP_RESERVED_BITS) { + limpk("TCP reserved bits not zero\n"); + return 0; + } + + /* CHECK: TCP flags. 
*/ + tcpflags = ((u_int8_t *)tcph)[13]; + if (tcpflags != TH_SYN + && tcpflags != (TH_SYN|TH_ACK) + && tcpflags != (TH_RST|TH_ACK) + && tcpflags != (TH_RST|TH_ACK|TH_PUSH) + && tcpflags != (TH_FIN|TH_ACK) + && tcpflags != TH_ACK + && tcpflags != (TH_ACK|TH_PUSH) + && tcpflags != (TH_ACK|TH_URG) + && tcpflags != (TH_ACK|TH_URG|TH_PUSH) + && tcpflags != (TH_FIN|TH_ACK|TH_PUSH) + && tcpflags != (TH_FIN|TH_ACK|TH_URG) + && tcpflags != (TH_FIN|TH_ACK|TH_URG|TH_PUSH)) { + limpk("TCP flags bad: %u\n", tcpflags); + return 0; + } + + for (i = sizeof(struct tcphdr); i < tcph->doff * 4; ) { + switch (opt[i]) { + case 0: + end_of_options = 1; + i++; + break; + case 1: + i++; + break; + default: + /* CHECK: options after EOO. */ + if (end_of_options) { + limpk("TCP option %u after end\n", + opt[i]); + return 0; + } + /* CHECK: options at tail. */ + else if (i+1 >= tcph->doff * 4) { + limpk("TCP option %u at tail\n", + opt[i]); + return 0; + } + /* CHECK: zero-length options. */ + else if (opt[i+1] == 0) { + limpk("TCP option %u 0 len\n", + opt[i]); + return 0; + } + /* CHECK: oversize options. */ + else if (opt[i+1] + i >= tcph->doff * 4) { + limpk("TCP option %u at %Zu too long\n", + (unsigned int) opt[i], i); + return 0; + } + } + } + + return 1; +} + +/* Returns 1 if ok */ +/* Standard IP checks. */ +static int +check_ip(struct iphdr *iph, size_t length, int embedded) +{ + u_int8_t *opt = (u_int8_t *)(iph + 1); + int end_of_options = 0; + void *protoh; + size_t datalen; + unsigned int i; + unsigned int offset; + + /* Should only happen for local outgoing raw-socket packets. */ + /* CHECK: length >= ip header. */ + if (length < sizeof(struct iphdr) || length < iph->ihl * 4) { + limpk("Packet length %Zu < IP header.\n", length); + return 0; + } + + offset = ntohs(iph->frag_off) & IP_OFFSET; + protoh = (void *)iph + iph->ihl * 4; + datalen = length - iph->ihl * 4; + + /* CHECK: Embedded fragment. 
*/ + if (embedded && offset) { + limpk("Embedded fragment.\n"); + return 0; + } + + for (i = sizeof(struct iphdr); i < iph->ihl * 4; ) { + switch (opt[i]) { + case 0: + end_of_options = 1; + i++; + break; + case 1: + i++; + break; + default: + /* CHECK: options after EOO. */ + if (end_of_options) { + limpk("IP option %u after end\n", + opt[i]); + return 0; + } + /* CHECK: options at tail. */ + else if (i+1 >= iph->ihl * 4) { + limpk("IP option %u at tail\n", + opt[i]); + return 0; + } + /* CHECK: zero-length options. */ + else if (opt[i+1] == 0) { + limpk("IP option %u 0 len\n", + opt[i]); + return 0; + } + /* CHECK: oversize options. */ + else if (opt[i+1] + i >= iph->ihl * 4) { + limpk("IP option %u at %u too long\n", + opt[i], i); + return 0; + } + } + } + + /* Fragment checks. */ + + /* CHECK: More fragments, but doesn't fill 8-byte boundary. */ + if ((ntohs(iph->frag_off) & IP_MF) + && (ntohs(iph->tot_len) % 8) != 0) { + limpk("Truncated fragment %u long.\n", ntohs(iph->tot_len)); + return 0; + } + + /* CHECK: Oversize fragment a-la Ping of Death. */ + if (offset * 8 + datalen > 65535) { + limpk("Oversize fragment to %u.\n", offset * 8); + return 0; + } + + /* CHECK: DF set and offset or MF set. */ + if ((ntohs(iph->frag_off) & IP_DF) + && (offset || (ntohs(iph->frag_off) & IP_MF))) { + limpk("DF set and offset=%u, MF=%u.\n", + offset, ntohs(iph->frag_off) & IP_MF); + return 0; + } + + /* CHECK: Zero-sized fragments. */ + if ((offset || (ntohs(iph->frag_off) & IP_MF)) + && datalen == 0) { + limpk("Zero size fragment offset=%u\n", offset); + return 0; + } + + /* Note: we can have even middle fragments smaller than this: + consider a large packet passing through a 600MTU then + 576MTU link: this gives a fragment of 24 data bytes. But + everyone packs fragments largest first, hence a fragment + can't START before 576 - MAX_IP_HEADER_LEN. 
*/ + + /* Used to be min-size 576: I recall Alan Cox saying ax25 goes + down to 128 (576 taken from RFC 791: All hosts must be + prepared to accept datagrams of up to 576 octets). Use 128 + here. */ +#define MIN_LIKELY_MTU 128 + /* CHECK: Min size of first frag = 128. */ + if ((ntohs(iph->frag_off) & IP_MF) + && offset == 0 + && ntohs(iph->tot_len) < MIN_LIKELY_MTU) { + limpk("First fragment size %u < %u\n", ntohs(iph->tot_len), + MIN_LIKELY_MTU); + return 0; + } + + /* CHECK: Min offset of frag = 128 - 60 (max IP hdr len). */ + if (offset && offset * 8 < MIN_LIKELY_MTU - 60) { + limpk("Fragment starts at %u < %u\n", offset * 8, + MIN_LIKELY_MTU-60); + return 0; + } + + /* CHECK: Protocol specification non-zero. */ + if (iph->protocol == 0) { + limpk("Zero protocol\n"); + return 0; + } + + /* Per-protocol checks. */ + switch (iph->protocol) { + case IPPROTO_ICMP: + return check_icmp(protoh, datalen, offset, + (ntohs(iph->frag_off) & IP_MF), + embedded); + + case IPPROTO_UDP: + return check_udp(iph, protoh, datalen, offset, + (ntohs(iph->frag_off) & IP_MF), + embedded); + + case IPPROTO_TCP: + return check_tcp(iph, protoh, datalen, offset, + (ntohs(iph->frag_off) & IP_MF), + embedded); + default: + /* Ignorance is bliss. */ + return 1; + } +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop) +{ + return !check_ip(skb->nh.iph, skb->len, 0); +} + +/* Called when user tries to insert an entry of this type. 
*/ +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(0)) + return 0; + + return 1; +} + +static struct ipt_match unclean_match += { { NULL, NULL }, "unclean", &match, &checkentry, THIS_MODULE }; + +static int __init init(void) +{ + return ipt_register_match(&unclean_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&unclean_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c new file mode 100644 index 000000000..a10bb3682 --- /dev/null +++ b/net/ipv4/netfilter/iptable_filter.c @@ -0,0 +1,181 @@ +/* + * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + */ +#include <linux/module.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +#define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT)) + +/* Standard entry. 
*/ +struct ipt_standard +{ + struct ipt_entry entry; + struct ipt_standard_target target; +}; + +struct ipt_error_target +{ + struct ipt_entry_target target; + char errorname[IPT_FUNCTION_MAXNAMELEN]; +}; + +struct ipt_error +{ + struct ipt_entry entry; + struct ipt_error_target target; +}; + +static struct +{ + struct ipt_replace repl; + struct ipt_standard entries[3]; + struct ipt_error term; +} initial_table __initdata += { { "filter", FILTER_VALID_HOOKS, 4, + sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), + { [NF_IP_LOCAL_IN] 0, + [NF_IP_FORWARD] sizeof(struct ipt_standard), + [NF_IP_LOCAL_OUT] sizeof(struct ipt_standard) * 2 }, + { [NF_IP_LOCAL_IN] 0, + [NF_IP_FORWARD] sizeof(struct ipt_standard), + [NF_IP_LOCAL_OUT] sizeof(struct ipt_standard) * 2 }, + 0, NULL, { } }, + { + /* LOCAL_IN */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { sizeof(struct ipt_standard_target), { "" }, { } }, + -NF_ACCEPT - 1 } }, + /* FORWARD */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { sizeof(struct ipt_standard_target), { "" }, { } }, + -NF_ACCEPT - 1 } }, + /* LOCAL_OUT */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { sizeof(struct ipt_standard_target), { "" }, { } }, + -NF_ACCEPT - 1 } } + }, + /* ERROR */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_error), + 0, { 0, 0 }, { } }, + { { sizeof(struct ipt_error_target), { IPT_ERROR_TARGET }, + { } }, + "ERROR" + } + } +}; + +static struct ipt_table packet_filter += { { NULL, NULL }, "filter", &initial_table.repl, + FILTER_VALID_HOOKS, RW_LOCK_UNLOCKED, NULL }; + +/* The work comes in here from 
netfilter.c. */ +static unsigned int +ipt_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL); +} + +static unsigned int +ipt_local_out_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + if (net_ratelimit()) + printk("ipt_hook: happy cracking.\n"); + return NF_ACCEPT; + } + + return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL); +} + +static struct nf_hook_ops ipt_ops[] += { { { NULL, NULL }, ipt_hook, PF_INET, NF_IP_LOCAL_IN, NF_IP_PRI_FILTER }, + { { NULL, NULL }, ipt_hook, PF_INET, NF_IP_FORWARD, NF_IP_PRI_FILTER }, + { { NULL, NULL }, ipt_local_out_hook, PF_INET, NF_IP_LOCAL_OUT, + NF_IP_PRI_FILTER } +}; + +/* Default to no forward for security reasons. 
*/ +static int forward = NF_DROP; +MODULE_PARM(forward, "i"); + +static int __init init(void) +{ + int ret; + + if (forward < 0 || forward > NF_MAX_VERDICT) { + printk("iptables forward must be 0 or 1\n"); + return -EINVAL; + } + + /* Entry 1 is the FORWARD hook */ + initial_table.entries[1].target.verdict = -forward - 1; + + /* Register table */ + ret = ipt_register_table(&packet_filter); + if (ret < 0) + return ret; + + /* Register hooks */ + ret = nf_register_hook(&ipt_ops[0]); + if (ret < 0) + goto cleanup_table; + + ret = nf_register_hook(&ipt_ops[1]); + if (ret < 0) + goto cleanup_hook0; + + ret = nf_register_hook(&ipt_ops[2]); + if (ret < 0) + goto cleanup_hook1; + + return ret; + + cleanup_hook1: + nf_unregister_hook(&ipt_ops[1]); + cleanup_hook0: + nf_unregister_hook(&ipt_ops[0]); + cleanup_table: + ipt_unregister_table(&packet_filter); + + return ret; +} + +static void __exit fini(void) +{ + unsigned int i; + + for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++) + nf_unregister_hook(&ipt_ops[i]); + + ipt_unregister_table(&packet_filter); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c new file mode 100644 index 000000000..ef506ca7a --- /dev/null +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -0,0 +1,152 @@ +/* + * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + */ +#include <linux/module.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +#define MANGLE_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT)) + +/* Standard entry. 
*/ +struct ipt_standard +{ + struct ipt_entry entry; + struct ipt_standard_target target; +}; + +struct ipt_error_target +{ + struct ipt_entry_target target; + char errorname[IPT_FUNCTION_MAXNAMELEN]; +}; + +struct ipt_error +{ + struct ipt_entry entry; + struct ipt_error_target target; +}; + +static struct +{ + struct ipt_replace repl; + struct ipt_standard entries[2]; + struct ipt_error term; +} initial_table __initdata += { { "mangle", MANGLE_VALID_HOOKS, 3, + sizeof(struct ipt_standard) * 2 + sizeof(struct ipt_error), + { [NF_IP_PRE_ROUTING] 0, + [NF_IP_LOCAL_OUT] sizeof(struct ipt_standard) }, + { [NF_IP_PRE_ROUTING] 0, + [NF_IP_LOCAL_OUT] sizeof(struct ipt_standard) }, + 0, NULL, { } }, + { + /* PRE_ROUTING */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { sizeof(struct ipt_standard_target), { "" }, { } }, + -NF_ACCEPT - 1 } }, + /* LOCAL_OUT */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { sizeof(struct ipt_standard_target), { "" }, { } }, + -NF_ACCEPT - 1 } } + }, + /* ERROR */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_error), + 0, { 0, 0 }, { } }, + { { sizeof(struct ipt_error_target), { IPT_ERROR_TARGET }, + { } }, + "ERROR" + } + } +}; + +static struct ipt_table packet_mangler += { { NULL, NULL }, "mangle", &initial_table.repl, + MANGLE_VALID_HOOKS, RW_LOCK_UNLOCKED, NULL }; + +/* The work comes in here from netfilter.c. 
*/ +static unsigned int +ipt_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL); +} + +static unsigned int +ipt_local_out_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + if (net_ratelimit()) + printk("ipt_hook: happy cracking.\n"); + return NF_ACCEPT; + } + + return ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL); +} + +static struct nf_hook_ops ipt_ops[] += { { { NULL, NULL }, ipt_hook, PF_INET, NF_IP_PRE_ROUTING, NF_IP_PRI_MANGLE }, + { { NULL, NULL }, ipt_local_out_hook, PF_INET, NF_IP_LOCAL_OUT, + NF_IP_PRI_MANGLE } +}; + +static int __init init(void) +{ + int ret; + + /* Register table */ + ret = ipt_register_table(&packet_mangler); + if (ret < 0) + return ret; + + /* Register hooks */ + ret = nf_register_hook(&ipt_ops[0]); + if (ret < 0) + goto cleanup_table; + + ret = nf_register_hook(&ipt_ops[1]); + if (ret < 0) + goto cleanup_hook0; + + return ret; + + cleanup_hook0: + nf_unregister_hook(&ipt_ops[0]); + cleanup_table: + ipt_unregister_table(&packet_mangler); + + return ret; +} + +static void __exit fini(void) +{ + unsigned int i; + + for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++) + nf_unregister_hook(&ipt_ops[i]); + + ipt_unregister_table(&packet_mangler); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4e649eded..c683f2f23 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -5,7 +5,7 @@ * * ROUTE - implementation of the IP router. 
* - * Version: $Id: route.c,v 1.81 2000/02/09 11:16:42 davem Exp $ + * Version: $Id: route.c,v 1.82 2000/03/17 14:41:52 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -1187,10 +1187,7 @@ ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, rth->rt_dst = daddr; rth->key.tos = tos; #ifdef CONFIG_IP_ROUTE_FWMARK - if (skb->nfreason == NF_REASON_FOR_ROUTING) - rth->key.fwmark = skb->nfmark; - else - rth->key.fwmark = 0; + rth->key.fwmark = skb->nfmark; #endif rth->key.src = saddr; rth->rt_src = saddr; @@ -1269,10 +1266,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, key.src = saddr; key.tos = tos; #ifdef CONFIG_IP_ROUTE_FWMARK - if (skb->nfreason == NF_REASON_FOR_ROUTING) - key.fwmark = skb->nfmark; - else - key.fwmark = 0; + key.fwmark = skb->nfmark; #endif key.iif = dev->ifindex; key.oif = 0; @@ -1395,10 +1389,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, rth->rt_dst = daddr; rth->key.tos = tos; #ifdef CONFIG_IP_ROUTE_FWMARK - if (skb->nfreason == NF_REASON_FOR_ROUTING) - rth->key.fwmark = skb->nfmark; - else - rth->key.fwmark = 0; + rth->key.fwmark = skb->nfmark; #endif rth->key.src = saddr; rth->rt_src = saddr; @@ -1473,10 +1464,7 @@ local_input: rth->rt_dst = daddr; rth->key.tos = tos; #ifdef CONFIG_IP_ROUTE_FWMARK - if (skb->nfreason == NF_REASON_FOR_ROUTING) - rth->key.fwmark = skb->nfmark; - else - rth->key.fwmark = 0; + rth->key.fwmark = skb->nfmark; #endif rth->key.src = saddr; rth->rt_src = saddr; @@ -1563,9 +1551,7 @@ int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr, rth->key.iif == iif && rth->key.oif == 0 && #ifdef CONFIG_IP_ROUTE_FWMARK - rth->key.fwmark - == (skb->nfreason == NF_REASON_FOR_ROUTING - ? 
skb->nfmark : 0) && + rth->key.fwmark == skb->nfmark && #endif rth->key.tos == tos) { rth->u.dst.lastuse = jiffies; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 204f25574..1edee9f51 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.201 2000/03/08 19:36:42 davem Exp $ + * Version: $Id: tcp_ipv4.c,v 1.202 2000/03/17 14:41:53 davem Exp $ * * IPv4 specific functions * @@ -72,8 +72,6 @@ extern int sysctl_ip_dynaddr; struct inode tcp_inode; struct socket *tcp_socket=&tcp_inode.u.socket_i; -static void tcp_v4_send_reset(struct sk_buff *skb); - void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, struct sk_buff *skb); @@ -1059,7 +1057,7 @@ void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, * Exception: precedence violation. We do not implement it in any case. */ -static void tcp_v4_send_reset(struct sk_buff *skb) +void tcp_v4_send_reset(struct sk_buff *skb) { struct tcphdr *th = skb->h.th; struct tcphdr rth; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 818ad66ca..c52797d70 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -6,7 +6,7 @@ * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * - * $Id: sit.c,v 1.35 2000/01/06 00:42:08 davem Exp $ + * $Id: sit.c,v 1.36 2000/03/17 14:42:08 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -15,6 +15,7 @@ */ #define __NO_VERSION__ +#include <linux/config.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> @@ -388,6 +389,10 @@ int ipip6_rcv(struct sk_buff *skb, unsigned short len) skb->dev = tunnel->dev; dst_release(skb->dst); skb->dst = NULL; +#ifdef CONFIG_NETFILTER + nf_conntrack_put(skb->nfct); + skb->nfct = NULL; +#endif netif_rx(skb); read_unlock(&ipip6_lock); return 0; @@ -547,6 +552,11 @@ static int 
ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) ip_select_ident(iph, &rt->u.dst); ip_send_check(iph); +#ifdef CONFIG_NETFILTER + nf_conntrack_put(skb->nfct); + skb->nfct = NULL; +#endif + stats->tx_bytes += skb->len; stats->tx_packets++; ip_send(skb); diff --git a/net/khttpd/security.c b/net/khttpd/security.c index df346aadb..bd578941d 100644 --- a/net/khttpd/security.c +++ b/net/khttpd/security.c @@ -115,7 +115,7 @@ struct file *OpenFileForSecurity(char *Filename) - filp = filp_open(Filename,00,O_RDONLY); + filp = filp_open(Filename, 0, O_RDONLY, NULL); if ((IS_ERR(filp))||(filp==NULL)||(filp->f_dentry==NULL)) diff --git a/net/netsyms.c b/net/netsyms.c index 48cd5b503..c6745cafe 100644 --- a/net/netsyms.c +++ b/net/netsyms.c @@ -229,6 +229,7 @@ EXPORT_SYMBOL(inet_del_protocol); EXPORT_SYMBOL(ip_route_output); EXPORT_SYMBOL(ip_route_input); EXPORT_SYMBOL(icmp_send); +EXPORT_SYMBOL(icmp_reply); EXPORT_SYMBOL(ip_options_compile); EXPORT_SYMBOL(ip_options_undo); EXPORT_SYMBOL(arp_send); @@ -339,6 +340,7 @@ EXPORT_SYMBOL(tcp_sendmsg); EXPORT_SYMBOL(tcp_v4_rebuild_header); EXPORT_SYMBOL(tcp_v4_send_check); EXPORT_SYMBOL(tcp_v4_conn_request); +EXPORT_SYMBOL(tcp_v4_send_reset); EXPORT_SYMBOL(tcp_create_openreq_child); EXPORT_SYMBOL(tcp_bucket_create); EXPORT_SYMBOL(__tcp_put_port); diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 143d6e361..31dedf1ea 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -66,7 +66,7 @@ static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp, struct fw_head *head = (struct fw_head*)tp->root; struct fw_filter *f; #ifdef CONFIG_NETFILTER - u32 id = (skb->nfreason == NF_REASON_FOR_CLS_FW ? 
skb->nfmark : 0); + u32 id = skb->nfmark; #else u32 id = 0; #endif diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index ddc738fcc..947aede01 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -224,14 +224,14 @@ used on the egress (might slow things for an iota) return fwres; } -/* after iptables */ +/* after ipt_filter */ static struct nf_hook_ops ing_ops = { { NULL, NULL}, ing_hook, PF_INET, NF_IP_PRE_ROUTING, - 1 + NF_IP_PRI_FILTER + 1 }; int ingress_init(struct Qdisc *sch,struct rtattr *opt) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index d63024983..97e323d0c 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -636,7 +636,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) dprintk("svc: incomplete TCP record (%d of %d)\n", len, svsk->sk_reclen); svc_sock_received(svsk, ready); - len = -EAGAIN; /* record not complete */ + return -EAGAIN; /* record not complete */ } /* Frob argbuf */ diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index a57c2a06d..cbe730b5d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -8,7 +8,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Version: $Id: af_unix.c,v 1.89 2000/02/27 19:52:50 davem Exp $ + * Version: $Id: af_unix.c,v 1.90 2000/03/16 20:38:45 davem Exp $ * * Fixes: * Linus Torvalds : Assorted bug cures. @@ -569,7 +569,7 @@ static unix_socket *unix_find_other(struct sockaddr_un *sunname, int len, /* Do not believe to VFS, grab kernel lock */ lock_kernel(); - dentry = open_namei(sunname->sun_path, 2|O_NOFOLLOW, S_IFSOCK); + dentry = __open_namei(sunname->sun_path, 2|O_NOFOLLOW, S_IFSOCK, NULL); if (IS_ERR(dentry)) { *error = PTR_ERR(dentry); unlock_kernel(); |