summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorRalf Baechle <ralf@linux-mips.org>2000-03-23 02:25:38 +0000
committerRalf Baechle <ralf@linux-mips.org>2000-03-23 02:25:38 +0000
commit16b5d462f73eb29d1f67fa01cc1ea66afdc72569 (patch)
tree5407bd573f4840e473ea27cbe61e5c7a07131fcd /net
parentce8a076e11e7e5ee36007f9a3eee5bb3744cb8f6 (diff)
Merge with Linux 2.3.99-pre2.
Diffstat (limited to 'net')
-rw-r--r--net/Config.in6
-rw-r--r--net/Makefile6
-rw-r--r--net/core/netfilter.c63
-rw-r--r--net/core/skbuff.c5
-rw-r--r--net/decnet/dn_route.c9
-rw-r--r--net/ipv4/Config.in5
-rw-r--r--net/ipv4/icmp.c18
-rw-r--r--net/ipv4/ip_gre.c9
-rw-r--r--net/ipv4/ip_output.c5
-rw-r--r--net/ipv4/ipip.c12
-rw-r--r--net/ipv4/ipmr.c14
-rw-r--r--net/ipv4/netfilter/Config.in64
-rw-r--r--net/ipv4/netfilter/Makefile234
-rw-r--r--net/ipv4/netfilter/ip_conntrack_core.c891
-rw-r--r--net/ipv4/netfilter/ip_conntrack_ftp.c251
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_generic.c60
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_icmp.c111
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_tcp.c227
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_udp.c65
-rw-r--r--net/ipv4/netfilter/ip_conntrack_standalone.c297
-rw-r--r--net/ipv4/netfilter/ip_fw_compat.c238
-rw-r--r--net/ipv4/netfilter/ip_fw_compat_masq.c288
-rw-r--r--net/ipv4/netfilter/ip_fw_compat_redir.c284
-rw-r--r--net/ipv4/netfilter/ip_nat_core.c855
-rw-r--r--net/ipv4/netfilter/ip_nat_ftp.c403
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_icmp.c97
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_tcp.c143
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_udp.c141
-rw-r--r--net/ipv4/netfilter/ip_nat_proto_unknown.c61
-rw-r--r--net/ipv4/netfilter/ip_nat_rule.c327
-rw-r--r--net/ipv4/netfilter/ip_nat_standalone.c273
-rw-r--r--net/ipv4/netfilter/ip_queue.c752
-rw-r--r--net/ipv4/netfilter/ip_tables.c1664
-rw-r--r--net/ipv4/netfilter/ipchains_core.c1768
-rw-r--r--net/ipv4/netfilter/ipfwadm_core.c1410
-rw-r--r--net/ipv4/netfilter/ipt_LOG.c368
-rw-r--r--net/ipv4/netfilter/ipt_MARK.c68
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c171
-rw-r--r--net/ipv4/netfilter/ipt_MIRROR.c131
-rw-r--r--net/ipv4/netfilter/ipt_REDIRECT.c104
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c145
-rw-r--r--net/ipv4/netfilter/ipt_TOS.c87
-rw-r--r--net/ipv4/netfilter/ipt_limit.c144
-rw-r--r--net/ipv4/netfilter/ipt_mac.c63
-rw-r--r--net/ipv4/netfilter/ipt_mark.c52
-rw-r--r--net/ipv4/netfilter/ipt_multiport.c102
-rw-r--r--net/ipv4/netfilter/ipt_owner.c136
-rw-r--r--net/ipv4/netfilter/ipt_state.c61
-rw-r--r--net/ipv4/netfilter/ipt_tos.c53
-rw-r--r--net/ipv4/netfilter/ipt_unclean.c576
-rw-r--r--net/ipv4/netfilter/iptable_filter.c181
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c152
-rw-r--r--net/ipv4/route.c26
-rw-r--r--net/ipv4/tcp_ipv4.c6
-rw-r--r--net/ipv6/sit.c12
-rw-r--r--net/khttpd/security.c2
-rw-r--r--net/netsyms.c2
-rw-r--r--net/sched/cls_fw.c2
-rw-r--r--net/sched/sch_ingress.c4
-rw-r--r--net/sunrpc/svcsock.c2
-rw-r--r--net/unix/af_unix.c4
61 files changed, 13633 insertions, 77 deletions
diff --git a/net/Config.in b/net/Config.in
index 624885478..ce5b6faa9 100644
--- a/net/Config.in
+++ b/net/Config.in
@@ -13,9 +13,9 @@ if [ "$CONFIG_NETLINK" = "y" ]; then
tristate ' Netlink device emulation' CONFIG_NETLINK_DEV
fi
bool 'Network packet filtering (replaces ipchains)' CONFIG_NETFILTER
-if [ "$CONFIG_NETFILTER" = "y" ]; then
- bool ' Network packet filtering debugging' CONFIG_NETFILTER_DEBUG
-fi
+#if [ "$CONFIG_NETFILTER" = "y" ]; then
+# bool ' Network packet filtering debugging' CONFIG_NETFILTER_DEBUG
+#fi
bool 'Socket Filtering' CONFIG_FILTER
tristate 'Unix domain sockets' CONFIG_UNIX
bool 'TCP/IP networking' CONFIG_INET
diff --git a/net/Makefile b/net/Makefile
index bf234eae1..44b34d799 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -20,6 +20,10 @@ endif
ifeq ($(CONFIG_INET),y)
SUB_DIRS += ipv4
+ifeq ($(CONFIG_NETFILTER),y)
+SUB_DIRS += ipv4/netfilter
+MOD_SUB_DIRS += ipv4/netfilter
+endif
endif
ifeq ($(CONFIG_UNIX),y)
@@ -198,7 +202,7 @@ endif
endif
L_TARGET := network.a
-L_OBJS := $(SOCK) protocols.o $(join $(SUB_DIRS),$(SUB_DIRS:%=/%.o))
+L_OBJS := $(SOCK) protocols.o $(join $(SUB_DIRS), $(patsubst %,/%.o,$(notdir $(SUB_DIRS))))
M_OBJS :=
diff --git a/net/core/netfilter.c b/net/core/netfilter.c
index 18f697755..02c3bc989 100644
--- a/net/core/netfilter.c
+++ b/net/core/netfilter.c
@@ -4,9 +4,10 @@
* Thanks to Rob `CmdrTaco' Malda for not influencing this code in any
* way.
*
- * Rusty Russell (C)1998 -- This code is GPL.
+ * Rusty Russell (C)2000 -- This code is GPL.
*
* February 2000: Modified by James Morris to have 1 queue per protocol.
+ * 15-Mar-2000: Added NF_REPEAT --RR.
*/
#include <linux/config.h>
#include <linux/netfilter.h>
@@ -56,8 +57,6 @@ int nf_register_hook(struct nf_hook_ops *reg)
{
struct list_head *i;
- NFDEBUG("nf_register_hook: pf=%i hook=%u.\n", reg->pf, reg->hooknum);
-
br_write_lock_bh(BR_NETPROTO_LOCK);
for (i = nf_hooks[reg->pf][reg->hooknum].next;
i != &nf_hooks[reg->pf][reg->hooknum];
@@ -119,7 +118,16 @@ out:
void nf_unregister_sockopt(struct nf_sockopt_ops *reg)
{
/* No point being interruptible: we're probably in cleanup_module() */
+ restart:
down(&nf_sockopt_mutex);
+ if (reg->use != 0) {
+ /* To be woken by nf_sockopt call... */
+ reg->cleanup_task = current;
+ up(&nf_sockopt_mutex);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule();
+ goto restart;
+ }
list_del(&reg->list);
up(&nf_sockopt_mutex);
}
@@ -178,7 +186,7 @@ void nf_dump_skb(int pf, struct sk_buff *skb)
dst_port = ntohs(tcp->dest);
}
- printk("PROTO=%d %ld.%ld.%ld.%ld:%hu %ld.%ld.%ld.%ld:%hu"
+ printk("PROTO=%d %d.%d.%d.%d:%hu %d.%d.%d.%d:%hu"
" L=%hu S=0x%2.2hX I=%hu F=0x%4.4hX T=%hu",
ip->protocol,
(ntohl(ip->saddr)>>24)&0xFF,
@@ -261,9 +269,16 @@ void nf_debug_ip_finish_output2(struct sk_buff *skb)
if (skb->nf_debug != ((1 << NF_IP_PRE_ROUTING)
| (1 << NF_IP_FORWARD)
| (1 << NF_IP_POST_ROUTING))) {
- printk("ip_finish_output: bad unowned skb = %p: ",skb);
- debug_print_hooks_ip(skb->nf_debug);
- nf_dump_skb(PF_INET, skb);
+ /* Fragments will have no owners, but still
+ may be local */
+ if (!(skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET))
+ || skb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
+ | (1 << NF_IP_POST_ROUTING))){
+ printk("ip_finish_output:"
+ " bad unowned skb = %p: ",skb);
+ debug_print_hooks_ip(skb->nf_debug);
+ nf_dump_skb(PF_INET, skb);
+ }
}
}
}
@@ -274,31 +289,42 @@ static int nf_sockopt(struct sock *sk, int pf, int val,
char *opt, int *len, int get)
{
struct list_head *i;
+ struct nf_sockopt_ops *ops;
int ret;
if (down_interruptible(&nf_sockopt_mutex) != 0)
return -EINTR;
for (i = nf_sockopts.next; i != &nf_sockopts; i = i->next) {
- struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i;
+ ops = (struct nf_sockopt_ops *)i;
if (ops->pf == pf) {
if (get) {
if (val >= ops->get_optmin
&& val < ops->get_optmax) {
+ ops->use++;
+ up(&nf_sockopt_mutex);
ret = ops->get(sk, val, opt, len);
goto out;
}
} else {
if (val >= ops->set_optmin
&& val < ops->set_optmax) {
+ ops->use++;
+ up(&nf_sockopt_mutex);
ret = ops->set(sk, val, opt, *len);
goto out;
}
}
}
}
- ret = -ENOPROTOOPT;
+ up(&nf_sockopt_mutex);
+ return -ENOPROTOOPT;
+
out:
+ down(&nf_sockopt_mutex);
+ ops->use--;
+ if (ops->cleanup_task)
+ wake_up_process(ops->cleanup_task);
up(&nf_sockopt_mutex);
return ret;
}
@@ -334,6 +360,10 @@ static unsigned int nf_iterate(struct list_head *head,
case NF_DROP:
return NF_DROP;
+ case NF_REPEAT:
+ *i = (*i)->prev;
+ break;
+
#ifdef CONFIG_NETFILTER_DEBUG
case NF_ACCEPT:
break;
@@ -367,7 +397,6 @@ int nf_register_queue_handler(int pf, nf_queue_outfn_t outfn, void *data)
/* The caller must flush their queue before this */
int nf_unregister_queue_handler(int pf)
{
- NFDEBUG("Unregistering Netfilter queue handler for pf=%d\n", pf);
br_write_lock_bh(BR_NETPROTO_LOCK);
queue_handler[pf].outfn = NULL;
queue_handler[pf].data = NULL;
@@ -390,7 +419,6 @@ static void nf_queue(struct sk_buff *skb,
struct nf_info *info;
if (!queue_handler[pf].outfn) {
- NFDEBUG("nf_queue: noone wants the packet, dropping it.\n");
kfree_skb(skb);
return;
}
@@ -432,6 +460,14 @@ int nf_hook_slow(int pf, unsigned int hook, struct sk_buff *skb,
unsigned int verdict;
int ret = 0;
+#ifdef CONFIG_NETFILTER_DEBUG
+ if (skb->nf_debug & (1 << hook)) {
+ printk("nf_hook: hook %i already set.\n", hook);
+ nf_dump_skb(pf, skb);
+ }
+ skb->nf_debug |= (1 << hook);
+#endif
+
elem = &nf_hooks[pf][hook];
verdict = nf_iterate(&nf_hooks[pf][hook], &skb, hook, indev,
outdev, &elem, okfn);
@@ -473,6 +509,11 @@ void nf_reinject(struct sk_buff *skb, struct nf_info *info,
}
/* Continue traversal iff userspace said ok... */
+ if (verdict == NF_REPEAT) {
+ elem = elem->prev;
+ verdict = NF_ACCEPT;
+ }
+
if (verdict == NF_ACCEPT) {
verdict = nf_iterate(&nf_hooks[info->pf][info->hook],
&skb, info->hook,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ecda47d7a..dad1f3925 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4,7 +4,7 @@
* Authors: Alan Cox <iiitac@pyr.swan.ac.uk>
* Florian La Roche <rzsfl@rz.uni-sb.de>
*
- * Version: $Id: skbuff.c,v 1.69 2000/03/06 03:47:58 davem Exp $
+ * Version: $Id: skbuff.c,v 1.70 2000/03/17 14:41:39 davem Exp $
*
* Fixes:
* Alan Cox : Fixed the worst of the load balancer bugs.
@@ -203,7 +203,7 @@ static inline void skb_headerinit(void *p, kmem_cache_t *cache,
skb->dst = NULL;
skb->rx_dev = NULL;
#ifdef CONFIG_NETFILTER
- skb->nfmark = skb->nfreason = skb->nfcache = 0;
+ skb->nfmark = skb->nfcache = 0;
skb->nfct = NULL;
#ifdef CONFIG_NETFILTER_DEBUG
skb->nf_debug = 0;
@@ -319,7 +319,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
new->security=old->security;
#ifdef CONFIG_NETFILTER
new->nfmark=old->nfmark;
- new->nfreason=old->nfreason;
new->nfcache=old->nfcache;
new->nfct=old->nfct;
nf_conntrack_get(new->nfct);
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 5e54a6fa8..2ba5f2f6c 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -805,10 +805,7 @@ non_local_input:
key.scope = RT_SCOPE_UNIVERSE;
#ifdef CONFIG_DECNET_ROUTE_FWMASK
- if (skb->nfreason == NF_REASON_FOR_ROUTING)
- key.fwmark = skb->fwmark;
- else
- key.fwmark = 0;
+ key.fwmark = skb->fwmark;
#else
key.fwmark = 0;
#endif
@@ -886,9 +883,7 @@ int dn_route_input(struct sk_buff *skb)
(rt->key.daddr == cb->dst) &&
(rt->key.oif == 0) &&
#ifdef CONFIG_DECNET_ROUTE_FWMASK
- (rt->key.fwmark == (skb->nfreason ==
- NF_REASON_FOR_ROUTING
- ? skb->nfmark : 0)) &&
+ (rt->key.fwmark == skb->nfmark) &&
#endif
(rt->key.iif == cb->iif)) {
rt->u.dst.lastuse = jiffies;
diff --git a/net/ipv4/Config.in b/net/ipv4/Config.in
index 32e2aca16..68fea0272 100644
--- a/net/ipv4/Config.in
+++ b/net/ipv4/Config.in
@@ -9,7 +9,7 @@ if [ "$CONFIG_IP_ADVANCED_ROUTER" = "y" ]; then
bool ' IP: policy routing' CONFIG_IP_MULTIPLE_TABLES
if [ "$CONFIG_IP_MULTIPLE_TABLES" = "y" ]; then
if [ "$CONFIG_NETFILTER" = "y" ]; then
- bool ' IP: use FWMARK value as routing key' CONFIG_IP_ROUTE_FWMARK
+ bool ' IP: use netfilter MARK value as routing key' CONFIG_IP_ROUTE_FWMARK
fi
bool ' IP: fast network address translation' CONFIG_IP_ROUTE_NAT
fi
@@ -53,3 +53,6 @@ bool ' IP: Allow large windows (not recommended if <16Mb of memory)' CONFIG_SKB
#if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
# bool ' IP: support checksum copy to user for UDP (EXPERIMENTAL)' CONFIG_UDP_DELAY_CSUM
#fi
+if [ "$CONFIG_NETFILTER" != "n" ]; then
+ source net/ipv4/netfilter/Config.in
+fi
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index d7da63f4e..7561e190b 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -3,7 +3,7 @@
*
* Alan Cox, <alan@redhat.com>
*
- * Version: $Id: icmp.c,v 1.65 2000/02/22 23:54:25 davem Exp $
+ * Version: $Id: icmp.c,v 1.66 2000/03/17 14:41:50 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -332,20 +332,6 @@ struct icmp_control
static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
/*
- * Build xmit assembly blocks
- */
-
-struct icmp_bxm
-{
- void *data_ptr;
- int data_len;
- struct icmphdr icmph;
- unsigned long csum;
- struct ip_options replyopts;
- unsigned char optbuf[40];
-};
-
-/*
* The ICMP socket. This is the most convenient way to flow control
* our ICMP output as well as maintain a clean interface throughout
* all layers. All Socketless IP sends will soon be gone.
@@ -508,7 +494,7 @@ static int icmp_glue_bits(const void *p, char *to, unsigned int offset, unsigned
* Driving logic for building and sending ICMP messages.
*/
-static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
+void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
{
struct sock *sk=icmp_socket->sk;
struct ipcm_cookie ipc;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 8d651b042..01a39b6e4 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -599,6 +599,10 @@ int ipgre_rcv(struct sk_buff *skb, unsigned short len)
skb->dev = tunnel->dev;
dst_release(skb->dst);
skb->dst = NULL;
+#ifdef CONFIG_NETFILTER
+ nf_conntrack_put(skb->nfct);
+ skb->nfct = NULL;
+#endif
netif_rx(skb);
read_unlock(&ipgre_lock);
return(0);
@@ -818,6 +822,11 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
ip_select_ident(iph, &rt->u.dst);
ip_send_check(iph);
+#ifdef CONFIG_NETFILTER
+ nf_conntrack_put(skb->nfct);
+ skb->nfct = NULL;
+#endif
+
stats->tx_bytes += skb->len;
stats->tx_packets++;
ip_send(skb);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 26d025d32..f3013ca57 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -5,7 +5,7 @@
*
* The Internet Protocol (IP) output module.
*
- * Version: $Id: ip_output.c,v 1.81 2000/03/06 03:48:01 davem Exp $
+ * Version: $Id: ip_output.c,v 1.82 2000/03/17 14:41:50 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -894,6 +894,9 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
/* Connection association is same as pre-frag packet */
skb2->nfct = skb->nfct;
nf_conntrack_get(skb2->nfct);
+#ifdef CONFIG_NETFILTER_DEBUG
+ skb2->nf_debug = skb->nf_debug;
+#endif
#endif
/*
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 5518ec1cb..2823c2c7e 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -1,7 +1,7 @@
/*
* Linux NET3: IP/IP protocol decoder.
*
- * Version: $Id: ipip.c,v 1.30 2000/01/06 00:41:55 davem Exp $
+ * Version: $Id: ipip.c,v 1.31 2000/03/17 14:41:51 davem Exp $
*
* Authors:
* Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
@@ -93,6 +93,7 @@
*/
+#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/sched.h>
@@ -483,6 +484,10 @@ int ipip_rcv(struct sk_buff *skb, unsigned short len)
skb->dev = tunnel->dev;
dst_release(skb->dst);
skb->dst = NULL;
+#ifdef CONFIG_NETFILTER
+ nf_conntrack_put(skb->nfct);
+ skb->nfct = NULL;
+#endif
netif_rx(skb);
read_unlock(&ipip_lock);
return 0;
@@ -619,6 +624,11 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
ip_select_ident(iph, &rt->u.dst);
ip_send_check(iph);
+#ifdef CONFIG_NETFILTER
+ nf_conntrack_put(skb->nfct);
+ skb->nfct = NULL;
+#endif
+
stats->tx_bytes += skb->len;
stats->tx_packets++;
ip_send(skb);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index fce5a43f8..1e33ec4ca 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -9,7 +9,7 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Version: $Id: ipmr.c,v 1.50 2000/01/09 02:19:32 davem Exp $
+ * Version: $Id: ipmr.c,v 1.51 2000/03/17 14:41:52 davem Exp $
*
* Fixes:
* Michael Chastain : Incorrect size of copying.
@@ -1100,6 +1100,10 @@ static void ip_encap(struct sk_buff *skb, u32 saddr, u32 daddr)
skb->h.ipiph = skb->nh.iph;
skb->nh.iph = iph;
+#ifdef CONFIG_NETFILTER
+ nf_conntrack_put(skb->nfct);
+ skb->nfct = NULL;
+#endif
}
static inline int ipmr_forward_finish(struct sk_buff *skb)
@@ -1433,6 +1437,10 @@ int pim_rcv_v1(struct sk_buff * skb, unsigned short len)
skb->dst = NULL;
((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
((struct net_device_stats*)reg_dev->priv)->rx_packets++;
+#ifdef CONFIG_NETFILTER
+ nf_conntrack_put(skb->nfct);
+ skb->nfct = NULL;
+#endif
netif_rx(skb);
dev_put(reg_dev);
return 0;
@@ -1488,6 +1496,10 @@ int pim_rcv(struct sk_buff * skb, unsigned short len)
((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
((struct net_device_stats*)reg_dev->priv)->rx_packets++;
skb->dst = NULL;
+#ifdef CONFIG_NETFILTER
+ nf_conntrack_put(skb->nfct);
+ skb->nfct = NULL;
+#endif
netif_rx(skb);
dev_put(reg_dev);
return 0;
diff --git a/net/ipv4/netfilter/Config.in b/net/ipv4/netfilter/Config.in
new file mode 100644
index 000000000..bf2a28269
--- /dev/null
+++ b/net/ipv4/netfilter/Config.in
@@ -0,0 +1,64 @@
+#
+# IP netfilter configuration
+#
+mainmenu_option next_comment
+comment ' IP: Netfilter Configuration'
+
+tristate 'Connection tracking (required for masq/NAT)' CONFIG_IP_NF_CONNTRACK
+if [ "$CONFIG_IP_NF_CONNTRACK" != "n" ]; then
+ dep_tristate ' FTP protocol support' CONFIG_IP_NF_FTP $CONFIG_IP_NF_CONNTRACK
+fi
+
+if [ "$CONFIG_EXPERIMENTAL" = "y" -a "$CONFIG_NETLINK" = "y" ]; then
+ tristate 'Userspace queueing via NETLINK (EXPERIMENTAL)' CONFIG_IP_NF_QUEUE
+fi
+tristate 'IP tables support (required for filtering/masq/NAT)' CONFIG_IP_NF_IPTABLES
+if [ "$CONFIG_IP_NF_IPTABLES" != "n" ]; then
+# The simple matches.
+ dep_tristate ' limit match support' CONFIG_IP_NF_MATCH_LIMIT $CONFIG_IP_NF_IPTABLES
+ dep_tristate ' MAC address match support' CONFIG_IP_NF_MATCH_MAC $CONFIG_IP_NF_IPTABLES
+ dep_tristate ' netfilter MARK match support' CONFIG_IP_NF_MATCH_MARK $CONFIG_IP_NF_IPTABLES
+ dep_tristate ' Multiple port match support' CONFIG_IP_NF_MATCH_MULTIPORT $CONFIG_IP_NF_IPTABLES
+ dep_tristate ' TOS match support' CONFIG_IP_NF_MATCH_TOS $CONFIG_IP_NF_IPTABLES
+ if [ "$CONFIG_IP_NF_CONNTRACK" != "n" ]; then
+ dep_tristate ' Connection state match support' CONFIG_IP_NF_MATCH_STATE $CONFIG_IP_NF_CONNTRACK $CONFIG_IP_NF_IPTABLES
+ fi
+ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
+ dep_tristate ' Unclean match support (EXPERIMENTAL)' CONFIG_IP_NF_MATCH_UNCLEAN $CONFIG_IP_NF_IPTABLES
+ dep_tristate ' Owner match support (EXPERIMENTAL)' CONFIG_IP_NF_MATCH_OWNER $CONFIG_IP_NF_IPTABLES
+ fi
+# The targets
+ dep_tristate ' Packet filtering' CONFIG_IP_NF_FILTER $CONFIG_IP_NF_IPTABLES
+ if [ "$CONFIG_IP_NF_FILTER" != "n" ]; then
+ dep_tristate ' REJECT target support' CONFIG_IP_NF_TARGET_REJECT $CONFIG_IP_NF_FILTER
+ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
+ dep_tristate ' MIRROR target support (EXPERIMENTAL)' CONFIG_IP_NF_TARGET_MIRROR $CONFIG_IP_NF_FILTER
+ fi
+ fi
+
+ if [ "$CONFIG_IP_NF_CONNTRACK" != "n" ]; then
+ dep_tristate ' Full NAT' CONFIG_IP_NF_NAT $CONFIG_IP_NF_IPTABLES
+ if [ "$CONFIG_IP_NF_NAT" != "n" ]; then
+ dep_tristate ' MASQUERADE target support' CONFIG_IP_NF_TARGET_MASQUERADE $CONFIG_IP_NF_NAT
+ dep_tristate ' REDIRECT target support' CONFIG_IP_NF_TARGET_REDIRECT $CONFIG_IP_NF_NAT
+ fi
+ fi
+
+ dep_tristate ' Packet mangling' CONFIG_IP_NF_MANGLE $CONFIG_IP_NF_IPTABLES
+ if [ "$CONFIG_IP_NF_MANGLE" != "n" ]; then
+ dep_tristate ' TOS target support' CONFIG_IP_NF_TARGET_TOS $CONFIG_IP_NF_MANGLE
+ dep_tristate ' MARK target support' CONFIG_IP_NF_TARGET_MARK $CONFIG_IP_NF_MANGLE
+ fi
+ dep_tristate ' LOG target support' CONFIG_IP_NF_TARGET_LOG $CONFIG_IP_NF_IPTABLES
+fi
+
+# Backwards compatibility modules: only if you don't build in the others.
+if [ "$CONFIG_IP_NF_CONNTRACK" != "y" ]; then
+ if [ "$CONFIG_IP_NF_IPTABLES" != "y" ]; then
+ tristate 'ipchains (2.2-style) support' CONFIG_IP_NF_COMPAT_IPCHAINS
+ if [ "$CONFIG_IP_NF_COMPAT_IPCHAINS" != "y" ]; then
+ tristate 'ipfwadm (2.0-style) support' CONFIG_IP_NF_COMPAT_IPFWADM
+ fi
+ fi
+fi
+endmenu
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
new file mode 100644
index 000000000..41a61e010
--- /dev/null
+++ b/net/ipv4/netfilter/Makefile
@@ -0,0 +1,234 @@
+#
+# Makefile for the netfilter modules on top of IPv4.
+#
+# Note! Dependencies are done automagically by 'make dep', which also
+# removes any old dependencies. DON'T put your own dependencies here
+# unless it's something special (ie not a .c file).
+#
+# Note 2! The CFLAGS definition is now in the main makefile...
+
+O_TARGET := netfilter.o
+MOD_LIST_NAME := IPV4_MODULES
+M_OBJS :=
+
+IP_NF_CONNTRACK_OBJ:=ip_conntrack_core.o ip_conntrack_proto_generic.o ip_conntrack_proto_tcp.o ip_conntrack_proto_udp.o ip_conntrack_proto_icmp.o
+
+IP_NF_NAT_OBJ:=ip_nat_core.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o
+
+# Link order matters here.
+ifeq ($(CONFIG_IP_NF_CONNTRACK),y)
+OX_OBJS += ip_conntrack_standalone.o
+O_OBJS += $(IP_NF_CONNTRACK_OBJ)
+else
+ ifeq ($(CONFIG_IP_NF_CONNTRACK),m)
+ M_OBJS += ip_conntrack.o
+ endif
+endif
+
+ifeq ($(CONFIG_IP_NF_QUEUE),y)
+O_OBJS += ip_queue.o
+else
+ ifeq ($(CONFIG_IP_NF_QUEUE),m)
+ M_OBJS += ip_queue.o
+ endif
+endif
+
+ifeq ($(CONFIG_IP_NF_FTP),y)
+OX_OBJS += ip_conntrack_ftp.o
+else
+ ifeq ($(CONFIG_IP_NF_FTP),m)
+ M_OBJS += ip_conntrack_ftp.o
+ endif
+endif
+
+ifeq ($(CONFIG_IP_NF_IPTABLES),y)
+O_OBJS += ip_tables.o
+else
+ ifeq ($(CONFIG_IP_NF_IPTABLES),m)
+ M_OBJS += ip_tables.o
+ endif
+endif
+
+ifeq ($(CONFIG_IP_NF_MATCH_LIMIT),y)
+O_OBJS += ipt_limit.o
+else
+ ifeq ($(CONFIG_IP_NF_MATCH_LIMIT),m)
+ M_OBJS += ipt_limit.o
+ endif
+endif
+
+ifeq ($(CONFIG_IP_NF_MATCH_MARK),y)
+O_OBJS += ipt_mark.o
+else
+ ifeq ($(CONFIG_IP_NF_MATCH_MARK),m)
+ M_OBJS += ipt_mark.o
+ endif
+endif
+
+ifeq ($(CONFIG_IP_NF_MATCH_MAC),y)
+O_OBJS += ipt_mac.o
+else
+ ifeq ($(CONFIG_IP_NF_MATCH_MAC),m)
+ M_OBJS += ipt_mac.o
+ endif
+endif
+
+ifeq ($(CONFIG_IP_NF_MATCH_MULTIPORT),y)
+O_OBJS += ipt_multiport.o
+else
+ ifeq ($(CONFIG_IP_NF_MATCH_MULTIPORT),m)
+ M_OBJS += ipt_multiport.o
+ endif
+endif
+
+ifeq ($(CONFIG_IP_NF_MATCH_OWNER),y)
+O_OBJS += ipt_owner.o
+else
+ ifeq ($(CONFIG_IP_NF_MATCH_OWNER),m)
+ M_OBJS += ipt_owner.o
+ endif
+endif
+
+ifeq ($(CONFIG_IP_NF_MATCH_TOS),y)
+O_OBJS += ipt_tos.o
+else
+ ifeq ($(CONFIG_IP_NF_MATCH_TOS),m)
+ M_OBJS += ipt_tos.o
+ endif
+endif
+
+ifeq ($(CONFIG_IP_NF_MATCH_STATE),y)
+O_OBJS += ipt_state.o
+else
+ ifeq ($(CONFIG_IP_NF_MATCH_STATE),m)
+ M_OBJS += ipt_state.o
+ endif
+endif
+
+ifeq ($(CONFIG_IP_NF_MATCH_UNCLEAN),y)
+O_OBJS += ipt_unclean.o
+else
+ ifeq ($(CONFIG_IP_NF_MATCH_UNCLEAN),m)
+ M_OBJS += ipt_unclean.o
+ endif
+endif
+
+ifeq ($(CONFIG_IP_NF_FILTER),y)
+O_OBJS += iptable_filter.o
+else
+ ifeq ($(CONFIG_IP_NF_FILTER),m)
+ M_OBJS += iptable_filter.o
+ endif
+endif
+
+ifeq ($(CONFIG_IP_NF_NAT),y)
+OX_OBJS += ip_nat_standalone.o
+O_OBJS += ip_nat_rule.o $(IP_NF_NAT_OBJ)
+ ifeq ($(CONFIG_IP_NF_FTP),y)
+ O_OBJS += ip_nat_ftp.o
+ endif
+else
+ ifeq ($(CONFIG_IP_NF_NAT),m)
+ M_OBJS += iptable_nat.o
+ ifeq ($(CONFIG_IP_NF_FTP),m)
+ M_OBJS += ip_nat_ftp.o
+ endif
+ endif
+endif
+
+ifeq ($(CONFIG_IP_NF_MANGLE),y)
+O_OBJS += iptable_mangle.o
+else
+ ifeq ($(CONFIG_IP_NF_MANGLE),m)
+ M_OBJS += iptable_mangle.o
+ endif
+endif
+
+ifeq ($(CONFIG_IP_NF_TARGET_REJECT),y)
+O_OBJS += ipt_REJECT.o
+else
+ ifeq ($(CONFIG_IP_NF_TARGET_REJECT),m)
+ M_OBJS += ipt_REJECT.o
+ endif
+endif
+
+ifeq ($(CONFIG_IP_NF_TARGET_MIRROR),y)
+O_OBJS += ipt_MIRROR.o
+else
+ ifeq ($(CONFIG_IP_NF_TARGET_MIRROR),m)
+ M_OBJS += ipt_MIRROR.o
+ endif
+endif
+
+ifeq ($(CONFIG_IP_NF_TARGET_TOS),y)
+O_OBJS += ipt_TOS.o
+else
+ ifeq ($(CONFIG_IP_NF_TARGET_TOS),m)
+ M_OBJS += ipt_TOS.o
+ endif
+endif
+
+ifeq ($(CONFIG_IP_NF_TARGET_MARK),y)
+O_OBJS += ipt_MARK.o
+else
+ ifeq ($(CONFIG_IP_NF_TARGET_MARK),m)
+ M_OBJS += ipt_MARK.o
+ endif
+endif
+
+ifeq ($(CONFIG_IP_NF_TARGET_MASQUERADE),y)
+O_OBJS += ipt_MASQUERADE.o
+else
+ ifeq ($(CONFIG_IP_NF_TARGET_MASQUERADE),m)
+ M_OBJS += ipt_MASQUERADE.o
+ endif
+endif
+
+ifeq ($(CONFIG_IP_NF_TARGET_REDIRECT),y)
+O_OBJS += ipt_REDIRECT.o
+else
+ ifeq ($(CONFIG_IP_NF_TARGET_REDIRECT),m)
+ M_OBJS += ipt_REDIRECT.o
+ endif
+endif
+
+ifeq ($(CONFIG_IP_NF_TARGET_LOG),y)
+O_OBJS += ipt_LOG.o
+else
+ ifeq ($(CONFIG_IP_NF_TARGET_LOG),m)
+ M_OBJS += ipt_LOG.o
+ endif
+endif
+
+ifeq ($(CONFIG_IP_NF_COMPAT_IPCHAINS),y)
+O_OBJS += ipchains.o
+else
+ ifeq ($(CONFIG_IP_NF_COMPAT_IPCHAINS),m)
+ M_OBJS += ipchains.o
+ endif
+endif
+
+ifeq ($(CONFIG_IP_NF_COMPAT_IPFWADM),y)
+O_OBJS += ipfwadm.o
+else
+ ifeq ($(CONFIG_IP_NF_COMPAT_IPFWADM),m)
+ M_OBJS += ipfwadm.o
+ endif
+endif
+
+include $(TOPDIR)/Rules.make
+
+ip_conntrack.o: ip_conntrack_standalone.o $(IP_NF_CONNTRACK_OBJ)
+ $(LD) -r -o $@ $(IP_NF_CONNTRACK_OBJ) ip_conntrack_standalone.o
+
+iptable_nat.o: ip_nat_standalone.o ip_nat_rule.o $(IP_NF_NAT_OBJ)
+ $(LD) -r -o $@ ip_nat_standalone.o ip_nat_rule.o $(IP_NF_NAT_OBJ)
+
+# All the parts of conntrack and NAT required for compatibility layer.
+IP_NF_COMPAT_LAYER:=ip_fw_compat.o ip_fw_compat_redir.o ip_fw_compat_masq.o $(IP_NF_CONNTRACK_OBJ) $(IP_NF_NAT_OBJ)
+
+ipfwadm.o: ipfwadm_core.o $(IP_NF_COMPAT_LAYER)
+ $(LD) -r -o $@ ipfwadm_core.o $(IP_NF_COMPAT_LAYER)
+
+ipchains.o: ipchains_core.o $(IP_NF_COMPAT_LAYER)
+ $(LD) -r -o $@ ipchains_core.o $(IP_NF_COMPAT_LAYER)
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
new file mode 100644
index 000000000..9007cdc89
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -0,0 +1,891 @@
+/* Connection state tracking for netfilter. This is separated from,
+ but required by, the NAT layer; it can also be used by an iptables
+ extension. */
+
+/* (c) 1999 Paul `Rusty' Russell. Licenced under the GNU General
+ Public Licence. */
+
+#ifdef MODULE
+#define __NO_VERSION__
+#endif
+#include <linux/version.h>
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/brlock.h>
+#include <net/checksum.h>
+#include <linux/stddef.h>
+#include <linux/sysctl.h>
+
+/* This rwlock protects the main hash table, protocol/helper/expected
+ registrations, conntrack timers*/
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
+
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+DECLARE_RWLOCK(ip_conntrack_lock);
+
+void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
+static LIST_HEAD(expect_list);
+static LIST_HEAD(protocol_list);
+static LIST_HEAD(helpers);
+unsigned int ip_conntrack_htable_size = 0;
+static int ip_conntrack_max = 0;
+static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
+struct list_head *ip_conntrack_hash;
+
+extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;
+
+static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
+ u_int8_t protocol)
+{
+ return protocol == curr->proto;
+}
+
+struct ip_conntrack_protocol *__find_proto(u_int8_t protocol)
+{
+ struct ip_conntrack_protocol *p;
+
+ MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+ p = LIST_FIND(&protocol_list, proto_cmpfn,
+ struct ip_conntrack_protocol *, protocol);
+ if (!p)
+ p = &ip_conntrack_generic_protocol;
+
+ return p;
+}
+
+struct ip_conntrack_protocol *find_proto(u_int8_t protocol)
+{
+ struct ip_conntrack_protocol *p;
+
+ READ_LOCK(&ip_conntrack_lock);
+ p = __find_proto(protocol);
+ READ_UNLOCK(&ip_conntrack_lock);
+ return p;
+}
+
+static inline void ip_conntrack_put(struct ip_conntrack *ct)
+{
+ IP_NF_ASSERT(ct);
+ IP_NF_ASSERT(ct->infos[0].master);
+ /* nf_conntrack_put wants to go via an info struct, so feed it
+ one at random. */
+ nf_conntrack_put(&ct->infos[0]);
+}
+
+static inline u_int32_t
+hash_conntrack(const struct ip_conntrack_tuple *tuple)
+{
+#if 0
+ dump_tuple(tuple);
+#endif
+#ifdef CONFIG_NETFILTER_DEBUG
+ if (tuple->src.pad)
+ DEBUGP("Tuple %p has non-zero padding.\n", tuple);
+#endif
+ /* ntohl because more differences in low bits. */
+ /* To ensure that halves of the same connection don't hash
+ clash, we add the source per-proto again. */
+ return (ntohl(tuple->src.ip + tuple->dst.ip
+ + tuple->src.u.all + tuple->dst.u.all
+ + tuple->dst.protonum)
+ + ntohs(tuple->src.u.all))
+ % ip_conntrack_htable_size;
+}
+
+inline int
+get_tuple(const struct iphdr *iph, size_t len,
+ struct ip_conntrack_tuple *tuple,
+ struct ip_conntrack_protocol *protocol)
+{
+ int ret;
+
+ /* Can only happen when extracting tuples from inside ICMP
+ packets */
+ if (iph->frag_off & htons(IP_OFFSET)) {
+ if (net_ratelimit())
+ printk("ip_conntrack_core: Frag of proto %u.\n",
+ iph->protocol);
+ return 0;
+ }
+ /* Guarantee 8 protocol bytes: if more wanted, use len param */
+ else if (iph->ihl * 4 + 8 > len)
+ return 0;
+
+ tuple->src.ip = iph->saddr;
+ tuple->src.pad = 0;
+ tuple->dst.ip = iph->daddr;
+ tuple->dst.protonum = iph->protocol;
+
+ ret = protocol->pkt_to_tuple((u_int32_t *)iph + iph->ihl,
+ len - 4*iph->ihl,
+ tuple);
+ return ret;
+}
+
+static int
+invert_tuple(struct ip_conntrack_tuple *inverse,
+ const struct ip_conntrack_tuple *orig,
+ const struct ip_conntrack_protocol *protocol)
+{
+ inverse->src.ip = orig->dst.ip;
+ inverse->src.pad = 0;
+ inverse->dst.ip = orig->src.ip;
+ inverse->dst.protonum = orig->dst.protonum;
+
+ return protocol->invert_tuple(inverse, orig);
+}
+
+static void
+destroy_conntrack(struct nf_conntrack *nfct)
+{
+ struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
+
+ IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
+ IP_NF_ASSERT(!timer_pending(&ct->timeout));
+
+ if (ct->master.master)
+ nf_conntrack_put(&ct->master);
+
+ if (ip_conntrack_destroyed)
+ ip_conntrack_destroyed(ct);
+ kfree(ct);
+ atomic_dec(&ip_conntrack_count);
+}
+
+static void death_by_timeout(unsigned long ul_conntrack)
+{
+ struct ip_conntrack *ct = (void *)ul_conntrack;
+
+ WRITE_LOCK(&ip_conntrack_lock);
+ /* Remove from both hash lists */
+ LIST_DELETE(&ip_conntrack_hash
+ [hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)],
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
+ LIST_DELETE(&ip_conntrack_hash
+ [hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple)],
+ &ct->tuplehash[IP_CT_DIR_REPLY]);
+ /* If our expected is in the list, take it out. */
+ if (ct->expected.expectant) {
+ IP_NF_ASSERT(list_inlist(&expect_list, &ct->expected));
+ IP_NF_ASSERT(ct->expected.expectant == ct);
+ LIST_DELETE(&expect_list, &ct->expected);
+ }
+ WRITE_UNLOCK(&ip_conntrack_lock);
+ ip_conntrack_put(ct);
+}
+
+static inline int
+conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
+ const struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack *ignored_conntrack)
+{
+ MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+ return i->ctrack != ignored_conntrack
+ && memcmp(tuple, &i->tuple, sizeof(*tuple)) == 0;
+}
+
+static struct ip_conntrack_tuple_hash *
+__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack *ignored_conntrack)
+{
+ struct ip_conntrack_tuple_hash *h;
+
+ MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+ h = LIST_FIND(&ip_conntrack_hash[hash_conntrack(tuple)],
+ conntrack_tuple_cmp,
+ struct ip_conntrack_tuple_hash *,
+ tuple, ignored_conntrack);
+ return h;
+}
+
+/* Find a connection corresponding to a tuple. */
+struct ip_conntrack_tuple_hash *
+ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack *ignored_conntrack)
+{
+ struct ip_conntrack_tuple_hash *h;
+
+ READ_LOCK(&ip_conntrack_lock);
+ h = __ip_conntrack_find(tuple, ignored_conntrack);
+ if (h)
+ atomic_inc(&h->ctrack->ct_general.use);
+ READ_UNLOCK(&ip_conntrack_lock);
+
+ return h;
+}
+
+/* Returns true if a connection correspondings to the tuple (required
+ for NAT). */
+int
+ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack *ignored_conntrack)
+{
+ struct ip_conntrack_tuple_hash *h;
+
+ READ_LOCK(&ip_conntrack_lock);
+ h = __ip_conntrack_find(tuple, ignored_conntrack);
+ READ_UNLOCK(&ip_conntrack_lock);
+
+ return h != NULL;
+}
+
+/* Returns TRUE if it dealt with ICMP, and filled in skb fields */
+int icmp_error_track(struct sk_buff *skb)
+{
+ const struct iphdr *iph = skb->nh.iph;
+ struct icmphdr *hdr = (struct icmphdr *)((u_int32_t *)iph + iph->ihl);
+ struct ip_conntrack_tuple innertuple, origtuple;
+ struct iphdr *inner = (struct iphdr *)(hdr + 1);
+ size_t datalen = skb->len - iph->ihl*4 - sizeof(*hdr);
+ struct ip_conntrack_protocol *innerproto;
+ struct ip_conntrack_tuple_hash *h;
+ enum ip_conntrack_info ctinfo;
+
+ if (iph->protocol != IPPROTO_ICMP)
+ return 0;
+
+ if (skb->len < iph->ihl * 4 + sizeof(struct icmphdr)) {
+ DEBUGP("icmp_error_track: too short\n");
+ return 1;
+ }
+
+ if (hdr->type != ICMP_DEST_UNREACH
+ && hdr->type != ICMP_SOURCE_QUENCH
+ && hdr->type != ICMP_TIME_EXCEEDED
+ && hdr->type != ICMP_PARAMETERPROB
+ && hdr->type != ICMP_REDIRECT)
+ return 0;
+
+ /* Ignore it if the checksum's bogus. */
+ if (ip_compute_csum((unsigned char *)hdr, sizeof(*hdr) + datalen)) {
+ DEBUGP("icmp_error_track: bad csum\n");
+ return 1;
+ }
+
+ innerproto = find_proto(inner->protocol);
+ /* Are they talking about one of our connections? */
+ if (inner->ihl * 4 + 8 > datalen
+ || !get_tuple(inner, datalen, &origtuple, innerproto)) {
+ DEBUGP("icmp_error: ! get_tuple p=%u (%u*4+%u dlen=%u)\n",
+ inner->protocol, inner->ihl, 8,
+ datalen);
+ return 1;
+ }
+
+ /* Ordinarily, we'd expect the inverted tupleproto, but it's
+ been preserved inside the ICMP. */
+ if (!invert_tuple(&innertuple, &origtuple, innerproto)) {
+ DEBUGP("icmp_error_track: Can't invert tuple\n");
+ return 1;
+ }
+ h = ip_conntrack_find_get(&innertuple, NULL);
+ if (!h) {
+ DEBUGP("icmp_error_track: no match\n");
+ return 1;
+ }
+
+ ctinfo = IP_CT_RELATED;
+ if (DIRECTION(h) == IP_CT_DIR_REPLY)
+ ctinfo += IP_CT_IS_REPLY;
+
+ /* Update skb to refer to this connection */
+ skb->nfct = &h->ctrack->infos[ctinfo];
+ return 1;
+}
+
+static inline int helper_cmp(const struct ip_conntrack_helper *i,
+ const struct ip_conntrack_tuple *rtuple)
+{
+ return i->will_help(rtuple);
+}
+
+/* Compare all but src per-proto part. */
+static int expect_cmp(const struct ip_conntrack_expect *i,
+ const struct ip_conntrack_tuple *tuple)
+{
+ return (tuple->src.ip == i->tuple.src.ip
+ && tuple->dst.ip == i->tuple.dst.ip
+ && tuple->dst.u.all == i->tuple.dst.u.all
+ && tuple->dst.protonum == i->tuple.dst.protonum);
+}
+
+/* Allocate a new conntrack; we set everything up, then grab write
+ lock and see if we lost a race. If we lost it we return 0,
+ indicating the controlling code should look again. */
+static int
+init_conntrack(const struct ip_conntrack_tuple *tuple,
+ struct ip_conntrack_protocol *protocol,
+ struct sk_buff *skb)
+{
+ struct ip_conntrack *conntrack;
+ struct ip_conntrack_tuple repl_tuple;
+ size_t hash, repl_hash;
+ struct ip_conntrack_expect *expected;
+ enum ip_conntrack_info ctinfo;
+ int i;
+
+ if (!invert_tuple(&repl_tuple, tuple, protocol)) {
+ DEBUGP("Can't invert tuple.\n");
+ return 1;
+ }
+
+ if(ip_conntrack_max &&
+ (atomic_read(&ip_conntrack_count) >= ip_conntrack_max)) {
+ if (net_ratelimit())
+ printk(KERN_WARNING "ip_conntrack: maximum limit of %d entries exceeded\n", ip_conntrack_max);
+ return 1;
+ }
+
+ conntrack = kmalloc(sizeof(struct ip_conntrack), GFP_ATOMIC);
+ if (!conntrack) {
+ DEBUGP("Can't allocate conntrack.\n");
+ return 1;
+ }
+ hash = hash_conntrack(tuple);
+ repl_hash = hash_conntrack(&repl_tuple);
+
+ memset(conntrack, 0, sizeof(struct ip_conntrack));
+ atomic_set(&conntrack->ct_general.use, 1);
+ conntrack->ct_general.destroy = destroy_conntrack;
+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
+ conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
+ conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
+ for(i=0; i < IP_CT_NUMBER; i++)
+ conntrack->infos[i].master = &conntrack->ct_general;
+
+ if (!protocol->new(conntrack, skb->nh.iph, skb->len)) {
+ kfree(conntrack);
+ return 1;
+ }
+
+ /* Sew in at head of hash list. */
+ WRITE_LOCK(&ip_conntrack_lock);
+ /* Check noone else beat us in the race... */
+ if (__ip_conntrack_find(tuple, NULL)) {
+ WRITE_UNLOCK(&ip_conntrack_lock);
+ printk("ip_conntrack: Wow someone raced us!\n");
+ kfree(conntrack);
+ return 0;
+ }
+ conntrack->helper = LIST_FIND(&helpers, helper_cmp,
+ struct ip_conntrack_helper *,
+ &repl_tuple);
+ /* Need finding and deleting of expected ONLY if we win race */
+ expected = LIST_FIND(&expect_list, expect_cmp,
+ struct ip_conntrack_expect *, tuple);
+ if (expected) {
+ /* Welcome, Mr. Bond. We've been expecting you... */
+ conntrack->status = IPS_EXPECTED;
+ conntrack->master.master = &expected->expectant->ct_general;
+ IP_NF_ASSERT(conntrack->master.master);
+ LIST_DELETE(&expect_list, expected);
+ expected->expectant = NULL;
+ nf_conntrack_get(&conntrack->master);
+ ctinfo = IP_CT_RELATED;
+ } else {
+ ctinfo = IP_CT_NEW;
+ }
+ list_prepend(&ip_conntrack_hash[hash],
+ &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]);
+ list_prepend(&ip_conntrack_hash[repl_hash],
+ &conntrack->tuplehash[IP_CT_DIR_REPLY]);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+
+ /* Update skb to refer to this connection */
+ skb->nfct = &conntrack->infos[ctinfo];
+
+ atomic_inc(&ip_conntrack_count);
+ return 1;
+}
+
+static void
+resolve_normal_ct(struct sk_buff *skb)
+{
+ struct ip_conntrack_tuple tuple;
+ struct ip_conntrack_tuple_hash *h;
+ struct ip_conntrack_protocol *proto;
+ enum ip_conntrack_info ctinfo;
+
+ proto = find_proto(skb->nh.iph->protocol);
+ if (!get_tuple(skb->nh.iph, skb->len, &tuple, proto))
+ return;
+
+ /* Loop around search/insert race */
+ do {
+ /* look for tuple match */
+ h = ip_conntrack_find_get(&tuple, NULL);
+ if (!h && init_conntrack(&tuple, proto, skb))
+ return;
+ } while (!h);
+
+ /* It exists; we have (non-exclusive) reference. */
+ if (DIRECTION(h) == IP_CT_DIR_REPLY) {
+ ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
+ h->ctrack->status |= IPS_SEEN_REPLY;
+ } else {
+ /* Once we've had two way comms, always ESTABLISHED. */
+ if (h->ctrack->status & IPS_SEEN_REPLY) {
+ DEBUGP("ip_conntrack_in: normal packet for %p\n",
+ h->ctrack);
+ ctinfo = IP_CT_ESTABLISHED;
+ } else if (h->ctrack->status & IPS_EXPECTED) {
+ DEBUGP("ip_conntrack_in: related packet for %p\n",
+ h->ctrack);
+ ctinfo = IP_CT_RELATED;
+ } else {
+ DEBUGP("ip_conntrack_in: new packet for %p\n",
+ h->ctrack);
+ ctinfo = IP_CT_NEW;
+ }
+ }
+ skb->nfct = &h->ctrack->infos[ctinfo];
+}
+
+/* Return conntrack and conntrack_info a given skb */
+struct ip_conntrack *
+ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
+{
+ if (!skb->nfct) {
+ /* It may be an icmp error... */
+ if (!icmp_error_track(skb))
+ resolve_normal_ct(skb);
+ }
+
+ if (skb->nfct) {
+ struct ip_conntrack *ct
+ = (struct ip_conntrack *)skb->nfct->master;
+
+ /* ctinfo is the index of the nfct inside the conntrack */
+ *ctinfo = skb->nfct - ct->infos;
+ IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
+ return ct;
+ }
+ return NULL;
+}
+
+/* Netfilter hook itself. */
+unsigned int ip_conntrack_in(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct ip_conntrack *ct;
+ enum ip_conntrack_info ctinfo;
+ struct ip_conntrack_protocol *proto;
+ int ret;
+
+ /* FIXME: Do this right please. --RR */
+ (*pskb)->nfcache |= NFC_UNKNOWN;
+
+ /* Previously seen (loopback)? Ignore. Do this before
+ fragment check. */
+ if ((*pskb)->nfct)
+ return NF_ACCEPT;
+
+ /* Gather fragments. */
+ if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
+ *pskb = ip_ct_gather_frags(*pskb);
+ if (!*pskb)
+ return NF_STOLEN;
+ }
+
+ ct = ip_conntrack_get(*pskb, &ctinfo);
+ if (!ct)
+ /* Not valid part of a connection */
+ return NF_ACCEPT;
+
+ proto = find_proto((*pskb)->nh.iph->protocol);
+ /* If this is new, this is first time timer will be set */
+ ret = proto->packet(ct, (*pskb)->nh.iph, (*pskb)->len, ctinfo);
+
+ if (ret == -1) {
+ /* Invalid */
+ nf_conntrack_put((*pskb)->nfct);
+ (*pskb)->nfct = NULL;
+ return NF_ACCEPT;
+ }
+
+ if (ret != NF_DROP && ct->helper) {
+ ret = ct->helper->help((*pskb)->nh.iph, (*pskb)->len,
+ ct, ctinfo);
+ if (ret == -1) {
+ /* Invalid */
+ nf_conntrack_put((*pskb)->nfct);
+ (*pskb)->nfct = NULL;
+ return NF_ACCEPT;
+ }
+ }
+
+ return ret;
+}
+
+int invert_tuplepr(struct ip_conntrack_tuple *inverse,
+ const struct ip_conntrack_tuple *orig)
+{
+ return invert_tuple(inverse, orig, find_proto(orig->dst.protonum));
+}
+
+/* Add a related connection. */
+int ip_conntrack_expect_related(struct ip_conntrack *related_to,
+ const struct ip_conntrack_tuple *tuple)
+{
+ WRITE_LOCK(&ip_conntrack_lock);
+ related_to->expected.tuple = *tuple;
+
+ if (!related_to->expected.expectant) {
+ list_prepend(&expect_list, &related_to->expected);
+ related_to->expected.expectant = related_to;
+ } else {
+ IP_NF_ASSERT(list_inlist(&expect_list, &related_to->expected));
+ IP_NF_ASSERT(related_to->expected.expectant
+ == related_to);
+ }
+ WRITE_UNLOCK(&ip_conntrack_lock);
+
+ return 0;
+}
+
+/* Alter reply tuple (maybe alter helper). If it's already taken,
+ return 0 and don't do alteration. */
+int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
+ const struct ip_conntrack_tuple *newreply)
+{
+ unsigned int newindex = hash_conntrack(newreply);
+
+ WRITE_LOCK(&ip_conntrack_lock);
+ if (__ip_conntrack_find(newreply, conntrack)) {
+ WRITE_UNLOCK(&ip_conntrack_lock);
+ return 0;
+ }
+ DEBUGP("Altering reply tuple of %p to ", conntrack);
+ DUMP_TUPLE(newreply);
+
+ LIST_DELETE(&ip_conntrack_hash
+ [hash_conntrack(&conntrack->tuplehash[IP_CT_DIR_REPLY]
+ .tuple)],
+ &conntrack->tuplehash[IP_CT_DIR_REPLY]);
+ conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
+ list_prepend(&ip_conntrack_hash[newindex],
+ &conntrack->tuplehash[IP_CT_DIR_REPLY]);
+ conntrack->helper = LIST_FIND(&helpers, helper_cmp,
+ struct ip_conntrack_helper *,
+ newreply);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+ return 1;
+}
+
+int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
+{
+ MOD_INC_USE_COUNT;
+
+ WRITE_LOCK(&ip_conntrack_lock);
+ list_prepend(&helpers, me);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+
+ return 0;
+}
+
+static inline int unhelp(struct ip_conntrack_tuple_hash *i,
+ const struct ip_conntrack_helper *me)
+{
+ if (i->ctrack->helper == me) {
+ i->ctrack->helper = NULL;
+ /* Get rid of any expected. */
+ if (i->ctrack->expected.expectant) {
+ IP_NF_ASSERT(i->ctrack->expected.expectant
+ == i->ctrack);
+ LIST_DELETE(&expect_list, &i->ctrack->expected);
+ i->ctrack->expected.expectant = NULL;
+ }
+ }
+ return 0;
+}
+
+void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
+{
+ unsigned int i;
+
+ /* Need write lock here, to delete helper. */
+ WRITE_LOCK(&ip_conntrack_lock);
+ LIST_DELETE(&helpers, me);
+
+ /* Get rid of expecteds, set helpers to NULL. */
+ for (i = 0; i < ip_conntrack_htable_size; i++)
+ LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
+ struct ip_conntrack_tuple_hash *, me);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+
+ /* Someone could be still looking at the helper in a bh. */
+ br_write_lock_bh(BR_NETPROTO_LOCK);
+ br_write_unlock_bh(BR_NETPROTO_LOCK);
+
+ MOD_DEC_USE_COUNT;
+}
+
+/* Refresh conntrack for this many jiffies: if noone calls this,
+ conntrack will vanish with current skb. */
+void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies)
+{
+ WRITE_LOCK(&ip_conntrack_lock);
+ /* If this hasn't had a timer before, it's still being set up */
+ if (ct->timeout.data == 0) {
+ ct->timeout.data = (unsigned long)ct;
+ ct->timeout.function = death_by_timeout;
+ ct->timeout.expires = jiffies + extra_jiffies;
+ atomic_inc(&ct->ct_general.use);
+ add_timer(&ct->timeout);
+ } else {
+ /* Need del_timer for race avoidance (may already be dying). */
+ if (del_timer(&ct->timeout)) {
+ ct->timeout.expires = jiffies + extra_jiffies;
+ add_timer(&ct->timeout);
+ }
+ }
+ WRITE_UNLOCK(&ip_conntrack_lock);
+}
+
+/* Returns new sk_buff, or NULL */
+struct sk_buff *
+ip_ct_gather_frags(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+#ifdef CONFIG_NETFILTER_DEBUG
+ unsigned int olddebug = skb->nf_debug;
+#endif
+ if (sk) sock_hold(sk);
+ skb = ip_defrag(skb);
+ if (!skb) {
+ if (sk) sock_put(sk);
+ return skb;
+ }
+ if (sk) {
+ skb_set_owner_w(skb, sk);
+ sock_put(sk);
+ }
+
+ ip_send_check(skb->nh.iph);
+ skb->nfcache |= NFC_ALTERED;
+#ifdef CONFIG_NETFILTER_DEBUG
+ /* Packet path as if nothing had happened. */
+ skb->nf_debug = olddebug;
+#endif
+ return skb;
+}
+
+static inline int
+do_kill(const struct ip_conntrack_tuple_hash *i,
+ int (*kill)(const struct ip_conntrack *i, void *data),
+ void *data)
+{
+ return kill(i->ctrack, data);
+}
+
+/* Bring out ya dead! */
+static struct ip_conntrack_tuple_hash *
+get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
+ void *data)
+{
+ struct ip_conntrack_tuple_hash *h = NULL;
+ unsigned int i;
+
+ READ_LOCK(&ip_conntrack_lock);
+ for (i = 0; !h && i < ip_conntrack_htable_size; i++) {
+ h = LIST_FIND(&ip_conntrack_hash[i], do_kill,
+ struct ip_conntrack_tuple_hash *, kill, data);
+ }
+ if (h)
+ atomic_inc(&h->ctrack->ct_general.use);
+ READ_UNLOCK(&ip_conntrack_lock);
+
+ return h;
+}
+
+void
+ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
+ void *data)
+{
+ struct ip_conntrack_tuple_hash *h;
+
+ /* This is order n^2, by the way. */
+ while ((h = get_next_corpse(kill, data)) != NULL) {
+ /* Time to push up daises... */
+ if (del_timer(&h->ctrack->timeout))
+ death_by_timeout((unsigned long)h->ctrack);
+ /* ... else the timer will get him soon. */
+
+ ip_conntrack_put(h->ctrack);
+ }
+}
+
+/* Fast function for those who don't want to parse /proc (and I don't
+ blame them). */
+/* Reversing the socket's dst/src point of view gives us the reply
+ mapping. */
+static int
+getorigdst(struct sock *sk, int optval, void *user, int *len)
+{
+ struct ip_conntrack_tuple_hash *h;
+ struct ip_conntrack_tuple tuple = { { sk->rcv_saddr, { sk->sport },
+ 0 },
+ { sk->daddr, { sk->dport },
+ IPPROTO_TCP } };
+
+ /* We only do TCP at the moment: is there a better way? */
+ if (strcmp(sk->prot->name, "TCP") != 0) {
+ DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
+ return -ENOPROTOOPT;
+ }
+
+ if (*len != sizeof(struct sockaddr_in)) {
+ DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
+ *len, sizeof(struct sockaddr_in));
+ return -EINVAL;
+ }
+
+ h = ip_conntrack_find_get(&tuple, NULL);
+ if (h) {
+ struct sockaddr_in sin;
+
+ sin.sin_family = AF_INET;
+ sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.u.tcp.port;
+ sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.ip;
+
+ DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
+ IP_PARTS(sin.sin_addr.s_addr), ntohs(sin.sin_port));
+ ip_conntrack_put(h->ctrack);
+ if (copy_to_user(user, &sin, sizeof(sin)) != 0)
+ return -EFAULT;
+ else
+ return 0;
+ }
+ DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
+ IP_PARTS(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
+ IP_PARTS(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
+ return -ENOENT;
+}
+
+static struct nf_sockopt_ops so_getorigdst
+= { { NULL, NULL }, PF_INET,
+ 0, 0, NULL, /* Setsockopts */
+ SO_ORIGINAL_DST, SO_ORIGINAL_DST+1, &getorigdst,
+ 0, NULL };
+
+#define NET_IP_CONNTRACK_MAX 2089
+#define NET_IP_CONNTRACK_MAX_NAME "ip_conntrack_max"
+
+static struct ctl_table_header *ip_conntrack_sysctl_header;
+
+static ctl_table ip_conntrack_table[] = {
+ { NET_IP_CONNTRACK_MAX, NET_IP_CONNTRACK_MAX_NAME, &ip_conntrack_max,
+ sizeof(ip_conntrack_max), 0644, NULL, proc_dointvec },
+ { 0 }
+};
+
+static ctl_table ip_conntrack_dir_table[] = {
+ {NET_IPV4, "ipv4", NULL, 0, 0555, ip_conntrack_table, 0, 0, 0, 0, 0},
+ { 0 }
+};
+
+static ctl_table ip_conntrack_root_table[] = {
+ {CTL_NET, "net", NULL, 0, 0555, ip_conntrack_dir_table, 0, 0, 0, 0, 0},
+ { 0 }
+};
+
+static int kill_all(const struct ip_conntrack *i, void *data)
+{
+ return 1;
+}
+
+/* Mishearing the voices in his head, our hero wonders how he's
+ supposed to kill the mall. */
+void ip_conntrack_cleanup(void)
+{
+ unregister_sysctl_table(ip_conntrack_sysctl_header);
+ ip_ct_selective_cleanup(kill_all, NULL);
+ vfree(ip_conntrack_hash);
+ nf_unregister_sockopt(&so_getorigdst);
+}
+
+int __init ip_conntrack_init(void)
+{
+ unsigned int i;
+ int ret;
+
+ /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
+ * machine has 256 buckets. 1GB machine has 8192 buckets. */
+ ip_conntrack_htable_size
+ = (((num_physpages << PAGE_SHIFT) / 16384)
+ / sizeof(struct list_head));
+ ip_conntrack_max = 8 * ip_conntrack_htable_size;
+
+ printk("ip_conntrack (%u buckets, %d max)\n",
+ ip_conntrack_htable_size, ip_conntrack_max);
+
+ ret = nf_register_sockopt(&so_getorigdst);
+ if (ret != 0)
+ return ret;
+
+ ip_conntrack_hash = vmalloc(sizeof(struct list_head)
+ * ip_conntrack_htable_size);
+ if (!ip_conntrack_hash) {
+ nf_unregister_sockopt(&so_getorigdst);
+ return -ENOMEM;
+ }
+
+ /* Don't NEED lock here, but good form anyway. */
+ WRITE_LOCK(&ip_conntrack_lock);
+ /* Sew in builtin protocols. */
+ list_append(&protocol_list, &ip_conntrack_protocol_tcp);
+ list_append(&protocol_list, &ip_conntrack_protocol_udp);
+ list_append(&protocol_list, &ip_conntrack_protocol_icmp);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+
+ for (i = 0; i < ip_conntrack_htable_size; i++)
+ INIT_LIST_HEAD(&ip_conntrack_hash[i]);
+
+/* This is fucking braindead. There is NO WAY of doing this without
+ the CONFIG_SYSCTL unless you don't want to detect errors.
+ Grrr... --RR */
+#ifdef CONFIG_SYSCTL
+ ip_conntrack_sysctl_header
+ = register_sysctl_table(ip_conntrack_root_table, 0);
+ if (ip_conntrack_sysctl_header == NULL) {
+ vfree(ip_conntrack_hash);
+ nf_unregister_sockopt(&so_getorigdst);
+ return -ENOMEM;
+ }
+#endif /*CONFIG_SYSCTL*/
+
+ ret = ip_conntrack_protocol_tcp_init();
+ if (ret != 0) {
+ unregister_sysctl_table(ip_conntrack_sysctl_header);
+ vfree(ip_conntrack_hash);
+ nf_unregister_sockopt(&so_getorigdst);
+ }
+
+ return ret;
+}
+
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
new file mode 100644
index 000000000..9137d13ea
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -0,0 +1,251 @@
+/* FTP extension for IP connection tracking. */
+#ifdef MODULE
+#define EXPORT_SYMTAB
+#endif
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv4/lockhelp.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
+
+DECLARE_LOCK(ip_ftp_lock);
+
+#define SERVER_STRING "227 Entering Passive Mode ("
+#define CLIENT_STRING "PORT "
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+#define IP_PARTS_NATIVE(n) \
+(unsigned int)((n)>>24)&0xFF, \
+(unsigned int)((n)>>16)&0xFF, \
+(unsigned int)((n)>>8)&0xFF, \
+(unsigned int)((n)&0xFF)
+
+#define IP_PARTS(n) IP_PARTS_NATIVE(ntohl(n))
+
+static struct {
+ const char *pattern;
+ size_t plen;
+ char term;
+} search[2] = {
+ [IP_CT_FTP_PORT] { CLIENT_STRING, sizeof(CLIENT_STRING) - 1, '\r' },
+ [IP_CT_FTP_PASV] { SERVER_STRING, sizeof(SERVER_STRING) - 1, ')' }
+};
+
+/* Returns 0, or length of numbers */
+static int try_number(const char *data, size_t dlen, u_int32_t array[6],
+ char term)
+{
+ u_int32_t i, len;
+
+ /* Keep data pointing at next char. */
+ for (i = 0, len = 0; len < dlen; len++, data++) {
+ if (*data >= '0' && *data <= '9') {
+ array[i] = array[i]*10 + *data - '0';
+ }
+ else if (*data == ',')
+ i++;
+ else {
+ /* Unexpected character; true if it's the
+ terminator and we're finished. */
+ if (*data == term && i == 5)
+ return len;
+
+ DEBUGP("Char %u (got %u nums) `%u' unexpected\n",
+ len, i, *data);
+ return 0;
+ }
+ }
+
+ return 0;
+}
+
+/* Return 1 for match, 0 for accept, -1 for partial. */
+static int find_pattern(const char *data, size_t dlen,
+ const char *pattern, size_t plen,
+ char term,
+ unsigned int *numoff,
+ unsigned int *numlen,
+ u_int32_t array[6])
+{
+ if (dlen == 0)
+ return 0;
+
+ if (dlen < plen) {
+ /* Short packet: try for partial? */
+ if (strnicmp(data, pattern, dlen) == 0)
+ return -1;
+ else return 0;
+ }
+
+ if (strnicmp(data, pattern, plen) != 0) {
+#if 0
+ size_t i;
+
+ DEBUGP("ftp: string mismatch\n");
+ for (i = 0; i < plen; i++) {
+ DEBUGFTP("ftp:char %u `%c'(%u) vs `%c'(%u)\n",
+ i, data[i], data[i],
+ pattern[i], pattern[i]);
+ }
+#endif
+ return 0;
+ }
+
+ *numoff = plen;
+ *numlen = try_number(data + plen, dlen - plen, array, term);
+ if (!*numlen)
+ return -1;
+
+ return 1;
+}
+
+/* FIXME: This should be in userspace. Later. */
+static int help(const struct iphdr *iph, size_t len,
+ struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ /* tcplen not negative guarenteed by ip_conntrack_tcp.c */
+ struct tcphdr *tcph = (void *)iph + iph->ihl * 4;
+ const char *data = (const char *)tcph + tcph->doff * 4;
+ unsigned int tcplen = len - iph->ihl * 4;
+ unsigned int datalen = tcplen - tcph->doff * 4;
+ u_int32_t old_seq_aft_nl;
+ int old_seq_aft_nl_set;
+ u_int32_t array[6] = { 0 };
+ int dir = CTINFO2DIR(ctinfo);
+ unsigned int matchlen, matchoff;
+ struct ip_conntrack_tuple t;
+ struct ip_ct_ftp *info = &ct->help.ct_ftp_info;
+
+ /* Can't track connections formed before we registered */
+ if (!info)
+ return NF_ACCEPT;
+
+ /* Until there's been traffic both ways, don't look in packets. */
+ if (ctinfo != IP_CT_ESTABLISHED
+ && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) {
+ DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo);
+ return NF_ACCEPT;
+ }
+
+ /* Not whole TCP header? */
+ if (tcplen < sizeof(struct tcphdr) || tcplen < tcph->doff*4) {
+ DEBUGP("ftp: tcplen = %u\n", (unsigned)tcplen);
+ return NF_ACCEPT;
+ }
+
+ /* Checksum invalid? Ignore. */
+ /* FIXME: Source route IP option packets --RR */
+ if (tcp_v4_check(tcph, tcplen, iph->saddr, iph->daddr,
+ csum_partial((char *)tcph, tcplen, 0))) {
+ DEBUGP("ftp_help: bad csum: %p %u %u.%u.%u.%u %u.%u.%u.%u\n",
+ tcph, tcplen, IP_PARTS(iph->saddr),
+ IP_PARTS(iph->daddr));
+ return NF_ACCEPT;
+ }
+
+ LOCK_BH(&ip_ftp_lock);
+ old_seq_aft_nl_set = info->seq_aft_nl_set[dir];
+ old_seq_aft_nl = info->seq_aft_nl[dir];
+
+ DEBUGP("conntrack_ftp: datalen %u\n", datalen);
+ if ((datalen > 0) && (data[datalen-1] == '\n')) {
+ DEBUGP("conntrack_ftp: datalen %u ends in \\n\n", datalen);
+ if (!old_seq_aft_nl_set
+ || after(ntohl(tcph->seq) + datalen, old_seq_aft_nl)) {
+ DEBUGP("conntrack_ftp: updating nl to %u\n",
+ ntohl(tcph->seq) + datalen);
+ info->seq_aft_nl[dir] = ntohl(tcph->seq) + datalen;
+ info->seq_aft_nl_set[dir] = 1;
+ }
+ }
+ UNLOCK_BH(&ip_ftp_lock);
+
+ if(!old_seq_aft_nl_set ||
+ (ntohl(tcph->seq) != old_seq_aft_nl)) {
+ DEBUGP("ip_conntrack_ftp_help: wrong seq pos %s(%u)\n",
+ old_seq_aft_nl_set ? "":"(UNSET) ", old_seq_aft_nl);
+ return NF_ACCEPT;
+ }
+
+ switch (find_pattern(data, datalen,
+ search[dir].pattern,
+ search[dir].plen, search[dir].term,
+ &matchoff, &matchlen,
+ array)) {
+ case -1: /* partial */
+ /* We don't usually drop packets. After all, this is
+ connection tracking, not packet filtering.
+ However, it is neccessary for accurate tracking in
+ this case. */
+ DEBUGP("conntrack_ftp: partial `%.*s'\n",
+ (int)datalen, data);
+ return NF_DROP;
+
+ case 0: /* no match */
+ DEBUGP("ip_conntrack_ftp_help: no match\n");
+ return NF_ACCEPT;
+ }
+
+ DEBUGP("conntrack_ftp: match `%.*s' (%u bytes at %u)\n",
+ (int)matchlen, data + matchoff,
+ matchlen, ntohl(tcph->seq) + matchoff);
+
+ /* Update the ftp info */
+ LOCK_BH(&ip_ftp_lock);
+ info->is_ftp = 1;
+ info->seq = ntohl(tcph->seq) + matchoff;
+ info->len = matchlen;
+ info->ftptype = dir;
+ info->port = array[4] << 8 | array[5];
+
+ t = ((struct ip_conntrack_tuple)
+ { { ct->tuplehash[!dir].tuple.src.ip,
+ { 0 }, 0 },
+ { htonl((array[0] << 24) | (array[1] << 16)
+ | (array[2] << 8) | array[3]),
+ { htons(array[4] << 8 | array[5]) },
+ IPPROTO_TCP }});
+ ip_conntrack_expect_related(ct, &t);
+ UNLOCK_BH(&ip_ftp_lock);
+
+ return NF_ACCEPT;
+}
+
+/* Returns TRUE if it wants to help this connection (tuple is the
+ tuple of REPLY packets from server). */
+static int ftp_will_help(const struct ip_conntrack_tuple *rtuple)
+{
+ return (rtuple->dst.protonum == IPPROTO_TCP
+ && rtuple->src.u.tcp.port == __constant_htons(21));
+}
+
+static struct ip_conntrack_helper ftp = { { NULL, NULL },
+ ftp_will_help,
+ help };
+
+static int __init init(void)
+{
+ return ip_conntrack_helper_register(&ftp);
+}
+
+static void __exit fini(void)
+{
+ ip_conntrack_helper_unregister(&ftp);
+}
+
+struct module *ip_conntrack_ftp = THIS_MODULE;
+EXPORT_SYMBOL(ip_conntrack_ftp);
+EXPORT_SYMBOL(ip_ftp_lock);
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_generic.c b/net/ipv4/netfilter/ip_conntrack_proto_generic.c
new file mode 100644
index 000000000..77a491e34
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_generic.c
@@ -0,0 +1,60 @@
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+
+#define GENERIC_TIMEOUT (3600*HZ)
+
+/* Unknown protocols carry no per-protocol tuple data: zero both
+   union fields so generic tuples hash and compare consistently. */
+static int generic_pkt_to_tuple(const void *datah, size_t datalen,
+				struct ip_conntrack_tuple *tuple)
+{
+	tuple->src.u.all = 0;
+	tuple->dst.u.all = 0;
+
+	return 1;
+}
+
+static int generic_invert_tuple(struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack_tuple *orig)
+{
+ tuple->src.u.all = 0;
+ tuple->dst.u.all = 0;
+
+ return 1;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static unsigned int generic_print_tuple(char *buffer,
+ const struct ip_conntrack_tuple *tuple)
+{
+ return 0;
+}
+
+/* Print out the private part of the conntrack. */
+static unsigned int generic_print_conntrack(char *buffer,
+ const struct ip_conntrack *state)
+{
+ return 0;
+}
+
+/* Returns verdict for packet, or -1 for invalid. */
+static int established(struct ip_conntrack *conntrack,
+		       struct iphdr *iph, size_t len,
+		       enum ip_conntrack_info conntrackinfo)
+{
+	/* Nothing protocol-specific to track: any packet simply
+	   restarts the (long) generic idle timeout. */
+	ip_ct_refresh(conntrack, GENERIC_TIMEOUT);
+	return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static int new(struct ip_conntrack *conntrack, struct iphdr *iph, size_t len)
+{
+ return 1;
+}
+
+struct ip_conntrack_protocol ip_conntrack_generic_protocol
+= { { NULL, NULL }, 0, "unknown",
+ generic_pkt_to_tuple, generic_invert_tuple, generic_print_tuple,
+ generic_print_conntrack, established, new, NULL };
+
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
new file mode 100644
index 000000000..1d1256be5
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -0,0 +1,111 @@
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/icmp.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+
+#define ICMP_TIMEOUT (30*HZ)
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static int icmp_pkt_to_tuple(const void *datah, size_t datalen,
+ struct ip_conntrack_tuple *tuple)
+{
+ const struct icmphdr *hdr = datah;
+
+ tuple->dst.u.icmp.type = hdr->type;
+ tuple->src.u.icmp.id = hdr->un.echo.id;
+ tuple->dst.u.icmp.code = hdr->code;
+
+ return 1;
+}
+
+/* Build the tuple that a reply to `orig' would have: map a request
+   type to its reply type (and vice versa) via invmap.  Fails (returns
+   0) for ICMP types that have no request/reply pairing. */
+static int icmp_invert_tuple(struct ip_conntrack_tuple *tuple,
+			     const struct ip_conntrack_tuple *orig)
+{
+	/* Add 1; spaces filled with 0. */
+	/* (The +1 lets a zero entry mean "no mapping", since
+	   ICMP_ECHOREPLY itself is type 0.) */
+	static u_int8_t invmap[]
+		= { [ICMP_ECHO] = ICMP_ECHOREPLY + 1,
+		    [ICMP_ECHOREPLY] = ICMP_ECHO + 1,
+		    [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1,
+		    [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1,
+		    [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1,
+		    [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1,
+		    [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1,
+		    [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1};
+
+	/* Bounds-check before indexing: types past the table, or with a
+	   zero entry, are not invertible. */
+	if (orig->dst.u.icmp.type >= sizeof(invmap)
+	    || !invmap[orig->dst.u.icmp.type])
+		return 0;
+
+	tuple->src.u.icmp.id = orig->src.u.icmp.id;
+	tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1;
+	tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
+	return 1;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static unsigned int icmp_print_tuple(char *buffer,
+ const struct ip_conntrack_tuple *tuple)
+{
+ return sprintf(buffer, "type=%u code=%u id=%u ",
+ tuple->dst.u.icmp.type,
+ tuple->dst.u.icmp.code,
+ ntohs(tuple->src.u.icmp.id));
+}
+
+/* Print out the private part of the conntrack. */
+static unsigned int icmp_print_conntrack(char *buffer,
+ const struct ip_conntrack *conntrack)
+{
+ return 0;
+}
+
+/* Returns verdict for packet, or -1 for invalid. */
+static int icmp_packet(struct ip_conntrack *ct,
+		       struct iphdr *iph, size_t len,
+		       enum ip_conntrack_info ctinfo)
+{
+	/* FIXME: Should keep count of orig - reply packets: if == 0,
+	   destroy --RR */
+	/* Delete connection immediately on reply: won't actually
+	   vanish as we still have skb */
+	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
+		/* Expire now: if del_timer succeeds we own the timer
+		   reference, so invoke the expiry handler directly. */
+		if (del_timer(&ct->timeout))
+			ct->timeout.function((unsigned long)ct);
+	} else
+		/* Request direction: keep the entry alive a while longer. */
+		ip_ct_refresh(ct, ICMP_TIMEOUT);
+
+	return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static int icmp_new(struct ip_conntrack *conntrack,
+		    struct iphdr *iph, size_t len)
+{
+	/* Only request types may open a new ICMP "connection";
+	   replies and errors without a matching request are refused. */
+	static u_int8_t valid_new[]
+		= { [ICMP_ECHO] = 1,
+		    [ICMP_TIMESTAMP] = 1,
+		    [ICMP_INFO_REQUEST] = 1,
+		    [ICMP_ADDRESS] = 1 };
+
+	/* Bounds-check before indexing the sparse table. */
+	if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new)
+	    || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) {
+		/* Can't create a new ICMP `conn' with this. */
+		DEBUGP("icmp: can't create new conn with type %u\n",
+		       conntrack->tuplehash[0].tuple.dst.u.icmp.type);
+		DUMP_TUPLE(&conntrack->tuplehash[0].tuple);
+		return 0;
+	}
+	return 1;
+}
+
+struct ip_conntrack_protocol ip_conntrack_protocol_icmp
+= { { NULL, NULL }, IPPROTO_ICMP, "icmp",
+ icmp_pkt_to_tuple, icmp_invert_tuple, icmp_print_tuple,
+ icmp_print_conntrack, icmp_packet, icmp_new, NULL };
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
new file mode 100644
index 000000000..3dd448252
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -0,0 +1,227 @@
+#define __NO_VERSION__
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/lockhelp.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* Protects conntrack->proto.tcp_state */
+static DECLARE_RWLOCK(tcp_lock);
+
+/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
+ closely. They're more complex. --RR */
+
+/* Actually, I believe that neither ipmasq (where this code is stolen
+ from) nor ipfilter do it exactly right. A new conntrack machine taking
+ into account packet loss (which creates uncertainty as to exactly
+ the conntrack of the connection) is required. RSN. --RR */
+enum tcp_conntrack {
+ TCP_CONNTRACK_NONE,
+ TCP_CONNTRACK_ESTABLISHED,
+ TCP_CONNTRACK_SYN_SENT,
+ TCP_CONNTRACK_SYN_RECV,
+ TCP_CONNTRACK_FIN_WAIT,
+ TCP_CONNTRACK_TIME_WAIT,
+ TCP_CONNTRACK_CLOSE,
+ TCP_CONNTRACK_CLOSE_WAIT,
+ TCP_CONNTRACK_LAST_ACK,
+ TCP_CONNTRACK_LISTEN,
+ TCP_CONNTRACK_MAX
+};
+
+static const char *tcp_conntrack_names[] = {
+ "NONE",
+ "ESTABLISHED",
+ "SYN_SENT",
+ "SYN_RECV",
+ "FIN_WAIT",
+ "TIME_WAIT",
+ "CLOSE",
+ "CLOSE_WAIT",
+ "LAST_ACK",
+ "LISTEN"
+};
+
+#define SECS *HZ
+#define MINS * 60 SECS
+#define HOURS * 60 MINS
+#define DAYS * 24 HOURS
+
+
+static unsigned long tcp_timeouts[]
+= { 30 MINS, /* TCP_CONNTRACK_NONE, */
+ 5 DAYS, /* TCP_CONNTRACK_ESTABLISHED, */
+ 2 MINS, /* TCP_CONNTRACK_SYN_SENT, */
+ 60 SECS, /* TCP_CONNTRACK_SYN_RECV, */
+ 2 MINS, /* TCP_CONNTRACK_FIN_WAIT, */
+ 2 MINS, /* TCP_CONNTRACK_TIME_WAIT, */
+ 10 SECS, /* TCP_CONNTRACK_CLOSE, */
+ 60 SECS, /* TCP_CONNTRACK_CLOSE_WAIT, */
+ 30 SECS, /* TCP_CONNTRACK_LAST_ACK, */
+ 2 MINS, /* TCP_CONNTRACK_LISTEN, */
+};
+
+#define sNO TCP_CONNTRACK_NONE
+#define sES TCP_CONNTRACK_ESTABLISHED
+#define sSS TCP_CONNTRACK_SYN_SENT
+#define sSR TCP_CONNTRACK_SYN_RECV
+#define sFW TCP_CONNTRACK_FIN_WAIT
+#define sTW TCP_CONNTRACK_TIME_WAIT
+#define sCL TCP_CONNTRACK_CLOSE
+#define sCW TCP_CONNTRACK_CLOSE_WAIT
+#define sLA TCP_CONNTRACK_LAST_ACK
+#define sLI TCP_CONNTRACK_LISTEN
+#define sIV TCP_CONNTRACK_MAX
+
+static enum tcp_conntrack tcp_conntracks[2][5][TCP_CONNTRACK_MAX] = {
+ {
+/* ORIGINAL */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI */
+/*syn*/ {sSS, sES, sSS, sES, sSS, sSS, sSS, sSS, sSS, sLI },
+/*fin*/ {sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI },
+/*ack*/ {sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sES },
+/*rst*/ {sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL },
+/*none*/{sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
+ },
+ {
+/* REPLY */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI */
+/*syn*/ {sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR },
+/*fin*/ {sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI },
+/*ack*/ {sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI },
+/*rst*/ {sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI },
+/*none*/{sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
+ }
+};
+
+static int tcp_pkt_to_tuple(const void *datah, size_t datalen,
+ struct ip_conntrack_tuple *tuple)
+{
+ const struct tcphdr *hdr = datah;
+
+ tuple->src.u.tcp.port = hdr->source;
+ tuple->dst.u.tcp.port = hdr->dest;
+
+ return 1;
+}
+
+static int tcp_invert_tuple(struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack_tuple *orig)
+{
+ tuple->src.u.tcp.port = orig->dst.u.tcp.port;
+ tuple->dst.u.tcp.port = orig->src.u.tcp.port;
+ return 1;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static unsigned int tcp_print_tuple(char *buffer,
+ const struct ip_conntrack_tuple *tuple)
+{
+ return sprintf(buffer, "sport=%hu dport=%hu ",
+ ntohs(tuple->src.u.tcp.port),
+ ntohs(tuple->dst.u.tcp.port));
+}
+
+/* Print out the private part of the conntrack. */
+static unsigned int tcp_print_conntrack(char *buffer,
+ const struct ip_conntrack *conntrack)
+{
+ enum tcp_conntrack state;
+
+ READ_LOCK(&tcp_lock);
+ state = conntrack->proto.tcp_state;
+ READ_UNLOCK(&tcp_lock);
+
+ return sprintf(buffer, "%s ", tcp_conntrack_names[state]);
+}
+
+/* Map a TCP header's flags onto a row of the tcp_conntracks state
+   table: 0=syn, 1=fin, 2=ack, 3=rst, 4=none.  RST takes priority,
+   then SYN, FIN, ACK. */
+static unsigned int get_conntrack_index(const struct tcphdr *tcph)
+{
+	if (tcph->rst) return 3;
+	else if (tcph->syn) return 0;
+	else if (tcph->fin) return 1;
+	else if (tcph->ack) return 2;
+	else return 4;
+}
+
+/* Returns verdict for packet, or -1 for invalid. */
+static int tcp_packet(struct ip_conntrack *conntrack,
+		      struct iphdr *iph, size_t len,
+		      enum ip_conntrack_info ctinfo)
+{
+	enum tcp_conntrack newconntrack;
+	/* TCP header starts ihl 32-bit words into the IP packet. */
+	struct tcphdr *tcph = (struct tcphdr *)((u_int32_t *)iph + iph->ihl);
+
+	/* We're guaranteed to have the base header, but maybe not the
+	   options. */
+	if (len < (iph->ihl + tcph->doff) * 4) {
+		DEBUGP("ip_conntrack_tcp: Truncated packet.\n");
+		return -1;
+	}
+
+	/* Look up the next state from (direction, flag index, current
+	   state) under the lock protecting proto.tcp_state. */
+	WRITE_LOCK(&tcp_lock);
+	newconntrack
+		= tcp_conntracks
+		[CTINFO2DIR(ctinfo)]
+		[get_conntrack_index(tcph)][conntrack->proto.tcp_state];
+
+	/* Invalid */
+	if (newconntrack == TCP_CONNTRACK_MAX) {
+		DEBUGP("ip_conntrack_tcp: Invalid dir=%i index=%u conntrack=%u\n",
+		       CTINFO2DIR(ctinfo), get_conntrack_index(tcph),
+		       conntrack->proto.tcp_state);
+		WRITE_UNLOCK(&tcp_lock);
+		return -1;
+	}
+
+	conntrack->proto.tcp_state = newconntrack;
+	WRITE_UNLOCK(&tcp_lock);
+
+	/* Refresh: need write lock to write to conntrack. */
+	/* NOTE(review): tcp_state is re-read here after dropping
+	   tcp_lock; a concurrent packet could change it in between,
+	   giving the timeout of the newer state -- presumably benign,
+	   but worth confirming. */
+	ip_ct_refresh(conntrack, tcp_timeouts[conntrack->proto.tcp_state]);
+	return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static int tcp_new(struct ip_conntrack *conntrack,
+		   struct iphdr *iph, size_t len)
+{
+	enum tcp_conntrack newconntrack;
+	/* TCP header starts ihl 32-bit words into the IP packet. */
+	struct tcphdr *tcph = (struct tcphdr *)((u_int32_t *)iph + iph->ihl);
+
+	/* Don't need lock here: this conntrack not in circulation yet */
+	/* First packet is by definition ORIGINAL direction, state NONE. */
+	newconntrack
+		= tcp_conntracks[0][get_conntrack_index(tcph)]
+		[TCP_CONNTRACK_NONE];
+
+	/* Invalid: delete conntrack */
+	if (newconntrack == TCP_CONNTRACK_MAX) {
+		DEBUGP("ip_conntrack_tcp: invalid new deleting.\n");
+		return 0;
+	} else {
+		conntrack->proto.tcp_state = newconntrack;
+		ip_ct_refresh(conntrack, tcp_timeouts[conntrack->proto.tcp_state]);
+	}
+	return 1;
+}
+
+struct ip_conntrack_protocol ip_conntrack_protocol_tcp
+= { { NULL, NULL }, IPPROTO_TCP, "tcp",
+ tcp_pkt_to_tuple, tcp_invert_tuple, tcp_print_tuple, tcp_print_conntrack,
+ tcp_packet, tcp_new, NULL };
+
+int __init ip_conntrack_protocol_tcp_init(void)
+{
+ return 0;
+}
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
new file mode 100644
index 000000000..688ae10fb
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -0,0 +1,65 @@
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/udp.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+
+#define UDP_TIMEOUT (60*HZ)
+
+static int udp_pkt_to_tuple(const void *datah, size_t datalen,
+ struct ip_conntrack_tuple *tuple)
+{
+ const struct udphdr *hdr = datah;
+
+ tuple->src.u.udp.port = hdr->source;
+ tuple->dst.u.udp.port = hdr->dest;
+
+ return 1;
+}
+
+static int udp_invert_tuple(struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack_tuple *orig)
+{
+ tuple->src.u.udp.port = orig->dst.u.udp.port;
+ tuple->dst.u.udp.port = orig->src.u.udp.port;
+ return 1;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static unsigned int udp_print_tuple(char *buffer,
+ const struct ip_conntrack_tuple *tuple)
+{
+ return sprintf(buffer, "sport=%hu dport=%hu ",
+ ntohs(tuple->src.u.udp.port),
+ ntohs(tuple->dst.u.udp.port));
+}
+
+/* Print out the private part of the conntrack. */
+static unsigned int udp_print_conntrack(char *buffer,
+ const struct ip_conntrack *conntrack)
+{
+ return 0;
+}
+
+/* Returns verdict for packet, and may modify conntracktype */
+static int udp_packet(struct ip_conntrack *conntrack,
+		      struct iphdr *iph, size_t len,
+		      enum ip_conntrack_info conntrackinfo)
+{
+	/* Refresh. */
+	/* UDP is stateless: every packet just restarts the idle timer. */
+	ip_ct_refresh(conntrack, UDP_TIMEOUT);
+	return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static int udp_new(struct ip_conntrack *conntrack,
+ struct iphdr *iph, size_t len)
+{
+ return 1;
+}
+
+struct ip_conntrack_protocol ip_conntrack_protocol_udp
+= { { NULL, NULL }, IPPROTO_UDP, "udp",
+ udp_pkt_to_tuple, udp_invert_tuple, udp_print_tuple, udp_print_conntrack,
+ udp_packet, udp_new, NULL };
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
new file mode 100644
index 000000000..ce79c3263
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -0,0 +1,297 @@
+/* This file contains all the functions required for the standalone
+ ip_conntrack module.
+
+ These are not required by the compatibility layer.
+*/
+
+/* (c) 1999 Paul `Rusty' Russell. Licenced under the GNU General
+ Public Licence. */
+
+#ifdef MODULE
+#define EXPORT_SYMTAB
+#endif
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/version.h>
+#include <net/checksum.h>
+
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
+
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+struct module *ip_conntrack_module = THIS_MODULE;
+
+static unsigned int
+print_tuple(char *buffer, const struct ip_conntrack_tuple *tuple,
+ struct ip_conntrack_protocol *proto)
+{
+ int len;
+
+ len = sprintf(buffer, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ",
+ NIPQUAD(tuple->src.ip), NIPQUAD(tuple->dst.ip));
+
+ len += proto->print_tuple(buffer + len, tuple);
+
+ return len;
+}
+
+/* FIXME: Don't print source proto part. --RR */
+static unsigned int
+print_expect(char *buffer, const struct ip_conntrack_expect *expect)
+{
+ unsigned int len;
+
+ len = sprintf(buffer, "EXPECTING: proto=%u ",
+ expect->tuple.dst.protonum);
+ len += print_tuple(buffer + len, &expect->tuple,
+ __find_proto(expect->tuple.dst.protonum));
+ len += sprintf(buffer + len, "\n");
+ return len;
+}
+
+static unsigned int
+print_conntrack(char *buffer, const struct ip_conntrack *conntrack)
+{
+ unsigned int len;
+ struct ip_conntrack_protocol *proto
+ = __find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.protonum);
+
+ len = sprintf(buffer, "%-8s %u %lu ",
+ proto->name,
+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.protonum,
+ timer_pending(&conntrack->timeout)
+ ? (conntrack->timeout.expires - jiffies)/HZ : 0);
+
+ len += proto->print_conntrack(buffer + len, conntrack);
+ len += print_tuple(buffer + len,
+ &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ proto);
+ if (!(conntrack->status & IPS_SEEN_REPLY))
+ len += sprintf(buffer + len, "[UNREPLIED] ");
+ len += print_tuple(buffer + len,
+ &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple,
+ proto);
+ len += sprintf(buffer + len, "\n");
+
+ return len;
+}
+
+/* Returns true when finished. */
+static int
+conntrack_iterate(const struct ip_conntrack_tuple_hash *hash,
+ char *buffer, off_t offset, off_t *upto,
+ unsigned int *len, unsigned int maxlen)
+{
+ unsigned int newlen;
+ IP_NF_ASSERT(hash->ctrack);
+
+ MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+
+ /* Only count originals */
+ if (DIRECTION(hash))
+ return 0;
+
+ if ((*upto)++ < offset)
+ return 0;
+
+ newlen = print_conntrack(buffer + *len, hash->ctrack);
+ if (*len + newlen > maxlen)
+ return 1;
+ else *len += newlen;
+
+ return 0;
+}
+
+static int
+list_conntracks(char *buffer, char **start, off_t offset, int length)
+{
+ unsigned int i;
+ unsigned int len = 0;
+ off_t upto = 0;
+ struct list_head *e;
+
+ READ_LOCK(&ip_conntrack_lock);
+ /* Traverse hash; print originals then reply. */
+ for (i = 0; i < ip_conntrack_htable_size; i++) {
+ if (LIST_FIND(&ip_conntrack_hash[i], conntrack_iterate,
+ struct ip_conntrack_tuple_hash *,
+ buffer, offset, &upto, &len, length))
+ goto finished;
+ }
+
+ /* Now iterate through expecteds. */
+ for (e = expect_list.next; e != &expect_list; e = e->next) {
+ unsigned int last_len;
+ struct ip_conntrack_expect *expect
+ = (struct ip_conntrack_expect *)e;
+ if (upto++ < offset) continue;
+
+ last_len = len;
+ len += print_expect(buffer + len, expect);
+ if (len > length) {
+ len = last_len;
+ goto finished;
+ }
+ }
+
+ finished:
+ READ_UNLOCK(&ip_conntrack_lock);
+
+ /* `start' hack - see fs/proc/generic.c line ~165 */
+ *start = (char *)((unsigned int)upto - offset);
+ return len;
+}
+
+/* POST_ROUTING hook: re-fragment packets that conntrack defragmented
+   at LOCAL_OUT and that are now too big for the outgoing route's MTU.
+   Returns NF_STOLEN when the packet is handed to ip_fragment(). */
+static unsigned int ip_refrag(unsigned int hooknum,
+			      struct sk_buff **pskb,
+			      const struct net_device *in,
+			      const struct net_device *out,
+			      int (*okfn)(struct sk_buff *))
+{
+	struct rtable *rt = (struct rtable *)(*pskb)->dst;
+
+	/* Local packets are never produced too large for their
+	   interface.  We defragment them at LOCAL_OUT, however,
+	   so we have to refragment them here. */
+	if ((*pskb)->len > rt->u.dst.pmtu) {
+		DEBUGP("ip_conntrack: refragm %p (size %u) to %u (okfn %p)\n",
+		       *pskb, (*pskb)->len, rt->u.dst.pmtu, okfn);
+		/* No hook can be after us, so this should be OK. */
+		ip_fragment(*pskb, okfn);
+		return NF_STOLEN;
+	}
+	return NF_ACCEPT;
+}
+
+/* Connection tracking may drop packets, but never alters them, so
+ make it the first hook. */
+static struct nf_hook_ops ip_conntrack_in_ops
+= { { NULL, NULL }, ip_conntrack_in, PF_INET, NF_IP_PRE_ROUTING,
+ NF_IP_PRI_CONNTRACK };
+static struct nf_hook_ops ip_conntrack_local_out_ops
+= { { NULL, NULL }, ip_conntrack_in, PF_INET, NF_IP_LOCAL_OUT,
+ NF_IP_PRI_CONNTRACK };
+/* Refragmenter; last chance. */
+static struct nf_hook_ops ip_conntrack_out_ops
+= { { NULL, NULL }, ip_refrag, PF_INET, NF_IP_POST_ROUTING, NF_IP_PRI_LAST };
+
+/* Combined module init (init=1) / teardown (init=0).  The labels form
+   a fall-through ladder: full cleanup enters at `cleanup' and undoes
+   every step; a failed init jumps in part-way to undo only what
+   succeeded. */
+static int init_or_cleanup(int init)
+{
+	int ret = 0;
+
+	if (!init) goto cleanup;
+
+	ret = ip_conntrack_init();
+	if (ret < 0)
+		goto cleanup_nothing;
+
+	proc_net_create("ip_conntrack",0,list_conntracks);
+	ret = nf_register_hook(&ip_conntrack_in_ops);
+	if (ret < 0) {
+		printk("ip_conntrack: can't register in hook.\n");
+		goto cleanup_init;
+	}
+	ret = nf_register_hook(&ip_conntrack_local_out_ops);
+	if (ret < 0) {
+		printk("ip_conntrack: can't register local out hook.\n");
+		goto cleanup_inops;
+	}
+	ret = nf_register_hook(&ip_conntrack_out_ops);
+	if (ret < 0) {
+		printk("ip_conntrack: can't register post-routing hook.\n");
+		goto cleanup_inandlocalops;
+	}
+
+	return ret;
+
+	/* Teardown: deliberate fall-through, reverse order of setup. */
+ cleanup:
+	nf_unregister_hook(&ip_conntrack_out_ops);
+ cleanup_inandlocalops:
+	nf_unregister_hook(&ip_conntrack_local_out_ops);
+ cleanup_inops:
+	nf_unregister_hook(&ip_conntrack_in_ops);
+ cleanup_init:
+	proc_net_remove("ip_conntrack");
+	ip_conntrack_cleanup();
+ cleanup_nothing:
+	return ret;
+}
+
+/* FIXME: Allow NULL functions and sub in pointers to generic for
+   them. --RR */
+/* Register a per-protocol conntrack handler.  Returns -EBUSY if a
+   handler for the same protocol number already exists; on success the
+   handler is prepended (so it shadows later entries) and the module
+   refcount is bumped. */
+int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto)
+{
+	int ret = 0;
+	struct list_head *i;
+
+	WRITE_LOCK(&ip_conntrack_lock);
+	/* Scan for a duplicate protocol number before inserting. */
+	for (i = protocol_list.next; i != &protocol_list; i = i->next) {
+		if (((struct ip_conntrack_protocol *)i)->proto
+		    == proto->proto) {
+			ret = -EBUSY;
+			goto out;
+		}
+	}
+
+	list_prepend(&protocol_list, proto);
+	MOD_INC_USE_COUNT;
+
+ out:
+	WRITE_UNLOCK(&ip_conntrack_lock);
+	return ret;
+}
+
+/* FIXME: Implement this --RR */
+#if 0
+void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto)
+{
+}
+#endif
+
+static int __init init(void)
+{
+ return init_or_cleanup(1);
+}
+
+static void __exit fini(void)
+{
+ init_or_cleanup(0);
+}
+
+module_init(init);
+module_exit(fini);
+
+EXPORT_SYMBOL(ip_conntrack_protocol_register);
+EXPORT_SYMBOL(invert_tuplepr);
+EXPORT_SYMBOL(ip_conntrack_alter_reply);
+EXPORT_SYMBOL(ip_conntrack_destroyed);
+EXPORT_SYMBOL(ip_conntrack_get);
+EXPORT_SYMBOL(ip_conntrack_module);
+EXPORT_SYMBOL(ip_conntrack_helper_register);
+EXPORT_SYMBOL(ip_conntrack_helper_unregister);
+EXPORT_SYMBOL(ip_conntrack_lock);
+EXPORT_SYMBOL(find_proto);
+EXPORT_SYMBOL(get_tuple);
+EXPORT_SYMBOL(ip_ct_selective_cleanup);
+EXPORT_SYMBOL(ip_ct_refresh);
+EXPORT_SYMBOL(ip_conntrack_expect_related);
+EXPORT_SYMBOL(ip_conntrack_tuple_taken);
+EXPORT_SYMBOL(ip_ct_gather_frags);
diff --git a/net/ipv4/netfilter/ip_fw_compat.c b/net/ipv4/netfilter/ip_fw_compat.c
new file mode 100644
index 000000000..72dc3d816
--- /dev/null
+++ b/net/ipv4/netfilter/ip_fw_compat.c
@@ -0,0 +1,238 @@
+/* Compatibility framework for ipchains and ipfwadm support; designed
+ to look as much like the 2.2 infrastructure as possible. */
+struct notifier_block;
+
+#include <linux/netfilter_ipv4.h>
+#include <linux/ip.h>
+#include <net/icmp.h>
+#include <linux/if.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/netfilter_ipv4/compat_firewall.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+
+EXPORT_NO_SYMBOLS;
+
+static struct firewall_ops *fwops;
+
+/* From ip_fw_compat_redir.c */
+extern unsigned int
+do_redirect(struct sk_buff *skb,
+ const struct net_device *dev,
+ u_int16_t redirpt);
+
+extern void
+check_for_redirect(struct sk_buff *skb);
+
+extern void
+check_for_unredirect(struct sk_buff *skb);
+
+/* From ip_fw_compat_masq.c */
+extern unsigned int
+do_masquerade(struct sk_buff **pskb, const struct net_device *dev);
+
+extern unsigned int
+check_for_demasq(struct sk_buff **pskb);
+
+extern int __init masq_init(void);
+extern void masq_cleanup(void);
+
+/* They call these; we do what they want. */
+int register_firewall(int pf, struct firewall_ops *fw)
+{
+ if (pf != PF_INET) {
+ printk("Attempt to register non-IP firewall module.\n");
+ return -EINVAL;
+ }
+ if (fwops) {
+ printk("Attempt to register multiple firewall modules.\n");
+ return -EBUSY;
+ }
+
+ fwops = fw;
+ return 0;
+}
+
+int unregister_firewall(int pf, struct firewall_ops *fw)
+{
+ fwops = NULL;
+ return 0;
+}
+
+/* Single netfilter hook body shared by the PRE_ROUTING, FORWARD and
+   POST_ROUTING registrations: dispatch the packet to the 2.2-style
+   firewall ops (fwops), then translate the FW_* result into an NF_*
+   verdict, handling masquerade/redirect side effects. */
+static unsigned int
+fw_in(unsigned int hooknum,
+      struct sk_buff **pskb,
+      const struct net_device *in,
+      const struct net_device *out,
+      int (*okfn)(struct sk_buff *))
+{
+	int ret = FW_BLOCK;
+	u_int16_t redirpt;
+
+	/* Assume worst case: any hook could change packet */
+	(*pskb)->nfcache |= NFC_UNKNOWN;
+	(*pskb)->ip_summed = CHECKSUM_NONE;
+
+	switch (hooknum) {
+	case NF_IP_PRE_ROUTING:
+		if (fwops->fw_acct_in)
+			fwops->fw_acct_in(fwops, PF_INET,
+					  (struct net_device *)in,
+					  (*pskb)->nh.raw, &redirpt, pskb);
+
+		/* Defragment before the input chain sees the packet. */
+		if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
+			*pskb = ip_ct_gather_frags(*pskb);
+
+			if (!*pskb)
+				return NF_STOLEN;
+		}
+
+		ret = fwops->fw_input(fwops, PF_INET, (struct net_device *)in,
+				      (*pskb)->nh.raw, &redirpt, pskb);
+		break;
+
+	case NF_IP_FORWARD:
+		/* Connection will only be set if it was
+		   demasqueraded: if so, skip forward chain. */
+		if ((*pskb)->nfct)
+			ret = FW_ACCEPT;
+		else ret = fwops->fw_forward(fwops, PF_INET,
+					     (struct net_device *)out,
+					     (*pskb)->nh.raw, &redirpt, pskb);
+		break;
+
+	case NF_IP_POST_ROUTING:
+		ret = fwops->fw_output(fwops, PF_INET,
+				       (struct net_device *)out,
+				       (*pskb)->nh.raw, &redirpt, pskb);
+		if (fwops->fw_acct_out && (ret == FW_ACCEPT || ret == FW_SKIP))
+			fwops->fw_acct_out(fwops, PF_INET,
+					   (struct net_device *)in,
+					   (*pskb)->nh.raw, &redirpt, pskb);
+		break;
+	}
+
+	/* Translate FW_* verdict to an NF_* one. */
+	switch (ret) {
+	case FW_REJECT: {
+		/* Alexey says:
+		 *
+		 * Generally, routing is THE FIRST thing to make, when
+		 * packet enters IP stack. Before packet is routed you
+		 * cannot call any service routines from IP stack. */
+		struct iphdr *iph = (*pskb)->nh.iph;
+
+		/* Route first (if not already routed) so icmp_send can
+		   reply; then drop either way. */
+		if ((*pskb)->dst != NULL
+		    || ip_route_input(*pskb, iph->daddr, iph->saddr, iph->tos,
+				      (struct net_device *)in) == 0)
+			icmp_send(*pskb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH,
+				  0);
+		return NF_DROP;
+	}
+
+	case FW_ACCEPT:
+	case FW_SKIP:
+		if (hooknum == NF_IP_PRE_ROUTING) {
+			check_for_demasq(pskb);
+			check_for_redirect(*pskb);
+		} else if (hooknum == NF_IP_POST_ROUTING)
+			check_for_unredirect(*pskb);
+
+		return NF_ACCEPT;
+
+	case FW_MASQUERADE:
+		/* Masquerade only makes sense on the forward path. */
+		if (hooknum == NF_IP_FORWARD)
+			return do_masquerade(pskb, out);
+		else return NF_ACCEPT;
+
+	case FW_REDIRECT:
+		/* Redirect only makes sense before routing. */
+		if (hooknum == NF_IP_PRE_ROUTING)
+			return do_redirect(*pskb, in, redirpt);
+		else return NF_ACCEPT;
+
+	default:
+		/* FW_BLOCK */
+		return NF_DROP;
+	}
+}
+
+extern int ip_fw_ctl(int optval, void *user, unsigned int len);
+
+static int sock_fn(struct sock *sk, int optval, void *user, unsigned int len)
+{
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ return -ip_fw_ctl(optval, user, len);
+}
+
+static struct nf_hook_ops preroute_ops
+= { { NULL, NULL }, fw_in, PF_INET, NF_IP_PRE_ROUTING, NF_IP_PRI_FILTER };
+
+static struct nf_hook_ops postroute_ops
+= { { NULL, NULL }, fw_in, PF_INET, NF_IP_POST_ROUTING, NF_IP_PRI_FILTER };
+
+static struct nf_hook_ops forward_ops
+= { { NULL, NULL }, fw_in, PF_INET, NF_IP_FORWARD, NF_IP_PRI_FILTER };
+
+static struct nf_sockopt_ops sock_ops
+= { { NULL, NULL }, PF_INET, 64, 64 + 1024 + 1, &sock_fn, 0, 0, NULL,
+ 0, NULL };
+
+extern int ipfw_init_or_cleanup(int init);
+
+static int init_or_cleanup(int init)
+{
+ int ret = 0;
+
+ if (!init) goto cleanup;
+
+ ret = nf_register_sockopt(&sock_ops);
+
+ if (ret < 0)
+ goto cleanup_nothing;
+
+ ret = ipfw_init_or_cleanup(1);
+ if (ret < 0)
+ goto cleanup_sockopt;
+
+ ret = masq_init();
+ if (ret < 0)
+ goto cleanup_ipfw;
+
+ nf_register_hook(&preroute_ops);
+ nf_register_hook(&postroute_ops);
+ nf_register_hook(&forward_ops);
+
+ return ret;
+
+ cleanup:
+ nf_unregister_hook(&preroute_ops);
+ nf_unregister_hook(&postroute_ops);
+ nf_unregister_hook(&forward_ops);
+
+ masq_cleanup();
+
+ cleanup_ipfw:
+ ipfw_init_or_cleanup(0);
+
+ cleanup_sockopt:
+ nf_unregister_sockopt(&sock_ops);
+
+ cleanup_nothing:
+ return ret;
+}
+
+static int __init init(void)
+{
+ return init_or_cleanup(1);
+}
+
+static void __exit fini(void)
+{
+ init_or_cleanup(0);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_fw_compat_masq.c b/net/ipv4/netfilter/ip_fw_compat_masq.c
new file mode 100644
index 000000000..e0074c1e2
--- /dev/null
+++ b/net/ipv4/netfilter/ip_fw_compat_masq.c
@@ -0,0 +1,288 @@
+/* Masquerading compatibility layer.
+
+ Note that there are no restrictions on other programs binding to
+ ports 61000:65095 (in 2.0 and 2.2 they get EADDRINUSE). Just DONT
+ DO IT.
+ */
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/udp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/version.h>
+#include <net/route.h>
+
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
+
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* Masquerade a forwarded packet: run it through connection tracking,
+   set up a source-NAT binding to the outgoing interface's address
+   (source ports 61000-65095) on first sight of the connection, then
+   apply the bindings.  Returns an NF_* verdict. */
+unsigned int
+do_masquerade(struct sk_buff **pskb, const struct net_device *dev)
+{
+	struct iphdr *iph = (*pskb)->nh.iph;
+	struct ip_nat_info *info;
+	enum ip_conntrack_info ctinfo;
+	struct ip_conntrack *ct;
+	unsigned int ret;
+
+	/* Sorry, only ICMP, TCP and UDP. */
+	if (iph->protocol != IPPROTO_ICMP
+	    && iph->protocol != IPPROTO_TCP
+	    && iph->protocol != IPPROTO_UDP)
+		return NF_DROP;
+
+	/* Feed it to connection tracking; in fact we're in NF_IP_FORWARD,
+	   but connection tracking doesn't expect that */
+	ret = ip_conntrack_in(NF_IP_POST_ROUTING, pskb, dev, NULL, NULL);
+	if (ret != NF_ACCEPT) {
+		DEBUGP("ip_conntrack_in returned %u.\n", ret);
+		return ret;
+	}
+
+	ct = ip_conntrack_get(*pskb, &ctinfo);
+
+	if (!ct) {
+		DEBUGP("ip_conntrack_in set to invalid conntrack.\n");
+		return NF_DROP;
+	}
+
+	info = &ct->nat.info;
+
+	WRITE_LOCK(&ip_nat_lock);
+	/* Setup the masquerade, if not already */
+	if (!info->initialized) {
+		u_int32_t newsrc;
+		struct rtable *rt;
+		struct ip_nat_multi_range range;
+
+		/* Pass 0 instead of saddr, since it's going to be changed
+		   anyway. */
+		if (ip_route_output(&rt, iph->daddr, 0, 0, 0) != 0) {
+			DEBUGP("ipnat_rule_masquerade: Can't reroute.\n");
+			/* BUGFIX: drop the NAT lock before bailing out;
+			   previously it was left held forever here. */
+			WRITE_UNLOCK(&ip_nat_lock);
+			return NF_DROP;
+		}
+		newsrc = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
+					  RT_SCOPE_UNIVERSE);
+		ip_rt_put(rt);
+		range = ((struct ip_nat_multi_range)
+			 { 1,
+			   {{IP_NAT_RANGE_MAP_IPS|IP_NAT_RANGE_PROTO_SPECIFIED,
+			     newsrc, newsrc,
+			     { htons(61000) }, { htons(65095) } } } });
+
+		ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
+		place_in_hashes(ct, info);
+		info->initialized = 1;
+	} else
+		DEBUGP("Masquerading already done on this conn.\n");
+	WRITE_UNLOCK(&ip_nat_lock);
+
+	return do_bindings(ct, ctinfo, info, NF_IP_POST_ROUTING, pskb);
+}
+
+/* PRE_ROUTING helper: if the incoming packet is the reply half of a
+   connection that was explicitly masqueraded, feed it to conntrack
+   and apply the reverse NAT bindings; otherwise accept untouched. */
+unsigned int
+check_for_demasq(struct sk_buff **pskb)
+{
+	struct ip_conntrack_tuple tuple;
+	struct iphdr *iph = (*pskb)->nh.iph;
+	struct ip_conntrack_protocol *protocol;
+	struct ip_conntrack_tuple_hash *h;
+	enum ip_conntrack_info ctinfo;
+	int ret;
+
+	protocol = find_proto(iph->protocol);
+
+	/* We don't feed packets to conntrack system unless we know
+	   they're part of an connection already established by an
+	   explicit masq command. */
+	switch (iph->protocol) {
+	case IPPROTO_ICMP:
+		/* ICMP errors. */
+		if (icmp_error_track(*pskb)) {
+			/* If it is valid, translate it */
+			if ((*pskb)->nfct) {
+				struct ip_conntrack *ct
+					= (struct ip_conntrack *)
+					(*pskb)->nfct->master;
+				enum ip_conntrack_dir dir;
+
+				/* Recover direction from which infos[]
+				   slot nfct points into. */
+				if ((*pskb)->nfct-ct->infos >= IP_CT_IS_REPLY)
+					dir = IP_CT_DIR_REPLY;
+				else
+					dir = IP_CT_DIR_ORIGINAL;
+
+				icmp_reply_translation(*pskb,
+						       ct,
+						       NF_IP_PRE_ROUTING,
+						       dir);
+			}
+			return NF_ACCEPT;
+		}
+		/* Fall thru... */
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+		if (!get_tuple(iph, (*pskb)->len, &tuple, protocol)) {
+			printk("ip_fw_compat_masq: Couldn't get tuple\n");
+			return NF_ACCEPT;
+		}
+		break;
+
+	default:
+		/* Not ours... */
+		return NF_ACCEPT;
+	}
+	h = ip_conntrack_find_get(&tuple, NULL);
+
+	/* MUST be found, and MUST be reply. */
+	if (h && DIRECTION(h) == 1) {
+		ret = ip_conntrack_in(NF_IP_PRE_ROUTING, pskb,
+				      NULL, NULL, NULL);
+
+		/* Put back the reference gained from find_get */
+		nf_conntrack_put(&h->ctrack->infos[0]);
+		if (ret == NF_ACCEPT) {
+			struct ip_conntrack *ct;
+			ct = ip_conntrack_get(*pskb, &ctinfo);
+
+			if (ct) {
+				struct ip_nat_info *info = &ct->nat.info;
+
+				/* Undo the masquerade on this reply. */
+				do_bindings(ct, ctinfo, info,
+					    NF_IP_PRE_ROUTING,
+					    pskb);
+			} else
+				printk("ip_fw_compat_masq: conntrack"
+				       " didn't like\n");
+		}
+	} else {
+		if (h)
+			/* Put back the reference gained from find_get */
+			nf_conntrack_put(&h->ctrack->infos[0]);
+		ret = NF_ACCEPT;
+	}
+
+	return ret;
+}
+
+int ip_fw_masq_timeouts(void *user, int len)
+{
+ printk("Sorry: masquerading timeouts set 5DAYS/2MINS/60SECS\n");
+ return 0;
+}
+
+static const char *masq_proto_name(u_int16_t protonum)
+{
+ switch (protonum) {
+ case IPPROTO_TCP: return "TCP";
+ case IPPROTO_UDP: return "UDP";
+ case IPPROTO_ICMP: return "ICMP";
+ default: return "MORE-CAFFIENE-FOR-RUSTY";
+ }
+}
+
+/* Format one conntrack entry in the legacy /proc/net/ip_masquerade
+   layout (proto, orig src/dst, masq port, dummy seq fields, expiry). */
+static unsigned int
+print_masq(char *buffer, const struct ip_conntrack *conntrack)
+{
+	char temp[129];
+
+	/* This is for backwards compatibility, but ick!.
+	   We should never export jiffies to userspace.
+	*/
+	sprintf(temp,"%s %08X:%04X %08X:%04X %04X %08X %6d %6d %7lu",
+		masq_proto_name(conntrack->tuplehash[0].tuple.dst.protonum),
+		ntohl(conntrack->tuplehash[0].tuple.src.ip),
+		ntohs(conntrack->tuplehash[0].tuple.src.u.all),
+		ntohl(conntrack->tuplehash[0].tuple.dst.ip),
+		ntohs(conntrack->tuplehash[0].tuple.dst.u.all),
+		ntohs(conntrack->tuplehash[1].tuple.dst.u.all),
+		/* Sorry, no init_seq, delta or previous_delta (yet). */
+		0, 0, 0,
+		conntrack->timeout.expires - jiffies);
+
+	return sprintf(buffer, "%-127s\n", temp);
+}
+
+/* Returns true when finished. */
+static int
+masq_iterate(const struct ip_conntrack_tuple_hash *hash,
+ char *buffer, off_t offset, off_t *upto,
+ unsigned int *len, unsigned int maxlen)
+{
+ unsigned int newlen;
+
+ IP_NF_ASSERT(hash->ctrack);
+
+ /* Only count originals */
+ if (DIRECTION(hash))
+ return 0;
+
+ if ((*upto)++ < offset)
+ return 0;
+
+ newlen = print_masq(buffer + *len, hash->ctrack);
+ if (*len + newlen > maxlen)
+ return 1;
+ else *len += newlen;
+
+ return 0;
+}
+
+/* Everything in the hash is masqueraded. */
+static int
+masq_procinfo(char *buffer, char **start, off_t offset, int length)
+{
+ unsigned int i;
+ int len = 0;
+ off_t upto = 0;
+
+ READ_LOCK(&ip_conntrack_lock);
+ /* Traverse hash; print originals then reply. */
+ for (i = 0; i < ip_conntrack_htable_size; i++) {
+ if (LIST_FIND(&ip_conntrack_hash[i], masq_iterate,
+ struct ip_conntrack_tuple_hash *,
+ buffer, offset, &upto, &len, length))
+ break;
+ }
+ READ_UNLOCK(&ip_conntrack_lock);
+
+ /* `start' hack - see fs/proc/generic.c line ~165 */
+ *start = (char *)((unsigned int)upto - offset);
+ return len;
+}
+
+/* Bring up conntrack, then NAT, then the /proc entry; unwind
+   conntrack if NAT init fails.  Returns 0 or a negative errno. */
+int __init masq_init(void)
+{
+	int ret;
+
+	ret = ip_conntrack_init();
+	if (ret == 0) {
+		ret = ip_nat_init();
+		if (ret == 0)
+			proc_net_create("ip_masquerade", 0, masq_procinfo);
+		else
+			ip_conntrack_cleanup();
+	}
+
+	return ret;
+}
+
+void masq_cleanup(void)
+{
+ ip_nat_cleanup();
+ ip_conntrack_cleanup();
+ proc_net_remove("ip_masquerade");
+}
diff --git a/net/ipv4/netfilter/ip_fw_compat_redir.c b/net/ipv4/netfilter/ip_fw_compat_redir.c
new file mode 100644
index 000000000..d4d910e77
--- /dev/null
+++ b/net/ipv4/netfilter/ip_fw_compat_redir.c
@@ -0,0 +1,284 @@
+/* This is a file to handle the "simple" NAT cases (redirect and
+ masquerade) required for the compatibility layer.
+
+ `bind to foreign address' and `getpeername' hacks are not
+ supported.
+
+ FIXME: Timing is overly simplistic. If anyone complains, make it
+ use conntrack.
+*/
+#include <linux/config.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <net/checksum.h>
+#include <linux/timer.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <linux/in.h>
+
+#include <linux/netfilter_ipv4/lockhelp.h>
+
+static DECLARE_LOCK(redir_lock);
+#define ASSERT_READ_LOCK(x) MUST_BE_LOCKED(&redir_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_LOCKED(&redir_lock)
+
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+#ifdef CONFIG_NETFILTER_DEBUG
+#define IP_NF_ASSERT(x) \
+do { \
+ if (!(x)) \
+ /* Wooah! I'm tripping my conntrack in a frenzy of \
+ netplay... */ \
+ printk("ASSERT: %s:%i(%s)\n", \
+ __FILE__, __LINE__, __FUNCTION__); \
+} while(0);
+#else
+#define IP_NF_ASSERT(x)
+#endif
+
+static u_int16_t
+cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck) /* incremental checksum update: fold (~old, new) into old check */
+{
+ u_int32_t diffs[] = { oldvalinv, newval };
+ return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
+ oldcheck^0xFFFF));
+}
+
+struct redir_core { /* one redirection: original 4-tuple -> new destination */
+ u_int32_t orig_srcip, orig_dstip;
+ u_int16_t orig_sport, orig_dport;
+
+ u_int32_t new_dstip;
+ u_int16_t new_dport;
+};
+
+struct redir
+{
+ struct list_head list;
+ struct redir_core core;
+ struct timer_list destroyme; /* armed on FIN/RST to expire the entry */
+};
+
+static LIST_HEAD(redirs); /* all live redirections; guarded by redir_lock */
+
+static int
+redir_cmp(const struct redir *i, /* list predicate: match on original 4-tuple */
+ u_int32_t orig_srcip, u_int32_t orig_dstip,
+ u_int16_t orig_sport, u_int16_t orig_dport)
+{
+ return (i->core.orig_srcip == orig_srcip
+ && i->core.orig_dstip == orig_dstip
+ && i->core.orig_sport == orig_sport
+ && i->core.orig_dport == orig_dport);
+}
+
+/* Search for an existing redirection of the TCP packet. */
+static struct redir *
+find_redir(u_int32_t orig_srcip, u_int32_t orig_dstip,
+ u_int16_t orig_sport, u_int16_t orig_dport)
+{
+ return LIST_FIND(&redirs, redir_cmp, struct redir *,
+ orig_srcip, orig_dstip, orig_sport, orig_dport);
+}
+
+static void do_tcp_redir(struct sk_buff *skb, struct redir *redir) /* rewrite dst ip/port; fix TCP and IP checksums incrementally */
+{
+ struct iphdr *iph = skb->nh.iph;
+ struct tcphdr *tcph = (struct tcphdr *)((u_int32_t *)iph
+ + iph->ihl);
+
+ tcph->check = cheat_check(~redir->core.orig_dstip,
+ redir->core.new_dstip,
+ cheat_check(redir->core.orig_dport ^ 0xFFFF,
+ redir->core.new_dport,
+ tcph->check));
+ iph->check = cheat_check(~redir->core.orig_dstip,
+ redir->core.new_dstip, iph->check);
+ tcph->dest = redir->core.new_dport;
+ iph->daddr = redir->core.new_dstip;
+
+ skb->nfcache |= NFC_ALTERED; /* mark the skb as modified */
+}
+
+static int
+unredir_cmp(const struct redir *i, /* list predicate for reply direction */
+ u_int32_t new_dstip, u_int32_t orig_srcip,
+ u_int16_t new_dport, u_int16_t orig_sport)
+{
+ return (i->core.orig_srcip == orig_srcip
+ && i->core.new_dstip == new_dstip
+ && i->core.orig_sport == orig_sport
+ && i->core.new_dport == new_dport);
+}
+
+/* Match reply packet against redir */
+static struct redir *
+find_unredir(u_int32_t new_dstip, u_int32_t orig_srcip,
+ u_int16_t new_dport, u_int16_t orig_sport)
+{
+ return LIST_FIND(&redirs, unredir_cmp, struct redir *,
+ new_dstip, orig_srcip, new_dport, orig_sport);
+}
+
+/* `unredir' a reply packet: restore original dst as its source. */
+static void do_tcp_unredir(struct sk_buff *skb, struct redir *redir)
+{
+ struct iphdr *iph = skb->nh.iph;
+ struct tcphdr *tcph = (struct tcphdr *)((u_int32_t *)iph
+ + iph->ihl);
+
+ tcph->check = cheat_check(~redir->core.new_dstip,
+ redir->core.orig_dstip,
+ cheat_check(redir->core.new_dport ^ 0xFFFF,
+ redir->core.orig_dport,
+ tcph->check));
+ iph->check = cheat_check(~redir->core.new_dstip,
+ redir->core.orig_dstip,
+ iph->check);
+ tcph->source = redir->core.orig_dport;
+ iph->saddr = redir->core.orig_dstip;
+
+ skb->nfcache |= NFC_ALTERED; /* mark the skb as modified */
+}
+
+/* REDIRECT a packet to the local interface address, port `redirpt'. */
+unsigned int
+do_redirect(struct sk_buff *skb,
+ const struct net_device *dev,
+ u_int16_t redirpt)
+{
+ struct iphdr *iph = skb->nh.iph;
+ u_int32_t newdst;
+
+ /* Figure out address: not loopback. */
+ if (!dev)
+ return NF_DROP;
+
+ /* Grab first address on interface. */
+ newdst = ((struct in_device *)dev->ip_ptr)->ifa_list->ifa_local; /* FIXME: ip_ptr/ifa_list may be NULL on an unconfigured iface — verify */
+
+ switch (iph->protocol) {
+ case IPPROTO_UDP: {
+ /* Simple mangle; stateless, nothing recorded. */
+ struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph
+ + iph->ihl);
+
+ udph->check = cheat_check(~iph->daddr, newdst,
+ cheat_check(udph->dest ^ 0xFFFF,
+ redirpt,
+ udph->check));
+ iph->check = cheat_check(~iph->daddr, newdst, iph->check);
+ udph->dest = redirpt;
+ iph->daddr = newdst;
+
+ skb->nfcache |= NFC_ALTERED;
+ return NF_ACCEPT;
+ }
+ case IPPROTO_TCP: {
+ /* Mangle, maybe record so later packets/replies match. */
+ struct tcphdr *tcph = (struct tcphdr *)((u_int32_t *)iph
+ + iph->ihl);
+ struct redir *redir;
+ int ret;
+
+ DEBUGP("Doing tcp redirect. %08X:%u %08X:%u -> %08X:%u\n",
+ iph->saddr, tcph->source, iph->daddr, tcph->dest,
+ newdst, redirpt);
+ LOCK_BH(&redir_lock);
+ redir = find_redir(iph->saddr, iph->daddr,
+ tcph->source, tcph->dest);
+
+ if (!redir) {
+ redir = kmalloc(sizeof(struct redir), GFP_ATOMIC);
+ if (!redir) {
+ ret = NF_DROP;
+ goto out;
+ }
+ list_prepend(&redirs, redir);
+ init_timer(&redir->destroyme);
+ }
+ /* In case mangling has changed, rewrite this part. */
+ redir->core = ((struct redir_core)
+ { iph->saddr, iph->daddr,
+ tcph->source, tcph->dest,
+ newdst, redirpt });
+ do_tcp_redir(skb, redir);
+ ret = NF_ACCEPT;
+
+ out:
+ UNLOCK_BH(&redir_lock);
+ return ret;
+ }
+
+ default: /* give up if not TCP or UDP. */
+ return NF_DROP;
+ }
+}
+
+static void destroyme(unsigned long me) /* timer callback: retire a finished redirection */
+{
+ LOCK_BH(&redir_lock);
+ LIST_DELETE(&redirs, (struct redir *)me); /* FIXME: entry is unlinked but never kfree()d — looks like a leak; verify */
+ UNLOCK_BH(&redir_lock);
+}
+
+/* Incoming packet: is it a reply to a masqueraded connection, or
+ part of an already-redirected TCP connection? */
+void
+check_for_redirect(struct sk_buff *skb)
+{
+ struct iphdr *iph = skb->nh.iph;
+ struct tcphdr *tcph = (struct tcphdr *)((u_int32_t *)iph
+ + iph->ihl);
+ struct redir *redir;
+
+ if (iph->protocol != IPPROTO_TCP)
+ return;
+
+ LOCK_BH(&redir_lock);
+ redir = find_redir(iph->saddr, iph->daddr, tcph->source, tcph->dest);
+ if (redir) {
+ DEBUGP("Doing tcp redirect again.\n");
+ do_tcp_redir(skb, redir);
+ if (tcph->rst || tcph->fin) {
+ /* Connection closing: expire entry in 75s. */
+ redir->destroyme.function = destroyme;
+ redir->destroyme.data = (unsigned long)redir;
+ mod_timer(&redir->destroyme, jiffies + 75*HZ); /* mod_timer() takes an ABSOLUTE jiffies expiry; bare 75*HZ is in the past */
+ }
+ }
+ UNLOCK_BH(&redir_lock);
+}
+
+/* Reply direction: undo the redirection so the client sees the
+ original destination as the source. */
+void
+check_for_unredirect(struct sk_buff *skb)
+{
+ struct iphdr *iph = skb->nh.iph;
+ struct tcphdr *tcph = (struct tcphdr *)((u_int32_t *)iph
+ + iph->ihl);
+ struct redir *redir;
+
+ if (iph->protocol != IPPROTO_TCP)
+ return;
+
+ LOCK_BH(&redir_lock);
+ redir = find_unredir(iph->saddr, iph->daddr, tcph->source, tcph->dest);
+ if (redir) {
+ DEBUGP("Doing tcp unredirect.\n");
+ do_tcp_unredir(skb, redir);
+ if (tcph->rst || tcph->fin) {
+ redir->destroyme.function = destroyme;
+ redir->destroyme.data = (unsigned long)redir;
+ mod_timer(&redir->destroyme, jiffies + 75*HZ); /* absolute expiry: was 75*HZ, which fires immediately after boot+75s */
+ }
+ }
+ UNLOCK_BH(&redir_lock);
+}
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
new file mode 100644
index 000000000..996e5a7ff
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -0,0 +1,855 @@
+/* NAT for netfilter; shared with compatibility layer. */
+
+/* (c) 1999 Paul `Rusty' Russell. Licenced under the GNU General
+ Public Licence. */
+#ifdef MODULE
+#define __NO_VERSION__
+#endif
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/brlock.h>
+#include <net/checksum.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/tcp.h> /* For tcp_prot in getorigdst */
+
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
+
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+DECLARE_RWLOCK(ip_nat_lock);
+
+#define IP_NAT_HTABLE_SIZE 64
+
+static struct list_head bysource[IP_NAT_HTABLE_SIZE];
+static struct list_head byipsproto[IP_NAT_HTABLE_SIZE];
+LIST_HEAD(protos);
+static LIST_HEAD(helpers);
+
+extern struct ip_nat_protocol unknown_nat_protocol;
+
+/* We keep extra hashes for each conntrack, for fast searching. */
+static inline size_t
+hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto)
+{
+ /* Modified src and dst, to ensure we don't create two
+ identical streams. */
+ return (src + dst + proto) % IP_NAT_HTABLE_SIZE;
+}
+
+static inline size_t
+hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto)
+{
+ /* Original src, to ensure we map it consistently if poss. */
+ return (manip->ip + manip->u.all + proto) % IP_NAT_HTABLE_SIZE;
+}
+
+/* No one is using the conntrack by the time this is called:
+ unlink it from both NAT hashes. */
+static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
+{
+ struct ip_nat_info *info = &conn->nat.info;
+
+ if (!info->initialized) /* never NATed: nothing was hashed */
+ return;
+
+ IP_NF_ASSERT(info->bysource.conntrack);
+ IP_NF_ASSERT(info->byipsproto.conntrack);
+
+ WRITE_LOCK(&ip_nat_lock);
+ LIST_DELETE(&bysource[hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.src,
+ conn->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.protonum)],
+ &info->bysource);
+
+ LIST_DELETE(&byipsproto
+ [hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY]
+ .tuple.src.ip,
+ conn->tuplehash[IP_CT_DIR_REPLY]
+ .tuple.dst.ip,
+ conn->tuplehash[IP_CT_DIR_REPLY]
+ .tuple.dst.protonum)],
+ &info->byipsproto);
+ WRITE_UNLOCK(&ip_nat_lock);
+}
+
+/* We do checksum mangling, so if they were wrong before they're still
+ * wrong. Also works for incomplete packets (eg. ICMP dest
+ * unreachables.) */
+u_int16_t
+ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
+{
+ u_int32_t diffs[] = { oldvalinv, newval };
+ return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
+ oldcheck^0xFFFF));
+}
+
+static inline int cmp_proto(const struct ip_nat_protocol *i, int proto) /* list predicate: NAT protocol by number */
+{
+ return i->protonum == proto;
+}
+
+struct ip_nat_protocol *
+find_nat_proto(u_int16_t protonum) /* never NULL: falls back to unknown_nat_protocol */
+{
+ struct ip_nat_protocol *i;
+
+ MUST_BE_READ_LOCKED(&ip_nat_lock);
+ i = LIST_FIND(&protos, cmp_proto, struct ip_nat_protocol *, protonum);
+ if (!i)
+ i = &unknown_nat_protocol;
+ return i;
+}
+
+/* Is this tuple already taken? (not by us) */
+int
+ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack *ignored_conntrack)
+{
+ /* Conntrack tracking doesn't keep track of outgoing tuples; only
+ incoming ones. NAT means they don't have a fixed mapping,
+ so we invert the tuple and look for the incoming reply.
+
+ We could keep a separate hash if this proves too slow. */
+ struct ip_conntrack_tuple reply;
+
+ invert_tuplepr(&reply, tuple);
+ return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
+}
+
+/* Does tuple + the source manip come within the multi-range mr? */
+static int
+in_range(const struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack_manip *manip,
+ const struct ip_nat_multi_range *mr)
+{
+ struct ip_nat_protocol *proto = find_nat_proto(tuple->dst.protonum);
+ unsigned int i;
+ struct ip_conntrack_tuple newtuple = { *manip, tuple->dst };
+
+ for (i = 0; i < mr->rangesize; i++) {
+ /* If we are allowed to map IPs, then we must be in the
+ range specified, otherwise we must be unchanged. */
+ if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
+ if (ntohl(newtuple.src.ip) < ntohl(mr->range[i].min_ip)
+ || (ntohl(newtuple.src.ip)
+ > ntohl(mr->range[i].max_ip)))
+ continue;
+ } else {
+ if (newtuple.src.ip != tuple->src.ip)
+ continue;
+ }
+
+ if ((mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED)
+ && proto->in_range(&newtuple, IP_NAT_MANIP_SRC,
+ &mr->range[i].min, &mr->range[i].max))
+ return 1;
+ }
+ return 0;
+}
+
+static inline int
+src_cmp(const struct ip_nat_hash *i, /* bysource predicate: same proto, same original src, manip in range */
+ const struct ip_conntrack_tuple *tuple,
+ const struct ip_nat_multi_range *mr)
+{
+ return (i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
+ == tuple->dst.protonum
+ && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
+ == tuple->src.ip
+ && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
+ == tuple->src.u.all
+ && in_range(tuple,
+ &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.src,
+ mr));
+}
+
+/* Only called for SRC manip: reuse an existing source mapping if any. */
+static struct ip_conntrack_manip *
+find_appropriate_src(const struct ip_conntrack_tuple *tuple,
+ const struct ip_nat_multi_range *mr)
+{
+ unsigned int h = hash_by_src(&tuple->src, tuple->dst.protonum);
+ struct ip_nat_hash *i;
+
+ MUST_BE_READ_LOCKED(&ip_nat_lock);
+ i = LIST_FIND(&bysource[h], src_cmp, struct ip_nat_hash *, tuple, mr);
+ if (i)
+ return &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src;
+ else
+ return NULL;
+}
+
+/* If it's really a local destination manip, it may need to do a
+ source manip too: route and pick the new source address. */
+static int
+do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp)
+{
+ struct rtable *rt;
+
+ /* FIXME: IPTOS_TOS(iph->tos) --RR */
+ if (ip_route_output(&rt, var_ip, 0, 0, 0) != 0) {
+ DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n",
+ IP_PARTS(var_ip));
+ return 0;
+ }
+
+ *other_ipp = rt->rt_src;
+ ip_rt_put(rt); /* drop the route reference taken above */
+ return 1;
+}
+
+/* Simple way to iterate through all: always returns 0 so LIST_FIND
+ visits every entry; just bumps *score for matches. */
+static inline int fake_cmp(const struct ip_nat_hash *i,
+ u_int32_t src, u_int32_t dst, u_int16_t protonum,
+ unsigned int *score,
+ const struct ip_conntrack *conntrack)
+{
+ /* Compare backwards: we're dealing with OUTGOING tuples, and
+ inside the conntrack is the REPLY tuple. Don't count this
+ conntrack. */
+ if (i->conntrack != conntrack
+ && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == dst
+ && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip == src
+ && (i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum
+ == protonum))
+ (*score)++;
+ return 0;
+}
+
+static inline unsigned int
+count_maps(u_int32_t src, u_int32_t dst, u_int16_t protonum,
+ const struct ip_conntrack *conntrack)
+{
+ unsigned int score = 0;
+
+ MUST_BE_READ_LOCKED(&ip_nat_lock);
+ LIST_FIND(&byipsproto[hash_by_ipsproto(src, dst, protonum)],
+ fake_cmp, struct ip_nat_hash *, src, dst, protonum, &score,
+ conntrack);
+
+ return score;
+}
+
+/* For [FUTURE] fragmentation handling, we want the least-used
+ src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus
+ if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
+ 1-65535, we don't do pro-rata allocation based on ports; we choose
+ the ip with the lowest src-ip/dst-ip/proto usage.
+
+ If an allocation then fails (eg. all 6 ports used in the 1.2.3.4
+ range), we eliminate that and try again. This is not the most
+ efficient approach, but if you're worried about that, don't hand us
+ ranges you don't really have. */
+static struct ip_nat_range *
+find_best_ips_proto(struct ip_conntrack_tuple *tuple,
+ const struct ip_nat_multi_range *mr,
+ const struct ip_conntrack *conntrack,
+ unsigned int hooknum)
+{
+ unsigned int i;
+ struct {
+ const struct ip_nat_range *range;
+ unsigned int score;
+ struct ip_conntrack_tuple tuple;
+ } best = { NULL, 0xFFFFFFFF }; /* best.tuple is zero-initialized */
+ u_int32_t *var_ipp, *other_ipp, saved_ip;
+
+ if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
+ var_ipp = &tuple->src.ip; /* SRC manip: vary source, keep dst */
+ saved_ip = tuple->dst.ip;
+ other_ipp = &tuple->dst.ip;
+ } else {
+ var_ipp = &tuple->dst.ip; /* DST manip: vary dest, keep src */
+ saved_ip = tuple->src.ip;
+ other_ipp = &tuple->src.ip;
+ }
+
+ IP_NF_ASSERT(mr->rangesize >= 1);
+ for (i = 0; i < mr->rangesize; i++) {
+ u_int32_t minip, maxip;
+
+ /* Don't do ranges which are already eliminated. */
+ if (mr->range[i].flags & IP_NAT_RANGE_FULL) {
+ continue;
+ }
+
+ if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
+ minip = mr->range[i].min_ip;
+ maxip = mr->range[i].max_ip;
+ } else
+ minip = maxip = *var_ipp;
+
+ for (*var_ipp = minip;
+ ntohl(*var_ipp) <= ntohl(maxip);
+ *var_ipp = htonl(ntohl(*var_ipp) + 1)) {
+ unsigned int score;
+
+ /* Reset the other ip in case it was mangled by
+ * do_extra_mangle last time. */
+ *other_ipp = saved_ip;
+
+ if (hooknum == NF_IP_LOCAL_OUT
+ && !do_extra_mangle(*var_ipp, other_ipp)) {
+ DEBUGP("Range %u %u.%u.%u.%u rt failed!\n",
+ i, IP_PARTS(*var_ipp));
+ /* Can't route? This whole range part is
+ * probably screwed, but keep trying
+ * anyway. */
+ continue;
+ }
+
+ /* Count how many others map onto this. */
+ score = count_maps(tuple->src.ip, tuple->dst.ip,
+ tuple->dst.protonum, conntrack);
+ if (score < best.score) {
+ /* Optimization: doesn't get any better than
+ this. */
+ if (score == 0)
+ return (struct ip_nat_range *)
+ &mr->range[i];
+
+ best.score = score;
+ best.tuple = *tuple;
+ best.range = &mr->range[i];
+ }
+ }
+ }
+ *tuple = best.tuple; /* NULL best.range => zeroed tuple; callers check return */
+
+ /* Discard const. */
+ return (struct ip_nat_range *)best.range;
+}
+
+static int
+get_unique_tuple(struct ip_conntrack_tuple *tuple, /* returns 1 on success, 0 if no unique tuple found */
+ const struct ip_conntrack_tuple *orig_tuple,
+ const struct ip_nat_multi_range *mrr,
+ struct ip_conntrack *conntrack,
+ unsigned int hooknum)
+{
+ struct ip_nat_protocol *proto
+ = find_nat_proto(orig_tuple->dst.protonum);
+ struct ip_nat_range *rptr;
+ unsigned int i;
+ int ret;
+
+ /* We temporarily use flags for marking full parts, but we
+ always clean up afterwards */
+ struct ip_nat_multi_range *mr = (void *)mrr; /* casts away const; IP_NAT_RANGE_FULL bits restored below */
+
+ /* 1) If this srcip/proto/src-proto-part is currently mapped,
+ and that same mapping gives a unique tuple within the given
+ range, use that.
+
+ This is only required for source (ie. NAT/masq) mappings.
+ So far, we don't do local source mappings, so multiple
+ manips not an issue. */
+ if (hooknum == NF_IP_POST_ROUTING) {
+ struct ip_conntrack_manip *manip;
+
+ manip = find_appropriate_src(orig_tuple, mr);
+ if (manip) {
+ /* Apply same source manipulation. */
+ *tuple = ((struct ip_conntrack_tuple)
+ { *manip, orig_tuple->dst });
+ DEBUGP("get_unique_tuple: Found current src map\n");
+ return 1;
+ }
+ }
+
+ /* 2) Select the least-used IP/proto combination in the given
+ range.
+ */
+ *tuple = *orig_tuple;
+ while ((rptr = find_best_ips_proto(tuple, mr, conntrack, hooknum))
+ != NULL) {
+ DEBUGP("Found best for "); DUMP_TUPLE(tuple);
+ /* 3) The per-protocol part of the manip is made to
+ map into the range to make a unique tuple. */
+
+ /* Only bother mapping if it's not already in range
+ and unique */
+ if ((!(rptr->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
+ || proto->in_range(tuple, HOOK2MANIP(hooknum),
+ &rptr->min, &rptr->max))
+ && !ip_nat_used_tuple(tuple, conntrack)) {
+ ret = 1;
+ goto clear_fulls;
+ } else {
+ if (proto->unique_tuple(tuple, rptr,
+ HOOK2MANIP(hooknum),
+ conntrack)) {
+ /* Must be unique. */
+ IP_NF_ASSERT(!ip_nat_used_tuple(tuple,
+ conntrack));
+ ret = 1;
+ goto clear_fulls;
+ }
+ DEBUGP("Protocol can't get unique tuple.\n");
+ }
+
+ /* Eliminate that from range, and try again. */
+ rptr->flags |= IP_NAT_RANGE_FULL;
+ *tuple = *orig_tuple;
+ }
+
+ ret = 0;
+
+ clear_fulls:
+ /* Clear full flags. */
+ IP_NF_ASSERT(mr->rangesize >= 1);
+ for (i = 0; i < mr->rangesize; i++)
+ mr->range[i].flags &= ~IP_NAT_RANGE_FULL;
+
+ return ret;
+}
+
+static inline int
+helper_cmp(const struct ip_nat_helper *helper, /* list predicate: helper by proto + dst port */
+ u_int16_t protocol,
+ u_int16_t protocol_dst)
+{
+ return (protocol == helper->protocol
+ && protocol_dst == helper->protocol_dst);
+}
+
+/* Where to manip the reply packets (will be reverse manip). */
+static unsigned int opposite_hook[NF_IP_NUMHOOKS]
+= { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
+ [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
+ [NF_IP_LOCAL_OUT] = NF_IP_PRE_ROUTING
+};
+
+unsigned int
+ip_nat_setup_info(struct ip_conntrack *conntrack, /* returns NF_ACCEPT, or NF_DROP if no unique tuple exists */
+ const struct ip_nat_multi_range *mr,
+ unsigned int hooknum)
+{
+ struct ip_conntrack_tuple new_tuple, inv_tuple, reply;
+ struct ip_conntrack_tuple orig_tp;
+ struct ip_nat_info *info = &conntrack->nat.info;
+
+ MUST_BE_WRITE_LOCKED(&ip_nat_lock); /* caller holds ip_nat_lock for writing */
+ IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
+ || hooknum == NF_IP_POST_ROUTING
+ || hooknum == NF_IP_LOCAL_OUT);
+ IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
+
+ /* What we've got will look like inverse of reply. Normally
+ this is what is in the conntrack, except for prior
+ manipulations (future optimization: if num_manips == 0,
+ orig_tp =
+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
+ invert_tuplepr(&orig_tp,
+ &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+#if 0
+ {
+ unsigned int i;
+
+ DEBUGP("Hook %u (%s), ", hooknum,
+ HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST");
+ DUMP_TUPLE(&orig_tp);
+ DEBUGP("Range %p: ", mr);
+ for (i = 0; i < mr->rangesize; i++) {
+ DEBUGP("%u:%s%s%s %u.%u.%u.%u - %u.%u.%u.%u %u - %u\n",
+ i,
+ (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS)
+ ? " MAP_IPS" : "",
+ (mr->range[i].flags
+ & IP_NAT_RANGE_PROTO_SPECIFIED)
+ ? " PROTO_SPECIFIED" : "",
+ (mr->range[i].flags & IP_NAT_RANGE_FULL)
+ ? " FULL" : "",
+ IP_PARTS(mr->range[i].min_ip),
+ IP_PARTS(mr->range[i].max_ip),
+ mr->range[i].min.all,
+ mr->range[i].max.all);
+ }
+ }
+#endif
+
+ do {
+ if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack,
+ hooknum)) {
+ DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n",
+ conntrack);
+ return NF_DROP;
+ }
+
+#if 0
+ DEBUGP("Hook %u (%s) %p\n", hooknum,
+ HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST",
+ conntrack);
+ DEBUGP("Original: ");
+ DUMP_TUPLE(&orig_tp);
+ DEBUGP("New: ");
+ DUMP_TUPLE(&new_tuple);
+#endif
+
+ /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
+ the original (A/B/C/D') and the mangled one (E/F/G/H').
+
+ We're only allowed to work with the SRC per-proto
+ part, so we create inverses of both to start, then
+ derive the other fields we need. */
+
+ /* Reply connection: simply invert the new tuple
+ (G/H/E/F') */
+ invert_tuplepr(&reply, &new_tuple);
+
+ /* Alter conntrack table so it recognizes replies.
+ If fail this race (reply tuple now used), repeat. */
+ } while (!ip_conntrack_alter_reply(conntrack, &reply));
+
+ /* FIXME: We can simply used existing conntrack reply tuple
+ here --RR */
+ /* Create inverse of original: C/D/A/B' */
+ invert_tuplepr(&inv_tuple, &orig_tp);
+
+ /* Has source changed?. */
+ if (memcmp(&new_tuple.src, &orig_tp.src, sizeof(new_tuple.src))
+ != 0) {
+ /* In this direction, a source manip. */
+ info->manips[info->num_manips++] =
+ ((struct ip_nat_info_manip)
+ { IP_CT_DIR_ORIGINAL, hooknum,
+ IP_NAT_MANIP_SRC, new_tuple.src });
+
+ IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
+
+ /* In the reverse direction, a destination manip. */
+ info->manips[info->num_manips++] =
+ ((struct ip_nat_info_manip)
+ { IP_CT_DIR_REPLY, opposite_hook[hooknum],
+ IP_NAT_MANIP_DST, orig_tp.src });
+ IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
+ }
+
+ /* Has destination changed? */
+ if (memcmp(&new_tuple.dst, &orig_tp.dst, sizeof(new_tuple.dst))
+ != 0) {
+ /* In this direction, a destination manip */
+ info->manips[info->num_manips++] =
+ ((struct ip_nat_info_manip)
+ { IP_CT_DIR_ORIGINAL, hooknum,
+ IP_NAT_MANIP_DST, reply.src });
+
+ IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
+
+ /* In the reverse direction, a source manip. */
+ info->manips[info->num_manips++] =
+ ((struct ip_nat_info_manip)
+ { IP_CT_DIR_REPLY, opposite_hook[hooknum],
+ IP_NAT_MANIP_SRC, inv_tuple.src });
+ IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
+ }
+
+ /* If there's a helper, assign it; based on new tuple. */
+ info->helper = LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *,
+ new_tuple.dst.protonum,
+ new_tuple.dst.u.all);
+
+ /* It's done. */
+ info->initialized |= (1 << HOOK2MANIP(hooknum));
+ return NF_ACCEPT;
+}
+
+void replace_in_hashes(struct ip_conntrack *conntrack,
+ struct ip_nat_info *info)
+{
+ /* Source has changed, so replace in hashes. */
+ unsigned int srchash
+ = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.src,
+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.protonum);
+ /* We place packet as seen OUTGOING in byips_proto hash
+ (ie. reverse dst and src of reply packet). */
+ unsigned int ipsprotohash
+ = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
+ .tuple.dst.ip,
+ conntrack->tuplehash[IP_CT_DIR_REPLY]
+ .tuple.src.ip,
+ conntrack->tuplehash[IP_CT_DIR_REPLY]
+ .tuple.dst.protonum);
+
+ IP_NF_ASSERT(info->bysource.conntrack == conntrack);
+ MUST_BE_WRITE_LOCKED(&ip_nat_lock);
+
+ list_del(&info->bysource.list);
+ list_del(&info->byipsproto.list);
+
+ list_prepend(&bysource[srchash], &info->bysource);
+ list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
+}
+
+void place_in_hashes(struct ip_conntrack *conntrack, /* first-time insertion into both NAT hashes */
+ struct ip_nat_info *info)
+{
+ unsigned int srchash
+ = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.src,
+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.protonum);
+ /* We place packet as seen OUTGOING in byips_proto hash
+ (ie. reverse dst and src of reply packet). */
+ unsigned int ipsprotohash
+ = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
+ .tuple.dst.ip,
+ conntrack->tuplehash[IP_CT_DIR_REPLY]
+ .tuple.src.ip,
+ conntrack->tuplehash[IP_CT_DIR_REPLY]
+ .tuple.dst.protonum);
+
+ IP_NF_ASSERT(!info->bysource.conntrack);
+
+ MUST_BE_WRITE_LOCKED(&ip_nat_lock);
+ info->byipsproto.conntrack = conntrack;
+ info->bysource.conntrack = conntrack;
+
+ list_prepend(&bysource[srchash], &info->bysource);
+ list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
+}
+
+static void
+manip_pkt(u_int16_t proto, struct iphdr *iph, size_t len, /* apply one manip to a packet and fix the IP checksum */
+ const struct ip_conntrack_manip *manip,
+ enum ip_nat_manip_type maniptype)
+{
+ find_nat_proto(proto)->manip_pkt(iph, len, manip, maniptype); /* per-proto part first (eg. ports) */
+
+ if (maniptype == IP_NAT_MANIP_SRC) {
+ iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip,
+ iph->check);
+ iph->saddr = manip->ip;
+ } else {
+ iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip,
+ iph->check);
+ iph->daddr = manip->ip;
+ }
+#if 0
+ if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
+ DEBUGP("IP: checksum on packet bad.\n");
+
+ if (proto == IPPROTO_TCP) {
+ void *th = (u_int32_t *)iph + iph->ihl;
+ if (tcp_v4_check(th, len - 4*iph->ihl, iph->saddr, iph->daddr,
+ csum_partial((char *)th, len-4*iph->ihl, 0)))
+ DEBUGP("TCP: checksum on packet bad\n");
+ }
+#endif
+}
+
+/* Do packet manipulations according to binding. */
+unsigned int
+do_bindings(struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+ struct ip_nat_info *info,
+ unsigned int hooknum,
+ struct sk_buff **pskb)
+{
+ unsigned int i;
+ struct ip_nat_helper *helper;
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+ /* Need nat lock to protect against modification, but neither
+ conntrack (referenced) nor helper (deleted with
+ synchronize_bh()) can vanish. */
+ READ_LOCK(&ip_nat_lock);
+ for (i = 0; i < info->num_manips; i++) {
+ if (info->manips[i].direction == dir
+ && info->manips[i].hooknum == hooknum) {
+ DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n",
+ *pskb,
+ info->manips[i].maniptype == IP_NAT_MANIP_SRC
+ ? "SRC" : "DST",
+ IP_PARTS(info->manips[i].manip.ip),
+ htons(info->manips[i].manip.u.all));
+ manip_pkt((*pskb)->nh.iph->protocol,
+ (*pskb)->nh.iph,
+ (*pskb)->len,
+ &info->manips[i].manip,
+ info->manips[i].maniptype);
+ }
+ }
+ helper = info->helper;
+ READ_UNLOCK(&ip_nat_lock);
+
+ if (helper) {
+ /* Always defragged for helpers */
+ IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
+ & __constant_htons(IP_MF|IP_OFFSET)));
+ return helper->help(ct, info, ctinfo, hooknum, pskb);
+ } else return NF_ACCEPT;
+}
+
+/* NAT the headers embedded in an ICMP error, plus the outer header. */
+void
+icmp_reply_translation(struct sk_buff *skb,
+ struct ip_conntrack *conntrack,
+ unsigned int hooknum,
+ int dir)
+{
+ struct iphdr *iph = skb->nh.iph;
+ struct icmphdr *hdr = (struct icmphdr *)((u_int32_t *)iph + iph->ihl);
+ struct iphdr *inner = (struct iphdr *)(hdr + 1); /* offending packet quoted inside the ICMP error */
+ size_t datalen = skb->len - ((void *)inner - (void *)iph);
+ unsigned int i;
+ struct ip_nat_info *info = &conntrack->nat.info;
+
+ IP_NF_ASSERT(skb->len >= iph->ihl*4 + sizeof(struct icmphdr));
+
+ DEBUGP("icmp_reply_translation: translating error %p hook %u dir %s\n",
+ skb, hooknum, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
+ /* Note: May not be from a NAT'd host, but probably safest to
+ do translation always as if it came from the host itself
+ (even though a "host unreachable" coming from the host
+ itself is a bit weird).
+
+ More explanation: some people use NAT for anonymizing.
+ Also, CERT recommends dropping all packets from private IP
+ addresses (although ICMP errors from internal links with
+ such addresses are not too uncommon, as Alan Cox points
+ out) */
+
+ READ_LOCK(&ip_nat_lock);
+ for (i = 0; i < info->num_manips; i++) {
+ DEBUGP("icmp_reply: manip %u dir %s hook %u\n",
+ i, info->manips[i].direction == IP_CT_DIR_ORIGINAL ?
+ "ORIG" : "REPLY", info->manips[i].hooknum);
+ /* Mapping the inner packet is just like a normal
+ packet in the other direction, except it was never
+ src/dst reversed, so where we would normally apply
+ a dst manip, we reply a src, and vice versa. */
+ if (info->manips[i].direction != dir
+ && info->manips[i].hooknum == opposite_hook[hooknum]) {
+ DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n",
+ info->manips[i].maniptype == IP_NAT_MANIP_SRC
+ ? "DST" : "SRC",
+ IP_PARTS(info->manips[i].manip.ip),
+ ntohs(info->manips[i].manip.u.udp.port));
+ manip_pkt(inner->protocol, inner,
+ skb->len - ((void *)inner - (void *)iph),
+ &info->manips[i].manip,
+ !info->manips[i].maniptype);
+ }
+ /* Outer packet needs to have IP header NATed like
+ it's a reply. */
+ else if (info->manips[i].direction != dir
+ && info->manips[i].hooknum == hooknum) {
+ /* Use mapping to map outer packet: 0 give no
+ per-proto mapping */
+ DEBUGP("icmp_reply: outer %s %u.%u.%u.%u\n",
+ info->manips[i].maniptype == IP_NAT_MANIP_SRC
+ ? "SRC" : "DST",
+ IP_PARTS(info->manips[i].manip.ip));
+ manip_pkt(0, iph, skb->len,
+ &info->manips[i].manip,
+ info->manips[i].maniptype);
+ }
+ }
+ READ_UNLOCK(&ip_nat_lock);
+
+ /* Since we mangled inside ICMP packet, recalculate its
+ checksum from scratch. (Hence the handling of incorrect
+ checksums in conntrack, so we don't accidentally fix one.) */
+ hdr->checksum = 0;
+ hdr->checksum = ip_compute_csum((unsigned char *)hdr,
+ sizeof(*hdr) + datalen);
+}
+
+int ip_nat_helper_register(struct ip_nat_helper *me) /* 0 on success, -EBUSY if proto/port already claimed */
+{
+ int ret = 0;
+
+ WRITE_LOCK(&ip_nat_lock);
+ if (LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *,
+ me->protocol, me->protocol_dst))
+ ret = -EBUSY;
+ else {
+ list_prepend(&helpers, me);
+ MOD_INC_USE_COUNT;
+ }
+ WRITE_UNLOCK(&ip_nat_lock);
+
+ return ret;
+}
+
+static int
+kill_helper(const struct ip_conntrack *i, void *helper) /* selective-cleanup predicate: conntrack bound to this helper? */
+{
+ int ret;
+
+ READ_LOCK(&ip_nat_lock);
+ ret = (i->nat.info.helper == helper);
+ READ_UNLOCK(&ip_nat_lock);
+
+ return ret;
+}
+
+void ip_nat_helper_unregister(struct ip_nat_helper *me)
+{
+ WRITE_LOCK(&ip_nat_lock);
+ LIST_DELETE(&helpers, me);
+ WRITE_UNLOCK(&ip_nat_lock);
+
+ /* Someone could be still looking at the helper in a bh. */
+ br_write_lock_bh(BR_NETPROTO_LOCK);
+ br_write_unlock_bh(BR_NETPROTO_LOCK);
+
+ /* Find anything using it, and umm, kill them. We can't turn
+ them into normal connections: if we've adjusted SYNs, then
+ they'll ackstorm. So we just drop it. We used to just
+ bump module count when a connection existed, but that
+ forces admins to gen fake RSTs or bounce box, either of
+ which is just a long-winded way of making things
+ worse. --RR */
+ ip_ct_selective_cleanup(kill_helper, me);
+
+ MOD_DEC_USE_COUNT;
+}
+
+int __init ip_nat_init(void) /* always returns 0 */
+{
+ size_t i;
+
+ /* Sew in builtin protocols. */
+ WRITE_LOCK(&ip_nat_lock);
+ list_append(&protos, &ip_nat_protocol_tcp);
+ list_append(&protos, &ip_nat_protocol_udp);
+ list_append(&protos, &ip_nat_protocol_icmp);
+ WRITE_UNLOCK(&ip_nat_lock);
+
+ for (i = 0; i < IP_NAT_HTABLE_SIZE; i++) {
+ INIT_LIST_HEAD(&bysource[i]);
+ INIT_LIST_HEAD(&byipsproto[i]);
+ }
+
+ /* FIXME: Man, this is a hack. <SIGH> */
+ IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
+ ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
+
+ return 0;
+}
+
+void ip_nat_cleanup(void)
+{
+ ip_conntrack_destroyed = NULL; /* unhook our conntrack-destruction callback */
+}
diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c
new file mode 100644
index 000000000..8252e6d9b
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_ftp.c
@@ -0,0 +1,403 @@
+/* FTP extension for TCP NAT alteration. */
+#include <linux/module.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <net/tcp.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_nat_ftp.h>
+#include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+
+EXPORT_NO_SYMBOLS;
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* FIXME: Time out? --RR */
+
+static int
+ftp_nat_expected(struct sk_buff **pskb,
+ unsigned int hooknum,
+ struct ip_conntrack *ct,
+ struct ip_nat_info *info,
+ struct ip_conntrack *master,
+ struct ip_nat_info *masterinfo,
+ unsigned int *verdict)
+{
+ struct ip_nat_multi_range mr;
+ u_int32_t newdstip, newsrcip, newip;
+ struct ip_ct_ftp *ftpinfo;
+
+ IP_NF_ASSERT(info);
+ IP_NF_ASSERT(master);
+ IP_NF_ASSERT(masterinfo);
+
+ IP_NF_ASSERT(!(info->initialized & (1<<HOOK2MANIP(hooknum))));
+
+ DEBUGP("nat_expected: We have a connection!\n");
+ /* Master must be an ftp connection */
+ ftpinfo = &master->help.ct_ftp_info;
+
+ LOCK_BH(&ip_ftp_lock);
+ if (!ftpinfo->is_ftp) {
+ UNLOCK_BH(&ip_ftp_lock);
+ DEBUGP("nat_expected: master not ftp\n");
+ return 0;
+ }
+
+ if (ftpinfo->ftptype == IP_CT_FTP_PORT) {
+ /* PORT command: make connection go to the client. */
+ newdstip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
+ newsrcip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip;
+ DEBUGP("nat_expected: PORT cmd. %u.%u.%u.%u->%u.%u.%u.%u\n",
+ IP_PARTS(newsrcip), IP_PARTS(newdstip));
+ } else {
+ /* PASV command: make the connection go to the server */
+ newdstip = master->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip;
+ newsrcip = master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip;
+ DEBUGP("nat_expected: PASV cmd. %u.%u.%u.%u->%u.%u.%u.%u\n",
+ IP_PARTS(newsrcip), IP_PARTS(newdstip));
+ }
+ UNLOCK_BH(&ip_ftp_lock);
+
+ if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC)
+ newip = newsrcip;
+ else
+ newip = newdstip;
+
+ DEBUGP("nat_expected: IP to %u.%u.%u.%u\n", IP_PARTS(newip));
+
+ mr.rangesize = 1;
+ /* We don't want to manip the per-protocol, just the IPs. */
+ mr.range[0].flags = IP_NAT_RANGE_MAP_IPS;
+ mr.range[0].min_ip = mr.range[0].max_ip = newip;
+
+ *verdict = ip_nat_setup_info(ct, &mr, hooknum);
+
+ return 1;
+}
+
+/* This is interesting. We simply use the port given us by the client
+ or server. In practice it's extremely unlikely to clash; if it
+ does, the rule won't be able to get a unique tuple and will drop
+ the packets. */
+static int
+mangle_packet(struct sk_buff **pskb,
+ u_int32_t newip,
+ u_int16_t port,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ struct ip_nat_ftp_info *this_way,
+ struct ip_nat_ftp_info *other_way)
+{
+ struct iphdr *iph = (*pskb)->nh.iph;
+ struct tcphdr *tcph;
+ unsigned char *data;
+ unsigned int tcplen, newlen, newtcplen;
+ char buffer[sizeof("nnn,nnn,nnn,nnn,nnn,nnn")];
+
+ MUST_BE_LOCKED(&ip_ftp_lock);
+ sprintf(buffer, "%u,%u,%u,%u,%u,%u",
+ IP_PARTS(newip), port>>8, port&0xFF);
+
+ tcplen = (*pskb)->len - iph->ihl * 4;
+ newtcplen = tcplen - matchlen + strlen(buffer);
+ newlen = iph->ihl*4 + newtcplen;
+
+ /* So there I am, in the middle of my `netfilter-is-wonderful'
+ talk in Sydney, and someone asks `What happens if you try
+ to enlarge a 64k packet here?'. I think I said something
+ eloquent like `fuck'. */
+ if (newlen > 65535) {
+ if (net_ratelimit())
+ printk("nat_ftp cheat: %u.%u.%u.%u->%u.%u.%u.%u %u\n",
+ NIPQUAD((*pskb)->nh.iph->saddr),
+ NIPQUAD((*pskb)->nh.iph->daddr),
+ (*pskb)->nh.iph->protocol);
+ return NF_DROP;
+ }
+
+ if (newlen > (*pskb)->len + skb_tailroom(*pskb)) {
+ struct sk_buff *newskb;
+ newskb = skb_copy_expand(*pskb, skb_headroom(*pskb), newlen,
+ GFP_ATOMIC);
+ if (!newskb) {
+ DEBUGP("ftp: oom\n");
+ return 0;
+ } else {
+ kfree_skb(*pskb);
+ *pskb = newskb;
+ iph = (*pskb)->nh.iph;
+ }
+ }
+
+ tcph = (void *)iph + iph->ihl*4;
+ data = (void *)tcph + tcph->doff*4;
+
+ DEBUGP("Mapping `%.*s' [%u %u %u] to new `%s' [%u]\n",
+ (int)matchlen, data+matchoff,
+ data[matchoff], data[matchoff+1],
+ matchlen, buffer, strlen(buffer));
+
+ /* SYN adjust. If it's uninitialized, or this is after last
+ correction, record it: we don't handle more than one
+ adjustment in the window, but do deal with common case of a
+ retransmit. */
+ if (this_way->syn_offset_before == this_way->syn_offset_after
+ || before(this_way->syn_correction_pos, ntohl(tcph->seq))) {
+ this_way->syn_correction_pos = ntohl(tcph->seq);
+ this_way->syn_offset_before = this_way->syn_offset_after;
+ this_way->syn_offset_after = (int32_t)
+ this_way->syn_offset_before + newlen - (*pskb)->len;
+ }
+
+ /* Move post-replacement */
+ memmove(data + matchoff + strlen(buffer),
+ data + matchoff + matchlen,
+ (*pskb)->tail - (data + matchoff + matchlen));
+ memcpy(data + matchoff, buffer, strlen(buffer));
+
+ /* Resize packet. */
+ if (newlen > (*pskb)->len) {
+ DEBUGP("ip_nat_ftp: Extending packet by %u to %u bytes\n",
+ newlen - (*pskb)->len, newlen);
+ skb_put(*pskb, newlen - (*pskb)->len);
+ } else {
+ DEBUGP("ip_nat_ftp: Shrinking packet from %u to %u bytes\n",
+ (*pskb)->len, newlen);
+ skb_trim(*pskb, newlen);
+ }
+
+ /* Fix checksums */
+ iph->tot_len = htons(newlen);
+ (*pskb)->csum = csum_partial((char *)tcph + tcph->doff*4,
+ newtcplen - tcph->doff*4, 0);
+ tcph->check = 0;
+ tcph->check = tcp_v4_check(tcph, newtcplen, iph->saddr, iph->daddr,
+ csum_partial((char *)tcph, tcph->doff*4,
+ (*pskb)->csum));
+ ip_send_check(iph);
+ return 1;
+}
+
+/* Grrr... SACK. Fuck me even harder. Don't want to fix it on the
+ fly, so blow it away. */
+static void
+delete_sack(struct sk_buff *skb, struct tcphdr *tcph)
+{
+ unsigned int i;
+ u_int8_t *opt = (u_int8_t *)tcph;
+
+ DEBUGP("Seeking SACKPERM in SYN packet (doff = %u).\n",
+ tcph->doff * 4);
+ for (i = sizeof(struct tcphdr); i < tcph->doff * 4;) {
+ DEBUGP("%u ", opt[i]);
+ switch (opt[i]) {
+ case TCPOPT_NOP:
+ case TCPOPT_EOL:
+ i++;
+ break;
+
+ case TCPOPT_SACK_PERM:
+ goto found_opt;
+
+ default:
+ /* Worst that can happen: it will take us over. */
+ i += opt[i+1] ?: 1;
+ }
+ }
+ DEBUGP("\n");
+ return;
+
+ found_opt:
+ DEBUGP("\n");
+ DEBUGP("Found SACKPERM at offset %u.\n", i);
+
+ /* Must be within TCP header, and valid SACK perm. */
+ if (i + opt[i+1] <= tcph->doff*4 && opt[i+1] == 2) {
+ /* Replace with NOPs. */
+ tcph->check
+ = ip_nat_cheat_check(*((u_int16_t *)(opt + i))^0xFFFF,
+ 0, tcph->check);
+ opt[i] = opt[i+1] = 0;
+ }
+ else DEBUGP("Something wrong with SACK_PERM.\n");
+}
+
+static int ftp_data_fixup(const struct ip_ct_ftp *ct_ftp_info,
+ struct ip_conntrack *ct,
+ struct ip_nat_ftp_info *ftp,
+ unsigned int datalen,
+ struct sk_buff **pskb)
+{
+ u_int32_t newip;
+ struct ip_conntrack_tuple t;
+ struct iphdr *iph = (*pskb)->nh.iph;
+ struct tcphdr *tcph = (void *)iph + iph->ihl*4;
+
+ MUST_BE_LOCKED(&ip_ftp_lock);
+ DEBUGP("FTP_NAT: seq %u + %u in %u + %u\n",
+ ct_ftp_info->seq, ct_ftp_info->len,
+ ntohl(tcph->seq), datalen);
+
+ /* Change address inside packet to match way we're mapping
+ this connection. */
+ if (ct_ftp_info->ftptype == IP_CT_FTP_PASV) {
+ /* PASV response: must be where client thinks server
+ is */
+ newip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip;
+ } else {
+ /* PORT command: must be where server thinks client is */
+ newip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip;
+ }
+
+ if (!mangle_packet(pskb, newip, ct_ftp_info->port,
+ ct_ftp_info->seq - ntohl(tcph->seq),
+ ct_ftp_info->len,
+ &ftp[ct_ftp_info->ftptype],
+ &ftp[!ct_ftp_info->ftptype]))
+ return 0;
+
+ /* Alter conntrack's expectations. */
+
+ /* We can read expect here without conntrack lock, since it's
+ only set in ip_conntrack_ftp, with ip_ftp_lock held
+ writable */
+ t = ct->expected.tuple;
+ t.dst.ip = newip;
+ ip_conntrack_expect_related(ct, &t);
+
+ return 1;
+}
+
+static unsigned int help(struct ip_conntrack *ct,
+ struct ip_nat_info *info,
+ enum ip_conntrack_info ctinfo,
+ unsigned int hooknum,
+ struct sk_buff **pskb)
+{
+ struct iphdr *iph = (*pskb)->nh.iph;
+ struct tcphdr *tcph = (void *)iph + iph->ihl*4;
+ u_int32_t newseq, newack;
+ unsigned int datalen;
+ int dir;
+ int score;
+ struct ip_ct_ftp *ct_ftp_info
+ = &ct->help.ct_ftp_info;
+ struct ip_nat_ftp_info *ftp
+ = &ct->nat.help.ftp_info[0];
+
+ /* Delete SACK_OK on initial TCP SYNs. */
+ if (tcph->syn && !tcph->ack)
+ delete_sack(*pskb, tcph);
+
+ /* Only mangle things once: original direction in POST_ROUTING
+ and reply direction on PRE_ROUTING. */
+ dir = CTINFO2DIR(ctinfo);
+ if (!((hooknum == NF_IP_POST_ROUTING && dir == IP_CT_DIR_ORIGINAL)
+ || (hooknum == NF_IP_PRE_ROUTING && dir == IP_CT_DIR_REPLY))) {
+ DEBUGP("nat_ftp: Not touching dir %s at hook %s\n",
+ dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY",
+ hooknum == NF_IP_POST_ROUTING ? "POSTROUTING"
+ : hooknum == NF_IP_PRE_ROUTING ? "PREROUTING"
+ : hooknum == NF_IP_LOCAL_OUT ? "OUTPUT" : "???");
+ return NF_ACCEPT;
+ }
+
+ datalen = (*pskb)->len - iph->ihl * 4 - tcph->doff * 4;
+ score = 0;
+ LOCK_BH(&ip_ftp_lock);
+ if (ct_ftp_info->len) {
+ /* If it's in the right range... */
+ score += between(ct_ftp_info->seq, ntohl(tcph->seq),
+ ntohl(tcph->seq) + datalen);
+ score += between(ct_ftp_info->seq + ct_ftp_info->len,
+ ntohl(tcph->seq),
+ ntohl(tcph->seq) + datalen);
+ if (score == 1) {
+			/* Half a match? This means a partial retransmission.
+			   It's a cracker being funky. */
+ if (net_ratelimit()) {
+ printk("FTP_NAT: partial packet %u/%u in %u/%u\n",
+ ct_ftp_info->seq, ct_ftp_info->len,
+ ntohl(tcph->seq),
+ ntohl(tcph->seq) + datalen);
+ }
+ UNLOCK_BH(&ip_ftp_lock);
+ return NF_DROP;
+ } else if (score == 2) {
+ if (!ftp_data_fixup(ct_ftp_info, ct, ftp, datalen,
+ pskb)) {
+ UNLOCK_BH(&ip_ftp_lock);
+ return NF_DROP;
+ }
+
+ /* skb may have been reallocated */
+ iph = (*pskb)->nh.iph;
+ tcph = (void *)iph + iph->ihl*4;
+ }
+ }
+
+ /* Sequence adjust */
+ if (after(ntohl(tcph->seq), ftp[dir].syn_correction_pos))
+ newseq = ntohl(tcph->seq) + ftp[dir].syn_offset_after;
+ else
+ newseq = ntohl(tcph->seq) + ftp[dir].syn_offset_before;
+ newseq = htonl(newseq);
+
+ /* Ack adjust */
+ if (after(ntohl(tcph->ack_seq), ftp[!dir].syn_correction_pos))
+ newack = ntohl(tcph->ack_seq) - ftp[!dir].syn_offset_after;
+ else
+ newack = ntohl(tcph->ack_seq) - ftp[!dir].syn_offset_before;
+ newack = htonl(newack);
+ UNLOCK_BH(&ip_ftp_lock);
+
+ tcph->check = ip_nat_cheat_check(~tcph->seq, newseq,
+ ip_nat_cheat_check(~tcph->ack_seq,
+ newack,
+ tcph->check));
+ tcph->seq = newseq;
+ tcph->ack_seq = newack;
+
+ return NF_ACCEPT;
+}
+
+static struct ip_nat_helper ftp
+= { { NULL, NULL }, IPPROTO_TCP, __constant_htons(21), help, "ftp" };
+static struct ip_nat_expect ftp_expect
+= { { NULL, NULL }, ftp_nat_expected };
+
+extern struct module *ip_conntrack_ftp;
+
+static int __init init(void)
+{
+ int ret;
+
+ ret = ip_nat_expect_register(&ftp_expect);
+ if (ret == 0) {
+ ret = ip_nat_helper_register(&ftp);
+
+ if (ret == 0)
+ __MOD_INC_USE_COUNT(ip_conntrack_ftp);
+ else
+ ip_nat_expect_unregister(&ftp_expect);
+ }
+ return ret;
+}
+
+static void __exit fini(void)
+{
+ __MOD_DEC_USE_COUNT(ip_conntrack_ftp);
+ ip_nat_helper_unregister(&ftp);
+ ip_nat_expect_unregister(&ftp_expect);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c
new file mode 100644
index 000000000..9bc7427ce
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c
@@ -0,0 +1,97 @@
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/if.h>
+
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+
+static int
+icmp_in_range(const struct ip_conntrack_tuple *tuple,
+ enum ip_nat_manip_type maniptype,
+ const union ip_conntrack_manip_proto *min,
+ const union ip_conntrack_manip_proto *max)
+{
+ return (tuple->src.u.icmp.id >= min->icmp.id
+ && tuple->src.u.icmp.id <= max->icmp.id);
+}
+
+static int
+icmp_unique_tuple(struct ip_conntrack_tuple *tuple,
+ const struct ip_nat_range *range,
+ enum ip_nat_manip_type maniptype,
+ const struct ip_conntrack *conntrack)
+{
+ static u_int16_t id = 0;
+ unsigned int range_size
+ = (unsigned int)range->max.icmp.id - range->min.icmp.id + 1;
+ unsigned int i;
+
+ /* If no range specified... */
+ if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED))
+ range_size = 0xFFFF;
+
+ for (i = 0; i < range_size; i++, id++) {
+ tuple->src.u.icmp.id = range->min.icmp.id + (id % range_size);
+ if (!ip_nat_used_tuple(tuple, conntrack))
+ return 1;
+ }
+ return 0;
+}
+
+static void
+icmp_manip_pkt(struct iphdr *iph, size_t len,
+ const struct ip_conntrack_manip *manip,
+ enum ip_nat_manip_type maniptype)
+{
+ struct icmphdr *hdr = (struct icmphdr *)((u_int32_t *)iph + iph->ihl);
+
+ hdr->checksum = ip_nat_cheat_check(hdr->un.echo.id ^ 0xFFFF,
+ manip->u.icmp.id,
+ hdr->checksum);
+ hdr->un.echo.id = manip->u.icmp.id;
+}
+
+static unsigned int
+icmp_print(char *buffer,
+ const struct ip_conntrack_tuple *match,
+ const struct ip_conntrack_tuple *mask)
+{
+ unsigned int len = 0;
+
+ if (mask->src.u.icmp.id)
+ len += sprintf(buffer + len, "id=%u ",
+ ntohs(match->src.u.icmp.id));
+
+ if (mask->dst.u.icmp.type)
+ len += sprintf(buffer + len, "type=%u ",
+ ntohs(match->dst.u.icmp.type));
+
+ if (mask->dst.u.icmp.code)
+ len += sprintf(buffer + len, "code=%u ",
+ ntohs(match->dst.u.icmp.code));
+
+ return len;
+}
+
+static unsigned int
+icmp_print_range(char *buffer, const struct ip_nat_range *range)
+{
+ if (range->min.icmp.id != 0 || range->max.icmp.id != 0xFFFF)
+ return sprintf(buffer, "id %u-%u ",
+ ntohs(range->min.icmp.id),
+ ntohs(range->max.icmp.id));
+ else return 0;
+}
+
+struct ip_nat_protocol ip_nat_protocol_icmp
+= { { NULL, NULL }, "ICMP", IPPROTO_ICMP,
+ icmp_manip_pkt,
+ icmp_in_range,
+ icmp_unique_tuple,
+ icmp_print,
+ icmp_print_range
+};
diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c
new file mode 100644
index 000000000..7ff6ccb50
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c
@@ -0,0 +1,143 @@
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/if.h>
+
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+
+static int
+tcp_in_range(const struct ip_conntrack_tuple *tuple,
+ enum ip_nat_manip_type maniptype,
+ const union ip_conntrack_manip_proto *min,
+ const union ip_conntrack_manip_proto *max)
+{
+ u_int16_t port;
+
+ if (maniptype == IP_NAT_MANIP_SRC)
+ port = tuple->src.u.tcp.port;
+ else
+ port = tuple->dst.u.tcp.port;
+
+ return ntohs(port) >= ntohs(min->tcp.port)
+ && ntohs(port) <= ntohs(max->tcp.port);
+}
+
+static int
+tcp_unique_tuple(struct ip_conntrack_tuple *tuple,
+ const struct ip_nat_range *range,
+ enum ip_nat_manip_type maniptype,
+ const struct ip_conntrack *conntrack)
+{
+ static u_int16_t port = 0, *portptr;
+ unsigned int range_size, min, i;
+
+ if (maniptype == IP_NAT_MANIP_SRC)
+ portptr = &tuple->src.u.tcp.port;
+ else
+ portptr = &tuple->dst.u.tcp.port;
+
+ /* If no range specified... */
+ if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
+ /* If it's dst rewrite, can't change port */
+ if (maniptype == IP_NAT_MANIP_DST)
+ return 0;
+
+ /* Map privileged onto privileged. */
+ if (ntohs(*portptr) < 1024) {
+ /* Loose convention: >> 512 is credential passing */
+ if (ntohs(*portptr)<512) {
+ min = 1;
+ range_size = 511 - min + 1;
+ } else {
+ min = 600;
+ range_size = 1023 - min + 1;
+ }
+ } else {
+ min = 1024;
+ range_size = 65535 - 1024 + 1;
+ }
+ } else {
+ min = ntohs(range->min.tcp.port);
+ range_size = ntohs(range->max.tcp.port) - min + 1;
+ }
+
+ for (i = 0; i < range_size; i++, port++) {
+ *portptr = htons(min + port % range_size);
+ if (!ip_nat_used_tuple(tuple, conntrack)) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static void
+tcp_manip_pkt(struct iphdr *iph, size_t len,
+ const struct ip_conntrack_manip *manip,
+ enum ip_nat_manip_type maniptype)
+{
+ struct tcphdr *hdr = (struct tcphdr *)((u_int32_t *)iph + iph->ihl);
+ u_int32_t oldip;
+ u_int16_t *portptr;
+
+ if (maniptype == IP_NAT_MANIP_SRC) {
+ /* Get rid of src ip and src pt */
+ oldip = iph->saddr;
+ portptr = &hdr->source;
+ } else {
+ /* Get rid of dst ip and dst pt */
+ oldip = iph->daddr;
+ portptr = &hdr->dest;
+ }
+ hdr->check = ip_nat_cheat_check(~oldip, manip->ip,
+ ip_nat_cheat_check(*portptr ^ 0xFFFF,
+ manip->u.tcp.port,
+ hdr->check));
+ *portptr = manip->u.tcp.port;
+}
+
+static unsigned int
+tcp_print(char *buffer,
+ const struct ip_conntrack_tuple *match,
+ const struct ip_conntrack_tuple *mask)
+{
+ unsigned int len = 0;
+
+ if (mask->src.u.tcp.port)
+ len += sprintf(buffer + len, "srcpt=%u ",
+ ntohs(match->src.u.tcp.port));
+
+
+ if (mask->dst.u.tcp.port)
+ len += sprintf(buffer + len, "dstpt=%u ",
+ ntohs(match->dst.u.tcp.port));
+
+ return len;
+}
+
+static unsigned int
+tcp_print_range(char *buffer, const struct ip_nat_range *range)
+{
+ if (range->min.tcp.port != 0 || range->max.tcp.port != 0xFFFF) {
+ if (range->min.tcp.port == range->max.tcp.port)
+ return sprintf(buffer, "port %u ",
+ ntohs(range->min.tcp.port));
+ else
+ return sprintf(buffer, "ports %u-%u ",
+ ntohs(range->min.tcp.port),
+ ntohs(range->max.tcp.port));
+ }
+ else return 0;
+}
+
+struct ip_nat_protocol ip_nat_protocol_tcp
+= { { NULL, NULL }, "TCP", IPPROTO_TCP,
+ tcp_manip_pkt,
+ tcp_in_range,
+ tcp_unique_tuple,
+ tcp_print,
+ tcp_print_range
+};
diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c
new file mode 100644
index 000000000..e0dc25910
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_proto_udp.c
@@ -0,0 +1,141 @@
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/if.h>
+
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+
+static int
+udp_in_range(const struct ip_conntrack_tuple *tuple,
+ enum ip_nat_manip_type maniptype,
+ const union ip_conntrack_manip_proto *min,
+ const union ip_conntrack_manip_proto *max)
+{
+ u_int16_t port;
+
+ if (maniptype == IP_NAT_MANIP_SRC)
+ port = tuple->src.u.udp.port;
+ else
+ port = tuple->dst.u.udp.port;
+
+ return ntohs(port) >= ntohs(min->udp.port)
+ && ntohs(port) <= ntohs(max->udp.port);
+}
+
+static int
+udp_unique_tuple(struct ip_conntrack_tuple *tuple,
+ const struct ip_nat_range *range,
+ enum ip_nat_manip_type maniptype,
+ const struct ip_conntrack *conntrack)
+{
+ static u_int16_t port = 0, *portptr;
+ unsigned int range_size, min, i;
+
+ if (maniptype == IP_NAT_MANIP_SRC)
+ portptr = &tuple->src.u.udp.port;
+ else
+ portptr = &tuple->dst.u.udp.port;
+
+ /* If no range specified... */
+ if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
+ /* If it's dst rewrite, can't change port */
+ if (maniptype == IP_NAT_MANIP_DST)
+ return 0;
+
+ if (ntohs(*portptr) < 1024) {
+ /* Loose convention: >> 512 is credential passing */
+ if (ntohs(*portptr)<512) {
+ min = 1;
+ range_size = 511 - min + 1;
+ } else {
+ min = 600;
+ range_size = 1023 - min + 1;
+ }
+ } else {
+ min = 1024;
+ range_size = 65535 - 1024 + 1;
+ }
+ } else {
+ min = ntohs(range->min.udp.port);
+ range_size = ntohs(range->max.udp.port) - min + 1;
+ }
+
+ for (i = 0; i < range_size; i++, port++) {
+ *portptr = htons(min + port % range_size);
+ if (!ip_nat_used_tuple(tuple, conntrack))
+ return 1;
+ }
+ return 0;
+}
+
+static void
+udp_manip_pkt(struct iphdr *iph, size_t len,
+ const struct ip_conntrack_manip *manip,
+ enum ip_nat_manip_type maniptype)
+{
+ struct udphdr *hdr = (struct udphdr *)((u_int32_t *)iph + iph->ihl);
+ u_int32_t oldip;
+ u_int16_t *portptr;
+
+ if (maniptype == IP_NAT_MANIP_SRC) {
+ /* Get rid of src ip and src pt */
+ oldip = iph->saddr;
+ portptr = &hdr->source;
+ } else {
+ /* Get rid of dst ip and dst pt */
+ oldip = iph->daddr;
+ portptr = &hdr->dest;
+ }
+ hdr->check = ip_nat_cheat_check(~oldip, manip->ip,
+ ip_nat_cheat_check(*portptr ^ 0xFFFF,
+ manip->u.udp.port,
+ hdr->check));
+ *portptr = manip->u.udp.port;
+}
+
+static unsigned int
+udp_print(char *buffer,
+ const struct ip_conntrack_tuple *match,
+ const struct ip_conntrack_tuple *mask)
+{
+ unsigned int len = 0;
+
+ if (mask->src.u.udp.port)
+ len += sprintf(buffer + len, "srcpt=%u ",
+ ntohs(match->src.u.udp.port));
+
+
+ if (mask->dst.u.udp.port)
+ len += sprintf(buffer + len, "dstpt=%u ",
+ ntohs(match->dst.u.udp.port));
+
+ return len;
+}
+
+static unsigned int
+udp_print_range(char *buffer, const struct ip_nat_range *range)
+{
+ if (range->min.udp.port != 0 || range->max.udp.port != 0xFFFF) {
+ if (range->min.udp.port == range->max.udp.port)
+ return sprintf(buffer, "port %u ",
+ ntohs(range->min.udp.port));
+ else
+ return sprintf(buffer, "ports %u-%u ",
+ ntohs(range->min.udp.port),
+ ntohs(range->max.udp.port));
+ }
+ else return 0;
+}
+
+struct ip_nat_protocol ip_nat_protocol_udp
+= { { NULL, NULL }, "UDP", IPPROTO_UDP,
+ udp_manip_pkt,
+ udp_in_range,
+ udp_unique_tuple,
+ udp_print,
+ udp_print_range
+};
diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c
new file mode 100644
index 000000000..0e3907036
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c
@@ -0,0 +1,61 @@
+/* The "unknown" protocol. This is what is used for protocols we
+ * don't understand. It's returned by find_proto().
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/netfilter.h>
+#include <linux/if.h>
+
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+
+static int unknown_in_range(const struct ip_conntrack_tuple *tuple,
+ enum ip_nat_manip_type manip_type,
+ const union ip_conntrack_manip_proto *min,
+ const union ip_conntrack_manip_proto *max)
+{
+ return 1;
+}
+
+static int unknown_unique_tuple(struct ip_conntrack_tuple *tuple,
+ const struct ip_nat_range *range,
+ enum ip_nat_manip_type maniptype,
+ const struct ip_conntrack *conntrack)
+{
+ /* Sorry: we can't help you; if it's not unique, we can't frob
+ anything. */
+ return 0;
+}
+
+static void
+unknown_manip_pkt(struct iphdr *iph, size_t len,
+ const struct ip_conntrack_manip *manip,
+ enum ip_nat_manip_type maniptype)
+{
+ return;
+}
+
+static unsigned int
+unknown_print(char *buffer,
+ const struct ip_conntrack_tuple *match,
+ const struct ip_conntrack_tuple *mask)
+{
+ return 0;
+}
+
+static unsigned int
+unknown_print_range(char *buffer, const struct ip_nat_range *range)
+{
+ return 0;
+}
+
+struct ip_nat_protocol unknown_nat_protocol = {
+ { NULL, NULL }, "unknown", 0,
+ unknown_manip_pkt,
+ unknown_in_range,
+ unknown_unique_tuple,
+ unknown_print,
+ unknown_print_range
+};
diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c
new file mode 100644
index 000000000..74516687b
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_rule.c
@@ -0,0 +1,327 @@
+/* Everything about the rules for NAT. */
+#define __NO_VERSION__
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <net/checksum.h>
+#include <linux/bitops.h>
+#include <linux/version.h>
+
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+#define NAT_VALID_HOOKS ((1<<NF_IP_PRE_ROUTING) | (1<<NF_IP_POST_ROUTING) | (1<<NF_IP_LOCAL_OUT))
+
+/* Standard entry. */
+struct ipt_standard
+{
+ struct ipt_entry entry;
+ struct ipt_standard_target target;
+};
+
+struct ipt_error_target
+{
+ struct ipt_entry_target target;
+ char errorname[IPT_FUNCTION_MAXNAMELEN];
+};
+
+struct ipt_error
+{
+ struct ipt_entry entry;
+ struct ipt_error_target target;
+};
+
+static struct
+{
+ struct ipt_replace repl;
+ struct ipt_standard entries[3];
+ struct ipt_error term;
+} nat_initial_table __initdata
+= { { "nat", NAT_VALID_HOOKS, 4,
+ sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
+ { [NF_IP_PRE_ROUTING] 0,
+ [NF_IP_POST_ROUTING] sizeof(struct ipt_standard),
+ [NF_IP_LOCAL_OUT] sizeof(struct ipt_standard) * 2 },
+ { [NF_IP_PRE_ROUTING] 0,
+ [NF_IP_POST_ROUTING] sizeof(struct ipt_standard),
+ [NF_IP_LOCAL_OUT] sizeof(struct ipt_standard) * 2 },
+ 0, NULL, { } },
+ {
+ /* PRE_ROUTING */
+ { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+ 0,
+ sizeof(struct ipt_entry),
+ sizeof(struct ipt_standard),
+ 0, { 0, 0 }, { } },
+ { { sizeof(struct ipt_standard_target), { "" }, { } },
+ -NF_ACCEPT - 1 } },
+ /* POST_ROUTING */
+ { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+ 0,
+ sizeof(struct ipt_entry),
+ sizeof(struct ipt_standard),
+ 0, { 0, 0 }, { } },
+ { { sizeof(struct ipt_standard_target), { "" }, { } },
+ -NF_ACCEPT - 1 } },
+ /* LOCAL_OUT */
+ { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+ 0,
+ sizeof(struct ipt_entry),
+ sizeof(struct ipt_standard),
+ 0, { 0, 0 }, { } },
+ { { sizeof(struct ipt_standard_target), { "" }, { } },
+ -NF_ACCEPT - 1 } }
+ },
+ /* ERROR */
+ { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+ 0,
+ sizeof(struct ipt_entry),
+ sizeof(struct ipt_error),
+ 0, { 0, 0 }, { } },
+ { { sizeof(struct ipt_error_target), { IPT_ERROR_TARGET },
+ { } },
+ "ERROR"
+ }
+ }
+};
+
+static struct ipt_table nat_table
+= { { NULL, NULL }, "nat", &nat_initial_table.repl,
+ NAT_VALID_HOOKS, RW_LOCK_UNLOCKED, NULL };
+
+LIST_HEAD(nat_expect_list);
+
+/* Source NAT */
+static unsigned int ipt_snat_target(struct sk_buff **pskb,
+ unsigned int hooknum,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *targinfo,
+ void *userinfo)
+{
+ struct ip_conntrack *ct;
+ enum ip_conntrack_info ctinfo;
+
+ IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING);
+
+ ct = ip_conntrack_get(*pskb, &ctinfo);
+
+ /* Connection must be valid and new. */
+ IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
+ IP_NF_ASSERT(out);
+
+ return ip_nat_setup_info(ct, targinfo, hooknum);
+}
+
+static unsigned int ipt_dnat_target(struct sk_buff **pskb,
+ unsigned int hooknum,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *targinfo,
+ void *userinfo)
+{
+ struct ip_conntrack *ct;
+ enum ip_conntrack_info ctinfo;
+
+ IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
+ || hooknum == NF_IP_LOCAL_OUT);
+
+ ct = ip_conntrack_get(*pskb, &ctinfo);
+
+ /* Connection must be valid and new. */
+ IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
+
+ return ip_nat_setup_info(ct, targinfo, hooknum);
+}
+
+static int ipt_snat_checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ struct ip_nat_multi_range *mr = targinfo;
+
+ /* Must be a valid range */
+ if (targinfosize < sizeof(struct ip_nat_multi_range)) {
+ DEBUGP("SNAT: Target size %u too small\n", targinfosize);
+ return 0;
+ }
+
+ if (targinfosize != IPT_ALIGN((sizeof(struct ip_nat_multi_range)
+ + (sizeof(struct ip_nat_range)
+ * (mr->rangesize - 1))))) {
+ DEBUGP("SNAT: Target size %u wrong for %u ranges\n",
+ targinfosize, mr->rangesize);
+ return 0;
+ }
+
+ if (hook_mask & ~(1 << NF_IP_POST_ROUTING)) {
+ DEBUGP("SNAT: hook mask 0x%x bad\n", hook_mask);
+ return 0;
+ }
+ return 1;
+}
+
+static int ipt_dnat_checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ struct ip_nat_multi_range *mr = targinfo;
+
+ /* Must be a valid range */
+ if (targinfosize < sizeof(struct ip_nat_multi_range)) {
+ DEBUGP("DNAT: Target size %u too small\n", targinfosize);
+ return 0;
+ }
+
+ if (targinfosize != IPT_ALIGN((sizeof(struct ip_nat_multi_range)
+ + (sizeof(struct ip_nat_range)
+ * (mr->rangesize - 1))))) {
+ DEBUGP("DNAT: Target size %u wrong for %u ranges\n",
+ targinfosize, mr->rangesize);
+ return 0;
+ }
+
+ if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))) {
+ DEBUGP("DNAT: hook mask 0x%x bad\n", hook_mask);
+ return 0;
+ }
+ return 1;
+}
+
+static inline unsigned int
+alloc_null_binding(struct ip_conntrack *conntrack,
+ struct ip_nat_info *info,
+ unsigned int hooknum)
+{
+ /* Force range to this IP; let proto decide mapping for
+ per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
+ Use reply in case it's already been mangled (eg local packet).
+ */
+ u_int32_t ip
+ = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
+ ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip
+ : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip);
+ struct ip_nat_multi_range mr
+ = { 1, { { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } } } };
+
+ DEBUGP("Allocating NULL binding for %p (%u.%u.%u.%u)\n", conntrack,
+ IP_PARTS(ip));
+ return ip_nat_setup_info(conntrack, &mr, hooknum);
+}
+
+static inline int call_expect(const struct ip_nat_expect *i,
+ struct sk_buff **pskb,
+ unsigned int hooknum,
+ struct ip_conntrack *ct,
+ struct ip_nat_info *info,
+ struct ip_conntrack *master,
+ struct ip_nat_info *masterinfo,
+ unsigned int *verdict)
+{
+ return i->expect(pskb, hooknum, ct, info, master, masterinfo,
+ verdict);
+}
+
+int ip_nat_rule_find(struct sk_buff **pskb,
+ unsigned int hooknum,
+ const struct net_device *in,
+ const struct net_device *out,
+ struct ip_conntrack *ct,
+ struct ip_nat_info *info)
+{
+ int ret;
+
+ /* Master won't vanish while this ctrack still alive */
+ if (ct->master.master) {
+ struct ip_conntrack *master;
+
+ master = (struct ip_conntrack *)ct->master.master;
+ if (LIST_FIND(&nat_expect_list,
+ call_expect,
+ struct ip_nat_expect *,
+ pskb, hooknum, ct, info,
+ master, &master->nat.info, &ret))
+ return ret;
+ }
+ ret = ipt_do_table(pskb, hooknum, in, out, &nat_table, NULL);
+ if (ret == NF_ACCEPT) {
+ if (!(info->initialized & (1 << HOOK2MANIP(hooknum))))
+ /* NUL mapping */
+ ret = alloc_null_binding(ct, info, hooknum);
+ }
+ return ret;
+}
+
+int ip_nat_expect_register(struct ip_nat_expect *expect)
+{
+ WRITE_LOCK(&ip_nat_lock);
+ list_prepend(&nat_expect_list, expect);
+ WRITE_UNLOCK(&ip_nat_lock);
+
+ return 0;
+}
+
+void ip_nat_expect_unregister(struct ip_nat_expect *expect)
+{
+ WRITE_LOCK(&ip_nat_lock);
+ LIST_DELETE(&nat_expect_list, expect);
+ WRITE_UNLOCK(&ip_nat_lock);
+}
+
+static struct ipt_target ipt_snat_reg
+= { { NULL, NULL }, "SNAT", ipt_snat_target, ipt_snat_checkentry, NULL };
+static struct ipt_target ipt_dnat_reg
+= { { NULL, NULL }, "DNAT", ipt_dnat_target, ipt_dnat_checkentry, NULL };
+
+int __init ip_nat_rule_init(void)
+{
+ int ret;
+
+ ret = ipt_register_table(&nat_table);
+ if (ret != 0)
+ return ret;
+ ret = ipt_register_target(&ipt_snat_reg);
+ if (ret != 0)
+ goto unregister_table;
+
+ ret = ipt_register_target(&ipt_dnat_reg);
+ if (ret != 0)
+ goto unregister_snat;
+
+ return ret;
+
+ unregister_snat:
+ ipt_unregister_target(&ipt_snat_reg);
+ unregister_table:
+ ipt_unregister_table(&nat_table);
+
+ return ret;
+}
+
+void ip_nat_rule_cleanup(void)
+{
+ ipt_unregister_target(&ipt_dnat_reg);
+ ipt_unregister_target(&ipt_snat_reg);
+ ipt_unregister_table(&nat_table);
+}
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
new file mode 100644
index 000000000..bf278d6f9
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -0,0 +1,273 @@
+/* This file contains all the functions required for the standalone
+ ip_nat module.
+
+ These are not required by the compatibility layer.
+*/
+
+/* (c) 1999 Paul `Rusty' Russell. Licenced under the GNU General
+ Public Licence. */
+
+#ifdef MODULE
+#define EXPORT_SYMTAB
+#endif
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <net/checksum.h>
+#include <linux/spinlock.h>
+#include <linux/version.h>
+#include <linux/brlock.h>
+
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
+
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+/* Compile-time debug switch: flip the #if to 1 to turn DEBUGP into
+   printk; otherwise it expands to nothing. */
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* Human-readable name for the three hooks this module attaches to
+   (used in debug output only). */
+#define HOOKNAME(hooknum) ((hooknum) == NF_IP_POST_ROUTING ? "POST_ROUTING" \
+ : ((hooknum) == NF_IP_PRE_ROUTING ? "PRE_ROUTING" \
+ : ((hooknum) == NF_IP_LOCAL_OUT ? "LOCAL_OUT" \
+ : "*ERROR*")))
+
+/* Core NAT hook body, shared by PRE_ROUTING, LOCAL_OUT (via the hook
+   ops below) and POST_ROUTING (via ip_nat_out).  Looks up the
+   packet's conntrack entry, sets up a NAT binding for it on first
+   sight (per manip type), then applies the bindings via
+   do_bindings().  Returns a netfilter verdict. */
+static unsigned int
+ip_nat_fn(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct ip_conntrack *ct;
+ enum ip_conntrack_info ctinfo;
+ struct ip_nat_info *info;
+ /* maniptype == SRC for postrouting. */
+ enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum);
+
+ /* We never see fragments: conntrack defrags on pre-routing
+ and local-out, and ip_nat_out protects post-routing. */
+ IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
+ & __constant_htons(IP_MF|IP_OFFSET)));
+
+ /* FIXME: One day, fill in properly. --RR */
+ (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED;
+
+ /* If we had a hardware checksum before, it's now invalid */
+ if ((*pskb)->pkt_type != PACKET_LOOPBACK)
+ (*pskb)->ip_summed = CHECKSUM_NONE;
+
+ ct = ip_conntrack_get(*pskb, &ctinfo);
+ /* Can't track? Maybe out of memory: this would make NAT
+ unreliable. */
+ if (!ct)
+ return NF_DROP;
+
+ switch (ctinfo) {
+ case IP_CT_RELATED:
+ case IP_CT_RELATED+IP_CT_IS_REPLY:
+ /* ICMP errors related to a NATed connection get their
+ embedded header rewritten instead of a fresh binding. */
+ if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) {
+ icmp_reply_translation(*pskb, ct, hooknum,
+ CTINFO2DIR(ctinfo));
+ return NF_ACCEPT;
+ }
+ /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
+ case IP_CT_NEW:
+ info = &ct->nat.info;
+
+ WRITE_LOCK(&ip_nat_lock);
+ /* Seen it before? This can happen for loopback, retrans,
+ or local packets.. */
+ if (!(info->initialized & (1 << maniptype))) {
+ int in_hashes = info->initialized;
+ unsigned int ret;
+
+ /* Consult expectations, then the nat table, to
+ choose (or null-) bind this connection. */
+ ret = ip_nat_rule_find(pskb, hooknum, in, out,
+ ct, info);
+ if (ret != NF_ACCEPT) {
+ WRITE_UNLOCK(&ip_nat_lock);
+ return ret;
+ }
+
+ /* First manip places us in the NAT hashes; the
+ second manip only moves us. */
+ if (in_hashes) {
+ IP_NF_ASSERT(info->bysource.conntrack);
+ replace_in_hashes(ct, info);
+ } else {
+ place_in_hashes(ct, info);
+ }
+ } else
+ DEBUGP("Already setup manip %s for ct %p\n",
+ maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST",
+ ct);
+ WRITE_UNLOCK(&ip_nat_lock);
+ break;
+
+ default:
+ /* ESTABLISHED */
+ IP_NF_ASSERT(ctinfo == IP_CT_ESTABLISHED
+ || ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY));
+ info = &ct->nat.info;
+ }
+
+ IP_NF_ASSERT(info);
+ return do_bindings(ct, ctinfo, info, hooknum, pskb);
+}
+
+/* POST_ROUTING entry point: unlike the other hooks, packets may be
+   fragmented here, so reassemble before handing off to ip_nat_fn(). */
+static unsigned int
+ip_nat_out(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ /* We can hit fragments here; forwarded packets get defragmented
+    by connection tracking coming in, then fragmented (grr) by the
+    forward code.
+
+    In future: If we have nfct != NULL, AND we have NAT
+    initialized, AND there is no helper, then we can do full
+    NAPT on the head, and IP-address-only NAT on the rest.
+
+    I'm starting to have nightmares about fragments. */
+
+ if (((*pskb)->nh.iph->frag_off
+     & __constant_htons(IP_MF|IP_OFFSET)) != 0) {
+  struct sk_buff *whole = ip_ct_gather_frags(*pskb);
+
+  *pskb = whole;
+  if (whole == NULL)
+   return NF_STOLEN; /* reassembly consumed the skb */
+ }
+
+ return ip_nat_fn(hooknum, pskb, in, out, okfn);
+}
+
+/* We must be after connection tracking and before packet filtering. */
+
+/* Hook registrations.  Positional initializers:
+   { list head, hook fn, protocol family, hook number, priority }. */
+
+/* Before packet filtering, change destination */
+static struct nf_hook_ops ip_nat_in_ops
+= { { NULL, NULL }, ip_nat_fn, PF_INET, NF_IP_PRE_ROUTING, NF_IP_PRI_NAT_DST };
+/* After packet filtering, change source */
+static struct nf_hook_ops ip_nat_out_ops
+= { { NULL, NULL }, ip_nat_out, PF_INET, NF_IP_POST_ROUTING, NF_IP_PRI_NAT_SRC};
+/* Before packet filtering, change destination */
+static struct nf_hook_ops ip_nat_local_out_ops
+= { { NULL, NULL }, ip_nat_fn, PF_INET, NF_IP_LOCAL_OUT, NF_IP_PRI_NAT_DST };
+
+/* Protocol registration. */
+int ip_nat_protocol_register(struct ip_nat_protocol *proto)
+{
+ int ret = 0;
+ struct list_head *i;
+
+ WRITE_LOCK(&ip_nat_lock);
+ for (i = protos.next; i != &protos; i = i->next) {
+ if (((struct ip_nat_protocol *)i)->protonum
+ == proto->protonum) {
+ ret = -EBUSY;
+ goto out;
+ }
+ }
+
+ list_prepend(&protos, proto);
+ MOD_INC_USE_COUNT;
+
+ out:
+ WRITE_UNLOCK(&ip_nat_lock);
+ return ret;
+}
+
+/* Noone stores the protocol anywhere; simply delete it. */
+void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
+{
+ WRITE_LOCK(&ip_nat_lock);
+ LIST_DELETE(&protos, proto);
+ WRITE_UNLOCK(&ip_nat_lock);
+
+ /* Someone could be still looking at the proto in a bh. */
+ br_write_lock_bh(BR_NETPROTO_LOCK);
+ br_write_unlock_bh(BR_NETPROTO_LOCK);
+
+ MOD_DEC_USE_COUNT;
+}
+
+/* Single init/teardown path: init_or_cleanup(1) brings the NAT module
+   up, init_or_cleanup(0) tears it down.  The cleanup labels double as
+   the unwind ladder for a partially-completed init, entered at the
+   depth matching how far init got. */
+static int init_or_cleanup(int init)
+{
+ int ret = 0;
+
+ if (!init) goto cleanup;
+
+ ret = ip_nat_rule_init();
+ if (ret < 0) {
+ printk("ip_nat_init: can't setup rules.\n");
+ goto cleanup_nothing;
+ }
+ ret = ip_nat_init();
+ if (ret < 0) {
+ /* Fixed: this message was a copy/paste of the rule-setup one
+    above, misattributing a NAT-core failure to rule setup. */
+ printk("ip_nat_init: can't setup nat core.\n");
+ goto cleanup_rule_init;
+ }
+ ret = nf_register_hook(&ip_nat_in_ops);
+ if (ret < 0) {
+ printk("ip_nat_init: can't register in hook.\n");
+ goto cleanup_nat;
+ }
+ ret = nf_register_hook(&ip_nat_out_ops);
+ if (ret < 0) {
+ printk("ip_nat_init: can't register out hook.\n");
+ goto cleanup_inops;
+ }
+ ret = nf_register_hook(&ip_nat_local_out_ops);
+ if (ret < 0) {
+ printk("ip_nat_init: can't register local out hook.\n");
+ goto cleanup_outops;
+ }
+ /* Pin conntrack for as long as NAT is loaded. */
+ __MOD_INC_USE_COUNT(ip_conntrack_module);
+ return ret;
+
+ cleanup:
+ __MOD_DEC_USE_COUNT(ip_conntrack_module);
+ nf_unregister_hook(&ip_nat_local_out_ops);
+ cleanup_outops:
+ nf_unregister_hook(&ip_nat_out_ops);
+ cleanup_inops:
+ nf_unregister_hook(&ip_nat_in_ops);
+ cleanup_nat:
+ ip_nat_cleanup();
+ cleanup_rule_init:
+ ip_nat_rule_cleanup();
+ cleanup_nothing:
+ MUST_BE_READ_WRITE_UNLOCKED(&ip_nat_lock);
+ return ret;
+}
+
+/* Module entry/exit: both funnel into init_or_cleanup(). */
+static int __init init(void)
+{
+ return init_or_cleanup(1);
+}
+
+static void __exit fini(void)
+{
+ init_or_cleanup(0);
+}
+
+module_init(init);
+module_exit(fini);
+
+/* Symbols used by NAT protocol/application helpers (e.g. the FTP
+   helper module). */
+EXPORT_SYMBOL(ip_nat_setup_info);
+EXPORT_SYMBOL(ip_nat_helper_register);
+EXPORT_SYMBOL(ip_nat_helper_unregister);
+EXPORT_SYMBOL(ip_nat_expect_register);
+EXPORT_SYMBOL(ip_nat_expect_unregister);
+EXPORT_SYMBOL(ip_nat_cheat_check);
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
new file mode 100644
index 000000000..d5ca01aa6
--- /dev/null
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -0,0 +1,752 @@
+/*
+ * This is a module which is used for queueing IPv4 packets and
+ * communicating with userspace via netlink.
+ *
+ * (C) 2000 James Morris
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/netlink.h>
+#include <linux/spinlock.h>
+#include <linux/smp_lock.h>
+#include <linux/rtnetlink.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <net/sock.h>
+
+#include <linux/netfilter_ipv4/ip_queue.h>
+
+EXPORT_NO_SYMBOLS;
+
+#define IPQ_THR_NAME "kipq"
+#define IPQ_NAME "ip_queue"
+#define IPQ_QMAX_DEFAULT 1024
+
+#define IPQ_PROC_FS_NAME "ip_queue"
+
+#define NET_IPQ_QMAX 2088
+#define NET_IPQ_QMAX_NAME "ip_queue_maxlen"
+
+/* One queued packet awaiting a userspace verdict. */
+typedef struct ipq_queue_element {
+ struct list_head list; /* Links element into queue */
+ unsigned char state; /* State of this element (IPQ_PS_*) */
+ int verdict; /* Current verdict */
+ struct nf_info *info; /* Extra info from netfilter */
+ struct sk_buff *skb; /* Packet inside */
+} ipq_queue_element_t;
+
+/* Transport callback used to push a queued element to the peer. */
+typedef int (*ipq_send_cb_t)(ipq_queue_element_t *e);
+
+/* The userland process that issues verdicts. */
+typedef struct ipq_peer {
+ pid_t pid; /* PID of userland peer */
+ unsigned char died; /* We think the peer died */
+ unsigned char copy_mode; /* Copy packet as well as metadata? */
+ size_t copy_range; /* Range past metadata to copy */
+ ipq_send_cb_t send; /* Callback for sending data to peer */
+} ipq_peer_t;
+
+/* The kernel thread that drives queue processing. */
+typedef struct ipq_thread {
+ pid_t pid; /* PID of kernel thread */
+ unsigned char terminate; /* Termination flag */
+ unsigned char running; /* Running flag */
+ wait_queue_head_t wq; /* I/O wait queue */
+ void (*process)(void *data); /* Queue processing function */
+} ipq_thread_t;
+
+/* The packet queue itself: list + peer + worker thread. */
+typedef struct ipq_queue {
+ int len; /* Current queue len */
+ int *maxlen; /* Maximum queue len, via sysctl */
+ unsigned char state; /* Current queue state (IPQ_QS_*) */
+ struct list_head list; /* Head of packet queue */
+ spinlock_t lock; /* Queue spinlock */
+ ipq_peer_t peer; /* Userland peer */
+ ipq_thread_t thread; /* Thread context */
+} ipq_queue_t;
+
+
+/****************************************************************************
+*
+* Kernel thread
+*
+****************************************************************************/
+
+/* Turn the current task into a detached kernel daemon: drop inherited
+   files, daemonize, set the thread name, then flush and block all
+   signals.  The order (daemonize under the BKL, then signal setup
+   under sigmask_lock) follows the usual kernel-thread recipe. */
+static void ipq_thread_init(char *thread_name)
+{
+ lock_kernel();
+ exit_files(current);
+ daemonize();
+ strcpy(current->comm, thread_name);
+ unlock_kernel();
+ spin_lock_irq(&current->sigmask_lock);
+ flush_signals(current);
+ sigfillset(&current->blocked);
+ recalc_sigpending(current);
+ spin_unlock_irq(&current->sigmask_lock);
+}
+
+/* Kernel-thread body: sleep on the queue's waitqueue and run the
+   queue's process callback each time it is woken, until asked to
+   terminate via q->thread.terminate. */
+static int ipq_thread_start(void *data)
+{
+ ipq_queue_t *q = (ipq_queue_t *)data;
+
+ q->thread.running = 1;
+ ipq_thread_init(IPQ_THR_NAME);
+ q->thread.pid = current->pid;
+ while (!q->thread.terminate) {
+ interruptible_sleep_on(&q->thread.wq);
+ q->thread.process(q);
+ }
+ q->thread.running = 0;
+ return 0;
+}
+
+/* Ask the queue thread to flush and terminate, then poll until it has
+   actually exited. */
+static void ipq_thread_stop(ipq_queue_t *q)
+{
+ if (!(q->thread.pid || q->thread.running))
+  return;
+ q->state = IPQ_QS_FLUSH;
+ q->thread.terminate = 1;
+ wake_up_interruptible(&q->thread.wq);
+ /* Fixed: the sleep state must be re-armed on every iteration --
+    schedule_timeout() returns with the task back in TASK_RUNNING,
+    so setting TASK_INTERRUPTIBLE once outside the loop (as the
+    original did) busy-spins after the first pass, and leaks a
+    TASK_INTERRUPTIBLE state if the loop body never runs. */
+ while (q->thread.running) {
+  current->state = TASK_INTERRUPTIBLE;
+  schedule_timeout(HZ/10);
+ }
+ current->state = TASK_RUNNING;
+}
+
+/* Spawn the queue worker thread.  Returns 0 on success or the
+   negative error from kernel_thread(). */
+static int ipq_thread_create(ipq_queue_t *q)
+{
+ int pid = kernel_thread(ipq_thread_start, q, 0);
+
+ if (pid < 0)
+  return pid;
+ return 0;
+}
+
+
+/****************************************************************************
+ *
+ * Packet queue
+ *
+ ****************************************************************************/
+
+/* Must be called under spinlock */
+static __inline__ void
+ipq_dequeue(ipq_queue_t *q,
+ ipq_queue_element_t *e)
+{
+ list_del(&e->list);
+ nf_reinject(e->skb, e->info, e->verdict);
+ kfree(e);
+ q->len--;
+}
+
+/* Must be called under spinlock */
+static __inline__ void
+ipq_queue_drop(ipq_queue_t *q,
+ ipq_queue_element_t *e)
+{
+ e->verdict = NF_DROP;
+ ipq_dequeue(q, e);
+}
+
+/* Push element E to the userspace peer via the queue's send callback.
+   On success the element moves to IPQ_PS_WAITING (awaiting verdict).
+   Transient errors (-ERESTARTSYS/-EAGAIN) leave the element as-is to
+   be retried; any other error is treated as a dead peer: reset peer
+   state and switch the queue to FLUSH mode. */
+static int
+ipq_notify_peer(ipq_queue_t *q,
+ ipq_queue_element_t *e)
+{
+ int status = q->peer.send(e);
+
+ if (status >= 0) {
+ e->state = IPQ_PS_WAITING;
+ return status;
+ }
+ if (status == -ERESTARTSYS || status == -EAGAIN)
+ return 0;
+ printk(KERN_INFO "%s: error notifying peer %d, resetting "
+ "state and flushing queue\n", IPQ_NAME, q->peer.pid);
+ q->state = IPQ_QS_FLUSH;
+ q->peer.died = 1;
+ q->peer.pid = 0;
+ q->peer.copy_mode = IPQ_COPY_META;
+ q->peer.copy_range = 0;
+ return status;
+}
+
+/* Worker-thread pass over the queue (runs as q->thread.process).
+   Walks the list oldest-first (from list.prev): in FLUSH state every
+   element is dropped; otherwise NEW elements are sent to the peer,
+   VERDICT elements are reinjected, WAITING ones are left alone.  A
+   send failure restarts the whole walk, because ipq_notify_peer()
+   may have flipped the queue into FLUSH state. */
+static void
+ipq_queue_process(void *data)
+{
+ struct list_head *i;
+ ipq_queue_t *q = (ipq_queue_t *)data;
+
+restart:
+ if (q->state == IPQ_QS_HOLD)
+ return;
+ spin_lock_bh(&q->lock);
+ for (i = q->list.prev; i != &q->list; i = i->prev) {
+ ipq_queue_element_t *e = (ipq_queue_element_t *)i;
+
+ if (q->state == IPQ_QS_FLUSH) {
+ QDEBUG("flushing packet %p\n", e);
+ ipq_queue_drop(q, e);
+ continue;
+ }
+ switch (e->state) {
+ case IPQ_PS_NEW: {
+ int status = ipq_notify_peer(q, e);
+ if (status < 0) {
+ spin_unlock_bh(&q->lock);
+ goto restart;
+ }
+ break;
+ }
+ case IPQ_PS_VERDICT:
+ ipq_dequeue(q, e);
+ break;
+ case IPQ_PS_WAITING:
+ break;
+ default:
+ printk(KERN_INFO "%s: dropping stuck packet %p "
+ "with ps=%d qs=%d\n", IPQ_NAME,
+ e, e->state, q->state);
+ ipq_queue_drop(q, e);
+ }
+ }
+ spin_unlock_bh(&q->lock);
+ /* Flush complete: park the queue until a peer reconfigures it. */
+ if (q->state == IPQ_QS_FLUSH)
+ q->state = IPQ_QS_HOLD;
+}
+
+/* Allocate and initialise the packet queue, register it as the
+   PF_INET netfilter queue handler, and start its worker thread.
+   On failure returns NULL with *errp set (-ENOMEM, -EBUSY, or the
+   thread-creation error). */
+static ipq_queue_t *
+ipq_queue_create(nf_queue_outfn_t outfn,
+ ipq_send_cb_t send_cb,
+ int *errp,
+ int *sysctl_qmax)
+{
+ ipq_queue_t *q;
+ int status;
+
+ *errp = 0;
+ q = kmalloc(sizeof(ipq_queue_t), GFP_KERNEL);
+ if (q == NULL) {
+  *errp = -ENOMEM;
+  return NULL;
+ }
+
+ /* Worker thread context. */
+ q->thread.terminate = 0;
+ q->thread.running = 0;
+ q->thread.process = ipq_queue_process;
+ init_waitqueue_head(&q->thread.wq);
+
+ /* No peer attached yet; metadata-only copies by default. */
+ q->peer.pid = 0;
+ q->peer.died = 0;
+ q->peer.copy_mode = IPQ_COPY_META;
+ q->peer.copy_range = 0;
+ q->peer.send = send_cb;
+
+ /* Empty packet list, parked until a peer sets a copy mode. */
+ q->len = 0;
+ q->maxlen = sysctl_qmax;
+ q->state = IPQ_QS_HOLD;
+ INIT_LIST_HEAD(&q->list);
+ spin_lock_init(&q->lock);
+
+ status = nf_register_queue_handler(PF_INET, outfn, q);
+ if (status < 0) {
+  *errp = -EBUSY;
+  goto err_free;
+ }
+ status = ipq_thread_create(q);
+ if (status < 0) {
+  nf_unregister_queue_handler(PF_INET);
+  *errp = status;
+  goto err_free;
+ }
+ return q;
+
+err_free:
+ kfree(q);
+ return NULL;
+}
+
+/* Append SKB (plus its netfilter reinject info) to the queue as a NEW
+   element and wake the worker thread.  If the queue is full the
+   packet is immediately reinjected with NF_DROP rather than queued;
+   that still counts as success (returns 0). */
+static int
+ipq_enqueue(ipq_queue_t *q,
+ struct sk_buff *skb,
+ struct nf_info *info)
+{
+ ipq_queue_element_t *e = NULL;
+
+ e = kmalloc(sizeof(*e), GFP_ATOMIC);
+ if (e == NULL) {
+ printk(KERN_ERR "%s: out of memory in %s\n",
+ IPQ_NAME, __FUNCTION__);
+ return -ENOMEM;
+ }
+ /* Default verdict is DROP until the peer says otherwise. */
+ e->state = IPQ_PS_NEW;
+ e->verdict = NF_DROP;
+ e->info = info;
+ e->skb = skb;
+ spin_lock_bh(&q->lock);
+ if (q->len >= *q->maxlen) {
+ spin_unlock_bh(&q->lock);
+ printk(KERN_WARNING "%s: queue full at %d entries, "
+ "dropping packet.\n", IPQ_NAME, q->len);
+ kfree(e);
+ nf_reinject(skb, info, NF_DROP);
+ return 0;
+ }
+ list_add(&e->list, &q->list);
+ q->len++;
+ spin_unlock_bh(&q->lock);
+ wake_up_interruptible(&q->thread.wq);
+ return 0;
+}
+
+/* FIXME: need to find a way to notify user during module unload */
+/* Stop the worker thread (which flushes remaining packets), drop the
+   netfilter queue-handler registration, and free the queue. */
+static void
+ipq_queue_destroy(ipq_queue_t *q)
+{
+ ipq_thread_stop(q);
+ nf_unregister_queue_handler(PF_INET);
+ kfree(q);
+}
+
+/* Replace the queued packet's payload with the (possibly modified)
+   copy the peer sent back in BUF (v->data_len bytes).  The IP
+   checksum field is used as a cheap "did the peer change anything"
+   test; if unchanged, the packet is left alone.  Grows or shrinks
+   the skb as needed.  Returns 0, or a negative error after forcing
+   the element's verdict to NF_DROP. */
+static int
+ipq_queue_mangle_ipv4(unsigned char *buf,
+ ipq_verdict_msg_t *v,
+ ipq_queue_element_t *e)
+{
+ struct iphdr *user_iph = (struct iphdr *)buf;
+
+ /* Too short to even hold an IP header: silently ignore. */
+ if (v->data_len < sizeof(*user_iph))
+ return 0;
+
+ if (e->skb->nh.iph->check != user_iph->check) {
+ int diff = v->data_len - e->skb->len;
+
+ if (diff < 0)
+ skb_trim(e->skb, v->data_len);
+ else if (diff > 0) {
+ /* Can't exceed the 16-bit IP total-length space. */
+ if (v->data_len > 0xFFFF) {
+ e->verdict = NF_DROP;
+ return -EINVAL;
+ }
+ if (diff > skb_tailroom(e->skb)) {
+ struct sk_buff *newskb;
+
+ /* Ack, we waste a memcpy() of data here */
+ newskb = skb_copy_expand(e->skb,
+ skb_headroom(e->skb),
+ diff,
+ GFP_ATOMIC);
+ if (newskb == NULL) {
+ printk(KERN_WARNING "%s: OOM in %s, "
+ "dropping packet\n",
+ IPQ_THR_NAME, __FUNCTION__);
+ e->verdict = NF_DROP;
+ return -ENOMEM;
+ }
+ kfree_skb(e->skb);
+ e->skb = newskb;
+ }
+ skb_put(e->skb, diff);
+ }
+ memcpy(e->skb->data, buf, v->data_len);
+ e->skb->nfcache |= NFC_ALTERED;
+ }
+ return 0;
+}
+
+/* Apply a verdict message from the peer: find the queued element whose
+   address matches the opaque packet id, mark it VERDICT, and (if the
+   peer supplied a full payload) mangle the packet.  Returns -ENOENT
+   if the id matches no queued element, -EINVAL for an out-of-range
+   verdict value. */
+static int
+ipq_queue_set_verdict(ipq_queue_t *q,
+ ipq_verdict_msg_t *v,
+ unsigned char *buf,
+ unsigned int len)
+{
+ struct list_head *i;
+
+ if (v->value < 0 || v->value > NF_MAX_VERDICT)
+ return -EINVAL;
+ spin_lock_bh(&q->lock);
+ for (i = q->list.next; i != &q->list; i = i->next) {
+ ipq_queue_element_t *e = (ipq_queue_element_t *)i;
+
+ /* Packet ids are the kernel addresses of the elements. */
+ if (v->id == (unsigned long )e) {
+ int status = 0;
+ e->state = IPQ_PS_VERDICT;
+ e->verdict = v->value;
+
+ if (buf && v->data_len == len)
+ status = ipq_queue_mangle_ipv4(buf, v, e);
+ spin_unlock_bh(&q->lock);
+ return status;
+ }
+ }
+ spin_unlock_bh(&q->lock);
+ return -ENOENT;
+}
+
+/* Dispatch a validated message from the userspace peer: either a mode
+   change (IPQM_MODE) or a verdict (IPQM_VERDICT).  Wakes the worker
+   thread afterwards so the new state takes effect. */
+static int
+ipq_receive_peer(ipq_queue_t *q,
+ ipq_peer_msg_t *m,
+ unsigned char type,
+ unsigned int len)
+{
+ if (q->state == IPQ_QS_FLUSH)
+ return -EBUSY;
+
+ if (len < sizeof(ipq_peer_msg_t))
+ return -EINVAL;
+
+ switch (type) {
+ case IPQM_MODE:
+ switch (m->msg.mode.value) {
+ case IPQ_COPY_NONE:
+ /* Peer detaching: flush everything queued. */
+ q->peer.copy_mode = IPQ_COPY_NONE;
+ q->peer.copy_range = 0;
+ q->state = IPQ_QS_FLUSH;
+ break;
+ case IPQ_COPY_META:
+ /* NOTE(review): unreachable -- FLUSH state already
+ returned -EBUSY at the top of this function. */
+ if (q->state == IPQ_QS_FLUSH)
+ return -EAGAIN;
+ q->peer.copy_mode = IPQ_COPY_META;
+ q->peer.copy_range = 0;
+ q->state = IPQ_QS_COPY;
+ break;
+ case IPQ_COPY_PACKET:
+ /* NOTE(review): unreachable, as above. */
+ if (q->state == IPQ_QS_FLUSH)
+ return -EAGAIN;
+ q->peer.copy_mode = IPQ_COPY_PACKET;
+ q->peer.copy_range = m->msg.mode.range;
+ q->state = IPQ_QS_COPY;
+ break;
+ default:
+ return -EINVAL;
+ }
+ break;
+ case IPQM_VERDICT: {
+ int status;
+ unsigned char *data = NULL;
+
+ if (m->msg.verdict.value > NF_MAX_VERDICT)
+ return -EINVAL;
+ /* Optional mangled payload follows the message header. */
+ if (m->msg.verdict.data_len)
+ data = (unsigned char *)m + sizeof(*m);
+ status = ipq_queue_set_verdict(q, &m->msg.verdict,
+ data, len - sizeof(*m));
+ if (status < 0)
+ return status;
+ break;
+ }
+ default:
+ return -EINVAL;
+ }
+ wake_up_interruptible(&q->thread.wq);
+ return 0;
+}
+
+
+/****************************************************************************
+ *
+ * Netfilter interface
+ *
+ ****************************************************************************/
+
+/*
+ * Packets arrive here from netfilter for queuing to userspace.
+ * All of them must be fed back via nf_reinject() or Alexey will kill Rusty.
+ */
+/* netfilter queue-handler callback: packets arrive here from
+   netfilter for queueing to userspace.  Every packet accepted must
+   eventually be fed back via nf_reinject() or Alexey will kill
+   Rusty. */
+static int
+receive_netfilter(struct sk_buff *skb,
+ struct nf_info *info,
+ void *data)
+{
+ ipq_queue_t *queue = (ipq_queue_t *)data;
+
+ /* Refuse new work while the queue is flushing. */
+ return (queue->state == IPQ_QS_FLUSH) ? -EBUSY
+  : ipq_enqueue(queue, skb, info);
+}
+
+/****************************************************************************
+ *
+ * Netlink interface.
+ *
+ ****************************************************************************/
+
+/* Forward declarations for the netlink transport functions below. */
+static struct sk_buff *
+netlink_build_message(ipq_queue_element_t *e,
+ int *errp);
+
+extern __inline__ void
+receive_user_skb(struct sk_buff *skb);
+
+static int
+netlink_send_peer(ipq_queue_element_t *e);
+
+/* Kernel-side NETLINK_FIREWALL socket and the single global queue.
+   NOTE(review): nlq is not static, polluting the kernel namespace --
+   nothing in this file needs it exported. */
+static struct sock *nfnl = NULL;
+ipq_queue_t *nlq = NULL;
+
+/* Send queued element E to the registered peer as a unicast netlink
+   message.  Returns -EINVAL when no peer is attached, the builder's
+   error when message construction fails, else the netlink_unicast()
+   result. */
+static int
+netlink_send_peer(ipq_queue_element_t *e)
+{
+ int status = 0;
+ struct sk_buff *skb;
+
+ if (!nlq->peer.pid)
+ return -EINVAL;
+ skb = netlink_build_message(e, &status);
+ if (skb == NULL)
+ return status;
+ return netlink_unicast(nfnl, skb, nlq->peer.pid, 0);
+}
+
+/* Build an IPQM_PACKET netlink message describing element E, copying
+   packet payload according to the peer's copy mode/range.  Returns
+   the skb, or NULL with *errp set (-EINVAL for a bad copy mode, 0 for
+   allocation/overflow failure -- the caller treats that as a silent
+   retry). */
+static struct sk_buff *
+netlink_build_message(ipq_queue_element_t *e,
+ int *errp)
+{
+ unsigned char *old_tail;
+ size_t size = 0;
+ size_t data_len = 0;
+ struct sk_buff *skb;
+ ipq_packet_msg_t *pm;
+ struct nlmsghdr *nlh;
+
+ switch (nlq->peer.copy_mode) {
+ size_t copy_range;
+
+ case IPQ_COPY_META:
+ size = NLMSG_SPACE(sizeof(*pm));
+ data_len = 0;
+ break;
+ case IPQ_COPY_PACKET:
+ /* copy_range == 0 means "whole packet"; clamp to skb len. */
+ copy_range = nlq->peer.copy_range;
+ if (copy_range == 0 || copy_range > e->skb->len)
+ data_len = e->skb->len;
+ else
+ data_len = copy_range;
+ size = NLMSG_SPACE(sizeof(*pm) + data_len);
+ break;
+ case IPQ_COPY_NONE:
+ default:
+ *errp = -EINVAL;
+ return NULL;
+ }
+ skb = alloc_skb(size, GFP_ATOMIC);
+ if (!skb)
+ goto nlmsg_failure;
+ old_tail = skb->tail;
+ /* NLMSG_PUT jumps to nlmsg_failure if the header won't fit. */
+ nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh));
+ pm = NLMSG_DATA(nlh);
+ memset(pm, 0, sizeof(*pm));
+ /* The element's kernel address doubles as the packet id the
+ peer must echo back in its verdict. */
+ pm->packet_id = (unsigned long )e;
+ pm->data_len = data_len;
+ pm->timestamp_sec = e->skb->stamp.tv_sec;
+ pm->timestamp_usec = e->skb->stamp.tv_usec;
+ pm->hook = e->info->hook;
+ if (e->info->indev) strcpy(pm->indev_name, e->info->indev->name);
+ else pm->indev_name[0] = '\0';
+ if (e->info->outdev) strcpy(pm->outdev_name, e->info->outdev->name);
+ else pm->outdev_name[0] = '\0';
+ if (data_len)
+ memcpy(++pm, e->skb->data, data_len);
+ nlh->nlmsg_len = skb->tail - old_tail;
+ NETLINK_CB(skb).dst_groups = 0;
+ return skb;
+nlmsg_failure:
+ if (skb)
+ /* Fixed: an skb from alloc_skb() must be released with
+ kfree_skb(), not kfree() -- kfree() leaks the data area
+ and corrupts skb accounting. */
+ kfree_skb(skb);
+ *errp = 0;
+ printk(KERN_ERR "%s: error creating netlink message\n", IPQ_NAME);
+ return NULL;
+}
+
+/* Ack the offending message with ERR and return from the caller.
+   Fixed: dropped the trailing semicolon after while (0) -- with it,
+   each invocation's own `;' made the expansion two statements,
+   defeating the do/while(0) single-statement idiom. */
+#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
+/*
+ * FIXME: ping old peer if we detect a new peer then resend.
+ */
+/* Validate and dispatch one netlink message from userspace: sanity-
+   check the header, require CAP_NET_ADMIN, adopt the sender as the
+   current peer (warning on a pid change), then hand the payload to
+   ipq_receive_peer().  Errors are reported back via netlink_ack(). */
+extern __inline__ void
+receive_user_skb(struct sk_buff *skb)
+{
+ int status, type;
+ struct nlmsghdr *nlh;
+
+ nlh = (struct nlmsghdr *)skb->data;
+ if (nlh->nlmsg_len < sizeof(*nlh)
+ || skb->len < nlh->nlmsg_len
+ || nlh->nlmsg_pid <= 0
+ || !(nlh->nlmsg_flags & NLM_F_REQUEST)
+ || nlh->nlmsg_flags & NLM_F_MULTI)
+ RCV_SKB_FAIL(-EINVAL);
+ if (nlh->nlmsg_flags & MSG_TRUNC)
+ RCV_SKB_FAIL(-ECOMM);
+ type = nlh->nlmsg_type;
+ if (type < NLMSG_NOOP || type >= IPQM_MAX)
+ RCV_SKB_FAIL(-EINVAL);
+ /* Control messages (<= IPQM_BASE) are silently ignored. */
+ if (type <= IPQM_BASE)
+ return;
+ if(!cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN))
+ RCV_SKB_FAIL(-EPERM);
+ if (nlq->peer.pid && !nlq->peer.died
+ && (nlq->peer.pid != nlh->nlmsg_pid))
+ printk(KERN_WARNING "%s: peer pid changed from %d to %d\n",
+ IPQ_NAME, nlq->peer.pid, nlh->nlmsg_pid);
+ nlq->peer.pid = nlh->nlmsg_pid;
+ nlq->peer.died = 0;
+ status = ipq_receive_peer(nlq, NLMSG_DATA(nlh),
+ type, skb->len - NLMSG_LENGTH(0));
+ if (status < 0)
+ RCV_SKB_FAIL(status);
+ if (nlh->nlmsg_flags & NLM_F_ACK)
+ netlink_ack(skb, nlh, 0);
+ return;
+}
+
+/* Note: we are only dealing with single part messages at the moment. */
+/* Netlink socket data-ready callback: drain the receive queue under
+   the rtnl semaphore, processing each skb.  If the lock is contended
+   we simply return -- we'll be called again while data remains. */
+static void
+receive_user_sk(struct sock *sk,
+ int len)
+{
+ do {
+ struct sk_buff *skb;
+
+ if (rtnl_shlock_nowait())
+ return;
+ while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) {
+ receive_user_skb(skb);
+ kfree_skb(skb);
+ }
+ up(&rtnl_sem);
+ } while (nfnl && nfnl->receive_queue.qlen);
+}
+
+
+/****************************************************************************
+ *
+ * System events
+ *
+ ****************************************************************************/
+
+/* Netdevice notifier: when any device unregisters, stop the queue
+   thread (which flushes all pending packets -- they may reference the
+   vanishing device). */
+static int
+receive_event(struct notifier_block *this,
+ unsigned long event,
+ void *ptr)
+{
+ if (event == NETDEV_UNREGISTER)
+ if (nlq)
+ ipq_thread_stop(nlq);
+ return NOTIFY_DONE;
+}
+
+/* Positional initializer: { notifier fn, next, priority }. */
+struct notifier_block ipq_dev_notifier = {
+ receive_event,
+ NULL,
+ 0
+};
+
+
+/****************************************************************************
+ *
+ * Sysctl - queue tuning.
+ *
+ ****************************************************************************/
+
+/* Maximum number of queued packets, tunable at runtime via
+   /proc/sys/net/ipv4/ip_queue_maxlen (the queue holds a pointer to
+   this variable). */
+static int sysctl_maxlen = IPQ_QMAX_DEFAULT;
+
+static struct ctl_table_header *ipq_sysctl_header;
+
+/* Leaf entry: net.ipv4.ip_queue_maxlen, mode 0644. */
+static ctl_table ipq_table[] = {
+ { NET_IPQ_QMAX, NET_IPQ_QMAX_NAME, &sysctl_maxlen,
+ sizeof(sysctl_maxlen), 0644, NULL, proc_dointvec },
+ { 0 }
+};
+
+/* Directory nodes wiring the leaf under net/ipv4/. */
+static ctl_table ipq_dir_table[] = {
+ {NET_IPV4, "ipv4", NULL, 0, 0555, ipq_table, 0, 0, 0, 0, 0},
+ { 0 }
+};
+
+static ctl_table ipq_root_table[] = {
+ {CTL_NET, "net", NULL, 0, 0555, ipq_dir_table, 0, 0, 0, 0, 0},
+ { 0 }
+};
+
+/****************************************************************************
+ *
+ * Procfs - debugging info.
+ *
+ ****************************************************************************/
+
+/* /proc/net/ip_queue read handler: dump thread, peer and queue state
+   as plain text, following the usual get_info offset/length
+   protocol. */
+static int
+ipq_get_info(char *buffer, char **start, off_t offset, int length)
+{
+ int len;
+
+ spin_lock_bh(&nlq->lock);
+ len = sprintf(buffer,
+ "Thread pid : %d\n"
+ "Thread terminate : %d\n"
+ "Thread running : %d\n"
+ "Peer pid : %d\n"
+ "Peer died : %d\n"
+ "Peer copy mode : %d\n"
+ "Peer copy range : %d\n"
+ "Queue length : %d\n"
+ "Queue max. length : %d\n"
+ "Queue state : %d\n",
+ nlq->thread.pid,
+ nlq->thread.terminate,
+ nlq->thread.running,
+ nlq->peer.pid,
+ nlq->peer.died,
+ nlq->peer.copy_mode,
+ nlq->peer.copy_range,
+ nlq->len,
+ *nlq->maxlen,
+ nlq->state);
+ spin_unlock_bh(&nlq->lock);
+ /* Standard proc get_info windowing on the formatted buffer. */
+ *start = buffer + offset;
+ len -= offset;
+ if (len > length)
+ len = length;
+ else if (len < 0)
+ len = 0;
+ return len;
+}
+
+/****************************************************************************
+ *
+ * Module stuff.
+ *
+ ****************************************************************************/
+
+/* Module init: create the kernel netlink socket, build the packet
+   queue on top of it, then register the netdev notifier, the proc
+   entry and the sysctl.  On queue-creation failure the netlink
+   socket is released again. */
+static int __init init(void)
+{
+ int status = 0;
+
+ nfnl = netlink_kernel_create(NETLINK_FIREWALL, receive_user_sk);
+ if (nfnl == NULL) {
+ printk(KERN_ERR "%s: initialisation failed: unable to "
+ "create kernel netlink socket\n", IPQ_NAME);
+ return -ENOMEM;
+ }
+ nlq = ipq_queue_create(receive_netfilter,
+ netlink_send_peer, &status, &sysctl_maxlen);
+ if (nlq == NULL) {
+ printk(KERN_ERR "%s: initialisation failed: unable to "
+ "initialise queue\n", IPQ_NAME);
+ sock_release(nfnl->socket);
+ return status;
+ }
+ register_netdevice_notifier(&ipq_dev_notifier);
+ proc_net_create(IPQ_PROC_FS_NAME, 0, ipq_get_info);
+ ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);
+ return status;
+}
+
+/* Module exit: unwind init() in reverse order. */
+static void __exit fini(void)
+{
+ unregister_sysctl_table(ipq_sysctl_header);
+ proc_net_remove(IPQ_PROC_FS_NAME);
+ unregister_netdevice_notifier(&ipq_dev_notifier);
+ ipq_queue_destroy(nlq);
+ sock_release(nfnl->socket);
+}
+
+MODULE_DESCRIPTION("IPv4 packet queue handler");
+module_init(init);
+module_exit(fini);
+
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
new file mode 100644
index 000000000..8cc8c24ac
--- /dev/null
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -0,0 +1,1664 @@
+/*
+ * Packet matching code.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ */
+#include <linux/config.h>
+#include <linux/skbuff.h>
+#include <linux/kmod.h>
+#include <linux/vmalloc.h>
+#include <linux/netdevice.h>
+#include <linux/module.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+#ifndef IP_OFFSET
+#define IP_OFFSET 0x1FFF
+#endif
+
+/*#define DEBUG_IP_FIREWALL*/
+/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */
+/*#define DEBUG_IP_FIREWALL_USER*/
+
+#ifdef DEBUG_IP_FIREWALL
+#define dprintf(format, args...) printk(format , ## args)
+#else
+#define dprintf(format, args...)
+#endif
+
+#ifdef DEBUG_IP_FIREWALL_USER
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+#ifdef CONFIG_NETFILTER_DEBUG
+#define IP_NF_ASSERT(x) \
+do { \
+ if (!(x)) \
+ printk("IPT_ASSERT: %s:%s:%u\n", \
+ __FUNCTION__, __FILE__, __LINE__); \
+} while(0)
+#else
+#define IP_NF_ASSERT(x)
+#endif
+#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
+
+/* Mutex protects lists (only traversed in user context). */
+static DECLARE_MUTEX(ipt_mutex);
+
+/* Must have mutex */
+#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
+#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
+#include <linux/netfilter_ipv4/lockhelp.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+/* All the better to debug you with... */
+#define static
+#define inline
+#endif
+
+/* Locking is simple: we assume at worst case there will be one packet
+ in user context and one from bottom halves (or soft irq if Alexey's
+ softnet patch was applied).
+
+ We keep a set of rules for each CPU, so we can avoid write-locking
+ them; doing a readlock_bh() stops packets coming through if we're
+ in user context.
+
+ To be cache friendly on SMP, we arrange them like so:
+ [ n-entries ]
+ ... cache-align padding ...
+ [ n-entries ]
+
+ Hence the start of any table is given by get_table() below. */
+
+/* The table itself */
+struct ipt_table_info
+{
+	/* Size per table */
+	unsigned int size;
+	/* Number of entries: FIXME. --RR */
+	unsigned int number;
+
+	/* Entry points and underflows */
+	unsigned int hook_entry[NF_IP_NUMHOOKS];
+	unsigned int underflow[NF_IP_NUMHOOKS];
+
+	/* Pad the header so each CPU's rule copy starts cache-aligned. */
+	char padding[SMP_ALIGN((NF_IP_NUMHOOKS*2+2)*sizeof(unsigned int))];
+
+	/* ipt_entry tables: one per CPU */
+	char entries[0];
+};
+
+/* Registries of targets, matches and tables; traversal and modification
+ * are protected by ipt_mutex (user context only). */
+static LIST_HEAD(ipt_target);
+static LIST_HEAD(ipt_match);
+static LIST_HEAD(ipt_tables);
+#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
+
+/* Byte offset of CPU p's private copy of the rules within a table. */
+#ifdef CONFIG_SMP
+#define TABLE_OFFSET(t,p)	(SMP_ALIGN((t)->size)*cpu_number_map(p))
+#else
+#define TABLE_OFFSET(t,p)	0
+#endif
+
+/* Semaphore tracing wrappers for debugging; normally compiled out. */
+#if 0
+#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
+#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; })
+#define up(x) do { printk("UP:%u:" #x "\n", __LINE__); up(x); } while(0)
+#endif
+
+/* Returns whether matches rule or not. */
+/* Checks the packet's IP header and in/out device names against the
+ * rule's ipt_ip part.  Each test can be inverted via ipinfo->invflags
+ * (see the FWINV macro).  Returns 1 on match, 0 otherwise. */
+static inline int
+ip_packet_match(const struct iphdr *ip,
+		const char *indev,
+		const char *outdev,
+		const struct ipt_ip *ipinfo,
+		int isfrag)
+{
+	size_t i;
+	unsigned long ret;
+
+/* XOR the test result with the corresponding inversion flag. */
+#define FWINV(bool,invflg) ((bool) ^ !!(ipinfo->invflags & invflg))
+
+	if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
+		  IPT_INV_SRCIP)
+	    || FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
+		     IPT_INV_DSTIP)) {
+		dprintf("Source or dest mismatch.\n");
+
+		dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n",
+			NIPQUAD(ip->saddr),
+			NIPQUAD(ipinfo->smsk.s_addr),
+			NIPQUAD(ipinfo->src.s_addr),
+			ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : "");
+		dprintf("DST: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n",
+			NIPQUAD(ip->daddr),
+			NIPQUAD(ipinfo->dmsk.s_addr),
+			NIPQUAD(ipinfo->dst.s_addr),
+			ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : "");
+		return 0;
+	}
+
+	/* Look for ifname matches; this should unroll nicely.
+	 * Compares the device name word-by-word under the iface mask. */
+	for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
+		ret |= (((const unsigned long *)indev)[i]
+			^ ((const unsigned long *)ipinfo->iniface)[i])
+			& ((const unsigned long *)ipinfo->iniface_mask)[i];
+	}
+
+	if (FWINV(ret != 0, IPT_INV_VIA_IN)) {
+		dprintf("VIA in mismatch (%s vs %s).%s\n",
+			indev, ipinfo->iniface,
+			ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":"");
+		return 0;
+	}
+
+	for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
+		ret |= (((const unsigned long *)outdev)[i]
+			^ ((const unsigned long *)ipinfo->outiface)[i])
+			& ((const unsigned long *)ipinfo->outiface_mask)[i];
+	}
+
+	if (FWINV(ret != 0, IPT_INV_VIA_OUT)) {
+		dprintf("VIA out mismatch (%s vs %s).%s\n",
+			outdev, ipinfo->outiface,
+			ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":"");
+		return 0;
+	}
+
+	/* Check specific protocol */
+	if (ipinfo->proto
+	    && FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
+		dprintf("Packet protocol %hi does not match %hi.%s\n",
+			ip->protocol, ipinfo->proto,
+			ipinfo->invflags&IPT_INV_PROTO ? " (INV)":"");
+		return 0;
+	}
+
+	/* If we have a fragment rule but the packet is not a fragment
+	 * then we return zero */
+	if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) {
+		dprintf("Fragment rule but not fragment.%s\n",
+			ipinfo->invflags & IPT_INV_FRAG ? " (INV)" : "");
+		return 0;
+	}
+
+	return 1;
+}
+
+/* Sanity-check the user-supplied ipt_ip part of a rule: reject any
+ * flag or inversion-flag bits outside the known masks.  Returns 1 if
+ * acceptable, 0 otherwise. */
+static inline int
+ip_checkentry(const struct ipt_ip *ip)
+{
+	if (ip->flags & ~IPT_F_MASK) {
+		duprintf("Unknown flag bits set: %08X\n",
+			 ip->flags & ~IPT_F_MASK);
+		return 0;
+	}
+	if (ip->invflags & ~IPT_INV_MASK) {
+		duprintf("Unknown invflag bits set: %08X\n",
+			 ip->invflags & ~IPT_INV_MASK);
+		return 0;
+	}
+	return 1;
+}
+
+/* Target of the built-in ERROR rule: should never be hit at runtime;
+ * logs (rate-limited) and drops the packet. */
+static unsigned int
+ipt_error(struct sk_buff **pskb,
+	  unsigned int hooknum,
+	  const struct net_device *in,
+	  const struct net_device *out,
+	  const void *targinfo,
+	  void *userinfo)
+{
+	if (net_ratelimit())
+		printk("ip_tables: error: `%s'\n", (char *)targinfo);
+
+	return NF_DROP;
+}
+
+/* IPT_MATCH_ITERATE callback: run one match against the packet.
+ * Returns non-zero (stop iteration) when the match FAILS, so that a
+ * zero result from the iterator means "all matches passed". */
+static inline
+int do_match(struct ipt_entry_match *m,
+	     const struct sk_buff *skb,
+	     const struct net_device *in,
+	     const struct net_device *out,
+	     int offset,
+	     const void *hdr,
+	     u_int16_t datalen,
+	     int *hotdrop)
+{
+	/* Stop iteration if it doesn't match */
+	if (!m->u.match->match(skb, in, out, m->data,
+			       offset, hdr, datalen, hotdrop))
+		return 1;
+	else
+		return 0;
+}
+
+/* Convert a byte offset within a rule blob into an entry pointer. */
+static inline struct ipt_entry *
+get_entry(void *base, unsigned int offset)
+{
+	return (struct ipt_entry *)(base + offset);
+}
+
+/* Returns one of the generic firewall policies, like NF_ACCEPT. */
+/* Core rule traversal.  Walks this CPU's private copy of TABLE's rules
+ * starting at the entry point for HOOK, evaluating matches and targets
+ * until a verdict is reached.  Jumps/returns between chains are handled
+ * with a back pointer saved in each entry's comefrom field.  Runs under
+ * the table's read lock with bottom halves disabled. */
+unsigned int
+ipt_do_table(struct sk_buff **pskb,
+	     unsigned int hook,
+	     const struct net_device *in,
+	     const struct net_device *out,
+	     struct ipt_table *table,
+	     void *userdata)
+{
+	static const char nulldevname[IFNAMSIZ] = { 0 };
+	u_int16_t offset;
+	struct iphdr *ip;
+	void *protohdr;
+	u_int16_t datalen;
+	int hotdrop = 0;
+	/* Initializing verdict to NF_DROP keeps gcc happy. */
+	unsigned int verdict = NF_DROP;
+	const char *indev, *outdev;
+	void *table_base;
+	struct ipt_entry *e, *back;
+
+	/* Initialization */
+	ip = (*pskb)->nh.iph;
+	protohdr = (u_int32_t *)ip + ip->ihl;
+	datalen = (*pskb)->len - ip->ihl * 4;
+	indev = in ? in->name : nulldevname;
+	outdev = out ? out->name : nulldevname;
+	/* We handle fragments by dealing with the first fragment as
+	 * if it was a normal packet.  All other fragments are treated
+	 * normally, except that they will NEVER match rules that ask
+	 * things we don't know, ie. tcp syn flag or ports).  If the
+	 * rule is also a fragment-specific rule, non-fragments won't
+	 * match it. */
+	offset = ntohs(ip->frag_off) & IP_OFFSET;
+
+	read_lock_bh(&table->lock);
+	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
+	table_base = (void *)table->private->entries
+		+ TABLE_OFFSET(table->private, smp_processor_id());
+	e = get_entry(table_base, table->private->hook_entry[hook]);
+
+	/* Check noone else using our table */
+	IP_NF_ASSERT(((struct ipt_entry *)table_base)->comefrom == 0xdead57ac);
+#ifdef CONFIG_NETFILTER_DEBUG
+	((struct ipt_entry *)table_base)->comefrom = 0x57acc001;
+#endif
+
+	/* For return from builtin chain */
+	back = get_entry(table_base, table->private->underflow[hook]);
+
+	do {
+		IP_NF_ASSERT(e);
+		IP_NF_ASSERT(back);
+		(*pskb)->nfcache |= e->nfcache;
+		if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) {
+			struct ipt_entry_target *t;
+
+			if (IPT_MATCH_ITERATE(e, do_match,
+					      *pskb, in, out,
+					      offset, protohdr,
+					      datalen, &hotdrop) != 0)
+				goto no_match;
+
+			ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
+
+			t = ipt_get_target(e);
+			IP_NF_ASSERT(t->u.target);
+			/* Standard target? */
+			if (!t->u.target->target) {
+				int v;
+
+				v = ((struct ipt_standard_target *)t)->verdict;
+				if (v < 0) {
+					/* Pop from stack? */
+					if (v != IPT_RETURN) {
+						/* Absolute verdicts are
+						 * encoded as -verdict-1. */
+						verdict = (unsigned)(-v) - 1;
+						break;
+					}
+					e = back;
+					back = get_entry(table_base,
+							 back->comefrom);
+					continue;
+				}
+				if (table_base + v
+				    != (void *)e + e->next_offset) {
+					/* Save old back ptr in next entry */
+					struct ipt_entry *next
+						= (void *)e + e->next_offset;
+					next->comefrom
+						= (void *)back - table_base;
+					/* set back pointer to next entry */
+					back = next;
+				}
+
+				e = get_entry(table_base, v);
+			} else {
+				verdict = t->u.target->target(pskb, hook,
+							      in, out,
+							      t->data,
+							      userdata);
+
+				/* Target might have changed stuff. */
+				ip = (*pskb)->nh.iph;
+				protohdr = (u_int32_t *)ip + ip->ihl;
+				datalen = (*pskb)->len - ip->ihl * 4;
+
+				if (verdict == IPT_CONTINUE)
+					e = (void *)e + e->next_offset;
+				else
+					/* Verdict */
+					break;
+			}
+		} else {
+
+		no_match:
+			e = (void *)e + e->next_offset;
+		}
+	} while (!hotdrop);
+
+#ifdef CONFIG_NETFILTER_DEBUG
+	((struct ipt_entry *)table_base)->comefrom = 0xdead57ac;
+#endif
+	read_unlock_bh(&table->lock);
+
+#ifdef DEBUG_ALLOW_ALL
+	return NF_ACCEPT;
+#else
+	if (hotdrop)
+		return NF_DROP;
+	else return verdict;
+#endif
+}
+
+/* If it succeeds, returns element and locks mutex */
+/* NB: on failure (interrupted or not found) the mutex is NOT held on
+ * return -- callers must only up() the mutex after a success. */
+static inline void *
+find_inlist_lock_noload(struct list_head *head,
+			const char *name,
+			int *error,
+			struct semaphore *mutex)
+{
+	void *ret;
+
+#if 0
+	duprintf("find_inlist: searching for `%s' in %s.\n",
+		 name, head == &ipt_target ? "ipt_target"
+		 : head == &ipt_match ? "ipt_match"
+		 : head == &ipt_tables ? "ipt_tables" : "UNKNOWN");
+#endif
+
+	*error = down_interruptible(mutex);
+	if (*error != 0)
+		return NULL;
+
+	ret = list_named_find(head, name);
+	if (!ret) {
+		*error = -ENOENT;
+		up(mutex);
+	}
+	return ret;
+}
+
+/* As find_inlist_lock_noload, but with CONFIG_KMOD it attempts to
+ * auto-load the module "<prefix><name>" on a miss and retries once. */
+#ifndef CONFIG_KMOD
+#define find_inlist_lock(h,n,p,e,m) find_inlist_lock_noload((h),(n),(e),(m))
+#else
+static void *
+find_inlist_lock(struct list_head *head,
+		 const char *name,
+		 const char *prefix,
+		 int *error,
+		 struct semaphore *mutex)
+{
+	void *ret;
+
+	ret = find_inlist_lock_noload(head, name, error, mutex);
+	if (!ret) {
+		char modulename[IPT_FUNCTION_MAXNAMELEN + strlen(prefix) + 1];
+		strcpy(modulename, prefix);
+		strcat(modulename, name);
+		duprintf("find_inlist: loading `%s'.\n", modulename);
+		request_module(modulename);
+		ret = find_inlist_lock_noload(head, name, error, mutex);
+	}
+
+	return ret;
+}
+#endif
+
+/* Typed wrappers around find_inlist_lock for the three registries.
+ * On success the returned object is valid and ipt_mutex is held. */
+static inline struct ipt_table *
+find_table_lock(const char *name, int *error, struct semaphore *mutex)
+{
+	return find_inlist_lock(&ipt_tables, name, "iptable_", error, mutex);
+}
+
+static inline struct ipt_match *
+find_match_lock(const char *name, int *error, struct semaphore *mutex)
+{
+	return find_inlist_lock(&ipt_match, name, "ipt_", error, mutex);
+}
+
+static inline struct ipt_target *
+find_target_lock(const char *name, int *error, struct semaphore *mutex)
+{
+	return find_inlist_lock(&ipt_target, name, "ipt_", error, mutex);
+}
+
+/* All zeroes == unconditional rule. */
+/* Scans the ipt_ip word-by-word; returns 1 iff every word is zero. */
+static inline int
+unconditional(const struct ipt_ip *ip)
+{
+	unsigned int i;
+
+	for (i = 0; i < sizeof(*ip)/sizeof(__u32); i++)
+		if (((__u32 *)ip)[i])
+			return 0;
+
+	return 1;
+}
+
+/* Figures out from what hook each rule can be called: returns 0 if
+   there are loops.  Puts hook bitmask in comefrom. */
+/* Iterative chain walk (no recursion): counters.pcnt temporarily stores
+ * the back pointer while descending, and bit NF_IP_NUMHOOKS of comefrom
+ * marks "currently on the walk stack" for loop detection. */
+static int
+mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
+{
+	unsigned int hook;
+
+	/* No recursion; use packet counter to save back ptrs (reset
+	   to 0 as we leave), and comefrom to save source hook bitmask */
+	for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
+		unsigned int pos = newinfo->hook_entry[hook];
+		struct ipt_entry *e
+			= (struct ipt_entry *)(newinfo->entries + pos);
+
+		if (!(valid_hooks & (1 << hook)))
+			continue;
+
+		/* Set initial back pointer. */
+		e->counters.pcnt = pos;
+
+		for (;;) {
+			struct ipt_standard_target *t
+				= (void *)ipt_get_target(e);
+
+			/* Reaching a rule already on the walk stack
+			 * means the ruleset contains a loop. */
+			if (e->comefrom & (1 << NF_IP_NUMHOOKS)) {
+				printk("iptables: loop hook %u pos %u %08X.\n",
+				       hook, pos, e->comefrom);
+				return 0;
+			}
+			e->comefrom
+				|= ((1 << hook) | (1 << NF_IP_NUMHOOKS));
+
+			/* Unconditional return/END. */
+			if (e->target_offset == sizeof(struct ipt_entry)
+			    && (strcmp(t->target.u.name, IPT_STANDARD_TARGET)
+				== 0)
+			    && t->verdict < 0
+			    && unconditional(&e->ip)) {
+				unsigned int oldpos, size;
+
+				/* Return: backtrack through the last
+				   big jump. */
+				do {
+					e->comefrom ^= (1<<NF_IP_NUMHOOKS);
+#ifdef DEBUG_IP_FIREWALL_USER
+					if (e->comefrom
+					    & (1 << NF_IP_NUMHOOKS)) {
+						duprintf("Back unset "
+							 "on hook %u "
+							 "rule %u\n",
+							 hook, pos);
+					}
+#endif
+					oldpos = pos;
+					pos = e->counters.pcnt;
+					e->counters.pcnt = 0;
+
+					/* We're at the start. */
+					if (pos == oldpos)
+						goto next;
+
+					e = (struct ipt_entry *)
+						(newinfo->entries + pos);
+				} while (oldpos == pos + e->next_offset);
+
+				/* Move along one */
+				size = e->next_offset;
+				e = (struct ipt_entry *)
+					(newinfo->entries + pos + size);
+				e->counters.pcnt = pos;
+				pos += size;
+			} else {
+				int newpos = t->verdict;
+
+				if (strcmp(t->target.u.name,
+					   IPT_STANDARD_TARGET) == 0
+				    && newpos >= 0) {
+					/* This a jump; chase it. */
+					duprintf("Jump rule %u -> %u\n",
+						 pos, newpos);
+				} else {
+					/* ... this is a fallthru */
+					newpos = pos + e->next_offset;
+				}
+				e = (struct ipt_entry *)
+					(newinfo->entries + newpos);
+				e->counters.pcnt = pos;
+				pos = newpos;
+			}
+		}
+		next:
+		duprintf("Finished chain %u\n", hook);
+	}
+	return 1;
+}
+
+/* IPT_MATCH_ITERATE callback: drop the module refcount taken for one
+ * match.  If i is non-NULL, stops after *i matches (partial cleanup
+ * when rule checking failed mid-way). */
+static inline int
+cleanup_match(struct ipt_entry_match *m, unsigned int *i)
+{
+	if (i && (*i)-- == 0)
+		return 1;
+
+	if (m->u.match->me)
+		__MOD_DEC_USE_COUNT(m->u.match->me);
+
+	return 0;
+}
+
+/* Validate a standard target: correct target size, and a verdict that
+ * is either a sane negative special value or a jump offset within the
+ * table.  Returns 1 if OK, 0 otherwise. */
+static inline int
+standard_check(const struct ipt_entry_target *t,
+	       unsigned int max_offset)
+{
+	struct ipt_standard_target *targ = (void *)t;
+
+	/* Check standard info. */
+	if (t->target_size != sizeof(struct ipt_standard_target)) {
+		duprintf("standard_check: target size %u != %u\n",
+			 t->target_size, sizeof(struct ipt_standard_target));
+		return 0;
+	}
+
+	if (targ->verdict >= 0
+	    && targ->verdict > max_offset - sizeof(struct ipt_entry)) {
+		duprintf("ipt_standard_check: bad verdict (%i)\n",
+			 targ->verdict);
+		return 0;
+	}
+
+	if (targ->verdict < -NF_MAX_VERDICT - 1) {
+		duprintf("ipt_standard_check: bad negative verdict (%i)\n",
+			 targ->verdict);
+		return 0;
+	}
+	return 1;
+}
+
+/* Resolve one match by name (may auto-load its module), take a module
+ * refcount, and run its checkentry() hook.  On success increments *i
+ * so the caller can partially clean up on later failure. */
+static inline int
+check_match(struct ipt_entry_match *m,
+	    const char *name,
+	    const struct ipt_ip *ip,
+	    unsigned int hookmask,
+	    unsigned int *i)
+{
+	int ret;
+	struct ipt_match *match;
+
+	match = find_match_lock(m->u.name, &ret, &ipt_mutex);
+	if (!match) {
+		duprintf("check_match: `%s' not found\n", m->u.name);
+		return ret;
+	}
+	if (match->me)
+		__MOD_INC_USE_COUNT(match->me);
+	m->u.match = match;
+	up(&ipt_mutex);
+
+	if (m->u.match->checkentry
+	    && !m->u.match->checkentry(name, ip, m->data,
+				       m->match_size - sizeof(*m),
+				       hookmask)) {
+		if (m->u.match->me)
+			__MOD_DEC_USE_COUNT(m->u.match->me);
+		duprintf("ip_tables: check failed for `%s'.\n",
+			 m->u.match->name);
+		return -EINVAL;
+	}
+
+	(*i)++;
+	return 0;
+}
+
+static struct ipt_target ipt_standard_target;
+
+/* Validate one rule: check the IP part, resolve and refcount every
+ * match and the target, and run their checkentry() hooks.  Returns 0
+ * on success or a negative errno; *i counts successfully checked
+ * entries so the caller can clean up a partially checked table.
+ *
+ * Note: find_target_lock() (via find_inlist_lock_noload) releases
+ * ipt_mutex itself when the lookup fails, so the failure path must
+ * not up() the mutex again, and must drop the match refcounts taken
+ * above via cleanup_matches. */
+static inline int
+check_entry(struct ipt_entry *e, const char *name, unsigned int size,
+	    unsigned int *i)
+{
+	struct ipt_entry_target *t;
+	struct ipt_target *target;
+	int ret;
+	unsigned int j;
+
+	if (!ip_checkentry(&e->ip)) {
+		duprintf("ip_tables: ip check failed %p %s.\n", e, name);
+		return -EINVAL;
+	}
+
+	j = 0;
+	ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, e->comefrom, &j);
+	if (ret != 0)
+		goto cleanup_matches;
+
+	t = ipt_get_target(e);
+	target = find_target_lock(t->u.name, &ret, &ipt_mutex);
+	if (!target) {
+		duprintf("check_entry: `%s' not found\n", t->u.name);
+		/* Mutex already dropped by find_target_lock() on failure;
+		 * release the refcounts of the matches checked above. */
+		goto cleanup_matches;
+	}
+	if (target->me)
+		__MOD_INC_USE_COUNT(target->me);
+	t->u.target = target;
+	up(&ipt_mutex);
+
+	if (t->u.target == &ipt_standard_target) {
+		if (!standard_check(t, size)) {
+			ret = -EINVAL;
+			goto cleanup_matches;
+		}
+	} else if (t->u.target->checkentry
+		   && !t->u.target->checkentry(name, e, t->data,
+					       t->target_size - sizeof(*t),
+					       e->comefrom)) {
+		if (t->u.target->me)
+			__MOD_DEC_USE_COUNT(t->u.target->me);
+		duprintf("ip_tables: check failed for `%s'.\n",
+			 t->u.target->name);
+		ret = -EINVAL;
+		goto cleanup_matches;
+	}
+
+	(*i)++;
+	return 0;
+
+ cleanup_matches:
+	IPT_MATCH_ITERATE(e, cleanup_match, &j);
+	return ret;
+}
+
+/* First validation pass over a user-supplied rule blob: check entry
+ * alignment and minimum size, record which entries sit at declared
+ * hook entry/underflow offsets, and zero the counters and comefrom
+ * fields for later passes. */
+static inline int
+check_entry_size_and_hooks(struct ipt_entry *e,
+			   struct ipt_table_info *newinfo,
+			   unsigned char *base,
+			   unsigned char *limit,
+			   const unsigned int *hook_entries,
+			   const unsigned int *underflows,
+			   unsigned int *i)
+{
+	unsigned int h;
+
+	if ((unsigned long)e % __alignof__(struct ipt_entry) != 0
+	    || (unsigned char *)e + sizeof(struct ipt_entry) >= limit) {
+		duprintf("Bad offset %p\n", e);
+		return -EINVAL;
+	}
+
+	if (e->next_offset
+	    < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) {
+		duprintf("checking: element %p size %u\n",
+			 e, e->next_offset);
+		return -EINVAL;
+	}
+
+	/* Check hooks & underflows */
+	for (h = 0; h < NF_IP_NUMHOOKS; h++) {
+		if ((unsigned char *)e - base == hook_entries[h])
+			newinfo->hook_entry[h] = hook_entries[h];
+		if ((unsigned char *)e - base == underflows[h])
+			newinfo->underflow[h] = underflows[h];
+	}
+
+	/* FIXME: underflows must be unconditional, standard verdicts
+	   < 0 (not IPT_RETURN). --RR */
+
+	/* Clear counters and comefrom */
+	e->counters = ((struct ipt_counters) { 0, 0 });
+	e->comefrom = 0;
+
+	(*i)++;
+	return 0;
+}
+
+/* Release one rule's resources: drop refcounts of all its matches and
+ * its target.  If i is non-NULL, stops after *i entries (partial
+ * cleanup when table translation failed mid-way). */
+static inline int
+cleanup_entry(struct ipt_entry *e, unsigned int *i)
+{
+	struct ipt_entry_target *t;
+
+	if (i && (*i)-- == 0)
+		return 1;
+
+	/* Cleanup all matches */
+	IPT_MATCH_ITERATE(e, cleanup_match, NULL);
+	t = ipt_get_target(e);
+	if (t->u.target->me)
+		__MOD_DEC_USE_COUNT(t->u.target->me);
+
+	return 0;
+}
+
+/* Checks and translates the user-supplied table segment (held in
+   newinfo) */
+/* Full validation pipeline: size/hook pass, hook coverage check, loop
+ * detection (mark_source_chains), then per-entry match/target checks.
+ * On success, replicates the checked rules for every other CPU. */
+static int
+translate_table(const char *name,
+		unsigned int valid_hooks,
+		struct ipt_table_info *newinfo,
+		unsigned int size,
+		unsigned int number,
+		const unsigned int *hook_entries,
+		const unsigned int *underflows)
+{
+	unsigned int i;
+	int ret;
+
+	newinfo->size = size;
+	newinfo->number = number;
+
+	/* Init all hooks to impossible value. */
+	for (i = 0; i < NF_IP_NUMHOOKS; i++) {
+		newinfo->hook_entry[i] = 0xFFFFFFFF;
+		newinfo->underflow[i] = 0xFFFFFFFF;
+	}
+
+	duprintf("translate_table: size %u\n", newinfo->size);
+	i = 0;
+	/* Walk through entries, checking offsets. */
+	ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+				check_entry_size_and_hooks,
+				newinfo,
+				newinfo->entries,
+				newinfo->entries + size,
+				hook_entries, underflows, &i);
+	if (ret != 0)
+		return ret;
+
+	if (i != number) {
+		duprintf("translate_table: %u not %u entries\n",
+			 i, number);
+		return -EINVAL;
+	}
+
+	/* Check hooks all assigned */
+	for (i = 0; i < NF_IP_NUMHOOKS; i++) {
+		/* Only hooks which are valid */
+		if (!(valid_hooks & (1 << i)))
+			continue;
+		if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
+			duprintf("Invalid hook entry %u %u\n",
+				 i, hook_entries[i]);
+			return -EINVAL;
+		}
+		if (newinfo->underflow[i] == 0xFFFFFFFF) {
+			duprintf("Invalid underflow %u %u\n",
+				 i, underflows[i]);
+			return -EINVAL;
+		}
+	}
+
+	if (!mark_source_chains(newinfo, valid_hooks))
+		return -ELOOP;
+
+	/* Finally, each sanity check must pass */
+	i = 0;
+	ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+				check_entry, name, size, &i);
+
+	if (ret != 0) {
+		/* Undo refcounts for the i entries already checked. */
+		IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+				  cleanup_entry, &i);
+		return ret;
+	}
+
+	/* And one copy for every other CPU */
+	for (i = 1; i < smp_num_cpus; i++) {
+		memcpy(newinfo->entries + SMP_ALIGN(newinfo->size*i),
+		       newinfo->entries,
+		       SMP_ALIGN(newinfo->size));
+	}
+
+	return ret;
+}
+
+/* Atomically swap a table's private rule blob for newinfo under the
+ * table's write lock, after verifying the caller's expected counter
+ * count still matches.  Returns the old blob (caller frees it), or
+ * NULL with *error set on mismatch. */
+static struct ipt_table_info *
+replace_table(struct ipt_table *table,
+	      unsigned int num_counters,
+	      struct ipt_table_info *newinfo,
+	      int *error)
+{
+	struct ipt_table_info *oldinfo;
+
+#ifdef CONFIG_NETFILTER_DEBUG
+	{
+		/* Seed the "nobody traversing" marker checked by
+		 * ipt_do_table in each per-CPU copy. */
+		struct ipt_entry *table_base;
+		unsigned int i;
+
+		for (i = 0; i < smp_num_cpus; i++) {
+			table_base =
+				(void *)newinfo->entries
+				+ TABLE_OFFSET(newinfo, i);
+
+			table_base->comefrom = 0xdead57ac;
+		}
+	}
+#endif
+
+	/* Do the substitution. */
+	write_lock_bh(&table->lock);
+	/* Check inside lock: is the old number correct? */
+	if (num_counters != table->private->number) {
+		duprintf("num_counters != table->private->number (%u/%u)\n",
+			 num_counters, table->private->number);
+		write_unlock_bh(&table->lock);
+		*error = -EAGAIN;
+		return NULL;
+	}
+	oldinfo = table->private;
+	table->private = newinfo;
+	write_unlock_bh(&table->lock);
+
+	return oldinfo;
+}
+
+/* Gets counters. */
+/* IPT_ENTRY_ITERATE callback: fold one entry's byte/packet counters
+ * into the running totals array, advancing *i. */
+static inline int
+add_entry_to_counter(const struct ipt_entry *e,
+		     struct ipt_counters total[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
+
+	(*i)++;
+	return 0;
+}
+
+/* Sum each rule's counters across all per-CPU copies of table t. */
+static void
+get_counters(const struct ipt_table_info *t,
+	     struct ipt_counters counters[])
+{
+	unsigned int cpu;
+	unsigned int i;
+
+	for (cpu = 0; cpu < smp_num_cpus; cpu++) {
+		i = 0;
+		IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
+				  t->size,
+				  add_entry_to_counter,
+				  counters,
+				  &i);
+	}
+}
+
+/* Copy the table's rules out to user space: snapshot the summed
+ * counters under the write lock, copy CPU 0's rule blob wholesale,
+ * then patch each entry's counters and rewrite kernel pointers back
+ * into the user-visible match/target names. */
+static int
+copy_entries_to_user(unsigned int total_size,
+		     struct ipt_table *table,
+		     void *userptr)
+{
+	unsigned int off, num, countersize;
+	struct ipt_entry *e;
+	struct ipt_counters *counters;
+	int ret = 0;
+
+	/* We need atomic snapshot of counters: rest doesn't change
+	   (other than comefrom, which userspace doesn't care
+	   about). */
+	countersize = sizeof(struct ipt_counters) * table->private->number;
+	counters = vmalloc(countersize);
+
+	if (counters == NULL)
+		return -ENOMEM;
+
+	/* First, sum counters... */
+	memset(counters, 0, countersize);
+	write_lock_bh(&table->lock);
+	get_counters(table->private, counters);
+	write_unlock_bh(&table->lock);
+
+	/* ... then copy entire thing from CPU 0... */
+	if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
+		ret = -EFAULT;
+		goto free_counters;
+	}
+
+	/* FIXME: use iterator macros --RR */
+	/* ... then go back and fix counters and names */
+	for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
+		unsigned int i;
+		struct ipt_entry_match *m;
+		struct ipt_entry_target *t;
+
+		e = (struct ipt_entry *)(table->private->entries + off);
+		if (copy_to_user(userptr + off
+				 + offsetof(struct ipt_entry, counters),
+				 &counters[num],
+				 sizeof(counters[num])) != 0) {
+			ret = -EFAULT;
+			goto free_counters;
+		}
+
+		for (i = sizeof(struct ipt_entry);
+		     i < e->target_offset;
+		     i += m->match_size) {
+			m = (void *)e + i;
+
+			if (copy_to_user(userptr + off + i
+					 + offsetof(struct ipt_entry_match,
+						    u.name),
+					 m->u.match->name,
+					 strlen(m->u.match->name)+1) != 0) {
+				ret = -EFAULT;
+				goto free_counters;
+			}
+		}
+
+		t = ipt_get_target(e);
+		if (copy_to_user(userptr + off + e->target_offset
+				 + offsetof(struct ipt_entry_target,
+					    u.name),
+				 t->u.target->name,
+				 strlen(t->u.target->name)+1) != 0) {
+			ret = -EFAULT;
+			goto free_counters;
+		}
+	}
+
+ free_counters:
+	vfree(counters);
+	return ret;
+}
+
+/* IPT_SO_GET_ENTRIES backend: look up the table by name (holding
+ * ipt_mutex on success), verify the user's size expectation, and copy
+ * the rules out.  Returns 0 or a negative errno. */
+static int
+get_entries(const struct ipt_get_entries *entries,
+	    struct ipt_get_entries *uptr)
+{
+	int ret;
+	struct ipt_table *t;
+
+	t = find_table_lock(entries->name, &ret, &ipt_mutex);
+	if (t) {
+		duprintf("t->private->number = %u\n",
+			 t->private->number);
+		if (entries->size == t->private->size)
+			ret = copy_entries_to_user(t->private->size,
+						   t, uptr->entries);
+		else {
+			duprintf("get_entries: I've got %u not %u!\n",
+				 t->private->size,
+				 entries->size);
+			ret = -EINVAL;
+		}
+		up(&ipt_mutex);
+	} else
+		duprintf("get_entries: Can't find %s!\n",
+			 entries->name);
+
+	return ret;
+}
+
+/* IPT_SO_SET_REPLACE backend: copy the new ruleset in from user space,
+ * validate/translate it, atomically swap it into the named table, hand
+ * the old rules' counters back to user space, and free the old blob.
+ * Returns 0 or a negative errno.  Fix: the vmalloc'd counters buffer
+ * was previously leaked on the success path. */
+static int
+do_replace(void *user, unsigned int len)
+{
+	int ret;
+	struct ipt_replace tmp;
+	struct ipt_table *t;
+	struct ipt_table_info *newinfo, *oldinfo;
+	struct ipt_counters *counters;
+
+	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+		return -EFAULT;
+
+	newinfo = vmalloc(sizeof(struct ipt_table_info)
+			  + SMP_ALIGN(tmp.size) * smp_num_cpus);
+	if (!newinfo)
+		return -ENOMEM;
+
+	if (copy_from_user(newinfo->entries, user + sizeof(tmp),
+			   tmp.size) != 0) {
+		ret = -EFAULT;
+		goto free_newinfo;
+	}
+
+	counters = vmalloc(tmp.num_counters * sizeof(struct ipt_counters));
+	if (!counters) {
+		ret = -ENOMEM;
+		goto free_newinfo;
+	}
+	memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters));
+
+	ret = translate_table(tmp.name, tmp.valid_hooks,
+			      newinfo, tmp.size, tmp.num_entries,
+			      tmp.hook_entry, tmp.underflow);
+	if (ret != 0)
+		goto free_newinfo_counters;
+
+	duprintf("ip_tables: Translated table\n");
+
+	t = find_table_lock(tmp.name, &ret, &ipt_mutex);
+	if (!t)
+		goto free_newinfo_counters_untrans;
+
+	/* You lied! */
+	if (tmp.valid_hooks != t->valid_hooks) {
+		duprintf("Valid hook crap: %08X vs %08X\n",
+			 tmp.valid_hooks, t->valid_hooks);
+		ret = -EINVAL;
+		goto free_newinfo_counters_untrans_unlock;
+	}
+
+	oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret);
+	if (!oldinfo)
+		goto free_newinfo_counters_untrans_unlock;
+
+	/* Get the old counters. */
+	get_counters(oldinfo, counters);
+	/* Decrease module usage counts and free resource */
+	IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
+	vfree(oldinfo);
+	/* Silent error: too late now. */
+	copy_to_user(tmp.counters, counters,
+		     sizeof(struct ipt_counters) * tmp.num_counters);
+
+	/* Counters have been handed to user space; release our copy
+	 * (previously leaked here). */
+	vfree(counters);
+	up(&ipt_mutex);
+	return 0;
+
+ free_newinfo_counters_untrans_unlock:
+	up(&ipt_mutex);
+ free_newinfo_counters_untrans:
+	IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL);
+ free_newinfo_counters:
+	vfree(counters);
+ free_newinfo:
+	vfree(newinfo);
+	return ret;
+}
+
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+/* IPT_ENTRY_ITERATE callback: add the user-supplied delta addme[*i]
+ * onto one entry's counters, advancing *i. */
+static inline int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct ipt_counters addme[],
+		     unsigned int *i)
+{
+#if 0
+	duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
+		 *i,
+		 (long unsigned int)e->counters.pcnt,
+		 (long unsigned int)e->counters.bcnt,
+		 (long unsigned int)addme[*i].pcnt,
+		 (long unsigned int)addme[*i].bcnt);
+#endif
+
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
+/* IPT_SO_SET_ADD_COUNTERS backend: copy the user's counter deltas in,
+ * verify the count matches the table, and add them onto the first
+ * CPU's rule copies under the table's write lock.
+ * NOTE(review): the len check multiplies tmp.num_counters without an
+ * overflow guard -- relies on the caller being CAP_NET_ADMIN. */
+static int
+do_add_counters(void *user, unsigned int len)
+{
+	unsigned int i;
+	struct ipt_counters_info tmp, *paddc;
+	struct ipt_table *t;
+	int ret;
+
+	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+		return -EFAULT;
+
+	if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters))
+		return -EINVAL;
+
+	paddc = vmalloc(len);
+	if (!paddc)
+		return -ENOMEM;
+
+	if (copy_from_user(paddc, user, len) != 0) {
+		ret = -EFAULT;
+		goto free;
+	}
+
+	t = find_table_lock(tmp.name, &ret, &ipt_mutex);
+	if (!t)
+		goto free;
+
+	write_lock_bh(&t->lock);
+	if (t->private->number != paddc->num_counters) {
+		ret = -EINVAL;
+		goto unlock_up_free;
+	}
+
+	i = 0;
+	IPT_ENTRY_ITERATE(t->private->entries,
+			  t->private->size,
+			  add_counter_to_entry,
+			  paddc->counters,
+			  &i);
+ unlock_up_free:
+	write_unlock_bh(&t->lock);
+	up(&ipt_mutex);
+ free:
+	vfree(paddc);
+
+	return ret;
+}
+
+/* Setsockopt entry point: dispatch IPT_SO_SET_* commands.  Requires
+ * CAP_NET_ADMIN. */
+static int
+do_ipt_set_ctl(struct sock *sk, int cmd, void *user, unsigned int len)
+{
+	int ret;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	switch (cmd) {
+	case IPT_SO_SET_REPLACE:
+		ret = do_replace(user, len);
+		break;
+
+	case IPT_SO_SET_ADD_COUNTERS:
+		ret = do_add_counters(user, len);
+		break;
+
+	default:
+		duprintf("do_ipt_set_ctl: unknown request %i\n", cmd);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+/* Getsockopt entry point: dispatch IPT_SO_GET_* commands (table info
+ * and rule dump).  Requires CAP_NET_ADMIN. */
+static int
+do_ipt_get_ctl(struct sock *sk, int cmd, void *user, int *len)
+{
+	int ret;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	switch (cmd) {
+	case IPT_SO_GET_INFO: {
+		char name[IPT_TABLE_MAXNAMELEN];
+		struct ipt_table *t;
+
+		if (*len != sizeof(struct ipt_getinfo)) {
+			duprintf("length %u != %u\n", *len,
+				 sizeof(struct ipt_getinfo));
+			ret = -EINVAL;
+			break;
+		}
+
+		if (copy_from_user(name, user, sizeof(name)) != 0) {
+			ret = -EFAULT;
+			break;
+		}
+		t = find_table_lock(name, &ret, &ipt_mutex);
+		if (t) {
+			struct ipt_getinfo info;
+
+			info.valid_hooks = t->valid_hooks;
+			memcpy(info.hook_entry, t->private->hook_entry,
+			       sizeof(info.hook_entry));
+			memcpy(info.underflow, t->private->underflow,
+			       sizeof(info.underflow));
+			info.num_entries = t->private->number;
+			info.size = t->private->size;
+			strcpy(info.name, name);
+
+			if (copy_to_user(user, &info, *len) != 0)
+				ret = -EFAULT;
+			else
+				ret = 0;
+
+			up(&ipt_mutex);
+		}
+	}
+	break;
+
+	case IPT_SO_GET_ENTRIES: {
+		struct ipt_get_entries get;
+
+		if (*len < sizeof(get)) {
+			duprintf("get_entries: %u < %u\n", *len, sizeof(get));
+			ret = -EINVAL;
+		} else if (copy_from_user(&get, user, sizeof(get)) != 0) {
+			ret = -EFAULT;
+		} else if (*len != sizeof(struct ipt_get_entries) + get.size) {
+			duprintf("get_entries: %u != %u\n", *len,
+				 sizeof(struct ipt_get_entries) + get.size);
+			ret = -EINVAL;
+		} else
+			ret = get_entries(&get, user);
+		break;
+	}
+
+	default:
+		duprintf("do_ipt_get_ctl: unknown request %i\n", cmd);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+/* Registration hooks for targets. */
+/* Add a target to the registry under ipt_mutex; fails with -EINVAL if
+ * a target of the same name already exists.  Pins this module while
+ * the target is registered. */
+int
+ipt_register_target(struct ipt_target *target)
+{
+	int ret;
+
+	ret = down_interruptible(&ipt_mutex);
+	if (ret != 0)
+		return ret;
+
+	if (list_named_insert(&ipt_target, target)) {
+		MOD_INC_USE_COUNT;
+		ret = 0;
+	} else {
+		duprintf("ipt_register_target: `%s' already in list!\n",
+			 target->name);
+		ret = -EINVAL;
+	}
+	up(&ipt_mutex);
+	return ret;
+}
+
+/* Remove a previously registered target and unpin this module. */
+void
+ipt_unregister_target(struct ipt_target *target)
+{
+	down(&ipt_mutex);
+	LIST_DELETE(&ipt_target, target);
+	up(&ipt_mutex);
+	MOD_DEC_USE_COUNT;
+}
+
+/* Add a match to the registry under ipt_mutex; fails with -EINVAL if
+ * a match of the same name already exists.  Pins this module while
+ * the match is registered. */
+int
+ipt_register_match(struct ipt_match *match)
+{
+	int ret;
+
+	ret = down_interruptible(&ipt_mutex);
+	if (ret != 0)
+		return ret;
+
+	if (list_named_insert(&ipt_match, match)) {
+		MOD_INC_USE_COUNT;
+		ret = 0;
+	} else {
+		duprintf("ipt_register_match: `%s' already in list!\n",
+			 match->name);
+		ret = -EINVAL;
+	}
+	up(&ipt_mutex);
+
+	return ret;
+}
+
+/* Remove a previously registered match and unpin this module. */
+void
+ipt_unregister_match(struct ipt_match *match)
+{
+	down(&ipt_mutex);
+	LIST_DELETE(&ipt_match, match);
+	up(&ipt_mutex);
+	MOD_DEC_USE_COUNT;
+}
+
+/* Register a table: allocate per-CPU rule storage, translate the
+ * table's initial ruleset, and insert it into the registry.  The
+ * bootstrap info lets replace_table() run against an "empty" private
+ * blob on first installation. */
+int ipt_register_table(struct ipt_table *table)
+{
+	int ret;
+	struct ipt_table_info *newinfo;
+	static struct ipt_table_info bootstrap
+		= { 0, 0, { 0 }, { 0 }, { }, { } };
+
+	newinfo = vmalloc(sizeof(struct ipt_table_info)
+			  + SMP_ALIGN(table->table->size) * smp_num_cpus);
+	if (!newinfo) {
+		ret = -ENOMEM;
+		return ret;
+	}
+	memcpy(newinfo->entries, table->table->entries, table->table->size);
+
+	ret = translate_table(table->name, table->valid_hooks,
+			      newinfo, table->table->size,
+			      table->table->num_entries,
+			      table->table->hook_entry,
+			      table->table->underflow);
+	if (ret != 0) {
+		vfree(newinfo);
+		return ret;
+	}
+
+	ret = down_interruptible(&ipt_mutex);
+	if (ret != 0) {
+		vfree(newinfo);
+		return ret;
+	}
+
+	/* Don't autoload: we'd eat our tail... */
+	if (list_named_find(&ipt_tables, table->name)) {
+		ret = -EEXIST;
+		goto free_unlock;
+	}
+
+	/* Simplifies replace_table code. */
+	table->private = &bootstrap;
+	if (!replace_table(table, 0, newinfo, &ret))
+		goto free_unlock;
+
+	duprintf("table->private->number = %u\n",
+		 table->private->number);
+
+	table->lock = RW_LOCK_UNLOCKED;
+	list_prepend(&ipt_tables, table);
+	MOD_INC_USE_COUNT;
+
+ unlock:
+	up(&ipt_mutex);
+	return ret;
+
+ free_unlock:
+	vfree(newinfo);
+	goto unlock;
+}
+
+void ipt_unregister_table(struct ipt_table *table)
+{
+ down(&ipt_mutex);
+ LIST_DELETE(&ipt_tables, table);
+ up(&ipt_mutex);
+
+ /* Decrease module usage counts and free resources */
+ IPT_ENTRY_ITERATE(table->private->entries, table->private->size,
+ cleanup_entry, NULL);
+ vfree(table->private);
+ MOD_DEC_USE_COUNT;
+}
+
+/* Returns 1 if the port is matched by the range, 0 otherwise */
+static inline int
+port_match(u_int16_t min, u_int16_t max, u_int16_t port, int invert)
+{
+ int ret;
+
+ ret = (port >= min && port <= max) ^ invert;
+ return ret;
+}
+
+static int
+tcp_find_option(u_int8_t option,
+ const struct tcphdr *tcp,
+ u_int16_t datalen,
+ int invert,
+ int *hotdrop)
+{
+ unsigned int i = sizeof(struct tcphdr);
+ const u_int8_t *opt = (u_int8_t *)tcp;
+
+ duprintf("tcp_match: finding option\n");
+ /* If we don't have the whole header, drop packet. */
+ if (tcp->doff * 4 > datalen) {
+ *hotdrop = 1;
+ return 0;
+ }
+
+ while (i < tcp->doff * 4) {
+ if (opt[i] == option) return !invert;
+ if (opt[i] < 2) i++;
+ else i += opt[i+1]?:1;
+ }
+
+ return invert;
+}
+
+static int
+tcp_match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ const void *hdr,
+ u_int16_t datalen,
+ int *hotdrop)
+{
+ const struct tcphdr *tcp = hdr;
+ const struct ipt_tcp *tcpinfo = matchinfo;
+
+ /* To quote Alan:
+
+ Don't allow a fragment of TCP 8 bytes in. Nobody normal
+ causes this. Its a cracker trying to break in by doing a
+ flag overwrite to pass the direction checks.
+ */
+
+ if (offset == 1) {
+ duprintf("Dropping evil TCP offset=1 frag.\n");
+ *hotdrop = 1;
+ return 0;
+ } else if (offset == 0 && datalen < sizeof(struct tcphdr)) {
+ /* We've been asked to examine this packet, and we
+ can't. Hence, no choice but to drop. */
+ duprintf("Dropping evil TCP offset=0 tinygram.\n");
+ *hotdrop = 1;
+ return 0;
+ }
+
+ /* FIXME: Try tcp doff >> packet len against various stacks --RR */
+
+#define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg))
+
+ /* Must not be a fragment. */
+ return !offset
+ && port_match(tcpinfo->spts[0], tcpinfo->spts[1],
+ ntohs(tcp->source),
+ !!(tcpinfo->invflags & IPT_TCP_INV_SRCPT))
+ && port_match(tcpinfo->dpts[0], tcpinfo->dpts[1],
+ ntohs(tcp->dest),
+ !!(tcpinfo->invflags & IPT_TCP_INV_DSTPT))
+ && FWINVTCP((((unsigned char *)tcp)[13]
+ & tcpinfo->flg_mask)
+ == tcpinfo->flg_cmp,
+ IPT_TCP_INV_FLAGS)
+ && (!tcpinfo->option
+ || tcp_find_option(tcpinfo->option, tcp, datalen,
+ tcpinfo->invflags
+ & IPT_TCP_INV_OPTION,
+ hotdrop));
+}
+
+/* Called when user tries to insert an entry of this type. */
+static int
+tcp_checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ const struct ipt_tcp *tcpinfo = matchinfo;
+
+ /* Must specify proto == TCP, and no unknown invflags */
+ return ip->proto == IPPROTO_TCP
+ && !(ip->invflags & IPT_INV_PROTO)
+ && matchsize == IPT_ALIGN(sizeof(struct ipt_tcp))
+ && !(tcpinfo->invflags & ~IPT_TCP_INV_MASK);
+}
+
+static int
+udp_match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ const void *hdr,
+ u_int16_t datalen,
+ int *hotdrop)
+{
+ const struct udphdr *udp = hdr;
+ const struct ipt_udp *udpinfo = matchinfo;
+
+ if (offset == 0 && datalen < sizeof(struct udphdr)) {
+ /* We've been asked to examine this packet, and we
+ can't. Hence, no choice but to drop. */
+ duprintf("Dropping evil UDP tinygram.\n");
+ *hotdrop = 1;
+ return 0;
+ }
+
+ /* Must not be a fragment. */
+ return !offset
+ && port_match(udpinfo->spts[0], udpinfo->spts[1],
+ ntohs(udp->source),
+ !!(udpinfo->invflags & IPT_UDP_INV_SRCPT))
+ && port_match(udpinfo->dpts[0], udpinfo->dpts[1],
+ ntohs(udp->dest),
+ !!(udpinfo->invflags & IPT_UDP_INV_DSTPT));
+}
+
+/* Called when user tries to insert an entry of this type. */
+static int
+udp_checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchinfosize,
+ unsigned int hook_mask)
+{
+ const struct ipt_udp *udpinfo = matchinfo;
+
+ /* Must specify proto == UDP, and no unknown invflags */
+ if (ip->proto != IPPROTO_UDP || (ip->invflags & IPT_INV_PROTO)) {
+ duprintf("ipt_udp: Protocol %u != %u\n", ip->proto,
+ IPPROTO_UDP);
+ return 0;
+ }
+ if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_udp))) {
+ duprintf("ipt_udp: matchsize %u != %u\n",
+ matchinfosize, IPT_ALIGN(sizeof(struct ipt_udp)));
+ return 0;
+ }
+ if (udpinfo->invflags & ~IPT_UDP_INV_MASK) {
+ duprintf("ipt_udp: unknown flags %X\n",
+ udpinfo->invflags);
+ return 0;
+ }
+
+ return 1;
+}
+
+/* Returns 1 if the type and code is matched by the range, 0 otherwise */
+static inline int
+icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
+ u_int8_t type, u_int8_t code,
+ int invert)
+{
+ return (type == test_type && code >= min_code && code <= max_code)
+ ^ invert;
+}
+
+static int
+icmp_match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ const void *hdr,
+ u_int16_t datalen,
+ int *hotdrop)
+{
+ const struct icmphdr *icmp = hdr;
+ const struct ipt_icmp *icmpinfo = matchinfo;
+
+ if (offset == 0 && datalen < 2) {
+ /* We've been asked to examine this packet, and we
+ can't. Hence, no choice but to drop. */
+ duprintf("Dropping evil ICMP tinygram.\n");
+ *hotdrop = 1;
+ return 0;
+ }
+
+ /* Must not be a fragment. */
+ return !offset
+ && icmp_type_code_match(icmpinfo->type,
+ icmpinfo->code[0],
+ icmpinfo->code[1],
+ icmp->type, icmp->code,
+ !!(icmpinfo->invflags&IPT_ICMP_INV));
+}
+
+/* Called when user tries to insert an entry of this type. */
+static int
+icmp_checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ const struct ipt_icmp *icmpinfo = matchinfo;
+
+ /* Must specify proto == ICMP, and no unknown invflags */
+ return ip->proto == IPPROTO_ICMP
+ && !(ip->invflags & IPT_INV_PROTO)
+ && matchsize == IPT_ALIGN(sizeof(struct ipt_icmp))
+ && !(icmpinfo->invflags & ~IPT_ICMP_INV);
+}
+
+/* The built-in targets: standard (NULL) and error. */
+static struct ipt_target ipt_standard_target
+= { { NULL, NULL }, IPT_STANDARD_TARGET, NULL, NULL, NULL };
+static struct ipt_target ipt_error_target
+= { { NULL, NULL }, IPT_ERROR_TARGET, ipt_error, NULL, NULL };
+
+static struct nf_sockopt_ops ipt_sockopts
+= { { NULL, NULL }, PF_INET, IPT_BASE_CTL, IPT_SO_SET_MAX+1, do_ipt_set_ctl,
+ IPT_BASE_CTL, IPT_SO_GET_MAX+1, do_ipt_get_ctl, 0, NULL };
+
+static struct ipt_match tcp_matchstruct
+= { { NULL, NULL }, "tcp", &tcp_match, &tcp_checkentry, NULL };
+static struct ipt_match udp_matchstruct
+= { { NULL, NULL }, "udp", &udp_match, &udp_checkentry, NULL };
+static struct ipt_match icmp_matchstruct
+= { { NULL, NULL }, "icmp", &icmp_match, &icmp_checkentry, NULL };
+
+static int __init init(void)
+{
+ int ret;
+
+ /* Noone else will be downing sem now, so we won't sleep */
+ down(&ipt_mutex);
+ list_append(&ipt_target, &ipt_standard_target);
+ list_append(&ipt_target, &ipt_error_target);
+ list_append(&ipt_match, &tcp_matchstruct);
+ list_append(&ipt_match, &udp_matchstruct);
+ list_append(&ipt_match, &icmp_matchstruct);
+ up(&ipt_mutex);
+
+ /* Register setsockopt */
+ ret = nf_register_sockopt(&ipt_sockopts);
+ if (ret < 0) {
+ duprintf("Unable to register sockopts.\n");
+ return ret;
+ }
+
+ printk("iptables: (c)2000 Netfilter core team\n");
+ return 0;
+}
+
+static void __exit fini(void)
+{
+ nf_unregister_sockopt(&ipt_sockopts);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipchains_core.c b/net/ipv4/netfilter/ipchains_core.c
new file mode 100644
index 000000000..02bd7ad83
--- /dev/null
+++ b/net/ipv4/netfilter/ipchains_core.c
@@ -0,0 +1,1768 @@
+/* Minor modifications to fit on compatibility framework:
+ Rusty.Russell@rustcorp.com.au
+*/
+
+/*
+ * This code is heavily based on the code on the old ip_fw.c code; see below for
+ * copyrights and attributions of the old code. This code is basically GPL.
+ *
+ * 15-Aug-1997: Major changes to allow graphs for firewall rules.
+ * Paul Russell <Paul.Russell@rustcorp.com.au> and
+ * Michael Neuling <Michael.Neuling@rustcorp.com.au>
+ * 24-Aug-1997: Generalised protocol handling (not just TCP/UDP/ICMP).
+ * Added explicit RETURN from chains.
+ * Removed TOS mangling (done in ipchains 1.0.1).
+ * Fixed read & reset bug by reworking proc handling.
+ * Paul Russell <Paul.Russell@rustcorp.com.au>
+ * 28-Sep-1997: Added packet marking for net sched code.
+ * Removed fw_via comparisons: all done on device name now,
+ * similar to changes in ip_fw.c in DaveM's CVS970924 tree.
+ * Paul Russell <Paul.Russell@rustcorp.com.au>
+ * 2-Nov-1997: Moved types across to __u16, etc.
+ * Added inverse flags.
+ * Fixed fragment bug (in args to port_match).
+ * Changed mark to only one flag (MARKABS).
+ * 21-Nov-1997: Added ability to test ICMP code.
+ * 19-Jan-1998: Added wildcard interfaces.
+ * 6-Feb-1998: Merged 2.0 and 2.1 versions.
+ * Initialised ip_masq for 2.0.x version.
+ * Added explicit NETLINK option for 2.1.x version.
+ * Added packet and byte counters for policy matches.
+ * 26-Feb-1998: Fixed race conditions, added SMP support.
+ * 18-Mar-1998: Fix SMP, fix race condition fix.
+ * 1-May-1998: Remove caching of device pointer.
+ * 12-May-1998: Allow tiny fragment case for TCP/UDP.
+ * 15-May-1998: Treat short packets as fragments, don't just block.
+ * 3-Jan-1999: Fixed serious procfs security hole -- users should never
+ * be allowed to view the chains!
+ * Marc Santoro <ultima@snicker.emoti.com>
+ * 29-Jan-1999: Locally generated bogus IPs dealt with, rather than crash
+ * during dump_packet. --RR.
+ * 19-May-1999: Star Wars: The Phantom Menace opened. Rule num
+ * printed in log (modified from Michael Hasenstein's patch).
+ * Added SYN in log message. --RR
+ * 23-Jul-1999: Fixed small fragment security exposure opened on 15-May-1998.
+ * John McDonald <jm@dataprotect.com>
+ * Thomas Lopatic <tl@dataprotect.com>
+ */
+
+/*
+ *
+ * The origina Linux port was done Alan Cox, with changes/fixes from
+ * Pauline Middlelink, Jos Vos, Thomas Quinot, Wouter Gadeyne, Juan
+ * Jose Ciarlante, Bernd Eckenfels, Keith Owens and others.
+ *
+ * Copyright from the original FreeBSD version follows:
+ *
+ * Copyright (c) 1993 Daniel Boulet
+ * Copyright (c) 1994 Ugen J.S.Antsilevich
+ *
+ * Redistribution and use in source forms, with and without modification,
+ * are permitted provided that this entire comment appears intact.
+ *
+ * Redistribution in binary form may occur without any restrictions.
+ * Obviously, it would be nice if you gave credit where credit is due
+ * but requiring it would be too onerous.
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind. */
+
+#include <linux/config.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/icmp.h>
+#include <linux/udp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/compat_firewall.h>
+#include <linux/netfilter_ipv4/ipchains_core.h>
+
+#include <net/checksum.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+
+/* Understanding locking in this code: (thanks to Alan Cox for using
+ * little words to explain this to me). -- PR
+ *
+ * In UP, there can be two packets traversing the chains:
+ * 1) A packet from the current userspace context
+ * 2) A packet off the bh handlers (timer or net).
+ *
+ * For SMP (kernel v2.1+), multiply this by # CPUs.
+ *
+ * [Note that this in not correct for 2.2 - because the socket code always
+ * uses lock_kernel() to serialize, and bottom halves (timers and net_bhs)
+ * only run on one CPU at a time. This will probably change for 2.3.
+ * It is still good to use spinlocks because that avoids the global cli()
+ * for updating the tables, which is rather costly in SMP kernels -AK]
+ *
+ * This means counters and backchains can get corrupted if no precautions
+ * are taken.
+ *
+ * To actually alter a chain on UP, we need only do a cli(), as this will
+ * stop a bh handler firing, as we are in the current userspace context
+ * (coming from a setsockopt()).
+ *
+ * On SMP, we need a write_lock_irqsave(), which is a simple cli() in
+ * UP.
+ *
+ * For backchains and counters, we use an array, indexed by
+ * [cpu_number_map[smp_processor_id()]*2 + !in_interrupt()]; the array is of
+ * size [smp_num_cpus*2]. For v2.0, smp_num_cpus is effectively 1. So,
+ * confident of uniqueness, we modify counters even though we only
+ * have a read lock (to read the counters, you need a write lock,
+ * though). */
+
+/* Why I didn't use straight locking... -- PR
+ *
+ * The backchains can be separated out of the ip_chains structure, and
+ * allocated as needed inside ip_fw_check().
+ *
+ * The counters, however, can't. Trying to lock these means blocking
+ * interrupts every time we want to access them. This would suck HARD
+ * performance-wise. Not locking them leads to possible corruption,
+ * made worse on 32-bit machines (counters are 64-bit). */
+
+/*#define DEBUG_IP_FIREWALL*/
+/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */
+/*#define DEBUG_IP_FIREWALL_USER*/
+/*#define DEBUG_IP_FIREWALL_LOCKING*/
+
+static struct sock *ipfwsk;
+
+#ifdef CONFIG_SMP
+#define SLOT_NUMBER() (cpu_number_map(smp_processor_id())*2 + !in_interrupt())
+#else /* !SMP */
+#define SLOT_NUMBER() (!in_interrupt())
+#endif /* CONFIG_SMP */
+#define NUM_SLOTS (smp_num_cpus*2)
+
+#define SIZEOF_STRUCT_IP_CHAIN (sizeof(struct ip_chain) \
+ + NUM_SLOTS*sizeof(struct ip_reent))
+#define SIZEOF_STRUCT_IP_FW_KERNEL (sizeof(struct ip_fwkernel) \
+ + NUM_SLOTS*sizeof(struct ip_counters))
+
+#ifdef DEBUG_IP_FIREWALL_LOCKING
+static unsigned int fwc_rlocks, fwc_wlocks;
+#define FWC_DEBUG_LOCK(d) \
+do { \
+ FWC_DONT_HAVE_LOCK(d); \
+ d |= (1 << SLOT_NUMBER()); \
+} while (0)
+
+#define FWC_DEBUG_UNLOCK(d) \
+do { \
+ FWC_HAVE_LOCK(d); \
+ d &= ~(1 << SLOT_NUMBER()); \
+} while (0)
+
+#define FWC_DONT_HAVE_LOCK(d) \
+do { \
+ if ((d) & (1 << SLOT_NUMBER())) \
+ printk("%s:%i: Got lock on %i already!\n", \
+ __FILE__, __LINE__, SLOT_NUMBER()); \
+} while(0)
+
+#define FWC_HAVE_LOCK(d) \
+do { \
+ if (!((d) & (1 << SLOT_NUMBER()))) \
+ printk("%s:%i:No lock on %i!\n", \
+ __FILE__, __LINE__, SLOT_NUMBER()); \
+} while (0)
+
+#else
+#define FWC_DEBUG_LOCK(d) do { } while(0)
+#define FWC_DEBUG_UNLOCK(d) do { } while(0)
+#define FWC_DONT_HAVE_LOCK(d) do { } while(0)
+#define FWC_HAVE_LOCK(d) do { } while(0)
+#endif /*DEBUG_IP_FIRWALL_LOCKING*/
+
+#define FWC_READ_LOCK(l) do { FWC_DEBUG_LOCK(fwc_rlocks); read_lock(l); } while (0)
+#define FWC_WRITE_LOCK(l) do { FWC_DEBUG_LOCK(fwc_wlocks); write_lock(l); } while (0)
+#define FWC_READ_LOCK_IRQ(l,f) do { FWC_DEBUG_LOCK(fwc_rlocks); read_lock_irqsave(l,f); } while (0)
+#define FWC_WRITE_LOCK_IRQ(l,f) do { FWC_DEBUG_LOCK(fwc_wlocks); write_lock_irqsave(l,f); } while (0)
+#define FWC_READ_UNLOCK(l) do { FWC_DEBUG_UNLOCK(fwc_rlocks); read_unlock(l); } while (0)
+#define FWC_WRITE_UNLOCK(l) do { FWC_DEBUG_UNLOCK(fwc_wlocks); write_unlock(l); } while (0)
+#define FWC_READ_UNLOCK_IRQ(l,f) do { FWC_DEBUG_UNLOCK(fwc_rlocks); read_unlock_irqrestore(l,f); } while (0)
+#define FWC_WRITE_UNLOCK_IRQ(l,f) do { FWC_DEBUG_UNLOCK(fwc_wlocks); write_unlock_irqrestore(l,f); } while (0)
+
+struct ip_chain;
+
+struct ip_counters
+{
+ __u64 pcnt, bcnt; /* Packet and byte counters */
+};
+
+struct ip_fwkernel
+{
+ struct ip_fw ipfw;
+ struct ip_fwkernel *next; /* where to go next if current
+ * rule doesn't match */
+ struct ip_chain *branch; /* which branch to jump to if
+ * current rule matches */
+ int simplebranch; /* Use this if branch == NULL */
+ struct ip_counters counters[0]; /* Actually several of these */
+};
+
+struct ip_reent
+{
+ struct ip_chain *prevchain; /* Pointer to referencing chain */
+ struct ip_fwkernel *prevrule; /* Pointer to referencing rule */
+ struct ip_counters counters;
+};
+
+struct ip_chain
+{
+ ip_chainlabel label; /* Defines the label for each block */
+ struct ip_chain *next; /* Pointer to next block */
+ struct ip_fwkernel *chain; /* Pointer to first rule in block */
+ __u32 refcount; /* Number of refernces to block */
+ int policy; /* Default rule for chain. Only *
+ * used in built in chains */
+ struct ip_reent reent[0]; /* Actually several of these */
+};
+
+/*
+ * Implement IP packet firewall
+ */
+
+#ifdef DEBUG_IP_FIREWALL
+#define dprintf(format, args...) printk(format , ## args)
+#else
+#define dprintf(format, args...)
+#endif
+
+#ifdef DEBUG_IP_FIREWALL_USER
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+/* Lock around ip_fw_chains linked list structure */
+rwlock_t ip_fw_lock = RW_LOCK_UNLOCKED;
+
+/* Head of linked list of fw rules */
+static struct ip_chain *ip_fw_chains;
+
+#define IP_FW_INPUT_CHAIN ip_fw_chains
+#define IP_FW_FORWARD_CHAIN (ip_fw_chains->next)
+#define IP_FW_OUTPUT_CHAIN (ip_fw_chains->next->next)
+
+/* Returns 1 if the port is matched by the range, 0 otherwise */
+extern inline int port_match(__u16 min, __u16 max, __u16 port,
+ int frag, int invert)
+{
+ if (frag) /* Fragments fail ANY port test. */
+ return (min == 0 && max == 0xFFFF);
+ else return (port >= min && port <= max) ^ invert;
+}
+
+/* Returns whether matches rule or not. */
+static int ip_rule_match(struct ip_fwkernel *f,
+ const char *ifname,
+ struct iphdr *ip,
+ char tcpsyn,
+ __u16 src_port, __u16 dst_port,
+ char isfrag)
+{
+#define FWINV(bool,invflg) ((bool) ^ !!(f->ipfw.fw_invflg & invflg))
+ /*
+ * This is a bit simpler as we don't have to walk
+ * an interface chain as you do in BSD - same logic
+ * however.
+ */
+
+ if (FWINV((ip->saddr&f->ipfw.fw_smsk.s_addr) != f->ipfw.fw_src.s_addr,
+ IP_FW_INV_SRCIP)
+ || FWINV((ip->daddr&f->ipfw.fw_dmsk.s_addr)!=f->ipfw.fw_dst.s_addr,
+ IP_FW_INV_DSTIP)) {
+ dprintf("Source or dest mismatch.\n");
+
+ dprintf("SRC: %u. Mask: %u. Target: %u.%s\n", ip->saddr,
+ f->ipfw.fw_smsk.s_addr, f->ipfw.fw_src.s_addr,
+ f->ipfw.fw_invflg & IP_FW_INV_SRCIP ? " (INV)" : "");
+ dprintf("DST: %u. Mask: %u. Target: %u.%s\n", ip->daddr,
+ f->ipfw.fw_dmsk.s_addr, f->ipfw.fw_dst.s_addr,
+ f->ipfw.fw_invflg & IP_FW_INV_DSTIP ? " (INV)" : "");
+ return 0;
+ }
+
+ /*
+ * Look for a VIA device match
+ */
+ if (f->ipfw.fw_flg & IP_FW_F_WILDIF) {
+ if (FWINV(strncmp(ifname, f->ipfw.fw_vianame,
+ strlen(f->ipfw.fw_vianame)) != 0,
+ IP_FW_INV_VIA)) {
+ dprintf("Wildcard interface mismatch.%s\n",
+ f->ipfw.fw_invflg & IP_FW_INV_VIA ? " (INV)" : "");
+ return 0; /* Mismatch */
+ }
+ }
+ else if (FWINV(strcmp(ifname, f->ipfw.fw_vianame) != 0,
+ IP_FW_INV_VIA)) {
+ dprintf("Interface name does not match.%s\n",
+ f->ipfw.fw_invflg & IP_FW_INV_VIA
+ ? " (INV)" : "");
+ return 0; /* Mismatch */
+ }
+
+ /*
+ * Ok the chain addresses match.
+ */
+
+ /* If we have a fragment rule but the packet is not a fragment
+ * the we return zero */
+ if (FWINV((f->ipfw.fw_flg&IP_FW_F_FRAG) && !isfrag, IP_FW_INV_FRAG)) {
+ dprintf("Fragment rule but not fragment.%s\n",
+ f->ipfw.fw_invflg & IP_FW_INV_FRAG ? " (INV)" : "");
+ return 0;
+ }
+
+ /* Fragment NEVER passes a SYN test, even an inverted one. */
+ if (FWINV((f->ipfw.fw_flg&IP_FW_F_TCPSYN) && !tcpsyn, IP_FW_INV_SYN)
+ || (isfrag && (f->ipfw.fw_flg&IP_FW_F_TCPSYN))) {
+ dprintf("Rule requires SYN and packet has no SYN.%s\n",
+ f->ipfw.fw_invflg & IP_FW_INV_SYN ? " (INV)" : "");
+ return 0;
+ }
+
+ if (f->ipfw.fw_proto) {
+ /*
+ * Specific firewall - packet's protocol
+ * must match firewall's.
+ */
+
+ if (FWINV(ip->protocol!=f->ipfw.fw_proto, IP_FW_INV_PROTO)) {
+ dprintf("Packet protocol %hi does not match %hi.%s\n",
+ ip->protocol, f->ipfw.fw_proto,
+ f->ipfw.fw_invflg&IP_FW_INV_PROTO ? " (INV)":"");
+ return 0;
+ }
+
+ /* For non TCP/UDP/ICMP, port range is max anyway. */
+ if (!port_match(f->ipfw.fw_spts[0],
+ f->ipfw.fw_spts[1],
+ src_port, isfrag,
+ !!(f->ipfw.fw_invflg&IP_FW_INV_SRCPT))
+ || !port_match(f->ipfw.fw_dpts[0],
+ f->ipfw.fw_dpts[1],
+ dst_port, isfrag,
+ !!(f->ipfw.fw_invflg
+ &IP_FW_INV_DSTPT))) {
+ dprintf("Port match failed.\n");
+ return 0;
+ }
+ }
+
+ dprintf("Match succeeded.\n");
+ return 1;
+}
+
+static const char *branchname(struct ip_chain *branch,int simplebranch)
+{
+ if (branch)
+ return branch->label;
+ switch (simplebranch)
+ {
+ case FW_BLOCK: return IP_FW_LABEL_BLOCK;
+ case FW_ACCEPT: return IP_FW_LABEL_ACCEPT;
+ case FW_REJECT: return IP_FW_LABEL_REJECT;
+ case FW_REDIRECT: return IP_FW_LABEL_REDIRECT;
+ case FW_MASQUERADE: return IP_FW_LABEL_MASQUERADE;
+ case FW_SKIP: return "-";
+ case FW_SKIP+1: return IP_FW_LABEL_RETURN;
+ default:
+ return "UNKNOWN";
+ }
+}
+
+/*
+ * VERY ugly piece of code which actually
+ * makes kernel printf for matching packets...
+ */
+static void dump_packet(const struct iphdr *ip,
+ const char *ifname,
+ struct ip_fwkernel *f,
+ const ip_chainlabel chainlabel,
+ __u16 src_port,
+ __u16 dst_port,
+ unsigned int count,
+ int syn)
+{
+ __u32 *opt = (__u32 *) (ip + 1);
+ int opti;
+
+ if (f) {
+ printk(KERN_INFO "Packet log: %s ",chainlabel);
+ printk("%s ",branchname(f->branch,f->simplebranch));
+ if (f->simplebranch==FW_REDIRECT)
+ printk("%d ",f->ipfw.fw_redirpt);
+ }
+
+ printk("%s PROTO=%d %d.%d.%d.%d:%hu %d.%d.%d.%d:%hu"
+ " L=%hu S=0x%2.2hX I=%hu F=0x%4.4hX T=%hu",
+ ifname, ip->protocol,
+ (ntohl(ip->saddr)>>24)&0xFF,
+ (ntohl(ip->saddr)>>16)&0xFF,
+ (ntohl(ip->saddr)>>8)&0xFF,
+ (ntohl(ip->saddr))&0xFF,
+ src_port,
+ (ntohl(ip->daddr)>>24)&0xFF,
+ (ntohl(ip->daddr)>>16)&0xFF,
+ (ntohl(ip->daddr)>>8)&0xFF,
+ (ntohl(ip->daddr))&0xFF,
+ dst_port,
+ ntohs(ip->tot_len), ip->tos, ntohs(ip->id),
+ ntohs(ip->frag_off), ip->ttl);
+
+ for (opti = 0; opti < (ip->ihl - sizeof(struct iphdr) / 4); opti++)
+ printk(" O=0x%8.8X", *opt++);
+ printk(" %s(#%d)\n", syn ? "SYN " : /* "PENANCE" */ "", count);
+}
+
+/* function for checking chain labels for user space. */
+static int check_label(ip_chainlabel label)
+{
+ unsigned int i;
+ /* strlen must be < IP_FW_MAX_LABEL_LENGTH. */
+ for (i = 0; i < IP_FW_MAX_LABEL_LENGTH + 1; i++)
+ if (label[i] == '\0') return 1;
+
+ return 0;
+}
+
+/* This function returns a pointer to the first chain with a label
+ * that matches the one given. */
+static struct ip_chain *find_label(ip_chainlabel label)
+{
+ struct ip_chain *tmp;
+ FWC_HAVE_LOCK(fwc_rlocks | fwc_wlocks);
+ for (tmp = ip_fw_chains; tmp; tmp = tmp->next)
+ if (strcmp(tmp->label,label) == 0)
+ break;
+ return tmp;
+}
+
+/* This function returns a boolean which when true sets answer to one
+ of the FW_*. */
+static int find_special(ip_chainlabel label, int *answer)
+{
+ if (label[0] == '\0') {
+ *answer = FW_SKIP; /* => pass-through rule */
+ return 1;
+ } else if (strcmp(label,IP_FW_LABEL_ACCEPT) == 0) {
+ *answer = FW_ACCEPT;
+ return 1;
+ } else if (strcmp(label,IP_FW_LABEL_BLOCK) == 0) {
+ *answer = FW_BLOCK;
+ return 1;
+ } else if (strcmp(label,IP_FW_LABEL_REJECT) == 0) {
+ *answer = FW_REJECT;
+ return 1;
+ } else if (strcmp(label,IP_FW_LABEL_REDIRECT) == 0) {
+ *answer = FW_REDIRECT;
+ return 1;
+ } else if (strcmp(label,IP_FW_LABEL_MASQUERADE) == 0) {
+ *answer = FW_MASQUERADE;
+ return 1;
+ } else if (strcmp(label, IP_FW_LABEL_RETURN) == 0) {
+ *answer = FW_SKIP+1;
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+/* This function cleans up the prevchain and prevrule. If the verbose
+ * flag is set then he names of the chains will be printed as it
+ * cleans up. */
+static void cleanup(struct ip_chain *chain,
+ const int verbose,
+ unsigned int slot)
+{
+ struct ip_chain *tmpchain = chain->reent[slot].prevchain;
+ if (verbose)
+ printk(KERN_ERR "Chain backtrace: ");
+ while (tmpchain) {
+ if (verbose)
+ printk("%s<-",chain->label);
+ chain->reent[slot].prevchain = NULL;
+ chain = tmpchain;
+ tmpchain = chain->reent[slot].prevchain;
+ }
+ if (verbose)
+ printk("%s\n",chain->label);
+}
+
+static inline int
+ip_fw_domatch(struct ip_fwkernel *f,
+ struct iphdr *ip,
+ const char *rif,
+ const ip_chainlabel label,
+ struct sk_buff *skb,
+ unsigned int slot,
+ __u16 src_port, __u16 dst_port,
+ unsigned int count,
+ int tcpsyn)
+{
+ f->counters[slot].bcnt+=ntohs(ip->tot_len);
+ f->counters[slot].pcnt++;
+ if (f->ipfw.fw_flg & IP_FW_F_PRN) {
+ dump_packet(ip,rif,f,label,src_port,dst_port,count,tcpsyn);
+ }
+ ip->tos = (ip->tos & f->ipfw.fw_tosand) ^ f->ipfw.fw_tosxor;
+
+/* This functionality is useless in stock 2.0.x series, but we don't
+ * discard the mark thing altogether, to avoid breaking ipchains (and,
+ * more importantly, the ipfwadm wrapper) --PR */
+ if (f->ipfw.fw_flg & IP_FW_F_MARKABS) {
+ skb->nfmark = f->ipfw.fw_mark;
+ } else {
+ skb->nfmark += f->ipfw.fw_mark;
+ }
+ if (f->ipfw.fw_flg & IP_FW_F_NETLINK) {
+#if defined(CONFIG_NETLINK_DEV) || defined(CONFIG_NETLINK_DEV_MODULE)
+ size_t len = min(f->ipfw.fw_outputsize, ntohs(ip->tot_len))
+ + sizeof(__u32) + sizeof(skb->nfmark) + IFNAMSIZ;
+ struct sk_buff *outskb=alloc_skb(len, GFP_ATOMIC);
+
+ duprintf("Sending packet out NETLINK (length = %u).\n",
+ (unsigned int)len);
+ if (outskb) {
+ /* Prepend length, mark & interface */
+ skb_put(outskb, len);
+ *((__u32 *)outskb->data) = (__u32)len;
+ *((__u32 *)(outskb->data+sizeof(__u32))) = skb->nfmark;
+ strcpy(outskb->data+sizeof(__u32)*2, rif);
+ memcpy(outskb->data+sizeof(__u32)*2+IFNAMSIZ, ip,
+ len-(sizeof(__u32)*2+IFNAMSIZ));
+ netlink_broadcast(ipfwsk, outskb, 0, ~0, GFP_KERNEL);
+ }
+ else {
+#endif
+ if (net_ratelimit())
+ printk(KERN_WARNING "ip_fw: packet drop due to "
+ "netlink failure\n");
+ return 0;
+#if defined(CONFIG_NETLINK_DEV) || defined(CONFIG_NETLINK_DEV_MODULE)
+ }
+#endif
+ }
+ return 1;
+}
+
+/*
+ * Returns one of the generic firewall policies, like FW_ACCEPT.
+ *
+ * The testing is either false for normal firewall mode or true for
+ * user checking mode (counters are not updated, TOS & mark not done).
+ */
+static int
+ip_fw_check(struct iphdr *ip,
+ const char *rif,
+ __u16 *redirport,
+ struct ip_chain *chain,
+ struct sk_buff *skb,
+ unsigned int slot,
+ int testing)
+{
+ struct tcphdr *tcp=(struct tcphdr *)((__u32 *)ip+ip->ihl);
+ struct udphdr *udp=(struct udphdr *)((__u32 *)ip+ip->ihl);
+ struct icmphdr *icmp=(struct icmphdr *)((__u32 *)ip+ip->ihl);
+ __u32 src, dst;
+ __u16 src_port = 0xFFFF, dst_port = 0xFFFF;
+ char tcpsyn=0;
+ __u16 offset;
+ unsigned char oldtos;
+ struct ip_fwkernel *f;
+ int ret = FW_SKIP+2;
+ unsigned int count;
+
+ /* We handle fragments by dealing with the first fragment as
+ * if it was a normal packet. All other fragments are treated
+ * normally, except that they will NEVER match rules that ask
+ * things we don't know, ie. tcp syn flag or ports). If the
+ * rule is also a fragment-specific rule, non-fragments won't
+ * match it. */
+
+ offset = ntohs(ip->frag_off) & IP_OFFSET;
+
+ /*
+ * Don't allow a fragment of TCP 8 bytes in. Nobody
+ * normal causes this. Its a cracker trying to break
+ * in by doing a flag overwrite to pass the direction
+ * checks.
+ */
+ if (offset == 1 && ip->protocol == IPPROTO_TCP) {
+ if (!testing && net_ratelimit()) {
+ printk("Suspect TCP fragment.\n");
+ dump_packet(ip,rif,NULL,NULL,0,0,0,0);
+ }
+ return FW_BLOCK;
+ }
+
+ /* If we can't investigate ports, treat as fragment. It's
+ * either a trucated whole packet, or a truncated first
+ * fragment, or a TCP first fragment of length 8-15, in which
+ * case the above rule stops reassembly.
+ */
+ if (offset == 0) {
+ unsigned int size_req;
+ switch (ip->protocol) {
+ case IPPROTO_TCP:
+ /* Don't care about things past flags word */
+ size_req = 16;
+ break;
+
+ case IPPROTO_UDP:
+ case IPPROTO_ICMP:
+ size_req = 8;
+ break;
+
+ default:
+ size_req = 0;
+ }
+
+ /* If it is a truncated first fragment then it can be
+ * used to rewrite port information, and thus should
+ * be blocked.
+ */
+ if (ntohs(ip->tot_len) < (ip->ihl<<2)+size_req) {
+ if (!testing && net_ratelimit()) {
+ printk("Suspect short first fragment.\n");
+ dump_packet(ip,rif,NULL,NULL,0,0,0,0);
+ }
+ return FW_BLOCK;
+ }
+ }
+
+ src = ip->saddr;
+ dst = ip->daddr;
+ oldtos = ip->tos;
+
+ /*
+ * If we got interface from which packet came
+ * we can use the address directly. Linux 2.1 now uses address
+ * chains per device too, but unlike BSD we first check if the
+ * incoming packet matches a device address and the routing
+ * table before calling the firewall.
+ */
+
+ dprintf("Packet ");
+ switch(ip->protocol)
+ {
+ case IPPROTO_TCP:
+ dprintf("TCP ");
+ if (!offset) {
+ src_port=ntohs(tcp->source);
+ dst_port=ntohs(tcp->dest);
+
+ /* Connection initilisation can only
+ * be made when the syn bit is set and
+ * neither of the ack or reset is
+ * set. */
+ if(tcp->syn && !(tcp->ack || tcp->rst))
+ tcpsyn=1;
+ }
+ break;
+ case IPPROTO_UDP:
+ dprintf("UDP ");
+ if (!offset) {
+ src_port=ntohs(udp->source);
+ dst_port=ntohs(udp->dest);
+ }
+ break;
+ case IPPROTO_ICMP:
+ if (!offset) {
+ src_port=(__u16)icmp->type;
+ dst_port=(__u16)icmp->code;
+ }
+ dprintf("ICMP ");
+ break;
+ default:
+ dprintf("p=%d ",ip->protocol);
+ break;
+ }
+#ifdef DEBUG_IP_FIREWALL
+ print_ip(ip->saddr);
+
+ if (offset)
+ dprintf(":fragment (%i) ", ((int)offset)<<2);
+ else if (ip->protocol==IPPROTO_TCP || ip->protocol==IPPROTO_UDP
+ || ip->protocol==IPPROTO_ICMP)
+ dprintf(":%hu:%hu", src_port, dst_port);
+ dprintf("\n");
+#endif
+
+ if (!testing) FWC_READ_LOCK(&ip_fw_lock);
+ else FWC_HAVE_LOCK(fwc_rlocks);
+
+ f = chain->chain;
+ do {
+ count = 0;
+ for (; f; f = f->next) {
+ count++;
+ if (ip_rule_match(f,rif,ip,
+ tcpsyn,src_port,dst_port,offset)) {
+ if (!testing
+ && !ip_fw_domatch(f, ip, rif, chain->label,
+ skb, slot,
+ src_port, dst_port,
+ count, tcpsyn)) {
+ ret = FW_BLOCK;
+ goto out;
+ }
+ break;
+ }
+ }
+ if (f) {
+ if (f->branch) {
+ /* Do sanity check to see if we have
+ * already set prevchain and if so we
+ * must be in a loop */
+ if (f->branch->reent[slot].prevchain) {
+ if (!testing) {
+ printk(KERN_ERR
+ "IP firewall: "
+ "Loop detected "
+ "at `%s'.\n",
+ f->branch->label);
+ cleanup(chain, 1, slot);
+ ret = FW_BLOCK;
+ } else {
+ cleanup(chain, 0, slot);
+ ret = FW_SKIP+1;
+ }
+ }
+ else {
+ f->branch->reent[slot].prevchain
+ = chain;
+ f->branch->reent[slot].prevrule
+ = f->next;
+ chain = f->branch;
+ f = chain->chain;
+ }
+ }
+ else if (f->simplebranch == FW_SKIP)
+ f = f->next;
+ else if (f->simplebranch == FW_SKIP+1) {
+ /* Just like falling off the chain */
+ goto fall_off_chain;
+ } else {
+ cleanup(chain, 0, slot);
+ ret = f->simplebranch;
+ }
+ } /* f == NULL */
+ else {
+ fall_off_chain:
+ if (chain->reent[slot].prevchain) {
+ struct ip_chain *tmp = chain;
+ f = chain->reent[slot].prevrule;
+ chain = chain->reent[slot].prevchain;
+ tmp->reent[slot].prevchain = NULL;
+ }
+ else {
+ ret = chain->policy;
+ if (!testing) {
+ chain->reent[slot].counters.pcnt++;
+ chain->reent[slot].counters.bcnt
+ += ntohs(ip->tot_len);
+ }
+ }
+ }
+ } while (ret == FW_SKIP+2);
+
+ out:
+ if (!testing) FWC_READ_UNLOCK(&ip_fw_lock);
+
+ /* Recalculate checksum if not going to reject, and TOS changed. */
+ if (ip->tos != oldtos
+ && ret != FW_REJECT && ret != FW_BLOCK
+ && !testing)
+ ip_send_check(ip);
+
+ if (ret == FW_REDIRECT && redirport) {
+ if ((*redirport = htons(f->ipfw.fw_redirpt)) == 0) {
+ /* Wildcard redirection.
+ * Note that redirport will become
+ * 0xFFFF for non-TCP/UDP packets.
+ */
+ *redirport = htons(dst_port);
+ }
+ }
+
+#ifdef DEBUG_ALLOW_ALL
+ return (testing ? ret : FW_ACCEPT);
+#else
+ return ret;
+#endif
+}
+
+/* Must have write lock & interrupts off for any of these */
+
+/* This function sets all the byte counters in a chain to zero. The
+ * input is a pointer to the chain required for zeroing.
+ * Clears the per-slot packet and byte counters of every rule in the
+ * chain (NUM_SLOTS ip_counters entries per rule).  Always returns 0. */
+static int zero_fw_chain(struct ip_chain *chainptr)
+{
+	struct ip_fwkernel *i;
+
+	FWC_HAVE_LOCK(fwc_wlocks);	/* caller must hold the write lock */
+	for (i = chainptr->chain; i; i = i->next)
+		memset(i->counters, 0, sizeof(struct ip_counters)*NUM_SLOTS);
+	return 0;
+}
+
+/* Unlink and free every rule in the chain.  Each rule that branched to
+ * another chain drops that target chain's refcount.  The chain head
+ * itself is emptied but not deleted.  Always returns 0; caller must
+ * hold the write lock. */
+static int clear_fw_chain(struct ip_chain *chainptr)
+{
+	struct ip_fwkernel *i= chainptr->chain;
+
+	FWC_HAVE_LOCK(fwc_wlocks);
+	chainptr->chain=NULL;	/* detach list first, then free it */
+
+	while (i) {
+		struct ip_fwkernel *tmp = i->next;
+		if (i->branch)
+			i->branch->refcount--;	/* rule no longer references target */
+		kfree(i);
+		i = tmp;
+	}
+	return 0;
+}
+
+/* Replace the rule at 1-based `position' in chainptr with frwl.
+ * On success the new rule's contents are copied over the old node in
+ * place (so list links to it stay valid) and frwl itself is freed;
+ * branch refcounts are moved from the old rule to the new one.
+ * Returns 0 on success, EINVAL (positive, as elsewhere in this file)
+ * if position is out of range.  This function owns frwl: it frees it
+ * on ALL paths, since the caller (ip_fw_ctl) does not free on error.
+ * Caller must hold the write lock. */
+static int replace_in_chain(struct ip_chain *chainptr,
+			    struct ip_fwkernel *frwl,
+			    __u32 position)
+{
+	struct ip_fwkernel *f = chainptr->chain;
+
+	FWC_HAVE_LOCK(fwc_wlocks);
+
+	while (--position && f != NULL) f = f->next;
+	if (f == NULL) {
+		/* Bad rule number: free frwl rather than leaking it
+		 * (no branch refcount has been taken yet). */
+		kfree(frwl);
+		return EINVAL;
+	}
+
+	if (f->branch) f->branch->refcount--;
+	if (frwl->branch) frwl->branch->refcount++;
+
+	frwl->next = f->next;
+	memcpy(f,frwl,sizeof(struct ip_fwkernel));
+	kfree(frwl);
+	return 0;
+}
+
+/* Append `rule' to the end of chainptr, taking a refcount on the
+ * target chain if the rule branches.  Always returns 0 (ownership of
+ * `rule' passes to the chain).  Caller must hold the write lock. */
+static int append_to_chain(struct ip_chain *chainptr, struct ip_fwkernel *rule)
+{
+	struct ip_fwkernel *i;
+
+	FWC_HAVE_LOCK(fwc_wlocks);
+	/* Special case if no rules already present */
+	if (chainptr->chain == NULL) {
+
+		/* If pointer writes are atomic then turning off
+		 * interrupts is not necessary. */
+		chainptr->chain = rule;
+		if (rule->branch) rule->branch->refcount++;
+		return 0;
+	}
+
+	/* Find the rule before the end of the chain */
+	for (i = chainptr->chain; i->next; i = i->next);
+	i->next = rule;
+	if (rule->branch) rule->branch->refcount++;
+	return 0;
+}
+
+/* This function inserts a rule at the position of position in the
+ * chain referenced by chainptr. If position is 1 then this rule will
+ * become the new rule one.
+ * Returns 0 on success, EINVAL (positive, as elsewhere in this file)
+ * if position is past the end of the chain.  This function owns frwl:
+ * on failure it frees it, since the caller (ip_fw_ctl) does not free
+ * on error.  Caller must hold the write lock. */
+static int insert_in_chain(struct ip_chain *chainptr,
+			   struct ip_fwkernel *frwl,
+			   __u32 position)
+{
+	struct ip_fwkernel *f = chainptr->chain;
+
+	FWC_HAVE_LOCK(fwc_wlocks);
+	/* special case if the position is number 1 */
+	if (position == 1) {
+		frwl->next = chainptr->chain;
+		if (frwl->branch) frwl->branch->refcount++;
+		chainptr->chain = frwl;
+		return 0;
+	}
+	position--;
+	while (--position && f != NULL) f = f->next;
+	if (f == NULL) {
+		/* Bad rule number: free frwl rather than leaking it
+		 * (no branch refcount has been taken yet). */
+		kfree(frwl);
+		return EINVAL;
+	}
+	if (frwl->branch) frwl->branch->refcount++;
+	frwl->next = f->next;
+
+	f->next = frwl;
+	return 0;
+}
+
+/* This function deletes a rule with a given rulenum from a chain.
+ * With rulenum = 1 the first rule is deleted.
+ * Returns 0 on success or ENOENT (positive, as elsewhere in this
+ * file) if the chain is empty or rulenum is past the end.  Drops the
+ * branch target's refcount if the deleted rule had one.  Caller must
+ * hold the write lock. */
+
+static int del_num_from_chain(struct ip_chain *chainptr, __u32 rulenum)
+{
+	struct ip_fwkernel *i=chainptr->chain,*tmp;
+
+	FWC_HAVE_LOCK(fwc_wlocks);
+
+	if (!chainptr->chain)
+		return ENOENT;
+
+	/* Need a special case for the first rule */
+	if (rulenum == 1) {
+		/* store temp to allow for freeing up of memory */
+		tmp = chainptr->chain;
+		if (chainptr->chain->branch) chainptr->chain->branch->refcount--;
+		chainptr->chain = chainptr->chain->next;
+		kfree(tmp); /* free memory that is now unused */
+	} else {
+		rulenum--;
+		while (--rulenum && i->next ) i = i->next;
+		if (!i->next)
+			return ENOENT;
+		tmp = i->next;
+		if (i->next->branch)
+			i->next->branch->refcount--;
+		i->next = i->next->next;
+		kfree(tmp);
+	}
+	return 0;
+}
+
+
+/* This function deletes a rule matching `frwl' from a chain.
+ * The rule that is deleted is the first occurrence of that rule.
+ * A rule matches when all user-visible fields agree (addresses, masks,
+ * flags with IP_FW_F_MARKABS masked out, inverse flags, protocol,
+ * redirect port, port ranges, output size, interface name, branch
+ * target, and simplebranch for non-branching rules).  fw_mark is
+ * deliberately NOT compared -- see the note below.
+ * On a match the rule is unlinked, its branch refcount dropped, and
+ * the node freed; returns 0.  Returns EINVAL (positive, as elsewhere
+ * in this file) if nothing matched.  frwl itself is NOT freed here --
+ * the caller (ip_fw_ctl IP_FW_DELETE) frees it.  Caller must hold the
+ * write lock. */
+static int del_rule_from_chain(struct ip_chain *chainptr,
+			       struct ip_fwkernel *frwl)
+{
+	struct ip_fwkernel *ltmp,*ftmp = chainptr->chain ;
+	int was_found;
+
+	FWC_HAVE_LOCK(fwc_wlocks);
+
+	/* Sure, we should compare marks, but since the `ipfwadm'
+	 * script uses it for an unholy hack... well, life is easier
+	 * this way.  We also mask it out of the flags word. --PR */
+	for (ltmp=NULL, was_found=0;
+	     !was_found && ftmp != NULL;
+	     ltmp = ftmp,ftmp = ftmp->next) {
+		if (ftmp->ipfw.fw_src.s_addr!=frwl->ipfw.fw_src.s_addr
+		    || ftmp->ipfw.fw_dst.s_addr!=frwl->ipfw.fw_dst.s_addr
+		    || ftmp->ipfw.fw_smsk.s_addr!=frwl->ipfw.fw_smsk.s_addr
+		    || ftmp->ipfw.fw_dmsk.s_addr!=frwl->ipfw.fw_dmsk.s_addr
+#if 0
+		    || ftmp->ipfw.fw_flg!=frwl->ipfw.fw_flg
+#else
+		    || ((ftmp->ipfw.fw_flg & ~IP_FW_F_MARKABS)
+			!= (frwl->ipfw.fw_flg & ~IP_FW_F_MARKABS))
+#endif
+		    || ftmp->ipfw.fw_invflg!=frwl->ipfw.fw_invflg
+		    || ftmp->ipfw.fw_proto!=frwl->ipfw.fw_proto
+#if 0
+		    || ftmp->ipfw.fw_mark!=frwl->ipfw.fw_mark
+#endif
+		    || ftmp->ipfw.fw_redirpt!=frwl->ipfw.fw_redirpt
+		    || ftmp->ipfw.fw_spts[0]!=frwl->ipfw.fw_spts[0]
+		    || ftmp->ipfw.fw_spts[1]!=frwl->ipfw.fw_spts[1]
+		    || ftmp->ipfw.fw_dpts[0]!=frwl->ipfw.fw_dpts[0]
+		    || ftmp->ipfw.fw_dpts[1]!=frwl->ipfw.fw_dpts[1]
+		    || ftmp->ipfw.fw_outputsize!=frwl->ipfw.fw_outputsize) {
+			/* Debug-only dump of every compared field pair. */
+			duprintf("del_rule_from_chain: mismatch:"
+				 "src:%u/%u dst:%u/%u smsk:%u/%u dmsk:%u/%u "
+				 "flg:%hX/%hX invflg:%hX/%hX proto:%u/%u "
+				 "mark:%u/%u "
+				 "ports:%hu-%hu/%hu-%hu %hu-%hu/%hu-%hu "
+				 "outputsize:%hu-%hu\n",
+				 ftmp->ipfw.fw_src.s_addr,
+				 frwl->ipfw.fw_src.s_addr,
+				 ftmp->ipfw.fw_dst.s_addr,
+				 frwl->ipfw.fw_dst.s_addr,
+				 ftmp->ipfw.fw_smsk.s_addr,
+				 frwl->ipfw.fw_smsk.s_addr,
+				 ftmp->ipfw.fw_dmsk.s_addr,
+				 frwl->ipfw.fw_dmsk.s_addr,
+				 ftmp->ipfw.fw_flg,
+				 frwl->ipfw.fw_flg,
+				 ftmp->ipfw.fw_invflg,
+				 frwl->ipfw.fw_invflg,
+				 ftmp->ipfw.fw_proto,
+				 frwl->ipfw.fw_proto,
+				 ftmp->ipfw.fw_mark,
+				 frwl->ipfw.fw_mark,
+				 ftmp->ipfw.fw_spts[0],
+				 frwl->ipfw.fw_spts[0],
+				 ftmp->ipfw.fw_spts[1],
+				 frwl->ipfw.fw_spts[1],
+				 ftmp->ipfw.fw_dpts[0],
+				 frwl->ipfw.fw_dpts[0],
+				 ftmp->ipfw.fw_dpts[1],
+				 frwl->ipfw.fw_dpts[1],
+				 ftmp->ipfw.fw_outputsize,
+				 frwl->ipfw.fw_outputsize);
+			continue;
+		}
+
+		if (strncmp(ftmp->ipfw.fw_vianame,
+			    frwl->ipfw.fw_vianame,
+			    IFNAMSIZ)) {
+			duprintf("del_rule_from_chain: if mismatch: %s/%s\n",
+				 ftmp->ipfw.fw_vianame,
+				 frwl->ipfw.fw_vianame);
+			continue;
+		}
+		if (ftmp->branch != frwl->branch) {
+			duprintf("del_rule_from_chain: branch mismatch: "
+				 "%s/%s\n",
+				 ftmp->branch?ftmp->branch->label:"(null)",
+				 frwl->branch?frwl->branch->label:"(null)");
+			continue;
+		}
+		if (ftmp->branch == NULL
+		    && ftmp->simplebranch != frwl->simplebranch) {
+			duprintf("del_rule_from_chain: simplebranch mismatch: "
+				 "%i/%i\n",
+				 ftmp->simplebranch, frwl->simplebranch);
+			continue;
+		}
+		/* Full match: unlink (ltmp is the previous node, or NULL
+		 * when deleting the head) and free the rule. */
+		was_found = 1;
+		if (ftmp->branch)
+			ftmp->branch->refcount--;
+		if (ltmp)
+			ltmp->next = ftmp->next;
+		else
+			chainptr->chain = ftmp->next;
+		kfree(ftmp);
+		break;
+	}
+
+	if (was_found)
+		return 0;
+	else {
+		duprintf("del_rule_from_chain: no matching rule found\n");
+		return EINVAL;
+	}
+}
+
+/* This function takes the label of a chain and deletes the first
+ * chain with that name.  No special cases required for the built in
+ * chains as they have their refcount initialised to 1 so that they are
+ * never deleted.
+ * Returns (positive errno style, as elsewhere in this file):
+ *   EBUSY     - chain is the first chain ("input") or still referenced
+ *   ENOENT    - no chain with that label
+ *   ENOTEMPTY - chain still contains rules
+ *   0         - chain unlinked and freed.
+ * Caller must hold the write lock. */
+static int del_chain(ip_chainlabel label)
+{
+	struct ip_chain *tmp,*tmp2;
+
+	FWC_HAVE_LOCK(fwc_wlocks);
+	/* Corner case: return EBUSY not ENOENT for first elem ("input") */
+	if (strcmp(label, ip_fw_chains->label) == 0)
+		return EBUSY;
+
+	/* Walk to the node BEFORE the one to delete, so it can be unlinked. */
+	for (tmp = ip_fw_chains; tmp->next; tmp = tmp->next)
+		if(strcmp(tmp->next->label,label) == 0)
+			break;
+
+	tmp2 = tmp->next;
+	if (!tmp2)
+		return ENOENT;
+
+	if (tmp2->refcount)
+		return EBUSY;
+
+	if (tmp2->chain)
+		return ENOTEMPTY;
+
+	tmp->next = tmp2->next;
+	kfree(tmp2);
+	return 0;
+}
+
+/* This is a function to initialise a chain.  Built in rules start with
+ * refcount = 1 so that they cannot be deleted.  User defined rules
+ * start with refcount = 0 so they can be deleted.
+ * Allocates and returns a new chain with the given label, refcount and
+ * default policy, with per-slot counters and re-entry state zeroed.
+ * NOTE: panics on allocation failure rather than returning NULL, so
+ * callers never see a NULL result.  Uses GFP_KERNEL, so it may sleep. */
+static struct ip_chain *ip_init_chain(ip_chainlabel name,
+				      __u32 ref,
+				      int policy)
+{
+	unsigned int i;
+	struct ip_chain *label
+		= kmalloc(SIZEOF_STRUCT_IP_CHAIN, GFP_KERNEL);
+	if (label == NULL)
+		panic("Can't kmalloc for firewall chains.\n");
+	strcpy(label->label,name);
+	label->next = NULL;
+	label->chain = NULL;
+	label->refcount = ref;
+	label->policy = policy;
+	/* One reentrancy slot per CPU*2 (matches NUM_SLOTS sizing used
+	 * elsewhere in this file). */
+	for (i = 0; i < smp_num_cpus*2; i++) {
+		label->reent[i].counters.pcnt = label->reent[i].counters.bcnt
+			= 0;
+		label->reent[i].prevchain = NULL;
+		label->reent[i].prevrule = NULL;
+	}
+
+	return label;
+}
+
+/* This is a function for creating a new chain.  The chain is not
+ * created if a chain of the same name already exists.
+ * Returns EINVAL for a malformed label, EEXIST for a duplicate name,
+ * 0 on success (positive errno style, as elsewhere in this file).
+ * Caller must hold the write lock. */
+static int create_chain(ip_chainlabel label)
+{
+	struct ip_chain *tmp;
+
+	if (!check_label(label))
+		return EINVAL;
+
+	FWC_HAVE_LOCK(fwc_wlocks);
+	/* The loop body checks every node except the last (it stops when
+	 * tmp->next is NULL), so the last node is checked separately
+	 * below; tmp then points at the tail for appending. */
+	for (tmp = ip_fw_chains; tmp->next; tmp = tmp->next)
+		if (strcmp(tmp->label,label) == 0)
+			return EEXIST;
+
+	if (strcmp(tmp->label,label) == 0)
+		return EEXIST;
+
+	tmp->next = ip_init_chain(label, 0, FW_SKIP); /* refcount is
+					       * zero since this is a
+					       * user defined chain *
+					       * and therefore can be
+					       * deleted */
+	return 0;
+}
+
+/* This function simply changes the policy on one of the built in
+ * chains.  Checking must be done before this is called to ensure that
+ * chainptr is pointing to one of the three possible chains (no
+ * validation of `policy' happens here either).  Always returns 0.
+ * Caller must hold the write lock. */
+static int change_policy(struct ip_chain *chainptr, int policy)
+{
+	FWC_HAVE_LOCK(fwc_wlocks);
+	chainptr->policy = policy;
+	return 0;
+}
+
+/* This function takes an ip_fwuser and converts it to an ip_fwkernel.
+ * It also performs some checks in the structure.
+ * On success returns a freshly kmalloc'd rule (counters zeroed, next
+ * NULL, branch resolved from the label or simplebranch set, empty
+ * vianame turned into a wildcard).  On failure returns NULL and stores
+ * a positive errno value through *errno (EINVAL for bad flag/port
+ * combinations, ENOMEM on allocation failure, ENOENT for a missing or
+ * built-in branch target).  NOTE: the parameter named `errno' shadows
+ * the C library's errno symbol; it is a plain out-parameter here.
+ * The extra sanity checks are compiled only with DEBUG_IP_FIREWALL_USER. */
+static struct ip_fwkernel *convert_ipfw(struct ip_fwuser *fwuser, int *errno)
+{
+	struct ip_fwkernel *fwkern;
+
+	/* Reject flag bits outside the defined mask. */
+	if ( (fwuser->ipfw.fw_flg & ~IP_FW_F_MASK) != 0 ) {
+		duprintf("convert_ipfw: undefined flag bits set (flags=%x)\n",
+			 fwuser->ipfw.fw_flg);
+		*errno = EINVAL;
+		return NULL;
+	}
+
+#ifdef DEBUG_IP_FIREWALL_USER
+	/* These are sanity checks that don't really matter.
+	 * We can get rid of these once testing is complete.
+	 */
+	if ((fwuser->ipfw.fw_flg & IP_FW_F_TCPSYN)
+	    && ((fwuser->ipfw.fw_invflg & IP_FW_INV_PROTO)
+		|| fwuser->ipfw.fw_proto != IPPROTO_TCP)) {
+		duprintf("convert_ipfw: TCP SYN flag set but proto != TCP!\n");
+		*errno = EINVAL;
+		return NULL;
+	}
+
+	if (strcmp(fwuser->label, IP_FW_LABEL_REDIRECT) != 0
+	    && fwuser->ipfw.fw_redirpt != 0) {
+		duprintf("convert_ipfw: Target not REDIR but redirpt != 0!\n");
+		*errno = EINVAL;
+		return NULL;
+	}
+
+	if ((!(fwuser->ipfw.fw_flg & IP_FW_F_FRAG)
+	     && (fwuser->ipfw.fw_invflg & IP_FW_INV_FRAG))
+	    || (!(fwuser->ipfw.fw_flg & IP_FW_F_TCPSYN)
+		&& (fwuser->ipfw.fw_invflg & IP_FW_INV_SYN))) {
+		duprintf("convert_ipfw: Can't have INV flag if flag unset!\n");
+		*errno = EINVAL;
+		return NULL;
+	}
+
+	/* An inverse match against an all-inclusive condition (full port
+	 * range, empty interface, zero mask) could never match anything. */
+	if (((fwuser->ipfw.fw_invflg & IP_FW_INV_SRCPT)
+	     && fwuser->ipfw.fw_spts[0] == 0
+	     && fwuser->ipfw.fw_spts[1] == 0xFFFF)
+	    || ((fwuser->ipfw.fw_invflg & IP_FW_INV_DSTPT)
+		&& fwuser->ipfw.fw_dpts[0] == 0
+		&& fwuser->ipfw.fw_dpts[1] == 0xFFFF)
+	    || ((fwuser->ipfw.fw_invflg & IP_FW_INV_VIA)
+		&& (fwuser->ipfw.fw_vianame)[0] == '\0')
+	    || ((fwuser->ipfw.fw_invflg & IP_FW_INV_SRCIP)
+		&& fwuser->ipfw.fw_smsk.s_addr == 0)
+	    || ((fwuser->ipfw.fw_invflg & IP_FW_INV_DSTIP)
+		&& fwuser->ipfw.fw_dmsk.s_addr == 0)) {
+		duprintf("convert_ipfw: INV flag makes rule unmatchable!\n");
+		*errno = EINVAL;
+		return NULL;
+	}
+
+	if ((fwuser->ipfw.fw_flg & IP_FW_F_FRAG)
+	    && !(fwuser->ipfw.fw_invflg & IP_FW_INV_FRAG)
+	    && (fwuser->ipfw.fw_spts[0] != 0
+		|| fwuser->ipfw.fw_spts[1] != 0xFFFF
+		|| fwuser->ipfw.fw_dpts[0] != 0
+		|| fwuser->ipfw.fw_dpts[1] != 0xFFFF
+		|| (fwuser->ipfw.fw_flg & IP_FW_F_TCPSYN))) {
+		duprintf("convert_ipfw: Can't test ports or SYN with frag!\n");
+		*errno = EINVAL;
+		return NULL;
+	}
+#endif
+
+	/* Port ranges only make sense for TCP/UDP/ICMP with a
+	 * non-inverted protocol match. */
+	if ((fwuser->ipfw.fw_spts[0] != 0
+	     || fwuser->ipfw.fw_spts[1] != 0xFFFF
+	     || fwuser->ipfw.fw_dpts[0] != 0
+	     || fwuser->ipfw.fw_dpts[1] != 0xFFFF)
+	    && ((fwuser->ipfw.fw_invflg & IP_FW_INV_PROTO)
+		|| (fwuser->ipfw.fw_proto != IPPROTO_TCP
+		    && fwuser->ipfw.fw_proto != IPPROTO_UDP
+		    && fwuser->ipfw.fw_proto != IPPROTO_ICMP))) {
+		duprintf("convert_ipfw: Can only test ports for TCP/UDP/ICMP!\n");
+		*errno = EINVAL;
+		return NULL;
+	}
+
+	fwkern = kmalloc(SIZEOF_STRUCT_IP_FW_KERNEL, GFP_KERNEL);
+	if (!fwkern) {
+		duprintf("convert_ipfw: kmalloc failed!\n");
+		*errno = ENOMEM;
+		return NULL;
+	}
+	memcpy(&fwkern->ipfw,&fwuser->ipfw,sizeof(struct ip_fw));
+
+	/* Label is either a special target (FW_ACCEPT etc.) or the name
+	 * of a user-defined chain to branch to. */
+	if (!find_special(fwuser->label, &fwkern->simplebranch)) {
+		fwkern->branch = find_label(fwuser->label);
+		if (!fwkern->branch) {
+			duprintf("convert_ipfw: chain doesn't exist `%s'.\n",
+				 fwuser->label);
+			kfree(fwkern);
+			*errno = ENOENT;
+			return NULL;
+		} else if (fwkern->branch == IP_FW_INPUT_CHAIN
+			   || fwkern->branch == IP_FW_FORWARD_CHAIN
+			   || fwkern->branch == IP_FW_OUTPUT_CHAIN) {
+			duprintf("convert_ipfw: Can't branch to builtin chain `%s'.\n",
+				 fwuser->label);
+			kfree(fwkern);
+			*errno = ENOENT;
+			return NULL;
+		}
+	} else
+		fwkern->branch = NULL;
+	memset(fwkern->counters, 0, sizeof(struct ip_counters)*NUM_SLOTS);
+
+	/* Handle empty vianame by making it a wildcard */
+	if ((fwkern->ipfw.fw_vianame)[0] == '\0')
+		fwkern->ipfw.fw_flg |= IP_FW_F_WILDIF;
+
+	fwkern->next = NULL;
+	return fwkern;
+}
+
+/* Userspace control entry point: dispatch an ipchains setsockopt-style
+ * command `cmd' with payload `m' of `len' bytes.
+ * Returns 0 on success or a POSITIVE errno-style value on failure
+ * (the convention used throughout this file).
+ * Locking: takes the firewall write lock with IRQs off for the whole
+ * call, EXCEPT for IP_FW_CHECK, which drops the write lock, takes the
+ * read lock for the test run, and returns directly.
+ * Every command validates `len' against the exact expected structure
+ * size and the embedded label before acting. */
+int ip_fw_ctl(int cmd, void *m, int len)
+{
+	int ret;
+	struct ip_chain *chain;
+	unsigned long flags;
+
+	FWC_WRITE_LOCK_IRQ(&ip_fw_lock, flags);
+
+	switch (cmd) {
+	case IP_FW_FLUSH:
+		if (len != sizeof(ip_chainlabel) || !check_label(m))
+			ret = EINVAL;
+		else if ((chain = find_label(m)) == NULL)
+			ret = ENOENT;
+		else ret = clear_fw_chain(chain);
+		break;
+
+	case IP_FW_ZERO:
+		if (len != sizeof(ip_chainlabel) || !check_label(m))
+			ret = EINVAL;
+		else if ((chain = find_label(m)) == NULL)
+			ret = ENOENT;
+		else ret = zero_fw_chain(chain);
+		break;
+
+	case IP_FW_CHECK: {
+		/* Run a caller-supplied packet through a chain in test
+		 * mode and map the verdict to a distinctive errno. */
+		struct ip_fwtest *new = m;
+		struct iphdr *ip;
+
+		/* Don't need write lock. */
+		FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags);
+
+		if (len != sizeof(struct ip_fwtest) || !check_label(m))
+			return EINVAL;
+
+		/* Need readlock to do find_label */
+		FWC_READ_LOCK(&ip_fw_lock);
+
+		if ((chain = find_label(new->fwt_label)) == NULL)
+			ret = ENOENT;
+		else {
+			ip = &(new->fwt_packet.fwp_iph);
+
+			/* Only plain 20-byte headers supported here. */
+			if (ip->ihl != sizeof(struct iphdr) / sizeof(int)) {
+				duprintf("ip_fw_ctl: ip->ihl=%d, want %d\n",
+					 ip->ihl,
+					 sizeof(struct iphdr) / sizeof(int));
+				ret = EINVAL;
+			}
+			else {
+				ret = ip_fw_check(ip, new->fwt_packet.fwp_vianame,
+						  NULL, chain,
+						  NULL, SLOT_NUMBER(), 1);
+				switch (ret) {
+				case FW_ACCEPT:
+					ret = 0; break;
+				case FW_REDIRECT:
+					ret = ECONNABORTED; break;
+				case FW_MASQUERADE:
+					ret = ECONNRESET; break;
+				case FW_REJECT:
+					ret = ECONNREFUSED; break;
+					/* Hack to help diag; these only get
+					   returned when testing. */
+				case FW_SKIP+1:
+					ret = ELOOP; break;
+				case FW_SKIP:
+					ret = ENFILE; break;
+				default: /* FW_BLOCK */
+					ret = ETIMEDOUT; break;
+				}
+			}
+		}
+		FWC_READ_UNLOCK(&ip_fw_lock);
+		return ret;	/* early return: write lock already dropped */
+	}
+
+	case IP_FW_MASQ_TIMEOUTS: {
+		ret = ip_fw_masq_timeouts(m, len);
+	}
+	break;
+
+	case IP_FW_REPLACE: {
+		struct ip_fwkernel *ip_fwkern;
+		struct ip_fwnew *new = m;
+
+		if (len != sizeof(struct ip_fwnew)
+		    || !check_label(new->fwn_label))
+			ret = EINVAL;
+		else if ((chain = find_label(new->fwn_label)) == NULL)
+			ret = ENOENT;
+		else if ((ip_fwkern = convert_ipfw(&new->fwn_rule, &ret))
+			 != NULL)
+			/* convert_ipfw set `ret' on failure; on success
+			 * ownership of ip_fwkern passes to the callee. */
+			ret = replace_in_chain(chain, ip_fwkern,
+					       new->fwn_rulenum);
+	}
+	break;
+
+	case IP_FW_APPEND: {
+		struct ip_fwchange *new = m;
+		struct ip_fwkernel *ip_fwkern;
+
+		if (len != sizeof(struct ip_fwchange)
+		    || !check_label(new->fwc_label))
+			ret = EINVAL;
+		else if ((chain = find_label(new->fwc_label)) == NULL)
+			ret = ENOENT;
+		else if ((ip_fwkern = convert_ipfw(&new->fwc_rule, &ret))
+			 != NULL)
+			ret = append_to_chain(chain, ip_fwkern);
+	}
+	break;
+
+	case IP_FW_INSERT: {
+		struct ip_fwkernel *ip_fwkern;
+		struct ip_fwnew *new = m;
+
+		if (len != sizeof(struct ip_fwnew)
+		    || !check_label(new->fwn_label))
+			ret = EINVAL;
+		else if ((chain = find_label(new->fwn_label)) == NULL)
+			ret = ENOENT;
+		else if ((ip_fwkern = convert_ipfw(&new->fwn_rule, &ret))
+			 != NULL)
+			ret = insert_in_chain(chain, ip_fwkern,
+					      new->fwn_rulenum);
+	}
+	break;
+
+	case IP_FW_DELETE: {
+		struct ip_fwchange *new = m;
+		struct ip_fwkernel *ip_fwkern;
+
+		if (len != sizeof(struct ip_fwchange)
+		    || !check_label(new->fwc_label))
+			ret = EINVAL;
+		else if ((chain = find_label(new->fwc_label)) == NULL)
+			ret = ENOENT;
+		else if ((ip_fwkern = convert_ipfw(&new->fwc_rule, &ret))
+			 != NULL) {
+			/* delete matches by value; the template rule is
+			 * ours to free afterwards. */
+			ret = del_rule_from_chain(chain, ip_fwkern);
+			kfree(ip_fwkern);
+		}
+	}
+	break;
+
+	case IP_FW_DELETE_NUM: {
+		struct ip_fwdelnum *new = m;
+
+		if (len != sizeof(struct ip_fwdelnum)
+		    || !check_label(new->fwd_label))
+			ret = EINVAL;
+		else if ((chain = find_label(new->fwd_label)) == NULL)
+			ret = ENOENT;
+		else ret = del_num_from_chain(chain, new->fwd_rulenum);
+	}
+	break;
+
+	case IP_FW_CREATECHAIN: {
+		if (len != sizeof(ip_chainlabel)) {
+			duprintf("create_chain: bad size %i\n", len);
+			ret = EINVAL;
+		}
+		else ret = create_chain(m);
+	}
+	break;
+
+	case IP_FW_DELETECHAIN: {
+		if (len != sizeof(ip_chainlabel)) {
+			duprintf("delete_chain: bad size %i\n", len);
+			ret = EINVAL;
+		}
+		else ret = del_chain(m);
+	}
+	break;
+
+	case IP_FW_POLICY: {
+		struct ip_fwpolicy *new = m;
+
+		if (len != sizeof(struct ip_fwpolicy)
+		    || !check_label(new->fwp_label))
+			ret = EINVAL;
+		else if ((chain = find_label(new->fwp_label)) == NULL)
+			ret = ENOENT;
+		else if (chain != IP_FW_INPUT_CHAIN
+			 && chain != IP_FW_FORWARD_CHAIN
+			 && chain != IP_FW_OUTPUT_CHAIN) {
+			duprintf("change_policy: can't change policy on user"
+				 " defined chain.\n");
+			ret = EINVAL;
+		}
+		else {
+			int pol = FW_SKIP;
+			find_special(new->fwp_policy, &pol);
+
+			switch(pol) {
+			case FW_MASQUERADE:
+				/* Masquerading only valid for forwarding. */
+				if (chain != IP_FW_FORWARD_CHAIN) {
+					ret = EINVAL;
+					break;
+				}
+				/* Fall thru... */
+			case FW_BLOCK:
+			case FW_ACCEPT:
+			case FW_REJECT:
+				ret = change_policy(chain, pol);
+				break;
+			default:
+				duprintf("change_policy: bad policy `%s'\n",
+					 new->fwp_policy);
+				ret = EINVAL;
+			}
+		}
+		break;
+	}
+	default:
+		duprintf("ip_fw_ctl: unknown request %d\n",cmd);
+		ret = ENOPROTOOPT;
+	}
+
+	FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags);
+	return ret;
+}
+
+/* Format one rule as a single /proc text line into `buffer'.
+ * Returns bytes used - doesn't NUL terminate.
+ * Packet/byte counters are summed across all NUM_SLOTS slots and
+ * printed as two 32-bit halves (hi, lo).  Caller must hold the write
+ * lock (counters are updated under the read lock by the fast path). */
+static int dump_rule(char *buffer,
+		     const char *chainlabel,
+		     const struct ip_fwkernel *rule)
+{
+	int len;
+	unsigned int i;
+	__u64 packets = 0, bytes = 0;
+
+	FWC_HAVE_LOCK(fwc_wlocks);
+	for (i = 0; i < NUM_SLOTS; i++) {
+		packets += rule->counters[i].pcnt;
+		bytes += rule->counters[i].bcnt;
+	}
+
+	len=sprintf(buffer,
+		    "%9s "			/* Chain name */
+		    "%08X/%08X->%08X/%08X "	/* Source & Destination IPs */
+		    "%.16s "			/* Interface */
+		    "%X %X "			/* fw_flg and fw_invflg fields */
+		    "%u "			/* Protocol */
+		    "%-9u %-9u %-9u %-9u "	/* Packet & byte counters */
+		    "%u-%u %u-%u "		/* Source & Dest port ranges */
+		    "A%02X X%02X "		/* TOS and / xor masks */
+		    "%08X "			/* Redirection port */
+		    "%u "			/* fw_mark field */
+		    "%u "			/* output size */
+		    "%9s\n",			/* Target */
+		    chainlabel,
+		    ntohl(rule->ipfw.fw_src.s_addr),
+		    ntohl(rule->ipfw.fw_smsk.s_addr),
+		    ntohl(rule->ipfw.fw_dst.s_addr),
+		    ntohl(rule->ipfw.fw_dmsk.s_addr),
+		    (rule->ipfw.fw_vianame)[0] ? rule->ipfw.fw_vianame : "-",
+		    rule->ipfw.fw_flg,
+		    rule->ipfw.fw_invflg,
+		    rule->ipfw.fw_proto,
+		    (__u32)(packets >> 32), (__u32)packets,
+		    (__u32)(bytes >> 32), (__u32)bytes,
+		    rule->ipfw.fw_spts[0], rule->ipfw.fw_spts[1],
+		    rule->ipfw.fw_dpts[0], rule->ipfw.fw_dpts[1],
+		    rule->ipfw.fw_tosand, rule->ipfw.fw_tosxor,
+		    rule->ipfw.fw_redirpt,
+		    rule->ipfw.fw_mark,
+		    rule->ipfw.fw_outputsize,
+		    branchname(rule->branch,rule->simplebranch));
+
+	duprintf("dump_rule: %i bytes done.\n", len);
+	return len;
+}
+
+/* /proc read handler for the rules listing.
+ * File offset is actually in records, not bytes: `offset' counts rules
+ * already emitted, and the function dumps as many whole rules as fit
+ * in `length' bytes of `buffer'.  With `reset' non-zero each dumped
+ * rule's counters are cleared (pre-2.3.29 only; afterwards reset is
+ * forced to 0 -- see FIXME).  Returns bytes written. */
+static int ip_chain_procinfo(char *buffer, char **start,
+			     off_t offset, int length
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,29)
+			     , int reset
+#endif
+	)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,29)
+	/* FIXME: No more `atomic' read and reset.  Wonderful 8-( --RR */
+	int reset = 0;
+#endif
+	struct ip_chain *i;
+	struct ip_fwkernel *j = ip_fw_chains->chain;
+	unsigned long flags;
+	int len = 0;
+	int last_len = 0;
+	off_t upto = 0;
+
+	duprintf("Offset starts at %lu\n", offset);
+	duprintf("ip_fw_chains is 0x%0lX\n", (unsigned long int)ip_fw_chains);
+
+	/* Need a write lock to lock out ``readers'' which update counters. */
+	FWC_WRITE_LOCK_IRQ(&ip_fw_lock, flags);
+
+	/* Skip the `offset' rules already returned by earlier reads. */
+	for (i = ip_fw_chains; i; i = i->next) {
+		for (j = i->chain; j; j = j->next) {
+			if (upto == offset) break;
+			duprintf("Skipping rule in chain `%s'\n",
+				 i->label);
+			upto++;
+		}
+		if (upto == offset) break;
+	}
+
+	/* Don't init j first time, or once i = NULL */
+	for (; i; (void)((i = i->next) && (j = i->chain))) {
+		duprintf("Dumping chain `%s'\n", i->label);
+		for (; j; j = j->next, upto++, last_len = len)
+		{
+			len += dump_rule(buffer+len, i->label, j);
+			if (len > length) {
+				/* Last rule overran: back it out and
+				 * stop; it will be re-dumped next read. */
+				duprintf("Dumped to %i (past %i).  "
+					 "Moving back to %i.\n",
+					 len, length, last_len);
+				len = last_len;
+				goto outside;
+			}
+			else if (reset)
+				memset(j->counters, 0,
+				       sizeof(struct ip_counters)*NUM_SLOTS);
+		}
+	}
+outside:
+	FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags);
+	buffer[len] = '\0';
+
+	duprintf("ip_chain_procinfo: Length = %i (of %i).  Offset = %li.\n",
+		 len, length, upto);
+	/* `start' hack - see fs/proc/generic.c line ~165: *start carries
+	 * the record count consumed, smuggled through a char*.  NOTE
+	 * (review): the (unsigned int) cast truncates on 64-bit --
+	 * presumably fine on the platforms of the day; verify. */
+	*start=(char *)((unsigned int)upto-offset);
+	return len;
+}
+
+/* /proc read handler listing one line per chain: label, policy name,
+ * refcount and the chain's policy-level packet/byte counters (summed
+ * over slots and split into 32-bit hi/lo halves).  Uses the standard
+ * byte-offset /proc windowing protocol (unlike ip_chain_procinfo,
+ * which counts records).  Returns bytes available from `offset'. */
+static int ip_chain_name_procinfo(char *buffer, char **start,
+				  off_t offset, int length)
+{
+	struct ip_chain *i;
+	int len = 0,last_len = 0;
+	off_t pos = 0,begin = 0;
+	unsigned long flags;
+
+	/* Need a write lock to lock out ``readers'' which update counters. */
+	FWC_WRITE_LOCK_IRQ(&ip_fw_lock, flags);
+
+	for (i = ip_fw_chains; i; i = i->next)
+	{
+		unsigned int j;
+		__u32 packetsHi = 0, packetsLo = 0, bytesHi = 0, bytesLo = 0;
+
+		/* Sum the 64-bit slot counters as separate 32-bit halves. */
+		for (j = 0; j < NUM_SLOTS; j++) {
+			packetsLo += i->reent[j].counters.pcnt & 0xFFFFFFFF;
+			packetsHi += ((i->reent[j].counters.pcnt >> 32)
+				      & 0xFFFFFFFF);
+			bytesLo += i->reent[j].counters.bcnt & 0xFFFFFFFF;
+			bytesHi += ((i->reent[j].counters.bcnt >> 32)
+				    & 0xFFFFFFFF);
+		}
+
+		/* print the label and the policy */
+		len+=sprintf(buffer+len,"%s %s %i %u %u %u %u\n",
+			     i->label,branchname(NULL, i->policy),i->refcount,
+			     packetsHi, packetsLo, bytesHi, bytesLo);
+		pos=begin+len;
+		if(pos<offset) {
+			/* still before the requested window: discard */
+			len=0;
+			begin=pos;
+		}
+		else if(pos>offset+length) {
+			/* past the window: drop the overrunning line */
+			len = last_len;
+			break;
+		}
+
+		last_len = len;
+	}
+	FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags);
+
+	*start = buffer+(offset-begin);
+	len-=(offset-begin);
+	if(len>length)
+		len=length;
+	return len;
+}
+
+/*
+ * Interface to the generic firewall chains.
+ */
+
+/* Hook wrapper: run the packet through the INPUT chain.  Returns the
+ * FW_* verdict from ip_fw_check (testing=0, i.e. a live packet). */
+int ipfw_input_check(struct firewall_ops *this, int pf,
+		     struct net_device *dev, void *phdr, void *arg,
+		     struct sk_buff **pskb)
+{
+	return ip_fw_check(phdr, dev->name,
+			   arg, IP_FW_INPUT_CHAIN, *pskb, SLOT_NUMBER(), 0);
+}
+
+/* Hook wrapper: run the packet through the OUTPUT chain.  Malformed
+ * locally-generated packets (header length below the minimum IP
+ * header) are accepted unchecked rather than fed to ip_fw_check. */
+int ipfw_output_check(struct firewall_ops *this, int pf,
+		      struct net_device *dev, void *phdr, void *arg,
+		      struct sk_buff **pskb)
+{
+	/* Locally generated bogus packets by root. <SIGH>. */
+	if (((struct iphdr *)phdr)->ihl * 4 < sizeof(struct iphdr)
+	    || (*pskb)->len < sizeof(struct iphdr))
+		return FW_ACCEPT;
+	return ip_fw_check(phdr, dev->name,
+			   arg, IP_FW_OUTPUT_CHAIN, *pskb, SLOT_NUMBER(), 0);
+}
+
+/* Hook wrapper: run the packet through the FORWARD chain.  Returns the
+ * FW_* verdict from ip_fw_check (testing=0, i.e. a live packet). */
+int ipfw_forward_check(struct firewall_ops *this, int pf,
+		       struct net_device *dev, void *phdr, void *arg,
+		       struct sk_buff **pskb)
+{
+	return ip_fw_check(phdr, dev->name,
+			   arg, IP_FW_FORWARD_CHAIN, *pskb, SLOT_NUMBER(), 0);
+}
+
+/* Hook table registered with the generic firewall layer.  Positional
+ * initializer -- order must match struct firewall_ops; the three
+ * populated slots are (by the hooked function names) forward, input
+ * and output checks.  NOTE(review): field names of the NULL slots not
+ * visible here -- confirm against struct firewall_ops declaration. */
+struct firewall_ops ipfw_ops=
+{
+	NULL,			/* chained next ops, unused */
+	ipfw_forward_check,
+	ipfw_input_check,
+	ipfw_output_check,
+	NULL,
+	NULL
+};
+
+/* Combined module init/exit: init!=0 registers the ipchains subsystem
+ * (netlink socket if configured, generic firewall hooks, /proc
+ * entries, the three built-in chains); init==0 tears it all down
+ * (frees every chain, removes /proc entries, unregisters hooks,
+ * releases the netlink socket).  Returns 0 or the negative error from
+ * register_firewall.  The whole operation runs under the write lock
+ * with IRQs off.  Note the fall-through label layout: the cleanup path
+ * deliberately continues into cleanup_netlink, and a partial init
+ * failure jumps into the matching suffix of the cleanup sequence. */
+int ipfw_init_or_cleanup(int init)
+{
+	int ret = 0;
+	unsigned long flags;
+
+	FWC_WRITE_LOCK_IRQ(&ip_fw_lock, flags);
+
+	if (!init) goto cleanup;
+
+#ifdef DEBUG_IP_FIREWALL_LOCKING
+	fwc_wlocks = fwc_rlocks = 0;
+#endif
+
+#if defined(CONFIG_NETLINK_DEV) || defined(CONFIG_NETLINK_DEV_MODULE)
+	ipfwsk = netlink_kernel_create(NETLINK_FIREWALL, NULL);
+	if (ipfwsk == NULL)
+		goto cleanup_nothing;
+#endif
+
+	ret = register_firewall(PF_INET, &ipfw_ops);
+	if (ret < 0)
+		goto cleanup_netlink;
+
+	proc_net_create(IP_FW_PROC_CHAINS, S_IFREG | S_IRUSR | S_IWUSR, ip_chain_procinfo);
+	proc_net_create(IP_FW_PROC_CHAIN_NAMES, S_IFREG | S_IRUSR | S_IWUSR, ip_chain_name_procinfo);
+
+	IP_FW_INPUT_CHAIN = ip_init_chain(IP_FW_LABEL_INPUT, 1, FW_ACCEPT);
+	IP_FW_FORWARD_CHAIN = ip_init_chain(IP_FW_LABEL_FORWARD, 1, FW_ACCEPT);
+	IP_FW_OUTPUT_CHAIN = ip_init_chain(IP_FW_LABEL_OUTPUT, 1, FW_ACCEPT);
+
+	FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags);
+	return ret;
+
+ cleanup:
+	/* Full teardown: empty and free every chain (built-in chains
+	 * included -- their refcounts only block del_chain()). */
+	while (ip_fw_chains) {
+		struct ip_chain *next = ip_fw_chains->next;
+
+		clear_fw_chain(ip_fw_chains);
+		kfree(ip_fw_chains);
+		ip_fw_chains = next;
+	}
+
+	proc_net_remove(IP_FW_PROC_CHAINS);
+	proc_net_remove(IP_FW_PROC_CHAIN_NAMES);
+
+	unregister_firewall(PF_INET, &ipfw_ops);
+
+ cleanup_netlink:
+#if defined(CONFIG_NETLINK_DEV) || defined(CONFIG_NETLINK_DEV_MODULE)
+	sock_release(ipfwsk->socket);
+
+ cleanup_nothing:
+#endif
+	FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags);
+	return ret;
+}
diff --git a/net/ipv4/netfilter/ipfwadm_core.c b/net/ipv4/netfilter/ipfwadm_core.c
new file mode 100644
index 000000000..a1f4e16cf
--- /dev/null
+++ b/net/ipv4/netfilter/ipfwadm_core.c
@@ -0,0 +1,1410 @@
+/* Minor modifications to fit on compatibility framework:
+ Rusty.Russell@rustcorp.com.au
+*/
+
+#define CONFIG_IP_FIREWALL
+#define CONFIG_IP_FIREWALL_VERBOSE
+#define CONFIG_IP_MASQUERADE
+#define CONFIG_IP_ACCT
+#define CONFIG_IP_TRANSPARENT_PROXY
+#define CONFIG_IP_FIREWALL_NETLINK
+
+/*
+ * IP firewalling code. This is taken from 4.4BSD. Please note the
+ * copyright message below. As per the GPL it must be maintained
+ * and the licenses thus do not conflict. While this port is subject
+ * to the GPL I also place my modifications under the original
+ * license in recognition of the original copyright.
+ * -- Alan Cox.
+ *
+ * $Id: ipfwadm_core.c,v 1.1 2000/03/17 14:42:00 davem Exp $
+ *
+ * Ported from BSD to Linux,
+ * Alan Cox 22/Nov/1994.
+ * Zeroing /proc and other additions
+ * Jos Vos 4/Feb/1995.
+ * Merged and included the FreeBSD-Current changes at Ugen's request
+ * (but hey it's a lot cleaner now). Ugen would prefer in some ways
+ * we waited for his final product but since Linux 1.2.0 is about to
+ * appear it's not practical - Read: It works, it's not clean but please
+ * don't consider it to be his standard of finished work.
+ * Alan Cox 12/Feb/1995
+ * Porting bidirectional entries from BSD, fixing accounting issues,
+ * adding struct ip_fwpkt for checking packets with interface address
+ * Jos Vos 5/Mar/1995.
+ * Established connections (ACK check), ACK check on bidirectional rules,
+ * ICMP type check.
+ * Wilfred Mollenvanger 7/7/1995.
+ * TCP attack protection.
+ * Alan Cox 25/8/95, based on information from bugtraq.
+ * ICMP type printk, IP_FW_F_APPEND
+ * Bernd Eckenfels 1996-01-31
+ * Split blocking chain into input and output chains, add new "insert" and
+ * "append" commands to replace semi-intelligent "add" command, let "delete".
+ * only delete the first matching entry, use 0xFFFF (0xFF) as ports (ICMP
+ * types) when counting packets being 2nd and further fragments.
+ * Jos Vos <jos@xos.nl> 8/2/1996.
+ * Add support for matching on device names.
+ * Jos Vos <jos@xos.nl> 15/2/1996.
+ * Transparent proxying support.
+ * Willy Konynenberg <willy@xos.nl> 10/5/96.
+ * Make separate accounting on incoming and outgoing packets possible.
+ * Jos Vos <jos@xos.nl> 18/5/1996.
+ * Added trap out of bad frames.
+ * Alan Cox <alan@cymru.net> 17/11/1996
+ *
+ *
+ * Masquerading functionality
+ *
+ * Copyright (c) 1994 Pauline Middelink
+ *
+ * The pieces which added masquerading functionality are totally
+ * my responsibility and have nothing to with the original authors
+ * copyright or doing.
+ *
+ * Parts distributed under GPL.
+ *
+ * Fixes:
+ * Pauline Middelink : Added masquerading.
+ * Alan Cox : Fixed an error in the merge.
+ * Thomas Quinot : Fixed port spoofing.
+ * Alan Cox : Cleaned up retransmits in spoofing.
+ * Alan Cox : Cleaned up length setting.
+ * Wouter Gadeyne : Fixed masquerading support of ftp PORT commands
+ *
+ * Juan Jose Ciarlante : Masquerading code moved to ip_masq.c
+ * Andi Kleen : Print frag_offsets and the ip flags properly.
+ *
+ * All the real work was done by .....
+ *
+ */
+
+
+/*
+ * Copyright (c) 1993 Daniel Boulet
+ * Copyright (c) 1994 Ugen J.S.Antsilevich
+ *
+ * Redistribution and use in source forms, with and without modification,
+ * are permitted provided that this entire comment appears intact.
+ *
+ * Redistribution in binary form may occur without any restrictions.
+ * Obviously, it would be nice if you gave credit where credit is due
+ * but requiring it would be too onerous.
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind.
+ */
+
+#include <linux/config.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/icmp.h>
+#include <linux/udp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+#include <linux/netfilter_ipv4/ipfwadm_core.h>
+#include <linux/netfilter_ipv4/compat_firewall.h>
+
+#include <net/checksum.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/version.h>
+
+/*
+ * Implement IP packet firewall
+ */
+
+/* dprintfN(): debug printks taking N arguments; they compile to
+   nothing unless DEBUG_IP_FIREWALL is defined. */
+#ifdef DEBUG_IP_FIREWALL
+#define dprintf1(a)	printk(a)
+#define dprintf2(a1,a2)	printk(a1,a2)
+#define dprintf3(a1,a2,a3)	printk(a1,a2,a3)
+#define dprintf4(a1,a2,a3,a4)	printk(a1,a2,a3,a4)
+#else
+#define dprintf1(a)
+#define dprintf2(a1,a2)
+#define dprintf3(a1,a2,a3)
+#define dprintf4(a1,a2,a3,a4)
+#endif
+
+/* Print an IPv4 address in dotted-quad form.  NOTE: the expansion
+   already ends in a `;', so the macro is a full statement. */
+#define print_ip(a)	 printk("%d.%d.%d.%d",(ntohl(a)>>24)&0xFF,\
+					      (ntohl(a)>>16)&0xFF,\
+					      (ntohl(a)>>8)&0xFF,\
+					      (ntohl(a))&0xFF);
+
+/* Address dump only in debug builds. */
+#ifdef DEBUG_IP_FIREWALL
+#define dprint_ip(a)	print_ip(a)
+#else
+#define dprint_ip(a)
+#endif
+
+#if defined(CONFIG_IP_ACCT) || defined(CONFIG_IP_FIREWALL)
+
+/* The five rule chains; chains[] is indexed by the firewall type
+   extracted from the ioctl stage (IP_FW_FWD, IP_FW_IN, IP_FW_OUT,
+   IP_FW_ACCT, ...). */
+struct ip_fw *ip_fw_fwd_chain;
+struct ip_fw *ip_fw_in_chain;
+struct ip_fw *ip_fw_out_chain;
+struct ip_fw *ip_acct_chain;
+struct ip_fw *ip_masq_chain;
+
+static struct ip_fw **chains[] =
+	{&ip_fw_fwd_chain, &ip_fw_in_chain, &ip_fw_out_chain, &ip_acct_chain,
+	 &ip_masq_chain
+	};
+#endif /* CONFIG_IP_ACCT || CONFIG_IP_FIREWALL */
+
+#ifdef CONFIG_IP_FIREWALL
+/* Default verdicts used when no rule in a chain matches; policies[]
+   parallels the first three entries of chains[]. */
+int ip_fw_fwd_policy=IP_FW_F_ACCEPT;
+int ip_fw_in_policy=IP_FW_F_ACCEPT;
+int ip_fw_out_policy=IP_FW_F_ACCEPT;
+
+static int *policies[] =
+	{&ip_fw_fwd_policy, &ip_fw_in_policy, &ip_fw_out_policy};
+
+#endif
+
+#ifdef CONFIG_IP_FIREWALL_NETLINK
+/* Netlink socket used to post copies of rejected/blocked packets
+   (see the IP_FW_F_PRN handling in ip_fw_chk). */
+struct sock *ipfwsk;
+#endif
+
+/*
+ * Returns 1 if the port is matched by the vector, 0 otherwise
+ */
+
+/* Return 1 when `port' is matched by the rule's port vector, else 0.
+   An empty vector matches everything.  When range_flag is set, the
+   first two entries form an inclusive range and the rest are single
+   ports; otherwise every entry is a single port. */
+extern inline int port_match(unsigned short *portptr,int nports,unsigned short port,int range_flag)
+{
+	int i = 0;
+
+	/* No ports listed: wildcard match. */
+	if (nports == 0)
+		return 1;
+
+	if (range_flag) {
+		/* Entries 0 and 1 form an inclusive range. */
+		if (portptr[0] <= port && port <= portptr[1])
+			return 1;
+		i = 2;
+	}
+
+	/* Remaining entries are individual port numbers. */
+	for (; i < nports; i++) {
+		if (portptr[i] == port)
+			return 1;
+	}
+
+	return 0;
+}
+
+#if defined(CONFIG_IP_ACCT) || defined(CONFIG_IP_FIREWALL)
+
+#ifdef CONFIG_IP_FIREWALL_VERBOSE
+
+/*
+ * VERY ugly piece of code which actually makes kernel printf for
+ * matching packets.
+ */
+
+/* Return a short label for the chain being logged.  Accounting modes
+   have fixed names; firewall chains are identified by address. */
+static char *chain_name(struct ip_fw *chain, int mode)
+{
+	if (mode == IP_FW_MODE_ACCT_IN)
+		return "acct in";
+	if (mode == IP_FW_MODE_ACCT_OUT)
+		return "acct out";
+
+	if (chain == ip_fw_fwd_chain)
+		return "fw-fwd";
+	if (chain == ip_fw_in_chain)
+		return "fw-in";
+	return "fw-out";
+}
+
+/* Return a short label describing the rule's verdict for logging.
+   `buf' is only used for the accept+redirect case, where the target
+   port is formatted into it. */
+static char *rule_name(struct ip_fw *f, int mode, char *buf)
+{
+	/* Accounting entries carry no verdict to describe. */
+	if (mode == IP_FW_MODE_ACCT_IN || mode == IP_FW_MODE_ACCT_OUT)
+		return "";
+
+	if (f->fw_flg & IP_FW_F_ACCEPT) {
+		if (f->fw_flg & IP_FW_F_REDIR) {
+			/* Redirect port is stored after the src/dst ports. */
+			sprintf(buf, "acc/r%d ", f->fw_pts[f->fw_nsp+f->fw_ndp]);
+			return buf;
+		}
+		if (f->fw_flg & IP_FW_F_MASQ)
+			return "acc/masq ";
+		return "acc ";
+	}
+
+	if (f->fw_flg & IP_FW_F_ICMPRPL)
+		return "rej ";
+
+	return "deny ";
+}
+
+/* Log one matched packet to the kernel log: verdict, device,
+   protocol, addresses/ports, selected IP header fields and any IP
+   options.  src_port/dst_port are only printed for TCP and UDP;
+   callers pass 0xFFFF placeholders otherwise. */
+static void print_packet(struct iphdr *ip,
+			 u16 src_port, u16 dst_port, u16 icmp_type,
+			 char *chain, char *rule, char *devname)
+{
+	/* Options, if any, start right after the fixed header. */
+	__u32 *opt = (__u32 *) (ip + 1);
+	int opti;
+	__u16 foff = ntohs(ip->frag_off);
+
+	printk(KERN_INFO "IP %s %s%s", chain, rule, devname);
+
+	switch(ip->protocol)
+	{
+		case IPPROTO_TCP:
+			printk(" TCP ");
+			break;
+		case IPPROTO_UDP:
+			printk(" UDP ");
+			break;
+		case IPPROTO_ICMP:
+			printk(" ICMP/%d ", icmp_type);
+			break;
+		default:
+			printk(" PROTO=%d ", ip->protocol);
+			break;
+	}
+	print_ip(ip->saddr);
+	if(ip->protocol == IPPROTO_TCP || ip->protocol == IPPROTO_UDP)
+		printk(":%hu", src_port);
+	printk(" ");
+	print_ip(ip->daddr);
+	if(ip->protocol == IPPROTO_TCP || ip->protocol == IPPROTO_UDP)
+		printk(":%hu", dst_port);
+	printk(" L=%hu S=0x%2.2hX I=%hu FO=0x%4.4hX T=%hu",
+	       ntohs(ip->tot_len), ip->tos, ntohs(ip->id),
+	       foff & IP_OFFSET, ip->ttl);
+	if (foff & IP_DF) printk(" DF=1");
+	if (foff & IP_MF) printk(" MF=1");
+	/* Dump option words (header length beyond the fixed 20 bytes). */
+	for (opti = 0; opti < (ip->ihl - sizeof(struct iphdr) / 4); opti++)
+		printk(" O=0x%8.8X", *opt++);
+	printk("\n");
+}
+#endif
+
+/*
+ * Returns one of the generic firewall policies, like FW_ACCEPT.
+ * Also does accounting so you can feed it the accounting chain.
+ *
+ * The modes is either IP_FW_MODE_FW (normal firewall mode),
+ * IP_FW_MODE_ACCT_IN or IP_FW_MODE_ACCT_OUT (accounting mode,
+ * steps through the entire chain and handles fragments
+ * differently), or IP_FW_MODE_CHK (handles user-level check,
+ * counters are not updated).
+ */
+
+
+/* ip_fw_chk - run one packet through a rule chain.
+ *
+ * @ip:        IP header of the packet to check.
+ * @rif:       interface the packet is associated with (may be NULL).
+ * @redirport: where to store the redirect port when the verdict is
+ *             FW_REDIRECT (may be NULL).
+ * @chain:     rule chain to walk.
+ * @policy:    default verdict when no rule matches.
+ * @mode:      IP_FW_MODE_FW, IP_FW_MODE_CHK (no counter updates), or
+ *             IP_FW_MODE_ACCT_IN/IP_FW_MODE_ACCT_OUT (walks the whole
+ *             chain and always returns 0).
+ *
+ * Returns an FW_* verdict in firewall/check modes, 0 in accounting
+ * modes.
+ *
+ * Fix over the original: the `ifa_ok' label stood immediately before
+ * a closing brace; ISO C requires a label to precede a statement, so
+ * a null statement has been added.
+ */
+int ip_fw_chk(struct iphdr *ip, struct net_device *rif, __u16 *redirport,
+	      struct ip_fw *chain, int policy, int mode)
+{
+	struct ip_fw *f;
+	struct tcphdr *tcp=(struct tcphdr *)((__u32 *)ip+ip->ihl);
+	struct udphdr *udp=(struct udphdr *)((__u32 *)ip+ip->ihl);
+	struct icmphdr *icmp=(struct icmphdr *)((__u32 *)ip+ip->ihl);
+	__u32 src, dst;
+	__u16 src_port=0xFFFF, dst_port=0xFFFF, icmp_type=0xFF;
+	unsigned short f_prt=0, prt;
+	char notcpsyn=0, notcpack=0, match;
+	unsigned short offset;
+	int answer;
+	unsigned char tosand, tosxor;
+
+	/*
+	 *	If the chain is empty follow policy. The BSD one
+	 *	accepts anything giving you a time window while
+	 *	flushing and rebuilding the tables.
+	 */
+
+	src = ip->saddr;
+	dst = ip->daddr;
+
+	/*
+	 *	This way we handle fragmented packets.
+	 *	we ignore all fragments but the first one
+	 *	so the whole packet can't be reassembled.
+	 *	This way we relay on the full info which
+	 *	stored only in first packet.
+	 *
+	 *	Note that this theoretically allows partial packet
+	 *	spoofing. Not very dangerous but paranoid people may
+	 *	wish to play with this. It also allows the so called
+	 *	"fragment bomb" denial of service attack on some types
+	 *	of system.
+	 */
+
+	offset = ntohs(ip->frag_off) & IP_OFFSET;
+
+	/*
+	 *	Don't allow a fragment of TCP 8 bytes in. Nobody
+	 *	normal causes this. Its a cracker trying to break
+	 *	in by doing a flag overwrite to pass the direction
+	 *	checks.
+	 */
+
+	if (offset == 1 && ip->protocol == IPPROTO_TCP)
+		return FW_BLOCK;
+
+	/* Non-first fragments pass in firewall mode; accounting modes
+	   still count them. */
+	if (offset!=0 && !(mode & (IP_FW_MODE_ACCT_IN|IP_FW_MODE_ACCT_OUT)) &&
+	    (ip->protocol == IPPROTO_TCP || ip->protocol == IPPROTO_UDP ||
+	     ip->protocol == IPPROTO_ICMP))
+		return FW_ACCEPT;
+
+	/*
+	 *	Header fragment for TCP is too small to check the bits.
+	 */
+
+	if(ip->protocol==IPPROTO_TCP && (ip->ihl<<2)+16 > ntohs(ip->tot_len))
+		return FW_BLOCK;
+
+	/*
+	 *	Too short.
+	 *
+	 *	But only too short for a packet with ports...
+	 */
+
+	else if((ntohs(ip->tot_len)<8+(ip->ihl<<2))&&(ip->protocol==IPPROTO_TCP || ip->protocol==IPPROTO_UDP))
+		return FW_BLOCK;
+
+	src = ip->saddr;
+	dst = ip->daddr;
+
+	/*
+	 *	If we got interface from which packet came
+	 *	we can use the address directly. This is unlike
+	 *	4.4BSD derived systems that have an address chain
+	 *	per device. We have a device per address with dummy
+	 *	devices instead.
+	 */
+
+	dprintf1("Packet ");
+	switch(ip->protocol)
+	{
+		case IPPROTO_TCP:
+			dprintf1("TCP ");
+			/* ports stay 0xFFFF if it is not the first fragment */
+			if (!offset) {
+				src_port=ntohs(tcp->source);
+				dst_port=ntohs(tcp->dest);
+				if(!tcp->ack && !tcp->rst)
+					/* We do NOT have ACK, value TRUE */
+					notcpack=1;
+				if(!tcp->syn || !notcpack)
+					/* We do NOT have SYN, value TRUE */
+					notcpsyn=1;
+			}
+			prt=IP_FW_F_TCP;
+			break;
+		case IPPROTO_UDP:
+			dprintf1("UDP ");
+			/* ports stay 0xFFFF if it is not the first fragment */
+			if (!offset) {
+				src_port=ntohs(udp->source);
+				dst_port=ntohs(udp->dest);
+			}
+			prt=IP_FW_F_UDP;
+			break;
+		case IPPROTO_ICMP:
+			/* icmp_type stays 255 if it is not the first fragment */
+			if (!offset)
+				icmp_type=(__u16)(icmp->type);
+			dprintf2("ICMP:%d ",icmp_type);
+			prt=IP_FW_F_ICMP;
+			break;
+		default:
+			dprintf2("p=%d ",ip->protocol);
+			prt=IP_FW_F_ALL;
+			break;
+	}
+#ifdef DEBUG_IP_FIREWALL
+	dprint_ip(ip->saddr);
+
+	if (ip->protocol==IPPROTO_TCP || ip->protocol==IPPROTO_UDP)
+		/* This will print 65535 when it is not the first fragment! */
+		dprintf2(":%d ", src_port);
+	dprint_ip(ip->daddr);
+	if (ip->protocol==IPPROTO_TCP || ip->protocol==IPPROTO_UDP)
+		/* This will print 65535 when it is not the first fragment! */
+		dprintf2(":%d ",dst_port);
+	dprintf1("\n");
+#endif
+
+	for (f=chain;f;f=f->fw_next)
+	{
+		/*
+		 *	This is a bit simpler as we don't have to walk
+		 *	an interface chain as you do in BSD - same logic
+		 *	however.
+		 */
+
+		/*
+		 *	Match can become 0x01 (a "normal" match was found),
+		 *	0x02 (a reverse match was found), and 0x03 (the
+		 *	IP addresses match in both directions).
+		 *	Now we know in which direction(s) we should look
+		 *	for a match for the TCP/UDP ports.  Both directions
+		 *	might match (e.g., when both addresses are on the
+		 *	same network for which an address/mask is given), but
+		 *	the ports might only match in one direction.
+		 *	This was obviously wrong in the original BSD code.
+		 */
+		match = 0x00;
+
+		if ((src&f->fw_smsk.s_addr)==f->fw_src.s_addr
+		&&  (dst&f->fw_dmsk.s_addr)==f->fw_dst.s_addr)
+			/* normal direction */
+			match |= 0x01;
+
+		if ((f->fw_flg & IP_FW_F_BIDIR) &&
+		    (dst&f->fw_smsk.s_addr)==f->fw_src.s_addr
+		&&  (src&f->fw_dmsk.s_addr)==f->fw_dst.s_addr)
+			/* reverse direction */
+			match |= 0x02;
+
+		if (!match)
+			continue;
+
+		/*
+		 *	Look for a VIA device match
+		 */
+		if(f->fw_viadev)
+		{
+			if(rif!=f->fw_viadev)
+				continue;	/* Mismatch */
+		}
+
+		/* This looks stupid, because we scan almost static
+		   list, searching for static key. However, this way seems
+		   to be only reasonable way of handling fw_via rules
+		   (btw bsd makes the same thing).
+
+		   It will not affect performance if you will follow
+		   the following simple rules:
+
+		   - if inteface is aliased, ALWAYS specify fw_viadev,
+		     so that previous check will guarantee, that we will
+		     not waste time when packet arrive on another interface.
+
+		   - avoid using fw_via.s_addr if fw_via.s_addr is owned
+		     by an aliased interface.
+
+		                                                 --ANK
+		 */
+		if (f->fw_via.s_addr && rif) {
+			struct in_ifaddr *ifa;
+
+			if (rif->ip_ptr == NULL)
+				continue;	/* Mismatch */
+
+			for (ifa = ((struct in_device*)(rif->ip_ptr))->ifa_list;
+			     ifa; ifa = ifa->ifa_next) {
+				if (ifa->ifa_local == f->fw_via.s_addr)
+					goto ifa_ok;
+			}
+			continue;	/* Mismatch */
+
+		ifa_ok: ;	/* null statement: a label must precede a statement */
+		}
+
+		/*
+		 *	Ok the chain addresses match.
+		 */
+
+#ifdef CONFIG_IP_ACCT
+		/*
+		 *	See if we're in accounting mode and only want to
+		 *	count incoming or outgoing packets.
+		 */
+
+		if (mode & (IP_FW_MODE_ACCT_IN|IP_FW_MODE_ACCT_OUT) &&
+		   ((mode == IP_FW_MODE_ACCT_IN && f->fw_flg&IP_FW_F_ACCTOUT) ||
+		    (mode == IP_FW_MODE_ACCT_OUT && f->fw_flg&IP_FW_F_ACCTIN)))
+			continue;
+
+#endif
+		/*
+		 *	For all non-TCP packets and/or non-first fragments,
+		 *	notcpsyn and notcpack will always be FALSE,
+		 *	so the IP_FW_F_TCPSYN and IP_FW_F_TCPACK flags
+		 *	are actually ignored for these packets.
+		 */
+
+		if((f->fw_flg&IP_FW_F_TCPSYN) && notcpsyn)
+			continue;
+
+		if((f->fw_flg&IP_FW_F_TCPACK) && notcpack)
+			continue;
+
+		f_prt=f->fw_flg&IP_FW_F_KIND;
+		if (f_prt!=IP_FW_F_ALL)
+		{
+			/*
+			 *	Specific firewall - packet's protocol
+			 *	must match firewall's.
+			 */
+
+			if(prt!=f_prt)
+				continue;
+
+			/* ICMP rules match on the type stored in the source
+			   port slots; TCP/UDP rules must match ports in at
+			   least one direction flagged in `match'. */
+			if((prt==IP_FW_F_ICMP &&
+			    ! port_match(&f->fw_pts[0], f->fw_nsp,
+					 icmp_type,f->fw_flg&IP_FW_F_SRNG)) ||
+			    !(prt==IP_FW_F_ICMP || ((match & 0x01) &&
+				port_match(&f->fw_pts[0], f->fw_nsp, src_port,
+					   f->fw_flg&IP_FW_F_SRNG) &&
+				port_match(&f->fw_pts[f->fw_nsp], f->fw_ndp, dst_port,
+					   f->fw_flg&IP_FW_F_DRNG)) || ((match & 0x02) &&
+				port_match(&f->fw_pts[0], f->fw_nsp, dst_port,
+					   f->fw_flg&IP_FW_F_SRNG) &&
+				port_match(&f->fw_pts[f->fw_nsp], f->fw_ndp, src_port,
+					   f->fw_flg&IP_FW_F_DRNG))))
+			{
+				continue;
+			}
+		}
+
+#ifdef CONFIG_IP_FIREWALL_VERBOSE
+		if (f->fw_flg & IP_FW_F_PRN)
+		{
+			char buf[16];
+
+			print_packet(ip, src_port, dst_port, icmp_type,
+				     chain_name(chain, mode),
+				     rule_name(f, mode, buf),
+				     rif ? rif->name : "-");
+		}
+#endif
+		/* Update counters except for user-level checks. */
+		if (mode != IP_FW_MODE_CHK) {
+			f->fw_bcnt+=ntohs(ip->tot_len);
+			f->fw_pcnt++;
+		}
+		if (!(mode & (IP_FW_MODE_ACCT_IN|IP_FW_MODE_ACCT_OUT)))
+			break;
+	} /* Loop */
+
+	if (!(mode & (IP_FW_MODE_ACCT_IN|IP_FW_MODE_ACCT_OUT))) {
+
+		/*
+		 *	We rely on policy defined in the rejecting entry or, if no match
+		 *	was found, we rely on the general policy variable for this type
+		 *	of firewall.
+		 */
+
+		if (f!=NULL) {
+			policy=f->fw_flg;
+			tosand=f->fw_tosand;
+			tosxor=f->fw_tosxor;
+		} else {
+			tosand=0xFF;
+			tosxor=0x00;
+		}
+
+		if (policy&IP_FW_F_ACCEPT) {
+			/* Adjust priority and recompute checksum */
+			__u8 old_tos = ip->tos;
+			ip->tos = (old_tos & tosand) ^ tosxor;
+			if (ip->tos != old_tos)
+				ip_send_check(ip);
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+			if (policy&IP_FW_F_REDIR) {
+				if (redirport)
+					if ((*redirport = htons(f->fw_pts[f->fw_nsp+f->fw_ndp])) == 0) {
+						/* Wildcard redirection.
+						 * Note that redirport will become
+						 * 0xFFFF for non-TCP/UDP packets.
+						 */
+						*redirport = htons(dst_port);
+					}
+				answer = FW_REDIRECT;
+			} else
+#endif
+#ifdef CONFIG_IP_MASQUERADE
+			if (policy&IP_FW_F_MASQ)
+				answer = FW_MASQUERADE;
+			else
+#endif
+				answer = FW_ACCEPT;
+
+		} else if(policy&IP_FW_F_ICMPRPL)
+			answer = FW_REJECT;
+		else
+			answer = FW_BLOCK;
+
+#ifdef CONFIG_IP_FIREWALL_NETLINK
+		/* Post a truncated copy of rejected/blocked packets to
+		   user space via netlink when the rule asks for it. */
+		if((policy&IP_FW_F_PRN) && (answer == FW_REJECT || answer == FW_BLOCK))
+		{
+			struct sk_buff *skb=alloc_skb(128, GFP_ATOMIC);
+			if(skb)
+			{
+				int len=min(128,ntohs(ip->tot_len));
+				skb_put(skb,len);
+				memcpy(skb->data,ip,len);
+				if(netlink_post(NETLINK_FIREWALL, skb))
+					kfree_skb(skb);
+			}
+		}
+#endif
+		return answer;
+	} else
+		/* we're doing accounting, always ok */
+		return 0;
+}
+
+
+/* Reset the packet and byte counters of every rule in the chain. */
+static void zero_fw_chain(struct ip_fw *chainptr)
+{
+	struct ip_fw *f;
+
+	for (f = chainptr; f != NULL; f = f->fw_next) {
+		f->fw_pcnt = 0L;
+		f->fw_bcnt = 0L;
+	}
+}
+
+/* Free every rule in *chainptr and leave the chain empty.  The list
+   is unlinked and freed with interrupts disabled (cli) so concurrent
+   packet processing never sees a half-freed chain. */
+static void free_fw_chain(struct ip_fw *volatile* chainptr)
+{
+	unsigned long flags;
+	save_flags(flags);
+	cli();
+	while ( *chainptr != NULL )
+	{
+		struct ip_fw *ftmp;
+		ftmp = *chainptr;
+		*chainptr = ftmp->fw_next;
+		kfree_s(ftmp,sizeof(*ftmp));
+	}
+	restore_flags(flags);
+}
+
+/* Volatiles to keep some of the compiler versions amused */
+
+/* Allocate a private copy of *frwl and insert it at the head of
+   *chainptr.  Returns 0 on success or a positive errno (ENOMEM).
+   The list update itself is done with interrupts disabled. */
+static int insert_in_chain(struct ip_fw *volatile* chainptr, struct ip_fw *frwl,int len)
+{
+	struct ip_fw *ftmp;
+	unsigned long flags;
+
+	save_flags(flags);
+
+	ftmp = kmalloc( sizeof(struct ip_fw), GFP_ATOMIC );
+	if ( ftmp == NULL )
+	{
+#ifdef DEBUG_IP_FIREWALL
+		printk("ip_fw_ctl: malloc said no\n");
+#endif
+		return( ENOMEM );
+	}
+
+	memcpy(ftmp, frwl, len);
+	/*
+	 *	Allow the more recent "minimise cost" flag to be
+	 *	set. [Rob van Nieuwkerk]
+	 */
+	ftmp->fw_tosand |= 0x01;
+	ftmp->fw_tosxor &= 0xFE;
+	ftmp->fw_pcnt=0L;
+	ftmp->fw_bcnt=0L;
+
+	cli();
+
+	/* Resolve the via-device name now; -1 marks a named device that
+	   is not currently present (ipfw_device_event fixes it up on
+	   NETDEV_UP). */
+	if ((ftmp->fw_vianame)[0]) {
+		if (!(ftmp->fw_viadev = dev_get_by_name(ftmp->fw_vianame)))
+			ftmp->fw_viadev = (struct net_device *) -1;
+	} else
+		ftmp->fw_viadev = NULL;
+
+	ftmp->fw_next = *chainptr;
+	*chainptr=ftmp;
+	restore_flags(flags);
+	return(0);
+}
+
+/* Allocate a private copy of *frwl and link it at the tail of
+   *chainptr.  Returns 0 on success or a positive errno (ENOMEM).
+   The list update itself is done with interrupts disabled. */
+static int append_to_chain(struct ip_fw *volatile* chainptr, struct ip_fw *frwl,int len)
+{
+	struct ip_fw *ftmp;
+	struct ip_fw *chtmp=NULL;
+	struct ip_fw *volatile chtmp_prev=NULL;
+	unsigned long flags;
+
+	save_flags(flags);
+
+	ftmp = kmalloc( sizeof(struct ip_fw), GFP_ATOMIC );
+	if ( ftmp == NULL )
+	{
+#ifdef DEBUG_IP_FIREWALL
+		printk("ip_fw_ctl: malloc said no\n");
+#endif
+		return( ENOMEM );
+	}
+
+	memcpy(ftmp, frwl, len);
+	/*
+	 *	Allow the more recent "minimise cost" flag to be
+	 *	set. [Rob van Nieuwkerk]
+	 */
+	ftmp->fw_tosand |= 0x01;
+	ftmp->fw_tosxor &= 0xFE;
+	ftmp->fw_pcnt=0L;
+	ftmp->fw_bcnt=0L;
+
+	ftmp->fw_next = NULL;
+
+	cli();
+
+	/* Resolve the via-device name now; -1 marks a named device that
+	   is not currently present (ipfw_device_event fixes it up). */
+	if ((ftmp->fw_vianame)[0]) {
+		if (!(ftmp->fw_viadev = dev_get_by_name(ftmp->fw_vianame)))
+			ftmp->fw_viadev = (struct net_device *) -1;
+	} else
+		ftmp->fw_viadev = NULL;
+
+	/* Walk to the last rule and hook the new one after it. */
+	chtmp_prev=NULL;
+	for (chtmp=*chainptr;chtmp!=NULL;chtmp=chtmp->fw_next)
+		chtmp_prev=chtmp;
+
+	if (chtmp_prev)
+		chtmp_prev->fw_next=ftmp;
+	else
+		*chainptr=ftmp;
+	restore_flags(flags);
+	return(0);
+}
+
+/* Delete the first rule in *chainptr that matches *frwl on addresses,
+   masks, via address/name, flags and port list.  Returns 0 when a rule
+   was removed, positive EINVAL otherwise.  Runs with interrupts
+   disabled for the whole scan. */
+static int del_from_chain(struct ip_fw *volatile*chainptr, struct ip_fw *frwl)
+{
+	struct ip_fw *ftmp,*ltmp;
+	unsigned short tport1,tport2,tmpnum;
+	char matches,was_found;
+	unsigned long flags;
+
+	save_flags(flags);
+	cli();
+
+	ftmp=*chainptr;
+
+	if ( ftmp == NULL )
+	{
+#ifdef DEBUG_IP_FIREWALL
+		printk("ip_fw_ctl: chain is empty\n");
+#endif
+		restore_flags(flags);
+		return( EINVAL );
+	}
+
+	ltmp=NULL;
+	was_found=0;
+
+	while( !was_found && ftmp != NULL )
+	{
+		matches=1;
+		/* Compare address/mask/flag fields first... */
+		if (ftmp->fw_src.s_addr!=frwl->fw_src.s_addr
+		     ||  ftmp->fw_dst.s_addr!=frwl->fw_dst.s_addr
+		     ||  ftmp->fw_smsk.s_addr!=frwl->fw_smsk.s_addr
+		     ||  ftmp->fw_dmsk.s_addr!=frwl->fw_dmsk.s_addr
+		     ||  ftmp->fw_via.s_addr!=frwl->fw_via.s_addr
+		     ||  ftmp->fw_flg!=frwl->fw_flg)
+			matches=0;
+
+		/* ...then the combined src+dst port vectors... */
+		tport1=ftmp->fw_nsp+ftmp->fw_ndp;
+		tport2=frwl->fw_nsp+frwl->fw_ndp;
+		if (tport1!=tport2)
+			matches=0;
+		else if (tport1!=0)
+		{
+			for (tmpnum=0;tmpnum < tport1 && tmpnum < IP_FW_MAX_PORTS;tmpnum++)
+				if (ftmp->fw_pts[tmpnum]!=frwl->fw_pts[tmpnum])
+					matches=0;
+		}
+		/* ...and finally the via device name. */
+		if (strncmp(ftmp->fw_vianame, frwl->fw_vianame, IFNAMSIZ))
+			matches=0;
+		if(matches)
+		{
+			was_found=1;
+			if (ltmp)
+			{
+				ltmp->fw_next=ftmp->fw_next;
+				kfree_s(ftmp,sizeof(*ftmp));
+				ftmp=ltmp->fw_next;
+			}
+			else
+			{
+				*chainptr=ftmp->fw_next;
+				kfree_s(ftmp,sizeof(*ftmp));
+				ftmp=*chainptr;
+			}
+		}
+		else
+		{
+			ltmp = ftmp;
+			ftmp = ftmp->fw_next;
+		}
+	}
+	restore_flags(flags);
+	if (was_found)
+		return 0;
+	else
+		return(EINVAL);
+}
+
+#endif /* CONFIG_IP_ACCT || CONFIG_IP_FIREWALL */
+
+/* Validate a rule handed in from user space: correct size, no unknown
+   flag bits, no flags for features compiled out, sane range/port
+   counts.  Returns the rule pointer on success, NULL if malformed. */
+struct ip_fw *check_ipfw_struct(struct ip_fw *frwl, int len)
+{
+
+	if ( len != sizeof(struct ip_fw) )
+	{
+#ifdef DEBUG_IP_FIREWALL
+		printk("ip_fw_ctl: len=%d, want %d\n",len, sizeof(struct ip_fw));
+#endif
+		return(NULL);
+	}
+
+	if ( (frwl->fw_flg & ~IP_FW_F_MASK) != 0 )
+	{
+#ifdef DEBUG_IP_FIREWALL
+		printk("ip_fw_ctl: undefined flag bits set (flags=%x)\n",
+			frwl->fw_flg);
+#endif
+		return(NULL);
+	}
+
+#ifndef CONFIG_IP_TRANSPARENT_PROXY
+	/* Redirection requested but not compiled in. */
+	if (frwl->fw_flg & IP_FW_F_REDIR) {
+#ifdef DEBUG_IP_FIREWALL
+		printk("ip_fw_ctl: unsupported flag IP_FW_F_REDIR\n");
+#endif
+		return(NULL);
+	}
+#endif
+
+#ifndef CONFIG_IP_MASQUERADE
+	/* Masquerading requested but not compiled in. */
+	if (frwl->fw_flg & IP_FW_F_MASQ) {
+#ifdef DEBUG_IP_FIREWALL
+		printk("ip_fw_ctl: unsupported flag IP_FW_F_MASQ\n");
+#endif
+		return(NULL);
+	}
+#endif
+
+	/* A source/destination range needs at least two port entries. */
+	if ( (frwl->fw_flg & IP_FW_F_SRNG) && frwl->fw_nsp < 2 )
+	{
+#ifdef DEBUG_IP_FIREWALL
+		printk("ip_fw_ctl: src range set but fw_nsp=%d\n",
+			frwl->fw_nsp);
+#endif
+		return(NULL);
+	}
+
+	if ( (frwl->fw_flg & IP_FW_F_DRNG) && frwl->fw_ndp < 2 )
+	{
+#ifdef DEBUG_IP_FIREWALL
+		printk("ip_fw_ctl: dst range set but fw_ndp=%d\n",
+			frwl->fw_ndp);
+#endif
+		return(NULL);
+	}
+
+	/* A redirect rule reserves one extra slot for the target port. */
+	if ( frwl->fw_nsp + frwl->fw_ndp > (frwl->fw_flg & IP_FW_F_REDIR ? IP_FW_MAX_PORTS - 1 : IP_FW_MAX_PORTS) )
+	{
+#ifdef DEBUG_IP_FIREWALL
+		printk("ip_fw_ctl: too many ports (%d+%d)\n",
+			frwl->fw_nsp,frwl->fw_ndp);
+#endif
+		return(NULL);
+	}
+
+	return frwl;
+}
+
+
+
+
+#ifdef CONFIG_IP_ACCT
+
+/* Control entry point for the accounting chain: flush, zero counters,
+   or insert/append/delete a (validated) rule.  Returns 0 or a
+   positive errno value. */
+int ip_acct_ctl(int stage, void *m, int len)
+{
+	if ( stage == IP_ACCT_FLUSH )
+	{
+		free_fw_chain(&ip_acct_chain);
+		return(0);
+	}
+	if ( stage == IP_ACCT_ZERO )
+	{
+		zero_fw_chain(ip_acct_chain);
+		return(0);
+	}
+	if ( stage == IP_ACCT_INSERT || stage == IP_ACCT_APPEND ||
+	     stage == IP_ACCT_DELETE )
+	{
+		struct ip_fw *frwl;
+
+		/* Reject malformed rules from user space. */
+		if (!(frwl=check_ipfw_struct(m,len)))
+			return (EINVAL);
+
+		switch (stage)
+		{
+			case IP_ACCT_INSERT:
+				return( insert_in_chain(&ip_acct_chain,frwl,len));
+			case IP_ACCT_APPEND:
+				return( append_to_chain(&ip_acct_chain,frwl,len));
+			case IP_ACCT_DELETE:
+				return( del_from_chain(&ip_acct_chain,frwl));
+			default:
+				/*
+				 *	Should be panic but... (Why ??? - AC)
+				 */
+#ifdef DEBUG_IP_FIREWALL
+				printk("ip_acct_ctl: unknown request %d\n",stage);
+#endif
+				return(EINVAL);
+		}
+	}
+#ifdef DEBUG_IP_FIREWALL
+	printk("ip_acct_ctl: unknown request %d\n",stage);
+#endif
+	return(EINVAL);
+}
+#endif
+
+#ifdef CONFIG_IP_FIREWALL
+/* Control entry point for the firewall chains.  `stage' packs a
+   command (flush/zero/policy/check/insert/append/delete/...) and a
+   chain type; `m'/`len' carry the user-supplied argument.  Returns 0
+   or a positive errno value. */
+int ip_fw_ctl(int stage, void *m, int len)
+{
+	int cmd, fwtype;
+
+	cmd = stage & IP_FW_COMMAND;
+	fwtype = (stage & IP_FW_TYPE) >> IP_FW_SHIFT;
+
+	if ( cmd == IP_FW_FLUSH )
+	{
+		free_fw_chain(chains[fwtype]);
+		return(0);
+	}
+
+	if ( cmd == IP_FW_ZERO )
+	{
+		zero_fw_chain(*chains[fwtype]);
+		return(0);
+	}
+
+	if ( cmd == IP_FW_POLICY )
+	{
+		int *tmp_policy_ptr;
+		tmp_policy_ptr=(int *)m;
+		*policies[fwtype] = *tmp_policy_ptr;
+		return 0;
+	}
+
+	if ( cmd == IP_FW_CHECK )
+	{
+		/* User-level "what would happen to this packet" query:
+		   run the supplied packet through the chain and map the
+		   verdict onto distinct errno values. */
+		struct net_device *viadev;
+		struct ip_fwpkt *ipfwp;
+		struct iphdr *ip;
+
+		if ( len != sizeof(struct ip_fwpkt) )
+		{
+#ifdef DEBUG_IP_FIREWALL
+			printk("ip_fw_ctl: length=%d, expected %d\n",
+				len, sizeof(struct ip_fwpkt));
+#endif
+			return( EINVAL );
+		}
+
+		ipfwp = (struct ip_fwpkt *)m;
+		ip = &(ipfwp->fwp_iph);
+
+		/* NOTE(review): dev_get_by_name presumably takes a device
+		   reference that is never dropped here -- verify whether a
+		   dev_put is needed on this kernel. */
+		if ( !(viadev = dev_get_by_name(ipfwp->fwp_vianame)) ) {
+#ifdef DEBUG_IP_FIREWALL
+			printk("ip_fw_ctl: invalid device \"%s\"\n", ipfwp->fwp_vianame);
+#endif
+			return(EINVAL);
+		} else if ( ip->ihl != sizeof(struct iphdr) / sizeof(int)) {
+			/* Only option-less headers are accepted here. */
+#ifdef DEBUG_IP_FIREWALL
+			printk("ip_fw_ctl: ip->ihl=%d, want %d\n",ip->ihl,
+				sizeof(struct iphdr)/sizeof(int));
+#endif
+			return(EINVAL);
+		}
+
+		switch (ip_fw_chk(ip, viadev, NULL, *chains[fwtype],
+				*policies[fwtype], IP_FW_MODE_CHK))
+		{
+			case FW_ACCEPT:
+				return(0);
+			case FW_REDIRECT:
+				return(ECONNABORTED);
+			case FW_MASQUERADE:
+				return(ECONNRESET);
+			case FW_REJECT:
+				return(ECONNREFUSED);
+			default: /* FW_BLOCK */
+				return(ETIMEDOUT);
+		}
+	}
+
+	if ( cmd == IP_FW_MASQ_TIMEOUTS )
+		return ip_fw_masq_timeouts(m, len);
+
+/*
+ *	Here we really working hard-adding new elements
+ *	to blocking/forwarding chains or deleting 'em
+ */
+
+	if ( cmd == IP_FW_INSERT || cmd == IP_FW_APPEND || cmd == IP_FW_DELETE )
+	{
+		struct ip_fw *frwl;
+		/* NOTE(review): this inner declaration shadows the outer
+		   fwtype and recomputes the same value. */
+		int fwtype;
+
+		frwl=check_ipfw_struct(m,len);
+		if (frwl==NULL)
+			return (EINVAL);
+		fwtype = (stage & IP_FW_TYPE) >> IP_FW_SHIFT;
+
+		switch (cmd)
+		{
+			case IP_FW_INSERT:
+				return(insert_in_chain(chains[fwtype],frwl,len));
+			case IP_FW_APPEND:
+				return(append_to_chain(chains[fwtype],frwl,len));
+			case IP_FW_DELETE:
+				return(del_from_chain(chains[fwtype],frwl));
+			default:
+				/*
+				 *	Should be panic but... (Why are BSD people panic obsessed ??)
+				 */
+#ifdef DEBUG_IP_FIREWALL
+				printk("ip_fw_ctl: unknown request %d\n",stage);
+#endif
+				return(EINVAL);
+		}
+	}
+
+#ifdef DEBUG_IP_FIREWALL
+	printk("ip_fw_ctl: unknown request %d\n",stage);
+#endif
+	return(ENOPROTOOPT);
+}
+#endif /* CONFIG_IP_FIREWALL */
+
+#ifdef CONFIG_PROC_FS
+#if defined(CONFIG_IP_FIREWALL) || defined(CONFIG_IP_ACCT)
+
+/* /proc read handler: list the rules of one chain (selected by
+   `stage') into `buffer', honouring the procfs offset/length
+   windowing.  When `reset' is set, packet/byte counters are zeroed
+   as each rule is emitted.  Runs with interrupts disabled while
+   walking the chain. */
+static int ip_chain_procinfo(int stage, char *buffer, char **start,
+			     off_t offset, int length, int reset)
+{
+	off_t pos=0, begin=0;
+	struct ip_fw *i;
+	unsigned long flags;
+	int len, p;
+	int last_len = 0;
+
+
+	switch(stage)
+	{
+#ifdef CONFIG_IP_FIREWALL
+		case IP_FW_IN:
+			i = ip_fw_in_chain;
+			len=sprintf(buffer, "IP firewall input rules, default %d\n",
+				ip_fw_in_policy);
+			break;
+		case IP_FW_OUT:
+			i = ip_fw_out_chain;
+			len=sprintf(buffer, "IP firewall output rules, default %d\n",
+				ip_fw_out_policy);
+			break;
+		case IP_FW_FWD:
+			i = ip_fw_fwd_chain;
+			len=sprintf(buffer, "IP firewall forward rules, default %d\n",
+				ip_fw_fwd_policy);
+			break;
+#endif
+#ifdef CONFIG_IP_ACCT
+		case IP_FW_ACCT:
+			i = ip_acct_chain;
+			len=sprintf(buffer,"IP accounting rules\n");
+			break;
+#endif
+		default:
+			/* this should never be reached, but safety first... */
+			i = NULL;
+			len=0;
+			break;
+	}
+
+	save_flags(flags);
+	cli();
+
+	while(i!=NULL)
+	{
+		len+=sprintf(buffer+len,"%08X/%08X->%08X/%08X %.16s %08X %X ",
+			ntohl(i->fw_src.s_addr),ntohl(i->fw_smsk.s_addr),
+			ntohl(i->fw_dst.s_addr),ntohl(i->fw_dmsk.s_addr),
+			(i->fw_vianame)[0] ? i->fw_vianame : "-",
+			ntohl(i->fw_via.s_addr), i->fw_flg);
+		/* 10 is enough for a 32 bit box but the counters are 64bit on
+		   the Alpha and Ultrapenguin */
+		len+=sprintf(buffer+len,"%u %u %-20lu %-20lu",
+			i->fw_nsp,i->fw_ndp, i->fw_pcnt,i->fw_bcnt);
+		for (p = 0; p < IP_FW_MAX_PORTS; p++)
+			len+=sprintf(buffer+len, " %u", i->fw_pts[p]);
+		len+=sprintf(buffer+len, " A%02X X%02X", i->fw_tosand, i->fw_tosxor);
+		buffer[len++]='\n';
+		buffer[len]='\0';
+		pos=begin+len;
+		if(pos<offset)
+		{
+			/* Not yet inside the requested window: restart the
+			   byte count at the current position. */
+			len=0;
+			begin=pos;
+		}
+		else if(pos>offset+length)
+		{
+			/* Past the window: drop the partially emitted rule. */
+			len = last_len;
+			break;
+		}
+		else if(reset)
+		{
+			/* This needs to be done at this specific place! */
+			i->fw_pcnt=0L;
+			i->fw_bcnt=0L;
+		}
+		last_len = len;
+		i=i->fw_next;
+	}
+	restore_flags(flags);
+	*start=buffer+(offset-begin);
+	len-=(offset-begin);
+	if(len>length)
+		len=length;
+	return len;
+}
+#endif
+
+#ifdef CONFIG_IP_ACCT
+
+/* /proc wrapper for the accounting chain.  On kernels >= 2.3.29 the
+   procfs read callback lost its `reset' argument, so read-and-reset
+   is no longer possible there. */
+static int ip_acct_procinfo(char *buffer, char **start, off_t offset,
+			    int length
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,29)
+			    , int reset
+#endif
+	)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,29)
+	/* FIXME: No more `atomic' read and reset.  Wonderful 8-( --RR */
+	int reset = 0;
+#endif
+	return ip_chain_procinfo(IP_FW_ACCT, buffer,start, offset,length,
+				 reset);
+}
+
+#endif
+
+#ifdef CONFIG_IP_FIREWALL
+
+/* /proc wrapper for the input chain (see ip_acct_procinfo for the
+   `reset' compatibility note). */
+static int ip_fw_in_procinfo(char *buffer, char **start, off_t offset,
+			     int length
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,29)
+			     , int reset
+#endif
+	)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,29)
+	/* FIXME: No more `atomic' read and reset.  Wonderful 8-( --RR */
+	int reset = 0;
+#endif
+	return ip_chain_procinfo(IP_FW_IN, buffer,start,offset,length,
+				 reset);
+}
+
+/* /proc wrapper for the output chain (see ip_acct_procinfo for the
+   `reset' compatibility note). */
+static int ip_fw_out_procinfo(char *buffer, char **start, off_t offset,
+			      int length
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,29)
+			      , int reset
+#endif
+	)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,29)
+	/* FIXME: No more `atomic' read and reset.  Wonderful 8-( --RR */
+	int reset = 0;
+#endif
+	return ip_chain_procinfo(IP_FW_OUT, buffer,start,offset,length,
+				 reset);
+}
+
+/* /proc wrapper for the forward chain (see ip_acct_procinfo for the
+   `reset' compatibility note). */
+static int ip_fw_fwd_procinfo(char *buffer, char **start, off_t offset,
+			      int length
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,3,29)
+			      , int reset
+#endif
+	)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,29)
+	/* FIXME: No more `atomic' read and reset.  Wonderful 8-( --RR */
+	int reset = 0;
+#endif
+	return ip_chain_procinfo(IP_FW_FWD, buffer,start,offset,length,
+				 reset);
+}
+#endif
+#endif
+
+
+#ifdef CONFIG_IP_FIREWALL
+/*
+ * Interface to the generic firewall chains.
+ */
+
+/* Generic-firewall hook: check an incoming packet against the input
+   chain.  `arg' is passed through as the redirect-port pointer. */
+int ipfw_input_check(struct firewall_ops *this, int pf,
+		     struct net_device *dev, void *phdr, void *arg,
+		     struct sk_buff **pskb)
+{
+	return ip_fw_chk(phdr, dev, arg, ip_fw_in_chain, ip_fw_in_policy,
+			 IP_FW_MODE_FW);
+}
+
+/* Generic-firewall hook: check an outgoing packet against the output
+   chain. */
+int ipfw_output_check(struct firewall_ops *this, int pf,
+		      struct net_device *dev, void *phdr, void *arg,
+		      struct sk_buff **pskb)
+{
+	return ip_fw_chk(phdr, dev, arg, ip_fw_out_chain, ip_fw_out_policy,
+			 IP_FW_MODE_FW);
+}
+
+/* Generic-firewall hook: check a forwarded packet against the forward
+   chain. */
+int ipfw_forward_check(struct firewall_ops *this, int pf,
+		       struct net_device *dev, void *phdr, void *arg,
+		       struct sk_buff **pskb)
+{
+	return ip_fw_chk(phdr, dev, arg, ip_fw_fwd_chain, ip_fw_fwd_policy,
+			 IP_FW_MODE_FW);
+}
+
+#ifdef CONFIG_IP_ACCT
+/* Accounting hook for incoming packets: counts only, never blocks
+   (ip_fw_chk returns 0 in accounting modes). */
+int ipfw_acct_in(struct firewall_ops *this, int pf, struct net_device *dev,
+		 void *phdr, void *arg, struct sk_buff **pskb)
+{
+	return ip_fw_chk(phdr,dev,NULL,ip_acct_chain,0,IP_FW_MODE_ACCT_IN);
+}
+
+/* Accounting hook for outgoing packets: counts only, never blocks. */
+int ipfw_acct_out(struct firewall_ops *this, int pf, struct net_device *dev,
+		  void *phdr, void *arg, struct sk_buff **pskb)
+{
+	return ip_fw_chk(phdr,dev,NULL,ip_acct_chain,0,IP_FW_MODE_ACCT_OUT);
+}
+#endif
+
+/* Hook table handed to the generic firewall layer.  Positional
+   initializer: first slot unused, then forward/input/output checks;
+   the last two slots are the accounting hooks (NULL when accounting
+   is compiled out). */
+struct firewall_ops ipfw_ops=
+{
+	NULL,
+	ipfw_forward_check,
+	ipfw_input_check,
+	ipfw_output_check,
+#ifdef CONFIG_IP_ACCT
+	ipfw_acct_in,
+	ipfw_acct_out
+#else
+	NULL,
+	NULL
+#endif
+};
+
+#endif
+
+#if defined(CONFIG_IP_ACCT) || defined(CONFIG_IP_FIREWALL)
+
+/* Netdevice notifier: keep the cached fw_viadev pointers in every
+   chain in sync with device up/down events.  A rule whose named
+   device goes away gets the -1 sentinel (never matches); when the
+   device reappears the pointer is restored.  Runs with interrupts
+   disabled while scanning the chains. */
+int ipfw_device_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct net_device *dev=ptr;
+	char *devname = dev->name;
+	unsigned long flags;
+	struct ip_fw *fw;
+	int chn;
+
+	save_flags(flags);
+	cli();
+
+	if (event == NETDEV_UP) {
+		for (chn = 0; chn < IP_FW_CHAINS; chn++)
+			for (fw = *chains[chn]; fw; fw = fw->fw_next)
+				if ((fw->fw_vianame)[0] && !strncmp(devname,
+						fw->fw_vianame, IFNAMSIZ))
+					fw->fw_viadev = dev;
+	} else if (event == NETDEV_DOWN) {
+		for (chn = 0; chn < IP_FW_CHAINS; chn++)
+			for (fw = *chains[chn]; fw; fw = fw->fw_next)
+				/* we could compare just the pointers ... */
+				if ((fw->fw_vianame)[0] && !strncmp(devname,
+						fw->fw_vianame, IFNAMSIZ))
+					fw->fw_viadev = (struct net_device*)-1;
+	}
+
+	restore_flags(flags);
+	return NOTIFY_DONE;
+}
+
+/* Notifier block for device up/down events (handler, next, priority). */
+static struct notifier_block ipfw_dev_notifier={
+	ipfw_device_event,
+	NULL,
+	0
+};
+
+#endif
+
+/* Register (init != 0) or unregister (init == 0) the ipfwadm
+ * compatibility layer: generic firewall hooks, /proc entries, the
+ * netdevice notifier and the optional netlink reporting socket.
+ * Returns 0 or a negative error from register_firewall().
+ *
+ * Fixes over the original (which mirrors the checked ipchains
+ * variant): the result of netlink_kernel_create() is now checked
+ * instead of being silently ignored, and the cleanup path no longer
+ * dereferences ipfwsk when the socket was never created.
+ */
+int ipfw_init_or_cleanup(int init)
+{
+	int ret = 0;
+
+	if (!init)
+		goto cleanup;
+
+	ret = register_firewall(PF_INET, &ipfw_ops);
+	if (ret < 0)
+		goto cleanup_nothing;
+
+#ifdef CONFIG_IP_ACCT
+	proc_net_create("ip_acct", S_IFREG | S_IRUGO | S_IWUSR, ip_acct_procinfo);
+#endif
+	proc_net_create("ip_input", S_IFREG | S_IRUGO | S_IWUSR, ip_fw_in_procinfo);
+	proc_net_create("ip_output", S_IFREG | S_IRUGO | S_IWUSR, ip_fw_out_procinfo);
+	proc_net_create("ip_forward", S_IFREG | S_IRUGO | S_IWUSR, ip_fw_fwd_procinfo);
+
+	/* Register for device up/down reports */
+	register_netdevice_notifier(&ipfw_dev_notifier);
+
+#ifdef CONFIG_IP_FIREWALL_NETLINK
+	/* Failure is non-fatal: the firewall still works, only the
+	   netlink packet reporting is unavailable. */
+	ipfwsk = netlink_kernel_create(NETLINK_FIREWALL, NULL);
+	if (ipfwsk == NULL)
+		printk(KERN_WARNING "ipfw: cannot create netlink socket, packet reporting disabled\n");
+#endif
+	return ret;
+
+ cleanup:
+#ifdef CONFIG_IP_FIREWALL_NETLINK
+	if (ipfwsk != NULL) {
+		sock_release(ipfwsk->socket);
+		ipfwsk = NULL;
+	}
+#endif
+	unregister_netdevice_notifier(&ipfw_dev_notifier);
+
+#ifdef CONFIG_IP_ACCT
+	proc_net_remove("ip_acct");
+#endif
+	proc_net_remove("ip_input");
+	proc_net_remove("ip_output");
+	proc_net_remove("ip_forward");
+
+	free_fw_chain(chains[IP_FW_FWD]);
+	free_fw_chain(chains[IP_FW_IN]);
+	free_fw_chain(chains[IP_FW_OUT]);
+	free_fw_chain(chains[IP_FW_ACCT]);
+
+	unregister_firewall(PF_INET, &ipfw_ops);
+
+ cleanup_nothing:
+	return ret;
+}
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
new file mode 100644
index 000000000..6e69d6a90
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -0,0 +1,368 @@
+/*
+ * This is a module which is used for logging packets.
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/spinlock.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+struct in_device;
+#include <net/route.h>
+#include <linux/netfilter_ipv4/ipt_LOG.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
/* Minimal view of an ESP header: only the SPI is ever printed. */
struct esphdr {
	__u32 spi;
}; /* FIXME evil kludge */

/* Make init and cleanup non-static, so gcc doesn't warn about unused,
   but don't export the symbols */
EXPORT_NO_SYMBOLS;

/* Use lock to serialize, so printks don't overlap */
static spinlock_t log_lock = SPIN_LOCK_UNLOCKED;
+
+/* One level of recursion won't kill us */
/* Print a one-line description of the IP packet at `iph' (total
 * length `len' bytes) via printk.  The caller holds log_lock and has
 * already printed the log prefix; no trailing newline is emitted.
 *
 * When `recurse' is non-zero, the IP header embedded in an ICMP error
 * is dumped once, in brackets, with recursion disabled for that call.
 * The running "Max length" comments bound the total line length (see
 * the summary at the bottom of the function). */
static void dump_packet(const struct ipt_log_info *info,
			struct iphdr *iph, unsigned int len, int recurse)
{
	/* Transport header starts iph->ihl 32-bit words into the packet. */
	void *protoh = (u_int32_t *)iph + iph->ihl;
	unsigned int datalen = len - iph->ihl * 4;

	/* Important fields:
	 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
	/* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
	printk("SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ",
	       (ntohl(iph->saddr)>>24)&0xFF,
	       (ntohl(iph->saddr)>>16)&0xFF,
	       (ntohl(iph->saddr)>>8)&0xFF,
	       (ntohl(iph->saddr))&0xFF,
	       (ntohl(iph->daddr)>>24)&0xFF,
	       (ntohl(iph->daddr)>>16)&0xFF,
	       (ntohl(iph->daddr)>>8)&0xFF,
	       (ntohl(iph->daddr))&0xFF);

	/* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
	printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
	       ntohs(iph->tot_len), iph->tos & IPTOS_TOS_MASK,
	       iph->tos & IPTOS_PREC_MASK, iph->ttl, ntohs(iph->id));

	/* Max length: 6 "CE DF MF " */
	if (ntohs(iph->frag_off) & IP_CE)
		printk("CE ");
	if (ntohs(iph->frag_off) & IP_DF)
		printk("DF ");
	if (ntohs(iph->frag_off) & IP_MF)
		printk("MF ");

	/* Max length: 11 "FRAG:65535 " */
	if (ntohs(iph->frag_off) & IP_OFFSET)
		printk("FRAG:%u ", ntohs(iph->frag_off) & IP_OFFSET);

	/* Raw hex dump of any IP options, if requested. */
	if ((info->logflags & IPT_LOG_IPOPT)
	    && iph->ihl * 4 != sizeof(struct iphdr)) {
		unsigned int i;

		/* Max length: 127 "OPT (" 15*4*2chars ") " */
		printk("OPT (");
		for (i = sizeof(struct iphdr); i < iph->ihl * 4; i++)
			printk("%02X", ((u_int8_t *)iph)[i]);
		printk(") ");
	}

	switch (iph->protocol) {
	case IPPROTO_TCP: {
		struct tcphdr *tcph = protoh;

		/* Max length: 10 "PROTO=TCP " */
		printk("PROTO=TCP ");

		/* Non-head fragments carry no transport header. */
		if (ntohs(iph->frag_off) & IP_OFFSET)
			break;

		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
		if (datalen < sizeof (*tcph)) {
			printk("INCOMPLETE [%u bytes] ", datalen);
			break;
		}

		/* Max length: 20 "SPT=65535 DPT=65535 " */
		printk("SPT=%u DPT=%u ",
		       ntohs(tcph->source), ntohs(tcph->dest));
		/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
		if (info->logflags & IPT_LOG_TCPSEQ)
			printk("SEQ=%u ACK=%u ",
			       ntohl(tcph->seq), ntohl(tcph->ack_seq));
		/* Max length: 13 "WINDOW=65535 " */
		printk("WINDOW=%u ", ntohs(tcph->window));
		/* Max length: 9 "RES=0x3F " */
		printk("RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(tcph) & TCP_RESERVED_BITS) >> 22));
		/* Max length: 36 "URG ACK PSH RST SYN FIN " */
		if (tcph->urg)
			printk("URG ");
		if (tcph->ack)
			printk("ACK ");
		if (tcph->psh)
			printk("PSH ");
		if (tcph->rst)
			printk("RST ");
		if (tcph->syn)
			printk("SYN ");
		if (tcph->fin)
			printk("FIN ");
		/* Max length: 11 "URGP=65535 " */
		printk("URGP=%u ", ntohs(tcph->urg_ptr));

		/* Raw hex dump of any TCP options, if requested. */
		if ((info->logflags & IPT_LOG_TCPOPT)
		    && tcph->doff * 4 != sizeof(struct tcphdr)) {
			unsigned int i;

			/* Max length: 127 "OPT (" 15*4*2chars ") " */
			printk("OPT (");
			for (i =sizeof(struct tcphdr); i < tcph->doff * 4; i++)
				printk("%02X", ((u_int8_t *)tcph)[i]);
			printk(") ");
		}
		break;
	}
	case IPPROTO_UDP: {
		struct udphdr *udph = protoh;

		/* Max length: 10 "PROTO=UDP " */
		printk("PROTO=UDP ");

		if (ntohs(iph->frag_off) & IP_OFFSET)
			break;

		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
		if (datalen < sizeof (*udph)) {
			printk("INCOMPLETE [%u bytes] ", datalen);
			break;
		}

		/* Max length: 20 "SPT=65535 DPT=65535 " */
		printk("SPT=%u DPT=%u LEN=%u ",
		       ntohs(udph->source), ntohs(udph->dest),
		       ntohs(udph->len));
		break;
	}
	case IPPROTO_ICMP: {
		struct icmphdr *icmph = protoh;
		/* Minimum ICMP payload (indexed by type) that must be
		   present before the per-type fields can be decoded;
		   0 means "no extra requirement". */
		static size_t required_len[NR_ICMP_TYPES+1]
			= { [ICMP_ECHOREPLY] = 4,
			    [ICMP_DEST_UNREACH]
			    = 8 + sizeof(struct iphdr) + 8,
			    [ICMP_SOURCE_QUENCH]
			    = 8 + sizeof(struct iphdr) + 8,
			    [ICMP_REDIRECT]
			    = 8 + sizeof(struct iphdr) + 8,
			    [ICMP_ECHO] = 4,
			    [ICMP_TIME_EXCEEDED]
			    = 8 + sizeof(struct iphdr) + 8,
			    [ICMP_PARAMETERPROB]
			    = 8 + sizeof(struct iphdr) + 8,
			    [ICMP_TIMESTAMP] = 20,
			    [ICMP_TIMESTAMPREPLY] = 20,
			    [ICMP_ADDRESS] = 12,
			    [ICMP_ADDRESSREPLY] = 12 };

		/* Max length: 11 "PROTO=ICMP " */
		printk("PROTO=ICMP ");

		if (ntohs(iph->frag_off) & IP_OFFSET)
			break;

		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
		if (datalen < 4) {
			printk("INCOMPLETE [%u bytes] ", datalen);
			break;
		}

		/* Max length: 18 "TYPE=255 CODE=255 " */
		printk("TYPE=%u CODE=%u ", icmph->type, icmph->code);

		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
		if (icmph->type <= NR_ICMP_TYPES
		    && required_len[icmph->type]
		    && datalen < required_len[icmph->type]) {
			printk("INCOMPLETE [%u bytes] ", datalen);
			break;
		}

		switch (icmph->type) {
		case ICMP_ECHOREPLY:
		case ICMP_ECHO:
			/* Max length: 19 "ID=65535 SEQ=65535 " */
			printk("ID=%u SEQ=%u ",
			       ntohs(icmph->un.echo.id),
			       ntohs(icmph->un.echo.sequence));
			break;

		case ICMP_PARAMETERPROB:
			/* Max length: 14 "PARAMETER=255 " */
			printk("PARAMETER=%u ",
			       ntohl(icmph->un.gateway) >> 24);
			break;
		case ICMP_REDIRECT:
			/* Max length: 24 "GATEWAY=255.255.255.255 " */
			printk("GATEWAY=%u.%u.%u.%u ",
			       (ntohl(icmph->un.gateway)>>24)&0xFF,
			       (ntohl(icmph->un.gateway)>>16)&0xFF,
			       (ntohl(icmph->un.gateway)>>8)&0xFF,
			       (ntohl(icmph->un.gateway))&0xFF);
			/* Fall through */
		case ICMP_DEST_UNREACH:
		case ICMP_SOURCE_QUENCH:
		case ICMP_TIME_EXCEEDED:
			/* Dump the offending packet's header, once.
			   Max length: 3+maxlen */
			if (recurse) {
				printk("[");
				dump_packet(info,
					    (struct iphdr *)(icmph + 1),
					    datalen-sizeof(struct iphdr),
					    0);
				printk("] ");
			}

			/* Max length: 10 "MTU=65535 " */
			if (icmph->type == ICMP_DEST_UNREACH
			    && icmph->code == ICMP_FRAG_NEEDED)
				printk("MTU=%u ", ntohs(icmph->un.frag.mtu));
		}
		break;
	}
	/* Max Length */
	case IPPROTO_AH:
	case IPPROTO_ESP: {
		struct esphdr *esph = protoh;
		int esp= (iph->protocol==IPPROTO_ESP);

		/* Max length: 10 "PROTO=ESP " */
		printk("PROTO=%s ",esp? "ESP" : "AH");

		if (ntohs(iph->frag_off) & IP_OFFSET)
			break;

		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
		if (datalen < sizeof (*esph)) {
			printk("INCOMPLETE [%u bytes] ", datalen);
			break;
		}

		/* Length: 15 "SPI=0xF1234567 " */
		printk("SPI=0x%x ", ntohl(esph->spi) );
		break;
	}
	/* Max length: 10 "PROTO 255 " */
	default:
		printk("PROTO=%u ", iph->protocol);
	}

	/* Proto    Max log string length */
	/* IP:      40+46+6+11+127 = 230 */
	/* TCP:     10+max(25,20+30+13+9+36+11+127) = 256 */
	/* UDP:     10+max(25,20) = 35 */
	/* ICMP:    11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
	/* ESP:     10+max(25)+15 = 50 */
	/* AH:      9+max(25)+15 = 49 */
	/* unknown: 10 */

	/* (ICMP allows recursion one level deep) */
	/* maxlen =  IP + ICMP +  IP + max(TCP,UDP,ICMP,unknown) */
	/* maxlen = 230+ 91  + 230 + 256 = 807 */
}
+
/* LOG target entry point: describe *pskb on the kernel log at the
 * rule's syslog level, then let the packet continue unmodified
 * (IPT_CONTINUE).  log_lock serializes the many small printks so
 * concurrent rule hits don't interleave their output. */
static unsigned int
ipt_log_target(struct sk_buff **pskb,
	       unsigned int hooknum,
	       const struct net_device *in,
	       const struct net_device *out,
	       const void *targinfo,
	       void *userinfo)
{
	struct iphdr *iph = (*pskb)->nh.iph;
	const struct ipt_log_info *loginfo = targinfo;
	char level_string[4] = "< >";

	/* Build the "<N>" printk level prefix (level clamped to 0-7). */
	level_string[1] = '0' + (loginfo->level % 8);
	spin_lock_bh(&log_lock);
	printk(level_string);
	printk("%sIN=%s OUT=%s ",
	       loginfo->prefix,
	       in ? in->name : "",
	       out ? out->name : "");
	if (in && !out) {
		/* MAC logging for input chain only. */
		printk("MAC=");
		if ((*pskb)->dev && (*pskb)->dev->hard_header_len) {
			int i;
			unsigned char *p = (*pskb)->mac.raw;
			/* Hex-dump the link-level header, colon-separated,
			   trailing space after the last byte. */
			for (i = 0; i < (*pskb)->dev->hard_header_len; i++,p++)
				printk("%02x%c", *p,
				       i==(*pskb)->dev->hard_header_len - 1
				       ? ' ':':');
		}
	}

	dump_packet(loginfo, iph, (*pskb)->len, 1);
	printk("\n");
	spin_unlock_bh(&log_lock);

	return IPT_CONTINUE;
}
+
+static int ipt_log_checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ const struct ipt_log_info *loginfo = targinfo;
+
+ if (targinfosize != IPT_ALIGN(sizeof(struct ipt_log_info))) {
+ DEBUGP("LOG: targinfosize %u != %u\n",
+ targinfosize, IPT_ALIGN(sizeof(struct ipt_log_info)));
+ return 0;
+ }
+
+ if (loginfo->level >= 8) {
+ DEBUGP("LOG: level %u >= 8\n", loginfo->level);
+ return 0;
+ }
+
+ if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') {
+ DEBUGP("LOG: prefix term %i\n",
+ loginfo->prefix[sizeof(loginfo->prefix)-1]);
+ return 0;
+ }
+
+ return 1;
+}
+
+static struct ipt_target ipt_log_reg
+= { { NULL, NULL }, "LOG", ipt_log_target, ipt_log_checkentry, THIS_MODULE };
+
+static int __init init(void)
+{
+ if (ipt_register_target(&ipt_log_reg))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_target(&ipt_log_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c
new file mode 100644
index 000000000..32906eefe
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_MARK.c
@@ -0,0 +1,68 @@
+/* This is a module which is used for setting the NFMARK field of an skb. */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_MARK.h>
+
+EXPORT_NO_SYMBOLS;
+
+static unsigned int
+target(struct sk_buff **pskb,
+ unsigned int hooknum,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *targinfo,
+ void *userinfo)
+{
+ const struct ipt_mark_target_info *markinfo = targinfo;
+
+ if((*pskb)->nfmark != markinfo->mark) {
+ (*pskb)->nfmark = markinfo->mark;
+ (*pskb)->nfcache |= NFC_ALTERED;
+ }
+ return IPT_CONTINUE;
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) {
+ printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n",
+ targinfosize,
+ IPT_ALIGN(sizeof(struct ipt_mark_target_info)));
+ return 0;
+ }
+
+ if (strcmp(tablename, "mangle") != 0) {
+ printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
+ return 0;
+ }
+
+ return 1;
+}
+
/* Target registration record: name "MARK" as used by iptables(8). */
static struct ipt_target ipt_mark_reg
= { { NULL, NULL }, "MARK", target, checkentry, THIS_MODULE };

/* Module init: register the MARK target with ip_tables. */
static int __init init(void)
{
	if (ipt_register_target(&ipt_mark_reg))
		return -EINVAL;

	return 0;
}

/* Module unload: remove the target again. */
static void __exit fini(void)
{
	ipt_unregister_target(&ipt_mark_reg);
}

module_init(init);
module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
new file mode 100644
index 000000000..9f94f8f44
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -0,0 +1,171 @@
+/* Masquerade. Simple mapping which alters range to a local IP address
+ (depending on route). */
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <net/protocol.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+EXPORT_NO_SYMBOLS;
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* Lock protects masq region inside conntrack */
+static DECLARE_RWLOCK(masq_lock);
+
+/* FIXME: Multiple targets. --RR */
+static int
+masquerade_check(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ const struct ip_nat_multi_range *mr = targinfo;
+
+ if (targinfosize != IPT_ALIGN(sizeof(*mr))) {
+ DEBUGP("masquerade_check: size %u != %u.\n",
+ targinfosize, sizeof(*mr));
+ return 0;
+ }
+ if (hook_mask & ~(1 << NF_IP_POST_ROUTING)) {
+ DEBUGP("masquerade_check: bad hooks %x.\n", hook_mask);
+ return 0;
+ }
+ if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
+ DEBUGP("masquerade_check: bad MAP_IPS.\n");
+ return 0;
+ }
+ if (mr->rangesize != 1) {
+ DEBUGP("masquerade_check: bad rangesize %u.\n", mr->rangesize);
+ return 0;
+ }
+ return 1;
+}
+
+static unsigned int
+masquerade_target(struct sk_buff **pskb,
+ unsigned int hooknum,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *targinfo,
+ void *userinfo)
+{
+ struct ip_conntrack *ct;
+ enum ip_conntrack_info ctinfo;
+ const struct ip_nat_range *r;
+ struct ip_nat_multi_range newrange;
+ u_int32_t newsrc;
+ struct rtable *rt;
+
+ IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING);
+
+ /* FIXME: For the moment, don't do local packets, breaks
+ testsuite for 2.3.49 --RR */
+ if ((*pskb)->sk)
+ return NF_ACCEPT;
+
+ ct = ip_conntrack_get(*pskb, &ctinfo);
+ IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW
+ || ctinfo == IP_CT_RELATED));
+
+ r = targinfo;
+
+ if (ip_route_output(&rt, (*pskb)->nh.iph->daddr,
+ 0,
+ RT_TOS((*pskb)->nh.iph->tos)|RTO_CONN,
+ out->ifindex) != 0) {
+ /* Shouldn't happen */
+ printk("MASQUERADE: No route: Rusty's brain broke!\n");
+ return NF_DROP;
+ }
+
+ newsrc = rt->rt_src;
+ DEBUGP("newsrc = %u.%u.%u.%u\n", IP_PARTS(newsrc));
+ ip_rt_put(rt);
+
+ WRITE_LOCK(&masq_lock);
+ ct->nat.masq_index = out->ifindex;
+ WRITE_UNLOCK(&masq_lock);
+
+ /* Transfer from original range. */
+ newrange = ((struct ip_nat_multi_range)
+ { 1, { { r->flags | IP_NAT_RANGE_MAP_IPS,
+ newsrc, newsrc,
+ r->min, r->max } } });
+
+ /* Hand modified range to generic setup. */
+ return ip_nat_setup_info(ct, &newrange, hooknum);
+}
+
+static inline int
+device_cmp(const struct ip_conntrack *i, void *ifindex)
+{
+ int ret;
+
+ READ_LOCK(&masq_lock);
+ ret = (i->nat.masq_index == (int)(long)ifindex);
+ READ_UNLOCK(&masq_lock);
+
+ return ret;
+}
+
/* Netdevice notifier callback: when an interface goes down, forget
 * every masqueraded connection that was using it — its address (and
 * therefore the NAT mapping) is no longer valid. */
int masq_device_event(struct notifier_block *this,
		      unsigned long event,
		      void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_DOWN) {
		/* Device was downed. Search entire table for
		   conntracks which were associated with that device,
		   and forget them. */
		IP_NF_ASSERT(dev->ifindex != 0);

		ip_ct_selective_cleanup(device_cmp, (void *)(long)dev->ifindex);
	}

	return NOTIFY_DONE;
}
+
/* Notifier registration record for NETDEV_DOWN handling. */
static struct notifier_block masq_dev_notifier = {
	masq_device_event,	/* notifier_call */
	NULL,			/* next */
	0			/* priority */
};

/* Target registration record: name "MASQUERADE" as used by iptables. */
static struct ipt_target masquerade
= { { NULL, NULL }, "MASQUERADE", masquerade_target, masquerade_check,
    THIS_MODULE };

/* Module init: register the target, then (only on success) hook the
   netdevice notifier so downed interfaces flush their mappings. */
static int __init init(void)
{
	int ret;

	ret = ipt_register_target(&masquerade);

	if (ret == 0) {
		/* Register for device down reports */
		register_netdevice_notifier(&masq_dev_notifier);
	}

	return ret;
}

/* Module unload: undo both registrations from init. */
static void __exit fini(void)
{
	ipt_unregister_target(&masquerade);
	unregister_netdevice_notifier(&masq_dev_notifier);
}

module_init(init);
module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_MIRROR.c b/net/ipv4/netfilter/ipt_MIRROR.c
new file mode 100644
index 000000000..9dec181c1
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_MIRROR.c
@@ -0,0 +1,131 @@
+/*
+ This is a module which is used for resending packets with inverted src and dst.
+
+ Based on code from: ip_nat_dumb.c,v 1.9 1999/08/20
+ and various sources.
+
+ Copyright (C) 2000 Emmanuel Roger <winfield@freegates.be>
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software Foundation,
+ Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netdevice.h>
+#include <linux/route.h>
+struct in_device;
+#include <net/route.h>
+EXPORT_NO_SYMBOLS;
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static int route_mirror(struct sk_buff *skb)
+{
+ struct iphdr *iph = skb->nh.iph;
+ struct rtable *rt;
+
+ if (ip_route_output(&rt, iph->daddr, iph->saddr,
+ RT_TOS(iph->tos) | RTO_CONN,
+ 0)) {
+ return -EINVAL;
+ }
+ /* check if the interface we are living by is the same as the one we arrived on */
+
+ if (skb->rx_dev != rt->u.dst.dev) {
+ /* Drop old route. */
+ dst_release(skb->dst);
+ skb->dst = &rt->u.dst;
+ return 0;
+ }
+ else return -EINVAL;
+}
+
+static int
+ip_rewrite(struct sk_buff *skb)
+{
+ struct iphdr *iph = skb->nh.iph;
+ u32 odaddr = iph->saddr;
+ u32 osaddr = iph->daddr;
+
+ skb->nfcache |= NFC_ALTERED;
+
+ /* Rewrite IP header */
+ iph->daddr = odaddr;
+ iph->saddr = osaddr;
+
+ return 0;
+}
+
+
/* MIRROR target: swap src/dst and send the packet back where it came
 * from.  Returns NF_STOLEN when the packet was re-sent (netfilter no
 * longer owns it), NF_DROP otherwise.  Note the addresses are swapped
 * before routing is attempted; that's harmless because a routing
 * failure drops the packet anyway. */
static unsigned int ipt_mirror_target(struct sk_buff **pskb,
				      unsigned int hooknum,
				      const struct net_device *in,
				      const struct net_device *out,
				      const void *targinfo,
				      void *userinfo)
{
	if ((*pskb)->dst != NULL) {
		if (!ip_rewrite(*pskb) && !route_mirror(*pskb)) {
			ip_send(*pskb);
			return NF_STOLEN;
		}
	}
	return NF_DROP;
}
+
+static int ipt_mirror_checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ /* Only on INPUT, FORWARD or PRE_ROUTING, otherwise loop danger. */
+ if (hook_mask & ~((1 << NF_IP_PRE_ROUTING)
+ | (1 << NF_IP_FORWARD)
+ | (1 << NF_IP_LOCAL_IN))) {
+ DEBUGP("MIRROR: bad hook\n");
+ return 0;
+ }
+
+ if (targinfosize != IPT_ALIGN(0)) {
+ DEBUGP("MIRROR: targinfosize %u != 0\n", targinfosize);
+ return 0;
+ }
+
+ return 1;
+}
+
/* Target registration record: name "MIRROR" as used by iptables(8). */
static struct ipt_target ipt_mirror_reg
= { { NULL, NULL }, "MIRROR", ipt_mirror_target, ipt_mirror_checkentry,
    THIS_MODULE };

/* Module init: register the MIRROR target, propagating any error. */
static int __init init(void)
{
	return ipt_register_target(&ipt_mirror_reg);
}

/* Module unload: remove the target again. */
static void __exit fini(void)
{
	ipt_unregister_target(&ipt_mirror_reg);
}

module_init(init);
module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
new file mode 100644
index 000000000..690d3a8a1
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -0,0 +1,104 @@
+/* Redirect. Simple mapping which alters dst to a local IP address. */
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <linux/inetdevice.h>
+#include <net/protocol.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+
+EXPORT_NO_SYMBOLS;
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* FIXME: Take multiple ranges --RR */
+static int
+redirect_check(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ const struct ip_nat_multi_range *mr = targinfo;
+
+ if (targinfosize != IPT_ALIGN(sizeof(*mr))) {
+ DEBUGP("redirect_check: size %u.\n", targinfosize);
+ return 0;
+ }
+ if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))) {
+ DEBUGP("redirect_check: bad hooks %x.\n", hook_mask);
+ return 0;
+ }
+ if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
+ DEBUGP("redirect_check: bad MAP_IPS.\n");
+ return 0;
+ }
+ if (mr->rangesize != 1) {
+ DEBUGP("redirect_check: bad rangesize %u.\n", mr->rangesize);
+ return 0;
+ }
+ return 1;
+}
+
+static unsigned int
+redirect_target(struct sk_buff **pskb,
+ unsigned int hooknum,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *targinfo,
+ void *userinfo)
+{
+ struct ip_conntrack *ct;
+ enum ip_conntrack_info ctinfo;
+ u_int32_t newdst;
+ const struct ip_nat_range *r = targinfo;
+ struct ip_nat_multi_range newrange;
+
+ IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
+ || hooknum == NF_IP_LOCAL_OUT);
+
+ ct = ip_conntrack_get(*pskb, &ctinfo);
+ IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
+
+ /* Local packets: make them go to loopback */
+ if (hooknum == NF_IP_LOCAL_OUT)
+ newdst = htonl(0x7F000001);
+ else
+ /* Grab first address on interface. */
+ newdst = (((struct in_device *)(*pskb)->dev->ip_ptr)
+ ->ifa_list->ifa_local);
+
+ /* Transfer from original range. */
+ newrange = ((struct ip_nat_multi_range)
+ { 1, { { r->flags | IP_NAT_RANGE_MAP_IPS,
+ newdst, newdst,
+ r->min, r->max } } });
+
+ /* Hand modified range to generic setup. */
+ return ip_nat_setup_info(ct, &newrange, hooknum);
+}
+
/* Target registration record: name "REDIRECT" as used by iptables. */
static struct ipt_target redirect_reg
= { { NULL, NULL }, "REDIRECT", redirect_target, redirect_check, THIS_MODULE };

/* Module init: register the REDIRECT target, propagating any error. */
static int __init init(void)
{
	return ipt_register_target(&redirect_reg);
}

/* Module unload: remove the target again. */
static void __exit fini(void)
{
	ipt_unregister_target(&redirect_reg);
}

module_init(init);
module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
new file mode 100644
index 000000000..b183e822c
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -0,0 +1,145 @@
+/*
+ * This is a module which is used for rejecting packets.
+ * Added support for customized reject packets (Jozsef Kadlecsik).
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+struct in_device;
+#include <net/route.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_REJECT.h>
+EXPORT_NO_SYMBOLS;
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
/* REJECT target: answer the packet according to the configured method
 * (ICMP unreachable variants, a faked ICMP echo reply, or a TCP RST),
 * then drop it.  Always returns NF_DROP. */
static unsigned int reject(struct sk_buff **pskb,
			   unsigned int hooknum,
			   const struct net_device *in,
			   const struct net_device *out,
			   const void *targinfo,
			   void *userinfo)
{
	const struct ipt_reject_info *reject = targinfo;

	switch (reject->with) {
	case IPT_ICMP_NET_UNREACHABLE:
		icmp_send(*pskb, ICMP_DEST_UNREACH, ICMP_NET_UNREACH, 0);
		break;
	case IPT_ICMP_HOST_UNREACHABLE:
		icmp_send(*pskb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
		break;
	case IPT_ICMP_PROT_UNREACHABLE:
		icmp_send(*pskb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);
		break;
	case IPT_ICMP_PORT_UNREACHABLE:
		icmp_send(*pskb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
		break;
	case IPT_ICMP_ECHOREPLY: {
		/* Transport header sits ihl 32-bit words into the packet. */
		struct icmphdr *icmph = (struct icmphdr *)
			((u_int32_t *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl);
		unsigned int datalen = (*pskb)->len - (*pskb)->nh.iph->ihl * 4;

		/* Not non-head frags, or truncated */
		if (((ntohs((*pskb)->nh.iph->frag_off) & IP_OFFSET) == 0)
		    && datalen >= 4) {
			/* Usually I don't like cut & pasting code,
                           but dammit, my party is starting in 45
                           mins! --RR */
			struct icmp_bxm icmp_param;

			/* Echo the request back as an ECHOREPLY with
			   the original payload (id/seq preserved in
			   the copied header). */
			icmp_param.icmph=*icmph;
			icmp_param.icmph.type=ICMP_ECHOREPLY;
			icmp_param.data_ptr=(icmph+1);
			icmp_param.data_len=datalen;
			icmp_reply(&icmp_param, *pskb);
		}
	}
	break;
	case IPT_TCP_RESET:
		tcp_v4_send_reset(*pskb);
		break;
	}

	return NF_DROP;
}
+
+static inline int find_ping_match(const struct ipt_entry_match *m)
+{
+ const struct ipt_icmp *icmpinfo = (const struct ipt_icmp *)m->data;
+
+ if (strcmp(m->u.match->name, "icmp") == 0
+ && icmpinfo->type == ICMP_ECHO
+ && !(icmpinfo->invflags & IPT_ICMP_INV))
+ return 1;
+
+ return 0;
+}
+
+static int check(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ const struct ipt_reject_info *rejinfo = targinfo;
+
+ if (targinfosize != IPT_ALIGN(sizeof(struct ipt_icmp))) {
+ DEBUGP("REJECT: targinfosize %u != 0\n", targinfosize);
+ return 0;
+ }
+
+ /* Only allow these for packet filtering. */
+ if ((hook_mask & ~((1 << NF_IP_LOCAL_IN)
+ | (1 << NF_IP_FORWARD)
+ | (1 << NF_IP_LOCAL_OUT))) != 0) {
+ DEBUGP("REJECT: bad hook mask %X\n", hook_mask);
+ return 0;
+ }
+
+ if (rejinfo->with == IPT_ICMP_ECHOREPLY) {
+ /* Must specify that it's an ICMP ping packet. */
+ if (e->ip.proto != IPPROTO_ICMP
+ || (e->ip.invflags & IPT_INV_PROTO)) {
+ DEBUGP("REJECT: ECHOREPLY illegal for non-icmp\n");
+ return 0;
+ }
+ /* Must contain ICMP match. */
+ if (IPT_MATCH_ITERATE(e, find_ping_match) == 0) {
+ DEBUGP("REJECT: ECHOREPLY illegal for non-ping\n");
+ return 0;
+ }
+ } else if (rejinfo->with == IPT_TCP_RESET) {
+ if (e->ip.proto != IPPROTO_TCP
+ || (e->ip.invflags & IPT_INV_PROTO)) {
+ DEBUGP("REJECT: TCP_RESET illegal for non-tcp\n");
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
/* Target registration record: name "REJECT" as used by iptables(8). */
static struct ipt_target ipt_reject_reg
= { { NULL, NULL }, "REJECT", reject, check, THIS_MODULE };

/* Module init: register the REJECT target with ip_tables. */
static int __init init(void)
{
	if (ipt_register_target(&ipt_reject_reg))
		return -EINVAL;
	return 0;
}

/* Module unload: remove the target again. */
static void __exit fini(void)
{
	ipt_unregister_target(&ipt_reject_reg);
}

module_init(init);
module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c
new file mode 100644
index 000000000..fbfb4974f
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_TOS.c
@@ -0,0 +1,87 @@
+/* This is a module which is used for setting the TOS field of a packet. */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_TOS.h>
+
+EXPORT_NO_SYMBOLS;
+
+static unsigned int
+target(struct sk_buff **pskb,
+ unsigned int hooknum,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *targinfo,
+ void *userinfo)
+{
+ struct iphdr *iph = (*pskb)->nh.iph;
+ const struct ipt_tos_target_info *tosinfo = targinfo;
+
+ if ((iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) {
+ u_int8_t diffs[2];
+
+ diffs[0] = iph->tos;
+ iph->tos = (iph->tos & IPTOS_PREC_MASK) | tosinfo->tos;
+ diffs[1] = iph->tos;
+ iph->check = csum_fold(csum_partial((char *)diffs,
+ sizeof(diffs),
+ iph->check^0xFFFF));
+ (*pskb)->nfcache |= NFC_ALTERED;
+ }
+ return IPT_CONTINUE;
+}
+
+static int
+checkentry(const char *tablename,
+ const struct ipt_entry *e,
+ void *targinfo,
+ unsigned int targinfosize,
+ unsigned int hook_mask)
+{
+ const u_int8_t tos = ((struct ipt_tos_target_info *)targinfo)->tos;
+
+ if (targinfosize != IPT_ALIGN(sizeof(struct ipt_tos_target_info))) {
+ printk(KERN_WARNING "TOS: targinfosize %u != %Zu\n",
+ targinfosize,
+ IPT_ALIGN(sizeof(struct ipt_tos_target_info)));
+ return 0;
+ }
+
+ if (strcmp(tablename, "mangle") != 0) {
+ printk(KERN_WARNING "TOS: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
+ return 0;
+ }
+
+ if (tos != IPTOS_LOWDELAY
+ && tos != IPTOS_THROUGHPUT
+ && tos != IPTOS_RELIABILITY
+ && tos != IPTOS_MINCOST
+ && tos != IPTOS_NORMALSVC) {
+ printk(KERN_WARNING "TOS: bad tos value %#x\n", tos);
+ return 0;
+ }
+
+ return 1;
+}
+
/* Target registration record: name "TOS" as used by iptables(8). */
static struct ipt_target ipt_tos_reg
= { { NULL, NULL }, "TOS", target, checkentry, THIS_MODULE };

/* Module init: register the TOS target with ip_tables. */
static int __init init(void)
{
	if (ipt_register_target(&ipt_tos_reg))
		return -EINVAL;

	return 0;
}

/* Module unload: remove the target again. */
static void __exit fini(void)
{
	ipt_unregister_target(&ipt_tos_reg);
}

module_init(init);
module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_limit.c b/net/ipv4/netfilter/ipt_limit.c
new file mode 100644
index 000000000..3785ba371
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_limit.c
@@ -0,0 +1,144 @@
+/* Kernel module to control the rate
+ *
+ * Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr>
+ * Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr>
+ *
+ * 2 September 1999: Changed from the target RATE to the match
+ * `limit', removed logging. Did I mention that
+ * Alexey is a fucking genius?
+ * Rusty Russell (rusty@rustcorp.com.au). */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_limit.h>
+EXPORT_NO_SYMBOLS;
+
+#define IP_PARTS_NATIVE(n) \
+(unsigned int)((n)>>24)&0xFF, \
+(unsigned int)((n)>>16)&0xFF, \
+(unsigned int)((n)>>8)&0xFF, \
+(unsigned int)((n)&0xFF)
+
+#define IP_PARTS(n) IP_PARTS_NATIVE(ntohl(n))
+
+/* The algorithm used is the Simple Token Bucket Filter (TBF)
+ * see net/sched/sch_tbf.c in the linux source tree
+ */
+
+static spinlock_t limit_lock = SPIN_LOCK_UNLOCKED;
+
+/* Rusty: This is my (non-mathematically-inclined) understanding of
+ this algorithm. The `average rate' in jiffies becomes your initial
+ amount of credit `credit' and the most credit you can ever have
+ `credit_cap'. The `peak rate' becomes the cost of passing the
+ test, `cost'.
+
+ `prev' tracks the last packet hit: you gain one credit per jiffy.
+ If you get credit balance more than this, the extra credit is
+ discarded. Every time the match passes, you lose `cost' credits;
+ if you don't have that many, the test fails.
+
+ See Alexey's formal explanation in net/sched/sch_tbf.c.
+
+ To avoid underflow, we multiply by 128 (ie. you get 128 credits per
+ jiffy). Hence a cost of 2^32-1, means one pass per 32768 seconds
+ at 1024HZ (or one every 9 hours). A cost of 1 means 12800 passes
+ per second at 100HZ. */
+
+#define CREDITS_PER_JIFFY 128
+
+static int
+ipt_limit_match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ const void *hdr,
+ u_int16_t datalen,
+ int *hotdrop)
+{
+ struct ipt_rateinfo *r = ((struct ipt_rateinfo *)matchinfo)->master;
+ unsigned long now = jiffies;
+
+ spin_lock_bh(&limit_lock);
+ r->credit += (now - xchg(&r->prev, now)) * CREDITS_PER_JIFFY;
+ if (r->credit > r->credit_cap)
+ r->credit = r->credit_cap;
+
+ if (r->credit >= r->cost) {
+ /* We're not limited. */
+ r->credit -= r->cost;
+ spin_unlock_bh(&limit_lock);
+ return 1;
+ }
+
+ spin_unlock_bh(&limit_lock);
+ return 0;
+}
+
+/* Precision saver. */
+static u_int32_t
+user2credits(u_int32_t user)
+{
+ /* If multiplying would overflow... */
+ if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
+ /* Divide first. */
+ return (user / IPT_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
+
+ return (user * HZ * CREDITS_PER_JIFFY) / IPT_LIMIT_SCALE;
+}
+
+static int
+ipt_limit_checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ struct ipt_rateinfo *r = matchinfo;
+
+ if (matchsize != IPT_ALIGN(sizeof(struct ipt_rateinfo)))
+ return 0;
+
+ /* Check for overflow. */
+ if (r->burst == 0
+ || user2credits(r->avg * r->burst) < user2credits(r->avg)) {
+ printk("Call rusty: overflow in ipt_limit: %u/%u\n",
+ r->avg, r->burst);
+ return 0;
+ }
+
+ /* User avg in seconds * IPT_LIMIT_SCALE: convert to jiffies *
+ 128. */
+ r->prev = jiffies;
+ r->credit = user2credits(r->avg * r->burst); /* Credits full. */
+ r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */
+ r->cost = user2credits(r->avg);
+
+ /* For SMP, we only want to use one set of counters. */
+ r->master = r;
+
+ return 1;
+}
+
/* Match registration record: name "limit" as used by iptables(8). */
static struct ipt_match ipt_limit_reg
= { { NULL, NULL }, "limit", ipt_limit_match, ipt_limit_checkentry,
    THIS_MODULE };

/* Module init: register the limit match with ip_tables. */
static int __init init(void)
{
	if (ipt_register_match(&ipt_limit_reg))
		return -EINVAL;
	return 0;
}

/* Module unload: remove the match again. */
static void __exit fini(void)
{
	ipt_unregister_match(&ipt_limit_reg);
}

module_init(init);
module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_mac.c b/net/ipv4/netfilter/ipt_mac.c
new file mode 100644
index 000000000..90dbec59d
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_mac.c
@@ -0,0 +1,63 @@
+/* Kernel module to match MAC address parameters. */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/if_ether.h>
+
+#include <linux/netfilter_ipv4/ipt_mac.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+EXPORT_NO_SYMBOLS;
+
+/* Match callback: true when the link-layer source MAC equals the
+   configured address, XORed with the invert flag. */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      const void *hdr,
+      u_int16_t datalen,
+      int *hotdrop)
+{
+	const struct ipt_mac_info *info = matchinfo;
+
+	/* Is mac pointer valid?
+	   NOTE(review): the upper bound `skb->head + skb->len - ETH_HLEN'
+	   mixes head (buffer start) with len (data length from skb->data);
+	   looks suspicious -- confirm against sk_buff layout. */
+	return (skb->mac.raw >= skb->head
+		&& skb->mac.raw < skb->head + skb->len - ETH_HLEN
+		/* If so, compare... */
+		&& ((memcmp(skb->mac.ethernet->h_source, info->srcaddr, ETH_ALEN)
+		    == 0) ^ info->invert));
+}
+
+/* Entry validation: a MAC header only exists on input paths, so the
+   match is restricted to PRE_ROUTING and LOCAL_IN. */
+static int
+ipt_mac_checkentry(const char *tablename,
+		   const struct ipt_ip *ip,
+		   void *matchinfo,
+		   unsigned int matchsize,
+		   unsigned int hook_mask)
+{
+	if (hook_mask
+	    & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN))) {
+		printk("ipt_mac: only valid for PRE_ROUTING or LOCAL_IN.\n");
+		return 0;
+	}
+
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_mac_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct ipt_match mac_match
+= { { NULL, NULL }, "mac", &match, &ipt_mac_checkentry, THIS_MODULE };
+
+static int __init init(void)
+{
+	return ipt_register_match(&mac_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&mac_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c
new file mode 100644
index 000000000..0d828fd20
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_mark.c
@@ -0,0 +1,52 @@
+/* Kernel module to match NFMARK values. */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter_ipv4/ipt_mark.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+EXPORT_NO_SYMBOLS;
+
+/* Match callback: true when (nfmark & mask) equals the configured
+   mark, XORed with the invert flag. */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      const void *hdr,
+      u_int16_t datalen,
+      int *hotdrop)
+{
+	const struct ipt_mark_info *info = matchinfo;
+
+	return ((skb->nfmark & info->mask) == info->mark) ^ info->invert;
+}
+
+/* Entry validation: only the struct size needs checking; the match is
+   valid at every hook. */
+static int
+checkentry(const char *tablename,
+	   const struct ipt_ip *ip,
+	   void *matchinfo,
+	   unsigned int matchsize,
+	   unsigned int hook_mask)
+{
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct ipt_match mark_match
+= { { NULL, NULL }, "mark", &match, &checkentry, THIS_MODULE };
+
+static int __init init(void)
+{
+	return ipt_register_match(&mark_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&mark_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_multiport.c b/net/ipv4/netfilter/ipt_multiport.c
new file mode 100644
index 000000000..08cc4a968
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_multiport.c
@@ -0,0 +1,102 @@
+/* Kernel module to match one of a list of TCP/UDP ports: ports are in
+ the same place so we can treat them as equal. */
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/udp.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter_ipv4/ipt_multiport.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+#if 0
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+EXPORT_NO_SYMBOLS;
+
+/* Returns 1 if the port is matched by the test, 0 otherwise. */
+static inline int
+ports_match(const u_int16_t *portlist, enum ipt_multiport_flags flags,
+ u_int8_t count, u_int16_t src, u_int16_t dst)
+{
+ unsigned int i;
+ for (i=0; i<count; i++) {
+ if (flags != IPT_MULTIPORT_DESTINATION
+ && portlist[i] == src)
+ return 1;
+
+ if (flags != IPT_MULTIPORT_SOURCE
+ && portlist[i] == dst)
+ return 1;
+ }
+
+ return 0;
+}
+
+static int
+match(const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const void *matchinfo,
+ int offset,
+ const void *hdr,
+ u_int16_t datalen,
+ int *hotdrop)
+{
+ const struct udphdr *udp = hdr;
+ const struct ipt_multiport *multiinfo = matchinfo;
+
+ /* Must be big enough to read ports. */
+ if (offset == 0 && datalen < sizeof(struct udphdr)) {
+ /* We've been asked to examine this packet, and we
+ can't. Hence, no choice but to drop. */
+ duprintf("ipt_multiport:"
+ " Dropping evil offset=0 tinygram.\n");
+ *hotdrop = 1;
+ return 0;
+ }
+
+ /* Must not be a fragment. */
+ return !offset
+ && ports_match(multiinfo->ports,
+ multiinfo->flags, multiinfo->count,
+ ntohs(udp->source), ntohs(udp->dest));
+}
+
+/* Called when user tries to insert an entry of this type. */
+static int
+checkentry(const char *tablename,
+ const struct ipt_ip *ip,
+ void *matchinfo,
+ unsigned int matchsize,
+ unsigned int hook_mask)
+{
+ const struct ipt_multiport *multiinfo = matchinfo;
+
+ /* Must specify proto == TCP/UDP, no unknown flags or bad count */
+ return (ip->proto == IPPROTO_TCP || ip->proto == IPPROTO_UDP)
+ && !(ip->flags & IPT_INV_PROTO)
+ && matchsize == IPT_ALIGN(sizeof(struct ipt_multiport))
+ && (multiinfo->flags == IPT_MULTIPORT_SOURCE
+ || multiinfo->flags == IPT_MULTIPORT_DESTINATION
+ || multiinfo->flags == IPT_MULTIPORT_EITHER)
+ && multiinfo->count <= IPT_MULTI_PORTS;
+}
+
+static struct ipt_match multiport_match
+= { { NULL, NULL }, "multiport", &match, &checkentry, THIS_MODULE };
+
+static int __init init(void)
+{
+ return ipt_register_match(&multiport_match);
+}
+
+static void __exit fini(void)
+{
+ ipt_unregister_match(&multiport_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
new file mode 100644
index 000000000..5438571d3
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_owner.c
@@ -0,0 +1,136 @@
+/* Kernel module to match various things tied to sockets associated with
+ locally generated outgoing packets.
+
+ (C)2000 Marc Boucher
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/file.h>
+#include <net/sock.h>
+
+#include <linux/netfilter_ipv4/ipt_owner.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+EXPORT_NO_SYMBOLS;
+
+/* True if the socket owning this skb is open in the fd table of the
+   task with the given pid.  Caller (match below) guarantees
+   skb->sk->socket->file is non-NULL.  Scans under tasklist_lock. */
+static int
+match_pid(const struct sk_buff *skb, pid_t pid)
+{
+	struct task_struct *p;
+	int i;
+
+	read_lock(&tasklist_lock);
+	p = find_task_by_pid(pid);
+	if(p && p->files) {
+		for (i=0; i < p->files->max_fds; i++) {
+			if (fcheck_task(p, i) == skb->sk->socket->file) {
+				read_unlock(&tasklist_lock);
+				return 1;
+			}
+		}
+	}
+	read_unlock(&tasklist_lock);
+	return 0;
+}
+
+/* As match_pid, but scans every task in the given session id.
+   NOTE(review): O(tasks * max_fds) under tasklist_lock per packet --
+   expensive on busy systems. */
+static int
+match_sid(const struct sk_buff *skb, pid_t sid)
+{
+	struct task_struct *p;
+	int i, found=0;
+
+	read_lock(&tasklist_lock);
+	for_each_task(p) {
+		if ((p->session != sid) || !p->files)
+			continue;
+
+		for (i=0; i < p->files->max_fds; i++) {
+			if (fcheck_task(p, i) == skb->sk->socket->file) {
+				found = 1;
+				break;
+			}
+		}
+		if(found)
+			break;
+	}
+	read_unlock(&tasklist_lock);
+
+	return found;
+}
+
+/* Match callback: each requested criterion (uid/gid/pid/sid of the
+   owning socket) must pass, each individually XORed with its invert
+   bit.  Packets without a local socket/file never match. */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      const void *hdr,
+      u_int16_t datalen,
+      int *hotdrop)
+{
+	const struct ipt_owner_info *info = matchinfo;
+
+	if (!skb->sk || !skb->sk->socket || !skb->sk->socket->file)
+		return 0;
+
+	if(info->match & IPT_OWNER_UID) {
+		if((skb->sk->socket->file->f_uid != info->uid) ^
+		    !!(info->invert & IPT_OWNER_UID))
+			return 0;
+	}
+
+	if(info->match & IPT_OWNER_GID) {
+		if((skb->sk->socket->file->f_gid != info->gid) ^
+		    !!(info->invert & IPT_OWNER_GID))
+			return 0;
+	}
+
+	if(info->match & IPT_OWNER_PID) {
+		if (!match_pid(skb, info->pid) ^
+		    !!(info->invert & IPT_OWNER_PID))
+			return 0;
+	}
+
+	if(info->match & IPT_OWNER_SID) {
+		if (!match_sid(skb, info->sid) ^
+		    !!(info->invert & IPT_OWNER_SID))
+			return 0;
+	}
+
+	return 1;
+}
+
+/* Entry validation: socket ownership only exists on locally generated
+   packets, hence the hook restriction. */
+static int
+checkentry(const char *tablename,
+	   const struct ipt_ip *ip,
+	   void *matchinfo,
+	   unsigned int matchsize,
+	   unsigned int hook_mask)
+{
+	if (hook_mask
+	    & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) {
+		printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n");
+		return 0;
+	}
+
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_owner_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct ipt_match owner_match
+= { { NULL, NULL }, "owner", &match, &checkentry, THIS_MODULE };
+
+static int __init init(void)
+{
+	return ipt_register_match(&owner_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&owner_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_state.c b/net/ipv4/netfilter/ipt_state.c
new file mode 100644
index 000000000..1baa54d62
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_state.c
@@ -0,0 +1,61 @@
+/* Kernel module to match connection tracking information.
+ * GPL (C) 1999 Rusty Russell (rusty@rustcorp.com.au).
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_state.h>
+EXPORT_NO_SYMBOLS;
+
+/* Match callback: map the packet's conntrack state to a bit and test
+   it against the user's statemask. */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      const void *hdr,
+      u_int16_t datalen,
+      int *hotdrop)
+{
+	const struct ipt_state_info *sinfo = matchinfo;
+	enum ip_conntrack_info ctinfo;
+	unsigned int statebit;
+
+	/* Cast drops const: ip_conntrack_get takes a non-const skb but
+	   only reads the nfct reference here. */
+	if (!ip_conntrack_get((struct sk_buff *)skb, &ctinfo))
+		statebit = IPT_STATE_INVALID;
+	else
+		statebit = IPT_STATE_BIT(ctinfo);
+
+	return (sinfo->statemask & statebit);
+}
+
+/* Entry validation: only the struct size is checked; valid at every
+   hook. */
+static int check(const char *tablename,
+		 const struct ipt_ip *ip,
+		 void *matchinfo,
+		 unsigned int matchsize,
+		 unsigned int hook_mask)
+{
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_state_info)))
+		return 0;
+
+	return 1;
+}
+
+/* Registration record for the "state" match. */
+static struct ipt_match state_match
+= { { NULL, NULL }, "state", &match, &check, THIS_MODULE };
+
+static int __init init(void)
+{
+	int ret;
+
+	/* Pin the conntrack module for as long as we are registered.
+	   Drop the reference again if registration fails; the original
+	   returned straight through and leaked the use count, leaving
+	   ip_conntrack unloadable forever. */
+	__MOD_INC_USE_COUNT(ip_conntrack_module);
+	ret = ipt_register_match(&state_match);
+	if (ret != 0)
+		__MOD_DEC_USE_COUNT(ip_conntrack_module);
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&state_match);
+	__MOD_DEC_USE_COUNT(ip_conntrack_module);
+}
diff --git a/net/ipv4/netfilter/ipt_tos.c b/net/ipv4/netfilter/ipt_tos.c
new file mode 100644
index 000000000..6da72b2d8
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_tos.c
@@ -0,0 +1,53 @@
+/* Kernel module to match TOS values. */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter_ipv4/ipt_tos.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+EXPORT_NO_SYMBOLS;
+
+/* Match callback: true when the IP header's TOS byte equals the
+   configured value, XORed with the invert flag. */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      const void *hdr,
+      u_int16_t datalen,
+      int *hotdrop)
+{
+	const struct ipt_tos_info *info = matchinfo;
+	const struct iphdr *iph = skb->nh.iph;
+
+	return (iph->tos == info->tos) ^ info->invert;
+}
+
+/* Entry validation: only the struct size is checked; valid at every
+   hook. */
+static int
+checkentry(const char *tablename,
+	   const struct ipt_ip *ip,
+	   void *matchinfo,
+	   unsigned int matchsize,
+	   unsigned int hook_mask)
+{
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_tos_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct ipt_match tos_match
+= { { NULL, NULL }, "tos", &match, &checkentry, THIS_MODULE };
+
+static int __init init(void)
+{
+	return ipt_register_match(&tos_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&tos_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_unclean.c b/net/ipv4/netfilter/ipt_unclean.c
new file mode 100644
index 000000000..056224a87
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_unclean.c
@@ -0,0 +1,576 @@
+/* Kernel module to match suspect packets. */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/icmp.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+EXPORT_NO_SYMBOLS;
+
+#define limpk(format, args...) \
+do { \
+ if (net_ratelimit()) \
+ printk("ipt_unclean: %s" format, \
+ embedded ? "(embedded packet) " : "" , ## args); \
+} while(0)
+
+enum icmp_error_status
+{
+ ICMP_MAY_BE_ERROR,
+ ICMP_IS_ERROR,
+ ICMP_NOT_ERROR
+};
+
+struct icmp_info
+{
+ size_t min_len, max_len;
+ enum icmp_error_status err;
+ u_int8_t min_code, max_code;
+};
+
+static int
+check_ip(struct iphdr *iph, size_t length, int embedded);
+
+/* ICMP-specific checks. */
+static int
+check_icmp(const struct icmphdr *icmph,
+ u_int16_t datalen,
+ unsigned int offset,
+ int more_frags,
+ int embedded)
+{
+ static struct icmp_info info[]
+ = { [ICMP_ECHOREPLY]
+ = { 8, 65536, ICMP_NOT_ERROR, 0, 0 },
+ [ICMP_DEST_UNREACH]
+ = { 8 + 28, 65536, ICMP_IS_ERROR, 0, 15 },
+ [ICMP_SOURCE_QUENCH]
+ = { 8 + 28, 65536, ICMP_IS_ERROR, 0, 0 },
+ [ICMP_REDIRECT]
+ = { 8 + 28, 65536, ICMP_IS_ERROR, 0, 3 },
+ [ICMP_ECHO]
+ = { 8, 65536, ICMP_NOT_ERROR, 0, 0 },
+ /* Router advertisement. */
+ [9]
+ = { 8, 8 + 255 * 8, ICMP_NOT_ERROR, 0, 0 },
+ /* Router solicitation. */
+ [10]
+ = { 8, 8, ICMP_NOT_ERROR, 0, 0 },
+ [ICMP_TIME_EXCEEDED]
+ = { 8 + 28, 65536, ICMP_IS_ERROR, 0, 1 },
+ [ICMP_PARAMETERPROB]
+ = { 8 + 28, 65536, ICMP_IS_ERROR, 0, 1 },
+ [ICMP_TIMESTAMP]
+ = { 20, 20, ICMP_NOT_ERROR, 0, 0 },
+ [ICMP_TIMESTAMPREPLY]
+ = { 20, 20, ICMP_NOT_ERROR, 0, 0 },
+ [ICMP_INFO_REQUEST]
+ = { 8, 65536, ICMP_NOT_ERROR, 0, 0 },
+ [ICMP_INFO_REPLY]
+ = { 8, 65536, ICMP_NOT_ERROR, 0, 0 },
+ [ICMP_ADDRESS]
+ = { 12, 12, ICMP_NOT_ERROR, 0, 0 },
+ [ICMP_ADDRESSREPLY]
+ = { 12, 12, ICMP_NOT_ERROR, 0, 0 } };
+
+ /* Can't do anything if it's a fragment. */
+ if (!offset)
+ return 1;
+
+ /* Must cover type and code. */
+ if (datalen < 2) {
+ limpk("ICMP len=%u too short\n", datalen);
+ return 0;
+ }
+
+ /* If not embedded. */
+ if (!embedded) {
+ /* Bad checksum? Don't print, just drop. */
+ if (!more_frags
+ && ip_compute_csum((unsigned char *) icmph, datalen) != 0)
+ return 0;
+
+ /* CHECK: Truncated ICMP (even if first fragment). */
+ if (icmph->type < sizeof(info)/sizeof(struct icmp_info)
+ && info[icmph->type].min_len != 0
+ && datalen < info[icmph->type].min_len) {
+ limpk("ICMP type %u len %u too short\n",
+ icmph->type, datalen);
+ return 0;
+ }
+
+ /* CHECK: Check within known error ICMPs. */
+ if (icmph->type < sizeof(info)/sizeof(struct icmp_info)
+ && info[icmph->type].err == ICMP_IS_ERROR) {
+ /* CHECK: Embedded packet must be at least
+ length of iph + 8 bytes. */
+ struct iphdr *inner = (void *)icmph + 8;
+
+ if (datalen - 8 < sizeof(struct iphdr)) {
+ limpk("ICMP error internal way too short\n");
+ return 0;
+ }
+ if (datalen - 8 < inner->ihl*4 + 8) {
+ limpk("ICMP error internal too short\n");
+ return 0;
+ }
+ if (!check_ip(inner, datalen - 8, 1))
+ return 0;
+ }
+ } else {
+ /* CHECK: Can't embed ICMP unless known non-error. */
+ if (icmph->type >= sizeof(info)/sizeof(struct icmp_info)
+ || info[icmph->type].err != ICMP_NOT_ERROR) {
+ limpk("ICMP type %u not embeddable\n",
+ icmph->type);
+ return 0;
+ }
+ }
+
+ /* CHECK: Invalid ICMP codes. */
+ if (icmph->type < sizeof(info)/sizeof(struct icmp_info)
+ && (icmph->code < info[icmph->type].min_code
+ || icmph->code > info[icmph->type].max_code)) {
+ limpk("ICMP type=%u code=%u\n",
+ icmph->type, icmph->code);
+ return 0;
+ }
+
+ /* CHECK: Above maximum length. */
+ if (icmph->type < sizeof(info)/sizeof(struct icmp_info)
+ && info[icmph->type].max_len != 0
+ && datalen > info[icmph->type].max_len) {
+ limpk("ICMP type=%u too long: %u bytes\n",
+ icmph->type, datalen);
+ return 0;
+ }
+
+ switch (icmph->type) {
+ case ICMP_PARAMETERPROB: {
+ /* CHECK: Problem param must be within error packet's
+ * IP header. */
+ struct iphdr *iph = (void *)icmph + 8;
+ u_int32_t arg = ntohl(icmph->un.gateway);
+
+ if (icmph->code == 0) {
+ if ((arg >> 24) >= iph->ihl*4) {
+ limpk("ICMP PARAMETERPROB ptr = %u\n",
+ ntohl(icmph->un.gateway) >> 24);
+ return 0;
+ }
+ arg &= 0x00FFFFFF;
+ }
+
+ /* CHECK: Rest must be zero. */
+ if (arg) {
+ limpk("ICMP PARAMETERPROB nonzero arg = %u\n",
+ arg);
+ return 0;
+ }
+ break;
+ }
+
+ case ICMP_TIME_EXCEEDED:
+ case ICMP_SOURCE_QUENCH:
+ /* CHECK: Unused must be zero. */
+ if (icmph->un.gateway != 0) {
+ limpk("ICMP type=%u unused = %u\n",
+ icmph->type, ntohl(icmph->un.gateway));
+ return 0;
+ }
+ break;
+ }
+
+ return 1;
+}
+
+/* UDP-specific checks. */
+static int
+check_udp(const struct iphdr *iph,
+ const struct udphdr *udph,
+ u_int16_t datalen,
+ unsigned int offset,
+ int more_frags,
+ int embedded)
+{
+ /* Can't do anything if it's a fragment. */
+ if (!offset)
+ return 1;
+
+ /* CHECK: Must cover UDP header. */
+ if (datalen < sizeof(struct udphdr)) {
+ limpk("UDP len=%u too short\n", datalen);
+ return 0;
+ }
+
+ /* Bad checksum? Don't print, just drop. */
+ /* FIXME: SRC ROUTE packets won't match checksum --RR */
+ if (!more_frags && !embedded
+ && csum_tcpudp_magic(iph->saddr, iph->daddr, datalen, IPPROTO_UDP,
+ csum_partial((char *)udph, datalen, 0)) != 0)
+ return 0;
+
+ /* CHECK: Ports can't be zero. */
+ if (!udph->source || !udph->dest) {
+ limpk("UDP zero ports %u/%u\n",
+ ntohs(udph->source), ntohs(udph->dest));
+ return 0;
+ }
+
+ if (!more_frags) {
+ if (!embedded) {
+ /* CHECK: UDP length must match. */
+ if (ntohs(udph->len) != datalen) {
+ limpk("UDP len too short %u vs %u\n",
+ ntohs(udph->len), datalen);
+ return 0;
+ }
+ } else {
+ /* CHECK: UDP length be >= this truncated pkt. */
+ if (ntohs(udph->len) < datalen) {
+ limpk("UDP len too long %u vs %u\n",
+ ntohs(udph->len), datalen);
+ return 0;
+ }
+ }
+ } else {
+ /* CHECK: UDP length must be > this frag's length. */
+ if (ntohs(udph->len) <= datalen) {
+ limpk("UDP fragment len too short %u vs %u\n",
+ ntohs(udph->len), datalen);
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+#define	TH_FIN	0x01
+#define	TH_SYN	0x02
+#define	TH_RST	0x04
+#define	TH_PUSH	0x08
+#define	TH_ACK	0x10
+#define	TH_URG	0x20
+
+/* TCP-specific checks.  Returns 1 if the TCP header looks sane,
+   0 to flag the packet as unclean. */
+static int
+check_tcp(const struct iphdr *iph,
+	  const struct tcphdr *tcph,
+	  u_int16_t datalen,
+	  unsigned int offset,
+	  int more_frags,
+	  int embedded)
+{
+	/* BUGFIX: options start at byte 20 *of the header*, and the
+	   index i below already starts at sizeof(struct tcphdr); the
+	   original `(tcph + 1)' skipped the header twice and scanned
+	   payload bytes instead of options. */
+	u_int8_t *opt = (u_int8_t *)tcph;
+	u_int8_t tcpflags;
+	int end_of_options = 0;
+	size_t i;
+
+	/* CHECK: Can't have offset=1: used to override TCP syn-checks. */
+	/* In fact, this is caught below (offset < 516). */
+
+	/* Can't do anything if it's a fragment: the TCP header only
+	   exists in the first fragment (offset == 0).  BUGFIX: the
+	   original tested `!offset'. */
+	if (offset)
+		return 1;
+
+	/* CHECK: Smaller than minimal TCP hdr. */
+	if (datalen < sizeof(struct tcphdr)) {
+		if (!embedded) {
+			limpk("Packet length %u < TCP header.\n", datalen);
+			return 0;
+		}
+		/* Must have ports available (datalen >= 8). */
+		/* CHECK: TCP ports inside ICMP error */
+		if (!tcph->source || !tcph->dest) {
+			limpk("Zero TCP ports %u/%u.\n",
+			      htons(tcph->source), htons(tcph->dest));
+			return 0;
+		}
+		return 1;
+	}
+
+	/* CHECK: Smaller than actual TCP hdr. */
+	if (datalen < tcph->doff * 4) {
+		if (!embedded) {
+			limpk("Packet length %u < actual TCP header.\n",
+			      datalen);
+			return 0;
+		} else
+			return 1;
+	}
+
+	/* Bad checksum?  Don't print, just drop. */
+	/* FIXME: SRC ROUTE packets won't match checksum --RR */
+	if (!more_frags && !embedded
+	    && csum_tcpudp_magic(iph->saddr, iph->daddr, datalen, IPPROTO_TCP,
+				 csum_partial((char *)tcph, datalen, 0)) != 0)
+		return 0;
+
+	/* CHECK: TCP ports non-zero */
+	if (!tcph->source || !tcph->dest) {
+		limpk("Zero TCP ports %u/%u.\n",
+		      htons(tcph->source), htons(tcph->dest));
+		return 0;
+	}
+
+	/* CHECK: TCP reserved bits zero. */
+	if(tcp_flag_word(tcph) & TCP_RESERVED_BITS) {
+		limpk("TCP reserved bits not zero\n");
+		return 0;
+	}
+
+	/* CHECK: TCP flags: only the combinations seen in legitimate
+	   traffic are allowed. */
+	tcpflags = ((u_int8_t *)tcph)[13];
+	if (tcpflags != TH_SYN
+	    && tcpflags != (TH_SYN|TH_ACK)
+	    && tcpflags != (TH_RST|TH_ACK)
+	    && tcpflags != (TH_RST|TH_ACK|TH_PUSH)
+	    && tcpflags != (TH_FIN|TH_ACK)
+	    && tcpflags != TH_ACK
+	    && tcpflags != (TH_ACK|TH_PUSH)
+	    && tcpflags != (TH_ACK|TH_URG)
+	    && tcpflags != (TH_ACK|TH_URG|TH_PUSH)
+	    && tcpflags != (TH_FIN|TH_ACK|TH_PUSH)
+	    && tcpflags != (TH_FIN|TH_ACK|TH_URG)
+	    && tcpflags != (TH_FIN|TH_ACK|TH_URG|TH_PUSH)) {
+		limpk("TCP flags bad: %u\n", tcpflags);
+		return 0;
+	}
+
+	/* Walk the options: EOL(0) and NOP(1) are single bytes, all
+	   other options carry a length octet at opt[i+1]. */
+	for (i = sizeof(struct tcphdr); i < tcph->doff * 4; ) {
+		switch (opt[i]) {
+		case 0:
+			end_of_options = 1;
+			i++;
+			break;
+		case 1:
+			i++;
+			break;
+		default:
+			/* CHECK: options after EOO. */
+			if (end_of_options) {
+				limpk("TCP option %u after end\n",
+				      opt[i]);
+				return 0;
+			}
+			/* CHECK: options at tail. */
+			else if (i+1 >= tcph->doff * 4) {
+				limpk("TCP option %u at tail\n",
+				      opt[i]);
+				return 0;
+			}
+			/* CHECK: zero-length options. */
+			else if (opt[i+1] == 0) {
+				limpk("TCP option %u 0 len\n",
+				      opt[i]);
+				return 0;
+			}
+			/* CHECK: oversize options.  BUGFIX: must be
+			   strict `>': an option ending exactly at the
+			   end of the header (e.g. a lone MSS option,
+			   doff=6) is legal. */
+			else if (opt[i+1] + i > tcph->doff * 4) {
+				limpk("TCP option %u at %Zu too long\n",
+				      (unsigned int) opt[i], i);
+				return 0;
+			}
+			/* BUGFIX: advance past the option; the original
+			   never incremented i here, looping forever on
+			   the first valid multi-byte option. */
+			else
+				i += opt[i+1];
+		}
+	}
+
+	return 1;
+}
+
+/* Returns 1 if ok */
+/* Standard IP checks. */
+static int
+check_ip(struct iphdr *iph, size_t length, int embedded)
+{
+ u_int8_t *opt = (u_int8_t *)(iph + 1);
+ int end_of_options = 0;
+ void *protoh;
+ size_t datalen;
+ unsigned int i;
+ unsigned int offset;
+
+ /* Should only happen for local outgoing raw-socket packets. */
+ /* CHECK: length >= ip header. */
+ if (length < sizeof(struct iphdr) || length < iph->ihl * 4) {
+ limpk("Packet length %Zu < IP header.\n", length);
+ return 0;
+ }
+
+ offset = ntohs(iph->frag_off) & IP_OFFSET;
+ protoh = (void *)iph + iph->ihl * 4;
+ datalen = length - iph->ihl * 4;
+
+ /* CHECK: Embedded fragment. */
+ if (embedded && offset) {
+ limpk("Embedded fragment.\n");
+ return 0;
+ }
+
+ for (i = sizeof(struct iphdr); i < iph->ihl * 4; ) {
+ switch (opt[i]) {
+ case 0:
+ end_of_options = 1;
+ i++;
+ break;
+ case 1:
+ i++;
+ break;
+ default:
+ /* CHECK: options after EOO. */
+ if (end_of_options) {
+ limpk("IP option %u after end\n",
+ opt[i]);
+ return 0;
+ }
+ /* CHECK: options at tail. */
+ else if (i+1 >= iph->ihl * 4) {
+ limpk("IP option %u at tail\n",
+ opt[i]);
+ return 0;
+ }
+ /* CHECK: zero-length options. */
+ else if (opt[i+1] == 0) {
+ limpk("IP option %u 0 len\n",
+ opt[i]);
+ return 0;
+ }
+ /* CHECK: oversize options. */
+ else if (opt[i+1] + i >= iph->ihl * 4) {
+ limpk("IP option %u at %u too long\n",
+ opt[i], i);
+ return 0;
+ }
+ }
+ }
+
+ /* Fragment checks. */
+
+ /* CHECK: More fragments, but doesn't fill 8-byte boundary. */
+ if ((ntohs(iph->frag_off) & IP_MF)
+ && (ntohs(iph->tot_len) % 8) != 0) {
+ limpk("Truncated fragment %u long.\n", ntohs(iph->tot_len));
+ return 0;
+ }
+
+ /* CHECK: Oversize fragment a-la Ping of Death. */
+ if (offset * 8 + datalen > 65535) {
+ limpk("Oversize fragment to %u.\n", offset * 8);
+ return 0;
+ }
+
+ /* CHECK: DF set and offset or MF set. */
+ if ((ntohs(iph->frag_off) & IP_DF)
+ && (offset || (ntohs(iph->frag_off) & IP_MF))) {
+ limpk("DF set and offset=%u, MF=%u.\n",
+ offset, ntohs(iph->frag_off) & IP_MF);
+ return 0;
+ }
+
+ /* CHECK: Zero-sized fragments. */
+ if ((offset || (ntohs(iph->frag_off) & IP_MF))
+ && datalen == 0) {
+ limpk("Zero size fragment offset=%u\n", offset);
+ return 0;
+ }
+
+ /* Note: we can have even middle fragments smaller than this:
+ consider a large packet passing through a 600MTU then
+ 576MTU link: this gives a fragment of 24 data bytes. But
+ everyone packs fragments largest first, hence a fragment
+ can't START before 576 - MAX_IP_HEADER_LEN. */
+
+ /* Used to be min-size 576: I recall Alan Cox saying ax25 goes
+ down to 128 (576 taken from RFC 791: All hosts must be
+ prepared to accept datagrams of up to 576 octets). Use 128
+ here. */
+#define MIN_LIKELY_MTU 128
+ /* CHECK: Min size of first frag = 128. */
+ if ((ntohs(iph->frag_off) & IP_MF)
+ && offset == 0
+ && ntohs(iph->tot_len) < MIN_LIKELY_MTU) {
+ limpk("First fragment size %u < %u\n", ntohs(iph->tot_len),
+ MIN_LIKELY_MTU);
+ return 0;
+ }
+
+ /* CHECK: Min offset of frag = 128 - 60 (max IP hdr len). */
+ if (offset && offset * 8 < MIN_LIKELY_MTU - 60) {
+ limpk("Fragment starts at %u < %u\n", offset * 8,
+ MIN_LIKELY_MTU-60);
+ return 0;
+ }
+
+ /* CHECK: Protocol specification non-zero. */
+ if (iph->protocol == 0) {
+ limpk("Zero protocol\n");
+ return 0;
+ }
+
+ /* Per-protocol checks. */
+ switch (iph->protocol) {
+ case IPPROTO_ICMP:
+ return check_icmp(protoh, datalen, offset,
+ (ntohs(iph->frag_off) & IP_MF),
+ embedded);
+
+ case IPPROTO_UDP:
+ return check_udp(iph, protoh, datalen, offset,
+ (ntohs(iph->frag_off) & IP_MF),
+ embedded);
+
+ case IPPROTO_TCP:
+ return check_tcp(iph, protoh, datalen, offset,
+ (ntohs(iph->frag_off) & IP_MF),
+ embedded);
+ default:
+ /* Ignorance is bliss. */
+ return 1;
+ }
+}
+
+/* Match callback: the "unclean" match fires when check_ip flags the
+   packet (hence the negation). */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      const void *hdr,
+      u_int16_t datalen,
+      int *hotdrop)
+{
+	return !check_ip(skb->nh.iph, skb->len, 0);
+}
+
+/* Called when user tries to insert an entry of this type.  The match
+   takes no options, so only a zero-size payload is accepted. */
+static int
+checkentry(const char *tablename,
+	   const struct ipt_ip *ip,
+	   void *matchinfo,
+	   unsigned int matchsize,
+	   unsigned int hook_mask)
+{
+	if (matchsize != IPT_ALIGN(0))
+		return 0;
+
+	return 1;
+}
+
+static struct ipt_match unclean_match
+= { { NULL, NULL }, "unclean", &match, &checkentry, THIS_MODULE };
+
+static int __init init(void)
+{
+	return ipt_register_match(&unclean_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&unclean_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
new file mode 100644
index 000000000..a10bb3682
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -0,0 +1,181 @@
+/*
+ * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+#define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT))
+
+/* Standard entry: a plain match-all rule followed by a standard
+   (verdict-carrying) target. */
+struct ipt_standard
+{
+	struct ipt_entry entry;
+	struct ipt_standard_target target;
+};
+
+/* Target used for the table's trailing error entry. */
+struct ipt_error_target
+{
+	struct ipt_entry_target target;
+	char errorname[IPT_FUNCTION_MAXNAMELEN];
+};
+
+struct ipt_error
+{
+	struct ipt_entry entry;
+	struct ipt_error_target target;
+};
+
+/* Built-in initial "filter" table: one ACCEPT policy rule per hook
+   (LOCAL_IN, FORWARD, LOCAL_OUT) plus a terminating ERROR entry.
+   The verdict encoding -NF_ACCEPT - 1 is the standard-target form of
+   an ACCEPT policy.  __initdata: discarded after init() copies it. */
+static struct
+{
+	struct ipt_replace repl;
+	struct ipt_standard entries[3];
+	struct ipt_error term;
+} initial_table __initdata
+= { { "filter", FILTER_VALID_HOOKS, 4,
+      sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
+      { [NF_IP_LOCAL_IN] 0,
+	[NF_IP_FORWARD] sizeof(struct ipt_standard),
+	[NF_IP_LOCAL_OUT] sizeof(struct ipt_standard) * 2 },
+      { [NF_IP_LOCAL_IN] 0,
+	[NF_IP_FORWARD] sizeof(struct ipt_standard),
+	[NF_IP_LOCAL_OUT] sizeof(struct ipt_standard) * 2 },
+      0, NULL, { } },
+    {
+	    /* LOCAL_IN */
+	    { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+		0,
+		sizeof(struct ipt_entry),
+		sizeof(struct ipt_standard),
+		0, { 0, 0 }, { } },
+	      { { sizeof(struct ipt_standard_target), { "" }, { } },
+		-NF_ACCEPT - 1 } },
+	    /* FORWARD */
+	    { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+		0,
+		sizeof(struct ipt_entry),
+		sizeof(struct ipt_standard),
+		0, { 0, 0 }, { } },
+	      { { sizeof(struct ipt_standard_target), { "" }, { } },
+		-NF_ACCEPT - 1 } },
+	    /* LOCAL_OUT */
+	    { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+		0,
+		sizeof(struct ipt_entry),
+		sizeof(struct ipt_standard),
+		0, { 0, 0 }, { } },
+	      { { sizeof(struct ipt_standard_target), { "" }, { } },
+		-NF_ACCEPT - 1 } }
+    },
+    /* ERROR */
+    { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+	0,
+	sizeof(struct ipt_entry),
+	sizeof(struct ipt_error),
+	0, { 0, 0 }, { } },
+      { { sizeof(struct ipt_error_target), { IPT_ERROR_TARGET },
+	  { } },
+	"ERROR"
+      }
+    }
+};
+
+/* The registered table object; points at the initial ruleset above. */
+static struct ipt_table packet_filter
+= { { NULL, NULL }, "filter", &initial_table.repl,
+    FILTER_VALID_HOOKS, RW_LOCK_UNLOCKED, NULL };
+
+/* The work comes in here from netfilter.c: run the packet through the
+   filter table for the given hook. */
+static unsigned int
+ipt_hook(unsigned int hook,
+	 struct sk_buff **pskb,
+	 const struct net_device *in,
+	 const struct net_device *out,
+	 int (*okfn)(struct sk_buff *))
+{
+	return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL);
+}
+
+/* LOCAL_OUT variant: raw sockets can emit packets with a bogus IP
+   header, so sanity-check the header length before table traversal. */
+static unsigned int
+ipt_local_out_hook(unsigned int hook,
+		   struct sk_buff **pskb,
+		   const struct net_device *in,
+		   const struct net_device *out,
+		   int (*okfn)(struct sk_buff *))
+{
+	/* root is playing with raw sockets. */
+	if ((*pskb)->len < sizeof(struct iphdr)
+	    || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
+		if (net_ratelimit())
+			printk("ipt_hook: happy cracking.\n");
+		return NF_ACCEPT;
+	}
+
+	return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL);
+}
+
+/* One hook registration per valid filter hook, all at filter priority. */
+static struct nf_hook_ops ipt_ops[]
+= { { { NULL, NULL }, ipt_hook, PF_INET, NF_IP_LOCAL_IN, NF_IP_PRI_FILTER },
+    { { NULL, NULL }, ipt_hook, PF_INET, NF_IP_FORWARD, NF_IP_PRI_FILTER },
+    { { NULL, NULL }, ipt_local_out_hook, PF_INET, NF_IP_LOCAL_OUT,
+	NF_IP_PRI_FILTER }
+};
+
+/* Default to no forward for security reasons. */
+static int forward = NF_DROP;
+MODULE_PARM(forward, "i");
+
+/* Register the table and its three hooks, unwinding in reverse order
+   on failure (goto-based cleanup). */
+static int __init init(void)
+{
+	int ret;
+
+	/* NOTE(review): the message says "0 or 1" but the range check
+	   admits any verdict up to NF_MAX_VERDICT -- confirm intended
+	   range of the `forward' module parameter. */
+	if (forward < 0 || forward > NF_MAX_VERDICT) {
+		printk("iptables forward must be 0 or 1\n");
+		return -EINVAL;
+	}
+
+	/* Entry 1 is the FORWARD hook */
+	initial_table.entries[1].target.verdict = -forward - 1;
+
+	/* Register table */
+	ret = ipt_register_table(&packet_filter);
+	if (ret < 0)
+		return ret;
+
+	/* Register hooks */
+	ret = nf_register_hook(&ipt_ops[0]);
+	if (ret < 0)
+		goto cleanup_table;
+
+	ret = nf_register_hook(&ipt_ops[1]);
+	if (ret < 0)
+		goto cleanup_hook0;
+
+	ret = nf_register_hook(&ipt_ops[2]);
+	if (ret < 0)
+		goto cleanup_hook1;
+
+	return ret;
+
+ cleanup_hook1:
+	nf_unregister_hook(&ipt_ops[1]);
+ cleanup_hook0:
+	nf_unregister_hook(&ipt_ops[0]);
+ cleanup_table:
+	ipt_unregister_table(&packet_filter);
+
+	return ret;
+}
+
+/* Unregister all hooks, then the table itself. */
+static void __exit fini(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++)
+		nf_unregister_hook(&ipt_ops[i]);
+
+	ipt_unregister_table(&packet_filter);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
new file mode 100644
index 000000000..ef506ca7a
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -0,0 +1,152 @@
+/*
+ * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+#define MANGLE_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))
+
+/* Standard entry. */
+struct ipt_standard
+{
+ struct ipt_entry entry;
+ struct ipt_standard_target target;
+};
+
+struct ipt_error_target
+{
+ struct ipt_entry_target target;
+ char errorname[IPT_FUNCTION_MAXNAMELEN];
+};
+
+struct ipt_error
+{
+ struct ipt_entry entry;
+ struct ipt_error_target target;
+};
+
+static struct
+{
+ struct ipt_replace repl;
+ struct ipt_standard entries[2];
+ struct ipt_error term;
+} initial_table __initdata
+= { { "mangle", MANGLE_VALID_HOOKS, 3,
+ sizeof(struct ipt_standard) * 2 + sizeof(struct ipt_error),
+ { [NF_IP_PRE_ROUTING] 0,
+ [NF_IP_LOCAL_OUT] sizeof(struct ipt_standard) },
+ { [NF_IP_PRE_ROUTING] 0,
+ [NF_IP_LOCAL_OUT] sizeof(struct ipt_standard) },
+ 0, NULL, { } },
+ {
+ /* PRE_ROUTING */
+ { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+ 0,
+ sizeof(struct ipt_entry),
+ sizeof(struct ipt_standard),
+ 0, { 0, 0 }, { } },
+ { { sizeof(struct ipt_standard_target), { "" }, { } },
+ -NF_ACCEPT - 1 } },
+ /* LOCAL_OUT */
+ { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+ 0,
+ sizeof(struct ipt_entry),
+ sizeof(struct ipt_standard),
+ 0, { 0, 0 }, { } },
+ { { sizeof(struct ipt_standard_target), { "" }, { } },
+ -NF_ACCEPT - 1 } }
+ },
+ /* ERROR */
+ { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+ 0,
+ sizeof(struct ipt_entry),
+ sizeof(struct ipt_error),
+ 0, { 0, 0 }, { } },
+ { { sizeof(struct ipt_error_target), { IPT_ERROR_TARGET },
+ { } },
+ "ERROR"
+ }
+ }
+};
+
+static struct ipt_table packet_mangler
+= { { NULL, NULL }, "mangle", &initial_table.repl,
+ MANGLE_VALID_HOOKS, RW_LOCK_UNLOCKED, NULL };
+
+/* The work comes in here from netfilter.c. */
+static unsigned int
+ipt_hook(unsigned int hook,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ return ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL);
+}
+
+static unsigned int
+ipt_local_out_hook(unsigned int hook,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ /* root is playing with raw sockets. */
+ if ((*pskb)->len < sizeof(struct iphdr)
+ || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
+ if (net_ratelimit())
+ printk("ipt_hook: happy cracking.\n");
+ return NF_ACCEPT;
+ }
+
+ return ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL);
+}
+
+static struct nf_hook_ops ipt_ops[]
+= { { { NULL, NULL }, ipt_hook, PF_INET, NF_IP_PRE_ROUTING, NF_IP_PRI_MANGLE },
+ { { NULL, NULL }, ipt_local_out_hook, PF_INET, NF_IP_LOCAL_OUT,
+ NF_IP_PRI_MANGLE }
+};
+
+static int __init init(void)
+{
+ int ret;
+
+ /* Register table */
+ ret = ipt_register_table(&packet_mangler);
+ if (ret < 0)
+ return ret;
+
+ /* Register hooks */
+ ret = nf_register_hook(&ipt_ops[0]);
+ if (ret < 0)
+ goto cleanup_table;
+
+ ret = nf_register_hook(&ipt_ops[1]);
+ if (ret < 0)
+ goto cleanup_hook0;
+
+ return ret;
+
+ cleanup_hook0:
+ nf_unregister_hook(&ipt_ops[0]);
+ cleanup_table:
+ ipt_unregister_table(&packet_mangler);
+
+ return ret;
+}
+
+static void __exit fini(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++)
+ nf_unregister_hook(&ipt_ops[i]);
+
+ ipt_unregister_table(&packet_mangler);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 4e649eded..c683f2f23 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -5,7 +5,7 @@
*
* ROUTE - implementation of the IP router.
*
- * Version: $Id: route.c,v 1.81 2000/02/09 11:16:42 davem Exp $
+ * Version: $Id: route.c,v 1.82 2000/03/17 14:41:52 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -1187,10 +1187,7 @@ ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
rth->rt_dst = daddr;
rth->key.tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
- if (skb->nfreason == NF_REASON_FOR_ROUTING)
- rth->key.fwmark = skb->nfmark;
- else
- rth->key.fwmark = 0;
+ rth->key.fwmark = skb->nfmark;
#endif
rth->key.src = saddr;
rth->rt_src = saddr;
@@ -1269,10 +1266,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
key.src = saddr;
key.tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
- if (skb->nfreason == NF_REASON_FOR_ROUTING)
- key.fwmark = skb->nfmark;
- else
- key.fwmark = 0;
+ key.fwmark = skb->nfmark;
#endif
key.iif = dev->ifindex;
key.oif = 0;
@@ -1395,10 +1389,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
rth->rt_dst = daddr;
rth->key.tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
- if (skb->nfreason == NF_REASON_FOR_ROUTING)
- rth->key.fwmark = skb->nfmark;
- else
- rth->key.fwmark = 0;
+ rth->key.fwmark = skb->nfmark;
#endif
rth->key.src = saddr;
rth->rt_src = saddr;
@@ -1473,10 +1464,7 @@ local_input:
rth->rt_dst = daddr;
rth->key.tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
- if (skb->nfreason == NF_REASON_FOR_ROUTING)
- rth->key.fwmark = skb->nfmark;
- else
- rth->key.fwmark = 0;
+ rth->key.fwmark = skb->nfmark;
#endif
rth->key.src = saddr;
rth->rt_src = saddr;
@@ -1563,9 +1551,7 @@ int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
rth->key.iif == iif &&
rth->key.oif == 0 &&
#ifdef CONFIG_IP_ROUTE_FWMARK
- rth->key.fwmark
- == (skb->nfreason == NF_REASON_FOR_ROUTING
- ? skb->nfmark : 0) &&
+ rth->key.fwmark == skb->nfmark &&
#endif
rth->key.tos == tos) {
rth->u.dst.lastuse = jiffies;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 204f25574..1edee9f51 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_ipv4.c,v 1.201 2000/03/08 19:36:42 davem Exp $
+ * Version: $Id: tcp_ipv4.c,v 1.202 2000/03/17 14:41:53 davem Exp $
*
* IPv4 specific functions
*
@@ -72,8 +72,6 @@ extern int sysctl_ip_dynaddr;
struct inode tcp_inode;
struct socket *tcp_socket=&tcp_inode.u.socket_i;
-static void tcp_v4_send_reset(struct sk_buff *skb);
-
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
struct sk_buff *skb);
@@ -1059,7 +1057,7 @@ void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
* Exception: precedence violation. We do not implement it in any case.
*/
-static void tcp_v4_send_reset(struct sk_buff *skb)
+void tcp_v4_send_reset(struct sk_buff *skb)
{
struct tcphdr *th = skb->h.th;
struct tcphdr rth;
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 818ad66ca..c52797d70 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -6,7 +6,7 @@
* Pedro Roque <roque@di.fc.ul.pt>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
*
- * $Id: sit.c,v 1.35 2000/01/06 00:42:08 davem Exp $
+ * $Id: sit.c,v 1.36 2000/03/17 14:42:08 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -15,6 +15,7 @@
*/
#define __NO_VERSION__
+#include <linux/config.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/types.h>
@@ -388,6 +389,10 @@ int ipip6_rcv(struct sk_buff *skb, unsigned short len)
skb->dev = tunnel->dev;
dst_release(skb->dst);
skb->dst = NULL;
+#ifdef CONFIG_NETFILTER
+ nf_conntrack_put(skb->nfct);
+ skb->nfct = NULL;
+#endif
netif_rx(skb);
read_unlock(&ipip6_lock);
return 0;
@@ -547,6 +552,11 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
ip_select_ident(iph, &rt->u.dst);
ip_send_check(iph);
+#ifdef CONFIG_NETFILTER
+ nf_conntrack_put(skb->nfct);
+ skb->nfct = NULL;
+#endif
+
stats->tx_bytes += skb->len;
stats->tx_packets++;
ip_send(skb);
diff --git a/net/khttpd/security.c b/net/khttpd/security.c
index df346aadb..bd578941d 100644
--- a/net/khttpd/security.c
+++ b/net/khttpd/security.c
@@ -115,7 +115,7 @@ struct file *OpenFileForSecurity(char *Filename)
- filp = filp_open(Filename,00,O_RDONLY);
+ filp = filp_open(Filename, 0, O_RDONLY, NULL);
if ((IS_ERR(filp))||(filp==NULL)||(filp->f_dentry==NULL))
diff --git a/net/netsyms.c b/net/netsyms.c
index 48cd5b503..c6745cafe 100644
--- a/net/netsyms.c
+++ b/net/netsyms.c
@@ -229,6 +229,7 @@ EXPORT_SYMBOL(inet_del_protocol);
EXPORT_SYMBOL(ip_route_output);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(icmp_send);
+EXPORT_SYMBOL(icmp_reply);
EXPORT_SYMBOL(ip_options_compile);
EXPORT_SYMBOL(ip_options_undo);
EXPORT_SYMBOL(arp_send);
@@ -339,6 +340,7 @@ EXPORT_SYMBOL(tcp_sendmsg);
EXPORT_SYMBOL(tcp_v4_rebuild_header);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_conn_request);
+EXPORT_SYMBOL(tcp_v4_send_reset);
EXPORT_SYMBOL(tcp_create_openreq_child);
EXPORT_SYMBOL(tcp_bucket_create);
EXPORT_SYMBOL(__tcp_put_port);
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 143d6e361..31dedf1ea 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -66,7 +66,7 @@ static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
struct fw_head *head = (struct fw_head*)tp->root;
struct fw_filter *f;
#ifdef CONFIG_NETFILTER
- u32 id = (skb->nfreason == NF_REASON_FOR_CLS_FW ? skb->nfmark : 0);
+ u32 id = skb->nfmark;
#else
u32 id = 0;
#endif
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index ddc738fcc..947aede01 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -224,14 +224,14 @@ used on the egress (might slow things for an iota)
return fwres;
}
-/* after iptables */
+/* after ipt_filter */
static struct nf_hook_ops ing_ops =
{
{ NULL, NULL},
ing_hook,
PF_INET,
NF_IP_PRE_ROUTING,
- 1
+ NF_IP_PRI_FILTER + 1
};
int ingress_init(struct Qdisc *sch,struct rtattr *opt)
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index d63024983..97e323d0c 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -636,7 +636,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
dprintk("svc: incomplete TCP record (%d of %d)\n",
len, svsk->sk_reclen);
svc_sock_received(svsk, ready);
- len = -EAGAIN; /* record not complete */
+ return -EAGAIN; /* record not complete */
}
/* Frob argbuf */
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index a57c2a06d..cbe730b5d 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -8,7 +8,7 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Version: $Id: af_unix.c,v 1.89 2000/02/27 19:52:50 davem Exp $
+ * Version: $Id: af_unix.c,v 1.90 2000/03/16 20:38:45 davem Exp $
*
* Fixes:
* Linus Torvalds : Assorted bug cures.
@@ -569,7 +569,7 @@ static unix_socket *unix_find_other(struct sockaddr_un *sunname, int len,
/* Do not believe to VFS, grab kernel lock */
lock_kernel();
- dentry = open_namei(sunname->sun_path, 2|O_NOFOLLOW, S_IFSOCK);
+ dentry = __open_namei(sunname->sun_path, 2|O_NOFOLLOW, S_IFSOCK, NULL);
if (IS_ERR(dentry)) {
*error = PTR_ERR(dentry);
unlock_kernel();