author    Ralf Baechle <ralf@linux-mips.org>  1999-10-09 00:00:47 +0000
committer Ralf Baechle <ralf@linux-mips.org>  1999-10-09 00:00:47 +0000
commit    d6434e1042f3b0a6dfe1b1f615af369486f9b1fa
tree      e2be02f33984c48ec019c654051d27964e42c441  /net/ipv4
parent    609d1e803baf519487233b765eb487f9ec227a18
Merge with 2.3.19.
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/.cvsignore        |    2
-rw-r--r--  net/ipv4/Config.in         |   38
-rw-r--r--  net/ipv4/Makefile          |   51
-rw-r--r--  net/ipv4/af_inet.c         |  546
-rw-r--r--  net/ipv4/arp.c             |   86
-rw-r--r--  net/ipv4/devinet.c         |  260
-rw-r--r--  net/ipv4/fib_frontend.c    |  107
-rw-r--r--  net/ipv4/fib_hash.c        |   32
-rw-r--r--  net/ipv4/fib_rules.c       |   39
-rw-r--r--  net/ipv4/fib_semantics.c   |  112
-rw-r--r--  net/ipv4/icmp.c            |  182
-rw-r--r--  net/ipv4/igmp.c            |  270
-rw-r--r--  net/ipv4/ip_forward.c      |  192
-rw-r--r--  net/ipv4/ip_fragment.c     |   12
-rw-r--r--  net/ipv4/ip_fw.c           | 1731
-rw-r--r--  net/ipv4/ip_gre.c          |   94
-rw-r--r--  net/ipv4/ip_input.c        |  234
-rw-r--r--  net/ipv4/ip_masq.c         | 2453
-rw-r--r--  net/ipv4/ip_masq_app.c     |  603
-rw-r--r--  net/ipv4/ip_masq_autofw.c  |  448
-rw-r--r--  net/ipv4/ip_masq_cuseeme.c |  264
-rw-r--r--  net/ipv4/ip_masq_ftp.c     |  393
-rw-r--r--  net/ipv4/ip_masq_irc.c     |  370
-rw-r--r--  net/ipv4/ip_masq_mfw.c     |  769
-rw-r--r--  net/ipv4/ip_masq_mod.c     |  322
-rw-r--r--  net/ipv4/ip_masq_portfw.c  |  508
-rw-r--r--  net/ipv4/ip_masq_quake.c   |  324
-rw-r--r--  net/ipv4/ip_masq_raudio.c  |  578
-rw-r--r--  net/ipv4/ip_masq_user.c    |  467
-rw-r--r--  net/ipv4/ip_masq_vdolive.c |  298
-rw-r--r--  net/ipv4/ip_nat_dumb.c     |   15
-rw-r--r--  net/ipv4/ip_output.c       |  457
-rw-r--r--  net/ipv4/ip_sockglue.c     |  352
-rw-r--r--  net/ipv4/ipconfig.c        |   42
-rw-r--r--  net/ipv4/ipip.c            |   79
-rw-r--r--  net/ipv4/ipmr.c            | 1131
-rw-r--r--  net/ipv4/protocol.c        |   28
-rw-r--r--  net/ipv4/rarp.c            |  606
-rw-r--r--  net/ipv4/raw.c             |  140
-rw-r--r--  net/ipv4/route.c           |  458
-rw-r--r--  net/ipv4/syncookies.c      |   24
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c |   42
-rw-r--r--  net/ipv4/tcp.c             |  520
-rw-r--r--  net/ipv4/tcp_input.c       | 1617
-rw-r--r--  net/ipv4/tcp_ipv4.c        | 1543
-rw-r--r--  net/ipv4/tcp_output.c      |  210
-rw-r--r--  net/ipv4/tcp_timer.c       |  420
-rw-r--r--  net/ipv4/timer.c           |  132
-rw-r--r--  net/ipv4/udp.c             |  345
49 files changed, 5460 insertions(+), 14486 deletions(-)
diff --git a/net/ipv4/.cvsignore b/net/ipv4/.cvsignore
deleted file mode 100644
index 857dd22e9..000000000
--- a/net/ipv4/.cvsignore
+++ /dev/null
@@ -1,2 +0,0 @@
-.depend
-.*.flags
diff --git a/net/ipv4/Config.in b/net/ipv4/Config.in
index 29786da5e..a84d61d17 100644
--- a/net/ipv4/Config.in
+++ b/net/ipv4/Config.in
@@ -15,6 +15,9 @@ if [ "$CONFIG_IP_ADVANCED_ROUTER" = "y" ]; then
bool 'IP: fast network address translation' CONFIG_IP_ROUTE_NAT
fi
fi
+if [ "$CONFIG_IP_MULTIPLE_TABLES" = "y" ]; then
+ bool 'IP: use FWMARK value as routing key' CONFIG_IP_ROUTE_FWMARK
+fi
bool 'IP: kernel level autoconfiguration' CONFIG_IP_PNP
if [ "$CONFIG_IP_PNP" = "y" ]; then
bool ' BOOTP support' CONFIG_IP_PNP_BOOTP
@@ -22,40 +25,6 @@ if [ "$CONFIG_IP_PNP" = "y" ]; then
# not yet ready..
# bool ' ARP support' CONFIG_IP_PNP_ARP
fi
-if [ "$CONFIG_FIREWALL" = "y" ]; then
- bool 'IP: firewalling' CONFIG_IP_FIREWALL
- if [ "$CONFIG_IP_FIREWALL" = "y" ]; then
- if [ "$CONFIG_NETLINK" = "y" ]; then
- bool 'IP: firewall packet netlink device' CONFIG_IP_FIREWALL_NETLINK
- if [ "$CONFIG_IP_FIREWALL_NETLINK" = "y" ]; then
- define_bool CONFIG_NETLINK_DEV y
- fi
- fi
- bool 'IP: always defragment (required for masquerading)' CONFIG_IP_ALWAYS_DEFRAG
- if [ "$CONFIG_IP_MULTIPLE_TABLES" = "y" ]; then
- bool 'IP: use FWMARK value as routing key' CONFIG_IP_ROUTE_FWMARK
- fi
- fi
-fi
-if [ "$CONFIG_IP_FIREWALL" = "y" ]; then
- if [ "$CONFIG_IP_ALWAYS_DEFRAG" != "n" ]; then
- bool 'IP: transparent proxy support' CONFIG_IP_TRANSPARENT_PROXY
- bool 'IP: masquerading' CONFIG_IP_MASQUERADE
- if [ "$CONFIG_IP_MASQUERADE" != "n" ]; then
- comment 'Protocol-specific masquerading support will be built as modules.'
- bool 'IP: ICMP masquerading' CONFIG_IP_MASQUERADE_ICMP
- comment 'Protocol-specific masquerading support will be built as modules.'
- if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
- bool 'IP: masquerading special modules support' CONFIG_IP_MASQUERADE_MOD
- if [ "$CONFIG_IP_MASQUERADE_MOD" = "y" ]; then
- tristate 'IP: ipautofw masq support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_IPAUTOFW
- tristate 'IP: ipportfw masq support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_IPPORTFW
- tristate 'IP: ip fwmark masq-forwarding support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_MFW
- fi
- fi
- fi
- fi
-fi
bool 'IP: optimize as router not host' CONFIG_IP_ROUTER
tristate 'IP: tunneling' CONFIG_NET_IPIP
tristate 'IP: GRE tunnels over IP' CONFIG_NET_IPGRE
@@ -78,7 +47,6 @@ fi
bool 'IP: TCP syncookie support (not enabled per default)' CONFIG_SYN_COOKIES
comment '(it is safe to leave these untouched)'
#bool 'IP: PC/TCP compatibility mode' CONFIG_INET_PCTCP
-tristate 'IP: Reverse ARP' CONFIG_INET_RARP
#bool 'IP: Path MTU Discovery (normally enabled)' CONFIG_PATH_MTU_DISCOVERY
#bool 'IP: Disable NAGLE algorithm (normally enabled)' CONFIG_TCP_NAGLE_OFF
bool 'IP: Allow large windows (not recommended if <16Mb of memory)' CONFIG_SKB_LARGE
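
The Config.in hunks above delete the whole CONFIG_IP_FIREWALL / CONFIG_IP_MASQUERADE option tree (the 2.2-era ipchains and masquerading code) and hoist the FWMARK routing-key option out from under the firewall guard, matching the 2.3 transition to netfilter. A minimal sketch of what CONFIG_IP_ROUTE_FWMARK controls, assuming the rt_key layout of this era (field names here are assumptions, not taken from this patch):

/* Sketch only: with CONFIG_IP_ROUTE_FWMARK the routing key carries the
 * firewall mark, so policy routing rules can select a table by mark. */
struct rt_key_sketch {
	__u32	dst;		/* destination address            */
	__u32	src;		/* source address                 */
	int	iif, oif;	/* input/output interface indices */
#ifdef CONFIG_IP_ROUTE_FWMARK
	__u32	fwmark;		/* mark set by the packet filter  */
#endif
	__u8	tos;
	__u8	scope;
};
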
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 8ab280deb..e2f9a45b7 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -8,7 +8,7 @@
# Note 2! The CFLAGS definition is now in the main makefile...
O_TARGET := ipv4.o
-IPV4_OBJS := utils.o route.o proc.o timer.o protocol.o \
+IPV4_OBJS := utils.o route.o proc.o protocol.o \
ip_input.o ip_fragment.o ip_forward.o ip_options.o \
ip_output.o ip_sockglue.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o\
@@ -19,10 +19,6 @@ IPV4X_OBJS :=
MOD_LIST_NAME := IPV4_MODULES
M_OBJS :=
-ifeq ($(CONFIG_IP_FIREWALL),y)
-IPV4_OBJS += ip_fw.o
-endif
-
ifeq ($(CONFIG_IP_MULTIPLE_TABLES),y)
IPV4_OBJS += fib_rules.o
endif
@@ -35,14 +31,6 @@ ifeq ($(CONFIG_IP_MROUTE),y)
IPV4_OBJS += ipmr.o
endif
-ifeq ($(CONFIG_INET_RARP),y)
-IPV4_OBJS += rarp.o
-else
- ifeq ($(CONFIG_INET_RARP),m)
- M_OBJS += rarp.o
- endif
-endif
-
ifeq ($(CONFIG_NET_IPIP),y)
IPV4X_OBJS += ipip.o
else
@@ -59,43 +47,6 @@ else
endif
endif
-ifeq ($(CONFIG_IP_MASQUERADE),y)
-IPV4X_OBJS += ip_masq.o ip_masq_app.o
-
-ifeq ($(CONFIG_IP_MASQUERADE_MOD),y)
- IPV4X_OBJS += ip_masq_mod.o
-
- ifeq ($(CONFIG_IP_MASQUERADE_IPAUTOFW),y)
- IPV4_OBJS += ip_masq_autofw.o
- else
- ifeq ($(CONFIG_IP_MASQUERADE_IPAUTOFW),m)
- M_OBJS += ip_masq_autofw.o
- endif
- endif
-
- ifeq ($(CONFIG_IP_MASQUERADE_IPPORTFW),y)
- IPV4_OBJS += ip_masq_portfw.o
- else
- ifeq ($(CONFIG_IP_MASQUERADE_IPPORTFW),m)
- M_OBJS += ip_masq_portfw.o
- endif
- endif
-
- ifeq ($(CONFIG_IP_MASQUERADE_MFW),y)
- IPV4_OBJS += ip_masq_mfw.o
- else
- ifeq ($(CONFIG_IP_MASQUERADE_MFW),m)
- M_OBJS += ip_masq_mfw.o
- endif
- endif
-
-endif
-
-M_OBJS += ip_masq_user.o
-M_OBJS += ip_masq_ftp.o ip_masq_irc.o ip_masq_raudio.o ip_masq_quake.o
-M_OBJS += ip_masq_vdolive.o ip_masq_cuseeme.o
-endif
-
ifeq ($(CONFIG_SYN_COOKIES),y)
IPV4_OBJS += syncookies.o
# module not supported, because it would be too messy.
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 15b26fa1c..526dd4dd6 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -5,7 +5,7 @@
*
* PF_INET protocol family socket handler.
*
- * Version: $Id: af_inet.c,v 1.93 1999/07/02 11:26:24 davem Exp $
+ * Version: $Id: af_inet.c,v 1.97 1999/09/08 03:46:46 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -80,16 +80,17 @@
#include <linux/stat.h>
#include <linux/init.h>
#include <linux/poll.h>
+#include <linux/netfilter_ipv4.h>
#include <asm/uaccess.h>
#include <asm/system.h>
+#include <linux/smp_lock.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/arp.h>
-#include <net/rarp.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/udp.h>
@@ -99,13 +100,9 @@
#include <net/icmp.h>
#include <net/ipip.h>
#include <net/inet_common.h>
-#include <linux/ip_fw.h>
#ifdef CONFIG_IP_MROUTE
#include <linux/mroute.h>
#endif
-#ifdef CONFIG_IP_MASQUERADE
-#include <net/ip_masq.h>
-#endif
#ifdef CONFIG_BRIDGE
#include <net/br.h>
#endif
@@ -120,6 +117,8 @@
struct linux_mib net_statistics;
+atomic_t inet_sock_nr;
+
extern int raw_get_info(char *, char **, off_t, int, int);
extern int snmp_get_info(char *, char **, off_t, int, int);
extern int netstat_get_info(char *, char **, off_t, int, int);
@@ -136,90 +135,78 @@ extern int dlci_ioctl(unsigned int, void*);
int (*dlci_ioctl_hook)(unsigned int, void *) = NULL;
#endif
-int (*rarp_ioctl_hook)(unsigned int,void*) = NULL;
+/* New destruction routine */
-/*
- * Destroy an AF_INET socket
- */
-
-static __inline__ void kill_sk_queues(struct sock *sk)
+void inet_sock_destruct(struct sock *sk)
{
- struct sk_buff *skb;
+ __skb_queue_purge(&sk->receive_queue);
+ __skb_queue_purge(&sk->error_queue);
- /* First the read buffer. */
- while((skb = skb_dequeue(&sk->receive_queue)) != NULL)
- kfree_skb(skb);
+ if (sk->type == SOCK_STREAM && sk->state != TCP_CLOSE) {
+ printk("Attempt to release TCP socket in state %d %p\n",
+ sk->state,
+ sk);
+ return;
+ }
+ if (!sk->dead) {
+ printk("Attempt to release alive inet socket %p\n", sk);
+ return;
+ }
- /* Next, the error queue. */
- while((skb = skb_dequeue(&sk->error_queue)) != NULL)
- kfree_skb(skb);
+ BUG_TRAP(atomic_read(&sk->rmem_alloc) == 0);
+ BUG_TRAP(atomic_read(&sk->wmem_alloc) == 0);
- /* It is _impossible_ for the backlog to contain anything
- * when we get here. All user references to this socket
- * have gone away, only the net layer knows can touch it.
- */
+ if (sk->protinfo.af_inet.opt)
+ kfree(sk->protinfo.af_inet.opt);
+ dst_release(sk->dst_cache);
+ atomic_dec(&inet_sock_nr);
+#ifdef INET_REFCNT_DEBUG
+ printk(KERN_DEBUG "INET socket %p released, %d are still alive\n", sk, atomic_read(&inet_sock_nr));
+#endif
}
-static __inline__ void kill_sk_now(struct sock *sk)
+void inet_sock_release(struct sock *sk)
{
- /* Remove from protocol hash chains. */
- sk->prot->unhash(sk);
+ if (sk->prot->destroy)
+ sk->prot->destroy(sk);
- if(sk->opt)
- kfree(sk->opt);
- dst_release(sk->dst_cache);
- sk_free(sk);
-}
+ /* Observation: when inet_sock_release is called, processes have
+ no access to socket. But net still has.
+ Step one, detach it from networking:
-static __inline__ void kill_sk_later(struct sock *sk)
-{
- /* this should never happen. */
- /* actually it can if an ack has just been sent. */
- /*
- * It's more normal than that...
- * It can happen because a skb is still in the device queues
- * [PR]
+ A. Remove from hash tables.
*/
-
- NETDEBUG(printk(KERN_DEBUG "Socket destroy delayed (r=%d w=%d)\n",
- atomic_read(&sk->rmem_alloc),
- atomic_read(&sk->wmem_alloc)));
-
- sk->destroy = 1;
- sk->ack_backlog = 0;
- bh_unlock_sock(sk);
- net_reset_timer(sk, TIME_DESTROY, SOCK_DESTROY_TIME);
-}
-/* Callers must hold the BH spinlock.
- *
- * At this point, there should be no process reference to this
- * socket, and thus no user references at all. Therefore we
- * can assume the socket waitqueue is inactive and nobody will
- * try to jump onto it.
- */
-void destroy_sock(struct sock *sk)
-{
- /* Now we can no longer get new packets or once the
- * timers are killed, send them.
- */
- net_delete_timer(sk);
+ sk->prot->unhash(sk);
- if (sk->prot->destroy)
- sk->prot->destroy(sk);
+ /* In this point socket cannot receive new packets,
+ but it is possible that some packets are in flight
+ because some CPU runs receiver and did hash table lookup
+ before we unhashed socket. They will achieve receive queue
+ and will be purged by socket destructor.
+
+ Also we still have packets pending on receive
+ queue and probably, our own packets waiting in device queues.
+ sock_destroy will drain receive queue, but transmitted
+ packets will delay socket destruction until the last reference
+ will be released.
+ */
- kill_sk_queues(sk);
+ write_lock_irq(&sk->callback_lock);
+ sk->dead=1;
+ sk->socket = NULL;
+ sk->sleep = NULL;
+ write_unlock_irq(&sk->callback_lock);
- /* Now if everything is gone we can free the socket
- * structure, otherwise we need to keep it around until
- * everything is gone.
- */
- if (atomic_read(&sk->rmem_alloc) == 0 && atomic_read(&sk->wmem_alloc) == 0)
- kill_sk_now(sk);
- else
- kill_sk_later(sk);
+#ifdef INET_REFCNT_DEBUG
+ if (atomic_read(&sk->refcnt) != 1) {
+ printk(KERN_DEBUG "Destruction inet %p delayed, c=%d\n", sk, atomic_read(&sk->refcnt));
+ }
+#endif
+ sock_put(sk);
}
+
/*
* The routines beyond this point handle the behaviour of an AF_INET
* socket object. Mostly it punts to the subprotocols of IP to do
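
The destruction rewrite above replaces destroy_sock()'s "free now, or re-arm a timer and try later" logic with plain reference counting: inet_sock_release() unhashes the socket, marks it dead and drops the process reference, while inet_sock_destruct() runs only once the last in-flight skb reference is gone. A minimal sketch of the release side, assuming the era's sock_put() semantics (the refcnt field and destructor hook are as used in this patch):

/* Sketch, not the patch's code: last reference down -> destructor. */
static inline void sketch_sock_put(struct sock *sk)
{
	if (atomic_dec_and_test(&sk->refcnt))
		sk->destruct(sk);	/* e.g. inet_sock_destruct(),
					   reached via sk_free() in the
					   real code */
}
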
@@ -264,12 +251,16 @@ int inet_getsockopt(struct socket *sock, int level, int optname,
static int inet_autobind(struct sock *sk)
{
/* We may need to bind the socket. */
+ lock_sock(sk);
if (sk->num == 0) {
- if (sk->prot->get_port(sk, 0) != 0)
+ if (sk->prot->get_port(sk, 0) != 0) {
+ release_sock(sk);
return -EAGAIN;
+ }
sk->sport = htons(sk->num);
sk->prot->hash(sk);
}
+ release_sock(sk);
return 0;
}
@@ -289,37 +280,47 @@ int inet_listen(struct socket *sock, int backlog)
{
struct sock *sk = sock->sk;
unsigned char old_state;
+ int err;
if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
return -EINVAL;
- if ((unsigned) backlog == 0) /* BSDism */
- backlog = 1;
- if ((unsigned) backlog > SOMAXCONN)
- backlog = SOMAXCONN;
- sk->max_ack_backlog = backlog;
+ lock_sock(sk);
+ old_state = sk->state;
+ err = -EINVAL;
+ if (!((1<<old_state)&(TCPF_CLOSE|TCPF_LISTEN)))
+ goto out;
/* Really, if the socket is already in listen state
* we can only allow the backlog to be adjusted.
*/
- old_state = sk->state;
if (old_state != TCP_LISTEN) {
sk->state = TCP_LISTEN;
sk->ack_backlog = 0;
if (sk->num == 0) {
if (sk->prot->get_port(sk, 0) != 0) {
sk->state = old_state;
- return -EAGAIN;
+ err = -EAGAIN;
+ goto out;
}
sk->sport = htons(sk->num);
+ } else {
+ /* Not nice, but the simplest solution however */
+ if (sk->prev)
+ ((struct tcp_bind_bucket*)sk->prev)->fastreuse = 0;
}
- dst_release(xchg(&sk->dst_cache, NULL));
+ sk_dst_reset(sk);
sk->prot->hash(sk);
sk->socket->flags |= SO_ACCEPTCON;
sk->write_space = inet_listen_write_space;
}
- return 0;
+ sk->max_ack_backlog = backlog;
+ err = 0;
+
+out:
+ release_sock(sk);
+ return err;
}
/*
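
inet_listen() now validates the state with one mask test instead of chained comparisons; TCPF_* are one-hot encodings of the TCP_* state numbers (TCPF_CLOSE == 1 << TCP_CLOSE), an idiom used throughout this merge. A self-contained illustration:

/* Membership test for a set of TCP states collapses to a single AND. */
static inline int sk_state_in(struct sock *sk, int mask)
{
	return (1 << sk->state) & mask;		/* non-zero if in set */
}
/* usage: if (!sk_state_in(sk, TCPF_CLOSE | TCPF_LISTEN)) goto out; */
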
@@ -334,24 +335,6 @@ static int inet_create(struct socket *sock, int protocol)
struct sock *sk;
struct proto *prot;
- /* Compatibility */
- if (sock->type == SOCK_PACKET) {
- static int warned;
- if (net_families[PF_PACKET]==NULL)
- {
-#if defined(CONFIG_KMOD) && defined(CONFIG_PACKET_MODULE)
- char module_name[30];
- sprintf(module_name,"net-pf-%d", PF_PACKET);
- request_module(module_name);
- if (net_families[PF_PACKET] == NULL)
-#endif
- return -ESOCKTNOSUPPORT;
- }
- if (!warned++)
- printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n", current->comm);
- return net_families[PF_PACKET]->create(sock, protocol);
- }
-
sock->state = SS_UNCONNECTED;
sk = sk_alloc(PF_INET, GFP_KERNEL, 1);
if (sk == NULL)
@@ -363,9 +346,9 @@ static int inet_create(struct socket *sock, int protocol)
goto free_and_noproto;
protocol = IPPROTO_TCP;
if (ipv4_config.no_pmtu_disc)
- sk->ip_pmtudisc = IP_PMTUDISC_DONT;
+ sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT;
else
- sk->ip_pmtudisc = IP_PMTUDISC_WANT;
+ sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_WANT;
prot = &tcp_prot;
sock->ops = &inet_stream_ops;
break;
@@ -376,7 +359,7 @@ static int inet_create(struct socket *sock, int protocol)
goto free_and_noproto;
protocol = IPPROTO_UDP;
sk->no_check = UDP_CSUM_DEFAULT;
- sk->ip_pmtudisc = IP_PMTUDISC_DONT;
+ sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT;
prot=&udp_prot;
sock->ops = &inet_dgram_ops;
break;
@@ -387,19 +370,19 @@ static int inet_create(struct socket *sock, int protocol)
goto free_and_noproto;
prot = &raw_prot;
sk->reuse = 1;
- sk->ip_pmtudisc = IP_PMTUDISC_DONT;
+ sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT;
sk->num = protocol;
sock->ops = &inet_dgram_ops;
if (protocol == IPPROTO_RAW)
- sk->ip_hdrincl = 1;
+ sk->protinfo.af_inet.hdrincl = 1;
break;
default:
goto free_and_badtype;
}
sock_init_data(sock,sk);
-
- sk->destruct = NULL;
+
+ sk->destruct = inet_sock_destruct;
sk->zapped=0;
#ifdef CONFIG_TCP_NAGLE_OFF
@@ -412,15 +395,17 @@ static int inet_create(struct socket *sock, int protocol)
sk->backlog_rcv = prot->backlog_rcv;
sk->timer.data = (unsigned long)sk;
- sk->timer.function = &net_timer;
+ sk->timer.function = &tcp_keepalive_timer;
- sk->ip_ttl=ip_statistics.IpDefaultTTL;
+ sk->protinfo.af_inet.ttl=ip_statistics.IpDefaultTTL;
+
+ sk->protinfo.af_inet.mc_loop=1;
+ sk->protinfo.af_inet.mc_ttl=1;
+ sk->protinfo.af_inet.mc_index=0;
+ sk->protinfo.af_inet.mc_list=NULL;
+
+ atomic_inc(&inet_sock_nr);
- sk->ip_mc_loop=1;
- sk->ip_mc_ttl=1;
- sk->ip_mc_index=0;
- sk->ip_mc_list=NULL;
-
if (sk->num) {
/* It assumes that any protocol which allows
* the user to assign a number at socket
@@ -436,7 +421,8 @@ static int inet_create(struct socket *sock, int protocol)
if (sk->prot->init) {
int err = sk->prot->init(sk);
if (err != 0) {
- destroy_sock(sk);
+ sk->dead = 1;
+ inet_sock_release(sk);
return(err);
}
}
@@ -465,18 +451,13 @@ do_oom:
* should refer to it.
*/
-int inet_release(struct socket *sock, struct socket *peersock)
+int inet_release(struct socket *sock)
{
struct sock *sk = sock->sk;
if (sk) {
long timeout;
- /* Begin closedown and wake up sleepers. */
- if (sock->state != SS_UNCONNECTED)
- sock->state = SS_DISCONNECTING;
- sk->state_change(sk);
-
/* Applications forget to leave groups before exiting */
ip_mc_drop_socket(sk);
@@ -494,7 +475,6 @@ int inet_release(struct socket *sock, struct socket *peersock)
timeout = MAX_SCHEDULE_TIMEOUT;
}
sock->sk = NULL;
- sk->socket = NULL;
sk->prot->close(sk, timeout);
}
return(0);
@@ -506,27 +486,25 @@ static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
struct sock *sk=sock->sk;
unsigned short snum;
int chk_addr_ret;
+ int err;
/* If the socket has its own bind function then use it. (RAW) */
if(sk->prot->bind)
return sk->prot->bind(sk, uaddr, addr_len);
-
- /* Check these errors (active socket, bad address length, double bind). */
- if ((sk->state != TCP_CLOSE) ||
- (addr_len < sizeof(struct sockaddr_in)) ||
- (sk->num != 0))
+
+ if (addr_len < sizeof(struct sockaddr_in))
return -EINVAL;
chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
if (addr->sin_addr.s_addr != 0 && chk_addr_ret != RTN_LOCAL &&
chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) {
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- /* Superuser may bind to any address to allow transparent proxying. */
- if(chk_addr_ret != RTN_UNICAST || !capable(CAP_NET_ADMIN))
-#endif
- return -EADDRNOTAVAIL; /* Source address MUST be ours! */
+ return -EADDRNOTAVAIL; /* Source address MUST be ours! */
}
+ snum = ntohs(addr->sin_port);
+ if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+ return -EACCES;
+
/* We keep a pair of addresses. rcv_saddr is the one
* used by hash lookups, and saddr is used for transmit.
*
@@ -534,63 +512,67 @@ static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
* would be illegal to use them (multicast/broadcast) in
* which case the sending device address is used.
*/
+ lock_sock(sk);
+
+ /* Check these errors (active socket, double bind). */
+ err = -EINVAL;
+ if ((sk->state != TCP_CLOSE) ||
+ (sk->num != 0))
+ goto out;
+
sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr;
- if(chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
+ if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
sk->saddr = 0; /* Use device */
- snum = ntohs(addr->sin_port);
-#ifdef CONFIG_IP_MASQUERADE
- /* The kernel masquerader needs some ports. */
- if((snum >= PORT_MASQ_BEGIN) && (snum <= PORT_MASQ_END))
- return -EADDRINUSE;
-#endif
- if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
- return -EACCES;
-
/* Make sure we are allowed to bind here. */
- if (sk->prot->get_port(sk, snum) != 0)
- return -EADDRINUSE;
+ if (sk->prot->get_port(sk, snum) != 0) {
+ sk->saddr = sk->rcv_saddr = 0;
+ err = -EADDRINUSE;
+ goto out;
+ }
sk->sport = htons(sk->num);
sk->daddr = 0;
sk->dport = 0;
sk->prot->hash(sk);
- dst_release(sk->dst_cache);
- sk->dst_cache=NULL;
- return(0);
+ sk_dst_reset(sk);
+ err = 0;
+out:
+ release_sock(sk);
+ return err;
}
int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
int addr_len, int flags)
{
struct sock *sk=sock->sk;
- int err;
- if (inet_autobind(sk) != 0)
+ if (uaddr->sa_family == AF_UNSPEC)
+ return sk->prot->disconnect(sk, flags);
+
+ if (sk->num==0 && inet_autobind(sk) != 0)
return -EAGAIN;
- if (sk->prot->connect == NULL)
- return -EOPNOTSUPP;
- err = sk->prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
- if (err < 0)
- return err;
- return(0);
+ return sk->prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
}
static void inet_wait_for_connect(struct sock *sk)
{
DECLARE_WAITQUEUE(wait, current);
+ __set_current_state(TASK_INTERRUPTIBLE);
add_wait_queue(sk->sleep, &wait);
- current->state = TASK_INTERRUPTIBLE;
- while (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) {
+
+ while ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
if (signal_pending(current))
break;
if (sk->err)
break;
+ release_sock(sk);
schedule();
- current->state = TASK_INTERRUPTIBLE;
+ lock_sock(sk);
+ set_current_state(TASK_INTERRUPTIBLE);
}
- current->state = TASK_RUNNING;
+ __set_current_state(TASK_RUNNING);
remove_wait_queue(sk->sleep, &wait);
}
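
Note the reworked sleep loop above: the socket lock is released around schedule() so the bottom-half receive path can complete the handshake, then retaken before the state is re-tested. Condensed, the pattern is (connect_pending() is a hypothetical stand-in for the TCPF_SYN_SENT/TCPF_SYN_RECV test):

/* Wait-queue pattern from inet_wait_for_connect, condensed sketch. */
DECLARE_WAITQUEUE(wait, current);
__set_current_state(TASK_INTERRUPTIBLE);
add_wait_queue(sk->sleep, &wait);
while (connect_pending(sk) && !signal_pending(current) && !sk->err) {
	release_sock(sk);		/* let softirq deliver the ACK  */
	schedule();
	lock_sock(sk);			/* re-test state under the lock */
	set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
remove_wait_queue(sk->sleep, &wait);
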
@@ -605,68 +587,71 @@ int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr,
struct sock *sk=sock->sk;
int err;
- if(sock->state != SS_UNCONNECTED && sock->state != SS_CONNECTING) {
- if(sock->state == SS_CONNECTED)
- return -EISCONN;
- return -EINVAL;
+ if (uaddr->sa_family == AF_UNSPEC) {
+ lock_sock(sk);
+ err = sk->prot->disconnect(sk, flags);
+ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+ release_sock(sk);
+ return err;
}
- if(sock->state == SS_CONNECTING) {
- /* Note: tcp_connected contains SYN_RECV, which may cause
- bogus results here. -AK */
- if(tcp_connected(sk->state)) {
+ lock_sock(sk);
+ switch (sock->state) {
+ default:
+ err = -EINVAL;
+ goto out;
+ case SS_CONNECTED:
+ err = -EISCONN;
+ goto out;
+ case SS_CONNECTING:
+ if (tcp_established(sk->state)) {
sock->state = SS_CONNECTED;
- return 0;
+ err = 0;
+ goto out;
}
- if (sk->zapped || sk->err)
+ if (sk->err)
goto sock_error;
+ err = -EALREADY;
if (flags & O_NONBLOCK)
- return -EALREADY;
- } else {
- if (sk->prot->connect == NULL)
- return -EOPNOTSUPP;
-
- /* We may need to bind the socket. */
- if (inet_autobind(sk) != 0)
- return -EAGAIN;
-
+ goto out;
+ break;
+ case SS_UNCONNECTED:
err = sk->prot->connect(sk, uaddr, addr_len);
- /* Note: there is a theoretical race here when an wake up
- occurred before inet_wait_for_connect is entered. In 2.3
- the wait queue setup should be moved before the low level
- connect call. -AK*/
if (err < 0)
- return err;
+ goto out;
sock->state = SS_CONNECTING;
}
-
- if (sk->state > TCP_FIN_WAIT2 && sock->state == SS_CONNECTING)
+
+ if (sk->state > TCP_FIN_WAIT2)
goto sock_error;
- if (sk->state != TCP_ESTABLISHED && (flags & O_NONBLOCK))
- return -EINPROGRESS;
+ err = -EINPROGRESS;
+ if (!tcp_established(sk->state) && (flags & O_NONBLOCK))
+ goto out;
- if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) {
+ if ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
inet_wait_for_connect(sk);
+ err = -ERESTARTSYS;
if (signal_pending(current))
- return -ERESTARTSYS;
+ goto out;
}
- sock->state = SS_CONNECTED;
- if ((sk->state != TCP_ESTABLISHED) && sk->err)
+ if (sk->err && !tcp_established(sk->state))
goto sock_error;
- return 0;
+ sock->state = SS_CONNECTED;
+ err = 0;
+out:
+ release_sock(sk);
+ return err;
-sock_error:
- /* This is ugly but needed to fix a race in the ICMP error handler */
- if (sk->zapped && sk->state != TCP_CLOSE) {
- lock_sock(sk);
- tcp_set_state(sk, TCP_CLOSE);
- release_sock(sk);
- sk->zapped = 0;
- }
+sock_error:
+ err = sock_error(sk) ? : -ECONNABORTED;
sock->state = SS_UNCONNECTED;
- return sock_error(sk);
+ if (sk->prot->disconnect(sk, O_NONBLOCK))
+ sock->state = SS_DISCONNECTING;
+ release_sock(sk);
+
+ return err;
}
/*
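
inet_stream_connect() is restructured as a switch over the socket-layer state; SS_* (generic socket state) shadows the transport state TCP_*. Roughly, the mapping the rewritten function enforces is the following (sketch; return values as in the hunk above):

/* Sketch of the SS_* handling in the rewritten connect path. */
static int sketch_connect_disposition(struct socket *sock)
{
	switch (sock->state) {
	case SS_UNCONNECTED:	/* start handshake -> SS_CONNECTING    */
		return 0;
	case SS_CONNECTING:	/* SYN_SENT/SYN_RECV: block, or return
				   -EALREADY when nonblocking; once
				   tcp_established() -> SS_CONNECTED   */
		return -EALREADY;
	case SS_CONNECTED:
		return -EISCONN;
	default:
		return -EINVAL;
	}
}
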
@@ -675,62 +660,27 @@ sock_error:
int inet_accept(struct socket *sock, struct socket *newsock, int flags)
{
- struct sock *sk1 = sock->sk, *sk2;
- struct sock *newsk = newsock->sk;
+ struct sock *sk1 = sock->sk;
+ struct sock *sk2;
int err = -EINVAL;
- if (sock->state != SS_UNCONNECTED || !(sock->flags & SO_ACCEPTCON))
+ if((sk2 = sk1->prot->accept(sk1,flags,&err)) == NULL)
goto do_err;
- err = -EOPNOTSUPP;
- if (sk1->prot->accept == NULL)
- goto do_err;
-
- if((sk2 = sk1->prot->accept(sk1,flags)) == NULL)
- goto do_sk1_err;
+ lock_sock(sk2);
- /*
- * We've been passed an extra socket.
- * We need to free it up because the tcp module creates
- * its own when it accepts one.
- */
- sk2->sleep = newsk->sleep;
+ BUG_TRAP((1<<sk2->state)&(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_CLOSE));
+ write_lock_irq(&sk2->callback_lock);
+ sk2->sleep = &newsock->wait;
newsock->sk = sk2;
sk2->socket = newsock;
- newsk->socket = NULL;
-
- if (flags & O_NONBLOCK)
- goto do_half_success;
-
- if(sk2->state == TCP_ESTABLISHED)
- goto do_full_success;
- if(sk2->err > 0)
- goto do_connect_err;
- err = -ECONNABORTED;
- if (sk2->state == TCP_CLOSE)
- goto do_bad_connection;
-do_full_success:
- destroy_sock(newsk);
+ write_unlock_irq(&sk2->callback_lock);
+
newsock->state = SS_CONNECTED;
+ release_sock(sk2);
return 0;
-do_half_success:
- destroy_sock(newsk);
- return(0);
-
-do_connect_err:
- err = sock_error(sk2);
-do_bad_connection:
- sk2->sleep = NULL;
- sk2->socket = NULL;
- destroy_sock(sk2);
- newsock->sk = newsk;
- newsk->socket = newsock;
- return err;
-
-do_sk1_err:
- err = sock_error(sk1);
do_err:
return err;
}
@@ -748,7 +698,7 @@ static int inet_getname(struct socket *sock, struct sockaddr *uaddr,
sin->sin_family = AF_INET;
if (peer) {
- if (!tcp_connected(sk->state))
+ if (!sk->dport)
return -ENOTCONN;
sin->sin_port = sk->dport;
sin->sin_addr.s_addr = sk->daddr;
@@ -772,12 +722,9 @@ int inet_recvmsg(struct socket *sock, struct msghdr *msg, int size,
int addr_len = 0;
int err;
- if (sock->flags & SO_ACCEPTCON)
- return -EINVAL;
- if (sk->prot->recvmsg == NULL)
- return -EOPNOTSUPP;
/* We may need to bind the socket. */
- if (inet_autobind(sk) != 0)
+ /* It is pretty strange. I would return error in this case --ANK */
+ if (sk->num==0 && inet_autobind(sk) != 0)
return -EAGAIN;
err = sk->prot->recvmsg(sk, msg, size, flags&MSG_DONTWAIT,
flags&~MSG_DONTWAIT, &addr_len);
@@ -792,27 +739,17 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, int size,
{
struct sock *sk = sock->sk;
- if (sk->shutdown & SEND_SHUTDOWN) {
- if (!(msg->msg_flags&MSG_NOSIGNAL))
- send_sig(SIGPIPE, current, 1);
- return -EPIPE;
- }
- if (sk->prot->sendmsg == NULL)
- return -EOPNOTSUPP;
- if(sk->err)
- return sock_error(sk);
-
/* We may need to bind the socket. */
- if (inet_autobind(sk) != 0)
+ if (sk->num==0 && inet_autobind(sk) != 0)
return -EAGAIN;
return sk->prot->sendmsg(sk, msg, size);
}
-
int inet_shutdown(struct socket *sock, int how)
{
struct sock *sk = sock->sk;
+ int err;
/* This should really check to make sure
* the socket is a TCP socket. (WHY AC...)
@@ -824,19 +761,24 @@ int inet_shutdown(struct socket *sock, int how)
return -EINVAL;
if (!sk)
return -ENOTCONN;
- if (sock->state == SS_CONNECTING && sk->state == TCP_ESTABLISHED)
+
+ lock_sock(sk);
+ if (sock->state == SS_CONNECTING && tcp_established(sk->state))
sock->state = SS_CONNECTED;
- if (!tcp_connected(sk->state))
- return -ENOTCONN;
+ err = -ENOTCONN;
+ if (!tcp_connected(sk->state))
+ goto out;
sk->shutdown |= how;
if (sk->prot->shutdown)
sk->prot->shutdown(sk, how);
/* Wake up anyone sleeping in poll. */
sk->state_change(sk);
- return(0);
+ err = 0;
+out:
+ release_sock(sk);
+ return err;
}
-
unsigned int inet_poll(struct file * file, struct socket *sock, poll_table *wait)
{
struct sock *sk = sock->sk;
@@ -892,15 +834,6 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case SIOCGARP:
case SIOCSARP:
return(arp_ioctl(cmd,(void *) arg));
- case SIOCDRARP:
- case SIOCGRARP:
- case SIOCSRARP:
-#ifdef CONFIG_KMOD
- if (rarp_ioctl_hook == NULL)
- request_module("rarp");
-#endif
- if (rarp_ioctl_hook != NULL)
- return(rarp_ioctl_hook(cmd,(void *) arg));
case SIOCGIFADDR:
case SIOCSIFADDR:
case SIOCGIFBRDADDR:
@@ -915,8 +848,11 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
return(devinet_ioctl(cmd,(void *) arg));
case SIOCGIFBR:
case SIOCSIFBR:
-#ifdef CONFIG_BRIDGE
- return(br_ioctl(cmd,(void *) arg));
+#ifdef CONFIG_BRIDGE
+ lock_kernel();
+ err = br_ioctl(cmd,(void *) arg);
+ unlock_kernel();
+ return err;
#else
return -ENOPKG;
#endif
@@ -924,7 +860,10 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case SIOCADDDLCI:
case SIOCDELDLCI:
#ifdef CONFIG_DLCI
- return(dlci_ioctl(cmd, (void *) arg));
+ lock_kernel();
+ err = dlci_ioctl(cmd, (void *) arg);
+ unlock_kernel();
+ return err;
#endif
#ifdef CONFIG_DLCI_MODULE
@@ -934,8 +873,12 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
request_module("dlci");
#endif
- if (dlci_ioctl_hook)
- return((*dlci_ioctl_hook)(cmd, (void *) arg));
+ if (dlci_ioctl_hook) {
+ lock_kernel();
+ err = (*dlci_ioctl_hook)(cmd, (void *) arg);
+ unlock_kernel();
+ return err;
+ }
#endif
return -ENOPKG;
@@ -960,7 +903,6 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
struct proto_ops inet_stream_ops = {
PF_INET,
- sock_no_dup,
inet_release,
inet_bind,
inet_stream_connect,
@@ -975,13 +917,13 @@ struct proto_ops inet_stream_ops = {
inet_getsockopt,
sock_no_fcntl,
inet_sendmsg,
- inet_recvmsg
+ inet_recvmsg,
+ sock_no_mmap
};
struct proto_ops inet_dgram_ops = {
PF_INET,
- sock_no_dup,
inet_release,
inet_bind,
inet_dgram_connect,
@@ -996,7 +938,8 @@ struct proto_ops inet_dgram_ops = {
inet_getsockopt,
sock_no_fcntl,
inet_sendmsg,
- inet_recvmsg
+ inet_recvmsg,
+ sock_no_mmap
};
struct net_proto_family inet_family_ops = {
@@ -1006,14 +949,6 @@ struct net_proto_family inet_family_ops = {
#ifdef CONFIG_PROC_FS
-#ifdef CONFIG_INET_RARP
-static struct proc_dir_entry proc_net_rarp = {
- PROC_NET_RARP, 4, "rarp",
- S_IFREG | S_IRUGO, 1, 0, 0,
- 0, &proc_net_inode_operations,
- rarp_get_info
-};
-#endif /* RARP */
static struct proc_dir_entry proc_net_raw = {
PROC_NET_RAW, 3, "raw",
S_IFREG | S_IRUGO, 1, 0, 0,
@@ -1060,7 +995,7 @@ extern void tcp_v4_init(struct net_proto_family *);
* Called by socket.c on kernel startup.
*/
-__initfunc(void inet_proto_init(struct net_proto *pro))
+void __init inet_proto_init(struct net_proto *pro)
{
struct sk_buff *dummy_skb;
struct inet_protocol *p;
@@ -1127,34 +1062,17 @@ __initfunc(void inet_proto_init(struct net_proto *pro))
#endif
/*
- * Set the firewalling up
- */
-#if defined(CONFIG_IP_FIREWALL)
- ip_fw_init();
-#endif
-
-#ifdef CONFIG_IP_MASQUERADE
- ip_masq_init();
-#endif
-
- /*
* Initialise the multicast router
*/
#if defined(CONFIG_IP_MROUTE)
ip_mr_init();
#endif
-#ifdef CONFIG_INET_RARP
- rarp_ioctl_hook = rarp_ioctl;
-#endif
/*
* Create all the /proc entries.
*/
#ifdef CONFIG_PROC_FS
-#ifdef CONFIG_INET_RARP
- proc_net_register(&proc_net_rarp);
-#endif /* RARP */
proc_net_register(&proc_net_raw);
proc_net_register(&proc_net_snmp);
proc_net_register(&proc_net_netstat);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index a3ca88701..0b1ee6387 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1,6 +1,6 @@
/* linux/net/inet/arp.c
*
- * Version: $Id: arp.c,v 1.78 1999/06/09 10:10:36 davem Exp $
+ * Version: $Id: arp.c,v 1.81 1999/08/30 10:17:05 davem Exp $
*
* Copyright (C) 1994 by Florian La Roche
*
@@ -115,6 +115,9 @@
#include <net/netrom.h>
#endif
#endif
+#ifdef CONFIG_ATM_CLIP
+#include <net/atmclip.h>
+#endif
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -127,6 +130,7 @@ static char *ax2asc2(ax25_address *a, char *buf);
/*
* Interface to generic neighbour cache.
*/
+static u32 arp_hash(const void *pkey, const struct net_device *dev);
static int arp_constructor(struct neighbour *neigh);
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -186,16 +190,18 @@ struct neigh_table arp_tbl =
AF_INET,
sizeof(struct neighbour) + 4,
4,
+ arp_hash,
arp_constructor,
NULL,
NULL,
parp_redo,
+ "arp_cache",
{ NULL, NULL, &arp_tbl, 0, NULL, NULL,
30*HZ, 1*HZ, 60*HZ, 30*HZ, 5*HZ, 3, 3, 0, 3, 1*HZ, (8*HZ)/10, 64, 1*HZ },
30*HZ, 128, 512, 1024,
};
-int arp_mc_map(u32 addr, u8 *haddr, struct device *dev, int dir)
+int arp_mc_map(u32 addr, u8 *haddr, struct net_device *dev, int dir)
{
switch (dev->type) {
case ARPHRD_ETHER:
@@ -213,12 +219,24 @@ int arp_mc_map(u32 addr, u8 *haddr, struct device *dev, int dir)
}
+static u32 arp_hash(const void *pkey, const struct net_device *dev)
+{
+ u32 hash_val;
+
+ hash_val = *(u32*)pkey;
+ hash_val ^= (hash_val>>16);
+ hash_val ^= hash_val>>8;
+ hash_val ^= hash_val>>3;
+ hash_val = (hash_val^dev->ifindex)&NEIGH_HASHMASK;
+
+ return hash_val;
+}
static int arp_constructor(struct neighbour *neigh)
{
u32 addr = *(u32*)neigh->primary_key;
- struct device *dev = neigh->dev;
- struct in_device *in_dev = dev->ip_ptr;
+ struct net_device *dev = neigh->dev;
+ struct in_device *in_dev = in_dev_get(dev);
if (in_dev == NULL)
return -EINVAL;
@@ -227,6 +245,8 @@ static int arp_constructor(struct neighbour *neigh)
if (in_dev->arp_parms)
neigh->parms = in_dev->arp_parms;
+ in_dev_put(in_dev);
+
if (dev->hard_header == NULL) {
neigh->nud_state = NUD_NOARP;
neigh->ops = &arp_direct_ops;
@@ -293,7 +313,6 @@ static int arp_constructor(struct neighbour *neigh)
else
neigh->output = neigh->ops->output;
}
-
return 0;
}
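
The new arp_hash() above xor-folds the 32-bit protocol address toward its low bits and mixes in the interface index; neighbour tables now carry their hash function (and a cache name) in the table descriptor, hence the new arp_hash and "arp_cache" slots in arp_tbl. The fold, rendered standalone (NEIGH_HASHMASK assumed to be 31 here; the kernel takes it from neighbour.h):

/* Standalone rendering of the xor-fold used by arp_hash(). */
static unsigned int xor_fold_hash(unsigned int key, int ifindex)
{
	unsigned int h = key;
	h ^= h >> 16;			/* fold high half down    */
	h ^= h >> 8;			/* keep mixing low bits   */
	h ^= h >> 3;
	return (h ^ ifindex) & 31;	/* NEIGH_HASHMASK assumed */
}
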
@@ -307,7 +326,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
u32 saddr;
u8 *dst_ha = NULL;
- struct device *dev = neigh->dev;
+ struct net_device *dev = neigh->dev;
u32 target = *(u32*)neigh->primary_key;
int probes = atomic_read(&neigh->probes);
@@ -345,7 +364,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
* is allowed to use this function, it is scheduled to be removed. --ANK
*/
-static int arp_set_predefined(int addr_hint, unsigned char * haddr, u32 paddr, struct device * dev)
+static int arp_set_predefined(int addr_hint, unsigned char * haddr, u32 paddr, struct net_device * dev)
{
switch (addr_hint) {
case RTN_LOCAL:
@@ -365,7 +384,7 @@ static int arp_set_predefined(int addr_hint, unsigned char * haddr, u32 paddr, s
int arp_find(unsigned char *haddr, struct sk_buff *skb)
{
- struct device *dev = skb->dev;
+ struct net_device *dev = skb->dev;
u32 paddr;
struct neighbour *n;
@@ -401,7 +420,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
int arp_bind_neighbour(struct dst_entry *dst)
{
- struct device *dev = dst->dev;
+ struct net_device *dev = dst->dev;
if (dev == NULL)
return 0;
@@ -409,7 +428,11 @@ int arp_bind_neighbour(struct dst_entry *dst)
u32 nexthop = ((struct rtable*)dst)->rt_gateway;
if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT))
nexthop = 0;
- dst->neighbour = __neigh_lookup(&arp_tbl, &nexthop, dev, 1);
+ dst->neighbour = __neigh_lookup(
+#ifdef CONFIG_ATM_CLIP
+ dev->type == ARPHRD_ATM ? &clip_tbl :
+#endif
+ &arp_tbl, &nexthop, dev, 1);
}
return (dst->neighbour != NULL);
}
@@ -424,7 +447,7 @@ int arp_bind_neighbour(struct dst_entry *dst)
*/
void arp_send(int type, int ptype, u32 dest_ip,
- struct device *dev, u32 src_ip,
+ struct net_device *dev, u32 src_ip,
unsigned char *dest_hw, unsigned char *src_hw,
unsigned char *target_hw)
{
@@ -531,7 +554,7 @@ static void parp_redo(struct sk_buff *skb)
* Receive an arp request by the device layer.
*/
-int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
+int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
{
struct arphdr *arp = skb->nh.arph;
unsigned char *arp_ptr= (unsigned char *)(arp+1);
@@ -540,7 +563,7 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
u32 sip, tip;
u16 dev_type = dev->type;
int addr_type;
- struct in_device *in_dev = dev->ip_ptr;
+ struct in_device *in_dev = in_dev_get(dev);
struct neighbour *n;
/*
@@ -558,6 +581,9 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
arp->ar_pln != 4)
goto out;
+ if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+ goto out_of_mem;
+
switch (dev_type) {
default:
if (arp->ar_pro != __constant_htons(ETH_P_IP))
@@ -610,7 +636,7 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
#endif
}
- /* Undertsand only these message types */
+ /* Understand only these message types */
if (arp->ar_op != __constant_htons(ARPOP_REPLY) &&
arp->ar_op != __constant_htons(ARPOP_REQUEST))
@@ -685,6 +711,7 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
} else {
pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb);
+ in_dev_put(in_dev);
return 0;
}
goto out;
@@ -731,6 +758,9 @@ int arp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
out:
kfree_skb(skb);
+ if (in_dev)
+ in_dev_put(in_dev);
+out_of_mem:
return 0;
}
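
Throughout arp_rcv() the bare dev->ip_ptr peek becomes an in_dev_get()/in_dev_put() pair, so a concurrent inetdev_destroy() cannot free the in_device mid-packet; every exit path must drop the reference, which is why the out/out_of_mem labels were added. The discipline, as a sketch:

/* Sketch of the acquire/release discipline introduced above. */
static int sketch_rcv(struct sk_buff *skb, struct net_device *dev)
{
	struct in_device *in_dev = in_dev_get(dev);	/* takes a ref */

	if (in_dev == NULL)
		goto drop;
	/* ... use in_dev->cnf, in_dev->arp_parms ... */
	in_dev_put(in_dev);		/* every path must drop the ref */
drop:
	kfree_skb(skb);
	return 0;
}
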
@@ -744,7 +774,7 @@ out:
* Set (create) an ARP cache entry.
*/
-int arp_req_set(struct arpreq *r, struct device * dev)
+int arp_req_set(struct arpreq *r, struct net_device * dev)
{
u32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
struct neighbour *neigh;
@@ -768,8 +798,8 @@ int arp_req_set(struct arpreq *r, struct device * dev)
ipv4_devconf.proxy_arp = 1;
return 0;
}
- if (dev->ip_ptr) {
- ((struct in_device*)dev->ip_ptr)->cnf.proxy_arp = 1;
+ if (__in_dev_get(dev)) {
+ __in_dev_get(dev)->cnf.proxy_arp = 1;
return 0;
}
return -ENXIO;
@@ -816,7 +846,7 @@ static unsigned arp_state_to_flags(struct neighbour *neigh)
* Get an ARP cache entry.
*/
-static int arp_req_get(struct arpreq *r, struct device *dev)
+static int arp_req_get(struct arpreq *r, struct net_device *dev)
{
u32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
struct neighbour *neigh;
@@ -836,7 +866,7 @@ static int arp_req_get(struct arpreq *r, struct device *dev)
return err;
}
-int arp_req_delete(struct arpreq *r, struct device * dev)
+int arp_req_delete(struct arpreq *r, struct net_device * dev)
{
int err;
u32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
@@ -851,8 +881,8 @@ int arp_req_delete(struct arpreq *r, struct device * dev)
ipv4_devconf.proxy_arp = 0;
return 0;
}
- if (dev->ip_ptr) {
- ((struct in_device*)dev->ip_ptr)->cnf.proxy_arp = 0;
+ if (__in_dev_get(dev)) {
+ __in_dev_get(dev)->cnf.proxy_arp = 0;
return 0;
}
return -ENXIO;
@@ -887,7 +917,7 @@ int arp_ioctl(unsigned int cmd, void *arg)
{
int err;
struct arpreq r;
- struct device * dev = NULL;
+ struct net_device * dev = NULL;
switch(cmd) {
case SIOCDARP:
@@ -915,7 +945,7 @@ int arp_ioctl(unsigned int cmd, void *arg)
rtnl_lock();
if (r.arp_dev[0]) {
err = -ENODEV;
- if ((dev = dev_get(r.arp_dev)) == NULL)
+ if ((dev = __dev_get_by_name(r.arp_dev)) == NULL)
goto out;
/* Mmmm... It is wrong... ARPHRD_NETROM==0 */
@@ -973,7 +1003,7 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy
struct neighbour *n;
read_lock_bh(&arp_tbl.lock);
for (n=arp_tbl.hash_buckets[i]; n; n=n->next) {
- struct device *dev = n->dev;
+ struct net_device *dev = n->dev;
int hatype = dev->type;
/* Do not confuse users "arp -a" with magic entries */
@@ -1028,7 +1058,7 @@ int arp_get_info(char *buffer, char **start, off_t offset, int length, int dummy
for (i=0; i<=PNEIGH_HASHMASK; i++) {
struct pneigh_entry *n;
for (n=arp_tbl.phash_buckets[i]; n; n=n->next) {
- struct device *dev = n->dev;
+ struct net_device *dev = n->dev;
int hatype = dev ? dev->type : 0;
size = sprintf(buffer+len,
@@ -1067,7 +1097,7 @@ done:
It is necessary, that this routine was called after route cache will be
flushed.
*/
-void arp_ifdown(struct device *dev)
+void arp_ifdown(struct net_device *dev)
{
neigh_ifdown(&arp_tbl, dev);
}
@@ -1082,7 +1112,7 @@ static struct packet_type arp_packet_type =
__constant_htons(ETH_P_ARP),
NULL, /* All devices */
arp_rcv,
- NULL,
+ (void*)1,
NULL
};
@@ -1095,7 +1125,7 @@ static struct proc_dir_entry proc_net_arp = {
};
#endif
-__initfunc(void arp_init (void))
+void __init arp_init (void)
{
neigh_table_init(&arp_tbl);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index ff2c930d1..2c2da8eee 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1,7 +1,7 @@
/*
* NET3 IP device support routines.
*
- * Version: $Id: devinet.c,v 1.32 1999/06/09 11:15:33 davem Exp $
+ * Version: $Id: devinet.c,v 1.35 1999/08/31 07:03:20 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -77,6 +77,11 @@ static void devinet_sysctl_unregister(struct ipv4_devconf *p);
int inet_ifa_count;
int inet_dev_count;
+/* Locks all the inet devices. */
+
+rwlock_t inetdev_lock = RW_LOCK_UNLOCKED;
+
+
static struct in_ifaddr * inet_alloc_ifa(void)
{
struct in_ifaddr *ifa;
@@ -92,19 +97,41 @@ static struct in_ifaddr * inet_alloc_ifa(void)
static __inline__ void inet_free_ifa(struct in_ifaddr *ifa)
{
+ if (ifa->ifa_dev)
+ __in_dev_put(ifa->ifa_dev);
kfree_s(ifa, sizeof(*ifa));
inet_ifa_count--;
}
-struct in_device *inetdev_init(struct device *dev)
+void in_dev_finish_destroy(struct in_device *idev)
+{
+ struct net_device *dev = idev->dev;
+
+ BUG_TRAP(idev->ifa_list==NULL);
+ BUG_TRAP(idev->mc_list==NULL);
+#ifdef NET_REFCNT_DEBUG
+ printk(KERN_DEBUG "in_dev_finish_destroy: %p=%s\n", idev, dev ? dev->name : "NIL");
+#endif
+ dev_put(dev);
+ if (!idev->dead) {
+ printk("Freeing alive in_device %p\n", idev);
+ return;
+ }
+ inet_dev_count--;
+ kfree_s(idev, sizeof(*idev));
+}
+
+struct in_device *inetdev_init(struct net_device *dev)
{
struct in_device *in_dev;
+ ASSERT_RTNL();
+
in_dev = kmalloc(sizeof(*in_dev), GFP_KERNEL);
if (!in_dev)
return NULL;
- inet_dev_count++;
memset(in_dev, 0, sizeof(*in_dev));
+ in_dev->lock = RW_LOCK_UNLOCKED;
memcpy(&in_dev->cnf, &ipv4_devconf_dflt, sizeof(in_dev->cnf));
in_dev->cnf.sysctl = NULL;
in_dev->dev = dev;
@@ -112,10 +139,17 @@ struct in_device *inetdev_init(struct device *dev)
kfree(in_dev);
return NULL;
}
+ inet_dev_count++;
+ /* Reference in_dev->dev */
+ dev_hold(dev);
#ifdef CONFIG_SYSCTL
neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4, NET_IPV4_NEIGH, "ipv4");
#endif
+ write_lock_bh(&inetdev_lock);
dev->ip_ptr = in_dev;
+ /* Account for reference dev->ip_ptr */
+ in_dev_hold(in_dev);
+ write_unlock_bh(&inetdev_lock);
#ifdef CONFIG_SYSCTL
devinet_sysctl_register(in_dev, &in_dev->cnf);
#endif
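
inetdev_init() now publishes the in_device through dev->ip_ptr under the new inetdev_lock, taking one reference for the pointer (in_dev_hold) and one on the device itself (dev_hold). A reader matching this publication looks roughly like the sketch below (the refcnt field name is an assumption; the patch's readers use __in_dev_get()/in_dev_get()):

/* Reader side matching the locked publication above (sketch). */
static struct in_device *sketch_in_dev_get(struct net_device *dev)
{
	struct in_device *in_dev;

	read_lock(&inetdev_lock);
	in_dev = dev->ip_ptr;
	if (in_dev)
		atomic_inc(&in_dev->refcnt);	/* i.e. in_dev_hold() */
	read_unlock(&inetdev_lock);
	return in_dev;
}
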
@@ -128,6 +162,10 @@ static void inetdev_destroy(struct in_device *in_dev)
{
struct in_ifaddr *ifa;
+ ASSERT_RTNL();
+
+ in_dev->dead = 1;
+
ip_mc_destroy_dev(in_dev);
while ((ifa = in_dev->ifa_list) != NULL) {
@@ -138,28 +176,38 @@ static void inetdev_destroy(struct in_device *in_dev)
#ifdef CONFIG_SYSCTL
devinet_sysctl_unregister(&in_dev->cnf);
#endif
+ write_lock_bh(&inetdev_lock);
in_dev->dev->ip_ptr = NULL;
- synchronize_bh();
+ /* in_dev_put following below will kill the in_device */
+ write_unlock_bh(&inetdev_lock);
+
+
neigh_parms_release(&arp_tbl, in_dev->arp_parms);
- kfree(in_dev);
+ in_dev_put(in_dev);
}
-struct in_ifaddr * inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b)
+int inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b)
{
+ read_lock(&in_dev->lock);
for_primary_ifa(in_dev) {
if (inet_ifa_match(a, ifa)) {
- if (!b || inet_ifa_match(b, ifa))
- return ifa;
+ if (!b || inet_ifa_match(b, ifa)) {
+ read_unlock(&in_dev->lock);
+ return 1;
+ }
}
} endfor_ifa(in_dev);
- return NULL;
-}
+ read_unlock(&in_dev->lock);
+ return 0;
+}
static void
inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy)
{
struct in_ifaddr *ifa1 = *ifap;
+ ASSERT_RTNL();
+
/* 1. Deleting primary ifaddr forces deletion all secondaries */
if (!(ifa1->ifa_flags&IFA_F_SECONDARY)) {
@@ -173,8 +221,9 @@ inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy)
ifap1 = &ifa->ifa_next;
continue;
}
+ write_lock_bh(&in_dev->lock);
*ifap1 = ifa->ifa_next;
- synchronize_bh();
+ write_unlock_bh(&in_dev->lock);
rtmsg_ifa(RTM_DELADDR, ifa);
notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa);
@@ -184,8 +233,9 @@ inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy)
/* 2. Unlink it */
+ write_lock_bh(&in_dev->lock);
*ifap = ifa1->ifa_next;
- synchronize_bh();
+ write_unlock_bh(&in_dev->lock);
/* 3. Announce address deletion */
@@ -201,16 +251,20 @@ inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy)
notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
if (destroy) {
inet_free_ifa(ifa1);
+
if (in_dev->ifa_list == NULL)
inetdev_destroy(in_dev);
}
}
static int
-inet_insert_ifa(struct in_device *in_dev, struct in_ifaddr *ifa)
+inet_insert_ifa(struct in_ifaddr *ifa)
{
+ struct in_device *in_dev = ifa->ifa_dev;
struct in_ifaddr *ifa1, **ifap, **last_primary;
+ ASSERT_RTNL();
+
if (ifa->ifa_local == 0) {
inet_free_ifa(ifa);
return 0;
@@ -241,8 +295,9 @@ inet_insert_ifa(struct in_device *in_dev, struct in_ifaddr *ifa)
}
ifa->ifa_next = *ifap;
- wmb();
+ write_lock_bh(&in_dev->lock);
*ifap = ifa;
+ write_unlock_bh(&in_dev->lock);
/* Send message first, then call notifier.
Notifier will trigger FIB update, so that
@@ -254,9 +309,11 @@ inet_insert_ifa(struct in_device *in_dev, struct in_ifaddr *ifa)
}
static int
-inet_set_ifa(struct device *dev, struct in_ifaddr *ifa)
+inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
{
- struct in_device *in_dev = dev->ip_ptr;
+ struct in_device *in_dev = __in_dev_get(dev);
+
+ ASSERT_RTNL();
if (in_dev == NULL) {
in_dev = inetdev_init(dev);
@@ -265,23 +322,34 @@ inet_set_ifa(struct device *dev, struct in_ifaddr *ifa)
return -ENOBUFS;
}
}
- ifa->ifa_dev = in_dev;
+ if (ifa->ifa_dev != in_dev) {
+ BUG_TRAP(ifa->ifa_dev==NULL);
+ in_dev_hold(in_dev);
+ ifa->ifa_dev=in_dev;
+ }
if (LOOPBACK(ifa->ifa_local))
ifa->ifa_scope = RT_SCOPE_HOST;
- return inet_insert_ifa(in_dev, ifa);
+ return inet_insert_ifa(ifa);
}
struct in_device *inetdev_by_index(int ifindex)
{
- struct device *dev;
- dev = dev_get_by_index(ifindex);
+ struct net_device *dev;
+ struct in_device *in_dev = NULL;
+ read_lock(&dev_base_lock);
+ dev = __dev_get_by_index(ifindex);
if (dev)
- return dev->ip_ptr;
- return NULL;
+ in_dev = in_dev_get(dev);
+ read_unlock(&dev_base_lock);
+ return in_dev;
}
+/* Called only from RTNL semaphored context. No locks. */
+
struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, u32 prefix, u32 mask)
{
+ ASSERT_RTNL();
+
for_primary_ifa(in_dev) {
if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa))
return ifa;
@@ -291,10 +359,6 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, u32 prefix, u32 ma
#ifdef CONFIG_RTNETLINK
-/* rtm_{add|del} functions are not reenterable, so that
- this structure can be made static
- */
-
int
inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
{
@@ -303,8 +367,11 @@ inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
struct ifaddrmsg *ifm = NLMSG_DATA(nlh);
struct in_ifaddr *ifa, **ifap;
+ ASSERT_RTNL();
+
if ((in_dev = inetdev_by_index(ifm->ifa_index)) == NULL)
return -EADDRNOTAVAIL;
+ __in_dev_put(in_dev);
for (ifap=&in_dev->ifa_list; (ifa=*ifap)!=NULL; ifap=&ifa->ifa_next) {
if ((rta[IFA_LOCAL-1] && memcmp(RTA_DATA(rta[IFA_LOCAL-1]), &ifa->ifa_local, 4)) ||
@@ -324,18 +391,20 @@ int
inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
{
struct rtattr **rta = arg;
- struct device *dev;
+ struct net_device *dev;
struct in_device *in_dev;
struct ifaddrmsg *ifm = NLMSG_DATA(nlh);
struct in_ifaddr *ifa;
+ ASSERT_RTNL();
+
if (ifm->ifa_prefixlen > 32 || rta[IFA_LOCAL-1] == NULL)
return -EINVAL;
- if ((dev = dev_get_by_index(ifm->ifa_index)) == NULL)
+ if ((dev = __dev_get_by_index(ifm->ifa_index)) == NULL)
return -ENODEV;
- if ((in_dev = dev->ip_ptr) == NULL) {
+ if ((in_dev = __in_dev_get(dev)) == NULL) {
in_dev = inetdev_init(dev);
if (!in_dev)
return -ENOBUFS;
@@ -356,13 +425,14 @@ inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
memcpy(&ifa->ifa_anycast, RTA_DATA(rta[IFA_ANYCAST-1]), 4);
ifa->ifa_flags = ifm->ifa_flags;
ifa->ifa_scope = ifm->ifa_scope;
+ in_dev_hold(in_dev);
ifa->ifa_dev = in_dev;
if (rta[IFA_LABEL-1])
memcpy(ifa->ifa_label, RTA_DATA(rta[IFA_LABEL-1]), IFNAMSIZ);
else
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
- return inet_insert_ifa(in_dev, ifa);
+ return inet_insert_ifa(ifa);
}
#endif
@@ -399,11 +469,10 @@ int devinet_ioctl(unsigned int cmd, void *arg)
struct in_device *in_dev;
struct in_ifaddr **ifap = NULL;
struct in_ifaddr *ifa = NULL;
- struct device *dev;
+ struct net_device *dev;
#ifdef CONFIG_IP_ALIAS
char *colon;
#endif
- int exclusive = 0;
int ret = 0;
/*
@@ -440,8 +509,6 @@ int devinet_ioctl(unsigned int cmd, void *arg)
case SIOCSIFFLAGS:
if (!capable(CAP_NET_ADMIN))
return -EACCES;
- rtnl_lock();
- exclusive = 1;
break;
case SIOCSIFADDR: /* Set interface address (and family) */
case SIOCSIFBRDADDR: /* Set the broadcast address */
@@ -451,15 +518,14 @@ int devinet_ioctl(unsigned int cmd, void *arg)
return -EACCES;
if (sin->sin_family != AF_INET)
return -EINVAL;
- rtnl_lock();
- exclusive = 1;
break;
default:
return -EINVAL;
}
+ rtnl_lock();
- if ((dev = dev_get(ifr.ifr_name)) == NULL) {
+ if ((dev = __dev_get_by_name(ifr.ifr_name)) == NULL) {
ret = -ENODEV;
goto done;
}
@@ -469,7 +535,7 @@ int devinet_ioctl(unsigned int cmd, void *arg)
*colon = ':';
#endif
- if ((in_dev=dev->ip_ptr) != NULL) {
+ if ((in_dev=__in_dev_get(dev)) != NULL) {
for (ifap=&in_dev->ifa_list; (ifa=*ifap) != NULL; ifap=&ifa->ifa_next)
if (strcmp(ifr.ifr_name, ifa->ifa_label) == 0)
break;
@@ -557,7 +623,7 @@ int devinet_ioctl(unsigned int cmd, void *arg)
if (ifa->ifa_broadcast != sin->sin_addr.s_addr) {
inet_del_ifa(in_dev, ifap, 0);
ifa->ifa_broadcast = sin->sin_addr.s_addr;
- inet_insert_ifa(in_dev, ifa);
+ inet_insert_ifa(ifa);
}
break;
@@ -569,7 +635,7 @@ int devinet_ioctl(unsigned int cmd, void *arg)
}
inet_del_ifa(in_dev, ifap, 0);
ifa->ifa_address = sin->sin_addr.s_addr;
- inet_insert_ifa(in_dev, ifa);
+ inet_insert_ifa(ifa);
}
break;
@@ -587,71 +653,83 @@ int devinet_ioctl(unsigned int cmd, void *arg)
inet_del_ifa(in_dev, ifap, 0);
ifa->ifa_mask = sin->sin_addr.s_addr;
ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask);
- inet_set_ifa(dev, ifa);
+ inet_insert_ifa(ifa);
}
break;
}
done:
- if (exclusive)
- rtnl_unlock();
+ rtnl_unlock();
return ret;
rarok:
+ rtnl_unlock();
if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
return -EFAULT;
return 0;
}
static int
-inet_gifconf(struct device *dev, char *buf, int len)
+inet_gifconf(struct net_device *dev, char *buf, int len)
{
- struct in_device *in_dev = dev->ip_ptr;
+ struct in_device *in_dev = __in_dev_get(dev);
struct in_ifaddr *ifa;
- struct ifreq *ifr = (struct ifreq *) buf;
+ struct ifreq ifr;
int done=0;
if (in_dev==NULL || (ifa=in_dev->ifa_list)==NULL)
return 0;
for ( ; ifa; ifa = ifa->ifa_next) {
- if (!ifr) {
+ if (!buf) {
done += sizeof(ifr);
continue;
}
if (len < (int) sizeof(ifr))
return done;
- memset(ifr, 0, sizeof(struct ifreq));
+ memset(&ifr, 0, sizeof(struct ifreq));
if (ifa->ifa_label)
- strcpy(ifr->ifr_name, ifa->ifa_label);
+ strcpy(ifr.ifr_name, ifa->ifa_label);
else
- strcpy(ifr->ifr_name, dev->name);
+ strcpy(ifr.ifr_name, dev->name);
- (*(struct sockaddr_in *) &ifr->ifr_addr).sin_family = AF_INET;
- (*(struct sockaddr_in *) &ifr->ifr_addr).sin_addr.s_addr = ifa->ifa_local;
+ (*(struct sockaddr_in *) &ifr.ifr_addr).sin_family = AF_INET;
+ (*(struct sockaddr_in *) &ifr.ifr_addr).sin_addr.s_addr = ifa->ifa_local;
- ifr++;
+ if (copy_to_user(buf, &ifr, sizeof(struct ifreq)))
+ return -EFAULT;
+ buf += sizeof(struct ifreq);
len -= sizeof(struct ifreq);
done += sizeof(struct ifreq);
}
return done;
}
-u32 inet_select_addr(const struct device *dev, u32 dst, int scope)
+u32 inet_select_addr(const struct net_device *dev, u32 dst, int scope)
{
u32 addr = 0;
- const struct in_device *in_dev = dev->ip_ptr;
+ struct in_device *in_dev;
- if (in_dev == NULL)
+ read_lock(&inetdev_lock);
+ in_dev = __in_dev_get(dev);
+ if (in_dev == NULL) {
+ read_unlock(&inetdev_lock);
return 0;
+ }
+ read_lock(&in_dev->lock);
for_primary_ifa(in_dev) {
if (ifa->ifa_scope > scope)
continue;
- addr = ifa->ifa_local;
- if (!dst || inet_ifa_match(dst, ifa))
- return addr;
+ if (!dst || inet_ifa_match(dst, ifa)) {
+ addr = ifa->ifa_local;
+ break;
+ }
+ if (!addr)
+ addr = ifa->ifa_local;
} endfor_ifa(in_dev);
-
+ read_unlock(&in_dev->lock);
+ read_unlock(&inetdev_lock);
+
if (addr || scope >= RT_SCOPE_LINK)
return addr;
@@ -660,17 +738,23 @@ u32 inet_select_addr(const struct device *dev, u32 dst, int scope)
in dev_base list.
*/
read_lock(&dev_base_lock);
+ read_lock(&inetdev_lock);
for (dev=dev_base; dev; dev=dev->next) {
- if ((in_dev=dev->ip_ptr) == NULL)
+ if ((in_dev=__in_dev_get(dev)) == NULL)
continue;
+ read_lock(&in_dev->lock);
for_primary_ifa(in_dev) {
if (ifa->ifa_scope <= scope) {
+ read_unlock(&in_dev->lock);
+ read_unlock(&inetdev_lock);
read_unlock(&dev_base_lock);
return ifa->ifa_local;
}
} endfor_ifa(in_dev);
+ read_unlock(&in_dev->lock);
}
+ read_unlock(&inetdev_lock);
read_unlock(&dev_base_lock);
return 0;
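
Besides the locking, the rewritten inet_select_addr() changes the fallback: the old loop returned only on a subnet match and otherwise ended up holding the last in-scope primary address, while the new one remembers the first in-scope address and still prefers one on the same subnet as dst. The preference order, using the file's own for_primary_ifa()/endfor_ifa() iterators (sketch):

/* Sketch of the selection order implemented above. */
u32 addr = 0;
for_primary_ifa(in_dev) {
	if (ifa->ifa_scope > scope)
		continue;			/* out of scope            */
	if (!dst || inet_ifa_match(dst, ifa)) {
		addr = ifa->ifa_local;		/* best: on-link with dst  */
		break;
	}
	if (!addr)
		addr = ifa->ifa_local;		/* first in-scope fallback */
} endfor_ifa(in_dev);
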
@@ -689,22 +773,27 @@ int unregister_inetaddr_notifier(struct notifier_block *nb)
{
return notifier_chain_unregister(&inetaddr_chain,nb);
}
-
+
+/* Called only under RTNL semaphore */
+
static int inetdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
- struct device *dev = ptr;
- struct in_device *in_dev = dev->ip_ptr;
+ struct net_device *dev = ptr;
+ struct in_device *in_dev = __in_dev_get(dev);
+
+ ASSERT_RTNL();
if (in_dev == NULL)
return NOTIFY_DONE;
switch (event) {
case NETDEV_REGISTER:
- if (in_dev)
- printk(KERN_DEBUG "inetdev_event: bug\n");
+ printk(KERN_DEBUG "inetdev_event: bug\n");
dev->ip_ptr = NULL;
break;
case NETDEV_UP:
+ if (dev->mtu < 68)
+ break;
if (dev == &loopback_dev) {
struct in_ifaddr *ifa;
if ((ifa = inet_alloc_ifa()) != NULL) {
@@ -712,10 +801,11 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, void
ifa->ifa_address = htonl(INADDR_LOOPBACK);
ifa->ifa_prefixlen = 8;
ifa->ifa_mask = inet_make_mask(8);
+ in_dev_hold(in_dev);
ifa->ifa_dev = in_dev;
ifa->ifa_scope = RT_SCOPE_HOST;
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
- inet_insert_ifa(in_dev, ifa);
+ inet_insert_ifa(ifa);
}
}
ip_mc_up(in_dev);
@@ -723,6 +813,10 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, void
case NETDEV_DOWN:
ip_mc_down(in_dev);
break;
+ case NETDEV_CHANGEMTU:
+ if (dev->mtu >= 68)
+ break;
+ /* MTU falled under 68, disable IP */
case NETDEV_UNREGISTER:
inetdev_destroy(in_dev);
break;
@@ -786,7 +880,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
{
int idx, ip_idx;
int s_idx, s_ip_idx;
- struct device *dev;
+ struct net_device *dev;
struct in_device *in_dev;
struct in_ifaddr *ifa;
@@ -798,17 +892,27 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
continue;
if (idx > s_idx)
s_ip_idx = 0;
- if ((in_dev = dev->ip_ptr) == NULL)
+ read_lock(&inetdev_lock);
+ if ((in_dev = __in_dev_get(dev)) == NULL) {
+ read_unlock(&inetdev_lock);
continue;
+ }
+ read_lock(&in_dev->lock);
for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
ifa = ifa->ifa_next, ip_idx++) {
if (ip_idx < s_ip_idx)
continue;
if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq, RTM_NEWADDR) <= 0)
+ cb->nlh->nlmsg_seq, RTM_NEWADDR) <= 0) {
+ read_unlock(&in_dev->lock);
+ read_unlock(&inetdev_lock);
goto done;
+ }
}
+ read_unlock(&in_dev->lock);
+ read_unlock(&inetdev_lock);
}
+
done:
read_unlock(&dev_base_lock);
cb->args[0] = idx;
@@ -879,7 +983,7 @@ static struct rtnetlink_link inet_rtnetlink_table[RTM_MAX-RTM_BASE+1] =
void inet_forward_change()
{
- struct device *dev;
+ struct net_device *dev;
int on = ipv4_devconf.forwarding;
ipv4_devconf.accept_redirects = !on;
@@ -887,9 +991,12 @@ void inet_forward_change()
read_lock(&dev_base_lock);
for (dev = dev_base; dev; dev = dev->next) {
- struct in_device *in_dev = dev->ip_ptr;
+ struct in_device *in_dev;
+ read_lock(&inetdev_lock);
+ in_dev = __in_dev_get(dev);
if (in_dev)
in_dev->cnf.forwarding = on;
+ read_unlock(&inetdev_lock);
}
read_unlock(&dev_base_lock);
@@ -921,7 +1028,7 @@ int devinet_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
static struct devinet_sysctl_table
{
struct ctl_table_header *sysctl_header;
- ctl_table devinet_vars[12];
+ ctl_table devinet_vars[13];
ctl_table devinet_dev[2];
ctl_table devinet_conf_dir[2];
ctl_table devinet_proto_dir[2];
@@ -961,6 +1068,9 @@ static struct devinet_sysctl_table
{NET_IPV4_CONF_LOG_MARTIANS, "log_martians",
&ipv4_devconf.log_martians, sizeof(int), 0644, NULL,
&proc_dointvec},
+ {NET_IPV4_CONF_TAG, "tag",
+ &ipv4_devconf.tag, sizeof(int), 0644, NULL,
+ &proc_dointvec},
{0}},
{{NET_PROTO_CONF_ALL, "all", NULL, 0, 0555, devinet_sysctl.devinet_vars},{0}},
@@ -972,7 +1082,7 @@ static struct devinet_sysctl_table
static void devinet_sysctl_register(struct in_device *in_dev, struct ipv4_devconf *p)
{
int i;
- struct device *dev = in_dev ? in_dev->dev : NULL;
+ struct net_device *dev = in_dev ? in_dev->dev : NULL;
struct devinet_sysctl_table *t;
t = kmalloc(sizeof(*t), GFP_KERNEL);
@@ -1017,7 +1127,7 @@ static void devinet_sysctl_unregister(struct ipv4_devconf *p)
}
#endif
-__initfunc(void devinet_init(void))
+void __init devinet_init(void)
{
register_gifconf(PF_INET, inet_gifconf);
register_netdevice_notifier(&ip_netdev_notifier);
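
The devinet.c changes above replace every bare dev->ip_ptr dereference with
refcounted accessors guarded by the global inetdev_lock and the per-device
in_dev->lock. A minimal sketch of the two access patterns the new code
expects, using only helpers that appear in the hunks above:

	/* Short-lived access: hold the global lock, take no reference. */
	read_lock(&inetdev_lock);
	in_dev = __in_dev_get(dev);
	if (in_dev) {
		read_lock(&in_dev->lock);
		/* ... walk in_dev->ifa_list, read in_dev->cnf ... */
		read_unlock(&in_dev->lock);
	}
	read_unlock(&inetdev_lock);

	/* Access that may outlive the lock: take a counted reference. */
	in_dev = in_dev_get(dev);	/* may return NULL */
	if (in_dev) {
		/* ... use in_dev without inetdev_lock held ... */
		in_dev_put(in_dev);	/* balance the reference */
	}
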
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index d57d4daa9..656acf2c9 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -5,7 +5,7 @@
*
* IPv4 Forwarding Information Base: FIB frontend.
*
- * Version: $Id: fib_frontend.c,v 1.16 1999/06/09 10:10:42 davem Exp $
+ * Version: $Id: fib_frontend.c,v 1.19 1999/08/31 07:03:23 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
@@ -142,25 +142,37 @@ fib_get_procinfo(char *buffer, char **start, off_t offset, int length, int dummy
* Find the first device with a given source address.
*/
-struct device * ip_dev_find(u32 addr)
+struct net_device * ip_dev_find(u32 addr)
{
struct rt_key key;
struct fib_result res;
+ struct net_device *dev = NULL;
memset(&key, 0, sizeof(key));
key.dst = addr;
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ res.r = NULL;
+#endif
- if (!local_table || local_table->tb_lookup(local_table, &key, &res)
- || res.type != RTN_LOCAL)
+ if (!local_table || local_table->tb_lookup(local_table, &key, &res)) {
return NULL;
-
- return FIB_RES_DEV(res);
+ }
+ if (res.type != RTN_LOCAL)
+ goto out;
+ dev = FIB_RES_DEV(res);
+ if (dev)
+ atomic_inc(&dev->refcnt);
+
+out:
+ fib_res_put(&res);
+ return dev;
}
unsigned inet_addr_type(u32 addr)
{
struct rt_key key;
struct fib_result res;
+ unsigned ret = RTN_BROADCAST;
if (ZERONET(addr) || BADCLASS(addr))
return RTN_BROADCAST;
@@ -169,13 +181,18 @@ unsigned inet_addr_type(u32 addr)
memset(&key, 0, sizeof(key));
key.dst = addr;
-
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ res.r = NULL;
+#endif
+
if (local_table) {
- if (local_table->tb_lookup(local_table, &key, &res) == 0)
- return res.type;
- return RTN_UNICAST;
+ ret = RTN_UNICAST;
+ if (local_table->tb_lookup(local_table, &key, &res) == 0) {
+ ret = res.type;
+ fib_res_put(&res);
+ }
}
- return RTN_BROADCAST;
+ return ret;
}
/* Given (packet source, input interface) and optional (dst, oif, tos):
@@ -187,11 +204,13 @@ unsigned inet_addr_type(u32 addr)
*/
int fib_validate_source(u32 src, u32 dst, u8 tos, int oif,
- struct device *dev, u32 *spec_dst, u32 *itag)
+ struct net_device *dev, u32 *spec_dst, u32 *itag)
{
- struct in_device *in_dev = dev->ip_ptr;
+ struct in_device *in_dev;
struct rt_key key;
struct fib_result res;
+ int no_addr, rpf;
+ int ret;
key.dst = src;
key.src = dst;
@@ -200,12 +219,22 @@ int fib_validate_source(u32 src, u32 dst, u8 tos, int oif,
key.iif = oif;
key.scope = RT_SCOPE_UNIVERSE;
+ no_addr = rpf = 0;
+ read_lock(&inetdev_lock);
+ in_dev = __in_dev_get(dev);
+ if (in_dev) {
+ no_addr = in_dev->ifa_list == NULL;
+ rpf = IN_DEV_RPFILTER(in_dev);
+ }
+ read_unlock(&inetdev_lock);
+
if (in_dev == NULL)
- return -EINVAL;
+ goto e_inval;
+
if (fib_lookup(&key, &res))
goto last_resort;
if (res.type != RTN_UNICAST)
- return -EINVAL;
+ goto e_inval_res;
*spec_dst = FIB_RES_PREFSRC(res);
if (itag)
fib_combine_itag(itag, &res);
@@ -214,25 +243,39 @@ int fib_validate_source(u32 src, u32 dst, u8 tos, int oif,
#else
if (FIB_RES_DEV(res) == dev)
#endif
- return FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
-
- if (in_dev->ifa_list == NULL)
+ {
+ ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+ fib_res_put(&res);
+ return ret;
+ }
+ fib_res_put(&res);
+ if (no_addr)
goto last_resort;
- if (IN_DEV_RPFILTER(in_dev))
- return -EINVAL;
+ if (rpf)
+ goto e_inval;
key.oif = dev->ifindex;
- if (fib_lookup(&key, &res) == 0 && res.type == RTN_UNICAST) {
- *spec_dst = FIB_RES_PREFSRC(res);
- return FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+
+ ret = 0;
+ if (fib_lookup(&key, &res) == 0) {
+ if (res.type == RTN_UNICAST) {
+ *spec_dst = FIB_RES_PREFSRC(res);
+ ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+ }
+ fib_res_put(&res);
}
- return 0;
+ return ret;
last_resort:
- if (IN_DEV_RPFILTER(in_dev))
- return -EINVAL;
+ if (rpf)
+ goto e_inval;
*spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
*itag = 0;
return 0;
+
+e_inval_res:
+ fib_res_put(&res);
+e_inval:
+ return -EINVAL;
}
#ifndef CONFIG_IP_NOSIOCRT
@@ -421,7 +464,7 @@ static void fib_magic(int cmd, int type, u32 dst, int dst_len, struct in_ifaddr
static void fib_add_ifaddr(struct in_ifaddr *ifa)
{
struct in_device *in_dev = ifa->ifa_dev;
- struct device *dev = in_dev->dev;
+ struct net_device *dev = in_dev->dev;
struct in_ifaddr *prim = ifa;
u32 mask = ifa->ifa_mask;
u32 addr = ifa->ifa_local;
@@ -460,7 +503,7 @@ static void fib_add_ifaddr(struct in_ifaddr *ifa)
static void fib_del_ifaddr(struct in_ifaddr *ifa)
{
struct in_device *in_dev = ifa->ifa_dev;
- struct device *dev = in_dev->dev;
+ struct net_device *dev = in_dev->dev;
struct in_ifaddr *ifa1;
struct in_ifaddr *prim = ifa;
u32 brd = ifa->ifa_address|~ifa->ifa_mask;
@@ -526,7 +569,7 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
#undef BRD1_OK
}
-static void fib_disable_ip(struct device *dev, int force)
+static void fib_disable_ip(struct net_device *dev, int force)
{
if (fib_sync_down(0, dev, force))
fib_flush();
@@ -560,8 +603,8 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
- struct device *dev = ptr;
- struct in_device *in_dev = dev->ip_ptr;
+ struct net_device *dev = ptr;
+ struct in_device *in_dev = __in_dev_get(dev);
if (!in_dev)
return NOTIFY_DONE;
@@ -602,7 +645,7 @@ struct notifier_block fib_netdev_notifier = {
0
};
-__initfunc(void ip_fib_init(void))
+void __init ip_fib_init(void)
{
#ifdef CONFIG_PROC_FS
proc_net_register(&(struct proc_dir_entry) {
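
Two ownership rules fall out of the fib_frontend.c hunks above: ip_dev_find()
now returns a device with its reference count already raised, and every
successful table lookup must be balanced with fib_res_put() (with res.r
cleared beforehand when policy routing is compiled in). A hedged caller-side
sketch, where addr is a placeholder u32 in network byte order:

	struct net_device *dev;
	struct rt_key key;
	struct fib_result res;

	dev = ip_dev_find(addr);
	if (dev) {
		/* ... use dev ... */
		dev_put(dev);		/* caller now owns the reference */
	}

	memset(&key, 0, sizeof(key));
	key.dst = addr;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;			/* must be cleared before lookup */
#endif
	if (fib_lookup(&key, &res) == 0) {
		/* ... inspect res.type, FIB_RES_DEV(res), ... */
		fib_res_put(&res);	/* drops the fib_info/rule refs */
	}
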
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 0472f6118..5c36d1f0d 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -5,7 +5,7 @@
*
* IPv4 FIB: lookup engine and maintenance routines.
*
- * Version: $Id: fib_hash.c,v 1.10 1999/06/09 10:10:45 davem Exp $
+ * Version: $Id: fib_hash.c,v 1.12 1999/08/31 07:03:27 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
@@ -48,6 +48,8 @@
printk(KERN_DEBUG a)
*/
+static kmem_cache_t * fn_hash_kmem;
+
/*
These bizarre types are just to force strict type checking.
When I reversed order of bytes and changed to natural mask lengths,
@@ -216,7 +218,7 @@ static void fn_rehash_zone(struct fn_zone *fz)
static void fn_free_node(struct fib_node * f)
{
fib_release_info(FIB_INFO(f));
- kfree_s(f, sizeof(struct fib_node));
+ kmem_cache_free(fn_hash_kmem, f);
}
@@ -298,7 +300,6 @@ fn_hash_lookup(struct fib_table *tb, const struct rt_key *key, struct fib_result
res->type = f->fn_type;
res->scope = f->fn_scope;
res->prefixlen = fz->fz_order;
- res->prefix = &fz_prefix(f->fn_key, fz);
goto out;
}
if (err < 0)
@@ -372,7 +373,10 @@ fn_hash_select_default(struct fib_table *tb, const struct rt_key *key, struct fi
if (next_fi != res->fi)
break;
} else if (!fib_detect_death(fi, order, &last_resort, &last_idx)) {
+ if (res->fi)
+ fib_info_put(res->fi);
res->fi = fi;
+ atomic_inc(&fi->fib_clntref);
fn_hash_last_dflt = order;
goto out;
}
@@ -386,13 +390,21 @@ fn_hash_select_default(struct fib_table *tb, const struct rt_key *key, struct fi
}
if (!fib_detect_death(fi, order, &last_resort, &last_idx)) {
+ if (res->fi)
+ fib_info_put(res->fi);
res->fi = fi;
+ atomic_inc(&fi->fib_clntref);
fn_hash_last_dflt = order;
goto out;
}
- if (last_idx >= 0)
+ if (last_idx >= 0) {
+ if (res->fi)
+ fib_info_put(res->fi);
res->fi = last_resort;
+ if (last_resort)
+ atomic_inc(&last_resort->fib_clntref);
+ }
fn_hash_last_dflt = last_idx;
out:
read_unlock(&fib_hash_lock);
@@ -554,7 +566,7 @@ create:
replace:
err = -ENOBUFS;
- new_f = (struct fib_node *) kmalloc(sizeof(struct fib_node), GFP_KERNEL);
+ new_f = kmem_cache_alloc(fn_hash_kmem, SLAB_KERNEL);
if (new_f == NULL)
goto out;
@@ -887,13 +899,21 @@ static void rtmsg_fib(int event, struct fib_node* f, int z, int tb_id,
#ifdef CONFIG_IP_MULTIPLE_TABLES
struct fib_table * fib_hash_init(int id)
#else
-__initfunc(struct fib_table * fib_hash_init(int id))
+struct fib_table * __init fib_hash_init(int id)
#endif
{
struct fib_table *tb;
+
+ if (fn_hash_kmem == NULL)
+ fn_hash_kmem = kmem_cache_create("ip_fib_hash",
+ sizeof(struct fib_node),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+
tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash), GFP_KERNEL);
if (tb == NULL)
return NULL;
+
tb->tb_id = id;
tb->tb_lookup = fn_hash_lookup;
tb->tb_insert = fn_hash_insert;
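
Besides moving fib_node allocation onto a dedicated slab cache (the lazy
kmem_cache_create() in fib_hash_init() above), fn_hash_select_default() now
treats res->fi as a counted reference: every time it swaps the fib_info
stored in a result, it releases the old one and pins the new one. The
invariant, condensed from the hunks above:

	/* Replacing the fib_info held by a fib_result: */
	if (res->fi)
		fib_info_put(res->fi);		/* release the old reference */
	res->fi = fi;
	if (fi)
		atomic_inc(&fi->fib_clntref);	/* pin the new one */
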
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 97074198e..5ee1bfd78 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -5,7 +5,7 @@
*
* IPv4 Forwarding Information Base: policy rules.
*
- * Version: $Id: fib_rules.c,v 1.11 1999/06/09 10:10:47 davem Exp $
+ * Version: $Id: fib_rules.c,v 1.14 1999/08/31 07:03:29 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
@@ -52,6 +52,7 @@
struct fib_rule
{
struct fib_rule *r_next;
+ atomic_t r_clntref;
u32 r_preference;
unsigned char r_table;
unsigned char r_action;
@@ -72,11 +73,12 @@ struct fib_rule
__u32 r_tclassid;
#endif
char r_ifname[IFNAMSIZ];
+ int r_dead;
};
-static struct fib_rule default_rule = { NULL, 0x7FFF, RT_TABLE_DEFAULT, RTN_UNICAST, };
-static struct fib_rule main_rule = { &default_rule, 0x7FFE, RT_TABLE_MAIN, RTN_UNICAST, };
-static struct fib_rule local_rule = { &main_rule, 0, RT_TABLE_LOCAL, RTN_UNICAST, };
+static struct fib_rule default_rule = { NULL, ATOMIC_INIT(2), 0x7FFF, RT_TABLE_DEFAULT, RTN_UNICAST, };
+static struct fib_rule main_rule = { &default_rule, ATOMIC_INIT(2), 0x7FFE, RT_TABLE_MAIN, RTN_UNICAST, };
+static struct fib_rule local_rule = { &main_rule, ATOMIC_INIT(2), 0, RT_TABLE_LOCAL, RTN_UNICAST, };
static struct fib_rule *fib_rules = &local_rule;
static rwlock_t fib_rules_lock = RW_LOCK_UNLOCKED;
@@ -107,9 +109,9 @@ int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
write_lock_bh(&fib_rules_lock);
*rp = r->r_next;
+ r->r_dead = 1;
write_unlock_bh(&fib_rules_lock);
- if (r != &default_rule && r != &main_rule)
- kfree(r);
+ fib_rule_put(r);
err = 0;
break;
}
@@ -129,6 +131,15 @@ static struct fib_table *fib_empty_table(void)
return NULL;
}
+void fib_rule_put(struct fib_rule *r)
+{
+ if (atomic_dec_and_test(&r->r_clntref)) {
+ if (r->r_dead)
+ kfree(r);
+ else
+ printk("Freeing alive rule %p\n", r);
+ }
+}
int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
{
@@ -179,11 +190,11 @@ int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
memcpy(&new_r->r_preference, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
new_r->r_table = table_id;
if (rta[RTA_IIF-1]) {
- struct device *dev;
+ struct net_device *dev;
memcpy(new_r->r_ifname, RTA_DATA(rta[RTA_IIF-1]), IFNAMSIZ);
new_r->r_ifname[IFNAMSIZ-1] = 0;
new_r->r_ifindex = -1;
- dev = dev_get(new_r->r_ifname);
+ dev = __dev_get_by_name(new_r->r_ifname);
if (dev)
new_r->r_ifindex = dev->ifindex;
}
@@ -209,6 +220,7 @@ int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
}
new_r->r_next = r;
+ atomic_inc(&new_r->r_clntref);
write_lock_bh(&fib_rules_lock);
*rp = new_r;
write_unlock_bh(&fib_rules_lock);
@@ -251,7 +263,7 @@ u32 fib_rules_tclass(struct fib_result *res)
#endif
-static void fib_rules_detach(struct device *dev)
+static void fib_rules_detach(struct net_device *dev)
{
struct fib_rule *r;
@@ -264,7 +276,7 @@ static void fib_rules_detach(struct device *dev)
}
}
-static void fib_rules_attach(struct device *dev)
+static void fib_rules_attach(struct net_device *dev)
{
struct fib_rule *r;
@@ -322,8 +334,9 @@ FRprintk("tb %d r %d ", r->r_table, r->r_action);
continue;
err = tb->tb_lookup(tb, key, res);
if (err == 0) {
-FRprintk("ok\n");
res->r = policy;
+ if (policy)
+ atomic_inc(&policy->r_clntref);
read_unlock(&fib_rules_lock);
return 0;
}
@@ -349,7 +362,7 @@ void fib_select_default(const struct rt_key *key, struct fib_result *res)
static int fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr)
{
- struct device *dev = ptr;
+ struct net_device *dev = ptr;
if (event == NETDEV_UNREGISTER)
fib_rules_detach(dev);
@@ -435,7 +448,7 @@ int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb)
#endif /* CONFIG_RTNETLINK */
-__initfunc(void fib_rules_init(void))
+void __init fib_rules_init(void)
{
register_netdevice_notifier(&fib_rules_notifier);
}
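
fib_rules.c gives rules a client reference count plus an r_dead flag:
deletion unlinks the rule under the write lock and marks it dead, but the
memory is reclaimed only by the last fib_rule_put(); a lookup that attaches a
policy to a fib_result takes a reference that fib_res_put() later returns.
The lifetime protocol, condensed from the hunks above:

	/* Delete: unlink, mark dead, drop the list's reference. */
	write_lock_bh(&fib_rules_lock);
	*rp = r->r_next;
	r->r_dead = 1;
	write_unlock_bh(&fib_rules_lock);
	fib_rule_put(r);	/* kfree()s r once the count reaches zero */

	/* Lookup: a fib_result that carries a policy pins it. */
	res->r = policy;
	if (policy)
		atomic_inc(&policy->r_clntref);

The three built-in rules start at ATOMIC_INIT(2), presumably so that their
static storage can never be handed to kfree().
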
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index b78f7ebaf..bb9f81658 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -5,7 +5,7 @@
*
* IPv4 Forwarding Information Base: semantics.
*
- * Version: $Id: fib_semantics.c,v 1.13 1999/03/21 05:22:34 davem Exp $
+ * Version: $Id: fib_semantics.c,v 1.15 1999/08/20 11:05:07 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
@@ -46,6 +46,8 @@
#define FSprintk(a...)
static struct fib_info *fib_info_list;
+static rwlock_t fib_info_lock = RW_LOCK_UNLOCKED;
+int fib_info_cnt;
#define for_fib_info() { struct fib_info *fi; \
for (fi = fib_info_list; fi; fi = fi->fib_next)
@@ -98,19 +100,38 @@ static struct
{ -EINVAL, RT_SCOPE_NOWHERE} /* RTN_XRESOLVE */
};
+
/* Release a nexthop info record */
+void free_fib_info(struct fib_info *fi)
+{
+ if (fi->fib_dead == 0) {
+ printk("Freeing alive fib_info %p\n", fi);
+ return;
+ }
+ change_nexthops(fi) {
+ if (nh->nh_dev)
+ dev_put(nh->nh_dev);
+ nh->nh_dev = NULL;
+ } endfor_nexthops(fi);
+ fib_info_cnt--;
+ kfree(fi);
+}
+
void fib_release_info(struct fib_info *fi)
{
- if (fi && !--fi->fib_refcnt) {
+ write_lock(&fib_info_lock);
+ if (fi && --fi->fib_treeref == 0) {
if (fi->fib_next)
fi->fib_next->fib_prev = fi->fib_prev;
if (fi->fib_prev)
fi->fib_prev->fib_next = fi->fib_next;
if (fi == fib_info_list)
fib_info_list = fi->fib_next;
- kfree(fi);
+ fi->fib_dead = 1;
+ fib_info_put(fi);
}
+ write_unlock(&fib_info_lock);
}
extern __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
@@ -142,9 +163,7 @@ extern __inline__ struct fib_info * fib_find_info(const struct fib_info *nfi)
if (nfi->fib_protocol == fi->fib_protocol &&
nfi->fib_prefsrc == fi->fib_prefsrc &&
nfi->fib_priority == fi->fib_priority &&
- nfi->fib_mtu == fi->fib_mtu &&
- nfi->fib_rtt == fi->fib_rtt &&
- nfi->fib_window == fi->fib_window &&
+ memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(fi->fib_metrics)) == 0 &&
((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
(nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
return fi;
@@ -156,17 +175,21 @@ extern __inline__ struct fib_info * fib_find_info(const struct fib_info *nfi)
Used only by redirect accept routine.
*/
-int ip_fib_check_default(u32 gw, struct device *dev)
+int ip_fib_check_default(u32 gw, struct net_device *dev)
{
+ read_lock(&fib_info_lock);
for_fib_info() {
if (fi->fib_flags & RTNH_F_DEAD)
continue;
for_nexthops(fi) {
if (nh->nh_dev == dev && nh->nh_gw == gw &&
- !(nh->nh_flags&RTNH_F_DEAD))
+ !(nh->nh_flags&RTNH_F_DEAD)) {
+ read_unlock(&fib_info_lock);
return 0;
+ }
} endfor_nexthops(fi);
} endfor_fib_info();
+ read_unlock(&fib_info_lock);
return -1;
}
@@ -331,17 +354,18 @@ static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_n
return 0;
#endif
if (nh->nh_flags&RTNH_F_ONLINK) {
- struct device *dev;
+ struct net_device *dev;
if (r->rtm_scope >= RT_SCOPE_LINK)
return -EINVAL;
if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
return -EINVAL;
- if ((dev = dev_get_by_index(nh->nh_oif)) == NULL)
+ if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
return -ENODEV;
if (!(dev->flags&IFF_UP))
return -ENETDOWN;
nh->nh_dev = dev;
+ atomic_inc(&dev->refcnt);
nh->nh_scope = RT_SCOPE_LINK;
return 0;
}
@@ -359,6 +383,9 @@ static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_n
nh->nh_scope = res.scope;
nh->nh_oif = FIB_RES_OIF(res);
nh->nh_dev = FIB_RES_DEV(res);
+ if (nh->nh_dev)
+ atomic_inc(&nh->nh_dev->refcnt);
+ fib_res_put(&res);
} else {
struct in_device *in_dev;
@@ -368,10 +395,14 @@ static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_n
in_dev = inetdev_by_index(nh->nh_oif);
if (in_dev == NULL)
return -ENODEV;
- if (!(in_dev->dev->flags&IFF_UP))
+ if (!(in_dev->dev->flags&IFF_UP)) {
+ in_dev_put(in_dev);
return -ENETDOWN;
+ }
nh->nh_dev = in_dev->dev;
+ atomic_inc(&nh->nh_dev->refcnt);
nh->nh_scope = RT_SCOPE_HOST;
+ in_dev_put(in_dev);
}
return 0;
}
@@ -405,6 +436,7 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
err = -ENOBUFS;
if (fi == NULL)
goto failure;
+ fib_info_cnt++;
memset(fi, 0, sizeof(*fi)+nhs*sizeof(struct fib_nh));
fi->fib_protocol = r->rtm_protocol;
@@ -419,7 +451,7 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
while (RTA_OK(attr, attrlen)) {
unsigned flavor = attr->rta_type;
if (flavor) {
- if (flavor > FIB_MAX_METRICS)
+ if (flavor > RTAX_MAX)
goto err_inval;
fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr);
}
@@ -505,17 +537,21 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
link_it:
if ((ofi = fib_find_info(fi)) != NULL) {
- kfree(fi);
- ofi->fib_refcnt++;
+ fi->fib_dead = 1;
+ free_fib_info(fi);
+ ofi->fib_treeref++;
return ofi;
}
- fi->fib_refcnt++;
+ fi->fib_treeref++;
+ atomic_inc(&fi->fib_clntref);
+ write_lock(&fib_info_lock);
fi->fib_next = fib_info_list;
fi->fib_prev = NULL;
if (fib_info_list)
fib_info_list->fib_prev = fi;
fib_info_list = fi;
+ write_unlock(&fib_info_lock);
return fi;
err_inval:
@@ -523,8 +559,10 @@ err_inval:
failure:
*errp = err;
- if (fi)
- kfree(fi);
+ if (fi) {
+ fi->fib_dead = 1;
+ free_fib_info(fi);
+ }
return NULL;
}
@@ -543,6 +581,7 @@ fib_semantic_match(int type, struct fib_info *fi, const struct rt_key *key, stru
#ifdef CONFIG_IP_ROUTE_NAT
case RTN_NAT:
FIB_RES_RESET(*res);
+ atomic_inc(&fi->fib_clntref);
return 0;
#endif
case RTN_UNICAST:
@@ -559,15 +598,20 @@ fib_semantic_match(int type, struct fib_info *fi, const struct rt_key *key, stru
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (nhsel < fi->fib_nhs) {
res->nh_sel = nhsel;
+ atomic_inc(&fi->fib_clntref);
return 0;
}
#else
- if (nhsel < 1)
+ if (nhsel < 1) {
+ atomic_inc(&fi->fib_clntref);
return 0;
+ }
#endif
endfor_nexthops(fi);
+ res->fi = NULL;
return 1;
default:
+ res->fi = NULL;
printk(KERN_DEBUG "impossible 102\n");
return -EINVAL;
}
@@ -612,16 +656,8 @@ fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
if (fi->fib_nh[0].nh_tclassid)
RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid);
#endif
- if (fi->fib_mtu || fi->fib_window || fi->fib_rtt) {
- int i;
- struct rtattr *mx = (struct rtattr *)skb->tail;
- RTA_PUT(skb, RTA_METRICS, 0, NULL);
- for (i=0; i<FIB_MAX_METRICS; i++) {
- if (fi->fib_metrics[i])
- RTA_PUT(skb, i+1, sizeof(unsigned), fi->fib_metrics + i);
- }
- mx->rta_len = skb->tail - (u8*)mx;
- }
+ if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
+ goto rtattr_failure;
if (fi->fib_prefsrc)
RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc);
if (fi->fib_nhs == 1) {
@@ -731,7 +767,7 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
#ifdef CONFIG_IP_ALIAS
char *colon;
#endif
- struct device *dev;
+ struct net_device *dev;
char devname[IFNAMSIZ];
if (copy_from_user(devname, r->rt_dev, IFNAMSIZ-1))
@@ -742,14 +778,14 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
if (colon)
*colon = 0;
#endif
- dev = dev_get(devname);
+ dev = __dev_get_by_name(devname);
if (!dev)
return -ENODEV;
rta->rta_oif = &dev->ifindex;
#ifdef CONFIG_IP_ALIAS
if (colon) {
struct in_ifaddr *ifa;
- struct in_device *in_dev = dev->ip_ptr;
+ struct in_device *in_dev = __in_dev_get(dev);
if (!in_dev)
return -ENODEV;
*colon = ':';
@@ -789,10 +825,10 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
mx->rta_len = RTA_LENGTH(0);
if (r->rt_flags&RTF_MTU) {
rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
- rec->rta_type = RTAX_MTU;
+ rec->rta_type = RTAX_ADVMSS;
rec->rta_len = RTA_LENGTH(4);
mx->rta_len += RTA_LENGTH(4);
- *(u32*)RTA_DATA(rec) = r->rt_mtu;
+ *(u32*)RTA_DATA(rec) = r->rt_mtu - 40;
}
if (r->rt_flags&RTF_WINDOW) {
rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
@@ -806,7 +842,7 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
rec->rta_type = RTAX_RTT;
rec->rta_len = RTA_LENGTH(4);
mx->rta_len += RTA_LENGTH(4);
- *(u32*)RTA_DATA(rec) = r->rt_irtt;
+ *(u32*)RTA_DATA(rec) = r->rt_irtt<<3;
}
}
return 0;
@@ -821,7 +857,7 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
- device went down -> we must shut down all nexthops going via it.
*/
-int fib_sync_down(u32 local, struct device *dev, int force)
+int fib_sync_down(u32 local, struct net_device *dev, int force)
{
int ret = 0;
int scope = RT_SCOPE_NOWHERE;
@@ -865,7 +901,7 @@ int fib_sync_down(u32 local, struct device *dev, int force)
It makes sense only on multipath routes.
*/
-int fib_sync_up(struct device *dev)
+int fib_sync_up(struct net_device *dev)
{
int ret = 0;
@@ -882,7 +918,7 @@ int fib_sync_up(struct device *dev)
}
if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
continue;
- if (nh->nh_dev != dev || dev->ip_ptr == NULL)
+ if (nh->nh_dev != dev || __in_dev_get(dev) == NULL)
continue;
alive++;
nh->nh_power = 0;
@@ -977,7 +1013,7 @@ void fib_node_get_info(int type, int dead, struct fib_info *fi, u32 prefix, u32
len = sprintf(buffer, "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
fi->fib_dev ? fi->fib_dev->name : "*", prefix,
fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority,
- mask, fi->fib_mtu, fi->fib_window, fi->fib_rtt);
+ mask, fi->fib_advmss+40, fi->fib_window, fi->fib_rtt>>3);
} else {
len = sprintf(buffer, "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
prefix, 0,
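
fib_info objects now carry two counts: fib_treeref, the number of table nodes
linking to the object (modified under fib_info_lock), and fib_clntref, an
atomic count of lookup results still holding it. fib_release_info() drops a
tree reference and, on the last one, unlinks the object, sets fib_dead, and
converts the tree's hold into a fib_info_put(); free_fib_info() refuses to
free anything not marked dead. A sketch of the client-side release helper
this implies — fib_info_put() itself is not shown in this diff, so its body
here is an assumption:

	/* Assumed inline helper, paired with atomic_inc(&fi->fib_clntref): */
	static inline void fib_info_put(struct fib_info *fi)
	{
		if (atomic_dec_and_test(&fi->fib_clntref))
			free_fib_info(fi);	/* warns unless fib_dead is set */
	}
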
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 78b5d8f9b..4d24ed413 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -3,7 +3,7 @@
*
* Alan Cox, <alan@redhat.com>
*
- * Version: $Id: icmp.c,v 1.57 1999/06/09 10:10:50 davem Exp $
+ * Version: $Id: icmp.c,v 1.61 1999/08/31 07:03:33 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -261,6 +261,7 @@
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/string.h>
+#include <linux/netfilter_ipv4.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/route.h>
@@ -278,10 +279,6 @@
#include <asm/uaccess.h>
#include <net/checksum.h>
-#ifdef CONFIG_IP_MASQUERADE
-#include <net/ip_masq.h>
-#endif
-
#define min(a,b) ((a)<(b)?(a):(b))
/*
@@ -357,6 +354,47 @@ struct icmp_bxm
struct inode icmp_inode;
struct socket *icmp_socket=&icmp_inode.u.socket_i;
+/* The ICMPv4 socket is only slightly non-reentrant (unlike ICMPv6,
+   which is strongly non-reentrant). A bit later it will be made
+   reentrant and the lock may be removed then.
+ */
+
+static int icmp_xmit_holder = -1;
+
+static int icmp_xmit_lock_bh(void)
+{
+ if (!spin_trylock(&icmp_socket->sk->lock.slock)) {
+ if (icmp_xmit_holder == smp_processor_id())
+ return -EAGAIN;
+ spin_lock(&icmp_socket->sk->lock.slock);
+ }
+ icmp_xmit_holder = smp_processor_id();
+ return 0;
+}
+
+static __inline__ int icmp_xmit_lock(void)
+{
+ int ret;
+ local_bh_disable();
+ ret = icmp_xmit_lock_bh();
+ if (ret)
+ local_bh_enable();
+ return ret;
+}
+
+static void icmp_xmit_unlock_bh(void)
+{
+ icmp_xmit_holder = -1;
+ spin_unlock(&icmp_socket->sk->lock.slock);
+}
+
+static __inline__ void icmp_xmit_unlock(void)
+{
+ icmp_xmit_unlock_bh();
+ local_bh_enable();
+}
+
+
/*
* Send an ICMP frame.
*/
@@ -480,21 +518,26 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
if (ip_options_echo(&icmp_param->replyopts, skb))
return;
+ if (icmp_xmit_lock_bh())
+ return;
+
icmp_param->icmph.checksum=0;
icmp_param->csum=0;
icmp_out_count(icmp_param->icmph.type);
- sk->ip_tos = skb->nh.iph->tos;
+ sk->protinfo.af_inet.tos = skb->nh.iph->tos;
daddr = ipc.addr = rt->rt_src;
ipc.opt = &icmp_param->replyopts;
if (ipc.opt->srr)
daddr = icmp_param->replyopts.faddr;
if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0))
- return;
+ goto out;
ip_build_xmit(sk, icmp_glue_bits, icmp_param,
icmp_param->data_len+sizeof(struct icmphdr),
&ipc, rt, MSG_DONTWAIT);
ip_rt_put(rt);
+out:
+ icmp_xmit_unlock_bh();
}
@@ -536,10 +579,8 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info)
* Now check at the protocol level
*/
if (!rt) {
-#ifndef CONFIG_IP_ALWAYS_DEFRAG
if (net_ratelimit())
printk(KERN_DEBUG "icmp_send: destinationless packet\n");
-#endif
return;
}
if (rt->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST))
@@ -575,6 +616,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info)
}
+ if (icmp_xmit_lock())
+ return;
+
/*
* Construct source address and options.
*/
@@ -588,11 +632,6 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info)
iph->saddr = rt->key.src;
}
#endif
-#ifdef CONFIG_IP_MASQUERADE
- if (type==ICMP_DEST_UNREACH && IPCB(skb_in)->flags&IPSKB_MASQUERADED) {
- ip_fw_unmasq_icmp(skb_in);
- }
-#endif
saddr = iph->daddr;
if (!(rt->rt_flags & RTCF_LOCAL))
@@ -609,7 +648,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info)
* grow the routing table.
*/
if (ip_route_output(&rt, iph->saddr, saddr, RT_TOS(tos), 0))
- return;
+ goto out;
if (ip_options_echo(&icmp_param.replyopts, skb_in))
goto ende;
@@ -626,13 +665,13 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info)
icmp_param.csum=0;
icmp_param.data_ptr=iph;
icmp_out_count(icmp_param.icmph.type);
- icmp_socket->sk->ip_tos = tos;
+ icmp_socket->sk->protinfo.af_inet.tos = tos;
ipc.addr = iph->saddr;
ipc.opt = &icmp_param.replyopts;
if (icmp_param.replyopts.srr) {
ip_rt_put(rt);
if (ip_route_output(&rt, icmp_param.replyopts.faddr, saddr, RT_TOS(tos), 0))
- return;
+ goto out;
}
if (!icmpv4_xrlim_allow(rt, type, code))
@@ -656,6 +695,8 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info)
ende:
ip_rt_put(rt);
+out:
+ icmp_xmit_unlock();
}
@@ -752,19 +793,22 @@ static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb, int len)
/* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
hash = iph->protocol & (MAX_INET_PROTOS - 1);
+ read_lock(&raw_v4_lock);
if ((raw_sk = raw_v4_htable[hash]) != NULL)
{
- while ((raw_sk = raw_v4_lookup(raw_sk, iph->protocol, iph->saddr,
- iph->daddr, skb->dev->ifindex)) != NULL) {
+ while ((raw_sk = __raw_v4_lookup(raw_sk, iph->protocol, iph->saddr,
+ iph->daddr, skb->dev->ifindex)) != NULL) {
raw_err(raw_sk, skb);
raw_sk = raw_sk->next;
}
}
+ read_unlock(&raw_v4_lock);
/*
* This can't change while we are doing it.
*/
+ read_lock(&inet_protocol_lock);
ipprot = (struct inet_protocol *) inet_protos[hash];
while(ipprot != NULL) {
struct inet_protocol *nextip;
@@ -783,6 +827,7 @@ static void icmp_unreach(struct icmphdr *icmph, struct sk_buff *skb, int len)
ipprot = nextip;
}
+ read_unlock(&inet_protocol_lock);
}
@@ -935,90 +980,41 @@ static void icmp_address(struct icmphdr *icmph, struct sk_buff *skb, int len)
static void icmp_address_reply(struct icmphdr *icmph, struct sk_buff *skb, int len)
{
struct rtable *rt = (struct rtable*)skb->dst;
- struct device *dev = skb->dev;
- struct in_device *in_dev = dev->ip_ptr;
+ struct net_device *dev = skb->dev;
+ struct in_device *in_dev;
struct in_ifaddr *ifa;
u32 mask;
- if (!in_dev || !in_dev->ifa_list ||
- !IN_DEV_LOG_MARTIANS(in_dev) ||
- !IN_DEV_FORWARD(in_dev) ||
- len < 4 ||
- !(rt->rt_flags&RTCF_DIRECTSRC))
+ if (len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC))
return;
- mask = *(u32*)&icmph[1];
- for (ifa=in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
- if (mask == ifa->ifa_mask && inet_ifa_match(rt->rt_src, ifa))
- return;
+ in_dev = in_dev_get(dev);
+ if (!in_dev)
+ return;
+ read_lock(&in_dev->lock);
+ if (in_dev->ifa_list &&
+ IN_DEV_LOG_MARTIANS(in_dev) &&
+ IN_DEV_FORWARD(in_dev)) {
+
+ mask = *(u32*)&icmph[1];
+ for (ifa=in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+ if (mask == ifa->ifa_mask && inet_ifa_match(rt->rt_src, ifa))
+ break;
+ }
+ if (!ifa && net_ratelimit()) {
+ char b1[16], b2[16];
+ printk(KERN_INFO "Wrong address mask %s from %s/%s\n",
+ in_ntoa2(mask, b1), in_ntoa2(rt->rt_src, b2), dev->name);
+ }
}
- if (net_ratelimit())
- printk(KERN_INFO "Wrong address mask %08lX from %08lX/%s\n",
- ntohl(mask), ntohl(rt->rt_src), dev->name);
+ read_unlock(&in_dev->lock);
+ in_dev_put(in_dev);
}
static void icmp_discard(struct icmphdr *icmph, struct sk_buff *skb, int len)
{
}
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
-/*
- * Check incoming icmp packets not addressed locally, to check whether
- * they relate to a (proxying) socket on our system.
- * Needed for transparent proxying.
- *
- * This code is presently ugly and needs cleanup.
- * Probably should add a chkaddr entry to ipprot to call a chk routine
- * in udp.c or tcp.c...
- */
-
-/* This should work with the new hashes now. -DaveM */
-extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif);
-extern struct sock *udp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif);
-
-int icmp_chkaddr(struct sk_buff *skb)
-{
- struct icmphdr *icmph=(struct icmphdr *)(skb->nh.raw + skb->nh.iph->ihl*4);
- struct iphdr *iph = (struct iphdr *) (icmph + 1);
- void (*handler)(struct icmphdr *icmph, struct sk_buff *skb, int len) = icmp_pointers[icmph->type].handler;
-
- if (handler == icmp_unreach || handler == icmp_redirect) {
- struct sock *sk;
-
- switch (iph->protocol) {
- case IPPROTO_TCP:
- {
- struct tcphdr *th = (struct tcphdr *)(((unsigned char *)iph)+(iph->ihl<<2));
-
- sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex);
- if (!sk || (sk->state == TCP_LISTEN))
- return 0;
- /*
- * This packet came from us.
- */
- return 1;
- }
- case IPPROTO_UDP:
- {
- struct udphdr *uh = (struct udphdr *)(((unsigned char *)iph)+(iph->ihl<<2));
-
- sk = udp_v4_lookup(iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex);
- if (!sk) return 0;
- if (sk->saddr != iph->saddr && inet_addr_type(iph->saddr) != RTN_LOCAL)
- return 0;
- /*
- * This packet may have come from us.
- * Assume it did.
- */
- return 1;
- }
- }
- }
- return 0;
-}
-
-#endif
-
/*
* Deal with incoming ICMP packets.
*/
@@ -1133,7 +1129,7 @@ static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1] = {
{ &icmp_statistics.IcmpOutAddrMaskReps, &icmp_statistics.IcmpInAddrMaskReps, icmp_address_reply, 0, }
};
-__initfunc(void icmp_init(struct net_proto_family *ops))
+void __init icmp_init(struct net_proto_family *ops)
{
int err;
@@ -1151,7 +1147,7 @@ __initfunc(void icmp_init(struct net_proto_family *ops))
if ((err=ops->create(icmp_socket, IPPROTO_ICMP))<0)
panic("Failed to create the ICMP control socket.\n");
icmp_socket->sk->allocation=GFP_ATOMIC;
- icmp_socket->sk->ip_ttl = MAXTTL;
+ icmp_socket->sk->protinfo.af_inet.ttl = MAXTTL;
/* Unhash it so that IP input processing does not even
* see it, we do not wish this socket to see incoming
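
The ICMP transmit path is now serialized by the control socket's spinlock
with a holder check: icmp_xmit_lock_bh() trylocks first and, if the lock is
already held by this CPU (an ICMP error raised while transmitting ICMP),
returns -EAGAIN instead of deadlocking. Callers treat a failed lock as "drop
silently", as in the hunks above:

	if (icmp_xmit_lock_bh())	/* re-entered on this CPU: give up */
		return;
	/* ... build and send the ICMP packet ... */
	icmp_xmit_unlock_bh();

	/* From process context, the _bh-less wrappers bracket the same
	 * lock with local_bh_disable()/local_bh_enable(): */
	if (icmp_xmit_lock())
		return;
	/* ... icmp_send() body ... */
	icmp_xmit_unlock();
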
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 61c530418..0ff9ffeda 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -8,7 +8,7 @@
* the older version didn't come out right using gcc 2.5.8, the newer one
* seems to fall out with gcc 2.6.2.
*
- * Version: $Id: igmp.c,v 1.32 1999/06/09 10:10:53 davem Exp $
+ * Version: $Id: igmp.c,v 1.34 1999/08/20 11:05:12 davem Exp $
*
* Authors:
* Alan Cox <Alan.Cox@linux.org>
@@ -97,19 +97,12 @@
#include <linux/mroute.h>
#endif
-/* Big mc list lock for all the devices */
-static rwlock_t ip_mc_lock = RW_LOCK_UNLOCKED;
-/* Big mc list semaphore for all the sockets.
- We do not refer to this list in IP data paths or from BH,
- so that semaphore is OK.
- */
-DECLARE_MUTEX(ip_sk_mc_sem);
-
#define IP_MAX_MEMBERSHIPS 20
#ifdef CONFIG_IP_MULTICAST
+
/* Parameter names and values are taken from igmp-v2-06 draft */
#define IGMP_V1_Router_Present_Timeout (400*HZ)
@@ -129,36 +122,72 @@ DECLARE_MUTEX(ip_sk_mc_sem);
#define IGMP_V1_SEEN(in_dev) ((in_dev)->mr_v1_seen && (long)(jiffies - (in_dev)->mr_v1_seen) < 0)
+#endif
+
+static void ip_ma_put(struct ip_mc_list *im)
+{
+ if (atomic_dec_and_test(&im->refcnt)) {
+ in_dev_put(im->interface);
+ kfree_s(im, sizeof(*im));
+ }
+}
+
+#ifdef CONFIG_IP_MULTICAST
+
/*
* Timer management
*/
static __inline__ void igmp_stop_timer(struct ip_mc_list *im)
{
- if (im->tm_running) {
- del_timer(&im->timer);
- im->tm_running=0;
- }
+ spin_lock_bh(&im->lock);
+ if (del_timer(&im->timer))
+ atomic_dec(&im->refcnt);
+ im->tm_running=0;
+ im->reporter = 0;
+ im->unsolicit_count = 0;
+ spin_unlock_bh(&im->lock);
}
static __inline__ void igmp_start_timer(struct ip_mc_list *im, int max_delay)
{
- int tv;
- if (im->tm_running)
- return;
- tv=net_random() % max_delay;
+ int tv=net_random() % max_delay;
+
+ spin_lock_bh(&im->lock);
+ if (!del_timer(&im->timer))
+ atomic_inc(&im->refcnt);
im->timer.expires=jiffies+tv+2;
im->tm_running=1;
add_timer(&im->timer);
+ spin_unlock_bh(&im->lock);
+}
+
+static void igmp_mod_timer(struct ip_mc_list *im, int max_delay)
+{
+ spin_lock_bh(&im->lock);
+ im->unsolicit_count = 0;
+ if (del_timer(&im->timer)) {
+ if ((long)(im->timer.expires-jiffies) < max_delay) {
+ add_timer(&im->timer);
+ im->tm_running=1;
+ spin_unlock_bh(&im->lock);
+ return;
+ }
+ atomic_dec(&im->refcnt);
+ }
+ spin_unlock_bh(&im->lock);
+
+ igmp_start_timer(im, max_delay);
}
+
/*
* Send an IGMP report.
*/
#define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4)
-static int igmp_send_report(struct device *dev, u32 group, int type)
+static int igmp_send_report(struct net_device *dev, u32 group, int type)
{
struct sk_buff *skb;
struct iphdr *iph;
@@ -225,8 +254,6 @@ static void igmp_timer_expire(unsigned long data)
struct in_device *in_dev = im->interface;
int err;
- read_lock(&ip_mc_lock);
-
im->tm_running=0;
if (IGMP_V1_SEEN(in_dev))
@@ -236,7 +263,9 @@ static void igmp_timer_expire(unsigned long data)
/* Failed. Retry later. */
if (err) {
- igmp_start_timer(im, IGMP_Unsolicited_Report_Interval);
+ if (!in_dev->dead)
+ igmp_start_timer(im, IGMP_Unsolicited_Report_Interval);
+ ip_ma_put(im);
return;
}
@@ -245,7 +274,7 @@ static void igmp_timer_expire(unsigned long data)
igmp_start_timer(im, IGMP_Unsolicited_Report_Interval);
}
im->reporter = 1;
- read_unlock(&ip_mc_lock);
+ ip_ma_put(im);
}
static void igmp_heard_report(struct in_device *in_dev, u32 group)
@@ -254,19 +283,17 @@ static void igmp_heard_report(struct in_device *in_dev, u32 group)
/* Timers are only set for non-local groups */
- if (LOCAL_MCAST(group))
+ if (group == IGMP_ALL_HOSTS)
return;
- read_lock(&ip_mc_lock);
+ read_lock(&in_dev->lock);
for (im=in_dev->mc_list; im!=NULL; im=im->next) {
if (im->multiaddr == group) {
igmp_stop_timer(im);
- im->reporter = 0;
- im->unsolicit_count = 0;
break;
}
}
- read_unlock(&ip_mc_lock);
+ read_unlock(&in_dev->lock);
}
static void igmp_heard_query(struct in_device *in_dev, unsigned char max_resp_time,
@@ -284,7 +311,7 @@ static void igmp_heard_query(struct in_device *in_dev, unsigned char max_resp_ti
in_dev->mr_v1_seen = jiffies + IGMP_V1_Router_Present_Timeout;
group = 0;
}
-
+
/*
* - Start the timers in all of our membership records
* that the query applies to for the interface on
@@ -295,28 +322,30 @@ static void igmp_heard_query(struct in_device *in_dev, unsigned char max_resp_ti
* - Use the igmp->igmp_code field as the maximum
* delay possible
*/
- read_lock(&ip_mc_lock);
+ read_lock(&in_dev->lock);
for (im=in_dev->mc_list; im!=NULL; im=im->next) {
if (group && group != im->multiaddr)
continue;
- if (LOCAL_MCAST(im->multiaddr))
+ if (im->multiaddr == IGMP_ALL_HOSTS)
continue;
- im->unsolicit_count = 0;
- if (im->tm_running && (long)(im->timer.expires-jiffies) > max_delay)
- igmp_stop_timer(im);
- igmp_start_timer(im, max_delay);
+ igmp_mod_timer(im, max_delay);
}
- read_unlock(&ip_mc_lock);
+ read_unlock(&in_dev->lock);
}
int igmp_rcv(struct sk_buff *skb, unsigned short len)
{
/* This basically follows the spec line by line -- see RFC1112 */
struct igmphdr *ih = skb->h.igmph;
- struct in_device *in_dev = skb->dev->ip_ptr;
+ struct in_device *in_dev = in_dev_get(skb->dev);
+
+ if (in_dev==NULL) {
+ kfree_skb(skb);
+ return 0;
+ }
- if (len < sizeof(struct igmphdr) || ip_compute_csum((void *)ih, len)
- || in_dev==NULL) {
+ if (len < sizeof(struct igmphdr) || ip_compute_csum((void *)ih, len)) {
+ in_dev_put(in_dev);
kfree_skb(skb);
return 0;
}
@@ -334,6 +363,7 @@ int igmp_rcv(struct sk_buff *skb, unsigned short len)
break;
case IGMP_PIM:
#ifdef CONFIG_IP_PIMSM_V1
+ in_dev_put(in_dev);
return pim_rcv_v1(skb, len);
#endif
case IGMP_DVMRP:
@@ -345,6 +375,7 @@ int igmp_rcv(struct sk_buff *skb, unsigned short len)
default:
NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, which we do not know about\n", ih->type));
}
+ in_dev_put(in_dev);
kfree_skb(skb);
return 0;
}
@@ -359,7 +390,7 @@ int igmp_rcv(struct sk_buff *skb, unsigned short len)
static void ip_mc_filter_add(struct in_device *in_dev, u32 addr)
{
char buf[MAX_ADDR_LEN];
- struct device *dev = in_dev->dev;
+ struct net_device *dev = in_dev->dev;
/* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG.
We will get multicast token leakage, when IFF_MULTICAST
@@ -367,7 +398,7 @@ static void ip_mc_filter_add(struct in_device *in_dev, u32 addr)
routine. Something sort of:
if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; }
--ANK
- */
+ */
if (arp_mc_map(addr, buf, dev, 0) == 0)
dev_mc_add(dev,buf,dev->addr_len,0);
}
@@ -379,7 +410,7 @@ static void ip_mc_filter_add(struct in_device *in_dev, u32 addr)
static void ip_mc_filter_del(struct in_device *in_dev, u32 addr)
{
char buf[MAX_ADDR_LEN];
- struct device *dev = in_dev->dev;
+ struct net_device *dev = in_dev->dev;
if (arp_mc_map(addr, buf, dev, 0) == 0)
dev_mc_delete(dev,buf,dev->addr_len,0);
@@ -387,18 +418,23 @@ static void ip_mc_filter_del(struct in_device *in_dev, u32 addr)
static void igmp_group_dropped(struct ip_mc_list *im)
{
+#ifdef CONFIG_IP_MULTICAST
+ int reporter;
+#endif
+
if (im->loaded) {
im->loaded = 0;
ip_mc_filter_del(im->interface, im->multiaddr);
}
#ifdef CONFIG_IP_MULTICAST
- if (LOCAL_MCAST(im->multiaddr))
+ if (im->multiaddr == IGMP_ALL_HOSTS)
return;
+ reporter = im->reporter;
igmp_stop_timer(im);
- if (im->reporter && !IGMP_V1_SEEN(im->interface))
+ if (reporter && !IGMP_V1_SEEN(im->interface))
igmp_send_report(im->interface->dev, im->multiaddr, IGMP_HOST_LEAVE_MESSAGE);
#endif
}
@@ -411,7 +447,7 @@ static void igmp_group_added(struct ip_mc_list *im)
}
#ifdef CONFIG_IP_MULTICAST
- if (LOCAL_MCAST(im->multiaddr))
+ if (im->multiaddr == IGMP_ALL_HOSTS)
return;
igmp_start_timer(im, IGMP_Initial_Report_Delay);
@@ -430,24 +466,27 @@ static void igmp_group_added(struct ip_mc_list *im)
void ip_mc_inc_group(struct in_device *in_dev, u32 addr)
{
- struct ip_mc_list *i, *im;
+ struct ip_mc_list *im;
- im = (struct ip_mc_list *)kmalloc(sizeof(*im), GFP_KERNEL);
+ ASSERT_RTNL();
- write_lock_bh(&ip_mc_lock);
- for (i=in_dev->mc_list; i; i=i->next) {
- if (i->multiaddr == addr) {
- i->users++;
- if (im)
- kfree(im);
+ for (im=in_dev->mc_list; im; im=im->next) {
+ if (im->multiaddr == addr) {
+ im->users++;
goto out;
}
}
+
+ im = (struct ip_mc_list *)kmalloc(sizeof(*im), GFP_KERNEL);
if (!im)
goto out;
+
im->users=1;
im->interface=in_dev;
+ in_dev_hold(in_dev);
im->multiaddr=addr;
+ atomic_set(&im->refcnt, 1);
+ spin_lock_init(&im->lock);
#ifdef CONFIG_IP_MULTICAST
im->tm_running=0;
init_timer(&im->timer);
@@ -457,15 +496,14 @@ void ip_mc_inc_group(struct in_device *in_dev, u32 addr)
im->reporter = 0;
im->loaded = 0;
#endif
+ write_lock_bh(&in_dev->lock);
im->next=in_dev->mc_list;
in_dev->mc_list=im;
+ write_unlock_bh(&in_dev->lock);
igmp_group_added(im);
- write_unlock_bh(&ip_mc_lock);
if (in_dev->dev->flags & IFF_UP)
ip_rt_multicast_event(in_dev);
- return;
out:
- write_unlock_bh(&ip_mc_lock);
return;
}
@@ -477,25 +515,27 @@ int ip_mc_dec_group(struct in_device *in_dev, u32 addr)
{
int err = -ESRCH;
struct ip_mc_list *i, **ip;
-
- write_lock_bh(&ip_mc_lock);
+
+ ASSERT_RTNL();
+
for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) {
if (i->multiaddr==addr) {
if (--i->users == 0) {
+ write_lock_bh(&in_dev->lock);
*ip = i->next;
+ write_unlock_bh(&in_dev->lock);
igmp_group_dropped(i);
- write_unlock_bh(&ip_mc_lock);
if (in_dev->dev->flags & IFF_UP)
ip_rt_multicast_event(in_dev);
- kfree_s(i, sizeof(*i));
+
+ ip_ma_put(i);
return 0;
}
err = 0;
break;
}
}
- write_unlock_bh(&ip_mc_lock);
return -ESRCH;
}
@@ -505,10 +545,10 @@ void ip_mc_down(struct in_device *in_dev)
{
struct ip_mc_list *i;
- read_lock_bh(&ip_mc_lock);
+ ASSERT_RTNL();
+
for (i=in_dev->mc_list; i; i=i->next)
igmp_group_dropped(i);
- read_unlock_bh(&ip_mc_lock);
ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);
}
@@ -519,12 +559,12 @@ void ip_mc_up(struct in_device *in_dev)
{
struct ip_mc_list *i;
+ ASSERT_RTNL();
+
ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
- read_lock_bh(&ip_mc_lock);
for (i=in_dev->mc_list; i; i=i->next)
igmp_group_added(i);
- read_unlock_bh(&ip_mc_lock);
}
/*
@@ -535,24 +575,32 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
{
struct ip_mc_list *i;
- write_lock_bh(&ip_mc_lock);
+ ASSERT_RTNL();
+
+ write_lock_bh(&in_dev->lock);
while ((i = in_dev->mc_list) != NULL) {
in_dev->mc_list = i->next;
+ write_unlock_bh(&in_dev->lock);
+
igmp_group_dropped(i);
- kfree_s(i, sizeof(*i));
+ ip_ma_put(i);
+
+ write_lock_bh(&in_dev->lock);
}
- write_unlock_bh(&ip_mc_lock);
+ write_unlock_bh(&in_dev->lock);
}
static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr)
{
struct rtable *rt;
- struct device *dev = NULL;
+ struct net_device *dev = NULL;
+ struct in_device *idev = NULL;
if (imr->imr_address.s_addr) {
dev = ip_dev_find(imr->imr_address.s_addr);
if (!dev)
return NULL;
+ __dev_put(dev);
}
if (!dev && !ip_route_output(&rt, imr->imr_multiaddr.s_addr, 0, 0, 0)) {
@@ -561,9 +609,9 @@ static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr)
}
if (dev) {
imr->imr_ifindex = dev->ifindex;
- return dev->ip_ptr;
+ idev = __in_dev_get(dev);
}
- return NULL;
+ return idev;
}
/*
@@ -586,8 +634,11 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
if (!imr->imr_ifindex)
in_dev = ip_mc_find_dev(imr);
- else
+ else {
in_dev = inetdev_by_index(imr->imr_ifindex);
+ if (in_dev)
+ __in_dev_put(in_dev);
+ }
if (!in_dev) {
iml = NULL;
@@ -598,31 +649,28 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
err = -EADDRINUSE;
- down(&ip_sk_mc_sem);
- for (i=sk->ip_mc_list; i; i=i->next) {
+ for (i=sk->protinfo.af_inet.mc_list; i; i=i->next) {
if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) {
/* New style additions are reference counted */
if (imr->imr_address.s_addr == 0) {
i->count++;
err = 0;
}
- goto done_unlock;
+ goto done;
}
count++;
}
err = -ENOBUFS;
if (iml == NULL || count >= sysctl_igmp_max_memberships)
- goto done_unlock;
+ goto done;
memcpy(&iml->multi, imr, sizeof(*imr));
- iml->next = sk->ip_mc_list;
+ iml->next = sk->protinfo.af_inet.mc_list;
iml->count = 1;
- sk->ip_mc_list = iml;
+ sk->protinfo.af_inet.mc_list = iml;
ip_mc_inc_group(in_dev, addr);
iml = NULL;
err = 0;
-done_unlock:
- up(&ip_sk_mc_sem);
done:
rtnl_shunlock();
if (iml)
@@ -638,26 +686,30 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
{
struct ip_mc_socklist *iml, **imlp;
- down(&ip_sk_mc_sem);
- for (imlp=&sk->ip_mc_list; (iml=*imlp)!=NULL; imlp=&iml->next) {
+ rtnl_lock();
+ for (imlp=&sk->protinfo.af_inet.mc_list; (iml=*imlp)!=NULL; imlp=&iml->next) {
if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr &&
iml->multi.imr_address.s_addr==imr->imr_address.s_addr &&
(!imr->imr_ifindex || iml->multi.imr_ifindex==imr->imr_ifindex)) {
struct in_device *in_dev;
- if (--iml->count)
+ if (--iml->count) {
+ rtnl_unlock();
return 0;
+ }
*imlp = iml->next;
- up(&ip_sk_mc_sem);
in_dev = inetdev_by_index(iml->multi.imr_ifindex);
- if (in_dev)
+ if (in_dev) {
ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr);
+ in_dev_put(in_dev);
+ }
+ rtnl_unlock();
sock_kfree_s(sk, iml, sizeof(*iml));
return 0;
}
}
- up(&ip_sk_mc_sem);
+ rtnl_unlock();
return -EADDRNOTAVAIL;
}
@@ -669,36 +721,36 @@ void ip_mc_drop_socket(struct sock *sk)
{
struct ip_mc_socklist *iml;
- down(&ip_sk_mc_sem);
- while ((iml=sk->ip_mc_list) != NULL) {
+ if (sk->protinfo.af_inet.mc_list == NULL)
+ return;
+
+ rtnl_lock();
+ while ((iml=sk->protinfo.af_inet.mc_list) != NULL) {
struct in_device *in_dev;
- sk->ip_mc_list = iml->next;
- up(&ip_sk_mc_sem);
+ sk->protinfo.af_inet.mc_list = iml->next;
- if ((in_dev = inetdev_by_index(iml->multi.imr_ifindex)) != NULL)
+ if ((in_dev = inetdev_by_index(iml->multi.imr_ifindex)) != NULL) {
ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
+ in_dev_put(in_dev);
+ }
sock_kfree_s(sk, iml, sizeof(*iml));
- down(&ip_sk_mc_sem);
}
- up(&ip_sk_mc_sem);
+ rtnl_unlock();
}
-int ip_check_mc(struct device *dev, u32 mc_addr)
+int ip_check_mc(struct in_device *in_dev, u32 mc_addr)
{
- struct in_device *in_dev = dev->ip_ptr;
struct ip_mc_list *im;
- if (in_dev) {
- read_lock(&ip_mc_lock);
- for (im=in_dev->mc_list; im; im=im->next) {
- if (im->multiaddr == mc_addr) {
- read_unlock(&ip_mc_lock);
- return 1;
- }
+ read_lock(&in_dev->lock);
+ for (im=in_dev->mc_list; im; im=im->next) {
+ if (im->multiaddr == mc_addr) {
+ read_unlock(&in_dev->lock);
+ return 1;
}
- read_unlock(&ip_mc_lock);
}
+ read_unlock(&in_dev->lock);
return 0;
}
@@ -710,15 +762,15 @@ int ip_mc_procinfo(char *buffer, char **start, off_t offset, int length, int dum
off_t pos=0, begin=0;
struct ip_mc_list *im;
int len=0;
- struct device *dev;
+ struct net_device *dev;
len=sprintf(buffer,"Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n");
read_lock(&dev_base_lock);
for(dev = dev_base; dev; dev = dev->next) {
- struct in_device *in_dev = dev->ip_ptr;
+ struct in_device *in_dev = in_dev_get(dev);
char *querier = "NONE";
-
+
if (in_dev == NULL)
continue;
@@ -727,7 +779,7 @@ int ip_mc_procinfo(char *buffer, char **start, off_t offset, int length, int dum
len+=sprintf(buffer+len,"%d\t%-10s: %5d %7s\n",
dev->ifindex, dev->name, dev->mc_count, querier);
- read_lock(&ip_mc_lock);
+ read_lock(&in_dev->lock);
for (im = in_dev->mc_list; im; im = im->next) {
len+=sprintf(buffer+len,
"\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n",
@@ -741,11 +793,13 @@ int ip_mc_procinfo(char *buffer, char **start, off_t offset, int length, int dum
begin=pos;
}
if(pos>offset+length) {
- read_unlock(&ip_mc_lock);
+ read_unlock(&in_dev->lock);
+ in_dev_put(in_dev);
goto done;
}
}
- read_unlock(&ip_mc_lock);
+ read_unlock(&in_dev->lock);
+ in_dev_put(in_dev);
}
done:
read_unlock(&dev_base_lock);
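
igmp.c replaces the global ip_mc_lock with the per-device in_dev->lock,
serializes socket membership lists under the RTNL instead of ip_sk_mc_sem,
and makes each ip_mc_list entry refcounted, with the pending timer itself
owning a reference: arming takes one if the timer was not already pending, a
successful del_timer() returns one, and the expiry handler finishes with
ip_ma_put(). Condensed from the hunks above:

	spin_lock_bh(&im->lock);
	if (!del_timer(&im->timer))	/* not pending: timer gains a ref */
		atomic_inc(&im->refcnt);
	im->timer.expires = jiffies + tv + 2;
	im->tm_running = 1;
	add_timer(&im->timer);
	spin_unlock_bh(&im->lock);

	/* igmp_timer_expire() ends with ip_ma_put(im); ip_ma_put() frees
	 * the entry and releases its in_device once the count hits zero. */
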
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 08ebbc2f1..b4afbc85a 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -5,7 +5,7 @@
*
* The IP forwarding functionality.
*
- * Version: $Id: ip_forward.c,v 1.43 1999/03/21 05:22:37 davem Exp $
+ * Version: $Id: ip_forward.c,v 1.45 1999/08/20 11:05:16 davem Exp $
*
* Authors: see ip.c
*
@@ -36,48 +36,49 @@
#include <net/icmp.h>
#include <linux/tcp.h>
#include <linux/udp.h>
-#include <linux/firewall.h>
-#include <linux/ip_fw.h>
-#ifdef CONFIG_IP_MASQUERADE
-#include <net/ip_masq.h>
-#endif
+#include <linux/netfilter_ipv4.h>
#include <net/checksum.h>
#include <linux/route.h>
#include <net/route.h>
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
-/*
- * Check the packet against our socket administration to see
- * if it is related to a connection on our system.
- * Needed for transparent proxying.
- */
-
-int ip_chksock(struct sk_buff *skb)
+static inline int ip_forward_finish(struct sk_buff *skb)
{
- switch (skb->nh.iph->protocol) {
- case IPPROTO_ICMP:
- return icmp_chkaddr(skb);
- case IPPROTO_TCP:
- return tcp_chkaddr(skb);
- case IPPROTO_UDP:
- return udp_chkaddr(skb);
- default:
+ struct ip_options * opt = &(IPCB(skb)->opt);
+
+ ip_statistics.IpForwDatagrams++;
+
+ if (opt->optlen == 0) {
+#ifdef CONFIG_NET_FASTROUTE
+ struct rtable *rt = (struct rtable*)skb->dst;
+
+ if (rt->rt_flags&RTCF_FAST && !netdev_fastroute_obstacles) {
+ struct dst_entry *old_dst;
+ unsigned h = ((*(u8*)&rt->key.dst)^(*(u8*)&rt->key.src))&NETDEV_FASTROUTE_HMASK;
+
+ write_lock_irq(&skb->dev->fastpath_lock);
+ old_dst = skb->dev->fastpath[h];
+ skb->dev->fastpath[h] = dst_clone(&rt->u.dst);
+ write_unlock_irq(&skb->dev->fastpath_lock);
+
+ dst_release(old_dst);
+ }
+#endif
+ ip_send(skb);
return 0;
}
-}
-#endif
+ ip_forward_options(skb);
+ ip_send(skb);
+ return 0;
+}
int ip_forward(struct sk_buff *skb)
{
- struct device *dev2; /* Output device */
+ struct net_device *dev2; /* Output device */
struct iphdr *iph; /* Our header */
struct rtable *rt; /* Route we use */
struct ip_options * opt = &(IPCB(skb)->opt);
unsigned short mtu;
-#if defined(CONFIG_FIREWALL) || defined(CONFIG_IP_MASQUERADE)
- int fw_res = 0;
-#endif
if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
return 0;
@@ -94,20 +95,6 @@ int ip_forward(struct sk_buff *skb)
iph = skb->nh.iph;
rt = (struct rtable*)skb->dst;
-#ifdef CONFIG_CPU_IS_SLOW
- if (net_cpu_congestion > 1 && !(iph->tos&IPTOS_RELIABILITY) &&
- IPTOS_PREC(iph->tos) < IPTOS_PREC_INTERNETCONTROL) {
- if (((xtime.tv_usec&0xF)<<net_cpu_congestion) > 0x1C)
- goto drop;
- }
-#endif
-
-
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- if (ip_chksock(skb))
- goto local_pkt;
-#endif
-
if (iph->ttl <= 1)
goto too_many_hops;
@@ -123,10 +110,6 @@ int ip_forward(struct sk_buff *skb)
dev2 = rt->u.dst.dev;
mtu = rt->u.dst.pmtu;
-#ifdef CONFIG_NET_SECURITY
- call_fw_firewall(PF_SECURITY, dev2, NULL, &mtu, NULL);
-#endif
-
/*
* We now generate an ICMP HOST REDIRECT giving the route
* we calculated.
@@ -160,121 +143,8 @@ int ip_forward(struct sk_buff *skb)
}
#endif
-#ifdef CONFIG_IP_MASQUERADE
- if(!(IPCB(skb)->flags&IPSKB_MASQUERADED)) {
- /*
- * Check that any ICMP packets are not for a
- * masqueraded connection. If so rewrite them
- * and skip the firewall checks
- */
- if (iph->protocol == IPPROTO_ICMP) {
- __u32 maddr;
-#ifdef CONFIG_IP_MASQUERADE_ICMP
- struct icmphdr *icmph = (struct icmphdr *)((char*)iph + (iph->ihl << 2));
- if ((icmph->type==ICMP_DEST_UNREACH)||
- (icmph->type==ICMP_SOURCE_QUENCH)||
- (icmph->type==ICMP_TIME_EXCEEDED))
- {
-#endif
- maddr = inet_select_addr(dev2, rt->rt_gateway, RT_SCOPE_UNIVERSE);
- fw_res = ip_fw_masq_icmp(&skb, maddr);
- if (fw_res < 0) {
- kfree_skb(skb);
- return -1;
- }
-
- if (fw_res)
- /* ICMP matched - skip firewall */
- goto skip_call_fw_firewall;
-#ifdef CONFIG_IP_MASQUERADE_ICMP
- }
-#endif
- }
- if (rt->rt_flags&RTCF_MASQ)
- goto skip_call_fw_firewall;
-#endif /* CONFIG_IP_MASQUERADE */
-
-#ifdef CONFIG_FIREWALL
- fw_res=call_fw_firewall(PF_INET, dev2, iph, NULL, &skb);
- switch (fw_res) {
- case FW_ACCEPT:
- case FW_MASQUERADE:
- break;
- case FW_REJECT:
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
- /* fall thru */
- default:
- kfree_skb(skb);
- return -1;
- }
-#endif
-
-#ifdef CONFIG_IP_MASQUERADE
- }
-
-skip_call_fw_firewall:
- /*
- * If this fragment needs masquerading, make it so...
- * (Don't masquerade de-masqueraded fragments)
- */
- if (!(IPCB(skb)->flags&IPSKB_MASQUERADED) &&
- (fw_res==FW_MASQUERADE || rt->rt_flags&RTCF_MASQ)) {
- u32 maddr;
-
-#ifdef CONFIG_IP_ROUTE_NAT
- maddr = (rt->rt_flags&RTCF_MASQ) ? rt->rt_src_map : 0;
-
- if (maddr == 0)
-#endif
- maddr = inet_select_addr(dev2, rt->rt_gateway, RT_SCOPE_UNIVERSE);
-
- if (ip_fw_masquerade(&skb, maddr) < 0) {
- kfree_skb(skb);
- return -1;
- } else {
- /*
- * Masquerader may have changed skb
- */
- iph = skb->nh.iph;
- opt = &(IPCB(skb)->opt);
- }
- }
-#endif
-
-
-#ifdef CONFIG_FIREWALL
- if ((fw_res = call_out_firewall(PF_INET, dev2, iph, NULL,&skb)) < FW_ACCEPT) {
- /* FW_ACCEPT and FW_MASQUERADE are treated equal:
- masquerading is only supported via forward rules */
- if (fw_res == FW_REJECT)
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
- kfree_skb(skb);
- return -1;
- }
-#endif
-
- ip_statistics.IpForwDatagrams++;
-
- if (opt->optlen == 0) {
-#ifdef CONFIG_NET_FASTROUTE
- if (rt->rt_flags&RTCF_FAST && !netdev_fastroute_obstacles) {
- unsigned h = ((*(u8*)&rt->key.dst)^(*(u8*)&rt->key.src))&NETDEV_FASTROUTE_HMASK;
- /* Time to switch to functional programming :-) */
- dst_release_irqwait(xchg(&skb->dev->fastpath[h], dst_clone(&rt->u.dst)));
- }
-#endif
- ip_send(skb);
- return 0;
- }
-
- ip_forward_options(skb);
- ip_send(skb);
- return 0;
-
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
-local_pkt:
- return ip_local_deliver(skb);
-#endif
+ return NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev2,
+ ip_forward_finish);
frag_needed:
ip_statistics.IpFragFails++;
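
All of the inline firewall, masquerading, and transparent-proxy branches
deleted above collapse into a single netfilter traversal: ip_forward() ends
by handing the packet to NF_HOOK, and ip_forward_finish() runs only if the
NF_IP_FORWARD verdict lets the packet through. The general shape, with
hypothetical names standing in for anything not in this diff:

	/* Continuation: invoked only when the hook verdict is NF_ACCEPT. */
	static inline int example_finish(struct sk_buff *skb)
	{
		ip_send(skb);
		return 0;
	}

	/* Call site: protocol family, hook number, packet, in/out
	 * devices, and the continuation function. */
	return NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev2,
		       example_finish);
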
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b1bd8b1dc..4e6ffef06 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -5,7 +5,7 @@
*
* The IP fragmentation functionality.
*
- * Version: $Id: ip_fragment.c,v 1.42 1999/06/12 13:11:34 davem Exp $
+ * Version: $Id: ip_fragment.c,v 1.45 1999/08/30 10:17:10 davem Exp $
*
* Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
* Alan Cox <Alan.Cox@linux.org>
@@ -20,6 +20,7 @@
* John McDonald : 0 length frag bug.
*/
+#include <linux/config.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/sched.h>
@@ -33,8 +34,7 @@
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/inet.h>
-#include <linux/firewall.h>
-#include <linux/ip_fw.h>
+#include <linux/netfilter_ipv4.h>
/* Fragment cache limits. We will commit 256K at one time. Should we
* cross that limit we will prune down to 192K. This should cope with
@@ -66,7 +66,7 @@ struct ipq {
short ihlen; /* length of the IP header */
struct timer_list timer; /* when will this queue expire? */
struct ipq **pprev;
- struct device *dev; /* Device - for icmp replies */
+ struct net_device *dev; /* Device - for icmp replies */
};
#define IPQ_HASHSZ 64
@@ -387,6 +387,10 @@ static struct sk_buff *ip_glue(struct ipq *qp)
*/
skb->security = qp->fragments->skb->security;
+#ifdef CONFIG_NETFILTER_DEBUG
+ skb->nf_debug = qp->fragments->skb->nf_debug;
+#endif
+
/* Done with all fragments. Fixup the new IP header. */
iph = skb->nh.iph;
iph->frag_off = 0;
diff --git a/net/ipv4/ip_fw.c b/net/ipv4/ip_fw.c
deleted file mode 100644
index f3dbafc04..000000000
--- a/net/ipv4/ip_fw.c
+++ /dev/null
@@ -1,1731 +0,0 @@
-/*
- * This code is heavily based on the old ip_fw.c code; see below for
- * copyrights and attributions of the old code. This code is basically GPL.
- *
- * 15-Aug-1997: Major changes to allow graphs for firewall rules.
- * Paul Russell <Paul.Russell@rustcorp.com.au> and
- * Michael Neuling <Michael.Neuling@rustcorp.com.au>
- * 24-Aug-1997: Generalised protocol handling (not just TCP/UDP/ICMP).
- * Added explicit RETURN from chains.
- * Removed TOS mangling (done in ipchains 1.0.1).
- * Fixed read & reset bug by reworking proc handling.
- * Paul Russell <Paul.Russell@rustcorp.com.au>
- * 28-Sep-1997: Added packet marking for net sched code.
- * Removed fw_via comparisons: all done on device name now,
- * similar to changes in ip_fw.c in DaveM's CVS970924 tree.
- * Paul Russell <Paul.Russell@rustcorp.com.au>
- * 2-Nov-1997: Moved types across to __u16, etc.
- * Added inverse flags.
- * Fixed fragment bug (in args to port_match).
- * Changed mark to only one flag (MARKABS).
- * 21-Nov-1997: Added ability to test ICMP code.
- * 19-Jan-1998: Added wildcard interfaces.
- * 6-Feb-1998: Merged 2.0 and 2.1 versions.
- * Initialised ip_masq for 2.0.x version.
- * Added explicit NETLINK option for 2.1.x version.
- * Added packet and byte counters for policy matches.
- * 26-Feb-1998: Fixed race conditions, added SMP support.
- * 18-Mar-1998: Fix SMP, fix race condition fix.
- * 1-May-1998: Remove caching of device pointer.
- * 12-May-1998: Allow tiny fragment case for TCP/UDP.
- * 15-May-1998: Treat short packets as fragments, don't just block.
- * 3-Jan-1999: Fixed serious procfs security hole -- users should never
- * be allowed to view the chains!
- * Marc Santoro <ultima@snicker.emoti.com>
- * 29-Jan-1999: Locally generated bogus IPs dealt with, rather than crash
- * during dump_packet. --RR.
- */
-
-/*
- *
- * The original Linux port was done by Alan Cox, with changes/fixes from
- * Pauline Middlelink, Jos Vos, Thomas Quinot, Wouter Gadeyne, Juan
- * Jose Ciarlante, Bernd Eckenfels, Keith Owens and others.
- *
- * Copyright from the original FreeBSD version follows:
- *
- * Copyright (c) 1993 Daniel Boulet
- * Copyright (c) 1994 Ugen J.S.Antsilevich
- *
- * Redistribution and use in source forms, with and without modification,
- * are permitted provided that this entire comment appears intact.
- *
- * Redistribution in binary form may occur without any restrictions.
- * Obviously, it would be nice if you gave credit where credit is due
- * but requiring it would be too onerous.
- *
- * This software is provided ``AS IS'' without any warranties of any kind. */
-
-
-#include <linux/config.h>
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/types.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/netdevice.h>
-#include <linux/icmp.h>
-#include <linux/udp.h>
-#include <net/ip.h>
-#include <net/protocol.h>
-#include <net/route.h>
-#include <net/tcp.h>
-#include <net/udp.h>
-#include <net/sock.h>
-#include <net/icmp.h>
-#include <linux/netlink.h>
-#include <linux/init.h>
-#include <linux/firewall.h>
-#include <linux/ip_fw.h>
-
-#ifdef CONFIG_IP_MASQUERADE
-#include <net/ip_masq.h>
-#endif
-
-#include <net/checksum.h>
-#include <linux/proc_fs.h>
-#include <linux/stat.h>
-
-/* Understanding locking in this code: (thanks to Alan Cox for using
- * little words to explain this to me). -- PR
- *
- * In UP, there can be two packets traversing the chains:
- * 1) A packet from the current userspace context
- * 2) A packet off the bh handlers (timer or net).
- *
- * For SMP (kernel v2.1+), multiply this by # CPUs.
- *
- * [Note that this is not correct for 2.2 - because the socket code always
- * uses lock_kernel() to serialize, and bottom halves (timers and net_bhs)
- * only run on one CPU at a time. This will probably change for 2.3.
- * It is still good to use spinlocks because that avoids the global cli()
- * for updating the tables, which is rather costly in SMP kernels -AK]
- *
- * This means counters and backchains can get corrupted if no precautions
- * are taken.
- *
- * To actually alter a chain on UP, we need only do a cli(), as this will
- * stop a bh handler firing, as we are in the current userspace context
- * (coming from a setsockopt()).
- *
- * On SMP, we need a write_lock_irqsave(), which is a simple cli() in
- * UP.
- *
- * For backchains and counters, we use an array, indexed by
- * [cpu_number_map[smp_processor_id()]*2 + !in_interrupt()]; the array is of
- * size [smp_num_cpus*2]. For v2.0, smp_num_cpus is effectively 1. So,
- * confident of uniqueness, we modify counters even though we only
- * have a read lock (to read the counters, you need a write lock,
- * though). */
-
-/* Why I didn't use straight locking... -- PR
- *
- * The backchains can be separated out of the ip_chains structure, and
- * allocated as needed inside ip_fw_check().
- *
- * The counters, however, can't. Trying to lock these means blocking
- * interrupts every time we want to access them. This would suck HARD
- * performance-wise. Not locking them leads to possible corruption,
- * made worse on 32-bit machines (counters are 64-bit). */
-
-/*#define DEBUG_IP_FIREWALL*/
-/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */
-/*#define DEBUG_IP_FIREWALL_USER*/
-/*#define DEBUG_IP_FIREWALL_LOCKING*/
-
-#ifdef CONFIG_IP_FIREWALL_NETLINK
-static struct sock *ipfwsk;
-#endif
-
-#ifdef __SMP__
-#define SLOT_NUMBER() (cpu_number_map[smp_processor_id()]*2 + !in_interrupt())
-#else
-#define SLOT_NUMBER() (!in_interrupt())
-#endif
-#define NUM_SLOTS (smp_num_cpus*2)
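A minimal editorial sketch of how the slot scheme above is used (the helper names are invented, not functions from this file): each (CPU, interrupt-context) pair owns one slot, so increments need no lock, while an exact read sums every slot under the write lock, exactly as dump_rule() does further down. Assume pcnt points at a kmalloc'd array of NUM_SLOTS counters, like the per-rule counter trailers below.

static void bump_packet_count(__u64 *pcnt)
{
	/* Lock-free: SLOT_NUMBER() is unique to this CPU/context pair. */
	pcnt[SLOT_NUMBER()]++;
}

static __u64 total_packet_count(const __u64 *pcnt)
{
	/* Caller holds the write lock, so all slots are quiescent. */
	__u64 total = 0;
	unsigned int i;

	for (i = 0; i < NUM_SLOTS; i++)
		total += pcnt[i];
	return total;
}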
-
-#define SIZEOF_STRUCT_IP_CHAIN (sizeof(struct ip_chain) \
- + NUM_SLOTS*sizeof(struct ip_reent))
-#define SIZEOF_STRUCT_IP_FW_KERNEL (sizeof(struct ip_fwkernel) \
- + NUM_SLOTS*sizeof(struct ip_counters))
-
-#ifdef DEBUG_IP_FIREWALL_LOCKING
-static unsigned int fwc_rlocks, fwc_wlocks;
-#define FWC_DEBUG_LOCK(d) \
-do { \
- FWC_DONT_HAVE_LOCK(d); \
- d |= (1 << SLOT_NUMBER()); \
-} while (0)
-
-#define FWC_DEBUG_UNLOCK(d) \
-do { \
- FWC_HAVE_LOCK(d); \
- d &= ~(1 << SLOT_NUMBER()); \
-} while (0)
-
-#define FWC_DONT_HAVE_LOCK(d) \
-do { \
- if ((d) & (1 << SLOT_NUMBER())) \
- printk("%s:%i: Got lock on %i already!\n", \
- __FILE__, __LINE__, SLOT_NUMBER()); \
-} while(0)
-
-#define FWC_HAVE_LOCK(d) \
-do { \
- if (!((d) & (1 << SLOT_NUMBER()))) \
- printk("%s:%i:No lock on %i!\n", \
- __FILE__, __LINE__, SLOT_NUMBER()); \
-} while (0)
-
-#else
-#define FWC_DEBUG_LOCK(d) do { } while(0)
-#define FWC_DEBUG_UNLOCK(d) do { } while(0)
-#define FWC_DONT_HAVE_LOCK(d) do { } while(0)
-#define FWC_HAVE_LOCK(d) do { } while(0)
-#endif /*DEBUG_IP_FIREWALL_LOCKING*/
-
-#define FWC_READ_LOCK(l) do { FWC_DEBUG_LOCK(fwc_rlocks); read_lock(l); } while (0)
-#define FWC_WRITE_LOCK(l) do { FWC_DEBUG_LOCK(fwc_wlocks); write_lock(l); } while (0)
-#define FWC_READ_LOCK_IRQ(l,f) do { FWC_DEBUG_LOCK(fwc_rlocks); read_lock_irqsave(l,f); } while (0)
-#define FWC_WRITE_LOCK_IRQ(l,f) do { FWC_DEBUG_LOCK(fwc_wlocks); write_lock_irqsave(l,f); } while (0)
-#define FWC_READ_UNLOCK(l) do { FWC_DEBUG_UNLOCK(fwc_rlocks); read_unlock(l); } while (0)
-#define FWC_WRITE_UNLOCK(l) do { FWC_DEBUG_UNLOCK(fwc_wlocks); write_unlock(l); } while (0)
-#define FWC_READ_UNLOCK_IRQ(l,f) do { FWC_DEBUG_UNLOCK(fwc_rlocks); read_unlock_irqrestore(l,f); } while (0)
-#define FWC_WRITE_UNLOCK_IRQ(l,f) do { FWC_DEBUG_UNLOCK(fwc_wlocks); write_unlock_irqrestore(l,f); } while (0)
-
-struct ip_chain;
-
-struct ip_counters
-{
- __u64 pcnt, bcnt; /* Packet and byte counters */
-};
-
-struct ip_fwkernel
-{
- struct ip_fw ipfw;
- struct ip_fwkernel *next; /* where to go next if current
- * rule doesn't match */
- struct ip_chain *branch; /* which branch to jump to if
- * current rule matches */
- int simplebranch; /* Use this if branch == NULL */
- struct ip_counters counters[0]; /* Actually several of these */
-};
-
-struct ip_reent
-{
- struct ip_chain *prevchain; /* Pointer to referencing chain */
- struct ip_fwkernel *prevrule; /* Pointer to referencing rule */
- struct ip_counters counters;
-};
-
-struct ip_chain
-{
- ip_chainlabel label; /* Defines the label for each block */
- struct ip_chain *next; /* Pointer to next block */
- struct ip_fwkernel *chain; /* Pointer to first rule in block */
- __u32 refcount; /* Number of references to block */
- int policy; /* Default rule for chain. Only *
- * used in built in chains */
- struct ip_reent reent[0]; /* Actually several of these */
-};
-
-/*
- * Implement IP packet firewall
- */
-
-#ifdef DEBUG_IP_FIREWALL
-#define dprintf(format, args...) printk(format , ## args)
-#else
-#define dprintf(format, args...)
-#endif
-
-#ifdef DEBUG_IP_FIREWALL_USER
-#define duprintf(format, args...) printk(format , ## args)
-#else
-#define duprintf(format, args...)
-#endif
-
-/* Lock around ip_fw_chains linked list structure */
-rwlock_t ip_fw_lock = RW_LOCK_UNLOCKED;
-
-/* Head of linked list of fw rules */
-static struct ip_chain *ip_fw_chains;
-
-#define IP_FW_INPUT_CHAIN ip_fw_chains
-#define IP_FW_FORWARD_CHAIN (ip_fw_chains->next)
-#define IP_FW_OUTPUT_CHAIN (ip_fw_chains->next->next)
-
-/* Returns 1 if the port is matched by the range, 0 otherwise */
-extern inline int port_match(__u16 min, __u16 max, __u16 port,
- int frag, int invert)
-{
- if (frag) /* Fragments fail ANY port test. */
- return (min == 0 && max == 0xFFFF);
- else return (port >= min && port <= max) ^ invert;
-}
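The fragment semantics here are easy to miss: a non-first fragment carries no port information, so it can only match a rule whose range is the full 0-0xFFFF wildcard, and the invert flag is ignored for it. A few illustrative calls (values hypothetical):

	port_match(0, 0xFFFF, 25, 0, 0);  /* 1: port 25 inside the wildcard     */
	port_match(20, 21, 25, 0, 1);     /* 1: 25 outside 20-21, test inverted */
	port_match(20, 21, 25, 1, 0);     /* 0: fragments fail any narrow range */
	port_match(0, 0xFFFF, 25, 1, 1);  /* 1: fragment, but range is wildcard */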
-
-/* Returns whether matches rule or not. */
-static int ip_rule_match(struct ip_fwkernel *f,
- const char *ifname,
- struct iphdr *ip,
- char tcpsyn,
- __u16 src_port, __u16 dst_port,
- char isfrag)
-{
-#define FWINV(bool,invflg) ((bool) ^ !!(f->ipfw.fw_invflg & invflg))
- /*
- * This is a bit simpler as we don't have to walk
- * an interface chain as you do in BSD - same logic
- * however.
- */
-
- if (FWINV((ip->saddr&f->ipfw.fw_smsk.s_addr) != f->ipfw.fw_src.s_addr,
- IP_FW_INV_SRCIP)
- || FWINV((ip->daddr&f->ipfw.fw_dmsk.s_addr)!=f->ipfw.fw_dst.s_addr,
- IP_FW_INV_DSTIP)) {
- dprintf("Source or dest mismatch.\n");
-
- dprintf("SRC: %u. Mask: %u. Target: %u.%s\n", ip->saddr,
- f->ipfw.fw_smsk.s_addr, f->ipfw.fw_src.s_addr,
- f->ipfw.fw_invflg & IP_FW_INV_SRCIP ? " (INV)" : "");
- dprintf("DST: %u. Mask: %u. Target: %u.%s\n", ip->daddr,
- f->ipfw.fw_dmsk.s_addr, f->ipfw.fw_dst.s_addr,
- f->ipfw.fw_invflg & IP_FW_INV_DSTIP ? " (INV)" : "");
- return 0;
- }
-
- /*
- * Look for a VIA device match
- */
- if (f->ipfw.fw_flg & IP_FW_F_WILDIF) {
- if (FWINV(strncmp(ifname, f->ipfw.fw_vianame,
- strlen(f->ipfw.fw_vianame)) != 0,
- IP_FW_INV_VIA)) {
- dprintf("Wildcard interface mismatch.%s\n",
- f->ipfw.fw_invflg & IP_FW_INV_VIA ? " (INV)" : "");
- return 0; /* Mismatch */
- }
- }
- else if (FWINV(strcmp(ifname, f->ipfw.fw_vianame) != 0,
- IP_FW_INV_VIA)) {
- dprintf("Interface name does not match.%s\n",
- f->ipfw.fw_invflg & IP_FW_INV_VIA
- ? " (INV)" : "");
- return 0; /* Mismatch */
- }
-
- /*
- * Ok the chain addresses match.
- */
-
- /* If we have a fragment rule but the packet is not a fragment
- * then we return zero */
- if (FWINV((f->ipfw.fw_flg&IP_FW_F_FRAG) && !isfrag, IP_FW_INV_FRAG)) {
- dprintf("Fragment rule but not fragment.%s\n",
- f->ipfw.fw_invflg & IP_FW_INV_FRAG ? " (INV)" : "");
- return 0;
- }
-
- /* Fragment NEVER passes a SYN test, even an inverted one. */
- if (FWINV((f->ipfw.fw_flg&IP_FW_F_TCPSYN) && !tcpsyn, IP_FW_INV_SYN)
- || (isfrag && (f->ipfw.fw_flg&IP_FW_F_TCPSYN))) {
- dprintf("Rule requires SYN and packet has no SYN.%s\n",
- f->ipfw.fw_invflg & IP_FW_INV_SYN ? " (INV)" : "");
- return 0;
- }
-
- if (f->ipfw.fw_proto) {
- /*
- * Specific firewall - packet's protocol
- * must match firewall's.
- */
-
- if (FWINV(ip->protocol!=f->ipfw.fw_proto, IP_FW_INV_PROTO)) {
- dprintf("Packet protocol %hi does not match %hi.%s\n",
- ip->protocol, f->ipfw.fw_proto,
- f->ipfw.fw_invflg&IP_FW_INV_PROTO ? " (INV)":"");
- return 0;
- }
-
- /* For non TCP/UDP/ICMP, port range is max anyway. */
- if (!port_match(f->ipfw.fw_spts[0],
- f->ipfw.fw_spts[1],
- src_port, isfrag,
- !!(f->ipfw.fw_invflg&IP_FW_INV_SRCPT))
- || !port_match(f->ipfw.fw_dpts[0],
- f->ipfw.fw_dpts[1],
- dst_port, isfrag,
- !!(f->ipfw.fw_invflg
- &IP_FW_INV_DSTPT))) {
- dprintf("Port match failed.\n");
- return 0;
- }
- }
-
- dprintf("Match succeeded.\n");
- return 1;
-}
-
-static const char *branchname(struct ip_chain *branch,int simplebranch)
-{
- if (branch)
- return branch->label;
- switch (simplebranch)
- {
- case FW_BLOCK: return IP_FW_LABEL_BLOCK;
- case FW_ACCEPT: return IP_FW_LABEL_ACCEPT;
- case FW_REJECT: return IP_FW_LABEL_REJECT;
- case FW_REDIRECT: return IP_FW_LABEL_REDIRECT;
- case FW_MASQUERADE: return IP_FW_LABEL_MASQUERADE;
- case FW_SKIP: return "-";
- case FW_SKIP+1: return IP_FW_LABEL_RETURN;
- default:
- return "UNKNOWN";
- }
-}
-
-/*
- * VERY ugly piece of code which actually
- * makes kernel printf for matching packets...
- */
-static void dump_packet(const struct iphdr *ip,
- const char *ifname,
- struct ip_fwkernel *f,
- const ip_chainlabel chainlabel,
- __u16 src_port,
- __u16 dst_port)
-{
- __u32 *opt = (__u32 *) (ip + 1);
- int opti;
-
- if (f)
- {
- printk(KERN_INFO "Packet log: %s ",chainlabel);
-
- printk("%s ",branchname(f->branch,f->simplebranch));
- if (f->simplebranch==FW_REDIRECT)
- printk("%d ",f->ipfw.fw_redirpt);
- }
-
- printk("%s PROTO=%d %ld.%ld.%ld.%ld:%hu %ld.%ld.%ld.%ld:%hu"
- " L=%hu S=0x%2.2hX I=%hu F=0x%4.4hX T=%hu",
- ifname, ip->protocol,
- (ntohl(ip->saddr)>>24)&0xFF,
- (ntohl(ip->saddr)>>16)&0xFF,
- (ntohl(ip->saddr)>>8)&0xFF,
- (ntohl(ip->saddr))&0xFF,
- src_port,
- (ntohl(ip->daddr)>>24)&0xFF,
- (ntohl(ip->daddr)>>16)&0xFF,
- (ntohl(ip->daddr)>>8)&0xFF,
- (ntohl(ip->daddr))&0xFF,
- dst_port,
- ntohs(ip->tot_len), ip->tos, ntohs(ip->id),
- ntohs(ip->frag_off), ip->ttl);
-
- for (opti = 0; opti < (ip->ihl - sizeof(struct iphdr) / 4); opti++)
- printk(" O=0x%8.8X", *opt++);
- printk("\n");
-}
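For reference, a line produced by the format above looks roughly like this (addresses, ports and header fields hypothetical; target label assuming the usual ipchains strings):

	Packet log: input DENY eth0 PROTO=6 192.168.1.10:1025 10.0.0.1:80 L=40 S=0x00 I=30000 F=0x4000 T=64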
-
-/* function for checking chain labels for user space. */
-static int check_label(ip_chainlabel label)
-{
- unsigned int i;
- /* strlen must be < IP_FW_MAX_LABEL_LENGTH. */
- for (i = 0; i < IP_FW_MAX_LABEL_LENGTH + 1; i++)
- if (label[i] == '\0') return 1;
-
- return 0;
-}
-
-/* This function returns a pointer to the first chain with a label
- * that matches the one given. */
-static struct ip_chain *find_label(ip_chainlabel label)
-{
- struct ip_chain *tmp;
- FWC_HAVE_LOCK(fwc_rlocks | fwc_wlocks);
- for (tmp = ip_fw_chains; tmp; tmp = tmp->next)
- if (strcmp(tmp->label,label) == 0)
- break;
- return tmp;
-}
-
-/* This function returns a boolean which when true sets answer to one
- of the FW_*. */
-static int find_special(ip_chainlabel label, int *answer)
-{
- if (label[0] == '\0') {
- *answer = FW_SKIP; /* => pass-through rule */
- return 1;
- } else if (strcmp(label,IP_FW_LABEL_ACCEPT) == 0) {
- *answer = FW_ACCEPT;
- return 1;
- } else if (strcmp(label,IP_FW_LABEL_BLOCK) == 0) {
- *answer = FW_BLOCK;
- return 1;
- } else if (strcmp(label,IP_FW_LABEL_REJECT) == 0) {
- *answer = FW_REJECT;
- return 1;
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- } else if (strcmp(label,IP_FW_LABEL_REDIRECT) == 0) {
- *answer = FW_REDIRECT;
- return 1;
-#endif
-#ifdef CONFIG_IP_MASQUERADE
- } else if (strcmp(label,IP_FW_LABEL_MASQUERADE) == 0) {
- *answer = FW_MASQUERADE;
- return 1;
-#endif
- } else if (strcmp(label, IP_FW_LABEL_RETURN) == 0) {
- *answer = FW_SKIP+1;
- return 1;
- } else {
- return 0;
- }
-}
-
-/* This function cleans up the prevchain and prevrule. If the verbose
- * flag is set then the names of the chains will be printed as it
- * cleans up. */
-static void cleanup(struct ip_chain *chain,
- const int verbose,
- unsigned int slot)
-{
- struct ip_chain *tmpchain = chain->reent[slot].prevchain;
- if (verbose)
- printk(KERN_ERR "Chain backtrace: ");
- while (tmpchain) {
- if (verbose)
- printk("%s<-",chain->label);
- chain->reent[slot].prevchain = NULL;
- chain = tmpchain;
- tmpchain = chain->reent[slot].prevchain;
- }
- if (verbose)
- printk("%s\n",chain->label);
-}
-
-static inline int
-ip_fw_domatch(struct ip_fwkernel *f,
- struct iphdr *ip,
- const char *rif,
- const ip_chainlabel label,
- struct sk_buff *skb,
- unsigned int slot,
- __u16 src_port, __u16 dst_port)
-{
- f->counters[slot].bcnt+=ntohs(ip->tot_len);
- f->counters[slot].pcnt++;
- if (f->ipfw.fw_flg & IP_FW_F_PRN) {
- dump_packet(ip,rif,f,label,src_port,dst_port);
- }
- ip->tos = (ip->tos & f->ipfw.fw_tosand) ^ f->ipfw.fw_tosxor;
-
-/* This functionality is useless in stock 2.0.x series, but we don't
- * discard the mark thing altogether, to avoid breaking ipchains (and,
- * more importantly, the ipfwadm wrapper) --PR */
- if (f->ipfw.fw_flg & IP_FW_F_MARKABS)
- skb->fwmark = f->ipfw.fw_mark;
- else
- skb->fwmark+=f->ipfw.fw_mark;
-#ifdef CONFIG_IP_FIREWALL_NETLINK
- if (f->ipfw.fw_flg & IP_FW_F_NETLINK) {
- size_t len = min(f->ipfw.fw_outputsize, ntohs(ip->tot_len))
- + sizeof(__u32) + sizeof(skb->fwmark) + IFNAMSIZ;
- struct sk_buff *outskb=alloc_skb(len, GFP_ATOMIC);
-
- duprintf("Sending packet out NETLINK (length = %u).\n",
- (unsigned int)len);
- if (outskb) {
- /* Prepend length, mark & interface */
- skb_put(outskb, len);
- *((__u32 *)outskb->data) = (__u32)len;
- *((__u32 *)(outskb->data+sizeof(__u32))) = skb->fwmark;
- strcpy(outskb->data+sizeof(__u32)*2, rif);
- memcpy(outskb->data+sizeof(__u32)*2+IFNAMSIZ, ip,
- len-(sizeof(__u32)*2+IFNAMSIZ));
- netlink_broadcast(ipfwsk, outskb, 0, ~0, GFP_KERNEL);
- }
- else {
- if (net_ratelimit())
- printk(KERN_WARNING "ip_fw: packet drop due to "
- "netlink failure\n");
- return 0;
- }
- }
-#endif
- return 1;
-}
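The netlink branch above defines a small wire format: a length word, the fwmark, the interface name, then up to fw_outputsize bytes of the packet. A hedged sketch of how a NETLINK_FIREWALL listener might describe it (the struct is invented for illustration; the kernel writes the fields individually):

struct fw_netlink_msg {
	__u32 len;                /* total message length, header included */
	__u32 fwmark;             /* skb->fwmark at the time of the match  */
	char  ifname[IFNAMSIZ];   /* receiving interface, from strcpy()    */
	/* followed by len - 2*sizeof(__u32) - IFNAMSIZ bytes of IP packet */
};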
-
-/*
- * Returns one of the generic firewall policies, like FW_ACCEPT.
- *
- * The testing is either false for normal firewall mode or true for
- * user checking mode (counters are not updated, TOS & mark not done).
- */
-static int
-ip_fw_check(struct iphdr *ip,
- const char *rif,
- __u16 *redirport,
- struct ip_chain *chain,
- struct sk_buff *skb,
- unsigned int slot,
- int testing)
-{
- struct tcphdr *tcp=(struct tcphdr *)((__u32 *)ip+ip->ihl);
- struct udphdr *udp=(struct udphdr *)((__u32 *)ip+ip->ihl);
- struct icmphdr *icmp=(struct icmphdr *)((__u32 *)ip+ip->ihl);
- __u32 src, dst;
- __u16 src_port = 0xFFFF, dst_port = 0xFFFF;
- char tcpsyn=0;
- __u16 offset;
- unsigned char oldtos;
- struct ip_fwkernel *f;
- int ret = FW_SKIP+2;
-
- /* We handle fragments by dealing with the first fragment as
- * if it was a normal packet. All other fragments are treated
- * normally, except that they will NEVER match rules that ask
- * things we don't know, ie. tcp syn flag or ports). If the
- * rule is also a fragment-specific rule, non-fragments won't
- * match it. */
-
- offset = ntohs(ip->frag_off) & IP_OFFSET;
-
- /*
- * Don't allow a TCP fragment that starts 8 bytes in. Nobody
- * normal causes this. It's a cracker trying to break
- * in by doing a flag overwrite to pass the direction
- * checks.
- */
-
- if (offset == 1 && ip->protocol == IPPROTO_TCP) {
- if (!testing && net_ratelimit()) {
- printk("Suspect TCP fragment.\n");
- dump_packet(ip,rif,NULL,NULL,0,0);
- }
- return FW_BLOCK;
- }
-
- /* If we can't investigate ports, treat as fragment. It's
- * either a truncated whole packet, or a truncated first
- * fragment, or a TCP first fragment of length 8-15, in which
- * case the above rule stops reassembly.
- */
- if (offset == 0) {
- unsigned int size_req;
- switch (ip->protocol) {
- case IPPROTO_TCP:
- /* Don't care about things past flags word */
- size_req = 16;
- break;
-
- case IPPROTO_UDP:
- case IPPROTO_ICMP:
- size_req = 8;
- break;
-
- default:
- size_req = 0;
- }
- offset = (ntohs(ip->tot_len) < (ip->ihl<<2)+size_req);
- }
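	/* Editorial worked example of the truncation test above, with
	 * hypothetical values: a TCP packet with ihl = 5 (20-byte header)
	 * must have tot_len >= 20 + 16 = 36 to expose the flags word.
	 *   tot_len = 40 -> 40 < 36 false -> offset stays 0
	 *   tot_len = 30 -> 30 < 36 true  -> offset = 1, so the packet is
	 *   treated as a fragment and fails every SYN and port test. */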
-
- src = ip->saddr;
- dst = ip->daddr;
- oldtos = ip->tos;
-
- /*
- * If we got interface from which packet came
- * we can use the address directly. Linux 2.1 now uses address
- * chains per device too, but unlike BSD we first check if the
- * incoming packet matches a device address and the routing
- * table before calling the firewall.
- */
-
- dprintf("Packet ");
- switch(ip->protocol)
- {
- case IPPROTO_TCP:
- dprintf("TCP ");
- if (!offset) {
- src_port=ntohs(tcp->source);
- dst_port=ntohs(tcp->dest);
-
- /* Connection initialisation can only
- * be made when the SYN bit is set and
- * neither the ACK nor the RST bit is
- * set. */
- if(tcp->syn && !(tcp->ack || tcp->rst))
- tcpsyn=1;
- }
- break;
- case IPPROTO_UDP:
- dprintf("UDP ");
- if (!offset) {
- src_port=ntohs(udp->source);
- dst_port=ntohs(udp->dest);
- }
- break;
- case IPPROTO_ICMP:
- if (!offset) {
- src_port=(__u16)icmp->type;
- dst_port=(__u16)icmp->code;
- }
- dprintf("ICMP ");
- break;
- default:
- dprintf("p=%d ",ip->protocol);
- break;
- }
-#ifdef DEBUG_IP_FIREWALL
- print_ip(ip->saddr);
-
- if (offset)
- dprintf(":fragment (%i) ", ((int)offset)<<2);
- else if (ip->protocol==IPPROTO_TCP || ip->protocol==IPPROTO_UDP
- || ip->protocol==IPPROTO_ICMP)
- dprintf(":%hu:%hu", src_port, dst_port);
- dprintf("\n");
-#endif
-
- if (!testing) FWC_READ_LOCK(&ip_fw_lock);
- else FWC_HAVE_LOCK(fwc_rlocks);
-
- f = chain->chain;
- do {
- for (; f; f = f->next) {
- if (ip_rule_match(f,rif,ip,
- tcpsyn,src_port,dst_port,offset)) {
- if (!testing
- && !ip_fw_domatch(f, ip, rif, chain->label,
- skb, slot,
- src_port, dst_port)) {
- ret = FW_BLOCK;
- goto out;
- }
- break;
- }
- }
- if (f) {
- if (f->branch) {
- /* Do sanity check to see if we have
- * already set prevchain and if so we
- * must be in a loop */
- if (f->branch->reent[slot].prevchain) {
- if (!testing) {
- printk(KERN_ERR
- "IP firewall: "
- "Loop detected "
- "at `%s'.\n",
- f->branch->label);
- cleanup(chain, 1, slot);
- ret = FW_BLOCK;
- } else {
- cleanup(chain, 0, slot);
- ret = FW_SKIP+1;
- }
- }
- else {
- f->branch->reent[slot].prevchain
- = chain;
- f->branch->reent[slot].prevrule
- = f->next;
- chain = f->branch;
- f = chain->chain;
- }
- }
- else if (f->simplebranch == FW_SKIP)
- f = f->next;
- else if (f->simplebranch == FW_SKIP+1) {
- /* Just like falling off the chain */
- goto fall_off_chain;
- }
- else {
- cleanup(chain, 0, slot);
- ret = f->simplebranch;
- }
- } /* f == NULL */
- else {
- fall_off_chain:
- if (chain->reent[slot].prevchain) {
- struct ip_chain *tmp = chain;
- f = chain->reent[slot].prevrule;
- chain = chain->reent[slot].prevchain;
- tmp->reent[slot].prevchain = NULL;
- }
- else {
- ret = chain->policy;
- if (!testing) {
- chain->reent[slot].counters.pcnt++;
- chain->reent[slot].counters.bcnt
- += ntohs(ip->tot_len);
- }
- }
- }
- } while (ret == FW_SKIP+2);
-
- out:
- if (!testing) FWC_READ_UNLOCK(&ip_fw_lock);
-
- /* Recalculate checksum if not going to reject, and TOS changed. */
- if (ip->tos != oldtos
- && ret != FW_REJECT && ret != FW_BLOCK
- && !testing)
- ip_send_check(ip);
-
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- if (ret == FW_REDIRECT && redirport) {
- if ((*redirport = htons(f->ipfw.fw_redirpt)) == 0) {
- /* Wildcard redirection.
- * Note that redirport will become
- * 0xFFFF for non-TCP/UDP packets.
- */
- *redirport = htons(dst_port);
- }
- }
-#endif
-
-#ifdef DEBUG_ALLOW_ALL
- return (testing ? ret : FW_ACCEPT);
-#else
- return ret;
-#endif
-}
-
-/* Must have write lock & interrupts off for any of these */
-
-/* This function sets all the byte counters in a chain to zero. The
- * input is a pointer to the chain required for zeroing */
-static int zero_fw_chain(struct ip_chain *chainptr)
-{
- struct ip_fwkernel *i;
-
- FWC_HAVE_LOCK(fwc_wlocks);
- for (i = chainptr->chain; i; i = i->next)
- memset(i->counters, 0, sizeof(struct ip_counters)*NUM_SLOTS);
- return 0;
-}
-
-static int clear_fw_chain(struct ip_chain *chainptr)
-{
- struct ip_fwkernel *i= chainptr->chain;
-
- FWC_HAVE_LOCK(fwc_wlocks);
- chainptr->chain=NULL;
-
- while (i) {
- struct ip_fwkernel *tmp = i->next;
- if (i->branch)
- i->branch->refcount--;
- kfree(i);
- i = tmp;
- }
- return 0;
-}
-
-static int replace_in_chain(struct ip_chain *chainptr,
- struct ip_fwkernel *frwl,
- __u32 position)
-{
- struct ip_fwkernel *f = chainptr->chain;
-
- FWC_HAVE_LOCK(fwc_wlocks);
-
- while (--position && f != NULL) f = f->next;
- if (f == NULL)
- return EINVAL;
-
- if (f->branch) f->branch->refcount--;
- if (frwl->branch) frwl->branch->refcount++;
-
- frwl->next = f->next;
- memcpy(f,frwl,sizeof(struct ip_fwkernel));
- kfree(frwl);
- return 0;
-}
-
-static int append_to_chain(struct ip_chain *chainptr, struct ip_fwkernel *rule)
-{
- struct ip_fwkernel *i;
-
- FWC_HAVE_LOCK(fwc_wlocks);
- /* Special case if no rules already present */
- if (chainptr->chain == NULL) {
-
- /* If pointer writes are atomic then turning off
- * interrupts is not necessary. */
- chainptr->chain = rule;
- if (rule->branch) rule->branch->refcount++;
- return 0;
- }
-
- /* Find the rule before the end of the chain */
- for (i = chainptr->chain; i->next; i = i->next);
- i->next = rule;
- if (rule->branch) rule->branch->refcount++;
- return 0;
-}
-
-/* This function inserts a rule at the given position in the
- * chain referenced by chainptr. If position is 1 then this rule
- * will become the new first rule. */
-static int insert_in_chain(struct ip_chain *chainptr,
- struct ip_fwkernel *frwl,
- __u32 position)
-{
- struct ip_fwkernel *f = chainptr->chain;
-
- FWC_HAVE_LOCK(fwc_wlocks);
- /* special case if the position is number 1 */
- if (position == 1) {
- frwl->next = chainptr->chain;
- if (frwl->branch) frwl->branch->refcount++;
- chainptr->chain = frwl;
- return 0;
- }
- position--;
- while (--position && f != NULL) f = f->next;
- if (f == NULL)
- return EINVAL;
- if (frwl->branch) frwl->branch->refcount++;
- frwl->next = f->next;
-
- f->next = frwl;
- return 0;
-}
-
-/* This function deletes the rule at a given rulenum from a chain.
- * With rulenum = 1 the first rule is deleted. */
-
-static int del_num_from_chain(struct ip_chain *chainptr, __u32 rulenum)
-{
- struct ip_fwkernel *i=chainptr->chain,*tmp;
-
- FWC_HAVE_LOCK(fwc_wlocks);
-
- if (!chainptr->chain)
- return ENOENT;
-
- /* Need a special case for the first rule */
- if (rulenum == 1) {
- /* store temp to allow for freeing up of memory */
- tmp = chainptr->chain;
- if (chainptr->chain->branch) chainptr->chain->branch->refcount--;
- chainptr->chain = chainptr->chain->next;
- kfree(tmp); /* free memory that is now unused */
- } else {
- rulenum--;
- while (--rulenum && i->next ) i = i->next;
- if (!i->next)
- return ENOENT;
- tmp = i->next;
- if (i->next->branch)
- i->next->branch->refcount--;
- i->next = i->next->next;
- kfree(tmp);
- }
- return 0;
-}
-
-
-/* This function deletes a rule matching the given rule from a chain.
- * The rule that is deleted is the first occurrence of that rule. */
-static int del_rule_from_chain(struct ip_chain *chainptr,
- struct ip_fwkernel *frwl)
-{
- struct ip_fwkernel *ltmp,*ftmp = chainptr->chain ;
- int was_found;
-
- FWC_HAVE_LOCK(fwc_wlocks);
-
- /* Sure, we should compare marks, but since the `ipfwadm'
- * script uses it for an unholy hack... well, life is easier
- * this way. We also mask it out of the flags word. --PR */
- for (ltmp=NULL, was_found=0;
- !was_found && ftmp != NULL;
- ltmp = ftmp,ftmp = ftmp->next) {
- if (ftmp->ipfw.fw_src.s_addr!=frwl->ipfw.fw_src.s_addr
- || ftmp->ipfw.fw_dst.s_addr!=frwl->ipfw.fw_dst.s_addr
- || ftmp->ipfw.fw_smsk.s_addr!=frwl->ipfw.fw_smsk.s_addr
- || ftmp->ipfw.fw_dmsk.s_addr!=frwl->ipfw.fw_dmsk.s_addr
-#if 0
- || ftmp->ipfw.fw_flg!=frwl->ipfw.fw_flg
-#else
- || ((ftmp->ipfw.fw_flg & ~IP_FW_F_MARKABS)
- != (frwl->ipfw.fw_flg & ~IP_FW_F_MARKABS))
-#endif
- || ftmp->ipfw.fw_invflg!=frwl->ipfw.fw_invflg
- || ftmp->ipfw.fw_proto!=frwl->ipfw.fw_proto
-#if 0
- || ftmp->ipfw.fw_mark!=frwl->ipfw.fw_mark
-#endif
- || ftmp->ipfw.fw_redirpt!=frwl->ipfw.fw_redirpt
- || ftmp->ipfw.fw_spts[0]!=frwl->ipfw.fw_spts[0]
- || ftmp->ipfw.fw_spts[1]!=frwl->ipfw.fw_spts[1]
- || ftmp->ipfw.fw_dpts[0]!=frwl->ipfw.fw_dpts[0]
- || ftmp->ipfw.fw_dpts[1]!=frwl->ipfw.fw_dpts[1]
- || ftmp->ipfw.fw_outputsize!=frwl->ipfw.fw_outputsize) {
- duprintf("del_rule_from_chain: mismatch:"
- "src:%u/%u dst:%u/%u smsk:%u/%u dmsk:%u/%u "
- "flg:%hX/%hX invflg:%hX/%hX proto:%u/%u "
- "mark:%u/%u "
- "ports:%hu-%hu/%hu-%hu %hu-%hu/%hu-%hu "
- "outputsize:%hu-%hu\n",
- ftmp->ipfw.fw_src.s_addr,
- frwl->ipfw.fw_src.s_addr,
- ftmp->ipfw.fw_dst.s_addr,
- frwl->ipfw.fw_dst.s_addr,
- ftmp->ipfw.fw_smsk.s_addr,
- frwl->ipfw.fw_smsk.s_addr,
- ftmp->ipfw.fw_dmsk.s_addr,
- frwl->ipfw.fw_dmsk.s_addr,
- ftmp->ipfw.fw_flg,
- frwl->ipfw.fw_flg,
- ftmp->ipfw.fw_invflg,
- frwl->ipfw.fw_invflg,
- ftmp->ipfw.fw_proto,
- frwl->ipfw.fw_proto,
- ftmp->ipfw.fw_mark,
- frwl->ipfw.fw_mark,
- ftmp->ipfw.fw_spts[0],
- frwl->ipfw.fw_spts[0],
- ftmp->ipfw.fw_spts[1],
- frwl->ipfw.fw_spts[1],
- ftmp->ipfw.fw_dpts[0],
- frwl->ipfw.fw_dpts[0],
- ftmp->ipfw.fw_dpts[1],
- frwl->ipfw.fw_dpts[1],
- ftmp->ipfw.fw_outputsize,
- frwl->ipfw.fw_outputsize);
- continue;
- }
-
- if (strncmp(ftmp->ipfw.fw_vianame,
- frwl->ipfw.fw_vianame,
- IFNAMSIZ)) {
- duprintf("del_rule_from_chain: if mismatch: %s/%s\n",
- ftmp->ipfw.fw_vianame,
- frwl->ipfw.fw_vianame);
- continue;
- }
- if (ftmp->branch != frwl->branch) {
- duprintf("del_rule_from_chain: branch mismatch: "
- "%s/%s\n",
- ftmp->branch?ftmp->branch->label:"(null)",
- frwl->branch?frwl->branch->label:"(null)");
- continue;
- }
- if (ftmp->branch == NULL
- && ftmp->simplebranch != frwl->simplebranch) {
- duprintf("del_rule_from_chain: simplebranch mismatch: "
- "%i/%i\n",
- ftmp->simplebranch, frwl->simplebranch);
- continue;
- }
- was_found = 1;
- if (ftmp->branch)
- ftmp->branch->refcount--;
- if (ltmp)
- ltmp->next = ftmp->next;
- else
- chainptr->chain = ftmp->next;
- kfree(ftmp);
- break;
- }
-
- if (was_found)
- return 0;
- else {
- duprintf("del_rule_from_chain: no matching rule found\n");
- return EINVAL;
- }
-}
-
-/* This function takes the label of a chain and deletes the first
- * chain with that name. No special cases are required for the built-in
- * chains as they have their refcount initialised to 1 so that they are
- * never deleted. */
-static int del_chain(ip_chainlabel label)
-{
- struct ip_chain *tmp,*tmp2;
-
- FWC_HAVE_LOCK(fwc_wlocks);
- /* Corner case: return EBUSY not ENOENT for first elem ("input") */
- if (strcmp(label, ip_fw_chains->label) == 0)
- return EBUSY;
-
- for (tmp = ip_fw_chains; tmp->next; tmp = tmp->next)
- if(strcmp(tmp->next->label,label) == 0)
- break;
-
- tmp2 = tmp->next;
- if (!tmp2)
- return ENOENT;
-
- if (tmp2->refcount)
- return EBUSY;
-
- if (tmp2->chain)
- return ENOTEMPTY;
-
- tmp->next = tmp2->next;
- kfree(tmp2);
- return 0;
-}
-
-/* This is a function to initialise a chain. Built-in chains start with
- * refcount = 1 so that they cannot be deleted. User-defined chains
- * start with refcount = 0 so they can be deleted. */
-static struct ip_chain *ip_init_chain(ip_chainlabel name,
- __u32 ref,
- int policy)
-{
- unsigned int i;
- struct ip_chain *label
- = kmalloc(SIZEOF_STRUCT_IP_CHAIN, GFP_KERNEL);
- if (label == NULL)
- panic("Can't kmalloc for firewall chains.\n");
- strcpy(label->label,name);
- label->next = NULL;
- label->chain = NULL;
- label->refcount = ref;
- label->policy = policy;
- for (i = 0; i < smp_num_cpus*2; i++) {
- label->reent[i].counters.pcnt = label->reent[i].counters.bcnt
- = 0;
- label->reent[i].prevchain = NULL;
- label->reent[i].prevrule = NULL;
- }
-
- return label;
-}
-
-/* This is a function for creating a new chain. The chain is not
- * created if a chain of the same name already exists. */
-static int create_chain(ip_chainlabel label)
-{
- struct ip_chain *tmp;
-
- if (!check_label(label))
- return EINVAL;
-
- FWC_HAVE_LOCK(fwc_wlocks);
- for (tmp = ip_fw_chains; tmp->next; tmp = tmp->next)
- if (strcmp(tmp->label,label) == 0)
- return EEXIST;
-
- if (strcmp(tmp->label,label) == 0)
- return EEXIST;
-
- tmp->next = ip_init_chain(label, 0, FW_SKIP); /* refcount is
- * zero since this is a
- * user defined chain *
- * and therefore can be
- * deleted */
- return 0;
-}
-
-/* This function simply changes the policy on one of the built in
- * chains. Checking must be done before this is called to ensure that
- * chainptr is pointing to one of the three possible chains */
-static int change_policy(struct ip_chain *chainptr, int policy)
-{
- FWC_HAVE_LOCK(fwc_wlocks);
- chainptr->policy = policy;
- return 0;
-}
-
-/* This function takes an ip_fwuser and converts it to an ip_fwkernel.
- * It also performs some sanity checks on the structure. */
-static struct ip_fwkernel *convert_ipfw(struct ip_fwuser *fwuser, int *errno)
-{
- struct ip_fwkernel *fwkern;
-
- if ( (fwuser->ipfw.fw_flg & ~IP_FW_F_MASK) != 0 ) {
- duprintf("convert_ipfw: undefined flag bits set (flags=%x)\n",
- fwuser->ipfw.fw_flg);
- *errno = EINVAL;
- return NULL;
- }
-
-#ifdef DEBUG_IP_FIREWALL_USER
- /* These are sanity checks that don't really matter.
- * We can get rid of these once testing is complete.
- */
- if ((fwuser->ipfw.fw_flg & IP_FW_F_TCPSYN)
- && ((fwuser->ipfw.fw_invflg & IP_FW_INV_PROTO)
- || fwuser->ipfw.fw_proto != IPPROTO_TCP)) {
- duprintf("convert_ipfw: TCP SYN flag set but proto != TCP!\n");
- *errno = EINVAL;
- return NULL;
- }
-
- if (strcmp(fwuser->label, IP_FW_LABEL_REDIRECT) != 0
- && fwuser->ipfw.fw_redirpt != 0) {
- duprintf("convert_ipfw: Target not REDIR but redirpt != 0!\n");
- *errno = EINVAL;
- return NULL;
- }
-
- if ((!(fwuser->ipfw.fw_flg & IP_FW_F_FRAG)
- && (fwuser->ipfw.fw_invflg & IP_FW_INV_FRAG))
- || (!(fwuser->ipfw.fw_flg & IP_FW_F_TCPSYN)
- && (fwuser->ipfw.fw_invflg & IP_FW_INV_SYN))) {
- duprintf("convert_ipfw: Can't have INV flag if flag unset!\n");
- *errno = EINVAL;
- return NULL;
- }
-
- if (((fwuser->ipfw.fw_invflg & IP_FW_INV_SRCPT)
- && fwuser->ipfw.fw_spts[0] == 0
- && fwuser->ipfw.fw_spts[1] == 0xFFFF)
- || ((fwuser->ipfw.fw_invflg & IP_FW_INV_DSTPT)
- && fwuser->ipfw.fw_dpts[0] == 0
- && fwuser->ipfw.fw_dpts[1] == 0xFFFF)
- || ((fwuser->ipfw.fw_invflg & IP_FW_INV_VIA)
- && (fwuser->ipfw.fw_vianame)[0] == '\0')
- || ((fwuser->ipfw.fw_invflg & IP_FW_INV_SRCIP)
- && fwuser->ipfw.fw_smsk.s_addr == 0)
- || ((fwuser->ipfw.fw_invflg & IP_FW_INV_DSTIP)
- && fwuser->ipfw.fw_dmsk.s_addr == 0)) {
- duprintf("convert_ipfw: INV flag makes rule unmatchable!\n");
- *errno = EINVAL;
- return NULL;
- }
-
- if ((fwuser->ipfw.fw_flg & IP_FW_F_FRAG)
- && !(fwuser->ipfw.fw_invflg & IP_FW_INV_FRAG)
- && (fwuser->ipfw.fw_spts[0] != 0
- || fwuser->ipfw.fw_spts[1] != 0xFFFF
- || fwuser->ipfw.fw_dpts[0] != 0
- || fwuser->ipfw.fw_dpts[1] != 0xFFFF
- || (fwuser->ipfw.fw_flg & IP_FW_F_TCPSYN))) {
- duprintf("convert_ipfw: Can't test ports or SYN with frag!\n");
- *errno = EINVAL;
- return NULL;
- }
-#endif
-
- if ((fwuser->ipfw.fw_spts[0] != 0
- || fwuser->ipfw.fw_spts[1] != 0xFFFF
- || fwuser->ipfw.fw_dpts[0] != 0
- || fwuser->ipfw.fw_dpts[1] != 0xFFFF)
- && ((fwuser->ipfw.fw_invflg & IP_FW_INV_PROTO)
- || (fwuser->ipfw.fw_proto != IPPROTO_TCP
- && fwuser->ipfw.fw_proto != IPPROTO_UDP
- && fwuser->ipfw.fw_proto != IPPROTO_ICMP))) {
- duprintf("convert_ipfw: Can only test ports for TCP/UDP/ICMP!\n");
- *errno = EINVAL;
- return NULL;
- }
-
- fwkern = kmalloc(SIZEOF_STRUCT_IP_FW_KERNEL, GFP_KERNEL);
- if (!fwkern) {
- duprintf("convert_ipfw: kmalloc failed!\n");
- *errno = ENOMEM;
- return NULL;
- }
- memcpy(&fwkern->ipfw,&fwuser->ipfw,sizeof(struct ip_fw));
-
- if (!find_special(fwuser->label, &fwkern->simplebranch)) {
- fwkern->branch = find_label(fwuser->label);
- if (!fwkern->branch) {
- duprintf("convert_ipfw: chain doesn't exist `%s'.\n",
- fwuser->label);
- kfree(fwkern);
- *errno = ENOENT;
- return NULL;
- } else if (fwkern->branch == IP_FW_INPUT_CHAIN
- || fwkern->branch == IP_FW_FORWARD_CHAIN
- || fwkern->branch == IP_FW_OUTPUT_CHAIN) {
- duprintf("convert_ipfw: Can't branch to builtin chain `%s'.\n",
- fwuser->label);
- kfree(fwkern);
- *errno = ENOENT;
- return NULL;
- }
- } else
- fwkern->branch = NULL;
- memset(fwkern->counters, 0, sizeof(struct ip_counters)*NUM_SLOTS);
-
- /* Handle empty vianame by making it a wildcard */
- if ((fwkern->ipfw.fw_vianame)[0] == '\0')
- fwkern->ipfw.fw_flg |= IP_FW_F_WILDIF;
-
- fwkern->next = NULL;
- return fwkern;
-}
-
-int ip_fw_ctl(int cmd, void *m, int len)
-{
- int ret;
- struct ip_chain *chain;
- unsigned long flags;
-
- FWC_WRITE_LOCK_IRQ(&ip_fw_lock, flags);
-
- switch (cmd) {
- case IP_FW_FLUSH:
- if (len != sizeof(ip_chainlabel) || !check_label(m))
- ret = EINVAL;
- else if ((chain = find_label(m)) == NULL)
- ret = ENOENT;
- else ret = clear_fw_chain(chain);
- break;
-
- case IP_FW_ZERO:
- if (len != sizeof(ip_chainlabel) || !check_label(m))
- ret = EINVAL;
- else if ((chain = find_label(m)) == NULL)
- ret = ENOENT;
- else ret = zero_fw_chain(chain);
- break;
-
- case IP_FW_CHECK: {
- struct ip_fwtest *new = m;
- struct iphdr *ip;
-
- /* Don't need write lock. */
- FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags);
-
- if (len != sizeof(struct ip_fwtest) || !check_label(m))
- return EINVAL;
-
- /* Need readlock to do find_label */
- FWC_READ_LOCK(&ip_fw_lock);
-
- if ((chain = find_label(new->fwt_label)) == NULL)
- ret = ENOENT;
- else {
- ip = &(new->fwt_packet.fwp_iph);
-
- if (ip->ihl != sizeof(struct iphdr) / sizeof(int)) {
- duprintf("ip_fw_ctl: ip->ihl=%d, want %d\n",
- ip->ihl,
- sizeof(struct iphdr) / sizeof(int));
- ret = EINVAL;
- }
- else {
- ret = ip_fw_check(ip, new->fwt_packet.fwp_vianame,
- NULL, chain,
- NULL, SLOT_NUMBER(), 1);
- switch (ret) {
- case FW_ACCEPT:
- ret = 0; break;
- case FW_REDIRECT:
- ret = ECONNABORTED; break;
- case FW_MASQUERADE:
- ret = ECONNRESET; break;
- case FW_REJECT:
- ret = ECONNREFUSED; break;
- /* Hack to help diag; these only get
- returned when testing. */
- case FW_SKIP+1:
- ret = ELOOP; break;
- case FW_SKIP:
- ret = ENFILE; break;
- default: /* FW_BLOCK */
- ret = ETIMEDOUT; break;
- }
- }
- }
- FWC_READ_UNLOCK(&ip_fw_lock);
- return ret;
- }
-
- case IP_FW_MASQ_TIMEOUTS: {
-#ifdef CONFIG_IP_MASQUERADE
- ret = ip_fw_masq_timeouts(m, len);
-#else
- ret = EINVAL;
-#endif
- }
- break;
-
- case IP_FW_REPLACE: {
- struct ip_fwkernel *ip_fwkern;
- struct ip_fwnew *new = m;
-
- if (len != sizeof(struct ip_fwnew)
- || !check_label(new->fwn_label))
- ret = EINVAL;
- else if ((chain = find_label(new->fwn_label)) == NULL)
- ret = ENOENT;
- else if ((ip_fwkern = convert_ipfw(&new->fwn_rule, &ret))
- != NULL)
- ret = replace_in_chain(chain, ip_fwkern,
- new->fwn_rulenum);
- }
- break;
-
- case IP_FW_APPEND: {
- struct ip_fwchange *new = m;
- struct ip_fwkernel *ip_fwkern;
-
- if (len != sizeof(struct ip_fwchange)
- || !check_label(new->fwc_label))
- ret = EINVAL;
- else if ((chain = find_label(new->fwc_label)) == NULL)
- ret = ENOENT;
- else if ((ip_fwkern = convert_ipfw(&new->fwc_rule, &ret))
- != NULL)
- ret = append_to_chain(chain, ip_fwkern);
- }
- break;
-
- case IP_FW_INSERT: {
- struct ip_fwkernel *ip_fwkern;
- struct ip_fwnew *new = m;
-
- if (len != sizeof(struct ip_fwnew)
- || !check_label(new->fwn_label))
- ret = EINVAL;
- else if ((chain = find_label(new->fwn_label)) == NULL)
- ret = ENOENT;
- else if ((ip_fwkern = convert_ipfw(&new->fwn_rule, &ret))
- != NULL)
- ret = insert_in_chain(chain, ip_fwkern,
- new->fwn_rulenum);
- }
- break;
-
- case IP_FW_DELETE: {
- struct ip_fwchange *new = m;
- struct ip_fwkernel *ip_fwkern;
-
- if (len != sizeof(struct ip_fwchange)
- || !check_label(new->fwc_label))
- ret = EINVAL;
- else if ((chain = find_label(new->fwc_label)) == NULL)
- ret = ENOENT;
- else if ((ip_fwkern = convert_ipfw(&new->fwc_rule, &ret))
- != NULL)
- ret = del_rule_from_chain(chain, ip_fwkern);
- }
- break;
-
- case IP_FW_DELETE_NUM: {
- struct ip_fwdelnum *new = m;
-
- if (len != sizeof(struct ip_fwdelnum)
- || !check_label(new->fwd_label))
- ret = EINVAL;
- else if ((chain = find_label(new->fwd_label)) == NULL)
- ret = ENOENT;
- else ret = del_num_from_chain(chain, new->fwd_rulenum);
- }
- break;
-
- case IP_FW_CREATECHAIN: {
- if (len != sizeof(ip_chainlabel)) {
- duprintf("create_chain: bad size %i\n", len);
- ret = EINVAL;
- }
- else ret = create_chain(m);
- }
- break;
-
- case IP_FW_DELETECHAIN: {
- if (len != sizeof(ip_chainlabel)) {
- duprintf("delete_chain: bad size %i\n", len);
- ret = EINVAL;
- }
- else ret = del_chain(m);
- }
- break;
-
- case IP_FW_POLICY: {
- struct ip_fwpolicy *new = m;
-
- if (len != sizeof(struct ip_fwpolicy)
- || !check_label(new->fwp_label))
- ret = EINVAL;
- else if ((chain = find_label(new->fwp_label)) == NULL)
- ret = ENOENT;
- else if (chain != IP_FW_INPUT_CHAIN
- && chain != IP_FW_FORWARD_CHAIN
- && chain != IP_FW_OUTPUT_CHAIN) {
- duprintf("change_policy: can't change policy on user"
- " defined chain.\n");
- ret = EINVAL;
- }
- else {
- int pol = FW_SKIP;
- find_special(new->fwp_policy, &pol);
-
- switch(pol) {
- case FW_MASQUERADE:
- if (chain != IP_FW_FORWARD_CHAIN) {
- ret = EINVAL;
- break;
- }
- /* Fall thru... */
- case FW_BLOCK:
- case FW_ACCEPT:
- case FW_REJECT:
- ret = change_policy(chain, pol);
- break;
- default:
- duprintf("change_policy: bad policy `%s'\n",
- new->fwp_policy);
- ret = EINVAL;
- }
- }
- break;
-
- }
- default:
- duprintf("ip_fw_ctl: unknown request %d\n",cmd);
- ret = EINVAL;
- }
-
- FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags);
- return ret;
-}
-
-/* Returns bytes used - doesn't NUL terminate */
-static int dump_rule(char *buffer,
- const char *chainlabel,
- const struct ip_fwkernel *rule)
-{
- int len;
- unsigned int i;
- __u64 packets = 0, bytes = 0;
-
- FWC_HAVE_LOCK(fwc_wlocks);
- for (i = 0; i < NUM_SLOTS; i++) {
- packets += rule->counters[i].pcnt;
- bytes += rule->counters[i].bcnt;
- }
-
- len=sprintf(buffer,
- "%9s " /* Chain name */
- "%08lX/%08lX->%08lX/%08lX " /* Source & Destination IPs */
- "%.16s " /* Interface */
- "%X %X " /* fw_flg and fw_invflg fields */
- "%u " /* Protocol */
- "%-9u %-9u %-9u %-9u " /* Packet & byte counters */
- "%u-%u %u-%u " /* Source & Dest port ranges */
- "A%02X X%02X " /* TOS and-mask and xor-mask */
- "%08X " /* Redirection port */
- "%u " /* fw_mark field */
- "%u " /* output size */
- "%9s\n", /* Target */
- chainlabel,
- ntohl(rule->ipfw.fw_src.s_addr),
- ntohl(rule->ipfw.fw_smsk.s_addr),
- ntohl(rule->ipfw.fw_dst.s_addr),
- ntohl(rule->ipfw.fw_dmsk.s_addr),
- (rule->ipfw.fw_vianame)[0] ? rule->ipfw.fw_vianame : "-",
- rule->ipfw.fw_flg,
- rule->ipfw.fw_invflg,
- rule->ipfw.fw_proto,
- (__u32)(packets >> 32), (__u32)packets,
- (__u32)(bytes >> 32), (__u32)bytes,
- rule->ipfw.fw_spts[0], rule->ipfw.fw_spts[1],
- rule->ipfw.fw_dpts[0], rule->ipfw.fw_dpts[1],
- rule->ipfw.fw_tosand, rule->ipfw.fw_tosxor,
- rule->ipfw.fw_redirpt,
- rule->ipfw.fw_mark,
- rule->ipfw.fw_outputsize,
- branchname(rule->branch,rule->simplebranch));
-
- duprintf("dump_rule: %i bytes done.\n", len);
- return len;
-}
-
-/* File offset is actually in records, not bytes. */
-static int ip_chain_procinfo(char *buffer, char **start,
- off_t offset, int length, int reset)
-{
- struct ip_chain *i;
- struct ip_fwkernel *j = ip_fw_chains->chain;
- unsigned long flags;
- int len = 0;
- int last_len = 0;
- off_t upto = 0;
-
- duprintf("Offset starts at %lu\n", offset);
- duprintf("ip_fw_chains is 0x%0lX\n", (unsigned long int)ip_fw_chains);
-
- /* Need a write lock to lock out ``readers'' which update counters. */
- FWC_WRITE_LOCK_IRQ(&ip_fw_lock, flags);
-
- for (i = ip_fw_chains; i; i = i->next) {
- for (j = i->chain; j; j = j->next) {
- if (upto == offset) break;
- duprintf("Skipping rule in chain `%s'\n",
- i->label);
- upto++;
- }
- if (upto == offset) break;
- }
-
- /* Don't init j first time, or once i = NULL */
- for (; i; (void)((i = i->next) && (j = i->chain))) {
- duprintf("Dumping chain `%s'\n", i->label);
- for (; j; j = j->next, upto++, last_len = len)
- {
- len += dump_rule(buffer+len, i->label, j);
- if (len > length) {
- duprintf("Dumped to %i (past %i). "
- "Moving back to %i.\n",
- len, length, last_len);
- len = last_len;
- goto outside;
- }
- else if (reset)
- memset(j->counters, 0,
- sizeof(struct ip_counters)*NUM_SLOTS);
- }
- }
-outside:
- FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags);
- buffer[len] = '\0';
-
- duprintf("ip_chain_procinfo: Length = %i (of %i). Offset = %li.\n",
- len, length, upto);
- /* `start' hack - see fs/proc/generic.c line ~165 */
- *start=(char *)((unsigned int)upto-offset);
- return len;
-}
-
-static int ip_chain_name_procinfo(char *buffer, char **start,
- off_t offset, int length, int reset)
-{
- struct ip_chain *i;
- int len = 0,last_len = 0;
- off_t pos = 0,begin = 0;
- unsigned long flags;
-
- /* Need a write lock to lock out ``readers'' which update counters. */
- FWC_WRITE_LOCK_IRQ(&ip_fw_lock, flags);
-
- for (i = ip_fw_chains; i; i = i->next)
- {
- unsigned int j;
- __u32 packetsHi = 0, packetsLo = 0, bytesHi = 0, bytesLo = 0;
-
- for (j = 0; j < NUM_SLOTS; j++) {
- packetsLo += i->reent[j].counters.pcnt & 0xFFFFFFFF;
- packetsHi += ((i->reent[j].counters.pcnt >> 32)
- & 0xFFFFFFFF);
- bytesLo += i->reent[j].counters.bcnt & 0xFFFFFFFF;
- bytesHi += ((i->reent[j].counters.bcnt >> 32)
- & 0xFFFFFFFF);
- }
-
- /* print the label and the policy */
- len+=sprintf(buffer+len,"%s %s %i %u %u %u %u\n",
- i->label,branchname(NULL, i->policy),i->refcount,
- packetsHi, packetsLo, bytesHi, bytesLo);
- pos=begin+len;
- if(pos<offset) {
- len=0;
- begin=pos;
- }
- else if(pos>offset+length) {
- len = last_len;
- break;
- }
-
- last_len = len;
- }
- FWC_WRITE_UNLOCK_IRQ(&ip_fw_lock, flags);
-
- *start = buffer+(offset-begin);
- len-=(offset-begin);
- if(len>length)
- len=length;
- return len;
-}
-
-/*
- * Interface to the generic firewall chains.
- */
-int ipfw_input_check(struct firewall_ops *this, int pf, struct device *dev,
- void *phdr, void *arg, struct sk_buff **pskb)
-{
- return ip_fw_check(phdr, dev->name,
- arg, IP_FW_INPUT_CHAIN, *pskb, SLOT_NUMBER(), 0);
-}
-
-int ipfw_output_check(struct firewall_ops *this, int pf, struct device *dev,
- void *phdr, void *arg, struct sk_buff **pskb)
-{
- /* Locally generated bogus packets by root. <SIGH>. */
- if (((struct iphdr *)phdr)->ihl * 4 < sizeof(struct iphdr)
- || (*pskb)->len < sizeof(struct iphdr))
- return FW_ACCEPT;
- return ip_fw_check(phdr, dev->name,
- arg, IP_FW_OUTPUT_CHAIN, *pskb, SLOT_NUMBER(), 0);
-}
-
-int ipfw_forward_check(struct firewall_ops *this, int pf, struct device *dev,
- void *phdr, void *arg, struct sk_buff **pskb)
-{
- return ip_fw_check(phdr, dev->name,
- arg, IP_FW_FORWARD_CHAIN, *pskb, SLOT_NUMBER(), 0);
-}
-
-struct firewall_ops ipfw_ops=
-{
- NULL,
- ipfw_forward_check,
- ipfw_input_check,
- ipfw_output_check,
- PF_INET,
- 0 /* We don't even allow a fall through so we are last */
-};
-
-#ifdef CONFIG_PROC_FS
-static struct proc_dir_entry proc_net_ipfwchains_chain = {
- PROC_NET_IPFW_CHAINS, sizeof(IP_FW_PROC_CHAINS)-1,
- IP_FW_PROC_CHAINS, S_IFREG | S_IRUSR | S_IWUSR, 1, 0, 0,
- 0, &proc_net_inode_operations, ip_chain_procinfo
-};
-
-static struct proc_dir_entry proc_net_ipfwchains_chainnames = {
- PROC_NET_IPFW_CHAIN_NAMES, sizeof(IP_FW_PROC_CHAIN_NAMES)-1,
- IP_FW_PROC_CHAIN_NAMES, S_IFREG | S_IRUSR | S_IWUSR, 1, 0, 0,
- 0, &proc_net_inode_operations, ip_chain_name_procinfo
-};
-
-#endif
-
-__initfunc(void ip_fw_init(void))
-{
-#ifdef DEBUG_IP_FIREWALL_LOCKING
- fwc_wlocks = fwc_rlocks = 0;
-#endif
-
- IP_FW_INPUT_CHAIN = ip_init_chain(IP_FW_LABEL_INPUT, 1, FW_ACCEPT);
- IP_FW_FORWARD_CHAIN = ip_init_chain(IP_FW_LABEL_FORWARD, 1, FW_ACCEPT);
- IP_FW_OUTPUT_CHAIN = ip_init_chain(IP_FW_LABEL_OUTPUT, 1, FW_ACCEPT);
-
- if(register_firewall(PF_INET,&ipfw_ops)<0)
- panic("Unable to register IP firewall.\n");
-
-#ifdef CONFIG_PROC_FS
- proc_net_register(&proc_net_ipfwchains_chain);
- proc_net_register(&proc_net_ipfwchains_chainnames);
-#endif
-
-#ifdef CONFIG_IP_FIREWALL_NETLINK
- ipfwsk = netlink_kernel_create(NETLINK_FIREWALL, NULL);
- if (ipfwsk == NULL)
- panic("ip_fw_init: cannot initialize netlink\n");
-#endif
-#if defined(DEBUG_IP_FIREWALL) || defined(DEBUG_IP_FIREWALL_USER)
- printk("Firewall graphs enabled! Untested kernel coming thru. \n");
-#endif
-}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 6a7546fd5..0a5402030 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -111,13 +111,13 @@
Alexey Kuznetsov.
*/
-static int ipgre_tunnel_init(struct device *dev);
+static int ipgre_tunnel_init(struct net_device *dev);
/* Fallback tunnel: no source, no destination, no key, no options */
-static int ipgre_fb_tunnel_init(struct device *dev);
+static int ipgre_fb_tunnel_init(struct net_device *dev);
-static struct device ipgre_fb_tunnel_dev = {
+static struct net_device ipgre_fb_tunnel_dev = {
NULL, 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipgre_fb_tunnel_init,
};
@@ -153,6 +153,8 @@ static struct ip_tunnel *tunnels[4][HASH_SIZE];
#define tunnels_l (tunnels[1])
#define tunnels_wc (tunnels[0])
+static rwlock_t ipgre_lock = RW_LOCK_UNLOCKED;
+
/* Given src, dst and key, find the appropriate input tunnel. */
static struct ip_tunnel * ipgre_tunnel_lookup(u32 remote, u32 local, u32 key)
@@ -212,8 +214,9 @@ static void ipgre_tunnel_link(struct ip_tunnel *t)
struct ip_tunnel **tp = ipgre_bucket(t);
t->next = *tp;
- wmb();
+ write_lock_bh(&ipgre_lock);
*tp = t;
+ write_unlock_bh(&ipgre_lock);
}
static void ipgre_tunnel_unlink(struct ip_tunnel *t)
@@ -222,8 +225,9 @@ static void ipgre_tunnel_unlink(struct ip_tunnel *t)
for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) {
if (t == *tp) {
+ write_lock_bh(&ipgre_lock);
*tp = t->next;
- synchronize_bh();
+ write_unlock_bh(&ipgre_lock);
break;
}
}
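This and the neighbouring hunks replace the wmb()/synchronize_bh() publication trick with a conventional rwlock. A condensed sketch of the resulting discipline (the function name is invented; the lock calls are the ones this patch adds):

static void example_rcv_path(struct sk_buff *skb, struct iphdr *iph, u32 key)
{
	struct ip_tunnel *t;

	read_lock(&ipgre_lock);   /* tunnel must stay linked while in use */
	t = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key);
	if (t != NULL) {
		/* ... deliver skb to the tunnel, as ipgre_rcv() does ... */
	}
	read_unlock(&ipgre_lock); /* only now may an unlink proceed */
}

Writers (ipgre_tunnel_link/unlink) take write_lock_bh(), so they also exclude softirq readers running on the local CPU.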
@@ -235,7 +239,7 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int
u32 local = parms->iph.saddr;
u32 key = parms->i_key;
struct ip_tunnel *t, **tp, *nt;
- struct device *dev;
+ struct net_device *dev;
unsigned h = HASH(key);
int prio = 0;
@@ -266,12 +270,13 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int
nt->dev = dev;
dev->name = nt->parms.name;
dev->init = ipgre_tunnel_init;
+ dev->new_style = 1;
memcpy(&nt->parms, parms, sizeof(*parms));
if (dev->name[0] == 0) {
int i;
for (i=1; i<100; i++) {
sprintf(dev->name, "gre%d", i);
- if (dev_get(dev->name) == NULL)
+ if (__dev_get_by_name(dev->name) == NULL)
break;
}
if (i==100)
@@ -281,6 +286,7 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int
if (register_netdevice(dev) < 0)
goto failed;
+ dev_hold(dev);
ipgre_tunnel_link(nt);
/* Do not decrement MOD_USE_COUNT here. */
return nt;
@@ -291,16 +297,19 @@ failed:
return NULL;
}
-static void ipgre_tunnel_destroy(struct device *dev)
+static void ipgre_tunnel_destructor(struct net_device *dev)
{
- ipgre_tunnel_unlink((struct ip_tunnel*)dev->priv);
-
if (dev != &ipgre_fb_tunnel_dev) {
- kfree(dev);
MOD_DEC_USE_COUNT;
}
}
+static void ipgre_tunnel_uninit(struct net_device *dev)
+{
+ ipgre_tunnel_unlink((struct ip_tunnel*)dev->priv);
+ dev_put(dev);
+}
+
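The destroy callback is split because new_style devices are freed by the core once their refcount drops: the driver takes its own reference with dev_hold() when the tunnel is linked, releases it in uninit, and the destructor is left only to balance the module count. A comment-only sketch of the lifecycle these hunks assume:

/* creation (ipgre_tunnel_locate):
 *	dev->new_style = 1;	   the core, not the driver, frees dev
 *	register_netdevice(dev);
 *	dev_hold(dev);		   the tunnel hash keeps a reference
 *	ipgre_tunnel_link(nt);
 * teardown:
 *	->uninit:     ipgre_tunnel_unlink(); dev_put(dev);
 *	->destructor: MOD_DEC_USE_COUNT only - no kfree(dev) any more
 */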
void ipgre_err(struct sk_buff *skb, unsigned char *dp, int len)
{
@@ -370,18 +379,21 @@ void ipgre_err(struct sk_buff *skb, unsigned char *dp, int len)
break;
}
+ read_lock(&ipgre_lock);
t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((u32*)p) + (grehlen>>2) - 1) : 0);
if (t == NULL || t->parms.iph.daddr == 0 || MULTICAST(t->parms.iph.daddr))
- return;
+ goto out;
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
- return;
+ goto out;
if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
t->err_count++;
else
t->err_count = 1;
t->err_time = jiffies;
+out:
+ read_unlock(&ipgre_lock);
return;
#else
struct iphdr *iph = (struct iphdr*)dp;
@@ -533,7 +545,7 @@ int ipgre_rcv(struct sk_buff *skb, unsigned short len)
- We do not support routing headers.
*/
if (flags&(GRE_VERSION|GRE_ROUTING))
- goto drop;
+ goto drop_nolock;
if (flags&GRE_CSUM) {
csum = ip_compute_csum(h, len);
@@ -549,6 +561,7 @@ int ipgre_rcv(struct sk_buff *skb, unsigned short len)
}
}
+ read_lock(&ipgre_lock);
if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) {
skb->mac.raw = skb->nh.raw;
skb->nh.raw = skb_pull(skb, h + offset - skb->data);
@@ -587,16 +600,19 @@ int ipgre_rcv(struct sk_buff *skb, unsigned short len)
dst_release(skb->dst);
skb->dst = NULL;
netif_rx(skb);
+ read_unlock(&ipgre_lock);
return(0);
}
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);
drop:
+ read_unlock(&ipgre_lock);
+drop_nolock:
kfree_skb(skb);
return(0);
}
-static int ipgre_tunnel_xmit(struct sk_buff *skb, struct device *dev)
+static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
struct net_device_stats *stats = &tunnel->stat;
@@ -605,7 +621,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct device *dev)
u8 tos;
u16 df;
struct rtable *rt; /* Route to the other host */
- struct device *tdev; /* Device to other host */
+ struct net_device *tdev; /* Device to other host */
struct iphdr *iph; /* Our new IP header */
int max_headroom; /* The extra header space needed */
int gre_hlen;
@@ -819,7 +835,7 @@ tx_error:
}
static int
-ipgre_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd)
+ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
int err = 0;
struct ip_tunnel_parm p;
@@ -890,7 +906,6 @@ ipgre_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd)
err = -EINVAL;
break;
}
- start_bh_atomic();
ipgre_tunnel_unlink(t);
t->parms.iph.saddr = p.iph.saddr;
t->parms.iph.daddr = p.iph.daddr;
@@ -899,7 +914,6 @@ ipgre_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd)
memcpy(dev->dev_addr, &p.iph.saddr, 4);
memcpy(dev->broadcast, &p.iph.daddr, 4);
ipgre_tunnel_link(t);
- end_bh_atomic();
netdev_state_change(dev);
}
}
@@ -945,12 +959,12 @@ done:
return err;
}
-static struct net_device_stats *ipgre_tunnel_get_stats(struct device *dev)
+static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
{
return &(((struct ip_tunnel*)dev->priv)->stat);
}
-static int ipgre_tunnel_change_mtu(struct device *dev, int new_mtu)
+static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
@@ -989,7 +1003,7 @@ static int ipgre_tunnel_change_mtu(struct device *dev, int new_mtu)
*/
-static int ipgre_header(struct sk_buff *skb, struct device *dev, unsigned short type,
+static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
void *daddr, void *saddr, unsigned len)
{
struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
@@ -1017,7 +1031,7 @@ static int ipgre_header(struct sk_buff *skb, struct device *dev, unsigned short
return -t->hlen;
}
-static int ipgre_open(struct device *dev)
+static int ipgre_open(struct net_device *dev)
{
struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
@@ -1032,23 +1046,25 @@ static int ipgre_open(struct device *dev)
}
dev = rt->u.dst.dev;
ip_rt_put(rt);
- if (dev->ip_ptr == NULL) {
+ if (__in_dev_get(dev) == NULL) {
MOD_DEC_USE_COUNT;
return -EADDRNOTAVAIL;
}
t->mlink = dev->ifindex;
- ip_mc_inc_group(dev->ip_ptr, t->parms.iph.daddr);
+ ip_mc_inc_group(__in_dev_get(dev), t->parms.iph.daddr);
}
return 0;
}
-static int ipgre_close(struct device *dev)
+static int ipgre_close(struct net_device *dev)
{
struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
if (MULTICAST(t->parms.iph.daddr) && t->mlink) {
- dev = dev_get_by_index(t->mlink);
- if (dev && dev->ip_ptr)
- ip_mc_dec_group(dev->ip_ptr, t->parms.iph.daddr);
+ struct in_device *in_dev = inetdev_by_index(t->mlink);
+ if (in_dev) {
+ ip_mc_dec_group(in_dev, t->parms.iph.daddr);
+ in_dev_put(in_dev);
+ }
}
MOD_DEC_USE_COUNT;
return 0;
@@ -1056,11 +1072,12 @@ static int ipgre_close(struct device *dev)
#endif
-static void ipgre_tunnel_init_gen(struct device *dev)
+static void ipgre_tunnel_init_gen(struct net_device *dev)
{
struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
- dev->destructor = ipgre_tunnel_destroy;
+ dev->uninit = ipgre_tunnel_uninit;
+ dev->destructor = ipgre_tunnel_destructor;
dev->hard_start_xmit = ipgre_tunnel_xmit;
dev->get_stats = ipgre_tunnel_get_stats;
dev->do_ioctl = ipgre_tunnel_ioctl;
@@ -1078,9 +1095,9 @@ static void ipgre_tunnel_init_gen(struct device *dev)
memcpy(dev->broadcast, &t->parms.iph.daddr, 4);
}
-static int ipgre_tunnel_init(struct device *dev)
+static int ipgre_tunnel_init(struct net_device *dev)
{
- struct device *tdev = NULL;
+ struct net_device *tdev = NULL;
struct ip_tunnel *tunnel;
struct iphdr *iph;
int hlen = LL_MAX_HEADER;
@@ -1116,7 +1133,7 @@ static int ipgre_tunnel_init(struct device *dev)
}
if (!tdev && tunnel->parms.link)
- tdev = dev_get_by_index(tunnel->parms.link);
+ tdev = __dev_get_by_index(tunnel->parms.link);
if (tdev) {
hlen = tdev->hard_header_len;
@@ -1140,20 +1157,20 @@ static int ipgre_tunnel_init(struct device *dev)
}
#ifdef MODULE
-static int ipgre_fb_tunnel_open(struct device *dev)
+static int ipgre_fb_tunnel_open(struct net_device *dev)
{
MOD_INC_USE_COUNT;
return 0;
}
-static int ipgre_fb_tunnel_close(struct device *dev)
+static int ipgre_fb_tunnel_close(struct net_device *dev)
{
MOD_DEC_USE_COUNT;
return 0;
}
#endif
-__initfunc(int ipgre_fb_tunnel_init(struct device *dev))
+int __init ipgre_fb_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
struct iphdr *iph;
@@ -1170,6 +1187,7 @@ __initfunc(int ipgre_fb_tunnel_init(struct device *dev))
iph->ihl = 5;
tunnel->hlen = sizeof(struct iphdr) + 4;
+ dev_hold(dev);
tunnels_wc[0] = &ipgre_fb_tunnel;
return 0;
}
@@ -1193,7 +1211,7 @@ static struct inet_protocol ipgre_protocol = {
#ifdef MODULE
int init_module(void)
#else
-__initfunc(int ipgre_init(void))
+int __init ipgre_init(void)
#endif
{
printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 107ccaa16..4ebdbdfb4 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -5,7 +5,7 @@
*
* The Internet Protocol (IP) module.
*
- * Version: $Id: ip_input.c,v 1.40 1999/06/09 10:10:55 davem Exp $
+ * Version: $Id: ip_input.c,v 1.42 1999/08/20 11:05:27 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -140,11 +140,7 @@
#include <net/icmp.h>
#include <net/raw.h>
#include <net/checksum.h>
-#include <linux/ip_fw.h>
-#ifdef CONFIG_IP_MASQUERADE
-#include <net/ip_masq.h>
-#endif
-#include <linux/firewall.h>
+#include <linux/netfilter_ipv4.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
@@ -154,10 +150,6 @@
struct ip_mib ip_statistics={2,IPDEFTTL,}; /* Forwarding=No, Default TTL=64 */
-#if defined(CONFIG_IP_TRANSPARENT_PROXY) && !defined(CONFIG_IP_ALWAYS_DEFRAG)
-#define CONFIG_IP_ALWAYS_DEFRAG 1
-#endif
-
/*
* Process Router Attention IP option
*/
@@ -167,13 +159,16 @@ int ip_call_ra_chain(struct sk_buff *skb)
u8 protocol = skb->nh.iph->protocol;
struct sock *last = NULL;
+ read_lock(&ip_ra_lock);
for (ra = ip_ra_chain; ra; ra = ra->next) {
struct sock *sk = ra->sk;
if (sk && sk->num == protocol) {
if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
skb = ip_defrag(skb);
- if (skb == NULL)
+ if (skb == NULL) {
+ read_unlock(&ip_ra_lock);
return 1;
+ }
}
if (last) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
@@ -186,8 +181,10 @@ int ip_call_ra_chain(struct sk_buff *skb)
if (last) {
raw_rcv(last, skb);
+ read_unlock(&ip_ra_lock);
return 1;
}
+ read_unlock(&ip_ra_lock);
return 0;
}
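
The hunk above takes ip_ra_lock for reading around the whole ip_ra_chain walk, which means the lock must be dropped on every exit path, including the early return inside the defrag branch. A sketch of the same discipline with POSIX rwlocks (ra_chain and ra_chain_deliver are made-up stand-ins):

#include <pthread.h>
#include <stddef.h>

struct ra { int proto; struct ra *next; };

static pthread_rwlock_t ra_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct ra *ra_chain;     /* walked by readers, edited by writers */

/* Returns 1 if some entry claimed the packet, 0 otherwise.
 * Every return path must drop the read lock it took. */
int ra_chain_deliver(int proto)
{
    pthread_rwlock_rdlock(&ra_lock);
    for (struct ra *r = ra_chain; r; r = r->next) {
        if (r->proto == proto) {
            pthread_rwlock_unlock(&ra_lock);  /* exit path #1 */
            return 1;
        }
    }
    pthread_rwlock_unlock(&ra_lock);          /* exit path #2 */
    return 0;
}

int main(void)
{
    struct ra entry = { .proto = 89, .next = NULL };
    ra_chain = &entry;
    return !ra_chain_deliver(89);  /* exits 0 when the entry matches */
}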
@@ -214,59 +211,19 @@ static int ip_run_ipprot(struct sk_buff *skb, struct iphdr *iph,
return ret;
}
-extern struct sock *raw_v4_input(struct sk_buff *, struct iphdr *, int);
-
-/*
- * Deliver IP Packets to the higher protocol layers.
- */
-int ip_local_deliver(struct sk_buff *skb)
+static inline int ip_local_deliver_finish(struct sk_buff *skb)
{
struct iphdr *iph = skb->nh.iph;
-#ifndef CONFIG_IP_ALWAYS_DEFRAG
- /*
- * Reassemble IP fragments.
- */
+#ifdef CONFIG_NETFILTER_DEBUG
+ nf_debug_ip_local_deliver(skb);
+#endif /*CONFIG_NETFILTER_DEBUG*/
- if (iph->frag_off & htons(IP_MF|IP_OFFSET)) {
- skb = ip_defrag(skb);
- if (!skb)
- return 0;
- iph = skb->nh.iph;
+ /* Free rx_dev before enqueueing to sockets */
+ if (skb->rx_dev) {
+ dev_put(skb->rx_dev);
+ skb->rx_dev = NULL;
}
-#endif
-
-#ifdef CONFIG_IP_MASQUERADE
- /* Do we need to de-masquerade this packet? */
- if((IPCB(skb)->flags&IPSKB_MASQUERADED)) {
- /* Some masq modules can re-inject packets if
- * badly configured.
- */
- printk(KERN_DEBUG "ip_input(): demasq recursion detected. "
- "Check masq modules configuration\n");
- kfree_skb(skb);
- return 0;
- } else {
- int ret = ip_fw_demasquerade(&skb);
-
- if (ret < 0) {
- kfree_skb(skb);
- return 0;
- }
- if (ret) {
- iph = skb->nh.iph;
- IPCB(skb)->flags |= IPSKB_MASQUERADED;
- dst_release(skb->dst);
- skb->dst = NULL;
- if (ip_route_input(skb, iph->daddr, iph->saddr,
- iph->tos, skb->dev)) {
- kfree_skb(skb);
- return 0;
- }
- return skb->dst->input(skb);
- }
- }
-#endif
/* Point into the IP datagram, just past the header. */
skb->h.raw = skb->nh.raw + iph->ihl*4;
@@ -284,19 +241,26 @@ int ip_local_deliver(struct sk_buff *skb)
if(raw_sk != NULL)
raw_sk = raw_v4_input(skb, iph, hash);
+ read_lock(&inet_protocol_lock);
ipprot = (struct inet_protocol *) inet_protos[hash];
flag = 0;
if(ipprot != NULL) {
if(raw_sk == NULL &&
ipprot->next == NULL &&
ipprot->protocol == iph->protocol) {
+ int ret;
+
/* Fast path... */
- return ipprot->handler(skb, (ntohs(iph->tot_len) -
- (iph->ihl * 4)));
+ ret = ipprot->handler(skb, (ntohs(iph->tot_len) -
+ (iph->ihl * 4)));
+
+ read_unlock(&inet_protocol_lock);
+ return ret;
} else {
flag = ip_run_ipprot(skb, iph, ipprot, (raw_sk != NULL));
}
- }
+ }
+ read_unlock(&inet_protocol_lock);
/* All protocols checked.
* If this packet was a broadcast, we may *not* reply to it, since that
@@ -305,6 +269,7 @@ int ip_local_deliver(struct sk_buff *skb)
*/
if(raw_sk != NULL) { /* Shift to last raw user */
raw_rcv(raw_sk, skb);
+ sock_put(raw_sk);
} else if (!flag) { /* Free and report errors */
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);
kfree_skb(skb);
@@ -315,75 +280,31 @@ int ip_local_deliver(struct sk_buff *skb)
}
/*
- * Main IP Receive routine.
+ * Deliver IP Packets to the higher protocol layers.
*/
-int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
+int ip_local_deliver(struct sk_buff *skb)
{
struct iphdr *iph = skb->nh.iph;
-#ifdef CONFIG_FIREWALL
- int fwres;
- u16 rport;
-#endif /* CONFIG_FIREWALL */
-
- /* When the interface is in promisc. mode, drop all the crap
- * that it receives, do not try to analyse it.
- */
- if (skb->pkt_type == PACKET_OTHERHOST)
- goto drop;
-
- ip_statistics.IpInReceives++;
/*
- * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum.
- *
- * Is the datagram acceptable?
- *
- * 1. Length at least the size of an ip header
- * 2. Version of 4
- * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
- * 4. Doesn't have a bogus length
+ * Reassemble IP fragments.
*/
- if (skb->len < sizeof(struct iphdr))
- goto inhdr_error;
- if (iph->ihl < 5 || iph->version != 4 || ip_fast_csum((u8 *)iph, iph->ihl) != 0)
- goto inhdr_error;
-
- {
- __u32 len = ntohs(iph->tot_len);
- if (skb->len < len)
- goto inhdr_error;
-
- /* Our transport medium may have padded the buffer out. Now we know it
- * is IP we can trim to the true length of the frame.
- * Note this now means skb->len holds ntohs(iph->tot_len).
- */
- __skb_trim(skb, len);
- }
-
-#ifdef CONFIG_IP_ALWAYS_DEFRAG
- /* Won't send ICMP reply, since skb->dst == NULL. --RR */
if (iph->frag_off & htons(IP_MF|IP_OFFSET)) {
skb = ip_defrag(skb);
if (!skb)
return 0;
iph = skb->nh.iph;
- ip_send_check(iph);
}
-#endif
-#ifdef CONFIG_FIREWALL
- /*
- * See if the firewall wants to dispose of the packet.
- *
- * We can't do ICMP reply or local delivery before routing,
- * so we delay those decisions until after route. --RR
- */
- fwres = call_in_firewall(PF_INET, dev, iph, &rport, &skb);
- if (fwres < FW_ACCEPT && fwres != FW_REJECT)
- goto drop;
- iph = skb->nh.iph;
-#endif /* CONFIG_FIREWALL */
+ return NF_HOOK(PF_INET, NF_IP_LOCAL_IN, skb, skb->dev, NULL,
+ ip_local_deliver_finish);
+}
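
NF_HOOK runs the hooks registered for (PF_INET, NF_IP_LOCAL_IN) and only calls the continuation — here ip_local_deliver_finish — when the packet is accepted; a dropped packet is freed inside the hook machinery and the finish function never runs. A heavily simplified userspace model of that verdict/continuation flow (run_hooks, hook_fn and okfn_t are illustrative, not the kernel macro):

#include <stdio.h>

enum verdict { V_ACCEPT, V_DROP };

typedef enum verdict (*hook_fn)(int pkt);
typedef int (*okfn_t)(int pkt);

/* Run each registered hook in order; stop on the first DROP. */
static int run_hooks(hook_fn *hooks, int nhooks, int pkt, okfn_t okfn)
{
    for (int i = 0; i < nhooks; i++)
        if (hooks[i](pkt) == V_DROP)
            return 0;       /* packet dropped; continuation skipped */
    return okfn(pkt);       /* all hooks accepted: continue the path */
}

static enum verdict drop_odd(int pkt) { return (pkt & 1) ? V_DROP : V_ACCEPT; }
static int deliver(int pkt) { printf("delivered %d\n", pkt); return 1; }

int main(void)
{
    hook_fn hooks[] = { drop_odd };
    run_hooks(hooks, 1, 2, deliver);   /* delivered */
    run_hooks(hooks, 1, 3, deliver);   /* dropped before delivery */
    return 0;
}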
+
+static inline int ip_rcv_finish(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct iphdr *iph = skb->nh.iph;
/*
* Initialise the virtual path cache for the packet. It describes
@@ -428,36 +349,85 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
opt = &(IPCB(skb)->opt);
if (opt->srr) {
- struct in_device *in_dev = dev->ip_ptr;
- if (in_dev && !IN_DEV_SOURCE_ROUTE(in_dev)) {
- if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
- printk(KERN_INFO "source route option %d.%d.%d.%d -> %d.%d.%d.%d\n",
- NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
- goto drop;
+ struct in_device *in_dev = in_dev_get(dev);
+ if (in_dev) {
+ if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
+ if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
+ printk(KERN_INFO "source route option %d.%d.%d.%d -> %d.%d.%d.%d\n",
+ NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
+ in_dev_put(in_dev);
+ goto drop;
+ }
+ in_dev_put(in_dev);
}
if (ip_options_rcv_srr(skb))
goto drop;
}
}
-#ifdef CONFIG_FIREWALL
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- if (fwres == FW_REDIRECT && (IPCB(skb)->redirport = rport) != 0)
- return ip_local_deliver(skb);
-#endif /* CONFIG_IP_TRANSPARENT_PROXY */
+ return skb->dst->input(skb);
+
+inhdr_error:
+ ip_statistics.IpInHdrErrors++;
+drop:
+ kfree_skb(skb);
+ return(0);
+}
+
+/*
+ * Main IP Receive routine.
+ */
+int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+{
+ struct iphdr *iph = skb->nh.iph;
- if (fwres == FW_REJECT) {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+ /* When the interface is in promisc. mode, drop all the crap
+ * that it receives, do not try to analyse it.
+ */
+ if (skb->pkt_type == PACKET_OTHERHOST)
goto drop;
+
+ ip_statistics.IpInReceives++;
+
+ if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+ goto out;
+
+ /*
+ * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum.
+ *
+ * Is the datagram acceptable?
+ *
+ * 1. Length at least the size of an ip header
+ * 2. Version of 4
+ * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
+ * 4. Doesn't have a bogus length
+ */
+
+ if (skb->len < sizeof(struct iphdr))
+ goto inhdr_error;
+ if (iph->ihl < 5 || iph->version != 4 || ip_fast_csum((u8 *)iph, iph->ihl) != 0)
+ goto inhdr_error;
+
+ {
+ __u32 len = ntohs(iph->tot_len);
+ if (skb->len < len)
+ goto inhdr_error;
+
+ /* Our transport medium may have padded the buffer out. Now we know it
+ * is IP we can trim to the true length of the frame.
+ * Note this now means skb->len holds ntohs(iph->tot_len).
+ */
+ __skb_trim(skb, len);
}
-#endif /* CONFIG_FIREWALL */
- return skb->dst->input(skb);
+ return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
+ ip_rcv_finish);
inhdr_error:
ip_statistics.IpInHdrErrors++;
drop:
kfree_skb(skb);
+out:
return(0);
}
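
The four acceptance tests above come straight from RFC 1122, 3.1.2.2. A self-contained sketch of the same checks, with a plain one's-complement sum standing in for the kernel's optimized ip_fast_csum() (a valid header sums to 0xffff):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Fold a one's-complement sum over 16-bit words of the header. */
static uint16_t csum16(const uint8_t *p, size_t words)
{
    uint32_t sum = 0;
    for (size_t i = 0; i < words; i++)
        sum += (uint32_t)(p[2*i] << 8) | p[2*i + 1];
    while (sum >> 16)
        sum = (sum & 0xffff) + (sum >> 16);
    return (uint16_t)sum;
}

/* Returns 1 if buf/len starts with an acceptable IPv4 header. */
int ipv4_header_ok(const uint8_t *buf, size_t len)
{
    if (len < 20)                         /* 1. at least a minimal iphdr */
        return 0;
    unsigned version = buf[0] >> 4;
    unsigned ihl = buf[0] & 0x0f;         /* header length, 32-bit words */
    if (version != 4 || ihl < 5)          /* 2. version and minimum ihl */
        return 0;
    if ((size_t)ihl * 4 > len)
        return 0;
    if (csum16(buf, ihl * 2) != 0xffff)   /* 3. checksum verifies */
        return 0;
    size_t tot_len = ((size_t)buf[2] << 8) | buf[3];
    if (tot_len < (size_t)ihl * 4 || tot_len > len)
        return 0;                         /* 4. no bogus length */
    return 1;                             /* caller may trim to tot_len */
}

int main(void)
{
    uint8_t h[20] = {0};
    h[0] = 0x45;                 /* version 4, ihl 5 */
    h[3] = 20;                   /* tot_len = 20 (header only) */
    uint16_t c = (uint16_t)~csum16(h, 10);
    h[10] = c >> 8; h[11] = c & 0xff;       /* makes the sum 0xffff */
    printf("valid: %d\n", ipv4_header_ok(h, 20));    /* 1 */
    h[8] = 64;                   /* flip a byte: checksum now fails */
    printf("corrupt: %d\n", ipv4_header_ok(h, 20));  /* 0 */
    return 0;
}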
diff --git a/net/ipv4/ip_masq.c b/net/ipv4/ip_masq.c
deleted file mode 100644
index 28c3093d5..000000000
--- a/net/ipv4/ip_masq.c
+++ /dev/null
@@ -1,2453 +0,0 @@
-/*
- *
- * Masquerading functionality
- *
- * Copyright (c) 1994 Pauline Middelink
- *
- * $Id: ip_masq.c,v 1.34 1999/03/17 01:53:51 davem Exp $
- *
- *
- * See ip_fw.c for original log
- *
- * Fixes:
- * Juan Jose Ciarlante : Modularized application masquerading (see ip_masq_app.c)
- * Juan Jose Ciarlante : New struct ip_masq_seq that holds output/input delta seq.
- * Juan Jose Ciarlante : Added hashed lookup by proto,maddr,mport and proto,saddr,sport
- * Juan Jose Ciarlante : Fixed deadlock if free ports get exhausted
- * Juan Jose Ciarlante : Added NO_ADDR status flag.
- * Richard Lynch : Added IP Autoforward
- * Nigel Metheringham : Added ICMP handling for demasquerade
- * Nigel Metheringham : Checksum checking of masqueraded data
- * Nigel Metheringham : Better handling of timeouts of TCP conns
- * Delian Delchev : Added support for ICMP requests and replies
- * Nigel Metheringham : ICMP in ICMP handling, tidy ups, bug fixes, made ICMP optional
- * Juan Jose Ciarlante : re-assign maddr if no packet received from outside
- * Juan Jose Ciarlante : ported to 2.1 tree
- * Juan Jose Ciarlante : reworked control connections
- * Steven Clarke : Added Port Forwarding
- * Juan Jose Ciarlante : Just ONE ip_masq_new (!)
- * Juan Jose Ciarlante : IP masq modules support
- * Juan Jose Ciarlante : don't go into search loop if mport specified
- * Juan Jose Ciarlante : locking
- * Steven Clarke : IP_MASQ_S_xx state design
- * Juan Jose Ciarlante : IP_MASQ_S state implementation
- * Juan Jose Ciarlante : xx_get() clears timer, _put() inserts it
- * Juan Jose Ciarlante : create /proc/net/ip_masq/
- * Juan Jose Ciarlante : reworked checksums (save payload csum if possible)
- * Juan Jose Ciarlante : added missing ip_fw_masquerade checksum
- * Juan Jose Ciarlante : csum savings
- * Juan Jose Ciarlante : added user-space tunnel creation/del, etc
- * Juan Jose Ciarlante : (last) moved to ip_masq_user runtime module
- * Juan Jose Ciarlante : user timeout handling again
- * Juan Jose Ciarlante : make new modules support optional
- * Juan Jose Ciarlante : u-space context => locks reworked
- * Juan Jose Ciarlante : fixed stupid SMP locking bug
- * Juan Jose Ciarlante : fixed "tap"ing in demasq path by copy-on-write
- * Juan Jose Ciarlante : make masq_proto_doff() robust against fake sized/corrupted packets
- * Kai Bankett : do not toss other IP protos in proto_doff()
- * Dan Kegel : pointed correct NAT behavior for UDP streams
- *
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-#ifdef CONFIG_KMOD
-#include <linux/kmod.h>
-#endif
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/skbuff.h>
-#include <asm/system.h>
-#include <linux/stat.h>
-#include <linux/proc_fs.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/inet.h>
-#include <linux/init.h>
-#include <net/protocol.h>
-#include <net/icmp.h>
-#include <net/tcp.h>
-#include <net/udp.h>
-#include <net/checksum.h>
-#include <net/ip_masq.h>
-
-#ifdef CONFIG_IP_MASQUERADE_MOD
-#include <net/ip_masq_mod.h>
-#endif
-
-#include <linux/sysctl.h>
-#include <linux/ip_fw.h>
-#include <linux/ip_masq.h>
-
-int sysctl_ip_masq_debug = 0;
-
-/*
- * Exported wrapper
- */
-int ip_masq_get_debug_level(void)
-{
- return sysctl_ip_masq_debug;
-}
-
-struct ip_masq_hook *ip_masq_user_hook = NULL;
-
-/*
- * Timeout table[state]
- */
-/* static int masq_timeout_table[IP_MASQ_S_LAST+1] = { */
-static struct ip_masq_timeout_table masq_timeout_table = {
- ATOMIC_INIT(0), /* refcnt */
- 0, /* scale */
- {
- 30*60*HZ, /* IP_MASQ_S_NONE, */
- 15*60*HZ, /* IP_MASQ_S_ESTABLISHED, */
- 2*60*HZ, /* IP_MASQ_S_SYN_SENT, */
- 1*60*HZ, /* IP_MASQ_S_SYN_RECV, */
- 2*60*HZ, /* IP_MASQ_S_FIN_WAIT, */
- 2*60*HZ, /* IP_MASQ_S_TIME_WAIT, */
- 10*HZ, /* IP_MASQ_S_CLOSE, */
- 60*HZ, /* IP_MASQ_S_CLOSE_WAIT, */
- 30*HZ, /* IP_MASQ_S_LAST_ACK, */
- 2*60*HZ, /* IP_MASQ_S_LISTEN, */
- 5*60*HZ, /* IP_MASQ_S_UDP, */
- 1*60*HZ, /* IP_MASQ_S_ICMP, */
- 2*HZ,/* IP_MASQ_S_LAST */
- }, /* timeout */
-};
-
-#define MASQUERADE_EXPIRE_RETRY masq_timeout_table.timeout[IP_MASQ_S_TIME_WAIT]
-
-static const char * state_name_table[IP_MASQ_S_LAST+1] = {
- "NONE", /* IP_MASQ_S_NONE, */
- "ESTABLISHED", /* IP_MASQ_S_ESTABLISHED, */
- "SYN_SENT", /* IP_MASQ_S_SYN_SENT, */
- "SYN_RECV", /* IP_MASQ_S_SYN_RECV, */
- "FIN_WAIT", /* IP_MASQ_S_FIN_WAIT, */
- "TIME_WAIT", /* IP_MASQ_S_TIME_WAIT, */
- "CLOSE", /* IP_MASQ_S_CLOSE, */
- "CLOSE_WAIT", /* IP_MASQ_S_CLOSE_WAIT, */
- "LAST_ACK", /* IP_MASQ_S_LAST_ACK, */
- "LISTEN", /* IP_MASQ_S_LISTEN, */
- "UDP", /* IP_MASQ_S_UDP, */
- "ICMP", /* IP_MASQ_S_ICMP, */
- "BUG!", /* IP_MASQ_S_LAST */
-};
-
-#define mNO IP_MASQ_S_NONE
-#define mES IP_MASQ_S_ESTABLISHED
-#define mSS IP_MASQ_S_SYN_SENT
-#define mSR IP_MASQ_S_SYN_RECV
-#define mFW IP_MASQ_S_FIN_WAIT
-#define mTW IP_MASQ_S_TIME_WAIT
-#define mCL IP_MASQ_S_CLOSE
-#define mCW IP_MASQ_S_CLOSE_WAIT
-#define mLA IP_MASQ_S_LAST_ACK
-#define mLI IP_MASQ_S_LISTEN
-
-struct masq_tcp_states_t {
- int next_state[IP_MASQ_S_LAST]; /* should be _LAST_TCP */
-};
-
-const char * ip_masq_state_name(int state)
-{
- if (state >= IP_MASQ_S_LAST)
- return "ERR!";
- return state_name_table[state];
-}
-
-struct masq_tcp_states_t masq_tcp_states [] = {
-/* INPUT */
-/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI */
-/*syn*/ {{mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR }},
-/*fin*/ {{mCL, mCW, mSS, mTW, mTW, mTW, mCL, mCW, mLA, mLI }},
-/*ack*/ {{mCL, mES, mSS, mSR, mFW, mTW, mCL, mCW, mCL, mLI }},
-/*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI }},
-
-/* OUTPUT */
-/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI */
-/*syn*/ {{mSS, mES, mSS, mES, mSS, mSS, mSS, mSS, mSS, mLI }},
-/*fin*/ {{mTW, mFW, mSS, mTW, mFW, mTW, mCL, mTW, mLA, mLI }},
-/*ack*/ {{mES, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mES }},
-/*rst*/ {{mCL, mCL, mSS, mCL, mCL, mTW, mCL, mCL, mCL, mCL }},
-};
-
-static __inline__ int masq_tcp_state_idx(struct tcphdr *th, int output)
-{
- /*
- * [0-3]: input states, [4-7]: output.
- */
- if (output)
- output=4;
-
- if (th->rst)
- return output+3;
- if (th->syn)
- return output+0;
- if (th->fin)
- return output+1;
- if (th->ack)
- return output+2;
- return -1;
-}
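
masq_tcp_state_idx() maps the highest-priority flag (rst, then syn, fin, ack) to a row, offset by 4 for the output direction, and the row plus the current state index select the next state from masq_tcp_states. A runnable miniature using just the four input rows of the table above:

#include <stdio.h>

enum { mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, NSTATES };

/* Input-direction rows only: syn, fin, ack, rst (as in the table above). */
static const int next_state[4][NSTATES] = {
    /*syn*/ { mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR },
    /*fin*/ { mCL, mCW, mSS, mTW, mTW, mTW, mCL, mCW, mLA, mLI },
    /*ack*/ { mCL, mES, mSS, mSR, mFW, mTW, mCL, mCW, mCL, mLI },
    /*rst*/ { mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI },
};

/* Flag priority mirrors masq_tcp_state_idx: rst > syn > fin > ack. */
static int state_idx(int syn, int fin, int ack, int rst)
{
    if (rst) return 3;
    if (syn) return 0;
    if (fin) return 1;
    if (ack) return 2;
    return -1;
}

int main(void)
{
    int state = mNO;                                  /* fresh entry */
    state = next_state[state_idx(1, 0, 0, 0)][state]; /* incoming SYN */
    printf("after SYN: %d (expect %d = mSR)\n", state, mSR);
    state = next_state[state_idx(0, 0, 1, 0)][state]; /* incoming ACK */
    printf("after ACK: %d (expect %d = mSR)\n", state, mSR);
    return 0;
}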
-
-
-
-static int masq_set_state_timeout(struct ip_masq *ms, int state)
-{
- struct ip_masq_timeout_table *mstim = ms->timeout_table;
- int scale;
-
- /*
- * Use default timeout table if no specific for this entry
- */
- if (!mstim)
- mstim = &masq_timeout_table;
-
- ms->timeout = mstim->timeout[ms->state=state];
- scale = mstim->scale;
-
- if (scale<0)
- ms->timeout >>= -scale;
- else if (scale > 0)
- ms->timeout <<= scale;
-
- return state;
-}
-
-static int masq_tcp_state(struct ip_masq *ms, int output, struct tcphdr *th)
-{
- int state_idx;
- int new_state = IP_MASQ_S_CLOSE;
-
- if ((state_idx = masq_tcp_state_idx(th, output)) < 0) {
- IP_MASQ_DEBUG(1, "masq_state_idx(%d)=%d!!!\n",
- output, state_idx);
- goto tcp_state_out;
- }
-
- new_state = masq_tcp_states[state_idx].next_state[ms->state];
-
-tcp_state_out:
- if (new_state!=ms->state)
- IP_MASQ_DEBUG(1, "%s %s [%c%c%c%c] %08lX:%04X-%08lX:%04X state: %s->%s\n",
- masq_proto_name(ms->protocol),
- output? "output" : "input ",
- th->syn? 'S' : '.',
- th->fin? 'F' : '.',
- th->ack? 'A' : '.',
- th->rst? 'R' : '.',
- ntohl(ms->saddr), ntohs(ms->sport),
- ntohl(ms->daddr), ntohs(ms->dport),
- ip_masq_state_name(ms->state),
- ip_masq_state_name(new_state));
- return masq_set_state_timeout(ms, new_state);
-}
-
-
-/*
- * Handle state transitions
- */
-static int masq_set_state(struct ip_masq *ms, int output, struct iphdr *iph, void *tp)
-{
- switch (iph->protocol) {
- case IPPROTO_ICMP:
- return masq_set_state_timeout(ms, IP_MASQ_S_ICMP);
- case IPPROTO_UDP:
- return masq_set_state_timeout(ms, IP_MASQ_S_UDP);
- case IPPROTO_TCP:
- return masq_tcp_state(ms, output, tp);
- }
- return -1;
-}
-
-/*
- * Set LISTEN timeout. (ip_masq_put will set up the timer)
- */
-int ip_masq_listen(struct ip_masq *ms)
-{
- masq_set_state_timeout(ms, IP_MASQ_S_LISTEN);
- return ms->timeout;
-}
-
-/*
- * Dynamic address rewriting
- */
-extern int sysctl_ip_dynaddr;
-
-/*
- * Lookup lock
- */
-rwlock_t __ip_masq_lock = RW_LOCK_UNLOCKED;
-
-/*
- * Implement IP packet masquerading
- */
-
-/*
- * Converts an ICMP reply type into the equivalent request type
- */
-static __inline__ const __u8 icmp_type_request(__u8 type)
-{
- switch (type)
- {
- case ICMP_ECHOREPLY: return ICMP_ECHO; break;
- case ICMP_TIMESTAMPREPLY: return ICMP_TIMESTAMP; break;
- case ICMP_INFO_REPLY: return ICMP_INFO_REQUEST; break;
- case ICMP_ADDRESSREPLY: return ICMP_ADDRESS; break;
- default: return (255); break;
- }
-}
-
-/*
- * Helper macros - attempt to make code clearer!
- */
-
-/* ID used in ICMP lookups */
-#define icmp_id(icmph) ((icmph->un).echo.id)
-/* (port) hash value used in ICMP lookups for requests */
-#define icmp_hv_req(icmph) ((__u16)(icmph->code+(__u16)(icmph->type<<8)))
-/* (port) hash value used in ICMP lookups for replies */
-#define icmp_hv_rep(icmph) ((__u16)(icmph->code+(__u16)(icmp_type_request(icmph->type)<<8)))
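
ICMP has no ports, so lookups key on the echo id plus a synthetic port built from code and type; icmp_hv_rep() first folds a reply type back to its request so that an ECHOREPLY hashes to the bucket the ECHO created. A tiny demonstration (constants inlined rather than pulled from <linux/icmp.h>):

#include <stdio.h>
#include <stdint.h>

#define ICMP_ECHO       8
#define ICMP_ECHOREPLY  0

/* Reply type -> request type, as in icmp_type_request() above. */
static uint8_t type_request(uint8_t type)
{
    return type == ICMP_ECHOREPLY ? ICMP_ECHO : 255;
}

static uint16_t hv_req(uint8_t type, uint8_t code)
{
    return (uint16_t)(code + (uint16_t)(type << 8));
}

static uint16_t hv_rep(uint8_t type, uint8_t code)
{
    return (uint16_t)(code + (uint16_t)(type_request(type) << 8));
}

int main(void)
{
    /* An echo request and its reply land on the same synthetic port. */
    printf("request hv: 0x%04x\n", hv_req(ICMP_ECHO, 0));      /* 0x0800 */
    printf("reply   hv: 0x%04x\n", hv_rep(ICMP_ECHOREPLY, 0)); /* 0x0800 */
    return 0;
}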
-
-/*
- * Last masq_port number in use.
- * Will cycle within PORT_MASQ boundaries.
- */
-static __u16 masq_port = PORT_MASQ_BEGIN;
-static spinlock_t masq_port_lock = SPIN_LOCK_UNLOCKED;
-
-/*
- * free ports counters (UDP & TCP)
- *
- * Their value is _less_ than or _equal_ to the actual number of free
- * ports: entries allocated with the same masq port but a different
- * masq addr (firewall iface address) are counted, but they actually
- * consume no more than 1 port.
- *
- * Greater values could justify lowering the MASQ_EXPIRATION setting
- * as a way to manage the 'masq entries' resource.
- *
- * By default we will reuse the masq port iff the (output) connection
- * (5-tuple) is not duplicated.
- * This may break midentd and others ...
- */
-
-#ifdef CONFIG_IP_MASQ_NREUSE
-#define PORT_MASQ_MUL 1
-#else
-#define PORT_MASQ_MUL 10
-#endif
-
-/*
- * At the moment, hardcoded in sync with masq_proto_num
- */
-atomic_t ip_masq_free_ports[3] = {
- ATOMIC_INIT((PORT_MASQ_END-PORT_MASQ_BEGIN) * PORT_MASQ_MUL),/* UDP */
- ATOMIC_INIT((PORT_MASQ_END-PORT_MASQ_BEGIN) * PORT_MASQ_MUL),/* TCP */
- ATOMIC_INIT((PORT_MASQ_END-PORT_MASQ_BEGIN) * PORT_MASQ_MUL),/* ICMP */
-};
-
-/*
- * Counts entries that have been requested with a specific mport.
- * Used for incoming packets to "relax" input rule (port in MASQ range).
- */
-atomic_t mport_count = ATOMIC_INIT(0);
-
-EXPORT_SYMBOL(ip_masq_get_debug_level);
-EXPORT_SYMBOL(ip_masq_new);
-EXPORT_SYMBOL(ip_masq_listen);
-EXPORT_SYMBOL(ip_masq_free_ports);
-EXPORT_SYMBOL(ip_masq_out_get);
-EXPORT_SYMBOL(ip_masq_in_get);
-EXPORT_SYMBOL(ip_masq_put);
-EXPORT_SYMBOL(ip_masq_control_add);
-EXPORT_SYMBOL(ip_masq_control_del);
-EXPORT_SYMBOL(ip_masq_control_get);
-EXPORT_SYMBOL(ip_masq_user_hook);
-EXPORT_SYMBOL(ip_masq_m_tab);
-EXPORT_SYMBOL(ip_masq_state_name);
-EXPORT_SYMBOL(ip_masq_select_addr);
-EXPORT_SYMBOL(__ip_masq_lock);
-
-/*
- * 2 ip_masq hash tables: for input and output pkts lookups.
- */
-
-struct ip_masq *ip_masq_m_tab[IP_MASQ_TAB_SIZE];
-struct ip_masq *ip_masq_s_tab[IP_MASQ_TAB_SIZE];
-
-/*
- * timeouts
- */
-
-#if 000 /* FIXED timeout handling */
-static struct ip_fw_masq ip_masq_dummy = {
- MASQUERADE_EXPIRE_TCP,
- MASQUERADE_EXPIRE_TCP_FIN,
- MASQUERADE_EXPIRE_UDP
-};
-
-EXPORT_SYMBOL(ip_masq_expire);
-struct ip_fw_masq *ip_masq_expire = &ip_masq_dummy;
-#endif
-
-/*
- * These flags enable non-strict d{addr,port} checks
- * These flags enable non-strict d{addr,port} checks.
- * by m{addr,port} and s{addr,port} this is quite easy
- */
-
-#define MASQ_DADDR_PASS (IP_MASQ_F_NO_DADDR|IP_MASQ_F_DLOOSE)
-#define MASQ_DPORT_PASS (IP_MASQ_F_NO_DPORT|IP_MASQ_F_DLOOSE)
-
-/*
- * By default enable dest loose semantics
- */
-#define CONFIG_IP_MASQ_LOOSE_DEFAULT 1
-
-
-/*
- * Sets masq expiration (deletion) and adds the timer;
- * if timeout==0, cancels the expiration.
- * Warning: it does not check/delete previous timer!
- */
-
-static void __ip_masq_set_expire(struct ip_masq *ms, unsigned long tout)
-{
- if (tout) {
- ms->timer.expires = jiffies+tout;
- add_timer(&ms->timer);
- } else {
- del_timer(&ms->timer);
- }
-}
-
-
-/*
- * Returns hash value
- */
-
-static __inline__ unsigned
-ip_masq_hash_key(unsigned proto, __u32 addr, __u16 port)
-{
- return (proto^ntohl(addr)^ntohs(port)) & (IP_MASQ_TAB_SIZE-1);
-}
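
The bucket index is simply proto XOR host-order address XOR host-order port, masked with IP_MASQ_TAB_SIZE-1 — which only works as a mask if the table size is a power of two. A standalone version (TAB_SIZE is an illustrative stand-in):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define TAB_SIZE 256   /* must be a power of two for the mask to work */

static unsigned hash_key(unsigned proto, uint32_t addr_be, uint16_t port_be)
{
    return (proto ^ ntohl(addr_be) ^ ntohs(port_be)) & (TAB_SIZE - 1);
}

int main(void)
{
    uint32_t addr = htonl(0xc0a80101);   /* 192.168.1.1 */
    uint16_t port = htons(61001);        /* a PORT_MASQ-range port */
    printf("bucket = %u\n", hash_key(6 /* TCP */, addr, port));
    return 0;
}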
-
-/*
- * Hashes ip_masq by its proto,addrs,ports.
- * should be called with locked tables.
- * returns bool success.
- */
-
-static int ip_masq_hash(struct ip_masq *ms)
-{
- unsigned hash;
-
- if (ms->flags & IP_MASQ_F_HASHED) {
- IP_MASQ_ERR( "ip_masq_hash(): request for already hashed, called from %p\n",
- __builtin_return_address(0));
- return 0;
- }
- /*
- * Hash by proto,m{addr,port}
- */
- hash = ip_masq_hash_key(ms->protocol, ms->maddr, ms->mport);
- ms->m_link = ip_masq_m_tab[hash];
- atomic_inc(&ms->refcnt);
- ip_masq_m_tab[hash] = ms;
-
- /*
- * Hash by proto,s{addr,port}
- */
- hash = ip_masq_hash_key(ms->protocol, ms->saddr, ms->sport);
- ms->s_link = ip_masq_s_tab[hash];
- atomic_inc(&ms->refcnt);
- ip_masq_s_tab[hash] = ms;
-
-
- ms->flags |= IP_MASQ_F_HASHED;
- return 1;
-}
-
-/*
- * UNhashes ip_masq from ip_masq_[ms]_tables.
- * should be called with locked tables.
- * returns bool success.
- */
-
-static int ip_masq_unhash(struct ip_masq *ms)
-{
- unsigned hash;
- struct ip_masq ** ms_p;
- if (!(ms->flags & IP_MASQ_F_HASHED)) {
- IP_MASQ_ERR( "ip_masq_unhash(): request for unhash flagged, called from %p\n",
- __builtin_return_address(0));
- return 0;
- }
- /*
- * UNhash by m{addr,port}
- */
- hash = ip_masq_hash_key(ms->protocol, ms->maddr, ms->mport);
- for (ms_p = &ip_masq_m_tab[hash]; *ms_p ; ms_p = &(*ms_p)->m_link)
- if (ms == (*ms_p)) {
- atomic_dec(&ms->refcnt);
- *ms_p = ms->m_link;
- break;
- }
-
- /*
- * UNhash by s{addr,port}
- */
- hash = ip_masq_hash_key(ms->protocol, ms->saddr, ms->sport);
- for (ms_p = &ip_masq_s_tab[hash]; *ms_p ; ms_p = &(*ms_p)->s_link)
- if (ms == (*ms_p)) {
- atomic_dec(&ms->refcnt);
- *ms_p = ms->s_link;
- break;
- }
-
- ms->flags &= ~IP_MASQ_F_HASHED;
- return 1;
-}
-
-/*
- * Returns ip_masq associated with supplied parameters, either
- * broken out of the ip/tcp headers or directly supplied for those
- * pathological protocols with address/port in the data stream
- * (ftp, irc). addresses and ports are in network order.
- * called for pkts coming from OUTside-to-INside the firewall.
- *
- * s_addr, s_port: pkt source address (foreign host)
- * d_addr, d_port: pkt dest address (firewall)
- *
- * NB. Cannot check the destination address, only the incoming port.
- * reason: archie.doc.ac.uk has 6 interfaces, you send to
- * phoenix and get a reply from any other interface(==dst)!
- *
- * [Only for UDP] - AC
- *
- * Caller must lock tables
- */
-
-static struct ip_masq * __ip_masq_in_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
-{
- unsigned hash;
- struct ip_masq *ms = NULL;
-
- hash = ip_masq_hash_key(protocol, d_addr, d_port);
-
- for(ms = ip_masq_m_tab[hash]; ms ; ms = ms->m_link) {
- if (protocol==ms->protocol &&
- (d_addr==ms->maddr && d_port==ms->mport) &&
- (s_addr==ms->daddr || ms->flags & MASQ_DADDR_PASS) &&
- (s_port==ms->dport || ms->flags & MASQ_DPORT_PASS)
- ) {
- IP_MASQ_DEBUG(2, "look/in %d %08X:%04hX->%08X:%04hX OK\n",
- protocol,
- s_addr,
- s_port,
- d_addr,
- d_port);
- atomic_inc(&ms->refcnt);
- goto out;
- }
- }
- IP_MASQ_DEBUG(2, "look/in %d %08X:%04hX->%08X:%04hX fail\n",
- protocol,
- s_addr,
- s_port,
- d_addr,
- d_port);
-
-out:
- return ms;
-}
-
-/*
- * Returns ip_masq associated with supplied parameters, either
- * broken out of the ip/tcp headers or directly supplied for those
- * pathological protocols with address/port in the data stream
- * (ftp, irc). addresses and ports are in network order.
- * called for pkts coming from inside-to-OUTside the firewall.
- *
- * Normally we know the source address and port but for some protocols
- * (e.g. ftp PASV) we do not know the source port initially. Alas the
- * hash is keyed on source port so if the first lookup fails then try again
- * with a zero port, this time only looking at entries marked "no source
- * port".
- *
- * Caller must lock tables
- */
-
-static struct ip_masq * __ip_masq_out_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
-{
- unsigned hash;
- struct ip_masq *ms = NULL;
-
- /*
- * Check for "full" addressed entries
- */
- hash = ip_masq_hash_key(protocol, s_addr, s_port);
-
- for(ms = ip_masq_s_tab[hash]; ms ; ms = ms->s_link) {
- if (protocol == ms->protocol &&
- s_addr == ms->saddr && s_port == ms->sport &&
- (d_addr==ms->daddr || ms->flags & MASQ_DADDR_PASS) &&
- (d_port==ms->dport || ms->flags & MASQ_DPORT_PASS)
- ) {
- IP_MASQ_DEBUG(2, "lk/out1 %d %08X:%04hX->%08X:%04hX OK\n",
- protocol,
- s_addr,
- s_port,
- d_addr,
- d_port);
-
- atomic_inc(&ms->refcnt);
- goto out;
- }
-
- }
-
- /*
- * Check for NO_SPORT entries
- */
- hash = ip_masq_hash_key(protocol, s_addr, 0);
- for(ms = ip_masq_s_tab[hash]; ms ; ms = ms->s_link) {
- if (ms->flags & IP_MASQ_F_NO_SPORT &&
- protocol == ms->protocol &&
- s_addr == ms->saddr &&
- (d_addr==ms->daddr || ms->flags & MASQ_DADDR_PASS) &&
- (d_port==ms->dport || ms->flags & MASQ_DPORT_PASS)
- ) {
- IP_MASQ_DEBUG(2, "lk/out2 %d %08X:%04hX->%08X:%04hX OK\n",
- protocol,
- s_addr,
- s_port,
- d_addr,
- d_port);
-
- atomic_inc(&ms->refcnt);
- goto out;
- }
- }
- IP_MASQ_DEBUG(2, "lk/out1 %d %08X:%04hX->%08X:%04hX fail\n",
- protocol,
- s_addr,
- s_port,
- d_addr,
- d_port);
-
-out:
- return ms;
-}
-
-#ifdef CONFIG_IP_MASQ_NREUSE
-/*
- * Returns ip_masq for given proto,m_addr,m_port.
- * called by allocation routine to find an unused m_port.
- *
- * Caller must lock tables
- */
-
-static struct ip_masq * __ip_masq_getbym(int protocol, __u32 m_addr, __u16 m_port)
-{
- unsigned hash;
- struct ip_masq *ms = NULL;
-
- hash = ip_masq_hash_key(protocol, m_addr, m_port);
-
- for(ms = ip_masq_m_tab[hash]; ms ; ms = ms->m_link) {
- if ( protocol==ms->protocol &&
- (m_addr==ms->maddr && m_port==ms->mport)) {
- atomic_inc(&ms->refcnt);
- goto out;
- }
- }
-
-out:
- return ms;
-}
-#endif
-
-struct ip_masq * ip_masq_out_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
-{
- struct ip_masq *ms;
-
- read_lock(&__ip_masq_lock);
- ms = __ip_masq_out_get(protocol, s_addr, s_port, d_addr, d_port);
- read_unlock(&__ip_masq_lock);
-
- if (ms)
- __ip_masq_set_expire(ms, 0);
- return ms;
-}
-
-struct ip_masq * ip_masq_in_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
-{
- struct ip_masq *ms;
-
- read_lock(&__ip_masq_lock);
- ms = __ip_masq_in_get(protocol, s_addr, s_port, d_addr, d_port);
- read_unlock(&__ip_masq_lock);
-
- if (ms)
- __ip_masq_set_expire(ms, 0);
- return ms;
-}
-
-static __inline__ void __ip_masq_put(struct ip_masq *ms)
-{
- atomic_dec(&ms->refcnt);
-}
-
-void ip_masq_put(struct ip_masq *ms)
-{
- /*
- * Decrement refcnt
- */
- __ip_masq_put(ms);
-
- /*
- * if refcnt==2 (2 hashes)
- */
- if (atomic_read(&ms->refcnt)==2) {
- __ip_masq_set_expire(ms, ms->timeout);
- } else {
- IP_MASQ_DEBUG(0, "did not set timer with refcnt=%d, called from %p\n",
- atomic_read(&ms->refcnt),
- __builtin_return_address(0));
- }
-}
-
-static void masq_expire(unsigned long data)
-{
- struct ip_masq *ms = (struct ip_masq *)data;
- ms->timeout = MASQUERADE_EXPIRE_RETRY;
-
- /*
- * hey, I'm using it
- */
- atomic_inc(&ms->refcnt);
-
- IP_MASQ_DEBUG(1, "Masqueraded %s %08lX:%04X expired\n",
- masq_proto_name(ms->protocol),
- ntohl(ms->saddr),ntohs(ms->sport));
-
- write_lock(&__ip_masq_lock);
-
-#if 0000
- /*
- * Already locked, do bounce ...
- */
- if (ip_masq_nlocks(&__ip_masq_lock) != 1) {
- goto masq_expire_later;
- }
-
-#endif
- /*
- * do I control anybody?
- */
- if (atomic_read(&ms->n_control))
- goto masq_expire_later;
-
- /*
- * does anybody control me?
- */
-
- if (ms->control)
- ip_masq_control_del(ms);
-
- if (ip_masq_unhash(ms)) {
- if (ms->flags&IP_MASQ_F_MPORT) {
- atomic_dec(&mport_count);
- } else {
- atomic_inc(ip_masq_free_ports + masq_proto_num(ms->protocol));
- }
- ip_masq_unbind_app(ms);
- }
-
- /*
- * refcnt==1 implies I'm the only referrer
- */
- if (atomic_read(&ms->refcnt) == 1) {
- kfree_s(ms,sizeof(*ms));
- MOD_DEC_USE_COUNT;
- goto masq_expire_out;
- }
-
-masq_expire_later:
- IP_MASQ_DEBUG(0, "masq_expire delayed: %s %08lX:%04X->%08lX:%04X masq.refcnt-1=%d masq.n_control=%d\n",
- masq_proto_name(ms->protocol),
- ntohl(ms->saddr), ntohs(ms->sport),
- ntohl(ms->daddr), ntohs(ms->dport),
- atomic_read(&ms->refcnt)-1,
- atomic_read(&ms->n_control));
-
- ip_masq_put(ms);
-
-masq_expire_out:
- write_unlock(&__ip_masq_lock);
-}
-
-static __u16 get_next_mport(void)
-{
- __u16 mport;
-
- spin_lock_irq(&masq_port_lock);
- /*
- * Try the next available port number
- */
- mport = htons(masq_port++);
- if (masq_port==PORT_MASQ_END) masq_port = PORT_MASQ_BEGIN;
-
- spin_unlock_irq(&masq_port_lock);
- return mport;
-}
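
get_next_mport() hands out candidate ports from a cycling counter under masq_port_lock, wrapping from PORT_MASQ_END back to PORT_MASQ_BEGIN; ip_masq_new() below retries at most PORT_MASQ_END - PORT_MASQ_BEGIN times before giving up. A single-threaded model of the wrap-around (61000/65096 match that era's ip_masq.h defaults as far as I can tell; treat them as illustrative):

#include <stdio.h>
#include <stdint.h>

#define PORT_MASQ_BEGIN 61000
#define PORT_MASQ_END   65096

static uint16_t masq_port = PORT_MASQ_BEGIN;

/* Hand out the next candidate port, wrapping at the end of the range. */
static uint16_t get_next_mport(void)
{
    uint16_t p = masq_port++;
    if (masq_port == PORT_MASQ_END)
        masq_port = PORT_MASQ_BEGIN;
    return p;
}

int main(void)
{
    /* Exhausting the range cycles back to the first port. */
    for (long i = 0; i < PORT_MASQ_END - PORT_MASQ_BEGIN; i++)
        get_next_mport();
    printf("after one full cycle: %u\n", get_next_mport()); /* 61000 */
    return 0;
}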
-
-/*
- * Create a new masquerade list entry, also allocate an
- * unused mport, keeping the portnumber between the
- * unused mport, keeping the port number between the
- * given boundaries PORT_MASQ_BEGIN and PORT_MASQ_END.
- * Be careful, it can be called from u-space
- */
-
-struct ip_masq * ip_masq_new(int proto, __u32 maddr, __u16 mport, __u32 saddr, __u16 sport, __u32 daddr, __u16 dport, unsigned mflags)
-{
- struct ip_masq *ms, *mst;
- int ports_tried;
- atomic_t *free_ports_p = NULL;
- static int n_fails = 0;
- int prio;
-
-
- if (masq_proto_num(proto)!=-1 && mport == 0) {
- free_ports_p = ip_masq_free_ports + masq_proto_num(proto);
-
- if (atomic_read(free_ports_p) == 0) {
- if (++n_fails < 5)
- IP_MASQ_ERR( "ip_masq_new(proto=%s): no free ports.\n",
- masq_proto_name(proto));
- return NULL;
- }
- }
-
- prio = (mflags&IP_MASQ_F_USER) ? GFP_KERNEL : GFP_ATOMIC;
-
- ms = (struct ip_masq *) kmalloc(sizeof(struct ip_masq), prio);
- if (ms == NULL) {
- if (++n_fails < 5)
- IP_MASQ_ERR("ip_masq_new(proto=%s): no memory available.\n",
- masq_proto_name(proto));
- return NULL;
- }
- MOD_INC_USE_COUNT;
- memset(ms, 0, sizeof(*ms));
- init_timer(&ms->timer);
- ms->timer.data = (unsigned long)ms;
- ms->timer.function = masq_expire;
- ms->protocol = proto;
- ms->saddr = saddr;
- ms->sport = sport;
- ms->daddr = daddr;
- ms->dport = dport;
- ms->flags = mflags;
- ms->app_data = NULL;
- ms->control = NULL;
-
- atomic_set(&ms->n_control,0);
- atomic_set(&ms->refcnt,0);
-
- if (proto == IPPROTO_UDP && !mport)
-#ifdef CONFIG_IP_MASQ_LOOSE_DEFAULT
- /*
- * Flag this tunnel as "dest loose"
- *
- */
- ms->flags |= IP_MASQ_F_DLOOSE;
-#else
- ms->flags |= IP_MASQ_F_NO_DADDR;
-#endif
-
-
- /* get masq address from rif */
- ms->maddr = maddr;
-
- /*
- * This flag will allow masq. addr (ms->maddr)
- * to follow forwarding interface address.
- */
- ms->flags |= IP_MASQ_F_NO_REPLY;
-
- /*
- * We want a specific mport. Be careful.
- */
- if (masq_proto_num(proto) == -1 || mport) {
- ms->mport = mport;
-
- /*
- * Check 5-tuple uniqueness
- */
- if (mflags & IP_MASQ_F_USER)
- write_lock_bh(&__ip_masq_lock);
- else
- write_lock(&__ip_masq_lock);
-
- mst = __ip_masq_in_get(proto, daddr, dport, maddr, mport);
- if (mst==NULL) {
- ms->flags |= IP_MASQ_F_MPORT;
-
- atomic_inc(&mport_count);
- ip_masq_hash(ms);
-
- if (mflags & IP_MASQ_F_USER)
- write_unlock_bh(&__ip_masq_lock);
- else
- write_unlock(&__ip_masq_lock);
-
- ip_masq_bind_app(ms);
- atomic_inc(&ms->refcnt);
- masq_set_state_timeout(ms, IP_MASQ_S_NONE);
- return ms;
- }
- if (mflags & IP_MASQ_F_USER)
- write_unlock_bh(&__ip_masq_lock);
- else
- write_unlock(&__ip_masq_lock);
-
- __ip_masq_put(mst);
-
- IP_MASQ_ERR( "Already used connection: %s, %d.%d.%d.%d:%d => %d.%d.%d.%d:%d, called from %p\n",
- masq_proto_name(proto),
- NIPQUAD(maddr), ntohs(mport),
- NIPQUAD(daddr), ntohs(dport),
- __builtin_return_address(0));
-
-
- goto mport_nono;
- }
-
-
- for (ports_tried = 0;
- (atomic_read(free_ports_p) && (ports_tried <= (PORT_MASQ_END - PORT_MASQ_BEGIN)));
- ports_tried++){
-
- mport = ms->mport = get_next_mport();
- /*
- * lookup to find out if this connection is used.
- */
-
- if (mflags & IP_MASQ_F_USER)
- write_lock_bh(&__ip_masq_lock);
- else
- write_lock(&__ip_masq_lock);
-
-#ifdef CONFIG_IP_MASQ_NREUSE
- mst = __ip_masq_getbym(proto, maddr, mport);
-#else
- mst = __ip_masq_in_get(proto, daddr, dport, maddr, mport);
-#endif
- if (mst == NULL) {
-
- if (atomic_read(free_ports_p) == 0) {
- if (mflags & IP_MASQ_F_USER)
- write_unlock_bh(&__ip_masq_lock);
- else
- write_unlock(&__ip_masq_lock);
-
- break;
- }
- atomic_dec(free_ports_p);
- ip_masq_hash(ms);
-
- if (mflags & IP_MASQ_F_USER)
- write_unlock_bh(&__ip_masq_lock);
- else
- write_unlock(&__ip_masq_lock);
-
- ip_masq_bind_app(ms);
- n_fails = 0;
- atomic_inc(&ms->refcnt);
- masq_set_state_timeout(ms, IP_MASQ_S_NONE);
- return ms;
- }
- if (mflags & IP_MASQ_F_USER)
- write_unlock_bh(&__ip_masq_lock);
- else
- write_unlock(&__ip_masq_lock);
-
- __ip_masq_put(mst);
- }
-
- if (++n_fails < 5)
- IP_MASQ_ERR( "ip_masq_new(proto=%s): could not get free masq entry (free=%d).\n",
- masq_proto_name(ms->protocol),
- atomic_read(free_ports_p));
-mport_nono:
- kfree_s(ms, sizeof(*ms));
-
- MOD_DEC_USE_COUNT;
- return NULL;
-}
-
-/*
- * Get transport protocol data offset, check against size
- * return:
- * 0 if other IP proto
- * -1 if error
- */
-static __inline__ int proto_doff(unsigned proto, char *th, unsigned size)
-{
- int ret = -1;
- switch (proto) {
- case IPPROTO_ICMP:
- if (size >= sizeof(struct icmphdr))
- ret = sizeof(struct icmphdr);
- break;
- case IPPROTO_UDP:
- if (size >= sizeof(struct udphdr))
- ret = sizeof(struct udphdr);
- break;
- case IPPROTO_TCP:
- /*
- * In this case, this check _also_ avoids
- * touching an invalid pointer if
- * size is invalid
- */
- if (size >= sizeof(struct tcphdr)) {
- ret = ((struct tcphdr*)th)->doff << 2;
- if (ret > size) {
- ret = -1 ;
- }
- }
-
- break;
- default:
- /* Other proto: nothing to say, for now :) */
- ret = 0;
- }
- if (ret < 0)
- IP_MASQ_DEBUG(0, "mess proto_doff for proto=%d, size =%d\n",
- proto, size);
- return ret;
-}
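
proto_doff() returns the transport header length and validates a TCP doff against the remaining payload, so forged headers cannot push later pointer arithmetic past the end of the packet. A standalone equivalent of the TCP case, using raw byte access instead of struct tcphdr and adding a lower-bound check the original leaves out:

#include <stdio.h>
#include <stddef.h>

/* TCP data offset lives in the top 4 bits of byte 12, in 32-bit words. */
static int tcp_doff(const unsigned char *th, size_t size)
{
    if (size < 20)                 /* shorter than a minimal TCP header */
        return -1;
    int doff = (th[12] >> 4) * 4;  /* bytes of header, options included */
    if (doff < 20 || (size_t)doff > size)
        return -1;                 /* forged or truncated header */
    return doff;
}

int main(void)
{
    unsigned char th[20] = {0};
    th[12] = 5 << 4;                          /* doff = 5 words = 20 bytes */
    printf("ok: %d\n", tcp_doff(th, 20));     /* 20 */
    th[12] = 15 << 4;                         /* claims a 60-byte header */
    printf("forged: %d\n", tcp_doff(th, 20)); /* -1: exceeds the packet */
    return 0;
}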
-
-int ip_fw_masquerade(struct sk_buff **skb_p, __u32 maddr)
-{
- struct sk_buff *skb = *skb_p;
- struct iphdr *iph = skb->nh.iph;
- union ip_masq_tphdr h;
- struct ip_masq *ms;
- int size;
-
- /*
- * doff holds transport protocol data offset
- * csum holds its checksum
- * csum_ok says if csum is valid
- */
- int doff = 0;
- int csum = 0;
- int csum_ok = 0;
-
- /*
- * We can only masquerade protocols with ports... and hack some ICMPs
- */
-
- h.raw = (char*) iph + iph->ihl * 4;
- size = ntohs(iph->tot_len) - (iph->ihl * 4);
-
-
- doff = proto_doff(iph->protocol, h.raw, size);
- if (doff <= 0) {
- /*
- * Output path: do not pass other IP protos nor
- * invalid packets.
- */
- return -1;
- }
-
- switch (iph->protocol) {
- case IPPROTO_ICMP:
- return(ip_fw_masq_icmp(skb_p, maddr));
- case IPPROTO_UDP:
- if (h.uh->check == 0)
- /* No UDP checksum */
- break;
- case IPPROTO_TCP:
- /* Make sure packet is in the masq range */
- IP_MASQ_DEBUG(3, "O-pkt: %s size=%d\n",
- masq_proto_name(iph->protocol),
- size);
-
-#ifdef CONFIG_IP_MASQ_DEBUG
- if (ip_masq_get_debug_level() > 3) {
- skb->ip_summed = CHECKSUM_NONE;
- }
-#endif
- /* Check that the checksum is OK */
- switch (skb->ip_summed)
- {
- case CHECKSUM_NONE:
- {
- csum = csum_partial(h.raw + doff, size - doff, 0);
- IP_MASQ_DEBUG(3, "O-pkt: %s I-datacsum=%d\n",
- masq_proto_name(iph->protocol),
- csum);
-
- skb->csum = csum_partial(h.raw , doff, csum);
- }
- case CHECKSUM_HW:
- if (csum_tcpudp_magic(iph->saddr, iph->daddr,
- size, iph->protocol, skb->csum))
- {
- IP_MASQ_DEBUG(0, "Outgoing failed %s checksum from %d.%d.%d.%d (size=%d)!\n",
- masq_proto_name(iph->protocol),
- NIPQUAD(iph->saddr),
- size);
- return -1;
- }
- default:
- /* CHECKSUM_UNNECESSARY */
- }
- break;
- default:
- return -1;
- }
- /*
- * Now hunt the list to see if we have an old entry
- */
-
- /* h.raw = (char*) iph + iph->ihl * 4; */
-
- IP_MASQ_DEBUG(2, "Outgoing %s %08lX:%04X -> %08lX:%04X\n",
- masq_proto_name(iph->protocol),
- ntohl(iph->saddr), ntohs(h.portp[0]),
- ntohl(iph->daddr), ntohs(h.portp[1]));
-
- ms = ip_masq_out_get_iph(iph);
- if (ms!=NULL) {
-
- /*
- * If sysctl !=0 and no pkt has been received yet
- * in this tunnel and routing iface address has changed...
- * "You are welcome, diald".
- */
- if ( sysctl_ip_dynaddr && ms->flags & IP_MASQ_F_NO_REPLY && maddr != ms->maddr) {
-
- if (sysctl_ip_dynaddr > 1) {
- IP_MASQ_INFO( "ip_fw_masquerade(): change masq.addr from %d.%d.%d.%d to %d.%d.%d.%d\n",
- NIPQUAD(ms->maddr),NIPQUAD(maddr));
- }
-
- write_lock(&__ip_masq_lock);
-
- ip_masq_unhash(ms);
- ms->maddr = maddr;
- ip_masq_hash(ms);
-
- write_unlock(&__ip_masq_lock);
- }
-
- /*
- * Set sport if not defined yet (e.g. ftp PASV). Because
- * masq entries are hashed on sport, unhash with old value
- * and hash with new.
- */
-
- if ( ms->flags & IP_MASQ_F_NO_SPORT && ms->protocol == IPPROTO_TCP ) {
- ms->flags &= ~IP_MASQ_F_NO_SPORT;
-
- write_lock(&__ip_masq_lock);
-
- ip_masq_unhash(ms);
- ms->sport = h.portp[0];
- ip_masq_hash(ms); /* hash on new sport */
-
- write_unlock(&__ip_masq_lock);
-
- IP_MASQ_DEBUG(1, "ip_fw_masquerade(): filled sport=%d\n",
- ntohs(ms->sport));
- }
- if (ms->flags & IP_MASQ_F_DLOOSE) {
- /*
- * update dest loose values
- */
- ms->dport = h.portp[1];
- ms->daddr = iph->daddr;
- }
- } else {
- /*
- * Nope, not found, create a new entry for it
- */
-
-#ifdef CONFIG_IP_MASQUERADE_MOD
- if (!(ms = ip_masq_mod_out_create(skb, iph, maddr)))
-#endif
- ms = ip_masq_new(iph->protocol,
- maddr, 0,
- iph->saddr, h.portp[0],
- iph->daddr, h.portp[1],
- 0);
- if (ms == NULL)
- return -1;
- }
-
- /*
- * Call module's output update hook
- */
-
-#ifdef CONFIG_IP_MASQUERADE_MOD
- ip_masq_mod_out_update(skb, iph, ms);
-#endif
-
- /*
- * Change the fragments origin
- */
-
- size = skb->len - (h.raw - skb->nh.raw);
-
- /*
- * Set iph addr and port from ip_masq obj.
- */
- iph->saddr = ms->maddr;
- h.portp[0] = ms->mport;
-
- /*
- * Invalidate csum saving if tunnel has masq helper
- */
-
- if (ms->app)
- csum_ok = 0;
-
- /*
- * Attempt ip_masq_app call.
- * will fix ip_masq and iph seq stuff
- */
- if (ip_masq_app_pkt_out(ms, skb_p, maddr) != 0)
- {
- /*
- * skb has possibly changed, update pointers.
- */
- skb = *skb_p;
- iph = skb->nh.iph;
- h.raw = (char*) iph + iph->ihl *4;
- size = skb->len - (h.raw - skb->nh.raw);
- /* doff should not have changed */
- }
-
- /*
- * Adjust packet accordingly to protocol
- */
-
- /*
- * Transport's payload partial csum
- */
-
- if (!csum_ok) {
- csum = csum_partial(h.raw + doff, size - doff, 0);
- }
- skb->csum = csum;
-
- IP_MASQ_DEBUG(3, "O-pkt: %s size=%d O-datacsum=%d\n",
- masq_proto_name(iph->protocol),
- size,
- csum);
-
- /*
- * Protocol csum
- */
- switch (iph->protocol) {
- case IPPROTO_TCP:
- h.th->check = 0;
- h.th->check=csum_tcpudp_magic(iph->saddr, iph->daddr,
- size, iph->protocol,
- csum_partial(h.raw , doff, csum));
- IP_MASQ_DEBUG(3, "O-pkt: %s O-csum=%d (+%d)\n",
- masq_proto_name(iph->protocol),
- h.th->check,
- (char*) & (h.th->check) - (char*) h.raw);
-
- break;
- case IPPROTO_UDP:
- h.uh->check = 0;
- h.uh->check=csum_tcpudp_magic(iph->saddr, iph->daddr,
- size, iph->protocol,
- csum_partial(h.raw , doff, csum));
- if (h.uh->check == 0)
- h.uh->check = 0xFFFF;
- IP_MASQ_DEBUG(3, "O-pkt: %s O-csum=%d (+%d)\n",
- masq_proto_name(iph->protocol),
- h.uh->check,
- (char*) &(h.uh->check)- (char*) h.raw);
- break;
- }
- ip_send_check(iph);
-
- IP_MASQ_DEBUG(2, "O-routed from %08lX:%04X with masq.addr %08lX\n",
- ntohl(ms->maddr),ntohs(ms->mport),ntohl(maddr));
-
- masq_set_state(ms, 1, iph, h.portp);
- ip_masq_put(ms);
-
- return 0;
- }
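
Rewriting saddr and sport invalidates the TCP/UDP checksum, so the code above recomputes it from the (possibly cached) payload sum, the transport header, and a pseudo-header built from the new addresses. A compact userspace rendition of that fold, ignoring the kernel's csum_partial() chaining and byte-order tricks:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

static uint32_t sum16(const uint8_t *p, size_t len, uint32_t sum)
{
    for (size_t i = 0; i + 1 < len; i += 2)
        sum += (uint32_t)(p[i] << 8) | p[i + 1];
    if (len & 1)
        sum += (uint32_t)p[len - 1] << 8;  /* pad the odd final byte */
    return sum;
}

static uint16_t fold(uint32_t sum)
{
    while (sum >> 16)
        sum = (sum & 0xffff) + (sum >> 16);
    return (uint16_t)~sum;
}

/* Transport checksum = ~(pseudo-header + header + payload). */
static uint16_t l4_csum(uint32_t saddr, uint32_t daddr, uint8_t proto,
                        const uint8_t *seg, size_t len)
{
    uint32_t sum = 0;
    sum += (saddr >> 16) + (saddr & 0xffff);
    sum += (daddr >> 16) + (daddr & 0xffff);
    sum += proto + (uint32_t)len;
    return fold(sum16(seg, len, sum));
}

int main(void)
{
    /* A toy UDP segment with its checksum field (bytes 6-7) zeroed. */
    uint8_t seg[8] = { 0xde, 0xad, 0x00, 0x07, 0x00, 0x08, 0x00, 0x00 };
    /* Same segment, new source address: the checksum must be redone. */
    printf("before masq: 0x%04x\n",
           l4_csum(0x0a000001, 0xc0a80101, 17, seg, sizeof(seg)));
    printf("after  masq: 0x%04x\n",
           l4_csum(0xc0a801fe, 0xc0a80101, 17, seg, sizeof(seg)));
    return 0;
}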
-
-/*
- * Restore original addresses and ports in the original IP
- * datagram if the failing packet has been [de]masqueraded.
- * This is ugly in the extreme. We no longer have the original
- * packet so we have to reconstruct it from the failing packet
- * plus data in the masq tables. The resulting "original data"
- * should be good enough to tell the sender which session to
- * throttle. Relies on far too much knowledge of masq internals,
- * there ought to be a better way - KAO 990303.
- *
- * Moved here from icmp.c - JJC.
- * Already known: type == ICMP_DEST_UNREACH, IPSKB_MASQUERADED
- * skb->nh.iph points to original header.
- *
- * Must try both OUT and IN tables; we could add a flag
- * ala IPSKB_MASQUERADED to avoid a 2nd table lookup, but this is VERY
- * unlikely because routing makes the mtu decision before reaching
- * ip_fw_masquerade().
- *
- */
-int ip_fw_unmasq_icmp(struct sk_buff *skb) {
- struct ip_masq *ms;
- struct iphdr *iph = skb->nh.iph;
- __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
-
- /*
- * Always called from _bh context: use read_[un]lock()
- */
-
- /*
- * Peek "out" table, this packet has bounced:
- * out->in(frag_needed!)->OUT[icmp]
- *
- * iph->daddr is IN host
- * iph->saddr is OUT host
- */
- read_lock(&__ip_masq_lock);
- ms = __ip_masq_out_get(iph->protocol,
- iph->daddr, portp[1],
- iph->saddr, portp[0]);
- read_unlock(&__ip_masq_lock);
- if (ms) {
- IP_MASQ_DEBUG(1, "Incoming frag_need rewrited from %d.%d.%d.%d to %d.%d.%d.%d\n",
- NIPQUAD(iph->daddr), NIPQUAD(ms->maddr));
- iph->daddr = ms->maddr;
- portp[1] = ms->mport;
- __ip_masq_put(ms);
- return 1;
- }
- /*
- * Peek "in" table
- * in->out(frag_needed!)->IN[icmp]
- *
- * iph->daddr is OUT host
- * iph->saddr is MASQ host
- *
- */
- read_lock(&__ip_masq_lock);
- ms = __ip_masq_in_get(iph->protocol,
- iph->daddr, portp[1],
- iph->saddr, portp[0]);
- read_unlock(&__ip_masq_lock);
- if (ms) {
- IP_MASQ_DEBUG(1, "Outgoing frag_need rewrited from %d.%d.%d.%d to %d.%d.%d.%d\n",
- NIPQUAD(iph->saddr), NIPQUAD(ms->saddr));
- iph->saddr = ms->saddr;
- portp[0] = ms->sport;
- __ip_masq_put(ms);
- return 1;
- }
- return 0;
-
-}
-/*
- * Handle ICMP messages in forward direction.
- * Find any that might be relevant, check against existing connections,
- * forward to masqueraded host if relevant.
- * Currently handles error types - unreachable, quench, ttl exceeded
- */
-
-int ip_fw_masq_icmp(struct sk_buff **skb_p, __u32 maddr)
-{
- struct sk_buff *skb = *skb_p;
- struct iphdr *iph = skb->nh.iph;
- struct icmphdr *icmph = (struct icmphdr *)((char *)iph + (iph->ihl<<2));
- struct iphdr *ciph; /* The ip header contained within the ICMP */
- __u16 *pptr; /* port numbers from TCP/UDP contained header */
- struct ip_masq *ms;
- unsigned short len = ntohs(iph->tot_len) - (iph->ihl * 4);
-
- IP_MASQ_DEBUG(2, "Incoming forward ICMP (%d,%d) %lX -> %lX\n",
- icmph->type, ntohs(icmp_id(icmph)),
- ntohl(iph->saddr), ntohl(iph->daddr));
-
-#ifdef CONFIG_IP_MASQUERADE_ICMP
- if ((icmph->type == ICMP_ECHO ) ||
- (icmph->type == ICMP_TIMESTAMP ) ||
- (icmph->type == ICMP_INFO_REQUEST ) ||
- (icmph->type == ICMP_ADDRESS )) {
-
- IP_MASQ_DEBUG(2, "icmp request rcv %lX->%lX id %d type %d\n",
- ntohl(iph->saddr),
- ntohl(iph->daddr),
- ntohs(icmp_id(icmph)),
- icmph->type);
-
- ms = ip_masq_out_get(iph->protocol,
- iph->saddr,
- icmp_id(icmph),
- iph->daddr,
- icmp_hv_req(icmph));
- if (ms == NULL) {
- ms = ip_masq_new(iph->protocol,
- maddr, 0,
- iph->saddr, icmp_id(icmph),
- iph->daddr, icmp_hv_req(icmph),
- 0);
- if (ms == NULL)
- return (-1);
- IP_MASQ_DEBUG(1, "Created new icmp entry\n");
- }
- /* Rewrite source address */
-
- /*
- * If sysctl !=0 and no pkt has been received yet
- * in this tunnel and routing iface address has changed...
- * "You are welcome, diald".
- */
- if ( sysctl_ip_dynaddr && ms->flags & IP_MASQ_F_NO_REPLY && maddr != ms->maddr) {
-
- if (sysctl_ip_dynaddr > 1) {
- IP_MASQ_INFO( "ip_fw_masq_icmp(): change masq.addr %d.%d.%d.%d to %d.%d.%d.%d",
- NIPQUAD(ms->maddr), NIPQUAD(maddr));
- }
-
- write_lock(&__ip_masq_lock);
-
- ip_masq_unhash(ms);
- ms->maddr = maddr;
- ip_masq_hash(ms);
-
- write_unlock(&__ip_masq_lock);
- }
-
- iph->saddr = ms->maddr;
- ip_send_check(iph);
- /* Rewrite port (id) */
- (icmph->un).echo.id = ms->mport;
- icmph->checksum = 0;
- icmph->checksum = ip_compute_csum((unsigned char *)icmph, len);
-
- IP_MASQ_DEBUG(2, "icmp request rwt %lX->%lX id %d type %d\n",
- ntohl(iph->saddr),
- ntohl(iph->daddr),
- ntohs(icmp_id(icmph)),
- icmph->type);
-
- masq_set_state(ms, 1, iph, icmph);
- ip_masq_put(ms);
-
- return 1;
- }
-#endif
-
- /*
- * Work through seeing if this is for us.
- * These checks are supposed to be in an order that
- * means easy things are checked first to speed up
- * processing.... however this means that some
- * packets will manage to get a long way down this
- * stack and then be rejected, but that's life
- */
- if ((icmph->type != ICMP_DEST_UNREACH) &&
- (icmph->type != ICMP_SOURCE_QUENCH) &&
- (icmph->type != ICMP_TIME_EXCEEDED))
- return 0;
-
- /* Now find the contained IP header */
- ciph = (struct iphdr *) (icmph + 1);
-
-#ifdef CONFIG_IP_MASQUERADE_ICMP
- if (ciph->protocol == IPPROTO_ICMP) {
- /*
- * This section handles ICMP errors for ICMP packets
- */
- struct icmphdr *cicmph = (struct icmphdr *)((char *)ciph +
- (ciph->ihl<<2));
-
-
- IP_MASQ_DEBUG(2, "fw icmp/icmp rcv %lX->%lX id %d type %d\n",
- ntohl(ciph->saddr),
- ntohl(ciph->daddr),
- ntohs(icmp_id(cicmph)),
- cicmph->type);
-
- read_lock(&__ip_masq_lock);
- ms = __ip_masq_out_get(ciph->protocol,
- ciph->daddr,
- icmp_id(cicmph),
- ciph->saddr,
- icmp_hv_rep(cicmph));
- read_unlock(&__ip_masq_lock);
-
- if (ms == NULL)
- return 0;
-
- /* Now we do real damage to this packet...! */
- /* First change the source IP address, and recalc checksum */
- iph->saddr = ms->maddr;
- ip_send_check(iph);
-
- /* Now change the *dest* address in the contained IP */
- ciph->daddr = ms->maddr;
- __ip_masq_put(ms);
-
- ip_send_check(ciph);
-
- /* Change the ID to the masqed one! */
- (cicmph->un).echo.id = ms->mport;
-
- /* And finally the ICMP checksum */
- icmph->checksum = 0;
- icmph->checksum = ip_compute_csum((unsigned char *) icmph, len);
-
-
- IP_MASQ_DEBUG(2, "fw icmp/icmp rwt %lX->%lX id %d type %d\n",
- ntohl(ciph->saddr),
- ntohl(ciph->daddr),
- ntohs(icmp_id(cicmph)),
- cicmph->type);
-
- return 1;
- }
-#endif /* CONFIG_IP_MASQUERADE_ICMP */
-
- /* We are only interested in ICMPs generated from TCP or UDP packets */
- if ((ciph->protocol != IPPROTO_UDP) && (ciph->protocol != IPPROTO_TCP))
- return 0;
-
- /*
- * Find the ports involved - this packet was
- * incoming so the ports are right way round
- * (but reversed relative to outer IP header!)
- */
- pptr = (__u16 *)&(((char *)ciph)[ciph->ihl*4]);
-#if 0
- if (ntohs(pptr[1]) < PORT_MASQ_BEGIN ||
- ntohs(pptr[1]) > PORT_MASQ_END)
- return 0;
-#endif
-
- /* Ensure the checksum is correct */
- if (ip_compute_csum((unsigned char *) icmph, len))
- {
- /* Failed checksum! */
- IP_MASQ_DEBUG(0, "forward ICMP: failed checksum from %d.%d.%d.%d!\n",
- NIPQUAD(iph->saddr));
- return(-1);
- }
-
-
- IP_MASQ_DEBUG(2, "Handling forward ICMP for %08lX:%04X -> %08lX:%04X\n",
- ntohl(ciph->saddr), ntohs(pptr[0]),
- ntohl(ciph->daddr), ntohs(pptr[1]));
-
-
-#if 0
- /* This is pretty much what __ip_masq_in_get_iph() does */
- ms = __ip_masq_in_get(ciph->protocol, ciph->saddr, pptr[0], ciph->daddr, pptr[1]);
-#endif
- read_lock(&__ip_masq_lock);
- ms = __ip_masq_out_get(ciph->protocol,
- ciph->daddr,
- pptr[1],
- ciph->saddr,
- pptr[0]);
- read_unlock(&__ip_masq_lock);
-
- if (ms == NULL)
- return 0;
-
- /* Now we do real damage to this packet...! */
- /* First change the source IP address, and recalc checksum */
- iph->saddr = ms->maddr;
- ip_send_check(iph);
-
- /* Now change the *dest* address in the contained IP */
- ciph->daddr = ms->maddr;
- ip_send_check(ciph);
-
- /* the TCP/UDP dest port - cannot redo check */
- pptr[1] = ms->mport;
- __ip_masq_put(ms);
-
- /* And finally the ICMP checksum */
- icmph->checksum = 0;
- icmph->checksum = ip_compute_csum((unsigned char *) icmph, len);
-
-
- IP_MASQ_DEBUG(2, "Rewrote forward ICMP to %08lX:%04X -> %08lX:%04X\n",
- ntohl(ciph->saddr), ntohs(pptr[0]),
- ntohl(ciph->daddr), ntohs(pptr[1]));
-
-
- return 1;
-}
-
-
-/*
- * Own skb_cow() beast, tweaked for rewriting commonly
- * used pointers in masq code
- */
-static struct sk_buff * masq_skb_cow(struct sk_buff **skb_p,
- struct iphdr **iph_p, unsigned char **t_p) {
- struct sk_buff *skb=(*skb_p);
- if (skb_cloned(skb)) {
- skb = skb_copy(skb, GFP_ATOMIC);
- if (skb) {
- /*
- * skb changed, update other pointers
- */
- struct iphdr *iph = skb->nh.iph;
- kfree_skb(*skb_p);
- *skb_p = skb;
- *iph_p = iph;
- *t_p = (char*) iph + iph->ihl * 4;
- }
- }
- return skb;
-}
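
masq_skb_cow() copies the skb only when it is cloned and then re-derives the caller's iph and transport pointers, since pointers into the old buffer go stale after the copy. A userspace model of the same copy-before-write step (struct buf and its shared flag stand in for sk_buff and skb_cloned()):

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct buf {
    int shared;          /* stand-in for skb_cloned() */
    unsigned char *data;
    size_t len;
};

/* Ensure *bp is privately owned before the caller scribbles on headers.
 * On copy, cached pointers into the old data go stale, so we hand back
 * the fresh data pointer just as masq_skb_cow re-derives iph and h.raw. */
static unsigned char *buf_cow(struct buf **bp)
{
    struct buf *b = *bp;
    if (b->shared) {
        struct buf *copy = malloc(sizeof(*copy));
        if (!copy)
            return NULL;
        copy->shared = 0;
        copy->len = b->len;
        copy->data = malloc(b->len);
        if (!copy->data) {
            free(copy);
            return NULL;
        }
        memcpy(copy->data, b->data, b->len);
        /* the original stays with its other holder; we use the copy */
        *bp = copy;
    }
    return (*bp)->data;
}

int main(void)
{
    struct buf *b = malloc(sizeof(*b));
    b->shared = 1;
    b->len = 4;
    b->data = malloc(4);
    memcpy(b->data, "abcd", 4);

    unsigned char *p = buf_cow(&b);  /* always re-fetch pointers here */
    p[0] = 'X';                      /* safe: private copy */
    printf("%.4s\n", b->data);       /* Xbcd */
    return 0;
}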
-
-/*
- * Handle ICMP messages in reverse (demasquerade) direction.
- * Find any that might be relevant, check against existing connections,
- * forward to masqueraded host if relevant.
- * Currently handles error types - unreachable, quench, ttl exceeded
- */
-
-int ip_fw_demasq_icmp(struct sk_buff **skb_p)
-{
- struct sk_buff *skb = *skb_p;
- struct iphdr *iph = skb->nh.iph;
- struct icmphdr *icmph = (struct icmphdr *)((char *)iph + (iph->ihl<<2));
- struct iphdr *ciph; /* The ip header contained within the ICMP */
- __u16 *pptr; /* port numbers from TCP/UDP contained header */
- struct ip_masq *ms;
- unsigned short len = ntohs(iph->tot_len) - (iph->ihl * 4);
-
-
- IP_MASQ_DEBUG(2, "icmp in/rev (%d,%d) %lX -> %lX\n",
- icmph->type, ntohs(icmp_id(icmph)),
- ntohl(iph->saddr), ntohl(iph->daddr));
-
-
-#ifdef CONFIG_IP_MASQUERADE_ICMP
- if ((icmph->type == ICMP_ECHOREPLY) ||
- (icmph->type == ICMP_TIMESTAMPREPLY) ||
- (icmph->type == ICMP_INFO_REPLY) ||
- (icmph->type == ICMP_ADDRESSREPLY)) {
-
- IP_MASQ_DEBUG(2, "icmp reply rcv %lX->%lX id %d type %d, req %d\n",
- ntohl(iph->saddr),
- ntohl(iph->daddr),
- ntohs(icmp_id(icmph)),
- icmph->type,
- icmp_type_request(icmph->type));
-
- ms = ip_masq_in_get(iph->protocol,
- iph->saddr,
- icmp_hv_rep(icmph),
- iph->daddr,
- icmp_id(icmph));
- if (ms == NULL)
- return 0;
-
- /*
- * got reply, so clear flag
- */
- ms->flags &= ~IP_MASQ_F_NO_REPLY;
-
- if ((skb=masq_skb_cow(skb_p, &iph, (unsigned char**)&icmph)) == NULL) {
- ip_masq_put(ms);
- return -1;
- }
-
- /* Reset source address */
- iph->daddr = ms->saddr;
- /* Redo IP header checksum */
- ip_send_check(iph);
- /* Set ID to fake port number */
- (icmph->un).echo.id = ms->sport;
- /* Reset ICMP checksum and set expiry */
- icmph->checksum=0;
- icmph->checksum=ip_compute_csum((unsigned char *)icmph,len);
-
-
-
- IP_MASQ_DEBUG(2, "icmp reply rwt %lX->%lX id %d type %d\n",
- ntohl(iph->saddr),
- ntohl(iph->daddr),
- ntohs(icmp_id(icmph)),
- icmph->type);
-
- masq_set_state(ms, 0, iph, icmph);
- ip_masq_put(ms);
-
- return 1;
- } else {
-#endif
- if ((icmph->type != ICMP_DEST_UNREACH) &&
- (icmph->type != ICMP_SOURCE_QUENCH) &&
- (icmph->type != ICMP_TIME_EXCEEDED))
- return 0;
-#ifdef CONFIG_IP_MASQUERADE_ICMP
- }
-#endif
- /*
- * If we get here we have an ICMP error of one of the above 3 types
- * Now find the contained IP header
- */
-
- ciph = (struct iphdr *) (icmph + 1);
-
-#ifdef CONFIG_IP_MASQUERADE_ICMP
- if (ciph->protocol == IPPROTO_ICMP) {
- /*
- * This section handles ICMP errors for ICMP packets
- *
- * First get a new ICMP header structure out of the IP packet
- */
- struct icmphdr *cicmph = (struct icmphdr *)((char *)ciph +
- (ciph->ihl<<2));
-
-
- IP_MASQ_DEBUG(2, "rv icmp/icmp rcv %lX->%lX id %d type %d\n",
- ntohl(ciph->saddr),
- ntohl(ciph->daddr),
- ntohs(icmp_id(cicmph)),
- cicmph->type);
-
- read_lock(&__ip_masq_lock);
- ms = __ip_masq_in_get(ciph->protocol,
- ciph->daddr,
- icmp_hv_req(cicmph),
- ciph->saddr,
- icmp_id(cicmph));
- read_unlock(&__ip_masq_lock);
-
- if (ms == NULL)
- return 0;
-
- if ((skb=masq_skb_cow(skb_p, &iph, (unsigned char**)&icmph)) == NULL) {
- __ip_masq_put(ms);
- return -1;
- }
- ciph = (struct iphdr *) (icmph + 1);
- cicmph = (struct icmphdr *)((char *)ciph +
- (ciph->ihl<<2));
- /* Now we do real damage to this packet...! */
- /* First change the dest IP address, and recalc checksum */
- iph->daddr = ms->saddr;
- ip_send_check(iph);
-
- /* Now change the *source* address in the contained IP */
- ciph->saddr = ms->saddr;
- ip_send_check(ciph);
-
- /* Change the ID to the original one! */
- (cicmph->un).echo.id = ms->sport;
- __ip_masq_put(ms);
-
- /* And finally the ICMP checksum */
- icmph->checksum = 0;
- icmph->checksum = ip_compute_csum((unsigned char *) icmph, len);
-
-
- IP_MASQ_DEBUG(2, "rv icmp/icmp rwt %lX->%lX id %d type %d\n",
- ntohl(ciph->saddr),
- ntohl(ciph->daddr),
- ntohs(icmp_id(cicmph)),
- cicmph->type);
-
- return 1;
- }
-#endif /* CONFIG_IP_MASQUERADE_ICMP */
-
- /* We are only interested in ICMPs generated from TCP or UDP packets */
- if ((ciph->protocol != IPPROTO_UDP) &&
- (ciph->protocol != IPPROTO_TCP))
- return 0;
-
- /*
- * Find the ports involved - remember this packet was
- * *outgoing* so the ports are reversed (and addresses)
- */
- pptr = (__u16 *)&(((char *)ciph)[ciph->ihl*4]);
- if (ntohs(pptr[0]) < PORT_MASQ_BEGIN ||
- ntohs(pptr[0]) > PORT_MASQ_END)
- return 0;
-
- /* Ensure the checksum is correct */
- if (ip_compute_csum((unsigned char *) icmph, len))
- {
- /* Failed checksum! */
- IP_MASQ_ERR( "reverse ICMP: failed checksum from %d.%d.%d.%d!\n",
- NIPQUAD(iph->saddr));
- return(-1);
- }
-
-
- IP_MASQ_DEBUG(2, "Handling reverse ICMP for %08lX:%04X -> %08lX:%04X\n",
- ntohl(ciph->saddr), ntohs(pptr[0]),
- ntohl(ciph->daddr), ntohs(pptr[1]));
-
-
- /* This is pretty much what __ip_masq_in_get_iph() does, except params are wrong way round */
- read_lock(&__ip_masq_lock);
- ms = __ip_masq_in_get(ciph->protocol,
- ciph->daddr,
- pptr[1],
- ciph->saddr,
- pptr[0]);
- read_unlock(&__ip_masq_lock);
-
- if (ms == NULL)
- return 0;
-
- if ((skb=masq_skb_cow(skb_p, &iph, (unsigned char**)&icmph)) == NULL) {
- __ip_masq_put(ms);
- return -1;
- }
- ciph = (struct iphdr *) (icmph + 1);
- pptr = (__u16 *)&(((char *)ciph)[ciph->ihl*4]);
-
- /* Now we do real damage to this packet...! */
- /* First change the dest IP address, and recalc checksum */
- iph->daddr = ms->saddr;
- ip_send_check(iph);
-
- /* Now change the *source* address in the contained IP */
- ciph->saddr = ms->saddr;
- ip_send_check(ciph);
-
- /* the TCP/UDP source port - cannot redo check */
- pptr[0] = ms->sport;
- __ip_masq_put(ms);
-
- /* And finally the ICMP checksum */
- icmph->checksum = 0;
- icmph->checksum = ip_compute_csum((unsigned char *) icmph, len);
-
-
- IP_MASQ_DEBUG(2, "Rewrote reverse ICMP to %08lX:%04X -> %08lX:%04X\n",
- ntohl(ciph->saddr), ntohs(pptr[0]),
- ntohl(ciph->daddr), ntohs(pptr[1]));
-
-
- return 1;
-}
-
- /*
- * Check if it's a masqueraded port, look it up,
- * and send it on its way...
- *
- * Better not have many hosts using the designated port range
- * as 'normal' ports, or you'll be spending a lot of time in
- * this function.
- */
-
-int ip_fw_demasquerade(struct sk_buff **skb_p)
-{
- struct sk_buff *skb = *skb_p;
- struct iphdr *iph = skb->nh.iph;
- union ip_masq_tphdr h;
- struct ip_masq *ms;
- unsigned short size;
- int doff = 0;
- int csum = 0;
- int csum_ok = 0;
- __u32 maddr;
-
- /*
- * Big catch: only PACKET_HOST (neither loopback nor mcasts)
- * ... don't know why 1st test DOES NOT include 2nd (?)
- */
-
- if (skb->pkt_type != PACKET_HOST || skb->dev == &loopback_dev) {
- IP_MASQ_DEBUG(2, "ip_fw_demasquerade(): packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
- skb->pkt_type,
- iph->protocol,
- NIPQUAD(iph->daddr));
- return 0;
- }
-
- h.raw = (char*) iph + iph->ihl * 4;
-
- /*
- * IP payload size
- */
- size = ntohs(iph->tot_len) - (iph->ihl * 4);
-
- doff = proto_doff(iph->protocol, h.raw, size);
-
- switch (doff) {
- case 0:
- /*
- * Input path: other IP protos Ok, will
- * reach local sockets path.
- */
- return 0;
- case -1:
- IP_MASQ_DEBUG(0, "I-pkt invalid packet data size\n");
- return -1;
- }
-
- maddr = iph->daddr;
- switch (iph->protocol) {
- case IPPROTO_ICMP:
- return(ip_fw_demasq_icmp(skb_p));
- case IPPROTO_TCP:
- case IPPROTO_UDP:
- /*
- * Make sure packet is in the masq range
- * ... or some module relaxes the input range
- * ... or there is still some `special' mport opened
- */
- if ((ntohs(h.portp[1]) < PORT_MASQ_BEGIN
- || ntohs(h.portp[1]) > PORT_MASQ_END)
-#ifdef CONFIG_IP_MASQUERADE_MOD
- && (ip_masq_mod_in_rule(skb, iph) != 1)
-#endif
- && atomic_read(&mport_count) == 0 )
- return 0;
-
- /* Check that the checksum is OK */
- if ((iph->protocol == IPPROTO_UDP) && (h.uh->check == 0))
- /* No UDP checksum */
- break;
-#ifdef CONFIG_IP_MASQ_DEBUG
- if (ip_masq_get_debug_level() > 3) {
- skb->ip_summed = CHECKSUM_NONE;
- }
-#endif
-
- switch (skb->ip_summed)
- {
- case CHECKSUM_NONE:
- csum = csum_partial(h.raw + doff, size - doff, 0);
- csum_ok++;
- skb->csum = csum_partial(h.raw , doff, csum);
-
-			/* fall through: now verify the sum just computed */
-		case CHECKSUM_HW:
- if (csum_tcpudp_magic(iph->saddr, iph->daddr,
- size, iph->protocol, skb->csum))
- {
- IP_MASQ_DEBUG(0, "Incoming failed %s checksum from %d.%d.%d.%d (size=%d)!\n",
- masq_proto_name(iph->protocol),
- NIPQUAD(iph->saddr),
- size);
- return -1;
- }
-		default:
-			/* CHECKSUM_UNNECESSARY */
-			break;
- }
- break;
- default:
- return 0;
- }
-
-
-
- IP_MASQ_DEBUG(2, "Incoming %s %08lX:%04X -> %08lX:%04X\n",
- masq_proto_name(iph->protocol),
- ntohl(iph->saddr), ntohs(h.portp[0]),
- ntohl(iph->daddr), ntohs(h.portp[1]));
-
- /*
- * reroute to original host:port if found...
- */
-
- ms = ip_masq_in_get_iph(iph);
-
- /*
- * Give additional modules a chance to create an entry
- */
-#ifdef CONFIG_IP_MASQUERADE_MOD
- if (!ms)
- ms = ip_masq_mod_in_create(skb, iph, maddr);
-
- /*
- * Call module's input update hook
- */
- ip_masq_mod_in_update(skb, iph, ms);
-#endif
-
-
- if (ms != NULL)
- {
-
- /*
- * got reply, so clear flag
- */
- ms->flags &= ~IP_MASQ_F_NO_REPLY;
-
- /*
- * Set daddr,dport if not defined yet
- * and tunnel is not setup as "dest loose"
- */
-
- if (ms->flags & IP_MASQ_F_DLOOSE) {
- /*
- * update dest loose values
- */
- ms->dport = h.portp[0];
- ms->daddr = iph->saddr;
- } else {
- if ( ms->flags & IP_MASQ_F_NO_DPORT ) { /* && ms->protocol == IPPROTO_TCP ) { */
- ms->flags &= ~IP_MASQ_F_NO_DPORT;
- ms->dport = h.portp[0];
-
- IP_MASQ_DEBUG(1, "ip_fw_demasquerade(): filled dport=%d\n",
- ntohs(ms->dport));
-
- }
- if (ms->flags & IP_MASQ_F_NO_DADDR ) { /* && ms->protocol == IPPROTO_TCP) { */
- ms->flags &= ~IP_MASQ_F_NO_DADDR;
- ms->daddr = iph->saddr;
-
- IP_MASQ_DEBUG(1, "ip_fw_demasquerade(): filled daddr=%lX\n",
- ntohl(ms->daddr));
-
- }
- }
- if ((skb=masq_skb_cow(skb_p, &iph, &h.raw)) == NULL) {
- ip_masq_put(ms);
- return -1;
- }
- iph->daddr = ms->saddr;
- h.portp[1] = ms->sport;
-
- /*
- * Invalidate csum saving if tunnel has masq helper
- */
-
- if (ms->app)
- csum_ok = 0;
-
- /*
- * Attempt ip_masq_app call.
- * will fix ip_masq and iph ack_seq stuff
- */
-
- if (ip_masq_app_pkt_in(ms, skb_p, maddr) != 0)
- {
- /*
- * skb has changed, update pointers.
- */
-
- skb = *skb_p;
- iph = skb->nh.iph;
- h.raw = (char*) iph + iph->ihl*4;
- size = ntohs(iph->tot_len) - (iph->ihl * 4);
- }
-
- /*
- * Yug! adjust UDP/TCP checksums
- */
-
- /*
- * Transport's payload partial csum
- */
-
- if (!csum_ok) {
- csum = csum_partial(h.raw + doff, size - doff, 0);
- }
- skb->csum = csum;
-
- /*
- * Protocol csum
- */
- switch (iph->protocol) {
- case IPPROTO_TCP:
- h.th->check = 0;
- h.th->check=csum_tcpudp_magic(iph->saddr, iph->daddr,
- size, iph->protocol,
- csum_partial(h.raw , doff, csum));
- break;
- case IPPROTO_UDP:
- h.uh->check = 0;
- h.uh->check=csum_tcpudp_magic(iph->saddr, iph->daddr,
- size, iph->protocol,
- csum_partial(h.raw , doff, csum));
- if (h.uh->check == 0)
- h.uh->check = 0xFFFF;
- break;
- }
- ip_send_check(iph);
-
- IP_MASQ_DEBUG(2, "I-routed to %08lX:%04X\n",ntohl(iph->daddr),ntohs(h.portp[1]));
-
- masq_set_state (ms, 0, iph, h.portp);
- ip_masq_put(ms);
-
- return 1;
- }
-
- /* sorry, all this trouble for a no-hit :) */
- return 0;
-}
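
ip_fw_demasquerade() accepts a TCP/UDP packet only when csum_tcpudp_magic() over the pseudo-header plus the payload sum comes out zero. A self-contained sketch of the same verification, assuming all values are in one consistent byte order; the names are illustrative:

/* Pseudo-header verification sketch, cf. the csum_tcpudp_magic() call
 * above: returns nonzero when the checksum residue is OK.
 */
static int tcpudp_csum_ok_sketch(unsigned long saddr, unsigned long daddr,
				 unsigned short len, unsigned char proto,
				 unsigned long payload_sum)
{
	unsigned long sum = payload_sum;	/* covers hdr+data+stored csum */

	sum += (saddr >> 16) + (saddr & 0xffff);
	sum += (daddr >> 16) + (daddr & 0xffff);
	sum += proto + len;
	while (sum >> 16)			/* fold to 16 bits */
		sum = (sum & 0xffff) + (sum >> 16);
	return (unsigned short)~sum == 0;	/* all-ones sum == valid */
}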
-
-
-void ip_masq_control_add(struct ip_masq *ms, struct ip_masq* ctl_ms)
-{
- if (ms->control) {
- IP_MASQ_ERR( "request control ADD for already controlled: %d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n",
- NIPQUAD(ms->saddr),ntohs(ms->sport),
- NIPQUAD(ms->daddr),ntohs(ms->dport));
- ip_masq_control_del(ms);
- }
- IP_MASQ_DEBUG(1, "ADDing control for: ms.dst=%d.%d.%d.%d:%d ctl_ms.dst=%d.%d.%d.%d:%d\n",
- NIPQUAD(ms->daddr),ntohs(ms->dport),
- NIPQUAD(ctl_ms->daddr),ntohs(ctl_ms->dport));
- ms->control = ctl_ms;
- atomic_inc(&ctl_ms->n_control);
-}
-
-void ip_masq_control_del(struct ip_masq *ms)
-{
- struct ip_masq *ctl_ms = ms->control;
- if (!ctl_ms) {
- IP_MASQ_ERR( "request control DEL for uncontrolled: %d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n",
- NIPQUAD(ms->saddr),ntohs(ms->sport),
- NIPQUAD(ms->daddr),ntohs(ms->dport));
- return;
- }
- IP_MASQ_DEBUG(1, "DELeting control for: ms.dst=%d.%d.%d.%d:%d ctl_ms.dst=%d.%d.%d.%d:%d\n",
- NIPQUAD(ms->daddr),ntohs(ms->dport),
- NIPQUAD(ctl_ms->daddr),ntohs(ctl_ms->dport));
- ms->control = NULL;
- if (atomic_read(&ctl_ms->n_control) == 0) {
- IP_MASQ_ERR( "BUG control DEL with n=0 : %d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n",
- NIPQUAD(ms->saddr),ntohs(ms->sport),
- NIPQUAD(ms->daddr),ntohs(ms->dport));
- return;
-
- }
- atomic_dec(&ctl_ms->n_control);
-}
-
-struct ip_masq * ip_masq_control_get(struct ip_masq *ms)
-{
- return ms->control;
-}
-
-
-#ifdef CONFIG_PROC_FS
-/*
- * /proc/net entries
- * From userspace
- */
-static int ip_msqhst_procinfo(char *buffer, char **start, off_t offset,
- int length, int unused)
-{
- off_t pos=0, begin;
- struct ip_masq *ms;
- char temp[129];
- int idx = 0;
- int len=0;
-
-
- if (offset < 128)
- {
- sprintf(temp,
- "Prc FromIP FPrt ToIP TPrt Masq Init-seq Delta PDelta Expires (free=%d,%d,%d)",
- atomic_read(ip_masq_free_ports),
- atomic_read(ip_masq_free_ports+1),
- atomic_read(ip_masq_free_ports+2));
- len = sprintf(buffer, "%-127s\n", temp);
- }
- pos = 128;
-
- for(idx = 0; idx < IP_MASQ_TAB_SIZE; idx++)
- {
- /*
-		 * Lock is actually only needed in the next loop;
-		 * we are called from userspace: must stop bottom halves.
- */
- read_lock_bh(&__ip_masq_lock);
-
- for(ms = ip_masq_m_tab[idx]; ms ; ms = ms->m_link)
- {
- pos += 128;
- if (pos <= offset) {
- len = 0;
- continue;
- }
-
- /*
- * We have locked the tables, no need to del/add timers
- * nor cli() 8)
- */
-
- sprintf(temp,"%s %08lX:%04X %08lX:%04X %04X %08X %6d %6d %7lu",
- masq_proto_name(ms->protocol),
- ntohl(ms->saddr), ntohs(ms->sport),
- ntohl(ms->daddr), ntohs(ms->dport),
- ntohs(ms->mport),
- ms->out_seq.init_seq,
- ms->out_seq.delta,
- ms->out_seq.previous_delta,
- ms->timer.expires-jiffies);
- len += sprintf(buffer+len, "%-127s\n", temp);
-
- if(len >= length) {
-
- read_unlock_bh(&__ip_masq_lock);
- goto done;
- }
- }
- read_unlock_bh(&__ip_masq_lock);
-
- }
-done:
-
-
- begin = len - (pos - offset);
- *start = buffer + begin;
- len -= begin;
- if(len>length)
- len = length;
- return len;
-}
-
-#endif
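
The procinfo handler above uses the old-style get_info windowing: pos advances through a virtual file of fixed 128-byte records, and the closing begin/*start arithmetic maps the requested (offset, length) window back into the scratch buffer. The same pattern in isolation; the record size and names are illustrative:

/* get_info-style windowing sketch: emit REC-byte records, return only
 * the window [offset, offset+length) of the virtual file.
 */
#define REC 128

static int proc_window_sketch(char *buffer, char **start,
			      long offset, int length, int nrecords)
{
	long pos = 0, begin;
	int i, len = 0;

	for (i = 0; i < nrecords; i++) {
		pos += REC;
		if (pos <= offset)
			continue;		/* record ends before the window */
		len += sprintf(buffer + len, "%-*d\n", REC - 1, i);
		if (len >= length)
			break;			/* window filled */
	}
	begin = len - (pos - offset);		/* window start inside buffer */
	*start = buffer + begin;
	len -= begin;
	if (len > length)
		len = length;
	return len;
}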
-
-/*
- * Timeouts handling by ipfwadm/ipchains
- * From ip_fw.c
- */
-
-int ip_fw_masq_timeouts(void *m, int len)
-{
- struct ip_fw_masq *masq;
- int ret = EINVAL;
-
- if (len != sizeof(struct ip_fw_masq)) {
- IP_MASQ_DEBUG(1, "ip_fw_masq_timeouts: length %d, expected %d\n",
- len, sizeof(struct ip_fw_masq));
- } else {
- masq = (struct ip_fw_masq *)m;
- if (masq->tcp_timeout)
- masq_timeout_table.timeout[IP_MASQ_S_ESTABLISHED]
- = masq->tcp_timeout;
-
- if (masq->tcp_fin_timeout)
- masq_timeout_table.timeout[IP_MASQ_S_FIN_WAIT]
- = masq->tcp_fin_timeout;
-
- if (masq->udp_timeout)
- masq_timeout_table.timeout[IP_MASQ_S_UDP]
- = masq->udp_timeout;
- ret = 0;
- }
- return ret;
-}
-/*
- * Module autoloading stuff
- */
-
-static int ip_masq_user_check_hook(void) {
-#ifdef CONFIG_KMOD
- if (ip_masq_user_hook == NULL) {
- IP_MASQ_DEBUG(1, "About to request \"ip_masq_user\" module\n");
- request_module("ip_masq_user");
- }
-#endif /* CONFIG_KMOD */
- return (ip_masq_user_hook != NULL);
-}
-
-/*
- * user module hook- info
- */
-static int ip_masq_user_info(char *buffer, char **start, off_t offset,
- int len, int *eof, void *data)
-{
- int ret = -ENOPKG;
- if (ip_masq_user_check_hook()) {
- ret = ip_masq_user_hook->info(buffer, start, offset, len, (int) data);
- }
- return ret;
-}
-
-/*
- * user module hook- entry mgmt
- */
-static int ip_masq_user_ctl(int optname, void *arg, int arglen)
-{
- int ret = -ENOPKG;
- if (ip_masq_user_check_hook()) {
- ret = ip_masq_user_hook->ctl(optname, arg, arglen);
- }
- return ret;
-}
-
-/*
- * Control from ip_sockglue
- * MAIN ENTRY point from userspace (apart from /proc *info entries)
- * Returns errno
- */
-int ip_masq_uctl(int optname, char * optval , int optlen)
-{
- struct ip_masq_ctl masq_ctl;
- int ret = -EINVAL;
-
- if(optlen>sizeof(masq_ctl))
- return -EINVAL;
-
- if(copy_from_user(&masq_ctl,optval,optlen))
- return -EFAULT;
-
-	IP_MASQ_DEBUG(1,"ip_masq_uctl(optname=%d, optlen=%d, target=%d, cmd=%d)\n",
- optname, optlen, masq_ctl.m_target, masq_ctl.m_cmd);
-
- switch (masq_ctl.m_target) {
- case IP_MASQ_TARGET_USER:
- ret = ip_masq_user_ctl(optname, &masq_ctl, optlen);
- break;
-#ifdef CONFIG_IP_MASQUERADE_MOD
- case IP_MASQ_TARGET_MOD:
- ret = ip_masq_mod_ctl(optname, &masq_ctl, optlen);
- break;
-#endif
- }
-
- /*
- * If ret>0, copy to user space
- */
-
- if (ret > 0 && ret <= sizeof (masq_ctl)) {
- if (copy_to_user(optval, &masq_ctl, ret) )
- return -EFAULT;
- ret = 0;
- }
-
- return ret;
-}
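
Since ip_masq_uctl() is the setsockopt() back end, the matching user-space call is a single IP-level set-socket-option on a privileged socket. A hedged sketch; the exact optname (IP_FW_MASQ_CTL in the ipchains-era headers) is an assumption here and is passed in by the caller:

/* User-space sketch: hand a struct ip_masq_ctl to ip_masq_uctl().
 * The optname value comes from the 2.2-era <linux/ip_fw.h>; check
 * your headers. Needs a privileged (raw) socket.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

int masq_uctl_sketch(int optname, void *ctl, int ctlen)
{
	int ret, sock = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);

	if (sock < 0)
		return -1;
	/* the kernel copies ctlen bytes, dispatches on m_target/m_cmd */
	ret = setsockopt(sock, IPPROTO_IP, optname, ctl, ctlen);
	if (ret < 0)
		perror("setsockopt");
	close(sock);
	return ret;
}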
-
-#ifdef CONFIG_PROC_FS
-static struct proc_dir_entry *proc_net_ip_masq = NULL;
-
-#ifdef MODULE
-static void ip_masq_proc_count(struct inode *inode, int fill)
-{
- if (fill)
- MOD_INC_USE_COUNT;
- else
- MOD_DEC_USE_COUNT;
-}
-#endif
-
-int ip_masq_proc_register(struct proc_dir_entry *ent)
-{
- if (!proc_net_ip_masq) return -1;
- IP_MASQ_DEBUG(1, "registering \"/proc/net/ip_masq/%s\" entry\n",
- ent->name);
- return proc_register(proc_net_ip_masq, ent);
-}
-void ip_masq_proc_unregister(struct proc_dir_entry *ent)
-{
- if (!proc_net_ip_masq) return;
- IP_MASQ_DEBUG(1, "unregistering \"/proc/net/ip_masq/%s\" entry\n",
- ent->name);
- proc_unregister(proc_net_ip_masq, ent->low_ino);
-}
-
-
-__initfunc(static void masq_proc_init(void))
-{
- IP_MASQ_DEBUG(1,"registering /proc/net/ip_masq\n");
- if (!proc_net_ip_masq) {
- struct proc_dir_entry *ent;
- ent = create_proc_entry("net/ip_masq", S_IFDIR, 0);
- if (ent) {
-#ifdef MODULE
- ent->fill_inode = ip_masq_proc_count;
-#endif
- proc_net_ip_masq = ent;
- } else {
- IP_MASQ_ERR("Could not create \"/proc/net/ip_masq\" entry\n");
- }
- }
-}
-#endif /* CONFIG_PROC_FS */
-/*
- * Wrapper over inet_select_addr()
- */
-u32 ip_masq_select_addr(struct device *dev, u32 dst, int scope)
-{
- return inet_select_addr(dev, dst, scope);
-}
-
-/*
- * Initialize ip masquerading
- */
-__initfunc(int ip_masq_init(void))
-{
-#ifdef CONFIG_PROC_FS
- proc_net_register(&(struct proc_dir_entry) {
- PROC_NET_IPMSQHST, 13, "ip_masquerade",
- S_IFREG | S_IRUGO, 1, 0, 0,
- 0, &proc_net_inode_operations,
- ip_msqhst_procinfo
- });
- masq_proc_init();
-
- ip_masq_proc_register(&(struct proc_dir_entry) {
- 0, 3, "tcp",
- S_IFREG | S_IRUGO, 1, 0, 0,
- 0, &proc_net_inode_operations,
- NULL, /* get_info */
- NULL, /* fill_inode */
- NULL, NULL, NULL,
- (char *) IPPROTO_TCP,
- ip_masq_user_info
- });
- ip_masq_proc_register(&(struct proc_dir_entry) {
- 0, 3, "udp",
- S_IFREG | S_IRUGO, 1, 0, 0,
- 0, &proc_net_inode_operations,
- NULL, /* get_info */
- NULL, /* fill_inode */
- NULL, NULL, NULL,
- (char *) IPPROTO_UDP,
- ip_masq_user_info
- });
- ip_masq_proc_register(&(struct proc_dir_entry) {
- 0, 4, "icmp",
- S_IFREG | S_IRUGO, 1, 0, 0,
- 0, &proc_net_inode_operations,
- NULL, /* get_info */
- NULL, /* fill_inode */
- NULL, NULL, NULL,
- (char *) IPPROTO_ICMP,
- ip_masq_user_info
- });
-#endif
-#ifdef CONFIG_IP_MASQUERADE_IPAUTOFW
- ip_autofw_init();
-#endif
-#ifdef CONFIG_IP_MASQUERADE_IPPORTFW
- ip_portfw_init();
-#endif
-#ifdef CONFIG_IP_MASQUERADE_MFW
- ip_mfw_init();
-#endif
- ip_masq_app_init();
-
- return 0;
-}
diff --git a/net/ipv4/ip_masq_app.c b/net/ipv4/ip_masq_app.c
deleted file mode 100644
index 45fd14fa4..000000000
--- a/net/ipv4/ip_masq_app.c
+++ /dev/null
@@ -1,603 +0,0 @@
-/*
- * IP_MASQ_APP application masquerading module
- *
- *
- * $Id: ip_masq_app.c,v 1.16 1998/08/29 23:51:14 davem Exp $
- *
- * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Fixes:
- * JJC : Implemented also input pkt hook
- * Miquel van Smoorenburg : Copy more stuff when resizing skb
- *
- *
- * FIXME:
- * - ip_masq_skb_replace(): use same skb if space available.
- *
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/skbuff.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/init.h>
-#include <net/protocol.h>
-#include <net/tcp.h>
-#include <net/udp.h>
-#include <asm/system.h>
-#include <linux/stat.h>
-#include <linux/proc_fs.h>
-#include <net/ip_masq.h>
-
-#define IP_MASQ_APP_TAB_SIZE 16 /* must be power of 2 */
-
-#define IP_MASQ_APP_HASH(proto, port) ((port^proto) & (IP_MASQ_APP_TAB_SIZE-1))
-#define IP_MASQ_APP_TYPE(proto, port) ( proto<<16 | port )
-#define IP_MASQ_APP_PORT(type) ( type & 0xffff )
-#define IP_MASQ_APP_PROTO(type) ( (type>>16) & 0x00ff )
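
A worked instance of the packing above, for TCP (protocol 6) registered on port 21:

/* IP_MASQ_APP_TYPE(6, 21)    == 6<<16 | 21         == 0x60015
 * IP_MASQ_APP_PORT(0x60015)  == 0x60015 & 0xffff   == 21
 * IP_MASQ_APP_PROTO(0x60015) == (0x60015>>16)&0xff == 6
 * IP_MASQ_APP_HASH(6, 21)    == (21 ^ 6) & 15      == 19 & 15 == 3
 */

so the ftp helper further down lands on chain 3 of the 16-slot table, and a lookup only walks helpers whose proto/port pair collides there.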
-
-
-EXPORT_SYMBOL(register_ip_masq_app);
-EXPORT_SYMBOL(unregister_ip_masq_app);
-EXPORT_SYMBOL(ip_masq_skb_replace);
-
-/*
- * will hold masq app. hashed list heads
- */
-
-struct ip_masq_app *ip_masq_app_base[IP_MASQ_APP_TAB_SIZE];
-
-/*
- * ip_masq_app registration routine
- * port: host byte order.
- */
-
-int register_ip_masq_app(struct ip_masq_app *mapp, unsigned short proto, __u16 port)
-{
- unsigned long flags;
- unsigned hash;
- if (!mapp) {
- IP_MASQ_ERR("register_ip_masq_app(): NULL arg\n");
- return -EINVAL;
- }
- mapp->type = IP_MASQ_APP_TYPE(proto, port);
- mapp->n_attach = 0;
- hash = IP_MASQ_APP_HASH(proto, port);
-
- save_flags(flags);
- cli();
- mapp->next = ip_masq_app_base[hash];
- ip_masq_app_base[hash] = mapp;
- restore_flags(flags);
-
- return 0;
-}
-
-/*
- * ip_masq_app unreg. routine.
- */
-
-int unregister_ip_masq_app(struct ip_masq_app *mapp)
-{
- struct ip_masq_app **mapp_p;
- unsigned hash;
- unsigned long flags;
- if (!mapp) {
- IP_MASQ_ERR("unregister_ip_masq_app(): NULL arg\n");
- return -EINVAL;
- }
- /*
- * only allow unregistration if it has no attachments
- */
- if (mapp->n_attach) {
- IP_MASQ_ERR("unregister_ip_masq_app(): has %d attachments. failed\n",
- mapp->n_attach);
- return -EINVAL;
- }
- hash = IP_MASQ_APP_HASH(IP_MASQ_APP_PROTO(mapp->type), IP_MASQ_APP_PORT(mapp->type));
-
- save_flags(flags);
- cli();
- for (mapp_p = &ip_masq_app_base[hash]; *mapp_p ; mapp_p = &(*mapp_p)->next)
- if (mapp == (*mapp_p)) {
- *mapp_p = mapp->next;
- restore_flags(flags);
- return 0;
- }
-
- restore_flags(flags);
- IP_MASQ_ERR("unregister_ip_masq_app(proto=%s,port=%u): not hashed!\n",
- masq_proto_name(IP_MASQ_APP_PROTO(mapp->type)), IP_MASQ_APP_PORT(mapp->type));
- return -EINVAL;
-}
-
-/*
- * get ip_masq_app object by its proto and port (net byte order).
- */
-
-struct ip_masq_app * ip_masq_app_get(unsigned short proto, __u16 port)
-{
- struct ip_masq_app *mapp;
- unsigned hash;
- unsigned type;
-
- port = ntohs(port);
- type = IP_MASQ_APP_TYPE(proto,port);
- hash = IP_MASQ_APP_HASH(proto,port);
- for(mapp = ip_masq_app_base[hash]; mapp ; mapp = mapp->next) {
- if (type == mapp->type) return mapp;
- }
- return NULL;
-}
-
-/*
- * ip_masq_app object binding related funcs.
- */
-
-/*
- * change ip_masq_app object's number of bindings
- */
-
-static __inline__ int ip_masq_app_bind_chg(struct ip_masq_app *mapp, int delta)
-{
- unsigned long flags;
- int n_at;
- if (!mapp) return -1;
- save_flags(flags);
- cli();
- n_at = mapp->n_attach + delta;
- if (n_at < 0) {
- restore_flags(flags);
- IP_MASQ_ERR("ip_masq_app: tried to set n_attach < 0 for (proto=%s,port==%d) ip_masq_app object.\n",
- masq_proto_name(IP_MASQ_APP_PROTO(mapp->type)),
- IP_MASQ_APP_PORT(mapp->type));
- return -1;
- }
- mapp->n_attach = n_at;
- restore_flags(flags);
- return 0;
-}
-
-/*
- * Bind ip_masq to its ip_masq_app based on proto and dport ALREADY
- * set in ip_masq struct. Also calls constructor.
- */
-
-struct ip_masq_app * ip_masq_bind_app(struct ip_masq *ms)
-{
- struct ip_masq_app * mapp;
-
- if (ms->protocol != IPPROTO_TCP && ms->protocol != IPPROTO_UDP)
- return NULL;
-
- mapp = ip_masq_app_get(ms->protocol, ms->dport);
-
-#if 0000
-/* #ifdef CONFIG_IP_MASQUERADE_IPAUTOFW */
- if (mapp == NULL)
- mapp = ip_masq_app_get(ms->protocol, ms->sport);
-/* #endif */
-#endif
-
- if (mapp != NULL) {
- /*
- * don't allow binding if already bound
- */
-
- if (ms->app != NULL) {
- IP_MASQ_ERR("ip_masq_bind_app() called for already bound object.\n");
- return ms->app;
- }
-
- ms->app = mapp;
- if (mapp->masq_init_1) mapp->masq_init_1(mapp, ms);
- ip_masq_app_bind_chg(mapp, +1);
- }
- return mapp;
-}
-
-/*
- * Unbind ms from type object and call ms destructor (does not kfree()).
- */
-
-int ip_masq_unbind_app(struct ip_masq *ms)
-{
- struct ip_masq_app * mapp;
- mapp = ms->app;
-
- if (ms->protocol != IPPROTO_TCP && ms->protocol != IPPROTO_UDP)
- return 0;
-
- if (mapp != NULL) {
- if (mapp->masq_done_1) mapp->masq_done_1(mapp, ms);
- ms->app = NULL;
- ip_masq_app_bind_chg(mapp, -1);
- }
- return (mapp != NULL);
-}
-
-/*
- * Fixes th->seq based on ip_masq_seq info.
- */
-
-static __inline__ void masq_fix_seq(const struct ip_masq_seq *ms_seq, struct tcphdr *th)
-{
- __u32 seq;
-
- seq = ntohl(th->seq);
-
- /*
- * Adjust seq with delta-offset for all packets after
- * the most recent resized pkt seq and with previous_delta offset
- * for all packets before most recent resized pkt seq.
- */
-
- if (ms_seq->delta || ms_seq->previous_delta) {
- if(after(seq,ms_seq->init_seq) ) {
- th->seq = htonl(seq + ms_seq->delta);
- IP_MASQ_DEBUG(1, "masq_fix_seq() : added delta (%d) to seq\n",ms_seq->delta);
- } else {
- th->seq = htonl(seq + ms_seq->previous_delta);
- IP_MASQ_DEBUG(1, "masq_fix_seq() : added previous_delta (%d) to seq\n",ms_seq->previous_delta);
- }
- }
-
-
-}
-
-/*
- * Fixes th->ack_seq based on ip_masq_seq info.
- */
-
-static __inline__ void masq_fix_ack_seq(const struct ip_masq_seq *ms_seq, struct tcphdr *th)
-{
- __u32 ack_seq;
-
- ack_seq=ntohl(th->ack_seq);
-
- /*
- * Adjust ack_seq with delta-offset for
- * the packets AFTER most recent resized pkt has caused a shift
- * for packets before most recent resized pkt, use previous_delta
- */
-
- if (ms_seq->delta || ms_seq->previous_delta) {
- if(after(ack_seq,ms_seq->init_seq)) {
- th->ack_seq = htonl(ack_seq-ms_seq->delta);
- IP_MASQ_DEBUG(1, "masq_fix_ack_seq() : subtracted delta (%d) from ack_seq\n",ms_seq->delta);
-
- } else {
- th->ack_seq = htonl(ack_seq-ms_seq->previous_delta);
- IP_MASQ_DEBUG(1, "masq_fix_ack_seq() : subtracted previous_delta (%d) from ack_seq\n",ms_seq->previous_delta);
- }
- }
-
-}
-
-/*
- * Updates ip_masq_seq if pkt has been resized
- * Assumes already checked proto==IPPROTO_TCP and diff!=0.
- */
-
-static __inline__ void masq_seq_update(struct ip_masq *ms, struct ip_masq_seq *ms_seq, unsigned mflag, __u32 seq, int diff)
-{
- /* if (diff == 0) return; */
-
- if ( !(ms->flags & mflag) || after(seq, ms_seq->init_seq))
- {
- ms_seq->previous_delta=ms_seq->delta;
- ms_seq->delta+=diff;
- ms_seq->init_seq=seq;
- ms->flags |= mflag;
- }
-}
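
Concrete numbers make the delta bookkeeping clearer. Suppose a helper grows a segment with seq 1000 by diff=+3 (illustrative values):

/* pkt A: seq=1000, resized by +3
 *   masq_seq_update():  previous_delta=0, delta=3, init_seq=1000
 * pkt B: seq=1400 (after init_seq)
 *   masq_fix_seq():     1400 -> 1403   (+delta)
 * retransmit: seq=900 (not after init_seq)
 *   masq_fix_seq():     900  -> 900    (+previous_delta == +0)
 * peer acks 1403
 *   masq_fix_ack_seq(): 1403 -> 1400   (-delta)
 */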
-
-/*
- * Output pkt hook. Will call bound ip_masq_app specific function
- * called by ip_fw_masquerade(), assumes previously checked ms!=NULL
- * returns (new - old) skb->len diff.
- */
-
-int ip_masq_app_pkt_out(struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr)
-{
- struct ip_masq_app * mapp;
- struct iphdr *iph;
- struct tcphdr *th;
- int diff;
- __u32 seq;
-
- /*
- * check if application masquerading is bound to
- * this ip_masq.
- * assumes that once an ip_masq is bound,
- * it will not be unbound during its life.
- */
-
- if ( (mapp = ms->app) == NULL)
- return 0;
-
- iph = (*skb_p)->nh.iph;
- th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
-
- /*
- * Remember seq number in case this pkt gets resized
- */
-
- seq = ntohl(th->seq);
-
- /*
- * Fix seq stuff if flagged as so.
- */
-
- if (ms->protocol == IPPROTO_TCP) {
- if (ms->flags & IP_MASQ_F_OUT_SEQ)
- masq_fix_seq(&ms->out_seq, th);
- if (ms->flags & IP_MASQ_F_IN_SEQ)
- masq_fix_ack_seq(&ms->in_seq, th);
- }
-
- /*
- * Call private output hook function
- */
-
- if ( mapp->pkt_out == NULL )
- return 0;
-
- diff = mapp->pkt_out(mapp, ms, skb_p, maddr);
-
- /*
- * Update ip_masq seq stuff if len has changed.
- */
-
- if (diff != 0 && ms->protocol == IPPROTO_TCP)
- masq_seq_update(ms, &ms->out_seq, IP_MASQ_F_OUT_SEQ, seq, diff);
-
- return diff;
-}
-
-/*
- * Input pkt hook. Will call bound ip_masq_app specific function
- * called by ip_fw_demasquerade(), assumes previously checked ms!=NULL.
- * returns (new - old) skb->len diff.
- */
-
-int ip_masq_app_pkt_in(struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr)
-{
- struct ip_masq_app * mapp;
- struct iphdr *iph;
- struct tcphdr *th;
- int diff;
- __u32 seq;
-
- /*
- * check if application masquerading is bound to
- * this ip_masq.
- * assumes that once an ip_masq is bound,
- * it will not be unbound during its life.
- */
-
- if ( (mapp = ms->app) == NULL)
- return 0;
-
- iph = (*skb_p)->nh.iph;
- th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
-
- /*
- * Remember seq number in case this pkt gets resized
- */
-
- seq = ntohl(th->seq);
-
- /*
- * Fix seq stuff if flagged as so.
- */
-
- if (ms->protocol == IPPROTO_TCP) {
- if (ms->flags & IP_MASQ_F_IN_SEQ)
- masq_fix_seq(&ms->in_seq, th);
- if (ms->flags & IP_MASQ_F_OUT_SEQ)
- masq_fix_ack_seq(&ms->out_seq, th);
- }
-
- /*
- * Call private input hook function
- */
-
- if ( mapp->pkt_in == NULL )
- return 0;
-
- diff = mapp->pkt_in(mapp, ms, skb_p, maddr);
-
- /*
- * Update ip_masq seq stuff if len has changed.
- */
-
- if (diff != 0 && ms->protocol == IPPROTO_TCP)
- masq_seq_update(ms, &ms->in_seq, IP_MASQ_F_IN_SEQ, seq, diff);
-
- return diff;
-}
-
-/*
- * /proc/ip_masq_app entry function
- */
-
-int ip_masq_app_getinfo(char *buffer, char **start, off_t offset, int length, int dummy)
-{
- off_t pos=0, begin=0;
- int len=0;
- struct ip_masq_app * mapp;
- unsigned idx;
-
- if (offset < 40)
- len=sprintf(buffer,"%-39s\n", "prot port n_attach name");
- pos = 40;
-
- for (idx=0 ; idx < IP_MASQ_APP_TAB_SIZE; idx++)
- for (mapp = ip_masq_app_base[idx]; mapp ; mapp = mapp->next) {
- /*
- * If you change the length of this sprintf, then all
- * the length calculations need fixing too!
- * Line length = 40 (3 + 2 + 7 + 1 + 7 + 1 + 2 + 17)
- */
- pos += 40;
- if (pos < offset)
- continue;
-
- len += sprintf(buffer+len, "%-3s %-7u %-7d %-17s\n",
- masq_proto_name(IP_MASQ_APP_PROTO(mapp->type)),
- IP_MASQ_APP_PORT(mapp->type), mapp->n_attach,
- mapp->name);
-
- if(len >= length)
- goto done;
- }
-done:
- begin = len - (pos - offset);
- *start = buffer + begin;
- len -= begin;
- if (len > length)
- len = length;
- return len;
-}
-
-
-#ifdef CONFIG_PROC_FS
-static struct proc_dir_entry proc_net_ip_masq_app = {
- PROC_NET_IP_MASQ_APP, 3, "app",
- S_IFREG | S_IRUGO, 1, 0, 0,
- 0, &proc_net_inode_operations,
- ip_masq_app_getinfo
-};
-#endif
-
-/*
- * Initialization routine
- */
-
-__initfunc(int ip_masq_app_init(void))
-{
-#ifdef CONFIG_PROC_FS
- ip_masq_proc_register(&proc_net_ip_masq_app);
-#endif
- return 0;
-}
-
-/*
- * Replace a segment (of skb->data) with a new one.
- * FIXME: Should re-use the same skb if space is available; this could
- * be done when n_len <= o_len, or when some extra space
- * was already allocated at driver level :P .
- */
-
-static struct sk_buff * skb_replace(struct sk_buff *skb, int pri, char *o_buf, int o_len, char *n_buf, int n_len)
-{
- int maxsize, diff, o_offset;
- struct sk_buff *n_skb;
- int offset;
-
- maxsize = skb->truesize;
-
- diff = n_len - o_len;
- o_offset = o_buf - (char*) skb->data;
-
- if (maxsize <= n_len) {
- if (diff != 0) {
- memcpy(skb->data + o_offset + n_len,o_buf + o_len,
- skb->len - (o_offset + o_len));
- }
-
- memcpy(skb->data + o_offset, n_buf, n_len);
-
- n_skb = skb;
- skb->len = n_len;
- skb->end = skb->head+n_len;
- } else {
- /*
- * Sizes differ, make a copy.
- *
- * FIXME: move this to core/sbuff.c:skb_grow()
- */
-
- n_skb = alloc_skb(MAX_HEADER + skb->len + diff, pri);
- if (n_skb == NULL) {
- IP_MASQ_ERR(KERN_ERR "skb_replace(): no room left (from %p)\n",
- return_address());
- return skb;
-
- }
- skb_reserve(n_skb, MAX_HEADER);
- skb_put(n_skb, skb->len + diff);
-
- /*
- * Copy as much data from the old skb as possible. Even
- * though we're only forwarding packets, we need stuff
- * like skb->protocol (PPP driver wants it).
- */
- offset = n_skb->data - skb->data;
- n_skb->nh.raw = skb->nh.raw + offset;
- n_skb->h.raw = skb->h.raw + offset;
- n_skb->dev = skb->dev;
- n_skb->mac.raw = skb->mac.raw + offset;
- n_skb->pkt_type = skb->pkt_type;
- n_skb->protocol = skb->protocol;
- n_skb->ip_summed = skb->ip_summed;
- n_skb->dst = dst_clone(skb->dst);
-
- /*
- * Copy pkt in new buffer
- */
-
- memcpy(n_skb->data, skb->data, o_offset);
- memcpy(n_skb->data + o_offset, n_buf, n_len);
- memcpy(n_skb->data + o_offset + n_len, o_buf + o_len,
- skb->len - (o_offset + o_len) );
-
- /*
- * Problem, how to replace the new skb with old one,
- * preferably inplace
- */
-
- kfree_skb(skb);
- }
- return n_skb;
-}
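
When a buffer does have room, the same splice could be done in place with one tail memmove, which is essentially what the FIXME above asks for. A sketch on a plain byte buffer, not the skb API:

#include <string.h>

/* In-place splice sketch: swap o_len bytes at offset "at" of buf[0..len)
 * for n_buf[0..n_len); the caller must guarantee capacity.
 * Returns the new total length.
 */
static int splice_sketch(char *buf, int len, int at, int o_len,
			 const char *n_buf, int n_len)
{
	/* move the tail first; memmove copes with the overlap */
	memmove(buf + at + n_len, buf + at + o_len, len - (at + o_len));
	memcpy(buf + at, n_buf, n_len);
	return len + (n_len - o_len);
}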
-
-/*
- * calls skb_replace() and update ip header if new skb was allocated
- */
-
-struct sk_buff * ip_masq_skb_replace(struct sk_buff *skb, int pri, char *o_buf, int o_len, char *n_buf, int n_len)
-{
- int diff;
- struct sk_buff *n_skb;
- unsigned skb_len;
-
-	diff = n_len - o_len;
-	skb_len = skb->len;	/* grab length now: skb_replace() may free skb */
-	n_skb = skb_replace(skb, pri, o_buf, o_len, n_buf, n_len);
-
- if (diff)
- {
- struct iphdr *iph;
-		IP_MASQ_DEBUG(1, "masq_skb_replace(): pkt resized for %d bytes (len=%d)\n", diff, skb_len + diff);
- /*
- * update ip header
- */
-		iph = n_skb->nh.iph;
-		iph->tot_len = htons(skb_len + diff);	/* set len before summing */
-		iph->check = 0;
-		iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
- }
- return n_skb;
-}
diff --git a/net/ipv4/ip_masq_autofw.c b/net/ipv4/ip_masq_autofw.c
deleted file mode 100644
index d2a1729c5..000000000
--- a/net/ipv4/ip_masq_autofw.c
+++ /dev/null
@@ -1,448 +0,0 @@
-/*
- * IP_MASQ_AUTOFW auto forwarding module
- *
- *
- * $Id: ip_masq_autofw.c,v 1.3 1998/08/29 23:51:10 davem Exp $
- *
- * Author: Richard Lynch
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- *
- * Fixes:
- * Juan Jose Ciarlante : created this new file from ip_masq.c and ip_fw.c
- * Juan Jose Ciarlante : modularized
- * Juan Jose Ciarlante : use GFP_KERNEL when creating entries
- * Juan Jose Ciarlante : call del_timer() when freeing entries (!)
- * FIXME:
- * - implement refcnt
- *
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/errno.h>
-#include <asm/system.h>
-#include <linux/stat.h>
-#include <linux/proc_fs.h>
-#include <linux/if.h>
-#include <linux/init.h>
-#include <linux/ip_fw.h>
-#include <net/ip_masq.h>
-#include <net/ip_masq_mod.h>
-#include <linux/ip_masq.h>
-
-#define IP_AUTOFW_EXPIRE 15*HZ
-
-/* WARNING: bitwise equal to ip_autofw_user in linux/ip_masq.h */
-struct ip_autofw {
- struct ip_autofw * next;
- __u16 type;
- __u16 low;
- __u16 hidden;
- __u16 high;
- __u16 visible;
- __u16 protocol;
- __u32 lastcontact;
- __u32 where;
- __u16 ctlproto;
- __u16 ctlport;
- __u16 flags;
- struct timer_list timer;
-};
-
-/*
- * Debug level
- */
-#ifdef CONFIG_IP_MASQ_DEBUG
-static int debug=0;
-MODULE_PARM(debug, "i");
-#endif
-
-/*
- * Auto-forwarding table
- */
-
-static struct ip_autofw * ip_autofw_hosts = NULL;
-static struct ip_masq_mod * mmod_self = NULL;
-
-/*
- * Check if a masq entry should be created for a packet
- */
-
-static __inline__ struct ip_autofw * ip_autofw_check_range (__u32 where, __u16 port, __u16 protocol, int reqact)
-{
- struct ip_autofw *af;
- af=ip_autofw_hosts;
- port=ntohs(port);
- while (af) {
- if (af->type==IP_FWD_RANGE &&
- port>=af->low &&
- port<=af->high &&
- protocol==af->protocol &&
-
- /*
- * It's ok to create masq entries after
- * the timeout if we're in insecure mode
- */
- (af->flags & IP_AUTOFW_ACTIVE || !reqact || !(af->flags & IP_AUTOFW_SECURE)) &&
- (!(af->flags & IP_AUTOFW_SECURE) || af->lastcontact==where || !reqact))
- return(af);
- af=af->next;
- }
- return(NULL);
-}
-
-static __inline__ struct ip_autofw * ip_autofw_check_port (__u16 port, __u16 protocol)
-{
- struct ip_autofw *af;
- af=ip_autofw_hosts;
- port=ntohs(port);
- while (af)
- {
- if (af->type==IP_FWD_PORT && port==af->visible && protocol==af->protocol)
- return(af);
- af=af->next;
- }
- return(NULL);
-}
-
-static __inline__ struct ip_autofw * ip_autofw_check_direct (__u16 port, __u16 protocol)
-{
- struct ip_autofw *af;
- af=ip_autofw_hosts;
- port=ntohs(port);
- while (af)
- {
- if (af->type==IP_FWD_DIRECT && af->low<=port && af->high>=port)
- return(af);
- af=af->next;
- }
- return(NULL);
-}
-
-static __inline__ void ip_autofw_update_out (__u32 who, __u32 where, __u16 port, __u16 protocol)
-{
- struct ip_autofw *af;
- af=ip_autofw_hosts;
- port=ntohs(port);
- while (af)
- {
- if (af->type==IP_FWD_RANGE && af->ctlport==port && af->ctlproto==protocol)
- {
- if (af->flags & IP_AUTOFW_USETIME)
- {
- mod_timer(&af->timer,
- jiffies+IP_AUTOFW_EXPIRE);
- }
- af->flags|=IP_AUTOFW_ACTIVE;
- af->lastcontact=where;
- af->where=who;
- }
- af=af->next;
- }
-}
-
-#if 0
-static __inline__ void ip_autofw_update_in (__u32 where, __u16 port, __u16 protocol)
-{
- struct ip_autofw *af;
- af=ip_autofw_check_range(where, port,protocol);
- if (af)
- {
- mod_timer(&af->timer, jiffies+IP_AUTOFW_EXPIRE);
- }
-}
-#endif
-
-
-static __inline__ void ip_autofw_expire(unsigned long data)
-{
- struct ip_autofw * af;
- af=(struct ip_autofw *) data;
- af->flags &= ~IP_AUTOFW_ACTIVE;
- af->timer.expires=0;
- af->lastcontact=0;
- if (af->flags & IP_AUTOFW_SECURE)
- af->where=0;
-}
-
-
-
-static __inline__ int ip_autofw_add(struct ip_autofw_user * af)
-{
- struct ip_autofw * newaf;
-	newaf = kmalloc( sizeof(struct ip_autofw), GFP_KERNEL );
-	if ( newaf == NULL )
-	{
-		printk("ip_autofw_add: malloc said no\n");
-		return( ENOMEM );
-	}
-	init_timer(&newaf->timer);	/* only touch newaf after the NULL check */
-
- MOD_INC_USE_COUNT;
-
- memcpy(newaf, af, sizeof(struct ip_autofw_user));
- newaf->timer.data = (unsigned long) newaf;
- newaf->timer.function = ip_autofw_expire;
- newaf->timer.expires = 0;
- newaf->lastcontact=0;
- newaf->next=ip_autofw_hosts;
- ip_autofw_hosts=newaf;
- ip_masq_mod_inc_nent(mmod_self);
- return(0);
-}
-
-static __inline__ int ip_autofw_del(struct ip_autofw_user * af)
-{
- struct ip_autofw ** af_p, *curr;
-
- for (af_p=&ip_autofw_hosts, curr=*af_p; (curr=*af_p); af_p = &(*af_p)->next) {
- if (af->type == curr->type &&
- af->low == curr->low &&
- af->high == curr->high &&
- af->hidden == curr->hidden &&
- af->visible == curr->visible &&
- af->protocol == curr->protocol &&
- af->where == curr->where &&
- af->ctlproto == curr->ctlproto &&
- af->ctlport == curr->ctlport)
- {
- ip_masq_mod_dec_nent(mmod_self);
- *af_p = curr->next;
- if (af->flags&IP_AUTOFW_ACTIVE)
- del_timer(&curr->timer);
- kfree_s(curr,sizeof(struct ip_autofw));
- MOD_DEC_USE_COUNT;
- return 0;
- }
- }
- return EINVAL;
-}
-
-static __inline__ int ip_autofw_flush(void)
-{
- struct ip_autofw * af;
-
- while (ip_autofw_hosts)
- {
- af=ip_autofw_hosts;
- ip_masq_mod_dec_nent(mmod_self);
- ip_autofw_hosts=ip_autofw_hosts->next;
- if (af->flags&IP_AUTOFW_ACTIVE)
- del_timer(&af->timer);
- kfree_s(af,sizeof(struct ip_autofw));
- MOD_DEC_USE_COUNT;
- }
- return(0);
-}
-
-/*
- * Methods for registered object
- */
-
-static int autofw_ctl(int optname, struct ip_masq_ctl *mctl, int optlen)
-{
- struct ip_autofw_user *af = &mctl->u.autofw_user;
-
- switch (mctl->m_cmd) {
- case IP_MASQ_CMD_ADD:
- case IP_MASQ_CMD_INSERT:
- if (optlen<sizeof(*af))
- return EINVAL;
- return ip_autofw_add(af);
- case IP_MASQ_CMD_DEL:
- if (optlen<sizeof(*af))
- return EINVAL;
- return ip_autofw_del(af);
- case IP_MASQ_CMD_FLUSH:
- return ip_autofw_flush();
-
- }
- return EINVAL;
-}
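
From user space an autofw rule therefore travels as a struct ip_masq_ctl with m_target set to IP_MASQ_TARGET_MOD. A hedged sketch of filling one for a RANGE rule; the u.autofw_user field names mirror the kernel-side struct ip_autofw above, but the real layout lives in <linux/ip_masq.h> and should be checked:

#include <string.h>
#include <linux/ip_masq.h>	/* 2.2-era header, assumed */

int autofw_range_sketch(struct ip_masq_ctl *ctl)
{
	memset(ctl, 0, sizeof(*ctl));
	ctl->m_target = IP_MASQ_TARGET_MOD;	/* routed to ip_masq_mod_ctl() */
	ctl->m_cmd    = IP_MASQ_CMD_ADD;
	ctl->u.autofw_user.type     = IP_FWD_RANGE;
	ctl->u.autofw_user.protocol = IPPROTO_UDP;
	ctl->u.autofw_user.low      = 6970;	/* host order, cf. ntohs() above */
	ctl->u.autofw_user.high     = 7170;
	return 0;	/* caller passes ctl via the masq setsockopt entry */
}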
-
-
-static int autofw_out_update(const struct sk_buff *skb, const struct iphdr *iph, struct ip_masq *ms)
-{
- const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
- /*
- * Update any ipautofw entries ...
- */
-
- ip_autofw_update_out(iph->saddr, iph->daddr, portp[1], iph->protocol);
- return IP_MASQ_MOD_NOP;
-}
-
-static struct ip_masq * autofw_out_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr)
-{
- const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
- /*
- * If the source port is supposed to match the masq port, then
- * make it so
- */
-
- if (ip_autofw_check_direct(portp[1],iph->protocol)) {
- return ip_masq_new(iph->protocol,
- maddr, portp[0],
- iph->saddr, portp[0],
- iph->daddr, portp[1],
- 0);
- }
- return NULL;
-}
-
-#if 0
-static int autofw_in_update(const struct sk_buff *skb, const struct iphdr *iph, __u16 *portp, struct ip_masq *ms)
-{
- const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
- ip_autofw_update_in(iph->saddr, portp[1], iph->protocol);
- return IP_MASQ_MOD_NOP;
-}
-#endif
-
-static int autofw_in_rule(const struct sk_buff *skb, const struct iphdr *iph)
-{
- const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
- return (ip_autofw_check_range(iph->saddr, portp[1], iph->protocol, 0)
- || ip_autofw_check_direct(portp[1], iph->protocol)
- || ip_autofw_check_port(portp[1], iph->protocol));
-}
-
-static struct ip_masq * autofw_in_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr)
-{
- const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
- struct ip_autofw *af;
-
- if ((af=ip_autofw_check_range(iph->saddr, portp[1], iph->protocol, 0))) {
- IP_MASQ_DEBUG(1-debug, "autofw_check_range HIT\n");
- return ip_masq_new(iph->protocol,
- maddr, portp[1],
- af->where, portp[1],
- iph->saddr, portp[0],
- 0);
- }
- if ((af=ip_autofw_check_port(portp[1], iph->protocol)) ) {
- IP_MASQ_DEBUG(1-debug, "autofw_check_port HIT\n");
- return ip_masq_new(iph->protocol,
- maddr, htons(af->visible),
- af->where, htons(af->hidden),
- iph->saddr, portp[0],
- 0);
- }
- return NULL;
-}
-
-#ifdef CONFIG_PROC_FS
-static int autofw_procinfo(char *buffer, char **start, off_t offset,
- int length, int unused)
-{
- off_t pos=0, begin=0;
- struct ip_autofw * af;
- int len=0;
-
- len=sprintf(buffer,"Type Prot Low High Vis Hid Where Last CPto CPrt Timer Flags\n");
-
- for(af = ip_autofw_hosts; af ; af = af->next)
- {
- len+=sprintf(buffer+len,"%4X %4X %04X-%04X/%04X %04X %08lX %08lX %04X %04X %6lu %4X\n",
- af->type,
- af->protocol,
- af->low,
- af->high,
- af->visible,
- af->hidden,
- ntohl(af->where),
- ntohl(af->lastcontact),
- af->ctlproto,
- af->ctlport,
- (af->timer.expires<jiffies ? 0 : af->timer.expires-jiffies),
- af->flags);
-
- pos=begin+len;
- if(pos<offset)
- {
- len=0;
- begin=pos;
- }
- if(pos>offset+length)
- break;
- }
- *start=buffer+(offset-begin);
- len-=(offset-begin);
- if(len>length)
- len=length;
- return len;
-}
-
-static struct proc_dir_entry autofw_proc_entry = {
- 0, 0, NULL,
- S_IFREG | S_IRUGO, 1, 0, 0,
- 0, &proc_net_inode_operations,
- autofw_procinfo
-};
-
-#define proc_ent &autofw_proc_entry
-#else /* !CONFIG_PROC_FS */
-
-#define proc_ent NULL
-#endif
-
-
-#define autofw_in_update NULL
-#define autofw_out_rule NULL
-#define autofw_mod_init NULL
-#define autofw_mod_done NULL
-
-static struct ip_masq_mod autofw_mod = {
- NULL, /* next */
- NULL, /* next_reg */
- "autofw", /* name */
- ATOMIC_INIT(0), /* nent */
- ATOMIC_INIT(0), /* refcnt */
- proc_ent,
- autofw_ctl,
- autofw_mod_init,
- autofw_mod_done,
- autofw_in_rule,
- autofw_in_update,
- autofw_in_create,
- autofw_out_rule,
- autofw_out_update,
- autofw_out_create,
-};
-
-__initfunc(int ip_autofw_init(void))
-{
- return register_ip_masq_mod ((mmod_self=&autofw_mod));
-}
-
-int ip_autofw_done(void)
-{
- return unregister_ip_masq_mod(&autofw_mod);
-}
-
-#ifdef MODULE
-EXPORT_NO_SYMBOLS;
-
-int init_module(void)
-{
- if (ip_autofw_init() != 0)
- return -EIO;
- return 0;
-}
-
-void cleanup_module(void)
-{
- if (ip_autofw_done() != 0)
-		printk(KERN_INFO "ip_autofw_done(): can't remove module\n");
-}
-
-#endif /* MODULE */
diff --git a/net/ipv4/ip_masq_cuseeme.c b/net/ipv4/ip_masq_cuseeme.c
deleted file mode 100644
index 9b412bafe..000000000
--- a/net/ipv4/ip_masq_cuseeme.c
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * IP_MASQ_CUSEEME CUSeeMe masquerading module
- *
- *
- * Version: @(#)$Id: ip_masq_cuseeme.c,v 1.4 1998/10/06 04:48:57 davem Exp $
- *
- * Author: Richard Lynch
- *
- *
- * Fixes:
- * Richard Lynch : Updated patch to conform to new module
- * specifications
- * Nigel Metheringham : Multiple port support
- * Michael Owings : Fixed broken init code
- * Added code to update inbound
- * packets with correct local addresses.
- * Fixes audio and "chat" problems
- * Thanx to the CU-SeeMe Consortium for
- * technical docs
- * Steven Clarke : Small changes for 2.1
- *
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Multiple Port Support
- * The helper can be made to handle up to MAX_MASQ_APP_PORTS (normally 12)
- * with the port numbers being defined at module load time. The module
- * uses the symbol "ports" to define a list of monitored ports, which can
- * be specified on the insmod command line as
- * ports=x1,x2,x3...
- * where x[n] are integer port numbers. This option can be put into
- * /etc/conf.modules (or /etc/modules.conf depending on your config)
- * where modload will pick it up should you use modload to load your
- * modules.
- *
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <asm/system.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/init.h>
-#include <net/protocol.h>
-#include <net/udp.h>
-
-/* #define IP_MASQ_NDEBUG */
-#include <net/ip_masq.h>
-
-#pragma pack(1)
-/* CU-SeeMe Data Header */
-typedef struct {
- u_short dest_family;
- u_short dest_port;
- u_long dest_addr;
- short family;
- u_short port;
- u_long addr;
- u_long seq;
- u_short msg;
- u_short data_type;
- u_short packet_len;
-} cu_header;
-
-/* Open Continue Header */
-typedef struct {
- cu_header cu_head;
- u_short client_count; /* Number of client info structs */
- u_long seq_no;
- char user_name[20];
- char stuff[4]; /* flags, version stuff, etc */
-}oc_header;
-
-/* client info structures */
-typedef struct {
- u_long address; /* Client address */
- char stuff[8]; /* Flags, pruning bitfield, packet counts etc */
-} client_info;
-#pragma pack()
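
#pragma pack(1) matters here because cu_header mirrors the CU-SeeMe wire format byte for byte. On the ILP32 targets this code assumed (u_short == 2 bytes, u_long == 4), the header works out to 26 bytes, which a compile-time check can pin down:

/* Build-breaking size check sketch (assumes ILP32 type sizes). */
typedef char cu_header_is_26_bytes[(sizeof(cu_header) == 26) ? 1 : -1];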
-
-/*
- * List of ports (up to MAX_MASQ_APP_PORTS) to be handled by helper
- * First port is set to the default port.
- */
-static int ports[MAX_MASQ_APP_PORTS] = {7648}; /* I rely on the trailing items being set to zero */
-struct ip_masq_app *masq_incarnations[MAX_MASQ_APP_PORTS];
-
-/*
- * Debug level
- */
-#ifdef CONFIG_IP_MASQ_DEBUG
-static int debug=0;
-MODULE_PARM(debug, "i");
-#endif
-
-MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i");
-
-static int
-masq_cuseeme_init_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
-{
- MOD_INC_USE_COUNT;
- return 0;
-}
-
-static int
-masq_cuseeme_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
-{
- MOD_DEC_USE_COUNT;
- return 0;
-}
-
-int
-masq_cuseeme_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr)
-{
- struct sk_buff *skb = *skb_p;
- struct iphdr *iph = skb->nh.iph;
- struct udphdr *uh = (struct udphdr *)&(((char *)iph)[iph->ihl*4]);
- cu_header *cu_head;
- char *data=(char *)&uh[1];
-
- if (skb->len - ((unsigned char *) data - skb->h.raw) >= sizeof(cu_header))
- {
- cu_head = (cu_header *) data;
- /* cu_head->port = ms->mport; */
- if( cu_head->addr )
- cu_head->addr = (u_long) maddr;
- if(ntohs(cu_head->data_type) == 257)
- IP_MASQ_DEBUG(1-debug, "Sending talk packet!\n");
- }
- return 0;
-}
-
-int
-masq_cuseeme_in (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr)
-{
- struct sk_buff *skb = *skb_p;
- struct iphdr *iph = skb->nh.iph;
- struct udphdr *uh = (struct udphdr *)&(((char *)iph)[iph->ihl*4]);
- cu_header *cu_head;
- oc_header *oc;
- client_info *ci;
- char *data=(char *)&uh[1];
- u_short len = skb->len - ((unsigned char *) data - skb->h.raw);
- int i, off;
-
- if (len >= sizeof(cu_header))
- {
- cu_head = (cu_header *) data;
- if(cu_head->dest_addr) /* Correct destination address */
- cu_head->dest_addr = (u_long) ms->saddr;
- if(ntohs(cu_head->data_type)==101 && len > sizeof(oc_header))
- {
- oc = (oc_header * ) data;
- /* Spin (grovel) thru client_info structs till we find our own */
- off=sizeof(oc_header);
- for(i=0;
- (i < oc->client_count && off+sizeof(client_info) <= len);
- i++)
- {
- ci=(client_info *)(data+off);
- if(ci->address==(u_long) maddr)
- {
- /* Update w/ our real ip address and exit */
- ci->address = (u_long) ms->saddr;
- break;
- }
- else
- off+=sizeof(client_info);
- }
- }
- }
- return 0;
-}
-
-struct ip_masq_app ip_masq_cuseeme = {
- NULL, /* next */
- "cuseeme",
- 0, /* type */
- 0, /* n_attach */
- masq_cuseeme_init_1, /* ip_masq_init_1 */
- masq_cuseeme_done_1, /* ip_masq_done_1 */
- masq_cuseeme_out, /* pkt_out */
- masq_cuseeme_in /* pkt_in */
-};
-
-
-/*
- * ip_masq_cuseeme initialization
- */
-
-__initfunc(int ip_masq_cuseeme_init(void))
-{
- int i, j;
-
- for (i=0; (i<MAX_MASQ_APP_PORTS); i++) {
- if (ports[i]) {
- if ((masq_incarnations[i] = kmalloc(sizeof(struct ip_masq_app),
- GFP_KERNEL)) == NULL)
- return -ENOMEM;
- memcpy(masq_incarnations[i], &ip_masq_cuseeme, sizeof(struct ip_masq_app));
- if ((j = register_ip_masq_app(masq_incarnations[i],
- IPPROTO_UDP,
- ports[i]))) {
- return j;
- }
-#if DEBUG_CONFIG_IP_MASQ_CUSEEME
- IP_MASQ_DEBUG(1-debug, "CuSeeMe: loaded support on port[%d] = %d\n",
- i, ports[i]);
-#endif
- } else {
- /* To be safe, force the incarnation table entry to NULL */
- masq_incarnations[i] = NULL;
- }
- }
- return 0;
-}
-
-/*
- * ip_masq_cuseeme fin.
- */
-
-int ip_masq_cuseeme_done(void)
-{
- int i, j, k;
-
- k=0;
- for (i=0; (i<MAX_MASQ_APP_PORTS); i++) {
- if (masq_incarnations[i]) {
- if ((j = unregister_ip_masq_app(masq_incarnations[i]))) {
- k = j;
- } else {
- kfree(masq_incarnations[i]);
- masq_incarnations[i] = NULL;
- IP_MASQ_DEBUG(1-debug, "CuSeeMe: unloaded support on port[%d] = %d\n", i, ports[i]);
- }
- }
- }
- return k;
-}
-
-#ifdef MODULE
-EXPORT_NO_SYMBOLS;
-
-int init_module(void)
-{
- if (ip_masq_cuseeme_init() != 0)
- return -EIO;
- return 0;
-}
-
-void cleanup_module(void)
-{
- if (ip_masq_cuseeme_done() != 0)
-		IP_MASQ_DEBUG(1-debug, "ip_masq_cuseeme: can't remove module\n");
-}
-
-#endif /* MODULE */
diff --git a/net/ipv4/ip_masq_ftp.c b/net/ipv4/ip_masq_ftp.c
deleted file mode 100644
index 35d1f5440..000000000
--- a/net/ipv4/ip_masq_ftp.c
+++ /dev/null
@@ -1,393 +0,0 @@
-/*
- * IP_MASQ_FTP ftp masquerading module
- *
- *
- * Version: @(#)ip_masq_ftp.c 0.04 02/05/96
- *
- * Author: Wouter Gadeyne
- *
- *
- * Fixes:
- * Wouter Gadeyne : Fixed masquerading support of ftp PORT commands
- * Juan Jose Ciarlante : Code moved and adapted from ip_fw.c
- * Keith Owens : Add keep alive for ftp control channel
- * Nigel Metheringham : Added multiple port support
- * Juan Jose Ciarlante : Use control_add() for ftp control chan
- * Juan Jose Ciarlante : Litl bits for 2.1
- * Juan Jose Ciarlante : use ip_masq_listen()
- * Juan Jose Ciarlante : use private app_data for own flag(s)
- *
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Multiple Port Support
- * The helper can be made to handle up to MAX_MASQ_APP_PORTS (normally 12)
- * with the port numbers being defined at module load time. The module
- * uses the symbol "ports" to define a list of monitored ports, which can
- * be specified on the insmod command line as
- * ports=x1,x2,x3...
- * where x[n] are integer port numbers. This option can be put into
- * /etc/conf.modules (or /etc/modules.conf depending on your config)
- * where modload will pick it up should you use modload to load your
- * modules.
- *
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <asm/system.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/init.h>
-#include <net/protocol.h>
-#include <net/tcp.h>
-
-/* #define IP_MASQ_NDEBUG */
-#include <net/ip_masq.h>
-
-
-/*
- * List of ports (up to MAX_MASQ_APP_PORTS) to be handled by helper
- * First port is set to the default port.
- */
-static int ports[MAX_MASQ_APP_PORTS] = {21}; /* I rely on the trailing items being set to zero */
-struct ip_masq_app *masq_incarnations[MAX_MASQ_APP_PORTS];
-
-/*
- * Debug level
- */
-#ifdef CONFIG_IP_MASQ_DEBUG
-static int debug=0;
-MODULE_PARM(debug, "i");
-#endif
-
-MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i");
-
-/* Dummy variable */
-static int masq_ftp_pasv;
-
-static int
-masq_ftp_init_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
-{
- MOD_INC_USE_COUNT;
- return 0;
-}
-
-static int
-masq_ftp_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
-{
- MOD_DEC_USE_COUNT;
- return 0;
-}
-
-int
-masq_ftp_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr)
-{
- struct sk_buff *skb;
- struct iphdr *iph;
- struct tcphdr *th;
- char *p, *data, *data_limit;
- unsigned char p1,p2,p3,p4,p5,p6;
- __u32 from;
- __u16 port;
- struct ip_masq *n_ms;
- char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */
- unsigned buf_len;
- int diff;
-
- skb = *skb_p;
- iph = skb->nh.iph;
- th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
- data = (char *)&th[1];
-
- data_limit = skb->h.raw + skb->len - 18;
- if (skb->len >= 6 && (memcmp(data, "PASV\r\n", 6) == 0 || memcmp(data, "pasv\r\n", 6) == 0))
- ms->app_data = &masq_ftp_pasv;
-
- while (data < data_limit)
- {
- if (memcmp(data,"PORT ",5) && memcmp(data,"port ",5))
- {
- data ++;
- continue;
- }
- p = data+5;
- p1 = simple_strtoul(data+5,&data,10);
- if (*data!=',')
- continue;
- p2 = simple_strtoul(data+1,&data,10);
- if (*data!=',')
- continue;
- p3 = simple_strtoul(data+1,&data,10);
- if (*data!=',')
- continue;
- p4 = simple_strtoul(data+1,&data,10);
- if (*data!=',')
- continue;
- p5 = simple_strtoul(data+1,&data,10);
- if (*data!=',')
- continue;
- p6 = simple_strtoul(data+1,&data,10);
- if (*data!='\r' && *data!='\n')
- continue;
-
- from = (p1<<24) | (p2<<16) | (p3<<8) | p4;
- port = (p5<<8) | p6;
-
- IP_MASQ_DEBUG(1-debug, "PORT %X:%X detected\n",from,port);
-
- /*
- * Now update or create an masquerade entry for it
- */
-
- IP_MASQ_DEBUG(1-debug, "protocol %d %lX:%X %X:%X\n", iph->protocol, htonl(from), htons(port), iph->daddr, 0);
-
- n_ms = ip_masq_out_get(iph->protocol,
- htonl(from), htons(port),
- iph->daddr, 0);
- if (!n_ms) {
- n_ms = ip_masq_new(IPPROTO_TCP,
- maddr, 0,
- htonl(from), htons(port),
- iph->daddr, 0,
- IP_MASQ_F_NO_DPORT);
-
- if (n_ms==NULL)
- return 0;
- ip_masq_control_add(n_ms, ms);
- }
-
- /*
- * Replace the old PORT with the new one
- */
- from = ntohl(n_ms->maddr);
- port = ntohs(n_ms->mport);
- sprintf(buf,"%d,%d,%d,%d,%d,%d",
- from>>24&255,from>>16&255,from>>8&255,from&255,
- port>>8&255,port&255);
- buf_len = strlen(buf);
-
- IP_MASQ_DEBUG(1-debug, "new PORT %X:%X\n",from,port);
-
- /*
- * Calculate required delta-offset to keep TCP happy
- */
-
- diff = buf_len - (data-p);
-
- /*
- * No shift.
- */
-
- if (diff==0) {
- /*
- * simple case, just replace the old PORT cmd
- */
- memcpy(p,buf,buf_len);
- } else {
-
- *skb_p = ip_masq_skb_replace(skb, GFP_ATOMIC, p, data-p, buf, buf_len);
- }
- /*
- * Move tunnel to listen state
- */
- ip_masq_listen(n_ms);
- ip_masq_put(n_ms);
-
- return diff;
-
- }
- return 0;
-
-}
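
A concrete PORT rewrite: the client command "PORT 10,0,0,2,4,1" encodes 10.0.0.2:1025 (4*256+1); if the masq tunnel got 192.0.2.1:61001, the code above re-emits "192,0,2,1,238,73" (238*256+73 == 61001) and returns diff=+4, which the seq bookkeeping in ip_masq_app absorbs. The string surgery in isolation, with illustrative addresses and ports:

#include <stdio.h>
#include <string.h>

int port_arg_sketch(void)
{
	const char *old_arg = "10,0,0,2,4,1";	/* 10.0.0.2:1025 */
	unsigned long from = 0xC0000201UL;	/* 192.0.2.1, host order */
	unsigned short port = 61001;
	char buf[24];				/* xxx,xxx,xxx,xxx,ppp,ppp\0 */
	int diff;

	sprintf(buf, "%lu,%lu,%lu,%lu,%lu,%lu",
		from >> 24 & 255, from >> 16 & 255,
		from >> 8 & 255, from & 255,
		(unsigned long)(port >> 8 & 255),
		(unsigned long)(port & 255));
	diff = (int)strlen(buf) - (int)strlen(old_arg);
	printf("%s (diff=%+d)\n", buf, diff);	/* 192,0,2,1,238,73 (diff=+4) */
	return diff;
}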
-
-/*
- * Look at incoming ftp packets to catch the response to a PASV command. When
- * we see one we build a masquerading entry for the client address, client port
- * 0 (unknown at the moment), the server address and the server port. Mark the
- * current masquerade entry as a control channel and point the new entry at the
- * control entry. All this work just for ftp keepalive across masquerading.
- *
- * The incoming packet should be something like
- * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
- * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
- * ncftp 2.3.0 cheats by skipping the leading number then going 22 bytes into
- * the data so we do the same. If it's good enough for ncftp then it's good
- * enough for me.
- *
- * In this case, the client is the source machine being masqueraded, the server
- * is the destination for ftp requests. It all depends on your point of view ...
- */
-
-int
-masq_ftp_in (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr)
-{
- struct sk_buff *skb;
- struct iphdr *iph;
- struct tcphdr *th;
- char *data, *data_limit;
- unsigned char p1,p2,p3,p4,p5,p6;
- __u32 to;
- __u16 port;
- struct ip_masq *n_ms;
-
- if (ms->app_data != &masq_ftp_pasv)
- return 0; /* quick exit if no outstanding PASV */
-
- skb = *skb_p;
- iph = skb->nh.iph;
- th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
- data = (char *)&th[1];
- data_limit = skb->h.raw + skb->len;
-
- while (data < data_limit && *data != ' ')
- ++data;
- while (data < data_limit && *data == ' ')
- ++data;
- data += 22;
- if (data >= data_limit || *data != '(')
- return 0;
- p1 = simple_strtoul(data+1, &data, 10);
- if (data >= data_limit || *data != ',')
- return 0;
- p2 = simple_strtoul(data+1, &data, 10);
- if (data >= data_limit || *data != ',')
- return 0;
- p3 = simple_strtoul(data+1, &data, 10);
- if (data >= data_limit || *data != ',')
- return 0;
- p4 = simple_strtoul(data+1, &data, 10);
- if (data >= data_limit || *data != ',')
- return 0;
- p5 = simple_strtoul(data+1, &data, 10);
- if (data >= data_limit || *data != ',')
- return 0;
- p6 = simple_strtoul(data+1, &data, 10);
- if (data >= data_limit || *data != ')')
- return 0;
-
- to = (p1<<24) | (p2<<16) | (p3<<8) | p4;
- port = (p5<<8) | p6;
-
- /*
- * Now update or create an masquerade entry for it
- */
- IP_MASQ_DEBUG(1-debug, "PASV response %lX:%X %X:%X detected\n", ntohl(ms->saddr), 0, to, port);
-
- n_ms = ip_masq_out_get(iph->protocol,
- ms->saddr, 0,
- htonl(to), htons(port));
- if (!n_ms) {
- n_ms = ip_masq_new(IPPROTO_TCP,
- maddr, 0,
- ms->saddr, 0,
- htonl(to), htons(port),
- IP_MASQ_F_NO_SPORT);
-
- if (n_ms==NULL)
- return 0;
- ip_masq_control_add(n_ms, ms);
- }
-
-#if 0 /* v0.12 state processing */
-
- /*
- * keep for a bit longer than tcp_fin, client may not issue open
- * to server port before tcp_fin_timeout.
- */
- n_ms->timeout = ip_masq_expire->tcp_fin_timeout*3;
-#endif
- ms->app_data = NULL;
- ip_masq_put(n_ms);
-
- return 0; /* no diff required for incoming packets, thank goodness */
-}
-
-struct ip_masq_app ip_masq_ftp = {
- NULL, /* next */
- "ftp", /* name */
- 0, /* type */
- 0, /* n_attach */
- masq_ftp_init_1, /* ip_masq_init_1 */
- masq_ftp_done_1, /* ip_masq_done_1 */
- masq_ftp_out, /* pkt_out */
- masq_ftp_in, /* pkt_in */
-};
-
-/*
- * ip_masq_ftp initialization
- */
-
-__initfunc(int ip_masq_ftp_init(void))
-{
- int i, j;
-
- for (i=0; (i<MAX_MASQ_APP_PORTS); i++) {
- if (ports[i]) {
- if ((masq_incarnations[i] = kmalloc(sizeof(struct ip_masq_app),
- GFP_KERNEL)) == NULL)
- return -ENOMEM;
- memcpy(masq_incarnations[i], &ip_masq_ftp, sizeof(struct ip_masq_app));
- if ((j = register_ip_masq_app(masq_incarnations[i],
- IPPROTO_TCP,
- ports[i]))) {
- return j;
- }
- IP_MASQ_DEBUG(1-debug, "Ftp: loaded support on port[%d] = %d\n",
- i, ports[i]);
- } else {
- /* To be safe, force the incarnation table entry to NULL */
- masq_incarnations[i] = NULL;
- }
- }
- return 0;
-}
-
-/*
- * ip_masq_ftp fin.
- */
-
-int ip_masq_ftp_done(void)
-{
- int i, j, k;
-
- k=0;
- for (i=0; (i<MAX_MASQ_APP_PORTS); i++) {
- if (masq_incarnations[i]) {
- if ((j = unregister_ip_masq_app(masq_incarnations[i]))) {
- k = j;
- } else {
- kfree(masq_incarnations[i]);
- masq_incarnations[i] = NULL;
- IP_MASQ_DEBUG(1-debug, "Ftp: unloaded support on port[%d] = %d\n",
- i, ports[i]);
- }
- }
- }
- return k;
-}
-
-#ifdef MODULE
-EXPORT_NO_SYMBOLS;
-
-int init_module(void)
-{
- if (ip_masq_ftp_init() != 0)
- return -EIO;
- return 0;
-}
-
-void cleanup_module(void)
-{
- if (ip_masq_ftp_done() != 0)
-		printk(KERN_INFO "ip_masq_ftp: can't remove module\n");
-}
-
-#endif /* MODULE */
diff --git a/net/ipv4/ip_masq_irc.c b/net/ipv4/ip_masq_irc.c
deleted file mode 100644
index 11c0ca83f..000000000
--- a/net/ipv4/ip_masq_irc.c
+++ /dev/null
@@ -1,370 +0,0 @@
-/*
- * IP_MASQ_IRC irc masquerading module
- *
- *
- * Version: @(#)ip_masq_irc.c 0.03 97/11/30
- *
- * Author: Juan Jose Ciarlante
- *
- * Additions:
- * - recognize a few non-irc-II DCC requests (Oliver Wagner)
- * DCC MOVE (AmIRC/DCC.MOVE; SEND with resuming)
- * DCC SCHAT (AmIRC IDEA encrypted CHAT)
- * DCC TSEND (AmIRC/PIRCH SEND without ACKs)
- * Fixes:
- * Juan Jose Ciarlante : set NO_DADDR flag in ip_masq_new()
- * Nigel Metheringham : Added multiple port support
- * Juan Jose Ciarlante : litl bits for 2.1
- * Oliver Wagner : more IRC cmds processing
- * <winmute@lucifer.gv.kotnet.org>
- * Juan Jose Ciarlante : put new ms entry to listen()
- *
- * FIXME:
- * - detect also previous "PRIVMSG" string ?.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Multiple Port Support
- * The helper can be made to handle up to MAX_MASQ_APP_PORTS (normally 12)
- * with the port numbers being defined at module load time. The module
- * uses the symbol "ports" to define a list of monitored ports, which can
- * be specified on the insmod command line as
- * ports=x1,x2,x3...
- * where x[n] are integer port numbers. This option can be put into
- * /etc/conf.modules (or /etc/modules.conf depending on your config)
- * where modload will pick it up should you use modload to load your
- * modules.
- *
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <asm/system.h>
-#include <linux/skbuff.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/init.h>
-#include <net/protocol.h>
-#include <net/tcp.h>
-#include <net/ip_masq.h>
-
-
-/*
- * List of ports (up to MAX_MASQ_APP_PORTS) to be handled by helper
- * First port is set to the default port.
- */
-int ports[MAX_MASQ_APP_PORTS] = {6667}; /* I rely on the trailing items being set to zero */
-struct ip_masq_app *masq_incarnations[MAX_MASQ_APP_PORTS];
-/*
- * Debug level
- */
-#ifdef CONFIG_IP_MASQ_DEBUG
-static int debug=0;
-MODULE_PARM(debug, "i");
-#endif
-
-MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i");
-
-
-/*
- * List of supported DCC protocols
- */
-
-#define NUM_DCCPROTO 5
-
-struct dccproto
-{
- char *match;
- int matchlen;
- int xtra_args;
-};
-
-struct dccproto dccprotos[NUM_DCCPROTO] = {
- { "SEND ", 5, 1 },
- { "CHAT ", 5, 0, },
- { "MOVE ", 5, 1 },
- { "TSEND ", 6, 1, },
- { "SCHAT ", 6, 0, }
-};
-#define MAXMATCHLEN 6
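
The DCC argument carries the client address as a single decimal number, packed big-endian: a request like "DCC SEND f 3232235778 1024 ..." means 192.168.1.2 (192*2^24 + 168*2^16 + 1*2^8 + 2 == 3232235778), port 1024. Decoding it back, with illustrative values:

#include <stdio.h>

int dcc_addr_sketch(void)
{
	unsigned long s_addr = 3232235778UL;	/* as parsed by simple_strtoul */

	printf("%lu.%lu.%lu.%lu\n",
	       s_addr >> 24 & 255, s_addr >> 16 & 255,
	       s_addr >> 8 & 255, s_addr & 255);	/* 192.168.1.2 */
	return 0;
}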
-
-static int
-masq_irc_init_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
-{
- MOD_INC_USE_COUNT;
- return 0;
-}
-
-static int
-masq_irc_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
-{
- MOD_DEC_USE_COUNT;
- return 0;
-}
-
-int
-masq_irc_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr)
-{
- struct sk_buff *skb;
- struct iphdr *iph;
- struct tcphdr *th;
- char *data, *data_limit;
- __u32 s_addr;
- __u16 s_port;
- struct ip_masq *n_ms;
- char buf[20]; /* "m_addr m_port" (dec base)*/
- unsigned buf_len;
- int diff;
- int xtra_args = 0; /* extra int args wanted after addr */
- char *dcc_p, *addr_beg_p, *addr_end_p;
-
- skb = *skb_p;
- iph = skb->nh.iph;
- th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
- data = (char *)&th[1];
-
- /*
- * Hunt irc DCC string, the _shortest_:
- *
- * strlen("DCC CHAT chat AAAAAAAA P\x01\n")=26
- * strlen("DCC SCHAT chat AAAAAAAA P\x01\n")=27
- * strlen("DCC SEND F AAAAAAAA P S\x01\n")=25
- * strlen("DCC MOVE F AAAAAAAA P S\x01\n")=25
- * strlen("DCC TSEND F AAAAAAAA P S\x01\n")=26
- * AAAAAAAAA: bound addr (1.0.0.0==16777216, min 8 digits)
- * P: bound port (min 1 d )
- * F: filename (min 1 d )
- * S: size (min 1 d )
- * 0x01, \n: terminators
- */
-
- data_limit = skb->h.raw + skb->len;
-
- while (data < (data_limit - ( 21 + MAXMATCHLEN ) ) )
- {
- int i;
- if (memcmp(data,"DCC ",4)) {
- data ++;
- continue;
- }
-
- dcc_p = data;
- data += 4; /* point to DCC cmd */
-
- for(i=0; i<NUM_DCCPROTO; i++)
- {
- /*
- * go through the table and hunt a match string
- */
-
- if( memcmp(data, dccprotos[i].match, dccprotos[i].matchlen ) == 0 )
- {
- xtra_args = dccprotos[i].xtra_args;
- data += dccprotos[i].matchlen;
-
-			/*
-			 * skip next string.
-			 */
-
-			while (*data++ != ' ') {
-				/*
-				 * must still parse, at least, "AAAAAAAA P\x01\n",
-				 * 12 bytes left.
-				 */
-				if (data > (data_limit-12))
-					return 0;
-			}
-
-
- addr_beg_p = data;
-
- /*
- * client bound address in dec base
- */
-
- s_addr = simple_strtoul(data,&data,10);
- if (*data++ !=' ')
- continue;
-
- /*
- * client bound port in dec base
- */
-
- s_port = simple_strtoul(data,&data,10);
- addr_end_p = data;
-
- /*
- * should check args consistency?
- */
-
- while(xtra_args) {
- if (*data != ' ')
- break;
- data++;
- simple_strtoul(data,&data,10);
- xtra_args--;
- }
-
- if (xtra_args != 0) continue;
-
- /*
- * terminators.
- */
-
- if (data[0] != 0x01)
- continue;
- if (data[1]!='\r' && data[1]!='\n')
- continue;
-
- /*
- * Now create an masquerade entry for it
- * must set NO_DPORT and NO_DADDR because
- * connection is requested by another client.
- */
-
- n_ms = ip_masq_new(IPPROTO_TCP,
- maddr, 0,
- htonl(s_addr),htons(s_port),
- 0, 0,
- IP_MASQ_F_NO_DPORT|IP_MASQ_F_NO_DADDR);
- if (n_ms==NULL)
- return 0;
-
- /*
- * Replace the old "address port" with the new one
- */
-
- buf_len = sprintf(buf,"%lu %u",
- ntohl(n_ms->maddr),ntohs(n_ms->mport));
-
- /*
- * Calculate required delta-offset to keep TCP happy
- */
-
- diff = buf_len - (addr_end_p-addr_beg_p);
-
- *addr_beg_p = '\0';
- IP_MASQ_DEBUG(1-debug, "masq_irc_out(): '%s' %X:%X detected (diff=%d)\n", dcc_p, s_addr,s_port, diff);
-
- /*
- * No shift.
- */
-
- if (diff==0) {
- /*
- * simple case, just copy.
- */
- memcpy(addr_beg_p,buf,buf_len);
- } else {
-
- *skb_p = ip_masq_skb_replace(skb, GFP_ATOMIC,
- addr_beg_p, addr_end_p-addr_beg_p,
- buf, buf_len);
- }
- ip_masq_listen(n_ms);
- ip_masq_put(n_ms);
- return diff;
- }
- }
- }
- return 0;
-
-}
-
-/*
- * Main irc object
- * You need 1 object per port in case you need
- * to offer also other used irc ports (6665,6666,etc),
- * they will share methods but they need own space for
- * data.
- */
-
-struct ip_masq_app ip_masq_irc = {
- NULL, /* next */
- "irc", /* name */
- 0, /* type */
- 0, /* n_attach */
- masq_irc_init_1, /* init_1 */
- masq_irc_done_1, /* done_1 */
- masq_irc_out, /* pkt_out */
- NULL /* pkt_in */
-};
-
-/*
- * ip_masq_irc initialization
- */
-
-__initfunc(int ip_masq_irc_init(void))
-{
- int i, j;
-
- for (i=0; (i<MAX_MASQ_APP_PORTS); i++) {
- if (ports[i]) {
- if ((masq_incarnations[i] = kmalloc(sizeof(struct ip_masq_app),
- GFP_KERNEL)) == NULL)
- return -ENOMEM;
- memcpy(masq_incarnations[i], &ip_masq_irc, sizeof(struct ip_masq_app));
- if ((j = register_ip_masq_app(masq_incarnations[i],
- IPPROTO_TCP,
- ports[i]))) {
- return j;
- }
- IP_MASQ_DEBUG(1-debug,
- "Irc: loaded support on port[%d] = %d\n",
- i, ports[i]);
- } else {
- /* To be safe, force the incarnation table entry to NULL */
- masq_incarnations[i] = NULL;
- }
- }
- return 0;
-}
-
-/*
- * ip_masq_irc fin.
- */
-
-int ip_masq_irc_done(void)
-{
- int i, j, k;
-
- k=0;
- for (i=0; (i<MAX_MASQ_APP_PORTS); i++) {
- if (masq_incarnations[i]) {
- if ((j = unregister_ip_masq_app(masq_incarnations[i]))) {
- k = j;
- } else {
- kfree(masq_incarnations[i]);
- masq_incarnations[i] = NULL;
- IP_MASQ_DEBUG(1-debug, "Irc: unloaded support on port[%d] = %d\n",
- i, ports[i]);
- }
- }
- }
- return k;
-}
-
-
-#ifdef MODULE
-EXPORT_NO_SYMBOLS;
-
-int init_module(void)
-{
- if (ip_masq_irc_init() != 0)
- return -EIO;
- return 0;
-}
-
-void cleanup_module(void)
-{
-	if (ip_masq_irc_done() != 0)
-		printk(KERN_INFO "ip_masq_irc: can't remove module\n");
-}
-
-#endif /* MODULE */
diff --git a/net/ipv4/ip_masq_mfw.c b/net/ipv4/ip_masq_mfw.c
deleted file mode 100644
index db6d66d0a..000000000
--- a/net/ipv4/ip_masq_mfw.c
+++ /dev/null
@@ -1,769 +0,0 @@
-/*
- * IP_MASQ_MARKFW masquerading module
- *
- * Does (reverse-masq) forwarding based on skb->fwmark value
- *
- * $Id: ip_masq_mfw.c,v 1.4 1999/05/13 23:25:07 davem Exp $
- *
- * Author: Juan Jose Ciarlante <jjciarla@raiz.uncu.edu.ar>
- * based on Steven Clarke's portfw
- *
- * Fixes:
- * JuanJo Ciarlante: added u-space sched support
- * JuanJo Ciarlante: if rport==0, use packet dest port *grin*
- * JuanJo Ciarlante: fixed tcp syn&&!ack creation
- *
- *
- */
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/list.h>
-#include <net/ip.h>
-#include <linux/ip_fw.h>
-#include <linux/ip_masq.h>
-#include <net/ip_masq.h>
-#include <net/ip_masq_mod.h>
-#include <linux/proc_fs.h>
-#include <linux/init.h>
-#include <asm/softirq.h>
-#include <asm/spinlock.h>
-#include <asm/atomic.h>
-
-static struct ip_masq_mod *mmod_self = NULL;
-#ifdef CONFIG_IP_MASQ_DEBUG
-static int debug=0;
-MODULE_PARM(debug, "i");
-#endif
-
-/*
- * Lists structure:
- * There is a "main" linked list with entries hashed
- * by fwmark value (struct ip_masq_mfw, the "m-entries").
- *
- * Each of this m-entry holds a double linked list
- * of "forward-to" hosts (struct ip_masq_mfw_host, the "m.host"),
- * the round-robin scheduling takes place by rotating m.host entries
- * "inside" its m-entry.
- */
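-
-/*
- * Rough sketch (illustrative):
- *
- *	ip_masq_mfw_table[hash] --> m-entry(fwmark=2) --> m-entry(fwmark=18)
- *	                               |
- *	                               +-> m.host a:p <-> m.host b:q  (ring)
- *
- * A lookup hashes the fwmark to an m-entry, then rotates that
- * entry's m.host ring for round-robin forwarding.
- */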
-
-/*
- * Each forwarded host (addr:port) is stored here
- */
-struct ip_masq_mfw_host {
- struct list_head list;
- __u32 addr;
- __u16 port;
- __u16 pad0;
- __u32 fwmark;
- int pref;
- atomic_t pref_cnt;
-};
-
-#define IP_MASQ_MFW_HSIZE 16
-/*
- * This entries are indexed by fwmark,
- * they hold a list of forwarded addr:port
- */
-
-struct ip_masq_mfw {
- struct ip_masq_mfw *next; /* linked list */
- __u32 fwmark; /* key: firewall mark */
- struct list_head hosts; /* list of forward-to hosts */
- atomic_t nhosts; /* number of "" */
- rwlock_t lock;
-};
-
-
-static DECLARE_MUTEX(mfw_sema);
-static rwlock_t mfw_lock = RW_LOCK_UNLOCKED;
-
-static struct ip_masq_mfw *ip_masq_mfw_table[IP_MASQ_MFW_HSIZE];
-
-static __inline__ int mfw_hash_val(int fwmark)
-{
- return fwmark & 0x0f;
-}
-
-/*
- * Get m-entry by "fwmark"
- * Caller must lock tables.
- */
-
-static struct ip_masq_mfw *__mfw_get(int fwmark)
-{
- struct ip_masq_mfw* mfw;
- int hash = mfw_hash_val(fwmark);
-
- for (mfw=ip_masq_mfw_table[hash];mfw;mfw=mfw->next) {
- if (mfw->fwmark==fwmark) {
- goto out;
- }
- }
-out:
- return mfw;
-}
-
-/*
- * Links m-entry.
- * Caller should have checked if already present for same fwmark
- *
- * Caller must lock tables.
- */
-static int __mfw_add(struct ip_masq_mfw *mfw)
-{
- int fwmark = mfw->fwmark;
- int hash = mfw_hash_val(fwmark);
-
- mfw->next = ip_masq_mfw_table[hash];
- ip_masq_mfw_table[hash] = mfw;
- ip_masq_mod_inc_nent(mmod_self);
-
- return 0;
-}
-
-/*
- * Creates a m-entry (doesn't link it)
- */
-
-static struct ip_masq_mfw * mfw_new(int fwmark)
-{
- struct ip_masq_mfw *mfw;
-
- mfw = kmalloc(sizeof(*mfw), GFP_KERNEL);
- if (mfw == NULL)
- goto out;
-
- MOD_INC_USE_COUNT;
- memset(mfw, 0, sizeof(*mfw));
- mfw->fwmark = fwmark;
- mfw->lock = (rwlock_t) RW_LOCK_UNLOCKED;
-
- INIT_LIST_HEAD(&mfw->hosts);
-out:
- return mfw;
-}
-
-static void mfw_host_to_user(struct ip_masq_mfw_host *h, struct ip_mfw_user *mu)
-{
- mu->raddr = h->addr;
- mu->rport = h->port;
- mu->fwmark = h->fwmark;
- mu->pref = h->pref;
-}
-
-/*
- * Creates a m.host (doesn't link it in a m-entry)
- */
-static struct ip_masq_mfw_host * mfw_host_new(struct ip_mfw_user *mu)
-{
- struct ip_masq_mfw_host * mfw_host;
- mfw_host = kmalloc(sizeof (*mfw_host), GFP_KERNEL);
- if (!mfw_host)
- return NULL;
-
- MOD_INC_USE_COUNT;
- memset(mfw_host, 0, sizeof(*mfw_host));
- mfw_host->addr = mu->raddr;
- mfw_host->port = mu->rport;
- mfw_host->fwmark = mu->fwmark;
- mfw_host->pref = mu->pref;
- atomic_set(&mfw_host->pref_cnt, mu->pref);
-
- return mfw_host;
-}
-
-/*
- * Create AND link m.host to m-entry.
- * It locks m.lock.
- */
-static int mfw_addhost(struct ip_masq_mfw *mfw, struct ip_mfw_user *mu, int attail)
-{
- struct ip_masq_mfw_host *mfw_host;
-
- mfw_host = mfw_host_new(mu);
- if (!mfw_host)
- return -ENOMEM;
-
- write_lock_bh(&mfw->lock);
- list_add(&mfw_host->list, attail? mfw->hosts.prev : &mfw->hosts);
- atomic_inc(&mfw->nhosts);
- write_unlock_bh(&mfw->lock);
-
- return 0;
-}
-
-/*
- * Unlink AND destroy m.host(s) from m-entry.
- * Wildcard (zero addr or port) ok.
- * It uses m.lock.
- */
-static int mfw_delhost(struct ip_masq_mfw *mfw, struct ip_mfw_user *mu)
-{
-
- struct list_head *l,*e;
- struct ip_masq_mfw_host *h;
- int n_del = 0;
- l = &mfw->hosts;
-
- write_lock_bh(&mfw->lock);
- for (e=l->next; e!=l; e=e->next)
- {
- h = list_entry(e, struct ip_masq_mfw_host, list);
- if ((!mu->raddr || h->addr == mu->raddr) &&
- (!mu->rport || h->port == mu->rport)) {
- /* HIT */
- atomic_dec(&mfw->nhosts);
- list_del(&h->list);
- kfree_s(h, sizeof(*h));
- MOD_DEC_USE_COUNT;
- n_del++;
- }
-
- }
- write_unlock_bh(&mfw->lock);
- return n_del? 0 : -ESRCH;
-}
-
-/*
- * Changes m.host parameters
- * Wildcards ok
- *
- * Caller must lock tables.
- */
-static int __mfw_edithost(struct ip_masq_mfw *mfw, struct ip_mfw_user *mu)
-{
-
- struct list_head *l,*e;
- struct ip_masq_mfw_host *h;
- int n_edit = 0;
- l = &mfw->hosts;
-
- for (e=l->next; e!=l; e=e->next)
- {
- h = list_entry(e, struct ip_masq_mfw_host, list);
- if ((!mu->raddr || h->addr == mu->raddr) &&
- (!mu->rport || h->port == mu->rport)) {
- /* HIT */
- h->pref = mu->pref;
- atomic_set(&h->pref_cnt, mu->pref);
- n_edit++;
- }
-
- }
- return n_edit? 0 : -ESRCH;
-}
-
-/*
- * Destroys m-entry.
- * Caller must have checked that it doesn't hold any m.host(s)
- */
-static void mfw_destroy(struct ip_masq_mfw *mfw)
-{
- kfree_s(mfw, sizeof(*mfw));
- MOD_DEC_USE_COUNT;
-}
-
-/*
- * Unlink m-entry.
- *
- * Caller must lock tables.
- */
-static int __mfw_del(struct ip_masq_mfw *mfw)
-{
- struct ip_masq_mfw **mfw_p;
- int ret = -EINVAL;
-
-
- for(mfw_p=&ip_masq_mfw_table[mfw_hash_val(mfw->fwmark)];
- *mfw_p;
- mfw_p = &((*mfw_p)->next))
- {
- if (mfw==(*mfw_p)) {
- *mfw_p = mfw->next;
- ip_masq_mod_dec_nent(mmod_self);
- ret = 0;
- goto out;
- }
- }
-out:
- return ret;
-}
-
-/*
- * Crude m.host scheduler
- * This interface could be exported to allow playing with
- * other sched policies.
- *
- * Caller must lock m-entry.
- */
-static struct ip_masq_mfw_host * __mfw_sched(struct ip_masq_mfw *mfw, int force)
-{
- struct ip_masq_mfw_host *h = NULL;
-
- if (atomic_read(&mfw->nhosts) == 0)
- goto out;
-
- /*
- * Here resides actual sched policy:
- * When pref_cnt touches 0, entry gets shifted to tail and
- * its pref_cnt reloaded from h->pref (actual value
- * passed from u-space).
- *
- * Exception is pref==0: avoid scheduling.
- */
-
- h = list_entry(mfw->hosts.next, struct ip_masq_mfw_host, list);
-
- if (atomic_read(&mfw->nhosts) <= 1)
- goto out;
-
- if ((h->pref && atomic_dec_and_test(&h->pref_cnt)) || force) {
- atomic_set(&h->pref_cnt, h->pref);
- list_del(&h->list);
- list_add(&h->list, mfw->hosts.prev);
- }
-out:
- return h;
-}
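-
-/*
- * Worked example (illustrative): two hosts A (pref 2) and B (pref 1)
- * yield the pick sequence A A B A A B ...; whenever a host's
- * pref_cnt reaches 0 it is reloaded from pref and the host is moved
- * to the tail.
- */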
-
-/*
- * Main lookup routine.
- * HITs fwmark and schedules m.host entries if required
- */
-static struct ip_masq_mfw_host * mfw_lookup(int fwmark)
-{
- struct ip_masq_mfw *mfw;
- struct ip_masq_mfw_host *h = NULL;
-
- read_lock(&mfw_lock);
- mfw = __mfw_get(fwmark);
-
- if (mfw) {
- write_lock(&mfw->lock);
- h = __mfw_sched(mfw, 0);
- write_unlock(&mfw->lock);
- }
-
- read_unlock(&mfw_lock);
- return h;
-}
-
-#ifdef CONFIG_PROC_FS
-static int mfw_procinfo(char *buffer, char **start, off_t offset,
- int length, int dummy)
-{
- struct ip_masq_mfw *mfw;
- struct ip_masq_mfw_host *h;
- struct list_head *l,*e;
- off_t pos=0, begin;
- char temp[129];
- int idx = 0;
- int len=0;
-
- MOD_INC_USE_COUNT;
-
-	IP_MASQ_DEBUG(1-debug, "Entered mfw_procinfo\n");
-
- if (offset < 64)
- {
- sprintf(temp, "FwMark > RAddr RPort PrCnt Pref");
- len = sprintf(buffer, "%-63s\n", temp);
- }
- pos = 64;
-
- for(idx = 0; idx < IP_MASQ_MFW_HSIZE; idx++)
- {
- read_lock(&mfw_lock);
- for(mfw = ip_masq_mfw_table[idx]; mfw ; mfw = mfw->next)
- {
- read_lock_bh(&mfw->lock);
- l=&mfw->hosts;
-
- for(e=l->next;l!=e;e=e->next) {
- h = list_entry(e, struct ip_masq_mfw_host, list);
- pos += 64;
- if (pos <= offset) {
- len = 0;
- continue;
- }
-
- sprintf(temp,"0x%x > %08lX %5u %5d %5d",
- h->fwmark,
- ntohl(h->addr), ntohs(h->port),
- atomic_read(&h->pref_cnt), h->pref);
- len += sprintf(buffer+len, "%-63s\n", temp);
-
- if(len >= length) {
- read_unlock_bh(&mfw->lock);
- read_unlock(&mfw_lock);
- goto done;
- }
- }
- read_unlock_bh(&mfw->lock);
- }
- read_unlock(&mfw_lock);
- }
-
-done:
-
- if (len) {
- begin = len - (pos - offset);
- *start = buffer + begin;
- len -= begin;
- }
- if(len>length)
- len = length;
- MOD_DEC_USE_COUNT;
- return len;
-}
-static struct proc_dir_entry mfw_proc_entry = {
-/* 0, 0, NULL, */
- 0, 3, "mfw",
- S_IFREG | S_IRUGO, 1, 0, 0,
- 0, &proc_net_inode_operations,
- mfw_procinfo
-};
-
-#define proc_ent &mfw_proc_entry
-#else /* !CONFIG_PROC_FS */
-
-#define proc_ent NULL
-#endif
-
-
-static void mfw_flush(void)
-{
- struct ip_masq_mfw *mfw, *local_table[IP_MASQ_MFW_HSIZE];
- struct ip_masq_mfw_host *h;
- struct ip_masq_mfw *mfw_next;
- int idx;
- struct list_head *l,*e;
-
- write_lock_bh(&mfw_lock);
- memcpy(local_table, ip_masq_mfw_table, sizeof ip_masq_mfw_table);
- memset(ip_masq_mfw_table, 0, sizeof ip_masq_mfw_table);
- write_unlock_bh(&mfw_lock);
-
- /*
- * For every hash table row ...
- */
- for(idx=0;idx<IP_MASQ_MFW_HSIZE;idx++) {
-
- /*
- * For every m-entry in row ...
- */
- for(mfw=local_table[idx];mfw;mfw=mfw_next) {
- /*
- * For every m.host in m-entry ...
- */
- l=&mfw->hosts;
- while((e=l->next) != l) {
- h = list_entry(e, struct ip_masq_mfw_host, list);
- atomic_dec(&mfw->nhosts);
- list_del(&h->list);
- kfree_s(h, sizeof(*h));
- MOD_DEC_USE_COUNT;
- }
-
- if (atomic_read(&mfw->nhosts)) {
- IP_MASQ_ERR("mfw_flush(): after flushing row nhosts=%d\n",
- atomic_read(&mfw->nhosts));
- }
- mfw_next = mfw->next;
- kfree_s(mfw, sizeof(*mfw));
- MOD_DEC_USE_COUNT;
- ip_masq_mod_dec_nent(mmod_self);
- }
- }
-}
-
-/*
- * User space control entry point
- */
-static int mfw_ctl(int optname, struct ip_masq_ctl *mctl, int optlen)
-{
- struct ip_mfw_user *mu = &mctl->u.mfw_user;
- struct ip_masq_mfw *mfw;
-	int ret = -EINVAL;
- int arglen = optlen - IP_MASQ_CTL_BSIZE;
- int cmd;
-
-
-	IP_MASQ_DEBUG(1-debug, "ip_masq_mfw_ctl(len=%d/%d|%d/%d)\n",
- arglen,
- sizeof (*mu),
- optlen,
- sizeof (*mctl));
-
- /*
- * checks ...
- */
- if (arglen != sizeof(*mu) && optlen != sizeof(*mctl))
- return -EINVAL;
-
- /*
- * Don't trust the lusers - plenty of error checking!
- */
- cmd = mctl->m_cmd;
- IP_MASQ_DEBUG(1-debug, "ip_masq_mfw_ctl(cmd=%d, fwmark=%d)\n",
- cmd, mu->fwmark);
-
-
- switch(cmd) {
- case IP_MASQ_CMD_NONE:
- return 0;
- case IP_MASQ_CMD_FLUSH:
- break;
- case IP_MASQ_CMD_ADD:
- case IP_MASQ_CMD_INSERT:
- case IP_MASQ_CMD_SET:
- if (mu->fwmark == 0) {
- IP_MASQ_DEBUG(1-debug, "invalid fwmark==0\n");
- return -EINVAL;
- }
- if (mu->pref < 0) {
- IP_MASQ_DEBUG(1-debug, "invalid pref==%d\n",
- mu->pref);
- return -EINVAL;
- }
- break;
- }
-
-
- ret = -EINVAL;
-
- switch(cmd) {
- case IP_MASQ_CMD_ADD:
- case IP_MASQ_CMD_INSERT:
- if (!mu->raddr) {
- IP_MASQ_DEBUG(0-debug, "ip_masq_mfw_ctl(ADD): invalid redirect 0x%x:%d\n",
- mu->raddr, mu->rport);
- goto out;
- }
-
- /*
- * Cannot just use mfw_lock because below
- * are allocations that can sleep; so
- * to assure "new entry" atomic creation
- * I use a semaphore.
- *
- */
- down(&mfw_sema);
-
- read_lock(&mfw_lock);
- mfw = __mfw_get(mu->fwmark);
- read_unlock(&mfw_lock);
-
- /*
- * If first host, create m-entry
- */
- if (mfw == NULL) {
- mfw = mfw_new(mu->fwmark);
- if (mfw == NULL)
- ret = -ENOMEM;
- }
-
- if (mfw) {
- /*
- * Put m.host in m-entry.
- */
- ret = mfw_addhost(mfw, mu, cmd == IP_MASQ_CMD_ADD);
-
- /*
- * If first host, link m-entry to hash table.
- * Already protected by global lock.
- */
- if (ret == 0 && atomic_read(&mfw->nhosts) == 1) {
- write_lock_bh(&mfw_lock);
- __mfw_add(mfw);
- write_unlock_bh(&mfw_lock);
- }
- if (atomic_read(&mfw->nhosts) == 0) {
- mfw_destroy(mfw);
- }
- }
-
- up(&mfw_sema);
-
- break;
-
- case IP_MASQ_CMD_DEL:
- down(&mfw_sema);
-
- read_lock(&mfw_lock);
- mfw = __mfw_get(mu->fwmark);
- read_unlock(&mfw_lock);
-
- if (mfw) {
- ret = mfw_delhost(mfw, mu);
-
- /*
- * Last lease will free
- * XXX check logic XXX
- */
- if (atomic_read(&mfw->nhosts) == 0) {
- write_lock_bh(&mfw_lock);
- __mfw_del(mfw);
- write_unlock_bh(&mfw_lock);
- mfw_destroy(mfw);
- }
- } else
- ret = -ESRCH;
-
- up(&mfw_sema);
- break;
- case IP_MASQ_CMD_FLUSH:
-
- down(&mfw_sema);
- mfw_flush();
- up(&mfw_sema);
- ret = 0;
- break;
- case IP_MASQ_CMD_SET:
- /*
- * No need to semaphorize here, main list is not
- * modified.
- */
- read_lock(&mfw_lock);
-
- mfw = __mfw_get(mu->fwmark);
- if (mfw) {
- write_lock_bh(&mfw->lock);
-
- if (mu->flags & IP_MASQ_MFW_SCHED) {
- struct ip_masq_mfw_host *h;
- if ((h=__mfw_sched(mfw, 1))) {
- mfw_host_to_user(h, mu);
- ret = 0;
- }
- } else {
- ret = __mfw_edithost(mfw, mu);
- }
-
- write_unlock_bh(&mfw->lock);
- }
-
- read_unlock(&mfw_lock);
- break;
- }
-out:
-
- return ret;
-}
-
-/*
- * Module stubs called from ip_masq core module
- */
-
-/*
- * Input rule stub, called very early for each incoming packet,
- * to see if this module has "interest" in packet.
- */
-static int mfw_in_rule(const struct sk_buff *skb, const struct iphdr *iph)
-{
- int val;
- read_lock(&mfw_lock);
- val = ( __mfw_get(skb->fwmark) != 0);
- read_unlock(&mfw_lock);
- return val;
-}
-
-/*
- * Input-create stub, called to allow "custom" masq creation
- */
-static struct ip_masq * mfw_in_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr)
-{
- union ip_masq_tphdr tph;
- struct ip_masq *ms = NULL;
- struct ip_masq_mfw_host *h = NULL;
-
- tph.raw = (char*) iph + iph->ihl * 4;
-
- switch (iph->protocol) {
- case IPPROTO_TCP:
- /*
- * Only open TCP tunnel if SYN+!ACK packet
- */
- if (!tph.th->syn && tph.th->ack)
-			return NULL;
-		/* fall through */
-	case IPPROTO_UDP:
- break;
- default:
- return NULL;
- }
-
- /*
- * If no entry exists in the masquerading table
- * and the port is involved
- * in port forwarding, create a new masq entry
- */
-
- if ((h=mfw_lookup(skb->fwmark))) {
- ms = ip_masq_new(iph->protocol,
- iph->daddr, tph.portp[1],
- /* if no redir-port, use packet dest port */
- h->addr, h->port? h->port : tph.portp[1],
- iph->saddr, tph.portp[0],
- 0);
-
- if (ms != NULL)
- ip_masq_listen(ms);
- }
- return ms;
-}
-
-
-#define mfw_in_update NULL
-#define mfw_out_rule NULL
-#define mfw_out_create NULL
-#define mfw_out_update NULL
-
-static struct ip_masq_mod mfw_mod = {
- NULL, /* next */
- NULL, /* next_reg */
- "mfw", /* name */
- ATOMIC_INIT(0), /* nent */
- ATOMIC_INIT(0), /* refcnt */
- proc_ent,
- mfw_ctl,
- NULL, /* masq_mod_init */
- NULL, /* masq_mod_done */
- mfw_in_rule,
- mfw_in_update,
- mfw_in_create,
- mfw_out_rule,
- mfw_out_update,
- mfw_out_create,
-};
-
-
-__initfunc(int ip_mfw_init(void))
-{
- return register_ip_masq_mod ((mmod_self=&mfw_mod));
-}
-
-int ip_mfw_done(void)
-{
- return unregister_ip_masq_mod(&mfw_mod);
-}
-
-#ifdef MODULE
-EXPORT_NO_SYMBOLS;
-
-int init_module(void)
-{
- if (ip_mfw_init() != 0)
- return -EIO;
- return 0;
-}
-
-void cleanup_module(void)
-{
-	if (ip_mfw_done() != 0)
-		printk(KERN_INFO "ip_masq_mfw: can't remove module\n");
-}
-
-#endif /* MODULE */
diff --git a/net/ipv4/ip_masq_mod.c b/net/ipv4/ip_masq_mod.c
deleted file mode 100644
index 654ab9ff7..000000000
--- a/net/ipv4/ip_masq_mod.c
+++ /dev/null
@@ -1,322 +0,0 @@
-/*
- * IP_MASQ_MOD masq modules support
- *
- *
- * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
- *
- * $Id: ip_masq_mod.c,v 1.5 1998/08/29 23:51:09 davem Exp $
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Changes:
- * Cyrus Durgin: fixed kerneld stuff for kmod.
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <net/ip_masq.h>
-#include <net/ip_masq_mod.h>
-
-#include <linux/ip_masq.h>
-#ifdef CONFIG_KMOD
-#include <linux/kmod.h>
-#endif
-
-EXPORT_SYMBOL(register_ip_masq_mod);
-EXPORT_SYMBOL(unregister_ip_masq_mod);
-EXPORT_SYMBOL(ip_masq_mod_lkp_link);
-EXPORT_SYMBOL(ip_masq_mod_lkp_unlink);
-
-static spinlock_t masq_mod_lock = SPIN_LOCK_UNLOCKED;
-
-/*
- * Base pointer for registered modules
- */
-struct ip_masq_mod * ip_masq_mod_reg_base = NULL;
-
-/*
- * Base pointer for lookup (subset of above, a module could be
- * registered, but it could have no active rule); will avoid
- * unnecessary lookups.
- */
-struct ip_masq_mod * ip_masq_mod_lkp_base = NULL;
-
-int ip_masq_mod_register_proc(struct ip_masq_mod *mmod)
-{
-#ifdef CONFIG_PROC_FS
- int ret;
-
- struct proc_dir_entry *ent = mmod->mmod_proc_ent;
-
- if (!ent)
- return 0;
- if (!ent->name) {
- ent->name = mmod->mmod_name;
- ent->namelen = strlen (mmod->mmod_name);
- }
- ret = ip_masq_proc_register(ent);
- if (ret) mmod->mmod_proc_ent = NULL;
-
- return ret;
-#else
- return 0;
-#endif
-}
-
-void ip_masq_mod_unregister_proc(struct ip_masq_mod *mmod)
-{
-#ifdef CONFIG_PROC_FS
- struct proc_dir_entry *ent = mmod->mmod_proc_ent;
- if (!ent)
- return;
- ip_masq_proc_unregister(ent);
-#endif
-}
-
-/*
- * Link/unlink object for lookups
- */
-
-int ip_masq_mod_lkp_unlink(struct ip_masq_mod *mmod)
-{
- struct ip_masq_mod **mmod_p;
-
- write_lock_bh(&masq_mod_lock);
-
- for (mmod_p = &ip_masq_mod_lkp_base; *mmod_p ; mmod_p = &(*mmod_p)->next)
- if (mmod == (*mmod_p)) {
- *mmod_p = mmod->next;
- mmod->next = NULL;
- write_unlock_bh(&masq_mod_lock);
- return 0;
- }
-
- write_unlock_bh(&masq_mod_lock);
- return -EINVAL;
-}
-
-int ip_masq_mod_lkp_link(struct ip_masq_mod *mmod)
-{
- write_lock_bh(&masq_mod_lock);
-
- mmod->next = ip_masq_mod_lkp_base;
- ip_masq_mod_lkp_base=mmod;
-
- write_unlock_bh(&masq_mod_lock);
- return 0;
-}
-
-int register_ip_masq_mod(struct ip_masq_mod *mmod)
-{
- if (!mmod) {
- IP_MASQ_ERR("register_ip_masq_mod(): NULL arg\n");
- return -EINVAL;
- }
- if (!mmod->mmod_name) {
- IP_MASQ_ERR("register_ip_masq_mod(): NULL mmod_name\n");
- return -EINVAL;
- }
- ip_masq_mod_register_proc(mmod);
-
- mmod->next_reg = ip_masq_mod_reg_base;
- ip_masq_mod_reg_base=mmod;
-
- return 0;
-}
-
-int unregister_ip_masq_mod(struct ip_masq_mod *mmod)
-{
- struct ip_masq_mod **mmod_p;
-
- if (!mmod) {
- IP_MASQ_ERR( "unregister_ip_masq_mod(): NULL arg\n");
- return -EINVAL;
- }
-
- /*
- * Only allow unregistration if it is not referenced
- */
- if (atomic_read(&mmod->refcnt)) {
- IP_MASQ_ERR( "unregister_ip_masq_mod(): is in use by %d guys. failed\n",
- atomic_read(&mmod->refcnt));
- return -EINVAL;
- }
-
- /*
- * Must be already unlinked from lookup list
- */
- if (mmod->next) {
-		IP_MASQ_WARNING("MASQ: unregistering \"%s\" while in lookup list. Fixed.\n",
-			mmod->mmod_name);
- ip_masq_mod_lkp_unlink(mmod);
- }
-
- for (mmod_p = &ip_masq_mod_reg_base; *mmod_p ; mmod_p = &(*mmod_p)->next_reg)
- if (mmod == (*mmod_p)) {
- ip_masq_mod_unregister_proc(mmod);
- *mmod_p = mmod->next_reg;
- return 0;
- }
-
-	IP_MASQ_ERR("unregister_ip_masq_mod(%s): not linked\n", mmod->mmod_name);
- return -EINVAL;
-}
-
-int ip_masq_mod_in_rule(const struct sk_buff *skb, const struct iphdr *iph)
-{
- struct ip_masq_mod *mmod;
- int ret = IP_MASQ_MOD_NOP;
-
- for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) {
- if (!mmod->mmod_in_rule) continue;
- switch (ret=mmod->mmod_in_rule(skb, iph)) {
- case IP_MASQ_MOD_NOP:
- continue;
- case IP_MASQ_MOD_ACCEPT:
- case IP_MASQ_MOD_REJECT:
- goto out;
- }
- }
-out:
- return ret;
-}
-
-int ip_masq_mod_out_rule(const struct sk_buff *skb, const struct iphdr *iph)
-{
- struct ip_masq_mod *mmod;
- int ret = IP_MASQ_MOD_NOP;
-
- for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) {
- if (!mmod->mmod_out_rule) continue;
- switch (ret=mmod->mmod_out_rule(skb, iph)) {
- case IP_MASQ_MOD_NOP:
- continue;
- case IP_MASQ_MOD_ACCEPT:
- case IP_MASQ_MOD_REJECT:
- goto out;
- }
- }
-out:
- return ret;
-}
-
-struct ip_masq * ip_masq_mod_in_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr)
-{
- struct ip_masq_mod *mmod;
- struct ip_masq *ms = NULL;
-
- for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) {
- if (!mmod->mmod_in_create) continue;
- if ((ms=mmod->mmod_in_create(skb, iph, maddr))) {
- goto out;
- }
- }
-out:
- return ms;
-}
-
-struct ip_masq * ip_masq_mod_out_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr)
-{
- struct ip_masq_mod *mmod;
- struct ip_masq *ms = NULL;
-
- for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) {
- if (!mmod->mmod_out_create) continue;
- if ((ms=mmod->mmod_out_create(skb, iph, maddr))) {
- goto out;
- }
- }
-out:
- return ms;
-}
-
-int ip_masq_mod_in_update(const struct sk_buff *skb, const struct iphdr *iph, struct ip_masq *ms)
-{
- struct ip_masq_mod *mmod;
- int ret = IP_MASQ_MOD_NOP;
-
- for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) {
- if (!mmod->mmod_in_update) continue;
- switch (ret=mmod->mmod_in_update(skb, iph, ms)) {
- case IP_MASQ_MOD_NOP:
- continue;
- case IP_MASQ_MOD_ACCEPT:
- case IP_MASQ_MOD_REJECT:
- goto out;
- }
- }
-out:
- return ret;
-}
-
-int ip_masq_mod_out_update(const struct sk_buff *skb, const struct iphdr *iph, struct ip_masq *ms)
-{
- struct ip_masq_mod *mmod;
- int ret = IP_MASQ_MOD_NOP;
-
- for (mmod=ip_masq_mod_lkp_base;mmod;mmod=mmod->next) {
- if (!mmod->mmod_out_update) continue;
- switch (ret=mmod->mmod_out_update(skb, iph, ms)) {
- case IP_MASQ_MOD_NOP:
- continue;
- case IP_MASQ_MOD_ACCEPT:
- case IP_MASQ_MOD_REJECT:
- goto out;
- }
- }
-out:
- return ret;
-}
-
-struct ip_masq_mod * ip_masq_mod_getbyname(const char *mmod_name)
-{
- struct ip_masq_mod * mmod;
-
- IP_MASQ_DEBUG(1, "searching mmod_name \"%s\"\n", mmod_name);
-
- for (mmod=ip_masq_mod_reg_base; mmod ; mmod=mmod->next_reg) {
- if (mmod->mmod_ctl && *(mmod_name)
- && (strcmp(mmod_name, mmod->mmod_name)==0)) {
- /* HIT */
- return mmod;
- }
- }
- return NULL;
-}
-
-/*
- * Module control entry
- */
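-/*
- * Example (illustrative): a ctl whose m_tname is "portfw" is handed
- * to the registered "portfw" module; under CONFIG_KMOD a missing
- * module is first autoloaded via request_module("ip_masq_portfw").
- */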
-int ip_masq_mod_ctl(int optname, struct ip_masq_ctl *mctl, int optlen)
-{
- struct ip_masq_mod * mmod;
-#ifdef CONFIG_KMOD
- char kmod_name[IP_MASQ_TNAME_MAX+8];
-#endif
-	/* stopper: force NUL termination of the table name */
- mctl->m_tname[IP_MASQ_TNAME_MAX-1] = 0;
-
- mmod = ip_masq_mod_getbyname(mctl->m_tname);
- if (mmod)
- return mmod->mmod_ctl(optname, mctl, optlen);
-#ifdef CONFIG_KMOD
- sprintf(kmod_name,"ip_masq_%s", mctl->m_tname);
-
- IP_MASQ_DEBUG(1, "About to request \"%s\" module\n", kmod_name);
-
- /*
-	 * Let it sleep for a while ...
- */
- request_module(kmod_name);
- mmod = ip_masq_mod_getbyname(mctl->m_tname);
- if (mmod)
- return mmod->mmod_ctl(optname, mctl, optlen);
-#endif
- return ESRCH;
-}
diff --git a/net/ipv4/ip_masq_portfw.c b/net/ipv4/ip_masq_portfw.c
deleted file mode 100644
index 91e1b726d..000000000
--- a/net/ipv4/ip_masq_portfw.c
+++ /dev/null
@@ -1,508 +0,0 @@
-/*
- * IP_MASQ_PORTFW masquerading module
- *
- *
- * $Id: ip_masq_portfw.c,v 1.3 1998/12/08 05:42:12 davem Exp $
- *
- * Author: Steven Clarke <steven.clarke@monmouth.demon.co.uk>
- *
- * Fixes:
- * Juan Jose Ciarlante : created this new file from ip_masq.c and ip_fw.c
- * Juan Jose Ciarlante : modularized
- * Juan Jose Ciarlante : use GFP_KERNEL
- * Juan Jose Ciarlante : locking
- *
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/list.h>
-#include <net/ip.h>
-#include <linux/ip_fw.h>
-#include <linux/ip_masq.h>
-#include <net/ip_masq.h>
-#include <net/ip_masq_mod.h>
-#include <linux/proc_fs.h>
-#include <linux/init.h>
-
-#define IP_PORTFW_PORT_MIN 1
-#define IP_PORTFW_PORT_MAX 60999
-
-struct ip_portfw {
- struct list_head list;
- __u32 laddr, raddr;
- __u16 lport, rport;
- atomic_t pref_cnt; /* pref "counter" down to 0 */
- int pref; /* user set pref */
-};
-
-static struct ip_masq_mod *mmod_self = NULL;
-/*
- * Debug level
- */
-#ifdef CONFIG_IP_MASQ_DEBUG
-static int debug=0;
-MODULE_PARM(debug, "i");
-#endif
-
-/*
- * Lock
- */
-static spinlock_t portfw_lock = SPIN_LOCK_UNLOCKED;
-
-static struct list_head portfw_list[2];
-static __inline__ int portfw_idx(int protocol)
-{
- return (protocol==IPPROTO_TCP);
-}
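-
-/*
- * E.g. portfw_idx(IPPROTO_TCP) == 1 and portfw_idx(IPPROTO_UDP) == 0,
- * selecting the per-protocol list in portfw_list[].
- */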
-
-/*
- *
- * Delete forwarding entry(s):
- * called from _DEL, u-space.
- * . "relaxed" match, except for lport
- *
- */
-
-static __inline__ int ip_portfw_del(__u16 protocol, __u16 lport, __u32 laddr, __u16 rport, __u32 raddr)
-{
- int prot = portfw_idx(protocol);
- struct ip_portfw *n;
- struct list_head *entry;
- struct list_head *list = &portfw_list[prot];
- int nent;
-
- nent = atomic_read(&mmod_self->mmod_nent);
-
- write_lock_bh(&portfw_lock);
-
- for (entry=list->next;entry != list;entry = entry->next) {
- n = list_entry(entry, struct ip_portfw, list);
- if (n->lport == lport &&
- (!laddr || n->laddr == laddr) &&
- (!raddr || n->raddr == raddr) &&
- (!rport || n->rport == rport)) {
- list_del(entry);
- ip_masq_mod_dec_nent(mmod_self);
- kfree_s(n, sizeof(struct ip_portfw));
- MOD_DEC_USE_COUNT;
- }
- }
- write_unlock_bh(&portfw_lock);
-
- return nent==atomic_read(&mmod_self->mmod_nent)? ESRCH : 0;
-}
-
-/*
- * Flush tables
- * called from _FLUSH, u-space.
- */
-static __inline__ void ip_portfw_flush(void)
-{
- int prot;
- struct list_head *l;
- struct list_head *e;
- struct ip_portfw *n;
-
- write_lock_bh(&portfw_lock);
-
- for (prot = 0; prot < 2;prot++) {
- l = &portfw_list[prot];
- while((e=l->next) != l) {
- ip_masq_mod_dec_nent(mmod_self);
- n = list_entry (e, struct ip_portfw, list);
- list_del(e);
- kfree_s(n, sizeof (*n));
- MOD_DEC_USE_COUNT;
- }
- }
-
- write_unlock_bh(&portfw_lock);
-}
-
-/*
- * Lookup routine for lport,laddr match
- * must be called with locked tables
- */
-static __inline__ struct ip_portfw *ip_portfw_lookup(__u16 protocol, __u16 lport, __u32 laddr, __u32 *daddr_p, __u16 *dport_p)
-{
- int prot = portfw_idx(protocol);
-
- struct ip_portfw *n = NULL;
- struct list_head *l, *e;
-
- l = &portfw_list[prot];
-
- for (e=l->next;e!=l;e=e->next) {
- n = list_entry(e, struct ip_portfw, list);
- if (lport == n->lport && laddr == n->laddr) {
- /* Please be nice, don't pass only a NULL dport */
- if (daddr_p) {
- *daddr_p = n->raddr;
- *dport_p = n->rport;
- }
-
- goto out;
- }
- }
- n = NULL;
-out:
- return n;
-}
-
-/*
- * Edit routine for lport,[laddr], [raddr], [rport] match
- * By now, only called from u-space
- */
-static __inline__ int ip_portfw_edit(__u16 protocol, __u16 lport, __u32 laddr, __u16 rport, __u32 raddr, int pref)
-{
- int prot = portfw_idx(protocol);
-
- struct ip_portfw *n = NULL;
- struct list_head *l, *e;
- int count = 0;
-
-
- read_lock_bh(&portfw_lock);
-
- l = &portfw_list[prot];
-
- for (e=l->next;e!=l;e=e->next) {
- n = list_entry(e, struct ip_portfw, list);
- if (lport == n->lport &&
- (!laddr || laddr == n->laddr) &&
- (!rport || rport == n->rport) &&
- (!raddr || raddr == n->raddr)) {
- n->pref = pref;
- atomic_set(&n->pref_cnt, pref);
- count++;
- }
- }
-
- read_unlock_bh(&portfw_lock);
-
- return count;
-}
-
-/*
- * Add/edit an entry
- * called from _ADD, u-space.
- * must return 0 or +errno
- */
-static __inline__ int ip_portfw_add(__u16 protocol, __u16 lport, __u32 laddr, __u16 rport, __u32 raddr, int pref)
-{
- struct ip_portfw *npf;
- int prot = portfw_idx(protocol);
-
- if (pref <= 0)
- return EINVAL;
-
- if (ip_portfw_edit(protocol, lport, laddr, rport, raddr, pref)) {
- /*
- * Edit ok ...
- */
- return 0;
- }
-
- /* may block ... */
- npf = (struct ip_portfw*) kmalloc(sizeof(struct ip_portfw), GFP_KERNEL);
-
- if (!npf)
- return ENOMEM;
-
- MOD_INC_USE_COUNT;
- memset(npf, 0, sizeof(*npf));
-
- npf->laddr = laddr;
- npf->lport = lport;
- npf->rport = rport;
- npf->raddr = raddr;
- npf->pref = pref;
-
- atomic_set(&npf->pref_cnt, npf->pref);
- INIT_LIST_HEAD(&npf->list);
-
- write_lock_bh(&portfw_lock);
-
- /*
- * Add at head
- */
- list_add(&npf->list, &portfw_list[prot]);
-
- write_unlock_bh(&portfw_lock);
-
- ip_masq_mod_inc_nent(mmod_self);
- return 0;
-}
-
-
-
-static __inline__ int portfw_ctl(int optname, struct ip_masq_ctl *mctl, int optlen)
-{
- struct ip_portfw_user *mm = &mctl->u.portfw_user;
- int ret = EINVAL;
- int arglen = optlen - IP_MASQ_CTL_BSIZE;
- int cmd;
-
-
-	IP_MASQ_DEBUG(1-debug, "ip_masq_portfw_ctl(len=%d/%d|%d/%d)\n",
- arglen,
- sizeof (*mm),
- optlen,
- sizeof (*mctl));
-
- /*
- * Yes, I'm a bad guy ...
- */
- if (arglen != sizeof(*mm) && optlen != sizeof(*mctl))
- return EINVAL;
-
- /*
- * Don't trust the lusers - plenty of error checking!
- */
- cmd = mctl->m_cmd;
- IP_MASQ_DEBUG(1-debug, "ip_masq_portfw_ctl(cmd=%d)\n", cmd);
-
-
- switch (cmd) {
- case IP_MASQ_CMD_NONE:
- return 0;
- case IP_MASQ_CMD_FLUSH:
- break;
- default:
-		if (ntohs(mm->lport) < IP_PORTFW_PORT_MIN || ntohs(mm->lport) > IP_PORTFW_PORT_MAX)
- return EINVAL;
-
- if (mm->protocol!=IPPROTO_TCP && mm->protocol!=IPPROTO_UDP)
- return EINVAL;
- }
-
- switch(cmd) {
- case IP_MASQ_CMD_ADD:
- ret = ip_portfw_add(mm->protocol,
- mm->lport, mm->laddr,
- mm->rport, mm->raddr,
- mm->pref);
- break;
-
- case IP_MASQ_CMD_DEL:
- ret = ip_portfw_del(mm->protocol,
- mm->lport, mm->laddr,
- mm->rport, mm->raddr);
- break;
- case IP_MASQ_CMD_FLUSH:
- ip_portfw_flush();
- ret = 0;
- break;
- }
-
-
- return ret;
-}
-
-
-
-
-#ifdef CONFIG_PROC_FS
-
-static int portfw_procinfo(char *buffer, char **start, off_t offset,
- int length, int unused)
-{
- off_t pos=0, begin;
- struct ip_portfw *pf;
- struct list_head *l, *e;
- char temp[65];
- int ind;
- int len=0;
-
-
- if (offset < 64)
- {
- sprintf(temp, "Prot LAddr LPort > RAddr RPort PrCnt Pref");
- len = sprintf(buffer, "%-63s\n", temp);
- }
- pos = 64;
-
- read_lock_bh(&portfw_lock);
-
- for(ind = 0; ind < 2; ind++)
- {
- l = &portfw_list[ind];
- for (e=l->next; e!=l; e=e->next)
- {
- pf = list_entry(e, struct ip_portfw, list);
- pos += 64;
- if (pos <= offset) {
- len = 0;
- continue;
- }
-
- sprintf(temp,"%s %08lX %5u > %08lX %5u %5d %5d",
- ind ? "TCP" : "UDP",
- ntohl(pf->laddr), ntohs(pf->lport),
- ntohl(pf->raddr), ntohs(pf->rport),
- atomic_read(&pf->pref_cnt), pf->pref);
- len += sprintf(buffer+len, "%-63s\n", temp);
-
- if (len >= length)
- goto done;
- }
- }
-done:
- read_unlock_bh(&portfw_lock);
-
- begin = len - (pos - offset);
- *start = buffer + begin;
- len -= begin;
- if(len>length)
- len = length;
- return len;
-}
-
-static struct proc_dir_entry portfw_proc_entry = {
-/* 0, 0, NULL, */
- 0, 6, "portfw", /* Just for compatibility, for now ... */
- S_IFREG | S_IRUGO, 1, 0, 0,
- 0, &proc_net_inode_operations,
- portfw_procinfo
-};
-
-#define proc_ent &portfw_proc_entry
-#else /* !CONFIG_PROC_FS */
-
-#define proc_ent NULL
-#endif
-
-static int portfw_in_rule(const struct sk_buff *skb, const struct iphdr *iph)
-{
- const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
-#ifdef CONFIG_IP_MASQ_DEBUG
- struct rtable *rt = (struct rtable *)skb->dst;
-#endif
- struct ip_portfw *pfw;
-
- IP_MASQ_DEBUG(2, "portfw_in_rule(): skb:= dev=%s (index=%d), rt_iif=%d, rt_flags=0x%x rt_dev___=%s daddr=%d.%d.%d.%d dport=%d\n",
- skb->dev->name, skb->dev->ifindex, rt->rt_iif, rt->rt_flags,
- rt->u.dst.dev->name,
- NIPQUAD(iph->daddr), ntohs(portp[1]));
-
- read_lock(&portfw_lock);
- pfw = ip_portfw_lookup(iph->protocol, portp[1], iph->daddr, NULL, NULL);
- read_unlock(&portfw_lock);
-	return (pfw != NULL);
-}
-
-static struct ip_masq * portfw_in_create(const struct sk_buff *skb, const struct iphdr *iph, __u32 maddr)
-{
- /*
- * If no entry exists in the masquerading table
- * and the port is involved
- * in port forwarding, create a new masq entry
- */
-
- __u32 raddr;
- __u16 rport;
- const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
- struct ip_masq *ms = NULL;
- struct ip_portfw *pf;
-
- /*
- * Lock for writing.
- */
- write_lock(&portfw_lock);
-
- if ((pf=ip_portfw_lookup(iph->protocol,
- portp[1], iph->daddr,
- &raddr, &rport))) {
- ms = ip_masq_new(iph->protocol,
- iph->daddr, portp[1],
- raddr, rport,
- iph->saddr, portp[0],
- 0);
-
-		/* ip_masq_new() may fail; check before using the entry */
-		if (ms == NULL)
-			goto out;
-
-		ip_masq_listen(ms);
-
-		if (atomic_read(&mmod_self->mmod_nent) <= 1
-			/* || ip_masq_nlocks(&portfw_lock) != 1 */ )
-			/*
-			 * Maybe later...
-			 */
-			goto out;
-
- /*
- * Entry created, lock==1.
- * if pref_cnt == 0, move
- * entry at _tail_.
- * This is a simple load balance scheduling
- */
-
- if (atomic_dec_and_test(&pf->pref_cnt)) {
-
- atomic_set(&pf->pref_cnt, pf->pref);
- list_del(&pf->list);
- list_add(&pf->list,
- portfw_list[portfw_idx(iph->protocol)].prev);
-
- }
- }
-out:
- write_unlock(&portfw_lock);
- return ms;
-}
-
-#define portfw_in_update NULL
-#define portfw_out_rule NULL
-#define portfw_out_create NULL
-#define portfw_out_update NULL
-
-static struct ip_masq_mod portfw_mod = {
- NULL, /* next */
- NULL, /* next_reg */
- "portfw", /* name */
- ATOMIC_INIT(0), /* nent */
- ATOMIC_INIT(0), /* refcnt */
- proc_ent,
- portfw_ctl,
- NULL, /* masq_mod_init */
- NULL, /* masq_mod_done */
- portfw_in_rule,
- portfw_in_update,
- portfw_in_create,
- portfw_out_rule,
- portfw_out_update,
- portfw_out_create,
-};
-
-
-
-__initfunc(int ip_portfw_init(void))
-{
- INIT_LIST_HEAD(&portfw_list[0]);
- INIT_LIST_HEAD(&portfw_list[1]);
- return register_ip_masq_mod ((mmod_self=&portfw_mod));
-}
-
-int ip_portfw_done(void)
-{
- return unregister_ip_masq_mod(&portfw_mod);
-}
-
-#ifdef MODULE
-EXPORT_NO_SYMBOLS;
-
-int init_module(void)
-{
- if (ip_portfw_init() != 0)
- return -EIO;
- return 0;
-}
-
-void cleanup_module(void)
-{
-	if (ip_portfw_done() != 0)
-		printk(KERN_INFO "ip_portfw_done(): can't remove module\n");
-}
-
-#endif /* MODULE */
diff --git a/net/ipv4/ip_masq_quake.c b/net/ipv4/ip_masq_quake.c
deleted file mode 100644
index 17b11a799..000000000
--- a/net/ipv4/ip_masq_quake.c
+++ /dev/null
@@ -1,324 +0,0 @@
-/*
- * IP_MASQ_QUAKE quake masquerading module
- *
- *
- * Version: @(#)ip_masq_quake.c 0.02 22/02/97
- *
- * Author: Harald Hoyer mailto:HarryH@Royal.Net
- *
- *
- * Fixes:
- * Harald Hoyer : Unofficial Quake Specs found at
- * http://www.gamers.org/dEngine/quake/spec/
- * Harald Hoyer : Check for QUAKE-STRING
- * Juan Jose Ciarlante : litl bits for 2.1
- * Horst von Brand : Add #include <linux/string.h>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- *
- */
-
-#include <linux/module.h>
-#include <asm/system.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/init.h>
-#include <net/protocol.h>
-#include <net/udp.h>
-#include <net/ip_masq.h>
-
-#define DEBUG_CONFIG_IP_MASQ_QUAKE 0
-
-typedef struct
-{
- __u16 type; // (Little Endian) Type of message.
- __u16 length; // (Little Endian) Length of message, header included.
- char message[0]; // The contents of the message.
-} QUAKEHEADER;
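-
-/*
- * Sketch (illustrative): the handlers below only act on packets whose
- * header reads type == 0x0080 and whose first message byte is a
- * control code:
- *
- *	code 0x01 + "QUAKE\0\3"		connection request
- *	code 0x81 + <16-bit port>	accept connection
- */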
-
-struct quake_priv_data {
- /* Have we seen a client connect message */
- signed char cl_connect;
-};
-
-static int
-masq_quake_init_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
-{
- MOD_INC_USE_COUNT;
- if ((ms->app_data = kmalloc(sizeof(struct quake_priv_data),
- GFP_ATOMIC)) == NULL)
- printk(KERN_INFO "Quake: No memory for application data\n");
- else
- {
- struct quake_priv_data *priv =
- (struct quake_priv_data *)ms->app_data;
- priv->cl_connect = 0;
- }
- return 0;
-}
-
-static int
-masq_quake_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
-{
- MOD_DEC_USE_COUNT;
- if (ms->app_data)
- kfree_s(ms->app_data, sizeof(struct quake_priv_data));
- return 0;
-}
-
-int
-masq_quake_in (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr)
-{
- struct sk_buff *skb;
- struct iphdr *iph;
- struct udphdr *uh;
- QUAKEHEADER *qh;
- __u16 udp_port;
- char *data;
- unsigned char code;
- struct quake_priv_data *priv = (struct quake_priv_data *)ms->app_data;
-
- if(priv->cl_connect == -1)
- return 0;
-
- skb = *skb_p;
-
- iph = skb->nh.iph;
- uh = (struct udphdr *)&(((char *)iph)[iph->ihl*4]);
-
-	/* Check for length */
- if(ntohs(uh->len) < 5)
- return 0;
-
- qh = (QUAKEHEADER *)&uh[1];
-
- if(qh->type != 0x0080)
- return 0;
-
-
- code = qh->message[0];
-
-#if DEBUG_CONFIG_IP_MASQ_QUAKE
- printk("Quake_in: code = %d \n", (int)code);
-#endif
-
- switch(code) {
- case 0x01:
- /* Connection Request */
-
- if(ntohs(qh->length) < 0x0c) {
-#if DEBUG_CONFIG_IP_MASQ_QUAKE
- printk("Quake_in: length < 0xc \n");
-#endif
- return 0;
- }
-
- data = &qh->message[1];
-
- /* Check for stomping string */
- if(memcmp(data,"QUAKE\0\3",7)) {
-#if DEBUG_CONFIG_IP_MASQ_QUAKE
-			printk("Quake_in: memcmp failed\n");
-#endif
- return 0;
- }
- else {
- priv->cl_connect = 1;
-#if DEBUG_CONFIG_IP_MASQ_QUAKE
-			printk("Quake_in: memcmp ok\n");
-#endif
- }
- break;
-
- case 0x81:
- /* Accept Connection */
- if((ntohs(qh->length) < 0x09) || (priv->cl_connect == 0))
- return 0;
- data = &qh->message[1];
-
- memcpy(&udp_port, data, 2);
-
- ms->dport = htons(udp_port);
-
-#if DEBUG_CONFIG_IP_MASQ_QUAKE
- printk("Quake_in: in_rewrote UDP port %d \n", udp_port);
-#endif
- priv->cl_connect = -1;
-
- break;
- }
-
- return 0;
-}
-
-int
-masq_quake_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr)
-{
- struct sk_buff *skb;
- struct iphdr *iph;
- struct udphdr *uh;
- QUAKEHEADER *qh;
- __u16 udp_port;
- char *data;
- unsigned char code;
- struct ip_masq *n_ms;
- struct quake_priv_data *priv = (struct quake_priv_data *)ms->app_data;
-
- if(priv->cl_connect == -1)
- return 0;
-
- skb = *skb_p;
-
- iph = skb->nh.iph;
- uh = (struct udphdr *)&(((char *)iph)[iph->ihl*4]);
-
-	/* Check for length */
- if(ntohs(uh->len) < 5)
- return 0;
-
- qh = (QUAKEHEADER *)&uh[1];
-
-#if DEBUG_CONFIG_IP_MASQ_QUAKE
- printk("Quake_out: qh->type = %d \n", (int)qh->type);
-#endif
-
- if(qh->type != 0x0080)
- return 0;
-
- code = qh->message[0];
-
-#if DEBUG_CONFIG_IP_MASQ_QUAKE
- printk("Quake_out: code = %d \n", (int)code);
-#endif
-
- switch(code) {
- case 0x01:
- /* Connection Request */
-
- if(ntohs(qh->length) < 0x0c) {
-#if DEBUG_CONFIG_IP_MASQ_QUAKE
- printk("Quake_out: length < 0xc \n");
-#endif
- return 0;
- }
-
- data = &qh->message[1];
-
- /* Check for stomping string */
- if(memcmp(data,"QUAKE\0\3",7)) {
-#if DEBUG_CONFIG_IP_MASQ_QUAKE
- printk("Quake_out: memcmp failed \n");
-#endif
- return 0;
- }
- else {
- priv->cl_connect = 1;
-#if DEBUG_CONFIG_IP_MASQ_QUAKE
- printk("Quake_out: memcmp ok \n");
-#endif
- }
- break;
-
- case 0x81:
- /* Accept Connection */
- if((ntohs(qh->length) < 0x09) || (priv->cl_connect == 0))
- return 0;
-
- data = &qh->message[1];
-
- memcpy(&udp_port, data, 2);
-
- n_ms = ip_masq_new(IPPROTO_UDP,
- maddr, 0,
- ms->saddr, htons(udp_port),
- ms->daddr, ms->dport,
- 0);
-
- if (n_ms==NULL)
- return 0;
-
-#if DEBUG_CONFIG_IP_MASQ_QUAKE
- printk("Quake_out: out_rewrote UDP port %d -> %d\n",
- udp_port, ntohs(n_ms->mport));
-#endif
- udp_port = ntohs(n_ms->mport);
- memcpy(data, &udp_port, 2);
-
- ip_masq_listen(n_ms);
- ip_masq_control_add(n_ms, ms);
- ip_masq_put(n_ms);
-
- break;
- }
-
- return 0;
-}
-
-struct ip_masq_app ip_masq_quake = {
- NULL, /* next */
- "Quake_26", /* name */
- 0, /* type */
- 0, /* n_attach */
- masq_quake_init_1, /* ip_masq_init_1 */
- masq_quake_done_1, /* ip_masq_done_1 */
- masq_quake_out, /* pkt_out */
- masq_quake_in /* pkt_in */
-};
-struct ip_masq_app ip_masq_quakenew = {
- NULL, /* next */
- "Quake_27", /* name */
- 0, /* type */
- 0, /* n_attach */
- masq_quake_init_1, /* ip_masq_init_1 */
- masq_quake_done_1, /* ip_masq_done_1 */
- masq_quake_out, /* pkt_out */
- masq_quake_in /* pkt_in */
-};
-
-/*
- * ip_masq_quake initialization
- */
-
-__initfunc(int ip_masq_quake_init(void))
-{
- return (register_ip_masq_app(&ip_masq_quake, IPPROTO_UDP, 26000) +
- register_ip_masq_app(&ip_masq_quakenew, IPPROTO_UDP, 27000));
-}
-
-/*
- * ip_masq_quake fin.
- */
-
-int ip_masq_quake_done(void)
-{
- return (unregister_ip_masq_app(&ip_masq_quake) +
- unregister_ip_masq_app(&ip_masq_quakenew));
-}
-
-#ifdef MODULE
-EXPORT_NO_SYMBOLS;
-
-int init_module(void)
-{
- if (ip_masq_quake_init() != 0)
- return -EIO;
- return 0;
-}
-
-void cleanup_module(void)
-{
-	if (ip_masq_quake_done() != 0)
-		printk(KERN_INFO "ip_masq_quake: can't remove module\n");
-}
-
-#endif /* MODULE */
-
-
diff --git a/net/ipv4/ip_masq_raudio.c b/net/ipv4/ip_masq_raudio.c
deleted file mode 100644
index ee3e276b9..000000000
--- a/net/ipv4/ip_masq_raudio.c
+++ /dev/null
@@ -1,578 +0,0 @@
-/*
- * IP_MASQ_RAUDIO - Real Audio masquerading module
- *
- *
- * Version: @(#)$Id: ip_masq_raudio.c,v 1.11 1998/10/06 04:49:04 davem Exp $
- *
- * Author: Nigel Metheringham
- * Real Time Streaming code by Progressive Networks
- * [strongly based on ftp module by Juan Jose Ciarlante & Wouter Gadeyne]
- * [Real Audio information taken from Progressive Networks firewall docs]
- * [Kudos to Progressive Networks for making the protocol specs available]
- *
- *
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- *
- * Limitations
- * The IP Masquerading proxies at present do not have access to a processed
- * data stream. Hence for a protocol like the Real Audio control protocol,
- * which depends on knowing where you are in the data stream, you
- * either have to keep a *lot* of state in your proxy, or you cheat
- * and simplify the problem [needless to say I did the latter].
- *
- * This proxy only handles data in the first packet. Everything else is
- * passed transparently. This means it should work under all normal
- * circumstances, but it could be fooled by new data formats or a
- * malicious application!
- *
- * At present the "first packet" is defined as a packet starting with
- * the protocol ID string - "PNA".
- * When the link is up there appears to be enough control data
- * crossing the control link to keep it open even if a long audio
- * piece is playing.
- *
- * The Robust UDP support added in RealAudio 3.0 is supported but,
- * since servers/clients make little use of it, this has not been
- * greatly tested. RealVideo (as used in the Real client version 4.0beta1) is
- * supported but again is not greatly tested (bandwidth requirements
- * appear to exceed that available at the sites supporting the protocol).
- *
- * Multiple Port Support
- * The helper can be made to handle up to MAX_MASQ_APP_PORTS (normally 12)
- * with the port numbers being defined at module load time. The module
- * uses the symbol "ports" to define a list of monitored ports, which can
- * be specified on the insmod command line as
- * ports=x1,x2,x3...
- * where x[n] are integer port numbers. This option can be put into
- * /etc/conf.modules (or /etc/modules.conf depending on your config)
- * where modload will pick it up should you use modload to load your
- * modules.
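- *
- * For example (illustrative), the stock build watches 554 (RTSP) and
- * 7070 (PNA); an equivalent explicit load would be
- *	insmod ip_masq_raudio.o ports=554,7070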
- *
- * Fixes:
- * Juan Jose Ciarlante : Use control_add() for control chan
- * 10/15/97 - Modifications to allow masquerading of RTSP connections as
- * well as PNA, which can potentially exist on the same port.
- * Joe Rumsey <ogre@real.com>
- *
- */
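-
-/*
- * Protocol sketch (illustrative): the first client packet starts
- *
- *	"PNA" <u16 version> [ <u16 msg_id> <u16 msg_len> <data> ] ...
- *
- * and only the client UDP ports carried in messages 1 (audio data)
- * and 7 (robust UDP error correction) are rewritten.
- */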
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <asm/system.h>
-#include <linux/types.h>
-#include <linux/ctype.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/init.h>
-#include <net/protocol.h>
-#include <net/tcp.h>
-#include <net/ip_masq.h>
-
-/*
-#ifndef DEBUG_CONFIG_IP_MASQ_RAUDIO
-#define DEBUG_CONFIG_IP_MASQ_RAUDIO 0
-#endif
-*/
-
-#define TOLOWER(c) (((c) >= 'A' && (c) <= 'Z') ? ((c) - 'A' + 'a') : (c))
-#define ISDIGIT(c) (((c) >= '0') && ((c) <= '9'))
-
-struct raudio_priv_data {
- /* Associated data connection - setup but not used at present */
- struct ip_masq *data_conn;
- /* UDP Error correction connection - setup but not used at present */
- struct ip_masq *error_conn;
- /* Have we seen and performed setup */
- short seen_start;
- short is_rtsp;
-};
-
-int
-masq_rtsp_out (struct ip_masq_app *mapp,
- struct ip_masq *ms,
- struct sk_buff **skb_p,
- __u32 maddr);
-
-/*
- * List of ports (up to MAX_MASQ_APP_PORTS) to be handled by helper
- * First port is set to the default port.
- */
-int ports[MAX_MASQ_APP_PORTS] = {554, 7070, 0}; /* I rely on the trailing items being set to zero */
-struct ip_masq_app *masq_incarnations[MAX_MASQ_APP_PORTS];
-
-/*
- * Debug level
- */
-#ifdef CONFIG_IP_MASQ_DEBUG
-static int debug=0;
-MODULE_PARM(debug, "i");
-#endif
-
-MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i");
-
-
-static int
-masq_raudio_init_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
-{
- MOD_INC_USE_COUNT;
- if ((ms->app_data = kmalloc(sizeof(struct raudio_priv_data),
- GFP_ATOMIC)) == NULL)
- printk(KERN_INFO "RealAudio: No memory for application data\n");
- else
- {
- struct raudio_priv_data *priv =
- (struct raudio_priv_data *)ms->app_data;
- priv->seen_start = 0;
- priv->data_conn = NULL;
- priv->error_conn = NULL;
- priv->is_rtsp = 0;
- }
- return 0;
-}
-
-static int
-masq_raudio_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
-{
- MOD_DEC_USE_COUNT;
- if (ms->app_data)
- kfree_s(ms->app_data, sizeof(struct raudio_priv_data));
- return 0;
-}
-
-int
-masq_raudio_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr)
-{
- struct sk_buff *skb;
- struct iphdr *iph;
- struct tcphdr *th;
- char *p, *data, *data_limit;
- struct ip_masq *n_ms;
- unsigned short version, msg_id, msg_len, udp_port;
- struct raudio_priv_data *priv =
- (struct raudio_priv_data *)ms->app_data;
-
- /* Everything running correctly already */
- if (priv && priv->seen_start)
- return 0;
-
- if(priv && priv->is_rtsp)
- return masq_rtsp_out(mapp, ms, skb_p, maddr);
-
- skb = *skb_p;
- iph = skb->nh.iph;
- th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
- data = (char *)&th[1];
-
- data_limit = skb->h.raw + skb->len;
-
- if(memcmp(data, "OPTIONS", 7) == 0 ||
- memcmp(data, "DESCRIBE", 8) == 0)
- {
- IP_MASQ_DEBUG(1-debug, "RealAudio: Detected RTSP connection\n");
- /* This is an RTSP client */
- if(priv)
- priv->is_rtsp = 1;
- return masq_rtsp_out(mapp, ms, skb_p, maddr);
- }
-
- /* Check to see if this is the first packet with protocol ID */
- if (memcmp(data, "PNA", 3)) {
- IP_MASQ_DEBUG(1-debug, "RealAudio: not initial protocol packet - ignored\n");
- return(0);
- }
- data += 3;
- memcpy(&version, data, 2);
-
- IP_MASQ_DEBUG(1-debug, "RealAudio: initial seen - protocol version %d\n",
- ntohs(version));
- if (priv)
- priv->seen_start = 1;
-
- if (ntohs(version) >= 256)
- {
- printk(KERN_INFO "RealAudio: version (%d) not supported\n",
- ntohs(version));
- return 0;
- }
-
- data += 2;
- while (data+4 < data_limit) {
- memcpy(&msg_id, data, 2);
- data += 2;
- memcpy(&msg_len, data, 2);
- data += 2;
- if (ntohs(msg_id) == 0) {
- /* The zero tag indicates the end of options */
- IP_MASQ_DEBUG(1-debug, "RealAudio: packet end tag seen\n");
- return 0;
- }
- IP_MASQ_DEBUG(1-debug, "RealAudio: msg %d - %d byte\n",
- ntohs(msg_id), ntohs(msg_len));
- p = data;
- data += ntohs(msg_len);
- if (data > data_limit)
- {
- printk(KERN_INFO "RealAudio: Packet too short for data\n");
- return 0;
- }
- if ((ntohs(msg_id) == 1) || (ntohs(msg_id) == 7)) {
- /*
- * MsgId == 1
- * Audio UDP data port on client
- *
- * MsgId == 7
- * Robust UDP error correction port number on client
- *
- * Since these messages are treated just the same, they
- * are bundled together here....
- */
- memcpy(&udp_port, p, 2);
-
- /*
- * Sometimes a server sends a message 7 with a zero UDP port
- * Rather than do anything with this, just ignore it!
- */
- if (udp_port == 0)
- continue;
-
-
- n_ms = ip_masq_new(IPPROTO_UDP,
- maddr, 0,
- ms->saddr, udp_port,
- ms->daddr, 0,
- IP_MASQ_F_NO_DPORT);
-
- if (n_ms==NULL)
- return 0;
-
- ip_masq_listen(n_ms);
- ip_masq_control_add(n_ms, ms);
-
- memcpy(p, &(n_ms->mport), 2);
- IP_MASQ_DEBUG(1-debug, "RealAudio: rewrote UDP port %d -> %d in msg %d\n",
- ntohs(udp_port), ntohs(n_ms->mport), ntohs(msg_id));
-
- /* Make ref in application data to data connection */
- if (priv) {
- if (ntohs(msg_id) == 1)
- priv->data_conn = n_ms;
- else
- priv->error_conn = n_ms;
- }
-
- ip_masq_put(n_ms);
- }
- }
- return 0;
-}
-
-/*
- * masq_rtsp_out
- *
- *
- */
-int
-masq_rtsp_out (struct ip_masq_app *mapp,
- struct ip_masq *ms,
- struct sk_buff **skb_p,
- __u32 maddr)
-{
- struct sk_buff *skb;
- struct iphdr *iph;
- struct tcphdr *th;
- char *data, *data_limit;
- struct ip_masq *n_ms, *n_ms2;
- unsigned short udp_port;
- struct raudio_priv_data *priv =
- (struct raudio_priv_data *)ms->app_data;
- const char* srch = "transport:";
- const char* srchpos = srch;
- const char* srchend = srch + strlen(srch);
- int state = 0;
- char firstport[6];
- int firstportpos = 0;
- char secondport[6];
- int secondportpos = 0;
- char *portstart = NULL, *portend = NULL;
- int diff;
-
- /* Everything running correctly already */
- if (priv && priv->seen_start)
- return 0;
-
- skb = *skb_p;
- iph = skb->nh.iph;
- th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
- data = (char *)&th[1];
-
- data_limit = skb->h.raw + skb->len;
-
- firstport[0] = 0;
- secondport[0] = 0;
-
- while(data < data_limit && state >= 0)
- {
- switch(state)
- {
- case 0:
- case 1:
- if(TOLOWER(*data) == *srchpos)
- {
- srchpos++;
- if(srchpos == srchend)
- {
- IP_MASQ_DEBUG(1-debug, "Found string %s in message\n",
- srch);
- state++;
- if(state == 1)
- {
- srch = "client_port";
- srchpos = srch;
- srchend = srch + strlen(srch);
- }
- }
- }
- else
- {
- srchpos = srch;
- }
- break;
- case 2:
- if(*data == '=')
- state = 3;
- break;
- case 3:
- if(ISDIGIT(*data))
- {
- portstart = data;
- firstportpos = 0;
- firstport[firstportpos++] = *data;
- state = 4;
- }
- break;
- case 4:
- if(*data == '-')
- {
- state = 5;
- }
- else if(*data == ';')
- {
- portend = data - 1;
- firstport[firstportpos] = 0;
- state = -1;
- }
- else if(ISDIGIT(*data))
- {
- firstport[firstportpos++] = *data;
- }
- else if(*data != ' ' && *data != '\t')
- {
- /* This is a badly formed RTSP message, let's bail out */
- IP_MASQ_DEBUG(1-debug, "Badly formed RTSP Message\n");
- return 0;
- }
- break;
- case 5:
- if(ISDIGIT(*data))
- {
- secondportpos = 0;
- secondport[secondportpos++] = *data;
- state = 6;
- }
- else if(*data == ';')
- {
- portend = data - 1;
- secondport[secondportpos] = 0;
- state = -1;
- }
- break;
- case 6:
- if(*data == ';')
- {
- portend = data - 1;
- secondport[secondportpos] = 0;
- state = -1;
- }
- else if(ISDIGIT(*data))
- {
- secondport[secondportpos++] = *data;
- }
- else if(*data != ' ' && *data != '\t')
- {
- /* This is a badly formed RTSP message, let's bail out */
- IP_MASQ_DEBUG(1-debug, "Badly formed RTSP Message\n");
- return 0;
- }
- break;
- }
- data++;
- }
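-
-	/*
-	 * Illustrative walk-through: for a request carrying
-	 *	Transport: rtp/udp;client_port=5000-5001;mode=play
-	 * the scanner above ends with firstport="5000",
-	 * secondport="5001" and state == -1, so both client ports are
-	 * remapped below.
-	 */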
-
- if(state >= 0)
- return 0;
-
- if(firstportpos > 0)
- {
- char newbuf[12]; /* xxxxx-xxxxx\0 */
- char* tmpptr;
-
- udp_port = htons(simple_strtoul(firstport, &tmpptr, 10));
- n_ms = ip_masq_new(IPPROTO_UDP,
- maddr, 0,
- ms->saddr, udp_port,
- ms->daddr, 0,
- IP_MASQ_F_NO_DPORT);
- if (n_ms==NULL)
- return 0;
-
- ip_masq_listen(n_ms);
- ip_masq_control_add(n_ms, ms);
-
- if(secondportpos > 0)
- {
- udp_port = htons(simple_strtoul(secondport, &tmpptr, 10));
- n_ms2 = ip_masq_new(IPPROTO_UDP,
- maddr, 0,
- ms->saddr, udp_port,
- ms->daddr, 0,
- IP_MASQ_F_NO_DPORT);
- if (n_ms2==NULL) {
- ip_masq_put(n_ms);
- return 0;
- }
-
- ip_masq_listen(n_ms2);
- ip_masq_control_add(n_ms2, ms);
- sprintf(newbuf, "%d-%d", ntohs(n_ms->mport),
- ntohs(n_ms2->mport));
- }
- else
- {
- sprintf(newbuf, "%d", ntohs(n_ms->mport));
- n_ms2 = NULL;
- }
- *skb_p = ip_masq_skb_replace(skb, GFP_ATOMIC,
- portstart, portend - portstart + 1,
- newbuf, strlen(newbuf));
- IP_MASQ_DEBUG(1-debug, "RTSP: rewrote client_port to %s\n", newbuf);
-		diff = strlen(newbuf) - (portend - portstart + 1);
- }
- else
- {
- return 0;
- }
-
- if(priv)
- {
- priv->seen_start = 1;
- if(n_ms)
- priv->data_conn = n_ms;
- if(n_ms2)
- priv->error_conn = n_ms2;
- }
- /*
- * Release tunnels
- */
-
- if (n_ms)
- ip_masq_put(n_ms);
-
- if (n_ms2)
- ip_masq_put(n_ms2);
-
- return diff;
-}
-
-struct ip_masq_app ip_masq_raudio = {
- NULL, /* next */
- "RealAudio", /* name */
- 0, /* type */
- 0, /* n_attach */
- masq_raudio_init_1, /* ip_masq_init_1 */
- masq_raudio_done_1, /* ip_masq_done_1 */
- masq_raudio_out, /* pkt_out */
- NULL /* pkt_in */
-};
-
-/*
- * ip_masq_raudio initialization
- */
-
-__initfunc(int ip_masq_raudio_init(void))
-{
- int i, j;
-
- for (i=0; (i<MAX_MASQ_APP_PORTS); i++) {
- if (ports[i]) {
- if ((masq_incarnations[i] = kmalloc(sizeof(struct ip_masq_app),
- GFP_KERNEL)) == NULL)
- return -ENOMEM;
- memcpy(masq_incarnations[i], &ip_masq_raudio, sizeof(struct ip_masq_app));
- if ((j = register_ip_masq_app(masq_incarnations[i],
- IPPROTO_TCP,
- ports[i]))) {
- return j;
- }
- IP_MASQ_DEBUG(1-debug, "RealAudio: loaded support on port[%d] = %d\n",
- i, ports[i]);
- } else {
- /* To be safe, force the incarnation table entry to NULL */
- masq_incarnations[i] = NULL;
- }
- }
- return 0;
-}
-
-/*
- * ip_masq_raudio fin.
- */
-
-int ip_masq_raudio_done(void)
-{
- int i, j, k;
-
- k=0;
- for (i=0; (i<MAX_MASQ_APP_PORTS); i++) {
- if (masq_incarnations[i]) {
- if ((j = unregister_ip_masq_app(masq_incarnations[i]))) {
- k = j;
- } else {
- kfree(masq_incarnations[i]);
- masq_incarnations[i] = NULL;
- IP_MASQ_DEBUG(1-debug, "RealAudio: unloaded support on port[%d] = %d\n",
- i, ports[i]);
- }
- }
- }
- return k;
-}
-
-#ifdef MODULE
-EXPORT_NO_SYMBOLS;
-
-int init_module(void)
-{
- if (ip_masq_raudio_init() != 0)
- return -EIO;
- return 0;
-}
-
-void cleanup_module(void)
-{
- if (ip_masq_raudio_done() != 0)
- printk(KERN_INFO "ip_masq_raudio: can't remove module");
-}
-
-#endif /* MODULE */
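
The state machine deleted above scans the RTSP payload for "transport:", then "client_port", then collects one or two decimal port strings separated by '-'. A minimal user-space sketch of the same parse, with hypothetical helper names and a NUL-terminated buffer for simplicity:

    #include <ctype.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Extract "client_port=NNNN[-MMMM]" from an RTSP SETUP line.
     * Returns the number of ports found (0, 1 or 2). */
    static int parse_client_port(const char *buf, unsigned *p1, unsigned *p2)
    {
        const char *p = strstr(buf, "client_port");
        char *end;
        int n = 0;

        if (!p || !(p = strchr(p, '=')))
            return 0;
        p++;
        while (*p == ' ' || *p == '\t')
            p++;
        if (isdigit((unsigned char)*p)) {
            *p1 = (unsigned)strtoul(p, &end, 10);
            p = end;
            n = 1;
        }
        if (n == 1 && *p == '-' && isdigit((unsigned char)p[1])) {
            *p2 = (unsigned)strtoul(p + 1, &end, 10);
            n = 2;
        }
        return n;
    }

    int main(void)
    {
        unsigned a = 0, b = 0;
        int n = parse_client_port(
            "Transport: rtp/udp;client_port=6970-6971;mode=play", &a, &b);

        printf("%d port(s): %u %u\n", n, a, b);  /* 2 port(s): 6970 6971 */
        return 0;
    }

The kernel version walks the buffer byte by byte with explicit states instead, because the TCP payload is not NUL-terminated and may end mid-token; strstr() here stands in purely for illustration.
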
diff --git a/net/ipv4/ip_masq_user.c b/net/ipv4/ip_masq_user.c
deleted file mode 100644
index 9264301ae..000000000
--- a/net/ipv4/ip_masq_user.c
+++ /dev/null
@@ -1,467 +0,0 @@
-/*
- * IP_MASQ_USER user space control module
- *
- *
- * $Id: ip_masq_user.c,v 1.1 1998/08/29 23:51:08 davem Exp $
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/skbuff.h>
-#include <asm/system.h>
-#include <linux/stat.h>
-#include <linux/proc_fs.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/inet.h>
-#include <linux/init.h>
-#include <net/protocol.h>
-#include <net/icmp.h>
-#include <net/tcp.h>
-#include <net/udp.h>
-#include <net/checksum.h>
-#include <net/ip_masq.h>
-#include <net/ip_masq_mod.h>
-#include <linux/sysctl.h>
-#include <linux/ip_fw.h>
-
-#include <linux/ip_masq.h>
-
-/*
- * Debug level
- */
-static int debug=0;
-
-MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i");
-MODULE_PARM(debug, "i");
-
-/*
-static int check_5uple (struct ip_masq_user *ums) {
- return 0;
-}
-*/
-static void masq_user_k2u(const struct ip_masq *ms, struct ip_masq_user *ums)
-{
- ums->protocol = ms->protocol;
- ums->daddr = ms->daddr;
- ums->dport = ms->dport;
- ums->maddr = ms->maddr;
- ums->mport = ms->mport;
- ums->saddr = ms->saddr;
- ums->sport = ms->sport;
- ums->timeout = ms->timeout;
-}
-
-
-static int ip_masq_user_maddr(struct ip_masq_user *ums)
-{
- struct device *dev;
- struct rtable *rt;
- int ret = -EINVAL;
- u32 rt_daddr, rt_saddr;
- u32 tos;
-
- /*
- * Did specify masq address.
- */
- if (ums->maddr)
- return 0;
-
- /*
- * Select address to use for routing query
- */
-
- rt_daddr = ums->rt_daddr? ums->rt_daddr : ums->daddr;
- rt_saddr = ums->rt_saddr? ums->rt_saddr : ums->saddr;
-
-
- /*
- * No address for routing, cannot continue
- */
- if (rt_daddr == 0) {
- IP_MASQ_DEBUG(1-debug, "cannot setup maddr with daddr=%lX, rt_addr=%lX\n",
- ntohl(ums->daddr), ntohl(ums->rt_daddr));
- return -EINVAL;
- }
-
- /*
- * Find out rt device
- */
-
- rt_saddr = 0;
- tos = RT_TOS(ums->ip_tos) | RTO_CONN;
-
- if ((ret=ip_route_output(&rt, rt_daddr, rt_saddr, tos, 0 /* dev */))) {
- IP_MASQ_DEBUG(0-debug, "could not setup maddr for routing daddr=%lX, saddr=%lX\n",
- ntohl(rt_daddr), ntohl(rt_saddr));
- return ret;
- }
- dev = rt->u.dst.dev;
- ums->maddr = ip_masq_select_addr(dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
-
- IP_MASQ_DEBUG(1-debug, "did setup maddr=%lX\n", ntohl(ums->maddr));
- ip_rt_put(rt);
- return 0;
-}
-
-/*
- * Create new entry (from uspace)
- */
-static int ip_masq_user_new(struct ip_masq_user *ums)
-{
- struct ip_masq *ms = NULL;
- unsigned mflags = 0;
- int ret;
-
- if (masq_proto_num (ums->protocol) == -1) {
- return EPROTONOSUPPORT;
- }
-
- if (ums->dport == 0) {
- ums->flags |= IP_MASQ_USER_F_LISTEN;
- }
-
-	if (ums->flags & IP_MASQ_USER_F_LISTEN) {
- if ((ums->saddr == 0) || (ums->sport == 0)) {
- return EINVAL;
- }
- mflags |= (IP_MASQ_F_NO_DPORT|IP_MASQ_F_NO_DADDR);
-
- }
-
- if ((ret = ip_masq_user_maddr(ums)) < 0) {
- return -ret;
- }
-
- mflags |= IP_MASQ_F_USER;
- ms = ip_masq_new(ums->protocol,
- ums->maddr, ums->mport,
- ums->saddr, ums->sport,
- ums->daddr, ums->dport,
- mflags);
-
- if (ms == NULL) {
- /*
- * FIXME: ip_masq_new() should return errno
- */
- return EBUSY;
- }
-
- /*
- * Setup timeouts for this new entry
- */
-
- if (ums->timeout) {
- ms->timeout = ums->timeout;
-	} else if (ums->flags & IP_MASQ_USER_F_LISTEN) {
- ip_masq_listen(ms);
- }
-
- masq_user_k2u(ms, ums);
- ip_masq_put(ms);
- return 0;
-}
-
-/*
- * Delete existing entry
- */
-static int ip_masq_user_del(struct ip_masq_user *ums)
-{
- struct ip_masq *ms=NULL;
-
- if (masq_proto_num (ums->protocol) == -1) {
- return EPROTONOSUPPORT;
- }
- start_bh_atomic();
- if (ums->mport && ums->maddr) {
- ms = ip_masq_in_get(ums->protocol,
- ums->daddr, ums->dport,
- ums->maddr, ums->mport);
- end_bh_atomic();
- } else if (ums->sport && ums->saddr) {
- ms = ip_masq_out_get(ums->protocol,
- ums->saddr, ums->sport,
- ums->daddr, ums->dport);
- end_bh_atomic();
- } else
- return EINVAL;
-
- if (ms == NULL) {
- return ESRCH;
- }
-
- /*
-	 * got a (locked) entry; set up a very short timeout :) and
-	 * give it away
- *
- * FIXME: should use something better than S_CLOSE
- */
- ms->timeout = IP_MASQ_S_CLOSE;
-
- masq_user_k2u(ms, ums);
- ip_masq_put(ms);
- return 0;
-}
-
-static struct ip_masq * ip_masq_user_locked_get (struct ip_masq_user *ums, int *err)
-{
- struct ip_masq *ms=NULL;
- if (masq_proto_num (ums->protocol) == -1) {
- *err = EPROTONOSUPPORT;
- }
-
- start_bh_atomic();
- if (ums->mport && ums->maddr) {
- ms = ip_masq_in_get(ums->protocol,
- ums->daddr, ums->dport,
- ums->maddr, ums->mport);
- end_bh_atomic();
- } else if (ums->sport && ums->saddr) {
- ms = ip_masq_out_get(ums->protocol,
- ums->saddr, ums->sport,
- ums->daddr, ums->dport);
- end_bh_atomic();
- } else
- *err = EINVAL;
-
- if (ms == NULL) *err = ESRCH;
- return ms;
-}
-
-/*
- * Get existing entry (complete full tunnel info)
- */
-static int ip_masq_user_get(struct ip_masq_user *ums)
-{
- struct ip_masq *ms=NULL;
- int err;
-
- ms = ip_masq_user_locked_get(ums, &err);
- if (ms == NULL)
- return err;
-
- masq_user_k2u(ms, ums);
-
- ip_masq_put(ms);
- return 0;
-}
-
-/*
- * Set (some, valid) entry parameters
- */
-static int ip_masq_user_set(struct ip_masq_user *ums)
-{
- struct ip_masq *ms = NULL;
- int err;
-
- ms = ip_masq_user_locked_get(ums, &err);
- if (ms == NULL)
- return err;
-
- /*
- * FIXME: must allow selecting what you want to set
- */
- ms->timeout = ums->timeout;
-
- masq_user_k2u(ms, ums);
-
- ip_masq_put(ms);
- return 0;
-}
-
-
-/*
- * Entry point
- * ret value:
- * <0 err
- * ==0 ok
- * >0 ok, copy to user
- */
-static int ip_masq_user_ctl(int optname, struct ip_masq_ctl *mctl, int optlen)
-{
- struct ip_masq_user *ums = &mctl->u.user;
- int ret = EINVAL;
- int arglen = optlen - IP_MASQ_CTL_BSIZE;
- int cmd;
-
- IP_MASQ_DEBUG(1-debug, "ip_masq_user_ctl(len=%d/%d|%d/%d)\n",
- arglen,
- sizeof (*ums),
- optlen,
- sizeof (*mctl));
-
- /*
- * Yes, I'm a bad guy ...
- */
- if (arglen != sizeof(*ums) && optlen != sizeof(*mctl))
- return EINVAL;
-
- MOD_INC_USE_COUNT;
-
- /*
- * Don't trust the lusers - plenty of error checking!
- */
- cmd = mctl->m_cmd;
- IP_MASQ_DEBUG(1-debug, "ip_masq_user_ctl(cmd=%d)\n", cmd);
-
- switch (mctl->m_cmd) {
- case IP_MASQ_CMD_ADD:
- case IP_MASQ_CMD_INSERT:
- ret = ip_masq_user_new(ums);
- break;
- case IP_MASQ_CMD_DEL:
- ret = ip_masq_user_del(ums);
- break;
- case IP_MASQ_CMD_SET:
- ret = ip_masq_user_set(ums);
- break;
- case IP_MASQ_CMD_GET:
- ret = ip_masq_user_get(ums);
- break;
- }
-
- /*
- * For all of the above, return masq tunnel info
- */
-
- ret = -ret;
-
- if (ret == 0) {
- ret = sizeof (*ums) + IP_MASQ_CTL_BSIZE;
- IP_MASQ_DEBUG(1-debug, "will return %d bytes to user\n", ret);
- }
-
- MOD_DEC_USE_COUNT;
- return ret;
-}
-
-
-#ifdef CONFIG_PROC_FS
-static int ip_masq_user_info(char *buffer, char **start, off_t offset,
- int length, int proto)
-{
- off_t pos=0, begin;
- struct ip_masq *ms;
- char temp[129];
- int idx = 0;
- int len=0;
- int magic_control;
-
- MOD_INC_USE_COUNT;
-
- IP_MASQ_DEBUG(1-debug, "Entered user_info with proto=%d\n", proto);
-
- if (offset < 128)
- {
- sprintf(temp,
- "Prot SrcIP SPrt DstIP DPrt MAddr MPrt State Flgs Ref Ctl Expires (free=%d,%d,%d)",
- atomic_read(ip_masq_free_ports),
- atomic_read(ip_masq_free_ports+1),
- atomic_read(ip_masq_free_ports+2));
- len = sprintf(buffer, "%-127s\n", temp);
- }
- pos = 128;
-
- for(idx = 0; idx < IP_MASQ_TAB_SIZE; idx++)
- {
- /*
-		 * Lock is actually only needed in the next loop;
-		 * we are called from user space, so we must stop bottom halves.
- */
- read_lock_bh(&__ip_masq_lock);
- for(ms = ip_masq_m_tab[idx]; ms ; ms = ms->m_link)
- {
- if (ms->protocol != proto) {
- continue;
- }
-
- pos += 128;
- if (pos <= offset) {
- len = 0;
- continue;
- }
-
- /*
- * We have locked the tables, no need to del/add timers
- * nor cli() 8)
- */
-
-
- magic_control = atomic_read(&ms->n_control);
- if (!magic_control && ms->control) magic_control = -1;
- sprintf(temp,"%-4s %08lX:%04X %08lX:%04X %08lX:%04X %-12s %3X %4d %3d %7lu",
- masq_proto_name(ms->protocol),
- ntohl(ms->saddr), ntohs(ms->sport),
- ntohl(ms->daddr), ntohs(ms->dport),
- ntohl(ms->maddr), ntohs(ms->mport),
- ip_masq_state_name(ms->state),
- ms->flags,
- atomic_read(&ms->refcnt),
- magic_control,
- (ms->timer.expires-jiffies)/HZ);
- len += sprintf(buffer+len, "%-127s\n", temp);
-
- if(len >= length) {
- read_unlock_bh(&__ip_masq_lock);
- goto done;
- }
- }
- read_unlock_bh(&__ip_masq_lock);
- }
-
-done:
-
- if (len) {
- begin = len - (pos - offset);
- *start = buffer + begin;
- len -= begin;
- }
- if(len>length)
- len = length;
- MOD_DEC_USE_COUNT;
- return len;
-}
-#else
-#define ip_masq_user_info NULL
-#endif
-
-static struct ip_masq_hook ip_masq_user = {
- ip_masq_user_ctl,
- ip_masq_user_info
-};
-
-int ip_masq_user_init(void)
-{
- if (ip_masq_user_hook != NULL)
- return -EEXIST;
- ip_masq_user_hook = &ip_masq_user;
- return 0;
-}
-
-int ip_masq_user_done(void)
-{
- if (ip_masq_user_hook == NULL)
- return ENOENT;
- ip_masq_user_hook = NULL;
- return 0;
-}
-
-#ifdef MODULE
-EXPORT_NO_SYMBOLS;
-int init_module(void)
-{
- if (ip_masq_user_init() != 0)
- return -EIO;
- return 0;
-}
-
-void cleanup_module(void)
-{
- if (ip_masq_user_done() != 0)
- printk(KERN_INFO "ip_masq_user_done(): can't remove module");
-}
-
-#endif /* MODULE */
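
The deleted ip_masq_user_info() follows the 2.2-era /proc read protocol: emit fixed 128-byte records, advance a virtual position, and report through *start where the caller's window begins. A user-space sketch of that windowing arithmetic, with invented record contents but the same offset/len bookkeeping:

    #include <stdio.h>

    #define REC 128  /* every record is padded to this width */

    /* Mimics the /proc read protocol used by ip_masq_user_info(): only
     * records overlapping [offset, offset+length) are copied into
     * buffer, and *start tells the caller where its window begins. */
    static int masq_info_read(char *buffer, char **start, long offset,
                              int length, int nrecords)
    {
        long pos, begin = 0;
        int len = 0, i;
        char tmp[64];

        if (offset < REC)
            len = snprintf(buffer, REC + 1, "%-127s\n", "HEADER");
        pos = REC;

        for (i = 0; i < nrecords; i++) {
            pos += REC;
            if (pos <= offset) {  /* record lies before the window */
                len = 0;
                continue;
            }
            snprintf(tmp, sizeof(tmp), "record %d", i);
            len += snprintf(buffer + len, REC + 1, "%-127s\n", tmp);
            if (len >= length)
                break;
        }

        if (len) {  /* trim the partial record before offset */
            begin = len - (pos - offset);
            *start = buffer + begin;
            len -= begin;
        }
        if (len > length)
            len = length;
        return len;
    }

    int main(void)
    {
        char buf[4096], *start = buf;
        int n = masq_info_read(buf, &start, 0, sizeof(buf), 5);

        fwrite(start, 1, (size_t)n, stdout);
        return 0;
    }
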
diff --git a/net/ipv4/ip_masq_vdolive.c b/net/ipv4/ip_masq_vdolive.c
deleted file mode 100644
index 2d8d672cc..000000000
--- a/net/ipv4/ip_masq_vdolive.c
+++ /dev/null
@@ -1,298 +0,0 @@
-/*
- * IP_MASQ_VDOLIVE - VDO Live masquerading module
- *
- *
- * Version: @(#)$Id: ip_masq_vdolive.c,v 1.6 1999/06/09 08:29:03 davem Exp $
- *
- * Author: Nigel Metheringham <Nigel.Metheringham@ThePLAnet.net>
- * PLAnet Online Ltd
- *
- * Fixes: Minor changes for 2.1 by
- * Steven Clarke <Steven.Clarke@ThePlanet.Net>, Planet Online Ltd
- *
- * Add missing #include <linux/string.h>
- * Horst von Brand <vonbrand@sleipnir.valparaiso.cl>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Thanks:
- * Thank you to VDOnet Corporation for allowing me access to
- * a protocol description without an NDA. This means that
- * this module can be distributed as source - a great help!
- *
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <asm/system.h>
-#include <linux/skbuff.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/init.h>
-#include <net/protocol.h>
-#include <net/tcp.h>
-#include <net/ip_masq.h>
-
-struct vdolive_priv_data {
- /* Ports used */
- unsigned short origport;
- unsigned short masqport;
- /* State of decode */
- unsigned short state;
-};
-
-/*
- * List of ports (up to MAX_MASQ_APP_PORTS) to be handled by helper
- * First port is set to the default port.
- */
-static int ports[MAX_MASQ_APP_PORTS] = {7000}; /* I rely on the trailing items being set to zero */
-struct ip_masq_app *masq_incarnations[MAX_MASQ_APP_PORTS];
-
-/*
- * Debug level
- */
-#ifdef CONFIG_IP_MASQ_DEBUG
-static int debug=0;
-MODULE_PARM(debug, "i");
-#endif
-
-MODULE_PARM(ports, "1-" __MODULE_STRING(MAX_MASQ_APP_PORTS) "i");
-
-static int
-masq_vdolive_init_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
-{
- MOD_INC_USE_COUNT;
- if ((ms->app_data = kmalloc(sizeof(struct vdolive_priv_data),
- GFP_ATOMIC)) == NULL)
- IP_MASQ_DEBUG(1-debug, "VDOlive: No memory for application data\n");
- else
- {
- struct vdolive_priv_data *priv =
- (struct vdolive_priv_data *)ms->app_data;
- priv->origport = 0;
- priv->masqport = 0;
- priv->state = 0;
- }
- return 0;
-}
-
-static int
-masq_vdolive_done_1 (struct ip_masq_app *mapp, struct ip_masq *ms)
-{
- MOD_DEC_USE_COUNT;
- if (ms->app_data)
- kfree_s(ms->app_data, sizeof(struct vdolive_priv_data));
- return 0;
-}
-
-int
-masq_vdolive_out (struct ip_masq_app *mapp, struct ip_masq *ms, struct sk_buff **skb_p, __u32 maddr)
-{
- struct sk_buff *skb;
- struct iphdr *iph;
- struct tcphdr *th;
- char *data, *data_limit;
- unsigned int tagval; /* This should be a 32 bit quantity */
- struct ip_masq *n_ms;
- struct vdolive_priv_data *priv =
- (struct vdolive_priv_data *)ms->app_data;
-
- /* This doesn't work at all if no priv data was allocated on startup */
- if (!priv)
- return 0;
-
- /* Everything running correctly already */
- if (priv->state == 3)
- return 0;
-
- skb = *skb_p;
- iph = skb->nh.iph;
- th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
- data = (char *)&th[1];
-
- data_limit = skb->h.raw + skb->len;
-
- if (data+8 > data_limit) {
- IP_MASQ_DEBUG(1-debug, "VDOlive: packet too short for ID %p %p\n", data, data_limit);
- return 0;
- }
- memcpy(&tagval, data+4, 4);
- IP_MASQ_DEBUG(1-debug, "VDOlive: packet seen, tag %ld, in initial state %d\n", ntohl(tagval), priv->state);
-
- /* Check for leading packet ID */
- if ((ntohl(tagval) != 6) && (ntohl(tagval) != 1)) {
- IP_MASQ_DEBUG(1-debug, "VDOlive: unrecognised tag %ld, in initial state %d\n", ntohl(tagval), priv->state);
- return 0;
- }
-
-
- /* Check packet is long enough for data - ignore if not */
- if ((ntohl(tagval) == 6) && (data+36 > data_limit)) {
- IP_MASQ_DEBUG(1-debug, "VDOlive: initial packet too short %p %p\n", data, data_limit);
- return 0;
- } else if ((ntohl(tagval) == 1) && (data+20 > data_limit)) {
- IP_MASQ_DEBUG(1-debug,"VDOlive: secondary packet too short %p %p\n", data, data_limit);
- return 0;
- }
-
- /* Adjust data pointers */
- /*
- * I could check the complete protocol version tag
- * in here however I am just going to look for the
- * "VDO Live" tag in the hope that this part will
- * remain constant even if the version changes
- */
- if (ntohl(tagval) == 6) {
- data += 24;
- IP_MASQ_DEBUG(1-debug, "VDOlive: initial packet found\n");
- } else {
- data += 8;
- IP_MASQ_DEBUG(1-debug, "VDOlive: secondary packet found\n");
- }
-
- if (memcmp(data, "VDO Live", 8) != 0) {
- IP_MASQ_DEBUG(1-debug,"VDOlive: did not find tag\n");
- return 0;
- }
- /*
- * The port number is the next word after the tag.
- * VDOlive encodes all of these values
- * in 32 bit words, so in this case I am
- * skipping the first 2 bytes of the next
- * word to get to the relevant 16 bits
- */
- data += 10;
-
- /*
- * If we have not seen the port already,
- * set the masquerading tunnel up
- */
- if (!priv->origport) {
- memcpy(&priv->origport, data, 2);
- IP_MASQ_DEBUG(1-debug, "VDOlive: found port %d\n", ntohs(priv->origport));
-
- /* Open up a tunnel */
- n_ms = ip_masq_new(IPPROTO_UDP,
- maddr, 0,
- ms->saddr, priv->origport,
- ms->daddr, 0,
- IP_MASQ_F_NO_DPORT);
-
-		if (n_ms==NULL) {
- IP_MASQ_DEBUG(1-debug, "VDOlive: unable to build UDP tunnel for %x:%x\n", ms->saddr, priv->origport);
- /* Leave state as unset */
- priv->origport = 0;
- return 0;
- }
- ip_masq_listen(n_ms);
-
- ip_masq_put(ms);
- priv->masqport = n_ms->mport;
- } else if (memcmp(data, &(priv->origport), 2)) {
- IP_MASQ_DEBUG(1-debug, "VDOlive: ports do not match\n");
- /* Write the port in anyhow!!! */
- }
-
- /*
- * Write masq port into packet
- */
- memcpy(data, &(priv->masqport), 2);
- IP_MASQ_DEBUG(1-debug, "VDOlive: rewrote port %d to %d, server %08X\n", ntohs(priv->origport), ntohs(priv->masqport), ms->saddr);
-
- /*
-	 * Set state bit to mark which part has been done
- */
-
- priv->state |= (ntohl(tagval) == 6) ? 1 : 2;
-
- return 0;
-}
-
-
-struct ip_masq_app ip_masq_vdolive = {
- NULL, /* next */
- "VDOlive", /* name */
- 0, /* type */
- 0, /* n_attach */
- masq_vdolive_init_1, /* ip_masq_init_1 */
- masq_vdolive_done_1, /* ip_masq_done_1 */
- masq_vdolive_out, /* pkt_out */
- NULL /* pkt_in */
-};
-
-/*
- * ip_masq_vdolive initialization
- */
-
-__initfunc(int ip_masq_vdolive_init(void))
-{
- int i, j;
-
- for (i=0; (i<MAX_MASQ_APP_PORTS); i++) {
- if (ports[i]) {
- if ((masq_incarnations[i] = kmalloc(sizeof(struct ip_masq_app),
- GFP_KERNEL)) == NULL)
- return -ENOMEM;
- memcpy(masq_incarnations[i], &ip_masq_vdolive, sizeof(struct ip_masq_app));
- if ((j = register_ip_masq_app(masq_incarnations[i],
- IPPROTO_TCP,
- ports[i]))) {
- return j;
- }
- IP_MASQ_DEBUG(1-debug, "RealAudio: loaded support on port[%d] = %d\n", i, ports[i]);
- } else {
- /* To be safe, force the incarnation table entry to NULL */
- masq_incarnations[i] = NULL;
- }
- }
- return 0;
-}
-
-/*
- * ip_masq_vdolive fin.
- */
-
-int ip_masq_vdolive_done(void)
-{
- int i, j, k;
-
- k=0;
- for (i=0; (i<MAX_MASQ_APP_PORTS); i++) {
- if (masq_incarnations[i]) {
- if ((j = unregister_ip_masq_app(masq_incarnations[i]))) {
- k = j;
- } else {
- kfree(masq_incarnations[i]);
- masq_incarnations[i] = NULL;
- IP_MASQ_DEBUG(1-debug,"VDOlive: unloaded support on port[%d] = %d\n", i, ports[i]);
- }
- }
- }
- return k;
-}
-
-
-#ifdef MODULE
-EXPORT_NO_SYMBOLS;
-
-int init_module(void)
-{
- if (ip_masq_vdolive_init() != 0)
- return -EIO;
- return 0;
-}
-
-void cleanup_module(void)
-{
- if (ip_masq_vdolive_done() != 0)
- IP_MASQ_DEBUG(1-debug, "ip_masq_vdolive: can't remove module");
-}
-
-#endif /* MODULE */
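
masq_vdolive_out() pulls a 32-bit tag and a 16-bit port out of the payload with memcpy() rather than pointer casts, since the fields are only byte-aligned inside the packet. A stand-alone sketch of that access pattern (payload bytes invented):

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Read big-endian fields from an unaligned payload the way the
     * module does: memcpy into a local, then ntohl/ntohs. Casting the
     * data pointer to (uint32_t *) could fault on strict-alignment
     * CPUs, which is presumably why the code copies first. */
    static uint32_t get_be32(const unsigned char *p)
    {
        uint32_t v;
        memcpy(&v, p, 4);
        return ntohl(v);
    }

    static uint16_t get_be16(const unsigned char *p)
    {
        uint16_t v;
        memcpy(&v, p, 2);
        return ntohs(v);
    }

    int main(void)
    {
        /* 4-byte length, 4-byte tag (6 = initial packet), port bytes */
        const unsigned char pkt[] = { 0,0,0,36, 0,0,0,6, 0x1b,0x3a };

        printf("tag=%u port=%u\n", get_be32(pkt + 4), get_be16(pkt + 8));
        return 0;
    }

The masq port is written back into the packet with memcpy() as well, for the same alignment reason.
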
diff --git a/net/ipv4/ip_nat_dumb.c b/net/ipv4/ip_nat_dumb.c
index 5a1c6d753..b45377af4 100644
--- a/net/ipv4/ip_nat_dumb.c
+++ b/net/ipv4/ip_nat_dumb.c
@@ -5,7 +5,7 @@
*
* Dumb Network Address Translation.
*
- * Version: $Id: ip_nat_dumb.c,v 1.8 1999/03/21 05:22:40 davem Exp $
+ * Version: $Id: ip_nat_dumb.c,v 1.9 1999/08/20 11:05:46 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
@@ -37,8 +37,6 @@
#include <net/icmp.h>
#include <linux/tcp.h>
#include <linux/udp.h>
-#include <linux/firewall.h>
-#include <linux/ip_fw.h>
#include <net/checksum.h>
#include <linux/route.h>
#include <net/route.h>
@@ -129,10 +127,13 @@ ip_do_nat(struct sk_buff *skb)
/* Use fib_lookup() until we get our own
* hash table of NATed hosts -- Rani
*/
- if (fib_lookup(&key, &res) == 0 && res.r) {
- ciph->daddr = fib_rules_policy(ciph->daddr, &res, &flags);
- if (ciph->daddr != idaddr)
- updated = 1;
+ if (fib_lookup(&key, &res) == 0) {
+ if (res.r) {
+ ciph->daddr = fib_rules_policy(ciph->daddr, &res, &flags);
+ if (ciph->daddr != idaddr)
+ updated = 1;
+ }
+ fib_res_put(&res);
}
} else {
ciph->daddr = iph->saddr;
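
The ip_nat_dumb.c hunk above is a reference-leak fix: after this change every successful fib_lookup() is paired with fib_res_put(), even when res.r turns out to be unused. A toy illustration of the acquire/use/release discipline (stand-in names, not the kernel API):

    #include <stdio.h>
    #include <stdlib.h>

    /* Toy stand-ins for fib_lookup()/fib_res_put(): a successful lookup
     * returns an object holding a reference the caller must drop. */
    struct result { int refcnt; int rule; };

    static int lookup(int key, struct result **resp)
    {
        struct result *res = malloc(sizeof(*res));

        if (!res)
            return -1;
        res->refcnt = 1;
        res->rule = key & 1;  /* pretend odd keys match a policy rule */
        *resp = res;
        return 0;
    }

    static void res_put(struct result *res)
    {
        if (--res->refcnt == 0)
            free(res);
    }

    int main(void)
    {
        struct result *res;

        if (lookup(43, &res) == 0) {
            if (res->rule)
                printf("policy rule matched\n");
            res_put(res);  /* must run on every success path */
        }
        return 0;
    }
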
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 44d635573..5e6b50ea7 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -5,7 +5,7 @@
*
* The Internet Protocol (IP) output module.
*
- * Version: $Id: ip_output.c,v 1.67 1999/03/25 00:43:00 davem Exp $
+ * Version: $Id: ip_output.c,v 1.72 1999/09/07 02:31:15 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -72,8 +72,7 @@
#include <net/raw.h>
#include <net/checksum.h>
#include <linux/igmp.h>
-#include <linux/ip_fw.h>
-#include <linux/firewall.h>
+#include <linux/netfilter_ipv4.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
@@ -83,7 +82,6 @@
int sysctl_ip_dynaddr = 0;
-
int ip_id_count = 0;
/* Generate a checksum for an outgoing IP datagram. */
@@ -93,6 +91,61 @@ __inline__ void ip_send_check(struct iphdr *iph)
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
+/* dev_loopback_xmit for use with netfilter. */
+static int ip_dev_loopback_xmit(struct sk_buff *newskb)
+{
+ newskb->mac.raw = newskb->data;
+ skb_pull(newskb, newskb->nh.raw - newskb->data);
+ newskb->pkt_type = PACKET_LOOPBACK;
+ newskb->ip_summed = CHECKSUM_UNNECESSARY;
+ BUG_TRAP(newskb->dst);
+
+#ifdef CONFIG_NETFILTER_DEBUG
+ nf_debug_ip_loopback_xmit(newskb);
+#endif
+ netif_rx(newskb);
+ return 0;
+}
+
+#ifdef CONFIG_NETFILTER
+/* To preserve the cute illusion that a locally-generated packet can
+ be mangled before routing, we actually reroute if a hook altered
+ the packet. -RR */
+static int route_me_harder(struct sk_buff *skb)
+{
+ struct iphdr *iph = skb->nh.iph;
+ struct rtable *rt;
+
+ if (ip_route_output(&rt, iph->daddr, iph->saddr,
+ RT_TOS(iph->tos) | RTO_CONN,
+ skb->sk ? skb->sk->bound_dev_if : 0)) {
+ printk("route_me_harder: No more route.\n");
+ return -EINVAL;
+ }
+
+ /* Drop old route. */
+ dst_release(skb->dst);
+
+ skb->dst = &rt->u.dst;
+ return 0;
+}
+#endif
+
+/* Do route recalc if netfilter changes skb. */
+static inline int
+output_maybe_reroute(struct sk_buff *skb)
+{
+#ifdef CONFIG_NETFILTER
+ if (skb->nfcache & NFC_ALTERED) {
+ if (route_me_harder(skb) != 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+ }
+#endif
+ return skb->dst->output(skb);
+}
+
/*
* Add an ip header to a skbuff and send it out.
*/
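
The new output path replaces the firewall switch statement with NF_HOOK(): the packet is handed to a hook chain together with an "okfn" continuation, and the continuation (output_maybe_reroute above) re-resolves the route only when some hook marked the skb as altered. A user-space sketch of that control flow, all names invented:

    #include <stdio.h>

    #define ALTERED 0x1

    struct pkt { unsigned flags; unsigned daddr; };

    enum verdict { ACCEPT, DROP };

    /* One hook in the chain: may mangle the packet and mark it. */
    static enum verdict mangle_hook(struct pkt *p)
    {
        p->daddr += 1;  /* pretend NAT rewrote the destination */
        p->flags |= ALTERED;
        return ACCEPT;
    }

    static int transmit(struct pkt *p)
    {
        printf("xmit to %u\n", p->daddr);
        return 0;
    }

    /* The "okfn": reroute only if a hook changed the packet. */
    static int output_maybe_reroute(struct pkt *p)
    {
        if (p->flags & ALTERED)
            printf("rerouting to %u first\n", p->daddr);
        return transmit(p);
    }

    /* Minimal NF_HOOK() shape: run the hook, then the continuation
     * on ACCEPT, otherwise drop. */
    static int hook_then(struct pkt *p,
                         enum verdict (*hook)(struct pkt *),
                         int (*okfn)(struct pkt *))
    {
        return hook(p) == ACCEPT ? okfn(p) : -1;
    }

    int main(void)
    {
        struct pkt p = { 0, 10 };

        return hook_then(&p, mangle_hook, output_maybe_reroute);
    }
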
@@ -101,7 +154,6 @@ void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
{
struct rtable *rt = (struct rtable *)skb->dst;
struct iphdr *iph;
- struct device *dev;
/* Build the IP header. */
if (opt)
@@ -111,11 +163,11 @@ void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
iph->version = 4;
iph->ihl = 5;
- iph->tos = sk->ip_tos;
+ iph->tos = sk->protinfo.af_inet.tos;
iph->frag_off = 0;
if (ip_dont_fragment(sk, &rt->u.dst))
iph->frag_off |= htons(IP_DF);
- iph->ttl = sk->ip_ttl;
+ iph->ttl = sk->protinfo.af_inet.ttl;
iph->daddr = rt->rt_dst;
iph->saddr = rt->rt_src;
iph->protocol = sk->protocol;
@@ -127,44 +179,56 @@ void ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
iph->ihl += opt->optlen>>2;
ip_options_build(skb, opt, daddr, rt, 0);
}
-
- dev = rt->u.dst.dev;
-
-#ifdef CONFIG_FIREWALL
- /* Now we have no better mechanism to notify about error. */
- switch (call_out_firewall(PF_INET, dev, iph, NULL, &skb)) {
- case FW_REJECT:
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
- /* Fall thru... */
- case FW_BLOCK:
- case FW_QUEUE:
- kfree_skb(skb);
- return;
- }
-#endif
-
ip_send_check(iph);
/* Send it out. */
- skb->dst->output(skb);
- return;
+ NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, NULL,
+ output_maybe_reroute);
}
-int __ip_finish_output(struct sk_buff *skb)
+static inline int ip_finish_output2(struct sk_buff *skb)
{
- return ip_finish_output(skb);
+ struct dst_entry *dst = skb->dst;
+ struct hh_cache *hh = dst->hh;
+
+#ifdef CONFIG_NETFILTER_DEBUG
+ nf_debug_ip_finish_output2(skb);
+#endif /*CONFIG_NETFILTER_DEBUG*/
+
+ if (hh) {
+ read_lock_bh(&hh->hh_lock);
+ memcpy(skb->data - 16, hh->hh_data, 16);
+ read_unlock_bh(&hh->hh_lock);
+ skb_push(skb, hh->hh_len);
+ return hh->hh_output(skb);
+ } else if (dst->neighbour)
+ return dst->neighbour->output(skb);
+
+ printk(KERN_DEBUG "khm\n");
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+__inline__ int ip_finish_output(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dst->dev;
+
+ skb->dev = dev;
+ skb->protocol = __constant_htons(ETH_P_IP);
+
+ return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
+ ip_finish_output2);
}
int ip_mc_output(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
struct rtable *rt = (struct rtable*)skb->dst;
- struct device *dev = rt->u.dst.dev;
+ struct net_device *dev = rt->u.dst.dev;
/*
* If the indicated interface is up and running, send the packet.
*/
-
ip_statistics.IpOutRequests++;
#ifdef CONFIG_IP_ROUTE_NAT
if (rt->rt_flags & RTCF_NAT)
@@ -178,7 +242,7 @@ int ip_mc_output(struct sk_buff *skb)
* Multicasts are looped back for other local users
*/
- if (rt->rt_flags&RTCF_MULTICAST && (!sk || sk->ip_mc_loop)) {
+ if (rt->rt_flags&RTCF_MULTICAST && (!sk || sk->protinfo.af_inet.mc_loop)) {
#ifdef CONFIG_IP_MROUTE
/* Small optimization: do not loopback not local frames,
which returned after forwarding; they will be dropped
@@ -190,7 +254,13 @@ int ip_mc_output(struct sk_buff *skb)
*/
if ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
#endif
- dev_loopback_xmit(skb);
+ {
+ struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
+ if (newskb)
+ NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, newskb, NULL,
+ newskb->dev,
+ ip_dev_loopback_xmit);
+ }
/* Multicasts with ttl 0 must not go beyond the host */
@@ -200,8 +270,12 @@ int ip_mc_output(struct sk_buff *skb)
}
}
- if (rt->rt_flags&RTCF_BROADCAST)
- dev_loopback_xmit(skb);
+ if (rt->rt_flags&RTCF_BROADCAST) {
+ struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
+ if (newskb)
+ NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
+ newskb->dev, ip_dev_loopback_xmit);
+ }
return ip_finish_output(skb);
}
@@ -231,82 +305,33 @@ int ip_output(struct sk_buff *skb)
* most likely make other reliable transport layers above IP easier
* to implement under Linux.
*/
-void ip_queue_xmit(struct sk_buff *skb)
+static inline int ip_queue_xmit2(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
- struct ip_options *opt = sk->opt;
- struct rtable *rt;
- struct device *dev;
- struct iphdr *iph;
- unsigned int tot_len;
-
- /* Make sure we can route this packet. */
- rt = (struct rtable *) sk->dst_cache;
- if(rt == NULL || rt->u.dst.obsolete) {
- u32 daddr;
-
- sk->dst_cache = NULL;
- ip_rt_put(rt);
-
- /* Use correct destination address if we have options. */
- daddr = sk->daddr;
- if(opt && opt->srr)
- daddr = opt->faddr;
-
- /* If this fails, retransmit mechanism of transport layer will
- * keep trying until route appears or the connection times itself
- * out.
- */
- if(ip_route_output(&rt, daddr, sk->saddr,
- RT_TOS(sk->ip_tos) | RTO_CONN | sk->localroute,
- sk->bound_dev_if))
- goto drop;
- sk->dst_cache = &rt->u.dst;
- }
- if(opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
- goto no_route;
-
- /* We have a route, so grab a reference. */
- skb->dst = dst_clone(sk->dst_cache);
-
- /* OK, we know where to send it, allocate and build IP header. */
- iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
- iph->version = 4;
- iph->ihl = 5;
- iph->tos = sk->ip_tos;
- iph->frag_off = 0;
- iph->ttl = sk->ip_ttl;
- iph->daddr = rt->rt_dst;
- iph->saddr = rt->rt_src;
- iph->protocol = sk->protocol;
- skb->nh.iph = iph;
- /* Transport layer set skb->h.foo itself. */
-
- if(opt && opt->optlen) {
- iph->ihl += opt->optlen >> 2;
- ip_options_build(skb, opt, sk->daddr, rt, 0);
+ struct rtable *rt = (struct rtable *)skb->dst;
+ struct net_device *dev;
+ struct iphdr *iph = skb->nh.iph;
+
+#ifdef CONFIG_NETFILTER
+ /* BLUE-PEN-FOR-ALEXEY. I don't understand; you mean I can't
+ hold the route as I pass the packet to userspace? -- RR
+
+ You may hold it, if you really hold it. F.e. if netfilter
+ does not destroy handed skb with skb->dst attached, it
+ will be held. When it was stored in info->arg, then
+ it was not held apparently. Now (without second arg) it is evident,
+ that it is clean. --ANK
+ */
+ if (rt==NULL || (skb->nfcache & NFC_ALTERED)) {
+ if (route_me_harder(skb) != 0) {
+ kfree_skb(skb);
+ return -EHOSTUNREACH;
+ }
}
-
- tot_len = skb->len;
- iph->tot_len = htons(tot_len);
- iph->id = htons(ip_id_count++);
+#endif
dev = rt->u.dst.dev;
-#ifdef CONFIG_FIREWALL
- /* Now we have no better mechanism to notify about error. */
- switch (call_out_firewall(PF_INET, dev, iph, NULL, &skb)) {
- case FW_REJECT:
- start_bh_atomic();
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
- end_bh_atomic();
- /* Fall thru... */
- case FW_BLOCK:
- case FW_QUEUE:
- goto drop;
- }
-#endif
-
/* This can happen when the transport layer has segments queued
* with a cached route, and by the time we get here things are
* re-routed to a device with a different MTU than the original
@@ -318,17 +343,14 @@ void ip_queue_xmit(struct sk_buff *skb)
skb2 = skb_realloc_headroom(skb, (dev->hard_header_len + 15) & ~15);
kfree_skb(skb);
if (skb2 == NULL)
- return;
+ return -ENOMEM;
if (sk)
- skb_set_owner_w(skb, sk);
+ skb_set_owner_w(skb2, sk);
skb = skb2;
iph = skb->nh.iph;
}
- /* Do we need to fragment. Again this is inefficient. We
- * need to somehow lock the original buffer and use bits of it.
- */
- if (tot_len > rt->u.dst.pmtu)
+ if (skb->len > rt->u.dst.pmtu)
goto fragment;
if (ip_dont_fragment(sk, &rt->u.dst))
@@ -338,37 +360,84 @@ void ip_queue_xmit(struct sk_buff *skb)
ip_send_check(iph);
skb->priority = sk->priority;
- skb->dst->output(skb);
- return;
+ return skb->dst->output(skb);
fragment:
- if (ip_dont_fragment(sk, &rt->u.dst) &&
- tot_len > (iph->ihl<<2) + sizeof(struct tcphdr)+16) {
+ if (ip_dont_fragment(sk, &rt->u.dst)) {
/* Reject packet ONLY if TCP might fragment
-	   it itself, if we're careful enough.
-	   Test is not precise (f.e. it does not take sacks
-	   into account). Actually, tcp should make it. --ANK (980801)
+		 * it itself, if we're careful enough.
*/
iph->frag_off |= __constant_htons(IP_DF);
NETDEBUG(printk(KERN_DEBUG "sending pkt_too_big to self\n"));
- /* icmp_send is not reenterable, so that bh_atomic... --ANK */
- start_bh_atomic();
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(rt->u.dst.pmtu));
- end_bh_atomic();
- goto drop;
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+ return ip_fragment(skb, skb->dst->output);
+}
+
+int ip_queue_xmit(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ struct ip_options *opt = sk->protinfo.af_inet.opt;
+ struct rtable *rt;
+ struct iphdr *iph;
+
+ /* Make sure we can route this packet. */
+ rt = (struct rtable *)__sk_dst_check(sk, 0);
+ if (rt == NULL) {
+ u32 daddr;
+
+ /* Use correct destination address if we have options. */
+ daddr = sk->daddr;
+ if(opt && opt->srr)
+ daddr = opt->faddr;
+
+ /* If this fails, retransmit mechanism of transport layer will
+ * keep trying until route appears or the connection times itself
+ * out.
+ */
+ if (ip_route_output(&rt, daddr, sk->saddr,
+ RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
+ sk->bound_dev_if))
+ goto no_route;
+ __sk_dst_set(sk, &rt->u.dst);
+ }
+ skb->dst = dst_clone(&rt->u.dst);
+
+ if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
+ goto no_route;
+
+ /* OK, we know where to send it, allocate and build IP header. */
+ iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
+ iph->version = 4;
+ iph->ihl = 5;
+ iph->tos = sk->protinfo.af_inet.tos;
+ iph->frag_off = 0;
+ iph->ttl = sk->protinfo.af_inet.ttl;
+ iph->daddr = rt->rt_dst;
+ iph->saddr = rt->rt_src;
+ iph->protocol = sk->protocol;
+ skb->nh.iph = iph;
+ /* Transport layer set skb->h.foo itself. */
+
+ if(opt && opt->optlen) {
+ iph->ihl += opt->optlen >> 2;
+ ip_options_build(skb, opt, sk->daddr, rt, 0);
}
- ip_fragment(skb, skb->dst->output);
- return;
+
+ iph->tot_len = htons(skb->len);
+ iph->id = htons(ip_id_count++);
+
+ return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+ ip_queue_xmit2);
no_route:
- sk->dst_cache = NULL;
- ip_rt_put(rt);
ip_statistics.IpOutNoRoutes++;
- /* Fall through... */
-drop:
kfree_skb(skb);
+ return -EHOSTUNREACH;
}
/*
@@ -391,7 +460,7 @@ drop:
* length to be copied.
*/
-int ip_build_xmit_slow(struct sock *sk,
+static int ip_build_xmit_slow(struct sock *sk,
int getfrag (const void *,
char *,
unsigned int,
@@ -454,8 +523,7 @@ int ip_build_xmit_slow(struct sock *sk,
fraglen = maxfraglen;
offset -= maxfraglen-fragheaderlen;
}
-
-
+
/*
* The last fragment will not have MF (more fragments) set.
*/
@@ -468,16 +536,12 @@ int ip_build_xmit_slow(struct sock *sk,
if (offset > 0 && df) {
ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
- return(-EMSGSIZE);
+ return -EMSGSIZE;
}
+ if (flags&MSG_PROBE)
+ goto out;
/*
- * Lock the device lists.
- */
-
- dev_lock_list();
-
- /*
* Get an identifier
*/
@@ -528,15 +592,15 @@ int ip_build_xmit_slow(struct sock *sk,
ip_options_build(skb, opt,
ipc->addr, rt, offset);
}
- iph->tos = sk->ip_tos;
+ iph->tos = sk->protinfo.af_inet.tos;
iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4);
iph->id = id;
iph->frag_off = htons(offset>>3);
iph->frag_off |= mf|df;
if (rt->rt_type == RTN_MULTICAST)
- iph->ttl = sk->ip_mc_ttl;
+ iph->ttl = sk->protinfo.af_inet.mc_ttl;
else
- iph->ttl = sk->ip_ttl;
+ iph->ttl = sk->protinfo.af_inet.ttl;
iph->protocol = sk->protocol;
iph->check = 0;
iph->saddr = rt->rt_src;
@@ -566,38 +630,28 @@ int ip_build_xmit_slow(struct sock *sk,
nfrags++;
-#ifdef CONFIG_FIREWALL
- switch (call_out_firewall(PF_INET, rt->u.dst.dev, skb->nh.iph, NULL, &skb)) {
- case FW_QUEUE:
- kfree_skb(skb);
- continue;
- case FW_BLOCK:
- case FW_REJECT:
- kfree_skb(skb);
- err = -EPERM;
- goto error;
+ err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
+ skb->dst->dev, output_maybe_reroute);
+ if (err) {
+ if (err > 0)
+ err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
+ if (err)
+ goto error;
}
-#endif
-
- err = -ENETDOWN;
- if (rt->u.dst.output(skb))
- goto error;
} while (offset >= 0);
if (nfrags>1)
ip_statistics.IpFragCreates += nfrags;
- dev_unlock_list();
+out:
return 0;
error:
ip_statistics.IpOutDiscards++;
if (nfrags>1)
ip_statistics.IpFragCreates += nfrags;
- dev_unlock_list();
return err;
}
-
/*
* Fast path for unfragmented packets.
*/
@@ -622,7 +676,7 @@ int ip_build_xmit(struct sock *sk,
* choice RAW frames within 20 bytes of maximum size(rare) to the long path
*/
- if (!sk->ip_hdrincl) {
+ if (!sk->protinfo.af_inet.hdrincl) {
length += sizeof(struct iphdr);
/*
@@ -636,6 +690,8 @@ int ip_build_xmit(struct sock *sk,
return -EMSGSIZE;
}
}
+ if (flags&MSG_PROBE)
+ goto out;
/*
* Do path mtu discovery if needed.
@@ -662,18 +718,16 @@ int ip_build_xmit(struct sock *sk,
skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);
- dev_lock_list();
-
- if(!sk->ip_hdrincl) {
+ if(!sk->protinfo.af_inet.hdrincl) {
iph->version=4;
iph->ihl=5;
- iph->tos=sk->ip_tos;
+ iph->tos=sk->protinfo.af_inet.tos;
iph->tot_len = htons(length);
iph->id=htons(ip_id_count++);
iph->frag_off = df;
- iph->ttl=sk->ip_mc_ttl;
+ iph->ttl=sk->protinfo.af_inet.mc_ttl;
if (rt->rt_type != RTN_MULTICAST)
- iph->ttl=sk->ip_ttl;
+ iph->ttl=sk->protinfo.af_inet.ttl;
iph->protocol=sk->protocol;
iph->saddr=rt->rt_src;
iph->daddr=rt->rt_dst;
@@ -684,25 +738,17 @@ int ip_build_xmit(struct sock *sk,
else
err = getfrag(frag, (void *)iph, 0, length);
- dev_unlock_list();
-
if (err)
goto error_fault;
-#ifdef CONFIG_FIREWALL
- switch (call_out_firewall(PF_INET, rt->u.dst.dev, iph, NULL, &skb)) {
- case FW_QUEUE:
- kfree_skb(skb);
- return 0;
- case FW_BLOCK:
- case FW_REJECT:
- kfree_skb(skb);
- err = -EPERM;
+ err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+ output_maybe_reroute);
+ if (err > 0)
+ err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
+ if (err)
goto error;
- }
-#endif
-
- return rt->u.dst.output(skb);
+out:
+ return 0;
error_fault:
err = -EFAULT;
@@ -723,17 +769,18 @@ error:
* Yes this is inefficient, feel free to submit a quicker one.
*/
-void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
+int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
{
struct iphdr *iph;
unsigned char *raw;
unsigned char *ptr;
- struct device *dev;
+ struct net_device *dev;
struct sk_buff *skb2;
unsigned int mtu, hlen, left, len;
int offset;
int not_last_frag;
struct rtable *rt = (struct rtable*)skb->dst;
+ int err = 0;
dev = rt->u.dst.dev;
@@ -754,19 +801,6 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
ptr = raw + hlen; /* Where to start from */
/*
- * The protocol doesn't seem to say what to do in the case that the
- * frame + options doesn't fit the mtu. As it used to fall down dead
- * in this case we were fortunate it didn't happen
- *
- * It is impossible, because mtu>=68. --ANK (980801)
- */
-
-#ifdef CONFIG_NET_PARANOIA
- if (mtu<8)
- goto fail;
-#endif
-
- /*
* Fragment the datagram.
*/
@@ -793,6 +827,7 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15,GFP_ATOMIC)) == NULL) {
NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
+ err = -ENOMEM;
goto fail;
}
@@ -862,15 +897,18 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
ip_send_check(iph);
- output(skb2);
+ err = output(skb2);
+ if (err)
+ goto fail;
}
kfree_skb(skb);
ip_statistics.IpFragOKs++;
- return;
+ return err;
fail:
kfree_skb(skb);
- ip_statistics.IpFragFails++;
+ ip_statistics.IpFragFails++;
+ return err;
}
/*
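
ip_fragment() clamps each piece to mtu minus the header, rounds every non-final fragment down to a multiple of 8 (the header stores offset >> 3, so offsets must be 8-byte aligned), and sets MF on all but the last. The sizing arithmetic in isolation:

    #include <stdio.h>

    /* Print the fragment plan for a datagram: byte offset, length and
     * the more-fragments flag, mirroring the loop in ip_fragment(). */
    static void plan_fragments(int payload, int mtu, int hlen)
    {
        int left = payload, offset = 0;

        while (left > 0) {
            int len = left;

            if (len > mtu - hlen)
                len = mtu - hlen;
            if (len < left)
                len &= ~7;  /* non-final frags: multiple of 8 */
            printf("frag: offset=%d len=%d MF=%d\n",
                   offset, len, len < left);
            left -= len;
            offset += len;  /* header would carry offset >> 3 */
        }
    }

    int main(void)
    {
        plan_fragments(4000, 1500, 20);  /* 1480 + 1480 + 1040 */
        return 0;
    }
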
@@ -926,24 +964,31 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
struct ipcm_cookie ipc;
u32 daddr;
struct rtable *rt = (struct rtable*)skb->dst;
-
+
if (ip_options_echo(&replyopts.opt, skb))
return;
-
- sk->ip_tos = skb->nh.iph->tos;
- sk->priority = skb->priority;
- sk->protocol = skb->nh.iph->protocol;
daddr = ipc.addr = rt->rt_src;
ipc.opt = &replyopts.opt;
-
+
if (ipc.opt->srr)
daddr = replyopts.opt.faddr;
if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0))
return;
- /* And let IP do all the hard work. */
+ /* And let IP do all the hard work.
+
+ This chunk is not reenterable, hence spinlock.
+ Note that it uses the fact, that this function is called
+ with locally disabled BH and that sk cannot be already spinlocked.
+ */
+ bh_lock_sock(sk);
+ sk->protinfo.af_inet.tos = skb->nh.iph->tos;
+ sk->priority = skb->priority;
+ sk->protocol = skb->nh.iph->protocol;
ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT);
+ bh_unlock_sock(sk);
+
ip_rt_put(rt);
}
@@ -956,7 +1001,7 @@ static struct packet_type ip_packet_type =
__constant_htons(ETH_P_IP),
NULL, /* All devices */
ip_rcv,
- NULL,
+ (void*)1,
NULL,
};
@@ -977,7 +1022,7 @@ static struct proc_dir_entry proc_net_igmp = {
* IP registers the packet type and then calls the subprotocol initialisers
*/
-__initfunc(void ip_init(void))
+void __init ip_init(void)
{
dev_add_pack(&ip_packet_type);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 369a6770c..7278a0b4a 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -5,7 +5,7 @@
*
* The IP to API glue.
*
- * Version: $Id: ip_sockglue.c,v 1.42 1999/04/22 10:07:34 davem Exp $
+ * Version: $Id: ip_sockglue.c,v 1.45 1999/09/06 04:58:03 davem Exp $
*
* Authors: see ip.c
*
@@ -32,8 +32,7 @@
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/igmp.h>
-#include <linux/firewall.h>
-#include <linux/ip_fw.h>
+#include <linux/netfilter.h>
#include <linux/route.h>
#include <linux/mroute.h>
#include <net/route.h>
@@ -41,10 +40,6 @@
#include <net/transp_v6.h>
#endif
-#ifdef CONFIG_IP_MASQUERADE
-#include <linux/ip_masq.h>
-#endif
-
#include <linux/errqueue.h>
#include <asm/uaccess.h>
@@ -117,7 +112,7 @@ void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
{
- unsigned flags = skb->sk->ip_cmsg_flags;
+ unsigned flags = skb->sk->protinfo.af_inet.cmsg_flags;
/* Ordered by supposed usage frequency */
if (flags & 1)
@@ -193,6 +188,7 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc)
sent to multicast group to reach destination designated router.
*/
struct ip_ra_chain *ip_ra_chain;
+rwlock_t ip_ra_lock = RW_LOCK_UNLOCKED;
int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *))
{
@@ -203,30 +199,36 @@ int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct s
new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
+ write_lock_bh(&ip_ra_lock);
for (rap = &ip_ra_chain; (ra=*rap) != NULL; rap = &ra->next) {
if (ra->sk == sk) {
if (on) {
+ write_unlock_bh(&ip_ra_lock);
if (new_ra)
kfree(new_ra);
return -EADDRINUSE;
}
*rap = ra->next;
- synchronize_bh();
+ write_unlock_bh(&ip_ra_lock);
if (ra->destructor)
ra->destructor(sk);
+ sock_put(sk);
kfree(ra);
return 0;
}
}
- if (new_ra == NULL)
+ if (new_ra == NULL) {
+ write_unlock_bh(&ip_ra_lock);
return -ENOBUFS;
+ }
new_ra->sk = sk;
new_ra->destructor = destructor;
new_ra->next = ra;
- wmb();
*rap = new_ra;
+ sock_hold(sk);
+ write_unlock_bh(&ip_ra_lock);
return 0;
}
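
ip_ra_control() now publishes and unlinks chain entries under write_lock_bh(&ip_ra_lock) and pins the socket with sock_hold()/sock_put(), instead of relying on wmb()/synchronize_bh(). A pthread sketch of the same insert/remove-under-writer-lock pattern with an explicit refcount (names invented; link with -lpthread):

    #include <pthread.h>
    #include <stdlib.h>

    struct ra_entry {
        struct ra_entry *next;
        int *refcnt;  /* stands in for the struct sock refcount */
    };

    static struct ra_entry *ra_chain;
    static pthread_rwlock_t ra_lock = PTHREAD_RWLOCK_INITIALIZER;

    /* Add: allocate outside the lock, publish under it, take a ref. */
    static int ra_add(int *refcnt)
    {
        struct ra_entry *e = malloc(sizeof(*e));

        if (!e)
            return -1;
        e->refcnt = refcnt;
        pthread_rwlock_wrlock(&ra_lock);
        e->next = ra_chain;
        ra_chain = e;
        (*refcnt)++;  /* sock_hold() */
        pthread_rwlock_unlock(&ra_lock);
        return 0;
    }

    /* Remove: unlink under the lock, drop the reference afterwards,
     * mirroring how sock_put() runs after write_unlock_bh() above. */
    static void ra_del(int *refcnt)
    {
        struct ra_entry **pp, *e;

        pthread_rwlock_wrlock(&ra_lock);
        for (pp = &ra_chain; (e = *pp) != NULL; pp = &e->next)
            if (e->refcnt == refcnt) {
                *pp = e->next;
                break;
            }
        pthread_rwlock_unlock(&ra_lock);
        if (e) {
            (*refcnt)--;  /* sock_put() */
            free(e);
        }
    }

    int main(void)
    {
        int refs = 1;

        ra_add(&refs);
        ra_del(&refs);
        return refs == 1 ? 0 : 1;
    }
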
@@ -236,7 +238,7 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
{
struct sock_exterr_skb *serr;
- if (!sk->ip_recverr)
+ if (!sk->protinfo.af_inet.recverr)
return;
skb = skb_clone(skb, GFP_ATOMIC);
@@ -267,7 +269,7 @@ void ip_local_error(struct sock *sk, int err, u32 daddr, u16 port, u32 info)
struct iphdr *iph;
struct sk_buff *skb;
- if (!sk->ip_recverr)
+ if (!sk->protinfo.af_inet.recverr)
return;
skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC);
@@ -340,7 +342,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) {
sin->sin_family = AF_INET;
sin->sin_addr.s_addr = skb->nh.iph->saddr;
- if (sk->ip_cmsg_flags)
+ if (sk->protinfo.af_inet.cmsg_flags)
ip_cmsg_recv(msg, skb);
}
@@ -375,9 +377,7 @@ out:
int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
{
int val=0,err;
-#if defined(CONFIG_IP_FIREWALL)
- char tmp_fw[MAX(sizeof(struct ip_fwtest),sizeof(struct ip_fwnew))];
-#endif
+
if(optlen>=sizeof(int)) {
if(get_user(val, (int *) optval))
return -EFAULT;
@@ -397,18 +397,20 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt
return ip_mroute_setsockopt(sk,optname,optval,optlen);
}
#endif
-
+
+ err = 0;
+ lock_sock(sk);
+
switch(optname)
{
case IP_OPTIONS:
{
struct ip_options * opt = NULL;
if (optlen > 40 || optlen < 0)
- return -EINVAL;
+ goto e_inval;
err = ip_options_get(&opt, optval, optlen, 1);
if (err)
- return err;
- lock_sock(sk);
+ break;
if (sk->type == SOCK_STREAM) {
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
@@ -423,194 +425,192 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt
}
#endif
}
- opt = xchg(&sk->opt, opt);
- release_sock(sk);
+ opt = xchg(&sk->protinfo.af_inet.opt, opt);
if (opt)
kfree_s(opt, sizeof(struct ip_options) + opt->optlen);
- return 0;
+ break;
}
case IP_PKTINFO:
if (val)
- sk->ip_cmsg_flags |= IP_CMSG_PKTINFO;
+ sk->protinfo.af_inet.cmsg_flags |= IP_CMSG_PKTINFO;
else
- sk->ip_cmsg_flags &= ~IP_CMSG_PKTINFO;
- return 0;
+ sk->protinfo.af_inet.cmsg_flags &= ~IP_CMSG_PKTINFO;
+ break;
case IP_RECVTTL:
if (val)
- sk->ip_cmsg_flags |= IP_CMSG_TTL;
+ sk->protinfo.af_inet.cmsg_flags |= IP_CMSG_TTL;
else
- sk->ip_cmsg_flags &= ~IP_CMSG_TTL;
- return 0;
+ sk->protinfo.af_inet.cmsg_flags &= ~IP_CMSG_TTL;
+ break;
case IP_RECVTOS:
if (val)
- sk->ip_cmsg_flags |= IP_CMSG_TOS;
+ sk->protinfo.af_inet.cmsg_flags |= IP_CMSG_TOS;
else
- sk->ip_cmsg_flags &= ~IP_CMSG_TOS;
- return 0;
+ sk->protinfo.af_inet.cmsg_flags &= ~IP_CMSG_TOS;
+ break;
case IP_RECVOPTS:
if (val)
- sk->ip_cmsg_flags |= IP_CMSG_RECVOPTS;
+ sk->protinfo.af_inet.cmsg_flags |= IP_CMSG_RECVOPTS;
else
- sk->ip_cmsg_flags &= ~IP_CMSG_RECVOPTS;
- return 0;
+ sk->protinfo.af_inet.cmsg_flags &= ~IP_CMSG_RECVOPTS;
+ break;
case IP_RETOPTS:
if (val)
- sk->ip_cmsg_flags |= IP_CMSG_RETOPTS;
+ sk->protinfo.af_inet.cmsg_flags |= IP_CMSG_RETOPTS;
else
- sk->ip_cmsg_flags &= ~IP_CMSG_RETOPTS;
- return 0;
+ sk->protinfo.af_inet.cmsg_flags &= ~IP_CMSG_RETOPTS;
+ break;
case IP_TOS: /* This sets both TOS and Precedence */
/* Reject setting of unused bits */
if (val & ~(IPTOS_TOS_MASK|IPTOS_PREC_MASK))
- return -EINVAL;
+ goto e_inval;
if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP &&
- !capable(CAP_NET_ADMIN))
- return -EPERM;
- if (sk->ip_tos != val) {
- lock_sock(sk);
- sk->ip_tos=val;
+ !capable(CAP_NET_ADMIN)) {
+ err = -EPERM;
+ break;
+ }
+ if (sk->protinfo.af_inet.tos != val) {
+ sk->protinfo.af_inet.tos=val;
sk->priority = rt_tos2priority(val);
- dst_release(xchg(&sk->dst_cache, NULL));
- release_sock(sk);
+ sk_dst_reset(sk);
}
- return 0;
+ break;
case IP_TTL:
if (optlen<1)
- return -EINVAL;
+ goto e_inval;
if(val==-1)
val = ip_statistics.IpDefaultTTL;
if(val<1||val>255)
- return -EINVAL;
- sk->ip_ttl=val;
- return 0;
+ goto e_inval;
+ sk->protinfo.af_inet.ttl=val;
+ break;
case IP_HDRINCL:
- if(sk->type!=SOCK_RAW)
- return -ENOPROTOOPT;
- sk->ip_hdrincl=val?1:0;
- return 0;
+ if(sk->type!=SOCK_RAW) {
+ err = -ENOPROTOOPT;
+ break;
+ }
+ sk->protinfo.af_inet.hdrincl=val?1:0;
+ break;
case IP_MTU_DISCOVER:
if (val<0 || val>2)
- return -EINVAL;
- sk->ip_pmtudisc = val;
- return 0;
+ goto e_inval;
+ sk->protinfo.af_inet.pmtudisc = val;
+ break;
case IP_RECVERR:
- sk->ip_recverr = !!val;
+ sk->protinfo.af_inet.recverr = !!val;
if (!val)
skb_queue_purge(&sk->error_queue);
- return 0;
- case IP_MULTICAST_TTL:
+ break;
+ case IP_MULTICAST_TTL:
+ if (sk->type == SOCK_STREAM)
+ goto e_inval;
if (optlen<1)
- return -EINVAL;
+ goto e_inval;
if (val==-1)
val = 1;
if (val < 0 || val > 255)
- return -EINVAL;
- sk->ip_mc_ttl=val;
- return 0;
+ goto e_inval;
+ sk->protinfo.af_inet.mc_ttl=val;
+ break;
case IP_MULTICAST_LOOP:
if (optlen<1)
- return -EINVAL;
- sk->ip_mc_loop = val ? 1 : 0;
- return 0;
+ goto e_inval;
+ sk->protinfo.af_inet.mc_loop = val ? 1 : 0;
+ break;
case IP_MULTICAST_IF:
{
struct ip_mreqn mreq;
- struct device *dev = NULL;
-
+ struct net_device *dev = NULL;
+
+ if (sk->type == SOCK_STREAM)
+ goto e_inval;
/*
* Check the arguments are allowable
*/
+ err = -EFAULT;
if (optlen >= sizeof(struct ip_mreqn)) {
if (copy_from_user(&mreq,optval,sizeof(mreq)))
- return -EFAULT;
+ break;
} else {
memset(&mreq, 0, sizeof(mreq));
if (optlen >= sizeof(struct in_addr) &&
copy_from_user(&mreq.imr_address,optval,sizeof(struct in_addr)))
- return -EFAULT;
+ break;
}
if (!mreq.imr_ifindex) {
if (mreq.imr_address.s_addr == INADDR_ANY) {
- sk->ip_mc_index = 0;
- sk->ip_mc_addr = 0;
- return 0;
+ sk->protinfo.af_inet.mc_index = 0;
+ sk->protinfo.af_inet.mc_addr = 0;
+ err = 0;
+ break;
}
dev = ip_dev_find(mreq.imr_address.s_addr);
+ if (dev) {
+ mreq.imr_ifindex = dev->ifindex;
+ dev_put(dev);
+ }
} else
- dev = dev_get_by_index(mreq.imr_ifindex);
+ dev = __dev_get_by_index(mreq.imr_ifindex);
+
+ err = -EADDRNOTAVAIL;
if (!dev)
- return -EADDRNOTAVAIL;
+ break;
- if (sk->bound_dev_if && dev->ifindex != sk->bound_dev_if)
- return -EINVAL;
+ err = -EINVAL;
+ if (sk->bound_dev_if && mreq.imr_ifindex != sk->bound_dev_if)
+ break;
- sk->ip_mc_index = mreq.imr_ifindex;
- sk->ip_mc_addr = mreq.imr_address.s_addr;
- return 0;
+ sk->protinfo.af_inet.mc_index = mreq.imr_ifindex;
+ sk->protinfo.af_inet.mc_addr = mreq.imr_address.s_addr;
+ err = 0;
+ break;
}
case IP_ADD_MEMBERSHIP:
case IP_DROP_MEMBERSHIP:
{
struct ip_mreqn mreq;
-
+
if (optlen < sizeof(struct ip_mreq))
- return -EINVAL;
+ goto e_inval;
+ err = -EFAULT;
if (optlen >= sizeof(struct ip_mreqn)) {
if(copy_from_user(&mreq,optval,sizeof(mreq)))
- return -EFAULT;
+ break;
} else {
memset(&mreq, 0, sizeof(mreq));
if (copy_from_user(&mreq,optval,sizeof(struct ip_mreq)))
- return -EFAULT;
+ break;
}
if (optname == IP_ADD_MEMBERSHIP)
- return ip_mc_join_group(sk,&mreq);
+ err = ip_mc_join_group(sk,&mreq);
else
- return ip_mc_leave_group(sk,&mreq);
+ err = ip_mc_leave_group(sk,&mreq);
+ break;
}
case IP_ROUTER_ALERT:
- return ip_ra_control(sk, val ? 1 : 0, NULL);
-
-#ifdef CONFIG_IP_FIREWALL
- case IP_FW_MASQ_TIMEOUTS:
- case IP_FW_APPEND:
- case IP_FW_REPLACE:
- case IP_FW_DELETE:
- case IP_FW_DELETE_NUM:
- case IP_FW_INSERT:
- case IP_FW_FLUSH:
- case IP_FW_ZERO:
- case IP_FW_CHECK:
- case IP_FW_CREATECHAIN:
- case IP_FW_DELETECHAIN:
- case IP_FW_POLICY:
- if(!capable(CAP_NET_ADMIN))
- return -EACCES;
- if(optlen>sizeof(tmp_fw) || optlen<1)
- return -EINVAL;
- if(copy_from_user(&tmp_fw,optval,optlen))
- return -EFAULT;
- err=ip_fw_ctl(optname, &tmp_fw,optlen);
- return -err; /* -0 is 0 after all */
-#endif /* CONFIG_IP_FIREWALL */
-#ifdef CONFIG_IP_MASQUERADE
- case IP_FW_MASQ_CTL:
- if(!capable(CAP_NET_ADMIN))
- return -EPERM;
- if(optlen<1)
- return -EINVAL;
- err=ip_masq_uctl(optname, optval ,optlen);
- return err;
-
-#endif
+ err = ip_ra_control(sk, val ? 1 : 0, NULL);
+ break;
+
default:
- return(-ENOPROTOOPT);
+#ifdef CONFIG_NETFILTER
+ err = nf_setsockopt(sk, PF_INET, optname, optval,
+ optlen);
+#else
+ err = -ENOPROTOOPT;
+#endif
+ break;
}
+ release_sock(sk);
+ return err;
+
+e_inval:
+ release_sock(sk);
+ return -EINVAL;
}
/*
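
The reworked ip_setsockopt() takes lock_sock() once, lets every case fall through to a single release_sock(), and funnels bad arguments to the shared e_inval label instead of returning with ad-hoc locking from each case. The shape, reduced to a sketch (option numbers and prints are placeholders):

    #include <errno.h>
    #include <stdio.h>

    /* Shape of the reworked ip_setsockopt(): one lock, one unlock,
     * every branch ends in "break" or a shared error label. */
    static int set_option(int optname, int val)
    {
        int err = 0;

        /* lock_sock(sk); */
        switch (optname) {
        case 1:  /* e.g. IP_TTL */
            if (val < 1 || val > 255)
                goto e_inval;
            printf("ttl=%d\n", val);
            break;
        case 2:  /* e.g. IP_RECVERR */
            printf("recverr=%d\n", !!val);
            break;
        default:
            err = -ENOPROTOOPT;
            break;
        }
        /* release_sock(sk); */
        return err;

    e_inval:
        /* release_sock(sk); */
        return -EINVAL;
    }

    int main(void)
    {
        return set_option(1, 64) || set_option(1, 0) != -EINVAL;
    }
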
@@ -636,17 +636,20 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *op
if(get_user(len,optlen))
return -EFAULT;
- switch(optname)
- {
+ lock_sock(sk);
+
+ switch(optname) {
case IP_OPTIONS:
{
unsigned char optbuf[sizeof(struct ip_options)+40];
struct ip_options * opt = (struct ip_options*)optbuf;
- lock_sock(sk);
opt->optlen = 0;
- if (sk->opt)
- memcpy(optbuf, sk->opt, sizeof(struct ip_options)+sk->opt->optlen);
+ if (sk->protinfo.af_inet.opt)
+ memcpy(optbuf, sk->protinfo.af_inet.opt,
+ sizeof(struct ip_options)+
+ sk->protinfo.af_inet.opt->optlen);
release_sock(sk);
+
if (opt->optlen == 0)
return put_user(0, optlen);
@@ -660,66 +663,113 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *op
return 0;
}
case IP_PKTINFO:
- val = (sk->ip_cmsg_flags & IP_CMSG_PKTINFO) != 0;
+ val = (sk->protinfo.af_inet.cmsg_flags & IP_CMSG_PKTINFO) != 0;
break;
case IP_RECVTTL:
- val = (sk->ip_cmsg_flags & IP_CMSG_TTL) != 0;
+ val = (sk->protinfo.af_inet.cmsg_flags & IP_CMSG_TTL) != 0;
break;
case IP_RECVTOS:
- val = (sk->ip_cmsg_flags & IP_CMSG_TOS) != 0;
+ val = (sk->protinfo.af_inet.cmsg_flags & IP_CMSG_TOS) != 0;
break;
case IP_RECVOPTS:
- val = (sk->ip_cmsg_flags & IP_CMSG_RECVOPTS) != 0;
+ val = (sk->protinfo.af_inet.cmsg_flags & IP_CMSG_RECVOPTS) != 0;
break;
case IP_RETOPTS:
- val = (sk->ip_cmsg_flags & IP_CMSG_RETOPTS) != 0;
+ val = (sk->protinfo.af_inet.cmsg_flags & IP_CMSG_RETOPTS) != 0;
break;
case IP_TOS:
- val=sk->ip_tos;
+ val=sk->protinfo.af_inet.tos;
break;
case IP_TTL:
- val=sk->ip_ttl;
+ val=sk->protinfo.af_inet.ttl;
break;
case IP_HDRINCL:
- val=sk->ip_hdrincl;
+ val=sk->protinfo.af_inet.hdrincl;
break;
case IP_MTU_DISCOVER:
- val=sk->ip_pmtudisc;
+ val=sk->protinfo.af_inet.pmtudisc;
break;
case IP_MTU:
- val = 0;
- lock_sock(sk);
- if (sk->dst_cache)
- val = sk->dst_cache->pmtu;
- release_sock(sk);
- if (!val)
+ {
+ struct dst_entry *dst;
+ val = 0;
+ dst = sk_dst_get(sk);
+ if (dst) {
+ val = dst->pmtu;
+ dst_release(dst);
+ }
+ if (!val) {
+ release_sock(sk);
return -ENOTCONN;
+ }
break;
+ }
case IP_RECVERR:
- val=sk->ip_recverr;
+ val=sk->protinfo.af_inet.recverr;
break;
case IP_MULTICAST_TTL:
- val=sk->ip_mc_ttl;
+ val=sk->protinfo.af_inet.mc_ttl;
break;
case IP_MULTICAST_LOOP:
- val=sk->ip_mc_loop;
+ val=sk->protinfo.af_inet.mc_loop;
break;
case IP_MULTICAST_IF:
{
struct ip_mreqn mreq;
len = min(len,sizeof(struct ip_mreqn));
+ mreq.imr_ifindex = sk->protinfo.af_inet.mc_index;
+ mreq.imr_address.s_addr = sk->protinfo.af_inet.mc_addr;
+ mreq.imr_multiaddr.s_addr = 0;
+ release_sock(sk);
+
if(put_user(len, optlen))
return -EFAULT;
- mreq.imr_ifindex = sk->ip_mc_index;
- mreq.imr_address.s_addr = sk->ip_mc_addr;
- mreq.imr_multiaddr.s_addr = 0;
if(copy_to_user((void *)optval, &mreq, len))
return -EFAULT;
return 0;
}
+ case IP_PKTOPTIONS:
+ {
+ struct msghdr msg;
+
+ release_sock(sk);
+
+ if (sk->type != SOCK_STREAM)
+ return -ENOPROTOOPT;
+
+ msg.msg_control = optval;
+ msg.msg_controllen = len;
+ msg.msg_flags = 0;
+
+ if (sk->protinfo.af_inet.cmsg_flags&IP_CMSG_PKTINFO) {
+ struct in_pktinfo info;
+
+ info.ipi_addr.s_addr = sk->rcv_saddr;
+ info.ipi_spec_dst.s_addr = sk->rcv_saddr;
+ info.ipi_ifindex = sk->protinfo.af_inet.mc_index;
+ put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
+ }
+ if (sk->protinfo.af_inet.cmsg_flags&IP_CMSG_TTL) {
+ int hlim = sk->protinfo.af_inet.mc_ttl;
+ put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
+ }
+ len -= msg.msg_controllen;
+ return put_user(len, optlen);
+ }
default:
- return(-ENOPROTOOPT);
+#ifdef CONFIG_NETFILTER
+ val = nf_getsockopt(sk, PF_INET, optname, optval,
+ &len);
+ release_sock(sk);
+ if (val >= 0)
+ val = put_user(len, optlen);
+ return val;
+#else
+ release_sock(sk);
+ return -ENOPROTOOPT;
+#endif
}
+ release_sock(sk);
if (len < sizeof(int) && len > 0 && val>=0 && val<255) {
unsigned char ucval = (unsigned char)val;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 15cdf7a6e..37b41e93a 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1,5 +1,5 @@
/*
- * $Id: ipconfig.c,v 1.22 1999/06/09 10:10:57 davem Exp $
+ * $Id: ipconfig.c,v 1.24 1999/08/20 00:35:14 davem Exp $
*
* Automatic Configuration of IP -- use BOOTP or RARP or user-supplied
* information to configure own IP address and routes.
@@ -12,6 +12,10 @@
* BOOTP rewritten to construct and analyse packets itself instead
* of misusing the IP layer. num_bugs_causing_wrong_arp_replies--;
* -- MJ, December 1998
+ *
+ * Fixed ip_auto_config_setup calling at startup in the new "Linker Magic"
+ * initialization scheme.
+ * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>, 08/11/1999
*/
#include <linux/config.h>
@@ -97,18 +101,18 @@ static int ic_proto_have_if __initdata = 0;
struct ic_device {
struct ic_device *next;
- struct device *dev;
+ struct net_device *dev;
unsigned short flags;
int able;
};
static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */
-static struct device *ic_dev __initdata = NULL; /* Selected device */
+static struct net_device *ic_dev __initdata = NULL; /* Selected device */
static int __init ic_open_devs(void)
{
struct ic_device *d, **last;
- struct device *dev;
+ struct net_device *dev;
unsigned short oflags;
last = &ic_first_dev;
@@ -161,7 +165,7 @@ static int __init ic_open_devs(void)
static void __init ic_close_devs(void)
{
struct ic_device *d, *next;
- struct device *dev;
+ struct net_device *dev;
next = ic_first_dev;
while ((d = next)) {
@@ -305,7 +309,7 @@ static int __init ic_defaults(void)
#ifdef CONFIG_IP_PNP_RARP
-static int ic_rarp_recv(struct sk_buff *skb, struct device *dev, struct packet_type *pt);
+static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt);
static struct packet_type rarp_packet_type __initdata = {
__constant_htons(ETH_P_RARP),
@@ -329,7 +333,7 @@ static inline void ic_rarp_cleanup(void)
* Process received RARP packet.
*/
static int __init
-ic_rarp_recv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
+ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
{
struct arphdr *rarp = (struct arphdr *)skb->h.raw;
unsigned char *rarp_ptr = (unsigned char *) (rarp + 1);
@@ -394,7 +398,7 @@ static void __init ic_rarp_send(void)
for (d=ic_first_dev; d; d=d->next)
if (d->able & IC_RARP) {
- struct device *dev = d->dev;
+ struct net_device *dev = d->dev;
arp_send(ARPOP_RREQUEST, ETH_P_RARP, 0, dev, 0, NULL,
dev->dev_addr, dev->dev_addr);
}
@@ -433,7 +437,7 @@ struct bootp_pkt { /* BOOTP packet format */
static u32 ic_bootp_xid;
-static int ic_bootp_recv(struct sk_buff *skb, struct device *dev, struct packet_type *pt);
+static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt);
static struct packet_type bootp_packet_type __initdata = {
__constant_htons(ETH_P_IP),
@@ -497,7 +501,7 @@ static inline void ic_bootp_cleanup(void)
*/
static void __init ic_bootp_send_if(struct ic_device *d, u32 jiffies)
{
- struct device *dev = d->dev;
+ struct net_device *dev = d->dev;
struct sk_buff *skb;
struct bootp_pkt *b;
int hh_len = (dev->hard_header_len + 15) & ~15;
@@ -616,7 +620,7 @@ static void __init ic_do_bootp_ext(u8 *ext)
/*
* Receive BOOTP reply.
*/
-static int __init ic_bootp_recv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
+static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
{
struct bootp_pkt *b = (struct bootp_pkt *) skb->nh.iph;
struct iphdr *h = &b->iph;
@@ -912,7 +916,7 @@ static int __init ic_proto_name(char *name)
return 0;
}
-void __init ip_auto_config_setup(char *addrs, int *ints)
+static int __init ip_auto_config_setup(char *addrs)
{
char *cp, *ip, *dp;
int num = 0;
@@ -920,10 +924,10 @@ void __init ip_auto_config_setup(char *addrs, int *ints)
ic_set_manually = 1;
if (!strcmp(addrs, "off")) {
ic_enable = 0;
- return;
+ return 1;
}
if (ic_proto_name(addrs))
- return;
+ return 1;
/* Parse the whole string */
ip = addrs;
@@ -971,4 +975,14 @@ void __init ip_auto_config_setup(char *addrs, int *ints)
ip = cp;
num++;
}
+
+ return 0;
}
+
+static int __init nfsaddrs_config_setup(char *addrs)
+{
+ return ip_auto_config_setup(addrs);
+}
+
+__setup("ip=", ip_auto_config_setup);
+__setup("nfsaddrs=", nfsaddrs_config_setup);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 0aeef4a31..0a5ae3cfe 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -1,7 +1,7 @@
/*
* Linux NET3: IP/IP protocol decoder.
*
- * Version: $Id: ipip.c,v 1.26 1999/03/25 10:04:32 davem Exp $
+ * Version: $Id: ipip.c,v 1.29 1999/08/31 07:03:42 davem Exp $
*
* Authors:
* Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
@@ -116,10 +116,10 @@
#define HASH_SIZE 16
#define HASH(addr) ((addr^(addr>>4))&0xF)
-static int ipip_fb_tunnel_init(struct device *dev);
-static int ipip_tunnel_init(struct device *dev);
+static int ipip_fb_tunnel_init(struct net_device *dev);
+static int ipip_tunnel_init(struct net_device *dev);
-static struct device ipip_fb_tunnel_dev = {
+static struct net_device ipip_fb_tunnel_dev = {
NULL, 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipip_fb_tunnel_init,
};
@@ -133,6 +133,8 @@ static struct ip_tunnel *tunnels_l[HASH_SIZE];
static struct ip_tunnel *tunnels_wc[1];
static struct ip_tunnel **tunnels[4] = { tunnels_wc, tunnels_l, tunnels_r, tunnels_r_l };
+static rwlock_t ipip_lock = RW_LOCK_UNLOCKED;
+
static struct ip_tunnel * ipip_tunnel_lookup(u32 remote, u32 local)
{
unsigned h0 = HASH(remote);
@@ -182,8 +184,9 @@ static void ipip_tunnel_unlink(struct ip_tunnel *t)
for (tp = ipip_bucket(t); *tp; tp = &(*tp)->next) {
if (t == *tp) {
+ write_lock_bh(&ipip_lock);
*tp = t->next;
- synchronize_bh();
+ write_unlock_bh(&ipip_lock);
break;
}
}
@@ -194,8 +197,9 @@ static void ipip_tunnel_link(struct ip_tunnel *t)
struct ip_tunnel **tp = ipip_bucket(t);
t->next = *tp;
- wmb();
+ write_lock_bh(&ipip_lock);
*tp = t;
+ write_unlock_bh(&ipip_lock);
}
struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create)
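These two hunks replace memory-barrier tricks (wmb(), synchronize_bh()) with an explicit reader/writer lock: writers take write_lock_bh() around the pointer updates, and the packet path (see ipip_rcv()/ipip_err() further down) takes the read side. A condensed sketch of the idiom on a hypothetical list; unlike the patch, the sketch holds the lock across the whole unlink traversal, which is the more defensive shape:

    static rwlock_t ex_lock = RW_LOCK_UNLOCKED;
    static struct item { struct item *next; } *ex_list;

    static void ex_link(struct item *it)
    {
            write_lock_bh(&ex_lock);
            it->next = ex_list;
            ex_list = it;
            write_unlock_bh(&ex_lock);
    }

    static void ex_unlink(struct item *it)
    {
            struct item **p;

            write_lock_bh(&ex_lock);
            for (p = &ex_list; *p; p = &(*p)->next)
                    if (*p == it) {
                            *p = it->next;
                            break;
                    }
            write_unlock_bh(&ex_lock);
    }
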
@@ -203,7 +207,7 @@ struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create)
u32 remote = parms->iph.daddr;
u32 local = parms->iph.saddr;
struct ip_tunnel *t, **tp, *nt;
- struct device *dev;
+ struct net_device *dev;
unsigned h = 0;
int prio = 0;
@@ -234,12 +238,13 @@ struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create)
nt->dev = dev;
dev->name = nt->parms.name;
dev->init = ipip_tunnel_init;
+ dev->new_style = 1;
memcpy(&nt->parms, parms, sizeof(*parms));
if (dev->name[0] == 0) {
int i;
for (i=1; i<100; i++) {
sprintf(dev->name, "tunl%d", i);
- if (dev_get(dev->name) == NULL)
+ if (__dev_get_by_name(dev->name) == NULL)
break;
}
if (i==100)
@@ -249,6 +254,7 @@ struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create)
if (register_netdevice(dev) < 0)
goto failed;
+ dev_hold(dev);
ipip_tunnel_link(nt);
/* Do not decrement MOD_USE_COUNT here. */
return nt;
@@ -259,16 +265,23 @@ failed:
return NULL;
}
+static void ipip_tunnel_destructor(struct net_device *dev)
+{
+ if (dev != &ipip_fb_tunnel_dev) {
+ MOD_DEC_USE_COUNT;
+ }
+}
-static void ipip_tunnel_destroy(struct device *dev)
+static void ipip_tunnel_uninit(struct net_device *dev)
{
if (dev == &ipip_fb_tunnel_dev) {
+ write_lock_bh(&ipip_lock);
tunnels_wc[0] = NULL;
- synchronize_bh();
+ write_unlock_bh(&ipip_lock);
+ dev_put(dev);
} else {
ipip_tunnel_unlink((struct ip_tunnel*)dev->priv);
- kfree(dev);
- MOD_DEC_USE_COUNT;
+ dev_put(dev);
}
}
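The old single destroy callback splits in two: uninit (unlink the device and drop the table's reference) and destructor (module accounting), with the device itself now reference counted. A fragment sketching the lifetime rules this hunk and the dev_hold() additions below imply; example_attach, example_uninit and the table helpers are hypothetical names:

    static int example_attach(struct net_device *dev)
    {
            dev->new_style = 1;     /* the core, not the driver, frees dev */
            if (register_netdevice(dev) < 0)
                    return -1;
            dev_hold(dev);          /* reference owned by our lookup table */
            example_table_link(dev);
            return 0;
    }

    static void example_uninit(struct net_device *dev)
    {
            example_table_unlink(dev);
            dev_put(dev);           /* the table's reference dies here */
    }
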
@@ -316,17 +329,20 @@ void ipip_err(struct sk_buff *skb, unsigned char *dp, int len)
break;
}
+ read_lock(&ipip_lock);
t = ipip_tunnel_lookup(iph->daddr, iph->saddr);
if (t == NULL || t->parms.iph.daddr == 0)
- return;
+ goto out;
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
- return;
+ goto out;
if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
t->err_count++;
else
t->err_count = 1;
t->err_time = jiffies;
+out:
+ read_unlock(&ipip_lock);
return;
#else
struct iphdr *iph = (struct iphdr*)dp;
@@ -460,6 +476,7 @@ int ipip_rcv(struct sk_buff *skb, unsigned short len)
skb->ip_summed = 0;
skb->pkt_type = PACKET_HOST;
+ read_lock(&ipip_lock);
if ((tunnel = ipip_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) {
tunnel->stat.rx_packets++;
tunnel->stat.rx_bytes += skb->len;
@@ -467,8 +484,10 @@ int ipip_rcv(struct sk_buff *skb, unsigned short len)
dst_release(skb->dst);
skb->dst = NULL;
netif_rx(skb);
+ read_unlock(&ipip_lock);
return 0;
}
+ read_unlock(&ipip_lock);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);
kfree_skb(skb);
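Note where the read_unlock() lands in both receive-side hunks: the tunnel entry is only pinned while ipip_lock is read-held, so the lock must cover every dereference of the looked-up entry, not just the lookup itself. Reduced to its skeleton (illustrative fragment, not the full handler):

    read_lock(&ipip_lock);
    t = ipip_tunnel_lookup(iph->saddr, iph->daddr);
    if (t != NULL) {
            t->stat.rx_packets++;   /* still under the read lock */
            netif_rx(skb);
            read_unlock(&ipip_lock);
            return 0;
    }
    read_unlock(&ipip_lock);
    /* ... no-tunnel error handling ... */
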
@@ -480,7 +499,7 @@ int ipip_rcv(struct sk_buff *skb, unsigned short len)
* and that skb is filled properly by that function.
*/
-static int ipip_tunnel_xmit(struct sk_buff *skb, struct device *dev)
+static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
struct net_device_stats *stats = &tunnel->stat;
@@ -488,7 +507,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct device *dev)
u8 tos = tunnel->parms.iph.tos;
u16 df = tiph->frag_off;
struct rtable *rt; /* Route to the other host */
- struct device *tdev; /* Device to other host */
+ struct net_device *tdev; /* Device to other host */
struct iphdr *old_iph = skb->nh.iph;
struct iphdr *iph; /* Our new IP header */
int max_headroom; /* The extra header space needed */
@@ -616,7 +635,7 @@ tx_error:
}
static int
-ipip_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd)
+ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
int err = 0;
struct ip_tunnel_parm p;
@@ -674,14 +693,12 @@ ipip_tunnel_ioctl (struct device *dev, struct ifreq *ifr, int cmd)
break;
}
t = (struct ip_tunnel*)dev->priv;
- start_bh_atomic();
ipip_tunnel_unlink(t);
t->parms.iph.saddr = p.iph.saddr;
t->parms.iph.daddr = p.iph.daddr;
memcpy(dev->dev_addr, &p.iph.saddr, 4);
memcpy(dev->broadcast, &p.iph.daddr, 4);
ipip_tunnel_link(t);
- end_bh_atomic();
netdev_state_change(dev);
}
}
@@ -727,12 +744,12 @@ done:
return err;
}
-static struct net_device_stats *ipip_tunnel_get_stats(struct device *dev)
+static struct net_device_stats *ipip_tunnel_get_stats(struct net_device *dev)
{
return &(((struct ip_tunnel*)dev->priv)->stat);
}
-static int ipip_tunnel_change_mtu(struct device *dev, int new_mtu)
+static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr))
return -EINVAL;
@@ -740,11 +757,12 @@ static int ipip_tunnel_change_mtu(struct device *dev, int new_mtu)
return 0;
}
-static void ipip_tunnel_init_gen(struct device *dev)
+static void ipip_tunnel_init_gen(struct net_device *dev)
{
struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
- dev->destructor = ipip_tunnel_destroy;
+ dev->uninit = ipip_tunnel_uninit;
+ dev->destructor = ipip_tunnel_destructor;
dev->hard_start_xmit = ipip_tunnel_xmit;
dev->get_stats = ipip_tunnel_get_stats;
dev->do_ioctl = ipip_tunnel_ioctl;
@@ -762,9 +780,9 @@ static void ipip_tunnel_init_gen(struct device *dev)
memcpy(dev->broadcast, &t->parms.iph.daddr, 4);
}
-static int ipip_tunnel_init(struct device *dev)
+static int ipip_tunnel_init(struct net_device *dev)
{
- struct device *tdev = NULL;
+ struct net_device *tdev = NULL;
struct ip_tunnel *tunnel;
struct iphdr *iph;
@@ -783,7 +801,7 @@ static int ipip_tunnel_init(struct device *dev)
}
if (!tdev && tunnel->parms.link)
- tdev = dev_get_by_index(tunnel->parms.link);
+ tdev = __dev_get_by_index(tunnel->parms.link);
if (tdev) {
dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
@@ -795,20 +813,20 @@ static int ipip_tunnel_init(struct device *dev)
}
#ifdef MODULE
-static int ipip_fb_tunnel_open(struct device *dev)
+static int ipip_fb_tunnel_open(struct net_device *dev)
{
MOD_INC_USE_COUNT;
return 0;
}
-static int ipip_fb_tunnel_close(struct device *dev)
+static int ipip_fb_tunnel_close(struct net_device *dev)
{
MOD_DEC_USE_COUNT;
return 0;
}
#endif
-__initfunc(int ipip_fb_tunnel_init(struct device *dev))
+int __init ipip_fb_tunnel_init(struct net_device *dev)
{
struct iphdr *iph;
@@ -823,6 +841,7 @@ __initfunc(int ipip_fb_tunnel_init(struct device *dev))
iph->protocol = IPPROTO_IPIP;
iph->ihl = 5;
+ dev_hold(dev);
tunnels_wc[0] = &ipip_fb_tunnel;
return 0;
}
@@ -840,7 +859,7 @@ static struct inet_protocol ipip_protocol = {
#ifdef MODULE
int init_module(void)
#else
-__initfunc(int ipip_init(void))
+int __init ipip_init(void)
#endif
{
printk(KERN_INFO "IPv4 over IPv4 tunneling driver\n");
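The __initfunc() conversions here and in ipmr.c below are purely mechanical: the old macro wrapped the entire declaration, while the new form is an attribute placed after the return type. The same (hypothetical) function in the two spellings, one or the other, not both:

    __initfunc(int example_init(void))      /* old 2.2-era macro form */
    {
            return 0;
    }

    int __init example_init(void)           /* new attribute form */
    {
            return 0;
    }

Either way the function lands in the discardable init section, so its memory can be reclaimed once boot is complete.
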
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 1034e0e7a..6bb331fcd 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -9,7 +9,7 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Version: $Id: ipmr.c,v 1.43 1999/06/09 10:10:59 davem Exp $
+ * Version: $Id: ipmr.c,v 1.46 1999/08/31 07:03:44 davem Exp $
*
* Fixes:
* Michael Chastain : Incorrect size of copying.
@@ -57,8 +57,7 @@
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
-#include <linux/ip_fw.h>
-#include <linux/firewall.h>
+#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>
@@ -66,17 +65,44 @@
#define CONFIG_IP_PIMSM 1
#endif
+static struct sock *mroute_socket;
+
+
+/* Big lock, protecting the vif table, mrt cache and mroute socket state.
+   Note that updates are serialized via rtnl_lock (the RTNL semaphore).
+ */
+
+static rwlock_t mrt_lock = RW_LOCK_UNLOCKED;
+
/*
* Multicast router control variables
*/
static struct vif_device vif_table[MAXVIFS]; /* Devices */
-static unsigned long vifc_map; /* Active device map */
static int maxvif;
+
+#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
+
int mroute_do_assert = 0; /* Set in PIM assert */
int mroute_do_pim = 0;
+
static struct mfc_cache *mfc_cache_array[MFC_LINES]; /* Forwarding cache */
-int cache_resolve_queue_len = 0; /* Size of unresolved */
+
+static struct mfc_cache *mfc_unres_queue; /* Queue of unresolved entries */
+atomic_t cache_resolve_queue_len; /* Size of unresolved */
+
+/* Special spinlock for queue of unresolved entries */
+static spinlock_t mfc_unres_lock = SPIN_LOCK_UNLOCKED;
+
+/* We return to Alan's original scheme. The hash table of resolved
+   entries is changed only in process context and is protected
+   by the weak lock mrt_lock. The queue of unresolved entries is
+   protected by the strong spinlock mfc_unres_lock.
+
+   As a result, the data path is entirely free of exclusive locks.
+ */
+
+kmem_cache_t *mrt_cachep;
static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
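The comment above fixes the locking hierarchy for the rest of this file. Condensed into the three access patterns the rewritten functions below keep repeating (illustrative fragments, not complete functions):

    /* 1. Data path, softirq context: read side only */
    read_lock(&mrt_lock);
    c = ipmr_cache_find(saddr, daddr);
    /* ... forward using c ... */
    read_unlock(&mrt_lock);

    /* 2. Configuration, process context, already under rtnl_lock():
          short write-locked sections around the pointer updates only */
    write_lock_bh(&mrt_lock);
    /* relink a hash chain / vif_table entry */
    write_unlock_bh(&mrt_lock);

    /* 3. Unresolved queue, touched from both contexts */
    spin_lock_bh(&mfc_unres_lock);
    /* enqueue or reap unresolved entries */
    spin_unlock_bh(&mfc_unres_lock);
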
@@ -84,13 +110,16 @@ static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtm
extern struct inet_protocol pim_protocol;
+static struct timer_list ipmr_expire_timer;
+
+/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
+
static
-struct device *ipmr_new_tunnel(struct vifctl *v)
+struct net_device *ipmr_new_tunnel(struct vifctl *v)
{
- struct device *dev = NULL;
+ struct net_device *dev;
- rtnl_lock();
- dev = dev_get("tunl0");
+ dev = __dev_get_by_name("tunl0");
if (dev) {
int err;
@@ -112,10 +141,12 @@ struct device *ipmr_new_tunnel(struct vifctl *v)
err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
set_fs(oldfs);
- if (err == 0 && (dev = dev_get(p.name)) != NULL) {
+ dev = NULL;
+
+ if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
dev->flags |= IFF_MULTICAST;
- in_dev = dev->ip_ptr;
+ in_dev = __in_dev_get(dev);
if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
goto failure;
in_dev->cnf.rp_filter = 0;
@@ -124,38 +155,37 @@ struct device *ipmr_new_tunnel(struct vifctl *v)
goto failure;
}
}
- rtnl_unlock();
return dev;
failure:
unregister_netdevice(dev);
- rtnl_unlock();
return NULL;
}
#ifdef CONFIG_IP_PIMSM
static int reg_vif_num = -1;
-static struct device * reg_dev;
-static int reg_vif_xmit(struct sk_buff *skb, struct device *dev)
+static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
+ read_lock(&mrt_lock);
((struct net_device_stats*)dev->priv)->tx_bytes += skb->len;
((struct net_device_stats*)dev->priv)->tx_packets++;
ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
+ read_unlock(&mrt_lock);
kfree_skb(skb);
return 0;
}
-static struct net_device_stats *reg_vif_get_stats(struct device *dev)
+static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
{
return (struct net_device_stats*)dev->priv;
}
static
-struct device *ipmr_reg_vif(struct vifctl *v)
+struct net_device *ipmr_reg_vif(struct vifctl *v)
{
- struct device *dev;
+ struct net_device *dev;
struct in_device *in_dev;
int size;
@@ -176,11 +206,9 @@ struct device *ipmr_reg_vif(struct vifctl *v)
dev->flags = IFF_NOARP;
dev->hard_start_xmit = reg_vif_xmit;
dev->get_stats = reg_vif_get_stats;
-
- rtnl_lock();
+ dev->new_style = 1;
if (register_netdevice(dev)) {
- rtnl_unlock();
kfree(dev);
return NULL;
}
@@ -194,14 +222,10 @@ struct device *ipmr_reg_vif(struct vifctl *v)
if (dev_open(dev))
goto failure;
- rtnl_unlock();
- reg_dev = dev;
return dev;
failure:
unregister_netdevice(dev);
- rtnl_unlock();
- kfree(dev);
return NULL;
}
#endif
@@ -213,219 +237,277 @@ failure:
static int vif_delete(int vifi)
{
struct vif_device *v;
- struct device *dev;
+ struct net_device *dev;
struct in_device *in_dev;
-
- if (vifi < 0 || vifi >= maxvif || !(vifc_map&(1<<vifi)))
+
+ if (vifi < 0 || vifi >= maxvif)
return -EADDRNOTAVAIL;
v = &vif_table[vifi];
+ write_lock_bh(&mrt_lock);
dev = v->dev;
v->dev = NULL;
- vifc_map &= ~(1<<vifi);
-
- if ((in_dev = dev->ip_ptr) != NULL)
- in_dev->cnf.mc_forwarding = 0;
- dev_set_allmulti(dev, -1);
- ip_rt_multicast_event(in_dev);
+ if (!dev) {
+ write_unlock_bh(&mrt_lock);
+ return -EADDRNOTAVAIL;
+ }
- if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER)) {
#ifdef CONFIG_IP_PIMSM
- if (vifi == reg_vif_num) {
- reg_vif_num = -1;
- reg_dev = NULL;
- }
+ if (vifi == reg_vif_num)
+ reg_vif_num = -1;
#endif
- unregister_netdevice(dev);
- if (v->flags&VIFF_REGISTER)
- kfree(dev);
- }
if (vifi+1 == maxvif) {
int tmp;
for (tmp=vifi-1; tmp>=0; tmp--) {
- if (vifc_map&(1<<tmp))
+ if (VIF_EXISTS(tmp))
break;
}
maxvif = tmp+1;
}
+
+ write_unlock_bh(&mrt_lock);
+
+ dev_set_allmulti(dev, -1);
+
+ if ((in_dev = __in_dev_get(dev)) != NULL) {
+ in_dev->cnf.mc_forwarding--;
+ ip_rt_multicast_event(in_dev);
+ }
+
+ if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
+ unregister_netdevice(dev);
+
+ dev_put(dev);
return 0;
}
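The rewritten vif_delete() is a good template for lock-aware teardown: the entry is made invisible first (v->dev = NULL under write_lock_bh, so VIF_EXISTS() fails for readers), the lock is dropped, and only then does the slow work run. A sketch of that ordering, reduced from the function above:

    write_lock_bh(&mrt_lock);
    dev = v->dev;
    v->dev = NULL;                  /* readers stop seeing this vif */
    write_unlock_bh(&mrt_lock);

    if (dev) {
            dev_set_allmulti(dev, -1);      /* may take other locks */
            if (v->flags & (VIFF_TUNNEL|VIFF_REGISTER))
                    unregister_netdevice(dev);
            dev_put(dev);                   /* drop the table reference */
    }
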
-static void ipmr_update_threshoulds(struct mfc_cache *cache, unsigned char *ttls)
-{
- int vifi;
+/* Destroy an unresolved cache entry, killing queued skbs
+   and reporting an error to netlink readers.
+ */
- start_bh_atomic();
+static void ipmr_destroy_unres(struct mfc_cache *c)
+{
+ struct sk_buff *skb;
- cache->mfc_minvif = MAXVIFS;
- cache->mfc_maxvif = 0;
- memset(cache->mfc_ttls, 255, MAXVIFS);
+ atomic_dec(&cache_resolve_queue_len);
- for (vifi=0; vifi<maxvif; vifi++) {
- if (vifc_map&(1<<vifi) && ttls[vifi] && ttls[vifi] < 255) {
- cache->mfc_ttls[vifi] = ttls[vifi];
- if (cache->mfc_minvif > vifi)
- cache->mfc_minvif = vifi;
- if (cache->mfc_maxvif <= vifi)
- cache->mfc_maxvif = vifi + 1;
- }
+ while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
+#ifdef CONFIG_RTNETLINK
+ if (skb->nh.iph->version == 0) {
+ struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
+ nlh->nlmsg_type = NLMSG_ERROR;
+ nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
+ skb_trim(skb, nlh->nlmsg_len);
+ ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT;
+ netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
+ } else
+#endif
+ kfree_skb(skb);
}
- end_bh_atomic();
+
+ kmem_cache_free(mrt_cachep, c);
}
-/*
- * Delete a multicast route cache entry
- */
-
-static void ipmr_cache_delete(struct mfc_cache *cache)
+
+/* Single timer process for all the unresolved queue. */
+
+void ipmr_expire_process(unsigned long dummy)
{
- struct sk_buff *skb;
- int line;
- struct mfc_cache **cp;
-
- /*
- * Find the right cache line
- */
+ unsigned long now;
+ unsigned long expires;
+ struct mfc_cache *c, **cp;
- line=MFC_HASH(cache->mfc_mcastgrp,cache->mfc_origin);
- cp=&(mfc_cache_array[line]);
+ if (!spin_trylock(&mfc_unres_lock)) {
+ mod_timer(&ipmr_expire_timer, jiffies + HZ/10);
+ return;
+ }
- if(cache->mfc_flags&MFC_QUEUED)
- del_timer(&cache->mfc_timer);
-
- /*
- * Unlink the buffer
- */
+ if (atomic_read(&cache_resolve_queue_len) == 0)
+ goto out;
- while(*cp!=NULL)
- {
- if(*cp==cache)
- {
- *cp=cache->next;
- break;
+ now = jiffies;
+ expires = 10*HZ;
+ cp = &mfc_unres_queue;
+
+ while ((c=*cp) != NULL) {
+ long interval = c->mfc_un.unres.expires - now;
+
+ if (interval > 0) {
+ if (interval < expires)
+ expires = interval;
+ cp = &c->next;
+ continue;
}
- cp=&((*cp)->next);
+
+ *cp = c->next;
+
+ ipmr_destroy_unres(c);
}
- /*
- * Free the buffer. If it is a pending resolution
- * clean up the other resources.
- */
+ if (atomic_read(&cache_resolve_queue_len))
+ mod_timer(&ipmr_expire_timer, jiffies + expires);
- if(cache->mfc_flags&MFC_QUEUED)
- {
- cache_resolve_queue_len--;
- while((skb=skb_dequeue(&cache->mfc_unresolved))) {
-#ifdef CONFIG_RTNETLINK
- if (skb->nh.iph->version == 0) {
- struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
- nlh->nlmsg_type = NLMSG_ERROR;
- nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
- skb_trim(skb, nlh->nlmsg_len);
- ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT;
- netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
- } else
-#endif
- kfree_skb(skb);
+out:
+ spin_unlock(&mfc_unres_lock);
+}
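ipmr_expire_process() shows the polite way for a timer to handle lock contention: spin_trylock(), and on failure re-arm itself 100 ms out rather than spin in timer context. The skeleton, with hypothetical names:

    static spinlock_t ex_lock = SPIN_LOCK_UNLOCKED;
    static struct timer_list ex_timer;

    static void example_expire(unsigned long dummy)
    {
            if (!spin_trylock(&ex_lock)) {
                    /* contended: retry shortly instead of spinning */
                    mod_timer(&ex_timer, jiffies + HZ/10);
                    return;
            }
            /* ... reap expired entries, compute the next deadline,
               and mod_timer() again if anything is still queued ... */
            spin_unlock(&ex_lock);
    }
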
+
+/* Fill oifs list. It is called under write locked mrt_lock. */
+
+static void ipmr_update_threshoulds(struct mfc_cache *cache, unsigned char *ttls)
+{
+ int vifi;
+
+ cache->mfc_un.res.minvif = MAXVIFS;
+ cache->mfc_un.res.maxvif = 0;
+ memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
+
+ for (vifi=0; vifi<maxvif; vifi++) {
+ if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
+ cache->mfc_un.res.ttls[vifi] = ttls[vifi];
+ if (cache->mfc_un.res.minvif > vifi)
+ cache->mfc_un.res.minvif = vifi;
+ if (cache->mfc_un.res.maxvif <= vifi)
+ cache->mfc_un.res.maxvif = vifi + 1;
}
}
- kfree_s(cache,sizeof(cache));
}
-/*
- * Cache expiry timer
- */
-
-static void ipmr_cache_timer(unsigned long data)
+static int vif_add(struct vifctl *vifc, int mrtsock)
{
- struct mfc_cache *cache=(struct mfc_cache *)data;
- ipmr_cache_delete(cache);
-}
+ int vifi = vifc->vifc_vifi;
+ struct vif_device *v = &vif_table[vifi];
+ struct net_device *dev;
+ struct in_device *in_dev;
-/*
- * Insert a multicast cache entry
- */
+ /* Is vif busy ? */
+ if (VIF_EXISTS(vifi))
+ return -EADDRINUSE;
-static void ipmr_cache_insert(struct mfc_cache *c)
-{
- int line=MFC_HASH(c->mfc_mcastgrp,c->mfc_origin);
- c->next=mfc_cache_array[line];
- mfc_cache_array[line]=c;
+ switch (vifc->vifc_flags) {
+#ifdef CONFIG_IP_PIMSM
+ case VIFF_REGISTER:
+ /*
+ * Special Purpose VIF in PIM
+ * All the packets will be sent to the daemon
+ */
+ if (reg_vif_num >= 0)
+ return -EADDRINUSE;
+ dev = ipmr_reg_vif(vifc);
+ if (!dev)
+ return -ENOBUFS;
+ break;
+#endif
+ case VIFF_TUNNEL:
+ dev = ipmr_new_tunnel(vifc);
+ if (!dev)
+ return -ENOBUFS;
+ break;
+ case 0:
+ dev=ip_dev_find(vifc->vifc_lcl_addr.s_addr);
+ if (!dev)
+ return -EADDRNOTAVAIL;
+ __dev_put(dev);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if ((in_dev = __in_dev_get(dev)) == NULL)
+ return -EADDRNOTAVAIL;
+ in_dev->cnf.mc_forwarding++;
+ dev_set_allmulti(dev, +1);
+ ip_rt_multicast_event(in_dev);
+
+ /*
+ * Fill in the VIF structures
+ */
+ v->rate_limit=vifc->vifc_rate_limit;
+ v->local=vifc->vifc_lcl_addr.s_addr;
+ v->remote=vifc->vifc_rmt_addr.s_addr;
+ v->flags=vifc->vifc_flags;
+ if (!mrtsock)
+ v->flags |= VIFF_STATIC;
+ v->threshold=vifc->vifc_threshold;
+ v->bytes_in = 0;
+ v->bytes_out = 0;
+ v->pkt_in = 0;
+ v->pkt_out = 0;
+ v->link = dev->ifindex;
+ if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
+ v->link = dev->iflink;
+
+ /* And finish update writing critical data */
+ write_lock_bh(&mrt_lock);
+ dev_hold(dev);
+ v->dev=dev;
+#ifdef CONFIG_IP_PIMSM
+ if (v->flags&VIFF_REGISTER)
+ reg_vif_num = vifi;
+#endif
+ if (vifi+1 > maxvif)
+ maxvif = vifi+1;
+ write_unlock_bh(&mrt_lock);
+ return 0;
}
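vif_add() is the mirror image of vif_delete(): every plain field of the vif_device is filled in before the lock is taken, and the write-locked section does nothing but publish. Since VIF_EXISTS() keys on v->dev, readers either see a fully initialized entry or none at all. The publish step in isolation:

    /* all other v->... fields are already set up at this point */
    write_lock_bh(&mrt_lock);
    dev_hold(dev);
    v->dev = dev;                   /* VIF_EXISTS(vifi) is true from here */
    if (vifi + 1 > maxvif)
            maxvif = vifi + 1;
    write_unlock_bh(&mrt_lock);
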
-
-/*
- * Find a multicast cache entry
- */
-
-struct mfc_cache *ipmr_cache_find(__u32 origin, __u32 mcastgrp)
+
+static struct mfc_cache *ipmr_cache_find(__u32 origin, __u32 mcastgrp)
{
int line=MFC_HASH(mcastgrp,origin);
- struct mfc_cache *cache;
+ struct mfc_cache *c;
- cache=mfc_cache_array[line];
- while(cache!=NULL)
- {
- if(cache->mfc_origin==origin && cache->mfc_mcastgrp==mcastgrp)
- return cache;
- cache=cache->next;
+ for (c=mfc_cache_array[line]; c; c = c->next) {
+ if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
+ break;
}
- return NULL;
+ return c;
}
/*
* Allocate a multicast cache entry
*/
-
-static struct mfc_cache *ipmr_cache_alloc(int priority)
+static struct mfc_cache *ipmr_cache_alloc(void)
{
- struct mfc_cache *c=(struct mfc_cache *)kmalloc(sizeof(struct mfc_cache), priority);
+ struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_KERNEL);
if(c==NULL)
return NULL;
memset(c, 0, sizeof(*c));
- skb_queue_head_init(&c->mfc_unresolved);
- init_timer(&c->mfc_timer);
- c->mfc_timer.data=(long)c;
- c->mfc_timer.function=ipmr_cache_timer;
- c->mfc_minvif = MAXVIFS;
+ c->mfc_un.res.minvif = MAXVIFS;
return c;
}
-
+
+static struct mfc_cache *ipmr_cache_alloc_unres(void)
+{
+ struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_ATOMIC);
+ if(c==NULL)
+ return NULL;
+ memset(c, 0, sizeof(*c));
+ skb_queue_head_init(&c->mfc_un.unres.unresolved);
+ c->mfc_un.unres.expires = jiffies + 10*HZ;
+ return c;
+}
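The two allocators differ only in the GFP flag, which encodes who may call them: GFP_KERNEL may sleep and so belongs to the setsockopt path, GFP_ATOMIC must not and so belongs to packet reception:

    c = kmem_cache_alloc(mrt_cachep, GFP_KERNEL);  /* process context,
                                                      may sleep         */
    c = kmem_cache_alloc(mrt_cachep, GFP_ATOMIC);  /* softirq / packet
                                                      path, never sleeps */
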
+
/*
* A cache entry has gone into a resolved state from queued
*/
-static void ipmr_cache_resolve(struct mfc_cache *cache)
+static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
{
struct sk_buff *skb;
- start_bh_atomic();
-
- /*
- * Kill the queue entry timer.
- */
-
- del_timer(&cache->mfc_timer);
-
- if (cache->mfc_flags&MFC_QUEUED) {
- cache->mfc_flags&=~MFC_QUEUED;
- cache_resolve_queue_len--;
- }
-
- end_bh_atomic();
-
/*
* Play the pending entries through our router
*/
- while((skb=skb_dequeue(&cache->mfc_unresolved))) {
+
+ while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
#ifdef CONFIG_RTNETLINK
if (skb->nh.iph->version == 0) {
int err;
struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
- if (ipmr_fill_mroute(skb, cache, NLMSG_DATA(nlh)) > 0) {
+ if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
nlh->nlmsg_len = skb->tail - (u8*)nlh;
} else {
nlh->nlmsg_type = NLMSG_ERROR;
@@ -436,13 +518,15 @@ static void ipmr_cache_resolve(struct mfc_cache *cache)
err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
} else
#endif
- ip_mr_forward(skb, cache, 0);
+ ip_mr_forward(skb, c, 0);
}
}
/*
* Bounce a cache query up to mrouted. We could use netlink for this but mrouted
- * expects the following bizarre scheme..
+ * expects the following bizarre scheme.
+ *
+ * Called under mrt_lock.
*/
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
@@ -453,9 +537,6 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
struct igmpmsg *msg;
int ret;
- if (mroute_socket==NULL)
- return -EINVAL;
-
#ifdef CONFIG_IP_PIMSM
if (assert == IGMPMSG_WHOLEPKT)
skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
@@ -507,7 +588,12 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
skb->nh.iph->tot_len=htons(skb->len); /* Fix the length */
skb->h.raw = skb->nh.raw;
}
-
+
+ if (mroute_socket == NULL) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
/*
* Deliver to mrouted
*/
@@ -521,154 +607,237 @@ static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
}
/*
- * Queue a packet for resolution
+ * Queue a packet for resolution. The cache entry is found or created with mfc_unres_lock held!
*/
-static int ipmr_cache_unresolved(struct mfc_cache *cache, vifi_t vifi, struct sk_buff *skb)
+static int
+ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
{
- if(cache==NULL)
- {
+ int err;
+ struct mfc_cache *c;
+
+ spin_lock_bh(&mfc_unres_lock);
+ for (c=mfc_unres_queue; c; c=c->next) {
+ if (c->mfc_mcastgrp == skb->nh.iph->daddr &&
+ c->mfc_origin == skb->nh.iph->saddr)
+ break;
+ }
+
+ if (c == NULL) {
/*
* Create a new entry if allowable
*/
- if(cache_resolve_queue_len>=10 || (cache=ipmr_cache_alloc(GFP_ATOMIC))==NULL)
- {
+
+ if (atomic_read(&cache_resolve_queue_len)>=10 ||
+ (c=ipmr_cache_alloc_unres())==NULL) {
+ spin_unlock_bh(&mfc_unres_lock);
+
kfree_skb(skb);
return -ENOBUFS;
}
+
/*
* Fill in the new cache entry
*/
- cache->mfc_parent=ALL_VIFS;
- cache->mfc_origin=skb->nh.iph->saddr;
- cache->mfc_mcastgrp=skb->nh.iph->daddr;
- cache->mfc_flags=MFC_QUEUED;
- /*
- * Link to the unresolved list
- */
- ipmr_cache_insert(cache);
- cache_resolve_queue_len++;
- /*
- * Fire off the expiry timer
- */
- cache->mfc_timer.expires=jiffies+10*HZ;
- add_timer(&cache->mfc_timer);
+ c->mfc_parent=-1;
+ c->mfc_origin=skb->nh.iph->saddr;
+ c->mfc_mcastgrp=skb->nh.iph->daddr;
+
/*
* Reflect first query at mrouted.
*/
- if(mroute_socket)
- {
+ if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
/* If the report failed throw the cache entry
out - Brad Parker
-
- OK, OK, Brad. Only do not forget to free skb
- and return :-) --ANK
*/
- if (ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE)<0) {
- ipmr_cache_delete(cache);
- kfree_skb(skb);
- return -ENOBUFS;
- }
+ spin_unlock_bh(&mfc_unres_lock);
+
+ kmem_cache_free(mrt_cachep, c);
+ kfree_skb(skb);
+ return err;
}
+
+ atomic_inc(&cache_resolve_queue_len);
+ c->next = mfc_unres_queue;
+ mfc_unres_queue = c;
+
+ if (!del_timer(&ipmr_expire_timer))
+ ipmr_expire_timer.expires = c->mfc_un.unres.expires;
+ add_timer(&ipmr_expire_timer);
}
+
/*
* See if we can append the packet
*/
- if(cache->mfc_queuelen>3)
- {
+ if (c->mfc_un.unres.unresolved.qlen>3) {
kfree_skb(skb);
- return -ENOBUFS;
+ err = -ENOBUFS;
+ } else {
+ skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
+ err = 0;
}
- cache->mfc_queuelen++;
- skb_queue_tail(&cache->mfc_unresolved,skb);
- return 0;
+
+ spin_unlock_bh(&mfc_unres_lock);
+ return err;
}
/*
* MFC cache manipulation by user space mroute daemon
*/
-
-int ipmr_mfc_modify(int action, struct mfcctl *mfc)
+
+int ipmr_mfc_delete(struct mfcctl *mfc)
{
- struct mfc_cache *cache;
+ int line;
+ struct mfc_cache *c, **cp;
+
+ line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
+
+ for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
+ if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
+ c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
+ write_lock_bh(&mrt_lock);
+ *cp = c->next;
+ write_unlock_bh(&mrt_lock);
+
+ kmem_cache_free(mrt_cachep, c);
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
+{
+ int line;
+ struct mfc_cache *uc, *c, **cp;
+
+ line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
+
+ for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
+ if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
+ c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
+ break;
+ }
+
+ if (c != NULL) {
+ write_lock_bh(&mrt_lock);
+ c->mfc_parent = mfc->mfcc_parent;
+ ipmr_update_threshoulds(c, mfc->mfcc_ttls);
+ if (!mrtsock)
+ c->mfc_flags |= MFC_STATIC;
+ write_unlock_bh(&mrt_lock);
+ return 0;
+ }
if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
return -EINVAL;
- /*
- * Find the cache line
- */
-
- start_bh_atomic();
- cache=ipmr_cache_find(mfc->mfcc_origin.s_addr,mfc->mfcc_mcastgrp.s_addr);
-
+ c=ipmr_cache_alloc();
+ if (c==NULL)
+ return -ENOMEM;
+
+ c->mfc_origin=mfc->mfcc_origin.s_addr;
+ c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
+ c->mfc_parent=mfc->mfcc_parent;
+ ipmr_update_threshoulds(c, mfc->mfcc_ttls);
+ if (!mrtsock)
+ c->mfc_flags |= MFC_STATIC;
+
+ write_lock_bh(&mrt_lock);
+ c->next = mfc_cache_array[line];
+ mfc_cache_array[line] = c;
+ write_unlock_bh(&mrt_lock);
+
/*
- * Delete an entry
+ * Check to see if we resolved a queued list. If so we
+ * need to send on the frames and tidy up.
*/
- if(action==MRT_DEL_MFC)
- {
- if(cache)
- {
- ipmr_cache_delete(cache);
- end_bh_atomic();
- return 0;
+ spin_lock_bh(&mfc_unres_lock);
+ for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
+ cp = &uc->next) {
+ if (uc->mfc_origin == c->mfc_origin &&
+ uc->mfc_mcastgrp == c->mfc_mcastgrp) {
+ *cp = uc->next;
+ if (atomic_dec_and_test(&cache_resolve_queue_len))
+ del_timer(&ipmr_expire_timer);
+ break;
}
- end_bh_atomic();
- return -ENOENT;
}
- if(cache)
- {
+ spin_unlock_bh(&mfc_unres_lock);
- /*
- * Update the cache, see if it frees a pending queue
- */
+ if (uc) {
+ ipmr_cache_resolve(uc, c);
+ kmem_cache_free(mrt_cachep, uc);
+ }
+ return 0;
+}
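The ordering in ipmr_mfc_add() matters: the resolved entry is hashed in first, so a packet arriving during the handoff finds it immediately instead of queuing a fresh unresolved entry; only then is the matching unresolved entry detached under mfc_unres_lock, and its backlog replayed with no locks held. Reduced to the three steps:

    write_lock_bh(&mrt_lock);               /* 1. publish resolved   */
    c->next = mfc_cache_array[line];
    mfc_cache_array[line] = c;
    write_unlock_bh(&mrt_lock);

    spin_lock_bh(&mfc_unres_lock);          /* 2. detach unresolved  */
    /* unlink the matching uc from mfc_unres_queue, fix up the timer */
    spin_unlock_bh(&mfc_unres_lock);

    if (uc) {                               /* 3. replay, unlocked   */
            ipmr_cache_resolve(uc, c);
            kmem_cache_free(mrt_cachep, uc);
    }
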
- cache->mfc_flags|=MFC_RESOLVED;
- cache->mfc_parent=mfc->mfcc_parent;
- ipmr_update_threshoulds(cache, mfc->mfcc_ttls);
-
- /*
- * Check to see if we resolved a queued list. If so we
- * need to send on the frames and tidy up.
- */
-
- if(cache->mfc_flags&MFC_QUEUED)
- ipmr_cache_resolve(cache); /* Unhook & send the frames */
- end_bh_atomic();
- return 0;
+/*
+ * Close the multicast socket, and clear the vif tables etc
+ */
+
+static void mroute_clean_tables(struct sock *sk)
+{
+ int i;
+
+ /*
+ * Shut down all active vif entries
+ */
+ for(i=0; i<maxvif; i++) {
+ if (!(vif_table[i].flags&VIFF_STATIC))
+ vif_delete(i);
}
/*
- * Unsolicited update - that's ok, add anyway.
+ * Wipe the cache
*/
-
-
- cache=ipmr_cache_alloc(GFP_ATOMIC);
- if(cache==NULL)
- {
- end_bh_atomic();
- return -ENOMEM;
+ for (i=0;i<MFC_LINES;i++) {
+ struct mfc_cache *c, **cp;
+
+ cp = &mfc_cache_array[i];
+ while ((c = *cp) != NULL) {
+ if (c->mfc_flags&MFC_STATIC) {
+ cp = &c->next;
+ continue;
+ }
+ write_lock_bh(&mrt_lock);
+ *cp = c->next;
+ write_unlock_bh(&mrt_lock);
+
+ kmem_cache_free(mrt_cachep, c);
+ }
+ }
+
+ if (atomic_read(&cache_resolve_queue_len) != 0) {
+ struct mfc_cache *c;
+
+ spin_lock_bh(&mfc_unres_lock);
+ while (mfc_unres_queue != NULL) {
+ c = mfc_unres_queue;
+ mfc_unres_queue = c->next;
+ spin_unlock_bh(&mfc_unres_lock);
+
+ ipmr_destroy_unres(c);
+
+ spin_lock_bh(&mfc_unres_lock);
+ }
+ spin_unlock_bh(&mfc_unres_lock);
}
- cache->mfc_flags=MFC_RESOLVED;
- cache->mfc_origin=mfc->mfcc_origin.s_addr;
- cache->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
- cache->mfc_parent=mfc->mfcc_parent;
- ipmr_update_threshoulds(cache, mfc->mfcc_ttls);
- ipmr_cache_insert(cache);
- end_bh_atomic();
- return 0;
}
static void mrtsock_destruct(struct sock *sk)
{
+ rtnl_lock();
if (sk == mroute_socket) {
- ipv4_devconf.mc_forwarding = 0;
+ ipv4_devconf.mc_forwarding--;
+ write_lock_bh(&mrt_lock);
mroute_socket=NULL;
- synchronize_bh();
+ write_unlock_bh(&mrt_lock);
- mroute_close(sk);
+ mroute_clean_tables(sk);
}
+ rtnl_unlock();
}
/*
@@ -680,15 +849,16 @@ static void mrtsock_destruct(struct sock *sk)
int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen)
{
+ int ret;
struct vifctl vif;
struct mfcctl mfc;
if(optname!=MRT_INIT)
{
- if(sk!=mroute_socket)
+ if(sk!=mroute_socket && !capable(CAP_NET_ADMIN))
return -EACCES;
}
-
+
switch(optname)
{
case MRT_INIT:
@@ -696,22 +866,26 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen)
return -EOPNOTSUPP;
if(optlen!=sizeof(int))
return -ENOPROTOOPT;
- {
- int opt;
- if (get_user(opt,(int *)optval))
- return -EFAULT;
- if (opt != 1)
- return -ENOPROTOOPT;
- }
- if(mroute_socket)
+
+ rtnl_lock();
+ if (mroute_socket) {
+ rtnl_unlock();
return -EADDRINUSE;
- mroute_socket=sk;
- ipv4_devconf.mc_forwarding = 1;
- if (ip_ra_control(sk, 1, mrtsock_destruct) == 0)
- return 0;
- mrtsock_destruct(sk);
- return -EADDRINUSE;
+ }
+
+ ret = ip_ra_control(sk, 1, mrtsock_destruct);
+ if (ret == 0) {
+ write_lock_bh(&mrt_lock);
+ mroute_socket=sk;
+ write_unlock_bh(&mrt_lock);
+
+ ipv4_devconf.mc_forwarding++;
+ }
+ rtnl_unlock();
+ return ret;
case MRT_DONE:
+ if (sk!=mroute_socket)
+ return -EACCES;
return ip_ra_control(sk, 0, NULL);
case MRT_ADD_VIF:
case MRT_DEL_VIF:
@@ -721,88 +895,14 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen)
return -EFAULT;
if(vif.vifc_vifi >= MAXVIFS)
return -ENFILE;
- if(optname==MRT_ADD_VIF)
- {
- struct vif_device *v=&vif_table[vif.vifc_vifi];
- struct device *dev;
- struct in_device *in_dev;
-
- /* Is vif busy ? */
- if (vifc_map&(1<<vif.vifc_vifi))
- return -EADDRINUSE;
-
- switch (vif.vifc_flags) {
-#ifdef CONFIG_IP_PIMSM
- case VIFF_REGISTER:
-
- /*
- * Special Purpose VIF in PIM
- * All the packets will be sent to the daemon
- */
- if (reg_vif_num >= 0)
- return -EADDRINUSE;
- reg_vif_num = vif.vifc_vifi;
- dev = ipmr_reg_vif(&vif);
- if (!dev) {
- reg_vif_num = -1;
- return -ENOBUFS;
- }
- break;
-#endif
- case VIFF_TUNNEL:
- dev = ipmr_new_tunnel(&vif);
- if (!dev)
- return -ENOBUFS;
- break;
- case 0:
- dev=ip_dev_find(vif.vifc_lcl_addr.s_addr);
- if (!dev)
- return -EADDRNOTAVAIL;
- break;
- default:
-#if 0
- printk(KERN_DEBUG "ipmr_add_vif: flags %02x\n", vif.vifc_flags);
-#endif
- return -EINVAL;
- }
-
- if ((in_dev = dev->ip_ptr) == NULL)
- return -EADDRNOTAVAIL;
- if (in_dev->cnf.mc_forwarding)
- return -EADDRINUSE;
- in_dev->cnf.mc_forwarding = 1;
- dev_set_allmulti(dev, +1);
- ip_rt_multicast_event(in_dev);
-
- /*
- * Fill in the VIF structures
- */
- start_bh_atomic();
- v->rate_limit=vif.vifc_rate_limit;
- v->local=vif.vifc_lcl_addr.s_addr;
- v->remote=vif.vifc_rmt_addr.s_addr;
- v->flags=vif.vifc_flags;
- v->threshold=vif.vifc_threshold;
- v->dev=dev;
- v->bytes_in = 0;
- v->bytes_out = 0;
- v->pkt_in = 0;
- v->pkt_out = 0;
- v->link = dev->ifindex;
- if (vif.vifc_flags&(VIFF_TUNNEL|VIFF_REGISTER))
- v->link = dev->iflink;
- vifc_map|=(1<<vif.vifc_vifi);
- if (vif.vifc_vifi+1 > maxvif)
- maxvif = vif.vifc_vifi+1;
- end_bh_atomic();
- return 0;
+ rtnl_lock();
+ if (optname==MRT_ADD_VIF) {
+ ret = vif_add(&vif, sk==mroute_socket);
} else {
- int ret;
- rtnl_lock();
ret = vif_delete(vif.vifc_vifi);
- rtnl_unlock();
- return ret;
}
+ rtnl_unlock();
+ return ret;
/*
* Manipulate the forwarding caches. These live
@@ -814,7 +914,13 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen)
return -EINVAL;
if (copy_from_user(&mfc,optval, sizeof(mfc)))
return -EFAULT;
- return ipmr_mfc_modify(optname, &mfc);
+ rtnl_lock();
+ if (optname==MRT_DEL_MFC)
+ ret = ipmr_mfc_delete(&mfc);
+ else
+ ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
+ rtnl_unlock();
+ return ret;
/*
* Control PIM assert.
*/
@@ -833,6 +939,7 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen)
if(get_user(v,(int *)optval))
return -EFAULT;
v = (v)?1:0;
+ rtnl_lock();
if (v != mroute_do_pim) {
mroute_do_pim = v;
mroute_do_assert = v;
@@ -843,6 +950,7 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char *optval,int optlen)
inet_del_protocol(&pim_protocol);
#endif
}
+ rtnl_unlock();
return 0;
}
#endif
@@ -864,15 +972,13 @@ int ip_mroute_getsockopt(struct sock *sk,int optname,char *optval,int *optlen)
int olr;
int val;
- if(sk!=mroute_socket)
- return -EACCES;
if(optname!=MRT_VERSION &&
#ifdef CONFIG_IP_PIMSM
optname!=MRT_PIM &&
#endif
optname!=MRT_ASSERT)
return -ENOPROTOOPT;
-
+
if(get_user(olr, optlen))
return -EFAULT;
@@ -910,66 +1016,44 @@ int ipmr_ioctl(struct sock *sk, int cmd, unsigned long arg)
return -EFAULT;
if(vr.vifi>=maxvif)
return -EINVAL;
+ read_lock(&mrt_lock);
vif=&vif_table[vr.vifi];
- if(vifc_map&(1<<vr.vifi))
- {
+ if(VIF_EXISTS(vr.vifi)) {
vr.icount=vif->pkt_in;
vr.ocount=vif->pkt_out;
vr.ibytes=vif->bytes_in;
vr.obytes=vif->bytes_out;
+ read_unlock(&mrt_lock);
+
if (copy_to_user((void *)arg,&vr,sizeof(vr)))
return -EFAULT;
return 0;
}
+ read_unlock(&mrt_lock);
return -EADDRNOTAVAIL;
case SIOCGETSGCNT:
if (copy_from_user(&sr,(void *)arg,sizeof(sr)))
- return -EFAULT;
- for (c = mfc_cache_array[MFC_HASH(sr.grp.s_addr, sr.src.s_addr)];
- c; c = c->next) {
- if (sr.grp.s_addr == c->mfc_mcastgrp &&
- sr.src.s_addr == c->mfc_origin) {
- sr.pktcnt = c->mfc_pkt;
- sr.bytecnt = c->mfc_bytes;
- sr.wrong_if = c->mfc_wrong_if;
- if (copy_to_user((void *)arg,&sr,sizeof(sr)))
- return -EFAULT;
- return 0;
- }
+ return -EFAULT;
+
+ read_lock(&mrt_lock);
+ c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
+ if (c) {
+ sr.pktcnt = c->mfc_un.res.pkt;
+ sr.bytecnt = c->mfc_un.res.bytes;
+ sr.wrong_if = c->mfc_un.res.wrong_if;
+ read_unlock(&mrt_lock);
+
+ if (copy_to_user((void *)arg,&sr,sizeof(sr)))
+ return -EFAULT;
+ return 0;
}
+ read_unlock(&mrt_lock);
return -EADDRNOTAVAIL;
default:
return -ENOIOCTLCMD;
}
}
-/*
- * Close the multicast socket, and clear the vif tables etc
- */
-
-void mroute_close(struct sock *sk)
-{
- int i;
-
- /*
- * Shut down all active vif entries
- */
- rtnl_lock();
- for(i=0; i<maxvif; i++)
- vif_delete(i);
- rtnl_unlock();
-
- /*
- * Wipe the cache
- */
- for(i=0;i<MFC_LINES;i++)
- {
- start_bh_atomic();
- while(mfc_cache_array[i]!=NULL)
- ipmr_cache_delete(mfc_cache_array[i]);
- end_bh_atomic();
- }
-}
static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
@@ -978,10 +1062,9 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
if (event != NETDEV_UNREGISTER)
return NOTIFY_DONE;
v=&vif_table[0];
- for(ct=0;ct<maxvif;ct++) {
- if (vifc_map&(1<<ct) && v->dev==ptr)
+ for(ct=0;ct<maxvif;ct++,v++) {
+ if (v->dev==ptr)
vif_delete(ct);
- v++;
}
return NOTIFY_DONE;
}
@@ -1019,6 +1102,16 @@ static void ip_encap(struct sk_buff *skb, u32 saddr, u32 daddr)
skb->nh.iph = iph;
}
+static inline int ipmr_forward_finish(struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb->dst;
+
+ if (skb->len <= dst->pmtu)
+ return dst->output(skb);
+ else
+ return ip_fragment(skb, dst->output);
+}
+
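ipmr_forward_finish() exists to be the continuation ("okfn") of the NF_HOOK() call added at the bottom of ipmr_queue_xmit() below: the packet traverses the NF_IP_FORWARD netfilter chain and, if the verdict is accept, the continuation runs. Roughly:

    NF_HOOK(PF_INET, NF_IP_FORWARD, skb2, skb->dev, dev,
            ipmr_forward_finish);
    /* behaves, when no netfilter rule drops or steals the packet,
       like the direct call:
            ipmr_forward_finish(skb2);
     */
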
/*
* Processing handlers for ipmr_forward
*/
@@ -1028,11 +1121,14 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c,
{
struct iphdr *iph = skb->nh.iph;
struct vif_device *vif = &vif_table[vifi];
- struct device *dev;
+ struct net_device *dev;
struct rtable *rt;
int encap = 0;
struct sk_buff *skb2;
+ if (vif->dev == NULL)
+ return;
+
#ifdef CONFIG_IP_PIMSM
if (vif->flags & VIFF_REGISTER) {
vif->pkt_out++;
@@ -1090,34 +1186,17 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c,
iph = skb2->nh.iph;
ip_decrease_ttl(iph);
-#ifdef CONFIG_FIREWALL
- if (call_fw_firewall(PF_INET, vif->dev, skb2->nh.iph, NULL, &skb2) < FW_ACCEPT) {
- kfree_skb(skb2);
- return;
- }
- if (call_out_firewall(PF_INET, vif->dev, skb2->nh.iph, NULL, &skb2) < FW_ACCEPT) {
- kfree_skb(skb2);
- return;
- }
-#endif
+ /* FIXME: forward and output firewalls used to be called here.
+ * What do we do with netfilter? -- RR */
if (vif->flags & VIFF_TUNNEL) {
ip_encap(skb2, vif->local, vif->remote);
-#ifdef CONFIG_FIREWALL
- /* Double output firewalling on tunnels: one is on tunnel
- another one is on real device.
- */
- if (call_out_firewall(PF_INET, dev, skb2->nh.iph, NULL, &skb2) < FW_ACCEPT) {
- kfree_skb(skb2);
- return;
- }
-#endif
+ /* FIXME: extra output firewall step used to be here. --RR */
((struct ip_tunnel *)vif->dev->priv)->stat.tx_packets++;
((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb2->len;
}
IPCB(skb2)->flags |= IPSKB_FORWARDED;
-
/*
* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
* not only before forwarding, but after forwarding on all output
@@ -1129,20 +1208,18 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c,
* not mrouter) cannot join to more than one interface - it will
* result in receiving multiple packets.
*/
- if (skb2->len <= rt->u.dst.pmtu)
- skb2->dst->output(skb2);
- else
- ip_fragment(skb2, skb2->dst->output);
+ NF_HOOK(PF_INET, NF_IP_FORWARD, skb2, skb->dev, dev,
+ ipmr_forward_finish);
}
-int ipmr_find_vif(struct device *dev)
+int ipmr_find_vif(struct net_device *dev)
{
int ct;
- for (ct=0; ct<maxvif; ct++) {
- if (vifc_map&(1<<ct) && vif_table[ct].dev == dev)
- return ct;
+ for (ct=maxvif-1; ct>=0; ct--) {
+ if (vif_table[ct].dev == dev)
+ break;
}
- return ALL_VIFS;
+ return ct;
}
/* "local" means that we should preserve one skb (for local delivery) */
@@ -1153,8 +1230,8 @@ int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
int vif, ct;
vif = cache->mfc_parent;
- cache->mfc_pkt++;
- cache->mfc_bytes += skb->len;
+ cache->mfc_un.res.pkt++;
+ cache->mfc_un.res.bytes += skb->len;
/*
* Wrong interface: drop packet and (maybe) send PIM assert.
@@ -1177,18 +1254,18 @@ int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
goto dont_forward;
}
- cache->mfc_wrong_if++;
+ cache->mfc_un.res.wrong_if++;
true_vifi = ipmr_find_vif(skb->dev);
- if (true_vifi < MAXVIFS && mroute_do_assert &&
+ if (true_vifi >= 0 && mroute_do_assert &&
/* pimsm uses asserts, when switching from RPT to SPT,
so that we cannot check that packet arrived on an oif.
It is bad, but otherwise we would need to move pretty
large chunk of pimd to kernel. Ough... --ANK
*/
- (mroute_do_pim || cache->mfc_ttls[true_vifi] < 255) &&
- jiffies - cache->mfc_last_assert > MFC_ASSERT_THRESH) {
- cache->mfc_last_assert = jiffies;
+ (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
+ jiffies - cache->mfc_un.res.last_assert > MFC_ASSERT_THRESH) {
+ cache->mfc_un.res.last_assert = jiffies;
ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
}
goto dont_forward;
@@ -1200,8 +1277,8 @@ int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
/*
* Forward the frame
*/
- for (ct = cache->mfc_maxvif-1; ct >= cache->mfc_minvif; ct--) {
- if (skb->nh.iph->ttl > cache->mfc_ttls[ct]) {
+ for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
+ if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) {
if (psend != -1)
ipmr_queue_xmit(skb, cache, psend, 0);
psend=ct;
@@ -1236,48 +1313,61 @@ int ip_mr_input(struct sk_buff *skb)
if (IPCB(skb)->opt.router_alert) {
if (ip_call_ra_chain(skb))
return 0;
- } else if (skb->nh.iph->protocol == IPPROTO_IGMP && mroute_socket) {
+ } else if (skb->nh.iph->protocol == IPPROTO_IGMP){
/* IGMPv1 (and broken IGMPv2 implementations sort of
Cisco IOS <= 11.2(8)) do not put router alert
option to IGMP packets destined to routable
groups. It is very bad, because it means
that we can forward NO IGMP messages.
*/
- raw_rcv(mroute_socket, skb);
- return 0;
+ read_lock(&mrt_lock);
+ if (mroute_socket) {
+ raw_rcv(mroute_socket, skb);
+ read_unlock(&mrt_lock);
+ return 0;
+ }
+ read_unlock(&mrt_lock);
}
}
+ read_lock(&mrt_lock);
cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr);
/*
* No usable cache entry
*/
-
- if (cache==NULL || (cache->mfc_flags&MFC_QUEUED)) {
+ if (cache==NULL) {
int vif;
if (local) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
ip_local_deliver(skb);
- if (skb2 == NULL)
+ if (skb2 == NULL) {
+ read_unlock(&mrt_lock);
return -ENOBUFS;
+ }
skb = skb2;
}
vif = ipmr_find_vif(skb->dev);
- if (vif != ALL_VIFS) {
- ipmr_cache_unresolved(cache, vif, skb);
- return -EAGAIN;
+ if (vif >= 0) {
+ int err = ipmr_cache_unresolved(vif, skb);
+ read_unlock(&mrt_lock);
+
+ return err;
}
+ read_unlock(&mrt_lock);
kfree_skb(skb);
- return 0;
+ return -ENODEV;
}
ip_mr_forward(skb, cache, local);
+ read_unlock(&mrt_lock);
+
if (local)
return ip_local_deliver(skb);
+
return 0;
dont_forward:
@@ -1296,11 +1386,11 @@ int pim_rcv_v1(struct sk_buff * skb, unsigned short len)
{
struct igmphdr *pim = (struct igmphdr*)skb->h.raw;
struct iphdr *encap;
+ struct net_device *reg_dev = NULL;
if (!mroute_do_pim ||
len < sizeof(*pim) + sizeof(*encap) ||
- pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER ||
- reg_dev == NULL) {
+ pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) {
kfree_skb(skb);
return -EINVAL;
}
@@ -1318,6 +1408,19 @@ int pim_rcv_v1(struct sk_buff * skb, unsigned short len)
kfree_skb(skb);
return -EINVAL;
}
+
+ read_lock(&mrt_lock);
+ if (reg_vif_num >= 0)
+ reg_dev = vif_table[reg_vif_num].dev;
+ if (reg_dev)
+ dev_hold(reg_dev);
+ read_unlock(&mrt_lock);
+
+ if (reg_dev == NULL) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
skb->mac.raw = skb->nh.raw;
skb_pull(skb, (u8*)encap - skb->data);
skb->nh.iph = (struct iphdr *)skb->data;
@@ -1331,6 +1434,7 @@ int pim_rcv_v1(struct sk_buff * skb, unsigned short len)
((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
((struct net_device_stats*)reg_dev->priv)->rx_packets++;
netif_rx(skb);
+ dev_put(reg_dev);
return 0;
}
#endif
@@ -1340,11 +1444,11 @@ int pim_rcv(struct sk_buff * skb, unsigned short len)
{
struct pimreghdr *pim = (struct pimreghdr*)skb->h.raw;
struct iphdr *encap;
+ struct net_device *reg_dev = NULL;
if (len < sizeof(*pim) + sizeof(*encap) ||
pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
(pim->flags&PIM_NULL_REGISTER) ||
- reg_dev == NULL ||
(ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
ip_compute_csum((void *)pim, len))) {
kfree_skb(skb);
@@ -1359,6 +1463,19 @@ int pim_rcv(struct sk_buff * skb, unsigned short len)
kfree_skb(skb);
return -EINVAL;
}
+
+ read_lock(&mrt_lock);
+ if (reg_vif_num >= 0)
+ reg_dev = vif_table[reg_vif_num].dev;
+ if (reg_dev)
+ dev_hold(reg_dev);
+ read_unlock(&mrt_lock);
+
+ if (reg_dev == NULL) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
skb->mac.raw = skb->nh.raw;
skb_pull(skb, (u8*)encap - skb->data);
skb->nh.iph = (struct iphdr *)skb->data;
@@ -1372,6 +1489,7 @@ int pim_rcv(struct sk_buff * skb, unsigned short len)
((struct net_device_stats*)reg_dev->priv)->rx_packets++;
skb->dst = NULL;
netif_rx(skb);
+ dev_put(reg_dev);
return 0;
}
#endif
@@ -1383,7 +1501,7 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
{
int ct;
struct rtnexthop *nhp;
- struct device *dev = vif_table[c->mfc_parent].dev;
+ struct net_device *dev = vif_table[c->mfc_parent].dev;
u8 *b = skb->tail;
struct rtattr *mp_head;
@@ -1392,13 +1510,13 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
- for (ct = c->mfc_minvif; ct < c->mfc_maxvif; ct++) {
- if (c->mfc_ttls[ct] < 255) {
+ for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
+ if (c->mfc_un.res.ttls[ct] < 255) {
if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
goto rtattr_failure;
nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
nhp->rtnh_flags = 0;
- nhp->rtnh_hops = c->mfc_ttls[ct];
+ nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
nhp->rtnh_len = sizeof(*nhp);
}
@@ -1415,24 +1533,25 @@ rtattr_failure:
int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
{
+ int err;
struct mfc_cache *cache;
struct rtable *rt = (struct rtable*)skb->dst;
- start_bh_atomic();
+ read_lock(&mrt_lock);
cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
- if (cache==NULL || (cache->mfc_flags&MFC_QUEUED)) {
- struct device *dev;
+
+ if (cache==NULL) {
+ struct net_device *dev;
int vif;
- int err;
if (nowait) {
- end_bh_atomic();
+ read_unlock(&mrt_lock);
return -EAGAIN;
}
dev = skb->dev;
- if (dev == NULL || (vif = ipmr_find_vif(dev)) == ALL_VIFS) {
- end_bh_atomic();
+ if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
+ read_unlock(&mrt_lock);
return -ENODEV;
}
skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
@@ -1440,18 +1559,16 @@ int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
skb->nh.iph->saddr = rt->rt_src;
skb->nh.iph->daddr = rt->rt_dst;
skb->nh.iph->version = 0;
- err = ipmr_cache_unresolved(cache, vif, skb);
- end_bh_atomic();
+ err = ipmr_cache_unresolved(vif, skb);
+ read_unlock(&mrt_lock);
return err;
}
- /* Resolved cache entry is not changed by net bh,
- so that we are allowed to enable it.
- */
- end_bh_atomic();
if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
cache->mfc_flags |= MFC_NOTIFY;
- return ipmr_fill_mroute(skb, cache, rtm);
+ err = ipmr_fill_mroute(skb, cache, rtm);
+ read_unlock(&mrt_lock);
+ return err;
}
#endif
@@ -1472,11 +1589,12 @@ int ipmr_vif_info(char *buffer, char **start, off_t offset, int length, int dumm
"Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n");
pos=len;
+ read_lock(&mrt_lock);
for (ct=0;ct<maxvif;ct++)
{
char *name = "none";
vif=&vif_table[ct];
- if(!(vifc_map&(1<<ct)))
+ if(!VIF_EXISTS(ct))
continue;
if (vif->dev)
name = vif->dev->name;
@@ -1493,11 +1611,14 @@ int ipmr_vif_info(char *buffer, char **start, off_t offset, int length, int dumm
if(pos>offset+length)
break;
}
+ read_unlock(&mrt_lock);
*start=buffer+(offset-begin);
len-=(offset-begin);
if(len>length)
len=length;
+ if (len<0)
+ len = 0;
return len;
}
@@ -1513,12 +1634,11 @@ int ipmr_mfc_info(char *buffer, char **start, off_t offset, int length, int dumm
len += sprintf(buffer,
"Group Origin Iif Pkts Bytes Wrong Oifs\n");
pos=len;
-
+
+ read_lock(&mrt_lock);
for (ct=0;ct<MFC_LINES;ct++)
{
- start_bh_atomic();
- mfc=mfc_cache_array[ct];
- while(mfc!=NULL)
+ for(mfc=mfc_cache_array[ct]; mfc; mfc=mfc->next)
{
int n;
@@ -1528,14 +1648,14 @@ int ipmr_mfc_info(char *buffer, char **start, off_t offset, int length, int dumm
size = sprintf(buffer+len, "%08lX %08lX %-3d %8ld %8ld %8ld",
(unsigned long)mfc->mfc_mcastgrp,
(unsigned long)mfc->mfc_origin,
- mfc->mfc_parent == ALL_VIFS ? -1 : mfc->mfc_parent,
- (mfc->mfc_flags & MFC_QUEUED) ? mfc->mfc_unresolved.qlen : mfc->mfc_pkt,
- mfc->mfc_bytes,
- mfc->mfc_wrong_if);
- for(n=mfc->mfc_minvif;n<mfc->mfc_maxvif;n++)
+ mfc->mfc_parent,
+ mfc->mfc_un.res.pkt,
+ mfc->mfc_un.res.bytes,
+ mfc->mfc_un.res.wrong_if);
+ for(n=mfc->mfc_un.res.minvif;n<mfc->mfc_un.res.maxvif;n++)
{
- if(vifc_map&(1<<n) && mfc->mfc_ttls[n] < 255)
- size += sprintf(buffer+len+size, " %2d:%-3d", n, mfc->mfc_ttls[n]);
+ if(VIF_EXISTS(n) && mfc->mfc_un.res.ttls[n] < 255)
+ size += sprintf(buffer+len+size, " %2d:%-3d", n, mfc->mfc_un.res.ttls[n]);
}
size += sprintf(buffer+len+size, "\n");
len+=size;
@@ -1546,15 +1666,32 @@ int ipmr_mfc_info(char *buffer, char **start, off_t offset, int length, int dumm
begin=pos;
}
if(pos>offset+length)
- {
- end_bh_atomic();
goto done;
- }
- mfc=mfc->next;
}
- end_bh_atomic();
}
+
+ spin_lock_bh(&mfc_unres_lock);
+ for(mfc=mfc_unres_queue; mfc; mfc=mfc->next) {
+ size = sprintf(buffer+len, "%08lX %08lX %-3d %8ld %8ld %8ld\n",
+ (unsigned long)mfc->mfc_mcastgrp,
+ (unsigned long)mfc->mfc_origin,
+ -1,
+ (long)mfc->mfc_un.unres.unresolved.qlen,
+ 0L, 0L);
+ len+=size;
+ pos+=size;
+ if(pos<offset)
+ {
+ len=0;
+ begin=pos;
+ }
+ if(pos>offset+length)
+ break;
+ }
+ spin_unlock_bh(&mfc_unres_lock);
+
done:
+ read_unlock(&mrt_lock);
*start=buffer+(offset-begin);
len-=(offset-begin);
if(len>length)
@@ -1598,9 +1735,15 @@ struct inet_protocol pim_protocol =
* Setup for IP multicast routing
*/
-__initfunc(void ip_mr_init(void))
+void __init ip_mr_init(void)
{
printk(KERN_INFO "Linux IP multicast router 0.06 plus PIM-SM\n");
+ mrt_cachep = kmem_cache_create("ip_mrt_cache",
+ sizeof(struct mfc_cache),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+ init_timer(&ipmr_expire_timer);
+ ipmr_expire_timer.function=ipmr_expire_process;
register_netdevice_notifier(&ip_mr_notifier);
#ifdef CONFIG_PROC_FS
proc_net_register(&proc_net_ipmr_vif);
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index b47480be5..2b61e6466 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -5,7 +5,7 @@
*
* INET protocol dispatch tables.
*
- * Version: $Id: protocol.c,v 1.9 1997/10/29 20:27:34 kuznet Exp $
+ * Version: $Id: protocol.c,v 1.10 1999/08/20 11:05:55 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -116,25 +116,7 @@ struct inet_protocol *inet_protos[MAX_INET_PROTOS] =
NULL
};
-
-/*
- * Find a protocol in the protocol tables given its
- * IP type.
- */
-
-struct inet_protocol *inet_get_protocol(unsigned char prot)
-{
- unsigned char hash;
- struct inet_protocol *p;
-
- hash = prot & (MAX_INET_PROTOS - 1);
- for (p = inet_protos[hash] ; p != NULL; p=p->next)
- {
- if (p->protocol == prot)
- return((struct inet_protocol *) p);
- }
- return(NULL);
-}
+rwlock_t inet_protocol_lock = RW_LOCK_UNLOCKED;
/*
* Add a protocol handler to the hash tables
@@ -146,6 +128,7 @@ void inet_add_protocol(struct inet_protocol *prot)
struct inet_protocol *p2;
hash = prot->protocol & (MAX_INET_PROTOS - 1);
+ write_lock_bh(&inet_protocol_lock);
prot ->next = inet_protos[hash];
inet_protos[hash] = prot;
prot->copy = 0;
@@ -164,6 +147,7 @@ void inet_add_protocol(struct inet_protocol *prot)
}
p2 = (struct inet_protocol *) p2->next;
}
+ write_unlock_bh(&inet_protocol_lock);
}
/*
@@ -177,9 +161,11 @@ int inet_del_protocol(struct inet_protocol *prot)
unsigned char hash;
hash = prot->protocol & (MAX_INET_PROTOS - 1);
+ write_lock_bh(&inet_protocol_lock);
if (prot == inet_protos[hash])
{
inet_protos[hash] = (struct inet_protocol *) inet_protos[hash]->next;
+ write_unlock_bh(&inet_protocol_lock);
return(0);
}
@@ -200,6 +186,7 @@ int inet_del_protocol(struct inet_protocol *prot)
if (p->copy == 0 && lp != NULL)
lp->copy = 0;
p->next = prot->next;
+ write_unlock_bh(&inet_protocol_lock);
return(0);
}
if (p->next != NULL && p->next->protocol == prot->protocol)
@@ -207,5 +194,6 @@ int inet_del_protocol(struct inet_protocol *prot)
p = (struct inet_protocol *) p->next;
}
+ write_unlock_bh(&inet_protocol_lock);
return(-1);
}
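With inet_get_protocol() removed, callers can no longer take a bare pointer into the dispatch table; a lookup now has to hold the read side of inet_protocol_lock across the whole use of the entry. A minimal sketch of what a caller is expected to do instead of the deleted helper:

    struct inet_protocol *p;

    read_lock(&inet_protocol_lock);
    for (p = inet_protos[prot & (MAX_INET_PROTOS - 1)]; p; p = p->next)
            if (p->protocol == prot)
                    break;
    if (p)
            /* ... dispatch while the read lock is held ... */ ;
    read_unlock(&inet_protocol_lock);
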
diff --git a/net/ipv4/rarp.c b/net/ipv4/rarp.c
deleted file mode 100644
index 7f7c7e3f2..000000000
--- a/net/ipv4/rarp.c
+++ /dev/null
@@ -1,606 +0,0 @@
-/* linux/net/inet/rarp.c
- *
- * Copyright (C) 1994 by Ross Martin
- * Based on linux/net/inet/arp.c, Copyright (C) 1994 by Florian La Roche
- *
- * $Id: rarp.c,v 1.25 1998/06/19 13:22:34 davem Exp $
- *
- * This module implements the Reverse Address Resolution Protocol
- * (RARP, RFC 903), which is used to convert low level addresses such
- * as Ethernet addresses into high level addresses such as IP addresses.
- * The most common use of RARP is as a means for a diskless workstation
- * to discover its IP address during a network boot.
- *
- **
- *** WARNING:::::::::::::::::::::::::::::::::WARNING
- ****
- ***** SUN machines seem determined to boot solely from the person who
- **** answered their RARP query. NEVER add a SUN to your RARP table
- *** unless you have all the rest to boot the box from it.
- **
- *
- * Currently, only Ethernet address -> IP address is likely to work.
- * (Is RARP ever used for anything else?)
- *
- * This code is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Fixes
- * Alan Cox : Rarp delete on device down needed as
- * reported by Walter Wolfgang.
- * Mike McLagan : Routing by source
- *
- */
-
-#include <linux/module.h>
-
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/errno.h>
-#include <linux/netdevice.h>
-#include <linux/if_arp.h>
-#include <linux/in.h>
-#include <linux/config.h>
-#include <linux/init.h>
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-#include <stdarg.h>
-#include <linux/inet.h>
-#include <linux/etherdevice.h>
-#include <net/ip.h>
-#include <net/route.h>
-#include <net/protocol.h>
-#include <net/tcp.h>
-#include <linux/skbuff.h>
-#include <net/sock.h>
-#include <net/arp.h>
-#include <net/rarp.h>
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
-#include <net/ax25.h>
-#endif
-#include <linux/proc_fs.h>
-#include <linux/stat.h>
-
-extern int (*rarp_ioctl_hook)(unsigned int,void*);
-
-/*
- * This structure defines the RARP mapping cache. As long as we make
- * changes in this structure, we keep interrupts off.
- */
-
-struct rarp_table
-{
- struct rarp_table *next; /* Linked entry list */
- unsigned long ip; /* ip address of entry */
- unsigned char ha[MAX_ADDR_LEN]; /* Hardware address */
- unsigned char hlen; /* Length of hardware address */
- unsigned char htype; /* Type of hardware in use */
- struct device *dev; /* Device the entry is tied to */
-};
-
-struct rarp_table *rarp_tables = NULL;
-
-static int rarp_rcv(struct sk_buff *, struct device *, struct packet_type *);
-
-static struct packet_type rarp_packet_type =
-{
- 0, /* Should be: __constant_htons(ETH_P_RARP) - but this _doesn't_ come out constant! */
- 0, /* copy */
- rarp_rcv,
- NULL,
- NULL
-};
-
-static int initflag = 1;
-
-
-/*
- * Release the memory for this entry.
- */
-
-static inline void rarp_release_entry(struct rarp_table *entry)
-{
- kfree_s(entry, sizeof(struct rarp_table));
- MOD_DEC_USE_COUNT;
- return;
-}
-
-/*
- * Delete a RARP mapping entry in the cache.
- */
-
-static void rarp_destroy(unsigned long ip_addr)
-{
- struct rarp_table *entry;
- struct rarp_table **pentry;
-
- start_bh_atomic();
- pentry = &rarp_tables;
- while ((entry = *pentry) != NULL)
- {
- if (entry->ip == ip_addr)
- {
- *pentry = entry->next;
- end_bh_atomic();
- rarp_release_entry(entry);
- return;
- }
- pentry = &entry->next;
- }
- end_bh_atomic();
-}
-
-/*
- * Flush a device.
- */
-
-static void rarp_destroy_dev(struct device *dev)
-{
- struct rarp_table *entry;
- struct rarp_table **pentry;
-
- start_bh_atomic();
- pentry = &rarp_tables;
- while ((entry = *pentry) != NULL)
- {
- if (entry->dev == dev)
- {
- *pentry = entry->next;
- rarp_release_entry(entry);
- }
- else
- pentry = &entry->next;
- }
- end_bh_atomic();
-}
-
-static int rarp_device_event(struct notifier_block *this, unsigned long event, void *ptr)
-{
- if(event!=NETDEV_DOWN)
- return NOTIFY_DONE;
- rarp_destroy_dev((struct device *)ptr);
- return NOTIFY_DONE;
-}
-
-/*
- * Called once when data first added to rarp cache with ioctl.
- */
-
-static struct notifier_block rarp_dev_notifier={
- rarp_device_event,
- NULL,
- 0
-};
-
-static int rarp_pkt_inited=0;
-
-static void rarp_init_pkt (void)
-{
- /* Register the packet type */
- rarp_packet_type.type=htons(ETH_P_RARP);
- dev_add_pack(&rarp_packet_type);
- register_netdevice_notifier(&rarp_dev_notifier);
- rarp_pkt_inited=1;
-}
-
-#ifdef MODULE
-
-static void rarp_end_pkt(void)
-{
- if(!rarp_pkt_inited)
- return;
- dev_remove_pack(&rarp_packet_type);
- unregister_netdevice_notifier(&rarp_dev_notifier);
- rarp_pkt_inited=0;
-}
-
-#endif
-
-/*
- * Receive an arp request by the device layer. Maybe it should be
- * rewritten to use the incoming packet for the reply. The current
- * "overhead" time isn't that high...
- */
-
-static int rarp_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
-{
-/*
- * We shouldn't use this type conversion. Check later.
- */
- struct arphdr *rarp = (struct arphdr *) skb->data;
- unsigned char *rarp_ptr = skb_pull(skb,sizeof(struct arphdr));
- struct rarp_table *entry;
- struct in_device *in_dev = dev->ip_ptr;
- long sip,tip;
- unsigned char *sha,*tha; /* s for "source", t for "target" */
-
-/*
- * If this test doesn't pass, it's not IP, or we should ignore it anyway
- */
-
- if (rarp->ar_hln != dev->addr_len || dev->type != ntohs(rarp->ar_hrd)
- || dev->flags&IFF_NOARP || !in_dev || !in_dev->ifa_list)
- {
- kfree_skb(skb);
- return 0;
- }
-
-/*
- * If it's not a RARP request, delete it.
- */
- if (rarp->ar_op != htons(ARPOP_RREQUEST))
- {
- kfree_skb(skb);
- return 0;
- }
-
-/*
- * For now we will only deal with IP addresses.
- */
-
- if (
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
- (rarp->ar_pro != htons(AX25_P_IP) && dev->type == ARPHRD_AX25) ||
-#endif
- (rarp->ar_pro != htons(ETH_P_IP) && dev->type != ARPHRD_AX25)
- || rarp->ar_pln != 4)
- {
- /*
- * This packet is not for us. Remove it.
- */
- kfree_skb(skb);
- return 0;
- }
-
-/*
- * Extract variable width fields
- */
-
- sha=rarp_ptr;
- rarp_ptr+=dev->addr_len;
- memcpy(&sip,rarp_ptr,4);
- rarp_ptr+=4;
- tha=rarp_ptr;
- rarp_ptr+=dev->addr_len;
- memcpy(&tip,rarp_ptr,4);
-
-/*
- * Process entry. Use tha for table lookup according to RFC903.
- */
-
- for (entry = rarp_tables; entry != NULL; entry = entry->next)
- if (!memcmp(entry->ha, tha, rarp->ar_hln))
- break;
-
- if (entry != NULL)
- {
- sip=entry->ip;
-
- arp_send(ARPOP_RREPLY, ETH_P_RARP, sip, dev, in_dev->ifa_list->ifa_address, sha,
- dev->dev_addr, sha);
- }
-
- kfree_skb(skb);
- return 0;
-}
-
-
-/*
- * Set (create) a RARP cache entry.
- */
-
-static int rarp_req_set(struct arpreq *req)
-{
- struct arpreq r;
- struct rarp_table *entry;
- struct sockaddr_in *si;
- int htype, hlen;
- unsigned long ip;
- struct rtable *rt;
- struct device * dev;
- int err;
-
- err = copy_from_user(&r, req, sizeof(r));
- if (err)
- return -EFAULT;
-
- /*
- * We only understand about IP addresses...
- */
-
- if (r.arp_pa.sa_family != AF_INET)
- return -EPFNOSUPPORT;
-
- switch (r.arp_ha.sa_family)
- {
- case ARPHRD_ETHER:
- htype = ARPHRD_ETHER;
- hlen = ETH_ALEN;
- break;
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
- case ARPHRD_AX25:
- htype = ARPHRD_AX25;
- hlen = 7;
- break;
-#endif
- default:
- return -EPFNOSUPPORT;
- }
-
- si = (struct sockaddr_in *) &r.arp_pa;
- ip = si->sin_addr.s_addr;
- if (ip == 0)
- {
- printk(KERN_DEBUG "RARP: SETRARP: requested PA is 0.0.0.0 !\n");
- return -EINVAL;
- }
-
-/*
- * Is it reachable directly ?
- */
-
- err = ip_route_output(&rt, ip, 0, 1, 0);
- if (err)
- return err;
- if (rt->rt_flags&(RTCF_LOCAL|RTCF_BROADCAST|RTCF_MULTICAST|RTCF_DNAT)) {
- ip_rt_put(rt);
- return -EINVAL;
- }
- dev = rt->u.dst.dev;
-
-/*
- * Is there an existing entry for this address? Find out...
- */
-
- for (entry = rarp_tables; entry != NULL; entry = entry->next)
- if (entry->ip == ip)
- break;
-
-/*
- * If no entry was found, create a new one.
- */
-
- if (entry == NULL)
- {
- entry = (struct rarp_table *) kmalloc(sizeof(struct rarp_table),
- GFP_ATOMIC);
- if (entry == NULL)
- {
- return -ENOMEM;
- }
- if (initflag)
- {
- rarp_init_pkt();
- initflag=0;
- }
-
- /* Block interrupts until table modification is finished */
-
- cli();
- entry->next = rarp_tables;
- rarp_tables = entry;
- }
- cli();
- entry->ip = ip;
- entry->hlen = hlen;
- entry->htype = htype;
- memcpy(&entry->ha, &r.arp_ha.sa_data, hlen);
- entry->dev = dev;
- sti();
-
- /* Don't unlink if we have entries to serve. */
- MOD_INC_USE_COUNT;
-
- return 0;
-}
-
-
-/*
- * Get a RARP cache entry.
- */
-
-static int rarp_req_get(struct arpreq *req)
-{
- struct arpreq r;
- struct rarp_table *entry;
- struct sockaddr_in *si;
- unsigned long ip;
- int err;
-
-/*
- * We only understand about IP addresses...
- */
-
- err = copy_from_user(&r, req, sizeof(r));
- if (err)
- return -EFAULT;
-
- if (r.arp_pa.sa_family != AF_INET)
- return -EPFNOSUPPORT;
-
-/*
- * Is there an existing entry for this address?
- */
-
- si = (struct sockaddr_in *) &r.arp_pa;
- ip = si->sin_addr.s_addr;
-
- for (entry = rarp_tables; entry != NULL; entry = entry->next)
- if (entry->ip == ip)
- break;
-
- if (entry == NULL)
- {
- return -ENXIO;
- }
-
-/*
- * We found it; copy into structure.
- */
-
- memcpy(r.arp_ha.sa_data, &entry->ha, entry->hlen);
- r.arp_ha.sa_family = entry->htype;
-
-/*
- * Copy the information back
- */
-
- return copy_to_user(req, &r, sizeof(r)) ? -EFAULT : 0;
-}
-
-
-/*
- * Handle a RARP layer I/O control request.
- */
-
-int rarp_ioctl(unsigned int cmd, void *arg)
-{
- struct arpreq r;
- struct sockaddr_in *si;
- int err;
-
- switch(cmd)
- {
- case SIOCDRARP:
- if (!suser())
- return -EPERM;
- err = copy_from_user(&r, arg, sizeof(r));
- if (err)
- return -EFAULT;
- if (r.arp_pa.sa_family != AF_INET)
- return -EPFNOSUPPORT;
- si = (struct sockaddr_in *) &r.arp_pa;
- rarp_destroy(si->sin_addr.s_addr);
- return 0;
-
- case SIOCGRARP:
-
- return rarp_req_get((struct arpreq *)arg);
- case SIOCSRARP:
- if (!suser())
- return -EPERM;
- return rarp_req_set((struct arpreq *)arg);
- default:
- return -EINVAL;
- }
-
- /*NOTREACHED*/
- return 0;
-}
-
-#ifdef CONFIG_PROC_FS
-int rarp_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
-{
- int len=0;
- off_t begin=0;
- off_t pos=0;
- int size;
- struct rarp_table *entry;
- char ipbuffer[20];
- unsigned long netip;
- if (initflag)
- {
- size = sprintf(buffer,"RARP disabled until entries added to cache.\n");
- pos+=size;
- len+=size;
- }
- else
- {
- size = sprintf(buffer,
- "IP address HW type HW address\n");
- pos+=size;
- len+=size;
-
- for(entry=rarp_tables; entry!=NULL; entry=entry->next)
- {
- netip=htonl(entry->ip); /* switch to network order */
- sprintf(ipbuffer,"%d.%d.%d.%d",
- (unsigned int)(netip>>24)&255,
- (unsigned int)(netip>>16)&255,
- (unsigned int)(netip>>8)&255,
- (unsigned int)(netip)&255);
-
- size = sprintf(buffer+len,
- "%-17s%-20s%02x:%02x:%02x:%02x:%02x:%02x\n",
- ipbuffer,
- "10Mbps Ethernet",
- (unsigned int)entry->ha[0],
- (unsigned int)entry->ha[1],
- (unsigned int)entry->ha[2],
- (unsigned int)entry->ha[3],
- (unsigned int)entry->ha[4],
- (unsigned int)entry->ha[5]);
-
- len+=size;
- pos=begin+len;
-
- if(pos<offset)
- {
- len=0;
- begin=pos;
- }
- if(pos>offset+length)
- break;
- }
- }
-
- *start = buffer+(offset-begin); /* Start of wanted data */
- len -= (offset-begin); /* Start slop */
- if (len>length)
- len = length; /* Ending slop */
- return len;
-}
-
-struct proc_dir_entry proc_net_rarp = {
- PROC_NET_RARP, 4, "rarp",
- S_IFREG | S_IRUGO, 1, 0, 0,
- 0, &proc_net_inode_operations,
- rarp_get_info
-};
-#endif
-
-__initfunc(void
-rarp_init(void))
-{
-#ifdef CONFIG_PROC_FS
- proc_net_register(&proc_net_rarp);
-#endif
- rarp_ioctl_hook = rarp_ioctl;
-}
-
-#ifdef MODULE
-
-int init_module(void)
-{
- rarp_init();
- return 0;
-}
-
-void cleanup_module(void)
-{
- struct rarp_table *rt, *rt_next;
-#ifdef CONFIG_PROC_FS
- proc_net_unregister(PROC_NET_RARP);
-#endif
- rarp_ioctl_hook = NULL;
- cli();
- /* Destroy the RARP-table */
- rt = rarp_tables;
- rarp_tables = NULL;
- sti();
- /* ... and free it. */
- for ( ; rt != NULL; rt = rt_next) {
- rt_next = rt->next;
- rarp_release_entry(rt);
- }
- rarp_end_pkt();
-}
-#endif
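
rarp.c is deleted outright in this merge. For reference, the packet layout its
rarp_rcv() walked is fixed by RFC 903: after the arphdr come sender hardware
address, sender IP, target hardware address and target IP, back to back, with
widths given by ar_hln and ar_pln. A user-space-flavoured sketch of that
extraction, assuming the 6-byte Ethernet / 4-byte IP case the code above handled
(parse_rarp_body and struct rarp_fields are illustrative names, not kernel API):

#include <string.h>

struct rarp_fields {
	unsigned char sha[6], tha[6];	/* sender/target hardware addresses */
	unsigned char sip[4], tip[4];	/* sender/target IP addresses */
};

/* p points just past the fixed arphdr, exactly where skb_pull() left
 * rarp_ptr in the deleted rarp_rcv() above. */
static void parse_rarp_body(const unsigned char *p, struct rarp_fields *f)
{
	memcpy(f->sha, p, 6); p += 6;
	memcpy(f->sip, p, 4); p += 4;
	memcpy(f->tha, p, 6); p += 6;
	memcpy(f->tip, p, 4);
}

Per RFC 903 the reply is looked up by tha, the target hardware address, which is
why the deleted table walk compared entry->ha against tha rather than sha.
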
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 584fe81fc..83044d2cd 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -5,7 +5,7 @@
*
* RAW - implementation of IP "raw" sockets.
*
- * Version: $Id: raw.c,v 1.42 1999/07/02 11:26:26 davem Exp $
+ * Version: $Id: raw.c,v 1.43 1999/08/20 11:05:57 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -60,19 +60,17 @@
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
+#include <net/inet_common.h>
#include <net/checksum.h>
-#ifdef CONFIG_IP_MROUTE
-struct sock *mroute_socket=NULL;
-#endif
-
struct sock *raw_v4_htable[RAWV4_HTABLE_SIZE];
+rwlock_t raw_v4_lock = RW_LOCK_UNLOCKED;
static void raw_v4_hash(struct sock *sk)
{
struct sock **skp = &raw_v4_htable[sk->num & (RAWV4_HTABLE_SIZE - 1)];
- SOCKHASH_LOCK_WRITE();
+ write_lock_bh(&raw_v4_lock);
if ((sk->next = *skp) != NULL)
(*skp)->pprev = &sk->next;
*skp = sk;
@@ -80,31 +78,32 @@ static void raw_v4_hash(struct sock *sk)
sk->prot->inuse++;
if(sk->prot->highestinuse < sk->prot->inuse)
sk->prot->highestinuse = sk->prot->inuse;
- SOCKHASH_UNLOCK_WRITE();
+ sock_hold(sk);
+ write_unlock_bh(&raw_v4_lock);
}
static void raw_v4_unhash(struct sock *sk)
{
- SOCKHASH_LOCK_WRITE();
+ write_lock_bh(&raw_v4_lock);
if (sk->pprev) {
if (sk->next)
sk->next->pprev = sk->pprev;
*sk->pprev = sk->next;
sk->pprev = NULL;
sk->prot->inuse--;
+ __sock_put(sk);
}
- SOCKHASH_UNLOCK_WRITE();
+ write_unlock_bh(&raw_v4_lock);
}
-static __inline__ struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num,
- unsigned long raddr, unsigned long laddr,
- int dif)
+struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num,
+ unsigned long raddr, unsigned long laddr,
+ int dif)
{
struct sock *s = sk;
for(s = sk; s; s = s->next) {
if((s->num == num) &&
- !(s->dead && (s->state == TCP_CLOSE)) &&
!(s->daddr && s->daddr != raddr) &&
!(s->rcv_saddr && s->rcv_saddr != laddr) &&
!(s->bound_dev_if && s->bound_dev_if != dif))
@@ -113,17 +112,6 @@ static __inline__ struct sock *__raw_v4_lookup(struct sock *sk, unsigned short n
return s;
}
-struct sock *raw_v4_lookup(struct sock *sk, unsigned short num,
- unsigned long raddr, unsigned long laddr,
- int dif)
-{
- SOCKHASH_LOCK_READ();
- sk = __raw_v4_lookup(sk, num, raddr, laddr, dif);
- SOCKHASH_UNLOCK_READ();
-
- return sk;
-}
-
/*
* 0 - deliver
* 1 - block
@@ -151,17 +139,17 @@ struct sock *raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
{
struct sock *sk;
- SOCKHASH_LOCK_READ_BH();
+ read_lock(&raw_v4_lock);
if ((sk = raw_v4_htable[hash]) == NULL)
goto out;
sk = __raw_v4_lookup(sk, iph->protocol,
iph->saddr, iph->daddr,
skb->dev->ifindex);
+
while(sk != NULL) {
struct sock *sknext = __raw_v4_lookup(sk->next, iph->protocol,
iph->saddr, iph->daddr,
skb->dev->ifindex);
-
if (iph->protocol != IPPROTO_ICMP ||
! icmp_filter(sk, skb)) {
struct sk_buff *clone;
@@ -169,16 +157,16 @@ struct sock *raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
if(sknext == NULL)
break;
clone = skb_clone(skb, GFP_ATOMIC);
- if(clone) {
- SOCKHASH_UNLOCK_READ_BH();
+ /* Not releasing hash table! */
+ if(clone)
raw_rcv(sk, clone);
- SOCKHASH_LOCK_READ_BH();
- }
}
sk = sknext;
}
out:
- SOCKHASH_UNLOCK_READ_BH();
+ if (sk)
+ sock_hold(sk);
+ read_unlock(&raw_v4_lock);
return sk;
}
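
raw_v4_input() now walks its hash chain under read_lock and, as the loop above
shows, hands a clone of the skb to every matching socket except the last; the
final match is returned with a reference held so the caller can consume the
original buffer without an extra copy. The shape of that fan-out as a sketch,
where next_match() and deliver() are stand-ins for __raw_v4_lookup() and
raw_rcv():

/* Sketch only: next_match() and deliver() are placeholders. */
static struct sock *fan_out(struct sock *sk, struct sk_buff *skb)
{
	while (sk != NULL) {
		struct sock *sknext = next_match(sk);
		struct sk_buff *clone;

		if (sknext == NULL)
			break;			/* last match keeps the original */
		clone = skb_clone(skb, GFP_ATOMIC);
		if (clone)
			deliver(sk, clone);	/* failed clone just drops this copy */
		sk = sknext;
	}
	return sk;	/* caller hands the original skb to this socket */
}
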
@@ -196,7 +184,7 @@ void raw_err (struct sock *sk, struct sk_buff *skb)
2. Socket is connected (otherwise the error indication
is useless without ip_recverr and error is hard.
*/
- if (!sk->ip_recverr && sk->state != TCP_ESTABLISHED)
+ if (!sk->protinfo.af_inet.recverr && sk->state != TCP_ESTABLISHED)
return;
switch (type) {
@@ -218,16 +206,16 @@ void raw_err (struct sock *sk, struct sk_buff *skb)
err = icmp_err_convert[code].errno;
harderr = icmp_err_convert[code].fatal;
if (code == ICMP_FRAG_NEEDED) {
- harderr = (sk->ip_pmtudisc != IP_PMTUDISC_DONT);
+ harderr = (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT);
err = EMSGSIZE;
info = ntohs(skb->h.icmph->un.frag.mtu);
}
}
- if (sk->ip_recverr)
+ if (sk->protinfo.af_inet.recverr)
ip_icmp_error(sk, skb, err, 0, info, (u8 *)(skb->h.icmph + 1));
- if (sk->ip_recverr || harderr) {
+ if (sk->protinfo.af_inet.recverr || harderr) {
sk->err = err;
sk->error_report(sk);
}
@@ -345,9 +333,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, int len)
if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
return -EOPNOTSUPP;
- if (msg->msg_flags & ~(MSG_DONTROUTE|MSG_DONTWAIT))
- return(-EINVAL);
-
/*
* Get and verify the address.
*/
@@ -390,14 +375,14 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, int len)
ipc.addr = daddr;
if (!ipc.opt)
- ipc.opt = sk->opt;
+ ipc.opt = sk->protinfo.af_inet.opt;
if (ipc.opt) {
err = -EINVAL;
/* Linux does not mangle headers on raw sockets,
* so that IP options + IP_HDRINCL is non-sense.
*/
- if (sk->ip_hdrincl)
+ if (sk->protinfo.af_inet.hdrincl)
goto done;
if (ipc.opt->srr) {
if (!daddr)
@@ -405,15 +390,15 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, int len)
daddr = ipc.opt->faddr;
}
}
- tos = RT_TOS(sk->ip_tos) | sk->localroute;
+ tos = RT_TOS(sk->protinfo.af_inet.tos) | sk->localroute;
if (msg->msg_flags&MSG_DONTROUTE)
tos |= RTO_ONLINK;
if (MULTICAST(daddr)) {
if (!ipc.oif)
- ipc.oif = sk->ip_mc_index;
+ ipc.oif = sk->protinfo.af_inet.mc_index;
if (!rfh.saddr)
- rfh.saddr = sk->ip_mc_addr;
+ rfh.saddr = sk->protinfo.af_inet.mc_addr;
}
err = ip_route_output(&rt, daddr, rfh.saddr, tos, ipc.oif);
@@ -425,11 +410,15 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, int len)
if (rt->rt_flags&RTCF_BROADCAST && !sk->broadcast)
goto done;
+ if (msg->msg_flags&MSG_CONFIRM)
+ goto do_confirm;
+back_from_confirm:
+
rfh.iov = msg->msg_iov;
rfh.saddr = rt->rt_src;
if (!ipc.addr)
ipc.addr = rt->rt_dst;
- err=ip_build_xmit(sk, sk->ip_hdrincl ? raw_getrawfrag : raw_getfrag,
+ err=ip_build_xmit(sk, sk->protinfo.af_inet.hdrincl ? raw_getrawfrag : raw_getfrag,
&rfh, len, &ipc, rt, msg->msg_flags);
done:
@@ -438,39 +427,23 @@ done:
ip_rt_put(rt);
return err<0 ? err : len;
+
+do_confirm:
+ dst_confirm(&rt->u.dst);
+ if (!(msg->msg_flags&MSG_PROBE) || len)
+ goto back_from_confirm;
+ err = 0;
+ goto done;
}
static void raw_close(struct sock *sk, long timeout)
{
- bh_lock_sock(sk);
-
- /* Observation: when raw_close is called, processes have
- no access to socket anymore. But net still has.
- Step one, detach it from networking:
-
- A. Remove from hash tables.
- */
- sk->state = TCP_CLOSE;
- raw_v4_unhash(sk);
/*
- B. Raw sockets may have direct kernel refereneces. Kill them.
+ * Raw sockets may have direct kernel references. Kill them.
*/
ip_ra_control(sk, 0, NULL);
- /* In this point socket cannot receive new packets anymore */
-
-
- /* But we still have packets pending on receive
- queue and probably, our own packets waiting in device queues.
- sock_destroy will drain receive queue, but transmitted
- packets will delay socket destruction.
- Set sk->dead=1 in order to prevent wakeups, when these
- packet will be freed.
- */
- sk->dead=1;
- destroy_sock(sk);
-
- /* That's all. No races here. */
+ inet_sock_release(sk);
}
/* This gets rid of all the nasties in af_inet. -DaveM */
@@ -483,17 +456,12 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
return -EINVAL;
chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
if(addr->sin_addr.s_addr != 0 && chk_addr_ret != RTN_LOCAL &&
- chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) {
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- /* Superuser may bind to any address to allow transparent proxying. */
- if(chk_addr_ret != RTN_UNICAST || !capable(CAP_NET_ADMIN))
-#endif
- return -EADDRNOTAVAIL;
- }
+ chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
+ return -EADDRNOTAVAIL;
sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr;
if(chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
sk->saddr = 0; /* Use device */
- dst_release(xchg(&sk->dst_cache, NULL));
+ sk_dst_reset(sk);
return 0;
}
@@ -541,7 +509,7 @@ int raw_recvmsg(struct sock *sk, struct msghdr *msg, int len,
sin->sin_family = AF_INET;
sin->sin_addr.s_addr = skb->nh.iph->saddr;
}
- if (sk->ip_cmsg_flags)
+ if (sk->protinfo.af_inet.cmsg_flags)
ip_cmsg_recv(msg, skb);
done:
skb_free_datagram(sk, skb);
@@ -621,17 +589,18 @@ static void get_raw_sock(struct sock *sp, char *tmpbuf, int i)
dest = sp->daddr;
src = sp->rcv_saddr;
- destp = ntohs(sp->dport);
- srcp = ntohs(sp->sport);
+ destp = 0;
+ srcp = sp->num;
timer_active = (sp->timer.prev != NULL) ? 2 : 0;
timer_expires = (timer_active == 2 ? sp->timer.expires : jiffies);
sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld",
+ " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld %d %p",
i, src, srcp, dest, destp, sp->state,
atomic_read(&sp->wmem_alloc), atomic_read(&sp->rmem_alloc),
timer_active, timer_expires-jiffies, 0,
- sp->socket->inode->i_uid, timer_active ? sp->timeout : 0,
- sp->socket ? sp->socket->inode->i_ino : 0);
+ sp->socket->inode->i_uid, 0,
+ sp->socket ? sp->socket->inode->i_ino : 0,
+ atomic_read(&sp->refcnt), sp);
}
int raw_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
@@ -646,7 +615,7 @@ int raw_get_info(char *buffer, char **start, off_t offset, int length, int dummy
" sl local_address rem_address st tx_queue "
"rx_queue tr tm->when retrnsmt uid timeout inode");
pos = 128;
- SOCKHASH_LOCK_READ();
+ read_lock(&raw_v4_lock);
for (i = 0; i < RAWV4_HTABLE_SIZE; i++) {
struct sock *sk;
@@ -663,7 +632,7 @@ int raw_get_info(char *buffer, char **start, off_t offset, int length, int dummy
}
}
out:
- SOCKHASH_UNLOCK_READ();
+ read_unlock(&raw_v4_lock);
begin = len - (pos - offset);
*start = buffer + begin;
len -= begin;
@@ -677,6 +646,7 @@ out:
struct proto raw_prot = {
raw_close, /* close */
udp_connect, /* connect */
+ udp_disconnect, /* disconnect */
NULL, /* accept */
NULL, /* retransmit */
NULL, /* write_wakeup */
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 3d9e87de3..72bb07336 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -5,7 +5,7 @@
*
* ROUTE - implementation of the IP router.
*
- * Version: $Id: route.c,v 1.69 1999/06/09 10:11:02 davem Exp $
+ * Version: $Id: route.c,v 1.72 1999/08/30 10:17:12 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -82,6 +82,7 @@
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
+#include <linux/netfilter_ipv4.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
@@ -112,6 +113,8 @@ int ip_rt_error_cost = HZ;
int ip_rt_error_burst = 5*HZ;
int ip_rt_gc_elasticity = 8;
int ip_rt_mtu_expires = 10*60*HZ;
+int ip_rt_min_pmtu = 512+20+20;
+int ip_rt_min_advmss = 536;
static unsigned long rt_deadline = 0;
@@ -148,6 +151,7 @@ struct dst_ops ipv4_dst_ops =
NULL,
ipv4_negative_advice,
ipv4_link_failure,
+ sizeof(struct rtable),
};
__u8 ip_tos2prio[16] = {
@@ -233,12 +237,13 @@ static int rt_cache_get_info(char *buffer, char **start, off_t offset, int lengt
(unsigned long)r->rt_dst,
(unsigned long)r->rt_gateway,
r->rt_flags,
- atomic_read(&r->u.dst.use),
- atomic_read(&r->u.dst.refcnt),
+ atomic_read(&r->u.dst.__refcnt),
+ r->u.dst.__use,
0,
- (unsigned long)r->rt_src, (int)r->u.dst.pmtu,
+ (unsigned long)r->rt_src, (int)r->u.dst.advmss + 40,
r->u.dst.window,
- (int)r->u.dst.rtt, r->key.tos,
+ (int)((r->u.dst.rtt>>3) + r->u.dst.rttvar),
+ r->key.tos,
r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
r->u.dst.hh ? (r->u.dst.hh->hh_output == dev_queue_xmit) : 0,
r->rt_spec_dst);
@@ -289,7 +294,7 @@ static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2)
{
int age;
- if (atomic_read(&rth->u.dst.use))
+ if (atomic_read(&rth->u.dst.__refcnt))
return 0;
if (rth->u.dst.expires && (long)(rth->u.dst.expires - jiffies) <= 0)
@@ -361,13 +366,11 @@ static void rt_run_flush(unsigned long dummy)
for (i=0; i<RT_HASH_DIVISOR; i++) {
write_lock_bh(&rt_hash_lock);
rth = rt_hash_table[i];
- if(rth != NULL)
- rt_hash_table[i] = NULL;
+ rt_hash_table[i] = NULL;
write_unlock_bh(&rt_hash_lock);
for (; rth; rth=next) {
next = rth->u.rt_next;
- rth->u.rt_next = NULL;
rt_free(rth);
}
}
@@ -492,7 +495,6 @@ static int rt_garbage_collect(void)
continue;
}
*rthp = rth->u.rt_next;
- rth->u.rt_next = NULL;
rt_free(rth);
goal--;
}
@@ -560,8 +562,8 @@ restart:
rth->u.rt_next = rt_hash_table[hash];
rt_hash_table[hash] = rth;
- atomic_inc(&rth->u.dst.refcnt);
- atomic_inc(&rth->u.dst.use);
+ rth->u.dst.__use++;
+ dst_hold(&rth->u.dst);
rth->u.dst.lastuse = now;
write_unlock_bh(&rt_hash_lock);
@@ -595,9 +597,14 @@ restart:
goto restart;
}
+ if (net_ratelimit()) {
+ if ((rt->u.dst.dev->flags&IFF_UP) &&
+ __in_dev_get(rt->u.dst.dev))
+ printk("Neighbour table overflow.\n");
+ else
+ printk("Device %s is down.\n", rt->u.dst.dev->name);
+ }
rt_drop(rt);
- if (net_ratelimit())
- printk("neighbour table overflow\n");
return -ENOBUFS;
}
}
@@ -618,11 +625,27 @@ restart:
return 0;
}
+static void rt_del(unsigned hash, struct rtable *rt)
+{
+ struct rtable **rthp;
+
+ write_lock_bh(&rt_hash_lock);
+ ip_rt_put(rt);
+ for (rthp = &rt_hash_table[hash]; *rthp; rthp = &(*rthp)->u.rt_next) {
+ if (*rthp == rt) {
+ *rthp = rt->u.rt_next;
+ rt_free(rt);
+ break;
+ }
+ }
+ write_unlock_bh(&rt_hash_lock);
+}
+
void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
- u32 saddr, u8 tos, struct device *dev)
+ u32 saddr, u8 tos, struct net_device *dev)
{
int i, k;
- struct in_device *in_dev = dev->ip_ptr;
+ struct in_device *in_dev = in_dev_get(dev);
struct rtable *rth, **rthp;
u32 skeys[2] = { saddr, 0 };
int ikeys[2] = { dev->ifindex, 0 };
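
The new rt_del() above uses the pointer-to-pointer idiom this file relies on for
unlinking from singly linked hash chains: iterate over the address of each next
field and splice the victim out without tracking a separate "previous" node.
Stripped to its core, with generic names:

struct node { struct node *next; };

/* Unlink victim from the chain rooted at *head. The caller holds whatever
 * lock protects the chain (write_lock_bh on rt_hash_lock in rt_del above). */
static void unlink_node(struct node **head, struct node *victim)
{
	struct node **pp;

	for (pp = head; *pp != NULL; pp = &(*pp)->next) {
		if (*pp == victim) {
			*pp = victim->next;	/* splice out in place */
			break;
		}
	}
}
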
@@ -652,7 +675,7 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
rthp=&rt_hash_table[hash];
- write_lock_bh(&rt_hash_lock);
+ read_lock(&rt_hash_lock);
while ( (rth = *rthp) != NULL) {
struct rtable *rt;
@@ -673,11 +696,12 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
break;
dst_clone(&rth->u.dst);
+ read_unlock(&rt_hash_lock);
- rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
+ rt = dst_alloc(&ipv4_dst_ops);
if (rt == NULL) {
ip_rt_put(rth);
- write_unlock_bh(&rt_hash_lock);
+ in_dev_put(in_dev);
return;
}
@@ -685,11 +709,14 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
* Copy all the information.
*/
*rt = *rth;
- atomic_set(&rt->u.dst.refcnt, 1);
- atomic_set(&rt->u.dst.use, 1);
+ rt->u.dst.__use = 1;
+ atomic_set(&rt->u.dst.__refcnt, 1);
+ if (rt->u.dst.dev)
+ dev_hold(rt->u.dst.dev);
rt->u.dst.lastuse = jiffies;
rt->u.dst.neighbour = NULL;
rt->u.dst.hh = NULL;
+ rt->u.dst.obsolete = 0;
rt->rt_flags |= RTCF_REDIRECTED;
@@ -705,21 +732,20 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
neigh_event_send(rt->u.dst.neighbour, NULL);
ip_rt_put(rth);
rt_drop(rt);
- break;
+ goto do_next;
}
- *rthp = rth->u.rt_next;
- write_unlock_bh(&rt_hash_lock);
+ rt_del(hash, rt);
if (!rt_intern_hash(hash, rt, &rt))
ip_rt_put(rt);
- rt_drop(rth);
goto do_next;
}
- write_unlock_bh(&rt_hash_lock);
+ read_unlock(&rt_hash_lock);
do_next:
;
}
}
+ in_dev_put(in_dev);
return;
reject_redirect:
@@ -730,6 +756,7 @@ reject_redirect:
ntohl(old_gw), dev->name, ntohl(new_gw),
ntohl(saddr), ntohl(daddr), tos);
#endif
+ in_dev_put(in_dev);
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
@@ -743,20 +770,10 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
}
if ((rt->rt_flags&RTCF_REDIRECTED) || rt->u.dst.expires) {
unsigned hash = rt_hash_code(rt->key.dst, rt->key.src^(rt->key.oif<<5), rt->key.tos);
- struct rtable **rthp;
#if RT_CACHE_DEBUG >= 1
printk(KERN_DEBUG "ip_rt_advice: redirect to %d.%d.%d.%d/%02x dropped\n", NIPQUAD(rt->rt_dst), rt->key.tos);
#endif
- ip_rt_put(rt);
- write_lock_bh(&rt_hash_lock);
- for (rthp = &rt_hash_table[hash]; *rthp; rthp = &(*rthp)->u.rt_next) {
- if (*rthp == rt) {
- *rthp = rt->u.rt_next;
- rt_free(rt);
- break;
- }
- }
- write_unlock_bh(&rt_hash_lock);
+ rt_del(hash, rt);
return NULL;
}
}
@@ -782,11 +799,14 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
void ip_rt_send_redirect(struct sk_buff *skb)
{
struct rtable *rt = (struct rtable*)skb->dst;
- struct in_device *in_dev = (struct in_device*)rt->u.dst.dev->ip_ptr;
+ struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
- if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev))
+ if (!in_dev)
return;
+ if (!IN_DEV_TX_REDIRECTS(in_dev))
+ goto out;
+
/* No redirected packets during ip_rt_redirect_silence;
* reset the algorithm.
*/
@@ -798,7 +818,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
*/
if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
rt->u.dst.rate_last = jiffies;
- return;
+ goto out;
}
/* Check for load limit; set rate_last to the latest sent
@@ -815,6 +835,8 @@ void ip_rt_send_redirect(struct sk_buff *skb)
rt->rt_src, rt->rt_iif, rt->rt_dst, rt->rt_gateway);
#endif
}
+out:
+ in_dev_put(in_dev);
}
static int ip_error(struct sk_buff *skb)
@@ -886,7 +908,7 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
for (i=0; i<2; i++) {
unsigned hash = rt_hash_code(daddr, skeys[i], tos);
- read_lock_bh(&rt_hash_lock);
+ read_lock(&rt_hash_lock);
for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) {
if (rth->key.dst == daddr &&
rth->key.src == skeys[i] &&
@@ -909,6 +931,10 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
if (mtu <= rth->u.dst.pmtu) {
if (mtu < rth->u.dst.pmtu) {
dst_confirm(&rth->u.dst);
+ if (mtu < ip_rt_min_pmtu) {
+ mtu = ip_rt_min_pmtu;
+ rth->u.dst.mxlock |= (1<<RTAX_MTU);
+ }
rth->u.dst.pmtu = mtu;
dst_set_expires(&rth->u.dst, ip_rt_mtu_expires);
}
@@ -916,7 +942,7 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
}
}
}
- read_unlock_bh(&rt_hash_lock);
+ read_unlock(&rt_hash_lock);
}
return est_mtu ? : new_mtu;
}
@@ -925,6 +951,10 @@ void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu)
{
if (dst->pmtu > mtu && mtu >= 68 &&
!(dst->mxlock&(1<<RTAX_MTU))) {
+ if (mtu < ip_rt_min_pmtu) {
+ mtu = ip_rt_min_pmtu;
+ dst->mxlock |= (1<<RTAX_MTU);
+ }
dst->pmtu = mtu;
dst_set_expires(dst, ip_rt_mtu_expires);
}
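
Both PMTU paths above, the ICMP-driven ip_rt_frag_needed() and the generic
ip_rt_update_pmtu(), gain the same guard: a learned MTU below the new
ip_rt_min_pmtu sysctl is raised to that floor, and the MTU metric is locked so
the clamped value cannot be lowered again. Isolated, the clamp is:

/* Clamp a learned path MTU to the configured floor; locking RTAX_MTU
 * marks the metric as fixed, exactly as in the hunks above. */
if (mtu < ip_rt_min_pmtu) {
	mtu = ip_rt_min_pmtu;
	dst->mxlock |= (1 << RTAX_MTU);
}
dst->pmtu = mtu;
dst_set_expires(dst, ip_rt_mtu_expires);

This blunts forged ICMP fragmentation-needed messages that advertise absurdly
small MTUs; the default floor of 512+20+20 leaves room for a 512-byte payload
plus IP and TCP headers.
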
@@ -977,9 +1007,15 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
if (rt->key.iif == 0)
src = rt->rt_src;
- else if (fib_lookup(&rt->key, &res) == 0)
- src = FIB_RES_PREFSRC(res);
- else
+ else if (fib_lookup(&rt->key, &res) == 0) {
+#ifdef CONFIG_IP_ROUTE_NAT
+ if (res.type == RTN_NAT)
+ src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
+ else
+#endif
+ src = FIB_RES_PREFSRC(res);
+ fib_res_put(&res);
+ } else
src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE);
memcpy(addr, &src, 4);
}
@@ -1001,8 +1037,7 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
if (fi) {
if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
rt->rt_gateway = FIB_RES_GW(*res);
- rt->u.dst.mxlock = fi->fib_metrics[RTAX_LOCK-1];
- rt->u.dst.pmtu = fi->fib_mtu;
+ memcpy(&rt->u.dst.mxlock, fi->fib_metrics, sizeof(fi->fib_metrics));
if (fi->fib_mtu == 0) {
rt->u.dst.pmtu = rt->u.dst.dev->mtu;
if (rt->u.dst.pmtu > IP_MAX_MTU)
@@ -1012,8 +1047,6 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
rt->u.dst.pmtu > 576)
rt->u.dst.pmtu = 576;
}
- rt->u.dst.window= fi->fib_window ? : 0;
- rt->u.dst.rtt = fi->fib_rtt ? : TCP_TIMEOUT_INIT;
#ifdef CONFIG_NET_CLS_ROUTE
rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
@@ -1021,9 +1054,12 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
rt->u.dst.pmtu = rt->u.dst.dev->mtu;
if (rt->u.dst.pmtu > IP_MAX_MTU)
rt->u.dst.pmtu = IP_MAX_MTU;
- rt->u.dst.window= 0;
- rt->u.dst.rtt = TCP_TIMEOUT_INIT;
}
+ if (rt->u.dst.advmss == 0)
+ rt->u.dst.advmss = max(rt->u.dst.dev->mtu-40, ip_rt_min_advmss);
+ if (rt->u.dst.advmss > 65535-40)
+ rt->u.dst.advmss = 65535-40;
+
#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
set_class_tag(rt, fib_rules_tclass(res));
@@ -1035,39 +1071,45 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
static int
ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
- u8 tos, struct device *dev, int our)
+ u8 tos, struct net_device *dev, int our)
{
unsigned hash;
struct rtable *rth;
u32 spec_dst;
- struct in_device *in_dev = dev->ip_ptr;
+ struct in_device *in_dev = in_dev_get(dev);
u32 itag = 0;
/* Primary sanity checks. */
- if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
- in_dev == NULL || skb->protocol != __constant_htons(ETH_P_IP))
+ if (in_dev == NULL)
return -EINVAL;
+ if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
+ skb->protocol != __constant_htons(ETH_P_IP))
+ goto e_inval;
+
if (ZERONET(saddr)) {
if (!LOCAL_MCAST(daddr))
- return -EINVAL;
+ goto e_inval;
spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
} else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag) < 0)
- return -EINVAL;
+ goto e_inval;
- rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
+ rth = dst_alloc(&ipv4_dst_ops);
if (!rth)
- return -ENOBUFS;
+ goto e_nobufs;
rth->u.dst.output= ip_rt_bug;
- atomic_set(&rth->u.dst.use, 1);
+ atomic_set(&rth->u.dst.__refcnt, 1);
rth->key.dst = daddr;
rth->rt_dst = daddr;
rth->key.tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
- rth->key.fwmark = skb->fwmark;
+ if (skb->nfreason == NF_REASON_FOR_ROUTING)
+ rth->key.fwmark = skb->nfmark;
+ else
+ rth->key.fwmark = 0;
#endif
rth->key.src = saddr;
rth->rt_src = saddr;
@@ -1081,6 +1123,7 @@ ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
rth->rt_iif =
rth->key.iif = dev->ifindex;
rth->u.dst.dev = &loopback_dev;
+ dev_hold(rth->u.dst.dev);
rth->key.oif = 0;
rth->rt_gateway = daddr;
rth->rt_spec_dst= spec_dst;
@@ -1096,8 +1139,17 @@ ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
rth->u.dst.input = ip_mr_input;
#endif
+ in_dev_put(in_dev);
hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos);
return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+
+e_nobufs:
+ in_dev_put(in_dev);
+ return -ENOBUFS;
+
+e_inval:
+ in_dev_put(in_dev);
+ return -EINVAL;
}
/*
@@ -1111,18 +1163,19 @@ ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
*/
int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
- u8 tos, struct device *dev)
+ u8 tos, struct net_device *dev)
{
struct rt_key key;
struct fib_result res;
- struct in_device *in_dev = dev->ip_ptr;
- struct in_device *out_dev;
+ struct in_device *in_dev = in_dev_get(dev);
+ struct in_device *out_dev = NULL;
unsigned flags = 0;
u32 itag = 0;
struct rtable * rth;
unsigned hash;
u32 spec_dst;
int err = -EINVAL;
+ int free_res = 0;
/*
* IP on this device is disabled.
@@ -1135,7 +1188,10 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
key.src = saddr;
key.tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
- key.fwmark = skb->fwmark;
+ if (skb->nfreason == NF_REASON_FOR_ROUTING)
+ key.fwmark = skb->nfmark;
+ else
+ key.fwmark = 0;
#endif
key.iif = dev->ifindex;
key.oif = 0;
@@ -1165,11 +1221,12 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
/*
* Now we are ready to route packet.
*/
- if ((err = fib_lookup(&key, &res))) {
+ if ((err = fib_lookup(&key, &res)) != 0) {
if (!IN_DEV_FORWARD(in_dev))
- return -EINVAL;
+ goto e_inval;
goto no_route;
}
+ free_res = 1;
#ifdef CONFIG_IP_ROUTE_NAT
/* Policy is applied before mapping destination,
@@ -1183,8 +1240,13 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
if (res.type == RTN_NAT) {
key.dst = fib_rules_map_destination(daddr, &res);
- if (fib_lookup(&key, &res) || res.type != RTN_UNICAST)
- return -EINVAL;
+ fib_res_put(&res);
+ free_res = 0;
+ if (fib_lookup(&key, &res))
+ goto e_inval;
+ free_res = 1;
+ if (res.type != RTN_UNICAST)
+ goto e_inval;
flags |= RTCF_DNAT;
}
key.src = src_map;
@@ -1207,7 +1269,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
}
if (!IN_DEV_FORWARD(in_dev))
- return -EINVAL;
+ goto e_inval;
if (res.type != RTN_UNICAST)
goto martian_destination;
@@ -1215,11 +1277,11 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
if (res.fi->fib_nhs > 1 && key.oif == 0)
fib_select_multipath(&key, &res);
#endif
- out_dev = FIB_RES_DEV(res)->ip_ptr;
+ out_dev = in_dev_get(FIB_RES_DEV(res));
if (out_dev == NULL) {
if (net_ratelimit())
printk(KERN_CRIT "Bug in ip_route_input_slow(). Please, report\n");
- return -EINVAL;
+ goto e_inval;
}
err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst, &itag);
@@ -1239,19 +1301,22 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
* invalid for proxy arp. DNAT routes are always valid.
*/
if (out_dev == in_dev && !(flags&RTCF_DNAT))
- return -EINVAL;
+ goto e_inval;
}
- rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
+ rth = dst_alloc(&ipv4_dst_ops);
if (!rth)
- return -ENOBUFS;
+ goto e_nobufs;
- atomic_set(&rth->u.dst.use, 1);
+ atomic_set(&rth->u.dst.__refcnt, 1);
rth->key.dst = daddr;
rth->rt_dst = daddr;
rth->key.tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
- rth->key.fwmark = skb->fwmark;
+ if (skb->nfreason == NF_REASON_FOR_ROUTING)
+ rth->key.fwmark = skb->nfmark;
+ else
+ rth->key.fwmark = 0;
#endif
rth->key.src = saddr;
rth->rt_src = saddr;
@@ -1265,6 +1330,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
rth->rt_iif =
rth->key.iif = dev->ifindex;
rth->u.dst.dev = out_dev->dev;
+ dev_hold(rth->u.dst.dev);
rth->key.oif = 0;
rth->rt_spec_dst= spec_dst;
@@ -1277,7 +1343,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
#ifdef CONFIG_NET_FASTROUTE
if (netdev_fastroute && !(flags&(RTCF_NAT|RTCF_MASQ|RTCF_DOREDIRECT))) {
- struct device *odev = rth->u.dst.dev;
+ struct net_device *odev = rth->u.dst.dev;
if (odev != dev &&
dev->accept_fastpath &&
odev->mtu >= dev->mtu &&
@@ -1286,11 +1352,19 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
}
#endif
- return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+intern:
+ err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+done:
+ in_dev_put(in_dev);
+ if (out_dev)
+ in_dev_put(out_dev);
+ if (free_res)
+ fib_res_put(&res);
+ return err;
brd_input:
if (skb->protocol != __constant_htons(ETH_P_IP))
- return -EINVAL;
+ goto e_inval;
if (ZERONET(saddr)) {
spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
@@ -1305,18 +1379,21 @@ brd_input:
res.type = RTN_BROADCAST;
local_input:
- rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
+ rth = dst_alloc(&ipv4_dst_ops);
if (!rth)
- return -ENOBUFS;
+ goto e_nobufs;
rth->u.dst.output= ip_rt_bug;
- atomic_set(&rth->u.dst.use, 1);
+ atomic_set(&rth->u.dst.__refcnt, 1);
rth->key.dst = daddr;
rth->rt_dst = daddr;
rth->key.tos = tos;
#ifdef CONFIG_IP_ROUTE_FWMARK
- rth->key.fwmark = skb->fwmark;
+ if (skb->nfreason == NF_REASON_FOR_ROUTING)
+ rth->key.fwmark = skb->nfmark;
+ else
+ rth->key.fwmark = 0;
#endif
rth->key.src = saddr;
rth->rt_src = saddr;
@@ -1330,6 +1407,7 @@ local_input:
rth->rt_iif =
rth->key.iif = dev->ifindex;
rth->u.dst.dev = &loopback_dev;
+ dev_hold(rth->u.dst.dev);
rth->key.oif = 0;
rth->rt_gateway = daddr;
rth->rt_spec_dst= spec_dst;
@@ -1341,7 +1419,7 @@ local_input:
rth->rt_flags &= ~RTCF_LOCAL;
}
rth->rt_type = res.type;
- return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+ goto intern;
no_route:
spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
@@ -1356,7 +1434,13 @@ martian_destination:
if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
printk(KERN_WARNING "martian destination %08x from %08x, dev %s\n", daddr, saddr, dev->name);
#endif
- return -EINVAL;
+e_inval:
+ err = -EINVAL;
+ goto done;
+
+e_nobufs:
+ err = -ENOBUFS;
+ goto done;
martian_source:
#ifdef CONFIG_IP_ROUTE_VERBOSE
@@ -1376,11 +1460,11 @@ martian_source:
}
}
#endif
- return -EINVAL;
+ goto e_inval;
}
int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
- u8 tos, struct device *dev)
+ u8 tos, struct net_device *dev)
{
struct rtable * rth;
unsigned hash;
@@ -1396,12 +1480,14 @@ int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
rth->key.iif == iif &&
rth->key.oif == 0 &&
#ifdef CONFIG_IP_ROUTE_FWMARK
- rth->key.fwmark == skb->fwmark &&
+ rth->key.fwmark
+ == (skb->nfreason == NF_REASON_FOR_ROUTING
+ ? skb->nfmark : 0) &&
#endif
rth->key.tos == tos) {
rth->u.dst.lastuse = jiffies;
- atomic_inc(&rth->u.dst.use);
- atomic_inc(&rth->u.dst.refcnt);
+ dst_hold(&rth->u.dst);
+ rth->u.dst.__use++;
read_unlock_bh(&rt_hash_lock);
skb->dst = (struct dst_entry*)rth;
return 0;
@@ -1421,14 +1507,22 @@ int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
route cache entry is created eventually.
*/
if (MULTICAST(daddr)) {
- int our = ip_check_mc(dev, daddr);
- if (!our
+ struct in_device *in_dev;
+
+ read_lock(&inetdev_lock);
+ if ((in_dev = __in_dev_get(dev)) != NULL) {
+ int our = ip_check_mc(in_dev, daddr);
+ if (our
#ifdef CONFIG_IP_MROUTE
- && (LOCAL_MCAST(daddr) || !dev->ip_ptr ||
- !IN_DEV_MFORWARD((struct in_device*)dev->ip_ptr))
+ || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
#endif
- ) return -EINVAL;
- return ip_route_input_mc(skb, daddr, saddr, tos, dev, our);
+ ) {
+ read_unlock(&inetdev_lock);
+ return ip_route_input_mc(skb, daddr, saddr, tos, dev, our);
+ }
+ }
+ read_unlock(&inetdev_lock);
+ return -EINVAL;
}
return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
@@ -1443,11 +1537,10 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int
struct fib_result res;
unsigned flags = 0;
struct rtable *rth;
- struct device *dev_out = NULL;
+ struct net_device *dev_out = NULL;
unsigned hash;
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- u32 nochecksrc = (tos & RTO_TPROXY);
-#endif
+ int free_res = 0;
+ int err;
tos &= IPTOS_TOS_MASK|RTO_ONLINK;
key.dst = daddr;
@@ -1467,19 +1560,8 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
dev_out = ip_dev_find(saddr);
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- /* If address is not local, test for transparent proxy flag;
- if address is local --- clear the flag.
- */
- if (dev_out == NULL) {
- if (nochecksrc == 0 || inet_addr_type(saddr) != RTN_UNICAST)
- return -EINVAL;
- flags |= RTCF_TPROXY;
- }
-#else
if (dev_out == NULL)
return -EINVAL;
-#endif
/* I removed check for oif == dev_out->oif here.
It was wrong by three reasons:
@@ -1490,9 +1572,6 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int
*/
if (oif == 0 &&
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- dev_out &&
-#endif
(MULTICAST(daddr) || daddr == 0xFFFFFFFF)) {
/* Special hack: user can direct multicasts
and limited broadcast via necessary interface
@@ -1512,14 +1591,18 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int
key.oif = dev_out->ifindex;
goto make_route;
}
+ if (dev_out)
+ dev_put(dev_out);
dev_out = NULL;
}
if (oif) {
dev_out = dev_get_by_index(oif);
if (dev_out == NULL)
return -ENODEV;
- if (dev_out->ip_ptr == NULL)
+ if (__in_dev_get(dev_out) == NULL) {
+ dev_put(dev_out);
return -ENODEV; /* Wrong error code */
+ }
if (LOCAL_MCAST(daddr) || daddr == 0xFFFFFFFF) {
if (!key.src)
@@ -1538,7 +1621,10 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int
key.dst = key.src;
if (!key.dst)
key.dst = key.src = htonl(INADDR_LOOPBACK);
+ if (dev_out)
+ dev_put(dev_out);
dev_out = &loopback_dev;
+ dev_hold(dev_out);
key.oif = loopback_dev.ifindex;
res.type = RTN_LOCAL;
flags |= RTCF_LOCAL;
@@ -1571,17 +1657,25 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int
res.type = RTN_UNICAST;
goto make_route;
}
+ if (dev_out)
+ dev_put(dev_out);
return -ENETUNREACH;
}
+ free_res = 1;
if (res.type == RTN_NAT)
- return -EINVAL;
+ goto e_inval;
if (res.type == RTN_LOCAL) {
if (!key.src)
key.src = key.dst;
+ if (dev_out)
+ dev_put(dev_out);
dev_out = &loopback_dev;
+ dev_hold(dev_out);
key.oif = dev_out->ifindex;
+ if (res.fi)
+ fib_info_put(res.fi);
res.fi = NULL;
flags |= RTCF_LOCAL;
goto make_route;
@@ -1598,43 +1692,53 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int
if (!key.src)
key.src = FIB_RES_PREFSRC(res);
+ if (dev_out)
+ dev_put(dev_out);
dev_out = FIB_RES_DEV(res);
+ dev_hold(dev_out);
key.oif = dev_out->ifindex;
make_route:
if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
- return -EINVAL;
+ goto e_inval;
if (key.dst == 0xFFFFFFFF)
res.type = RTN_BROADCAST;
else if (MULTICAST(key.dst))
res.type = RTN_MULTICAST;
else if (BADCLASS(key.dst) || ZERONET(key.dst))
- return -EINVAL;
+ goto e_inval;
if (dev_out->flags&IFF_LOOPBACK)
flags |= RTCF_LOCAL;
if (res.type == RTN_BROADCAST) {
flags |= RTCF_BROADCAST|RTCF_LOCAL;
- res.fi = NULL;
+ if (res.fi) {
+ fib_info_put(res.fi);
+ res.fi = NULL;
+ }
} else if (res.type == RTN_MULTICAST) {
flags |= RTCF_MULTICAST|RTCF_LOCAL;
- if (!ip_check_mc(dev_out, daddr))
+ read_lock(&inetdev_lock);
+ if (!__in_dev_get(dev_out) || !ip_check_mc(__in_dev_get(dev_out), daddr))
flags &= ~RTCF_LOCAL;
+ read_unlock(&inetdev_lock);
/* If multicast route do not exist use
default one, but do not gateway in this case.
Yes, it is hack.
*/
- if (res.fi && res.prefixlen < 4)
+ if (res.fi && res.prefixlen < 4) {
+ fib_info_put(res.fi);
res.fi = NULL;
+ }
}
- rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
+ rth = dst_alloc(&ipv4_dst_ops);
if (!rth)
- return -ENOBUFS;
+ goto e_nobufs;
- atomic_set(&rth->u.dst.use, 1);
+ atomic_set(&rth->u.dst.__refcnt, 1);
rth->key.dst = daddr;
rth->key.tos = tos;
rth->key.src = saddr;
@@ -1648,6 +1752,7 @@ make_route:
#endif
rth->rt_iif = oif ? : dev_out->ifindex;
rth->u.dst.dev = dev_out;
+ dev_hold(dev_out);
rth->rt_gateway = key.dst;
rth->rt_spec_dst= key.src;
@@ -1662,11 +1767,14 @@ make_route:
if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK))
rth->u.dst.output = ip_mc_output;
#ifdef CONFIG_IP_MROUTE
- if (res.type == RTN_MULTICAST && dev_out->ip_ptr) {
- struct in_device *in_dev = dev_out->ip_ptr;
- if (IN_DEV_MFORWARD(in_dev) && !LOCAL_MCAST(daddr)) {
- rth->u.dst.input = ip_mr_input;
- rth->u.dst.output = ip_mc_output;
+ if (res.type == RTN_MULTICAST) {
+ struct in_device *in_dev = in_dev_get(dev_out);
+ if (in_dev) {
+ if (IN_DEV_MFORWARD(in_dev) && !LOCAL_MCAST(daddr)) {
+ rth->u.dst.input = ip_mr_input;
+ rth->u.dst.output = ip_mc_output;
+ }
+ in_dev_put(in_dev);
}
}
#endif
@@ -1677,7 +1785,20 @@ make_route:
rth->rt_flags = flags;
hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
- return rt_intern_hash(hash, rth, rp);
+ err = rt_intern_hash(hash, rth, rp);
+done:
+ if (free_res)
+ fib_res_put(&res);
+ if (dev_out)
+ dev_put(dev_out);
+ return err;
+
+e_inval:
+ err = -EINVAL;
+ goto done;
+e_nobufs:
+ err = -ENOBUFS;
+ goto done;
}
int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
@@ -1693,16 +1814,12 @@ int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
rth->key.src == saddr &&
rth->key.iif == 0 &&
rth->key.oif == oif &&
-#ifndef CONFIG_IP_TRANSPARENT_PROXY
- rth->key.tos == tos
-#else
!((rth->key.tos^tos)&(IPTOS_TOS_MASK|RTO_ONLINK)) &&
((tos&RTO_TPROXY) || !(rth->rt_flags&RTCF_TPROXY))
-#endif
) {
rth->u.dst.lastuse = jiffies;
- atomic_inc(&rth->u.dst.use);
- atomic_inc(&rth->u.dst.refcnt);
+ dst_hold(&rth->u.dst);
+ rth->u.dst.__use++;
read_unlock_bh(&rt_hash_lock);
*rp = rth;
return 0;
@@ -1725,7 +1842,6 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int no
#ifdef CONFIG_IP_MROUTE
struct rtattr *eptr;
#endif
- struct rtattr *mx;
nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
r = NLMSG_DATA(nlh);
@@ -1758,22 +1874,11 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int no
RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
if (rt->rt_dst != rt->rt_gateway)
RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
- mx = (struct rtattr*)skb->tail;
- RTA_PUT(skb, RTA_METRICS, 0, NULL);
- if (rt->u.dst.mxlock)
- RTA_PUT(skb, RTAX_LOCK, sizeof(unsigned), &rt->u.dst.mxlock);
- if (rt->u.dst.pmtu)
- RTA_PUT(skb, RTAX_MTU, sizeof(unsigned), &rt->u.dst.pmtu);
- if (rt->u.dst.window)
- RTA_PUT(skb, RTAX_WINDOW, sizeof(unsigned), &rt->u.dst.window);
- if (rt->u.dst.rtt)
- RTA_PUT(skb, RTAX_RTT, sizeof(unsigned), &rt->u.dst.rtt);
- mx->rta_len = skb->tail - (u8*)mx;
- if (mx->rta_len == RTA_LENGTH(0))
- skb_trim(skb, (u8*)mx - skb->data);
+ if (rtnetlink_put_metrics(skb, &rt->u.dst.mxlock) < 0)
+ goto rtattr_failure;
ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
- ci.rta_used = atomic_read(&rt->u.dst.refcnt);
- ci.rta_clntref = atomic_read(&rt->u.dst.use);
+ ci.rta_used = rt->u.dst.__use;
+ ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
if (rt->u.dst.expires)
ci.rta_expires = rt->u.dst.expires - jiffies;
else
@@ -1845,8 +1950,8 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
if (iif) {
- struct device *dev;
- dev = dev_get_by_index(iif);
+ struct net_device *dev;
+ dev = __dev_get_by_index(iif);
if (!dev)
return -ENODEV;
skb->protocol = __constant_htons(ETH_P_IP);
@@ -1944,16 +2049,30 @@ int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
return -EINVAL;
}
+static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, int *name, int nlen,
+ void *oldval, size_t *oldlenp,
+ void *newval, size_t newlen,
+ void **context)
+{
+ int delay;
+ if (newlen != sizeof(int))
+ return -EINVAL;
+ if (get_user(delay,(int *)newval))
+ return -EFAULT;
+ rt_cache_flush(delay);
+ return 0;
+}
+
ctl_table ipv4_route_table[] = {
{NET_IPV4_ROUTE_FLUSH, "flush",
- &flush_delay, sizeof(int), 0200, NULL,
- &ipv4_sysctl_rtcache_flush},
+ &flush_delay, sizeof(int), 0644, NULL,
+ &ipv4_sysctl_rtcache_flush, &ipv4_sysctl_rtcache_flush_strategy },
{NET_IPV4_ROUTE_MIN_DELAY, "min_delay",
&ip_rt_min_delay, sizeof(int), 0644, NULL,
- &proc_dointvec_jiffies},
+ &proc_dointvec_jiffies, &sysctl_jiffies},
{NET_IPV4_ROUTE_MAX_DELAY, "max_delay",
&ip_rt_max_delay, sizeof(int), 0644, NULL,
- &proc_dointvec_jiffies},
+ &proc_dointvec_jiffies, &sysctl_jiffies},
{NET_IPV4_ROUTE_GC_THRESH, "gc_thresh",
&ipv4_dst_ops.gc_thresh, sizeof(int), 0644, NULL,
&proc_dointvec},
@@ -1962,13 +2081,13 @@ ctl_table ipv4_route_table[] = {
&proc_dointvec},
{NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval",
&ip_rt_gc_min_interval, sizeof(int), 0644, NULL,
- &proc_dointvec_jiffies},
+ &proc_dointvec_jiffies, &sysctl_jiffies},
{NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout",
&ip_rt_gc_timeout, sizeof(int), 0644, NULL,
- &proc_dointvec_jiffies},
+ &proc_dointvec_jiffies, &sysctl_jiffies},
{NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval",
&ip_rt_gc_interval, sizeof(int), 0644, NULL,
- &proc_dointvec_jiffies},
+ &proc_dointvec_jiffies, &sysctl_jiffies},
{NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load",
&ip_rt_redirect_load, sizeof(int), 0644, NULL,
&proc_dointvec},
@@ -1989,7 +2108,13 @@ ctl_table ipv4_route_table[] = {
&proc_dointvec},
{NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires",
&ip_rt_mtu_expires, sizeof(int), 0644, NULL,
- &proc_dointvec_jiffies},
+ &proc_dointvec_jiffies, &sysctl_jiffies},
+ {NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu",
+ &ip_rt_min_pmtu, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss",
+ &ip_rt_min_advmss, sizeof(int), 0644, NULL,
+ &proc_dointvec},
{0}
};
#endif
@@ -2020,13 +2145,18 @@ static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
#endif
-__initfunc(void ip_rt_init(void))
+void __init ip_rt_init(void)
{
#ifdef CONFIG_PROC_FS
#ifdef CONFIG_NET_CLS_ROUTE
struct proc_dir_entry *ent;
#endif
#endif
+ ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
+ sizeof(struct rtable),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL, NULL);
+
devinet_init();
ip_fib_init();
rt_periodic_timer.function = rt_check_expire;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 655176432..086da77c2 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -9,7 +9,7 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * $Id: syncookies.c,v 1.7 1999/03/17 02:34:57 davem Exp $
+ * $Id: syncookies.c,v 1.9 1999/08/23 06:30:34 davem Exp $
*
* Missing: IPv6 support.
*/
@@ -104,12 +104,20 @@ get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct open_request *req,
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ /* Oops! It was missing; syn_recv_sock decreases it. */
+ tp->syn_backlog++;
+
sk = tp->af_specific->syn_recv_sock(sk, skb, req, dst);
- req->sk = sk;
-
- /* Queue up for accept() */
- tcp_synq_queue(tp, req);
-
+ if (sk) {
+ req->sk = sk;
+
+ /* Queue up for accept() */
+ tcp_synq_queue(tp, req);
+ } else {
+ tp->syn_backlog--;
+ req->class->destructor(req);
+ tcp_openreq_free(req);
+ }
return sk;
}
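
The fix above keeps the SYN backlog accounting balanced: syn_recv_sock()
decrements the counter on success, so get_cookie_sock() increments it up front
and must roll back both the count and the open request when the call fails. The
invariant, sketched with placeholder names (pending, consume, queue_for_accept
and destroy are not the real identifiers):

pending++;				/* callee decrements on success */
child = consume(parent, req);
if (child != NULL) {
	req->sk = child;
	queue_for_accept(req);		/* hand to accept() as before */
} else {
	pending--;			/* failure: restore the count */
	destroy(req);			/* and drop the half-open request */
}
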
@@ -179,7 +187,7 @@ cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt)
opt &&
opt->srr ? opt->faddr : req->af.v4_req.rmt_addr,
req->af.v4_req.loc_addr,
- sk->ip_tos | RTO_CONN,
+ sk->protinfo.af_inet.tos | RTO_CONN,
0)) {
tcp_openreq_free(req);
return NULL;
@@ -187,7 +195,7 @@ cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt)
/* Try to redo what tcp_v4_send_synack did. */
req->window_clamp = rt->u.dst.window;
- tcp_select_initial_window(sock_rspace(sk)/2,req->mss,
+ tcp_select_initial_window(tcp_full_space(sk),req->mss,
&req->rcv_wnd, &req->window_clamp,
0, &rcv_wscale);
req->rcv_wscale = rcv_wscale;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 10f5e9324..1ff1566af 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1,7 +1,7 @@
/*
* sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem.
*
- * $Id: sysctl_net_ipv4.c,v 1.38 1999/01/02 16:51:48 davem Exp $
+ * $Id: sysctl_net_ipv4.c,v 1.40 1999/09/07 02:31:17 davem Exp $
*
* Begun April 1, 1996, Mike Shaver.
* Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS]
@@ -50,7 +50,6 @@ extern int sysctl_tcp_sack;
extern int sysctl_tcp_retrans_collapse;
extern int sysctl_tcp_keepalive_time;
extern int sysctl_tcp_keepalive_probes;
-extern int sysctl_tcp_max_ka_probes;
extern int sysctl_tcp_retries1;
extern int sysctl_tcp_retries2;
extern int sysctl_tcp_fin_timeout;
@@ -60,6 +59,7 @@ extern int sysctl_tcp_stdurg;
extern int sysctl_tcp_rfc1337;
extern int sysctl_tcp_syn_taildrop;
extern int sysctl_max_syn_backlog;
+extern int sysctl_tcp_tw_recycle;
/* From icmp.c */
extern int sysctl_icmp_destunreach_time;
@@ -90,9 +90,23 @@ int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
if (write && ipv4_devconf.forwarding != val)
inet_forward_change();
- return ret;
+ return ret;
}
+static int ipv4_sysctl_forward_strategy(ctl_table *table, int *name, int nlen,
+ void *oldval, size_t *oldlenp,
+ void *newval, size_t newlen,
+ void **context)
+{
+ int new;
+ if (newlen != sizeof(int))
+ return -EINVAL;
+ if (get_user(new,(int *)newval))
+ return -EFAULT;
+ if (new != ipv4_devconf.forwarding)
+ inet_forward_change();
+ return 0; /* caller applies the change again and handles oldval */
+}
ctl_table ipv4_table[] = {
{NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps",
@@ -109,7 +123,7 @@ ctl_table ipv4_table[] = {
&proc_dointvec},
{NET_IPV4_FORWARD, "ip_forward",
&ipv4_devconf.forwarding, sizeof(int), 0644, NULL,
- &ipv4_sysctl_forward},
+ &ipv4_sysctl_forward,&ipv4_sysctl_forward_strategy},
{NET_IPV4_DEFAULT_TTL, "ip_default_ttl",
&ip_statistics.IpDefaultTTL, sizeof(int), 0644, NULL,
&proc_dointvec},
@@ -127,20 +141,18 @@ ctl_table ipv4_table[] = {
&sysctl_ipfrag_low_thresh, sizeof(int), 0644, NULL, &proc_dointvec},
{NET_IPV4_DYNADDR, "ip_dynaddr",
&sysctl_ip_dynaddr, sizeof(int), 0644, NULL, &proc_dointvec},
-#ifdef CONFIG_IP_MASQUERADE
- {NET_IPV4_IP_MASQ_DEBUG, "ip_masq_debug",
- &sysctl_ip_masq_debug, sizeof(int), 0644, NULL, &proc_dointvec},
-#endif
{NET_IPV4_IPFRAG_TIME, "ipfrag_time",
- &sysctl_ipfrag_time, sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
- {NET_IPV4_TCP_MAX_KA_PROBES, "tcp_max_ka_probes",
- &sysctl_tcp_max_ka_probes, sizeof(int), 0644, NULL, &proc_dointvec},
+ &sysctl_ipfrag_time, sizeof(int), 0644, NULL, &proc_dointvec_jiffies,
+ &sysctl_jiffies},
{NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time",
&sysctl_tcp_keepalive_time, sizeof(int), 0644, NULL,
- &proc_dointvec_jiffies},
+ &proc_dointvec_jiffies, &sysctl_jiffies},
{NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes",
&sysctl_tcp_keepalive_probes, sizeof(int), 0644, NULL,
&proc_dointvec},
+ {NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl",
+ &sysctl_tcp_keepalive_intvl, sizeof(int), 0644, NULL,
+ &proc_dointvec_jiffies, &sysctl_jiffies},
{NET_IPV4_TCP_RETRIES1, "tcp_retries1",
&sysctl_tcp_retries1, sizeof(int), 0644, NULL, &proc_dointvec_minmax,
&sysctl_intvec, NULL, NULL, &tcp_retr1_max},
@@ -148,11 +160,15 @@ ctl_table ipv4_table[] = {
&sysctl_tcp_retries2, sizeof(int), 0644, NULL, &proc_dointvec},
{NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout",
&sysctl_tcp_fin_timeout, sizeof(int), 0644, NULL,
- &proc_dointvec_jiffies},
+ &proc_dointvec_jiffies, &sysctl_jiffies},
#ifdef CONFIG_SYN_COOKIES
{NET_TCP_SYNCOOKIES, "tcp_syncookies",
&sysctl_tcp_syncookies, sizeof(int), 0644, NULL, &proc_dointvec},
#endif
+#ifdef CONFIG_TCP_TW_RECYCLE
+ {NET_TCP_TW_RECYCLE, "tcp_tw_recycle",
+ &sysctl_tcp_tw_recycle, sizeof(int), 0644, NULL, &proc_dointvec},
+#endif
{NET_TCP_STDURG, "tcp_stdurg", &sysctl_tcp_stdurg,
sizeof(int), 0644, NULL, &proc_dointvec},
{NET_TCP_RFC1337, "tcp_rfc1337", &sysctl_tcp_rfc1337,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e68569a68..b8e5d197c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp.c,v 1.144 1999/05/27 01:03:37 davem Exp $
+ * Version: $Id: tcp.c,v 1.151 1999/09/07 02:31:21 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -412,6 +412,7 @@
* (Updated by AK, but not complete yet.)
**/
+#include <linux/config.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
@@ -563,6 +564,11 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
if (sk->state == TCP_LISTEN)
return tcp_listen_poll(sk, wait);
+ /* Socket is not locked. We are protected from async events
+ by the poll logic, and correct handling of state changes
+ made by other threads is impossible in any case.
+ */
+
mask = 0;
if (sk->err)
mask = POLLERR;
@@ -607,17 +613,22 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
/*
* Socket write_space callback.
- * This (or rather the sock_wake_async) should agree with poll.
+ * This (or rather the sock_wake_async) should agree with poll.
+ *
+ * WARNING. This callback is called from any context (process,
+ * bh or irq). Do not try to make it any smarter.
*/
void tcp_write_space(struct sock *sk)
{
- if (sk->dead)
- return;
+ read_lock(&sk->callback_lock);
+ if (!sk->dead) {
+		/* Why??!! Does it really not overschedule? --ANK */
+ wake_up_interruptible(sk->sleep);
- wake_up_interruptible(sk->sleep);
- if (sock_wspace(sk) >=
- tcp_min_write_space(sk))
- sock_wake_async(sk->socket, 2);
+ if (sock_wspace(sk) >= tcp_min_write_space(sk))
+ sock_wake_async(sk->socket, 2);
+ }
+ read_unlock(&sk->callback_lock);
}
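
For context: this is one half of a two-lock handshake. The matching writer
side appears in tcp_close() further down in this patch, which takes
callback_lock for writing while marking the socket dead and detaching its
wait queue. A condensed sketch of the pattern (names as in this patch):

	/* Waker (any context: process, bh or irq): */
	read_lock(&sk->callback_lock);
	if (!sk->dead)				/* wait queue still valid */
		wake_up_interruptible(sk->sleep);
	read_unlock(&sk->callback_lock);

	/* Closer (process context, from tcp_close): */
	write_lock_irq(&sk->callback_lock);
	sk->dead   = 1;				/* no waker may touch the queues now */
	sk->socket = NULL;
	sk->sleep  = NULL;
	write_unlock_irq(&sk->callback_lock);

Either the waker sees the socket alive with a valid sk->sleep, or it sees
sk->dead and backs off; the irq-safe write lock is needed because the read
side may run from irq context.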
@@ -657,8 +668,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
/*
* Wait for a socket to get into the connected state
*
- * Note: Must be called with the socket locked, and it
- * runs with the kernel fully unlocked.
+ * Note: Must be called with the socket locked.
*/
static int wait_for_tcp_connect(struct sock * sk, int flags)
{
@@ -679,17 +689,17 @@ static int wait_for_tcp_connect(struct sock * sk, int flags)
if(signal_pending(tsk))
return -ERESTARTSYS;
- tsk->state = TASK_INTERRUPTIBLE;
+ __set_task_state(tsk, TASK_INTERRUPTIBLE);
add_wait_queue(sk->sleep, &wait);
- release_sock(sk);
+ sk->tp_pinfo.af_tcp.write_pending++;
- if (((1 << sk->state) & ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT)) &&
- sk->err == 0)
- schedule();
+ release_sock(sk);
+ schedule();
+ lock_sock(sk);
- tsk->state = TASK_RUNNING;
+ __set_task_state(tsk, TASK_RUNNING);
remove_wait_queue(sk->sleep, &wait);
- lock_sock(sk);
+ sk->tp_pinfo.af_tcp.write_pending--;
}
return 0;
}
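
The reordering above is the canonical lost-wakeup-safe sleep: the task marks
itself TASK_INTERRUPTIBLE and joins the wait queue before dropping the socket
lock, so a wakeup that fires between release_sock() and schedule() simply
makes schedule() return immediately. A generic sketch of the idiom
(hypothetical condition() predicate; the other calls are the ones used above):

	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(sk->sleep, &wait);
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (condition(sk))		/* hypothetical predicate */
			break;
		release_sock(sk);
		schedule();			/* wakeup cannot be lost: state was set first */
		lock_sock(sk);
		if (signal_pending(current))
			break;
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(sk->sleep, &wait);

The write_pending counter bracketing the sleep presumably lets the input path
know that a connect() sleeper exists and must be woken.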
@@ -701,33 +711,33 @@ static inline int tcp_memory_free(struct sock *sk)
/*
* Wait for more memory for a socket
- *
- * NOTE: This runs with the kernel fully unlocked.
*/
static void wait_for_tcp_memory(struct sock * sk)
{
- release_sock(sk);
if (!tcp_memory_free(sk)) {
DECLARE_WAITQUEUE(wait, current);
sk->socket->flags &= ~SO_NOSPACE;
add_wait_queue(sk->sleep, &wait);
for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
if (signal_pending(current))
break;
- current->state = TASK_INTERRUPTIBLE;
if (tcp_memory_free(sk))
break;
if (sk->shutdown & SEND_SHUTDOWN)
break;
if (sk->err)
break;
- schedule();
+ release_sock(sk);
+ if (!tcp_memory_free(sk))
+ schedule();
+ lock_sock(sk);
}
current->state = TASK_RUNNING;
remove_wait_queue(sk->sleep, &wait);
}
- lock_sock(sk);
}
/* When all user supplied data has been queued set the PSH bit */
@@ -749,8 +759,6 @@ int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
int mss_now;
int err, copied;
- lock_sock(sk);
-
err = 0;
tp = &(sk->tp_pinfo.af_tcp);
@@ -974,7 +982,6 @@ do_fault2:
err = -EFAULT;
out:
tcp_push_pending_frames(sk, tp);
- release_sock(sk);
return err;
}
@@ -1010,9 +1017,6 @@ static int tcp_recv_urg(struct sock * sk, int nonblock,
if (sk->urginline || !tp->urg_data || tp->urg_data == URG_READ)
return -EINVAL; /* Yes this is right ! */
- if (sk->err)
- return sock_error(sk);
-
if (sk->done)
return -ENOTCONN;
@@ -1021,14 +1025,13 @@ static int tcp_recv_urg(struct sock * sk, int nonblock,
return 0;
}
- lock_sock(sk);
if (tp->urg_data & URG_VALID) {
int err = 0;
char c = tp->urg_data;
if (!(flags & MSG_PEEK))
tp->urg_data = URG_READ;
-
+
if(msg->msg_name)
tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
msg->msg_name);
@@ -1038,21 +1041,15 @@ static int tcp_recv_urg(struct sock * sk, int nonblock,
/* Read urgent data. */
msg->msg_flags|=MSG_OOB;
- release_sock(sk);
- if(len>0)
- {
+ if(len>0) {
err = memcpy_toiovec(msg->msg_iov, &c, 1);
- /* N.B. already set above ... */
- msg->msg_flags|=MSG_OOB;
- }
- else
+ len = 1;
+ } else
msg->msg_flags|=MSG_TRUNC;
-
- /* N.B. Is this right?? If len == 0 we didn't read any data */
- return err ? -EFAULT : 1;
+
+ return err ? -EFAULT : len;
}
- release_sock(sk);
/* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
* the available implementations agree in this case:
@@ -1072,7 +1069,7 @@ static int tcp_recv_urg(struct sock * sk, int nonblock,
static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
{
__skb_unlink(skb, &sk->receive_queue);
- kfree_skb(skb);
+ __kfree_skb(skb);
}
/* Clean up the receive buffer for full frames taken by the user,
@@ -1114,6 +1111,30 @@ static void cleanup_rbuf(struct sock *sk, int copied)
}
}
+/* Now socket state, including sk->err, is changed only under the
+   socket lock; hence we only need to check for pending signals.
+ */
+
+static void tcp_data_wait(struct sock *sk)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue(sk->sleep, &wait);
+
+ __set_current_state(TASK_INTERRUPTIBLE);
+
+ sk->socket->flags |= SO_WAITDATA;
+ release_sock(sk);
+
+ if (skb_queue_empty(&sk->receive_queue))
+ schedule();
+
+ lock_sock(sk);
+ sk->socket->flags &= ~SO_WAITDATA;
+
+ remove_wait_queue(sk->sleep, &wait);
+ __set_current_state(TASK_RUNNING);
+}
/*
* This routine copies from a sock struct into the user buffer.
@@ -1123,23 +1144,25 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
int len, int nonblock, int flags, int *addr_len)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- DECLARE_WAITQUEUE(wait, current);
int copied = 0;
u32 peek_seq;
volatile u32 *seq; /* So gcc doesn't overoptimise */
unsigned long used;
- int err = 0;
+ int err;
int target = 1; /* Read at least this many bytes */
+ lock_sock(sk);
+
if (sk->err)
- return sock_error(sk);
+ goto out_err;
+ err = -ENOTCONN;
if (sk->state == TCP_LISTEN)
- return -ENOTCONN;
+ goto out;
/* Urgent data needs to be handled specially. */
if (flags & MSG_OOB)
- return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);
+ goto recv_urg;
/* Copying sequence to update. This is volatile to handle
* the multi-reader case neatly (memcpy_to/fromfs might be
@@ -1149,13 +1172,11 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
seq = &tp->copied_seq;
if (flags & MSG_PEEK)
seq = &peek_seq;
-
+
/* Handle the POSIX bogosity MSG_WAITALL. */
if (flags & MSG_WAITALL)
target=len;
- add_wait_queue(sk->sleep, &wait);
- lock_sock(sk);
/*
* BUG BUG BUG
@@ -1185,7 +1206,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
}
/* Next get a buffer. */
- current->state = TASK_INTERRUPTIBLE;
skb = skb_peek(&sk->receive_queue);
do {
@@ -1215,16 +1235,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
if (copied >= target)
break;
- /*
- These three lines and clause if (sk->state == TCP_CLOSE)
- are unlikely to be correct, if target > 1.
- I DO NOT FIX IT, because I have no idea, what
- POSIX prescribes to make here. Probably, it really
- wants to lose data 8), if not all target is received.
- --ANK
- */
if (sk->err && !(flags&MSG_PEEK)) {
- copied = sock_error(sk);
+ if (!copied)
+ copied = sock_error(sk);
break;
}
@@ -1238,7 +1251,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
sk->done = 1;
break;
}
- copied = -ENOTCONN;
+ if (!copied)
+ copied = -ENOTCONN;
break;
}
@@ -1248,11 +1262,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
}
cleanup_rbuf(sk, copied);
- release_sock(sk);
- sk->socket->flags |= SO_WAITDATA;
- schedule();
- sk->socket->flags &= ~SO_WAITDATA;
- lock_sock(sk);
+ tcp_data_wait(sk);
continue;
found_ok_skb:
@@ -1339,20 +1349,28 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
break;
}
- if (copied > 0 && msg->msg_name)
+ if (copied >= 0 && msg->msg_name)
tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
msg->msg_name);
if(addr_len)
*addr_len = tp->af_specific->sockaddr_len;
- remove_wait_queue(sk->sleep, &wait);
- current->state = TASK_RUNNING;
-
/* Clean up data we have read: This will do ACK frames. */
cleanup_rbuf(sk, copied);
release_sock(sk);
return copied;
+
+out_err:
+ err = sock_error(sk);
+
+out:
+ release_sock(sk);
+ return err;
+
+recv_urg:
+ err = tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);
+ goto out;
}
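
With lock_sock() hoisted to the top of tcp_recvmsg(), every exit now funnels
through the labels above, so the lock is dropped exactly once on all paths,
including the new MSG_OOB branch. The same single-exit idiom, isolated as a
hypothetical skeleton (not part of this patch):

	/* Hypothetical single-exit locking skeleton, mirroring the
	 * structure tcp_recvmsg() now uses.
	 */
	static int locked_op(struct sock *sk)
	{
		int err;

		lock_sock(sk);

		err = sock_error(sk);	/* sk->err is stable under the lock */
		if (err)
			goto out;

		err = 0;		/* the actual work would go here */
	out:
		release_sock(sk);	/* single unlock on every path */
		return err;
	}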
/*
@@ -1360,8 +1378,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
*/
static inline void tcp_check_fin_timer(struct sock *sk)
{
- if (sk->state == TCP_FIN_WAIT2 && !sk->timer.prev)
- tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout);
+ if (sk->state == TCP_FIN_WAIT2)
+ tcp_reset_keepalive_timer(sk, sysctl_tcp_fin_timeout);
}
/*
@@ -1423,7 +1441,6 @@ void tcp_shutdown(struct sock *sk, int how)
return;
/* If we've already sent a FIN, or it's a closed state, skip this. */
- lock_sock(sk);
if ((1 << sk->state) &
(TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
@@ -1431,7 +1448,6 @@ void tcp_shutdown(struct sock *sk, int how)
if (tcp_close_state(sk,0))
tcp_send_fin(sk);
}
- release_sock(sk);
}
@@ -1447,7 +1463,7 @@ static inline int closing(struct sock * sk)
/*
* This routine closes sockets which have been at least partially
* opened, but not yet accepted. Currently it is only called by
- * tcp_close, and timeout mirrors the value there.
+ * tcp_close.
*/
static void tcp_close_pending (struct sock *sk)
@@ -1463,47 +1479,85 @@ static void tcp_close_pending (struct sock *sk)
iter = req;
req = req->dl_next;
-
+
+ if (iter->sk) {
+ sk->ack_backlog--;
+ } else {
+ tcp_dec_slow_timer(TCP_SLT_SYNACK);
+ tp->syn_backlog--;
+ }
(*iter->class->destructor)(iter);
- tcp_dec_slow_timer(TCP_SLT_SYNACK);
- sk->ack_backlog--;
tcp_openreq_free(iter);
}
-
+ BUG_TRAP(tp->syn_backlog == 0);
+ BUG_TRAP(sk->ack_backlog == 0);
tcp_synq_init(tp);
}
+static __inline__ void tcp_kill_sk_queues(struct sock *sk)
+{
+ /* First the read buffer. */
+ skb_queue_purge(&sk->receive_queue);
+
+ /* Next, the error queue. */
+ skb_queue_purge(&sk->error_queue);
+
+ /* Next, the write queue. */
+ BUG_TRAP(skb_queue_empty(&sk->write_queue));
+
+ /* It is _impossible_ for the backlog to contain anything
+ * when we get here. All user references to this socket
+	 * have gone away; only the net layer can still touch it.
+ */
+}
+
+/*
+ * At this point, there should be no process reference to this
+ * socket, and thus no user references at all. Therefore we
+ * can assume the socket waitqueue is inactive and nobody will
+ * try to jump onto it.
+ */
+void tcp_destroy_sock(struct sock *sk)
+{
+ BUG_TRAP(sk->state==TCP_CLOSE);
+ BUG_TRAP(sk->dead);
+
+ /* It cannot be in hash table! */
+ BUG_TRAP(sk->pprev==NULL);
+
+	/* If it has a non-zero sk->num, it must be bound. */
+ BUG_TRAP(!sk->num || sk->prev!=NULL);
+
+ sk->prot->destroy(sk);
+
+ tcp_kill_sk_queues(sk);
+
+#ifdef INET_REFCNT_DEBUG
+ if (atomic_read(&sk->refcnt) != 1) {
+ printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", sk, atomic_read(&sk->refcnt));
+ }
+#endif
+
+ sock_put(sk);
+}
+
void tcp_close(struct sock *sk, long timeout)
{
struct sk_buff *skb;
int data_was_unread = 0;
- /* We need to grab some memory, and put together a FIN,
- * and then put it into the queue to be sent.
- */
lock_sock(sk);
if(sk->state == TCP_LISTEN) {
- /* Special case. */
tcp_set_state(sk, TCP_CLOSE);
+
+ /* Special case. */
tcp_close_pending(sk);
- release_sock(sk);
- sk->dead = 1;
- return;
- }
- unlock_kernel();
+ goto adjudge_to_death;
+ }
- /* It is questionable, what the role of this is now.
- * In any event either it should be removed, or
- * increment of SLT_KEEPALIVE be done, this is causing
- * big problems. For now I comment it out. -DaveM
- */
- /* sk->keepopen = 1; */
sk->shutdown = SHUTDOWN_MASK;
- if (!sk->dead)
- sk->state_change(sk);
-
/* We need to flush the recv. buffs. We do this only on the
* descriptor close, not protocol-sourced closes, because the
* reader process may not have drained the data yet!
@@ -1526,7 +1580,7 @@ void tcp_close(struct sock *sk, long timeout)
if(data_was_unread != 0) {
/* Unread data was tossed, zap the connection. */
tcp_set_state(sk, TCP_CLOSE);
- tcp_send_active_reset(sk);
+ tcp_send_active_reset(sk, GFP_KERNEL);
} else if (tcp_close_state(sk,1)) {
/* We FIN if the application ate all the data before
* zapping the connection.
@@ -1541,13 +1595,13 @@ void tcp_close(struct sock *sk, long timeout)
add_wait_queue(sk->sleep, &wait);
while (1) {
- tsk->state = TASK_INTERRUPTIBLE;
+ set_current_state(TASK_INTERRUPTIBLE);
if (!closing(sk))
break;
release_sock(sk);
timeout = schedule_timeout(timeout);
lock_sock(sk);
- if (signal_pending(tsk) || !timeout)
+			if (signal_pending(tsk) || !timeout)
break;
}
@@ -1560,10 +1614,97 @@ void tcp_close(struct sock *sk, long timeout)
*/
tcp_check_fin_timer(sk);
+adjudge_to_death:
+ /* It is the last release_sock in its life. It will remove backlog. */
+ release_sock(sk);
+
+
+ /* Now socket is owned by kernel and we acquire BH lock
+ to finish close. No need to check for user refs.
+ */
+ local_bh_disable();
+ bh_lock_sock(sk);
+ BUG_TRAP(sk->lock.users==0);
+
+ sock_hold(sk);
+
+ /* Announce socket dead, detach it from wait queue and inode. */
+ write_lock_irq(&sk->callback_lock);
sk->dead = 1;
+ sk->socket = NULL;
+ sk->sleep = NULL;
+ write_unlock_irq(&sk->callback_lock);
- release_sock(sk);
- lock_kernel();
+ if (sk->state == TCP_CLOSE)
+ tcp_destroy_sock(sk);
+ /* Otherwise, socket is reprieved until protocol close. */
+
+ bh_unlock_sock(sk);
+ local_bh_enable();
+ sock_put(sk);
+}
+
+int tcp_disconnect(struct sock *sk, int flags)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ int old_state;
+ int err = 0;
+
+ old_state = sk->state;
+ if (old_state != TCP_CLOSE)
+ tcp_set_state(sk, TCP_CLOSE);
+
+ /* ABORT function of RFC793 */
+ if (old_state == TCP_LISTEN) {
+ tcp_close_pending(sk);
+ } else if (tcp_connected(old_state)) {
+ tcp_send_active_reset(sk, GFP_KERNEL);
+ sk->err = ECONNRESET;
+ } else if (old_state == TCP_SYN_SENT)
+ sk->err = ECONNRESET;
+
+ tcp_clear_xmit_timers(sk);
+ __skb_queue_purge(&sk->receive_queue);
+ __skb_queue_purge(&sk->write_queue);
+ __skb_queue_purge(&tp->out_of_order_queue);
+
+ sk->dport = 0;
+
+ sk->rcv_saddr = 0;
+ sk->saddr = 0;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ memset(&sk->net_pinfo.af_inet6.saddr, 0, 16);
+ memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16);
+#endif
+
+ sk->zapped = 0;
+ sk->shutdown = 0;
+ sk->done = 0;
+ sk->write_space = tcp_write_space;
+ tp->srtt = 0;
+#ifdef CONFIG_TCP_TW_RECYCLE
+ if ((tp->write_seq += 2) == 0)
+ tp->write_seq = 1;
+#else
+ tp->write_seq = 0;
+#endif
+ tp->ato = 0;
+ tp->backoff = 0;
+ tp->snd_cwnd = 2;
+ tp->probes_out = 0;
+ tp->high_seq = 0;
+ tp->snd_ssthresh = 0x7fffffff;
+ tp->snd_cwnd_cnt = 0;
+ tp->dup_acks = 0;
+ tp->delayed_acks = 0;
+ tp->send_head = tp->retrans_head = NULL;
+ tp->saw_tstamp = 0;
+ __sk_dst_reset(sk);
+
+ BUG_TRAP(!sk->num || sk->prev);
+
+ sk->error_report(sk);
+ return err;
}
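
tcp_disconnect() gives TCP the RFC 793 ABORT primitive without destroying the
socket. From user space the usual way to reach prot->disconnect() is to
connect() an already-connected socket to an address whose family is AF_UNSPEC
(the long-standing BSD "dissolve association" convention). A hedged sketch in
plain userspace C, with a hypothetical helper name:

	#include <string.h>
	#include <sys/socket.h>

	/* Dissolve the association on a connected TCP socket,
	 * keeping the descriptor itself usable for a new connect().
	 */
	static int tcp_dissolve(int fd)
	{
		struct sockaddr sa;

		memset(&sa, 0, sizeof(sa));
		sa.sa_family = AF_UNSPEC;	/* routed to prot->disconnect() */
		return connect(fd, &sa, sizeof(sa));
	}

Note how the write_seq bump under CONFIG_TCP_TW_RECYCLE keeps sequence
numbers moving forward across reincarnations of the same port pair.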
/*
@@ -1614,20 +1755,19 @@ static struct open_request * wait_for_connect(struct sock * sk,
* Be careful about race conditions here - this is subtle.
*/
-struct sock *tcp_accept(struct sock *sk, int flags)
+struct sock *tcp_accept(struct sock *sk, int flags, int *err)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
struct open_request *req, *prev;
- struct sock *newsk = NULL;
+ struct sock *newsk;
int error;
- unlock_kernel();
lock_sock(sk);
/* We need to make sure that this socket is listening,
* and that it has something pending.
*/
- error = EINVAL;
+ error = -EINVAL;
if (sk->state != TCP_LISTEN)
goto out;
@@ -1635,13 +1775,13 @@ struct sock *tcp_accept(struct sock *sk, int flags)
req = tcp_find_established(tp, &prev);
if (!req) {
/* If this is a non blocking socket don't sleep */
- error = EAGAIN;
+ error = -EAGAIN;
if (flags & O_NONBLOCK)
goto out;
-
- error = ERESTARTSYS;
+
+ error = -ERESTARTSYS;
req = wait_for_connect(sk, &prev);
- if (!req)
+ if (!req)
goto out;
}
@@ -1650,20 +1790,13 @@ struct sock *tcp_accept(struct sock *sk, int flags)
req->class->destructor(req);
tcp_openreq_free(req);
sk->ack_backlog--;
- if(sk->keepopen)
- tcp_inc_slow_timer(TCP_SLT_KEEPALIVE);
release_sock(sk);
- lock_kernel();
return newsk;
out:
- /* sk should be in LISTEN state, thus accept can use sk->err for
- * internal purposes without stomping on anyone's feed.
- */
- sk->err = error;
release_sock(sk);
- lock_kernel();
- return newsk;
+ *err = error;
+ return NULL;
}
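
The signature change moves the error code out of sk->err, which a LISTEN
socket must keep clean for its own use, into an explicit out parameter. The
AF-level caller is then expected to do roughly the following (condensed,
hypothetical fragment):

	int err = 0;
	struct sock *newsk = sk->prot->accept(sk, flags, &err);

	if (newsk == NULL)
		return err;	/* -EINVAL, -EAGAIN or -ERESTARTSYS */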
/*
@@ -1675,36 +1808,43 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
int val;
+ int err = 0;
if (level != SOL_TCP)
return tp->af_specific->setsockopt(sk, level, optname,
optval, optlen);
-
+
if(optlen<sizeof(int))
return -EINVAL;
- if (get_user(val, (int *)optval))
+ if (get_user(val, (int *)optval))
return -EFAULT;
+ lock_sock(sk);
+
switch(optname) {
case TCP_MAXSEG:
/* values greater than interface MTU won't take effect. however at
* the point when this call is done we typically don't yet know
* which interface is going to be used
*/
- if(val < 1 || val > MAX_WINDOW)
- return -EINVAL;
+ if(val < 1 || val > MAX_WINDOW) {
+ err = -EINVAL;
+ break;
+ }
tp->user_mss = val;
- return 0;
+ break;
case TCP_NODELAY:
/* You cannot try to use this and TCP_CORK in
* tandem, so let the user know.
*/
- if (sk->nonagle == 2)
- return -EINVAL;
+ if (sk->nonagle == 2) {
+ err = -EINVAL;
+ break;
+ }
sk->nonagle = (val == 0) ? 0 : 1;
- return 0;
+ break;
case TCP_CORK:
/* When set indicates to always queue non-full frames.
@@ -1718,22 +1858,59 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
* You cannot try to use TCP_NODELAY and this mechanism
* at the same time, so let the user know.
*/
- if (sk->nonagle == 1)
- return -EINVAL;
+ if (sk->nonagle == 1) {
+ err = -EINVAL;
+ break;
+ }
if (val != 0) {
sk->nonagle = 2;
} else {
sk->nonagle = 0;
- lock_sock(sk);
tcp_push_pending_frames(sk, tp);
- release_sock(sk);
}
- return 0;
+ break;
+
+ case TCP_KEEPIDLE:
+ if (val < 1 || val > MAX_TCP_KEEPIDLE)
+ err = -EINVAL;
+ else {
+ tp->keepalive_time = val * HZ;
+ if (sk->keepopen) {
+ __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
+ if (tp->keepalive_time > elapsed)
+ elapsed = tp->keepalive_time - elapsed;
+ else
+ elapsed = 0;
+ tcp_reset_keepalive_timer(sk, elapsed);
+ }
+ }
+ break;
+ case TCP_KEEPINTVL:
+ if (val < 1 || val > MAX_TCP_KEEPINTVL)
+ err = -EINVAL;
+ else
+ tp->keepalive_intvl = val * HZ;
+ break;
+ case TCP_KEEPCNT:
+ if (val < 1 || val > MAX_TCP_KEEPCNT)
+ err = -EINVAL;
+ else
+ tp->keepalive_probes = val;
+ break;
+ case TCP_SYNCNT:
+ if (val < 1 || val > MAX_TCP_SYNCNT)
+ err = -EINVAL;
+ else
+ tp->syn_retries = val;
+ break;
default:
- return -ENOPROTOOPT;
+ err = -ENOPROTOOPT;
+ break;
};
+ release_sock(sk);
+ return err;
}
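
The four new options make keepalive policy per-socket instead of global;
user-space values are in seconds (scaled by HZ above), except for the probe
and SYN counts. A sketch of intended use from an application, with
hypothetical numbers:

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <sys/socket.h>

	static int enable_keepalive(int fd)
	{
		int on = 1, idle = 600, intvl = 60, cnt = 5;

		/* SO_KEEPALIVE arms the mechanism; TCP_KEEP* tune it. */
		if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) < 0)
			return -1;
		/* First probe after 600s idle, then every 60s, give up after 5. */
		if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle)) < 0 ||
		    setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl)) < 0 ||
		    setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt)) < 0)
			return -1;
		return 0;
	}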
int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
@@ -1761,6 +1938,30 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
case TCP_CORK:
val = (sk->nonagle == 2);
break;
+ case TCP_KEEPIDLE:
+ if (tp->keepalive_time)
+ val = tp->keepalive_time / HZ;
+ else
+ val = sysctl_tcp_keepalive_time / HZ;
+ break;
+ case TCP_KEEPINTVL:
+ if (tp->keepalive_intvl)
+ val = tp->keepalive_intvl / HZ;
+ else
+ val = sysctl_tcp_keepalive_intvl / HZ;
+ break;
+ case TCP_KEEPCNT:
+ if (tp->keepalive_probes)
+ val = tp->keepalive_probes;
+ else
+ val = sysctl_tcp_keepalive_probes;
+ break;
+ case TCP_SYNCNT:
+ if (tp->syn_retries)
+ val = tp->syn_retries;
+ else
+ val = sysctl_tcp_syn_retries;
+ break;
default:
return -ENOPROTOOPT;
};
@@ -1772,13 +1973,6 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
return 0;
}
-void tcp_set_keepalive(struct sock *sk, int val)
-{
- if (!sk->keepopen && val)
- tcp_inc_slow_timer(TCP_SLT_KEEPALIVE);
- else if (sk->keepopen && !val)
- tcp_dec_slow_timer(TCP_SLT_KEEPALIVE);
-}
extern void __skb_cb_too_small_for_tcp(int, int);
@@ -1786,7 +1980,7 @@ void __init tcp_init(void)
{
struct sk_buff *skb = NULL;
unsigned long goal;
- int order;
+ int order, i;
if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
@@ -1818,31 +2012,51 @@ void __init tcp_init(void)
*
* The methodology is similar to that of the buffer cache.
*/
- goal = num_physpages >> (20 - PAGE_SHIFT);
- for(order = 5; (1UL << order) < goal; order++)
+ goal = num_physpages >> (23 - PAGE_SHIFT);
+
+ for(order = 0; (1UL << order) < goal; order++)
;
do {
tcp_ehash_size = (1UL << order) * PAGE_SIZE /
- sizeof(struct sock *);
- tcp_ehash = (struct sock **)
+ sizeof(struct tcp_ehash_bucket);
+ tcp_ehash_size >>= 1;
+ while (tcp_ehash_size & (tcp_ehash_size-1))
+ tcp_ehash_size--;
+ tcp_ehash = (struct tcp_ehash_bucket *)
__get_free_pages(GFP_ATOMIC, order);
- } while (tcp_ehash == NULL && --order > 4);
+ } while (tcp_ehash == NULL && --order > 0);
if (!tcp_ehash)
panic("Failed to allocate TCP established hash table\n");
- memset(tcp_ehash, 0, tcp_ehash_size * sizeof(struct sock *));
+ for (i = 0; i < (tcp_ehash_size<<1); i++) {
+ tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
+ tcp_ehash[i].chain = NULL;
+ }
do {
tcp_bhash_size = (1UL << order) * PAGE_SIZE /
- sizeof(struct tcp_bind_bucket *);
- tcp_bhash = (struct tcp_bind_bucket **)
+ sizeof(struct tcp_bind_hashbucket);
+ if ((tcp_bhash_size > (64 * 1024)) && order > 0)
+ continue;
+ tcp_bhash = (struct tcp_bind_hashbucket *)
__get_free_pages(GFP_ATOMIC, order);
- } while (tcp_bhash == NULL && --order > 4);
+ } while (tcp_bhash == NULL && --order >= 0);
if (!tcp_bhash)
panic("Failed to allocate TCP bind hash table\n");
- memset(tcp_bhash, 0, tcp_bhash_size * sizeof(struct tcp_bind_bucket *));
+ for (i = 0; i < tcp_bhash_size; i++) {
+ tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
+ tcp_bhash[i].chain = NULL;
+ }
+
+ if (order > 4) {
+ sysctl_local_port_range[0] = 32768;
+ sysctl_local_port_range[1] = 61000;
+ } else if (order < 3) {
+ sysctl_local_port_range[0] = 1024*(3-order);
+ }
+ tcp_port_rover = sysctl_local_port_range[0] - 1;
printk("TCP: Hash tables configured (established %d bind %d)\n",
- tcp_ehash_size, tcp_bhash_size);
+ tcp_ehash_size<<1, tcp_bhash_size);
}
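
The sizing loop above converts a page-order allocation into a bucket count,
halves it (the second half of the established table holds TIME-WAIT buckets,
see __tcp_tw_hashdance below), and then rounds down to a power of two so that
lookups can mask the hash value instead of dividing. Isolated, the rounding
step is the same technique as the tcp_ehash_size loop:

	/* Round n down to the nearest power of two; n must be > 0.
	 * A number is a power of two iff (n & (n - 1)) == 0.
	 */
	static unsigned long round_down_pow2(unsigned long n)
	{
		while (n & (n - 1))
			n--;
		return n;
	}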
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3080bc201..f0711fccc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_input.c,v 1.170 1999/07/02 11:26:28 davem Exp $
+ * Version: $Id: tcp_input.c,v 1.173 1999/09/07 02:31:27 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -61,6 +61,7 @@
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <net/tcp.h>
+#include <net/inet_common.h>
#include <linux/ipsec.h>
#ifdef CONFIG_SYSCTL
@@ -70,6 +71,7 @@
#endif
extern int sysctl_tcp_fin_timeout;
+extern int sysctl_tcp_keepalive_time;
/* These are on by default so the code paths get tested.
* For the final 2.2 this may be undone at our discretion. -DaveM
@@ -81,6 +83,7 @@ int sysctl_tcp_sack = 1;
int sysctl_tcp_syncookies = SYNC_INIT;
int sysctl_tcp_stdurg;
int sysctl_tcp_rfc1337;
+int sysctl_tcp_tw_recycle;
static int prune_queue(struct sock *sk);
@@ -133,7 +136,7 @@ static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th,
	/* Tiny-grams with PSH set artificially deflate our
* ato measurement, but with a lower bound.
*/
- if(th->psh && (skb->len < (tp->mss_cache >> 1))) {
+ if(th->psh && (skb->len < (tp->rcv_mss >> 1))) {
/* Preserve the quickack state. */
if((tp->ato & 0x7fffffff) > HZ/50)
tp->ato = ((tp->ato & 0x80000000) |
@@ -187,6 +190,9 @@ static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
static __inline__ void tcp_set_rto(struct tcp_opt *tp)
{
tp->rto = (tp->srtt >> 3) + tp->mdev;
+	/* I am not educated enough to understand this magic.
+	 * However, it smells bad. snd_cwnd > 31 is the common case.
+ */
tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1));
}
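
For reference, assuming the usual scaling in tcp_rtt_estimator (srtt kept
scaled by 8, mdev by 4), the first line is the classical Jacobson
retransmission timer:

	RTO = SRTT + 4 * MDEV  ==  (tp->srtt >> 3) + tp->mdev

The extra term the new comment objects to adds rto/4 plus
rto >> (snd_cwnd - 1); once snd_cwnd - 1 reaches the width of the type, the
shift is undefined in C, which is presumably why a common-case snd_cwnd > 31
"smells bad".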
@@ -209,42 +215,196 @@ static __inline__ void tcp_bound_rto(struct tcp_opt *tp)
tp->rto = HZ/5;
}
-/* WARNING: this must not be called if tp->saw_timestamp was false. */
-extern __inline__ void tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp,
- __u32 start_seq, __u32 end_seq)
+/* Save metrics learned by this TCP session.
+   This function is called only when TCP finishes successfully,
+   i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
+ */
+static void tcp_update_metrics(struct sock *sk)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst) {
+ int m;
+
+ if (tp->backoff || !tp->srtt) {
+ /* This session failed to estimate rtt. Why?
+ * Probably, no packets returned in time.
+ * Reset our results.
+ */
+ if (!(dst->mxlock&(1<<RTAX_RTT)))
+ dst->rtt = 0;
+ return;
+ }
+
+ dst_confirm(dst);
+
+ m = dst->rtt - tp->srtt;
+
+		/* If the newly calculated rtt is larger than the stored one,
+		 * store the new one. Otherwise, use an EWMA. Remember,
+		 * rtt overestimation is always better than underestimation.
+ */
+ if (!(dst->mxlock&(1<<RTAX_RTT))) {
+ if (m <= 0)
+ dst->rtt = tp->srtt;
+ else
+ dst->rtt -= (m>>3);
+ }
+
+ if (!(dst->mxlock&(1<<RTAX_RTTVAR))) {
+ if (m < 0)
+ m = -m;
+
+ /* Scale deviation to rttvar fixed point */
+ m >>= 1;
+ if (m < tp->mdev)
+ m = tp->mdev;
+
+ if (m >= dst->rttvar)
+ dst->rttvar = m;
+ else
+ dst->rttvar -= (dst->rttvar - m)>>2;
+ }
+
+ if (tp->snd_ssthresh == 0x7FFFFFFF) {
+ /* Slow start still did not finish. */
+ if (dst->ssthresh &&
+ !(dst->mxlock&(1<<RTAX_SSTHRESH)) &&
+ tp->snd_cwnd > dst->ssthresh)
+ dst->ssthresh = tp->snd_cwnd;
+ if (!(dst->mxlock&(1<<RTAX_CWND)) &&
+ tp->snd_cwnd > dst->cwnd)
+ dst->cwnd = tp->snd_cwnd;
+ } else if (tp->snd_cwnd >= tp->snd_ssthresh && !tp->high_seq) {
+ /* Cong. avoidance phase, cwnd is reliable. */
+ if (!(dst->mxlock&(1<<RTAX_SSTHRESH)))
+ dst->ssthresh = tp->snd_cwnd;
+ if (!(dst->mxlock&(1<<RTAX_CWND)))
+ dst->cwnd = (dst->cwnd + tp->snd_cwnd)>>1;
+ } else {
+			/* Else slow start did not finish, cwnd is nonsense,
+			   and ssthresh may be invalid as well.
+ */
+ if (!(dst->mxlock&(1<<RTAX_CWND)))
+ dst->cwnd = (dst->cwnd + tp->snd_ssthresh)>>1;
+ if (dst->ssthresh &&
+ !(dst->mxlock&(1<<RTAX_SSTHRESH)) &&
+ tp->snd_ssthresh > dst->ssthresh)
+ dst->ssthresh = tp->snd_ssthresh;
+ }
+ }
+}
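
The smoothing above is a plain EWMA with gain 1/8: with
m = dst->rtt - tp->srtt, the update dst->rtt -= (m >> 3) works out to

	rtt_new = rtt_old - (rtt_old - srtt) / 8
	        = (7/8) * rtt_old + (1/8) * srtt

and it is applied only when the cached value is larger than the session's
estimate; in the other direction the new value is taken outright since, as
the comment says, overestimating RTT is safer than underestimating it. The
rttvar branch uses the same idea with gain 1/4.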
+
+/* Initialize metrics on socket. */
+
+static void tcp_init_metrics(struct sock *sk)
{
- /* From draft-ietf-tcplw-high-performance: the correct
- * test is last_ack_sent <= end_seq.
- * (RFC1323 stated last_ack_sent < end_seq.)
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst == NULL)
+ goto reset;
+
+ dst_confirm(dst);
+
+ if (dst->rtt == 0)
+ goto reset;
+
+ if (!tp->srtt || !tp->saw_tstamp)
+ goto reset;
+
+ /* Initial rtt is determined from SYN,SYN-ACK.
+	 * The segment is small and the rtt may appear much
+	 * smaller than the real one. Use per-dst memory
+	 * to make it more realistic.
*
- * HOWEVER: The current check contradicts the draft statements.
- * It has been done for good reasons.
- * The implemented check improves security and eliminates
- * unnecessary RTT overestimation.
- * 1998/06/27 Andrey V. Savochkin <saw@msu.ru>
+	 * A bit of theory. RTT is the time that passes after a "normal"
+	 * sized packet is sent until it is ACKed. In normal circumstances
+	 * sending small packets forces the peer to delay ACKs, so the
+	 * calculation is correct there too. The algorithm is adaptive and,
+	 * provided we follow the specs, it NEVER underestimates RTT. BUT!
+	 * If the peer tries some clever tricks, sort of "quick acks" for
+	 * long enough to decrease the RTT to a low value, and then abruptly
+	 * stops doing so and starts delaying ACKs, expect trouble.
*/
- if (!before(end_seq, tp->last_ack_sent - sk->rcvbuf) &&
- !after(start_seq, tp->rcv_wup + tp->rcv_wnd)) {
+ if (dst->rtt > tp->srtt)
+ tp->srtt = dst->rtt;
+ if (dst->rttvar > tp->mdev)
+ tp->mdev = dst->rttvar;
+ tcp_set_rto(tp);
+ tcp_bound_rto(tp);
+
+ if (dst->mxlock&(1<<RTAX_CWND))
+ tp->snd_cwnd_clamp = dst->cwnd;
+ if (dst->ssthresh) {
+ tp->snd_ssthresh = dst->ssthresh;
+ if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
+ tp->snd_ssthresh = tp->snd_cwnd_clamp;
+ }
+ return;
+
+
+reset:
+	/* Play it conservative. If timestamps are not
+	 * supported, TCP will fail to recalculate a correct
+	 * rtt if the initial rto is too small. FORGET ALL AND RESET!
+ */
+ if (!tp->saw_tstamp && tp->srtt) {
+ tp->srtt = 0;
+ tp->mdev = TCP_TIMEOUT_INIT;
+ tp->rto = TCP_TIMEOUT_INIT;
+ }
+}
+
+#define PAWS_24DAYS (60 * 60 * 24 * 24)
+
+
+/* WARNING: this must not be called if tp->saw_tstamp was false. */
+extern __inline__ void
+tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp, u32 seq)
+{
+ if (!after(seq, tp->last_ack_sent)) {
/* PAWS bug workaround wrt. ACK frames, the PAWS discard
* extra check below makes sure this can only happen
* for pure ACK frames. -DaveM
+ *
+		 * Not only that; it also occurs for expired timestamps
+		 * and for RSTs with a bad timestamp option. --ANK
*/
- if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) {
+
+ if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0 ||
+ xtime.tv_sec >= tp->ts_recent_stamp + PAWS_24DAYS) {
tp->ts_recent = tp->rcv_tsval;
- tp->ts_recent_stamp = tcp_time_stamp;
+ tp->ts_recent_stamp = xtime.tv_sec;
}
}
}
-#define PAWS_24DAYS (HZ * 60 * 60 * 24 * 24)
-
-extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, unsigned len)
+extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct sk_buff *skb)
{
- /* ts_recent must be younger than 24 days */
- return (((s32)(tcp_time_stamp - tp->ts_recent_stamp) >= PAWS_24DAYS) ||
- (((s32)(tp->rcv_tsval - tp->ts_recent) < 0) &&
- /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM */
- (len != (th->doff * 4))));
+ return ((s32)(tp->rcv_tsval - tp->ts_recent) < 0 &&
+ xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS
+
+ /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
+
+	   I cannot watch quietly as the whole idea behind PAWS
+	   is destroyed 8)
+
+	   The problem is only in reordering duplicate ACKs.
+	   Hence, we can check this rare case more carefully.
+
+	   1. Check that it is really a duplicate ACK (ack == snd_una)
+	   2. Give it some small "replay" window (~RTO)
+
+	   We do not know the units of foreign ts values, but we make the
+	   conservative assumption that they are >= 1ms. This solves the
+	   problem noted in Dave's mail to tcpimpl and does not harm PAWS. --ANK
+ */
+ && (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq ||
+ TCP_SKB_CB(skb)->ack_seq != tp->snd_una ||
+ !skb->h.th->ack ||
+ (s32)(tp->ts_recent - tp->rcv_tsval) > (tp->rto*1024)/HZ));
}
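
All of the timestamp tests above lean on serial-number arithmetic:
subtracting two u32 values and interpreting the result as s32 compares them
modulo 2^32, which stays correct across wraparound as long as the two values
differ by less than 2^31. Isolated as a hypothetical helper:

	/* "a is strictly before b" in 32-bit timestamp/sequence space.
	 * Valid while |a - b| < 2^31; this is the same trick as the
	 * before()/after() sequence-number macros.
	 */
	static inline int ts_before(__u32 a, __u32 b)
	{
		return (s32)(a - b) < 0;
	}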
@@ -283,13 +443,14 @@ static void tcp_reset(struct sock *sk)
case TCP_CLOSE_WAIT:
sk->err = EPIPE;
break;
+ case TCP_CLOSE:
+ return;
default:
sk->err = ECONNRESET;
};
tcp_set_state(sk, TCP_CLOSE);
- sk->shutdown = SHUTDOWN_MASK;
- if (!sk->dead)
- sk->state_change(sk);
+ tcp_clear_xmit_timers(sk);
+ tcp_done(sk);
}
/* This tags the retransmission queue when SACKs arrive. */
@@ -345,7 +506,6 @@ void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, i
{
unsigned char *ptr;
int length=(th->doff*4)-sizeof(struct tcphdr);
- int saw_mss = 0;
ptr = (unsigned char *)(th + 1);
tp->saw_tstamp = 0;
@@ -370,11 +530,11 @@ void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, i
case TCPOPT_MSS:
if(opsize==TCPOLEN_MSS && th->syn) {
u16 in_mss = ntohs(*(__u16 *)ptr);
- if (in_mss == 0)
- in_mss = 536;
- if (tp->mss_clamp > in_mss)
+ if (in_mss) {
+ if (tp->user_mss && tp->user_mss < in_mss)
+ in_mss = tp->user_mss;
tp->mss_clamp = in_mss;
- saw_mss = 1;
+ }
}
break;
case TCPOPT_WINDOW:
@@ -428,8 +588,6 @@ void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, i
length-=opsize;
};
}
- if(th->syn && saw_mss == 0)
- tp->mss_clamp = 536;
}
/* Fast parse options. This hopes to only see timestamps.
@@ -448,8 +606,10 @@ static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th,
if (*ptr == __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
| (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
tp->saw_tstamp = 1;
- tp->rcv_tsval = ntohl(*++ptr);
- tp->rcv_tsecr = ntohl(*++ptr);
+ ++ptr;
+ tp->rcv_tsval = ntohl(*ptr);
+ ++ptr;
+ tp->rcv_tsecr = ntohl(*ptr);
return 1;
}
}
@@ -461,6 +621,7 @@ static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th,
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
+#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged new data. */
static __inline__ void clear_fast_retransmit(struct tcp_opt *tp)
{
@@ -498,6 +659,8 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
tp->dup_acks++;
if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) {
tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+ if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
+ tp->snd_ssthresh = tp->snd_cwnd_clamp;
tp->snd_cwnd = (tp->snd_ssthresh + 3);
tp->high_seq = tp->snd_nxt;
if(!tp->fackets_out)
@@ -595,11 +758,12 @@ static __inline__ void tcp_cong_avoid(struct tcp_opt *tp)
* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
*/
if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- tp->snd_cwnd++;
+ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+ tp->snd_cwnd++;
tp->snd_cwnd_cnt=0;
} else
tp->snd_cwnd_cnt++;
- }
+ }
}
/* Remove acknowledged frames from the retransmission queue. */
@@ -645,9 +809,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack,
if(tp->fackets_out)
tp->fackets_out--;
} else {
+ acked |= FLAG_SYN_ACKED;
/* This is pure paranoia. */
tp->retrans_head = NULL;
- }
+ }
tp->packets_out--;
*seq = scb->seq;
*seq_rtt = now - scb->when;
@@ -721,7 +886,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp,
* See draft-ietf-tcplw-high-performance-00, section 3.3.
* 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
*/
- if (!(flag & FLAG_DATA_ACKED))
+ if (!(flag & (FLAG_DATA_ACKED|FLAG_SYN_ACKED)))
return;
seq_rtt = tcp_time_stamp - tp->rcv_tsecr;
@@ -856,7 +1021,7 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th,
* where the network delay has increased suddenly.
* I.e. Karn's algorithm. (SIGCOMM '87, p5.)
*/
- if (flag & FLAG_DATA_ACKED) {
+ if (flag & (FLAG_DATA_ACKED|FLAG_SYN_ACKED)) {
if(!(flag & FLAG_RETRANS_DATA_ACKED)) {
tp->backoff = 0;
tcp_rtt_estimator(tp, seq_rtt);
@@ -910,37 +1075,50 @@ uninteresting_ack:
}
/* New-style handling of TIME_WAIT sockets. */
-extern void tcp_tw_schedule(struct tcp_tw_bucket *tw);
-extern void tcp_tw_reschedule(struct tcp_tw_bucket *tw);
-extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw);
/* Must be called only from BH context. */
void tcp_timewait_kill(struct tcp_tw_bucket *tw)
{
- struct tcp_bind_bucket *tb = tw->tb;
-
- SOCKHASH_LOCK_WRITE_BH();
-
- /* Disassociate with bind bucket. */
- if(tw->bind_next)
- tw->bind_next->bind_pprev = tw->bind_pprev;
- *(tw->bind_pprev) = tw->bind_next;
- if (tb->owners == NULL) {
- if (tb->next)
- tb->next->pprev = tb->pprev;
- *(tb->pprev) = tb->next;
- kmem_cache_free(tcp_bucket_cachep, tb);
- }
+ struct tcp_ehash_bucket *ehead;
+ struct tcp_bind_hashbucket *bhead;
+ struct tcp_bind_bucket *tb;
/* Unlink from established hashes. */
+ ehead = &tcp_ehash[tw->hashent];
+ write_lock(&ehead->lock);
+ if (!tw->pprev) {
+ write_unlock(&ehead->lock);
+ return;
+ }
if(tw->next)
tw->next->pprev = tw->pprev;
- *tw->pprev = tw->next;
+ *(tw->pprev) = tw->next;
+ tw->pprev = NULL;
+ write_unlock(&ehead->lock);
- SOCKHASH_UNLOCK_WRITE_BH();
+ /* Disassociate with bind bucket. */
+ bhead = &tcp_bhash[tcp_bhashfn(tw->num)];
+ spin_lock(&bhead->lock);
+ if ((tb = tw->tb) != NULL) {
+ if(tw->bind_next)
+ tw->bind_next->bind_pprev = tw->bind_pprev;
+ *(tw->bind_pprev) = tw->bind_next;
+ tw->tb = NULL;
+ if (tb->owners == NULL) {
+ if (tb->next)
+ tb->next->pprev = tb->pprev;
+ *(tb->pprev) = tb->next;
+ kmem_cache_free(tcp_bucket_cachep, tb);
+ }
+ }
+ spin_unlock(&bhead->lock);
- /* Ok, now free it up. */
- kmem_cache_free(tcp_timewait_cachep, tw);
+#ifdef INET_REFCNT_DEBUG
+ if (atomic_read(&tw->refcnt) != 1) {
+ printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw, atomic_read(&tw->refcnt));
+ }
+#endif
+ tcp_tw_put(tw);
}
/* We come here as a special case from the AF specific TCP input processing,
@@ -949,9 +1127,36 @@ void tcp_timewait_kill(struct tcp_tw_bucket *tw)
* entire timeout period. The only special cases are for BSD TIME_WAIT
* reconnects and SYN/RST bits being set in the TCP header.
*/
-int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
- struct tcphdr *th, unsigned len)
+
+/*
+ * * The main purpose of the TIME-WAIT state is to close the connection
+ *   gracefully, when one of the ends sits in LAST-ACK or CLOSING,
+ *   retransmitting its FIN (and, probably, a tail of data) and one or
+ *   more of our ACKs are lost.
+ * * What is the TIME-WAIT timeout? It is associated with the maximal
+ *   packet lifetime in the internet, which leads to the wrong conclusion
+ *   that it is set to catch "old duplicate segments" wandering out of
+ *   their path. That is not quite correct. This timeout is calculated
+ *   so that it exceeds the maximal retransmission timeout by enough to
+ *   allow for the loss of one (or more) segments sent by the peer and
+ *   of our ACKs. This time may be calculated from the RTO.
+ * * When a TIME-WAIT socket receives an RST, it means that the other
+ *   end has finally closed, and we are allowed to kill TIME-WAIT too.
+ * * The second purpose of TIME-WAIT is catching old duplicate segments.
+ *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
+ *   with these semantics, we MUST NOT kill the TIME-WAIT state with RSTs.
+ * * If we invented some cleverer way to catch duplicates
+ *   (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
+ *
+ * The algorithm below is based on a FORMAL INTERPRETATION of the RFCs.
+ * When you compare it to the RFCs, please read the section SEGMENT
+ * ARRIVES from the very beginning.
+ */
+enum tcp_tw_status
+tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
+ struct tcphdr *th, unsigned len)
{
+ struct tcp_opt tp;
+ int paws_reject = 0;
+
/* RFC 1122:
* "When a connection is [...] on TIME-WAIT state [...]
* [a TCP] MAY accept a new SYN from the remote TCP to
@@ -965,58 +1170,101 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
* (2) returns to TIME-WAIT state if the SYN turns out
* to be an old duplicate".
*/
- if(th->syn && !th->rst && after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt)) {
- struct sock *sk;
- struct tcp_func *af_specific = tw->af_specific;
- __u32 isn;
- int ret;
-
- isn = tw->rcv_nxt + 128000;
- if(isn == 0)
- isn++;
- tcp_tw_deschedule(tw);
- tcp_timewait_kill(tw);
- sk = af_specific->get_sock(skb, th);
- if(sk == NULL ||
- !ipsec_sk_policy(sk,skb))
- return 0;
- bh_lock_sock(sk);
+ tp.saw_tstamp = 0;
+ if (th->doff > (sizeof(struct tcphdr)>>2) && tw->ts_recent_stamp) {
+ tcp_parse_options(NULL, th, &tp, 0);
+
+ paws_reject = tp.saw_tstamp &&
+ ((s32)(tp.rcv_tsval - tw->ts_recent) < 0 &&
+ xtime.tv_sec < tw->ts_recent_stamp + PAWS_24DAYS);
+ }
+
+ if (!paws_reject &&
+ (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
+ TCP_SKB_CB(skb)->seq == tw->rcv_nxt)) {
+ /* In window segment, it may be only reset or bare ack. */
- /* Default is to discard the frame. */
- ret = 0;
+ if (th->rst) {
+#ifdef CONFIG_TCP_TW_RECYCLE
+ /* When recycling, always follow rfc1337,
+			 * but mark the bucket as ready for recycling immediately.
+ */
+ if (sysctl_tcp_tw_recycle) {
+ /* May kill it now. */
+ tw->rto = 0;
+ tw->ttd = jiffies;
+ } else
+#endif
+			/* This is TIME_WAIT assassination, in two flavors.
+ * Oh well... nobody has a sufficient solution to this
+ * protocol bug yet.
+ */
+ if(sysctl_tcp_rfc1337 == 0) {
+ tcp_tw_deschedule(tw);
+ tcp_timewait_kill(tw);
+ }
+ } else {
+ tcp_tw_reschedule(tw);
+ }
+
+ if (tp.saw_tstamp) {
+ tw->ts_recent = tp.rcv_tsval;
+ tw->ts_recent_stamp = xtime.tv_sec;
+ }
+ tcp_tw_put(tw);
+ return TCP_TW_SUCCESS;
+ }
+
+ /* Out of window segment.
- if(sk->lock.users)
- goto out_unlock;
+ All the segments are ACKed immediately.
- skb_set_owner_r(skb, sk);
- af_specific = sk->tp_pinfo.af_tcp.af_specific;
+	   The only exception is a new SYN. We accept it only if it is
+	   not an old duplicate and we are not in danger of being killed
+	   by delayed old duplicates. The RFC check (that it carries a
+	   newer sequence number) only works at rates < 40Mbit/sec.
+	   However, if PAWS works, it is reliable, and we may even
+	   relax the silly seq space cutoff.
- if(af_specific->conn_request(sk, skb, isn) < 0)
- ret = 1; /* Toss a reset back. */
- out_unlock:
- bh_unlock_sock(sk);
- return ret;
+	   RED-PEN: we violate the main RFC requirement here: if this SYN
+	   turns out to be an old duplicate (i.e. we receive an RST in reply
+	   to our SYN-ACK), we must return the socket to the time-wait state.
+	   It is not good, but not fatal yet.
+ */
+
+ if (th->syn && !th->rst && !th->ack && !paws_reject &&
+ (after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt) ||
+ (tp.saw_tstamp && tw->ts_recent != tp.rcv_tsval))) {
+ u32 isn = tw->snd_nxt + 2;
+ if (isn == 0)
+ isn++;
+ TCP_SKB_CB(skb)->when = isn;
+ return TCP_TW_SYN;
}
- /* Check RST or SYN */
- if(th->rst || th->syn) {
- /* This is TIME_WAIT assasination, in two flavors.
- * Oh well... nobody has a sufficient solution to this
- * protocol bug yet.
+ if(!th->rst) {
+ /* In this case we must reset the TIMEWAIT timer.
+
+	   If it is an ACKless SYN it may be either an old duplicate
+	   or a new good SYN with a random sequence number < rcv_nxt.
+	   Do not reschedule in the latter case.
*/
- if(sysctl_tcp_rfc1337 == 0) {
- tcp_tw_deschedule(tw);
- tcp_timewait_kill(tw);
- }
- if(!th->rst)
- return 1; /* toss a reset back */
- } else {
- /* In this case we must reset the TIMEWAIT timer. */
- if(th->ack)
+ if (paws_reject || th->ack) {
tcp_tw_reschedule(tw);
+#ifdef CONFIG_TCP_TW_RECYCLE
+ tw->rto = min(120*HZ, tw->rto<<1);
+ tw->ttd = jiffies + tw->rto;
+#endif
+ }
+
+	/* Send ACK. Note: we do not put the bucket;
+	 * it will be released by the caller.
+ */
+ return TCP_TW_ACK;
}
- return 0; /* Discard the frame. */
+ tcp_tw_put(tw);
+ return TCP_TW_SUCCESS;
}
/* Enter the time wait state. This is always called from BH
@@ -1024,37 +1272,54 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
* relevant info into it from the SK, and mess with hash chains
* and list linkage.
*/
-static __inline__ void tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
+static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
{
+ struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->hashent];
+ struct tcp_bind_hashbucket *bhead;
struct sock **head, *sktw;
- /* Step 1: Remove SK from established hash. */
- if(sk->next)
- sk->next->pprev = sk->pprev;
- *sk->pprev = sk->next;
- sk->pprev = NULL;
- tcp_reg_zap(sk);
-
- /* Step 2: Put TW into bind hash where SK was. */
- tw->tb = (struct tcp_bind_bucket *)sk->prev;
- if((tw->bind_next = sk->bind_next) != NULL)
- sk->bind_next->bind_pprev = &tw->bind_next;
- tw->bind_pprev = sk->bind_pprev;
- *sk->bind_pprev = (struct sock *)tw;
- sk->prev = NULL;
+ write_lock(&ehead->lock);
- /* Step 3: Un-charge protocol socket in-use count. */
- sk->prot->inuse--;
+ /* Step 1: Remove SK from established hash. */
+ if (sk->pprev) {
+ if(sk->next)
+ sk->next->pprev = sk->pprev;
+ *sk->pprev = sk->next;
+ sk->pprev = NULL;
+ }
- /* Step 4: Hash TW into TIMEWAIT half of established hash table. */
- head = &tcp_ehash[sk->hashent + (tcp_ehash_size >> 1)];
+ /* Step 2: Hash TW into TIMEWAIT half of established hash table. */
+ head = &(ehead + tcp_ehash_size)->chain;
sktw = (struct sock *)tw;
if((sktw->next = *head) != NULL)
(*head)->pprev = &sktw->next;
*head = sktw;
sktw->pprev = head;
+ atomic_inc(&tw->refcnt);
+
+ write_unlock(&ehead->lock);
+
+ /* Step 3: Put TW into bind hash. Original socket stays there too.
+	   Note that any socket with sk->num != 0 MUST be bound in the
+	   binding cache, even if it is closed.
+ */
+ bhead = &tcp_bhash[tcp_bhashfn(sk->num)];
+ spin_lock(&bhead->lock);
+ tw->tb = (struct tcp_bind_bucket *)sk->prev;
+ BUG_TRAP(sk->prev!=NULL);
+ if ((tw->bind_next = tw->tb->owners) != NULL)
+ tw->tb->owners->bind_pprev = &tw->bind_next;
+ tw->tb->owners = (struct sock*)tw;
+ tw->bind_pprev = &tw->tb->owners;
+ spin_unlock(&bhead->lock);
+
+ /* Step 4: Un-charge protocol socket in-use count. */
+ sk->prot->inuse--;
}
+/*
+ * Move a socket to time-wait.
+ */
void tcp_time_wait(struct sock *sk)
{
struct tcp_tw_bucket *tw;
@@ -1071,8 +1336,16 @@ void tcp_time_wait(struct sock *sk)
tw->dport = sk->dport;
tw->family = sk->family;
tw->reuse = sk->reuse;
+ tw->hashent = sk->hashent;
tw->rcv_nxt = sk->tp_pinfo.af_tcp.rcv_nxt;
- tw->af_specific = sk->tp_pinfo.af_tcp.af_specific;
+ tw->snd_nxt = sk->tp_pinfo.af_tcp.snd_nxt;
+ tw->ts_recent = sk->tp_pinfo.af_tcp.ts_recent;
+ tw->ts_recent_stamp= sk->tp_pinfo.af_tcp.ts_recent_stamp;
+#ifdef CONFIG_TCP_TW_RECYCLE
+ tw->rto = sk->tp_pinfo.af_tcp.rto;
+ tw->ttd = jiffies + 2*tw->rto;
+#endif
+ atomic_set(&tw->refcnt, 0);
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
if(tw->family == PF_INET6) {
@@ -1085,9 +1358,7 @@ void tcp_time_wait(struct sock *sk)
}
#endif
/* Linkage updates. */
- SOCKHASH_LOCK_WRITE();
- tcp_tw_hashdance(sk, tw);
- SOCKHASH_UNLOCK_WRITE();
+ __tcp_tw_hashdance(sk, tw);
/* Get the TIME_WAIT timeout firing. */
tcp_tw_schedule(tw);
@@ -1096,8 +1367,6 @@ void tcp_time_wait(struct sock *sk)
if(sk->state == TCP_ESTABLISHED)
tcp_statistics.TcpCurrEstab--;
sk->state = TCP_CLOSE;
- net_reset_timer(sk, TIME_DONE,
- min(sk->tp_pinfo.af_tcp.srtt * 2, TCP_DONE_TIME));
} else {
/* Sorry, we're out of memory, just CLOSE this
* socket up. We've got bigger problems than
@@ -1106,10 +1375,9 @@ void tcp_time_wait(struct sock *sk)
tcp_set_state(sk, TCP_CLOSE);
}
- /* Prevent rcvmsg/sndmsg calls, and wake people up. */
- sk->shutdown = SHUTDOWN_MASK;
- if(!sk->dead)
- sk->state_change(sk);
+ tcp_update_metrics(sk);
+ tcp_clear_xmit_timers(sk);
+ tcp_done(sk);
}
/*
@@ -1134,7 +1402,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
tcp_send_ack(sk);
if (!sk->dead) {
- sk->state_change(sk);
+ wake_up_interruptible(sk->sleep);
sock_wake_async(sk->socket, 1);
}
@@ -1143,8 +1411,6 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
case TCP_ESTABLISHED:
/* Move to CLOSE_WAIT */
tcp_set_state(sk, TCP_CLOSE_WAIT);
- if (th->rst)
- sk->shutdown = SHUTDOWN_MASK;
break;
case TCP_CLOSE_WAIT:
@@ -1161,12 +1427,6 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
/* This case occurs when a simultaneous close
* happens, we must ack the received FIN and
* enter the CLOSING state.
- *
- * This causes a WRITE timeout, which will either
- * move on to TIME_WAIT when we timeout, or resend
- * the FIN properly (maybe we get rid of that annoying
- * FIN lost hang). The TIME_WRITE code is already
- * correct for handling this timeout.
*/
tcp_set_state(sk, TCP_CLOSING);
break;
@@ -1423,7 +1683,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
/* Turn on fast path. */
if (skb_queue_len(&tp->out_of_order_queue) == 0)
tp->pred_flags = htonl(((tp->tcp_header_len >> 2) << 28) |
- (0x10 << 16) |
+ ntohl(TCP_FLAG_ACK) |
tp->snd_wnd);
return;
}
@@ -1545,8 +1805,8 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
* Now tell the user we may have some data.
*/
if (!sk->dead) {
- SOCK_DEBUG(sk, "Data wakeup.\n");
- sk->data_ready(sk,0);
+ wake_up_interruptible(sk->sleep);
+ sock_wake_async(sk->socket,1);
}
return(1);
}
@@ -1575,28 +1835,59 @@ static __inline__ void tcp_data_snd_check(struct sock *sk)
/*
* Adapt the MSS value used to make delayed ack decision to the
- * real world.
+ * real world.
+ *
+ * The constant 536 hasn't any good meaning. In IPv4 world
+ * MTU may be smaller, though it contradicts to RFC1122, which
+ * states that MSS must be at least 536.
+ * We use the constant to do not ACK each second
+ * packet in a stream of tiny size packets.
+ * It means that super-low mtu links will be aggressively delacked.
+ * Seems, it is even good. If they have so low mtu, they are weirdly
+ * slow.
+ *
+ * AK: BTW it may be useful to add an option to lock the rcv_mss.
+ * this way the beowulf people wouldn't need ugly patches to get the
+ * ack frequencies they want and it would be an elegant way to tune delack.
*/
static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- unsigned int len = skb->len, lss;
+ unsigned int len, lss;
- if (len > tp->rcv_mss)
- tp->rcv_mss = len;
lss = tp->last_seg_size;
tp->last_seg_size = 0;
- if (len >= 536) {
- if (len == lss)
- tp->rcv_mss = len;
- tp->last_seg_size = len;
+
+ /* skb->len may jitter because of SACKs, even if peer
+ * sends good full-sized frames.
+ */
+ len = skb->len;
+ if (len >= tp->rcv_mss) {
+ tp->rcv_mss = len;
+ } else {
+		/* Otherwise, we make a more careful check, taking into
+		 * account that the SACK block length is variable.
+ *
+ * "len" is invariant segment length, including TCP header.
+ */
+ len = skb->tail - skb->h.raw;
+ if (len >= 536 + sizeof(struct tcphdr)) {
+ /* Subtract also invariant (if peer is RFC compliant),
+ * tcp header plus fixed timestamp option length.
+ * Resulting "len" is MSS free of SACK jitter.
+ */
+ len -= tp->tcp_header_len;
+ if (len == lss)
+ tp->rcv_mss = len;
+ tp->last_seg_size = len;
+ }
}
}
/*
* Check if sending an ack is needed.
*/
-static __inline__ void __tcp_ack_snd_check(struct sock *sk)
+static __inline__ void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
@@ -1621,12 +1912,12 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk)
/* We entered "quick ACK" mode or... */
tcp_in_quickack_mode(tp) ||
/* We have out of order data */
- (skb_peek(&tp->out_of_order_queue) != NULL)) {
+ (ofo_possible && (skb_peek(&tp->out_of_order_queue) != NULL))) {
/* Then ack it now */
tcp_send_ack(sk);
} else {
/* Else, send delayed ack. */
- tcp_send_delayed_ack(tp, HZ/2);
+ tcp_send_delayed_ack(sk, HZ/2);
}
}
@@ -1637,7 +1928,7 @@ static __inline__ void tcp_ack_snd_check(struct sock *sk)
/* We sent a data segment already. */
return;
}
- __tcp_ack_snd_check(sk);
+ __tcp_ack_snd_check(sk, 1);
}
@@ -1767,6 +2058,13 @@ static int prune_queue(struct sock *sk)
* complex for anyones sanity. So we don't do it anymore. But
* if we are really having our buffer space abused we stop accepting
* new receive data.
+ *
+ * FIXME: it should recompute SACK state and only remove enough
+ * buffers to get into bounds again. The current scheme loses
+ * badly sometimes on links with large RTT, especially when
+ * the driver has high overhead per skb.
+ * (increasing the rcvbuf is not enough because it inflates the
+ * window too, effectively disabling flow control) -AK
*/
if(atomic_read(&sk->rmem_alloc) < (sk->rcvbuf << 1))
return 0;
@@ -1782,7 +2080,7 @@ static int prune_queue(struct sock *sk)
* disabled when:
* - A zero window was announced from us - zero window probing
* is only handled properly in the slow path.
- * - Out of order segments arrived.
+ * - Out of order segments arrived.
* - Urgent data is expected.
* - There is no buffer space left
* - Unexpected TCP flags/window values/header lengths are received
@@ -1790,6 +2088,7 @@ static int prune_queue(struct sock *sk)
* - Data is sent in both directions. Fast path only supports pure senders
* or pure receivers (this means either the sequence number or the ack
* value must stay constant)
+ * - Unexpected TCP option.
*
* When these conditions are not satisfied it drops into a standard
* receive procedure patterned after RFC793 to handle all cases.
@@ -1801,12 +2100,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- int queued;
- u32 flg;
/*
* Header prediction.
- * The code follows the one in the famous
+	 * The code loosely follows the one in the famous
* "30 instruction TCP receive" Van Jacobson mail.
*
* Van's trick is to deposit buffers into socket queue
@@ -1819,39 +2116,63 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
* We do checksum and copy also but from device to kernel.
*/
- /*
- * RFC1323: H1. Apply PAWS check first.
- */
- if (tcp_fast_parse_options(sk, th, tp)) {
- if (tp->saw_tstamp) {
- if (tcp_paws_discard(tp, th, len)) {
- tcp_statistics.TcpInErrs++;
- if (!th->rst) {
- tcp_send_ack(sk);
- goto discard;
- }
- }
- tcp_replace_ts_recent(sk, tp,
- TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(skb)->end_seq);
- }
- }
- flg = *(((u32 *)th) + 3) & ~htonl(0xFC8 << 16);
+ /* RED-PEN. Using static variables to pass function arguments
+	 * cannot be a good idea...
+ */
+ tp->saw_tstamp = 0;
/* pred_flags is 0xS?10 << 16 + snd_wnd
* if header_predition is to be made
* 'S' will always be tp->tcp_header_len >> 2
- * '?' will be 0 else it will be !0
- * (when there are holes in the receive
+ * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
+ * turn it off (when there are holes in the receive
* space for instance)
* PSH flag is ignored.
- */
+ */
+
+ if ((tcp_flag_word(th) & ~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) == tp->pred_flags &&
+ TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+ int tcp_header_len = th->doff*4;
+
+ /* Timestamp header prediction */
+
+ /* Non-standard header f.e. SACKs -> slow path */
+ if (tcp_header_len != tp->tcp_header_len)
+ goto slow_path;
+
+ /* Check timestamp */
+ if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
+ __u32 *ptr = (__u32 *)(th + 1);
- if (flg == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
- if (len <= th->doff*4) {
+ /* No? Slow path! */
+ if (*ptr != __constant_ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+ | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
+ goto slow_path;
+
+ tp->saw_tstamp = 1;
+ ++ptr;
+ tp->rcv_tsval = ntohl(*ptr);
+ ++ptr;
+ tp->rcv_tsecr = ntohl(*ptr);
+
+ /* If PAWS failed, check it more carefully in slow path */
+ if ((s32)(tp->rcv_tsval - tp->ts_recent) < 0)
+ goto slow_path;
+
+ /* Predicted packet is in window by definition.
+ seq == rcv_nxt and last_ack_sent <= rcv_nxt.
+ Hence, check seq<=last_ack_sent reduces to:
+ */
+ if (tp->rcv_nxt == tp->last_ack_sent) {
+ tp->ts_recent = tp->rcv_tsval;
+ tp->ts_recent_stamp = xtime.tv_sec;
+ }
+ }
+
+ if (len <= tcp_header_len) {
/* Bulk data transfer: sender */
- if (len == th->doff*4) {
+ if (len == tcp_header_len) {
tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(skb)->ack_seq, len);
kfree_skb(skb);
@@ -1864,12 +2185,14 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
} else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una &&
atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) {
/* Bulk data transfer: receiver */
- __skb_pull(skb,th->doff*4);
+ __skb_pull(skb,tcp_header_len);
+ /* Is it possible to simplify this? */
tcp_measure_rcv_mss(sk, skb);
/* DO NOT notify forward progress here.
* It saves dozen of CPU instructions in fast path. --ANK
+			 * And where is it signaled then? -AK
*/
__skb_queue_tail(&sk->receive_queue, skb);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
@@ -1877,14 +2200,37 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
/* FIN bit check is not done since if FIN is set in
* this frame, the pred_flags won't match up. -DaveM
*/
- sk->data_ready(sk, 0);
+ wake_up_interruptible(sk->sleep);
+ sock_wake_async(sk->socket,1);
tcp_delack_estimator(tp);
tcp_remember_ack(tp, th, skb);
- __tcp_ack_snd_check(sk);
+ __tcp_ack_snd_check(sk, 0);
return 0;
}
+ /* Packet is in sequence, flags are trivial;
+ * only ACK is strange or we are tough on memory.
+ * Jump to step 5.
+ */
+ goto step5;
+ }
+
+slow_path:
+ /*
+ * RFC1323: H1. Apply PAWS check first.
+ */
+ if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp &&
+ tcp_paws_discard(tp, skb)) {
+ if (!th->rst) {
+ tcp_send_ack(sk);
+ goto discard;
+ }
+ /* Resets are accepted even if PAWS failed.
+
+ ts_recent update must be made after we are sure
+ that the packet is in window.
+ */
}
/*
@@ -1909,44 +2255,34 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
goto discard;
}
+ if(th->rst) {
+ tcp_reset(sk);
+ goto discard;
+ }
+
+ if (tp->saw_tstamp) {
+ tcp_replace_ts_recent(sk, tp,
+ TCP_SKB_CB(skb)->seq);
+ }
+
if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
SOCK_DEBUG(sk, "syn in established state\n");
tcp_statistics.TcpInErrs++;
tcp_reset(sk);
return 1;
}
-
- if(th->rst) {
- tcp_reset(sk);
- goto discard;
- }
+step5:
if(th->ack)
tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len);
/* Process urgent data. */
tcp_urg(sk, th, len);
+ {
/* step 7: process the segment text */
- queued = tcp_data(skb, sk, len);
+ int queued = tcp_data(skb, sk, len);
- /* This must be after tcp_data() does the skb_pull() to
- * remove the header size from skb->len.
- *
- * Dave!!! Phrase above (and all about rcv_mss) has
- * nothing to do with reality. rcv_mss must measure TOTAL
- * size, including sacks, IP options etc. Hence, measure_rcv_mss
- * must occure before pulling etc, otherwise it will flap
- * like hell. Even putting it before tcp_data is wrong,
- * it should use skb->tail - skb->nh.raw instead.
- * --ANK (980805)
- *
- * BTW I broke it. Now all TCP options are handled equally
- * in mss_clamp calculations (i.e. ignored, rfc1122),
- * and mss_cache does include all of them (i.e. tstamps)
- * except for sacks, to calulate effective mss faster.
- * --ANK (980805)
- */
tcp_measure_rcv_mss(sk, skb);
/* Be careful, tcp_data() may have put this into TIME_WAIT. */
@@ -1959,76 +2295,541 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
discard:
kfree_skb(skb);
}
+ }
return 0;
}
+
+/* This is not only more efficient than what we used to do, it eliminates
+ * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
+ *
+ * Actually, we could save lots of memory writes here. The tp of the
+ * listening socket contains all the necessary default parameters.
+ */
+struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
+{
+ struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0);
+
+ if(newsk != NULL) {
+ struct tcp_opt *newtp;
+#ifdef CONFIG_FILTER
+ struct sk_filter *filter;
+#endif
+
+ memcpy(newsk, sk, sizeof(*newsk));
+ newsk->state = TCP_SYN_RECV;
+
+ /* SANITY */
+ newsk->pprev = NULL;
+ newsk->prev = NULL;
+
+ /* Clone the TCP header template */
+ newsk->dport = req->rmt_port;
+
+ sock_lock_init(newsk);
+
+ atomic_set(&newsk->rmem_alloc, 0);
+ skb_queue_head_init(&newsk->receive_queue);
+ atomic_set(&newsk->wmem_alloc, 0);
+ skb_queue_head_init(&newsk->write_queue);
+ atomic_set(&newsk->omem_alloc, 0);
+
+ newsk->done = 0;
+ newsk->proc = 0;
+ newsk->backlog.head = newsk->backlog.tail = NULL;
+ skb_queue_head_init(&newsk->error_queue);
+ newsk->write_space = tcp_write_space;
+#ifdef CONFIG_FILTER
+ if ((filter = newsk->filter) != NULL)
+ sk_filter_charge(newsk, filter);
+#endif
+
+ /* Now setup tcp_opt */
+ newtp = &(newsk->tp_pinfo.af_tcp);
+ newtp->pred_flags = 0;
+ newtp->rcv_nxt = req->rcv_isn + 1;
+ newtp->snd_nxt = req->snt_isn + 1;
+ newtp->snd_una = req->snt_isn + 1;
+ newtp->srtt = 0;
+ newtp->ato = 0;
+ newtp->snd_wl1 = req->rcv_isn;
+ newtp->snd_wl2 = req->snt_isn;
+
+ /* RFC1323: The window in SYN & SYN/ACK segments
+ * is never scaled.
+ */
+ newtp->snd_wnd = ntohs(skb->h.th->window);
+
+ newtp->max_window = newtp->snd_wnd;
+ newtp->pending = 0;
+ newtp->retransmits = 0;
+ newtp->last_ack_sent = req->rcv_isn + 1;
+ newtp->backoff = 0;
+ newtp->mdev = TCP_TIMEOUT_INIT;
+
+ /* So many TCP implementations out there (incorrectly) count the
+ * initial SYN frame in their delayed-ACK and congestion control
+ * algorithms that we must have the following bandaid to talk
+ * efficiently to them. -DaveM
+ */
+ newtp->snd_cwnd = 2;
+
+ newtp->rto = TCP_TIMEOUT_INIT;
+ newtp->packets_out = 0;
+ newtp->fackets_out = 0;
+ newtp->retrans_out = 0;
+ newtp->high_seq = 0;
+ newtp->snd_ssthresh = 0x7fffffff;
+ newtp->snd_cwnd_cnt = 0;
+ newtp->dup_acks = 0;
+ newtp->delayed_acks = 0;
+ init_timer(&newtp->retransmit_timer);
+ newtp->retransmit_timer.function = &tcp_retransmit_timer;
+ newtp->retransmit_timer.data = (unsigned long) newsk;
+ init_timer(&newtp->delack_timer);
+ newtp->delack_timer.function = &tcp_delack_timer;
+ newtp->delack_timer.data = (unsigned long) newsk;
+ skb_queue_head_init(&newtp->out_of_order_queue);
+ newtp->send_head = newtp->retrans_head = NULL;
+ newtp->rcv_wup = req->rcv_isn + 1;
+ newtp->write_seq = req->snt_isn + 1;
+ newtp->copied_seq = req->rcv_isn + 1;
+
+ newtp->saw_tstamp = 0;
+
+ init_timer(&newtp->probe_timer);
+ newtp->probe_timer.function = &tcp_probe_timer;
+ newtp->probe_timer.data = (unsigned long) newsk;
+ newtp->probes_out = 0;
+ newtp->syn_seq = req->rcv_isn;
+ newtp->fin_seq = req->rcv_isn;
+ newtp->urg_data = 0;
+ tcp_synq_init(newtp);
+ newtp->syn_backlog = 0;
+ if (skb->len >= 536)
+ newtp->last_seg_size = skb->len;
+
+ /* Back to base struct sock members. */
+ newsk->err = 0;
+ newsk->ack_backlog = 0;
+ newsk->max_ack_backlog = SOMAXCONN;
+ newsk->priority = 0;
+ atomic_set(&newsk->refcnt, 1);
+ atomic_inc(&inet_sock_nr);
+
+ spin_lock_init(&sk->timer_lock);
+ init_timer(&newsk->timer);
+ newsk->timer.function = &tcp_keepalive_timer;
+ newsk->timer.data = (unsigned long) newsk;
+ if (newsk->keepopen)
+ tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp));
+ newsk->socket = NULL;
+ newsk->sleep = NULL;
+
+ newtp->tstamp_ok = req->tstamp_ok;
+ if((newtp->sack_ok = req->sack_ok) != 0)
+ newtp->num_sacks = 0;
+ newtp->window_clamp = req->window_clamp;
+ newtp->rcv_wnd = req->rcv_wnd;
+ newtp->wscale_ok = req->wscale_ok;
+ if (newtp->wscale_ok) {
+ newtp->snd_wscale = req->snd_wscale;
+ newtp->rcv_wscale = req->rcv_wscale;
+ } else {
+ newtp->snd_wscale = newtp->rcv_wscale = 0;
+ newtp->window_clamp = min(newtp->window_clamp,65535);
+ }
+ if (newtp->tstamp_ok) {
+ newtp->ts_recent = req->ts_recent;
+ newtp->ts_recent_stamp = xtime.tv_sec;
+ newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ } else {
+ newtp->ts_recent_stamp = 0;
+ newtp->tcp_header_len = sizeof(struct tcphdr);
+ }
+ newtp->mss_clamp = req->mss;
+ }
+ return newsk;
+}
+
+static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
+{
+ if (seq == s_win)
+ return 1;
+ if (after(end_seq, s_win) && before(seq, e_win))
+ return 1;
+ return (seq == e_win && seq == end_seq);
+}
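
tcp_in_window() is RFC793's segment acceptability test: a segment is acceptable if any part of [seq, end_seq) overlaps the receive window [s_win, e_win), with the equality clauses covering zero-length segments and zero windows. A standalone sketch with the before()/after() helpers spelled out, showing that the test survives sequence-number wraparound (the main() values are illustrative):

	#include <stdint.h>
	#include <stdio.h>

	/* Modulo-2^32 comparisons, as in the kernel's before()/after() macros. */
	static int seq_before(uint32_t s1, uint32_t s2) { return (int32_t)(s1 - s2) < 0; }
	static int seq_after(uint32_t s1, uint32_t s2)  { return seq_before(s2, s1); }

	/* RFC793 acceptability: does [seq, end_seq) touch the window [s_win, e_win)? */
	static int tcp_in_window(uint32_t seq, uint32_t end_seq, uint32_t s_win, uint32_t e_win)
	{
		if (seq == s_win)
			return 1;			/* starts exactly at the left edge */
		if (seq_after(end_seq, s_win) && seq_before(seq, e_win))
			return 1;			/* some overlap with the window */
		return seq == e_win && seq == end_seq;	/* empty segment at the right edge */
	}

	int main(void)
	{
		/* A window straddling the 2^32 wrap: [0xfffffff0, 0x10). */
		printf("%d\n", tcp_in_window(0xfffffff8, 0x4, 0xfffffff0, 0x10)); /* 1 */
		printf("%d\n", tcp_in_window(0x20, 0x30, 0xfffffff0, 0x10));      /* 0 */
		return 0;
	}
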
+
+
/*
- * Process an incoming SYN or SYN-ACK for SYN_RECV sockets represented
- * as an open_request.
+ * Process an incoming packet for SYN_RECV sockets represented
+ * as an open_request.
*/
-struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
- struct open_request *req)
+struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
+ struct open_request *req,
+ struct open_request *prev)
{
+ struct tcphdr *th = skb->h.th;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- u32 flg;
+ u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
+ int paws_reject = 0;
+ struct tcp_opt ttp;
- /* assumption: the socket is not in use.
- * as we checked the user count on tcp_rcv and we're
- * running from a soft interrupt.
+	/* If the socket has already been created, process
+	   the packet in its context.
+
+	   We get here only due to a race, when packets were enqueued
+	   to the backlog of the listening socket.
*/
+ if (req->sk)
+ return req->sk;
- /* Check for syn retransmission */
- flg = *(((u32 *)skb->h.th) + 3);
-
- flg &= __constant_htonl(0x00170000);
- /* Only SYN set? */
- if (flg == __constant_htonl(0x00020000)) {
- if (!after(TCP_SKB_CB(skb)->seq, req->rcv_isn)) {
- /* retransmited syn.
+ ttp.saw_tstamp = 0;
+ if (th->doff > (sizeof(struct tcphdr)>>2)) {
+
+ tcp_parse_options(NULL, th, &ttp, 0);
+
+ paws_reject = ttp.saw_tstamp &&
+ (s32)(ttp.rcv_tsval - req->ts_recent) < 0;
+ }
+
+	/* Check for a pure retransmitted SYN. */
+ if (TCP_SKB_CB(skb)->seq == req->rcv_isn &&
+ flg == TCP_FLAG_SYN &&
+ !paws_reject) {
+ /*
+ * RFC793 draws (Incorrectly! It was fixed in RFC1122)
+ * this case on figure 6 and figure 8, but formal
+ * protocol description says NOTHING.
+ * To be more exact, it says that we should send ACK,
+ * because this segment (at least, if it has no data)
+ * is out of window.
+ *
+		 * CONCLUSION: RFC793 (even with RFC1122) DOES NOT
+		 * describe the SYN-RECV state. All the description
+		 * is wrong; we cannot believe it and should
+		 * rely only on common sense and implementation
+		 * experience.
+ *
+ * Enforce "SYN-ACK" according to figure 8, figure 6
+ * of RFC793, fixed by RFC1122.
+ */
+ req->class->rtx_syn_ack(sk, req);
+ return NULL;
+ }
+
+	/* The following reproduces the section "SEGMENT ARRIVES"
+	   for the state SYN-RECEIVED of RFC793.
+	   It is broken; however, it fails only when SYNs are crossed,
+	   which is impossible in our case.
+
+	   But generally, we should (the RFC lies!) accept an ACK
+	   to our SYNACK both here and in tcp_rcv_state_process().
+	   tcp_rcv_state_process() does not, hence, we do not either.
+
+	   Note that the case is absolutely generic:
+	   we cannot optimize anything here without
+	   violating the protocol. All the checks must be made
+	   before the attempt to create a socket.
+	 */
+
+ /* RFC793: "first check sequence number". */
+
+ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
+ req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) {
+ /* Out of window: send ACK and drop. */
+ if (!(flg & TCP_FLAG_RST))
+ req->class->send_ack(skb, req);
+ return NULL;
+ }
+
+ /* In sequence, PAWS is OK. */
+
+ if (ttp.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
+ req->ts_recent = ttp.rcv_tsval;
+
+ if (TCP_SKB_CB(skb)->seq == req->rcv_isn) {
+		/* Truncate the SYN; it is out of window starting
+		   at req->rcv_isn+1. */
+ flg &= ~TCP_FLAG_SYN;
+ }
+
+ /* RFC793: "second check the RST bit" and
+ * "fourth, check the SYN bit"
+ */
+ if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN))
+ goto embryonic_reset;
+
+ /* RFC793: "fifth check the ACK field" */
+
+ if (!(flg & TCP_FLAG_ACK))
+ return NULL;
+
+ /* Invalid ACK: reset will be sent by listening socket */
+ if (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1)
+ return sk;
+
+ /* OK, ACK is valid, create big socket and
+ feed this segment to it. It will repeat all
+ the tests. THIS SEGMENT MUST MOVE SOCKET TO
+	   ESTABLISHED STATE. If it is dropped after the
+	   socket is created, expect trouble.
+ */
+ sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
+ if (sk == NULL)
+ return NULL;
+
+ tcp_dec_slow_timer(TCP_SLT_SYNACK);
+ req->sk = sk;
+ return sk;
+
+embryonic_reset:
+ tcp_synq_unlink(tp, req, prev);
+ tp->syn_backlog--;
+ tcp_dec_slow_timer(TCP_SLT_SYNACK);
+
+ net_statistics.EmbryonicRsts++;
+ if (!(flg & TCP_FLAG_RST))
+ req->class->send_reset(skb);
+
+ req->class->destructor(req);
+ tcp_openreq_free(req);
+ return NULL;
+}
+
+static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
+ struct tcphdr *th, unsigned len)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ tcp_parse_options(sk, th, tp, 0);
+
+#ifdef CONFIG_TCP_TW_RECYCLE
+ if (tp->ts_recent_stamp && tp->saw_tstamp && !th->rst &&
+ (s32)(tp->rcv_tsval - tp->ts_recent) < 0 &&
+ xtime.tv_sec < tp->ts_recent_stamp + PAWS_24DAYS) {
+		/* Old duplicate segment. We remember the last
+		   ts_recent from this host in the timewait bucket.
+
+		   Actually, we could implement a per-host cache
+		   to truncate timewait state after the RTO. The paranoid
+		   arguments of RFC1337 are not enough to close this nice possibility.
+ */
+ if (net_ratelimit())
+ printk(KERN_DEBUG "TCP: tw recycle, PAWS worked. Good.\n");
+ if (th->ack)
+ return 1;
+ goto discard;
+ }
+#endif
+
+ if (th->ack) {
+ /* rfc793:
+ * "If the state is SYN-SENT then
+ * first check the ACK bit
+ * If the ACK bit is set
+ * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
+ * a reset (unless the RST bit is set, if so drop
+ * the segment and return)"
+ *
+ * I cite this place to emphasize one essential
+		 * detail: this check is different from the one
+ * in established state: SND.UNA <= SEG.ACK <= SND.NXT.
+ * SEG_ACK == SND.UNA == ISS is invalid in SYN-SENT,
+ * because we have no previous data sent before SYN.
+ * --ANK(990513)
+ *
+ * We do not send data with SYN, so that RFC-correct
+ * test reduces to:
+ */
+ if (sk->zapped ||
+ TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
+ return 1;
+
+ /* Now ACK is acceptable.
+ *
+ * "If the RST bit is set
+ * If the ACK was acceptable then signal the user "error:
+ * connection reset", drop the segment, enter CLOSED state,
+ * delete TCB, and return."
+ */
+
+ if (th->rst) {
+ tcp_reset(sk);
+ goto discard;
+ }
+
+ /* rfc793:
+ * "fifth, if neither of the SYN or RST bits is set then
+ * drop the segment and return."
+ *
+ * See note below!
+ * --ANK(990513)
+ */
+ if (!th->syn)
+ goto discard;
+
+ /* rfc793:
+ * "If the SYN bit is on ...
+ * are acceptable then ...
+ * (our SYN has been ACKed), change the connection
+ * state to ESTABLISHED..."
+ *
+ * Do you see? SYN-less ACKs in SYN-SENT state are
+ * completely ignored.
+ *
+ * The bug causing stalled SYN-SENT sockets
+ * was here: tcp_ack advanced snd_una and canceled
+ * retransmit timer, so that bare ACK received
+ * in SYN-SENT state (even with invalid ack==ISS,
+ * because tcp_ack check is too weak for SYN-SENT)
+ * causes moving socket to invalid semi-SYN-SENT,
+ * semi-ESTABLISHED state and connection hangs.
+ *
+ * There exist buggy stacks, which really send
+ * such ACKs: f.e. 202.226.91.94 (okigate.oki.co.jp)
+ * Actually, if this host did not try to get something
+		 * from ftp.inr.ac.ru I'd never have found this bug 8)
+ *
+ * --ANK (990514)
+ *
+ * I was wrong, I apologize. Bare ACK is valid.
+		 * Actually, RFC793 requires sending such an ACK
+		 * in reply to any out-of-window packet.
+ * It is wrong, but Linux also does it sometimes.
+ * --ANK (990724)
+ */
+
+ tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+ tcp_ack(sk,th, TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->ack_seq, len);
+
+ /* Ok.. it's good. Set up sequence numbers and
+ * move to established.
+ */
+ tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1;
+ tp->rcv_wup = TCP_SKB_CB(skb)->seq+1;
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is
+ * never scaled.
+ */
+ tp->snd_wnd = htons(th->window);
+ tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+ tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
+ tp->fin_seq = TCP_SKB_CB(skb)->seq;
+
+ tcp_set_state(sk, TCP_ESTABLISHED);
+
+ if (tp->wscale_ok == 0) {
+ tp->snd_wscale = tp->rcv_wscale = 0;
+ tp->window_clamp = min(tp->window_clamp,65535);
+ }
+
+ if (tp->tstamp_ok) {
+ tp->tcp_header_len =
+ sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ } else
+ tp->tcp_header_len = sizeof(struct tcphdr);
+ if (tp->saw_tstamp) {
+ tp->ts_recent = tp->rcv_tsval;
+ tp->ts_recent_stamp = xtime.tv_sec;
+ }
+ tcp_sync_mss(sk, tp->pmtu_cookie);
+ tcp_initialize_rcv_mss(sk);
+ tcp_init_metrics(sk);
+
+ if (tp->write_pending) {
+ /* Save one ACK. Data will be ready after
+ * several ticks, if write_pending is set.
+ *
+			 * How do we do this correctly?
*/
- req->class->rtx_syn_ack(sk, req);
- return NULL;
+ tp->delayed_acks++;
+ if (tp->ato == 0)
+ tp->ato = tp->rto;
+ tcp_send_delayed_ack(sk, tp->rto);
} else {
- return sk; /* Pass new SYN to the listen socket. */
+ tcp_send_ack(sk);
}
+
+ tp->copied_seq = tp->rcv_nxt;
+
+ if(!sk->dead) {
+ wake_up_interruptible(sk->sleep);
+ sock_wake_async(sk->socket, 0);
+ }
+ return -1;
}
- /* We know it's an ACK here */
- if (req->sk) {
- /* socket already created but not
- * yet accepted()...
+ /* No ACK in the segment */
+
+ if (th->rst) {
+ /* rfc793:
+ * "If the RST bit is set
+ *
+ * Otherwise (no ACK) drop the segment and return."
*/
- sk = req->sk;
- } else {
- /* In theory the packet could be for a cookie, but
- * TIME_WAIT should guard us against this.
- * XXX: Nevertheless check for cookies?
- * This sequence number check is done again later,
- * but we do it here to prevent syn flood attackers
- * from creating big SYN_RECV sockets.
- */
- if (!between(TCP_SKB_CB(skb)->ack_seq, req->snt_isn, req->snt_isn+1) ||
- !between(TCP_SKB_CB(skb)->seq, req->rcv_isn,
- req->rcv_isn+1+req->rcv_wnd)) {
- req->class->send_reset(skb);
- return NULL;
+
+ goto discard;
+ }
+
+ if (th->syn) {
+ /* We see SYN without ACK. It is attempt of
+ * simultaneous connect with crossed SYNs.
+ *
+ * The previous version of the code
+ * checked for "connecting to self"
+		 * here. That check is done now in
+ * tcp_connect.
+ *
+ * RED-PEN: BTW, it does not. 8)
+ */
+ tcp_set_state(sk, TCP_SYN_RECV);
+ if (tp->saw_tstamp) {
+ tp->ts_recent = tp->rcv_tsval;
+ tp->ts_recent_stamp = xtime.tv_sec;
}
-
- sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
- tcp_dec_slow_timer(TCP_SLT_SYNACK);
- if (sk == NULL)
- return NULL;
-
- req->expires = 0UL;
- req->sk = sk;
+
+ tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
+
+ /* RFC1323: The window in SYN & SYN/ACK segments is
+ * never scaled.
+ */
+ tp->snd_wnd = htons(th->window);
+ tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+
+ tcp_sync_mss(sk, tp->pmtu_cookie);
+ tcp_initialize_rcv_mss(sk);
+
+ tcp_send_synack(sk);
+#if 0
+ /* Note, we could accept data and URG from this segment.
+ * There are no obstacles to make this.
+ *
+ * However, if we ignore data in ACKless segments sometimes,
+	 * we have no reason to accept it at other times.
+	 * Also, it seems the code doing it in step6 of tcp_rcv_state_process
+	 * is not flawless. So, discard the packet for sanity.
+ * Uncomment this return to process the data.
+ */
+ return -1;
+#endif
}
- skb_orphan(skb);
- skb_set_owner_r(skb, sk);
- return sk;
+ /* "fifth, if neither of the SYN or RST bits is set then
+ * drop the segment and return."
+ */
+
+discard:
+ kfree_skb(skb);
+ return 0;
}
+
/*
* This function implements the receiving procedure of RFC 793 for
* all states except ESTABLISHED and TIME_WAIT.
@@ -2042,6 +2843,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
int queued = 0;
+ tp->saw_tstamp = 0;
+
switch (sk->state) {
case TCP_CLOSE:
/* When state == CLOSED, hash lookup always fails.
@@ -2061,35 +2864,26 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* a TCP_CLOSE socket does not exist. Drop the frame
* and send a RST back to the other end.
*/
- return 1;
- case TCP_LISTEN:
- /* These use the socket TOS..
- * might want to be the received TOS
+ /* 1. The socket may be moved to TIME-WAIT state.
+ 2. While this socket was locked, another socket
+ with the same identity could be created.
+ 3. To continue?
+
+ CONCLUSION: discard and only discard!
+
+	   The alternative would be to relookup and recurse into tcp_v?_rcv
+ (not *_do_rcv) to work with timewait and listen states
+ correctly.
*/
- if(th->ack) {
- struct sock *realsk;
- int ret;
+ goto discard;
- realsk = tp->af_specific->get_sock(skb, th);
- if(realsk == sk)
- return 1;
+ case TCP_LISTEN:
+ if(th->ack)
+ return 1;
- bh_lock_sock(realsk);
- ret = 0;
- if(realsk->lock.users != 0) {
- skb_orphan(skb);
- sk_add_backlog(realsk, skb);
- } else {
- ret = tcp_rcv_state_process(realsk, skb,
- skb->h.th, skb->len);
- }
- bh_unlock_sock(realsk);
- return ret;
- }
-
if(th->syn) {
- if(tp->af_specific->conn_request(sk, skb, 0) < 0)
+ if(tp->af_specific->conn_request(sk, skb) < 0)
return 1;
/* Now we have several options: In theory there is
@@ -2110,172 +2904,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
*/
goto discard;
}
-
goto discard;
- break;
case TCP_SYN_SENT:
- /* SYN sent means we have to look for a suitable ack and
- * either reset for bad matches or go to connected.
- * The SYN_SENT case is unusual and should
- * not be in line code. [AC]
- */
- if(th->ack) {
- /* rfc793:
- * "If the state is SYN-SENT then
- * first check the ACK bit
- * If the ACK bit is set
- * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
- * a reset (unless the RST bit is set, if so drop
- * the segment and return)"
- *
- * I cite this place to emphasize one essential
- * detail, this check is different of one
- * in established state: SND.UNA <= SEG.ACK <= SND.NXT.
- * SEG_ACK == SND.UNA == ISS is invalid in SYN-SENT,
- * because we have no previous data sent before SYN.
- * --ANK(990513)
- *
- * We do not send data with SYN, so that RFC-correct
- * test reduces to:
- */
- if (sk->zapped ||
- TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
- return 1;
-
- /* Now ACK is acceptable.
- *
- * "If the RST bit is set
- * If the ACK was acceptable then signal the user "error:
- * connection reset", drop the segment, enter CLOSED state,
- * delete TCB, and return."
- */
-
- if (th->rst) {
- tcp_reset(sk);
- goto discard;
- }
-
- /* rfc793:
- * "fifth, if neither of the SYN or RST bits is set then
- * drop the segment and return."
- *
- * See note below!
- * --ANK(990513)
- */
-
- if (!th->syn)
- goto discard;
-
- /* rfc793:
- * "If the SYN bit is on ...
- * are acceptable then ...
- * (our SYN has been ACKed), change the connection
- * state to ESTABLISHED..."
- *
- * Do you see? SYN-less ACKs in SYN-SENT state are
- * completely ignored.
- *
- * The bug causing stalled SYN-SENT sockets
- * was here: tcp_ack advanced snd_una and canceled
- * retransmit timer, so that bare ACK received
- * in SYN-SENT state (even with invalid ack==ISS,
- * because tcp_ack check is too weak for SYN-SENT)
- * causes moving socket to invalid semi-SYN-SENT,
- * semi-ESTABLISHED state and connection hangs.
- *
- * There exist buggy stacks, which really send
- * such ACKs: f.e. 202.226.91.94 (okigate.oki.co.jp)
- * Actually, if this host did not try to get something
- * from ftp.inr.ac.ru I'd never find this bug 8)
- *
- * --ANK (990514)
- */
-
- tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
- tcp_ack(sk,th, TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(skb)->ack_seq, len);
-
- /* Ok.. it's good. Set up sequence numbers and
- * move to established.
- */
- tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1;
- tp->rcv_wup = TCP_SKB_CB(skb)->seq+1;
-
- /* RFC1323: The window in SYN & SYN/ACK segments is
- * never scaled.
- */
- tp->snd_wnd = htons(th->window);
- tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
- tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
- tp->fin_seq = TCP_SKB_CB(skb)->seq;
-
- tcp_set_state(sk, TCP_ESTABLISHED);
- tcp_parse_options(sk, th, tp, 0);
-
- if (tp->wscale_ok == 0) {
- tp->snd_wscale = tp->rcv_wscale = 0;
- tp->window_clamp = min(tp->window_clamp,65535);
- }
-
- if (tp->tstamp_ok) {
- tp->tcp_header_len =
- sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
- } else
- tp->tcp_header_len = sizeof(struct tcphdr);
- if (tp->saw_tstamp) {
- tp->ts_recent = tp->rcv_tsval;
- tp->ts_recent_stamp = tcp_time_stamp;
- }
-
- /* Can't be earlier, doff would be wrong. */
- tcp_send_ack(sk);
-
- sk->dport = th->source;
- tp->copied_seq = tp->rcv_nxt;
-
- if(!sk->dead) {
- sk->state_change(sk);
- sock_wake_async(sk->socket, 0);
- }
- } else {
- if(th->syn && !th->rst) {
- /* The previous version of the code
- * checked for "connecting to self"
- * here. that check is done now in
- * tcp_connect.
- */
- tcp_set_state(sk, TCP_SYN_RECV);
- tcp_parse_options(sk, th, tp, 0);
- if (tp->saw_tstamp) {
- tp->ts_recent = tp->rcv_tsval;
- tp->ts_recent_stamp = tcp_time_stamp;
- }
-
- tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
- tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
-
- /* RFC1323: The window in SYN & SYN/ACK segments is
- * never scaled.
- */
- tp->snd_wnd = htons(th->window);
- tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
-
- tcp_send_synack(sk);
- } else
- break;
- }
-
- /* tp->tcp_header_len and tp->mss_clamp
- probably changed, synchronize mss.
- */
- tcp_sync_mss(sk, tp->pmtu_cookie);
- tp->rcv_mss = tp->mss_cache;
-
- if (sk->state == TCP_SYN_RECV)
- goto discard;
-
- goto step6;
+ queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
+ if (queued >= 0)
+ return queued;
+ queued = 0;
+ goto step6;
}
/* Parse the tcp_options present on this header.
@@ -2283,23 +2919,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* Note that this really has to be here and not later for PAWS
* (RFC1323) to work.
*/
- if (tcp_fast_parse_options(sk, th, tp)) {
- /* NOTE: assumes saw_tstamp is never set if we didn't
- * negotiate the option. tcp_fast_parse_options() must
- * guarantee this.
- */
- if (tp->saw_tstamp) {
- if (tcp_paws_discard(tp, th, len)) {
- tcp_statistics.TcpInErrs++;
- if (!th->rst) {
- tcp_send_ack(sk);
- goto discard;
- }
- }
- tcp_replace_ts_recent(sk, tp,
- TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(skb)->end_seq);
+ if (tcp_fast_parse_options(sk, th, tp) && tp->saw_tstamp &&
+ tcp_paws_discard(tp, skb)) {
+ if (!th->rst) {
+ tcp_send_ack(sk);
+ goto discard;
}
+ /* Reset is accepted even if it did not pass PAWS. */
}
/* The silly FIN test here is necessary to see an advancing ACK in
@@ -2313,11 +2939,26 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* At this point the connection will deadlock with host1 believing
	 * that his FIN is never ACK'd, and thus it will retransmit its FIN
* forever. The following fix is from Taral (taral@taral.net).
+ *
+	 * RED-PEN. It seems the above is not true.
+	 * If at least one end is RFC compliant, it will send an ACK to an
+	 * out-of-window FIN and, hence, move the peer to TIME-WAIT.
+	 * I comment out this line. --ANK
+	 *
+	 * RED-PEN. DANGER! The tcp_sequence check also rejects SYN-ACKs
+	 * received in SYN-RECV. The problem is that the description of
+	 * segment processing in the SYN-RECV state in RFC793 is WRONG.
+	 * A correct check would accept the ACK from this SYN-ACK, see
+	 * figures 6 and 8 (fixed by RFC1122). Compare this
+	 * to the problem with FIN; they smell similar. --ANK
*/
/* step 1: check sequence number */
- if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq) &&
- !(th->fin && TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)) {
+ if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)
+#if 0
+ && !(th->fin && TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)
+#endif
+ ) {
if (!th->rst) {
tcp_send_ack(sk);
}
@@ -2330,6 +2971,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
goto discard;
}
+ if (tp->saw_tstamp) {
+ tcp_replace_ts_recent(sk, tp,
+ TCP_SKB_CB(skb)->seq);
+ }
+
/* step 3: check security and precedence [ignored] */
/* step 4:
@@ -2357,22 +3003,36 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
if (th->ack) {
int acceptable = tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(skb)->ack_seq, len);
-
+
switch(sk->state) {
case TCP_SYN_RECV:
if (acceptable) {
tcp_set_state(sk, TCP_ESTABLISHED);
- sk->dport = th->source;
tp->copied_seq = tp->rcv_nxt;
- if(!sk->dead)
- sk->state_change(sk);
+				/* Note that this wakeup is only for the marginal
+				   crossed-SYN case. Passively opened sockets
+				   are not woken up, because sk->sleep == NULL
+				   and sk->socket == NULL.
+ */
+ if (!sk->dead && sk->sleep) {
+ wake_up_interruptible(sk->sleep);
+ sock_wake_async(sk->socket, 1);
+ }
tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
tp->snd_wnd = htons(th->window) << tp->snd_wscale;
tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
+			/* tcp_ack considers this ACK a duplicate
+			 * and does not calculate the RTT, which is wrong.
+			 * Fix it, at least with timestamps.
+ */
+ if (tp->saw_tstamp && !tp->srtt)
+ tcp_ack_saw_tstamp(sk, tp, 0, 0, FLAG_SYN_ACKED);
+
+ tcp_init_metrics(sk);
} else {
SOCK_DEBUG(sk, "bad ack\n");
return 1;
@@ -2386,7 +3046,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
if (!sk->dead)
sk->state_change(sk);
else
- tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout);
+ tcp_reset_keepalive_timer(sk, sysctl_tcp_fin_timeout);
+ dst_confirm(sk->dst_cache);
}
break;
@@ -2399,10 +3060,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
case TCP_LAST_ACK:
if (tp->snd_una == tp->write_seq) {
- sk->shutdown = SHUTDOWN_MASK;
tcp_set_state(sk,TCP_CLOSE);
- if (!sk->dead)
- sk->state_change(sk);
+ tcp_update_metrics(sk);
+ tcp_done(sk);
goto discard;
}
break;
@@ -2444,8 +3104,11 @@ step6:
break;
}
- tcp_data_snd_check(sk);
- tcp_ack_snd_check(sk);
+ /* tcp_data could move socket to TIME-WAIT */
+ if (sk->state != TCP_CLOSE) {
+ tcp_data_snd_check(sk);
+ tcp_ack_snd_check(sk);
+ }
if (!queued) {
discard:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3c5102b42..986868b4f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_ipv4.c,v 1.182 1999/07/05 01:34:07 davem Exp $
+ * Version: $Id: tcp_ipv4.c,v 1.189 1999/09/07 02:31:33 davem Exp $
*
* IPv4 specific functions
*
@@ -57,6 +57,7 @@
#include <net/icmp.h>
#include <net/tcp.h>
#include <net/ipv6.h>
+#include <net/inet_common.h>
#include <asm/segment.h>
@@ -67,6 +68,7 @@ extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
extern int sysctl_tcp_sack;
extern int sysctl_tcp_syncookies;
+extern int sysctl_tcp_tw_recycle;
extern int sysctl_ip_dynaddr;
extern __u32 sysctl_wmem_max;
extern __u32 sysctl_rmem_max;
@@ -90,23 +92,30 @@ void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
* First half of the table is for sockets not in TIME_WAIT, second half
* is for TIME_WAIT sockets only.
*/
-struct sock **tcp_ehash;
-int tcp_ehash_size;
+struct tcp_ehash_bucket *tcp_ehash = NULL;
/* Ok, let's try this, I give up, we do need a local binding
* TCP hash as well as the others for fast bind/connect.
*/
-struct tcp_bind_bucket **tcp_bhash;
-int tcp_bhash_size;
+struct tcp_bind_hashbucket *tcp_bhash = NULL;
+
+int tcp_bhash_size = 0;
+int tcp_ehash_size = 0;
/* All sockets in TCP_LISTEN state will be in here. This is the only table
* where wildcard'd TCP sockets can exist. Hash function here is just local
* port number.
*/
-struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE];
+struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE] = { NULL, };
+char __tcp_clean_cacheline_pad[(SMP_CACHE_BYTES -
+ (((sizeof(void *) * (TCP_LHTABLE_SIZE + 2)) +
+ (sizeof(int) * 2)) % SMP_CACHE_BYTES))] = { 0, };
+
+rwlock_t tcp_lhash_lock = RW_LOCK_UNLOCKED;
+atomic_t tcp_lhash_users = ATOMIC_INIT(0);
+DECLARE_WAIT_QUEUE_HEAD(tcp_lhash_wait);
-/* Register cache. */
-struct sock *tcp_regs[TCP_NUM_REGS];
+spinlock_t tcp_portalloc_lock = SPIN_LOCK_UNLOCKED;
/*
* This array holds the first and last local port number.
@@ -119,7 +128,10 @@ int tcp_port_rover = (1024 - 1);
static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
__u32 faddr, __u16 fport)
{
- return ((laddr ^ lport) ^ (faddr ^ fport)) & ((tcp_ehash_size >> 1) - 1);
+ int h = ((laddr ^ lport) ^ (faddr ^ fport));
+ h ^= h>>16;
+ h ^= h>>8;
+ return h & (tcp_ehash_size - 1);
}
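
The rewritten hash no longer simply masks the XOR of the four tuple members; it first folds the upper bits down, so connections whose tuples differ only in high address bits stop piling into the same bucket now that the mask covers the full (power-of-two) ehash size. A standalone sketch of the same mixing step (the table size and sample tuples are illustrative):

	#include <stdint.h>
	#include <stdio.h>

	/* Fold-then-mask mixing, as in tcp_hashfn() above. */
	static unsigned int ehash_mix(uint32_t laddr, uint16_t lport,
				      uint32_t faddr, uint16_t fport,
				      unsigned int size)	/* power of two */
	{
		uint32_t h = (laddr ^ lport) ^ (faddr ^ fport);

		h ^= h >> 16;	/* fold the high half onto the low half */
		h ^= h >> 8;	/* ... so the low byte depends on all 32 bits */
		return h & (size - 1);
	}

	int main(void)
	{
		/* Two peers differing only in bit 16 of the local address land
		 * in different buckets of a 256-entry table; without the folds
		 * both would hash to 0x50.
		 */
		printf("%u\n", ehash_mix(0x0a000001, 80, 0xc0a80001, 1024, 256)); /* 54 */
		printf("%u\n", ehash_mix(0x0a010001, 80, 0xc0a80001, 1024, 256)); /* 55 */
		return 0;
	}
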
static __inline__ int tcp_sk_hashfn(struct sock *sk)
@@ -133,67 +145,47 @@ static __inline__ int tcp_sk_hashfn(struct sock *sk)
}
/* Allocate and initialize a new TCP local port bind bucket.
- * The sockhash lock must be held as a writer here.
+ * The bindhash mutex for snum's hash chain must be held here.
*/
-struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum)
+struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
+ unsigned short snum)
{
struct tcp_bind_bucket *tb;
tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
if(tb != NULL) {
- struct tcp_bind_bucket **head =
- &tcp_bhash[tcp_bhashfn(snum)];
tb->port = snum;
tb->fastreuse = 0;
tb->owners = NULL;
- if((tb->next = *head) != NULL)
+ if((tb->next = head->chain) != NULL)
tb->next->pprev = &tb->next;
- *head = tb;
- tb->pprev = head;
+ head->chain = tb;
+ tb->pprev = &head->chain;
}
return tb;
}
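
The next/pprev linkage used here is the kernel's intrusive-list idiom: pprev stores the address of whatever pointer currently points at the node, so head insertion and unlinking need no special case for the first element. A standalone sketch of the same pattern (the simplified bucket struct is an illustrative stand-in):

	#include <assert.h>
	#include <stdlib.h>

	/* Simplified stand-in for tcp_bind_bucket's chain fields. */
	struct bucket {
		unsigned short port;
		struct bucket *next;
		struct bucket **pprev;	/* address of the pointer pointing at us */
	};

	/* Head insertion, mirroring tcp_bucket_create() above. */
	static struct bucket *bucket_create(struct bucket **chain, unsigned short snum)
	{
		struct bucket *tb = malloc(sizeof(*tb));

		if (tb != NULL) {
			tb->port = snum;
			if ((tb->next = *chain) != NULL)
				tb->next->pprev = &tb->next;
			*chain = tb;
			tb->pprev = chain;
		}
		return tb;
	}

	/* Unlink works identically whether or not tb is the chain head. */
	static void bucket_free(struct bucket *tb)
	{
		if (tb->next)
			tb->next->pprev = tb->pprev;
		*tb->pprev = tb->next;
		free(tb);
	}

	int main(void)
	{
		struct bucket *chain = NULL;
		struct bucket *a = bucket_create(&chain, 80);
		struct bucket *b = bucket_create(&chain, 443);

		assert(chain == b && b->next == a);
		bucket_free(b);		/* head removal needs no special case */
		assert(chain == a);
		bucket_free(a);
		return 0;
	}
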
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
-/* Ensure that the bound bucket for the port exists.
- * Return 0 on success.
- */
-static __inline__ int tcp_bucket_check(unsigned short snum)
-{
- struct tcp_bind_bucket *tb;
- int ret = 0;
-
- SOCKHASH_LOCK_WRITE();
- tb = tcp_bhash[tcp_bhashfn(snum)];
- for( ; (tb && (tb->port != snum)); tb = tb->next)
- ;
- ret = 0;
- if (tb == NULL) {
- if ((tb = tcp_bucket_create(snum)) == NULL)
- ret = 1;
- }
- SOCKHASH_UNLOCK_WRITE();
-
- return ret;
-}
-#endif
-
+/* Caller must disable local BH processing. */
static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
{
- struct tcp_bind_bucket *tb = (struct tcp_bind_bucket *)sk->prev;
+ struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(child->num)];
+ struct tcp_bind_bucket *tb;
+ spin_lock(&head->lock);
+ tb = (struct tcp_bind_bucket *)sk->prev;
if ((child->bind_next = tb->owners) != NULL)
tb->owners->bind_pprev = &child->bind_next;
tb->owners = child;
child->bind_pprev = &tb->owners;
child->prev = (struct sock *) tb;
+ spin_unlock(&head->lock);
}
__inline__ void tcp_inherit_port(struct sock *sk, struct sock *child)
{
- SOCKHASH_LOCK_WRITE();
+ local_bh_disable();
__tcp_inherit_port(sk, child);
- SOCKHASH_UNLOCK_WRITE();
+ local_bh_enable();
}
/* Obtain a reference to a local port for the given sock,
@@ -201,38 +193,48 @@ __inline__ void tcp_inherit_port(struct sock *sk, struct sock *child)
*/
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
+ struct tcp_bind_hashbucket *head;
struct tcp_bind_bucket *tb;
+ int ret;
- SOCKHASH_LOCK_WRITE();
+ local_bh_disable();
if (snum == 0) {
- int rover = tcp_port_rover;
int low = sysctl_local_port_range[0];
int high = sysctl_local_port_range[1];
int remaining = (high - low) + 1;
+ int rover;
+ spin_lock(&tcp_portalloc_lock);
+ rover = tcp_port_rover;
do { rover++;
if ((rover < low) || (rover > high))
rover = low;
- tb = tcp_bhash[tcp_bhashfn(rover)];
- for ( ; tb; tb = tb->next)
+ head = &tcp_bhash[tcp_bhashfn(rover)];
+ spin_lock(&head->lock);
+ for (tb = head->chain; tb; tb = tb->next)
if (tb->port == rover)
goto next;
break;
next:
+ spin_unlock(&head->lock);
} while (--remaining > 0);
tcp_port_rover = rover;
+ spin_unlock(&tcp_portalloc_lock);
/* Exhausted local port range during search? */
+ ret = 1;
if (remaining <= 0)
goto fail;
- /* OK, here is the one we will use. */
+ /* OK, here is the one we will use. HEAD is
+		 * non-NULL and we hold its mutex.
+ */
snum = rover;
tb = NULL;
} else {
- for (tb = tcp_bhash[tcp_bhashfn(snum)];
- tb != NULL;
- tb = tb->next)
+ head = &tcp_bhash[tcp_bhashfn(snum)];
+ spin_lock(&head->lock);
+ for (tb = head->chain; tb != NULL; tb = tb->next)
if (tb->port == snum)
break;
}
@@ -256,13 +258,15 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
}
}
/* If we found a conflict, fail. */
+ ret = 1;
if (sk2 != NULL)
- goto fail;
+ goto fail_unlock;
}
}
+ ret = 1;
if (tb == NULL &&
- (tb = tcp_bucket_create(snum)) == NULL)
- goto fail;
+ (tb = tcp_bucket_create(head, snum)) == NULL)
+ goto fail_unlock;
if (tb->owners == NULL) {
if (sk->reuse && sk->state != TCP_LISTEN)
tb->fastreuse = 1;
@@ -278,13 +282,13 @@ success:
tb->owners = sk;
sk->bind_pprev = &tb->owners;
sk->prev = (struct sock *) tb;
+ ret = 0;
- SOCKHASH_UNLOCK_WRITE();
- return 0;
-
+fail_unlock:
+ spin_unlock(&head->lock);
fail:
- SOCKHASH_UNLOCK_WRITE();
- return 1;
+ local_bh_enable();
+ return ret;
}
/* Get rid of any references to a local port held by the
@@ -292,8 +296,10 @@ fail:
*/
__inline__ void __tcp_put_port(struct sock *sk)
{
+ struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(sk->num)];
struct tcp_bind_bucket *tb;
+ spin_lock(&head->lock);
tb = (struct tcp_bind_bucket *) sk->prev;
if (sk->bind_next)
sk->bind_next->bind_pprev = sk->bind_pprev;
@@ -305,24 +311,136 @@ __inline__ void __tcp_put_port(struct sock *sk)
*(tb->pprev) = tb->next;
kmem_cache_free(tcp_bucket_cachep, tb);
}
+ spin_unlock(&head->lock);
}
void tcp_put_port(struct sock *sk)
{
- SOCKHASH_LOCK_WRITE();
+ local_bh_disable();
__tcp_put_port(sk);
- SOCKHASH_UNLOCK_WRITE();
+ local_bh_enable();
+}
+
+#ifdef CONFIG_TCP_TW_RECYCLE
+/*
+   A very stupid pseudo-"algorithm". If the approach proves successful
+   (and it will!), we will have to make it more reasonable.
+   Right now it eats lots of CPU when we are short of ports.
+
+   Apparently, it should be a hash table indexed by daddr/dport.
+
+   How does it work? We allow truncation of time-wait state if:
+   1. PAWS works on it.
+   2. The timewait bucket did not receive data for a timeout:
+      - initially timeout := 2*RTO, so that if our ACK to the first
+        transmitted peer's FIN is lost, we will see the first retransmit.
+      - if we receive anything, the timeout is increased exponentially
+        to follow the normal TCP backoff pattern.
+      It is important that the minimal RTO (HZ/5) > the minimal timestamp
+      step (1ms).
+   3. When creating a new socket, we inherit the sequence number
+      and ts_recent of the time-wait bucket, increasing them a bit.
+
+   Conditions 1 and 2 guarantee that data will not be corrupted
+   either by retransmitted or by delayed segments. They do not guarantee
+   that the peer will leave LAST-ACK/CLOSING state gracefully; it will be
+   reset sometimes, namely when more than two of our ACKs to its FINs are lost.
+   This reset is harmless and even good.
+ */
+
+int tcp_v4_tw_recycle(struct sock *sk, u32 daddr, u16 dport)
+{
+ static int tw_rover;
+
+ struct tcp_tw_bucket *tw;
+ struct tcp_bind_hashbucket *head;
+ struct tcp_bind_bucket *tb;
+
+ int low = sysctl_local_port_range[0];
+ int high = sysctl_local_port_range[1];
+ unsigned long now = jiffies;
+ int i, rover;
+
+ rover = tw_rover;
+
+ local_bh_disable();
+ for (i=0; i<tcp_bhash_size; i++, rover++) {
+ rover &= (tcp_bhash_size-1);
+ head = &tcp_bhash[rover];
+
+ spin_lock(&head->lock);
+ for (tb = head->chain; tb; tb = tb->next) {
+ tw = (struct tcp_tw_bucket*)tb->owners;
+
+ if (tw->state != TCP_TIME_WAIT ||
+ tw->dport != dport ||
+ tw->daddr != daddr ||
+ tw->rcv_saddr != sk->rcv_saddr ||
+ tb->port < low ||
+ tb->port >= high ||
+ !TCP_INET_FAMILY(tw->family) ||
+ tw->ts_recent_stamp == 0 ||
+ (long)(now - tw->ttd) <= 0)
+ continue;
+ tw_rover = rover;
+ goto hit;
+ }
+ spin_unlock(&head->lock);
+ }
+ local_bh_enable();
+ tw_rover = rover;
+ return -EAGAIN;
+
+hit:
+ sk->num = tw->num;
+ if ((sk->bind_next = tb->owners) != NULL)
+ tb->owners->bind_pprev = &sk->bind_next;
+ tb->owners = sk;
+ sk->bind_pprev = &tb->owners;
+ sk->prev = (struct sock *) tb;
+ spin_unlock_bh(&head->lock);
+ return 0;
+}
+#endif
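
The last condition of the scan, (long)(now - tw->ttd) <= 0, is the wrap-safe way of asking whether the bucket's time-to-die has passed; only buckets already beyond their deadline are eligible for recycling. The deadline test in isolation, as a standalone sketch (names illustrative):

	#include <stdio.h>

	/* Wrap-safe "has the deadline passed?" on free-running counters,
	 * the signed-difference idiom from the recycle scan above.
	 */
	static int deadline_passed(unsigned long now, unsigned long ttd)
	{
		return (long)(now - ttd) > 0;
	}

	int main(void)
	{
		unsigned long ttd = (unsigned long)-20;	/* 20 ticks before the wrap */
		unsigned long now = 10;			/* 10 ticks after the wrap */

		printf("%d\n", deadline_passed(now, ttd));	/* 1: 30 ticks late */
		return 0;
	}
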
+
+
+void tcp_listen_wlock(void)
+{
+ write_lock(&tcp_lhash_lock);
+
+ if (atomic_read(&tcp_lhash_users)) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue(&tcp_lhash_wait, &wait);
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (atomic_read(&tcp_lhash_users) == 0)
+ break;
+ write_unlock_bh(&tcp_lhash_lock);
+ schedule();
+ write_lock_bh(&tcp_lhash_lock);
+ }
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&tcp_lhash_wait, &wait);
+ }
}
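
tcp_listen_wlock() thus excludes two kinds of readers: ordinary read_lock() holders, and longer-lived users who register in tcp_lhash_users and are drained via tcp_lhash_wait. A minimal user-space analogue of that drain-and-sleep pattern, assuming POSIX threads (all names are illustrative, not the kernel's API):

	#include <pthread.h>

	static pthread_mutex_t lhash_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t lhash_wait = PTHREAD_COND_INITIALIZER;
	static int lhash_users;		/* readers registered for a long walk */

	/* Reader: register, walk without holding the lock, unregister. */
	static void listen_lock(void)
	{
		pthread_mutex_lock(&lhash_lock);
		lhash_users++;
		pthread_mutex_unlock(&lhash_lock);
	}

	static void listen_unlock(void)
	{
		pthread_mutex_lock(&lhash_lock);
		if (--lhash_users == 0)
			pthread_cond_broadcast(&lhash_wait);
		pthread_mutex_unlock(&lhash_lock);
	}

	/* Writer: as in tcp_listen_wlock(), sleep until every user is gone. */
	static void listen_wlock(void)
	{
		pthread_mutex_lock(&lhash_lock);
		while (lhash_users != 0)
			pthread_cond_wait(&lhash_wait, &lhash_lock);
		/* lock held, no registered users: safe to modify the table */
	}

	static void listen_wunlock(void)
	{
		pthread_mutex_unlock(&lhash_lock);
	}
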
static __inline__ void __tcp_v4_hash(struct sock *sk)
{
struct sock **skp;
+ rwlock_t *lock;
- if(sk->state == TCP_LISTEN)
+ BUG_TRAP(sk->pprev==NULL);
+ if(sk->state == TCP_LISTEN) {
skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
- else
- skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))];
-
+ lock = &tcp_lhash_lock;
+ tcp_listen_wlock();
+ } else {
+ skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))].chain;
+ lock = &tcp_ehash[sk->hashent].lock;
+ write_lock(lock);
+ }
if((sk->next = *skp) != NULL)
(*skp)->pprev = &sk->next;
*skp = sk;
@@ -330,30 +448,40 @@ static __inline__ void __tcp_v4_hash(struct sock *sk)
sk->prot->inuse++;
if(sk->prot->highestinuse < sk->prot->inuse)
sk->prot->highestinuse = sk->prot->inuse;
+ write_unlock(lock);
}
static void tcp_v4_hash(struct sock *sk)
{
if (sk->state != TCP_CLOSE) {
- SOCKHASH_LOCK_WRITE();
+ local_bh_disable();
__tcp_v4_hash(sk);
- SOCKHASH_UNLOCK_WRITE();
+ local_bh_enable();
}
}
-static void tcp_v4_unhash(struct sock *sk)
+void tcp_unhash(struct sock *sk)
{
- SOCKHASH_LOCK_WRITE();
+ rwlock_t *lock;
+
+ if (sk->state == TCP_LISTEN) {
+ local_bh_disable();
+ tcp_listen_wlock();
+ lock = &tcp_lhash_lock;
+ } else {
+ struct tcp_ehash_bucket *head = &tcp_ehash[sk->hashent];
+ lock = &head->lock;
+ write_lock_bh(&head->lock);
+ }
+
if(sk->pprev) {
if(sk->next)
sk->next->pprev = sk->pprev;
*sk->pprev = sk->next;
sk->pprev = NULL;
sk->prot->inuse--;
- tcp_reg_zap(sk);
- __tcp_put_port(sk);
}
- SOCKHASH_UNLOCK_WRITE();
+ write_unlock_bh(lock);
}
/* Don't inline this cruft. Here are some nice properties to
@@ -362,14 +490,13 @@ static void tcp_v4_unhash(struct sock *sk)
* connection. So always assume those are both wildcarded
* during the search since they can never be otherwise.
*/
-static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
+static struct sock *__tcp_v4_lookup_listener(struct sock *sk, u32 daddr, unsigned short hnum, int dif)
{
- struct sock *sk;
struct sock *result = NULL;
int score, hiscore;
hiscore=0;
- for(sk = tcp_listening_hash[tcp_lhashfn(hnum)]; sk; sk = sk->next) {
+ for(; sk; sk = sk->next) {
if(sk->num == hnum) {
__u32 rcv_saddr = sk->rcv_saddr;
@@ -395,42 +522,62 @@ static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int d
return result;
}
+/* Optimize the common listener case. */
+__inline__ struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
+{
+ struct sock *sk;
+
+ read_lock(&tcp_lhash_lock);
+ sk = tcp_listening_hash[tcp_lhashfn(hnum)];
+ if (sk) {
+ if (sk->num == hnum && sk->next == NULL)
+ goto sherry_cache;
+ sk = __tcp_v4_lookup_listener(sk, daddr, hnum, dif);
+ }
+ if (sk) {
+sherry_cache:
+ sock_hold(sk);
+ }
+ read_unlock(&tcp_lhash_lock);
+ return sk;
+}
+
/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
* we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
*
- * The sockhash lock must be held as a reader here.
+ * Local BH must be disabled here.
*/
static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
- u32 daddr, u16 dport, int dif)
+ u32 daddr, u16 hnum, int dif)
{
+ struct tcp_ehash_bucket *head;
TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
- __u16 hnum = ntohs(dport);
__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
struct sock *sk;
int hash;
- /* Check TCP register quick cache first. */
- sk = TCP_RHASH(sport);
- if(sk && TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
- goto hit;
-
/* Optimize here for direct hit, only listening connections can
* have wildcards anyways.
*/
hash = tcp_hashfn(daddr, hnum, saddr, sport);
- for(sk = tcp_ehash[hash]; sk; sk = sk->next) {
- if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) {
- if (sk->state == TCP_ESTABLISHED)
- TCP_RHASH(sport) = sk;
+ head = &tcp_ehash[hash];
+ read_lock(&head->lock);
+ for(sk = head->chain; sk; sk = sk->next) {
+ if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
goto hit; /* You sunk my battleship! */
- }
}
+
/* Must check for a TIME_WAIT'er before going to listener hash. */
- for(sk = tcp_ehash[hash+(tcp_ehash_size >> 1)]; sk; sk = sk->next)
+ for(sk = (head + tcp_ehash_size)->chain; sk; sk = sk->next)
if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
goto hit;
- sk = tcp_v4_lookup_listener(daddr, hnum, dif);
+ read_unlock(&head->lock);
+
+ return tcp_v4_lookup_listener(daddr, hnum, dif);
+
hit:
+ sock_hold(sk);
+ read_unlock(&head->lock);
return sk;
}
@@ -438,138 +585,137 @@ __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport
{
struct sock *sk;
- SOCKHASH_LOCK_READ();
- sk = __tcp_v4_lookup(saddr, sport, daddr, dport, dif);
- SOCKHASH_UNLOCK_READ();
+ local_bh_disable();
+ sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
+ local_bh_enable();
return sk;
}
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
-/* Cleaned up a little and adapted to new bind bucket scheme.
- * Oddly, this should increase performance here for
- * transparent proxy, as tests within the inner loop have
- * been eliminated. -DaveM
- */
-static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr,
- unsigned short rnum, unsigned long laddr,
- struct device *dev, unsigned short pnum,
- int dif)
+static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
- struct sock *s, *result = NULL;
- int badness = -1;
- u32 paddr = 0;
- unsigned short hnum = ntohs(num);
- unsigned short hpnum = ntohs(pnum);
- int firstpass = 1;
-
- if(dev && dev->ip_ptr) {
- struct in_device *idev = dev->ip_ptr;
-
- if(idev->ifa_list)
- paddr = idev->ifa_list->ifa_local;
- }
+ return secure_tcp_sequence_number(sk->saddr, sk->daddr,
+ skb->h.th->dest,
+ skb->h.th->source);
+}
- /* We must obtain the sockhash lock here, we are always
- * in BH context.
- */
- SOCKHASH_LOCK_READ_BH();
- {
- struct tcp_bind_bucket *tb = tcp_bhash[tcp_bhashfn(hnum)];
- for( ; (tb && tb->port != hnum); tb = tb->next)
- ;
- if(tb == NULL)
- goto next;
- s = tb->owners;
- }
-pass2:
- for(; s; s = s->bind_next) {
- int score = 0;
- if(s->rcv_saddr) {
- if((s->num != hpnum || s->rcv_saddr != paddr) &&
- (s->num != hnum || s->rcv_saddr != laddr))
- continue;
- score++;
- }
- if(s->daddr) {
- if(s->daddr != raddr)
- continue;
- score++;
- }
- if(s->dport) {
- if(s->dport != rnum)
- continue;
- score++;
- }
- if(s->bound_dev_if) {
- if(s->bound_dev_if != dif)
- continue;
- score++;
- }
- if(score == 4 && s->num == hnum) {
- result = s;
- goto gotit;
- } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) {
- result = s;
- badness = score;
+static int tcp_v4_check_established(struct sock *sk)
+{
+ u32 daddr = sk->rcv_saddr;
+ u32 saddr = sk->daddr;
+ int dif = sk->bound_dev_if;
+ TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
+ __u32 ports = TCP_COMBINED_PORTS(sk->dport, sk->num);
+ int hash = tcp_hashfn(daddr, sk->num, saddr, sk->dport);
+ struct tcp_ehash_bucket *head = &tcp_ehash[hash];
+ struct sock *sk2, **skp;
+#ifdef CONFIG_TCP_TW_RECYCLE
+ struct tcp_tw_bucket *tw;
+#endif
+
+ write_lock_bh(&head->lock);
+
+ /* Check TIME-WAIT sockets first. */
+ for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL;
+ skp = &sk2->next) {
+#ifdef CONFIG_TCP_TW_RECYCLE
+ tw = (struct tcp_tw_bucket*)sk2;
+#endif
+
+ if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
+#ifdef CONFIG_TCP_TW_RECYCLE
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ /* With PAWS, it is safe from the viewpoint
+ of data integrity. Even without PAWS it
+ is safe provided sequence spaces do not
+ overlap i.e. at data rates <= 80Mbit/sec.
+
+			   Actually, the idea is close to VJ's (rfc1332)
+			   one, only the timestamp cache is held not per host
+			   but per port pair, and the TW bucket is used
+			   as the state holder.
+ */
+ if (sysctl_tcp_tw_recycle && tw->ts_recent_stamp) {
+ if ((tp->write_seq = tw->snd_nxt + 2) == 0)
+ tp->write_seq = 1;
+ tp->ts_recent = tw->ts_recent;
+ tp->ts_recent_stamp = tw->ts_recent_stamp;
+ sock_hold(sk2);
+ skp = &head->chain;
+ goto unique;
+ } else
+#endif
+ goto not_unique;
}
}
-next:
- if(firstpass--) {
- struct tcp_bind_bucket *tb = tcp_bhash[tcp_bhashfn(hpnum)];
- for( ; (tb && tb->port != hpnum); tb = tb->next)
- ;
- if(tb) {
- s = tb->owners;
- goto pass2;
- }
+#ifdef CONFIG_TCP_TW_RECYCLE
+ tw = NULL;
+#endif
+
+ /* And established part... */
+ for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) {
+ if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
+ goto not_unique;
}
-gotit:
- SOCKHASH_UNLOCK_READ_BH();
- return result;
-}
-#endif /* CONFIG_IP_TRANSPARENT_PROXY */
-static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
-{
- return secure_tcp_sequence_number(sk->saddr, sk->daddr,
- skb->h.th->dest,
- skb->h.th->source);
+#ifdef CONFIG_TCP_TW_RECYCLE
+unique:
+#endif
+ BUG_TRAP(sk->pprev==NULL);
+ if ((sk->next = *skp) != NULL)
+ (*skp)->pprev = &sk->next;
+
+ *skp = sk;
+ sk->pprev = skp;
+ sk->prot->inuse++;
+ if(sk->prot->highestinuse < sk->prot->inuse)
+ sk->prot->highestinuse = sk->prot->inuse;
+ write_unlock_bh(&head->lock);
+
+#ifdef CONFIG_TCP_TW_RECYCLE
+ if (tw) {
+ /* Silly. Should hash-dance instead... */
+ local_bh_disable();
+ tcp_tw_deschedule(tw);
+ tcp_timewait_kill(tw);
+ local_bh_enable();
+
+ tcp_tw_put(tw);
+ }
+#endif
+ return 0;
+
+not_unique:
+ write_unlock_bh(&head->lock);
+ return -EADDRNOTAVAIL;
}
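
The 80Mbit/sec figure in the comment above presumably comes from the sequence-space argument: reuse is safe without PAWS as long as the new incarnation cannot generate sequence numbers that overlap the old one's during a 2*MSL quiet time. With MSL = 2 minutes (RFC793), half the 32-bit space gives 2^31 bytes / 240 s ≈ 8.9 MB/s, i.e. on the order of 70-80 Mbit/sec.
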
-/* Check that a TCP address is unique, don't allow multiple
- * connects to/from the same address. Actually we can optimize
- * quite a bit, since the socket about to connect is still
- * in TCP_CLOSE, a tcp_bind_bucket for the local port he will
- * use will exist, with a NULL owners list. So check for that.
- * The good_socknum and verify_bind scheme we use makes this
- * work.
+/* Hash a SYN-SENT socket into the established hash table after
+ * checking that it is unique. Note that without the kernel lock
+ * we MUST make these two operations atomic.
+ *
+ * Optimization: if it is bound and the tcp_bind_bucket has us as
+ * its only owner, we need not scan the established bucket.
*/
-static int tcp_v4_unique_address(struct sock *sk)
+
+int tcp_v4_hash_connecting(struct sock *sk)
{
- struct tcp_bind_bucket *tb;
unsigned short snum = sk->num;
- int retval = 1;
-
- /* Freeze the hash while we snoop around. */
- SOCKHASH_LOCK_READ();
- tb = tcp_bhash[tcp_bhashfn(snum)];
- for(; tb; tb = tb->next) {
- if(tb->port == snum && tb->owners != NULL) {
- /* Almost certainly the re-use port case, search the real hashes
- * so it actually scales.
- */
- sk = __tcp_v4_lookup(sk->daddr, sk->dport,
- sk->rcv_saddr, snum, sk->bound_dev_if);
- SOCKHASH_UNLOCK_READ();
+ struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(snum)];
+ struct tcp_bind_bucket *tb = (struct tcp_bind_bucket *)sk->prev;
- if((sk != NULL) && (sk->state != TCP_LISTEN))
- retval = 0;
- return retval;
- }
+ spin_lock_bh(&head->lock);
+ if (tb->owners == sk && sk->bind_next == NULL) {
+ __tcp_v4_hash(sk);
+ spin_unlock_bh(&head->lock);
+ return 0;
+ } else {
+ spin_unlock_bh(&head->lock);
+
+		/* No definite answer... Walk the established hash table */
+ return tcp_v4_check_established(sk);
}
- SOCKHASH_UNLOCK_READ();
- return retval;
}
/* This will initiate an outgoing connection. */
@@ -581,34 +727,26 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
struct rtable *rt;
u32 daddr, nexthop;
int tmp;
+ int err;
if (sk->state != TCP_CLOSE)
return(-EISCONN);
- /* Don't allow a double connect. */
- if (sk->daddr)
- return -EINVAL;
-
if (addr_len < sizeof(struct sockaddr_in))
return(-EINVAL);
- if (usin->sin_family != AF_INET) {
- static int complained;
- if (usin->sin_family)
- return(-EAFNOSUPPORT);
- if (!complained++)
- printk(KERN_DEBUG "%s forgot to set AF_INET in " __FUNCTION__ "\n", current->comm);
- }
+ if (usin->sin_family != AF_INET)
+ return(-EAFNOSUPPORT);
nexthop = daddr = usin->sin_addr.s_addr;
- if (sk->opt && sk->opt->srr) {
+ if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) {
if (daddr == 0)
return -EINVAL;
- nexthop = sk->opt->faddr;
+ nexthop = sk->protinfo.af_inet.opt->faddr;
}
tmp = ip_route_connect(&rt, nexthop, sk->saddr,
- RT_TOS(sk->ip_tos)|RTO_CONN|sk->localroute, sk->bound_dev_if);
+ RT_TOS(sk->protinfo.af_inet.tos)|RTO_CONN|sk->localroute, sk->bound_dev_if);
if (tmp < 0)
return tmp;
@@ -617,63 +755,73 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
return -ENETUNREACH;
}
- dst_release(xchg(&sk->dst_cache, rt));
+ __sk_dst_set(sk, &rt->u.dst);
+ if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr)
+ daddr = rt->rt_dst;
+
+ err = -ENOBUFS;
buff = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
0, GFP_KERNEL);
if (buff == NULL)
- return -ENOBUFS;
-
- /* Socket has no identity, so lock_sock() is useless. Also
- * since state==TCP_CLOSE (checked above) the socket cannot
- * possibly be in the hashes. TCP hash locking is only
- * needed while checking quickly for a unique address.
- * However, the socket does need to be (and is) locked
- * in tcp_connect().
- * Perhaps this addresses all of ANK's concerns. 8-) -DaveM
- */
- sk->dport = usin->sin_port;
- sk->daddr = rt->rt_dst;
- if (sk->opt && sk->opt->srr)
- sk->daddr = daddr;
+ goto failure;
+
if (!sk->saddr)
sk->saddr = rt->rt_src;
sk->rcv_saddr = sk->saddr;
- if (!tcp_v4_unique_address(sk)) {
- kfree_skb(buff);
- sk->daddr = 0;
- return -EADDRNOTAVAIL;
+ if (!sk->num) {
+ if (sk->prot->get_port(sk, 0)
+#ifdef CONFIG_TCP_TW_RECYCLE
+ && (!sysctl_tcp_tw_recycle ||
+ tcp_v4_tw_recycle(sk, daddr, usin->sin_port))
+#endif
+ ) {
+ kfree_skb(buff);
+ err = -EAGAIN;
+ goto failure;
+ }
+ sk->sport = htons(sk->num);
}
+#ifdef CONFIG_TCP_TW_RECYCLE
+ else if (tp->ts_recent_stamp && sk->daddr != daddr) {
+ /* Reset inherited state */
+ tp->ts_recent = 0;
+ tp->ts_recent_stamp = 0;
+ tp->write_seq = 0;
+ }
+#endif
- tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
- sk->sport, usin->sin_port);
+ sk->dport = usin->sin_port;
+ sk->daddr = daddr;
+
+ if (!tp->write_seq)
+ tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
+ sk->sport, usin->sin_port);
tp->ext_header_len = 0;
- if (sk->opt)
- tp->ext_header_len = sk->opt->optlen;
+ if (sk->protinfo.af_inet.opt)
+ tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;
- /* Reset mss clamp */
- tp->mss_clamp = ~0;
+ tp->mss_clamp = 536;
- if (!ip_dont_fragment(sk, &rt->u.dst) &&
- rt->u.dst.pmtu > 576 && rt->rt_dst != rt->rt_gateway) {
- /* Clamp mss at maximum of 536 and user_mss.
- Probably, user ordered to override tiny segment size
- in gatewayed case.
- */
- tp->mss_clamp = max(tp->user_mss, 536);
- }
+ err = tcp_connect(sk, buff);
+ if (err == 0)
+ return 0;
- tcp_connect(sk, buff, rt->u.dst.pmtu);
- return 0;
+failure:
+ __sk_dst_reset(sk);
+ sk->dport = 0;
+ return err;
}
static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len)
{
int retval = -EINVAL;
+ lock_sock(sk);
+
/* Do sanity checking for sendmsg/sendto/send. */
if (msg->msg_flags & ~(MSG_OOB|MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL))
goto out;
@@ -696,6 +844,7 @@ static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len)
retval = tcp_do_sendmsg(sk, msg);
out:
+ release_sock(sk);
return retval;
}
@@ -720,12 +869,27 @@ static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
for (req = prev->dl_next; req; req = req->dl_next) {
if (req->af.v4_req.rmt_addr == iph->saddr &&
req->af.v4_req.loc_addr == iph->daddr &&
- req->rmt_port == rport
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- && req->lcl_port == th->dest
-#endif
- ) {
- *prevp = prev;
+ req->rmt_port == rport &&
+ TCP_INET_FAMILY(req->class->family)) {
+ if (req->sk) {
+			/* Weird case: the connection was established
+			   and then killed by an RST before the user accepted
+			   it. This connection is dead, but we cannot
+			   kill the openreq, else accept() would block.
+
+			   accept() will collect this garbage,
+			   but such reqs must be ignored when talking
+			   to the network.
+ */
+ bh_lock_sock(req->sk);
+ BUG_TRAP(req->sk->lock.users==0);
+ if (req->sk->state == TCP_CLOSE) {
+ bh_unlock_sock(req->sk);
+ prev = req;
+ continue;
+ }
+ }
+ *prevp = prev;
return req;
}
prev = req;
@@ -739,6 +903,7 @@ static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
*/
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
{
+ struct dst_entry *dst;
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
@@ -748,23 +913,26 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned
if (sk->state == TCP_LISTEN)
return;
- bh_lock_sock(sk);
- if(sk->lock.users != 0)
- goto out;
-
/* We don't check in the destentry if pmtu discovery is forbidden
	 * on this route. We just assume that no packet-too-big packets
	 * are sent back when pmtu discovery is not active.
* There is a small race when the user changes this flag in the
* route, but I think that's acceptable.
*/
- if (sk->dst_cache == NULL)
- goto out;
+ if ((dst = __sk_dst_check(sk, 0)) == NULL)
+ return;
+
+ ip_rt_update_pmtu(dst, mtu);
+
+	/* Something is about to go wrong... Remember the soft error
+	 * for the case that this connection will not be able to recover.
+ */
+ if (mtu < dst->pmtu && ip_dont_fragment(sk, dst))
+ sk->err_soft = EMSGSIZE;
- ip_rt_update_pmtu(sk->dst_cache, mtu);
- if (sk->ip_pmtudisc != IP_PMTUDISC_DONT &&
- tp->pmtu_cookie > sk->dst_cache->pmtu) {
- tcp_sync_mss(sk, sk->dst_cache->pmtu);
+ if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT &&
+ tp->pmtu_cookie > dst->pmtu) {
+ tcp_sync_mss(sk, dst->pmtu);
/* Resend the TCP packet because it's
* clear that the old packet has been
@@ -773,8 +941,6 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned
*/
tcp_simple_retransmit(sk);
} /* else let the usual retransmit timer handle it */
-out:
- bh_unlock_sock(sk);
}
/*
@@ -791,7 +957,6 @@ out:
* A more general error queue to queue errors for later handling
* is probably better.
*
- * sk->err and sk->err_soft should be atomic_t.
*/
void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
@@ -822,37 +987,51 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
th = (struct tcphdr*)(dp+(iph->ihl<<2));
sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex);
- if (sk == NULL || sk->state == TCP_TIME_WAIT) {
+ if (sk == NULL) {
icmp_statistics.IcmpInErrors++;
- return;
+ return;
+ }
+ if (sk->state == TCP_TIME_WAIT) {
+ tcp_tw_put((struct tcp_tw_bucket*)sk);
+ return;
}
+ bh_lock_sock(sk);
+ /* If too many ICMPs get dropped on busy
+ * servers this needs to be solved differently.
+ */
+ if (sk->lock.users != 0)
+ net_statistics.LockDroppedIcmps++;
+
tp = &sk->tp_pinfo.af_tcp;
seq = ntohl(th->seq);
if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
net_statistics.OutOfWindowIcmps++;
- return;
+ goto out;
}
switch (type) {
case ICMP_SOURCE_QUENCH:
#ifndef OLD_SOURCE_QUENCH /* This is deprecated */
- tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
- tp->snd_cwnd = tp->snd_ssthresh;
- tp->snd_cwnd_cnt = 0;
- tp->high_seq = tp->snd_nxt;
+ if (sk->lock.users == 0) {
+ tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+ tp->snd_cwnd = tp->snd_ssthresh;
+ tp->snd_cwnd_cnt = 0;
+ tp->high_seq = tp->snd_nxt;
+ }
#endif
- return;
+ goto out;
case ICMP_PARAMETERPROB:
err = EPROTO;
break;
case ICMP_DEST_UNREACH:
if (code > NR_ICMP_UNREACH)
- return;
+ goto out;
if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
- do_pmtu_discovery(sk, iph, ntohs(skb->h.icmph->un.frag.mtu));
- return;
+ if (sk->lock.users == 0)
+ do_pmtu_discovery(sk, iph, ntohs(skb->h.icmph->un.frag.mtu));
+ goto out;
}
err = icmp_err_convert[code].errno;
@@ -861,12 +1040,15 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
err = EHOSTUNREACH;
break;
default:
- return;
+ goto out;
}
switch (sk->state) {
struct open_request *req, *prev;
case TCP_LISTEN:
+ if (sk->lock.users != 0)
+ goto out;
+
/* The final ACK of the handshake should be already
* handled in the new socket context, not here.
* Strictly speaking - an ICMP error for the final
@@ -874,28 +1056,15 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
* complicated right now.
*/
if (!no_flags && !th->syn && !th->ack)
- return;
-
- /* Prevent race conditions with accept() -
- * ICMP is unreliable.
- */
- bh_lock_sock(sk);
- if (sk->lock.users != 0) {
- net_statistics.LockDroppedIcmps++;
- /* If too many ICMPs get dropped on busy
- * servers this needs to be solved differently.
- */
- goto out_unlock;
- }
+ goto out;
req = tcp_v4_search_req(tp, iph, th, &prev);
if (!req)
- goto out_unlock;
- if (seq != req->snt_isn) {
- net_statistics.OutOfWindowIcmps++;
- goto out_unlock;
- }
- if (req->sk) {
+ goto out;
+
+ if (req->sk) {
+ struct sock *nsk = req->sk;
+
/*
* Already in ESTABLISHED and a big socket is created,
* set error code there.
@@ -903,9 +1072,23 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
* but only with the next operation on the socket after
* accept.
*/
+ sock_hold(nsk);
bh_unlock_sock(sk);
- sk = req->sk;
+ sock_put(sk);
+ sk = nsk;
+
+ BUG_TRAP(sk->lock.users == 0);
+ tp = &sk->tp_pinfo.af_tcp;
+ if (!between(seq, tp->snd_una, tp->snd_nxt)) {
+ net_statistics.OutOfWindowIcmps++;
+ goto out;
+ }
} else {
+ if (seq != req->snt_isn) {
+ net_statistics.OutOfWindowIcmps++;
+ goto out;
+ }
+
/*
* Still in SYN_RECV, just remove it silently.
* There is no good way to pass the error to the newly
@@ -914,23 +1097,30 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
*/
tp->syn_backlog--;
tcp_synq_unlink(tp, req, prev);
+ tcp_dec_slow_timer(TCP_SLT_SYNACK);
req->class->destructor(req);
tcp_openreq_free(req);
- out_unlock:
- bh_unlock_sock(sk);
- return;
+ goto out;
}
break;
case TCP_SYN_SENT:
- case TCP_SYN_RECV: /* Cannot happen */
+ case TCP_SYN_RECV: /* Cannot happen.
+ Actually, it can, e.g. if SYNs crossed.
+ */
if (!no_flags && !th->syn)
- return;
- tcp_statistics.TcpAttemptFails++;
- sk->err = err;
- sk->zapped = 1;
- mb();
- sk->error_report(sk);
- return;
+ goto out;
+ if (sk->lock.users == 0) {
+ tcp_statistics.TcpAttemptFails++;
+ sk->err = err;
+ /* Wake people up to see the error (see connect in sock.c) */
+ sk->error_report(sk);
+
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_done(sk);
+ } else {
+ sk->err_soft = err;
+ }
+ goto out;
}
/* If we've already connected we will keep trying
@@ -949,18 +1139,16 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
* --ANK (980905)
*/
- if (sk->ip_recverr) {
- /* This code isn't serialized with the socket code */
- /* ANK (980927) ... which is harmless now,
- sk->err's may be safely lost.
- */
+ if (sk->lock.users == 0 && sk->protinfo.af_inet.recverr) {
sk->err = err;
- mb();
- sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
+ sk->error_report(sk);
} else { /* Only an error on timeout */
sk->err_soft = err;
- mb();
}
+
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
}
/* This routine computes an IPv4 TCP checksum. */
@@ -995,14 +1183,8 @@ static void tcp_v4_send_reset(struct sk_buff *skb)
if (th->rst)
return;
- if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL) {
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- if (((struct rtable*)skb->dst)->rt_type == RTN_UNICAST)
- icmp_send(skb, ICMP_DEST_UNREACH,
- ICMP_PORT_UNREACH, 0);
-#endif
+ if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL)
return;
- }
/* Swap the send and the receive. */
memset(&rth, 0, sizeof(struct tcphdr));
@@ -1015,7 +1197,8 @@ static void tcp_v4_send_reset(struct sk_buff *skb)
rth.seq = th->ack_seq;
} else {
rth.ack = 1;
- rth.ack_seq = th->syn ? htonl(ntohl(th->seq)+1) : th->seq;
+ rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin
+ + skb->len - (th->doff<<2));
}
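A worked example of the ack_seq expression above (numbers are mine): at this point skb->len still covers the TCP header, so subtracting doff<<2 leaves the payload length, and SYN and FIN each consume one sequence number.

	/* Bare SYN, seq = 1000, doff = 5, skb->len = 20:
	 *	ack_seq = 1000 + 1 + 0 + 20 - 20 = 1001
	 * 100-byte data segment, seq = 5000, no SYN/FIN, skb->len = 120:
	 *	ack_seq = 5000 + 0 + 0 + 120 - 20 = 5100
	 */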
memset(&arg, 0, sizeof arg);
@@ -1035,71 +1218,69 @@ static void tcp_v4_send_reset(struct sk_buff *skb)
tcp_statistics.TcpOutRsts++;
}
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
-
-/*
- Seems, I never wrote nothing more stupid.
- I hope Gods will forgive me, but I cannot forgive myself 8)
- --ANK (981001)
+/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
+ outside of socket context, is certainly ugly. What can I do?
*/
-static struct sock *tcp_v4_search_proxy_openreq(struct sk_buff *skb)
+static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts)
{
- struct iphdr *iph = skb->nh.iph;
- struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4);
- struct sock *sk = NULL;
- int i;
-
- SOCKHASH_LOCK_READ();
- for (i=0; i<TCP_LHTABLE_SIZE; i++) {
- for(sk = tcp_listening_hash[i]; sk; sk = sk->next) {
- struct open_request *dummy;
- if (tcp_v4_search_req(&sk->tp_pinfo.af_tcp, iph,
- th, &dummy) &&
- (!sk->bound_dev_if ||
- sk->bound_dev_if == skb->dev->ifindex))
- goto out;
- }
+ struct tcphdr *th = skb->h.th;
+ struct {
+ struct tcphdr th;
+ u32 tsopt[3];
+ } rep;
+ struct ip_reply_arg arg;
+
+ memset(&rep.th, 0, sizeof(struct tcphdr));
+ memset(&arg, 0, sizeof arg);
+
+ arg.iov[0].iov_base = (unsigned char *)&rep;
+ arg.iov[0].iov_len = sizeof(rep.th);
+ arg.n_iov = 1;
+ if (ts) {
+ rep.tsopt[0] = __constant_htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_TIMESTAMP << 8) |
+ TCPOLEN_TIMESTAMP);
+ rep.tsopt[1] = htonl(tcp_time_stamp);
+ rep.tsopt[2] = htonl(ts);
+ arg.iov[0].iov_len = sizeof(rep);
}
-out:
- SOCKHASH_UNLOCK_READ();
- return sk;
-}
-/*
- * Check whether a received TCP packet might be for one of our
- * connections.
- */
+ /* Swap the send and the receive. */
+ rep.th.dest = th->source;
+ rep.th.source = th->dest;
+ rep.th.doff = arg.iov[0].iov_len/4;
+ rep.th.seq = htonl(seq);
+ rep.th.ack_seq = htonl(ack);
+ rep.th.ack = 1;
+ rep.th.window = htons(win);
-int tcp_chkaddr(struct sk_buff *skb)
-{
- struct iphdr *iph = skb->nh.iph;
- struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4);
- struct sock *sk;
+ arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
+ skb->nh.iph->saddr, /*XXX*/
+ arg.iov[0].iov_len,
+ IPPROTO_TCP,
+ 0);
+ arg.csumoffset = offsetof(struct tcphdr, check) / 2;
- sk = tcp_v4_lookup(iph->saddr, th->source, iph->daddr,
- th->dest, skb->dev->ifindex);
+ ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
- if (!sk)
- return tcp_v4_search_proxy_openreq(skb) != NULL;
+ tcp_statistics.TcpOutSegs++;
+}
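For reference, the 12-byte tsopt block built above has the following wire layout (TCPOPT_NOP = 1, TCPOPT_TIMESTAMP = 8, TCPOLEN_TIMESTAMP = 10; the two leading NOPs keep the option 4-byte aligned):

	/*  byte 0   byte 1   byte 2   byte 3
	 *  +------+------+------+------+
	 *  | 0x01 | 0x01 | 0x08 | 0x0a |   NOP NOP kind=TS len=10
	 *  +------+------+------+------+
	 *  |    TSval = tcp_time_stamp |   (htonl)
	 *  +---------------------------+
	 *  |    TSecr = ts             |   (htonl)
	 *  +---------------------------+
	 */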
- if (sk->state == TCP_LISTEN) {
- struct open_request *dummy;
- if (tcp_v4_search_req(&sk->tp_pinfo.af_tcp, skb->nh.iph,
- th, &dummy) &&
- (!sk->bound_dev_if ||
- sk->bound_dev_if == skb->dev->ifindex))
- return 1;
- }
+static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
- /* 0 means accept all LOCAL addresses here, not all the world... */
+ tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt, 0, tw->ts_recent);
- if (sk->rcv_saddr == 0)
- return 0;
+ tcp_tw_put(tw);
+}
- return 1;
+static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
+{
+ tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd, req->ts_recent);
}
-#endif
/*
* Send a SYN-ACK after having received an ACK.
@@ -1111,7 +1292,6 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
struct rtable *rt;
struct ip_options *opt;
struct sk_buff * skb;
- int mss;
/* First, grab a route. */
opt = req->af.v4_req.opt;
@@ -1119,7 +1299,7 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
opt->faddr :
req->af.v4_req.rmt_addr),
req->af.v4_req.loc_addr,
- RT_TOS(sk->ip_tos) | RTO_CONN | sk->localroute,
+ RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
sk->bound_dev_if)) {
ip_statistics.IpOutNoRoutes++;
return;
@@ -1130,16 +1310,11 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
return;
}
- mss = rt->u.dst.pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
+ skb = tcp_make_synack(sk, &rt->u.dst, req);
- skb = tcp_make_synack(sk, &rt->u.dst, req, mss);
if (skb) {
struct tcphdr *th = skb->h.th;
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- th->source = req->lcl_port; /* LVE */
-#endif
-
th->check = tcp_v4_check(th, skb->len,
req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
csum_partial((char *)th, skb->len, skb->csum));
@@ -1203,7 +1378,9 @@ tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
int sysctl_max_syn_backlog = 128;
struct or_calltable or_ipv4 = {
+ PF_INET,
tcp_v4_send_synack,
+ tcp_v4_or_send_ack,
tcp_v4_or_free,
tcp_v4_send_reset
};
@@ -1211,23 +1388,20 @@ struct or_calltable or_ipv4 = {
#define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */
#define BACKLOGMAX(sk) sysctl_max_syn_backlog
-int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, __u32 isn)
+int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct tcp_opt tp;
struct open_request *req;
struct tcphdr *th = skb->h.th;
__u32 saddr = skb->nh.iph->saddr;
__u32 daddr = skb->nh.iph->daddr;
+ __u32 isn = TCP_SKB_CB(skb)->when;
#ifdef CONFIG_SYN_COOKIES
int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif
- /* If the socket is dead, don't accept the connection. */
- if (sk->dead)
- goto dead;
-
/* Never answer to SYNs send to broadcast or multicast */
if (((struct rtable *)skb->dst)->rt_flags &
(RTCF_BROADCAST|RTCF_MULTICAST))
@@ -1236,7 +1410,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, __u32 isn)
/* XXX: Check against a global syn pool counter. */
if (BACKLOG(sk) > BACKLOGMAX(sk)) {
#ifdef CONFIG_SYN_COOKIES
- if (sysctl_tcp_syncookies) {
+ if (sysctl_tcp_syncookies && !isn) {
syn_flood_warning(skb);
want_cookie = 1;
} else
@@ -1258,30 +1432,29 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, __u32 isn)
req->rcv_isn = TCP_SKB_CB(skb)->seq;
tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
- tp.mss_clamp = 65535;
+ tp.mss_clamp = 536;
+ tp.user_mss = sk->tp_pinfo.af_tcp.user_mss;
+
tcp_parse_options(NULL, th, &tp, want_cookie);
- if (tp.mss_clamp == 65535)
- tp.mss_clamp = 576 - sizeof(struct iphdr) - sizeof(struct iphdr);
- if (sk->tp_pinfo.af_tcp.user_mss && sk->tp_pinfo.af_tcp.user_mss < tp.mss_clamp)
- tp.mss_clamp = sk->tp_pinfo.af_tcp.user_mss;
req->mss = tp.mss_clamp;
-
- if (tp.saw_tstamp)
- req->ts_recent = tp.rcv_tsval;
+ req->ts_recent = tp.saw_tstamp ? tp.rcv_tsval : 0;
req->tstamp_ok = tp.tstamp_ok;
req->sack_ok = tp.sack_ok;
req->snd_wscale = tp.snd_wscale;
req->wscale_ok = tp.wscale_ok;
req->rmt_port = th->source;
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- req->lcl_port = th->dest ; /* LVE */
-#endif
req->af.v4_req.loc_addr = daddr;
req->af.v4_req.rmt_addr = saddr;
/* Note that we ignore the isn passed from the TIME_WAIT
* state here. That's the price we pay for cookies.
+ *
+ * RED-PEN. The price is high... We then cannot kill TIME-WAIT
+ * and should reject the connection attempt: duplicates with a random
+ * sequence number can corrupt data. Right?
+ * I disabled sending a cookie for a request matching a time-wait
+ * bucket.
*/
if (want_cookie)
isn = cookie_v4_init_sequence(sk, skb, &req->mss);
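On the 536 default a few lines up (my gloss, not part of the patch): it is RFC 1122's default effective send MSS.

	/* 536 = 576 - 40: the minimum reassembly buffer size every IPv4
	 * host must accept, minus 20 bytes of IP header and 20 bytes of
	 * TCP header. A SYN carrying no MSS option is clamped to this.
	 */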
@@ -1309,11 +1482,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, __u32 isn)
return 0;
-dead:
- SOCK_DEBUG(sk, "Reset on %p: Connect on dead socket.\n",sk);
- tcp_statistics.TcpAttemptFails++;
- return -ENOTCONN; /* send reset */
-
dropbacklog:
if (!want_cookie)
BACKLOG(sk)--;
@@ -1322,147 +1490,6 @@ drop:
return 0;
}
-/* This is not only more efficient than what we used to do, it eliminates
- * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
- *
- * This function wants to be moved to a common for IPv[46] file. --ANK
- */
-struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
-{
- struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0);
-
- if(newsk != NULL) {
- struct tcp_opt *newtp;
-#ifdef CONFIG_FILTER
- struct sk_filter *filter;
-#endif
-
- memcpy(newsk, sk, sizeof(*newsk));
- newsk->state = TCP_SYN_RECV;
-
- /* Clone the TCP header template */
- newsk->dport = req->rmt_port;
-
- sock_lock_init(newsk);
-
- atomic_set(&newsk->rmem_alloc, 0);
- skb_queue_head_init(&newsk->receive_queue);
- atomic_set(&newsk->wmem_alloc, 0);
- skb_queue_head_init(&newsk->write_queue);
- atomic_set(&newsk->omem_alloc, 0);
-
- newsk->done = 0;
- newsk->proc = 0;
- newsk->backlog.head = newsk->backlog.tail = NULL;
- skb_queue_head_init(&newsk->error_queue);
- newsk->write_space = tcp_write_space;
-#ifdef CONFIG_FILTER
- if ((filter = newsk->filter) != NULL)
- sk_filter_charge(newsk, filter);
-#endif
-
- /* Now setup tcp_opt */
- newtp = &(newsk->tp_pinfo.af_tcp);
- newtp->pred_flags = 0;
- newtp->rcv_nxt = req->rcv_isn + 1;
- newtp->snd_nxt = req->snt_isn + 1;
- newtp->snd_una = req->snt_isn + 1;
- newtp->srtt = 0;
- newtp->ato = 0;
- newtp->snd_wl1 = req->rcv_isn;
- newtp->snd_wl2 = req->snt_isn;
-
- /* RFC1323: The window in SYN & SYN/ACK segments
- * is never scaled.
- */
- newtp->snd_wnd = ntohs(skb->h.th->window);
-
- newtp->max_window = newtp->snd_wnd;
- newtp->pending = 0;
- newtp->retransmits = 0;
- newtp->last_ack_sent = req->rcv_isn + 1;
- newtp->backoff = 0;
- newtp->mdev = TCP_TIMEOUT_INIT;
-
- /* So many TCP implementations out there (incorrectly) count the
- * initial SYN frame in their delayed-ACK and congestion control
- * algorithms that we must have the following bandaid to talk
- * efficiently to them. -DaveM
- */
- newtp->snd_cwnd = 2;
-
- newtp->rto = TCP_TIMEOUT_INIT;
- newtp->packets_out = 0;
- newtp->fackets_out = 0;
- newtp->retrans_out = 0;
- newtp->high_seq = 0;
- newtp->snd_ssthresh = 0x7fffffff;
- newtp->snd_cwnd_cnt = 0;
- newtp->dup_acks = 0;
- newtp->delayed_acks = 0;
- init_timer(&newtp->retransmit_timer);
- newtp->retransmit_timer.function = &tcp_retransmit_timer;
- newtp->retransmit_timer.data = (unsigned long) newsk;
- init_timer(&newtp->delack_timer);
- newtp->delack_timer.function = &tcp_delack_timer;
- newtp->delack_timer.data = (unsigned long) newsk;
- skb_queue_head_init(&newtp->out_of_order_queue);
- newtp->send_head = newtp->retrans_head = NULL;
- newtp->rcv_wup = req->rcv_isn + 1;
- newtp->write_seq = req->snt_isn + 1;
- newtp->copied_seq = req->rcv_isn + 1;
-
- newtp->saw_tstamp = 0;
- newtp->mss_clamp = req->mss;
-
- init_timer(&newtp->probe_timer);
- newtp->probe_timer.function = &tcp_probe_timer;
- newtp->probe_timer.data = (unsigned long) newsk;
- newtp->probes_out = 0;
- newtp->syn_seq = req->rcv_isn;
- newtp->fin_seq = req->rcv_isn;
- newtp->urg_data = 0;
- tcp_synq_init(newtp);
- newtp->syn_backlog = 0;
- if (skb->len >= 536)
- newtp->last_seg_size = skb->len;
-
- /* Back to base struct sock members. */
- newsk->err = 0;
- newsk->ack_backlog = 0;
- newsk->max_ack_backlog = SOMAXCONN;
- newsk->priority = 0;
-
- /* IP layer stuff */
- newsk->timeout = 0;
- init_timer(&newsk->timer);
- newsk->timer.function = &net_timer;
- newsk->timer.data = (unsigned long) newsk;
- newsk->socket = NULL;
-
- newtp->tstamp_ok = req->tstamp_ok;
- if((newtp->sack_ok = req->sack_ok) != 0)
- newtp->num_sacks = 0;
- newtp->window_clamp = req->window_clamp;
- newtp->rcv_wnd = req->rcv_wnd;
- newtp->wscale_ok = req->wscale_ok;
- if (newtp->wscale_ok) {
- newtp->snd_wscale = req->snd_wscale;
- newtp->rcv_wscale = req->rcv_wscale;
- } else {
- newtp->snd_wscale = newtp->rcv_wscale = 0;
- newtp->window_clamp = min(newtp->window_clamp,65535);
- }
- if (newtp->tstamp_ok) {
- newtp->ts_recent = req->ts_recent;
- newtp->ts_recent_stamp = tcp_time_stamp;
- newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
- } else {
- newtp->tcp_header_len = sizeof(struct tcphdr);
- }
- }
- return newsk;
-}
/*
* The three way handshake has completed - we got a valid synack -
@@ -1483,23 +1510,13 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
if (ip_route_output(&rt,
opt && opt->srr ? opt->faddr : req->af.v4_req.rmt_addr,
- req->af.v4_req.loc_addr, sk->ip_tos|RTO_CONN, 0))
+ req->af.v4_req.loc_addr, sk->protinfo.af_inet.tos|RTO_CONN, 0))
return NULL;
dst = &rt->u.dst;
}
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- /* The new socket created for transparent proxy may fall
- * into a non-existed bind bucket because sk->num != newsk->num.
- * Ensure existance of the bucket now. The placement of the check
- * later will require to destroy just created newsk in the case of fail.
- * 1998/04/22 Andrey V. Savochkin <saw@msu.ru>
- */
- if (tcp_bucket_check(ntohs(skb->h.th->dest)))
- goto exit;
-#endif
newsk = tcp_create_openreq_child(sk, req, skb);
- if (!newsk)
+ if (!newsk)
goto exit;
sk->tp_pinfo.af_tcp.syn_backlog--;
@@ -1511,30 +1528,25 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newsk->daddr = req->af.v4_req.rmt_addr;
newsk->saddr = req->af.v4_req.loc_addr;
newsk->rcv_saddr = req->af.v4_req.loc_addr;
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- newsk->num = ntohs(skb->h.th->dest);
- newsk->sport = req->lcl_port;
-#endif
- newsk->opt = req->af.v4_req.opt;
+ newsk->protinfo.af_inet.opt = req->af.v4_req.opt;
+ newsk->protinfo.af_inet.mc_index = ((struct rtable*)skb->dst)->rt_iif;
+ newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl;
newtp->ext_header_len = 0;
- if (newsk->opt)
- newtp->ext_header_len = newsk->opt->optlen;
+ if (newsk->protinfo.af_inet.opt)
+ newtp->ext_header_len = newsk->protinfo.af_inet.opt->optlen;
tcp_sync_mss(newsk, dst->pmtu);
- newtp->rcv_mss = newtp->mss_clamp;
+ tcp_initialize_rcv_mss(newsk);
+
+ if (newsk->rcvbuf < (3 * (dst->advmss+40+MAX_HEADER+15)))
+ newsk->rcvbuf = min ((3 * (dst->advmss+40+MAX_HEADER+15)), sysctl_rmem_max);
+ if (newsk->sndbuf < (3 * (newtp->mss_clamp+40+MAX_HEADER+15)))
+ newsk->sndbuf = min ((3 * (newtp->mss_clamp+40+MAX_HEADER+15)), sysctl_wmem_max);
- /* It would be better to use newtp->mss_clamp here */
- if (newsk->rcvbuf < (3 * newtp->pmtu_cookie))
- newsk->rcvbuf = min ((3 * newtp->pmtu_cookie), sysctl_rmem_max);
- if (newsk->sndbuf < (3 * newtp->pmtu_cookie))
- newsk->sndbuf = min ((3 * newtp->pmtu_cookie), sysctl_wmem_max);
+ bh_lock_sock(newsk);
- SOCKHASH_LOCK_WRITE();
__tcp_v4_hash(newsk);
__tcp_inherit_port(sk, newsk);
- SOCKHASH_UNLOCK_WRITE();
-
- sk->data_ready(sk, 0); /* Deliver SIGIO */
return newsk;
@@ -1543,62 +1555,51 @@ exit:
return NULL;
}
-static void tcp_v4_rst_req(struct sock *sk, struct sk_buff *skb)
+
+static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
{
- struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
struct open_request *req, *prev;
+ struct tcphdr *th = skb->h.th;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- req = tcp_v4_search_req(tp,skb->nh.iph, skb->h.th, &prev);
- if (!req)
- return;
- /* Sequence number check required by RFC793 */
- if (before(TCP_SKB_CB(skb)->seq, req->rcv_isn) ||
- after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
- return;
- tcp_synq_unlink(tp, req, prev);
- (req->sk ? sk->ack_backlog : tp->syn_backlog)--;
- req->class->destructor(req);
- tcp_openreq_free(req);
+ /* Find possible connection requests. */
+ req = tcp_v4_search_req(tp, skb->nh.iph, th, &prev);
+ if (req)
+ return tcp_check_req(sk, skb, req, prev);
- net_statistics.EmbryonicRsts++;
+#ifdef CONFIG_SYN_COOKIES
+ if (!th->rst && (th->syn || th->ack))
+ sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
+#endif
+ return sk;
}
-/* Check for embryonic sockets (open_requests) We check packets with
- * only the SYN bit set against the open_request queue too: This
- * increases connection latency a bit, but is required to detect
- * retransmitted SYNs.
- */
-static inline struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
+static int tcp_csum_verify(struct sk_buff *skb)
{
- struct tcphdr *th = skb->h.th;
- u32 flg = ((u32 *)th)[3];
-
- /* Check for RST */
- if (flg & __constant_htonl(0x00040000)) {
- tcp_v4_rst_req(sk, skb);
- return NULL;
- }
-
- /* Check for SYN|ACK */
- flg &= __constant_htonl(0x00120000);
- if (flg) {
- struct open_request *req, *dummy;
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
-
- /* Find possible connection requests. */
- req = tcp_v4_search_req(tp, skb->nh.iph, th, &dummy);
- if (req) {
- sk = tcp_check_req(sk, skb, req);
- }
-#ifdef CONFIG_SYN_COOKIES
- else {
- sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
+ switch (skb->ip_summed) {
+ case CHECKSUM_NONE:
+ skb->csum = csum_partial((char *)skb->h.th, skb->len, 0);
+ case CHECKSUM_HW:
+ if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,skb->nh.iph->daddr,skb->csum)) {
+ NETDEBUG(printk(KERN_DEBUG "TCPv4 bad checksum "
+ "from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, "
+ "len=%d/%d\n",
+ NIPQUAD(skb->nh.iph->saddr),
+ ntohs(skb->h.th->source),
+ NIPQUAD(skb->nh.iph->daddr),
+ ntohs(skb->h.th->dest),
+ skb->len,
+ ntohs(skb->nh.iph->tot_len)));
+ return 1;
}
-#endif
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ default:
+ /* CHECKSUM_UNNECESSARY */
}
- return sk;
+ return 0;
}
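For readers who want the tcp_v4_check() arithmetic spelled out, here is a minimal portable sketch (not the kernel's folded/asm implementation; names and types are mine). It assumes addresses in host byte order and a segment buffer that includes the TCP header with its check field filled in, so a valid segment returns 0:

	#include <stdint.h>
	#include <stddef.h>

	static uint16_t tcp_checksum_sketch(uint32_t saddr, uint32_t daddr,
					    const uint8_t *seg, size_t len)
	{
		uint32_t sum = 0;
		size_t i;

		/* IPv4 pseudo-header: saddr, daddr, zero, proto, TCP length. */
		sum += (saddr >> 16) + (saddr & 0xffff);
		sum += (daddr >> 16) + (daddr & 0xffff);
		sum += 6;		/* IPPROTO_TCP */
		sum += len;		/* TCP header + data, in bytes */

		/* The segment itself, as big-endian 16-bit words. */
		for (i = 0; i + 1 < len; i += 2)
			sum += (seg[i] << 8) | seg[i + 1];
		if (len & 1)
			sum += seg[len - 1] << 8;

		/* Fold the carries and complement; 0 means the segment is good. */
		while (sum >> 16)
			sum = (sum & 0xffff) + (sum >> 16);
		return (uint16_t)~sum;
	}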
+
/* The socket must have it's spinlock held when we get
* here.
*
@@ -1609,7 +1610,6 @@ static inline struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
*/
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
- int need_unlock = 0;
#ifdef CONFIG_FILTER
struct sk_filter *filter = sk->filter;
if (filter && sk_filter(skb, filter))
@@ -1624,16 +1624,22 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
skb_set_owner_r(skb, sk);
if (sk->state == TCP_ESTABLISHED) { /* Fast path */
+ /* Ready to move deeper ... */
+ if (tcp_csum_verify(skb))
+ goto csum_err;
if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
goto reset;
return 0;
}
+ if (tcp_csum_verify(skb))
+ goto csum_err;
+
if (sk->state == TCP_LISTEN) {
struct sock *nsk;
-
+
nsk = tcp_v4_hnd_req(sk, skb);
- if (!nsk)
+ if (!nsk)
goto discard;
/*
@@ -1642,21 +1648,35 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
* the new socket..
*/
if (nsk != sk) {
- bh_lock_sock(nsk);
- if (nsk->lock.users != 0) {
- skb_orphan(skb);
- sk_add_backlog(nsk, skb);
- bh_unlock_sock(nsk);
- return 0;
- }
- need_unlock = 1;
- sk = nsk;
+ int ret;
+ int state = nsk->state;
+
+ skb_orphan(skb);
+
+ BUG_TRAP(nsk->lock.users == 0);
+ skb_set_owner_r(skb, nsk);
+ ret = tcp_rcv_state_process(nsk, skb, skb->h.th, skb->len);
+
+ /* Wake up the parent and send SIGIO if this packet
+ changed the socket state from SYN-RECV.
+
+ It still looks ugly, but it is much better
+ than the miraculous double wakeup in syn_recv_sock()
+ and tcp_rcv_state_process().
+ */
+ if (state == TCP_SYN_RECV && nsk->state != state)
+ sk->data_ready(sk, 0);
+
+ bh_unlock_sock(nsk);
+ if (ret)
+ goto reset;
+ return 0;
}
}
if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
goto reset;
- goto out_maybe_unlock;
+ return 0;
reset:
tcp_v4_send_reset(skb);
@@ -1667,10 +1687,11 @@ discard:
* might be destroyed here. This current version compiles correctly,
* but you have been warned.
*/
-out_maybe_unlock:
- if(need_unlock)
- bh_unlock_sock(sk);
return 0;
+
+csum_err:
+ tcp_statistics.TcpInErrs++;
+ goto discard;
}
/*
@@ -1697,57 +1718,23 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
if (len < sizeof(struct tcphdr))
goto bad_packet;
- /* Try to use the device checksum if provided. */
- switch (skb->ip_summed) {
- case CHECKSUM_NONE:
- skb->csum = csum_partial((char *)th, len, 0);
- case CHECKSUM_HW:
- if (tcp_v4_check(th,len,skb->nh.iph->saddr,skb->nh.iph->daddr,skb->csum)) {
- NETDEBUG(printk(KERN_DEBUG "TCPv4 bad checksum "
- "from %d.%d.%d.%d:%04x to %d.%d.%d.%d:%04x, "
- "len=%d/%d/%d\n",
- NIPQUAD(skb->nh.iph->saddr),
- ntohs(th->source),
- NIPQUAD(skb->nh.iph->daddr),
- ntohs(th->dest),
- len, skb->len,
- ntohs(skb->nh.iph->tot_len)));
- bad_packet:
- tcp_statistics.TcpInErrs++;
- goto discard_it;
- }
- default:
- /* CHECKSUM_UNNECESSARY */
- }
-
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- if (IPCB(skb)->redirport)
- sk = tcp_v4_proxy_lookup(th->dest, skb->nh.iph->saddr, th->source,
- skb->nh.iph->daddr, skb->dev,
- IPCB(skb)->redirport, skb->dev->ifindex);
- else {
-#endif
- SOCKHASH_LOCK_READ_BH();
- sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
- skb->nh.iph->daddr, th->dest, skb->dev->ifindex);
- SOCKHASH_UNLOCK_READ_BH();
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- if (!sk)
- sk = tcp_v4_search_proxy_openreq(skb);
- }
-#endif
- if (!sk)
- goto no_tcp_socket;
- if(!ipsec_sk_policy(sk,skb))
- goto discard_it;
-
TCP_SKB_CB(skb)->seq = ntohl(th->seq);
TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
len - th->doff*4);
TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
-
+ TCP_SKB_CB(skb)->when = 0;
skb->used = 0;
+ sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
+ skb->nh.iph->daddr, ntohs(th->dest), skb->dev->ifindex);
+
+ if (!sk)
+ goto no_tcp_socket;
+
+process:
+ if(!ipsec_sk_policy(sk,skb))
+ goto discard_and_relse;
+
if (sk->state == TCP_TIME_WAIT)
goto do_time_wait;
@@ -1759,45 +1746,83 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
sk_add_backlog(sk, skb);
bh_unlock_sock(sk);
+ sock_put(sk);
+
return ret;
no_tcp_socket:
- tcp_v4_send_reset(skb);
+ if (tcp_csum_verify(skb)) {
+bad_packet:
+ tcp_statistics.TcpInErrs++;
+ } else {
+ tcp_v4_send_reset(skb);
+ }
discard_it:
/* Discard frame. */
kfree_skb(skb);
return 0;
+discard_and_relse:
+ sock_put(sk);
+ goto discard_it;
+
do_time_wait:
- if(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
- skb, th, skb->len))
+ if (tcp_csum_verify(skb)) {
+ tcp_statistics.TcpInErrs++;
+ goto discard_and_relse;
+ }
+ switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
+ skb, th, skb->len)) {
+ case TCP_TW_SYN:
+ {
+ struct sock *sk2;
+
+ sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), skb->dev->ifindex);
+ if (sk2 != NULL) {
+ tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
+ tcp_timewait_kill((struct tcp_tw_bucket *)sk);
+ tcp_tw_put((struct tcp_tw_bucket *)sk);
+ sk = sk2;
+ goto process;
+ }
+ /* Fall through to ACK */
+ }
+ case TCP_TW_ACK:
+ tcp_v4_timewait_ack(sk, skb);
+ break;
+ case TCP_TW_RST:
goto no_tcp_socket;
+ case TCP_TW_SUCCESS:
+ }
goto discard_it;
}
static void __tcp_v4_rehash(struct sock *sk)
{
- struct sock **skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))];
+ struct tcp_ehash_bucket *oldhead = &tcp_ehash[sk->hashent];
+ struct tcp_ehash_bucket *head = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))];
+ struct sock **skp = &head->chain;
- SOCKHASH_LOCK_WRITE();
+ write_lock_bh(&oldhead->lock);
if(sk->pprev) {
if(sk->next)
sk->next->pprev = sk->pprev;
*sk->pprev = sk->next;
sk->pprev = NULL;
- tcp_reg_zap(sk);
}
+ write_unlock(&oldhead->lock);
+ write_lock(&head->lock);
if((sk->next = *skp) != NULL)
(*skp)->pprev = &sk->next;
*skp = sk;
sk->pprev = skp;
- SOCKHASH_UNLOCK_WRITE();
+ write_unlock_bh(&head->lock);
}
int tcp_v4_rebuild_header(struct sock *sk)
{
- struct rtable *rt = (struct rtable *)sk->dst_cache;
+ struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
__u32 new_saddr;
int want_rewrite = sysctl_ip_dynaddr && sk->state == TCP_SYN_SENT;
@@ -1816,7 +1841,7 @@ int tcp_v4_rebuild_header(struct sock *sk)
/* Query new route using another rt buffer */
tmp = ip_route_connect(&new_rt, rt->rt_dst, 0,
- RT_TOS(sk->ip_tos)|sk->localroute,
+ RT_TOS(sk->protinfo.af_inet.tos)|sk->localroute,
sk->bound_dev_if);
/* Only useful if different source addrs */
@@ -1825,11 +1850,10 @@ int tcp_v4_rebuild_header(struct sock *sk)
* Only useful if different source addrs
*/
if (new_rt->rt_src != old_saddr ) {
- dst_release(sk->dst_cache);
- sk->dst_cache = &new_rt->u.dst;
+ __sk_dst_set(sk, &new_rt->u.dst);
rt = new_rt;
goto do_rewrite;
- }
+ }
dst_release(&new_rt->u.dst);
}
}
@@ -1841,7 +1865,7 @@ int tcp_v4_rebuild_header(struct sock *sk)
sk->error_report(sk);
return -1;
}
- dst_release(xchg(&sk->dst_cache, &rt->u.dst));
+ __sk_dst_set(sk, &rt->u.dst);
}
return 0;
@@ -1872,6 +1896,9 @@ do_rewrite:
/* XXX The only ugly spot where we need to
* XXX really change the sockets identity after
* XXX it has entered the hashes. -DaveM
+ *
+ * Besides that, it does not check for connection
+ * uniqueness. Wait for troubles.
*/
__tcp_v4_rehash(sk);
}
@@ -1879,12 +1906,6 @@ do_rewrite:
return 0;
}
-static struct sock * tcp_v4_get_sock(struct sk_buff *skb, struct tcphdr *th)
-{
- return tcp_v4_lookup(skb->nh.iph->saddr, th->source,
- skb->nh.iph->daddr, th->dest, skb->dev->ifindex);
-}
-
static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
{
struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
@@ -1900,7 +1921,7 @@ struct tcp_func ipv4_specific = {
tcp_v4_rebuild_header,
tcp_v4_conn_request,
tcp_v4_syn_recv_sock,
- tcp_v4_get_sock,
+ tcp_v4_hash_connecting,
sizeof(struct iphdr),
ip_setsockopt,
@@ -1919,9 +1940,8 @@ static int tcp_v4_init_sock(struct sock *sk)
skb_queue_head_init(&tp->out_of_order_queue);
tcp_init_xmit_timers(sk);
- tp->rto = TCP_TIMEOUT_INIT; /*TCP_WRITE_TIME*/
+ tp->rto = TCP_TIMEOUT_INIT;
tp->mdev = TCP_TIMEOUT_INIT;
- tp->mss_clamp = ~0;
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
@@ -1935,10 +1955,11 @@ static int tcp_v4_init_sock(struct sock *sk)
*/
tp->snd_cwnd_cnt = 0;
tp->snd_ssthresh = 0x7fffffff; /* Infinity */
+ tp->snd_cwnd_clamp = ~0;
+ tp->mss_cache = 536;
sk->state = TCP_CLOSE;
sk->max_ack_backlog = SOMAXCONN;
- tp->rcv_mss = 536;
sk->write_space = tcp_write_space;
@@ -1953,20 +1974,14 @@ static int tcp_v4_init_sock(struct sock *sk)
static int tcp_v4_destroy_sock(struct sock *sk)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- struct sk_buff *skb;
tcp_clear_xmit_timers(sk);
- if (sk->keepopen)
- tcp_dec_slow_timer(TCP_SLT_KEEPALIVE);
-
/* Cleanup up the write buffer. */
- while((skb = __skb_dequeue(&sk->write_queue)) != NULL)
- kfree_skb(skb);
+ __skb_queue_purge(&sk->write_queue);
/* Cleans up our, hopefully empty, out_of_order_queue. */
- while((skb = __skb_dequeue(&tp->out_of_order_queue)) != NULL)
- kfree_skb(skb);
+ __skb_queue_purge(&tp->out_of_order_queue);
/* Clean up a referenced TCP bind bucket, this only happens if a
* port is allocated for a socket, but it never fully connects.
@@ -1981,7 +1996,7 @@ static int tcp_v4_destroy_sock(struct sock *sk)
static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i)
{
sprintf(tmpbuf, "%4d: %08lX:%04X %08lX:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u",
+ " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
i,
(long unsigned int)req->af.v4_req.loc_addr,
ntohs(sk->sport),
@@ -1994,7 +2009,9 @@ static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf,
req->retrans,
sk->socket ? sk->socket->inode->i_uid : 0,
0, /* non standard timer */
- 0 /* open_requests have no inode */
+ 0, /* open_requests have no inode */
+ atomic_read(&sk->refcnt),
+ req
);
}
@@ -2026,19 +2043,19 @@ static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
timer_expires = jiffies;
sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld",
+ " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld %d %p",
i, src, srcp, dest, destp, sp->state,
tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq,
timer_active, timer_expires-jiffies,
tp->retransmits,
sp->socket ? sp->socket->inode->i_uid : 0,
- timer_active ? sp->timeout : 0,
- sp->socket ? sp->socket->inode->i_ino : 0);
+ 0,
+ sp->socket ? sp->socket->inode->i_ino : 0,
+ atomic_read(&sp->refcnt), sp);
}
static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
{
- extern int tcp_tw_death_row_slot;
unsigned int dest, src;
__u16 destp, srcp;
int slot_dist;
@@ -2055,9 +2072,10 @@ static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
slot_dist = tcp_tw_death_row_slot - slot_dist;
sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08X %08X %5d %8d %d",
+ " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
i, src, srcp, dest, destp, TCP_TIME_WAIT, 0, 0,
- 3, slot_dist * TCP_TWKILL_PERIOD, 0, 0, 0, 0);
+ 3, slot_dist * TCP_TWKILL_PERIOD, 0, 0, 0, 0,
+ atomic_read(&tw->refcnt), tw);
}
int tcp_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
@@ -2072,9 +2090,9 @@ int tcp_get_info(char *buffer, char **start, off_t offset, int length, int dummy
"rx_queue tr tm->when retrnsmt uid timeout inode");
pos = 128;
- SOCKHASH_LOCK_READ();
/* First, walk listening socket table. */
+ tcp_listen_lock();
for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
struct sock *sk = tcp_listening_hash[i];
@@ -2082,66 +2100,86 @@ int tcp_get_info(char *buffer, char **start, off_t offset, int length, int dummy
struct open_request *req;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- if (sk->family != PF_INET)
- continue;
+ if (!TCP_INET_FAMILY(sk->family))
+ goto skip_listen;
+
pos += 128;
if (pos >= offset) {
get_tcp_sock(sk, tmpbuf, num);
len += sprintf(buffer+len, "%-127s\n", tmpbuf);
- if (len >= length)
- goto out;
+ if (len >= length) {
+ tcp_listen_unlock();
+ goto out_no_bh;
+ }
}
+
+skip_listen:
+ lock_sock(sk);
for (req = tp->syn_wait_queue; req; req = req->dl_next, num++) {
if (req->sk)
continue;
+ if (!TCP_INET_FAMILY(req->class->family))
+ continue;
+
pos += 128;
if (pos < offset)
continue;
get_openreq(sk, req, tmpbuf, num);
len += sprintf(buffer+len, "%-127s\n", tmpbuf);
- if(len >= length)
- goto out;
+ if(len >= length) {
+ tcp_listen_unlock();
+ release_sock(sk);
+ goto out_no_bh;
+ }
}
+ release_sock(sk);
}
}
+ tcp_listen_unlock();
+
+ local_bh_disable();
/* Next, walk established hash chain. */
- for (i = 0; i < (tcp_ehash_size >> 1); i++) {
+ for (i = 0; i < tcp_ehash_size; i++) {
+ struct tcp_ehash_bucket *head = &tcp_ehash[i];
struct sock *sk;
+ struct tcp_tw_bucket *tw;
- for(sk = tcp_ehash[i]; sk; sk = sk->next, num++) {
- if (sk->family != PF_INET)
+ read_lock(&head->lock);
+ for(sk = head->chain; sk; sk = sk->next, num++) {
+ if (!TCP_INET_FAMILY(sk->family))
continue;
pos += 128;
if (pos < offset)
continue;
get_tcp_sock(sk, tmpbuf, num);
len += sprintf(buffer+len, "%-127s\n", tmpbuf);
- if(len >= length)
+ if(len >= length) {
+ read_unlock(&head->lock);
goto out;
+ }
}
- }
-
- /* Finally, walk time wait buckets. */
- for (i = (tcp_ehash_size>>1); i < tcp_ehash_size; i++) {
- struct tcp_tw_bucket *tw;
- for (tw = (struct tcp_tw_bucket *)tcp_ehash[i];
+ for (tw = (struct tcp_tw_bucket *)tcp_ehash[i+tcp_ehash_size].chain;
tw != NULL;
tw = (struct tcp_tw_bucket *)tw->next, num++) {
- if (tw->family != PF_INET)
+ if (!TCP_INET_FAMILY(tw->family))
continue;
pos += 128;
if (pos < offset)
continue;
get_timewait_sock(tw, tmpbuf, num);
len += sprintf(buffer+len, "%-127s\n", tmpbuf);
- if(len >= length)
+ if(len >= length) {
+ read_unlock(&head->lock);
goto out;
+ }
}
+ read_unlock(&head->lock);
}
out:
- SOCKHASH_UNLOCK_READ();
+ local_bh_enable();
+out_no_bh:
begin = len - (pos - offset);
*start = buffer + begin;
@@ -2156,6 +2194,7 @@ out:
struct proto tcp_prot = {
tcp_close, /* close */
tcp_v4_connect, /* connect */
+ tcp_disconnect, /* disconnect */
tcp_accept, /* accept */
NULL, /* retransmit */
tcp_write_wakeup, /* write_wakeup */
@@ -2172,7 +2211,7 @@ struct proto tcp_prot = {
NULL, /* bind */
tcp_v4_do_rcv, /* backlog_rcv */
tcp_v4_hash, /* hash */
- tcp_v4_unhash, /* unhash */
+ tcp_unhash, /* unhash */
tcp_v4_get_port, /* get_port */
128, /* max_header */
0, /* retransmits */
@@ -2183,7 +2222,7 @@ struct proto tcp_prot = {
-__initfunc(void tcp_v4_init(struct net_proto_family *ops))
+void __init tcp_v4_init(struct net_proto_family *ops)
{
int err;
@@ -2201,7 +2240,7 @@ __initfunc(void tcp_v4_init(struct net_proto_family *ops))
if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
panic("Failed to create the TCP control socket.\n");
tcp_socket->sk->allocation=GFP_ATOMIC;
- tcp_socket->sk->ip_ttl = MAXTTL;
+ tcp_socket->sk->protinfo.af_inet.ttl = MAXTTL;
/* Unhash it so that IP input processing does not even
* see it, we do not wish this socket to see incoming
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 18b5ebf80..77f8b98ca 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_output.c,v 1.110 1999/05/27 00:37:45 davem Exp $
+ * Version: $Id: tcp_output.c,v 1.113 1999/09/07 02:31:39 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -65,6 +65,50 @@ static __inline__ void update_send_head(struct sock *sk)
tp->send_head = NULL;
}
+/* Calculate mss to advertise in SYN segment.
+ RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
+
+ 1. It is independent of path mtu.
+ 2. Ideally, it is the maximal possible segment size, i.e. 65535-40.
+ 3. For IPv4 it is reasonable to calculate it from the maximal MTU of
+ the attached devices, because some buggy hosts are confused by a
+ large MSS.
+ 4. We do not do 3; we advertise an MSS calculated from the first-hop
+ device mtu, but allow it to be raised to ip_rt_min_advmss.
+ This may be overridden via information stored in the routing table.
+ 5. The value 65535 for MSS is valid in IPv6 and means "as large as
+ possible, probably even Jumbo".
+ */
+static __u16 tcp_advertise_mss(struct sock *sk)
+{
+ struct dst_entry *dst = __sk_dst_get(sk);
+ int mss;
+
+ if (dst) {
+ mss = dst->advmss;
+ } else {
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ /* No dst. It is bad. Guess some reasonable value.
+ * Actually, this case should not be possible.
+ * SANITY.
+ */
+ BUG_TRAP(dst!=NULL);
+
+ mss = tp->mss_cache;
+ mss += (tp->tcp_header_len - sizeof(struct tcphdr)) +
+ tp->ext_header_len;
+
+ /* Minimal MSS to include the full set of TCP/IP options
+ plus 8 bytes of data. It corresponds to an mtu of 128.
+ */
+ if (mss < 88)
+ mss = 88;
+ }
+
+ return (__u16)mss;
+}
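A concrete instance of the normal branch (numbers are illustrative, not from the patch):

	/* On an unexceptional 1500-byte Ethernet first hop,
	 * dst->advmss = 1500 - 20 (IP) - 20 (TCP) = 1460, so the SYN
	 * advertises 1460. The 88-byte floor in the fallback branch
	 * matches the mtu-128 case: 128 - 40 = 88.
	 */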
+
/* This routine actually transmits TCP packets queued in by
* tcp_do_sendmsg(). This is used by both the initial
* transmission and possible later retransmissions.
@@ -124,8 +168,6 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
th->doff = (tcp_header_size >> 2);
th->res1 = 0;
*(((__u8 *)th) + 13) = tcb->flags;
- if(!(tcb->flags & TCPCB_FLAG_SYN))
- th->window = htons(tcp_select_window(sk));
th->check = 0;
th->urg_ptr = ntohs(tcb->urg_ptr);
if(tcb->flags & TCPCB_FLAG_SYN) {
@@ -133,7 +175,8 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
* is never scaled.
*/
th->window = htons(tp->rcv_wnd);
- tcp_syn_build_options((__u32 *)(th + 1), tp->mss_clamp,
+ tcp_syn_build_options((__u32 *)(th + 1),
+ tcp_advertise_mss(sk),
(sysctl_flags & SYSCTL_FLAG_TSTAMPS),
(sysctl_flags & SYSCTL_FLAG_SACK),
(sysctl_flags & SYSCTL_FLAG_WSCALE),
@@ -141,6 +184,7 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
TCP_SKB_CB(skb)->when,
tp->ts_recent);
} else {
+ th->window = htons(tcp_select_window(sk));
tcp_build_and_update_options((__u32 *)(th + 1),
tp, TCP_SKB_CB(skb)->when);
}
@@ -283,7 +327,8 @@ int tcp_sync_mss(struct sock *sk, u32 pmtu)
/* Calculate base mss without TCP options:
It is MMS_S - sizeof(tcphdr) of rfc1122
- */
+ */
+
mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
/* Clamp it (mss_clamp does not include tcp options) */
@@ -415,30 +460,30 @@ void tcp_write_xmit(struct sock *sk)
* a multiple of the mss when it is feasible to do so.
*
* Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
+ * Regular options like TIMESTAMP are taken into account.
*/
u32 __tcp_select_window(struct sock *sk)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- unsigned int mss = tp->mss_cache;
+ /* MSS for the peer's data. Previous versions used mss_clamp
+ * here. I don't know whether the value based on our guess
+ * of the peer's MSS is better for performance. It's more correct,
+ * but may be worse for performance because of rcv_mss
+ * fluctuations. --SAW 1998/11/1
+ */
+ unsigned int mss = tp->rcv_mss;
int free_space;
u32 window;
/* Sometimes free_space can be < 0. */
- free_space = (sk->rcvbuf - atomic_read(&sk->rmem_alloc)) / 2;
- if (tp->window_clamp) {
- if (free_space > ((int) tp->window_clamp))
- free_space = tp->window_clamp;
- mss = min(tp->window_clamp, mss);
- } else {
- printk("tcp_select_window: tp->window_clamp == 0.\n");
- }
-
- if (mss < 1) {
- mss = 1;
- printk("tcp_select_window: sk->mss fell to 0.\n");
- }
+ free_space = tcp_space(sk);
+ if (free_space > ((int) tp->window_clamp))
+ free_space = tp->window_clamp;
+ if (tp->window_clamp < mss)
+ mss = tp->window_clamp;
- if ((free_space < (sk->rcvbuf/4)) && (free_space < ((int) (mss/2)))) {
+ if ((free_space < (tcp_full_space(sk) / 2)) &&
+ (free_space < ((int) (mss/2)))) {
window = 0;
tp->pred_flags = 0;
} else {
@@ -741,7 +786,7 @@ void tcp_send_fin(struct sock *sk)
*/
if(tp->send_head == skb &&
!sk->nonagle &&
- skb->len < (tp->mss_cache >> 1) &&
+ skb->len < (tp->rcv_mss >> 1) &&
tp->packets_out &&
!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) {
update_send_head(sk);
@@ -780,13 +825,13 @@ void tcp_send_fin(struct sock *sk)
* was unread data in the receive queue. This behavior is recommended
* by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
*/
-void tcp_send_active_reset(struct sock *sk)
+void tcp_send_active_reset(struct sock *sk, int priority)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct sk_buff *skb;
/* NOTE: No TCP options attached and we never retransmit this. */
- skb = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_KERNEL);
+ skb = alloc_skb(MAX_HEADER + sk->prot->max_header, priority);
if (!skb)
return;
@@ -813,7 +858,7 @@ int tcp_send_synack(struct sock *sk)
{
struct tcp_opt* tp = &(sk->tp_pinfo.af_tcp);
struct sk_buff* skb;
-
+
skb = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
1, GFP_ATOMIC);
if (skb == NULL)
@@ -840,7 +885,7 @@ int tcp_send_synack(struct sock *sk)
* Prepare a SYN-ACK.
*/
struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
- struct open_request *req, int mss)
+ struct open_request *req)
{
struct tcphdr *th;
int tcp_header_size;
@@ -855,17 +900,6 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
skb->dst = dst_clone(dst);
- /* Don't offer more than they did.
- * This way we don't have to memorize who said what.
- * FIXME: maybe this should be changed for better performance
- * with syncookies.
- */
- req->mss = min(mss, req->mss);
- if (req->mss < 8) {
- printk(KERN_DEBUG "initial req->mss below 8\n");
- req->mss = 8;
- }
-
tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
(req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
(req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
@@ -886,7 +920,9 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
__u8 rcv_wscale;
/* Set this up on the first call only */
req->window_clamp = skb->dst->window;
- tcp_select_initial_window(sock_rspace(sk)/2,req->mss,
+ /* tcp_full_space because it is guaranteed to be the first packet */
+ tcp_select_initial_window(tcp_full_space(sk),
+ dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
&req->rcv_wnd,
&req->window_clamp,
req->wscale_ok,
@@ -898,33 +934,25 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
th->window = htons(req->rcv_wnd);
TCP_SKB_CB(skb)->when = tcp_time_stamp;
- tcp_syn_build_options((__u32 *)(th + 1), req->mss, req->tstamp_ok,
+ tcp_syn_build_options((__u32 *)(th + 1), dst->advmss, req->tstamp_ok,
req->sack_ok, req->wscale_ok, req->rcv_wscale,
TCP_SKB_CB(skb)->when,
req->ts_recent);
skb->csum = 0;
th->doff = (tcp_header_size >> 2);
- tcp_statistics.TcpOutSegs++;
+ tcp_statistics.TcpOutSegs++;
return skb;
}
-void tcp_connect(struct sock *sk, struct sk_buff *buff, int mtu)
+int tcp_connect(struct sock *sk, struct sk_buff *buff)
{
- struct dst_entry *dst = sk->dst_cache;
+ struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
/* Reserve space for headers. */
skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
- tp->snd_wnd = 0;
- tp->snd_wl1 = 0;
- tp->snd_wl2 = tp->write_seq;
- tp->snd_una = tp->write_seq;
- tp->rcv_nxt = 0;
-
- sk->err = 0;
-
/* We'll fix this up when we get a response from the other end.
* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
*/
@@ -934,77 +962,72 @@ void tcp_connect(struct sock *sk, struct sk_buff *buff, int mtu)
/* If user gave his TCP_MAXSEG, record it to clamp */
if (tp->user_mss)
tp->mss_clamp = tp->user_mss;
- tcp_sync_mss(sk, mtu);
-
- /* Now unpleasant action: if initial pmtu is too low
- set lower clamp. I am not sure that it is good.
- To be more exact, I do not think that clamping at value, which
- is apparently transient and may improve in future is good idea.
- It would be better to wait until peer will returns its MSS
- (probably 65535 too) and now advertise something sort of 65535
- or at least first hop device mtu. Is it clear, what I mean?
- We should tell peer what maximal mss we expect to RECEIVE,
- it has nothing to do with pmtu.
- I am afraid someone will be confused by such huge value.
- --ANK (980731)
- */
- if (tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr) < tp->mss_clamp )
- tp->mss_clamp = tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr);
-
- TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
- TCP_SKB_CB(buff)->sacked = 0;
- TCP_SKB_CB(buff)->urg_ptr = 0;
- buff->csum = 0;
- TCP_SKB_CB(buff)->seq = tp->write_seq++;
- TCP_SKB_CB(buff)->end_seq = tp->write_seq;
- tp->snd_nxt = TCP_SKB_CB(buff)->end_seq;
+ tcp_sync_mss(sk, dst->pmtu);
tp->window_clamp = dst->window;
- tcp_select_initial_window(sock_rspace(sk)/2,tp->mss_clamp,
+
+ tcp_select_initial_window(tcp_full_space(sk),
+ dst->advmss - (tp->tcp_header_len - sizeof(struct tcphdr)),
&tp->rcv_wnd,
&tp->window_clamp,
sysctl_tcp_window_scaling,
&tp->rcv_wscale);
- /* Ok, now lock the socket before we make it visible to
- * the incoming packet engine.
- */
- unlock_kernel();
- lock_sock(sk);
/* Socket identity change complete, no longer
* in TCP_CLOSE, so enter ourselves into the
* hash tables.
*/
tcp_set_state(sk,TCP_SYN_SENT);
- sk->prot->hash(sk);
+ if (tp->af_specific->hash_connecting(sk))
+ goto err_out;
+
+ sk->err = 0;
+ tp->snd_wnd = 0;
+ tp->snd_wl1 = 0;
+ tp->snd_wl2 = tp->write_seq;
+ tp->snd_una = tp->write_seq;
+ tp->rcv_nxt = 0;
+ tp->rcv_wup = 0;
+ tp->copied_seq = 0;
- tp->rto = dst->rtt;
+ tp->rto = TCP_TIMEOUT_INIT;
tcp_init_xmit_timers(sk);
tp->retransmits = 0;
tp->fackets_out = 0;
tp->retrans_out = 0;
+ TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
+ TCP_SKB_CB(buff)->sacked = 0;
+ TCP_SKB_CB(buff)->urg_ptr = 0;
+ buff->csum = 0;
+ TCP_SKB_CB(buff)->seq = tp->write_seq++;
+ TCP_SKB_CB(buff)->end_seq = tp->write_seq;
+ tp->snd_nxt = tp->write_seq;
+
/* Send it off. */
- __skb_queue_tail(&sk->write_queue, buff);
TCP_SKB_CB(buff)->when = tcp_time_stamp;
+ __skb_queue_tail(&sk->write_queue, buff);
tp->packets_out++;
tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
tcp_statistics.TcpActiveOpens++;
/* Timer for repeating the SYN until an answer. */
tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+ return 0;
- /* Now, it is safe to release the socket. */
- release_sock(sk);
- lock_kernel();
+err_out:
+ tcp_set_state(sk,TCP_CLOSE);
+ kfree_skb(buff);
+ return -EADDRNOTAVAIL;
}
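Since tcp_connect() now reports failure, a hypothetical caller sketch (the real call site lives in the af-specific connect path; names here are illustrative):

	/* buff is consumed on both paths: queued on success,
	 * freed by tcp_connect() itself on failure. */
	if (tcp_connect(sk, buff) != 0)
		return -EADDRNOTAVAIL;	/* hash_connecting() found a clash */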
/* Send out a delayed ack, the caller does the policy checking
* to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
* for details.
*/
-void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout)
+void tcp_send_delayed_ack(struct sock *sk, int max_timeout)
{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
unsigned long timeout;
/* Stay within the limit we were given */
@@ -1014,13 +1037,16 @@ void tcp_send_delayed_ack(struct tcp_opt *tp, int max_timeout)
timeout += jiffies;
/* Use the new timeout only if there wasn't an older one already. */
- if (!tp->delack_timer.prev) {
+ spin_lock_bh(&sk->timer_lock);
+ if (!tp->delack_timer.prev || !del_timer(&tp->delack_timer)) {
+ sock_hold(sk);
tp->delack_timer.expires = timeout;
- add_timer(&tp->delack_timer);
- } else {
+ } else {
if (time_before(timeout, tp->delack_timer.expires))
- mod_timer(&tp->delack_timer, timeout);
+ tp->delack_timer.expires = timeout;
}
+ add_timer(&tp->delack_timer);
+ spin_unlock_bh(&sk->timer_lock);
}
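The del_timer()/sock_hold() dance above follows one rule: a pending timer owns exactly one reference on the socket. A bare sketch of the idiom (pseudo-kernel C, my paraphrase of this hunk):

	/* Re-arm a timer that pins its socket:
	 *
	 *	if (!pending(&t) || !del_timer(&t))
	 *		sock_hold(sk);	 // no armed timer held a ref yet
	 *	t.expires = new_timeout; // else we inherited its ref
	 *	add_timer(&t);
	 *
	 * and the handler ends with sock_put(sk), dropping the reference
	 * the armed timer was holding.
	 */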
/* This routine sends an ack and also updates the window. */
@@ -1048,7 +1074,7 @@ void tcp_send_ack(struct sock *sk)
*/
if(tcp_in_quickack_mode(tp))
tcp_exit_quickack_mode(tp);
- tcp_send_delayed_ack(tp, HZ/2);
+ tcp_send_delayed_ack(sk, HZ/2);
return;
}
@@ -1082,7 +1108,7 @@ void tcp_write_wakeup(struct sock *sk)
*/
if ((1 << sk->state) &
~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
- TCPF_LAST_ACK|TCPF_CLOSING))
+ TCPF_FIN_WAIT2|TCPF_LAST_ACK|TCPF_CLOSING))
return;
if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) &&
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 05a92f7f7..a38724e42 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_timer.c,v 1.65 1999/07/02 11:26:35 davem Exp $
+ * Version: $Id: tcp_timer.c,v 1.68 1999/09/07 02:31:43 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -25,12 +25,13 @@
int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
+int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
int sysctl_tcp_retries1 = TCP_RETR1;
int sysctl_tcp_retries2 = TCP_RETR2;
+
static void tcp_sltimer_handler(unsigned long);
static void tcp_syn_recv_timer(unsigned long);
-static void tcp_keepalive(unsigned long data);
static void tcp_twkill(unsigned long);
struct timer_list tcp_slow_timer = {
@@ -42,7 +43,6 @@ struct timer_list tcp_slow_timer = {
struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = {
{ATOMIC_INIT(0), TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer},/* SYNACK */
- {ATOMIC_INIT(0), TCP_KEEPALIVE_PERIOD, 0, tcp_keepalive}, /* KEEPALIVE */
{ATOMIC_INIT(0), TCP_TWKILL_PERIOD, 0, tcp_twkill} /* TWKILL */
};
@@ -77,6 +77,7 @@ void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ spin_lock_bh(&sk->timer_lock);
switch (what) {
case TIME_RETRANS:
/* When seting the transmit timer the probe timer
@@ -84,16 +85,26 @@ void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
* The delayed ack timer can be set if we are changing the
* retransmit timer when removing acked frames.
*/
- if(tp->probe_timer.prev)
- del_timer(&tp->probe_timer);
+ if(tp->probe_timer.prev && del_timer(&tp->probe_timer))
+ __sock_put(sk);
+ if (!tp->retransmit_timer.prev || !del_timer(&tp->retransmit_timer))
+ sock_hold(sk);
+ if (when > 120*HZ) {
+ printk(KERN_DEBUG "reset_xmit_timer sk=%p when=0x%lx, caller=%p\n", sk, when, NET_CALLER(sk));
+ when = 120*HZ;
+ }
mod_timer(&tp->retransmit_timer, jiffies+when);
break;
case TIME_DACK:
+ if (!tp->delack_timer.prev || !del_timer(&tp->delack_timer))
+ sock_hold(sk);
mod_timer(&tp->delack_timer, jiffies+when);
break;
case TIME_PROBE0:
+ if (!tp->probe_timer.prev || !del_timer(&tp->probe_timer))
+ sock_hold(sk);
mod_timer(&tp->probe_timer, jiffies+when);
break;
@@ -104,40 +115,44 @@ void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
default:
printk(KERN_DEBUG "bug: unknown timer value\n");
};
+ spin_unlock_bh(&sk->timer_lock);
}
void tcp_clear_xmit_timers(struct sock *sk)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- if(tp->retransmit_timer.prev)
- del_timer(&tp->retransmit_timer);
- if(tp->delack_timer.prev)
- del_timer(&tp->delack_timer);
- if(tp->probe_timer.prev)
- del_timer(&tp->probe_timer);
+ spin_lock_bh(&sk->timer_lock);
+ if(tp->retransmit_timer.prev && del_timer(&tp->retransmit_timer))
+ __sock_put(sk);
+ if(tp->delack_timer.prev && del_timer(&tp->delack_timer))
+ __sock_put(sk);
+ if(tp->probe_timer.prev && del_timer(&tp->probe_timer))
+ __sock_put(sk);
+ if(sk->timer.prev && del_timer(&sk->timer))
+ __sock_put(sk);
+ spin_unlock_bh(&sk->timer_lock);
}
-static int tcp_write_err(struct sock *sk, int force)
+static void tcp_write_err(struct sock *sk, int force)
{
sk->err = sk->err_soft ? sk->err_soft : ETIMEDOUT;
sk->error_report(sk);
-
+
tcp_clear_xmit_timers(sk);
-
- /* Time wait the socket. */
- if (!force && ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING))) {
- tcp_time_wait(sk);
- } else {
- /* Clean up time. */
- tcp_set_state(sk, TCP_CLOSE);
- return 0;
- }
- return 1;
+
+ /* Do not time-wait the socket. It has timed out and, hence,
+ * been idle for 120*HZ. The "force" argument is ignored; delete
+ * it eventually.
+ */
+
+ /* Clean up time. */
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_done(sk);
}
/* A write timeout has occurred. Process the after effects. */
-static int tcp_write_timeout(struct sock *sk)
+static void tcp_write_timeout(struct sock *sk)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
@@ -145,21 +160,39 @@ static int tcp_write_timeout(struct sock *sk)
if ((sk->state == TCP_ESTABLISHED &&
tp->retransmits && (tp->retransmits % TCP_QUICK_TRIES) == 0) ||
(sk->state != TCP_ESTABLISHED && tp->retransmits > sysctl_tcp_retries1)) {
+ /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
+ hole detection. :-(
+
+ This is the place to implement it. It is not implemented. I do not
+ want to implement it. It is disgusting. It does not work in any
+ case anyway. Let me cite the same draft, which requires
+ us to implement this:
+
+ "The one security concern raised by this memo is that ICMP black holes
+ are often caused by over-zealous security administrators who block
+ all ICMP messages. It is vitally important that those who design and
+ deploy security systems understand the impact of strict filtering on
+ upper-layer protocols. The safest web site in the world is worthless
+ if most TCP implementations cannot transfer data from it. It would
+ be far nicer to have all of the black holes fixed rather than fixing
+ all of the TCP implementations."
+
+ Golden words :-).
+ */
+
dst_negative_advice(&sk->dst_cache);
}
/* Have we tried to SYN too many times (repent repent 8)) */
- if(tp->retransmits > sysctl_tcp_syn_retries && sk->state==TCP_SYN_SENT) {
+ if (sk->state == TCP_SYN_SENT &&
+ ((!tp->syn_retries && tp->retransmits > sysctl_tcp_syn_retries) ||
+ (tp->syn_retries && tp->retransmits > tp->syn_retries))) {
tcp_write_err(sk, 1);
/* Don't FIN, we got nothing back */
- return 0;
+ } else if (tp->retransmits > sysctl_tcp_retries2) {
+ /* Has it gone just too far? */
+ tcp_write_err(sk, 0);
}
-
- /* Has it gone just too far? */
- if (tp->retransmits > sysctl_tcp_retries2)
- return tcp_write_err(sk, 0);
-
- return 1;
}
void tcp_delack_timer(unsigned long data)
@@ -167,15 +200,20 @@ void tcp_delack_timer(unsigned long data)
struct sock *sk = (struct sock*)data;
bh_lock_sock(sk);
+ if (sk->lock.users) {
+ /* Try again later. */
+ tcp_reset_xmit_timer(sk, TIME_DACK, HZ/5);
+ goto out_unlock;
+ }
+
if(!sk->zapped &&
sk->tp_pinfo.af_tcp.delayed_acks &&
- sk->state != TCP_CLOSE) {
- if (!sk->lock.users)
- tcp_send_ack(sk);
- else
- tcp_send_delayed_ack(&(sk->tp_pinfo.af_tcp), HZ/10);
- }
+ sk->state != TCP_CLOSE)
+ tcp_send_ack(sk);
+
+out_unlock:
bh_unlock_sock(sk);
+ sock_put(sk);
}
void tcp_probe_timer(unsigned long data)
@@ -183,79 +221,50 @@ void tcp_probe_timer(unsigned long data)
struct sock *sk = (struct sock*)data;
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- if(sk->zapped)
- return;
-
+ if(sk->zapped)
+ goto out;
+
bh_lock_sock(sk);
if (sk->lock.users) {
/* Try again later. */
tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ/5);
- bh_unlock_sock(sk);
- return;
+ goto out_unlock;
}
- /* *WARNING* RFC 1122 forbids this
+ /* *WARNING* RFC 1122 forbids this
+ *
* It doesn't AFAIK, because we kill the retransmit timer -AK
+ *
* FIXME: We ought not to do it, Solaris 2.5 actually has fixing
* this behaviour in Solaris down as a bug fix. [AC]
+ *
+ * Let me explain: probes_out is zeroed by incoming ACKs
+ * even if they advertise zero window. Hence, the connection is killed
+ * only if we received no ACKs for the normal connection timeout. It is
+ * not killed merely because the window stays zero for some time; the
+ * window may stay zero until armageddon and even later. We are in full
+ * accordance with the RFCs; only our probe timer combines the retransmission
+ * timeout and the probe timeout in one bottle. --ANK
*/
if (tp->probes_out > sysctl_tcp_retries2) {
- if(sk->err_soft)
- sk->err = sk->err_soft;
- else
- sk->err = ETIMEDOUT;
- sk->error_report(sk);
-
- if ((1<<sk->state) & (TCPF_FIN_WAIT1|TCPF_FIN_WAIT2|TCPF_CLOSING)) {
- /* Time wait the socket. */
- tcp_time_wait(sk);
- } else {
- /* Clean up time. */
- tcp_set_state(sk, TCP_CLOSE);
- }
+ tcp_write_err(sk, 0);
} else {
/* Only send another probe if we didn't close things up. */
tcp_send_probe0(sk);
}
+out_unlock:
bh_unlock_sock(sk);
+out:
+ sock_put(sk);
}
-static __inline__ int tcp_keepopen_proc(struct sock *sk)
-{
- int res = 0;
-
- if ((1<<sk->state) & (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT2)) {
- struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
-
- if (elapsed >= sysctl_tcp_keepalive_time) {
- if (tp->probes_out > sysctl_tcp_keepalive_probes) {
- if(sk->err_soft)
- sk->err = sk->err_soft;
- else
- sk->err = ETIMEDOUT;
-
- tcp_set_state(sk, TCP_CLOSE);
- sk->shutdown = SHUTDOWN_MASK;
- if (!sk->dead)
- sk->state_change(sk);
- } else {
- tp->probes_out++;
- tp->pending = TIME_KEEPOPEN;
- tcp_write_wakeup(sk);
- res = 1;
- }
- }
- }
- return res;
-}
/* Kill off TIME_WAIT sockets once their lifetime has expired. */
int tcp_tw_death_row_slot = 0;
static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS] =
{ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL };
+static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED;
-extern void tcp_timewait_kill(struct tcp_tw_bucket *tw);
static void tcp_twkill(unsigned long data)
{
@@ -263,17 +272,20 @@ static void tcp_twkill(unsigned long data)
int killed = 0;
/* The death-row tw chains are only ever touched
- * in BH context so no locking is needed.
+ * in BH context so no BH disabling (for now) is needed.
*/
+ spin_lock(&tw_death_lock);
tw = tcp_tw_death_row[tcp_tw_death_row_slot];
tcp_tw_death_row[tcp_tw_death_row_slot] = NULL;
tcp_tw_death_row_slot =
((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
+ spin_unlock(&tw_death_lock);
while(tw != NULL) {
struct tcp_tw_bucket *next = tw->next_death;
tcp_timewait_kill(tw);
+ tcp_tw_put(tw);
killed++;
tw = next;
}
@@ -288,17 +300,20 @@ static void tcp_twkill(unsigned long data)
*/
void tcp_tw_schedule(struct tcp_tw_bucket *tw)
{
- int slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1);
- struct tcp_tw_bucket **tpp = &tcp_tw_death_row[slot];
+ struct tcp_tw_bucket **tpp;
+ int slot;
- SOCKHASH_LOCK_WRITE_BH();
+ spin_lock(&tw_death_lock);
+ slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1);
+ tpp = &tcp_tw_death_row[slot];
if((tw->next_death = *tpp) != NULL)
(*tpp)->pprev_death = &tw->next_death;
*tpp = tw;
tw->pprev_death = tpp;
tw->death_slot = slot;
- SOCKHASH_UNLOCK_WRITE_BH();
+ atomic_inc(&tw->refcnt);
+ spin_unlock(&tw_death_lock);
tcp_inc_slow_timer(TCP_SLT_TWKILL);
}
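
The slot arithmetic parks each bucket one slot behind the reaper's cursor, so nearly a full sweep of the ring elapses before tcp_twkill reaches it. A sketch of the ring walk, assuming TCP_TWKILL_SLOTS is 8 as declared earlier in this file:

#include <stdio.h>

#define TCP_TWKILL_SLOTS 8	/* must stay a power of two for the mask */

int main(void)
{
	int cursor = 0;		/* tcp_tw_death_row_slot */
	int i;

	for (i = 0; i < 3; i++) {
		/* Schedule: insert one slot behind the cursor... */
		int slot = (cursor - 1) & (TCP_TWKILL_SLOTS - 1);

		printf("cursor %d -> bucket parked in slot %d\n", cursor, slot);

		/* ...so the reaper visits every other slot first. */
		cursor = (cursor + 1) & (TCP_TWKILL_SLOTS - 1);
	}
	return 0;
}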
@@ -309,11 +324,14 @@ void tcp_tw_reschedule(struct tcp_tw_bucket *tw)
struct tcp_tw_bucket **tpp;
int slot;
- SOCKHASH_LOCK_WRITE_BH();
- if(tw->next_death)
- tw->next_death->pprev_death = tw->pprev_death;
- *tw->pprev_death = tw->next_death;
- tw->pprev_death = NULL;
+ spin_lock(&tw_death_lock);
+ if (tw->pprev_death) {
+ if(tw->next_death)
+ tw->next_death->pprev_death = tw->pprev_death;
+ *tw->pprev_death = tw->next_death;
+ tw->pprev_death = NULL;
+ } else
+ atomic_inc(&tw->refcnt);
slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1);
tpp = &tcp_tw_death_row[slot];
@@ -323,7 +341,7 @@ void tcp_tw_reschedule(struct tcp_tw_bucket *tw)
tw->pprev_death = tpp;
tw->death_slot = slot;
- SOCKHASH_UNLOCK_WRITE_BH();
+ spin_unlock(&tw_death_lock);
/* Timer was incremented when we first entered the table. */
}
@@ -331,91 +349,28 @@ void tcp_tw_reschedule(struct tcp_tw_bucket *tw)
/* This is for handling early-kills of TIME_WAIT sockets. */
void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
{
- SOCKHASH_LOCK_WRITE_BH();
- if(tw->next_death)
- tw->next_death->pprev_death = tw->pprev_death;
- *tw->pprev_death = tw->next_death;
- tw->pprev_death = NULL;
- SOCKHASH_UNLOCK_WRITE_BH();
+ spin_lock(&tw_death_lock);
+ if (tw->pprev_death) {
+ if(tw->next_death)
+ tw->next_death->pprev_death = tw->pprev_death;
+ *tw->pprev_death = tw->next_death;
+ tw->pprev_death = NULL;
+ tcp_tw_put(tw);
+ }
+ spin_unlock(&tw_death_lock);
tcp_dec_slow_timer(TCP_SLT_TWKILL);
}
-/*
- * Check all sockets for keepalive timer
- * Called every 75 seconds
- * This timer is started by af_inet init routine and is constantly
- * running.
- *
- * It might be better to maintain a count of sockets that need it using
- * setsockopt/tcp_destroy_sk and only set the timer when needed.
- */
-
-/*
- * don't send over 5 keepopens at a time to avoid burstiness
- * on big servers [AC]
- */
-#define MAX_KA_PROBES 5
-
-int sysctl_tcp_max_ka_probes = MAX_KA_PROBES;
-
-/* Keepopen's are only valid for "established" TCP's, nicely our listener
- * hash gets rid of most of the useless testing, so we run through a couple
- * of the established hash chains each clock tick. -DaveM
- *
- * And now, even more magic... TIME_WAIT TCP's cannot have keepalive probes
- * going off for them, so we only need check the first half of the established
- * hash table, even less testing under heavy load.
- *
- * I _really_ would rather do this by adding a new timer_struct to struct sock,
- * and this way only those who set the keepalive option will get the overhead.
- * The idea is you set it for 2 hours when the sock is first connected, when it
- * does fire off (if at all, most sockets die earlier) you check for the keepalive
- * option and also if the sock has been idle long enough to start probing.
- */
-static void tcp_keepalive(unsigned long data)
-{
- static int chain_start = 0;
- int count = 0;
- int i;
-
- SOCKHASH_LOCK_READ_BH();
- for(i = chain_start; i < (chain_start + ((tcp_ehash_size >> 1) >> 2)); i++) {
- struct sock *sk;
-
- sk = tcp_ehash[i];
- while(sk) {
- struct sock *next = sk->next;
-
- bh_lock_sock(sk);
- if (sk->keepopen && !sk->lock.users) {
- SOCKHASH_UNLOCK_READ_BH();
- count += tcp_keepopen_proc(sk);
- SOCKHASH_LOCK_READ_BH();
- }
- bh_unlock_sock(sk);
- if(count == sysctl_tcp_max_ka_probes)
- goto out;
- sk = next;
- }
- }
-out:
- SOCKHASH_UNLOCK_READ_BH();
- chain_start = ((chain_start + ((tcp_ehash_size >> 1)>>2)) &
- ((tcp_ehash_size >> 1) - 1));
-}
/*
- * The TCP retransmit timer. This lacks a few small details.
+ * The TCP retransmit timer.
*
* 1. An initial rtt timeout on the probe0 should cause what we can
* of the first write queue buffer to be split and sent.
- * 2. On a 'major timeout' as defined by RFC1122 we shouldn't report
+ * 2. On a 'major timeout' as defined by RFC1122 we do not report
* ETIMEDOUT if we know an additional 'soft' error caused this.
- * tcp_err should save a 'soft error' for us.
- * [Unless someone has broken it then it does, except for one 2.0
- * broken case of a send when the route/device is directly unreachable,
- * and we error but should retry! - FIXME] [AC]
+ * tcp_err saves a 'soft error' for us.
*/
void tcp_retransmit_timer(unsigned long data)
@@ -424,17 +379,14 @@ void tcp_retransmit_timer(unsigned long data)
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
/* We are reset. We will send no more retransmits. */
- if(sk->zapped) {
- tcp_clear_xmit_timer(sk, TIME_RETRANS);
- return;
- }
+ if(sk->zapped)
+ goto out;
bh_lock_sock(sk);
if (sk->lock.users) {
/* Try again later */
tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ/20);
- bh_unlock_sock(sk);
- return;
+ goto out_unlock;
}
/* Clear delay ack timer. */
@@ -501,7 +453,10 @@ void tcp_retransmit_timer(unsigned long data)
tcp_write_timeout(sk);
+out_unlock:
bh_unlock_sock(sk);
+out:
+ sock_put(sk);
}
/*
@@ -516,7 +471,7 @@ static void tcp_do_syn_queue(struct sock *sk, struct tcp_opt *tp, unsigned long
for(req = tp->syn_wait_queue; req; ) {
struct open_request *next = req->dl_next;
- if (! req->sk) {
+ if (!req->sk && (long)(now - req->expires) >= 0) {
tcp_synq_unlink(tp, req, prev);
if(req->retrans >= sysctl_tcp_retries1) {
(*req->class->destructor)(req);
@@ -552,7 +507,7 @@ static void tcp_syn_recv_timer(unsigned long data)
unsigned long now = jiffies;
int i;
- SOCKHASH_LOCK_READ_BH();
+ read_lock(&tcp_lhash_lock);
for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
sk = tcp_listening_hash[i];
while(sk) {
@@ -566,7 +521,7 @@ static void tcp_syn_recv_timer(unsigned long data)
sk = sk->next;
}
}
- SOCKHASH_UNLOCK_READ_BH();
+ read_unlock(&tcp_lhash_lock);
}
void tcp_sltimer_handler(unsigned long data)
@@ -597,6 +552,14 @@ void tcp_sltimer_handler(unsigned long data)
mod_timer(&tcp_slow_timer, (now + next));
}
+/* __tcp_inc_slow_timer is called when a slow timer is started for the
+ * first time (slt->count was 0). There is a race condition between
+ * timer creation and deletion, and if we did not force-add the timer
+ * here, we might lose it. We could avoid that with a global spinlock,
+ * but that is apparently overkill, so we restart the timer EVERY time
+ * this function is entered; that guarantees the timer will not be lost.
+ */
+
void __tcp_inc_slow_timer(struct tcp_sl_timer *slt)
{
unsigned long now = jiffies;
@@ -606,11 +569,94 @@ void __tcp_inc_slow_timer(struct tcp_sl_timer *slt)
when = now + slt->period;
- if (tcp_slow_timer.prev) {
- if ((long)(tcp_slow_timer.expires - when) >= 0)
- mod_timer(&tcp_slow_timer, when);
+ if (tcp_slow_timer.prev &&
+ (long)(tcp_slow_timer.expires - when) < 0)
+ when = tcp_slow_timer.expires;
+
+ mod_timer(&tcp_slow_timer, when);
+}
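
In other words, the function now always calls mod_timer(), clamped to whichever expiry is earlier; the signed subtraction keeps the comparison safe across jiffies wraparound. A user-space model of the clamp (plain longs stand in for jiffies):

#include <stdio.h>

/* Pick the earlier of the pending expiry and the new request,
 * using the wrap-safe signed comparison from the hunk above. */
static long clamp_expiry(int pending, long expires, long when)
{
	if (pending && (long)(expires - when) < 0)
		return expires;
	return when;
}

int main(void)
{
	printf("%ld\n", clamp_expiry(1, 100, 150));	/* 100: keep earlier timer */
	printf("%ld\n", clamp_expiry(1, 200, 150));	/* 150: pull timer in */
	printf("%ld\n", clamp_expiry(0,   0, 150));	/* 150: no timer pending */
	return 0;
}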
+
+void tcp_delete_keepalive_timer (struct sock *sk)
+{
+ spin_lock_bh(&sk->timer_lock);
+ if (sk->timer.prev && del_timer (&sk->timer))
+ __sock_put(sk);
+ spin_unlock_bh(&sk->timer_lock);
+}
+
+void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len)
+{
+ spin_lock_bh(&sk->timer_lock);
+ if(!sk->timer.prev || !del_timer(&sk->timer))
+ sock_hold(sk);
+ mod_timer(&sk->timer, jiffies+len);
+ spin_unlock_bh(&sk->timer_lock);
+}
+
+void tcp_set_keepalive(struct sock *sk, int val)
+{
+ if (val && !sk->keepopen)
+ tcp_reset_keepalive_timer(sk, keepalive_time_when(&sk->tp_pinfo.af_tcp));
+ else if (!val)
+ tcp_delete_keepalive_timer(sk);
+}
+
+
+void tcp_keepalive_timer (unsigned long data)
+{
+ struct sock *sk = (struct sock *) data;
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ __u32 elapsed;
+
+ /* Only process if socket is not in use. */
+ bh_lock_sock(sk);
+ if (sk->lock.users) {
+ /* Try again later. */
+ tcp_reset_keepalive_timer (sk, HZ/20);
+ goto out;
+ }
+
+ if (sk->state == TCP_FIN_WAIT2 && sk->dead)
+ goto death;
+
+ if (!sk->keepopen)
+ goto out;
+
+ elapsed = keepalive_time_when(tp);
+ if (!((1<<sk->state) & (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT2)))
+ goto resched;
+
+ elapsed = tcp_time_stamp - tp->rcv_tstamp;
+
+ if (elapsed >= keepalive_time_when(tp)) {
+ if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
+ (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_write_err(sk, 1);
+ goto out;
+ }
+ tp->probes_out++;
+ tp->pending = TIME_KEEPOPEN;
+ tcp_write_wakeup(sk);
+ elapsed = keepalive_intvl_when(tp);
} else {
- tcp_slow_timer.expires = when;
- add_timer(&tcp_slow_timer);
+		/* The next probe is due at tp->rcv_tstamp + keepalive_time_when(tp). */
+ if (keepalive_time_when(tp) > elapsed)
+ elapsed = keepalive_time_when(tp) - elapsed;
+ else
+ elapsed = 0;
}
+
+resched:
+ tcp_reset_keepalive_timer (sk, elapsed);
+ goto out;
+
+death:
+ tcp_set_state(sk, TCP_CLOSE);
+ tcp_clear_xmit_timers(sk);
+ tcp_done(sk);
+
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
}
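
The elapsed bookkeeping above reduces to: if the connection has been idle for a full keepalive period, probe now and sleep the probe interval; otherwise sleep exactly the remaining idle time. A condensed model with fixed numbers standing in for keepalive_time_when() and keepalive_intvl_when():

#include <stdio.h>

static long keepalive_time = 7200;	/* stand-in, seconds */
static long keepalive_intvl = 75;	/* stand-in, seconds */

/* Returns how long the keepalive timer should sleep next,
 * given how long the connection has been idle. */
static long next_keepalive_delay(long elapsed)
{
	if (elapsed >= keepalive_time)
		return keepalive_intvl;		/* probe sent; wait the interval */
	return keepalive_time - elapsed;	/* sleep out the remaining idle time */
}

int main(void)
{
	printf("%ld\n", next_keepalive_delay(7200));	/* 75 */
	printf("%ld\n", next_keepalive_delay(3600));	/* 3600 */
	return 0;
}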
diff --git a/net/ipv4/timer.c b/net/ipv4/timer.c
deleted file mode 100644
index 0487f5bfa..000000000
--- a/net/ipv4/timer.c
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * INET An implementation of the TCP/IP protocol suite for the LINUX
- * operating system. INET is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * TIMER - implementation of software timers for IP.
- *
- * Version: $Id: timer.c,v 1.16 1999/05/27 00:37:39 davem Exp $
- *
- * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
- * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
- * Corey Minyard <wf-rch!minyard@relay.EU.net>
- * Fred Baumgarten, <dc6iq@insu1.etec.uni-karlsruhe.de>
- * Florian La Roche, <flla@stud.uni-sb.de>
- *
- * Fixes:
- * Alan Cox : To avoid destroying a wait queue as we use it
- * we defer destruction until the destroy timer goes
- * off.
- * Alan Cox : Destroy socket doesn't write a status value to the
- * socket buffer _AFTER_ freeing it! Also sock ensures
- * the socket will get removed BEFORE this is called
- * otherwise if the timer TIME_DESTROY occurs inside
- * of inet_bh() with this socket being handled it goes
- * BOOM! Have to stop timer going off if net_bh is
- * active or the destroy causes crashes.
- * Alan Cox : Cleaned up unused code.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <asm/system.h>
-#include <linux/interrupt.h>
-#include <linux/inet.h>
-#include <linux/netdevice.h>
-#include <net/ip.h>
-#include <net/protocol.h>
-#include <net/tcp.h>
-#include <linux/skbuff.h>
-#include <net/sock.h>
-#include <net/arp.h>
-
-void net_delete_timer (struct sock *t)
-{
- if(t->timer.prev)
- del_timer (&t->timer);
- t->timeout = 0;
-}
-
-void net_reset_timer (struct sock *t, int timeout, unsigned long len)
-{
- t->timeout = timeout;
- mod_timer(&t->timer, jiffies+len);
-}
-
-/* Now we will only be called whenever we need to do
- * something, but we must be sure to process all of the
- * sockets that need it.
- */
-void net_timer (unsigned long data)
-{
- struct sock *sk = (struct sock *) data;
- int why = sk->timeout;
-
- /* Only process if socket is not in use. */
- bh_lock_sock(sk);
- if (sk->lock.users) {
- /* Try again later. */
- mod_timer(&sk->timer, jiffies+HZ/20);
- bh_unlock_sock(sk);
- return;
- }
-
- /* Always see if we need to send an ack. */
- if (sk->tp_pinfo.af_tcp.delayed_acks && !sk->zapped) {
- sk->prot->read_wakeup (sk);
- if (!sk->dead)
- sk->data_ready(sk,0);
- }
-
- /* Now we need to figure out why the socket was on the timer. */
- switch (why) {
- case TIME_DONE:
- /* If the socket hasn't been closed off, re-try a bit later. */
- if (!sk->dead) {
- net_reset_timer(sk, TIME_DONE, TCP_DONE_TIME);
- break;
- }
-
- if (sk->state != TCP_CLOSE) {
- printk (KERN_DEBUG "non CLOSE socket in time_done\n");
- break;
- }
- destroy_sock(sk);
- return;
-
- case TIME_DESTROY:
- /* We've waited for a while for all the memory associated with
- * the socket to be freed.
- */
- destroy_sock(sk);
- return;
-
- case TIME_CLOSE:
- /* We've waited long enough, close the socket. */
- tcp_set_state(sk, TCP_CLOSE);
- sk->shutdown = SHUTDOWN_MASK;
- if (!sk->dead)
- sk->state_change(sk);
- net_reset_timer (sk, TIME_DONE, TCP_DONE_TIME);
- break;
-
- default:
- /* I want to see these... */
- printk ("net_timer: timer expired - reason %d is unknown\n", why);
- break;
- }
-
- /* We only need to unlock if the socket was not destroyed. */
- bh_unlock_sock(sk);
-}
-
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c99dffff0..bffd5b727 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -5,7 +5,7 @@
*
* The User Datagram Protocol (UDP).
*
- * Version: $Id: udp.c,v 1.71 1999/07/02 11:26:33 davem Exp $
+ * Version: $Id: udp.c,v 1.74 1999/08/20 11:06:12 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -113,6 +113,7 @@
#include <net/udp.h>
#include <net/icmp.h>
#include <net/route.h>
+#include <net/inet_common.h>
#include <net/checksum.h>
/*
@@ -122,13 +123,14 @@
struct udp_mib udp_statistics;
struct sock *udp_hash[UDP_HTABLE_SIZE];
+rwlock_t udp_hash_lock = RW_LOCK_UNLOCKED;
/* Shared by v4/v6 udp. */
int udp_port_rover = 0;
static int udp_v4_get_port(struct sock *sk, unsigned short snum)
{
- SOCKHASH_LOCK_WRITE();
+ write_lock_bh(&udp_hash_lock);
if (snum == 0) {
int best_size_so_far, best, result, i;
@@ -186,11 +188,11 @@ gotit:
}
}
sk->num = snum;
- SOCKHASH_UNLOCK_WRITE();
+ write_unlock_bh(&udp_hash_lock);
return 0;
fail:
- SOCKHASH_UNLOCK_WRITE();
+ write_unlock_bh(&udp_hash_lock);
return 1;
}
@@ -198,7 +200,7 @@ static void udp_v4_hash(struct sock *sk)
{
struct sock **skp = &udp_hash[sk->num & (UDP_HTABLE_SIZE - 1)];
- SOCKHASH_LOCK_WRITE();
+ write_lock_bh(&udp_hash_lock);
if ((sk->next = *skp) != NULL)
(*skp)->pprev = &sk->next;
*skp = sk;
@@ -206,20 +208,22 @@ static void udp_v4_hash(struct sock *sk)
sk->prot->inuse++;
if(sk->prot->highestinuse < sk->prot->inuse)
sk->prot->highestinuse = sk->prot->inuse;
- SOCKHASH_UNLOCK_WRITE();
+ sock_hold(sk);
+ write_unlock_bh(&udp_hash_lock);
}
static void udp_v4_unhash(struct sock *sk)
{
- SOCKHASH_LOCK_WRITE();
+ write_lock_bh(&udp_hash_lock);
if (sk->pprev) {
if (sk->next)
sk->next->pprev = sk->pprev;
*sk->pprev = sk->next;
sk->pprev = NULL;
sk->prot->inuse--;
+ __sock_put(sk);
}
- SOCKHASH_UNLOCK_WRITE();
+ write_unlock_bh(&udp_hash_lock);
}
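
The hash table now pairs the new udp_hash_lock with reference counting: the table itself owns one reference to each member socket, taken on hash and dropped on unhash. A compressed pthread model of that pairing (all types and names here are invented):

#include <pthread.h>
#include <stdio.h>

struct hsock { int refcnt; int hashed; };

static pthread_rwlock_t hash_lock = PTHREAD_RWLOCK_INITIALIZER;

static void hash_sock(struct hsock *sk)
{
	pthread_rwlock_wrlock(&hash_lock);
	sk->hashed = 1;
	sk->refcnt++;		/* the table owns a reference, cf. sock_hold() */
	pthread_rwlock_unlock(&hash_lock);
}

static void unhash_sock(struct hsock *sk)
{
	pthread_rwlock_wrlock(&hash_lock);
	if (sk->hashed) {
		sk->hashed = 0;
		sk->refcnt--;	/* cf. __sock_put() */
	}
	pthread_rwlock_unlock(&hash_lock);
}

int main(void)
{
	struct hsock sk = { 1, 0 };

	hash_sock(&sk);
	unhash_sock(&sk);
	printf("refcnt back to %d\n", sk.refcnt);	/* 1 */
	return 0;
}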
/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
@@ -232,7 +236,7 @@ struct sock *udp_v4_lookup_longway(u32 saddr, u16 sport, u32 daddr, u16 dport, i
int badness = -1;
for(sk = udp_hash[hnum & (UDP_HTABLE_SIZE - 1)]; sk != NULL; sk = sk->next) {
- if((sk->num == hnum) && !(sk->dead && (sk->state == TCP_CLOSE))) {
+ if(sk->num == hnum) {
int score = 0;
if(sk->rcv_saddr) {
if(sk->rcv_saddr != daddr)
@@ -270,94 +274,14 @@ __inline__ struct sock *udp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport
{
struct sock *sk;
- SOCKHASH_LOCK_READ();
+ read_lock(&udp_hash_lock);
sk = udp_v4_lookup_longway(saddr, sport, daddr, dport, dif);
- SOCKHASH_UNLOCK_READ();
+ if (sk)
+ sock_hold(sk);
+ read_unlock(&udp_hash_lock);
return sk;
}
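
udp_v4_lookup_longway (partially visible above) scores each candidate socket by how many of its bound fields match the packet, skipping sockets whose bound fields conflict; the highest score wins. A stand-alone model of the scoring rule, with a simplified socket struct:

#include <stdio.h>

struct usock {
	unsigned rcv_saddr, daddr;	/* 0 means wildcard */
	unsigned short dport;		/* 0 means wildcard */
};

/* Returns -1 when the socket cannot match, else its specificity score. */
static int udp_score(const struct usock *s, unsigned daddr,
		     unsigned saddr, unsigned short sport)
{
	int score = 0;

	if (s->rcv_saddr) {
		if (s->rcv_saddr != daddr)
			return -1;
		score++;
	}
	if (s->daddr) {
		if (s->daddr != saddr)
			return -1;
		score++;
	}
	if (s->dport) {
		if (s->dport != sport)
			return -1;
		score++;
	}
	return score;
}

int main(void)
{
	struct usock wild = { 0, 0, 0 };
	struct usock bound = { 0x7f000001, 0, 0 };

	/* A socket bound to 127.0.0.1 beats a pure wildcard for that address. */
	printf("wild=%d bound=%d\n",
	       udp_score(&wild, 0x7f000001, 0x0a000001, 53),
	       udp_score(&bound, 0x7f000001, 0x0a000001, 53));
	return 0;
}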
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
-#define secondlist(hpnum, sk, fpass) \
-({ struct sock *s1; if(!(sk) && (fpass)--) \
- s1 = udp_hash[(hpnum) & (UDP_HTABLE_SIZE - 1)]; \
- else \
- s1 = (sk); \
- s1; \
-})
-
-#define udp_v4_proxy_loop_init(hnum, hpnum, sk, fpass) \
- secondlist((hpnum), udp_hash[(hnum)&(UDP_HTABLE_SIZE-1)],(fpass))
-
-#define udp_v4_proxy_loop_next(hnum, hpnum, sk, fpass) \
- secondlist((hpnum),(sk)->next,(fpass))
-
-static struct sock *udp_v4_proxy_lookup(unsigned short num, unsigned long raddr,
- unsigned short rnum, unsigned long laddr,
- struct device *dev, unsigned short pnum,
- int dif)
-{
- struct sock *s, *result = NULL;
- int badness = -1;
- u32 paddr = 0;
- unsigned short hnum = ntohs(num);
- unsigned short hpnum = ntohs(pnum);
- int firstpass = 1;
-
- if(dev && dev->ip_ptr) {
- struct in_device *idev = dev->ip_ptr;
-
- if(idev->ifa_list)
- paddr = idev->ifa_list->ifa_local;
- }
-
- SOCKHASH_LOCK_READ();
- for(s = udp_v4_proxy_loop_init(hnum, hpnum, s, firstpass);
- s != NULL;
- s = udp_v4_proxy_loop_next(hnum, hpnum, s, firstpass)) {
- if(s->num == hnum || s->num == hpnum) {
- int score = 0;
- if(s->dead && (s->state == TCP_CLOSE))
- continue;
- if(s->rcv_saddr) {
- if((s->num != hpnum || s->rcv_saddr != paddr) &&
- (s->num != hnum || s->rcv_saddr != laddr))
- continue;
- score++;
- }
- if(s->daddr) {
- if(s->daddr != raddr)
- continue;
- score++;
- }
- if(s->dport) {
- if(s->dport != rnum)
- continue;
- score++;
- }
- if(s->bound_dev_if) {
- if(s->bound_dev_if != dif)
- continue;
- score++;
- }
- if(score == 4 && s->num == hnum) {
- result = s;
- break;
- } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) {
- result = s;
- badness = score;
- }
- }
- }
- SOCKHASH_UNLOCK_READ();
- return result;
-}
-
-#undef secondlist
-#undef udp_v4_proxy_loop_init
-#undef udp_v4_proxy_loop_next
-
-#endif
-
static inline struct sock *udp_v4_mcast_next(struct sock *sk,
unsigned short num,
unsigned long raddr,
@@ -369,7 +293,6 @@ static inline struct sock *udp_v4_mcast_next(struct sock *sk,
unsigned short hnum = ntohs(num);
for(; s; s = s->next) {
if ((s->num != hnum) ||
- (s->dead && (s->state == TCP_CLOSE)) ||
(s->daddr && s->daddr!=raddr) ||
(s->dport != rnum && s->dport != 0) ||
(s->rcv_saddr && s->rcv_saddr != laddr) ||
@@ -423,7 +346,7 @@ void udp_err(struct sk_buff *skb, unsigned char *dp, int len)
err = EHOSTUNREACH;
break;
case ICMP_SOURCE_QUENCH:
- return;
+ goto out;
case ICMP_PARAMETERPROB:
err = EPROTO;
info = ntohl(skb->h.icmph->un.gateway)>>24;
@@ -431,13 +354,13 @@ void udp_err(struct sk_buff *skb, unsigned char *dp, int len)
break;
case ICMP_DEST_UNREACH:
if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
- if (sk->ip_pmtudisc != IP_PMTUDISC_DONT) {
+ if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT) {
err = EMSGSIZE;
info = ntohs(skb->h.icmph->un.frag.mtu);
harderr = 1;
break;
}
- return;
+ goto out;
}
err = EHOSTUNREACH;
if (code <= NR_ICMP_UNREACH) {
@@ -460,20 +383,22 @@ void udp_err(struct sk_buff *skb, unsigned char *dp, int len)
* 4.1.3.3. After the comment above, that should be no surprise.
*/
- if (!harderr && !sk->ip_recverr)
- return;
+ if (!harderr && !sk->protinfo.af_inet.recverr)
+ goto out;
/*
* 4.x BSD compatibility item. Break RFC1122 to
* get BSD socket semantics.
*/
- if(sk->bsdism && sk->state!=TCP_ESTABLISHED)
- return;
+ if(sk->bsdism && sk->state!=TCP_ESTABLISHED && !sk->protinfo.af_inet.recverr)
+ goto out;
- if (sk->ip_recverr)
+ if (sk->protinfo.af_inet.recverr)
ip_icmp_error(sk, skb, err, uh->dest, info, (u8*)(uh+1));
sk->err = err;
sk->error_report(sk);
+out:
+ sock_put(sk);
}
@@ -574,16 +499,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
if (msg->msg_flags&MSG_OOB) /* Mirror BSD error message compatibility */
return -EOPNOTSUPP;
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- if (msg->msg_flags&~(MSG_DONTROUTE|MSG_DONTWAIT|MSG_PROXY|MSG_NOSIGNAL))
- return -EINVAL;
- if ((msg->msg_flags&MSG_PROXY) && !capable(CAP_NET_ADMIN))
- return -EPERM;
-#else
- if (msg->msg_flags&~(MSG_DONTROUTE|MSG_DONTWAIT|MSG_NOSIGNAL))
- return -EINVAL;
-#endif
-
/*
* Get and verify the address.
*/
@@ -592,8 +507,12 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
struct sockaddr_in * usin = (struct sockaddr_in*)msg->msg_name;
if (msg->msg_namelen < sizeof(*usin))
return -EINVAL;
- if (usin->sin_family != AF_INET)
- return -EINVAL;
+ if (usin->sin_family != AF_INET) {
+ if (usin->sin_family != AF_UNSPEC)
+ return -EINVAL;
+ if (net_ratelimit())
+ printk("Remind Kuznetsov, he has to repair %s eventually\n", current->comm);
+ }
ufh.daddr = usin->sin_addr.s_addr;
ufh.uh.dest = usin->sin_port;
@@ -609,27 +528,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
*/
connected = 1;
}
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- if (msg->msg_flags&MSG_PROXY) {
- /*
- * We map the first 8 bytes of a second sockaddr_in
- * into the last 8 (unused) bytes of a sockaddr_in.
- */
- struct sockaddr_in *from = (struct sockaddr_in *)msg->msg_name;
- from = (struct sockaddr_in *)&from->sin_zero;
- if (from->sin_family != AF_INET)
- return -EINVAL;
- ipc.addr = from->sin_addr.s_addr;
- ufh.uh.source = from->sin_port;
- if (ipc.addr == 0)
- ipc.addr = sk->saddr;
- connected = 0;
- } else
-#endif
- {
- ipc.addr = sk->saddr;
- ufh.uh.source = sk->sport;
- }
+ ipc.addr = sk->saddr;
+ ufh.uh.source = sk->sport;
ipc.opt = NULL;
ipc.oif = sk->bound_dev_if;
@@ -642,7 +542,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
connected = 0;
}
if (!ipc.opt)
- ipc.opt = sk->opt;
+ ipc.opt = sk->protinfo.af_inet.opt;
ufh.saddr = ipc.addr;
ipc.addr = daddr = ufh.daddr;
@@ -653,7 +553,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
daddr = ipc.opt->faddr;
connected = 0;
}
- tos = RT_TOS(sk->ip_tos);
+ tos = RT_TOS(sk->protinfo.af_inet.tos);
if (sk->localroute || (msg->msg_flags&MSG_DONTROUTE) ||
(ipc.opt && ipc.opt->is_strictroute)) {
tos |= RTO_ONLINK;
@@ -662,29 +562,31 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
if (MULTICAST(daddr)) {
if (!ipc.oif)
- ipc.oif = sk->ip_mc_index;
+ ipc.oif = sk->protinfo.af_inet.mc_index;
if (!ufh.saddr)
- ufh.saddr = sk->ip_mc_addr;
+ ufh.saddr = sk->protinfo.af_inet.mc_addr;
connected = 0;
}
if (connected)
- rt = (struct rtable*)dst_clone(sk->dst_cache);
+ rt = (struct rtable*)sk_dst_check(sk, 0);
if (rt == NULL) {
- err = ip_route_output(&rt, daddr, ufh.saddr,
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- (msg->msg_flags&MSG_PROXY ? RTO_TPROXY : 0) |
-#endif
- tos, ipc.oif);
- if (err)
+ err = ip_route_output(&rt, daddr, ufh.saddr, tos, ipc.oif);
+ if (err)
goto out;
err = -EACCES;
if (rt->rt_flags&RTCF_BROADCAST && !sk->broadcast)
goto out;
+ if (connected)
+ sk_dst_set(sk, dst_clone(&rt->u.dst));
}
+ if (msg->msg_flags&MSG_CONFIRM)
+ goto do_confirm;
+back_from_confirm:
+
ufh.saddr = rt->rt_src;
if (!ipc.addr)
ufh.daddr = ipc.addr = rt->rt_dst;
@@ -712,6 +614,13 @@ out:
return len;
}
return err;
+
+do_confirm:
+ dst_confirm(&rt->u.dst);
+ if (!(msg->msg_flags&MSG_PROBE) || len)
+ goto back_from_confirm;
+ err = 0;
+ goto out;
}
/*
@@ -736,9 +645,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
unsigned long amount;
amount = 0;
- /* N.B. Is this interrupt safe??
- -> Yes. Interrupts do not remove skbs. --ANK (980725)
- */
+ spin_lock_irq(&sk->receive_queue.lock);
skb = skb_peek(&sk->receive_queue);
if (skb != NULL) {
/*
@@ -748,6 +655,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
*/
amount = skb->len - sizeof(struct udphdr);
}
+ spin_unlock_irq(&sk->receive_queue.lock);
return put_user(amount, (int *)arg);
}
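
Holding the queue lock across both the peek and the length read is the point of this hunk: a softirq may otherwise dequeue the head skb between the two steps. A user-space model of the race-free pattern, with a mutex standing in for the IRQ spinlock and invented queue types:

#include <pthread.h>
#include <stdio.h>

struct fake_skb { int len; struct fake_skb *next; };
struct fake_queue { pthread_mutex_t lock; struct fake_skb *head; };

/* Model of the fixed SIOCINQ path: hold the queue lock across
 * both the peek and the length read so the head cannot change. */
static int queued_bytes(struct fake_queue *q, int hdr_len)
{
	int amount = 0;

	pthread_mutex_lock(&q->lock);
	if (q->head)
		amount = q->head->len - hdr_len;
	pthread_mutex_unlock(&q->lock);
	return amount;
}

int main(void)
{
	struct fake_skb skb = { 36, 0 };
	struct fake_queue q = { PTHREAD_MUTEX_INITIALIZER, &skb };

	printf("%d\n", queued_bytes(&q, 8));	/* 28 payload bytes */
	return 0;
}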
@@ -832,25 +740,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len,
sin->sin_family = AF_INET;
sin->sin_port = skb->h.uh->source;
sin->sin_addr.s_addr = skb->nh.iph->saddr;
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- if (flags&MSG_PROXY)
- {
- /*
- * We map the first 8 bytes of a second sockaddr_in
- * into the last 8 (unused) bytes of a sockaddr_in.
- * This _is_ ugly, but it's the only way to do it
- * easily, without adding system calls.
- */
- struct sockaddr_in *sinto =
- (struct sockaddr_in *) sin->sin_zero;
-
- sinto->sin_family = AF_INET;
- sinto->sin_port = skb->h.uh->dest;
- sinto->sin_addr.s_addr = skb->nh.iph->daddr;
- }
-#endif
}
- if (sk->ip_cmsg_flags)
+ if (sk->protinfo.af_inet.cmsg_flags)
ip_cmsg_recv(msg, skb);
err = copied;
@@ -862,6 +753,20 @@ out:
#ifdef CONFIG_UDP_DELAY_CSUM
csum_copy_err:
udp_statistics.UdpInErrors++;
+
+ /* Clear queue. */
+ if (flags&MSG_PEEK) {
+ int clear = 0;
+ spin_lock_irq(&sk->receive_queue.lock);
+ if (skb == skb_peek(&sk->receive_queue)) {
+ __skb_unlink(skb, &sk->receive_queue);
+ clear = 1;
+ }
+ spin_unlock_irq(&sk->receive_queue.lock);
+ if (clear)
+ kfree_skb(skb);
+ }
+
skb_free_datagram(sk, skb);
/*
@@ -882,26 +787,13 @@ int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (addr_len < sizeof(*usin))
return -EINVAL;
- /*
- * 1003.1g - break association.
- */
-
- if (usin->sin_family==AF_UNSPEC)
- {
- sk->saddr=INADDR_ANY;
- sk->rcv_saddr=INADDR_ANY;
- sk->daddr=INADDR_ANY;
- sk->state = TCP_CLOSE;
- return 0;
- }
-
- if (usin->sin_family && usin->sin_family != AF_INET)
+ if (usin->sin_family != AF_INET)
return -EAFNOSUPPORT;
- dst_release(xchg(&sk->dst_cache, NULL));
+ sk_dst_reset(sk);
err = ip_route_connect(&rt, usin->sin_addr.s_addr, sk->saddr,
- sk->ip_tos|sk->localroute, sk->bound_dev_if);
+ sk->protinfo.af_inet.tos|sk->localroute, sk->bound_dev_if);
if (err)
return err;
if ((rt->rt_flags&RTCF_BROADCAST) && !sk->broadcast) {
@@ -916,20 +808,27 @@ int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
sk->dport = usin->sin_port;
sk->state = TCP_ESTABLISHED;
- sk->dst_cache = &rt->u.dst;
+ sk_dst_set(sk, &rt->u.dst);
return(0);
}
+int udp_disconnect(struct sock *sk, int flags)
+{
+ /*
+ * 1003.1g - break association.
+ */
+
+ sk->state = TCP_CLOSE;
+ sk->rcv_saddr = 0;
+ sk->daddr = 0;
+ sk->dport = 0;
+ sk_dst_reset(sk);
+ return 0;
+}
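
With the 1003.1g break-association path moved from udp_connect() into udp_disconnect(), user space still dissolves a UDP association the standard way: a second connect() with an AF_UNSPEC address. A minimal usage example (standard sockets API; error handling omitted):

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in dst;
	struct sockaddr unspec;

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(9);			/* discard port */
	dst.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	connect(fd, (struct sockaddr *)&dst, sizeof(dst));

	/* 1003.1g: connecting to AF_UNSPEC breaks the association. */
	memset(&unspec, 0, sizeof(unspec));
	unspec.sa_family = AF_UNSPEC;
	connect(fd, &unspec, sizeof(unspec));

	close(fd);
	return 0;
}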
static void udp_close(struct sock *sk, long timeout)
{
- bh_lock_sock(sk);
-
- /* See for explanation: raw_close in ipv4/raw.c */
- sk->state = TCP_CLOSE;
- udp_v4_unhash(sk);
- sk->dead = 1;
- destroy_sock(sk);
+ inet_sock_release(sk);
}
static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
@@ -980,6 +879,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
struct sock *sk;
int dif;
+ read_lock(&udp_hash_lock);
sk = udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)];
dif = skb->dev->ifindex;
sk = udp_v4_mcast_next(sk, uh->dest, saddr, uh->source, daddr, dif);
@@ -1000,33 +900,10 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
} while(sknext);
} else
kfree_skb(skb);
+ read_unlock(&udp_hash_lock);
return 0;
}
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
-/*
- * Check whether a received UDP packet might be for one of our
- * sockets.
- */
-
-int udp_chkaddr(struct sk_buff *skb)
-{
- struct iphdr *iph = skb->nh.iph;
- struct udphdr *uh = (struct udphdr *)(skb->nh.raw + iph->ihl*4);
- struct sock *sk;
-
- sk = udp_v4_lookup(iph->saddr, uh->source, iph->daddr, uh->dest, skb->dev->ifindex);
- if (!sk)
- return 0;
-
- /* 0 means accept all LOCAL addresses here, not all the world... */
- if (sk->rcv_saddr == 0)
- return 0;
-
- return 1;
-}
-#endif
-
static int udp_checksum_verify(struct sk_buff *skb, struct udphdr *uh,
unsigned short ulen, u32 saddr, u32 daddr,
int full_csum_deferred)
@@ -1068,11 +945,6 @@ int udp_rcv(struct sk_buff *skb, unsigned short len)
u32 daddr = skb->nh.iph->daddr;
/*
- * First time through the loop.. Do all the setup stuff
- * (including finding out the socket we go to etc)
- */
-
- /*
* Get the header.
*/
@@ -1108,26 +980,18 @@ int udp_rcv(struct sk_buff *skb, unsigned short len)
return udp_v4_mcast_deliver(skb, uh, saddr, daddr);
}
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- if (IPCB(skb)->redirport)
- sk = udp_v4_proxy_lookup(uh->dest, saddr, uh->source,
- daddr, skb->dev, IPCB(skb)->redirport,
- skb->dev->ifindex);
- else
-#endif
sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex);
if (sk == NULL) {
-#ifdef CONFIG_UDP_DELAY_CSUM
- if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
- (unsigned short)csum_fold(csum_partial((char*)uh, ulen, skb->csum)))
+		/* No socket. Drop the packet silently if the checksum is wrong. */
+ if (udp_checksum_verify(skb, uh, ulen, saddr, daddr, 0))
goto csum_error;
-#endif
+
udp_statistics.UdpNoPorts++;
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
/*
- * Hmm. We got an UDP broadcast to a port to which we
+	 * Hmm. We got a UDP packet to a port to which we
* don't wanna listen. Ignore it.
*/
kfree_skb(skb);
@@ -1139,10 +1003,13 @@ int udp_rcv(struct sk_buff *skb, unsigned short len)
#else
(sk->no_check & UDP_CSUM_NORCV) != 0
#endif
- ))
+ )) {
+ sock_put(sk);
goto csum_error;
+ }
udp_deliver(sk, skb);
+ __sock_put(sk);
return 0;
csum_error:
@@ -1175,12 +1042,13 @@ static void get_udp_sock(struct sock *sp, char *tmpbuf, int i)
timer_active = (sp->timer.prev != NULL) ? 2 : 0;
timer_expires = (timer_active == 2 ? sp->timer.expires : jiffies);
sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld",
+ " %02X %08X:%08X %02X:%08lX %08X %5d %8d %ld %d %p",
i, src, srcp, dest, destp, sp->state,
atomic_read(&sp->wmem_alloc), atomic_read(&sp->rmem_alloc),
timer_active, timer_expires-jiffies, 0,
- sp->socket->inode->i_uid, timer_active ? sp->timeout : 0,
- sp->socket ? sp->socket->inode->i_ino : 0);
+ sp->socket->inode->i_uid, 0,
+ sp->socket ? sp->socket->inode->i_ino : 0,
+ atomic_read(&sp->refcnt), sp);
}
int udp_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
@@ -1195,7 +1063,7 @@ int udp_get_info(char *buffer, char **start, off_t offset, int length, int dummy
" sl local_address rem_address st tx_queue "
"rx_queue tr tm->when retrnsmt uid timeout inode");
pos = 128;
- SOCKHASH_LOCK_READ();
+ read_lock(&udp_hash_lock);
for (i = 0; i < UDP_HTABLE_SIZE; i++) {
struct sock *sk;
@@ -1212,7 +1080,7 @@ int udp_get_info(char *buffer, char **start, off_t offset, int length, int dummy
}
}
out:
- SOCKHASH_UNLOCK_READ();
+ read_unlock(&udp_hash_lock);
begin = len - (pos - offset);
*start = buffer + begin;
len -= begin;
@@ -1226,6 +1094,7 @@ out:
struct proto udp_prot = {
udp_close, /* close */
udp_connect, /* connect */
+ udp_disconnect, /* disconnect */
NULL, /* accept */
NULL, /* retransmit */
NULL, /* write_wakeup */