From 46e045034336a2cc90c1798cd7cc07af744ddfd6 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 19 Apr 2000 04:00:00 +0000 Subject: Merge with Linux 2.3.99-pre4. --- net/ipv4/af_inet.c | 4 +- net/ipv4/icmp.c | 3 +- net/ipv4/ip_output.c | 13 +- net/ipv4/netfilter/.cvsignore | 2 + net/ipv4/netfilter/Config.in | 7 + net/ipv4/netfilter/Makefile | 54 ++- net/ipv4/netfilter/ip_conntrack_core.c | 38 +- net/ipv4/netfilter/ip_conntrack_ftp.c | 5 +- net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 1 + net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 6 +- net/ipv4/netfilter/ip_conntrack_proto_udp.c | 1 + net/ipv4/netfilter/ip_conntrack_standalone.c | 5 +- net/ipv4/netfilter/ip_fw_compat.c | 2 - net/ipv4/netfilter/ip_fw_compat_masq.c | 1 + net/ipv4/netfilter/ip_nat_ftp.c | 9 +- net/ipv4/netfilter/ip_nat_standalone.c | 8 +- net/ipv4/netfilter/ip_queue.c | 516 +++++++++------------------ net/ipv4/netfilter/ip_tables.c | 83 ++++- net/ipv4/netfilter/ipchains_core.c | 2 + net/ipv4/netfilter/ipt_LOG.c | 12 +- net/ipv4/netfilter/ipt_MARK.c | 10 +- net/ipv4/netfilter/ipt_MASQUERADE.c | 2 - net/ipv4/netfilter/ipt_MIRROR.c | 3 +- net/ipv4/netfilter/ipt_REDIRECT.c | 2 - net/ipv4/netfilter/ipt_REJECT.c | 15 +- net/ipv4/netfilter/ipt_TOS.c | 10 +- net/ipv4/netfilter/ipt_limit.c | 1 - net/ipv4/netfilter/ipt_mac.c | 1 - net/ipv4/netfilter/ipt_mark.c | 2 - net/ipv4/netfilter/ipt_multiport.c | 2 - net/ipv4/netfilter/ipt_owner.c | 4 +- net/ipv4/netfilter/ipt_state.c | 8 +- net/ipv4/netfilter/ipt_tos.c | 2 - net/ipv4/netfilter/ipt_unclean.c | 2 - net/ipv4/tcp.c | 88 +++-- net/ipv4/tcp_input.c | 51 ++- net/ipv4/tcp_ipv4.c | 5 +- net/ipv4/tcp_output.c | 44 ++- 38 files changed, 486 insertions(+), 538 deletions(-) create mode 100644 net/ipv4/netfilter/.cvsignore (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b848151a9..d3fc0e38f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -5,7 +5,7 @@ * * PF_INET protocol family socket handler. * - * Version: $Id: af_inet.c,v 1.108 2000/02/21 16:25:59 davem Exp $ + * Version: $Id: af_inet.c,v 1.109 2000/03/25 01:55:10 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -607,7 +607,7 @@ int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr, if (!timeo || !inet_wait_for_connect(sk, timeo)) goto out; - err = -ERESTARTSYS; + err = sock_intr_errno(timeo); if (signal_pending(current)) goto out; } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 7561e190b..7c462ac08 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -3,7 +3,7 @@ * * Alan Cox, * - * Version: $Id: icmp.c,v 1.66 2000/03/17 14:41:50 davem Exp $ + * Version: $Id: icmp.c,v 1.67 2000/03/25 01:55:11 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -1128,6 +1128,7 @@ void __init icmp_init(struct net_proto_family *ops) if ((err=ops->create(icmp_socket, IPPROTO_ICMP))<0) panic("Failed to create the ICMP control socket.\n"); icmp_socket->sk->allocation=GFP_ATOMIC; + icmp_socket->sk->sndbuf = SK_WMEM_MAX*2; icmp_socket->sk->protinfo.af_inet.ttl = MAXTTL; /* Unhash it so that IP input processing does not even diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index f3013ca57..5792c5de7 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) output module. 
* - * Version: $Id: ip_output.c,v 1.82 2000/03/17 14:41:50 davem Exp $ + * Version: $Id: ip_output.c,v 1.83 2000/03/25 01:52:08 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -415,14 +415,13 @@ int ip_queue_xmit(struct sk_buff *skb) /* OK, we know where to send it, allocate and build IP header. */ iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); - iph->version = 4; - iph->ihl = 5; - iph->tos = sk->protinfo.af_inet.tos; + *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (sk->protinfo.af_inet.tos & 0xff)); + iph->tot_len = htons(skb->len); iph->frag_off = 0; iph->ttl = sk->protinfo.af_inet.ttl; - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; iph->protocol = sk->protocol; + iph->saddr = rt->rt_src; + iph->daddr = rt->rt_dst; skb->nh.iph = iph; /* Transport layer set skb->h.foo itself. */ @@ -431,8 +430,6 @@ int ip_queue_xmit(struct sk_buff *skb) ip_options_build(skb, opt, sk->daddr, rt, 0); } - iph->tot_len = htons(skb->len); - return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, ip_queue_xmit2); diff --git a/net/ipv4/netfilter/.cvsignore b/net/ipv4/netfilter/.cvsignore new file mode 100644 index 000000000..857dd22e9 --- /dev/null +++ b/net/ipv4/netfilter/.cvsignore @@ -0,0 +1,2 @@ +.depend +.*.flags diff --git a/net/ipv4/netfilter/Config.in b/net/ipv4/netfilter/Config.in index bf2a28269..406d2ea3d 100644 --- a/net/ipv4/netfilter/Config.in +++ b/net/ipv4/netfilter/Config.in @@ -39,6 +39,7 @@ if [ "$CONFIG_IP_NF_IPTABLES" != "n" ]; then if [ "$CONFIG_IP_NF_CONNTRACK" != "n" ]; then dep_tristate ' Full NAT' CONFIG_IP_NF_NAT $CONFIG_IP_NF_IPTABLES if [ "$CONFIG_IP_NF_NAT" != "n" ]; then + define_bool CONFIG_IP_NF_NAT_NEEDED y dep_tristate ' MASQUERADE target support' CONFIG_IP_NF_TARGET_MASQUERADE $CONFIG_IP_NF_NAT dep_tristate ' REDIRECT target support' CONFIG_IP_NF_TARGET_REDIRECT $CONFIG_IP_NF_NAT fi @@ -56,8 +57,14 @@ fi if [ "$CONFIG_IP_NF_CONNTRACK" != "y" ]; then if [ "$CONFIG_IP_NF_IPTABLES" != "y" ]; then tristate 'ipchains (2.2-style) support' CONFIG_IP_NF_COMPAT_IPCHAINS + if [ "$CONFIG_IP_NF_COMPAT_IPCHAINS" != "n" ]; then + define_bool CONFIG_IP_NF_NAT_NEEDED y + fi if [ "$CONFIG_IP_NF_COMPAT_IPCHAINS" != "y" ]; then tristate 'ipfwadm (2.0-style) support' CONFIG_IP_NF_COMPAT_IPFWADM + if [ "$CONFIG_IP_NF_COMPAT_IPFWADM" != "n" ]; then + define_bool CONFIG_IP_NF_NAT_NEEDED y + fi fi fi fi diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index c507acc31..db276076a 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -15,10 +15,12 @@ IP_NF_CONNTRACK_OBJ:=ip_conntrack_core.o ip_conntrack_proto_generic.o ip_conntra IP_NF_NAT_OBJ:=ip_nat_core.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o +# All the parts of conntrack and NAT required for compatibility layer. +IP_NF_COMPAT_LAYER:=ip_fw_compat.o ip_fw_compat_redir.o ip_fw_compat_masq.o $(IP_NF_CONNTRACK_OBJ) $(IP_NF_NAT_OBJ) + # Link order matters here. 
ifeq ($(CONFIG_IP_NF_CONNTRACK),y) -OX_OBJS += ip_conntrack_standalone.o -O_OBJS += $(IP_NF_CONNTRACK_OBJ) +O_OBJS += ip_conntrack_standalone.o $(IP_NF_CONNTRACK_OBJ) else ifeq ($(CONFIG_IP_NF_CONNTRACK),m) MI_OBJS += $(IP_NF_CONNTRACK_OBJ) @@ -27,16 +29,8 @@ else endif endif -ifeq ($(CONFIG_IP_NF_QUEUE),y) -O_OBJS += ip_queue.o -else - ifeq ($(CONFIG_IP_NF_QUEUE),m) - M_OBJS += ip_queue.o - endif -endif - ifeq ($(CONFIG_IP_NF_FTP),y) -OX_OBJS += ip_conntrack_ftp.o +O_OBJS += ip_conntrack_ftp.o else ifeq ($(CONFIG_IP_NF_FTP),m) MX_OBJS += ip_conntrack_ftp.o @@ -47,7 +41,7 @@ ifeq ($(CONFIG_IP_NF_IPTABLES),y) O_OBJS += ip_tables.o else ifeq ($(CONFIG_IP_NF_IPTABLES),m) - M_OBJS += ip_tables.o + MX_OBJS += ip_tables.o endif endif @@ -115,17 +109,8 @@ else endif endif -ifeq ($(CONFIG_IP_NF_FILTER),y) -O_OBJS += iptable_filter.o -else - ifeq ($(CONFIG_IP_NF_FILTER),m) - M_OBJS += iptable_filter.o - endif -endif - ifeq ($(CONFIG_IP_NF_NAT),y) -OX_OBJS += ip_nat_standalone.o -O_OBJS += ip_nat_rule.o $(IP_NF_NAT_OBJ) +O_OBJS += ip_nat_standalone.o ip_nat_rule.o $(IP_NF_NAT_OBJ) ifeq ($(CONFIG_IP_NF_FTP),y) O_OBJS += ip_nat_ftp.o endif @@ -140,6 +125,14 @@ else endif endif +ifeq ($(CONFIG_IP_NF_FILTER),y) +O_OBJS += iptable_filter.o +else + ifeq ($(CONFIG_IP_NF_FILTER),m) + M_OBJS += iptable_filter.o + endif +endif + ifeq ($(CONFIG_IP_NF_MANGLE),y) O_OBJS += iptable_mangle.o else @@ -205,7 +198,7 @@ else endif ifeq ($(CONFIG_IP_NF_COMPAT_IPCHAINS),y) -O_OBJS += ipchains.o +O_OBJS += ipchains_core.o $(IP_NF_COMPAT_LAYER) else ifeq ($(CONFIG_IP_NF_COMPAT_IPCHAINS),m) M_OBJS += ipchains.o @@ -213,13 +206,21 @@ else endif ifeq ($(CONFIG_IP_NF_COMPAT_IPFWADM),y) -O_OBJS += ipfwadm.o +O_OBJS += ipfwadm_core.o $(IP_NF_COMPAT_LAYER) else ifeq ($(CONFIG_IP_NF_COMPAT_IPFWADM),m) M_OBJS += ipfwadm.o endif endif +ifeq ($(CONFIG_IP_NF_QUEUE),y) +O_OBJS += ip_queue.o +else + ifeq ($(CONFIG_IP_NF_QUEUE),m) + M_OBJS += ip_queue.o + endif +endif + include $(TOPDIR)/Rules.make ip_conntrack.o: ip_conntrack_standalone.o $(IP_NF_CONNTRACK_OBJ) @@ -228,11 +229,8 @@ ip_conntrack.o: ip_conntrack_standalone.o $(IP_NF_CONNTRACK_OBJ) iptable_nat.o: ip_nat_standalone.o ip_nat_rule.o $(IP_NF_NAT_OBJ) $(LD) -r -o $@ ip_nat_standalone.o ip_nat_rule.o $(IP_NF_NAT_OBJ) -# All the parts of conntrack and NAT required for compatibility layer. 
-IP_NF_COMPAT_LAYER:=ip_fw_compat.o ip_fw_compat_redir.o ip_fw_compat_masq.o $(IP_NF_CONNTRACK_OBJ) $(IP_NF_NAT_OBJ) - ipfwadm.o: ipfwadm_core.o $(IP_NF_COMPAT_LAYER) $(LD) -r -o $@ ipfwadm_core.o $(IP_NF_COMPAT_LAYER) -ipchains.o: ipchains_core.o $(IP_NF_COMPAT_LAYER) +ipchains.o: ipchains_core.o $(IP_NF_COMPAT_LAYER) $(LD) -r -o $@ ipchains_core.o $(IP_NF_COMPAT_LAYER) diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 9007cdc89..197c2e3b4 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -22,6 +22,7 @@ #include #include #include +#include /* This rwlock protects the main hash table, protocol/helper/expected registrations, conntrack timers*/ @@ -43,13 +44,14 @@ DECLARE_RWLOCK(ip_conntrack_lock); void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL; -static LIST_HEAD(expect_list); -static LIST_HEAD(protocol_list); +LIST_HEAD(expect_list); +LIST_HEAD(protocol_list); static LIST_HEAD(helpers); unsigned int ip_conntrack_htable_size = 0; static int ip_conntrack_max = 0; static atomic_t ip_conntrack_count = ATOMIC_INIT(0); struct list_head *ip_conntrack_hash; +static kmem_cache_t *ip_conntrack_cachep; extern struct ip_conntrack_protocol ip_conntrack_generic_protocol; @@ -167,7 +169,7 @@ destroy_conntrack(struct nf_conntrack *nfct) if (ip_conntrack_destroyed) ip_conntrack_destroyed(ct); - kfree(ct); + kmem_cache_free(ip_conntrack_cachep, ct); atomic_dec(&ip_conntrack_count); } @@ -355,7 +357,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, return 1; } - conntrack = kmalloc(sizeof(struct ip_conntrack), GFP_ATOMIC); + conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); if (!conntrack) { DEBUGP("Can't allocate conntrack.\n"); return 1; @@ -374,7 +376,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, conntrack->infos[i].master = &conntrack->ct_general; if (!protocol->new(conntrack, skb->nh.iph, skb->len)) { - kfree(conntrack); + kmem_cache_free(ip_conntrack_cachep, conntrack); return 1; } @@ -384,7 +386,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, if (__ip_conntrack_find(tuple, NULL)) { WRITE_UNLOCK(&ip_conntrack_lock); printk("ip_conntrack: Wow someone raced us!\n"); - kfree(conntrack); + kmem_cache_free(ip_conntrack_cachep, conntrack); return 0; } conntrack->helper = LIST_FIND(&helpers, helper_cmp, @@ -796,6 +798,7 @@ static struct nf_sockopt_ops so_getorigdst #define NET_IP_CONNTRACK_MAX 2089 #define NET_IP_CONNTRACK_MAX_NAME "ip_conntrack_max" +#ifdef CONFIG_SYSCTL static struct ctl_table_header *ip_conntrack_sysctl_header; static ctl_table ip_conntrack_table[] = { @@ -813,6 +816,7 @@ static ctl_table ip_conntrack_root_table[] = { {CTL_NET, "net", NULL, 0, 0555, ip_conntrack_dir_table, 0, 0, 0, 0, 0}, { 0 } }; +#endif /*CONFIG_SYSCTL*/ static int kill_all(const struct ip_conntrack *i, void *data) { @@ -823,8 +827,11 @@ static int kill_all(const struct ip_conntrack *i, void *data) supposed to kill the mall. 
*/ void ip_conntrack_cleanup(void) { +#ifdef CONFIG_SYSCTL unregister_sysctl_table(ip_conntrack_sysctl_header); +#endif ip_ct_selective_cleanup(kill_all, NULL); + kmem_cache_destroy(ip_conntrack_cachep); vfree(ip_conntrack_hash); nf_unregister_sockopt(&so_getorigdst); } @@ -855,6 +862,16 @@ int __init ip_conntrack_init(void) return -ENOMEM; } + ip_conntrack_cachep = kmem_cache_create("ip_conntrack", + sizeof(struct ip_conntrack), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!ip_conntrack_cachep) { + printk(KERN_ERR "Unable to create ip_conntrack slab cache\n"); + vfree(ip_conntrack_hash); + nf_unregister_sockopt(&so_getorigdst); + return -ENOMEM; + } + /* Don't NEED lock here, but good form anyway. */ WRITE_LOCK(&ip_conntrack_lock); /* Sew in builtin protocols. */ @@ -873,19 +890,12 @@ int __init ip_conntrack_init(void) ip_conntrack_sysctl_header = register_sysctl_table(ip_conntrack_root_table, 0); if (ip_conntrack_sysctl_header == NULL) { + kmem_cache_destroy(ip_conntrack_cachep); vfree(ip_conntrack_hash); nf_unregister_sockopt(&so_getorigdst); return -ENOMEM; } #endif /*CONFIG_SYSCTL*/ - ret = ip_conntrack_protocol_tcp_init(); - if (ret != 0) { - unregister_sysctl_table(ip_conntrack_sysctl_header); - vfree(ip_conntrack_hash); - nf_unregister_sockopt(&so_getorigdst); - } - return ret; } - diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index 23ccf74cf..1600156f7 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -10,6 +10,7 @@ #include DECLARE_LOCK(ip_ftp_lock); +struct module *ip_conntrack_ftp = THIS_MODULE; #define SERVER_STRING "227 Entering Passive Mode (" #define CLIENT_STRING "PORT " @@ -240,9 +241,5 @@ static void __exit fini(void) ip_conntrack_helper_unregister(&ftp); } -struct module *ip_conntrack_ftp = THIS_MODULE; -EXPORT_SYMBOL(ip_conntrack_ftp); -EXPORT_SYMBOL(ip_ftp_lock); - module_init(init); module_exit(fini); diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 1d1256be5..cbbc1ab8c 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 3dd448252..893248943 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -220,8 +221,3 @@ struct ip_conntrack_protocol ip_conntrack_protocol_tcp = { { NULL, NULL }, IPPROTO_TCP, "tcp", tcp_pkt_to_tuple, tcp_invert_tuple, tcp_print_tuple, tcp_print_conntrack, tcp_packet, tcp_new, NULL }; - -int __init ip_conntrack_protocol_tcp_init(void) -{ - return 0; -} diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index 688ae10fb..79ec82151 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index a69be542d..9030d9d41 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -276,6 +276,7 @@ static void __exit fini(void) module_init(init); module_exit(fini); +#ifdef MODULE 
EXPORT_SYMBOL(ip_conntrack_protocol_register); EXPORT_SYMBOL(invert_tuplepr); EXPORT_SYMBOL(ip_conntrack_alter_reply); @@ -284,11 +285,9 @@ EXPORT_SYMBOL(ip_conntrack_get); EXPORT_SYMBOL(ip_conntrack_module); EXPORT_SYMBOL(ip_conntrack_helper_register); EXPORT_SYMBOL(ip_conntrack_helper_unregister); -EXPORT_SYMBOL(ip_conntrack_lock); -EXPORT_SYMBOL(find_proto); -EXPORT_SYMBOL(get_tuple); EXPORT_SYMBOL(ip_ct_selective_cleanup); EXPORT_SYMBOL(ip_ct_refresh); EXPORT_SYMBOL(ip_conntrack_expect_related); EXPORT_SYMBOL(ip_conntrack_tuple_taken); EXPORT_SYMBOL(ip_ct_gather_frags); +#endif diff --git a/net/ipv4/netfilter/ip_fw_compat.c b/net/ipv4/netfilter/ip_fw_compat.c index 72dc3d816..2a08ee89c 100644 --- a/net/ipv4/netfilter/ip_fw_compat.c +++ b/net/ipv4/netfilter/ip_fw_compat.c @@ -14,8 +14,6 @@ struct notifier_block; #include #include -EXPORT_NO_SYMBOLS; - static struct firewall_ops *fwops; /* From ip_fw_compat_redir.c */ diff --git a/net/ipv4/netfilter/ip_fw_compat_masq.c b/net/ipv4/netfilter/ip_fw_compat_masq.c index e0074c1e2..96bdc9d8d 100644 --- a/net/ipv4/netfilter/ip_fw_compat_masq.c +++ b/net/ipv4/netfilter/ip_fw_compat_masq.c @@ -5,6 +5,7 @@ DO IT. */ #include +#include #include #include #include diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c index 8252e6d9b..12d40f554 100644 --- a/net/ipv4/netfilter/ip_nat_ftp.c +++ b/net/ipv4/netfilter/ip_nat_ftp.c @@ -11,8 +11,6 @@ #include #include -EXPORT_NO_SYMBOLS; - #if 0 #define DEBUGP printk #else @@ -374,8 +372,6 @@ static struct ip_nat_helper ftp static struct ip_nat_expect ftp_expect = { { NULL, NULL }, ftp_nat_expected }; -extern struct module *ip_conntrack_ftp; - static int __init init(void) { int ret; @@ -384,9 +380,7 @@ static int __init init(void) if (ret == 0) { ret = ip_nat_helper_register(&ftp); - if (ret == 0) - __MOD_INC_USE_COUNT(ip_conntrack_ftp); - else + if (ret != 0) ip_nat_expect_unregister(&ftp_expect); } return ret; @@ -394,7 +388,6 @@ static int __init init(void) static void __exit fini(void) { - __MOD_DEC_USE_COUNT(ip_conntrack_ftp); ip_nat_helper_unregister(&ftp); ip_nat_expect_unregister(&ftp_expect); } diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index 603111063..bfcc435c2 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -230,11 +230,13 @@ static int init_or_cleanup(int init) printk("ip_nat_init: can't register local out hook.\n"); goto cleanup_outops; } - __MOD_INC_USE_COUNT(ip_conntrack_module); + if (ip_conntrack_module) + __MOD_INC_USE_COUNT(ip_conntrack_module); return ret; cleanup: - __MOD_DEC_USE_COUNT(ip_conntrack_module); + if (ip_conntrack_module) + __MOD_DEC_USE_COUNT(ip_conntrack_module); nf_unregister_hook(&ip_nat_local_out_ops); cleanup_outops: nf_unregister_hook(&ip_nat_out_ops); @@ -262,9 +264,11 @@ static void __exit fini(void) module_init(init); module_exit(fini); +#ifdef MODULE EXPORT_SYMBOL(ip_nat_setup_info); EXPORT_SYMBOL(ip_nat_helper_register); EXPORT_SYMBOL(ip_nat_helper_unregister); EXPORT_SYMBOL(ip_nat_expect_register); EXPORT_SYMBOL(ip_nat_expect_unregister); EXPORT_SYMBOL(ip_nat_cheat_check); +#endif diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 532538321..80e43d977 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -2,7 +2,7 @@ * This is a module which is used for queueing IPv4 packets and * communicating with userspace via netlink. 
* - * (C) 2000 James Morris + * (C) 2000 James Morris, this code is GPL. */ #include #include @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -21,20 +20,13 @@ #include -EXPORT_NO_SYMBOLS; - -#define IPQ_THR_NAME "kipq" -#define IPQ_NAME "ip_queue" #define IPQ_QMAX_DEFAULT 1024 - #define IPQ_PROC_FS_NAME "ip_queue" - #define NET_IPQ_QMAX 2088 #define NET_IPQ_QMAX_NAME "ip_queue_maxlen" typedef struct ipq_queue_element { struct list_head list; /* Links element into queue */ - unsigned char state; /* State of this element */ int verdict; /* Current verdict */ struct nf_info *info; /* Extra info from netfilter */ struct sk_buff *skb; /* Packet inside */ @@ -50,178 +42,70 @@ typedef struct ipq_peer { ipq_send_cb_t send; /* Callback for sending data to peer */ } ipq_peer_t; -typedef struct ipq_thread { - pid_t pid; /* PID of kernel thread */ - unsigned char terminate; /* Termination flag */ - unsigned char running; /* Running flag */ - wait_queue_head_t wq; /* I/O wait queue */ - void (*process)(void *data); /* Queue processing function */ -} ipq_thread_t; - typedef struct ipq_queue { int len; /* Current queue len */ int *maxlen; /* Maximum queue len, via sysctl */ - unsigned char state; /* Current queue state */ + unsigned char flushing; /* If queue is being flushed */ + unsigned char terminate; /* If the queue is being terminated */ struct list_head list; /* Head of packet queue */ spinlock_t lock; /* Queue spinlock */ ipq_peer_t peer; /* Userland peer */ - ipq_thread_t thread; /* Thread context */ } ipq_queue_t; -/**************************************************************************** -* -* Kernel thread -* -****************************************************************************/ - -static void ipq_thread_init(char *thread_name) -{ - lock_kernel(); - exit_files(current); - daemonize(); - strcpy(current->comm, thread_name); - unlock_kernel(); - spin_lock_irq(¤t->sigmask_lock); - flush_signals(current); - sigfillset(¤t->blocked); - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); -} - -static int ipq_thread_start(void *data) -{ - ipq_queue_t *q = (ipq_queue_t *)data; - - q->thread.running = 1; - ipq_thread_init(IPQ_THR_NAME); - q->thread.pid = current->pid; - while (!q->thread.terminate) { - interruptible_sleep_on(&q->thread.wq); - q->thread.process(q); - } - q->thread.running = 0; - return 0; -} - -static void ipq_thread_stop(ipq_queue_t *q) -{ - if (!(q->thread.pid || q->thread.running)) - return; - q->state = IPQ_QS_FLUSH; - q->thread.terminate = 1; - wake_up_interruptible(&q->thread.wq); - current->state = TASK_INTERRUPTIBLE; - while (q->thread.running) { - schedule_timeout(HZ/10); - current->state = TASK_RUNNING; - } -} - -static int ipq_thread_create(ipq_queue_t *q) -{ - int status = kernel_thread(ipq_thread_start, q, 0); - return (status < 0) ? status : 0; -} - - /**************************************************************************** * * Packet queue * ****************************************************************************/ -/* Must be called under spinlock */ -static __inline__ void -ipq_dequeue(ipq_queue_t *q, - ipq_queue_element_t *e) -{ - list_del(&e->list); - nf_reinject(e->skb, e->info, e->verdict); - kfree(e); - q->len--; -} - -/* Must be called under spinlock */ -static __inline__ void -ipq_queue_drop(ipq_queue_t *q, - ipq_queue_element_t *e) +/* Dequeue with element packet ID, or from end of queue if ID is zero. 
*/ +static ipq_queue_element_t *ipq_dequeue(ipq_queue_t *q, unsigned long id) { - e->verdict = NF_DROP; - ipq_dequeue(q, e); -} - -static int -ipq_notify_peer(ipq_queue_t *q, - ipq_queue_element_t *e) -{ - int status = q->peer.send(e); + struct list_head *i; + ipq_queue_element_t *e = NULL; - if (status >= 0) { - e->state = IPQ_PS_WAITING; - return status; + spin_lock_bh(&q->lock); + if (q->len == 0) + goto out_unlock; + i = q->list.prev; + if (id > 0) { + while (i != &q->list) { + if (id == (unsigned long )i) + goto out_unlink; + i = i->prev; + } + goto out_unlock; } - if (status == -ERESTARTSYS || status == -EAGAIN) - return 0; - printk(KERN_INFO "%s: error notifying peer %d, resetting " - "state and flushing queue\n", IPQ_NAME, q->peer.pid); - q->state = IPQ_QS_FLUSH; - q->peer.died = 1; - q->peer.pid = 0; - q->peer.copy_mode = IPQ_COPY_META; - q->peer.copy_range = 0; - return status; +out_unlink: + e = (ipq_queue_element_t *)i; + list_del(&e->list); + q->len--; +out_unlock: + spin_unlock_bh(&q->lock); + return e; } -static void -ipq_queue_process(void *data) +static void ipq_flush(ipq_queue_t *q) { - struct list_head *i; - ipq_queue_t *q = (ipq_queue_t *)data; - -restart: - if (q->state == IPQ_QS_HOLD) - return; + ipq_queue_element_t *e; + spin_lock_bh(&q->lock); - for (i = q->list.prev; i != &q->list; i = i->prev) { - ipq_queue_element_t *e = (ipq_queue_element_t *)i; - - if (q->state == IPQ_QS_FLUSH) { - QDEBUG("flushing packet %p\n", e); - ipq_queue_drop(q, e); - continue; - } - switch (e->state) { - case IPQ_PS_NEW: { - int status = ipq_notify_peer(q, e); - if (status < 0) { - spin_unlock_bh(&q->lock); - goto restart; - } - break; - } - case IPQ_PS_VERDICT: - ipq_dequeue(q, e); - break; - case IPQ_PS_WAITING: - break; - default: - printk(KERN_INFO "%s: dropping stuck packet %p " - "with ps=%d qs=%d\n", IPQ_NAME, - e, e->state, q->state); - ipq_queue_drop(q, e); - } + q->flushing = 1; + spin_unlock_bh(&q->lock); + while ((e = ipq_dequeue(q, 0))) { + e->verdict = NF_DROP; + nf_reinject(e->skb, e->info, e->verdict); + kfree(e); } + spin_lock_bh(&q->lock); + q->flushing = 0; spin_unlock_bh(&q->lock); - if (q->state == IPQ_QS_FLUSH) - q->state = IPQ_QS_HOLD; } -static ipq_queue_t * -ipq_queue_create(nf_queue_outfn_t outfn, - ipq_send_cb_t send_cb, - int *errp, - int *sysctl_qmax) +static ipq_queue_t *ipq_create_queue(nf_queue_outfn_t outfn, + ipq_send_cb_t send_cb, + int *errp, int *sysctl_qmax) { int status; ipq_queue_t *q; @@ -232,18 +116,15 @@ ipq_queue_create(nf_queue_outfn_t outfn, *errp = -ENOMEM; return NULL; } - q->thread.terminate = 0; - q->thread.running = 0; - q->thread.process = ipq_queue_process; - init_waitqueue_head(&q->thread.wq); q->peer.pid = 0; q->peer.died = 0; - q->peer.copy_mode = IPQ_COPY_META; + q->peer.copy_mode = IPQ_COPY_NONE; q->peer.copy_range = 0; q->peer.send = send_cb; q->len = 0; q->maxlen = sysctl_qmax; - q->state = IPQ_QS_HOLD; + q->flushing = 0; + q->terminate = 0; INIT_LIST_HEAD(&q->list); spin_lock_init(&q->lock); status = nf_register_queue_handler(PF_INET, outfn, q); @@ -252,91 +133,92 @@ ipq_queue_create(nf_queue_outfn_t outfn, kfree(q); return NULL; } - status = ipq_thread_create(q); - if (status < 0) { - nf_unregister_queue_handler(PF_INET); - *errp = status; - kfree(q); - return NULL; - } return q; } -static int -ipq_enqueue(ipq_queue_t *q, - struct sk_buff *skb, - struct nf_info *info) +static int ipq_enqueue(ipq_queue_t *q, + struct sk_buff *skb, struct nf_info *info) { - ipq_queue_element_t *e = NULL; - + ipq_queue_element_t *e; + int 
status; + e = kmalloc(sizeof(*e), GFP_ATOMIC); if (e == NULL) { - printk(KERN_ERR "%s: out of memory in %s\n", - IPQ_NAME, __FUNCTION__); - return -ENOMEM; + printk(KERN_ERR "ip_queue: OOM in enqueue\n"); + return -ENOMEM; } - e->state = IPQ_PS_NEW; e->verdict = NF_DROP; e->info = info; e->skb = skb; spin_lock_bh(&q->lock); if (q->len >= *q->maxlen) { spin_unlock_bh(&q->lock); - printk(KERN_WARNING "%s: queue full at %d entries, " - "dropping packet.\n", IPQ_NAME, q->len); - kfree(e); - nf_reinject(skb, info, NF_DROP); - return 0; + if (net_ratelimit()) + printk(KERN_WARNING "ip_queue: full at %d entries, " + "dropping packet(s).\n", q->len); + goto free_drop; + } + if (q->flushing || q->peer.copy_mode == IPQ_COPY_NONE + || q->peer.pid == 0 || q->peer.died || q->terminate) { + spin_unlock_bh(&q->lock); + goto free_drop; + } + status = q->peer.send(e); + if (status > 0) { + list_add(&e->list, &q->list); + q->len++; + spin_unlock_bh(&q->lock); + return status; } - list_add(&e->list, &q->list); - q->len++; spin_unlock_bh(&q->lock); - wake_up_interruptible(&q->thread.wq); - return 0; + if (status == -ECONNREFUSED) { + printk(KERN_INFO "ip_queue: peer %d died, " + "resetting state and flushing queue\n", q->peer.pid); + q->peer.died = 1; + q->peer.pid = 0; + q->peer.copy_mode = IPQ_COPY_NONE; + q->peer.copy_range = 0; + ipq_flush(q); + } +free_drop: + kfree(e); + return -EBUSY; } -/* FIXME: need to find a way to notify user during module unload */ -static void -ipq_queue_destroy(ipq_queue_t *q) +static void ipq_destroy_queue(ipq_queue_t *q) { - ipq_thread_stop(q); nf_unregister_queue_handler(PF_INET); + spin_lock_bh(&q->lock); + q->terminate = 1; + spin_unlock_bh(&q->lock); + ipq_flush(q); kfree(q); } -static int -ipq_queue_mangle_ipv4(unsigned char *buf, - ipq_verdict_msg_t *v, - ipq_queue_element_t *e) +static int ipq_mangle_ipv4(ipq_verdict_msg_t *v, ipq_queue_element_t *e) { - struct iphdr *user_iph = (struct iphdr *)buf; + struct iphdr *user_iph = (struct iphdr *)v->payload; if (v->data_len < sizeof(*user_iph)) return 0; - if (e->skb->nh.iph->check != user_iph->check) { int diff = v->data_len - e->skb->len; if (diff < 0) skb_trim(e->skb, v->data_len); else if (diff > 0) { - if (v->data_len > 0xFFFF) { - e->verdict = NF_DROP; + if (v->data_len > 0xFFFF) return -EINVAL; - } if (diff > skb_tailroom(e->skb)) { struct sk_buff *newskb; - /* Ack, we waste a memcpy() of data here */ newskb = skb_copy_expand(e->skb, skb_headroom(e->skb), diff, GFP_ATOMIC); if (newskb == NULL) { - printk(KERN_WARNING "%s: OOM in %s, " - "dropping packet\n", - IPQ_THR_NAME, __FUNCTION__); - e->verdict = NF_DROP; + printk(KERN_WARNING "ip_queue: OOM " + "in mangle, dropping packet\n"); return -ENOMEM; } kfree_skb(e->skb); @@ -344,101 +226,76 @@ ipq_queue_mangle_ipv4(unsigned char *buf, } skb_put(e->skb, diff); } - memcpy(e->skb->data, buf, v->data_len); + memcpy(e->skb->data, v->payload, v->data_len); e->skb->nfcache |= NFC_ALTERED; } return 0; } -static int -ipq_queue_set_verdict(ipq_queue_t *q, - ipq_verdict_msg_t *v, - unsigned char *buf, - unsigned int len) +static int ipq_set_verdict(ipq_queue_t *q, + ipq_verdict_msg_t *v, unsigned int len) { - struct list_head *i; + ipq_queue_element_t *e; if (v->value < 0 || v->value > NF_MAX_VERDICT) return -EINVAL; - spin_lock_bh(&q->lock); - for (i = q->list.next; i != &q->list; i = i->next) { - ipq_queue_element_t *e = (ipq_queue_element_t *)i; - - if (v->id == (unsigned long )e) { - int status = 0; - e->state = IPQ_PS_VERDICT; - e->verdict = v->value; - - if (buf && 
v->data_len == len) - status = ipq_queue_mangle_ipv4(buf, v, e); - spin_unlock_bh(&q->lock); - return status; - } + e = ipq_dequeue(q, v->id); + if (e == NULL) + return -ENOENT; + else { + e->verdict = v->value; + if (v->data_len && v->data_len == len) + if (ipq_mangle_ipv4(v, e) < 0) + e->verdict = NF_DROP; + nf_reinject(e->skb, e->info, e->verdict); + kfree(e); + return 0; } - spin_unlock_bh(&q->lock); - return -ENOENT; } -static int -ipq_receive_peer(ipq_queue_t *q, - ipq_peer_msg_t *m, - unsigned char type, - unsigned int len) +static int ipq_receive_peer(ipq_queue_t *q, ipq_peer_msg_t *m, + unsigned char type, unsigned int len) { - if (q->state == IPQ_QS_FLUSH) - return -EBUSY; + int status = 0; + + spin_lock_bh(&q->lock); + if (q->terminate || q->flushing) + return -EBUSY; + spin_unlock_bh(&q->lock); if (len < sizeof(ipq_peer_msg_t)) return -EINVAL; - switch (type) { case IPQM_MODE: switch (m->msg.mode.value) { - case IPQ_COPY_NONE: - q->peer.copy_mode = IPQ_COPY_NONE; - q->peer.copy_range = 0; - q->state = IPQ_QS_FLUSH; - break; case IPQ_COPY_META: - if (q->state == IPQ_QS_FLUSH) - return -EAGAIN; q->peer.copy_mode = IPQ_COPY_META; q->peer.copy_range = 0; - q->state = IPQ_QS_COPY; break; case IPQ_COPY_PACKET: - if (q->state == IPQ_QS_FLUSH) - return -EAGAIN; q->peer.copy_mode = IPQ_COPY_PACKET; q->peer.copy_range = m->msg.mode.range; - q->state = IPQ_QS_COPY; + if (q->peer.copy_range > 0xFFFF) + q->peer.copy_range = 0xFFFF; break; default: - return -EINVAL; + status = -EINVAL; } break; - case IPQM_VERDICT: { - int status; - unsigned char *data = NULL; - + case IPQM_VERDICT: if (m->msg.verdict.value > NF_MAX_VERDICT) - return -EINVAL; - if (m->msg.verdict.data_len) - data = (unsigned char *)m + sizeof(*m); - status = ipq_queue_set_verdict(q, &m->msg.verdict, - data, len - sizeof(*m)); - if (status < 0) - return status; + status = -EINVAL; + else + status = ipq_set_verdict(q, + &m->msg.verdict, + len - sizeof(*m)); break; - } default: - return -EINVAL; + status = -EINVAL; } - wake_up_interruptible(&q->thread.wq); - return 0; + return status; } - /**************************************************************************** * * Netfilter interface @@ -449,16 +306,10 @@ ipq_receive_peer(ipq_queue_t *q, * Packets arrive here from netfilter for queuing to userspace. * All of them must be fed back via nf_reinject() or Alexey will kill Rusty. 
*/ -static int -receive_netfilter(struct sk_buff *skb, - struct nf_info *info, - void *data) +static int netfilter_receive(struct sk_buff *skb, + struct nf_info *info, void *data) { - ipq_queue_t *q = (ipq_queue_t *)data; - - if (q->state == IPQ_QS_FLUSH) - return -EBUSY; - return ipq_enqueue(q, skb, info); + return ipq_enqueue((ipq_queue_t *)data, skb, info); } /**************************************************************************** @@ -467,36 +318,10 @@ receive_netfilter(struct sk_buff *skb, * ****************************************************************************/ -static struct sk_buff * -netlink_build_message(ipq_queue_element_t *e, - int *errp); - -extern __inline__ void -receive_user_skb(struct sk_buff *skb); - -static int -netlink_send_peer(ipq_queue_element_t *e); - static struct sock *nfnl = NULL; ipq_queue_t *nlq = NULL; -static int -netlink_send_peer(ipq_queue_element_t *e) -{ - int status = 0; - struct sk_buff *skb; - - if (!nlq->peer.pid) - return -EINVAL; - skb = netlink_build_message(e, &status); - if (skb == NULL) - return status; - return netlink_unicast(nfnl, skb, nlq->peer.pid, MSG_DONTWAIT); -} - -static struct sk_buff * -netlink_build_message(ipq_queue_element_t *e, - int *errp) +static struct sk_buff *netlink_build_message(ipq_queue_element_t *e, int *errp) { unsigned char *old_tail; size_t size = 0; @@ -519,6 +344,7 @@ netlink_build_message(ipq_queue_element_t *e, else data_len = copy_range; size = NLMSG_SPACE(sizeof(*pm) + data_len); + break; case IPQ_COPY_NONE: default: @@ -542,7 +368,7 @@ netlink_build_message(ipq_queue_element_t *e, if (e->info->outdev) strcpy(pm->outdev_name, e->info->outdev->name); else pm->outdev_name[0] = '\0'; if (data_len) - memcpy(++pm, e->skb->data, data_len); + memcpy(pm->payload, e->skb->data, data_len); nlh->nlmsg_len = skb->tail - old_tail; NETLINK_CB(skb).dst_groups = 0; return skb; @@ -550,16 +376,24 @@ nlmsg_failure: if (skb) kfree(skb); *errp = 0; - printk(KERN_ERR "%s: error creating netlink message\n", IPQ_NAME); + printk(KERN_ERR "ip_queue: error creating netlink message\n"); return NULL; } +static int netlink_send_peer(ipq_queue_element_t *e) +{ + int status = 0; + struct sk_buff *skb; + + skb = netlink_build_message(e, &status); + if (skb == NULL) + return status; + return netlink_unicast(nfnl, skb, nlq->peer.pid, MSG_DONTWAIT); +} + #define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0); -/* - * FIXME: ping old peer if we detect a new peer then resend. - */ -extern __inline__ void -receive_user_skb(struct sk_buff *skb) + +extern __inline__ void netlink_receive_user_skb(struct sk_buff *skb) { int status, type; struct nlmsghdr *nlh; @@ -581,9 +415,11 @@ receive_user_skb(struct sk_buff *skb) if(!cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); if (nlq->peer.pid && !nlq->peer.died - && (nlq->peer.pid != nlh->nlmsg_pid)) - printk(KERN_WARNING "%s: peer pid changed from %d to %d\n", - IPQ_NAME, nlq->peer.pid, nlh->nlmsg_pid); + && (nlq->peer.pid != nlh->nlmsg_pid)) { + printk(KERN_WARNING "ip_queue: peer pid changed from %d to " + "%d, flushing queue\n", nlq->peer.pid, nlh->nlmsg_pid); + ipq_flush(nlq); + } nlq->peer.pid = nlh->nlmsg_pid; nlq->peer.died = 0; status = ipq_receive_peer(nlq, NLMSG_DATA(nlh), @@ -596,9 +432,7 @@ receive_user_skb(struct sk_buff *skb) } /* Note: we are only dealing with single part messages at the moment. 
*/ -static void -receive_user_sk(struct sock *sk, - int len) +static void netlink_receive_user_sk(struct sock *sk, int len) { do { struct sk_buff *skb; @@ -606,28 +440,25 @@ receive_user_sk(struct sock *sk, if (rtnl_shlock_nowait()) return; while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) { - receive_user_skb(skb); + netlink_receive_user_skb(skb); kfree_skb(skb); } up(&rtnl_sem); } while (nfnl && nfnl->receive_queue.qlen); } - /**************************************************************************** * * System events * ****************************************************************************/ -static int -receive_event(struct notifier_block *this, - unsigned long event, - void *ptr) +static int receive_event(struct notifier_block *this, + unsigned long event, void *ptr) { if (event == NETDEV_UNREGISTER) if (nlq) - ipq_thread_stop(nlq); + ipq_destroy_queue(nlq); return NOTIFY_DONE; } @@ -637,7 +468,6 @@ struct notifier_block ipq_dev_notifier = { 0 }; - /**************************************************************************** * * Sysctl - queue tuning. @@ -670,33 +500,28 @@ static ctl_table ipq_root_table[] = { * ****************************************************************************/ -static int -ipq_get_info(char *buffer, char **start, off_t offset, int length) +static int ipq_get_info(char *buffer, char **start, off_t offset, int length) { int len; spin_lock_bh(&nlq->lock); len = sprintf(buffer, - "Thread pid : %d\n" - "Thread terminate : %d\n" - "Thread running : %d\n" - "Peer pid : %d\n" - "Peer died : %d\n" - "Peer copy mode : %d\n" - "Peer copy range : %d\n" - "Queue length : %d\n" - "Queue max. length : %d\n" - "Queue state : %d\n", - nlq->thread.pid, - nlq->thread.terminate, - nlq->thread.running, + "Peer pid : %d\n" + "Peer died : %d\n" + "Peer copy mode : %d\n" + "Peer copy range : %d\n" + "Queue length : %d\n" + "Queue max. 
length : %d\n" + "Queue flushing : %d\n" + "Queue terminate : %d\n", nlq->peer.pid, nlq->peer.died, nlq->peer.copy_mode, nlq->peer.copy_range, nlq->len, *nlq->maxlen, - nlq->state); + nlq->flushing, + nlq->terminate); spin_unlock_bh(&nlq->lock); *start = buffer + offset; len -= offset; @@ -716,18 +541,18 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length) static int __init init(void) { int status = 0; - - nfnl = netlink_kernel_create(NETLINK_FIREWALL, receive_user_sk); + + nfnl = netlink_kernel_create(NETLINK_FIREWALL, netlink_receive_user_sk); if (nfnl == NULL) { - printk(KERN_ERR "%s: initialisation failed: unable to " - "create kernel netlink socket\n", IPQ_NAME); + printk(KERN_ERR "ip_queue: initialisation failed: unable to " + "create kernel netlink socket\n"); return -ENOMEM; } - nlq = ipq_queue_create(receive_netfilter, + nlq = ipq_create_queue(netfilter_receive, netlink_send_peer, &status, &sysctl_maxlen); if (nlq == NULL) { - printk(KERN_ERR "%s: initialisation failed: unable to " - "initialise queue\n", IPQ_NAME); + printk(KERN_ERR "ip_queue: initialisation failed: unable to " + "create queue\n"); sock_release(nfnl->socket); return status; } @@ -742,7 +567,7 @@ static void __exit fini(void) unregister_sysctl_table(ipq_sysctl_header); proc_net_remove(IPQ_PROC_FS_NAME); unregister_netdevice_notifier(&ipq_dev_notifier); - ipq_queue_destroy(nlq); + ipq_destroy_queue(nlq); sock_release(nfnl->socket); } @@ -750,3 +575,4 @@ MODULE_DESCRIPTION("IPv4 packet queue handler"); module_init(init); module_exit(fini); + diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 8cc8c24ac..66f47c386 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -12,15 +12,13 @@ #include #include #include +#include #include #include +#include #include -#ifndef IP_OFFSET -#define IP_OFFSET 0x1FFF -#endif - /*#define DEBUG_IP_FIREWALL*/ /*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */ /*#define DEBUG_IP_FIREWALL_USER*/ @@ -288,9 +286,16 @@ ipt_do_table(struct sk_buff **pskb, + TABLE_OFFSET(table->private, smp_processor_id()); e = get_entry(table_base, table->private->hook_entry[hook]); - /* Check noone else using our table */ - IP_NF_ASSERT(((struct ipt_entry *)table_base)->comefrom == 0xdead57ac); #ifdef CONFIG_NETFILTER_DEBUG + /* Check noone else using our table */ + if (((struct ipt_entry *)table_base)->comefrom != 0xdead57ac + && ((struct ipt_entry *)table_base)->comefrom != 0xeeeeeeec) { + printk("ASSERT: CPU #%u, %s comefrom(%p) = %X\n", + smp_processor_id(), + table->name, + &((struct ipt_entry *)table_base)->comefrom, + ((struct ipt_entry *)table_base)->comefrom); + } ((struct ipt_entry *)table_base)->comefrom = 0x57acc001; #endif @@ -343,11 +348,28 @@ ipt_do_table(struct sk_buff **pskb, e = get_entry(table_base, v); } else { + /* Targets which reenter must return + abs. verdicts */ +#ifdef CONFIG_NETFILTER_DEBUG + ((struct ipt_entry *)table_base)->comefrom + = 0xeeeeeeec; +#endif verdict = t->u.target->target(pskb, hook, in, out, t->data, userdata); +#ifdef CONFIG_NETFILTER_DEBUG + if (((struct ipt_entry *)table_base)->comefrom + != 0xeeeeeeec + && verdict == IPT_CONTINUE) { + printk("Target %s reentered!\n", + t->u.target->name); + verdict = NF_DROP; + } + ((struct ipt_entry *)table_base)->comefrom + = 0x57acc001; +#endif /* Target might have changed stuff. 
*/ ip = (*pskb)->nh.iph; protohdr = (u_int32_t *)ip + ip->ihl; @@ -1631,6 +1653,43 @@ static struct ipt_match udp_matchstruct static struct ipt_match icmp_matchstruct = { { NULL, NULL }, "icmp", &icmp_match, &icmp_checkentry, NULL }; +#ifdef CONFIG_PROC_FS +static inline int print_name(const struct ipt_table *t, + off_t start_offset, char *buffer, int length, + off_t *pos, unsigned int *count) +{ + if ((*count)++ >= start_offset) { + unsigned int namelen; + + namelen = sprintf(buffer + *pos, "%s\n", t->name); + if (*pos + namelen > length) { + /* Stop iterating */ + return 1; + } + *pos += namelen; + } + return 0; +} + +static int ipt_get_tables(char *buffer, char **start, off_t offset, int length) +{ + off_t pos = 0; + unsigned int count = 0; + + if (down_interruptible(&ipt_mutex) != 0) + return 0; + + LIST_FIND(&ipt_tables, print_name, struct ipt_table *, + offset, buffer, length, &pos, &count); + + up(&ipt_mutex); + + /* `start' hack - see fs/proc/generic.c line ~105 */ + *start=(char *)((unsigned long)count-offset); + return pos; +} +#endif /*CONFIG_PROC_FS*/ + static int __init init(void) { int ret; @@ -1651,13 +1710,23 @@ static int __init init(void) return ret; } - printk("iptables: (c)2000 Netfilter core team\n"); +#ifdef CONFIG_PROC_FS + if (!proc_net_create("ip_tables_names", 0, ipt_get_tables)) { + nf_unregister_sockopt(&ipt_sockopts); + return -ENOMEM; + } +#endif + + printk("ip_tables: (c)2000 Netfilter core team\n"); return 0; } static void __exit fini(void) { nf_unregister_sockopt(&ipt_sockopts); +#ifdef CONFIG_PROC_FS + proc_net_remove("ip_tables_names"); +#endif } module_init(init); diff --git a/net/ipv4/netfilter/ipchains_core.c b/net/ipv4/netfilter/ipchains_core.c index 02bd7ad83..419b0382c 100644 --- a/net/ipv4/netfilter/ipchains_core.c +++ b/net/ipv4/netfilter/ipchains_core.c @@ -145,7 +145,9 @@ /*#define DEBUG_IP_FIREWALL_USER*/ /*#define DEBUG_IP_FIREWALL_LOCKING*/ +#if defined(CONFIG_NETLINK_DEV) || defined(CONFIG_NETLINK_DEV_MODULE) static struct sock *ipfwsk; +#endif #ifdef CONFIG_SMP #define SLOT_NUMBER() (cpu_number_map(smp_processor_id())*2 + !in_interrupt()) diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 6e69d6a90..4675a94b8 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -24,10 +24,6 @@ struct esphdr { __u32 spi; }; /* FIXME evil kludge */ -/* Make init and cleanup non-static, so gcc doesn't warn about unused, - but don't export the symbols */ -EXPORT_NO_SYMBOLS; - /* Use lock to serialize, so printks don't overlap */ static spinlock_t log_lock = SPIN_LOCK_UNLOCKED; @@ -353,15 +349,15 @@ static struct ipt_target ipt_log_reg static int __init init(void) { - if (ipt_register_target(&ipt_log_reg)) - return -EINVAL; + if (ipt_register_target(&ipt_log_reg)) + return -EINVAL; - return 0; + return 0; } static void __exit fini(void) { - ipt_unregister_target(&ipt_log_reg); + ipt_unregister_target(&ipt_log_reg); } module_init(init); diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c index 32906eefe..924e00e5c 100644 --- a/net/ipv4/netfilter/ipt_MARK.c +++ b/net/ipv4/netfilter/ipt_MARK.c @@ -7,8 +7,6 @@ #include #include -EXPORT_NO_SYMBOLS; - static unsigned int target(struct sk_buff **pskb, unsigned int hooknum, @@ -53,15 +51,15 @@ static struct ipt_target ipt_mark_reg static int __init init(void) { - if (ipt_register_target(&ipt_mark_reg)) - return -EINVAL; + if (ipt_register_target(&ipt_mark_reg)) + return -EINVAL; - return 0; + return 0; } static void __exit fini(void) { - 
ipt_unregister_target(&ipt_mark_reg); + ipt_unregister_target(&ipt_mark_reg); } module_init(init); diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 9f94f8f44..071e2c3cd 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -11,8 +11,6 @@ #include #include -EXPORT_NO_SYMBOLS; - #if 0 #define DEBUGP printk #else diff --git a/net/ipv4/netfilter/ipt_MIRROR.c b/net/ipv4/netfilter/ipt_MIRROR.c index 9dec181c1..dba913387 100644 --- a/net/ipv4/netfilter/ipt_MIRROR.c +++ b/net/ipv4/netfilter/ipt_MIRROR.c @@ -29,7 +29,6 @@ #include struct in_device; #include -EXPORT_NO_SYMBOLS; #if 0 #define DEBUGP printk @@ -49,7 +48,7 @@ static int route_mirror(struct sk_buff *skb) } /* check if the interface we are living by is the same as the one we arrived on */ - if (skb->rx_dev != rt->u.dst.dev) { + if (skb->rx_dev == rt->u.dst.dev) { /* Drop old route. */ dst_release(skb->dst); skb->dst = &rt->u.dst; diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c index 690d3a8a1..aa7ac5e5d 100644 --- a/net/ipv4/netfilter/ipt_REDIRECT.c +++ b/net/ipv4/netfilter/ipt_REDIRECT.c @@ -12,8 +12,6 @@ #include #include -EXPORT_NO_SYMBOLS; - #if 0 #define DEBUGP printk #else diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index b183e822c..7e82c908c 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -6,12 +6,11 @@ #include #include #include -#include +#include struct in_device; #include #include #include -EXPORT_NO_SYMBOLS; #if 0 #define DEBUGP printk @@ -28,6 +27,9 @@ static unsigned int reject(struct sk_buff **pskb, { const struct ipt_reject_info *reject = targinfo; + /* WARNING: This code has causes reentry within iptables. + This means that the iptables jump stack is now crap. We + must return an absolute verdict. 
--RR */ switch (reject->with) { case IPT_ICMP_NET_UNREACHABLE: icmp_send(*pskb, ICMP_DEST_UNREACH, ICMP_NET_UNREACH, 0); @@ -62,9 +64,6 @@ static unsigned int reject(struct sk_buff **pskb, } } break; - case IPT_TCP_RESET: - tcp_v4_send_reset(*pskb); - break; } return NF_DROP; @@ -115,12 +114,6 @@ static int check(const char *tablename, DEBUGP("REJECT: ECHOREPLY illegal for non-ping\n"); return 0; } - } else if (rejinfo->with == IPT_TCP_RESET) { - if (e->ip.proto != IPPROTO_TCP - || (e->ip.invflags & IPT_INV_PROTO)) { - DEBUGP("REJECT: TCP_RESET illegal for non-tcp\n"); - return 0; - } } return 1; diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c index fbfb4974f..f0c293868 100644 --- a/net/ipv4/netfilter/ipt_TOS.c +++ b/net/ipv4/netfilter/ipt_TOS.c @@ -7,8 +7,6 @@ #include #include -EXPORT_NO_SYMBOLS; - static unsigned int target(struct sk_buff **pskb, unsigned int hooknum, @@ -72,15 +70,15 @@ static struct ipt_target ipt_tos_reg static int __init init(void) { - if (ipt_register_target(&ipt_tos_reg)) - return -EINVAL; + if (ipt_register_target(&ipt_tos_reg)) + return -EINVAL; - return 0; + return 0; } static void __exit fini(void) { - ipt_unregister_target(&ipt_tos_reg); + ipt_unregister_target(&ipt_tos_reg); } module_init(init); diff --git a/net/ipv4/netfilter/ipt_limit.c b/net/ipv4/netfilter/ipt_limit.c index 3785ba371..5e2b86029 100644 --- a/net/ipv4/netfilter/ipt_limit.c +++ b/net/ipv4/netfilter/ipt_limit.c @@ -14,7 +14,6 @@ #include #include -EXPORT_NO_SYMBOLS; #define IP_PARTS_NATIVE(n) \ (unsigned int)((n)>>24)&0xFF, \ diff --git a/net/ipv4/netfilter/ipt_mac.c b/net/ipv4/netfilter/ipt_mac.c index 90dbec59d..7de798767 100644 --- a/net/ipv4/netfilter/ipt_mac.c +++ b/net/ipv4/netfilter/ipt_mac.c @@ -5,7 +5,6 @@ #include #include -EXPORT_NO_SYMBOLS; static int match(const struct sk_buff *skb, diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c index 0d828fd20..66c3d1186 100644 --- a/net/ipv4/netfilter/ipt_mark.c +++ b/net/ipv4/netfilter/ipt_mark.c @@ -5,8 +5,6 @@ #include #include -EXPORT_NO_SYMBOLS; - static int match(const struct sk_buff *skb, const struct net_device *in, diff --git a/net/ipv4/netfilter/ipt_multiport.c b/net/ipv4/netfilter/ipt_multiport.c index 08cc4a968..6170ce65e 100644 --- a/net/ipv4/netfilter/ipt_multiport.c +++ b/net/ipv4/netfilter/ipt_multiport.c @@ -14,8 +14,6 @@ #define duprintf(format, args...) #endif -EXPORT_NO_SYMBOLS; - /* Returns 1 if the port is matched by the test, 0 otherwise. */ static inline int ports_match(const u_int16_t *portlist, enum ipt_multiport_flags flags, diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c index 5438571d3..501916414 100644 --- a/net/ipv4/netfilter/ipt_owner.c +++ b/net/ipv4/netfilter/ipt_owner.c @@ -1,7 +1,7 @@ /* Kernel module to match various things tied to sockets associated with locally generated outgoing packets. 
- (C)2000 Marc Boucher + Copyright (C) 2000 Marc Boucher */ #include #include @@ -11,8 +11,6 @@ #include #include -EXPORT_NO_SYMBOLS; - static int match_pid(const struct sk_buff *skb, pid_t pid) { diff --git a/net/ipv4/netfilter/ipt_state.c b/net/ipv4/netfilter/ipt_state.c index 1baa54d62..b559e7f56 100644 --- a/net/ipv4/netfilter/ipt_state.c +++ b/net/ipv4/netfilter/ipt_state.c @@ -6,7 +6,6 @@ #include #include #include -EXPORT_NO_SYMBOLS; static int match(const struct sk_buff *skb, @@ -47,14 +46,17 @@ static struct ipt_match state_match static int __init init(void) { - __MOD_INC_USE_COUNT(ip_conntrack_module); + /* NULL if ip_conntrack not a module */ + if (ip_conntrack_module) + __MOD_INC_USE_COUNT(ip_conntrack_module); return ipt_register_match(&state_match); } static void __exit fini(void) { ipt_unregister_match(&state_match); - __MOD_DEC_USE_COUNT(ip_conntrack_module); + if (ip_conntrack_module) + __MOD_DEC_USE_COUNT(ip_conntrack_module); } module_init(init); diff --git a/net/ipv4/netfilter/ipt_tos.c b/net/ipv4/netfilter/ipt_tos.c index 6da72b2d8..b144704e4 100644 --- a/net/ipv4/netfilter/ipt_tos.c +++ b/net/ipv4/netfilter/ipt_tos.c @@ -5,8 +5,6 @@ #include #include -EXPORT_NO_SYMBOLS; - static int match(const struct sk_buff *skb, const struct net_device *in, diff --git a/net/ipv4/netfilter/ipt_unclean.c b/net/ipv4/netfilter/ipt_unclean.c index 056224a87..72fab2b18 100644 --- a/net/ipv4/netfilter/ipt_unclean.c +++ b/net/ipv4/netfilter/ipt_unclean.c @@ -9,8 +9,6 @@ #include -EXPORT_NO_SYMBOLS; - #define limpk(format, args...) \ do { \ if (net_ratelimit()) \ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 471eb9e70..098d91ba1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.165 2000/03/23 05:30:32 davem Exp $ + * Version: $Id: tcp.c,v 1.166 2000/03/25 01:55:11 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -444,12 +444,6 @@ static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait return sk->tp_pinfo.af_tcp.accept_queue ? (POLLIN | POLLRDNORM) : 0; } -/* - * Compute minimal free write space needed to queue new packets. - */ -#define tcp_min_write_space(__sk) \ - (atomic_read(&(__sk)->wmem_alloc) / 2) - /* * Wait for a TCP event. * @@ -520,7 +514,15 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) if (sock_wspace(sk) >= tcp_min_write_space(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ - sk->socket->flags |= SO_NOSPACE; + set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); + set_bit(SOCK_NOSPACE, &sk->socket->flags); + + /* Race breaker. If space is freed after + * wspace test but before the flags are set, + * IO signal will be lost. + */ + if (sock_wspace(sk) >= tcp_min_write_space(sk)) + mask |= POLLOUT | POLLWRNORM; } } @@ -534,18 +536,26 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) * Socket write_space callback. * This (or rather the sock_wake_async) should agree with poll. * - * WARNING. This callback is called from any context (process, - * bh or irq). Do not make anything more smart from it. + * WARNING. This callback is called, when socket is not locked. + * + * This wakeup is used by TCP only as dead-lock breaker, real + * wakeup occurs when incoming ack frees some space in buffer. */ void tcp_write_space(struct sock *sk) { + struct socket *sock; + read_lock(&sk->callback_lock); - if (!sk->dead) { - /* Why??!! Does it really not overshedule? 
--ANK */ - wake_up_interruptible(sk->sleep); + if ((sock = sk->socket) != NULL && atomic_read(&sk->wmem_alloc) == 0) { + if (test_bit(SOCK_NOSPACE, &sock->flags)) { + if (sk->sleep && waitqueue_active(sk->sleep)) { + clear_bit(SOCK_NOSPACE, &sock->flags); + wake_up_interruptible(sk->sleep); + } + } - if (sock_wspace(sk) >= tcp_min_write_space(sk)) - sock_wake_async(sk->socket, 2, POLL_OUT); + if (sock->fasync_list) + sock_wake_async(sock, 2, POLL_OUT); } read_unlock(&sk->callback_lock); } @@ -636,7 +646,6 @@ int tcp_listen_start(struct sock *sk) sk->write_space = tcp_listen_write_space; sk_dst_reset(sk); sk->prot->hash(sk); - sk->socket->flags |= SO_ACCEPTCON; return 0; } @@ -742,7 +751,7 @@ static int wait_for_tcp_connect(struct sock * sk, int flags, long *timeo_p) if(!*timeo_p) return -EAGAIN; if(signal_pending(tsk)) - return -ERESTARTSYS; + return sock_intr_errno(*timeo_p); __set_task_state(tsk, TASK_INTERRUPTIBLE); add_wait_queue(sk->sleep, &wait); @@ -772,9 +781,12 @@ static long wait_for_tcp_memory(struct sock * sk, long timeo) if (!tcp_memory_free(sk)) { DECLARE_WAITQUEUE(wait, current); - sk->socket->flags &= ~SO_NOSPACE; + clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); + add_wait_queue(sk->sleep, &wait); for (;;) { + set_bit(SOCK_NOSPACE, &sk->socket->flags); + set_current_state(TASK_INTERRUPTIBLE); if (signal_pending(current)) @@ -830,7 +842,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size) goto out_unlock; /* This should be in poll */ - sk->socket->flags &= ~SO_NOSPACE; /* clear SIGIO XXX */ + clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); mss_now = tcp_current_mss(sk); @@ -943,13 +955,15 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size) /* If we didn't get any memory, we need to sleep. */ if (skb == NULL) { - sk->socket->flags |= SO_NOSPACE; + set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); + set_bit(SOCK_NOSPACE, &sk->socket->flags); + if (!timeo) { err = -EAGAIN; goto do_interrupted; } if (signal_pending(current)) { - err = -ERESTARTSYS; + err = sock_intr_errno(timeo); goto do_interrupted; } __tcp_push_pending_frames(sk, tp, mss_now); @@ -1062,7 +1076,8 @@ static int tcp_recv_urg(struct sock * sk, long timeo, msg->msg_flags|=MSG_OOB; if(len>0) { - err = memcpy_toiovec(msg->msg_iov, &c, 1); + if (!(flags & MSG_PEEK)) + err = memcpy_toiovec(msg->msg_iov, &c, 1); len = 1; } else msg->msg_flags|=MSG_TRUNC; @@ -1188,14 +1203,14 @@ static long tcp_data_wait(struct sock *sk, long timeo) __set_current_state(TASK_INTERRUPTIBLE); - sk->socket->flags |= SO_WAITDATA; + set_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags); release_sock(sk); if (skb_queue_empty(&sk->receive_queue)) timeo = schedule_timeout(timeo); lock_sock(sk); - sk->socket->flags &= ~SO_WAITDATA; + clear_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags); remove_wait_queue(sk->sleep, &wait); __set_current_state(TASK_RUNNING); @@ -1287,9 +1302,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, if (signal_pending(current)) { if (copied) break; - copied = -ERESTARTSYS; - if (!timeo) - copied = -EAGAIN; + copied = timeo ? 
sock_intr_errno(timeo) : -EAGAIN; break; } @@ -1362,7 +1375,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, if (tp->ucopy.task == user_recv) { /* Install new reader */ - if (user_recv == NULL && !(flags&MSG_PEEK)) { + if (user_recv == NULL && !(flags&(MSG_TRUNC|MSG_PEEK))) { user_recv = current; tp->ucopy.task = user_recv; tp->ucopy.iov = msg->msg_iov; @@ -1370,7 +1383,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, tp->ucopy.len = len; - BUG_TRAP(tp->copied_seq == tp->rcv_nxt || (flags&MSG_PEEK)); + BUG_TRAP(tp->copied_seq == tp->rcv_nxt || (flags&(MSG_PEEK|MSG_TRUNC))); /* Ugly... If prequeue is not empty, we have to * process it before releasing socket, otherwise @@ -1458,12 +1471,15 @@ do_prequeue: } } - err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used); - if (err) { - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; + err = 0; + if (!(flags&MSG_TRUNC)) { + err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } } *seq += used; @@ -1961,7 +1977,7 @@ static int wait_for_connect(struct sock * sk, long timeo) err = -EINVAL; if (sk->state != TCP_LISTEN) break; - err = -ERESTARTSYS; + err = sock_intr_errno(timeo); if (signal_pending(current)) break; err = -EAGAIN; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 575ec3036..3ba12bc52 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.190 2000/03/21 19:34:23 davem Exp $ + * Version: $Id: tcp_input.c,v 1.191 2000/03/25 01:55:13 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -1181,6 +1181,9 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, if (ack != tp->snd_una || (flag == 0 && !th->fin)) dst_confirm(sk->dst_cache); + if (ack != tp->snd_una) + tp->sorry = 1; + /* Remember the highest ack received. */ tp->snd_una = ack; return 1; @@ -1614,7 +1617,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); tp->fin_seq = TCP_SKB_CB(skb)->end_seq; - tcp_send_ack(sk); + tp->ack.pending = 1; sk->shutdown |= RCV_SHUTDOWN; @@ -1644,6 +1647,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) break; case TCP_FIN_WAIT2: /* Received a FIN -- send ACK and enter TIME_WAIT. */ + tcp_send_ack(sk); tcp_time_wait(sk, TCP_TIME_WAIT, 0); break; default: @@ -1944,7 +1948,7 @@ queue_and_out: if (eaten) { kfree_skb(skb); - } else + } else if (!sk->dead) sk->data_ready(sk, 0); return; } @@ -2074,6 +2078,30 @@ drop: kfree_skb(skb); } +/* When incoming ACK allowed to free some skb from write_queue, + * we remember this in flag tp->sorry and wake up socket on the exit + * from tcp input handler. Probably, handler has already eat this space + * sending ACK and cloned frames from tcp_write_xmit(). 
+ */ +static __inline__ void tcp_new_space(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct socket *sock; + + tp->sorry = 0; + + if (sock_wspace(sk) >= tcp_min_write_space(sk) && + (sock = sk->socket) != NULL) { + clear_bit(SOCK_NOSPACE, &sock->flags); + + if (sk->sleep && waitqueue_active(sk->sleep)) + wake_up_interruptible(sk->sleep); + + if (sock->fasync_list) + sock_wake_async(sock, 2, POLL_OUT); + } +} + static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); @@ -2114,7 +2142,14 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) */ /* More than one full frame received or... */ - if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss) || + if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss +#ifdef TCP_MORE_COARSE_ACKS + /* Avoid to send immediate ACK from input path, if it + * does not advance window far enough. tcp_recvmsg() will do this. + */ + && (!sysctl_tcp_retrans_collapse || __tcp_select_window(sk) >= tp->rcv_wnd) +#endif + ) || /* We ACK each frame or... */ tcp_in_quickack_mode(tp) || /* We have out of order data or */ @@ -2480,6 +2515,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, TCP_SKB_CB(skb)->ack_seq, len); kfree_skb(skb); tcp_data_snd_check(sk); + if (tp->sorry) + tcp_new_space(sk); return 0; } else { /* Header too small */ TCP_INC_STATS_BH(TcpInErrs); @@ -2633,6 +2670,8 @@ step5: if(sk->state != TCP_CLOSE) { tcp_data_snd_check(sk); tcp_ack_snd_check(sk); + if (tp->sorry) + tcp_new_space(sk); } return 0; @@ -2739,6 +2778,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->saw_tstamp = 0; newtp->probes_out = 0; + newtp->num_sacks = 0; newtp->syn_seq = req->rcv_isn; newtp->fin_seq = req->rcv_isn; newtp->urg_data = 0; @@ -3112,6 +3152,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_sync_mss(sk, tp->pmtu_cookie); tcp_initialize_rcv_mss(sk); tcp_init_metrics(sk); + tcp_init_buffer_space(sk); if (sk->keepopen) tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); @@ -3516,6 +3557,8 @@ step6: if (sk->state != TCP_CLOSE) { tcp_data_snd_check(sk); tcp_ack_snd_check(sk); + if (tp->sorry) + tcp_new_space(sk); } if (!queued) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 456f12968..3c9f4e82b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.203 2000/03/22 17:55:03 davem Exp $ + * Version: $Id: tcp_ipv4.c,v 1.205 2000/03/26 09:16:08 davem Exp $ * * IPv4 specific functions * @@ -1039,7 +1039,6 @@ out: void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, struct sk_buff *skb) { - th->check = 0; th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr, csum_partial((char *)th, th->doff<<2, skb->csum)); } @@ -1057,7 +1056,7 @@ void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, * Exception: precedence violation. We do not implement it in any case. */ -void tcp_v4_send_reset(struct sk_buff *skb) +static void tcp_v4_send_reset(struct sk_buff *skb) { struct tcphdr *th = skb->h.th; struct tcphdr rth; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 887aaa519..600140764 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). 
* - * Version: $Id: tcp_output.c,v 1.122 2000/02/21 15:51:41 davem Exp $ + * Version: $Id: tcp_output.c,v 1.123 2000/03/25 01:52:05 davem Exp $ * * Authors: Ross Biro, * Fred N. van Kempen, @@ -126,7 +126,7 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) #define SYSCTL_FLAG_SACK 0x4 sysctl_flags = 0; - if(tcb->flags & TCPCB_FLAG_SYN) { + if (tcb->flags & TCPCB_FLAG_SYN) { tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; if(sysctl_tcp_timestamps) { tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; @@ -141,7 +141,7 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; } - } else if(tp->sack_ok && tp->num_sacks) { + } else if (tp->num_sacks) { /* A SACK is 2 pad bytes, a 2 byte header, plus * 2 32-bit sequence numbers for each SACK block. */ @@ -157,16 +157,19 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) th->dest = sk->dport; th->seq = htonl(TCP_SKB_CB(skb)->seq); th->ack_seq = htonl(tp->rcv_nxt); - th->doff = (tcp_header_size >> 2); - th->res1 = 0; - *(((__u8 *)th) + 13) = tcb->flags; - th->check = 0; - th->urg_ptr = ntohs(tcb->urg_ptr); - if(tcb->flags & TCPCB_FLAG_SYN) { + *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags); + if (tcb->flags & TCPCB_FLAG_SYN) { /* RFC1323: The window in SYN & SYN/ACK segments * is never scaled. */ th->window = htons(tp->rcv_wnd); + } else { + th->window = htons(tcp_select_window(sk)); + } + th->check = 0; + th->urg_ptr = ntohs(tcb->urg_ptr); + + if (tcb->flags & TCPCB_FLAG_SYN) { tcp_syn_build_options((__u32 *)(th + 1), tcp_advertise_mss(sk), (sysctl_flags & SYSCTL_FLAG_TSTAMPS), @@ -176,13 +179,12 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->when, tp->ts_recent); } else { - th->window = htons(tcp_select_window(sk)); tcp_build_and_update_options((__u32 *)(th + 1), tp, TCP_SKB_CB(skb)->when); } tp->af_specific->send_check(sk, th, skb->len, skb); - if (th->ack) + if (tcb->flags & TCPCB_FLAG_ACK) tcp_event_ack_sent(sk); if (skb->len != tcp_header_size) @@ -1097,10 +1099,26 @@ err_out: void tcp_send_delayed_ack(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + long ato = tp->ack.ato; unsigned long timeout; + if (ato > TCP_DELACK_MIN) { + int max_ato; + + /* If some rtt estimate is known, use it to bound delayed ack. + * Do not use tp->rto here, use results of rtt measurements + * directly. + */ + if (tp->srtt) + max_ato = (tp->srtt >> 3) + tp->mdev; + else + max_ato = TCP_DELACK_MAX; + + ato = min(ato, max_ato); + } + /* Stay within the limit we were given */ - timeout = jiffies + tp->ack.ato; + timeout = jiffies + ato; /* Use new timeout only if there wasn't a older one earlier. */ spin_lock_bh(&sk->timer_lock); @@ -1111,7 +1129,7 @@ void tcp_send_delayed_ack(struct sock *sk) /* If delack timer was blocked or is about to expire, * send ACK now. */ - if (tp->ack.blocked || time_before_eq(tp->delack_timer.expires, jiffies+(tp->ack.ato>>2))) { + if (tp->ack.blocked || time_before_eq(tp->delack_timer.expires, jiffies+(ato>>2))) { spin_unlock_bh(&sk->timer_lock); tcp_send_ack(sk); -- cgit v1.2.3
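
For context on the recurring "-ERESTARTSYS" -> "sock_intr_errno(timeo)" substitutions in the af_inet.c and tcp.c hunks above: sock_intr_errno() is a small static inline helper (living in include/net/sock.h in this kernel series) that picks the error to return when a socket sleep is interrupted by a signal, based on whether the sleep had a finite timeout. The sketch below is only a user-space approximation of its semantics, not part of the patch; the constant values are assumed to match the errno headers of this era.

#include <limits.h>
#include <stdio.h>

#define MAX_SCHEDULE_TIMEOUT	LONG_MAX   /* "block forever": no finite socket timeout set */
#define ERESTARTSYS		512        /* kernel-internal: restart syscall after signal */
#define EINTR			4          /* interruption surfaced to user space           */

/* Approximation of the helper this merge substitutes for bare -ERESTARTSYS. */
static int sock_intr_errno(long timeo)
{
	/* No finite timeout: hand back -ERESTARTSYS so the signal code may
	 * restart the syscall (subject to SA_RESTART).  Finite timeout:
	 * return -EINTR so the remaining budget is not silently reset by
	 * an automatic restart. */
	return timeo == MAX_SCHEDULE_TIMEOUT ? -ERESTARTSYS : -EINTR;
}

int main(void)
{
	printf("untimed wait: %d\n", sock_intr_errno(MAX_SCHEDULE_TIMEOUT)); /* -512 */
	printf("timed wait  : %d\n", sock_intr_errno(5 * 100));              /* -4   */
	return 0;
}

The practical effect of the substitution: a blocking connect(), sendmsg() or recvmsg() with no finite send/receive timeout keeps its old restart-after-signal behaviour, while a call made under a finite socket timeout now fails with EINTR instead of restarting with its timeout silently reset.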