diff options
Diffstat (limited to 'net')
66 files changed, 1149 insertions, 705 deletions
diff --git a/net/Config.in b/net/Config.in index ce5b6faa9..624885478 100644 --- a/net/Config.in +++ b/net/Config.in @@ -13,9 +13,9 @@ if [ "$CONFIG_NETLINK" = "y" ]; then tristate ' Netlink device emulation' CONFIG_NETLINK_DEV fi bool 'Network packet filtering (replaces ipchains)' CONFIG_NETFILTER -#if [ "$CONFIG_NETFILTER" = "y" ]; then -# bool ' Network packet filtering debugging' CONFIG_NETFILTER_DEBUG -#fi +if [ "$CONFIG_NETFILTER" = "y" ]; then + bool ' Network packet filtering debugging' CONFIG_NETFILTER_DEBUG +fi bool 'Socket Filtering' CONFIG_FILTER tristate 'Unix domain sockets' CONFIG_UNIX bool 'TCP/IP networking' CONFIG_INET diff --git a/net/Makefile b/net/Makefile index 44b34d799..afdfbb712 100644 --- a/net/Makefile +++ b/net/Makefile @@ -10,7 +10,7 @@ MOD_SUB_DIRS := ipv4 ALL_SUB_DIRS := 802 ax25 bridge core ethernet ipv4 ipv6 ipx unix appletalk \ netrom rose lapb x25 wanrouter netlink sched packet sunrpc \ - econet irda decnet atm khttpd + econet irda decnet atm khttpd ipv4/netfilter SUB_DIRS := core ethernet sched MOD_LIST_NAME := NET_MISC_MODULES diff --git a/net/bridge/br.c b/net/bridge/br.c index 89ee1e0d5..0195f3631 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -5,7 +5,7 @@ * Authors: * Lennert Buytenhek <buytenh@gnu.org> * - * $Id: br.c,v 1.40 2000/03/21 21:08:47 davem Exp $ + * $Id: br.c,v 1.41 2000/03/24 01:33:36 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index fc549d76a..2ca176f95 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -5,7 +5,7 @@ * Authors: * Lennert Buytenhek <buytenh@gnu.org> * - * $Id: br_input.c,v 1.4 2000/03/21 21:08:47 davem Exp $ + * $Id: br_input.c,v 1.5 2000/03/30 01:22:23 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -94,6 +94,8 @@ static void 
__br_handle_frame(struct sk_buff *skb) br_flood(br, skb, 1); if (!passedup) br_pass_frame_up(br, skb); + else + kfree_skb(skb); return; } @@ -102,6 +104,8 @@ static void __br_handle_frame(struct sk_buff *skb) if (dst != NULL && dst->is_local) { if (!passedup) br_pass_frame_up(br, skb); + else + kfree_skb(skb); br_fdb_put(dst); return; } diff --git a/net/core/datagram.c b/net/core/datagram.c index bda174519..7f85645f0 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -87,9 +87,8 @@ static int wait_for_packet(struct sock * sk, int *err, long *timeo_p) goto out; /* handle signals */ - error = -ERESTARTSYS; if (signal_pending(current)) - goto out; + goto interrupted; *timeo_p = schedule_timeout(*timeo_p); @@ -98,6 +97,8 @@ ready: remove_wait_queue(sk->sleep, &wait); return 0; +interrupted: + error = sock_intr_errno(*timeo_p); out: current->state = TASK_RUNNING; remove_wait_queue(sk->sleep, &wait); @@ -248,7 +249,7 @@ unsigned int datagram_poll(struct file * file, struct socket *sock, poll_table * if (sock_writeable(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - sk->socket->flags |= SO_NOSPACE; + set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); return mask; } diff --git a/net/core/dev.c b/net/core/dev.c index f14753618..81a35e7a0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -176,6 +176,15 @@ int netdev_nit=0; * change it and subsequent readers will get broken packet. * --ANK (980803) */ + +/** + * dev_add_pack - add packet handler + * @pt: packet type declaration + * + * Add a protocol handler to the networking stack. The passed packet_type + * is linked into kernel lists and may not be freed until it has been + * removed from the kernel lists. + */ void dev_add_pack(struct packet_type *pt) { @@ -203,8 +212,14 @@ void dev_add_pack(struct packet_type *pt) } -/* - * Remove a protocol ID from the list. 
+/** + * dev_remove_pack - remove packet handler + * @pt: packet type declaration + * + * Remove a protocol handler that was previously added to the kernel + * protocol handlers by dev_add_pack. The passed packet_type is removed + * from the kernel lists and can be freed or reused once this function + * returns. */ void dev_remove_pack(struct packet_type *pt) @@ -241,9 +256,15 @@ void dev_remove_pack(struct packet_type *pt) ******************************************************************************************/ -/* - * Find an interface by name. May be called under rtnl semaphore - * or dev_base_lock. +/** + * __dev_get_by_name - find a device by its name + * @name: name to find + * + * Find an interface by name. Must be called under rtnl semaphore + * or dev_base_lock. If the name is found a pointer to the device + * is returned. If the name is not found then NULL is returned. The + * reference counters are not incremented so the caller must be + * careful with locks. */ @@ -258,8 +279,15 @@ struct net_device *__dev_get_by_name(const char *name) return NULL; } -/* - * Find an interface by name. Any context, dev_put() to release. +/** + * dev_get_by_name - find a device by its name + * @name: name to find + * + * Find an interface by name. This can be called from any + * context and does its own locking. The returned handle has + * the usage count incremented and the caller must use dev_put() to + * release it when it is no longer needed. NULL is returned if no + * matching device is found. */ struct net_device *dev_get_by_name(const char *name) @@ -282,6 +310,18 @@ struct net_device *dev_get_by_name(const char *name) is meaningless, if it was not issued under rtnl semaphore. */ +/** + * dev_get - test if a device exists + * @name: name to test for + * + * Test if a name exists. Returns true if the name is found. In order + * to be sure the name is not allocated or removed during the test the + * caller must hold the rtnl semaphore. 
+ * + * This function primarily exists for backward compatibility with older + * drivers. + */ + int dev_get(const char *name) { struct net_device *dev; @@ -292,8 +332,14 @@ int dev_get(const char *name) return dev != NULL; } -/* - * Find an interface by index. May be called under rtnl semaphore +/** + * __dev_get_by_index - find a device by its ifindex + * @ifindex: index of device + * + * Search for an interface by index. Returns NULL if the device + * is not found or a pointer to the device. The device has not + * had its reference counter increased so the caller must be careful + * about locking. The caller must hold either the rtnl semaphore * or dev_base_lock. */ @@ -308,8 +354,15 @@ struct net_device * __dev_get_by_index(int ifindex) return NULL; } -/* - * Find an interface by index. Any context, dev_put() to release. + +/** + * dev_get_by_index - find a device by its ifindex + * @ifindex: index of device + * + * Search for an interface by index. Returns NULL if the device + * is not found or a pointer to the device. The device returned has + * had a reference added and the pointer is safe until the user calls + * dev_put to indicate they have finished with it. */ struct net_device * dev_get_by_index(int ifindex) @@ -324,8 +377,18 @@ struct net_device * dev_get_by_index(int ifindex) return dev; } -/* - * Find an interface by ll addr. May be called only under rtnl semaphore. +/** + * dev_getbyhwaddr - find a device by its hardware address + * @type: media type of device + * @ha: hardware address + * + * Search for an interface by MAC address. Returns NULL if the device + * is not found or a pointer to the device. The caller must hold the + * rtnl semaphore.
The returned device has not had its ref count increased + * and the caller must therefore be careful about locking. + * + * BUGS: + * If the API was consistent this would be __dev_get_by_hwaddr */ struct net_device *dev_getbyhwaddr(unsigned short type, char *ha) @@ -342,9 +405,16 @@ struct net_device *dev_getbyhwaddr(unsigned short type, char *ha) return NULL; } -/* +/** + * dev_alloc_name - allocate a name for a device + * @dev: device + * @name: name format string + * * Passed a format string - eg "lt%d" it will try and find a suitable - * id. Not efficient for many devices, not called a lot.. + * id. Not efficient for many devices, not called a lot. The caller + * must hold the dev_base or rtnl lock while allocating the name and + * adding the device in order to avoid duplicates. Returns the number + * of the unit assigned or a negative errno code. */ int dev_alloc_name(struct net_device *dev, const char *name) @@ -365,6 +435,22 @@ int dev_alloc_name(struct net_device *dev, const char *name) return -ENFILE; /* Over 100 of the things .. bail out! */ } +/** + * dev_alloc - allocate a network device and name + * @name: name format string + * @err: error return pointer + * + * Passed a format string - eg "lt%d" it will allocate a network device + * and space for the name. NULL is returned if no memory is available. + * If the allocation succeeds then the name is assigned and the + * device pointer returned. NULL is returned if the name allocation failed. + * The cause of an error is returned as a negative errno code in the + * variable err points to. + * + * The caller must hold the dev_base or rtnl locks when doing this in order + * to avoid duplicate name allocations.
+ */ + struct net_device *dev_alloc(const char *name, int *err) { struct net_device *dev=kmalloc(sizeof(struct net_device)+16, GFP_KERNEL); @@ -382,6 +468,15 @@ struct net_device *dev_alloc(const char *name, int *err) return dev; } +/** + * netdev_state_change - device changes state + * @dev: device to cause notification + * + * Called to indicate a device has changed state. This function calls + * the notifier chains for netdev_chain and sends a NEWLINK message + * to the routing socket. + */ + void netdev_state_change(struct net_device *dev) { if (dev->flags&IFF_UP) { @@ -391,12 +486,17 @@ void netdev_state_change(struct net_device *dev) } -/* - * Find and possibly load an interface. - */ - #ifdef CONFIG_KMOD +/** + * dev_load - load a network module + * @name: name of interface + * + * If a network interface is not present and the process has suitable + * privileges this function loads the module. If module loading is not + * available in this kernel then it becomes a nop. + */ + void dev_load(const char *name) { if (!__dev_get_by_name(name) && capable(CAP_SYS_MODULE)) @@ -416,8 +516,17 @@ static int default_rebuild_header(struct sk_buff *skb) return 1; } -/* - * Prepare an interface for use. +/** + * dev_open - prepare an interface for use. + * @dev: device to open + * + * Takes a device from down to up state. The devices private open + * function is invoked and then the multicast lists are loaded. Finally + * the device is moved into the up state and a NETDEV_UP message is + * sent to the netdev notifier chain. + * + * Calling this function on an active interface is a nop. On a failure + * a negative errno code is returned. */ int dev_open(struct net_device *dev) @@ -508,8 +617,14 @@ void dev_clear_fastroute(struct net_device *dev) } #endif -/* - * Completely shutdown an interface. +/** + * dev_close - shutdown an interface. + * @dev: device to shutdown + * + * This function moves an active device into down state. 
A + * NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device + * is then deactivated and finally a NETDEV_DOWN is sent to the notifier + * chain. */ int dev_close(struct net_device *dev) @@ -560,12 +675,31 @@ int dev_close(struct net_device *dev) * Device change register/unregister. These are not inline or static * as we export them to the world. */ + +/** + * register_netdevice_notifier - register a network notifier block + * @nb: notifier + * + * Register a notifier to be called when network device events occur. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. + */ int register_netdevice_notifier(struct notifier_block *nb) { return notifier_chain_register(&netdev_chain, nb); } +/** + * unregister_netdevice_notifier - unregister a network notifier block + * @nb: notifier + * + * Unregister a notifier previously registered by register_netdevice_notifier. + * The notifier is unlinked from the kernel structures and may + * then be reused. A negative errno code is returned on a failure. + */ + int unregister_netdevice_notifier(struct notifier_block *nb) { return notifier_chain_unregister(&netdev_chain,nb); @@ -637,6 +771,19 @@ void dev_loopback_xmit(struct sk_buff *skb) netif_rx(newskb); } +/** + * dev_queue_xmit - transmit a buffer + * @skb: buffer to transmit + * + * Queue a buffer for transmission to a network device. The caller must + * have set the device and priority and built the buffer before calling this + * function. The function can be called from an interrupt. + * + * A negative errno code is returned on a failure. A success does not + * guarantee the frame will be transmitted as it may be dropped due + * to congestion or traffic shaping.
+ */ + int dev_queue_xmit(struct sk_buff *skb) { struct net_device *dev = skb->dev; @@ -770,9 +917,14 @@ static void netdev_wakeup(void) } #endif -/* - * Receive a packet from a device driver and queue it for the upper - * (protocol) levels. It always succeeds. +/** + * netif_rx - post buffer to the network code + * @skb: buffer to post + * + * This function receives a packet from a device driver and queues it for + * the upper (protocol) levels to process. It always succeeds. The buffer + * may be dropped during processing for congestion control or by the + * protocol layers. */ void netif_rx(struct sk_buff *skb) @@ -922,6 +1074,14 @@ static void net_tx_action(struct softirq_action *h) } } +/** + * net_call_rx_atomic + * @fn: function to call + * + * Make a function call that is atomic with respect to the protocol + * layers + */ + void net_call_rx_atomic(void (*fn)(void)) { br_write_lock_bh(BR_NETPROTO_LOCK); @@ -1063,10 +1223,18 @@ softnet_break: return; } -/* Protocol dependent address dumping routines */ - static gifconf_func_t * gifconf_list [NPROTO]; +/** + * register_gifconf - register a SIOCGIF handler + * @family: Address family + * @gifconf: Function handler + * + * Register protocol dependent address dumping routines. The handler + * that is passed must not be freed or reused until it has been replaced + * by another handler. + */ + int register_gifconf(unsigned int family, gifconf_func_t * gifconf) { if (family>=NPROTO) @@ -1381,6 +1549,18 @@ static int dev_get_wireless_info(char * buffer, char **start, off_t offset, #endif /* CONFIG_PROC_FS */ #endif /* WIRELESS_EXT */ +/** + * netdev_set_master - set up master/slave pair + * @slave: slave device + * @master: new master device + * + * Changes the master device of the slave. Pass NULL to break the + * bonding. The caller must hold the RTNL semaphore. On a failure + * a negative errno code is returned. 
On success the reference counts + * are adjusted, RTM_NEWLINK is sent to the routing socket and the + * function returns zero. + */ + int netdev_set_master(struct net_device *slave, struct net_device *master) { struct net_device *old = slave->master; @@ -1409,6 +1589,17 @@ int netdev_set_master(struct net_device *slave, struct net_device *master) return 0; } +/** + * dev_set_promiscuity - update promiscuity count on a device + * @dev: device + * @inc: modifier + * + * Add or remove promiscuity from a device. While the count in the device + * remains above zero the interface remains promiscuous. Once it hits zero + * the device reverts back to normal filtering operation. A negative inc + * value is used to drop promiscuity on the device. + */ + void dev_set_promiscuity(struct net_device *dev, int inc) { unsigned short old_flags = dev->flags; @@ -1430,6 +1621,18 @@ void dev_set_promiscuity(struct net_device *dev, int inc) } } +/** + * dev_set_allmulti - update allmulti count on a device + * @dev: device + * @inc: modifier + * + * Add or remove reception of all multicast frames to a device. While the + * count in the device remains above zero the interface remains listening + * to all multicast frames. Once it hits zero the device reverts back to normal + * filtering operation. A negative inc value is used to drop the counter + * when releasing a resource needing all multicasts. + */ + void dev_set_allmulti(struct net_device *dev, int inc) { unsigned short old_flags = dev->flags; @@ -1673,12 +1876,22 @@ static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) return -EINVAL; } - /* * This function handles all "interface"-type I/O control requests. The actual * 'doing' part of this is dev_ifsioc above. */ +/** + * dev_ioctl - network device ioctl + * @cmd: command to issue + * @arg: pointer to a struct ifreq in user space + * + * Issue ioctl functions to devices.
This is normally called by the + * user space syscall interfaces but can sometimes be useful for + * other purposes. The return value is the return from the syscall if + * positive or a negative errno code on error. + */ + int dev_ioctl(unsigned int cmd, void *arg) { struct ifreq ifr; @@ -1811,6 +2024,15 @@ int dev_ioctl(unsigned int cmd, void *arg) } } + +/** + * dev_new_index - allocate an ifindex + * + * Returns a suitable unique value for a new device interface number. + * The caller must hold the rtnl semaphore to be sure it remains + * unique. + */ + int dev_new_index(void) { static int ifindex; @@ -1824,6 +2046,19 @@ int dev_new_index(void) static int dev_boot_phase = 1; +/** + * register_netdevice - register a network device + * @dev: device to register + * + * Take a completed network device structure and add it to the kernel + * interfaces. A NETDEV_REGISTER message is sent to the netdev notifier + * chain. 0 is returned on success. A negative errno code is returned + * on a failure to set up the device, or if the name is a duplicate. + * + * BUGS: + * The locking appears insufficient to guarantee two parallel registers + * will not get the same name. + */ int register_netdevice(struct net_device *dev) { @@ -1917,6 +2152,14 @@ int register_netdevice(struct net_device *dev) return 0; } +/** + * netdev_finish_unregister - complete unregistration + * @dev: device + * + * Destroy and free a dead device. A value of zero is returned on + * success. 
+ */ + int netdev_finish_unregister(struct net_device *dev) { BUG_TRAP(dev->ip_ptr==NULL); @@ -1924,7 +2167,7 @@ int netdev_finish_unregister(struct net_device *dev) BUG_TRAP(dev->dn_ptr==NULL); if (!dev->deadbeaf) { - printk("Freeing alive device %p, %s\n", dev, dev->name); + printk(KERN_ERR "Freeing alive device %p, %s\n", dev, dev->name); return 0; } #ifdef NET_REFCNT_DEBUG @@ -1937,6 +2180,15 @@ int netdev_finish_unregister(struct net_device *dev) return 0; } +/** + * unregister_netdevice - remove device from the kernel + * @dev: device + * + * This function shuts down a device interface and removes it + * from the kernel tables. On success 0 is returned, on a failure + * a negative errno code is returned. + */ + int unregister_netdevice(struct net_device *dev) { unsigned long now; diff --git a/net/core/filter.c b/net/core/filter.c index 8749e8c7b..9d16a69fe 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -54,7 +54,12 @@ static u8 *load_pointer(struct sk_buff *skb, int k) return NULL; } -/* +/** + * sk_run_filter - run a filter on a socket + * @skb: buffer to run the filter on + * @filter: filter to apply + * @flen: length of filter + * * Decode and apply filter instructions to the skb->data. * Return length to keep, 0 for none. skb is the data we are * filtering, filter is the array of filter instructions, and @@ -341,9 +346,17 @@ load_b: return (0); } -/* +/** + * sk_chk_filter - verify socket filter code + * @filter: filter to verify + * @flen: length of filter + * * Check the user's filter code. If we let some ugly - * filter code slip through kaboom! + * filter code slip through kaboom! The filter must contain + * no references or jumps that are out of range, no illegal instructions + * and no backward jumps. It must end with a RET instruction + * + * Returns 0 if the rule set is legal or a negative errno code if not. 
*/ int sk_chk_filter(struct sock_filter *filter, int flen) @@ -413,9 +426,15 @@ int sk_chk_filter(struct sock_filter *filter, int flen) return (BPF_CLASS(filter[flen - 1].code) == BPF_RET)?0:-EINVAL; } -/* +/** + * sk_attach_filter - attach a socket filter + * @fprog: the filter program + * @sk: the socket to use + * * Attach the user's filter code. We first run some sanity checks on - * it to make sure it does not explode on us later. + * it to make sure it does not explode on us later. If an error + * occurs or there is insufficient memory for the filter a negative + * errno code is returned. On success the return is zero. */ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index dad1f3925..54230a273 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4,7 +4,7 @@ * Authors: Alan Cox <iiitac@pyr.swan.ac.uk> * Florian La Roche <rzsfl@rz.uni-sb.de> * - * Version: $Id: skbuff.c,v 1.70 2000/03/17 14:41:39 davem Exp $ + * Version: $Id: skbuff.c,v 1.71 2000/03/29 11:58:33 davem Exp $ * * Fixes: * Alan Cox : Fixed the worst of the load balancer bugs. @@ -77,6 +77,15 @@ static union { * reliable. */ +/** + * skb_over_panic - private function + * @skb: buffer + * @sz: size + * @here: address + * + * Out of line support code for skb_put. Not user callable + */ + void skb_over_panic(struct sk_buff *skb, int sz, void *here) { printk("skput:over: %p:%d put:%d dev:%s", @@ -84,6 +93,16 @@ void skb_over_panic(struct sk_buff *skb, int sz, void *here) *(int*)0 = 0; } +/** + * skb_under_panic - private function + * @skb: buffer + * @sz: size + * @here: address + * + * Out of line support code for skb_push. 
Not user callable + */ + + void skb_under_panic(struct sk_buff *skb, int sz, void *here) { printk("skput:under: %p:%d put:%d dev:%s", @@ -130,6 +149,19 @@ static __inline__ void skb_head_to_pool(struct sk_buff *skb) * */ +/** + * alloc_skb - allocate a network buffer + * @size: size to allocate + * @gfp_mask: allocation mask + * + * Allocate a new sk_buff. The returned buffer has no headroom and a + * tail room of size bytes. The object has a reference count of one. + * The return is the buffer. On a failure the return is NULL. + * + * Buffers may only be allocated from interrupts using a gfp_mask of + * GFP_ATOMIC. + */ + struct sk_buff *alloc_skb(unsigned int size,int gfp_mask) { struct sk_buff *skb; @@ -227,8 +259,13 @@ void kfree_skbmem(struct sk_buff *skb) skb_head_to_pool(skb); } -/* - * Free an sk_buff. Release anything attached to the buffer. Clean the state. +/** + * __kfree_skb - private function + * @skb: buffer + * + * Free an sk_buff. Release anything attached to the buffer. + * Clean the state. This is an internal helper function. Users should + * always call kfree_skb */ void __kfree_skb(struct sk_buff *skb) @@ -258,8 +295,18 @@ void __kfree_skb(struct sk_buff *skb) kfree_skbmem(skb); } -/* - * Duplicate an sk_buff. The new one is not owned by a socket. +/** + * skb_clone - duplicate an sk_buff + * @skb: buffer to clone + * @gfp_mask: allocation priority + * + * Duplicate an sk_buff. The new one is not owned by a socket. Both + * copies share the same packet data but not structure. The new + * buffer has a reference count of 1. If the allocation fails the + * function returns NULL otherwise the new buffer is returned. + * + * If this function is called from an interrupt gfp_mask must be + * GFP_ATOMIC. 
*/ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) @@ -331,8 +378,18 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif } -/* - * This is slower, and copies the whole data area +/** + * skb_copy - copy an sk_buff + * @skb: buffer to copy + * @gfp_mask: allocation priority + * + * Make a copy of both an sk_buff and its data. This is used when the + * caller wishes to modify the data and needs a private copy of the + * data to alter. Returns NULL on failure or the pointer to the buffer + * on success. The returned buffer has a reference count of 1. + * + * You must pass GFP_ATOMIC as the allocation priority if this function + * is called from an interrupt. */ struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask) @@ -359,6 +416,26 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask) return n; } +/** + * skb_copy_expand - copy and expand sk_buff + * @skb: buffer to copy + * @newheadroom: new free bytes at head + * @newtailroom: new free bytes at tail + * @gfp_mask: allocation priority + * + * Make a copy of both an sk_buff and its data and while doing so + * allocate additional space. + * + * This is used when the caller wishes to modify the data and needs a + * private copy of the data to alter as well as more space for new fields. + * Returns NULL on failure or the pointer to the buffer + * on success. The returned buffer has a reference count of 1. + * + * You must pass GFP_ATOMIC as the allocation priority if this function + * is called from an interrupt. + */ + + struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom, int newtailroom, diff --git a/net/core/sock.c b/net/core/sock.c index 21f15b5e7..ce25381c9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -7,7 +7,7 @@ * handler for protocols to use and generic option handler.
* * - * Version: $Id: sock.c,v 1.90 2000/02/27 19:48:11 davem Exp $ + * Version: $Id: sock.c,v 1.91 2000/03/25 01:55:03 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -731,11 +731,12 @@ static long sock_wait_for_wmem(struct sock * sk, long timeo) { DECLARE_WAITQUEUE(wait, current); - sk->socket->flags &= ~SO_NOSPACE; + clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); add_wait_queue(sk->sleep, &wait); for (;;) { if (signal_pending(current)) break; + set_bit(SOCK_NOSPACE, &sk->socket->flags); set_current_state(TASK_INTERRUPTIBLE); if (atomic_read(&sk->wmem_alloc) < sk->sndbuf) break; @@ -802,18 +803,20 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, * This means we have too many buffers for this socket already. */ - sk->socket->flags |= SO_NOSPACE; + set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); + set_bit(SOCK_NOSPACE, &sk->socket->flags); err = -EAGAIN; if (!timeo) goto failure; - err = -ERESTARTSYS; if (signal_pending(current)) - goto failure; + goto interrupted; timeo = sock_wait_for_wmem(sk, timeo); } return skb; +interrupted: + err = sock_intr_errno(timeo); failure: *errcode = err; return NULL; @@ -1079,7 +1082,7 @@ int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct * void sock_def_wakeup(struct sock *sk) { read_lock(&sk->callback_lock); - if(!sk->dead) + if (sk->sleep && waitqueue_active(sk->sleep)) wake_up_interruptible_all(sk->sleep); read_unlock(&sk->callback_lock); } @@ -1087,20 +1090,18 @@ void sock_def_wakeup(struct sock *sk) void sock_def_error_report(struct sock *sk) { read_lock(&sk->callback_lock); - if (!sk->dead) { + if (sk->sleep && waitqueue_active(sk->sleep)) wake_up_interruptible(sk->sleep); - sock_wake_async(sk->socket,0,POLL_ERR); - } + sk_wake_async(sk,0,POLL_ERR); read_unlock(&sk->callback_lock); } void sock_def_readable(struct sock *sk, int len) { read_lock(&sk->callback_lock); - if(!sk->dead) { + if (sk->sleep 
&& waitqueue_active(sk->sleep)) wake_up_interruptible(sk->sleep); - sock_wake_async(sk->socket,1,POLL_IN); - } + sk_wake_async(sk,1,POLL_IN); read_unlock(&sk->callback_lock); } @@ -1111,14 +1112,15 @@ void sock_def_write_space(struct sock *sk) /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ - if(!sk->dead && - ((atomic_read(&sk->wmem_alloc) << 1) <= sk->sndbuf)) { - wake_up_interruptible(sk->sleep); + if((atomic_read(&sk->wmem_alloc) << 1) <= sk->sndbuf) { + if (sk->sleep && waitqueue_active(sk->sleep)) + wake_up_interruptible(sk->sleep); /* Should agree with poll, otherwise some programs break */ if (sock_writeable(sk)) - sock_wake_async(sk->socket, 2, POLL_OUT); + sk_wake_async(sk, 2, POLL_OUT); } + read_unlock(&sk->callback_lock); } diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index a2453c06a..c560ea01e 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1670,14 +1670,14 @@ static int dn_recvmsg(struct socket *sock, struct msghdr *msg, int size, goto out; } - sock->flags |= SO_WAITDATA; + set_bit(SOCK_ASYNC_WAITDATA, &sock->flags); SOCK_SLEEP_PRE(sk) if (!dn_data_ready(sk, queue, flags, target)) schedule(); SOCK_SLEEP_POST(sk) - sock->flags &= ~SO_WAITDATA; + clear_bit(SOCK_ASYNC_WAITDATA, &sock->flags); } for(skb = queue->next; skb != (struct sk_buff *)queue; skb = nskb) { diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index 854ed0e92..00e62aa76 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -438,7 +438,8 @@ static __inline__ int dn_queue_skb(struct sock *sk, struct sk_buff *skb, int sig if (!sk->dead) { struct socket *sock = sk->socket; wake_up_interruptible(sk->sleep); - if (!(sock->flags & SO_WAITDATA) && sock->fasync_list) + if (sock && sock->fasync_list && + !test_bit(SOCK_ASYNC_WAITDATA, &sock->flags)) kill_fasync(sock->fasync_list, sig, (sig == SIGURG) ? 
POLL_PRI : POLL_IN); } diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index ebbf4163f..669aeccce 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -133,13 +133,13 @@ struct sk_buff *dn_alloc_send_skb(struct sock *sk, int *size, int noblock, int * } if (space < len) { - sk->socket->flags |= SO_NOSPACE; + set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); if (noblock) { *err = EWOULDBLOCK; break; } - sk->socket->flags &= ~SO_NOSPACE; + clear_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags); SOCK_SLEEP_PRE(sk) if ((sk->sndbuf - atomic_read(&sk->wmem_alloc)) < len) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b848151a9..d3fc0e38f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -5,7 +5,7 @@ * * PF_INET protocol family socket handler. * - * Version: $Id: af_inet.c,v 1.108 2000/02/21 16:25:59 davem Exp $ + * Version: $Id: af_inet.c,v 1.109 2000/03/25 01:55:10 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -607,7 +607,7 @@ int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr, if (!timeo || !inet_wait_for_connect(sk, timeo)) goto out; - err = -ERESTARTSYS; + err = sock_intr_errno(timeo); if (signal_pending(current)) goto out; } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 7561e190b..7c462ac08 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -3,7 +3,7 @@ * * Alan Cox, <alan@redhat.com> * - * Version: $Id: icmp.c,v 1.66 2000/03/17 14:41:50 davem Exp $ + * Version: $Id: icmp.c,v 1.67 2000/03/25 01:55:11 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -1128,6 +1128,7 @@ void __init icmp_init(struct net_proto_family *ops) if ((err=ops->create(icmp_socket, IPPROTO_ICMP))<0) panic("Failed to create the ICMP control socket.\n"); icmp_socket->sk->allocation=GFP_ATOMIC; + icmp_socket->sk->sndbuf = SK_WMEM_MAX*2; 
icmp_socket->sk->protinfo.af_inet.ttl = MAXTTL; /* Unhash it so that IP input processing does not even diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index f3013ca57..5792c5de7 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -5,7 +5,7 @@ * * The Internet Protocol (IP) output module. * - * Version: $Id: ip_output.c,v 1.82 2000/03/17 14:41:50 davem Exp $ + * Version: $Id: ip_output.c,v 1.83 2000/03/25 01:52:08 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -415,14 +415,13 @@ int ip_queue_xmit(struct sk_buff *skb) /* OK, we know where to send it, allocate and build IP header. */ iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); - iph->version = 4; - iph->ihl = 5; - iph->tos = sk->protinfo.af_inet.tos; + *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (sk->protinfo.af_inet.tos & 0xff)); + iph->tot_len = htons(skb->len); iph->frag_off = 0; iph->ttl = sk->protinfo.af_inet.ttl; - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; iph->protocol = sk->protocol; + iph->saddr = rt->rt_src; + iph->daddr = rt->rt_dst; skb->nh.iph = iph; /* Transport layer set skb->h.foo itself. 
*/ @@ -431,8 +430,6 @@ int ip_queue_xmit(struct sk_buff *skb) ip_options_build(skb, opt, sk->daddr, rt, 0); } - iph->tot_len = htons(skb->len); - return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, ip_queue_xmit2); diff --git a/net/ipv4/netfilter/.cvsignore b/net/ipv4/netfilter/.cvsignore new file mode 100644 index 000000000..857dd22e9 --- /dev/null +++ b/net/ipv4/netfilter/.cvsignore @@ -0,0 +1,2 @@ +.depend +.*.flags diff --git a/net/ipv4/netfilter/Config.in b/net/ipv4/netfilter/Config.in index bf2a28269..406d2ea3d 100644 --- a/net/ipv4/netfilter/Config.in +++ b/net/ipv4/netfilter/Config.in @@ -39,6 +39,7 @@ if [ "$CONFIG_IP_NF_IPTABLES" != "n" ]; then if [ "$CONFIG_IP_NF_CONNTRACK" != "n" ]; then dep_tristate ' Full NAT' CONFIG_IP_NF_NAT $CONFIG_IP_NF_IPTABLES if [ "$CONFIG_IP_NF_NAT" != "n" ]; then + define_bool CONFIG_IP_NF_NAT_NEEDED y dep_tristate ' MASQUERADE target support' CONFIG_IP_NF_TARGET_MASQUERADE $CONFIG_IP_NF_NAT dep_tristate ' REDIRECT target support' CONFIG_IP_NF_TARGET_REDIRECT $CONFIG_IP_NF_NAT fi @@ -56,8 +57,14 @@ fi if [ "$CONFIG_IP_NF_CONNTRACK" != "y" ]; then if [ "$CONFIG_IP_NF_IPTABLES" != "y" ]; then tristate 'ipchains (2.2-style) support' CONFIG_IP_NF_COMPAT_IPCHAINS + if [ "$CONFIG_IP_NF_COMPAT_IPCHAINS" != "n" ]; then + define_bool CONFIG_IP_NF_NAT_NEEDED y + fi if [ "$CONFIG_IP_NF_COMPAT_IPCHAINS" != "y" ]; then tristate 'ipfwadm (2.0-style) support' CONFIG_IP_NF_COMPAT_IPFWADM + if [ "$CONFIG_IP_NF_COMPAT_IPFWADM" != "n" ]; then + define_bool CONFIG_IP_NF_NAT_NEEDED y + fi fi fi fi diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index c507acc31..db276076a 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -15,10 +15,12 @@ IP_NF_CONNTRACK_OBJ:=ip_conntrack_core.o ip_conntrack_proto_generic.o ip_conntra IP_NF_NAT_OBJ:=ip_nat_core.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o +# All the parts of conntrack and NAT required for 
compatibility layer. +IP_NF_COMPAT_LAYER:=ip_fw_compat.o ip_fw_compat_redir.o ip_fw_compat_masq.o $(IP_NF_CONNTRACK_OBJ) $(IP_NF_NAT_OBJ) + # Link order matters here. ifeq ($(CONFIG_IP_NF_CONNTRACK),y) -OX_OBJS += ip_conntrack_standalone.o -O_OBJS += $(IP_NF_CONNTRACK_OBJ) +O_OBJS += ip_conntrack_standalone.o $(IP_NF_CONNTRACK_OBJ) else ifeq ($(CONFIG_IP_NF_CONNTRACK),m) MI_OBJS += $(IP_NF_CONNTRACK_OBJ) @@ -27,16 +29,8 @@ else endif endif -ifeq ($(CONFIG_IP_NF_QUEUE),y) -O_OBJS += ip_queue.o -else - ifeq ($(CONFIG_IP_NF_QUEUE),m) - M_OBJS += ip_queue.o - endif -endif - ifeq ($(CONFIG_IP_NF_FTP),y) -OX_OBJS += ip_conntrack_ftp.o +O_OBJS += ip_conntrack_ftp.o else ifeq ($(CONFIG_IP_NF_FTP),m) MX_OBJS += ip_conntrack_ftp.o @@ -47,7 +41,7 @@ ifeq ($(CONFIG_IP_NF_IPTABLES),y) O_OBJS += ip_tables.o else ifeq ($(CONFIG_IP_NF_IPTABLES),m) - M_OBJS += ip_tables.o + MX_OBJS += ip_tables.o endif endif @@ -115,17 +109,8 @@ else endif endif -ifeq ($(CONFIG_IP_NF_FILTER),y) -O_OBJS += iptable_filter.o -else - ifeq ($(CONFIG_IP_NF_FILTER),m) - M_OBJS += iptable_filter.o - endif -endif - ifeq ($(CONFIG_IP_NF_NAT),y) -OX_OBJS += ip_nat_standalone.o -O_OBJS += ip_nat_rule.o $(IP_NF_NAT_OBJ) +O_OBJS += ip_nat_standalone.o ip_nat_rule.o $(IP_NF_NAT_OBJ) ifeq ($(CONFIG_IP_NF_FTP),y) O_OBJS += ip_nat_ftp.o endif @@ -140,6 +125,14 @@ else endif endif +ifeq ($(CONFIG_IP_NF_FILTER),y) +O_OBJS += iptable_filter.o +else + ifeq ($(CONFIG_IP_NF_FILTER),m) + M_OBJS += iptable_filter.o + endif +endif + ifeq ($(CONFIG_IP_NF_MANGLE),y) O_OBJS += iptable_mangle.o else @@ -205,7 +198,7 @@ else endif ifeq ($(CONFIG_IP_NF_COMPAT_IPCHAINS),y) -O_OBJS += ipchains.o +O_OBJS += ipchains_core.o $(IP_NF_COMPAT_LAYER) else ifeq ($(CONFIG_IP_NF_COMPAT_IPCHAINS),m) M_OBJS += ipchains.o @@ -213,13 +206,21 @@ else endif ifeq ($(CONFIG_IP_NF_COMPAT_IPFWADM),y) -O_OBJS += ipfwadm.o +O_OBJS += ipfwadm_core.o $(IP_NF_COMPAT_LAYER) else ifeq ($(CONFIG_IP_NF_COMPAT_IPFWADM),m) M_OBJS += ipfwadm.o endif endif +ifeq 
($(CONFIG_IP_NF_QUEUE),y) +O_OBJS += ip_queue.o +else + ifeq ($(CONFIG_IP_NF_QUEUE),m) + M_OBJS += ip_queue.o + endif +endif + include $(TOPDIR)/Rules.make ip_conntrack.o: ip_conntrack_standalone.o $(IP_NF_CONNTRACK_OBJ) @@ -228,11 +229,8 @@ ip_conntrack.o: ip_conntrack_standalone.o $(IP_NF_CONNTRACK_OBJ) iptable_nat.o: ip_nat_standalone.o ip_nat_rule.o $(IP_NF_NAT_OBJ) $(LD) -r -o $@ ip_nat_standalone.o ip_nat_rule.o $(IP_NF_NAT_OBJ) -# All the parts of conntrack and NAT required for compatibility layer. -IP_NF_COMPAT_LAYER:=ip_fw_compat.o ip_fw_compat_redir.o ip_fw_compat_masq.o $(IP_NF_CONNTRACK_OBJ) $(IP_NF_NAT_OBJ) - ipfwadm.o: ipfwadm_core.o $(IP_NF_COMPAT_LAYER) $(LD) -r -o $@ ipfwadm_core.o $(IP_NF_COMPAT_LAYER) -ipchains.o: ipchains_core.o $(IP_NF_COMPAT_LAYER) +ipchains.o: ipchains_core.o $(IP_NF_COMPAT_LAYER) $(LD) -r -o $@ ipchains_core.o $(IP_NF_COMPAT_LAYER) diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 9007cdc89..197c2e3b4 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -22,6 +22,7 @@ #include <net/checksum.h> #include <linux/stddef.h> #include <linux/sysctl.h> +#include <linux/slab.h> /* This rwlock protects the main hash table, protocol/helper/expected registrations, conntrack timers*/ @@ -43,13 +44,14 @@ DECLARE_RWLOCK(ip_conntrack_lock); void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL; -static LIST_HEAD(expect_list); -static LIST_HEAD(protocol_list); +LIST_HEAD(expect_list); +LIST_HEAD(protocol_list); static LIST_HEAD(helpers); unsigned int ip_conntrack_htable_size = 0; static int ip_conntrack_max = 0; static atomic_t ip_conntrack_count = ATOMIC_INIT(0); struct list_head *ip_conntrack_hash; +static kmem_cache_t *ip_conntrack_cachep; extern struct ip_conntrack_protocol ip_conntrack_generic_protocol; @@ -167,7 +169,7 @@ destroy_conntrack(struct nf_conntrack *nfct) if (ip_conntrack_destroyed) ip_conntrack_destroyed(ct); - 
kfree(ct); + kmem_cache_free(ip_conntrack_cachep, ct); atomic_dec(&ip_conntrack_count); } @@ -355,7 +357,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, return 1; } - conntrack = kmalloc(sizeof(struct ip_conntrack), GFP_ATOMIC); + conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); if (!conntrack) { DEBUGP("Can't allocate conntrack.\n"); return 1; @@ -374,7 +376,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, conntrack->infos[i].master = &conntrack->ct_general; if (!protocol->new(conntrack, skb->nh.iph, skb->len)) { - kfree(conntrack); + kmem_cache_free(ip_conntrack_cachep, conntrack); return 1; } @@ -384,7 +386,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, if (__ip_conntrack_find(tuple, NULL)) { WRITE_UNLOCK(&ip_conntrack_lock); printk("ip_conntrack: Wow someone raced us!\n"); - kfree(conntrack); + kmem_cache_free(ip_conntrack_cachep, conntrack); return 0; } conntrack->helper = LIST_FIND(&helpers, helper_cmp, @@ -796,6 +798,7 @@ static struct nf_sockopt_ops so_getorigdst #define NET_IP_CONNTRACK_MAX 2089 #define NET_IP_CONNTRACK_MAX_NAME "ip_conntrack_max" +#ifdef CONFIG_SYSCTL static struct ctl_table_header *ip_conntrack_sysctl_header; static ctl_table ip_conntrack_table[] = { @@ -813,6 +816,7 @@ static ctl_table ip_conntrack_root_table[] = { {CTL_NET, "net", NULL, 0, 0555, ip_conntrack_dir_table, 0, 0, 0, 0, 0}, { 0 } }; +#endif /*CONFIG_SYSCTL*/ static int kill_all(const struct ip_conntrack *i, void *data) { @@ -823,8 +827,11 @@ static int kill_all(const struct ip_conntrack *i, void *data) supposed to kill the mall. 
*/ void ip_conntrack_cleanup(void) { +#ifdef CONFIG_SYSCTL unregister_sysctl_table(ip_conntrack_sysctl_header); +#endif ip_ct_selective_cleanup(kill_all, NULL); + kmem_cache_destroy(ip_conntrack_cachep); vfree(ip_conntrack_hash); nf_unregister_sockopt(&so_getorigdst); } @@ -855,6 +862,16 @@ int __init ip_conntrack_init(void) return -ENOMEM; } + ip_conntrack_cachep = kmem_cache_create("ip_conntrack", + sizeof(struct ip_conntrack), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!ip_conntrack_cachep) { + printk(KERN_ERR "Unable to create ip_conntrack slab cache\n"); + vfree(ip_conntrack_hash); + nf_unregister_sockopt(&so_getorigdst); + return -ENOMEM; + } + /* Don't NEED lock here, but good form anyway. */ WRITE_LOCK(&ip_conntrack_lock); /* Sew in builtin protocols. */ @@ -873,19 +890,12 @@ int __init ip_conntrack_init(void) ip_conntrack_sysctl_header = register_sysctl_table(ip_conntrack_root_table, 0); if (ip_conntrack_sysctl_header == NULL) { + kmem_cache_destroy(ip_conntrack_cachep); vfree(ip_conntrack_hash); nf_unregister_sockopt(&so_getorigdst); return -ENOMEM; } #endif /*CONFIG_SYSCTL*/ - ret = ip_conntrack_protocol_tcp_init(); - if (ret != 0) { - unregister_sysctl_table(ip_conntrack_sysctl_header); - vfree(ip_conntrack_hash); - nf_unregister_sockopt(&so_getorigdst); - } - return ret; } - diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index 23ccf74cf..1600156f7 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -10,6 +10,7 @@ #include <linux/netfilter_ipv4/ip_conntrack_ftp.h> DECLARE_LOCK(ip_ftp_lock); +struct module *ip_conntrack_ftp = THIS_MODULE; #define SERVER_STRING "227 Entering Passive Mode (" #define CLIENT_STRING "PORT " @@ -240,9 +241,5 @@ static void __exit fini(void) ip_conntrack_helper_unregister(&ftp); } -struct module *ip_conntrack_ftp = THIS_MODULE; -EXPORT_SYMBOL(ip_conntrack_ftp); -EXPORT_SYMBOL(ip_ftp_lock); - module_init(init); module_exit(fini); diff 
--git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 1d1256be5..cbbc1ab8c 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -2,6 +2,7 @@ #include <linux/sched.h> #include <linux/timer.h> #include <linux/netfilter.h> +#include <linux/in.h> #include <linux/icmp.h> #include <linux/netfilter_ipv4/ip_conntrack_protocol.h> diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 3dd448252..893248943 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -4,6 +4,7 @@ #include <linux/timer.h> #include <linux/netfilter.h> #include <linux/module.h> +#include <linux/in.h> #include <linux/ip.h> #include <linux/tcp.h> #include <linux/netfilter_ipv4/ip_conntrack.h> @@ -220,8 +221,3 @@ struct ip_conntrack_protocol ip_conntrack_protocol_tcp = { { NULL, NULL }, IPPROTO_TCP, "tcp", tcp_pkt_to_tuple, tcp_invert_tuple, tcp_print_tuple, tcp_print_conntrack, tcp_packet, tcp_new, NULL }; - -int __init ip_conntrack_protocol_tcp_init(void) -{ - return 0; -} diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index 688ae10fb..79ec82151 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -2,6 +2,7 @@ #include <linux/sched.h> #include <linux/timer.h> #include <linux/netfilter.h> +#include <linux/in.h> #include <linux/udp.h> #include <linux/netfilter_ipv4/ip_conntrack_protocol.h> diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index a69be542d..9030d9d41 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -276,6 +276,7 @@ static void __exit fini(void) module_init(init); module_exit(fini); +#ifdef MODULE EXPORT_SYMBOL(ip_conntrack_protocol_register); 
EXPORT_SYMBOL(invert_tuplepr); EXPORT_SYMBOL(ip_conntrack_alter_reply); @@ -284,11 +285,9 @@ EXPORT_SYMBOL(ip_conntrack_get); EXPORT_SYMBOL(ip_conntrack_module); EXPORT_SYMBOL(ip_conntrack_helper_register); EXPORT_SYMBOL(ip_conntrack_helper_unregister); -EXPORT_SYMBOL(ip_conntrack_lock); -EXPORT_SYMBOL(find_proto); -EXPORT_SYMBOL(get_tuple); EXPORT_SYMBOL(ip_ct_selective_cleanup); EXPORT_SYMBOL(ip_ct_refresh); EXPORT_SYMBOL(ip_conntrack_expect_related); EXPORT_SYMBOL(ip_conntrack_tuple_taken); EXPORT_SYMBOL(ip_ct_gather_frags); +#endif diff --git a/net/ipv4/netfilter/ip_fw_compat.c b/net/ipv4/netfilter/ip_fw_compat.c index 72dc3d816..2a08ee89c 100644 --- a/net/ipv4/netfilter/ip_fw_compat.c +++ b/net/ipv4/netfilter/ip_fw_compat.c @@ -14,8 +14,6 @@ struct notifier_block; #include <linux/netfilter_ipv4/compat_firewall.h> #include <linux/netfilter_ipv4/ip_conntrack.h> -EXPORT_NO_SYMBOLS; - static struct firewall_ops *fwops; /* From ip_fw_compat_redir.c */ diff --git a/net/ipv4/netfilter/ip_fw_compat_masq.c b/net/ipv4/netfilter/ip_fw_compat_masq.c index e0074c1e2..96bdc9d8d 100644 --- a/net/ipv4/netfilter/ip_fw_compat_masq.c +++ b/net/ipv4/netfilter/ip_fw_compat_masq.c @@ -5,6 +5,7 @@ DO IT. 
*/ #include <linux/skbuff.h> +#include <linux/in.h> #include <linux/ip.h> #include <linux/icmp.h> #include <linux/udp.h> diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c index 8252e6d9b..12d40f554 100644 --- a/net/ipv4/netfilter/ip_nat_ftp.c +++ b/net/ipv4/netfilter/ip_nat_ftp.c @@ -11,8 +11,6 @@ #include <linux/netfilter_ipv4/ip_conntrack_ftp.h> #include <linux/netfilter_ipv4/ip_conntrack_helper.h> -EXPORT_NO_SYMBOLS; - #if 0 #define DEBUGP printk #else @@ -374,8 +372,6 @@ static struct ip_nat_helper ftp static struct ip_nat_expect ftp_expect = { { NULL, NULL }, ftp_nat_expected }; -extern struct module *ip_conntrack_ftp; - static int __init init(void) { int ret; @@ -384,9 +380,7 @@ static int __init init(void) if (ret == 0) { ret = ip_nat_helper_register(&ftp); - if (ret == 0) - __MOD_INC_USE_COUNT(ip_conntrack_ftp); - else + if (ret != 0) ip_nat_expect_unregister(&ftp_expect); } return ret; @@ -394,7 +388,6 @@ static int __init init(void) static void __exit fini(void) { - __MOD_DEC_USE_COUNT(ip_conntrack_ftp); ip_nat_helper_unregister(&ftp); ip_nat_expect_unregister(&ftp_expect); } diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index 603111063..bfcc435c2 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -230,11 +230,13 @@ static int init_or_cleanup(int init) printk("ip_nat_init: can't register local out hook.\n"); goto cleanup_outops; } - __MOD_INC_USE_COUNT(ip_conntrack_module); + if (ip_conntrack_module) + __MOD_INC_USE_COUNT(ip_conntrack_module); return ret; cleanup: - __MOD_DEC_USE_COUNT(ip_conntrack_module); + if (ip_conntrack_module) + __MOD_DEC_USE_COUNT(ip_conntrack_module); nf_unregister_hook(&ip_nat_local_out_ops); cleanup_outops: nf_unregister_hook(&ip_nat_out_ops); @@ -262,9 +264,11 @@ static void __exit fini(void) module_init(init); module_exit(fini); +#ifdef MODULE EXPORT_SYMBOL(ip_nat_setup_info); 
EXPORT_SYMBOL(ip_nat_helper_register); EXPORT_SYMBOL(ip_nat_helper_unregister); EXPORT_SYMBOL(ip_nat_expect_register); EXPORT_SYMBOL(ip_nat_expect_unregister); EXPORT_SYMBOL(ip_nat_cheat_check); +#endif diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 532538321..80e43d977 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -2,7 +2,7 @@ * This is a module which is used for queueing IPv4 packets and * communicating with userspace via netlink. * - * (C) 2000 James Morris + * (C) 2000 James Morris, this code is GPL. */ #include <linux/module.h> #include <linux/skbuff.h> @@ -13,7 +13,6 @@ #include <linux/netfilter.h> #include <linux/netlink.h> #include <linux/spinlock.h> -#include <linux/smp_lock.h> #include <linux/rtnetlink.h> #include <linux/sysctl.h> #include <linux/proc_fs.h> @@ -21,20 +20,13 @@ #include <linux/netfilter_ipv4/ip_queue.h> -EXPORT_NO_SYMBOLS; - -#define IPQ_THR_NAME "kipq" -#define IPQ_NAME "ip_queue" #define IPQ_QMAX_DEFAULT 1024 - #define IPQ_PROC_FS_NAME "ip_queue" - #define NET_IPQ_QMAX 2088 #define NET_IPQ_QMAX_NAME "ip_queue_maxlen" typedef struct ipq_queue_element { struct list_head list; /* Links element into queue */ - unsigned char state; /* State of this element */ int verdict; /* Current verdict */ struct nf_info *info; /* Extra info from netfilter */ struct sk_buff *skb; /* Packet inside */ @@ -50,178 +42,70 @@ typedef struct ipq_peer { ipq_send_cb_t send; /* Callback for sending data to peer */ } ipq_peer_t; -typedef struct ipq_thread { - pid_t pid; /* PID of kernel thread */ - unsigned char terminate; /* Termination flag */ - unsigned char running; /* Running flag */ - wait_queue_head_t wq; /* I/O wait queue */ - void (*process)(void *data); /* Queue processing function */ -} ipq_thread_t; - typedef struct ipq_queue { int len; /* Current queue len */ int *maxlen; /* Maximum queue len, via sysctl */ - unsigned char state; /* Current queue state */ + unsigned char flushing; /* 
If queue is being flushed */ + unsigned char terminate; /* If the queue is being terminated */ struct list_head list; /* Head of packet queue */ spinlock_t lock; /* Queue spinlock */ ipq_peer_t peer; /* Userland peer */ - ipq_thread_t thread; /* Thread context */ } ipq_queue_t; /**************************************************************************** -* -* Kernel thread -* -****************************************************************************/ - -static void ipq_thread_init(char *thread_name) -{ - lock_kernel(); - exit_files(current); - daemonize(); - strcpy(current->comm, thread_name); - unlock_kernel(); - spin_lock_irq(&current->sigmask_lock); - flush_signals(current); - sigfillset(&current->blocked); - recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); -} - -static int ipq_thread_start(void *data) -{ - ipq_queue_t *q = (ipq_queue_t *)data; - - q->thread.running = 1; - ipq_thread_init(IPQ_THR_NAME); - q->thread.pid = current->pid; - while (!q->thread.terminate) { - interruptible_sleep_on(&q->thread.wq); - q->thread.process(q); - } - q->thread.running = 0; - return 0; -} - -static void ipq_thread_stop(ipq_queue_t *q) -{ - if (!(q->thread.pid || q->thread.running)) - return; - q->state = IPQ_QS_FLUSH; - q->thread.terminate = 1; - wake_up_interruptible(&q->thread.wq); - current->state = TASK_INTERRUPTIBLE; - while (q->thread.running) { - schedule_timeout(HZ/10); - current->state = TASK_RUNNING; - } -} - -static int ipq_thread_create(ipq_queue_t *q) -{ - int status = kernel_thread(ipq_thread_start, q, 0); - return (status < 0) ? 
status : 0; -} - - -/**************************************************************************** * * Packet queue * ****************************************************************************/ -/* Must be called under spinlock */ -static __inline__ void -ipq_dequeue(ipq_queue_t *q, - ipq_queue_element_t *e) -{ - list_del(&e->list); - nf_reinject(e->skb, e->info, e->verdict); - kfree(e); - q->len--; -} - -/* Must be called under spinlock */ -static __inline__ void -ipq_queue_drop(ipq_queue_t *q, - ipq_queue_element_t *e) +/* Dequeue with element packet ID, or from end of queue if ID is zero. */ +static ipq_queue_element_t *ipq_dequeue(ipq_queue_t *q, unsigned long id) { - e->verdict = NF_DROP; - ipq_dequeue(q, e); -} - -static int -ipq_notify_peer(ipq_queue_t *q, - ipq_queue_element_t *e) -{ - int status = q->peer.send(e); + struct list_head *i; + ipq_queue_element_t *e = NULL; - if (status >= 0) { - e->state = IPQ_PS_WAITING; - return status; + spin_lock_bh(&q->lock); + if (q->len == 0) + goto out_unlock; + i = q->list.prev; + if (id > 0) { + while (i != &q->list) { + if (id == (unsigned long )i) + goto out_unlink; + i = i->prev; + } + goto out_unlock; } - if (status == -ERESTARTSYS || status == -EAGAIN) - return 0; - printk(KERN_INFO "%s: error notifying peer %d, resetting " - "state and flushing queue\n", IPQ_NAME, q->peer.pid); - q->state = IPQ_QS_FLUSH; - q->peer.died = 1; - q->peer.pid = 0; - q->peer.copy_mode = IPQ_COPY_META; - q->peer.copy_range = 0; - return status; +out_unlink: + e = (ipq_queue_element_t *)i; + list_del(&e->list); + q->len--; +out_unlock: + spin_unlock_bh(&q->lock); + return e; } -static void -ipq_queue_process(void *data) +static void ipq_flush(ipq_queue_t *q) { - struct list_head *i; - ipq_queue_t *q = (ipq_queue_t *)data; - -restart: - if (q->state == IPQ_QS_HOLD) - return; + ipq_queue_element_t *e; + spin_lock_bh(&q->lock); - for (i = q->list.prev; i != &q->list; i = i->prev) { - ipq_queue_element_t *e = (ipq_queue_element_t *)i; - - 
if (q->state == IPQ_QS_FLUSH) { - QDEBUG("flushing packet %p\n", e); - ipq_queue_drop(q, e); - continue; - } - switch (e->state) { - case IPQ_PS_NEW: { - int status = ipq_notify_peer(q, e); - if (status < 0) { - spin_unlock_bh(&q->lock); - goto restart; - } - break; - } - case IPQ_PS_VERDICT: - ipq_dequeue(q, e); - break; - case IPQ_PS_WAITING: - break; - default: - printk(KERN_INFO "%s: dropping stuck packet %p " - "with ps=%d qs=%d\n", IPQ_NAME, - e, e->state, q->state); - ipq_queue_drop(q, e); - } + q->flushing = 1; + spin_unlock_bh(&q->lock); + while ((e = ipq_dequeue(q, 0))) { + e->verdict = NF_DROP; + nf_reinject(e->skb, e->info, e->verdict); + kfree(e); } + spin_lock_bh(&q->lock); + q->flushing = 0; spin_unlock_bh(&q->lock); - if (q->state == IPQ_QS_FLUSH) - q->state = IPQ_QS_HOLD; } -static ipq_queue_t * -ipq_queue_create(nf_queue_outfn_t outfn, - ipq_send_cb_t send_cb, - int *errp, - int *sysctl_qmax) +static ipq_queue_t *ipq_create_queue(nf_queue_outfn_t outfn, + ipq_send_cb_t send_cb, + int *errp, int *sysctl_qmax) { int status; ipq_queue_t *q; @@ -232,18 +116,15 @@ ipq_queue_create(nf_queue_outfn_t outfn, *errp = -ENOMEM; return NULL; } - q->thread.terminate = 0; - q->thread.running = 0; - q->thread.process = ipq_queue_process; - init_waitqueue_head(&q->thread.wq); q->peer.pid = 0; q->peer.died = 0; - q->peer.copy_mode = IPQ_COPY_META; + q->peer.copy_mode = IPQ_COPY_NONE; q->peer.copy_range = 0; q->peer.send = send_cb; q->len = 0; q->maxlen = sysctl_qmax; - q->state = IPQ_QS_HOLD; + q->flushing = 0; + q->terminate = 0; INIT_LIST_HEAD(&q->list); spin_lock_init(&q->lock); status = nf_register_queue_handler(PF_INET, outfn, q); @@ -252,91 +133,92 @@ ipq_queue_create(nf_queue_outfn_t outfn, kfree(q); return NULL; } - status = ipq_thread_create(q); - if (status < 0) { - nf_unregister_queue_handler(PF_INET); - *errp = status; - kfree(q); - return NULL; - } return q; } -static int -ipq_enqueue(ipq_queue_t *q, - struct sk_buff *skb, - struct nf_info *info) 
+static int ipq_enqueue(ipq_queue_t *q, + struct sk_buff *skb, struct nf_info *info) { - ipq_queue_element_t *e = NULL; - + ipq_queue_element_t *e; + int status; + e = kmalloc(sizeof(*e), GFP_ATOMIC); if (e == NULL) { - printk(KERN_ERR "%s: out of memory in %s\n", - IPQ_NAME, __FUNCTION__); - return -ENOMEM; + printk(KERN_ERR "ip_queue: OOM in enqueue\n"); + return -ENOMEM; } - e->state = IPQ_PS_NEW; e->verdict = NF_DROP; e->info = info; e->skb = skb; spin_lock_bh(&q->lock); if (q->len >= *q->maxlen) { spin_unlock_bh(&q->lock); - printk(KERN_WARNING "%s: queue full at %d entries, " - "dropping packet.\n", IPQ_NAME, q->len); - kfree(e); - nf_reinject(skb, info, NF_DROP); - return 0; + if (net_ratelimit()) + printk(KERN_WARNING "ip_queue: full at %d entries, " + "dropping packet(s).\n", q->len); + goto free_drop; + } + if (q->flushing || q->peer.copy_mode == IPQ_COPY_NONE + || q->peer.pid == 0 || q->peer.died || q->terminate) { + spin_unlock_bh(&q->lock); + goto free_drop; + } + status = q->peer.send(e); + if (status > 0) { + list_add(&e->list, &q->list); + q->len++; + spin_unlock_bh(&q->lock); + return status; } - list_add(&e->list, &q->list); - q->len++; spin_unlock_bh(&q->lock); - wake_up_interruptible(&q->thread.wq); - return 0; + if (status == -ECONNREFUSED) { + printk(KERN_INFO "ip_queue: peer %d died, " + "resetting state and flushing queue\n", q->peer.pid); + q->peer.died = 1; + q->peer.pid = 0; + q->peer.copy_mode = IPQ_COPY_NONE; + q->peer.copy_range = 0; + ipq_flush(q); + } +free_drop: + kfree(e); + return -EBUSY; } -/* FIXME: need to find a way to notify user during module unload */ -static void -ipq_queue_destroy(ipq_queue_t *q) +static void ipq_destroy_queue(ipq_queue_t *q) { - ipq_thread_stop(q); nf_unregister_queue_handler(PF_INET); + spin_lock_bh(&q->lock); + q->terminate = 1; + spin_unlock_bh(&q->lock); + ipq_flush(q); kfree(q); } -static int -ipq_queue_mangle_ipv4(unsigned char *buf, - ipq_verdict_msg_t *v, - ipq_queue_element_t *e) +static int 
ipq_mangle_ipv4(ipq_verdict_msg_t *v, ipq_queue_element_t *e) { - struct iphdr *user_iph = (struct iphdr *)buf; + struct iphdr *user_iph = (struct iphdr *)v->payload; if (v->data_len < sizeof(*user_iph)) return 0; - if (e->skb->nh.iph->check != user_iph->check) { int diff = v->data_len - e->skb->len; if (diff < 0) skb_trim(e->skb, v->data_len); else if (diff > 0) { - if (v->data_len > 0xFFFF) { - e->verdict = NF_DROP; + if (v->data_len > 0xFFFF) return -EINVAL; - } if (diff > skb_tailroom(e->skb)) { struct sk_buff *newskb; - /* Ack, we waste a memcpy() of data here */ newskb = skb_copy_expand(e->skb, skb_headroom(e->skb), diff, GFP_ATOMIC); if (newskb == NULL) { - printk(KERN_WARNING "%s: OOM in %s, " - "dropping packet\n", - IPQ_THR_NAME, __FUNCTION__); - e->verdict = NF_DROP; + printk(KERN_WARNING "ip_queue: OOM " + "in mangle, dropping packet\n"); return -ENOMEM; } kfree_skb(e->skb); @@ -344,101 +226,76 @@ ipq_queue_mangle_ipv4(unsigned char *buf, } skb_put(e->skb, diff); } - memcpy(e->skb->data, buf, v->data_len); + memcpy(e->skb->data, v->payload, v->data_len); e->skb->nfcache |= NFC_ALTERED; } return 0; } -static int -ipq_queue_set_verdict(ipq_queue_t *q, - ipq_verdict_msg_t *v, - unsigned char *buf, - unsigned int len) +static int ipq_set_verdict(ipq_queue_t *q, + ipq_verdict_msg_t *v, unsigned int len) { - struct list_head *i; + ipq_queue_element_t *e; if (v->value < 0 || v->value > NF_MAX_VERDICT) return -EINVAL; - spin_lock_bh(&q->lock); - for (i = q->list.next; i != &q->list; i = i->next) { - ipq_queue_element_t *e = (ipq_queue_element_t *)i; - - if (v->id == (unsigned long )e) { - int status = 0; - e->state = IPQ_PS_VERDICT; - e->verdict = v->value; - - if (buf && v->data_len == len) - status = ipq_queue_mangle_ipv4(buf, v, e); - spin_unlock_bh(&q->lock); - return status; - } + e = ipq_dequeue(q, v->id); + if (e == NULL) + return -ENOENT; + else { + e->verdict = v->value; + if (v->data_len && v->data_len == len) + if (ipq_mangle_ipv4(v, e) < 0) + 
e->verdict = NF_DROP; + nf_reinject(e->skb, e->info, e->verdict); + kfree(e); + return 0; } - spin_unlock_bh(&q->lock); - return -ENOENT; } -static int -ipq_receive_peer(ipq_queue_t *q, - ipq_peer_msg_t *m, - unsigned char type, - unsigned int len) +static int ipq_receive_peer(ipq_queue_t *q, ipq_peer_msg_t *m, + unsigned char type, unsigned int len) { - if (q->state == IPQ_QS_FLUSH) - return -EBUSY; + int status = 0; + + spin_lock_bh(&q->lock); + if (q->terminate || q->flushing) + return -EBUSY; + spin_unlock_bh(&q->lock); if (len < sizeof(ipq_peer_msg_t)) return -EINVAL; - switch (type) { case IPQM_MODE: switch (m->msg.mode.value) { - case IPQ_COPY_NONE: - q->peer.copy_mode = IPQ_COPY_NONE; - q->peer.copy_range = 0; - q->state = IPQ_QS_FLUSH; - break; case IPQ_COPY_META: - if (q->state == IPQ_QS_FLUSH) - return -EAGAIN; q->peer.copy_mode = IPQ_COPY_META; q->peer.copy_range = 0; - q->state = IPQ_QS_COPY; break; case IPQ_COPY_PACKET: - if (q->state == IPQ_QS_FLUSH) - return -EAGAIN; q->peer.copy_mode = IPQ_COPY_PACKET; q->peer.copy_range = m->msg.mode.range; - q->state = IPQ_QS_COPY; + if (q->peer.copy_range > 0xFFFF) + q->peer.copy_range = 0xFFFF; break; default: - return -EINVAL; + status = -EINVAL; } break; - case IPQM_VERDICT: { - int status; - unsigned char *data = NULL; - + case IPQM_VERDICT: if (m->msg.verdict.value > NF_MAX_VERDICT) - return -EINVAL; - if (m->msg.verdict.data_len) - data = (unsigned char *)m + sizeof(*m); - status = ipq_queue_set_verdict(q, &m->msg.verdict, - data, len - sizeof(*m)); - if (status < 0) - return status; + status = -EINVAL; + else + status = ipq_set_verdict(q, + &m->msg.verdict, + len - sizeof(*m)); break; - } default: - return -EINVAL; + status = -EINVAL; } - wake_up_interruptible(&q->thread.wq); - return 0; + return status; } - /**************************************************************************** * * Netfilter interface @@ -449,16 +306,10 @@ ipq_receive_peer(ipq_queue_t *q, * Packets arrive here from netfilter for 
queuing to userspace. * All of them must be fed back via nf_reinject() or Alexey will kill Rusty. */ -static int -receive_netfilter(struct sk_buff *skb, - struct nf_info *info, - void *data) +static int netfilter_receive(struct sk_buff *skb, + struct nf_info *info, void *data) { - ipq_queue_t *q = (ipq_queue_t *)data; - - if (q->state == IPQ_QS_FLUSH) - return -EBUSY; - return ipq_enqueue(q, skb, info); + return ipq_enqueue((ipq_queue_t *)data, skb, info); } /**************************************************************************** @@ -467,36 +318,10 @@ receive_netfilter(struct sk_buff *skb, * ****************************************************************************/ -static struct sk_buff * -netlink_build_message(ipq_queue_element_t *e, - int *errp); - -extern __inline__ void -receive_user_skb(struct sk_buff *skb); - -static int -netlink_send_peer(ipq_queue_element_t *e); - static struct sock *nfnl = NULL; ipq_queue_t *nlq = NULL; -static int -netlink_send_peer(ipq_queue_element_t *e) -{ - int status = 0; - struct sk_buff *skb; - - if (!nlq->peer.pid) - return -EINVAL; - skb = netlink_build_message(e, &status); - if (skb == NULL) - return status; - return netlink_unicast(nfnl, skb, nlq->peer.pid, MSG_DONTWAIT); -} - -static struct sk_buff * -netlink_build_message(ipq_queue_element_t *e, - int *errp) +static struct sk_buff *netlink_build_message(ipq_queue_element_t *e, int *errp) { unsigned char *old_tail; size_t size = 0; @@ -519,6 +344,7 @@ netlink_build_message(ipq_queue_element_t *e, else data_len = copy_range; size = NLMSG_SPACE(sizeof(*pm) + data_len); + break; case IPQ_COPY_NONE: default: @@ -542,7 +368,7 @@ netlink_build_message(ipq_queue_element_t *e, if (e->info->outdev) strcpy(pm->outdev_name, e->info->outdev->name); else pm->outdev_name[0] = '\0'; if (data_len) - memcpy(++pm, e->skb->data, data_len); + memcpy(pm->payload, e->skb->data, data_len); nlh->nlmsg_len = skb->tail - old_tail; NETLINK_CB(skb).dst_groups = 0; return skb; @@ -550,16 +376,24 
@@ nlmsg_failure: if (skb) kfree(skb); *errp = 0; - printk(KERN_ERR "%s: error creating netlink message\n", IPQ_NAME); + printk(KERN_ERR "ip_queue: error creating netlink message\n"); return NULL; } +static int netlink_send_peer(ipq_queue_element_t *e) +{ + int status = 0; + struct sk_buff *skb; + + skb = netlink_build_message(e, &status); + if (skb == NULL) + return status; + return netlink_unicast(nfnl, skb, nlq->peer.pid, MSG_DONTWAIT); +} + #define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0); -/* - * FIXME: ping old peer if we detect a new peer then resend. - */ -extern __inline__ void -receive_user_skb(struct sk_buff *skb) + +extern __inline__ void netlink_receive_user_skb(struct sk_buff *skb) { int status, type; struct nlmsghdr *nlh; @@ -581,9 +415,11 @@ receive_user_skb(struct sk_buff *skb) if(!cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) RCV_SKB_FAIL(-EPERM); if (nlq->peer.pid && !nlq->peer.died - && (nlq->peer.pid != nlh->nlmsg_pid)) - printk(KERN_WARNING "%s: peer pid changed from %d to %d\n", - IPQ_NAME, nlq->peer.pid, nlh->nlmsg_pid); + && (nlq->peer.pid != nlh->nlmsg_pid)) { + printk(KERN_WARNING "ip_queue: peer pid changed from %d to " + "%d, flushing queue\n", nlq->peer.pid, nlh->nlmsg_pid); + ipq_flush(nlq); + } nlq->peer.pid = nlh->nlmsg_pid; nlq->peer.died = 0; status = ipq_receive_peer(nlq, NLMSG_DATA(nlh), @@ -596,9 +432,7 @@ receive_user_skb(struct sk_buff *skb) } /* Note: we are only dealing with single part messages at the moment. 
*/ -static void -receive_user_sk(struct sock *sk, - int len) +static void netlink_receive_user_sk(struct sock *sk, int len) { do { struct sk_buff *skb; @@ -606,28 +440,25 @@ receive_user_sk(struct sock *sk, if (rtnl_shlock_nowait()) return; while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) { - receive_user_skb(skb); + netlink_receive_user_skb(skb); kfree_skb(skb); } up(&rtnl_sem); } while (nfnl && nfnl->receive_queue.qlen); } - /**************************************************************************** * * System events * ****************************************************************************/ -static int -receive_event(struct notifier_block *this, - unsigned long event, - void *ptr) +static int receive_event(struct notifier_block *this, + unsigned long event, void *ptr) { if (event == NETDEV_UNREGISTER) if (nlq) - ipq_thread_stop(nlq); + ipq_destroy_queue(nlq); return NOTIFY_DONE; } @@ -637,7 +468,6 @@ struct notifier_block ipq_dev_notifier = { 0 }; - /**************************************************************************** * * Sysctl - queue tuning. @@ -670,33 +500,28 @@ static ctl_table ipq_root_table[] = { * ****************************************************************************/ -static int -ipq_get_info(char *buffer, char **start, off_t offset, int length) +static int ipq_get_info(char *buffer, char **start, off_t offset, int length) { int len; spin_lock_bh(&nlq->lock); len = sprintf(buffer, - "Thread pid : %d\n" - "Thread terminate : %d\n" - "Thread running : %d\n" - "Peer pid : %d\n" - "Peer died : %d\n" - "Peer copy mode : %d\n" - "Peer copy range : %d\n" - "Queue length : %d\n" - "Queue max. length : %d\n" - "Queue state : %d\n", - nlq->thread.pid, - nlq->thread.terminate, - nlq->thread.running, + "Peer pid : %d\n" + "Peer died : %d\n" + "Peer copy mode : %d\n" + "Peer copy range : %d\n" + "Queue length : %d\n" + "Queue max. 
length : %d\n" + "Queue flushing : %d\n" + "Queue terminate : %d\n", nlq->peer.pid, nlq->peer.died, nlq->peer.copy_mode, nlq->peer.copy_range, nlq->len, *nlq->maxlen, - nlq->state); + nlq->flushing, + nlq->terminate); spin_unlock_bh(&nlq->lock); *start = buffer + offset; len -= offset; @@ -716,18 +541,18 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length) static int __init init(void) { int status = 0; - - nfnl = netlink_kernel_create(NETLINK_FIREWALL, receive_user_sk); + + nfnl = netlink_kernel_create(NETLINK_FIREWALL, netlink_receive_user_sk); if (nfnl == NULL) { - printk(KERN_ERR "%s: initialisation failed: unable to " - "create kernel netlink socket\n", IPQ_NAME); + printk(KERN_ERR "ip_queue: initialisation failed: unable to " + "create kernel netlink socket\n"); return -ENOMEM; } - nlq = ipq_queue_create(receive_netfilter, + nlq = ipq_create_queue(netfilter_receive, netlink_send_peer, &status, &sysctl_maxlen); if (nlq == NULL) { - printk(KERN_ERR "%s: initialisation failed: unable to " - "initialise queue\n", IPQ_NAME); + printk(KERN_ERR "ip_queue: initialisation failed: unable to " + "create queue\n"); sock_release(nfnl->socket); return status; } @@ -742,7 +567,7 @@ static void __exit fini(void) unregister_sysctl_table(ipq_sysctl_header); proc_net_remove(IPQ_PROC_FS_NAME); unregister_netdevice_notifier(&ipq_dev_notifier); - ipq_queue_destroy(nlq); + ipq_destroy_queue(nlq); sock_release(nfnl->socket); } @@ -750,3 +575,4 @@ MODULE_DESCRIPTION("IPv4 packet queue handler"); module_init(init); module_exit(fini); + diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 8cc8c24ac..66f47c386 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -12,15 +12,13 @@ #include <linux/tcp.h> #include <linux/udp.h> #include <linux/icmp.h> +#include <net/ip.h> #include <asm/uaccess.h> #include <asm/semaphore.h> +#include <linux/proc_fs.h> #include <linux/netfilter_ipv4/ip_tables.h> -#ifndef IP_OFFSET 
-#define IP_OFFSET 0x1FFF -#endif - /*#define DEBUG_IP_FIREWALL*/ /*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */ /*#define DEBUG_IP_FIREWALL_USER*/ @@ -288,9 +286,16 @@ ipt_do_table(struct sk_buff **pskb, + TABLE_OFFSET(table->private, smp_processor_id()); e = get_entry(table_base, table->private->hook_entry[hook]); - /* Check noone else using our table */ - IP_NF_ASSERT(((struct ipt_entry *)table_base)->comefrom == 0xdead57ac); #ifdef CONFIG_NETFILTER_DEBUG + /* Check noone else using our table */ + if (((struct ipt_entry *)table_base)->comefrom != 0xdead57ac + && ((struct ipt_entry *)table_base)->comefrom != 0xeeeeeeec) { + printk("ASSERT: CPU #%u, %s comefrom(%p) = %X\n", + smp_processor_id(), + table->name, + &((struct ipt_entry *)table_base)->comefrom, + ((struct ipt_entry *)table_base)->comefrom); + } ((struct ipt_entry *)table_base)->comefrom = 0x57acc001; #endif @@ -343,11 +348,28 @@ ipt_do_table(struct sk_buff **pskb, e = get_entry(table_base, v); } else { + /* Targets which reenter must return + abs. verdicts */ +#ifdef CONFIG_NETFILTER_DEBUG + ((struct ipt_entry *)table_base)->comefrom + = 0xeeeeeeec; +#endif verdict = t->u.target->target(pskb, hook, in, out, t->data, userdata); +#ifdef CONFIG_NETFILTER_DEBUG + if (((struct ipt_entry *)table_base)->comefrom + != 0xeeeeeeec + && verdict == IPT_CONTINUE) { + printk("Target %s reentered!\n", + t->u.target->name); + verdict = NF_DROP; + } + ((struct ipt_entry *)table_base)->comefrom + = 0x57acc001; +#endif /* Target might have changed stuff. 
*/ ip = (*pskb)->nh.iph; protohdr = (u_int32_t *)ip + ip->ihl; @@ -1631,6 +1653,43 @@ static struct ipt_match udp_matchstruct static struct ipt_match icmp_matchstruct = { { NULL, NULL }, "icmp", &icmp_match, &icmp_checkentry, NULL }; +#ifdef CONFIG_PROC_FS +static inline int print_name(const struct ipt_table *t, + off_t start_offset, char *buffer, int length, + off_t *pos, unsigned int *count) +{ + if ((*count)++ >= start_offset) { + unsigned int namelen; + + namelen = sprintf(buffer + *pos, "%s\n", t->name); + if (*pos + namelen > length) { + /* Stop iterating */ + return 1; + } + *pos += namelen; + } + return 0; +} + +static int ipt_get_tables(char *buffer, char **start, off_t offset, int length) +{ + off_t pos = 0; + unsigned int count = 0; + + if (down_interruptible(&ipt_mutex) != 0) + return 0; + + LIST_FIND(&ipt_tables, print_name, struct ipt_table *, + offset, buffer, length, &pos, &count); + + up(&ipt_mutex); + + /* `start' hack - see fs/proc/generic.c line ~105 */ + *start=(char *)((unsigned long)count-offset); + return pos; +} +#endif /*CONFIG_PROC_FS*/ + static int __init init(void) { int ret; @@ -1651,13 +1710,23 @@ static int __init init(void) return ret; } - printk("iptables: (c)2000 Netfilter core team\n"); +#ifdef CONFIG_PROC_FS + if (!proc_net_create("ip_tables_names", 0, ipt_get_tables)) { + nf_unregister_sockopt(&ipt_sockopts); + return -ENOMEM; + } +#endif + + printk("ip_tables: (c)2000 Netfilter core team\n"); return 0; } static void __exit fini(void) { nf_unregister_sockopt(&ipt_sockopts); +#ifdef CONFIG_PROC_FS + proc_net_remove("ip_tables_names"); +#endif } module_init(init); diff --git a/net/ipv4/netfilter/ipchains_core.c b/net/ipv4/netfilter/ipchains_core.c index 02bd7ad83..419b0382c 100644 --- a/net/ipv4/netfilter/ipchains_core.c +++ b/net/ipv4/netfilter/ipchains_core.c @@ -145,7 +145,9 @@ /*#define DEBUG_IP_FIREWALL_USER*/ /*#define DEBUG_IP_FIREWALL_LOCKING*/ +#if defined(CONFIG_NETLINK_DEV) || defined(CONFIG_NETLINK_DEV_MODULE) static 
struct sock *ipfwsk; +#endif #ifdef CONFIG_SMP #define SLOT_NUMBER() (cpu_number_map(smp_processor_id())*2 + !in_interrupt()) diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 6e69d6a90..4675a94b8 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -24,10 +24,6 @@ struct esphdr { __u32 spi; }; /* FIXME evil kludge */ -/* Make init and cleanup non-static, so gcc doesn't warn about unused, - but don't export the symbols */ -EXPORT_NO_SYMBOLS; - /* Use lock to serialize, so printks don't overlap */ static spinlock_t log_lock = SPIN_LOCK_UNLOCKED; @@ -353,15 +349,15 @@ static struct ipt_target ipt_log_reg static int __init init(void) { - if (ipt_register_target(&ipt_log_reg)) - return -EINVAL; + if (ipt_register_target(&ipt_log_reg)) + return -EINVAL; - return 0; + return 0; } static void __exit fini(void) { - ipt_unregister_target(&ipt_log_reg); + ipt_unregister_target(&ipt_log_reg); } module_init(init); diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c index 32906eefe..924e00e5c 100644 --- a/net/ipv4/netfilter/ipt_MARK.c +++ b/net/ipv4/netfilter/ipt_MARK.c @@ -7,8 +7,6 @@ #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_MARK.h> -EXPORT_NO_SYMBOLS; - static unsigned int target(struct sk_buff **pskb, unsigned int hooknum, @@ -53,15 +51,15 @@ static struct ipt_target ipt_mark_reg static int __init init(void) { - if (ipt_register_target(&ipt_mark_reg)) - return -EINVAL; + if (ipt_register_target(&ipt_mark_reg)) + return -EINVAL; - return 0; + return 0; } static void __exit fini(void) { - ipt_unregister_target(&ipt_mark_reg); + ipt_unregister_target(&ipt_mark_reg); } module_init(init); diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 9f94f8f44..071e2c3cd 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -11,8 +11,6 @@ #include <linux/netfilter_ipv4/ip_nat_rule.h> #include 
<linux/netfilter_ipv4/ip_tables.h> -EXPORT_NO_SYMBOLS; - #if 0 #define DEBUGP printk #else diff --git a/net/ipv4/netfilter/ipt_MIRROR.c b/net/ipv4/netfilter/ipt_MIRROR.c index 9dec181c1..dba913387 100644 --- a/net/ipv4/netfilter/ipt_MIRROR.c +++ b/net/ipv4/netfilter/ipt_MIRROR.c @@ -29,7 +29,6 @@ #include <linux/route.h> struct in_device; #include <net/route.h> -EXPORT_NO_SYMBOLS; #if 0 #define DEBUGP printk @@ -49,7 +48,7 @@ static int route_mirror(struct sk_buff *skb) } /* check if the interface we are living by is the same as the one we arrived on */ - if (skb->rx_dev != rt->u.dst.dev) { + if (skb->rx_dev == rt->u.dst.dev) { /* Drop old route. */ dst_release(skb->dst); skb->dst = &rt->u.dst; diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c index 690d3a8a1..aa7ac5e5d 100644 --- a/net/ipv4/netfilter/ipt_REDIRECT.c +++ b/net/ipv4/netfilter/ipt_REDIRECT.c @@ -12,8 +12,6 @@ #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv4/ip_nat_rule.h> -EXPORT_NO_SYMBOLS; - #if 0 #define DEBUGP printk #else diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index b183e822c..7e82c908c 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -6,12 +6,11 @@ #include <linux/skbuff.h> #include <linux/ip.h> #include <net/icmp.h> -#include <net/tcp.h> +#include <net/ip.h> struct in_device; #include <net/route.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_REJECT.h> -EXPORT_NO_SYMBOLS; #if 0 #define DEBUGP printk @@ -28,6 +27,9 @@ static unsigned int reject(struct sk_buff **pskb, { const struct ipt_reject_info *reject = targinfo; + /* WARNING: This code has causes reentry within iptables. + This means that the iptables jump stack is now crap. We + must return an absolute verdict. 
--RR */ switch (reject->with) { case IPT_ICMP_NET_UNREACHABLE: icmp_send(*pskb, ICMP_DEST_UNREACH, ICMP_NET_UNREACH, 0); @@ -62,9 +64,6 @@ static unsigned int reject(struct sk_buff **pskb, } } break; - case IPT_TCP_RESET: - tcp_v4_send_reset(*pskb); - break; } return NF_DROP; @@ -115,12 +114,6 @@ static int check(const char *tablename, DEBUGP("REJECT: ECHOREPLY illegal for non-ping\n"); return 0; } - } else if (rejinfo->with == IPT_TCP_RESET) { - if (e->ip.proto != IPPROTO_TCP - || (e->ip.invflags & IPT_INV_PROTO)) { - DEBUGP("REJECT: TCP_RESET illegal for non-tcp\n"); - return 0; - } } return 1; diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c index fbfb4974f..f0c293868 100644 --- a/net/ipv4/netfilter/ipt_TOS.c +++ b/net/ipv4/netfilter/ipt_TOS.c @@ -7,8 +7,6 @@ #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_TOS.h> -EXPORT_NO_SYMBOLS; - static unsigned int target(struct sk_buff **pskb, unsigned int hooknum, @@ -72,15 +70,15 @@ static struct ipt_target ipt_tos_reg static int __init init(void) { - if (ipt_register_target(&ipt_tos_reg)) - return -EINVAL; + if (ipt_register_target(&ipt_tos_reg)) + return -EINVAL; - return 0; + return 0; } static void __exit fini(void) { - ipt_unregister_target(&ipt_tos_reg); + ipt_unregister_target(&ipt_tos_reg); } module_init(init); diff --git a/net/ipv4/netfilter/ipt_limit.c b/net/ipv4/netfilter/ipt_limit.c index 3785ba371..5e2b86029 100644 --- a/net/ipv4/netfilter/ipt_limit.c +++ b/net/ipv4/netfilter/ipt_limit.c @@ -14,7 +14,6 @@ #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_limit.h> -EXPORT_NO_SYMBOLS; #define IP_PARTS_NATIVE(n) \ (unsigned int)((n)>>24)&0xFF, \ diff --git a/net/ipv4/netfilter/ipt_mac.c b/net/ipv4/netfilter/ipt_mac.c index 90dbec59d..7de798767 100644 --- a/net/ipv4/netfilter/ipt_mac.c +++ b/net/ipv4/netfilter/ipt_mac.c @@ -5,7 +5,6 @@ #include <linux/netfilter_ipv4/ipt_mac.h> #include <linux/netfilter_ipv4/ip_tables.h> 
-EXPORT_NO_SYMBOLS; static int match(const struct sk_buff *skb, diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c index 0d828fd20..66c3d1186 100644 --- a/net/ipv4/netfilter/ipt_mark.c +++ b/net/ipv4/netfilter/ipt_mark.c @@ -5,8 +5,6 @@ #include <linux/netfilter_ipv4/ipt_mark.h> #include <linux/netfilter_ipv4/ip_tables.h> -EXPORT_NO_SYMBOLS; - static int match(const struct sk_buff *skb, const struct net_device *in, diff --git a/net/ipv4/netfilter/ipt_multiport.c b/net/ipv4/netfilter/ipt_multiport.c index 08cc4a968..6170ce65e 100644 --- a/net/ipv4/netfilter/ipt_multiport.c +++ b/net/ipv4/netfilter/ipt_multiport.c @@ -14,8 +14,6 @@ #define duprintf(format, args...) #endif -EXPORT_NO_SYMBOLS; - /* Returns 1 if the port is matched by the test, 0 otherwise. */ static inline int ports_match(const u_int16_t *portlist, enum ipt_multiport_flags flags, diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c index 5438571d3..501916414 100644 --- a/net/ipv4/netfilter/ipt_owner.c +++ b/net/ipv4/netfilter/ipt_owner.c @@ -1,7 +1,7 @@ /* Kernel module to match various things tied to sockets associated with locally generated outgoing packets. 
- (C)2000 Marc Boucher + Copyright (C) 2000 Marc Boucher */ #include <linux/module.h> #include <linux/skbuff.h> @@ -11,8 +11,6 @@ #include <linux/netfilter_ipv4/ipt_owner.h> #include <linux/netfilter_ipv4/ip_tables.h> -EXPORT_NO_SYMBOLS; - static int match_pid(const struct sk_buff *skb, pid_t pid) { diff --git a/net/ipv4/netfilter/ipt_state.c b/net/ipv4/netfilter/ipt_state.c index 1baa54d62..b559e7f56 100644 --- a/net/ipv4/netfilter/ipt_state.c +++ b/net/ipv4/netfilter/ipt_state.c @@ -6,7 +6,6 @@ #include <linux/netfilter_ipv4/ip_conntrack.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_state.h> -EXPORT_NO_SYMBOLS; static int match(const struct sk_buff *skb, @@ -47,14 +46,17 @@ static struct ipt_match state_match static int __init init(void) { - __MOD_INC_USE_COUNT(ip_conntrack_module); + /* NULL if ip_conntrack not a module */ + if (ip_conntrack_module) + __MOD_INC_USE_COUNT(ip_conntrack_module); return ipt_register_match(&state_match); } static void __exit fini(void) { ipt_unregister_match(&state_match); - __MOD_DEC_USE_COUNT(ip_conntrack_module); + if (ip_conntrack_module) + __MOD_DEC_USE_COUNT(ip_conntrack_module); } module_init(init); diff --git a/net/ipv4/netfilter/ipt_tos.c b/net/ipv4/netfilter/ipt_tos.c index 6da72b2d8..b144704e4 100644 --- a/net/ipv4/netfilter/ipt_tos.c +++ b/net/ipv4/netfilter/ipt_tos.c @@ -5,8 +5,6 @@ #include <linux/netfilter_ipv4/ipt_tos.h> #include <linux/netfilter_ipv4/ip_tables.h> -EXPORT_NO_SYMBOLS; - static int match(const struct sk_buff *skb, const struct net_device *in, diff --git a/net/ipv4/netfilter/ipt_unclean.c b/net/ipv4/netfilter/ipt_unclean.c index 056224a87..72fab2b18 100644 --- a/net/ipv4/netfilter/ipt_unclean.c +++ b/net/ipv4/netfilter/ipt_unclean.c @@ -9,8 +9,6 @@ #include <linux/netfilter_ipv4/ip_tables.h> -EXPORT_NO_SYMBOLS; - #define limpk(format, args...) 
\ do { \ if (net_ratelimit()) \ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 471eb9e70..098d91ba1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.165 2000/03/23 05:30:32 davem Exp $ + * Version: $Id: tcp.c,v 1.166 2000/03/25 01:55:11 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -445,12 +445,6 @@ static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait } /* - * Compute minimal free write space needed to queue new packets. - */ -#define tcp_min_write_space(__sk) \ - (atomic_read(&(__sk)->wmem_alloc) / 2) - -/* * Wait for a TCP event. * * Note that we don't need to lock the socket, as the upper poll layers @@ -520,7 +514,15 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) if (sock_wspace(sk) >= tcp_min_write_space(sk)) { mask |= POLLOUT | POLLWRNORM; } else { /* send SIGIO later */ - sk->socket->flags |= SO_NOSPACE; + set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); + set_bit(SOCK_NOSPACE, &sk->socket->flags); + + /* Race breaker. If space is freed after + * wspace test but before the flags are set, + * IO signal will be lost. + */ + if (sock_wspace(sk) >= tcp_min_write_space(sk)) + mask |= POLLOUT | POLLWRNORM; } } @@ -534,18 +536,26 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) * Socket write_space callback. * This (or rather the sock_wake_async) should agree with poll. * - * WARNING. This callback is called from any context (process, - * bh or irq). Do not make anything more smart from it. + * WARNING. This callback is called, when socket is not locked. + * + * This wakeup is used by TCP only as dead-lock breaker, real + * wakeup occurs when incoming ack frees some space in buffer. 
*/ void tcp_write_space(struct sock *sk) { + struct socket *sock; + read_lock(&sk->callback_lock); - if (!sk->dead) { - /* Why??!! Does it really not overshedule? --ANK */ - wake_up_interruptible(sk->sleep); + if ((sock = sk->socket) != NULL && atomic_read(&sk->wmem_alloc) == 0) { + if (test_bit(SOCK_NOSPACE, &sock->flags)) { + if (sk->sleep && waitqueue_active(sk->sleep)) { + clear_bit(SOCK_NOSPACE, &sock->flags); + wake_up_interruptible(sk->sleep); + } + } - if (sock_wspace(sk) >= tcp_min_write_space(sk)) - sock_wake_async(sk->socket, 2, POLL_OUT); + if (sock->fasync_list) + sock_wake_async(sock, 2, POLL_OUT); } read_unlock(&sk->callback_lock); } @@ -636,7 +646,6 @@ int tcp_listen_start(struct sock *sk) sk->write_space = tcp_listen_write_space; sk_dst_reset(sk); sk->prot->hash(sk); - sk->socket->flags |= SO_ACCEPTCON; return 0; } @@ -742,7 +751,7 @@ static int wait_for_tcp_connect(struct sock * sk, int flags, long *timeo_p) if(!*timeo_p) return -EAGAIN; if(signal_pending(tsk)) - return -ERESTARTSYS; + return sock_intr_errno(*timeo_p); __set_task_state(tsk, TASK_INTERRUPTIBLE); add_wait_queue(sk->sleep, &wait); @@ -772,9 +781,12 @@ static long wait_for_tcp_memory(struct sock * sk, long timeo) if (!tcp_memory_free(sk)) { DECLARE_WAITQUEUE(wait, current); - sk->socket->flags &= ~SO_NOSPACE; + clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); + add_wait_queue(sk->sleep, &wait); for (;;) { + set_bit(SOCK_NOSPACE, &sk->socket->flags); + set_current_state(TASK_INTERRUPTIBLE); if (signal_pending(current)) @@ -830,7 +842,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size) goto out_unlock; /* This should be in poll */ - sk->socket->flags &= ~SO_NOSPACE; /* clear SIGIO XXX */ + clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); mss_now = tcp_current_mss(sk); @@ -943,13 +955,15 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size) /* If we didn't get any memory, we need to sleep. 
*/ if (skb == NULL) { - sk->socket->flags |= SO_NOSPACE; + set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); + set_bit(SOCK_NOSPACE, &sk->socket->flags); + if (!timeo) { err = -EAGAIN; goto do_interrupted; } if (signal_pending(current)) { - err = -ERESTARTSYS; + err = sock_intr_errno(timeo); goto do_interrupted; } __tcp_push_pending_frames(sk, tp, mss_now); @@ -1062,7 +1076,8 @@ static int tcp_recv_urg(struct sock * sk, long timeo, msg->msg_flags|=MSG_OOB; if(len>0) { - err = memcpy_toiovec(msg->msg_iov, &c, 1); + if (!(flags & MSG_PEEK)) + err = memcpy_toiovec(msg->msg_iov, &c, 1); len = 1; } else msg->msg_flags|=MSG_TRUNC; @@ -1188,14 +1203,14 @@ static long tcp_data_wait(struct sock *sk, long timeo) __set_current_state(TASK_INTERRUPTIBLE); - sk->socket->flags |= SO_WAITDATA; + set_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags); release_sock(sk); if (skb_queue_empty(&sk->receive_queue)) timeo = schedule_timeout(timeo); lock_sock(sk); - sk->socket->flags &= ~SO_WAITDATA; + clear_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags); remove_wait_queue(sk->sleep, &wait); __set_current_state(TASK_RUNNING); @@ -1287,9 +1302,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, if (signal_pending(current)) { if (copied) break; - copied = -ERESTARTSYS; - if (!timeo) - copied = -EAGAIN; + copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; break; } @@ -1362,7 +1375,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, if (tp->ucopy.task == user_recv) { /* Install new reader */ - if (user_recv == NULL && !(flags&MSG_PEEK)) { + if (user_recv == NULL && !(flags&(MSG_TRUNC|MSG_PEEK))) { user_recv = current; tp->ucopy.task = user_recv; tp->ucopy.iov = msg->msg_iov; @@ -1370,7 +1383,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, tp->ucopy.len = len; - BUG_TRAP(tp->copied_seq == tp->rcv_nxt || (flags&MSG_PEEK)); + BUG_TRAP(tp->copied_seq == tp->rcv_nxt || (flags&(MSG_PEEK|MSG_TRUNC))); /* Ugly... 
If prequeue is not empty, we have to * process it before releasing socket, otherwise @@ -1458,12 +1471,15 @@ do_prequeue: } } - err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used); - if (err) { - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; + err = 0; + if (!(flags&MSG_TRUNC)) { + err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } } *seq += used; @@ -1961,7 +1977,7 @@ static int wait_for_connect(struct sock * sk, long timeo) err = -EINVAL; if (sk->state != TCP_LISTEN) break; - err = -ERESTARTSYS; + err = sock_intr_errno(timeo); if (signal_pending(current)) break; err = -EAGAIN; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 575ec3036..3ba12bc52 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.190 2000/03/21 19:34:23 davem Exp $ + * Version: $Id: tcp_input.c,v 1.191 2000/03/25 01:55:13 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -1181,6 +1181,9 @@ static int tcp_ack(struct sock *sk, struct tcphdr *th, if (ack != tp->snd_una || (flag == 0 && !th->fin)) dst_confirm(sk->dst_cache); + if (ack != tp->snd_una) + tp->sorry = 1; + /* Remember the highest ack received. */ tp->snd_una = ack; return 1; @@ -1614,7 +1617,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); tp->fin_seq = TCP_SKB_CB(skb)->end_seq; - tcp_send_ack(sk); + tp->ack.pending = 1; sk->shutdown |= RCV_SHUTDOWN; @@ -1644,6 +1647,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) break; case TCP_FIN_WAIT2: /* Received a FIN -- send ACK and enter TIME_WAIT. 
*/ + tcp_send_ack(sk); tcp_time_wait(sk, TCP_TIME_WAIT, 0); break; default: @@ -1944,7 +1948,7 @@ queue_and_out: if (eaten) { kfree_skb(skb); - } else + } else if (!sk->dead) sk->data_ready(sk, 0); return; } @@ -2074,6 +2078,30 @@ drop: kfree_skb(skb); } +/* When incoming ACK allowed to free some skb from write_queue, + * we remember this in flag tp->sorry and wake up socket on the exit + * from tcp input handler. Probably, handler has already eat this space + * sending ACK and cloned frames from tcp_write_xmit(). + */ +static __inline__ void tcp_new_space(struct sock *sk) +{ + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct socket *sock; + + tp->sorry = 0; + + if (sock_wspace(sk) >= tcp_min_write_space(sk) && + (sock = sk->socket) != NULL) { + clear_bit(SOCK_NOSPACE, &sock->flags); + + if (sk->sleep && waitqueue_active(sk->sleep)) + wake_up_interruptible(sk->sleep); + + if (sock->fasync_list) + sock_wake_async(sock, 2, POLL_OUT); + } +} + static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); @@ -2114,7 +2142,14 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) */ /* More than one full frame received or... */ - if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss) || + if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss +#ifdef TCP_MORE_COARSE_ACKS + /* Avoid to send immediate ACK from input path, if it + * does not advance window far enough. tcp_recvmsg() will do this. + */ + && (!sysctl_tcp_retrans_collapse || __tcp_select_window(sk) >= tp->rcv_wnd) +#endif + ) || /* We ACK each frame or... 
*/ tcp_in_quickack_mode(tp) || /* We have out of order data or */ @@ -2480,6 +2515,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, TCP_SKB_CB(skb)->ack_seq, len); kfree_skb(skb); tcp_data_snd_check(sk); + if (tp->sorry) + tcp_new_space(sk); return 0; } else { /* Header too small */ TCP_INC_STATS_BH(TcpInErrs); @@ -2633,6 +2670,8 @@ step5: if(sk->state != TCP_CLOSE) { tcp_data_snd_check(sk); tcp_ack_snd_check(sk); + if (tp->sorry) + tcp_new_space(sk); } return 0; @@ -2739,6 +2778,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->saw_tstamp = 0; newtp->probes_out = 0; + newtp->num_sacks = 0; newtp->syn_seq = req->rcv_isn; newtp->fin_seq = req->rcv_isn; newtp->urg_data = 0; @@ -3112,6 +3152,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_sync_mss(sk, tp->pmtu_cookie); tcp_initialize_rcv_mss(sk); tcp_init_metrics(sk); + tcp_init_buffer_space(sk); if (sk->keepopen) tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); @@ -3516,6 +3557,8 @@ step6: if (sk->state != TCP_CLOSE) { tcp_data_snd_check(sk); tcp_ack_snd_check(sk); + if (tp->sorry) + tcp_new_space(sk); } if (!queued) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 456f12968..3c9f4e82b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.203 2000/03/22 17:55:03 davem Exp $ + * Version: $Id: tcp_ipv4.c,v 1.205 2000/03/26 09:16:08 davem Exp $ * * IPv4 specific functions * @@ -1039,7 +1039,6 @@ out: void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, struct sk_buff *skb) { - th->check = 0; th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr, csum_partial((char *)th, th->doff<<2, skb->csum)); } @@ -1057,7 +1056,7 @@ void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, * Exception: precedence violation. We do not implement it in any case. 
*/ -void tcp_v4_send_reset(struct sk_buff *skb) +static void tcp_v4_send_reset(struct sk_buff *skb) { struct tcphdr *th = skb->h.th; struct tcphdr rth; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 887aaa519..600140764 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_output.c,v 1.122 2000/02/21 15:51:41 davem Exp $ + * Version: $Id: tcp_output.c,v 1.123 2000/03/25 01:52:05 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -126,7 +126,7 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) #define SYSCTL_FLAG_SACK 0x4 sysctl_flags = 0; - if(tcb->flags & TCPCB_FLAG_SYN) { + if (tcb->flags & TCPCB_FLAG_SYN) { tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; if(sysctl_tcp_timestamps) { tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; @@ -141,7 +141,7 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; } - } else if(tp->sack_ok && tp->num_sacks) { + } else if (tp->num_sacks) { /* A SACK is 2 pad bytes, a 2 byte header, plus * 2 32-bit sequence numbers for each SACK block. */ @@ -157,16 +157,19 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) th->dest = sk->dport; th->seq = htonl(TCP_SKB_CB(skb)->seq); th->ack_seq = htonl(tp->rcv_nxt); - th->doff = (tcp_header_size >> 2); - th->res1 = 0; - *(((__u8 *)th) + 13) = tcb->flags; - th->check = 0; - th->urg_ptr = ntohs(tcb->urg_ptr); - if(tcb->flags & TCPCB_FLAG_SYN) { + *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags); + if (tcb->flags & TCPCB_FLAG_SYN) { /* RFC1323: The window in SYN & SYN/ACK segments * is never scaled. 
*/ th->window = htons(tp->rcv_wnd); + } else { + th->window = htons(tcp_select_window(sk)); + } + th->check = 0; + th->urg_ptr = ntohs(tcb->urg_ptr); + + if (tcb->flags & TCPCB_FLAG_SYN) { tcp_syn_build_options((__u32 *)(th + 1), tcp_advertise_mss(sk), (sysctl_flags & SYSCTL_FLAG_TSTAMPS), @@ -176,13 +179,12 @@ int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->when, tp->ts_recent); } else { - th->window = htons(tcp_select_window(sk)); tcp_build_and_update_options((__u32 *)(th + 1), tp, TCP_SKB_CB(skb)->when); } tp->af_specific->send_check(sk, th, skb->len, skb); - if (th->ack) + if (tcb->flags & TCPCB_FLAG_ACK) tcp_event_ack_sent(sk); if (skb->len != tcp_header_size) @@ -1097,10 +1099,26 @@ err_out: void tcp_send_delayed_ack(struct sock *sk) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + long ato = tp->ack.ato; unsigned long timeout; + if (ato > TCP_DELACK_MIN) { + int max_ato; + + /* If some rtt estimate is known, use it to bound delayed ack. + * Do not use tp->rto here, use results of rtt measurements + * directly. + */ + if (tp->srtt) + max_ato = (tp->srtt >> 3) + tp->mdev; + else + max_ato = TCP_DELACK_MAX; + + ato = min(ato, max_ato); + } + /* Stay within the limit we were given */ - timeout = jiffies + tp->ack.ato; + timeout = jiffies + ato; /* Use new timeout only if there wasn't a older one earlier. */ spin_lock_bh(&sk->timer_lock); @@ -1111,7 +1129,7 @@ void tcp_send_delayed_ack(struct sock *sk) /* If delack timer was blocked or is about to expire, * send ACK now. 
*/ - if (tp->ack.blocked || time_before_eq(tp->delack_timer.expires, jiffies+(tp->ack.ato>>2))) { + if (tp->ack.blocked || time_before_eq(tp->delack_timer.expires, jiffies+(ato>>2))) { spin_unlock_bh(&sk->timer_lock); tcp_send_ack(sk); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index ffb0787e8..41ce4b997 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: icmp.c,v 1.27 2000/02/22 23:54:28 davem Exp $ + * $Id: icmp.c,v 1.28 2000/03/25 01:55:20 davem Exp $ * * Based on net/ipv4/icmp.c * @@ -660,6 +660,7 @@ int __init icmpv6_init(struct net_proto_family *ops) sk = icmpv6_socket->sk; sk->allocation = GFP_ATOMIC; + sk->sndbuf = SK_WMEM_MAX*2; sk->prot->unhash(sk); inet6_add_protocol(&icmpv6_protocol); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b0e8ee714..c6fd03355 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: tcp_ipv6.c,v 1.121 2000/03/08 19:36:47 davem Exp $ + * $Id: tcp_ipv6.c,v 1.122 2000/03/25 01:52:11 davem Exp $ * * Based on: * linux/net/ipv4/tcp.c @@ -910,7 +910,6 @@ static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, struct sk_buff *skb) { struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; - th->check = 0; th->check = csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, csum_partial((char *)th, th->doff<<2, diff --git a/net/ipx/af_spx.c b/net/ipx/af_spx.c index 9f52dfe4e..1eb7a725c 100644 --- a/net/ipx/af_spx.c +++ b/net/ipx/af_spx.c @@ -89,7 +89,7 @@ static unsigned int spx_datagram_poll(struct file * file, struct socket *sock, p if (sock_writeable(sk)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; else - sk->socket->flags |= SO_NOSPACE; + set_bit(SOCK_ASYNC_NOSPACE,&sk->socket->flags); return mask; } @@ -231,7 +231,7 @@ static int spx_listen(struct socket *sock, int backlog) sk->ack_backlog = 0; sk->state = TCP_LISTEN; } - sk->socket->flags |= SO_ACCEPTCON; + 
sk->socket->flags |= __SO_ACCEPTCON; return (0); } @@ -248,7 +248,7 @@ static int spx_accept(struct socket *sock, struct socket *newsock, int flags) return (-EINVAL); sk = sock->sk; - if((sock->state != SS_UNCONNECTED) || !(sock->flags & SO_ACCEPTCON)) + if((sock->state != SS_UNCONNECTED) || !(sock->flags & __SO_ACCEPTCON)) return (-EINVAL); if(sock->type != SOCK_SEQPACKET) return (-EOPNOTSUPP); diff --git a/net/khttpd/security.c b/net/khttpd/security.c index 7e0780a26..16503ceb5 100644 --- a/net/khttpd/security.c +++ b/net/khttpd/security.c @@ -115,14 +115,12 @@ struct file *OpenFileForSecurity(char *Filename) lock_kernel(); - filp = filp_open(Filename, 0, O_RDONLY, NULL); + filp = filp_open(Filename, O_RDONLY, 0); unlock_kernel(); - if ((IS_ERR(filp))||(filp==NULL)||(filp->f_dentry==NULL)) - { + if (IS_ERR(filp)) return NULL; - } #ifndef BENCHMARK permission = filp->f_dentry->d_inode->i_mode; diff --git a/net/khttpd/sockets.c b/net/khttpd/sockets.c index 60e66fdf8..74bfe614d 100644 --- a/net/khttpd/sockets.c +++ b/net/khttpd/sockets.c @@ -79,7 +79,6 @@ int StartListening(const int Port) error=sock->ops->listen(sock,48); if (error!=0) (void)printk(KERN_ERR "kHTTPd: Error listening on socket \n"); - sock->flags |= SO_ACCEPTCON; MainSocket = sock; diff --git a/net/khttpd/waitheaders.c b/net/khttpd/waitheaders.c index 47fa1581d..2c24f3744 100644 --- a/net/khttpd/waitheaders.c +++ b/net/khttpd/waitheaders.c @@ -239,7 +239,6 @@ static int DecodeHeader(const int CPUNR, struct http_request *Request) return 0; } else - if ((Request->filp->f_dentry!=NULL)&&(Request->filp->f_dentry->d_inode!=NULL)) { Request->FileLength = (int)Request->filp->f_dentry->d_inode->i_size; Request->Time = Request->filp->f_dentry->d_inode->i_mtime; @@ -262,12 +261,6 @@ static int DecodeHeader(const int CPUNR, struct http_request *Request) } - } else - { - /* Ehhh... 
*/ - - printk(KERN_CRIT "kHTTPd: Unexpected filesystem response\n"); - return -1; } LeaveFunction("DecodeHeader"); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 0136d15c2..b76a07274 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -436,7 +436,7 @@ retry: if (signal_pending(current)) { kfree_skb(skb); - return -ERESTARTSYS; + return sock_intr_errno(timeo); } goto retry; } diff --git a/net/netsyms.c b/net/netsyms.c index c6745cafe..9a7030d7e 100644 --- a/net/netsyms.c +++ b/net/netsyms.c @@ -340,7 +340,6 @@ EXPORT_SYMBOL(tcp_sendmsg); EXPORT_SYMBOL(tcp_v4_rebuild_header); EXPORT_SYMBOL(tcp_v4_send_check); EXPORT_SYMBOL(tcp_v4_conn_request); -EXPORT_SYMBOL(tcp_v4_send_reset); EXPORT_SYMBOL(tcp_create_openreq_child); EXPORT_SYMBOL(tcp_bucket_create); EXPORT_SYMBOL(__tcp_put_port); @@ -596,6 +595,51 @@ EXPORT_SYMBOL(nf_setsockopt); EXPORT_SYMBOL(nf_getsockopt); #endif +#ifdef CONFIG_IP_NF_CONNTRACK +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/ip_conntrack_core.h> +EXPORT_SYMBOL(ip_conntrack_protocol_register); +EXPORT_SYMBOL(invert_tuplepr); +EXPORT_SYMBOL(ip_conntrack_alter_reply); +EXPORT_SYMBOL(ip_conntrack_destroyed); +EXPORT_SYMBOL(ip_conntrack_get); +EXPORT_SYMBOL(ip_conntrack_module); +EXPORT_SYMBOL(ip_conntrack_helper_register); +EXPORT_SYMBOL(ip_conntrack_helper_unregister); +EXPORT_SYMBOL(ip_ct_selective_cleanup); +EXPORT_SYMBOL(ip_ct_refresh); +EXPORT_SYMBOL(ip_conntrack_expect_related); +EXPORT_SYMBOL(ip_conntrack_tuple_taken); +EXPORT_SYMBOL(ip_ct_gather_frags); +#ifdef CONFIG_IP_NF_FTP +#include <linux/netfilter_ipv4/ip_conntrack_ftp.h> +EXPORT_SYMBOL(ip_ftp_lock); +#endif +#endif /*CONFIG_IP_NF_CONNTRACK*/ + +#ifdef CONFIG_IP_NF_NAT +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_helper.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +EXPORT_SYMBOL(ip_nat_setup_info); 
+EXPORT_SYMBOL(ip_nat_helper_register); +EXPORT_SYMBOL(ip_nat_helper_unregister); +EXPORT_SYMBOL(ip_nat_expect_register); +EXPORT_SYMBOL(ip_nat_expect_unregister); +EXPORT_SYMBOL(ip_nat_cheat_check); +#endif + +#ifdef CONFIG_IP_NF_IPTABLES +#include <linux/netfilter_ipv4/ip_tables.h> +EXPORT_SYMBOL(ipt_register_table); +EXPORT_SYMBOL(ipt_unregister_table); +EXPORT_SYMBOL(ipt_register_target); +EXPORT_SYMBOL(ipt_unregister_target); +EXPORT_SYMBOL(ipt_register_match); +EXPORT_SYMBOL(ipt_unregister_match); +#endif + EXPORT_SYMBOL(register_gifconf); EXPORT_SYMBOL(net_call_rx_atomic); diff --git a/net/socket.c b/net/socket.c index edaf48a3b..fb5158241 100644 --- a/net/socket.c +++ b/net/socket.c @@ -690,20 +690,17 @@ int sock_wake_async(struct socket *sock, int how, int band) switch (how) { case 1: - if (sock->flags & SO_WAITDATA) + + if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags)) break; goto call_kill; case 2: - if (!(sock->flags & SO_NOSPACE)) + if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags)) break; - sock->flags &= ~SO_NOSPACE; /* fall through */ case 0: call_kill: - /* read_lock(&sock->sk->callback_lock); */ - if(sock->fasync_list != NULL) - kill_fasync(sock->fasync_list, SIGIO, band); - /* read_unlock(&sock->sk->callback_lock); */ + kill_fasync(sock->fasync_list, SIGIO, band); break; case 3: kill_fasync(sock->fasync_list, SIGURG, band); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 806e14bce..c41dfc1eb 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -32,6 +32,8 @@ #include <linux/sunrpc/clnt.h> +#include <linux/nfs.h> + #define RPC_SLACK_SPACE 1024 /* total overkill */ @@ -78,6 +80,7 @@ rpc_create_client(struct rpc_xprt *xprt, char *servname, #ifdef RPC_DEBUG rpc_register_sysctl(); #endif + xdr_init(); if (!xprt) goto out; @@ -198,7 +201,6 @@ rpc_release_client(struct rpc_clnt *clnt) static void rpc_default_callback(struct rpc_task *task) { - rpc_release_task(task); } /* @@ -263,9 +265,10 @@ int rpc_call_sync(struct rpc_clnt *clnt, 
struct rpc_message *msg, int flags) /* Set up the call info struct and execute the task */ if (task->tk_status == 0) status = rpc_execute(task); - else + else { status = task->tk_status; - rpc_release_task(task); + rpc_release_task(task); + } rpc_clnt_sigunmask(clnt, &oldset); @@ -344,10 +347,9 @@ rpc_call_setup(struct rpc_task *task, struct rpc_message *msg, int flags) void rpc_restart_call(struct rpc_task *task) { - if (task->tk_flags & RPC_TASK_KILLED) { - rpc_release_task(task); + if (RPC_ASSASSINATED(task)) return; - } + task->tk_action = call_reserve; rpcproc_count(task->tk_client, task->tk_msg.rpc_proc)++; } @@ -715,7 +717,7 @@ call_decode(struct rpc_task *task) * The following is an NFS-specific hack to cater for setuid * processes whose uid is mapped to nobody on the server. */ - if (task->tk_client->cl_prog == 100003 && + if (task->tk_client->cl_prog == NFS_PROGRAM && (ntohl(*p) == NFSERR_ACCES || ntohl(*p) == NFSERR_PERM)) { if (RPC_IS_SETUID(task) && task->tk_suid_retry) { dprintk("RPC: %4d retry squashed uid\n", task->tk_pid); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index bfbfc1580..da46ab910 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -508,6 +508,7 @@ __rpc_execute(struct rpc_task *task) return 0; } + restarted: while (1) { /* * Execute any pending callback. 
@@ -586,10 +587,29 @@ __rpc_execute(struct rpc_task *task) } } + if (task->tk_exit) { + task->tk_exit(task); + /* If tk_action is non-null, the user wants us to restart */ + if (task->tk_action) { + if (!RPC_ASSASSINATED(task)) { + /* Release RPC slot and buffer memory */ + if (task->tk_rqstp) + xprt_release(task); + if (task->tk_buffer) { + rpc_free(task->tk_buffer); + task->tk_buffer = NULL; + } + goto restarted; + } + printk(KERN_ERR "RPC: dead task tries to walk away.\n"); + } + } + dprintk("RPC: %4d exit() = %d\n", task->tk_pid, task->tk_status); status = task->tk_status; - if (task->tk_exit) - task->tk_exit(task); + + /* Release all resources associated with the task */ + rpc_release_task(task); return status; } @@ -599,22 +619,32 @@ __rpc_execute(struct rpc_task *task) * * This may be called recursively if e.g. an async NFS task updates * the attributes and finds that dirty pages must be flushed. + * NOTE: Upon exit of this function the task is guaranteed to be + * released. In particular note that tk_release() will have + * been called, so your task memory may have been freed. 
*/ int rpc_execute(struct rpc_task *task) { + int status = -EIO; if (rpc_inhibit) { printk(KERN_INFO "RPC: execution inhibited!\n"); - return -EIO; + goto out_release; } - task->tk_flags |= RPC_TASK_RUNNING; + + status = -EWOULDBLOCK; if (task->tk_active) { printk(KERN_ERR "RPC: active task was run twice!\n"); - return -EWOULDBLOCK; + goto out_err; } + task->tk_active = 1; - + task->tk_flags |= RPC_TASK_RUNNING; return __rpc_execute(task); + out_release: + rpc_release_task(task); + out_err: + return status; } /* @@ -700,7 +730,7 @@ rpc_allocate(unsigned int flags, unsigned int size) } if (flags & RPC_TASK_ASYNC) return NULL; - current->state = TASK_INTERRUPTIBLE; + set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(HZ>>4); } while (!signalled()); @@ -758,6 +788,13 @@ rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, current->pid); } +static void +rpc_default_free_task(struct rpc_task *task) +{ + dprintk("RPC: %4d freeing task\n", task->tk_pid); + rpc_free(task); +} + /* * Create a new task for the specified client. 
We have to * clean up after an allocation failure, as the client may @@ -774,6 +811,9 @@ rpc_new_task(struct rpc_clnt *clnt, rpc_action callback, int flags) rpc_init_task(task, clnt, callback, flags); + /* Replace tk_release */ + task->tk_release = rpc_default_free_task; + dprintk("RPC: %4d allocated task\n", task->tk_pid); task->tk_flags |= RPC_TASK_DYNAMIC; out: @@ -849,12 +889,8 @@ rpc_release_task(struct rpc_task *task) #ifdef RPC_DEBUG task->tk_magic = 0; #endif - - if (task->tk_flags & RPC_TASK_DYNAMIC) { - dprintk("RPC: %4d freeing task\n", task->tk_pid); - task->tk_flags &= ~RPC_TASK_DYNAMIC; - rpc_free(task); - } + if (task->tk_release) + task->tk_release(task); } /* @@ -886,7 +922,6 @@ rpc_child_exit(struct rpc_task *child) __rpc_wake_up(parent); } spin_unlock_bh(&rpc_queue_lock); - rpc_release_task(child); } /* @@ -1028,7 +1063,7 @@ rpciod_killall(void) __rpc_schedule(); if (all_tasks) { dprintk("rpciod_killall: waiting for tasks to exit\n"); - current->state = TASK_INTERRUPTIBLE; + set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(1); } } @@ -1099,7 +1134,7 @@ rpciod_down(void) * wait briefly before checking the process id. */ current->sigpending = 0; - current->state = TASK_INTERRUPTIBLE; + set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(1); /* * Display a message if we're going to wait longer. 
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 92559fa65..36da3b619 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -27,7 +27,6 @@ EXPORT_SYMBOL(rpc_allocate); EXPORT_SYMBOL(rpc_free); EXPORT_SYMBOL(rpc_execute); EXPORT_SYMBOL(rpc_init_task); -EXPORT_SYMBOL(rpc_release_task); EXPORT_SYMBOL(rpc_sleep_on); EXPORT_SYMBOL(rpc_wake_up_next); EXPORT_SYMBOL(rpc_wake_up_task); @@ -89,12 +88,15 @@ EXPORT_SYMBOL(svc_proc_read); #endif /* Generic XDR */ +EXPORT_SYMBOL(xdr_encode_array); EXPORT_SYMBOL(xdr_encode_string); EXPORT_SYMBOL(xdr_decode_string); EXPORT_SYMBOL(xdr_decode_netobj); EXPORT_SYMBOL(xdr_encode_netobj); EXPORT_SYMBOL(xdr_zero); EXPORT_SYMBOL(xdr_one); +EXPORT_SYMBOL(xdr_shift_iovec); +EXPORT_SYMBOL(xdr_zero_iovec); /* RPC errors */ EXPORT_SYMBOL(rpc_success); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 97e323d0c..d99033fa5 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -769,7 +769,7 @@ again: * We have to be able to interrupt this wait * to bring down the daemons ... 
*/ - current->state = TASK_INTERRUPTIBLE; + set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&rqstp->rq_wait, &wait); spin_unlock_bh(&serv->sv_lock); @@ -940,7 +940,6 @@ svc_create_socket(struct svc_serv *serv, int protocol, struct sockaddr_in *sin) if (protocol == IPPROTO_TCP) { if ((error = sock->ops->listen(sock, 5)) < 0) goto bummer; - sock->flags |= SO_ACCEPTCON; } if ((svsk = svc_setup_socket(serv, sock, &error, 1)) != NULL) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 6ebd94079..99b286af9 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -9,6 +9,7 @@ #include <linux/types.h> #include <linux/socket.h> #include <linux/string.h> +#include <linux/kernel.h> #include <linux/in.h> #include <linux/sunrpc/xdr.h> #include <linux/sunrpc/msg_prot.h> @@ -56,8 +57,8 @@ xdr_encode_netobj(u32 *p, const struct xdr_netobj *obj) { unsigned int quadlen = XDR_QUADLEN(obj->len); + p[quadlen] = 0; /* zero trailing bytes */ *p++ = htonl(obj->len); - p[quadlen-1] = 0; /* zero trailing bytes */ memcpy(p, obj->data, obj->len); return p + XDR_QUADLEN(obj->len); } @@ -84,18 +85,23 @@ xdr_decode_netobj(u32 *p, struct xdr_netobj *obj) } u32 * -xdr_encode_string(u32 *p, const char *string) +xdr_encode_array(u32 *p, const char *array, unsigned int len) { - int len = strlen(string); int quadlen = XDR_QUADLEN(len); p[quadlen] = 0; *p++ = htonl(len); - memcpy(p, string, len); + memcpy(p, array, len); return p + quadlen; } u32 * +xdr_encode_string(u32 *p, const char *string) +{ + return xdr_encode_array(p, string, strlen(string)); +} + +u32 * xdr_decode_string(u32 *p, char **sp, int *lenp, int maxlen) { unsigned int len; @@ -116,3 +122,51 @@ xdr_decode_string(u32 *p, char **sp, int *lenp, int maxlen) return p + XDR_QUADLEN(len); } +/* + * Realign the iovec if the server missed out some reply elements + * (such as post-op attributes,...) + * Note: This is a simple implementation that assumes that + * len <= iov->iov_len !!! 
+ * The RPC header (assumed to be the 1st element in the iov array) + * is not shifted. + */ +void xdr_shift_iovec(struct iovec *iov, int nr, size_t len) +{ + struct iovec *pvec; + + for (pvec = iov + nr - 1; nr > 1; nr--, pvec--) { + struct iovec *svec = pvec - 1; + + if (len > pvec->iov_len) { + printk(KERN_DEBUG "RPC: Urk! Large shift of short iovec.\n"); + return; + } + memmove((char *)pvec->iov_base + len, pvec->iov_base, + pvec->iov_len - len); + + if (len > svec->iov_len) { + printk(KERN_DEBUG "RPC: Urk! Large shift of short iovec.\n"); + return; + } + memcpy(pvec->iov_base, + (char *)svec->iov_base + svec->iov_len - len, len); + } +} + +/* + * Zero the last n bytes in an iovec array of 'nr' elements + */ +void xdr_zero_iovec(struct iovec *iov, int nr, size_t n) +{ + struct iovec *pvec; + + for (pvec = iov + nr - 1; n && nr > 0; nr--, pvec--) { + if (n < pvec->iov_len) { + memset((char *)pvec->iov_base + pvec->iov_len - n, 0, n); + n = 0; + } else { + memset(pvec->iov_base, 0, pvec->iov_len); + n -= pvec->iov_len; + } + } +} diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 06d682223..b353aa37a 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -48,6 +48,7 @@ #include <linux/version.h> #include <linux/types.h> #include <linux/malloc.h> +#include <linux/capability.h> #include <linux/sched.h> #include <linux/errno.h> #include <linux/socket.h> @@ -227,7 +228,7 @@ xprt_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req) */ break; case -EAGAIN: - if (sock->flags & SO_NOSPACE) + if (test_bit(SOCK_NOSPACE, &sock->flags)) result = -ENOMEM; break; case -ENOTCONN: @@ -1569,8 +1570,8 @@ xprt_create_socket(int proto, struct rpc_timeout *to) goto failed; } - /* If the caller has root privs, bind to a reserved port */ - if (!current->fsuid && xprt_bindresvport(sock) < 0) + /* If the caller has the capability, bind to a reserved port */ + if (capable(CAP_NET_BIND_SERVICE) && xprt_bindresvport(sock) < 0) goto failed; return sock; diff --git 
a/net/unix/af_unix.c b/net/unix/af_unix.c index cbe730b5d..12a4b1eb3 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -8,7 +8,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Version: $Id: af_unix.c,v 1.90 2000/03/16 20:38:45 davem Exp $ + * Version: $Id: af_unix.c,v 1.91 2000/03/25 01:55:34 davem Exp $ * * Fixes: * Linus Torvalds : Assorted bug cures. @@ -297,9 +297,10 @@ static __inline__ int unix_writable(struct sock *sk) static void unix_write_space(struct sock *sk) { read_lock(&sk->callback_lock); - if (!sk->dead && unix_writable(sk)) { - wake_up_interruptible(sk->sleep); - sock_wake_async(sk->socket, 2, POLL_OUT); + if (unix_writable(sk)) { + if (sk->sleep && waitqueue_active(sk->sleep)) + wake_up_interruptible(sk->sleep); + sk_wake_async(sk, 2, POLL_OUT); } read_unlock(&sk->callback_lock); } @@ -356,8 +357,10 @@ static int unix_release_sock (unix_socket *sk, int embrion) if (!skb_queue_empty(&sk->receive_queue) || embrion) skpair->err = ECONNRESET; unix_state_wunlock(skpair); - sk->state_change(skpair); - sock_wake_async(sk->socket,1,POLL_HUP); + skpair->state_change(skpair); + read_lock(&skpair->callback_lock); + sk_wake_async(skpair,1,POLL_HUP); + read_unlock(&skpair->callback_lock); } sock_put(skpair); /* It may now die */ unix_peer(sk) = NULL; @@ -418,7 +421,6 @@ static int unix_listen(struct socket *sock, int backlog) wake_up_interruptible_all(&sk->protinfo.af_unix.peer_wait); sk->max_ack_backlog=backlog; sk->state=TCP_LISTEN; - sock->flags |= SO_ACCEPTCON; /* set credentials so connect can copy them */ sk->peercred.pid = current->pid; sk->peercred.uid = current->euid; @@ -562,39 +564,51 @@ static unix_socket *unix_find_other(struct sockaddr_un *sunname, int len, int type, unsigned hash, int *error) { unix_socket *u; + struct dentry *dentry; + int err; - if (sunname->sun_path[0]) - { - struct dentry *dentry; - + if (sunname->sun_path[0]) { /* Do not believe to 
VFS, grab kernel lock */ lock_kernel(); - dentry = __open_namei(sunname->sun_path, 2|O_NOFOLLOW, S_IFSOCK, NULL); + dentry = lookup_dentry(sunname->sun_path,LOOKUP_POSITIVE); + err = PTR_ERR(dentry); if (IS_ERR(dentry)) { - *error = PTR_ERR(dentry); unlock_kernel(); - return NULL; + goto fail; } + err = permission(dentry->d_inode,MAY_WRITE); + if (err) + goto put_fail; + + err = -ECONNREFUSED; + if (!S_ISSOCK(dentry->d_inode->i_mode)) + goto put_fail; u=unix_find_socket_byinode(dentry->d_inode); + if (!u) + goto put_fail; + dput(dentry); unlock_kernel(); - if (u && u->type != type) - { - *error=-EPROTOTYPE; + err=-EPROTOTYPE; + if (u->type != type) { sock_put(u); - return NULL; + goto fail; } - } - else + } else { + err = -ECONNREFUSED; u=unix_find_socket_byname(sunname, len, type, hash); - - if (u==NULL) - { - *error=-ECONNREFUSED; - return NULL; + if (!u) + goto fail; } return u; + +put_fail: + dput(dentry); + unlock_kernel(); +fail: + *error=err; + return NULL; } @@ -827,7 +841,7 @@ restart: timeo = unix_wait_for_peer(other, timeo); - err = -ERESTARTSYS; + err = sock_intr_errno(timeo); if (signal_pending(current)) goto out; sock_put(other); @@ -1156,7 +1170,7 @@ restart: timeo = unix_wait_for_peer(other, timeo); - err = -ERESTARTSYS; + err = sock_intr_errno(timeo); if (signal_pending(current)) goto out_free; @@ -1228,8 +1242,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len, * much. 
*/ - if (size > 4096-16) - limit = 4096-16; /* Fall back to a page if we can't grab a big buffer this instant */ + if (size > PAGE_SIZE-16) + limit = PAGE_SIZE-16; /* Fall back to a page if we can't grab a big buffer this instant */ else limit = 0; /* Otherwise just grab and wait */ @@ -1383,11 +1397,11 @@ static long unix_stream_data_wait(unix_socket * sk, long timeo) !timeo) break; - sk->socket->flags |= SO_WAITDATA; + set_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags); unix_state_runlock(sk); timeo = schedule_timeout(timeo); unix_state_rlock(sk); - sk->socket->flags &= ~SO_WAITDATA; + clear_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags); } __set_current_state(TASK_RUNNING); @@ -1455,7 +1469,7 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, int size timeo = unix_stream_data_wait(sk, timeo); if (signal_pending(current)) { - err = -ERESTARTSYS; + err = sock_intr_errno(timeo); goto out; } down(&sk->protinfo.af_unix.readsem); @@ -1556,10 +1570,12 @@ static int unix_shutdown(struct socket *sock, int mode) other->shutdown |= peer_mode; unix_state_wunlock(other); other->state_change(other); + read_lock(&other->callback_lock); if (peer_mode == SHUTDOWN_MASK) - sock_wake_async(other->socket,1,POLL_HUP); + sk_wake_async(other,1,POLL_HUP); else if (peer_mode & RCV_SHUTDOWN) - sock_wake_async(other->socket,1,POLL_IN); + sk_wake_async(other,1,POLL_IN); + read_unlock(&other->callback_lock); } if (other) sock_put(other); @@ -1658,7 +1674,7 @@ static int unix_read_proc(char *buffer, char **start, off_t offset, s, atomic_read(&s->refcnt), 0, - s->state == TCP_LISTEN ? SO_ACCEPTCON : 0, + s->state == TCP_LISTEN ? __SO_ACCEPTCON : 0, s->type, s->socket ? (s->state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : |