Diffstat (limited to 'net')
-rw-r--r--  net/Config.in | 4
-rw-r--r--  net/appletalk/ddp.c | 1
-rw-r--r--  net/ax25/af_ax25.c | 2
-rw-r--r--  net/ax25/ax25_ip.c | 22
-rw-r--r--  net/core/datagram.c | 62
-rw-r--r--  net/core/dev.c | 99
-rw-r--r--  net/core/iovec.c | 2
-rw-r--r--  net/core/neighbour.c | 8
-rw-r--r--  net/core/rtnetlink.c | 63
-rw-r--r--  net/core/scm.c | 8
-rw-r--r--  net/core/skbuff.c | 12
-rw-r--r--  net/core/sock.c | 47
-rw-r--r--  net/ipv4/af_inet.c | 31
-rw-r--r--  net/ipv4/arp.c | 25
-rw-r--r--  net/ipv4/devinet.c | 21
-rw-r--r--  net/ipv4/fib_frontend.c | 37
-rw-r--r--  net/ipv4/fib_hash.c | 4
-rw-r--r--  net/ipv4/fib_rules.c | 20
-rw-r--r--  net/ipv4/fib_semantics.c | 98
-rw-r--r--  net/ipv4/icmp.c | 24
-rw-r--r--  net/ipv4/igmp.c | 10
-rw-r--r--  net/ipv4/ip_forward.c | 48
-rw-r--r--  net/ipv4/ip_fragment.c | 2
-rw-r--r--  net/ipv4/ip_fw.c | 26
-rw-r--r--  net/ipv4/ip_gre.c | 6
-rw-r--r--  net/ipv4/ip_input.c | 89
-rw-r--r--  net/ipv4/ip_nat_dumb.c | 40
-rw-r--r--  net/ipv4/ip_options.c | 8
-rw-r--r--  net/ipv4/ip_output.c | 63
-rw-r--r--  net/ipv4/ip_sockglue.c | 46
-rw-r--r--  net/ipv4/ipip.c | 4
-rw-r--r--  net/ipv4/ipmr.c | 51
-rw-r--r--  net/ipv4/proc.c | 11
-rw-r--r--  net/ipv4/raw.c | 39
-rw-r--r--  net/ipv4/route.c | 105
-rw-r--r--  net/ipv4/tcp.c | 103
-rw-r--r--  net/ipv4/tcp_input.c | 144
-rw-r--r--  net/ipv4/tcp_ipv4.c | 246
-rw-r--r--  net/ipv4/tcp_output.c | 115
-rw-r--r--  net/ipv4/tcp_timer.c | 6
-rw-r--r--  net/ipv4/timer.c | 2
-rw-r--r--  net/ipv4/udp.c | 207
-rw-r--r--  net/ipv6/addrconf.c | 46
-rw-r--r--  net/ipv6/af_inet6.c | 52
-rw-r--r--  net/ipv6/datagram.c | 138
-rw-r--r--  net/ipv6/exthdrs.c | 670
-rw-r--r--  net/ipv6/icmp.c | 201
-rw-r--r--  net/ipv6/ip6_fib.c | 1199
-rw-r--r--  net/ipv6/ip6_fw.c | 16
-rw-r--r--  net/ipv6/ip6_input.c | 244
-rw-r--r--  net/ipv6/ip6_output.c | 451
-rw-r--r--  net/ipv6/ipv6_sockglue.c | 145
-rw-r--r--  net/ipv6/mcast.c | 88
-rw-r--r--  net/ipv6/ndisc.c | 104
-rw-r--r--  net/ipv6/proc.c | 106
-rw-r--r--  net/ipv6/raw.c | 75
-rw-r--r--  net/ipv6/reassembly.c | 358
-rw-r--r--  net/ipv6/route.c | 1198
-rw-r--r--  net/ipv6/sit.c | 14
-rw-r--r--  net/ipv6/tcp_ipv6.c | 401
-rw-r--r--  net/ipv6/udp.c | 276
-rw-r--r--  net/ipx/af_ipx.c | 1
-rw-r--r--  net/ipx/af_spx.c | 1
-rw-r--r--  net/netlink/af_netlink.c | 265
-rw-r--r--  net/netlink/netlink_dev.c | 2
-rw-r--r--  net/netrom/af_netrom.c | 2
-rw-r--r--  net/netsyms.c | 1
-rw-r--r--  net/rose/af_rose.c | 2
-rw-r--r--  net/sched/cls_api.c | 2
-rw-r--r--  net/sched/cls_rsvp.h | 3
-rw-r--r--  net/sched/sch_api.c | 24
-rw-r--r--  net/socket.c | 27
-rw-r--r--  net/sunrpc/clnt.c | 56
-rw-r--r--  net/sunrpc/sched.c | 59
-rw-r--r--  net/sunrpc/sunrpc_syms.c | 2
-rw-r--r--  net/sunrpc/xprt.c | 96
-rw-r--r--  net/unix/af_unix.c | 255
-rw-r--r--  net/wanrouter/wanmain.c | 15
-rw-r--r--  net/wanrouter/wanproc.c | 11
-rw-r--r--  net/x25/af_x25.c | 2
80 files changed, 4952 insertions, 3617 deletions
diff --git a/net/Config.in b/net/Config.in
index f1ed3f79d..b64570308 100644
--- a/net/Config.in
+++ b/net/Config.in
@@ -17,7 +17,9 @@ bool 'TCP/IP networking' CONFIG_INET
if [ "$CONFIG_INET" = "y" ]; then
source net/ipv4/Config.in
if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
- tristate 'The IPv6 protocol (EXPERIMENTAL)' CONFIG_IPV6
+# Sorry, but IPv6 as a module is still invalid.
+# tristate 'The IPv6 protocol (EXPERIMENTAL)' CONFIG_IPV6
+ bool 'The IPv6 protocol (EXPERIMENTAL)' CONFIG_IPV6
if [ "$CONFIG_IPV6" != "n" ]; then
source net/ipv6/Config.in
fi
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 331f3eb7b..bcfe9e4de 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1017,7 +1017,6 @@ static int atalk_create(struct socket *sock, int protocol)
sk->destruct = NULL;
/* Checksums on by default */
- sk->mtu = DDP_MAXSZ;
sk->zapped = 1;
return (0);
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 71999a416..cd84989a6 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -849,7 +849,6 @@ int ax25_create(struct socket *sock, int protocol)
sk->destruct = ax25_free_sock;
sock->ops = &ax25_proto_ops;
sk->protocol = protocol;
- sk->mtu = AX25_MTU; /* 256 */
ax25->sk = sk;
sk->protinfo.ax25 = ax25;
@@ -892,7 +891,6 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev)
sk->sndbuf = osk->sndbuf;
sk->debug = osk->debug;
sk->state = TCP_ESTABLISHED;
- sk->mtu = osk->mtu;
sk->sleep = osk->sleep;
sk->zapped = osk->zapped;
diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c
index c285b4641..b5d5f071e 100644
--- a/net/ax25/ax25_ip.c
+++ b/net/ax25/ax25_ip.c
@@ -106,7 +106,7 @@ int ax25_rebuild_header(struct sk_buff *skb)
{
struct sk_buff *ourskb;
unsigned char *bp = skb->data;
- struct device *dev = skb->dev;
+ struct device *dev;
ax25_address *src, *dst;
ax25_route *route;
ax25_dev *ax25_dev;
@@ -117,10 +117,14 @@ int ax25_rebuild_header(struct sk_buff *skb)
if (arp_find(bp + 1, skb))
return 1;
- if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL)
- return 1;
+ route = ax25_rt_find_route(dst, NULL);
+ dev = route->dev;
- route = ax25_rt_find_route(dst, dev);
+ if (dev == NULL)
+ dev = skb->dev;
+
+ if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL)
+ return 1;
if (bp[16] == AX25_P_IP) {
if (route->ip_mode == 'V' || (route->ip_mode == ' ' && ax25_dev->values[AX25_VALUES_IPDEFMODE])) {
@@ -139,6 +143,10 @@ int ax25_rebuild_header(struct sk_buff *skb)
* instead of using skb_clone() unless this
* gets fixed.
*/
+
+ ax25_address src_c;
+ ax25_address dst_c;
+
if ((ourskb = skb_copy(skb, GFP_ATOMIC)) == NULL) {
kfree_skb(skb);
return 1;
@@ -149,9 +157,13 @@ int ax25_rebuild_header(struct sk_buff *skb)
kfree_skb(skb);
+ src_c = *src;
+ dst_c = *dst;
+
skb_pull(ourskb, AX25_HEADER_LEN - 1); /* Keep PID */
- ax25_send_frame(ourskb, ax25_dev->values[AX25_VALUES_PACLEN], src, dst, route->digipeat, dev);
+ ax25_send_frame(ourskb, ax25_dev->values[AX25_VALUES_PACLEN], &src_c,
+&dst_c, route->digipeat, dev);
return 1;
}
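
The ax25_ip.c change above is a use-after-free fix: src and dst point into skb->data, and the old code handed them to ax25_send_frame() after kfree_skb(skb) had already released that memory. The fix copies the addresses by value (src_c, dst_c) before the free. A minimal userspace model of the same idiom, with invented names (addr, buffer, send_frame):

#include <stdlib.h>

struct addr   { unsigned char call[7]; };
struct buffer { unsigned char data[256]; };

static void send_frame(const struct addr *s, const struct addr *d)
{
	(void)s; (void)d;		/* stub transmit */
}

static void forward(struct buffer *b)
{
	/* src/dst alias into b->data, exactly as src/dst in
	   ax25_rebuild_header() alias into skb->data */
	struct addr *src = (struct addr *)(b->data + 8);
	struct addr *dst = (struct addr *)(b->data + 1);

	struct addr src_c = *src;	/* copy by value first... */
	struct addr dst_c = *dst;

	free(b);			/* ...so the free is now safe */
	send_frame(&src_c, &dst_c);	/* src/dst here would read freed memory */
}
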
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 186ccf81b..f064370d4 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -54,15 +54,16 @@
static inline void wait_for_packet(struct sock * sk)
{
- unsigned long flags;
+ struct wait_queue wait = { current, NULL };
+
+ add_wait_queue(sk->sleep, &wait);
+ current->state = TASK_INTERRUPTIBLE;
- release_sock(sk);
- save_flags(flags);
- cli();
if (skb_peek(&sk->receive_queue) == NULL)
- interruptible_sleep_on(sk->sleep);
- restore_flags(flags);
- lock_sock(sk);
+ schedule();
+
+ current->state = TASK_RUNNING;
+ remove_wait_queue(sk->sleep, &wait);
}
/*
@@ -84,6 +85,14 @@ static inline int connection_based(struct sock *sk)
* This function will lock the socket if a skb is returned, so the caller
* needs to unlock the socket in that case (usually by calling skb_free_datagram)
*
+ * *	It does not lock the socket any more. This function is
+ * *	free of race conditions. This measure should/can significantly
+ * *	improve datagram socket latencies at high loads, when copying
+ * *	data to user space takes lots of time.
+ * * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
+ * * 8) Great win.)
+ * * --ANK (980729)
+ *
* The order of the tests when we find no data waiting are specified
* quite explicitly by POSIX 1003.1g, don't change them without having
* the standard around please.
@@ -94,7 +103,6 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock,
int error;
struct sk_buff *skb;
- lock_sock(sk);
restart:
while(skb_queue_empty(&sk->receive_queue)) /* No data */
{
@@ -129,13 +137,24 @@ restart:
will suddenly eat the receive_queue */
if (flags & MSG_PEEK)
{
- unsigned long flags;
- save_flags(flags);
- cli();
+ unsigned long cpu_flags;
+
+			/* This is the only POTENTIAL race condition
+			   in this function. An skb may be stolen by
+			   another receiver after the peek, but before
+			   the use count is incremented, provided the
+			   kernel is reentrant (it is not) or this
+			   function is called from interrupts.
+
+			   Protect it with the global skb spinlock,
+			   though for now even this is overkill.
+ --ANK (980728)
+ */
+ spin_lock_irqsave(&skb_queue_lock, cpu_flags);
skb = skb_peek(&sk->receive_queue);
if(skb!=NULL)
atomic_inc(&skb->users);
- restore_flags(flags);
+ spin_unlock_irqrestore(&skb_queue_lock, cpu_flags);
} else
skb = skb_dequeue(&sk->receive_queue);
@@ -144,7 +163,6 @@ restart:
return skb;
no_packet:
- release_sock(sk);
*err = error;
return NULL;
}
@@ -152,7 +170,6 @@ no_packet:
void skb_free_datagram(struct sock * sk, struct sk_buff *skb)
{
kfree_skb(skb);
- release_sock(sk);
}
/*
@@ -184,6 +201,10 @@ int skb_copy_datagram_iovec(struct sk_buff *skb, int offset, struct iovec *to,
* Datagram poll: Again totally generic. This also handles
* sequenced packet sockets providing the socket receive queue
* is only ever holding data ready to receive.
+ *
+ * Note: when you _don't_ use this routine for this protocol,
+ * and you use a different write policy from sock_writeable()
+ * then please supply your own write_space callback.
*/
unsigned int datagram_poll(struct file * file, struct socket *sock, poll_table *wait)
@@ -199,7 +220,7 @@ unsigned int datagram_poll(struct file * file, struct socket *sock, poll_table *
mask |= POLLERR;
if (sk->shutdown & RCV_SHUTDOWN)
mask |= POLLHUP;
-
+
/* readable? */
if (!skb_queue_empty(&sk->receive_queue))
mask |= POLLIN | POLLRDNORM;
@@ -214,15 +235,8 @@ unsigned int datagram_poll(struct file * file, struct socket *sock, poll_table *
}
/* writable? */
- if (!(sk->shutdown & SEND_SHUTDOWN)) {
- if (sk->prot) {
- if (sock_wspace(sk) >= MIN_WRITE_SPACE)
- mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
- } else {
- if (sk->sndbuf - atomic_read(&sk->wmem_alloc) >= MIN_WRITE_SPACE)
- mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
- }
- }
+ if (sock_writeable(sk))
+ mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
return mask;
}
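
The rewritten wait_for_packet() above replaces cli()/interruptible_sleep_on() with the canonical wait-queue sequence, and the ordering is the whole point: the task registers on the queue and marks itself TASK_INTERRUPTIBLE before re-testing the receive queue, so a wakeup that arrives between the test and schedule() simply makes schedule() return at once instead of being lost. The pattern from the hunk, annotated:

struct wait_queue wait = { current, NULL };

add_wait_queue(sk->sleep, &wait);		/* 1. become visible to wakers */
current->state = TASK_INTERRUPTIBLE;		/* 2. mark ourselves as sleeping */

if (skb_peek(&sk->receive_queue) == NULL)	/* 3. re-check the condition */
	schedule();				/* 4. a wakeup between steps 2 and 4
						      is not lost: it resets the task
						      state and schedule() returns */

current->state = TASK_RUNNING;
remove_wait_queue(sk->sleep, &wait);
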
diff --git a/net/core/dev.c b/net/core/dev.c
index bd414c794..045fd0f92 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -16,6 +16,7 @@
* Alan Cox <gw4pts@gw4pts.ampr.org>
* David Hinds <dhinds@allegro.stanford.edu>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
+ * Adam Sulmicki <adam@cfar.umd.edu>
*
* Changes:
* Alan Cox : device private ioctl copies fields back.
@@ -51,7 +52,10 @@
* Andi Kleen : Fix error reporting for SIOCGIFCONF
* Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
* Cyrus Durgin : Cleaned for KMOD
- *
+ * Adam Sulmicki : Bug Fix : Network Device Unload
+ * A network device unload needs to purge
+ * the backlog queue.
+ * Paul Rusty Russel : SIOCSIFNAME
*/
#include <asm/uaccess.h>
@@ -154,6 +158,8 @@ int netdev_fastroute_obstacles;
struct net_fastroute_stats dev_fastroute_stat;
#endif
+static void dev_clear_backlog(struct device *dev);
+
/******************************************************************************************
@@ -171,6 +177,16 @@ int netdev_nit=0;
* Add a protocol ID to the list. Now that the input handler is
* smarter we can dispense with all the messy stuff that used to be
* here.
+ *
+ *	BEWARE!!! Protocol handlers that mangle input packets
+ *	MUST BE last in the hash buckets, and checking of protocol
+ *	handlers MUST start from the promiscuous ptype_all chain
+ *	in net_bh. It is true now, do not change it.
+ *	Explanation follows: if a protocol handler that mangles
+ *	packets is first on the list, it cannot sense that the
+ *	packet is cloned and should be copied-on-write, so it will
+ *	change it and subsequent readers will get a broken packet.
+ *	--ANK (980803)
*/
void dev_add_pack(struct packet_type *pt)
@@ -448,7 +464,8 @@ int dev_close(struct device *dev)
/*
* Device is now down.
*/
-
+ dev_clear_backlog(dev);
+
dev->flags&=~(IFF_UP|IFF_RUNNING);
#ifdef CONFIG_NET_FASTROUTE
dev_clear_fastroute(dev);
@@ -457,7 +474,6 @@ int dev_close(struct device *dev)
/*
* Tell people we are going down
*/
-
notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
return(0);
@@ -685,6 +701,45 @@ static void netdev_wakeup(void)
}
#endif
+static void dev_clear_backlog(struct device *dev)
+{
+ struct sk_buff *prev, *curr;
+
+ /*
+ *
+	 * Now let us clear the backlog queue. -AS
+ *
+ * We are competing here both with netif_rx() and net_bh().
+ * We don't want either of those to mess with skb ptrs
+ * while we work on them, thus cli()/sti().
+ *
+	 * It looks better to use the net_bh trick, at least
+	 * to be sure that we keep interrupt latency really low. --ANK (980727)
+ */
+
+ if (backlog.qlen) {
+ start_bh_atomic();
+ curr = backlog.next;
+ while ( curr != (struct sk_buff *)(&backlog) ) {
+ unsigned long flags;
+ curr=curr->next;
+ if ( curr->prev->dev == dev ) {
+ prev = curr->prev;
+ spin_lock_irqsave(&skb_queue_lock, flags);
+ __skb_unlink(prev, &backlog);
+ spin_unlock_irqrestore(&skb_queue_lock, flags);
+ kfree_skb(prev);
+ }
+ }
+ end_bh_atomic();
+#ifdef CONFIG_NET_HW_FLOWCONTROL
+ if (netdev_dropping)
+ netdev_wakeup();
+#else
+ netdev_dropping = 0;
+#endif
+ }
+}
/*
* Receive a packet from a device driver and queue it for the upper
@@ -751,7 +806,7 @@ static inline void handle_bridge(struct sk_buff *skb, unsigned short type)
if(br_receive_frame(skb))
return;
- kfree_skb(skb, FREE_READ);
+ kfree_skb(skb);
}
return;
}
@@ -1320,7 +1375,7 @@ int dev_change_flags(struct device *dev, unsigned flags)
*/
dev->flags = (flags & (IFF_DEBUG|IFF_NOTRAILERS|IFF_RUNNING|IFF_NOARP|
- IFF_SLAVE|IFF_MASTER|
+ IFF_NODYNARP|IFF_SLAVE|IFF_MASTER|
IFF_MULTICAST|IFF_PORTSEL|IFF_AUTOMEDIA)) |
(dev->flags & (IFF_UP|IFF_VOLATILE|IFF_PROMISC|IFF_ALLMULTI));
@@ -1391,12 +1446,11 @@ static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
return dev_change_flags(dev, ifr->ifr_flags);
case SIOCGIFMETRIC: /* Get the metric on the interface (currently unused) */
- ifr->ifr_metric = dev->metric;
+ ifr->ifr_metric = 0;
return 0;
case SIOCSIFMETRIC: /* Set the metric on the interface (currently unused) */
- dev->metric = ifr->ifr_metric;
- return 0;
+ return -EOPNOTSUPP;
case SIOCGIFMTU: /* Get the MTU of a device */
ifr->ifr_mtu = dev->mtu;
@@ -1419,10 +1473,8 @@ static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
dev->mtu = ifr->ifr_mtu;
err = 0;
}
- if (!err && dev->flags&IFF_UP) {
- printk(KERN_DEBUG "SIFMTU %s(%s)\n", dev->name, current->comm);
+ if (!err && dev->flags&IFF_UP)
notifier_call_chain(&netdev_chain, NETDEV_CHANGEMTU, dev);
- }
return err;
case SIOCGIFHWADDR:
@@ -1484,11 +1536,22 @@ static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
return 0;
case SIOCSIFTXQLEN:
- if(ifr->ifr_qlen<2 || ifr->ifr_qlen>1024)
+ /* Why <2? 0 and 1 are valid values. --ANK (980807) */
+ if(/*ifr->ifr_qlen<2 ||*/ ifr->ifr_qlen>1024)
return -EINVAL;
dev->tx_queue_len = ifr->ifr_qlen;
return 0;
+ case SIOCSIFNAME:
+ if (dev->flags&IFF_UP)
+ return -EBUSY;
+ if (dev_get(ifr->ifr_newname))
+ return -EEXIST;
+ memcpy(dev->name, ifr->ifr_newname, IFNAMSIZ);
+ dev->name[IFNAMSIZ-1] = 0;
+ notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
+ return 0;
+
/*
* Unknown or private ioctl
*/
@@ -1597,6 +1660,7 @@ int dev_ioctl(unsigned int cmd, void *arg)
case SIOCDELMULTI:
case SIOCSIFHWBROADCAST:
case SIOCSIFTXQLEN:
+ case SIOCSIFNAME:
if (!capable(CAP_NET_ADMIN))
return -EPERM;
dev_load(ifr.ifr_name);
@@ -1669,6 +1733,17 @@ int register_netdevice(struct device *dev)
printk("register_netdevice #1\n");
if (dev_boot_phase) {
+		/* This is NOT a bug, but I am not sure that all the
+		   devices initialized before the netdev module is started
+		   are sane.
+
+		   Now they are chained to the device boot list
+		   and probed later. If a module is initialized
+		   before netdev, but assumes that dev->init
+		   is really called by register_netdev(), it will fail.
+
+		   So this message should be printed for a while.
+		 */
printk(KERN_INFO "early initialization of device %s is deferred\n", dev->name);
/* Check for existence, and append to tail of chain */
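
dev_clear_backlog() above deletes from the backlog while walking it, and the traversal idiom is worth noting: curr is advanced first, and the node inspected (and possibly unlinked and freed) is curr->prev, so removal can never invalidate the cursor. The same idiom on a plain circular doubly linked list, as a self-contained sketch with invented names (node, purge):

#include <stdlib.h>

struct node { struct node *next, *prev; int dev; };

static void purge(struct node *head, int dev)	/* head is a sentinel */
{
	struct node *curr = head->next, *victim;

	while (curr != head) {
		curr = curr->next;		  /* step past the candidate first */
		if (curr->prev->dev == dev) {	  /* ...then inspect what we passed */
			victim = curr->prev;
			victim->prev->next = victim->next; /* __skb_unlink() analogue */
			victim->next->prev = victim->prev;
			free(victim);		  /* curr itself is still valid */
		}
	}
}
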
diff --git a/net/core/iovec.c b/net/core/iovec.c
index 67f7a6f2b..b8960ecf7 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -215,7 +215,7 @@ int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov,
partial_cnt = 0;
}
- if (len - copy > 0)
+ if (len > copy)
{
partial_cnt = copy % 4;
if (partial_cnt)
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index a8d72604d..ead3b77ff 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -9,6 +9,9 @@
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Vitaly E. Lavrov releasing NULL neighbor in neigh_add.
*/
#include <linux/config.h>
@@ -1033,7 +1036,8 @@ int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
ndm->ndm_state,
nlh->nlmsg_flags&NLM_F_REPLACE, 0);
}
- neigh_release(n);
+ if (n)
+ neigh_release(n);
end_bh_atomic();
return err;
}
@@ -1043,7 +1047,7 @@ int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n,
- pid_t pid, u32 seq, int event)
+ u32 pid, u32 seq, int event)
{
unsigned long now = jiffies;
struct ndmsg *ndm;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index cd8030c5d..e1fe88701 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -12,6 +12,8 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
+ * Fixes:
+ *	Vitaly E. Lavrov		RTA_OK arithmetic was wrong.
*/
#include <linux/config.h>
@@ -29,6 +31,7 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
+#include <linux/capability.h>
#include <linux/skbuff.h>
#include <linux/init.h>
@@ -135,47 +138,8 @@ int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo)
return err;
}
-#ifdef CONFIG_RTNL_OLD_IFINFO
static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct device *dev,
- int type, pid_t pid, u32 seq)
-{
- struct ifinfomsg *r;
- struct nlmsghdr *nlh;
- unsigned char *b = skb->tail;
-
- nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*r));
- if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
- r = NLMSG_DATA(nlh);
- r->ifi_addrlen = dev->addr_len;
- r->ifi_address.sa_family = dev->type;
- memcpy(&r->ifi_address.sa_data, dev->dev_addr, dev->addr_len);
- r->ifi_broadcast.sa_family = dev->type;
- memcpy(&r->ifi_broadcast.sa_data, dev->broadcast, dev->addr_len);
- r->ifi_flags = dev->flags;
- r->ifi_mtu = dev->mtu;
- r->ifi_index = dev->ifindex;
- r->ifi_link = dev->iflink;
- strncpy(r->ifi_name, dev->name, IFNAMSIZ-1);
- r->ifi_qdiscname[0] = 0;
- r->ifi_qdisc = dev->qdisc_sleeping->handle;
- if (dev->qdisc_sleeping)
- strcpy(r->ifi_qdiscname, dev->qdisc_sleeping->ops->id);
- if (dev->get_stats) {
- struct net_device_stats *stats = dev->get_stats(dev);
- if (stats)
- RTA_PUT(skb, IFLA_STATS, sizeof(*stats), stats);
- }
- nlh->nlmsg_len = skb->tail - b;
- return skb->len;
-
-nlmsg_failure:
-rtattr_failure:
- skb_trim(skb, b - skb->data);
- return -1;
-}
-#else
-static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct device *dev,
- int type, pid_t pid, u32 seq)
+ int type, u32 pid, u32 seq)
{
struct ifinfomsg *r;
struct nlmsghdr *nlh;
@@ -218,7 +182,6 @@ rtattr_failure:
skb_trim(skb, b - skb->data);
return -1;
}
-#endif
int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
{
@@ -266,12 +229,7 @@ int rtnetlink_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
void rtmsg_ifinfo(int type, struct device *dev)
{
struct sk_buff *skb;
-#ifdef CONFIG_RTNL_OLD_IFINFO
- int size = NLMSG_SPACE(sizeof(struct ifinfomsg)+
- RTA_LENGTH(sizeof(struct net_device_stats)));
-#else
int size = NLMSG_GOODSIZE;
-#endif
skb = alloc_skb(size, GFP_KERNEL);
if (!skb)
@@ -287,7 +245,7 @@ void rtmsg_ifinfo(int type, struct device *dev)
static int rtnetlink_done(struct netlink_callback *cb)
{
- if (NETLINK_CREDS(cb->skb)->uid == 0 && cb->nlh->nlmsg_flags&NLM_F_ATOMIC)
+ if (cap_raised(NETLINK_CB(cb->skb).eff_cap, CAP_NET_ADMIN) && cb->nlh->nlmsg_flags&NLM_F_ATOMIC)
rtnl_shunlock();
return 0;
}
@@ -342,13 +300,13 @@ rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp)
sz_idx = type>>2;
kind = type&3;
- if (kind != 2 && NETLINK_CREDS(skb)->uid) {
+ if (kind != 2 && !cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) {
*errp = -EPERM;
return -1;
}
if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
- int rlen;
+ u32 rlen;
if (link->dumpit == NULL)
link = &(rtnetlink_links[PF_UNSPEC][type]);
@@ -357,12 +315,13 @@ rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp)
goto err_inval;
/* Super-user locks all the tables to get atomic snapshot */
- if (NETLINK_CREDS(skb)->uid == 0 && nlh->nlmsg_flags&NLM_F_ATOMIC)
+ if (cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)
+ && nlh->nlmsg_flags&NLM_F_ATOMIC)
atomic_inc(&rtnl_rlockct);
if ((*errp = netlink_dump_start(rtnl, skb, nlh,
link->dumpit,
rtnetlink_done)) != 0) {
- if (NETLINK_CREDS(skb)->uid == 0 && nlh->nlmsg_flags&NLM_F_ATOMIC)
+ if (cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN) && nlh->nlmsg_flags&NLM_F_ATOMIC)
atomic_dec(&rtnl_rlockct);
return -1;
}
@@ -431,7 +390,7 @@ extern __inline__ int rtnetlink_rcv_skb(struct sk_buff *skb)
struct nlmsghdr * nlh;
while (skb->len >= NLMSG_SPACE(0)) {
- int rlen;
+ u32 rlen;
nlh = (struct nlmsghdr *)skb->data;
if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
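
The permission changes above replace "uid == 0" tests with CAP_NET_ADMIN checks on the netlink sender's effective capability set, so privilege follows capabilities rather than user identity. cap_raised() amounts to a bit test; a simplified sketch of the idea (this is not the kernel's exact macro; the constant's value is copied from linux/capability.h):

#include <stdint.h>

#define MY_CAP_NET_ADMIN 12	/* CAP_NET_ADMIN's value in linux/capability.h */

static inline int my_cap_raised(uint32_t eff_cap, int cap)
{
	return (eff_cap & (UINT32_C(1) << cap)) != 0;
}

/* usage sketch:
	if (!my_cap_raised(eff_cap, MY_CAP_NET_ADMIN))
		return -EPERM;
*/
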
diff --git a/net/core/scm.c b/net/core/scm.c
index 3e4469f29..e16c4a45f 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -138,11 +138,15 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg))
{
+ err = -EINVAL;
+
+ if ((unsigned long)(((char*)cmsg - (char*)msg->msg_control)
+ + cmsg->cmsg_len) > msg->msg_controllen)
+ goto error;
+
if (cmsg->cmsg_level != SOL_SOCKET)
continue;
- err = -EINVAL;
-
switch (cmsg->cmsg_type)
{
case SCM_RIGHTS:
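
The __scm_send() hunk above adds a bounds check before any control message is honoured: the header's offset within msg_control plus its claimed cmsg_len must not exceed msg_controllen, otherwise a malicious cmsg_len walks the parser past the end of the buffer. The same check in a userspace-style walker, as a sketch:

#include <stddef.h>
#include <sys/socket.h>

static int control_ok(struct msghdr *msg)
{
	struct cmsghdr *cmsg;

	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
		size_t off = (char *)cmsg - (char *)msg->msg_control;
		if (off + cmsg->cmsg_len > msg->msg_controllen)
			return 0;	/* header or payload overruns the buffer */
	}
	return 1;
}
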
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c218233d4..fb13b5e16 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4,6 +4,8 @@
* Authors: Alan Cox <iiitac@pyr.swan.ac.uk>
* Florian La Roche <rzsfl@rz.uni-sb.de>
*
+ * Version: $Id: skbuff.c,v 1.53 1998/08/19 13:32:44 freitag Exp $
+ *
* Fixes:
* Alan Cox : Fixed the worst of the load balancer bugs.
* Dave Platt : Interrupt stacking fix.
@@ -96,14 +98,14 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
void show_net_buffers(void)
{
- printk(KERN_INFO "Networking buffers in use : %u\n",
+ printk("Networking buffers in use : %u\n",
atomic_read(&net_skbcount));
- printk(KERN_INFO "Total network buffer allocations : %u\n",
+ printk("Total network buffer allocations : %u\n",
atomic_read(&net_allocs));
- printk(KERN_INFO "Total failed network buffer allocs : %u\n",
+ printk("Total failed network buffer allocs : %u\n",
atomic_read(&net_fails));
#ifdef CONFIG_INET
- printk(KERN_INFO "IP fragment buffer size : %u\n",
+ printk("IP fragment buffer size : %u\n",
atomic_read(&ip_frag_mem));
#endif
}
@@ -365,7 +367,7 @@ void skb_add_mtu(int mtu)
}
#endif
-__initfunc(void skb_init(void))
+void __init skb_init(void)
{
skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
sizeof(struct sk_buff),
diff --git a/net/core/sock.c b/net/core/sock.c
index 07d125462..e9e293ec9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -7,7 +7,7 @@
* handler for protocols to use and generic option handler.
*
*
- * Version: @(#)sock.c 1.0.17 06/02/93
+ * Version: $Id: sock.c,v 1.70 1998/08/26 12:03:07 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -78,6 +78,7 @@
* Chris Evans : Call suser() check last on F_SETOWN
* Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
* Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
+ * Andi Kleen : Fix write_space callback
*
* To Fix:
*
@@ -445,6 +446,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
case SO_RCVLOWAT:
case SO_SNDLOWAT:
v.val=1;
+ break;
case SO_PASSCRED:
v.val = sock->passcred;
@@ -615,19 +617,6 @@ unsigned long sock_rspace(struct sock *sk)
}
-/* FIXME: this is also insane. See above comment */
-unsigned long sock_wspace(struct sock *sk)
-{
- int amt = 0;
-
- if (sk != NULL && !(sk->shutdown & SEND_SHUTDOWN)) {
- amt = sk->sndbuf - atomic_read(&sk->wmem_alloc);
- if (amt < 0)
- amt = 0;
- }
- return amt;
-}
-
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
I think, these locks should be removed for datagram sockets.
*/
@@ -746,17 +735,15 @@ void __release_sock(struct sock *sk)
void sklist_remove_socket(struct sock **list, struct sock *sk)
{
- unsigned long flags;
struct sock *s;
- save_flags(flags);
- cli();
+ start_bh_atomic();
s= *list;
if(s==sk)
{
*list = s->next;
- restore_flags(flags);
+ end_bh_atomic();
return;
}
while(s && s->next)
@@ -764,22 +751,19 @@ void sklist_remove_socket(struct sock **list, struct sock *sk)
if(s->next==sk)
{
s->next=sk->next;
- restore_flags(flags);
- return;
+ break;
}
s=s->next;
}
- restore_flags(flags);
+ end_bh_atomic();
}
void sklist_insert_socket(struct sock **list, struct sock *sk)
{
- unsigned long flags;
- save_flags(flags);
- cli();
+ start_bh_atomic();
sk->next= *list;
*list=sk;
- restore_flags(flags);
+ end_bh_atomic();
}
/*
@@ -914,6 +898,10 @@ int sock_no_getsockopt(struct socket *sock, int level, int optname,
return -EOPNOTSUPP;
}
+/*
+ * Note: if you add something that sleeps here then change sock_fcntl()
+ * to do proper fd locking.
+ */
int sock_no_fcntl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
struct sock *sk = sock->sk;
@@ -971,12 +959,15 @@ void sock_def_callback2(struct sock *sk, int len)
}
}
-void sock_def_callback3(struct sock *sk)
+void sock_def_write_space(struct sock *sk)
{
if(!sk->dead)
{
wake_up_interruptible(sk->sleep);
- sock_wake_async(sk->socket, 2);
+
+ /* Should agree with poll, otherwise some programs break */
+ if (sock_writeable(sk))
+ sock_wake_async(sk->socket, 2);
}
}
@@ -1011,7 +1002,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->state_change = sock_def_callback1;
sk->data_ready = sock_def_callback2;
- sk->write_space = sock_def_callback3;
+ sk->write_space = sock_def_write_space;
sk->error_report = sock_def_callback1;
sk->destruct = sock_def_destruct;
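
The write_space rework above encodes one rule: the asynchronous wakeup must use the same writability predicate as poll(), or a program gets SIGIO, polls, sees no POLLOUT, and spins. Together with the datagram_poll() change in net/core/datagram.c, both paths now go through sock_writeable(). A sketch of the shared-predicate shape; the real sock_writeable() lives in include/net/sock.h and the threshold below is an assumption for illustration:

/* One predicate, two callers: poll() and the write_space callback. */
static inline int writeable(int sndbuf, int wmem_alloc)
{
	return sndbuf - wmem_alloc >= sndbuf / 2;	/* illustrative threshold */
}

/* poll():        if (writeable(...)) mask |= POLLOUT | POLLWRNORM;	*/
/* write_space(): if (writeable(...)) sock_wake_async(sk->socket, 2);	*/
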
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 18c31f5c3..8282333dc 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -5,7 +5,7 @@
*
* PF_INET protocol family socket handler.
*
- * Version: $Id: af_inet.c,v 1.74 1998/05/08 21:06:24 davem Exp $
+ * Version: $Id: af_inet.c,v 1.75 1998/08/26 12:03:15 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -177,6 +177,8 @@ static __inline__ void kill_sk_now(struct sock *sk)
if(sk->opt)
kfree(sk->opt);
dst_release(sk->dst_cache);
+ if (atomic_read(&sk->omem_alloc))
+ printk(KERN_DEBUG "kill_sk_now: optmem leakage (%d bytes) detected.\n", atomic_read(&sk->omem_alloc));
sk_free(sk);
}
@@ -576,6 +578,24 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
return(0);
}
+static void inet_wait_for_connect(struct sock *sk)
+{
+ struct wait_queue wait = { current, NULL };
+
+ add_wait_queue(sk->sleep, &wait);
+ current->state = TASK_INTERRUPTIBLE;
+ while (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) {
+ if (signal_pending(current))
+ break;
+ if (sk->err)
+ break;
+ schedule();
+ current->state = TASK_INTERRUPTIBLE;
+ }
+ current->state = TASK_RUNNING;
+ remove_wait_queue(sk->sleep, &wait);
+}
+
/*
* Connect to a remote host. There is regrettably still a little
* TCP 'magic' in here.
@@ -623,6 +643,13 @@ int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr,
if (sk->state != TCP_ESTABLISHED && (flags & O_NONBLOCK))
return (-EINPROGRESS);
+#if 1
+ if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) {
+ inet_wait_for_connect(sk);
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+ }
+#else
cli();
while(sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) {
interruptible_sleep_on(sk->sleep);
@@ -639,6 +666,7 @@ int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr,
}
}
sti();
+#endif
sock->state = SS_CONNECTED;
if ((sk->state != TCP_ESTABLISHED) && sk->err) {
@@ -876,7 +904,6 @@ static int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case FIOGETOWN:
case SIOCGPGRP:
return put_user(sk->proc, (int *)arg);
- return(0);
case SIOCGSTAMP:
if(sk->stamp.tv_sec==0)
return -ENOENT;
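
inet_stream_connect() above keeps both classic behaviours: with O_NONBLOCK it returns -EINPROGRESS at once, otherwise it now sleeps in inet_wait_for_connect() rather than the old cli()/interruptible_sleep_on() loop. From userspace, the non-blocking path that pairs with this code looks like the following portable sketch:

#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <sys/socket.h>

static int connect_nb(int fd, const struct sockaddr *sa, socklen_t len)
{
	int err = 0;
	socklen_t elen = sizeof(err);
	struct pollfd pfd = { fd, POLLOUT, 0 };

	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK);
	if (connect(fd, sa, len) == 0)
		return 0;			/* connected immediately */
	if (errno != EINPROGRESS)
		return -1;			/* immediate failure */
	if (poll(&pfd, 1, -1) <= 0)		/* writable == handshake finished */
		return -1;
	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &elen);
	if (err) {
		errno = err;
		return -1;
	}
	return 0;
}
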
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index e6e272b0e..1ce69028f 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1,6 +1,6 @@
/* linux/net/inet/arp.c
*
- * Version: $Id: arp.c,v 1.67 1998/06/19 13:22:31 davem Exp $
+ * Version: $Id: arp.c,v 1.70 1998/08/26 12:03:18 davem Exp $
*
* Copyright (C) 1994 by Florian La Roche
*
@@ -760,7 +760,7 @@ int arp_req_set(struct arpreq *r, struct device * dev)
r->arp_flags |= ATF_COM;
if (dev == NULL) {
struct rtable * rt;
- if ((err = ip_route_output(&rt, ip, 0, 1, 0)) != 0)
+ if ((err = ip_route_output(&rt, ip, 0, RTO_ONLINK, 0)) != 0)
return err;
dev = rt->u.dst.dev;
ip_rt_put(rt);
@@ -843,11 +843,21 @@ int arp_req_delete(struct arpreq *r, struct device * dev)
return -EINVAL;
}
+ if (dev == NULL) {
+ struct rtable * rt;
+ if ((err = ip_route_output(&rt, ip, 0, RTO_ONLINK, 0)) != 0)
+ return err;
+ dev = rt->u.dst.dev;
+ ip_rt_put(rt);
+ if (!dev)
+ return -EINVAL;
+ }
err = -ENXIO;
start_bh_atomic();
neigh = __neigh_lookup(&arp_tbl, &ip, dev, 0);
if (neigh) {
- err = neigh_update(neigh, NULL, NUD_FAILED, 1, 0);
+ if (neigh->nud_state&~NUD_NOARP)
+ err = neigh_update(neigh, NULL, NUD_FAILED, 1, 0);
neigh_release(neigh);
}
end_bh_atomic();
@@ -867,7 +877,7 @@ int arp_ioctl(unsigned int cmd, void *arg)
switch(cmd) {
case SIOCDARP:
case SIOCSARP:
- if (!suser())
+ if (!capable(CAP_NET_ADMIN))
return -EPERM;
case SIOCGARP:
err = copy_from_user(&r, arg, sizeof(struct arpreq));
@@ -899,10 +909,8 @@ int arp_ioctl(unsigned int cmd, void *arg)
err = -EINVAL;
if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type)
goto out;
- } else if (cmd != SIOCSARP) {
- /* dev has not been set ... */
- printk(KERN_ERR "arp_ioctl: invalid, null device\n");
- err = -EINVAL;
+ } else if (cmd == SIOCGARP) {
+ err = -ENODEV;
goto out;
}
@@ -911,7 +919,6 @@ int arp_ioctl(unsigned int cmd, void *arg)
err = arp_req_delete(&r, dev);
break;
case SIOCSARP:
- /* This checks for dev == NULL */
err = arp_req_set(&r, dev);
break;
case SIOCGARP:
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 18293338e..ac7c04432 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1,7 +1,7 @@
/*
* NET3 IP device support routines.
*
- * Version: $Id: devinet.c,v 1.22 1998/05/08 21:06:26 davem Exp $
+ * Version: $Id: devinet.c,v 1.23 1998/08/26 12:03:21 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -533,8 +533,6 @@ int devinet_ioctl(unsigned int cmd, void *arg)
inet_del_ifa(in_dev, ifap, 0);
ifa->ifa_broadcast = 0;
ifa->ifa_anycast = 0;
- ifa->ifa_prefixlen = 32;
- ifa->ifa_mask = inet_make_mask(32);
}
ifa->ifa_address =
@@ -545,6 +543,9 @@ int devinet_ioctl(unsigned int cmd, void *arg)
ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen);
if ((dev->flags&IFF_BROADCAST) && ifa->ifa_prefixlen < 31)
ifa->ifa_broadcast = ifa->ifa_address|~ifa->ifa_mask;
+ } else {
+ ifa->ifa_prefixlen = 32;
+ ifa->ifa_mask = inet_make_mask(32);
}
ret = inet_set_ifa(dev, ifa);
break;
@@ -702,6 +703,16 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, void
case NETDEV_UNREGISTER:
inetdev_destroy(in_dev);
break;
+ case NETDEV_CHANGENAME:
+ if (in_dev->ifa_list) {
+ struct in_ifaddr *ifa;
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+			/* Do not notify about the label change; this event
+			   is not interesting to applications using netlink.
+ */
+ }
+ break;
}
return NOTIFY_DONE;
@@ -716,7 +727,7 @@ struct notifier_block ip_netdev_notifier={
#ifdef CONFIG_RTNETLINK
static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
- pid_t pid, u32 seq, int event)
+ u32 pid, u32 seq, int event)
{
struct ifaddrmsg *ifm;
struct nlmsghdr *nlh;
@@ -729,7 +740,7 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
ifm->ifa_flags = ifa->ifa_flags|IFA_F_PERMANENT;
ifm->ifa_scope = ifa->ifa_scope;
ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
- if (ifa->ifa_prefixlen)
+ if (ifa->ifa_address)
RTA_PUT(skb, IFA_ADDRESS, 4, &ifa->ifa_address);
if (ifa->ifa_local)
RTA_PUT(skb, IFA_LOCAL, 4, &ifa->ifa_local);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index d9a150218..013a4ba9a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -5,7 +5,7 @@
*
* IPv4 Forwarding Information Base: FIB frontend.
*
- * Version: $Id: fib_frontend.c,v 1.11 1998/06/11 03:15:40 davem Exp $
+ * Version: $Id: fib_frontend.c,v 1.12 1998/08/26 12:03:24 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
@@ -300,10 +300,8 @@ static int inet_check_attr(struct rtmsg *r, struct rtattr **rta)
if (attr) {
if (RTA_PAYLOAD(attr) < 4)
return -EINVAL;
-#ifndef CONFIG_RTNL_OLD_IFINFO
if (i != RTA_MULTIPATH && i != RTA_METRICS)
-#endif
- rta[i-1] = (struct rtattr*)RTA_DATA(attr);
+ rta[i-1] = (struct rtattr*)RTA_DATA(attr);
}
}
return 0;
@@ -527,6 +525,14 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
#undef BRD1_OK
}
+static void fib_disable_ip(struct device *dev, int force)
+{
+ if (fib_sync_down(0, dev, force))
+ fib_flush();
+ rt_cache_flush(0);
+ arp_ifdown(dev);
+}
+
static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct in_ifaddr *ifa = (struct in_ifaddr*)ptr;
@@ -537,8 +543,15 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
rt_cache_flush(-1);
break;
case NETDEV_DOWN:
- fib_del_ifaddr(ifa);
- rt_cache_flush(-1);
+ if (ifa->ifa_dev && ifa->ifa_dev->ifa_list == NULL) {
+ /* Last address was deleted from this interface.
+ Disable IP.
+ */
+ fib_disable_ip(ifa->ifa_dev->dev, 1);
+ } else {
+ fib_del_ifaddr(ifa);
+ rt_cache_flush(-1);
+ }
break;
}
return NOTIFY_DONE;
@@ -563,18 +576,10 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
rt_cache_flush(-1);
break;
case NETDEV_DOWN:
- if (fib_sync_down(0, dev, 0))
- fib_flush();
- rt_cache_flush(0);
- arp_ifdown(dev);
+ fib_disable_ip(dev, 0);
break;
case NETDEV_UNREGISTER:
- if (in_dev->ifa_list)
- printk("About to crash!\n");
- if (fib_sync_down(0, dev, 1))
- fib_flush();
- rt_cache_flush(0);
- arp_ifdown(dev);
+ fib_disable_ip(dev, 1);
break;
case NETDEV_CHANGEMTU:
case NETDEV_CHANGE:
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 3e13671a2..618d247bd 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -5,7 +5,7 @@
*
* IPv4 FIB: lookup engine and maintenance routines.
*
- * Version: $Id: fib_hash.c,v 1.4 1998/07/15 05:05:08 davem Exp $
+ * Version: $Id: fib_hash.c,v 1.5 1998/08/26 12:03:27 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
@@ -713,7 +713,7 @@ static void rtmsg_fib(int event, struct fib_node* f, int z, int tb_id,
struct nlmsghdr *n, struct netlink_skb_parms *req)
{
struct sk_buff *skb;
- pid_t pid = req ? req->pid : 0;
+ u32 pid = req ? req->pid : 0;
int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
skb = alloc_skb(size, GFP_KERNEL);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 592ff5ffb..2302f5322 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -5,7 +5,7 @@
*
* IPv4 Forwarding Information Base: policy rules.
*
- * Version: $Id: fib_rules.c,v 1.5 1998/04/28 06:21:57 davem Exp $
+ * Version: $Id: fib_rules.c,v 1.6 1998/08/26 12:03:30 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
@@ -45,10 +45,6 @@
#define FRprintk(a...)
-#ifndef CONFIG_RTNL_OLD_IFINFO
-#define RTA_IFNAME RTA_IIF
-#endif
-
struct fib_rule
{
struct fib_rule *r_next;
@@ -91,7 +87,7 @@ int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
rtm->rtm_tos == r->r_tos &&
(!rtm->rtm_type || rtm->rtm_type == r->r_action) &&
(!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) &&
- (!rta[RTA_IFNAME-1] || strcmp(RTA_DATA(rta[RTA_IFNAME-1]), r->r_ifname) == 0) &&
+ (!rta[RTA_IIF-1] || strcmp(RTA_DATA(rta[RTA_IIF-1]), r->r_ifname) == 0) &&
(!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) {
*rp = r->r_next;
if (r != &default_rule && r != &main_rule && r != &local_rule)
@@ -126,7 +122,7 @@ int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
(rtm->rtm_tos & ~IPTOS_TOS_MASK))
return -EINVAL;
- if (rta[RTA_IFNAME-1] && RTA_PAYLOAD(rta[RTA_IFNAME-1]) > IFNAMSIZ)
+ if (rta[RTA_IIF-1] && RTA_PAYLOAD(rta[RTA_IIF-1]) > IFNAMSIZ)
return -EINVAL;
table_id = rtm->rtm_table;
@@ -159,9 +155,9 @@ int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
if (rta[RTA_PRIORITY-1])
memcpy(&new_r->r_preference, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
new_r->r_table = table_id;
- if (rta[RTA_IFNAME-1]) {
+ if (rta[RTA_IIF-1]) {
struct device *dev;
- memcpy(new_r->r_ifname, RTA_DATA(rta[RTA_IFNAME-1]), IFNAMSIZ);
+ memcpy(new_r->r_ifname, RTA_DATA(rta[RTA_IIF-1]), IFNAMSIZ);
new_r->r_ifname[IFNAMSIZ-1] = 0;
new_r->r_ifindex = -1;
dev = dev_get(new_r->r_ifname);
@@ -339,10 +335,6 @@ extern __inline__ int inet_fill_rule(struct sk_buff *skb,
rtm->rtm_table = r->r_table;
rtm->rtm_protocol = 0;
rtm->rtm_scope = 0;
-#ifdef CONFIG_RTNL_OLD_IFINFO
- rtm->rtm_nhs = 0;
- rtm->rtm_optlen = 0;
-#endif
rtm->rtm_type = r->r_action;
rtm->rtm_flags = r->r_flags;
@@ -351,7 +343,7 @@ extern __inline__ int inet_fill_rule(struct sk_buff *skb,
if (r->r_src_len)
RTA_PUT(skb, RTA_SRC, 4, &r->r_src);
if (r->r_ifname[0])
- RTA_PUT(skb, RTA_IFNAME, IFNAMSIZ, &r->r_ifname);
+ RTA_PUT(skb, RTA_IIF, IFNAMSIZ, &r->r_ifname);
if (r->r_preference)
RTA_PUT(skb, RTA_PRIORITY, 4, &r->r_preference);
if (r->r_srcmap)
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 5537016d2..36c801e8c 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -5,7 +5,7 @@
*
* IPv4 Forwarding Information Base: semantics.
*
- * Version: $Id: fib_semantics.c,v 1.9 1998/06/11 03:15:41 davem Exp $
+ * Version: $Id: fib_semantics.c,v 1.10 1998/08/26 12:03:32 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
@@ -181,7 +181,6 @@ static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type)
return 0;
}
-#ifndef CONFIG_RTNL_OLD_IFINFO
static int
fib_count_nexthops(struct rtattr *rta)
{
@@ -189,7 +188,7 @@ fib_count_nexthops(struct rtattr *rta)
struct rtnexthop *nhp = RTA_DATA(rta);
int nhlen = RTA_PAYLOAD(rta);
- while (nhlen >= sizeof(struct rtnexthop)) {
+ while (nhlen >= (int)sizeof(struct rtnexthop)) {
if ((nhlen -= nhp->rtnh_len) < 0)
return 0;
nhs++;
@@ -197,21 +196,12 @@ fib_count_nexthops(struct rtattr *rta)
};
return nhs;
}
-#endif
-#ifdef CONFIG_RTNL_OLD_IFINFO
-static int
-fib_get_nhs(struct fib_info *fi, const struct nlmsghdr *nlh, const struct rtmsg *r)
-{
- struct rtnexthop *nhp = RTM_RTNH(r);
- int nhlen = RTM_NHLEN(nlh, r);
-#else
static int
fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r)
{
struct rtnexthop *nhp = RTA_DATA(rta);
int nhlen = RTA_PAYLOAD(rta);
-#endif
change_nexthops(fi) {
int attrlen = nhlen - sizeof(struct rtnexthop);
@@ -249,18 +239,10 @@ int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta,
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
-#ifdef CONFIG_RTNL_OLD_IFINFO
- if (r->rtm_nhs == 0)
- return 0;
-
- nhp = RTM_RTNH(r);
- nhlen = RTM_NHLEN(nlh, r);
-#else
if (rta->rta_mp == NULL)
return 0;
nhp = RTA_DATA(rta->rta_mp);
nhlen = RTA_PAYLOAD(rta->rta_mp);
-#endif
for_nexthops(fi) {
int attrlen = nhlen - sizeof(struct rtnexthop);
@@ -397,11 +379,7 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
struct fib_info *fi = NULL;
struct fib_info *ofi;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
-#ifdef CONFIG_RTNL_OLD_IFINFO
- int nhs = r->rtm_nhs ? : 1;
-#else
int nhs = 1;
-#endif
#else
const int nhs = 1;
#endif
@@ -411,14 +389,12 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
goto err_inval;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
-#ifndef CONFIG_RTNL_OLD_IFINFO
if (rta->rta_mp) {
nhs = fib_count_nexthops(rta->rta_mp);
if (nhs == 0)
goto err_inval;
}
#endif
-#endif
fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
err = -ENOBUFS;
@@ -429,14 +405,6 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
fi->fib_protocol = r->rtm_protocol;
fi->fib_nhs = nhs;
fi->fib_flags = r->rtm_flags;
-#ifdef CONFIG_RTNL_OLD_IFINFO
- if (rta->rta_mtu)
- fi->fib_mtu = *rta->rta_mtu;
- if (rta->rta_rtt)
- fi->fib_rtt = *rta->rta_rtt;
- if (rta->rta_window)
- fi->fib_window = *rta->rta_window;
-#else
if (rta->rta_mx) {
int attrlen = RTA_PAYLOAD(rta->rta_mx);
struct rtattr *attr = RTA_DATA(rta->rta_mx);
@@ -451,21 +419,12 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
attr = RTA_NEXT(attr, attrlen);
}
}
-#endif
if (rta->rta_prefsrc)
memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4);
-#ifndef CONFIG_RTNL_OLD_IFINFO
if (rta->rta_mp) {
-#else
- if (r->rtm_nhs) {
-#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
-#ifdef CONFIG_RTNL_OLD_IFINFO
- if ((err = fib_get_nhs(fi, nlh, r)) != 0)
-#else
if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0)
-#endif
goto failure;
if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif)
goto err_inval;
@@ -504,11 +463,7 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
#endif
if (fib_props[r->rtm_type].error) {
-#ifndef CONFIG_RTNL_OLD_IFINFO
if (rta->rta_gw || rta->rta_oif || rta->rta_mp)
-#else
- if (rta->rta_gw || rta->rta_oif || r->rtm_nhs)
-#endif
goto err_inval;
goto link_it;
}
@@ -637,16 +592,13 @@ u32 __fib_res_prefsrc(struct fib_result *res)
#ifdef CONFIG_RTNETLINK
int
-fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event,
+fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
struct fib_info *fi)
{
struct rtmsg *rtm;
struct nlmsghdr *nlh;
unsigned char *b = skb->tail;
-#ifdef CONFIG_RTNL_OLD_IFINFO
- unsigned char *o;
-#endif
nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm));
rtm = NLMSG_DATA(nlh);
@@ -658,22 +610,9 @@ fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event,
rtm->rtm_type = type;
rtm->rtm_flags = fi->fib_flags;
rtm->rtm_scope = scope;
-#ifdef CONFIG_RTNL_OLD_IFINFO
- rtm->rtm_nhs = 0;
-
- o = skb->tail;
-#endif
if (rtm->rtm_dst_len)
RTA_PUT(skb, RTA_DST, 4, dst);
rtm->rtm_protocol = fi->fib_protocol;
-#ifdef CONFIG_RTNL_OLD_IFINFO
- if (fi->fib_mtu)
- RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &fi->fib_mtu);
- if (fi->fib_window)
- RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &fi->fib_window);
- if (fi->fib_rtt)
- RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &fi->fib_rtt);
-#else
#ifdef CONFIG_NET_CLS_ROUTE
if (fi->fib_nh[0].nh_tclassid)
RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid);
@@ -688,7 +627,6 @@ fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event,
}
mx->rta_len = skb->tail - (u8*)mx;
}
-#endif
if (fi->fib_prefsrc)
RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc);
if (fi->fib_nhs == 1) {
@@ -697,18 +635,14 @@ fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event,
if (fi->fib_nh->nh_oif)
RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif);
}
-#ifdef CONFIG_RTNL_OLD_IFINFO
- rtm->rtm_optlen = skb->tail - o;
-#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (fi->fib_nhs > 1) {
struct rtnexthop *nhp;
-#ifndef CONFIG_RTNL_OLD_IFINFO
struct rtattr *mp_head;
if (skb_tailroom(skb) <= RTA_SPACE(0))
goto rtattr_failure;
mp_head = (struct rtattr*)skb_put(skb, RTA_SPACE(0));
-#endif
+
for_nexthops(fi) {
if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
goto rtattr_failure;
@@ -719,14 +653,9 @@ fib_dump_info(struct sk_buff *skb, pid_t pid, u32 seq, int event,
if (nh->nh_gw)
RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw);
nhp->rtnh_len = skb->tail - (unsigned char*)nhp;
-#ifdef CONFIG_RTNL_OLD_IFINFO
- rtm->rtm_nhs++;
-#endif
} endfor_nexthops(fi);
-#ifndef CONFIG_RTNL_OLD_IFINFO
mp_head->rta_type = RTA_MULTIPATH;
mp_head->rta_len = skb->tail - (u8*)mp_head;
-#endif
}
#endif
nlh->nlmsg_len = skb->tail - b;
@@ -848,24 +777,6 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL)
return -EINVAL;
-#ifdef CONFIG_RTNL_OLD_IFINFO
- /* Ugly conversion from rtentry types to unsigned */
-
- if (r->rt_flags&RTF_IRTT) {
- rta->rta_rtt = (unsigned*)&r->rt_pad3;
- *rta->rta_rtt = r->rt_irtt;
- }
- if (r->rt_flags&RTF_WINDOW) {
- rta->rta_window = (unsigned*)&r->rt_window;
- if (sizeof(*rta->rta_window) != sizeof(r->rt_window))
- *rta->rta_window = r->rt_window;
- }
- if (r->rt_flags&RTF_MTU) {
- rta->rta_mtu = (unsigned*)&r->rt_mtu;
- if (sizeof(*rta->rta_mtu) != sizeof(r->rt_mtu))
- *rta->rta_mtu = r->rt_mtu;
- }
-#else
if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) {
struct rtattr *rec;
struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL);
@@ -896,7 +807,6 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
*(u32*)RTA_DATA(rec) = r->rt_irtt;
}
}
-#endif
return 0;
}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 4e947337a..9cc7c733b 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -3,7 +3,7 @@
*
* Alan Cox, <alan@cymru.net>
*
- * Version: $Id: icmp.c,v 1.44 1998/06/16 04:38:27 davem Exp $
+ * Version: $Id: icmp.c,v 1.45 1998/08/26 12:03:35 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -47,6 +47,9 @@
* into the dest entry and use a token
* bucket filter (thanks to ANK). Make
* the rates sysctl configurable.
+ * Yu Tianli : Fixed two ugly bugs in icmp_send
+ * - IP option length was accounted wrongly
+ * - ICMP header length was not accounted at all.
*
* RFC1122 (Host Requirements -- Comm. Layer) Status:
* (boy, are there a lot of rules for ICMP)
@@ -363,7 +366,7 @@ int xrlim_allow(struct dst_entry *dst, int timeout)
now = jiffies;
dst->rate_tokens += now - dst->rate_last;
- if (dst->rate_tokens > 6*timeout)
+ if (dst->rate_tokens > XRLIM_BURST_FACTOR*timeout)
dst->rate_tokens = XRLIM_BURST_FACTOR*timeout;
if (dst->rate_tokens >= timeout) {
dst->rate_tokens -= timeout;
@@ -537,7 +540,17 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info)
/*
* Construct source address and options.
*/
-
+
+#ifdef CONFIG_IP_ROUTE_NAT
+ /*
+ * Restore original addresses if packet has been translated.
+ */
+ if (rt->rt_flags&RTCF_NAT && IPCB(skb_in)->flags&IPSKB_TRANSLATED) {
+ iph->daddr = rt->key.dst;
+ iph->saddr = rt->key.src;
+ }
+#endif
+
saddr = iph->daddr;
if (!(rt->rt_flags & RTCF_LOCAL))
saddr = 0;
@@ -587,8 +600,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, unsigned long info)
room = rt->u.dst.pmtu;
if (room > 576)
room = 576;
- room -= sizeof(struct iphdr) - icmp_param.replyopts.optlen;
-
+ room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen;
+ room -= sizeof(struct icmphdr);
+
icmp_param.data_len=(iph->ihl<<2)+skb_in->len;
if (icmp_param.data_len > room)
icmp_param.data_len = room;
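
Two separate fixes sit in the icmp.c hunks: xrlim_allow() now caps its tokens with the same XRLIM_BURST_FACTOR it refills against (the literal 6 was a latent inconsistency), and icmp_send() now subtracts both the IP header plus options and the ICMP header from the 576-byte room (the old code subtracted the option length with the wrong sign and omitted the ICMP header entirely). The token bucket, modeled as standalone C with illustrative names:

#define BURST_FACTOR 6UL	/* mirrors XRLIM_BURST_FACTOR */

struct bucket {
	unsigned long tokens;	/* accumulated send credit, in jiffies */
	unsigned long last;	/* time of the last refill */
};

static int xrlim_allow_model(struct bucket *b, unsigned long now,
			     unsigned long timeout)
{
	b->tokens += now - b->last;		/* credit the elapsed time */
	b->last = now;
	if (b->tokens > BURST_FACTOR * timeout)
		b->tokens = BURST_FACTOR * timeout;	/* cap the burst */
	if (b->tokens >= timeout) {
		b->tokens -= timeout;		/* spend one packet's worth */
		return 1;			/* allowed */
	}
	return 0;				/* rate-limited */
}
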
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 74757adf8..af49104b3 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -8,7 +8,7 @@
* the older version didn't come out right using gcc 2.5.8, the newer one
* seems to fall out with gcc 2.6.2.
*
- * Version: $Id: igmp.c,v 1.26 1998/03/08 05:56:19 davem Exp $
+ * Version: $Id: igmp.c,v 1.27 1998/08/26 12:03:39 davem Exp $
*
* Authors:
* Alan Cox <Alan.Cox@linux.org>
@@ -563,7 +563,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
goto done;
}
- iml = (struct ip_mc_socklist *)kmalloc(sizeof(*iml), GFP_KERNEL);
+ iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
err = -EADDRINUSE;
for (i=sk->ip_mc_list; i; i=i->next) {
@@ -590,7 +590,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
done:
rtnl_shunlock();
if (iml)
- kfree(iml);
+ sock_kfree_s(sk, iml, sizeof(*iml));
return err;
}
@@ -613,7 +613,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
in_dev = inetdev_by_index(iml->multi.imr_ifindex);
if (in_dev)
ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr);
- kfree_s(iml, sizeof(*iml));
+ sock_kfree_s(sk, iml, sizeof(*iml));
return 0;
}
}
@@ -633,7 +633,7 @@ void ip_mc_drop_socket(struct sock *sk)
sk->ip_mc_list = iml->next;
if ((in_dev = inetdev_by_index(iml->multi.imr_ifindex)) != NULL)
ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
- kfree_s(iml, sizeof(*iml));
+ sock_kfree_s(sk, iml, sizeof(*iml));
}
}
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index e136a16ca..8cd0d5962 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -5,7 +5,7 @@
*
* The IP forwarding functionality.
*
- * Version: $Id: ip_forward.c,v 1.40 1998/03/08 05:56:20 davem Exp $
+ * Version: $Id: ip_forward.c,v 1.41 1998/08/26 12:03:42 davem Exp $
*
* Authors: see ip.c
*
@@ -79,10 +79,8 @@ int ip_forward(struct sk_buff *skb)
int fw_res = 0;
#endif
- if (IPCB(skb)->opt.router_alert) {
- if (ip_call_ra_chain(skb))
- return 0;
- }
+ if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
+ return 0;
if (skb->pkt_type != PACKET_HOST)
goto drop;
@@ -110,7 +108,7 @@ int ip_forward(struct sk_buff *skb)
goto local_pkt;
#endif
- if (ip_decrease_ttl(iph) <= 0)
+ if (iph->ttl <= 1)
goto too_many_hops;
if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
@@ -121,22 +119,30 @@ int ip_forward(struct sk_buff *skb)
* after asking the firewall permission to do so.
*/
- skb->priority = rt->u.dst.priority;
+ skb->priority = rt_tos2priority(iph->tos);
dev2 = rt->u.dst.dev;
- mtu = dev2->mtu;
+ mtu = rt->u.dst.pmtu;
#ifdef CONFIG_NET_SECURITY
call_fw_firewall(PF_SECURITY, dev2, NULL, &mtu, NULL);
#endif
/*
- * In IP you never have to forward a frame on the interface that it
- * arrived upon. We now generate an ICMP HOST REDIRECT giving the route
+ * We now generate an ICMP HOST REDIRECT giving the route
* we calculated.
*/
if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr)
ip_rt_send_redirect(skb);
-
+
+	/* We are about to mangle the packet. Copy it! */
+ if ((skb = skb_cow(skb, dev2->hard_header_len)) == NULL)
+ return -1;
+ iph = skb->nh.iph;
+ opt = &(IPCB(skb)->opt);
+
+	/* Decrease the ttl after skb_cow() is done */
+ ip_decrease_ttl(iph);
+
/*
* We now may allocate a new buffer, and copy the datagram into it.
* If the indicated interface is up and running, kick it.
@@ -147,14 +153,6 @@ int ip_forward(struct sk_buff *skb)
#ifdef CONFIG_IP_ROUTE_NAT
if (rt->rt_flags & RTCF_NAT) {
- if (skb_headroom(skb) < dev2->hard_header_len || skb_cloned(skb)) {
- struct sk_buff *skb2;
- skb2 = skb_realloc_headroom(skb, (dev2->hard_header_len + 15)&~15);
- kfree_skb(skb);
- if (skb2 == NULL)
- return -1;
- skb = skb2;
- }
if (ip_do_nat(skb)) {
kfree_skb(skb);
return -1;
@@ -243,18 +241,6 @@ skip_call_fw_firewall:
}
#endif
- if (skb_headroom(skb) < dev2->hard_header_len || skb_cloned(skb)) {
- struct sk_buff *skb2;
- skb2 = skb_realloc_headroom(skb, (dev2->hard_header_len + 15)&~15);
- kfree_skb(skb);
-
- if (skb2 == NULL) {
- NETDEBUG(printk(KERN_ERR "\nIP: No memory available for IP forward\n"));
- return -1;
- }
- skb = skb2;
- iph = skb2->nh.iph;
- }
#ifdef CONFIG_FIREWALL
if ((fw_res = call_out_firewall(PF_INET, dev2, iph, NULL,&skb)) < FW_ACCEPT) {
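
The ip_forward() rework moves every mutation behind a single skb_cow() call: the packet is made private (with headroom for the new link-layer header) once, stale pointers into the old data are refreshed, and only then is the TTL decremented. Decrementing the TTL on a buffer still shared with another clone (a sniffer, for instance) would corrupt the other readers' copy. The resulting order, restated from the hunks above with annotations:

/* 1. check first, on the untouched header */
if (iph->ttl <= 1)
	goto too_many_hops;

/* 2. make the buffer private and writable before any mutation */
if ((skb = skb_cow(skb, dev2->hard_header_len)) == NULL)
	return -1;

/* 3. pointers into the old buffer may now be stale: refresh them */
iph = skb->nh.iph;
opt = &(IPCB(skb)->opt);

/* 4. only now mangle the packet */
ip_decrease_ttl(iph);
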
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 9641aaae3..8a0e40f0f 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -5,7 +5,7 @@
*
* The IP fragmentation functionality.
*
- * Version: $Id: ip_fragment.c,v 1.38 1998/06/16 04:38:29 davem Exp $
+ * Version: $Id: ip_fragment.c,v 1.39 1998/08/26 10:35:26 davem Exp $
*
* Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
* Alan Cox <Alan.Cox@linux.org>
diff --git a/net/ipv4/ip_fw.c b/net/ipv4/ip_fw.c
index 57e7761e3..b45457c72 100644
--- a/net/ipv4/ip_fw.c
+++ b/net/ipv4/ip_fw.c
@@ -427,18 +427,15 @@ static void dump_packet(const struct iphdr *ip,
printk("\n");
}
-/* function for checking chain labels for user space. Makes sure that
- * there are no special characters in the string */
+/* function for checking chain labels for user space. */
static int check_label(ip_chainlabel label)
{
unsigned int i;
-
- for (i = 0; i < IP_FW_MAX_LABEL_LENGTH + 1 && label[i]; i++)
- if (label[i] <= ' ')
- return 0;
- if (i == IP_FW_MAX_LABEL_LENGTH+1)
- return 0;
- return 1;
+ /* strlen must be < IP_FW_MAX_LABEL_LENGTH. */
+ for (i = 0; i < IP_FW_MAX_LABEL_LENGTH + 1; i++)
+ if (label[i] == '\0') return 1;
+
+ return 0;
}
/* This function returns a pointer to the first chain with a label
@@ -1098,6 +1095,9 @@ static int create_chain(ip_chainlabel label)
{
struct ip_chain *tmp;
+ if (!check_label(label))
+ return EINVAL;
+
FWC_HAVE_LOCK(fwc_wlocks);
for (tmp = ip_fw_chains; tmp->next; tmp = tmp->next)
if (strcmp(tmp->label,label) == 0)
@@ -1512,14 +1512,14 @@ static int dump_rule(char *buffer,
"%9s " /* Chain name */
"%08lX/%08lX->%08lX/%08lX " /* Source & Destination IPs */
"%.16s " /* Interface */
- "%hX %hX " /* fw_flg and fw_invflg fields */
- "%hu " /* Protocol */
+ "%X %X " /* fw_flg and fw_invflg fields */
+ "%u " /* Protocol */
"%-9u %-9u %-9u %-9u " /* Packet & byte counters */
- "%hu-%hu %hu-%hu " /* Source & Dest port ranges */
+ "%u-%u %u-%u " /* Source & Dest port ranges */
"A%02X X%02X " /* TOS and and xor masks */
"%08X " /* Redirection port */
"%u " /* fw_mark field */
- "%hu " /* output size */
+ "%u " /* output size */
"%9s\n", /* Target */
chainlabel,
ntohl(rule->ipfw.fw_src.s_addr),
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 04fde6120..6a2e4eca5 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -684,7 +684,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct device *dev)
else if (skb->protocol == __constant_htons(ETH_P_IPV6)) {
struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
- if (rt6 && mtu < rt6->u.dst.pmtu && mtu >= 576) {
+ if (rt6 && mtu < rt6->u.dst.pmtu && mtu >= IPV6_MIN_MTU) {
if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) ||
rt6->rt6i_dst.plen == 128) {
rt6->rt6i_flags |= RTF_MODIFIED;
@@ -692,7 +692,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct device *dev)
}
}
- if (mtu >= 576 && mtu < skb->len - tunnel->hlen + gre_hlen) {
+ if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
ip_rt_put(rt);
goto tx_error;
@@ -722,6 +722,8 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct device *dev)
tunnel->recursion--;
return 0;
}
+ if (skb->sk)
+ skb_set_owner_w(new_skb, skb->sk);
dev_kfree_skb(skb);
skb = new_skb;
}
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index f56a90332..e06ad8206 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -5,7 +5,7 @@
*
* The Internet Protocol (IP) module.
*
- * Version: $Id: ip_input.c,v 1.31 1998/05/17 02:19:15 freitag Exp $
+ * Version: $Id: ip_input.c,v 1.33 1998/08/26 12:03:47 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -115,38 +115,31 @@
* 2 of the License, or (at your option) any later version.
*/
-#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/config.h>
+#include <linux/net.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
-#include <linux/proc_fs.h>
-#include <linux/stat.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
-#include <net/tcp.h>
-#include <net/udp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/raw.h>
#include <net/checksum.h>
-#include <linux/igmp.h>
#include <linux/ip_fw.h>
#ifdef CONFIG_IP_MASQUERADE
#include <net/ip_masq.h>
@@ -154,7 +147,6 @@
#include <linux/firewall.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
-#include <linux/ipsec.h>
/*
* SNMP management statistics
@@ -199,6 +191,9 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
return 0;
}
+/*
+ * Process Router Attention IP option
+ */
int ip_call_ra_chain(struct sk_buff *skb)
{
struct ip_ra_chain *ra;
@@ -229,6 +224,9 @@ int ip_call_ra_chain(struct sk_buff *skb)
return 0;
}
+/*
+ * Deliver IP Packets to the higher protocol layers.
+ */
int ip_local_deliver(struct sk_buff *skb)
{
struct iphdr *iph = skb->nh.iph;
@@ -282,9 +280,11 @@ int ip_local_deliver(struct sk_buff *skb)
skb->h.raw = skb->nh.raw + iph->ihl*4;
/*
- * Deliver to raw sockets. This is fun as to avoid copies we want to make no surplus copies.
+ * Deliver to raw sockets. This is fun as to avoid copies we want to make no
+ * surplus copies.
*
* RFC 1122: SHOULD pass TOS value up to the transport layer.
+ * -> It does. And not only TOS, but the whole IP header.
*/
/* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
@@ -309,10 +309,7 @@ int ip_local_deliver(struct sk_buff *skb)
skb1 = skb_clone(skb, GFP_ATOMIC);
if(skb1)
{
- if(ipsec_sk_policy(raw_sk,skb1))
- raw_rcv(raw_sk, skb1);
- else
- kfree_skb(skb1);
+ raw_rcv(raw_sk, skb1);
}
}
raw_sk = sknext;
@@ -372,10 +369,8 @@ int ip_local_deliver(struct sk_buff *skb)
if(raw_sk!=NULL) /* Shift to last raw user */
{
- if(ipsec_sk_policy(raw_sk, skb))
- raw_rcv(raw_sk, skb);
- else
- kfree_skb(skb);
+ raw_rcv(raw_sk, skb);
+
}
else if (!flag) /* Free and report errors */
{
@@ -386,15 +381,16 @@ int ip_local_deliver(struct sk_buff *skb)
return(0);
}
+/*
+ * Main IP Receive routine.
+ */
int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
{
struct iphdr *iph = skb->nh.iph;
- struct ip_options * opt = NULL;
- int err;
/*
- * When interface is in promisc. mode, drop all the crap
- * that it receives, do not truing to analyse it.
+ * When the interface is in promisc. mode, drop all the crap
+ * that it receives, do not try to analyse it.
*/
if (skb->pkt_type == PACKET_OTHERHOST)
goto drop;
@@ -412,24 +408,32 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
* 4. Doesn't have a bogus length
*/
- if (skb->len<sizeof(struct iphdr) || iph->ihl<5 || iph->version != 4
-#ifndef CONFIG_IP_ROUTER
- || ip_fast_csum((unsigned char *)iph, iph->ihl) !=0
-#endif
- || skb->len < ntohs(iph->tot_len))
- goto inhdr_error;
+ if (skb->len < sizeof(struct iphdr))
+ goto inhdr_error;
+ if (iph->ihl < 5 || iph->version != 4 || ip_fast_csum((u8 *)iph, iph->ihl) != 0)
+ goto inhdr_error;
+
+ {
+ __u32 len = ntohs(iph->tot_len);
+ if (skb->len < len)
+ goto inhdr_error;
/*
* Our transport medium may have padded the buffer out. Now we know it
* is IP we can trim to the true length of the frame.
* Note this now means skb->len holds ntohs(iph->tot_len).
*/
- __skb_trim(skb, ntohs(iph->tot_len));
+ __skb_trim(skb, len);
+ }
+
+ /*
+ * Initialise the virtual path cache for the packet. It describes
+ * how the packet travels inside Linux networking.
+ */
if (skb->dst == NULL) {
- err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev);
- if (err)
- goto drop;
+ if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))
+ goto drop;
#ifdef CONFIG_CPU_IS_SLOW
if (net_cpu_congestion > 10 && !(iph->tos&IPTOS_RELIABILITY) &&
IPTOS_PREC(iph->tos) < IPTOS_PREC_INTERNETCONTROL) {
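The restructured checks above validate the header in a strict order: enough captured bytes for a minimal header first, then IHL/version/checksum, then the claimed total length, and only afterwards is link-layer padding trimmed. A user-space restatement of the same order (validate_ipv4 is an illustrative helper; checksum verification is omitted here, the kernel does it with ip_fast_csum()):

	#include <stddef.h>
	#include <netinet/ip.h>
	#include <arpa/inet.h>

	/* Return the datagram's true length, or -1 if the header is bogus.
	 * The caller can then trim any link-layer padding beyond it. */
	static int validate_ipv4(const unsigned char *buf, size_t caplen)
	{
		const struct iphdr *iph = (const struct iphdr *)buf;
		size_t tot_len;

		if (caplen < sizeof(struct iphdr))
			return -1;	/* shorter than a minimal header */
		if (iph->ihl < 5 || iph->version != 4)
			return -1;	/* bad IHL or not IPv4 */
		tot_len = ntohs(iph->tot_len);
		if (caplen < tot_len || tot_len < (size_t)iph->ihl * 4)
			return -1;	/* truncated or self-inconsistent */
		return (int)tot_len;
	}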
@@ -449,6 +453,21 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
#endif
if (iph->ihl > 5) {
+ struct ip_options *opt;
+
+ /* This looks like overkill, because not all
+ IP options require packet mangling.
+ But it is the easiest way for now, especially
+ since the combination of IP options and a running
+ sniffer is an extremely rare condition.
+ --ANK (980813)
+ */
+
+ skb = skb_cow(skb, skb_headroom(skb));
+ if (skb == NULL)
+ return 0;
+ iph = skb->nh.iph;
+
skb->ip_summed = 0;
if (ip_options_compile(NULL, skb))
goto inhdr_error;
@@ -458,8 +477,8 @@ int ip_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
struct in_device *in_dev = dev->ip_ptr;
if (in_dev && !IN_DEV_SOURCE_ROUTE(in_dev)) {
if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
- printk(KERN_INFO "source route option %08lx -> %08lx\n",
- ntohl(iph->saddr), ntohl(iph->daddr));
+ printk(KERN_INFO "source route option %d.%d.%d.%d -> %d.%d.%d.%d\n",
+ NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
goto drop;
}
if (ip_options_rcv_srr(skb))
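The skb_cow() call added above enforces a general rule: take a private, writable copy before mangling a header that a packet socket may still share, and reload any header pointers afterwards because the data may have moved. A user-space analogue of the copy-on-write step (struct buf and buf_cow are hypothetical stand-ins, not kernel API):

	#include <stdlib.h>
	#include <string.h>

	struct buf { int refs; size_t len; unsigned char *data; };

	/* Return a buffer that is safe to write: the original if we are
	 * the sole owner, otherwise a private copy (one shared ref dropped). */
	static struct buf *buf_cow(struct buf *b)
	{
		struct buf *copy;

		if (b->refs == 1)
			return b;
		copy = malloc(sizeof(*copy));
		if (copy == NULL)
			return NULL;
		copy->refs = 1;
		copy->len = b->len;
		copy->data = malloc(b->len);
		if (copy->data == NULL) {
			free(copy);
			return NULL;
		}
		memcpy(copy->data, b->data, b->len);
		b->refs--;
		return copy;
	}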
diff --git a/net/ipv4/ip_nat_dumb.c b/net/ipv4/ip_nat_dumb.c
index 07a7afc23..c48ea9263 100644
--- a/net/ipv4/ip_nat_dumb.c
+++ b/net/ipv4/ip_nat_dumb.c
@@ -5,7 +5,7 @@
*
* Dumb Network Address Translation.
*
- * Version: $Id: ip_nat_dumb.c,v 1.3 1998/03/15 03:31:44 davem Exp $
+ * Version: $Id: ip_nat_dumb.c,v 1.4 1998/08/26 12:03:49 davem Exp $
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
@@ -17,11 +17,12 @@
* Fixes:
* Rani Assaf : A zero checksum is a special case
* only in UDP
+ * Rani Assaf : Added ICMP messages rewriting
+ *
*
* NOTE: It is just working model of real NAT.
*/
-#include <linux/config.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/sched.h>
@@ -36,9 +37,6 @@
#include <linux/udp.h>
#include <linux/firewall.h>
#include <linux/ip_fw.h>
-#ifdef CONFIG_IP_MASQUERADE
-#include <net/ip_masq.h>
-#endif
#include <net/checksum.h>
#include <linux/route.h>
#include <net/route.h>
@@ -68,20 +66,48 @@ ip_do_nat(struct sk_buff *skb)
switch(iph->protocol) {
case IPPROTO_TCP:
- cksum = (u16*)&((struct tcphdr*)(((char*)iph) + iph->ihl*4))->check;
+ cksum = (u16*)&((struct tcphdr*)(((char*)iph) + (iph->ihl<<2)))->check;
+ if ((u8*)(cksum+1) > skb->tail)
+ goto truncated;
check = csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 0, ~(*cksum));
*cksum = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0, ~check);
break;
case IPPROTO_UDP:
- cksum = (u16*)&((struct udphdr*)(((char*)iph) + iph->ihl*4))->check;
+ cksum = (u16*)&((struct udphdr*)(((char*)iph) + (iph->ihl<<2)))->check;
+ if ((u8*)(cksum+1) > skb->tail)
+ goto truncated;
if ((check = *cksum) != 0) {
check = csum_tcpudp_magic(iph->saddr, iph->daddr, 0, 0, ~check);
check = csum_tcpudp_magic(~osaddr, ~odaddr, 0, 0, ~check);
*cksum = check ? : 0xFFFF;
}
+ break;
+ case IPPROTO_ICMP:
+ {
+ struct icmphdr *icmph = (struct icmphdr*)((char*)iph + (iph->ihl<<2));
+ struct iphdr *ciph;
+
+ if ((icmph->type != ICMP_DEST_UNREACH) &&
+ (icmph->type != ICMP_TIME_EXCEEDED) &&
+ (icmph->type != ICMP_PARAMETERPROB)) break;
+
+ ciph = (struct iphdr *) (icmph + 1);
+
+ if ((u8*)(ciph+1) > skb->tail)
+ goto truncated;
+
+ if (rt->rt_flags&RTCF_DNAT && ciph->saddr == odaddr)
+ ciph->saddr = iph->daddr;
+ if (rt->rt_flags&RTCF_SNAT && ciph->daddr == osaddr)
+ ciph->daddr = iph->saddr;
+ break;
+ }
default:
break;
}
}
return 0;
+
+truncated:
+ return -EINVAL;
}
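The new IPPROTO_ICMP case fixes a classic NAT blind spot: ICMP errors quote the IP header of the offending packet, and if only the outer header is rewritten the quoted addresses stay untranslated, so the receiver cannot match the error to any of its connections. A user-space sketch of the same fix-up (nat_fix_icmp is an illustrative name; glibc's struct iphdr/icmphdr; osaddr/odaddr are the pre-translation addresses, as above):

	#include <stdint.h>
	#include <netinet/ip.h>
	#include <netinet/ip_icmp.h>

	/* Rewrite the IP header quoted inside an ICMP error.
	 * end marks the first byte past the captured packet. */
	static void nat_fix_icmp(struct icmphdr *icmph, const uint8_t *end,
				 uint32_t osaddr, uint32_t odaddr,
				 uint32_t new_saddr, uint32_t new_daddr,
				 int did_dnat, int did_snat)
	{
		struct iphdr *ciph = (struct iphdr *)(icmph + 1);

		if (icmph->type != ICMP_DEST_UNREACH &&
		    icmph->type != ICMP_TIME_EXCEEDED &&
		    icmph->type != ICMP_PARAMETERPROB)
			return;		/* only errors quote a header */
		if ((const uint8_t *)(ciph + 1) > end)
			return;		/* quoted header is truncated */
		if (did_dnat && ciph->saddr == odaddr)
			ciph->saddr = new_daddr;
		if (did_snat && ciph->daddr == osaddr)
			ciph->daddr = new_saddr;
	}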
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 3e3674ef7..153c7a391 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -5,7 +5,7 @@
*
* The options processing module for ip.c
*
- * Version: $Id: ip_options.c,v 1.13 1998/02/12 07:43:12 davem Exp $
+ * Version: $Id: ip_options.c,v 1.14 1998/08/26 12:03:51 davem Exp $
*
* Authors: A.N.Kuznetsov
*
@@ -451,7 +451,7 @@ eol:
error:
if (skb) {
- icmp_send(skb, ICMP_PARAMETERPROB, 0, pp_ptr-iph);
+ icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24));
kfree_skb(skb);
}
return -EINVAL;
@@ -579,7 +579,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
if (rt->rt_type == RTN_UNICAST) {
if (!opt->is_strictroute)
return 0;
- icmp_send(skb, ICMP_PARAMETERPROB, 0, 16);
+ icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24));
return -EINVAL;
}
if (rt->rt_type != RTN_LOCAL)
@@ -587,7 +587,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) {
if (srrptr + 3 > srrspace) {
- icmp_send(skb, ICMP_PARAMETERPROB, 0, opt->srr+2);
+ icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24));
return -EINVAL;
}
memcpy(&nexthop, &optptr[srrptr-1], 4);
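All three changes above have the same cause: the last argument of icmp_send() is the raw 32-bit field that follows the ICMP checksum, taken in network byte order, and RFC 792 puts the Parameter Problem pointer in the first byte of that field. The pointer therefore has to be shifted into the most significant byte before htonl(). A runnable check of the encoding:

	#include <stdio.h>
	#include <stdint.h>
	#include <arpa/inet.h>

	int main(void)
	{
		uint8_t ptr = 16;	/* offset of the offending octet */
		uint32_t field = htonl((uint32_t)ptr << 24);

		/* On the wire, the first byte of the field is the pointer. */
		printf("first byte = %u\n", ((const uint8_t *)&field)[0]);
		return 0;	/* prints: first byte = 16 */
	}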
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 0527c1b0b..9250051ab 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -5,7 +5,7 @@
*
* The Internet Protocol (IP) output module.
*
- * Version: $Id: ip_output.c,v 1.59 1998/07/15 05:05:15 davem Exp $
+ * Version: $Id: ip_output.c,v 1.61 1998/08/26 12:03:54 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -171,14 +171,7 @@ int ip_mc_output(struct sk_buff *skb)
*/
if (rt->rt_flags&RTCF_MULTICAST && (!sk || sk->ip_mc_loop)) {
-#ifndef CONFIG_IP_MROUTE
-#if 1
- /* It should never occur. Delete it eventually. --ANK */
- if (!(rt->rt_flags&RTCF_LOCAL) || (dev->flags&IFF_LOOPBACK))
- printk(KERN_DEBUG "ip_mc_output (mc): it should never occur\n");
- else
-#endif
-#else
+#ifdef CONFIG_IP_MROUTE
/* Small optimization: do not loopback not local frames,
which returned after forwarding; they will be dropped
by ip_mr_input in any case.
@@ -199,15 +192,8 @@ int ip_mc_output(struct sk_buff *skb)
}
}
- if (rt->rt_flags&RTCF_BROADCAST) {
-#if 1
- /* It should never occur. Delete it eventually. --ANK */
- if (!(rt->rt_flags&RTCF_LOCAL) || (dev->flags&IFF_LOOPBACK))
- printk(KERN_DEBUG "ip_mc_output (brd): it should never occur!\n");
- else
-#endif
+ if (rt->rt_flags&RTCF_BROADCAST)
dev_loopback_xmit(skb);
- }
return ip_finish_output(skb);
}
@@ -281,8 +267,6 @@ void ip_queue_xmit(struct sk_buff *skb)
iph->ihl = 5;
iph->tos = sk->ip_tos;
iph->frag_off = 0;
- if(sk->ip_pmtudisc == IP_PMTUDISC_WANT && !(rt->u.dst.mxlock & (1 << RTAX_MTU)))
- iph->frag_off |= __constant_htons(IP_DF);
iph->ttl = sk->ip_ttl;
iph->daddr = rt->rt_dst;
iph->saddr = rt->rt_src;
@@ -316,6 +300,8 @@ void ip_queue_xmit(struct sk_buff *skb)
kfree_skb(skb);
if (skb2 == NULL)
return;
+ if (sk)
+ skb_set_owner_w(skb2, sk);
skb = skb2;
iph = skb->nh.iph;
}
@@ -326,6 +312,9 @@ void ip_queue_xmit(struct sk_buff *skb)
if (tot_len > rt->u.dst.pmtu)
goto fragment;
+ if (sk->ip_pmtudisc == IP_PMTUDISC_WANT && !(rt->u.dst.mxlock & (1 << RTAX_MTU)))
+ iph->frag_off |= __constant_htons(IP_DF);
+
/* Add an IP checksum. */
ip_send_check(iph);
@@ -334,7 +323,15 @@ void ip_queue_xmit(struct sk_buff *skb)
return;
fragment:
- if ((iph->frag_off & htons(IP_DF)) != 0) {
+ if (sk->ip_pmtudisc == IP_PMTUDISC_WANT &&
+ !(rt->u.dst.mxlock & (1 << RTAX_MTU)) &&
+ tot_len > (iph->ihl<<2) + sizeof(struct tcphdr)+16) {
+ /* Reject the packet ONLY if TCP might have fragmented
+ it itself, had we been careful enough.
+ The test is not precise (e.g. it does not take SACKs
+ into account). Actually, TCP should do this itself. --ANK (980801)
+ */
+ iph->frag_off |= __constant_htons(IP_DF);
printk(KERN_DEBUG "sending pkt_too_big to self\n");
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(rt->u.dst.pmtu));
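The net effect of the reshuffled DF handling: the DF bit is set only once the packet is known to fit the path MTU, or, when it is oversized, only if TCP itself had room to build smaller segments, in which case the stack sends ICMP_FRAG_NEEDED to itself instead of fragmenting. A condensed restatement of the decision (may_set_df and pmtu_want are illustrative; pmtu_want stands for the IP_PMTUDISC_WANT-with-unlocked-MTU test above):

	/* Decide whether DF may be set on a locally generated packet. */
	static int may_set_df(int pmtu_want, unsigned tot_len, unsigned pmtu,
			      unsigned ip_hlen, unsigned tcp_hlen)
	{
		if (!pmtu_want)
			return 0;	/* discovery off or MTU locked */
		if (tot_len <= pmtu)
			return 1;	/* fits: safe to forbid fragmentation */
		/* Oversized: refuse (and ICMP ourselves) only if TCP could
		 * plausibly have made the segment smaller. */
		return tot_len > ip_hlen + tcp_hlen + 16;
	}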
@@ -701,7 +698,6 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
unsigned int mtu, hlen, left, len;
int offset;
int not_last_frag;
- u16 dont_fragment;
struct rtable *rt = (struct rtable*)skb->dst;
dev = rt->u.dst.dev;
@@ -726,10 +722,14 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
* The protocol doesn't seem to say what to do in the case that the
* frame + options doesn't fit the mtu. As it used to fall down dead
* in this case we were fortunate it didn't happen
+ *
+ * It is impossible, because mtu>=68. --ANK (980801)
*/
+#ifdef CONFIG_NET_PARANOIA
if (mtu<8)
goto fail;
+#endif
/*
* Fragment the datagram.
@@ -739,14 +739,6 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
not_last_frag = iph->frag_off & htons(IP_MF);
/*
- * Nice moment: if DF is set and we are here,
- * it means that packet should be fragmented and
- * DF is set on fragments. If it works,
- * path MTU discovery can be done by ONE segment(!). --ANK
- */
- dont_fragment = iph->frag_off & htons(IP_DF);
-
- /*
* Keep copying data until we run out.
*/
@@ -805,7 +797,7 @@ void ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
* Fill in the new header fields.
*/
iph = skb2->nh.iph;
- iph->frag_off = htons((offset >> 3))|dont_fragment;
+ iph->frag_off = htons((offset >> 3));
/* ANK: dirty, but effective trick. Upgrade options only if
* the segment to be fragmented was THE FIRST (otherwise,
@@ -858,11 +850,6 @@ static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset,
int len;
int hdrflag = 1;
-#if 0
- printk("ip_reply_glue_bits: offset=%u,flen=%u iov[0].l=%u,iov[1].len=%u\n",
- offset,fraglen,dp->iov[0].iov_len,dp->iov[1].iov_len);
-#endif
-
iov = &dp->iov[0];
if (offset >= iov->iov_len) {
offset -= iov->iov_len;
@@ -871,12 +858,6 @@ static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset,
}
len = iov->iov_len - offset;
if (fraglen > len) { /* overlapping. */
-#if 1
- if (iov > &dp->iov[0]) {
- printk("frag too long! (o=%u,fl=%u)\n",offset,fraglen);
- return -1;
- }
-#endif
dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len,
dp->csum);
offset = 0;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 8f712c801..3d8f4fab6 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -5,7 +5,7 @@
*
* The IP to API glue.
*
- * Version: $Id: ip_sockglue.c,v 1.36 1998/07/15 05:05:06 davem Exp $
+ * Version: $Id: ip_sockglue.c,v 1.37 1998/08/26 12:03:57 davem Exp $
*
* Authors: see ip.c
*
@@ -28,6 +28,7 @@
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
+#include <net/tcp.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/igmp.h>
@@ -36,6 +37,9 @@
#include <linux/route.h>
#include <linux/mroute.h>
#include <net/route.h>
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#include <net/transp_v6.h>
+#endif
#include <asm/uaccess.h>
@@ -140,6 +144,10 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc)
struct cmsghdr *cmsg;
for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+ if ((unsigned long)(((char*)cmsg - (char*)msg->msg_control)
+ + cmsg->cmsg_len) > msg->msg_controllen) {
+ return -EINVAL;
+ }
if (cmsg->cmsg_level != SOL_IP)
continue;
switch (cmsg->cmsg_type) {
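The added length check stops a crafted cmsg_len from walking the parser past the end of msg_control. Any consumer of ancillary data wants the same defensive walk; a user-space sketch with the standard CMSG_* macros (cmsgs_sane is an illustrative helper):

	#include <sys/socket.h>
	#include <stddef.h>

	/* Return 1 iff every control message lies inside the buffer. */
	static int cmsgs_sane(struct msghdr *msg)
	{
		struct cmsghdr *cmsg;

		for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL;
		     cmsg = CMSG_NXTHDR(msg, cmsg)) {
			size_t off = (char *)cmsg - (char *)msg->msg_control;

			if (cmsg->cmsg_len < sizeof(struct cmsghdr) ||
			    off + cmsg->cmsg_len > msg->msg_controllen)
				return 0;	/* claims bytes it doesn't have */
		}
		return 1;
	}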
@@ -255,22 +263,30 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt
case IP_OPTIONS:
{
struct ip_options * opt = NULL;
- struct ip_options * old_opt;
if (optlen > 40 || optlen < 0)
return -EINVAL;
err = ip_options_get(&opt, optval, optlen, 1);
if (err)
return err;
- /*
- * ANK: I'm afraid that receive handler may change
- * options from under us.
- */
- cli();
- old_opt = sk->opt;
- sk->opt = opt;
- sti();
- if (old_opt)
- kfree_s(old_opt, sizeof(struct ip_options) + old_opt->optlen);
+ start_bh_atomic();
+ if (sk->type == SOCK_STREAM) {
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ if (sk->family == PF_INET ||
+ ((tcp_connected(sk->state) || sk->state == TCP_SYN_SENT)
+ && sk->daddr != LOOPBACK4_IPV6)) {
+#endif
+ if (opt)
+ tp->ext_header_len = opt->optlen;
+ tcp_sync_mss(sk, tp->pmtu_cookie);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ }
+#endif
+ }
+ opt = xchg(&sk->opt, opt);
+ end_bh_atomic();
+ if (opt)
+ kfree_s(opt, sizeof(struct ip_options) + opt->optlen);
return 0;
}
case IP_PKTINFO:
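Replacing cli()/sti() with start_bh_atomic() plus an xchg() of the pointer means the receive path sees either the old options block or the new one, never a torn update, and the writer frees exactly the block it swapped out. A user-space analogue with C11 atomics (illustrative; the immediate free is only safe when, as in the kernel path above, readers are excluded for the duration):

	#include <stdatomic.h>
	#include <stdlib.h>

	struct opts { int optlen; unsigned char data[40]; };

	static _Atomic(struct opts *) current_opts;

	/* Publish a new options block and reclaim the old one. */
	static void set_opts(struct opts *new_opts)
	{
		struct opts *old = atomic_exchange(&current_opts, new_opts);

		free(old);	/* free(NULL) is a no-op */
	}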
@@ -497,11 +513,11 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *op
{
unsigned char optbuf[sizeof(struct ip_options)+40];
struct ip_options * opt = (struct ip_options*)optbuf;
- cli();
+ start_bh_atomic();
opt->optlen = 0;
if (sk->opt)
memcpy(optbuf, sk->opt, sizeof(struct ip_options)+sk->opt->optlen);
- sti();
+ end_bh_atomic();
if (opt->optlen == 0)
return put_user(0, optlen);
@@ -511,7 +527,7 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char *optval, int *op
if(put_user(len, optlen))
return -EFAULT;
if(copy_to_user(optval, opt->__data, len))
- return -EFAULT;
+ return -EFAULT;
return 0;
}
case IP_PKTINFO:
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index d0b3b5ff2..778ac15c1 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -1,7 +1,7 @@
/*
* Linux NET3: IP/IP protocol decoder.
*
- * Version: $Id: ipip.c,v 1.22 1998/03/08 05:56:27 davem Exp $
+ * Version: $Id: ipip.c,v 1.23 1998/08/26 12:04:00 davem Exp $
*
* Authors:
* Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
@@ -551,6 +551,8 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct device *dev)
tunnel->recursion--;
return 0;
}
+ if (skb->sk)
+ skb_set_owner_w(new_skb, skb->sk);
dev_kfree_skb(skb);
skb = new_skb;
}
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 29fd4b3ad..49cd6daf5 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -9,7 +9,7 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
- * Version: $Id: ipmr.c,v 1.35 1998/05/13 06:23:24 davem Exp $
+ * Version: $Id: ipmr.c,v 1.36 1998/08/26 12:04:03 davem Exp $
*
* Fixes:
* Michael Chastain : Incorrect size of copying.
@@ -55,6 +55,8 @@
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
+#include <linux/ip_fw.h>
+#include <linux/firewall.h>
#include <net/ipip.h>
#include <net/checksum.h>
@@ -1044,7 +1046,12 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c,
dev = rt->u.dst.dev;
- if (skb->len+encap > dev->mtu && (ntohs(iph->frag_off) & IP_DF)) {
+ if (skb->len+encap > rt->u.dst.pmtu /* && (ntohs(iph->frag_off) & IP_DF) */) {
+ /* Do not fragment multicasts. Alas, IPv4 does not
+ allow sending ICMP here, so such packets will
+ disappear into a black hole.
+ */
+
ip_statistics.IpFragFails++;
ip_rt_put(rt);
return;
@@ -1052,11 +1059,6 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c,
encap += dev->hard_header_len;
- if (skb->len+encap > 65534) {
- ip_rt_put(rt);
- return;
- }
-
if (skb_headroom(skb) < encap || skb_cloned(skb) || !last)
skb2 = skb_realloc_headroom(skb, (encap + 15)&~15);
else if (atomic_read(&skb->users) != 1)
@@ -1076,18 +1078,37 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c,
dst_release(skb2->dst);
skb2->dst = &rt->u.dst;
-
iph = skb2->nh.iph;
ip_decrease_ttl(iph);
+#ifdef CONFIG_FIREWALL
+ if (call_fw_firewall(PF_INET, vif->dev, skb2->nh.iph, NULL, &skb2) < FW_ACCEPT) {
+ kfree_skb(skb2);
+ return;
+ }
+ if (call_out_firewall(PF_INET, vif->dev, skb2->nh.iph, NULL, &skb2) < FW_ACCEPT) {
+ kfree_skb(skb2);
+ return;
+ }
+#endif
if (vif->flags & VIFF_TUNNEL) {
ip_encap(skb2, vif->local, vif->remote);
+#ifdef CONFIG_FIREWALL
+ /* Double output firewalling on tunnels: one pass is on the
+ tunnel device, the other on the real device.
+ */
+ if (call_out_firewall(PF_INET, dev, skb2->nh.iph, NULL, &skb2) < FW_ACCEPT) {
+ kfree_skb(skb2);
+ return;
+ }
+#endif
((struct ip_tunnel *)vif->dev->priv)->stat.tx_packets++;
((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb2->len;
}
IPCB(skb2)->flags |= IPSKB_FORWARDED;
+
/*
* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
* not only before forwarding, but after forwarding on all output
@@ -1351,21 +1372,12 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
struct rtnexthop *nhp;
struct device *dev = vif_table[c->mfc_parent].dev;
u8 *b = skb->tail;
-
-#ifdef CONFIG_RTNL_OLD_IFINFO
- if (dev) {
- u8 *o = skb->tail;
- RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
- rtm->rtm_optlen += skb->tail - o;
- }
-#else
struct rtattr *mp_head;
if (dev)
RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
-#endif
for (ct = c->mfc_minvif; ct < c->mfc_maxvif; ct++) {
if (c->mfc_ttls[ct] < 255) {
@@ -1376,15 +1388,10 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
nhp->rtnh_hops = c->mfc_ttls[ct];
nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
nhp->rtnh_len = sizeof(*nhp);
-#ifdef CONFIG_RTNL_OLD_IFINFO
- rtm->rtm_nhs++;
-#endif
}
}
-#ifndef CONFIG_RTNL_OLD_IFINFO
mp_head->rta_type = RTA_MULTIPATH;
mp_head->rta_len = skb->tail - (u8*)mp_head;
-#endif
rtm->rtm_type = RTN_MULTICAST;
return 1;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index b6e06242f..6f06f4345 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -7,7 +7,7 @@
* PROC file system. It is mainly used for debugging and
* statistics.
*
- * Version: $Id: proc.c,v 1.30 1998/04/16 16:29:05 freitag Exp $
+ * Version: $Id: proc.c,v 1.31 1998/07/29 20:09:25 freitag Exp $
*
* Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de>
@@ -357,12 +357,15 @@ int netstat_get_info(char *buffer, char **start, off_t offset, int length, int d
len = sprintf(buffer,
"TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed"
- "EmbryonicRsts\n"
- "TcpExt: %lu %lu %lu %lu\n",
+ " EmbryonicRsts PruneCalled RcvPruned OfoPruned\n"
+ "TcpExt: %lu %lu %lu %lu %lu %lu %lu\n",
net_statistics.SyncookiesSent,
net_statistics.SyncookiesRecv,
net_statistics.SyncookiesFailed,
- net_statistics.EmbryonicRsts);
+ net_statistics.EmbryonicRsts,
+ net_statistics.PruneCalled,
+ net_statistics.RcvPruned,
+ net_statistics.OfoPruned);
if (offset >= len)
{
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 8d8bdab97..e10ddc0dd 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -5,7 +5,7 @@
*
* RAW - implementation of IP "raw" sockets.
*
- * Version: $Id: raw.c,v 1.36 1998/05/08 21:06:29 davem Exp $
+ * Version: $Id: raw.c,v 1.37 1998/08/26 12:04:07 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -152,7 +152,7 @@ void raw_err (struct sock *sk, struct sk_buff *skb)
int type = skb->h.icmph->type;
int code = skb->h.icmph->code;
- if (sk->ip_recverr && !sk->sock_readers) {
+ if (sk->ip_recverr) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2 && sock_queue_err_skb(sk, skb2))
kfree_skb(skb);
@@ -194,10 +194,6 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
skb->h.raw = skb->nh.raw;
- if (sk->sock_readers) {
- __skb_queue_tail(&sk->back_log, skb);
- return 0;
- }
raw_rcv_skb(sk, skb);
return 0;
}
@@ -379,10 +375,33 @@ done:
static void raw_close(struct sock *sk, unsigned long timeout)
{
+ /* Observation: when raw_close is called, processes have
+ no access to the socket anymore, but the net still does.
+ Step one, detach it from networking:
+
+ A. Remove it from the hash tables.
+ */
sk->state = TCP_CLOSE;
+ raw_v4_unhash(sk);
+ /*
+ B. Raw sockets may have direct kernel references. Kill them.
+ */
ip_ra_control(sk, 0, NULL);
+
+ /* At this point the socket cannot receive new packets anymore */
+
+
+ /* But we still have packets pending on the receive
+ queue and, probably, our own packets waiting in device queues.
+ sock_destroy will drain the receive queue, but transmitted
+ packets will delay socket destruction.
+ Set sk->dead=1 in order to prevent wakeups when these
+ packets are freed.
+ */
sk->dead=1;
destroy_sock(sk);
+
+ /* That's all. No races here. */
}
/* This gets rid of all the nasties in af_inet. -DaveM */
@@ -474,14 +493,8 @@ done:
static int raw_init(struct sock *sk)
{
struct raw_opt *tp = &(sk->tp_pinfo.tp_raw4);
- if (sk->num == IPPROTO_ICMP) {
+ if (sk->num == IPPROTO_ICMP)
memset(&tp->filter, 0, sizeof(tp->filter));
-
- /* By default block ECHO and TIMESTAMP requests */
-
- set_bit(ICMP_ECHO, &tp->filter);
- set_bit(ICMP_TIMESTAMP, &tp->filter);
- }
return 0;
}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index e10f65c68..5788342c9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -5,7 +5,7 @@
*
* ROUTE - implementation of the IP router.
*
- * Version: $Id: route.c,v 1.54 1998/07/15 05:05:22 davem Exp $
+ * Version: $Id: route.c,v 1.57 1998/08/26 12:04:09 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -48,6 +48,7 @@
* route.c and rewritten from scratch.
* Andi Kleen : Load-limit warning messages.
* Vitaly E. Lavrov : Transparent proxy revived after year coma.
+ * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -90,6 +91,8 @@
#include <linux/sysctl.h>
#endif
+#define IP_MAX_MTU 0xFFF0
+
#define RT_GC_TIMEOUT (300*HZ)
int ip_rt_min_delay = 2*HZ;
@@ -166,7 +169,7 @@ __u8 ip_tos2prio[16] = {
* Route cache.
*/
-static struct rtable *rt_hash_table[RT_HASH_DIVISOR];
+struct rtable *rt_hash_table[RT_HASH_DIVISOR];
static struct rtable * rt_intern_hash(unsigned hash, struct rtable * rth);
@@ -246,6 +249,13 @@ static __inline__ void rt_free(struct rtable *rt)
dst_free(&rt->u.dst);
}
+static __inline__ int rt_fast_clean(struct rtable *rth)
+{
+ /* Kill broadcast/multicast entries very aggressively, if they
+ collide in the hash table with more useful entries */
+ return ((rth->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST))
+ && rth->key.iif && rth->u.rt_next);
+}
static void rt_check_expire(unsigned long dummy)
{
@@ -255,43 +265,30 @@ static void rt_check_expire(unsigned long dummy)
unsigned long now = jiffies;
for (i=0; i<RT_HASH_DIVISOR/5; i++) {
+ unsigned tmo = ip_rt_gc_timeout;
+
rover = (rover + 1) & (RT_HASH_DIVISOR-1);
rthp = &rt_hash_table[rover];
while ((rth = *rthp) != NULL) {
- struct rtable * rth_next = rth->u.rt_next;
-
/*
* Cleanup aged off entries.
*/
if (!atomic_read(&rth->u.dst.use) &&
- (now - rth->u.dst.lastuse > ip_rt_gc_timeout)) {
- *rthp = rth_next;
-#if RT_CACHE_DEBUG >= 2
- printk("rt_check_expire clean %02x@%08x\n", rover, rth->rt_dst);
-#endif
+ (now - rth->u.dst.lastuse > tmo
+ || rt_fast_clean(rth))) {
+ *rthp = rth->u.rt_next;
rt_free(rth);
continue;
}
- if (!rth_next)
- break;
-
- if ( (long)(rth_next->u.dst.lastuse - rth->u.dst.lastuse) > RT_CACHE_BUBBLE_THRESHOLD ||
- ((long)(rth->u.dst.lastuse - rth_next->u.dst.lastuse) < 0 &&
- atomic_read(&rth->u.dst.refcnt) < atomic_read(&rth_next->u.dst.refcnt))) {
-#if RT_CACHE_DEBUG >= 2
- printk("rt_check_expire bubbled %02x@%08x<->%08x\n", rover, rth->rt_dst, rth_next->rt_dst);
-#endif
- *rthp = rth_next;
- rth->u.rt_next = rth_next->u.rt_next;
- rth_next->u.rt_next = rth;
- rthp = &rth_next->u.rt_next;
- continue;
- }
+ tmo >>= 1;
rthp = &rth->u.rt_next;
}
+
+ if ((jiffies - now) > 0)
+ break;
}
rt_periodic_timer.expires = now + ip_rt_gc_interval;
add_timer(&rt_periodic_timer);
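The rewritten scan drops the old "bubble" reordering for a budgeted walk: the allowed idle time is halved for each entry that survives, so an entry deep in a hash chain must be very recently used to stay, and chains self-limit in length; rt_fast_clean() additionally marks broadcast/multicast clones as the first to go. The policy on a plain list (struct entry, expire_chain and the heap allocation are illustrative assumptions):

	#include <stdlib.h>

	struct entry {
		struct entry *next;
		unsigned long idle;	/* time since last use */
		int in_use;
	};

	static void expire_chain(struct entry **headp, unsigned long tmo)
	{
		struct entry **pp = headp;
		struct entry *e;

		while ((e = *pp) != NULL) {
			if (!e->in_use && e->idle > tmo) {
				*pp = e->next;	/* unlink and reclaim */
				free(e);	/* entries assumed heap-allocated */
				continue;
			}
			tmo >>= 1;	/* each survivor halves the budget */
			pp = &e->next;
		}
	}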
@@ -305,21 +302,14 @@ static void rt_run_flush(unsigned long dummy)
rt_deadline = 0;
for (i=0; i<RT_HASH_DIVISOR; i++) {
- int nr=0;
-
if ((rth = xchg(&rt_hash_table[i], NULL)) == NULL)
continue;
for (; rth; rth=next) {
next = rth->u.rt_next;
- nr++;
rth->u.rt_next = NULL;
rt_free(rth);
}
-#if RT_CACHE_DEBUG >= 2
- if (nr > 0)
- printk("rt_cache_flush: %d@%02x\n", nr, i);
-#endif
}
}
@@ -384,17 +374,23 @@ static int rt_garbage_collect(void)
expire++;
for (i=0; i<RT_HASH_DIVISOR; i++) {
+ unsigned tmo;
if (!rt_hash_table[i])
continue;
+ tmo = expire;
for (rthp=&rt_hash_table[i]; (rth=*rthp); rthp=&rth->u.rt_next) {
if (atomic_read(&rth->u.dst.use) ||
- now - rth->u.dst.lastuse < expire)
+ (now - rth->u.dst.lastuse < tmo && !rt_fast_clean(rth))) {
+ tmo >>= 1;
continue;
+ }
*rthp = rth->u.rt_next;
rth->u.rt_next = NULL;
rt_free(rth);
break;
}
+ if ((jiffies-now)>0)
+ break;
}
last_gc = now;
@@ -412,8 +408,6 @@ static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt)
struct rtable *rth, **rthp;
unsigned long now = jiffies;
- rt->u.dst.priority = rt_tos2priority(rt->key.tos);
-
start_bh_atomic();
rthp = &rt_hash_table[hash];
@@ -793,19 +787,17 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res)
if (fi) {
if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
rt->rt_gateway = FIB_RES_GW(*res);
-#ifndef CONFIG_RTNL_OLD_IFINFO
rt->u.dst.mxlock = fi->fib_metrics[RTAX_LOCK-1];
rt->u.dst.pmtu = fi->fib_mtu;
if (fi->fib_mtu == 0) {
rt->u.dst.pmtu = rt->u.dst.dev->mtu;
+ if (rt->u.dst.pmtu > IP_MAX_MTU)
+ rt->u.dst.pmtu = IP_MAX_MTU;
if (rt->u.dst.mxlock&(1<<RTAX_MTU) &&
rt->rt_gateway != rt->rt_dst &&
rt->u.dst.pmtu > 576)
rt->u.dst.pmtu = 576;
}
-#else
- rt->u.dst.pmtu = fi->fib_mtu ? : rt->u.dst.dev->mtu;
-#endif
rt->u.dst.window= fi->fib_window ? : 0;
rt->u.dst.rtt = fi->fib_rtt ? : TCP_TIMEOUT_INIT;
#ifdef CONFIG_NET_CLS_ROUTE
@@ -813,6 +805,8 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res)
#endif
} else {
rt->u.dst.pmtu = rt->u.dst.dev->mtu;
+ if (rt->u.dst.pmtu > IP_MAX_MTU)
+ rt->u.dst.pmtu = IP_MAX_MTU;
rt->u.dst.window= 0;
rt->u.dst.rtt = TCP_TIMEOUT_INIT;
}
@@ -930,7 +924,7 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
goto martian_source;
- if (daddr == 0xFFFFFFFF)
+ if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
goto brd_input;
/* Accept zero addresses only to limited broadcast;
@@ -991,6 +985,11 @@ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
fib_select_multipath(&key, &res);
#endif
out_dev = FIB_RES_DEV(res)->ip_ptr;
+ if (out_dev == NULL) {
+ if (net_ratelimit())
+ printk(KERN_CRIT "Bug in ip_route_input_slow(). Please, report\n");
+ return -EINVAL;
+ }
err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst);
if (err < 0)
@@ -1312,15 +1311,14 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int
tables are looked up with only one purpose:
to catch if destination is gatewayed, rather than
direct. Moreover, if MSG_DONTROUTE is set,
- we send packet, no matter of routing tables
- of ifaddr state. --ANK
+ we send packet, ignoring both routing tables
+ and ifaddr state. --ANK
We could make it even if oif is unknown,
likely IPv6, but we do not.
*/
- printk(KERN_DEBUG "Dest not on link. Forcing...\n");
if (key.src == 0)
key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK);
goto make_route;
@@ -1475,7 +1473,7 @@ int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
#ifdef CONFIG_RTNETLINK
-static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int nowait)
+static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait)
{
struct rtable *rt = (struct rtable*)skb->dst;
struct rtmsg *r;
@@ -1485,11 +1483,7 @@ static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int
#ifdef CONFIG_IP_MROUTE
struct rtattr *eptr;
#endif
-#ifdef CONFIG_RTNL_OLD_IFINFO
- unsigned char *o;
-#else
struct rtattr *mx;
-#endif
nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
r = NLMSG_DATA(nlh);
@@ -1503,11 +1497,6 @@ static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int
r->rtm_scope = RT_SCOPE_UNIVERSE;
r->rtm_protocol = RTPROT_UNSPEC;
r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED;
-#ifdef CONFIG_RTNL_OLD_IFINFO
- r->rtm_nhs = 0;
-
- o = skb->tail;
-#endif
RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
if (rt->key.src) {
r->rtm_src_len = 32;
@@ -1521,11 +1510,6 @@ static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int
RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
if (rt->rt_dst != rt->rt_gateway)
RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
-#ifdef CONFIG_RTNL_OLD_IFINFO
- RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &rt->u.dst.pmtu);
- RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &rt->u.dst.window);
- RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &rt->u.dst.rtt);
-#else
mx = (struct rtattr*)skb->tail;
RTA_PUT(skb, RTA_METRICS, 0, NULL);
if (rt->u.dst.mxlock)
@@ -1539,7 +1523,6 @@ static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int
mx->rta_len = skb->tail - (u8*)mx;
if (mx->rta_len == RTA_LENGTH(0))
skb_trim(skb, (u8*)mx - skb->data);
-#endif
ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
ci.rta_used = atomic_read(&rt->u.dst.refcnt);
ci.rta_clntref = atomic_read(&rt->u.dst.use);
@@ -1549,9 +1532,6 @@ static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int
eptr = (struct rtattr*)skb->tail;
#endif
RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
-#ifdef CONFIG_RTNL_OLD_IFINFO
- r->rtm_optlen = skb->tail - o;
-#endif
if (rt->key.iif) {
#ifdef CONFIG_IP_MROUTE
u32 dst = rt->rt_dst;
@@ -1573,9 +1553,6 @@ static int rt_fill_info(struct sk_buff *skb, pid_t pid, u32 seq, int event, int
#endif
{
RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->key.iif);
-#ifdef CONFIG_RTNL_OLD_IFINFO
- r->rtm_optlen = skb->tail - o;
-#endif
}
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3d6f188e7..30a0b0dd6 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp.c,v 1.116 1998/07/26 03:06:54 davem Exp $
+ * Version: $Id: tcp.c,v 1.119 1998/08/26 12:04:14 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -201,6 +201,7 @@
* tcp_do_sendmsg to avoid burstiness.
* Eric Schenk : Fix fast close down bug with
* shutdown() followed by close().
+ * Andi Kleen : Make poll agree with SIGIO
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -383,13 +384,14 @@
*
* ICMP messages (4.2.3.9)
* MUST act on ICMP errors. (does)
- * MUST slow transmission upon receipt of a Source Quench. (does)
+ * MUST slow transmission upon receipt of a Source Quench. (doesn't anymore
+ * because the IETF has deprecated it; it can still be turned on)
* MUST NOT abort connection upon receipt of soft Destination
* Unreachables (0, 1, 5), Time Exceededs and Parameter
* Problems. (doesn't)
* SHOULD report soft Destination Unreachables etc. to the
- * application. (does, but may drop them in the ICMP error handler
- * during an accept())
+ * application. (does, except during SYN_RECV and may drop messages
+ * in some rare cases before accept() - ICMP is unreliable)
* SHOULD abort connection upon receipt of hard Destination Unreachable
* messages (2, 3, 4). (does, but see above)
*
@@ -397,7 +399,7 @@
* MUST reject as an error OPEN for invalid remote IP address. (does)
* MUST ignore SYN with invalid source address. (does)
* MUST silently discard incoming SYN for broadcast/multicast
- * address. (I'm not sure if it does. Someone should check this.)
+ * address. (does)
*
* Asynchronous Reports (4.2.4.1)
* MUST provide mechanism for reporting soft errors to application
@@ -537,6 +539,21 @@ static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
}
/*
+ * Compute minimal free write space needed to queue new packets.
+ */
+static inline int tcp_min_write_space(struct sock *sk, struct tcp_opt *tp)
+{
+ int space;
+#if 1 /* This needs benchmarking and real world tests */
+ space = max(tp->mss_cache + 128, MIN_WRITE_SPACE);
+#else /* 2.0 way */
+ /* More than half of the socket queue free? */
+ space = atomic_read(&sk->wmem_alloc) / 2;
+#endif
+ return space;
+}
+
+/*
* Wait for a TCP event.
*
* Note that we don't need to lock the socket, as the upper poll layers
@@ -556,36 +573,56 @@ unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
mask = 0;
if (sk->err)
mask = POLLERR;
+
+ /*
+ * POLLHUP is certainly not done right. But poll() doesn't
+ * have a notion of HUP in just one direction, and for a
+ * socket the read side is more interesting.
+ *
+ * Some poll() documentation says that POLLHUP is incompatible
+ * with the POLLOUT/POLLWR flags, so somebody should check all
+ * this. But be careful: it tends to be safer to return too many
+ * bits than too few, and you can easily break real applications
+ * if you don't tell them that something has hung up!
+ *
+ * Check-me.
+ */
+ if (sk->shutdown & RCV_SHUTDOWN)
+ mask |= POLLHUP;
+
/* Connected? */
if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
- int space;
-
- if (sk->shutdown & RCV_SHUTDOWN)
- mask |= POLLHUP;
-
if ((tp->rcv_nxt != tp->copied_seq) &&
(tp->urg_seq != tp->copied_seq ||
tp->rcv_nxt != tp->copied_seq+1 ||
sk->urginline || !tp->urg_data))
mask |= POLLIN | POLLRDNORM;
-#if 1 /* This needs benchmarking and real world tests */
- space = (sk->dst_cache ? sk->dst_cache->pmtu : sk->mss) + 128;
- if (space < 2048) /* XXX */
- space = 2048;
-#else /* 2.0 way */
- /* More than half of the socket queue free? */
- space = atomic_read(&sk->wmem_alloc) / 2;
-#endif
/* Always wake the user up when an error occurred */
- if (sock_wspace(sk) >= space || sk->err)
+ if (sock_wspace(sk) >= tcp_min_write_space(sk, tp) || sk->err)
mask |= POLLOUT | POLLWRNORM;
if (tp->urg_data & URG_VALID)
- mask |= POLLPRI;
+ mask |= POLLPRI;
}
return mask;
}
+/*
+ * Socket write_space callback.
+ * This (or rather the sock_wake_async) should agree with poll.
+ */
+void tcp_write_space(struct sock *sk)
+{
+ if (sk->dead)
+ return;
+
+ wake_up_interruptible(sk->sleep);
+ if (sock_wspace(sk) >=
+ tcp_min_write_space(sk, &(sk->tp_pinfo.af_tcp)))
+ sock_wake_async(sk->socket, 2);
+}
+
+
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
int answ;
@@ -707,7 +744,7 @@ int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags)
int copied = 0;
/* Verify that the socket is locked */
- if (!sk->sock_readers)
+ if (!atomic_read(&sk->sock_readers))
printk("tcp_do_sendmsg: socket not locked!\n");
/* Wait for a connection to finish. */
@@ -1025,7 +1062,7 @@ static void cleanup_rbuf(struct sock *sk, int copied)
* which don't advertize a larger window.
*/
if((copied >= rcv_window_now) &&
- ((rcv_window_now + sk->mss) <= tp->window_clamp))
+ ((rcv_window_now + tp->mss_cache) <= tp->window_clamp))
tcp_read_wakeup(sk);
}
}
@@ -1389,7 +1426,7 @@ void tcp_close(struct sock *sk, unsigned long timeout)
* Check whether the socket is locked ... supposedly
* it's impossible to tcp_close() a locked socket.
*/
- if (sk->sock_readers)
+ if (atomic_read(&sk->sock_readers))
printk("tcp_close: socket already locked!\n");
/* We need to grab some memory, and put together a FIN,
@@ -1543,16 +1580,18 @@ struct sock *tcp_accept(struct sock *sk, int flags)
tcp_synq_unlink(tp, req, prev);
newsk = req->sk;
+ req->class->destructor(req);
tcp_openreq_free(req);
sk->ack_backlog--;
- /* FIXME: need to check here if newsk has already
- * an soft_err or err set.
- * We have two options here then: reply (this behaviour matches
- * Solaris) or return the error to the application (old Linux)
- */
+ /*
+ * This does not pass any errors already set on the new socket
+ * to the user, but they will be returned on the first socket operation
+ * after the accept.
+ */
+
error = 0;
- out:
+out:
release_sock(sk);
sk->err = error;
return newsk;
@@ -1586,7 +1625,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
*/
if(val<1||val>MAX_WINDOW)
return -EINVAL;
- sk->user_mss=val;
+ tp->user_mss=val;
return 0;
case TCP_NODELAY:
sk->nonagle=(val==0)?0:1;
@@ -1614,7 +1653,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
switch(optname) {
case TCP_MAXSEG:
- val=sk->user_mss;
+ val=tp->user_mss;
break;
case TCP_NODELAY:
val=sk->nonagle;
@@ -1640,7 +1679,7 @@ void tcp_set_keepalive(struct sock *sk, int val)
extern void __skb_cb_too_small_for_tcp(int, int);
-__initfunc(void tcp_init(void))
+void __init tcp_init(void)
{
struct sk_buff *skb = NULL;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a4ad2dc3c..6a3ae17bf 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_input.c,v 1.121 1998/07/15 04:39:12 davem Exp $
+ * Version: $Id: tcp_input.c,v 1.127 1998/08/26 12:04:20 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -50,6 +50,9 @@
* Andi Kleen: Make sure we never ack data there is not
* enough room for. Also make this condition
* a fatal error if it might still happen.
+ * Andi Kleen: Add tcp_measure_rcv_mss to make
+ * connections with MSS<min(MTU,ann. MSS)
+ * work without delayed acks.
*/
#include <linux/config.h>
@@ -214,7 +217,7 @@ extern __inline__ void tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp
#define PAWS_24DAYS (HZ * 60 * 60 * 24 * 24)
-extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, __u16 len)
+extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, unsigned len)
{
/* ts_recent must be younger than 24 days */
return (((jiffies - tp->ts_recent_stamp) >= PAWS_24DAYS) ||
@@ -289,7 +292,7 @@ static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp,
/* The retransmission queue is always in order, so
* we can short-circuit the walk early.
*/
- if(!before(start_seq, TCP_SKB_CB(skb)->end_seq))
+ if(after(TCP_SKB_CB(skb)->end_seq, end_seq))
break;
/* We play conservative, we don't allow SACKS to partially
@@ -346,9 +349,11 @@ void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, i
switch(opcode) {
case TCPOPT_MSS:
if(opsize==TCPOLEN_MSS && th->syn) {
- tp->in_mss = ntohs(*(__u16 *)ptr);
- if (tp->in_mss == 0)
- tp->in_mss = 536;
+ u16 in_mss = ntohs(*(__u16 *)ptr);
+ if (in_mss == 0)
+ in_mss = 536;
+ if (tp->mss_clamp > in_mss)
+ tp->mss_clamp = in_mss;
}
break;
case TCPOPT_WINDOW:
@@ -466,10 +471,9 @@ static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
* to one half the current congestion window, but no less
* than two segments. Retransmit the missing segment.
*/
+ tp->dup_acks++;
if (tp->high_seq == 0 || after(ack, tp->high_seq)) {
- tp->dup_acks++;
if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) {
- tp->dup_acks++;
tp->snd_ssthresh = max(tp->snd_cwnd >> (TCP_CWND_SHIFT + 1), 2);
tp->snd_cwnd = (tp->snd_ssthresh + 3) << TCP_CWND_SHIFT;
tp->high_seq = tp->snd_nxt;
@@ -863,7 +867,7 @@ void tcp_timewait_kill(struct tcp_tw_bucket *tw)
* reconnects and SYN/RST bits being set in the TCP header.
*/
int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
- struct tcphdr *th, void *opt, __u16 len)
+ struct tcphdr *th, unsigned len)
{
/* RFC 1122:
* "When a connection is [...] on TIME-WAIT state [...]
@@ -893,7 +897,7 @@ int tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
return 0;
skb_set_owner_r(skb, sk);
af_specific = sk->tp_pinfo.af_tcp.af_specific;
- if(af_specific->conn_request(sk, skb, opt, isn) < 0)
+ if(af_specific->conn_request(sk, skb, isn) < 0)
return 1; /* Toss a reset back. */
return 0; /* Discard the frame. */
}
@@ -1309,7 +1313,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
tp->delayed_acks++;
/* Tiny-grams with PSH set make us ACK quickly. */
- if(skb->h.th->psh && (skb->len < (sk->mss >> 1)))
+ if(skb->h.th->psh && (skb->len < (tp->mss_cache >> 1)))
tp->ato = HZ/50;
}
/* This may have eaten into a SACK block. */
@@ -1429,7 +1433,6 @@ static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
}
}
- /* We no longer have anyone receiving data on this connection. */
tcp_data_queue(sk, skb);
if (before(tp->rcv_nxt, tp->copied_seq)) {
@@ -1464,6 +1467,26 @@ static void tcp_data_snd_check(struct sock *sk)
}
}
+/*
+ * Adapt the MSS value used to make delayed ack decision to the
+ * real world.
+ */
+static __inline__ void tcp_measure_rcv_mss(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ unsigned int len = skb->len, lss;
+
+ if (len > tp->rcv_mss)
+ tp->rcv_mss = len;
+ lss = tp->last_seg_size;
+ tp->last_seg_size = 0;
+ if (len >= 536) {
+ if (len == lss)
+ tp->rcv_mss = len;
+ tp->last_seg_size = len;
+ }
+}
+
/*
* Check if sending an ack is needed.
*/
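tcp_measure_rcv_mss() lets the delayed-ACK machinery track the sender's real segment size instead of the advertised MSS: rcv_mss grows to the largest segment seen, and a full-sized (at least 536 byte) segment repeated twice in a row locks that size in, which also lets rcv_mss adapt downwards. A standalone trace of the heuristic (the initial 536 is just a demo value):

	#include <stdio.h>

	static unsigned rcv_mss = 536, last_seg_size;

	static void measure(unsigned len)
	{
		unsigned lss = last_seg_size;

		if (len > rcv_mss)
			rcv_mss = len;
		last_seg_size = 0;
		if (len >= 536) {
			if (len == lss)
				rcv_mss = len;	/* two equal full-sized segments */
			last_seg_size = len;
		}
	}

	int main(void)
	{
		unsigned segs[] = { 1460, 1000, 1000, 512 };
		int i;

		for (i = 0; i < 4; i++) {
			measure(segs[i]);
			printf("seg=%u -> rcv_mss=%u\n", segs[i], rcv_mss);
		}
		return 0;	/* rcv_mss ends at 1000, adapted downwards */
	}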
@@ -1486,7 +1509,7 @@ static __inline__ void __tcp_ack_snd_check(struct sock *sk)
*/
/* Two full frames received or... */
- if (((tp->rcv_nxt - tp->rcv_wup) >= sk->mss * MAX_DELAY_ACK) ||
+ if (((tp->rcv_nxt - tp->rcv_wup) >= tp->rcv_mss * MAX_DELAY_ACK) ||
/* We will update the window "significantly" or... */
tcp_raise_window(sk) ||
/* We entered "quick ACK" mode or... */
@@ -1595,11 +1618,14 @@ static int prune_queue(struct sock *sk)
SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
+ net_statistics.PruneCalled++;
+
/* First Clean the out_of_order queue. */
/* Start with the end because there are probably the least
* useful packets (crossing fingers).
*/
while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue))) {
+ net_statistics.OfoPruned += skb->len;
kfree_skb(skb);
if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf)
return 0;
@@ -1620,6 +1646,9 @@ static int prune_queue(struct sock *sk)
tp->last_ack_sent);
return -1;
}
+
+ net_statistics.RcvPruned += skb->len;
+
__skb_unlink(skb, skb->list);
tp->rcv_nxt = TCP_SKB_CB(skb)->seq;
SOCK_DEBUG(sk, "prune_queue: removing %x-%x (c=%x)\n",
@@ -1633,7 +1662,7 @@ static int prune_queue(struct sock *sk)
}
int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
- struct tcphdr *th, __u16 len)
+ struct tcphdr *th, unsigned len)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
int queued = 0;
@@ -1682,6 +1711,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
*/
if (flg == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+ if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
+ tcp_send_ack(sk);
+ goto discard;
+ }
if (len <= th->doff*4) {
/* Bulk data transfer: sender */
if (len == th->doff*4) {
@@ -1696,15 +1729,12 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
}
} else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una) {
/* Bulk data transfer: receiver */
- if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) {
- /* We must send an ACK for zero window probes. */
- if (!before(TCP_SKB_CB(skb)->seq,
- tp->rcv_wup + tp->rcv_wnd))
- tcp_send_ack(sk);
+ if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf)
goto discard;
- }
- skb_pull(skb,th->doff*4);
+ __skb_pull(skb,th->doff*4);
+
+ tcp_measure_rcv_mss(sk, skb);
/* DO NOT notify forward progress here.
* It saves dozen of CPU instructions in fast path. --ANK
@@ -1719,7 +1749,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tcp_delack_estimator(tp);
/* Tiny-grams with PSH set make us ACK quickly. */
- if(th->psh && (skb->len < (sk->mss >> 1)))
+ if(th->psh && (skb->len < (tp->mss_cache >> 1)))
tp->ato = HZ/50;
tp->delayed_acks++;
@@ -1767,6 +1797,25 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
/* step 7: process the segment text */
queued = tcp_data(skb, sk, len);
+ /* This must be after tcp_data() does the skb_pull() to
+ * remove the header size from skb->len.
+ *
+ * Dave!!! The phrase above (and everything about rcv_mss) has
+ * nothing to do with reality. rcv_mss must measure TOTAL
+ * size, including sacks, IP options etc. Hence, measure_rcv_mss
+ * must occur before pulling etc., otherwise it will flap
+ * like hell. Even putting it before tcp_data is wrong;
+ * it should use skb->tail - skb->nh.raw instead.
+ * --ANK (980805)
+ *
+ * BTW I broke it. Now all TCP options are handled equally
+ * in mss_clamp calculations (i.e. ignored, rfc1122),
+ * and mss_cache does include all of them (i.e. tstamps)
+ * except for SACKs, to calculate the effective mss faster.
+ * --ANK (980805)
+ */
+ tcp_measure_rcv_mss(sk, skb);
+
/* Be careful, tcp_data() may have put this into TIME_WAIT. */
if(sk->state != TCP_CLOSE) {
tcp_data_snd_check(sk);
@@ -1853,7 +1902,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
*/
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
- struct tcphdr *th, void *opt, __u16 len)
+ struct tcphdr *th, unsigned len)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
int queued = 0;
@@ -1868,7 +1917,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
return 1;
if(th->syn) {
- if(tp->af_specific->conn_request(sk, skb, opt, 0) < 0)
+ if(tp->af_specific->conn_request(sk, skb, 0) < 0)
return 1;
/* Now we have several options: In theory there is
@@ -1961,28 +2010,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* Can't be earlier, doff would be wrong. */
tcp_send_ack(sk);
- /* Check for the case where we tried to advertise
- * a window including timestamp options, but did not
- * end up using them for this connection.
- */
- if((tp->tstamp_ok == 0) && sysctl_tcp_timestamps)
- sk->mss += TCPOLEN_TSTAMP_ALIGNED;
-
- /* Now limit it if the other end negotiated a smaller
- * value.
- */
- if (tp->in_mss) {
- int real_mss = tp->in_mss;
-
- /* We store MSS locally with the timestamp bytes
- * subtracted, TCP's advertise it with them
- * included. Account for this fact.
- */
- if(tp->tstamp_ok)
- real_mss -= TCPOLEN_TSTAMP_ALIGNED;
- sk->mss = min(sk->mss, real_mss);
- }
-
sk->dport = th->source;
tp->copied_seq = tp->rcv_nxt;
@@ -1990,9 +2017,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
sk->state_change(sk);
sock_wake_async(sk->socket, 0);
}
-
- /* Drop through step 6 */
- goto step6;
} else {
if(th->syn && !th->rst) {
/* The previous version of the code
@@ -2017,11 +2041,20 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
tcp_send_synack(sk);
- goto discard;
- }
-
+ } else
+ break;
}
- break;
+
+ /* tp->tcp_header_len and tp->mss_clamp
+ probably changed, synchronize mss.
+ */
+ tcp_sync_mss(sk, tp->pmtu_cookie);
+ tp->rcv_mss = tp->mss_cache;
+
+ if (sk->state == TCP_SYN_RECV)
+ goto discard;
+
+ goto step6;
}
/* Parse the tcp_options present on this header.
@@ -2167,6 +2200,11 @@ step6:
case TCP_ESTABLISHED:
queued = tcp_data(skb, sk, len);
+
+ /* This must be after tcp_data() does the skb_pull() to
+ * remove the header size from skb->len.
+ */
+ tcp_measure_rcv_mss(sk, skb);
break;
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e0ecdbfa5..bf3fb243b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_ipv4.c,v 1.150 1998/07/28 17:45:07 freitag Exp $
+ * Version: $Id: tcp_ipv4.c,v 1.157 1998/08/28 00:27:47 davem Exp $
*
* IPv4 specific functions
*
@@ -44,6 +44,7 @@
* Andi Kleen: various fixes.
* Vitaly E. Lavrov : Transparent proxy revived after year coma.
* Andi Kleen : Fix new listen.
+ * Andi Kleen : Fix accept error reporting.
*/
#include <linux/config.h>
@@ -140,7 +141,8 @@ void tcp_bucket_unlock(struct sock *sk)
if(tb->port == snum) {
if(tb->owners == NULL &&
(tb->flags & TCPB_FLAG_LOCKED)) {
- tb->flags &= ~TCPB_FLAG_LOCKED;
+ tb->flags &= ~(TCPB_FLAG_LOCKED |
+ TCPB_FLAG_FASTREUSE);
tcp_inc_slow_timer(TCP_SLT_BUCKETGC);
}
break;
@@ -208,7 +210,7 @@ static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum)
/* We must walk the whole port owner list in this case. -DaveM */
for(sk2 = tb->owners; sk2; sk2 = sk2->bind_next) {
- if(sk->bound_dev_if == sk2->bound_dev_if) {
+ if (sk->bound_dev_if == sk2->bound_dev_if) {
if(!sk_reuse || !sk2->reuse || sk2->state == TCP_LISTEN) {
if(!sk2->rcv_saddr ||
!sk->rcv_saddr ||
@@ -223,16 +225,33 @@ static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum)
}
if(result == 0) {
if(tb == NULL) {
- if(tcp_bucket_create(snum) == NULL)
+ if((tb = tcp_bucket_create(snum)) == NULL)
result = 1;
+ else if (sk->reuse && sk->state != TCP_LISTEN)
+ tb->flags |= TCPB_FLAG_FASTREUSE;
} else {
/* It could be pending garbage collection, this
* kills the race and prevents it from disappearing
* out from under us by the time we use it. -DaveM
*/
- if(tb->owners == NULL && !(tb->flags & TCPB_FLAG_LOCKED)) {
- tb->flags = TCPB_FLAG_LOCKED;
- tcp_dec_slow_timer(TCP_SLT_BUCKETGC);
+ if(tb->owners == NULL) {
+ if (!(tb->flags & TCPB_FLAG_LOCKED)) {
+ tb->flags = (TCPB_FLAG_LOCKED |
+ ((sk->reuse &&
+ sk->state != TCP_LISTEN) ?
+ TCPB_FLAG_FASTREUSE : 0));
+ tcp_dec_slow_timer(TCP_SLT_BUCKETGC);
+ } else if (!(tb->flags & TCPB_FLAG_GOODSOCKNUM)) {
+ /* Someone is in between the bind
+ * and the actual connect or listen.
+ * See if it was a legitimate reuse
+ * and we are one as well; else punt.
+ */
+ if (sk->reuse == 0 ||
+ !(tb->flags & TCPB_FLAG_FASTREUSE))
+ result = 1;
+ } else
+ tb->flags &= ~TCPB_FLAG_GOODSOCKNUM;
}
}
}
@@ -264,8 +283,11 @@ unsigned short tcp_good_socknum(void)
next:
} while(--remaining > 0);
tcp_port_rover = rover;
- if((remaining <= 0) || (tcp_bucket_create(rover) == NULL))
+ tb = NULL;
+ if((remaining <= 0) || ((tb = tcp_bucket_create(rover)) == NULL))
rover = 0;
+ if (tb != NULL)
+ tb->flags |= TCPB_FLAG_GOODSOCKNUM;
SOCKHASH_UNLOCK();
return rover;
@@ -543,8 +565,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
struct sk_buff *buff;
struct rtable *rt;
+ u32 daddr, nexthop;
int tmp;
- int mss;
if (sk->state != TCP_CLOSE)
return(-EISCONN);
@@ -564,7 +586,14 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
printk(KERN_DEBUG "%s forgot to set AF_INET in " __FUNCTION__ "\n", current->comm);
}
- tmp = ip_route_connect(&rt, usin->sin_addr.s_addr, sk->saddr,
+ nexthop = daddr = usin->sin_addr.s_addr;
+ if (sk->opt && sk->opt->srr) {
+ if (daddr == 0)
+ return -EINVAL;
+ nexthop = sk->opt->faddr;
+ }
+
+ tmp = ip_route_connect(&rt, nexthop, sk->saddr,
RT_TOS(sk->ip_tos)|sk->localroute, sk->bound_dev_if);
if (tmp < 0)
return tmp;
@@ -592,6 +621,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
*/
sk->dport = usin->sin_port;
sk->daddr = rt->rt_dst;
+ if (sk->opt && sk->opt->srr)
+ sk->daddr = daddr;
if (!sk->saddr)
sk->saddr = rt->rt_src;
sk->rcv_saddr = sk->saddr;
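The connect() path now honours source-route options set via IP_OPTIONS: the route lookup is aimed at the first hop recorded in the options (sk->opt->faddr) while the socket keeps the caller's final destination, which is why sk->daddr is set from rt->rt_dst and then put back to daddr when SRR is active. The choice of lookup target, as a standalone sketch (pick_nexthop is an illustrative name):

	#include <stdint.h>
	#include <errno.h>

	/* Pick the route-lookup target: the first SRR hop when the socket
	 * carries source-route options, otherwise the destination itself. */
	static int pick_nexthop(uint32_t dst, int has_srr, uint32_t first_hop,
				uint32_t *nexthop)
	{
		if (has_srr) {
			if (dst == 0)
				return -EINVAL;	/* SRR needs a destination */
			*nexthop = first_hop;
		} else {
			*nexthop = dst;
		}
		return 0;
	}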
@@ -601,22 +632,28 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
return -EADDRNOTAVAIL;
}
- sk->mtu = rt->u.dst.pmtu;
- if ((sk->ip_pmtudisc == IP_PMTUDISC_DONT ||
- (sk->ip_pmtudisc == IP_PMTUDISC_WANT &&
- (rt->u.dst.mxlock&(1<<RTAX_MTU)))) &&
- rt->u.dst.pmtu > 576 && rt->rt_dst != rt->rt_gateway)
- sk->mtu = 576;
+ tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
+ sk->sport, usin->sin_port);
- if (sk->mtu < 64)
- sk->mtu = 64; /* Sanity limit */
+ tp->ext_header_len = 0;
+ if (sk->opt)
+ tp->ext_header_len = sk->opt->optlen;
- mss = sk->mtu - sizeof(struct iphdr);
+ /* Reset mss clamp */
+ tp->mss_clamp = ~0;
- tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
- sk->sport, usin->sin_port);
+ if ((sk->ip_pmtudisc == IP_PMTUDISC_DONT ||
+ (sk->ip_pmtudisc == IP_PMTUDISC_WANT &&
+ (rt->u.dst.mxlock&(1<<RTAX_MTU)))) &&
+ rt->u.dst.pmtu > 576 && rt->rt_dst != rt->rt_gateway) {
+ /* Clamp mss at the maximum of 536 and user_mss.
+ Presumably the user set user_mss to override the
+ tiny segment size in the gatewayed case.
+ */
+ tp->mss_clamp = max(tp->user_mss, 536);
+ }
- tcp_connect(sk, buff, mss);
+ tcp_connect(sk, buff, rt->u.dst.pmtu);
return 0;
}
@@ -694,7 +731,6 @@ static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
*/
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip)
{
- int new_mtu;
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
/* Don't interested in TCP_LISTEN and open_requests (SYN-ACKs
@@ -711,21 +747,19 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip)
* route, but I think that's acceptable.
*/
if (sk->ip_pmtudisc != IP_PMTUDISC_DONT && sk->dst_cache) {
- new_mtu = sk->dst_cache->pmtu -
- (ip->ihl<<2) - tp->tcp_header_len;
- if (new_mtu < sk->mss && new_mtu > 0) {
- sk->mss = new_mtu;
+ if (tp->pmtu_cookie > sk->dst_cache->pmtu &&
+ !atomic_read(&sk->sock_readers)) {
+ lock_sock(sk);
+ tcp_sync_mss(sk, sk->dst_cache->pmtu);
+
/* Resend the TCP packet because it's
* clear that the old packet has been
* dropped. This is the new "fast" path mtu
* discovery.
*/
- if (!sk->sock_readers) {
- lock_sock(sk);
- tcp_simple_retransmit(sk);
- release_sock(sk);
- } /* else let the usual retransmit timer handle it */
- }
+ tcp_simple_retransmit(sk);
+ release_sock(sk);
+ } /* else let the usual retransmit timer handle it */
}
}
@@ -813,7 +847,7 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
/* Prevent race conditions with accept() -
* ICMP is unreliable.
*/
- if (sk->sock_readers) {
+ if (atomic_read(&sk->sock_readers)) {
/* XXX: add a counter here to profile this.
* If too many ICMPs get dropped on busy
* servers this needs to be solved differently.
@@ -821,8 +855,15 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
return;
}
+ /* The final ACK of the handshake should already be
+ * handled in the new socket context, not here.
+ * Strictly speaking - an ICMP error for the final
+ * ACK should set the opening flag, but that is too
+ * complicated right now.
+ */
if (!th->syn && !th->ack)
return;
+
req = tcp_v4_search_req(tp, iph, th, &prev);
if (!req)
return;
@@ -833,17 +874,33 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
seq, req->snt_isn);
return;
}
- if (req->sk) { /* not yet accept()ed */
- sk = req->sk; /* report error in accept */
+ if (req->sk) {
+ /*
+ * Already in ESTABLISHED, and a big socket has been created;
+ * set the error code there.
+ * The error will _not_ be reported in the accept(),
+ * but only with the next operation on the socket after
+ * accept.
+ */
+ sk = req->sk;
} else {
+ /*
+ * Still in SYN_RECV, just remove it silently.
+ * There is no good way to pass the error to the newly
+ * created socket, and POSIX does not want network
+ * errors returned from accept().
+ */
tp->syn_backlog--;
tcp_synq_unlink(tp, req, prev);
req->class->destructor(req);
tcp_openreq_free(req);
+ return;
}
- /* FALL THOUGH */
+ break;
case TCP_SYN_SENT:
case TCP_SYN_RECV:
+ if (!th->syn)
+ return;
opening = 1;
break;
}
@@ -855,10 +912,13 @@ void tcp_v4_err(struct sk_buff *skb, unsigned char *dp, int len)
tcp_statistics.TcpAttemptFails++;
if (sk->state != TCP_LISTEN)
tcp_set_state(sk,TCP_CLOSE);
+ mb();
sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
}
- } else /* Only an error on timeout */
+ } else { /* Only an error on timeout */
sk->err_soft = icmp_err_convert[code].errno;
+ mb();
+ }
}
/* This routine computes an IPv4 TCP checksum. */
@@ -916,7 +976,7 @@ static void tcp_v4_send_reset(struct sk_buff *skb)
IPPROTO_TCP,
0);
arg.n_iov = 1;
- arg.csumoffset = offsetof(struct tcphdr, check) / sizeof(u16);
+ arg.csumoffset = offsetof(struct tcphdr, check) / 2;
ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
@@ -950,6 +1010,11 @@ int tcp_chkaddr(struct sk_buff *skb)
}
#endif
+/*
+ * Send a SYN-ACK after having received a SYN.
+ * This still operates on an open_request only, not on a big
+ * socket.
+ */
static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
{
struct rtable *rt;
@@ -974,7 +1039,7 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
return;
}
- mss = (rt->u.dst.pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
+ mss = rt->u.dst.pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
skb = tcp_make_synack(sk, &rt->u.dst, req, mss);
if (skb) {
@@ -994,6 +1059,9 @@ static void tcp_v4_send_synack(struct sock *sk, struct open_request *req)
ip_rt_put(rt);
}
+/*
+ * IPv4 open_request destructor.
+ */
static void tcp_v4_or_free(struct open_request *req)
{
if(!req->sk && req->af.v4_req.opt)
@@ -1016,9 +1084,9 @@ static inline void syn_flood_warning(struct sk_buff *skb)
* Save and compile IPv4 options into the open_request if needed.
*/
static inline struct ip_options *
-tcp_v4_save_options(struct sock *sk, struct sk_buff *skb,
- struct ip_options *opt)
+tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
{
+ struct ip_options *opt = &(IPCB(skb)->opt);
struct ip_options *dopt = NULL;
if (opt && opt->optlen) {
@@ -1052,8 +1120,7 @@ struct or_calltable or_ipv4 = {
#define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */
#define BACKLOGMAX(sk) sysctl_max_syn_backlog
-int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
- __u32 isn)
+int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, __u32 isn)
{
struct tcp_opt tp;
struct open_request *req;
@@ -1070,6 +1137,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
if (sk->dead)
goto dead;
+ /* Never answer SYNs sent to broadcast or multicast addresses */
+ if (((struct rtable *)skb->dst)->rt_flags &
+ (RTCF_BROADCAST|RTCF_MULTICAST))
+ goto drop;
+
/* XXX: Check against a global syn pool counter. */
if (BACKLOG(sk) > BACKLOGMAX(sk)) {
#ifdef CONFIG_SYN_COOKIES
@@ -1094,13 +1166,18 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
req->rcv_isn = TCP_SKB_CB(skb)->seq;
tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
- tp.in_mss = 536;
+
+ tp.mss_clamp = 65535;
tcp_parse_options(NULL, th, &tp, want_cookie);
- req->mss = tp.in_mss;
- if (tp.saw_tstamp) {
- req->mss -= TCPOLEN_TSTAMP_ALIGNED;
+ if (tp.mss_clamp == 65535)
+ tp.mss_clamp = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
+
+ if (sk->tp_pinfo.af_tcp.user_mss && sk->tp_pinfo.af_tcp.user_mss < tp.mss_clamp)
+ tp.mss_clamp = sk->tp_pinfo.af_tcp.user_mss;
+ req->mss = tp.mss_clamp;
+
+ if (tp.saw_tstamp)
req->ts_recent = tp.rcv_tsval;
- }
req->tstamp_ok = tp.tstamp_ok;
req->sack_ok = tp.sack_ok;
req->snd_wscale = tp.snd_wscale;
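
In this hunk 65535 serves as a sentinel meaning "no MSS option was present in the SYN": the clamp then falls back to the RFC 1122 default of 536 (576 minus the IP and TCP headers) and is finally capped by any TCP_MAXSEG the user set. A minimal sketch of that negotiation, assuming 20-byte headers (negotiate_mss_clamp is a hypothetical name):

    #include <stdio.h>

    #define MSS_SENTINEL 65535u   /* "no MSS option seen in the SYN" */

    /* peer_mss: value parsed from the SYN, or MSS_SENTINEL when absent.
     * user_mss: TCP_MAXSEG from the application, 0 when unset.
     */
    static unsigned int negotiate_mss_clamp(unsigned int peer_mss,
                                            unsigned int user_mss)
    {
        unsigned int clamp = peer_mss;

        if (clamp == MSS_SENTINEL)
            clamp = 576 - 20 - 20;      /* RFC 1122 default: 536 */
        if (user_mss && user_mss < clamp)
            clamp = user_mss;
        return clamp;
    }

    int main(void)
    {
        printf("%u\n", negotiate_mss_clamp(MSS_SENTINEL, 0));  /* 536 */
        printf("%u\n", negotiate_mss_clamp(1460, 0));          /* 1460 */
        printf("%u\n", negotiate_mss_clamp(1460, 1200));       /* 1200 */
        return 0;
    }
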
@@ -1120,7 +1197,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
req->snt_isn = isn;
- req->af.v4_req.opt = tcp_v4_save_options(sk, skb, ptr);
+ req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
req->class = &or_ipv4;
req->retrans = 0;
@@ -1139,7 +1216,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
tcp_synq_queue(&sk->tp_pinfo.af_tcp, req);
}
- sk->data_ready(sk, 0);
return 0;
dead:
@@ -1160,8 +1236,7 @@ drop:
*
* This function wants to be moved to a common for IPv[46] file. --ANK
*/
-struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb,
- int snd_mss)
+struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
{
struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0);
@@ -1175,11 +1250,12 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
/* Clone the TCP header template */
newsk->dport = req->rmt_port;
- newsk->sock_readers = 0;
+ atomic_set(&newsk->sock_readers, 0);
atomic_set(&newsk->rmem_alloc, 0);
skb_queue_head_init(&newsk->receive_queue);
atomic_set(&newsk->wmem_alloc, 0);
skb_queue_head_init(&newsk->write_queue);
+ atomic_set(&newsk->omem_alloc, 0);
newsk->done = 0;
newsk->proc = 0;
@@ -1231,7 +1307,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
newtp->copied_seq = req->rcv_isn + 1;
newtp->saw_tstamp = 0;
- newtp->in_mss = 536;
+ newtp->mss_clamp = req->mss;
init_timer(&newtp->probe_timer);
newtp->probe_timer.function = &tcp_probe_timer;
@@ -1242,12 +1318,14 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
newtp->urg_data = 0;
tcp_synq_init(newtp);
newtp->syn_backlog = 0;
+ if (skb->len >= 536)
+ newtp->last_seg_size = skb->len;
/* Back to base struct sock members. */
newsk->err = 0;
newsk->ack_backlog = 0;
newsk->max_ack_backlog = SOMAXCONN;
- newsk->priority = 1;
+ newsk->priority = 0;
/* IP layer stuff */
newsk->timeout = 0;
@@ -1276,14 +1354,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
} else {
newtp->tcp_header_len = sizeof(struct tcphdr);
}
-
- snd_mss -= newtp->tcp_header_len;
-
- if (sk->user_mss)
- snd_mss = min(snd_mss, sk->user_mss);
-
- newsk->mss = min(req->mss, snd_mss);
-
}
return newsk;
}
@@ -1299,8 +1369,6 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
struct ip_options *opt = req->af.v4_req.opt;
struct tcp_opt *newtp;
struct sock *newsk;
- int snd_mss;
- int mtu;
if (sk->ack_backlog > sk->max_ack_backlog)
goto exit; /* head drop */
@@ -1324,12 +1392,7 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
goto exit;
#endif
- mtu = dst->pmtu;
- if (mtu < 68) /* XXX: we should turn pmtu disc off when this happens. */
- mtu = 68;
- snd_mss = mtu - sizeof(struct iphdr);
-
- newsk = tcp_create_openreq_child(sk, req, skb, snd_mss);
+ newsk = tcp_create_openreq_child(sk, req, skb);
if (!newsk)
goto exit;
@@ -1347,15 +1410,22 @@ struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newsk->sport = req->lcl_port;
#endif
newsk->opt = req->af.v4_req.opt;
- newsk->mtu = mtu;
-
- if (newsk->rcvbuf < (3 * newsk->mtu))
- newsk->rcvbuf = min ((3 * newsk->mtu), sysctl_rmem_max);
- if (newsk->sndbuf < (3 * newsk->mtu))
- newsk->sndbuf = min ((3 * newsk->mtu), sysctl_wmem_max);
+ newtp->ext_header_len = 0;
+ if (newsk->opt)
+ newtp->ext_header_len = newsk->opt->optlen;
+
+ tcp_sync_mss(newsk, dst->pmtu);
+ newtp->rcv_mss = newtp->mss_clamp;
+
+ /* It would be better to use newtp->mss_clamp here */
+ if (newsk->rcvbuf < (3 * newtp->pmtu_cookie))
+ newsk->rcvbuf = min ((3 * newtp->pmtu_cookie), sysctl_rmem_max);
+ if (newsk->sndbuf < (3 * newtp->pmtu_cookie))
+ newsk->sndbuf = min ((3 * newtp->pmtu_cookie), sysctl_wmem_max);
tcp_v4_hash(newsk);
add_to_prot_sklist(newsk);
+ sk->data_ready(sk, 0); /* Deliver SIGIO */
return newsk;
@@ -1373,8 +1443,8 @@ static void tcp_v4_rst_req(struct sock *sk, struct sk_buff *skb)
if (!req)
return;
/* Sequence number check required by RFC793 */
- if (before(TCP_SKB_CB(skb)->seq, req->snt_isn) ||
- after(TCP_SKB_CB(skb)->seq, req->snt_isn+1))
+ if (before(TCP_SKB_CB(skb)->seq, req->rcv_isn) ||
+ after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
return;
tcp_synq_unlink(tp, req, prev);
(req->sk ? sk->ack_backlog : tp->syn_backlog)--;
@@ -1461,7 +1531,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
sk = nsk;
}
- if (tcp_rcv_state_process(sk, skb, skb->h.th, &(IPCB(skb)->opt), skb->len))
+ if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
goto reset;
release_sock(sk);
return 0;
@@ -1543,7 +1613,7 @@ int tcp_v4_rcv(struct sk_buff *skb, unsigned short len)
if (sk->state == TCP_TIME_WAIT)
goto do_time_wait;
- if (!sk->sock_readers)
+ if (!atomic_read(&sk->sock_readers))
return tcp_v4_do_rcv(sk, skb);
__skb_queue_tail(&sk->back_log, skb);
@@ -1559,7 +1629,7 @@ discard_it:
do_time_wait:
if(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
- skb, th, &(IPCB(skb)->opt), skb->len))
+ skb, th, skb->len))
goto no_tcp_socket;
goto discard_it;
}
@@ -1665,6 +1735,8 @@ struct tcp_func ipv4_specific = {
tcp_v4_conn_request,
tcp_v4_syn_recv_sock,
tcp_v4_get_sock,
+ sizeof(struct iphdr),
+
ip_setsockopt,
ip_getsockopt,
v4_addr2sockaddr,
@@ -1683,7 +1755,7 @@ static int tcp_v4_init_sock(struct sock *sk)
tp->rto = TCP_TIMEOUT_INIT; /*TCP_WRITE_TIME*/
tp->mdev = TCP_TIMEOUT_INIT;
- tp->in_mss = 536;
+ tp->mss_clamp = ~0;
/* See draft-stevens-tcpca-spec-01 for discussion of the
* initialization of these values.
@@ -1691,11 +1763,11 @@ static int tcp_v4_init_sock(struct sock *sk)
tp->snd_cwnd = (1 << TCP_CWND_SHIFT);
tp->snd_ssthresh = 0x7fffffff; /* Infinity */
- sk->priority = 1;
sk->state = TCP_CLOSE;
sk->max_ack_backlog = SOMAXCONN;
- sk->mtu = 576;
- sk->mss = 536;
+ tp->rcv_mss = 536;
+
+ sk->write_space = tcp_write_space;
/* Init SYN queue. */
tcp_synq_init(tp);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 84535341f..03696cbe0 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_output.c,v 1.92 1998/06/19 13:22:44 davem Exp $
+ * Version: $Id: tcp_output.c,v 1.93 1998/08/26 12:04:32 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -117,7 +117,7 @@ void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
* is never scaled.
*/
th->window = htons(tp->rcv_wnd);
- tcp_syn_build_options((__u32 *)(th + 1), sk->mss,
+ tcp_syn_build_options((__u32 *)(th + 1), tp->mss_clamp,
sysctl_tcp_timestamps,
sysctl_tcp_sack,
sysctl_tcp_window_scaling,
@@ -227,6 +227,65 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
return 0;
}
+/* This function synchronizes snd mss to the current pmtu/exthdr set.
+
+   tp->user_mss is the mss set by the user via TCP_MAXSEG. It does NOT
+   account for TCP options; it covers only the bare TCP header.
+
+   tp->mss_clamp is the mss negotiated at connection setup.
+   It is the minimum of user_mss and the mss received with the SYN.
+   It also does not include TCP options.
+
+   tp->pmtu_cookie is the last pmtu seen by this function.
+
+   tp->mss_cache is the current effective sending mss, including
+   all tcp options except SACKs. It is evaluated taking the
+   current pmtu into account, but never exceeds
+   tp->mss_clamp.
+
+   NOTE1. rfc1122 clearly states that the advertised MSS
+   DOES NOT include either tcp or ip options.
+
+   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
+   this function. --ANK (980731)
+ */
+
+int tcp_sync_mss(struct sock *sk, u32 pmtu)
+{
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ int mss_now;
+
+ /* Calculate the base mss without TCP options:
+ it is MMS_S - sizeof(tcphdr) of rfc1122
+ */
+ mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
+
+ /* Clamp it (mss_clamp does not include tcp options) */
+ if (mss_now > tp->mss_clamp)
+ mss_now = tp->mss_clamp;
+
+ /* Now subtract TCP options size, not including SACKs */
+ mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+
+ /* Now subtract optional transport overhead */
+ mss_now -= tp->ext_header_len;
+
+ /* If we got a too small (or even negative) value,
+ clamp it to 8 from below. Why 8?
+ Well, it could be 1 with the same success,
+ but if IP accepted a segment of length 1,
+ it would love 8 even more 8) --ANK (980731)
+ */
+ if (mss_now < 8)
+ mss_now = 8;
+
+ /* And store cached results */
+ tp->pmtu_cookie = pmtu;
+ tp->mss_cache = mss_now;
+ return mss_now;
+}
+
+
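
The arithmetic above reduces to a few subtractions and clamps. A self-contained sketch under the assumption of fixed 20-byte IPv4 and TCP headers, where opts_len stands in for tp->tcp_header_len - sizeof(struct tcphdr) and ext_len for tp->ext_header_len:

    #include <stdio.h>

    static int sync_mss(int pmtu, int mss_clamp, int opts_len, int ext_len)
    {
        int mss = pmtu - 20 - 20;   /* MMS_S - sizeof(tcphdr), per rfc1122 */

        if (mss > mss_clamp)
            mss = mss_clamp;        /* never exceed the negotiated clamp */
        mss -= opts_len;            /* TCP options, SACKs excluded */
        mss -= ext_len;             /* extension/IP option overhead */
        if (mss < 8)
            mss = 8;                /* the floor discussed in the comment */
        return mss;
    }

    int main(void)
    {
        /* 1500-byte PMTU, 1460 clamp, 12 bytes of timestamp options */
        printf("%d\n", sync_mss(1500, 1460, 12, 0));  /* 1448 */
        /* a pathological tiny PMTU still yields the 8-byte floor */
        printf("%d\n", sync_mss(40, 1460, 12, 0));    /* 8 */
        return 0;
    }
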
/* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote
* window for us.
@@ -334,7 +393,7 @@ void tcp_write_xmit(struct sock *sk)
u32 __tcp_select_window(struct sock *sk, u32 cur_win)
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- unsigned int mss = sk->mss;
+ unsigned int mss = tp->mss_cache;
int free_space;
u32 window;
@@ -624,7 +683,7 @@ void tcp_send_fin(struct sock *sk)
*/
if(tp->send_head == skb &&
!sk->nonagle &&
- skb->len < (sk->mss >> 1) &&
+ skb->len < (tp->mss_cache >> 1) &&
tp->packets_out &&
!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) {
update_send_head(sk);
@@ -738,20 +797,15 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
skb->dst = dst_clone(dst);
- if (sk->user_mss)
- mss = min(mss, sk->user_mss);
- if (req->tstamp_ok)
- mss -= TCPOLEN_TSTAMP_ALIGNED;
-
/* Don't offer more than they did.
* This way we don't have to memorize who said what.
* FIXME: maybe this should be changed for better performance
* with syncookies.
*/
req->mss = min(mss, req->mss);
- if (req->mss < 1) {
- printk(KERN_DEBUG "initial req->mss below 1\n");
- req->mss = 1;
+ if (req->mss < 8) {
+ printk(KERN_DEBUG "initial req->mss below 8\n");
+ req->mss = 8;
}
tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
@@ -796,7 +850,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
return skb;
}
-void tcp_connect(struct sock *sk, struct sk_buff *buff, int mss)
+void tcp_connect(struct sock *sk, struct sk_buff *buff, int mtu)
{
struct dst_entry *dst = sk->dst_cache;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
@@ -804,9 +858,6 @@ void tcp_connect(struct sock *sk, struct sk_buff *buff, int mss)
/* Reserve space for headers. */
skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
- if (sk->priority == 0)
- sk->priority = dst->priority;
-
tp->snd_wnd = 0;
tp->snd_wl1 = 0;
tp->snd_wl2 = tp->write_seq;
@@ -821,17 +872,25 @@ void tcp_connect(struct sock *sk, struct sk_buff *buff, int mss)
tp->tcp_header_len = sizeof(struct tcphdr) +
(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
- mss -= tp->tcp_header_len;
-
- if (sk->user_mss)
- mss = min(mss, sk->user_mss);
-
- if (mss < 1) {
- printk(KERN_DEBUG "initial sk->mss below 1\n");
- mss = 1; /* Sanity limit */
- }
-
- sk->mss = mss;
+ /* If the user gave us TCP_MAXSEG, record it as the clamp */
+ if (tp->user_mss)
+ tp->mss_clamp = tp->user_mss;
+ tcp_sync_mss(sk, mtu);
+
+ /* Now the unpleasant part: if the initial pmtu is too low,
+ set a lower clamp. I am not sure that this is good.
+ To be more exact, I do not think that clamping to a value which
+ is apparently transient and may improve in the future is a good idea.
+ It would be better to wait until the peer returns its MSS
+ (probably 65535 too) and then advertise something like 65535,
+ or at least the first-hop device mtu. Is it clear what I mean?
+ We should tell the peer what maximal mss we expect to RECEIVE;
+ it has nothing to do with pmtu.
+ I am afraid someone will be confused by such a huge value.
+ --ANK (980731)
+ */
+ if (tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr) < tp->mss_clamp )
+ tp->mss_clamp = tp->mss_cache + tp->tcp_header_len - sizeof(struct tcphdr);
TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
TCP_SKB_CB(buff)->sacked = 0;
@@ -842,7 +901,7 @@ void tcp_connect(struct sock *sk, struct sk_buff *buff, int mss)
tp->snd_nxt = TCP_SKB_CB(buff)->end_seq;
tp->window_clamp = dst->window;
- tcp_select_initial_window(sock_rspace(sk)/2,sk->mss,
+ tcp_select_initial_window(sock_rspace(sk)/2,tp->mss_clamp,
&tp->rcv_wnd,
&tp->window_clamp,
sysctl_tcp_window_scaling,
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 665a448bb..94275718b 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -182,7 +182,7 @@ void tcp_probe_timer(unsigned long data)
if(sk->zapped)
return;
- if (sk->sock_readers) {
+ if (atomic_read(&sk->sock_readers)) {
 /* Try again in a second. */
tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ);
return;
@@ -432,7 +432,7 @@ void tcp_retransmit_timer(unsigned long data)
return;
}
- if (sk->sock_readers) {
+ if (atomic_read(&sk->sock_readers)) {
/* Try again in a second. */
tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ);
return;
@@ -518,7 +518,7 @@ static void tcp_syn_recv_timer(unsigned long data)
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
/* TCP_LISTEN is implied. */
- if (!sk->sock_readers && tp->syn_wait_queue) {
+ if (!atomic_read(&sk->sock_readers) && tp->syn_wait_queue) {
struct open_request *prev = (struct open_request *)(&tp->syn_wait_queue);
struct open_request *req = tp->syn_wait_queue;
do {
diff --git a/net/ipv4/timer.c b/net/ipv4/timer.c
index a0501bd19..d5f6d3eb5 100644
--- a/net/ipv4/timer.c
+++ b/net/ipv4/timer.c
@@ -73,7 +73,7 @@ void net_timer (unsigned long data)
int why = sk->timeout;
/* Only process if socket is not in use. */
- if (sk->sock_readers) {
+ if (atomic_read(&sk->sock_readers)) {
sk->timer.expires = jiffies+HZ;
add_timer(&sk->timer);
return;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7e2c7bfa6..eab552c36 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -5,7 +5,7 @@
*
* The User Datagram Protocol (UDP).
*
- * Version: $Id: udp.c,v 1.57 1998/05/14 06:32:44 davem Exp $
+ * Version: $Id: udp.c,v 1.61 1998/08/29 17:11:10 freitag Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -59,6 +59,8 @@
* Vitaly E. Lavrov : Transparent proxy revived after year coma.
* Melvin Smith : Check msg_name not msg_namelen in sendto(),
* return ENOTCONN for unconnected sockets (POSIX)
+ * Janos Farkas : don't deliver multi/broadcasts to a different
+ * bound-to-device socket
*
*
* This program is free software; you can redistribute it and/or
@@ -80,7 +82,7 @@
MUST provide facility for checksumming (OK)
MAY allow application to control checksumming (OK)
MUST default to checksumming on (OK)
- MUST discard silently datagrams with bad csums (OK)
+ MUST discard silently datagrams with bad csums (OK, except during debugging)
4.1.3.5 (UDP Multihoming)
MUST allow application to specify source address (OK)
SHOULD be able to communicate the chosen src addr up to application
@@ -93,14 +95,12 @@
#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
-#include <linux/sched.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/timer.h>
-#include <linux/termios.h>
#include <linux/mm.h>
#include <linux/config.h>
#include <linux/inet.h>
@@ -108,14 +108,12 @@
#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
-#include <net/tcp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/udp.h>
#include <net/icmp.h>
#include <net/route.h>
#include <net/checksum.h>
-#include <linux/ipsec.h>
/*
* Snmp MIB for the UDP layer
@@ -447,7 +445,8 @@ static inline struct sock *udp_v4_mcast_next(struct sock *sk,
unsigned short num,
unsigned long raddr,
unsigned short rnum,
- unsigned long laddr)
+ unsigned long laddr,
+ int dif)
{
struct sock *s = sk;
unsigned short hnum = ntohs(num);
@@ -455,8 +454,9 @@ static inline struct sock *udp_v4_mcast_next(struct sock *sk,
if ((s->num != hnum) ||
(s->dead && (s->state == TCP_CLOSE)) ||
(s->daddr && s->daddr!=raddr) ||
- (s->dport != rnum && s->dport != 0) ||
- (s->rcv_saddr && s->rcv_saddr != laddr))
+ (s->dport != rnum && s->dport != 0) ||
+ (s->rcv_saddr && s->rcv_saddr != laddr) ||
+ (s->bound_dev_if && s->bound_dev_if != dif))
continue;
break;
}
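
The new dif argument lets multicast/broadcast delivery skip sockets bound to a different device. The whole match is easier to read as one flat predicate; a sketch with hypothetical, flattened field names:

    #include <stdio.h>

    /* Hypothetical view of the socket fields udp_v4_mcast_next() tests. */
    struct mini_udp_sock {
        unsigned short num;           /* local port, host order */
        unsigned long daddr, rcv_saddr;
        unsigned short dport;
        int bound_dev_if;             /* 0 = not bound to a device */
        int dead;
    };

    static int mcast_match(const struct mini_udp_sock *s,
                           unsigned short hnum, unsigned long raddr,
                           unsigned short rnum, unsigned long laddr, int dif)
    {
        if (s->num != hnum || s->dead)
            return 0;
        if (s->daddr && s->daddr != raddr)          /* connected elsewhere */
            return 0;
        if (s->dport && s->dport != rnum)
            return 0;
        if (s->rcv_saddr && s->rcv_saddr != laddr)  /* bound to another address */
            return 0;
        if (s->bound_dev_if && s->bound_dev_if != dif)
            return 0;                               /* the new per-device check */
        return 1;
    }

    int main(void)
    {
        struct mini_udp_sock s = { 53, 0, 0, 0, 2, 0 };
        /* a datagram arriving on ifindex 3 is skipped, ifindex 2 matches */
        printf("%d %d\n", mcast_match(&s, 53, 0, 0, 0, 3),
                          mcast_match(&s, 53, 0, 0, 0, 2));
        return 0;
    }
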
@@ -493,7 +493,7 @@ void udp_err(struct sk_buff *skb, unsigned char *dp, int len)
return; /* No socket for error */
}
- if (sk->ip_recverr && !sk->sock_readers) {
+ if (sk->ip_recverr && !atomic_read(&sk->sock_readers)) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2 && sock_queue_err_skb(sk, skb2))
kfree_skb(skb2);
@@ -619,7 +619,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
struct ipcm_cookie ipc;
struct udpfakehdr ufh;
struct rtable *rt = NULL;
- int free = 0, localroute = 0;
+ int free = 0;
+ int connected = 0;
u32 daddr;
u8 tos;
int err;
@@ -674,27 +675,15 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
ufh.uh.dest = usin->sin_port;
if (ufh.uh.dest == 0)
return -EINVAL;
- /* XXX: is a one-behind cache for the dst_entry worth it?
-
- Nope. ip_route_output is slower than nothing, but it
- is enough fast to forget about caching its results.
- Really, checking route validity in general case
- is not much faster complete lookup.
- It was main reason why I removed it from 2.1.
- The second reason was that idle sockets held
- a lot of stray destinations. --ANK
- */
} else {
if (sk->state != TCP_ESTABLISHED)
return -ENOTCONN;
ufh.daddr = sk->daddr;
ufh.uh.dest = sk->dport;
-
- /*
- BUGGG Khm... And who will validate it? Fixing it fastly...
- --ANK
+ /* Open fast path for connected socket.
 The route will not be used if at least one option is set.
*/
- rt = (struct rtable *)dst_check(&sk->dst_cache, 0);
+ connected = 1;
}
#ifdef CONFIG_IP_TRANSPARENT_PROXY
if (msg->msg_flags&MSG_PROXY) {
@@ -710,6 +699,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
ufh.uh.source = from->sin_port;
if (ipc.addr == 0)
ipc.addr = sk->saddr;
+ connected = 0;
} else
#endif
{
@@ -725,6 +715,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
return err;
if (ipc.opt)
free = 1;
+ connected = 0;
}
if (!ipc.opt)
ipc.opt = sk->opt;
@@ -736,12 +727,13 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
if (!daddr)
return -EINVAL;
daddr = ipc.opt->faddr;
+ connected = 0;
}
tos = RT_TOS(sk->ip_tos);
if (sk->localroute || (msg->msg_flags&MSG_DONTROUTE) ||
(ipc.opt && ipc.opt->is_strictroute)) {
tos |= RTO_ONLINK;
- rt = NULL; /* sorry */
+ connected = 0;
}
if (MULTICAST(daddr)) {
@@ -749,8 +741,12 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
ipc.oif = sk->ip_mc_index;
if (!ufh.saddr)
ufh.saddr = sk->ip_mc_addr;
+ connected = 0;
}
+ if (connected)
+ rt = (struct rtable*)dst_clone(sk->dst_cache);
+
if (rt == NULL) {
err = ip_route_output(&rt, daddr, ufh.saddr,
#ifdef CONFIG_IP_TRANSPARENT_PROXY
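
The connected flag implements the fast path promised above: only a connected send with no per-call destination, proxy source, control-message options, source routing, strict/onlink routing, or multicast override may clone sk->dst_cache instead of doing a fresh route lookup. A condensed, illustrative sketch of that decision (all names hypothetical):

    #include <stdio.h>

    /* 1 if the cached route may be cloned for this send, 0 otherwise. */
    static int can_use_cached_route(int has_msg_name, int msg_proxy,
                                    int has_cmsg_opts, int strict_or_onlink,
                                    int is_multicast)
    {
        if (has_msg_name)       /* sendto() with an explicit address */
            return 0;
        if (msg_proxy || has_cmsg_opts || strict_or_onlink || is_multicast)
            return 0;
        return 1;               /* connected send: dst_clone(sk->dst_cache) */
    }

    int main(void)
    {
        printf("%d\n", can_use_cached_route(0, 0, 0, 0, 0));  /* 1: plain send() */
        printf("%d\n", can_use_cached_route(1, 0, 0, 0, 0));  /* 0: sendto() */
        return 0;
    }
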
@@ -759,7 +755,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
tos, ipc.oif);
if (err)
goto out;
- localroute = 1;
err = -EACCES;
if (rt->rt_flags&RTCF_BROADCAST && !sk->broadcast)
@@ -777,17 +772,13 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, int len)
/* RFC1122: OK. Provides the checksumming facility (MUST) as per */
/* 4.1.3.4. It's configurable by the application via setsockopt() */
- /* (MAY) and it defaults to on (MUST). Almost makes up for the */
- /* violation above. -- MS */
+ /* (MAY) and it defaults to on (MUST). */
- lock_sock(sk);
err = ip_build_xmit(sk,sk->no_check ? udp_getfrag_nosum : udp_getfrag,
&ufh, ulen, &ipc, rt, msg->msg_flags);
- release_sock(sk);
out:
- if (localroute)
- ip_rt_put(rt);
+ ip_rt_put(rt);
if (free)
kfree(ipc.opt);
if (!err) {
@@ -822,7 +813,9 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
if (sk->state == TCP_LISTEN)
return(-EINVAL);
amount = 0;
- /* N.B. Is this interrupt safe?? */
+ /* N.B. Is this interrupt safe??
+ -> Yes. Interrupts do not remove skbs. --ANK (980725)
+ */
skb = skb_peek(&sk->receive_queue);
if (skb != NULL) {
/*
@@ -841,6 +834,9 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
return(0);
}
+#if defined(CONFIG_FILTER) || !defined(HAVE_CSUM_COPY_USER)
+#undef CONFIG_UDP_DELAY_CSUM
+#endif
/*
* This should be easy, if there is something there we
@@ -848,7 +844,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
*/
int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len,
- int noblock, int flags, int *addr_len)
+ int noblock, int flags, int *addr_len)
{
struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
struct sk_buff *skb;
@@ -880,18 +876,36 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len,
goto out;
copied = skb->len - sizeof(struct udphdr);
- if (copied > len)
- {
+ if (copied > len) {
copied = len;
msg->msg_flags |= MSG_TRUNC;
}
- /*
- * FIXME : should use udp header size info value
- */
-
+#ifndef CONFIG_UDP_DELAY_CSUM
err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
copied);
+#else
+ if (sk->no_check || skb->ip_summed==CHECKSUM_UNNECESSARY) {
+ err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
+ copied);
+ } else if (copied > msg->msg_iov[0].iov_len || (msg->msg_flags&MSG_TRUNC)) {
+ if (csum_fold(csum_partial(skb->h.raw, ntohs(skb->h.uh->len), skb->csum)))
+ goto csum_copy_err;
+ err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
+ copied);
+ } else {
+ unsigned int csum;
+
+ err = 0;
+ csum = csum_partial(skb->h.raw, sizeof(struct udphdr), skb->csum);
+ csum = csum_and_copy_to_user((char*)&skb->h.uh[1], msg->msg_iov[0].iov_base,
+ copied, csum, &err);
+ if (err)
+ goto out_free;
+ if (csum_fold(csum))
+ goto csum_copy_err;
+ }
+#endif
if (err)
goto out_free;
sk->stamp=skb->stamp;
@@ -928,6 +942,18 @@ out_free:
skb_free_datagram(sk, skb);
out:
return err;
+
+#ifdef CONFIG_UDP_DELAY_CSUM
+csum_copy_err:
+ udp_statistics.UdpInErrors++;
+ skb_free_datagram(sk, skb);
+
+ /*
+ * Error for blocking case is chosen to masquerade
+ * as some normal condition.
+ */
+ return (msg->msg_flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH;
+#endif
}
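
CONFIG_UDP_DELAY_CSUM defers checksum verification to the copy into user space, folding it into csum_and_copy_to_user() when the whole datagram fits in a single iovec. The kernel primitives are arch-specific, but the underlying Internet checksum is easy to sketch in portable C (simplified: no UDP pseudo-header, bytes summed as big-endian pairs):

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Ones'-complement sum over a buffer (a simplified csum_partial). */
    static uint32_t csum_add(uint32_t sum, const uint8_t *p, size_t len)
    {
        while (len > 1) { sum += (uint32_t)(p[0] << 8 | p[1]); p += 2; len -= 2; }
        if (len)
            sum += (uint32_t)p[0] << 8;
        return sum;
    }

    /* Fold to 16 bits and complement; 0 means the checksum verified. */
    static uint16_t csum_fold16(uint32_t sum)
    {
        while (sum >> 16)
            sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
    }

    int main(void)
    {
        uint8_t pkt[6] = { 'h', 'i', '!', '!', 0, 0 };
        uint16_t c = csum_fold16(csum_add(0, pkt, sizeof(pkt)));

        pkt[4] = c >> 8;          /* store the checksum in the last word */
        pkt[5] = c & 0xff;
        printf("verify: %u\n",    /* 0: the receiver's recomputation checks out */
               (unsigned)csum_fold16(csum_add(0, pkt, sizeof(pkt))));
        return 0;
    }
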
int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
@@ -986,29 +1012,16 @@ int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
static void udp_close(struct sock *sk, unsigned long timeout)
{
- lock_sock(sk);
+ /* See raw_close in ipv4/raw.c for an explanation */
sk->state = TCP_CLOSE;
- if(uh_cache_sk == sk)
- uh_cache_sk = NULL;
- sk->dead = 1;
- release_sock(sk);
udp_v4_unhash(sk);
+ sk->dead = 1;
destroy_sock(sk);
}
static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
{
/*
- * Check the security clearance
- */
-
- if(!ipsec_sk_policy(sk,skb))
- {
- kfree_skb(skb);
- return(0);
- }
-
- /*
* Charge it to the socket, dropping if the queue is full.
*/
@@ -1026,10 +1039,6 @@ static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
static inline void udp_deliver(struct sock *sk, struct sk_buff *skb)
{
- if (sk->sock_readers) {
- __skb_queue_tail(&sk->back_log, skb);
- return;
- }
udp_queue_rcv_skb(sk, skb);
}
@@ -1043,9 +1052,11 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
u32 saddr, u32 daddr)
{
struct sock *sk;
+ int dif;
sk = udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)];
- sk = udp_v4_mcast_next(sk, uh->dest, saddr, uh->source, daddr);
+ dif = skb->dev->ifindex;
+ sk = udp_v4_mcast_next(sk, uh->dest, saddr, uh->source, daddr, dif);
if (sk) {
struct sock *sknext = NULL;
@@ -1053,7 +1064,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
struct sk_buff *skb1 = skb;
sknext = udp_v4_mcast_next(sk->next, uh->dest, saddr,
- uh->source, daddr);
+ uh->source, daddr, dif);
if(sknext)
skb1 = skb_clone(skb, GFP_ATOMIC);
@@ -1113,7 +1124,8 @@ int udp_rcv(struct sk_buff *skb, unsigned short len)
*/
uh = skb->h.uh;
-
+ __skb_pull(skb, skb->h.raw - skb->data);
+
ip_statistics.IpInDelivers++;
/*
@@ -1121,44 +1133,31 @@ int udp_rcv(struct sk_buff *skb, unsigned short len)
*/
ulen = ntohs(uh->len);
-
- if (ulen > len || len < sizeof(*uh) || ulen < sizeof(*uh)) {
+
+ if (ulen > len || ulen < sizeof(*uh)) {
NETDEBUG(printk(KERN_DEBUG "UDP: short packet: %d/%d\n", ulen, len));
udp_statistics.UdpInErrors++;
kfree_skb(skb);
return(0);
}
+ skb_trim(skb, ulen);
+#ifndef CONFIG_UDP_DELAY_CSUM
if (uh->check &&
- (((skb->ip_summed==CHECKSUM_HW)&&udp_check(uh,len,saddr,daddr,skb->csum)) ||
+ (((skb->ip_summed==CHECKSUM_HW)&&udp_check(uh,ulen,saddr,daddr,skb->csum)) ||
((skb->ip_summed==CHECKSUM_NONE) &&
- (udp_check(uh,len,saddr,daddr, csum_partial((char*)uh, len, 0)))))) {
- /* <mea@utu.fi> wants to know, who sent it, to
- go and stomp on the garbage sender... */
-
- /* RFC1122: OK. Discards the bad packet silently (as far as */
- /* the network is concerned, anyway) as per 4.1.3.4 (MUST). */
-
- NETDEBUG(printk(KERN_DEBUG "UDP: bad checksum. From %08lX:%d to %08lX:%d ulen %d\n",
- ntohl(saddr),ntohs(uh->source),
- ntohl(daddr),ntohs(uh->dest),
- ulen));
- udp_statistics.UdpInErrors++;
- kfree_skb(skb);
- return(0);
- }
-
-
- len = ulen;
-
- /*
- * FIXME:
- * Trimming things wrongly. We must adjust the base/end to allow
- * for the headers we keep!
- * --ANK
- */
- skb_trim(skb,len);
-
+ (udp_check(uh,ulen,saddr,daddr, csum_partial((char*)uh, ulen, 0))))))
+ goto csum_error;
+#else
+ if (uh->check==0)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ else if (skb->ip_summed==CHECKSUM_HW) {
+ if (udp_check(uh,ulen,saddr,daddr,skb->csum))
+ goto csum_error;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ } else if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+ skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
+#endif
if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
return udp_v4_mcast_deliver(skb, uh, saddr, daddr);
@@ -1173,6 +1172,11 @@ int udp_rcv(struct sk_buff *skb, unsigned short len)
sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex);
if (sk == NULL) {
+#ifdef CONFIG_UDP_DELAY_CSUM
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
+ csum_fold(csum_partial((char*)uh, ulen, skb->csum)))
+ goto csum_error;
+#endif
udp_statistics.UdpNoPorts++;
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
@@ -1185,6 +1189,19 @@ int udp_rcv(struct sk_buff *skb, unsigned short len)
}
udp_deliver(sk, skb);
return 0;
+
+csum_error:
+ /*
+ * RFC1122: OK. Discards the bad packet silently (as far as
+ * the network is concerned, anyway) as per 4.1.3.4 (MUST).
+ */
+ NETDEBUG(printk(KERN_DEBUG "UDP: bad checksum. From %08lX:%d to %08lX:%d ulen %d\n",
+ ntohl(saddr),ntohs(uh->source),
+ ntohl(daddr),ntohs(uh->dest),
+ ulen));
+ udp_statistics.UdpInErrors++;
+ kfree_skb(skb);
+ return(0);
}
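
The rewritten validation trusts the UDP length field only after checking it against what actually arrived, then trims the skb to it; note that ulen > len together with ulen >= sizeof(*uh) also covers the old len < sizeof(*uh) test. The same checks in plain C (udp_validate_len is a hypothetical helper):

    #include <stdio.h>

    #define UDP_HDR_LEN 8

    /* Returns the validated datagram length, or -1 to drop the packet. */
    static int udp_validate_len(unsigned int ulen, unsigned int len)
    {
        if (ulen > len)          /* claims more than was delivered */
            return -1;
        if (ulen < UDP_HDR_LEN)  /* cannot even hold the UDP header */
            return -1;
        return (int)ulen;        /* caller trims the buffer to this */
    }

    int main(void)
    {
        printf("%d\n", udp_validate_len(100, 120));  /* 100: padding trimmed */
        printf("%d\n", udp_validate_len(200, 120));  /* -1: truncated packet */
        printf("%d\n", udp_validate_len(4, 120));    /* -1: shorter than a header */
        return 0;
    }
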
struct proto udp_prot = {
@@ -1214,7 +1231,7 @@ struct proto udp_prot = {
udp_v4_verify_bind, /* verify_bind */
128, /* max_header */
0, /* retransmits */
- "UDP", /* name */
+ "UDP", /* name */
0, /* inuse */
0 /* highestinuse */
};
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 329807093..a61be48c8 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -5,7 +5,7 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: addrconf.c,v 1.43 1998/07/15 05:05:32 davem Exp $
+ * $Id: addrconf.c,v 1.45 1998/08/26 12:04:41 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -38,6 +38,7 @@
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
+#include <linux/delay.h>
#include <linux/proc_fs.h>
#include <net/sock.h>
@@ -53,7 +54,6 @@
#include <linux/rtnetlink.h>
#include <asm/uaccess.h>
-#include <asm/delay.h>
/* Set to 3 to get tracing... */
#define ACONF_DEBUG 2
@@ -100,7 +100,7 @@ struct ipv6_devconf ipv6_devconf =
{
0, /* forwarding */
IPV6_DEFAULT_HOPLIMIT, /* hop limit */
- 576, /* mtu */
+ IPV6_MIN_MTU, /* mtu */
1, /* accept RAs */
1, /* accept redirects */
1, /* autoconfiguration */
@@ -114,7 +114,7 @@ static struct ipv6_devconf ipv6_devconf_dflt =
{
0, /* forwarding */
IPV6_DEFAULT_HOPLIMIT, /* hop limit */
- 576, /* mtu */
+ IPV6_MIN_MTU, /* mtu */
1, /* accept RAs */
1, /* accept redirects */
1, /* autoconfiguration */
@@ -185,7 +185,7 @@ static struct inet6_dev * ipv6_add_dev(struct device *dev)
struct inet6_dev *ndev, **bptr, *iter;
int hash;
- if (dev->mtu < 576)
+ if (dev->mtu < IPV6_MIN_MTU)
return NULL;
ndev = kmalloc(sizeof(struct inet6_dev), gfp_any());
@@ -548,7 +548,6 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct device *dev,
unsigned long expires, unsigned flags)
{
struct in6_rtmsg rtmsg;
- int err;
memset(&rtmsg, 0, sizeof(rtmsg));
memcpy(&rtmsg.rtmsg_dst, pfx, sizeof(struct in6_addr));
@@ -566,7 +565,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct device *dev,
if (dev->type == ARPHRD_SIT && (dev->flags&IFF_POINTOPOINT))
rtmsg.rtmsg_flags |= RTF_NONEXTHOP;
- ip6_route_add(&rtmsg, &err);
+ ip6_route_add(&rtmsg);
}
/* Create "default" multicast route to the interface */
@@ -574,7 +573,6 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct device *dev,
static void addrconf_add_mroute(struct device *dev)
{
struct in6_rtmsg rtmsg;
- int err;
memset(&rtmsg, 0, sizeof(rtmsg));
ipv6_addr_set(&rtmsg.rtmsg_dst,
@@ -584,13 +582,12 @@ static void addrconf_add_mroute(struct device *dev)
rtmsg.rtmsg_ifindex = dev->ifindex;
rtmsg.rtmsg_flags = RTF_UP|RTF_ADDRCONF;
rtmsg.rtmsg_type = RTMSG_NEWROUTE;
- ip6_route_add(&rtmsg, &err);
+ ip6_route_add(&rtmsg);
}
static void sit_route_add(struct device *dev)
{
struct in6_rtmsg rtmsg;
- int err;
memset(&rtmsg, 0, sizeof(rtmsg));
@@ -602,7 +599,7 @@ static void sit_route_add(struct device *dev)
rtmsg.rtmsg_flags = RTF_UP|RTF_NONEXTHOP;
rtmsg.rtmsg_ifindex = dev->ifindex;
- ip6_route_add(&rtmsg, &err);
+ ip6_route_add(&rtmsg);
}
static void addrconf_add_lroute(struct device *dev)
@@ -690,13 +687,12 @@ void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len)
else
rt_expires = jiffies + valid_lft * HZ;
- rt = rt6_lookup(&pinfo->prefix, NULL, dev->ifindex, RTF_LINKRT);
+ rt = rt6_lookup(&pinfo->prefix, NULL, dev->ifindex, 1);
if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) {
if (rt->rt6i_flags&RTF_EXPIRES) {
if (pinfo->onlink == 0 || valid_lft == 0) {
ip6_del_rt(rt);
- rt = NULL;
} else {
rt->rt6i_expires = rt_expires;
}
@@ -705,6 +701,8 @@ void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len)
addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len,
dev, rt_expires, RTF_ADDRCONF|RTF_EXPIRES);
}
+ if (rt)
+ dst_release(&rt->u.dst);
/* Try to figure out our local address for this prefix */
@@ -1118,11 +1116,17 @@ int addrconf_notify(struct notifier_block *this, unsigned long event,
break;
case NETDEV_CHANGEMTU:
- /* BUGGG... Should scan FIB to change pmtu on routes. --ANK */
- if (dev->mtu >= 576)
+ if (dev->mtu >= IPV6_MIN_MTU) {
+ struct inet6_dev *idev;
+
+ if ((idev = ipv6_find_idev(dev)) == NULL)
+ break;
+ idev->cnf.mtu6 = dev->mtu;
+ rt6_mtu_change(dev, dev->mtu);
break;
+ }
- /* MTU falled under 576. Stop IPv6 on this interface. */
+ /* MTU fell below IPV6_MIN_MTU. Stop IPv6 on this interface. */
case NETDEV_DOWN:
case NETDEV_UNREGISTER:
@@ -1240,7 +1244,6 @@ static void addrconf_rs_timer(unsigned long data)
add_timer(&ifp->timer);
} else {
struct in6_rtmsg rtmsg;
- int err;
printk(KERN_DEBUG "%s: no IPv6 routers present\n",
ifp->idev->dev->name);
@@ -1253,7 +1256,7 @@ static void addrconf_rs_timer(unsigned long data)
rtmsg.rtmsg_ifindex = ifp->idev->dev->ifindex;
- ip6_route_add(&rtmsg, &err);
+ ip6_route_add(&rtmsg);
}
}
@@ -1501,7 +1504,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
}
static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
- pid_t pid, u32 seq, int event)
+ u32 pid, u32 seq, int event)
{
struct ifaddrmsg *ifm;
struct nlmsghdr *nlh;
@@ -1659,8 +1662,11 @@ int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
addrconf_forward_change(idev);
- if (*valp)
+ if (*valp) {
+ start_bh_atomic();
rt6_purge_dflt_routers(0);
+ end_bh_atomic();
+ }
}
return ret;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 051f9a28e..a9ee64925 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -7,7 +7,7 @@
*
* Adapted from linux/net/ipv4/af_inet.c
*
- * $Id: af_inet6.c,v 1.36 1998/06/10 07:29:25 davem Exp $
+ * $Id: af_inet6.c,v 1.37 1998/08/26 12:04:45 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -64,6 +64,7 @@ extern int raw6_get_info(char *, char **, off_t, int, int);
extern int tcp6_get_info(char *, char **, off_t, int, int);
extern int udp6_get_info(char *, char **, off_t, int, int);
extern int afinet6_get_info(char *, char **, off_t, int, int);
+extern int afinet6_get_snmp(char *, char **, off_t, int, int);
#endif
#ifdef CONFIG_SYSCTL
@@ -243,10 +244,49 @@ static int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
static int inet6_release(struct socket *sock, struct socket *peer)
{
+ struct sock *sk = sock->sk;
+
+ if (sk == NULL)
+ return -EINVAL;
+
+ /* Free mc lists */
+ ipv6_sock_mc_close(sk);
+
+ /* Huh! MOD_DEC_USE_COUNT was here :-(
+ That is impossible for two reasons: socket destruction
+ may be delayed, and inet_release may sleep and
+ then return to nowhere. It should be moved to
+ inet6_destroy_sock(), but we have no explicit constructor :-(
+ --ANK (980802)
+ */
MOD_DEC_USE_COUNT;
return inet_release(sock, peer);
}
+int inet6_destroy_sock(struct sock *sk)
+{
+ struct sk_buff *skb;
+ struct ipv6_txoptions *opt;
+
+ /*
+ * Release destination entry
+ */
+
+ dst_release(xchg(&sk->dst_cache,NULL));
+
+ /* Release rx options */
+
+ if ((skb = xchg(&sk->net_pinfo.af_inet6.pktoptions, NULL)) != NULL)
+ kfree_skb(skb);
+
+ /* Free tx options */
+
+ if ((opt = xchg(&sk->net_pinfo.af_inet6.opt, NULL)) != NULL)
+ sock_kfree_s(sk, opt, opt->tot_len);
+
+ return 0;
+}
+
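
inet6_destroy_sock() releases each attached object through xchg(): the pointer is detached atomically first and freed second, so no path through the socket can observe a freed pointer. A hedged userspace sketch of the same idiom with C11 atomics (the kernel's xchg() is an arch primitive, not this library call, and the real code also leans on reference counting elsewhere):

    #include <stdatomic.h>
    #include <stdlib.h>

    struct opts { int tot_len; };

    /* Hypothetical per-socket slot that other contexts may read. */
    static _Atomic(struct opts *) sock_opt_slot;

    static void destroy_opts(void)
    {
        /* Detach first: after the swap nobody can reach the object... */
        struct opts *opt = atomic_exchange(&sock_opt_slot, NULL);

        /* ...so freeing it second cannot race a new reader of the slot. */
        free(opt);    /* free(NULL) is a no-op, so this is idempotent */
    }

    int main(void)
    {
        atomic_store(&sock_opt_slot, malloc(sizeof(struct opts)));
        destroy_opts();   /* frees and clears the slot */
        destroy_opts();   /* second call finds NULL and does nothing */
        return 0;
    }
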
/*
* This does both peername and sockname.
*/
@@ -412,6 +452,12 @@ static struct proc_dir_entry proc_net_sockstat6 = {
0, &proc_net_inode_operations,
afinet6_get_info
};
+static struct proc_dir_entry proc_net_snmp6 = {
+ PROC_NET_SNMP6, 5, "snmp6",
+ S_IFREG | S_IRUGO, 1, 0, 0,
+ 0, &proc_net_inode_operations,
+ afinet6_get_snmp
+};
#endif /* CONFIG_PROC_FS */
#ifdef MODULE
@@ -445,7 +491,7 @@ __initfunc(void inet6_proto_init(struct net_proto *pro))
printk(KERN_INFO "IPv6 v0.2 for NET3.037\n");
- if (sizeof(struct ipv6_options) > sizeof(dummy_skb->cb))
+ if (sizeof(struct inet6_skb_parm) > sizeof(dummy_skb->cb))
{
printk(KERN_CRIT "inet6_proto_init: size fault\n");
#ifdef MODULE
@@ -490,6 +536,7 @@ __initfunc(void inet6_proto_init(struct net_proto *pro))
proc_net_register(&proc_net_tcp6);
proc_net_register(&proc_net_udp6);
proc_net_register(&proc_net_sockstat6);
+ proc_net_register(&proc_net_snmp6);
#endif
/* Now the userspace is allowed to create INET6 sockets. */
@@ -526,6 +573,7 @@ void cleanup_module(void)
proc_net_unregister(proc_net_tcp6.low_ino);
proc_net_unregister(proc_net_udp6.low_ino);
proc_net_unregister(proc_net_sockstat6.low_ino);
+ proc_net_unregister(proc_net_snmp6.low_ino);
#endif
/* Cleanup code parts. */
sit_cleanup();
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index b87f31b06..51960bd26 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -5,7 +5,7 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: datagram.c,v 1.14 1998/03/20 09:12:15 davem Exp $
+ * $Id: datagram.c,v 1.15 1998/08/26 12:04:47 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -32,48 +32,72 @@
int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb)
{
struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
- struct ipv6_options *opt = (struct ipv6_options *) skb->cb;
-
- if (np->rxinfo) {
+ struct inet6_skb_parm *opt = (struct inet6_skb_parm *) skb->cb;
+
+ if (np->rxopt.bits.rxinfo) {
struct in6_pktinfo src_info;
- src_info.ipi6_ifindex = skb->dev->ifindex;
+ src_info.ipi6_ifindex = opt->iif;
ipv6_addr_copy(&src_info.ipi6_addr, &skb->nh.ipv6h->daddr);
put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info);
}
- if (np->rxhlim) {
+ if (np->rxopt.bits.rxhlim) {
int hlim = skb->nh.ipv6h->hop_limit;
put_cmsg(msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim);
}
- if (opt->srcrt) {
- int hdrlen = sizeof(struct rt0_hdr) + (opt->srcrt->hdrlen << 3);
-
- put_cmsg(msg, SOL_IPV6, IPV6_RXSRCRT, hdrlen, opt->srcrt);
+ if (np->rxopt.bits.hopopts && opt->hop) {
+ u8 *ptr = skb->nh.raw + opt->hop;
+ put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr);
+ }
+ if (np->rxopt.bits.dstopts && opt->dst0) {
+ u8 *ptr = skb->nh.raw + opt->dst0;
+ put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, (ptr[1]+1)<<3, ptr);
+ }
+ if (np->rxopt.bits.srcrt && opt->srcrt) {
+ struct ipv6_rt_hdr *rthdr = (struct ipv6_rt_hdr *)(skb->nh.raw + opt->srcrt);
+ put_cmsg(msg, SOL_IPV6, IPV6_RTHDR, (rthdr->hdrlen+1) << 3, rthdr);
+ }
+ if (np->rxopt.bits.authhdr && opt->auth) {
+ u8 *ptr = skb->nh.raw + opt->auth;
+ put_cmsg(msg, SOL_IPV6, IPV6_AUTHHDR, (ptr[1]+1)<<2, ptr);
+ }
+ if (np->rxopt.bits.dstopts && opt->dst1) {
+ u8 *ptr = skb->nh.raw + opt->dst1;
+ put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, (ptr[1]+1)<<3, ptr);
}
return 0;
}
int datagram_send_ctl(struct msghdr *msg, int *oif,
- struct in6_addr **src_addr, struct ipv6_options *opt,
+ struct in6_addr **src_addr, struct ipv6_txoptions *opt,
int *hlimit)
{
struct in6_pktinfo *src_info;
struct cmsghdr *cmsg;
struct ipv6_rt_hdr *rthdr;
+ struct ipv6_opt_hdr *hdr;
int len;
int err = 0;
for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+
+ if ((unsigned long)(((char*)cmsg - (char*)msg->msg_control)
+ + cmsg->cmsg_len) > msg->msg_controllen) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+
if (cmsg->cmsg_level != SOL_IPV6) {
- printk(KERN_DEBUG "invalid cmsg_level %d\n", cmsg->cmsg_level);
+ if (net_ratelimit())
+ printk(KERN_DEBUG "invalid cmsg_level %d\n", cmsg->cmsg_level);
continue;
}
switch (cmsg->cmsg_type) {
case IPV6_PKTINFO:
- if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in6_pktinfo))) {
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) {
err = -EINVAL;
goto exit_f;
}
@@ -100,14 +124,77 @@ int datagram_send_ctl(struct msghdr *msg, int *oif,
}
break;
-
- case IPV6_RXSRCRT:
+
+ case IPV6_HOPOPTS:
+ if (opt->hopopt || cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+
+ hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg);
+ len = ((hdr->hdrlen + 1) << 3);
+ if (cmsg->cmsg_len < CMSG_LEN(len)) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+ if (!capable(CAP_NET_RAW)) {
+ err = -EPERM;
+ goto exit_f;
+ }
+ opt->opt_nflen += len;
+ opt->hopopt = hdr;
+ break;
+
+ case IPV6_DSTOPTS:
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+
+ hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg);
+ len = ((hdr->hdrlen + 1) << 3);
+ if (cmsg->cmsg_len < CMSG_LEN(len)) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+ if (!capable(CAP_NET_RAW)) {
+ err = -EPERM;
+ goto exit_f;
+ }
+ if (opt->dst1opt) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+ opt->opt_flen += len;
+ opt->dst1opt = hdr;
+ break;
+
+ case IPV6_AUTHHDR:
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+
+ hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg);
+ len = ((hdr->hdrlen + 2) << 2);
+ if (cmsg->cmsg_len < CMSG_LEN(len)) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+ if (len & 7) {
+ err = -EINVAL;
+ goto exit_f;
+ }
+ opt->opt_flen += len;
+ opt->auth = hdr;
+ break;
+
+ case IPV6_RTHDR:
if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_rt_hdr))) {
err = -EINVAL;
goto exit_f;
}
- len = cmsg->cmsg_len - sizeof(struct cmsghdr);
rthdr = (struct ipv6_rt_hdr *)CMSG_DATA(cmsg);
/*
@@ -118,7 +205,9 @@ int datagram_send_ctl(struct msghdr *msg, int *oif,
goto exit_f;
}
- if (((rthdr->hdrlen + 1) << 3) < len) {
+ len = ((rthdr->hdrlen + 1) << 3);
+
+ if (cmsg->cmsg_len < CMSG_LEN(len)) {
err = -EINVAL;
goto exit_f;
}
@@ -128,12 +217,21 @@ int datagram_send_ctl(struct msghdr *msg, int *oif,
err = -EINVAL;
goto exit_f;
}
-
- opt->opt_nflen += ((rthdr->hdrlen + 1) << 3);
+
+ opt->opt_nflen += len;
opt->srcrt = rthdr;
+ if (opt->dst1opt) {
+ int dsthdrlen = ((opt->dst1opt->hdrlen+1)<<3);
+
+ opt->opt_nflen += dsthdrlen;
+ opt->dst0opt = opt->dst1opt;
+ opt->dst1opt = NULL;
+ opt->opt_flen -= dsthdrlen;
+ }
+
break;
-
+
case IPV6_HOPLIMIT:
if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) {
err = -EINVAL;
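
The new length check rejects a cmsg whose cmsg_len would run past msg_controllen before any type-specific parsing happens. The same defensive walk can be written in user space with the standard CMSG macros; a sketch (glibc's CMSG_NXTHDR already guards similarly, this merely mirrors the kernel's explicit test):

    #include <sys/socket.h>
    #include <stddef.h>

    /* Returns 0 if every control message lies inside the control buffer. */
    static int validate_cmsgs(struct msghdr *msg)
    {
        struct cmsghdr *cmsg;

        for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
            if (cmsg->cmsg_len < sizeof(struct cmsghdr))
                return -1;      /* cannot even hold its own header */
            /* Reject a header whose claimed length escapes the buffer. */
            if ((size_t)((char *)cmsg - (char *)msg->msg_control)
                + cmsg->cmsg_len > msg->msg_controllen)
                return -1;
        }
        return 0;
    }

    int main(void)
    {
        struct msghdr msg = { 0 };   /* no control data: trivially valid */
        return validate_cmsgs(&msg);
    }
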
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 0b826870f..89d58936d 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -5,8 +5,9 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
* Andi Kleen <ak@muc.de>
+ * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
*
- * $Id: exthdrs.c,v 1.6 1998/04/30 16:24:20 freitag Exp $
+ * $Id: exthdrs.c,v 1.7 1998/08/26 12:04:49 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -37,55 +38,192 @@
#include <asm/uaccess.h>
-#define swap(a,b) do { typeof (a) tmp; tmp = (a); (a) = (b); (b) = (tmp); } while(0)
+/*
+ * Parsing inbound headers.
+ *
+ * The parsing function "func" returns a pointer to the place
+ * where the next nexthdr value is stored, or NULL if parsing
+ * failed. It should also update skb->h.
+ */
+
+struct hdrtype_proc
+{
+ int type;
+ u8* (*func) (struct sk_buff **, u8 *ptr);
+};
/*
- * inbound
+ * Parsing tlv encoded headers.
+ *
+ * The parsing function "func" returns 1 if parsing succeeded
+ * and 0 if it failed.
+ * It MUST NOT touch skb->h.
*/
-#if 0
-int ipv6_routing_header(struct sk_buff **skb_ptr, struct device *dev,
- __u8 *nhptr, struct ipv6_options *opt)
+
+struct tlvtype_proc
+{
+ int type;
+ int (*func) (struct sk_buff *, __u8 *ptr);
+};
+
+/*********************
+ Generic functions
+ *********************/
+
+/* An unknown option was detected; decide what to do */
+
+int ip6_tlvopt_unknown(struct sk_buff *skb, u8 *opt)
+{
+ switch ((opt[0] & 0xC0) >> 6) {
+ case 0: /* ignore */
+ return 1;
+
+ case 1: /* drop packet */
+ break;
+
+ case 3: /* Send ICMP if not a multicast address and drop packet */
+ /* Actually, this check is redundant; icmp_send
+ will recheck it in any case.
+ */
+ if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr))
+ break;
+ case 2: /* send ICMP PARM PROB regardless and drop packet */
+ icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, opt);
+ return 0;
+ };
+
+ kfree_skb(skb);
+ return 0;
+}
+
+/* Parse tlv encoded option header (hop-by-hop or destination) */
+
+static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff *skb,
+ __u8 *nhptr)
+{
+ struct tlvtype_proc *curr;
+ u8 *ptr = skb->h.raw;
+ int len = ((ptr[1]+1)<<3) - 2;
+
+ ptr += 2;
+
+ if (skb->tail - (ptr + len) < 0) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ while (len > 0) {
+ int optlen = ptr[1]+2;
+
+ switch (ptr[0]) {
+ case IPV6_TLV_PAD0:
+ optlen = 1;
+ break;
+
+ case IPV6_TLV_PADN:
+ break;
+
+ default: /* Other TLV code so scan list */
+ for (curr=procs; curr->type >= 0; curr++) {
+ if (curr->type == ptr[0]) {
+ if (curr->func(skb, ptr) == 0)
+ return 0;
+ break;
+ }
+ }
+ if (curr->type < 0) {
+ if (ip6_tlvopt_unknown(skb, ptr) == 0)
+ return 0;
+ }
+ break;
+ }
+ ptr += optlen;
+ len -= optlen;
+ }
+ if (len == 0)
+ return 1;
+ kfree_skb(skb);
+ return 0;
+}
+
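
Hop-by-hop and destination options share one TLV grammar: Pad1 is a single zero byte, PadN and every other option carry a length byte, and the top two bits of an unrecognized type encode the required action. A compact sketch of that walk over a raw option area (registered option handlers and the multicast refinement of action 3 are omitted; drop/ICMP outcomes are reduced to return codes):

    #include <stdint.h>
    #include <stddef.h>

    enum tlv_verdict { TLV_OK, TLV_DROP, TLV_DROP_ICMP };

    /* Walk len bytes of TLV options; buf points just past the 2-byte header. */
    static enum tlv_verdict walk_tlv(const uint8_t *buf, size_t len)
    {
        size_t off = 0;

        while (off < len) {
            uint8_t type = buf[off];

            if (type == 0) {        /* Pad1: one byte, no length field */
                off++;
                continue;
            }
            if (off + 2 > len || off + 2 + buf[off + 1] > len)
                return TLV_DROP;    /* option overruns the header */
            if (type != 1) {        /* not PadN: treat as unknown here */
                switch (type >> 6) {      /* the two action bits */
                case 0: break;                /* skip and keep parsing */
                case 1: return TLV_DROP;      /* drop silently */
                case 2:                       /* drop, always send ICMP */
                case 3: return TLV_DROP_ICMP; /* drop, ICMP if unicast */
                }
            }
            off += 2 + buf[off + 1];
        }
        return TLV_OK;
    }

    int main(void)
    {
        /* PadN covering 4 bytes, then an unknown "skip" option (action 00) */
        const uint8_t opts[] = { 1, 2, 0, 0, 0x05, 1, 0 };
        return walk_tlv(opts, sizeof(opts)) == TLV_OK ? 0 : 1;
    }
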
+/*****************************
+ Destination options header.
+ *****************************/
+
+struct tlvtype_proc tlvprocdestopt_lst[] = {
+ /* No destination options are defined now */
+ {-1, NULL}
+};
+
+static u8 *ipv6_dest_opt(struct sk_buff **skb_ptr, u8 *nhptr)
+{
+ struct sk_buff *skb=*skb_ptr;
+ struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb;
+ struct ipv6_destopt_hdr *hdr = (struct ipv6_destopt_hdr *) skb->h.raw;
+
+ opt->dst1 = (u8*)hdr - skb->nh.raw;
+
+ if (ip6_parse_tlv(tlvprocdestopt_lst, skb, nhptr)) {
+ skb->h.raw += ((hdr->hdrlen+1)<<3);
+ return &hdr->nexthdr;
+ }
+
+ return NULL;
+}
+
+/********************************
+ NONE header. No data in packet.
+ ********************************/
+
+static u8 *ipv6_nodata(struct sk_buff **skb_ptr, u8 *nhptr)
+{
+ kfree_skb(*skb_ptr);
+ return NULL;
+}
+
+/********************************
+ Routing header.
+ ********************************/
+
+static u8* ipv6_routing_header(struct sk_buff **skb_ptr, u8 *nhptr)
{
struct sk_buff *skb = *skb_ptr;
+ struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb;
struct in6_addr *addr;
struct in6_addr daddr;
- int addr_type = 0;
- int strict = 0;
- __u32 bit_map;
- int pos;
+ int addr_type;
int n, i;
struct ipv6_rt_hdr *hdr = (struct ipv6_rt_hdr *) skb->h.raw;
struct rt0_hdr *rthdr;
- if (hdr->segments_left == 0) {
- struct ipv6_options *opt;
-
- opt = (struct ipv6_options *) skb->cb;
- opt->srcrt = hdr;
+ if (((hdr->hdrlen+1)<<3) > skb->tail - skb->h.raw) {
+ ipv6_statistics.Ip6InHdrErrors++;
+ kfree_skb(skb);
+ return NULL;
+ }
+looped_back:
+ if (hdr->segments_left == 0) {
+ opt->srcrt = (u8*)hdr - skb->nh.raw;
skb->h.raw += (hdr->hdrlen + 1) << 3;
- return hdr->nexthdr;
+ opt->dst0 = opt->dst1;
+ opt->dst1 = 0;
+ return &hdr->nexthdr;
}
- if (hdr->type != IPV6_SRCRT_TYPE_0 || hdr->hdrlen & 0x01 ||
- hdr->hdrlen > 46) {
- /*
- * Discard
- */
-
- pos = (__u8 *) hdr - (__u8 *) skb->nh.ipv6h + 2;
+ if (hdr->type != IPV6_SRCRT_TYPE_0 || hdr->hdrlen & 0x01) {
+ u8 *pos = (u8*) hdr;
- if (hdr->type)
+ if (hdr->type != IPV6_SRCRT_TYPE_0)
pos += 2;
else
pos += 1;
- icmpv6_send(skb, ICMPV6_PARAMETER_PROB, 0, pos, dev);
- kfree_skb(skb);
- return 0;
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, pos);
+ return NULL;
}
-
+
/*
* This is the routing header forwarding algorithm from
* RFC 1883, page 17.
@@ -94,13 +232,21 @@ int ipv6_routing_header(struct sk_buff **skb_ptr, struct device *dev,
n = hdr->hdrlen >> 1;
if (hdr->segments_left > n) {
- pos = (__u8 *) hdr - (__u8 *) skb->nh.ipv6h + 2;
-
- pos += 3;
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, &hdr->segments_left);
+ return NULL;
+ }
- icmpv6_send(skb, ICMPV6_PARAMETER_PROB, 0, pos, dev);
+ /* We are about to mangle the packet header. Be careful!
+ Do not damage packets queued somewhere.
+ */
+ if (skb_cloned(skb)) {
+ struct sk_buff *skb2 = skb_copy(skb, GFP_ATOMIC);
kfree_skb(skb);
- return 0;
+ if (skb2 == NULL)
+ return NULL;
+ *skb_ptr = skb = skb2;
+ opt = (struct inet6_skb_parm *)skb2->cb;
+ hdr = (struct ipv6_rt_hdr *) skb2->h.raw;
}
i = n - --hdr->segments_left;
@@ -113,58 +259,429 @@ int ipv6_routing_header(struct sk_buff **skb_ptr, struct device *dev,
if (addr_type == IPV6_ADDR_MULTICAST) {
kfree_skb(skb);
- return 0;
+ return NULL;
}
ipv6_addr_copy(&daddr, addr);
ipv6_addr_copy(addr, &skb->nh.ipv6h->daddr);
ipv6_addr_copy(&skb->nh.ipv6h->daddr, &daddr);
- /*
- * Check Strick Source Route
+ dst_release(xchg(&skb->dst, NULL));
+ ip6_route_input(skb);
+ if (skb->dst->error) {
+ skb->dst->input(skb);
+ return NULL;
+ }
+ if (skb->dst->dev->flags&IFF_LOOPBACK) {
+ if (skb->nh.ipv6h->hop_limit <= 1) {
+ icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
+ 0, skb->dev);
+ kfree_skb(skb);
+ return NULL;
+ }
+ skb->nh.ipv6h->hop_limit--;
+ goto looped_back;
+ }
+
+ skb->dst->input(skb);
+ return NULL;
+}
+
+/*
+ This function inverts a received rthdr.
+ NOTE: the specs allow doing this automatically only if the
+ packet is authenticated.
+
+ I will not discuss it here (though I am really pissed off at
+ this stupid requirement making the rthdr idea useless).
+
+ Actually, it creates severe problems for us.
+ Embryonic requests have no associated sockets,
+ so that the user has no control over them and
+ not only cannot set reply options, but cannot
+ even know that someone tried to connect
+ without success. :-(
+
+ For now we need to test the engine, so I created a
+ temporary (or permanent) backdoor:
+ if the listening socket sets IPV6_RTHDR to 2, we invert the header.
+ --ANK (980729)
+ */
+
+struct ipv6_txoptions *
+ipv6_invert_rthdr(struct sock *sk, struct ipv6_rt_hdr *hdr)
+{
+ /* Received rthdr:
+
+ [ H1 -> H2 -> ... H_prev ] daddr=ME
+
+ Inverted result:
+ [ H_prev -> ... -> H1 ] daddr = sender
+
+ Note that the IP output engine will rewrite this rthdr
+ by rotating it left by one addr.
*/
- bit_map = ntohl(rthdr->bitmap);
+ int n, i;
+ struct rt0_hdr *rthdr = (struct rt0_hdr*)hdr;
+ struct rt0_hdr *irthdr;
+ struct ipv6_txoptions *opt;
+ int hdrlen = ipv6_optlen(hdr);
+
+ if (hdr->segments_left ||
+ hdr->type != IPV6_SRCRT_TYPE_0 ||
+ hdr->hdrlen & 0x01)
+ return NULL;
- if ((bit_map & (1 << i)) == IPV6_SRCRT_STRICT)
- strict = 1;
+ n = hdr->hdrlen >> 1;
+ opt = sock_kmalloc(sk, sizeof(*opt) + hdrlen, GFP_ATOMIC);
+ if (opt == NULL)
+ return NULL;
+ memset(opt, 0, sizeof(*opt));
+ opt->tot_len = sizeof(*opt) + hdrlen;
+ opt->srcrt = (void*)(opt+1);
+ opt->opt_nflen = hdrlen;
+
+ memcpy(opt->srcrt, hdr, sizeof(*hdr));
+ irthdr = (struct rt0_hdr*)opt->srcrt;
+ /* Obsolete field, MBZ, when originated by us */
+ irthdr->bitmap = 0;
+ opt->srcrt->segments_left = n;
+ for (i=0; i<n; i++)
+ memcpy(irthdr->addr+i, rthdr->addr+(n-1-i), 16);
+ return opt;
+}
- ipv6_forward(skb, dev, (strict ? IP6_FW_STRICT : 0) | IP6_FW_SRCRT);
+/********************************
+ AUTH header.
+ ********************************/
+/*
+ rfc1826 says that if a host does not implement the AUTH header
+ it MAY ignore it. We use this hole 8)
+
+ Actually, now we can implement OSPFv6 without kernel IPsec.
+ Authentication for the poor may be done in user space with the same success.
+
+ Yes, it means that we allow an application to send/receive
+ a raw authentication header. Apparently, we assume that it knows
+ what it is doing and calculates the authentication data correctly.
+ Certainly, this is possible only for udp and raw sockets, not for tcp.
+
+ BTW I beg pardon, this is not a good place for flames, but
+ I cannot be silent 8) It is very sad, but fools prevail 8)
+ The AUTH header has a 4-byte granular length, which kills the whole idea
+ behind the AUTOMATIC 64-bit alignment of IPv6. Now we will lose
+ cpu ticks checking that the sender did not do something stupid
+ and that opt->hdrlen is even. Shit! --ANK (980730)
+ */
+
+static u8 *ipv6_auth_hdr(struct sk_buff **skb_ptr, u8 *nhptr)
+{
+ struct sk_buff *skb=*skb_ptr;
+ struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb;
+ struct ipv6_opt_hdr *hdr = (struct ipv6_opt_hdr *)skb->h.raw;
+ int len = (hdr->hdrlen+2)<<2;
+
+ opt->auth = (u8*)hdr - skb->nh.raw;
+ if (skb->h.raw + len > skb->tail)
+ return NULL;
+ skb->h.raw += len;
+ return &hdr->nexthdr;
+}
+
+/* This list MUST NOT contain an entry for NEXTHDR_HOP.
+ It is parsed immediately after the packet is received,
+ and if it occurs anywhere else we must
+ generate an error.
+ */
+
+struct hdrtype_proc hdrproc_lst[] = {
+ {NEXTHDR_FRAGMENT, ipv6_reassembly},
+ {NEXTHDR_ROUTING, ipv6_routing_header},
+ {NEXTHDR_DEST, ipv6_dest_opt},
+ {NEXTHDR_NONE, ipv6_nodata},
+ {NEXTHDR_AUTH, ipv6_auth_hdr},
+ /*
+ {NEXTHDR_ESP, ipv6_esp_hdr},
+ */
+ {-1, NULL}
+};
+
+u8 *ipv6_parse_exthdrs(struct sk_buff **skb_in, u8 *nhptr)
+{
+ struct hdrtype_proc *hdrt;
+ u8 nexthdr = *nhptr;
+
+restart:
+ for (hdrt=hdrproc_lst; hdrt->type >= 0; hdrt++) {
+ if (hdrt->type == nexthdr) {
+ if ((nhptr = hdrt->func(skb_in, nhptr)) != NULL) {
+ nexthdr = *nhptr;
+ goto restart;
+ }
+ return NULL;
+ }
+ }
+ return nhptr;
+}
+
+
+/**********************************
+ Hop-by-hop options.
+ **********************************/
+
+/* Router Alert as of draft-ietf-ipngwg-ipv6router-alert-04 */
+
+static int ipv6_hop_ra(struct sk_buff *skb, u8 *ptr)
+{
+ if (ptr[1] == 2) {
+ ((struct inet6_skb_parm*)skb->cb)->ra = ptr - skb->nh.raw;
+ return 1;
+ }
+ if (net_ratelimit())
+ printk(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", ptr[1]);
+ kfree_skb(skb);
return 0;
}
+/* Jumbo payload */
+
+static int ipv6_hop_jumbo(struct sk_buff *skb, u8 *ptr)
+{
+ u32 pkt_len;
+
+ if (ptr[1] != 4 || ((ptr-skb->nh.raw)&3) != 2) {
+ if (net_ratelimit())
+ printk(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", ptr[1]);
+ goto drop;
+ }
+
+ pkt_len = ntohl(*(u32*)(ptr+2));
+ if (pkt_len < 0x10000) {
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ptr+2);
+ return 0;
+ }
+ if (skb->nh.ipv6h->payload_len) {
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ptr);
+ return 0;
+ }
+
+ if (pkt_len > skb->len - sizeof(struct ipv6hdr)) {
+ ipv6_statistics.Ip6InTruncatedPkts++;
+ goto drop;
+ }
+ skb_trim(skb, pkt_len + sizeof(struct ipv6hdr));
+ return 1;
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+struct tlvtype_proc tlvprochopopt_lst[] = {
+ {IPV6_TLV_ROUTERALERT, ipv6_hop_ra},
+ {IPV6_TLV_JUMBO, ipv6_hop_jumbo},
+ {-1, NULL}
+};
+
+u8 * ipv6_parse_hopopts(struct sk_buff *skb, u8 *nhptr)
+{
+ ((struct inet6_skb_parm*)skb->cb)->hop = sizeof(struct ipv6hdr);
+ if (ip6_parse_tlv(tlvprochopopt_lst, skb, nhptr))
+ return nhptr+((nhptr[1]+1)<<3);
+ return NULL;
+}
/*
- * outbound
+ * Creating outbound headers.
+ *
+ * "build" functions work when skb is filled from head to tail (datagram)
+ * "push" functions work when headers are added from tail to head (tcp)
+ *
+ * In both cases we assume that the caller reserved enough room
+ * for the headers.
*/
-int ipv6opt_bld_rthdr(struct sk_buff *skb, struct ipv6_options *opt,
- struct in6_addr *addr)
+u8 *ipv6_build_rthdr(struct sk_buff *skb, u8 *prev_hdr,
+ struct ipv6_rt_hdr *opt, struct in6_addr *addr)
{
struct rt0_hdr *phdr, *ihdr;
int hops;
- ihdr = (struct rt0_hdr *) opt->srcrt;
+ ihdr = (struct rt0_hdr *) opt;
phdr = (struct rt0_hdr *) skb_put(skb, (ihdr->rt_hdr.hdrlen + 1) << 3);
- memcpy(phdr, ihdr, sizeof(struct ipv6_rt_hdr));
+ memcpy(phdr, ihdr, sizeof(struct rt0_hdr));
hops = ihdr->rt_hdr.hdrlen >> 1;
-
+
if (hops > 1)
memcpy(phdr->addr, ihdr->addr + 1,
(hops - 1) * sizeof(struct in6_addr));
ipv6_addr_copy(phdr->addr + (hops - 1), addr);
+
+ phdr->rt_hdr.nexthdr = *prev_hdr;
+ *prev_hdr = NEXTHDR_ROUTING;
+ return &phdr->rt_hdr.nexthdr;
+}
+
+static u8 *ipv6_build_exthdr(struct sk_buff *skb, u8 *prev_hdr, u8 type, struct ipv6_opt_hdr *opt)
+{
+ struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb_put(skb, ipv6_optlen(opt));
+
+ memcpy(h, opt, ipv6_optlen(opt));
+ h->nexthdr = *prev_hdr;
+ *prev_hdr = type;
+ return &h->nexthdr;
+}
+
+static u8 *ipv6_build_authhdr(struct sk_buff *skb, u8 *prev_hdr, struct ipv6_opt_hdr *opt)
+{
+ struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb_put(skb, (opt->hdrlen+2)<<2);
+
+ memcpy(h, opt, (opt->hdrlen+2)<<2);
+ h->nexthdr = *prev_hdr;
+ *prev_hdr = NEXTHDR_AUTH;
+ return &h->nexthdr;
+}
+
+
+u8 *ipv6_build_nfrag_opts(struct sk_buff *skb, u8 *prev_hdr, struct ipv6_txoptions *opt,
+ struct in6_addr *daddr, u32 jumbolen)
+{
+ struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb->data;
+
+ if (opt && opt->hopopt)
+ prev_hdr = ipv6_build_exthdr(skb, prev_hdr, NEXTHDR_HOP, opt->hopopt);
+
+ if (jumbolen) {
+ u8 *jumboopt = (u8 *)skb_put(skb, 8);
+
+ if (opt && opt->hopopt) {
+ *jumboopt++ = IPV6_TLV_PADN;
+ *jumboopt++ = 0;
+ h->hdrlen++;
+ } else {
+ h = (struct ipv6_opt_hdr *)jumboopt;
+ h->nexthdr = *prev_hdr;
+ h->hdrlen = 0;
+ jumboopt += 2;
+ *prev_hdr = NEXTHDR_HOP;
+ prev_hdr = &h->nexthdr;
+ }
+ jumboopt[0] = IPV6_TLV_JUMBO;
+ jumboopt[1] = 4;
+ *(u32*)(jumboopt+2) = htonl(jumbolen);
+ }
+ if (opt) {
+ if (opt->dst0opt)
+ prev_hdr = ipv6_build_exthdr(skb, prev_hdr, NEXTHDR_DEST, opt->dst0opt);
+ if (opt->srcrt)
+ prev_hdr = ipv6_build_rthdr(skb, prev_hdr, opt->srcrt, daddr);
+ }
+ return prev_hdr;
+}
+
+u8 *ipv6_build_frag_opts(struct sk_buff *skb, u8 *prev_hdr, struct ipv6_txoptions *opt)
+{
+ if (opt->auth)
+ prev_hdr = ipv6_build_authhdr(skb, prev_hdr, opt->auth);
+ if (opt->dst1opt)
+ prev_hdr = ipv6_build_exthdr(skb, prev_hdr, NEXTHDR_DEST, opt->dst1opt);
+ return prev_hdr;
+}
+
+static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto,
+ struct ipv6_rt_hdr *opt,
+ struct in6_addr **addr_p)
+{
+ struct rt0_hdr *phdr, *ihdr;
+ int hops;
+
+ ihdr = (struct rt0_hdr *) opt;
- phdr->rt_hdr.nexthdr = proto;
- return NEXTHDR_ROUTING;
+ phdr = (struct rt0_hdr *) skb_push(skb, (ihdr->rt_hdr.hdrlen + 1) << 3);
+ memcpy(phdr, ihdr, sizeof(struct rt0_hdr));
+
+ hops = ihdr->rt_hdr.hdrlen >> 1;
+
+ if (hops > 1)
+ memcpy(phdr->addr, ihdr->addr + 1,
+ (hops - 1) * sizeof(struct in6_addr));
+
+ ipv6_addr_copy(phdr->addr + (hops - 1), *addr_p);
+ *addr_p = ihdr->addr;
+
+ phdr->rt_hdr.nexthdr = *proto;
+ *proto = NEXTHDR_ROUTING;
+}
+
+static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv6_opt_hdr *opt)
+{
+ struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb_push(skb, ipv6_optlen(opt));
+
+ memcpy(h, opt, ipv6_optlen(opt));
+ h->nexthdr = *proto;
+ *proto = type;
}
-#endif
+
+static void ipv6_push_authhdr(struct sk_buff *skb, u8 *proto, struct ipv6_opt_hdr *opt)
+{
+ struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb_push(skb, (opt->hdrlen+2)<<2);
+
+ memcpy(h, opt, (opt->hdrlen+2)<<2);
+ h->nexthdr = *proto;
+ *proto = NEXTHDR_AUTH;
+}
+
+void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
+ u8 *proto,
+ struct in6_addr **daddr)
+{
+ if (opt->srcrt)
+ ipv6_push_rthdr(skb, proto, opt->srcrt, daddr);
+ if (opt->dst0opt)
+ ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt);
+ if (opt->hopopt)
+ ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt);
+}
+
+void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto)
+{
+ if (opt->dst1opt)
+ ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt);
+ if (opt->auth)
+ ipv6_push_authhdr(skb, proto, opt->auth);
+}
+
+struct ipv6_txoptions *
+ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt)
+{
+ struct ipv6_txoptions *opt2;
+
+ opt2 = sock_kmalloc(sk, opt->tot_len, GFP_ATOMIC);
+ if (opt2) {
+ long dif = (char*)opt2 - (char*)opt;
+ memcpy(opt2, opt, opt->tot_len);
+ if (opt2->hopopt)
+ *((char**)&opt2->hopopt) += dif;
+ if (opt2->dst0opt)
+ *((char**)&opt2->dst0opt) += dif;
+ if (opt2->dst1opt)
+ *((char**)&opt2->dst1opt) += dif;
+ if (opt2->auth)
+ *((char**)&opt2->auth) += dif;
+ if (opt2->srcrt)
+ *((char**)&opt2->srcrt) += dif;
+ }
+ return opt2;
+}
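
ipv6_dup_options() relies on the whole option set living in one flat allocation of tot_len bytes, with the five header pointers aiming inside it; after the memcpy() each pointer still aims into the old block, so it is shifted by the byte distance between the two allocations. The idiom in isolation, as a sketch with made-up types and the userland allocator:

    #include <stdlib.h>
    #include <string.h>

    struct blob {
        char *inner;                 /* points somewhere inside "data" */
        char data[32];
    };

    static struct blob *dup_blob(struct blob *old)
    {
        struct blob *copy = malloc(sizeof(*copy));

        if (copy) {
            long dif = (char *)copy - (char *)old;

            memcpy(copy, old, sizeof(*copy));
            if (copy->inner)
                copy->inner += dif;  /* rebase into the new block */
        }
        return copy;
    }
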
+
/*
- * find out if nexthdr is an extension header or a protocol
+ * find out if nexthdr is a well-known extension header or a protocol
*/
static __inline__ int ipv6_ext_hdr(u8 nexthdr)
@@ -175,11 +692,9 @@ static __inline__ int ipv6_ext_hdr(u8 nexthdr)
return ( (nexthdr == NEXTHDR_HOP) ||
(nexthdr == NEXTHDR_ROUTING) ||
(nexthdr == NEXTHDR_FRAGMENT) ||
- (nexthdr == NEXTHDR_ESP) ||
(nexthdr == NEXTHDR_AUTH) ||
(nexthdr == NEXTHDR_NONE) ||
(nexthdr == NEXTHDR_DEST) );
-
}
/*
@@ -200,34 +715,57 @@ static __inline__ int ipv6_ext_hdr(u8 nexthdr)
*
* But I see no other way to do this. This might need to be reexamined
* when Linux implements ESP (and maybe AUTH) headers.
+ * --AK
+ *
+ * This function parses the (possibly truncated) exthdr set "hdr"
+ * of length "len". "nexthdrp" initially points to the place
+ * where the type of the first header can be found.
+ *
+ * It skips all well-known exthdrs and returns a pointer to the start
+ * of the unparsable area, i.e. the first header with an unknown type.
+ * If that pointer is not NULL, *nexthdrp is updated with the
+ * type/protocol of this header.
+ *
+ * NOTES: - if the packet is terminated with NEXTHDR_NONE, it returns NULL.
+ * - it may return a pointer beyond the end of the packet,
+ * if the last recognized header is truncated in the middle.
+ * - if the packet is truncated so that all parsed headers are skipped,
+ * it returns NULL.
+ * - the first fragment header is skipped; non-first ones
+ * are considered unparsable.
+ * - ESP is unparsable for now and is treated like a
+ * normal payload protocol.
+ * - note also the special handling of the AUTH header. Thanks to IPsec wizards.
+ *
+ * --ANK (980726)
*/
-struct ipv6_opt_hdr *ipv6_skip_exthdr(struct ipv6_opt_hdr *hdr,
- u8 *nexthdrp, int len)
+
+u8 *ipv6_skip_exthdr(struct ipv6_opt_hdr *hdr, u8 *nexthdrp, int len)
{
u8 nexthdr = *nexthdrp;
while (ipv6_ext_hdr(nexthdr)) {
int hdrlen;
-
- if (nexthdr == NEXTHDR_NONE)
+
+ if (len < sizeof(struct ipv6_opt_hdr))
return NULL;
- if (len < sizeof(struct ipv6_opt_hdr)) /* be anal today */
+ if (nexthdr == NEXTHDR_NONE)
return NULL;
-
- hdrlen = ipv6_optlen(hdr);
- if (len < hdrlen)
- return NULL;
+ if (nexthdr == NEXTHDR_FRAGMENT) {
+ struct frag_hdr *fhdr = (struct frag_hdr *) hdr;
+ if (ntohs(fhdr->frag_off) & ~0x7)
+ break;
+ hdrlen = 8;
+ } else if (nexthdr == NEXTHDR_AUTH)
+ hdrlen = (hdr->hdrlen+2)<<2;
+ else
+ hdrlen = ipv6_optlen(hdr);
nexthdr = hdr->nexthdr;
hdr = (struct ipv6_opt_hdr *) ((u8*)hdr + hdrlen);
len -= hdrlen;
}
- /* Hack.. Do the same for AUTH headers? */
- if (nexthdr == NEXTHDR_ESP)
- return NULL;
-
*nexthdrp = nexthdr;
- return hdr;
+ return (u8*)hdr;
}
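
Typical use of the rewritten ipv6_skip_exthdr(), mirroring what is_ineligible() in icmp.c below does; the helper itself is hypothetical:

    /* Locate the transport header. "hdr" is the IPv6 header and "len"
       the number of bytes available from it. */
    static u8 *find_transport(struct ipv6hdr *hdr, int len, u8 *proto)
    {
        u8 nexthdr = hdr->nexthdr;
        u8 *ptr;

        ptr = ipv6_skip_exthdr((struct ipv6_opt_hdr *)(hdr + 1),
                               &nexthdr, len - sizeof(struct ipv6hdr));
        if (ptr == NULL || ptr - (u8 *)hdr > len)
            return NULL;            /* chain unterminated or truncated */
        *proto = nexthdr;           /* upper-layer proto or unknown exthdr */
        return ptr;
    }
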
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index c3b6f7b6b..d43d1f98d 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -5,7 +5,7 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: icmp.c,v 1.18 1998/05/07 15:42:59 davem Exp $
+ * $Id: icmp.c,v 1.19 1998/08/26 12:04:52 davem Exp $
*
* Based on net/ipv4/icmp.c
*
@@ -58,16 +58,15 @@
#include <asm/uaccess.h>
#include <asm/system.h>
+struct icmpv6_mib icmpv6_statistics;
+
/*
* ICMP socket for flow control.
*/
struct socket *icmpv6_socket;
-int icmpv6_rcv(struct sk_buff *skb, struct device *dev,
- struct in6_addr *saddr, struct in6_addr *daddr,
- struct ipv6_options *opt, unsigned short len,
- int redo, struct inet6_protocol *protocol);
+int icmpv6_rcv(struct sk_buff *skb, unsigned long len);
static struct inet6_protocol icmpv6_protocol =
{
@@ -80,8 +79,6 @@ static struct inet6_protocol icmpv6_protocol =
"ICMPv6" /* name */
};
-
-
struct icmpv6_msg {
struct icmp6hdr icmph;
__u8 *data;
@@ -105,8 +102,11 @@ static int icmpv6_getfrag(const void *data, struct in6_addr *saddr,
/*
* in theory offset must be 0 since we never send more
- * than 576 bytes on an error or more than the path mtu
+ * than IPV6_MIN_MTU bytes on an error or more than the path mtu
* on an echo reply. (those are the rules on RFC 1883)
+ *
+ * Luckily, this statement is obsolete after
+ * draft-ietf-ipngwg-icmp-v2-00 --ANK (980730)
*/
if (offset) {
@@ -143,13 +143,36 @@ void icmpv6_param_prob(struct sk_buff *skb, int code, void *pos)
kfree_skb(skb);
}
-static inline int is_icmp(struct ipv6hdr *hdr, int len)
+/*
+ * Figure out whether we may reply to this packet with an icmp error.
+ *
+ * We do not reply if:
+ * - it was an icmp error message.
+ * - it is truncated in such a way that we only know that the protocol
+ * is ICMPV6 (i.e. it is cut off in the middle of some exthdr)
+ * - it is not the first fragment. BTW the IPv6 specs say nothing about
+ * this case, but it is clear that our reply would be useless
+ * for the sender.
+ *
+ * --ANK (980726)
+ */
+
+static int is_ineligible(struct ipv6hdr *hdr, int len)
{
- __u8 nexthdr = hdr->nexthdr;
+ u8 *ptr;
+ __u8 nexthdr = hdr->nexthdr;
+
+ if (len < (int)sizeof(*hdr))
+ return 1;
- if (!ipv6_skip_exthdr((struct ipv6_opt_hdr *)(hdr+1), &nexthdr, len))
- return 0;
- return nexthdr == IPPROTO_ICMP;
+ ptr = ipv6_skip_exthdr((struct ipv6_opt_hdr *)(hdr+1), &nexthdr, len - sizeof(*hdr));
+ if (!ptr)
+ return 0;
+ if (nexthdr == IPPROTO_ICMPV6) {
+ struct icmp6hdr *ihdr = (struct icmp6hdr *)ptr;
+ return (ptr - (u8*)hdr) > len || !(ihdr->icmp6_type & 0x80);
+ }
+ return nexthdr == NEXTHDR_FRAGMENT;
}
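
The icmp6_type & 0x80 test used here (and again in icmpv6_xrlim_allow below) works because ICMPv6 splits its type space in half: types 0..127 are error messages, 128..255 are informational. One bit therefore classifies a message:

    /* ICMPv6 type space: bit 7 clear => error, bit 7 set => informational. */
    static __inline__ int icmpv6_is_error(__u8 type)
    {
        return (type & 0x80) == 0;
    }
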
int sysctl_icmpv6_time = 1*HZ;
@@ -160,31 +183,37 @@ int sysctl_icmpv6_time = 1*HZ;
static inline int icmpv6_xrlim_allow(struct sock *sk, int type,
struct flowi *fl)
{
-#if 0
- struct dst_entry *dst;
- int allow = 0;
-#endif
+ struct dst_entry *dst;
+ int res = 0;
+
/* Informational messages are not limited. */
if (type & 0x80)
- return 1;
+ return 1;
-#if 0 /* not yet, first fix routing COW */
+ /* Do not limit pmtu discovery, it would break it. */
+ if (type == ICMPV6_PKT_TOOBIG)
+ return 1;
/*
* Look up the output route.
* XXX: perhaps the expire for routing entries cloned by
* this lookup should be more aggressive (not longer than timeout).
*/
- dst = ip6_route_output(sk, fl, 1);
- if (dst->error)
+ dst = ip6_route_output(sk, fl);
+ if (dst->error)
ipv6_statistics.Ip6OutNoRoutes++;
- else
- allow = xrlim_allow(dst, sysctl_icmpv6_time);
+ else {
+ struct rt6_info *rt = (struct rt6_info *)dst;
+ int tmo = sysctl_icmpv6_time;
+
+ /* Give more bandwidth to wider prefixes. */
+ if (rt->rt6i_dst.plen < 128)
+ tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
+
+ res = xrlim_allow(dst, tmo);
+ }
dst_release(dst);
- return allow;
-#else
- return 1;
-#endif
+ return res;
}
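
The shift above scales the rate-limit timeout by prefix width; presumably the point is that one xrlim bucket lives on the route, and a wide prefix funnels many destinations through the same bucket, so it is granted a shorter minimum interval between messages. With sysctl_icmpv6_time = 1*HZ as set just above, the arithmetic works out to:

    plen (prefix length)   (128 - plen) >> 5   effective timeout
    128  (host route)      0                   HZ
    96                     1                   HZ / 2
    64                     2                   HZ / 4
    32                     3                   HZ / 8
    0    (default route)   4                   HZ / 16
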
/*
@@ -196,7 +225,7 @@ static inline int icmpv6_xrlim_allow(struct sock *sk, int type,
static __inline__ int opt_unrec(struct sk_buff *skb, __u32 offset)
{
- char *buff = skb->nh.raw;
+ u8 *buff = skb->nh.raw;
return ( ( *(buff + offset) & 0xC0 ) == 0x80 );
}
@@ -215,7 +244,6 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
struct icmpv6_msg msg;
struct flowi fl;
int addr_type = 0;
- int optlen;
int len;
/*
@@ -237,7 +265,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
addr_type = ipv6_addr_type(&hdr->daddr);
- if (ipv6_chk_addr(&hdr->daddr, NULL, 0))
+ if (ipv6_chk_addr(&hdr->daddr, skb->dev, 0))
saddr = &hdr->daddr;
/*
@@ -275,8 +303,9 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
/*
* Never answer to a ICMP packet.
*/
- if (is_icmp(hdr, (u8*)skb->tail - (u8*)hdr)) {
- printk(KERN_DEBUG "icmpv6_send: no reply to icmp\n");
+ if (is_ineligible(hdr, (u8*)skb->tail - (u8*)hdr)) {
+ if (net_ratelimit())
+ printk(KERN_DEBUG "icmpv6_send: no reply to icmp error/fragment\n");
return;
}
@@ -303,34 +332,22 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
msg.data = skb->nh.raw;
msg.csum = 0;
msg.daddr = &hdr->saddr;
- /*
- if (skb->opt)
- optlen = skb->opt->optlen;
- else
- */
-
- optlen = 0;
- len = min(skb->tail - ((unsigned char *) hdr),
- 576 - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr)
- - optlen);
+ len = min((skb->tail - ((unsigned char *) hdr)) + sizeof(struct icmp6hdr),
+ IPV6_MIN_MTU - sizeof(struct icmp6hdr));
if (len < 0) {
printk(KERN_DEBUG "icmp: len problem\n");
return;
}
- len += sizeof(struct icmp6hdr);
-
msg.len = len;
ip6_build_xmit(sk, icmpv6_getfrag, &msg, &fl, len, NULL, -1,
MSG_DONTWAIT);
-
- /* Oops! We must purge cached dst, otherwise
- all the following ICMP messages will go there :) --ANK
- */
- dst_release(xchg(&sk->dst_cache, NULL));
+ if (type >= ICMPV6_DEST_UNREACH && type <= ICMPV6_PARAMPROB)
+ (&icmpv6_statistics.Icmp6OutDestUnreachs)[type-1]++;
+ icmpv6_statistics.Icmp6OutMsgs++;
}
static void icmpv6_echo_reply(struct sk_buff *skb)
@@ -374,38 +391,41 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
ip6_build_xmit(sk, icmpv6_getfrag, &msg, &fl, len, NULL, -1,
MSG_DONTWAIT);
-
- /* Oops! We must purge cached dst, otherwise
- all the following ICMP messages will go there :) --ANK
- */
- dst_release(xchg(&sk->dst_cache, NULL));
+ icmpv6_statistics.Icmp6OutEchoReplies++;
+ icmpv6_statistics.Icmp6OutMsgs++;
}
static void icmpv6_notify(struct sk_buff *skb,
- int type, int code, unsigned char *buff, int len,
- struct in6_addr *saddr, struct in6_addr *daddr,
- struct inet6_protocol *protocol)
+ int type, int code, unsigned char *buff, int len)
{
+ struct in6_addr *saddr = &skb->nh.ipv6h->saddr;
+ struct in6_addr *daddr = &skb->nh.ipv6h->daddr;
struct ipv6hdr *hdr = (struct ipv6hdr *) buff;
struct inet6_protocol *ipprot;
struct sock *sk;
- struct ipv6_opt_hdr *pb;
+ u8 *pb;
__u32 info = 0;
int hash;
u8 nexthdr;
nexthdr = hdr->nexthdr;
- pb = (struct ipv6_opt_hdr *) (hdr + 1);
len -= sizeof(struct ipv6hdr);
if (len < 0)
return;
/* now skip over extension headers */
- pb = ipv6_skip_exthdr(pb, &nexthdr, len);
+ pb = ipv6_skip_exthdr((struct ipv6_opt_hdr *) (hdr + 1), &nexthdr, len);
if (!pb)
return;
+ /* BUGGG_FUTURE: we should try to parse exthdrs in this packet.
+ Without this we will not be able, e.g., to do source routed
+ pmtu discovery.
+ The corresponding argument (opt) to the notifiers is already added.
+ --ANK (980726)
+ */
+
hash = nexthdr & (MAX_INET_PROTOS - 1);
for (ipprot = (struct inet6_protocol *) inet6_protos[hash];
@@ -414,9 +434,8 @@ static void icmpv6_notify(struct sk_buff *skb,
if (ipprot->protocol != nexthdr)
continue;
- if (ipprot->err_handler)
- ipprot->err_handler(skb, type, code, (u8*)pb, info,
- saddr, daddr, ipprot);
+ if (ipprot->err_handler)
+ ipprot->err_handler(skb, hdr, NULL, type, code, pb, info);
return;
}
@@ -428,7 +447,7 @@ static void icmpv6_notify(struct sk_buff *skb,
return;
while((sk = raw_v6_lookup(sk, nexthdr, daddr, saddr))) {
- rawv6_err(sk, type, code, (char*)pb, saddr, daddr);
+ rawv6_err(sk, skb, hdr, NULL, type, code, pb, info);
sk = sk->next;
}
}
@@ -437,14 +456,17 @@ static void icmpv6_notify(struct sk_buff *skb,
* Handle icmp messages
*/
-int icmpv6_rcv(struct sk_buff *skb, struct device *dev,
- struct in6_addr *saddr, struct in6_addr *daddr,
- struct ipv6_options *opt, unsigned short len,
- int redo, struct inet6_protocol *protocol)
+int icmpv6_rcv(struct sk_buff *skb, unsigned long len)
{
+ struct device *dev = skb->dev;
+ struct in6_addr *saddr = &skb->nh.ipv6h->saddr;
+ struct in6_addr *daddr = &skb->nh.ipv6h->daddr;
struct ipv6hdr *orig_hdr;
struct icmp6hdr *hdr = (struct icmp6hdr *) skb->h.raw;
int ulen;
+ int type;
+
+ icmpv6_statistics.Icmp6InMsgs++;
/* Perform checksum. */
switch (skb->ip_summed) {
@@ -480,8 +502,15 @@ int icmpv6_rcv(struct sk_buff *skb, struct device *dev,
* length of original packet carried in skb
*/
ulen = skb->tail - (unsigned char *) (hdr + 1);
-
- switch (hdr->icmp6_type) {
+
+ type = hdr->icmp6_type;
+
+ if (type >= ICMPV6_DEST_UNREACH && type <= ICMPV6_PARAMPROB)
+ (&icmpv6_statistics.Icmp6InDestUnreachs)[type-ICMPV6_DEST_UNREACH]++;
+ else if (type >= ICMPV6_ECHO_REQUEST && type <= NDISC_REDIRECT)
+ (&icmpv6_statistics.Icmp6InEchos)[type-ICMPV6_ECHO_REQUEST]++;
+
+ switch (type) {
case ICMPV6_ECHO_REQUEST:
icmpv6_echo_reply(skb);
@@ -492,9 +521,14 @@ int icmpv6_rcv(struct sk_buff *skb, struct device *dev,
break;
case ICMPV6_PKT_TOOBIG:
+ /* BUGGG_FUTURE: if the packet contains a rthdr, we cannot update
+ the standard destination cache. It seems only an "advanced"
+ destination cache will allow us to solve this problem
+ --ANK (980726)
+ */
orig_hdr = (struct ipv6hdr *) (hdr + 1);
if (ulen >= sizeof(struct ipv6hdr))
- rt6_pmtu_discovery(&orig_hdr->daddr, dev,
+ rt6_pmtu_discovery(&orig_hdr->daddr, &orig_hdr->saddr, dev,
ntohl(hdr->icmp6_mtu));
/*
@@ -504,10 +538,8 @@ int icmpv6_rcv(struct sk_buff *skb, struct device *dev,
case ICMPV6_DEST_UNREACH:
case ICMPV6_TIME_EXCEED:
case ICMPV6_PARAMPROB:
-
- icmpv6_notify(skb, hdr->icmp6_type, hdr->icmp6_code,
- (char *) (hdr + 1), ulen,
- saddr, daddr, protocol);
+ icmpv6_notify(skb, type, hdr->icmp6_code,
+ (char *) (hdr + 1), ulen);
break;
case NDISC_ROUTER_SOLICITATION:
@@ -515,7 +547,7 @@ int icmpv6_rcv(struct sk_buff *skb, struct device *dev,
case NDISC_NEIGHBOUR_SOLICITATION:
case NDISC_NEIGHBOUR_ADVERTISEMENT:
case NDISC_REDIRECT:
- ndisc_rcv(skb, dev, saddr, daddr, opt, len);
+ ndisc_rcv(skb, len);
break;
case ICMPV6_MGM_QUERY:
@@ -530,23 +562,26 @@ int icmpv6_rcv(struct sk_buff *skb, struct device *dev,
break;
default:
- printk(KERN_DEBUG "icmpv6: msg of unkown type\n");
+ if (net_ratelimit())
+ printk(KERN_DEBUG "icmpv6: msg of unknown type\n");
/* informational */
- if (hdr->icmp6_type & 0x80)
- goto discard_it;
+ if (type & 0x80)
+ break;
/*
 * error of unknown type.
* must pass to upper level
*/
- icmpv6_notify(skb, hdr->icmp6_type, hdr->icmp6_code,
- (char *) (hdr + 1), ulen,
- saddr, daddr, protocol);
+ icmpv6_notify(skb, type, hdr->icmp6_code,
+ (char *) (hdr + 1), ulen);
};
+ kfree_skb(skb);
+ return 0;
discard_it:
+ icmpv6_statistics.Icmp6InErrors++;
kfree_skb(skb);
return 0;
}
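
The statistics bumps above use the consecutive-fields-as-an-array idiom: (&icmpv6_statistics.Icmp6InDestUnreachs)[type - ICMPV6_DEST_UNREACH]++ treats the run of per-type counters as an array based at the first one. It works only because the struct lays the counters out contiguously and in the same order as the type codes. In miniature, with an illustrative struct rather than the kernel's icmpv6_mib:

    struct err_mib {
        unsigned long dest_unreachs;    /* type 1 */
        unsigned long pkt_toobigs;      /* type 2 */
        unsigned long time_excds;       /* type 3 */
        unsigned long parm_problems;    /* type 4 */
    };

    static void count_error(struct err_mib *mib, int type)
    {
        if (type >= 1 && type <= 4)
            (&mib->dest_unreachs)[type - 1]++;  /* fields as an array */
    }
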
@@ -597,7 +632,7 @@ static struct icmp6_err {
} tab_unreach[] = {
{ ENETUNREACH, 0}, /* NOROUTE */
{ EACCES, 1}, /* ADM_PROHIBITED */
- { EOPNOTSUPP, 1}, /* NOT_NEIGHBOUR */
+ { 0, 0}, /* Was NOT_NEIGHBOUR, now reserved */
{ EHOSTUNREACH, 0}, /* ADDR_UNREACH */
{ ECONNREFUSED, 1}, /* PORT_UNREACH */
};
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index e7e12e3ae..bad3a13ec 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -5,7 +5,7 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: ip6_fib.c,v 1.14 1998/05/07 15:43:03 davem Exp $
+ * $Id: ip6_fib.c,v 1.15 1998/08/26 12:04:55 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -32,10 +32,52 @@
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
-#define RT_DEBUG 2
+#define RT6_DEBUG 2
+#undef CONFIG_IPV6_SUBTREES
+
+#if RT6_DEBUG >= 1
+#define BUG_TRAP(x) ({ if (!(x)) { printk("Assertion (" #x ") failed at " __FILE__ "(%d):" __FUNCTION__ "\n", __LINE__); } })
+#else
+#define BUG_TRAP(x) do { ; } while (0)
+#endif
+
+#if RT6_DEBUG >= 3
+#define RT6_TRACE(x...) printk(KERN_DEBUG x)
+#else
+#define RT6_TRACE(x...) do { ; } while (0)
+#endif
struct rt6_statistics rt6_stats;
+enum fib_walk_state_t
+{
+#ifdef CONFIG_IPV6_SUBTREES
+ FWS_S,
+#endif
+ FWS_L,
+ FWS_R,
+ FWS_C,
+ FWS_U
+};
+
+struct fib6_cleaner_t
+{
+ struct fib6_walker_t w;
+ int (*func)(struct rt6_info *, void *arg);
+ void *arg;
+};
+
+#ifdef CONFIG_IPV6_SUBTREES
+#define FWS_INIT FWS_S
+#define SUBTREE(fn) ((fn)->subtree)
+#else
+#define FWS_INIT FWS_L
+#define SUBTREE(fn) NULL
+#endif
+
+static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt);
+static void fib6_repair_tree(struct fib6_node *fn);
+
/*
* A routing update causes an increase of the serial number on the
 * affected subtree. This allows for cached routes to be asynchronously
@@ -48,10 +90,24 @@ static __u32 rt_sernum = 0;
static struct timer_list ip6_fib_timer = {
NULL, NULL,
0,
- 0,
+ ~0UL,
fib6_run_gc
};
+static struct fib6_walker_t fib6_walker_list = {
+ &fib6_walker_list, &fib6_walker_list,
+};
+
+#define FOR_WALKERS(w) for ((w)=fib6_walker_list.next; (w) != &fib6_walker_list; (w)=(w)->next)
+
+static __inline__ u32 fib6_new_sernum(void)
+{
+ u32 n = ++rt_sernum;
+ if (n == 0)
+ n = ++rt_sernum;
+ return n;
+}
+
/*
* Auxiliary address test functions for the radix tree.
*
@@ -70,7 +126,7 @@ static __inline__ int addr_match(void *token1, void *token2, int prefixlen)
int pdw;
int pbi;
- pdw = prefixlen >> 0x05; /* num of whole __u32 in prefix */
+ pdw = prefixlen >> 5; /* num of whole __u32 in prefix */
pbi = prefixlen & 0x1f; /* num of bits in incomplete u32 in prefix */
if (pdw)
@@ -78,15 +134,11 @@ static __inline__ int addr_match(void *token1, void *token2, int prefixlen)
return 0;
if (pbi) {
- __u32 w1, w2;
__u32 mask;
- w1 = a1[pdw];
- w2 = a2[pdw];
-
- mask = htonl((0xffffffff) << (0x20 - pbi));
+ mask = htonl((0xffffffff) << (32 - pbi));
- if ((w1 ^ w2) & mask)
+ if ((a1[pdw] ^ a2[pdw]) & mask)
return 0;
}
@@ -99,24 +151,11 @@ static __inline__ int addr_match(void *token1, void *token2, int prefixlen)
static __inline__ int addr_bit_set(void *token, int fn_bit)
{
- int dw;
- __u32 b1;
- __u32 mask;
- int bit = fn_bit;
__u32 *addr = token;
- dw = bit >> 0x05;
-
- b1 = addr[dw];
-
- bit = ~bit;
- bit &= 0x1f;
- mask = htonl(1 << bit);
- return (b1 & mask);
+ return htonl(1 << ((~fn_bit)&0x1F)) & addr[fn_bit>>5];
}
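
The condensed addr_bit_set() packs three steps into one expression. The address is an array of __u32 stored in network byte order, and fn_bit counts bits from the most significant bit of the first word: fn_bit >> 5 selects the word, (~fn_bit) & 0x1f equals 31 - (fn_bit & 31) and turns the from-the-top bit index into a shift count, and htonl() puts the mask into the stored byte order. For example, fn_bit = 5 gives:

    word  = addr[5 >> 5]   = addr[0]
    shift = (~5) & 0x1f    = 26          /* i.e. 31 - 5 */
    mask  = htonl(1 << 26)               /* bit 5 below the MSB */

so the test extracts bit 5 of the prefix, exactly as the tree expects.
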
-
-
/*
* find the first different bit between two addresses
* length of address must be a multiple of 32bits
@@ -131,42 +170,47 @@ static __inline__ int addr_diff(void *token1, void *token2, int addrlen)
addrlen >>= 2;
for (i = 0; i < addrlen; i++) {
- __u32 b1, b2;
__u32 xb;
- b1 = a1[i];
- b2 = a2[i];
-
- xb = b1 ^ b2;
+ xb = a1[i] ^ a2[i];
if (xb) {
- int res = 0;
- int j=31;
+ int j = 31;
xb = ntohl(xb);
- while (test_bit(j, &xb) == 0) {
- res++;
+ while (test_bit(j, &xb) == 0)
j--;
- }
- return (i * 32 + res);
+ return (i * 32 + 31 - j);
}
}
/*
* we should *never* get to this point since that
* would mean the addrs are equal
+ *
+ * However, we do get to it 8) And exactly when
+ * addresses are equal 8)
+ *
+ * ip route add 1111::/128 via ...
+ * ip route add 1111::/64 via ...
+ * and we are here.
+ *
+ * Ideally, this function should stop comparison
+ * at prefix length. It does not, but it is still OK
+ * if the returned value is greater than the prefix length.
+ * --ANK (980803)
*/
- return -1;
+ return addrlen<<5;
}
static __inline__ struct fib6_node * node_alloc(void)
{
struct fib6_node *fn;
- if ((fn = kmalloc(sizeof(struct fib6_node), GFP_ATOMIC))) {
+ if ((fn = kmalloc(sizeof(struct fib6_node), GFP_ATOMIC)) != NULL) {
memset(fn, 0, sizeof(struct fib6_node));
rt6_stats.fib_nodes++;
}
@@ -180,13 +224,10 @@ static __inline__ void node_free(struct fib6_node * fn)
kfree(fn);
}
-extern __inline__ void rt6_release(struct rt6_info *rt)
+static __inline__ void rt6_release(struct rt6_info *rt)
{
- struct dst_entry *dst = (struct dst_entry *) rt;
- if (atomic_dec_and_test(&dst->refcnt)) {
- rt->rt6i_node = NULL;
- dst_free(dst);
- }
+ if (atomic_dec_and_test(&rt->rt6i_ref))
+ dst_free(&rt->u.dst);
}
@@ -200,18 +241,16 @@ extern __inline__ void rt6_release(struct rt6_info *rt)
static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr,
int addrlen, int plen,
- unsigned long offset,
- struct rt6_info *rt)
-
+ int offset)
{
- struct fib6_node *fn;
+ struct fib6_node *fn, *in, *ln;
struct fib6_node *pn = NULL;
- struct fib6_node *in;
- struct fib6_node *ln;
struct rt6key *key;
- __u32 bit;
- __u32 dir = 0;
- __u32 sernum = ++rt_sernum;
+ int bit;
+ int dir = 0;
+ __u32 sernum = fib6_new_sernum();
+
+ RT6_TRACE("fib6_add_1\n");
/* insert node in tree */
@@ -220,146 +259,143 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr,
if (plen == 0)
return fn;
- for (;;) {
- if (fn == NULL) {
- ln = node_alloc();
-
- if (ln == NULL)
- return NULL;
- ln->fn_bit = plen;
-
- ln->parent = pn;
- ln->fn_sernum = sernum;
- rt->rt6i_node = ln;
-
- if (dir)
- pn->right = ln;
- else
- pn->left = ln;
-
- return ln;
- }
-
+ do {
key = (struct rt6key *)((u8 *)fn->leaf + offset);
/*
* Prefix match
*/
- if (addr_match(&key->addr, addr, fn->fn_bit)) {
+ if (plen < fn->fn_bit ||
+ !addr_match(&key->addr, addr, fn->fn_bit))
+ goto insert_above;
- /*
- * Exact match ?
- */
+ /*
+ * Exact match ?
+ */
- if (plen == fn->fn_bit) {
- /* clean up an intermediate node */
- if ((fn->fn_flags & RTN_RTINFO) == 0) {
- rt6_release(fn->leaf);
- fn->leaf = NULL;
- }
+ if (plen == fn->fn_bit) {
+ /* clean up an intermediate node */
+ if ((fn->fn_flags & RTN_RTINFO) == 0) {
+ rt6_release(fn->leaf);
+ fn->leaf = NULL;
+ }
- fn->fn_sernum = sernum;
+ fn->fn_sernum = sernum;
- return fn;
- }
-
- /*
- * We have more bits to go
- */
-
- if (plen > fn->fn_bit) {
- /* Walk down on tree. */
- fn->fn_sernum = sernum;
- dir = addr_bit_set(addr, fn->fn_bit);
- pn = fn;
- fn = dir ? fn->right: fn->left;
-
- /*
- * Round we go. Note if fn has become
- * NULL then dir is set and fn is handled
- * top of loop.
- */
- continue;
- }
+ return fn;
}
/*
- * split since we don't have a common prefix anymore or
- * we have a less significant route.
- * we've to insert an intermediate node on the list
- * this new node will point to the one we need to create
- * and the current
+ * We have more bits to go
*/
+
+ /* Try to walk down on tree. */
+ fn->fn_sernum = sernum;
+ dir = addr_bit_set(addr, fn->fn_bit);
+ pn = fn;
+ fn = dir ? fn->right: fn->left;
+ } while (fn);
- pn = fn->parent;
+ /*
+ * We walked to the bottom of the tree.
+ * Create new leaf node without children.
+ */
- /* find 1st bit in difference between the 2 addrs */
- bit = addr_diff(addr, &key->addr, addrlen);
+ ln = node_alloc();
+ if (ln == NULL)
+ return NULL;
+ ln->fn_bit = plen;
+
+ ln->parent = pn;
+ ln->fn_sernum = sernum;
- /*
- * (intermediate)
- * / \
- * (new leaf node) (old node)
- */
- if (plen > bit) {
- in = node_alloc();
-
- if (in == NULL)
- return NULL;
-
- /*
- * new intermediate node.
- * RTN_RTINFO will
- * be off since that an address that chooses one of
- * the branches would not match less specific routes
- * int the other branch
- */
+ if (dir)
+ pn->right = ln;
+ else
+ pn->left = ln;
+
+ return ln;
- in->fn_bit = bit;
- in->parent = pn;
- in->leaf = rt;
+insert_above:
+ /*
+ * split since we don't have a common prefix anymore or
+ * we have a less significant route.
+ * we've to insert an intermediate node on the list
+ * this new node will point to the one we need to create
+ * and the current
+ */
+
+ pn = fn->parent;
- in->fn_sernum = sernum;
- atomic_inc(&rt->rt6i_ref);
+ /* find 1st bit in difference between the 2 addrs.
- /* leaf node */
- ln = node_alloc();
+ See comment in addr_diff: bit may be an invalid value,
+ but if it is >= plen, the value is ignored in any case.
+ */
+
+ bit = addr_diff(addr, &key->addr, addrlen);
- if (ln == NULL) {
+ /*
+ * (intermediate)[in]
+ * / \
+ * (new leaf node)[ln] (old node)[fn]
+ */
+ if (plen > bit) {
+ in = node_alloc();
+ ln = node_alloc();
+
+ if (in == NULL || ln == NULL) {
+ if (in)
node_free(in);
- return NULL;
- }
+ if (ln)
+ node_free(ln);
+ return NULL;
+ }
+
+ /*
+ * new intermediate node.
+ * RTN_RTINFO will
+ * be off since that an address that chooses one of
+ * the branches would not match less specific routes
+ * in the other branch
+ */
- /* update parent pointer */
- if (dir)
- pn->right = in;
- else
- pn->left = in;
+ in->fn_bit = bit;
- ln->fn_bit = plen;
+ in->parent = pn;
+ in->leaf = fn->leaf;
+ atomic_inc(&in->leaf->rt6i_ref);
- ln->parent = in;
- fn->parent = in;
+ in->fn_sernum = sernum;
- ln->fn_sernum = sernum;
+ /* update parent pointer */
+ if (dir)
+ pn->right = in;
+ else
+ pn->left = in;
- if (addr_bit_set(addr, bit)) {
- in->right = ln;
- in->left = fn;
- } else {
- in->left = ln;
- in->right = fn;
- }
+ ln->fn_bit = plen;
+
+ ln->parent = in;
+ fn->parent = in;
+
+ ln->fn_sernum = sernum;
- return ln;
+ if (addr_bit_set(addr, bit)) {
+ in->right = ln;
+ in->left = fn;
+ } else {
+ in->left = ln;
+ in->right = fn;
}
+ } else { /* plen <= bit */
/*
- * (new leaf node)
+ * (new leaf node)[ln]
* / \
- * (old node) NULL
+ * (old node)[fn] NULL
*/
ln = node_alloc();
@@ -377,7 +413,6 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr,
pn->right = ln;
else
pn->left = ln;
-
if (addr_bit_set(&key->addr, plen))
ln->right = fn;
@@ -385,11 +420,8 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr,
ln->left = fn;
fn->parent = ln;
-
- return ln;
}
-
- return NULL;
+ return ln;
}
/*
@@ -401,7 +433,6 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt)
struct rt6_info *iter = NULL;
struct rt6_info **ins;
- rt->rt6i_node = fn;
ins = &fn->leaf;
for (iter = fn->leaf; iter; iter=iter->u.next) {
@@ -423,7 +454,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt)
iter->rt6i_expires = rt->rt6i_expires;
if (!(rt->rt6i_flags&RTF_EXPIRES)) {
iter->rt6i_flags &= ~RTF_EXPIRES;
- iter->rt6i_expires = rt->rt6i_expires;
+ iter->rt6i_expires = 0;
}
return -EEXIST;
}
@@ -439,8 +470,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt)
* insert node
*/
- *ins = rt;
rt->u.next = iter;
+ *ins = rt;
+ rt->rt6i_node = fn;
atomic_inc(&rt->rt6i_ref);
#ifdef CONFIG_RTNETLINK
inet6_rt_notify(RTM_NEWROUTE, rt);
@@ -457,8 +489,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt)
static __inline__ void fib6_start_gc(struct rt6_info *rt)
{
- if ((ip6_fib_timer.expires == 0) &&
- (rt->rt6i_flags & (RTF_ADDRCONF | RTF_CACHE))) {
+ if (ip6_fib_timer.expires == 0 &&
+ (rt->rt6i_flags & (RTF_EXPIRES|RTF_CACHE))) {
del_timer(&ip6_fib_timer);
ip6_fib_timer.expires = jiffies + ip6_rt_gc_interval;
add_timer(&ip6_fib_timer);
@@ -475,67 +507,97 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt)
{
struct fib6_node *fn;
int err = -ENOMEM;
- unsigned long offset;
-
- offset = (u8*) &rt->rt6i_dst - (u8*) rt;
+
fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr),
- rt->rt6i_dst.plen, offset, rt);
+ rt->rt6i_dst.plen, (u8*) &rt->rt6i_dst - (u8*) rt);
- if (fn == NULL) {
-#if RT_DEBUG >= 2
- printk(KERN_DEBUG "fib6_add: fn == NULL\n");
-#endif
- goto out;
- }
+ if (fn == NULL)
+ return -ENOMEM;
+#ifdef CONFIG_IPV6_SUBTREES
if (rt->rt6i_src.plen) {
struct fib6_node *sn;
-#if RT_DEBUG >= 2
- printk(KERN_DEBUG "fib6_add: src.len > 0\n");
-#endif
-
if (fn->subtree == NULL) {
struct fib6_node *sfn;
- if (fn->leaf == NULL) {
- fn->leaf = rt;
- atomic_inc(&rt->rt6i_ref);
- }
+ /*
+ * Create subtree.
+ *
+ * fn[main tree]
+ * |
+ * sfn[subtree root]
+ * \
+ * sn[new leaf node]
+ */
+ /* Create subtree root node */
sfn = node_alloc();
-
if (sfn == NULL)
- goto out;
+ goto st_failure;
- sfn->parent = fn;
sfn->leaf = &ip6_null_entry;
+ atomic_inc(&ip6_null_entry.rt6i_ref);
sfn->fn_flags = RTN_ROOT;
- sfn->fn_sernum = ++rt_sernum;
+ sfn->fn_sernum = fib6_new_sernum();
- fn->subtree = sfn;
- }
+ /* Now add the first leaf node to new subtree */
- offset = (u8*) &rt->rt6i_src - (u8*) rt;
+ sn = fib6_add_1(sfn, &rt->rt6i_src.addr,
+ sizeof(struct in6_addr), rt->rt6i_src.plen,
+ (u8*) &rt->rt6i_src - (u8*) rt);
- sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr,
- sizeof(struct in6_addr), rt->rt6i_src.plen,
- offset, rt);
+ if (sn == NULL) {
+ /* If it failed, discard the just-allocated
+ root, and then (in st_failure) the stale node
+ in the main tree.
+ */
+ node_free(sfn);
+ goto st_failure;
+ }
- if (sn == NULL)
- goto out;
+ /* Now link new subtree to main tree */
+ sfn->parent = fn;
+ fn->subtree = sfn;
+ if (fn->leaf == NULL) {
+ fn->leaf = rt;
+ atomic_inc(&rt->rt6i_ref);
+ }
+ } else {
+ sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr,
+ sizeof(struct in6_addr), rt->rt6i_src.plen,
+ (u8*) &rt->rt6i_src - (u8*) rt);
+
+ if (sn == NULL)
+ goto st_failure;
+ }
fn = sn;
}
+#endif
err = fib6_add_rt2node(fn, rt);
- if (err == 0)
+ if (err == 0) {
fib6_start_gc(rt);
-out:
+ if (!(rt->rt6i_flags&RTF_CACHE))
+ fib6_prune_clones(fn, rt);
+ }
+
if (err)
dst_free(&rt->u.dst);
return err;
+
+#ifdef CONFIG_IPV6_SUBTREES
+ /* Subtree creation failed; probably the main tree node
+ is now an orphan. If it is, shoot it.
+ */
+st_failure:
+ if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
+ fib6_repair_tree(fn);
+ dst_free(&rt->u.dst);
+ return err;
+#endif
}
/*
@@ -544,7 +606,7 @@ out:
*/
struct lookup_args {
- unsigned long offset; /* key offset on rt6_info */
+ int offset; /* key offset on rt6_info */
struct in6_addr *addr; /* search key */
};
@@ -576,6 +638,7 @@ static struct fib6_node * fib6_lookup_1(struct fib6_node *root,
}
while ((fn->fn_flags & RTN_ROOT) == 0) {
+#ifdef CONFIG_IPV6_SUBTREES
if (fn->subtree) {
struct fib6_node *st;
struct lookup_args *narg;
@@ -591,6 +654,7 @@ static struct fib6_node * fib6_lookup_1(struct fib6_node *root,
}
}
}
+#endif
if (fn->fn_flags & RTN_RTINFO) {
struct rt6key *key;
@@ -618,8 +682,10 @@ struct fib6_node * fib6_lookup(struct fib6_node *root, struct in6_addr *daddr,
args[0].offset = (u8*) &rt->rt6i_dst - (u8*) rt;
args[0].addr = daddr;
+#ifdef CONFIG_IPV6_SUBTREES
args[1].offset = (u8*) &rt->rt6i_src - (u8*) rt;
args[1].addr = saddr;
+#endif
fn = fib6_lookup_1(root, args);
@@ -630,12 +696,79 @@ struct fib6_node * fib6_lookup(struct fib6_node *root, struct in6_addr *daddr,
}
/*
+ * Get the node with the specified destination prefix (and source prefix,
+ * if subtrees are used)
+ */
+
+
+static struct fib6_node * fib6_locate_1(struct fib6_node *root,
+ struct in6_addr *addr,
+ int plen, int offset)
+{
+ struct fib6_node *fn;
+
+ for (fn = root; fn ; ) {
+ struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset);
+
+ /*
+ * Prefix match
+ */
+ if (plen < fn->fn_bit ||
+ !addr_match(&key->addr, addr, fn->fn_bit))
+ return NULL;
+
+ if (plen == fn->fn_bit)
+ return fn;
+
+ /*
+ * We have more bits to go
+ */
+ if (addr_bit_set(addr, fn->fn_bit))
+ fn = fn->right;
+ else
+ fn = fn->left;
+ }
+ return NULL;
+}
+
+struct fib6_node * fib6_locate(struct fib6_node *root,
+ struct in6_addr *daddr, int dst_len,
+ struct in6_addr *saddr, int src_len)
+{
+ struct rt6_info *rt = NULL;
+ struct fib6_node *fn;
+
+ fn = fib6_locate_1(root, daddr, dst_len,
+ (u8*) &rt->rt6i_dst - (u8*) rt);
+
+#ifdef CONFIG_IPV6_SUBTREES
+ if (src_len) {
+ BUG_TRAP(saddr!=NULL);
+ if (fn)
+ fn = fn->subtree;
+ if (fn)
+ fn = fib6_locate_1(fn, saddr, src_len,
+ (u8*) &rt->rt6i_src - (u8*) rt);
+ }
+#endif
+
+ if (fn && fn->fn_flags&RTN_RTINFO)
+ return fn;
+
+ return NULL;
+}
+
+
+/*
* Deletion
*
*/
static struct rt6_info * fib6_find_prefix(struct fib6_node *fn)
{
+ if (fn->fn_flags&RTN_ROOT)
+ return &ip6_null_entry;
+
while(fn) {
if(fn->left)
return fn->left->leaf;
@@ -643,7 +776,7 @@ static struct rt6_info * fib6_find_prefix(struct fib6_node *fn)
if(fn->right)
return fn->right->leaf;
- fn = fn->subtree;
+ fn = SUBTREE(fn);
}
return NULL;
}
@@ -653,428 +786,414 @@ static struct rt6_info * fib6_find_prefix(struct fib6_node *fn)
* is the node we want to try and remove.
*/
-static void fib6_del_2(struct fib6_node *fn)
+static void fib6_repair_tree(struct fib6_node *fn)
{
- struct rt6_info *rt;
-
- fn->fn_flags &= ~RTN_RTINFO;
- rt6_stats.fib_route_nodes--;
+ int children;
+ int nstate;
+ struct fib6_node *child, *pn;
+ struct fib6_walker_t *w;
+ int iter = 0;
- /*
- * Can't delete a root node
- */
-
- if (fn->fn_flags & RTN_TL_ROOT)
- return;
+ for (;;) {
+ RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
+ iter++;
- do {
- struct fib6_node *pn, *child;
- int children = 0;
+ BUG_TRAP(!(fn->fn_flags&RTN_RTINFO));
+ BUG_TRAP(!(fn->fn_flags&RTN_TL_ROOT));
+ BUG_TRAP(fn->leaf==NULL);
+ children = 0;
child = NULL;
+ if (fn->right) child = fn->right, children |= 1;
+ if (fn->left) child = fn->left, children |= 2;
- /*
- * We have a child to left
- */
-
- if (fn->left) {
- children++;
- child = fn->left;
- }
-
- /*
- * To right
- */
-
- if (fn->right) {
- children++;
- child = fn->right;
- }
-
- /*
- * We can't tidy a case of two children.
- */
- if (children > 1) {
- if (fn->leaf == NULL)
- goto split_repair;
- break;
+ if (children == 3 || SUBTREE(fn)
+#ifdef CONFIG_IPV6_SUBTREES
+ /* Subtree root (i.e. fn) may have one child */
+ || (children && fn->fn_flags&RTN_ROOT)
+#endif
+ ) {
+ fn->leaf = fib6_find_prefix(fn);
+#if RT6_DEBUG >= 2
+ if (fn->leaf==NULL) {
+ BUG_TRAP(fn->leaf);
+ fn->leaf = &ip6_null_entry;
+ }
+#endif
+ atomic_inc(&fn->leaf->rt6i_ref);
+ return;
}
- if (fn->fn_flags & RTN_RTINFO)
- break;
-
- /*
- * The node we plan to tidy has an stree. Talk about
- * making life hard.
- */
-
- if (fn->subtree)
- goto stree_node;
-
- /*
- * Up we go
- */
-
pn = fn->parent;
-
- /*
- * Not a ROOT - we can tidy
- */
-
- if ((fn->fn_flags & RTN_ROOT) == 0) {
- /*
- * Make our child our parents child
- */
- if (pn->left == fn)
- pn->left = child;
- else
- pn->right = child;
-
- /*
- * Reparent the child
- */
+#ifdef CONFIG_IPV6_SUBTREES
+ if (SUBTREE(pn) == fn) {
+ BUG_TRAP(fn->fn_flags&RTN_ROOT);
+ SUBTREE(pn) = NULL;
+ nstate = FWS_L;
+ } else {
+ BUG_TRAP(!(fn->fn_flags&RTN_ROOT));
+#endif
+ if (pn->right == fn) pn->right = child;
+ else if (pn->left == fn) pn->left = child;
+#if RT6_DEBUG >= 2
+ else BUG_TRAP(0);
+#endif
if (child)
child->parent = pn;
+ nstate = FWS_R;
+#ifdef CONFIG_IPV6_SUBTREES
+ }
+#endif
- /*
- * Discard leaf entries
- */
- if (fn->leaf)
- rt6_release(fn->leaf);
- } else {
- if (children)
- break;
- /*
- * No children so no subtree
- */
-
- pn->subtree = NULL;
+ FOR_WALKERS(w) {
+ if (child == NULL) {
+ if (w->root == fn) {
+ w->root = w->node = NULL;
+ RT6_TRACE("W %p adjusted by delroot 1\n", w);
+ } else if (w->node == fn) {
+ RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate);
+ w->node = pn;
+ w->state = nstate;
+ }
+ } else {
+ if (w->root == fn) {
+ w->root = child;
+ RT6_TRACE("W %p adjusted by delroot 2\n", w);
+ }
+ if (w->node == fn) {
+ w->node = child;
+ if (children&2) {
+ RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
+ w->state = w->state>=FWS_R ? FWS_U : FWS_INIT;
+ } else {
+ RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
+ w->state = w->state>=FWS_C ? FWS_U : FWS_INIT;
+ }
+ }
+ }
}
- /*
- * We are discarding
- */
node_free(fn);
-
- /*
- * Our merge of entries might propogate further
- * up the tree, so move up a level and retry.
- */
-
- fn = pn;
-
- } while (!(fn->fn_flags & RTN_TL_ROOT));
-
- return;
-
-stree_node:
-
- rt6_release(fn->leaf);
-
-split_repair:
- rt = fib6_find_prefix(fn);
-
- if (rt == NULL)
- panic("fib6_del_2: inconsistent tree\n");
+ if (pn->fn_flags&RTN_RTINFO || SUBTREE(pn))
+ return;
- atomic_inc(&rt->rt6i_ref);
- fn->leaf = rt;
+ rt6_release(pn->leaf);
+ pn->leaf = NULL;
+ fn = pn;
+ }
}
-/*
- * Remove our entry in the tree. This throws away the route entry
- * from the list of entries attached to this fib node. It doesn't
- * expunge from the tree.
- */
-
-static struct fib6_node * fib6_del_1(struct rt6_info *rt)
+static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp)
{
- struct fib6_node *fn;
-
- fn = rt->rt6i_node;
+ struct fib6_walker_t *w;
+ struct rt6_info *rt = *rtp;
+
+ RT6_TRACE("fib6_del_route\n");
+
+ if (!(rt->rt6i_flags&RTF_CACHE))
+ fib6_prune_clones(fn, rt);
+
+ /* Unlink it */
+ *rtp = rt->u.next;
+ rt->rt6i_node = NULL;
+ rt6_stats.fib_rt_entries--;
+
+ /* Adjust walkers */
+ FOR_WALKERS(w) {
+ if (w->state == FWS_C && w->leaf == rt) {
+ RT6_TRACE("walker %p adjusted by delroute\n", w);
+ w->leaf = rt->u.next;
+ if (w->leaf == NULL)
+ w->state = FWS_U;
+ }
+ }
- /* We need a fib node! */
- if (fn) {
- struct rt6_info **back;
- struct rt6_info *lf;
+ rt->u.next = NULL;
- back = &fn->leaf;
-
- /*
- * Walk the leaf entries looking for ourself
- */
-
- for(lf = fn->leaf; lf; lf=lf->u.next) {
- if (rt == lf) {
- /*
- * Delete this entry.
- */
-
- *back = lf->u.next;
-#ifdef CONFIG_RTNETLINK
- inet6_rt_notify(RTM_DELROUTE, lf);
-#endif
- rt6_release(lf);
- rt6_stats.fib_rt_entries--;
- return fn;
- }
- back = &lf->u.next;
- }
+ /* If it was last route, expunge its radix tree node */
+ if (fn->leaf == NULL) {
+ fn->fn_flags &= ~RTN_RTINFO;
+ rt6_stats.fib_route_nodes--;
+ fib6_repair_tree(fn);
}
- return NULL;
+#ifdef CONFIG_RTNETLINK
+ inet6_rt_notify(RTM_DELROUTE, rt);
+#endif
+ rt6_release(rt);
}
int fib6_del(struct rt6_info *rt)
{
- struct fib6_node *fn;
-
- fn = fib6_del_1(rt);
+ struct fib6_node *fn = rt->rt6i_node;
+ struct rt6_info **rtp;
- if (fn == NULL)
+#if RT6_DEBUG >= 2
+ if (rt->u.dst.obsolete>0) {
+ BUG_TRAP(rt->u.dst.obsolete>0);
+ return -EFAULT;
+ }
+#endif
+ if (fn == NULL || rt == &ip6_null_entry)
return -ENOENT;
- if (fn->leaf == NULL)
- fib6_del_2(fn);
+ BUG_TRAP(fn->fn_flags&RTN_RTINFO);
- return 0;
+ /*
+ * Walk the leaf entries looking for ourself
+ */
+
+ for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.next) {
+ if (*rtp == rt) {
+ fib6_del_route(fn, rtp);
+ return 0;
+ }
+ }
+ return -ENOENT;
}
/*
- * Tree transversal function
+ * Tree traversal function.
*
- * Wau... It is NOT REENTERABLE!!!!!!! It is cathastrophe. --ANK
+ * Certainly, it is not interrupt safe.
+ * However, it is internally reenterable wrt itself and fib6_add/fib6_del.
+ * It means that we can modify the tree while walking it,
+ * and use this function for garbage collection, clone pruning,
+ * cleaning the tree when a device goes down, etc.
+ *
+ * It guarantees that every node will be traversed,
+ * and that it will be traversed only once.
+ *
+ * Callback function w->func may return:
+ * 0 -> continue walking.
+ * positive value -> walking is suspended (used by tree dumps,
+ * and probably by gc, if it is ever split into several slices)
+ * negative value -> terminate walking.
+ *
+ * The function itself returns:
+ * 0 -> walk is complete.
+ * >0 -> walk is incomplete (i.e. suspended)
+ * <0 -> walk is terminated by an error.
*/
-int fib6_walk_count;
-
-void fib6_walk_tree(struct fib6_node *root, f_pnode func, void *arg,
- int filter)
+int fib6_walk_continue(struct fib6_walker_t *w)
{
- struct fib6_node *fn;
+ struct fib6_node *fn, *pn;
- fn = root;
+ for (;;) {
+ fn = w->node;
+ if (fn == NULL)
+ return 0;
- fib6_walk_count++;
-
- do {
- if (!(fn->fn_flags & RTN_TAG)) {
- fn->fn_flags |= RTN_TAG;
-
+ if (w->prune && fn != w->root &&
+ fn->fn_flags&RTN_RTINFO && w->state < FWS_C) {
+ w->state = FWS_C;
+ w->leaf = fn->leaf;
+ }
+ switch (w->state) {
+#ifdef CONFIG_IPV6_SUBTREES
+ case FWS_S:
+ if (SUBTREE(fn)) {
+ w->node = SUBTREE(fn);
+ continue;
+ }
+ w->state = FWS_L;
+#endif
+ case FWS_L:
if (fn->left) {
- fn = fn->left;
+ w->node = fn->left;
+ w->state = FWS_INIT;
continue;
}
- }
-
- fn->fn_flags &= ~RTN_TAG;
-
- if (fn->right) {
- fn = fn->right;
- continue;
- }
-
- do {
- struct fib6_node *node;
-
- if (fn->fn_flags & RTN_ROOT)
- break;
- node = fn;
- fn = fn->parent;
-
- if (!(node->fn_flags & RTN_TAG)) {
- if (node->subtree) {
- fib6_walk_tree(node->subtree, func,
- arg, filter);
- }
-
- if (!filter ||
- (node->fn_flags & RTN_RTINFO))
- (*func)(node, arg);
+ w->state = FWS_R;
+ case FWS_R:
+ if (fn->right) {
+ w->node = fn->right;
+ w->state = FWS_INIT;
+ continue;
}
-
- } while (!(fn->fn_flags & RTN_TAG));
-
- } while (!(fn->fn_flags & RTN_ROOT) || (fn->fn_flags & RTN_TAG));
-
- fib6_walk_count--;
+ w->state = FWS_C;
+ w->leaf = fn->leaf;
+ case FWS_C:
+ if (w->leaf && fn->fn_flags&RTN_RTINFO) {
+ int err = w->func(w);
+ if (err)
+ return err;
+ continue;
+ }
+ w->state = FWS_U;
+ case FWS_U:
+ if (fn == w->root)
+ return 0;
+ pn = fn->parent;
+ w->node = pn;
+#ifdef CONFIG_IPV6_SUBTREES
+ if (SUBTREE(pn) == fn) {
+ BUG_TRAP(fn->fn_flags&RTN_ROOT);
+ w->state = FWS_L;
+ continue;
+ }
+#endif
+ if (pn->left == fn) {
+ w->state = FWS_R;
+ continue;
+ }
+ if (pn->right == fn) {
+ w->state = FWS_C;
+ w->leaf = w->node->leaf;
+ continue;
+ }
+#if RT6_DEBUG >= 2
+ BUG_TRAP(0);
+#endif
+ }
+ }
}
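
The walker states map directly onto the traversal position: FWS_S (visit the subtree, only under CONFIG_IPV6_SUBTREES), FWS_L (descend left), FWS_R (descend right), FWS_C (run the callback over this node's leaf chain) and FWS_U (climb back to the parent). Because the position lives in the fib6_walker_t rather than on the call stack, a callback can suspend the walk by returning a positive value and the caller can resume later. A rough sketch of that pattern; the callback name is made up, and note that fib6_walk() below unlinks the walker only when it returns <= 0 itself:

    /* int my_dump_node(struct fib6_walker_t *w) is a hypothetical
       callback; returning > 0 from it suspends the walk. */
    static void dump_table(void)
    {
        struct fib6_walker_t w;
        int res;

        memset(&w, 0, sizeof(w));
        w.root = &ip6_routing_table;
        w.func = my_dump_node;

        res = fib6_walk(&w);
        while (res > 0) {
            /* ... drain whatever the callback buffered ... */
            res = fib6_walk_continue(&w);
            if (res <= 0)
                fib6_walker_unlink(&w);  /* resumed by hand, so
                                            unlink by hand too */
        }
    }
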
-/*
- * Garbage collection
- */
-
-static int fib6_gc_node(struct fib6_node *fn, int timeout)
+int fib6_walk(struct fib6_walker_t *w)
{
- struct rt6_info *rt, **back;
- int more = 0;
- unsigned long now = jiffies;
-
- back = &fn->leaf;
-
- for (rt = fn->leaf; rt;) {
- if ((rt->rt6i_flags & RTF_CACHE) && atomic_read(&rt->rt6i_use) == 0) {
- if ((long)(now - rt->rt6i_tstamp) >= timeout) {
- struct rt6_info *old;
-
- old = rt;
+ int res;
- rt = rt->u.next;
+ w->state = FWS_INIT;
+ w->node = w->root;
- *back = rt;
+ fib6_walker_link(w);
+ res = fib6_walk_continue(w);
+ if (res <= 0)
+ fib6_walker_unlink(w);
+ return res;
+}
- old->rt6i_node = NULL;
-#ifdef CONFIG_RTNETLINK
- inet6_rt_notify(RTM_DELROUTE, old);
+static int fib6_clean_node(struct fib6_walker_t *w)
+{
+ int res;
+ struct rt6_info *rt;
+ struct fib6_cleaner_t *c = (struct fib6_cleaner_t*)w;
+
+ for (rt = w->leaf; rt; rt = rt->u.next) {
+ res = c->func(rt, c->arg);
+ if (res < 0) {
+ w->leaf = rt;
+ res = fib6_del(rt);
+ if (res) {
+#if RT6_DEBUG >= 2
+ printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res);
#endif
- old->u.dst.obsolete = 1;
- rt6_release(old);
- rt6_stats.fib_rt_entries--;
continue;
}
- more++;
+ return 0;
}
+ BUG_TRAP(res==0);
+ }
+ w->leaf = rt;
+ return 0;
+}
- /*
- * check addrconf expiration here.
- *
- * BUGGGG Crossing fingers and ...
- * Seems, radix tree walking is absolutely broken,
- * but we will try in any case --ANK
- */
- if ((rt->rt6i_flags&RTF_EXPIRES) && rt->rt6i_expires
- && (long)(now - rt->rt6i_expires) > 0) {
- struct rt6_info *old;
+/*
+ * Convenient frontend to tree walker.
+ *
+ * func is called on each route.
+ * It may return -1 -> delete this route.
+ * 0 -> continue walking
+ *
+ * prune==1 -> only immediate children of node (certainly,
+ * ignoring pure split nodes) will be scanned.
+ */
- old = rt;
- rt = rt->u.next;
+void fib6_clean_tree(struct fib6_node *root,
+ int (*func)(struct rt6_info *, void *arg),
+ int prune, void *arg)
+{
+ struct fib6_cleaner_t c;
- *back = rt;
+ c.w.root = root;
+ c.w.func = fib6_clean_node;
+ c.w.prune = prune;
+ c.func = func;
+ c.arg = arg;
- old->rt6i_node = NULL;
-#ifdef CONFIG_RTNETLINK
- inet6_rt_notify(RTM_DELROUTE, old);
-#endif
- old->u.dst.obsolete = 1;
- rt6_release(old);
- rt6_stats.fib_rt_entries--;
- continue;
- }
- back = &rt->u.next;
- rt = rt->u.next;
+ start_bh_atomic();
+ fib6_walk(&c.w);
+ end_bh_atomic();
+}
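
fib6_prune_clones() just below is one client of this frontend; another natural one, sketched here with a hypothetical callback (the rt6i_dev field name is assumed), is flushing every route through an interface that went down:

    /* Return -1 to delete the route, 0 to keep walking. */
    static int rt6_dev_match(struct rt6_info *rt, void *arg)
    {
        struct device *dev = arg;

        return rt->rt6i_dev == dev ? -1 : 0;
    }

    static void rt6_flush_dev(struct device *dev)
    {
        fib6_clean_tree(&ip6_routing_table, rt6_dev_match, 0, dev);
    }
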
+
+static int fib6_prune_clone(struct rt6_info *rt, void *arg)
+{
+ if (rt->rt6i_flags & RTF_CACHE) {
+ RT6_TRACE("pruning clone %p\n", rt);
+ return -1;
}
- return more;
+ return 0;
}
-struct fib6_gc_args {
- unsigned long timeout;
- int more;
-};
+static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt)
+{
+ fib6_clean_tree(fn, fib6_prune_clone, 1, rt);
+}
+
+/*
+ * Garbage collection
+ */
-static void fib6_garbage_collect(struct fib6_node *fn, void *p_arg)
+static struct fib6_gc_args
{
- struct fib6_gc_args * args = (struct fib6_gc_args *) p_arg;
+ int timeout;
+ int more;
+} gc_args;
- if (fn->fn_flags & RTN_RTINFO) {
- int more;
+static int fib6_age(struct rt6_info *rt, void *arg)
+{
+ unsigned long now = jiffies;
- more = fib6_gc_node(fn, args->timeout);
+ /* Age clones. Note that clones are aged out
+ only if they are not in use now.
+ */
- if (fn->leaf) {
- args->more += more;
- return;
+ if (rt->rt6i_flags & RTF_CACHE) {
+ if (atomic_read(&rt->u.dst.use) == 0 &&
+ (long)(now - rt->u.dst.lastuse) >= gc_args.timeout) {
+ RT6_TRACE("aging clone %p\n", rt);
+ return -1;
}
-
- rt6_stats.fib_route_nodes--;
- fn->fn_flags &= ~RTN_RTINFO;
+ gc_args.more++;
+ return 0;
}
/*
- * tree nodes (with no routing information)
+ * check addrconf expiration here.
+ * Addrconf routes are expired even if they are in use.
*/
- if (!fn->subtree && !(fn->fn_flags & RTN_TL_ROOT)) {
- int children = 0;
- struct fib6_node *chld = NULL;
-
- if (fn->left) {
- children++;
- chld = fn->left;
- }
-
- if (fn->right) {
- children++;
- chld = fn->right;
- }
-
- if ((fn->fn_flags & RTN_ROOT)) {
- if (children == 0) {
- struct fib6_node *pn;
-
- pn = fn->parent;
- pn->subtree = NULL;
-
- node_free(fn);
- }
- return;
- }
-
- if (children <= 1) {
- struct fib6_node *pn = fn->parent;
-
- if (pn->left == fn)
- pn->left = chld;
- else
- pn->right = chld;
-
- if (chld)
- chld->parent = pn;
-
- if (fn->leaf)
- rt6_release(fn->leaf);
-
- node_free(fn);
-
- return;
+ if (rt->rt6i_flags&RTF_EXPIRES && rt->rt6i_expires) {
+ if ((long)(now - rt->rt6i_expires) > 0) {
+ RT6_TRACE("expiring %p\n", rt);
+ return -1;
}
+ gc_args.more++;
+ return 0;
}
- if (fn->leaf == NULL) {
- struct rt6_info *nrt;
-
- nrt = fib6_find_prefix(fn);
-
- if (nrt == NULL)
- panic("fib6: inconsistent tree\n");
-
- atomic_inc(&nrt->rt6i_ref);
- fn->leaf = nrt;
- }
+ return 0;
}
void fib6_run_gc(unsigned long dummy)
{
- struct fib6_gc_args arg = {
- ip6_rt_gc_timeout,
- 0
- };
+ if (dummy != ~0UL)
+ gc_args.timeout = (int)dummy;
+ else
+ gc_args.timeout = ip6_rt_gc_interval;
- del_timer(&ip6_fib_timer);
+ gc_args.more = 0;
- if (dummy)
- arg.timeout = dummy;
+ fib6_clean_tree(&ip6_routing_table, fib6_age, 0, NULL);
- if (fib6_walk_count == 0)
- fib6_walk_tree(&ip6_routing_table, fib6_garbage_collect, &arg, 0);
- else
- arg.more = 1;
+ del_timer(&ip6_fib_timer);
- if (arg.more) {
+ ip6_fib_timer.expires = 0;
+ if (gc_args.more) {
ip6_fib_timer.expires = jiffies + ip6_rt_gc_interval;
add_timer(&ip6_fib_timer);
- } else {
- ip6_fib_timer.expires = 0;
}
}
@@ -1084,3 +1203,5 @@ void fib6_gc_cleanup(void)
del_timer(&ip6_fib_timer);
}
#endif
+
+
diff --git a/net/ipv6/ip6_fw.c b/net/ipv6/ip6_fw.c
index 3c3a0cfc5..c19a561e9 100644
--- a/net/ipv6/ip6_fw.c
+++ b/net/ipv6/ip6_fw.c
@@ -5,7 +5,7 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: ip6_fw.c,v 1.9 1998/02/12 07:43:42 davem Exp $
+ * $Id: ip6_fw.c,v 1.10 1998/08/26 12:04:57 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -300,14 +300,19 @@ int ip6_fw_msg_add(struct ip6_fw_msg *msg)
rl->info.uli_u.data = msg->u.data;
rtmsg.rtmsg_flags = RTF_NONEXTHOP|RTF_POLICY;
- rt = ip6_route_add(&rtmsg, &err);
+ err = ip6_route_add(&rtmsg);
- /* BUGGGG! rt can point to nowhere. */
- if (rt == NULL) {
+ if (err) {
ip6_fwrule_free(rl);
- return -ENOMEM;
+ return err;
}
+ /* The rest will not work for now. --ABK (989725) */
+
+#ifndef notdef
+ ip6_fwrule_free(rl);
+ return -EPERM;
+#else
rt->u.dst.error = -EPERM;
if (msg->policy == IP6_FW_ACCEPT) {
@@ -327,6 +332,7 @@ int ip6_fw_msg_add(struct ip6_fw_msg *msg)
rt->rt6i_flowr = flow_clone((struct flow_rule *)rl);
return 0;
+#endif
}
static int ip6_fw_msgrcv(int unit, struct sk_buff *skb)
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 6ab4d2c08..6d7359aef 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -6,7 +6,7 @@
* Pedro Roque <roque@di.fc.ul.pt>
* Ian P. Morris <I.P.Morris@soton.ac.uk>
*
- * $Id: ip6_input.c,v 1.10 1998/07/15 05:05:34 davem Exp $
+ * $Id: ip6_input.c,v 1.11 1998/08/26 12:04:59 davem Exp $
*
* Based in linux/net/ipv4/ip_input.c
*
@@ -37,144 +37,21 @@
#include <net/ip6_route.h>
#include <net/addrconf.h>
-static int ipv6_dest_opt(struct sk_buff **skb_ptr, struct device *dev,
- __u8 *nhptr, struct ipv6_options *opt);
-
-struct hdrtype_proc {
- u8 type;
- int (*func) (struct sk_buff **, struct device *dev, __u8 *ptr,
- struct ipv6_options *opt);
-} hdrproc_lst[] = {
-
- /*
- TODO
-
- {NEXTHDR_HOP, ipv6_hop_by_hop}
- {NEXTHDR_ROUTING, ipv6_routing_header},
- */
- {NEXTHDR_FRAGMENT, ipv6_reassembly},
-
- {NEXTHDR_DEST, ipv6_dest_opt},
- /*
- {NEXTHDR_AUTH, ipv6_auth_hdr},
- {NEXTHDR_ESP, ipv6_esp_hdr},
- */
- {NEXTHDR_MAX, NULL}
-};
-
-/* New header structures */
-
-
-struct tlvtype_proc {
- u8 type;
- int (*func) (struct sk_buff *, struct device *dev, __u8 *ptr,
- struct ipv6_options *opt);
- /*
- * these functions do NOT update skb->h.raw
- */
-
-} tlvprocdestopt_lst[] = {
- {255, NULL}
-};
-
-int ip6_dstopt_unknown(struct sk_buff *skb, struct ipv6_tlvtype *hdr)
-{
- struct in6_addr *daddr;
- int pos;
-
- /*
- * unkown destination option type
- */
-
- pos = (__u8 *) hdr - (__u8 *) skb->nh.raw;
-
- /* I think this is correct please check - IPM */
-
- switch ((hdr->type & 0xC0) >> 6) {
- case 0: /* ignore */
- skb->h.raw += hdr->len+2;
- return 1;
-
- case 1: /* drop packet */
- break;
-
- case 2: /* send ICMP PARM PROB regardless and drop packet */
- icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_UNK_OPTION,
- pos, skb->dev);
- break;
-
- case 3: /* Send ICMP if not a multicast address and drop packet */
- daddr = &skb->nh.ipv6h->daddr;
- if (!(ipv6_addr_type(daddr) & IPV6_ADDR_MULTICAST))
- icmpv6_send(skb, ICMPV6_PARAMPROB,
- ICMPV6_UNK_OPTION, pos, skb->dev);
- };
-
- kfree_skb(skb);
- return 0;
-}
-
-static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff *skb,
- struct device *dev, __u8 *nhptr,
- struct ipv6_options *opt, void *lastopt)
-{
- struct ipv6_tlvtype *hdr;
- struct tlvtype_proc *curr;
-
- while ((hdr=(struct ipv6_tlvtype *)skb->h.raw) != lastopt) {
- switch (hdr->type) {
- case 0: /* TLV encoded Pad1 */
- skb->h.raw++;
- break;
-
- case 1: /* TLV encoded PadN */
- skb->h.raw += hdr->len+2;
- break;
-
- default: /* Other TLV code so scan list */
- for (curr=procs; curr->type != 255; curr++) {
- if (curr->type == (hdr->type)) {
- curr->func(skb, dev, nhptr, opt);
- skb->h.raw += hdr->len+2;
- break;
- }
- }
- if (curr->type==255) {
- if (ip6_dstopt_unknown(skb, hdr) == 0)
- return 0;
- }
- break;
- }
- }
- return 1;
-}
-
-static int ipv6_dest_opt(struct sk_buff **skb_ptr, struct device *dev,
- __u8 *nhptr, struct ipv6_options *opt)
-{
- struct sk_buff *skb=*skb_ptr;
- struct ipv6_destopt_hdr *hdr = (struct ipv6_destopt_hdr *) skb->h.raw;
- int res = 0;
- void *lastopt=skb->h.raw+hdr->hdrlen+sizeof(struct ipv6_destopt_hdr);
-
- skb->h.raw += sizeof(struct ipv6_destopt_hdr);
- if (ip6_parse_tlv(tlvprocdestopt_lst, skb, dev, nhptr, opt, lastopt))
- res = hdr->nexthdr;
- skb->h.raw+=hdr->hdrlen;
-
- return res;
-}
-
int ipv6_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
{
struct ipv6hdr *hdr;
- int pkt_len;
+ u32 pkt_len;
- if (skb->pkt_type == PACKET_OTHERHOST) {
- kfree_skb(skb);
- return 0;
- }
+ if (skb->pkt_type == PACKET_OTHERHOST)
+ goto drop;
+
+ ipv6_statistics.Ip6InReceives++;
+
+ /* Store the incoming device index. Once the packet
+ has been queued, we cannot refer to skb->dev anymore.
+ */
+ ((struct inet6_skb_parm *)skb->cb)->iif = dev->ifindex;
hdr = skb->nh.ipv6h;
@@ -183,16 +60,31 @@ int ipv6_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt)
pkt_len = ntohs(hdr->payload_len);
- if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
- goto err;
+ /* pkt_len may be zero if Jumbo payload option is present */
+ if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
+ if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
+ goto truncated;
+ skb_trim(skb, pkt_len + sizeof(struct ipv6hdr));
+ }
- skb_trim(skb, pkt_len + sizeof(struct ipv6hdr));
+ if (hdr->nexthdr == NEXTHDR_HOP) {
+ skb->h.raw = (u8*)(hdr+1);
+ if (!ipv6_parse_hopopts(skb, &hdr->nexthdr)) {
+ ipv6_statistics.Ip6InHdrErrors++;
+ return 0;
+ }
+ }
- ip6_route_input(skb);
-
- return 0;
+ if (skb->dst == NULL)
+ ip6_route_input(skb);
+
+ return skb->dst->input(skb);
+
+truncated:
+ ipv6_statistics.Ip6InTruncatedPkts++;
err:
ipv6_statistics.Ip6InHdrErrors++;
+drop:
kfree_skb(skb);
return 0;
}
@@ -217,8 +109,7 @@ static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb)
* without calling rawv6.c)
*/
static struct sock * ipv6_raw_deliver(struct sk_buff *skb,
- struct ipv6_options *opt,
- int nexthdr, int len)
+ int nexthdr, unsigned long len)
{
struct in6_addr *saddr;
struct in6_addr *daddr;
@@ -253,8 +144,8 @@ static struct sock * ipv6_raw_deliver(struct sk_buff *skb,
continue;
buff = skb_clone(skb, GFP_ATOMIC);
- buff->sk = sk2;
- rawv6_rcv(buff, skb->dev, saddr, daddr, opt, len);
+ if (buff)
+ rawv6_rcv(sk2, buff, len);
}
}
@@ -270,10 +161,8 @@ static struct sock * ipv6_raw_deliver(struct sk_buff *skb,
int ip6_input(struct sk_buff *skb)
{
- struct ipv6_options *opt = (struct ipv6_options *) skb->cb;
struct ipv6hdr *hdr = skb->nh.ipv6h;
struct inet6_protocol *ipprot;
- struct hdrtype_proc *hdrt;
struct sock *raw_sk;
__u8 *nhptr;
int nexthdr;
@@ -281,7 +170,7 @@ int ip6_input(struct sk_buff *skb)
u8 hash;
int len;
- skb->h.raw += sizeof(struct ipv6hdr);
+ skb->h.raw = skb->nh.raw + sizeof(struct ipv6hdr);
/*
* Parse extension headers
@@ -290,64 +179,55 @@ int ip6_input(struct sk_buff *skb)
nexthdr = hdr->nexthdr;
nhptr = &hdr->nexthdr;
- /*
- * check for extension headers
- */
-
-st_loop:
+ /* Skip hop-by-hop options, they are already parsed. */
+ if (nexthdr == NEXTHDR_HOP) {
+ nhptr = (u8*)(hdr+1);
+ nexthdr = *nhptr;
+ skb->h.raw += (nhptr[1]+1)<<3;
+ }
- for (hdrt=hdrproc_lst; hdrt->type != NEXTHDR_MAX; hdrt++) {
- if (hdrt->type == nexthdr) {
- if ((nexthdr = hdrt->func(&skb, skb->dev, nhptr, opt))) {
- nhptr = skb->h.raw;
- hdr = skb->nh.ipv6h;
- goto st_loop;
- }
+ /* This check is a sort of optimization.
+ It would be stupid to scan for optional headers
+ which are missing with a probability of 200%
+ */
+ if (nexthdr != IPPROTO_TCP && nexthdr != IPPROTO_UDP) {
+ nhptr = ipv6_parse_exthdrs(&skb, nhptr);
+ if (nhptr == NULL)
return 0;
- }
+ nexthdr = *nhptr;
+ hdr = skb->nh.ipv6h;
}
-
len = skb->tail - skb->h.raw;
- raw_sk = ipv6_raw_deliver(skb, opt, nexthdr, len);
+ raw_sk = ipv6_raw_deliver(skb, nexthdr, len);
hash = nexthdr & (MAX_INET_PROTOS - 1);
for (ipprot = (struct inet6_protocol *) inet6_protos[hash];
ipprot != NULL;
ipprot = (struct inet6_protocol *) ipprot->next) {
struct sk_buff *buff = skb;
-
+
if (ipprot->protocol != nexthdr)
continue;
-
+
if (ipprot->copy || raw_sk)
buff = skb_clone(skb, GFP_ATOMIC);
-
-
- ipprot->handler(buff, skb->dev, &hdr->saddr, &hdr->daddr,
- opt, len, 0, ipprot);
+
+ ipprot->handler(buff, len);
found = 1;
}
-
+
if (raw_sk) {
- skb->sk = raw_sk;
- rawv6_rcv(skb, skb->dev, &hdr->saddr, &hdr->daddr, opt, len);
+ rawv6_rcv(raw_sk, skb, len);
found = 1;
}
-
+
/*
* not found: send ICMP parameter problem back
*/
-
if (!found) {
- unsigned long offset;
-#if IP6_DEBUG >= 2
- printk(KERN_DEBUG "proto not found %d\n", nexthdr);
-#endif
- offset = nhptr - (u8*) hdr;
- icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_UNK_NEXTHDR,
- offset, skb->dev);
- kfree_skb(skb);
+ ipv6_statistics.Ip6InUnknownProtos++;
+ icmpv6_param_prob(skb, ICMPV6_UNK_NEXTHDR, nhptr);
}
return 0;
@@ -359,6 +239,8 @@ int ip6_mc_input(struct sk_buff *skb)
int deliver = 0;
int discard = 1;
+ ipv6_statistics.Ip6InMcastPkts++;
+
hdr = skb->nh.ipv6h;
if (ipv6_chk_mcast_addr(skb->dev, &hdr->daddr))
deliver = 1;
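
A note on the hop-by-hop skip in ip6_input() above: every IPv6 extension
header stores its length in its second octet, counted in 8-octet units and
excluding the first 8 octets, which is why the parser advances by
(nhptr[1]+1)<<3. A minimal illustrative helper (not part of the patch; the
name is made up) spelling that arithmetic out:

	/* Byte length of an IPv6 extension header; hdr[1] is the
	 * Hdr Ext Len field: 8-octet units, the first 8 octets excluded.
	 */
	static __inline__ unsigned int ext_hdr_len(const __u8 *hdr)
	{
		return (hdr[1] + 1) << 3;	/* hdr[1] == 0  =>  8 bytes */
	}
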
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index aa13c2074..0555c1a24 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -5,7 +5,7 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: ip6_output.c,v 1.13 1998/07/15 05:05:38 davem Exp $
+ * $Id: ip6_output.c,v 1.14 1998/08/26 12:05:01 davem Exp $
*
* Based on linux/net/ipv4/ip_output.c
*
@@ -13,6 +13,14 @@
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *	A.N.Kuznetsov	:	arithmetic in fragmentation.
+ * extension headers are implemented.
+ * route changes now work.
+ * ip6_forward does not confuse sniffers.
+ * etc.
+ *
*/
#include <linux/errno.h>
@@ -33,6 +41,7 @@
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
+#include <net/icmp.h>
static u32 ipv6_fragmentation_id = 1;
@@ -59,6 +68,8 @@ int ip6_output(struct sk_buff *skb)
return 0;
}
}
+
+ ipv6_statistics.Ip6OutMcastPkts++;
}
if (hh) {
@@ -85,17 +96,40 @@ int ip6_output(struct sk_buff *skb)
*/
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
- struct ipv6_options *opt)
+ struct ipv6_txoptions *opt)
{
struct ipv6_pinfo * np = sk ? &sk->net_pinfo.af_inet6 : NULL;
+ struct in6_addr *first_hop = fl->nl_u.ip6_u.daddr;
struct dst_entry *dst = skb->dst;
struct ipv6hdr *hdr;
- int seg_len;
+ u8 proto = fl->proto;
+ int seg_len = skb->len;
int hlimit;
- /* Do something with IPv6 options headers here. */
+ if (opt) {
+ int head_room;
- seg_len = skb->len;
+		/* First: exthdrs may take lots of space (~8K for now);
+		   MAX_HEADER is not enough.
+ */
+ head_room = opt->opt_nflen + opt->opt_flen;
+ seg_len += head_room;
+ head_room += sizeof(struct ipv6hdr) + ((dst->dev->hard_header_len + 15)&~15);
+
+ if (skb_headroom(skb) < head_room) {
+ struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
+			kfree_skb(skb);
+ skb = skb2;
+ if (skb == NULL)
+ return -ENOBUFS;
+ if (sk)
+ skb_set_owner_w(skb, sk);
+ }
+ if (opt->opt_flen)
+ ipv6_push_frag_opts(skb, opt, &proto);
+ if (opt->opt_nflen)
+ ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
+ }
hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
@@ -117,16 +151,22 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
hlimit = ((struct rt6_info*)dst)->rt6i_hoplimit;
hdr->payload_len = htons(seg_len);
- hdr->nexthdr = fl->proto;
+ hdr->nexthdr = proto;
hdr->hop_limit = hlimit;
ipv6_addr_copy(&hdr->saddr, fl->nl_u.ip6_u.saddr);
- ipv6_addr_copy(&hdr->daddr, fl->nl_u.ip6_u.daddr);
+ ipv6_addr_copy(&hdr->daddr, first_hop);
- ipv6_statistics.Ip6OutRequests++;
- dst->output(skb);
+ if (skb->len <= dst->pmtu) {
+ ipv6_statistics.Ip6OutRequests++;
+ dst->output(skb);
+ return 0;
+ }
- return 0;
+ printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst->pmtu, skb->dev);
+ kfree_skb(skb);
+ return -EMSGSIZE;
}
/*
@@ -166,8 +206,8 @@ int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct device *dev,
return 0;
}
-static void ip6_bld_1(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
- int hlimit, unsigned short pktlength)
+static struct ipv6hdr * ip6_bld_1(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
+ int hlimit, unsigned pktlength)
{
struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
struct ipv6hdr *hdr;
@@ -177,43 +217,56 @@ static void ip6_bld_1(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
hdr->version = 6;
hdr->priority = np->priority;
-
memcpy(hdr->flow_lbl, &np->flow_lbl, 3);
hdr->payload_len = htons(pktlength - sizeof(struct ipv6hdr));
-
- /*
- * FIXME: hop limit has default UNI/MCAST and
- * msgctl settings
- */
hdr->hop_limit = hlimit;
+ hdr->nexthdr = fl->proto;
ipv6_addr_copy(&hdr->saddr, fl->nl_u.ip6_u.saddr);
- ipv6_addr_copy(&hdr->daddr, fl->nl_u.ip6_u.daddr);
+ ipv6_addr_copy(&hdr->daddr, fl->nl_u.ip6_u.daddr);
+ return hdr;
+}
+
+static __inline__ u8 * ipv6_build_fraghdr(struct sk_buff *skb, u8* prev_hdr, unsigned offset)
+{
+ struct frag_hdr *fhdr;
+
+ fhdr = (struct frag_hdr *) skb_put(skb, sizeof(struct frag_hdr));
+
+ fhdr->nexthdr = *prev_hdr;
+ *prev_hdr = NEXTHDR_FRAGMENT;
+ prev_hdr = &fhdr->nexthdr;
+
+ fhdr->reserved = 0;
+ fhdr->frag_off = htons(offset);
+ fhdr->identification = ipv6_fragmentation_id++;
+ return &fhdr->nexthdr;
}
static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag,
const void *data, struct dst_entry *dst,
- struct flowi *fl, struct ipv6_options *opt,
- int hlimit, int flags, unsigned length)
+ struct flowi *fl, struct ipv6_txoptions *opt,
+ struct in6_addr *final_dst,
+ int hlimit, int flags, unsigned length, int mtu)
{
- struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
struct ipv6hdr *hdr;
struct sk_buff *last_skb;
- struct frag_hdr *fhdr;
+ u8 *prev_hdr;
int unfrag_len;
- int payl_len;
int frag_len;
int last_len;
int nfrags;
int fhdr_dist;
+ int frag_off;
+ int data_off;
int err;
/*
* Fragmentation
*
* Extension header order:
- * Hop-by-hop -> Routing -> Fragment -> rest (...)
+ * Hop-by-hop -> Dest0 -> Routing -> Fragment -> Auth -> Dest1 -> rest (...)
*
* We must build the non-fragmented part that
* will be in every packet... this also means
@@ -222,11 +275,11 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag,
*/
unfrag_len = sizeof(struct ipv6hdr) + sizeof(struct frag_hdr);
- payl_len = length;
+ last_len = length;
if (opt) {
unfrag_len += opt->opt_nflen;
- payl_len += opt->opt_flen;
+ last_len += opt->opt_flen;
}
/*
@@ -235,9 +288,13 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag,
* "integer multiple of 8 octects".
*/
- frag_len = (dst->pmtu - unfrag_len) & ~0x7;
+ frag_len = (mtu - unfrag_len) & ~0x7;
- nfrags = payl_len / frag_len;
+ /* Unfragmentable part exceeds mtu. */
+ if (frag_len <= 0)
+ return -EMSGSIZE;
+
+ nfrags = last_len / frag_len;
/*
* We must send from end to start because of
@@ -250,13 +307,25 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag,
* might be a good idea.
*/
- last_len = payl_len - (nfrags * frag_len);
+ frag_off = nfrags * frag_len;
+ last_len -= frag_off;
if (last_len == 0) {
last_len = frag_len;
+ frag_off -= frag_len;
nfrags--;
}
-
+ data_off = frag_off;
+
+	/* And it is an implementation problem: for now we assume that
+	   all the exthdrs will fit in the first fragment.
+ */
+ if (opt) {
+ if (frag_len < opt->opt_flen)
+ return -EMSGSIZE;
+ data_off = frag_off - opt->opt_flen;
+ }
+
last_skb = sock_alloc_send_skb(sk, unfrag_len + frag_len +
dst->dev->hard_header_len + 15,
0, flags & MSG_DONTWAIT, &err);
@@ -267,41 +336,17 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag,
last_skb->dst = dst_clone(dst);
skb_reserve(last_skb, (dst->dev->hard_header_len + 15) & ~15);
-
- hdr = (struct ipv6hdr *) skb_put(last_skb, sizeof(struct ipv6hdr));
- last_skb->nh.ipv6h = hdr;
- hdr->version = 6;
- hdr->priority = np->priority;
-
- memcpy(hdr->flow_lbl, &np->flow_lbl, 3);
- hdr->payload_len = htons(unfrag_len + frag_len - sizeof(struct ipv6hdr));
+ hdr = ip6_bld_1(sk, last_skb, fl, hlimit, frag_len+unfrag_len);
+ prev_hdr = &hdr->nexthdr;
- hdr->hop_limit = hlimit;
+ if (opt && opt->opt_nflen)
+ prev_hdr = ipv6_build_nfrag_opts(last_skb, prev_hdr, opt, final_dst, 0);
- hdr->nexthdr = NEXTHDR_FRAGMENT;
+ prev_hdr = ipv6_build_fraghdr(last_skb, prev_hdr, frag_off);
+ fhdr_dist = prev_hdr - last_skb->data;
- ipv6_addr_copy(&hdr->saddr, fl->nl_u.ip6_u.saddr);
- ipv6_addr_copy(&hdr->daddr, fl->nl_u.ip6_u.daddr);
-
-#if 0
- if (opt && opt->srcrt) {
- hdr->nexthdr = ipv6opt_bld_rthdr(last_skb, opt, daddr,
- NEXTHDR_FRAGMENT);
- }
-#endif
-
- fhdr = (struct frag_hdr *) skb_put(last_skb, sizeof(struct frag_hdr));
- memset(fhdr, 0, sizeof(struct frag_hdr));
-
- fhdr->nexthdr = fl->proto;
- fhdr->frag_off = ntohs(nfrags * frag_len);
- fhdr->identification = ipv6_fragmentation_id++;
-
- fhdr_dist = (unsigned char *) fhdr - last_skb->data;
-
- err = getfrag(data, &hdr->saddr, last_skb->tail, nfrags * frag_len,
- last_len);
+ err = getfrag(data, &hdr->saddr, last_skb->tail, data_off, last_len);
if (!err) {
while (nfrags--) {
@@ -309,58 +354,60 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag,
struct frag_hdr *fhdr2;
-#if 0
- printk(KERN_DEBUG "sending frag %d\n", nfrags);
-#endif
skb = skb_copy(last_skb, sk->allocation);
- if (skb == NULL)
+ if (skb == NULL) {
+ ipv6_statistics.Ip6FragFails++;
+ kfree_skb(last_skb);
return -ENOMEM;
+ }
+ frag_off -= frag_len;
+ data_off -= frag_len;
+
fhdr2 = (struct frag_hdr *) (skb->data + fhdr_dist);
/* more flag on */
- fhdr2->frag_off = ntohs(nfrags * frag_len + 1);
+ fhdr2->frag_off = htons(frag_off | 1);
- /*
- * FIXME:
- * if (nfrags == 0)
- * put rest of headers
- */
+ /* Write fragmentable exthdrs to the first chunk */
+ if (nfrags == 0 && opt && opt->opt_flen) {
+ ipv6_build_frag_opts(skb, &fhdr2->nexthdr, opt);
+ frag_len -= opt->opt_flen;
+ data_off = 0;
+ }
err = getfrag(data, &hdr->saddr,skb_put(skb, frag_len),
- nfrags * frag_len, frag_len);
+ data_off, frag_len);
if (err) {
kfree_skb(skb);
break;
}
+ ipv6_statistics.Ip6FragCreates++;
ipv6_statistics.Ip6OutRequests++;
dst->output(skb);
}
}
if (err) {
+ ipv6_statistics.Ip6FragFails++;
kfree_skb(last_skb);
return -EFAULT;
}
-#if 0
- printk(KERN_DEBUG "sending last frag \n");
-#endif
-
- hdr->payload_len = htons(unfrag_len + last_len -
- sizeof(struct ipv6hdr));
+ hdr->payload_len = htons(unfrag_len + last_len - sizeof(struct ipv6hdr));
/*
* update last_skb to reflect the getfrag we did
* on start.
*/
-
- last_skb->tail += last_len;
- last_skb->len += last_len;
+ skb_put(last_skb, last_len);
+
+ ipv6_statistics.Ip6FragCreates++;
+ ipv6_statistics.Ip6FragOKs++;
ipv6_statistics.Ip6OutRequests++;
dst->output(last_skb);
@@ -369,42 +416,71 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag,
int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data,
struct flowi *fl, unsigned length,
- struct ipv6_options *opt, int hlimit, int flags)
+ struct ipv6_txoptions *opt, int hlimit, int flags)
{
struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
struct in6_addr *final_dst = NULL;
struct dst_entry *dst;
- int pktlength;
int err = 0;
-
+ unsigned int pktlength, jumbolen, mtu;
+
if (opt && opt->srcrt) {
struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
final_dst = fl->nl_u.ip6_u.daddr;
fl->nl_u.ip6_u.daddr = rt0->addr;
}
- dst = NULL;
-
if (!fl->oif && ipv6_addr_is_multicast(fl->nl_u.ip6_u.daddr))
fl->oif = np->mcast_oif;
-
- if (sk->dst_cache)
+
+ dst = NULL;
+ if (sk->dst_cache) {
dst = dst_check(&sk->dst_cache, np->dst_cookie);
+ if (dst) {
+ struct rt6_info *rt = (struct rt6_info*)dst_clone(dst);
+
+			/* Yes, checking route validity in the not connected
+			   case is not very simple. Take into account
+			   that we do not support routing by source, TOS,
+			   and MSG_DONTROUTE		--ANK (980726)
+
+			   1. If the route was a host route, check that the
+			      cached destination is current.
+			      If it is a network route, we still may
+			      check its validity using a saved pointer
+			      to the last used address: daddr_cache.
+			      We do not want to save the whole address now,
+			      (because the main consumer of this service
+			      is tcp, which does not have this problem),
+			      so the last trick works only on connected
+			      sockets.
+			   2. oif should also be the same.
+			 */
+ if (((rt->rt6i_dst.plen != 128 ||
+ ipv6_addr_cmp(fl->fl6_dst, &rt->rt6i_dst.addr))
+ && (np->daddr_cache == NULL ||
+ ipv6_addr_cmp(fl->fl6_dst, np->daddr_cache)))
+ || (fl->oif && fl->oif != dst->dev->ifindex)) {
+ dst_release(dst);
+ dst = NULL;
+ }
+ }
+ }
if (dst == NULL)
dst = ip6_route_output(sk, fl);
if (dst->error) {
ipv6_statistics.Ip6OutNoRoutes++;
- err = -ENETUNREACH;
- goto out;
+ dst_release(dst);
+ return -ENETUNREACH;
}
if (fl->nl_u.ip6_u.saddr == NULL) {
struct inet6_ifaddr *ifa;
ifa = ipv6_get_saddr(dst, fl->nl_u.ip6_u.daddr);
-
+
if (ifa == NULL) {
#if IP6_DEBUG >= 2
printk(KERN_DEBUG "ip6_build_xmit: "
@@ -415,7 +491,6 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data,
}
fl->nl_u.ip6_u.saddr = &ifa->addr;
}
-
pktlength = length;
if (hlimit < 0) {
@@ -427,29 +502,38 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data,
hlimit = ((struct rt6_info*)dst)->rt6i_hoplimit;
}
+ jumbolen = 0;
+
if (!sk->ip_hdrincl) {
pktlength += sizeof(struct ipv6hdr);
if (opt)
pktlength += opt->opt_flen + opt->opt_nflen;
- /* Due to conservative check made by caller,
- pktlength cannot overflow here.
-
- When (and if) jumbo option will be implemented
- we could try soemething sort of:
+ if (pktlength > 0xFFFF + sizeof(struct ipv6hdr)) {
+ /* Jumbo datagram.
+			   It is assumed that, in the case of sk->ip_hdrincl,
+			   the jumbo option is supplied by the user.
+ */
+ pktlength += 8;
+ jumbolen = pktlength - sizeof(struct ipv6hdr);
+ }
+ }
- if (pktlength < length) return -EMSGSIZE;
+ mtu = dst->pmtu;
- */
- }
+ /* Critical arithmetic overflow check.
+ FIXME: may gcc optimize it out? --ANK (980726)
+ */
+ if (pktlength < length)
+ return -EMSGSIZE;
- if (pktlength <= dst->pmtu) {
+ if (pktlength <= mtu) {
struct sk_buff *skb;
struct ipv6hdr *hdr;
- struct device *dev;
+ struct device *dev = dst->dev;
skb = sock_alloc_send_skb(sk, pktlength + 15 +
- dst->dev->hard_header_len, 0,
+ dev->hard_header_len, 0,
flags & MSG_DONTWAIT, &err);
if (skb == NULL) {
@@ -457,7 +541,6 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data,
goto out;
}
- dev = dst->dev;
skb->dst = dst_clone(dst);
skb_reserve(skb, (dev->hard_header_len + 15) & ~15);
@@ -466,23 +549,22 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data,
skb->nh.ipv6h = hdr;
if (!sk->ip_hdrincl) {
- ip6_bld_1(sk, skb, fl, hlimit, pktlength);
-#if 0
- if (opt && opt->srcrt) {
- hdr->nexthdr = ipv6opt_bld_rthdr(skb, opt,
- final_dst,
- fl->proto);
+ ip6_bld_1(sk, skb, fl, hlimit,
+ jumbolen ? sizeof(struct ipv6hdr) : pktlength);
+
+ if (opt || jumbolen) {
+ u8 *prev_hdr = &hdr->nexthdr;
+ prev_hdr = ipv6_build_nfrag_opts(skb, prev_hdr, opt, final_dst, jumbolen);
+ if (opt && opt->opt_flen)
+ ipv6_build_frag_opts(skb, prev_hdr, opt);
}
- else
-#endif
- hdr->nexthdr = fl->proto;
}
skb_put(skb, length);
err = getfrag(data, &hdr->saddr,
((char *) hdr) + (pktlength - length),
0, length);
-
+
if (!err) {
ipv6_statistics.Ip6OutRequests++;
dst->output(skb);
@@ -491,32 +573,18 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data,
kfree_skb(skb);
}
} else {
- if (sk->ip_hdrincl)
+ if (sk->ip_hdrincl || jumbolen)
return -EMSGSIZE;
- /* pktlength includes IPv6 header, not included
- in IPv6 payload length.
- FIXME are non-fragmentable options included
- in packet after defragmentation? If not, we
- should subtract opt_nflen also. --ANK
- */
- if (pktlength > 0xFFFF + sizeof(struct ipv6hdr))
- return -EMSGSIZE;
-
- err = ip6_frag_xmit(sk, getfrag, data, dst, fl, opt, hlimit,
- flags, length);
+ err = ip6_frag_xmit(sk, getfrag, data, dst, fl, opt, final_dst, hlimit,
+ flags, length, mtu);
}
-
+
/*
* cleanup
*/
- out:
-
- if (sk->dst_cache)
- ip6_dst_store(sk, dst);
- else
- dst_release(dst);
-
+out:
+ ip6_dst_store(sk, dst, fl->nl_u.ip6_u.daddr == &np->daddr ? &np->daddr : NULL);
return err;
}
@@ -530,20 +598,15 @@ int ip6_call_ra_chain(struct sk_buff *skb, int sel)
if (sk && ra->sel == sel) {
if (last) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
- if (skb2) {
- skb2->sk = last;
- rawv6_rcv(skb2, skb2->dev, &skb2->nh.ipv6h->saddr,
- &skb2->nh.ipv6h->daddr, NULL, skb2->len);
- }
+ if (skb2)
+ rawv6_rcv(last, skb2, skb2->len);
}
last = sk;
}
}
if (last) {
- skb->sk = last;
- rawv6_rcv(skb, skb->dev, &skb->nh.ipv6h->saddr,
- &skb->nh.ipv6h->daddr, NULL, skb->len);
+ rawv6_rcv(last, skb, skb->len);
return 1;
}
return 0;
@@ -553,24 +616,16 @@ int ip6_forward(struct sk_buff *skb)
{
struct dst_entry *dst = skb->dst;
struct ipv6hdr *hdr = skb->nh.ipv6h;
- int size;
+	struct inet6_skb_parm *opt = (struct inet6_skb_parm*)skb->cb;
- if (ipv6_devconf.forwarding == 0)
+ if (ipv6_devconf.forwarding == 0 && opt->srcrt == 0)
goto drop;
/*
- * check hop-by-hop options present
- */
- /*
- * Note, that NEXTHDR_HOP header must be checked
- * always at the most beginning of ipv6_rcv.
- * The result should be saved somewhere, but
- * we do not it for now. Alas. Let's do it here. --ANK
- *
- * Second note: we DO NOT make any processing on
+ * We DO NOT make any processing on
* RA packets, pushing them to user level AS IS
- * without ane WARRANTY that application will able
- * to interpret them. The reson is that we
+	 *	without any WARRANTY that the application will be able
+ * to interpret them. The reason is that we
* cannot make anything clever here.
*
* We are not end-node, so that if packet contains
@@ -579,42 +634,9 @@ int ip6_forward(struct sk_buff *skb)
* cannot be fragmented, because there is no warranty
* that different fragments will go along one path. --ANK
*/
- if (hdr->nexthdr == NEXTHDR_HOP) {
- int ra_value = -1;
- u8 *ptr = (u8*)(skb->nh.ipv6h+1);
- int len = (ptr[1]+1)<<3;
-
- if (len + sizeof(struct ipv6hdr) > skb->len)
- goto drop;
-
- ptr += 2;
- len -= 2;
- while (len > 0) {
- u8 *opt;
- int optlen;
-
- if (ptr[0] == 0) {
- len--;
- ptr++;
- continue;
- }
- opt = ptr;
- optlen = ptr[1]+1;
-
- len -= optlen;
- ptr += optlen;
- if (len < 0)
- goto drop;
-
- if (opt[0] == 20) {
- /* Router Alert as of draft-ietf-ipngwg-ipv6router-alert-04 */
- if (optlen < 4)
- goto drop;
- ra_value = opt[2] + (opt[3]<<8);
- } else if (!ip6_dstopt_unknown(skb, (struct ipv6_tlvtype*)opt))
- goto drop;
- }
- if (ra_value>=0 && ip6_call_ra_chain(skb, ra_value))
+ if (opt->ra) {
+ u8 *ptr = skb->nh.raw + opt->ra;
+ if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
return 0;
}
@@ -622,6 +644,8 @@ int ip6_forward(struct sk_buff *skb)
* check and decrement ttl
*/
if (hdr->hop_limit <= 1) {
+		/* Force the OUTPUT device to be used for the source address */
+ skb->dev = dst->dev;
icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
0, skb->dev);
@@ -629,9 +653,10 @@ int ip6_forward(struct sk_buff *skb)
return -ETIMEDOUT;
}
- hdr->hop_limit--;
-
- if (skb->dev == dst->dev && dst->neighbour) {
+	/* The IPv6 specs say nothing about it, but it is clear that we cannot
+	   send redirects to source-routed frames.
+ */
+ if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
struct in6_addr *target = NULL;
struct rt6_info *rt;
struct neighbour *n = dst->neighbour;
@@ -647,30 +672,40 @@ int ip6_forward(struct sk_buff *skb)
else
target = &hdr->daddr;
- ndisc_send_redirect(skb, dst->neighbour, target);
+ /* Limit redirects both by destination (here)
+ and by source (inside ndisc_send_redirect)
+ */
+ if (xrlim_allow(dst, 1*HZ))
+ ndisc_send_redirect(skb, n, target);
+ } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
+ |IPV6_ADDR_LINKLOCAL)) {
+ /* This check is security critical. */
+ goto drop;
}
-
- size = sizeof(struct ipv6hdr) + ntohs(hdr->payload_len);
- if (size > dst->pmtu) {
+ if (skb->len > dst->pmtu) {
+		/* Again, force the OUTPUT device to be used for the source address */
+ skb->dev = dst->dev;
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst->pmtu, skb->dev);
+ ipv6_statistics.Ip6InTooBigErrors++;
kfree_skb(skb);
return -EMSGSIZE;
}
- if (skb_headroom(skb) < dst->dev->hard_header_len || skb_cloned(skb)) {
- struct sk_buff *skb2;
- skb2 = skb_realloc_headroom(skb, (dst->dev->hard_header_len + 15)&~15);
- kfree_skb(skb);
- skb = skb2;
- }
+ if ((skb = skb_cow(skb, dst->dev->hard_header_len)) == NULL)
+ return 0;
- ipv6_statistics.Ip6ForwDatagrams++;
- dst->output(skb);
+ hdr = skb->nh.ipv6h;
- return 0;
+	/* Decrementing the hop limit is delayed until after the skb COW */
+
+ hdr->hop_limit--;
+
+ ipv6_statistics.Ip6OutForwDatagrams++;
+ return dst->output(skb);
drop:
+ ipv6_statistics.Ip6InAddrErrors++;
kfree_skb(skb);
return -EINVAL;
}
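
Two arithmetic details in ip6_frag_xmit() above are easy to miss. Each full
fragment carries (mtu - unfrag_len) & ~0x7 bytes, the largest multiple of 8
that fits after the unfragmentable part; and since byte offsets are
multiples of 8, the 16-bit frag_off field can hold the byte offset directly
(the 8-octet unit count lands in the top 13 bits) with the M "more
fragments" flag in the lowest bit, as in htons(frag_off | 1). A sketch with
illustrative names, not code from the patch:

	/* Encode a Fragment header offset field: byte_off must be
	 * a multiple of 8, 'more' is the M flag.
	 */
	static __inline__ __u16 frag_off_encode(unsigned int byte_off, int more)
	{
		return htons((byte_off & ~0x7) | (more ? 1 : 0));
	}

	/* Payload carried by every full-sized fragment. */
	static __inline__ int frag_payload(int mtu, int unfrag_len)
	{
		return (mtu - unfrag_len) & ~0x7;
	}
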
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index b31c07c00..a246b996b 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -7,7 +7,7 @@
*
* Based on linux/net/ipv4/ip_sockglue.c
*
- * $Id: ipv6_sockglue.c,v 1.22 1998/07/15 05:05:39 davem Exp $
+ * $Id: ipv6_sockglue.c,v 1.23 1998/08/26 12:05:04 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -110,7 +110,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval,
int optlen)
{
struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
- int val, err;
+ int val, valbool;
int retv = -ENOPROTOOPT;
if(level==SOL_IP && sk->type != SOCK_RAW)
@@ -119,19 +119,20 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval,
if(level!=SOL_IPV6)
goto out;
- if (optval == NULL) {
+ if (optval == NULL)
val=0;
- } else {
- err = get_user(val, (int *) optval);
- if(err)
- return err;
- }
-
+ else if (get_user(val, (int *) optval))
+ return -EFAULT;
+
+ valbool = (val!=0);
switch (optname) {
case IPV6_ADDRFORM:
if (val == PF_INET) {
+ struct ipv6_txoptions *opt;
+ struct sk_buff *pktopt;
+
if (sk->protocol != IPPROTO_UDP &&
sk->protocol != IPPROTO_TCP)
goto out;
@@ -140,7 +141,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval,
retv = ENOTCONN;
goto out;
}
-
+
if (!(ipv6_addr_type(&np->daddr) & IPV6_ADDR_MAPPED)) {
retv = -EADDRNOTAVAIL;
goto out;
@@ -153,10 +154,17 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval,
tp->af_specific = &ipv4_specific;
sk->socket->ops = &inet_stream_ops;
sk->family = PF_INET;
+ tcp_sync_mss(sk, tp->pmtu_cookie);
} else {
sk->prot = &udp_prot;
sk->socket->ops = &inet_dgram_ops;
}
+ opt = xchg(&np->opt, NULL);
+ if (opt)
+ sock_kfree_s(sk, opt, opt->tot_len);
+ pktopt = xchg(&np->pktoptions, NULL);
+ if (pktopt)
+ kfree_skb(pktopt);
retv = 0;
} else {
retv = -EINVAL;
@@ -164,15 +172,85 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval,
break;
case IPV6_PKTINFO:
- np->rxinfo = val;
+ np->rxopt.bits.rxinfo = valbool;
retv = 0;
break;
case IPV6_HOPLIMIT:
- np->rxhlim = val;
+ np->rxopt.bits.rxhlim = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_RTHDR:
+ retv = -EINVAL;
+ if (val >= 0 && val <= 2) {
+ np->rxopt.bits.srcrt = val;
+ retv = 0;
+ }
+ break;
+
+ case IPV6_HOPOPTS:
+ np->rxopt.bits.hopopts = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_AUTHHDR:
+ np->rxopt.bits.authhdr = valbool;
retv = 0;
break;
+ case IPV6_DSTOPTS:
+ np->rxopt.bits.dstopts = valbool;
+ retv = 0;
+ break;
+
+ case IPV6_PKTOPTIONS:
+ {
+ struct ipv6_txoptions *opt = NULL;
+ struct msghdr msg;
+ int junk;
+ struct in6_addr *saddr;
+
+ if (optlen == 0)
+ goto update;
+
+ opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL);
+ retv = -ENOBUFS;
+ if (opt == NULL)
+ break;
+
+ memset(opt, 0, sizeof(*opt));
+ opt->tot_len = sizeof(*opt) + optlen;
+ retv = -EFAULT;
+ if (copy_from_user(opt+1, optval, optlen))
+ goto done;
+
+ msg.msg_controllen = optlen;
+ msg.msg_control = (void*)(opt+1);
+
+ retv = datagram_send_ctl(&msg, &junk, &saddr, opt, &junk);
+ if (retv)
+ goto done;
+update:
+ retv = 0;
+ start_bh_atomic();
+ if (opt && sk->type == SOCK_STREAM) {
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ if ((tcp_connected(sk->state) || sk->state == TCP_SYN_SENT)
+ && sk->daddr != LOOPBACK4_IPV6) {
+ tp->ext_header_len = opt->opt_flen + opt->opt_nflen;
+ tcp_sync_mss(sk, tp->pmtu_cookie);
+ }
+ }
+ opt = xchg(&np->opt, opt);
+ dst_release(xchg(&sk->dst_cache, NULL));
+ end_bh_atomic();
+
+done:
+ if (opt)
+ sock_kfree_s(sk, opt, opt->tot_len);
+ break;
+ }
case IPV6_UNICAST_HOPS:
if (val > 255 || val < -1)
retv = -EINVAL;
@@ -190,10 +268,9 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval,
retv = 0;
}
break;
- break;
case IPV6_MULTICAST_LOOP:
- np->mc_loop = (val != 0);
+ np->mc_loop = valbool;
retv = 0;
break;
@@ -229,12 +306,10 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval,
case IPV6_DROP_MEMBERSHIP:
{
struct ipv6_mreq mreq;
- int err;
- err = copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq));
- if(err)
+ if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq)))
return -EFAULT;
-
+
if (optname == IPV6_ADD_MEMBERSHIP)
retv = ipv6_sock_mc_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr);
else
@@ -253,10 +328,44 @@ out:
int ipv6_getsockopt(struct sock *sk, int level, int optname, char *optval,
int *optlen)
{
+ struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
+ int len;
+
if(level==SOL_IP && sk->type != SOCK_RAW)
return udp_prot.getsockopt(sk, level, optname, optval, optlen);
if(level!=SOL_IPV6)
return -ENOPROTOOPT;
+ if (get_user(len, optlen))
+ return -EFAULT;
+ switch (optname) {
+ case IPV6_PKTOPTIONS:
+ {
+ struct msghdr msg;
+ struct sk_buff *skb;
+
+ start_bh_atomic();
+ skb = np->pktoptions;
+ if (skb)
+ atomic_inc(&skb->users);
+ end_bh_atomic();
+
+ if (skb) {
+ int err;
+
+ msg.msg_control = optval;
+ msg.msg_controllen = len;
+ msg.msg_flags = 0;
+ err = datagram_recv_ctl(sk, &msg, skb);
+ kfree_skb(skb);
+ if (err)
+ return err;
+ len -= msg.msg_controllen;
+ } else
+ len = 0;
+ return put_user(len, optlen);
+ }
+	default:
+		break;
+ }
return -EINVAL;
}
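
For context on the setsockopt() changes above: the new rxopt bits are plain
booleans from user space, while IPV6_RTHDR additionally accepts the values
0..2 to select how received routing headers are reported. A hypothetical
user-space fragment (error handling omitted):

	int on = 1;
	int rthdr_mode = 1;	/* 0, 1 or 2 are accepted */

	setsockopt(fd, SOL_IPV6, IPV6_PKTINFO, &on, sizeof(on));
	setsockopt(fd, SOL_IPV6, IPV6_HOPLIMIT, &on, sizeof(on));
	setsockopt(fd, SOL_IPV6, IPV6_RTHDR, &rthdr_mode, sizeof(rthdr_mode));
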
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index c50f37fcf..88950481e 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -5,7 +5,7 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: mcast.c,v 1.16 1998/05/07 15:43:10 davem Exp $
+ * $Id: mcast.c,v 1.17 1998/08/26 12:05:06 davem Exp $
*
* Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c
*
@@ -79,7 +79,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr)
if (!(ipv6_addr_type(addr) & IPV6_ADDR_MULTICAST))
return -EINVAL;
- mc_lst = kmalloc(sizeof(struct ipv6_mc_socklist), GFP_KERNEL);
+ mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL);
if (mc_lst == NULL)
return -ENOMEM;
@@ -91,13 +91,15 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr)
if (ifindex == 0) {
struct rt6_info *rt;
rt = rt6_lookup(addr, NULL, 0, 0);
- if (rt)
+ if (rt) {
dev = rt->rt6i_dev;
+ dst_release(&rt->u.dst);
+ }
} else
dev = dev_get_by_index(ifindex);
if (dev == NULL) {
- kfree(mc_lst);
+ sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
return -ENODEV;
}
@@ -108,7 +110,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr)
err = ipv6_dev_mc_inc(dev, addr);
if (err) {
- kfree(mc_lst);
+ sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
return err;
}
@@ -133,7 +135,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, struct in6_addr *addr)
*lnk = mc_lst->next;
if ((dev = dev_get_by_index(ifindex)) != NULL)
ipv6_dev_mc_dec(dev, &mc_lst->addr);
- kfree(mc_lst);
+ sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
return 0;
}
}
@@ -153,7 +155,7 @@ void ipv6_sock_mc_close(struct sock *sk)
ipv6_dev_mc_dec(dev, &mc_lst->addr);
np->ipv6_mc_list = mc_lst->next;
- kfree(mc_lst);
+ sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
}
}
@@ -308,11 +310,19 @@ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime)
{
unsigned long delay = resptime;
+ /* Do not start timer for addresses with link/host scope */
+ if (ipv6_addr_type(&ma->mca_addr)&(IPV6_ADDR_LINKLOCAL|IPV6_ADDR_LOOPBACK))
+ return;
+
if (del_timer(&ma->mca_timer))
delay = ma->mca_timer.expires - jiffies;
- if (delay >= resptime)
- delay = net_random() % resptime;
+ if (delay >= resptime) {
+ if (resptime)
+ delay = net_random() % resptime;
+ else
+ delay = 1;
+ }
ma->mca_flags |= MAF_TIMER_RUNNING;
ma->mca_timer.expires = jiffies + delay;
@@ -325,10 +335,16 @@ int igmp6_event_query(struct sk_buff *skb, struct icmp6hdr *hdr, int len)
struct in6_addr *addrp;
unsigned long resptime;
- if (len < sizeof(struct icmp6hdr) + sizeof(struct ipv6hdr))
+ if (len < sizeof(struct icmp6hdr) + sizeof(struct in6_addr))
return -EINVAL;
- resptime = hdr->icmp6_maxdelay;
+	/* Drop queries with a non-link-local source */
+ if (!(ipv6_addr_type(&skb->nh.ipv6h->saddr)&IPV6_ADDR_LINKLOCAL))
+ return -EINVAL;
+
+ resptime = ntohs(hdr->icmp6_maxdelay);
+ /* Translate milliseconds to jiffies */
+ resptime = (resptime<<10)/(1024000/HZ);
addrp = (struct in6_addr *) (hdr + 1);
@@ -365,7 +381,15 @@ int igmp6_event_report(struct sk_buff *skb, struct icmp6hdr *hdr, int len)
struct device *dev;
int hash;
- if (len < sizeof(struct icmp6hdr) + sizeof(struct ipv6hdr))
+ /* Our own report looped back. Ignore it. */
+ if (skb->pkt_type == PACKET_LOOPBACK)
+ return 0;
+
+ if (len < sizeof(struct icmp6hdr) + sizeof(struct in6_addr))
+ return -EINVAL;
+
+	/* Drop reports with a non-link-local source */
+ if (!(ipv6_addr_type(&skb->nh.ipv6h->saddr)&IPV6_ADDR_LINKLOCAL))
return -EINVAL;
addrp = (struct in6_addr *) (hdr + 1);
@@ -399,14 +423,25 @@ void igmp6_send(struct in6_addr *addr, struct device *dev, int type)
struct sk_buff *skb;
struct icmp6hdr *hdr;
struct inet6_ifaddr *ifp;
- struct in6_addr *addrp;
- int err, len, plen;
+ struct in6_addr *snd_addr;
+ struct in6_addr *addrp;
+ struct in6_addr all_routers;
+ int err, len, payload_len, full_len;
+ u8 ra[8] = { IPPROTO_ICMPV6, 0,
+ IPV6_TLV_ROUTERALERT, 0, 0, 0,
+ IPV6_TLV_PADN, 0 };
+
+ snd_addr = addr;
+ if (type == ICMPV6_MGM_REDUCTION) {
+ snd_addr = &all_routers;
+ ipv6_addr_all_routers(&all_routers);
+ }
len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr);
+ payload_len = len + sizeof(ra);
+ full_len = sizeof(struct ipv6hdr) + payload_len;
- plen = sizeof(struct ipv6hdr) + len;
-
- skb = sock_alloc_send_skb(sk, dev->hard_header_len + plen + 15, 0, 0, &err);
+ skb = sock_alloc_send_skb(sk, dev->hard_header_len + full_len + 15, 0, 0, &err);
if (skb == NULL)
return;
@@ -414,8 +449,8 @@ void igmp6_send(struct in6_addr *addr, struct device *dev, int type)
skb_reserve(skb, (dev->hard_header_len + 15) & ~15);
if (dev->hard_header) {
unsigned char ha[MAX_ADDR_LEN];
- ndisc_mc_map(addr, ha, dev, 1);
- dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, plen);
+ ndisc_mc_map(snd_addr, ha, dev, 1);
+ dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, full_len);
}
ifp = ipv6_get_lladdr(dev);
@@ -428,11 +463,9 @@ void igmp6_send(struct in6_addr *addr, struct device *dev, int type)
return;
}
- ip6_nd_hdr(sk, skb, dev, &ifp->addr, addr, IPPROTO_ICMPV6, len);
+ ip6_nd_hdr(sk, skb, dev, &ifp->addr, snd_addr, NEXTHDR_HOP, payload_len);
- /*
- * need hop-by-hop router alert option.
- */
+ memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra));
hdr = (struct icmp6hdr *) skb_put(skb, sizeof(struct icmp6hdr));
memset(hdr, 0, sizeof(struct icmp6hdr));
@@ -441,11 +474,16 @@ void igmp6_send(struct in6_addr *addr, struct device *dev, int type)
addrp = (struct in6_addr *) skb_put(skb, sizeof(struct in6_addr));
ipv6_addr_copy(addrp, addr);
- hdr->icmp6_cksum = csum_ipv6_magic(&ifp->addr, addr, len,
+ hdr->icmp6_cksum = csum_ipv6_magic(&ifp->addr, snd_addr, len,
IPPROTO_ICMPV6,
csum_partial((__u8 *) hdr, len, 0));
dev_queue_xmit(skb);
+ if (type == ICMPV6_MGM_REDUCTION)
+ icmpv6_statistics.Icmp6OutGroupMembReductions++;
+ else
+ icmpv6_statistics.Icmp6OutGroupMembResponses++;
+ icmpv6_statistics.Icmp6OutMsgs++;
}
static void igmp6_join_group(struct ifmcaddr6 *ma)
@@ -455,7 +493,7 @@ static void igmp6_join_group(struct ifmcaddr6 *ma)
addr_type = ipv6_addr_type(&ma->mca_addr);
- if ((addr_type & IPV6_ADDR_LINKLOCAL))
+ if ((addr_type & (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_LOOPBACK)))
return;
igmp6_send(&ma->mca_addr, ma->dev, ICMPV6_MGM_REPORT);
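
The query-timer conversion in igmp6_event_query() above deserves a worked
example: MLD carries the maximum response delay in milliseconds, and
(resptime<<10)/(1024000/HZ) is just ms*HZ/1000 written with shift-friendly
constants. A sketch (the helper name is made up; assumes the usual HZ=100):

	/* ms -> jiffies:
	 *   (ms << 10) / (1024000 / HZ) == ms * 1024 * HZ / 1024000
	 *                               == ms * HZ / 1000
	 * e.g. ms = 10000, HZ = 100:
	 *   (10000 << 10) / 10240 == 1000 jiffies == 10 seconds
	 */
	static __inline__ unsigned long mld_ms_to_jiffies(unsigned long ms)
	{
		return (ms << 10) / (1024000 / HZ);
	}
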
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 26e42a1ed..b6c855a59 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -68,8 +68,7 @@
#include <net/ndisc.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
-
-
+#include <net/icmp.h>
#include <net/checksum.h>
#include <linux/proc_fs.h>
@@ -350,6 +349,9 @@ void ndisc_send_na(struct device *dev, struct neighbour *neigh,
len, 0));
dev_queue_xmit(skb);
+
+ icmpv6_statistics.Icmp6OutNeighborAdvertisements++;
+ icmpv6_statistics.Icmp6OutMsgs++;
}
void ndisc_send_ns(struct device *dev, struct neighbour *neigh,
@@ -410,6 +412,9 @@ void ndisc_send_ns(struct device *dev, struct neighbour *neigh,
len, 0));
/* send it! */
dev_queue_xmit(skb);
+
+ icmpv6_statistics.Icmp6OutNeighborSolicits++;
+ icmpv6_statistics.Icmp6OutMsgs++;
}
void ndisc_send_rs(struct device *dev, struct in6_addr *saddr,
@@ -458,6 +463,9 @@ void ndisc_send_rs(struct device *dev, struct in6_addr *saddr,
/* send it! */
dev_queue_xmit(skb);
+
+ icmpv6_statistics.Icmp6OutRouterSolicits++;
+ icmpv6_statistics.Icmp6OutMsgs++;
}
@@ -575,6 +583,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
if (rt && lifetime == 0) {
ip6_del_rt(rt);
+ dst_release(&rt->u.dst);
rt = NULL;
}
@@ -582,11 +591,6 @@ static void ndisc_router_discovery(struct sk_buff *skb)
ND_PRINTK2("ndisc_rdisc: adding default router\n");
rt = rt6_add_dflt_router(&skb->nh.ipv6h->saddr, skb->dev);
-
-#if 1
- /* BUGGGGG! Previous routine can return invalid pointer. */
- rt = rt6_get_dflt_router(&skb->nh.ipv6h->saddr, skb->dev);
-#endif
if (rt == NULL) {
ND_PRINTK1("route_add failed\n");
return;
@@ -595,6 +599,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
neigh = rt->rt6i_nexthop;
if (neigh == NULL) {
ND_PRINTK1("nd: add default router: null neighbour\n");
+ dst_release(&rt->u.dst);
return;
}
neigh->flags |= NTF_ROUTER;
@@ -658,7 +663,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
mtu = htonl(*(__u32 *)(opt+4));
- if (mtu < 576 || mtu > skb->dev->mtu) {
+ if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) {
ND_PRINTK0("NDISC: router "
"announcement with mtu = %d\n",
mtu);
@@ -671,10 +676,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
if (rt)
rt->u.dst.pmtu = mtu;
- /* BUGGG... Scan routing tables and
- adjust mtu on routes going
- via this device
- */
+ rt6_mtu_change(skb->dev, mtu);
}
}
break;
@@ -689,6 +691,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
optlen -= len;
opt += len;
}
+ if (rt)
+ dst_release(&rt->u.dst);
}
static void ndisc_redirect_rcv(struct sk_buff *skb)
@@ -698,7 +702,6 @@ static void ndisc_redirect_rcv(struct sk_buff *skb)
struct in6_addr *dest;
struct in6_addr *target; /* new first hop to destination */
struct neighbour *neigh;
- struct rt6_info *rt;
int on_link = 0;
int optlen;
@@ -740,20 +743,21 @@ static void ndisc_redirect_rcv(struct sk_buff *skb)
if (!in6_dev || in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
return;
- /* passed validation tests
+ /* passed validation tests */
- NOTE We should not install redirect if sender did not supply
- ll address on link, which requires it. It would break, if
- we have non-transitive address resolution protocol.
- Fix it later. --ANK
+ /*
+	   We install a redirect only if the nexthop state is valid.
*/
- rt = rt6_redirect(dest, &skb->nh.ipv6h->saddr, target, skb->dev, on_link);
-
- if (rt == NULL)
- return;
- neigh = rt->rt6i_nexthop;
- ndisc_update(neigh, (u8*)(dest + 1), optlen, ND_OPT_TARGET_LL_ADDR);
+ neigh = __neigh_lookup(&nd_tbl, target, skb->dev, 1);
+ if (neigh) {
+ ndisc_update(neigh, (u8*)(dest + 1), optlen, ND_OPT_TARGET_LL_ADDR);
+ if (neigh->nud_state&NUD_VALID)
+ rt6_redirect(dest, &skb->nh.ipv6h->saddr, neigh, on_link);
+ else
+ __neigh_event_send(neigh, NULL);
+ neigh_release(neigh);
+ }
}
void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
@@ -773,17 +777,21 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
int hlen;
dev = skb->dev;
- rt = rt6_lookup(&skb->nh.ipv6h->saddr, NULL, dev->ifindex, 0);
+ rt = rt6_lookup(&skb->nh.ipv6h->saddr, NULL, dev->ifindex, 1);
- if (rt == NULL || rt->u.dst.error) {
- ND_PRINTK1("ndisc_send_redirect: hostunreach\n");
+ if (rt == NULL)
return;
- }
if (rt->rt6i_flags & RTF_GATEWAY) {
ND_PRINTK1("ndisc_send_redirect: not a neighbour\n");
+ dst_release(&rt->u.dst);
return;
}
+ if (!xrlim_allow(&rt->u.dst, 1*HZ)) {
+ dst_release(&rt->u.dst);
+ return;
+ }
+ dst_release(&rt->u.dst);
if (dev->addr_len) {
if (neigh->nud_state&NUD_VALID) {
@@ -797,7 +805,7 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
}
}
- rd_len = min(536 - len, ntohs(skb->nh.ipv6h->payload_len) + 8);
+ rd_len = min(IPV6_MIN_MTU-sizeof(struct ipv6hdr)-len, ntohs(skb->nh.ipv6h->payload_len) + 8);
rd_len &= ~0x7;
len += rd_len;
@@ -814,14 +822,14 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
ND_PRINTK1("ndisc_send_redirect: alloc_skb failed\n");
return;
}
-
+
hlen = 0;
if (ndisc_build_ll_hdr(buff, dev, &skb->nh.ipv6h->saddr, NULL, len) == 0) {
kfree_skb(buff);
return;
}
-
+
ip6_nd_hdr(sk, buff, dev, &ifp->addr, &skb->nh.ipv6h->saddr,
IPPROTO_ICMPV6, len);
@@ -838,9 +846,9 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
ipv6_addr_copy(addrp, target);
addrp++;
ipv6_addr_copy(addrp, &skb->nh.ipv6h->daddr);
-
+
opt = (u8*) (addrp + 1);
-
+
/*
* include target_address option
*/
@@ -858,12 +866,15 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh,
opt += 6;
memcpy(opt, &skb->nh.ipv6h, rd_len - 8);
-
+
icmph->icmp6_cksum = csum_ipv6_magic(&ifp->addr, &skb->nh.ipv6h->saddr,
len, IPPROTO_ICMPV6,
csum_partial((u8 *) icmph, len, 0));
dev_queue_xmit(buff);
+
+ icmpv6_statistics.Icmp6OutRedirects++;
+ icmpv6_statistics.Icmp6OutMsgs++;
}
static __inline__ struct neighbour *
@@ -894,15 +905,15 @@ static __inline__ int ndisc_recv_na(struct neighbour *neigh, struct sk_buff *skb
static void pndisc_redo(struct sk_buff *skb)
{
- ndisc_rcv(skb, skb->dev, &skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr,
- NULL, skb->len);
+ ndisc_rcv(skb, skb->len);
kfree_skb(skb);
}
-int ndisc_rcv(struct sk_buff *skb, struct device *dev,
- struct in6_addr *saddr, struct in6_addr *daddr,
- struct ipv6_options *opt, unsigned short len)
+int ndisc_rcv(struct sk_buff *skb, unsigned long len)
{
+ struct device *dev = skb->dev;
+ struct in6_addr *saddr = &skb->nh.ipv6h->saddr;
+ struct in6_addr *daddr = &skb->nh.ipv6h->daddr;
struct nd_msg *msg = (struct nd_msg *) skb->h.raw;
struct neighbour *neigh;
struct inet6_ifaddr *ifp;
@@ -977,7 +988,7 @@ int ndisc_rcv(struct sk_buff *skb, struct device *dev,
if (neigh) {
ndisc_send_na(dev, neigh, saddr, &msg->target,
- 1, 0, inc, inc);
+ 0, 0, inc, inc);
neigh_release(neigh);
}
} else {
@@ -1023,13 +1034,14 @@ int ndisc_rcv(struct sk_buff *skb, struct device *dev,
/*
* Change: router to host
*/
-#if 0
struct rt6_info *rt;
- rt = ndisc_get_dflt_router(skb->dev,
- saddr);
- if (rt)
- ndisc_del_dflt_router(rt);
-#endif
+ rt = rt6_get_dflt_router(saddr, skb->dev);
+ if (rt) {
+ /* It is safe only because
+ we aer in BH */
+ dst_release(&rt->u.dst);
+ ip6_del_rt(rt);
+ }
}
} else {
if (msg->icmph.icmp6_router)
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 9b24b4948..31f6a2f55 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -7,7 +7,7 @@
* PROC file system. This is very similar to the IPv4 version,
* except it reports the sockets in the INET6 address family.
*
- * Version: $Id: proc.c,v 1.8 1998/04/13 17:06:03 davem Exp $
+ * Version: $Id: proc.c,v 1.9 1998/08/26 12:05:11 davem Exp $
*
* Authors: David S. Miller (davem@caip.rutgers.edu)
*
@@ -20,9 +20,11 @@
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/in6.h>
+#include <linux/stddef.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
+#include <net/ipv6.h>
/* This is the main implementation workhorse of all these routines. */
static int get__netinfo6(struct proto *pro, char *buffer, int format, char **start,
@@ -176,3 +178,105 @@ int afinet6_get_info(char *buffer, char **start, off_t offset, int length, int d
len = length;
return len;
}
+
+
+struct snmp6_item
+{
+ char *name;
+ unsigned long *ptr;
+} snmp6_list[] = {
+/* ipv6 mib according to draft-ietf-ipngwg-ipv6-mib-04 */
+#define SNMP6_GEN(x) { #x , &ipv6_statistics.x }
+ SNMP6_GEN(Ip6InReceives),
+ SNMP6_GEN(Ip6InHdrErrors),
+ SNMP6_GEN(Ip6InTooBigErrors),
+ SNMP6_GEN(Ip6InNoRoutes),
+ SNMP6_GEN(Ip6InAddrErrors),
+ SNMP6_GEN(Ip6InUnknownProtos),
+ SNMP6_GEN(Ip6InTruncatedPkts),
+ SNMP6_GEN(Ip6InDiscards),
+ SNMP6_GEN(Ip6InDelivers),
+ SNMP6_GEN(Ip6OutForwDatagrams),
+ SNMP6_GEN(Ip6OutRequests),
+ SNMP6_GEN(Ip6OutDiscards),
+ SNMP6_GEN(Ip6OutNoRoutes),
+ SNMP6_GEN(Ip6ReasmTimeout),
+ SNMP6_GEN(Ip6ReasmReqds),
+ SNMP6_GEN(Ip6ReasmOKs),
+ SNMP6_GEN(Ip6ReasmFails),
+ SNMP6_GEN(Ip6FragOKs),
+ SNMP6_GEN(Ip6FragFails),
+ SNMP6_GEN(Ip6FragCreates),
+ SNMP6_GEN(Ip6InMcastPkts),
+ SNMP6_GEN(Ip6OutMcastPkts),
+#undef SNMP6_GEN
+/* icmpv6 mib according to draft-ietf-ipngwg-ipv6-icmp-mib-02
+
+ Exceptions: {In|Out}AdminProhibs are removed, because I see
+   no good reason to account for them separately
+   from other dest.unreachs.
+   OutErrs is identically zero.
+ OutEchos too.
+ OutRouterAdvertisements too.
+ OutGroupMembQueries too.
+ */
+#define SNMP6_GEN(x) { #x , &icmpv6_statistics.x }
+ SNMP6_GEN(Icmp6InMsgs),
+ SNMP6_GEN(Icmp6InErrors),
+ SNMP6_GEN(Icmp6InDestUnreachs),
+ SNMP6_GEN(Icmp6InPktTooBigs),
+ SNMP6_GEN(Icmp6InTimeExcds),
+ SNMP6_GEN(Icmp6InParmProblems),
+ SNMP6_GEN(Icmp6InEchos),
+ SNMP6_GEN(Icmp6InEchoReplies),
+ SNMP6_GEN(Icmp6InGroupMembQueries),
+ SNMP6_GEN(Icmp6InGroupMembResponses),
+ SNMP6_GEN(Icmp6InGroupMembReductions),
+ SNMP6_GEN(Icmp6InRouterSolicits),
+ SNMP6_GEN(Icmp6InRouterAdvertisements),
+ SNMP6_GEN(Icmp6InNeighborSolicits),
+ SNMP6_GEN(Icmp6InNeighborAdvertisements),
+ SNMP6_GEN(Icmp6InRedirects),
+ SNMP6_GEN(Icmp6OutMsgs),
+ SNMP6_GEN(Icmp6OutDestUnreachs),
+ SNMP6_GEN(Icmp6OutPktTooBigs),
+ SNMP6_GEN(Icmp6OutTimeExcds),
+ SNMP6_GEN(Icmp6OutParmProblems),
+ SNMP6_GEN(Icmp6OutEchoReplies),
+ SNMP6_GEN(Icmp6OutRouterSolicits),
+ SNMP6_GEN(Icmp6OutNeighborSolicits),
+ SNMP6_GEN(Icmp6OutNeighborAdvertisements),
+ SNMP6_GEN(Icmp6OutRedirects),
+ SNMP6_GEN(Icmp6OutGroupMembResponses),
+ SNMP6_GEN(Icmp6OutGroupMembReductions),
+#undef SNMP6_GEN
+#define SNMP6_GEN(x) { "Udp6" #x , &udp_stats_in6.Udp##x }
+ SNMP6_GEN(InDatagrams),
+ SNMP6_GEN(NoPorts),
+ SNMP6_GEN(InErrors),
+ SNMP6_GEN(OutDatagrams)
+#undef SNMP6_GEN
+};
+
+
+int afinet6_get_snmp(char *buffer, char **start, off_t offset, int length,
+ int dummy)
+{
+ int len = 0;
+ int i;
+
+ for (i=0; i<sizeof(snmp6_list)/sizeof(snmp6_list[0]); i++)
+ len += sprintf(buffer+len, "%-32s\t%ld\n", snmp6_list[i].name,
+ *(snmp6_list[i].ptr));
+
+ len -= offset;
+
+ if (len > length)
+ len = length;
+ if(len < 0)
+ len = 0;
+
+ *start = buffer + offset;
+
+ return len;
+}
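
The SNMP6_GEN macro above leans on the preprocessor stringizer: #x yields
the counter's label while &ipv6_statistics.x points at its slot, so the
table and the mib structure cannot drift apart. Expansion of one existing
entry:

	SNMP6_GEN(Ip6InReceives)
	/* => { "Ip6InReceives" , &ipv6_statistics.Ip6InReceives } */
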
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 659ec59cc..76339ff58 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -7,7 +7,7 @@
*
* Adapted from linux/net/ipv4/raw.c
*
- * $Id: raw.c,v 1.20 1998/07/15 05:05:41 davem Exp $
+ * $Id: raw.c,v 1.21 1998/08/26 12:05:13 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -156,9 +156,8 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
/* Check if the address belongs to the host. */
if (addr_type == IPV6_ADDR_MAPPED) {
- v4addr = addr->sin6_addr.s6_addr32[3];
- if (inet_addr_type(v4addr) != RTN_LOCAL)
- return(-EADDRNOTAVAIL);
+ /* Raw sockets are IPv6 only */
+ return(-EADDRNOTAVAIL);
} else {
if (addr_type != IPV6_ADDR_ANY) {
/* ipv4 addr of the socket is invalid. Only the
@@ -182,10 +181,11 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
return 0;
}
-void rawv6_err(struct sock *sk, int type, int code, unsigned char *buff,
- struct in6_addr *saddr, struct in6_addr *daddr)
+void rawv6_err(struct sock *sk, struct sk_buff *skb, struct ipv6hdr *hdr,
+ struct inet6_skb_parm *opt,
+ int type, int code, unsigned char *buff, u32 info)
{
- if (sk == NULL)
+ if (sk == NULL)
return;
}
@@ -193,12 +193,12 @@ static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb)
{
/* Charge it to the socket. */
if (sock_queue_rcv_skb(sk,skb)<0) {
- /* ip_statistics.IpInDiscards++; */
+ ipv6_statistics.Ip6InDiscards++;
kfree_skb(skb);
return 0;
}
- /* ip_statistics.IpInDelivers++; */
+ ipv6_statistics.Ip6InDelivers++;
return 0;
}
@@ -209,22 +209,11 @@ static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb)
* maybe we could have the network decide uppon a hint if it
* should call raw_rcv for demultiplexing
*/
-int rawv6_rcv(struct sk_buff *skb, struct device *dev,
- struct in6_addr *saddr, struct in6_addr *daddr,
- struct ipv6_options *opt, unsigned short len)
+int rawv6_rcv(struct sock *sk, struct sk_buff *skb, unsigned long len)
{
- struct sock *sk;
-
- sk = skb->sk;
-
if (sk->ip_hdrincl)
skb->h.raw = skb->nh.raw;
- if (sk->sock_readers) {
- __skb_queue_tail(&sk->back_log, skb);
- return 0;
- }
-
rawv6_rcv_skb(sk, skb);
return 0;
}
@@ -255,8 +244,12 @@ int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, int len,
if (!skb)
goto out;
- copied = min(len, skb->tail - skb->h.raw);
-
+ copied = skb->tail - skb->h.raw;
+ if (copied > len) {
+ copied = len;
+ msg->msg_flags |= MSG_TRUNC;
+ }
+
err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
sk->stamp=skb->stamp;
if (err)
@@ -269,7 +262,7 @@ int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, int len,
sizeof(struct in6_addr));
}
- if (msg->msg_controllen)
+ if (sk->net_pinfo.af_inet6.rxopt.all)
datagram_recv_ctl(sk, msg, skb);
err = copied;
@@ -332,11 +325,9 @@ static int rawv6_frag_cksum(const void *data, struct in6_addr *addr,
csum = (__u16 *) (buff + opt->offset);
*csum = hdr->cksum;
} else {
- /*
- * FIXME
- * signal an error to user via sk->err
- */
- printk(KERN_DEBUG "icmp: cksum offset too big\n");
+ if (net_ratelimit())
+ printk(KERN_DEBUG "icmp: cksum offset too big\n");
+ return -EINVAL;
}
}
return 0;
@@ -345,10 +336,10 @@ static int rawv6_frag_cksum(const void *data, struct in6_addr *addr,
static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len)
{
- struct ipv6_options opt_space;
+ struct ipv6_txoptions opt_space;
struct sockaddr_in6 * sin6 = (struct sockaddr_in6 *) msg->msg_name;
struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
- struct ipv6_options *opt = NULL;
+ struct ipv6_txoptions *opt = NULL;
struct in6_addr *saddr = NULL;
struct flowi fl;
int addr_len = msg->msg_namelen;
@@ -360,11 +351,8 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len)
/* Rough check on arithmetic overflow,
better check is made in ip6_build_xmit
-
- When jumbo header will be implemeted we will remove it
- at all (len will be size_t)
*/
- if (len < 0 || len > 0xFFFF)
+ if (len < 0)
return -EMSGSIZE;
/* Mirror BSD error message compatibility */
@@ -394,14 +382,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len)
return(-EINVAL);
daddr = &sin6->sin6_addr;
-
- /* BUGGGG If route is not cloned, this check always
- fails, hence dst_cache only slows down tramsmission --ANK
- */
- if (sk->dst_cache && ipv6_addr_cmp(daddr, &np->daddr)) {
- dst_release(sk->dst_cache);
- sk->dst_cache = NULL;
- }
} else {
if (sk->state != TCP_ESTABLISHED)
return(-EINVAL);
@@ -422,12 +402,14 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len)
if (msg->msg_controllen) {
opt = &opt_space;
- memset(opt, 0, sizeof(struct ipv6_options));
+ memset(opt, 0, sizeof(struct ipv6_txoptions));
err = datagram_send_ctl(msg, &fl.oif, &saddr, opt, &hlimit);
if (err < 0)
return err;
}
+ if (opt == NULL || !(opt->opt_nflen|opt->opt_flen))
+ opt = np->opt;
raw_opt = &sk->tp_pinfo.tp_raw;
@@ -594,8 +576,9 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname,
static void rawv6_close(struct sock *sk, unsigned long timeout)
{
+	/* For an explanation, see raw_close in ipv4/raw.c */
sk->state = TCP_CLOSE;
- ipv6_sock_mc_close(sk);
+ raw_v6_unhash(sk);
if (sk->num == IPPROTO_RAW)
ip6_ra_control(sk, -1, NULL);
sk->dead = 1;
@@ -619,7 +602,7 @@ struct proto rawv6_prot = {
datagram_poll, /* poll */
NULL, /* ioctl */
rawv6_init_sk, /* init */
- NULL, /* destroy */
+ inet6_destroy_sock, /* destroy */
NULL, /* shutdown */
rawv6_setsockopt, /* setsockopt */
rawv6_getsockopt, /* getsockopt */
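
The rawv6_recvmsg() change above adopts the usual datagram truncation
contract: copy at most the caller's buffer and report the shortfall via
MSG_TRUNC instead of clipping silently. Seen from a hypothetical
user-space caller:

	char buf[64];
	struct iovec iov = { buf, sizeof(buf) };
	struct msghdr msg = { 0 };
	int n;

	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	n = recvmsg(fd, &msg, 0);
	if (n >= 0 && (msg.msg_flags & MSG_TRUNC)) {
		/* datagram was larger than 64 bytes; the tail was dropped */
	}
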
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index e78cf97a2..e455b0533 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -5,7 +5,7 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: reassembly.c,v 1.10 1998/04/30 16:24:32 freitag Exp $
+ * $Id: reassembly.c,v 1.11 1998/08/26 12:05:16 davem Exp $
*
* Based on: net/ipv4/ip_fragment.c
*
@@ -41,83 +41,145 @@
#include <net/ndisc.h>
#include <net/addrconf.h>
+int sysctl_ip6frag_high_thresh = 256*1024;
+int sysctl_ip6frag_low_thresh = 192*1024;
+int sysctl_ip6frag_time = IPV6_FRAG_TIMEOUT;
+
+atomic_t ip6_frag_mem = ATOMIC_INIT(0);
+
+struct ipv6_frag {
+ __u16 offset;
+ __u16 len;
+ struct sk_buff *skb;
+
+ struct frag_hdr *fhdr;
+
+ struct ipv6_frag *next;
+};
+
+/*
+ * Equivalent of ipv4 struct ipq
+ */
+
+struct frag_queue {
+
+ struct frag_queue *next;
+ struct frag_queue *prev;
+
+ __u32 id; /* fragment id */
+ struct in6_addr saddr;
+ struct in6_addr daddr;
+ struct timer_list timer; /* expire timer */
+ struct ipv6_frag *fragments;
+ struct device *dev;
+ int iif;
+ __u8 last_in; /* has first/last segment arrived? */
+#define FIRST_IN 2
+#define LAST_IN 1
+ __u8 nexthdr;
+ __u16 nhoffset;
+};
static struct frag_queue ipv6_frag_queue = {
&ipv6_frag_queue, &ipv6_frag_queue,
0, {{{0}}}, {{{0}}},
{0}, NULL, NULL,
- 0, 0, NULL
+ 0, 0, 0, 0
};
+/* Memory Tracking Functions. */
+extern __inline__ void frag_kfree_skb(struct sk_buff *skb)
+{
+ atomic_sub(skb->truesize, &ip6_frag_mem);
+ kfree_skb(skb);
+}
+
+extern __inline__ void frag_kfree_s(void *ptr, int len)
+{
+ atomic_sub(len, &ip6_frag_mem);
+ kfree(ptr);
+}
+
+extern __inline__ void *frag_kmalloc(int size, int pri)
+{
+ void *vp = kmalloc(size, pri);
+
+ if(!vp)
+ return NULL;
+ atomic_add(size, &ip6_frag_mem);
+ return vp;
+}
+
+
static void create_frag_entry(struct sk_buff *skb,
- struct device *dev,
__u8 *nhptr,
struct frag_hdr *fhdr);
-static int reasm_frag_1(struct frag_queue *fq,
- struct sk_buff **skb_in);
+static u8 * reasm_frag(struct frag_queue *fq,
+ struct sk_buff **skb_in);
static void reasm_queue(struct frag_queue *fq,
struct sk_buff *skb,
- struct frag_hdr *fhdr);
+ struct frag_hdr *fhdr,
+ u8 *nhptr);
-static int reasm_frag(struct frag_queue *fq, struct sk_buff **skb,
- __u8 *nhptr,
- struct frag_hdr *fhdr)
-{
- __u32 expires = jiffies + IPV6_FRAG_TIMEOUT;
- int nh;
-
- if (del_timer(&fq->timer))
- expires = fq->timer.expires;
+static void fq_free(struct frag_queue *fq);
- /*
- * We queue the packet even if it's the last.
- * It's a trade off. This allows the reassembly
- * code to be simpler (=faster) and of the
- * steps we do for queueing the only unnecessary
- * one it's the kmalloc for a struct ipv6_frag.
- * Feel free to try other alternatives...
- */
- if ((fhdr->frag_off & __constant_htons(0x0001)) == 0) {
- fq->last_in = 1;
- fq->nhptr = nhptr;
- }
- reasm_queue(fq, *skb, fhdr);
+static void frag_prune(void)
+{
+ struct frag_queue *fq;
- if (fq->last_in) {
- if ((nh = reasm_frag_1(fq, skb)))
- return nh;
+ while ((fq = ipv6_frag_queue.next) != &ipv6_frag_queue) {
+ ipv6_statistics.Ip6ReasmFails++;
+ fq_free(fq);
+ if (atomic_read(&ip6_frag_mem) <= sysctl_ip6frag_low_thresh)
+ return;
}
-
- fq->timer.expires = expires;
- add_timer(&fq->timer);
-
- return 0;
+ if (atomic_read(&ip6_frag_mem))
+ printk(KERN_DEBUG "IPv6 frag_prune: memleak\n");
+ atomic_set(&ip6_frag_mem, 0);
}
-int ipv6_reassembly(struct sk_buff **skbp, struct device *dev, __u8 *nhptr,
- struct ipv6_options *opt)
+
+u8* ipv6_reassembly(struct sk_buff **skbp, __u8 *nhptr)
{
struct sk_buff *skb = *skbp;
struct frag_hdr *fhdr = (struct frag_hdr *) (skb->h.raw);
struct frag_queue *fq;
struct ipv6hdr *hdr;
+ hdr = skb->nh.ipv6h;
+
+ ipv6_statistics.Ip6ReasmReqds++;
+
+ /* Jumbo payload inhibits frag. header */
+ if (hdr->payload_len==0) {
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw);
+ return NULL;
+ }
if ((u8 *)(fhdr+1) > skb->tail) {
icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw);
- return 0;
+ return NULL;
}
- hdr = skb->nh.ipv6h;
+ if (atomic_read(&ip6_frag_mem) > sysctl_ip6frag_high_thresh)
+ frag_prune();
+
for (fq = ipv6_frag_queue.next; fq != &ipv6_frag_queue; fq = fq->next) {
if (fq->id == fhdr->identification &&
!ipv6_addr_cmp(&hdr->saddr, &fq->saddr) &&
- !ipv6_addr_cmp(&hdr->daddr, &fq->daddr))
- return reasm_frag(fq, skbp, nhptr,fhdr);
+ !ipv6_addr_cmp(&hdr->daddr, &fq->daddr)) {
+
+ reasm_queue(fq, skb, fhdr, nhptr);
+
+ if (fq->last_in == (FIRST_IN|LAST_IN))
+ return reasm_frag(fq, skbp);
+
+ return NULL;
+ }
}
-
- create_frag_entry(skb, dev, nhptr, fhdr);
- return 0;
+ create_frag_entry(skb, nhptr, fhdr);
+
+ return NULL;
}
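
The new accounting keeps ip6_frag_mem between the two sysctl watermarks
declared above: each queued skb is charged its truesize, and once usage
passes sysctl_ip6frag_high_thresh, frag_prune() evicts whole queues,
oldest first, until it drops back under sysctl_ip6frag_low_thresh.
Conceptually (a sketch mirroring the helpers above, not new code):

	/* on entry to ipv6_reassembly(): */
	if (atomic_read(&ip6_frag_mem) > sysctl_ip6frag_high_thresh)
		frag_prune();

	/* when a fragment is accepted: */
	atomic_add(skb->truesize, &ip6_frag_mem);

	/* when a fragment is freed (frag_kfree_skb): */
	atomic_sub(skb->truesize, &ip6_frag_mem);
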
@@ -125,11 +187,13 @@ static void fq_free(struct frag_queue *fq)
{
struct ipv6_frag *fp, *back;
- for(fp = fq->fragments; fp; ) {
- kfree_skb(fp->skb);
+ del_timer(&fq->timer);
+
+ for (fp = fq->fragments; fp; ) {
+ frag_kfree_skb(fp->skb);
back = fp;
fp=fp->next;
- kfree(back);
+ frag_kfree_s(back, sizeof(*back));
}
fq->prev->next = fq->next;
@@ -137,7 +201,7 @@ static void fq_free(struct frag_queue *fq)
fq->prev = fq->next = NULL;
- kfree(fq);
+ frag_kfree_s(fq, sizeof(*fq));
}
static void frag_expire(unsigned long data)
@@ -147,33 +211,50 @@ static void frag_expire(unsigned long data)
fq = (struct frag_queue *) data;
- del_timer(&fq->timer);
-
frag = fq->fragments;
+ ipv6_statistics.Ip6ReasmTimeout++;
+ ipv6_statistics.Ip6ReasmFails++;
+
if (frag == NULL) {
printk(KERN_DEBUG "invalid fragment queue\n");
return;
}
- icmpv6_send(frag->skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0,
- frag->skb->dev);
+ /* Send error only if the first segment arrived.
+ (fixed --ANK (980728))
+ */
+ if (fq->last_in&FIRST_IN) {
+ struct device *dev = dev_get_by_index(fq->iif);
+
+ /*
+		   But use as the source the device on which the LAST ARRIVED
+		   segment was received. And do not use the fq->dev
+		   pointer directly, the device might have already disappeared.
+ */
+ if (dev) {
+ frag->skb->dev = dev;
+ icmpv6_send(frag->skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0,
+ dev);
+ }
+ }
fq_free(fq);
}
-static void create_frag_entry(struct sk_buff *skb, struct device *dev,
+static void create_frag_entry(struct sk_buff *skb,
__u8 *nhptr,
struct frag_hdr *fhdr)
{
struct frag_queue *fq;
struct ipv6hdr *hdr;
- fq = (struct frag_queue *) kmalloc(sizeof(struct frag_queue),
- GFP_ATOMIC);
+ fq = (struct frag_queue *) frag_kmalloc(sizeof(struct frag_queue),
+ GFP_ATOMIC);
if (fq == NULL) {
+ ipv6_statistics.Ip6ReasmFails++;
kfree_skb(skb);
return;
}
@@ -186,38 +267,41 @@ static void create_frag_entry(struct sk_buff *skb, struct device *dev,
ipv6_addr_copy(&fq->saddr, &hdr->saddr);
ipv6_addr_copy(&fq->daddr, &hdr->daddr);
- fq->dev = dev;
-
/* init_timer has been done by the memset */
fq->timer.function = frag_expire;
fq->timer.data = (long) fq;
- fq->timer.expires = jiffies + IPV6_FRAG_TIMEOUT;
+ fq->timer.expires = jiffies + sysctl_ip6frag_time;
- fq->nexthdr = fhdr->nexthdr;
+ reasm_queue(fq, skb, fhdr, nhptr);
+ if (fq->fragments) {
+ fq->prev = ipv6_frag_queue.prev;
+ fq->next = &ipv6_frag_queue;
+ fq->prev->next = fq;
+ ipv6_frag_queue.prev = fq;
- if ((fhdr->frag_off & __constant_htons(0x0001)) == 0) {
- fq->last_in = 1;
- fq->nhptr = nhptr;
- }
- reasm_queue(fq, skb, fhdr);
-
- fq->prev = ipv6_frag_queue.prev;
- fq->next = &ipv6_frag_queue;
- fq->prev->next = fq;
- ipv6_frag_queue.prev = fq;
-
- add_timer(&fq->timer);
+ add_timer(&fq->timer);
+ } else
+ frag_kfree_s(fq, sizeof(*fq));
}
+/*
+ * We queue the packet even if it's the last.
+ * It's a trade off. This allows the reassembly
+ * code to be simpler (=faster); of the
+ * steps we do for queueing, the only unnecessary
+ * one is the kmalloc for a struct ipv6_frag.
+ * Feel free to try other alternatives...
+ */
+
static void reasm_queue(struct frag_queue *fq, struct sk_buff *skb,
- struct frag_hdr *fhdr)
+ struct frag_hdr *fhdr, u8 *nhptr)
{
struct ipv6_frag *nfp, *fp, **bptr;
- nfp = (struct ipv6_frag *) kmalloc(sizeof(struct ipv6_frag),
- GFP_ATOMIC);
+ nfp = (struct ipv6_frag *) frag_kmalloc(sizeof(struct ipv6_frag),
+ GFP_ATOMIC);
if (nfp == NULL) {
kfree_skb(skb);
@@ -228,24 +312,40 @@ static void reasm_queue(struct frag_queue *fq, struct sk_buff *skb,
nfp->len = (ntohs(skb->nh.ipv6h->payload_len) -
((u8 *) (fhdr + 1) - (u8 *) (skb->nh.ipv6h + 1)));
- if ((u32)nfp->offset + (u32)nfp->len > 65536) {
+ if ((u32)nfp->offset + (u32)nfp->len >= 65536) {
icmpv6_param_prob(skb,ICMPV6_HDR_FIELD, (u8*)&fhdr->frag_off);
goto err;
}
+ if (fhdr->frag_off & __constant_htons(0x0001)) {
+ /* Check if the fragment is rounded to 8 bytes.
+ * Required by the RFC.
+ * ... and would break our defragmentation algorithm 8)
+ */
+ if (nfp->len & 0x7) {
+ printk(KERN_DEBUG "fragment not rounded to 8bytes\n");
+
+ /*
+			   It is not in the specs, but I see no reason
+ to send an error in this case. --ANK
+ */
+ if (nfp->offset == 0)
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
+ &skb->nh.ipv6h->payload_len);
+ goto err;
+ }
+ }
nfp->skb = skb;
nfp->fhdr = fhdr;
-
nfp->next = NULL;
bptr = &fq->fragments;
-
+
for (fp = fq->fragments; fp; fp=fp->next) {
if (nfp->offset <= fp->offset)
break;
bptr = &fp->next;
}
-
if (fp && fp->offset == nfp->offset) {
if (nfp->len != fp->len) {
printk(KERN_DEBUG "reasm_queue: dup with wrong len\n");
@@ -254,29 +354,40 @@ static void reasm_queue(struct frag_queue *fq, struct sk_buff *skb,
/* duplicate. discard it. */
goto err;
}
-
- *bptr = nfp;
- nfp->next = fp;
-#ifdef STRICT_RFC
- if (fhdr->frag_off & __constant_htons(0x0001)) {
- /* Check if the fragment is rounded to 8 bytes.
- * Required by the RFC.
- */
- if (nfp->len & 0x7) {
- printk(KERN_DEBUG "fragment not rounded to 8bytes\n");
+ atomic_add(skb->truesize, &ip6_frag_mem);
- icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
- &skb->nh.ipv6h->payload_len);
- goto err;
- }
+	/* All the checks are done, the fragment is accepted.
+ Only now we are allowed to update reassembly data!
+ (fixed --ANK (980728))
+ */
+
+	/* iif is always set from the last arrived segment */
+ fq->dev = skb->dev;
+ fq->iif = skb->dev->ifindex;
+
+ /* Last fragment */
+ if ((fhdr->frag_off & __constant_htons(0x0001)) == 0)
+ fq->last_in |= LAST_IN;
+
+ /* First fragment.
+	   nexthdr and nhptr are taken from the first fragment.
+ Moreover, nexthdr is UNDEFINED for all the fragments but the
+ first one.
+ (fixed --ANK (980728))
+ */
+ if (nfp->offset == 0) {
+ fq->nexthdr = fhdr->nexthdr;
+ fq->last_in |= FIRST_IN;
+ fq->nhoffset = nhptr - skb->nh.raw;
}
-#endif
+ *bptr = nfp;
+ nfp->next = fp;
return;
err:
- kfree(nfp);
+ frag_kfree_s(nfp, sizeof(*nfp));
kfree_skb(skb);
}
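
reasm_queue() keeps the fragment list sorted by offset, rejects anything that would push the total payload past 64K, and, per the RFC, drops non-final fragments whose length is not a multiple of 8. The same logic as a standalone sketch (simplified struct; the kernel version additionally handles the duplicate-with-different-length case):

    struct frag {
        struct frag *next;
        unsigned int offset, len;
    };

    /* Insert nf into the offset-sorted list; 0 on success, -1 = drop. */
    static int queue_frag(struct frag **list, struct frag *nf, int more)
    {
        struct frag **bptr = list, *fp;

        if ((unsigned long)nf->offset + nf->len >= 65536)
            return -1;                /* would exceed max payload   */
        if (more && (nf->len & 0x7))
            return -1;                /* non-final: multiple of 8   */

        for (fp = *list; fp; fp = fp->next) {
            if (nf->offset <= fp->offset)
                break;                /* keep the list sorted       */
            bptr = &fp->next;
        }
        if (fp && fp->offset == nf->offset)
            return -1;                /* duplicate offset: discard  */

        nf->next = fp;
        *bptr = nf;
        return 0;
    }
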
@@ -284,20 +395,21 @@ err:
* check if this fragment completes the packet
* returns true on success
*/
-static int reasm_frag_1(struct frag_queue *fq, struct sk_buff **skb_in)
+static u8* reasm_frag(struct frag_queue *fq, struct sk_buff **skb_in)
{
struct ipv6_frag *fp;
+ struct ipv6_frag *head = fq->fragments;
struct ipv6_frag *tail = NULL;
struct sk_buff *skb;
__u32 offset = 0;
__u32 payload_len;
__u16 unfrag_len;
__u16 copy;
- int nh;
+ u8 *nhptr;
- for(fp = fq->fragments; fp; fp=fp->next) {
+ for(fp = head; fp; fp=fp->next) {
if (offset != fp->offset)
- return 0;
+ return NULL;
offset += fp->len;
tail = fp;
@@ -309,31 +421,42 @@ static int reasm_frag_1(struct frag_queue *fq, struct sk_buff **skb_in)
* this means we have all fragments.
*/
- unfrag_len = (u8 *) (tail->fhdr) - (u8 *) (tail->skb->nh.ipv6h + 1);
+ /* Unfragmented part is taken from the first segment.
+ (fixed --ANK (980728))
+ */
+ unfrag_len = (u8 *) (head->fhdr) - (u8 *) (head->skb->nh.ipv6h + 1);
payload_len = (unfrag_len + tail->offset +
(tail->skb->tail - (__u8 *) (tail->fhdr + 1)));
-#if 0
- printk(KERN_DEBUG "reasm: payload len = %d\n", payload_len);
-#endif
+ if (payload_len > 65535) {
+ if (net_ratelimit())
+ printk(KERN_DEBUG "reasm_frag: payload len = %d\n", payload_len);
+ ipv6_statistics.Ip6ReasmFails++;
+ fq_free(fq);
+ return NULL;
+ }
if ((skb = dev_alloc_skb(sizeof(struct ipv6hdr) + payload_len))==NULL) {
- printk(KERN_DEBUG "reasm_frag: no memory for reassembly\n");
+ if (net_ratelimit())
+ printk(KERN_DEBUG "reasm_frag: no memory for reassembly\n");
+ ipv6_statistics.Ip6ReasmFails++;
fq_free(fq);
- return 1;
+ return NULL;
}
copy = unfrag_len + sizeof(struct ipv6hdr);
skb->nh.ipv6h = (struct ipv6hdr *) skb->data;
-
skb->dev = fq->dev;
+ skb->protocol = __constant_htons(ETH_P_IPV6);
+ skb->pkt_type = head->skb->pkt_type;
+ memcpy(skb->cb, head->skb->cb, sizeof(skb->cb));
+ skb->dst = dst_clone(head->skb->dst);
- nh = fq->nexthdr;
-
- *(fq->nhptr) = nh;
- memcpy(skb_put(skb, copy), tail->skb->nh.ipv6h, copy);
+ memcpy(skb_put(skb, copy), head->skb->nh.ipv6h, copy);
+ nhptr = skb->nh.raw + fq->nhoffset;
+ *nhptr = fq->nexthdr;
skb->h.raw = skb->tail;
@@ -351,18 +474,19 @@ static int reasm_frag_1(struct frag_queue *fq, struct sk_buff **skb_in)
struct ipv6_frag *back;
memcpy(skb_put(skb, fp->len), (__u8*)(fp->fhdr + 1), fp->len);
- kfree_skb(fp->skb);
+ frag_kfree_skb(fp->skb);
back = fp;
fp=fp->next;
- kfree(back);
+ frag_kfree_s(back, sizeof(*back));
}
-
+
+ del_timer(&fq->timer);
fq->prev->next = fq->next;
fq->next->prev = fq->prev;
-
fq->prev = fq->next = NULL;
-
- kfree(fq);
- return nh;
+ frag_kfree_s(fq, sizeof(*fq));
+
+ ipv6_statistics.Ip6ReasmOKs++;
+ return nhptr;
}
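
reasm_frag() first verifies that the sorted fragments tile the payload without holes, then takes the unfragmented header part from the first segment and patches the saved next-header byte back in at fq->nhoffset. The hole check in isolation (same simplified struct as the sketch above):

    struct frag {
        struct frag *next;
        unsigned int offset, len;
    };

    /* All fragments present iff consecutive offsets tile the payload. */
    static int frags_complete(const struct frag *head)
    {
        unsigned int expect = 0;
        const struct frag *fp;

        for (fp = head; fp; fp = fp->next) {
            if (fp->offset != expect)
                return 0;            /* hole: keep waiting          */
            expect += fp->len;
        }
        return 1;
    }
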
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9d159fe36..8d1f59632 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -5,7 +5,7 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: route.c,v 1.32 1998/07/25 23:28:52 davem Exp $
+ * $Id: route.c,v 1.33 1998/08/26 12:05:18 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -53,10 +53,19 @@
#if RT6_DEBUG >= 3
#define RDBG(x) printk x
+#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
+#define RT6_TRACE(x...) do { ; } while (0)
#endif
+#if RT6_DEBUG >= 1
+#define BUG_TRAP(x) ({ if (!(x)) { printk("Assertion (" #x ") failed at " __FILE__ "(%d):" __FUNCTION__ "\n", __LINE__); } })
+#else
+#define BUG_TRAP(x) do { ; } while (0)
+#endif
+
+
int ip6_rt_max_size = 4096;
int ip6_rt_gc_min_interval = 5*HZ;
int ip6_rt_gc_timeout = 60*HZ;
@@ -87,16 +96,16 @@ struct dst_ops ip6_dst_ops = {
};
struct rt6_info ip6_null_entry = {
- {{NULL, ATOMIC_INIT(1), ATOMIC_INIT(1), NULL,
- -1, 0, 0, 0, 0, 0, 0, 0, 0,
+ {{NULL, ATOMIC_INIT(1), ATOMIC_INIT(1), &loopback_dev,
+ -1, 0, 0, 0, 0, 0, 0, 0,
-ENETUNREACH, NULL, NULL,
ip6_pkt_discard, ip6_pkt_discard,
#ifdef CONFIG_NET_CLS_ROUTE
0,
#endif
&ip6_dst_ops}},
- NULL, {{{0}}}, 256, RTF_REJECT|RTF_NONEXTHOP, ~0U,
- 255, 0, {NULL}, {{{{0}}}, 0}, {{{{0}}}, 0}
+ NULL, {{{0}}}, RTF_REJECT|RTF_NONEXTHOP, ~0U,
+ 255, 0, ATOMIC_INIT(1), {NULL}, {{{{0}}}, 0}, {{{{0}}}, 0}
};
struct fib6_node ip6_routing_table = {
@@ -123,89 +132,6 @@ static struct rt6_info *rt6_flow_lookup(struct rt6_info *rt,
#define ip6_rt_policy (0)
#endif
-static atomic_t rt6_tbl_lock = ATOMIC_INIT(0);
-static int rt6_bh_mask = 0;
-
-#define RT_BH_REQUEST 1
-#define RT_BH_GC 2
-
-static void __rt6_run_bh(void);
-
-/*
- * request queue operations
- * FIFO queue/dequeue
- */
-
-static struct rt6_req request_queue = {
- 0, NULL, &request_queue, &request_queue
-};
-
-static __inline__ void rtreq_queue(struct rt6_req * req)
-{
- unsigned long flags;
- struct rt6_req *next = &request_queue;
-
- save_flags(flags);
- cli();
-
- req->prev = next->prev;
- req->prev->next = req;
- next->prev = req;
- req->next = next;
- restore_flags(flags);
-}
-
-static __inline__ struct rt6_req * rtreq_dequeue(void)
-{
- struct rt6_req *next = &request_queue;
- struct rt6_req *head;
-
- head = next->next;
-
- if (head == next)
- return NULL;
-
- head->next->prev = head->prev;
- next->next = head->next;
-
- head->next = NULL;
- head->prev = NULL;
-
- return head;
-}
-
-void rtreq_add(struct rt6_info *rt, int operation)
-{
- struct rt6_req *rtreq;
-
- rtreq = kmalloc(sizeof(struct rt6_req), GFP_ATOMIC);
-
- if (rtreq == NULL)
- return;
-
- memset(rtreq, 0, sizeof(struct rt6_req));
-
- rtreq->operation = operation;
- rtreq->ptr = rt;
- rtreq_queue(rtreq);
-
- rt6_bh_mask |= RT_BH_REQUEST;
-}
-
-static __inline__ void rt6_lock(void)
-{
- atomic_inc(&rt6_tbl_lock);
-}
-
-static __inline__ void rt6_unlock(void)
-{
- if (atomic_dec_and_test(&rt6_tbl_lock) && rt6_bh_mask) {
- start_bh_atomic();
- __rt6_run_bh();
- end_bh_atomic();
- }
-}
-
/*
* Route lookup
*/
@@ -219,23 +145,19 @@ static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
if (oif) {
for (sprt = rt; sprt; sprt = sprt->u.next) {
- if (sprt->rt6i_dev) {
- if (sprt->rt6i_dev->ifindex == oif)
- return sprt;
- if (sprt->rt6i_dev->flags&IFF_LOOPBACK)
- local = sprt;
- }
+ struct device *dev = sprt->rt6i_dev;
+ if (dev->ifindex == oif)
+ return sprt;
+ if (dev->flags&IFF_LOOPBACK)
+ local = sprt;
}
if (local)
return local;
- if (strict) {
- RDBG(("nomatch & STRICT --> ip6_null_entry\n"));
+ if (strict)
return &ip6_null_entry;
- }
}
- RDBG(("!dev or (no match and !strict) --> rt(%p)\n", rt));
return rt;
}
@@ -282,7 +204,7 @@ static struct rt6_info *rt6_best_dflt(struct rt6_info *rt, int oif)
break;
};
- if (oif && sprt->rt6i_dev && sprt->rt6i_dev->ifindex == oif) {
+ if (oif && sprt->rt6i_dev->ifindex == oif) {
m += 2;
}
@@ -319,21 +241,40 @@ out:
}
struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
- int oif, int flags)
+ int oif, int strict)
{
struct fib6_node *fn;
struct rt6_info *rt;
- rt6_lock();
+ start_bh_atomic();
fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
- rt = rt6_device_match(fn->leaf, oif, flags&RTF_LINKRT);
- rt6_unlock();
- return rt;
+ rt = rt6_device_match(fn->leaf, oif, strict);
+ atomic_inc(&rt->u.dst.use);
+ atomic_inc(&rt->u.dst.refcnt);
+ end_bh_atomic();
+
+ rt->u.dst.lastuse = jiffies;
+ if (rt->u.dst.error == 0)
+ return rt;
+ dst_release(&rt->u.dst);
+ return NULL;
+}
+
+static int rt6_ins(struct rt6_info *rt)
+{
+ int err;
+
+ start_bh_atomic();
+ err = fib6_add(&ip6_routing_table, rt);
+ end_bh_atomic();
+
+ return err;
}
static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr,
struct in6_addr *saddr)
{
+ int err;
struct rt6_info *rt;
/*
@@ -351,18 +292,24 @@ static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr,
rt->rt6i_dst.plen = 128;
rt->rt6i_flags |= RTF_CACHE;
- if (rt->rt6i_src.plen) {
+#ifdef CONFIG_IPV6_SUBTREES
+ if (rt->rt6i_src.plen && saddr) {
ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
rt->rt6i_src.plen = 128;
}
+#endif
rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
- rtreq_add(rt, RT_OPER_ADD);
- } else {
- rt = &ip6_null_entry;
+ dst_clone(&rt->u.dst);
+ err = rt6_ins(rt);
+ if (err == 0)
+ return rt;
+ rt->u.dst.error = err;
+ return rt;
}
- return rt;
+ dst_clone(&ip6_null_entry.u.dst);
+ return &ip6_null_entry;
}
#ifdef CONFIG_RT6_POLICY
@@ -397,24 +344,38 @@ static __inline__ struct rt6_info *rt6_flow_lookup_out(struct rt6_info *rt,
#endif
+#define BACKTRACK() \
+if (rt == &ip6_null_entry && strict) { \
+ while ((fn = fn->parent) != NULL) { \
+ if (fn->fn_flags & RTN_ROOT) { \
+ dst_clone(&rt->u.dst); \
+ goto out; \
+ } \
+ if (fn->fn_flags & RTN_RTINFO) \
+ goto restart; \
+ } \
+}
+
+
void ip6_route_input(struct sk_buff *skb)
{
struct fib6_node *fn;
struct rt6_info *rt;
- struct dst_entry *dst;
+ int strict;
+
+ strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL);
- RDBG(("ip6_route_input(%p) from %p\n", skb, __builtin_return_address(0)));
- if ((dst = skb->dst) != NULL)
- goto looped_back;
- rt6_lock();
fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
&skb->nh.ipv6h->saddr);
+restart:
rt = fn->leaf;
if ((rt->rt6i_flags & RTF_CACHE)) {
if (ip6_rt_policy == 0) {
- rt = rt6_device_match(rt, skb->dev->ifindex, 0);
+ rt = rt6_device_match(rt, skb->dev->ifindex, strict);
+ BACKTRACK();
+ dst_clone(&rt->u.dst);
goto out;
}
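
The BACKTRACK() macro introduced above centralizes the fix formerly open-coded in ip6_route_output: when a strict device match on a leaf fails, the lookup climbs toward the root and restarts at the first ancestor that carries route info. The same walk as a plain function over a simplified node type (illustrative only, not the kernel's fib6_node):

    #include <stddef.h>

    #define RTN_ROOT   0x01
    #define RTN_RTINFO 0x02

    struct fib6_node {
        struct fib6_node *parent;
        int fn_flags;
    };

    /* After a failed strict match: return the node to restart the
     * lookup at, or NULL once the root is reached (lookup fails). */
    static struct fib6_node *backtrack(struct fib6_node *fn)
    {
        while ((fn = fn->parent) != NULL) {
            if (fn->fn_flags & RTN_ROOT)
                return NULL;
            if (fn->fn_flags & RTN_RTINFO)
                return fn;
        }
        return NULL;
    }
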
@@ -425,6 +386,7 @@ void ip6_route_input(struct sk_buff *skb)
for (sprt = rt; sprt; sprt = sprt->u.next) {
if (rt6_flow_match_in(sprt, skb)) {
rt = sprt;
+ dst_clone(&rt->u.dst);
goto out;
}
}
@@ -433,38 +395,38 @@ void ip6_route_input(struct sk_buff *skb)
}
rt = rt6_device_match(rt, skb->dev->ifindex, 0);
+ BACKTRACK();
if (ip6_rt_policy == 0) {
- if (!rt->rt6i_nexthop && rt->rt6i_dev &&
- ((rt->rt6i_flags & RTF_NONEXTHOP) == 0)) {
+ if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
rt = rt6_cow(rt, &skb->nh.ipv6h->daddr,
&skb->nh.ipv6h->saddr);
+ goto out;
}
+ dst_clone(&rt->u.dst);
} else {
#ifdef CONFIG_RT6_POLICY
rt = rt6_flow_lookup_in(rt, skb);
+#else
+ /* NEVER REACHED */
#endif
}
out:
- dst = dst_clone((struct dst_entry *) rt);
- rt6_unlock();
-
- skb->dst = dst;
-looped_back:
- dst->input(skb);
+ rt->u.dst.lastuse = jiffies;
+ atomic_inc(&rt->u.dst.refcnt);
+ skb->dst = (struct dst_entry *) rt;
}
struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
{
struct fib6_node *fn;
struct rt6_info *rt;
- struct dst_entry *dst;
int strict;
strict = ipv6_addr_type(fl->nl_u.ip6_u.daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL);
- rt6_lock();
+ start_bh_atomic();
fn = fib6_lookup(&ip6_routing_table, fl->nl_u.ip6_u.daddr,
fl->nl_u.ip6_u.saddr);
@@ -472,25 +434,10 @@ restart:
rt = fn->leaf;
if ((rt->rt6i_flags & RTF_CACHE)) {
- RDBG(("RTF_CACHE "));
if (ip6_rt_policy == 0) {
rt = rt6_device_match(rt, fl->oif, strict);
-
- /* BUGGGG! It is capital bug, that was hidden
- by not-cloning multicast routes. However,
- the same problem was with link-local addresses.
- Fix is the following if-statement,
- but it will not properly handle Pedro's subtrees --ANK
- */
- if (rt == &ip6_null_entry && strict) {
- while ((fn = fn->parent) != NULL) {
- if (fn->fn_flags & RTN_ROOT)
- goto out;
- if (fn->fn_flags & RTN_RTINFO)
- goto restart;
- }
- }
- RDBG(("devmatch(%p) ", rt));
+ BACKTRACK();
+ dst_clone(&rt->u.dst);
goto out;
}
@@ -501,68 +448,46 @@ restart:
for (sprt = rt; sprt; sprt = sprt->u.next) {
if (rt6_flow_match_out(sprt, sk)) {
rt = sprt;
+ dst_clone(&rt->u.dst);
goto out;
}
}
}
#endif
}
- RDBG(("!RTF_CACHE "));
if (rt->rt6i_flags & RTF_DEFAULT) {
- RDBG(("RTF_DEFAULT "));
- if (rt->rt6i_metric >= IP6_RT_PRIO_ADDRCONF) {
+ if (rt->rt6i_metric >= IP6_RT_PRIO_ADDRCONF)
rt = rt6_best_dflt(rt, fl->oif);
- RDBG(("best_dflt(%p) ", rt));
- }
} else {
rt = rt6_device_match(rt, fl->oif, strict);
- RDBG(("!RTF_DEFAULT devmatch(%p) ", rt));
+ BACKTRACK();
}
if (ip6_rt_policy == 0) {
- if (!rt->rt6i_nexthop && rt->rt6i_dev &&
- ((rt->rt6i_flags & RTF_NONEXTHOP) == 0)) {
+ if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
rt = rt6_cow(rt, fl->nl_u.ip6_u.daddr,
fl->nl_u.ip6_u.saddr);
- RDBG(("(!nhop&&rt6i_dev&&!RTF_NONEXTHOP) cow(%p) ", rt));
+ goto out;
}
+ dst_clone(&rt->u.dst);
} else {
#ifdef CONFIG_RT6_POLICY
rt = rt6_flow_lookup_out(rt, sk, fl);
+#else
+ /* NEVER REACHED */
#endif
}
out:
- dst = dst_clone((struct dst_entry *) rt);
- rt6_unlock();
- RDBG(("dclone/ret(%p)\n", dst));
- return dst;
-}
-
-
-static void rt6_ins(struct rt6_info *rt)
-{
- start_bh_atomic();
- if (atomic_read(&rt6_tbl_lock) == 1)
- fib6_add(&ip6_routing_table, rt);
- else
- rtreq_add(rt, RT_OPER_ADD);
+ rt->u.dst.lastuse = jiffies;
+ atomic_inc(&rt->u.dst.refcnt);
end_bh_atomic();
+ return &rt->u.dst;
}
+
/*
* Destination cache support functions
- *
- * BUGGG! This function is absolutely wrong.
- * First of all it is never called. (look at include/net/dst.h)
- * Second, even when it is called rt->rt6i_node == NULL
- * ** partially fixed: now dst->obsolete = -1 for IPv6 not cache routes.
- * Third, even we fixed previous bugs,
- * it will not work because sernum is incorrectly checked/updated and
- * it does not handle change of the parent of cloned route.
- * Purging stray clones is not easy task, it would require
- * massive remake of ip6_fib.c. Alas...
- * --ANK
*/
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
@@ -646,7 +571,7 @@ static int ipv6_get_mtu(struct device *dev)
if (idev)
return idev->cnf.mtu6;
else
- return 576;
+ return IPV6_MIN_MTU;
}
static int ipv6_get_hoplimit(struct device *dev)
@@ -664,72 +589,68 @@ static int ipv6_get_hoplimit(struct device *dev)
*
*/
-struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err)
+int ip6_route_add(struct in6_rtmsg *rtmsg)
{
+ int err;
struct rt6_info *rt;
struct device *dev = NULL;
int addr_type;
-
- if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128) {
- *err = -EINVAL;
- return NULL;
- }
+
+ if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
+ return -EINVAL;
+#ifndef CONFIG_IPV6_SUBTREES
+ if (rtmsg->rtmsg_src_len)
+ return -EINVAL;
+#endif
if (rtmsg->rtmsg_metric == 0)
rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
- *err = 0;
-
rt = dst_alloc(sizeof(struct rt6_info), &ip6_dst_ops);
- if (rt == NULL) {
- RDBG(("dalloc fails, "));
- *err = -ENOMEM;
- return NULL;
- }
+ if (rt == NULL)
+ return -ENOMEM;
rt->u.dst.obsolete = -1;
rt->rt6i_expires = rtmsg->rtmsg_info;
addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
- if (addr_type & IPV6_ADDR_MULTICAST) {
- RDBG(("MCAST, "));
+ if (addr_type & IPV6_ADDR_MULTICAST)
rt->u.dst.input = ip6_mc_input;
- } else {
- RDBG(("!MCAST "));
+ else
rt->u.dst.input = ip6_forward;
- }
rt->u.dst.output = ip6_output;
if (rtmsg->rtmsg_ifindex) {
dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
- if (dev == NULL) {
- *err = -ENODEV;
+ err = -ENODEV;
+ if (dev == NULL)
goto out;
- }
}
ipv6_addr_copy(&rt->rt6i_dst.addr, &rtmsg->rtmsg_dst);
rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
ipv6_wash_prefix(&rt->rt6i_dst.addr, rt->rt6i_dst.plen);
+#ifdef CONFIG_IPV6_SUBTREES
ipv6_addr_copy(&rt->rt6i_src.addr, &rtmsg->rtmsg_src);
rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
ipv6_wash_prefix(&rt->rt6i_src.addr, rt->rt6i_src.plen);
+#endif
+
+ rt->rt6i_metric = rtmsg->rtmsg_metric;
/* We cannot add true routes via loopback here,
they would result in kernel looping; promote them to reject routes
*/
if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
(dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
- dev = dev_get("lo");
+ dev = &loopback_dev;
rt->u.dst.output = ip6_pkt_discard;
rt->u.dst.input = ip6_pkt_discard;
rt->u.dst.error = -ENETUNREACH;
rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
- rt->rt6i_metric = rtmsg->rtmsg_metric;
- rt->rt6i_dev = dev;
goto install_route;
}
@@ -746,50 +667,44 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err)
/* IPv6 strictly inhibits using not link-local
addresses as nexthop address.
+	   Otherwise, the router will not be able to send redirects.
	   It is very good, but in some (rare!) circumstances
- (SIT, NBMA NOARP links) it is handy to allow
- some exceptions.
+ (SIT, PtP, NBMA NOARP links) it is handy to allow
+ some exceptions. --ANK
*/
- if (!(gwa_type&IPV6_ADDR_UNICAST)) {
- *err = -EINVAL;
+ err = -EINVAL;
+ if (!(gwa_type&IPV6_ADDR_UNICAST))
goto out;
- }
- grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, RTF_LINKRT);
+ grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
- if (grt == NULL || (grt->rt6i_flags&RTF_GATEWAY)) {
- *err = -EHOSTUNREACH;
+ err = -EHOSTUNREACH;
+ if (grt == NULL)
goto out;
- }
+ if (!(grt->rt6i_flags&RTF_GATEWAY))
+ err = 0;
dev = grt->rt6i_dev;
+ dst_release(&grt->u.dst);
+
+ if (err)
+ goto out;
}
- if (dev == NULL || (dev->flags&IFF_LOOPBACK)) {
- *err = -EINVAL;
+ err = -EINVAL;
+ if (dev == NULL || (dev->flags&IFF_LOOPBACK))
goto out;
- }
}
- if (dev == NULL) {
- RDBG(("!dev, "));
- *err = -ENODEV;
+ err = -ENODEV;
+ if (dev == NULL)
goto out;
- }
if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
rt->rt6i_nexthop = ndisc_get_neigh(dev, &rt->rt6i_gateway);
- if (rt->rt6i_nexthop == NULL) {
- RDBG(("!nxthop, "));
- *err = -ENOMEM;
+ err = -ENOMEM;
+ if (rt->rt6i_nexthop == NULL)
goto out;
- }
- RDBG(("nxthop, "));
}
- rt->rt6i_metric = rtmsg->rtmsg_metric;
-
- rt->rt6i_dev = dev;
- rt->u.dst.pmtu = ipv6_get_mtu(dev);
- rt->u.dst.rtt = TCP_TIMEOUT_INIT;
if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr))
rt->rt6i_hoplimit = IPV6_DEFAULT_MCASTHOPS;
else
@@ -797,153 +712,59 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err)
rt->rt6i_flags = rtmsg->rtmsg_flags;
install_route:
- RDBG(("rt6ins(%p) ", rt));
-
- rt6_lock();
- rt6_ins(rt);
- rt6_unlock();
-
- /* BUGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG!
-
- If rt6_ins will fail (and it occurs regularly f.e. if route
- already existed), the route will be freed -> Finita.
- Crash. No recovery. NO FIX. Unfortunately, it is not the only
- place will it is fatal. It is sad, I believed this
- code is a bit more accurate :-(
-
- Really, the problem can be solved in two ways:
-
- * As I did in old 2.0 IPv4: to increase use count and force
- user to destroy stray route. It requires some care,
- well, much more care.
- * Second and the best: to get rid of this damn backlogging
- system. I wonder why Pedro so liked it. It was the most
- unhappy day when I invented it (well, by a strange reason
- I believed that it is very clever :-)),
- and when I managed to clean IPv4 of this crap,
- it was really great win.
- BTW I forgot how 2.0 route/arp works :-) :-)
- --ANK
- */
+ rt->u.dst.pmtu = ipv6_get_mtu(dev);
+ rt->u.dst.rtt = TCP_TIMEOUT_INIT;
+ rt->rt6i_dev = dev;
+ return rt6_ins(rt);
out:
- if (*err) {
- RDBG(("dfree(%p) ", rt));
- dst_free((struct dst_entry *) rt);
- rt = NULL;
- }
- RDBG(("ret(%p)\n", rt));
-#if 0
- return rt;
-#else
- /* BUGGG! For now always return NULL. (see above)
-
- Really, it was used only in two places, and one of them
- (rt6_add_dflt_router) is repaired, ip6_fw is not essential
- at all. --ANK
- */
- return NULL;
-#endif
+ dst_free((struct dst_entry *) rt);
+ return err;
}
int ip6_del_rt(struct rt6_info *rt)
{
- rt6_lock();
+ int err;
start_bh_atomic();
-
- /* I'd add here couple of cli()
- cli(); cli(); cli();
-
- Now it is really LOCKED. :-) :-) --ANK
- */
-
rt6_dflt_pointer = NULL;
-
- if (atomic_read(&rt6_tbl_lock) == 1)
- fib6_del(rt);
- else
- rtreq_add(rt, RT_OPER_DEL);
+ err = fib6_del(rt);
end_bh_atomic();
- rt6_unlock();
- return 0;
+
+ return err;
}
int ip6_route_del(struct in6_rtmsg *rtmsg)
{
struct fib6_node *fn;
struct rt6_info *rt;
+ int err = -ESRCH;
- rt6_lock();
- fn = fib6_lookup(&ip6_routing_table, &rtmsg->rtmsg_dst, &rtmsg->rtmsg_src);
- rt = fn->leaf;
-
- /*
- * Blow it away
- *
- * BUGGGG It will not help with Pedro's subtrees.
- * We urgently need fib6_locate_node function, and
- * it is not the only place where rt6_lookup is used
- * for wrong purpose.
- * --ANK
- */
-restart:
- if (rt && rt->rt6i_src.plen == rtmsg->rtmsg_src_len) {
- if (rt->rt6i_dst.plen > rtmsg->rtmsg_dst_len) {
- struct fib6_node *fn = rt->rt6i_node;
- while ((fn = fn->parent) != NULL) {
- if (fn->fn_flags & RTN_ROOT)
- break;
- if (fn->fn_flags & RTN_RTINFO) {
- rt = fn->leaf;
- goto restart;
- }
- }
- }
+ start_bh_atomic();
- if (rt->rt6i_dst.plen == rtmsg->rtmsg_dst_len) {
- for ( ; rt; rt = rt->u.next) {
- if (rtmsg->rtmsg_ifindex &&
- (rt->rt6i_dev == NULL ||
- rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
- continue;
- if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
- ipv6_addr_cmp(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
- continue;
- if (rtmsg->rtmsg_metric &&
- rtmsg->rtmsg_metric != rt->rt6i_metric)
- continue;
- ip6_del_rt(rt);
- rt6_unlock();
- return 0;
- }
+ fn = fib6_locate(&ip6_routing_table,
+ &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
+ &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
+
+ if (fn) {
+ for (rt = fn->leaf; rt; rt = rt->u.next) {
+ if (rtmsg->rtmsg_ifindex &&
+ (rt->rt6i_dev == NULL ||
+ rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
+ continue;
+ if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
+ ipv6_addr_cmp(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
+ continue;
+ if (rtmsg->rtmsg_metric &&
+ rtmsg->rtmsg_metric != rt->rt6i_metric)
+ continue;
+ err = ip6_del_rt(rt);
+ break;
}
}
- rt6_unlock();
-
- return -ESRCH;
-}
-
-
-/*
- * bottom handler, runs with atomic_bh protection
- */
-void __rt6_run_bh(void)
-{
- struct rt6_req *rtreq;
+ end_bh_atomic();
- while ((rtreq = rtreq_dequeue())) {
- switch (rtreq->operation) {
- case RT_OPER_ADD:
- fib6_add(&ip6_routing_table, rtreq->ptr);
- break;
- case RT_OPER_DEL:
- fib6_del(rtreq->ptr);
- break;
- };
- kfree(rtreq);
- }
- rt6_bh_mask = 0;
+ return err;
}
#ifdef CONFIG_IPV6_NETLINK
@@ -971,10 +792,10 @@ static int rt6_msgrcv(int unit, struct sk_buff *skb)
switch (rtmsg->rtmsg_type) {
case RTMSG_NEWROUTE:
- ip6_route_add(rtmsg, &err);
+ err = ip6_route_add(rtmsg);
break;
case RTMSG_DELROUTE:
- ip6_route_del(rtmsg);
+ err = ip6_route_del(rtmsg);
break;
default:
count = -EINVAL;
@@ -1047,17 +868,19 @@ void rt6_sndmsg(int type, struct in6_addr *dst, struct in6_addr *src,
/*
* Handle redirects
*/
-struct rt6_info *rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
- struct in6_addr *target, struct device *dev,
- int on_link)
+void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
+ struct neighbour *neigh, int on_link)
{
struct rt6_info *rt, *nrt;
/* Locate old route to this destination. */
- rt = rt6_lookup(dest, NULL, dev->ifindex, 0);
+ rt = rt6_lookup(dest, NULL, neigh->dev->ifindex, 1);
- if (rt == NULL || rt->u.dst.error)
- return NULL;
+ if (rt == NULL)
+ return;
+
+ if (neigh->dev != rt->rt6i_dev)
+ goto out;
/* Redirect received -> path was valid.
Look, redirects are sent only in response to data packets,
@@ -1066,12 +889,18 @@ struct rt6_info *rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
dst_confirm(&rt->u.dst);
/* Duplicate redirect: silently ignore. */
- if (ipv6_addr_cmp(target, &rt->rt6i_gateway) == 0)
- return NULL;
+ if (neigh == rt->u.dst.neighbour)
+ goto out;
- /* Current route is on-link; redirect is always invalid. */
+ /* Current route is on-link; redirect is always invalid.
+
+	   It seems the previous statement is not true. The sender could
+	   be a node which regards us as on-link (f.e. proxy ndisc),
+	   but the router serving it might decide that we should
+	   know the truth 8)8) --ANK (980726).
+ */
if (!(rt->rt6i_flags&RTF_GATEWAY))
- return NULL;
+ goto out;
#if !defined(CONFIG_IPV6_EUI64) || defined(CONFIG_IPV6_NO_PB)
/*
@@ -1089,16 +918,21 @@ struct rt6_info *rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
if (ipv6_addr_cmp(saddr, &rt->rt6i_gateway)) {
if (rt->rt6i_flags & RTF_DEFAULT) {
- rt = ip6_routing_table.leaf;
+ struct rt6_info *rt1;
- for (; rt; rt = rt->u.next) {
- if (!ipv6_addr_cmp(saddr, &rt->rt6i_gateway))
+ for (rt1 = ip6_routing_table.leaf; rt1; rt1 = rt1->u.next) {
+ if (!ipv6_addr_cmp(saddr, &rt1->rt6i_gateway)) {
+ dst_clone(&rt1->u.dst);
+ dst_release(&rt->u.dst);
+ rt = rt1;
goto source_ok;
+ }
}
}
- printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
+ if (net_ratelimit())
+ printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
"for redirect target\n");
- return NULL;
+ goto out;
}
source_ok:
@@ -1107,36 +941,11 @@ source_ok:
/*
* We have finally decided to accept it.
*/
- if (rt->rt6i_dst.plen == 128) {
- /* BUGGGG! Very bad bug. Fast path code does not protect
- * itself of changing nexthop on the fly, it was supposed
- * that crucial parameters (dev, nexthop, hh) ARE VOLATILE.
- * --ANK
- * Not fixed!! I plugged it to avoid random crashes
- * (they are very unlikely, but I do not want to shrug
- * every time when redirect arrives)
- * but the plug must be removed. --ANK
- */
-
-#if 0
- /*
- * Already a host route.
- *
- */
- if (rt->rt6i_nexthop)
- neigh_release(rt->rt6i_nexthop);
- rt->rt6i_flags |= RTF_MODIFIED | RTF_CACHE;
- if (on_link)
- rt->rt6i_flags &= ~RTF_GATEWAY;
- ipv6_addr_copy(&rt->rt6i_gateway, target);
- rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, target);
- return rt;
-#else
- return NULL;
-#endif
- }
nrt = ip6_rt_copy(rt);
+ if (nrt == NULL)
+ goto out;
+
nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
if (on_link)
nrt->rt6i_flags &= ~RTF_GATEWAY;
@@ -1144,19 +953,24 @@ source_ok:
ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
nrt->rt6i_dst.plen = 128;
- ipv6_addr_copy(&nrt->rt6i_gateway, target);
- nrt->rt6i_nexthop = ndisc_get_neigh(nrt->rt6i_dev, target);
- nrt->rt6i_dev = dev;
- nrt->u.dst.pmtu = ipv6_get_mtu(dev);
- if (!ipv6_addr_is_multicast(&nrt->rt6i_dst.addr))
- nrt->rt6i_hoplimit = ipv6_get_hoplimit(dev);
+ ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
+ nrt->rt6i_nexthop = neigh_clone(neigh);
+ /* Reset pmtu, it may be better */
+ nrt->u.dst.pmtu = ipv6_get_mtu(neigh->dev);
+ nrt->rt6i_hoplimit = ipv6_get_hoplimit(neigh->dev);
+
+ if (rt6_ins(nrt))
+ goto out;
- rt6_lock();
- rt6_ins(nrt);
- rt6_unlock();
+	/* Sic! rt6_redirect is called from bh, so this is allowed */
+ dst_release(&rt->u.dst);
+ if (rt->rt6i_flags&RTF_CACHE)
+ ip6_del_rt(rt);
+ return;
- /* BUGGGGGGG! nrt can point to nowhere. */
- return nrt;
+out:
+ dst_release(&rt->u.dst);
+ return;
}
/*
@@ -1164,29 +978,25 @@ source_ok:
* i.e. Path MTU discovery
*/
-void rt6_pmtu_discovery(struct in6_addr *addr, struct device *dev, int pmtu)
+void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
+ struct device *dev, u32 pmtu)
{
struct rt6_info *rt, *nrt;
- if (pmtu < 576 || pmtu > 65536) {
-#if RT6_DEBUG >= 1
- printk(KERN_DEBUG "rt6_pmtu_discovery: invalid MTU value %d\n",
- pmtu);
-#endif
+ if (pmtu < IPV6_MIN_MTU) {
+ if (net_ratelimit())
+ printk(KERN_DEBUG "rt6_pmtu_discovery: invalid MTU value %d\n",
+ pmtu);
return;
}
- rt = rt6_lookup(addr, NULL, dev->ifindex, 0);
+ rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
- if (rt == NULL || rt->u.dst.error) {
-#if RT6_DEBUG >= 2
- printk(KERN_DEBUG "rt6_pmtu_discovery: no route to host\n");
-#endif
+ if (rt == NULL)
return;
- }
if (pmtu >= rt->u.dst.pmtu)
- return;
+ goto out;
/* New mtu received -> path was valid.
They are sent only in response to data packets,
@@ -1194,39 +1004,42 @@ void rt6_pmtu_discovery(struct in6_addr *addr, struct device *dev, int pmtu)
*/
dst_confirm(&rt->u.dst);
- /* It is wrong, but I plugged the hole here.
- On-link routes are cloned differently,
- look at rt6_redirect --ANK
+	/* Host route. If it is static, it would be better
+	   not to override it but to add a new one, so that
+	   when the cache entry expires the old pmtu
+	   is restored automatically.
*/
- if (!(rt->rt6i_flags&RTF_GATEWAY))
- return;
-
if (rt->rt6i_dst.plen == 128) {
/*
* host route
*/
rt->u.dst.pmtu = pmtu;
rt->rt6i_flags |= RTF_MODIFIED;
-
- return;
+ goto out;
}
- nrt = ip6_rt_copy(rt);
- ipv6_addr_copy(&nrt->rt6i_dst.addr, addr);
- nrt->rt6i_dst.plen = 128;
-
- nrt->rt6i_flags |= (RTF_DYNAMIC | RTF_CACHE);
-
- /* It was missing. :-) :-)
- I wonder, kernel was deemed to crash after pkt_too_big
- and nobody noticed it. Hey, guys, do someone really
- use it? --ANK
+ /* Network route.
+ Two cases are possible:
+	   1. It is a connected route. Action: COW.
+	   2. It is a gatewayed or NONEXTHOP route. Action: clone it.
*/
- nrt->rt6i_nexthop = neigh_clone(rt->rt6i_nexthop);
+ if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
+ nrt = rt6_cow(rt, daddr, saddr);
+ nrt->rt6i_flags |= RTF_DYNAMIC;
+ dst_release(&nrt->u.dst);
+ } else {
+ nrt = ip6_rt_copy(rt);
+ if (nrt == NULL)
+ goto out;
+ ipv6_addr_copy(&nrt->rt6i_dst.addr, daddr);
+ nrt->rt6i_dst.plen = 128;
+ nrt->rt6i_nexthop = neigh_clone(rt->rt6i_nexthop);
+ nrt->rt6i_flags |= (RTF_DYNAMIC | RTF_CACHE);
+ rt6_ins(nrt);
+ }
- rt6_lock();
- rt6_ins(rt);
- rt6_unlock();
+out:
+ dst_release(&rt->u.dst);
}
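
The comments in this hunk describe three outcomes for an incoming packet-too-big: host routes are patched in place, connected network routes are copied-on-write via rt6_cow(), and gatewayed or NONEXTHOP routes get an explicit /128 copy. A condensed model of that dispatch (the action names are ours, not the kernel's):

    enum pmtu_action { UPDATE_IN_PLACE, COW_CLONE, COPY_CLONE };

    /* Mirrors the branch structure of rt6_pmtu_discovery(). */
    static enum pmtu_action classify(int dst_plen, int has_nexthop,
                                     int nonexthop)
    {
        if (dst_plen == 128)
            return UPDATE_IN_PLACE;     /* host route              */
        if (!has_nexthop && !nonexthop)
            return COW_CLONE;           /* connected network route */
        return COPY_CLONE;              /* gatewayed or NONEXTHOP  */
    }
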
/*
@@ -1247,16 +1060,19 @@ static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
rt->u.dst.rtt = ort->u.dst.rtt;
rt->u.dst.window = ort->u.dst.window;
rt->u.dst.mxlock = ort->u.dst.mxlock;
+ rt->u.dst.dev = ort->u.dst.dev;
+ rt->u.dst.lastuse = jiffies;
rt->rt6i_hoplimit = ort->rt6i_hoplimit;
- rt->rt6i_dev = ort->rt6i_dev;
+ rt->rt6i_expires = ort->rt6i_expires;
ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
- rt->rt6i_keylen = ort->rt6i_keylen;
rt->rt6i_flags = ort->rt6i_flags;
rt->rt6i_metric = ort->rt6i_metric;
memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
+#ifdef CONFIG_IPV6_SUBTREES
memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
+#endif
}
return rt;
}
@@ -1266,31 +1082,17 @@ struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct device *dev)
struct rt6_info *rt;
struct fib6_node *fn;
- RDBG(("rt6_get_dflt_router(%p,%p)[%p]", addr, dev,
- __builtin_return_address(0)));
-#if RT6_DEBUG >= 3
- {
- int i;
-
- RDBG(("addr["));
- for(i = 0; i < 8; i++) {
- RDBG(("%04x%c", addr->s6_addr16[i],
- i == 7 ? ']' : ':'));
- }
- }
-#endif
- RDBG(("\n"));
- rt6_lock();
-
fn = &ip6_routing_table;
+ start_bh_atomic();
for (rt = fn->leaf; rt; rt=rt->u.next) {
if (dev == rt->rt6i_dev &&
ipv6_addr_cmp(&rt->rt6i_gateway, addr) == 0)
break;
}
-
- rt6_unlock();
+ if (rt)
+ dst_clone(&rt->u.dst);
+ end_bh_atomic();
return rt;
}
@@ -1298,24 +1100,6 @@ struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
struct device *dev)
{
struct in6_rtmsg rtmsg;
- struct rt6_info *rt;
- int err;
-
- RDBG(("rt6_add_dflt_router(%p,%p)[%p] ", gwaddr, dev,
- __builtin_return_address(0)));
-#if RT6_DEBUG >= 3
- {
- struct in6_addr *addr = gwaddr;
- int i;
-
- RDBG(("gwaddr["));
- for(i = 0; i < 8; i++) {
- RDBG(("%04x%c", addr->s6_addr16[i],
- i == 7 ? ']' : ':'));
- }
- }
-#endif
- RDBG(("\n"));
memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
rtmsg.rtmsg_type = RTMSG_NEWROUTE;
@@ -1325,48 +1109,28 @@ struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
rtmsg.rtmsg_ifindex = dev->ifindex;
- rt = ip6_route_add(&rtmsg, &err);
-
- /* BUGGGGGGGGGGGGGGGGGGGG!
- rt can be not NULL, but point to heavens.
- */
-
- if (err) {
- printk(KERN_DEBUG "rt6_add_dflt: ip6_route_add error %d\n",
- err);
- }
- return rt;
+ ip6_route_add(&rtmsg);
+ return rt6_get_dflt_router(gwaddr, dev);
}
void rt6_purge_dflt_routers(int last_resort)
{
struct rt6_info *rt;
- struct fib6_node *fn;
u32 flags;
- RDBG(("rt6_purge_dflt_routers(%d)[%p]\n", last_resort,
- __builtin_return_address(0)));
- fn = &ip6_routing_table;
-
- rt6_dflt_pointer = NULL;
-
if (last_resort)
flags = RTF_ALLONLINK;
else
flags = RTF_DEFAULT | RTF_ADDRCONF;
- for (rt = fn->leaf; rt; ) {
- if ((rt->rt6i_flags & flags)) {
- struct rt6_info *drt;
-#if RT6_DEBUG >= 2
- printk(KERN_DEBUG "rt6_purge_dflt: deleting entry\n");
-#endif
- drt = rt;
- rt = rt->u.next;
- ip6_del_rt(drt);
- continue;
+restart:
+ rt6_dflt_pointer = NULL;
+
+ for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
+ if (rt->rt6i_flags & flags) {
+ ip6_del_rt(rt);
+ goto restart;
}
- rt = rt->u.next;
}
}
@@ -1389,7 +1153,7 @@ int ipv6_route_ioctl(unsigned int cmd, void *arg)
rtnl_lock();
switch (cmd) {
case SIOCADDRT:
- ip6_route_add(&rtmsg, &err);
+ err = ip6_route_add(&rtmsg);
break;
case SIOCDELRT:
err = ip6_route_del(&rtmsg);
@@ -1414,7 +1178,7 @@ int ipv6_route_ioctl(unsigned int cmd, void *arg)
*/
int ip6_pkt_discard(struct sk_buff *skb)
-{
+{
ipv6_statistics.Ip6OutNoRoutes++;
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
kfree_skb(skb);
@@ -1429,21 +1193,6 @@ int ip6_rt_addr_add(struct in6_addr *addr, struct device *dev)
{
struct rt6_info *rt;
- RDBG(("ip6_rt_addr_add(%p,%p)[%p]\n", addr, dev,
- __builtin_return_address(0)));
-#if RT6_DEBUG >= 3
- {
- int i;
-
- RDBG(("addr["));
- for(i = 0; i < 8; i++) {
- RDBG(("%04x%c", addr->s6_addr16[i],
- i == 7 ? ']' : ':'));
- }
- }
-#endif
- RDBG(("\n"));
-
rt = dst_alloc(sizeof(struct rt6_info), &ip6_dst_ops);
if (rt == NULL)
return -ENOMEM;
@@ -1465,10 +1214,7 @@ int ip6_rt_addr_add(struct in6_addr *addr, struct device *dev)
ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
rt->rt6i_dst.plen = 128;
-
- rt6_lock();
rt6_ins(rt);
- rt6_unlock();
return 0;
}
@@ -1480,12 +1226,16 @@ int ip6_rt_addr_add(struct in6_addr *addr, struct device *dev)
int ip6_rt_addr_del(struct in6_addr *addr, struct device *dev)
{
struct rt6_info *rt;
+ int err = -ENOENT;
- rt = rt6_lookup(addr, NULL, loopback_dev.ifindex, RTF_LINKRT);
- if (rt && rt->rt6i_dst.plen == 128)
- return ip6_del_rt(rt);
+ rt = rt6_lookup(addr, NULL, loopback_dev.ifindex, 1);
+ if (rt) {
+ if (rt->rt6i_dst.plen == 128)
+ err= ip6_del_rt(rt);
+ dst_release(&rt->u.dst);
+ }
- return 0;
+ return err;
}
#ifdef CONFIG_RT6_POLICY
@@ -1587,75 +1337,65 @@ static struct rt6_info *rt6_flow_lookup(struct rt6_info *rt,
}
error:
+ dst_clone(&ip6_null_entry.u.dst);
return &ip6_null_entry;
found:
-
if (nrt == NULL)
goto error;
nrt->rt6i_flags |= RTF_CACHE;
- /* BUGGGG! nrt can point to nowhere! */
- rt6_ins(nrt);
-
+ dst_clone(&nrt->u.dst);
+ err = rt6_ins(nrt);
+ if (err)
+ nrt->u.dst.error = err;
return nrt;
}
#endif
-/*
- * Nope, I am not idiot. I see that it is the ugliest of ugly routines.
- * Anyone is advertised to write better one. --ANK
- */
+static int fib6_ifdown(struct rt6_info *rt, void *arg)
+{
+ if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
+ rt != &ip6_null_entry) {
+ RT6_TRACE("deleted by ifdown %p\n", rt);
+ return -1;
+ }
+ return 0;
+}
-struct rt6_ifdown_arg {
+void rt6_ifdown(struct device *dev)
+{
+ fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
+}
+
+struct rt6_mtu_change_arg
+{
struct device *dev;
- struct rt6_info *rt;
+ unsigned mtu;
};
-
-static void rt6_ifdown_node(struct fib6_node *fn, void *p_arg)
+static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
- struct rt6_info *rt;
- struct rt6_ifdown_arg *arg = (struct rt6_ifdown_arg *) p_arg;
-
- if (arg->rt != NULL)
- return;
-
- for (rt = fn->leaf; rt; rt = rt->u.next) {
- if (rt->rt6i_dev == arg->dev || arg->dev == NULL) {
- arg->rt = rt;
- return;
- }
- }
+ struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
+
+	/* In IPv6 pmtu discovery is not optional,
+	   so the RTAX_MTU lock cannot disable it.
+ We still use this lock to block changes
+ caused by addrconf/ndisc.
+ */
+ if (rt->rt6i_dev == arg->dev &&
+ !(rt->u.dst.mxlock&(1<<RTAX_MTU)))
+ rt->u.dst.pmtu = arg->mtu;
+ return 0;
}
-void rt6_ifdown(struct device *dev)
+void rt6_mtu_change(struct device *dev, unsigned mtu)
{
- int count = 0;
- struct rt6_ifdown_arg arg;
- struct rt6_info *rt;
+ struct rt6_mtu_change_arg arg;
- do {
- arg.dev = dev;
- arg.rt = NULL;
- fib6_walk_tree(&ip6_routing_table, rt6_ifdown_node, &arg,
- RT6_FILTER_RTNODES);
- if (arg.rt != NULL)
- ip6_del_rt(arg.rt);
- count++;
- } while (arg.rt != NULL);
-
- /* And default routes ... */
-
- for (rt = ip6_routing_table.leaf; rt; ) {
- if (rt != &ip6_null_entry && (rt->rt6i_dev == dev || dev == NULL)) {
- struct rt6_info *deleting = rt;
- rt = rt->u.next;
- ip6_del_rt(deleting);
- continue;
- }
- rt = rt->u.next;
- }
+ arg.dev = dev;
+ arg.mtu = mtu;
+ fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
}
#ifdef CONFIG_RTNETLINK
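
rt6_ifdown() and rt6_mtu_change() are now thin wrappers around fib6_clean_tree(), whose per-route callback returns 0 to keep an entry and -1 to have it deleted, as fib6_ifdown() above shows. The same walk-and-filter contract modeled over a plain linked list (user-space sketch, not the fib6 tree itself):

    #include <stddef.h>

    struct route {
        struct route *next;
        const void *dev;
    };

    typedef int (*clean_fn)(struct route *rt, void *arg);

    /* Unlink every route for which func() returns -1; keep the rest. */
    static void clean_list(struct route **head, clean_fn func, void *arg)
    {
        struct route **pp = head, *rt;

        while ((rt = *pp) != NULL) {
            if (func(rt, arg) == -1)
                *pp = rt->next;   /* the kernel would ip6_del_rt(rt) */
            else
                pp = &rt->next;
        }
    }
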
@@ -1714,37 +1454,28 @@ int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
{
struct rtmsg *r = NLMSG_DATA(nlh);
struct in6_rtmsg rtmsg;
- int err = 0;
if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
return -EINVAL;
- ip6_route_add(&rtmsg, &err);
- return err;
+ return ip6_route_add(&rtmsg);
}
struct rt6_rtnl_dump_arg
{
struct sk_buff *skb;
struct netlink_callback *cb;
- int skip;
- int count;
- int stop;
};
static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
struct in6_addr *dst,
struct in6_addr *src,
int iif,
- int type, pid_t pid, u32 seq)
+ int type, u32 pid, u32 seq)
{
struct rtmsg *rtm;
struct nlmsghdr *nlh;
unsigned char *b = skb->tail;
-#ifdef CONFIG_RTNL_OLD_IFINFO
- unsigned char *o;
-#else
struct rtattr *mx;
-#endif
struct rta_cacheinfo ci;
nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*rtm));
@@ -1762,9 +1493,6 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
rtm->rtm_type = RTN_UNICAST;
rtm->rtm_flags = 0;
rtm->rtm_scope = RT_SCOPE_UNIVERSE;
-#ifdef CONFIG_RTNL_OLD_IFINFO
- rtm->rtm_nhs = 0;
-#endif
rtm->rtm_protocol = RTPROT_BOOT;
if (rt->rt6i_flags&RTF_DYNAMIC)
rtm->rtm_protocol = RTPROT_REDIRECT;
@@ -1776,19 +1504,18 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
if (rt->rt6i_flags&RTF_CACHE)
rtm->rtm_flags |= RTM_F_CLONED;
-#ifdef CONFIG_RTNL_OLD_IFINFO
- o = skb->tail;
-#endif
if (dst) {
RTA_PUT(skb, RTA_DST, 16, dst);
rtm->rtm_dst_len = 128;
} else if (rtm->rtm_dst_len)
RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
+#ifdef CONFIG_IPV6_SUBTREES
if (src) {
RTA_PUT(skb, RTA_SRC, 16, src);
rtm->rtm_src_len = 128;
} else if (rtm->rtm_src_len)
RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
+#endif
if (iif)
RTA_PUT(skb, RTA_IIF, 4, &iif);
else if (dst) {
@@ -1796,14 +1523,6 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
if (ifp)
RTA_PUT(skb, RTA_PREFSRC, 16, &ifp->addr);
}
-#ifdef CONFIG_RTNL_OLD_IFINFO
- if (rt->u.dst.pmtu)
- RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &rt->u.dst.pmtu);
- if (rt->u.dst.window)
- RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &rt->u.dst.window);
- if (rt->u.dst.rtt)
- RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &rt->u.dst.rtt);
-#else
mx = (struct rtattr*)skb->tail;
RTA_PUT(skb, RTA_METRICS, 0, NULL);
if (rt->u.dst.mxlock)
@@ -1817,7 +1536,6 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
mx->rta_len = skb->tail - (u8*)mx;
if (mx->rta_len == RTA_LENGTH(0))
skb_trim(skb, (u8*)mx - skb->data);
-#endif
if (rt->u.dst.neighbour)
RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
if (rt->u.dst.dev)
@@ -1828,13 +1546,10 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
ci.rta_expires = rt->rt6i_expires - jiffies;
else
ci.rta_expires = 0;
- ci.rta_used = 0;
+ ci.rta_used = atomic_read(&rt->u.dst.refcnt);
ci.rta_clntref = atomic_read(&rt->u.dst.use);
ci.rta_error = rt->u.dst.error;
RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
-#ifdef CONFIG_RTNL_OLD_IFINFO
- rtm->rtm_optlen = skb->tail - o;
-#endif
nlh->nlmsg_len = skb->tail - b;
return skb->len;
@@ -1844,45 +1559,98 @@ rtattr_failure:
return -1;
}
-static void rt6_dump_node(struct fib6_node *fn, void *p_arg)
+static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
{
- struct rt6_info *rt;
struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
- if (arg->stop)
- return;
+ return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
+ NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq);
+}
- for (rt = fn->leaf; rt; rt = rt->u.next) {
- if (arg->count < arg->skip) {
- arg->count++;
- continue;
- }
- if (rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
- NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq) <= 0) {
- arg->stop = 1;
- break;
+static int fib6_dump_node(struct fib6_walker_t *w)
+{
+ int res;
+ struct rt6_info *rt;
+
+ for (rt = w->leaf; rt; rt = rt->u.next) {
+ res = rt6_dump_route(rt, w->args);
+ if (res < 0) {
+ /* Frame is full, suspend walking */
+ w->leaf = rt;
+ return 1;
}
- arg->count++;
+ BUG_TRAP(res!=0);
}
+ w->leaf = NULL;
+ return 0;
}
+static int fib6_dump_done(struct netlink_callback *cb)
+{
+ struct fib6_walker_t *w = (void*)cb->args[0];
+
+ if (w) {
+ cb->args[0] = 0;
+ start_bh_atomic();
+ fib6_walker_unlink(w);
+ end_bh_atomic();
+ kfree(w);
+ }
+ if (cb->args[1]) {
+ cb->done = (void*)cb->args[1];
+ cb->args[1] = 0;
+ }
+ return cb->done(cb);
+}
int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
struct rt6_rtnl_dump_arg arg;
+ struct fib6_walker_t *w;
+ int res;
arg.skb = skb;
arg.cb = cb;
- arg.skip = cb->args[0];
- arg.count = 0;
- arg.stop = 0;
- start_bh_atomic();
- fib6_walk_tree(&ip6_routing_table, rt6_dump_node, &arg, RT6_FILTER_RTNODES);
- if (arg.stop == 0)
- rt6_dump_node(&ip6_routing_table, &arg);
- end_bh_atomic();
- cb->args[0] = arg.count;
- return skb->len;
+
+ w = (void*)cb->args[0];
+ if (w == NULL) {
+ /* New dump:
+ *
+ * 1. hook callback destructor.
+ */
+ cb->args[1] = (long)cb->done;
+ cb->done = fib6_dump_done;
+
+ /*
+ * 2. allocate and initialize walker.
+ */
+ w = kmalloc(sizeof(*w), GFP_KERNEL);
+ if (w == NULL)
+ return -ENOMEM;
+ RT6_TRACE("dump<%p", w);
+ memset(w, 0, sizeof(*w));
+ w->root = &ip6_routing_table;
+ w->func = fib6_dump_node;
+ w->args = &arg;
+ cb->args[0] = (long)w;
+ start_bh_atomic();
+ res = fib6_walk(w);
+ end_bh_atomic();
+ } else {
+ w->args = &arg;
+ start_bh_atomic();
+ res = fib6_walk_continue(w);
+ end_bh_atomic();
+ }
+#if RT6_DEBUG >= 3
+ if (res <= 0 && skb->len == 0)
+ RT6_TRACE("%p>dump end\n", w);
+#endif
+	/* res < 0 is an error. (really, it is impossible here)
+	   res == 0 means the dump is complete, but the skb may still contain data.
+	   res > 0 means the dump is not complete and the frame is full.
+ */
+ return res < 0 ? res : skb->len;
}
int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
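
inet6_dump_fib() above parks a fib6_walker_t in cb->args[0], so a dump that fills one skb can resume exactly where it suspended on the next call; fib6_dump_node() returns 1 to suspend when a frame fills up. The return convention, modeled with a trivial resumable iterator (assumed simplified state; the real walker hangs off the fib6 tree):

    /* Resumable dump. Returns, like the kernel code:
     *   < 0 error, 0 complete, > 0 suspended (frame full). */
    struct walker { int pos, total; };

    static int dump_some(struct walker *w, int room)
    {
        while (w->pos < w->total) {
            if (room-- <= 0)
                return 1;        /* suspend; state survives in w   */
            w->pos++;            /* emit one route into the frame  */
        }
        return 0;                /* dump complete                  */
    }
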
@@ -1974,10 +1742,10 @@ void inet6_rt_notify(int event, struct rt6_info *rt)
#ifdef CONFIG_PROC_FS
-
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
-struct rt6_proc_arg {
+struct rt6_proc_arg
+{
char *buffer;
int offset;
int length;
@@ -1985,109 +1753,18 @@ struct rt6_proc_arg {
int len;
};
-static void rt6_info_node(struct fib6_node *fn, void *p_arg)
+static int rt6_info_route(struct rt6_info *rt, void *p_arg)
{
- struct rt6_info *rt;
struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
-
- for (rt = fn->leaf; rt; rt = rt->u.next) {
- int i;
-
- if (arg->skip < arg->offset / RT6_INFO_LEN) {
- arg->skip++;
- continue;
- }
-
- if (arg->len >= arg->length)
- return;
-
- for (i=0; i<16; i++) {
- sprintf(arg->buffer + arg->len, "%02x",
- rt->rt6i_dst.addr.s6_addr[i]);
- arg->len += 2;
- }
- arg->len += sprintf(arg->buffer + arg->len, " %02x ",
- rt->rt6i_dst.plen);
-
- for (i=0; i<16; i++) {
- sprintf(arg->buffer + arg->len, "%02x",
- rt->rt6i_src.addr.s6_addr[i]);
- arg->len += 2;
- }
- arg->len += sprintf(arg->buffer + arg->len, " %02x ",
- rt->rt6i_src.plen);
-
- if (rt->rt6i_nexthop) {
- for (i=0; i<16; i++) {
- sprintf(arg->buffer + arg->len, "%02x",
- rt->rt6i_nexthop->primary_key[i]);
- arg->len += 2;
- }
- } else {
- sprintf(arg->buffer + arg->len,
- "00000000000000000000000000000000");
- arg->len += 32;
- }
- arg->len += sprintf(arg->buffer + arg->len,
- " %08x %08x %08x %08x %8s\n",
- rt->rt6i_metric, atomic_read(&rt->rt6i_use),
- atomic_read(&rt->rt6i_ref), rt->rt6i_flags,
- rt->rt6i_dev ? rt->rt6i_dev->name : "");
- }
-}
-
-static int rt6_proc_info(char *buffer, char **start, off_t offset, int length,
- int dummy)
-{
- struct rt6_proc_arg arg;
- arg.buffer = buffer;
- arg.offset = offset;
- arg.length = length;
- arg.skip = 0;
- arg.len = 0;
-
- fib6_walk_tree(&ip6_routing_table, rt6_info_node, &arg,
- RT6_FILTER_RTNODES);
-
- rt6_info_node(&ip6_routing_table, &arg);
-
- *start = buffer;
- if (offset)
- *start += offset % RT6_INFO_LEN;
-
- arg.len -= offset % RT6_INFO_LEN;
-
- if(arg.len > length)
- arg.len = length;
- if(arg.len < 0)
- arg.len = 0;
-
- return arg.len;
-}
-
-#define PTR_SZ (sizeof(void *) * 2)
-#define FI_LINE_SZ (2 * (PTR_SZ) + 7 + 32 + 4 + 32 + 4)
-
-static void rt6_tree_node(struct fib6_node *fn, void *p_arg)
-{
- struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
- struct rt6_info *rt;
- char f;
int i;
- rt = fn->leaf;
-
- if (arg->skip < arg->offset / FI_LINE_SZ) {
+ if (arg->skip < arg->offset / RT6_INFO_LEN) {
arg->skip++;
- return;
+ return 0;
}
- if (arg->len + FI_LINE_SZ >= arg->length)
- return;
-
- f = (fn->fn_flags & RTN_RTINFO) ? 'r' : 'n';
- arg->len += sprintf(arg->buffer + arg->len, "%p %p %02x %c ",
- fn, fn->parent, fn->fn_bit, f);
+ if (arg->len >= arg->length)
+ return 0;
for (i=0; i<16; i++) {
sprintf(arg->buffer + arg->len, "%02x",
@@ -2096,18 +1773,41 @@ static void rt6_tree_node(struct fib6_node *fn, void *p_arg)
}
arg->len += sprintf(arg->buffer + arg->len, " %02x ",
rt->rt6i_dst.plen);
-
+
+#ifdef CONFIG_IPV6_SUBTREES
for (i=0; i<16; i++) {
sprintf(arg->buffer + arg->len, "%02x",
rt->rt6i_src.addr.s6_addr[i]);
arg->len += 2;
}
- arg->len += sprintf(arg->buffer + arg->len, " %02x\n",
+ arg->len += sprintf(arg->buffer + arg->len, " %02x ",
rt->rt6i_src.plen);
+#else
+ sprintf(arg->buffer + arg->len,
+ "00000000000000000000000000000000 00 ");
+ arg->len += 36;
+#endif
+ if (rt->rt6i_nexthop) {
+ for (i=0; i<16; i++) {
+ sprintf(arg->buffer + arg->len, "%02x",
+ rt->rt6i_nexthop->primary_key[i]);
+ arg->len += 2;
+ }
+ } else {
+ sprintf(arg->buffer + arg->len,
+ "00000000000000000000000000000000");
+ arg->len += 32;
+ }
+ arg->len += sprintf(arg->buffer + arg->len,
+ " %08x %08x %08x %08x %8s\n",
+ rt->rt6i_metric, atomic_read(&rt->u.dst.use),
+ atomic_read(&rt->u.dst.refcnt), rt->rt6i_flags,
+ rt->rt6i_dev ? rt->rt6i_dev->name : "");
+ return 0;
}
-static int rt6_proc_tree(char *buffer, char **start, off_t offset, int length,
+static int rt6_proc_info(char *buffer, char **start, off_t offset, int length,
int dummy)
{
struct rt6_proc_arg arg;
@@ -2117,7 +1817,7 @@ static int rt6_proc_tree(char *buffer, char **start, off_t offset, int length,
arg.skip = 0;
arg.len = 0;
- fib6_walk_tree(&ip6_routing_table, rt6_tree_node, &arg, 0);
+ fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
*start = buffer;
if (offset)
@@ -2125,15 +1825,14 @@ static int rt6_proc_tree(char *buffer, char **start, off_t offset, int length,
arg.len -= offset % RT6_INFO_LEN;
- if(arg.len > length)
+ if (arg.len > length)
arg.len = length;
- if(arg.len < 0)
+ if (arg.len < 0)
arg.len = 0;
return arg.len;
}
-
extern struct rt6_statistics rt6_stats;
static int rt6_proc_stats(char *buffer, char **start, off_t offset, int length,
@@ -2141,10 +1840,11 @@ static int rt6_proc_stats(char *buffer, char **start, off_t offset, int length,
{
int len;
- len = sprintf(buffer, "%04x %04x %04x %04x %04x\n",
+ len = sprintf(buffer, "%04x %04x %04x %04x %04x %04x\n",
rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
- rt6_stats.fib_rt_cache);
+ rt6_stats.fib_rt_cache,
+ atomic_read(&ip6_dst_ops.entries));
len -= offset;
@@ -2164,12 +1864,6 @@ static struct proc_dir_entry proc_rt6_info = {
0, &proc_net_inode_operations,
rt6_proc_info
};
-static struct proc_dir_entry proc_rt6_tree = {
- PROC_NET_RT6_TREE, 7, "ip6_fib",
- S_IFREG | S_IRUGO, 1, 0, 0,
- 0, &proc_net_inode_operations,
- rt6_proc_tree
-};
static struct proc_dir_entry proc_rt6_stats = {
PROC_NET_RT6_STATS, 9, "rt6_stats",
S_IFREG | S_IRUGO, 1, 0, 0,
@@ -2230,7 +1924,6 @@ __initfunc(void ip6_route_init(void))
{
#ifdef CONFIG_PROC_FS
proc_net_register(&proc_rt6_info);
- proc_net_register(&proc_rt6_tree);
proc_net_register(&proc_rt6_stats);
#endif
#ifdef CONFIG_IPV6_NETLINK
@@ -2243,7 +1936,6 @@ void ip6_route_cleanup(void)
{
#ifdef CONFIG_PROC_FS
proc_net_unregister(PROC_NET_RT6);
- proc_net_unregister(PROC_NET_RT6_TREE);
proc_net_unregister(PROC_NET_RT6_STATS);
#endif
#ifdef CONFIG_IPV6_NETLINK
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 577b85d0f..0d6efd515 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -6,7 +6,7 @@
* Pedro Roque <roque@di.fc.ul.pt>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
*
- * $Id: sit.c,v 1.27 1998/03/08 05:56:57 davem Exp $
+ * $Id: sit.c,v 1.28 1998/08/26 12:05:22 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -434,7 +434,7 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct device *dev)
ip_rt_put(rt);
goto tx_error;
}
- if (mtu >= 576) {
+ if (mtu >= IPV6_MIN_MTU) {
if (skb->dst && mtu < skb->dst->pmtu) {
struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
if (mtu < rt6->u.dst.pmtu) {
@@ -475,6 +475,8 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct device *dev)
tunnel->recursion--;
return 0;
}
+ if (skb->sk)
+ skb_set_owner_w(new_skb, skb->sk);
dev_kfree_skb(skb);
skb = new_skb;
}
@@ -491,7 +493,7 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct device *dev)
iph = skb->nh.iph;
iph->version = 4;
iph->ihl = sizeof(struct iphdr)>>2;
- if (mtu > 576)
+ if (mtu > IPV6_MIN_MTU)
iph->frag_off = __constant_htons(IP_DF);
else
iph->frag_off = 0;
@@ -608,7 +610,7 @@ static struct net_device_stats *ipip6_tunnel_get_stats(struct device *dev)
static int ipip6_tunnel_change_mtu(struct device *dev, int new_mtu)
{
- if (new_mtu < 576 || new_mtu > 0xFFF8 - sizeof(struct iphdr))
+ if (new_mtu < IPV6_MIN_MTU || new_mtu > 0xFFF8 - sizeof(struct iphdr))
return -EINVAL;
dev->mtu = new_mtu;
return 0;
@@ -662,8 +664,8 @@ static int ipip6_tunnel_init(struct device *dev)
if (tdev) {
dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
dev->mtu = tdev->mtu - sizeof(struct iphdr);
- if (dev->mtu < 576)
- dev->mtu = 576;
+ if (dev->mtu < IPV6_MIN_MTU)
+ dev->mtu = IPV6_MIN_MTU;
}
dev->iflink = tunnel->parms.link;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5fa45dce5..c997999db 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -5,7 +5,7 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * $Id: tcp_ipv6.c,v 1.82 1998/06/11 03:15:52 davem Exp $
+ * $Id: tcp_ipv6.c,v 1.89 1998/08/28 00:27:54 davem Exp $
*
* Based on:
* linux/net/ipv4/tcp.c
@@ -123,16 +123,33 @@ static int tcp_v6_verify_bind(struct sock *sk, unsigned short snum)
}
if(result == 0) {
if(tb == NULL) {
- if(tcp_bucket_create(snum) == NULL)
+ if((tb = tcp_bucket_create(snum)) == NULL)
result = 1;
+ else if (sk->reuse && sk->state != TCP_LISTEN)
+ tb->flags |= TCPB_FLAG_FASTREUSE;
} else {
/* It could be pending garbage collection, this
* kills the race and prevents it from disappearing
* out from under us by the time we use it. -DaveM
*/
- if(tb->owners == NULL && !(tb->flags & TCPB_FLAG_LOCKED)) {
- tb->flags = TCPB_FLAG_LOCKED;
- tcp_dec_slow_timer(TCP_SLT_BUCKETGC);
+ if(tb->owners == NULL) {
+ if (!(tb->flags & TCPB_FLAG_LOCKED)) {
+ tb->flags = (TCPB_FLAG_LOCKED |
+ ((sk->reuse &&
+ sk->state != TCP_LISTEN) ?
+ TCPB_FLAG_FASTREUSE : 0));
+ tcp_dec_slow_timer(TCP_SLT_BUCKETGC);
+ } else if (!(tb->flags & TCPB_FLAG_GOODSOCKNUM)) {
+ /* Someone is in between the bind
+ * and the actual connect or listen.
+ * See if it was a legitimate reuse
+ * and we are as well, else punt.
+ */
+ if (sk->reuse == 0 ||
+ !(tb->flags & TCPB_FLAG_FASTREUSE))
+ result = 1;
+ } else
+ tb->flags &= ~TCPB_FLAG_GOODSOCKNUM;
}
}
}
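
The reworked bind check lets a bucket remember whether its previous owners all allowed fast reuse (SO_REUSEADDR set and not listening); a binder arriving in the window between bind and the actual connect/listen then succeeds only if both sides qualify. A condensed model of the ownerless-bucket branch (flag names copied from the hunk; locking and flag updates omitted):

    #define TCPB_FLAG_LOCKED      0x1
    #define TCPB_FLAG_FASTREUSE   0x2
    #define TCPB_FLAG_GOODSOCKNUM 0x4

    /* Return 1 on conflict, 0 if the bind may proceed. */
    static int ownerless_bucket_conflict(int tb_flags, int sk_reuse)
    {
        if (!(tb_flags & TCPB_FLAG_LOCKED))
            return 0;            /* pending-GC bucket: just take it */
        if (!(tb_flags & TCPB_FLAG_GOODSOCKNUM))
            /* someone sits between bind and connect/listen */
            return !(sk_reuse && (tb_flags & TCPB_FLAG_FASTREUSE));
        return 0;                /* consume the GOODSOCKNUM pass    */
    }
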
@@ -358,7 +375,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
struct dst_entry *dst;
struct sk_buff *buff;
int addr_type;
- int mss;
if (sk->state != TCP_CLOSE)
return(-EISCONN);
@@ -403,6 +419,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
*/
if (addr_type == IPV6_ADDR_MAPPED) {
+ u32 exthdrlen = tp->ext_header_len;
struct sockaddr_in sin;
int err;
@@ -418,10 +435,10 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
if (err) {
+ tp->ext_header_len = exthdrlen;
sk->tp_pinfo.af_tcp.af_specific = &ipv6_specific;
sk->backlog_rcv = tcp_v6_do_rcv;
} else {
- /* Yuup... And it is not the only place... --ANK */
ipv6_addr_set(&np->saddr, 0, 0, __constant_htonl(0x0000FFFF),
sk->saddr);
ipv6_addr_set(&np->rcv_saddr, 0, 0, __constant_htonl(0x0000FFFF),
@@ -441,18 +458,18 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
fl.uli_u.ports.dport = usin->sin6_port;
fl.uli_u.ports.sport = sk->sport;
+ if (np->opt && np->opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
+ fl.nl_u.ip6_u.daddr = rt0->addr;
+ }
+
dst = ip6_route_output(sk, &fl);
-
+
if (dst->error) {
dst_release(dst);
return dst->error;
}
- if (dst->pmtu < 576) {
- dst_release(dst);
- return -EINVAL;
- }
-
if (fl.oif == 0 && addr_type&IPV6_ADDR_LINKLOCAL) {
/* Ough! This guy tries to connect to link local
* address and did not specify interface.
@@ -462,11 +479,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
sk->bound_dev_if = dst->dev->ifindex;
}
- ip6_dst_store(sk, dst);
+ ip6_dst_store(sk, dst, NULL);
if (saddr == NULL) {
ifa = ipv6_get_saddr(dst, &np->daddr);
-
+
if (ifa == NULL)
return -ENETUNREACH;
@@ -477,6 +494,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
ipv6_addr_copy(&np->saddr, saddr);
}
+ tp->ext_header_len = 0;
+ if (np->opt)
+ tp->ext_header_len = np->opt->opt_flen+np->opt->opt_nflen;
+ /* Reset mss clamp */
+ tp->mss_clamp = ~0;
+
buff = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
0, GFP_KERNEL);
@@ -498,15 +521,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
np->daddr.s6_addr32[3],
sk->sport, sk->dport);
- sk->mtu = dst->pmtu;
- mss = sk->mtu - sizeof(struct ipv6hdr);
-#if 0
- if (np->opt) {
- /* Adjust mss */
- }
-#endif
-
- tcp_connect(sk, buff, mss);
+ tcp_connect(sk, buff, dst->pmtu);
return 0;
}
@@ -555,10 +570,12 @@ out:
return retval;
}
-void tcp_v6_err(struct sk_buff *skb, int type, int code, unsigned char *header, __u32 info,
- struct in6_addr *saddr, struct in6_addr *daddr,
- struct inet6_protocol *protocol)
+void tcp_v6_err(struct sk_buff *skb, struct ipv6hdr *hdr,
+ struct inet6_skb_parm *opt,
+ int type, int code, unsigned char *header, __u32 info)
{
+ struct in6_addr *saddr = &hdr->saddr;
+ struct in6_addr *daddr = &hdr->daddr;
struct tcphdr *th = (struct tcphdr *)header;
struct ipv6_pinfo *np;
struct sock *sk;
@@ -567,7 +584,8 @@ void tcp_v6_err(struct sk_buff *skb, int type, int code, unsigned char *header,
struct tcp_opt *tp;
__u32 seq;
- /* XXX: length check for tcphdr missing here */
+ if (header + 8 > skb->tail)
+ return;
sk = tcp_v6_lookup(daddr, th->dest, saddr, th->source, skb->dev->ifindex);
@@ -588,15 +606,20 @@ void tcp_v6_err(struct sk_buff *skb, int type, int code, unsigned char *header,
np = &sk->net_pinfo.af_inet6;
if (type == ICMPV6_PKT_TOOBIG && sk->state != TCP_LISTEN) {
+ struct dst_entry *dst = NULL;
/* icmp should have updated the destination cache entry */
if (sk->dst_cache)
- dst_check(&sk->dst_cache, np->dst_cookie);
+ dst = dst_check(&sk->dst_cache, np->dst_cookie);
- if (sk->dst_cache == NULL) {
+ if (dst == NULL) {
struct flowi fl;
struct dst_entry *dst;
-
+
+ /* BUGGG_FUTURE: Again, it is not clear how
+ to handle the rthdr case. Ignore this complexity
+ for now.
+ */
fl.proto = IPPROTO_TCP;
fl.nl_u.ip6_u.daddr = &np->daddr;
fl.nl_u.ip6_u.saddr = &np->saddr;
@@ -605,23 +628,19 @@ void tcp_v6_err(struct sk_buff *skb, int type, int code, unsigned char *header,
fl.uli_u.ports.sport = sk->sport;
dst = ip6_route_output(sk, &fl);
+ } else
+ dst = dst_clone(dst);
- ip6_dst_store(sk, dst);
- }
-
- if (sk->dst_cache->error) {
- sk->err_soft = sk->dst_cache->error;
- } else {
- /* FIXME: Reset sk->mss, taking into account TCP option
- * bytes for timestamps. -DaveM
- */
- sk->mtu = sk->dst_cache->pmtu;
- }
- if (sk->sock_readers) { /* remove later */
- printk(KERN_DEBUG "tcp_v6_err: pmtu disc: socket locked.\n");
- return;
- }
- tcp_simple_retransmit(sk);
+ if (dst->error) {
+ sk->err_soft = dst->error;
+ } else if (tp->pmtu_cookie > dst->pmtu
+ && !atomic_read(&sk->sock_readers)) {
+ lock_sock(sk);
+ tcp_sync_mss(sk, dst->pmtu);
+ tcp_simple_retransmit(sk);
+ release_sock(sk);
+ } /* else let the usual retransmit timer handle it */
+ dst_release(dst);
return;
}
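
For reference, the shape of the new PKT_TOOBIG handling, extracted into a self-contained sketch. Everything here is a stand-in of my own (struct dstlike, the helper names); it only mirrors the control flow of the hunk above, it is not kernel code.

#include <stdio.h>

struct dstlike { int error; unsigned pmtu; };

static struct dstlike routes[1] = { { 0, 1280 } };

static struct dstlike *check_cached_route(void) { return NULL; }	/* stale */
static struct dstlike *route_output(void)       { return &routes[0]; }
static void sync_mss(unsigned pmtu)   { printf("mss synced to %u\n", pmtu); }
static void simple_retransmit(void)   { printf("retransmitting\n"); }

static void handle_pkt_too_big(unsigned pmtu_cookie, int sock_busy)
{
	struct dstlike *dst = check_cached_route();

	if (dst == NULL)
		dst = route_output();	/* cache was invalidated: re-route */

	if (dst->error)
		return;			/* the code records dst->error as err_soft */

	if (pmtu_cookie > dst->pmtu && !sock_busy) {
		sync_mss(dst->pmtu);	/* shrink the effective MSS */
		simple_retransmit();	/* resend segments that were too big */
	}
	/* else: let the usual retransmit timer handle it */
}

int main(void)
{
	handle_pkt_too_big(1500, 0);
	return 0;
}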
@@ -631,7 +650,7 @@ void tcp_v6_err(struct sk_buff *skb, int type, int code, unsigned char *header,
struct open_request *req, *prev;
struct ipv6hdr hd;
case TCP_LISTEN:
- if (sk->sock_readers)
+ if (atomic_read(&sk->sock_readers))
return;
/* Grrrr - fix this later. */
@@ -680,6 +699,7 @@ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req)
{
struct sk_buff * skb;
struct dst_entry *dst;
+ struct ipv6_txoptions *opt = NULL;
struct flowi fl;
int mss;
@@ -690,19 +710,26 @@ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req)
fl.uli_u.ports.dport = req->rmt_port;
fl.uli_u.ports.sport = sk->sport;
- dst = ip6_route_output(sk, &fl);
- if (dst->error) {
- dst_release(dst);
- return;
+ opt = sk->net_pinfo.af_inet6.opt;
+ if (opt == NULL &&
+ sk->net_pinfo.af_inet6.rxopt.bits.srcrt == 2 &&
+ req->af.v6_req.pktopts) {
+ struct sk_buff *pktopts = req->af.v6_req.pktopts;
+ struct inet6_skb_parm *rxopt = (struct inet6_skb_parm *)pktopts->cb;
+ if (rxopt->srcrt)
+ opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(pktopts->nh.raw + rxopt->srcrt));
}
- mss = dst->pmtu - sizeof(struct ipv6hdr) - sizeof(struct tcphdr);
-#if 0
- /* Subtract option length... */
- if (opt) {
- mss -= opt->optlen;
+ if (opt && opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
+ fl.nl_u.ip6_u.daddr = rt0->addr;
}
-#endif
+
+ dst = ip6_route_output(sk, &fl);
+ if (dst->error)
+ goto done;
+
+ mss = dst->pmtu - sizeof(struct ipv6hdr) - sizeof(struct tcphdr);
skb = tcp_make_synack(sk, dst, req, mss);
if (skb) {
@@ -712,13 +739,22 @@ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req)
&req->af.v6_req.loc_addr, &req->af.v6_req.rmt_addr,
csum_partial((char *)th, skb->len, skb->csum));
- ip6_xmit(sk, skb, &fl, req->af.v6_req.opt);
+ fl.nl_u.ip6_u.daddr = &req->af.v6_req.rmt_addr;
+ ip6_xmit(sk, skb, &fl, opt);
}
+
+done:
dst_release(dst);
+ if (opt && opt != sk->net_pinfo.af_inet6.opt)
+ sock_kfree_s(sk, opt, opt->tot_len);
}
static void tcp_v6_or_free(struct open_request *req)
{
+ if (req->af.v6_req.pktopts) {
+ kfree_skb(req->af.v6_req.pktopts);
+ req->af.v6_req.pktopts = NULL;
+ }
}
static struct or_calltable or_ipv6 = {
@@ -727,14 +763,27 @@ static struct or_calltable or_ipv6 = {
tcp_v6_send_reset
};
+static int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb)
+{
+ struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb;
+
+ if (sk->net_pinfo.af_inet6.rxopt.all) {
+ if ((opt->hop && sk->net_pinfo.af_inet6.rxopt.bits.hopopts) ||
+ (opt->srcrt && sk->net_pinfo.af_inet6.rxopt.bits.srcrt) ||
+ ((opt->dst1 || opt->dst0) && sk->net_pinfo.af_inet6.rxopt.bits.dstopts))
+ return 1;
+ }
+ return 0;
+}
+
+
#define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */
#define BACKLOGMAX(sk) sysctl_max_syn_backlog
/* FIXME: this is substantially similar to the ipv4 code.
* Can some kind of merge be done? -- erics
*/
-static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
- __u32 isn)
+static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, __u32 isn)
{
struct tcp_opt tp;
struct open_request *req;
@@ -747,7 +796,11 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
}
if (skb->protocol == __constant_htons(ETH_P_IP))
- return tcp_v4_conn_request(sk, skb, ptr, isn);
+ return tcp_v4_conn_request(sk, skb, isn);
+
+ /* FIXME: do the same check for anycast */
+ if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr))
+ goto drop;
if (isn == 0)
isn = tcp_v6_init_sequence(sk,skb);
@@ -756,8 +809,9 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
* There are no SYN attacks on IPv6, yet...
*/
if (BACKLOG(sk) >= BACKLOGMAX(sk)) {
- printk(KERN_DEBUG "droping syn ack:%d max:%d\n",
- BACKLOG(sk), BACKLOGMAX(sk));
+ (void)(net_ratelimit() &&
+ printk(KERN_INFO "dropping syn ack:%d max:%d\n",
+ BACKLOG(sk), BACKLOGMAX(sk)));
goto drop;
}
@@ -773,13 +827,16 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
req->rcv_isn = TCP_SKB_CB(skb)->seq;
req->snt_isn = isn;
tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
- tp.in_mss = 536;
+ tp.mss_clamp = 65535;
tcp_parse_options(NULL, skb->h.th, &tp, 0);
- req->mss = tp.in_mss;
- if (tp.saw_tstamp) {
- req->mss -= TCPOLEN_TSTAMP_ALIGNED;
+ if (tp.mss_clamp == 65535)
+ tp.mss_clamp = 576 - sizeof(struct ipv6hdr) - sizeof(struct iphdr);
+ if (sk->tp_pinfo.af_tcp.user_mss && sk->tp_pinfo.af_tcp.user_mss < tp.mss_clamp)
+ tp.mss_clamp = sk->tp_pinfo.af_tcp.user_mss;
+
+ req->mss = tp.mss_clamp;
+ if (tp.saw_tstamp)
req->ts_recent = tp.rcv_tsval;
- }
req->tstamp_ok = tp.tstamp_ok;
req->sack_ok = tp.sack_ok;
req->snd_wscale = tp.snd_wscale;
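
The clamp computation above condenses to a few lines. Here is a sketch with the constants inlined; mss_clamp() and NO_MSS_SEEN are names of my own, and the fallback mirrors the hunk above in subtracting both the IPv6 and IPv4 header sizes from 576.

#include <stdio.h>

#define NO_MSS_SEEN 65535u

static unsigned mss_clamp(unsigned peer_mss, unsigned user_mss)
{
	unsigned clamp = peer_mss ? peer_mss : NO_MSS_SEEN;

	if (clamp == NO_MSS_SEEN)	/* peer sent no MSS option */
		clamp = 576 - 40 - 20;	/* 576 minus the v6 and v4 header
					   sizes, as in the code above */
	if (user_mss && user_mss < clamp)
		clamp = user_mss;	/* the user limit may only lower it */
	return clamp;
}

int main(void)
{
	printf("%u %u %u\n", mss_clamp(0, 0),		/* 516 */
	       mss_clamp(1460, 0),			/* 1460 */
	       mss_clamp(1460, 512));			/* 512 */
	return 0;
}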
@@ -787,7 +844,11 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
req->rmt_port = skb->h.th->source;
ipv6_addr_copy(&req->af.v6_req.rmt_addr, &skb->nh.ipv6h->saddr);
ipv6_addr_copy(&req->af.v6_req.loc_addr, &skb->nh.ipv6h->daddr);
- req->af.v6_req.opt = NULL; /* FIXME: options */
+ req->af.v6_req.pktopts = NULL;
+ if (ipv6_opt_accepted(sk, skb)) {
+ atomic_inc(&skb->users);
+ req->af.v6_req.pktopts = skb;
+ }
req->af.v6_req.iif = sk->bound_dev_if;
/* So that link locals have meaning */
@@ -804,8 +865,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
tcp_inc_slow_timer(TCP_SLT_SYNACK);
tcp_synq_queue(&sk->tp_pinfo.af_tcp, req);
- sk->data_ready(sk, 0);
-
return 0;
drop:
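
The pktopts retention above is a plain refcount trick: rather than copying the option headers out of the SYN, the request takes an extra reference on the skb and keeps the pointer until tcp_v6_or_free() drops it. A user-space sketch of the same lifetime, with struct buf standing in for sk_buff:

#include <stdio.h>
#include <stdlib.h>

struct buf { int users; char data[64]; };

static struct buf *buf_hold(struct buf *b) { b->users++; return b; }
static void buf_put(struct buf *b)
{
	if (--b->users == 0)
		free(b);
}

int main(void)
{
	struct buf *syn = calloc(1, sizeof(*syn));
	struct buf *pktopts;

	syn->users = 1;			/* the receive path's reference */
	pktopts = buf_hold(syn);	/* req->af.v6_req.pktopts = skb */
	buf_put(syn);			/* receive path finishes with it */
	printf("still held: %d user(s)\n", pktopts->users);
	buf_put(pktopts);		/* tcp_v6_or_free() */
	return 0;
}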
@@ -832,8 +891,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
struct flowi fl;
struct tcp_opt *newtp;
struct sock *newsk;
- int mss;
-
+ struct ipv6_txoptions *opt;
+
if (skb->protocol == __constant_htons(ETH_P_IP)) {
/*
* v6 mapped
@@ -856,21 +915,37 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newsk->tp_pinfo.af_tcp.af_specific = &ipv6_mapped;
newsk->backlog_rcv = tcp_v4_do_rcv;
+ newsk->net_pinfo.af_inet6.pktoptions = NULL;
+ newsk->net_pinfo.af_inet6.opt = NULL;
+
+ /* This is a tricky place. Until this moment the IPv4 tcp
+ code has worked with the IPv6 af_tcp.af_specific.
+ Sync it now.
+ */
+ tcp_sync_mss(newsk, newsk->tp_pinfo.af_tcp.pmtu_cookie);
return newsk;
}
+ opt = sk->net_pinfo.af_inet6.opt;
if (sk->ack_backlog > sk->max_ack_backlog)
- return NULL;
+ goto out;
+
+ if (sk->net_pinfo.af_inet6.rxopt.bits.srcrt == 2 &&
+ opt == NULL && req->af.v6_req.pktopts) {
+ struct inet6_skb_parm *rxopt = (struct inet6_skb_parm *)req->af.v6_req.pktopts->cb;
+ if (rxopt->srcrt)
+ opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(req->af.v6_req.pktopts->nh.raw+rxopt->srcrt));
+ }
if (dst == NULL) {
- /*
- * options / mss / route cache
- */
-
fl.proto = IPPROTO_TCP;
fl.nl_u.ip6_u.daddr = &req->af.v6_req.rmt_addr;
+ if (opt && opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
+ fl.nl_u.ip6_u.daddr = rt0->addr;
+ }
fl.nl_u.ip6_u.saddr = &req->af.v6_req.loc_addr;
fl.oif = sk->bound_dev_if;
fl.uli_u.ports.dport = req->rmt_port;
@@ -879,22 +954,17 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
dst = ip6_route_output(sk, &fl);
}
- if (dst->error || dst->pmtu < 576)
+ if (dst->error)
goto out;
-
+
sk->tp_pinfo.af_tcp.syn_backlog--;
sk->ack_backlog++;
- mss = dst->pmtu - sizeof(struct ipv6hdr);
-#if 0
- /* Adjust mss by option size */
-#endif
-
- newsk = tcp_create_openreq_child(sk, req, skb, mss);
+ newsk = tcp_create_openreq_child(sk, req, skb);
if (newsk == NULL)
goto out;
- ip6_dst_store(newsk, dst);
+ ip6_dst_store(newsk, dst, NULL);
newtp = &(newsk->tp_pinfo.af_tcp);
@@ -903,18 +973,55 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
ipv6_addr_copy(&np->saddr, &req->af.v6_req.loc_addr);
ipv6_addr_copy(&np->rcv_saddr, &req->af.v6_req.loc_addr);
newsk->bound_dev_if = req->af.v6_req.iif;
- newsk->mtu = dst->pmtu;
+
+ /* Now IPv6 options...
+
+ First: no IPv4 options.
+ */
newsk->opt = NULL;
+ /* Clone RX bits */
+ np->rxopt.all = sk->net_pinfo.af_inet6.rxopt.all;
+
+ /* Clone pktoptions received with SYN */
+ np->pktoptions = req->af.v6_req.pktopts;
+ if (np->pktoptions)
+ atomic_inc(&np->pktoptions->users);
+ np->opt = NULL;
+
+ /* Clone native IPv6 options from listening socket (if any)
+
+ Yes, keeping a reference count would be much more clever,
+ but we do one more thing here: we reattach the optmem
+ to newsk.
+ */
+ if (opt) {
+ np->opt = ipv6_dup_options(newsk, opt);
+ if (opt != sk->net_pinfo.af_inet6.opt)
+ sock_kfree_s(sk, opt, opt->tot_len);
+ }
+
+ newtp->ext_header_len = 0;
+ if (np->opt)
+ newtp->ext_header_len = np->opt->opt_nflen + np->opt->opt_flen;
+
+ tcp_sync_mss(newsk, dst->pmtu);
+ newtp->rcv_mss = newtp->mss_clamp;
+
newsk->daddr = LOOPBACK4_IPV6;
newsk->saddr = LOOPBACK4_IPV6;
newsk->rcv_saddr= LOOPBACK4_IPV6;
newsk->prot->hash(newsk);
add_to_prot_sklist(newsk);
+
+ sk->data_ready(sk, 0); /* Deliver SIGIO */
+
return newsk;
out:
+ if (opt && opt != sk->net_pinfo.af_inet6.opt)
+ sock_kfree_s(sk, opt, opt->tot_len);
dst_release(dst);
return NULL;
}
@@ -1020,8 +1127,8 @@ static void tcp_v6_rst_req(struct sock *sk, struct sk_buff *skb)
if (!req)
return;
/* Sequence number check required by RFC793 */
- if (before(TCP_SKB_CB(skb)->seq, req->snt_isn) ||
- after(TCP_SKB_CB(skb)->seq, req->snt_isn+1))
+ if (before(TCP_SKB_CB(skb)->seq, req->rcv_isn) ||
+ after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
return;
if(req->sk)
sk->ack_backlog--;
@@ -1055,7 +1162,7 @@ static inline struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
}
#if 0 /*def CONFIG_SYN_COOKIES */
else {
- sk = cookie_v6_check(sk, skb, (struct ipv6_options *) skb->cb);
+ sk = cookie_v6_check(sk, skb);
}
#endif
}
@@ -1064,6 +1171,8 @@ static inline struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
{
+ int users = 0;
+
/* Imagine: socket is IPv6. IPv4 packet arrives,
goes to IPv4 receive handler and backlogged.
From backlog it always goes here. Kerboom...
@@ -1080,6 +1189,8 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
* is currently called with bh processing disabled.
*/
+ ipv6_statistics.Ip6InDelivers++;
+
/* XXX We need to think more about socket locking
* XXX wrt. backlog queues, __release_sock(), etc. -DaveM
*/
@@ -1092,9 +1203,29 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
*/
skb_set_owner_r(skb, sk);
+ /* Do Stevens' IPV6_PKTOPTIONS.
+
+ Yes, guys, it is the only place in our code where we
+ can do this without affecting IPv4.
+ The rest of the code is protocol independent,
+ and I do not like the idea of uglifying IPv4.
+
+ Actually, the whole idea behind IPV6_PKTOPTIONS
+ does not look very well thought out. For now we latch
+ the options received in the last packet enqueued
+ by tcp. Feel free to propose a better solution.
+ --ANK (980728)
+ */
+ if (sk->net_pinfo.af_inet6.rxopt.all) {
+ users = atomic_read(&skb->users);
+ atomic_inc(&skb->users);
+ }
+
if (sk->state == TCP_ESTABLISHED) { /* Fast path */
if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
goto reset;
+ if (users)
+ goto ipv6_pktoptions;
release_sock(sk);
return 0;
}
@@ -1110,26 +1241,60 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
sk = nsk;
}
- if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->cb, skb->len))
+ if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
goto reset;
+ if (users)
+ goto ipv6_pktoptions;
release_sock(sk);
return 0;
reset:
tcp_v6_send_reset(skb);
discard:
+ if (users)
+ kfree_skb(skb);
kfree_skb(skb);
release_sock(sk);
return 0;
+
+ipv6_pktoptions:
+ /* What is this, you ask?
+
+ 1. skb was enqueued by tcp.
+ 2. skb is added to the tail of the read queue, not out of order.
+ 3. the socket is not in a passive state.
+ 4. Finally, it really contains options, which the user wants to receive.
+ */
+ if (atomic_read(&skb->users) > users &&
+ TCP_SKB_CB(skb)->end_seq == sk->tp_pinfo.af_tcp.rcv_nxt &&
+ !((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))) {
+ if (ipv6_opt_accepted(sk, skb)) {
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+ kfree_skb(skb);
+ skb = NULL;
+ if (skb2) {
+ skb_set_owner_r(skb2, sk);
+ skb = xchg(&sk->net_pinfo.af_inet6.pktoptions, skb2);
+ }
+ } else {
+ kfree_skb(skb);
+ skb = xchg(&sk->net_pinfo.af_inet6.pktoptions, NULL);
+ }
+ }
+
+ if (skb)
+ kfree_skb(skb);
+ release_sock(sk);
+ return 0;
}
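
From user space the latched options come back through getsockopt(IPV6_PKTOPTIONS), which on this kernel takes a struct msghdr whose msg_control buffer receives the cmsgs. A hedged sketch follows; the fallback value 6 matches this era's headers (the option was later renamed IPV6_2292PKTOPTIONS), and it assumes the corresponding receive flags were already enabled on the socket.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef IPV6_PKTOPTIONS
#define IPV6_PKTOPTIONS 6	/* value in this kernel's headers */
#endif

int dump_latched_options(int tcp_fd)
{
	char cbuf[1024];
	struct msghdr msg;
	struct cmsghdr *cm;
	socklen_t len = sizeof(cbuf);

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	/* TCP has no per-segment ancillary data, so the kernel hands out
	 * whatever was latched from the last in-order segment. */
	if (getsockopt(tcp_fd, IPPROTO_IPV6, IPV6_PKTOPTIONS, &msg, &len) < 0) {
		perror("IPV6_PKTOPTIONS");
		return -1;
	}
	msg.msg_controllen = len;	/* bytes actually written */
	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
		printf("cmsg level %d type %d len %d\n",
		       cm->cmsg_level, cm->cmsg_type, (int)cm->cmsg_len);
	return 0;
}

Call it on a connected v6 TCP socket after at least one segment carrying the options has been queued.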
-int tcp_v6_rcv(struct sk_buff *skb, struct device *dev,
- struct in6_addr *saddr, struct in6_addr *daddr,
- struct ipv6_options *opt, unsigned short len,
- int redo, struct inet6_protocol *protocol)
+int tcp_v6_rcv(struct sk_buff *skb, unsigned long len)
{
struct tcphdr *th;
struct sock *sk;
+ struct device *dev = skb->dev;
+ struct in6_addr *saddr = &skb->nh.ipv6h->saddr;
+ struct in6_addr *daddr = &skb->nh.ipv6h->daddr;
th = skb->h.th;
@@ -1178,7 +1343,7 @@ int tcp_v6_rcv(struct sk_buff *skb, struct device *dev,
if(sk->state == TCP_TIME_WAIT)
goto do_time_wait;
- if (!sk->sock_readers)
+ if (!atomic_read(&sk->sock_readers))
return tcp_v6_do_rcv(sk, skb);
__skb_queue_tail(&sk->back_log, skb);
@@ -1198,7 +1363,7 @@ discard_it:
do_time_wait:
if(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
- skb, th, &(IPCB(skb)->opt), skb->len))
+ skb, th, skb->len))
goto no_tcp_socket;
goto discard_it;
}
@@ -1221,6 +1386,12 @@ static int tcp_v6_rebuild_header(struct sock *sk)
fl.uli_u.ports.dport = sk->dport;
fl.uli_u.ports.sport = sk->sport;
+ if (np->opt && np->opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
+ fl.nl_u.ip6_u.daddr = rt0->addr;
+ }
+
+
dst = ip6_route_output(sk, &fl);
if (dst->error) {
@@ -1228,7 +1399,7 @@ static int tcp_v6_rebuild_header(struct sock *sk)
return dst->error;
}
- ip6_dst_store(sk, dst);
+ ip6_dst_store(sk, dst, NULL);
}
return dst->error;
@@ -1258,6 +1429,11 @@ static void tcp_v6_xmit(struct sk_buff *skb)
fl.uli_u.ports.sport = sk->sport;
fl.uli_u.ports.dport = sk->dport;
+ if (np->opt && np->opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
+ fl.nl_u.ip6_u.daddr = rt0->addr;
+ }
+
if (sk->dst_cache)
dst = dst_check(&sk->dst_cache, np->dst_cookie);
@@ -1270,11 +1446,14 @@ static void tcp_v6_xmit(struct sk_buff *skb)
return;
}
- ip6_dst_store(sk, dst);
+ ip6_dst_store(sk, dst, NULL);
}
skb->dst = dst_clone(dst);
+ /* Restore final destination back after routing done */
+ fl.nl_u.ip6_u.daddr = &np->daddr;
+
ip6_xmit(sk, skb, &fl, np->opt);
}
@@ -1295,6 +1474,8 @@ static struct tcp_func ipv6_specific = {
tcp_v6_conn_request,
tcp_v6_syn_recv_sock,
tcp_v6_get_sock,
+ sizeof(struct ipv6hdr),
+
ipv6_setsockopt,
ipv6_getsockopt,
v6_addr2sockaddr,
@@ -1312,6 +1493,8 @@ static struct tcp_func ipv6_mapped = {
tcp_v6_conn_request,
tcp_v6_syn_recv_sock,
tcp_v6_get_sock,
+ sizeof(struct iphdr),
+
ipv6_setsockopt,
ipv6_getsockopt,
v6_addr2sockaddr,
@@ -1330,7 +1513,7 @@ static int tcp_v6_init_sock(struct sock *sk)
tp->rto = TCP_TIMEOUT_INIT; /*TCP_WRITE_TIME*/
tp->mdev = TCP_TIMEOUT_INIT;
- tp->in_mss = 536;
+ tp->mss_clamp = ~0;
/* See draft-stevens-tcpca-spec-01 for discussion of the
* initialization of these values.
@@ -1338,17 +1521,17 @@ static int tcp_v6_init_sock(struct sock *sk)
tp->snd_cwnd = (1 << TCP_CWND_SHIFT);
tp->snd_ssthresh = 0x7fffffff;
- sk->priority = 1;
sk->state = TCP_CLOSE;
sk->max_ack_backlog = SOMAXCONN;
- sk->mtu = 576;
- sk->mss = 536;
+ tp->rcv_mss = 536;
/* Init SYN queue. */
tcp_synq_init(tp);
sk->tp_pinfo.af_tcp.af_specific = &ipv6_specific;
+ sk->write_space = tcp_write_space;
+
return 0;
}
@@ -1376,12 +1559,6 @@ static int tcp_v6_destroy_sock(struct sock *sk)
while((skb = __skb_dequeue(&tp->out_of_order_queue)) != NULL)
kfree_skb(skb);
- /*
- * Release destination entry
- */
-
- dst_release(xchg(&sk->dst_cache,NULL));
-
/* Clean up a locked TCP bind bucket, this only happens if a
* port is allocated for a socket, but it never fully connects.
* In which case we will find num to be non-zero and daddr to
@@ -1390,7 +1567,7 @@ static int tcp_v6_destroy_sock(struct sock *sk)
if(ipv6_addr_any(&(sk->net_pinfo.af_inet6.daddr)) && sk->num != 0)
tcp_bucket_unlock(sk);
- return 0;
+ return inet6_destroy_sock(sk);
}
struct proto tcpv6_prot = {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 2dac0570f..bfa701c97 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -7,7 +7,7 @@
*
* Based on linux/ipv4/udp.c
*
- * $Id: udp.c,v 1.31 1998/07/15 05:05:45 davem Exp $
+ * $Id: udp.c,v 1.33 1998/08/27 16:55:20 davem Exp $
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -15,6 +15,7 @@
* 2 of the License, or (at your option) any later version.
*/
+#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
@@ -59,6 +60,14 @@ static int udp_v6_verify_bind(struct sock *sk, unsigned short snum)
if((sk2->num == snum) && (sk2 != sk)) {
unsigned char state = sk2->state;
int sk2_reuse = sk2->reuse;
+
+ /* Two sockets can be bound to the same port if they're
+ * bound to different interfaces.
+ */
+
+ if(sk2->bound_dev_if != sk->bound_dev_if)
+ continue;
+
if(addr_type == IPV6_ADDR_ANY || (!sk2->rcv_saddr)) {
if((!sk2_reuse) ||
(!sk_reuse) ||
@@ -139,7 +148,7 @@ static void udp_v6_rehash(struct sock *sk)
}
static struct sock *udp_v6_lookup(struct in6_addr *saddr, u16 sport,
- struct in6_addr *daddr, u16 dport)
+ struct in6_addr *daddr, u16 dport, int dif)
{
struct sock *sk, *result = NULL;
unsigned short hnum = ntohs(dport);
@@ -166,7 +175,12 @@ static struct sock *udp_v6_lookup(struct in6_addr *saddr, u16 sport,
continue;
score++;
}
- if(score == 3) {
+ if(sk->bound_dev_if) {
+ if(sk->bound_dev_if != dif)
+ continue;
+ score++;
+ }
+ if(score == 4) {
result = sk;
break;
} else if(score > badness) {
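
The scoring walk above generalizes neatly. Here is a stand-alone sketch of the same best-match rule with the new interface test counted in: any bound attribute that mismatches disqualifies, each exact match scores a point, and a full score of 4 ends the walk early. struct usock, its fields and the addresses are illustrative only.

#include <stdio.h>
#include <string.h>

struct usock {
	const char *local_addr, *peer_addr;	/* NULL means wildcard */
	int peer_port, bound_dev_if;		/* 0 means wildcard */
};

static struct usock *best_match(struct usock *tbl, int n,
				const char *saddr, int sport,
				const char *daddr, int dif)
{
	struct usock *result = NULL;
	int badness = -1, i;

	for (i = 0; i < n; i++) {
		struct usock *sk = &tbl[i];
		int score = 0;

		if (sk->peer_addr) {
			if (strcmp(sk->peer_addr, saddr)) continue;
			score++;
		}
		if (sk->local_addr) {
			if (strcmp(sk->local_addr, daddr)) continue;
			score++;
		}
		if (sk->peer_port) {
			if (sk->peer_port != sport) continue;
			score++;
		}
		if (sk->bound_dev_if) {
			if (sk->bound_dev_if != dif) continue;
			score++;
		}
		if (score == 4)
			return sk;	/* fully-specified socket wins */
		if (score > badness) {
			result = sk;
			badness = score;
		}
	}
	return result;
}

int main(void)
{
	struct usock tbl[] = {
		{ NULL, NULL, 0, 0 },			/* wildcard socket */
		{ "fe80::1", "2001::5", 9999, 2 },	/* fully bound */
	};
	struct usock *sk = best_match(tbl, 2, "2001::5", 9999, "fe80::1", 2);

	printf("matched entry %d\n", (int)(sk - tbl));	/* prints 1 */
	return 0;
}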
@@ -257,20 +271,25 @@ ipv4_connected:
*/
fl.proto = IPPROTO_UDP;
- fl.nl_u.ip6_u.daddr = daddr;
+ fl.nl_u.ip6_u.daddr = &np->daddr;
fl.nl_u.ip6_u.saddr = NULL;
fl.oif = sk->bound_dev_if;
fl.uli_u.ports.dport = sk->dport;
fl.uli_u.ports.sport = sk->sport;
+ if (np->opt && np->opt->srcrt) {
+ struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
+ fl.nl_u.ip6_u.daddr = rt0->addr;
+ }
+
dst = ip6_route_output(sk, &fl);
-
+
if (dst->error) {
dst_release(dst);
return dst->error;
}
- ip6_dst_store(sk, dst);
+ ip6_dst_store(sk, dst, fl.nl_u.ip6_u.daddr);
/* get the source address used on the appropriate device */
@@ -291,15 +310,50 @@ ipv4_connected:
static void udpv6_close(struct sock *sk, unsigned long timeout)
{
- lock_sock(sk);
+ /* See for explanation: raw_close in ipv4/raw.c */
sk->state = TCP_CLOSE;
- ipv6_sock_mc_close(sk);
udp_v6_unhash(sk);
sk->dead = 1;
- release_sock(sk);
destroy_sock(sk);
}
+#ifdef CONFIG_FILTER
+#undef CONFIG_UDP_DELAY_CSUM
+#endif
+
+#ifdef CONFIG_UDP_DELAY_CSUM
+
+/* Please, read comments in net/checksum.h, asm/checksum.h
+
+ I commented out csum_partial_copy_to_user there because it did not
+ verify_area. Now I even wonder how clever I was back then 8)8)
+ Had I not done it, I would have stepped into this hole again. --ANK
+ */
+
+#ifndef _HAVE_ARCH_COPY_AND_CSUM_TO_USER
+#if defined(__i386__)
+static __inline__
+unsigned int csum_and_copy_to_user (const char *src, char *dst,
+ int len, int sum, int *err_ptr)
+{
+ int *src_err_ptr=NULL;
+
+ if (verify_area(VERIFY_WRITE, dst, len) == 0)
+ return csum_partial_copy_generic(src, dst, len, sum, src_err_ptr, err_ptr);
+
+ if (len)
+ *err_ptr = -EFAULT;
+
+ return sum;
+}
+#elif defined(__sparc__)
+#define csum_and_copy_to_user csum_partial_copy_to_user
+#else
+#undef CONFIG_UDP_DELAY_CSUM
+#endif
+#endif
+#endif
+
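
The point of CONFIG_UDP_DELAY_CSUM is to fold the checksum into the copy: accumulate the 32-bit ones-complement sum while moving the bytes, and fold it to 16 bits once at the end. A generic-C sketch of that idea; the real kernel dispatches to the arch-optimized csum_partial_copy_generic instead.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

static uint32_t csum_partial_copy(const uint8_t *src, uint8_t *dst,
				  size_t len, uint32_t sum)
{
	size_t i;

	for (i = 0; i + 1 < len; i += 2) {
		dst[i] = src[i];
		dst[i + 1] = src[i + 1];
		sum += (uint32_t)src[i] << 8 | src[i + 1];
	}
	if (i < len) {			/* trailing odd byte */
		dst[i] = src[i];
		sum += (uint32_t)src[i] << 8;
	}
	return sum;
}

static uint16_t csum_fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint8_t in[] = "delayed checksum", out[sizeof(in)];
	uint32_t sum = csum_partial_copy(in, out, sizeof(in), 0);

	printf("folded: 0x%04x, copy ok: %d\n",
	       csum_fold(sum), !memcmp(in, out, sizeof(in)));
	return 0;
}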
/*
* This should be easy, if there is something there we
* return it, otherwise we block.
@@ -322,12 +376,12 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len,
* From here the generic datagram does a lot of the work. Come
* the finished NET3, it will do _ALL_ the work!
*/
-
+
skb = skb_recv_datagram(sk, flags, noblock, &err);
if (!skb)
goto out;
- copied = ntohs(((struct udphdr *)skb->h.raw)->len) - sizeof(struct udphdr);
+ copied = skb->len - sizeof(struct udphdr);
if (copied > len) {
copied = len;
msg->msg_flags |= MSG_TRUNC;
@@ -337,8 +391,41 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len,
* FIXME : should use udp header size info value
*/
+#ifndef CONFIG_UDP_DELAY_CSUM
err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
msg->msg_iov, copied);
+#else
+ if (sk->no_check || skb->ip_summed==CHECKSUM_UNNECESSARY) {
+ err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
+ copied);
+ } else if (copied > msg->msg_iov[0].iov_len || (msg->msg_flags&MSG_TRUNC)) {
+ if (csum_fold(csum_partial(skb->h.raw, ntohs(skb->h.uh->len), skb->csum))) {
+ /* Error for blocking case is chosen to masquerade
+ as some normal condition.
+ */
+ err = (msg->msg_flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH;
+ udp_stats_in6.UdpInErrors++;
+ goto out_free;
+ }
+ err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
+ copied);
+ } else {
+ unsigned int csum = csum_partial(skb->h.raw, sizeof(struct udphdr), skb->csum);
+
+ err = 0;
+ csum = csum_and_copy_to_user((char*)&skb->h.uh[1], msg->msg_iov[0].iov_base, copied, csum, &err);
+ if (err)
+ goto out_free;
+ if (csum_fold(csum)) {
+ /* Error for blocking case is chosen to masquerade
+ as some normal condition.
+ */
+ err = (msg->msg_flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH;
+ udp_stats_in6.UdpInErrors++;
+ goto out_free;
+ }
+ }
+#endif
if (err)
goto out_free;
@@ -361,7 +448,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len,
memcpy(&sin6->sin6_addr, &skb->nh.ipv6h->saddr,
sizeof(struct in6_addr));
- if (msg->msg_controllen)
+ if (sk->net_pinfo.af_inet6.rxopt.all)
datagram_recv_ctl(sk, msg, skb);
}
}
@@ -373,20 +460,27 @@ out:
return err;
}
-void udpv6_err(struct sk_buff *skb, int type, int code, unsigned char *buff, __u32 info,
- struct in6_addr *saddr, struct in6_addr *daddr,
- struct inet6_protocol *protocol)
+void udpv6_err(struct sk_buff *skb, struct ipv6hdr *hdr,
+ struct inet6_skb_parm *opt,
+ int type, int code, unsigned char *buff, __u32 info)
{
+ struct device *dev = skb->dev;
+ struct in6_addr *saddr = &hdr->saddr;
+ struct in6_addr *daddr = &hdr->daddr;
struct sock *sk;
struct udphdr *uh;
int err;
-
+
+ if (buff + sizeof(struct udphdr) > skb->tail)
+ return;
+
uh = (struct udphdr *) buff;
- sk = udp_v6_lookup(daddr, uh->dest, saddr, uh->source);
+ sk = udp_v6_lookup(daddr, uh->dest, saddr, uh->source, dev->ifindex);
if (sk == NULL) {
- printk(KERN_DEBUG "icmp for unknown sock\n");
+ if (net_ratelimit())
+ printk(KERN_DEBUG "icmp for unknown sock\n");
return;
}
@@ -407,11 +501,10 @@ static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
if (sock_queue_rcv_skb(sk,skb)<0) {
udp_stats_in6.UdpInErrors++;
ipv6_statistics.Ip6InDiscards++;
- ipv6_statistics.Ip6InDelivers--;
- skb->sk = NULL;
kfree_skb(skb);
return 0;
}
+ ipv6_statistics.Ip6InDelivers++;
udp_stats_in6.UdpInDatagrams++;
return 0;
}
@@ -430,7 +523,8 @@ static __inline__ int inet6_mc_check(struct sock *sk, struct in6_addr *addr)
static struct sock *udp_v6_mcast_next(struct sock *sk,
u16 loc_port, struct in6_addr *loc_addr,
- u16 rmt_port, struct in6_addr *rmt_addr)
+ u16 rmt_port, struct in6_addr *rmt_addr,
+ int dif)
{
struct sock *s = sk;
unsigned short num = ntohs(loc_port);
@@ -446,6 +540,9 @@ static struct sock *udp_v6_mcast_next(struct sock *sk,
ipv6_addr_cmp(&np->daddr, rmt_addr))
continue;
+ if (s->bound_dev_if && s->bound_dev_if != dif)
+ continue;
+
if(!ipv6_addr_any(&np->rcv_saddr)) {
if(ipv6_addr_cmp(&np->rcv_saddr, loc_addr) == 0)
return s;
@@ -468,16 +565,18 @@ static void udpv6_mcast_deliver(struct udphdr *uh,
{
struct sock *sk, *sk2;
struct sk_buff *buff;
+ int dif;
sk = udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)];
- sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr);
+ dif = skb->dev->ifindex;
+ sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
if (!sk)
goto free_skb;
buff = NULL;
sk2 = sk;
while((sk2 = udp_v6_mcast_next(sk2->next, uh->dest, saddr,
- uh->source, daddr))) {
+ uh->source, daddr, dif))) {
if (!buff) {
buff = skb_clone(skb, GFP_ATOMIC);
if (!buff)
@@ -486,59 +585,70 @@ static void udpv6_mcast_deliver(struct udphdr *uh,
if (sock_queue_rcv_skb(sk2, buff) >= 0)
buff = NULL;
}
- if (buff) {
- buff->sk = NULL;
+ if (buff)
kfree_skb(buff);
- }
if (sock_queue_rcv_skb(sk, skb) < 0) {
- free_skb:
- skb->sk = NULL;
+free_skb:
kfree_skb(skb);
}
}
-int udpv6_rcv(struct sk_buff *skb, struct device *dev,
- struct in6_addr *saddr, struct in6_addr *daddr,
- struct ipv6_options *opt, unsigned short len,
- int redo, struct inet6_protocol *protocol)
+int udpv6_rcv(struct sk_buff *skb, unsigned long len)
{
struct sock *sk;
struct udphdr *uh;
- int ulen;
-
- /*
- * check if the address is ours...
- * I believe that this is being done in IP layer
- */
+ struct device *dev = skb->dev;
+ struct in6_addr *saddr = &skb->nh.ipv6h->saddr;
+ struct in6_addr *daddr = &skb->nh.ipv6h->daddr;
+ u32 ulen;
- uh = (struct udphdr *) skb->h.uh;
-
- ipv6_statistics.Ip6InDelivers++;
+ uh = skb->h.uh;
+ __skb_pull(skb, skb->h.raw - skb->data);
ulen = ntohs(uh->len);
-
+
+ /* Check for jumbo payload */
+ if (ulen == 0 && skb->nh.ipv6h->payload_len == 0)
+ ulen = len;
+
if (ulen > len || len < sizeof(*uh)) {
- printk(KERN_DEBUG "UDP: short packet: %d/%d\n", ulen, len);
+ if (net_ratelimit())
+ printk(KERN_DEBUG "UDP: short packet: %d/%ld\n", ulen, len);
udp_stats_in6.UdpInErrors++;
kfree_skb(skb);
return(0);
}
if (uh->check == 0) {
- printk(KERN_DEBUG "IPv6: udp checksum is 0\n");
+ /* IPv6 draft-v2 section 8.1 says that we SHOULD log
+ this error. Well, it is reasonable.
+ */
+ if (net_ratelimit())
+ printk(KERN_INFO "IPv6: udp checksum is 0\n");
goto discard;
}
+ skb_trim(skb, ulen);
+
+#ifndef CONFIG_UDP_DELAY_CSUM
switch (skb->ip_summed) {
case CHECKSUM_NONE:
- skb->csum = csum_partial((char*)uh, len, 0);
+ skb->csum = csum_partial((char*)uh, ulen, 0);
case CHECKSUM_HW:
- if (csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, skb->csum)) {
+ if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) {
printk(KERN_DEBUG "IPv6: udp checksum error\n");
goto discard;
}
};
-
+#else
+ if (skb->ip_summed==CHECKSUM_HW) {
+ if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum))
+ goto discard;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ } else if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+ skb->csum = ~csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, 0);
+#endif
+
len = ulen;
/*
@@ -555,10 +665,16 @@ int udpv6_rcv(struct sk_buff *skb, struct device *dev,
* check socket cache ... must talk to Alan about his plans
* for sock caches... i'll skip this for now.
*/
-
- sk = udp_v6_lookup(saddr, uh->source, daddr, uh->dest);
-
+
+ sk = udp_v6_lookup(saddr, uh->source, daddr, uh->dest, dev->ifindex);
+
if (sk == NULL) {
+#ifdef CONFIG_UDP_DELAY_CSUM
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
+ csum_fold(csum_partial((char*)uh, len, skb->csum)))
+ goto discard;
+#endif
+
udp_stats_in6.UdpNoPorts++;
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, dev);
@@ -566,16 +682,13 @@ int udpv6_rcv(struct sk_buff *skb, struct device *dev,
kfree_skb(skb);
return(0);
}
-
+
/* deliver */
-
- if (sk->sock_readers)
- __skb_queue_tail(&sk->back_log, skb);
- else
- udpv6_queue_rcv_skb(sk, skb);
+
+ udpv6_queue_rcv_skb(sk, skb);
return(0);
-
+
discard:
udp_stats_in6.UdpInErrors++;
kfree_skb(skb);
@@ -618,7 +731,7 @@ static int udpv6_getfrag(const void *data, struct in6_addr *addr,
}
if (csum_partial_copy_fromiovecend(dst, udh->iov, offset,
- clen, &udh->wcheck))
+ clen, &udh->wcheck))
return -EFAULT;
if (final) {
@@ -649,11 +762,11 @@ static int udpv6_getfrag(const void *data, struct in6_addr *addr,
static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen)
{
- struct ipv6_options opt_space;
+ struct ipv6_txoptions opt_space;
struct udpv6fakehdr udh;
struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6;
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) msg->msg_name;
- struct ipv6_options *opt = NULL;
+ struct ipv6_txoptions *opt = NULL;
struct flowi fl;
int addr_len = msg->msg_namelen;
struct in6_addr *daddr;
@@ -661,22 +774,18 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen)
int len = ulen + sizeof(struct udphdr);
int addr_type;
int hlimit = -1;
-
+
int err;
/* Rough check on arithmetic overflow,
better check is made in ip6_build_xmit
-
- When jumbo header will be implemeted we will change it
- to something sort of (len will be size_t)
- ulen > SIZE_T_MAX - sizeof(struct udphdr)
- */
- if (ulen < 0 || ulen > 0xFFFF - sizeof(struct udphdr))
+ */
+ if (ulen < 0 || ulen > INT_MAX - sizeof(struct udphdr))
return -EMSGSIZE;
-
+
if (msg->msg_flags & ~(MSG_DONTROUTE|MSG_DONTWAIT))
return(-EINVAL);
-
+
if (sin6) {
if (sin6->sin6_family == AF_INET)
return udp_sendmsg(sk, msg, ulen);
@@ -692,14 +801,6 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen)
udh.uh.dest = sin6->sin6_port;
daddr = &sin6->sin6_addr;
-
- /* BUGGGG! If route is not cloned, this check always
- fails, hence dst_cache only slows down transmission --ANK
- */
- if (sk->dst_cache && ipv6_addr_cmp(daddr, &np->daddr)) {
- dst_release(sk->dst_cache);
- sk->dst_cache = NULL;
- }
} else {
if (sk->state != TCP_ESTABLISHED)
return(-ENOTCONN);
@@ -707,9 +808,9 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen)
udh.uh.dest = sk->dport;
daddr = &sk->net_pinfo.af_inet6.daddr;
}
-
+
addr_type = ipv6_addr_type(daddr);
-
+
if (addr_type == IPV6_ADDR_MAPPED) {
struct sockaddr_in sin;
@@ -720,24 +821,25 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen)
return udp_sendmsg(sk, msg, ulen);
}
-
+
udh.daddr = NULL;
fl.oif = sk->bound_dev_if;
if (msg->msg_controllen) {
opt = &opt_space;
- memset(opt, 0, sizeof(struct ipv6_options));
+ memset(opt, 0, sizeof(struct ipv6_txoptions));
err = datagram_send_ctl(msg, &fl.oif, &saddr, opt, &hlimit);
if (err < 0)
return err;
-
- if (opt->srcrt)
- udh.daddr = daddr;
}
-
+ if (opt == NULL || !(opt->opt_nflen|opt->opt_flen))
+ opt = np->opt;
+ if (opt && opt->srcrt)
+ udh.daddr = daddr;
+
udh.uh.source = sk->sport;
- udh.uh.len = htons(len);
+ udh.uh.len = len < 0x10000 ? htons(len) : 0;
udh.uh.check = 0;
udh.iov = msg->msg_iov;
udh.wcheck = 0;
@@ -783,7 +885,7 @@ struct proto udpv6_prot = {
datagram_poll, /* poll */
udp_ioctl, /* ioctl */
NULL, /* init */
- NULL, /* destroy */
+ inet6_destroy_sock, /* destroy */
NULL, /* shutdown */
ipv6_setsockopt, /* setsockopt */
ipv6_getsockopt, /* getsockopt */
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 85aaaa7b8..0db8e06ef 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -1749,7 +1749,6 @@ static int ipx_create(struct socket *sock, int protocol)
sock_init_data(sock, sk);
sk->destruct = NULL;
- sk->mtu = IPX_MTU;
sk->no_check = 1; /* Checksum off by default */
MOD_INC_USE_COUNT;
diff --git a/net/ipx/af_spx.c b/net/ipx/af_spx.c
index fb2cffeab..5b95a7e67 100644
--- a/net/ipx/af_spx.c
+++ b/net/ipx/af_spx.c
@@ -103,7 +103,6 @@ static int spx_create(struct socket *sock, int protocol)
spx_sock_init(sk);
sk->data_ready = spx_rcv;
sk->destruct = NULL;
- sk->mtu = IPX_MTU;
sk->no_check = 1;
MOD_INC_USE_COUNT;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index c57d793c0..de104813e 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -98,7 +98,7 @@ static __inline__ int netlink_locked(struct sock *sk)
return atomic_read(&sk->protinfo.af_netlink.locks);
}
-static __inline__ struct sock *netlink_lookup(int protocol, pid_t pid)
+static __inline__ struct sock *netlink_lookup(int protocol, u32 pid)
{
struct sock *sk;
@@ -116,10 +116,8 @@ extern struct proto_ops netlink_ops;
static void netlink_insert(struct sock *sk)
{
- cli();
sk->next = nl_table[sk->protocol];
nl_table[sk->protocol] = sk;
- sti();
}
static void netlink_remove(struct sock *sk)
@@ -154,26 +152,10 @@ static int netlink_create(struct socket *sock, int protocol)
sock_init_data(sock,sk);
sk->destruct = NULL;
- sk->mtu=4096;
sk->protocol=protocol;
return 0;
}
-static void netlink_destroy_timer(unsigned long data)
-{
- struct sock *sk=(struct sock *)data;
-
- if (!netlink_locked(sk) && !atomic_read(&sk->wmem_alloc)
- && !atomic_read(&sk->rmem_alloc)) {
- sk_free(sk);
- return;
- }
-
- sk->timer.expires=jiffies+10*HZ;
- add_timer(&sk->timer);
- printk(KERN_DEBUG "netlink sk destroy delayed\n");
-}
-
static int netlink_release(struct socket *sock, struct socket *peer)
{
struct sock *sk = sock->sk;
@@ -223,11 +205,7 @@ static int netlink_release(struct socket *sock, struct socket *peer)
}
if (atomic_read(&sk->rmem_alloc) || atomic_read(&sk->wmem_alloc)) {
- sk->timer.data=(unsigned long)sk;
- sk->timer.expires=jiffies+HZ;
- sk->timer.function=netlink_destroy_timer;
- add_timer(&sk->timer);
- printk(KERN_DEBUG "impossible 333\n");
+ printk(KERN_DEBUG "netlink_release: impossible event. Please, report.\n");
return 0;
}
@@ -270,7 +248,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len
return -EINVAL;
/* Only superuser is allowed to listen multicasts */
- if (nladdr->nl_groups && !suser())
+ if (nladdr->nl_groups && !capable(CAP_NET_ADMIN))
return -EPERM;
if (sk->protinfo.af_netlink.pid) {
@@ -315,7 +293,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr,
return -EINVAL;
/* Only superuser is allowed to send multicasts */
- if (!suser() && nladdr->nl_groups)
+ if (nladdr->nl_groups && !capable(CAP_NET_ADMIN))
return -EPERM;
sk->protinfo.af_netlink.dst_pid = nladdr->nl_pid;
@@ -344,11 +322,12 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, int *addr
return 0;
}
-int netlink_unicast(struct sock *ssk, struct sk_buff *skb, pid_t pid, int nonblock)
+int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock)
{
struct sock *sk;
int len = skb->len;
int protocol = ssk->protocol;
+ struct wait_queue wait = { current, NULL };
retry:
for (sk = nl_table[protocol]; sk; sk = sk->next) {
@@ -366,17 +345,23 @@ retry:
}
#endif
- cli();
+ if (!nonblock) {
+ add_wait_queue(sk->sleep, &wait);
+ current->state = TASK_INTERRUPTIBLE;
+ }
+
if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) {
if (nonblock) {
- sti();
netlink_unlock(sk);
kfree_skb(skb);
return -EAGAIN;
}
- interruptible_sleep_on(sk->sleep);
+
+ schedule();
+
+ current->state = TASK_RUNNING;
+ remove_wait_queue(sk->sleep, &wait);
netlink_unlock(sk);
- sti();
if (signal_pending(current)) {
kfree_skb(skb);
@@ -384,8 +369,12 @@ retry:
}
goto retry;
}
- sti();
-Nprintk("unicast_deliver %d\n", skb->len);
+
+ if (!nonblock) {
+ current->state = TASK_RUNNING;
+ remove_wait_queue(sk->sleep, &wait);
+ }
+
skb_orphan(skb);
skb_set_owner_r(skb, sk);
skb_queue_tail(&sk->receive_queue, skb);
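
The rewritten sleep above follows the standard lost-wakeup-safe idiom: get on the wait queue and mark the task sleeping before re-testing the condition, so a wakeup arriving between the test and schedule() cannot be missed. A compilable sketch with no-op stand-ins for the kernel primitives:

#include <stdio.h>

enum { TASK_RUNNING, TASK_INTERRUPTIBLE };

static int  task_state;
static void add_wait_queue(void)    { /* link onto sk->sleep */ }
static void remove_wait_queue(void) { /* unlink from sk->sleep */ }
static void schedule(void)          { /* would sleep until a wakeup */ }
static int  rcvbuf_full(void)       { return 0; }
static int  signal_pending(void)    { return 0; }

static int wait_for_room(int nonblock)
{
retry:
	if (!nonblock) {
		add_wait_queue();
		task_state = TASK_INTERRUPTIBLE;	/* before the test! */
	}
	if (rcvbuf_full()) {
		if (nonblock)
			return -1;	/* -EAGAIN in the kernel */
		schedule();		/* a wakeup resets our state */
		task_state = TASK_RUNNING;
		remove_wait_queue();
		if (signal_pending())
			return -2;	/* -ERESTARTSYS */
		goto retry;
	}
	if (!nonblock) {
		task_state = TASK_RUNNING;
		remove_wait_queue();
	}
	return 0;
}

int main(void)
{
	printf("%d\n", wait_for_room(0));
	return 0;
}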
@@ -417,8 +406,8 @@ Nprintk("broadcast_deliver %d\n", skb->len);
return -1;
}
-void netlink_broadcast(struct sock *ssk, struct sk_buff *skb, pid_t pid,
- unsigned group, int allocation)
+void netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
+ u32 group, int allocation)
{
struct sock *sk;
struct sk_buff *skb2 = NULL;
@@ -472,7 +461,7 @@ void netlink_broadcast(struct sock *ssk, struct sk_buff *skb, pid_t pid,
kfree_skb(skb);
}
-void netlink_set_err(struct sock *ssk, pid_t pid, unsigned group, int code)
+void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code)
{
struct sock *sk;
int protocol = ssk->protocol;
@@ -496,34 +485,28 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, int len,
{
struct sock *sk = sock->sk;
struct sockaddr_nl *addr=msg->msg_name;
- pid_t dst_pid;
- unsigned dst_groups;
+ u32 dst_pid;
+ u32 dst_groups;
struct sk_buff *skb;
- int err;
if (msg->msg_flags&MSG_OOB)
return -EOPNOTSUPP;
- if (msg->msg_flags&~MSG_DONTWAIT) {
- printk("1 %08x\n", msg->msg_flags);
+ if (msg->msg_flags&~(MSG_DONTWAIT|MSG_NOSIGNAL|MSG_ERRQUEUE))
return -EINVAL;
- }
if (msg->msg_namelen) {
- if (addr->nl_family != AF_NETLINK) {
- printk("2 %08x\n", addr->nl_family);
+ if (addr->nl_family != AF_NETLINK)
return -EINVAL;
- }
dst_pid = addr->nl_pid;
dst_groups = addr->nl_groups;
- if (dst_groups && !suser())
+ if (dst_groups && !capable(CAP_NET_ADMIN))
return -EPERM;
} else {
dst_pid = sk->protinfo.af_netlink.dst_pid;
dst_groups = sk->protinfo.af_netlink.dst_groups;
}
-
if (!sk->protinfo.af_netlink.pid)
netlink_autobind(sock);
@@ -536,17 +519,24 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, int len,
NETLINK_CB(skb).dst_pid = dst_pid;
NETLINK_CB(skb).dst_groups = dst_groups;
memcpy(NETLINK_CREDS(skb), &scm->creds, sizeof(struct ucred));
- memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
+
+ /* What can I do? Netlink is asynchronous, so
+ we have to save the current capabilities to
+ check them when this message is delivered
+ to the corresponding kernel module. --ANK (980802)
+ */
+ NETLINK_CB(skb).eff_cap = current->cap_effective;
+
+ if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len)) {
+ kfree_skb(skb);
+ return -EFAULT;
+ }
if (dst_groups) {
atomic_inc(&skb->users);
netlink_broadcast(sk, skb, dst_pid, dst_groups, GFP_KERNEL);
}
- err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT);
- if (err < 0) {
- printk("3\n");
- }
- return err;
+ return netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT);
}
static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, int len,
@@ -594,7 +584,7 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, int len,
if (sk->protinfo.af_netlink.cb
&& atomic_read(&sk->rmem_alloc) <= sk->rcvbuf/2)
netlink_dump(sk);
- return err ? err : copied;
+ return err ? : copied;
}
/*
@@ -651,11 +641,11 @@ static int netlink_dump(struct sock *sk)
skb = sock_rmalloc(sk, NLMSG_GOODSIZE, 0, GFP_KERNEL);
if (!skb)
return -ENOBUFS;
-
+
cb = sk->protinfo.af_netlink.cb;
len = cb->dump(skb, cb);
-
+
if (len > 0) {
skb_queue_tail(&sk->receive_queue, skb);
sk->data_ready(sk, len);
@@ -667,7 +657,7 @@ static int netlink_dump(struct sock *sk)
memcpy(NLMSG_DATA(nlh), &len, sizeof(len));
skb_queue_tail(&sk->receive_queue, skb);
sk->data_ready(sk, skb->len);
-
+
cb->done(cb);
sk->protinfo.af_netlink.cb = NULL;
netlink_destroy_callback(cb);
@@ -769,167 +759,6 @@ int netlink_post(int unit, struct sk_buff *skb)
#endif
-#if 0
-
-/* What a pity... It was good code, but at the moment it
- results in unnecessary complications.
- */
-
-/*
- * "High" level netlink interface. (ANK)
- *
- * Features:
- * - standard message format.
- * - pseudo-reliable delivery. Messages can be still lost, but
- * user level will know that they were lost and can
- * recover (f.e. gated could reread FIB and device list)
- * - messages are batched.
- */
-
-/*
- * Try to deliver queued messages.
- */
-
-static void nlmsg_delayed_flush(struct sock *sk)
-{
- nlmsg_flush(sk, GFP_ATOMIC);
-}
-
-static void nlmsg_flush(struct sock *sk, int allocation)
-{
- struct sk_buff *skb;
- unsigned long flags;
-
- save_flags(flags);
- cli();
- while ((skb=skb_dequeue(&sk->write_queue)) != NULL) {
- if (skb->users != 1) {
- skb_queue_head(&sk->write_queue, skb);
- break;
- }
- restore_flags(flags);
- netlink_broadcast(sk, skb, 0, NETLINK_CB(skb).dst_groups, allocation);
- cli();
- }
- start_bh_atomic();
- restore_flags(flags);
- if (skb) {
- if (sk->timer.function)
- del_timer(&sk->timer)
- sk->timer.expires = jiffies + (sk->protinfo.af_netlink.delay ? : HZ/2);
- sk->timer.function = (void (*)(unsigned long))nlmsg_delayed_flush;
- sk->timer.data = (unsigned long)sk;
- add_timer(&sk->timer);
- }
- end_bh_atomic();
-}
-
-/*
- * Allocate room for new message. If it is impossible, return NULL.
- */
-
-void *nlmsg_broadcast(struct sock *sk, struct sk_buff **skbp,
- unsigned long type, int len,
- unsigned groups, int allocation)
-{
- struct nlmsghdr *nlh;
- struct sk_buff *skb;
- int rlen;
- unsigned long flags;
-
- rlen = NLMSG_SPACE(len);
-
- save_flags(flags);
- cli();
- skb = sk->write_queue.tail;
- if (skb == sk->write_queue.head)
- skb = NULL;
- if (skb == NULL || skb_tailroom(skb) < rlen || NETLINK_CB(skb).dst_groups != groups) {
- restore_flags(flags);
-
- if (skb)
- nlmsg_flush(sk, allocation);
-
- skb = sock_wmalloc(rlen > NLMSG_GOODSIZE ? rlen : NLMSG_GOODSIZE,
- sk, 0, allocation);
-
- if (skb==NULL) {
- printk (KERN_WARNING "nlmsg at unit %d overrunned\n", sk->protocol);
- return NULL;
- }
-
- NETLINK_CB(skb).dst_groups = groups;
- cli();
- skb_queue_tail(&sk->write_queue, skb);
- }
- atomic_inc(&skb->users);
- restore_flags(flags);
-
- nlh = (struct nlmsghdr*)skb_put(skb, rlen);
- nlh->nlmsg_type = type;
- nlh->nlmsg_len = NLMSG_LENGTH(len);
- nlh->nlmsg_seq = 0;
- nlh->nlmsg_pid = 0;
- *skbp = skb;
- return nlh->nlmsg_data;
-}
-
-struct sk_buff* nlmsg_alloc(unsigned long type, int len,
- unsigned long seq, unsigned long pid, int allocation)
-{
- struct nlmsghdr *nlh;
- struct sk_buff *skb;
- int rlen;
-
- rlen = NLMSG_SPACE(len);
-
- skb = alloc_skb(rlen, allocation);
- if (skb==NULL)
- return NULL;
-
- nlh = (struct nlmsghdr*)skb_put(skb, rlen);
- nlh->nlmsg_type = type;
- nlh->nlmsg_len = NLMSG_LENGTH(len);
- nlh->nlmsg_seq = seq;
- nlh->nlmsg_pid = pid;
- return skb;
-}
-
-void nlmsg_release(struct sk_buff *skb)
-{
- atomic_dec(skb->users);
-}
-
-
-/*
- * Kick message queue.
- * Two modes:
- * - synchronous (delay==0). Messages are delivered immediately.
- * - delayed. Do not deliver, but start delivery timer.
- */
-
-void __nlmsg_transmit(struct sock *sk, int allocation)
-{
- start_bh_atomic();
- if (!sk->protinfo.af_netlink.delay) {
- if (sk->timer.function) {
- del_timer(&sk->timer);
- sk->timer.function = NULL;
- }
- end_bh_atomic();
- nlmsg_flush(sk, allocation);
- return;
- }
- if (!sk->timer.function) {
- sk->timer.expires = jiffies + sk->protinfo.af_netlink.delay;
- sk->timer.function = (void (*)(unsigned long))nlmsg_delayed_flush;
- sk->timer.data = (unsigned long)sk;
- add_timer(&sk->timer);
- }
- end_bh_atomic();
-}
-
-#endif
#ifdef CONFIG_PROC_FS
static int netlink_read_proc(char *buffer, char **start, off_t offset,
diff --git a/net/netlink/netlink_dev.c b/net/netlink/netlink_dev.c
index 94be0069b..b127137b2 100644
--- a/net/netlink/netlink_dev.c
+++ b/net/netlink/netlink_dev.c
@@ -144,6 +144,7 @@ static int netlink_open(struct inode * inode, struct file * file)
out:
open_map &= ~(1<<minor);
+ MOD_DEC_USE_COUNT;
return err;
}
@@ -185,6 +186,7 @@ static struct file_operations netlink_fops = {
netlink_ioctl,
NULL, /* netlink_mmap */
netlink_open,
+ NULL, /* flush */
netlink_release
};
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 84451d0b9..66b49db8a 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -475,7 +475,6 @@ static int nr_create(struct socket *sock, int protocol)
sock->ops = &nr_proto_ops;
sk->protocol = protocol;
- sk->mtu = NETROM_MTU; /* 236 */
skb_queue_head_init(&nr->ack_queue);
skb_queue_head_init(&nr->reseq_queue);
@@ -522,7 +521,6 @@ static struct sock *nr_make_new(struct sock *osk)
sk->sndbuf = osk->sndbuf;
sk->debug = osk->debug;
sk->state = TCP_ESTABLISHED;
- sk->mtu = osk->mtu;
sk->sleep = osk->sleep;
sk->zapped = osk->zapped;
diff --git a/net/netsyms.c b/net/netsyms.c
index d9767a09e..f987d9425 100644
--- a/net/netsyms.c
+++ b/net/netsyms.c
@@ -264,6 +264,7 @@ EXPORT_SYMBOL(tcp_close);
EXPORT_SYMBOL(tcp_accept);
EXPORT_SYMBOL(tcp_write_wakeup);
EXPORT_SYMBOL(tcp_read_wakeup);
+EXPORT_SYMBOL(tcp_write_space);
EXPORT_SYMBOL(tcp_poll);
EXPORT_SYMBOL(tcp_ioctl);
EXPORT_SYMBOL(tcp_shutdown);
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 1c17b3648..8a681b8fb 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -552,7 +552,6 @@ static int rose_create(struct socket *sock, int protocol)
sock->ops = &rose_proto_ops;
sk->protocol = protocol;
- sk->mtu = ROSE_MTU; /* 253 */
init_timer(&rose->timer);
init_timer(&rose->idletimer);
@@ -593,7 +592,6 @@ static struct sock *rose_make_new(struct sock *osk)
sk->sndbuf = osk->sndbuf;
sk->debug = osk->debug;
sk->state = TCP_ESTABLISHED;
- sk->mtu = osk->mtu;
sk->sleep = osk->sleep;
sk->zapped = osk->zapped;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 0bf7a92f4..081896dc5 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -291,7 +291,7 @@ static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n,
struct tcf_proto *tp, unsigned long fh, int event)
{
struct sk_buff *skb;
- pid_t pid = oskb ? NETLINK_CB(oskb).pid : 0;
+ u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb)
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 8f50013f7..4168f541f 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -537,7 +537,7 @@ insert:
if (s == NULL)
goto errout;
memset(s, 0, sizeof(*s));
- memcpy(s->dst, dst, sizeof(*dst));
+ memcpy(s->dst, dst, sizeof(s->dst));
s->dpi = pinfo->dpi;
s->protocol = pinfo->protocol;
s->tunnelid = pinfo->tunnelid;
@@ -590,7 +590,6 @@ static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
- struct rsvp_head *head = tp->root;
struct rsvp_filter *f = (struct rsvp_filter*)fh;
struct rsvp_session *s;
unsigned char *b = skb->tail;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index a684cde66..f2fb9e36f 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -7,6 +7,10 @@
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Fixes:
+ *
+ * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
*/
#include <linux/config.h>
@@ -506,7 +510,7 @@ process_existing:
}
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q,
- pid_t pid, u32 seq, unsigned flags, int event)
+ u32 pid, u32 seq, unsigned flags, int event)
{
struct tcmsg *tcm;
struct nlmsghdr *nlh;
@@ -538,7 +542,7 @@ static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
struct Qdisc *old, struct Qdisc *new)
{
struct sk_buff *skb;
- pid_t pid = oskb ? NETLINK_CB(oskb).pid : 0;
+ u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb)
@@ -715,7 +719,7 @@ out:
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
unsigned long cl,
- pid_t pid, u32 seq, unsigned flags, int event)
+ u32 pid, u32 seq, unsigned flags, int event)
{
struct tcmsg *tcm;
struct nlmsghdr *nlh;
@@ -745,7 +749,7 @@ static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
struct Qdisc *q, unsigned long cl, int event)
{
struct sk_buff *skb;
- pid_t pid = oskb ? NETLINK_CB(oskb).pid : 0;
+ u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb)
@@ -850,7 +854,7 @@ int psched_clock_scale;
#endif
#ifdef PSCHED_WATCHER
-u32 psched_time_mark;
+PSCHED_WATCHER psched_time_mark;
static void psched_tick(unsigned long);
@@ -864,10 +868,10 @@ static void psched_tick(unsigned long dummy)
PSCHED_GET_TIME(dummy_stamp);
psched_timer.expires = jiffies + 4*HZ;
#else
- unsigned long jiffies = now;
+ unsigned long now = jiffies;
psched_time_base = ((u64)now)<<PSCHED_JSCALE;
psched_time_mark = now;
- psched_timer.expires = jiffies + 60*60*HZ;
+ psched_timer.expires = now + 60*60*HZ;
#endif
add_timer(&psched_timer);
}
@@ -883,7 +887,7 @@ __initfunc(int psched_calibrate_clock(void))
unsigned long stop;
#if CPU == 586 || CPU == 686
- if (!(boot_cpu_data.x86_capability & X86_FEATURE_TSC)
+ if (!(boot_cpu_data.x86_capability & X86_FEATURE_TSC))
return -1;
#endif
@@ -895,7 +899,7 @@ __initfunc(int psched_calibrate_clock(void))
PSCHED_GET_TIME(stamp);
do_gettimeofday(&tv);
while (jiffies < stop)
- boundary();
+ barrier();
PSCHED_GET_TIME(stamp1);
do_gettimeofday(&tv1);
end_bh_atomic();
@@ -910,7 +914,7 @@ __initfunc(int psched_calibrate_clock(void))
while ((delay>>=1) != 0)
psched_clock_scale++;
psched_us_per_tick = 1<<psched_clock_scale;
- psched_clock_per_hz = (delay*(1000000/HZ))>>psched_clock_scale;
+ psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
return 0;
}
#endif
diff --git a/net/socket.c b/net/socket.c
index 6a2624058..118a7276b 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -105,7 +105,7 @@ static unsigned int sock_poll(struct file *file,
struct poll_table_struct *wait);
static int sock_ioctl(struct inode *inode, struct file *file,
unsigned int cmd, unsigned long arg);
-static int sock_fasync(struct file *filp, int on);
+static int sock_fasync(int fd, struct file *filp, int on);
/*
@@ -122,6 +122,7 @@ static struct file_operations socket_file_ops = {
sock_ioctl,
NULL, /* mmap */
NULL, /* no special open code... */
+ NULL, /* flush */
sock_close,
NULL, /* no fsync */
sock_fasync
@@ -483,7 +484,7 @@ int sock_close(struct inode *inode, struct file *filp)
printk(KERN_DEBUG "sock_close: NULL inode\n");
return 0;
}
- sock_fasync(filp, 0);
+ sock_fasync(-1, filp, 0);
sock_release(socki_lookup(inode));
return 0;
}
@@ -492,11 +493,10 @@ int sock_close(struct inode *inode, struct file *filp)
* Update the socket async list
*/
-static int sock_fasync(struct file *filp, int on)
+static int sock_fasync(int fd, struct file *filp, int on)
{
struct fasync_struct *fa, *fna=NULL, **prev;
struct socket *sock;
- unsigned long flags;
if (on)
{
@@ -508,9 +508,8 @@ static int sock_fasync(struct file *filp, int on)
sock = socki_lookup(filp->f_dentry->d_inode);
prev=&(sock->fasync_list);
-
- save_flags(flags);
- cli();
+
+ lock_sock(sock->sk);
for (fa=*prev; fa!=NULL; prev=&fa->fa_next,fa=*prev)
if (fa->fa_file==filp)
@@ -520,11 +519,13 @@ static int sock_fasync(struct file *filp, int on)
{
if(fa!=NULL)
{
+ fa->fa_fd=fd;
kfree_s(fna,sizeof(struct fasync_struct));
- restore_flags(flags);
+ release_sock(sock->sk);
return 0;
}
fna->fa_file=filp;
+ fna->fa_fd=fd;
fna->magic=FASYNC_MAGIC;
fna->fa_next=sock->fasync_list;
sock->fasync_list=fna;
@@ -537,7 +538,8 @@ static int sock_fasync(struct file *filp, int on)
kfree_s(fa,sizeof(struct fasync_struct));
}
}
- restore_flags(flags);
+
+ release_sock(sock->sk);
return 0;
}
@@ -1302,7 +1304,8 @@ out:
/*
* Perform a file control on a socket file descriptor.
*
- * FIXME: does this need an fd lock ?
+ * Doesn't acquire an fd lock, because no network fcntl
+ * function currently sleeps.
*/
int sock_fcntl(struct file *filp, unsigned int cmd, unsigned long arg)
@@ -1439,7 +1442,7 @@ int sock_unregister(int family)
return 0;
}
-__initfunc(void proto_init(void))
+void __init proto_init(void)
{
extern struct net_proto protocols[]; /* Network protocols */
struct net_proto *pro;
@@ -1459,7 +1462,7 @@ extern void sk_init(void);
extern void wanrouter_init(void);
#endif
-__initfunc(void sock_init(void))
+void __init sock_init(void)
{
int i;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index b1a8150ec..9380ff4a4 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -191,22 +191,15 @@ rpc_default_callback(struct rpc_task *task)
}
/*
- * New rpc_call implementation
+ * Export the signal mask handling for asynchronous code that
+ * sleeps on RPC calls
*/
-int
-rpc_do_call(struct rpc_clnt *clnt, u32 proc, void *argp, void *resp,
- int flags, rpc_action func, void *data)
+
+void rpc_clnt_sigmask(struct rpc_clnt *clnt, sigset_t *oldset)
{
- struct rpc_task my_task, *task = &my_task;
unsigned long sigallow = sigmask(SIGKILL);
- sigset_t oldset;
unsigned long irqflags;
- int async, status;
-
- /* If this client is slain all further I/O fails */
- if (clnt->cl_dead)
- return -EIO;
-
+
/* Turn off various signals */
if (clnt->cl_intr) {
struct k_sigaction *action = current->sig->action;
@@ -216,10 +209,38 @@ rpc_do_call(struct rpc_clnt *clnt, u32 proc, void *argp, void *resp,
sigallow |= sigmask(SIGQUIT);
}
spin_lock_irqsave(&current->sigmask_lock, irqflags);
- oldset = current->blocked;
- siginitsetinv(&current->blocked, sigallow & ~oldset.sig[0]);
+ *oldset = current->blocked;
+ siginitsetinv(&current->blocked, sigallow & ~oldset->sig[0]);
recalc_sigpending(current);
spin_unlock_irqrestore(&current->sigmask_lock, irqflags);
+}
+
+void rpc_clnt_sigunmask(struct rpc_clnt *clnt, sigset_t *oldset)
+{
+ unsigned long irqflags;
+
+ spin_lock_irqsave(&current->sigmask_lock, irqflags);
+ current->blocked = *oldset;
+ recalc_sigpending(current);
+ spin_unlock_irqrestore(&current->sigmask_lock, irqflags);
+}
+
+/*
+ * New rpc_call implementation
+ */
+int
+rpc_do_call(struct rpc_clnt *clnt, u32 proc, void *argp, void *resp,
+ int flags, rpc_action func, void *data)
+{
+ struct rpc_task my_task, *task = &my_task;
+ sigset_t oldset;
+ int async, status;
+
+ /* If this client is slain all further I/O fails */
+ if (clnt->cl_dead)
+ return -EIO;
+
+ rpc_clnt_sigmask(clnt, &oldset);
/* Create/initialize a new RPC task */
if ((async = (flags & RPC_TASK_ASYNC)) != 0) {
@@ -248,10 +269,7 @@ rpc_do_call(struct rpc_clnt *clnt, u32 proc, void *argp, void *resp,
}
out:
- spin_lock_irqsave(&current->sigmask_lock, irqflags);
- current->blocked = oldset;
- recalc_sigpending(current);
- spin_unlock_irqrestore(&current->sigmask_lock, irqflags);
+ rpc_clnt_sigunmask(clnt, &oldset);
return status;
}
@@ -395,7 +413,7 @@ call_allocate(struct rpc_task *task)
return;
printk("RPC: buffer allocation failed for task %p\n", task);
- if (1 || !signalled()) {
+ if (!signalled()) {
xprt_release(task);
task->tk_action = call_reserve;
rpc_delay(task, HZ);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 8caaa46e8..817a10127 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -128,7 +128,7 @@ rpc_add_timer(struct rpc_task *task, rpc_action timer)
if (!timer)
timer = __rpc_default_timer;
if (expires < jiffies) {
- printk("RPC: bad timeout value %ld - setting to 10 sec!\n",
+ printk(KERN_ERR "RPC: bad timeout value %ld - setting to 10 sec!\n",
task->tk_timeout);
expires = jiffies + 10 * HZ;
}
@@ -164,7 +164,7 @@ static inline void
rpc_make_runnable(struct rpc_task *task)
{
if (task->tk_timeout) {
- printk("RPC: task w/ running timer in rpc_make_runnable!!\n");
+ printk(KERN_ERR "RPC: task w/ running timer in rpc_make_runnable!!\n");
return;
}
if (RPC_IS_ASYNC(task)) {
@@ -242,7 +242,7 @@ __rpc_wake_up(struct rpc_task *task)
#ifdef RPC_DEBUG
if (task->tk_magic != 0xf00baa) {
- printk("RPC: attempt to wake up non-existing task!\n");
+ printk(KERN_ERR "RPC: attempt to wake up non-existing task!\n");
rpc_debug = ~0;
return;
}
@@ -362,7 +362,7 @@ __rpc_execute(struct rpc_task *task)
task->tk_pid, task->tk_flags);
if (!RPC_IS_RUNNING(task)) {
- printk("RPC: rpc_execute called for sleeping task!!\n");
+ printk(KERN_WARNING "RPC: rpc_execute called for sleeping task!!\n");
return 0;
}
@@ -412,13 +412,15 @@ __rpc_execute(struct rpc_task *task)
dprintk("RPC: %4d sync task going to sleep\n",
task->tk_pid);
if (current->pid == rpciod_pid)
- printk("RPC: rpciod waiting on sync task!\n");
+ printk(KERN_ERR "RPC: rpciod waiting on sync task!\n");
current->timeout = 0;
sleep_on(&task->tk_wait);
- /* When the task received a signal, remove from
- * any queues etc, and make runnable again. */
- if (0 && signalled())
+ /*
+ * When the task has received a signal, remove it from
+ * any wait queues and make it runnable again.
+ */
+ if (signalled())
__rpc_wake_up(task);
dprintk("RPC: %4d sync task resuming\n",
@@ -432,7 +434,7 @@ __rpc_execute(struct rpc_task *task)
* clean up after sleeping on some queue, we don't
* break the loop here, but go around once more.
*/
- if (0 && !RPC_IS_ASYNC(task) && signalled()) {
+ if (!RPC_IS_ASYNC(task) && signalled()) {
dprintk("RPC: %4d got signal\n", task->tk_pid);
rpc_exit(task, -ERESTARTSYS);
}
@@ -460,11 +462,11 @@ rpc_execute(struct rpc_task *task)
if (incr) {
if (rpc_inhibit) {
- printk("RPC: execution inhibited!\n");
+ printk(KERN_INFO "RPC: execution inhibited!\n");
return;
}
if (executing)
- printk("RPC: %d tasks executed\n", executing);
+ printk(KERN_WARNING "RPC: %d tasks executed\n", executing);
}
executing += incr;
@@ -763,6 +765,8 @@ rpc_killall_tasks(struct rpc_clnt *clnt)
rpc_inhibit--;
}
+static struct semaphore rpciod_running = MUTEX_LOCKED;
+
/*
* This is the rpciod kernel thread
*/
@@ -779,11 +783,16 @@ rpciod(void *ptr)
* Let our maker know we're running ...
*/
rpciod_pid = current->pid;
- wake_up(&rpciod_idle);
+ up(&rpciod_running);
exit_files(current);
exit_mm(current);
+
+ spin_lock_irq(&current->sigmask_lock);
siginitsetinv(&current->blocked, sigmask(SIGKILL));
+ recalc_sigpending(current);
+ spin_unlock_irq(&current->sigmask_lock);
+
current->session = 1;
current->pgrp = 1;
sprintf(current->comm, "rpciod");
@@ -791,12 +800,7 @@ rpciod(void *ptr)
dprintk("RPC: rpciod starting (pid %d)\n", rpciod_pid);
while (rpciod_users) {
if (signalled()) {
- if (sigismember(&current->signal, SIGKILL)) {
- rpciod_killall();
- } else {
- printk("rpciod: ignoring signal (%d users)\n",
- rpciod_users);
- }
+ rpciod_killall();
flush_signals(current);
}
__rpc_schedule();
@@ -818,7 +822,7 @@ rpciod(void *ptr)
dprintk("RPC: rpciod shutdown commences\n");
if (all_tasks) {
- printk("rpciod: active tasks at shutdown?!\n");
+ printk(KERN_ERR "rpciod: active tasks at shutdown?!\n");
rpciod_killall();
}
@@ -840,7 +844,7 @@ rpciod_killall(void)
rpc_killall_tasks(NULL);
__rpc_schedule();
if (all_tasks) {
-printk("rpciod_killall: waiting for tasks to exit\n");
+ dprintk("rpciod_killall: waiting for tasks to exit\n");
current->state = TASK_INTERRUPTIBLE;
current->timeout = jiffies + 1;
schedule();
@@ -871,16 +875,17 @@ rpciod_up(void)
* If there's no pid, we should be the first user.
*/
if (rpciod_users > 1)
- printk("rpciod_up: no pid, %d users??\n", rpciod_users);
+ printk(KERN_WARNING "rpciod_up: no pid, %d users??\n", rpciod_users);
/*
* Create the rpciod thread and wait for it to start.
*/
error = kernel_thread(rpciod, &rpciod_killer, 0);
if (error < 0) {
- printk("rpciod_up: create thread failed, error=%d\n", error);
+ printk(KERN_WARNING "rpciod_up: create thread failed, error=%d\n", error);
+ rpciod_users--;
goto out;
}
- sleep_on(&rpciod_idle);
+ down(&rpciod_running);
error = 0;
out:
up(&rpciod_sema);
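/*
 * Editor's sketch, not part of the patch: the start-up handshake that
 * the hunks above use to replace sleep_on()/wake_up(), in isolation.
 * The semaphore starts locked, so down() in the parent cannot return
 * before the child has run far enough to up() it; unlike the old
 * sleep_on() scheme, there is no window in which the wake-up can be
 * lost. All names here are illustrative.
 */
static struct semaphore example_running = MUTEX_LOCKED;

static int example_daemon(void *ptr)
{
	up(&example_running);		/* tell our maker we are alive */
	/* ... daemon main loop ... */
	return 0;
}

static int example_start(void)
{
	int pid = kernel_thread(example_daemon, NULL, 0);

	if (pid < 0)
		return pid;
	down(&example_running);		/* blocks until the child has started */
	return 0;
}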
@@ -900,10 +905,10 @@ rpciod_down(void)
if (--rpciod_users)
goto out;
} else
- printk("rpciod_down: pid=%d, no users??\n", rpciod_pid);
+ printk(KERN_WARNING "rpciod_down: pid=%d, no users??\n", rpciod_pid);
if (!rpciod_pid) {
- printk("rpciod_down: Nothing to do!\n");
+ dprintk("rpciod_down: Nothing to do!\n");
goto out;
}
@@ -921,9 +926,9 @@ rpciod_down(void)
* Display a message if we're going to wait longer.
*/
while (rpciod_pid) {
- printk("rpciod_down: waiting for pid %d to exit\n", rpciod_pid);
+ dprintk("rpciod_down: waiting for pid %d to exit\n", rpciod_pid);
if (signalled()) {
- printk("rpciod_down: caught signal\n");
+ dprintk("rpciod_down: caught signal\n");
break;
}
interruptible_sleep_on(&rpciod_killer);
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 6ccf2e29f..e0be9527c 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -43,6 +43,8 @@ EXPORT_SYMBOL(rpc_shutdown_client);
EXPORT_SYMBOL(rpc_killall_tasks);
EXPORT_SYMBOL(rpc_do_call);
EXPORT_SYMBOL(rpc_call_setup);
+EXPORT_SYMBOL(rpc_clnt_sigmask);
+EXPORT_SYMBOL(rpc_clnt_sigunmask);
EXPORT_SYMBOL(rpc_delay);
EXPORT_SYMBOL(rpc_restart_call);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index e2af81be4..4566ce5d2 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -34,6 +34,7 @@
* Copyright (C) 1995, 1996, Olaf Kirch <okir@monad.swb.de>
*
* TCP callback races fixes (C) 1998 Red Hat Software <alan@redhat.com>
+ * TCP send fixes (C) 1998 Red Hat Software <alan@redhat.com>
*/
#define __KERNEL_SYSCALLS__
@@ -136,8 +137,48 @@ xprt_from_sock(struct sock *sk)
}
/*
+ * Advance the iovec by 'n' sent bytes
+ */
+
+extern inline void xprt_move_iov(struct msghdr *msg, struct iovec *niv, int amount)
+{
+ struct iovec *iv=msg->msg_iov;
+
+ /*
+ * Eat any sent iovecs
+ */
+
+ while(iv->iov_len < amount)
+ {
+ amount-=iv->iov_len;
+ iv++;
+ msg->msg_iovlen--;
+ }
+
+ msg->msg_iov=niv;
+
+ /*
+ * And chew down the partial one
+ */
+
+ niv[0].iov_len = iv->iov_len-amount;
+ niv[0].iov_base =((unsigned char *)iv->iov_base)+amount;
+ iv++;
+
+ /*
+ * And copy any others
+ */
+
+ for(amount=1;amount<msg->msg_iovlen; amount++)
+ {
+ niv[amount]=*iv++;
+ }
+}
+
+/*
* Write data to socket.
*/
+
static inline int
xprt_sendmsg(struct rpc_xprt *xprt)
{
@@ -145,12 +186,12 @@ xprt_sendmsg(struct rpc_xprt *xprt)
struct msghdr msg;
mm_segment_t oldfs;
int result;
+ struct iovec niv[MAX_IOVEC];
xprt_pktdump("packet data:",
xprt->snd_buf.io_vec->iov_base,
xprt->snd_buf.io_vec->iov_len);
-#if LINUX_VERSION_CODE >= 0x020100
msg.msg_flags = MSG_DONTWAIT;
msg.msg_iov = xprt->snd_buf.io_vec;
msg.msg_iovlen = xprt->snd_buf.io_nr;
@@ -158,27 +199,21 @@ xprt_sendmsg(struct rpc_xprt *xprt)
msg.msg_namelen = sizeof(xprt->addr);
msg.msg_control = NULL;
+ /* Don't resend bytes the socket has already accepted */
+
+ if(xprt->snd_sent)
+ xprt_move_iov(&msg, niv, xprt->snd_sent);
+
oldfs = get_fs(); set_fs(get_ds());
result = sock_sendmsg(sock, &msg, xprt->snd_buf.io_len);
set_fs(oldfs);
-#else
- msg.msg_flags = 0;
- msg.msg_iov = xprt->snd_buf.io_vec;
- msg.msg_iovlen = xprt->snd_buf.io_nr;
- msg.msg_name = (struct sockaddr *) &xprt->addr;
- msg.msg_namelen = sizeof(xprt->addr);
- msg.msg_control = NULL;
-
- oldfs = get_fs(); set_fs(get_ds());
- result = sock->ops->sendmsg(sock, &msg, xprt->snd_buf.io_len, 1, 0);
- set_fs(oldfs);
-#endif
dprintk("RPC: xprt_sendmsg(%d) = %d\n",
xprt->snd_buf.io_len, result);
if (result >= 0) {
xprt->snd_buf.io_len -= result;
+ xprt->snd_sent += result;
return result;
}
@@ -188,6 +223,8 @@ xprt_sendmsg(struct rpc_xprt *xprt)
* prompts ECONNREFUSED.
*/
break;
+ case -EAGAIN:
+ return 0;
case -ENOTCONN: case -EPIPE:
/* connection broken */
break;
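/*
 * Editor's sketch, not part of the patch: what the new return
 * convention gives a caller. With -EAGAIN mapped to 0, "no progress"
 * is no longer an error; the caller leaves snd_sent/io_len untouched
 * and waits for tcp_write_space() to wake the task. Illustrative only.
 */
static int example_push(struct rpc_xprt *xprt)
{
	int sent = xprt_sendmsg(xprt);

	if (sent < 0)
		return sent;		/* hard error: disconnect etc. */
	if (xprt->snd_buf.io_len == 0)
		return 0;		/* everything accepted by the socket */
	return -EAGAIN;			/* partial/flow-controlled: retry later */
}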
@@ -828,9 +865,19 @@ tcp_write_space(struct sock *sk)
if (!(xprt = xprt_from_sock(sk)))
return;
- xprt->write_space = 1;
- if (xprt->snd_task && !RPC_IS_RUNNING(xprt->snd_task))
- rpc_wake_up_task(xprt->snd_task);
+ if(xprt->snd_sent && xprt->snd_task)
+ printk("write space\n");
+ if(xprt->write_space == 0)
+ {
+ xprt->write_space = 1;
+ if (xprt->snd_task && !RPC_IS_RUNNING(xprt->snd_task))
+ {
+ if(xprt->snd_sent)
+ printk("Write wakeup snd_sent =%d\n",
+ xprt->snd_sent);
+ rpc_wake_up_task(xprt->snd_task);
+ }
+ }
}
/*
@@ -889,6 +936,8 @@ xprt_transmit(struct rpc_task *task)
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
+ /*DEBUG*/int ac_debug=xprt->snd_sent;
+
dprintk("RPC: %4d xprt_transmit(%x)\n", task->tk_pid,
*(u32 *)(req->rq_svec[0].iov_base));
@@ -935,6 +984,8 @@ xprt_transmit(struct rpc_task *task)
}
xprt->snd_buf = req->rq_snd_buf;
xprt->snd_task = task;
+ xprt->snd_sent = 0;
+ /*DEBUG*/ac_debug = 0;
}
/* For fast networks/servers we have to put the request on
@@ -954,10 +1005,12 @@ xprt_transmit(struct rpc_task *task)
if (xprt_transmit_some(xprt, task) != -EAGAIN) {
dprintk("RPC: %4d xmit complete\n", task->tk_pid);
xprt->snd_task = NULL;
+ if(ac_debug)
+ printk("Partial xmit finished\n");
return;
}
- dprintk("RPC: %4d xmit incomplete (%d left of %d)\n",
+ /*d*/printk("RPC: %4d xmit incomplete (%d left of %d)\n",
task->tk_pid, xprt->snd_buf.io_len,
req->rq_slen);
task->tk_status = 0;
@@ -984,10 +1037,15 @@ xprt_transmit_status(struct rpc_task *task)
struct rpc_xprt *xprt = task->tk_client->cl_xprt;
dprintk("RPC: %4d transmit_status %d\n", task->tk_pid, task->tk_status);
- if (xprt->snd_task == task) {
+ if (xprt->snd_task == task)
+ {
if (task->tk_status < 0)
+ {
xprt->snd_task = NULL;
- xprt_disconnect(xprt);
+ xprt_disconnect(xprt);
+ }
+ else
+ xprt_transmit(task);
}
}
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 961516de6..8e0110b18 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -8,6 +8,8 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
+ * Version: $Id: af_unix.c,v 1.68 1998/08/26 13:18:35 davem Exp $
+ *
* Fixes:
* Linus Torvalds : Assorted bug cures.
* Niibe Yutaka : async I/O support.
@@ -27,6 +29,10 @@
* Andreas Schwab : Replace inode by dentry for proper
* reference counting
* Kirk Petersen : Made this a module
+ * Christoph Rohland : Elegant non-blocking accept/connect algorithm.
+ * Lots of bug fixes.
+ * Alexey Kuznetsov : Repaired (I hope) bugs introduced
+ * by the above two patches.
*
* Known differences from reference BSD that was tested:
*
@@ -102,6 +108,7 @@ unix_socket *unix_socket_table[UNIX_HASH_SIZE+1];
#define UNIX_ABSTRACT(sk) ((sk)->protinfo.af_unix.addr->hash!=UNIX_HASH_SIZE)
static void unix_destroy_socket(unix_socket *sk);
+static void unix_stream_write_space(struct sock *sk);
extern __inline__ unsigned unix_hash_fold(unsigned hash)
{
@@ -120,22 +127,22 @@ extern __inline__ int unix_our_peer(unix_socket *sk, unix_socket *osk)
extern __inline__ int unix_may_send(unix_socket *sk, unix_socket *osk)
{
- return (sk->type==osk->type);
+ return (unix_peer(osk) == NULL || unix_our_peer(sk, osk));
}
extern __inline__ void unix_lock(unix_socket *sk)
{
- sk->sock_readers++;
+ atomic_inc(&sk->sock_readers);
}
-extern __inline__ int unix_unlock(unix_socket *sk)
+extern __inline__ void unix_unlock(unix_socket *sk)
{
- return --sk->sock_readers;
+ atomic_dec(&sk->sock_readers);
}
extern __inline__ int unix_locked(unix_socket *sk)
{
- return sk->sock_readers;
+ return atomic_read(&sk->sock_readers);
}
extern __inline__ void unix_release_addr(struct unix_address *addr)
@@ -257,7 +264,6 @@ static void unix_destroy_timer(unsigned long data)
if(!unix_locked(sk) && atomic_read(&sk->wmem_alloc) == 0)
{
sk_free(sk);
- unix_remove_socket(sk);
/* socket destroyed, decrement count */
MOD_DEC_USE_COUNT;
@@ -291,9 +297,6 @@ static int unix_release_sock (unix_socket *sk)
skpair=unix_peer(sk);
- /* Try to flush out this socket. Throw out buffers at least */
- unix_destroy_socket(sk);
-
if (skpair!=NULL)
{
if (sk->type==SOCK_STREAM && unix_our_peer(sk, skpair))
@@ -304,6 +307,9 @@ static int unix_release_sock (unix_socket *sk)
unix_unlock(skpair); /* It may now die */
}
+ /* Try to flush out this socket. Throw out buffers at least */
+ unix_destroy_socket(sk);
+
/*
* Fixme: BSD difference: In BSD all sockets connected to use get
* ECONNRESET and we die on the spot. In Linux we behave
@@ -311,6 +317,8 @@ static int unix_release_sock (unix_socket *sk)
* dereference.
*
* Can't we simply set sock->err?
+ *
+ * What is the above comment talking about? --ANK(980817)
*/
unix_gc(); /* Garbage collect fds */
@@ -321,13 +329,12 @@ static void unix_destroy_socket(unix_socket *sk)
{
struct sk_buff *skb;
+ unix_remove_socket(sk);
+
while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
{
if(sk->state==TCP_LISTEN)
- {
- unix_unlock(sk);
unix_release_sock(skb->sk);
- }
/* passed fds are erased in the kfree_skb hook */
kfree_skb(skb);
}
@@ -338,10 +345,9 @@ static void unix_destroy_socket(unix_socket *sk)
sk->protinfo.af_unix.dentry=NULL;
}
- if(!unix_unlock(sk) && atomic_read(&sk->wmem_alloc) == 0)
+ if(!unix_locked(sk) && atomic_read(&sk->wmem_alloc) == 0)
{
sk_free(sk);
- unix_remove_socket(sk);
/* socket destroyed, decrement count */
MOD_DEC_USE_COUNT;
@@ -366,8 +372,6 @@ static int unix_listen(struct socket *sock, int backlog)
if (!sk->protinfo.af_unix.addr)
return -EINVAL; /* No listens on an unbound socket */
sk->max_ack_backlog=backlog;
- if (sk->ack_backlog < backlog)
- sk->state_change(sk);
sk->state=TCP_LISTEN;
sock->flags |= SO_ACCEPTCON;
/* set credentials so connect can copy them */
@@ -380,61 +384,60 @@ static int unix_listen(struct socket *sock, int backlog)
extern struct proto_ops unix_stream_ops;
extern struct proto_ops unix_dgram_ops;
-static int unix_create1(struct socket *sock, struct sock **skp, int protocol)
+static struct sock * unix_create1(struct socket *sock, int stream)
{
struct sock *sk;
- if (protocol && protocol != PF_UNIX)
- return -EPROTONOSUPPORT;
-
- if (sock)
- {
- sock->state = SS_UNCONNECTED;
-
- switch (sock->type)
- {
- case SOCK_STREAM:
- sock->ops = &unix_stream_ops;
- break;
- /*
- * Believe it or not BSD has AF_UNIX, SOCK_RAW though
- * nothing uses it.
- */
- case SOCK_RAW:
- sock->type=SOCK_DGRAM;
- case SOCK_DGRAM:
- sock->ops = &unix_dgram_ops;
- break;
- default:
- return -ESOCKTNOSUPPORT;
- }
- }
+ MOD_INC_USE_COUNT;
sk = sk_alloc(PF_UNIX, GFP_KERNEL, 1);
- if (!sk)
- return -ENOMEM;
+ if (!sk) {
+ MOD_DEC_USE_COUNT;
+ return NULL;
+ }
sock_init_data(sock,sk);
+ if (stream)
+ sk->write_space = unix_stream_write_space;
+
sk->destruct = unix_destruct_addr;
sk->protinfo.af_unix.family=PF_UNIX;
sk->protinfo.af_unix.dentry=NULL;
- sk->sock_readers=1; /* Us */
sk->protinfo.af_unix.readsem=MUTEX; /* single task reading lock */
- sk->mtu=4096;
sk->protinfo.af_unix.list=&unix_sockets_unbound;
unix_insert_socket(sk);
- if (skp)
- *skp =sk;
-
- /* socket created, increment count */
- MOD_INC_USE_COUNT;
- return 0;
+ return sk;
}
static int unix_create(struct socket *sock, int protocol)
{
- return unix_create1(sock, NULL, protocol);
+ int stream = 0;
+
+ if (protocol && protocol != PF_UNIX)
+ return -EPROTONOSUPPORT;
+
+ sock->state = SS_UNCONNECTED;
+
+ switch (sock->type) {
+ case SOCK_STREAM:
+ sock->ops = &unix_stream_ops;
+ stream = 1;
+ break;
+ /*
+ * Believe it or not BSD has AF_UNIX, SOCK_RAW though
+ * nothing uses it.
+ */
+ case SOCK_RAW:
+ sock->type=SOCK_DGRAM;
+ case SOCK_DGRAM:
+ sock->ops = &unix_dgram_ops;
+ break;
+ default:
+ return -ESOCKTNOSUPPORT;
+ }
+
+ return unix_create1(sock, stream) ? 0 : -ENOMEM;
}
static int unix_release(struct socket *sock, struct socket *peer)
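/*
 * Editor's sketch, not part of the patch: the SOCK_RAW fallthrough in
 * unix_create() means an AF_UNIX "raw" socket is quietly turned into
 * a datagram socket. A userspace probe of that behaviour:
 */
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_UNIX, SOCK_RAW, 0);

	if (fd < 0)
		perror("socket");	/* a kernel without the fallthrough would refuse */
	else
		printf("AF_UNIX SOCK_RAW accepted (handled as SOCK_DGRAM)\n");
	return 0;
}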
@@ -665,6 +668,22 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
if (addr_len < 0)
return addr_len;
+ /* First of all allocate resources.
+ If we allocated them after the state checks,
+ we would have to redo all the checks anyway.
+ */
+
+ /* Find listening sock */
+ other=unix_find_other(sunaddr, addr_len, sk->type, hash, &err);
+
+ /* create new sock for complete connection */
+ newsk = unix_create1(NULL, 1);
+
+ /* Allocate skb for sending to listening sock */
+ skb = NULL;
+ if (newsk)
+ skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
+
switch (sock->state)
{
case SS_UNCONNECTED:
@@ -672,37 +691,25 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
break;
case SS_CONNECTED:
/* Socket is already connected */
- return -EISCONN;
+ err = -EISCONN;
+ goto out;
default:
- return(-EINVAL);
+ err = -EINVAL;
+ goto out;
}
- /*
- * Now ready to connect
- */
-
- sk->state=TCP_CLOSE;
-
- /* Find listening sock */
- other=unix_find_other(sunaddr, addr_len, sk->type, hash, &err);
- if(other==NULL)
+ err = -EINVAL;
+ if (sk->state != TCP_CLOSE)
goto out;
- /* create new sock for complete connection */
- err = unix_create1(NULL, &newsk, PF_UNIX);
- if (newsk == NULL)
+ /* Check that listener is in valid state. */
+ err = -ECONNREFUSED;
+ if (other == NULL || other->dead || other->state != TCP_LISTEN)
goto out;
- /* Allocate skb for sending to listening sock */
- skb=sock_alloc_send_skb(newsk, 0, 0, flags&O_NONBLOCK, &err);
- if(skb==NULL)
- /*
- * if it gives EAGAIN we should give back
- * EINPROGRESS. But this should not happen since the
- * socket should have some writespace left (it did not
- * allocate any memory until now)
- */
- goto out_release;
+ err = -ENOMEM;
+ if (newsk == NULL || skb == NULL)
+ goto out;
UNIXCB(skb).attr = MSG_SYN;
@@ -715,7 +722,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
sk->state=TCP_ESTABLISHED;
/* Set credentials */
sk->peercred = other->peercred;
-
+
/* set up newly created sock */
unix_peer(newsk)=sk;
unix_lock(newsk);
@@ -738,12 +745,16 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
other->ack_backlog++;
skb_queue_tail(&other->receive_queue,skb);
other->data_ready(other,0); /* Wake up ! */
-
+ unix_unlock(other);
return 0;
-out_release:
- unix_destroy_socket(newsk);
out:
+ if (skb)
+ kfree_skb(skb);
+ if (newsk)
+ unix_destroy_socket(newsk);
+ if (other)
+ unix_unlock(other);
return err;
}
@@ -803,13 +814,14 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
kfree_skb(skb);
continue;
}
- break;
+ tsk = skb->sk;
+ sk->ack_backlog--;
+ kfree_skb(skb);
+ if (!tsk->dead)
+ break;
+ unix_release_sock(tsk);
}
- tsk=skb->sk;
- sk->ack_backlog--;
- unix_unlock(sk); /* No longer locked to master */
- kfree_skb(skb);
/* attach accepted sock to socket */
newsock->state=SS_CONNECTED;
@@ -1015,8 +1027,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len,
size=len-sent;
/* Keep two messages in the pipe so it schedules better */
- if (size > (sk->sndbuf - sizeof(struct sk_buff)) / 2)
- size = (sk->sndbuf - sizeof(struct sk_buff)) / 2;
+ if (size > sk->sndbuf/2 - 16)
+ size = sk->sndbuf/2 - 16;
/*
* Keep to page sized kmalloc()'s as various people
@@ -1024,8 +1036,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len,
* much.
*/
- if (size > 3500)
- limit = 3500; /* Fall back to a page if we can't grab a big buffer this instant */
+ if (size > 4096-16)
+ limit = 4096-16; /* Fall back to a page if we can't grab a big buffer this instant */
else
limit = 0; /* Otherwise just grab and wait */
@@ -1056,8 +1068,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len,
if (scm->fp)
unix_attach_fds(scm, skb);
- /* N.B. this could fail with -EFAULT */
- memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size);
+ if (memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) {
+ kfree_skb(skb);
+ if (sent)
+ goto out;
+ return -EFAULT;
+ }
other=unix_peer(sk);
@@ -1247,8 +1263,12 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, int size
}
chunk = min(skb->len, size);
- /* N.B. This could fail with -EFAULT */
- memcpy_toiovec(msg->msg_iov, skb->data, chunk);
+ if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
+ skb_queue_head(&sk->receive_queue, skb);
+ if (copied == 0)
+ copied = -EFAULT;
+ break;
+ }
copied += chunk;
size -= chunk;
@@ -1299,28 +1319,20 @@ static int unix_shutdown(struct socket *sock, int mode)
struct sock *sk = sock->sk;
unix_socket *other=unix_peer(sk);
- mode++;
+ mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
- if (mode&SEND_SHUTDOWN)
- {
- sk->shutdown|=SEND_SHUTDOWN;
+ if (mode) {
+ sk->shutdown |= mode;
sk->state_change(sk);
- if(other && sk->type == SOCK_STREAM && other->state != TCP_LISTEN)
- {
- if (unix_our_peer(sk, other))
- other->shutdown|=RCV_SHUTDOWN;
- other->state_change(other);
- }
- }
- other=unix_peer(sk);
- if(mode&RCV_SHUTDOWN)
- {
- sk->shutdown|=RCV_SHUTDOWN;
- sk->state_change(sk);
- if(other && sk->type != SOCK_DGRAM && other->state != TCP_LISTEN)
- {
- if (unix_our_peer(sk, other))
- other->shutdown|=SEND_SHUTDOWN;
+ if (other && sk->type == SOCK_STREAM &&
+ unix_our_peer(sk, other)) {
+ int peer_mode = 0;
+
+ if (mode&RCV_SHUTDOWN)
+ peer_mode |= SEND_SHUTDOWN;
+ if (mode&SEND_SHUTDOWN)
+ peer_mode |= RCV_SHUTDOWN;
+ other->shutdown |= peer_mode;
other->state_change(other);
}
}
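/*
 * Editor's sketch, not part of the patch: the "(mode+1) & 3" mapping
 * used above. shutdown(2) passes 0 (no more receives), 1 (no more
 * sends) or 2 (both); adding one yields the RCV_SHUTDOWN (1) /
 * SEND_SHUTDOWN (2) bit mask directly.
 */
#include <stdio.h>

#define RCV_SHUTDOWN	1
#define SEND_SHUTDOWN	2

int main(void)
{
	int how;

	for (how = 0; how <= 2; how++) {
		int mode = (how + 1) & (RCV_SHUTDOWN | SEND_SHUTDOWN);

		printf("how=%d -> RCV_SHUTDOWN=%d SEND_SHUTDOWN=%d\n", how,
		       !!(mode & RCV_SHUTDOWN), !!(mode & SEND_SHUTDOWN));
	}
	return 0;
}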
@@ -1388,12 +1400,21 @@ static unsigned int unix_poll(struct file * file, struct socket *sock, poll_tabl
* we set writable also when the other side has shut down the
* connection. This prevents stuck sockets.
*/
- if (sk->sndbuf - atomic_read(&sk->wmem_alloc) >= MIN_WRITE_SPACE)
+ if (sk->sndbuf - (int)atomic_read(&sk->wmem_alloc) >= MIN_WRITE_SPACE)
mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
return mask;
}
+static void unix_stream_write_space(struct sock *sk)
+{
+ if (sk->dead)
+ return;
+ wake_up_interruptible(sk->sleep);
+ if (sk->sndbuf - (int)atomic_read(&sk->wmem_alloc) >= MIN_WRITE_SPACE)
+ sock_wake_async(sk->socket, 2);
+}
+
#ifdef CONFIG_PROC_FS
static int unix_read_proc(char *buffer, char **start, off_t offset,
int length, int *eof, void *data)
@@ -1411,7 +1432,7 @@ static int unix_read_proc(char *buffer, char **start, off_t offset,
{
len+=sprintf(buffer+len,"%p: %08X %08X %08lX %04X %02X %5ld",
s,
- s->sock_readers,
+ atomic_read(&s->sock_readers),
0,
s->socket ? s->socket->flags : 0,
s->type,
@@ -1433,7 +1454,7 @@ static int unix_read_proc(char *buffer, char **start, off_t offset,
}
buffer[len++]='\n';
- pos+=len;
+ pos = begin + len;
if(pos<offset)
{
len=0;
diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c
index fabe85161..5dda55a35 100644
--- a/net/wanrouter/wanmain.c
+++ b/net/wanrouter/wanmain.c
@@ -5,7 +5,7 @@
* the following common services for the WAN Link Drivers:
* o WAN device management (registering, unregistering)
* o Network interface management
-* o Physical connection management (dial-up, incomming calls)
+* o Physical connection management (dial-up, incoming calls)
* o Logical connection management (switched virtual circuits)
* o Protocol encapsulation/decapsulation
*
@@ -25,7 +25,6 @@
* Oct 15, 1997 Farhan Thawar changed wan_encapsulate to add a pad byte of 0
* Apr 20, 1998 Alan Cox Fixed 2.1 symbols
* May 17, 1998 K. Baranowski Fixed SNAP encapsulation in wan_encapsulate
-* Aug 15, 1998 Arnaldo C. Melo Fixed device_setup return value
*****************************************************************************/
#include <linux/stddef.h> /* offsetof(), etc. */
@@ -373,8 +372,9 @@ int wanrouter_ioctl(struct inode* inode, struct file* file,
struct proc_dir_entry* dent;
wan_device_t* wandev;
- if (!capable(CAP_NET_ADMIN))
+ if (!capable(CAP_NET_ADMIN)) {
return -EPERM;
+ }
if ((cmd >> 8) != ROUTER_IOCTL)
return -EINVAL;
@@ -443,7 +443,7 @@ static int device_setup (wan_device_t* wandev, wandev_conf_t* u_conf)
if (wandev->setup == NULL) /* Nothing to do ? */
return 0;
-
+
conf = kmalloc(sizeof(wandev_conf_t), GFP_KERNEL);
if (conf == NULL)
return -ENOBUFS;
@@ -459,15 +459,17 @@ static int device_setup (wan_device_t* wandev, wandev_conf_t* u_conf)
if (conf->data_size && conf->data)
{
- if(conf->data_size > 1024 || conf->data_size < 0)
+ if (conf->data_size > 64000 || conf->data_size < 0) {
goto bail;
+ }
data = kmalloc(conf->data_size, GFP_KERNEL);
if (data)
{
if(!copy_from_user(data, conf->data, conf->data_size))
{
conf->data=data;
- err = wandev->setup(wandev,conf);
+ wandev->setup(wandev,conf);
+ err = 0;
}
else
err = -ENOBUFS;
@@ -681,6 +683,7 @@ static int delete_interface (wan_device_t* wandev, char* name, int force)
--wandev->ndev;
sti(); /****** critical section end ******/
+ printk("Unregistering '%s'\n", dev->name);
unregister_netdev(dev);
kfree(dev);
return 0;
diff --git a/net/wanrouter/wanproc.c b/net/wanrouter/wanproc.c
index 088487077..f92f87e1f 100644
--- a/net/wanrouter/wanproc.c
+++ b/net/wanrouter/wanproc.c
@@ -29,6 +29,7 @@
#include <asm/segment.h> /* kernel <-> user copy */
#include <asm/byteorder.h> /* htons(), etc. */
#include <asm/uaccess.h> /* copy_to_user */
+#include <asm/io.h>
#include <linux/wanrouter.h> /* WAN router API definitions */
@@ -102,6 +103,7 @@ static struct file_operations router_fops =
NULL, /* ioctl */
NULL, /* mmap */
NULL, /* no special open code */
+ NULL, /* flush */
NULL, /* no special release code */
NULL /* can't fsync */
};
@@ -141,6 +143,7 @@ static struct file_operations wandev_fops =
wanrouter_ioctl, /* ioctl */
NULL, /* mmap */
NULL, /* no special open code */
+ NULL, /* flush */
NULL, /* no special release code */
NULL /* can't fsync */
};
@@ -241,7 +244,7 @@ static struct proc_dir_entry proc_router_stat =
/* Strings */
static char conf_hdr[] =
- "Device name | port |IRQ|DMA|mem.addr|mem.size|"
+ "Device name | port |IRQ|DMA| mem.addr |mem.size|"
"option1|option2|option3|option4\n";
static char stat_hdr[] =
@@ -384,16 +387,16 @@ static int config_get_info(char* buf, char** start, off_t offs, int len,
wan_device_t* wandev;
strcpy(buf, conf_hdr);
for (wandev = router_devlist;
- wandev && (cnt < (PROC_BUFSZ - 80));
+ wandev && (cnt < (PROC_BUFSZ - 120));
wandev = wandev->next)
{
if (wandev->state) cnt += sprintf(&buf[cnt],
- "%-15s|0x%-4X|%3u|%3u|0x%-6lX|0x%-6X|%7u|%7u|%7u|%7u\n",
+ "%-15s|0x%-4X|%3u|%3u| 0x%-8lX |0x%-6X|%7u|%7u|%7u|%7u\n",
wandev->name,
wandev->ioport,
wandev->irq,
wandev->dma,
- wandev->maddr,
+ virt_to_phys(wandev->maddr),
wandev->msize,
wandev->hw_opt[0],
wandev->hw_opt[1],
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 60a3581e5..e7f894e8e 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -468,7 +468,6 @@ static int x25_create(struct socket *sock, int protocol)
sock->ops = &x25_proto_ops;
sk->protocol = protocol;
- sk->mtu = X25_DEFAULT_PACKET_SIZE; /* X25_PS128 */
x25->t21 = sysctl_x25_call_request_timeout;
x25->t22 = sysctl_x25_reset_request_timeout;
@@ -507,7 +506,6 @@ static struct sock *x25_make_new(struct sock *osk)
sk->sndbuf = osk->sndbuf;
sk->debug = osk->debug;
sk->state = TCP_ESTABLISHED;
- sk->mtu = osk->mtu;
sk->sleep = osk->sleep;
sk->zapped = osk->zapped;