author     Ralf Baechle <ralf@linux-mips.org>    1998-09-19 19:15:08 +0000
committer  Ralf Baechle <ralf@linux-mips.org>    1998-09-19 19:15:08 +0000
commit     03ba4131783cc9e872f8bb26a03f15bc11f27564 (patch)
tree       88db8dba75ae06ba3bad08e42c5e52efc162535c /net/ipv6
parent     257730f99381dd26e10b832fce4c94cae7ac1176 (diff)
- Merge with Linux 2.1.121.
- Bugfixes.
Diffstat (limited to 'net/ipv6')
-rw-r--r--  net/ipv6/addrconf.c       |   46
-rw-r--r--  net/ipv6/af_inet6.c       |   52
-rw-r--r--  net/ipv6/datagram.c       |  138
-rw-r--r--  net/ipv6/exthdrs.c        |  670
-rw-r--r--  net/ipv6/icmp.c           |  201
-rw-r--r--  net/ipv6/ip6_fib.c        | 1199
-rw-r--r--  net/ipv6/ip6_fw.c         |   16
-rw-r--r--  net/ipv6/ip6_input.c      |  244
-rw-r--r--  net/ipv6/ip6_output.c     |  451
-rw-r--r--  net/ipv6/ipv6_sockglue.c  |  145
-rw-r--r--  net/ipv6/mcast.c          |   88
-rw-r--r--  net/ipv6/ndisc.c          |  104
-rw-r--r--  net/ipv6/proc.c           |  106
-rw-r--r--  net/ipv6/raw.c            |   75
-rw-r--r--  net/ipv6/reassembly.c     |  358
-rw-r--r--  net/ipv6/route.c          | 1198
-rw-r--r--  net/ipv6/sit.c            |   14
-rw-r--r--  net/ipv6/tcp_ipv6.c       |  401
-rw-r--r--  net/ipv6/udp.c            |  276
19 files changed, 3447 insertions, 2335 deletions
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 329807093..a61be48c8 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: addrconf.c,v 1.43 1998/07/15 05:05:32 davem Exp $ + * $Id: addrconf.c,v 1.45 1998/08/26 12:04:41 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -38,6 +38,7 @@ #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif +#include <linux/delay.h> #include <linux/proc_fs.h> #include <net/sock.h> @@ -53,7 +54,6 @@ #include <linux/rtnetlink.h> #include <asm/uaccess.h> -#include <asm/delay.h> /* Set to 3 to get tracing... */ #define ACONF_DEBUG 2 @@ -100,7 +100,7 @@ struct ipv6_devconf ipv6_devconf = { 0, /* forwarding */ IPV6_DEFAULT_HOPLIMIT, /* hop limit */ - 576, /* mtu */ + IPV6_MIN_MTU, /* mtu */ 1, /* accept RAs */ 1, /* accept redirects */ 1, /* autoconfiguration */ @@ -114,7 +114,7 @@ static struct ipv6_devconf ipv6_devconf_dflt = { 0, /* forwarding */ IPV6_DEFAULT_HOPLIMIT, /* hop limit */ - 576, /* mtu */ + IPV6_MIN_MTU, /* mtu */ 1, /* accept RAs */ 1, /* accept redirects */ 1, /* autoconfiguration */ @@ -185,7 +185,7 @@ static struct inet6_dev * ipv6_add_dev(struct device *dev) struct inet6_dev *ndev, **bptr, *iter; int hash; - if (dev->mtu < 576) + if (dev->mtu < IPV6_MIN_MTU) return NULL; ndev = kmalloc(sizeof(struct inet6_dev), gfp_any()); @@ -548,7 +548,6 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct device *dev, unsigned long expires, unsigned flags) { struct in6_rtmsg rtmsg; - int err; memset(&rtmsg, 0, sizeof(rtmsg)); memcpy(&rtmsg.rtmsg_dst, pfx, sizeof(struct in6_addr)); @@ -566,7 +565,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct device *dev, if (dev->type == ARPHRD_SIT && (dev->flags&IFF_POINTOPOINT)) rtmsg.rtmsg_flags |= RTF_NONEXTHOP; - ip6_route_add(&rtmsg, &err); + ip6_route_add(&rtmsg); } /* Create "default" multicast route to the interface */ @@ -574,7 +573,6 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct device *dev, static void addrconf_add_mroute(struct device *dev) { struct in6_rtmsg rtmsg; - int err; memset(&rtmsg, 0, sizeof(rtmsg)); ipv6_addr_set(&rtmsg.rtmsg_dst, @@ -584,13 +582,12 @@ static void addrconf_add_mroute(struct device *dev) rtmsg.rtmsg_ifindex = dev->ifindex; rtmsg.rtmsg_flags = RTF_UP|RTF_ADDRCONF; rtmsg.rtmsg_type = RTMSG_NEWROUTE; - ip6_route_add(&rtmsg, &err); + ip6_route_add(&rtmsg); } static void sit_route_add(struct device *dev) { struct in6_rtmsg rtmsg; - int err; memset(&rtmsg, 0, sizeof(rtmsg)); @@ -602,7 +599,7 @@ static void sit_route_add(struct device *dev) rtmsg.rtmsg_flags = RTF_UP|RTF_NONEXTHOP; rtmsg.rtmsg_ifindex = dev->ifindex; - ip6_route_add(&rtmsg, &err); + ip6_route_add(&rtmsg); } static void addrconf_add_lroute(struct device *dev) @@ -690,13 +687,12 @@ void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len) else rt_expires = jiffies + valid_lft * HZ; - rt = rt6_lookup(&pinfo->prefix, NULL, dev->ifindex, RTF_LINKRT); + rt = rt6_lookup(&pinfo->prefix, NULL, dev->ifindex, 1); if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) { if (rt->rt6i_flags&RTF_EXPIRES) { if (pinfo->onlink == 0 || valid_lft == 0) { ip6_del_rt(rt); - rt = NULL; } else { rt->rt6i_expires = rt_expires; } @@ -705,6 +701,8 @@ void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len) addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, dev, rt_expires, RTF_ADDRCONF|RTF_EXPIRES); 
} + if (rt) + dst_release(&rt->u.dst); /* Try to figure out our local address for this prefix */ @@ -1118,11 +1116,17 @@ int addrconf_notify(struct notifier_block *this, unsigned long event, break; case NETDEV_CHANGEMTU: - /* BUGGG... Should scan FIB to change pmtu on routes. --ANK */ - if (dev->mtu >= 576) + if (dev->mtu >= IPV6_MIN_MTU) { + struct inet6_dev *idev; + + if ((idev = ipv6_find_idev(dev)) == NULL) + break; + idev->cnf.mtu6 = dev->mtu; + rt6_mtu_change(dev, dev->mtu); break; + } - /* MTU falled under 576. Stop IPv6 on this interface. */ + /* MTU falled under IPV6_MIN_MTU. Stop IPv6 on this interface. */ case NETDEV_DOWN: case NETDEV_UNREGISTER: @@ -1240,7 +1244,6 @@ static void addrconf_rs_timer(unsigned long data) add_timer(&ifp->timer); } else { struct in6_rtmsg rtmsg; - int err; printk(KERN_DEBUG "%s: no IPv6 routers present\n", ifp->idev->dev->name); @@ -1253,7 +1256,7 @@ static void addrconf_rs_timer(unsigned long data) rtmsg.rtmsg_ifindex = ifp->idev->dev->ifindex; - ip6_route_add(&rtmsg, &err); + ip6_route_add(&rtmsg); } } @@ -1501,7 +1504,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) } static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, - pid_t pid, u32 seq, int event) + u32 pid, u32 seq, int event) { struct ifaddrmsg *ifm; struct nlmsghdr *nlh; @@ -1659,8 +1662,11 @@ int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp, addrconf_forward_change(idev); - if (*valp) + if (*valp) { + start_bh_atomic(); rt6_purge_dflt_routers(0); + end_bh_atomic(); + } } return ret; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 051f9a28e..a9ee64925 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -7,7 +7,7 @@ * * Adapted from linux/net/ipv4/af_inet.c * - * $Id: af_inet6.c,v 1.36 1998/06/10 07:29:25 davem Exp $ + * $Id: af_inet6.c,v 1.37 1998/08/26 12:04:45 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -64,6 +64,7 @@ extern int raw6_get_info(char *, char **, off_t, int, int); extern int tcp6_get_info(char *, char **, off_t, int, int); extern int udp6_get_info(char *, char **, off_t, int, int); extern int afinet6_get_info(char *, char **, off_t, int, int); +extern int afinet6_get_snmp(char *, char **, off_t, int, int); #endif #ifdef CONFIG_SYSCTL @@ -243,10 +244,49 @@ static int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) static int inet6_release(struct socket *sock, struct socket *peer) { + struct sock *sk = sock->sk; + + if (sk == NULL) + return -EINVAL; + + /* Free mc lists */ + ipv6_sock_mc_close(sk); + + /* Huh! MOD_DEC_USE_COUNT was here :-( + It is impossible by two reasons: socket destroy + may be delayed and inet_release may sleep and + return to nowhere then. It should be moved to + inet6_destroy_sock(), but we have no explicit constructor :-( + --ANK (980802) + */ MOD_DEC_USE_COUNT; return inet_release(sock, peer); } +int inet6_destroy_sock(struct sock *sk) +{ + struct sk_buff *skb; + struct ipv6_txoptions *opt; + + /* + * Release destination entry + */ + + dst_release(xchg(&sk->dst_cache,NULL)); + + /* Release rx options */ + + if ((skb = xchg(&sk->net_pinfo.af_inet6.pktoptions, NULL)) != NULL) + kfree_skb(skb); + + /* Free tx options */ + + if ((opt = xchg(&sk->net_pinfo.af_inet6.opt, NULL)) != NULL) + sock_kfree_s(sk, opt, opt->tot_len); + + return 0; +} + /* * This does both peername and sockname. 
*/ @@ -412,6 +452,12 @@ static struct proc_dir_entry proc_net_sockstat6 = { 0, &proc_net_inode_operations, afinet6_get_info }; +static struct proc_dir_entry proc_net_snmp6 = { + PROC_NET_SNMP6, 5, "snmp6", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + afinet6_get_snmp +}; #endif /* CONFIG_PROC_FS */ #ifdef MODULE @@ -445,7 +491,7 @@ __initfunc(void inet6_proto_init(struct net_proto *pro)) printk(KERN_INFO "IPv6 v0.2 for NET3.037\n"); - if (sizeof(struct ipv6_options) > sizeof(dummy_skb->cb)) + if (sizeof(struct inet6_skb_parm) > sizeof(dummy_skb->cb)) { printk(KERN_CRIT "inet6_proto_init: size fault\n"); #ifdef MODULE @@ -490,6 +536,7 @@ __initfunc(void inet6_proto_init(struct net_proto *pro)) proc_net_register(&proc_net_tcp6); proc_net_register(&proc_net_udp6); proc_net_register(&proc_net_sockstat6); + proc_net_register(&proc_net_snmp6); #endif /* Now the userspace is allowed to create INET6 sockets. */ @@ -526,6 +573,7 @@ void cleanup_module(void) proc_net_unregister(proc_net_tcp6.low_ino); proc_net_unregister(proc_net_udp6.low_ino); proc_net_unregister(proc_net_sockstat6.low_ino); + proc_net_unregister(proc_net_snmp6.low_ino); #endif /* Cleanup code parts. */ sit_cleanup(); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index b87f31b06..51960bd26 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: datagram.c,v 1.14 1998/03/20 09:12:15 davem Exp $ + * $Id: datagram.c,v 1.15 1998/08/26 12:04:47 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -32,48 +32,72 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; - struct ipv6_options *opt = (struct ipv6_options *) skb->cb; - - if (np->rxinfo) { + struct inet6_skb_parm *opt = (struct inet6_skb_parm *) skb->cb; + + if (np->rxopt.bits.rxinfo) { struct in6_pktinfo src_info; - src_info.ipi6_ifindex = skb->dev->ifindex; + src_info.ipi6_ifindex = opt->iif; ipv6_addr_copy(&src_info.ipi6_addr, &skb->nh.ipv6h->daddr); put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } - if (np->rxhlim) { + if (np->rxopt.bits.rxhlim) { int hlim = skb->nh.ipv6h->hop_limit; put_cmsg(msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); } - if (opt->srcrt) { - int hdrlen = sizeof(struct rt0_hdr) + (opt->srcrt->hdrlen << 3); - - put_cmsg(msg, SOL_IPV6, IPV6_RXSRCRT, hdrlen, opt->srcrt); + if (np->rxopt.bits.hopopts && opt->hop) { + u8 *ptr = skb->nh.raw + opt->hop; + put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr); + } + if (np->rxopt.bits.dstopts && opt->dst0) { + u8 *ptr = skb->nh.raw + opt->dst0; + put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, (ptr[1]+1)<<3, ptr); + } + if (np->rxopt.bits.srcrt && opt->srcrt) { + struct ipv6_rt_hdr *rthdr = (struct ipv6_rt_hdr *)(skb->nh.raw + opt->srcrt); + put_cmsg(msg, SOL_IPV6, IPV6_RTHDR, (rthdr->hdrlen+1) << 3, rthdr); + } + if (np->rxopt.bits.authhdr && opt->auth) { + u8 *ptr = skb->nh.raw + opt->auth; + put_cmsg(msg, SOL_IPV6, IPV6_AUTHHDR, (ptr[1]+1)<<2, ptr); + } + if (np->rxopt.bits.dstopts && opt->dst1) { + u8 *ptr = skb->nh.raw + opt->dst1; + put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, (ptr[1]+1)<<3, ptr); } return 0; } int datagram_send_ctl(struct msghdr *msg, int *oif, - struct in6_addr **src_addr, struct ipv6_options *opt, + struct in6_addr **src_addr, struct ipv6_txoptions *opt, int *hlimit) { struct in6_pktinfo *src_info; struct 
cmsghdr *cmsg; struct ipv6_rt_hdr *rthdr; + struct ipv6_opt_hdr *hdr; int len; int err = 0; for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + + if ((unsigned long)(((char*)cmsg - (char*)msg->msg_control) + + cmsg->cmsg_len) > msg->msg_controllen) { + err = -EINVAL; + goto exit_f; + } + if (cmsg->cmsg_level != SOL_IPV6) { - printk(KERN_DEBUG "invalid cmsg_level %d\n", cmsg->cmsg_level); + if (net_ratelimit()) + printk(KERN_DEBUG "invalid cmsg_level %d\n", cmsg->cmsg_level); continue; } switch (cmsg->cmsg_type) { case IPV6_PKTINFO: - if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in6_pktinfo))) { + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) { err = -EINVAL; goto exit_f; } @@ -100,14 +124,77 @@ int datagram_send_ctl(struct msghdr *msg, int *oif, } break; - - case IPV6_RXSRCRT: + + case IPV6_HOPOPTS: + if (opt->hopopt || cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { + err = -EINVAL; + goto exit_f; + } + + hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); + len = ((hdr->hdrlen + 1) << 3); + if (cmsg->cmsg_len < CMSG_LEN(len)) { + err = -EINVAL; + goto exit_f; + } + if (!capable(CAP_NET_RAW)) { + err = -EPERM; + goto exit_f; + } + opt->opt_nflen += len; + opt->hopopt = hdr; + break; + + case IPV6_DSTOPTS: + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { + err = -EINVAL; + goto exit_f; + } + + hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); + len = ((hdr->hdrlen + 1) << 3); + if (cmsg->cmsg_len < CMSG_LEN(len)) { + err = -EINVAL; + goto exit_f; + } + if (!capable(CAP_NET_RAW)) { + err = -EPERM; + goto exit_f; + } + if (opt->dst1opt) { + err = -EINVAL; + goto exit_f; + } + opt->opt_flen += len; + opt->dst1opt = hdr; + break; + + case IPV6_AUTHHDR: + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { + err = -EINVAL; + goto exit_f; + } + + hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); + len = ((hdr->hdrlen + 2) << 2); + if (cmsg->cmsg_len < CMSG_LEN(len)) { + err = -EINVAL; + goto exit_f; + } + if (len & ~7) { + err = -EINVAL; + goto exit_f; + } + opt->opt_flen += len; + opt->auth = hdr; + break; + + case IPV6_RTHDR: if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_rt_hdr))) { err = -EINVAL; goto exit_f; } - len = cmsg->cmsg_len - sizeof(struct cmsghdr); rthdr = (struct ipv6_rt_hdr *)CMSG_DATA(cmsg); /* @@ -118,7 +205,9 @@ int datagram_send_ctl(struct msghdr *msg, int *oif, goto exit_f; } - if (((rthdr->hdrlen + 1) << 3) < len) { + len = ((rthdr->hdrlen + 1) << 3); + + if (cmsg->cmsg_len < CMSG_LEN(len)) { err = -EINVAL; goto exit_f; } @@ -128,12 +217,21 @@ int datagram_send_ctl(struct msghdr *msg, int *oif, err = -EINVAL; goto exit_f; } - - opt->opt_nflen += ((rthdr->hdrlen + 1) << 3); + + opt->opt_nflen += len; opt->srcrt = rthdr; + if (opt->dst1opt) { + int dsthdrlen = ((opt->dst1opt->hdrlen+1)<<3); + + opt->opt_nflen += dsthdrlen; + opt->dst0opt = opt->dst1opt; + opt->dst1opt = NULL; + opt->opt_flen -= dsthdrlen; + } + break; - + case IPV6_HOPLIMIT: if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) { err = -EINVAL; diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 0b826870f..89d58936d 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -5,8 +5,9 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Andi Kleen <ak@muc.de> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * - * $Id: exthdrs.c,v 1.6 1998/04/30 16:24:20 freitag Exp $ + * $Id: exthdrs.c,v 1.7 1998/08/26 12:04:49 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -37,55 
+38,192 @@ #include <asm/uaccess.h> -#define swap(a,b) do { typeof (a) tmp; tmp = (a); (a) = (b); (b) = (tmp); } while(0) +/* + * Parsing inbound headers. + * + * Parsing function "func" returns pointer to the place, + * where next nexthdr value is stored or NULL, if parsing + * failed. It should also update skb->h. + */ + +struct hdrtype_proc +{ + int type; + u8* (*func) (struct sk_buff **, u8 *ptr); +}; /* - * inbound + * Parsing tlv encoded headers. + * + * Parsing function "func" returns 1, if parsing succeed + * and 0, if it failed. + * It MUST NOT touch skb->h. */ -#if 0 -int ipv6_routing_header(struct sk_buff **skb_ptr, struct device *dev, - __u8 *nhptr, struct ipv6_options *opt) + +struct tlvtype_proc +{ + int type; + int (*func) (struct sk_buff *, __u8 *ptr); +}; + +/********************* + Generic functions + *********************/ + +/* An unknown option is detected, decide what to do */ + +int ip6_tlvopt_unknown(struct sk_buff *skb, u8 *opt) +{ + switch ((opt[0] & 0xC0) >> 6) { + case 0: /* ignore */ + return 1; + + case 1: /* drop packet */ + break; + + case 3: /* Send ICMP if not a multicast address and drop packet */ + /* Actually, it is redundant check. icmp_send + will recheck in any case. + */ + if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) + break; + case 2: /* send ICMP PARM PROB regardless and drop packet */ + icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, opt); + return 0; + }; + + kfree_skb(skb); + return 0; +} + +/* Parse tlv encoded option header (hop-by-hop or destination) */ + +static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff *skb, + __u8 *nhptr) +{ + struct tlvtype_proc *curr; + u8 *ptr = skb->h.raw; + int len = ((ptr[1]+1)<<3) - 2; + + ptr += 2; + + if (skb->tail - (ptr + len) < 0) { + kfree_skb(skb); + return 0; + } + + while (len > 0) { + int optlen = ptr[1]+2; + + switch (ptr[0]) { + case IPV6_TLV_PAD0: + optlen = 1; + break; + + case IPV6_TLV_PADN: + break; + + default: /* Other TLV code so scan list */ + for (curr=procs; curr->type >= 0; curr++) { + if (curr->type == ptr[0]) { + if (curr->func(skb, ptr) == 0) + return 0; + break; + } + } + if (curr->type < 0) { + if (ip6_tlvopt_unknown(skb, ptr) == 0) + return 0; + } + break; + } + ptr += optlen; + len -= optlen; + } + if (len == 0) + return 1; + kfree_skb(skb); + return 0; +} + +/***************************** + Destination options header. + *****************************/ + +struct tlvtype_proc tlvprocdestopt_lst[] = { + /* No destination options are defined now */ + {-1, NULL} +}; + +static u8 *ipv6_dest_opt(struct sk_buff **skb_ptr, u8 *nhptr) +{ + struct sk_buff *skb=*skb_ptr; + struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb; + struct ipv6_destopt_hdr *hdr = (struct ipv6_destopt_hdr *) skb->h.raw; + + opt->dst1 = (u8*)hdr - skb->nh.raw; + + if (ip6_parse_tlv(tlvprocdestopt_lst, skb, nhptr)) { + skb->h.raw += ((hdr->hdrlen+1)<<3); + return &hdr->nexthdr; + } + + return NULL; +} + +/******************************** + NONE header. No data in packet. + ********************************/ + +static u8 *ipv6_nodata(struct sk_buff **skb_ptr, u8 *nhptr) +{ + kfree_skb(*skb_ptr); + return NULL; +} + +/******************************** + Routing header. 
+ ********************************/ + +static u8* ipv6_routing_header(struct sk_buff **skb_ptr, u8 *nhptr) { struct sk_buff *skb = *skb_ptr; + struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb; struct in6_addr *addr; struct in6_addr daddr; - int addr_type = 0; - int strict = 0; - __u32 bit_map; - int pos; + int addr_type; int n, i; struct ipv6_rt_hdr *hdr = (struct ipv6_rt_hdr *) skb->h.raw; struct rt0_hdr *rthdr; - if (hdr->segments_left == 0) { - struct ipv6_options *opt; - - opt = (struct ipv6_options *) skb->cb; - opt->srcrt = hdr; + if (((hdr->hdrlen+1)<<3) > skb->tail - skb->h.raw) { + ipv6_statistics.Ip6InHdrErrors++; + kfree_skb(skb); + return NULL; + } +looped_back: + if (hdr->segments_left == 0) { + opt->srcrt = (u8*)hdr - skb->nh.raw; skb->h.raw += (hdr->hdrlen + 1) << 3; - return hdr->nexthdr; + opt->dst0 = opt->dst1; + opt->dst1 = 0; + return &hdr->nexthdr; } - if (hdr->type != IPV6_SRCRT_TYPE_0 || hdr->hdrlen & 0x01 || - hdr->hdrlen > 46) { - /* - * Discard - */ - - pos = (__u8 *) hdr - (__u8 *) skb->nh.ipv6h + 2; + if (hdr->type != IPV6_SRCRT_TYPE_0 || hdr->hdrlen & 0x01) { + u8 *pos = (u8*) hdr; - if (hdr->type) + if (hdr->type != IPV6_SRCRT_TYPE_0) pos += 2; else pos += 1; - icmpv6_send(skb, ICMPV6_PARAMETER_PROB, 0, pos, dev); - kfree_skb(skb); - return 0; + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, pos); + return NULL; } - + /* * This is the routing header forwarding algorithm from * RFC 1883, page 17. @@ -94,13 +232,21 @@ int ipv6_routing_header(struct sk_buff **skb_ptr, struct device *dev, n = hdr->hdrlen >> 1; if (hdr->segments_left > n) { - pos = (__u8 *) hdr - (__u8 *) skb->nh.ipv6h + 2; - - pos += 3; + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, &hdr->segments_left); + return NULL; + } - icmpv6_send(skb, ICMPV6_PARAMETER_PROB, 0, pos, dev); + /* We are about to mangle packet header. Be careful! + Do not damage packets queued somewhere. + */ + if (skb_cloned(skb)) { + struct sk_buff *skb2 = skb_copy(skb, GFP_ATOMIC); kfree_skb(skb); - return 0; + if (skb2 == NULL) + return NULL; + *skb_ptr = skb = skb2; + opt = (struct inet6_skb_parm *)skb2->cb; + hdr = (struct ipv6_rt_hdr *) skb2->h.raw; } i = n - --hdr->segments_left; @@ -113,58 +259,429 @@ int ipv6_routing_header(struct sk_buff **skb_ptr, struct device *dev, if (addr_type == IPV6_ADDR_MULTICAST) { kfree_skb(skb); - return 0; + return NULL; } ipv6_addr_copy(&daddr, addr); ipv6_addr_copy(addr, &skb->nh.ipv6h->daddr); ipv6_addr_copy(&skb->nh.ipv6h->daddr, &daddr); - /* - * Check Strick Source Route + dst_release(xchg(&skb->dst, NULL)); + ip6_route_input(skb); + if (skb->dst->error) { + skb->dst->input(skb); + return NULL; + } + if (skb->dst->dev->flags&IFF_LOOPBACK) { + if (skb->nh.ipv6h->hop_limit <= 1) { + icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, + 0, skb->dev); + kfree_skb(skb); + return NULL; + } + skb->nh.ipv6h->hop_limit--; + goto looped_back; + } + + skb->dst->input(skb); + return NULL; +} + +/* + This function inverts received rthdr. + NOTE: specs allow to make it automatically only if + packet authenticated. + + I will not discuss it here (though, I am really pissed off at + this stupid requirement making rthdr idea useless) + + Actually, it creates severe problems for us. + Embrionic requests has no associated sockets, + so that user have no control over it and + cannot not only to set reply options, but + even to know, that someone wants to connect + without success. :-( + + For now we need to test the engine, so that I created + temporary (or permanent) backdoor. 
+ If listening socket set IPV6_RTHDR to 2, then we invert header. + --ANK (980729) + */ + +struct ipv6_txoptions * +ipv6_invert_rthdr(struct sock *sk, struct ipv6_rt_hdr *hdr) +{ + /* Received rthdr: + + [ H1 -> H2 -> ... H_prev ] daddr=ME + + Inverted result: + [ H_prev -> ... -> H1 ] daddr =sender + + Note, that IP output engine will rewrire this rthdr + by rotating it left by one addr. */ - bit_map = ntohl(rthdr->bitmap); + int n, i; + struct rt0_hdr *rthdr = (struct rt0_hdr*)hdr; + struct rt0_hdr *irthdr; + struct ipv6_txoptions *opt; + int hdrlen = ipv6_optlen(hdr); + + if (hdr->segments_left || + hdr->type != IPV6_SRCRT_TYPE_0 || + hdr->hdrlen & 0x01) + return NULL; - if ((bit_map & (1 << i)) == IPV6_SRCRT_STRICT) - strict = 1; + n = hdr->hdrlen >> 1; + opt = sock_kmalloc(sk, sizeof(*opt) + hdrlen, GFP_ATOMIC); + if (opt == NULL) + return NULL; + memset(opt, 0, sizeof(*opt)); + opt->tot_len = sizeof(*opt) + hdrlen; + opt->srcrt = (void*)(opt+1); + opt->opt_nflen = hdrlen; + + memcpy(opt->srcrt, hdr, sizeof(*hdr)); + irthdr = (struct rt0_hdr*)opt->srcrt; + /* Obsolete field, MBZ, when originated by us */ + irthdr->bitmap = 0; + opt->srcrt->segments_left = n; + for (i=0; i<n; i++) + memcpy(irthdr->addr+i, rthdr->addr+(n-1-i), 16); + return opt; +} - ipv6_forward(skb, dev, (strict ? IP6_FW_STRICT : 0) | IP6_FW_SRCRT); +/******************************** + AUTH header. + ********************************/ +/* + rfc1826 said, that if a host does not implement AUTH header + it MAY ignore it. We use this hole 8) + + Actually, now we can implement OSPFv6 without kernel IPsec. + Authentication for poors may be done in user space with the same success. + + Yes, it means, that we allow application to send/receive + raw authentication header. Apparently, we suppose, that it knows + what it does and calculates authentication data correctly. + Certainly, it is possible only for udp and raw sockets, but not for tcp. + + BTW I beg pardon, it is not good place for flames, but + I cannot be silent 8) It is very sad, but fools prevail 8) + AUTH header has 4byte granular length, what kills all the idea + behind AUTOMATIC 64bit alignment of IPv6. Now we will loose + cpu ticks, checking that sender did not something stupid + and opt->hdrlen is even. Shit! --ANK (980730) + */ + +static u8 *ipv6_auth_hdr(struct sk_buff **skb_ptr, u8 *nhptr) +{ + struct sk_buff *skb=*skb_ptr; + struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb; + struct ipv6_opt_hdr *hdr = (struct ipv6_opt_hdr *)skb->h.raw; + int len = (hdr->hdrlen+2)<<2; + + opt->auth = (u8*)hdr - skb->nh.raw; + if (skb->h.raw + len > skb->tail) + return NULL; + skb->h.raw += len; + return &hdr->nexthdr; +} + +/* This list MUST NOT contain entry for NEXTHDR_HOP. + It is parsed immediately after packet received + and if it occurs somewhere in another place we must + generate error. 
+ */ + +struct hdrtype_proc hdrproc_lst[] = { + {NEXTHDR_FRAGMENT, ipv6_reassembly}, + {NEXTHDR_ROUTING, ipv6_routing_header}, + {NEXTHDR_DEST, ipv6_dest_opt}, + {NEXTHDR_NONE, ipv6_nodata}, + {NEXTHDR_AUTH, ipv6_auth_hdr}, + /* + {NEXTHDR_ESP, ipv6_esp_hdr}, + */ + {-1, NULL} +}; + +u8 *ipv6_parse_exthdrs(struct sk_buff **skb_in, u8 *nhptr) +{ + struct hdrtype_proc *hdrt; + u8 nexthdr = *nhptr; + +restart: + for (hdrt=hdrproc_lst; hdrt->type >= 0; hdrt++) { + if (hdrt->type == nexthdr) { + if ((nhptr = hdrt->func(skb_in, nhptr)) != NULL) { + nexthdr = *nhptr; + goto restart; + } + return NULL; + } + } + return nhptr; +} + + +/********************************** + Hop-by-hop options. + **********************************/ + +/* Router Alert as of draft-ietf-ipngwg-ipv6router-alert-04 */ + +static int ipv6_hop_ra(struct sk_buff *skb, u8 *ptr) +{ + if (ptr[1] == 2) { + ((struct inet6_skb_parm*)skb->cb)->ra = ptr - skb->nh.raw; + return 1; + } + if (net_ratelimit()) + printk(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", ptr[1]); + kfree_skb(skb); return 0; } +/* Jumbo payload */ + +static int ipv6_hop_jumbo(struct sk_buff *skb, u8 *ptr) +{ + u32 pkt_len; + + if (ptr[1] != 4 || ((ptr-skb->nh.raw)&3) != 2) { + if (net_ratelimit()) + printk(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", ptr[1]); + goto drop; + } + + pkt_len = ntohl(*(u32*)(ptr+2)); + if (pkt_len < 0x10000) { + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ptr+2); + return 0; + } + if (skb->nh.ipv6h->payload_len) { + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ptr); + return 0; + } + + if (pkt_len > skb->len - sizeof(struct ipv6hdr)) { + ipv6_statistics.Ip6InTruncatedPkts++; + goto drop; + } + skb_trim(skb, pkt_len + sizeof(struct ipv6hdr)); + return 1; + +drop: + kfree_skb(skb); + return 0; +} + +struct tlvtype_proc tlvprochopopt_lst[] = { + {IPV6_TLV_ROUTERALERT, ipv6_hop_ra}, + {IPV6_TLV_JUMBO, ipv6_hop_jumbo}, + {-1, NULL} +}; + +u8 * ipv6_parse_hopopts(struct sk_buff *skb, u8 *nhptr) +{ + ((struct inet6_skb_parm*)skb->cb)->hop = sizeof(struct ipv6hdr); + if (ip6_parse_tlv(tlvprochopopt_lst, skb, nhptr)) + return nhptr+((nhptr[1]+1)<<3); + return NULL; +} /* - * outbound + * Creating outbound headers. + * + * "build" functions work when skb is filled from head to tail (datagram) + * "push" functions work when headers are added from tail to head (tcp) + * + * In both cases we assume, that caller reserved enough room + * for headers. 
*/ -int ipv6opt_bld_rthdr(struct sk_buff *skb, struct ipv6_options *opt, - struct in6_addr *addr) +u8 *ipv6_build_rthdr(struct sk_buff *skb, u8 *prev_hdr, + struct ipv6_rt_hdr *opt, struct in6_addr *addr) { struct rt0_hdr *phdr, *ihdr; int hops; - ihdr = (struct rt0_hdr *) opt->srcrt; + ihdr = (struct rt0_hdr *) opt; phdr = (struct rt0_hdr *) skb_put(skb, (ihdr->rt_hdr.hdrlen + 1) << 3); - memcpy(phdr, ihdr, sizeof(struct ipv6_rt_hdr)); + memcpy(phdr, ihdr, sizeof(struct rt0_hdr)); hops = ihdr->rt_hdr.hdrlen >> 1; - + if (hops > 1) memcpy(phdr->addr, ihdr->addr + 1, (hops - 1) * sizeof(struct in6_addr)); ipv6_addr_copy(phdr->addr + (hops - 1), addr); + + phdr->rt_hdr.nexthdr = *prev_hdr; + *prev_hdr = NEXTHDR_ROUTING; + return &phdr->rt_hdr.nexthdr; +} + +static u8 *ipv6_build_exthdr(struct sk_buff *skb, u8 *prev_hdr, u8 type, struct ipv6_opt_hdr *opt) +{ + struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb_put(skb, ipv6_optlen(opt)); + + memcpy(h, opt, ipv6_optlen(opt)); + h->nexthdr = *prev_hdr; + *prev_hdr = type; + return &h->nexthdr; +} + +static u8 *ipv6_build_authhdr(struct sk_buff *skb, u8 *prev_hdr, struct ipv6_opt_hdr *opt) +{ + struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb_put(skb, (opt->hdrlen+2)<<2); + + memcpy(h, opt, (opt->hdrlen+2)<<2); + h->nexthdr = *prev_hdr; + *prev_hdr = NEXTHDR_AUTH; + return &h->nexthdr; +} + + +u8 *ipv6_build_nfrag_opts(struct sk_buff *skb, u8 *prev_hdr, struct ipv6_txoptions *opt, + struct in6_addr *daddr, u32 jumbolen) +{ + struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb->data; + + if (opt && opt->hopopt) + prev_hdr = ipv6_build_exthdr(skb, prev_hdr, NEXTHDR_HOP, opt->hopopt); + + if (jumbolen) { + u8 *jumboopt = (u8 *)skb_put(skb, 8); + + if (opt && opt->hopopt) { + *jumboopt++ = IPV6_TLV_PADN; + *jumboopt++ = 0; + h->hdrlen++; + } else { + h = (struct ipv6_opt_hdr *)jumboopt; + h->nexthdr = *prev_hdr; + h->hdrlen = 0; + jumboopt += 2; + *prev_hdr = NEXTHDR_HOP; + prev_hdr = &h->nexthdr; + } + jumboopt[0] = IPV6_TLV_JUMBO; + jumboopt[1] = 4; + *(u32*)(jumboopt+2) = htonl(jumbolen); + } + if (opt) { + if (opt->dst0opt) + prev_hdr = ipv6_build_exthdr(skb, prev_hdr, NEXTHDR_DEST, opt->dst0opt); + if (opt->srcrt) + prev_hdr = ipv6_build_rthdr(skb, prev_hdr, opt->srcrt, daddr); + } + return prev_hdr; +} + +u8 *ipv6_build_frag_opts(struct sk_buff *skb, u8 *prev_hdr, struct ipv6_txoptions *opt) +{ + if (opt->auth) + prev_hdr = ipv6_build_authhdr(skb, prev_hdr, opt->auth); + if (opt->dst1opt) + prev_hdr = ipv6_build_exthdr(skb, prev_hdr, NEXTHDR_DEST, opt->dst1opt); + return prev_hdr; +} + +static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto, + struct ipv6_rt_hdr *opt, + struct in6_addr **addr_p) +{ + struct rt0_hdr *phdr, *ihdr; + int hops; + + ihdr = (struct rt0_hdr *) opt; - phdr->rt_hdr.nexthdr = proto; - return NEXTHDR_ROUTING; + phdr = (struct rt0_hdr *) skb_push(skb, (ihdr->rt_hdr.hdrlen + 1) << 3); + memcpy(phdr, ihdr, sizeof(struct rt0_hdr)); + + hops = ihdr->rt_hdr.hdrlen >> 1; + + if (hops > 1) + memcpy(phdr->addr, ihdr->addr + 1, + (hops - 1) * sizeof(struct in6_addr)); + + ipv6_addr_copy(phdr->addr + (hops - 1), *addr_p); + *addr_p = ihdr->addr; + + phdr->rt_hdr.nexthdr = *proto; + *proto = NEXTHDR_ROUTING; +} + +static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv6_opt_hdr *opt) +{ + struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb_push(skb, ipv6_optlen(opt)); + + memcpy(h, opt, ipv6_optlen(opt)); + h->nexthdr = *proto; + *proto = type; } -#endif + +static void ipv6_push_authhdr(struct 
sk_buff *skb, u8 *proto, struct ipv6_opt_hdr *opt) +{ + struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb_push(skb, (opt->hdrlen+2)<<2); + + memcpy(h, opt, (opt->hdrlen+2)<<2); + h->nexthdr = *proto; + *proto = NEXTHDR_AUTH; +} + +void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, + u8 *proto, + struct in6_addr **daddr) +{ + if (opt->srcrt) + ipv6_push_rthdr(skb, proto, opt->srcrt, daddr); + if (opt->dst0opt) + ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt); + if (opt->hopopt) + ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt); +} + +void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto) +{ + if (opt->dst1opt) + ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt); + if (opt->auth) + ipv6_push_authhdr(skb, proto, opt->auth); +} + +struct ipv6_txoptions * +ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) +{ + struct ipv6_txoptions *opt2; + + opt2 = sock_kmalloc(sk, opt->tot_len, GFP_ATOMIC); + if (opt2) { + long dif = (char*)opt2 - (char*)opt; + memcpy(opt2, opt, opt->tot_len); + if (opt2->hopopt) + *((char**)&opt2->hopopt) += dif; + if (opt2->dst0opt) + *((char**)&opt2->dst0opt) += dif; + if (opt2->dst1opt) + *((char**)&opt2->dst1opt) += dif; + if (opt2->auth) + *((char**)&opt2->auth) += dif; + if (opt2->srcrt) + *((char**)&opt2->srcrt) += dif; + } + return opt2; +} + /* - * find out if nexthdr is an extension header or a protocol + * find out if nexthdr is a well-known extension header or a protocol */ static __inline__ int ipv6_ext_hdr(u8 nexthdr) @@ -175,11 +692,9 @@ static __inline__ int ipv6_ext_hdr(u8 nexthdr) return ( (nexthdr == NEXTHDR_HOP) || (nexthdr == NEXTHDR_ROUTING) || (nexthdr == NEXTHDR_FRAGMENT) || - (nexthdr == NEXTHDR_ESP) || (nexthdr == NEXTHDR_AUTH) || (nexthdr == NEXTHDR_NONE) || (nexthdr == NEXTHDR_DEST) ); - } /* @@ -200,34 +715,57 @@ static __inline__ int ipv6_ext_hdr(u8 nexthdr) * * But I see no other way to do this. This might need to be reexamined * when Linux implements ESP (and maybe AUTH) headers. + * --AK + * + * This function parses (probably truncated) exthdr set "hdr" + * of length "len". "nexthdrp" initially points to some place, + * where type of the first header can be found. + * + * It skips all well-known exthdrs, and returns pointer to the start + * of unparsable area i.e. the first header with unknown type. + * If it is not NULL *nexthdr is updated by type/protocol of this header. + * + * NOTES: - if packet terminated with NEXTHDR_NONE it returns NULL. + * - it may return pointer pointing beyond end of packet, + * if the last recognized header is truncated in the middle. + * - if packet is truncated, so that all parsed headers are skipped, + * it returns NULL. + * - First fragment header is skipped, not-first ones + * are considered as unparsable. + * - ESP is unparsable for now and considered like + * normal payload protocol. + * - Note also special handling of AUTH header. Thanks to IPsec wizards. 
+ * + * --ANK (980726) */ -struct ipv6_opt_hdr *ipv6_skip_exthdr(struct ipv6_opt_hdr *hdr, - u8 *nexthdrp, int len) + +u8 *ipv6_skip_exthdr(struct ipv6_opt_hdr *hdr, u8 *nexthdrp, int len) { u8 nexthdr = *nexthdrp; while (ipv6_ext_hdr(nexthdr)) { int hdrlen; - - if (nexthdr == NEXTHDR_NONE) + + if (len < sizeof(struct ipv6_opt_hdr)) return NULL; - if (len < sizeof(struct ipv6_opt_hdr)) /* be anal today */ + if (nexthdr == NEXTHDR_NONE) return NULL; - - hdrlen = ipv6_optlen(hdr); - if (len < hdrlen) - return NULL; + if (nexthdr == NEXTHDR_FRAGMENT) { + struct frag_hdr *fhdr = (struct frag_hdr *) hdr; + if (ntohs(fhdr->frag_off) & ~0x7) + break; + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hdr->hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(hdr); nexthdr = hdr->nexthdr; hdr = (struct ipv6_opt_hdr *) ((u8*)hdr + hdrlen); len -= hdrlen; } - /* Hack.. Do the same for AUTH headers? */ - if (nexthdr == NEXTHDR_ESP) - return NULL; - *nexthdrp = nexthdr; - return hdr; + return (u8*)hdr; } diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index c3b6f7b6b..d43d1f98d 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: icmp.c,v 1.18 1998/05/07 15:42:59 davem Exp $ + * $Id: icmp.c,v 1.19 1998/08/26 12:04:52 davem Exp $ * * Based on net/ipv4/icmp.c * @@ -58,16 +58,15 @@ #include <asm/uaccess.h> #include <asm/system.h> +struct icmpv6_mib icmpv6_statistics; + /* * ICMP socket for flow control. */ struct socket *icmpv6_socket; -int icmpv6_rcv(struct sk_buff *skb, struct device *dev, - struct in6_addr *saddr, struct in6_addr *daddr, - struct ipv6_options *opt, unsigned short len, - int redo, struct inet6_protocol *protocol); +int icmpv6_rcv(struct sk_buff *skb, unsigned long len); static struct inet6_protocol icmpv6_protocol = { @@ -80,8 +79,6 @@ static struct inet6_protocol icmpv6_protocol = "ICMPv6" /* name */ }; - - struct icmpv6_msg { struct icmp6hdr icmph; __u8 *data; @@ -105,8 +102,11 @@ static int icmpv6_getfrag(const void *data, struct in6_addr *saddr, /* * in theory offset must be 0 since we never send more - * than 576 bytes on an error or more than the path mtu + * than IPV6_MIN_MTU bytes on an error or more than the path mtu * on an echo reply. (those are the rules on RFC 1883) + * + * Luckily, this statement is obsolete after + * draft-ietf-ipngwg-icmp-v2-00 --ANK (980730) */ if (offset) { @@ -143,13 +143,36 @@ void icmpv6_param_prob(struct sk_buff *skb, int code, void *pos) kfree_skb(skb); } -static inline int is_icmp(struct ipv6hdr *hdr, int len) +/* + * Figure out, may we reply to this packet with icmp error. + * + * We do not reply, if: + * - it was icmp error message. + * - it is truncated, so that it is known, that protocol is ICMPV6 + * (i.e. in the middle of some exthdr) + * - it is not the first fragment. BTW IPv6 specs say nothing about + * this case, but it is clear, that our reply would be useless + * for sender. 
+ * + * --ANK (980726) + */ + +static int is_ineligible(struct ipv6hdr *hdr, int len) { - __u8 nexthdr = hdr->nexthdr; + u8 *ptr; + __u8 nexthdr = hdr->nexthdr; + + if (len < (int)sizeof(*hdr)) + return 1; - if (!ipv6_skip_exthdr((struct ipv6_opt_hdr *)(hdr+1), &nexthdr, len)) - return 0; - return nexthdr == IPPROTO_ICMP; + ptr = ipv6_skip_exthdr((struct ipv6_opt_hdr *)(hdr+1), &nexthdr, len - sizeof(*hdr)); + if (!ptr) + return 0; + if (nexthdr == IPPROTO_ICMPV6) { + struct icmp6hdr *ihdr = (struct icmp6hdr *)ptr; + return (ptr - (u8*)hdr) > len || !(ihdr->icmp6_type & 0x80); + } + return nexthdr == NEXTHDR_FRAGMENT; } int sysctl_icmpv6_time = 1*HZ; @@ -160,31 +183,37 @@ int sysctl_icmpv6_time = 1*HZ; static inline int icmpv6_xrlim_allow(struct sock *sk, int type, struct flowi *fl) { -#if 0 - struct dst_entry *dst; - int allow = 0; -#endif + struct dst_entry *dst; + int res = 0; + /* Informational messages are not limited. */ if (type & 0x80) - return 1; + return 1; -#if 0 /* not yet, first fix routing COW */ + /* Do not limit pmtu discovery, it would break it. */ + if (type == ICMPV6_PKT_TOOBIG) + return 1; /* * Look up the output route. * XXX: perhaps the expire for routing entries cloned by * this lookup should be more aggressive (not longer than timeout). */ - dst = ip6_route_output(sk, fl, 1); - if (dst->error) + dst = ip6_route_output(sk, fl); + if (dst->error) ipv6_statistics.Ip6OutNoRoutes++; - else - allow = xrlim_allow(dst, sysctl_icmpv6_time); + else { + struct rt6_info *rt = (struct rt6_info *)dst; + int tmo = sysctl_icmpv6_time; + + /* Give more bandwidth to wider prefixes. */ + if (rt->rt6i_dst.plen < 128) + tmo >>= ((128 - rt->rt6i_dst.plen)>>5); + + res = xrlim_allow(dst, tmo); + } dst_release(dst); - return allow; -#else - return 1; -#endif + return res; } /* @@ -196,7 +225,7 @@ static inline int icmpv6_xrlim_allow(struct sock *sk, int type, static __inline__ int opt_unrec(struct sk_buff *skb, __u32 offset) { - char *buff = skb->nh.raw; + u8 *buff = skb->nh.raw; return ( ( *(buff + offset) & 0xC0 ) == 0x80 ); } @@ -215,7 +244,6 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, struct icmpv6_msg msg; struct flowi fl; int addr_type = 0; - int optlen; int len; /* @@ -237,7 +265,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, addr_type = ipv6_addr_type(&hdr->daddr); - if (ipv6_chk_addr(&hdr->daddr, NULL, 0)) + if (ipv6_chk_addr(&hdr->daddr, skb->dev, 0)) saddr = &hdr->daddr; /* @@ -275,8 +303,9 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, /* * Never answer to a ICMP packet. 
*/ - if (is_icmp(hdr, (u8*)skb->tail - (u8*)hdr)) { - printk(KERN_DEBUG "icmpv6_send: no reply to icmp\n"); + if (is_ineligible(hdr, (u8*)skb->tail - (u8*)hdr)) { + if (net_ratelimit()) + printk(KERN_DEBUG "icmpv6_send: no reply to icmp error/fragment\n"); return; } @@ -303,34 +332,22 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, msg.data = skb->nh.raw; msg.csum = 0; msg.daddr = &hdr->saddr; - /* - if (skb->opt) - optlen = skb->opt->optlen; - else - */ - - optlen = 0; - len = min(skb->tail - ((unsigned char *) hdr), - 576 - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr) - - optlen); + len = min((skb->tail - ((unsigned char *) hdr)) + sizeof(struct icmp6hdr), + IPV6_MIN_MTU - sizeof(struct icmp6hdr)); if (len < 0) { printk(KERN_DEBUG "icmp: len problem\n"); return; } - len += sizeof(struct icmp6hdr); - msg.len = len; ip6_build_xmit(sk, icmpv6_getfrag, &msg, &fl, len, NULL, -1, MSG_DONTWAIT); - - /* Oops! We must purge cached dst, otherwise - all the following ICMP messages will go there :) --ANK - */ - dst_release(xchg(&sk->dst_cache, NULL)); + if (type >= ICMPV6_DEST_UNREACH && type <= ICMPV6_PARAMPROB) + (&icmpv6_statistics.Icmp6OutDestUnreachs)[type-1]++; + icmpv6_statistics.Icmp6OutMsgs++; } static void icmpv6_echo_reply(struct sk_buff *skb) @@ -374,38 +391,41 @@ static void icmpv6_echo_reply(struct sk_buff *skb) ip6_build_xmit(sk, icmpv6_getfrag, &msg, &fl, len, NULL, -1, MSG_DONTWAIT); - - /* Oops! We must purge cached dst, otherwise - all the following ICMP messages will go there :) --ANK - */ - dst_release(xchg(&sk->dst_cache, NULL)); + icmpv6_statistics.Icmp6OutEchoReplies++; + icmpv6_statistics.Icmp6OutMsgs++; } static void icmpv6_notify(struct sk_buff *skb, - int type, int code, unsigned char *buff, int len, - struct in6_addr *saddr, struct in6_addr *daddr, - struct inet6_protocol *protocol) + int type, int code, unsigned char *buff, int len) { + struct in6_addr *saddr = &skb->nh.ipv6h->saddr; + struct in6_addr *daddr = &skb->nh.ipv6h->daddr; struct ipv6hdr *hdr = (struct ipv6hdr *) buff; struct inet6_protocol *ipprot; struct sock *sk; - struct ipv6_opt_hdr *pb; + u8 *pb; __u32 info = 0; int hash; u8 nexthdr; nexthdr = hdr->nexthdr; - pb = (struct ipv6_opt_hdr *) (hdr + 1); len -= sizeof(struct ipv6hdr); if (len < 0) return; /* now skip over extension headers */ - pb = ipv6_skip_exthdr(pb, &nexthdr, len); + pb = ipv6_skip_exthdr((struct ipv6_opt_hdr *) (hdr + 1), &nexthdr, len); if (!pb) return; + /* BUGGG_FUTURE: we should try to parse exthdrs in this packet. + Without this we will not able f.e. to make source routed + pmtu discovery. + Corresponding argument (opt) to notifiers is already added. 
+ --ANK (980726) + */ + hash = nexthdr & (MAX_INET_PROTOS - 1); for (ipprot = (struct inet6_protocol *) inet6_protos[hash]; @@ -414,9 +434,8 @@ static void icmpv6_notify(struct sk_buff *skb, if (ipprot->protocol != nexthdr) continue; - if (ipprot->err_handler) - ipprot->err_handler(skb, type, code, (u8*)pb, info, - saddr, daddr, ipprot); + if (ipprot->err_handler) + ipprot->err_handler(skb, hdr, NULL, type, code, pb, info); return; } @@ -428,7 +447,7 @@ static void icmpv6_notify(struct sk_buff *skb, return; while((sk = raw_v6_lookup(sk, nexthdr, daddr, saddr))) { - rawv6_err(sk, type, code, (char*)pb, saddr, daddr); + rawv6_err(sk, skb, hdr, NULL, type, code, pb, info); sk = sk->next; } } @@ -437,14 +456,17 @@ static void icmpv6_notify(struct sk_buff *skb, * Handle icmp messages */ -int icmpv6_rcv(struct sk_buff *skb, struct device *dev, - struct in6_addr *saddr, struct in6_addr *daddr, - struct ipv6_options *opt, unsigned short len, - int redo, struct inet6_protocol *protocol) +int icmpv6_rcv(struct sk_buff *skb, unsigned long len) { + struct device *dev = skb->dev; + struct in6_addr *saddr = &skb->nh.ipv6h->saddr; + struct in6_addr *daddr = &skb->nh.ipv6h->daddr; struct ipv6hdr *orig_hdr; struct icmp6hdr *hdr = (struct icmp6hdr *) skb->h.raw; int ulen; + int type; + + icmpv6_statistics.Icmp6InMsgs++; /* Perform checksum. */ switch (skb->ip_summed) { @@ -480,8 +502,15 @@ int icmpv6_rcv(struct sk_buff *skb, struct device *dev, * length of original packet carried in skb */ ulen = skb->tail - (unsigned char *) (hdr + 1); - - switch (hdr->icmp6_type) { + + type = hdr->icmp6_type; + + if (type >= ICMPV6_DEST_UNREACH && type <= ICMPV6_PARAMPROB) + (&icmpv6_statistics.Icmp6InDestUnreachs)[type-ICMPV6_DEST_UNREACH]++; + else if (type >= ICMPV6_ECHO_REQUEST && type <= NDISC_REDIRECT) + (&icmpv6_statistics.Icmp6InEchos)[type-ICMPV6_ECHO_REQUEST]++; + + switch (type) { case ICMPV6_ECHO_REQUEST: icmpv6_echo_reply(skb); @@ -492,9 +521,14 @@ int icmpv6_rcv(struct sk_buff *skb, struct device *dev, break; case ICMPV6_PKT_TOOBIG: + /* BUGGG_FUTURE: if packet contains rthdr, we cannot update + standard destination cache. Seems, only "advanced" + destination cache will allow to solve this problem + --ANK (980726) + */ orig_hdr = (struct ipv6hdr *) (hdr + 1); if (ulen >= sizeof(struct ipv6hdr)) - rt6_pmtu_discovery(&orig_hdr->daddr, dev, + rt6_pmtu_discovery(&orig_hdr->daddr, &orig_hdr->saddr, dev, ntohl(hdr->icmp6_mtu)); /* @@ -504,10 +538,8 @@ int icmpv6_rcv(struct sk_buff *skb, struct device *dev, case ICMPV6_DEST_UNREACH: case ICMPV6_TIME_EXCEED: case ICMPV6_PARAMPROB: - - icmpv6_notify(skb, hdr->icmp6_type, hdr->icmp6_code, - (char *) (hdr + 1), ulen, - saddr, daddr, protocol); + icmpv6_notify(skb, type, hdr->icmp6_code, + (char *) (hdr + 1), ulen); break; case NDISC_ROUTER_SOLICITATION: @@ -515,7 +547,7 @@ int icmpv6_rcv(struct sk_buff *skb, struct device *dev, case NDISC_NEIGHBOUR_SOLICITATION: case NDISC_NEIGHBOUR_ADVERTISEMENT: case NDISC_REDIRECT: - ndisc_rcv(skb, dev, saddr, daddr, opt, len); + ndisc_rcv(skb, len); break; case ICMPV6_MGM_QUERY: @@ -530,23 +562,26 @@ int icmpv6_rcv(struct sk_buff *skb, struct device *dev, break; default: - printk(KERN_DEBUG "icmpv6: msg of unkown type\n"); + if (net_ratelimit()) + printk(KERN_DEBUG "icmpv6: msg of unkown type\n"); /* informational */ - if (hdr->icmp6_type & 0x80) - goto discard_it; + if (type & 0x80) + break; /* * error of unkown type. 
* must pass to upper level */ - icmpv6_notify(skb, hdr->icmp6_type, hdr->icmp6_code, - (char *) (hdr + 1), ulen, - saddr, daddr, protocol); + icmpv6_notify(skb, type, hdr->icmp6_code, + (char *) (hdr + 1), ulen); }; + kfree_skb(skb); + return 0; discard_it: + icmpv6_statistics.Icmp6InErrors++; kfree_skb(skb); return 0; } @@ -597,7 +632,7 @@ static struct icmp6_err { } tab_unreach[] = { { ENETUNREACH, 0}, /* NOROUTE */ { EACCES, 1}, /* ADM_PROHIBITED */ - { EOPNOTSUPP, 1}, /* NOT_NEIGHBOUR */ + { 0, 0}, /* Was NOT_NEIGHBOUR, now reserved */ { EHOSTUNREACH, 0}, /* ADDR_UNREACH */ { ECONNREFUSED, 1}, /* PORT_UNREACH */ }; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index e7e12e3ae..bad3a13ec 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: ip6_fib.c,v 1.14 1998/05/07 15:43:03 davem Exp $ + * $Id: ip6_fib.c,v 1.15 1998/08/26 12:04:55 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -32,10 +32,52 @@ #include <net/ip6_fib.h> #include <net/ip6_route.h> -#define RT_DEBUG 2 +#define RT6_DEBUG 2 +#undef CONFIG_IPV6_SUBTREES + +#if RT6_DEBUG >= 1 +#define BUG_TRAP(x) ({ if (!(x)) { printk("Assertion (" #x ") failed at " __FILE__ "(%d):" __FUNCTION__ "\n", __LINE__); } }) +#else +#define BUG_TRAP(x) do { ; } while (0) +#endif + +#if RT6_DEBUG >= 3 +#define RT6_TRACE(x...) printk(KERN_DEBUG x) +#else +#define RT6_TRACE(x...) do { ; } while (0) +#endif struct rt6_statistics rt6_stats; +enum fib_walk_state_t +{ +#ifdef CONFIG_IPV6_SUBTREES + FWS_S, +#endif + FWS_L, + FWS_R, + FWS_C, + FWS_U +}; + +struct fib6_cleaner_t +{ + struct fib6_walker_t w; + int (*func)(struct rt6_info *, void *arg); + void *arg; +}; + +#ifdef CONFIG_IPV6_SUBTREES +#define FWS_INIT FWS_S +#define SUBTREE(fn) ((fn)->subtree) +#else +#define FWS_INIT FWS_L +#define SUBTREE(fn) NULL +#endif + +static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt); +static void fib6_repair_tree(struct fib6_node *fn); + /* * A routing update causes an increase of the serial number on the * afected subtree. This allows for cached routes to be asynchronously @@ -48,10 +90,24 @@ static __u32 rt_sernum = 0; static struct timer_list ip6_fib_timer = { NULL, NULL, 0, - 0, + ~0UL, fib6_run_gc }; +static struct fib6_walker_t fib6_walker_list = { + &fib6_walker_list, &fib6_walker_list, +}; + +#define FOR_WALKERS(w) for ((w)=fib6_walker_list.next; (w) != &fib6_walker_list; (w)=(w)->next) + +static __inline__ u32 fib6_new_sernum(void) +{ + u32 n = ++rt_sernum; + if (n == 0) + n = ++rt_sernum; + return n; +} + /* * Auxiliary address test functions for the radix tree. 
* @@ -70,7 +126,7 @@ static __inline__ int addr_match(void *token1, void *token2, int prefixlen) int pdw; int pbi; - pdw = prefixlen >> 0x05; /* num of whole __u32 in prefix */ + pdw = prefixlen >> 5; /* num of whole __u32 in prefix */ pbi = prefixlen & 0x1f; /* num of bits in incomplete u32 in prefix */ if (pdw) @@ -78,15 +134,11 @@ static __inline__ int addr_match(void *token1, void *token2, int prefixlen) return 0; if (pbi) { - __u32 w1, w2; __u32 mask; - w1 = a1[pdw]; - w2 = a2[pdw]; - - mask = htonl((0xffffffff) << (0x20 - pbi)); + mask = htonl((0xffffffff) << (32 - pbi)); - if ((w1 ^ w2) & mask) + if ((a1[pdw] ^ a2[pdw]) & mask) return 0; } @@ -99,24 +151,11 @@ static __inline__ int addr_match(void *token1, void *token2, int prefixlen) static __inline__ int addr_bit_set(void *token, int fn_bit) { - int dw; - __u32 b1; - __u32 mask; - int bit = fn_bit; __u32 *addr = token; - dw = bit >> 0x05; - - b1 = addr[dw]; - - bit = ~bit; - bit &= 0x1f; - mask = htonl(1 << bit); - return (b1 & mask); + return htonl(1 << ((~fn_bit)&0x1F)) & addr[fn_bit>>5]; } - - /* * find the first different bit between two addresses * length of address must be a multiple of 32bits @@ -131,42 +170,47 @@ static __inline__ int addr_diff(void *token1, void *token2, int addrlen) addrlen >>= 2; for (i = 0; i < addrlen; i++) { - __u32 b1, b2; __u32 xb; - b1 = a1[i]; - b2 = a2[i]; - - xb = b1 ^ b2; + xb = a1[i] ^ a2[i]; if (xb) { - int res = 0; - int j=31; + int j = 31; xb = ntohl(xb); - while (test_bit(j, &xb) == 0) { - res++; + while (test_bit(j, &xb) == 0) j--; - } - return (i * 32 + res); + return (i * 32 + 31 - j); } } /* * we should *never* get to this point since that * would mean the addrs are equal + * + * However, we do get to it 8) And exacly, when + * addresses are equal 8) + * + * ip route add 1111::/128 via ... + * ip route add 1111::/64 via ... + * and we are here. + * + * Ideally, this function should stop comparison + * at prefix length. It does not, but it is still OK, + * if returned value is greater than prefix length. 
+ * --ANK (980803) */ - return -1; + return addrlen<<5; } static __inline__ struct fib6_node * node_alloc(void) { struct fib6_node *fn; - if ((fn = kmalloc(sizeof(struct fib6_node), GFP_ATOMIC))) { + if ((fn = kmalloc(sizeof(struct fib6_node), GFP_ATOMIC)) != NULL) { memset(fn, 0, sizeof(struct fib6_node)); rt6_stats.fib_nodes++; } @@ -180,13 +224,10 @@ static __inline__ void node_free(struct fib6_node * fn) kfree(fn); } -extern __inline__ void rt6_release(struct rt6_info *rt) +static __inline__ void rt6_release(struct rt6_info *rt) { - struct dst_entry *dst = (struct dst_entry *) rt; - if (atomic_dec_and_test(&dst->refcnt)) { - rt->rt6i_node = NULL; - dst_free(dst); - } + if (atomic_dec_and_test(&rt->rt6i_ref)) + dst_free(&rt->u.dst); } @@ -200,18 +241,16 @@ extern __inline__ void rt6_release(struct rt6_info *rt) static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, int addrlen, int plen, - unsigned long offset, - struct rt6_info *rt) - + int offset) { - struct fib6_node *fn; + struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; - struct fib6_node *in; - struct fib6_node *ln; struct rt6key *key; - __u32 bit; - __u32 dir = 0; - __u32 sernum = ++rt_sernum; + int bit; + int dir = 0; + __u32 sernum = fib6_new_sernum(); + + RT6_TRACE("fib6_add_1\n"); /* insert node in tree */ @@ -220,146 +259,143 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, if (plen == 0) return fn; - for (;;) { - if (fn == NULL) { - ln = node_alloc(); - - if (ln == NULL) - return NULL; - ln->fn_bit = plen; - - ln->parent = pn; - ln->fn_sernum = sernum; - rt->rt6i_node = ln; - - if (dir) - pn->right = ln; - else - pn->left = ln; - - return ln; - } - + do { key = (struct rt6key *)((u8 *)fn->leaf + offset); /* * Prefix match */ - if (addr_match(&key->addr, addr, fn->fn_bit)) { + if (plen < fn->fn_bit || + !addr_match(&key->addr, addr, fn->fn_bit)) + goto insert_above; - /* - * Exact match ? - */ + /* + * Exact match ? + */ - if (plen == fn->fn_bit) { - /* clean up an intermediate node */ - if ((fn->fn_flags & RTN_RTINFO) == 0) { - rt6_release(fn->leaf); - fn->leaf = NULL; - } + if (plen == fn->fn_bit) { + /* clean up an intermediate node */ + if ((fn->fn_flags & RTN_RTINFO) == 0) { + rt6_release(fn->leaf); + fn->leaf = NULL; + } - fn->fn_sernum = sernum; + fn->fn_sernum = sernum; - return fn; - } - - /* - * We have more bits to go - */ - - if (plen > fn->fn_bit) { - /* Walk down on tree. */ - fn->fn_sernum = sernum; - dir = addr_bit_set(addr, fn->fn_bit); - pn = fn; - fn = dir ? fn->right: fn->left; - - /* - * Round we go. Note if fn has become - * NULL then dir is set and fn is handled - * top of loop. - */ - continue; - } + return fn; } /* - * split since we don't have a common prefix anymore or - * we have a less significant route. - * we've to insert an intermediate node on the list - * this new node will point to the one we need to create - * and the current + * We have more bits to go */ + + /* Try to walk down on tree. */ + fn->fn_sernum = sernum; + dir = addr_bit_set(addr, fn->fn_bit); + pn = fn; + fn = dir ? fn->right: fn->left; + } while (fn); - pn = fn->parent; + /* + * We wlaked to the bottom of tree. + * Create new leaf node without children. 
+ */ - /* find 1st bit in difference between the 2 addrs */ - bit = addr_diff(addr, &key->addr, addrlen); + ln = node_alloc(); + if (ln == NULL) + return NULL; + ln->fn_bit = plen; + + ln->parent = pn; + ln->fn_sernum = sernum; - /* - * (intermediate) - * / \ - * (new leaf node) (old node) - */ - if (plen > bit) { - in = node_alloc(); - - if (in == NULL) - return NULL; - - /* - * new intermediate node. - * RTN_RTINFO will - * be off since that an address that chooses one of - * the branches would not match less specific routes - * int the other branch - */ + if (dir) + pn->right = ln; + else + pn->left = ln; + + return ln; - in->fn_bit = bit; - in->parent = pn; - in->leaf = rt; +insert_above: + /* + * split since we don't have a common prefix anymore or + * we have a less significant route. + * we've to insert an intermediate node on the list + * this new node will point to the one we need to create + * and the current + */ + + pn = fn->parent; - in->fn_sernum = sernum; - atomic_inc(&rt->rt6i_ref); + /* find 1st bit in difference between the 2 addrs. - /* leaf node */ - ln = node_alloc(); + See comment in addr_diff: bit may be an invalid value, + but if it is >= plen, the value is ignored in any case. + */ + + bit = addr_diff(addr, &key->addr, addrlen); - if (ln == NULL) { + /* + * (intermediate)[in] + * / \ + * (new leaf node)[ln] (old node)[fn] + */ + if (plen > bit) { + in = node_alloc(); + ln = node_alloc(); + + if (in == NULL || ln == NULL) { + if (in) node_free(in); - return NULL; - } + if (ln) + node_free(ln); + return NULL; + } + + /* + * new intermediate node. + * RTN_RTINFO will + * be off since that an address that chooses one of + * the branches would not match less specific routes + * in the other branch + */ - /* update parent pointer */ - if (dir) - pn->right = in; - else - pn->left = in; + in->fn_bit = bit; - ln->fn_bit = plen; + in->parent = pn; + in->leaf = fn->leaf; + atomic_inc(&in->leaf->rt6i_ref); - ln->parent = in; - fn->parent = in; + in->fn_sernum = sernum; - ln->fn_sernum = sernum; + /* update parent pointer */ + if (dir) + pn->right = in; + else + pn->left = in; - if (addr_bit_set(addr, bit)) { - in->right = ln; - in->left = fn; - } else { - in->left = ln; - in->right = fn; - } + ln->fn_bit = plen; + + ln->parent = in; + fn->parent = in; + + ln->fn_sernum = sernum; - return ln; + if (addr_bit_set(addr, bit)) { + in->right = ln; + in->left = fn; + } else { + in->left = ln; + in->right = fn; } + } else { /* plen <= bit */ /* - * (new leaf node) + * (new leaf node)[ln] * / \ - * (old node) NULL + * (old node)[fn] NULL */ ln = node_alloc(); @@ -377,7 +413,6 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, pn->right = ln; else pn->left = ln; - if (addr_bit_set(&key->addr, plen)) ln->right = fn; @@ -385,11 +420,8 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, ln->left = fn; fn->parent = ln; - - return ln; } - - return NULL; + return ln; } /* @@ -401,7 +433,6 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt) struct rt6_info *iter = NULL; struct rt6_info **ins; - rt->rt6i_node = fn; ins = &fn->leaf; for (iter = fn->leaf; iter; iter=iter->u.next) { @@ -423,7 +454,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt) iter->rt6i_expires = rt->rt6i_expires; if (!(rt->rt6i_flags&RTF_EXPIRES)) { iter->rt6i_flags &= ~RTF_EXPIRES; - iter->rt6i_expires = rt->rt6i_expires; + iter->rt6i_expires = 0; } return -EEXIST; } @@ -439,8 +470,9 @@ static int fib6_add_rt2node(struct 
fib6_node *fn, struct rt6_info *rt) * insert node */ - *ins = rt; rt->u.next = iter; + *ins = rt; + rt->rt6i_node = fn; atomic_inc(&rt->rt6i_ref); #ifdef CONFIG_RTNETLINK inet6_rt_notify(RTM_NEWROUTE, rt); @@ -457,8 +489,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt) static __inline__ void fib6_start_gc(struct rt6_info *rt) { - if ((ip6_fib_timer.expires == 0) && - (rt->rt6i_flags & (RTF_ADDRCONF | RTF_CACHE))) { + if (ip6_fib_timer.expires == 0 && + (rt->rt6i_flags & (RTF_EXPIRES|RTF_CACHE))) { del_timer(&ip6_fib_timer); ip6_fib_timer.expires = jiffies + ip6_rt_gc_interval; add_timer(&ip6_fib_timer); @@ -475,67 +507,97 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt) { struct fib6_node *fn; int err = -ENOMEM; - unsigned long offset; - - offset = (u8*) &rt->rt6i_dst - (u8*) rt; + fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr), - rt->rt6i_dst.plen, offset, rt); + rt->rt6i_dst.plen, (u8*) &rt->rt6i_dst - (u8*) rt); - if (fn == NULL) { -#if RT_DEBUG >= 2 - printk(KERN_DEBUG "fib6_add: fn == NULL\n"); -#endif - goto out; - } + if (fn == NULL) + return -ENOMEM; +#ifdef CONFIG_IPV6_SUBTREES if (rt->rt6i_src.plen) { struct fib6_node *sn; -#if RT_DEBUG >= 2 - printk(KERN_DEBUG "fib6_add: src.len > 0\n"); -#endif - if (fn->subtree == NULL) { struct fib6_node *sfn; - if (fn->leaf == NULL) { - fn->leaf = rt; - atomic_inc(&rt->rt6i_ref); - } + /* + * Create subtree. + * + * fn[main tree] + * | + * sfn[subtree root] + * \ + * sn[new leaf node] + */ + /* Create subtree root node */ sfn = node_alloc(); - if (sfn == NULL) - goto out; + goto st_failure; - sfn->parent = fn; sfn->leaf = &ip6_null_entry; + atomic_inc(&ip6_null_entry.rt6i_ref); sfn->fn_flags = RTN_ROOT; - sfn->fn_sernum = ++rt_sernum; + sfn->fn_sernum = fib6_new_sernum(); - fn->subtree = sfn; - } + /* Now add the first leaf node to new subtree */ - offset = (u8*) &rt->rt6i_src - (u8*) rt; + sn = fib6_add_1(sfn, &rt->rt6i_src.addr, + sizeof(struct in6_addr), rt->rt6i_src.plen, + (u8*) &rt->rt6i_src - (u8*) rt); - sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, - sizeof(struct in6_addr), rt->rt6i_src.plen, - offset, rt); + if (sn == NULL) { + /* If it is failed, discard just allocated + root, and then (in st_failure) stale node + in main tree. + */ + node_free(sfn); + goto st_failure; + } - if (sn == NULL) - goto out; + /* Now link new subtree to main tree */ + sfn->parent = fn; + fn->subtree = sfn; + if (fn->leaf == NULL) { + fn->leaf = rt; + atomic_inc(&rt->rt6i_ref); + } + } else { + sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, + sizeof(struct in6_addr), rt->rt6i_src.plen, + (u8*) &rt->rt6i_src - (u8*) rt); + + if (sn == NULL) + goto st_failure; + } fn = sn; } +#endif err = fib6_add_rt2node(fn, rt); - if (err == 0) + if (err == 0) { fib6_start_gc(rt); -out: + if (!(rt->rt6i_flags&RTF_CACHE)) + fib6_prune_clones(fn, rt); + } + if (err) dst_free(&rt->u.dst); return err; + +#ifdef CONFIG_IPV6_SUBTREES + /* Subtree creation failed, probably main tree node + is orphan. If it is, shot it. 
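Note that fib6_add_1() no longer receives the route itself, only the byte offset of the key inside struct rt6_info ((u8*)&rt->rt6i_dst - (u8*)rt), so the same walker can serve the destination tree and, under CONFIG_IPV6_SUBTREES, the source subtree. A small user-space sketch of that offset trick, with an illustrative structure rather than the real rt6_info layout:

    #include <stddef.h>
    #include <stdio.h>

    struct example_key   { unsigned char addr[16]; int plen; };
    struct example_route {
    	int                metric;
    	struct example_key dst;
    	struct example_key src;
    };

    int main(void)
    {
    	struct example_route r = { .metric = 1, .dst.plen = 64 };
    	size_t off = offsetof(struct example_route, dst);
    	struct example_key *key = (struct example_key *)((unsigned char *)&r + off);

    	printf("dst plen read through the offset: %d\n", key->plen);	/* prints 64 */
    	return 0;
    }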
+ */ +st_failure: + if (fn && !(fn->fn_flags&RTN_RTINFO|RTN_ROOT)) + fib_repair_tree(fn); + dst_free(&rt->u.dst); + return err; +#endif } /* @@ -544,7 +606,7 @@ out: */ struct lookup_args { - unsigned long offset; /* key offset on rt6_info */ + int offset; /* key offset on rt6_info */ struct in6_addr *addr; /* search key */ }; @@ -576,6 +638,7 @@ static struct fib6_node * fib6_lookup_1(struct fib6_node *root, } while ((fn->fn_flags & RTN_ROOT) == 0) { +#ifdef CONFIG_IPV6_SUBTREES if (fn->subtree) { struct fib6_node *st; struct lookup_args *narg; @@ -591,6 +654,7 @@ static struct fib6_node * fib6_lookup_1(struct fib6_node *root, } } } +#endif if (fn->fn_flags & RTN_RTINFO) { struct rt6key *key; @@ -618,8 +682,10 @@ struct fib6_node * fib6_lookup(struct fib6_node *root, struct in6_addr *daddr, args[0].offset = (u8*) &rt->rt6i_dst - (u8*) rt; args[0].addr = daddr; +#ifdef CONFIG_IPV6_SUBTREES args[1].offset = (u8*) &rt->rt6i_src - (u8*) rt; args[1].addr = saddr; +#endif fn = fib6_lookup_1(root, args); @@ -630,12 +696,79 @@ struct fib6_node * fib6_lookup(struct fib6_node *root, struct in6_addr *daddr, } /* + * Get node with sepciafied destination prefix (and source prefix, + * if subtrees are used) + */ + + +static struct fib6_node * fib6_locate_1(struct fib6_node *root, + struct in6_addr *addr, + int plen, int offset) +{ + struct fib6_node *fn; + + for (fn = root; fn ; ) { + struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset); + + /* + * Prefix match + */ + if (plen < fn->fn_bit || + !addr_match(&key->addr, addr, fn->fn_bit)) + return NULL; + + if (plen == fn->fn_bit) + return fn; + + /* + * We have more bits to go + */ + if (addr_bit_set(addr, fn->fn_bit)) + fn = fn->right; + else + fn = fn->left; + } + return NULL; +} + +struct fib6_node * fib6_locate(struct fib6_node *root, + struct in6_addr *daddr, int dst_len, + struct in6_addr *saddr, int src_len) +{ + struct rt6_info *rt = NULL; + struct fib6_node *fn; + + fn = fib6_locate_1(root, daddr, dst_len, + (u8*) &rt->rt6i_dst - (u8*) rt); + +#ifdef CONFIG_IPV6_SUBTREES + if (src_len) { + BUG_TRAP(saddr!=NULL); + if (fn == NULL) + fn = fn->subtree; + if (fn) + fn = fib6_locate_1(fn, saddr, src_len, + (u8*) &rt->rt6i_src - (u8*) rt); + } +#endif + + if (fn && fn->fn_flags&RTN_RTINFO) + return fn; + + return NULL; +} + + +/* * Deletion * */ static struct rt6_info * fib6_find_prefix(struct fib6_node *fn) { + if (fn->fn_flags&RTN_ROOT) + return &ip6_null_entry; + while(fn) { if(fn->left) return fn->left->leaf; @@ -643,7 +776,7 @@ static struct rt6_info * fib6_find_prefix(struct fib6_node *fn) if(fn->right) return fn->right->leaf; - fn = fn->subtree; + fn = SUBTREE(fn); } return NULL; } @@ -653,428 +786,414 @@ static struct rt6_info * fib6_find_prefix(struct fib6_node *fn) * is the node we want to try and remove. 
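One aside on the st_failure path above: the test `!(fn->fn_flags&RTN_RTINFO|RTN_ROOT)` parses as `!((fn->fn_flags & RTN_RTINFO) | RTN_ROOT)` because & binds tighter than |, so with RTN_ROOT a nonzero flag the condition can never be true (and the call also names fib_repair_tree() where the function defined below is fib6_repair_tree()). The intended test was presumably `!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))`. A tiny standalone check, with illustrative flag values:

    #include <assert.h>

    #define RTN_ROOT   0x0002	/* illustrative values, not restating the headers */
    #define RTN_RTINFO 0x0004

    int main(void)
    {
    	int flags = 0;		/* an orphaned split node: neither flag set */

    	assert((flags & RTN_RTINFO | RTN_ROOT) != 0);	/* as written: never zero    */
    	assert((flags & (RTN_RTINFO | RTN_ROOT)) == 0);	/* intended test would fire  */
    	return 0;
    }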
*/ -static void fib6_del_2(struct fib6_node *fn) +static void fib6_repair_tree(struct fib6_node *fn) { - struct rt6_info *rt; - - fn->fn_flags &= ~RTN_RTINFO; - rt6_stats.fib_route_nodes--; + int children; + int nstate; + struct fib6_node *child, *pn; + struct fib6_walker_t *w; + int iter = 0; - /* - * Can't delete a root node - */ - - if (fn->fn_flags & RTN_TL_ROOT) - return; + for (;;) { + RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); + iter++; - do { - struct fib6_node *pn, *child; - int children = 0; + BUG_TRAP(!(fn->fn_flags&RTN_RTINFO)); + BUG_TRAP(!(fn->fn_flags&RTN_TL_ROOT)); + BUG_TRAP(fn->leaf==NULL); + children = 0; child = NULL; + if (fn->right) child = fn->right, children |= 1; + if (fn->left) child = fn->left, children |= 2; - /* - * We have a child to left - */ - - if (fn->left) { - children++; - child = fn->left; - } - - /* - * To right - */ - - if (fn->right) { - children++; - child = fn->right; - } - - /* - * We can't tidy a case of two children. - */ - if (children > 1) { - if (fn->leaf == NULL) - goto split_repair; - break; + if (children == 3 || SUBTREE(fn) +#ifdef CONFIG_IPV6_SUBTREES + /* Subtree root (i.e. fn) may have one child */ + || (children && fn->fn_flags&RTN_ROOT) +#endif + ) { + fn->leaf = fib6_find_prefix(fn); +#if RT6_DEBUG >= 2 + if (fn->leaf==NULL) { + BUG_TRAP(fn->leaf); + fn->leaf = &ip6_null_entry; + } +#endif + atomic_inc(&fn->leaf->rt6i_ref); + return; } - if (fn->fn_flags & RTN_RTINFO) - break; - - /* - * The node we plan to tidy has an stree. Talk about - * making life hard. - */ - - if (fn->subtree) - goto stree_node; - - /* - * Up we go - */ - pn = fn->parent; - - /* - * Not a ROOT - we can tidy - */ - - if ((fn->fn_flags & RTN_ROOT) == 0) { - /* - * Make our child our parents child - */ - if (pn->left == fn) - pn->left = child; - else - pn->right = child; - - /* - * Reparent the child - */ +#ifdef CONFIG_IPV6_SUBTREES + if (SUBTREE(pn) == fn) { + BUG_TRAP(fn->fn_flags&RTN_ROOT); + SUBTREE(pn) = NULL; + nstate = FWS_L; + } else { + BUG_TRAP(!(fn->fn_flags&RTN_ROOT)); +#endif + if (pn->right == fn) pn->right = child; + else if (pn->left == fn) pn->left = child; +#if RT6_DEBUG >= 2 + else BUG_TRAP(0); +#endif if (child) child->parent = pn; + nstate = FWS_R; +#ifdef CONFIG_IPV6_SUBTREES + } +#endif - /* - * Discard leaf entries - */ - if (fn->leaf) - rt6_release(fn->leaf); - } else { - if (children) - break; - /* - * No children so no subtree - */ - - pn->subtree = NULL; + FOR_WALKERS(w) { + if (child == NULL) { + if (w->root == fn) { + w->root = w->node = NULL; + RT6_TRACE("W %p adjusted by delroot 1\n", w); + } else if (w->node == fn) { + RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); + w->node = pn; + w->state = nstate; + } + } else { + if (w->root == fn) { + w->root = child; + RT6_TRACE("W %p adjusted by delroot 2\n", w); + } + if (w->node == fn) { + w->node = child; + if (children&2) { + RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); + w->state = w->state>=FWS_R ? FWS_U : FWS_INIT; + } else { + RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); + w->state = w->state>=FWS_C ? FWS_U : FWS_INIT; + } + } + } } - /* - * We are discarding - */ node_free(fn); - - /* - * Our merge of entries might propogate further - * up the tree, so move up a level and retry. 
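The core of the new fib6_repair_tree() is splicing a now-redundant intermediate node out of the trie and repeating the check one level up; the FOR_WALKERS fixups exist only because a concurrent traversal may be parked on the node being removed. A minimal sketch of the splice alone, on a toy node type and without any walker handling:

    struct node { struct node *parent, *left, *right; };

    static void splice_out(struct node *fn)
    {
    	struct node *pn = fn->parent;
    	struct node *child = fn->right ? fn->right : fn->left;	/* at most one exists */

    	if (pn->right == fn)
    		pn->right = child;
    	else if (pn->left == fn)
    		pn->left = child;
    	if (child)
    		child->parent = pn;	/* reparent the survivor */
    }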
- */ - - fn = pn; - - } while (!(fn->fn_flags & RTN_TL_ROOT)); - - return; - -stree_node: - - rt6_release(fn->leaf); - -split_repair: - rt = fib6_find_prefix(fn); - - if (rt == NULL) - panic("fib6_del_2: inconsistent tree\n"); + if (pn->fn_flags&RTN_RTINFO || SUBTREE(pn)) + return; - atomic_inc(&rt->rt6i_ref); - fn->leaf = rt; + rt6_release(pn->leaf); + pn->leaf = NULL; + fn = pn; + } } -/* - * Remove our entry in the tree. This throws away the route entry - * from the list of entries attached to this fib node. It doesn't - * expunge from the tree. - */ - -static struct fib6_node * fib6_del_1(struct rt6_info *rt) +static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp) { - struct fib6_node *fn; - - fn = rt->rt6i_node; + struct fib6_walker_t *w; + struct rt6_info *rt = *rtp; + + RT6_TRACE("fib6_del_route\n"); + + if (!(rt->rt6i_flags&RTF_CACHE)) + fib6_prune_clones(fn, rt); + + /* Unlink it */ + *rtp = rt->u.next; + rt->rt6i_node = NULL; + rt6_stats.fib_rt_entries--; + + /* Adjust walkers */ + FOR_WALKERS(w) { + if (w->state == FWS_C && w->leaf == rt) { + RT6_TRACE("walker %p adjusted by delroute\n", w); + w->leaf = rt->u.next; + if (w->leaf == NULL) + w->state = FWS_U; + } + } - /* We need a fib node! */ - if (fn) { - struct rt6_info **back; - struct rt6_info *lf; + rt->u.next = NULL; - back = &fn->leaf; - - /* - * Walk the leaf entries looking for ourself - */ - - for(lf = fn->leaf; lf; lf=lf->u.next) { - if (rt == lf) { - /* - * Delete this entry. - */ - - *back = lf->u.next; -#ifdef CONFIG_RTNETLINK - inet6_rt_notify(RTM_DELROUTE, lf); -#endif - rt6_release(lf); - rt6_stats.fib_rt_entries--; - return fn; - } - back = &lf->u.next; - } + /* If it was last route, expunge its radix tree node */ + if (fn->leaf == NULL) { + fn->fn_flags &= ~RTN_RTINFO; + rt6_stats.fib_route_nodes--; + fib6_repair_tree(fn); } - return NULL; +#ifdef CONFIG_RTNETLINK + inet6_rt_notify(RTM_DELROUTE, rt); +#endif + rt6_release(rt); } int fib6_del(struct rt6_info *rt) { - struct fib6_node *fn; - - fn = fib6_del_1(rt); + struct fib6_node *fn = rt->rt6i_node; + struct rt6_info **rtp; - if (fn == NULL) +#if RT6_DEBUG >= 2 + if (rt->u.dst.obsolete>0) { + BUG_TRAP(rt->u.dst.obsolete>0); + return -EFAULT; + } +#endif + if (fn == NULL || rt == &ip6_null_entry) return -ENOENT; - if (fn->leaf == NULL) - fib6_del_2(fn); + BUG_TRAP(fn->fn_flags&RTN_RTINFO); - return 0; + /* + * Walk the leaf entries looking for ourself + */ + + for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.next) { + if (*rtp == rt) { + fib6_del_route(fn, rtp); + return 0; + } + } + return -ENOENT; } /* - * Tree transversal function + * Tree transversal function. * - * Wau... It is NOT REENTERABLE!!!!!!! It is cathastrophe. --ANK + * Certainly, it is not interrupt safe. + * However, it is internally reenterable wrt itself and fib6_add/fib6_del. + * It means, that we can modify tree during walking + * and use this function for garbage collection, clone pruning, + * cleaning tree when a device goes down etc. etc. + * + * It guarantees that every node will be traversed, + * and that it will be traversed only once. + * + * Callback function w->func may return: + * 0 -> continue walking. + * positive value -> walking is suspended (used by tree dumps, + * and probably by gc, if it will be split to several slices) + * negative value -> terminate walking. + * + * The function itself returns: + * 0 -> walk is complete. + * >0 -> walk is incomplete (i.e. suspended) + * <0 -> walk is terminated by an error. 
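The comment block that follows states the walker contract; the mechanism underneath is a non-recursive traversal driven by parent pointers and a small state word (the FWS_* values) recording which child the walk is returning from. A simplified user-space sketch of the same pattern, pre-order and without suspension or subtree states:

    struct tnode { struct tnode *parent, *left, *right; };

    static void walk(struct tnode *root, void (*visit)(struct tnode *))
    {
    	struct tnode *fn = root;
    	enum { DOWN, UP_FROM_LEFT, UP_FROM_RIGHT } state = DOWN;

    	for (;;) {
    		if (state == DOWN) {
    			visit(fn);
    			if (fn->left)  { fn = fn->left;  continue; }
    			state = UP_FROM_LEFT;
    		}
    		if (state == UP_FROM_LEFT) {
    			if (fn->right) { fn = fn->right; state = DOWN; continue; }
    			state = UP_FROM_RIGHT;
    		}
    		/* UP_FROM_RIGHT: done with this node, climb towards the root */
    		if (fn == root)
    			return;
    		state = (fn->parent->left == fn) ? UP_FROM_LEFT : UP_FROM_RIGHT;
    		fn = fn->parent;
    	}
    }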
*/ -int fib6_walk_count; - -void fib6_walk_tree(struct fib6_node *root, f_pnode func, void *arg, - int filter) +int fib6_walk_continue(struct fib6_walker_t *w) { - struct fib6_node *fn; + struct fib6_node *fn, *pn; - fn = root; + for (;;) { + fn = w->node; + if (fn == NULL) + return 0; - fib6_walk_count++; - - do { - if (!(fn->fn_flags & RTN_TAG)) { - fn->fn_flags |= RTN_TAG; - + if (w->prune && fn != w->root && + fn->fn_flags&RTN_RTINFO && w->state < FWS_C) { + w->state = FWS_C; + w->leaf = fn->leaf; + } + switch (w->state) { +#ifdef CONFIG_IPV6_SUBTREES + case FWS_S: + if (SUBTREE(fn)) { + w->node = SUBTREE(fn); + continue; + } + w->state = FWS_L; +#endif + case FWS_L: if (fn->left) { - fn = fn->left; + w->node = fn->left; + w->state = FWS_INIT; continue; } - } - - fn->fn_flags &= ~RTN_TAG; - - if (fn->right) { - fn = fn->right; - continue; - } - - do { - struct fib6_node *node; - - if (fn->fn_flags & RTN_ROOT) - break; - node = fn; - fn = fn->parent; - - if (!(node->fn_flags & RTN_TAG)) { - if (node->subtree) { - fib6_walk_tree(node->subtree, func, - arg, filter); - } - - if (!filter || - (node->fn_flags & RTN_RTINFO)) - (*func)(node, arg); + w->state = FWS_R; + case FWS_R: + if (fn->right) { + w->node = fn->right; + w->state = FWS_INIT; + continue; } - - } while (!(fn->fn_flags & RTN_TAG)); - - } while (!(fn->fn_flags & RTN_ROOT) || (fn->fn_flags & RTN_TAG)); - - fib6_walk_count--; + w->state = FWS_C; + w->leaf = fn->leaf; + case FWS_C: + if (w->leaf && fn->fn_flags&RTN_RTINFO) { + int err = w->func(w); + if (err) + return err; + continue; + } + w->state = FWS_U; + case FWS_U: + if (fn == w->root) + return 0; + pn = fn->parent; + w->node = pn; +#ifdef CONFIG_IPV6_SUBTREES + if (SUBTREE(pn) == fn) { + BUG_TRAP(fn->fn_flags&RTN_ROOT); + w->state = FWS_L; + continue; + } +#endif + if (pn->left == fn) { + w->state = FWS_R; + continue; + } + if (pn->right == fn) { + w->state = FWS_C; + w->leaf = w->node->leaf; + continue; + } +#if RT6_DEBUG >= 2 + BUG_TRAP(0); +#endif + } + } } -/* - * Garbage collection - */ - -static int fib6_gc_node(struct fib6_node *fn, int timeout) +int fib6_walk(struct fib6_walker_t *w) { - struct rt6_info *rt, **back; - int more = 0; - unsigned long now = jiffies; - - back = &fn->leaf; - - for (rt = fn->leaf; rt;) { - if ((rt->rt6i_flags & RTF_CACHE) && atomic_read(&rt->rt6i_use) == 0) { - if ((long)(now - rt->rt6i_tstamp) >= timeout) { - struct rt6_info *old; - - old = rt; + int res; - rt = rt->u.next; + w->state = FWS_INIT; + w->node = w->root; - *back = rt; + fib6_walker_link(w); + res = fib6_walk_continue(w); + if (res <= 0) + fib6_walker_unlink(w); + return res; +} - old->rt6i_node = NULL; -#ifdef CONFIG_RTNETLINK - inet6_rt_notify(RTM_DELROUTE, old); +static int fib6_clean_node(struct fib6_walker_t *w) +{ + int res; + struct rt6_info *rt; + struct fib6_cleaner_t *c = (struct fib6_cleaner_t*)w; + + for (rt = w->leaf; rt; rt = rt->u.next) { + res = c->func(rt, c->arg); + if (res < 0) { + w->leaf = rt; + res = fib6_del(rt); + if (res) { +#if RT6_DEBUG >= 2 + printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res); #endif - old->u.dst.obsolete = 1; - rt6_release(old); - rt6_stats.fib_rt_entries--; continue; } - more++; + return 0; } + BUG_TRAP(res==0); + } + w->leaf = rt; + return 0; +} - /* - * check addrconf expiration here. - * - * BUGGGG Crossing fingers and ... 
- * Seems, radix tree walking is absolutely broken, - * but we will try in any case --ANK - */ - if ((rt->rt6i_flags&RTF_EXPIRES) && rt->rt6i_expires - && (long)(now - rt->rt6i_expires) > 0) { - struct rt6_info *old; +/* + * Convenient frontend to tree walker. + * + * func is called on each route. + * It may return -1 -> delete this route. + * 0 -> continue walking + * + * prune==1 -> only immediate children of node (certainly, + * ignoring pure split nodes) will be scanned. + */ - old = rt; - rt = rt->u.next; +void fib6_clean_tree(struct fib6_node *root, + int (*func)(struct rt6_info *, void *arg), + int prune, void *arg) +{ + struct fib6_cleaner_t c; - *back = rt; + c.w.root = root; + c.w.func = fib6_clean_node; + c.w.prune = prune; + c.func = func; + c.arg = arg; - old->rt6i_node = NULL; -#ifdef CONFIG_RTNETLINK - inet6_rt_notify(RTM_DELROUTE, old); -#endif - old->u.dst.obsolete = 1; - rt6_release(old); - rt6_stats.fib_rt_entries--; - continue; - } - back = &rt->u.next; - rt = rt->u.next; + start_bh_atomic(); + fib6_walk(&c.w); + end_bh_atomic(); +} + +static int fib6_prune_clone(struct rt6_info *rt, void *arg) +{ + if (rt->rt6i_flags & RTF_CACHE) { + RT6_TRACE("pruning clone %p\n", rt); + return -1; } - return more; + return 0; } -struct fib6_gc_args { - unsigned long timeout; - int more; -}; +static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt) +{ + fib6_clean_tree(fn, fib6_prune_clone, 1, rt); +} + +/* + * Garbage collection + */ -static void fib6_garbage_collect(struct fib6_node *fn, void *p_arg) +static struct fib6_gc_args { - struct fib6_gc_args * args = (struct fib6_gc_args *) p_arg; + int timeout; + int more; +} gc_args; - if (fn->fn_flags & RTN_RTINFO) { - int more; +static int fib6_age(struct rt6_info *rt, void *arg) +{ + unsigned long now = jiffies; - more = fib6_gc_node(fn, args->timeout); + /* Age clones. Note, that clones are aged out + only if they are not in use now. + */ - if (fn->leaf) { - args->more += more; - return; + if (rt->rt6i_flags & RTF_CACHE) { + if (atomic_read(&rt->u.dst.use) == 0 && + (long)(now - rt->u.dst.lastuse) >= gc_args.timeout) { + RT6_TRACE("aging clone %p\n", rt); + return -1; } - - rt6_stats.fib_route_nodes--; - fn->fn_flags &= ~RTN_RTINFO; + gc_args.more++; + return 0; } /* - * tree nodes (with no routing information) + * check addrconf expiration here. + * They are expired even if they are in use. 
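fib6_clean_tree() turns the walker into a filter: the per-route callback returns -1 to have the route deleted and 0 to keep it, and fib6_prune_clone()/fib6_age() are exactly such callbacks. The same pattern on a plain singly linked list, sketched in user space with made-up names and flag values; the pointer-to-pointer avoids special-casing the head:

    #include <stdlib.h>

    struct entry { int flags; struct entry *next; };
    #define F_CACHE 0x01	/* stands in for RTF_CACHE */

    static int prune_clone(struct entry *e, void *arg)
    {
    	(void)arg;
    	return (e->flags & F_CACHE) ? -1 : 0;	/* -1: delete this entry */
    }

    static void clean_list(struct entry **head,
    		       int (*func)(struct entry *, void *), void *arg)
    {
    	struct entry **pp = head;

    	while (*pp) {
    		if (func(*pp, arg) < 0) {
    			struct entry *dead = *pp;
    			*pp = dead->next;	/* unlink ... */
    			free(dead);		/* ... and release */
    		} else {
    			pp = &(*pp)->next;
    		}
    	}
    }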
*/ - if (!fn->subtree && !(fn->fn_flags & RTN_TL_ROOT)) { - int children = 0; - struct fib6_node *chld = NULL; - - if (fn->left) { - children++; - chld = fn->left; - } - - if (fn->right) { - children++; - chld = fn->right; - } - - if ((fn->fn_flags & RTN_ROOT)) { - if (children == 0) { - struct fib6_node *pn; - - pn = fn->parent; - pn->subtree = NULL; - - node_free(fn); - } - return; - } - - if (children <= 1) { - struct fib6_node *pn = fn->parent; - - if (pn->left == fn) - pn->left = chld; - else - pn->right = chld; - - if (chld) - chld->parent = pn; - - if (fn->leaf) - rt6_release(fn->leaf); - - node_free(fn); - - return; + if (rt->rt6i_flags&RTF_EXPIRES && rt->rt6i_expires) { + if ((long)(now - rt->rt6i_expires) > 0) { + RT6_TRACE("expiring %p\n", rt); + return -1; } + gc_args.more++; + return 0; } - if (fn->leaf == NULL) { - struct rt6_info *nrt; - - nrt = fib6_find_prefix(fn); - - if (nrt == NULL) - panic("fib6: inconsistent tree\n"); - - atomic_inc(&nrt->rt6i_ref); - fn->leaf = nrt; - } + return 0; } void fib6_run_gc(unsigned long dummy) { - struct fib6_gc_args arg = { - ip6_rt_gc_timeout, - 0 - }; + if (dummy != ~0UL) + gc_args.timeout = (int)dummy; + else + gc_args.timeout = ip6_rt_gc_interval; - del_timer(&ip6_fib_timer); + gc_args.more = 0; - if (dummy) - arg.timeout = dummy; + fib6_clean_tree(&ip6_routing_table, fib6_age, 0, NULL); - if (fib6_walk_count == 0) - fib6_walk_tree(&ip6_routing_table, fib6_garbage_collect, &arg, 0); - else - arg.more = 1; + del_timer(&ip6_fib_timer); - if (arg.more) { + ip6_fib_timer.expires = 0; + if (gc_args.more) { ip6_fib_timer.expires = jiffies + ip6_rt_gc_interval; add_timer(&ip6_fib_timer); - } else { - ip6_fib_timer.expires = 0; } } @@ -1084,3 +1203,5 @@ void fib6_gc_cleanup(void) del_timer(&ip6_fib_timer); } #endif + + diff --git a/net/ipv6/ip6_fw.c b/net/ipv6/ip6_fw.c index 3c3a0cfc5..c19a561e9 100644 --- a/net/ipv6/ip6_fw.c +++ b/net/ipv6/ip6_fw.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: ip6_fw.c,v 1.9 1998/02/12 07:43:42 davem Exp $ + * $Id: ip6_fw.c,v 1.10 1998/08/26 12:04:57 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -300,14 +300,19 @@ int ip6_fw_msg_add(struct ip6_fw_msg *msg) rl->info.uli_u.data = msg->u.data; rtmsg.rtmsg_flags = RTF_NONEXTHOP|RTF_POLICY; - rt = ip6_route_add(&rtmsg, &err); + err = ip6_route_add(&rtmsg); - /* BUGGGG! rt can point to nowhere. */ - if (rt == NULL) { + if (err) { ip6_fwrule_free(rl); - return -ENOMEM; + return err; } + /* The rest will not work for now. --ABK (989725) */ + +#ifndef notdef + ip6_fwrule_free(rl); + return -EPERM; +#else rt->u.dst.error = -EPERM; if (msg->policy == IP6_FW_ACCEPT) { @@ -327,6 +332,7 @@ int ip6_fw_msg_add(struct ip6_fw_msg *msg) rt->rt6i_flowr = flow_clone((struct flow_rule *)rl); return 0; +#endif } static int ip6_fw_msgrcv(int unit, struct sk_buff *skb) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 6ab4d2c08..6d7359aef 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -6,7 +6,7 @@ * Pedro Roque <roque@di.fc.ul.pt> * Ian P. 
Morris <I.P.Morris@soton.ac.uk> * - * $Id: ip6_input.c,v 1.10 1998/07/15 05:05:34 davem Exp $ + * $Id: ip6_input.c,v 1.11 1998/08/26 12:04:59 davem Exp $ * * Based in linux/net/ipv4/ip_input.c * @@ -37,144 +37,21 @@ #include <net/ip6_route.h> #include <net/addrconf.h> -static int ipv6_dest_opt(struct sk_buff **skb_ptr, struct device *dev, - __u8 *nhptr, struct ipv6_options *opt); - -struct hdrtype_proc { - u8 type; - int (*func) (struct sk_buff **, struct device *dev, __u8 *ptr, - struct ipv6_options *opt); -} hdrproc_lst[] = { - - /* - TODO - - {NEXTHDR_HOP, ipv6_hop_by_hop} - {NEXTHDR_ROUTING, ipv6_routing_header}, - */ - {NEXTHDR_FRAGMENT, ipv6_reassembly}, - - {NEXTHDR_DEST, ipv6_dest_opt}, - /* - {NEXTHDR_AUTH, ipv6_auth_hdr}, - {NEXTHDR_ESP, ipv6_esp_hdr}, - */ - {NEXTHDR_MAX, NULL} -}; - -/* New header structures */ - - -struct tlvtype_proc { - u8 type; - int (*func) (struct sk_buff *, struct device *dev, __u8 *ptr, - struct ipv6_options *opt); - /* - * these functions do NOT update skb->h.raw - */ - -} tlvprocdestopt_lst[] = { - {255, NULL} -}; - -int ip6_dstopt_unknown(struct sk_buff *skb, struct ipv6_tlvtype *hdr) -{ - struct in6_addr *daddr; - int pos; - - /* - * unkown destination option type - */ - - pos = (__u8 *) hdr - (__u8 *) skb->nh.raw; - - /* I think this is correct please check - IPM */ - - switch ((hdr->type & 0xC0) >> 6) { - case 0: /* ignore */ - skb->h.raw += hdr->len+2; - return 1; - - case 1: /* drop packet */ - break; - - case 2: /* send ICMP PARM PROB regardless and drop packet */ - icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_UNK_OPTION, - pos, skb->dev); - break; - - case 3: /* Send ICMP if not a multicast address and drop packet */ - daddr = &skb->nh.ipv6h->daddr; - if (!(ipv6_addr_type(daddr) & IPV6_ADDR_MULTICAST)) - icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_UNK_OPTION, pos, skb->dev); - }; - - kfree_skb(skb); - return 0; -} - -static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff *skb, - struct device *dev, __u8 *nhptr, - struct ipv6_options *opt, void *lastopt) -{ - struct ipv6_tlvtype *hdr; - struct tlvtype_proc *curr; - - while ((hdr=(struct ipv6_tlvtype *)skb->h.raw) != lastopt) { - switch (hdr->type) { - case 0: /* TLV encoded Pad1 */ - skb->h.raw++; - break; - - case 1: /* TLV encoded PadN */ - skb->h.raw += hdr->len+2; - break; - - default: /* Other TLV code so scan list */ - for (curr=procs; curr->type != 255; curr++) { - if (curr->type == (hdr->type)) { - curr->func(skb, dev, nhptr, opt); - skb->h.raw += hdr->len+2; - break; - } - } - if (curr->type==255) { - if (ip6_dstopt_unknown(skb, hdr) == 0) - return 0; - } - break; - } - } - return 1; -} - -static int ipv6_dest_opt(struct sk_buff **skb_ptr, struct device *dev, - __u8 *nhptr, struct ipv6_options *opt) -{ - struct sk_buff *skb=*skb_ptr; - struct ipv6_destopt_hdr *hdr = (struct ipv6_destopt_hdr *) skb->h.raw; - int res = 0; - void *lastopt=skb->h.raw+hdr->hdrlen+sizeof(struct ipv6_destopt_hdr); - - skb->h.raw += sizeof(struct ipv6_destopt_hdr); - if (ip6_parse_tlv(tlvprocdestopt_lst, skb, dev, nhptr, opt, lastopt)) - res = hdr->nexthdr; - skb->h.raw+=hdr->hdrlen; - - return res; -} - int ipv6_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) { struct ipv6hdr *hdr; - int pkt_len; + u32 pkt_len; - if (skb->pkt_type == PACKET_OTHERHOST) { - kfree_skb(skb); - return 0; - } + if (skb->pkt_type == PACKET_OTHERHOST) + goto drop; + + ipv6_statistics.Ip6InReceives++; + + /* Store incoming device index. 
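For reference, the ip6_dstopt_unknown() logic being removed here (it now lives with the rest of the extension-header parsing in exthdrs.c) implements the standard rule that the two high-order bits of an unrecognised option type select the action. A hedged sketch of that rule, independent of the kernel structures:

    enum opt_action { OPT_SKIP, OPT_DISCARD, OPT_ICMP, OPT_ICMP_UNICAST_ONLY };

    static enum opt_action unknown_option_action(unsigned char type)
    {
    	switch (type >> 6) {
    	case 0:  return OPT_SKIP;		/* 00: skip over the option               */
    	case 1:  return OPT_DISCARD;		/* 01: discard the packet silently        */
    	case 2:  return OPT_ICMP;		/* 10: discard + ICMP Parameter Problem   */
    	default: return OPT_ICMP_UNICAST_ONLY;	/* 11: ICMP only if dst is not multicast  */
    	}
    }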
When the packet will + be queued, we cannot refer to skb->dev anymore. + */ + ((struct inet6_skb_parm *)skb->cb)->iif = dev->ifindex; hdr = skb->nh.ipv6h; @@ -183,16 +60,31 @@ int ipv6_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) pkt_len = ntohs(hdr->payload_len); - if (pkt_len + sizeof(struct ipv6hdr) > skb->len) - goto err; + /* pkt_len may be zero if Jumbo payload option is present */ + if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { + if (pkt_len + sizeof(struct ipv6hdr) > skb->len) + goto truncated; + skb_trim(skb, pkt_len + sizeof(struct ipv6hdr)); + } - skb_trim(skb, pkt_len + sizeof(struct ipv6hdr)); + if (hdr->nexthdr == NEXTHDR_HOP) { + skb->h.raw = (u8*)(hdr+1); + if (!ipv6_parse_hopopts(skb, &hdr->nexthdr)) { + ipv6_statistics.Ip6InHdrErrors++; + return 0; + } + } - ip6_route_input(skb); - - return 0; + if (skb->dst == NULL) + ip6_route_input(skb); + + return skb->dst->input(skb); + +truncated: + ipv6_statistics.Ip6InTruncatedPkts++; err: ipv6_statistics.Ip6InHdrErrors++; +drop: kfree_skb(skb); return 0; } @@ -217,8 +109,7 @@ static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb) * without calling rawv6.c) */ static struct sock * ipv6_raw_deliver(struct sk_buff *skb, - struct ipv6_options *opt, - int nexthdr, int len) + int nexthdr, unsigned long len) { struct in6_addr *saddr; struct in6_addr *daddr; @@ -253,8 +144,8 @@ static struct sock * ipv6_raw_deliver(struct sk_buff *skb, continue; buff = skb_clone(skb, GFP_ATOMIC); - buff->sk = sk2; - rawv6_rcv(buff, skb->dev, saddr, daddr, opt, len); + if (buff) + rawv6_rcv(sk2, buff, len); } } @@ -270,10 +161,8 @@ static struct sock * ipv6_raw_deliver(struct sk_buff *skb, int ip6_input(struct sk_buff *skb) { - struct ipv6_options *opt = (struct ipv6_options *) skb->cb; struct ipv6hdr *hdr = skb->nh.ipv6h; struct inet6_protocol *ipprot; - struct hdrtype_proc *hdrt; struct sock *raw_sk; __u8 *nhptr; int nexthdr; @@ -281,7 +170,7 @@ int ip6_input(struct sk_buff *skb) u8 hash; int len; - skb->h.raw += sizeof(struct ipv6hdr); + skb->h.raw = skb->nh.raw + sizeof(struct ipv6hdr); /* * Parse extension headers @@ -290,64 +179,55 @@ int ip6_input(struct sk_buff *skb) nexthdr = hdr->nexthdr; nhptr = &hdr->nexthdr; - /* - * check for extension headers - */ - -st_loop: + /* Skip hop-by-hop options, they are already parsed. */ + if (nexthdr == NEXTHDR_HOP) { + nhptr = (u8*)(hdr+1); + nexthdr = *nhptr; + skb->h.raw += (nhptr[1]+1)<<3; + } - for (hdrt=hdrproc_lst; hdrt->type != NEXTHDR_MAX; hdrt++) { - if (hdrt->type == nexthdr) { - if ((nexthdr = hdrt->func(&skb, skb->dev, nhptr, opt))) { - nhptr = skb->h.raw; - hdr = skb->nh.ipv6h; - goto st_loop; - } + /* This check is sort of optimization. 
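The `skb->h.raw += (nhptr[1]+1)<<3` used when skipping the hop-by-hop header relies on the generic length encoding of IPv6 extension headers: byte 0 is the next-header value, byte 1 counts 8-octet units beyond the first 8 octets. A one-line sketch of that rule:

    #include <stddef.h>

    /* Applies to hop-by-hop, destination options and routing headers.
     * (The Fragment header is fixed at 8 bytes and AH counts 4-octet units.) */
    static size_t ext_hdr_bytes(const unsigned char *hdr)
    {
    	return ((size_t)hdr[1] + 1) * 8;
    }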
+ It would be stupid to detect for optional headers, + which are missing with probability of 200% + */ + if (nexthdr != IPPROTO_TCP && nexthdr != IPPROTO_UDP) { + nhptr = ipv6_parse_exthdrs(&skb, nhptr); + if (nhptr == NULL) return 0; - } + nexthdr = *nhptr; + hdr = skb->nh.ipv6h; } - len = skb->tail - skb->h.raw; - raw_sk = ipv6_raw_deliver(skb, opt, nexthdr, len); + raw_sk = ipv6_raw_deliver(skb, nexthdr, len); hash = nexthdr & (MAX_INET_PROTOS - 1); for (ipprot = (struct inet6_protocol *) inet6_protos[hash]; ipprot != NULL; ipprot = (struct inet6_protocol *) ipprot->next) { struct sk_buff *buff = skb; - + if (ipprot->protocol != nexthdr) continue; - + if (ipprot->copy || raw_sk) buff = skb_clone(skb, GFP_ATOMIC); - - - ipprot->handler(buff, skb->dev, &hdr->saddr, &hdr->daddr, - opt, len, 0, ipprot); + + ipprot->handler(buff, len); found = 1; } - + if (raw_sk) { - skb->sk = raw_sk; - rawv6_rcv(skb, skb->dev, &hdr->saddr, &hdr->daddr, opt, len); + rawv6_rcv(raw_sk, skb, len); found = 1; } - + /* * not found: send ICMP parameter problem back */ - if (!found) { - unsigned long offset; -#if IP6_DEBUG >= 2 - printk(KERN_DEBUG "proto not found %d\n", nexthdr); -#endif - offset = nhptr - (u8*) hdr; - icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_UNK_NEXTHDR, - offset, skb->dev); - kfree_skb(skb); + ipv6_statistics.Ip6InUnknownProtos++; + icmpv6_param_prob(skb, ICMPV6_UNK_NEXTHDR, nhptr); } return 0; @@ -359,6 +239,8 @@ int ip6_mc_input(struct sk_buff *skb) int deliver = 0; int discard = 1; + ipv6_statistics.Ip6InMcastPkts++; + hdr = skb->nh.ipv6h; if (ipv6_chk_mcast_addr(skb->dev, &hdr->daddr)) deliver = 1; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index aa13c2074..0555c1a24 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: ip6_output.c,v 1.13 1998/07/15 05:05:38 davem Exp $ + * $Id: ip6_output.c,v 1.14 1998/08/26 12:05:01 davem Exp $ * * Based on linux/net/ipv4/ip_output.c * @@ -13,6 +13,14 @@ * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. + * + * Changes: + * A.N.Kuznetsov : airthmetics in fragmentation. + * extension headers are implemented. + * route changes now work. + * ip6_forward does not confuse sniffers. + * etc. + * */ #include <linux/errno.h> @@ -33,6 +41,7 @@ #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/rawv6.h> +#include <net/icmp.h> static u32 ipv6_fragmentation_id = 1; @@ -59,6 +68,8 @@ int ip6_output(struct sk_buff *skb) return 0; } } + + ipv6_statistics.Ip6OutMcastPkts++; } if (hh) { @@ -85,17 +96,40 @@ int ip6_output(struct sk_buff *skb) */ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct ipv6_options *opt) + struct ipv6_txoptions *opt) { struct ipv6_pinfo * np = sk ? &sk->net_pinfo.af_inet6 : NULL; + struct in6_addr *first_hop = fl->nl_u.ip6_u.daddr; struct dst_entry *dst = skb->dst; struct ipv6hdr *hdr; - int seg_len; + u8 proto = fl->proto; + int seg_len = skb->len; int hlimit; - /* Do something with IPv6 options headers here. */ + if (opt) { + int head_room; - seg_len = skb->len; + /* First: exthdrs may take lots of space (~8K for now) + MAX_HEADER is not enough. 
+ */ + head_room = opt->opt_nflen + opt->opt_flen; + seg_len += head_room; + head_room += sizeof(struct ipv6hdr) + ((dst->dev->hard_header_len + 15)&~15); + + if (skb_headroom(skb) < head_room) { + struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); + kfree(skb); + skb = skb2; + if (skb == NULL) + return -ENOBUFS; + if (sk) + skb_set_owner_w(skb, sk); + } + if (opt->opt_flen) + ipv6_push_frag_opts(skb, opt, &proto); + if (opt->opt_nflen) + ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop); + } hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr)); @@ -117,16 +151,22 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, hlimit = ((struct rt6_info*)dst)->rt6i_hoplimit; hdr->payload_len = htons(seg_len); - hdr->nexthdr = fl->proto; + hdr->nexthdr = proto; hdr->hop_limit = hlimit; ipv6_addr_copy(&hdr->saddr, fl->nl_u.ip6_u.saddr); - ipv6_addr_copy(&hdr->daddr, fl->nl_u.ip6_u.daddr); + ipv6_addr_copy(&hdr->daddr, first_hop); - ipv6_statistics.Ip6OutRequests++; - dst->output(skb); + if (skb->len <= dst->pmtu) { + ipv6_statistics.Ip6OutRequests++; + dst->output(skb); + return 0; + } - return 0; + printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n"); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst->pmtu, skb->dev); + kfree_skb(skb); + return -EMSGSIZE; } /* @@ -166,8 +206,8 @@ int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct device *dev, return 0; } -static void ip6_bld_1(struct sock *sk, struct sk_buff *skb, struct flowi *fl, - int hlimit, unsigned short pktlength) +static struct ipv6hdr * ip6_bld_1(struct sock *sk, struct sk_buff *skb, struct flowi *fl, + int hlimit, unsigned pktlength) { struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; struct ipv6hdr *hdr; @@ -177,43 +217,56 @@ static void ip6_bld_1(struct sock *sk, struct sk_buff *skb, struct flowi *fl, hdr->version = 6; hdr->priority = np->priority; - memcpy(hdr->flow_lbl, &np->flow_lbl, 3); hdr->payload_len = htons(pktlength - sizeof(struct ipv6hdr)); - - /* - * FIXME: hop limit has default UNI/MCAST and - * msgctl settings - */ hdr->hop_limit = hlimit; + hdr->nexthdr = fl->proto; ipv6_addr_copy(&hdr->saddr, fl->nl_u.ip6_u.saddr); - ipv6_addr_copy(&hdr->daddr, fl->nl_u.ip6_u.daddr); + ipv6_addr_copy(&hdr->daddr, fl->nl_u.ip6_u.daddr); + return hdr; +} + +static __inline__ u8 * ipv6_build_fraghdr(struct sk_buff *skb, u8* prev_hdr, unsigned offset) +{ + struct frag_hdr *fhdr; + + fhdr = (struct frag_hdr *) skb_put(skb, sizeof(struct frag_hdr)); + + fhdr->nexthdr = *prev_hdr; + *prev_hdr = NEXTHDR_FRAGMENT; + prev_hdr = &fhdr->nexthdr; + + fhdr->reserved = 0; + fhdr->frag_off = htons(offset); + fhdr->identification = ipv6_fragmentation_id++; + return &fhdr->nexthdr; } static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, struct dst_entry *dst, - struct flowi *fl, struct ipv6_options *opt, - int hlimit, int flags, unsigned length) + struct flowi *fl, struct ipv6_txoptions *opt, + struct in6_addr *final_dst, + int hlimit, int flags, unsigned length, int mtu) { - struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; struct ipv6hdr *hdr; struct sk_buff *last_skb; - struct frag_hdr *fhdr; + u8 *prev_hdr; int unfrag_len; - int payl_len; int frag_len; int last_len; int nfrags; int fhdr_dist; + int frag_off; + int data_off; int err; /* * Fragmentation * * Extension header order: - * Hop-by-hop -> Routing -> Fragment -> rest (...) + * Hop-by-hop -> Dest0 -> Routing -> Fragment -> Auth -> Dest1 -> rest (...) 
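The fragment sizing that ip6_frag_xmit() performs below can be tried in isolation: frag_len is the MTU minus the unfragmentable part, rounded down to a multiple of 8, nfrags counts the full-size fragments, and the remainder becomes the last fragment. A standalone sketch with illustrative sizes:

    #include <stdio.h>

    int main(void)
    {
    	unsigned mtu = 1280;			/* path MTU                      */
    	unsigned unfrag_len = 40 + 8;		/* IPv6 header + Fragment header */
    	unsigned length = 3000;			/* payload to be sent            */

    	unsigned frag_len = (mtu - unfrag_len) & ~0x7u;	/* multiple of 8: 1232 */
    	unsigned nfrags   = length / frag_len;		/* full-size fragments */
    	unsigned last_len = length - nfrags * frag_len;

    	if (last_len == 0) {			/* length was an exact multiple */
    		last_len = frag_len;
    		nfrags--;
    	}
    	printf("%u full fragments of %u bytes, last fragment %u bytes\n",
    	       nfrags, frag_len, last_len);	/* 2 full of 1232, last 536 */
    	return 0;
    }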
* * We must build the non-fragmented part that * will be in every packet... this also means @@ -222,11 +275,11 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, */ unfrag_len = sizeof(struct ipv6hdr) + sizeof(struct frag_hdr); - payl_len = length; + last_len = length; if (opt) { unfrag_len += opt->opt_nflen; - payl_len += opt->opt_flen; + last_len += opt->opt_flen; } /* @@ -235,9 +288,13 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, * "integer multiple of 8 octects". */ - frag_len = (dst->pmtu - unfrag_len) & ~0x7; + frag_len = (mtu - unfrag_len) & ~0x7; - nfrags = payl_len / frag_len; + /* Unfragmentable part exceeds mtu. */ + if (frag_len <= 0) + return -EMSGSIZE; + + nfrags = last_len / frag_len; /* * We must send from end to start because of @@ -250,13 +307,25 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, * might be a good idea. */ - last_len = payl_len - (nfrags * frag_len); + frag_off = nfrags * frag_len; + last_len -= frag_off; if (last_len == 0) { last_len = frag_len; + frag_off -= frag_len; nfrags--; } - + data_off = frag_off; + + /* And it is implementation problem: for now we assume, that + all the exthdrs will fit to the first fragment. + */ + if (opt) { + if (frag_len < opt->opt_flen) + return -EMSGSIZE; + data_off = frag_off - opt->opt_flen; + } + last_skb = sock_alloc_send_skb(sk, unfrag_len + frag_len + dst->dev->hard_header_len + 15, 0, flags & MSG_DONTWAIT, &err); @@ -267,41 +336,17 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, last_skb->dst = dst_clone(dst); skb_reserve(last_skb, (dst->dev->hard_header_len + 15) & ~15); - - hdr = (struct ipv6hdr *) skb_put(last_skb, sizeof(struct ipv6hdr)); - last_skb->nh.ipv6h = hdr; - hdr->version = 6; - hdr->priority = np->priority; - - memcpy(hdr->flow_lbl, &np->flow_lbl, 3); - hdr->payload_len = htons(unfrag_len + frag_len - sizeof(struct ipv6hdr)); + hdr = ip6_bld_1(sk, last_skb, fl, hlimit, frag_len+unfrag_len); + prev_hdr = &hdr->nexthdr; - hdr->hop_limit = hlimit; + if (opt && opt->opt_nflen) + prev_hdr = ipv6_build_nfrag_opts(last_skb, prev_hdr, opt, final_dst, 0); - hdr->nexthdr = NEXTHDR_FRAGMENT; + prev_hdr = ipv6_build_fraghdr(last_skb, prev_hdr, frag_off); + fhdr_dist = prev_hdr - last_skb->data; - ipv6_addr_copy(&hdr->saddr, fl->nl_u.ip6_u.saddr); - ipv6_addr_copy(&hdr->daddr, fl->nl_u.ip6_u.daddr); - -#if 0 - if (opt && opt->srcrt) { - hdr->nexthdr = ipv6opt_bld_rthdr(last_skb, opt, daddr, - NEXTHDR_FRAGMENT); - } -#endif - - fhdr = (struct frag_hdr *) skb_put(last_skb, sizeof(struct frag_hdr)); - memset(fhdr, 0, sizeof(struct frag_hdr)); - - fhdr->nexthdr = fl->proto; - fhdr->frag_off = ntohs(nfrags * frag_len); - fhdr->identification = ipv6_fragmentation_id++; - - fhdr_dist = (unsigned char *) fhdr - last_skb->data; - - err = getfrag(data, &hdr->saddr, last_skb->tail, nfrags * frag_len, - last_len); + err = getfrag(data, &hdr->saddr, last_skb->tail, data_off, last_len); if (!err) { while (nfrags--) { @@ -309,58 +354,60 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, struct frag_hdr *fhdr2; -#if 0 - printk(KERN_DEBUG "sending frag %d\n", nfrags); -#endif skb = skb_copy(last_skb, sk->allocation); - if (skb == NULL) + if (skb == NULL) { + ipv6_statistics.Ip6FragFails++; + kfree_skb(last_skb); return -ENOMEM; + } + frag_off -= frag_len; + data_off -= frag_len; + fhdr2 = (struct frag_hdr *) (skb->data + fhdr_dist); /* more flag on */ - fhdr2->frag_off = ntohs(nfrags * frag_len + 1); + fhdr2->frag_off = htons(frag_off 
| 1); - /* - * FIXME: - * if (nfrags == 0) - * put rest of headers - */ + /* Write fragmentable exthdrs to the first chunk */ + if (nfrags == 0 && opt && opt->opt_flen) { + ipv6_build_frag_opts(skb, &fhdr2->nexthdr, opt); + frag_len -= opt->opt_flen; + data_off = 0; + } err = getfrag(data, &hdr->saddr,skb_put(skb, frag_len), - nfrags * frag_len, frag_len); + data_off, frag_len); if (err) { kfree_skb(skb); break; } + ipv6_statistics.Ip6FragCreates++; ipv6_statistics.Ip6OutRequests++; dst->output(skb); } } if (err) { + ipv6_statistics.Ip6FragFails++; kfree_skb(last_skb); return -EFAULT; } -#if 0 - printk(KERN_DEBUG "sending last frag \n"); -#endif - - hdr->payload_len = htons(unfrag_len + last_len - - sizeof(struct ipv6hdr)); + hdr->payload_len = htons(unfrag_len + last_len - sizeof(struct ipv6hdr)); /* * update last_skb to reflect the getfrag we did * on start. */ - - last_skb->tail += last_len; - last_skb->len += last_len; + skb_put(last_skb, last_len); + + ipv6_statistics.Ip6FragCreates++; + ipv6_statistics.Ip6FragOKs++; ipv6_statistics.Ip6OutRequests++; dst->output(last_skb); @@ -369,42 +416,71 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, struct flowi *fl, unsigned length, - struct ipv6_options *opt, int hlimit, int flags) + struct ipv6_txoptions *opt, int hlimit, int flags) { struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; struct in6_addr *final_dst = NULL; struct dst_entry *dst; - int pktlength; int err = 0; - + unsigned int pktlength, jumbolen, mtu; + if (opt && opt->srcrt) { struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; final_dst = fl->nl_u.ip6_u.daddr; fl->nl_u.ip6_u.daddr = rt0->addr; } - dst = NULL; - if (!fl->oif && ipv6_addr_is_multicast(fl->nl_u.ip6_u.daddr)) fl->oif = np->mcast_oif; - - if (sk->dst_cache) + + dst = NULL; + if (sk->dst_cache) { dst = dst_check(&sk->dst_cache, np->dst_cookie); + if (dst) { + struct rt6_info *rt = (struct rt6_info*)dst_clone(dst); + + /* Yes, checking route validity in not connected + case is not very simple. Take into account, + that we do not support routing by source, TOS, + and MSG_DONTROUTE --ANK (980726) + + 1. If route was host route, check that + cached destination is current. + If it is network route, we still may + check its validity using saved pointer + to the last used address: daddr_cache. + We do not want to save whole address now, + (because main consumer of this service + is tcp, which has not this problem), + so that the last trick works only on connected + sockets. + 2. oif also should be the same. 
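A short aside on the `htons(frag_off | 1)` just above: the Fragment header stores the offset in 8-octet units in its top 13 bits and the M ("more fragments") flag in bit 0. Because frag_off here is a byte offset that is already a multiple of 8, shifting the unit count left by 3 reproduces the byte offset, so the byte offset can simply be or-ed with the M bit. A tiny check:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
    	uint16_t byte_off = 1232;		/* always a multiple of 8 here          */
    	uint16_t units    = byte_off / 8;	/* offset as the wire format counts it  */

    	/* top 13 bits: offset in 8-octet units; bit 0: M (more fragments) */
    	assert((uint16_t)((units << 3) | 1) == (uint16_t)(byte_off | 1));
    	return 0;
    }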
+ */ + if (((rt->rt6i_dst.plen != 128 || + ipv6_addr_cmp(fl->fl6_dst, &rt->rt6i_dst.addr)) + && (np->daddr_cache == NULL || + ipv6_addr_cmp(fl->fl6_dst, np->daddr_cache))) + || (fl->oif && fl->oif != dst->dev->ifindex)) { + dst_release(dst); + dst = NULL; + } + } + } if (dst == NULL) dst = ip6_route_output(sk, fl); if (dst->error) { ipv6_statistics.Ip6OutNoRoutes++; - err = -ENETUNREACH; - goto out; + dst_release(dst); + return -ENETUNREACH; } if (fl->nl_u.ip6_u.saddr == NULL) { struct inet6_ifaddr *ifa; ifa = ipv6_get_saddr(dst, fl->nl_u.ip6_u.daddr); - + if (ifa == NULL) { #if IP6_DEBUG >= 2 printk(KERN_DEBUG "ip6_build_xmit: " @@ -415,7 +491,6 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, } fl->nl_u.ip6_u.saddr = &ifa->addr; } - pktlength = length; if (hlimit < 0) { @@ -427,29 +502,38 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, hlimit = ((struct rt6_info*)dst)->rt6i_hoplimit; } + jumbolen = 0; + if (!sk->ip_hdrincl) { pktlength += sizeof(struct ipv6hdr); if (opt) pktlength += opt->opt_flen + opt->opt_nflen; - /* Due to conservative check made by caller, - pktlength cannot overflow here. - - When (and if) jumbo option will be implemented - we could try soemething sort of: + if (pktlength > 0xFFFF + sizeof(struct ipv6hdr)) { + /* Jumbo datagram. + It is assumed, that in the case of sk->ip_hdrincl + jumbo option is supplied by user. + */ + pktlength += 8; + jumbolen = pktlength - sizeof(struct ipv6hdr); + } + } - if (pktlength < length) return -EMSGSIZE; + mtu = dst->pmtu; - */ - } + /* Critical arithmetic overflow check. + FIXME: may gcc optimize it out? --ANK (980726) + */ + if (pktlength < length) + return -EMSGSIZE; - if (pktlength <= dst->pmtu) { + if (pktlength <= mtu) { struct sk_buff *skb; struct ipv6hdr *hdr; - struct device *dev; + struct device *dev = dst->dev; skb = sock_alloc_send_skb(sk, pktlength + 15 + - dst->dev->hard_header_len, 0, + dev->hard_header_len, 0, flags & MSG_DONTWAIT, &err); if (skb == NULL) { @@ -457,7 +541,6 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, goto out; } - dev = dst->dev; skb->dst = dst_clone(dst); skb_reserve(skb, (dev->hard_header_len + 15) & ~15); @@ -466,23 +549,22 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, skb->nh.ipv6h = hdr; if (!sk->ip_hdrincl) { - ip6_bld_1(sk, skb, fl, hlimit, pktlength); -#if 0 - if (opt && opt->srcrt) { - hdr->nexthdr = ipv6opt_bld_rthdr(skb, opt, - final_dst, - fl->proto); + ip6_bld_1(sk, skb, fl, hlimit, + jumbolen ? sizeof(struct ipv6hdr) : pktlength); + + if (opt || jumbolen) { + u8 *prev_hdr = &hdr->nexthdr; + prev_hdr = ipv6_build_nfrag_opts(skb, prev_hdr, opt, final_dst, jumbolen); + if (opt && opt->opt_flen) + ipv6_build_frag_opts(skb, prev_hdr, opt); } - else -#endif - hdr->nexthdr = fl->proto; } skb_put(skb, length); err = getfrag(data, &hdr->saddr, ((char *) hdr) + (pktlength - length), 0, length); - + if (!err) { ipv6_statistics.Ip6OutRequests++; dst->output(skb); @@ -491,32 +573,18 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, kfree_skb(skb); } } else { - if (sk->ip_hdrincl) + if (sk->ip_hdrincl || jumbolen) return -EMSGSIZE; - /* pktlength includes IPv6 header, not included - in IPv6 payload length. - FIXME are non-fragmentable options included - in packet after defragmentation? If not, we - should subtract opt_nflen also. 
--ANK - */ - if (pktlength > 0xFFFF + sizeof(struct ipv6hdr)) - return -EMSGSIZE; - - err = ip6_frag_xmit(sk, getfrag, data, dst, fl, opt, hlimit, - flags, length); + err = ip6_frag_xmit(sk, getfrag, data, dst, fl, opt, final_dst, hlimit, + flags, length, mtu); } - + /* * cleanup */ - out: - - if (sk->dst_cache) - ip6_dst_store(sk, dst); - else - dst_release(dst); - +out: + ip6_dst_store(sk, dst, fl->nl_u.ip6_u.daddr == &np->daddr ? &np->daddr : NULL); return err; } @@ -530,20 +598,15 @@ int ip6_call_ra_chain(struct sk_buff *skb, int sel) if (sk && ra->sel == sel) { if (last) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); - if (skb2) { - skb2->sk = last; - rawv6_rcv(skb2, skb2->dev, &skb2->nh.ipv6h->saddr, - &skb2->nh.ipv6h->daddr, NULL, skb2->len); - } + if (skb2) + rawv6_rcv(last, skb2, skb2->len); } last = sk; } } if (last) { - skb->sk = last; - rawv6_rcv(skb, skb->dev, &skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr, NULL, skb->len); + rawv6_rcv(last, skb, skb->len); return 1; } return 0; @@ -553,24 +616,16 @@ int ip6_forward(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; struct ipv6hdr *hdr = skb->nh.ipv6h; - int size; + struct inet6_skb_parm *opt =(struct inet6_skb_parm*)skb->cb; - if (ipv6_devconf.forwarding == 0) + if (ipv6_devconf.forwarding == 0 && opt->srcrt == 0) goto drop; /* - * check hop-by-hop options present - */ - /* - * Note, that NEXTHDR_HOP header must be checked - * always at the most beginning of ipv6_rcv. - * The result should be saved somewhere, but - * we do not it for now. Alas. Let's do it here. --ANK - * - * Second note: we DO NOT make any processing on + * We DO NOT make any processing on * RA packets, pushing them to user level AS IS - * without ane WARRANTY that application will able - * to interpret them. The reson is that we + * without ane WARRANTY that application will be able + * to interpret them. The reason is that we * cannot make anything clever here. * * We are not end-node, so that if packet contains @@ -579,42 +634,9 @@ int ip6_forward(struct sk_buff *skb) * cannot be fragmented, because there is no warranty * that different fragments will go along one path. --ANK */ - if (hdr->nexthdr == NEXTHDR_HOP) { - int ra_value = -1; - u8 *ptr = (u8*)(skb->nh.ipv6h+1); - int len = (ptr[1]+1)<<3; - - if (len + sizeof(struct ipv6hdr) > skb->len) - goto drop; - - ptr += 2; - len -= 2; - while (len > 0) { - u8 *opt; - int optlen; - - if (ptr[0] == 0) { - len--; - ptr++; - continue; - } - opt = ptr; - optlen = ptr[1]+1; - - len -= optlen; - ptr += optlen; - if (len < 0) - goto drop; - - if (opt[0] == 20) { - /* Router Alert as of draft-ietf-ipngwg-ipv6router-alert-04 */ - if (optlen < 4) - goto drop; - ra_value = opt[2] + (opt[3]<<8); - } else if (!ip6_dstopt_unknown(skb, (struct ipv6_tlvtype*)opt)) - goto drop; - } - if (ra_value>=0 && ip6_call_ra_chain(skb, ra_value)) + if (opt->ra) { + u8 *ptr = skb->nh.raw + opt->ra; + if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3])) return 0; } @@ -622,6 +644,8 @@ int ip6_forward(struct sk_buff *skb) * check and decrement ttl */ if (hdr->hop_limit <= 1) { + /* Force OUTPUT device used as source address */ + skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0, skb->dev); @@ -629,9 +653,10 @@ int ip6_forward(struct sk_buff *skb) return -ETIMEDOUT; } - hdr->hop_limit--; - - if (skb->dev == dst->dev && dst->neighbour) { + /* IPv6 specs say nothing about it, but it is clear that we cannot + send redirects to source routed frames. 
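Taken together, ip6_forward() now applies its checks in a fixed order before touching the packet: hop limit, unforwardable source addresses, path MTU, and only then (after the skb copy-on-write) the hop-limit decrement. A compressed sketch of that decision ladder, with an illustrative struct rather than the real skb fields:

    enum fwd_verdict { FWD_OK, FWD_HOP_LIMIT, FWD_ADDR_ERROR, FWD_TOO_BIG };

    struct fwd_pkt {
    	unsigned hop_limit;
    	unsigned len;
    	int      bad_source;	/* multicast, loopback or link-local saddr */
    };

    static enum fwd_verdict forward_check(struct fwd_pkt *p, unsigned pmtu)
    {
    	if (p->hop_limit <= 1)
    		return FWD_HOP_LIMIT;	/* ICMPv6 time exceeded   */
    	if (p->bad_source)
    		return FWD_ADDR_ERROR;	/* never forwarded        */
    	if (p->len > pmtu)
    		return FWD_TOO_BIG;	/* ICMPv6 packet too big  */
    	p->hop_limit--;			/* only once we know the packet goes out */
    	return FWD_OK;
    }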
+ */ + if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) { struct in6_addr *target = NULL; struct rt6_info *rt; struct neighbour *n = dst->neighbour; @@ -647,30 +672,40 @@ int ip6_forward(struct sk_buff *skb) else target = &hdr->daddr; - ndisc_send_redirect(skb, dst->neighbour, target); + /* Limit redirects both by destination (here) + and by source (inside ndisc_send_redirect) + */ + if (xrlim_allow(dst, 1*HZ)) + ndisc_send_redirect(skb, n, target); + } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK + |IPV6_ADDR_LINKLOCAL)) { + /* This check is security critical. */ + goto drop; } - - size = sizeof(struct ipv6hdr) + ntohs(hdr->payload_len); - if (size > dst->pmtu) { + if (skb->len > dst->pmtu) { + /* Again, force OUTPUT device used as source address */ + skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst->pmtu, skb->dev); + ipv6_statistics.Ip6InTooBigErrors++; kfree_skb(skb); return -EMSGSIZE; } - if (skb_headroom(skb) < dst->dev->hard_header_len || skb_cloned(skb)) { - struct sk_buff *skb2; - skb2 = skb_realloc_headroom(skb, (dst->dev->hard_header_len + 15)&~15); - kfree_skb(skb); - skb = skb2; - } + if ((skb = skb_cow(skb, dst->dev->hard_header_len)) == NULL) + return 0; - ipv6_statistics.Ip6ForwDatagrams++; - dst->output(skb); + hdr = skb->nh.ipv6h; - return 0; + /* Mangling hops number delayed to point after skb COW */ + + hdr->hop_limit--; + + ipv6_statistics.Ip6OutForwDatagrams++; + return dst->output(skb); drop: + ipv6_statistics.Ip6InAddrErrors++; kfree_skb(skb); return -EINVAL; } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index b31c07c00..a246b996b 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -7,7 +7,7 @@ * * Based on linux/net/ipv4/ip_sockglue.c * - * $Id: ipv6_sockglue.c,v 1.22 1998/07/15 05:05:39 davem Exp $ + * $Id: ipv6_sockglue.c,v 1.23 1998/08/26 12:05:04 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -110,7 +110,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; - int val, err; + int val, valbool; int retv = -ENOPROTOOPT; if(level==SOL_IP && sk->type != SOCK_RAW) @@ -119,19 +119,20 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, if(level!=SOL_IPV6) goto out; - if (optval == NULL) { + if (optval == NULL) val=0; - } else { - err = get_user(val, (int *) optval); - if(err) - return err; - } - + else if (get_user(val, (int *) optval)) + return -EFAULT; + + valbool = (val!=0); switch (optname) { case IPV6_ADDRFORM: if (val == PF_INET) { + struct ipv6_txoptions *opt; + struct sk_buff *pktopt; + if (sk->protocol != IPPROTO_UDP && sk->protocol != IPPROTO_TCP) goto out; @@ -140,7 +141,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, retv = ENOTCONN; goto out; } - + if (!(ipv6_addr_type(&np->daddr) & IPV6_ADDR_MAPPED)) { retv = -EADDRNOTAVAIL; goto out; @@ -153,10 +154,17 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, tp->af_specific = &ipv4_specific; sk->socket->ops = &inet_stream_ops; sk->family = PF_INET; + tcp_sync_mss(sk, tp->pmtu_cookie); } else { sk->prot = &udp_prot; sk->socket->ops = &inet_dgram_ops; } + opt = xchg(&np->opt, NULL); + if (opt) + sock_kfree_s(sk, opt, opt->tot_len); + pktopt = xchg(&np->pktoptions, NULL); + if (pktopt) + kfree_skb(pktopt); retv = 0; } else { retv = -EINVAL; @@ 
-164,15 +172,85 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, break; case IPV6_PKTINFO: - np->rxinfo = val; + np->rxopt.bits.rxinfo = valbool; retv = 0; break; case IPV6_HOPLIMIT: - np->rxhlim = val; + np->rxopt.bits.rxhlim = valbool; + retv = 0; + break; + + case IPV6_RTHDR: + retv = -EINVAL; + if (val >= 0 && val <= 2) { + np->rxopt.bits.srcrt = val; + retv = 0; + } + break; + + case IPV6_HOPOPTS: + np->rxopt.bits.hopopts = valbool; + retv = 0; + break; + + case IPV6_AUTHHDR: + np->rxopt.bits.authhdr = valbool; retv = 0; break; + case IPV6_DSTOPTS: + np->rxopt.bits.dstopts = valbool; + retv = 0; + break; + + case IPV6_PKTOPTIONS: + { + struct ipv6_txoptions *opt = NULL; + struct msghdr msg; + int junk; + struct in6_addr *saddr; + + if (optlen == 0) + goto update; + + opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL); + retv = -ENOBUFS; + if (opt == NULL) + break; + + memset(opt, 0, sizeof(*opt)); + opt->tot_len = sizeof(*opt) + optlen; + retv = -EFAULT; + if (copy_from_user(opt+1, optval, optlen)) + goto done; + + msg.msg_controllen = optlen; + msg.msg_control = (void*)(opt+1); + + retv = datagram_send_ctl(&msg, &junk, &saddr, opt, &junk); + if (retv) + goto done; +update: + retv = 0; + start_bh_atomic(); + if (opt && sk->type == SOCK_STREAM) { + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + if ((tcp_connected(sk->state) || sk->state == TCP_SYN_SENT) + && sk->daddr != LOOPBACK4_IPV6) { + tp->ext_header_len = opt->opt_flen + opt->opt_nflen; + tcp_sync_mss(sk, tp->pmtu_cookie); + } + } + opt = xchg(&np->opt, opt); + dst_release(xchg(&sk->dst_cache, NULL)); + end_bh_atomic(); + +done: + if (opt) + sock_kfree_s(sk, opt, opt->tot_len); + break; + } case IPV6_UNICAST_HOPS: if (val > 255 || val < -1) retv = -EINVAL; @@ -190,10 +268,9 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, retv = 0; } break; - break; case IPV6_MULTICAST_LOOP: - np->mc_loop = (val != 0); + np->mc_loop = valbool; retv = 0; break; @@ -229,12 +306,10 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, case IPV6_DROP_MEMBERSHIP: { struct ipv6_mreq mreq; - int err; - err = copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq)); - if(err) + if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq))) return -EFAULT; - + if (optname == IPV6_ADD_MEMBERSHIP) retv = ipv6_sock_mc_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); else @@ -253,10 +328,44 @@ out: int ipv6_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen) { + struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; + int len; + if(level==SOL_IP && sk->type != SOCK_RAW) return udp_prot.getsockopt(sk, level, optname, optval, optlen); if(level!=SOL_IPV6) return -ENOPROTOOPT; + if (get_user(len, optlen)) + return -EFAULT; + switch (optname) { + case IPV6_PKTOPTIONS: + { + struct msghdr msg; + struct sk_buff *skb; + + start_bh_atomic(); + skb = np->pktoptions; + if (skb) + atomic_inc(&skb->users); + end_bh_atomic(); + + if (skb) { + int err; + + msg.msg_control = optval; + msg.msg_controllen = len; + msg.msg_flags = 0; + err = datagram_recv_ctl(sk, &msg, skb); + kfree_skb(skb); + if (err) + return err; + len -= msg.msg_controllen; + } else + len = 0; + return put_user(len, optlen); + } + default: + } return -EINVAL; } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index c50f37fcf..88950481e 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: mcast.c,v 1.16 1998/05/07 15:43:10 
davem Exp $ + * $Id: mcast.c,v 1.17 1998/08/26 12:05:06 davem Exp $ * * Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c * @@ -79,7 +79,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr) if (!(ipv6_addr_type(addr) & IPV6_ADDR_MULTICAST)) return -EINVAL; - mc_lst = kmalloc(sizeof(struct ipv6_mc_socklist), GFP_KERNEL); + mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL); if (mc_lst == NULL) return -ENOMEM; @@ -91,13 +91,15 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr) if (ifindex == 0) { struct rt6_info *rt; rt = rt6_lookup(addr, NULL, 0, 0); - if (rt) + if (rt) { dev = rt->rt6i_dev; + dst_release(&rt->u.dst); + } } else dev = dev_get_by_index(ifindex); if (dev == NULL) { - kfree(mc_lst); + sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return -ENODEV; } @@ -108,7 +110,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr) err = ipv6_dev_mc_inc(dev, addr); if (err) { - kfree(mc_lst); + sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return err; } @@ -133,7 +135,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, struct in6_addr *addr) *lnk = mc_lst->next; if ((dev = dev_get_by_index(ifindex)) != NULL) ipv6_dev_mc_dec(dev, &mc_lst->addr); - kfree(mc_lst); + sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return 0; } } @@ -153,7 +155,7 @@ void ipv6_sock_mc_close(struct sock *sk) ipv6_dev_mc_dec(dev, &mc_lst->addr); np->ipv6_mc_list = mc_lst->next; - kfree(mc_lst); + sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); } } @@ -308,11 +310,19 @@ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) { unsigned long delay = resptime; + /* Do not start timer for addresses with link/host scope */ + if (ipv6_addr_type(&ma->mca_addr)&(IPV6_ADDR_LINKLOCAL|IPV6_ADDR_LOOPBACK)) + return; + if (del_timer(&ma->mca_timer)) delay = ma->mca_timer.expires - jiffies; - if (delay >= resptime) - delay = net_random() % resptime; + if (delay >= resptime) { + if (resptime) + delay = net_random() % resptime; + else + delay = 1; + } ma->mca_flags |= MAF_TIMER_RUNNING; ma->mca_timer.expires = jiffies + delay; @@ -325,10 +335,16 @@ int igmp6_event_query(struct sk_buff *skb, struct icmp6hdr *hdr, int len) struct in6_addr *addrp; unsigned long resptime; - if (len < sizeof(struct icmp6hdr) + sizeof(struct ipv6hdr)) + if (len < sizeof(struct icmp6hdr) + sizeof(struct in6_addr)) return -EINVAL; - resptime = hdr->icmp6_maxdelay; + /* Drop queries with not link local source */ + if (!(ipv6_addr_type(&skb->nh.ipv6h->saddr)&IPV6_ADDR_LINKLOCAL)) + return -EINVAL; + + resptime = ntohs(hdr->icmp6_maxdelay); + /* Translate milliseconds to jiffies */ + resptime = (resptime<<10)/(1024000/HZ); addrp = (struct in6_addr *) (hdr + 1); @@ -365,7 +381,15 @@ int igmp6_event_report(struct sk_buff *skb, struct icmp6hdr *hdr, int len) struct device *dev; int hash; - if (len < sizeof(struct icmp6hdr) + sizeof(struct ipv6hdr)) + /* Our own report looped back. Ignore it. 
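The `(resptime<<10)/(1024000/HZ)` conversion added in igmp6_event_query() is worth unpacking: icmp6_maxdelay is in milliseconds, and the expression is milliseconds-to-jiffies written with a power-of-two multiply, equal to ms*HZ/1000 up to integer rounding. A quick check with an illustrative HZ:

    #include <assert.h>

    #define HZ 100	/* illustrative clock tick rate */

    int main(void)
    {
    	unsigned long ms = 10000;	/* a 10 s maximum response delay */

    	unsigned long a = (ms << 10) / (1024000 / HZ);	/* the kernel's form   */
    	unsigned long b = ms * HZ / 1000;		/* plain ms-to-jiffies */

    	assert(a == b);		/* both 1000 ticks with these values */
    	return 0;
    }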
*/ + if (skb->pkt_type == PACKET_LOOPBACK) + return 0; + + if (len < sizeof(struct icmp6hdr) + sizeof(struct in6_addr)) + return -EINVAL; + + /* Drop reports with not link local source */ + if (!(ipv6_addr_type(&skb->nh.ipv6h->saddr)&IPV6_ADDR_LINKLOCAL)) return -EINVAL; addrp = (struct in6_addr *) (hdr + 1); @@ -399,14 +423,25 @@ void igmp6_send(struct in6_addr *addr, struct device *dev, int type) struct sk_buff *skb; struct icmp6hdr *hdr; struct inet6_ifaddr *ifp; - struct in6_addr *addrp; - int err, len, plen; + struct in6_addr *snd_addr; + struct in6_addr *addrp; + struct in6_addr all_routers; + int err, len, payload_len, full_len; + u8 ra[8] = { IPPROTO_ICMPV6, 0, + IPV6_TLV_ROUTERALERT, 0, 0, 0, + IPV6_TLV_PADN, 0 }; + + snd_addr = addr; + if (type == ICMPV6_MGM_REDUCTION) { + snd_addr = &all_routers; + ipv6_addr_all_routers(&all_routers); + } len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); + payload_len = len + sizeof(ra); + full_len = sizeof(struct ipv6hdr) + payload_len; - plen = sizeof(struct ipv6hdr) + len; - - skb = sock_alloc_send_skb(sk, dev->hard_header_len + plen + 15, 0, 0, &err); + skb = sock_alloc_send_skb(sk, dev->hard_header_len + full_len + 15, 0, 0, &err); if (skb == NULL) return; @@ -414,8 +449,8 @@ void igmp6_send(struct in6_addr *addr, struct device *dev, int type) skb_reserve(skb, (dev->hard_header_len + 15) & ~15); if (dev->hard_header) { unsigned char ha[MAX_ADDR_LEN]; - ndisc_mc_map(addr, ha, dev, 1); - dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, plen); + ndisc_mc_map(snd_addr, ha, dev, 1); + dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, full_len); } ifp = ipv6_get_lladdr(dev); @@ -428,11 +463,9 @@ void igmp6_send(struct in6_addr *addr, struct device *dev, int type) return; } - ip6_nd_hdr(sk, skb, dev, &ifp->addr, addr, IPPROTO_ICMPV6, len); + ip6_nd_hdr(sk, skb, dev, &ifp->addr, snd_addr, NEXTHDR_HOP, payload_len); - /* - * need hop-by-hop router alert option. 
- */ + memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra)); hdr = (struct icmp6hdr *) skb_put(skb, sizeof(struct icmp6hdr)); memset(hdr, 0, sizeof(struct icmp6hdr)); @@ -441,11 +474,16 @@ void igmp6_send(struct in6_addr *addr, struct device *dev, int type) addrp = (struct in6_addr *) skb_put(skb, sizeof(struct in6_addr)); ipv6_addr_copy(addrp, addr); - hdr->icmp6_cksum = csum_ipv6_magic(&ifp->addr, addr, len, + hdr->icmp6_cksum = csum_ipv6_magic(&ifp->addr, snd_addr, len, IPPROTO_ICMPV6, csum_partial((__u8 *) hdr, len, 0)); dev_queue_xmit(skb); + if (type == ICMPV6_MGM_REDUCTION) + icmpv6_statistics.Icmp6OutGroupMembReductions++; + else + icmpv6_statistics.Icmp6OutGroupMembResponses++; + icmpv6_statistics.Icmp6OutMsgs++; } static void igmp6_join_group(struct ifmcaddr6 *ma) @@ -455,7 +493,7 @@ static void igmp6_join_group(struct ifmcaddr6 *ma) addr_type = ipv6_addr_type(&ma->mca_addr); - if ((addr_type & IPV6_ADDR_LINKLOCAL)) + if ((addr_type & (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_LOOPBACK))) return; igmp6_send(&ma->mca_addr, ma->dev, ICMPV6_MGM_REPORT); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 26e42a1ed..b6c855a59 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -68,8 +68,7 @@ #include <net/ndisc.h> #include <net/ip6_route.h> #include <net/addrconf.h> - - +#include <net/icmp.h> #include <net/checksum.h> #include <linux/proc_fs.h> @@ -350,6 +349,9 @@ void ndisc_send_na(struct device *dev, struct neighbour *neigh, len, 0)); dev_queue_xmit(skb); + + icmpv6_statistics.Icmp6OutNeighborAdvertisements++; + icmpv6_statistics.Icmp6OutMsgs++; } void ndisc_send_ns(struct device *dev, struct neighbour *neigh, @@ -410,6 +412,9 @@ void ndisc_send_ns(struct device *dev, struct neighbour *neigh, len, 0)); /* send it! */ dev_queue_xmit(skb); + + icmpv6_statistics.Icmp6OutNeighborSolicits++; + icmpv6_statistics.Icmp6OutMsgs++; } void ndisc_send_rs(struct device *dev, struct in6_addr *saddr, @@ -458,6 +463,9 @@ void ndisc_send_rs(struct device *dev, struct in6_addr *saddr, /* send it! */ dev_queue_xmit(skb); + + icmpv6_statistics.Icmp6OutRouterSolicits++; + icmpv6_statistics.Icmp6OutMsgs++; } @@ -575,6 +583,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) if (rt && lifetime == 0) { ip6_del_rt(rt); + dst_release(&rt->u.dst); rt = NULL; } @@ -582,11 +591,6 @@ static void ndisc_router_discovery(struct sk_buff *skb) ND_PRINTK2("ndisc_rdisc: adding default router\n"); rt = rt6_add_dflt_router(&skb->nh.ipv6h->saddr, skb->dev); - -#if 1 - /* BUGGGGG! Previous routine can return invalid pointer. */ - rt = rt6_get_dflt_router(&skb->nh.ipv6h->saddr, skb->dev); -#endif if (rt == NULL) { ND_PRINTK1("route_add failed\n"); return; @@ -595,6 +599,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) neigh = rt->rt6i_nexthop; if (neigh == NULL) { ND_PRINTK1("nd: add default router: null neighbour\n"); + dst_release(&rt->u.dst); return; } neigh->flags |= NTF_ROUTER; @@ -658,7 +663,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) mtu = htonl(*(__u32 *)(opt+4)); - if (mtu < 576 || mtu > skb->dev->mtu) { + if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) { ND_PRINTK0("NDISC: router " "announcement with mtu = %d\n", mtu); @@ -671,10 +676,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) if (rt) rt->u.dst.pmtu = mtu; - /* BUGGG... 
Scan routing tables and - adjust mtu on routes going - via this device - */ + rt6_mtu_change(skb->dev, mtu); } } break; @@ -689,6 +691,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) optlen -= len; opt += len; } + if (rt) + dst_release(&rt->u.dst); } static void ndisc_redirect_rcv(struct sk_buff *skb) @@ -698,7 +702,6 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) struct in6_addr *dest; struct in6_addr *target; /* new first hop to destination */ struct neighbour *neigh; - struct rt6_info *rt; int on_link = 0; int optlen; @@ -740,20 +743,21 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) if (!in6_dev || in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) return; - /* passed validation tests + /* passed validation tests */ - NOTE We should not install redirect if sender did not supply - ll address on link, which requires it. It would break, if - we have non-transitive address resolution protocol. - Fix it later. --ANK + /* + We install redirect only if nexthop state is valid. */ - rt = rt6_redirect(dest, &skb->nh.ipv6h->saddr, target, skb->dev, on_link); - - if (rt == NULL) - return; - neigh = rt->rt6i_nexthop; - ndisc_update(neigh, (u8*)(dest + 1), optlen, ND_OPT_TARGET_LL_ADDR); + neigh = __neigh_lookup(&nd_tbl, target, skb->dev, 1); + if (neigh) { + ndisc_update(neigh, (u8*)(dest + 1), optlen, ND_OPT_TARGET_LL_ADDR); + if (neigh->nud_state&NUD_VALID) + rt6_redirect(dest, &skb->nh.ipv6h->saddr, neigh, on_link); + else + __neigh_event_send(neigh, NULL); + neigh_release(neigh); + } } void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, @@ -773,17 +777,21 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, int hlen; dev = skb->dev; - rt = rt6_lookup(&skb->nh.ipv6h->saddr, NULL, dev->ifindex, 0); + rt = rt6_lookup(&skb->nh.ipv6h->saddr, NULL, dev->ifindex, 1); - if (rt == NULL || rt->u.dst.error) { - ND_PRINTK1("ndisc_send_redirect: hostunreach\n"); + if (rt == NULL) return; - } if (rt->rt6i_flags & RTF_GATEWAY) { ND_PRINTK1("ndisc_send_redirect: not a neighbour\n"); + dst_release(&rt->u.dst); return; } + if (!xrlim_allow(&rt->u.dst, 1*HZ)) { + dst_release(&rt->u.dst); + return; + } + dst_release(&rt->u.dst); if (dev->addr_len) { if (neigh->nud_state&NUD_VALID) { @@ -797,7 +805,7 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, } } - rd_len = min(536 - len, ntohs(skb->nh.ipv6h->payload_len) + 8); + rd_len = min(IPV6_MIN_MTU-sizeof(struct ipv6hdr)-len, ntohs(skb->nh.ipv6h->payload_len) + 8); rd_len &= ~0x7; len += rd_len; @@ -814,14 +822,14 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, ND_PRINTK1("ndisc_send_redirect: alloc_skb failed\n"); return; } - + hlen = 0; if (ndisc_build_ll_hdr(buff, dev, &skb->nh.ipv6h->saddr, NULL, len) == 0) { kfree_skb(buff); return; } - + ip6_nd_hdr(sk, buff, dev, &ifp->addr, &skb->nh.ipv6h->saddr, IPPROTO_ICMPV6, len); @@ -838,9 +846,9 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, ipv6_addr_copy(addrp, target); addrp++; ipv6_addr_copy(addrp, &skb->nh.ipv6h->daddr); - + opt = (u8*) (addrp + 1); - + /* * include target_address option */ @@ -858,12 +866,15 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, opt += 6; memcpy(opt, &skb->nh.ipv6h, rd_len - 8); - + icmph->icmp6_cksum = csum_ipv6_magic(&ifp->addr, &skb->nh.ipv6h->saddr, len, IPPROTO_ICMPV6, csum_partial((u8 *) icmph, len, 0)); dev_queue_xmit(buff); + + icmpv6_statistics.Icmp6OutRedirects++; + icmpv6_statistics.Icmp6OutMsgs++; } 
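The ipv6_sockglue.c and mcast.c hunks above rework per-socket multicast membership handling: allocations move to sock_kmalloc()/sock_kfree_s(), an unknown interface yields -ENODEV, and an ifindex of 0 lets the kernel pick the device from a route lookup. As a rough userspace sketch of the call path those hunks service (illustrative only, not part of the patch; the field name ipv6mr_interface follows the glibc headers, while the kernel structure in the hunk calls it ipv6mr_ifindex):

#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

/* Join the all-nodes group ff02::1 on the given interface index.
 * Returns the socket on success, -1 on error. */
int join_all_nodes(unsigned int ifindex)
{
        struct ipv6_mreq mreq;
        int fd = socket(AF_INET6, SOCK_DGRAM, 0);

        if (fd < 0)
                return -1;

        memset(&mreq, 0, sizeof(mreq));
        inet_pton(AF_INET6, "ff02::1", &mreq.ipv6mr_multiaddr);
        mreq.ipv6mr_interface = ifindex;  /* 0 = let the kernel choose via route lookup */

        if (setsockopt(fd, IPPROTO_IPV6, IPV6_ADD_MEMBERSHIP,
                       &mreq, sizeof(mreq)) < 0) {
                close(fd);                /* e.g. ENODEV for a bogus ifindex */
                return -1;
        }
        return fd;
}

On success the kernel sends the MLD report built in igmp6_send() above, now carrying the hop-by-hop Router Alert option.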
static __inline__ struct neighbour * @@ -894,15 +905,15 @@ static __inline__ int ndisc_recv_na(struct neighbour *neigh, struct sk_buff *skb static void pndisc_redo(struct sk_buff *skb) { - ndisc_rcv(skb, skb->dev, &skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr, - NULL, skb->len); + ndisc_rcv(skb, skb->len); kfree_skb(skb); } -int ndisc_rcv(struct sk_buff *skb, struct device *dev, - struct in6_addr *saddr, struct in6_addr *daddr, - struct ipv6_options *opt, unsigned short len) +int ndisc_rcv(struct sk_buff *skb, unsigned long len) { + struct device *dev = skb->dev; + struct in6_addr *saddr = &skb->nh.ipv6h->saddr; + struct in6_addr *daddr = &skb->nh.ipv6h->daddr; struct nd_msg *msg = (struct nd_msg *) skb->h.raw; struct neighbour *neigh; struct inet6_ifaddr *ifp; @@ -977,7 +988,7 @@ int ndisc_rcv(struct sk_buff *skb, struct device *dev, if (neigh) { ndisc_send_na(dev, neigh, saddr, &msg->target, - 1, 0, inc, inc); + 0, 0, inc, inc); neigh_release(neigh); } } else { @@ -1023,13 +1034,14 @@ int ndisc_rcv(struct sk_buff *skb, struct device *dev, /* * Change: router to host */ -#if 0 struct rt6_info *rt; - rt = ndisc_get_dflt_router(skb->dev, - saddr); - if (rt) - ndisc_del_dflt_router(rt); -#endif + rt = rt6_get_dflt_router(saddr, skb->dev); + if (rt) { + /* It is safe only because + we aer in BH */ + dst_release(&rt->u.dst); + ip6_del_rt(rt); + } } } else { if (msg->icmph.icmp6_router) diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 9b24b4948..31f6a2f55 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -7,7 +7,7 @@ * PROC file system. This is very similar to the IPv4 version, * except it reports the sockets in the INET6 address family. * - * Version: $Id: proc.c,v 1.8 1998/04/13 17:06:03 davem Exp $ + * Version: $Id: proc.c,v 1.9 1998/08/26 12:05:11 davem Exp $ * * Authors: David S. Miller (davem@caip.rutgers.edu) * @@ -20,9 +20,11 @@ #include <linux/socket.h> #include <linux/net.h> #include <linux/in6.h> +#include <linux/stddef.h> #include <net/sock.h> #include <net/tcp.h> #include <net/transp_v6.h> +#include <net/ipv6.h> /* This is the main implementation workhorse of all these routines. */ static int get__netinfo6(struct proto *pro, char *buffer, int format, char **start, @@ -176,3 +178,105 @@ int afinet6_get_info(char *buffer, char **start, off_t offset, int length, int d len = length; return len; } + + +struct snmp6_item +{ + char *name; + unsigned long *ptr; +} snmp6_list[] = { +/* ipv6 mib according to draft-ietf-ipngwg-ipv6-mib-04 */ +#define SNMP6_GEN(x) { #x , &ipv6_statistics.x } + SNMP6_GEN(Ip6InReceives), + SNMP6_GEN(Ip6InHdrErrors), + SNMP6_GEN(Ip6InTooBigErrors), + SNMP6_GEN(Ip6InNoRoutes), + SNMP6_GEN(Ip6InAddrErrors), + SNMP6_GEN(Ip6InUnknownProtos), + SNMP6_GEN(Ip6InTruncatedPkts), + SNMP6_GEN(Ip6InDiscards), + SNMP6_GEN(Ip6InDelivers), + SNMP6_GEN(Ip6OutForwDatagrams), + SNMP6_GEN(Ip6OutRequests), + SNMP6_GEN(Ip6OutDiscards), + SNMP6_GEN(Ip6OutNoRoutes), + SNMP6_GEN(Ip6ReasmTimeout), + SNMP6_GEN(Ip6ReasmReqds), + SNMP6_GEN(Ip6ReasmOKs), + SNMP6_GEN(Ip6ReasmFails), + SNMP6_GEN(Ip6FragOKs), + SNMP6_GEN(Ip6FragFails), + SNMP6_GEN(Ip6FragCreates), + SNMP6_GEN(Ip6InMcastPkts), + SNMP6_GEN(Ip6OutMcastPkts), +#undef SNMP6_GEN +/* icmpv6 mib according to draft-ietf-ipngwg-ipv6-icmp-mib-02 + + Exceptions: {In|Out}AdminProhibs are removed, because I see + no good reasons to account them separately + of another dest.unreachs. + OutErrs is zero identically. + OutEchos too. + OutRouterAdvertisements too. + OutGroupMembQueries too. 
+ */ +#define SNMP6_GEN(x) { #x , &icmpv6_statistics.x } + SNMP6_GEN(Icmp6InMsgs), + SNMP6_GEN(Icmp6InErrors), + SNMP6_GEN(Icmp6InDestUnreachs), + SNMP6_GEN(Icmp6InPktTooBigs), + SNMP6_GEN(Icmp6InTimeExcds), + SNMP6_GEN(Icmp6InParmProblems), + SNMP6_GEN(Icmp6InEchos), + SNMP6_GEN(Icmp6InEchoReplies), + SNMP6_GEN(Icmp6InGroupMembQueries), + SNMP6_GEN(Icmp6InGroupMembResponses), + SNMP6_GEN(Icmp6InGroupMembReductions), + SNMP6_GEN(Icmp6InRouterSolicits), + SNMP6_GEN(Icmp6InRouterAdvertisements), + SNMP6_GEN(Icmp6InNeighborSolicits), + SNMP6_GEN(Icmp6InNeighborAdvertisements), + SNMP6_GEN(Icmp6InRedirects), + SNMP6_GEN(Icmp6OutMsgs), + SNMP6_GEN(Icmp6OutDestUnreachs), + SNMP6_GEN(Icmp6OutPktTooBigs), + SNMP6_GEN(Icmp6OutTimeExcds), + SNMP6_GEN(Icmp6OutParmProblems), + SNMP6_GEN(Icmp6OutEchoReplies), + SNMP6_GEN(Icmp6OutRouterSolicits), + SNMP6_GEN(Icmp6OutNeighborSolicits), + SNMP6_GEN(Icmp6OutNeighborAdvertisements), + SNMP6_GEN(Icmp6OutRedirects), + SNMP6_GEN(Icmp6OutGroupMembResponses), + SNMP6_GEN(Icmp6OutGroupMembReductions), +#undef SNMP6_GEN +#define SNMP6_GEN(x) { "Udp6" #x , &udp_stats_in6.Udp##x } + SNMP6_GEN(InDatagrams), + SNMP6_GEN(NoPorts), + SNMP6_GEN(InErrors), + SNMP6_GEN(OutDatagrams) +#undef SNMP6_GEN +}; + + +int afinet6_get_snmp(char *buffer, char **start, off_t offset, int length, + int dummy) +{ + int len = 0; + int i; + + for (i=0; i<sizeof(snmp6_list)/sizeof(snmp6_list[0]); i++) + len += sprintf(buffer+len, "%-32s\t%ld\n", snmp6_list[i].name, + *(snmp6_list[i].ptr)); + + len -= offset; + + if (len > length) + len = length; + if(len < 0) + len = 0; + + *start = buffer + offset; + + return len; +} diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 659ec59cc..76339ff58 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -7,7 +7,7 @@ * * Adapted from linux/net/ipv4/raw.c * - * $Id: raw.c,v 1.20 1998/07/15 05:05:41 davem Exp $ + * $Id: raw.c,v 1.21 1998/08/26 12:05:13 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -156,9 +156,8 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) /* Check if the address belongs to the host. */ if (addr_type == IPV6_ADDR_MAPPED) { - v4addr = addr->sin6_addr.s6_addr32[3]; - if (inet_addr_type(v4addr) != RTN_LOCAL) - return(-EADDRNOTAVAIL); + /* Raw sockets are IPv6 only */ + return(-EADDRNOTAVAIL); } else { if (addr_type != IPV6_ADDR_ANY) { /* ipv4 addr of the socket is invalid. Only the @@ -182,10 +181,11 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) return 0; } -void rawv6_err(struct sock *sk, int type, int code, unsigned char *buff, - struct in6_addr *saddr, struct in6_addr *daddr) +void rawv6_err(struct sock *sk, struct sk_buff *skb, struct ipv6hdr *hdr, + struct inet6_skb_parm *opt, + int type, int code, unsigned char *buff, u32 info) { - if (sk == NULL) + if (sk == NULL) return; } @@ -193,12 +193,12 @@ static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) { /* Charge it to the socket. 
*/ if (sock_queue_rcv_skb(sk,skb)<0) { - /* ip_statistics.IpInDiscards++; */ + ipv6_statistics.Ip6InDiscards++; kfree_skb(skb); return 0; } - /* ip_statistics.IpInDelivers++; */ + ipv6_statistics.Ip6InDelivers++; return 0; } @@ -209,22 +209,11 @@ static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) * maybe we could have the network decide uppon a hint if it * should call raw_rcv for demultiplexing */ -int rawv6_rcv(struct sk_buff *skb, struct device *dev, - struct in6_addr *saddr, struct in6_addr *daddr, - struct ipv6_options *opt, unsigned short len) +int rawv6_rcv(struct sock *sk, struct sk_buff *skb, unsigned long len) { - struct sock *sk; - - sk = skb->sk; - if (sk->ip_hdrincl) skb->h.raw = skb->nh.raw; - if (sk->sock_readers) { - __skb_queue_tail(&sk->back_log, skb); - return 0; - } - rawv6_rcv_skb(sk, skb); return 0; } @@ -255,8 +244,12 @@ int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, if (!skb) goto out; - copied = min(len, skb->tail - skb->h.raw); - + copied = skb->tail - skb->h.raw; + if (copied > len) { + copied = len; + msg->msg_flags |= MSG_TRUNC; + } + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); sk->stamp=skb->stamp; if (err) @@ -269,7 +262,7 @@ int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, sizeof(struct in6_addr)); } - if (msg->msg_controllen) + if (sk->net_pinfo.af_inet6.rxopt.all) datagram_recv_ctl(sk, msg, skb); err = copied; @@ -332,11 +325,9 @@ static int rawv6_frag_cksum(const void *data, struct in6_addr *addr, csum = (__u16 *) (buff + opt->offset); *csum = hdr->cksum; } else { - /* - * FIXME - * signal an error to user via sk->err - */ - printk(KERN_DEBUG "icmp: cksum offset too big\n"); + if (net_ratelimit()) + printk(KERN_DEBUG "icmp: cksum offset too big\n"); + return -EINVAL; } } return 0; @@ -345,10 +336,10 @@ static int rawv6_frag_cksum(const void *data, struct in6_addr *addr, static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len) { - struct ipv6_options opt_space; + struct ipv6_txoptions opt_space; struct sockaddr_in6 * sin6 = (struct sockaddr_in6 *) msg->msg_name; struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; - struct ipv6_options *opt = NULL; + struct ipv6_txoptions *opt = NULL; struct in6_addr *saddr = NULL; struct flowi fl; int addr_len = msg->msg_namelen; @@ -360,11 +351,8 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len) /* Rough check on arithmetic overflow, better check is made in ip6_build_xmit - - When jumbo header will be implemeted we will remove it - at all (len will be size_t) */ - if (len < 0 || len > 0xFFFF) + if (len < 0) return -EMSGSIZE; /* Mirror BSD error message compatibility */ @@ -394,14 +382,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len) return(-EINVAL); daddr = &sin6->sin6_addr; - - /* BUGGGG If route is not cloned, this check always - fails, hence dst_cache only slows down tramsmission --ANK - */ - if (sk->dst_cache && ipv6_addr_cmp(daddr, &np->daddr)) { - dst_release(sk->dst_cache); - sk->dst_cache = NULL; - } } else { if (sk->state != TCP_ESTABLISHED) return(-EINVAL); @@ -422,12 +402,14 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len) if (msg->msg_controllen) { opt = &opt_space; - memset(opt, 0, sizeof(struct ipv6_options)); + memset(opt, 0, sizeof(struct ipv6_txoptions)); err = datagram_send_ctl(msg, &fl.oif, &saddr, opt, &hlimit); if (err < 0) return err; } + if (opt == NULL || !(opt->opt_nflen|opt->opt_flen)) + opt = np->opt; raw_opt = &sk->tp_pinfo.tp_raw; @@ 
-594,8 +576,9 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname, static void rawv6_close(struct sock *sk, unsigned long timeout) { + /* See for explanation: raw_close in ipv4/raw.c */ sk->state = TCP_CLOSE; - ipv6_sock_mc_close(sk); + raw_v6_unhash(sk); if (sk->num == IPPROTO_RAW) ip6_ra_control(sk, -1, NULL); sk->dead = 1; @@ -619,7 +602,7 @@ struct proto rawv6_prot = { datagram_poll, /* poll */ NULL, /* ioctl */ rawv6_init_sk, /* init */ - NULL, /* destroy */ + inet6_destroy_sock, /* destroy */ NULL, /* shutdown */ rawv6_setsockopt, /* setsockopt */ rawv6_getsockopt, /* getsockopt */ diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index e78cf97a2..e455b0533 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: reassembly.c,v 1.10 1998/04/30 16:24:32 freitag Exp $ + * $Id: reassembly.c,v 1.11 1998/08/26 12:05:16 davem Exp $ * * Based on: net/ipv4/ip_fragment.c * @@ -41,83 +41,145 @@ #include <net/ndisc.h> #include <net/addrconf.h> +int sysctl_ip6frag_high_thresh = 256*1024; +int sysctl_ip6frag_low_thresh = 192*1024; +int sysctl_ip6frag_time = IPV6_FRAG_TIMEOUT; + +atomic_t ip6_frag_mem = ATOMIC_INIT(0); + +struct ipv6_frag { + __u16 offset; + __u16 len; + struct sk_buff *skb; + + struct frag_hdr *fhdr; + + struct ipv6_frag *next; +}; + +/* + * Equivalent of ipv4 struct ipq + */ + +struct frag_queue { + + struct frag_queue *next; + struct frag_queue *prev; + + __u32 id; /* fragment id */ + struct in6_addr saddr; + struct in6_addr daddr; + struct timer_list timer; /* expire timer */ + struct ipv6_frag *fragments; + struct device *dev; + int iif; + __u8 last_in; /* has first/last segment arrived? */ +#define FIRST_IN 2 +#define LAST_IN 1 + __u8 nexthdr; + __u16 nhoffset; +}; static struct frag_queue ipv6_frag_queue = { &ipv6_frag_queue, &ipv6_frag_queue, 0, {{{0}}}, {{{0}}}, {0}, NULL, NULL, - 0, 0, NULL + 0, 0, 0, 0 }; +/* Memory Tracking Functions. */ +extern __inline__ void frag_kfree_skb(struct sk_buff *skb) +{ + atomic_sub(skb->truesize, &ip6_frag_mem); + kfree_skb(skb); +} + +extern __inline__ void frag_kfree_s(void *ptr, int len) +{ + atomic_sub(len, &ip6_frag_mem); + kfree(ptr); +} + +extern __inline__ void *frag_kmalloc(int size, int pri) +{ + void *vp = kmalloc(size, pri); + + if(!vp) + return NULL; + atomic_add(size, &ip6_frag_mem); + return vp; +} + + static void create_frag_entry(struct sk_buff *skb, - struct device *dev, __u8 *nhptr, struct frag_hdr *fhdr); -static int reasm_frag_1(struct frag_queue *fq, - struct sk_buff **skb_in); +static u8 * reasm_frag(struct frag_queue *fq, + struct sk_buff **skb_in); static void reasm_queue(struct frag_queue *fq, struct sk_buff *skb, - struct frag_hdr *fhdr); + struct frag_hdr *fhdr, + u8 *nhptr); -static int reasm_frag(struct frag_queue *fq, struct sk_buff **skb, - __u8 *nhptr, - struct frag_hdr *fhdr) -{ - __u32 expires = jiffies + IPV6_FRAG_TIMEOUT; - int nh; - - if (del_timer(&fq->timer)) - expires = fq->timer.expires; +static void fq_free(struct frag_queue *fq); - /* - * We queue the packet even if it's the last. - * It's a trade off. This allows the reassembly - * code to be simpler (=faster) and of the - * steps we do for queueing the only unnecessary - * one it's the kmalloc for a struct ipv6_frag. - * Feel free to try other alternatives... 
- */ - if ((fhdr->frag_off & __constant_htons(0x0001)) == 0) { - fq->last_in = 1; - fq->nhptr = nhptr; - } - reasm_queue(fq, *skb, fhdr); +static void frag_prune(void) +{ + struct frag_queue *fq; - if (fq->last_in) { - if ((nh = reasm_frag_1(fq, skb))) - return nh; + while ((fq = ipv6_frag_queue.next) != &ipv6_frag_queue) { + ipv6_statistics.Ip6ReasmFails++; + fq_free(fq); + if (atomic_read(&ip6_frag_mem) <= sysctl_ip6frag_low_thresh) + return; } - - fq->timer.expires = expires; - add_timer(&fq->timer); - - return 0; + if (atomic_read(&ip6_frag_mem)) + printk(KERN_DEBUG "IPv6 frag_prune: memleak\n"); + atomic_set(&ip6_frag_mem, 0); } -int ipv6_reassembly(struct sk_buff **skbp, struct device *dev, __u8 *nhptr, - struct ipv6_options *opt) + +u8* ipv6_reassembly(struct sk_buff **skbp, __u8 *nhptr) { struct sk_buff *skb = *skbp; struct frag_hdr *fhdr = (struct frag_hdr *) (skb->h.raw); struct frag_queue *fq; struct ipv6hdr *hdr; + hdr = skb->nh.ipv6h; + + ipv6_statistics.Ip6ReasmReqds++; + + /* Jumbo payload inhibits frag. header */ + if (hdr->payload_len==0) { + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw); + return NULL; + } if ((u8 *)(fhdr+1) > skb->tail) { icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw); - return 0; + return NULL; } - hdr = skb->nh.ipv6h; + if (atomic_read(&ip6_frag_mem) > sysctl_ip6frag_high_thresh) + frag_prune(); + for (fq = ipv6_frag_queue.next; fq != &ipv6_frag_queue; fq = fq->next) { if (fq->id == fhdr->identification && !ipv6_addr_cmp(&hdr->saddr, &fq->saddr) && - !ipv6_addr_cmp(&hdr->daddr, &fq->daddr)) - return reasm_frag(fq, skbp, nhptr,fhdr); + !ipv6_addr_cmp(&hdr->daddr, &fq->daddr)) { + + reasm_queue(fq, skb, fhdr, nhptr); + + if (fq->last_in == (FIRST_IN|LAST_IN)) + return reasm_frag(fq, skbp); + + return NULL; + } } - - create_frag_entry(skb, dev, nhptr, fhdr); - return 0; + create_frag_entry(skb, nhptr, fhdr); + + return NULL; } @@ -125,11 +187,13 @@ static void fq_free(struct frag_queue *fq) { struct ipv6_frag *fp, *back; - for(fp = fq->fragments; fp; ) { - kfree_skb(fp->skb); + del_timer(&fq->timer); + + for (fp = fq->fragments; fp; ) { + frag_kfree_skb(fp->skb); back = fp; fp=fp->next; - kfree(back); + frag_kfree_s(back, sizeof(*back)); } fq->prev->next = fq->next; @@ -137,7 +201,7 @@ static void fq_free(struct frag_queue *fq) fq->prev = fq->next = NULL; - kfree(fq); + frag_kfree_s(fq, sizeof(*fq)); } static void frag_expire(unsigned long data) @@ -147,33 +211,50 @@ static void frag_expire(unsigned long data) fq = (struct frag_queue *) data; - del_timer(&fq->timer); - frag = fq->fragments; + ipv6_statistics.Ip6ReasmTimeout++; + ipv6_statistics.Ip6ReasmFails++; + if (frag == NULL) { printk(KERN_DEBUG "invalid fragment queue\n"); return; } - icmpv6_send(frag->skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0, - frag->skb->dev); + /* Send error only if the first segment arrived. + (fixed --ANK (980728)) + */ + if (fq->last_in&FIRST_IN) { + struct device *dev = dev_get_by_index(fq->iif); + + /* + But use as source device on which LAST ARRIVED + segment was received. And do not use fq->dev + pointer directly, device might already disappeared. 
+ */ + if (dev) { + frag->skb->dev = dev; + icmpv6_send(frag->skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0, + dev); + } + } fq_free(fq); } -static void create_frag_entry(struct sk_buff *skb, struct device *dev, +static void create_frag_entry(struct sk_buff *skb, __u8 *nhptr, struct frag_hdr *fhdr) { struct frag_queue *fq; struct ipv6hdr *hdr; - fq = (struct frag_queue *) kmalloc(sizeof(struct frag_queue), - GFP_ATOMIC); + fq = (struct frag_queue *) frag_kmalloc(sizeof(struct frag_queue), + GFP_ATOMIC); if (fq == NULL) { + ipv6_statistics.Ip6ReasmFails++; kfree_skb(skb); return; } @@ -186,38 +267,41 @@ static void create_frag_entry(struct sk_buff *skb, struct device *dev, ipv6_addr_copy(&fq->saddr, &hdr->saddr); ipv6_addr_copy(&fq->daddr, &hdr->daddr); - fq->dev = dev; - /* init_timer has been done by the memset */ fq->timer.function = frag_expire; fq->timer.data = (long) fq; - fq->timer.expires = jiffies + IPV6_FRAG_TIMEOUT; + fq->timer.expires = jiffies + sysctl_ip6frag_time; - fq->nexthdr = fhdr->nexthdr; + reasm_queue(fq, skb, fhdr, nhptr); + if (fq->fragments) { + fq->prev = ipv6_frag_queue.prev; + fq->next = &ipv6_frag_queue; + fq->prev->next = fq; + ipv6_frag_queue.prev = fq; - if ((fhdr->frag_off & __constant_htons(0x0001)) == 0) { - fq->last_in = 1; - fq->nhptr = nhptr; - } - reasm_queue(fq, skb, fhdr); - - fq->prev = ipv6_frag_queue.prev; - fq->next = &ipv6_frag_queue; - fq->prev->next = fq; - ipv6_frag_queue.prev = fq; - - add_timer(&fq->timer); + add_timer(&fq->timer); + } else + frag_kfree_s(fq, sizeof(*fq)); } +/* + * We queue the packet even if it's the last. + * It's a trade off. This allows the reassembly + * code to be simpler (=faster) and of the + * steps we do for queueing the only unnecessary + * one it's the kmalloc for a struct ipv6_frag. + * Feel free to try other alternatives... + */ + static void reasm_queue(struct frag_queue *fq, struct sk_buff *skb, - struct frag_hdr *fhdr) + struct frag_hdr *fhdr, u8 *nhptr) { struct ipv6_frag *nfp, *fp, **bptr; - nfp = (struct ipv6_frag *) kmalloc(sizeof(struct ipv6_frag), - GFP_ATOMIC); + nfp = (struct ipv6_frag *) frag_kmalloc(sizeof(struct ipv6_frag), + GFP_ATOMIC); if (nfp == NULL) { kfree_skb(skb); @@ -228,24 +312,40 @@ static void reasm_queue(struct frag_queue *fq, struct sk_buff *skb, nfp->len = (ntohs(skb->nh.ipv6h->payload_len) - ((u8 *) (fhdr + 1) - (u8 *) (skb->nh.ipv6h + 1))); - if ((u32)nfp->offset + (u32)nfp->len > 65536) { + if ((u32)nfp->offset + (u32)nfp->len >= 65536) { icmpv6_param_prob(skb,ICMPV6_HDR_FIELD, (u8*)&fhdr->frag_off); goto err; } + if (fhdr->frag_off & __constant_htons(0x0001)) { + /* Check if the fragment is rounded to 8 bytes. + * Required by the RFC. + * ... and would break our defragmentation algorithm 8) + */ + if (nfp->len & 0x7) { + printk(KERN_DEBUG "fragment not rounded to 8bytes\n"); + + /* + It is not in specs, but I see no reasons + to send an error in this case. --ANK + */ + if (nfp->offset == 0) + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, + &skb->nh.ipv6h->payload_len); + goto err; + } + } nfp->skb = skb; nfp->fhdr = fhdr; - nfp->next = NULL; bptr = &fq->fragments; - + for (fp = fq->fragments; fp; fp=fp->next) { if (nfp->offset <= fp->offset) break; bptr = &fp->next; } - if (fp && fp->offset == nfp->offset) { if (nfp->len != fp->len) { printk(KERN_DEBUG "reasm_queue: dup with wrong len\n"); @@ -254,29 +354,40 @@ static void reasm_queue(struct frag_queue *fq, struct sk_buff *skb, /* duplicate. discard it. 
*/ goto err; } - - *bptr = nfp; - nfp->next = fp; -#ifdef STRICT_RFC - if (fhdr->frag_off & __constant_htons(0x0001)) { - /* Check if the fragment is rounded to 8 bytes. - * Required by the RFC. - */ - if (nfp->len & 0x7) { - printk(KERN_DEBUG "fragment not rounded to 8bytes\n"); + atomic_add(skb->truesize, &ip6_frag_mem); - icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, - &skb->nh.ipv6h->payload_len); - goto err; - } + /* All the checks are done, fragment is acepted. + Only now we are allowed to update reassembly data! + (fixed --ANK (980728)) + */ + + /* iif always set to one of the last arrived segment */ + fq->dev = skb->dev; + fq->iif = skb->dev->ifindex; + + /* Last fragment */ + if ((fhdr->frag_off & __constant_htons(0x0001)) == 0) + fq->last_in |= LAST_IN; + + /* First fragment. + nexthdr and nhptr are get from the first fragment. + Moreover, nexthdr is UNDEFINED for all the fragments but the + first one. + (fixed --ANK (980728)) + */ + if (nfp->offset == 0) { + fq->nexthdr = fhdr->nexthdr; + fq->last_in |= FIRST_IN; + fq->nhoffset = nhptr - skb->nh.raw; } -#endif + *bptr = nfp; + nfp->next = fp; return; err: - kfree(nfp); + frag_kfree_s(nfp, sizeof(*nfp)); kfree_skb(skb); } @@ -284,20 +395,21 @@ err: * check if this fragment completes the packet * returns true on success */ -static int reasm_frag_1(struct frag_queue *fq, struct sk_buff **skb_in) +static u8* reasm_frag(struct frag_queue *fq, struct sk_buff **skb_in) { struct ipv6_frag *fp; + struct ipv6_frag *head = fq->fragments; struct ipv6_frag *tail = NULL; struct sk_buff *skb; __u32 offset = 0; __u32 payload_len; __u16 unfrag_len; __u16 copy; - int nh; + u8 *nhptr; - for(fp = fq->fragments; fp; fp=fp->next) { + for(fp = head; fp; fp=fp->next) { if (offset != fp->offset) - return 0; + return NULL; offset += fp->len; tail = fp; @@ -309,31 +421,42 @@ static int reasm_frag_1(struct frag_queue *fq, struct sk_buff **skb_in) * this means we have all fragments. */ - unfrag_len = (u8 *) (tail->fhdr) - (u8 *) (tail->skb->nh.ipv6h + 1); + /* Unfragmented part is taken from the first segment. 
+ (fixed --ANK (980728)) + */ + unfrag_len = (u8 *) (head->fhdr) - (u8 *) (head->skb->nh.ipv6h + 1); payload_len = (unfrag_len + tail->offset + (tail->skb->tail - (__u8 *) (tail->fhdr + 1))); -#if 0 - printk(KERN_DEBUG "reasm: payload len = %d\n", payload_len); -#endif + if (payload_len > 65535) { + if (net_ratelimit()) + printk(KERN_DEBUG "reasm_frag: payload len = %d\n", payload_len); + ipv6_statistics.Ip6ReasmFails++; + fq_free(fq); + return NULL; + } if ((skb = dev_alloc_skb(sizeof(struct ipv6hdr) + payload_len))==NULL) { - printk(KERN_DEBUG "reasm_frag: no memory for reassembly\n"); + if (net_ratelimit()) + printk(KERN_DEBUG "reasm_frag: no memory for reassembly\n"); + ipv6_statistics.Ip6ReasmFails++; fq_free(fq); - return 1; + return NULL; } copy = unfrag_len + sizeof(struct ipv6hdr); skb->nh.ipv6h = (struct ipv6hdr *) skb->data; - skb->dev = fq->dev; + skb->protocol = __constant_htons(ETH_P_IPV6); + skb->pkt_type = head->skb->pkt_type; + memcpy(skb->cb, head->skb->cb, sizeof(skb->cb)); + skb->dst = dst_clone(head->skb->dst); - nh = fq->nexthdr; - - *(fq->nhptr) = nh; - memcpy(skb_put(skb, copy), tail->skb->nh.ipv6h, copy); + memcpy(skb_put(skb, copy), head->skb->nh.ipv6h, copy); + nhptr = skb->nh.raw + fq->nhoffset; + *nhptr = fq->nexthdr; skb->h.raw = skb->tail; @@ -351,18 +474,19 @@ static int reasm_frag_1(struct frag_queue *fq, struct sk_buff **skb_in) struct ipv6_frag *back; memcpy(skb_put(skb, fp->len), (__u8*)(fp->fhdr + 1), fp->len); - kfree_skb(fp->skb); + frag_kfree_skb(fp->skb); back = fp; fp=fp->next; - kfree(back); + frag_kfree_s(back, sizeof(*back)); } - + + del_timer(&fq->timer); fq->prev->next = fq->next; fq->next->prev = fq->prev; - fq->prev = fq->next = NULL; - - kfree(fq); - return nh; + frag_kfree_s(fq, sizeof(*fq)); + + ipv6_statistics.Ip6ReasmOKs++; + return nhptr; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 9d159fe36..8d1f59632 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: route.c,v 1.32 1998/07/25 23:28:52 davem Exp $ + * $Id: route.c,v 1.33 1998/08/26 12:05:18 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -53,10 +53,19 @@ #if RT6_DEBUG >= 3 #define RDBG(x) printk x +#define RT6_TRACE(x...) printk(KERN_DEBUG x) #else #define RDBG(x) +#define RT6_TRACE(x...) 
do { ; } while (0) #endif +#if RT6_DEBUG >= 1 +#define BUG_TRAP(x) ({ if (!(x)) { printk("Assertion (" #x ") failed at " __FILE__ "(%d):" __FUNCTION__ "\n", __LINE__); } }) +#else +#define BUG_TRAP(x) do { ; } while (0) +#endif + + int ip6_rt_max_size = 4096; int ip6_rt_gc_min_interval = 5*HZ; int ip6_rt_gc_timeout = 60*HZ; @@ -87,16 +96,16 @@ struct dst_ops ip6_dst_ops = { }; struct rt6_info ip6_null_entry = { - {{NULL, ATOMIC_INIT(1), ATOMIC_INIT(1), NULL, - -1, 0, 0, 0, 0, 0, 0, 0, 0, + {{NULL, ATOMIC_INIT(1), ATOMIC_INIT(1), &loopback_dev, + -1, 0, 0, 0, 0, 0, 0, 0, -ENETUNREACH, NULL, NULL, ip6_pkt_discard, ip6_pkt_discard, #ifdef CONFIG_NET_CLS_ROUTE 0, #endif &ip6_dst_ops}}, - NULL, {{{0}}}, 256, RTF_REJECT|RTF_NONEXTHOP, ~0U, - 255, 0, {NULL}, {{{{0}}}, 0}, {{{{0}}}, 0} + NULL, {{{0}}}, RTF_REJECT|RTF_NONEXTHOP, ~0U, + 255, 0, ATOMIC_INIT(1), {NULL}, {{{{0}}}, 0}, {{{{0}}}, 0} }; struct fib6_node ip6_routing_table = { @@ -123,89 +132,6 @@ static struct rt6_info *rt6_flow_lookup(struct rt6_info *rt, #define ip6_rt_policy (0) #endif -static atomic_t rt6_tbl_lock = ATOMIC_INIT(0); -static int rt6_bh_mask = 0; - -#define RT_BH_REQUEST 1 -#define RT_BH_GC 2 - -static void __rt6_run_bh(void); - -/* - * request queue operations - * FIFO queue/dequeue - */ - -static struct rt6_req request_queue = { - 0, NULL, &request_queue, &request_queue -}; - -static __inline__ void rtreq_queue(struct rt6_req * req) -{ - unsigned long flags; - struct rt6_req *next = &request_queue; - - save_flags(flags); - cli(); - - req->prev = next->prev; - req->prev->next = req; - next->prev = req; - req->next = next; - restore_flags(flags); -} - -static __inline__ struct rt6_req * rtreq_dequeue(void) -{ - struct rt6_req *next = &request_queue; - struct rt6_req *head; - - head = next->next; - - if (head == next) - return NULL; - - head->next->prev = head->prev; - next->next = head->next; - - head->next = NULL; - head->prev = NULL; - - return head; -} - -void rtreq_add(struct rt6_info *rt, int operation) -{ - struct rt6_req *rtreq; - - rtreq = kmalloc(sizeof(struct rt6_req), GFP_ATOMIC); - - if (rtreq == NULL) - return; - - memset(rtreq, 0, sizeof(struct rt6_req)); - - rtreq->operation = operation; - rtreq->ptr = rt; - rtreq_queue(rtreq); - - rt6_bh_mask |= RT_BH_REQUEST; -} - -static __inline__ void rt6_lock(void) -{ - atomic_inc(&rt6_tbl_lock); -} - -static __inline__ void rt6_unlock(void) -{ - if (atomic_dec_and_test(&rt6_tbl_lock) && rt6_bh_mask) { - start_bh_atomic(); - __rt6_run_bh(); - end_bh_atomic(); - } -} - /* * Route lookup */ @@ -219,23 +145,19 @@ static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt, if (oif) { for (sprt = rt; sprt; sprt = sprt->u.next) { - if (sprt->rt6i_dev) { - if (sprt->rt6i_dev->ifindex == oif) - return sprt; - if (sprt->rt6i_dev->flags&IFF_LOOPBACK) - local = sprt; - } + struct device *dev = sprt->rt6i_dev; + if (dev->ifindex == oif) + return sprt; + if (dev->flags&IFF_LOOPBACK) + local = sprt; } if (local) return local; - if (strict) { - RDBG(("nomatch & STRICT --> ip6_null_entry\n")); + if (strict) return &ip6_null_entry; - } } - RDBG(("!dev or (no match and !strict) --> rt(%p)\n", rt)); return rt; } @@ -282,7 +204,7 @@ static struct rt6_info *rt6_best_dflt(struct rt6_info *rt, int oif) break; }; - if (oif && sprt->rt6i_dev && sprt->rt6i_dev->ifindex == oif) { + if (oif && sprt->rt6i_dev->ifindex == oif) { m += 2; } @@ -319,21 +241,40 @@ out: } struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr, - int oif, int flags) + int oif, int 
strict) { struct fib6_node *fn; struct rt6_info *rt; - rt6_lock(); + start_bh_atomic(); fn = fib6_lookup(&ip6_routing_table, daddr, saddr); - rt = rt6_device_match(fn->leaf, oif, flags&RTF_LINKRT); - rt6_unlock(); - return rt; + rt = rt6_device_match(fn->leaf, oif, strict); + atomic_inc(&rt->u.dst.use); + atomic_inc(&rt->u.dst.refcnt); + end_bh_atomic(); + + rt->u.dst.lastuse = jiffies; + if (rt->u.dst.error == 0) + return rt; + dst_release(&rt->u.dst); + return NULL; +} + +static int rt6_ins(struct rt6_info *rt) +{ + int err; + + start_bh_atomic(); + err = fib6_add(&ip6_routing_table, rt); + end_bh_atomic(); + + return err; } static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr, struct in6_addr *saddr) { + int err; struct rt6_info *rt; /* @@ -351,18 +292,24 @@ static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr, rt->rt6i_dst.plen = 128; rt->rt6i_flags |= RTF_CACHE; - if (rt->rt6i_src.plen) { +#ifdef CONFIG_IPV6_SUBTREES + if (rt->rt6i_src.plen && saddr) { ipv6_addr_copy(&rt->rt6i_src.addr, saddr); rt->rt6i_src.plen = 128; } +#endif rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); - rtreq_add(rt, RT_OPER_ADD); - } else { - rt = &ip6_null_entry; + dst_clone(&rt->u.dst); + err = rt6_ins(rt); + if (err == 0) + return rt; + rt->u.dst.error = err; + return rt; } - return rt; + dst_clone(&ip6_null_entry.u.dst); + return &ip6_null_entry; } #ifdef CONFIG_RT6_POLICY @@ -397,24 +344,38 @@ static __inline__ struct rt6_info *rt6_flow_lookup_out(struct rt6_info *rt, #endif +#define BACKTRACK() \ +if (rt == &ip6_null_entry && strict) { \ + while ((fn = fn->parent) != NULL) { \ + if (fn->fn_flags & RTN_ROOT) { \ + dst_clone(&rt->u.dst); \ + goto out; \ + } \ + if (fn->fn_flags & RTN_RTINFO) \ + goto restart; \ + } \ +} + + void ip6_route_input(struct sk_buff *skb) { struct fib6_node *fn; struct rt6_info *rt; - struct dst_entry *dst; + int strict; + + strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL); - RDBG(("ip6_route_input(%p) from %p\n", skb, __builtin_return_address(0))); - if ((dst = skb->dst) != NULL) - goto looped_back; - rt6_lock(); fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr); +restart: rt = fn->leaf; if ((rt->rt6i_flags & RTF_CACHE)) { if (ip6_rt_policy == 0) { - rt = rt6_device_match(rt, skb->dev->ifindex, 0); + rt = rt6_device_match(rt, skb->dev->ifindex, strict); + BACKTRACK(); + dst_clone(&rt->u.dst); goto out; } @@ -425,6 +386,7 @@ void ip6_route_input(struct sk_buff *skb) for (sprt = rt; sprt; sprt = sprt->u.next) { if (rt6_flow_match_in(sprt, skb)) { rt = sprt; + dst_clone(&rt->u.dst); goto out; } } @@ -433,38 +395,38 @@ void ip6_route_input(struct sk_buff *skb) } rt = rt6_device_match(rt, skb->dev->ifindex, 0); + BACKTRACK(); if (ip6_rt_policy == 0) { - if (!rt->rt6i_nexthop && rt->rt6i_dev && - ((rt->rt6i_flags & RTF_NONEXTHOP) == 0)) { + if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) { rt = rt6_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr); + goto out; } + dst_clone(&rt->u.dst); } else { #ifdef CONFIG_RT6_POLICY rt = rt6_flow_lookup_in(rt, skb); +#else + /* NEVER REACHED */ #endif } out: - dst = dst_clone((struct dst_entry *) rt); - rt6_unlock(); - - skb->dst = dst; -looped_back: - dst->input(skb); + rt->u.dst.lastuse = jiffies; + atomic_inc(&rt->u.dst.refcnt); + skb->dst = (struct dst_entry *) rt; } struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl) { struct fib6_node *fn; struct rt6_info *rt; - 
struct dst_entry *dst; int strict; strict = ipv6_addr_type(fl->nl_u.ip6_u.daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL); - rt6_lock(); + start_bh_atomic(); fn = fib6_lookup(&ip6_routing_table, fl->nl_u.ip6_u.daddr, fl->nl_u.ip6_u.saddr); @@ -472,25 +434,10 @@ restart: rt = fn->leaf; if ((rt->rt6i_flags & RTF_CACHE)) { - RDBG(("RTF_CACHE ")); if (ip6_rt_policy == 0) { rt = rt6_device_match(rt, fl->oif, strict); - - /* BUGGGG! It is capital bug, that was hidden - by not-cloning multicast routes. However, - the same problem was with link-local addresses. - Fix is the following if-statement, - but it will not properly handle Pedro's subtrees --ANK - */ - if (rt == &ip6_null_entry && strict) { - while ((fn = fn->parent) != NULL) { - if (fn->fn_flags & RTN_ROOT) - goto out; - if (fn->fn_flags & RTN_RTINFO) - goto restart; - } - } - RDBG(("devmatch(%p) ", rt)); + BACKTRACK(); + dst_clone(&rt->u.dst); goto out; } @@ -501,68 +448,46 @@ restart: for (sprt = rt; sprt; sprt = sprt->u.next) { if (rt6_flow_match_out(sprt, sk)) { rt = sprt; + dst_clone(&rt->u.dst); goto out; } } } #endif } - RDBG(("!RTF_CACHE ")); if (rt->rt6i_flags & RTF_DEFAULT) { - RDBG(("RTF_DEFAULT ")); - if (rt->rt6i_metric >= IP6_RT_PRIO_ADDRCONF) { + if (rt->rt6i_metric >= IP6_RT_PRIO_ADDRCONF) rt = rt6_best_dflt(rt, fl->oif); - RDBG(("best_dflt(%p) ", rt)); - } } else { rt = rt6_device_match(rt, fl->oif, strict); - RDBG(("!RTF_DEFAULT devmatch(%p) ", rt)); + BACKTRACK(); } if (ip6_rt_policy == 0) { - if (!rt->rt6i_nexthop && rt->rt6i_dev && - ((rt->rt6i_flags & RTF_NONEXTHOP) == 0)) { + if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) { rt = rt6_cow(rt, fl->nl_u.ip6_u.daddr, fl->nl_u.ip6_u.saddr); - RDBG(("(!nhop&&rt6i_dev&&!RTF_NONEXTHOP) cow(%p) ", rt)); + goto out; } + dst_clone(&rt->u.dst); } else { #ifdef CONFIG_RT6_POLICY rt = rt6_flow_lookup_out(rt, sk, fl); +#else + /* NEVER REACHED */ #endif } out: - dst = dst_clone((struct dst_entry *) rt); - rt6_unlock(); - RDBG(("dclone/ret(%p)\n", dst)); - return dst; -} - - -static void rt6_ins(struct rt6_info *rt) -{ - start_bh_atomic(); - if (atomic_read(&rt6_tbl_lock) == 1) - fib6_add(&ip6_routing_table, rt); - else - rtreq_add(rt, RT_OPER_ADD); + rt->u.dst.lastuse = jiffies; + atomic_inc(&rt->u.dst.refcnt); end_bh_atomic(); + return &rt->u.dst; } + /* * Destination cache support functions - * - * BUGGG! This function is absolutely wrong. - * First of all it is never called. (look at include/net/dst.h) - * Second, even when it is called rt->rt6i_node == NULL - * ** partially fixed: now dst->obsolete = -1 for IPv6 not cache routes. - * Third, even we fixed previous bugs, - * it will not work because sernum is incorrectly checked/updated and - * it does not handle change of the parent of cloned route. - * Purging stray clones is not easy task, it would require - * massive remake of ip6_fib.c. Alas... 
- * --ANK */ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) @@ -646,7 +571,7 @@ static int ipv6_get_mtu(struct device *dev) if (idev) return idev->cnf.mtu6; else - return 576; + return IPV6_MIN_MTU; } static int ipv6_get_hoplimit(struct device *dev) @@ -664,72 +589,68 @@ static int ipv6_get_hoplimit(struct device *dev) * */ -struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) +int ip6_route_add(struct in6_rtmsg *rtmsg) { + int err; struct rt6_info *rt; struct device *dev = NULL; int addr_type; - - if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128) { - *err = -EINVAL; - return NULL; - } + + if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128) + return -EINVAL; +#ifndef CONFIG_IPV6_SUBTREES + if (rtmsg->rtmsg_src_len) + return -EINVAL; +#endif if (rtmsg->rtmsg_metric == 0) rtmsg->rtmsg_metric = IP6_RT_PRIO_USER; - *err = 0; - rt = dst_alloc(sizeof(struct rt6_info), &ip6_dst_ops); - if (rt == NULL) { - RDBG(("dalloc fails, ")); - *err = -ENOMEM; - return NULL; - } + if (rt == NULL) + return -ENOMEM; rt->u.dst.obsolete = -1; rt->rt6i_expires = rtmsg->rtmsg_info; addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst); - if (addr_type & IPV6_ADDR_MULTICAST) { - RDBG(("MCAST, ")); + if (addr_type & IPV6_ADDR_MULTICAST) rt->u.dst.input = ip6_mc_input; - } else { - RDBG(("!MCAST ")); + else rt->u.dst.input = ip6_forward; - } rt->u.dst.output = ip6_output; if (rtmsg->rtmsg_ifindex) { dev = dev_get_by_index(rtmsg->rtmsg_ifindex); - if (dev == NULL) { - *err = -ENODEV; + err = -ENODEV; + if (dev == NULL) goto out; - } } ipv6_addr_copy(&rt->rt6i_dst.addr, &rtmsg->rtmsg_dst); rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len; ipv6_wash_prefix(&rt->rt6i_dst.addr, rt->rt6i_dst.plen); +#ifdef CONFIG_IPV6_SUBTREES ipv6_addr_copy(&rt->rt6i_src.addr, &rtmsg->rtmsg_src); rt->rt6i_src.plen = rtmsg->rtmsg_src_len; ipv6_wash_prefix(&rt->rt6i_src.addr, rt->rt6i_src.plen); +#endif + + rt->rt6i_metric = rtmsg->rtmsg_metric; /* We cannot add true routes via loopback here, they would result in kernel looping; promote them to reject routes */ if ((rtmsg->rtmsg_flags&RTF_REJECT) || (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) { - dev = dev_get("lo"); + dev = &loopback_dev; rt->u.dst.output = ip6_pkt_discard; rt->u.dst.input = ip6_pkt_discard; rt->u.dst.error = -ENETUNREACH; rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP; - rt->rt6i_metric = rtmsg->rtmsg_metric; - rt->rt6i_dev = dev; goto install_route; } @@ -746,50 +667,44 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) /* IPv6 strictly inhibits using not link-local addresses as nexthop address. + Otherwise, router will not able to send redirects. It is very good, but in some (rare!) curcumstances - (SIT, NBMA NOARP links) it is handy to allow - some exceptions. + (SIT, PtP, NBMA NOARP links) it is handy to allow + some exceptions. 
--ANK */ - if (!(gwa_type&IPV6_ADDR_UNICAST)) { - *err = -EINVAL; + err = -EINVAL; + if (!(gwa_type&IPV6_ADDR_UNICAST)) goto out; - } - grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, RTF_LINKRT); + grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1); - if (grt == NULL || (grt->rt6i_flags&RTF_GATEWAY)) { - *err = -EHOSTUNREACH; + err = -EHOSTUNREACH; + if (grt == NULL) goto out; - } + if (!(grt->rt6i_flags&RTF_GATEWAY)) + err = 0; dev = grt->rt6i_dev; + dst_release(&grt->u.dst); + + if (err) + goto out; } - if (dev == NULL || (dev->flags&IFF_LOOPBACK)) { - *err = -EINVAL; + err = -EINVAL; + if (dev == NULL || (dev->flags&IFF_LOOPBACK)) goto out; - } } - if (dev == NULL) { - RDBG(("!dev, ")); - *err = -ENODEV; + err = -ENODEV; + if (dev == NULL) goto out; - } if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) { rt->rt6i_nexthop = ndisc_get_neigh(dev, &rt->rt6i_gateway); - if (rt->rt6i_nexthop == NULL) { - RDBG(("!nxthop, ")); - *err = -ENOMEM; + err = -ENOMEM; + if (rt->rt6i_nexthop == NULL) goto out; - } - RDBG(("nxthop, ")); } - rt->rt6i_metric = rtmsg->rtmsg_metric; - - rt->rt6i_dev = dev; - rt->u.dst.pmtu = ipv6_get_mtu(dev); - rt->u.dst.rtt = TCP_TIMEOUT_INIT; if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) rt->rt6i_hoplimit = IPV6_DEFAULT_MCASTHOPS; else @@ -797,153 +712,59 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) rt->rt6i_flags = rtmsg->rtmsg_flags; install_route: - RDBG(("rt6ins(%p) ", rt)); - - rt6_lock(); - rt6_ins(rt); - rt6_unlock(); - - /* BUGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG! - - If rt6_ins will fail (and it occurs regularly f.e. if route - already existed), the route will be freed -> Finita. - Crash. No recovery. NO FIX. Unfortunately, it is not the only - place will it is fatal. It is sad, I believed this - code is a bit more accurate :-( - - Really, the problem can be solved in two ways: - - * As I did in old 2.0 IPv4: to increase use count and force - user to destroy stray route. It requires some care, - well, much more care. - * Second and the best: to get rid of this damn backlogging - system. I wonder why Pedro so liked it. It was the most - unhappy day when I invented it (well, by a strange reason - I believed that it is very clever :-)), - and when I managed to clean IPv4 of this crap, - it was really great win. - BTW I forgot how 2.0 route/arp works :-) :-) - --ANK - */ + rt->u.dst.pmtu = ipv6_get_mtu(dev); + rt->u.dst.rtt = TCP_TIMEOUT_INIT; + rt->rt6i_dev = dev; + return rt6_ins(rt); out: - if (*err) { - RDBG(("dfree(%p) ", rt)); - dst_free((struct dst_entry *) rt); - rt = NULL; - } - RDBG(("ret(%p)\n", rt)); -#if 0 - return rt; -#else - /* BUGGG! For now always return NULL. (see above) - - Really, it was used only in two places, and one of them - (rt6_add_dflt_router) is repaired, ip6_fw is not essential - at all. --ANK - */ - return NULL; -#endif + dst_free((struct dst_entry *) rt); + return err; } int ip6_del_rt(struct rt6_info *rt) { - rt6_lock(); + int err; start_bh_atomic(); - - /* I'd add here couple of cli() - cli(); cli(); cli(); - - Now it is really LOCKED. 
:-) :-) --ANK - */ - rt6_dflt_pointer = NULL; - - if (atomic_read(&rt6_tbl_lock) == 1) - fib6_del(rt); - else - rtreq_add(rt, RT_OPER_DEL); + err = fib6_del(rt); end_bh_atomic(); - rt6_unlock(); - return 0; + + return err; } int ip6_route_del(struct in6_rtmsg *rtmsg) { struct fib6_node *fn; struct rt6_info *rt; + int err = -ESRCH; - rt6_lock(); - fn = fib6_lookup(&ip6_routing_table, &rtmsg->rtmsg_dst, &rtmsg->rtmsg_src); - rt = fn->leaf; - - /* - * Blow it away - * - * BUGGGG It will not help with Pedro's subtrees. - * We urgently need fib6_locate_node function, and - * it is not the only place where rt6_lookup is used - * for wrong purpose. - * --ANK - */ -restart: - if (rt && rt->rt6i_src.plen == rtmsg->rtmsg_src_len) { - if (rt->rt6i_dst.plen > rtmsg->rtmsg_dst_len) { - struct fib6_node *fn = rt->rt6i_node; - while ((fn = fn->parent) != NULL) { - if (fn->fn_flags & RTN_ROOT) - break; - if (fn->fn_flags & RTN_RTINFO) { - rt = fn->leaf; - goto restart; - } - } - } + start_bh_atomic(); - if (rt->rt6i_dst.plen == rtmsg->rtmsg_dst_len) { - for ( ; rt; rt = rt->u.next) { - if (rtmsg->rtmsg_ifindex && - (rt->rt6i_dev == NULL || - rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex)) - continue; - if (rtmsg->rtmsg_flags&RTF_GATEWAY && - ipv6_addr_cmp(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway)) - continue; - if (rtmsg->rtmsg_metric && - rtmsg->rtmsg_metric != rt->rt6i_metric) - continue; - ip6_del_rt(rt); - rt6_unlock(); - return 0; - } + fn = fib6_locate(&ip6_routing_table, + &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len, + &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len); + + if (fn) { + for (rt = fn->leaf; rt; rt = rt->u.next) { + if (rtmsg->rtmsg_ifindex && + (rt->rt6i_dev == NULL || + rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex)) + continue; + if (rtmsg->rtmsg_flags&RTF_GATEWAY && + ipv6_addr_cmp(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway)) + continue; + if (rtmsg->rtmsg_metric && + rtmsg->rtmsg_metric != rt->rt6i_metric) + continue; + err = ip6_del_rt(rt); + break; } } - rt6_unlock(); - - return -ESRCH; -} - - -/* - * bottom handler, runs with atomic_bh protection - */ -void __rt6_run_bh(void) -{ - struct rt6_req *rtreq; + end_bh_atomic(); - while ((rtreq = rtreq_dequeue())) { - switch (rtreq->operation) { - case RT_OPER_ADD: - fib6_add(&ip6_routing_table, rtreq->ptr); - break; - case RT_OPER_DEL: - fib6_del(rtreq->ptr); - break; - }; - kfree(rtreq); - } - rt6_bh_mask = 0; + return err; } #ifdef CONFIG_IPV6_NETLINK @@ -971,10 +792,10 @@ static int rt6_msgrcv(int unit, struct sk_buff *skb) switch (rtmsg->rtmsg_type) { case RTMSG_NEWROUTE: - ip6_route_add(rtmsg, &err); + err = ip6_route_add(rtmsg); break; case RTMSG_DELROUTE: - ip6_route_del(rtmsg); + err = ip6_route_del(rtmsg); break; default: count = -EINVAL; @@ -1047,17 +868,19 @@ void rt6_sndmsg(int type, struct in6_addr *dst, struct in6_addr *src, /* * Handle redirects */ -struct rt6_info *rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr, - struct in6_addr *target, struct device *dev, - int on_link) +void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr, + struct neighbour *neigh, int on_link) { struct rt6_info *rt, *nrt; /* Locate old route to this destination. */ - rt = rt6_lookup(dest, NULL, dev->ifindex, 0); + rt = rt6_lookup(dest, NULL, neigh->dev->ifindex, 1); - if (rt == NULL || rt->u.dst.error) - return NULL; + if (rt == NULL) + return; + + if (neigh->dev != rt->rt6i_dev) + goto out; /* Redirect received -> path was valid. 
Look, redirects are sent only in response to data packets, @@ -1066,12 +889,18 @@ struct rt6_info *rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr, dst_confirm(&rt->u.dst); /* Duplicate redirect: silently ignore. */ - if (ipv6_addr_cmp(target, &rt->rt6i_gateway) == 0) - return NULL; + if (neigh == rt->u.dst.neighbour) + goto out; - /* Current route is on-link; redirect is always invalid. */ + /* Current route is on-link; redirect is always invalid. + + Seems, previous statement is not true. It could + be node, which looks for us as on-link (f.e. proxy ndisc) + But then router serving it might decide, that we should + know truth 8)8) --ANK (980726). + */ if (!(rt->rt6i_flags&RTF_GATEWAY)) - return NULL; + goto out; #if !defined(CONFIG_IPV6_EUI64) || defined(CONFIG_IPV6_NO_PB) /* @@ -1089,16 +918,21 @@ struct rt6_info *rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr, if (ipv6_addr_cmp(saddr, &rt->rt6i_gateway)) { if (rt->rt6i_flags & RTF_DEFAULT) { - rt = ip6_routing_table.leaf; + struct rt6_info *rt1; - for (; rt; rt = rt->u.next) { - if (!ipv6_addr_cmp(saddr, &rt->rt6i_gateway)) + for (rt1 = ip6_routing_table.leaf; rt1; rt1 = rt1->u.next) { + if (!ipv6_addr_cmp(saddr, &rt1->rt6i_gateway)) { + dst_clone(&rt1->u.dst); + dst_release(&rt->u.dst); + rt = rt1; goto source_ok; + } } } - printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop " + if (net_ratelimit()) + printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop " "for redirect target\n"); - return NULL; + goto out; } source_ok: @@ -1107,36 +941,11 @@ source_ok: /* * We have finally decided to accept it. */ - if (rt->rt6i_dst.plen == 128) { - /* BUGGGG! Very bad bug. Fast path code does not protect - * itself of changing nexthop on the fly, it was supposed - * that crucial parameters (dev, nexthop, hh) ARE VOLATILE. - * --ANK - * Not fixed!! I plugged it to avoid random crashes - * (they are very unlikely, but I do not want to shrug - * every time when redirect arrives) - * but the plug must be removed. --ANK - */ - -#if 0 - /* - * Already a host route. - * - */ - if (rt->rt6i_nexthop) - neigh_release(rt->rt6i_nexthop); - rt->rt6i_flags |= RTF_MODIFIED | RTF_CACHE; - if (on_link) - rt->rt6i_flags &= ~RTF_GATEWAY; - ipv6_addr_copy(&rt->rt6i_gateway, target); - rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, target); - return rt; -#else - return NULL; -#endif - } nrt = ip6_rt_copy(rt); + if (nrt == NULL) + goto out; + nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; if (on_link) nrt->rt6i_flags &= ~RTF_GATEWAY; @@ -1144,19 +953,24 @@ source_ok: ipv6_addr_copy(&nrt->rt6i_dst.addr, dest); nrt->rt6i_dst.plen = 128; - ipv6_addr_copy(&nrt->rt6i_gateway, target); - nrt->rt6i_nexthop = ndisc_get_neigh(nrt->rt6i_dev, target); - nrt->rt6i_dev = dev; - nrt->u.dst.pmtu = ipv6_get_mtu(dev); - if (!ipv6_addr_is_multicast(&nrt->rt6i_dst.addr)) - nrt->rt6i_hoplimit = ipv6_get_hoplimit(dev); + ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key); + nrt->rt6i_nexthop = neigh_clone(neigh); + /* Reset pmtu, it may be better */ + nrt->u.dst.pmtu = ipv6_get_mtu(neigh->dev); + nrt->rt6i_hoplimit = ipv6_get_hoplimit(neigh->dev); + + if (rt6_ins(nrt)) + goto out; - rt6_lock(); - rt6_ins(nrt); - rt6_unlock(); + /* Sic! rt6_redirect is called by bh, so that it is allowed */ + dst_release(&rt->u.dst); + if (rt->rt6i_flags&RTF_CACHE) + ip6_del_rt(rt); + return; - /* BUGGGGGGG! nrt can point to nowhere. 
*/ - return nrt; +out: + dst_release(&rt->u.dst); + return; } /* @@ -1164,29 +978,25 @@ source_ok: * i.e. Path MTU discovery */ -void rt6_pmtu_discovery(struct in6_addr *addr, struct device *dev, int pmtu) +void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, + struct device *dev, u32 pmtu) { struct rt6_info *rt, *nrt; - if (pmtu < 576 || pmtu > 65536) { -#if RT6_DEBUG >= 1 - printk(KERN_DEBUG "rt6_pmtu_discovery: invalid MTU value %d\n", - pmtu); -#endif + if (pmtu < IPV6_MIN_MTU) { + if (net_ratelimit()) + printk(KERN_DEBUG "rt6_pmtu_discovery: invalid MTU value %d\n", + pmtu); return; } - rt = rt6_lookup(addr, NULL, dev->ifindex, 0); + rt = rt6_lookup(daddr, saddr, dev->ifindex, 0); - if (rt == NULL || rt->u.dst.error) { -#if RT6_DEBUG >= 2 - printk(KERN_DEBUG "rt6_pmtu_discovery: no route to host\n"); -#endif + if (rt == NULL) return; - } if (pmtu >= rt->u.dst.pmtu) - return; + goto out; /* New mtu received -> path was valid. They are sent only in response to data packets, @@ -1194,39 +1004,42 @@ void rt6_pmtu_discovery(struct in6_addr *addr, struct device *dev, int pmtu) */ dst_confirm(&rt->u.dst); - /* It is wrong, but I plugged the hole here. - On-link routes are cloned differently, - look at rt6_redirect --ANK + /* Host route. If it is static, it would be better + not to override it, but add new one, so that + when cache entry will expire old pmtu + would return automatically. */ - if (!(rt->rt6i_flags&RTF_GATEWAY)) - return; - if (rt->rt6i_dst.plen == 128) { /* * host route */ rt->u.dst.pmtu = pmtu; rt->rt6i_flags |= RTF_MODIFIED; - - return; + goto out; } - nrt = ip6_rt_copy(rt); - ipv6_addr_copy(&nrt->rt6i_dst.addr, addr); - nrt->rt6i_dst.plen = 128; - - nrt->rt6i_flags |= (RTF_DYNAMIC | RTF_CACHE); - - /* It was missing. :-) :-) - I wonder, kernel was deemed to crash after pkt_too_big - and nobody noticed it. Hey, guys, do someone really - use it? --ANK + /* Network route. + Two cases are possible: + 1. It is connected route. Action: COW + 2. It is gatewayed route or NONEXTHOP route. Action: clone it. 
*/ - nrt->rt6i_nexthop = neigh_clone(rt->rt6i_nexthop); + if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) { + nrt = rt6_cow(rt, daddr, saddr); + nrt->rt6i_flags |= RTF_DYNAMIC; + dst_release(&nrt->u.dst); + } else { + nrt = ip6_rt_copy(rt); + if (nrt == NULL) + goto out; + ipv6_addr_copy(&nrt->rt6i_dst.addr, daddr); + nrt->rt6i_dst.plen = 128; + nrt->rt6i_nexthop = neigh_clone(rt->rt6i_nexthop); + nrt->rt6i_flags |= (RTF_DYNAMIC | RTF_CACHE); + rt6_ins(nrt); + } - rt6_lock(); - rt6_ins(rt); - rt6_unlock(); +out: + dst_release(&rt->u.dst); } /* @@ -1247,16 +1060,19 @@ static struct rt6_info * ip6_rt_copy(struct rt6_info *ort) rt->u.dst.rtt = ort->u.dst.rtt; rt->u.dst.window = ort->u.dst.window; rt->u.dst.mxlock = ort->u.dst.mxlock; + rt->u.dst.dev = ort->u.dst.dev; + rt->u.dst.lastuse = jiffies; rt->rt6i_hoplimit = ort->rt6i_hoplimit; - rt->rt6i_dev = ort->rt6i_dev; + rt->rt6i_expires = ort->rt6i_expires; ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway); - rt->rt6i_keylen = ort->rt6i_keylen; rt->rt6i_flags = ort->rt6i_flags; rt->rt6i_metric = ort->rt6i_metric; memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); +#ifdef CONFIG_IPV6_SUBTREES memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); +#endif } return rt; } @@ -1266,31 +1082,17 @@ struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct device *dev) struct rt6_info *rt; struct fib6_node *fn; - RDBG(("rt6_get_dflt_router(%p,%p)[%p]", addr, dev, - __builtin_return_address(0))); -#if RT6_DEBUG >= 3 - { - int i; - - RDBG(("addr[")); - for(i = 0; i < 8; i++) { - RDBG(("%04x%c", addr->s6_addr16[i], - i == 7 ? ']' : ':')); - } - } -#endif - RDBG(("\n")); - rt6_lock(); - fn = &ip6_routing_table; + start_bh_atomic(); for (rt = fn->leaf; rt; rt=rt->u.next) { if (dev == rt->rt6i_dev && ipv6_addr_cmp(&rt->rt6i_gateway, addr) == 0) break; } - - rt6_unlock(); + if (rt) + dst_clone(&rt->u.dst); + end_bh_atomic(); return rt; } @@ -1298,24 +1100,6 @@ struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr, struct device *dev) { struct in6_rtmsg rtmsg; - struct rt6_info *rt; - int err; - - RDBG(("rt6_add_dflt_router(%p,%p)[%p] ", gwaddr, dev, - __builtin_return_address(0))); -#if RT6_DEBUG >= 3 - { - struct in6_addr *addr = gwaddr; - int i; - - RDBG(("gwaddr[")); - for(i = 0; i < 8; i++) { - RDBG(("%04x%c", addr->s6_addr16[i], - i == 7 ? ']' : ':')); - } - } -#endif - RDBG(("\n")); memset(&rtmsg, 0, sizeof(struct in6_rtmsg)); rtmsg.rtmsg_type = RTMSG_NEWROUTE; @@ -1325,48 +1109,28 @@ struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr, rtmsg.rtmsg_ifindex = dev->ifindex; - rt = ip6_route_add(&rtmsg, &err); - - /* BUGGGGGGGGGGGGGGGGGGGG! - rt can be not NULL, but point to heavens. 
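The rt6_pmtu_discovery() rewrite above only honours a Packet Too Big report when the advertised value is at least IPV6_MIN_MTU and actually smaller than the cached path MTU; everything else is ignored, with a rate-limited log for the bogus case. A compilable sketch of that acceptance test, with IPV6_MIN_MTU defined locally to the RFC 2460 value of 1280, which is an assumption; the 2.1-era header may carry a different constant:

#include <stdint.h>

#define IPV6_MIN_MTU 1280   /* assumption: RFC 2460 minimum link MTU */

/* Return the new path MTU to record, or 0 when the report must be ignored. */
static uint32_t accept_pkt_too_big(uint32_t current_pmtu, uint32_t advertised)
{
        if (advertised < IPV6_MIN_MTU)
                return 0;                  /* bogus (or hostile) ICMPv6 report */
        if (advertised >= current_pmtu)
                return 0;                  /* not a reduction, nothing to do */
        return advertised;
}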
- */ - - if (err) { - printk(KERN_DEBUG "rt6_add_dflt: ip6_route_add error %d\n", - err); - } - return rt; + ip6_route_add(&rtmsg); + return rt6_get_dflt_router(gwaddr, dev); } void rt6_purge_dflt_routers(int last_resort) { struct rt6_info *rt; - struct fib6_node *fn; u32 flags; - RDBG(("rt6_purge_dflt_routers(%d)[%p]\n", last_resort, - __builtin_return_address(0))); - fn = &ip6_routing_table; - - rt6_dflt_pointer = NULL; - if (last_resort) flags = RTF_ALLONLINK; else flags = RTF_DEFAULT | RTF_ADDRCONF; - for (rt = fn->leaf; rt; ) { - if ((rt->rt6i_flags & flags)) { - struct rt6_info *drt; -#if RT6_DEBUG >= 2 - printk(KERN_DEBUG "rt6_purge_dflt: deleting entry\n"); -#endif - drt = rt; - rt = rt->u.next; - ip6_del_rt(drt); - continue; +restart: + rt6_dflt_pointer = NULL; + + for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) { + if (rt->rt6i_flags & flags) { + ip6_del_rt(rt); + goto restart; } - rt = rt->u.next; } } @@ -1389,7 +1153,7 @@ int ipv6_route_ioctl(unsigned int cmd, void *arg) rtnl_lock(); switch (cmd) { case SIOCADDRT: - ip6_route_add(&rtmsg, &err); + err = ip6_route_add(&rtmsg); break; case SIOCDELRT: err = ip6_route_del(&rtmsg); @@ -1414,7 +1178,7 @@ int ipv6_route_ioctl(unsigned int cmd, void *arg) */ int ip6_pkt_discard(struct sk_buff *skb) -{ +{ ipv6_statistics.Ip6OutNoRoutes++; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev); kfree_skb(skb); @@ -1429,21 +1193,6 @@ int ip6_rt_addr_add(struct in6_addr *addr, struct device *dev) { struct rt6_info *rt; - RDBG(("ip6_rt_addr_add(%p,%p)[%p]\n", addr, dev, - __builtin_return_address(0))); -#if RT6_DEBUG >= 3 - { - int i; - - RDBG(("addr[")); - for(i = 0; i < 8; i++) { - RDBG(("%04x%c", addr->s6_addr16[i], - i == 7 ? ']' : ':')); - } - } -#endif - RDBG(("\n")); - rt = dst_alloc(sizeof(struct rt6_info), &ip6_dst_ops); if (rt == NULL) return -ENOMEM; @@ -1465,10 +1214,7 @@ int ip6_rt_addr_add(struct in6_addr *addr, struct device *dev) ipv6_addr_copy(&rt->rt6i_dst.addr, addr); rt->rt6i_dst.plen = 128; - - rt6_lock(); rt6_ins(rt); - rt6_unlock(); return 0; } @@ -1480,12 +1226,16 @@ int ip6_rt_addr_add(struct in6_addr *addr, struct device *dev) int ip6_rt_addr_del(struct in6_addr *addr, struct device *dev) { struct rt6_info *rt; + int err = -ENOENT; - rt = rt6_lookup(addr, NULL, loopback_dev.ifindex, RTF_LINKRT); - if (rt && rt->rt6i_dst.plen == 128) - return ip6_del_rt(rt); + rt = rt6_lookup(addr, NULL, loopback_dev.ifindex, 1); + if (rt) { + if (rt->rt6i_dst.plen == 128) + err= ip6_del_rt(rt); + dst_release(&rt->u.dst); + } - return 0; + return err; } #ifdef CONFIG_RT6_POLICY @@ -1587,75 +1337,65 @@ static struct rt6_info *rt6_flow_lookup(struct rt6_info *rt, } error: + dst_clone(&ip6_null_entry.u.dst); return &ip6_null_entry; found: - if (nrt == NULL) goto error; nrt->rt6i_flags |= RTF_CACHE; - /* BUGGGG! nrt can point to nowhere! */ - rt6_ins(nrt); - + dst_clone(&nrt->u.dst); + err = rt6_ins(nrt); + if (err) + nrt->u.dst.error = err; return nrt; } #endif -/* - * Nope, I am not idiot. I see that it is the ugliest of ugly routines. - * Anyone is advertised to write better one. 
--ANK - */ +static int fib6_ifdown(struct rt6_info *rt, void *arg) +{ + if (((void*)rt->rt6i_dev == arg || arg == NULL) && + rt != &ip6_null_entry) { + RT6_TRACE("deleted by ifdown %p\n", rt); + return -1; + } + return 0; +} -struct rt6_ifdown_arg { +void rt6_ifdown(struct device *dev) +{ + fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev); +} + +struct rt6_mtu_change_arg +{ struct device *dev; - struct rt6_info *rt; + unsigned mtu; }; - -static void rt6_ifdown_node(struct fib6_node *fn, void *p_arg) +static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) { - struct rt6_info *rt; - struct rt6_ifdown_arg *arg = (struct rt6_ifdown_arg *) p_arg; - - if (arg->rt != NULL) - return; - - for (rt = fn->leaf; rt; rt = rt->u.next) { - if (rt->rt6i_dev == arg->dev || arg->dev == NULL) { - arg->rt = rt; - return; - } - } + struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; + + /* In IPv6 pmtu discovery is not optional, + so that RTAX_MTU lock cannot dissable it. + We still use this lock to block changes + caused by addrconf/ndisc. + */ + if (rt->rt6i_dev == arg->dev && + !(rt->u.dst.mxlock&(1<<RTAX_MTU))) + rt->u.dst.pmtu = arg->mtu; + return 0; } -void rt6_ifdown(struct device *dev) +void rt6_mtu_change(struct device *dev, unsigned mtu) { - int count = 0; - struct rt6_ifdown_arg arg; - struct rt6_info *rt; + struct rt6_mtu_change_arg arg; - do { - arg.dev = dev; - arg.rt = NULL; - fib6_walk_tree(&ip6_routing_table, rt6_ifdown_node, &arg, - RT6_FILTER_RTNODES); - if (arg.rt != NULL) - ip6_del_rt(arg.rt); - count++; - } while (arg.rt != NULL); - - /* And default routes ... */ - - for (rt = ip6_routing_table.leaf; rt; ) { - if (rt != &ip6_null_entry && (rt->rt6i_dev == dev || dev == NULL)) { - struct rt6_info *deleting = rt; - rt = rt->u.next; - ip6_del_rt(deleting); - continue; - } - rt = rt->u.next; - } + arg.dev = dev; + arg.mtu = mtu; + fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg); } #ifdef CONFIG_RTNETLINK @@ -1714,37 +1454,28 @@ int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { struct rtmsg *r = NLMSG_DATA(nlh); struct in6_rtmsg rtmsg; - int err = 0; if (inet6_rtm_to_rtmsg(r, arg, &rtmsg)) return -EINVAL; - ip6_route_add(&rtmsg, &err); - return err; + return ip6_route_add(&rtmsg); } struct rt6_rtnl_dump_arg { struct sk_buff *skb; struct netlink_callback *cb; - int skip; - int count; - int stop; }; static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, struct in6_addr *dst, struct in6_addr *src, int iif, - int type, pid_t pid, u32 seq) + int type, u32 pid, u32 seq) { struct rtmsg *rtm; struct nlmsghdr *nlh; unsigned char *b = skb->tail; -#ifdef CONFIG_RTNL_OLD_IFINFO - unsigned char *o; -#else struct rtattr *mx; -#endif struct rta_cacheinfo ci; nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*rtm)); @@ -1762,9 +1493,6 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; rtm->rtm_scope = RT_SCOPE_UNIVERSE; -#ifdef CONFIG_RTNL_OLD_IFINFO - rtm->rtm_nhs = 0; -#endif rtm->rtm_protocol = RTPROT_BOOT; if (rt->rt6i_flags&RTF_DYNAMIC) rtm->rtm_protocol = RTPROT_REDIRECT; @@ -1776,19 +1504,18 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, if (rt->rt6i_flags&RTF_CACHE) rtm->rtm_flags |= RTM_F_CLONED; -#ifdef CONFIG_RTNL_OLD_IFINFO - o = skb->tail; -#endif if (dst) { RTA_PUT(skb, RTA_DST, 16, dst); rtm->rtm_dst_len = 128; } else if (rtm->rtm_dst_len) RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr); +#ifdef CONFIG_IPV6_SUBTREES if 
(src) { RTA_PUT(skb, RTA_SRC, 16, src); rtm->rtm_src_len = 128; } else if (rtm->rtm_src_len) RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr); +#endif if (iif) RTA_PUT(skb, RTA_IIF, 4, &iif); else if (dst) { @@ -1796,14 +1523,6 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, if (ifp) RTA_PUT(skb, RTA_PREFSRC, 16, &ifp->addr); } -#ifdef CONFIG_RTNL_OLD_IFINFO - if (rt->u.dst.pmtu) - RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &rt->u.dst.pmtu); - if (rt->u.dst.window) - RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &rt->u.dst.window); - if (rt->u.dst.rtt) - RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &rt->u.dst.rtt); -#else mx = (struct rtattr*)skb->tail; RTA_PUT(skb, RTA_METRICS, 0, NULL); if (rt->u.dst.mxlock) @@ -1817,7 +1536,6 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, mx->rta_len = skb->tail - (u8*)mx; if (mx->rta_len == RTA_LENGTH(0)) skb_trim(skb, (u8*)mx - skb->data); -#endif if (rt->u.dst.neighbour) RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key); if (rt->u.dst.dev) @@ -1828,13 +1546,10 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, ci.rta_expires = rt->rt6i_expires - jiffies; else ci.rta_expires = 0; - ci.rta_used = 0; + ci.rta_used = atomic_read(&rt->u.dst.refcnt); ci.rta_clntref = atomic_read(&rt->u.dst.use); ci.rta_error = rt->u.dst.error; RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci); -#ifdef CONFIG_RTNL_OLD_IFINFO - rtm->rtm_optlen = skb->tail - o; -#endif nlh->nlmsg_len = skb->tail - b; return skb->len; @@ -1844,45 +1559,98 @@ rtattr_failure: return -1; } -static void rt6_dump_node(struct fib6_node *fn, void *p_arg) +static int rt6_dump_route(struct rt6_info *rt, void *p_arg) { - struct rt6_info *rt; struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; - if (arg->stop) - return; + return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, + NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq); +} - for (rt = fn->leaf; rt; rt = rt->u.next) { - if (arg->count < arg->skip) { - arg->count++; - continue; - } - if (rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, - NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq) <= 0) { - arg->stop = 1; - break; +static int fib6_dump_node(struct fib6_walker_t *w) +{ + int res; + struct rt6_info *rt; + + for (rt = w->leaf; rt; rt = rt->u.next) { + res = rt6_dump_route(rt, w->args); + if (res < 0) { + /* Frame is full, suspend walking */ + w->leaf = rt; + return 1; } - arg->count++; + BUG_TRAP(res!=0); } + w->leaf = NULL; + return 0; } +static int fib6_dump_done(struct netlink_callback *cb) +{ + struct fib6_walker_t *w = (void*)cb->args[0]; + + if (w) { + cb->args[0] = 0; + start_bh_atomic(); + fib6_walker_unlink(w); + end_bh_atomic(); + kfree(w); + } + if (cb->args[1]) { + cb->done = (void*)cb->args[1]; + cb->args[1] = 0; + } + return cb->done(cb); +} int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { struct rt6_rtnl_dump_arg arg; + struct fib6_walker_t *w; + int res; arg.skb = skb; arg.cb = cb; - arg.skip = cb->args[0]; - arg.count = 0; - arg.stop = 0; - start_bh_atomic(); - fib6_walk_tree(&ip6_routing_table, rt6_dump_node, &arg, RT6_FILTER_RTNODES); - if (arg.stop == 0) - rt6_dump_node(&ip6_routing_table, &arg); - end_bh_atomic(); - cb->args[0] = arg.count; - return skb->len; + + w = (void*)cb->args[0]; + if (w == NULL) { + /* New dump: + * + * 1. hook callback destructor. + */ + cb->args[1] = (long)cb->done; + cb->done = fib6_dump_done; + + /* + * 2. allocate and initialize walker. 
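The inet6_dump_fib()/fib6_dump_node() code in this hunk parks a heap-allocated fib6_walker_t in the netlink callback's args so a dump that fills one skb can be suspended (return 1, remember the current leaf) and resumed on the next read. A self-contained sketch of the same suspend/resume iteration pattern over a plain linked list; the item, walker and buffer types are invented for illustration:

#include <stddef.h>

struct item {
        struct item *next;
        int value;
};

struct walker {
        struct item *pos;                  /* where to resume; NULL when finished */
};

/* Copy items into a bounded buffer.  Returns 1 if the buffer filled up and
 * the walk must be resumed later, 0 once the list is exhausted. */
static int walk_continue(struct walker *w, int *buf, size_t room, size_t *used)
{
        *used = 0;
        while (w->pos != NULL) {
                if (*used == room)
                        return 1;          /* frame full: suspend here */
                buf[(*used)++] = w->pos->value;
                w->pos = w->pos->next;
        }
        return 0;                          /* dump complete */
}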
+ */ + w = kmalloc(sizeof(*w), GFP_KERNEL); + if (w == NULL) + return -ENOMEM; + RT6_TRACE("dump<%p", w); + memset(w, 0, sizeof(*w)); + w->root = &ip6_routing_table; + w->func = fib6_dump_node; + w->args = &arg; + cb->args[0] = (long)w; + start_bh_atomic(); + res = fib6_walk(w); + end_bh_atomic(); + } else { + w->args = &arg; + start_bh_atomic(); + res = fib6_walk_continue(w); + end_bh_atomic(); + } +#if RT6_DEBUG >= 3 + if (res <= 0 && skb->len == 0) + RT6_TRACE("%p>dump end\n", w); +#endif + /* res < 0 is an error. (really, impossible) + res == 0 means that dump is complete, but skb still can contain data. + res > 0 dump is not complete, but frame is full. + */ + return res < 0 ? res : skb->len; } int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) @@ -1974,10 +1742,10 @@ void inet6_rt_notify(int event, struct rt6_info *rt) #ifdef CONFIG_PROC_FS - #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1) -struct rt6_proc_arg { +struct rt6_proc_arg +{ char *buffer; int offset; int length; @@ -1985,109 +1753,18 @@ struct rt6_proc_arg { int len; }; -static void rt6_info_node(struct fib6_node *fn, void *p_arg) +static int rt6_info_route(struct rt6_info *rt, void *p_arg) { - struct rt6_info *rt; struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg; - - for (rt = fn->leaf; rt; rt = rt->u.next) { - int i; - - if (arg->skip < arg->offset / RT6_INFO_LEN) { - arg->skip++; - continue; - } - - if (arg->len >= arg->length) - return; - - for (i=0; i<16; i++) { - sprintf(arg->buffer + arg->len, "%02x", - rt->rt6i_dst.addr.s6_addr[i]); - arg->len += 2; - } - arg->len += sprintf(arg->buffer + arg->len, " %02x ", - rt->rt6i_dst.plen); - - for (i=0; i<16; i++) { - sprintf(arg->buffer + arg->len, "%02x", - rt->rt6i_src.addr.s6_addr[i]); - arg->len += 2; - } - arg->len += sprintf(arg->buffer + arg->len, " %02x ", - rt->rt6i_src.plen); - - if (rt->rt6i_nexthop) { - for (i=0; i<16; i++) { - sprintf(arg->buffer + arg->len, "%02x", - rt->rt6i_nexthop->primary_key[i]); - arg->len += 2; - } - } else { - sprintf(arg->buffer + arg->len, - "00000000000000000000000000000000"); - arg->len += 32; - } - arg->len += sprintf(arg->buffer + arg->len, - " %08x %08x %08x %08x %8s\n", - rt->rt6i_metric, atomic_read(&rt->rt6i_use), - atomic_read(&rt->rt6i_ref), rt->rt6i_flags, - rt->rt6i_dev ? rt->rt6i_dev->name : ""); - } -} - -static int rt6_proc_info(char *buffer, char **start, off_t offset, int length, - int dummy) -{ - struct rt6_proc_arg arg; - arg.buffer = buffer; - arg.offset = offset; - arg.length = length; - arg.skip = 0; - arg.len = 0; - - fib6_walk_tree(&ip6_routing_table, rt6_info_node, &arg, - RT6_FILTER_RTNODES); - - rt6_info_node(&ip6_routing_table, &arg); - - *start = buffer; - if (offset) - *start += offset % RT6_INFO_LEN; - - arg.len -= offset % RT6_INFO_LEN; - - if(arg.len > length) - arg.len = length; - if(arg.len < 0) - arg.len = 0; - - return arg.len; -} - -#define PTR_SZ (sizeof(void *) * 2) -#define FI_LINE_SZ (2 * (PTR_SZ) + 7 + 32 + 4 + 32 + 4) - -static void rt6_tree_node(struct fib6_node *fn, void *p_arg) -{ - struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg; - struct rt6_info *rt; - char f; int i; - rt = fn->leaf; - - if (arg->skip < arg->offset / FI_LINE_SZ) { + if (arg->skip < arg->offset / RT6_INFO_LEN) { arg->skip++; - return; + return 0; } - if (arg->len + FI_LINE_SZ >= arg->length) - return; - - f = (fn->fn_flags & RTN_RTINFO) ? 
'r' : 'n'; - arg->len += sprintf(arg->buffer + arg->len, "%p %p %02x %c ", - fn, fn->parent, fn->fn_bit, f); + if (arg->len >= arg->length) + return 0; for (i=0; i<16; i++) { sprintf(arg->buffer + arg->len, "%02x", @@ -2096,18 +1773,41 @@ static void rt6_tree_node(struct fib6_node *fn, void *p_arg) } arg->len += sprintf(arg->buffer + arg->len, " %02x ", rt->rt6i_dst.plen); - + +#ifdef CONFIG_IPV6_SUBTREES for (i=0; i<16; i++) { sprintf(arg->buffer + arg->len, "%02x", rt->rt6i_src.addr.s6_addr[i]); arg->len += 2; } - arg->len += sprintf(arg->buffer + arg->len, " %02x\n", + arg->len += sprintf(arg->buffer + arg->len, " %02x ", rt->rt6i_src.plen); +#else + sprintf(arg->buffer + arg->len, + "00000000000000000000000000000000 00 "); + arg->len += 36; +#endif + if (rt->rt6i_nexthop) { + for (i=0; i<16; i++) { + sprintf(arg->buffer + arg->len, "%02x", + rt->rt6i_nexthop->primary_key[i]); + arg->len += 2; + } + } else { + sprintf(arg->buffer + arg->len, + "00000000000000000000000000000000"); + arg->len += 32; + } + arg->len += sprintf(arg->buffer + arg->len, + " %08x %08x %08x %08x %8s\n", + rt->rt6i_metric, atomic_read(&rt->u.dst.use), + atomic_read(&rt->u.dst.refcnt), rt->rt6i_flags, + rt->rt6i_dev ? rt->rt6i_dev->name : ""); + return 0; } -static int rt6_proc_tree(char *buffer, char **start, off_t offset, int length, +static int rt6_proc_info(char *buffer, char **start, off_t offset, int length, int dummy) { struct rt6_proc_arg arg; @@ -2117,7 +1817,7 @@ static int rt6_proc_tree(char *buffer, char **start, off_t offset, int length, arg.skip = 0; arg.len = 0; - fib6_walk_tree(&ip6_routing_table, rt6_tree_node, &arg, 0); + fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg); *start = buffer; if (offset) @@ -2125,15 +1825,14 @@ static int rt6_proc_tree(char *buffer, char **start, off_t offset, int length, arg.len -= offset % RT6_INFO_LEN; - if(arg.len > length) + if (arg.len > length) arg.len = length; - if(arg.len < 0) + if (arg.len < 0) arg.len = 0; return arg.len; } - extern struct rt6_statistics rt6_stats; static int rt6_proc_stats(char *buffer, char **start, off_t offset, int length, @@ -2141,10 +1840,11 @@ static int rt6_proc_stats(char *buffer, char **start, off_t offset, int length, { int len; - len = sprintf(buffer, "%04x %04x %04x %04x %04x\n", + len = sprintf(buffer, "%04x %04x %04x %04x %04x %04x\n", rt6_stats.fib_nodes, rt6_stats.fib_route_nodes, rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries, - rt6_stats.fib_rt_cache); + rt6_stats.fib_rt_cache, + atomic_read(&ip6_dst_ops.entries)); len -= offset; @@ -2164,12 +1864,6 @@ static struct proc_dir_entry proc_rt6_info = { 0, &proc_net_inode_operations, rt6_proc_info }; -static struct proc_dir_entry proc_rt6_tree = { - PROC_NET_RT6_TREE, 7, "ip6_fib", - S_IFREG | S_IRUGO, 1, 0, 0, - 0, &proc_net_inode_operations, - rt6_proc_tree -}; static struct proc_dir_entry proc_rt6_stats = { PROC_NET_RT6_STATS, 9, "rt6_stats", S_IFREG | S_IRUGO, 1, 0, 0, @@ -2230,7 +1924,6 @@ __initfunc(void ip6_route_init(void)) { #ifdef CONFIG_PROC_FS proc_net_register(&proc_rt6_info); - proc_net_register(&proc_rt6_tree); proc_net_register(&proc_rt6_stats); #endif #ifdef CONFIG_IPV6_NETLINK @@ -2243,7 +1936,6 @@ void ip6_route_cleanup(void) { #ifdef CONFIG_PROC_FS proc_net_unregister(PROC_NET_RT6); - proc_net_unregister(PROC_NET_RT6_TREE); proc_net_unregister(PROC_NET_RT6_STATS); #endif #ifdef CONFIG_IPV6_NETLINK diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 577b85d0f..0d6efd515 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -6,7 +6,7 @@ * Pedro 
Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * - * $Id: sit.c,v 1.27 1998/03/08 05:56:57 davem Exp $ + * $Id: sit.c,v 1.28 1998/08/26 12:05:22 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -434,7 +434,7 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct device *dev) ip_rt_put(rt); goto tx_error; } - if (mtu >= 576) { + if (mtu >= IPV6_MIN_MTU) { if (skb->dst && mtu < skb->dst->pmtu) { struct rt6_info *rt6 = (struct rt6_info*)skb->dst; if (mtu < rt6->u.dst.pmtu) { @@ -475,6 +475,8 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct device *dev) tunnel->recursion--; return 0; } + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); dev_kfree_skb(skb); skb = new_skb; } @@ -491,7 +493,7 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct device *dev) iph = skb->nh.iph; iph->version = 4; iph->ihl = sizeof(struct iphdr)>>2; - if (mtu > 576) + if (mtu > IPV6_MIN_MTU) iph->frag_off = __constant_htons(IP_DF); else iph->frag_off = 0; @@ -608,7 +610,7 @@ static struct net_device_stats *ipip6_tunnel_get_stats(struct device *dev) static int ipip6_tunnel_change_mtu(struct device *dev, int new_mtu) { - if (new_mtu < 576 || new_mtu > 0xFFF8 - sizeof(struct iphdr)) + if (new_mtu < IPV6_MIN_MTU || new_mtu > 0xFFF8 - sizeof(struct iphdr)) return -EINVAL; dev->mtu = new_mtu; return 0; @@ -662,8 +664,8 @@ static int ipip6_tunnel_init(struct device *dev) if (tdev) { dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); dev->mtu = tdev->mtu - sizeof(struct iphdr); - if (dev->mtu < 576) - dev->mtu = 576; + if (dev->mtu < IPV6_MIN_MTU) + dev->mtu = IPV6_MIN_MTU; } dev->iflink = tunnel->parms.link; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5fa45dce5..c997999db 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: tcp_ipv6.c,v 1.82 1998/06/11 03:15:52 davem Exp $ + * $Id: tcp_ipv6.c,v 1.89 1998/08/28 00:27:54 davem Exp $ * * Based on: * linux/net/ipv4/tcp.c @@ -123,16 +123,33 @@ static int tcp_v6_verify_bind(struct sock *sk, unsigned short snum) } if(result == 0) { if(tb == NULL) { - if(tcp_bucket_create(snum) == NULL) + if((tb = tcp_bucket_create(snum)) == NULL) result = 1; + else if (sk->reuse && sk->state != TCP_LISTEN) + tb->flags |= TCPB_FLAG_FASTREUSE; } else { /* It could be pending garbage collection, this * kills the race and prevents it from disappearing * out from under us by the time we use it. -DaveM */ - if(tb->owners == NULL && !(tb->flags & TCPB_FLAG_LOCKED)) { - tb->flags = TCPB_FLAG_LOCKED; - tcp_dec_slow_timer(TCP_SLT_BUCKETGC); + if(tb->owners == NULL) { + if (!(tb->flags & TCPB_FLAG_LOCKED)) { + tb->flags = (TCPB_FLAG_LOCKED | + ((sk->reuse && + sk->state != TCP_LISTEN) ? + TCPB_FLAG_FASTREUSE : 0)); + tcp_dec_slow_timer(TCP_SLT_BUCKETGC); + } else if (!(tb->flags & TCPB_FLAG_GOODSOCKNUM)) { + /* Someone is in between the bind + * and the actual connect or listen. + * See if it was a legitimate reuse + * and we are as well, else punt. 
+ */ + if (sk->reuse == 0 || + !(tb->flags & TCPB_FLAG_FASTREUSE)) + result = 1; + } else + tb->flags &= ~TCPB_FLAG_GOODSOCKNUM; } } } @@ -358,7 +375,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct dst_entry *dst; struct sk_buff *buff; int addr_type; - int mss; if (sk->state != TCP_CLOSE) return(-EISCONN); @@ -403,6 +419,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, */ if (addr_type == IPV6_ADDR_MAPPED) { + u32 exthdrlen = tp->ext_header_len; struct sockaddr_in sin; int err; @@ -418,10 +435,10 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); if (err) { + tp->ext_header_len = exthdrlen; sk->tp_pinfo.af_tcp.af_specific = &ipv6_specific; sk->backlog_rcv = tcp_v6_do_rcv; } else { - /* Yuup... And it is not the only place... --ANK */ ipv6_addr_set(&np->saddr, 0, 0, __constant_htonl(0x0000FFFF), sk->saddr); ipv6_addr_set(&np->rcv_saddr, 0, 0, __constant_htonl(0x0000FFFF), @@ -441,18 +458,18 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl.uli_u.ports.dport = usin->sin6_port; fl.uli_u.ports.sport = sk->sport; + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; + fl.nl_u.ip6_u.daddr = rt0->addr; + } + dst = ip6_route_output(sk, &fl); - + if (dst->error) { dst_release(dst); return dst->error; } - if (dst->pmtu < 576) { - dst_release(dst); - return -EINVAL; - } - if (fl.oif == 0 && addr_type&IPV6_ADDR_LINKLOCAL) { /* Ough! This guy tries to connect to link local * address and did not specify interface. @@ -462,11 +479,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->bound_dev_if = dst->dev->ifindex; } - ip6_dst_store(sk, dst); + ip6_dst_store(sk, dst, NULL); if (saddr == NULL) { ifa = ipv6_get_saddr(dst, &np->daddr); - + if (ifa == NULL) return -ENETUNREACH; @@ -477,6 +494,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, ipv6_addr_copy(&np->saddr, saddr); } + tp->ext_header_len = 0; + if (np->opt) + tp->ext_header_len = np->opt->opt_flen+np->opt->opt_nflen; + /* Reset mss clamp */ + tp->mss_clamp = ~0; + buff = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header), 0, GFP_KERNEL); @@ -498,15 +521,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, np->daddr.s6_addr32[3], sk->sport, sk->dport); - sk->mtu = dst->pmtu; - mss = sk->mtu - sizeof(struct ipv6hdr); -#if 0 - if (np->opt) { - /* Adjust mss */ - } -#endif - - tcp_connect(sk, buff, mss); + tcp_connect(sk, buff, dst->pmtu); return 0; } @@ -555,10 +570,12 @@ out: return retval; } -void tcp_v6_err(struct sk_buff *skb, int type, int code, unsigned char *header, __u32 info, - struct in6_addr *saddr, struct in6_addr *daddr, - struct inet6_protocol *protocol) +void tcp_v6_err(struct sk_buff *skb, struct ipv6hdr *hdr, + struct inet6_skb_parm *opt, + int type, int code, unsigned char *header, __u32 info) { + struct in6_addr *saddr = &hdr->saddr; + struct in6_addr *daddr = &hdr->daddr; struct tcphdr *th = (struct tcphdr *)header; struct ipv6_pinfo *np; struct sock *sk; @@ -567,7 +584,8 @@ void tcp_v6_err(struct sk_buff *skb, int type, int code, unsigned char *header, struct tcp_opt *tp; __u32 seq; - /* XXX: length check for tcphdr missing here */ + if (header + 8 > skb->tail) + return; sk = tcp_v6_lookup(daddr, th->dest, saddr, th->source, skb->dev->ifindex); @@ -588,15 +606,20 @@ void tcp_v6_err(struct sk_buff *skb, int type, int code, unsigned char *header, np = 
&sk->net_pinfo.af_inet6; if (type == ICMPV6_PKT_TOOBIG && sk->state != TCP_LISTEN) { + struct dst_entry *dst = NULL; /* icmp should have updated the destination cache entry */ if (sk->dst_cache) - dst_check(&sk->dst_cache, np->dst_cookie); + dst = dst_check(&sk->dst_cache, np->dst_cookie); - if (sk->dst_cache == NULL) { + if (dst == NULL) { struct flowi fl; struct dst_entry *dst; - + + /* BUGGG_FUTURE: Again, it is not clear how + to handle rthdr case. Ignore this complexity + for now. + */ fl.proto = IPPROTO_TCP; fl.nl_u.ip6_u.daddr = &np->daddr; fl.nl_u.ip6_u.saddr = &np->saddr; @@ -605,23 +628,19 @@ void tcp_v6_err(struct sk_buff *skb, int type, int code, unsigned char *header, fl.uli_u.ports.sport = sk->sport; dst = ip6_route_output(sk, &fl); + } else + dst = dst_clone(dst); - ip6_dst_store(sk, dst); - } - - if (sk->dst_cache->error) { - sk->err_soft = sk->dst_cache->error; - } else { - /* FIXME: Reset sk->mss, taking into account TCP option - * bytes for timestamps. -DaveM - */ - sk->mtu = sk->dst_cache->pmtu; - } - if (sk->sock_readers) { /* remove later */ - printk(KERN_DEBUG "tcp_v6_err: pmtu disc: socket locked.\n"); - return; - } - tcp_simple_retransmit(sk); + if (dst->error) { + sk->err_soft = dst->error; + } else if (tp->pmtu_cookie > dst->pmtu + && !atomic_read(&sk->sock_readers)) { + lock_sock(sk); + tcp_sync_mss(sk, dst->pmtu); + tcp_simple_retransmit(sk); + release_sock(sk); + } /* else let the usual retransmit timer handle it */ + dst_release(dst); return; } @@ -631,7 +650,7 @@ void tcp_v6_err(struct sk_buff *skb, int type, int code, unsigned char *header, struct open_request *req, *prev; struct ipv6hdr hd; case TCP_LISTEN: - if (sk->sock_readers) + if (atomic_read(&sk->sock_readers)) return; /* Grrrr - fix this later. */ @@ -680,6 +699,7 @@ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req) { struct sk_buff * skb; struct dst_entry *dst; + struct ipv6_txoptions *opt = NULL; struct flowi fl; int mss; @@ -690,19 +710,26 @@ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req) fl.uli_u.ports.dport = req->rmt_port; fl.uli_u.ports.sport = sk->sport; - dst = ip6_route_output(sk, &fl); - if (dst->error) { - dst_release(dst); - return; + opt = sk->net_pinfo.af_inet6.opt; + if (opt == NULL && + sk->net_pinfo.af_inet6.rxopt.bits.srcrt == 2 && + req->af.v6_req.pktopts) { + struct sk_buff *pktopts = req->af.v6_req.pktopts; + struct inet6_skb_parm *rxopt = (struct inet6_skb_parm *)pktopts->cb; + if (rxopt->srcrt) + opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(pktopts->nh.raw + rxopt->srcrt)); } - mss = dst->pmtu - sizeof(struct ipv6hdr) - sizeof(struct tcphdr); -#if 0 - /* Subtract option length... 
*/ - if (opt) { - mss -= opt->optlen; + if (opt && opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; + fl.nl_u.ip6_u.daddr = rt0->addr; } -#endif + + dst = ip6_route_output(sk, &fl); + if (dst->error) + goto done; + + mss = dst->pmtu - sizeof(struct ipv6hdr) - sizeof(struct tcphdr); skb = tcp_make_synack(sk, dst, req, mss); if (skb) { @@ -712,13 +739,22 @@ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req) &req->af.v6_req.loc_addr, &req->af.v6_req.rmt_addr, csum_partial((char *)th, skb->len, skb->csum)); - ip6_xmit(sk, skb, &fl, req->af.v6_req.opt); + fl.nl_u.ip6_u.daddr = &req->af.v6_req.rmt_addr; + ip6_xmit(sk, skb, &fl, opt); } + +done: dst_release(dst); + if (opt && opt != sk->net_pinfo.af_inet6.opt) + sock_kfree_s(sk, opt, opt->tot_len); } static void tcp_v6_or_free(struct open_request *req) { + if (req->af.v6_req.pktopts) { + kfree_skb(req->af.v6_req.pktopts); + req->af.v6_req.pktopts = NULL; + } } static struct or_calltable or_ipv6 = { @@ -727,14 +763,27 @@ static struct or_calltable or_ipv6 = { tcp_v6_send_reset }; +static int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) +{ + struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb; + + if (sk->net_pinfo.af_inet6.rxopt.all) { + if ((opt->hop && sk->net_pinfo.af_inet6.rxopt.bits.hopopts) || + (opt->srcrt && sk->net_pinfo.af_inet6.rxopt.bits.srcrt) || + ((opt->dst1 || opt->dst0) && sk->net_pinfo.af_inet6.rxopt.bits.dstopts)) + return 1; + } + return 0; +} + + #define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */ #define BACKLOGMAX(sk) sysctl_max_syn_backlog /* FIXME: this is substantially similar to the ipv4 code. * Can some kind of merge be done? -- erics */ -static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, - __u32 isn) +static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, __u32 isn) { struct tcp_opt tp; struct open_request *req; @@ -747,7 +796,11 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, } if (skb->protocol == __constant_htons(ETH_P_IP)) - return tcp_v4_conn_request(sk, skb, ptr, isn); + return tcp_v4_conn_request(sk, skb, isn); + + /* FIXME: do the same check for anycast */ + if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) + goto drop; if (isn == 0) isn = tcp_v6_init_sequence(sk,skb); @@ -756,8 +809,9 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, * There are no SYN attacks on IPv6, yet... 
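Several hunks in this file now route via the first address of a type-0 routing header (rt0->addr) when the socket carries one in np->opt->srcrt, and only put the real destination back when the header is finally built. A trimmed-down sketch of that destination selection; the struct names below are placeholders, not the kernel types:

#include <netinet/in.h>
#include <stddef.h>

struct toy_flow {
        const struct in6_addr *daddr;      /* address the routing lookup will use */
};

struct toy_rt0 {
        struct in6_addr addr[1];           /* first intermediate hop of the source route */
};

/* Pick the routing destination: the first source-route hop when one is
 * configured on the socket, otherwise the final destination itself. */
static void toy_flow_set_dst(struct toy_flow *fl,
                             const struct in6_addr *final_dst,
                             const struct toy_rt0 *srcrt)
{
        fl->daddr = srcrt ? &srcrt->addr[0] : final_dst;
}

The transmit path further down shows the second half of the idea: after ip6_route_output() the flow's daddr is restored to np->daddr, so the packet header still names the final destination.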
*/ if (BACKLOG(sk) >= BACKLOGMAX(sk)) { - printk(KERN_DEBUG "droping syn ack:%d max:%d\n", - BACKLOG(sk), BACKLOGMAX(sk)); + (void)(net_ratelimit() && + printk(KERN_INFO "droping syn ack:%d max:%d\n", + BACKLOG(sk), BACKLOGMAX(sk))); goto drop; } @@ -773,13 +827,16 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, req->rcv_isn = TCP_SKB_CB(skb)->seq; req->snt_isn = isn; tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0; - tp.in_mss = 536; + tp.mss_clamp = 65535; tcp_parse_options(NULL, skb->h.th, &tp, 0); - req->mss = tp.in_mss; - if (tp.saw_tstamp) { - req->mss -= TCPOLEN_TSTAMP_ALIGNED; + if (tp.mss_clamp == 65535) + tp.mss_clamp = 576 - sizeof(struct ipv6hdr) - sizeof(struct iphdr); + if (sk->tp_pinfo.af_tcp.user_mss && sk->tp_pinfo.af_tcp.user_mss < tp.mss_clamp) + tp.mss_clamp = sk->tp_pinfo.af_tcp.user_mss; + + req->mss = tp.mss_clamp; + if (tp.saw_tstamp) req->ts_recent = tp.rcv_tsval; - } req->tstamp_ok = tp.tstamp_ok; req->sack_ok = tp.sack_ok; req->snd_wscale = tp.snd_wscale; @@ -787,7 +844,11 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, req->rmt_port = skb->h.th->source; ipv6_addr_copy(&req->af.v6_req.rmt_addr, &skb->nh.ipv6h->saddr); ipv6_addr_copy(&req->af.v6_req.loc_addr, &skb->nh.ipv6h->daddr); - req->af.v6_req.opt = NULL; /* FIXME: options */ + req->af.v6_req.pktopts = NULL; + if (ipv6_opt_accepted(sk, skb)) { + atomic_inc(&skb->users); + req->af.v6_req.pktopts = skb; + } req->af.v6_req.iif = sk->bound_dev_if; /* So that link locals have meaning */ @@ -804,8 +865,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, tcp_inc_slow_timer(TCP_SLT_SYNACK); tcp_synq_queue(&sk->tp_pinfo.af_tcp, req); - sk->data_ready(sk, 0); - return 0; drop: @@ -832,8 +891,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, struct flowi fl; struct tcp_opt *newtp; struct sock *newsk; - int mss; - + struct ipv6_txoptions *opt; + if (skb->protocol == __constant_htons(ETH_P_IP)) { /* * v6 mapped @@ -856,21 +915,37 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->tp_pinfo.af_tcp.af_specific = &ipv6_mapped; newsk->backlog_rcv = tcp_v4_do_rcv; + newsk->net_pinfo.af_inet6.pktoptions = NULL; + newsk->net_pinfo.af_inet6.opt = NULL; + + /* It is tricky place. Until this moment IPv4 tcp + worked with IPv6 af_tcp.af_specific. + Sync it now. 
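The conn_request and syn_recv_sock changes in this area replace the fixed sk->mss/sk->mtu pair with the tcp_sync_mss() call that follows, driven by dst->pmtu plus an mss_clamp taken from the peer's MSS option or the locally requested maximum segment size. The underlying arithmetic is just path MTU minus the fixed IPv6 and TCP header sizes; a hedged sketch with the header sizes written out as constants rather than sizeof on kernel structs:

#include <stdint.h>

#define IPV6_HDR_LEN 40                    /* fixed IPv6 header */
#define TCP_HDR_LEN  20                    /* TCP header without options */

/* Derive the segment size for an IPv6 path, honouring a clamp taken from
 * the peer's MSS option or the locally requested maximum. */
static uint32_t ipv6_tcp_mss(uint32_t path_mtu, uint32_t mss_clamp)
{
        uint32_t mss = path_mtu - IPV6_HDR_LEN - TCP_HDR_LEN;

        if (mss > mss_clamp)
                mss = mss_clamp;
        return mss;
}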
+ */ + tcp_sync_mss(newsk, newsk->tp_pinfo.af_tcp.pmtu_cookie); return newsk; } + opt = sk->net_pinfo.af_inet6.opt; if (sk->ack_backlog > sk->max_ack_backlog) - return NULL; + goto out; + + if (sk->net_pinfo.af_inet6.rxopt.bits.srcrt == 2 && + opt == NULL && req->af.v6_req.pktopts) { + struct inet6_skb_parm *rxopt = (struct inet6_skb_parm *)req->af.v6_req.pktopts->cb; + if (rxopt->srcrt) + opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(req->af.v6_req.pktopts->nh.raw+rxopt->srcrt)); + } if (dst == NULL) { - /* - * options / mss / route cache - */ - fl.proto = IPPROTO_TCP; fl.nl_u.ip6_u.daddr = &req->af.v6_req.rmt_addr; + if (opt && opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; + fl.nl_u.ip6_u.daddr = rt0->addr; + } fl.nl_u.ip6_u.saddr = &req->af.v6_req.loc_addr; fl.oif = sk->bound_dev_if; fl.uli_u.ports.dport = req->rmt_port; @@ -879,22 +954,17 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, dst = ip6_route_output(sk, &fl); } - if (dst->error || dst->pmtu < 576) + if (dst->error) goto out; - + sk->tp_pinfo.af_tcp.syn_backlog--; sk->ack_backlog++; - mss = dst->pmtu - sizeof(struct ipv6hdr); -#if 0 - /* Adjust mss by option size */ -#endif - - newsk = tcp_create_openreq_child(sk, req, skb, mss); + newsk = tcp_create_openreq_child(sk, req, skb); if (newsk == NULL) goto out; - ip6_dst_store(newsk, dst); + ip6_dst_store(newsk, dst, NULL); newtp = &(newsk->tp_pinfo.af_tcp); @@ -903,18 +973,55 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, ipv6_addr_copy(&np->saddr, &req->af.v6_req.loc_addr); ipv6_addr_copy(&np->rcv_saddr, &req->af.v6_req.loc_addr); newsk->bound_dev_if = req->af.v6_req.iif; - newsk->mtu = dst->pmtu; + + /* Now IPv6 options... + + First: no IPv4 options. + */ newsk->opt = NULL; + /* Clone RX bits */ + np->rxopt.all = sk->net_pinfo.af_inet6.rxopt.all; + + /* Clone pktoptions received with SYN */ + np->pktoptions = req->af.v6_req.pktopts; + if (np->pktoptions) + atomic_inc(&np->pktoptions->users); + np->opt = NULL; + + /* Clone native IPv6 options from listening socket (if any) + + Yes, keeping reference count would be much more clever, + but we make one more one thing there: reattach optmem + to newsk. 
+ */ + if (opt) { + np->opt = ipv6_dup_options(newsk, opt); + if (opt != sk->net_pinfo.af_inet6.opt) + sock_kfree_s(sk, opt, opt->tot_len); + } + + newtp->ext_header_len = 0; + if (np->opt) + newtp->ext_header_len = np->opt->opt_nflen + np->opt->opt_flen; + + tcp_sync_mss(newsk, dst->pmtu); + newtp->rcv_mss = newtp->mss_clamp; + newsk->daddr = LOOPBACK4_IPV6; newsk->saddr = LOOPBACK4_IPV6; newsk->rcv_saddr= LOOPBACK4_IPV6; newsk->prot->hash(newsk); add_to_prot_sklist(newsk); + + sk->data_ready(sk, 0); /* Deliver SIGIO */ + return newsk; out: + if (opt && opt != sk->net_pinfo.af_inet6.opt) + sock_kfree_s(sk, opt, opt->tot_len); dst_release(dst); return NULL; } @@ -1020,8 +1127,8 @@ static void tcp_v6_rst_req(struct sock *sk, struct sk_buff *skb) if (!req) return; /* Sequence number check required by RFC793 */ - if (before(TCP_SKB_CB(skb)->seq, req->snt_isn) || - after(TCP_SKB_CB(skb)->seq, req->snt_isn+1)) + if (before(TCP_SKB_CB(skb)->seq, req->rcv_isn) || + after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1)) return; if(req->sk) sk->ack_backlog--; @@ -1055,7 +1162,7 @@ static inline struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) } #if 0 /*def CONFIG_SYN_COOKIES */ else { - sk = cookie_v6_check(sk, skb, (struct ipv6_options *) skb->cb); + sk = cookie_v6_check(sk, skb); } #endif } @@ -1064,6 +1171,8 @@ static inline struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) { + int users = 0; + /* Imagine: socket is IPv6. IPv4 packet arrives, goes to IPv4 receive handler and backlogged. From backlog it always goes here. Kerboom... @@ -1080,6 +1189,8 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) * is currently called with bh processing disabled. */ + ipv6_statistics.Ip6InDelivers++; + /* XXX We need to think more about socket locking * XXX wrt. backlog queues, __release_sock(), etc. -DaveM */ @@ -1092,9 +1203,29 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) */ skb_set_owner_r(skb, sk); + /* Do Stevens' IPV6_PKTOPTIONS. + + Yes, guys, it is the only place in our code, where we + may make it not affecting IPv4. + The rest of code is protocol independent, + and I do not like idea to uglify IPv4. + + Actually, all the idea behind IPV6_PKTOPTIONS + looks not very well thought. For now we latch + options, received in the last packet, enqueued + by tcp. Feel free to propose better solution. + --ANK (980728) + */ + if (sk->net_pinfo.af_inet6.rxopt.all) { + users = atomic_read(&skb->users); + atomic_inc(&skb->users); + } + if (sk->state == TCP_ESTABLISHED) { /* Fast path */ if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) goto reset; + if (users) + goto ipv6_pktoptions; release_sock(sk); return 0; } @@ -1110,26 +1241,60 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) sk = nsk; } - if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->cb, skb->len)) + if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) goto reset; + if (users) + goto ipv6_pktoptions; release_sock(sk); return 0; reset: tcp_v6_send_reset(skb); discard: + if (users) + kfree_skb(skb); kfree_skb(skb); release_sock(sk); return 0; + +ipv6_pktoptions: + /* Do you ask, what is it? + + 1. skb was enqueued by tcp. + 2. skb is added to tail of read queue, rather than out of order. + 3. socket is not in passive state. + 4. Finally, it really contains options, which user wants to receive. 
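The four conditions listed above guard the IPV6_PKTOPTIONS latch that follows: the newest in-order segment that really carries requested options replaces whatever skb was stored on the socket before, and the displaced buffer is freed. A small single-threaded sketch of that exchange-and-free idiom; the kernel gets the same effect with xchg() under the socket lock:

#include <stdlib.h>

struct pktopts {
        int placeholder;                   /* stands in for the parsed options */
};

/* Latch the newest options buffer, dropping whatever was stored before. */
static void latch_pktoptions(struct pktopts **slot, struct pktopts *newest)
{
        struct pktopts *old = *slot;       /* the kernel swaps with xchg() */

        *slot = newest;
        if (old != NULL)
                free(old);                 /* kfree_skb() of the displaced skb */
}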
+ */ + if (atomic_read(&skb->users) > users && + TCP_SKB_CB(skb)->end_seq == sk->tp_pinfo.af_tcp.rcv_nxt && + !((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))) { + if (ipv6_opt_accepted(sk, skb)) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + kfree_skb(skb); + skb = NULL; + if (skb2) { + skb_set_owner_r(skb2, sk); + skb = xchg(&sk->net_pinfo.af_inet6.pktoptions, skb2); + } + } else { + kfree_skb(skb); + skb = xchg(&sk->net_pinfo.af_inet6.pktoptions, NULL); + } + } + + if (skb) + kfree_skb(skb); + release_sock(sk); + return 0; } -int tcp_v6_rcv(struct sk_buff *skb, struct device *dev, - struct in6_addr *saddr, struct in6_addr *daddr, - struct ipv6_options *opt, unsigned short len, - int redo, struct inet6_protocol *protocol) +int tcp_v6_rcv(struct sk_buff *skb, unsigned long len) { struct tcphdr *th; struct sock *sk; + struct device *dev = skb->dev; + struct in6_addr *saddr = &skb->nh.ipv6h->saddr; + struct in6_addr *daddr = &skb->nh.ipv6h->daddr; th = skb->h.th; @@ -1178,7 +1343,7 @@ int tcp_v6_rcv(struct sk_buff *skb, struct device *dev, if(sk->state == TCP_TIME_WAIT) goto do_time_wait; - if (!sk->sock_readers) + if (!atomic_read(&sk->sock_readers)) return tcp_v6_do_rcv(sk, skb); __skb_queue_tail(&sk->back_log, skb); @@ -1198,7 +1363,7 @@ discard_it: do_time_wait: if(tcp_timewait_state_process((struct tcp_tw_bucket *)sk, - skb, th, &(IPCB(skb)->opt), skb->len)) + skb, th, skb->len)) goto no_tcp_socket; goto discard_it; } @@ -1221,6 +1386,12 @@ static int tcp_v6_rebuild_header(struct sock *sk) fl.uli_u.ports.dport = sk->dport; fl.uli_u.ports.sport = sk->sport; + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; + fl.nl_u.ip6_u.daddr = rt0->addr; + } + + dst = ip6_route_output(sk, &fl); if (dst->error) { @@ -1228,7 +1399,7 @@ static int tcp_v6_rebuild_header(struct sock *sk) return dst->error; } - ip6_dst_store(sk, dst); + ip6_dst_store(sk, dst, NULL); } return dst->error; @@ -1258,6 +1429,11 @@ static void tcp_v6_xmit(struct sk_buff *skb) fl.uli_u.ports.sport = sk->sport; fl.uli_u.ports.dport = sk->dport; + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; + fl.nl_u.ip6_u.daddr = rt0->addr; + } + if (sk->dst_cache) dst = dst_check(&sk->dst_cache, np->dst_cookie); @@ -1270,11 +1446,14 @@ static void tcp_v6_xmit(struct sk_buff *skb) return; } - ip6_dst_store(sk, dst); + ip6_dst_store(sk, dst, NULL); } skb->dst = dst_clone(dst); + /* Restore final destination back after routing done */ + fl.nl_u.ip6_u.daddr = &np->daddr; + ip6_xmit(sk, skb, &fl, np->opt); } @@ -1295,6 +1474,8 @@ static struct tcp_func ipv6_specific = { tcp_v6_conn_request, tcp_v6_syn_recv_sock, tcp_v6_get_sock, + sizeof(struct ipv6hdr), + ipv6_setsockopt, ipv6_getsockopt, v6_addr2sockaddr, @@ -1312,6 +1493,8 @@ static struct tcp_func ipv6_mapped = { tcp_v6_conn_request, tcp_v6_syn_recv_sock, tcp_v6_get_sock, + sizeof(struct iphdr), + ipv6_setsockopt, ipv6_getsockopt, v6_addr2sockaddr, @@ -1330,7 +1513,7 @@ static int tcp_v6_init_sock(struct sock *sk) tp->rto = TCP_TIMEOUT_INIT; /*TCP_WRITE_TIME*/ tp->mdev = TCP_TIMEOUT_INIT; - tp->in_mss = 536; + tp->mss_clamp = ~0; /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. @@ -1338,17 +1521,17 @@ static int tcp_v6_init_sock(struct sock *sk) tp->snd_cwnd = (1 << TCP_CWND_SHIFT); tp->snd_ssthresh = 0x7fffffff; - sk->priority = 1; sk->state = TCP_CLOSE; sk->max_ack_backlog = SOMAXCONN; - sk->mtu = 576; - sk->mss = 536; + tp->rcv_mss = 536; /* Init SYN queue. 
*/ tcp_synq_init(tp); sk->tp_pinfo.af_tcp.af_specific = &ipv6_specific; + sk->write_space = tcp_write_space; + return 0; } @@ -1376,12 +1559,6 @@ static int tcp_v6_destroy_sock(struct sock *sk) while((skb = __skb_dequeue(&tp->out_of_order_queue)) != NULL) kfree_skb(skb); - /* - * Release destination entry - */ - - dst_release(xchg(&sk->dst_cache,NULL)); - /* Clean up a locked TCP bind bucket, this only happens if a * port is allocated for a socket, but it never fully connects. * In which case we will find num to be non-zero and daddr to @@ -1390,7 +1567,7 @@ static int tcp_v6_destroy_sock(struct sock *sk) if(ipv6_addr_any(&(sk->net_pinfo.af_inet6.daddr)) && sk->num != 0) tcp_bucket_unlock(sk); - return 0; + return inet6_destroy_sock(sk); } struct proto tcpv6_prot = { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 2dac0570f..bfa701c97 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -7,7 +7,7 @@ * * Based on linux/ipv4/udp.c * - * $Id: udp.c,v 1.31 1998/07/15 05:05:45 davem Exp $ + * $Id: udp.c,v 1.33 1998/08/27 16:55:20 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -15,6 +15,7 @@ * 2 of the License, or (at your option) any later version. */ +#include <linux/config.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> @@ -59,6 +60,14 @@ static int udp_v6_verify_bind(struct sock *sk, unsigned short snum) if((sk2->num == snum) && (sk2 != sk)) { unsigned char state = sk2->state; int sk2_reuse = sk2->reuse; + + /* Two sockets can be bound to the same port if they're + * bound to different interfaces. + */ + + if(sk2->bound_dev_if != sk->bound_dev_if) + continue; + if(addr_type == IPV6_ADDR_ANY || (!sk2->rcv_saddr)) { if((!sk2_reuse) || (!sk_reuse) || @@ -139,7 +148,7 @@ static void udp_v6_rehash(struct sock *sk) } static struct sock *udp_v6_lookup(struct in6_addr *saddr, u16 sport, - struct in6_addr *daddr, u16 dport) + struct in6_addr *daddr, u16 dport, int dif) { struct sock *sk, *result = NULL; unsigned short hnum = ntohs(dport); @@ -166,7 +175,12 @@ static struct sock *udp_v6_lookup(struct in6_addr *saddr, u16 sport, continue; score++; } - if(score == 3) { + if(sk->bound_dev_if) { + if(sk->bound_dev_if != dif) + continue; + score++; + } + if(score == 4) { result = sk; break; } else if(score > badness) { @@ -257,20 +271,25 @@ ipv4_connected: */ fl.proto = IPPROTO_UDP; - fl.nl_u.ip6_u.daddr = daddr; + fl.nl_u.ip6_u.daddr = &np->daddr; fl.nl_u.ip6_u.saddr = NULL; fl.oif = sk->bound_dev_if; fl.uli_u.ports.dport = sk->dport; fl.uli_u.ports.sport = sk->sport; + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; + fl.nl_u.ip6_u.daddr = rt0->addr; + } + dst = ip6_route_output(sk, &fl); - + if (dst->error) { dst_release(dst); return dst->error; } - ip6_dst_store(sk, dst); + ip6_dst_store(sk, dst, fl.nl_u.ip6_u.daddr); /* get the source adddress used in the apropriate device */ @@ -291,15 +310,50 @@ ipv4_connected: static void udpv6_close(struct sock *sk, unsigned long timeout) { - lock_sock(sk); + /* See for explanation: raw_close in ipv4/raw.c */ sk->state = TCP_CLOSE; - ipv6_sock_mc_close(sk); udp_v6_unhash(sk); sk->dead = 1; - release_sock(sk); destroy_sock(sk); } +#ifdef CONFIG_FILTER +#undef CONFIG_UDP_DELAY_CSUM +#endif + +#ifdef CONFIG_UDP_DELAY_CSUM + +/* Please, read comments in net/checksum.h, asm/checksum.h + + I commented out csum_partial_copy_to_user there because it did not + verify_area. 
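udp_v6_lookup() above now takes the receiving interface into account and scores each candidate socket on how many of its bound attributes (local address, remote peer, bound device) match the packet, keeping the most specific hit. A compact user-space sketch of that best-match scoring over a toy socket table; all types and fields here are invented, and the remote address and port are folded into one field for brevity:

#include <stddef.h>
#include <string.h>

struct toy_sock {
        struct toy_sock *next;
        unsigned short  num;               /* bound local port, host order */
        const char      *rcv_saddr;        /* NULL means wildcard local address */
        const char      *daddr;            /* NULL means not connected */
        int             bound_dev_if;      /* 0 means any interface */
};

static struct toy_sock *toy_udp_lookup(struct toy_sock *head,
                                       unsigned short port,
                                       const char *laddr, const char *raddr,
                                       int dif)
{
        struct toy_sock *sk, *best = NULL;
        int badness = -1;

        for (sk = head; sk != NULL; sk = sk->next) {
                int score = 0;

                if (sk->num != port)
                        continue;
                if (sk->rcv_saddr != NULL) {
                        if (strcmp(sk->rcv_saddr, laddr) != 0)
                                continue;
                        score++;
                }
                if (sk->daddr != NULL) {
                        if (strcmp(sk->daddr, raddr) != 0)
                                continue;
                        score++;
                }
                if (sk->bound_dev_if != 0) {
                        if (sk->bound_dev_if != dif)
                                continue;
                        score++;
                }
                if (score > badness) {     /* keep the most specific match */
                        best = sk;
                        badness = score;
                }
        }
        return best;
}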
Now I am even wondered, how clever was I that time 8)8) + If I did not it, I would step into this hole again. --ANK + */ + +#ifndef _HAVE_ARCH_COPY_AND_CSUM_TO_USER +#if defined(__i386__) +static __inline__ +unsigned int csum_and_copy_to_user (const char *src, char *dst, + int len, int sum, int *err_ptr) +{ + int *src_err_ptr=NULL; + + if (verify_area(VERIFY_WRITE, dst, len) == 0) + return csum_partial_copy_generic(src, dst, len, sum, src_err_ptr, err_ptr); + + if (len) + *err_ptr = -EFAULT; + + return sum; +} +#elif defined(__sparc__) +#define csum_and_copy_to_user csum_partial_copy_to_user +#else +#undef CONFIG_UDP_DELAY_CSUM +#endif +#endif +#endif + /* * This should be easy, if there is something there we * return it, otherwise we block. @@ -322,12 +376,12 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, * From here the generic datagram does a lot of the work. Come * the finished NET3, it will do _ALL_ the work! */ - + skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) goto out; - copied = ntohs(((struct udphdr *)skb->h.raw)->len) - sizeof(struct udphdr); + copied = skb->len - sizeof(struct udphdr); if (copied > len) { copied = len; msg->msg_flags |= MSG_TRUNC; @@ -337,8 +391,41 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, * FIXME : should use udp header size info value */ +#ifndef CONFIG_UDP_DELAY_CSUM err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied); +#else + if (sk->no_check || skb->ip_summed==CHECKSUM_UNNECESSARY) { + err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, + copied); + } else if (copied > msg->msg_iov[0].iov_len || (msg->msg_flags&MSG_TRUNC)) { + if (csum_fold(csum_partial(skb->h.raw, ntohs(skb->h.uh->len), skb->csum))) { + /* Error for blocking case is chosen to masquerade + as some normal condition. + */ + err = (msg->msg_flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH; + udp_stats_in6.UdpInErrors++; + goto out_free; + } + err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, + copied); + } else { + unsigned int csum = csum_partial(skb->h.raw, sizeof(struct udphdr), skb->csum); + + err = 0; + csum = csum_and_copy_to_user((char*)&skb->h.uh[1], msg->msg_iov[0].iov_base, copied, csum, &err); + if (err) + goto out_free; + if (csum_fold(csum)) { + /* Error for blocking case is chosen to masquerade + as some normal condition. + */ + err = (msg->msg_flags&MSG_DONTWAIT) ? 
-EAGAIN : -EHOSTUNREACH; + udp_stats_in6.UdpInErrors++; + goto out_free; + } + } +#endif if (err) goto out_free; @@ -361,7 +448,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, memcpy(&sin6->sin6_addr, &skb->nh.ipv6h->saddr, sizeof(struct in6_addr)); - if (msg->msg_controllen) + if (sk->net_pinfo.af_inet6.rxopt.all) datagram_recv_ctl(sk, msg, skb); } } @@ -373,20 +460,27 @@ out: return err; } -void udpv6_err(struct sk_buff *skb, int type, int code, unsigned char *buff, __u32 info, - struct in6_addr *saddr, struct in6_addr *daddr, - struct inet6_protocol *protocol) +void udpv6_err(struct sk_buff *skb, struct ipv6hdr *hdr, + struct inet6_skb_parm *opt, + int type, int code, unsigned char *buff, __u32 info) { + struct device *dev = skb->dev; + struct in6_addr *saddr = &hdr->saddr; + struct in6_addr *daddr = &hdr->daddr; struct sock *sk; struct udphdr *uh; int err; - + + if (buff + sizeof(struct udphdr) > skb->tail) + return; + uh = (struct udphdr *) buff; - sk = udp_v6_lookup(daddr, uh->dest, saddr, uh->source); + sk = udp_v6_lookup(daddr, uh->dest, saddr, uh->source, dev->ifindex); if (sk == NULL) { - printk(KERN_DEBUG "icmp for unknown sock\n"); + if (net_ratelimit()) + printk(KERN_DEBUG "icmp for unknown sock\n"); return; } @@ -407,11 +501,10 @@ static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) if (sock_queue_rcv_skb(sk,skb)<0) { udp_stats_in6.UdpInErrors++; ipv6_statistics.Ip6InDiscards++; - ipv6_statistics.Ip6InDelivers--; - skb->sk = NULL; kfree_skb(skb); return 0; } + ipv6_statistics.Ip6InDelivers++; udp_stats_in6.UdpInDatagrams++; return 0; } @@ -430,7 +523,8 @@ static __inline__ int inet6_mc_check(struct sock *sk, struct in6_addr *addr) static struct sock *udp_v6_mcast_next(struct sock *sk, u16 loc_port, struct in6_addr *loc_addr, - u16 rmt_port, struct in6_addr *rmt_addr) + u16 rmt_port, struct in6_addr *rmt_addr, + int dif) { struct sock *s = sk; unsigned short num = ntohs(loc_port); @@ -446,6 +540,9 @@ static struct sock *udp_v6_mcast_next(struct sock *sk, ipv6_addr_cmp(&np->daddr, rmt_addr)) continue; + if (s->bound_dev_if && s->bound_dev_if != dif) + continue; + if(!ipv6_addr_any(&np->rcv_saddr)) { if(ipv6_addr_cmp(&np->rcv_saddr, loc_addr) == 0) return s; @@ -468,16 +565,18 @@ static void udpv6_mcast_deliver(struct udphdr *uh, { struct sock *sk, *sk2; struct sk_buff *buff; + int dif; sk = udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]; - sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr); + dif = skb->dev->ifindex; + sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); if (!sk) goto free_skb; buff = NULL; sk2 = sk; while((sk2 = udp_v6_mcast_next(sk2->next, uh->dest, saddr, - uh->source, daddr))) { + uh->source, daddr, dif))) { if (!buff) { buff = skb_clone(skb, GFP_ATOMIC); if (!buff) @@ -486,59 +585,70 @@ static void udpv6_mcast_deliver(struct udphdr *uh, if (sock_queue_rcv_skb(sk2, buff) >= 0) buff = NULL; } - if (buff) { - buff->sk = NULL; + if (buff) kfree_skb(buff); - } if (sock_queue_rcv_skb(sk, skb) < 0) { - free_skb: - skb->sk = NULL; +free_skb: kfree_skb(skb); } } -int udpv6_rcv(struct sk_buff *skb, struct device *dev, - struct in6_addr *saddr, struct in6_addr *daddr, - struct ipv6_options *opt, unsigned short len, - int redo, struct inet6_protocol *protocol) +int udpv6_rcv(struct sk_buff *skb, unsigned long len) { struct sock *sk; struct udphdr *uh; - int ulen; - - /* - * check if the address is ours... 
- * I believe that this is being done in IP layer - */ + struct device *dev = skb->dev; + struct in6_addr *saddr = &skb->nh.ipv6h->saddr; + struct in6_addr *daddr = &skb->nh.ipv6h->daddr; + u32 ulen; - uh = (struct udphdr *) skb->h.uh; - - ipv6_statistics.Ip6InDelivers++; + uh = skb->h.uh; + __skb_pull(skb, skb->h.raw - skb->data); ulen = ntohs(uh->len); - + + /* Check for jumbo payload */ + if (ulen == 0 && skb->nh.ipv6h->payload_len == 0) + ulen = len; + if (ulen > len || len < sizeof(*uh)) { - printk(KERN_DEBUG "UDP: short packet: %d/%d\n", ulen, len); + if (net_ratelimit()) + printk(KERN_DEBUG "UDP: short packet: %d/%ld\n", ulen, len); udp_stats_in6.UdpInErrors++; kfree_skb(skb); return(0); } if (uh->check == 0) { - printk(KERN_DEBUG "IPv6: udp checksum is 0\n"); + /* IPv6 draft-v2 section 8.1 says that we SHOULD log + this error. Well, it is reasonable. + */ + if (net_ratelimit()) + printk(KERN_INFO "IPv6: udp checksum is 0\n"); goto discard; } + skb_trim(skb, ulen); + +#ifndef CONFIG_UDP_DELAY_CSUM switch (skb->ip_summed) { case CHECKSUM_NONE: - skb->csum = csum_partial((char*)uh, len, 0); + skb->csum = csum_partial((char*)uh, ulen, 0); case CHECKSUM_HW: - if (csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, skb->csum)) { + if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) { printk(KERN_DEBUG "IPv6: udp checksum error\n"); goto discard; } }; - +#else + if (skb->ip_summed==CHECKSUM_HW) { + if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) + goto discard; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->csum = ~csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, 0); +#endif + len = ulen; /* @@ -555,10 +665,16 @@ int udpv6_rcv(struct sk_buff *skb, struct device *dev, * check socket cache ... must talk to Alan about his plans * for sock caches... i'll skip this for now. 
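udpv6_rcv() above treats a UDP length of zero together with a zero IPv6 payload length as the jumbo-payload case and falls back to the length actually delivered, while anything claiming more data than arrived, or shorter than a UDP header, is counted as an error and dropped. A minimal sketch of that length validation; lengths are plain host-order integers here, whereas the kernel works on the network-order header fields:

#include <stdint.h>

#define UDP_HDR_LEN 8

/* Return the usable datagram length, or 0 when the packet must be dropped. */
static uint32_t udp6_check_len(uint32_t udp_len, uint32_t ip6_payload_len,
                               uint32_t bytes_received)
{
        if (udp_len == 0 && ip6_payload_len == 0)
                udp_len = bytes_received;  /* jumbo payload: use what arrived */
        if (udp_len > bytes_received || bytes_received < UDP_HDR_LEN)
                return 0;                  /* short or inconsistent packet */
        return udp_len;
}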
*/ - - sk = udp_v6_lookup(saddr, uh->source, daddr, uh->dest); - + + sk = udp_v6_lookup(saddr, uh->source, daddr, uh->dest, dev->ifindex); + if (sk == NULL) { +#ifdef CONFIG_UDP_DELAY_CSUM + if (skb->ip_summed != CHECKSUM_UNNECESSARY && + csum_fold(csum_partial((char*)uh, len, skb->csum))) + goto discard; +#endif + udp_stats_in6.UdpNoPorts++; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, dev); @@ -566,16 +682,13 @@ int udpv6_rcv(struct sk_buff *skb, struct device *dev, kfree_skb(skb); return(0); } - + /* deliver */ - - if (sk->sock_readers) - __skb_queue_tail(&sk->back_log, skb); - else - udpv6_queue_rcv_skb(sk, skb); + + udpv6_queue_rcv_skb(sk, skb); return(0); - + discard: udp_stats_in6.UdpInErrors++; kfree_skb(skb); @@ -618,7 +731,7 @@ static int udpv6_getfrag(const void *data, struct in6_addr *addr, } if (csum_partial_copy_fromiovecend(dst, udh->iov, offset, - clen, &udh->wcheck)) + clen, &udh->wcheck)) return -EFAULT; if (final) { @@ -649,11 +762,11 @@ static int udpv6_getfrag(const void *data, struct in6_addr *addr, static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen) { - struct ipv6_options opt_space; + struct ipv6_txoptions opt_space; struct udpv6fakehdr udh; struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) msg->msg_name; - struct ipv6_options *opt = NULL; + struct ipv6_txoptions *opt = NULL; struct flowi fl; int addr_len = msg->msg_namelen; struct in6_addr *daddr; @@ -661,22 +774,18 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen) int len = ulen + sizeof(struct udphdr); int addr_type; int hlimit = -1; - + int err; /* Rough check on arithmetic overflow, better check is made in ip6_build_xmit - - When jumbo header will be implemeted we will change it - to something sort of (len will be size_t) - ulen > SIZE_T_MAX - sizeof(struct udphdr) - */ - if (ulen < 0 || ulen > 0xFFFF - sizeof(struct udphdr)) + */ + if (ulen < 0 || ulen > INT_MAX - sizeof(struct udphdr)) return -EMSGSIZE; - + if (msg->msg_flags & ~(MSG_DONTROUTE|MSG_DONTWAIT)) return(-EINVAL); - + if (sin6) { if (sin6->sin6_family == AF_INET) return udp_sendmsg(sk, msg, ulen); @@ -692,14 +801,6 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen) udh.uh.dest = sin6->sin6_port; daddr = &sin6->sin6_addr; - - /* BUGGGG! 
If route is not cloned, this check always - fails, hence dst_cache only slows down transmission --ANK - */ - if (sk->dst_cache && ipv6_addr_cmp(daddr, &np->daddr)) { - dst_release(sk->dst_cache); - sk->dst_cache = NULL; - } } else { if (sk->state != TCP_ESTABLISHED) return(-ENOTCONN); @@ -707,9 +808,9 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen) udh.uh.dest = sk->dport; daddr = &sk->net_pinfo.af_inet6.daddr; } - + addr_type = ipv6_addr_type(daddr); - + if (addr_type == IPV6_ADDR_MAPPED) { struct sockaddr_in sin; @@ -720,24 +821,25 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen) return udp_sendmsg(sk, msg, ulen); } - + udh.daddr = NULL; fl.oif = sk->bound_dev_if; if (msg->msg_controllen) { opt = &opt_space; - memset(opt, 0, sizeof(struct ipv6_options)); + memset(opt, 0, sizeof(struct ipv6_txoptions)); err = datagram_send_ctl(msg, &fl.oif, &saddr, opt, &hlimit); if (err < 0) return err; - - if (opt->srcrt) - udh.daddr = daddr; } - + if (opt == NULL || !(opt->opt_nflen|opt->opt_flen)) + opt = np->opt; + if (opt && opt->srcrt) + udh.daddr = daddr; + udh.uh.source = sk->sport; - udh.uh.len = htons(len); + udh.uh.len = len < 0x1000 ? htons(len) : 0; udh.uh.check = 0; udh.iov = msg->msg_iov; udh.wcheck = 0; @@ -783,7 +885,7 @@ struct proto udpv6_prot = { datagram_poll, /* poll */ udp_ioctl, /* ioctl */ NULL, /* init */ - NULL, /* destroy */ + inet6_destroy_sock, /* destroy */ NULL, /* shutdown */ ipv6_setsockopt, /* setsockopt */ ipv6_getsockopt, /* getsockopt */ |