author     Ralf Baechle <ralf@linux-mips.org>    1998-09-19 19:15:08 +0000
committer  Ralf Baechle <ralf@linux-mips.org>    1998-09-19 19:15:08 +0000
commit     03ba4131783cc9e872f8bb26a03f15bc11f27564 (patch)
tree       88db8dba75ae06ba3bad08e42c5e52efc162535c /net/ipv6
parent     257730f99381dd26e10b832fce4c94cae7ac1176 (diff)
- Merge with Linux 2.1.121.
- Bugfixes.
Diffstat (limited to 'net/ipv6')
-rw-r--r--  net/ipv6/addrconf.c       |   46
-rw-r--r--  net/ipv6/af_inet6.c       |   52
-rw-r--r--  net/ipv6/datagram.c       |  138
-rw-r--r--  net/ipv6/exthdrs.c        |  670
-rw-r--r--  net/ipv6/icmp.c           |  201
-rw-r--r--  net/ipv6/ip6_fib.c        | 1199
-rw-r--r--  net/ipv6/ip6_fw.c         |   16
-rw-r--r--  net/ipv6/ip6_input.c      |  244
-rw-r--r--  net/ipv6/ip6_output.c     |  451
-rw-r--r--  net/ipv6/ipv6_sockglue.c  |  145
-rw-r--r--  net/ipv6/mcast.c          |   88
-rw-r--r--  net/ipv6/ndisc.c          |  104
-rw-r--r--  net/ipv6/proc.c           |  106
-rw-r--r--  net/ipv6/raw.c            |   75
-rw-r--r--  net/ipv6/reassembly.c     |  358
-rw-r--r--  net/ipv6/route.c          | 1198
-rw-r--r--  net/ipv6/sit.c            |   14
-rw-r--r--  net/ipv6/tcp_ipv6.c       |  401
-rw-r--r--  net/ipv6/udp.c            |  276
19 files changed, 3447 insertions, 2335 deletions
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 329807093..a61be48c8 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: addrconf.c,v 1.43 1998/07/15 05:05:32 davem Exp $ + * $Id: addrconf.c,v 1.45 1998/08/26 12:04:41 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -38,6 +38,7 @@ #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif +#include <linux/delay.h> #include <linux/proc_fs.h> #include <net/sock.h> @@ -53,7 +54,6 @@ #include <linux/rtnetlink.h> #include <asm/uaccess.h> -#include <asm/delay.h> /* Set to 3 to get tracing... */ #define ACONF_DEBUG 2 @@ -100,7 +100,7 @@ struct ipv6_devconf ipv6_devconf = { 0, /* forwarding */ IPV6_DEFAULT_HOPLIMIT, /* hop limit */ - 576, /* mtu */ + IPV6_MIN_MTU, /* mtu */ 1, /* accept RAs */ 1, /* accept redirects */ 1, /* autoconfiguration */ @@ -114,7 +114,7 @@ static struct ipv6_devconf ipv6_devconf_dflt = { 0, /* forwarding */ IPV6_DEFAULT_HOPLIMIT, /* hop limit */ - 576, /* mtu */ + IPV6_MIN_MTU, /* mtu */ 1, /* accept RAs */ 1, /* accept redirects */ 1, /* autoconfiguration */ @@ -185,7 +185,7 @@ static struct inet6_dev * ipv6_add_dev(struct device *dev) struct inet6_dev *ndev, **bptr, *iter; int hash; - if (dev->mtu < 576) + if (dev->mtu < IPV6_MIN_MTU) return NULL; ndev = kmalloc(sizeof(struct inet6_dev), gfp_any()); @@ -548,7 +548,6 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct device *dev, unsigned long expires, unsigned flags) { struct in6_rtmsg rtmsg; - int err; memset(&rtmsg, 0, sizeof(rtmsg)); memcpy(&rtmsg.rtmsg_dst, pfx, sizeof(struct in6_addr)); @@ -566,7 +565,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct device *dev, if (dev->type == ARPHRD_SIT && (dev->flags&IFF_POINTOPOINT)) rtmsg.rtmsg_flags |= RTF_NONEXTHOP; - ip6_route_add(&rtmsg, &err); + ip6_route_add(&rtmsg); } /* Create "default" multicast route to the interface */ @@ -574,7 +573,6 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct device *dev, static void addrconf_add_mroute(struct device *dev) { struct in6_rtmsg rtmsg; - int err; memset(&rtmsg, 0, sizeof(rtmsg)); ipv6_addr_set(&rtmsg.rtmsg_dst, @@ -584,13 +582,12 @@ static void addrconf_add_mroute(struct device *dev) rtmsg.rtmsg_ifindex = dev->ifindex; rtmsg.rtmsg_flags = RTF_UP|RTF_ADDRCONF; rtmsg.rtmsg_type = RTMSG_NEWROUTE; - ip6_route_add(&rtmsg, &err); + ip6_route_add(&rtmsg); } static void sit_route_add(struct device *dev) { struct in6_rtmsg rtmsg; - int err; memset(&rtmsg, 0, sizeof(rtmsg)); @@ -602,7 +599,7 @@ static void sit_route_add(struct device *dev) rtmsg.rtmsg_flags = RTF_UP|RTF_NONEXTHOP; rtmsg.rtmsg_ifindex = dev->ifindex; - ip6_route_add(&rtmsg, &err); + ip6_route_add(&rtmsg); } static void addrconf_add_lroute(struct device *dev) @@ -690,13 +687,12 @@ void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len) else rt_expires = jiffies + valid_lft * HZ; - rt = rt6_lookup(&pinfo->prefix, NULL, dev->ifindex, RTF_LINKRT); + rt = rt6_lookup(&pinfo->prefix, NULL, dev->ifindex, 1); if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) { if (rt->rt6i_flags&RTF_EXPIRES) { if (pinfo->onlink == 0 || valid_lft == 0) { ip6_del_rt(rt); - rt = NULL; } else { rt->rt6i_expires = rt_expires; } @@ -705,6 +701,8 @@ void addrconf_prefix_rcv(struct device *dev, u8 *opt, int len) addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, dev, rt_expires, RTF_ADDRCONF|RTF_EXPIRES); 
} + if (rt) + dst_release(&rt->u.dst); /* Try to figure out our local address for this prefix */ @@ -1118,11 +1116,17 @@ int addrconf_notify(struct notifier_block *this, unsigned long event, break; case NETDEV_CHANGEMTU: - /* BUGGG... Should scan FIB to change pmtu on routes. --ANK */ - if (dev->mtu >= 576) + if (dev->mtu >= IPV6_MIN_MTU) { + struct inet6_dev *idev; + + if ((idev = ipv6_find_idev(dev)) == NULL) + break; + idev->cnf.mtu6 = dev->mtu; + rt6_mtu_change(dev, dev->mtu); break; + } - /* MTU falled under 576. Stop IPv6 on this interface. */ + /* MTU falled under IPV6_MIN_MTU. Stop IPv6 on this interface. */ case NETDEV_DOWN: case NETDEV_UNREGISTER: @@ -1240,7 +1244,6 @@ static void addrconf_rs_timer(unsigned long data) add_timer(&ifp->timer); } else { struct in6_rtmsg rtmsg; - int err; printk(KERN_DEBUG "%s: no IPv6 routers present\n", ifp->idev->dev->name); @@ -1253,7 +1256,7 @@ static void addrconf_rs_timer(unsigned long data) rtmsg.rtmsg_ifindex = ifp->idev->dev->ifindex; - ip6_route_add(&rtmsg, &err); + ip6_route_add(&rtmsg); } } @@ -1501,7 +1504,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) } static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, - pid_t pid, u32 seq, int event) + u32 pid, u32 seq, int event) { struct ifaddrmsg *ifm; struct nlmsghdr *nlh; @@ -1659,8 +1662,11 @@ int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp, addrconf_forward_change(idev); - if (*valp) + if (*valp) { + start_bh_atomic(); rt6_purge_dflt_routers(0); + end_bh_atomic(); + } } return ret; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 051f9a28e..a9ee64925 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -7,7 +7,7 @@ * * Adapted from linux/net/ipv4/af_inet.c * - * $Id: af_inet6.c,v 1.36 1998/06/10 07:29:25 davem Exp $ + * $Id: af_inet6.c,v 1.37 1998/08/26 12:04:45 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -64,6 +64,7 @@ extern int raw6_get_info(char *, char **, off_t, int, int); extern int tcp6_get_info(char *, char **, off_t, int, int); extern int udp6_get_info(char *, char **, off_t, int, int); extern int afinet6_get_info(char *, char **, off_t, int, int); +extern int afinet6_get_snmp(char *, char **, off_t, int, int); #endif #ifdef CONFIG_SYSCTL @@ -243,10 +244,49 @@ static int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) static int inet6_release(struct socket *sock, struct socket *peer) { + struct sock *sk = sock->sk; + + if (sk == NULL) + return -EINVAL; + + /* Free mc lists */ + ipv6_sock_mc_close(sk); + + /* Huh! MOD_DEC_USE_COUNT was here :-( + It is impossible by two reasons: socket destroy + may be delayed and inet_release may sleep and + return to nowhere then. It should be moved to + inet6_destroy_sock(), but we have no explicit constructor :-( + --ANK (980802) + */ MOD_DEC_USE_COUNT; return inet_release(sock, peer); } +int inet6_destroy_sock(struct sock *sk) +{ + struct sk_buff *skb; + struct ipv6_txoptions *opt; + + /* + * Release destination entry + */ + + dst_release(xchg(&sk->dst_cache,NULL)); + + /* Release rx options */ + + if ((skb = xchg(&sk->net_pinfo.af_inet6.pktoptions, NULL)) != NULL) + kfree_skb(skb); + + /* Free tx options */ + + if ((opt = xchg(&sk->net_pinfo.af_inet6.opt, NULL)) != NULL) + sock_kfree_s(sk, opt, opt->tot_len); + + return 0; +} + /* * This does both peername and sockname. 
*/ @@ -412,6 +452,12 @@ static struct proc_dir_entry proc_net_sockstat6 = { 0, &proc_net_inode_operations, afinet6_get_info }; +static struct proc_dir_entry proc_net_snmp6 = { + PROC_NET_SNMP6, 5, "snmp6", + S_IFREG | S_IRUGO, 1, 0, 0, + 0, &proc_net_inode_operations, + afinet6_get_snmp +}; #endif /* CONFIG_PROC_FS */ #ifdef MODULE @@ -445,7 +491,7 @@ __initfunc(void inet6_proto_init(struct net_proto *pro)) printk(KERN_INFO "IPv6 v0.2 for NET3.037\n"); - if (sizeof(struct ipv6_options) > sizeof(dummy_skb->cb)) + if (sizeof(struct inet6_skb_parm) > sizeof(dummy_skb->cb)) { printk(KERN_CRIT "inet6_proto_init: size fault\n"); #ifdef MODULE @@ -490,6 +536,7 @@ __initfunc(void inet6_proto_init(struct net_proto *pro)) proc_net_register(&proc_net_tcp6); proc_net_register(&proc_net_udp6); proc_net_register(&proc_net_sockstat6); + proc_net_register(&proc_net_snmp6); #endif /* Now the userspace is allowed to create INET6 sockets. */ @@ -526,6 +573,7 @@ void cleanup_module(void) proc_net_unregister(proc_net_tcp6.low_ino); proc_net_unregister(proc_net_udp6.low_ino); proc_net_unregister(proc_net_sockstat6.low_ino); + proc_net_unregister(proc_net_snmp6.low_ino); #endif /* Cleanup code parts. */ sit_cleanup(); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index b87f31b06..51960bd26 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: datagram.c,v 1.14 1998/03/20 09:12:15 davem Exp $ + * $Id: datagram.c,v 1.15 1998/08/26 12:04:47 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -32,48 +32,72 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; - struct ipv6_options *opt = (struct ipv6_options *) skb->cb; - - if (np->rxinfo) { + struct inet6_skb_parm *opt = (struct inet6_skb_parm *) skb->cb; + + if (np->rxopt.bits.rxinfo) { struct in6_pktinfo src_info; - src_info.ipi6_ifindex = skb->dev->ifindex; + src_info.ipi6_ifindex = opt->iif; ipv6_addr_copy(&src_info.ipi6_addr, &skb->nh.ipv6h->daddr); put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } - if (np->rxhlim) { + if (np->rxopt.bits.rxhlim) { int hlim = skb->nh.ipv6h->hop_limit; put_cmsg(msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); } - if (opt->srcrt) { - int hdrlen = sizeof(struct rt0_hdr) + (opt->srcrt->hdrlen << 3); - - put_cmsg(msg, SOL_IPV6, IPV6_RXSRCRT, hdrlen, opt->srcrt); + if (np->rxopt.bits.hopopts && opt->hop) { + u8 *ptr = skb->nh.raw + opt->hop; + put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr); + } + if (np->rxopt.bits.dstopts && opt->dst0) { + u8 *ptr = skb->nh.raw + opt->dst0; + put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, (ptr[1]+1)<<3, ptr); + } + if (np->rxopt.bits.srcrt && opt->srcrt) { + struct ipv6_rt_hdr *rthdr = (struct ipv6_rt_hdr *)(skb->nh.raw + opt->srcrt); + put_cmsg(msg, SOL_IPV6, IPV6_RTHDR, (rthdr->hdrlen+1) << 3, rthdr); + } + if (np->rxopt.bits.authhdr && opt->auth) { + u8 *ptr = skb->nh.raw + opt->auth; + put_cmsg(msg, SOL_IPV6, IPV6_AUTHHDR, (ptr[1]+1)<<2, ptr); + } + if (np->rxopt.bits.dstopts && opt->dst1) { + u8 *ptr = skb->nh.raw + opt->dst1; + put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, (ptr[1]+1)<<3, ptr); } return 0; } int datagram_send_ctl(struct msghdr *msg, int *oif, - struct in6_addr **src_addr, struct ipv6_options *opt, + struct in6_addr **src_addr, struct ipv6_txoptions *opt, int *hlimit) { struct in6_pktinfo *src_info; struct 
cmsghdr *cmsg; struct ipv6_rt_hdr *rthdr; + struct ipv6_opt_hdr *hdr; int len; int err = 0; for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + + if ((unsigned long)(((char*)cmsg - (char*)msg->msg_control) + + cmsg->cmsg_len) > msg->msg_controllen) { + err = -EINVAL; + goto exit_f; + } + if (cmsg->cmsg_level != SOL_IPV6) { - printk(KERN_DEBUG "invalid cmsg_level %d\n", cmsg->cmsg_level); + if (net_ratelimit()) + printk(KERN_DEBUG "invalid cmsg_level %d\n", cmsg->cmsg_level); continue; } switch (cmsg->cmsg_type) { case IPV6_PKTINFO: - if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in6_pktinfo))) { + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) { err = -EINVAL; goto exit_f; } @@ -100,14 +124,77 @@ int datagram_send_ctl(struct msghdr *msg, int *oif, } break; - - case IPV6_RXSRCRT: + + case IPV6_HOPOPTS: + if (opt->hopopt || cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { + err = -EINVAL; + goto exit_f; + } + + hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); + len = ((hdr->hdrlen + 1) << 3); + if (cmsg->cmsg_len < CMSG_LEN(len)) { + err = -EINVAL; + goto exit_f; + } + if (!capable(CAP_NET_RAW)) { + err = -EPERM; + goto exit_f; + } + opt->opt_nflen += len; + opt->hopopt = hdr; + break; + + case IPV6_DSTOPTS: + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { + err = -EINVAL; + goto exit_f; + } + + hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); + len = ((hdr->hdrlen + 1) << 3); + if (cmsg->cmsg_len < CMSG_LEN(len)) { + err = -EINVAL; + goto exit_f; + } + if (!capable(CAP_NET_RAW)) { + err = -EPERM; + goto exit_f; + } + if (opt->dst1opt) { + err = -EINVAL; + goto exit_f; + } + opt->opt_flen += len; + opt->dst1opt = hdr; + break; + + case IPV6_AUTHHDR: + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { + err = -EINVAL; + goto exit_f; + } + + hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); + len = ((hdr->hdrlen + 2) << 2); + if (cmsg->cmsg_len < CMSG_LEN(len)) { + err = -EINVAL; + goto exit_f; + } + if (len & ~7) { + err = -EINVAL; + goto exit_f; + } + opt->opt_flen += len; + opt->auth = hdr; + break; + + case IPV6_RTHDR: if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_rt_hdr))) { err = -EINVAL; goto exit_f; } - len = cmsg->cmsg_len - sizeof(struct cmsghdr); rthdr = (struct ipv6_rt_hdr *)CMSG_DATA(cmsg); /* @@ -118,7 +205,9 @@ int datagram_send_ctl(struct msghdr *msg, int *oif, goto exit_f; } - if (((rthdr->hdrlen + 1) << 3) < len) { + len = ((rthdr->hdrlen + 1) << 3); + + if (cmsg->cmsg_len < CMSG_LEN(len)) { err = -EINVAL; goto exit_f; } @@ -128,12 +217,21 @@ int datagram_send_ctl(struct msghdr *msg, int *oif, err = -EINVAL; goto exit_f; } - - opt->opt_nflen += ((rthdr->hdrlen + 1) << 3); + + opt->opt_nflen += len; opt->srcrt = rthdr; + if (opt->dst1opt) { + int dsthdrlen = ((opt->dst1opt->hdrlen+1)<<3); + + opt->opt_nflen += dsthdrlen; + opt->dst0opt = opt->dst1opt; + opt->dst1opt = NULL; + opt->opt_flen -= dsthdrlen; + } + break; - + case IPV6_HOPLIMIT: if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) { err = -EINVAL; diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 0b826870f..89d58936d 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -5,8 +5,9 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Andi Kleen <ak@muc.de> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * - * $Id: exthdrs.c,v 1.6 1998/04/30 16:24:20 freitag Exp $ + * $Id: exthdrs.c,v 1.7 1998/08/26 12:04:49 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -37,55 
+38,192 @@ #include <asm/uaccess.h> -#define swap(a,b) do { typeof (a) tmp; tmp = (a); (a) = (b); (b) = (tmp); } while(0) +/* + * Parsing inbound headers. + * + * Parsing function "func" returns pointer to the place, + * where next nexthdr value is stored or NULL, if parsing + * failed. It should also update skb->h. + */ + +struct hdrtype_proc +{ + int type; + u8* (*func) (struct sk_buff **, u8 *ptr); +}; /* - * inbound + * Parsing tlv encoded headers. + * + * Parsing function "func" returns 1, if parsing succeed + * and 0, if it failed. + * It MUST NOT touch skb->h. */ -#if 0 -int ipv6_routing_header(struct sk_buff **skb_ptr, struct device *dev, - __u8 *nhptr, struct ipv6_options *opt) + +struct tlvtype_proc +{ + int type; + int (*func) (struct sk_buff *, __u8 *ptr); +}; + +/********************* + Generic functions + *********************/ + +/* An unknown option is detected, decide what to do */ + +int ip6_tlvopt_unknown(struct sk_buff *skb, u8 *opt) +{ + switch ((opt[0] & 0xC0) >> 6) { + case 0: /* ignore */ + return 1; + + case 1: /* drop packet */ + break; + + case 3: /* Send ICMP if not a multicast address and drop packet */ + /* Actually, it is redundant check. icmp_send + will recheck in any case. + */ + if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) + break; + case 2: /* send ICMP PARM PROB regardless and drop packet */ + icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, opt); + return 0; + }; + + kfree_skb(skb); + return 0; +} + +/* Parse tlv encoded option header (hop-by-hop or destination) */ + +static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff *skb, + __u8 *nhptr) +{ + struct tlvtype_proc *curr; + u8 *ptr = skb->h.raw; + int len = ((ptr[1]+1)<<3) - 2; + + ptr += 2; + + if (skb->tail - (ptr + len) < 0) { + kfree_skb(skb); + return 0; + } + + while (len > 0) { + int optlen = ptr[1]+2; + + switch (ptr[0]) { + case IPV6_TLV_PAD0: + optlen = 1; + break; + + case IPV6_TLV_PADN: + break; + + default: /* Other TLV code so scan list */ + for (curr=procs; curr->type >= 0; curr++) { + if (curr->type == ptr[0]) { + if (curr->func(skb, ptr) == 0) + return 0; + break; + } + } + if (curr->type < 0) { + if (ip6_tlvopt_unknown(skb, ptr) == 0) + return 0; + } + break; + } + ptr += optlen; + len -= optlen; + } + if (len == 0) + return 1; + kfree_skb(skb); + return 0; +} + +/***************************** + Destination options header. + *****************************/ + +struct tlvtype_proc tlvprocdestopt_lst[] = { + /* No destination options are defined now */ + {-1, NULL} +}; + +static u8 *ipv6_dest_opt(struct sk_buff **skb_ptr, u8 *nhptr) +{ + struct sk_buff *skb=*skb_ptr; + struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb; + struct ipv6_destopt_hdr *hdr = (struct ipv6_destopt_hdr *) skb->h.raw; + + opt->dst1 = (u8*)hdr - skb->nh.raw; + + if (ip6_parse_tlv(tlvprocdestopt_lst, skb, nhptr)) { + skb->h.raw += ((hdr->hdrlen+1)<<3); + return &hdr->nexthdr; + } + + return NULL; +} + +/******************************** + NONE header. No data in packet. + ********************************/ + +static u8 *ipv6_nodata(struct sk_buff **skb_ptr, u8 *nhptr) +{ + kfree_skb(*skb_ptr); + return NULL; +} + +/******************************** + Routing header. 
+ ********************************/ + +static u8* ipv6_routing_header(struct sk_buff **skb_ptr, u8 *nhptr) { struct sk_buff *skb = *skb_ptr; + struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb; struct in6_addr *addr; struct in6_addr daddr; - int addr_type = 0; - int strict = 0; - __u32 bit_map; - int pos; + int addr_type; int n, i; struct ipv6_rt_hdr *hdr = (struct ipv6_rt_hdr *) skb->h.raw; struct rt0_hdr *rthdr; - if (hdr->segments_left == 0) { - struct ipv6_options *opt; - - opt = (struct ipv6_options *) skb->cb; - opt->srcrt = hdr; + if (((hdr->hdrlen+1)<<3) > skb->tail - skb->h.raw) { + ipv6_statistics.Ip6InHdrErrors++; + kfree_skb(skb); + return NULL; + } +looped_back: + if (hdr->segments_left == 0) { + opt->srcrt = (u8*)hdr - skb->nh.raw; skb->h.raw += (hdr->hdrlen + 1) << 3; - return hdr->nexthdr; + opt->dst0 = opt->dst1; + opt->dst1 = 0; + return &hdr->nexthdr; } - if (hdr->type != IPV6_SRCRT_TYPE_0 || hdr->hdrlen & 0x01 || - hdr->hdrlen > 46) { - /* - * Discard - */ - - pos = (__u8 *) hdr - (__u8 *) skb->nh.ipv6h + 2; + if (hdr->type != IPV6_SRCRT_TYPE_0 || hdr->hdrlen & 0x01) { + u8 *pos = (u8*) hdr; - if (hdr->type) + if (hdr->type != IPV6_SRCRT_TYPE_0) pos += 2; else pos += 1; - icmpv6_send(skb, ICMPV6_PARAMETER_PROB, 0, pos, dev); - kfree_skb(skb); - return 0; + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, pos); + return NULL; } - + /* * This is the routing header forwarding algorithm from * RFC 1883, page 17. @@ -94,13 +232,21 @@ int ipv6_routing_header(struct sk_buff **skb_ptr, struct device *dev, n = hdr->hdrlen >> 1; if (hdr->segments_left > n) { - pos = (__u8 *) hdr - (__u8 *) skb->nh.ipv6h + 2; - - pos += 3; + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, &hdr->segments_left); + return NULL; + } - icmpv6_send(skb, ICMPV6_PARAMETER_PROB, 0, pos, dev); + /* We are about to mangle packet header. Be careful! + Do not damage packets queued somewhere. + */ + if (skb_cloned(skb)) { + struct sk_buff *skb2 = skb_copy(skb, GFP_ATOMIC); kfree_skb(skb); - return 0; + if (skb2 == NULL) + return NULL; + *skb_ptr = skb = skb2; + opt = (struct inet6_skb_parm *)skb2->cb; + hdr = (struct ipv6_rt_hdr *) skb2->h.raw; } i = n - --hdr->segments_left; @@ -113,58 +259,429 @@ int ipv6_routing_header(struct sk_buff **skb_ptr, struct device *dev, if (addr_type == IPV6_ADDR_MULTICAST) { kfree_skb(skb); - return 0; + return NULL; } ipv6_addr_copy(&daddr, addr); ipv6_addr_copy(addr, &skb->nh.ipv6h->daddr); ipv6_addr_copy(&skb->nh.ipv6h->daddr, &daddr); - /* - * Check Strick Source Route + dst_release(xchg(&skb->dst, NULL)); + ip6_route_input(skb); + if (skb->dst->error) { + skb->dst->input(skb); + return NULL; + } + if (skb->dst->dev->flags&IFF_LOOPBACK) { + if (skb->nh.ipv6h->hop_limit <= 1) { + icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, + 0, skb->dev); + kfree_skb(skb); + return NULL; + } + skb->nh.ipv6h->hop_limit--; + goto looped_back; + } + + skb->dst->input(skb); + return NULL; +} + +/* + This function inverts received rthdr. + NOTE: specs allow to make it automatically only if + packet authenticated. + + I will not discuss it here (though, I am really pissed off at + this stupid requirement making rthdr idea useless) + + Actually, it creates severe problems for us. + Embrionic requests has no associated sockets, + so that user have no control over it and + cannot not only to set reply options, but + even to know, that someone wants to connect + without success. :-( + + For now we need to test the engine, so that I created + temporary (or permanent) backdoor. 
+ If listening socket set IPV6_RTHDR to 2, then we invert header. + --ANK (980729) + */ + +struct ipv6_txoptions * +ipv6_invert_rthdr(struct sock *sk, struct ipv6_rt_hdr *hdr) +{ + /* Received rthdr: + + [ H1 -> H2 -> ... H_prev ] daddr=ME + + Inverted result: + [ H_prev -> ... -> H1 ] daddr =sender + + Note, that IP output engine will rewrire this rthdr + by rotating it left by one addr. */ - bit_map = ntohl(rthdr->bitmap); + int n, i; + struct rt0_hdr *rthdr = (struct rt0_hdr*)hdr; + struct rt0_hdr *irthdr; + struct ipv6_txoptions *opt; + int hdrlen = ipv6_optlen(hdr); + + if (hdr->segments_left || + hdr->type != IPV6_SRCRT_TYPE_0 || + hdr->hdrlen & 0x01) + return NULL; - if ((bit_map & (1 << i)) == IPV6_SRCRT_STRICT) - strict = 1; + n = hdr->hdrlen >> 1; + opt = sock_kmalloc(sk, sizeof(*opt) + hdrlen, GFP_ATOMIC); + if (opt == NULL) + return NULL; + memset(opt, 0, sizeof(*opt)); + opt->tot_len = sizeof(*opt) + hdrlen; + opt->srcrt = (void*)(opt+1); + opt->opt_nflen = hdrlen; + + memcpy(opt->srcrt, hdr, sizeof(*hdr)); + irthdr = (struct rt0_hdr*)opt->srcrt; + /* Obsolete field, MBZ, when originated by us */ + irthdr->bitmap = 0; + opt->srcrt->segments_left = n; + for (i=0; i<n; i++) + memcpy(irthdr->addr+i, rthdr->addr+(n-1-i), 16); + return opt; +} - ipv6_forward(skb, dev, (strict ? IP6_FW_STRICT : 0) | IP6_FW_SRCRT); +/******************************** + AUTH header. + ********************************/ +/* + rfc1826 said, that if a host does not implement AUTH header + it MAY ignore it. We use this hole 8) + + Actually, now we can implement OSPFv6 without kernel IPsec. + Authentication for poors may be done in user space with the same success. + + Yes, it means, that we allow application to send/receive + raw authentication header. Apparently, we suppose, that it knows + what it does and calculates authentication data correctly. + Certainly, it is possible only for udp and raw sockets, but not for tcp. + + BTW I beg pardon, it is not good place for flames, but + I cannot be silent 8) It is very sad, but fools prevail 8) + AUTH header has 4byte granular length, what kills all the idea + behind AUTOMATIC 64bit alignment of IPv6. Now we will loose + cpu ticks, checking that sender did not something stupid + and opt->hdrlen is even. Shit! --ANK (980730) + */ + +static u8 *ipv6_auth_hdr(struct sk_buff **skb_ptr, u8 *nhptr) +{ + struct sk_buff *skb=*skb_ptr; + struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb; + struct ipv6_opt_hdr *hdr = (struct ipv6_opt_hdr *)skb->h.raw; + int len = (hdr->hdrlen+2)<<2; + + opt->auth = (u8*)hdr - skb->nh.raw; + if (skb->h.raw + len > skb->tail) + return NULL; + skb->h.raw += len; + return &hdr->nexthdr; +} + +/* This list MUST NOT contain entry for NEXTHDR_HOP. + It is parsed immediately after packet received + and if it occurs somewhere in another place we must + generate error. 
+ */ + +struct hdrtype_proc hdrproc_lst[] = { + {NEXTHDR_FRAGMENT, ipv6_reassembly}, + {NEXTHDR_ROUTING, ipv6_routing_header}, + {NEXTHDR_DEST, ipv6_dest_opt}, + {NEXTHDR_NONE, ipv6_nodata}, + {NEXTHDR_AUTH, ipv6_auth_hdr}, + /* + {NEXTHDR_ESP, ipv6_esp_hdr}, + */ + {-1, NULL} +}; + +u8 *ipv6_parse_exthdrs(struct sk_buff **skb_in, u8 *nhptr) +{ + struct hdrtype_proc *hdrt; + u8 nexthdr = *nhptr; + +restart: + for (hdrt=hdrproc_lst; hdrt->type >= 0; hdrt++) { + if (hdrt->type == nexthdr) { + if ((nhptr = hdrt->func(skb_in, nhptr)) != NULL) { + nexthdr = *nhptr; + goto restart; + } + return NULL; + } + } + return nhptr; +} + + +/********************************** + Hop-by-hop options. + **********************************/ + +/* Router Alert as of draft-ietf-ipngwg-ipv6router-alert-04 */ + +static int ipv6_hop_ra(struct sk_buff *skb, u8 *ptr) +{ + if (ptr[1] == 2) { + ((struct inet6_skb_parm*)skb->cb)->ra = ptr - skb->nh.raw; + return 1; + } + if (net_ratelimit()) + printk(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", ptr[1]); + kfree_skb(skb); return 0; } +/* Jumbo payload */ + +static int ipv6_hop_jumbo(struct sk_buff *skb, u8 *ptr) +{ + u32 pkt_len; + + if (ptr[1] != 4 || ((ptr-skb->nh.raw)&3) != 2) { + if (net_ratelimit()) + printk(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", ptr[1]); + goto drop; + } + + pkt_len = ntohl(*(u32*)(ptr+2)); + if (pkt_len < 0x10000) { + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ptr+2); + return 0; + } + if (skb->nh.ipv6h->payload_len) { + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ptr); + return 0; + } + + if (pkt_len > skb->len - sizeof(struct ipv6hdr)) { + ipv6_statistics.Ip6InTruncatedPkts++; + goto drop; + } + skb_trim(skb, pkt_len + sizeof(struct ipv6hdr)); + return 1; + +drop: + kfree_skb(skb); + return 0; +} + +struct tlvtype_proc tlvprochopopt_lst[] = { + {IPV6_TLV_ROUTERALERT, ipv6_hop_ra}, + {IPV6_TLV_JUMBO, ipv6_hop_jumbo}, + {-1, NULL} +}; + +u8 * ipv6_parse_hopopts(struct sk_buff *skb, u8 *nhptr) +{ + ((struct inet6_skb_parm*)skb->cb)->hop = sizeof(struct ipv6hdr); + if (ip6_parse_tlv(tlvprochopopt_lst, skb, nhptr)) + return nhptr+((nhptr[1]+1)<<3); + return NULL; +} /* - * outbound + * Creating outbound headers. + * + * "build" functions work when skb is filled from head to tail (datagram) + * "push" functions work when headers are added from tail to head (tcp) + * + * In both cases we assume, that caller reserved enough room + * for headers. 
*/ -int ipv6opt_bld_rthdr(struct sk_buff *skb, struct ipv6_options *opt, - struct in6_addr *addr) +u8 *ipv6_build_rthdr(struct sk_buff *skb, u8 *prev_hdr, + struct ipv6_rt_hdr *opt, struct in6_addr *addr) { struct rt0_hdr *phdr, *ihdr; int hops; - ihdr = (struct rt0_hdr *) opt->srcrt; + ihdr = (struct rt0_hdr *) opt; phdr = (struct rt0_hdr *) skb_put(skb, (ihdr->rt_hdr.hdrlen + 1) << 3); - memcpy(phdr, ihdr, sizeof(struct ipv6_rt_hdr)); + memcpy(phdr, ihdr, sizeof(struct rt0_hdr)); hops = ihdr->rt_hdr.hdrlen >> 1; - + if (hops > 1) memcpy(phdr->addr, ihdr->addr + 1, (hops - 1) * sizeof(struct in6_addr)); ipv6_addr_copy(phdr->addr + (hops - 1), addr); + + phdr->rt_hdr.nexthdr = *prev_hdr; + *prev_hdr = NEXTHDR_ROUTING; + return &phdr->rt_hdr.nexthdr; +} + +static u8 *ipv6_build_exthdr(struct sk_buff *skb, u8 *prev_hdr, u8 type, struct ipv6_opt_hdr *opt) +{ + struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb_put(skb, ipv6_optlen(opt)); + + memcpy(h, opt, ipv6_optlen(opt)); + h->nexthdr = *prev_hdr; + *prev_hdr = type; + return &h->nexthdr; +} + +static u8 *ipv6_build_authhdr(struct sk_buff *skb, u8 *prev_hdr, struct ipv6_opt_hdr *opt) +{ + struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb_put(skb, (opt->hdrlen+2)<<2); + + memcpy(h, opt, (opt->hdrlen+2)<<2); + h->nexthdr = *prev_hdr; + *prev_hdr = NEXTHDR_AUTH; + return &h->nexthdr; +} + + +u8 *ipv6_build_nfrag_opts(struct sk_buff *skb, u8 *prev_hdr, struct ipv6_txoptions *opt, + struct in6_addr *daddr, u32 jumbolen) +{ + struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb->data; + + if (opt && opt->hopopt) + prev_hdr = ipv6_build_exthdr(skb, prev_hdr, NEXTHDR_HOP, opt->hopopt); + + if (jumbolen) { + u8 *jumboopt = (u8 *)skb_put(skb, 8); + + if (opt && opt->hopopt) { + *jumboopt++ = IPV6_TLV_PADN; + *jumboopt++ = 0; + h->hdrlen++; + } else { + h = (struct ipv6_opt_hdr *)jumboopt; + h->nexthdr = *prev_hdr; + h->hdrlen = 0; + jumboopt += 2; + *prev_hdr = NEXTHDR_HOP; + prev_hdr = &h->nexthdr; + } + jumboopt[0] = IPV6_TLV_JUMBO; + jumboopt[1] = 4; + *(u32*)(jumboopt+2) = htonl(jumbolen); + } + if (opt) { + if (opt->dst0opt) + prev_hdr = ipv6_build_exthdr(skb, prev_hdr, NEXTHDR_DEST, opt->dst0opt); + if (opt->srcrt) + prev_hdr = ipv6_build_rthdr(skb, prev_hdr, opt->srcrt, daddr); + } + return prev_hdr; +} + +u8 *ipv6_build_frag_opts(struct sk_buff *skb, u8 *prev_hdr, struct ipv6_txoptions *opt) +{ + if (opt->auth) + prev_hdr = ipv6_build_authhdr(skb, prev_hdr, opt->auth); + if (opt->dst1opt) + prev_hdr = ipv6_build_exthdr(skb, prev_hdr, NEXTHDR_DEST, opt->dst1opt); + return prev_hdr; +} + +static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto, + struct ipv6_rt_hdr *opt, + struct in6_addr **addr_p) +{ + struct rt0_hdr *phdr, *ihdr; + int hops; + + ihdr = (struct rt0_hdr *) opt; - phdr->rt_hdr.nexthdr = proto; - return NEXTHDR_ROUTING; + phdr = (struct rt0_hdr *) skb_push(skb, (ihdr->rt_hdr.hdrlen + 1) << 3); + memcpy(phdr, ihdr, sizeof(struct rt0_hdr)); + + hops = ihdr->rt_hdr.hdrlen >> 1; + + if (hops > 1) + memcpy(phdr->addr, ihdr->addr + 1, + (hops - 1) * sizeof(struct in6_addr)); + + ipv6_addr_copy(phdr->addr + (hops - 1), *addr_p); + *addr_p = ihdr->addr; + + phdr->rt_hdr.nexthdr = *proto; + *proto = NEXTHDR_ROUTING; +} + +static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv6_opt_hdr *opt) +{ + struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb_push(skb, ipv6_optlen(opt)); + + memcpy(h, opt, ipv6_optlen(opt)); + h->nexthdr = *proto; + *proto = type; } -#endif + +static void ipv6_push_authhdr(struct 
sk_buff *skb, u8 *proto, struct ipv6_opt_hdr *opt) +{ + struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb_push(skb, (opt->hdrlen+2)<<2); + + memcpy(h, opt, (opt->hdrlen+2)<<2); + h->nexthdr = *proto; + *proto = NEXTHDR_AUTH; +} + +void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, + u8 *proto, + struct in6_addr **daddr) +{ + if (opt->srcrt) + ipv6_push_rthdr(skb, proto, opt->srcrt, daddr); + if (opt->dst0opt) + ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt); + if (opt->hopopt) + ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt); +} + +void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto) +{ + if (opt->dst1opt) + ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt); + if (opt->auth) + ipv6_push_authhdr(skb, proto, opt->auth); +} + +struct ipv6_txoptions * +ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) +{ + struct ipv6_txoptions *opt2; + + opt2 = sock_kmalloc(sk, opt->tot_len, GFP_ATOMIC); + if (opt2) { + long dif = (char*)opt2 - (char*)opt; + memcpy(opt2, opt, opt->tot_len); + if (opt2->hopopt) + *((char**)&opt2->hopopt) += dif; + if (opt2->dst0opt) + *((char**)&opt2->dst0opt) += dif; + if (opt2->dst1opt) + *((char**)&opt2->dst1opt) += dif; + if (opt2->auth) + *((char**)&opt2->auth) += dif; + if (opt2->srcrt) + *((char**)&opt2->srcrt) += dif; + } + return opt2; +} + /* - * find out if nexthdr is an extension header or a protocol + * find out if nexthdr is a well-known extension header or a protocol */ static __inline__ int ipv6_ext_hdr(u8 nexthdr) @@ -175,11 +692,9 @@ static __inline__ int ipv6_ext_hdr(u8 nexthdr) return ( (nexthdr == NEXTHDR_HOP) || (nexthdr == NEXTHDR_ROUTING) || (nexthdr == NEXTHDR_FRAGMENT) || - (nexthdr == NEXTHDR_ESP) || (nexthdr == NEXTHDR_AUTH) || (nexthdr == NEXTHDR_NONE) || (nexthdr == NEXTHDR_DEST) ); - } /* @@ -200,34 +715,57 @@ static __inline__ int ipv6_ext_hdr(u8 nexthdr) * * But I see no other way to do this. This might need to be reexamined * when Linux implements ESP (and maybe AUTH) headers. + * --AK + * + * This function parses (probably truncated) exthdr set "hdr" + * of length "len". "nexthdrp" initially points to some place, + * where type of the first header can be found. + * + * It skips all well-known exthdrs, and returns pointer to the start + * of unparsable area i.e. the first header with unknown type. + * If it is not NULL *nexthdr is updated by type/protocol of this header. + * + * NOTES: - if packet terminated with NEXTHDR_NONE it returns NULL. + * - it may return pointer pointing beyond end of packet, + * if the last recognized header is truncated in the middle. + * - if packet is truncated, so that all parsed headers are skipped, + * it returns NULL. + * - First fragment header is skipped, not-first ones + * are considered as unparsable. + * - ESP is unparsable for now and considered like + * normal payload protocol. + * - Note also special handling of AUTH header. Thanks to IPsec wizards. 
+ * + * --ANK (980726) */ -struct ipv6_opt_hdr *ipv6_skip_exthdr(struct ipv6_opt_hdr *hdr, - u8 *nexthdrp, int len) + +u8 *ipv6_skip_exthdr(struct ipv6_opt_hdr *hdr, u8 *nexthdrp, int len) { u8 nexthdr = *nexthdrp; while (ipv6_ext_hdr(nexthdr)) { int hdrlen; - - if (nexthdr == NEXTHDR_NONE) + + if (len < sizeof(struct ipv6_opt_hdr)) return NULL; - if (len < sizeof(struct ipv6_opt_hdr)) /* be anal today */ + if (nexthdr == NEXTHDR_NONE) return NULL; - - hdrlen = ipv6_optlen(hdr); - if (len < hdrlen) - return NULL; + if (nexthdr == NEXTHDR_FRAGMENT) { + struct frag_hdr *fhdr = (struct frag_hdr *) hdr; + if (ntohs(fhdr->frag_off) & ~0x7) + break; + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hdr->hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(hdr); nexthdr = hdr->nexthdr; hdr = (struct ipv6_opt_hdr *) ((u8*)hdr + hdrlen); len -= hdrlen; } - /* Hack.. Do the same for AUTH headers? */ - if (nexthdr == NEXTHDR_ESP) - return NULL; - *nexthdrp = nexthdr; - return hdr; + return (u8*)hdr; } diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index c3b6f7b6b..d43d1f98d 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: icmp.c,v 1.18 1998/05/07 15:42:59 davem Exp $ + * $Id: icmp.c,v 1.19 1998/08/26 12:04:52 davem Exp $ * * Based on net/ipv4/icmp.c * @@ -58,16 +58,15 @@ #include <asm/uaccess.h> #include <asm/system.h> +struct icmpv6_mib icmpv6_statistics; + /* * ICMP socket for flow control. */ struct socket *icmpv6_socket; -int icmpv6_rcv(struct sk_buff *skb, struct device *dev, - struct in6_addr *saddr, struct in6_addr *daddr, - struct ipv6_options *opt, unsigned short len, - int redo, struct inet6_protocol *protocol); +int icmpv6_rcv(struct sk_buff *skb, unsigned long len); static struct inet6_protocol icmpv6_protocol = { @@ -80,8 +79,6 @@ static struct inet6_protocol icmpv6_protocol = "ICMPv6" /* name */ }; - - struct icmpv6_msg { struct icmp6hdr icmph; __u8 *data; @@ -105,8 +102,11 @@ static int icmpv6_getfrag(const void *data, struct in6_addr *saddr, /* * in theory offset must be 0 since we never send more - * than 576 bytes on an error or more than the path mtu + * than IPV6_MIN_MTU bytes on an error or more than the path mtu * on an echo reply. (those are the rules on RFC 1883) + * + * Luckily, this statement is obsolete after + * draft-ietf-ipngwg-icmp-v2-00 --ANK (980730) */ if (offset) { @@ -143,13 +143,36 @@ void icmpv6_param_prob(struct sk_buff *skb, int code, void *pos) kfree_skb(skb); } -static inline int is_icmp(struct ipv6hdr *hdr, int len) +/* + * Figure out, may we reply to this packet with icmp error. + * + * We do not reply, if: + * - it was icmp error message. + * - it is truncated, so that it is known, that protocol is ICMPV6 + * (i.e. in the middle of some exthdr) + * - it is not the first fragment. BTW IPv6 specs say nothing about + * this case, but it is clear, that our reply would be useless + * for sender. 
+ * + * --ANK (980726) + */ + +static int is_ineligible(struct ipv6hdr *hdr, int len) { - __u8 nexthdr = hdr->nexthdr; + u8 *ptr; + __u8 nexthdr = hdr->nexthdr; + + if (len < (int)sizeof(*hdr)) + return 1; - if (!ipv6_skip_exthdr((struct ipv6_opt_hdr *)(hdr+1), &nexthdr, len)) - return 0; - return nexthdr == IPPROTO_ICMP; + ptr = ipv6_skip_exthdr((struct ipv6_opt_hdr *)(hdr+1), &nexthdr, len - sizeof(*hdr)); + if (!ptr) + return 0; + if (nexthdr == IPPROTO_ICMPV6) { + struct icmp6hdr *ihdr = (struct icmp6hdr *)ptr; + return (ptr - (u8*)hdr) > len || !(ihdr->icmp6_type & 0x80); + } + return nexthdr == NEXTHDR_FRAGMENT; } int sysctl_icmpv6_time = 1*HZ; @@ -160,31 +183,37 @@ int sysctl_icmpv6_time = 1*HZ; static inline int icmpv6_xrlim_allow(struct sock *sk, int type, struct flowi *fl) { -#if 0 - struct dst_entry *dst; - int allow = 0; -#endif + struct dst_entry *dst; + int res = 0; + /* Informational messages are not limited. */ if (type & 0x80) - return 1; + return 1; -#if 0 /* not yet, first fix routing COW */ + /* Do not limit pmtu discovery, it would break it. */ + if (type == ICMPV6_PKT_TOOBIG) + return 1; /* * Look up the output route. * XXX: perhaps the expire for routing entries cloned by * this lookup should be more aggressive (not longer than timeout). */ - dst = ip6_route_output(sk, fl, 1); - if (dst->error) + dst = ip6_route_output(sk, fl); + if (dst->error) ipv6_statistics.Ip6OutNoRoutes++; - else - allow = xrlim_allow(dst, sysctl_icmpv6_time); + else { + struct rt6_info *rt = (struct rt6_info *)dst; + int tmo = sysctl_icmpv6_time; + + /* Give more bandwidth to wider prefixes. */ + if (rt->rt6i_dst.plen < 128) + tmo >>= ((128 - rt->rt6i_dst.plen)>>5); + + res = xrlim_allow(dst, tmo); + } dst_release(dst); - return allow; -#else - return 1; -#endif + return res; } /* @@ -196,7 +225,7 @@ static inline int icmpv6_xrlim_allow(struct sock *sk, int type, static __inline__ int opt_unrec(struct sk_buff *skb, __u32 offset) { - char *buff = skb->nh.raw; + u8 *buff = skb->nh.raw; return ( ( *(buff + offset) & 0xC0 ) == 0x80 ); } @@ -215,7 +244,6 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, struct icmpv6_msg msg; struct flowi fl; int addr_type = 0; - int optlen; int len; /* @@ -237,7 +265,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, addr_type = ipv6_addr_type(&hdr->daddr); - if (ipv6_chk_addr(&hdr->daddr, NULL, 0)) + if (ipv6_chk_addr(&hdr->daddr, skb->dev, 0)) saddr = &hdr->daddr; /* @@ -275,8 +303,9 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, /* * Never answer to a ICMP packet. 
*/ - if (is_icmp(hdr, (u8*)skb->tail - (u8*)hdr)) { - printk(KERN_DEBUG "icmpv6_send: no reply to icmp\n"); + if (is_ineligible(hdr, (u8*)skb->tail - (u8*)hdr)) { + if (net_ratelimit()) + printk(KERN_DEBUG "icmpv6_send: no reply to icmp error/fragment\n"); return; } @@ -303,34 +332,22 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, msg.data = skb->nh.raw; msg.csum = 0; msg.daddr = &hdr->saddr; - /* - if (skb->opt) - optlen = skb->opt->optlen; - else - */ - - optlen = 0; - len = min(skb->tail - ((unsigned char *) hdr), - 576 - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr) - - optlen); + len = min((skb->tail - ((unsigned char *) hdr)) + sizeof(struct icmp6hdr), + IPV6_MIN_MTU - sizeof(struct icmp6hdr)); if (len < 0) { printk(KERN_DEBUG "icmp: len problem\n"); return; } - len += sizeof(struct icmp6hdr); - msg.len = len; ip6_build_xmit(sk, icmpv6_getfrag, &msg, &fl, len, NULL, -1, MSG_DONTWAIT); - - /* Oops! We must purge cached dst, otherwise - all the following ICMP messages will go there :) --ANK - */ - dst_release(xchg(&sk->dst_cache, NULL)); + if (type >= ICMPV6_DEST_UNREACH && type <= ICMPV6_PARAMPROB) + (&icmpv6_statistics.Icmp6OutDestUnreachs)[type-1]++; + icmpv6_statistics.Icmp6OutMsgs++; } static void icmpv6_echo_reply(struct sk_buff *skb) @@ -374,38 +391,41 @@ static void icmpv6_echo_reply(struct sk_buff *skb) ip6_build_xmit(sk, icmpv6_getfrag, &msg, &fl, len, NULL, -1, MSG_DONTWAIT); - - /* Oops! We must purge cached dst, otherwise - all the following ICMP messages will go there :) --ANK - */ - dst_release(xchg(&sk->dst_cache, NULL)); + icmpv6_statistics.Icmp6OutEchoReplies++; + icmpv6_statistics.Icmp6OutMsgs++; } static void icmpv6_notify(struct sk_buff *skb, - int type, int code, unsigned char *buff, int len, - struct in6_addr *saddr, struct in6_addr *daddr, - struct inet6_protocol *protocol) + int type, int code, unsigned char *buff, int len) { + struct in6_addr *saddr = &skb->nh.ipv6h->saddr; + struct in6_addr *daddr = &skb->nh.ipv6h->daddr; struct ipv6hdr *hdr = (struct ipv6hdr *) buff; struct inet6_protocol *ipprot; struct sock *sk; - struct ipv6_opt_hdr *pb; + u8 *pb; __u32 info = 0; int hash; u8 nexthdr; nexthdr = hdr->nexthdr; - pb = (struct ipv6_opt_hdr *) (hdr + 1); len -= sizeof(struct ipv6hdr); if (len < 0) return; /* now skip over extension headers */ - pb = ipv6_skip_exthdr(pb, &nexthdr, len); + pb = ipv6_skip_exthdr((struct ipv6_opt_hdr *) (hdr + 1), &nexthdr, len); if (!pb) return; + /* BUGGG_FUTURE: we should try to parse exthdrs in this packet. + Without this we will not able f.e. to make source routed + pmtu discovery. + Corresponding argument (opt) to notifiers is already added. 
+ --ANK (980726) + */ + hash = nexthdr & (MAX_INET_PROTOS - 1); for (ipprot = (struct inet6_protocol *) inet6_protos[hash]; @@ -414,9 +434,8 @@ static void icmpv6_notify(struct sk_buff *skb, if (ipprot->protocol != nexthdr) continue; - if (ipprot->err_handler) - ipprot->err_handler(skb, type, code, (u8*)pb, info, - saddr, daddr, ipprot); + if (ipprot->err_handler) + ipprot->err_handler(skb, hdr, NULL, type, code, pb, info); return; } @@ -428,7 +447,7 @@ static void icmpv6_notify(struct sk_buff *skb, return; while((sk = raw_v6_lookup(sk, nexthdr, daddr, saddr))) { - rawv6_err(sk, type, code, (char*)pb, saddr, daddr); + rawv6_err(sk, skb, hdr, NULL, type, code, pb, info); sk = sk->next; } } @@ -437,14 +456,17 @@ static void icmpv6_notify(struct sk_buff *skb, * Handle icmp messages */ -int icmpv6_rcv(struct sk_buff *skb, struct device *dev, - struct in6_addr *saddr, struct in6_addr *daddr, - struct ipv6_options *opt, unsigned short len, - int redo, struct inet6_protocol *protocol) +int icmpv6_rcv(struct sk_buff *skb, unsigned long len) { + struct device *dev = skb->dev; + struct in6_addr *saddr = &skb->nh.ipv6h->saddr; + struct in6_addr *daddr = &skb->nh.ipv6h->daddr; struct ipv6hdr *orig_hdr; struct icmp6hdr *hdr = (struct icmp6hdr *) skb->h.raw; int ulen; + int type; + + icmpv6_statistics.Icmp6InMsgs++; /* Perform checksum. */ switch (skb->ip_summed) { @@ -480,8 +502,15 @@ int icmpv6_rcv(struct sk_buff *skb, struct device *dev, * length of original packet carried in skb */ ulen = skb->tail - (unsigned char *) (hdr + 1); - - switch (hdr->icmp6_type) { + + type = hdr->icmp6_type; + + if (type >= ICMPV6_DEST_UNREACH && type <= ICMPV6_PARAMPROB) + (&icmpv6_statistics.Icmp6InDestUnreachs)[type-ICMPV6_DEST_UNREACH]++; + else if (type >= ICMPV6_ECHO_REQUEST && type <= NDISC_REDIRECT) + (&icmpv6_statistics.Icmp6InEchos)[type-ICMPV6_ECHO_REQUEST]++; + + switch (type) { case ICMPV6_ECHO_REQUEST: icmpv6_echo_reply(skb); @@ -492,9 +521,14 @@ int icmpv6_rcv(struct sk_buff *skb, struct device *dev, break; case ICMPV6_PKT_TOOBIG: + /* BUGGG_FUTURE: if packet contains rthdr, we cannot update + standard destination cache. Seems, only "advanced" + destination cache will allow to solve this problem + --ANK (980726) + */ orig_hdr = (struct ipv6hdr *) (hdr + 1); if (ulen >= sizeof(struct ipv6hdr)) - rt6_pmtu_discovery(&orig_hdr->daddr, dev, + rt6_pmtu_discovery(&orig_hdr->daddr, &orig_hdr->saddr, dev, ntohl(hdr->icmp6_mtu)); /* @@ -504,10 +538,8 @@ int icmpv6_rcv(struct sk_buff *skb, struct device *dev, case ICMPV6_DEST_UNREACH: case ICMPV6_TIME_EXCEED: case ICMPV6_PARAMPROB: - - icmpv6_notify(skb, hdr->icmp6_type, hdr->icmp6_code, - (char *) (hdr + 1), ulen, - saddr, daddr, protocol); + icmpv6_notify(skb, type, hdr->icmp6_code, + (char *) (hdr + 1), ulen); break; case NDISC_ROUTER_SOLICITATION: @@ -515,7 +547,7 @@ int icmpv6_rcv(struct sk_buff *skb, struct device *dev, case NDISC_NEIGHBOUR_SOLICITATION: case NDISC_NEIGHBOUR_ADVERTISEMENT: case NDISC_REDIRECT: - ndisc_rcv(skb, dev, saddr, daddr, opt, len); + ndisc_rcv(skb, len); break; case ICMPV6_MGM_QUERY: @@ -530,23 +562,26 @@ int icmpv6_rcv(struct sk_buff *skb, struct device *dev, break; default: - printk(KERN_DEBUG "icmpv6: msg of unkown type\n"); + if (net_ratelimit()) + printk(KERN_DEBUG "icmpv6: msg of unkown type\n"); /* informational */ - if (hdr->icmp6_type & 0x80) - goto discard_it; + if (type & 0x80) + break; /* * error of unkown type. 
* must pass to upper level */ - icmpv6_notify(skb, hdr->icmp6_type, hdr->icmp6_code, - (char *) (hdr + 1), ulen, - saddr, daddr, protocol); + icmpv6_notify(skb, type, hdr->icmp6_code, + (char *) (hdr + 1), ulen); }; + kfree_skb(skb); + return 0; discard_it: + icmpv6_statistics.Icmp6InErrors++; kfree_skb(skb); return 0; } @@ -597,7 +632,7 @@ static struct icmp6_err { } tab_unreach[] = { { ENETUNREACH, 0}, /* NOROUTE */ { EACCES, 1}, /* ADM_PROHIBITED */ - { EOPNOTSUPP, 1}, /* NOT_NEIGHBOUR */ + { 0, 0}, /* Was NOT_NEIGHBOUR, now reserved */ { EHOSTUNREACH, 0}, /* ADDR_UNREACH */ { ECONNREFUSED, 1}, /* PORT_UNREACH */ }; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index e7e12e3ae..bad3a13ec 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: ip6_fib.c,v 1.14 1998/05/07 15:43:03 davem Exp $ + * $Id: ip6_fib.c,v 1.15 1998/08/26 12:04:55 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -32,10 +32,52 @@ #include <net/ip6_fib.h> #include <net/ip6_route.h> -#define RT_DEBUG 2 +#define RT6_DEBUG 2 +#undef CONFIG_IPV6_SUBTREES + +#if RT6_DEBUG >= 1 +#define BUG_TRAP(x) ({ if (!(x)) { printk("Assertion (" #x ") failed at " __FILE__ "(%d):" __FUNCTION__ "\n", __LINE__); } }) +#else +#define BUG_TRAP(x) do { ; } while (0) +#endif + +#if RT6_DEBUG >= 3 +#define RT6_TRACE(x...) printk(KERN_DEBUG x) +#else +#define RT6_TRACE(x...) do { ; } while (0) +#endif struct rt6_statistics rt6_stats; +enum fib_walk_state_t +{ +#ifdef CONFIG_IPV6_SUBTREES + FWS_S, +#endif + FWS_L, + FWS_R, + FWS_C, + FWS_U +}; + +struct fib6_cleaner_t +{ + struct fib6_walker_t w; + int (*func)(struct rt6_info *, void *arg); + void *arg; +}; + +#ifdef CONFIG_IPV6_SUBTREES +#define FWS_INIT FWS_S +#define SUBTREE(fn) ((fn)->subtree) +#else +#define FWS_INIT FWS_L +#define SUBTREE(fn) NULL +#endif + +static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt); +static void fib6_repair_tree(struct fib6_node *fn); + /* * A routing update causes an increase of the serial number on the * afected subtree. This allows for cached routes to be asynchronously @@ -48,10 +90,24 @@ static __u32 rt_sernum = 0; static struct timer_list ip6_fib_timer = { NULL, NULL, 0, - 0, + ~0UL, fib6_run_gc }; +static struct fib6_walker_t fib6_walker_list = { + &fib6_walker_list, &fib6_walker_list, +}; + +#define FOR_WALKERS(w) for ((w)=fib6_walker_list.next; (w) != &fib6_walker_list; (w)=(w)->next) + +static __inline__ u32 fib6_new_sernum(void) +{ + u32 n = ++rt_sernum; + if (n == 0) + n = ++rt_sernum; + return n; +} + /* * Auxiliary address test functions for the radix tree. 
* @@ -70,7 +126,7 @@ static __inline__ int addr_match(void *token1, void *token2, int prefixlen) int pdw; int pbi; - pdw = prefixlen >> 0x05; /* num of whole __u32 in prefix */ + pdw = prefixlen >> 5; /* num of whole __u32 in prefix */ pbi = prefixlen & 0x1f; /* num of bits in incomplete u32 in prefix */ if (pdw) @@ -78,15 +134,11 @@ static __inline__ int addr_match(void *token1, void *token2, int prefixlen) return 0; if (pbi) { - __u32 w1, w2; __u32 mask; - w1 = a1[pdw]; - w2 = a2[pdw]; - - mask = htonl((0xffffffff) << (0x20 - pbi)); + mask = htonl((0xffffffff) << (32 - pbi)); - if ((w1 ^ w2) & mask) + if ((a1[pdw] ^ a2[pdw]) & mask) return 0; } @@ -99,24 +151,11 @@ static __inline__ int addr_match(void *token1, void *token2, int prefixlen) static __inline__ int addr_bit_set(void *token, int fn_bit) { - int dw; - __u32 b1; - __u32 mask; - int bit = fn_bit; __u32 *addr = token; - dw = bit >> 0x05; - - b1 = addr[dw]; - - bit = ~bit; - bit &= 0x1f; - mask = htonl(1 << bit); - return (b1 & mask); + return htonl(1 << ((~fn_bit)&0x1F)) & addr[fn_bit>>5]; } - - /* * find the first different bit between two addresses * length of address must be a multiple of 32bits @@ -131,42 +170,47 @@ static __inline__ int addr_diff(void *token1, void *token2, int addrlen) addrlen >>= 2; for (i = 0; i < addrlen; i++) { - __u32 b1, b2; __u32 xb; - b1 = a1[i]; - b2 = a2[i]; - - xb = b1 ^ b2; + xb = a1[i] ^ a2[i]; if (xb) { - int res = 0; - int j=31; + int j = 31; xb = ntohl(xb); - while (test_bit(j, &xb) == 0) { - res++; + while (test_bit(j, &xb) == 0) j--; - } - return (i * 32 + res); + return (i * 32 + 31 - j); } } /* * we should *never* get to this point since that * would mean the addrs are equal + * + * However, we do get to it 8) And exacly, when + * addresses are equal 8) + * + * ip route add 1111::/128 via ... + * ip route add 1111::/64 via ... + * and we are here. + * + * Ideally, this function should stop comparison + * at prefix length. It does not, but it is still OK, + * if returned value is greater than prefix length. 
+ * --ANK (980803) */ - return -1; + return addrlen<<5; } static __inline__ struct fib6_node * node_alloc(void) { struct fib6_node *fn; - if ((fn = kmalloc(sizeof(struct fib6_node), GFP_ATOMIC))) { + if ((fn = kmalloc(sizeof(struct fib6_node), GFP_ATOMIC)) != NULL) { memset(fn, 0, sizeof(struct fib6_node)); rt6_stats.fib_nodes++; } @@ -180,13 +224,10 @@ static __inline__ void node_free(struct fib6_node * fn) kfree(fn); } -extern __inline__ void rt6_release(struct rt6_info *rt) +static __inline__ void rt6_release(struct rt6_info *rt) { - struct dst_entry *dst = (struct dst_entry *) rt; - if (atomic_dec_and_test(&dst->refcnt)) { - rt->rt6i_node = NULL; - dst_free(dst); - } + if (atomic_dec_and_test(&rt->rt6i_ref)) + dst_free(&rt->u.dst); } @@ -200,18 +241,16 @@ extern __inline__ void rt6_release(struct rt6_info *rt) static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, int addrlen, int plen, - unsigned long offset, - struct rt6_info *rt) - + int offset) { - struct fib6_node *fn; + struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; - struct fib6_node *in; - struct fib6_node *ln; struct rt6key *key; - __u32 bit; - __u32 dir = 0; - __u32 sernum = ++rt_sernum; + int bit; + int dir = 0; + __u32 sernum = fib6_new_sernum(); + + RT6_TRACE("fib6_add_1\n"); /* insert node in tree */ @@ -220,146 +259,143 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, if (plen == 0) return fn; - for (;;) { - if (fn == NULL) { - ln = node_alloc(); - - if (ln == NULL) - return NULL; - ln->fn_bit = plen; - - ln->parent = pn; - ln->fn_sernum = sernum; - rt->rt6i_node = ln; - - if (dir) - pn->right = ln; - else - pn->left = ln; - - return ln; - } - + do { key = (struct rt6key *)((u8 *)fn->leaf + offset); /* * Prefix match */ - if (addr_match(&key->addr, addr, fn->fn_bit)) { + if (plen < fn->fn_bit || + !addr_match(&key->addr, addr, fn->fn_bit)) + goto insert_above; - /* - * Exact match ? - */ + /* + * Exact match ? + */ - if (plen == fn->fn_bit) { - /* clean up an intermediate node */ - if ((fn->fn_flags & RTN_RTINFO) == 0) { - rt6_release(fn->leaf); - fn->leaf = NULL; - } + if (plen == fn->fn_bit) { + /* clean up an intermediate node */ + if ((fn->fn_flags & RTN_RTINFO) == 0) { + rt6_release(fn->leaf); + fn->leaf = NULL; + } - fn->fn_sernum = sernum; + fn->fn_sernum = sernum; - return fn; - } - - /* - * We have more bits to go - */ - - if (plen > fn->fn_bit) { - /* Walk down on tree. */ - fn->fn_sernum = sernum; - dir = addr_bit_set(addr, fn->fn_bit); - pn = fn; - fn = dir ? fn->right: fn->left; - - /* - * Round we go. Note if fn has become - * NULL then dir is set and fn is handled - * top of loop. - */ - continue; - } + return fn; } /* - * split since we don't have a common prefix anymore or - * we have a less significant route. - * we've to insert an intermediate node on the list - * this new node will point to the one we need to create - * and the current + * We have more bits to go */ + + /* Try to walk down on tree. */ + fn->fn_sernum = sernum; + dir = addr_bit_set(addr, fn->fn_bit); + pn = fn; + fn = dir ? fn->right: fn->left; + } while (fn); - pn = fn->parent; + /* + * We wlaked to the bottom of tree. + * Create new leaf node without children. 
+ */ - /* find 1st bit in difference between the 2 addrs */ - bit = addr_diff(addr, &key->addr, addrlen); + ln = node_alloc(); + if (ln == NULL) + return NULL; + ln->fn_bit = plen; + + ln->parent = pn; + ln->fn_sernum = sernum; - /* - * (intermediate) - * / \ - * (new leaf node) (old node) - */ - if (plen > bit) { - in = node_alloc(); - - if (in == NULL) - return NULL; - - /* - * new intermediate node. - * RTN_RTINFO will - * be off since that an address that chooses one of - * the branches would not match less specific routes - * int the other branch - */ + if (dir) + pn->right = ln; + else + pn->left = ln; + + return ln; - in->fn_bit = bit; - in->parent = pn; - in->leaf = rt; +insert_above: + /* + * split since we don't have a common prefix anymore or + * we have a less significant route. + * we've to insert an intermediate node on the list + * this new node will point to the one we need to create + * and the current + */ + + pn = fn->parent; - in->fn_sernum = sernum; - atomic_inc(&rt->rt6i_ref); + /* find 1st bit in difference between the 2 addrs. - /* leaf node */ - ln = node_alloc(); + See comment in addr_diff: bit may be an invalid value, + but if it is >= plen, the value is ignored in any case. + */ + + bit = addr_diff(addr, &key->addr, addrlen); - if (ln == NULL) { + /* + * (intermediate)[in] + * / \ + * (new leaf node)[ln] (old node)[fn] + */ + if (plen > bit) { + in = node_alloc(); + ln = node_alloc(); + + if (in == NULL || ln == NULL) { + if (in) node_free(in); - return NULL; - } + if (ln) + node_free(ln); + return NULL; + } + + /* + * new intermediate node. + * RTN_RTINFO will + * be off since that an address that chooses one of + * the branches would not match less specific routes + * in the other branch + */ - /* update parent pointer */ - if (dir) - pn->right = in; - else - pn->left = in; + in->fn_bit = bit; - ln->fn_bit = plen; + in->parent = pn; + in->leaf = fn->leaf; + atomic_inc(&in->leaf->rt6i_ref); - ln->parent = in; - fn->parent = in; + in->fn_sernum = sernum; - ln->fn_sernum = sernum; + /* update parent pointer */ + if (dir) + pn->right = in; + else + pn->left = in; - if (addr_bit_set(addr, bit)) { - in->right = ln; - in->left = fn; - } else { - in->left = ln; - in->right = fn; - } + ln->fn_bit = plen; + + ln->parent = in; + fn->parent = in; + + ln->fn_sernum = sernum; - return ln; + if (addr_bit_set(addr, bit)) { + in->right = ln; + in->left = fn; + } else { + in->left = ln; + in->right = fn; } + } else { /* plen <= bit */ /* - * (new leaf node) + * (new leaf node)[ln] * / \ - * (old node) NULL + * (old node)[fn] NULL */ ln = node_alloc(); @@ -377,7 +413,6 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, pn->right = ln; else pn->left = ln; - if (addr_bit_set(&key->addr, plen)) ln->right = fn; @@ -385,11 +420,8 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, ln->left = fn; fn->parent = ln; - - return ln; } - - return NULL; + return ln; } /* @@ -401,7 +433,6 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt) struct rt6_info *iter = NULL; struct rt6_info **ins; - rt->rt6i_node = fn; ins = &fn->leaf; for (iter = fn->leaf; iter; iter=iter->u.next) { @@ -423,7 +454,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt) iter->rt6i_expires = rt->rt6i_expires; if (!(rt->rt6i_flags&RTF_EXPIRES)) { iter->rt6i_flags &= ~RTF_EXPIRES; - iter->rt6i_expires = rt->rt6i_expires; + iter->rt6i_expires = 0; } return -EEXIST; } @@ -439,8 +470,9 @@ static int fib6_add_rt2node(struct 
fib6_node *fn, struct rt6_info *rt) * insert node */ - *ins = rt; rt->u.next = iter; + *ins = rt; + rt->rt6i_node = fn; atomic_inc(&rt->rt6i_ref); #ifdef CONFIG_RTNETLINK inet6_rt_notify(RTM_NEWROUTE, rt); @@ -457,8 +489,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt) static __inline__ void fib6_start_gc(struct rt6_info *rt) { - if ((ip6_fib_timer.expires == 0) && - (rt->rt6i_flags & (RTF_ADDRCONF | RTF_CACHE))) { + if (ip6_fib_timer.expires == 0 && + (rt->rt6i_flags & (RTF_EXPIRES|RTF_CACHE))) { del_timer(&ip6_fib_timer); ip6_fib_timer.expires = jiffies + ip6_rt_gc_interval; add_timer(&ip6_fib_timer); @@ -475,67 +507,97 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt) { struct fib6_node *fn; int err = -ENOMEM; - unsigned long offset; - - offset = (u8*) &rt->rt6i_dst - (u8*) rt; + fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr), - rt->rt6i_dst.plen, offset, rt); + rt->rt6i_dst.plen, (u8*) &rt->rt6i_dst - (u8*) rt); - if (fn == NULL) { -#if RT_DEBUG >= 2 - printk(KERN_DEBUG "fib6_add: fn == NULL\n"); -#endif - goto out; - } + if (fn == NULL) + return -ENOMEM; +#ifdef CONFIG_IPV6_SUBTREES if (rt->rt6i_src.plen) { struct fib6_node *sn; -#if RT_DEBUG >= 2 - printk(KERN_DEBUG "fib6_add: src.len > 0\n"); -#endif - if (fn->subtree == NULL) { struct fib6_node *sfn; - if (fn->leaf == NULL) { - fn->leaf = rt; - atomic_inc(&rt->rt6i_ref); - } + /* + * Create subtree. + * + * fn[main tree] + * | + * sfn[subtree root] + * \ + * sn[new leaf node] + */ + /* Create subtree root node */ sfn = node_alloc(); - if (sfn == NULL) - goto out; + goto st_failure; - sfn->parent = fn; sfn->leaf = &ip6_null_entry; + atomic_inc(&ip6_null_entry.rt6i_ref); sfn->fn_flags = RTN_ROOT; - sfn->fn_sernum = ++rt_sernum; + sfn->fn_sernum = fib6_new_sernum(); - fn->subtree = sfn; - } + /* Now add the first leaf node to new subtree */ - offset = (u8*) &rt->rt6i_src - (u8*) rt; + sn = fib6_add_1(sfn, &rt->rt6i_src.addr, + sizeof(struct in6_addr), rt->rt6i_src.plen, + (u8*) &rt->rt6i_src - (u8*) rt); - sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, - sizeof(struct in6_addr), rt->rt6i_src.plen, - offset, rt); + if (sn == NULL) { + /* If it is failed, discard just allocated + root, and then (in st_failure) stale node + in main tree. + */ + node_free(sfn); + goto st_failure; + } - if (sn == NULL) - goto out; + /* Now link new subtree to main tree */ + sfn->parent = fn; + fn->subtree = sfn; + if (fn->leaf == NULL) { + fn->leaf = rt; + atomic_inc(&rt->rt6i_ref); + } + } else { + sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, + sizeof(struct in6_addr), rt->rt6i_src.plen, + (u8*) &rt->rt6i_src - (u8*) rt); + + if (sn == NULL) + goto st_failure; + } fn = sn; } +#endif err = fib6_add_rt2node(fn, rt); - if (err == 0) + if (err == 0) { fib6_start_gc(rt); -out: + if (!(rt->rt6i_flags&RTF_CACHE)) + fib6_prune_clones(fn, rt); + } + if (err) dst_free(&rt->u.dst); return err; + +#ifdef CONFIG_IPV6_SUBTREES + /* Subtree creation failed, probably main tree node + is orphan. If it is, shot it. 
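Note that fib6_add_1() no longer receives the route itself, only the byte offset of the key inside struct rt6_info ((u8*)&rt->rt6i_dst - (u8*)rt), so the same walker can serve the destination tree and, under CONFIG_IPV6_SUBTREES, the source subtree. A small user-space sketch of that offset trick, with an illustrative structure rather than the real rt6_info layout:

    #include <stddef.h>
    #include <stdio.h>

    struct example_key   { unsigned char addr[16]; int plen; };
    struct example_route {
    	int                metric;
    	struct example_key dst;
    	struct example_key src;
    };

    int main(void)
    {
    	struct example_route r = { .metric = 1, .dst.plen = 64 };
    	size_t off = offsetof(struct example_route, dst);
    	struct example_key *key = (struct example_key *)((unsigned char *)&r + off);

    	printf("dst plen read through the offset: %d\n", key->plen);	/* prints 64 */
    	return 0;
    }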
+ */ +st_failure: + if (fn && !(fn->fn_flags&RTN_RTINFO|RTN_ROOT)) + fib_repair_tree(fn); + dst_free(&rt->u.dst); + return err; +#endif } /* @@ -544,7 +606,7 @@ out: */ struct lookup_args { - unsigned long offset; /* key offset on rt6_info */ + int offset; /* key offset on rt6_info */ struct in6_addr *addr; /* search key */ }; @@ -576,6 +638,7 @@ static struct fib6_node * fib6_lookup_1(struct fib6_node *root, } while ((fn->fn_flags & RTN_ROOT) == 0) { +#ifdef CONFIG_IPV6_SUBTREES if (fn->subtree) { struct fib6_node *st; struct lookup_args *narg; @@ -591,6 +654,7 @@ static struct fib6_node * fib6_lookup_1(struct fib6_node *root, } } } +#endif if (fn->fn_flags & RTN_RTINFO) { struct rt6key *key; @@ -618,8 +682,10 @@ struct fib6_node * fib6_lookup(struct fib6_node *root, struct in6_addr *daddr, args[0].offset = (u8*) &rt->rt6i_dst - (u8*) rt; args[0].addr = daddr; +#ifdef CONFIG_IPV6_SUBTREES args[1].offset = (u8*) &rt->rt6i_src - (u8*) rt; args[1].addr = saddr; +#endif fn = fib6_lookup_1(root, args); @@ -630,12 +696,79 @@ struct fib6_node * fib6_lookup(struct fib6_node *root, struct in6_addr *daddr, } /* + * Get node with sepciafied destination prefix (and source prefix, + * if subtrees are used) + */ + + +static struct fib6_node * fib6_locate_1(struct fib6_node *root, + struct in6_addr *addr, + int plen, int offset) +{ + struct fib6_node *fn; + + for (fn = root; fn ; ) { + struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset); + + /* + * Prefix match + */ + if (plen < fn->fn_bit || + !addr_match(&key->addr, addr, fn->fn_bit)) + return NULL; + + if (plen == fn->fn_bit) + return fn; + + /* + * We have more bits to go + */ + if (addr_bit_set(addr, fn->fn_bit)) + fn = fn->right; + else + fn = fn->left; + } + return NULL; +} + +struct fib6_node * fib6_locate(struct fib6_node *root, + struct in6_addr *daddr, int dst_len, + struct in6_addr *saddr, int src_len) +{ + struct rt6_info *rt = NULL; + struct fib6_node *fn; + + fn = fib6_locate_1(root, daddr, dst_len, + (u8*) &rt->rt6i_dst - (u8*) rt); + +#ifdef CONFIG_IPV6_SUBTREES + if (src_len) { + BUG_TRAP(saddr!=NULL); + if (fn == NULL) + fn = fn->subtree; + if (fn) + fn = fib6_locate_1(fn, saddr, src_len, + (u8*) &rt->rt6i_src - (u8*) rt); + } +#endif + + if (fn && fn->fn_flags&RTN_RTINFO) + return fn; + + return NULL; +} + + +/* * Deletion * */ static struct rt6_info * fib6_find_prefix(struct fib6_node *fn) { + if (fn->fn_flags&RTN_ROOT) + return &ip6_null_entry; + while(fn) { if(fn->left) return fn->left->leaf; @@ -643,7 +776,7 @@ static struct rt6_info * fib6_find_prefix(struct fib6_node *fn) if(fn->right) return fn->right->leaf; - fn = fn->subtree; + fn = SUBTREE(fn); } return NULL; } @@ -653,428 +786,414 @@ static struct rt6_info * fib6_find_prefix(struct fib6_node *fn) * is the node we want to try and remove. 
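One aside on the st_failure path above: the test `!(fn->fn_flags&RTN_RTINFO|RTN_ROOT)` parses as `!((fn->fn_flags & RTN_RTINFO) | RTN_ROOT)` because & binds tighter than |, so with RTN_ROOT a nonzero flag the condition can never be true (and the call also names fib_repair_tree() where the function defined below is fib6_repair_tree()). The intended test was presumably `!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))`. A tiny standalone check, with illustrative flag values:

    #include <assert.h>

    #define RTN_ROOT   0x0002	/* illustrative values, not restating the headers */
    #define RTN_RTINFO 0x0004

    int main(void)
    {
    	int flags = 0;		/* an orphaned split node: neither flag set */

    	assert((flags & RTN_RTINFO | RTN_ROOT) != 0);	/* as written: never zero    */
    	assert((flags & (RTN_RTINFO | RTN_ROOT)) == 0);	/* intended test would fire  */
    	return 0;
    }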
*/ -static void fib6_del_2(struct fib6_node *fn) +static void fib6_repair_tree(struct fib6_node *fn) { - struct rt6_info *rt; - - fn->fn_flags &= ~RTN_RTINFO; - rt6_stats.fib_route_nodes--; + int children; + int nstate; + struct fib6_node *child, *pn; + struct fib6_walker_t *w; + int iter = 0; - /* - * Can't delete a root node - */ - - if (fn->fn_flags & RTN_TL_ROOT) - return; + for (;;) { + RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); + iter++; - do { - struct fib6_node *pn, *child; - int children = 0; + BUG_TRAP(!(fn->fn_flags&RTN_RTINFO)); + BUG_TRAP(!(fn->fn_flags&RTN_TL_ROOT)); + BUG_TRAP(fn->leaf==NULL); + children = 0; child = NULL; + if (fn->right) child = fn->right, children |= 1; + if (fn->left) child = fn->left, children |= 2; - /* - * We have a child to left - */ - - if (fn->left) { - children++; - child = fn->left; - } - - /* - * To right - */ - - if (fn->right) { - children++; - child = fn->right; - } - - /* - * We can't tidy a case of two children. - */ - if (children > 1) { - if (fn->leaf == NULL) - goto split_repair; - break; + if (children == 3 || SUBTREE(fn) +#ifdef CONFIG_IPV6_SUBTREES + /* Subtree root (i.e. fn) may have one child */ + || (children && fn->fn_flags&RTN_ROOT) +#endif + ) { + fn->leaf = fib6_find_prefix(fn); +#if RT6_DEBUG >= 2 + if (fn->leaf==NULL) { + BUG_TRAP(fn->leaf); + fn->leaf = &ip6_null_entry; + } +#endif + atomic_inc(&fn->leaf->rt6i_ref); + return; } - if (fn->fn_flags & RTN_RTINFO) - break; - - /* - * The node we plan to tidy has an stree. Talk about - * making life hard. - */ - - if (fn->subtree) - goto stree_node; - - /* - * Up we go - */ - pn = fn->parent; - - /* - * Not a ROOT - we can tidy - */ - - if ((fn->fn_flags & RTN_ROOT) == 0) { - /* - * Make our child our parents child - */ - if (pn->left == fn) - pn->left = child; - else - pn->right = child; - - /* - * Reparent the child - */ +#ifdef CONFIG_IPV6_SUBTREES + if (SUBTREE(pn) == fn) { + BUG_TRAP(fn->fn_flags&RTN_ROOT); + SUBTREE(pn) = NULL; + nstate = FWS_L; + } else { + BUG_TRAP(!(fn->fn_flags&RTN_ROOT)); +#endif + if (pn->right == fn) pn->right = child; + else if (pn->left == fn) pn->left = child; +#if RT6_DEBUG >= 2 + else BUG_TRAP(0); +#endif if (child) child->parent = pn; + nstate = FWS_R; +#ifdef CONFIG_IPV6_SUBTREES + } +#endif - /* - * Discard leaf entries - */ - if (fn->leaf) - rt6_release(fn->leaf); - } else { - if (children) - break; - /* - * No children so no subtree - */ - - pn->subtree = NULL; + FOR_WALKERS(w) { + if (child == NULL) { + if (w->root == fn) { + w->root = w->node = NULL; + RT6_TRACE("W %p adjusted by delroot 1\n", w); + } else if (w->node == fn) { + RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); + w->node = pn; + w->state = nstate; + } + } else { + if (w->root == fn) { + w->root = child; + RT6_TRACE("W %p adjusted by delroot 2\n", w); + } + if (w->node == fn) { + w->node = child; + if (children&2) { + RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); + w->state = w->state>=FWS_R ? FWS_U : FWS_INIT; + } else { + RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); + w->state = w->state>=FWS_C ? FWS_U : FWS_INIT; + } + } + } } - /* - * We are discarding - */ node_free(fn); - - /* - * Our merge of entries might propogate further - * up the tree, so move up a level and retry. 
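The core of the new fib6_repair_tree() is splicing a now-redundant intermediate node out of the trie and repeating the check one level up; the FOR_WALKERS fixups exist only because a concurrent traversal may be parked on the node being removed. A minimal sketch of the splice alone, on a toy node type and without any walker handling:

    struct node { struct node *parent, *left, *right; };

    static void splice_out(struct node *fn)
    {
    	struct node *pn = fn->parent;
    	struct node *child = fn->right ? fn->right : fn->left;	/* at most one exists */

    	if (pn->right == fn)
    		pn->right = child;
    	else if (pn->left == fn)
    		pn->left = child;
    	if (child)
    		child->parent = pn;	/* reparent the survivor */
    }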
- */ - - fn = pn; - - } while (!(fn->fn_flags & RTN_TL_ROOT)); - - return; - -stree_node: - - rt6_release(fn->leaf); - -split_repair: - rt = fib6_find_prefix(fn); - - if (rt == NULL) - panic("fib6_del_2: inconsistent tree\n"); + if (pn->fn_flags&RTN_RTINFO || SUBTREE(pn)) + return; - atomic_inc(&rt->rt6i_ref); - fn->leaf = rt; + rt6_release(pn->leaf); + pn->leaf = NULL; + fn = pn; + } } -/* - * Remove our entry in the tree. This throws away the route entry - * from the list of entries attached to this fib node. It doesn't - * expunge from the tree. - */ - -static struct fib6_node * fib6_del_1(struct rt6_info *rt) +static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp) { - struct fib6_node *fn; - - fn = rt->rt6i_node; + struct fib6_walker_t *w; + struct rt6_info *rt = *rtp; + + RT6_TRACE("fib6_del_route\n"); + + if (!(rt->rt6i_flags&RTF_CACHE)) + fib6_prune_clones(fn, rt); + + /* Unlink it */ + *rtp = rt->u.next; + rt->rt6i_node = NULL; + rt6_stats.fib_rt_entries--; + + /* Adjust walkers */ + FOR_WALKERS(w) { + if (w->state == FWS_C && w->leaf == rt) { + RT6_TRACE("walker %p adjusted by delroute\n", w); + w->leaf = rt->u.next; + if (w->leaf == NULL) + w->state = FWS_U; + } + } - /* We need a fib node! */ - if (fn) { - struct rt6_info **back; - struct rt6_info *lf; + rt->u.next = NULL; - back = &fn->leaf; - - /* - * Walk the leaf entries looking for ourself - */ - - for(lf = fn->leaf; lf; lf=lf->u.next) { - if (rt == lf) { - /* - * Delete this entry. - */ - - *back = lf->u.next; -#ifdef CONFIG_RTNETLINK - inet6_rt_notify(RTM_DELROUTE, lf); -#endif - rt6_release(lf); - rt6_stats.fib_rt_entries--; - return fn; - } - back = &lf->u.next; - } + /* If it was last route, expunge its radix tree node */ + if (fn->leaf == NULL) { + fn->fn_flags &= ~RTN_RTINFO; + rt6_stats.fib_route_nodes--; + fib6_repair_tree(fn); } - return NULL; +#ifdef CONFIG_RTNETLINK + inet6_rt_notify(RTM_DELROUTE, rt); +#endif + rt6_release(rt); } int fib6_del(struct rt6_info *rt) { - struct fib6_node *fn; - - fn = fib6_del_1(rt); + struct fib6_node *fn = rt->rt6i_node; + struct rt6_info **rtp; - if (fn == NULL) +#if RT6_DEBUG >= 2 + if (rt->u.dst.obsolete>0) { + BUG_TRAP(rt->u.dst.obsolete>0); + return -EFAULT; + } +#endif + if (fn == NULL || rt == &ip6_null_entry) return -ENOENT; - if (fn->leaf == NULL) - fib6_del_2(fn); + BUG_TRAP(fn->fn_flags&RTN_RTINFO); - return 0; + /* + * Walk the leaf entries looking for ourself + */ + + for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.next) { + if (*rtp == rt) { + fib6_del_route(fn, rtp); + return 0; + } + } + return -ENOENT; } /* - * Tree transversal function + * Tree transversal function. * - * Wau... It is NOT REENTERABLE!!!!!!! It is cathastrophe. --ANK + * Certainly, it is not interrupt safe. + * However, it is internally reenterable wrt itself and fib6_add/fib6_del. + * It means, that we can modify tree during walking + * and use this function for garbage collection, clone pruning, + * cleaning tree when a device goes down etc. etc. + * + * It guarantees that every node will be traversed, + * and that it will be traversed only once. + * + * Callback function w->func may return: + * 0 -> continue walking. + * positive value -> walking is suspended (used by tree dumps, + * and probably by gc, if it will be split to several slices) + * negative value -> terminate walking. + * + * The function itself returns: + * 0 -> walk is complete. + * >0 -> walk is incomplete (i.e. suspended) + * <0 -> walk is terminated by an error. 
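The comment block that follows states the walker contract; the mechanism underneath is a non-recursive traversal driven by parent pointers and a small state word (the FWS_* values) recording which child the walk is returning from. A simplified user-space sketch of the same pattern, pre-order and without suspension or subtree states:

    struct tnode { struct tnode *parent, *left, *right; };

    static void walk(struct tnode *root, void (*visit)(struct tnode *))
    {
    	struct tnode *fn = root;
    	enum { DOWN, UP_FROM_LEFT, UP_FROM_RIGHT } state = DOWN;

    	for (;;) {
    		if (state == DOWN) {
    			visit(fn);
    			if (fn->left)  { fn = fn->left;  continue; }
    			state = UP_FROM_LEFT;
    		}
    		if (state == UP_FROM_LEFT) {
    			if (fn->right) { fn = fn->right; state = DOWN; continue; }
    			state = UP_FROM_RIGHT;
    		}
    		/* UP_FROM_RIGHT: done with this node, climb towards the root */
    		if (fn == root)
    			return;
    		state = (fn->parent->left == fn) ? UP_FROM_LEFT : UP_FROM_RIGHT;
    		fn = fn->parent;
    	}
    }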
*/ -int fib6_walk_count; - -void fib6_walk_tree(struct fib6_node *root, f_pnode func, void *arg, - int filter) +int fib6_walk_continue(struct fib6_walker_t *w) { - struct fib6_node *fn; + struct fib6_node *fn, *pn; - fn = root; + for (;;) { + fn = w->node; + if (fn == NULL) + return 0; - fib6_walk_count++; - - do { - if (!(fn->fn_flags & RTN_TAG)) { - fn->fn_flags |= RTN_TAG; - + if (w->prune && fn != w->root && + fn->fn_flags&RTN_RTINFO && w->state < FWS_C) { + w->state = FWS_C; + w->leaf = fn->leaf; + } + switch (w->state) { +#ifdef CONFIG_IPV6_SUBTREES + case FWS_S: + if (SUBTREE(fn)) { + w->node = SUBTREE(fn); + continue; + } + w->state = FWS_L; +#endif + case FWS_L: if (fn->left) { - fn = fn->left; + w->node = fn->left; + w->state = FWS_INIT; continue; } - } - - fn->fn_flags &= ~RTN_TAG; - - if (fn->right) { - fn = fn->right; - continue; - } - - do { - struct fib6_node *node; - - if (fn->fn_flags & RTN_ROOT) - break; - node = fn; - fn = fn->parent; - - if (!(node->fn_flags & RTN_TAG)) { - if (node->subtree) { - fib6_walk_tree(node->subtree, func, - arg, filter); - } - - if (!filter || - (node->fn_flags & RTN_RTINFO)) - (*func)(node, arg); + w->state = FWS_R; + case FWS_R: + if (fn->right) { + w->node = fn->right; + w->state = FWS_INIT; + continue; } - - } while (!(fn->fn_flags & RTN_TAG)); - - } while (!(fn->fn_flags & RTN_ROOT) || (fn->fn_flags & RTN_TAG)); - - fib6_walk_count--; + w->state = FWS_C; + w->leaf = fn->leaf; + case FWS_C: + if (w->leaf && fn->fn_flags&RTN_RTINFO) { + int err = w->func(w); + if (err) + return err; + continue; + } + w->state = FWS_U; + case FWS_U: + if (fn == w->root) + return 0; + pn = fn->parent; + w->node = pn; +#ifdef CONFIG_IPV6_SUBTREES + if (SUBTREE(pn) == fn) { + BUG_TRAP(fn->fn_flags&RTN_ROOT); + w->state = FWS_L; + continue; + } +#endif + if (pn->left == fn) { + w->state = FWS_R; + continue; + } + if (pn->right == fn) { + w->state = FWS_C; + w->leaf = w->node->leaf; + continue; + } +#if RT6_DEBUG >= 2 + BUG_TRAP(0); +#endif + } + } } -/* - * Garbage collection - */ - -static int fib6_gc_node(struct fib6_node *fn, int timeout) +int fib6_walk(struct fib6_walker_t *w) { - struct rt6_info *rt, **back; - int more = 0; - unsigned long now = jiffies; - - back = &fn->leaf; - - for (rt = fn->leaf; rt;) { - if ((rt->rt6i_flags & RTF_CACHE) && atomic_read(&rt->rt6i_use) == 0) { - if ((long)(now - rt->rt6i_tstamp) >= timeout) { - struct rt6_info *old; - - old = rt; + int res; - rt = rt->u.next; + w->state = FWS_INIT; + w->node = w->root; - *back = rt; + fib6_walker_link(w); + res = fib6_walk_continue(w); + if (res <= 0) + fib6_walker_unlink(w); + return res; +} - old->rt6i_node = NULL; -#ifdef CONFIG_RTNETLINK - inet6_rt_notify(RTM_DELROUTE, old); +static int fib6_clean_node(struct fib6_walker_t *w) +{ + int res; + struct rt6_info *rt; + struct fib6_cleaner_t *c = (struct fib6_cleaner_t*)w; + + for (rt = w->leaf; rt; rt = rt->u.next) { + res = c->func(rt, c->arg); + if (res < 0) { + w->leaf = rt; + res = fib6_del(rt); + if (res) { +#if RT6_DEBUG >= 2 + printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res); #endif - old->u.dst.obsolete = 1; - rt6_release(old); - rt6_stats.fib_rt_entries--; continue; } - more++; + return 0; } + BUG_TRAP(res==0); + } + w->leaf = rt; + return 0; +} - /* - * check addrconf expiration here. - * - * BUGGGG Crossing fingers and ... 
- * Seems, radix tree walking is absolutely broken, - * but we will try in any case --ANK - */ - if ((rt->rt6i_flags&RTF_EXPIRES) && rt->rt6i_expires - && (long)(now - rt->rt6i_expires) > 0) { - struct rt6_info *old; +/* + * Convenient frontend to tree walker. + * + * func is called on each route. + * It may return -1 -> delete this route. + * 0 -> continue walking + * + * prune==1 -> only immediate children of node (certainly, + * ignoring pure split nodes) will be scanned. + */ - old = rt; - rt = rt->u.next; +void fib6_clean_tree(struct fib6_node *root, + int (*func)(struct rt6_info *, void *arg), + int prune, void *arg) +{ + struct fib6_cleaner_t c; - *back = rt; + c.w.root = root; + c.w.func = fib6_clean_node; + c.w.prune = prune; + c.func = func; + c.arg = arg; - old->rt6i_node = NULL; -#ifdef CONFIG_RTNETLINK - inet6_rt_notify(RTM_DELROUTE, old); -#endif - old->u.dst.obsolete = 1; - rt6_release(old); - rt6_stats.fib_rt_entries--; - continue; - } - back = &rt->u.next; - rt = rt->u.next; + start_bh_atomic(); + fib6_walk(&c.w); + end_bh_atomic(); +} + +static int fib6_prune_clone(struct rt6_info *rt, void *arg) +{ + if (rt->rt6i_flags & RTF_CACHE) { + RT6_TRACE("pruning clone %p\n", rt); + return -1; } - return more; + return 0; } -struct fib6_gc_args { - unsigned long timeout; - int more; -}; +static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt) +{ + fib6_clean_tree(fn, fib6_prune_clone, 1, rt); +} + +/* + * Garbage collection + */ -static void fib6_garbage_collect(struct fib6_node *fn, void *p_arg) +static struct fib6_gc_args { - struct fib6_gc_args * args = (struct fib6_gc_args *) p_arg; + int timeout; + int more; +} gc_args; - if (fn->fn_flags & RTN_RTINFO) { - int more; +static int fib6_age(struct rt6_info *rt, void *arg) +{ + unsigned long now = jiffies; - more = fib6_gc_node(fn, args->timeout); + /* Age clones. Note, that clones are aged out + only if they are not in use now. + */ - if (fn->leaf) { - args->more += more; - return; + if (rt->rt6i_flags & RTF_CACHE) { + if (atomic_read(&rt->u.dst.use) == 0 && + (long)(now - rt->u.dst.lastuse) >= gc_args.timeout) { + RT6_TRACE("aging clone %p\n", rt); + return -1; } - - rt6_stats.fib_route_nodes--; - fn->fn_flags &= ~RTN_RTINFO; + gc_args.more++; + return 0; } /* - * tree nodes (with no routing information) + * check addrconf expiration here. + * They are expired even if they are in use. 
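fib6_clean_tree() turns the walker into a filter: the per-route callback returns -1 to have the route deleted and 0 to keep it, and fib6_prune_clone()/fib6_age() are exactly such callbacks. The same pattern on a plain singly linked list, sketched in user space with made-up names and flag values; the pointer-to-pointer avoids special-casing the head:

    #include <stdlib.h>

    struct entry { int flags; struct entry *next; };
    #define F_CACHE 0x01	/* stands in for RTF_CACHE */

    static int prune_clone(struct entry *e, void *arg)
    {
    	(void)arg;
    	return (e->flags & F_CACHE) ? -1 : 0;	/* -1: delete this entry */
    }

    static void clean_list(struct entry **head,
    		       int (*func)(struct entry *, void *), void *arg)
    {
    	struct entry **pp = head;

    	while (*pp) {
    		if (func(*pp, arg) < 0) {
    			struct entry *dead = *pp;
    			*pp = dead->next;	/* unlink ... */
    			free(dead);		/* ... and release */
    		} else {
    			pp = &(*pp)->next;
    		}
    	}
    }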
*/ - if (!fn->subtree && !(fn->fn_flags & RTN_TL_ROOT)) { - int children = 0; - struct fib6_node *chld = NULL; - - if (fn->left) { - children++; - chld = fn->left; - } - - if (fn->right) { - children++; - chld = fn->right; - } - - if ((fn->fn_flags & RTN_ROOT)) { - if (children == 0) { - struct fib6_node *pn; - - pn = fn->parent; - pn->subtree = NULL; - - node_free(fn); - } - return; - } - - if (children <= 1) { - struct fib6_node *pn = fn->parent; - - if (pn->left == fn) - pn->left = chld; - else - pn->right = chld; - - if (chld) - chld->parent = pn; - - if (fn->leaf) - rt6_release(fn->leaf); - - node_free(fn); - - return; + if (rt->rt6i_flags&RTF_EXPIRES && rt->rt6i_expires) { + if ((long)(now - rt->rt6i_expires) > 0) { + RT6_TRACE("expiring %p\n", rt); + return -1; } + gc_args.more++; + return 0; } - if (fn->leaf == NULL) { - struct rt6_info *nrt; - - nrt = fib6_find_prefix(fn); - - if (nrt == NULL) - panic("fib6: inconsistent tree\n"); - - atomic_inc(&nrt->rt6i_ref); - fn->leaf = nrt; - } + return 0; } void fib6_run_gc(unsigned long dummy) { - struct fib6_gc_args arg = { - ip6_rt_gc_timeout, - 0 - }; + if (dummy != ~0UL) + gc_args.timeout = (int)dummy; + else + gc_args.timeout = ip6_rt_gc_interval; - del_timer(&ip6_fib_timer); + gc_args.more = 0; - if (dummy) - arg.timeout = dummy; + fib6_clean_tree(&ip6_routing_table, fib6_age, 0, NULL); - if (fib6_walk_count == 0) - fib6_walk_tree(&ip6_routing_table, fib6_garbage_collect, &arg, 0); - else - arg.more = 1; + del_timer(&ip6_fib_timer); - if (arg.more) { + ip6_fib_timer.expires = 0; + if (gc_args.more) { ip6_fib_timer.expires = jiffies + ip6_rt_gc_interval; add_timer(&ip6_fib_timer); - } else { - ip6_fib_timer.expires = 0; } } @@ -1084,3 +1203,5 @@ void fib6_gc_cleanup(void) del_timer(&ip6_fib_timer); } #endif + + diff --git a/net/ipv6/ip6_fw.c b/net/ipv6/ip6_fw.c index 3c3a0cfc5..c19a561e9 100644 --- a/net/ipv6/ip6_fw.c +++ b/net/ipv6/ip6_fw.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: ip6_fw.c,v 1.9 1998/02/12 07:43:42 davem Exp $ + * $Id: ip6_fw.c,v 1.10 1998/08/26 12:04:57 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -300,14 +300,19 @@ int ip6_fw_msg_add(struct ip6_fw_msg *msg) rl->info.uli_u.data = msg->u.data; rtmsg.rtmsg_flags = RTF_NONEXTHOP|RTF_POLICY; - rt = ip6_route_add(&rtmsg, &err); + err = ip6_route_add(&rtmsg); - /* BUGGGG! rt can point to nowhere. */ - if (rt == NULL) { + if (err) { ip6_fwrule_free(rl); - return -ENOMEM; + return err; } + /* The rest will not work for now. --ABK (989725) */ + +#ifndef notdef + ip6_fwrule_free(rl); + return -EPERM; +#else rt->u.dst.error = -EPERM; if (msg->policy == IP6_FW_ACCEPT) { @@ -327,6 +332,7 @@ int ip6_fw_msg_add(struct ip6_fw_msg *msg) rt->rt6i_flowr = flow_clone((struct flow_rule *)rl); return 0; +#endif } static int ip6_fw_msgrcv(int unit, struct sk_buff *skb) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 6ab4d2c08..6d7359aef 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -6,7 +6,7 @@ * Pedro Roque <roque@di.fc.ul.pt> * Ian P. 
Morris <I.P.Morris@soton.ac.uk> * - * $Id: ip6_input.c,v 1.10 1998/07/15 05:05:34 davem Exp $ + * $Id: ip6_input.c,v 1.11 1998/08/26 12:04:59 davem Exp $ * * Based in linux/net/ipv4/ip_input.c * @@ -37,144 +37,21 @@ #include <net/ip6_route.h> #include <net/addrconf.h> -static int ipv6_dest_opt(struct sk_buff **skb_ptr, struct device *dev, - __u8 *nhptr, struct ipv6_options *opt); - -struct hdrtype_proc { - u8 type; - int (*func) (struct sk_buff **, struct device *dev, __u8 *ptr, - struct ipv6_options *opt); -} hdrproc_lst[] = { - - /* - TODO - - {NEXTHDR_HOP, ipv6_hop_by_hop} - {NEXTHDR_ROUTING, ipv6_routing_header}, - */ - {NEXTHDR_FRAGMENT, ipv6_reassembly}, - - {NEXTHDR_DEST, ipv6_dest_opt}, - /* - {NEXTHDR_AUTH, ipv6_auth_hdr}, - {NEXTHDR_ESP, ipv6_esp_hdr}, - */ - {NEXTHDR_MAX, NULL} -}; - -/* New header structures */ - - -struct tlvtype_proc { - u8 type; - int (*func) (struct sk_buff *, struct device *dev, __u8 *ptr, - struct ipv6_options *opt); - /* - * these functions do NOT update skb->h.raw - */ - -} tlvprocdestopt_lst[] = { - {255, NULL} -}; - -int ip6_dstopt_unknown(struct sk_buff *skb, struct ipv6_tlvtype *hdr) -{ - struct in6_addr *daddr; - int pos; - - /* - * unkown destination option type - */ - - pos = (__u8 *) hdr - (__u8 *) skb->nh.raw; - - /* I think this is correct please check - IPM */ - - switch ((hdr->type & 0xC0) >> 6) { - case 0: /* ignore */ - skb->h.raw += hdr->len+2; - return 1; - - case 1: /* drop packet */ - break; - - case 2: /* send ICMP PARM PROB regardless and drop packet */ - icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_UNK_OPTION, - pos, skb->dev); - break; - - case 3: /* Send ICMP if not a multicast address and drop packet */ - daddr = &skb->nh.ipv6h->daddr; - if (!(ipv6_addr_type(daddr) & IPV6_ADDR_MULTICAST)) - icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_UNK_OPTION, pos, skb->dev); - }; - - kfree_skb(skb); - return 0; -} - -static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff *skb, - struct device *dev, __u8 *nhptr, - struct ipv6_options *opt, void *lastopt) -{ - struct ipv6_tlvtype *hdr; - struct tlvtype_proc *curr; - - while ((hdr=(struct ipv6_tlvtype *)skb->h.raw) != lastopt) { - switch (hdr->type) { - case 0: /* TLV encoded Pad1 */ - skb->h.raw++; - break; - - case 1: /* TLV encoded PadN */ - skb->h.raw += hdr->len+2; - break; - - default: /* Other TLV code so scan list */ - for (curr=procs; curr->type != 255; curr++) { - if (curr->type == (hdr->type)) { - curr->func(skb, dev, nhptr, opt); - skb->h.raw += hdr->len+2; - break; - } - } - if (curr->type==255) { - if (ip6_dstopt_unknown(skb, hdr) == 0) - return 0; - } - break; - } - } - return 1; -} - -static int ipv6_dest_opt(struct sk_buff **skb_ptr, struct device *dev, - __u8 *nhptr, struct ipv6_options *opt) -{ - struct sk_buff *skb=*skb_ptr; - struct ipv6_destopt_hdr *hdr = (struct ipv6_destopt_hdr *) skb->h.raw; - int res = 0; - void *lastopt=skb->h.raw+hdr->hdrlen+sizeof(struct ipv6_destopt_hdr); - - skb->h.raw += sizeof(struct ipv6_destopt_hdr); - if (ip6_parse_tlv(tlvprocdestopt_lst, skb, dev, nhptr, opt, lastopt)) - res = hdr->nexthdr; - skb->h.raw+=hdr->hdrlen; - - return res; -} - int ipv6_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) { struct ipv6hdr *hdr; - int pkt_len; + u32 pkt_len; - if (skb->pkt_type == PACKET_OTHERHOST) { - kfree_skb(skb); - return 0; - } + if (skb->pkt_type == PACKET_OTHERHOST) + goto drop; + + ipv6_statistics.Ip6InReceives++; + + /* Store incoming device index. 
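For reference, the ip6_dstopt_unknown() logic being removed here (it now lives with the rest of the extension-header parsing in exthdrs.c) implements the standard rule that the two high-order bits of an unrecognised option type select the action. A hedged sketch of that rule, independent of the kernel structures:

    enum opt_action { OPT_SKIP, OPT_DISCARD, OPT_ICMP, OPT_ICMP_UNICAST_ONLY };

    static enum opt_action unknown_option_action(unsigned char type)
    {
    	switch (type >> 6) {
    	case 0:  return OPT_SKIP;		/* 00: skip over the option               */
    	case 1:  return OPT_DISCARD;		/* 01: discard the packet silently        */
    	case 2:  return OPT_ICMP;		/* 10: discard + ICMP Parameter Problem   */
    	default: return OPT_ICMP_UNICAST_ONLY;	/* 11: ICMP only if dst is not multicast  */
    	}
    }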
When the packet will + be queued, we cannot refer to skb->dev anymore. + */ + ((struct inet6_skb_parm *)skb->cb)->iif = dev->ifindex; hdr = skb->nh.ipv6h; @@ -183,16 +60,31 @@ int ipv6_rcv(struct sk_buff *skb, struct device *dev, struct packet_type *pt) pkt_len = ntohs(hdr->payload_len); - if (pkt_len + sizeof(struct ipv6hdr) > skb->len) - goto err; + /* pkt_len may be zero if Jumbo payload option is present */ + if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { + if (pkt_len + sizeof(struct ipv6hdr) > skb->len) + goto truncated; + skb_trim(skb, pkt_len + sizeof(struct ipv6hdr)); + } - skb_trim(skb, pkt_len + sizeof(struct ipv6hdr)); + if (hdr->nexthdr == NEXTHDR_HOP) { + skb->h.raw = (u8*)(hdr+1); + if (!ipv6_parse_hopopts(skb, &hdr->nexthdr)) { + ipv6_statistics.Ip6InHdrErrors++; + return 0; + } + } - ip6_route_input(skb); - - return 0; + if (skb->dst == NULL) + ip6_route_input(skb); + + return skb->dst->input(skb); + +truncated: + ipv6_statistics.Ip6InTruncatedPkts++; err: ipv6_statistics.Ip6InHdrErrors++; +drop: kfree_skb(skb); return 0; } @@ -217,8 +109,7 @@ static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb) * without calling rawv6.c) */ static struct sock * ipv6_raw_deliver(struct sk_buff *skb, - struct ipv6_options *opt, - int nexthdr, int len) + int nexthdr, unsigned long len) { struct in6_addr *saddr; struct in6_addr *daddr; @@ -253,8 +144,8 @@ static struct sock * ipv6_raw_deliver(struct sk_buff *skb, continue; buff = skb_clone(skb, GFP_ATOMIC); - buff->sk = sk2; - rawv6_rcv(buff, skb->dev, saddr, daddr, opt, len); + if (buff) + rawv6_rcv(sk2, buff, len); } } @@ -270,10 +161,8 @@ static struct sock * ipv6_raw_deliver(struct sk_buff *skb, int ip6_input(struct sk_buff *skb) { - struct ipv6_options *opt = (struct ipv6_options *) skb->cb; struct ipv6hdr *hdr = skb->nh.ipv6h; struct inet6_protocol *ipprot; - struct hdrtype_proc *hdrt; struct sock *raw_sk; __u8 *nhptr; int nexthdr; @@ -281,7 +170,7 @@ int ip6_input(struct sk_buff *skb) u8 hash; int len; - skb->h.raw += sizeof(struct ipv6hdr); + skb->h.raw = skb->nh.raw + sizeof(struct ipv6hdr); /* * Parse extension headers @@ -290,64 +179,55 @@ int ip6_input(struct sk_buff *skb) nexthdr = hdr->nexthdr; nhptr = &hdr->nexthdr; - /* - * check for extension headers - */ - -st_loop: + /* Skip hop-by-hop options, they are already parsed. */ + if (nexthdr == NEXTHDR_HOP) { + nhptr = (u8*)(hdr+1); + nexthdr = *nhptr; + skb->h.raw += (nhptr[1]+1)<<3; + } - for (hdrt=hdrproc_lst; hdrt->type != NEXTHDR_MAX; hdrt++) { - if (hdrt->type == nexthdr) { - if ((nexthdr = hdrt->func(&skb, skb->dev, nhptr, opt))) { - nhptr = skb->h.raw; - hdr = skb->nh.ipv6h; - goto st_loop; - } + /* This check is sort of optimization. 
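The `skb->h.raw += (nhptr[1]+1)<<3` used when skipping the hop-by-hop header relies on the generic length encoding of IPv6 extension headers: byte 0 is the next-header value, byte 1 counts 8-octet units beyond the first 8 octets. A one-line sketch of that rule:

    #include <stddef.h>

    /* Applies to hop-by-hop, destination options and routing headers.
     * (The Fragment header is fixed at 8 bytes and AH counts 4-octet units.) */
    static size_t ext_hdr_bytes(const unsigned char *hdr)
    {
    	return ((size_t)hdr[1] + 1) * 8;
    }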
+ It would be stupid to detect for optional headers, + which are missing with probability of 200% + */ + if (nexthdr != IPPROTO_TCP && nexthdr != IPPROTO_UDP) { + nhptr = ipv6_parse_exthdrs(&skb, nhptr); + if (nhptr == NULL) return 0; - } + nexthdr = *nhptr; + hdr = skb->nh.ipv6h; } - len = skb->tail - skb->h.raw; - raw_sk = ipv6_raw_deliver(skb, opt, nexthdr, len); + raw_sk = ipv6_raw_deliver(skb, nexthdr, len); hash = nexthdr & (MAX_INET_PROTOS - 1); for (ipprot = (struct inet6_protocol *) inet6_protos[hash]; ipprot != NULL; ipprot = (struct inet6_protocol *) ipprot->next) { struct sk_buff *buff = skb; - + if (ipprot->protocol != nexthdr) continue; - + if (ipprot->copy || raw_sk) buff = skb_clone(skb, GFP_ATOMIC); - - - ipprot->handler(buff, skb->dev, &hdr->saddr, &hdr->daddr, - opt, len, 0, ipprot); + + ipprot->handler(buff, len); found = 1; } - + if (raw_sk) { - skb->sk = raw_sk; - rawv6_rcv(skb, skb->dev, &hdr->saddr, &hdr->daddr, opt, len); + rawv6_rcv(raw_sk, skb, len); found = 1; } - + /* * not found: send ICMP parameter problem back */ - if (!found) { - unsigned long offset; -#if IP6_DEBUG >= 2 - printk(KERN_DEBUG "proto not found %d\n", nexthdr); -#endif - offset = nhptr - (u8*) hdr; - icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_UNK_NEXTHDR, - offset, skb->dev); - kfree_skb(skb); + ipv6_statistics.Ip6InUnknownProtos++; + icmpv6_param_prob(skb, ICMPV6_UNK_NEXTHDR, nhptr); } return 0; @@ -359,6 +239,8 @@ int ip6_mc_input(struct sk_buff *skb) int deliver = 0; int discard = 1; + ipv6_statistics.Ip6InMcastPkts++; + hdr = skb->nh.ipv6h; if (ipv6_chk_mcast_addr(skb->dev, &hdr->daddr)) deliver = 1; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index aa13c2074..0555c1a24 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: ip6_output.c,v 1.13 1998/07/15 05:05:38 davem Exp $ + * $Id: ip6_output.c,v 1.14 1998/08/26 12:05:01 davem Exp $ * * Based on linux/net/ipv4/ip_output.c * @@ -13,6 +13,14 @@ * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. + * + * Changes: + * A.N.Kuznetsov : airthmetics in fragmentation. + * extension headers are implemented. + * route changes now work. + * ip6_forward does not confuse sniffers. + * etc. + * */ #include <linux/errno.h> @@ -33,6 +41,7 @@ #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/rawv6.h> +#include <net/icmp.h> static u32 ipv6_fragmentation_id = 1; @@ -59,6 +68,8 @@ int ip6_output(struct sk_buff *skb) return 0; } } + + ipv6_statistics.Ip6OutMcastPkts++; } if (hh) { @@ -85,17 +96,40 @@ int ip6_output(struct sk_buff *skb) */ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, - struct ipv6_options *opt) + struct ipv6_txoptions *opt) { struct ipv6_pinfo * np = sk ? &sk->net_pinfo.af_inet6 : NULL; + struct in6_addr *first_hop = fl->nl_u.ip6_u.daddr; struct dst_entry *dst = skb->dst; struct ipv6hdr *hdr; - int seg_len; + u8 proto = fl->proto; + int seg_len = skb->len; int hlimit; - /* Do something with IPv6 options headers here. */ + if (opt) { + int head_room; - seg_len = skb->len; + /* First: exthdrs may take lots of space (~8K for now) + MAX_HEADER is not enough. 
+ */ + head_room = opt->opt_nflen + opt->opt_flen; + seg_len += head_room; + head_room += sizeof(struct ipv6hdr) + ((dst->dev->hard_header_len + 15)&~15); + + if (skb_headroom(skb) < head_room) { + struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); + kfree(skb); + skb = skb2; + if (skb == NULL) + return -ENOBUFS; + if (sk) + skb_set_owner_w(skb, sk); + } + if (opt->opt_flen) + ipv6_push_frag_opts(skb, opt, &proto); + if (opt->opt_nflen) + ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop); + } hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr)); @@ -117,16 +151,22 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, hlimit = ((struct rt6_info*)dst)->rt6i_hoplimit; hdr->payload_len = htons(seg_len); - hdr->nexthdr = fl->proto; + hdr->nexthdr = proto; hdr->hop_limit = hlimit; ipv6_addr_copy(&hdr->saddr, fl->nl_u.ip6_u.saddr); - ipv6_addr_copy(&hdr->daddr, fl->nl_u.ip6_u.daddr); + ipv6_addr_copy(&hdr->daddr, first_hop); - ipv6_statistics.Ip6OutRequests++; - dst->output(skb); + if (skb->len <= dst->pmtu) { + ipv6_statistics.Ip6OutRequests++; + dst->output(skb); + return 0; + } - return 0; + printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n"); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst->pmtu, skb->dev); + kfree_skb(skb); + return -EMSGSIZE; } /* @@ -166,8 +206,8 @@ int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct device *dev, return 0; } -static void ip6_bld_1(struct sock *sk, struct sk_buff *skb, struct flowi *fl, - int hlimit, unsigned short pktlength) +static struct ipv6hdr * ip6_bld_1(struct sock *sk, struct sk_buff *skb, struct flowi *fl, + int hlimit, unsigned pktlength) { struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; struct ipv6hdr *hdr; @@ -177,43 +217,56 @@ static void ip6_bld_1(struct sock *sk, struct sk_buff *skb, struct flowi *fl, hdr->version = 6; hdr->priority = np->priority; - memcpy(hdr->flow_lbl, &np->flow_lbl, 3); hdr->payload_len = htons(pktlength - sizeof(struct ipv6hdr)); - - /* - * FIXME: hop limit has default UNI/MCAST and - * msgctl settings - */ hdr->hop_limit = hlimit; + hdr->nexthdr = fl->proto; ipv6_addr_copy(&hdr->saddr, fl->nl_u.ip6_u.saddr); - ipv6_addr_copy(&hdr->daddr, fl->nl_u.ip6_u.daddr); + ipv6_addr_copy(&hdr->daddr, fl->nl_u.ip6_u.daddr); + return hdr; +} + +static __inline__ u8 * ipv6_build_fraghdr(struct sk_buff *skb, u8* prev_hdr, unsigned offset) +{ + struct frag_hdr *fhdr; + + fhdr = (struct frag_hdr *) skb_put(skb, sizeof(struct frag_hdr)); + + fhdr->nexthdr = *prev_hdr; + *prev_hdr = NEXTHDR_FRAGMENT; + prev_hdr = &fhdr->nexthdr; + + fhdr->reserved = 0; + fhdr->frag_off = htons(offset); + fhdr->identification = ipv6_fragmentation_id++; + return &fhdr->nexthdr; } static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, struct dst_entry *dst, - struct flowi *fl, struct ipv6_options *opt, - int hlimit, int flags, unsigned length) + struct flowi *fl, struct ipv6_txoptions *opt, + struct in6_addr *final_dst, + int hlimit, int flags, unsigned length, int mtu) { - struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; struct ipv6hdr *hdr; struct sk_buff *last_skb; - struct frag_hdr *fhdr; + u8 *prev_hdr; int unfrag_len; - int payl_len; int frag_len; int last_len; int nfrags; int fhdr_dist; + int frag_off; + int data_off; int err; /* * Fragmentation * * Extension header order: - * Hop-by-hop -> Routing -> Fragment -> rest (...) + * Hop-by-hop -> Dest0 -> Routing -> Fragment -> Auth -> Dest1 -> rest (...) 
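The fragment sizing that ip6_frag_xmit() performs below can be tried in isolation: frag_len is the MTU minus the unfragmentable part, rounded down to a multiple of 8, nfrags counts the full-size fragments, and the remainder becomes the last fragment. A standalone sketch with illustrative sizes:

    #include <stdio.h>

    int main(void)
    {
    	unsigned mtu = 1280;			/* path MTU                      */
    	unsigned unfrag_len = 40 + 8;		/* IPv6 header + Fragment header */
    	unsigned length = 3000;			/* payload to be sent            */

    	unsigned frag_len = (mtu - unfrag_len) & ~0x7u;	/* multiple of 8: 1232 */
    	unsigned nfrags   = length / frag_len;		/* full-size fragments */
    	unsigned last_len = length - nfrags * frag_len;

    	if (last_len == 0) {			/* length was an exact multiple */
    		last_len = frag_len;
    		nfrags--;
    	}
    	printf("%u full fragments of %u bytes, last fragment %u bytes\n",
    	       nfrags, frag_len, last_len);	/* 2 full of 1232, last 536 */
    	return 0;
    }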
* * We must build the non-fragmented part that * will be in every packet... this also means @@ -222,11 +275,11 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, */ unfrag_len = sizeof(struct ipv6hdr) + sizeof(struct frag_hdr); - payl_len = length; + last_len = length; if (opt) { unfrag_len += opt->opt_nflen; - payl_len += opt->opt_flen; + last_len += opt->opt_flen; } /* @@ -235,9 +288,13 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, * "integer multiple of 8 octects". */ - frag_len = (dst->pmtu - unfrag_len) & ~0x7; + frag_len = (mtu - unfrag_len) & ~0x7; - nfrags = payl_len / frag_len; + /* Unfragmentable part exceeds mtu. */ + if (frag_len <= 0) + return -EMSGSIZE; + + nfrags = last_len / frag_len; /* * We must send from end to start because of @@ -250,13 +307,25 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, * might be a good idea. */ - last_len = payl_len - (nfrags * frag_len); + frag_off = nfrags * frag_len; + last_len -= frag_off; if (last_len == 0) { last_len = frag_len; + frag_off -= frag_len; nfrags--; } - + data_off = frag_off; + + /* And it is implementation problem: for now we assume, that + all the exthdrs will fit to the first fragment. + */ + if (opt) { + if (frag_len < opt->opt_flen) + return -EMSGSIZE; + data_off = frag_off - opt->opt_flen; + } + last_skb = sock_alloc_send_skb(sk, unfrag_len + frag_len + dst->dev->hard_header_len + 15, 0, flags & MSG_DONTWAIT, &err); @@ -267,41 +336,17 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, last_skb->dst = dst_clone(dst); skb_reserve(last_skb, (dst->dev->hard_header_len + 15) & ~15); - - hdr = (struct ipv6hdr *) skb_put(last_skb, sizeof(struct ipv6hdr)); - last_skb->nh.ipv6h = hdr; - hdr->version = 6; - hdr->priority = np->priority; - - memcpy(hdr->flow_lbl, &np->flow_lbl, 3); - hdr->payload_len = htons(unfrag_len + frag_len - sizeof(struct ipv6hdr)); + hdr = ip6_bld_1(sk, last_skb, fl, hlimit, frag_len+unfrag_len); + prev_hdr = &hdr->nexthdr; - hdr->hop_limit = hlimit; + if (opt && opt->opt_nflen) + prev_hdr = ipv6_build_nfrag_opts(last_skb, prev_hdr, opt, final_dst, 0); - hdr->nexthdr = NEXTHDR_FRAGMENT; + prev_hdr = ipv6_build_fraghdr(last_skb, prev_hdr, frag_off); + fhdr_dist = prev_hdr - last_skb->data; - ipv6_addr_copy(&hdr->saddr, fl->nl_u.ip6_u.saddr); - ipv6_addr_copy(&hdr->daddr, fl->nl_u.ip6_u.daddr); - -#if 0 - if (opt && opt->srcrt) { - hdr->nexthdr = ipv6opt_bld_rthdr(last_skb, opt, daddr, - NEXTHDR_FRAGMENT); - } -#endif - - fhdr = (struct frag_hdr *) skb_put(last_skb, sizeof(struct frag_hdr)); - memset(fhdr, 0, sizeof(struct frag_hdr)); - - fhdr->nexthdr = fl->proto; - fhdr->frag_off = ntohs(nfrags * frag_len); - fhdr->identification = ipv6_fragmentation_id++; - - fhdr_dist = (unsigned char *) fhdr - last_skb->data; - - err = getfrag(data, &hdr->saddr, last_skb->tail, nfrags * frag_len, - last_len); + err = getfrag(data, &hdr->saddr, last_skb->tail, data_off, last_len); if (!err) { while (nfrags--) { @@ -309,58 +354,60 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, struct frag_hdr *fhdr2; -#if 0 - printk(KERN_DEBUG "sending frag %d\n", nfrags); -#endif skb = skb_copy(last_skb, sk->allocation); - if (skb == NULL) + if (skb == NULL) { + ipv6_statistics.Ip6FragFails++; + kfree_skb(last_skb); return -ENOMEM; + } + frag_off -= frag_len; + data_off -= frag_len; + fhdr2 = (struct frag_hdr *) (skb->data + fhdr_dist); /* more flag on */ - fhdr2->frag_off = ntohs(nfrags * frag_len + 1); + fhdr2->frag_off = htons(frag_off 
| 1); - /* - * FIXME: - * if (nfrags == 0) - * put rest of headers - */ + /* Write fragmentable exthdrs to the first chunk */ + if (nfrags == 0 && opt && opt->opt_flen) { + ipv6_build_frag_opts(skb, &fhdr2->nexthdr, opt); + frag_len -= opt->opt_flen; + data_off = 0; + } err = getfrag(data, &hdr->saddr,skb_put(skb, frag_len), - nfrags * frag_len, frag_len); + data_off, frag_len); if (err) { kfree_skb(skb); break; } + ipv6_statistics.Ip6FragCreates++; ipv6_statistics.Ip6OutRequests++; dst->output(skb); } } if (err) { + ipv6_statistics.Ip6FragFails++; kfree_skb(last_skb); return -EFAULT; } -#if 0 - printk(KERN_DEBUG "sending last frag \n"); -#endif - - hdr->payload_len = htons(unfrag_len + last_len - - sizeof(struct ipv6hdr)); + hdr->payload_len = htons(unfrag_len + last_len - sizeof(struct ipv6hdr)); /* * update last_skb to reflect the getfrag we did * on start. */ - - last_skb->tail += last_len; - last_skb->len += last_len; + skb_put(last_skb, last_len); + + ipv6_statistics.Ip6FragCreates++; + ipv6_statistics.Ip6FragOKs++; ipv6_statistics.Ip6OutRequests++; dst->output(last_skb); @@ -369,42 +416,71 @@ static int ip6_frag_xmit(struct sock *sk, inet_getfrag_t getfrag, int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, struct flowi *fl, unsigned length, - struct ipv6_options *opt, int hlimit, int flags) + struct ipv6_txoptions *opt, int hlimit, int flags) { struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; struct in6_addr *final_dst = NULL; struct dst_entry *dst; - int pktlength; int err = 0; - + unsigned int pktlength, jumbolen, mtu; + if (opt && opt->srcrt) { struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; final_dst = fl->nl_u.ip6_u.daddr; fl->nl_u.ip6_u.daddr = rt0->addr; } - dst = NULL; - if (!fl->oif && ipv6_addr_is_multicast(fl->nl_u.ip6_u.daddr)) fl->oif = np->mcast_oif; - - if (sk->dst_cache) + + dst = NULL; + if (sk->dst_cache) { dst = dst_check(&sk->dst_cache, np->dst_cookie); + if (dst) { + struct rt6_info *rt = (struct rt6_info*)dst_clone(dst); + + /* Yes, checking route validity in not connected + case is not very simple. Take into account, + that we do not support routing by source, TOS, + and MSG_DONTROUTE --ANK (980726) + + 1. If route was host route, check that + cached destination is current. + If it is network route, we still may + check its validity using saved pointer + to the last used address: daddr_cache. + We do not want to save whole address now, + (because main consumer of this service + is tcp, which has not this problem), + so that the last trick works only on connected + sockets. + 2. oif also should be the same. 
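A short aside on the `htons(frag_off | 1)` just above: the Fragment header stores the offset in 8-octet units in its top 13 bits and the M ("more fragments") flag in bit 0. Because frag_off here is a byte offset that is already a multiple of 8, shifting the unit count left by 3 reproduces the byte offset, so the byte offset can simply be or-ed with the M bit. A tiny check:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
    	uint16_t byte_off = 1232;		/* always a multiple of 8 here          */
    	uint16_t units    = byte_off / 8;	/* offset as the wire format counts it  */

    	/* top 13 bits: offset in 8-octet units; bit 0: M (more fragments) */
    	assert((uint16_t)((units << 3) | 1) == (uint16_t)(byte_off | 1));
    	return 0;
    }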
+ */ + if (((rt->rt6i_dst.plen != 128 || + ipv6_addr_cmp(fl->fl6_dst, &rt->rt6i_dst.addr)) + && (np->daddr_cache == NULL || + ipv6_addr_cmp(fl->fl6_dst, np->daddr_cache))) + || (fl->oif && fl->oif != dst->dev->ifindex)) { + dst_release(dst); + dst = NULL; + } + } + } if (dst == NULL) dst = ip6_route_output(sk, fl); if (dst->error) { ipv6_statistics.Ip6OutNoRoutes++; - err = -ENETUNREACH; - goto out; + dst_release(dst); + return -ENETUNREACH; } if (fl->nl_u.ip6_u.saddr == NULL) { struct inet6_ifaddr *ifa; ifa = ipv6_get_saddr(dst, fl->nl_u.ip6_u.daddr); - + if (ifa == NULL) { #if IP6_DEBUG >= 2 printk(KERN_DEBUG "ip6_build_xmit: " @@ -415,7 +491,6 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, } fl->nl_u.ip6_u.saddr = &ifa->addr; } - pktlength = length; if (hlimit < 0) { @@ -427,29 +502,38 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, hlimit = ((struct rt6_info*)dst)->rt6i_hoplimit; } + jumbolen = 0; + if (!sk->ip_hdrincl) { pktlength += sizeof(struct ipv6hdr); if (opt) pktlength += opt->opt_flen + opt->opt_nflen; - /* Due to conservative check made by caller, - pktlength cannot overflow here. - - When (and if) jumbo option will be implemented - we could try soemething sort of: + if (pktlength > 0xFFFF + sizeof(struct ipv6hdr)) { + /* Jumbo datagram. + It is assumed, that in the case of sk->ip_hdrincl + jumbo option is supplied by user. + */ + pktlength += 8; + jumbolen = pktlength - sizeof(struct ipv6hdr); + } + } - if (pktlength < length) return -EMSGSIZE; + mtu = dst->pmtu; - */ - } + /* Critical arithmetic overflow check. + FIXME: may gcc optimize it out? --ANK (980726) + */ + if (pktlength < length) + return -EMSGSIZE; - if (pktlength <= dst->pmtu) { + if (pktlength <= mtu) { struct sk_buff *skb; struct ipv6hdr *hdr; - struct device *dev; + struct device *dev = dst->dev; skb = sock_alloc_send_skb(sk, pktlength + 15 + - dst->dev->hard_header_len, 0, + dev->hard_header_len, 0, flags & MSG_DONTWAIT, &err); if (skb == NULL) { @@ -457,7 +541,6 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, goto out; } - dev = dst->dev; skb->dst = dst_clone(dst); skb_reserve(skb, (dev->hard_header_len + 15) & ~15); @@ -466,23 +549,22 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, skb->nh.ipv6h = hdr; if (!sk->ip_hdrincl) { - ip6_bld_1(sk, skb, fl, hlimit, pktlength); -#if 0 - if (opt && opt->srcrt) { - hdr->nexthdr = ipv6opt_bld_rthdr(skb, opt, - final_dst, - fl->proto); + ip6_bld_1(sk, skb, fl, hlimit, + jumbolen ? sizeof(struct ipv6hdr) : pktlength); + + if (opt || jumbolen) { + u8 *prev_hdr = &hdr->nexthdr; + prev_hdr = ipv6_build_nfrag_opts(skb, prev_hdr, opt, final_dst, jumbolen); + if (opt && opt->opt_flen) + ipv6_build_frag_opts(skb, prev_hdr, opt); } - else -#endif - hdr->nexthdr = fl->proto; } skb_put(skb, length); err = getfrag(data, &hdr->saddr, ((char *) hdr) + (pktlength - length), 0, length); - + if (!err) { ipv6_statistics.Ip6OutRequests++; dst->output(skb); @@ -491,32 +573,18 @@ int ip6_build_xmit(struct sock *sk, inet_getfrag_t getfrag, const void *data, kfree_skb(skb); } } else { - if (sk->ip_hdrincl) + if (sk->ip_hdrincl || jumbolen) return -EMSGSIZE; - /* pktlength includes IPv6 header, not included - in IPv6 payload length. - FIXME are non-fragmentable options included - in packet after defragmentation? If not, we - should subtract opt_nflen also. 
--ANK - */ - if (pktlength > 0xFFFF + sizeof(struct ipv6hdr)) - return -EMSGSIZE; - - err = ip6_frag_xmit(sk, getfrag, data, dst, fl, opt, hlimit, - flags, length); + err = ip6_frag_xmit(sk, getfrag, data, dst, fl, opt, final_dst, hlimit, + flags, length, mtu); } - + /* * cleanup */ - out: - - if (sk->dst_cache) - ip6_dst_store(sk, dst); - else - dst_release(dst); - +out: + ip6_dst_store(sk, dst, fl->nl_u.ip6_u.daddr == &np->daddr ? &np->daddr : NULL); return err; } @@ -530,20 +598,15 @@ int ip6_call_ra_chain(struct sk_buff *skb, int sel) if (sk && ra->sel == sel) { if (last) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); - if (skb2) { - skb2->sk = last; - rawv6_rcv(skb2, skb2->dev, &skb2->nh.ipv6h->saddr, - &skb2->nh.ipv6h->daddr, NULL, skb2->len); - } + if (skb2) + rawv6_rcv(last, skb2, skb2->len); } last = sk; } } if (last) { - skb->sk = last; - rawv6_rcv(skb, skb->dev, &skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr, NULL, skb->len); + rawv6_rcv(last, skb, skb->len); return 1; } return 0; @@ -553,24 +616,16 @@ int ip6_forward(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; struct ipv6hdr *hdr = skb->nh.ipv6h; - int size; + struct inet6_skb_parm *opt =(struct inet6_skb_parm*)skb->cb; - if (ipv6_devconf.forwarding == 0) + if (ipv6_devconf.forwarding == 0 && opt->srcrt == 0) goto drop; /* - * check hop-by-hop options present - */ - /* - * Note, that NEXTHDR_HOP header must be checked - * always at the most beginning of ipv6_rcv. - * The result should be saved somewhere, but - * we do not it for now. Alas. Let's do it here. --ANK - * - * Second note: we DO NOT make any processing on + * We DO NOT make any processing on * RA packets, pushing them to user level AS IS - * without ane WARRANTY that application will able - * to interpret them. The reson is that we + * without ane WARRANTY that application will be able + * to interpret them. The reason is that we * cannot make anything clever here. * * We are not end-node, so that if packet contains @@ -579,42 +634,9 @@ int ip6_forward(struct sk_buff *skb) * cannot be fragmented, because there is no warranty * that different fragments will go along one path. --ANK */ - if (hdr->nexthdr == NEXTHDR_HOP) { - int ra_value = -1; - u8 *ptr = (u8*)(skb->nh.ipv6h+1); - int len = (ptr[1]+1)<<3; - - if (len + sizeof(struct ipv6hdr) > skb->len) - goto drop; - - ptr += 2; - len -= 2; - while (len > 0) { - u8 *opt; - int optlen; - - if (ptr[0] == 0) { - len--; - ptr++; - continue; - } - opt = ptr; - optlen = ptr[1]+1; - - len -= optlen; - ptr += optlen; - if (len < 0) - goto drop; - - if (opt[0] == 20) { - /* Router Alert as of draft-ietf-ipngwg-ipv6router-alert-04 */ - if (optlen < 4) - goto drop; - ra_value = opt[2] + (opt[3]<<8); - } else if (!ip6_dstopt_unknown(skb, (struct ipv6_tlvtype*)opt)) - goto drop; - } - if (ra_value>=0 && ip6_call_ra_chain(skb, ra_value)) + if (opt->ra) { + u8 *ptr = skb->nh.raw + opt->ra; + if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3])) return 0; } @@ -622,6 +644,8 @@ int ip6_forward(struct sk_buff *skb) * check and decrement ttl */ if (hdr->hop_limit <= 1) { + /* Force OUTPUT device used as source address */ + skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0, skb->dev); @@ -629,9 +653,10 @@ int ip6_forward(struct sk_buff *skb) return -ETIMEDOUT; } - hdr->hop_limit--; - - if (skb->dev == dst->dev && dst->neighbour) { + /* IPv6 specs say nothing about it, but it is clear that we cannot + send redirects to source routed frames. 
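Taken together, ip6_forward() now applies its checks in a fixed order before touching the packet: hop limit, unforwardable source addresses, path MTU, and only then (after the skb copy-on-write) the hop-limit decrement. A compressed sketch of that decision ladder, with an illustrative struct rather than the real skb fields:

    enum fwd_verdict { FWD_OK, FWD_HOP_LIMIT, FWD_ADDR_ERROR, FWD_TOO_BIG };

    struct fwd_pkt {
    	unsigned hop_limit;
    	unsigned len;
    	int      bad_source;	/* multicast, loopback or link-local saddr */
    };

    static enum fwd_verdict forward_check(struct fwd_pkt *p, unsigned pmtu)
    {
    	if (p->hop_limit <= 1)
    		return FWD_HOP_LIMIT;	/* ICMPv6 time exceeded   */
    	if (p->bad_source)
    		return FWD_ADDR_ERROR;	/* never forwarded        */
    	if (p->len > pmtu)
    		return FWD_TOO_BIG;	/* ICMPv6 packet too big  */
    	p->hop_limit--;			/* only once we know the packet goes out */
    	return FWD_OK;
    }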
+ */ + if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) { struct in6_addr *target = NULL; struct rt6_info *rt; struct neighbour *n = dst->neighbour; @@ -647,30 +672,40 @@ int ip6_forward(struct sk_buff *skb) else target = &hdr->daddr; - ndisc_send_redirect(skb, dst->neighbour, target); + /* Limit redirects both by destination (here) + and by source (inside ndisc_send_redirect) + */ + if (xrlim_allow(dst, 1*HZ)) + ndisc_send_redirect(skb, n, target); + } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK + |IPV6_ADDR_LINKLOCAL)) { + /* This check is security critical. */ + goto drop; } - - size = sizeof(struct ipv6hdr) + ntohs(hdr->payload_len); - if (size > dst->pmtu) { + if (skb->len > dst->pmtu) { + /* Again, force OUTPUT device used as source address */ + skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst->pmtu, skb->dev); + ipv6_statistics.Ip6InTooBigErrors++; kfree_skb(skb); return -EMSGSIZE; } - if (skb_headroom(skb) < dst->dev->hard_header_len || skb_cloned(skb)) { - struct sk_buff *skb2; - skb2 = skb_realloc_headroom(skb, (dst->dev->hard_header_len + 15)&~15); - kfree_skb(skb); - skb = skb2; - } + if ((skb = skb_cow(skb, dst->dev->hard_header_len)) == NULL) + return 0; - ipv6_statistics.Ip6ForwDatagrams++; - dst->output(skb); + hdr = skb->nh.ipv6h; - return 0; + /* Mangling hops number delayed to point after skb COW */ + + hdr->hop_limit--; + + ipv6_statistics.Ip6OutForwDatagrams++; + return dst->output(skb); drop: + ipv6_statistics.Ip6InAddrErrors++; kfree_skb(skb); return -EINVAL; } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index b31c07c00..a246b996b 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -7,7 +7,7 @@ * * Based on linux/net/ipv4/ip_sockglue.c * - * $Id: ipv6_sockglue.c,v 1.22 1998/07/15 05:05:39 davem Exp $ + * $Id: ipv6_sockglue.c,v 1.23 1998/08/26 12:05:04 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -110,7 +110,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; - int val, err; + int val, valbool; int retv = -ENOPROTOOPT; if(level==SOL_IP && sk->type != SOCK_RAW) @@ -119,19 +119,20 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, if(level!=SOL_IPV6) goto out; - if (optval == NULL) { + if (optval == NULL) val=0; - } else { - err = get_user(val, (int *) optval); - if(err) - return err; - } - + else if (get_user(val, (int *) optval)) + return -EFAULT; + + valbool = (val!=0); switch (optname) { case IPV6_ADDRFORM: if (val == PF_INET) { + struct ipv6_txoptions *opt; + struct sk_buff *pktopt; + if (sk->protocol != IPPROTO_UDP && sk->protocol != IPPROTO_TCP) goto out; @@ -140,7 +141,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, retv = ENOTCONN; goto out; } - + if (!(ipv6_addr_type(&np->daddr) & IPV6_ADDR_MAPPED)) { retv = -EADDRNOTAVAIL; goto out; @@ -153,10 +154,17 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, tp->af_specific = &ipv4_specific; sk->socket->ops = &inet_stream_ops; sk->family = PF_INET; + tcp_sync_mss(sk, tp->pmtu_cookie); } else { sk->prot = &udp_prot; sk->socket->ops = &inet_dgram_ops; } + opt = xchg(&np->opt, NULL); + if (opt) + sock_kfree_s(sk, opt, opt->tot_len); + pktopt = xchg(&np->pktoptions, NULL); + if (pktopt) + kfree_skb(pktopt); retv = 0; } else { retv = -EINVAL; @@ 
-164,15 +172,85 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, break; case IPV6_PKTINFO: - np->rxinfo = val; + np->rxopt.bits.rxinfo = valbool; retv = 0; break; case IPV6_HOPLIMIT: - np->rxhlim = val; + np->rxopt.bits.rxhlim = valbool; + retv = 0; + break; + + case IPV6_RTHDR: + retv = -EINVAL; + if (val >= 0 && val <= 2) { + np->rxopt.bits.srcrt = val; + retv = 0; + } + break; + + case IPV6_HOPOPTS: + np->rxopt.bits.hopopts = valbool; + retv = 0; + break; + + case IPV6_AUTHHDR: + np->rxopt.bits.authhdr = valbool; retv = 0; break; + case IPV6_DSTOPTS: + np->rxopt.bits.dstopts = valbool; + retv = 0; + break; + + case IPV6_PKTOPTIONS: + { + struct ipv6_txoptions *opt = NULL; + struct msghdr msg; + int junk; + struct in6_addr *saddr; + + if (optlen == 0) + goto update; + + opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL); + retv = -ENOBUFS; + if (opt == NULL) + break; + + memset(opt, 0, sizeof(*opt)); + opt->tot_len = sizeof(*opt) + optlen; + retv = -EFAULT; + if (copy_from_user(opt+1, optval, optlen)) + goto done; + + msg.msg_controllen = optlen; + msg.msg_control = (void*)(opt+1); + + retv = datagram_send_ctl(&msg, &junk, &saddr, opt, &junk); + if (retv) + goto done; +update: + retv = 0; + start_bh_atomic(); + if (opt && sk->type == SOCK_STREAM) { + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + if ((tcp_connected(sk->state) || sk->state == TCP_SYN_SENT) + && sk->daddr != LOOPBACK4_IPV6) { + tp->ext_header_len = opt->opt_flen + opt->opt_nflen; + tcp_sync_mss(sk, tp->pmtu_cookie); + } + } + opt = xchg(&np->opt, opt); + dst_release(xchg(&sk->dst_cache, NULL)); + end_bh_atomic(); + +done: + if (opt) + sock_kfree_s(sk, opt, opt->tot_len); + break; + } case IPV6_UNICAST_HOPS: if (val > 255 || val < -1) retv = -EINVAL; @@ -190,10 +268,9 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, retv = 0; } break; - break; case IPV6_MULTICAST_LOOP: - np->mc_loop = (val != 0); + np->mc_loop = valbool; retv = 0; break; @@ -229,12 +306,10 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, char *optval, case IPV6_DROP_MEMBERSHIP: { struct ipv6_mreq mreq; - int err; - err = copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq)); - if(err) + if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq))) return -EFAULT; - + if (optname == IPV6_ADD_MEMBERSHIP) retv = ipv6_sock_mc_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); else @@ -253,10 +328,44 @@ out: int ipv6_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen) { + struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; + int len; + if(level==SOL_IP && sk->type != SOCK_RAW) return udp_prot.getsockopt(sk, level, optname, optval, optlen); if(level!=SOL_IPV6) return -ENOPROTOOPT; + if (get_user(len, optlen)) + return -EFAULT; + switch (optname) { + case IPV6_PKTOPTIONS: + { + struct msghdr msg; + struct sk_buff *skb; + + start_bh_atomic(); + skb = np->pktoptions; + if (skb) + atomic_inc(&skb->users); + end_bh_atomic(); + + if (skb) { + int err; + + msg.msg_control = optval; + msg.msg_controllen = len; + msg.msg_flags = 0; + err = datagram_recv_ctl(sk, &msg, skb); + kfree_skb(skb); + if (err) + return err; + len -= msg.msg_controllen; + } else + len = 0; + return put_user(len, optlen); + } + default: + } return -EINVAL; } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index c50f37fcf..88950481e 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: mcast.c,v 1.16 1998/05/07 15:43:10 
davem Exp $ + * $Id: mcast.c,v 1.17 1998/08/26 12:05:06 davem Exp $ * * Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c * @@ -79,7 +79,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr) if (!(ipv6_addr_type(addr) & IPV6_ADDR_MULTICAST)) return -EINVAL; - mc_lst = kmalloc(sizeof(struct ipv6_mc_socklist), GFP_KERNEL); + mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL); if (mc_lst == NULL) return -ENOMEM; @@ -91,13 +91,15 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr) if (ifindex == 0) { struct rt6_info *rt; rt = rt6_lookup(addr, NULL, 0, 0); - if (rt) + if (rt) { dev = rt->rt6i_dev; + dst_release(&rt->u.dst); + } } else dev = dev_get_by_index(ifindex); if (dev == NULL) { - kfree(mc_lst); + sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return -ENODEV; } @@ -108,7 +110,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr) err = ipv6_dev_mc_inc(dev, addr); if (err) { - kfree(mc_lst); + sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return err; } @@ -133,7 +135,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, struct in6_addr *addr) *lnk = mc_lst->next; if ((dev = dev_get_by_index(ifindex)) != NULL) ipv6_dev_mc_dec(dev, &mc_lst->addr); - kfree(mc_lst); + sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return 0; } } @@ -153,7 +155,7 @@ void ipv6_sock_mc_close(struct sock *sk) ipv6_dev_mc_dec(dev, &mc_lst->addr); np->ipv6_mc_list = mc_lst->next; - kfree(mc_lst); + sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); } } @@ -308,11 +310,19 @@ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) { unsigned long delay = resptime; + /* Do not start timer for addresses with link/host scope */ + if (ipv6_addr_type(&ma->mca_addr)&(IPV6_ADDR_LINKLOCAL|IPV6_ADDR_LOOPBACK)) + return; + if (del_timer(&ma->mca_timer)) delay = ma->mca_timer.expires - jiffies; - if (delay >= resptime) - delay = net_random() % resptime; + if (delay >= resptime) { + if (resptime) + delay = net_random() % resptime; + else + delay = 1; + } ma->mca_flags |= MAF_TIMER_RUNNING; ma->mca_timer.expires = jiffies + delay; @@ -325,10 +335,16 @@ int igmp6_event_query(struct sk_buff *skb, struct icmp6hdr *hdr, int len) struct in6_addr *addrp; unsigned long resptime; - if (len < sizeof(struct icmp6hdr) + sizeof(struct ipv6hdr)) + if (len < sizeof(struct icmp6hdr) + sizeof(struct in6_addr)) return -EINVAL; - resptime = hdr->icmp6_maxdelay; + /* Drop queries with not link local source */ + if (!(ipv6_addr_type(&skb->nh.ipv6h->saddr)&IPV6_ADDR_LINKLOCAL)) + return -EINVAL; + + resptime = ntohs(hdr->icmp6_maxdelay); + /* Translate milliseconds to jiffies */ + resptime = (resptime<<10)/(1024000/HZ); addrp = (struct in6_addr *) (hdr + 1); @@ -365,7 +381,15 @@ int igmp6_event_report(struct sk_buff *skb, struct icmp6hdr *hdr, int len) struct device *dev; int hash; - if (len < sizeof(struct icmp6hdr) + sizeof(struct ipv6hdr)) + /* Our own report looped back. Ignore it. 
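The `(resptime<<10)/(1024000/HZ)` conversion added in igmp6_event_query() is worth unpacking: icmp6_maxdelay is in milliseconds, and the expression is milliseconds-to-jiffies written with a power-of-two multiply, equal to ms*HZ/1000 up to integer rounding. A quick check with an illustrative HZ:

    #include <assert.h>

    #define HZ 100	/* illustrative clock tick rate */

    int main(void)
    {
    	unsigned long ms = 10000;	/* a 10 s maximum response delay */

    	unsigned long a = (ms << 10) / (1024000 / HZ);	/* the kernel's form   */
    	unsigned long b = ms * HZ / 1000;		/* plain ms-to-jiffies */

    	assert(a == b);		/* both 1000 ticks with these values */
    	return 0;
    }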
*/ + if (skb->pkt_type == PACKET_LOOPBACK) + return 0; + + if (len < sizeof(struct icmp6hdr) + sizeof(struct in6_addr)) + return -EINVAL; + + /* Drop reports with not link local source */ + if (!(ipv6_addr_type(&skb->nh.ipv6h->saddr)&IPV6_ADDR_LINKLOCAL)) return -EINVAL; addrp = (struct in6_addr *) (hdr + 1); @@ -399,14 +423,25 @@ void igmp6_send(struct in6_addr *addr, struct device *dev, int type) struct sk_buff *skb; struct icmp6hdr *hdr; struct inet6_ifaddr *ifp; - struct in6_addr *addrp; - int err, len, plen; + struct in6_addr *snd_addr; + struct in6_addr *addrp; + struct in6_addr all_routers; + int err, len, payload_len, full_len; + u8 ra[8] = { IPPROTO_ICMPV6, 0, + IPV6_TLV_ROUTERALERT, 0, 0, 0, + IPV6_TLV_PADN, 0 }; + + snd_addr = addr; + if (type == ICMPV6_MGM_REDUCTION) { + snd_addr = &all_routers; + ipv6_addr_all_routers(&all_routers); + } len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); + payload_len = len + sizeof(ra); + full_len = sizeof(struct ipv6hdr) + payload_len; - plen = sizeof(struct ipv6hdr) + len; - - skb = sock_alloc_send_skb(sk, dev->hard_header_len + plen + 15, 0, 0, &err); + skb = sock_alloc_send_skb(sk, dev->hard_header_len + full_len + 15, 0, 0, &err); if (skb == NULL) return; @@ -414,8 +449,8 @@ void igmp6_send(struct in6_addr *addr, struct device *dev, int type) skb_reserve(skb, (dev->hard_header_len + 15) & ~15); if (dev->hard_header) { unsigned char ha[MAX_ADDR_LEN]; - ndisc_mc_map(addr, ha, dev, 1); - dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, plen); + ndisc_mc_map(snd_addr, ha, dev, 1); + dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, full_len); } ifp = ipv6_get_lladdr(dev); @@ -428,11 +463,9 @@ void igmp6_send(struct in6_addr *addr, struct device *dev, int type) return; } - ip6_nd_hdr(sk, skb, dev, &ifp->addr, addr, IPPROTO_ICMPV6, len); + ip6_nd_hdr(sk, skb, dev, &ifp->addr, snd_addr, NEXTHDR_HOP, payload_len); - /* - * need hop-by-hop router alert option. 
- */ + memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra)); hdr = (struct icmp6hdr *) skb_put(skb, sizeof(struct icmp6hdr)); memset(hdr, 0, sizeof(struct icmp6hdr)); @@ -441,11 +474,16 @@ void igmp6_send(struct in6_addr *addr, struct device *dev, int type) addrp = (struct in6_addr *) skb_put(skb, sizeof(struct in6_addr)); ipv6_addr_copy(addrp, addr); - hdr->icmp6_cksum = csum_ipv6_magic(&ifp->addr, addr, len, + hdr->icmp6_cksum = csum_ipv6_magic(&ifp->addr, snd_addr, len, IPPROTO_ICMPV6, csum_partial((__u8 *) hdr, len, 0)); dev_queue_xmit(skb); + if (type == ICMPV6_MGM_REDUCTION) + icmpv6_statistics.Icmp6OutGroupMembReductions++; + else + icmpv6_statistics.Icmp6OutGroupMembResponses++; + icmpv6_statistics.Icmp6OutMsgs++; } static void igmp6_join_group(struct ifmcaddr6 *ma) @@ -455,7 +493,7 @@ static void igmp6_join_group(struct ifmcaddr6 *ma) addr_type = ipv6_addr_type(&ma->mca_addr); - if ((addr_type & IPV6_ADDR_LINKLOCAL)) + if ((addr_type & (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_LOOPBACK))) return; igmp6_send(&ma->mca_addr, ma->dev, ICMPV6_MGM_REPORT); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 26e42a1ed..b6c855a59 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -68,8 +68,7 @@ #include <net/ndisc.h> #include <net/ip6_route.h> #include <net/addrconf.h> - - +#include <net/icmp.h> #include <net/checksum.h> #include <linux/proc_fs.h> @@ -350,6 +349,9 @@ void ndisc_send_na(struct device *dev, struct neighbour *neigh, len, 0)); dev_queue_xmit(skb); + + icmpv6_statistics.Icmp6OutNeighborAdvertisements++; + icmpv6_statistics.Icmp6OutMsgs++; } void ndisc_send_ns(struct device *dev, struct neighbour *neigh, @@ -410,6 +412,9 @@ void ndisc_send_ns(struct device *dev, struct neighbour *neigh, len, 0)); /* send it! */ dev_queue_xmit(skb); + + icmpv6_statistics.Icmp6OutNeighborSolicits++; + icmpv6_statistics.Icmp6OutMsgs++; } void ndisc_send_rs(struct device *dev, struct in6_addr *saddr, @@ -458,6 +463,9 @@ void ndisc_send_rs(struct device *dev, struct in6_addr *saddr, /* send it! */ dev_queue_xmit(skb); + + icmpv6_statistics.Icmp6OutRouterSolicits++; + icmpv6_statistics.Icmp6OutMsgs++; } @@ -575,6 +583,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) if (rt && lifetime == 0) { ip6_del_rt(rt); + dst_release(&rt->u.dst); rt = NULL; } @@ -582,11 +591,6 @@ static void ndisc_router_discovery(struct sk_buff *skb) ND_PRINTK2("ndisc_rdisc: adding default router\n"); rt = rt6_add_dflt_router(&skb->nh.ipv6h->saddr, skb->dev); - -#if 1 - /* BUGGGGG! Previous routine can return invalid pointer. */ - rt = rt6_get_dflt_router(&skb->nh.ipv6h->saddr, skb->dev); -#endif if (rt == NULL) { ND_PRINTK1("route_add failed\n"); return; @@ -595,6 +599,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) neigh = rt->rt6i_nexthop; if (neigh == NULL) { ND_PRINTK1("nd: add default router: null neighbour\n"); + dst_release(&rt->u.dst); return; } neigh->flags |= NTF_ROUTER; @@ -658,7 +663,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) mtu = htonl(*(__u32 *)(opt+4)); - if (mtu < 576 || mtu > skb->dev->mtu) { + if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) { ND_PRINTK0("NDISC: router " "announcement with mtu = %d\n", mtu); @@ -671,10 +676,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) if (rt) rt->u.dst.pmtu = mtu; - /* BUGGG... 
Scan routing tables and - adjust mtu on routes going - via this device - */ + rt6_mtu_change(skb->dev, mtu); } } break; @@ -689,6 +691,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) optlen -= len; opt += len; } + if (rt) + dst_release(&rt->u.dst); } static void ndisc_redirect_rcv(struct sk_buff *skb) @@ -698,7 +702,6 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) struct in6_addr *dest; struct in6_addr *target; /* new first hop to destination */ struct neighbour *neigh; - struct rt6_info *rt; int on_link = 0; int optlen; @@ -740,20 +743,21 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) if (!in6_dev || in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) return; - /* passed validation tests + /* passed validation tests */ - NOTE We should not install redirect if sender did not supply - ll address on link, which requires it. It would break, if - we have non-transitive address resolution protocol. - Fix it later. --ANK + /* + We install redirect only if nexthop state is valid. */ - rt = rt6_redirect(dest, &skb->nh.ipv6h->saddr, target, skb->dev, on_link); - - if (rt == NULL) - return; - neigh = rt->rt6i_nexthop; - ndisc_update(neigh, (u8*)(dest + 1), optlen, ND_OPT_TARGET_LL_ADDR); + neigh = __neigh_lookup(&nd_tbl, target, skb->dev, 1); + if (neigh) { + ndisc_update(neigh, (u8*)(dest + 1), optlen, ND_OPT_TARGET_LL_ADDR); + if (neigh->nud_state&NUD_VALID) + rt6_redirect(dest, &skb->nh.ipv6h->saddr, neigh, on_link); + else + __neigh_event_send(neigh, NULL); + neigh_release(neigh); + } } void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, @@ -773,17 +777,21 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, int hlen; dev = skb->dev; - rt = rt6_lookup(&skb->nh.ipv6h->saddr, NULL, dev->ifindex, 0); + rt = rt6_lookup(&skb->nh.ipv6h->saddr, NULL, dev->ifindex, 1); - if (rt == NULL || rt->u.dst.error) { - ND_PRINTK1("ndisc_send_redirect: hostunreach\n"); + if (rt == NULL) return; - } if (rt->rt6i_flags & RTF_GATEWAY) { ND_PRINTK1("ndisc_send_redirect: not a neighbour\n"); + dst_release(&rt->u.dst); return; } + if (!xrlim_allow(&rt->u.dst, 1*HZ)) { + dst_release(&rt->u.dst); + return; + } + dst_release(&rt->u.dst); if (dev->addr_len) { if (neigh->nud_state&NUD_VALID) { @@ -797,7 +805,7 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, } } - rd_len = min(536 - len, ntohs(skb->nh.ipv6h->payload_len) + 8); + rd_len = min(IPV6_MIN_MTU-sizeof(struct ipv6hdr)-len, ntohs(skb->nh.ipv6h->payload_len) + 8); rd_len &= ~0x7; len += rd_len; @@ -814,14 +822,14 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, ND_PRINTK1("ndisc_send_redirect: alloc_skb failed\n"); return; } - + hlen = 0; if (ndisc_build_ll_hdr(buff, dev, &skb->nh.ipv6h->saddr, NULL, len) == 0) { kfree_skb(buff); return; } - + ip6_nd_hdr(sk, buff, dev, &ifp->addr, &skb->nh.ipv6h->saddr, IPPROTO_ICMPV6, len); @@ -838,9 +846,9 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, ipv6_addr_copy(addrp, target); addrp++; ipv6_addr_copy(addrp, &skb->nh.ipv6h->daddr); - + opt = (u8*) (addrp + 1); - + /* * include target_address option */ @@ -858,12 +866,15 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, opt += 6; memcpy(opt, &skb->nh.ipv6h, rd_len - 8); - + icmph->icmp6_cksum = csum_ipv6_magic(&ifp->addr, &skb->nh.ipv6h->saddr, len, IPPROTO_ICMPV6, csum_partial((u8 *) icmph, len, 0)); dev_queue_xmit(buff); + + icmpv6_statistics.Icmp6OutRedirects++; + icmpv6_statistics.Icmp6OutMsgs++; } 
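The ipv6_sockglue.c and mcast.c hunks above rework per-socket multicast membership handling: allocations move to sock_kmalloc()/sock_kfree_s(), an unknown interface yields -ENODEV, and an ifindex of 0 lets the kernel pick the device from a route lookup. As a rough userspace sketch of the call path those hunks service (illustrative only, not part of the patch; the field name ipv6mr_interface follows the glibc headers, while the kernel structure in the hunk calls it ipv6mr_ifindex):

#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

/* Join the all-nodes group ff02::1 on the given interface index.
 * Returns the socket on success, -1 on error. */
int join_all_nodes(unsigned int ifindex)
{
        struct ipv6_mreq mreq;
        int fd = socket(AF_INET6, SOCK_DGRAM, 0);

        if (fd < 0)
                return -1;

        memset(&mreq, 0, sizeof(mreq));
        inet_pton(AF_INET6, "ff02::1", &mreq.ipv6mr_multiaddr);
        mreq.ipv6mr_interface = ifindex;  /* 0 = let the kernel choose via route lookup */

        if (setsockopt(fd, IPPROTO_IPV6, IPV6_ADD_MEMBERSHIP,
                       &mreq, sizeof(mreq)) < 0) {
                close(fd);                /* e.g. ENODEV for a bogus ifindex */
                return -1;
        }
        return fd;
}

On success the kernel sends the MLD report built in igmp6_send() above, now carrying the hop-by-hop Router Alert option.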
static __inline__ struct neighbour * @@ -894,15 +905,15 @@ static __inline__ int ndisc_recv_na(struct neighbour *neigh, struct sk_buff *skb static void pndisc_redo(struct sk_buff *skb) { - ndisc_rcv(skb, skb->dev, &skb->nh.ipv6h->saddr, &skb->nh.ipv6h->daddr, - NULL, skb->len); + ndisc_rcv(skb, skb->len); kfree_skb(skb); } -int ndisc_rcv(struct sk_buff *skb, struct device *dev, - struct in6_addr *saddr, struct in6_addr *daddr, - struct ipv6_options *opt, unsigned short len) +int ndisc_rcv(struct sk_buff *skb, unsigned long len) { + struct device *dev = skb->dev; + struct in6_addr *saddr = &skb->nh.ipv6h->saddr; + struct in6_addr *daddr = &skb->nh.ipv6h->daddr; struct nd_msg *msg = (struct nd_msg *) skb->h.raw; struct neighbour *neigh; struct inet6_ifaddr *ifp; @@ -977,7 +988,7 @@ int ndisc_rcv(struct sk_buff *skb, struct device *dev, if (neigh) { ndisc_send_na(dev, neigh, saddr, &msg->target, - 1, 0, inc, inc); + 0, 0, inc, inc); neigh_release(neigh); } } else { @@ -1023,13 +1034,14 @@ int ndisc_rcv(struct sk_buff *skb, struct device *dev, /* * Change: router to host */ -#if 0 struct rt6_info *rt; - rt = ndisc_get_dflt_router(skb->dev, - saddr); - if (rt) - ndisc_del_dflt_router(rt); -#endif + rt = rt6_get_dflt_router(saddr, skb->dev); + if (rt) { + /* It is safe only because + we aer in BH */ + dst_release(&rt->u.dst); + ip6_del_rt(rt); + } } } else { if (msg->icmph.icmp6_router) diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 9b24b4948..31f6a2f55 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -7,7 +7,7 @@ * PROC file system. This is very similar to the IPv4 version, * except it reports the sockets in the INET6 address family. * - * Version: $Id: proc.c,v 1.8 1998/04/13 17:06:03 davem Exp $ + * Version: $Id: proc.c,v 1.9 1998/08/26 12:05:11 davem Exp $ * * Authors: David S. Miller (davem@caip.rutgers.edu) * @@ -20,9 +20,11 @@ #include <linux/socket.h> #include <linux/net.h> #include <linux/in6.h> +#include <linux/stddef.h> #include <net/sock.h> #include <net/tcp.h> #include <net/transp_v6.h> +#include <net/ipv6.h> /* This is the main implementation workhorse of all these routines. */ static int get__netinfo6(struct proto *pro, char *buffer, int format, char **start, @@ -176,3 +178,105 @@ int afinet6_get_info(char *buffer, char **start, off_t offset, int length, int d len = length; return len; } + + +struct snmp6_item +{ + char *name; + unsigned long *ptr; +} snmp6_list[] = { +/* ipv6 mib according to draft-ietf-ipngwg-ipv6-mib-04 */ +#define SNMP6_GEN(x) { #x , &ipv6_statistics.x } + SNMP6_GEN(Ip6InReceives), + SNMP6_GEN(Ip6InHdrErrors), + SNMP6_GEN(Ip6InTooBigErrors), + SNMP6_GEN(Ip6InNoRoutes), + SNMP6_GEN(Ip6InAddrErrors), + SNMP6_GEN(Ip6InUnknownProtos), + SNMP6_GEN(Ip6InTruncatedPkts), + SNMP6_GEN(Ip6InDiscards), + SNMP6_GEN(Ip6InDelivers), + SNMP6_GEN(Ip6OutForwDatagrams), + SNMP6_GEN(Ip6OutRequests), + SNMP6_GEN(Ip6OutDiscards), + SNMP6_GEN(Ip6OutNoRoutes), + SNMP6_GEN(Ip6ReasmTimeout), + SNMP6_GEN(Ip6ReasmReqds), + SNMP6_GEN(Ip6ReasmOKs), + SNMP6_GEN(Ip6ReasmFails), + SNMP6_GEN(Ip6FragOKs), + SNMP6_GEN(Ip6FragFails), + SNMP6_GEN(Ip6FragCreates), + SNMP6_GEN(Ip6InMcastPkts), + SNMP6_GEN(Ip6OutMcastPkts), +#undef SNMP6_GEN +/* icmpv6 mib according to draft-ietf-ipngwg-ipv6-icmp-mib-02 + + Exceptions: {In|Out}AdminProhibs are removed, because I see + no good reasons to account them separately + of another dest.unreachs. + OutErrs is zero identically. + OutEchos too. + OutRouterAdvertisements too. + OutGroupMembQueries too. 
+ */ +#define SNMP6_GEN(x) { #x , &icmpv6_statistics.x } + SNMP6_GEN(Icmp6InMsgs), + SNMP6_GEN(Icmp6InErrors), + SNMP6_GEN(Icmp6InDestUnreachs), + SNMP6_GEN(Icmp6InPktTooBigs), + SNMP6_GEN(Icmp6InTimeExcds), + SNMP6_GEN(Icmp6InParmProblems), + SNMP6_GEN(Icmp6InEchos), + SNMP6_GEN(Icmp6InEchoReplies), + SNMP6_GEN(Icmp6InGroupMembQueries), + SNMP6_GEN(Icmp6InGroupMembResponses), + SNMP6_GEN(Icmp6InGroupMembReductions), + SNMP6_GEN(Icmp6InRouterSolicits), + SNMP6_GEN(Icmp6InRouterAdvertisements), + SNMP6_GEN(Icmp6InNeighborSolicits), + SNMP6_GEN(Icmp6InNeighborAdvertisements), + SNMP6_GEN(Icmp6InRedirects), + SNMP6_GEN(Icmp6OutMsgs), + SNMP6_GEN(Icmp6OutDestUnreachs), + SNMP6_GEN(Icmp6OutPktTooBigs), + SNMP6_GEN(Icmp6OutTimeExcds), + SNMP6_GEN(Icmp6OutParmProblems), + SNMP6_GEN(Icmp6OutEchoReplies), + SNMP6_GEN(Icmp6OutRouterSolicits), + SNMP6_GEN(Icmp6OutNeighborSolicits), + SNMP6_GEN(Icmp6OutNeighborAdvertisements), + SNMP6_GEN(Icmp6OutRedirects), + SNMP6_GEN(Icmp6OutGroupMembResponses), + SNMP6_GEN(Icmp6OutGroupMembReductions), +#undef SNMP6_GEN +#define SNMP6_GEN(x) { "Udp6" #x , &udp_stats_in6.Udp##x } + SNMP6_GEN(InDatagrams), + SNMP6_GEN(NoPorts), + SNMP6_GEN(InErrors), + SNMP6_GEN(OutDatagrams) +#undef SNMP6_GEN +}; + + +int afinet6_get_snmp(char *buffer, char **start, off_t offset, int length, + int dummy) +{ + int len = 0; + int i; + + for (i=0; i<sizeof(snmp6_list)/sizeof(snmp6_list[0]); i++) + len += sprintf(buffer+len, "%-32s\t%ld\n", snmp6_list[i].name, + *(snmp6_list[i].ptr)); + + len -= offset; + + if (len > length) + len = length; + if(len < 0) + len = 0; + + *start = buffer + offset; + + return len; +} diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 659ec59cc..76339ff58 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -7,7 +7,7 @@ * * Adapted from linux/net/ipv4/raw.c * - * $Id: raw.c,v 1.20 1998/07/15 05:05:41 davem Exp $ + * $Id: raw.c,v 1.21 1998/08/26 12:05:13 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -156,9 +156,8 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) /* Check if the address belongs to the host. */ if (addr_type == IPV6_ADDR_MAPPED) { - v4addr = addr->sin6_addr.s6_addr32[3]; - if (inet_addr_type(v4addr) != RTN_LOCAL) - return(-EADDRNOTAVAIL); + /* Raw sockets are IPv6 only */ + return(-EADDRNOTAVAIL); } else { if (addr_type != IPV6_ADDR_ANY) { /* ipv4 addr of the socket is invalid. Only the @@ -182,10 +181,11 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) return 0; } -void rawv6_err(struct sock *sk, int type, int code, unsigned char *buff, - struct in6_addr *saddr, struct in6_addr *daddr) +void rawv6_err(struct sock *sk, struct sk_buff *skb, struct ipv6hdr *hdr, + struct inet6_skb_parm *opt, + int type, int code, unsigned char *buff, u32 info) { - if (sk == NULL) + if (sk == NULL) return; } @@ -193,12 +193,12 @@ static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) { /* Charge it to the socket. 
*/ if (sock_queue_rcv_skb(sk,skb)<0) { - /* ip_statistics.IpInDiscards++; */ + ipv6_statistics.Ip6InDiscards++; kfree_skb(skb); return 0; } - /* ip_statistics.IpInDelivers++; */ + ipv6_statistics.Ip6InDelivers++; return 0; } @@ -209,22 +209,11 @@ static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) * maybe we could have the network decide uppon a hint if it * should call raw_rcv for demultiplexing */ -int rawv6_rcv(struct sk_buff *skb, struct device *dev, - struct in6_addr *saddr, struct in6_addr *daddr, - struct ipv6_options *opt, unsigned short len) +int rawv6_rcv(struct sock *sk, struct sk_buff *skb, unsigned long len) { - struct sock *sk; - - sk = skb->sk; - if (sk->ip_hdrincl) skb->h.raw = skb->nh.raw; - if (sk->sock_readers) { - __skb_queue_tail(&sk->back_log, skb); - return 0; - } - rawv6_rcv_skb(sk, skb); return 0; } @@ -255,8 +244,12 @@ int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, if (!skb) goto out; - copied = min(len, skb->tail - skb->h.raw); - + copied = skb->tail - skb->h.raw; + if (copied > len) { + copied = len; + msg->msg_flags |= MSG_TRUNC; + } + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); sk->stamp=skb->stamp; if (err) @@ -269,7 +262,7 @@ int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, sizeof(struct in6_addr)); } - if (msg->msg_controllen) + if (sk->net_pinfo.af_inet6.rxopt.all) datagram_recv_ctl(sk, msg, skb); err = copied; @@ -332,11 +325,9 @@ static int rawv6_frag_cksum(const void *data, struct in6_addr *addr, csum = (__u16 *) (buff + opt->offset); *csum = hdr->cksum; } else { - /* - * FIXME - * signal an error to user via sk->err - */ - printk(KERN_DEBUG "icmp: cksum offset too big\n"); + if (net_ratelimit()) + printk(KERN_DEBUG "icmp: cksum offset too big\n"); + return -EINVAL; } } return 0; @@ -345,10 +336,10 @@ static int rawv6_frag_cksum(const void *data, struct in6_addr *addr, static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len) { - struct ipv6_options opt_space; + struct ipv6_txoptions opt_space; struct sockaddr_in6 * sin6 = (struct sockaddr_in6 *) msg->msg_name; struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; - struct ipv6_options *opt = NULL; + struct ipv6_txoptions *opt = NULL; struct in6_addr *saddr = NULL; struct flowi fl; int addr_len = msg->msg_namelen; @@ -360,11 +351,8 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len) /* Rough check on arithmetic overflow, better check is made in ip6_build_xmit - - When jumbo header will be implemeted we will remove it - at all (len will be size_t) */ - if (len < 0 || len > 0xFFFF) + if (len < 0) return -EMSGSIZE; /* Mirror BSD error message compatibility */ @@ -394,14 +382,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len) return(-EINVAL); daddr = &sin6->sin6_addr; - - /* BUGGGG If route is not cloned, this check always - fails, hence dst_cache only slows down tramsmission --ANK - */ - if (sk->dst_cache && ipv6_addr_cmp(daddr, &np->daddr)) { - dst_release(sk->dst_cache); - sk->dst_cache = NULL; - } } else { if (sk->state != TCP_ESTABLISHED) return(-EINVAL); @@ -422,12 +402,14 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len) if (msg->msg_controllen) { opt = &opt_space; - memset(opt, 0, sizeof(struct ipv6_options)); + memset(opt, 0, sizeof(struct ipv6_txoptions)); err = datagram_send_ctl(msg, &fl.oif, &saddr, opt, &hlimit); if (err < 0) return err; } + if (opt == NULL || !(opt->opt_nflen|opt->opt_flen)) + opt = np->opt; raw_opt = &sk->tp_pinfo.tp_raw; @@ 
-594,8 +576,9 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname, static void rawv6_close(struct sock *sk, unsigned long timeout) { + /* See for explanation: raw_close in ipv4/raw.c */ sk->state = TCP_CLOSE; - ipv6_sock_mc_close(sk); + raw_v6_unhash(sk); if (sk->num == IPPROTO_RAW) ip6_ra_control(sk, -1, NULL); sk->dead = 1; @@ -619,7 +602,7 @@ struct proto rawv6_prot = { datagram_poll, /* poll */ NULL, /* ioctl */ rawv6_init_sk, /* init */ - NULL, /* destroy */ + inet6_destroy_sock, /* destroy */ NULL, /* shutdown */ rawv6_setsockopt, /* setsockopt */ rawv6_getsockopt, /* getsockopt */ diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index e78cf97a2..e455b0533 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: reassembly.c,v 1.10 1998/04/30 16:24:32 freitag Exp $ + * $Id: reassembly.c,v 1.11 1998/08/26 12:05:16 davem Exp $ * * Based on: net/ipv4/ip_fragment.c * @@ -41,83 +41,145 @@ #include <net/ndisc.h> #include <net/addrconf.h> +int sysctl_ip6frag_high_thresh = 256*1024; +int sysctl_ip6frag_low_thresh = 192*1024; +int sysctl_ip6frag_time = IPV6_FRAG_TIMEOUT; + +atomic_t ip6_frag_mem = ATOMIC_INIT(0); + +struct ipv6_frag { + __u16 offset; + __u16 len; + struct sk_buff *skb; + + struct frag_hdr *fhdr; + + struct ipv6_frag *next; +}; + +/* + * Equivalent of ipv4 struct ipq + */ + +struct frag_queue { + + struct frag_queue *next; + struct frag_queue *prev; + + __u32 id; /* fragment id */ + struct in6_addr saddr; + struct in6_addr daddr; + struct timer_list timer; /* expire timer */ + struct ipv6_frag *fragments; + struct device *dev; + int iif; + __u8 last_in; /* has first/last segment arrived? */ +#define FIRST_IN 2 +#define LAST_IN 1 + __u8 nexthdr; + __u16 nhoffset; +}; static struct frag_queue ipv6_frag_queue = { &ipv6_frag_queue, &ipv6_frag_queue, 0, {{{0}}}, {{{0}}}, {0}, NULL, NULL, - 0, 0, NULL + 0, 0, 0, 0 }; +/* Memory Tracking Functions. */ +extern __inline__ void frag_kfree_skb(struct sk_buff *skb) +{ + atomic_sub(skb->truesize, &ip6_frag_mem); + kfree_skb(skb); +} + +extern __inline__ void frag_kfree_s(void *ptr, int len) +{ + atomic_sub(len, &ip6_frag_mem); + kfree(ptr); +} + +extern __inline__ void *frag_kmalloc(int size, int pri) +{ + void *vp = kmalloc(size, pri); + + if(!vp) + return NULL; + atomic_add(size, &ip6_frag_mem); + return vp; +} + + static void create_frag_entry(struct sk_buff *skb, - struct device *dev, __u8 *nhptr, struct frag_hdr *fhdr); -static int reasm_frag_1(struct frag_queue *fq, - struct sk_buff **skb_in); +static u8 * reasm_frag(struct frag_queue *fq, + struct sk_buff **skb_in); static void reasm_queue(struct frag_queue *fq, struct sk_buff *skb, - struct frag_hdr *fhdr); + struct frag_hdr *fhdr, + u8 *nhptr); -static int reasm_frag(struct frag_queue *fq, struct sk_buff **skb, - __u8 *nhptr, - struct frag_hdr *fhdr) -{ - __u32 expires = jiffies + IPV6_FRAG_TIMEOUT; - int nh; - - if (del_timer(&fq->timer)) - expires = fq->timer.expires; +static void fq_free(struct frag_queue *fq); - /* - * We queue the packet even if it's the last. - * It's a trade off. This allows the reassembly - * code to be simpler (=faster) and of the - * steps we do for queueing the only unnecessary - * one it's the kmalloc for a struct ipv6_frag. - * Feel free to try other alternatives... 
- */ - if ((fhdr->frag_off & __constant_htons(0x0001)) == 0) { - fq->last_in = 1; - fq->nhptr = nhptr; - } - reasm_queue(fq, *skb, fhdr); +static void frag_prune(void) +{ + struct frag_queue *fq; - if (fq->last_in) { - if ((nh = reasm_frag_1(fq, skb))) - return nh; + while ((fq = ipv6_frag_queue.next) != &ipv6_frag_queue) { + ipv6_statistics.Ip6ReasmFails++; + fq_free(fq); + if (atomic_read(&ip6_frag_mem) <= sysctl_ip6frag_low_thresh) + return; } - - fq->timer.expires = expires; - add_timer(&fq->timer); - - return 0; + if (atomic_read(&ip6_frag_mem)) + printk(KERN_DEBUG "IPv6 frag_prune: memleak\n"); + atomic_set(&ip6_frag_mem, 0); } -int ipv6_reassembly(struct sk_buff **skbp, struct device *dev, __u8 *nhptr, - struct ipv6_options *opt) + +u8* ipv6_reassembly(struct sk_buff **skbp, __u8 *nhptr) { struct sk_buff *skb = *skbp; struct frag_hdr *fhdr = (struct frag_hdr *) (skb->h.raw); struct frag_queue *fq; struct ipv6hdr *hdr; + hdr = skb->nh.ipv6h; + + ipv6_statistics.Ip6ReasmReqds++; + + /* Jumbo payload inhibits frag. header */ + if (hdr->payload_len==0) { + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw); + return NULL; + } if ((u8 *)(fhdr+1) > skb->tail) { icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw); - return 0; + return NULL; } - hdr = skb->nh.ipv6h; + if (atomic_read(&ip6_frag_mem) > sysctl_ip6frag_high_thresh) + frag_prune(); + for (fq = ipv6_frag_queue.next; fq != &ipv6_frag_queue; fq = fq->next) { if (fq->id == fhdr->identification && !ipv6_addr_cmp(&hdr->saddr, &fq->saddr) && - !ipv6_addr_cmp(&hdr->daddr, &fq->daddr)) - return reasm_frag(fq, skbp, nhptr,fhdr); + !ipv6_addr_cmp(&hdr->daddr, &fq->daddr)) { + + reasm_queue(fq, skb, fhdr, nhptr); + + if (fq->last_in == (FIRST_IN|LAST_IN)) + return reasm_frag(fq, skbp); + + return NULL; + } } - - create_frag_entry(skb, dev, nhptr, fhdr); - return 0; + create_frag_entry(skb, nhptr, fhdr); + + return NULL; } @@ -125,11 +187,13 @@ static void fq_free(struct frag_queue *fq) { struct ipv6_frag *fp, *back; - for(fp = fq->fragments; fp; ) { - kfree_skb(fp->skb); + del_timer(&fq->timer); + + for (fp = fq->fragments; fp; ) { + frag_kfree_skb(fp->skb); back = fp; fp=fp->next; - kfree(back); + frag_kfree_s(back, sizeof(*back)); } fq->prev->next = fq->next; @@ -137,7 +201,7 @@ static void fq_free(struct frag_queue *fq) fq->prev = fq->next = NULL; - kfree(fq); + frag_kfree_s(fq, sizeof(*fq)); } static void frag_expire(unsigned long data) @@ -147,33 +211,50 @@ static void frag_expire(unsigned long data) fq = (struct frag_queue *) data; - del_timer(&fq->timer); - frag = fq->fragments; + ipv6_statistics.Ip6ReasmTimeout++; + ipv6_statistics.Ip6ReasmFails++; + if (frag == NULL) { printk(KERN_DEBUG "invalid fragment queue\n"); return; } - icmpv6_send(frag->skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0, - frag->skb->dev); + /* Send error only if the first segment arrived. + (fixed --ANK (980728)) + */ + if (fq->last_in&FIRST_IN) { + struct device *dev = dev_get_by_index(fq->iif); + + /* + But use as source device on which LAST ARRIVED + segment was received. And do not use fq->dev + pointer directly, device might already disappeared. 
+ */ + if (dev) { + frag->skb->dev = dev; + icmpv6_send(frag->skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0, + dev); + } + } fq_free(fq); } -static void create_frag_entry(struct sk_buff *skb, struct device *dev, +static void create_frag_entry(struct sk_buff *skb, __u8 *nhptr, struct frag_hdr *fhdr) { struct frag_queue *fq; struct ipv6hdr *hdr; - fq = (struct frag_queue *) kmalloc(sizeof(struct frag_queue), - GFP_ATOMIC); + fq = (struct frag_queue *) frag_kmalloc(sizeof(struct frag_queue), + GFP_ATOMIC); if (fq == NULL) { + ipv6_statistics.Ip6ReasmFails++; kfree_skb(skb); return; } @@ -186,38 +267,41 @@ static void create_frag_entry(struct sk_buff *skb, struct device *dev, ipv6_addr_copy(&fq->saddr, &hdr->saddr); ipv6_addr_copy(&fq->daddr, &hdr->daddr); - fq->dev = dev; - /* init_timer has been done by the memset */ fq->timer.function = frag_expire; fq->timer.data = (long) fq; - fq->timer.expires = jiffies + IPV6_FRAG_TIMEOUT; + fq->timer.expires = jiffies + sysctl_ip6frag_time; - fq->nexthdr = fhdr->nexthdr; + reasm_queue(fq, skb, fhdr, nhptr); + if (fq->fragments) { + fq->prev = ipv6_frag_queue.prev; + fq->next = &ipv6_frag_queue; + fq->prev->next = fq; + ipv6_frag_queue.prev = fq; - if ((fhdr->frag_off & __constant_htons(0x0001)) == 0) { - fq->last_in = 1; - fq->nhptr = nhptr; - } - reasm_queue(fq, skb, fhdr); - - fq->prev = ipv6_frag_queue.prev; - fq->next = &ipv6_frag_queue; - fq->prev->next = fq; - ipv6_frag_queue.prev = fq; - - add_timer(&fq->timer); + add_timer(&fq->timer); + } else + frag_kfree_s(fq, sizeof(*fq)); } +/* + * We queue the packet even if it's the last. + * It's a trade off. This allows the reassembly + * code to be simpler (=faster) and of the + * steps we do for queueing the only unnecessary + * one it's the kmalloc for a struct ipv6_frag. + * Feel free to try other alternatives... + */ + static void reasm_queue(struct frag_queue *fq, struct sk_buff *skb, - struct frag_hdr *fhdr) + struct frag_hdr *fhdr, u8 *nhptr) { struct ipv6_frag *nfp, *fp, **bptr; - nfp = (struct ipv6_frag *) kmalloc(sizeof(struct ipv6_frag), - GFP_ATOMIC); + nfp = (struct ipv6_frag *) frag_kmalloc(sizeof(struct ipv6_frag), + GFP_ATOMIC); if (nfp == NULL) { kfree_skb(skb); @@ -228,24 +312,40 @@ static void reasm_queue(struct frag_queue *fq, struct sk_buff *skb, nfp->len = (ntohs(skb->nh.ipv6h->payload_len) - ((u8 *) (fhdr + 1) - (u8 *) (skb->nh.ipv6h + 1))); - if ((u32)nfp->offset + (u32)nfp->len > 65536) { + if ((u32)nfp->offset + (u32)nfp->len >= 65536) { icmpv6_param_prob(skb,ICMPV6_HDR_FIELD, (u8*)&fhdr->frag_off); goto err; } + if (fhdr->frag_off & __constant_htons(0x0001)) { + /* Check if the fragment is rounded to 8 bytes. + * Required by the RFC. + * ... and would break our defragmentation algorithm 8) + */ + if (nfp->len & 0x7) { + printk(KERN_DEBUG "fragment not rounded to 8bytes\n"); + + /* + It is not in specs, but I see no reasons + to send an error in this case. --ANK + */ + if (nfp->offset == 0) + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, + &skb->nh.ipv6h->payload_len); + goto err; + } + } nfp->skb = skb; nfp->fhdr = fhdr; - nfp->next = NULL; bptr = &fq->fragments; - + for (fp = fq->fragments; fp; fp=fp->next) { if (nfp->offset <= fp->offset) break; bptr = &fp->next; } - if (fp && fp->offset == nfp->offset) { if (nfp->len != fp->len) { printk(KERN_DEBUG "reasm_queue: dup with wrong len\n"); @@ -254,29 +354,40 @@ static void reasm_queue(struct frag_queue *fq, struct sk_buff *skb, /* duplicate. discard it. 
*/ goto err; } - - *bptr = nfp; - nfp->next = fp; -#ifdef STRICT_RFC - if (fhdr->frag_off & __constant_htons(0x0001)) { - /* Check if the fragment is rounded to 8 bytes. - * Required by the RFC. - */ - if (nfp->len & 0x7) { - printk(KERN_DEBUG "fragment not rounded to 8bytes\n"); + atomic_add(skb->truesize, &ip6_frag_mem); - icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, - &skb->nh.ipv6h->payload_len); - goto err; - } + /* All the checks are done, fragment is acepted. + Only now we are allowed to update reassembly data! + (fixed --ANK (980728)) + */ + + /* iif always set to one of the last arrived segment */ + fq->dev = skb->dev; + fq->iif = skb->dev->ifindex; + + /* Last fragment */ + if ((fhdr->frag_off & __constant_htons(0x0001)) == 0) + fq->last_in |= LAST_IN; + + /* First fragment. + nexthdr and nhptr are get from the first fragment. + Moreover, nexthdr is UNDEFINED for all the fragments but the + first one. + (fixed --ANK (980728)) + */ + if (nfp->offset == 0) { + fq->nexthdr = fhdr->nexthdr; + fq->last_in |= FIRST_IN; + fq->nhoffset = nhptr - skb->nh.raw; } -#endif + *bptr = nfp; + nfp->next = fp; return; err: - kfree(nfp); + frag_kfree_s(nfp, sizeof(*nfp)); kfree_skb(skb); } @@ -284,20 +395,21 @@ err: * check if this fragment completes the packet * returns true on success */ -static int reasm_frag_1(struct frag_queue *fq, struct sk_buff **skb_in) +static u8* reasm_frag(struct frag_queue *fq, struct sk_buff **skb_in) { struct ipv6_frag *fp; + struct ipv6_frag *head = fq->fragments; struct ipv6_frag *tail = NULL; struct sk_buff *skb; __u32 offset = 0; __u32 payload_len; __u16 unfrag_len; __u16 copy; - int nh; + u8 *nhptr; - for(fp = fq->fragments; fp; fp=fp->next) { + for(fp = head; fp; fp=fp->next) { if (offset != fp->offset) - return 0; + return NULL; offset += fp->len; tail = fp; @@ -309,31 +421,42 @@ static int reasm_frag_1(struct frag_queue *fq, struct sk_buff **skb_in) * this means we have all fragments. */ - unfrag_len = (u8 *) (tail->fhdr) - (u8 *) (tail->skb->nh.ipv6h + 1); + /* Unfragmented part is taken from the first segment. 
+ (fixed --ANK (980728)) + */ + unfrag_len = (u8 *) (head->fhdr) - (u8 *) (head->skb->nh.ipv6h + 1); payload_len = (unfrag_len + tail->offset + (tail->skb->tail - (__u8 *) (tail->fhdr + 1))); -#if 0 - printk(KERN_DEBUG "reasm: payload len = %d\n", payload_len); -#endif + if (payload_len > 65535) { + if (net_ratelimit()) + printk(KERN_DEBUG "reasm_frag: payload len = %d\n", payload_len); + ipv6_statistics.Ip6ReasmFails++; + fq_free(fq); + return NULL; + } if ((skb = dev_alloc_skb(sizeof(struct ipv6hdr) + payload_len))==NULL) { - printk(KERN_DEBUG "reasm_frag: no memory for reassembly\n"); + if (net_ratelimit()) + printk(KERN_DEBUG "reasm_frag: no memory for reassembly\n"); + ipv6_statistics.Ip6ReasmFails++; fq_free(fq); - return 1; + return NULL; } copy = unfrag_len + sizeof(struct ipv6hdr); skb->nh.ipv6h = (struct ipv6hdr *) skb->data; - skb->dev = fq->dev; + skb->protocol = __constant_htons(ETH_P_IPV6); + skb->pkt_type = head->skb->pkt_type; + memcpy(skb->cb, head->skb->cb, sizeof(skb->cb)); + skb->dst = dst_clone(head->skb->dst); - nh = fq->nexthdr; - - *(fq->nhptr) = nh; - memcpy(skb_put(skb, copy), tail->skb->nh.ipv6h, copy); + memcpy(skb_put(skb, copy), head->skb->nh.ipv6h, copy); + nhptr = skb->nh.raw + fq->nhoffset; + *nhptr = fq->nexthdr; skb->h.raw = skb->tail; @@ -351,18 +474,19 @@ static int reasm_frag_1(struct frag_queue *fq, struct sk_buff **skb_in) struct ipv6_frag *back; memcpy(skb_put(skb, fp->len), (__u8*)(fp->fhdr + 1), fp->len); - kfree_skb(fp->skb); + frag_kfree_skb(fp->skb); back = fp; fp=fp->next; - kfree(back); + frag_kfree_s(back, sizeof(*back)); } - + + del_timer(&fq->timer); fq->prev->next = fq->next; fq->next->prev = fq->prev; - fq->prev = fq->next = NULL; - - kfree(fq); - return nh; + frag_kfree_s(fq, sizeof(*fq)); + + ipv6_statistics.Ip6ReasmOKs++; + return nhptr; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 9d159fe36..8d1f59632 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: route.c,v 1.32 1998/07/25 23:28:52 davem Exp $ + * $Id: route.c,v 1.33 1998/08/26 12:05:18 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -53,10 +53,19 @@ #if RT6_DEBUG >= 3 #define RDBG(x) printk x +#define RT6_TRACE(x...) printk(KERN_DEBUG x) #else #define RDBG(x) +#define RT6_TRACE(x...) 
do { ; } while (0) #endif +#if RT6_DEBUG >= 1 +#define BUG_TRAP(x) ({ if (!(x)) { printk("Assertion (" #x ") failed at " __FILE__ "(%d):" __FUNCTION__ "\n", __LINE__); } }) +#else +#define BUG_TRAP(x) do { ; } while (0) +#endif + + int ip6_rt_max_size = 4096; int ip6_rt_gc_min_interval = 5*HZ; int ip6_rt_gc_timeout = 60*HZ; @@ -87,16 +96,16 @@ struct dst_ops ip6_dst_ops = { }; struct rt6_info ip6_null_entry = { - {{NULL, ATOMIC_INIT(1), ATOMIC_INIT(1), NULL, - -1, 0, 0, 0, 0, 0, 0, 0, 0, + {{NULL, ATOMIC_INIT(1), ATOMIC_INIT(1), &loopback_dev, + -1, 0, 0, 0, 0, 0, 0, 0, -ENETUNREACH, NULL, NULL, ip6_pkt_discard, ip6_pkt_discard, #ifdef CONFIG_NET_CLS_ROUTE 0, #endif &ip6_dst_ops}}, - NULL, {{{0}}}, 256, RTF_REJECT|RTF_NONEXTHOP, ~0U, - 255, 0, {NULL}, {{{{0}}}, 0}, {{{{0}}}, 0} + NULL, {{{0}}}, RTF_REJECT|RTF_NONEXTHOP, ~0U, + 255, 0, ATOMIC_INIT(1), {NULL}, {{{{0}}}, 0}, {{{{0}}}, 0} }; struct fib6_node ip6_routing_table = { @@ -123,89 +132,6 @@ static struct rt6_info *rt6_flow_lookup(struct rt6_info *rt, #define ip6_rt_policy (0) #endif -static atomic_t rt6_tbl_lock = ATOMIC_INIT(0); -static int rt6_bh_mask = 0; - -#define RT_BH_REQUEST 1 -#define RT_BH_GC 2 - -static void __rt6_run_bh(void); - -/* - * request queue operations - * FIFO queue/dequeue - */ - -static struct rt6_req request_queue = { - 0, NULL, &request_queue, &request_queue -}; - -static __inline__ void rtreq_queue(struct rt6_req * req) -{ - unsigned long flags; - struct rt6_req *next = &request_queue; - - save_flags(flags); - cli(); - - req->prev = next->prev; - req->prev->next = req; - next->prev = req; - req->next = next; - restore_flags(flags); -} - -static __inline__ struct rt6_req * rtreq_dequeue(void) -{ - struct rt6_req *next = &request_queue; - struct rt6_req *head; - - head = next->next; - - if (head == next) - return NULL; - - head->next->prev = head->prev; - next->next = head->next; - - head->next = NULL; - head->prev = NULL; - - return head; -} - -void rtreq_add(struct rt6_info *rt, int operation) -{ - struct rt6_req *rtreq; - - rtreq = kmalloc(sizeof(struct rt6_req), GFP_ATOMIC); - - if (rtreq == NULL) - return; - - memset(rtreq, 0, sizeof(struct rt6_req)); - - rtreq->operation = operation; - rtreq->ptr = rt; - rtreq_queue(rtreq); - - rt6_bh_mask |= RT_BH_REQUEST; -} - -static __inline__ void rt6_lock(void) -{ - atomic_inc(&rt6_tbl_lock); -} - -static __inline__ void rt6_unlock(void) -{ - if (atomic_dec_and_test(&rt6_tbl_lock) && rt6_bh_mask) { - start_bh_atomic(); - __rt6_run_bh(); - end_bh_atomic(); - } -} - /* * Route lookup */ @@ -219,23 +145,19 @@ static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt, if (oif) { for (sprt = rt; sprt; sprt = sprt->u.next) { - if (sprt->rt6i_dev) { - if (sprt->rt6i_dev->ifindex == oif) - return sprt; - if (sprt->rt6i_dev->flags&IFF_LOOPBACK) - local = sprt; - } + struct device *dev = sprt->rt6i_dev; + if (dev->ifindex == oif) + return sprt; + if (dev->flags&IFF_LOOPBACK) + local = sprt; } if (local) return local; - if (strict) { - RDBG(("nomatch & STRICT --> ip6_null_entry\n")); + if (strict) return &ip6_null_entry; - } } - RDBG(("!dev or (no match and !strict) --> rt(%p)\n", rt)); return rt; } @@ -282,7 +204,7 @@ static struct rt6_info *rt6_best_dflt(struct rt6_info *rt, int oif) break; }; - if (oif && sprt->rt6i_dev && sprt->rt6i_dev->ifindex == oif) { + if (oif && sprt->rt6i_dev->ifindex == oif) { m += 2; } @@ -319,21 +241,40 @@ out: } struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr, - int oif, int flags) + int oif, int 
strict) { struct fib6_node *fn; struct rt6_info *rt; - rt6_lock(); + start_bh_atomic(); fn = fib6_lookup(&ip6_routing_table, daddr, saddr); - rt = rt6_device_match(fn->leaf, oif, flags&RTF_LINKRT); - rt6_unlock(); - return rt; + rt = rt6_device_match(fn->leaf, oif, strict); + atomic_inc(&rt->u.dst.use); + atomic_inc(&rt->u.dst.refcnt); + end_bh_atomic(); + + rt->u.dst.lastuse = jiffies; + if (rt->u.dst.error == 0) + return rt; + dst_release(&rt->u.dst); + return NULL; +} + +static int rt6_ins(struct rt6_info *rt) +{ + int err; + + start_bh_atomic(); + err = fib6_add(&ip6_routing_table, rt); + end_bh_atomic(); + + return err; } static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr, struct in6_addr *saddr) { + int err; struct rt6_info *rt; /* @@ -351,18 +292,24 @@ static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr, rt->rt6i_dst.plen = 128; rt->rt6i_flags |= RTF_CACHE; - if (rt->rt6i_src.plen) { +#ifdef CONFIG_IPV6_SUBTREES + if (rt->rt6i_src.plen && saddr) { ipv6_addr_copy(&rt->rt6i_src.addr, saddr); rt->rt6i_src.plen = 128; } +#endif rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); - rtreq_add(rt, RT_OPER_ADD); - } else { - rt = &ip6_null_entry; + dst_clone(&rt->u.dst); + err = rt6_ins(rt); + if (err == 0) + return rt; + rt->u.dst.error = err; + return rt; } - return rt; + dst_clone(&ip6_null_entry.u.dst); + return &ip6_null_entry; } #ifdef CONFIG_RT6_POLICY @@ -397,24 +344,38 @@ static __inline__ struct rt6_info *rt6_flow_lookup_out(struct rt6_info *rt, #endif +#define BACKTRACK() \ +if (rt == &ip6_null_entry && strict) { \ + while ((fn = fn->parent) != NULL) { \ + if (fn->fn_flags & RTN_ROOT) { \ + dst_clone(&rt->u.dst); \ + goto out; \ + } \ + if (fn->fn_flags & RTN_RTINFO) \ + goto restart; \ + } \ +} + + void ip6_route_input(struct sk_buff *skb) { struct fib6_node *fn; struct rt6_info *rt; - struct dst_entry *dst; + int strict; + + strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL); - RDBG(("ip6_route_input(%p) from %p\n", skb, __builtin_return_address(0))); - if ((dst = skb->dst) != NULL) - goto looped_back; - rt6_lock(); fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr); +restart: rt = fn->leaf; if ((rt->rt6i_flags & RTF_CACHE)) { if (ip6_rt_policy == 0) { - rt = rt6_device_match(rt, skb->dev->ifindex, 0); + rt = rt6_device_match(rt, skb->dev->ifindex, strict); + BACKTRACK(); + dst_clone(&rt->u.dst); goto out; } @@ -425,6 +386,7 @@ void ip6_route_input(struct sk_buff *skb) for (sprt = rt; sprt; sprt = sprt->u.next) { if (rt6_flow_match_in(sprt, skb)) { rt = sprt; + dst_clone(&rt->u.dst); goto out; } } @@ -433,38 +395,38 @@ void ip6_route_input(struct sk_buff *skb) } rt = rt6_device_match(rt, skb->dev->ifindex, 0); + BACKTRACK(); if (ip6_rt_policy == 0) { - if (!rt->rt6i_nexthop && rt->rt6i_dev && - ((rt->rt6i_flags & RTF_NONEXTHOP) == 0)) { + if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) { rt = rt6_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr); + goto out; } + dst_clone(&rt->u.dst); } else { #ifdef CONFIG_RT6_POLICY rt = rt6_flow_lookup_in(rt, skb); +#else + /* NEVER REACHED */ #endif } out: - dst = dst_clone((struct dst_entry *) rt); - rt6_unlock(); - - skb->dst = dst; -looped_back: - dst->input(skb); + rt->u.dst.lastuse = jiffies; + atomic_inc(&rt->u.dst.refcnt); + skb->dst = (struct dst_entry *) rt; } struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl) { struct fib6_node *fn; struct rt6_info *rt; - 
struct dst_entry *dst; int strict; strict = ipv6_addr_type(fl->nl_u.ip6_u.daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL); - rt6_lock(); + start_bh_atomic(); fn = fib6_lookup(&ip6_routing_table, fl->nl_u.ip6_u.daddr, fl->nl_u.ip6_u.saddr); @@ -472,25 +434,10 @@ restart: rt = fn->leaf; if ((rt->rt6i_flags & RTF_CACHE)) { - RDBG(("RTF_CACHE ")); if (ip6_rt_policy == 0) { rt = rt6_device_match(rt, fl->oif, strict); - - /* BUGGGG! It is capital bug, that was hidden - by not-cloning multicast routes. However, - the same problem was with link-local addresses. - Fix is the following if-statement, - but it will not properly handle Pedro's subtrees --ANK - */ - if (rt == &ip6_null_entry && strict) { - while ((fn = fn->parent) != NULL) { - if (fn->fn_flags & RTN_ROOT) - goto out; - if (fn->fn_flags & RTN_RTINFO) - goto restart; - } - } - RDBG(("devmatch(%p) ", rt)); + BACKTRACK(); + dst_clone(&rt->u.dst); goto out; } @@ -501,68 +448,46 @@ restart: for (sprt = rt; sprt; sprt = sprt->u.next) { if (rt6_flow_match_out(sprt, sk)) { rt = sprt; + dst_clone(&rt->u.dst); goto out; } } } #endif } - RDBG(("!RTF_CACHE ")); if (rt->rt6i_flags & RTF_DEFAULT) { - RDBG(("RTF_DEFAULT ")); - if (rt->rt6i_metric >= IP6_RT_PRIO_ADDRCONF) { + if (rt->rt6i_metric >= IP6_RT_PRIO_ADDRCONF) rt = rt6_best_dflt(rt, fl->oif); - RDBG(("best_dflt(%p) ", rt)); - } } else { rt = rt6_device_match(rt, fl->oif, strict); - RDBG(("!RTF_DEFAULT devmatch(%p) ", rt)); + BACKTRACK(); } if (ip6_rt_policy == 0) { - if (!rt->rt6i_nexthop && rt->rt6i_dev && - ((rt->rt6i_flags & RTF_NONEXTHOP) == 0)) { + if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) { rt = rt6_cow(rt, fl->nl_u.ip6_u.daddr, fl->nl_u.ip6_u.saddr); - RDBG(("(!nhop&&rt6i_dev&&!RTF_NONEXTHOP) cow(%p) ", rt)); + goto out; } + dst_clone(&rt->u.dst); } else { #ifdef CONFIG_RT6_POLICY rt = rt6_flow_lookup_out(rt, sk, fl); +#else + /* NEVER REACHED */ #endif } out: - dst = dst_clone((struct dst_entry *) rt); - rt6_unlock(); - RDBG(("dclone/ret(%p)\n", dst)); - return dst; -} - - -static void rt6_ins(struct rt6_info *rt) -{ - start_bh_atomic(); - if (atomic_read(&rt6_tbl_lock) == 1) - fib6_add(&ip6_routing_table, rt); - else - rtreq_add(rt, RT_OPER_ADD); + rt->u.dst.lastuse = jiffies; + atomic_inc(&rt->u.dst.refcnt); end_bh_atomic(); + return &rt->u.dst; } + /* * Destination cache support functions - * - * BUGGG! This function is absolutely wrong. - * First of all it is never called. (look at include/net/dst.h) - * Second, even when it is called rt->rt6i_node == NULL - * ** partially fixed: now dst->obsolete = -1 for IPv6 not cache routes. - * Third, even we fixed previous bugs, - * it will not work because sernum is incorrectly checked/updated and - * it does not handle change of the parent of cloned route. - * Purging stray clones is not easy task, it would require - * massive remake of ip6_fib.c. Alas... 
- * --ANK */ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) @@ -646,7 +571,7 @@ static int ipv6_get_mtu(struct device *dev) if (idev) return idev->cnf.mtu6; else - return 576; + return IPV6_MIN_MTU; } static int ipv6_get_hoplimit(struct device *dev) @@ -664,72 +589,68 @@ static int ipv6_get_hoplimit(struct device *dev) * */ -struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) +int ip6_route_add(struct in6_rtmsg *rtmsg) { + int err; struct rt6_info *rt; struct device *dev = NULL; int addr_type; - - if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128) { - *err = -EINVAL; - return NULL; - } + + if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128) + return -EINVAL; +#ifndef CONFIG_IPV6_SUBTREES + if (rtmsg->rtmsg_src_len) + return -EINVAL; +#endif if (rtmsg->rtmsg_metric == 0) rtmsg->rtmsg_metric = IP6_RT_PRIO_USER; - *err = 0; - rt = dst_alloc(sizeof(struct rt6_info), &ip6_dst_ops); - if (rt == NULL) { - RDBG(("dalloc fails, ")); - *err = -ENOMEM; - return NULL; - } + if (rt == NULL) + return -ENOMEM; rt->u.dst.obsolete = -1; rt->rt6i_expires = rtmsg->rtmsg_info; addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst); - if (addr_type & IPV6_ADDR_MULTICAST) { - RDBG(("MCAST, ")); + if (addr_type & IPV6_ADDR_MULTICAST) rt->u.dst.input = ip6_mc_input; - } else { - RDBG(("!MCAST ")); + else rt->u.dst.input = ip6_forward; - } rt->u.dst.output = ip6_output; if (rtmsg->rtmsg_ifindex) { dev = dev_get_by_index(rtmsg->rtmsg_ifindex); - if (dev == NULL) { - *err = -ENODEV; + err = -ENODEV; + if (dev == NULL) goto out; - } } ipv6_addr_copy(&rt->rt6i_dst.addr, &rtmsg->rtmsg_dst); rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len; ipv6_wash_prefix(&rt->rt6i_dst.addr, rt->rt6i_dst.plen); +#ifdef CONFIG_IPV6_SUBTREES ipv6_addr_copy(&rt->rt6i_src.addr, &rtmsg->rtmsg_src); rt->rt6i_src.plen = rtmsg->rtmsg_src_len; ipv6_wash_prefix(&rt->rt6i_src.addr, rt->rt6i_src.plen); +#endif + + rt->rt6i_metric = rtmsg->rtmsg_metric; /* We cannot add true routes via loopback here, they would result in kernel looping; promote them to reject routes */ if ((rtmsg->rtmsg_flags&RTF_REJECT) || (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) { - dev = dev_get("lo"); + dev = &loopback_dev; rt->u.dst.output = ip6_pkt_discard; rt->u.dst.input = ip6_pkt_discard; rt->u.dst.error = -ENETUNREACH; rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP; - rt->rt6i_metric = rtmsg->rtmsg_metric; - rt->rt6i_dev = dev; goto install_route; } @@ -746,50 +667,44 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) /* IPv6 strictly inhibits using not link-local addresses as nexthop address. + Otherwise, router will not able to send redirects. It is very good, but in some (rare!) curcumstances - (SIT, NBMA NOARP links) it is handy to allow - some exceptions. + (SIT, PtP, NBMA NOARP links) it is handy to allow + some exceptions. 
--ANK */ - if (!(gwa_type&IPV6_ADDR_UNICAST)) { - *err = -EINVAL; + err = -EINVAL; + if (!(gwa_type&IPV6_ADDR_UNICAST)) goto out; - } - grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, RTF_LINKRT); + grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1); - if (grt == NULL || (grt->rt6i_flags&RTF_GATEWAY)) { - *err = -EHOSTUNREACH; + err = -EHOSTUNREACH; + if (grt == NULL) goto out; - } + if (!(grt->rt6i_flags&RTF_GATEWAY)) + err = 0; dev = grt->rt6i_dev; + dst_release(&grt->u.dst); + + if (err) + goto out; } - if (dev == NULL || (dev->flags&IFF_LOOPBACK)) { - *err = -EINVAL; + err = -EINVAL; + if (dev == NULL || (dev->flags&IFF_LOOPBACK)) goto out; - } } - if (dev == NULL) { - RDBG(("!dev, ")); - *err = -ENODEV; + err = -ENODEV; + if (dev == NULL) goto out; - } if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) { rt->rt6i_nexthop = ndisc_get_neigh(dev, &rt->rt6i_gateway); - if (rt->rt6i_nexthop == NULL) { - RDBG(("!nxthop, ")); - *err = -ENOMEM; + err = -ENOMEM; + if (rt->rt6i_nexthop == NULL) goto out; - } - RDBG(("nxthop, ")); } - rt->rt6i_metric = rtmsg->rtmsg_metric; - - rt->rt6i_dev = dev; - rt->u.dst.pmtu = ipv6_get_mtu(dev); - rt->u.dst.rtt = TCP_TIMEOUT_INIT; if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) rt->rt6i_hoplimit = IPV6_DEFAULT_MCASTHOPS; else @@ -797,153 +712,59 @@ struct rt6_info *ip6_route_add(struct in6_rtmsg *rtmsg, int *err) rt->rt6i_flags = rtmsg->rtmsg_flags; install_route: - RDBG(("rt6ins(%p) ", rt)); - - rt6_lock(); - rt6_ins(rt); - rt6_unlock(); - - /* BUGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG! - - If rt6_ins will fail (and it occurs regularly f.e. if route - already existed), the route will be freed -> Finita. - Crash. No recovery. NO FIX. Unfortunately, it is not the only - place will it is fatal. It is sad, I believed this - code is a bit more accurate :-( - - Really, the problem can be solved in two ways: - - * As I did in old 2.0 IPv4: to increase use count and force - user to destroy stray route. It requires some care, - well, much more care. - * Second and the best: to get rid of this damn backlogging - system. I wonder why Pedro so liked it. It was the most - unhappy day when I invented it (well, by a strange reason - I believed that it is very clever :-)), - and when I managed to clean IPv4 of this crap, - it was really great win. - BTW I forgot how 2.0 route/arp works :-) :-) - --ANK - */ + rt->u.dst.pmtu = ipv6_get_mtu(dev); + rt->u.dst.rtt = TCP_TIMEOUT_INIT; + rt->rt6i_dev = dev; + return rt6_ins(rt); out: - if (*err) { - RDBG(("dfree(%p) ", rt)); - dst_free((struct dst_entry *) rt); - rt = NULL; - } - RDBG(("ret(%p)\n", rt)); -#if 0 - return rt; -#else - /* BUGGG! For now always return NULL. (see above) - - Really, it was used only in two places, and one of them - (rt6_add_dflt_router) is repaired, ip6_fw is not essential - at all. --ANK - */ - return NULL; -#endif + dst_free((struct dst_entry *) rt); + return err; } int ip6_del_rt(struct rt6_info *rt) { - rt6_lock(); + int err; start_bh_atomic(); - - /* I'd add here couple of cli() - cli(); cli(); cli(); - - Now it is really LOCKED. 
:-) :-) --ANK - */ - rt6_dflt_pointer = NULL; - - if (atomic_read(&rt6_tbl_lock) == 1) - fib6_del(rt); - else - rtreq_add(rt, RT_OPER_DEL); + err = fib6_del(rt); end_bh_atomic(); - rt6_unlock(); - return 0; + + return err; } int ip6_route_del(struct in6_rtmsg *rtmsg) { struct fib6_node *fn; struct rt6_info *rt; + int err = -ESRCH; - rt6_lock(); - fn = fib6_lookup(&ip6_routing_table, &rtmsg->rtmsg_dst, &rtmsg->rtmsg_src); - rt = fn->leaf; - - /* - * Blow it away - * - * BUGGGG It will not help with Pedro's subtrees. - * We urgently need fib6_locate_node function, and - * it is not the only place where rt6_lookup is used - * for wrong purpose. - * --ANK - */ -restart: - if (rt && rt->rt6i_src.plen == rtmsg->rtmsg_src_len) { - if (rt->rt6i_dst.plen > rtmsg->rtmsg_dst_len) { - struct fib6_node *fn = rt->rt6i_node; - while ((fn = fn->parent) != NULL) { - if (fn->fn_flags & RTN_ROOT) - break; - if (fn->fn_flags & RTN_RTINFO) { - rt = fn->leaf; - goto restart; - } - } - } + start_bh_atomic(); - if (rt->rt6i_dst.plen == rtmsg->rtmsg_dst_len) { - for ( ; rt; rt = rt->u.next) { - if (rtmsg->rtmsg_ifindex && - (rt->rt6i_dev == NULL || - rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex)) - continue; - if (rtmsg->rtmsg_flags&RTF_GATEWAY && - ipv6_addr_cmp(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway)) - continue; - if (rtmsg->rtmsg_metric && - rtmsg->rtmsg_metric != rt->rt6i_metric) - continue; - ip6_del_rt(rt); - rt6_unlock(); - return 0; - } + fn = fib6_locate(&ip6_routing_table, + &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len, + &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len); + + if (fn) { + for (rt = fn->leaf; rt; rt = rt->u.next) { + if (rtmsg->rtmsg_ifindex && + (rt->rt6i_dev == NULL || + rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex)) + continue; + if (rtmsg->rtmsg_flags&RTF_GATEWAY && + ipv6_addr_cmp(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway)) + continue; + if (rtmsg->rtmsg_metric && + rtmsg->rtmsg_metric != rt->rt6i_metric) + continue; + err = ip6_del_rt(rt); + break; } } - rt6_unlock(); - - return -ESRCH; -} - - -/* - * bottom handler, runs with atomic_bh protection - */ -void __rt6_run_bh(void) -{ - struct rt6_req *rtreq; + end_bh_atomic(); - while ((rtreq = rtreq_dequeue())) { - switch (rtreq->operation) { - case RT_OPER_ADD: - fib6_add(&ip6_routing_table, rtreq->ptr); - break; - case RT_OPER_DEL: - fib6_del(rtreq->ptr); - break; - }; - kfree(rtreq); - } - rt6_bh_mask = 0; + return err; } #ifdef CONFIG_IPV6_NETLINK @@ -971,10 +792,10 @@ static int rt6_msgrcv(int unit, struct sk_buff *skb) switch (rtmsg->rtmsg_type) { case RTMSG_NEWROUTE: - ip6_route_add(rtmsg, &err); + err = ip6_route_add(rtmsg); break; case RTMSG_DELROUTE: - ip6_route_del(rtmsg); + err = ip6_route_del(rtmsg); break; default: count = -EINVAL; @@ -1047,17 +868,19 @@ void rt6_sndmsg(int type, struct in6_addr *dst, struct in6_addr *src, /* * Handle redirects */ -struct rt6_info *rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr, - struct in6_addr *target, struct device *dev, - int on_link) +void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr, + struct neighbour *neigh, int on_link) { struct rt6_info *rt, *nrt; /* Locate old route to this destination. */ - rt = rt6_lookup(dest, NULL, dev->ifindex, 0); + rt = rt6_lookup(dest, NULL, neigh->dev->ifindex, 1); - if (rt == NULL || rt->u.dst.error) - return NULL; + if (rt == NULL) + return; + + if (neigh->dev != rt->rt6i_dev) + goto out; /* Redirect received -> path was valid. 
Look, redirects are sent only in response to data packets, @@ -1066,12 +889,18 @@ struct rt6_info *rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr, dst_confirm(&rt->u.dst); /* Duplicate redirect: silently ignore. */ - if (ipv6_addr_cmp(target, &rt->rt6i_gateway) == 0) - return NULL; + if (neigh == rt->u.dst.neighbour) + goto out; - /* Current route is on-link; redirect is always invalid. */ + /* Current route is on-link; redirect is always invalid. + + Seems, previous statement is not true. It could + be node, which looks for us as on-link (f.e. proxy ndisc) + But then router serving it might decide, that we should + know truth 8)8) --ANK (980726). + */ if (!(rt->rt6i_flags&RTF_GATEWAY)) - return NULL; + goto out; #if !defined(CONFIG_IPV6_EUI64) || defined(CONFIG_IPV6_NO_PB) /* @@ -1089,16 +918,21 @@ struct rt6_info *rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr, if (ipv6_addr_cmp(saddr, &rt->rt6i_gateway)) { if (rt->rt6i_flags & RTF_DEFAULT) { - rt = ip6_routing_table.leaf; + struct rt6_info *rt1; - for (; rt; rt = rt->u.next) { - if (!ipv6_addr_cmp(saddr, &rt->rt6i_gateway)) + for (rt1 = ip6_routing_table.leaf; rt1; rt1 = rt1->u.next) { + if (!ipv6_addr_cmp(saddr, &rt1->rt6i_gateway)) { + dst_clone(&rt1->u.dst); + dst_release(&rt->u.dst); + rt = rt1; goto source_ok; + } } } - printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop " + if (net_ratelimit()) + printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop " "for redirect target\n"); - return NULL; + goto out; } source_ok: @@ -1107,36 +941,11 @@ source_ok: /* * We have finally decided to accept it. */ - if (rt->rt6i_dst.plen == 128) { - /* BUGGGG! Very bad bug. Fast path code does not protect - * itself of changing nexthop on the fly, it was supposed - * that crucial parameters (dev, nexthop, hh) ARE VOLATILE. - * --ANK - * Not fixed!! I plugged it to avoid random crashes - * (they are very unlikely, but I do not want to shrug - * every time when redirect arrives) - * but the plug must be removed. --ANK - */ - -#if 0 - /* - * Already a host route. - * - */ - if (rt->rt6i_nexthop) - neigh_release(rt->rt6i_nexthop); - rt->rt6i_flags |= RTF_MODIFIED | RTF_CACHE; - if (on_link) - rt->rt6i_flags &= ~RTF_GATEWAY; - ipv6_addr_copy(&rt->rt6i_gateway, target); - rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, target); - return rt; -#else - return NULL; -#endif - } nrt = ip6_rt_copy(rt); + if (nrt == NULL) + goto out; + nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; if (on_link) nrt->rt6i_flags &= ~RTF_GATEWAY; @@ -1144,19 +953,24 @@ source_ok: ipv6_addr_copy(&nrt->rt6i_dst.addr, dest); nrt->rt6i_dst.plen = 128; - ipv6_addr_copy(&nrt->rt6i_gateway, target); - nrt->rt6i_nexthop = ndisc_get_neigh(nrt->rt6i_dev, target); - nrt->rt6i_dev = dev; - nrt->u.dst.pmtu = ipv6_get_mtu(dev); - if (!ipv6_addr_is_multicast(&nrt->rt6i_dst.addr)) - nrt->rt6i_hoplimit = ipv6_get_hoplimit(dev); + ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key); + nrt->rt6i_nexthop = neigh_clone(neigh); + /* Reset pmtu, it may be better */ + nrt->u.dst.pmtu = ipv6_get_mtu(neigh->dev); + nrt->rt6i_hoplimit = ipv6_get_hoplimit(neigh->dev); + + if (rt6_ins(nrt)) + goto out; - rt6_lock(); - rt6_ins(nrt); - rt6_unlock(); + /* Sic! rt6_redirect is called by bh, so that it is allowed */ + dst_release(&rt->u.dst); + if (rt->rt6i_flags&RTF_CACHE) + ip6_del_rt(rt); + return; - /* BUGGGGGGG! nrt can point to nowhere. 
*/ - return nrt; +out: + dst_release(&rt->u.dst); + return; } /* @@ -1164,29 +978,25 @@ source_ok: * i.e. Path MTU discovery */ -void rt6_pmtu_discovery(struct in6_addr *addr, struct device *dev, int pmtu) +void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, + struct device *dev, u32 pmtu) { struct rt6_info *rt, *nrt; - if (pmtu < 576 || pmtu > 65536) { -#if RT6_DEBUG >= 1 - printk(KERN_DEBUG "rt6_pmtu_discovery: invalid MTU value %d\n", - pmtu); -#endif + if (pmtu < IPV6_MIN_MTU) { + if (net_ratelimit()) + printk(KERN_DEBUG "rt6_pmtu_discovery: invalid MTU value %d\n", + pmtu); return; } - rt = rt6_lookup(addr, NULL, dev->ifindex, 0); + rt = rt6_lookup(daddr, saddr, dev->ifindex, 0); - if (rt == NULL || rt->u.dst.error) { -#if RT6_DEBUG >= 2 - printk(KERN_DEBUG "rt6_pmtu_discovery: no route to host\n"); -#endif + if (rt == NULL) return; - } if (pmtu >= rt->u.dst.pmtu) - return; + goto out; /* New mtu received -> path was valid. They are sent only in response to data packets, @@ -1194,39 +1004,42 @@ void rt6_pmtu_discovery(struct in6_addr *addr, struct device *dev, int pmtu) */ dst_confirm(&rt->u.dst); - /* It is wrong, but I plugged the hole here. - On-link routes are cloned differently, - look at rt6_redirect --ANK + /* Host route. If it is static, it would be better + not to override it, but add new one, so that + when cache entry will expire old pmtu + would return automatically. */ - if (!(rt->rt6i_flags&RTF_GATEWAY)) - return; - if (rt->rt6i_dst.plen == 128) { /* * host route */ rt->u.dst.pmtu = pmtu; rt->rt6i_flags |= RTF_MODIFIED; - - return; + goto out; } - nrt = ip6_rt_copy(rt); - ipv6_addr_copy(&nrt->rt6i_dst.addr, addr); - nrt->rt6i_dst.plen = 128; - - nrt->rt6i_flags |= (RTF_DYNAMIC | RTF_CACHE); - - /* It was missing. :-) :-) - I wonder, kernel was deemed to crash after pkt_too_big - and nobody noticed it. Hey, guys, do someone really - use it? --ANK + /* Network route. + Two cases are possible: + 1. It is connected route. Action: COW + 2. It is gatewayed route or NONEXTHOP route. Action: clone it. 
*/ - nrt->rt6i_nexthop = neigh_clone(rt->rt6i_nexthop); + if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) { + nrt = rt6_cow(rt, daddr, saddr); + nrt->rt6i_flags |= RTF_DYNAMIC; + dst_release(&nrt->u.dst); + } else { + nrt = ip6_rt_copy(rt); + if (nrt == NULL) + goto out; + ipv6_addr_copy(&nrt->rt6i_dst.addr, daddr); + nrt->rt6i_dst.plen = 128; + nrt->rt6i_nexthop = neigh_clone(rt->rt6i_nexthop); + nrt->rt6i_flags |= (RTF_DYNAMIC | RTF_CACHE); + rt6_ins(nrt); + } - rt6_lock(); - rt6_ins(rt); - rt6_unlock(); +out: + dst_release(&rt->u.dst); } /* @@ -1247,16 +1060,19 @@ static struct rt6_info * ip6_rt_copy(struct rt6_info *ort) rt->u.dst.rtt = ort->u.dst.rtt; rt->u.dst.window = ort->u.dst.window; rt->u.dst.mxlock = ort->u.dst.mxlock; + rt->u.dst.dev = ort->u.dst.dev; + rt->u.dst.lastuse = jiffies; rt->rt6i_hoplimit = ort->rt6i_hoplimit; - rt->rt6i_dev = ort->rt6i_dev; + rt->rt6i_expires = ort->rt6i_expires; ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway); - rt->rt6i_keylen = ort->rt6i_keylen; rt->rt6i_flags = ort->rt6i_flags; rt->rt6i_metric = ort->rt6i_metric; memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); +#ifdef CONFIG_IPV6_SUBTREES memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); +#endif } return rt; } @@ -1266,31 +1082,17 @@ struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct device *dev) struct rt6_info *rt; struct fib6_node *fn; - RDBG(("rt6_get_dflt_router(%p,%p)[%p]", addr, dev, - __builtin_return_address(0))); -#if RT6_DEBUG >= 3 - { - int i; - - RDBG(("addr[")); - for(i = 0; i < 8; i++) { - RDBG(("%04x%c", addr->s6_addr16[i], - i == 7 ? ']' : ':')); - } - } -#endif - RDBG(("\n")); - rt6_lock(); - fn = &ip6_routing_table; + start_bh_atomic(); for (rt = fn->leaf; rt; rt=rt->u.next) { if (dev == rt->rt6i_dev && ipv6_addr_cmp(&rt->rt6i_gateway, addr) == 0) break; } - - rt6_unlock(); + if (rt) + dst_clone(&rt->u.dst); + end_bh_atomic(); return rt; } @@ -1298,24 +1100,6 @@ struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr, struct device *dev) { struct in6_rtmsg rtmsg; - struct rt6_info *rt; - int err; - - RDBG(("rt6_add_dflt_router(%p,%p)[%p] ", gwaddr, dev, - __builtin_return_address(0))); -#if RT6_DEBUG >= 3 - { - struct in6_addr *addr = gwaddr; - int i; - - RDBG(("gwaddr[")); - for(i = 0; i < 8; i++) { - RDBG(("%04x%c", addr->s6_addr16[i], - i == 7 ? ']' : ':')); - } - } -#endif - RDBG(("\n")); memset(&rtmsg, 0, sizeof(struct in6_rtmsg)); rtmsg.rtmsg_type = RTMSG_NEWROUTE; @@ -1325,48 +1109,28 @@ struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr, rtmsg.rtmsg_ifindex = dev->ifindex; - rt = ip6_route_add(&rtmsg, &err); - - /* BUGGGGGGGGGGGGGGGGGGGG! - rt can be not NULL, but point to heavens. 
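The rt6_pmtu_discovery() rewrite above only honours a Packet Too Big report when the advertised value is at least IPV6_MIN_MTU and actually smaller than the cached path MTU; everything else is ignored, with a rate-limited log for the bogus case. A compilable sketch of that acceptance test, with IPV6_MIN_MTU defined locally to the RFC 2460 value of 1280, which is an assumption; the 2.1-era header may carry a different constant:

#include <stdint.h>

#define IPV6_MIN_MTU 1280   /* assumption: RFC 2460 minimum link MTU */

/* Return the new path MTU to record, or 0 when the report must be ignored. */
static uint32_t accept_pkt_too_big(uint32_t current_pmtu, uint32_t advertised)
{
        if (advertised < IPV6_MIN_MTU)
                return 0;                  /* bogus (or hostile) ICMPv6 report */
        if (advertised >= current_pmtu)
                return 0;                  /* not a reduction, nothing to do */
        return advertised;
}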
- */ - - if (err) { - printk(KERN_DEBUG "rt6_add_dflt: ip6_route_add error %d\n", - err); - } - return rt; + ip6_route_add(&rtmsg); + return rt6_get_dflt_router(gwaddr, dev); } void rt6_purge_dflt_routers(int last_resort) { struct rt6_info *rt; - struct fib6_node *fn; u32 flags; - RDBG(("rt6_purge_dflt_routers(%d)[%p]\n", last_resort, - __builtin_return_address(0))); - fn = &ip6_routing_table; - - rt6_dflt_pointer = NULL; - if (last_resort) flags = RTF_ALLONLINK; else flags = RTF_DEFAULT | RTF_ADDRCONF; - for (rt = fn->leaf; rt; ) { - if ((rt->rt6i_flags & flags)) { - struct rt6_info *drt; -#if RT6_DEBUG >= 2 - printk(KERN_DEBUG "rt6_purge_dflt: deleting entry\n"); -#endif - drt = rt; - rt = rt->u.next; - ip6_del_rt(drt); - continue; +restart: + rt6_dflt_pointer = NULL; + + for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) { + if (rt->rt6i_flags & flags) { + ip6_del_rt(rt); + goto restart; } - rt = rt->u.next; } } @@ -1389,7 +1153,7 @@ int ipv6_route_ioctl(unsigned int cmd, void *arg) rtnl_lock(); switch (cmd) { case SIOCADDRT: - ip6_route_add(&rtmsg, &err); + err = ip6_route_add(&rtmsg); break; case SIOCDELRT: err = ip6_route_del(&rtmsg); @@ -1414,7 +1178,7 @@ int ipv6_route_ioctl(unsigned int cmd, void *arg) */ int ip6_pkt_discard(struct sk_buff *skb) -{ +{ ipv6_statistics.Ip6OutNoRoutes++; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev); kfree_skb(skb); @@ -1429,21 +1193,6 @@ int ip6_rt_addr_add(struct in6_addr *addr, struct device *dev) { struct rt6_info *rt; - RDBG(("ip6_rt_addr_add(%p,%p)[%p]\n", addr, dev, - __builtin_return_address(0))); -#if RT6_DEBUG >= 3 - { - int i; - - RDBG(("addr[")); - for(i = 0; i < 8; i++) { - RDBG(("%04x%c", addr->s6_addr16[i], - i == 7 ? ']' : ':')); - } - } -#endif - RDBG(("\n")); - rt = dst_alloc(sizeof(struct rt6_info), &ip6_dst_ops); if (rt == NULL) return -ENOMEM; @@ -1465,10 +1214,7 @@ int ip6_rt_addr_add(struct in6_addr *addr, struct device *dev) ipv6_addr_copy(&rt->rt6i_dst.addr, addr); rt->rt6i_dst.plen = 128; - - rt6_lock(); rt6_ins(rt); - rt6_unlock(); return 0; } @@ -1480,12 +1226,16 @@ int ip6_rt_addr_add(struct in6_addr *addr, struct device *dev) int ip6_rt_addr_del(struct in6_addr *addr, struct device *dev) { struct rt6_info *rt; + int err = -ENOENT; - rt = rt6_lookup(addr, NULL, loopback_dev.ifindex, RTF_LINKRT); - if (rt && rt->rt6i_dst.plen == 128) - return ip6_del_rt(rt); + rt = rt6_lookup(addr, NULL, loopback_dev.ifindex, 1); + if (rt) { + if (rt->rt6i_dst.plen == 128) + err= ip6_del_rt(rt); + dst_release(&rt->u.dst); + } - return 0; + return err; } #ifdef CONFIG_RT6_POLICY @@ -1587,75 +1337,65 @@ static struct rt6_info *rt6_flow_lookup(struct rt6_info *rt, } error: + dst_clone(&ip6_null_entry.u.dst); return &ip6_null_entry; found: - if (nrt == NULL) goto error; nrt->rt6i_flags |= RTF_CACHE; - /* BUGGGG! nrt can point to nowhere! */ - rt6_ins(nrt); - + dst_clone(&nrt->u.dst); + err = rt6_ins(nrt); + if (err) + nrt->u.dst.error = err; return nrt; } #endif -/* - * Nope, I am not idiot. I see that it is the ugliest of ugly routines. - * Anyone is advertised to write better one. 
--ANK - */ +static int fib6_ifdown(struct rt6_info *rt, void *arg) +{ + if (((void*)rt->rt6i_dev == arg || arg == NULL) && + rt != &ip6_null_entry) { + RT6_TRACE("deleted by ifdown %p\n", rt); + return -1; + } + return 0; +} -struct rt6_ifdown_arg { +void rt6_ifdown(struct device *dev) +{ + fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev); +} + +struct rt6_mtu_change_arg +{ struct device *dev; - struct rt6_info *rt; + unsigned mtu; }; - -static void rt6_ifdown_node(struct fib6_node *fn, void *p_arg) +static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) { - struct rt6_info *rt; - struct rt6_ifdown_arg *arg = (struct rt6_ifdown_arg *) p_arg; - - if (arg->rt != NULL) - return; - - for (rt = fn->leaf; rt; rt = rt->u.next) { - if (rt->rt6i_dev == arg->dev || arg->dev == NULL) { - arg->rt = rt; - return; - } - } + struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; + + /* In IPv6 pmtu discovery is not optional, + so that RTAX_MTU lock cannot dissable it. + We still use this lock to block changes + caused by addrconf/ndisc. + */ + if (rt->rt6i_dev == arg->dev && + !(rt->u.dst.mxlock&(1<<RTAX_MTU))) + rt->u.dst.pmtu = arg->mtu; + return 0; } -void rt6_ifdown(struct device *dev) +void rt6_mtu_change(struct device *dev, unsigned mtu) { - int count = 0; - struct rt6_ifdown_arg arg; - struct rt6_info *rt; + struct rt6_mtu_change_arg arg; - do { - arg.dev = dev; - arg.rt = NULL; - fib6_walk_tree(&ip6_routing_table, rt6_ifdown_node, &arg, - RT6_FILTER_RTNODES); - if (arg.rt != NULL) - ip6_del_rt(arg.rt); - count++; - } while (arg.rt != NULL); - - /* And default routes ... */ - - for (rt = ip6_routing_table.leaf; rt; ) { - if (rt != &ip6_null_entry && (rt->rt6i_dev == dev || dev == NULL)) { - struct rt6_info *deleting = rt; - rt = rt->u.next; - ip6_del_rt(deleting); - continue; - } - rt = rt->u.next; - } + arg.dev = dev; + arg.mtu = mtu; + fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg); } #ifdef CONFIG_RTNETLINK @@ -1714,37 +1454,28 @@ int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) { struct rtmsg *r = NLMSG_DATA(nlh); struct in6_rtmsg rtmsg; - int err = 0; if (inet6_rtm_to_rtmsg(r, arg, &rtmsg)) return -EINVAL; - ip6_route_add(&rtmsg, &err); - return err; + return ip6_route_add(&rtmsg); } struct rt6_rtnl_dump_arg { struct sk_buff *skb; struct netlink_callback *cb; - int skip; - int count; - int stop; }; static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, struct in6_addr *dst, struct in6_addr *src, int iif, - int type, pid_t pid, u32 seq) + int type, u32 pid, u32 seq) { struct rtmsg *rtm; struct nlmsghdr *nlh; unsigned char *b = skb->tail; -#ifdef CONFIG_RTNL_OLD_IFINFO - unsigned char *o; -#else struct rtattr *mx; -#endif struct rta_cacheinfo ci; nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*rtm)); @@ -1762,9 +1493,6 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; rtm->rtm_scope = RT_SCOPE_UNIVERSE; -#ifdef CONFIG_RTNL_OLD_IFINFO - rtm->rtm_nhs = 0; -#endif rtm->rtm_protocol = RTPROT_BOOT; if (rt->rt6i_flags&RTF_DYNAMIC) rtm->rtm_protocol = RTPROT_REDIRECT; @@ -1776,19 +1504,18 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, if (rt->rt6i_flags&RTF_CACHE) rtm->rtm_flags |= RTM_F_CLONED; -#ifdef CONFIG_RTNL_OLD_IFINFO - o = skb->tail; -#endif if (dst) { RTA_PUT(skb, RTA_DST, 16, dst); rtm->rtm_dst_len = 128; } else if (rtm->rtm_dst_len) RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr); +#ifdef CONFIG_IPV6_SUBTREES if 
(src) { RTA_PUT(skb, RTA_SRC, 16, src); rtm->rtm_src_len = 128; } else if (rtm->rtm_src_len) RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr); +#endif if (iif) RTA_PUT(skb, RTA_IIF, 4, &iif); else if (dst) { @@ -1796,14 +1523,6 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, if (ifp) RTA_PUT(skb, RTA_PREFSRC, 16, &ifp->addr); } -#ifdef CONFIG_RTNL_OLD_IFINFO - if (rt->u.dst.pmtu) - RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &rt->u.dst.pmtu); - if (rt->u.dst.window) - RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &rt->u.dst.window); - if (rt->u.dst.rtt) - RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &rt->u.dst.rtt); -#else mx = (struct rtattr*)skb->tail; RTA_PUT(skb, RTA_METRICS, 0, NULL); if (rt->u.dst.mxlock) @@ -1817,7 +1536,6 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, mx->rta_len = skb->tail - (u8*)mx; if (mx->rta_len == RTA_LENGTH(0)) skb_trim(skb, (u8*)mx - skb->data); -#endif if (rt->u.dst.neighbour) RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key); if (rt->u.dst.dev) @@ -1828,13 +1546,10 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, ci.rta_expires = rt->rt6i_expires - jiffies; else ci.rta_expires = 0; - ci.rta_used = 0; + ci.rta_used = atomic_read(&rt->u.dst.refcnt); ci.rta_clntref = atomic_read(&rt->u.dst.use); ci.rta_error = rt->u.dst.error; RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci); -#ifdef CONFIG_RTNL_OLD_IFINFO - rtm->rtm_optlen = skb->tail - o; -#endif nlh->nlmsg_len = skb->tail - b; return skb->len; @@ -1844,45 +1559,98 @@ rtattr_failure: return -1; } -static void rt6_dump_node(struct fib6_node *fn, void *p_arg) +static int rt6_dump_route(struct rt6_info *rt, void *p_arg) { - struct rt6_info *rt; struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; - if (arg->stop) - return; + return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, + NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq); +} - for (rt = fn->leaf; rt; rt = rt->u.next) { - if (arg->count < arg->skip) { - arg->count++; - continue; - } - if (rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, - NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq) <= 0) { - arg->stop = 1; - break; +static int fib6_dump_node(struct fib6_walker_t *w) +{ + int res; + struct rt6_info *rt; + + for (rt = w->leaf; rt; rt = rt->u.next) { + res = rt6_dump_route(rt, w->args); + if (res < 0) { + /* Frame is full, suspend walking */ + w->leaf = rt; + return 1; } - arg->count++; + BUG_TRAP(res!=0); } + w->leaf = NULL; + return 0; } +static int fib6_dump_done(struct netlink_callback *cb) +{ + struct fib6_walker_t *w = (void*)cb->args[0]; + + if (w) { + cb->args[0] = 0; + start_bh_atomic(); + fib6_walker_unlink(w); + end_bh_atomic(); + kfree(w); + } + if (cb->args[1]) { + cb->done = (void*)cb->args[1]; + cb->args[1] = 0; + } + return cb->done(cb); +} int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { struct rt6_rtnl_dump_arg arg; + struct fib6_walker_t *w; + int res; arg.skb = skb; arg.cb = cb; - arg.skip = cb->args[0]; - arg.count = 0; - arg.stop = 0; - start_bh_atomic(); - fib6_walk_tree(&ip6_routing_table, rt6_dump_node, &arg, RT6_FILTER_RTNODES); - if (arg.stop == 0) - rt6_dump_node(&ip6_routing_table, &arg); - end_bh_atomic(); - cb->args[0] = arg.count; - return skb->len; + + w = (void*)cb->args[0]; + if (w == NULL) { + /* New dump: + * + * 1. hook callback destructor. + */ + cb->args[1] = (long)cb->done; + cb->done = fib6_dump_done; + + /* + * 2. allocate and initialize walker. 
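The inet6_dump_fib()/fib6_dump_node() code in this hunk parks a heap-allocated fib6_walker_t in the netlink callback's args so a dump that fills one skb can be suspended (return 1, remember the current leaf) and resumed on the next read. A self-contained sketch of the same suspend/resume iteration pattern over a plain linked list; the item, walker and buffer types are invented for illustration:

#include <stddef.h>

struct item {
        struct item *next;
        int value;
};

struct walker {
        struct item *pos;                  /* where to resume; NULL when finished */
};

/* Copy items into a bounded buffer.  Returns 1 if the buffer filled up and
 * the walk must be resumed later, 0 once the list is exhausted. */
static int walk_continue(struct walker *w, int *buf, size_t room, size_t *used)
{
        *used = 0;
        while (w->pos != NULL) {
                if (*used == room)
                        return 1;          /* frame full: suspend here */
                buf[(*used)++] = w->pos->value;
                w->pos = w->pos->next;
        }
        return 0;                          /* dump complete */
}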
+ */ + w = kmalloc(sizeof(*w), GFP_KERNEL); + if (w == NULL) + return -ENOMEM; + RT6_TRACE("dump<%p", w); + memset(w, 0, sizeof(*w)); + w->root = &ip6_routing_table; + w->func = fib6_dump_node; + w->args = &arg; + cb->args[0] = (long)w; + start_bh_atomic(); + res = fib6_walk(w); + end_bh_atomic(); + } else { + w->args = &arg; + start_bh_atomic(); + res = fib6_walk_continue(w); + end_bh_atomic(); + } +#if RT6_DEBUG >= 3 + if (res <= 0 && skb->len == 0) + RT6_TRACE("%p>dump end\n", w); +#endif + /* res < 0 is an error. (really, impossible) + res == 0 means that dump is complete, but skb still can contain data. + res > 0 dump is not complete, but frame is full. + */ + return res < 0 ? res : skb->len; } int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) @@ -1974,10 +1742,10 @@ void inet6_rt_notify(int event, struct rt6_info *rt) #ifdef CONFIG_PROC_FS - #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1) -struct rt6_proc_arg { +struct rt6_proc_arg +{ char *buffer; int offset; int length; @@ -1985,109 +1753,18 @@ struct rt6_proc_arg { int len; }; -static void rt6_info_node(struct fib6_node *fn, void *p_arg) +static int rt6_info_route(struct rt6_info *rt, void *p_arg) { - struct rt6_info *rt; struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg; - - for (rt = fn->leaf; rt; rt = rt->u.next) { - int i; - - if (arg->skip < arg->offset / RT6_INFO_LEN) { - arg->skip++; - continue; - } - - if (arg->len >= arg->length) - return; - - for (i=0; i<16; i++) { - sprintf(arg->buffer + arg->len, "%02x", - rt->rt6i_dst.addr.s6_addr[i]); - arg->len += 2; - } - arg->len += sprintf(arg->buffer + arg->len, " %02x ", - rt->rt6i_dst.plen); - - for (i=0; i<16; i++) { - sprintf(arg->buffer + arg->len, "%02x", - rt->rt6i_src.addr.s6_addr[i]); - arg->len += 2; - } - arg->len += sprintf(arg->buffer + arg->len, " %02x ", - rt->rt6i_src.plen); - - if (rt->rt6i_nexthop) { - for (i=0; i<16; i++) { - sprintf(arg->buffer + arg->len, "%02x", - rt->rt6i_nexthop->primary_key[i]); - arg->len += 2; - } - } else { - sprintf(arg->buffer + arg->len, - "00000000000000000000000000000000"); - arg->len += 32; - } - arg->len += sprintf(arg->buffer + arg->len, - " %08x %08x %08x %08x %8s\n", - rt->rt6i_metric, atomic_read(&rt->rt6i_use), - atomic_read(&rt->rt6i_ref), rt->rt6i_flags, - rt->rt6i_dev ? rt->rt6i_dev->name : ""); - } -} - -static int rt6_proc_info(char *buffer, char **start, off_t offset, int length, - int dummy) -{ - struct rt6_proc_arg arg; - arg.buffer = buffer; - arg.offset = offset; - arg.length = length; - arg.skip = 0; - arg.len = 0; - - fib6_walk_tree(&ip6_routing_table, rt6_info_node, &arg, - RT6_FILTER_RTNODES); - - rt6_info_node(&ip6_routing_table, &arg); - - *start = buffer; - if (offset) - *start += offset % RT6_INFO_LEN; - - arg.len -= offset % RT6_INFO_LEN; - - if(arg.len > length) - arg.len = length; - if(arg.len < 0) - arg.len = 0; - - return arg.len; -} - -#define PTR_SZ (sizeof(void *) * 2) -#define FI_LINE_SZ (2 * (PTR_SZ) + 7 + 32 + 4 + 32 + 4) - -static void rt6_tree_node(struct fib6_node *fn, void *p_arg) -{ - struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg; - struct rt6_info *rt; - char f; int i; - rt = fn->leaf; - - if (arg->skip < arg->offset / FI_LINE_SZ) { + if (arg->skip < arg->offset / RT6_INFO_LEN) { arg->skip++; - return; + return 0; } - if (arg->len + FI_LINE_SZ >= arg->length) - return; - - f = (fn->fn_flags & RTN_RTINFO) ? 
'r' : 'n'; - arg->len += sprintf(arg->buffer + arg->len, "%p %p %02x %c ", - fn, fn->parent, fn->fn_bit, f); + if (arg->len >= arg->length) + return 0; for (i=0; i<16; i++) { sprintf(arg->buffer + arg->len, "%02x", @@ -2096,18 +1773,41 @@ static void rt6_tree_node(struct fib6_node *fn, void *p_arg) } arg->len += sprintf(arg->buffer + arg->len, " %02x ", rt->rt6i_dst.plen); - + +#ifdef CONFIG_IPV6_SUBTREES for (i=0; i<16; i++) { sprintf(arg->buffer + arg->len, "%02x", rt->rt6i_src.addr.s6_addr[i]); arg->len += 2; } - arg->len += sprintf(arg->buffer + arg->len, " %02x\n", + arg->len += sprintf(arg->buffer + arg->len, " %02x ", rt->rt6i_src.plen); +#else + sprintf(arg->buffer + arg->len, + "00000000000000000000000000000000 00 "); + arg->len += 36; +#endif + if (rt->rt6i_nexthop) { + for (i=0; i<16; i++) { + sprintf(arg->buffer + arg->len, "%02x", + rt->rt6i_nexthop->primary_key[i]); + arg->len += 2; + } + } else { + sprintf(arg->buffer + arg->len, + "00000000000000000000000000000000"); + arg->len += 32; + } + arg->len += sprintf(arg->buffer + arg->len, + " %08x %08x %08x %08x %8s\n", + rt->rt6i_metric, atomic_read(&rt->u.dst.use), + atomic_read(&rt->u.dst.refcnt), rt->rt6i_flags, + rt->rt6i_dev ? rt->rt6i_dev->name : ""); + return 0; } -static int rt6_proc_tree(char *buffer, char **start, off_t offset, int length, +static int rt6_proc_info(char *buffer, char **start, off_t offset, int length, int dummy) { struct rt6_proc_arg arg; @@ -2117,7 +1817,7 @@ static int rt6_proc_tree(char *buffer, char **start, off_t offset, int length, arg.skip = 0; arg.len = 0; - fib6_walk_tree(&ip6_routing_table, rt6_tree_node, &arg, 0); + fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg); *start = buffer; if (offset) @@ -2125,15 +1825,14 @@ static int rt6_proc_tree(char *buffer, char **start, off_t offset, int length, arg.len -= offset % RT6_INFO_LEN; - if(arg.len > length) + if (arg.len > length) arg.len = length; - if(arg.len < 0) + if (arg.len < 0) arg.len = 0; return arg.len; } - extern struct rt6_statistics rt6_stats; static int rt6_proc_stats(char *buffer, char **start, off_t offset, int length, @@ -2141,10 +1840,11 @@ static int rt6_proc_stats(char *buffer, char **start, off_t offset, int length, { int len; - len = sprintf(buffer, "%04x %04x %04x %04x %04x\n", + len = sprintf(buffer, "%04x %04x %04x %04x %04x %04x\n", rt6_stats.fib_nodes, rt6_stats.fib_route_nodes, rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries, - rt6_stats.fib_rt_cache); + rt6_stats.fib_rt_cache, + atomic_read(&ip6_dst_ops.entries)); len -= offset; @@ -2164,12 +1864,6 @@ static struct proc_dir_entry proc_rt6_info = { 0, &proc_net_inode_operations, rt6_proc_info }; -static struct proc_dir_entry proc_rt6_tree = { - PROC_NET_RT6_TREE, 7, "ip6_fib", - S_IFREG | S_IRUGO, 1, 0, 0, - 0, &proc_net_inode_operations, - rt6_proc_tree -}; static struct proc_dir_entry proc_rt6_stats = { PROC_NET_RT6_STATS, 9, "rt6_stats", S_IFREG | S_IRUGO, 1, 0, 0, @@ -2230,7 +1924,6 @@ __initfunc(void ip6_route_init(void)) { #ifdef CONFIG_PROC_FS proc_net_register(&proc_rt6_info); - proc_net_register(&proc_rt6_tree); proc_net_register(&proc_rt6_stats); #endif #ifdef CONFIG_IPV6_NETLINK @@ -2243,7 +1936,6 @@ void ip6_route_cleanup(void) { #ifdef CONFIG_PROC_FS proc_net_unregister(PROC_NET_RT6); - proc_net_unregister(PROC_NET_RT6_TREE); proc_net_unregister(PROC_NET_RT6_STATS); #endif #ifdef CONFIG_IPV6_NETLINK diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 577b85d0f..0d6efd515 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -6,7 +6,7 @@ * Pedro 
Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * - * $Id: sit.c,v 1.27 1998/03/08 05:56:57 davem Exp $ + * $Id: sit.c,v 1.28 1998/08/26 12:05:22 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -434,7 +434,7 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct device *dev) ip_rt_put(rt); goto tx_error; } - if (mtu >= 576) { + if (mtu >= IPV6_MIN_MTU) { if (skb->dst && mtu < skb->dst->pmtu) { struct rt6_info *rt6 = (struct rt6_info*)skb->dst; if (mtu < rt6->u.dst.pmtu) { @@ -475,6 +475,8 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct device *dev) tunnel->recursion--; return 0; } + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); dev_kfree_skb(skb); skb = new_skb; } @@ -491,7 +493,7 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct device *dev) iph = skb->nh.iph; iph->version = 4; iph->ihl = sizeof(struct iphdr)>>2; - if (mtu > 576) + if (mtu > IPV6_MIN_MTU) iph->frag_off = __constant_htons(IP_DF); else iph->frag_off = 0; @@ -608,7 +610,7 @@ static struct net_device_stats *ipip6_tunnel_get_stats(struct device *dev) static int ipip6_tunnel_change_mtu(struct device *dev, int new_mtu) { - if (new_mtu < 576 || new_mtu > 0xFFF8 - sizeof(struct iphdr)) + if (new_mtu < IPV6_MIN_MTU || new_mtu > 0xFFF8 - sizeof(struct iphdr)) return -EINVAL; dev->mtu = new_mtu; return 0; @@ -662,8 +664,8 @@ static int ipip6_tunnel_init(struct device *dev) if (tdev) { dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); dev->mtu = tdev->mtu - sizeof(struct iphdr); - if (dev->mtu < 576) - dev->mtu = 576; + if (dev->mtu < IPV6_MIN_MTU) + dev->mtu = IPV6_MIN_MTU; } dev->iflink = tunnel->parms.link; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5fa45dce5..c997999db 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: tcp_ipv6.c,v 1.82 1998/06/11 03:15:52 davem Exp $ + * $Id: tcp_ipv6.c,v 1.89 1998/08/28 00:27:54 davem Exp $ * * Based on: * linux/net/ipv4/tcp.c @@ -123,16 +123,33 @@ static int tcp_v6_verify_bind(struct sock *sk, unsigned short snum) } if(result == 0) { if(tb == NULL) { - if(tcp_bucket_create(snum) == NULL) + if((tb = tcp_bucket_create(snum)) == NULL) result = 1; + else if (sk->reuse && sk->state != TCP_LISTEN) + tb->flags |= TCPB_FLAG_FASTREUSE; } else { /* It could be pending garbage collection, this * kills the race and prevents it from disappearing * out from under us by the time we use it. -DaveM */ - if(tb->owners == NULL && !(tb->flags & TCPB_FLAG_LOCKED)) { - tb->flags = TCPB_FLAG_LOCKED; - tcp_dec_slow_timer(TCP_SLT_BUCKETGC); + if(tb->owners == NULL) { + if (!(tb->flags & TCPB_FLAG_LOCKED)) { + tb->flags = (TCPB_FLAG_LOCKED | + ((sk->reuse && + sk->state != TCP_LISTEN) ? + TCPB_FLAG_FASTREUSE : 0)); + tcp_dec_slow_timer(TCP_SLT_BUCKETGC); + } else if (!(tb->flags & TCPB_FLAG_GOODSOCKNUM)) { + /* Someone is in between the bind + * and the actual connect or listen. + * See if it was a legitimate reuse + * and we are as well, else punt. 
+ */ + if (sk->reuse == 0 || + !(tb->flags & TCPB_FLAG_FASTREUSE)) + result = 1; + } else + tb->flags &= ~TCPB_FLAG_GOODSOCKNUM; } } } @@ -358,7 +375,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct dst_entry *dst; struct sk_buff *buff; int addr_type; - int mss; if (sk->state != TCP_CLOSE) return(-EISCONN); @@ -403,6 +419,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, */ if (addr_type == IPV6_ADDR_MAPPED) { + u32 exthdrlen = tp->ext_header_len; struct sockaddr_in sin; int err; @@ -418,10 +435,10 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); if (err) { + tp->ext_header_len = exthdrlen; sk->tp_pinfo.af_tcp.af_specific = &ipv6_specific; sk->backlog_rcv = tcp_v6_do_rcv; } else { - /* Yuup... And it is not the only place... --ANK */ ipv6_addr_set(&np->saddr, 0, 0, __constant_htonl(0x0000FFFF), sk->saddr); ipv6_addr_set(&np->rcv_saddr, 0, 0, __constant_htonl(0x0000FFFF), @@ -441,18 +458,18 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl.uli_u.ports.dport = usin->sin6_port; fl.uli_u.ports.sport = sk->sport; + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; + fl.nl_u.ip6_u.daddr = rt0->addr; + } + dst = ip6_route_output(sk, &fl); - + if (dst->error) { dst_release(dst); return dst->error; } - if (dst->pmtu < 576) { - dst_release(dst); - return -EINVAL; - } - if (fl.oif == 0 && addr_type&IPV6_ADDR_LINKLOCAL) { /* Ough! This guy tries to connect to link local * address and did not specify interface. @@ -462,11 +479,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->bound_dev_if = dst->dev->ifindex; } - ip6_dst_store(sk, dst); + ip6_dst_store(sk, dst, NULL); if (saddr == NULL) { ifa = ipv6_get_saddr(dst, &np->daddr); - + if (ifa == NULL) return -ENETUNREACH; @@ -477,6 +494,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, ipv6_addr_copy(&np->saddr, saddr); } + tp->ext_header_len = 0; + if (np->opt) + tp->ext_header_len = np->opt->opt_flen+np->opt->opt_nflen; + /* Reset mss clamp */ + tp->mss_clamp = ~0; + buff = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header), 0, GFP_KERNEL); @@ -498,15 +521,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, np->daddr.s6_addr32[3], sk->sport, sk->dport); - sk->mtu = dst->pmtu; - mss = sk->mtu - sizeof(struct ipv6hdr); -#if 0 - if (np->opt) { - /* Adjust mss */ - } -#endif - - tcp_connect(sk, buff, mss); + tcp_connect(sk, buff, dst->pmtu); return 0; } @@ -555,10 +570,12 @@ out: return retval; } -void tcp_v6_err(struct sk_buff *skb, int type, int code, unsigned char *header, __u32 info, - struct in6_addr *saddr, struct in6_addr *daddr, - struct inet6_protocol *protocol) +void tcp_v6_err(struct sk_buff *skb, struct ipv6hdr *hdr, + struct inet6_skb_parm *opt, + int type, int code, unsigned char *header, __u32 info) { + struct in6_addr *saddr = &hdr->saddr; + struct in6_addr *daddr = &hdr->daddr; struct tcphdr *th = (struct tcphdr *)header; struct ipv6_pinfo *np; struct sock *sk; @@ -567,7 +584,8 @@ void tcp_v6_err(struct sk_buff *skb, int type, int code, unsigned char *header, struct tcp_opt *tp; __u32 seq; - /* XXX: length check for tcphdr missing here */ + if (header + 8 > skb->tail) + return; sk = tcp_v6_lookup(daddr, th->dest, saddr, th->source, skb->dev->ifindex); @@ -588,15 +606,20 @@ void tcp_v6_err(struct sk_buff *skb, int type, int code, unsigned char *header, np = 
&sk->net_pinfo.af_inet6; if (type == ICMPV6_PKT_TOOBIG && sk->state != TCP_LISTEN) { + struct dst_entry *dst = NULL; /* icmp should have updated the destination cache entry */ if (sk->dst_cache) - dst_check(&sk->dst_cache, np->dst_cookie); + dst = dst_check(&sk->dst_cache, np->dst_cookie); - if (sk->dst_cache == NULL) { + if (dst == NULL) { struct flowi fl; struct dst_entry *dst; - + + /* BUGGG_FUTURE: Again, it is not clear how + to handle rthdr case. Ignore this complexity + for now. + */ fl.proto = IPPROTO_TCP; fl.nl_u.ip6_u.daddr = &np->daddr; fl.nl_u.ip6_u.saddr = &np->saddr; @@ -605,23 +628,19 @@ void tcp_v6_err(struct sk_buff *skb, int type, int code, unsigned char *header, fl.uli_u.ports.sport = sk->sport; dst = ip6_route_output(sk, &fl); + } else + dst = dst_clone(dst); - ip6_dst_store(sk, dst); - } - - if (sk->dst_cache->error) { - sk->err_soft = sk->dst_cache->error; - } else { - /* FIXME: Reset sk->mss, taking into account TCP option - * bytes for timestamps. -DaveM - */ - sk->mtu = sk->dst_cache->pmtu; - } - if (sk->sock_readers) { /* remove later */ - printk(KERN_DEBUG "tcp_v6_err: pmtu disc: socket locked.\n"); - return; - } - tcp_simple_retransmit(sk); + if (dst->error) { + sk->err_soft = dst->error; + } else if (tp->pmtu_cookie > dst->pmtu + && !atomic_read(&sk->sock_readers)) { + lock_sock(sk); + tcp_sync_mss(sk, dst->pmtu); + tcp_simple_retransmit(sk); + release_sock(sk); + } /* else let the usual retransmit timer handle it */ + dst_release(dst); return; } @@ -631,7 +650,7 @@ void tcp_v6_err(struct sk_buff *skb, int type, int code, unsigned char *header, struct open_request *req, *prev; struct ipv6hdr hd; case TCP_LISTEN: - if (sk->sock_readers) + if (atomic_read(&sk->sock_readers)) return; /* Grrrr - fix this later. */ @@ -680,6 +699,7 @@ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req) { struct sk_buff * skb; struct dst_entry *dst; + struct ipv6_txoptions *opt = NULL; struct flowi fl; int mss; @@ -690,19 +710,26 @@ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req) fl.uli_u.ports.dport = req->rmt_port; fl.uli_u.ports.sport = sk->sport; - dst = ip6_route_output(sk, &fl); - if (dst->error) { - dst_release(dst); - return; + opt = sk->net_pinfo.af_inet6.opt; + if (opt == NULL && + sk->net_pinfo.af_inet6.rxopt.bits.srcrt == 2 && + req->af.v6_req.pktopts) { + struct sk_buff *pktopts = req->af.v6_req.pktopts; + struct inet6_skb_parm *rxopt = (struct inet6_skb_parm *)pktopts->cb; + if (rxopt->srcrt) + opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(pktopts->nh.raw + rxopt->srcrt)); } - mss = dst->pmtu - sizeof(struct ipv6hdr) - sizeof(struct tcphdr); -#if 0 - /* Subtract option length... 
*/ - if (opt) { - mss -= opt->optlen; + if (opt && opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; + fl.nl_u.ip6_u.daddr = rt0->addr; } -#endif + + dst = ip6_route_output(sk, &fl); + if (dst->error) + goto done; + + mss = dst->pmtu - sizeof(struct ipv6hdr) - sizeof(struct tcphdr); skb = tcp_make_synack(sk, dst, req, mss); if (skb) { @@ -712,13 +739,22 @@ static void tcp_v6_send_synack(struct sock *sk, struct open_request *req) &req->af.v6_req.loc_addr, &req->af.v6_req.rmt_addr, csum_partial((char *)th, skb->len, skb->csum)); - ip6_xmit(sk, skb, &fl, req->af.v6_req.opt); + fl.nl_u.ip6_u.daddr = &req->af.v6_req.rmt_addr; + ip6_xmit(sk, skb, &fl, opt); } + +done: dst_release(dst); + if (opt && opt != sk->net_pinfo.af_inet6.opt) + sock_kfree_s(sk, opt, opt->tot_len); } static void tcp_v6_or_free(struct open_request *req) { + if (req->af.v6_req.pktopts) { + kfree_skb(req->af.v6_req.pktopts); + req->af.v6_req.pktopts = NULL; + } } static struct or_calltable or_ipv6 = { @@ -727,14 +763,27 @@ static struct or_calltable or_ipv6 = { tcp_v6_send_reset }; +static int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) +{ + struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb; + + if (sk->net_pinfo.af_inet6.rxopt.all) { + if ((opt->hop && sk->net_pinfo.af_inet6.rxopt.bits.hopopts) || + (opt->srcrt && sk->net_pinfo.af_inet6.rxopt.bits.srcrt) || + ((opt->dst1 || opt->dst0) && sk->net_pinfo.af_inet6.rxopt.bits.dstopts)) + return 1; + } + return 0; +} + + #define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */ #define BACKLOGMAX(sk) sysctl_max_syn_backlog /* FIXME: this is substantially similar to the ipv4 code. * Can some kind of merge be done? -- erics */ -static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, - __u32 isn) +static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, __u32 isn) { struct tcp_opt tp; struct open_request *req; @@ -747,7 +796,11 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, } if (skb->protocol == __constant_htons(ETH_P_IP)) - return tcp_v4_conn_request(sk, skb, ptr, isn); + return tcp_v4_conn_request(sk, skb, isn); + + /* FIXME: do the same check for anycast */ + if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) + goto drop; if (isn == 0) isn = tcp_v6_init_sequence(sk,skb); @@ -756,8 +809,9 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, * There are no SYN attacks on IPv6, yet... 
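Several hunks in this file now route via the first address of a type-0 routing header (rt0->addr) when the socket carries one in np->opt->srcrt, and only put the real destination back when the header is finally built. A trimmed-down sketch of that destination selection; the struct names below are placeholders, not the kernel types:

#include <netinet/in.h>
#include <stddef.h>

struct toy_flow {
        const struct in6_addr *daddr;      /* address the routing lookup will use */
};

struct toy_rt0 {
        struct in6_addr addr[1];           /* first intermediate hop of the source route */
};

/* Pick the routing destination: the first source-route hop when one is
 * configured on the socket, otherwise the final destination itself. */
static void toy_flow_set_dst(struct toy_flow *fl,
                             const struct in6_addr *final_dst,
                             const struct toy_rt0 *srcrt)
{
        fl->daddr = srcrt ? &srcrt->addr[0] : final_dst;
}

The transmit path further down shows the second half of the idea: after ip6_route_output() the flow's daddr is restored to np->daddr, so the packet header still names the final destination.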
*/ if (BACKLOG(sk) >= BACKLOGMAX(sk)) { - printk(KERN_DEBUG "droping syn ack:%d max:%d\n", - BACKLOG(sk), BACKLOGMAX(sk)); + (void)(net_ratelimit() && + printk(KERN_INFO "droping syn ack:%d max:%d\n", + BACKLOG(sk), BACKLOGMAX(sk))); goto drop; } @@ -773,13 +827,16 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, req->rcv_isn = TCP_SKB_CB(skb)->seq; req->snt_isn = isn; tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0; - tp.in_mss = 536; + tp.mss_clamp = 65535; tcp_parse_options(NULL, skb->h.th, &tp, 0); - req->mss = tp.in_mss; - if (tp.saw_tstamp) { - req->mss -= TCPOLEN_TSTAMP_ALIGNED; + if (tp.mss_clamp == 65535) + tp.mss_clamp = 576 - sizeof(struct ipv6hdr) - sizeof(struct iphdr); + if (sk->tp_pinfo.af_tcp.user_mss && sk->tp_pinfo.af_tcp.user_mss < tp.mss_clamp) + tp.mss_clamp = sk->tp_pinfo.af_tcp.user_mss; + + req->mss = tp.mss_clamp; + if (tp.saw_tstamp) req->ts_recent = tp.rcv_tsval; - } req->tstamp_ok = tp.tstamp_ok; req->sack_ok = tp.sack_ok; req->snd_wscale = tp.snd_wscale; @@ -787,7 +844,11 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, req->rmt_port = skb->h.th->source; ipv6_addr_copy(&req->af.v6_req.rmt_addr, &skb->nh.ipv6h->saddr); ipv6_addr_copy(&req->af.v6_req.loc_addr, &skb->nh.ipv6h->daddr); - req->af.v6_req.opt = NULL; /* FIXME: options */ + req->af.v6_req.pktopts = NULL; + if (ipv6_opt_accepted(sk, skb)) { + atomic_inc(&skb->users); + req->af.v6_req.pktopts = skb; + } req->af.v6_req.iif = sk->bound_dev_if; /* So that link locals have meaning */ @@ -804,8 +865,6 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr, tcp_inc_slow_timer(TCP_SLT_SYNACK); tcp_synq_queue(&sk->tp_pinfo.af_tcp, req); - sk->data_ready(sk, 0); - return 0; drop: @@ -832,8 +891,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, struct flowi fl; struct tcp_opt *newtp; struct sock *newsk; - int mss; - + struct ipv6_txoptions *opt; + if (skb->protocol == __constant_htons(ETH_P_IP)) { /* * v6 mapped @@ -856,21 +915,37 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->tp_pinfo.af_tcp.af_specific = &ipv6_mapped; newsk->backlog_rcv = tcp_v4_do_rcv; + newsk->net_pinfo.af_inet6.pktoptions = NULL; + newsk->net_pinfo.af_inet6.opt = NULL; + + /* It is tricky place. Until this moment IPv4 tcp + worked with IPv6 af_tcp.af_specific. + Sync it now. 
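The conn_request and syn_recv_sock changes in this area replace the fixed sk->mss/sk->mtu pair with the tcp_sync_mss() call that follows, driven by dst->pmtu plus an mss_clamp taken from the peer's MSS option or the locally requested maximum segment size. The underlying arithmetic is just path MTU minus the fixed IPv6 and TCP header sizes; a hedged sketch with the header sizes written out as constants rather than sizeof on kernel structs:

#include <stdint.h>

#define IPV6_HDR_LEN 40                    /* fixed IPv6 header */
#define TCP_HDR_LEN  20                    /* TCP header without options */

/* Derive the segment size for an IPv6 path, honouring a clamp taken from
 * the peer's MSS option or the locally requested maximum. */
static uint32_t ipv6_tcp_mss(uint32_t path_mtu, uint32_t mss_clamp)
{
        uint32_t mss = path_mtu - IPV6_HDR_LEN - TCP_HDR_LEN;

        if (mss > mss_clamp)
                mss = mss_clamp;
        return mss;
}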
+ */ + tcp_sync_mss(newsk, newsk->tp_pinfo.af_tcp.pmtu_cookie); return newsk; } + opt = sk->net_pinfo.af_inet6.opt; if (sk->ack_backlog > sk->max_ack_backlog) - return NULL; + goto out; + + if (sk->net_pinfo.af_inet6.rxopt.bits.srcrt == 2 && + opt == NULL && req->af.v6_req.pktopts) { + struct inet6_skb_parm *rxopt = (struct inet6_skb_parm *)req->af.v6_req.pktopts->cb; + if (rxopt->srcrt) + opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(req->af.v6_req.pktopts->nh.raw+rxopt->srcrt)); + } if (dst == NULL) { - /* - * options / mss / route cache - */ - fl.proto = IPPROTO_TCP; fl.nl_u.ip6_u.daddr = &req->af.v6_req.rmt_addr; + if (opt && opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; + fl.nl_u.ip6_u.daddr = rt0->addr; + } fl.nl_u.ip6_u.saddr = &req->af.v6_req.loc_addr; fl.oif = sk->bound_dev_if; fl.uli_u.ports.dport = req->rmt_port; @@ -879,22 +954,17 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, dst = ip6_route_output(sk, &fl); } - if (dst->error || dst->pmtu < 576) + if (dst->error) goto out; - + sk->tp_pinfo.af_tcp.syn_backlog--; sk->ack_backlog++; - mss = dst->pmtu - sizeof(struct ipv6hdr); -#if 0 - /* Adjust mss by option size */ -#endif - - newsk = tcp_create_openreq_child(sk, req, skb, mss); + newsk = tcp_create_openreq_child(sk, req, skb); if (newsk == NULL) goto out; - ip6_dst_store(newsk, dst); + ip6_dst_store(newsk, dst, NULL); newtp = &(newsk->tp_pinfo.af_tcp); @@ -903,18 +973,55 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, ipv6_addr_copy(&np->saddr, &req->af.v6_req.loc_addr); ipv6_addr_copy(&np->rcv_saddr, &req->af.v6_req.loc_addr); newsk->bound_dev_if = req->af.v6_req.iif; - newsk->mtu = dst->pmtu; + + /* Now IPv6 options... + + First: no IPv4 options. + */ newsk->opt = NULL; + /* Clone RX bits */ + np->rxopt.all = sk->net_pinfo.af_inet6.rxopt.all; + + /* Clone pktoptions received with SYN */ + np->pktoptions = req->af.v6_req.pktopts; + if (np->pktoptions) + atomic_inc(&np->pktoptions->users); + np->opt = NULL; + + /* Clone native IPv6 options from listening socket (if any) + + Yes, keeping reference count would be much more clever, + but we make one more one thing there: reattach optmem + to newsk. 
+ */ + if (opt) { + np->opt = ipv6_dup_options(newsk, opt); + if (opt != sk->net_pinfo.af_inet6.opt) + sock_kfree_s(sk, opt, opt->tot_len); + } + + newtp->ext_header_len = 0; + if (np->opt) + newtp->ext_header_len = np->opt->opt_nflen + np->opt->opt_flen; + + tcp_sync_mss(newsk, dst->pmtu); + newtp->rcv_mss = newtp->mss_clamp; + newsk->daddr = LOOPBACK4_IPV6; newsk->saddr = LOOPBACK4_IPV6; newsk->rcv_saddr= LOOPBACK4_IPV6; newsk->prot->hash(newsk); add_to_prot_sklist(newsk); + + sk->data_ready(sk, 0); /* Deliver SIGIO */ + return newsk; out: + if (opt && opt != sk->net_pinfo.af_inet6.opt) + sock_kfree_s(sk, opt, opt->tot_len); dst_release(dst); return NULL; } @@ -1020,8 +1127,8 @@ static void tcp_v6_rst_req(struct sock *sk, struct sk_buff *skb) if (!req) return; /* Sequence number check required by RFC793 */ - if (before(TCP_SKB_CB(skb)->seq, req->snt_isn) || - after(TCP_SKB_CB(skb)->seq, req->snt_isn+1)) + if (before(TCP_SKB_CB(skb)->seq, req->rcv_isn) || + after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1)) return; if(req->sk) sk->ack_backlog--; @@ -1055,7 +1162,7 @@ static inline struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) } #if 0 /*def CONFIG_SYN_COOKIES */ else { - sk = cookie_v6_check(sk, skb, (struct ipv6_options *) skb->cb); + sk = cookie_v6_check(sk, skb); } #endif } @@ -1064,6 +1171,8 @@ static inline struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) { + int users = 0; + /* Imagine: socket is IPv6. IPv4 packet arrives, goes to IPv4 receive handler and backlogged. From backlog it always goes here. Kerboom... @@ -1080,6 +1189,8 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) * is currently called with bh processing disabled. */ + ipv6_statistics.Ip6InDelivers++; + /* XXX We need to think more about socket locking * XXX wrt. backlog queues, __release_sock(), etc. -DaveM */ @@ -1092,9 +1203,29 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) */ skb_set_owner_r(skb, sk); + /* Do Stevens' IPV6_PKTOPTIONS. + + Yes, guys, it is the only place in our code, where we + may make it not affecting IPv4. + The rest of code is protocol independent, + and I do not like idea to uglify IPv4. + + Actually, all the idea behind IPV6_PKTOPTIONS + looks not very well thought. For now we latch + options, received in the last packet, enqueued + by tcp. Feel free to propose better solution. + --ANK (980728) + */ + if (sk->net_pinfo.af_inet6.rxopt.all) { + users = atomic_read(&skb->users); + atomic_inc(&skb->users); + } + if (sk->state == TCP_ESTABLISHED) { /* Fast path */ if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) goto reset; + if (users) + goto ipv6_pktoptions; release_sock(sk); return 0; } @@ -1110,26 +1241,60 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) sk = nsk; } - if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->cb, skb->len)) + if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) goto reset; + if (users) + goto ipv6_pktoptions; release_sock(sk); return 0; reset: tcp_v6_send_reset(skb); discard: + if (users) + kfree_skb(skb); kfree_skb(skb); release_sock(sk); return 0; + +ipv6_pktoptions: + /* Do you ask, what is it? + + 1. skb was enqueued by tcp. + 2. skb is added to tail of read queue, rather than out of order. + 3. socket is not in passive state. + 4. Finally, it really contains options, which user wants to receive. 
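The four conditions listed above guard the IPV6_PKTOPTIONS latch that follows: the newest in-order segment that really carries requested options replaces whatever skb was stored on the socket before, and the displaced buffer is freed. A small single-threaded sketch of that exchange-and-free idiom; the kernel gets the same effect with xchg() under the socket lock:

#include <stdlib.h>

struct pktopts {
        int placeholder;                   /* stands in for the parsed options */
};

/* Latch the newest options buffer, dropping whatever was stored before. */
static void latch_pktoptions(struct pktopts **slot, struct pktopts *newest)
{
        struct pktopts *old = *slot;       /* the kernel swaps with xchg() */

        *slot = newest;
        if (old != NULL)
                free(old);                 /* kfree_skb() of the displaced skb */
}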
+ */ + if (atomic_read(&skb->users) > users && + TCP_SKB_CB(skb)->end_seq == sk->tp_pinfo.af_tcp.rcv_nxt && + !((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))) { + if (ipv6_opt_accepted(sk, skb)) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + kfree_skb(skb); + skb = NULL; + if (skb2) { + skb_set_owner_r(skb2, sk); + skb = xchg(&sk->net_pinfo.af_inet6.pktoptions, skb2); + } + } else { + kfree_skb(skb); + skb = xchg(&sk->net_pinfo.af_inet6.pktoptions, NULL); + } + } + + if (skb) + kfree_skb(skb); + release_sock(sk); + return 0; } -int tcp_v6_rcv(struct sk_buff *skb, struct device *dev, - struct in6_addr *saddr, struct in6_addr *daddr, - struct ipv6_options *opt, unsigned short len, - int redo, struct inet6_protocol *protocol) +int tcp_v6_rcv(struct sk_buff *skb, unsigned long len) { struct tcphdr *th; struct sock *sk; + struct device *dev = skb->dev; + struct in6_addr *saddr = &skb->nh.ipv6h->saddr; + struct in6_addr *daddr = &skb->nh.ipv6h->daddr; th = skb->h.th; @@ -1178,7 +1343,7 @@ int tcp_v6_rcv(struct sk_buff *skb, struct device *dev, if(sk->state == TCP_TIME_WAIT) goto do_time_wait; - if (!sk->sock_readers) + if (!atomic_read(&sk->sock_readers)) return tcp_v6_do_rcv(sk, skb); __skb_queue_tail(&sk->back_log, skb); @@ -1198,7 +1363,7 @@ discard_it: do_time_wait: if(tcp_timewait_state_process((struct tcp_tw_bucket *)sk, - skb, th, &(IPCB(skb)->opt), skb->len)) + skb, th, skb->len)) goto no_tcp_socket; goto discard_it; } @@ -1221,6 +1386,12 @@ static int tcp_v6_rebuild_header(struct sock *sk) fl.uli_u.ports.dport = sk->dport; fl.uli_u.ports.sport = sk->sport; + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; + fl.nl_u.ip6_u.daddr = rt0->addr; + } + + dst = ip6_route_output(sk, &fl); if (dst->error) { @@ -1228,7 +1399,7 @@ static int tcp_v6_rebuild_header(struct sock *sk) return dst->error; } - ip6_dst_store(sk, dst); + ip6_dst_store(sk, dst, NULL); } return dst->error; @@ -1258,6 +1429,11 @@ static void tcp_v6_xmit(struct sk_buff *skb) fl.uli_u.ports.sport = sk->sport; fl.uli_u.ports.dport = sk->dport; + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; + fl.nl_u.ip6_u.daddr = rt0->addr; + } + if (sk->dst_cache) dst = dst_check(&sk->dst_cache, np->dst_cookie); @@ -1270,11 +1446,14 @@ static void tcp_v6_xmit(struct sk_buff *skb) return; } - ip6_dst_store(sk, dst); + ip6_dst_store(sk, dst, NULL); } skb->dst = dst_clone(dst); + /* Restore final destination back after routing done */ + fl.nl_u.ip6_u.daddr = &np->daddr; + ip6_xmit(sk, skb, &fl, np->opt); } @@ -1295,6 +1474,8 @@ static struct tcp_func ipv6_specific = { tcp_v6_conn_request, tcp_v6_syn_recv_sock, tcp_v6_get_sock, + sizeof(struct ipv6hdr), + ipv6_setsockopt, ipv6_getsockopt, v6_addr2sockaddr, @@ -1312,6 +1493,8 @@ static struct tcp_func ipv6_mapped = { tcp_v6_conn_request, tcp_v6_syn_recv_sock, tcp_v6_get_sock, + sizeof(struct iphdr), + ipv6_setsockopt, ipv6_getsockopt, v6_addr2sockaddr, @@ -1330,7 +1513,7 @@ static int tcp_v6_init_sock(struct sock *sk) tp->rto = TCP_TIMEOUT_INIT; /*TCP_WRITE_TIME*/ tp->mdev = TCP_TIMEOUT_INIT; - tp->in_mss = 536; + tp->mss_clamp = ~0; /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. @@ -1338,17 +1521,17 @@ static int tcp_v6_init_sock(struct sock *sk) tp->snd_cwnd = (1 << TCP_CWND_SHIFT); tp->snd_ssthresh = 0x7fffffff; - sk->priority = 1; sk->state = TCP_CLOSE; sk->max_ack_backlog = SOMAXCONN; - sk->mtu = 576; - sk->mss = 536; + tp->rcv_mss = 536; /* Init SYN queue. 
*/ tcp_synq_init(tp); sk->tp_pinfo.af_tcp.af_specific = &ipv6_specific; + sk->write_space = tcp_write_space; + return 0; } @@ -1376,12 +1559,6 @@ static int tcp_v6_destroy_sock(struct sock *sk) while((skb = __skb_dequeue(&tp->out_of_order_queue)) != NULL) kfree_skb(skb); - /* - * Release destination entry - */ - - dst_release(xchg(&sk->dst_cache,NULL)); - /* Clean up a locked TCP bind bucket, this only happens if a * port is allocated for a socket, but it never fully connects. * In which case we will find num to be non-zero and daddr to @@ -1390,7 +1567,7 @@ static int tcp_v6_destroy_sock(struct sock *sk) if(ipv6_addr_any(&(sk->net_pinfo.af_inet6.daddr)) && sk->num != 0) tcp_bucket_unlock(sk); - return 0; + return inet6_destroy_sock(sk); } struct proto tcpv6_prot = { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 2dac0570f..bfa701c97 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -7,7 +7,7 @@ * * Based on linux/ipv4/udp.c * - * $Id: udp.c,v 1.31 1998/07/15 05:05:45 davem Exp $ + * $Id: udp.c,v 1.33 1998/08/27 16:55:20 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -15,6 +15,7 @@ * 2 of the License, or (at your option) any later version. */ +#include <linux/config.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> @@ -59,6 +60,14 @@ static int udp_v6_verify_bind(struct sock *sk, unsigned short snum) if((sk2->num == snum) && (sk2 != sk)) { unsigned char state = sk2->state; int sk2_reuse = sk2->reuse; + + /* Two sockets can be bound to the same port if they're + * bound to different interfaces. + */ + + if(sk2->bound_dev_if != sk->bound_dev_if) + continue; + if(addr_type == IPV6_ADDR_ANY || (!sk2->rcv_saddr)) { if((!sk2_reuse) || (!sk_reuse) || @@ -139,7 +148,7 @@ static void udp_v6_rehash(struct sock *sk) } static struct sock *udp_v6_lookup(struct in6_addr *saddr, u16 sport, - struct in6_addr *daddr, u16 dport) + struct in6_addr *daddr, u16 dport, int dif) { struct sock *sk, *result = NULL; unsigned short hnum = ntohs(dport); @@ -166,7 +175,12 @@ static struct sock *udp_v6_lookup(struct in6_addr *saddr, u16 sport, continue; score++; } - if(score == 3) { + if(sk->bound_dev_if) { + if(sk->bound_dev_if != dif) + continue; + score++; + } + if(score == 4) { result = sk; break; } else if(score > badness) { @@ -257,20 +271,25 @@ ipv4_connected: */ fl.proto = IPPROTO_UDP; - fl.nl_u.ip6_u.daddr = daddr; + fl.nl_u.ip6_u.daddr = &np->daddr; fl.nl_u.ip6_u.saddr = NULL; fl.oif = sk->bound_dev_if; fl.uli_u.ports.dport = sk->dport; fl.uli_u.ports.sport = sk->sport; + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; + fl.nl_u.ip6_u.daddr = rt0->addr; + } + dst = ip6_route_output(sk, &fl); - + if (dst->error) { dst_release(dst); return dst->error; } - ip6_dst_store(sk, dst); + ip6_dst_store(sk, dst, fl.nl_u.ip6_u.daddr); /* get the source adddress used in the apropriate device */ @@ -291,15 +310,50 @@ ipv4_connected: static void udpv6_close(struct sock *sk, unsigned long timeout) { - lock_sock(sk); + /* See for explanation: raw_close in ipv4/raw.c */ sk->state = TCP_CLOSE; - ipv6_sock_mc_close(sk); udp_v6_unhash(sk); sk->dead = 1; - release_sock(sk); destroy_sock(sk); } +#ifdef CONFIG_FILTER +#undef CONFIG_UDP_DELAY_CSUM +#endif + +#ifdef CONFIG_UDP_DELAY_CSUM + +/* Please, read comments in net/checksum.h, asm/checksum.h + + I commented out csum_partial_copy_to_user there because it did not + verify_area. 
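udp_v6_lookup() above now takes the receiving interface into account and scores each candidate socket on how many of its bound attributes (local address, remote peer, bound device) match the packet, keeping the most specific hit. A compact user-space sketch of that best-match scoring over a toy socket table; all types and fields here are invented, and the remote address and port are folded into one field for brevity:

#include <stddef.h>
#include <string.h>

struct toy_sock {
        struct toy_sock *next;
        unsigned short  num;               /* bound local port, host order */
        const char      *rcv_saddr;        /* NULL means wildcard local address */
        const char      *daddr;            /* NULL means not connected */
        int             bound_dev_if;      /* 0 means any interface */
};

static struct toy_sock *toy_udp_lookup(struct toy_sock *head,
                                       unsigned short port,
                                       const char *laddr, const char *raddr,
                                       int dif)
{
        struct toy_sock *sk, *best = NULL;
        int badness = -1;

        for (sk = head; sk != NULL; sk = sk->next) {
                int score = 0;

                if (sk->num != port)
                        continue;
                if (sk->rcv_saddr != NULL) {
                        if (strcmp(sk->rcv_saddr, laddr) != 0)
                                continue;
                        score++;
                }
                if (sk->daddr != NULL) {
                        if (strcmp(sk->daddr, raddr) != 0)
                                continue;
                        score++;
                }
                if (sk->bound_dev_if != 0) {
                        if (sk->bound_dev_if != dif)
                                continue;
                        score++;
                }
                if (score > badness) {     /* keep the most specific match */
                        best = sk;
                        badness = score;
                }
        }
        return best;
}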
Now I am even wondered, how clever was I that time 8)8) + If I did not it, I would step into this hole again. --ANK + */ + +#ifndef _HAVE_ARCH_COPY_AND_CSUM_TO_USER +#if defined(__i386__) +static __inline__ +unsigned int csum_and_copy_to_user (const char *src, char *dst, + int len, int sum, int *err_ptr) +{ + int *src_err_ptr=NULL; + + if (verify_area(VERIFY_WRITE, dst, len) == 0) + return csum_partial_copy_generic(src, dst, len, sum, src_err_ptr, err_ptr); + + if (len) + *err_ptr = -EFAULT; + + return sum; +} +#elif defined(__sparc__) +#define csum_and_copy_to_user csum_partial_copy_to_user +#else +#undef CONFIG_UDP_DELAY_CSUM +#endif +#endif +#endif + /* * This should be easy, if there is something there we * return it, otherwise we block. @@ -322,12 +376,12 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, * From here the generic datagram does a lot of the work. Come * the finished NET3, it will do _ALL_ the work! */ - + skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) goto out; - copied = ntohs(((struct udphdr *)skb->h.raw)->len) - sizeof(struct udphdr); + copied = skb->len - sizeof(struct udphdr); if (copied > len) { copied = len; msg->msg_flags |= MSG_TRUNC; @@ -337,8 +391,41 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, * FIXME : should use udp header size info value */ +#ifndef CONFIG_UDP_DELAY_CSUM err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, copied); +#else + if (sk->no_check || skb->ip_summed==CHECKSUM_UNNECESSARY) { + err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, + copied); + } else if (copied > msg->msg_iov[0].iov_len || (msg->msg_flags&MSG_TRUNC)) { + if (csum_fold(csum_partial(skb->h.raw, ntohs(skb->h.uh->len), skb->csum))) { + /* Error for blocking case is chosen to masquerade + as some normal condition. + */ + err = (msg->msg_flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH; + udp_stats_in6.UdpInErrors++; + goto out_free; + } + err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, + copied); + } else { + unsigned int csum = csum_partial(skb->h.raw, sizeof(struct udphdr), skb->csum); + + err = 0; + csum = csum_and_copy_to_user((char*)&skb->h.uh[1], msg->msg_iov[0].iov_base, copied, csum, &err); + if (err) + goto out_free; + if (csum_fold(csum)) { + /* Error for blocking case is chosen to masquerade + as some normal condition. + */ + err = (msg->msg_flags&MSG_DONTWAIT) ? 
-EAGAIN : -EHOSTUNREACH; + udp_stats_in6.UdpInErrors++; + goto out_free; + } + } +#endif if (err) goto out_free; @@ -361,7 +448,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, memcpy(&sin6->sin6_addr, &skb->nh.ipv6h->saddr, sizeof(struct in6_addr)); - if (msg->msg_controllen) + if (sk->net_pinfo.af_inet6.rxopt.all) datagram_recv_ctl(sk, msg, skb); } } @@ -373,20 +460,27 @@ out: return err; } -void udpv6_err(struct sk_buff *skb, int type, int code, unsigned char *buff, __u32 info, - struct in6_addr *saddr, struct in6_addr *daddr, - struct inet6_protocol *protocol) +void udpv6_err(struct sk_buff *skb, struct ipv6hdr *hdr, + struct inet6_skb_parm *opt, + int type, int code, unsigned char *buff, __u32 info) { + struct device *dev = skb->dev; + struct in6_addr *saddr = &hdr->saddr; + struct in6_addr *daddr = &hdr->daddr; struct sock *sk; struct udphdr *uh; int err; - + + if (buff + sizeof(struct udphdr) > skb->tail) + return; + uh = (struct udphdr *) buff; - sk = udp_v6_lookup(daddr, uh->dest, saddr, uh->source); + sk = udp_v6_lookup(daddr, uh->dest, saddr, uh->source, dev->ifindex); if (sk == NULL) { - printk(KERN_DEBUG "icmp for unknown sock\n"); + if (net_ratelimit()) + printk(KERN_DEBUG "icmp for unknown sock\n"); return; } @@ -407,11 +501,10 @@ static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) if (sock_queue_rcv_skb(sk,skb)<0) { udp_stats_in6.UdpInErrors++; ipv6_statistics.Ip6InDiscards++; - ipv6_statistics.Ip6InDelivers--; - skb->sk = NULL; kfree_skb(skb); return 0; } + ipv6_statistics.Ip6InDelivers++; udp_stats_in6.UdpInDatagrams++; return 0; } @@ -430,7 +523,8 @@ static __inline__ int inet6_mc_check(struct sock *sk, struct in6_addr *addr) static struct sock *udp_v6_mcast_next(struct sock *sk, u16 loc_port, struct in6_addr *loc_addr, - u16 rmt_port, struct in6_addr *rmt_addr) + u16 rmt_port, struct in6_addr *rmt_addr, + int dif) { struct sock *s = sk; unsigned short num = ntohs(loc_port); @@ -446,6 +540,9 @@ static struct sock *udp_v6_mcast_next(struct sock *sk, ipv6_addr_cmp(&np->daddr, rmt_addr)) continue; + if (s->bound_dev_if && s->bound_dev_if != dif) + continue; + if(!ipv6_addr_any(&np->rcv_saddr)) { if(ipv6_addr_cmp(&np->rcv_saddr, loc_addr) == 0) return s; @@ -468,16 +565,18 @@ static void udpv6_mcast_deliver(struct udphdr *uh, { struct sock *sk, *sk2; struct sk_buff *buff; + int dif; sk = udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]; - sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr); + dif = skb->dev->ifindex; + sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); if (!sk) goto free_skb; buff = NULL; sk2 = sk; while((sk2 = udp_v6_mcast_next(sk2->next, uh->dest, saddr, - uh->source, daddr))) { + uh->source, daddr, dif))) { if (!buff) { buff = skb_clone(skb, GFP_ATOMIC); if (!buff) @@ -486,59 +585,70 @@ static void udpv6_mcast_deliver(struct udphdr *uh, if (sock_queue_rcv_skb(sk2, buff) >= 0) buff = NULL; } - if (buff) { - buff->sk = NULL; + if (buff) kfree_skb(buff); - } if (sock_queue_rcv_skb(sk, skb) < 0) { - free_skb: - skb->sk = NULL; +free_skb: kfree_skb(skb); } } -int udpv6_rcv(struct sk_buff *skb, struct device *dev, - struct in6_addr *saddr, struct in6_addr *daddr, - struct ipv6_options *opt, unsigned short len, - int redo, struct inet6_protocol *protocol) +int udpv6_rcv(struct sk_buff *skb, unsigned long len) { struct sock *sk; struct udphdr *uh; - int ulen; - - /* - * check if the address is ours... 
- * I believe that this is being done in IP layer - */ + struct device *dev = skb->dev; + struct in6_addr *saddr = &skb->nh.ipv6h->saddr; + struct in6_addr *daddr = &skb->nh.ipv6h->daddr; + u32 ulen; - uh = (struct udphdr *) skb->h.uh; - - ipv6_statistics.Ip6InDelivers++; + uh = skb->h.uh; + __skb_pull(skb, skb->h.raw - skb->data); ulen = ntohs(uh->len); - + + /* Check for jumbo payload */ + if (ulen == 0 && skb->nh.ipv6h->payload_len == 0) + ulen = len; + if (ulen > len || len < sizeof(*uh)) { - printk(KERN_DEBUG "UDP: short packet: %d/%d\n", ulen, len); + if (net_ratelimit()) + printk(KERN_DEBUG "UDP: short packet: %d/%ld\n", ulen, len); udp_stats_in6.UdpInErrors++; kfree_skb(skb); return(0); } if (uh->check == 0) { - printk(KERN_DEBUG "IPv6: udp checksum is 0\n"); + /* IPv6 draft-v2 section 8.1 says that we SHOULD log + this error. Well, it is reasonable. + */ + if (net_ratelimit()) + printk(KERN_INFO "IPv6: udp checksum is 0\n"); goto discard; } + skb_trim(skb, ulen); + +#ifndef CONFIG_UDP_DELAY_CSUM switch (skb->ip_summed) { case CHECKSUM_NONE: - skb->csum = csum_partial((char*)uh, len, 0); + skb->csum = csum_partial((char*)uh, ulen, 0); case CHECKSUM_HW: - if (csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, skb->csum)) { + if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) { printk(KERN_DEBUG "IPv6: udp checksum error\n"); goto discard; } }; - +#else + if (skb->ip_summed==CHECKSUM_HW) { + if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) + goto discard; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->csum = ~csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, 0); +#endif + len = ulen; /* @@ -555,10 +665,16 @@ int udpv6_rcv(struct sk_buff *skb, struct device *dev, * check socket cache ... must talk to Alan about his plans * for sock caches... i'll skip this for now. 
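udpv6_rcv() above treats a UDP length of zero together with a zero IPv6 payload length as the jumbo-payload case and falls back to the length actually delivered, while anything claiming more data than arrived, or shorter than a UDP header, is counted as an error and dropped. A minimal sketch of that length validation; lengths are plain host-order integers here, whereas the kernel works on the network-order header fields:

#include <stdint.h>

#define UDP_HDR_LEN 8

/* Return the usable datagram length, or 0 when the packet must be dropped. */
static uint32_t udp6_check_len(uint32_t udp_len, uint32_t ip6_payload_len,
                               uint32_t bytes_received)
{
        if (udp_len == 0 && ip6_payload_len == 0)
                udp_len = bytes_received;  /* jumbo payload: use what arrived */
        if (udp_len > bytes_received || bytes_received < UDP_HDR_LEN)
                return 0;                  /* short or inconsistent packet */
        return udp_len;
}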
*/ - - sk = udp_v6_lookup(saddr, uh->source, daddr, uh->dest); - + + sk = udp_v6_lookup(saddr, uh->source, daddr, uh->dest, dev->ifindex); + if (sk == NULL) { +#ifdef CONFIG_UDP_DELAY_CSUM + if (skb->ip_summed != CHECKSUM_UNNECESSARY && + csum_fold(csum_partial((char*)uh, len, skb->csum))) + goto discard; +#endif + udp_stats_in6.UdpNoPorts++; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, dev); @@ -566,16 +682,13 @@ int udpv6_rcv(struct sk_buff *skb, struct device *dev, kfree_skb(skb); return(0); } - + /* deliver */ - - if (sk->sock_readers) - __skb_queue_tail(&sk->back_log, skb); - else - udpv6_queue_rcv_skb(sk, skb); + + udpv6_queue_rcv_skb(sk, skb); return(0); - + discard: udp_stats_in6.UdpInErrors++; kfree_skb(skb); @@ -618,7 +731,7 @@ static int udpv6_getfrag(const void *data, struct in6_addr *addr, } if (csum_partial_copy_fromiovecend(dst, udh->iov, offset, - clen, &udh->wcheck)) + clen, &udh->wcheck)) return -EFAULT; if (final) { @@ -649,11 +762,11 @@ static int udpv6_getfrag(const void *data, struct in6_addr *addr, static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen) { - struct ipv6_options opt_space; + struct ipv6_txoptions opt_space; struct udpv6fakehdr udh; struct ipv6_pinfo *np = &sk->net_pinfo.af_inet6; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) msg->msg_name; - struct ipv6_options *opt = NULL; + struct ipv6_txoptions *opt = NULL; struct flowi fl; int addr_len = msg->msg_namelen; struct in6_addr *daddr; @@ -661,22 +774,18 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen) int len = ulen + sizeof(struct udphdr); int addr_type; int hlimit = -1; - + int err; /* Rough check on arithmetic overflow, better check is made in ip6_build_xmit - - When jumbo header will be implemeted we will change it - to something sort of (len will be size_t) - ulen > SIZE_T_MAX - sizeof(struct udphdr) - */ - if (ulen < 0 || ulen > 0xFFFF - sizeof(struct udphdr)) + */ + if (ulen < 0 || ulen > INT_MAX - sizeof(struct udphdr)) return -EMSGSIZE; - + if (msg->msg_flags & ~(MSG_DONTROUTE|MSG_DONTWAIT)) return(-EINVAL); - + if (sin6) { if (sin6->sin6_family == AF_INET) return udp_sendmsg(sk, msg, ulen); @@ -692,14 +801,6 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen) udh.uh.dest = sin6->sin6_port; daddr = &sin6->sin6_addr; - - /* BUGGGG! 
If route is not cloned, this check always - fails, hence dst_cache only slows down transmission --ANK - */ - if (sk->dst_cache && ipv6_addr_cmp(daddr, &np->daddr)) { - dst_release(sk->dst_cache); - sk->dst_cache = NULL; - } } else { if (sk->state != TCP_ESTABLISHED) return(-ENOTCONN); @@ -707,9 +808,9 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen) udh.uh.dest = sk->dport; daddr = &sk->net_pinfo.af_inet6.daddr; } - + addr_type = ipv6_addr_type(daddr); - + if (addr_type == IPV6_ADDR_MAPPED) { struct sockaddr_in sin; @@ -720,24 +821,25 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen) return udp_sendmsg(sk, msg, ulen); } - + udh.daddr = NULL; fl.oif = sk->bound_dev_if; if (msg->msg_controllen) { opt = &opt_space; - memset(opt, 0, sizeof(struct ipv6_options)); + memset(opt, 0, sizeof(struct ipv6_txoptions)); err = datagram_send_ctl(msg, &fl.oif, &saddr, opt, &hlimit); if (err < 0) return err; - - if (opt->srcrt) - udh.daddr = daddr; } - + if (opt == NULL || !(opt->opt_nflen|opt->opt_flen)) + opt = np->opt; + if (opt && opt->srcrt) + udh.daddr = daddr; + udh.uh.source = sk->sport; - udh.uh.len = htons(len); + udh.uh.len = len < 0x1000 ? htons(len) : 0; udh.uh.check = 0; udh.iov = msg->msg_iov; udh.wcheck = 0; @@ -783,7 +885,7 @@ struct proto udpv6_prot = { datagram_poll, /* poll */ udp_ioctl, /* ioctl */ NULL, /* init */ - NULL, /* destroy */ + inet6_destroy_sock, /* destroy */ NULL, /* shutdown */ ipv6_setsockopt, /* setsockopt */ ipv6_getsockopt, /* getsockopt */ |