diff options
author | Ralf Baechle <ralf@linux-mips.org> | 1997-12-16 05:34:03 +0000 |
---|---|---|
committer | Ralf Baechle <ralf@linux-mips.org> | 1997-12-16 05:34:03 +0000 |
commit | 967c65a99059fd459b956c1588ce0ba227912c4e (patch) | |
tree | 8224d013ff5d255420713d05610c7efebd204d2a /net/ipv4/route.c | |
parent | e20c1cc1656a66a2773bca4591a895cbc12696ff (diff) |
Merge with Linux 2.1.72, part 1.
Diffstat (limited to 'net/ipv4/route.c')
-rw-r--r-- | net/ipv4/route.c | 1206 |
1 files changed, 672 insertions, 534 deletions
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b55fb7666..046c60beb 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -5,7 +5,7 @@ * * ROUTE - implementation of the IP router. * - * Version: @(#)route.c 1.0.14 05/31/93 + * Version: $Id: route.c,v 1.33 1997/10/24 17:16:08 kuznet Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -68,27 +68,27 @@ #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> -#include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/init.h> -#include <net/ip.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/inetdevice.h> +#include <linux/igmp.h> +#include <linux/pkt_sched.h> +#include <linux/mroute.h> #include <net/protocol.h> +#include <net/ip.h> #include <net/route.h> +#include <net/sock.h> +#include <net/ip_fib.h> #include <net/arp.h> #include <net/tcp.h> -#include <linux/skbuff.h> -#include <net/sock.h> #include <net/icmp.h> -#include <linux/net_alias.h> - -/* Compile time configuretion flags */ -#define CONFIG_IP_LOCAL_RT_POLICY 1 +#define RTprint(a...) printk(KERN_DEBUG a) -static void rt_run_flush(unsigned long); - static struct timer_list rt_flush_timer = - { NULL, NULL, RT_FLUSH_DELAY, 0L, rt_run_flush }; + { NULL, NULL, RT_FLUSH_DELAY, 0L, NULL }; /* * Interface to generic destination cache. @@ -108,6 +108,24 @@ struct dst_ops ipv4_dst_ops = ipv4_dst_destroy }; +__u8 ip_tos2prio[16] = { + TC_PRIO_FILLER, + TC_PRIO_BESTEFFORT, + TC_PRIO_FILLER, + TC_PRIO_FILLER, + TC_PRIO_BULK, + TC_PRIO_FILLER, + TC_PRIO_BULK, + TC_PRIO_FILLER, + TC_PRIO_INTERACTIVE, + TC_PRIO_FILLER, + TC_PRIO_INTERACTIVE, + TC_PRIO_FILLER, + TC_PRIO_INTERACTIVE_BULK, + TC_PRIO_FILLER, + TC_PRIO_INTERACTIVE_BULK, + TC_PRIO_FILLER +}; /* * Route cache. @@ -162,8 +180,10 @@ static int rt_cache_get_info(char *buffer, char **start, off_t offset, int lengt r->u.dst.dev ? r->u.dst.dev->name : "*", (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway, - r->rt_flags, atomic_read(&r->u.dst.refcnt), - atomic_read(&r->u.dst.use), 0, + r->rt_flags, + atomic_read(&r->u.dst.use), + atomic_read(&r->u.dst.refcnt), + 0, (unsigned long)r->rt_src, (int)r->u.dst.pmtu, r->u.dst.window, (int)r->u.dst.rtt, r->key.tos, @@ -202,8 +222,6 @@ void ip_rt_check_expire() struct rtable *rth, **rthp; unsigned long now = jiffies; - start_bh_atomic(); - for (i=0; i<RT_HASH_DIVISOR/5; i++) { rover = (rover + 1) & (RT_HASH_DIVISOR-1); rthp = &rt_hash_table[rover]; @@ -229,61 +247,24 @@ void ip_rt_check_expire() if (!rth_next) break; - /* - * Pseudo-LRU ordering. - * Really we should teach it to move - * rarely used but permanently living entries - * (f.e. rdisc, igmp etc.) to the end of list. - */ - if ( rth_next->u.dst.lastuse - rth->u.dst.lastuse > RT_CACHE_BUBBLE_THRESHOLD || (rth->u.dst.lastuse - rth_next->u.dst.lastuse < 0 && - atomic_read(&rth->u.dst.use) < atomic_read(&rth_next->u.dst.use))) { + atomic_read(&rth->u.dst.refcnt) < atomic_read(&rth_next->u.dst.refcnt))) { #if RT_CACHE_DEBUG >= 2 printk("rt_check_expire bubbled %02x@%08x<->%08x\n", rover, rth->rt_dst, rth_next->rt_dst); #endif *rthp = rth_next; rth->u.rt_next = rth_next->u.rt_next; rth_next->u.rt_next = rth; - sti(); rthp = &rth_next->u.rt_next; continue; } rthp = &rth->u.rt_next; } } - - end_bh_atomic(); -} - - -void rt_cache_flush(int how) -{ - start_bh_atomic(); - if (rt_flush_timer.expires) { - if (jiffies - rt_flush_timer.expires > 0 || - rt_flush_timer.expires - jiffies > RT_FLUSH_DELAY/2) - how = 1; - } - if (how) { - if (rt_flush_timer.expires) - del_timer(&rt_flush_timer); - rt_flush_timer.expires = 0; - end_bh_atomic(); - rt_run_flush(0); - return; - } - if (rt_flush_timer.expires) { - end_bh_atomic(); - return; - } - del_timer(&rt_flush_timer); - rt_flush_timer.expires = jiffies + RT_FLUSH_DELAY; - add_timer(&rt_flush_timer); - end_bh_atomic(); } - -void rt_run_flush(unsigned long dummy) + +static void rt_run_flush(unsigned long dummy) { int i; struct rtable * rth, * next; @@ -313,6 +294,30 @@ void rt_run_flush(unsigned long dummy) #endif } } + +void rt_cache_flush(int delay) +{ + start_bh_atomic(); + if (delay && rt_flush_timer.function && + rt_flush_timer.expires - jiffies < delay) { + end_bh_atomic(); + return; + } + if (rt_flush_timer.function) { + del_timer(&rt_flush_timer); + rt_flush_timer.function = NULL; + } + if (delay == 0) { + end_bh_atomic(); + rt_run_flush(0); + return; + } + rt_flush_timer.function = rt_run_flush; + rt_flush_timer.expires = jiffies + delay; + add_timer(&rt_flush_timer); + end_bh_atomic(); +} + static void rt_garbage_collect(void) { @@ -327,7 +332,7 @@ static void rt_garbage_collect(void) /* * Garbage collection is pretty expensive, - * do not make it too frequently. + * do not make it too frequently, but just increase expire strength. */ if (now - last_gc < 1*HZ) { expire >>= 1; @@ -342,7 +347,7 @@ static void rt_garbage_collect(void) continue; for (rthp=&rt_hash_table[i]; (rth=*rthp); rthp=&rth->u.rt_next) { if (atomic_read(&rth->u.dst.use) || - (now - rth->u.dst.lastuse > expire)) + now - rth->u.dst.lastuse < expire) continue; atomic_dec(&rt_cache_size); *rthp = rth->u.rt_next; @@ -465,115 +470,94 @@ static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt, u16 prot void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, u32 saddr, u8 tos, struct device *dev) { - int i; - int off_link = 0; - struct fib_info *fi; + int i, k; + struct in_device *in_dev = dev->ip_ptr; struct rtable *rth, **rthp; - u32 skeys[2] = { saddr, 0, }; - struct device *pdev = net_alias_main_dev(dev); + u32 skeys[2] = { saddr, 0 }; + int ikeys[2] = { dev->ifindex, 0 }; tos &= IPTOS_TOS_MASK; - if (new_gw == old_gw || !ipv4_config.accept_redirects + if (!in_dev || new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw)) goto reject_redirect; - if ((new_gw^dev->pa_addr)&dev->pa_mask) - off_link = 1; - - if (!ipv4_config.rfc1620_redirects) { - if (off_link) + if (!IN_DEV_SHARED_MEDIA(in_dev)) { + if (ip_fib_check_default(new_gw, dev)) goto reject_redirect; - if (ipv4_config.secure_redirects && ip_fib_chk_default_gw(new_gw, dev)) + } else { + if (inet_addr_type(new_gw) != RTN_UNICAST) goto reject_redirect; } - fi = fib_lookup_info(new_gw, 0, 0, &loopback_dev, NULL); - if (fi == NULL || fi->fib_flags&(RTF_LOCAL|RTF_BROADCAST|RTF_NAT)) - goto reject_redirect; - for (i=0; i<2; i++) { - unsigned hash = rt_hash_code(daddr, skeys[i], tos); + for (k=0; k<2; k++) { + unsigned hash = rt_hash_code(daddr, skeys[i]^(ikeys[k]<<5), tos); - rthp=&rt_hash_table[hash]; + rthp=&rt_hash_table[hash]; - while ( (rth = *rthp) != NULL) { - struct rtable *rt; + while ( (rth = *rthp) != NULL) { + struct rtable *rt; - if (rth->key.dst != daddr || - rth->key.src != skeys[i] || - rth->key.tos != tos || - rth->key.dst_dev != NULL || - rth->key.src_dev != NULL) { - rthp = &rth->u.rt_next; - continue; - } + if (rth->key.dst != daddr || + rth->key.src != skeys[i] || + rth->key.tos != tos || + rth->key.oif != ikeys[k] || + rth->key.iif != 0) { + rthp = &rth->u.rt_next; + continue; + } - if (rth->rt_dst != daddr || - rth->rt_src != saddr || - rth->rt_flags&RTF_REJECT || - rth->rt_gateway != old_gw || - rth->u.dst.dev != dev) - break; + if (rth->rt_dst != daddr || + rth->rt_src != saddr || + rth->u.dst.error || + rth->rt_gateway != old_gw || + rth->u.dst.dev != dev) + break; - rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); - if (rt == NULL) - return; + rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); + if (rt == NULL) + return; - /* - * Copy all the information. - */ - atomic_set(&rt->u.dst.refcnt, 1); - rt->u.dst.dev = dev; - rt->u.dst.input = rth->u.dst.input; - rt->u.dst.output = rth->u.dst.output; - rt->u.dst.pmtu = dev->mtu; - rt->u.dst.rtt = TCP_TIMEOUT_INIT; - rt->u.dst.window = 0; - atomic_set(&rt->u.dst.use, 1); - rt->u.dst.lastuse = jiffies; - - rt->rt_flags = rth->rt_flags|RTF_DYNAMIC|RTF_MODIFIED; - rt->rt_flags &= ~RTF_GATEWAY; - if (new_gw != daddr) - rt->rt_flags |= RTF_GATEWAY; - - rt->rt_src = rth->rt_src; - rt->rt_dst = rth->rt_dst; - rt->rt_src_dev = rth->rt_src_dev; - rt->rt_spec_dst = rth->rt_spec_dst; - rt->key = rth->key; - - /* But gateway is different ... */ - rt->rt_gateway = new_gw; - - if (off_link) { - if (fi->fib_dev != dev && - net_alias_main_dev(fi->fib_dev) == pdev) - rt->u.dst.dev = fi->fib_dev; - } + /* + * Copy all the information. + */ + *rt = *rth; + atomic_set(&rt->u.dst.refcnt, 1); + atomic_set(&rt->u.dst.use, 1); + rt->u.dst.lastuse = jiffies; + rt->u.dst.neighbour = NULL; + rt->u.dst.hh = NULL; + + rt->rt_flags |= RTCF_REDIRECTED; + + /* Gateway is different ... */ + rt->rt_gateway = new_gw; + + if (!rt_ll_bind(rt)) { + ip_rt_put(rt); + rt_free(rt); + break; + } - if (ipv4_config.rfc1620_redirects && !rt_ll_bind(rt)) { + *rthp = rth->u.rt_next; + rt_free(rth); + rt = rt_intern_hash(hash, rt, ETH_P_IP); ip_rt_put(rt); - rt_free(rt); break; } - - *rthp = rth->u.rt_next; - rt_free(rth); - rt = rt_intern_hash(hash, rt, ETH_P_IP); - ip_rt_put(rt); - break; } } return; reject_redirect: +#ifdef CONFIG_IP_ROUTE_VERBOSE if (ipv4_config.log_martians && net_ratelimit()) printk(KERN_INFO "Redirect from %lX/%s to %lX ignored." "Path = %lX -> %lX, tos %02x\n", ntohl(old_gw), dev->name, ntohl(new_gw), ntohl(saddr), ntohl(daddr), tos); +#endif } @@ -585,7 +569,7 @@ void ip_rt_advice(struct rtable **rp, int advice) return; start_bh_atomic(); - if ((rt = *rp) != NULL && (rt->rt_flags&(RTF_DYNAMIC|RTF_MODIFIED))) { + if ((rt = *rp) != NULL && (rt->rt_flags&RTCF_REDIRECTED)) { #if RT_CACHE_DEBUG >= 1 printk(KERN_DEBUG "ip_rt_advice: redirect to %08x/%02x dropped\n", rt->rt_dst, rt->key.tos); #endif @@ -602,7 +586,7 @@ void ip_rt_advice(struct rtable **rp, int advice) * 1. The first RT_REDIRECT_NUMBER redirects are sent * with exponential backoff, then we stop sending them at all, * assuming that the host ignores our redirects. - * 2. If we did not see a packets requiring redirects + * 2. If we did not see packets requiring redirects * during RT_REDIRECT_SILENCE, we assume that the host * forgot redirected route and start to send redirects again. * @@ -637,9 +621,12 @@ void ip_rt_send_redirect(struct sk_buff *skb) if (jiffies - rt->last_error > (RT_REDIRECT_LOAD<<rt->errors)) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); rt->last_error = jiffies; - if (ipv4_config.log_martians && ++rt->errors == RT_REDIRECT_NUMBER && net_ratelimit()) - printk(KERN_WARNING "host %08x/%s ignores redirects for %08x to %08x.\n", - rt->rt_src, rt->rt_src_dev->name, rt->rt_dst, rt->rt_gateway); + ++rt->errors; +#ifdef CONFIG_IP_ROUTE_VERBOSE + if (ipv4_config.log_martians && rt->errors == RT_REDIRECT_NUMBER && net_ratelimit()) + printk(KERN_WARNING "host %08x/if%d ignores redirects for %08x to %08x.\n", + rt->rt_src, rt->rt_iif, rt->rt_dst, rt->rt_gateway); +#endif } } @@ -653,6 +640,9 @@ static int ip_error(struct sk_buff *skb) default: kfree_skb(skb, FREE_READ); return 0; + case EHOSTUNREACH: + code = ICMP_HOST_UNREACH; + break; case ENETUNREACH: code = ICMP_NET_UNREACH; break; @@ -668,37 +658,24 @@ static int ip_error(struct sk_buff *skb) return 0; } +/* + * The last two values are not from the RFC but + * are needed for AMPRnet AX.25 paths. + */ + +static unsigned short mtu_plateau[] = +{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; static __inline__ unsigned short guess_mtu(unsigned short old_mtu) { - if (old_mtu > 32000) - return 32000; - else if (old_mtu > 17914) - return 17914; - else if (old_mtu > 8166) - return 8166; - else if (old_mtu > 4352) - return 4352; - else if (old_mtu > 2002) - return 2002; - else if (old_mtu > 1492) - return 1492; - else if (old_mtu > 576) - return 576; - else if (old_mtu > 296) - return 296; - /* - * These two are not from the RFC but - * are needed for AMPRnet AX.25 paths. - */ - else if (old_mtu > 216) - return 216; - else if (old_mtu > 128) - return 128; + int i; + + for (i = 0; i < sizeof(mtu_plateau)/sizeof(mtu_plateau[0]); i++) + if (old_mtu > mtu_plateau[i]) + return mtu_plateau[i]; return 68; } - unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) { int i; @@ -721,8 +698,8 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) rth->rt_dst == daddr && rth->rt_src == iph->saddr && rth->key.tos == tos && - !rth->key.src_dev && - !(rth->rt_flags&RTF_NOPMTUDISC)) { + rth->key.iif == 0 && + !(rth->rt_flags&RTCF_NOPMTUDISC)) { unsigned short mtu = new_mtu; if (new_mtu < 68 || new_mtu >= old_mtu) { @@ -770,177 +747,227 @@ static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst, return NULL; } -int -ip_check_mc(struct device *dev, u32 mc_addr) +static int ip_rt_bug(struct sk_buff *skb) { - struct ip_mc_list *ip_mc; + printk(KERN_DEBUG "ip_rt_bug: %08x -> %08x, %s\n", skb->nh.iph->saddr, + skb->nh.iph->daddr, skb->dev ? skb->dev->name : "?"); + kfree_skb(skb, FREE_WRITE); + return 0; +} - if (mc_addr==htonl(INADDR_ALLHOSTS_GROUP)) - return 1; +/* + We do not cache source address of outgoing interface, + because it is used only by IP RR, TS and SRR options, + so that it out of fast path. - for (ip_mc=dev->ip_mc_list; ip_mc; ip_mc=ip_mc->next) - if (ip_mc->multiaddr == mc_addr) - return 1; - return 0; + BTW remember: "addr" is allowed to be not aligned + in IP options! + */ + +void ip_rt_get_source(u8 *addr, struct rtable *rt) +{ + u32 src; + struct fib_result res; + + if (rt->key.iif == 0) { + memcpy(addr, &rt->rt_src, 4); + return; + } + if (fib_lookup(&rt->key, &res) == 0) { + src = FIB_RES_PREFSRC(res); + memcpy(addr, &src, 4); + return; + } + src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); + memcpy(addr, &src, 4); } -static int ip_rt_bug(struct sk_buff *skb) +static int +ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, + u8 tos, struct device *dev, int our) { - kfree_skb(skb, FREE_WRITE); - printk(KERN_DEBUG "ip_rt_bug: %08x -> %08x, %s\n", skb->nh.iph->saddr, - skb->nh.iph->daddr, skb->dev ? skb->dev->name : "?"); + unsigned hash; + struct rtable *rth; + u32 spec_dst; + struct in_device *in_dev = dev->ip_ptr; + + /* Primary sanity checks. */ + + if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) || + in_dev == NULL || skb->protocol != __constant_htons(ETH_P_IP)) + return -EINVAL; + + if (ZERONET(saddr)) { + if (!LOCAL_MCAST(daddr)) + return -EINVAL; + spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); + } else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst) < 0) + return -EINVAL; + + rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); + if (!rth) + return -ENOBUFS; + + rth->u.dst.output= ip_rt_bug; + + atomic_set(&rth->u.dst.use, 1); + rth->key.dst = daddr; + rth->rt_dst = daddr; + rth->key.tos = tos; + rth->key.src = saddr; + rth->rt_src = saddr; +#ifdef CONFIG_IP_ROUTE_NAT + rth->rt_dst_map = daddr; + rth->rt_src_map = saddr; +#endif + rth->rt_iif = + rth->key.iif = dev->ifindex; + rth->u.dst.dev = &loopback_dev; + rth->key.oif = 0; + rth->rt_gateway = daddr; + rth->rt_spec_dst= spec_dst; + rth->rt_type = RTN_MULTICAST; + rth->rt_flags = RTCF_MULTICAST; + if (our) { + rth->u.dst.input= ip_local_deliver; + rth->rt_flags |= RTCF_LOCAL; + } + +#ifdef CONFIG_IP_MROUTE + if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) + rth->u.dst.input = ip_mr_input; +#endif + + hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos); + skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, 0); return 0; } /* - * This function is called ONLY FROM NET BH. No locking! - * * NOTE. We drop all the packets that has local source * addresses, because every properly looped back packet * must have correct destination already attached by output routine. * * Such approach solves two big problems: - * 1. Not simplex devices (if they exist 8)) are handled properly. + * 1. Not simplex devices are handled properly. * 2. IP spoofing attempts are filtered with 100% of guarantee. */ int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, - u8 tos, struct device *pdev) + u8 tos, struct device *dev) { - struct device * dev = pdev; - struct fib_info *fi = NULL; - struct fib_info *src_fi = NULL; + struct rt_key key; + struct fib_result res; + struct in_device *in_dev = dev->ip_ptr; + struct in_device *out_dev; unsigned flags = 0; - struct device *devout; struct rtable * rth; unsigned hash; - struct fib_result res; - u32 src_key = saddr; - u32 dst_key = daddr; - int err = -EINVAL; - int log = 0; + u32 spec_dst; + int err = -EINVAL; + + /* + * IP on this device is disabled. + */ - hash = rt_hash_code(daddr, saddr^(unsigned long)pdev, tos); + if (!in_dev) + return -EINVAL; - /* Check for martians... */ + key.dst = daddr; + key.src = saddr; + key.tos = tos; + key.iif = dev->ifindex; + key.oif = 0; + key.scope = RT_SCOPE_UNIVERSE; + + hash = rt_hash_code(daddr, saddr^(key.iif<<5), tos); + + /* Check for the most weird martians, which can be not detected + by fib_lookup. + */ if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr)) goto martian_source; - if (MULTICAST(daddr) || daddr == 0xFFFFFFFF) - goto mc_input; - /* Accept zero addresses only to limited broadcast/multicasts; - * I even do not know to fix it or not. + if (daddr == 0xFFFFFFFF) + goto brd_input; + + /* Accept zero addresses only to limited broadcast; + * I even do not know to fix it or not. Waiting for complains :-) */ if (ZERONET(saddr)) goto martian_source; + if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr)) goto martian_destination; /* - * Device is not yet initialized, accept all addresses as ours. + * Now we are ready to route packet. */ - if (ZERONET(dev->pa_addr)) - goto promisc_ip; - - /* - * Now we are able to route packet. - */ - if ((err = fib_lookup(&res, daddr, saddr, tos, pdev, NULL)) < 0) { - if (!IS_ROUTER) + if ((err = fib_lookup(&key, &res))) { + if (!IN_DEV_FORWARD(in_dev)) return -EINVAL; goto no_route; } - fi = res.f->fib_info; - flags = fi->fib_flags; - devout = fi->fib_dev; - - if (flags&RTF_NAT) { - daddr = htonl((ntohl(daddr)&((1<<res.fm)-1)))|fi->fib_gateway; - fi = fib_lookup_info(daddr, saddr, tos, pdev, NULL); - if (!fi || fi->fib_flags&(RTF_NAT|RTF_LOCAL|RTF_MULTICAST|RTF_BROADCAST)) - return -EINVAL; - devout = fi->fib_dev; - flags = fi->fib_flags|RTCF_NAT|RTF_NAT; - } +#ifdef CONFIG_IP_ROUTE_NAT + /* Policy is applied before mapping destination, + but rerouting after map should be made with old source. + */ - switch (res.fr->cl_action) { - case RTP_NAT: - /* Packet is from translated source; remember it */ - saddr = (saddr&~res.fr->cl_srcmask)|res.fr->cl_srcmap; - flags |= RTCF_NAT; - break; - case RTP_MASQUERADE: - /* Packet is from masqueraded source; remember it */ - flags |= RTCF_MASQ; - break; - default: - } - log = res.fr->cl_flags&RTRF_LOG; + if (1) { + u32 src_map = saddr; + if (res.r) + src_map = fib_rules_policy(saddr, &res, &flags); - if (!(flags & RTF_LOCAL)) { - if (!IS_ROUTER || flags&RTF_NOFORWARD) - return -EINVAL; - } else { - fi = NULL; - devout = &loopback_dev; - if (flags&RTF_BROADCAST) - goto mc_input; + if (res.type == RTN_NAT) { + key.dst = fib_rules_map_destination(daddr, &res); + if (fib_lookup(&key, &res) || res.type != RTN_UNICAST) + return -EINVAL; + flags |= RTCF_DNAT; + } + key.src = src_map; } - -#ifndef CONFIG_IP_LOCAL_RT_POLICY - if (flags&RTF_LOCAL) - src_fi = fib_lookup_info(src_key, 0, tos, &loopback_dev, NULL); - else #endif - if (fib_lookup(&res, src_key, daddr, tos, net_alias_main_dev(devout), NULL) == 0) { - src_fi = res.f->fib_info; - /* Destination is on masqueraded network: - * if it is real incoming frame, ip_forward will drop it. - */ - if (res.fr->cl_flags&RTRF_VALVE) - flags |= RTCF_VALVE; - } - if (src_fi) { - if (src_fi->fib_flags&(RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST|RTF_NAT)) + if (res.type == RTN_BROADCAST) + goto brd_input; + + if (res.type == RTN_LOCAL) { + spec_dst = daddr; + if (inet_addr_type(saddr) != RTN_UNICAST) goto martian_source; + goto local_input; + } - if (!(src_fi->fib_flags&RTF_GATEWAY)) - flags |= RTCF_DIRECTSRC; + if (!IN_DEV_FORWARD(in_dev)) + return -EINVAL; + if (res.type != RTN_UNICAST) + goto martian_destination; - if (net_alias_main_dev(src_fi->fib_dev) == pdev) - skb->dev = dev = src_fi->fib_dev; - else { - /* Route to packet source goes via - different interface; rfc1812 proposes - to drop them. - It is dangerous on not-stub/transit networks - because of path asymmetry. - */ - if (ipv4_config.rfc1812_filter >= 2) - goto martian_source; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (res.fi->fib_nhs > 1 && key.oif == 0) + fib_select_multipath(&key, &res); +#endif + out_dev = FIB_RES_DEV(res)->ip_ptr; - /* Weaker form of rfc1812 filtering. - If source is on directly connected network, - it can mean either local network configuration error - (the most probable case) or real IP spoofing attempt. - */ - if (ipv4_config.rfc1812_filter >= 1 && !(flags&RTCF_DIRECTSRC)) - goto martian_source; - } - } else if (ipv4_config.rfc1812_filter >= 1) + err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst); + if (err < 0) goto martian_source; -make_route: + if (err) + flags |= RTCF_DIRECTSRC; + + if (out_dev == in_dev && err && !(flags&RTCF_NAT) && + (IN_DEV_SHARED_MEDIA(out_dev) + || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res)))) + flags |= RTCF_DOREDIRECT; + if (skb->protocol != __constant_htons(ETH_P_IP)) { - /* ARP request. Do not make route for invalid destination or - * if it is redirected. + /* Not IP (i.e. ARP). Do not make route for invalid + * destination or if it is redirected. */ - if (flags&(RTF_REJECT|RTF_BROADCAST|RTF_MULTICAST) || - skb->pkt_type == PACKET_OTHERHOST || - (devout == dev && !(flags&(RTF_LOCAL|RTCF_NAT)))) + if (out_dev == in_dev && flags&RTCF_DOREDIRECT) return -EINVAL; } @@ -948,147 +975,105 @@ make_route: if (!rth) return -ENOBUFS; - rth->u.dst.output= ip_rt_bug; - atomic_set(&rth->u.dst.use, 1); - rth->key.dst = dst_key; - rth->rt_dst = dst_key; - rth->rt_dst_map = daddr; + rth->key.dst = daddr; + rth->rt_dst = daddr; rth->key.tos = tos; - rth->key.src = src_key; - rth->rt_src = src_key; - rth->rt_src_map = saddr; - rth->rt_src_dev = dev; - rth->key.src_dev= pdev; - rth->u.dst.dev = devout; - rth->key.dst_dev= NULL; + rth->key.src = saddr; + rth->rt_src = saddr; rth->rt_gateway = daddr; - rth->rt_spec_dst= daddr; - - if (!(flags&RTF_REJECT)) { - if (flags&RTF_LOCAL) - rth->u.dst.input= ip_local_deliver; - if (!(flags&(RTF_NOFORWARD|RTF_BROADCAST))) { - if (flags&RTF_MULTICAST) { -#ifdef CONFIG_IP_MROUTE - if (!LOCAL_MCAST(daddr) && ipv4_config.multicast_route) { - rth->u.dst.input = ip_mr_input; - rth->u.dst.output = ip_output; - } +#ifdef CONFIG_IP_ROUTE_NAT + rth->rt_src_map = key.src; + rth->rt_dst_map = key.dst; + if (flags&RTCF_DNAT) + rth->rt_gateway = key.dst; #endif - } else if (!(flags&RTF_LOCAL)) { - rth->u.dst.input = ip_forward; - rth->u.dst.output = ip_output; - } - } - } else if (IS_ROUTER && !(flags&(RTF_MULTICAST|RTF_BROADCAST))) { - rth->u.dst.input= ip_error; - rth->u.dst.error= -err; - } - - if ((flags&(RTF_BROADCAST|RTF_MULTICAST)) || !(flags&RTF_LOCAL)) - rth->rt_spec_dst= dev->pa_addr; + rth->rt_iif = + rth->key.iif = dev->ifindex; + rth->u.dst.dev = out_dev->dev; + rth->key.oif = 0; + rth->rt_spec_dst= spec_dst; - if (fi) { - rth->u.dst.pmtu = fi->fib_mtu; - rth->u.dst.window=fi->fib_window; - rth->u.dst.rtt = fi->fib_irtt; - if (flags & RTF_GATEWAY) - rth->rt_gateway = fi->fib_gateway; - } else { - rth->u.dst.pmtu = devout->mtu; - rth->u.dst.window=0; - rth->u.dst.rtt = TCP_TIMEOUT_INIT; - } + rth->u.dst.input = ip_forward; + rth->u.dst.output = ip_output; - if (!(flags&(RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST|RTCF_NAT)) && - flags&RTCF_DIRECTSRC && - (devout == dev || (ipv4_config.rfc1620_redirects && - net_alias_main_dev(devout) == pdev))) - flags |= RTCF_DOREDIRECT; + rth->u.dst.pmtu = res.fi->fib_mtu ? : out_dev->dev->mtu; + rth->u.dst.window=res.fi->fib_window ? : 0; + rth->u.dst.rtt = res.fi->fib_rtt ? : TCP_TIMEOUT_INIT; + if (FIB_RES_GW(res) && FIB_RES_NH(res).nh_scope == RT_SCOPE_LINK) + rth->rt_gateway = FIB_RES_GW(res); rth->rt_flags = flags; + rth->rt_type = res.type; - if (log) - printk(KERN_INFO "installing route %08lX -> %08lX\n", ntohl(rth->rt_src), ntohl(rth->rt_dst)); - - if (flags&(RTF_LOCAL|RTF_MULTICAST|RTF_BROADCAST|RTF_REJECT)) { - skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, 0); - return 0; - } - skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, __constant_ntohs(skb->protocol)); + skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, ntohs(skb->protocol)); return 0; -mc_input: +brd_input: if (skb->protocol != __constant_htons(ETH_P_IP)) return -EINVAL; if (ZERONET(saddr)) { - if (!ipv4_config.bootp_agent) - goto martian_source; - flags |= RTF_NOFORWARD|RTF_LOCAL; + spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); } else { - src_fi = fib_lookup_info(saddr, 0, tos, &loopback_dev, NULL); - if (!src_fi) - goto martian_source; - - if (src_fi->fib_flags&(RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST|RTF_NAT)) + err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst); + if (err < 0) goto martian_source; - - if (!(src_fi->fib_flags&RTF_GATEWAY)) + if (err) flags |= RTCF_DIRECTSRC; - - if (!MULTICAST(daddr) || !ipv4_config.multicast_route || - LOCAL_MCAST(daddr)) { - if (net_alias_main_dev(src_fi->fib_dev) == pdev) { - skb->dev = dev = src_fi->fib_dev; - } else { - /* Fascist not-unicast filtering 8) */ - goto martian_source; - } - } - } - - if (!MULTICAST(daddr)) { - flags |= RTF_LOCAL|RTF_BROADCAST|RTF_NOFORWARD; - devout = dev; - goto make_route; } + flags |= RTCF_BROADCAST; - flags |= RTF_MULTICAST|RTF_LOCAL; +local_input: + rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); + if (!rth) + return -ENOBUFS; - if (ip_check_mc(dev, daddr) == 0) { - flags &= ~RTF_LOCAL; + rth->u.dst.output= ip_rt_bug; - if (!ipv4_config.multicast_route || !(dev->flags&IFF_ALLMULTI)) - goto no_route; + atomic_set(&rth->u.dst.use, 1); + rth->key.dst = daddr; + rth->rt_dst = daddr; + rth->key.tos = tos; + rth->key.src = saddr; + rth->rt_src = saddr; +#ifdef CONFIG_IP_ROUTE_NAT + rth->rt_dst_map = key.dst; + rth->rt_src_map = key.src; +#endif + rth->rt_iif = + rth->key.iif = dev->ifindex; + rth->u.dst.dev = &loopback_dev; + rth->key.oif = 0; + rth->rt_gateway = daddr; + rth->rt_spec_dst= spec_dst; + rth->u.dst.input= ip_local_deliver; + if (res.type == RTN_UNREACHABLE) { + rth->u.dst.input= ip_error; + rth->u.dst.error= err; } - devout = dev; - goto make_route; - -promisc_ip: - flags |= RTF_LOCAL|RTF_NOFORWARD; - if (MULTICAST(daddr)) - flags |= RTF_MULTICAST; - else - flags |= RTF_BROADCAST; - devout = dev; - goto make_route; + rth->rt_flags = flags|RTCF_LOCAL; + rth->rt_type = res.type; + skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, 0); + return 0; no_route: - flags |= RTF_REJECT; - devout = dev; - goto make_route; + spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + res.type = RTN_UNREACHABLE; + goto local_input; /* * Do not cache martian addresses: they should be logged (RFC1812) */ martian_destination: +#ifdef CONFIG_IP_ROUTE_VERBOSE if (ipv4_config.log_martians && net_ratelimit()) printk(KERN_WARNING "martian destination %08x from %08x, dev %s\n", daddr, saddr, dev->name); +#endif return -EINVAL; martian_source: +#ifdef CONFIG_IP_ROUTE_VERBOSE if (ipv4_config.log_martians && net_ratelimit()) { /* * RFC1812 recommenadtion, if source is martian, @@ -1104,6 +1089,7 @@ martian_source: printk("\n"); } } +#endif return -EINVAL; } @@ -1112,224 +1098,298 @@ int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr, { struct rtable * rth; unsigned hash; - - if (skb->dst) - return 0; - -#if RT_CACHE_DEBUG >= 1 - if (dev->flags & IFF_LOOPBACK) { - printk(KERN_DEBUG "ip_route_input: bug: packet is looped back\n"); - return -EINVAL; - } - if (net_alias_main_dev(dev) != dev) - printk(KERN_DEBUG "ip_route_input: bug: packet is received on alias %s\n", dev->name); -#endif + int iif = dev->ifindex; tos &= IPTOS_TOS_MASK; - hash = rt_hash_code(daddr, saddr^(unsigned long)dev, tos); - skb->dev = dev; + hash = rt_hash_code(daddr, saddr^(iif<<5), tos); for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) { if (rth->key.dst == daddr && rth->key.src == saddr && - rth->key.src_dev == dev && - rth->key.dst_dev == NULL && + rth->key.iif == iif && + rth->key.oif == 0 && rth->key.tos == tos) { rth->u.dst.lastuse = jiffies; atomic_inc(&rth->u.dst.use); atomic_inc(&rth->u.dst.refcnt); skb->dst = (struct dst_entry*)rth; - skb->dev = rth->rt_src_dev; return 0; } } + + /* Multicast recognition logic is moved from route cache to here. + The problem was that too many ethernet cards have broken/missing + hardware multicast filters :-( As result the host on multicasting + network acquires a lot of useless route cache entries, sort of + SDR messages from all the world. Now we try to get rid of them. + Really, provided software IP multicast filter is organized + reasonably (at least, hashed), it does not result in a slowdown + comparing with route cache reject entries. + Note, that multicast routers are not affected, because + route cache entry is created eventually. + */ + if (MULTICAST(daddr)) { + int our = ip_check_mc(dev, daddr); + if (!our +#ifdef CONFIG_IP_MROUTE + && (LOCAL_MCAST(daddr) || !dev->ip_ptr || + !IN_DEV_MFORWARD((struct in_device*)dev->ip_ptr)) +#endif + ) return -EINVAL; + return ip_route_input_mc(skb, daddr, saddr, tos, dev, our); + } return ip_route_input_slow(skb, daddr, saddr, tos, dev); } - /* * Major route resolver routine. */ -int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, - struct device *dev_out) +int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, int oif) { - u32 src_key = saddr; - u32 dst_key = daddr; - u32 dst_map; - struct device *dst_dev_key = dev_out; + struct rt_key key; + struct fib_result res; unsigned flags = 0; - struct fib_info *fi = NULL; struct rtable *rth; -#ifdef CONFIG_IP_LOCAL_RT_POLICY - struct fib_result res; -#endif + struct device *dev_out = NULL; unsigned hash; tos &= IPTOS_TOS_MASK|1; + key.dst = daddr; + key.src = saddr; + key.tos = tos&IPTOS_TOS_MASK; + key.iif = loopback_dev.ifindex; + key.oif = oif; + key.scope = (tos&1) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE; + res.fi = NULL; if (saddr) { - if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr) || - __ip_chk_addr(saddr) != IS_MYADDR) + if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr)) return -EINVAL; - if (dev_out == NULL && (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) - dev_out = ip_dev_find(saddr, NULL); + + /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ + dev_out = ip_dev_find(saddr); + if (dev_out == NULL) + return -EINVAL; + + /* I removed check for oif == dev_out->oif here. + It was wrong by three reasons: + 1. ip_dev_find(saddr) can return wrong iface, if saddr is + assigned to multiple interfaces. + 2. Moreover, we are allowed to send packets with saddr + of another iface. --ANK + */ + + if (oif == 0 && (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) { + /* Special hack: user can direct multicasts + and limited broadcast via necessary interface + without fiddling with IP_MULTICAST_IF or IP_TXINFO. + This hack is not just for fun, it allows + vic,vat and friends to work. + They bind socket to loopback, set ttl to zero + and expect that it will work. + From the viewpoint of routing cache they are broken, + because we are not allowed to build multicast path + with loopback source addr (look, routing cache + cannot know, that ttl is zero, so that packet + will not leave this host and route is valid). + Luckily, this hack is good workaround. + */ + + key.oif = dev_out->ifindex; + goto make_route; + } + dev_out = NULL; } - if (!daddr) - daddr = saddr; - - if (dev_out) { - if (!saddr) { - saddr = dev_out->pa_addr; - if (!daddr) - daddr = saddr; + if (oif) { + dev_out = dev_get_by_index(oif); + if (dev_out == NULL) + return -ENODEV; + if (dev_out->ip_ptr == NULL) + return -ENODEV; /* Wrong error code */ + + if (LOCAL_MCAST(daddr) || daddr == 0xFFFFFFFF) { + key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); + goto make_route; } - dst_map = daddr; - if (MULTICAST(daddr) || daddr == 0xFFFFFFFF) + if (MULTICAST(daddr)) { + key.src = inet_select_addr(dev_out, 0, key.scope); goto make_route; + } + if (!daddr) + key.src = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } - if (!daddr) - daddr = htonl(INADDR_LOOPBACK); + if (!key.dst) { + key.dst = key.src; + if (!key.dst) + key.dst = key.src = htonl(INADDR_LOOPBACK); + dev_out = &loopback_dev; + key.oif = loopback_dev.ifindex; + flags |= RTCF_LOCAL; + goto make_route; + } -#ifdef CONFIG_IP_LOCAL_RT_POLICY - if (fib_lookup(&res, daddr, saddr, tos, &loopback_dev, dev_out)) + if (fib_lookup(&key, &res)) { + res.fi = NULL; + if (oif) { + /* Apparently, routing tables are wrong. Assume, + that the destination is on link. + + WHY? DW. + Because we are allowed to send to iface + even if it has NO routes and NO assigned + addresses. When oif is specified, routing + tables are looked up with only one purpose: + to catch if destination is gatewayed, rather than + direct. Moreover, if MSG_DONTROUTE is set, + we send packet, no matter of routing tables + of ifaddr state. --ANK + + + We could make it even if oif is unknown, + likely IPv6, but we do not. + */ + + printk(KERN_DEBUG "Dest not on link. Forcing...\n"); + if (key.src == 0) + key.src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); + goto make_route; + } return -ENETUNREACH; - fi = res.f->fib_info; - dst_map = daddr; + } - if (fi->fib_flags&RTF_NAT) + if (res.type == RTN_NAT) return -EINVAL; - if (!saddr) { - saddr = fi->fib_dev->pa_addr; + if (!key.src) { + key.src = FIB_RES_PREFSRC(res); + +#ifdef CONFIG_IP_MULTIPLE_TABLES /* * "Stabilization" of route. * This step is necessary, if locally originated packets - * are subjected to source routing, else we could get + * are subjected to policy routing, otherwise we could get * route flapping. */ - fi = fib_lookup_info(dst_map, saddr, tos, &loopback_dev, dev_out); - if (!fi) + if (fib_lookup(&key, &res)) return -ENETUNREACH; +#endif } -#else - fi = fib_lookup_info(daddr, 0, tos, &loopback_dev, dev_out); - if (!fi) - return -ENETUNREACH; - - if (fi->fib_flags&RTF_NAT) - return -EINVAL; - dst_map = daddr; - if (!saddr) - saddr = fi->fib_dev->pa_addr; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (res.fi->fib_nhs > 1 && key.oif == 0) + fib_select_multipath(&key, &res); #endif - flags |= fi->fib_flags; - dev_out = fi->fib_dev; + dev_out = FIB_RES_DEV(res); - if (RT_LOCALADDR(flags)) { + if (res.type == RTN_LOCAL) { dev_out = &loopback_dev; - fi = NULL; + key.oif = dev_out->ifindex; + res.fi = NULL; + flags |= RTCF_LOCAL; } - if (dst_dev_key && dev_out != dst_dev_key) - return -EINVAL; + key.oif = dev_out->ifindex; make_route: - if (LOOPBACK(saddr) && !(dev_out->flags&IFF_LOOPBACK)) { - printk(KERN_DEBUG "this guy talks to %08x from loopback\n", daddr); + if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK)) { + printk(KERN_DEBUG "this guy talks to %08x from loopback\n", key.dst); return -EINVAL; } - if (daddr == 0xFFFFFFFF) - flags |= RTF_BROADCAST; - else if (MULTICAST(daddr)) - flags |= RTF_MULTICAST; - else if (BADCLASS(daddr) || ZERONET(daddr)) + if (key.dst == 0xFFFFFFFF) + res.type = RTN_BROADCAST; + else if (MULTICAST(key.dst)) + res.type = RTN_MULTICAST; + else if (BADCLASS(key.dst) || ZERONET(key.dst)) return -EINVAL; - if (flags&RTF_BROADCAST && (dev_out->flags&IFF_LOOPBACK || - !(dev_out->flags&IFF_BROADCAST))) - flags &= ~RTF_LOCAL; - else if (flags&RTF_MULTICAST) { + if (res.type == RTN_BROADCAST) { + flags |= RTCF_BROADCAST; + if (!(dev_out->flags&IFF_LOOPBACK) && dev_out->flags&IFF_BROADCAST) + flags |= RTCF_LOCAL; + } else if (res.type == RTN_MULTICAST) { + flags |= RTCF_MULTICAST; if (ip_check_mc(dev_out, daddr)) - flags |= RTF_LOCAL; + flags |= RTCF_LOCAL; } - + rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops); if (!rth) return -ENOBUFS; atomic_set(&rth->u.dst.use, 1); - rth->key.dst = dst_key; + rth->key.dst = daddr; rth->key.tos = tos; - rth->key.src = src_key; - rth->key.src_dev= NULL; - rth->key.dst_dev= dst_dev_key; - rth->rt_dst = daddr; - rth->rt_dst_map = dst_map; - rth->rt_src = saddr; - rth->rt_src_map = saddr; - rth->rt_src_dev = dev_out; + rth->key.src = saddr; + rth->key.iif = 0; + rth->key.oif = oif; + rth->rt_dst = key.dst; + rth->rt_src = key.src; +#ifdef CONFIG_IP_ROUTE_NAT + rth->rt_dst_map = key.dst; + rth->rt_src_map = key.src; +#endif + rth->rt_iif = dev_out->ifindex; rth->u.dst.dev = dev_out; - rth->rt_gateway = dst_map; - rth->rt_spec_dst= dev_out->pa_addr; + rth->rt_gateway = key.dst; + rth->rt_spec_dst= key.src; rth->u.dst.output=ip_output; - if (flags&RTF_LOCAL) { + if (flags&RTCF_LOCAL) { rth->u.dst.input = ip_local_deliver; - rth->rt_spec_dst = daddr; + rth->rt_spec_dst = key.dst; } - if (flags&(RTF_BROADCAST|RTF_MULTICAST)) { - rth->rt_spec_dst = dev_out->pa_addr; - flags &= ~RTF_GATEWAY; - if (flags&RTF_LOCAL) + if (flags&(RTCF_BROADCAST|RTCF_MULTICAST)) { + rth->rt_spec_dst = key.src; + if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK)) rth->u.dst.output = ip_mc_output; - if (flags&RTF_MULTICAST) { - if (dev_out->flags&IFF_ALLMULTI) - rth->u.dst.output = ip_mc_output; #ifdef CONFIG_IP_MROUTE - if (ipv4_config.multicast_route && !LOCAL_MCAST(daddr)) + if (res.type == RTN_MULTICAST && dev_out->ip_ptr) { + struct in_device *in_dev = dev_out->ip_ptr; + if (IN_DEV_MFORWARD(in_dev) && !LOCAL_MCAST(daddr)) { rth->u.dst.input = ip_mr_input; -#endif + rth->u.dst.output = ip_mc_output; + } } +#endif } - if (fi) { - if (flags&RTF_GATEWAY) - rth->rt_gateway = fi->fib_gateway; - rth->u.dst.pmtu = fi->fib_mtu; - rth->u.dst.window=fi->fib_window; - rth->u.dst.rtt = fi->fib_irtt; + if (res.fi) { + if (FIB_RES_GW(res) && FIB_RES_NH(res).nh_scope == RT_SCOPE_LINK) + rth->rt_gateway = FIB_RES_GW(res); + rth->u.dst.pmtu = res.fi->fib_mtu ? : dev_out->mtu; + rth->u.dst.window=res.fi->fib_window ? : 0; + rth->u.dst.rtt = res.fi->fib_rtt ? : TCP_TIMEOUT_INIT; } else { rth->u.dst.pmtu = dev_out->mtu; rth->u.dst.window=0; rth->u.dst.rtt = TCP_TIMEOUT_INIT; } rth->rt_flags = flags; - hash = rt_hash_code(dst_key, dst_dev_key ? src_key^(dst_dev_key->ifindex<<5) : src_key, tos); + rth->rt_type = res.type; + hash = rt_hash_code(daddr, saddr^(oif<<5), tos); *rp = rt_intern_hash(hash, rth, ETH_P_IP); return 0; } -int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, struct device *dev_out) +int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, int oif) { unsigned hash; struct rtable *rth; - hash = rt_hash_code(daddr, dev_out ? saddr^(dev_out->ifindex<<5) - : saddr, tos); + hash = rt_hash_code(daddr, saddr^(oif<<5), tos); start_bh_atomic(); for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) { if (rth->key.dst == daddr && rth->key.src == saddr && - rth->key.src_dev == NULL && - rth->key.dst_dev == dev_out && + rth->key.iif == 0 && + rth->key.oif == oif && rth->key.tos == tos) { rth->u.dst.lastuse = jiffies; atomic_inc(&rth->u.dst.use); @@ -1341,48 +1401,126 @@ int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, struct dev } end_bh_atomic(); - return ip_route_output_slow(rp, daddr, saddr, tos, dev_out); + return ip_route_output_slow(rp, daddr, saddr, tos, oif); } -int ip_route_output_dev(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, int ifindex) +#ifdef CONFIG_RTNETLINK + +int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) { - unsigned hash; - struct rtable *rth; - struct device *dev_out; + struct kern_rta *rta = arg; + struct rtmsg *rtm = NLMSG_DATA(nlh); + struct rtable *rt = NULL; + u32 dst = 0; + u32 src = 0; + int err; + struct sk_buff *skb; + u8 *o; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + return -ENOBUFS; - hash = rt_hash_code(daddr, saddr^(ifindex<<5), tos); + /* Reserve room for dummy headers, this skb can pass + through good chunk of routing engine. + */ + skb->mac.raw = skb->data; + skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); + + if (rta->rta_dst) + memcpy(&dst, rta->rta_dst, 4); + if (rta->rta_src) + memcpy(&src, rta->rta_src, 4); + + if (rta->rta_iif) { + struct device *dev; + dev = dev_get_by_index(*rta->rta_iif); + if (!dev) + return -ENODEV; + skb->protocol = __constant_htons(ETH_P_IP); + skb->dev = dev; + start_bh_atomic(); + err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); + end_bh_atomic(); + rt = (struct rtable*)skb->dst; + if (!err && rt->u.dst.error) + err = rt->u.dst.error; + } else { + err = ip_route_output(&rt, dst, src, rtm->rtm_tos, + rta->rta_oif ? *rta->rta_oif : 0); + } + if (err) { + kfree_skb(skb, FREE_WRITE); + return err; + } - start_bh_atomic(); - for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) { - if (rth->key.dst == daddr && - rth->key.src == saddr && - rth->key.src_dev == NULL && - rth->key.tos == tos && - rth->key.dst_dev && - rth->key.dst_dev->ifindex == ifindex) { - rth->u.dst.lastuse = jiffies; - atomic_inc(&rth->u.dst.use); - atomic_inc(&rth->u.dst.refcnt); - end_bh_atomic(); - *rp = rth; - return 0; + skb->dst = &rt->u.dst; + if (rtm->rtm_flags & RTM_F_NOTIFY) + rt->rt_flags |= RTCF_NOTIFY; + + nlh = NLMSG_PUT(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, + RTM_NEWROUTE, sizeof(*rtm)); + rtm = NLMSG_DATA(nlh); + nlh->nlmsg_flags = 0; + rtm->rtm_family = AF_INET; + rtm->rtm_dst_len = 32; + rtm->rtm_src_len = 32; + rtm->rtm_tos = rt->key.tos; + rtm->rtm_table = RT_TABLE_MAIN; + rtm->rtm_type = rt->rt_type; + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + rtm->rtm_protocol = RTPROT_UNSPEC; + rtm->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED; + rtm->rtm_nhs = 0; + + o = skb->tail; + RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst); + RTA_PUT(skb, RTA_SRC, 4, &rt->rt_src); + if (rt->u.dst.dev) + RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex); + if (rt->rt_dst != rt->rt_gateway) + RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway); + RTA_PUT(skb, RTA_MTU, sizeof(unsigned), &rt->u.dst.pmtu); + RTA_PUT(skb, RTA_WINDOW, sizeof(unsigned), &rt->u.dst.window); + RTA_PUT(skb, RTA_RTT, sizeof(unsigned), &rt->u.dst.rtt); + RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst); + rtm->rtm_optlen = skb->tail - o; + if (rta->rta_iif) { +#ifdef CONFIG_IP_MROUTE + if (MULTICAST(dst) && !LOCAL_MCAST(dst) && ipv4_config.multicast_route) { + NETLINK_CB(skb).pid = NETLINK_CB(in_skb).pid; + err = ipmr_get_route(skb, rtm); + if (err <= 0) + return err; + } else +#endif + { + RTA_PUT(skb, RTA_IIF, 4, rta->rta_iif); + rtm->rtm_optlen = skb->tail - o; } } - end_bh_atomic(); + nlh->nlmsg_len = skb->tail - (u8*)nlh; + err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); + if (err < 0) + return err; + return 0; - dev_out = dev_get_by_index(ifindex); - if (!dev_out) - return -ENODEV; - return ip_route_output_slow(rp, daddr, saddr, tos, dev_out); +nlmsg_failure: +rtattr_failure: + kfree_skb(skb, FREE_WRITE); + return -EMSGSIZE; } -void ip_rt_multicast_event(struct device *dev) +#endif /* CONFIG_RTNETLINK */ + +void ip_rt_multicast_event(struct in_device *in_dev) { - rt_cache_flush(0); + rt_cache_flush(1*HZ); } __initfunc(void ip_rt_init(void)) { + devinet_init(); ip_fib_init(); #ifdef CONFIG_PROC_FS |