diff options
author | Ralf Baechle <ralf@linux-mips.org> | 2000-07-10 23:18:26 +0000 |
---|---|---|
committer | Ralf Baechle <ralf@linux-mips.org> | 2000-07-10 23:18:26 +0000 |
commit | c7c4310f7fc1485925d800628bf50b3aeab535ef (patch) | |
tree | b12aa4be0e8fb82aaaea97fb475e793e8a347c49 /net | |
parent | 1ffd1d069ca4c5ffe16fea6175dab1b9bbb15820 (diff) |
Merge with Linux 2.4.0-test3-pre8. Linus has accepted most of what
I've sent him, so we're very close to full integration of the MIPS
port into his sources.
Diffstat (limited to 'net')
-rw-r--r-- | net/core/neighbour.c | 21 | ||||
-rw-r--r-- | net/core/sock.c | 23 | ||||
-rw-r--r-- | net/decnet/dn_route.c | 5 | ||||
-rw-r--r-- | net/ipv4/Config.in | 9 | ||||
-rw-r--r-- | net/ipv4/arp.c | 22 | ||||
-rw-r--r-- | net/ipv4/ip_fragment.c | 793 | ||||
-rw-r--r-- | net/ipv4/ip_gre.c | 51 | ||||
-rw-r--r-- | net/ipv4/ip_sockglue.c | 11 | ||||
-rw-r--r-- | net/ipv4/ipip.c | 32 | ||||
-rw-r--r-- | net/ipv4/proc.c | 4 | ||||
-rw-r--r-- | net/ipv4/raw.c | 38 | ||||
-rw-r--r-- | net/ipv4/route.c | 55 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 64 | ||||
-rw-r--r-- | net/ipv4/udp.c | 5 | ||||
-rw-r--r-- | net/ipv6/datagram.c | 4 | ||||
-rw-r--r-- | net/ipv6/proc.c | 4 | ||||
-rw-r--r-- | net/ipv6/raw.c | 33 | ||||
-rw-r--r-- | net/ipv6/reassembly.c | 675 | ||||
-rw-r--r-- | net/ipv6/route.c | 10 | ||||
-rw-r--r-- | net/ipv6/sit.c | 32 | ||||
-rw-r--r-- | net/ipv6/udp.c | 6 | ||||
-rw-r--r-- | net/netsyms.c | 11 | ||||
-rw-r--r-- | net/packet/af_packet.c | 22 | ||||
-rw-r--r-- | net/sched/sch_red.c | 10 | ||||
-rw-r--r-- | net/sched/sch_teql.c | 6 |
25 files changed, 1112 insertions, 834 deletions
diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 88322c8d6..b0d989516 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -281,25 +281,27 @@ struct neighbour * neigh_create(struct neigh_table *tbl, const void *pkey, struct neighbour *n, *n1; u32 hash_val; int key_len = tbl->key_len; + int error; n = neigh_alloc(tbl); if (n == NULL) - return NULL; + return ERR_PTR(-ENOBUFS); memcpy(n->primary_key, pkey, key_len); n->dev = dev; dev_hold(dev); /* Protocol specific setup. */ - if (tbl->constructor && tbl->constructor(n) < 0) { + if (tbl->constructor && (error = tbl->constructor(n)) < 0) { neigh_release(n); - return NULL; + return ERR_PTR(error); } /* Device specific setup. */ - if (n->parms && n->parms->neigh_setup && n->parms->neigh_setup(n) < 0) { + if (n->parms && n->parms->neigh_setup && + (error = n->parms->neigh_setup(n)) < 0) { neigh_release(n); - return NULL; + return ERR_PTR(error); } n->confirmed = jiffies - (n->parms->base_reachable_time<<1); @@ -1242,6 +1244,7 @@ int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) if (nda[NDA_LLADDR-1] != NULL && nda[NDA_LLADDR-1]->rta_len != RTA_LENGTH(dev->addr_len)) goto out; + err = 0; n = neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev); if (n) { if (nlh->nlmsg_flags&NLM_F_EXCL) @@ -1249,9 +1252,11 @@ int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) } else if (!(nlh->nlmsg_flags&NLM_F_CREATE)) err = -ENOENT; else { - n = __neigh_lookup(tbl, RTA_DATA(nda[NDA_DST-1]), dev, 1); - if (n == NULL) - err = -ENOBUFS; + n = __neigh_lookup_errno(tbl, RTA_DATA(nda[NDA_DST-1]), dev); + if (IS_ERR(n)) { + err = PTR_ERR(n); + n = NULL; + } } if (err == 0) { err = neigh_update(n, nda[NDA_LLADDR-1] ? RTA_DATA(nda[NDA_LLADDR-1]) : NULL, diff --git a/net/core/sock.c b/net/core/sock.c index 4044a7f7d..fcb6246b3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -7,7 +7,7 @@ * handler for protocols to use and generic option handler. * * - * Version: $Id: sock.c,v 1.93 2000/04/13 03:13:29 davem Exp $ + * Version: $Id: sock.c,v 1.95 2000/07/08 00:20:43 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -308,6 +308,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname, sock->passcred = valbool; break; + case SO_TIMESTAMP: + sk->rcvtstamp = valbool; + break; + case SO_RCVLOWAT: if (val < 0) val = INT_MAX; @@ -485,7 +489,11 @@ int sock_getsockopt(struct socket *sock, int level, int optname, case SO_BSDCOMPAT: v.val = sk->bsdism; break; - + + case SO_TIMESTAMP: + v.val = sk->rcvtstamp; + break; + case SO_RCVTIMEO: lv=sizeof(struct timeval); if (sk->rcvtimeo == MAX_SCHEDULE_TIMEOUT) { @@ -599,7 +607,16 @@ void __init sk_init(void) { sk_cachep = kmem_cache_create("sock", sizeof(struct sock), 0, SLAB_HWCACHE_ALIGN, 0, 0); - + + if (num_physpages <= 4096) { + sysctl_wmem_max = 32767; + sysctl_rmem_max = 32767; + sysctl_wmem_default = 32767; + sysctl_wmem_default = 32767; + } else if (num_physpages >= 131072) { + sysctl_wmem_max = 131071; + sysctl_rmem_max = 131071; + } } /* diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 5ce55ebb2..d97558a24 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -789,13 +789,14 @@ static int dn_route_input_slow(struct sk_buff *skb) if (dn_db->router && ((neigh = neigh_clone(dn_db->router)) != NULL)) goto add_entry; - if ((neigh = neigh_create(&dn_neigh_table, &cb->src, dev)) != NULL) { + neigh = neigh_create(&dn_neigh_table, &cb->src, dev); + if (!IS_ERR(neigh)) { if (dev->type == ARPHRD_ETHER) memcpy(neigh->ha, skb->mac.ethernet->h_source, ETH_ALEN); goto add_entry; } - return -ENOBUFS; + return PTR_ERR(neigh); non_local_input: diff --git a/net/ipv4/Config.in b/net/ipv4/Config.in index 68fea0272..7a44fa565 100644 --- a/net/ipv4/Config.in +++ b/net/ipv4/Config.in @@ -44,15 +44,8 @@ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then bool ' IP: ARP daemon support (EXPERIMENTAL)' CONFIG_ARPD fi fi +bool ' IP: TCP Explicit Congestion Notification support' CONFIG_INET_ECN bool ' IP: TCP syncookie support (disabled per default)' CONFIG_SYN_COOKIES -comment '(it is safe to leave these untouched)' -#bool ' IP: PC/TCP compatibility mode' CONFIG_INET_PCTCP -#bool ' IP: Path MTU Discovery (normally enabled)' CONFIG_PATH_MTU_DISCOVERY -#bool ' IP: Disable NAGLE algorithm (normally enabled)' CONFIG_TCP_NAGLE_OFF -bool ' IP: Allow large windows (not recommended if <16Mb of memory)' CONFIG_SKB_LARGE -#if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then -# bool ' IP: support checksum copy to user for UDP (EXPERIMENTAL)' CONFIG_UDP_DELAY_CSUM -#fi if [ "$CONFIG_NETFILTER" != "n" ]; then source net/ipv4/netfilter/Config.in fi diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 9def6b16b..81d8ebe80 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1,6 +1,6 @@ /* linux/net/inet/arp.c * - * Version: $Id: arp.c,v 1.86 2000/04/26 09:36:36 davem Exp $ + * Version: $Id: arp.c,v 1.87 2000/07/07 22:40:35 davem Exp $ * * Copyright (C) 1994 by Florian La Roche * @@ -424,20 +424,24 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) int arp_bind_neighbour(struct dst_entry *dst) { struct net_device *dev = dst->dev; + struct neighbour *n = dst->neighbour; if (dev == NULL) - return 0; - if (dst->neighbour == NULL) { + return -EINVAL; + if (n == NULL) { u32 nexthop = ((struct rtable*)dst)->rt_gateway; if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT)) nexthop = 0; - dst->neighbour = __neigh_lookup( + n = __neigh_lookup_errno( #ifdef CONFIG_ATM_CLIP dev->type == ARPHRD_ATM ? &clip_tbl : #endif - &arp_tbl, &nexthop, dev, 1); + &arp_tbl, &nexthop, dev); + if (IS_ERR(n)) + return PTR_ERR(n); + dst->neighbour = n; } - return (dst->neighbour != NULL); + return 0; } /* @@ -847,9 +851,9 @@ int arp_req_set(struct arpreq *r, struct net_device * dev) if (r->arp_ha.sa_family != dev->type) return -EINVAL; - err = -ENOBUFS; - neigh = __neigh_lookup(&arp_tbl, &ip, dev, 1); - if (neigh) { + neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev); + err = PTR_ERR(neigh); + if (!IS_ERR(neigh)) { unsigned state = NUD_STALE; if (r->arp_flags & ATF_PERM) state = NUD_PERMANENT; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 70f8cfb90..07041a3e5 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -5,7 +5,7 @@ * * The IP fragmentation functionality. * - * Version: $Id: ip_fragment.c,v 1.49 2000/04/15 01:48:10 davem Exp $ + * Version: $Id: ip_fragment.c,v 1.50 2000/07/07 22:29:42 davem Exp $ * * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox <Alan.Cox@linux.org> @@ -18,6 +18,7 @@ * Ultima : ip_expire() kernel panic. * Bill Hawes : Frag accounting and evictor fixes. * John McDonald : 0 length frag bug. + * Alexey Kuznetsov: SMP races, threading, cleanup. */ #include <linux/config.h> @@ -31,11 +32,17 @@ #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> +#include <net/checksum.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/inet.h> #include <linux/netfilter_ipv4.h> +/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 + * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c + * as well. Or notify me, at least. --ANK + */ + /* Fragment cache limits. We will commit 256K at one time. Should we * cross that limit we will prune down to 192K. This should cope with * even the most extreme cases without allowing an attacker to measurably @@ -46,38 +53,77 @@ int sysctl_ipfrag_low_thresh = 192*1024; int sysctl_ipfrag_time = IP_FRAG_TIME; -/* Describe an IP fragment. */ -struct ipfrag { - int offset; /* offset of fragment in IP datagram */ - int end; /* last byte of data in datagram */ - int len; /* length of this fragment */ - struct sk_buff *skb; /* complete received fragment */ - unsigned char *ptr; /* pointer into real fragment data */ - struct ipfrag *next; /* linked list pointers */ - struct ipfrag *prev; +struct ipfrag_skb_cb +{ + struct inet_skb_parm h; + int offset; }; +#define FRAG_CB(skb) ((struct ipfrag_skb_cb*)((skb)->cb)) + /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { - struct iphdr *iph; /* pointer to IP header */ struct ipq *next; /* linked list pointers */ - struct ipfrag *fragments; /* linked list of received fragments */ + u32 saddr; + u32 daddr; + u16 id; + u8 protocol; + u8 last_in; +#define COMPLETE 4 +#define FIRST_IN 2 +#define LAST_IN 1 + + struct sk_buff *fragments; /* linked list of received fragments */ int len; /* total length of original datagram */ - short ihlen; /* length of the IP header */ + int meat; + spinlock_t lock; + atomic_t refcnt; struct timer_list timer; /* when will this queue expire? */ struct ipq **pprev; - struct net_device *dev; /* Device - for icmp replies */ + struct net_device *dev; /* Device - for icmp replies */ }; +/* Hash table. */ + #define IPQ_HASHSZ 64 +/* Per-bucket lock is easy to add now. */ static struct ipq *ipq_hash[IPQ_HASHSZ]; -static spinlock_t ipfrag_lock = SPIN_LOCK_UNLOCKED; +static rwlock_t ipfrag_lock = RW_LOCK_UNLOCKED; +int ip_frag_nqueues = 0; -#define ipqhashfn(id, saddr, daddr, prot) \ - ((((id) >> 1) ^ (saddr) ^ (daddr) ^ (prot)) & (IPQ_HASHSZ - 1)) +static __inline__ void __ipq_unlink(struct ipq *qp) +{ + if(qp->next) + qp->next->pprev = qp->pprev; + *qp->pprev = qp->next; + ip_frag_nqueues--; +} + +static __inline__ void ipq_unlink(struct ipq *ipq) +{ + write_lock(&ipfrag_lock); + __ipq_unlink(ipq); + write_unlock(&ipfrag_lock); +} -static atomic_t ip_frag_mem = ATOMIC_INIT(0); /* Memory used for fragments */ +/* + * Was: ((((id) >> 1) ^ (saddr) ^ (daddr) ^ (prot)) & (IPQ_HASHSZ - 1)) + * + * I see, I see evil hand of bigendian mafia. On Intel all the packets hit + * one hash bucket with this hash function. 8) + */ +static __inline__ unsigned int ipqhashfn(u16 id, u32 saddr, u32 daddr, u8 prot) +{ + unsigned int h = saddr ^ daddr; + + h ^= (h>>16)^id; + h ^= (h>>8)^prot; + return h & (IPQ_HASHSZ - 1); +} + + +atomic_t ip_frag_mem = ATOMIC_INIT(0); /* Memory used for fragments */ /* Memory Tracking Functions. */ extern __inline__ void frag_kfree_skb(struct sk_buff *skb) @@ -86,112 +132,106 @@ extern __inline__ void frag_kfree_skb(struct sk_buff *skb) kfree_skb(skb); } -extern __inline__ void frag_kfree_s(void *ptr, int len) +extern __inline__ void frag_free_queue(struct ipq *qp) { - atomic_sub(len, &ip_frag_mem); - kfree(ptr); + atomic_sub(sizeof(struct ipq), &ip_frag_mem); + kfree(qp); } - -extern __inline__ void *frag_kmalloc(int size, int pri) + +extern __inline__ struct ipq *frag_alloc_queue(void) { - void *vp = kmalloc(size, pri); + struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC); - if(!vp) + if(!qp) return NULL; - atomic_add(size, &ip_frag_mem); - return vp; + atomic_add(sizeof(struct ipq), &ip_frag_mem); + return qp; } - -/* Create a new fragment entry. */ -static struct ipfrag *ip_frag_create(int offset, int end, - struct sk_buff *skb, unsigned char *ptr) + + +/* Destruction primitives. */ + +/* Complete destruction of ipq. */ +static void ip_frag_destroy(struct ipq *qp) { - struct ipfrag *fp; + struct sk_buff *fp; - fp = (struct ipfrag *) frag_kmalloc(sizeof(struct ipfrag), GFP_ATOMIC); - if (fp == NULL) - goto out_nomem; + BUG_TRAP(qp->last_in&COMPLETE); + BUG_TRAP(del_timer(&qp->timer) == 0); - /* Fill in the structure. */ - fp->offset = offset; - fp->end = end; - fp->len = end - offset; - fp->skb = skb; - fp->ptr = ptr; - fp->next = fp->prev = NULL; - - /* Charge for the SKB as well. */ - atomic_add(skb->truesize, &ip_frag_mem); + /* Release all fragment data. */ + fp = qp->fragments; + while (fp) { + struct sk_buff *xp = fp->next; - return(fp); + frag_kfree_skb(fp); + fp = xp; + } -out_nomem: - NETDEBUG(printk(KERN_ERR "IP: frag_create: no memory left !\n")); - return(NULL); + /* Finally, release the queue descriptor itself. */ + frag_free_queue(qp); } -/* Find the correct entry in the "incomplete datagrams" queue for - * this IP datagram, and return the queue entry address if found. - */ -static inline struct ipq *ip_find(struct iphdr *iph, struct dst_entry *dst) +static __inline__ void ipq_put(struct ipq *ipq) { - __u16 id = iph->id; - __u32 saddr = iph->saddr; - __u32 daddr = iph->daddr; - __u8 protocol = iph->protocol; - unsigned int hash = ipqhashfn(id, saddr, daddr, protocol); - struct ipq *qp; - - /* We are always in BH context, and protected by the - * ipfrag lock. - */ - for(qp = ipq_hash[hash]; qp; qp = qp->next) { - if(qp->iph->id == id && - qp->iph->saddr == saddr && - qp->iph->daddr == daddr && - qp->iph->protocol == protocol) { - del_timer(&qp->timer); - break; - } - } - return qp; + if (atomic_dec_and_test(&ipq->refcnt)) + ip_frag_destroy(ipq); } -/* Remove an entry from the "incomplete datagrams" queue, either - * because we completed, reassembled and processed it, or because - * it timed out. - * - * This is called _only_ from BH contexts with the ipfrag lock held, - * on packet reception processing and from frag queue expiration - * timers. -DaveM +/* Kill ipq entry. It is not destroyed immediately, + * because caller (and someone more) holds reference count. */ -static void ip_free(struct ipq *qp) +static __inline__ void ipq_kill(struct ipq *ipq) { - struct ipfrag *fp; - - /* Stop the timer for this entry. */ - del_timer(&qp->timer); - - /* Remove this entry from the "incomplete datagrams" queue. */ - if(qp->next) - qp->next->pprev = qp->pprev; - *qp->pprev = qp->next; - - /* Release all fragment data. */ - fp = qp->fragments; - while (fp) { - struct ipfrag *xp = fp->next; + if (del_timer(&ipq->timer)) + atomic_dec(&ipq->refcnt); - frag_kfree_skb(fp->skb); - frag_kfree_s(fp, sizeof(struct ipfrag)); - fp = xp; + if (!(ipq->last_in & COMPLETE)) { + ipq_unlink(ipq); + atomic_dec(&ipq->refcnt); + ipq->last_in |= COMPLETE; } +} - /* Release the IP header. */ - frag_kfree_s(qp->iph, 64 + 8); +/* Memory limiting on fragments. Evictor trashes the oldest + * fragment queue until we are back under the low threshold. + */ +static void ip_evictor(void) +{ + int i, progress; - /* Finally, release the queue descriptor itself. */ - frag_kfree_s(qp, sizeof(struct ipq)); + do { + if (atomic_read(&ip_frag_mem) <= sysctl_ipfrag_low_thresh) + return; + progress = 0; + /* FIXME: Make LRU queue of frag heads. -DaveM */ + for (i = 0; i < IPQ_HASHSZ; i++) { + struct ipq *qp; + if (ipq_hash[i] == NULL) + continue; + + write_lock(&ipfrag_lock); + if ((qp = ipq_hash[i]) != NULL) { + /* find the oldest queue for this hash bucket */ + while (qp->next) + qp = qp->next; + __ipq_unlink(qp); + write_unlock(&ipfrag_lock); + + spin_lock(&qp->lock); + if (del_timer(&qp->timer)) + atomic_dec(&qp->refcnt); + qp->last_in |= COMPLETE; + spin_unlock(&qp->lock); + + ipq_put(qp); + IP_INC_STATS_BH(IpReasmFails); + progress = 1; + continue; + } + write_unlock(&ipfrag_lock); + } + } while (progress); } /* @@ -201,181 +241,310 @@ static void ip_expire(unsigned long arg) { struct ipq *qp = (struct ipq *) arg; - spin_lock(&ipfrag_lock); - if(!qp->fragments) - { -#ifdef IP_EXPIRE_DEBUG - printk("warning: possible ip-expire attack\n"); -#endif + spin_lock(&qp->lock); + + if (qp->last_in & COMPLETE) goto out; - } - - /* Send an ICMP "Fragment Reassembly Timeout" message. */ + + ipq_kill(qp); + IP_INC_STATS_BH(IpReasmTimeout); IP_INC_STATS_BH(IpReasmFails); - icmp_send(qp->fragments->skb, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); + if ((qp->last_in&FIRST_IN) && qp->fragments != NULL) { + /* Send an ICMP "Fragment Reassembly Timeout" message. */ + icmp_send(qp->fragments, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); + } out: - /* Nuke the fragment queue. */ - ip_free(qp); - spin_unlock(&ipfrag_lock); + spin_unlock(&qp->lock); + ipq_put(qp); } -/* Memory limiting on fragments. Evictor trashes the oldest - * fragment queue until we are back under the low threshold. - * - * We are always called in BH with the ipfrag lock held. - */ -static void ip_evictor(void) +/* Creation primitives. */ + +static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in) { - int i, progress; + struct ipq *qp; -restart: - progress = 0; - /* FIXME: Make LRU queue of frag heads. -DaveM */ - for (i = 0; i < IPQ_HASHSZ; i++) { - struct ipq *qp; - if (atomic_read(&ip_frag_mem) <= sysctl_ipfrag_low_thresh) - return; - qp = ipq_hash[i]; - if (qp) { - /* find the oldest queue for this hash bucket */ - while (qp->next) - qp = qp->next; - ip_free(qp); - progress = 1; + write_lock(&ipfrag_lock); +#ifdef CONFIG_SMP + /* With SMP race we have to recheck hash table, because + * such entry could be created on other cpu, while we + * promoted read lock to write lock. + */ + for(qp = ipq_hash[hash]; qp; qp = qp->next) { + if(qp->id == qp_in->id && + qp->saddr == qp_in->saddr && + qp->daddr == qp_in->daddr && + qp->protocol == qp_in->protocol) { + atomic_inc(&qp->refcnt); + write_unlock(&ipfrag_lock); + qp_in->last_in |= COMPLETE; + ipq_put(qp_in); + return qp; } } - if (progress) - goto restart; - panic("ip_evictor: memcount"); +#endif + qp = qp_in; + + atomic_inc(&qp->refcnt); + if((qp->next = ipq_hash[hash]) != NULL) + qp->next->pprev = &qp->next; + ipq_hash[hash] = qp; + qp->pprev = &ipq_hash[hash]; + ip_frag_nqueues++; + write_unlock(&ipfrag_lock); + return qp; } -/* Add an entry to the 'ipq' queue for a newly received IP datagram. - * We will (hopefully :-) receive all other fragments of this datagram - * in time, so we just create a queue for this datagram, in which we - * will insert the received fragments at their respective positions. - */ -static struct ipq *ip_create(struct sk_buff *skb, struct iphdr *iph) +/* Add an entry to the 'ipq' queue for a newly received IP datagram. */ +static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph) { struct ipq *qp; - unsigned int hash; - int ihlen; - qp = (struct ipq *) frag_kmalloc(sizeof(struct ipq), GFP_ATOMIC); - if (qp == NULL) + if ((qp = frag_alloc_queue()) == NULL) goto out_nomem; - /* Allocate memory for the IP header (plus 8 octets for ICMP). */ - ihlen = iph->ihl * 4; - - qp->iph = (struct iphdr *) frag_kmalloc(64 + 8, GFP_ATOMIC); - if (qp->iph == NULL) - goto out_free; - - memcpy(qp->iph, iph, ihlen + 8); + qp->protocol = iph->protocol; + qp->last_in = 0; + qp->id = iph->id; + qp->saddr = iph->saddr; + qp->daddr = iph->daddr; qp->len = 0; - qp->ihlen = ihlen; + qp->meat = 0; qp->fragments = NULL; - qp->dev = skb->dev; /* Initialize a timer for this entry. */ init_timer(&qp->timer); - qp->timer.expires = 0; /* (to be set later) */ qp->timer.data = (unsigned long) qp; /* pointer to queue */ qp->timer.function = ip_expire; /* expire function */ + qp->lock = SPIN_LOCK_UNLOCKED; + atomic_set(&qp->refcnt, 1); - /* Add this entry to the queue. */ - hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); + return ip_frag_intern(hash, qp); - /* In a BH context and ipfrag lock is held. -DaveM */ - if((qp->next = ipq_hash[hash]) != NULL) - qp->next->pprev = &qp->next; - ipq_hash[hash] = qp; - qp->pprev = &ipq_hash[hash]; +out_nomem: + NETDEBUG(printk(KERN_ERR "ip_frag_create: no memory left !\n")); + return NULL; +} - return qp; +/* Find the correct entry in the "incomplete datagrams" queue for + * this IP datagram, and create new one, if nothing is found. + */ +static inline struct ipq *ip_find(struct iphdr *iph) +{ + __u16 id = iph->id; + __u32 saddr = iph->saddr; + __u32 daddr = iph->daddr; + __u8 protocol = iph->protocol; + unsigned int hash = ipqhashfn(id, saddr, daddr, protocol); + struct ipq *qp; -out_free: - frag_kfree_s(qp, sizeof(struct ipq)); -out_nomem: - NETDEBUG(printk(KERN_ERR "IP: create: no memory left !\n")); - return(NULL); + read_lock(&ipfrag_lock); + for(qp = ipq_hash[hash]; qp; qp = qp->next) { + if(qp->id == id && + qp->saddr == saddr && + qp->daddr == daddr && + qp->protocol == protocol) { + atomic_inc(&qp->refcnt); + read_unlock(&ipfrag_lock); + return qp; + } + } + read_unlock(&ipfrag_lock); + + return ip_frag_create(hash, iph); } -/* See if a fragment queue is complete. */ -static int ip_done(struct ipq *qp) +/* Add new segment to existing queue. */ +static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { - struct ipfrag *fp; - int offset; + struct iphdr *iph = skb->nh.iph; + struct sk_buff *prev, *next; + int flags, offset; + int ihl, end; - /* Only possible if we received the final fragment. */ - if (qp->len == 0) - return 0; + if (qp->last_in & COMPLETE) + goto err; - /* Check all fragment offsets to see if they connect. */ - fp = qp->fragments; - offset = 0; - while (fp) { - if (fp->offset > offset) - return(0); /* fragment(s) missing */ - offset = fp->end; - fp = fp->next; + if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) + atomic_inc(&qp->refcnt); + + offset = ntohs(iph->frag_off); + flags = offset & ~IP_OFFSET; + offset &= IP_OFFSET; + offset <<= 3; /* offset is in 8-byte chunks */ + ihl = iph->ihl * 4; + + /* Determine the position of this fragment. */ + end = offset + (ntohs(iph->tot_len) - ihl); + + /* Is this the final fragment? */ + if ((flags & IP_MF) == 0) { + /* If we already have some bits beyond end + * or have different end, the segment is corrrupted. + */ + if (end < qp->len || + ((qp->last_in & LAST_IN) && end != qp->len)) + goto err; + qp->last_in |= LAST_IN; + qp->len = end; + } else { + if (end&7) { + end &= ~7; + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->ip_summed = CHECKSUM_NONE; + } + if (end > qp->len) { + /* Some bits beyond end -> corruption. */ + if (qp->last_in & LAST_IN) + goto err; + qp->len = end; + } } + if (end == offset) + goto err; + + /* Point into the IP datagram 'data' part. */ + skb_pull(skb, (skb->nh.raw+ihl) - skb->data); + skb_trim(skb, end - offset); + + /* Find out which fragments are in front and at the back of us + * in the chain of fragments so far. We must know where to put + * this fragment, right? + */ + prev = NULL; + for(next = qp->fragments; next != NULL; next = next->next) { + if (FRAG_CB(next)->offset >= offset) + break; /* bingo! */ + prev = next; + } + + /* We found where to put this one. Check for overlap with + * preceding fragment, and, if needed, align things so that + * any overlaps are eliminated. + */ + if (prev) { + int i = (FRAG_CB(prev)->offset + prev->len) - offset; + + if (i > 0) { + offset += i; + if (end <= offset) + goto err; + skb_pull(skb, i); + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->ip_summed = CHECKSUM_NONE; + } + } + + while (next && FRAG_CB(next)->offset < end) { + int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */ + + if (i < next->len) { + /* Eat head of the next overlapped fragment + * and leave the loop. The next ones cannot overlap. + */ + FRAG_CB(next)->offset += i; + skb_pull(next, i); + qp->meat -= i; + if (next->ip_summed != CHECKSUM_UNNECESSARY) + next->ip_summed = CHECKSUM_NONE; + break; + } else { + struct sk_buff *free_it = next; - /* All fragments are present. */ - return 1; + /* Old fragmnet is completely overridden with + * new one drop it. + */ + next = next->next; + + if (prev) + prev->next = next; + else + qp->fragments = next; + + qp->meat -= free_it->len; + frag_kfree_skb(free_it); + } + } + + FRAG_CB(skb)->offset = offset; + + /* Insert this fragment in the chain of fragments. */ + skb->next = next; + if (prev) + prev->next = skb; + else + qp->fragments = skb; + + qp->dev = skb->dev; + qp->meat += skb->len; + atomic_add(skb->truesize, &ip_frag_mem); + if (offset == 0) + qp->last_in |= FIRST_IN; + + return; + +err: + kfree_skb(skb); } + /* Build a new IP datagram from all its fragments. * * FIXME: We copy here because we lack an effective way of handling lists * of bits on input. Until the new skb data handling is in I'm not going * to touch this with a bargepole. */ -static struct sk_buff *ip_glue(struct ipq *qp) +static struct sk_buff *ip_frag_reasm(struct ipq *qp) { struct sk_buff *skb; struct iphdr *iph; - struct ipfrag *fp; - unsigned char *ptr; - int count, len; + struct sk_buff *fp, *head = qp->fragments; + int len; + int ihlen; + + ipq_kill(qp); + + BUG_TRAP(head != NULL); + BUG_TRAP(FRAG_CB(head)->offset == 0); /* Allocate a new buffer for the datagram. */ - len = qp->ihlen + qp->len; - + ihlen = head->nh.iph->ihl*4; + len = ihlen + qp->len; + if(len > 65535) goto out_oversize; - + skb = dev_alloc_skb(len); if (!skb) goto out_nomem; /* Fill in the basic details. */ - skb->mac.raw = ptr = skb->data; - skb->nh.iph = iph = (struct iphdr *) skb_put(skb, len); + skb->mac.raw = skb->data; + skb->nh.raw = skb->data; + FRAG_CB(skb)->h = FRAG_CB(head)->h; + skb->ip_summed = head->ip_summed; + skb->csum = 0; /* Copy the original IP headers into the new buffer. */ - memcpy(ptr, qp->iph, qp->ihlen); - ptr += qp->ihlen; + memcpy(skb_put(skb, ihlen), head->nh.iph, ihlen); /* Copy the data portions of all fragments into the new buffer. */ - fp = qp->fragments; - count = qp->ihlen; - while(fp) { - if ((fp->len <= 0) || ((count + fp->len) > skb->len)) - goto out_invalid; - memcpy((ptr + fp->offset), fp->ptr, fp->len); - if (count == qp->ihlen) { - skb->dst = dst_clone(fp->skb->dst); - skb->dev = fp->skb->dev; - } - count += fp->len; - fp = fp->next; + for (fp=head; fp; fp = fp->next) { + memcpy(skb_put(skb, fp->len), fp->data, fp->len); + + if (skb->ip_summed != fp->ip_summed) + skb->ip_summed = CHECKSUM_NONE; + else if (skb->ip_summed == CHECKSUM_HW) + skb->csum = csum_chain(skb->csum, fp->csum); } - skb->pkt_type = qp->fragments->skb->pkt_type; - skb->protocol = qp->fragments->skb->protocol; + skb->dst = dst_clone(head->dst); + skb->pkt_type = head->pkt_type; + skb->protocol = head->protocol; + skb->dev = qp->dev; + /* * Clearly bogus, because security markings of the individual * fragments should have been checked for consistency before @@ -385,29 +554,24 @@ static struct sk_buff *ip_glue(struct ipq *qp) * as well take the value associated with the first fragment. * --rct */ - skb->security = qp->fragments->skb->security; + skb->security = head->security; #ifdef CONFIG_NETFILTER /* Connection association is same as fragment (if any). */ - skb->nfct = qp->fragments->skb->nfct; + skb->nfct = head->nfct; nf_conntrack_get(skb->nfct); #ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug = qp->fragments->skb->nf_debug; + skb->nf_debug = head->nf_debug; #endif #endif /* Done with all fragments. Fixup the new IP header. */ iph = skb->nh.iph; iph->frag_off = 0; - iph->tot_len = htons(count); + iph->tot_len = htons(len); IP_INC_STATS_BH(IpReasmOKs); return skb; -out_invalid: - NETDEBUG(printk(KERN_ERR - "Invalid fragment list: Fragment over size.\n")); - kfree_skb(skb); - goto out_fail; out_nomem: NETDEBUG(printk(KERN_ERR "IP: queue_glue: no memory for gluing queue %p\n", @@ -417,7 +581,7 @@ out_oversize: if (net_ratelimit()) printk(KERN_INFO "Oversized IP packet from %d.%d.%d.%d.\n", - NIPQUAD(qp->iph->saddr)); + NIPQUAD(qp->saddr)); out_fail: IP_INC_STATS_BH(IpReasmFails); return NULL; @@ -427,185 +591,32 @@ out_fail: struct sk_buff *ip_defrag(struct sk_buff *skb) { struct iphdr *iph = skb->nh.iph; - struct ipfrag *prev, *next, *tmp, *tfp; struct ipq *qp; - unsigned char *ptr; - int flags, offset; - int i, ihl, end; IP_INC_STATS_BH(IpReasmReqds); - spin_lock(&ipfrag_lock); - /* Start by cleaning up the memory. */ if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh) ip_evictor(); - /* - * Look for the entry for this IP datagram in the - * "incomplete datagrams" queue. If found, the - * timer is removed. - */ - qp = ip_find(iph, skb->dst); - - /* Is this a non-fragmented datagram? */ - offset = ntohs(iph->frag_off); - flags = offset & ~IP_OFFSET; - offset &= IP_OFFSET; + /* Lookup (or create) queue header */ + if ((qp = ip_find(iph)) != NULL) { + struct sk_buff *ret = NULL; - offset <<= 3; /* offset is in 8-byte chunks */ - ihl = iph->ihl * 4; + spin_lock(&qp->lock); - /* - * Check whether to create a fresh queue entry. If the - * queue already exists, its timer will be restarted as - * long as we continue to receive fragments. - */ - if (qp) { - /* ANK. If the first fragment is received, - * we should remember the correct IP header (with options) - */ - if (offset == 0) { - /* Fragmented frame replaced by unfragmented copy? */ - if ((flags & IP_MF) == 0) - goto out_freequeue; - qp->ihlen = ihl; - memcpy(qp->iph, iph, (ihl + 8)); - } - } else { - /* Fragmented frame replaced by unfragmented copy? */ - if ((offset == 0) && ((flags & IP_MF) == 0)) - goto out_skb; - - /* If we failed to create it, then discard the frame. */ - qp = ip_create(skb, iph); - if (!qp) - goto out_freeskb; - } - - /* Attempt to construct an oversize packet. */ - if((ntohs(iph->tot_len) + ((int) offset)) > 65535) - goto out_oversize; + ip_frag_queue(qp, skb); - /* Determine the position of this fragment. */ - end = offset + ntohs(iph->tot_len) - ihl; + if (qp->last_in == (FIRST_IN|LAST_IN) && + qp->meat == qp->len) + ret = ip_frag_reasm(qp); - /* Is this the final fragment? */ - if ((flags & IP_MF) == 0) - qp->len = end; - - /* Find out which fragments are in front and at the back of us - * in the chain of fragments so far. We must know where to put - * this fragment, right? - */ - prev = NULL; - for(next = qp->fragments; next != NULL; next = next->next) { - if (next->offset >= offset) - break; /* bingo! */ - prev = next; + spin_unlock(&qp->lock); + ipq_put(qp); + return ret; } - /* Point into the IP datagram 'data' part. */ - ptr = skb->data + ihl; - - /* We found where to put this one. Check for overlap with - * preceding fragment, and, if needed, align things so that - * any overlaps are eliminated. - */ - if ((prev != NULL) && (offset < prev->end)) { - i = prev->end - offset; - offset += i; /* ptr into datagram */ - ptr += i; /* ptr into fragment data */ - } - - /* Look for overlap with succeeding segments. - * If we can merge fragments, do it. - */ - for (tmp = next; tmp != NULL; tmp = tfp) { - tfp = tmp->next; - if (tmp->offset >= end) - break; /* no overlaps at all */ - - i = end - next->offset; /* overlap is 'i' bytes */ - tmp->len -= i; /* so reduce size of */ - tmp->offset += i; /* next fragment */ - tmp->ptr += i; - - /* If we get a frag size of <= 0, remove it and the packet - * that it goes with. - */ - if (tmp->len <= 0) { - if (tmp->prev != NULL) - tmp->prev->next = tmp->next; - else - qp->fragments = tmp->next; - - if (tmp->next != NULL) - tmp->next->prev = tmp->prev; - - /* We have killed the original next frame. */ - next = tfp; - - frag_kfree_skb(tmp->skb); - frag_kfree_s(tmp, sizeof(struct ipfrag)); - } - } - - /* - * Create a fragment to hold this skb. - * No memory to save the fragment? throw the lot ... - */ - tfp = ip_frag_create(offset, end, skb, ptr); - if (!tfp) - goto out_freeskb; - - /* Insert this fragment in the chain of fragments. */ - tfp->prev = prev; - tfp->next = next; - if (prev != NULL) - prev->next = tfp; - else - qp->fragments = tfp; - - if (next != NULL) - next->prev = tfp; - - /* OK, so we inserted this new fragment into the chain. - * Check if we now have a full IP datagram which we can - * bump up to the IP layer... - */ - if (ip_done(qp)) { - /* Glue together the fragments. */ - skb = ip_glue(qp); - /* Free the queue entry. */ -out_freequeue: - ip_free(qp); -out_skb: - spin_unlock(&ipfrag_lock); - return skb; - } - - /* - * The queue is still active ... reset its timer. - */ -out_timer: - mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time); /* ~ 30 seconds */ -out: - spin_unlock(&ipfrag_lock); - return NULL; - - /* - * Error exits ... we need to reset the timer if there's a queue. - */ -out_oversize: - if (net_ratelimit()) - printk(KERN_INFO "Oversized packet received from %u.%u.%u.%u\n", - NIPQUAD(iph->saddr)); - /* the skb isn't in a fragment, so fall through to free it */ -out_freeskb: - kfree_skb(skb); IP_INC_STATS_BH(IpReasmFails); - if (qp) - goto out_timer; - goto out; + kfree_skb(skb); + return NULL; } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 79dc3d629..a316401b0 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -36,6 +36,7 @@ #include <net/ipip.h> #include <net/arp.h> #include <net/checksum.h> +#include <net/inet_ecn.h> #ifdef CONFIG_IPV6 #include <net/ipv6.h> @@ -119,11 +120,11 @@ static int ipgre_tunnel_init(struct net_device *dev); static int ipgre_fb_tunnel_init(struct net_device *dev); static struct net_device ipgre_fb_tunnel_dev = { - "gre%d", 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipgre_fb_tunnel_init, + "gre0", 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipgre_fb_tunnel_init, }; static struct ip_tunnel ipgre_fb_tunnel = { - NULL, &ipgre_fb_tunnel_dev, {0, }, 0, 0, 0, 0, 0, 0, 0, {"gre%d", } + NULL, &ipgre_fb_tunnel_dev, {0, }, 0, 0, 0, 0, 0, 0, 0, {"gre0", } }; /* Tunnel hash table */ @@ -530,6 +531,34 @@ out: #endif } +static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) +{ + if (INET_ECN_is_ce(iph->tos)) { + if (skb->protocol == __constant_htons(ETH_P_IP)) { + if (INET_ECN_is_not_ce(skb->nh.iph->tos)) + IP_ECN_set_ce(skb->nh.iph); + } else if (skb->protocol == __constant_htons(ETH_P_IPV6)) { + if (INET_ECN_is_not_ce(ip6_get_dsfield(skb->nh.ipv6h))) + IP6_ECN_set_ce(skb->nh.ipv6h); + } + } +} + +static inline u8 +ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb) +{ +#ifdef CONFIG_INET_ECN + u8 inner = 0; + if (skb->protocol == __constant_htons(ETH_P_IP)) + inner = old_iph->tos; + else if (skb->protocol == __constant_htons(ETH_P_IPV6)) + inner = ip6_get_dsfield((struct ipv6hdr*)old_iph); + return INET_ECN_encapsulate(tos, inner); +#else + return tos; +#endif +} + int ipgre_rcv(struct sk_buff *skb, unsigned short len) { struct iphdr *iph = skb->nh.iph; @@ -604,6 +633,7 @@ int ipgre_rcv(struct sk_buff *skb, unsigned short len) nf_conntrack_put(skb->nfct); skb->nfct = NULL; #endif + ipgre_ecn_decapsulate(iph, skb); netif_rx(skb); read_unlock(&ipgre_lock); return(0); @@ -638,6 +668,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) int gre_hlen; u32 dst; int mtu; + int err; if (tunnel->recursion++) { tunnel->stat.collisions++; @@ -789,7 +820,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) iph->ihl = sizeof(struct iphdr) >> 2; iph->frag_off = df; iph->protocol = IPPROTO_GRE; - iph->tos = tos; + iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb); iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; @@ -834,10 +865,17 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) skb->nfct = NULL; #endif + err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, + do_ip_send); + if(err < 0) { + if(net_ratelimit()) + printk(KERN_ERR "ipgre_tunnel_xmit: ip_send() failed, err=%d\n", -err); + skb = NULL; + goto tx_error; + } + stats->tx_bytes += skb->len; stats->tx_packets++; - NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - do_ip_send); tunnel->recursion--; return 0; @@ -846,7 +884,8 @@ tx_error_icmp: tx_error: stats->tx_errors++; - dev_kfree_skb(skb); + if(skb) + dev_kfree_skb(skb); tunnel->recursion--; return 0; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 90b74447f..aea8b9370 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -5,7 +5,7 @@ * * The IP to API glue. * - * Version: $Id: ip_sockglue.c,v 1.47 2000/01/16 05:11:23 davem Exp $ + * Version: $Id: ip_sockglue.c,v 1.49 2000/07/08 00:20:43 davem Exp $ * * Authors: see ip.c * @@ -327,6 +327,8 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len) if (err) goto out_free_skb; + sock_recv_timestamp(msg, sk, skb); + serr = SKB_EXT_ERR(skb); sin = (struct sockaddr_in *)msg->msg_name; @@ -462,8 +464,15 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char *optval, int opt break; case IP_TOS: /* This sets both TOS and Precedence */ /* Reject setting of unused bits */ +#ifndef CONFIG_INET_ECN if (val & ~(IPTOS_TOS_MASK|IPTOS_PREC_MASK)) goto e_inval; +#else + if (sk->type == SOCK_STREAM) { + val &= ~3; + val |= sk->protinfo.af_inet.tos & 3; + } +#endif if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP && !capable(CAP_NET_ADMIN)) { err = -EPERM; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index e343f34e8..1177033ca 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -1,7 +1,7 @@ /* * Linux NET3: IP/IP protocol decoder. * - * Version: $Id: ipip.c,v 1.35 2000/07/07 01:55:20 davem Exp $ + * Version: $Id: ipip.c,v 1.37 2000/07/07 23:47:45 davem Exp $ * * Authors: * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 @@ -114,6 +114,7 @@ #include <net/icmp.h> #include <net/protocol.h> #include <net/ipip.h> +#include <net/inet_ecn.h> #define HASH_SIZE 16 #define HASH(addr) ((addr^(addr>>4))&0xF) @@ -122,11 +123,11 @@ static int ipip_fb_tunnel_init(struct net_device *dev); static int ipip_tunnel_init(struct net_device *dev); static struct net_device ipip_fb_tunnel_dev = { - "tunl%d", 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipip_fb_tunnel_init, + "tunl0", 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipip_fb_tunnel_init, }; static struct ip_tunnel ipip_fb_tunnel = { - NULL, &ipip_fb_tunnel_dev, {0, }, 0, 0, 0, 0, 0, 0, 0, {"tunl%d", } + NULL, &ipip_fb_tunnel_dev, {0, }, 0, 0, 0, 0, 0, 0, 0, {"tunl0", } }; static struct ip_tunnel *tunnels_r_l[HASH_SIZE]; @@ -465,6 +466,13 @@ out: #endif } +static inline void ipip_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) +{ + if (INET_ECN_is_ce(iph->tos) && + INET_ECN_is_not_ce(skb->nh.iph->tos)) + IP_ECN_set_ce(iph); +} + int ipip_rcv(struct sk_buff *skb, unsigned short len) { struct iphdr *iph; @@ -489,6 +497,7 @@ int ipip_rcv(struct sk_buff *skb, unsigned short len) nf_conntrack_put(skb->nfct); skb->nfct = NULL; #endif + ipip_ecn_decapsulate(iph, skb); netif_rx(skb); read_unlock(&ipip_lock); return 0; @@ -525,6 +534,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) int max_headroom; /* The extra header space needed */ u32 dst = tiph->daddr; int mtu; + int err; if (tunnel->recursion++) { tunnel->stat.collisions++; @@ -620,7 +630,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) iph->ihl = sizeof(struct iphdr)>>2; iph->frag_off = df; iph->protocol = IPPROTO_IPIP; - iph->tos = tos; + iph->tos = INET_ECN_encapsulate(tos, old_iph->tos); iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; @@ -636,10 +646,17 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) skb->nfct = NULL; #endif + err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, + do_ip_send); + if(err < 0) { + if(net_ratelimit()) + printk(KERN_ERR "ipip_tunnel_xmit: ip_send() failed, err=%d\n", -err); + skb = NULL; + goto tx_error; + } + stats->tx_bytes += skb->len; stats->tx_packets++; - NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - do_ip_send); tunnel->recursion--; return 0; @@ -647,7 +664,8 @@ tx_error_icmp: dst_link_failure(skb); tx_error: stats->tx_errors++; - dev_kfree_skb(skb); + if(skb) + dev_kfree_skb(skb); tunnel->recursion--; return 0; } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 4d94a4cc0..f1ff8f1ee 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -7,7 +7,7 @@ * PROC file system. It is mainly used for debugging and * statistics. * - * Version: $Id: proc.c,v 1.42 2000/04/16 01:11:37 davem Exp $ + * Version: $Id: proc.c,v 1.43 2000/07/07 22:29:42 davem Exp $ * * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de> @@ -78,6 +78,8 @@ int afinet_get_info(char *buffer, char **start, off_t offset, int length) fold_prot_inuse(&udp_prot)); len += sprintf(buffer+len,"RAW: inuse %d\n", fold_prot_inuse(&raw_prot)); + len += sprintf(buffer+len, "FRAG: inuse %d memory %d\n", + ip_frag_nqueues, atomic_read(&ip_frag_mem)); if (offset >= len) { *start = buffer; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 6e3f19287..5ac30dc40 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -5,7 +5,7 @@ * * RAW - implementation of IP "raw" sockets. * - * Version: $Id: raw.c,v 1.50 2000/05/03 06:37:06 davem Exp $ + * Version: $Id: raw.c,v 1.52 2000/07/08 00:20:43 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -502,7 +502,7 @@ int raw_recvmsg(struct sock *sk, struct msghdr *msg, int len, if (err) goto done; - sk->stamp=skb->stamp; + sock_recv_timestamp(msg, sk, skb); /* Copy the address. */ if (sin) { @@ -580,6 +580,36 @@ static int raw_getsockopt(struct sock *sk, int level, int optname, return -ENOPROTOOPT; } +static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + switch(cmd) { + case SIOCOUTQ: + { + int amount = atomic_read(&sk->wmem_alloc); + return put_user(amount, (int *)arg); + } + case SIOCINQ: + { + struct sk_buff *skb; + int amount = 0; + + spin_lock_irq(&sk->receive_queue.lock); + skb = skb_peek(&sk->receive_queue); + if (skb != NULL) + amount = skb->len; + spin_unlock_irq(&sk->receive_queue.lock); + return put_user(amount, (int *)arg); + } + + default: +#ifdef CONFIG_IP_MROUTE + return ipmr_ioctl(sk, cmd, arg); +#else + return -ENOIOCTLCMD; +#endif + } +} + static void get_raw_sock(struct sock *sp, char *tmpbuf, int i) { unsigned int dest, src; @@ -648,9 +678,7 @@ struct proto raw_prot = { close: raw_close, connect: udp_connect, disconnect: udp_disconnect, -#ifdef CONFIG_IP_MROUTE - ioctl: ipmr_ioctl, -#endif + ioctl: raw_ioctl, init: raw_init, setsockopt: raw_setsockopt, getsockopt: raw_getsockopt, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ec254e313..eb00518bd 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -5,7 +5,7 @@ * * ROUTE - implementation of the IP router. * - * Version: $Id: route.c,v 1.86 2000/04/24 07:03:14 davem Exp $ + * Version: $Id: route.c,v 1.88 2000/07/07 23:47:45 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -152,23 +152,29 @@ struct dst_ops ipv4_dst_ops = sizeof(struct rtable), }; +#ifdef CONFIG_INET_ECN +#define ECN_OR_COST(class) TC_PRIO_##class +#else +#define ECN_OR_COST(class) TC_PRIO_FILLER +#endif + __u8 ip_tos2prio[16] = { TC_PRIO_BESTEFFORT, - TC_PRIO_FILLER, + ECN_OR_COST(FILLER), TC_PRIO_BESTEFFORT, - TC_PRIO_FILLER, + ECN_OR_COST(BESTEFFORT), TC_PRIO_BULK, - TC_PRIO_FILLER, + ECN_OR_COST(BULK), TC_PRIO_BULK, - TC_PRIO_FILLER, + ECN_OR_COST(BULK), TC_PRIO_INTERACTIVE, - TC_PRIO_FILLER, + ECN_OR_COST(INTERACTIVE), TC_PRIO_INTERACTIVE, - TC_PRIO_FILLER, + ECN_OR_COST(INTERACTIVE), TC_PRIO_INTERACTIVE_BULK, - TC_PRIO_FILLER, + ECN_OR_COST(INTERACTIVE_BULK), TC_PRIO_INTERACTIVE_BULK, - TC_PRIO_FILLER + ECN_OR_COST(INTERACTIVE_BULK) }; @@ -582,9 +588,15 @@ restart: route or unicast forwarding path. */ if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) { - if (!arp_bind_neighbour(&rt->u.dst)) { + int err = arp_bind_neighbour(&rt->u.dst); + if (err) { write_unlock_bh(&rt_hash_table[hash].lock); + if (err != -ENOBUFS) { + rt_drop(rt); + return err; + } + /* Neighbour tables are full and nothing can be released. Try to shrink route cache, it is most likely it holds some neighbour records. @@ -600,13 +612,8 @@ restart: goto restart; } - if (net_ratelimit()) { - if ((rt->u.dst.dev->flags&IFF_UP) && - __in_dev_get(rt->u.dst.dev)) - printk("Neighbour table overflow.\n"); - else - printk("Device %s is down.\n", rt->u.dst.dev->name); - } + if (net_ratelimit()) + printk("Neighbour table overflow.\n"); rt_drop(rt); return -ENOBUFS; } @@ -712,7 +719,7 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, u32 skeys[2] = { saddr, 0 }; int ikeys[2] = { dev->ifindex, 0 }; - tos &= IPTOS_TOS_MASK; + tos &= IPTOS_RT_MASK; if (!in_dev) return; @@ -791,7 +798,7 @@ void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, if (rt->peer) atomic_inc(&rt->peer->refcnt); - if (!arp_bind_neighbour(&rt->u.dst) || + if (arp_bind_neighbour(&rt->u.dst) || !(rt->u.dst.neighbour->nud_state&NUD_VALID)) { if (rt->u.dst.neighbour) neigh_event_send(rt->u.dst.neighbour, NULL); @@ -967,7 +974,7 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) struct rtable *rth; u32 skeys[2] = { iph->saddr, 0, }; u32 daddr = iph->daddr; - u8 tos = iph->tos & IPTOS_TOS_MASK; + u8 tos = iph->tos & IPTOS_RT_MASK; unsigned short est_mtu = 0; if (ipv4_config.no_pmtu_disc) @@ -1546,7 +1553,7 @@ int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr, unsigned hash; int iif = dev->ifindex; - tos &= IPTOS_TOS_MASK; + tos &= IPTOS_RT_MASK; hash = rt_hash_code(daddr, saddr^(iif<<5), tos); read_lock(&rt_hash_table[hash].lock); @@ -1616,10 +1623,10 @@ int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int int free_res = 0; int err; - tos &= IPTOS_TOS_MASK|RTO_ONLINK; + tos &= IPTOS_RT_MASK|RTO_ONLINK; key.dst = daddr; key.src = saddr; - key.tos = tos&IPTOS_TOS_MASK; + key.tos = tos&IPTOS_RT_MASK; key.iif = loopback_dev.ifindex; key.oif = oif; key.scope = (tos&RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE; @@ -1889,7 +1896,7 @@ int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif) rth->key.src == saddr && rth->key.iif == 0 && rth->key.oif == oif && - !((rth->key.tos^tos)&(IPTOS_TOS_MASK|RTO_ONLINK)) && + !((rth->key.tos^tos)&(IPTOS_RT_MASK|RTO_ONLINK)) && ((tos&RTO_TPROXY) || !(rth->rt_flags&RTCF_TPROXY)) ) { rth->u.dst.lastuse = jiffies; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c4343b707..dbf680233 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,7 +5,7 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.169 2000/04/20 14:41:16 davem Exp $ + * Version: $Id: tcp.c,v 1.170 2000/07/08 00:20:43 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -1018,9 +1018,13 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size) tcp_send_skb(sk, skb, queue_it, mss_now); } } - sk->err = 0; err = copied; - goto out; +out: + __tcp_push_pending_frames(sk, tp, mss_now); + TCP_CHECK_TIMER(sk); +out_unlock: + release_sock(sk); + return err; do_sock_err: if(copied) @@ -1048,12 +1052,7 @@ do_fault: kfree_skb(skb); do_fault2: err = -EFAULT; -out: - __tcp_push_pending_frames(sk, tp, mss_now); - TCP_CHECK_TIMER(sk); -out_unlock: - release_sock(sk); - return err; + goto out; } #undef PSH_NEEDED @@ -1270,10 +1269,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, TCP_CHECK_TIMER(sk); - - if (sk->err) - goto out_err; - err = -ENOTCONN; if (sk->state == TCP_LISTEN) goto out; @@ -1292,13 +1287,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); - /* - * BUG BUG BUG - * This violates 1003.1g compliance. We must wait for - * data to exist even if we read none! - */ - - while (len > 0) { + do { struct sk_buff * skb; u32 offset; @@ -1519,29 +1508,6 @@ do_prequeue: continue; skb->used = 1; tcp_eat_skb(sk, skb); - -#ifdef TCP_LESS_COARSE_ACKS - /* Possible improvement. When sender is faster than receiver, - * traffic looks like: fill window ... wait for window open ... - * fill window. We lose at least one rtt, because call - * cleanup_rbuf only once. Probably, if "len" was large - * we should insert several intermediate cleanup_rbuf(s). - * - * F.e.: - */ - do { - u32 full_space = min(tp->window_clamp, tcp_full_space(sk)); - - /* Try to ACK, if total buffer length is larger - than maximal window and if rcv_window has - chances to increase twice. It will result - to exponentially decreased ACKing during - read to huge (usually, mmapped) buffer. - */ - if (len >= full_space && tp->rcv_wnd <= full_space/2) - cleanup_rbuf(sk, copied); - } while (0); -#endif continue; found_fin_ok: @@ -1552,7 +1518,7 @@ do_prequeue: /* All is done. */ skb->used = 1; break; - } + } while (len > 0); if (user_recv) { if (skb_queue_len(&tp->ucopy.prequeue)) { @@ -1584,9 +1550,6 @@ do_prequeue: release_sock(sk); return copied; -out_err: - err = sock_error(sk); - out: TCP_CHECK_TIMER(sk); release_sock(sk); @@ -2012,7 +1975,6 @@ struct sock *tcp_accept(struct sock *sk, int flags, int *err) struct open_request *req; struct sock *newsk; int error; - long timeo; lock_sock(sk); @@ -2023,10 +1985,10 @@ struct sock *tcp_accept(struct sock *sk, int flags, int *err) if (sk->state != TCP_LISTEN) goto out; - timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); - /* Find already established connection */ if (!tp->accept_queue) { + long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + /* If this is a non blocking socket don't sleep */ error = -EAGAIN; if (!timeo) @@ -2099,6 +2061,8 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, break; } tp->nonagle = (val == 0) ? 0 : 1; + if (val) + tcp_push_pending_frames(sk, tp); break; case TCP_CORK: diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index a14c984d7..dec2a6126 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -5,7 +5,7 @@ * * The User Datagram Protocol (UDP). * - * Version: $Id: udp.c,v 1.83 2000/06/09 07:35:49 davem Exp $ + * Version: $Id: udp.c,v 1.84 2000/07/08 00:20:43 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -710,7 +710,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, int len, if (err) goto out_free; - sk->stamp=skb->stamp; + + sock_recv_timestamp(msg, sk, skb); /* Copy the address. */ if (sin) diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 844ea8228..986cd023f 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: datagram.c,v 1.19 2000/02/27 19:51:47 davem Exp $ + * $Id: datagram.c,v 1.20 2000/07/08 00:20:43 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -127,6 +127,8 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len) if (err) goto out_free_skb; + sock_recv_timestamp(msg, sk, skb); + serr = SKB_EXT_ERR(skb); sin = (struct sockaddr_in6 *)msg->msg_name; diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 22c1fa367..944d665d5 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -7,7 +7,7 @@ * PROC file system. This is very similar to the IPv4 version, * except it reports the sockets in the INET6 address family. * - * Version: $Id: proc.c,v 1.14 2000/04/16 01:11:37 davem Exp $ + * Version: $Id: proc.c,v 1.15 2000/07/07 22:29:42 davem Exp $ * * Authors: David S. Miller (davem@caip.rutgers.edu) * @@ -46,6 +46,8 @@ int afinet6_get_info(char *buffer, char **start, off_t offset, int length, int d fold_prot_inuse(&udpv6_prot)); len += sprintf(buffer+len, "RAW6: inuse %d\n", fold_prot_inuse(&rawv6_prot)); + len += sprintf(buffer+len, "FRAG6: inuse %d memory %d\n", + ip6_frag_nqueues, atomic_read(&ip6_frag_mem)); *start = buffer + offset; len -= offset; if(len > length) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 3f2ec7068..e83870421 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -7,7 +7,7 @@ * * Adapted from linux/net/ipv4/raw.c * - * $Id: raw.c,v 1.36 2000/05/03 06:37:07 davem Exp $ + * $Id: raw.c,v 1.39 2000/07/08 00:20:43 davem Exp $ * * Fixes: * Hideaki YOSHIFUJI : sin6_scope_id support @@ -331,7 +331,6 @@ int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, } err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); - sk->stamp=skb->stamp; if (err) goto out_free; @@ -348,6 +347,8 @@ int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, } } + sock_recv_timestamp(msg, sk, skb); + if (sk->net_pinfo.af_inet6.rxopt.all) datagram_recv_ctl(sk, msg, skb); err = copied; @@ -535,6 +536,8 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, int len) fl.proto = proto; fl.fl6_dst = daddr; + if (fl.fl6_src == NULL && !ipv6_addr_any(&np->saddr)) + fl.fl6_src = &np->saddr; fl.uli_u.icmpt.type = 0; fl.uli_u.icmpt.code = 0; @@ -694,6 +697,31 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname, return 0; } +static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + switch(cmd) { + case SIOCOUTQ: + { + int amount = atomic_read(&sk->wmem_alloc); + return put_user(amount, (int *)arg); + } + case SIOCINQ: + { + struct sk_buff *skb; + int amount = 0; + + spin_lock_irq(&sk->receive_queue.lock); + skb = skb_peek(&sk->receive_queue); + if (skb != NULL) + amount = skb->tail - skb->h.raw; + spin_unlock_irq(&sk->receive_queue.lock); + return put_user(amount, (int *)arg); + } + + default: + return -ENOIOCTLCMD; + } +} static void rawv6_close(struct sock *sk, long timeout) { @@ -790,6 +818,7 @@ struct proto rawv6_prot = { close: rawv6_close, connect: udpv6_connect, disconnect: udp_disconnect, + ioctl: rawv6_ioctl, init: rawv6_init_sk, destroy: inet6_destroy_sock, setsockopt: rawv6_setsockopt, diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 99f4a702f..abdcdc713 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: reassembly.c,v 1.17 2000/05/03 06:37:07 davem Exp $ + * $Id: reassembly.c,v 1.18 2000/07/07 22:29:42 davem Exp $ * * Based on: net/ipv4/ip_fragment.c * @@ -21,6 +21,7 @@ * More RFC compliance. * * Horst von Brand Add missing #include <linux/string.h> + * Alexey Kuznetsov SMP races, threading, cleanup. */ #include <linux/errno.h> #include <linux/types.h> @@ -46,198 +47,202 @@ int sysctl_ip6frag_high_thresh = 256*1024; int sysctl_ip6frag_low_thresh = 192*1024; -int sysctl_ip6frag_time = IPV6_FRAG_TIMEOUT; - -atomic_t ip6_frag_mem = ATOMIC_INIT(0); -static spinlock_t ip6_frag_lock = SPIN_LOCK_UNLOCKED; +int sysctl_ip6frag_time = IPV6_FRAG_TIMEOUT; -struct ipv6_frag { - __u16 offset; - __u16 len; - struct sk_buff *skb; +struct ip6frag_skb_cb +{ + struct inet6_skb_parm h; + int offset; +}; - struct frag_hdr *fhdr; +#define FRAG6_CB(skb) ((struct ip6frag_skb_cb*)((skb)->cb)) - struct ipv6_frag *next; -}; /* * Equivalent of ipv4 struct ipq */ -struct frag_queue { - +struct frag_queue +{ struct frag_queue *next; - struct frag_queue *prev; __u32 id; /* fragment id */ struct in6_addr saddr; struct in6_addr daddr; + + spinlock_t lock; + atomic_t refcnt; struct timer_list timer; /* expire timer */ - struct ipv6_frag *fragments; - struct net_device *dev; + struct sk_buff *fragments; + int len; + int meat; + struct net_device *dev; int iif; __u8 last_in; /* has first/last segment arrived? */ +#define COMPLETE 4 #define FIRST_IN 2 #define LAST_IN 1 __u8 nexthdr; __u16 nhoffset; + struct frag_queue **pprev; }; -static struct frag_queue ipv6_frag_queue = { - &ipv6_frag_queue, &ipv6_frag_queue, -}; +/* Hash table. */ -/* Memory Tracking Functions. */ -extern __inline__ void frag_kfree_skb(struct sk_buff *skb) +#define IP6Q_HASHSZ 64 + +static struct frag_queue *ip6_frag_hash[IP6Q_HASHSZ]; +static rwlock_t ip6_frag_lock = RW_LOCK_UNLOCKED; +int ip6_frag_nqueues = 0; + +static __inline__ void __fq_unlink(struct frag_queue *fq) { - atomic_sub(skb->truesize, &ip6_frag_mem); - kfree_skb(skb); + if(fq->next) + fq->next->pprev = fq->pprev; + *fq->pprev = fq->next; + ip6_frag_nqueues--; } -extern __inline__ void frag_kfree_s(void *ptr, int len) +static __inline__ void fq_unlink(struct frag_queue *fq) { - atomic_sub(len, &ip6_frag_mem); - kfree(ptr); + write_lock(&ip6_frag_lock); + __fq_unlink(fq); + write_unlock(&ip6_frag_lock); } - -extern __inline__ void *frag_kmalloc(int size, int pri) + +static __inline__ unsigned int ip6qhashfn(u32 id, struct in6_addr *saddr, + struct in6_addr *daddr) { - void *vp = kmalloc(size, pri); + unsigned int h = saddr->s6_addr32[3] ^ daddr->s6_addr32[3] ^ id; - if(!vp) - return NULL; - atomic_add(size, &ip6_frag_mem); - return vp; + h ^= (h>>16); + h ^= (h>>8); + return h & (IP6Q_HASHSZ - 1); } -static void create_frag_entry(struct sk_buff *skb, - __u8 *nhptr, - struct frag_hdr *fhdr); -static u8 * reasm_frag(struct frag_queue *fq, - struct sk_buff **skb_in); - -static void reasm_queue(struct frag_queue *fq, - struct sk_buff *skb, - struct frag_hdr *fhdr, - u8 *nhptr); - -static void fq_free(struct frag_queue *fq); +atomic_t ip6_frag_mem = ATOMIC_INIT(0); -static void frag_prune(void) +/* Memory Tracking Functions. */ +extern __inline__ void frag_kfree_skb(struct sk_buff *skb) { - struct frag_queue *fq; - - spin_lock(&ip6_frag_lock); - while ((fq = ipv6_frag_queue.next) != &ipv6_frag_queue) { - IP6_INC_STATS_BH(Ip6ReasmFails); - fq_free(fq); - if (atomic_read(&ip6_frag_mem) <= sysctl_ip6frag_low_thresh) { - spin_unlock(&ip6_frag_lock); - return; - } - } - if (atomic_read(&ip6_frag_mem)) - printk(KERN_DEBUG "IPv6 frag_prune: memleak\n"); - atomic_set(&ip6_frag_mem, 0); - spin_unlock(&ip6_frag_lock); + atomic_sub(skb->truesize, &ip6_frag_mem); + kfree_skb(skb); } - -u8* ipv6_reassembly(struct sk_buff **skbp, __u8 *nhptr) +extern __inline__ void frag_free_queue(struct frag_queue *fq) { - struct sk_buff *skb = *skbp; - struct frag_hdr *fhdr = (struct frag_hdr *) (skb->h.raw); - struct frag_queue *fq; - struct ipv6hdr *hdr; - - hdr = skb->nh.ipv6h; + atomic_sub(sizeof(struct frag_queue), &ip6_frag_mem); + kfree(fq); +} - IP6_INC_STATS_BH(Ip6ReasmReqds); +extern __inline__ struct frag_queue *frag_alloc_queue(void) +{ + struct frag_queue *fq = kmalloc(sizeof(struct frag_queue), GFP_ATOMIC); - /* Jumbo payload inhibits frag. header */ - if (hdr->payload_len==0) { - icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw); + if(!fq) return NULL; - } - if ((u8 *)(fhdr+1) > skb->tail) { - icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw); - return NULL; - } - if (atomic_read(&ip6_frag_mem) > sysctl_ip6frag_high_thresh) - frag_prune(); + atomic_add(sizeof(struct frag_queue), &ip6_frag_mem); + return fq; +} - spin_lock(&ip6_frag_lock); - for (fq = ipv6_frag_queue.next; fq != &ipv6_frag_queue; fq = fq->next) { - if (fq->id == fhdr->identification && - !ipv6_addr_cmp(&hdr->saddr, &fq->saddr) && - !ipv6_addr_cmp(&hdr->daddr, &fq->daddr)) { - u8 *ret = NULL; +/* Destruction primitives. */ - reasm_queue(fq, skb, fhdr, nhptr); +/* Complete destruction of fq. */ +static void ip6_frag_destroy(struct frag_queue *fq) +{ + struct sk_buff *fp; - if (fq->last_in == (FIRST_IN|LAST_IN)) - ret = reasm_frag(fq, skbp); + BUG_TRAP(fq->last_in&COMPLETE); + BUG_TRAP(del_timer(&fq->timer) == 0); - spin_unlock(&ip6_frag_lock); - return ret; - } - } + /* Release all fragment data. */ + fp = fq->fragments; + while (fp) { + struct sk_buff *xp = fp->next; - create_frag_entry(skb, nhptr, fhdr); - spin_unlock(&ip6_frag_lock); + frag_kfree_skb(fp); + fp = xp; + } - return NULL; + frag_free_queue(fq); } - -static void fq_free(struct frag_queue *fq) +static __inline__ void fq_put(struct frag_queue *fq) { - struct ipv6_frag *fp, *back; + if (atomic_dec_and_test(&fq->refcnt)) + ip6_frag_destroy(fq); +} - del_timer(&fq->timer); +/* Kill fq entry. It is not destroyed immediately, + * because caller (and someone more) holds reference count. + */ +static __inline__ void fq_kill(struct frag_queue *fq) +{ + if (del_timer(&fq->timer)) + atomic_dec(&fq->refcnt); - for (fp = fq->fragments; fp; ) { - frag_kfree_skb(fp->skb); - back = fp; - fp=fp->next; - frag_kfree_s(back, sizeof(*back)); + if (!(fq->last_in & COMPLETE)) { + fq_unlink(fq); + atomic_dec(&fq->refcnt); + fq->last_in |= COMPLETE; } +} - fq->prev->next = fq->next; - fq->next->prev = fq->prev; +static void ip6_evictor(void) +{ + int i, progress; - fq->prev = fq->next = NULL; - - frag_kfree_s(fq, sizeof(*fq)); + do { + if (atomic_read(&ip6_frag_mem) <= sysctl_ip6frag_low_thresh) + return; + progress = 0; + for (i = 0; i < IP6Q_HASHSZ; i++) { + struct frag_queue *fq; + if (ip6_frag_hash[i] == NULL) + continue; + + write_lock(&ip6_frag_lock); + if ((fq = ip6_frag_hash[i]) != NULL) { + /* find the oldest queue for this hash bucket */ + while (fq->next) + fq = fq->next; + __fq_unlink(fq); + write_unlock(&ip6_frag_lock); + + spin_lock(&fq->lock); + if (del_timer(&fq->timer)) + atomic_dec(&fq->refcnt); + fq->last_in |= COMPLETE; + spin_unlock(&fq->lock); + + fq_put(fq); + IP6_INC_STATS_BH(Ip6ReasmFails); + progress = 1; + continue; + } + write_unlock(&ip6_frag_lock); + } + } while (progress); } -static void frag_expire(unsigned long data) +static void ip6_frag_expire(unsigned long data) { - struct frag_queue *fq; - struct ipv6_frag *frag; + struct frag_queue *fq = (struct frag_queue *) data; - fq = (struct frag_queue *) data; + spin_lock(&fq->lock); - spin_lock(&ip6_frag_lock); + if (fq->last_in & COMPLETE) + goto out; - frag = fq->fragments; + fq_kill(fq); IP6_INC_STATS_BH(Ip6ReasmTimeout); IP6_INC_STATS_BH(Ip6ReasmFails); - if (frag == NULL) { - spin_unlock(&ip6_frag_lock); - printk(KERN_DEBUG "invalid fragment queue\n"); - return; - } - - /* Send error only if the first segment arrived. - (fixed --ANK (980728)) - */ - if (fq->last_in&FIRST_IN) { + /* Send error only if the first segment arrived. */ + if (fq->last_in&FIRST_IN && fq->fragments) { struct net_device *dev = dev_get_by_index(fq->iif); /* @@ -246,144 +251,234 @@ static void frag_expire(unsigned long data) pointer directly, device might already disappeared. */ if (dev) { - frag->skb->dev = dev; - icmpv6_send(frag->skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0, + fq->fragments->dev = dev; + icmpv6_send(fq->fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0, dev); dev_put(dev); } } - - fq_free(fq); - spin_unlock(&ip6_frag_lock); +out: + spin_unlock(&fq->lock); + fq_put(fq); } +/* Creation primitives. */ -static void create_frag_entry(struct sk_buff *skb, - __u8 *nhptr, - struct frag_hdr *fhdr) + +static struct frag_queue *ip6_frag_intern(unsigned int hash, + struct frag_queue *fq_in) { struct frag_queue *fq; - struct ipv6hdr *hdr; - - fq = (struct frag_queue *) frag_kmalloc(sizeof(struct frag_queue), - GFP_ATOMIC); - if (fq == NULL) { - IP6_INC_STATS_BH(Ip6ReasmFails); - kfree_skb(skb); - return; + write_lock(&ip6_frag_lock); +#ifdef CONFIG_SMP + for (fq = ip6_frag_hash[hash]; fq; fq = fq->next) { + if (fq->id == fq_in->id && + !ipv6_addr_cmp(&fq_in->saddr, &fq->saddr) && + !ipv6_addr_cmp(&fq_in->daddr, &fq->daddr)) { + atomic_inc(&fq->refcnt); + write_unlock(&ip6_frag_lock); + fq_in->last_in |= COMPLETE; + fq_put(fq_in); + return fq; + } } +#endif + fq = fq_in; + + atomic_inc(&fq->refcnt); + if((fq->next = ip6_frag_hash[hash]) != NULL) + fq->next->pprev = &fq->next; + ip6_frag_hash[hash] = fq; + fq->pprev = &ip6_frag_hash[hash]; + ip6_frag_nqueues++; + write_unlock(&ip6_frag_lock); + return fq; +} - memset(fq, 0, sizeof(struct frag_queue)); - fq->id = fhdr->identification; +static struct frag_queue * +ip6_frag_create(unsigned int hash, u32 id, struct in6_addr *src, struct in6_addr *dst) +{ + struct frag_queue *fq; - hdr = skb->nh.ipv6h; - ipv6_addr_copy(&fq->saddr, &hdr->saddr); - ipv6_addr_copy(&fq->daddr, &hdr->daddr); + if ((fq = frag_alloc_queue()) == NULL) + goto oom; + + memset(fq, 0, sizeof(struct frag_queue)); + + fq->id = id; + ipv6_addr_copy(&fq->saddr, src); + ipv6_addr_copy(&fq->daddr, dst); /* init_timer has been done by the memset */ - fq->timer.function = frag_expire; + fq->timer.function = ip6_frag_expire; fq->timer.data = (long) fq; - fq->timer.expires = jiffies + sysctl_ip6frag_time; + fq->lock = SPIN_LOCK_UNLOCKED; + atomic_set(&fq->refcnt, 1); - reasm_queue(fq, skb, fhdr, nhptr); + return ip6_frag_intern(hash, fq); - if (fq->fragments) { - fq->prev = ipv6_frag_queue.prev; - fq->next = &ipv6_frag_queue; - fq->prev->next = fq; - ipv6_frag_queue.prev = fq; - - add_timer(&fq->timer); - } else - frag_kfree_s(fq, sizeof(*fq)); +oom: + IP6_INC_STATS_BH(Ip6ReasmFails); + return NULL; } +static __inline__ struct frag_queue * +fq_find(u32 id, struct in6_addr *src, struct in6_addr *dst) +{ + struct frag_queue *fq; + unsigned int hash = ip6qhashfn(id, src, dst); + + read_lock(&ip6_frag_lock); + for(fq = ip6_frag_hash[hash]; fq; fq = fq->next) { + if (fq->id == id && + !ipv6_addr_cmp(src, &fq->saddr) && + !ipv6_addr_cmp(dst, &fq->daddr)) { + atomic_inc(&fq->refcnt); + read_unlock(&ip6_frag_lock); + return fq; + } + } + read_unlock(&ip6_frag_lock); + + return ip6_frag_create(hash, id, src, dst); +} -/* - * We queue the packet even if it's the last. - * It's a trade off. This allows the reassembly - * code to be simpler (=faster) and of the - * steps we do for queueing the only unnecessary - * one it's the kmalloc for a struct ipv6_frag. - * Feel free to try other alternatives... - */ -static void reasm_queue(struct frag_queue *fq, struct sk_buff *skb, - struct frag_hdr *fhdr, u8 *nhptr) +static void ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, + struct frag_hdr *fhdr, u8 *nhptr) { - struct ipv6_frag *nfp, *fp, **bptr; + struct sk_buff *prev, *next; + int offset, end; - nfp = (struct ipv6_frag *) frag_kmalloc(sizeof(struct ipv6_frag), - GFP_ATOMIC); + if (fq->last_in & COMPLETE) + goto err; - if (nfp == NULL) { - kfree_skb(skb); - return; - } + if (!mod_timer(&fq->timer, jiffies + sysctl_ip6frag_time)) + atomic_inc(&fq->refcnt); - nfp->offset = ntohs(fhdr->frag_off) & ~0x7; - nfp->len = (ntohs(skb->nh.ipv6h->payload_len) - - ((u8 *) (fhdr + 1) - (u8 *) (skb->nh.ipv6h + 1))); + offset = ntohs(fhdr->frag_off) & ~0x7; + end = offset + (ntohs(skb->nh.ipv6h->payload_len) - + ((u8 *) (fhdr + 1) - (u8 *) (skb->nh.ipv6h + 1))); - if ((u32)nfp->offset + (u32)nfp->len >= 65536) { + if ((unsigned int)end >= 65536) { icmpv6_param_prob(skb,ICMPV6_HDR_FIELD, (u8*)&fhdr->frag_off); goto err; } - if (fhdr->frag_off & __constant_htons(0x0001)) { + + /* Is this the final fragment? */ + if (!(fhdr->frag_off & __constant_htons(0x0001))) { + /* If we already have some bits beyond end + * or have different end, the segment is corrupted. + */ + if (end < fq->len || + ((fq->last_in & LAST_IN) && end != fq->len)) + goto err; + fq->last_in |= LAST_IN; + fq->len = end; + } else { /* Check if the fragment is rounded to 8 bytes. * Required by the RFC. - * ... and would break our defragmentation algorithm 8) */ - if (nfp->len & 0x7) { + if (end & 0x7) { printk(KERN_DEBUG "fragment not rounded to 8bytes\n"); /* It is not in specs, but I see no reasons to send an error in this case. --ANK */ - if (nfp->offset == 0) + if (offset == 0) icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, &skb->nh.ipv6h->payload_len); goto err; } + if (end > fq->len) { + /* Some bits beyond end -> corruption. */ + if (fq->last_in & LAST_IN) + goto err; + fq->len = end; + } } - nfp->skb = skb; - nfp->fhdr = fhdr; - nfp->next = NULL; + if (end == offset) + goto err; - bptr = &fq->fragments; + /* Point into the IP datagram 'data' part. */ + skb_pull(skb, (u8 *) (fhdr + 1) - skb->data); + skb_trim(skb, end - offset); - for (fp = fq->fragments; fp; fp=fp->next) { - if (nfp->offset <= fp->offset) - break; - bptr = &fp->next; + /* Find out which fragments are in front and at the back of us + * in the chain of fragments so far. We must know where to put + * this fragment, right? + */ + prev = NULL; + for(next = fq->fragments; next != NULL; next = next->next) { + if (FRAG6_CB(next)->offset >= offset) + break; /* bingo! */ + prev = next; } - if (fp && fp->offset == nfp->offset) { - if (nfp->len != fp->len) { - printk(KERN_DEBUG "reasm_queue: dup with wrong len\n"); + + /* We found where to put this one. Check for overlap with + * preceding fragment, and, if needed, align things so that + * any overlaps are eliminated. + */ + if (prev) { + int i = (FRAG6_CB(prev)->offset + prev->len) - offset; + + if (i > 0) { + offset += i; + if (end <= offset) + goto err; + skb_pull(skb, i); } + } - /* duplicate. discard it. */ - goto err; + /* Look for overlap with succeeding segments. + * If we can merge fragments, do it. + */ + while (next && FRAG6_CB(next)->offset < end) { + int i = end - FRAG6_CB(next)->offset; /* overlap is 'i' bytes */ + + if (i < next->len) { + /* Eat head of the next overlapped fragment + * and leave the loop. The next ones cannot overlap. + */ + FRAG6_CB(next)->offset += i; /* next fragment */ + skb_pull(next, i); + fq->meat -= i; + break; + } else { + struct sk_buff *free_it = next; + + /* Old fragmnet is completely overridden with + * new one drop it. + */ + next = next->next; + + if (prev) + prev->next = next; + else + fq->fragments = next; + + fq->meat -= free_it->len; + frag_kfree_skb(free_it); + } } - atomic_add(skb->truesize, &ip6_frag_mem); + FRAG6_CB(skb)->offset = offset; - /* All the checks are done, fragment is acepted. - Only now we are allowed to update reassembly data! - (fixed --ANK (980728)) - */ + /* Insert this fragment in the chain of fragments. */ + skb->next = next; + if (prev) + prev->next = skb; + else + fq->fragments = skb; - /* iif always set to one of the last arrived segment */ fq->dev = skb->dev; fq->iif = skb->dev->ifindex; - - /* Last fragment */ - if ((fhdr->frag_off & __constant_htons(0x0001)) == 0) - fq->last_in |= LAST_IN; + fq->meat += skb->len; + atomic_add(skb->truesize, &ip6_frag_mem); /* First fragment. nexthdr and nhptr are get from the first fragment. @@ -391,85 +486,67 @@ static void reasm_queue(struct frag_queue *fq, struct sk_buff *skb, first one. (fixed --ANK (980728)) */ - if (nfp->offset == 0) { + if (offset == 0) { fq->nexthdr = fhdr->nexthdr; - fq->last_in |= FIRST_IN; fq->nhoffset = nhptr - skb->nh.raw; + fq->last_in |= FIRST_IN; } - - *bptr = nfp; - nfp->next = fp; return; err: - frag_kfree_s(nfp, sizeof(*nfp)); kfree_skb(skb); } /* - * check if this fragment completes the packet - * returns true on success + * Check if this packet is complete. + * Returns NULL on failure by any reason, and pointer + * to current nexthdr field in reassembled frame. + * + * It is called with locked fq, and caller must check that + * queue is eligible for reassembly i.e. it is not COMPLETE, + * the last and the first frames arrived and all the bits are here. */ -static u8* reasm_frag(struct frag_queue *fq, struct sk_buff **skb_in) +static u8* ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in) { - struct ipv6_frag *fp; - struct ipv6_frag *head = fq->fragments; - struct ipv6_frag *tail = NULL; + struct sk_buff *fp, *head = fq->fragments; struct sk_buff *skb; - __u32 offset = 0; - __u32 payload_len; - __u16 unfrag_len; - __u16 copy; + int payload_len; + int unfrag_len; + int copy; u8 *nhptr; - for(fp = head; fp; fp=fp->next) { - if (offset != fp->offset) - return NULL; - - offset += fp->len; - tail = fp; - } - /* * we know the m_flag arrived and we have a queue, * starting from 0, without gaps. * this means we have all fragments. */ - /* Unfragmented part is taken from the first segment. - (fixed --ANK (980728)) - */ - unfrag_len = (u8 *) (head->fhdr) - (u8 *) (head->skb->nh.ipv6h + 1); + fq_kill(fq); - payload_len = (unfrag_len + tail->offset + - (tail->skb->tail - (__u8 *) (tail->fhdr + 1))); + BUG_TRAP(head != NULL); + BUG_TRAP(FRAG6_CB(head)->offset == 0); - if (payload_len > 65535) { - if (net_ratelimit()) - printk(KERN_DEBUG "reasm_frag: payload len = %d\n", payload_len); - IP6_INC_STATS_BH(Ip6ReasmFails); - fq_free(fq); - return NULL; - } + /* Unfragmented part is taken from the first segment. */ + unfrag_len = head->h.raw - (u8 *) (head->nh.ipv6h + 1); + payload_len = unfrag_len + fq->len; - if ((skb = dev_alloc_skb(sizeof(struct ipv6hdr) + payload_len))==NULL) { - if (net_ratelimit()) - printk(KERN_DEBUG "reasm_frag: no memory for reassembly\n"); - IP6_INC_STATS_BH(Ip6ReasmFails); - fq_free(fq); - return NULL; - } + if (payload_len > 65535) + goto out_oversize; + + if ((skb = dev_alloc_skb(sizeof(struct ipv6hdr) + payload_len))==NULL) + goto out_oom; copy = unfrag_len + sizeof(struct ipv6hdr); + skb->mac.raw = skb->data; skb->nh.ipv6h = (struct ipv6hdr *) skb->data; skb->dev = fq->dev; skb->protocol = __constant_htons(ETH_P_IPV6); - skb->pkt_type = head->skb->pkt_type; - memcpy(skb->cb, head->skb->cb, sizeof(skb->cb)); - skb->dst = dst_clone(head->skb->dst); + skb->pkt_type = head->pkt_type; + FRAG6_CB(skb)->h = FRAG6_CB(head)->h; + skb->dst = dst_clone(head->dst); - memcpy(skb_put(skb, copy), head->skb->nh.ipv6h, copy); + memcpy(skb_put(skb, copy), head->nh.ipv6h, copy); nhptr = skb->nh.raw + fq->nhoffset; *nhptr = fq->nexthdr; @@ -479,29 +556,73 @@ static u8* reasm_frag(struct frag_queue *fq, struct sk_buff **skb_in) *skb_in = skb; - /* - * FIXME: If we don't have a checksum we ought to be able - * to defragment and checksum in this pass. [AC] - * Note that we don't really know yet whether the protocol - * needs checksums at all. It might still be a good idea. -AK - */ - for(fp = fq->fragments; fp; ) { - struct ipv6_frag *back; - - memcpy(skb_put(skb, fp->len), (__u8*)(fp->fhdr + 1), fp->len); - frag_kfree_skb(fp->skb); - back = fp; - fp=fp->next; - frag_kfree_s(back, sizeof(*back)); + for (fp = fq->fragments; fp; fp=fp->next) + memcpy(skb_put(skb, fp->len), fp->data, fp->len); + + IP6_INC_STATS_BH(Ip6ReasmOKs); + return nhptr; + +out_oversize: + if (net_ratelimit()) + printk(KERN_DEBUG "ip6_frag_reasm: payload len = %d\n", payload_len); + goto out_fail; +out_oom: + if (net_ratelimit()) + printk(KERN_DEBUG "ip6_frag_reasm: no memory for reassembly\n"); +out_fail: + IP6_INC_STATS_BH(Ip6ReasmFails); + return NULL; +} + +u8* ipv6_reassembly(struct sk_buff **skbp, __u8 *nhptr) +{ + struct sk_buff *skb = *skbp; + struct frag_hdr *fhdr = (struct frag_hdr *) (skb->h.raw); + struct frag_queue *fq; + struct ipv6hdr *hdr; + + hdr = skb->nh.ipv6h; + + IP6_INC_STATS_BH(Ip6ReasmReqds); + + /* Jumbo payload inhibits frag. header */ + if (hdr->payload_len==0) { + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw); + return NULL; + } + if ((u8 *)(fhdr+1) > skb->tail) { + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw); + return NULL; } - del_timer(&fq->timer); - fq->prev->next = fq->next; - fq->next->prev = fq->prev; - fq->prev = fq->next = NULL; + if (!(fhdr->frag_off & __constant_htons(0xFFF9))) { + /* It is not a fragmented frame */ + skb->h.raw += sizeof(struct frag_hdr); + IP6_INC_STATS_BH(Ip6ReasmOKs); - frag_kfree_s(fq, sizeof(*fq)); + return &fhdr->nexthdr; + } - IP6_INC_STATS_BH(Ip6ReasmOKs); - return nhptr; + if (atomic_read(&ip6_frag_mem) > sysctl_ip6frag_high_thresh) + ip6_evictor(); + + if ((fq = fq_find(fhdr->identification, &hdr->saddr, &hdr->daddr)) != NULL) { + u8 *ret = NULL; + + spin_lock(&fq->lock); + + ip6_frag_queue(fq, skb, fhdr, nhptr); + + if (fq->last_in == (FIRST_IN|LAST_IN) && + fq->meat == fq->len) + ret = ip6_frag_reasm(fq, skbp); + + spin_unlock(&fq->lock); + fq_put(fq); + return ret; + } + + IP6_INC_STATS_BH(Ip6ReasmFails); + kfree_skb(skb); + return NULL; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index dc6020c33..dc5ddffd8 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5,7 +5,7 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * $Id: route.c,v 1.45 2000/01/16 05:11:38 davem Exp $ + * $Id: route.c,v 1.46 2000/07/07 22:40:35 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -769,10 +769,12 @@ int ip6_route_add(struct in6_rtmsg *rtmsg) goto out; if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) { - rt->rt6i_nexthop = ndisc_get_neigh(dev, &rt->rt6i_gateway); - err = -ENOMEM; - if (rt->rt6i_nexthop == NULL) + rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev); + if (IS_ERR(rt->rt6i_nexthop)) { + err = PTR_ERR(rt->rt6i_nexthop); + rt->rt6i_nexthop = NULL; goto out; + } } if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 114b59daa..c8a631f9f 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -6,7 +6,7 @@ * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * - * $Id: sit.c,v 1.39 2000/07/07 01:55:20 davem Exp $ + * $Id: sit.c,v 1.41 2000/07/07 23:47:45 davem Exp $ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -45,6 +45,7 @@ #include <net/udp.h> #include <net/icmp.h> #include <net/ipip.h> +#include <net/inet_ecn.h> /* This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c @@ -59,7 +60,7 @@ static int ipip6_fb_tunnel_init(struct net_device *dev); static int ipip6_tunnel_init(struct net_device *dev); static struct net_device ipip6_fb_tunnel_dev = { - "", 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipip6_fb_tunnel_init, + "sit0", 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, ipip6_fb_tunnel_init, }; static struct ip_tunnel ipip6_fb_tunnel = { @@ -174,10 +175,10 @@ struct ip_tunnel * ipip6_tunnel_locate(struct ip_tunnel_parm *parms, int create) dev->priv = (void*)(dev+1); nt = (struct ip_tunnel*)dev->priv; nt->dev = dev; - strcpy(dev->name, nt->parms.name); dev->init = ipip6_tunnel_init; dev->new_style = 1; memcpy(&nt->parms, parms, sizeof(*parms)); + strcpy(dev->name, nt->parms.name); if (dev->name[0] == 0) { int i; for (i=1; i<100; i++) { @@ -370,6 +371,13 @@ out: #endif } +static inline void ipip6_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) +{ + if (INET_ECN_is_ce(iph->tos) && + INET_ECN_is_not_ce(ip6_get_dsfield(skb->nh.ipv6h))) + IP6_ECN_set_ce(skb->nh.ipv6h); +} + int ipip6_rcv(struct sk_buff *skb, unsigned short len) { struct iphdr *iph; @@ -394,6 +402,7 @@ int ipip6_rcv(struct sk_buff *skb, unsigned short len) nf_conntrack_put(skb->nfct); skb->nfct = NULL; #endif + ipip6_ecn_decapsulate(iph, skb); netif_rx(skb); read_unlock(&ipip6_lock); return 0; @@ -431,6 +440,7 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) int mtu; struct in6_addr *addr6; int addr_type; + int err; if (tunnel->recursion++) { tunnel->stat.collisions++; @@ -548,7 +558,7 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) iph->frag_off = 0; iph->protocol = IPPROTO_IPV6; - iph->tos = tos; + iph->tos = INET_ECN_encapsulate(tos, ip6_get_dsfield(iph6)); iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; @@ -564,10 +574,17 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) skb->nfct = NULL; #endif + err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, + do_ip_send); + if(err < 0) { + if(net_ratelimit()) + printk(KERN_ERR "ipip6_tunnel_xmit: ip_send() failed, err=%d\n", -err); + skb = NULL; + goto tx_error; + } + stats->tx_bytes += skb->len; stats->tx_packets++; - NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, - do_ip_send); tunnel->recursion--; return 0; @@ -576,7 +593,8 @@ tx_error_icmp: dst_link_failure(skb); tx_error: stats->tx_errors++; - dev_kfree_skb(skb); + if(skb) + dev_kfree_skb(skb); tunnel->recursion--; return 0; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 4b3bf084b..f9f0c0dc9 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -7,7 +7,7 @@ * * Based on linux/ipv4/udp.c * - * $Id: udp.c,v 1.53 2000/05/03 06:37:07 davem Exp $ + * $Id: udp.c,v 1.55 2000/07/08 00:20:43 davem Exp $ * * Fixes: * Hideaki YOSHIFUJI : sin6_scope_id support @@ -400,7 +400,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, int len, if (err) goto out_free; - sk->stamp=skb->stamp; + sock_recv_timestamp(msg, sk, skb); /* Copy the address. */ if (msg->msg_name) { @@ -868,6 +868,8 @@ static int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, int ulen) fl.proto = IPPROTO_UDP; fl.fl6_dst = daddr; + if (fl.fl6_src == NULL && !ipv6_addr_any(&np->saddr)) + fl.fl6_src = &np->saddr; fl.uli_u.ports.dport = udh.uh.dest; fl.uli_u.ports.sport = udh.uh.source; diff --git a/net/netsyms.c b/net/netsyms.c index e1bfc3403..91c7a1074 100644 --- a/net/netsyms.c +++ b/net/netsyms.c @@ -69,17 +69,6 @@ extern int netdev_finish_unregister(struct net_device *dev); #include <linux/rtnetlink.h> -#if defined(CONFIG_ULTRA) || defined(CONFIG_WD80x3) || \ - defined(CONFIG_EL2) || defined(CONFIG_NE2000) || \ - defined(CONFIG_E2100) || defined(CONFIG_HPLAN_PLUS) || \ - defined(CONFIG_HPLAN) || defined(CONFIG_AC3200) || \ - defined(CONFIG_ES3210) || defined(CONFIG_ULTRA32) || \ - defined(CONFIG_LNE390) || defined(CONFIG_NE3210) || \ - defined(CONFIG_NE2K_PCI) || defined(CONFIG_APNE) || \ - defined(CONFIG_DAYNAPORT) -#include "../drivers/net/8390.h" -#endif - #ifdef CONFIG_IPX_MODULE extern struct datalink_proto *make_EII_client(void); extern struct datalink_proto *make_8023_client(void); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 2955a04f6..3b2df4f55 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -5,7 +5,7 @@ * * PACKET - implements raw packet sockets. * - * Version: $Id: af_packet.c,v 1.34 2000/04/25 04:13:35 davem Exp $ + * Version: $Id: af_packet.c,v 1.36 2000/07/08 00:20:43 davem Exp $ * * Authors: Ross Biro, <bir7@leland.Stanford.Edu> * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> @@ -1053,7 +1053,8 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, int len, err = memcpy_toiovec(msg->msg_iov, skb->data, copied); if (err) goto out_free; - sk->stamp=skb->stamp; + + sock_recv_timestamp(msg, sk, skb); if (msg->msg_name) memcpy(msg->msg_name, skb->cb, msg->msg_namelen); @@ -1392,6 +1393,23 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg switch(cmd) { + case SIOCOUTQ: + { + int amount = atomic_read(&sk->wmem_alloc); + return put_user(amount, (int *)arg); + } + case SIOCINQ: + { + struct sk_buff *skb; + int amount = 0; + + spin_lock_bh(&sk->receive_queue.lock); + skb = skb_peek(&sk->receive_queue); + if (skb) + amount = skb->len; + spin_unlock_bh(&sk->receive_queue.lock); + return put_user(amount, (int *)arg); + } case FIOSETOWN: case SIOCSPGRP: err = get_user(pid, (int *) arg); diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index d8c117247..1a4a501c9 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -39,6 +39,7 @@ #include <linux/skbuff.h> #include <net/sock.h> #include <net/pkt_sched.h> +#include <net/inet_ecn.h> #define RED_ECN_ECT 0x02 #define RED_ECN_CE 0x01 @@ -170,14 +171,9 @@ static int red_ecn_mark(struct sk_buff *skb) if (!(tos & RED_ECN_ECT)) return 0; - if (!(tos & RED_ECN_CE)) { - u32 check = skb->nh.iph->check; + if (!(tos & RED_ECN_CE)) + IP_ECN_set_ce(skb->nh.iph); - check += __constant_htons(0xFFFE); - skb->nh.iph->check = check + (check>>16); - - skb->nh.iph->tos = tos | RED_ECN_CE; - } return 1; } diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 7ea61ce5c..1b7119ffd 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -242,9 +242,9 @@ __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device * memcmp(n->primary_key, mn->primary_key, mn->tbl->key_len) == 0) { atomic_inc(&n->refcnt); } else { - n = __neigh_lookup(mn->tbl, mn->primary_key, dev, 1); - if (n == NULL) - return -ENOBUFS; + n = __neigh_lookup_errno(mn->tbl, mn->primary_key, dev); + if (IS_ERR(n)) + return PTR_ERR(n); } if (neigh_event_send(n, skb_res) == 0) { int err; |